11  Example 04: Using and Excluding Sensitive Variables

In this example, we demonstrate a very specific use case: how to withhold some “sensitive” variables from the synthetic dataset while still using them in the synthesis process.

11.1 Setup



11.1.1 The data

We once again return to the Palmer Penguins data set, include categorical variables, and drop incomplete records.

Suppose we view the species and sex variables as sensitive and determine that we do not want to publicize them in the final synthetic dataset. One potential workflow could be to remove those columns from the start. However, this approach would have the flaw of losing data that could inform our synthesis process.

11.1.2 A better approach

Instead, we can use the following process:

  1. Create the confidential dataset

  2. Use the sensitive variables as our starting data

  3. Go through the synthesis process “as normal”

  4. Once we have synthesized all of the columns, simply remove the sensitive starting variables

11.2 Create the confidential data:

# Build the confidential dataset: keep only rows with no missing value in any
# column of `penguins`, then retain the variables used in this example,
# including the two sensitive ones (species, sex).
penguins_complete <- penguins %>%
  filter(complete.cases(.)) %>%
  # Only flipper_length_mm needs converting; body_mass_g is not kept by the
  # select() below, so converting it would be dead work.
  mutate(flipper_length_mm = as.numeric(flipper_length_mm)) %>%
  select(bill_length_mm, flipper_length_mm, island, species, sex)

11.3 Create the starting data including only sensitive variables:


# Starting data: only the sensitive columns (species, sex), resampled with
# replacement to the same number of rows as the confidential data.
starting_data <- dplyr::slice_sample(
  dplyr::select(penguins_complete, species, sex),
  n = nrow(penguins_complete),
  replace = TRUE
)

11.4 Proceed with the synthesis “as normal”:

# Pair the confidential data with the starting data (the sensitive columns).
schema <- schema(
  conf_data = penguins_complete,
  start_data = starting_data
)

# Manually specify the variables to synthesize, in the order listed.
visit_sequence <- visit_sequence(
  schema = schema,
  type = "manual",
  manual_vars = c("island", "flipper_length_mm", "bill_length_mm")
)

# Build the roadmap from the visit sequence defined above.
roadmap <- roadmap(
  visit_sequence = visit_sequence
)

# Decision-tree model specification for categorical (classification) targets.
# NOTE(review): the final pipe step was missing (dangling `%>%`); the "rpart"
# engine is inferred from the object name and from `sample_rpart` -- confirm.
rpart_mod_cat <- parsnip::decision_tree() %>% 
  parsnip::set_mode("classification") %>%
  parsnip::set_engine("rpart")

# Decision-tree model specification for numeric (regression) targets.
# NOTE(review): the final pipe step was missing (dangling `%>%`); the "rpart"
# engine is inferred from the object name and from `sample_rpart` -- confirm.
rpart_mod_num <- parsnip::decision_tree() %>%
  parsnip::set_mode("regression") %>%
  parsnip::set_engine("rpart")

# Use the classification tree by default and override it with the regression
# tree for the numeric variables.
numeric_vars <- c("flipper_length_mm", "bill_length_mm")
algorithms <- construct_algos(
  roadmap = roadmap,
  default_algo = rpart_mod_cat,
  # NOTE(review): the inner `list(` and the closing parentheses were missing;
  # the list-of-lists shape is reconstructed from the original indentation --
  # confirm against the construct_algos() documentation.
  custom_algos = list(
    list(
      vars = numeric_vars, 
      algorithm = rpart_mod_num
    )
  )
)

# Combine the roadmap, the algorithms, the recipes built from the roadmap, and
# the prediction method into a single synthesis specification.
synth_spec <- synth_spec(
  roadmap = roadmap,
  synth_algorithms = algorithms,
  recipes = construct_recipes(roadmap = roadmap),
  predict_methods = sample_rpart
)

# Noise specification with noise switched off (add_noise = FALSE).
noise <- noise(
  roadmap = roadmap,
  add_noise = FALSE,
  exclusions = 0
)

# Don't impose constraints (constraints = NULL).
constraints <- constraints(
  roadmap = roadmap,
  constraints = NULL,
  max_z = 0
)

# Request a single replicate on a single worker, with no summary function.
replicates <- replicates(
  replicates = 1,
  workers = 1,
  summary_function = NULL
)

# Bundle every component defined above into one presynth object.
presynth1 <- presynth(
  roadmap = roadmap,
  synth_spec = synth_spec,
  noise = noise, 
  constraints = constraints,
  replicates = replicates
)

# Run the synthesis; the synthetic records are read from
# `synth1$synthetic_data` in the next step.
synth1 <- synthesize(presynth1)

11.5 Create the final dataset by removing sensitive variables:

# Final release dataset: the synthetic data with the sensitive starting
# variables (sex, species) removed.
synth_final <- dplyr::select(
  synth1$synthetic_data,
  -dplyr::all_of(c("sex", "species"))
)

# A tibble: 333 × 5
   species   sex    island    flipper_length_mm bill_length_mm
   <fct>     <fct>  <fct>                 <dbl>          <dbl>
 1 Chinstrap male   Torgersen               201           55.1
 2 Gentoo    male   Biscoe                  215           51.7
 3 Adelie    female Dream                   187           34.5
 4 Chinstrap male   Dream                   197           47.3
 5 Chinstrap male   Dream                   187           49.6
 6 Gentoo    male   Biscoe                  219           49.5
 7 Chinstrap female Dream                   186           44.9
 8 Adelie    female Biscoe                  192           35.7
 9 Chinstrap male   Dream                   201           48.7
10 Chinstrap female Dream                   185           45.3
# ℹ 323 more rows

11.6 Session info

R version 4.3.1 (2023-06-16)
Platform: aarch64-apple-darwin20 (64-bit)
Running under: macOS Sonoma 14.5

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0

[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: America/New_York
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] palmerpenguins_0.1.1 syntheval_0.0.3      tidysynthesis_0.0.4 
 [4] lubridate_1.9.3      forcats_1.0.0        stringr_1.5.1       
 [7] dplyr_1.1.4          purrr_1.0.2          readr_2.1.5         
[10] tidyr_1.3.0          tibble_3.2.1         ggplot2_3.4.4       
[13] tidyverse_2.0.0     

loaded via a namespace (and not attached):
 [1] gtable_0.3.4        xfun_0.45           htmlwidgets_1.6.4  
 [4] recipes_1.0.9       lattice_0.22-5      tzdb_0.4.0         
 [7] vctrs_0.6.5         tools_4.3.1         generics_0.1.3     
[10] parallel_4.3.1      fansi_1.0.6         pkgconfig_2.0.3    
[13] Matrix_1.6-5        data.table_1.14.10  lifecycle_1.0.4    
[16] compiler_4.3.1      munsell_0.5.0       codetools_0.2-19   
[19] htmltools_0.5.7     class_7.3-22        yaml_2.3.8         
[22] prodlim_2023.08.28  furrr_0.3.1         pillar_1.9.0       
[25] ellipsis_0.3.2      MASS_7.3-60.0.1     gower_1.0.1        
[28] rpart_4.1.23        parallelly_1.36.0   lava_1.7.3         
[31] tidyselect_1.2.0    digest_0.6.34       stringi_1.8.3      
[34] future_1.33.1       listenv_0.9.0       splines_4.3.1      
[37] fastmap_1.1.1       parsnip_1.1.1       grid_4.3.1         
[40] colorspace_2.1-0    cli_3.6.2           magrittr_2.0.3     
[43] survival_3.5-7      utf8_1.2.4          future.apply_1.11.1
[46] withr_3.0.0         scales_1.3.0        timechange_0.3.0   
[49] rmarkdown_2.25      globals_0.16.2      nnet_7.3-19        
[52] timeDate_4032.109   workflows_1.1.3     hms_1.1.3          
[55] evaluate_0.23       knitr_1.45          hardhat_1.3.0      
[58] rlang_1.1.3         Rcpp_1.0.12         glue_1.7.0         
[61] ipred_0.9-14        rstudioapi_0.15.0   jsonlite_1.8.8     
[64] R6_2.5.1