10  Example 03: Penguins (Again)

10.1 Setup



10.1.1 The data

We return to the Palmer Penguins data set and include categorical variables. First we drop incomplete records.

# Build the confidential data: keep only penguins with no missing values in
# any column, then retain the five variables used in this example.
penguins_complete <- penguins %>%
  # complete.cases() is evaluated over every column of `penguins`, so the row
  # filter has to run before any columns are dropped.
  filter(complete.cases(.)) %>%
  select(bill_length_mm, flipper_length_mm, island, species, sex) %>%
  # Store flipper length as a double. The earlier version also converted
  # body_mass_g, but that column is not selected, so the conversion was unused.
  mutate(flipper_length_mm = as.numeric(flipper_length_mm))

10.2 Synthesis

10.2.1 Start data

We start by sampling bill_length_mm with replacement. We synthesize the same number of observations as the original data.


# Starting data: resample bill_length_mm with replacement, drawing as many
# rows as the confidential data has.
starting_data <- slice_sample(
  select(penguins_complete, bill_length_mm),
  n = nrow(penguins_complete),
  replace = TRUE
)

10.2.2 Visit sequence

We have a blend of numeric and categorical variables, so we need to manually specify the visit sequence.

# Pair the confidential data with the resampled starting data.
# Note: the result is stored under the same name as the constructor function.
schema <- schema(
  conf_data = penguins_complete,
  start_data = starting_data
)

# Manually specify the order in which the remaining variables are synthesized:
# flipper_length_mm first, then sex, then island.
visit_sequence <- visit_sequence(
  schema = schema,
  type = "manual",
  manual_vars = c("flipper_length_mm", "sex", "island")
)

10.2.3 roadmap

The roadmap combines the confidential data, starting data, and visit sequence. It also includes information about the types of variables. This object will inform the remaining synthesis objects.

# Build the roadmap from the visit sequence; the remaining synthesis objects
# below all take this roadmap as input.
roadmap <- roadmap(
  visit_sequence = visit_sequence
)

10.2.4 synthspec

We use regression trees with node sampling and no feature/target engineering to synthesize numeric variables. We use decision trees with uniform sampling and no feature/target engineering to synthesize categorical variables.

# Classification tree specification, used for the categorical variables.
# The pipeline previously ended in a dangling `%>%`; it is completed here with
# an explicit rpart engine, which is parsnip's default for decision_tree().
rpart_mod_cat <- parsnip::decision_tree() %>%
  parsnip::set_mode("classification") %>%
  parsnip::set_engine("rpart")

# Regression tree specification, used for the numeric variables.
# The pipeline previously ended in a dangling `%>%`; it is completed here with
# an explicit rpart engine, which is parsnip's default for decision_tree().
rpart_mod_num <- parsnip::decision_tree() %>%
  parsnip::set_mode("regression") %>%
  parsnip::set_engine("rpart")

# Assign a model to every variable in the roadmap: the classification tree is
# the default, and flipper_length_mm (numeric) is overridden with the
# regression tree.
# NOTE(review): custom_algos is written as a list of override entries, each a
# list with `vars` and `algorithm`. The inner list() wrapper was reconstructed
# from the indentation of the original chunk -- confirm against the
# construct_algos() documentation for the installed tidysynthesis version.
algorithms <- construct_algos(
  roadmap = roadmap,
  default_algo = rpart_mod_cat,
  custom_algos = list(
    list(
      vars = "flipper_length_mm",
      algorithm = rpart_mod_num
    )
  )
)

# Bundle the modelling choices: the per-variable algorithms, recipes built
# from the roadmap with construct_recipes() (no feature/target engineering is
# added here), and sample_rpart as the prediction method.
synth_spec <- synth_spec(
  roadmap = roadmap,
  synth_algorithms = algorithms,
  recipes = construct_recipes(roadmap = roadmap),
  predict_methods = sample_rpart
)

10.2.5 noise

We add extra noise to numeric variables using variances from 100 ntiles. We don’t add extra noise to categorical variables.

# Noise settings: extra noise is switched on and uses 100 ntiles.
# NOTE(review): exclusions = 0 presumably exempts zero values from noise --
# confirm against the noise() documentation.
noise <- noise(
  roadmap = roadmap,
  add_noise = TRUE,
  ntiles = 100,
  exclusions = 0
)

10.2.6 constraints

We demonstrate an unconditional constraint and a conditional constraint for flipper_length_mm.

Constraints don’t work for categorical variables.

# Constraint table, one row per rule:
#   1. unconditional ("TRUE"): flipper_length_mm must lie in [0, Inf]
#   2. conditional: when bill_length_mm < 40, flipper_length_mm must lie in
#      [0, 180]
constraints_df <- tribble(
  ~var,                ~min, ~max, ~conditions,
  "flipper_length_mm", 0,    Inf,  "TRUE",
  "flipper_length_mm", 0,    180,  "bill_length_mm < 40"
)

# Attach the constraint table to the roadmap.
# NOTE(review): the meaning of max_z = 0 is not shown in this document --
# confirm against the constraints() documentation.
constraints <- constraints(
  roadmap = roadmap,
  constraints = constraints_df,
  max_z = 0
)

10.2.7 replicates

We don’t use replicates for this synthesis.

# A single replicate on a single worker, with no summary function: replicates
# are effectively not used in this synthesis.
replicates <- replicates(
  replicates = 1,
  workers = 1,
  summary_function = NULL
)

10.2.8 presynth

# Combine the roadmap, synthesis specification, noise, constraints, and
# replicates into the single object that synthesize() consumes.
presynth1 <- presynth(
  roadmap = roadmap,
  synth_spec = synth_spec,
  noise = noise,
  constraints = constraints,
  replicates = replicates
)

10.2.9 synthesize


# Run the synthesis described by presynth1.
synth1 <- presynth1 %>%
  synthesize()

10.3 Evaluation

# A tibble: 333 × 4
   bill_length_mm flipper_length_mm sex    island
            <dbl>             <dbl> <fct>  <fct> 
 1           50.2              225. male   Biscoe
 2           50.2              402. male   Biscoe
 3           38.1              180  male   Dream 
 4           51                208. male   Biscoe
 5           52.7              230. male   Biscoe
 6           49.6              210. male   Biscoe
 7           46.2              210. female Biscoe
 8           35.7              180  female Biscoe
 9           51.7              197. male   Dream 
10           43.5              209. female Biscoe
# ℹ 323 more rows

10.4 Session info

R version 4.3.1 (2023-06-16)
Platform: aarch64-apple-darwin20 (64-bit)
Running under: macOS Sonoma 14.5

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0

[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: America/New_York
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] palmerpenguins_0.1.1 syntheval_0.0.3      tidysynthesis_0.0.4 
 [4] lubridate_1.9.3      forcats_1.0.0        stringr_1.5.1       
 [7] dplyr_1.1.4          purrr_1.0.2          readr_2.1.5         
[10] tidyr_1.3.0          tibble_3.2.1         ggplot2_3.4.4       
[13] tidyverse_2.0.0     

loaded via a namespace (and not attached):
 [1] gtable_0.3.4        xfun_0.45           htmlwidgets_1.6.4  
 [4] recipes_1.0.9       lattice_0.22-5      tzdb_0.4.0         
 [7] vctrs_0.6.5         tools_4.3.1         generics_0.1.3     
[10] parallel_4.3.1      fansi_1.0.6         pkgconfig_2.0.3    
[13] Matrix_1.6-5        data.table_1.14.10  lifecycle_1.0.4    
[16] compiler_4.3.1      munsell_0.5.0       codetools_0.2-19   
[19] htmltools_0.5.7     class_7.3-22        yaml_2.3.8         
[22] prodlim_2023.08.28  furrr_0.3.1         pillar_1.9.0       
[25] ellipsis_0.3.2      MASS_7.3-60.0.1     gower_1.0.1        
[28] rpart_4.1.23        parallelly_1.36.0   lava_1.7.3         
[31] tidyselect_1.2.0    digest_0.6.34       stringi_1.8.3      
[34] future_1.33.1       listenv_0.9.0       splines_4.3.1      
[37] fastmap_1.1.1       parsnip_1.1.1       grid_4.3.1         
[40] colorspace_2.1-0    cli_3.6.2           magrittr_2.0.3     
[43] survival_3.5-7      utf8_1.2.4          future.apply_1.11.1
[46] withr_3.0.0         scales_1.3.0        timechange_0.3.0   
[49] rmarkdown_2.25      globals_0.16.2      nnet_7.3-19        
[52] timeDate_4032.109   workflows_1.1.3     hms_1.1.3          
[55] evaluate_0.23       knitr_1.45          hardhat_1.3.0      
[58] rlang_1.1.3         Rcpp_1.0.12         glue_1.7.0         
[61] ipred_0.9-14        rstudioapi_0.15.0   jsonlite_1.8.8     
[64] R6_2.5.1