8  Example 01: Penguins

This example synthesizes the penguins data set demonstrated in the last few chapters.

8.1 Setup

library(palmerpenguins)
library(patchwork)
library(tidyverse)
library(tidysynthesis)

theme_set(theme_minimal())

The example uses complete observations and a subset of variables from the Palmer Penguins data set (cite!).

penguins_complete <- penguins %>%
  select(-year) %>%
  filter(complete.cases(.)) %>%
  mutate(
    flipper_length_mm = as.numeric(flipper_length_mm),
    body_mass_g = as.numeric(body_mass_g)
  )

8.2 Start Data

The synthesis starts with a bootstrap sample of species, island, and sex. The number of observations synthesized will exactly equal the number of observations in the confidential data set.

set.seed(1)

starting_data <- penguins_complete %>% 
  select(species, island, sex) %>%
  slice_sample(n = nrow(penguins_complete), replace = TRUE)

8.3 Roadmap

We use correlation order with bill_length_mm as the reference variable.

schema <- schema(
  conf_data = penguins_complete,
  start_data = starting_data
)
# create a synthesis order based on correlation with bill_length_mm
visit_sequence <- visit_sequence(
  schema = schema,
  type = "correlation",
  cor_var = "bill_length_mm"
)
# create an object that is the basis for all subsequent operations
roadmap <- roadmap(
  visit_sequence = visit_sequence
)

8.4 Synth Spec

# use library(parsnip) and library(recipes) to create specifications for
# each variable
rpart_mod <- parsnip::decision_tree() %>% 
  parsnip::set_mode("regression") %>%
  parsnip::set_engine("rpart") 
  

synth_spec <- synth_spec(
  roadmap = roadmap,
  synth_algorithms = rpart_mod,
  recipes = construct_recipes(roadmap = roadmap),
  predict_methods = sample_rpart
)

8.5 Extras

# don't add extra noise to predictions
noise <- noise(
  roadmap = roadmap,
  add_noise = FALSE,
  exclusions = 0
)

# don't impose constraints
constraints <- constraints(
  roadmap = roadmap,
  constraints = NULL,
  max_z = 0
)

# only generate one synthetic data set
replicates <- replicates(
  replicates = 1,
  workers = 1,
  summary_function = NULL
)

8.6 Synthesize

presynth1 <- presynth(
  roadmap = roadmap,
  synth_spec = synth_spec,
  noise = noise, 
  constraints = constraints,
  replicates = replicates
)
set.seed(1)
synth1 <- synthesize(presynth1)

8.7 Evaluate

data_combined <- bind_rows(
  "confidential" = penguins_complete,
  "synthetic" = synth1$synthetic_data,
  .id = "source"
) 

data_combined %>%
  select(source, bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g) %>%
  pivot_longer(-source, names_to = "variable") %>%
  ggplot(aes(value, fill = source)) +
  geom_density(alpha = 0.4, color = NA) +
  facet_wrap(~variable, scales = "free") +
  theme(
    strip.background = element_blank(),
    panel.grid = element_blank()
  ) +
  guides(fill = "none")

pane1 <- data_combined %>%
  ggplot(aes(flipper_length_mm, bill_length_mm, color = source)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "lm", se = FALSE) +
  guides(color = "none")

pane2 <- data_combined %>%
  ggplot(aes(flipper_length_mm, bill_depth_mm, color = source)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "lm", se = FALSE) +
  guides(color = "none")

pane3 <- data_combined %>%
  ggplot(aes(flipper_length_mm, body_mass_g, color = source)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "lm", se = FALSE) +
  guides(color = "none")

pane1 + pane2 + pane3
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'