Aaron R. Williams - Data Scientist (IBP)
Source: RStudio
Much predictive modeling in R can be handled with library(tidymodels)
.
library(rsample)
handles resampling.library(parsnip)
is a common interface to packages with predictive algorithms.library(recipes)
handles feature engineering.library(workflows)
manages putting everything together in resampling.library(tune)
helps with hyperparameter tuning.library(yardstick)
is used for evaluating models.library(rsample)
initial_split()
Create an index for creating training and testing data.training()
Use the initial_split()
object to create a training set.testing()
Use the initial_split()
object to create a testing set.vfold_cv()
Create indices for v-fold cross-validation.library(recipes)
recipe()
Begin creating a recipe for preprocessing.step_*()
A collection of functions with preprocessing and feature engineering steps.prep()
Estimate parameters for a recipe.bake()
Apply the computations from a recipe to a new data set.library(parsnip)
nearest_neighbor()
Generate a KNN specification before fitting a model.linear_reg()
Generate a linear regression specification before fitting a model.random_forest()
Generate a random forests specification before fitting a model.set_engine()
Pick the package used to fit a model.fit()
Estimate model parameters.library(yardstick)
metrics()
Estimate common performance metricsrmse()
Estimate Root Mean Squared Errorlibrary(tune)
fit_resamples()
Estimate model parameters with resampling.collect_metrics()
Obtain and format results produced during resampling.collect_predictions()
Obtain and format predictions produced during resampling.library(workflows)
workflow()
Create a container object to manage the model making process.add_model()
Add a parnsip model to a workflow object.add_recipe()
Add a recipe to a workflow object.Example 1 uses simulated data with one predictor \(x\) and one outcome variable \(y\).
library(tidyverse)
library(tidymodels)
set.seed(20201004)
x <- runif(n = 1000, min = 0, max = 10)
data1 <- bind_cols(
x = x,
y = 10 * sin(x) + x + 20 + rnorm(n = length(x), mean = 0, sd = 2)
)
Here, we know y = f(x)
. In practice, this is unknown and our goal is to estimate or approximate it.
set.seed(20201007)
# create a split object
data1_split <- initial_split(data = data1, prop = 0.75)
# create the training and testing data
data1_train <- training(x = data1_split)
data1_test <- testing(x = data1_split)
# visualize the data
data1_train %>%
ggplot(aes(x = x, y = y)) +
geom_point() +
labs(title = "Example 1 Data") +
theme_minimal()
# create a knn model specification
knn_mod <-
nearest_neighbor(neighbors = 5) %>%
set_engine(engine = "kknn") %>%
set_mode(mode = "regression")
# create a workflow object
ex1_wf <-
workflow() %>%
add_formula(y ~ x) %>%
add_model(knn_mod)
# fit the ex1 workflow on the training data
knn_fit <- ex1_wf %>%
fit(data = data1_train)
# use the estimated model to predict values in the testing data
predictions <-
bind_cols(
data1_test,
predict(object = knn_fit, new_data = data1_test)
)
# calculate the rmse on the testing data
rmse(data = predictions, truth = y, estimate = .pred)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 2.19
How good is this RMSE? It is relatively small when compared to the range of \(y\), which suggests that this model is accurate.
summary(data1_test$y)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 9.909 19.612 26.212 26.326 32.558 41.125
# make a novel prediction
predict(object = knn_fit, new_data = tibble(x = 1:5))
## # A tibble: 5 x 1
## .pred
## <dbl>
## 1 30.7
## 2 31.2
## 3 22.9
## 4 17.3
## 5 16.1
predictions <-
bind_cols(
x = seq(from = 0, to = 10, by = 0.1),
predict(object = knn_fit, new_data = tibble(x = seq(from = 0, to = 10, by = 0.1)))
)
ggplot() +
geom_point(data = data1, aes(x = x, y = y)) +
geom_line(data = predictions, aes(x = x, y = .pred), color = "#1696d2", size = 1) +
labs(title = "Example 1 Data with Predictions") +
theme_minimal()
penguins <- read_csv(here::here("data", "penguins.csv"))
penguins
## # A tibble: 333 x 8
## species island bill_length_mm bill_depth_mm flipper_length_… body_mass_g
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Adelie Torge… 39.1 18.7 181 3750
## 2 Adelie Torge… 39.5 17.4 186 3800
## 3 Adelie Torge… 40.3 18 195 3250
## 4 Adelie Torge… 36.7 19.3 193 3450
## 5 Adelie Torge… 39.3 20.6 190 3650
## 6 Adelie Torge… 38.9 17.8 181 3625
## 7 Adelie Torge… 39.2 19.6 195 4675
## 8 Adelie Torge… 41.1 17.6 182 3200
## 9 Adelie Torge… 38.6 21.2 191 NA
## 10 Adelie Torge… 34.6 21.1 198 4400
## # … with 323 more rows, and 2 more variables: sex <chr>,
## # body_mass_g_complete <dbl>
This examples uses the Palmer Penguins data set from library(palmerpenguins)
. It contains the measurements of several hundred penguins. Imagine the scale randomly malfunctioned for twenty penguins and we need to impute the NA
values for body_mass_g
using other measurements as predictors. The predictors will be bill_length_mm
, bill_depth_mm
, and flipper_length_mm
.
First, we need to split our dataset into a data with missing observations and a dataset without missing observations.
penguins_na <- penguins %>%
filter(is.na(body_mass_g))
penguins <- penguins %>%
filter(!is.na(body_mass_g))
Now, we will go through a supervised machine learning exercise with the non-missing data to estimate the best possible model to predict body_mass_g
.
# split the data into a testing set.
set.seed(20201124)
penguins_split <- initial_split(data = penguins, prop = 0.75)
penguins_train <- training(x = penguins_split)
# create a KNN model object
# set neighbors to 31 - I chose this number in R/pick_n.R
knn_mod <-
nearest_neighbor(neighbors = 31) %>%
set_engine(engine = "kknn") %>%
set_mode(mode = "regression")
# create a recipe
penguins_rec <-
recipe(body_mass_g ~ bill_length_mm + bill_depth_mm + flipper_length_mm,
data = penguins_train) %>%
step_normalize(bill_length_mm, bill_depth_mm, flipper_length_mm)
# create a workflow
penguins_wf <-
workflow() %>%
add_model(knn_mod) %>%
add_recipe(penguins_rec)
# fit the model
penguins_best <- fit(penguins_wf, data = penguins_train)
We now have a model estimated on the training data that we can use to predict values on the testing data. We use the testing data to get an estimate of the uncertainty of our predictions.
# create the testing data from the split object create above
penguins_test <- testing(penguins_split)
# predict with KNN
# predict with mean imputation
# append the testing data
predictions <- bind_cols(
predict(penguins_best, new_data = penguins_test),
.mean = mean(penguins_train$body_mass_g),
penguins_test
)
# rmse of KNN model
rmse(data = predictions, truth = body_mass_g, estimate = .pred)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 321.
# rmse of mean imputation
rmse(data = predictions, truth = body_mass_g, estimate = .mean)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 830.
I figured out how to fix the original measurements. We can now see how well our model did of predicting the actual missing observations.
predictions_na <- bind_cols(
predict(penguins_best, new_data = penguins_na),
.mean = mean(penguins_train$body_mass_g),
penguins_na
)
# rmse of KNN model (the out-of-sample rmse is usually worse!)
rmse(data = predictions_na, truth = body_mass_g_complete, estimate = .pred)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 230.
# rmse of mean imputation
rmse(data = predictions_na, truth = body_mass_g_complete, estimate = .mean)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 761.
Example 3 uses simulated data with two predictors, \(x_1\) and \(x_2\), and one outcome variable \(y\).
set.seed(20201005)
x1 <- runif(n = 1000, min = 0, max = 10)
x2 <- runif(n = 1000, min = 10, max = 20)
data3 <- bind_cols(
x1 = x1,
x2 = x2,
y = 10 * sin(x1) + x2 + 20 + rnorm(n = length(x), mean = 0, sd = 2)
)
head(data3)
## # A tibble: 6 x 3
## x1 x2 y
## <dbl> <dbl> <dbl>
## 1 4.52 15.3 24.9
## 2 2.95 10.9 36.1
## 3 2.52 16.0 42.1
## 4 6.77 12.7 38.0
## 5 1.94 13.3 42.6
## 6 7.01 14.6 38.6