This document contains summary statistics and tabulations from the IRS SOI PUF.

Preprocessing

library(tidyverse)
library(janitor)
library(gt)
library(DT)

theme_set(theme_minimal())
theme_update(plot.title.position = "plot")

options(scipen = 999)

The weights in the SOI PUF have an implied decimal at two places. We divide by 100 to get rid of the implied decimal.

puf <- synthpuf::puf_2012 %>%
  mutate(S006 = S006 / 100)

Histograms

Mortenson Figure 1

Jacob A. Mortenson and Andrew Whitten (2020) released a paper called "Bunching to Maximize Tax Credits: Evidence from Kinks in the US Tax Schedule. It calculates bunching estimators on detailed histograms of univariate distributions.

Figure 1 presents earned income in $200-wide bins for single taxpayers with two children in 2014 from $0 to $30,000. The goal is to identify bunching around the first EITC kink point. We will attempt produce similar results with the 2012 SOI PUF. The results will likely be muted or not visible at all. Alternatively, we could pool SOI PUFs from 1996 to 2014.

Earned income will be calculated for filers with no dependent-status indicator who file as single or head of household. Dependents is the sum of exemptions for children living at home, exemptions for children living away from home, and exemptions for other dependents. Earned income is wage and salary income, positive net business income, and positive net farm income.

# subset to the sample of interest
sample_of_interest <- puf %>%
  filter(DSI == 0) %>%
  filter(MARS %in% c(1, 4)) %>%
  mutate(dependents = XOCAH + XOCAWH + XOODEP) %>%
  filter(dependents == 2)

# calculate earned income
sample_of_interest <- sample_of_interest %>%
  mutate(
    earned_income =
      pmax(E00200, 0) +
      pmax(E00900, 0) +
      pmax(E02100, 0) 
  ) %>%
  filter(earned_income <= 30000)

# count the number of samples
sum(sample_of_interest$earned_income < 30000)
## [1] 3872
# create a plot
sample_of_interest %>%
  filter(earned_income > 0) %>%
  ggplot(aes(earned_income)) +
  geom_histogram(binwidth = 200) +
  labs(title = "Earned Income for Single or Head of Household Returns with Two Children",
       subtitle = "Unweighted; Limited to Those with Earned Income < $30,000")

# calculate the weighted histogram
weighted_histogram <- sample_of_interest %>%  
  mutate(earned_income_group = cut(earned_income, 
                                   breaks = seq(-200, 30000, 200),
                                   labels = seq(0, 30000, 200))) %>%
  mutate(earned_income_group = as.numeric(as.character(earned_income_group))) %>%
  group_by(earned_income_group) %>%
  summarize(count = sum(S006))

# create a plot
weighted_histogram %>%
  filter(earned_income_group > 0) %>%
  ggplot(aes(earned_income_group, count)) +
  geom_line(alpha = 0.1) +
  geom_point() +
  scale_x_continuous(labels = scales::dollar) +
  labs(title = "Earned Income for Single or Head of Household Returns with Two Children",
       subtitle = "Weighted; Limited to Those with Earned Income < $30,000")

source(here::here("R", "prep_puf01.R"))

# prepare the data
sample_of_interest <- prep_puf01()

# values for specifications
breaks <- seq(-100, 30100, 200)

# no noise ----------------------------------------------------------------
true_histogram <- hist(sample_of_interest$E00200, breaks = breaks, plot = FALSE)$counts

Noisy Results

This table summarizes the results across all methods for histograms. Detailed data visualizations are in subsequent sections.

histogram_results <- read_csv(here::here("results", "01_soi_histograms.csv"), guess_max = 36000)

histogram_results <- histogram_results %>%
  mutate(
    relative_bias = bias /  mean(true_histogram)
  )

probs <- seq(0.1, 0.9, 0.1)

histogram_results_summary <- histogram_results %>%
  group_by(method, epsilon, delta) %>%
  summarize(
    q = probs, 
    relative_bias = quantile(relative_bias, probs = probs),
    bias = quantile(bias, probs = probs),
    rmse = quantile(rmse, probs = probs)
  ) %>%
  mutate(across(relative_bias:rmse, round, 3)) %>%
  ungroup()

histogram_results_summary %>%
  filter(q == 0.5) %>%
  datatable()
histogram_results_summary %>%
  filter(epsilon > 0.01) %>%
  filter(q == 0.5,
         delta %in% c(NA, 0.000001)) %>%
  ggplot(aes(epsilon, relative_bias)) +
  geom_line(aes(group = method), alpha = 0.2) +
  geom_point(aes(color = method), alpha = 0.4) +
  facet_wrap(~method, scales = "free_y") +
  labs(
    title = "Median Relative Bias for Income Histogram"
  )

histogram_results_summary %>%
  filter(epsilon > 0.01) %>%
  filter(delta %in% c(NA, 0.000001),
         q %in% c(0.1, 0.5, 0.9)) %>%
  ggplot(aes(factor(epsilon), relative_bias)) +
  geom_line(aes(group = factor(q)), alpha = 0.2) +
  geom_point(aes(color = factor(q)), alpha = 0.4) +
  facet_wrap(~method, scales = "free_y") +
  labs(
    title = "Relative Bias for Income Histogram",
    subtitle = "Selected percentiles"
  )

histogram_results_summary %>%
  filter(epsilon > 0.01) %>%
  filter(q == 0.5,
         delta %in% c(NA, 0.000001)) %>%
  ggplot(aes(epsilon, bias)) +
  geom_line(aes(group = method), alpha = 0.2) +
  geom_point(aes(color = method), alpha = 0.4) +
  facet_wrap(~method, scales = "free_y") +
  labs(
    title = "Median Bias for Income Histogram"
  )

histogram_results_summary %>%
  filter(epsilon > 0.01) %>%
  filter(delta %in% c(NA, 0.000001),
         q %in% c(0.1, 0.5, 0.9)) %>%
  ggplot(aes(factor(epsilon), bias)) +
  geom_line(aes(group = factor(q)), alpha = 0.2) +
  geom_point(aes(color = factor(q)), alpha = 0.4) +
  facet_wrap(~method, scales = "free_y") +
  labs(
    title = "Bias for Income Histogram",
    subtitle = "Selected Percentiles"
  )

histogram_results_summary %>%
  filter(epsilon > 0.01) %>%
  filter(q == 0.5,
         delta %in% c(NA, 0.000001)) %>%
  ggplot(aes(epsilon, rmse)) +
  geom_line(aes(group = method), alpha = 0.2) +
  geom_point(aes(color = method), alpha = 0.4) +
  facet_wrap(~method) +
  labs(
    title = "Median RMSE for Income Histogram"
  )

histogram_results_summary %>%
  filter(epsilon > 0.01) %>%
  filter(delta %in% c(NA, 0.000001),
         q %in% c(0.1, 0.5, 0.9)) %>%
  ggplot(aes(factor(epsilon), rmse)) +
  geom_line(aes(group = factor(q)), alpha = 0.2) +
  geom_point(aes(color = factor(q)), alpha = 0.4) +
  facet_wrap(~method, scales = "free_y") +
  labs(
    title = "RMSE for Income Histogram",
    subtitle = "Selected Percentiles"
  )

Means

No Noise

We calculate the unweighted and weighted mean of income from salaries and wages.

true_mean <- mean(x = puf$E00200)

true_mean
## [1] 243369
true_se <- sd(x = puf$E00200) / sqrt(length(puf$E00200))

true_se
## [1] 2536.975
true_ci <- c(true_mean - qnorm(0.975) * true_se, true_mean + qnorm(0.975) * true_se)

true_ci
## [1] 238396.6 248341.3

Noisy Results

This table summarizes the results across all methods for means.

mean_income_results <- read_csv(here::here("results", "01_soi_means.csv"), guess_max = 4500)

mean_income_results <- mean_income_results %>%
  mutate(
    noisy_mean = bias + true_mean,
    relative_bias = bias / true_mean#,
    #`Median Relative CI Length` = median(ci_length / ci_length_original)#,
  )

summary_mean_income_results <- mean_income_results %>%
  group_by(method, epsilon, delta) %>%
  summarize(
    `Median Relative Bias` = median(relative_bias), 
    RMSE = sqrt(mean(bias ^ 2)),
    `Median CI Overlap` = median(ci_overlap, na.rm = TRUE),
    `NAs in CI Overlap` = sum(is.na(ci_overlap)),
  ) %>%
  ungroup()

summary_mean_income_results %>%
  mutate(
    `Median Relative Bias` = round(`Median Relative Bias`, digits = 3),
    RMSE = round(RMSE, digits = 3),
    `Median CI Overlap` = round(`Median CI Overlap`, digits = 3),
    `NAs in CI Overlap` = round(`NAs in CI Overlap`, digits = 0)
  ) %>%
  datatable()
summary_mean_income_results %>%
  filter(epsilon > 0.01) %>%
  filter(delta %in% c(NA, 0.000001)) %>%
  ggplot(aes(epsilon, `Median Relative Bias`)) +
  geom_hline(yintercept = 0, color = "red") +
  geom_line(aes(group = method), alpha = 0.2) +
  geom_point(aes(color = method), alpha = 0.4) +
  labs(title = "Median Relative Bias for Mean Income")

summary_mean_income_results %>%
  filter(epsilon > 0.01) %>%
  filter(delta %in% c(NA, 0.000001)) %>%
  ggplot(aes(epsilon, `Median CI Overlap`)) +
  geom_hline(yintercept = 1, color = "red") +  
  geom_line(aes(group = method), alpha = 0.2) +
  geom_point(aes(color = method), alpha = 0.4) +
  labs(title = "CI Overlap for Mean Income")

summary_mean_income_results %>%
  filter(epsilon > 0.01) %>%
  filter(delta %in% c(NA, 0.000001)) %>%
  ggplot(aes(epsilon, RMSE)) +
  geom_hline(yintercept = 0, color = "red") +  
  geom_line(aes(group = method), alpha = 0.2) +
  geom_point(aes(color = method), alpha = 0.4) +
  labs(title = "RMSE for Mean Income")

This table summarizes the results across all methods for means. Detailed data visualizations are in subsequent sections.

mean_results <- read_csv(here::here("results", "01_soi_means.csv"), guess_max = 4500)

mean_results %>%
  group_by(method, epsilon, delta) %>%
  summarize(
    `Median Bias` = median(bias), 
    RMSE = sqrt(mean(bias ^ 2)),
    `Median CI Overlap` = median(ci_overlap, na.rm = TRUE),
    `NAs in CI Overlap` = sum(is.na(ci_overlap)),
  ) %>%
  ungroup() %>%
  gt() %>%
  fmt_number(
    columns = c(`Median Bias`, RMSE),
    decimals = 1,
    use_seps = TRUE
  ) %>%
  fmt_number(columns = `Median CI Overlap`, decimals = 3) %>%
  fmt_number(columns = `NAs in CI Overlap`, decimals = 0)
method epsilon delta Median Bias RMSE Median CI Overlap NAs in CI Overlap
bhm 0.10 0.000000001 −2,746.7 15,797.6 0.526 1
bhm 0.10 0.000001000 −1,662.6 15,529.6 0.537 0
bhm 0.10 0.010000000 −4.1 5,235.6 0.581 0
bhm 0.50 0.000000001 123.8 3,952.2 0.630 1
bhm 0.50 0.000001000 −307.6 2,940.1 0.647 1
bhm 0.50 0.010000000 419.0 1,838.0 0.753 0
bhm 1.00 0.000000001 118.4 2,276.1 0.719 0
bhm 1.00 0.000001000 −58.1 1,833.9 0.751 0
bhm 1.00 0.010000000 185.4 1,231.7 0.849 0
bhm 5.00 0.000000001 58.4 891.4 0.886 0
bhm 5.00 0.000001000 154.3 837.7 0.900 0
bhm 5.00 0.010000000 −33.5 830.1 0.903 0
bhm 10.00 0.000000001 3.5 833.1 0.905 0
bhm 10.00 0.000001000 18.5 855.9 0.891 0
bhm 10.00 0.010000000 −182.6 851.3 0.903 0
bhm 15.00 0.000000001 −85.9 823.5 0.901 0
bhm 15.00 0.000001000 25.6 712.1 0.904 0
bhm 15.00 0.010000000 −57.8 855.9 0.894 0
bhm 20.00 0.000000001 124.9 755.2 0.916 0
bhm 20.00 0.000001000 −49.9 721.3 0.897 0
bhm 20.00 0.010000000 −119.0 863.5 0.901 0
NOISYMAD 0.01 NA −2,025.7 51,947.0 0.523 0
NOISYMAD 0.10 NA 152.6 4,777.8 0.723 0
NOISYMAD 0.50 NA −27.4 1,022.3 0.789 0
NOISYMAD 1.00 NA −18.1 436.1 0.724 0
NOISYMAD 5.00 NA 8.8 98.6 0.701 0
NOISYMAD 10.00 NA −5.8 40.3 0.700 0
NOISYMAD 15.00 NA 3.2 42.0 0.699 0
NOISYMAD 20.00 NA 0.4 24.6 0.699 0
NOISYVAR 0.01 NA −922.8 62,060.5 0.521 0
NOISYVAR 0.10 NA −564.6 5,380.1 0.694 0
NOISYVAR 0.50 NA 29.8 1,111.7 0.926 0
NOISYVAR 1.00 NA −56.3 513.6 0.966 0
NOISYVAR 5.00 NA 6.7 118.1 0.987 0
NOISYVAR 10.00 NA −3.6 57.5 0.989 0
NOISYVAR 15.00 NA −4.6 31.3 0.989 0
NOISYVAR 20.00 NA −3.1 27.3 0.989 0

Weighted Means

Means for the total population and subsets of the total population will be useful for publication and for exploratory data analysis before statistical modeling.

# unweighted mean salaries and wages
mean(puf$E00200)
## [1] 243369
# weighted mean salaries and wages
weighted.mean(puf$E00200, w = puf$S006)
## [1] 43443.95

Quantiles

No Noise

We calculate unweighted and weighted percentiles of income from salaries and wages.

percentiles <- seq(from = 0.1, to = 0.9, by = 0.1)

# unweighted percentiles
true_quantile <- quantile(x = puf$E00200, probs = percentiles)

Noisy Results

With quantile_smooth

quantile_results <- read_csv(here::here("results", "01_soi_quantiles.csv"))

quantile_results <- quantile_results %>%
  mutate(
    relative_bias =  bias / mean(true_quantile)
  )

quantile_results_summary <- quantile_results %>%
  group_by(method, epsilon, delta) %>%
  summarize(
    `Median bias` = median(bias),
    `Median relative bias` = median(relative_bias),
    RMSE = sqrt(mean(bias ^ 2))
    ) %>%
  ungroup()

quantile_results_summary %>%
  filter(epsilon > 0.01) %>%
  filter(delta %in% c(NA, 0.000001),
         epsilon < 100,
         epsilon >= 0.1) %>%
  ggplot(aes(epsilon, `Median relative bias`)) +
  geom_hline(yintercept = 0, color = "red") +
  geom_line(aes(group = method), alpha = 0.2) +
  geom_point(aes(color = method), alpha = 0.4) +
  facet_wrap(~method, scales = "free_y") +
  labs(
    title = "Median Relative Bias for Income Quantiles"
  )

quantile_results_summary %>%
  filter(epsilon > 0.01) %>%
  filter(delta %in% c(NA, 0.000001),
         epsilon < 100,
         epsilon >= 0.1) %>%
  ggplot(aes(epsilon, `Median bias`)) +
  geom_hline(yintercept = 0, color = "red") +  
  geom_line(aes(group = method), alpha = 0.2) +
  geom_point(aes(color = method), alpha = 0.4) +
  facet_wrap(~method, scales = "free_y") +  
  labs(
    title = "Median Bias for Income Quantiles"
  )

quantile_results_summary %>%
  filter(epsilon > 0.01) %>%
  filter(delta %in% c(NA, 0.000001),
         epsilon < 100,
         epsilon >= 0.1) %>%
  ggplot(aes(epsilon, RMSE)) +
  geom_hline(yintercept = 0, color = "red") +  
  geom_line(aes(group = method), alpha = 0.2) +
  geom_point(aes(color = method), alpha = 0.4) +
  facet_wrap(~method, scales = "free_y") +  
  labs(
    title = "RMSE for Income Quantiles"
  )

Without quantile_smooth

quantile_results_summary <- quantile_results_summary %>%
  filter(method != "quantile_smooth")

quantile_results_summary %>%
  filter(epsilon > 0.01) %>%
  filter(delta %in% c(NA, 0.000001),
         epsilon < 100,
         epsilon >= 0.1) %>%
  ggplot(aes(epsilon, `Median relative bias`)) +
  geom_hline(yintercept = 0, color = "red") +
  geom_line(aes(group = method), alpha = 0.2) +
  geom_point(aes(color = method), alpha = 0.4) +
  facet_wrap(~method) +
  labs(
    title = "Median Relative Bias for Income Quantiles"
  )

quantile_results_summary %>%
  filter(epsilon > 0.01) %>%
  filter(delta %in% c(NA, 0.000001),
         epsilon < 100,
         epsilon >= 0.1) %>%
  ggplot(aes(epsilon, `Median bias`)) +
  geom_hline(yintercept = 0, color = "red") +  
  geom_line(aes(group = method), alpha = 0.2) +
  geom_point(aes(color = method), alpha = 0.4) +
  facet_wrap(~method) +  
  labs(
    title = "Median Bias for Income Quantiles"
  )

quantile_results_summary %>%
  filter(epsilon > 0.01) %>%
  filter(delta %in% c(NA, 0.000001),
         epsilon < 100,
         epsilon >= 0.1) %>%
  ggplot(aes(epsilon, RMSE)) +
  geom_hline(yintercept = 0, color = "red") +  
  geom_line(aes(group = method), alpha = 0.2) +
  geom_point(aes(color = method), alpha = 0.4) +
  facet_wrap(~method) +  
  labs(
    title = "RMSE for Income Quantiles"
  )

Works Cited

  • Mortenson, Jacob A. and Andrew Whitten, 2020. “Bunching to Maximize Tax Credits: Evidence from Kinks in the US Tax Schedule.” American Economic Journal: Economic Policy 12(3): 402-432.