This document contains summary statistics and tabulations from the IRS SOI PUF.

Preprocessing

library(tidyverse)
library(janitor)
library(gt)
library(DT)

theme_set(theme_minimal())
theme_update(plot.title.position = "plot")

options(scipen = 999)

The weights in the SOI PUF have an implied decimal at two places. We divide by 100 to get rid of the implied decimal.

puf <- synthpuf::puf_2012 %>%
  mutate(S006 = S006 / 100)

Histograms

Mortenson Figure 1

Jacob A. Mortenson and Andrew Whitten (2020) released a paper called "Bunching to Maximize Tax Credits: Evidence from Kinks in the US Tax Schedule. It calculates bunching estimators on detailed histograms of univariate distributions.

Figure 1 presents earned income in $200-wide bins for single taxpayers with two children in 2014 from $0 to $30,000. The goal is to identify bunching around the first EITC kink point. We will attempt produce similar results with the 2012 SOI PUF. The results will likely be muted or not visible at all. Alternatively, we could pool SOI PUFs from 1996 to 2014.

Earned income will be calculated for filers with no dependent-status indicator who file as single or head of household. Dependents is the sum of exemptions for children living at home, exemptions for children living away from home, and exemptions for other dependents. Earned income is wage and salary income, positive net business income, and positive net farm income.

# subset to the sample of interest
sample_of_interest <- puf %>%
  filter(DSI == 0) %>%
  filter(MARS %in% c(1, 4)) %>%
  mutate(dependents = XOCAH + XOCAWH + XOODEP) %>%
  filter(dependents == 2)

# calculate earned income
sample_of_interest <- sample_of_interest %>%
  mutate(
    earned_income =
      pmax(E00200, 0) +
      pmax(E00900, 0) +
      pmax(E02100, 0) 
  ) %>%
  filter(earned_income <= 30000)

# count the number of samples
sum(sample_of_interest$earned_income < 30000)

## [1] 3872

# create a plot
sample_of_interest %>%
  filter(earned_income > 0) %>%
  ggplot(aes(earned_income)) +
  geom_histogram(binwidth = 200) +
  labs(title = "Earned Income for Single or Head of Household Returns with Two Children",
       subtitle = "Unweighted; Limited to Those with Earned Income < $30,000")

# calculate the weighted histogram
weighted_histogram <- sample_of_interest %>%  
  mutate(earned_income_group = cut(earned_income, 
                                   breaks = seq(-200, 30000, 200),
                                   labels = seq(0, 30000, 200))) %>%
  mutate(earned_income_group = as.numeric(as.character(earned_income_group))) %>%
  group_by(earned_income_group) %>%
  summarize(count = sum(S006))

# create a plot
weighted_histogram %>%
  filter(earned_income_group > 0) %>%
  ggplot(aes(earned_income_group, count)) +
  geom_line(alpha = 0.1) +
  geom_point() +
  scale_x_continuous(labels = scales::dollar) +
  labs(title = "Earned Income for Single or Head of Household Returns with Two Children",
       subtitle = "Weighted; Limited to Those with Earned Income < $30,000")

source(here::here("R", "prep_puf01.R"))

# prepare the data
sample_of_interest <- prep_puf01()

# values for specifications
breaks <- seq(-100, 30100, 200)

# no noise ----------------------------------------------------------------
true_histogram <- hist(sample_of_interest$E00200, breaks = breaks, plot = FALSE)$counts

Noisy Results

This table summarizes the results across all methods for histograms. Detailed data visualizations are in subsequent sections.

histogram_results <- read_csv(here::here("results", "01_soi_histograms.csv"), guess_max = 36000)

histogram_results <- histogram_results %>%
  mutate(
    relative_bias = bias /  mean(true_histogram)
  )

probs <- seq(0.1, 0.9, 0.1)

histogram_results_summary <- histogram_results %>%
  group_by(method, epsilon, delta) %>%
  summarize(
    q = probs, 
    relative_bias = quantile(relative_bias, probs = probs),
    bias = quantile(bias, probs = probs),
    rmse = quantile(rmse, probs = probs)
  ) %>%
  mutate(across(relative_bias:rmse, round, 3)) %>%
  ungroup()

histogram_results_summary %>%
  filter(q == 0.5) %>%
  datatable()

histogram_results_summary %>%
  filter(epsilon > 0.01) %>%
  filter(q == 0.5,
         delta %in% c(NA, 0.000001)) %>%
  ggplot(aes(epsilon, relative_bias)) +
  geom_line(aes(group = method), alpha = 0.2) +
  geom_point(aes(color = method), alpha = 0.4) +
  facet_wrap(~method, scales = "free_y") +
  labs(
    title = "Median Relative Bias for Income Histogram"
  )

histogram_results_summary %>%
  filter(epsilon > 0.01) %>%
  filter(delta %in% c(NA, 0.000001),
         q %in% c(0.1, 0.5, 0.9)) %>%
  ggplot(aes(factor(epsilon), relative_bias)) +
  geom_line(aes(group = factor(q)), alpha = 0.2) +
  geom_point(aes(color = factor(q)), alpha = 0.4) +
  facet_wrap(~method, scales = "free_y") +
  labs(
    title = "Relative Bias for Income Histogram",
    subtitle = "Selected percentiles"
  )

histogram_results_summary %>%
  filter(epsilon > 0.01) %>%
  filter(q == 0.5,
         delta %in% c(NA, 0.000001)) %>%
  ggplot(aes(epsilon, bias)) +
  geom_line(aes(group = method), alpha = 0.2) +
  geom_point(aes(color = method), alpha = 0.4) +
  facet_wrap(~method, scales = "free_y") +
  labs(
    title = "Median Bias for Income Histogram"
  )

histogram_results_summary %>%
  filter(epsilon > 0.01) %>%
  filter(delta %in% c(NA, 0.000001),
         q %in% c(0.1, 0.5, 0.9)) %>%
  ggplot(aes(factor(epsilon), bias)) +
  geom_line(aes(group = factor(q)), alpha = 0.2) +
  geom_point(aes(color = factor(q)), alpha = 0.4) +
  facet_wrap(~method, scales = "free_y") +
  labs(
    title = "Bias for Income Histogram",
    subtitle = "Selected Percentiles"
  )

histogram_results_summary %>%
  filter(epsilon > 0.01) %>%
  filter(q == 0.5,
         delta %in% c(NA, 0.000001)) %>%
  ggplot(aes(epsilon, rmse)) +
  geom_line(aes(group = method), alpha = 0.2) +
  geom_point(aes(color = method), alpha = 0.4) +
  facet_wrap(~method) +
  labs(
    title = "Median RMSE for Income Histogram"
  )

histogram_results_summary %>%
  filter(epsilon > 0.01) %>%
  filter(delta %in% c(NA, 0.000001),
         q %in% c(0.1, 0.5, 0.9)) %>%
  ggplot(aes(factor(epsilon), rmse)) +
  geom_line(aes(group = factor(q)), alpha = 0.2) +
  geom_point(aes(color = factor(q)), alpha = 0.4) +
  facet_wrap(~method, scales = "free_y") +
  labs(
    title = "RMSE for Income Histogram",
    subtitle = "Selected Percentiles"
  )

Means

No Noise

We calculate the unweighted and weighted mean of income from salaries and wages.

true_mean <- mean(x = puf$E00200)

true_mean

## [1] 243369

true_se <- sd(x = puf$E00200) / sqrt(length(puf$E00200))

true_se

## [1] 2536.975

true_ci <- c(true_mean - qnorm(0.975) * true_se, true_mean + qnorm(0.975) * true_se)

true_ci

## [1] 238396.6 248341.3

Noisy Results

This table summarizes the results across all methods for means.

mean_income_results <- read_csv(here::here("results", "01_soi_means.csv"), guess_max = 4500)

mean_income_results <- mean_income_results %>%
  mutate(
    noisy_mean = bias + true_mean,
    relative_bias = bias / true_mean#,
    #`Median Relative CI Length` = median(ci_length / ci_length_original)#,
  )

summary_mean_income_results <- mean_income_results %>%
  group_by(method, epsilon, delta) %>%
  summarize(
    `Median Relative Bias` = median(relative_bias), 
    RMSE = sqrt(mean(bias ^ 2)),
    `Median CI Overlap` = median(ci_overlap, na.rm = TRUE),
    `NAs in CI Overlap` = sum(is.na(ci_overlap)),
  ) %>%
  ungroup()

summary_mean_income_results %>%
  mutate(
    `Median Relative Bias` = round(`Median Relative Bias`, digits = 3),
    RMSE = round(RMSE, digits = 3),
    `Median CI Overlap` = round(`Median CI Overlap`, digits = 3),
    `NAs in CI Overlap` = round(`NAs in CI Overlap`, digits = 0)
  ) %>%
  datatable()

summary_mean_income_results %>%
  filter(epsilon > 0.01) %>%
  filter(delta %in% c(NA, 0.000001)) %>%
  ggplot(aes(epsilon, `Median Relative Bias`)) +
  geom_hline(yintercept = 0, color = "red") +
  geom_line(aes(group = method), alpha = 0.2) +
  geom_point(aes(color = method), alpha = 0.4) +
  labs(title = "Median Relative Bias for Mean Income")

summary_mean_income_results %>%
  filter(epsilon > 0.01) %>%
  filter(delta %in% c(NA, 0.000001)) %>%
  ggplot(aes(epsilon, `Median CI Overlap`)) +
  geom_hline(yintercept = 1, color = "red") +  
  geom_line(aes(group = method), alpha = 0.2) +
  geom_point(aes(color = method), alpha = 0.4) +
  labs(title = "CI Overlap for Mean Income")

summary_mean_income_results %>%
  filter(epsilon > 0.01) %>%
  filter(delta %in% c(NA, 0.000001)) %>%
  ggplot(aes(epsilon, RMSE)) +
  geom_hline(yintercept = 0, color = "red") +  
  geom_line(aes(group = method), alpha = 0.2) +
  geom_point(aes(color = method), alpha = 0.4) +
  labs(title = "RMSE for Mean Income")

This table summarizes the results across all methods for means. Detailed data visualizations are in subsequent sections.

mean_results <- read_csv(here::here("results", "01_soi_means.csv"), guess_max = 4500)

mean_results %>%
  group_by(method, epsilon, delta) %>%
  summarize(
    `Median Bias` = median(bias), 
    RMSE = sqrt(mean(bias ^ 2)),
    `Median CI Overlap` = median(ci_overlap, na.rm = TRUE),
    `NAs in CI Overlap` = sum(is.na(ci_overlap)),
  ) %>%
  ungroup() %>%
  gt() %>%
  fmt_number(
    columns = c(`Median Bias`, RMSE),
    decimals = 1,
    use_seps = TRUE
  ) %>%
  fmt_number(columns = `Median CI Overlap`, decimals = 3) %>%
  fmt_number(columns = `NAs in CI Overlap`, decimals = 0)

method	epsilon	delta	Median Bias	RMSE	Median CI Overlap	NAs in CI Overlap
bhm	0.10	0.000000001	−2,746.7	15,797.6	0.526	1
bhm	0.10	0.000001000	−1,662.6	15,529.6	0.537	0
bhm	0.10	0.010000000	−4.1	5,235.6	0.581	0
bhm	0.50	0.000000001	123.8	3,952.2	0.630	1
bhm	0.50	0.000001000	−307.6	2,940.1	0.647	1
bhm	0.50	0.010000000	419.0	1,838.0	0.753	0
bhm	1.00	0.000000001	118.4	2,276.1	0.719	0
bhm	1.00	0.000001000	−58.1	1,833.9	0.751	0
bhm	1.00	0.010000000	185.4	1,231.7	0.849	0
bhm	5.00	0.000000001	58.4	891.4	0.886	0
bhm	5.00	0.000001000	154.3	837.7	0.900	0
bhm	5.00	0.010000000	−33.5	830.1	0.903	0
bhm	10.00	0.000000001	3.5	833.1	0.905	0
bhm	10.00	0.000001000	18.5	855.9	0.891	0
bhm	10.00	0.010000000	−182.6	851.3	0.903	0
bhm	15.00	0.000000001	−85.9	823.5	0.901	0
bhm	15.00	0.000001000	25.6	712.1	0.904	0
bhm	15.00	0.010000000	−57.8	855.9	0.894	0
bhm	20.00	0.000000001	124.9	755.2	0.916	0
bhm	20.00	0.000001000	−49.9	721.3	0.897	0
bhm	20.00	0.010000000	−119.0	863.5	0.901	0
NOISYMAD	0.01	NA	−2,025.7	51,947.0	0.523	0
NOISYMAD	0.10	NA	152.6	4,777.8	0.723	0
NOISYMAD	0.50	NA	−27.4	1,022.3	0.789	0
NOISYMAD	1.00	NA	−18.1	436.1	0.724	0
NOISYMAD	5.00	NA	8.8	98.6	0.701	0
NOISYMAD	10.00	NA	−5.8	40.3	0.700	0
NOISYMAD	15.00	NA	3.2	42.0	0.699	0
NOISYMAD	20.00	NA	0.4	24.6	0.699	0
NOISYVAR	0.01	NA	−922.8	62,060.5	0.521	0
NOISYVAR	0.10	NA	−564.6	5,380.1	0.694	0
NOISYVAR	0.50	NA	29.8	1,111.7	0.926	0
NOISYVAR	1.00	NA	−56.3	513.6	0.966	0
NOISYVAR	5.00	NA	6.7	118.1	0.987	0
NOISYVAR	10.00	NA	−3.6	57.5	0.989	0
NOISYVAR	15.00	NA	−4.6	31.3	0.989	0
NOISYVAR	20.00	NA	−3.1	27.3	0.989	0

Weighted Means

Means for the total population and subsets of the total population will be useful for publication and for exploratory data analysis before statistical modeling.

# unweighted mean salaries and wages
mean(puf$E00200)

## [1] 243369

# weighted mean salaries and wages
weighted.mean(puf$E00200, w = puf$S006)

## [1] 43443.95

Quantiles

No Noise

We calculate unweighted and weighted percentiles of income from salaries and wages.

percentiles <- seq(from = 0.1, to = 0.9, by = 0.1)

# unweighted percentiles
true_quantile <- quantile(x = puf$E00200, probs = percentiles)

Noisy Results

With quantile_smooth

quantile_results <- read_csv(here::here("results", "01_soi_quantiles.csv"))

quantile_results <- quantile_results %>%
  mutate(
    relative_bias =  bias / mean(true_quantile)
  )

quantile_results_summary <- quantile_results %>%
  group_by(method, epsilon, delta) %>%
  summarize(
    `Median bias` = median(bias),
    `Median relative bias` = median(relative_bias),
    RMSE = sqrt(mean(bias ^ 2))
    ) %>%
  ungroup()

quantile_results_summary %>%
  filter(epsilon > 0.01) %>%
  filter(delta %in% c(NA, 0.000001),
         epsilon < 100,
         epsilon >= 0.1) %>%
  ggplot(aes(epsilon, `Median relative bias`)) +
  geom_hline(yintercept = 0, color = "red") +
  geom_line(aes(group = method), alpha = 0.2) +
  geom_point(aes(color = method), alpha = 0.4) +
  facet_wrap(~method, scales = "free_y") +
  labs(
    title = "Median Relative Bias for Income Quantiles"
  )

quantile_results_summary %>%
  filter(epsilon > 0.01) %>%
  filter(delta %in% c(NA, 0.000001),
         epsilon < 100,
         epsilon >= 0.1) %>%
  ggplot(aes(epsilon, `Median bias`)) +
  geom_hline(yintercept = 0, color = "red") +  
  geom_line(aes(group = method), alpha = 0.2) +
  geom_point(aes(color = method), alpha = 0.4) +
  facet_wrap(~method, scales = "free_y") +  
  labs(
    title = "Median Bias for Income Quantiles"
  )

quantile_results_summary %>%
  filter(epsilon > 0.01) %>%
  filter(delta %in% c(NA, 0.000001),
         epsilon < 100,
         epsilon >= 0.1) %>%
  ggplot(aes(epsilon, RMSE)) +
  geom_hline(yintercept = 0, color = "red") +  
  geom_line(aes(group = method), alpha = 0.2) +
  geom_point(aes(color = method), alpha = 0.4) +
  facet_wrap(~method, scales = "free_y") +  
  labs(
    title = "RMSE for Income Quantiles"
  )

Without quantile_smooth

quantile_results_summary <- quantile_results_summary %>%
  filter(method != "quantile_smooth")

quantile_results_summary %>%
  filter(epsilon > 0.01) %>%
  filter(delta %in% c(NA, 0.000001),
         epsilon < 100,
         epsilon >= 0.1) %>%
  ggplot(aes(epsilon, `Median relative bias`)) +
  geom_hline(yintercept = 0, color = "red") +
  geom_line(aes(group = method), alpha = 0.2) +
  geom_point(aes(color = method), alpha = 0.4) +
  facet_wrap(~method) +
  labs(
    title = "Median Relative Bias for Income Quantiles"
  )

quantile_results_summary %>%
  filter(epsilon > 0.01) %>%
  filter(delta %in% c(NA, 0.000001),
         epsilon < 100,
         epsilon >= 0.1) %>%
  ggplot(aes(epsilon, `Median bias`)) +
  geom_hline(yintercept = 0, color = "red") +  
  geom_line(aes(group = method), alpha = 0.2) +
  geom_point(aes(color = method), alpha = 0.4) +
  facet_wrap(~method) +  
  labs(
    title = "Median Bias for Income Quantiles"
  )

quantile_results_summary %>%
  filter(epsilon > 0.01) %>%
  filter(delta %in% c(NA, 0.000001),
         epsilon < 100,
         epsilon >= 0.1) %>%
  ggplot(aes(epsilon, RMSE)) +
  geom_hline(yintercept = 0, color = "red") +  
  geom_line(aes(group = method), alpha = 0.2) +
  geom_point(aes(color = method), alpha = 0.4) +
  facet_wrap(~method) +  
  labs(
    title = "RMSE for Income Quantiles"
  )

Works Cited

Mortenson, Jacob A. and Andrew Whitten, 2020. “Bunching to Maximize Tax Credits: Evidence from Kinks in the US Tax Schedule.” American Economic Journal: Economic Policy 12(3): 402-432.

IRS PUF Summary Statistics and Tabulations

Preprocessing

Histograms

Mortenson Figure 1

Noisy Results

Means

No Noise

Noisy Results

Weighted Means

Quantiles

No Noise

Noisy Results

With quantile_smooth

Without quantile_smooth

Works Cited