This document contains summary statistics and tabulations from the IRS SOI PUF.
library(tidyverse)
library(janitor)
library(gt)
library(DT)
theme_set(theme_minimal())
theme_update(plot.title.position = "plot")
options(scipen = 999)
The weights in the SOI PUF have an implied decimal at two places. We divide by 100 to get rid of the implied decimal.
puf <- synthpuf::puf_2012 %>%
mutate(S006 = S006 / 100)
Jacob A. Mortenson and Andrew Whitten (2020) released a paper called "Bunching to Maximize Tax Credits: Evidence from Kinks in the US Tax Schedule. It calculates bunching estimators on detailed histograms of univariate distributions.
Figure 1 presents earned income in $200-wide bins for single taxpayers with two children in 2014 from $0 to $30,000. The goal is to identify bunching around the first EITC kink point. We will attempt produce similar results with the 2012 SOI PUF. The results will likely be muted or not visible at all. Alternatively, we could pool SOI PUFs from 1996 to 2014.
Earned income will be calculated for filers with no dependent-status indicator who file as single or head of household. Dependents is the sum of exemptions for children living at home, exemptions for children living away from home, and exemptions for other dependents. Earned income is wage and salary income, positive net business income, and positive net farm income.
# subset to the sample of interest
sample_of_interest <- puf %>%
filter(DSI == 0) %>%
filter(MARS %in% c(1, 4)) %>%
mutate(dependents = XOCAH + XOCAWH + XOODEP) %>%
filter(dependents == 2)
# calculate earned income
sample_of_interest <- sample_of_interest %>%
mutate(
earned_income =
pmax(E00200, 0) +
pmax(E00900, 0) +
pmax(E02100, 0)
) %>%
filter(earned_income <= 30000)
# count the number of samples
sum(sample_of_interest$earned_income < 30000)
## [1] 3872
# create a plot
sample_of_interest %>%
filter(earned_income > 0) %>%
ggplot(aes(earned_income)) +
geom_histogram(binwidth = 200) +
labs(title = "Earned Income for Single or Head of Household Returns with Two Children",
subtitle = "Unweighted; Limited to Those with Earned Income < $30,000")
# calculate the weighted histogram
weighted_histogram <- sample_of_interest %>%
mutate(earned_income_group = cut(earned_income,
breaks = seq(-200, 30000, 200),
labels = seq(0, 30000, 200))) %>%
mutate(earned_income_group = as.numeric(as.character(earned_income_group))) %>%
group_by(earned_income_group) %>%
summarize(count = sum(S006))
# create a plot
weighted_histogram %>%
filter(earned_income_group > 0) %>%
ggplot(aes(earned_income_group, count)) +
geom_line(alpha = 0.1) +
geom_point() +
scale_x_continuous(labels = scales::dollar) +
labs(title = "Earned Income for Single or Head of Household Returns with Two Children",
subtitle = "Weighted; Limited to Those with Earned Income < $30,000")
source(here::here("R", "prep_puf01.R"))
# prepare the data
sample_of_interest <- prep_puf01()
# values for specifications
breaks <- seq(-100, 30100, 200)
# no noise ----------------------------------------------------------------
true_histogram <- hist(sample_of_interest$E00200, breaks = breaks, plot = FALSE)$counts
This table summarizes the results across all methods for histograms. Detailed data visualizations are in subsequent sections.
histogram_results <- read_csv(here::here("results", "01_soi_histograms.csv"), guess_max = 36000)
histogram_results <- histogram_results %>%
mutate(
relative_bias = bias / mean(true_histogram)
)
probs <- seq(0.1, 0.9, 0.1)
histogram_results_summary <- histogram_results %>%
group_by(method, epsilon, delta) %>%
summarize(
q = probs,
relative_bias = quantile(relative_bias, probs = probs),
bias = quantile(bias, probs = probs),
rmse = quantile(rmse, probs = probs)
) %>%
mutate(across(relative_bias:rmse, round, 3)) %>%
ungroup()
histogram_results_summary %>%
filter(q == 0.5) %>%
datatable()
histogram_results_summary %>%
filter(epsilon > 0.01) %>%
filter(q == 0.5,
delta %in% c(NA, 0.000001)) %>%
ggplot(aes(epsilon, relative_bias)) +
geom_line(aes(group = method), alpha = 0.2) +
geom_point(aes(color = method), alpha = 0.4) +
facet_wrap(~method, scales = "free_y") +
labs(
title = "Median Relative Bias for Income Histogram"
)
histogram_results_summary %>%
filter(epsilon > 0.01) %>%
filter(delta %in% c(NA, 0.000001),
q %in% c(0.1, 0.5, 0.9)) %>%
ggplot(aes(factor(epsilon), relative_bias)) +
geom_line(aes(group = factor(q)), alpha = 0.2) +
geom_point(aes(color = factor(q)), alpha = 0.4) +
facet_wrap(~method, scales = "free_y") +
labs(
title = "Relative Bias for Income Histogram",
subtitle = "Selected percentiles"
)
histogram_results_summary %>%
filter(epsilon > 0.01) %>%
filter(q == 0.5,
delta %in% c(NA, 0.000001)) %>%
ggplot(aes(epsilon, bias)) +
geom_line(aes(group = method), alpha = 0.2) +
geom_point(aes(color = method), alpha = 0.4) +
facet_wrap(~method, scales = "free_y") +
labs(
title = "Median Bias for Income Histogram"
)
histogram_results_summary %>%
filter(epsilon > 0.01) %>%
filter(delta %in% c(NA, 0.000001),
q %in% c(0.1, 0.5, 0.9)) %>%
ggplot(aes(factor(epsilon), bias)) +
geom_line(aes(group = factor(q)), alpha = 0.2) +
geom_point(aes(color = factor(q)), alpha = 0.4) +
facet_wrap(~method, scales = "free_y") +
labs(
title = "Bias for Income Histogram",
subtitle = "Selected Percentiles"
)
histogram_results_summary %>%
filter(epsilon > 0.01) %>%
filter(q == 0.5,
delta %in% c(NA, 0.000001)) %>%
ggplot(aes(epsilon, rmse)) +
geom_line(aes(group = method), alpha = 0.2) +
geom_point(aes(color = method), alpha = 0.4) +
facet_wrap(~method) +
labs(
title = "Median RMSE for Income Histogram"
)
histogram_results_summary %>%
filter(epsilon > 0.01) %>%
filter(delta %in% c(NA, 0.000001),
q %in% c(0.1, 0.5, 0.9)) %>%
ggplot(aes(factor(epsilon), rmse)) +
geom_line(aes(group = factor(q)), alpha = 0.2) +
geom_point(aes(color = factor(q)), alpha = 0.4) +
facet_wrap(~method, scales = "free_y") +
labs(
title = "RMSE for Income Histogram",
subtitle = "Selected Percentiles"
)
We calculate the unweighted and weighted mean of income from salaries and wages.
true_mean <- mean(x = puf$E00200)
true_mean
## [1] 243369
true_se <- sd(x = puf$E00200) / sqrt(length(puf$E00200))
true_se
## [1] 2536.975
true_ci <- c(true_mean - qnorm(0.975) * true_se, true_mean + qnorm(0.975) * true_se)
true_ci
## [1] 238396.6 248341.3
This table summarizes the results across all methods for means.
mean_income_results <- read_csv(here::here("results", "01_soi_means.csv"), guess_max = 4500)
mean_income_results <- mean_income_results %>%
mutate(
noisy_mean = bias + true_mean,
relative_bias = bias / true_mean#,
#`Median Relative CI Length` = median(ci_length / ci_length_original)#,
)
summary_mean_income_results <- mean_income_results %>%
group_by(method, epsilon, delta) %>%
summarize(
`Median Relative Bias` = median(relative_bias),
RMSE = sqrt(mean(bias ^ 2)),
`Median CI Overlap` = median(ci_overlap, na.rm = TRUE),
`NAs in CI Overlap` = sum(is.na(ci_overlap)),
) %>%
ungroup()
summary_mean_income_results %>%
mutate(
`Median Relative Bias` = round(`Median Relative Bias`, digits = 3),
RMSE = round(RMSE, digits = 3),
`Median CI Overlap` = round(`Median CI Overlap`, digits = 3),
`NAs in CI Overlap` = round(`NAs in CI Overlap`, digits = 0)
) %>%
datatable()
summary_mean_income_results %>%
filter(epsilon > 0.01) %>%
filter(delta %in% c(NA, 0.000001)) %>%
ggplot(aes(epsilon, `Median Relative Bias`)) +
geom_hline(yintercept = 0, color = "red") +
geom_line(aes(group = method), alpha = 0.2) +
geom_point(aes(color = method), alpha = 0.4) +
labs(title = "Median Relative Bias for Mean Income")
summary_mean_income_results %>%
filter(epsilon > 0.01) %>%
filter(delta %in% c(NA, 0.000001)) %>%
ggplot(aes(epsilon, `Median CI Overlap`)) +
geom_hline(yintercept = 1, color = "red") +
geom_line(aes(group = method), alpha = 0.2) +
geom_point(aes(color = method), alpha = 0.4) +
labs(title = "CI Overlap for Mean Income")
summary_mean_income_results %>%
filter(epsilon > 0.01) %>%
filter(delta %in% c(NA, 0.000001)) %>%
ggplot(aes(epsilon, RMSE)) +
geom_hline(yintercept = 0, color = "red") +
geom_line(aes(group = method), alpha = 0.2) +
geom_point(aes(color = method), alpha = 0.4) +
labs(title = "RMSE for Mean Income")
This table summarizes the results across all methods for means. Detailed data visualizations are in subsequent sections.
mean_results <- read_csv(here::here("results", "01_soi_means.csv"), guess_max = 4500)
mean_results %>%
group_by(method, epsilon, delta) %>%
summarize(
`Median Bias` = median(bias),
RMSE = sqrt(mean(bias ^ 2)),
`Median CI Overlap` = median(ci_overlap, na.rm = TRUE),
`NAs in CI Overlap` = sum(is.na(ci_overlap)),
) %>%
ungroup() %>%
gt() %>%
fmt_number(
columns = c(`Median Bias`, RMSE),
decimals = 1,
use_seps = TRUE
) %>%
fmt_number(columns = `Median CI Overlap`, decimals = 3) %>%
fmt_number(columns = `NAs in CI Overlap`, decimals = 0)
method | epsilon | delta | Median Bias | RMSE | Median CI Overlap | NAs in CI Overlap |
---|---|---|---|---|---|---|
bhm | 0.10 | 0.000000001 | −2,746.7 | 15,797.6 | 0.526 | 1 |
bhm | 0.10 | 0.000001000 | −1,662.6 | 15,529.6 | 0.537 | 0 |
bhm | 0.10 | 0.010000000 | −4.1 | 5,235.6 | 0.581 | 0 |
bhm | 0.50 | 0.000000001 | 123.8 | 3,952.2 | 0.630 | 1 |
bhm | 0.50 | 0.000001000 | −307.6 | 2,940.1 | 0.647 | 1 |
bhm | 0.50 | 0.010000000 | 419.0 | 1,838.0 | 0.753 | 0 |
bhm | 1.00 | 0.000000001 | 118.4 | 2,276.1 | 0.719 | 0 |
bhm | 1.00 | 0.000001000 | −58.1 | 1,833.9 | 0.751 | 0 |
bhm | 1.00 | 0.010000000 | 185.4 | 1,231.7 | 0.849 | 0 |
bhm | 5.00 | 0.000000001 | 58.4 | 891.4 | 0.886 | 0 |
bhm | 5.00 | 0.000001000 | 154.3 | 837.7 | 0.900 | 0 |
bhm | 5.00 | 0.010000000 | −33.5 | 830.1 | 0.903 | 0 |
bhm | 10.00 | 0.000000001 | 3.5 | 833.1 | 0.905 | 0 |
bhm | 10.00 | 0.000001000 | 18.5 | 855.9 | 0.891 | 0 |
bhm | 10.00 | 0.010000000 | −182.6 | 851.3 | 0.903 | 0 |
bhm | 15.00 | 0.000000001 | −85.9 | 823.5 | 0.901 | 0 |
bhm | 15.00 | 0.000001000 | 25.6 | 712.1 | 0.904 | 0 |
bhm | 15.00 | 0.010000000 | −57.8 | 855.9 | 0.894 | 0 |
bhm | 20.00 | 0.000000001 | 124.9 | 755.2 | 0.916 | 0 |
bhm | 20.00 | 0.000001000 | −49.9 | 721.3 | 0.897 | 0 |
bhm | 20.00 | 0.010000000 | −119.0 | 863.5 | 0.901 | 0 |
NOISYMAD | 0.01 | NA | −2,025.7 | 51,947.0 | 0.523 | 0 |
NOISYMAD | 0.10 | NA | 152.6 | 4,777.8 | 0.723 | 0 |
NOISYMAD | 0.50 | NA | −27.4 | 1,022.3 | 0.789 | 0 |
NOISYMAD | 1.00 | NA | −18.1 | 436.1 | 0.724 | 0 |
NOISYMAD | 5.00 | NA | 8.8 | 98.6 | 0.701 | 0 |
NOISYMAD | 10.00 | NA | −5.8 | 40.3 | 0.700 | 0 |
NOISYMAD | 15.00 | NA | 3.2 | 42.0 | 0.699 | 0 |
NOISYMAD | 20.00 | NA | 0.4 | 24.6 | 0.699 | 0 |
NOISYVAR | 0.01 | NA | −922.8 | 62,060.5 | 0.521 | 0 |
NOISYVAR | 0.10 | NA | −564.6 | 5,380.1 | 0.694 | 0 |
NOISYVAR | 0.50 | NA | 29.8 | 1,111.7 | 0.926 | 0 |
NOISYVAR | 1.00 | NA | −56.3 | 513.6 | 0.966 | 0 |
NOISYVAR | 5.00 | NA | 6.7 | 118.1 | 0.987 | 0 |
NOISYVAR | 10.00 | NA | −3.6 | 57.5 | 0.989 | 0 |
NOISYVAR | 15.00 | NA | −4.6 | 31.3 | 0.989 | 0 |
NOISYVAR | 20.00 | NA | −3.1 | 27.3 | 0.989 | 0 |
Means for the total population and subsets of the total population will be useful for publication and for exploratory data analysis before statistical modeling.
# unweighted mean salaries and wages
mean(puf$E00200)
## [1] 243369
# weighted mean salaries and wages
weighted.mean(puf$E00200, w = puf$S006)
## [1] 43443.95
We calculate unweighted and weighted percentiles of income from salaries and wages.
percentiles <- seq(from = 0.1, to = 0.9, by = 0.1)
# unweighted percentiles
true_quantile <- quantile(x = puf$E00200, probs = percentiles)
quantile_results <- read_csv(here::here("results", "01_soi_quantiles.csv"))
quantile_results <- quantile_results %>%
mutate(
relative_bias = bias / mean(true_quantile)
)
quantile_results_summary <- quantile_results %>%
group_by(method, epsilon, delta) %>%
summarize(
`Median bias` = median(bias),
`Median relative bias` = median(relative_bias),
RMSE = sqrt(mean(bias ^ 2))
) %>%
ungroup()
quantile_results_summary %>%
filter(epsilon > 0.01) %>%
filter(delta %in% c(NA, 0.000001),
epsilon < 100,
epsilon >= 0.1) %>%
ggplot(aes(epsilon, `Median relative bias`)) +
geom_hline(yintercept = 0, color = "red") +
geom_line(aes(group = method), alpha = 0.2) +
geom_point(aes(color = method), alpha = 0.4) +
facet_wrap(~method, scales = "free_y") +
labs(
title = "Median Relative Bias for Income Quantiles"
)
quantile_results_summary %>%
filter(epsilon > 0.01) %>%
filter(delta %in% c(NA, 0.000001),
epsilon < 100,
epsilon >= 0.1) %>%
ggplot(aes(epsilon, `Median bias`)) +
geom_hline(yintercept = 0, color = "red") +
geom_line(aes(group = method), alpha = 0.2) +
geom_point(aes(color = method), alpha = 0.4) +
facet_wrap(~method, scales = "free_y") +
labs(
title = "Median Bias for Income Quantiles"
)
quantile_results_summary %>%
filter(epsilon > 0.01) %>%
filter(delta %in% c(NA, 0.000001),
epsilon < 100,
epsilon >= 0.1) %>%
ggplot(aes(epsilon, RMSE)) +
geom_hline(yintercept = 0, color = "red") +
geom_line(aes(group = method), alpha = 0.2) +
geom_point(aes(color = method), alpha = 0.4) +
facet_wrap(~method, scales = "free_y") +
labs(
title = "RMSE for Income Quantiles"
)
quantile_results_summary <- quantile_results_summary %>%
filter(method != "quantile_smooth")
quantile_results_summary %>%
filter(epsilon > 0.01) %>%
filter(delta %in% c(NA, 0.000001),
epsilon < 100,
epsilon >= 0.1) %>%
ggplot(aes(epsilon, `Median relative bias`)) +
geom_hline(yintercept = 0, color = "red") +
geom_line(aes(group = method), alpha = 0.2) +
geom_point(aes(color = method), alpha = 0.4) +
facet_wrap(~method) +
labs(
title = "Median Relative Bias for Income Quantiles"
)
quantile_results_summary %>%
filter(epsilon > 0.01) %>%
filter(delta %in% c(NA, 0.000001),
epsilon < 100,
epsilon >= 0.1) %>%
ggplot(aes(epsilon, `Median bias`)) +
geom_hline(yintercept = 0, color = "red") +
geom_line(aes(group = method), alpha = 0.2) +
geom_point(aes(color = method), alpha = 0.4) +
facet_wrap(~method) +
labs(
title = "Median Bias for Income Quantiles"
)
quantile_results_summary %>%
filter(epsilon > 0.01) %>%
filter(delta %in% c(NA, 0.000001),
epsilon < 100,
epsilon >= 0.1) %>%
ggplot(aes(epsilon, RMSE)) +
geom_hline(yintercept = 0, color = "red") +
geom_line(aes(group = method), alpha = 0.2) +
geom_point(aes(color = method), alpha = 0.4) +
facet_wrap(~method) +
labs(
title = "RMSE for Income Quantiles"
)