diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29..e5bda1c 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,4 @@ +- bump: minor + changes: + updated: + - Introduction, Background and Data sections with more detailed literature review. diff --git a/paper/bibliography/references.bib b/paper/bibliography/references.bib index d59c202..6b0b7ab 100644 --- a/paper/bibliography/references.bib +++ b/paper/bibliography/references.bib @@ -8,6 +8,36 @@ @techreport{advani2020capital series={Warwick Economics Research Papers} } +@article{bishop1994mixture, + title = {Mixture density networks}, + author = {Bishop, Christopher M.}, + journal = {Neural Computing Research Group Report}, + institution = {Aston University}, + number = {NCRG/94/004}, + year = {1994} +} + +@incollection{bourguignon2006microsimulation, + title = {Microsimulation as a tool for evaluating redistribution policies}, + author = {Bourguignon, Fran{\c{c}}ois and Spadaro, Amedeo}, + booktitle = {Journal of Economic Inequality}, + volume = {4}, + number = {1}, + pages = {77--106}, + year = {2006}, + publisher = {Springer} +} + +@article{dugoff2014generalizing, + title = {Generalizing observational study results: Applying propensity score methods to complex surveys}, + author = {DuGoff, Eva H. and Schuler, Megan and Stuart, Elizabeth A.}, + journal = {Health Services Research}, + volume = {49}, + number = {1}, + pages = {284--303}, + year = {2014} +} + @article{advani2023measuring, title={Measuring top income shares in the {UK}}, author={Advani, Arun and Summers, Andy and Tarrant, Hannah}, @@ -161,6 +191,13 @@ @incollection{dempster1983introduction year = {1983} } +@book{dorazio2006statistical, + title = {Statistical Matching: Theory and Practice}, + author = {D'Orazio, Marcello and Di Zio, Marco and Scanu, Mauro}, + publisher = {John Wiley \& Sons}, + year = {2006} +} + @article{dorazio2021statistical, title = {Statistical matching and imputation of survey data with R}, author = {D'Orazio, Marcello and Di Zio, Marco and Scanu, Mauro}, @@ -351,6 +388,30 @@ @misc{ons2019using url = {https://www.ons.gov.uk/peoplepopulationandcommunity/personalandhouseholdfinances/incomeandwealth/articles/usingtaxdatatobettercapturetopearnersinhouseholdincomeinequalitystatistics/2019-02-26} } +@techreport{nunns2012tax, + title = {How {TPC} distributes the corporate income tax}, + author = {Nunns, James R.}, + institution = {Urban-Brookings Tax Policy Center}, + year = {2012}, + note = {Technical documentation of the Tax Policy Center microsimulation model} +} + +@misc{itep2023model, + title = {{ITEP} Tax Microsimulation Model Overview}, + author = {{Institute on Taxation and Economic Policy}}, + year = {2023}, + url = {https://itep.org/itep-tax-model/} +} + +@article{abowd2019census, + title = {The {U.S. Census Bureau} adopts differential privacy}, + author = {Abowd, John M.}, + journal = {Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery \& Data Mining}, + pages = {2867--2867}, + year = {2018}, + note = {Discusses challenges with administrative data linkage and privacy} +} + @techreport{ota2012revenue, title = {Revenue estimating models at the U.S. 
Treasury Department}, author = {{Office of Tax Analysis}}, @@ -375,6 +436,13 @@ @misc{policyengine2025microimpute url = {https://policyengine.github.io/microimpute/} } +@misc{pytorch_tabular, + title = {PyTorch Tabular: A Framework for Deep Learning with Tabular Data}, + author = {Joseph, Manu}, + year = {2021}, + url = {https://github.com/manujosephv/pytorch_tabular} +} + @book{rubin1987multiple, title = {Multiple imputation for nonresponse in surveys}, author = {Rubin, Donald B.}, @@ -392,6 +460,16 @@ @article{siddique2008multiple year = {2008} } +@article{sutherland2013euromod, + title = {{EUROMOD}: The {European Union} tax-benefit microsimulation model}, + author = {Sutherland, Holly and Figari, Francesco}, + journal = {International Journal of Microsimulation}, + volume = {6}, + number = {1}, + pages = {4--26}, + year = {2013} +} + @article{stuart2009multiple, title = {Multiple imputation with large data sets: A case study of the Children's Mental Health Initiative}, author = {Stuart, Elizabeth A. and Azur, Melissa and Frangakis, Constantine and Leaf, Philip}, diff --git a/paper/main.pdf b/paper/main.pdf index 0389616..0c1d103 100644 Binary files a/paper/main.pdf and b/paper/main.pdf differ diff --git a/paper/sections/background.tex b/paper/sections/background.tex index 41188ec..5dde698 100644 --- a/paper/sections/background.tex +++ b/paper/sections/background.tex @@ -1,78 +1,196 @@ \section{Background} -Effective imputation requires understanding both the data's distributional properties and the techniques available to handle them. This section first explores the statistical properties of wealth microdata that challenge imputation. It then reviews the literature on microdata imputation methods, tracing their development and practical applications. +This section establishes the theoretical foundations for statistical matching and reviews the imputation methods implemented in \texttt{microimpute}. We begin with the formal problem definition and the key assumption underlying all statistical matching procedures, then discuss the relationship to missing data mechanisms, and finally describe each of the five imputation methods. -\subsection{Statistical properties of wealth distributions and imputation challenges} +\subsection{The Statistical Matching Problem} -Wealth microdata present unique statistical challenges that can render innacurate policy analyses when using traditional imputation methods. +Statistical matching arises when a researcher seeks to combine information from two (or more) data sources that have been collected independently on different samples \citep{dorazio2006statistical}. The canonical setup involves: +\begin{itemize} + \item A \textbf{donor file} $\mathcal{D} = \{(x_i, y_i)\}_{i=1}^{n_D}$ containing $n_D$ observations with common variables $X$ and target variable(s) $Y$ + \item A \textbf{receiver file} $\mathcal{R} = \{x_j\}_{j=1}^{n_R}$ containing $n_R$ observations with only the common variables $X$ +\end{itemize} + +The goal is to impute values $\hat{y}_j$ for each observation $j$ in the receiver file, creating a synthetic file that combines the variables from both sources. When the receiver file also contains variables $Z$ not present in the donor file, the resulting synthetic file contains all three sets of variables $(X, Y, Z)$, though $Y$ and $Z$ are never jointly observed. + +Formally, statistical matching seeks to estimate the conditional distribution $P(Y|X)$ from the donor data and use it to generate imputations for the receiver observations. 
Let $\hat{P}(Y|X=x)$ denote an estimate of this conditional distribution. For a receiver observation with covariates $x_j$, we can either: \begin{enumerate} - \item \textbf{High skewness and concentration}: Wealth distributions are typically right-skewed, with a small percentage of households holding a large share of total net worth \citep{chen2020imputation}. This concentration means that imputation models assuming normality can perform poorly, biasing estimates of wealth aggregates and inequality \citep{lun2019multiple}. - \item \textbf{Outliers and extreme values}: Legitimate extreme values are common and can unduly influence parametric imputation models. Robust methods or data transformations are often necessary \citep{chen2020imputation}. - \item \textbf{Non-linear relationships}: Wealth's relationship with predictor variables such as age, education, and income is highly non-linear \citep{zillow2024quantile}, requiring more flexible imputation methods. + \item Predict a specific quantile, such as the conditional median: $\hat{y}_j = \hat{Q}_{0.5}(Y|X=x_j)$ + \item Draw a random value from the estimated distribution: $\hat{y}_j \sim \hat{P}(Y|X=x_j)$ \end{enumerate} -\subsection{Traditional microdata imputation methods} +The first approach (deterministic imputation) may be appropriate when only point predictions are needed, though it systematically underestimates variance \citep{rubin1987multiple}. The second approach (stochastic imputation) preserves the variability of $Y$ and is preferred when distributional properties must be maintained. This is often the case in microsimulation modeling, where full distributional policy analysis is required. + +\subsection{The Conditional Independence Assumption} + +All statistical matching procedures rely, either explicitly or implicitly, on the Conditional Independence Assumption (CIA): +\begin{equation} + Y \perp Z \mid X +\end{equation} + +This assumption states that the target variable $Y$ (from the donor) and any variables $Z$ unique to the receiver are independent, conditional on the common variables $X$ \citep{dorazio2006statistical}. In other words, once we account for the shared covariates, knowing $Z$ provides no additional information about $Y$. + +The CIA is fundamentally untestable because $Y$ and $Z$ are never jointly observed in either data source. Its plausibility depends on the richness of the common variables $X$. When $X$ captures the key determinants of both $Y$ and $Z$, the assumption is more likely to hold. In practice, researchers must rely on substantive knowledge to assess whether the available common variables are sufficient to render $Y$ and $Z$ conditionally independent. + +Violations of the CIA lead to biased estimates of the joint distribution of $(Y, Z)$ in the synthetic file. However, if the research question concerns only the marginal distribution of $Y$ in the receiver population, the CIA is not strictly required. What matters is that the conditional distribution $P(Y|X)$ is correctly specified and that $X$ has sufficient overlap between donor and receiver files \citep{dorazio2021statistical}. + +\subsection{Missing Data Mechanisms} + +Statistical matching may also warrant discussion within the framework of missing data theory, where $Y$ is ``missing'' for all observations in the receiver file \citep{little2002statistical}. 
The missing data literature distinguishes three mechanisms: + +\begin{itemize} + \item \textbf{Missing Completely at Random (MCAR)}: The probability of missingness is unrelated to both observed and unobserved variables. In the statistical matching context, this would require that selection into the donor versus receiver sample is independent of all variables. + + \item \textbf{Missing at Random (MAR)}: The probability of missingness depends only on observed variables. For statistical matching, this means that conditional on $X$, the probability of being in the donor sample (and thus having $Y$ observed) does not depend on the value of $Y$ itself. + + \item \textbf{Missing Not at Random (MNAR)}: The probability of missingness depends on the unobserved values. This would occur if, for example, high-wealth individuals were systematically less likely to appear in the wealth survey. +\end{itemize} + +Under MAR, valid inference about $P(Y|X)$ can be obtained from the donor data alone, which justifies the statistical matching approach. Most imputation methods, including those implemented in \texttt{microimpute}, assume MAR. When MNAR is suspected, sensitivity analyses or methods that explicitly model the selection mechanism may be required \citep{little2002statistical}. -Among traditional imputation methods, we have selected three to examine, namely Ordinary Least Squares regression, Quantile regression, and Hot Deck Matching, due to their diverse approaches to imputation and relevance in the literature. We also study more novel approaches like Quantile Regression Forests, which provides an opportunity for more robust microdata imputation. In the following sections we discuss the methodological details of each method. +In statistical matching applications, an important consideration arises when the donor and receiver files come from surveys with different sampling designs. For instance, wealth surveys often oversample high-net-worth households to improve precision for this rare population, while general household surveys use designs representative of the broader population. This differential sampling can introduce a form of selection that resembles MNAR if not properly addressed. Luckily, survey weights provide a mechanism to mitigate this issue. By incorporating weights that reflect each observation's representativeness of the target population, imputation methods can adjust for known differences in sampling probabilities between the donor and receiver surveys \citep{dugoff2014generalizing}. The \texttt{microimpute} package supports the use of survey weights throughout its imputation pipeline, allowing methods to account for complex survey designs. While survey weights cannot fully address MNAR when missingness depends on unobserved values of $Y$ itself, they can substantially reduce bias arising from differential sampling designs across data sources. + +\subsection{Imputation Methods} + +We now describe the five imputation methods implemented in \texttt{microimpute}. Each method provides a different approach to estimating the conditional distribution $P(Y|X)$ and generating imputations. + +\subsubsection{Hot Deck Matching} + +Hot deck imputation replaces missing values in a receiver record with observed values from ``similar'' donor records \citep{andridge2010review}. The \texttt{microimpute} implementation uses the constrained distance hot deck approach from the StatMatch R package \citep{dorazio2021statistical}. 
To identify the best match, the method computes distances between each receiver observation and all donor observations based on the common variables $X$, selects donor records within a specified distance threshold, and randomly samples from the eligible donors, with optional weighting by survey weights. For continuous and mixed covariates, Mahalanobis or Gower distance metrics can capture similarity across different variable types. The donated value will thus be an actual observed value from the donor file, ensuring plausibility. + +Hot deck methods are nonparametric and avoid distributional assumptions, making them robust when the true conditional distribution is unknown or complex \citep{dorazio2006statistical}. However, they face limitations: +\begin{itemize} + \item \textbf{Donor scarcity}: For receiver observations in sparse regions of the covariate space, suitable donors may be unavailable; the same donors may then be reused repeatedly, distorting the distribution of imputed values + \item \textbf{Discrete outputs}: Imputed values are restricted to the set of observed donor values, which may not adequately represent the continuous nature of $Y$ + \item \textbf{Tail behavior}: Extreme values in the receiver population may have no appropriate donors or poor matches, leading to biased imputations at the tails +\end{itemize} \subsubsection{Ordinary Least Squares (OLS)} -OLS imputation predicts missing values in a recipient dataset based on a linear regression model trained on a donor dataset. The model is specified as: +OLS imputation models the conditional mean of $Y$ given $X$ through linear regression: +\begin{equation} + E[Y|X] = \beta_0 + \beta_1 x_1 + \beta_2 x_2 + \cdots + \beta_p x_p +\end{equation} -$$y_i = \beta_0 + \beta_1 x_{i1} + \beta_2 x_{i2} + ... + \beta_p x_{ip} + \varepsilon_i,$$ +The coefficients $\boldsymbol{\beta}$ are estimated from the donor data by minimizing squared residuals. For stochastic imputation, the \texttt{microimpute} implementation, which builds on the \texttt{statsmodels} library, assumes normally distributed errors: +\begin{equation} + Y|X \sim \mathcal{N}(\boldsymbol{X}'\boldsymbol{\hat{\beta}}, \hat{\sigma}^2) +\end{equation} -where $y_i$ is the variable to be imputed for observation $i$ in the recipient dataset, $x_{i1}$,...,$x_{ip}$ are predictor variables common to both donor and recipient datasets, $\beta_0$,...,$\beta_p$ are coefficients estimated from the donor dataset, and $\varepsilon_i$ is the error term \citep{bruch2023imputation}. In deterministic regression imputation, the imputed value is typically the expected value of $y_i$. An alternative, stochastic regression imputation, adds a randomly drawn residual (from the donor model's residuals or a normal distribution with estimated variance $\sigma^2$) to the predicted value: $y_{imputed} = y_i + e_i$, where $e_i~N(0,\sigma^2)$. Stochastic imputation aims to preserve the variability of the original data better than deterministic imputation \citep{anil_regression}. +where $\hat{\sigma}^2$ is the estimated residual variance. Quantiles of $Y|X$ can then be computed as: +\begin{equation} + \hat{Q}_\tau(Y|X) = \boldsymbol{X}'\boldsymbol{\hat{\beta}} + \hat{\sigma} \cdot \Phi^{-1}(\tau) +\end{equation} -OLS assumes linearity, homoscedasticity (constant variance of errors), and normally distributed errors, all of which are typically violated by skewed wealth data \citep{vonhippel2007should}.
While OLS imputation might yield consistent estimates for means and variances even with non-normal data, it can produce considerable bias for shape-dependent estimands like percentiles or skewness coefficients \citep{vonhippel2007should}. Furthermore, deterministic OLS imputation systematically underestimates the true variance of the completed data \citep{barcelo2008impact}. +where $\Phi^{-1}(\tau)$ is the $\tau$-th quantile of the standard normal distribution. -\subsubsection{Quantile Regression (QR)} +OLS is computationally efficient and well-understood, but its assumptions are often violated by economic variables: +\begin{itemize} + \item \textbf{Linearity}: Relationships between $Y$ and $X$ may be nonlinear + \item \textbf{Homoscedasticity}: Variance often increases with the level of $Y$, particularly for wealth + \item \textbf{Normality}: Heavy-tailed distributions violate the normal error assumption, leading to inaccurate quantile estimates \citep{vonhippel2007should} +\end{itemize} -Quantile regression (QR) models the conditional quantiles (e.g., median, 10th percentile, 90th percentile) of a response variable, $Y$, given a set of predictors, $X$ \citep{koenker1978regression}. The model for the $\tau$-th quantile is: +\subsubsection{Quantile Regression} -$$Q_{Y}(\tau|X) = \beta_0(\tau) + \beta_1(\tau)x_1 + \beta_2(\tau)x_2 + ... + \beta_p(\tau)x_p$$ +Quantile regression directly models conditional quantiles rather than the conditional mean \citep{koenker1978regression}. For quantile $\tau \in (0,1)$, the model is: +\begin{equation} + Q_\tau(Y|X) = \beta_0(\tau) + \beta_1(\tau) x_1 + \cdots + \beta_p(\tau) x_p +\end{equation} -When inputting from a donor to a recipient dataset, QR models for various quantiles $\tau$ are fitted on the donor dataset using common predictor variables. These fitted models are then applied to the recipient dataset to predict the conditional quantiles for observations with missing data \citep{parker_missing}. To generate a single imputed value, one might impute the conditional median ($\tau=0.5$) or draw a value from an estimated conditional distribution constructed from multiple quantile predictions \citep{wei2014multiple}. For instance, a random quantile $\tau*$ can be selected from a uniform distribution, and the imputed value computed by interpolating between the estimated responses for quantiles directly above and below $\tau*$ \citep{chen2007confidentiality}. +The coefficients $\boldsymbol{\beta}(\tau)$ are estimated by minimizing the asymmetric quantile loss function: +\begin{equation} + \mathcal{L}_\tau(\boldsymbol{\beta}) = \sum_{i=1}^{n} \rho_\tau(y_i - \boldsymbol{x}_i'\boldsymbol{\beta}) +\end{equation} -QR is more robust to outliers and better at handling skewed distributions and heteroscedasticity than OLS because it does not make strong assumptions about the error distribution \citep{zhao2023quantile}. This makes it particularly suitable for economic variables like wealth, where relationships may vary across the distribution, better preserving its overall shape \citep{kleinke2020multiple}. However, while more robust than OLS, standard quantile regression still assumes linear relationships between predictors and the outcome at each specific quantile \citep{meinshausen2006quantile}. It requires fitting separate models for different quantiles, which can increase complexity, and may struggle with high-dimensional data or very complex non-linear patterns \citep{meinshausen2006quantile}. 
+where the check function $\rho_\tau(u) = u(\tau - \mathbf{1}_{u<0})$ penalizes positive and negative residuals asymmetrically. -\subsubsection{Hot Deck Matching imputation} +The \texttt{microimpute} implementation of Quantile regression also uses the \texttt{statsmodels} library. By fitting models at multiple quantiles, it can approximate the entire conditional distribution of $Y|X$. For imputation, a random quantile $\tau^* \sim \text{Uniform}(0,1)$ can be drawn, and the imputed value computed by interpolating between estimated quantile functions \citep{wei2014multiple}. -Hot Deck imputation replaces missing values in a recipient record with an observed value from a ``similar" donor record. When imputing from a donor dataset to a recipient dataset, ``similarity" is established using variables common to both datasets \citep{dorazio2021statistical}. This often involves defining adjustment cells, which are groupings of records in both datasets -based on shared categorical variables (e.g., gender, education level). Donors to be matched to a receiver record are then selected from the corresponding cell in the donor dataset \citep{chen2000nearest}. For continuous or mixed data, a distance metric (e.g., Euclidean, Mahalanobis) is calculated between a recipient record and potential donor records based on common variables. The donor record with the smallest distance is chosen \citep{dorazio2021statistical}. Once the matching is done, the exact value from the selected donor record in the donor dataset is then used to fill the missing item in the recipient dataset \citep{andridge2010review}. +Quantile regression offers advantages over OLS: +\begin{itemize} + \item \textbf{Robustness}: Less sensitive to outliers than least squares + \item \textbf{Heteroscedasticity}: Naturally accommodates varying spread across the distribution + \item \textbf{No normality assumption}: Does not require specifying an error distribution +\end{itemize} -Hot Deck methods are non-parametric and do not require explicit model specification, making them robust to weak distributional assumptions \citep{dorazio2021statistical}. Since imputed values are actual observed values from the donor dataset, they are inherently plausible and can help preserve the marginal distribution of the imputed variable if donors are well-matched \citep{andridge2010review}. Nonetheless, a critical challenge is ensuring an adequate and representative donor pool in the donor dataset for all types of recipients in the target dataset. This is particularly difficult for extreme wealth values, where suitable donors may be scarce or unrepresentative, leading to biased imputations or overuse of certain donors \citep{haziza2009imputation}. Additionally, it may struggle to maintain complex multivariate relationships, especially when imputing across datasets with different underlying structures or sampling designs \citep{siddique2008multiple}. The effectiveness is highly dependent on the choice of matching variables \citep{ota2012revenue}, as well as “similarity” metrics. Poorly defined cells or metrics can lead to inappropriate donor selection and biased results \citep{andridge2010review}. Most critically for policy analysis, Hot Deck methods provide only observed values rather than capturing the full conditional distribution, making it impossible to properly quantify imputation uncertainty and data variance. 
This affects winner-loser analysis and the evaluation of distributional impacts of policy changes where small differences in imputed values can dramatically affect conclusions about who benefits or loses \citep{rubin1987multiple}. +However, quantile regression still assumes linear relationships at each quantile, limiting its flexibility for complex data structures \citep{meinshausen2006quantile}. -\subsubsection{Quantile Regression Forests (QRF)} +\subsubsection{Mixture Density Networks (MDN)} -Quantile Regression Forests (QRF) \citep{meinshausen2006quantile} extend Random Forests (RF), which are ensemble learners that build multiple decision trees and excel at capturing non-linearities and interactions \citep{breiman2001random}, to estimate conditional quantiles. When imputing from a donor dataset to a receiver dataset, QRF models are trained on the donor dataset using predictor variables common to both. Instead of storing only mean values in terminal nodes (as in standard RF regression), QRF retains all observed outcome values for the training instances that fall into each terminal leaf of each tree \citep{meinshausen2006quantile}. +Mixture Density Networks combine neural networks with mixture models to estimate flexible conditional densities \citep{bishop1994mixture}. The approach models $P(Y|X)$ as a mixture of Gaussian components: +\begin{equation} + p(y|x) = \sum_{k=1}^{K} \pi_k(x) \cdot \mathcal{N}(y \mid \mu_k(x), \sigma_k^2(x)) +\end{equation} -For a given point $x$ (representing the predictor values for an observation in the recipient dataset), the conditional distribution function $\hat{F}(y|X=x)$ of the target variable $Y$ is estimated as: +where $K$ is the number of mixture components, and the mixing coefficients $\pi_k(x)$, means $\mu_k(x)$, and variances $\sigma_k^2(x)$ are all functions of the input $x$, parameterized by a neural network. -$$\hat{F}(y|X=x) = \sum_{i=1}^n w_i(x) \cdot \mathbf{1}_{{Y_i \leq y}},$$ +The neural network outputs $3K$ values for each input: $K$ unnormalized log-probabilities (transformed via softmax to obtain $\pi_k$), $K$ means, and $K$ log-variances (exponentiated to ensure positivity). The network is trained by maximizing the log-likelihood: +\begin{equation} + \mathcal{L} = \sum_{i=1}^{n} \log p(y_i | x_i) +\end{equation} -where $Y_i$ are the observed values from the donor dataset, $1_{Y_i} \leq y$ is an indicator function ($1 \text{ if } Y_i \leq y$, $0 \text{ otherwise}$), and $w_i(x)$ represents the weight assigned to each donor observation $i$. This weight is derived from the forest structure; specifically, $w_i(x)$ is positive if observation $i$ from the donor set falls into the same terminal node as $x$ in any tree, and its magnitude reflects how often this co-occurrence happens across all trees in the forest \citep{kleinke2023robust}. +For imputation, one can either sample from the mixture distribution or compute quantiles numerically by inverting the cumulative distribution function. -The $\tau$-th conditional quantile is then estimated by finding the minimum value $y$ for which the estimated cumulative distribution function $\hat{F}(y|X=x)$ is greater than or equal to $\tau$: +The \texttt{microimpute} implementation uses PyTorch Tabular \citep{pytorch_tabular}, which provides optimized architectures for tabular data. 
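+To make the $3K$-output parameterization concrete, the following minimal PyTorch sketch defines the mixture head, the mixture negative log-likelihood used for training, and a sampling routine for stochastic imputation. It is an illustration only: the \texttt{microimpute} implementation relies on PyTorch Tabular rather than this hand-rolled network, and the hidden size and default number of components shown here are arbitrary placeholders.
+\begin{verbatim}
+import torch
+import torch.nn as nn
+
+class MDN(nn.Module):
+    """Minimal mixture density network: maps x to (log pi_k, mu_k, log var_k)."""
+    def __init__(self, n_features, n_components=5, hidden=64):
+        super().__init__()
+        self.body = nn.Sequential(nn.Linear(n_features, hidden), nn.ReLU())
+        self.head = nn.Linear(hidden, 3 * n_components)  # 3K outputs per input
+
+    def forward(self, x):
+        logits, mu, log_var = self.head(self.body(x)).chunk(3, dim=-1)
+        return logits.log_softmax(dim=-1), mu, log_var
+
+def mixture_nll(log_pi, mu, log_var, y):
+    """Negative log-likelihood of y under the Gaussian mixture (training loss)."""
+    comp = torch.distributions.Normal(mu, (0.5 * log_var).exp())
+    log_prob = comp.log_prob(y.unsqueeze(-1)) + log_pi
+    return -torch.logsumexp(log_prob, dim=-1).mean()
+
+def sample_imputation(model, x):
+    """Stochastic imputation: draw one value from the estimated P(Y | X = x)."""
+    log_pi, mu, log_var = model(x)
+    k = torch.distributions.Categorical(logits=log_pi).sample().unsqueeze(-1)
+    mu_k = mu.gather(-1, k).squeeze(-1)
+    sd_k = (0.5 * log_var).exp().gather(-1, k).squeeze(-1)
+    return torch.normal(mu_k, sd_k)
+\end{verbatim}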
MDNs offer several advantages: +\begin{itemize} + \item \textbf{Multimodal distributions}: The mixture formulation can capture multiple modes in $P(Y|X)$ + \item \textbf{Flexible nonlinearity}: Neural networks can approximate complex relationships between $X$ and distributional parameters + \item \textbf{Full density estimation}: Provides a complete probabilistic model, not just quantiles +\end{itemize} -$$\hat{Q}_\tau(y|X=x) = \inf{y: \hat{F}(y|X=x) \geq \tau}$$ +Nonetheless, their limitations include: +\begin{itemize} + \item \textbf{Data requirements}: Neural networks typically require larger sample sizes for reliable estimation + \item \textbf{Hyperparameter sensitivity}: Performance depends on architecture choices, learning rate, and number of components + \item \textbf{Computational cost}: Training is more expensive than parametric methods +\end{itemize} -\citep{meinshausen2006quantile}. This allows for the estimation of any quantile without retraining the model \citep{woodruff2024enhancing}. For imputation, particularly multiple imputation, values can be drawn randomly from this estimated conditional distribution for each observation in the recipient dataset requiring imputation. This process helps reflect the uncertainty inherent in the imputation. +\subsubsection{Quantile Regression Forests (QRF)} -This approach offers several critical advantages for microdata imputation, and wealth imputation more specifically: +Quantile Regression Forests extend Random Forests to estimate conditional quantiles \citep{meinshausen2006quantile}. A standard Random Forest builds an ensemble of decision trees, each trained on a bootstrap sample, with predictions averaged across trees. QRF modifies this by retaining all training observations in each terminal node rather than just their mean. -\begin{enumerate} - \item \textbf{Distribution preservation}: By modeling the entire conditional distribution, QRF is adept at capturing and preserving the right-skewness and heavy tails characteristic of wealth distributions \citep{meinshausen2006quantile}. +For a new observation with covariates $x$, QRF estimates the conditional distribution function as: +\begin{equation} + \hat{F}(y|X=x) = \sum_{i=1}^{n} w_i(x) \cdot \mathbf{1}_{Y_i \leq y} +\end{equation} + +where the weight $w_i(x)$ reflects how often training observation $i$ falls in the same terminal node as $x$ across trees in the forest. Specifically: +\begin{equation} + w_i(x) = \frac{1}{B} \sum_{b=1}^{B} \frac{\mathbf{1}_{x_i \in L_b(x)}}{|L_b(x)|} +\end{equation} - \item \textbf{Non-linear relationship handling}: The tree-based structure of RF, and thus QRF, automatically handles complex non-linear relationships between predictors (e.g., demographic variables) and the imputation target (e.g., wealth) without requiring explicit transformation or pre-specification of these functional forms \citep{tang2017random}. +where $B$ is the number of trees, $L_b(x)$ is the terminal node containing $x$ in tree $b$, and $|L_b(x)|$ is the number of training observations in that node. - \item \textbf{Automatic interaction detection}: QRF naturally incorporates interactions between predictor variables, as tree splitting rules inherently consider combinations of features \citep{tang2017random}.
+The $\tau$-th conditional quantile is then: +\begin{equation} + \hat{Q}_\tau(Y|X=x) = \inf\{y : \hat{F}(y|X=x) \geq \tau\} +\end{equation} - \item \textbf{Robustness to outliers}: The ensemble nature of random forests and the focus on quantiles (rather than just the mean) make QRF less sensitive to extreme outliers in the donor data that might distort parametric models \citep{tang2017random}. +The \texttt{microimpute} implementation uses the \texttt{quantile-forest} package, which offers several advantages for statistical matching: - \item \textbf{Single model for all quantiles}: Unlike standard QR, which requires fitting separate models for different quantiles, QRF produces an estimate of the entire conditional distribution from a single trained model, making it computationally more efficient \citep{meinshausen2006quantile}. +\begin{enumerate} + \item \textbf{Distribution preservation}: By modeling the entire conditional distribution, QRF captures skewness, heavy tails, and other distributional features without parametric assumptions \citep{meinshausen2006quantile}. + \item \textbf{Nonlinear relationships}: The tree-based structure automatically captures complex nonlinearities and interactions without requiring explicit specification \citep{breiman2001random}. + \item \textbf{Single model for all quantiles}: Unlike standard quantile regression, which requires fitting separate models for each quantile, QRF estimates all quantiles from a single trained forest. + \item \textbf{Robustness}: The ensemble approach and quantile focus make QRF less sensitive to outliers than mean-based methods \citep{tang2017random}. \end{enumerate} -When imputing from a survey with specific design features to a more general survey with somewhat different distributional properties, QRF's ability to learn localized relationships in the predictor space can be advantageous. If survey weights from the donor are incorporated during the QRF training (e.g., by influencing tree construction or the sampling of observations for bootstrap aggregation), the model can learn to represent the oversampled segments appropriately. The subsequent prediction onto the receiver dataset, which has its distinct sample structure, then relies on the learned conditional distributions. The challenge lies in ensuring that the relationships learned from the donor are transportable and applicable to the recipient, and that the resulting imputations in the recipient dataset, when combined with its own survey weights, yield valid population estimates. While QRF itself doesn't explicitly model survey design features like clustering or stratification in a formal statistical sense unless specifically adapted, its flexibility in capturing complex data structures can implicitly handle some of the heterogeneity introduced by such designs \citep{hao2007quantile}, making it much stronger than other more limited imputation approaches. 
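+To illustrate how such a forest can drive stochastic imputation in a donor-receiver setting, the sketch below uses synthetic data and the open-source \texttt{quantile-forest} estimator directly rather than the \texttt{microimpute} pipeline; it assumes the \texttt{quantile-forest} interface in which \texttt{predict} accepts a list of quantiles, and all variable names are placeholders. Each receiver record receives a grid of conditional quantiles, from which a value is drawn by interpolating at a random $\tau^*$.
+\begin{verbatim}
+import numpy as np
+from quantile_forest import RandomForestQuantileRegressor
+
+rng = np.random.default_rng(0)
+# Synthetic stand-ins for the donor file (X, y) and the receiver file (X only)
+X_donor = rng.normal(size=(5000, 4))
+y_donor = np.exp(X_donor[:, 0] + rng.normal(size=5000))  # right-skewed target
+X_receiver = rng.normal(size=(2000, 4))
+
+grid = np.linspace(0.05, 0.95, 19)  # quantile grid approximating F(y | x)
+qrf = RandomForestQuantileRegressor(n_estimators=200)
+qrf.fit(X_donor, y_donor)
+q_hat = qrf.predict(X_receiver, quantiles=list(grid))  # shape (n_receiver, 19)
+
+# Stochastic imputation: draw tau* per receiver row, interpolate its quantiles
+tau_star = rng.uniform(grid[0], grid[-1], size=len(X_receiver))
+y_imputed = np.array([np.interp(t, grid, q) for t, q in zip(tau_star, q_hat)])
+\end{verbatim}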
+Nevertheless, QRF also has limitations: +\begin{itemize} + \item \textbf{Extrapolation}: Trees cannot extrapolate beyond the range of training data, potentially underestimating extreme quantiles when donor and receiver distributions differ + \item \textbf{Discrete approximation}: The estimated distribution is discrete, with support limited to observed $Y$ values in the training data + \item \textbf{Computational cost}: Training and prediction are slower than parametric methods, though generally faster than neural networks +\end{itemize} + +\subsection{Current Practice in Microsimulation} + +Statistical matching and data fusion are fundamental operations in microsimulation modeling, yet the methods employed in practice have remained relatively unchanged for decades. A review of major tax-benefit microsimulation models reveals a strong reliance on traditional imputation approaches, primarily hot deck matching and OLS-based regression. + +Hot deck matching remains the dominant approach in European microsimulation. EUROMOD, the EU-wide tax-benefit model, employs a multi-stage imputation procedure combining predictive mean matching with distance-based hot deck methods to integrate consumption data from Household Budget Surveys into its EU-SILC input data \citep{sutherland2013euromod}. The appeal of hot deck methods lies in their simplicity and the guarantee that imputed values are observed values from the donor file, ensuring plausibility. However, as discussed above, these methods struggle with tail behavior and may not adequately capture the full conditional distribution of the target variable. + +In U.S. tax policy microsimulation, regression-based approaches are more common. The Tax Policy Center employs a two-stage probit and OLS procedure for wealth imputation, first predicting the probability of holding each asset type, then predicting amounts conditional on positive holdings \citep{nunns2012tax}. Similarly, the Institute on Taxation and Economic Policy (ITEP) model relies on statistical matching between tax return data and the American Community Survey, supplemented with regression-based imputations from the Survey of Consumer Finances \citep{itep2023model}. These regression approaches assume linear relationships and normally distributed errors, assumptions that are frequently violated by economic variables with heavy tails and heteroscedastic relationships. + +More recent microsimulation efforts have begun incorporating administrative data through record linkage rather than statistical matching \citep{abowd2019census}, but this approach requires access to restricted data and raises privacy concerns. For researchers working with publicly available survey data, statistical matching remains essential, creating a need for methods that can better capture complex distributional features. -Nonetheless, QRF has its own limitations. Given the data-splitting nature of a tree, certain terminal nodes may receive a single or very few extreme training samples. When imputing, all the data points from the receiver dataset that land on those leaves will likely receive the same or very similar values for the imputed variable, even if there are differences in predictor values between them. In practice, this means that if the donor and receiver datasets have distributional differences, imputations at the extreme tails of the receiver dataset may suffer.
Data points that are not necessarily unusual or extreme might receive extreme imputations, while truly extreme values in the receiver dataset are at risk of not being regarded as so if the donor dataset had a narrower range of values for the training variable. \ No newline at end of file +The limitations of traditional approaches can be particularly acute for heavily-tailed variables where relationships with predictors vary across the distribution. Machine learning methods such as Quantile Regression Forests offer a promising alternative, as they can capture nonlinear relationships, model the entire conditional distribution, and handle heavy-tailed data without restrictive parametric assumptions. Despite these advantages, QRF and similar methods have seen limited adoption in mainstream microsimulation practice. The \texttt{microimpute} package aims to lower barriers to adopting these more flexible methods by providing a unified framework for comparing traditional and machine learning approaches, enabling researchers to empirically evaluate which method best suits their specific data characteristics. \ No newline at end of file diff --git a/paper/sections/data.tex b/paper/sections/data.tex index b17d7ab..8c7a646 100644 --- a/paper/sections/data.tex +++ b/paper/sections/data.tex @@ -1,5 +1,7 @@ \section{Data}\label{sec:data} +We demonstrate the \texttt{microimpute} methodology through an application to wealth imputation, transferring net worth from the Survey of Consumer Finances (SCF) onto the Current Population Survey (CPS). + \subsection{Survey of Consumer Finances} The Survey of Consumer Finances (SCF), sponsored by the Federal Reserve Board, is a triennial survey providing detailed information on U.S. households' assets, liabilities, income, and demographic characteristics. Its dual-frame sample design includes a standard national area-probability sample and a list sample deliberately oversampling wealthy households to better capture the skewed wealth distribution \citep{barcelo2006imputation}. The SCF is a benchmark for wealth imputation research due to its detailed financial data and the known complexities arising from its design and the nature of wealth. Item nonresponse in public-use SCF datasets is addressed by the Federal Reserve through a multiple imputation approach that generates five complete datasets with different imputed values, using sequential regression-based procedures that incorporate range constraints, logical data structures, and empirical residuals to preserve the complex multivariate relationships inherent in wealth data \citep{kennickell1998multiple}. @@ -8,16 +10,8 @@ \subsection{Survey of Consumer Finances} \subsection{Current Population Survey} -The Current Population Survey (CPS), conducted by the U.S. Census Bureau and the U.S. Bureau of Labor Statistics, is a monthly survey primarily focused on labor market information. The Annual Social and Economic Supplement (ASEC) collects detailed annual income data and some information on assets and liabilities, though far less comprehensively than the SCF. The CPS uses a national probability sample and is a key source for income and poverty statistics. Missing data, particularly for income items, is also a feature of the CPS. +The Current Population Survey (CPS), conducted jointly by the U.S. Census Bureau and the Bureau of Labor Statistics, is a monthly survey of approximately 60,000 U.S. households that serves as the primary source of labor force statistics for the United States. 
The CPS uses a multistage probability-based sample designed to represent the civilian non-institutional population; on average, each sampled household represents approximately 2,500 households in the population. The Annual Social and Economic Supplement (ASEC) extends the core survey with detailed annual income data, including earnings, unemployment compensation, Social Security, pension income, interest, dividends, and other income sources. However, despite its comprehensive coverage of income and employment, the CPS does not collect information on household wealth, assets, or liabilities, which are key variables needed for wealth-based policy analysis. This omission motivates the need for statistical matching to transfer wealth information from the SCF onto the CPS. \subsection{Comparative analysis and characteristics for imputation} -Beyond wealth data's inherent challenges, imputing between SCF and CPS presents additional complications due to their differences in scope, design, and wealth data measurement. These complications include: - -\begin{enumerate} - \item \textbf{Sampling approach}: The SCF employs a dual-frame sample design, deliberately oversampling wealthy households through a list sample derived from tax returns. The CPS uses a more standard probability sample that does not effectively capture the upper tail of the wealth distribution \citep{bryant2023general}. - \item \textbf{Sample size and frequency}: The SCF typically includes about 4,500-6,000 households and is conducted triennially, while the CPS surveys approximately 60,000 households monthly. - \item \textbf{Wealth variable coverage}: The SCF collects extremely detailed information on financial assets and liabilities, while the CPS survey design does not request most of this data from its respondents, making direct matching of asset categories difficult. -\end{enumerate} - -These structural differences create challenges for transferring wealth information between surveys through traditional imputation methods. The predictors available in both datasets may not involve linear relationships with wealth, and the surveys may have vastly different sample sizes at various points along the wealth distribution. \ No newline at end of file +Imputing between SCF and CPS exemplifies the challenges of statistical matching, as it requires combining a specialized survey with detailed wealth data but limited sample size with a large representative survey that lacks wealth measures entirely. The SCF-to-CPS imputation is directly relevant to U.S. policy microsimulation, where researchers require comprehensive microdata combining income, demographics, and wealth to analyze the distributional effects of tax and benefit reforms. By imputing wealth onto the CPS, analysts can examine wealth distributions in a sample roughly thirteen times larger than the SCF alone, enabling finer demographic disaggregations and more precise subgroup analyses. Moreover, the heavy-tailed nature of wealth distributions and the different sampling designs of these surveys create methodological challenges that make this an informative test case for comparing imputation methods. 
\ No newline at end of file diff --git a/paper/sections/introduction.tex b/paper/sections/introduction.tex index fcdb833..68e40a2 100644 --- a/paper/sections/introduction.tex +++ b/paper/sections/introduction.tex @@ -1,17 +1,18 @@ \section{Introduction} -Microsimulation models and detailed microdata analyses are essential tools for understanding the distributional impacts of policies and social changes. These analyses require data that accurately represent both the demographic composition of a population and its economic circumstances. However, available data sources, particularly large-scale surveys, often suffer from missing data due to item nonresponse \citep{dempster1983introduction} or as a result of survey design. If not appropriately addressed, missing data can introduce substantial bias, undermining the validity of research conclusions \citep{graham2009missing} or limiting analysis opportunities altogether, increasing the risk of ineffective policy decisions and unintended consequences. +Statistical matching, also known as data fusion or full variable imputation, addresses the challenge of combining information from multiple data sources that share common variables but contain different samples of units \citep{dorazio2006statistical}, a situation that arises routinely across fields of empirical research. In its simplest form, the problem involves a donor dataset containing variables $(X, Y)$ and a receiver dataset containing only $X$, where the goal is to impute values of $Y$ for observations in the receiver file based on their shared characteristics $X$ \citep{dorazio2021statistical}. This framework generalizes traditional missing data imputation by recognizing that ``missingness'' can arise not only from nonresponse within a single survey but also from the structural absence of variables across distinct data sources. -This problem extends beyond item nonresponse to systematic underreporting of certain income types in surveys. Evidence from fiscal and financial surveys in the UK demonstrate how data imputation can improve data quality. Dividend income in the Family Resources Survey is severely underreported, with the survey not even collecting data about directors' dividend incomes until the 2021-2022 survey year \citep{dwp2023frs}. The UK's "SPI adjustment" methodology addresses this by replacing survey responses with administrative tax data for high earners, revealing income distributions previously hidden by measurement error \citep{advani2023measuring}. These examples illustrate that imputation from administrative sources does not merely fill gaps but can provide superior data quality compared to self-reported survey responses, particularly for sensitive financial variables prone to underreporting. +The relevance of statistical matching extends across the social sciences, particularly in microsimulation modeling where policy analysis requires combining detailed demographic information from one survey with economic variables from another \citep{bourguignon2006microsimulation}. For instance, household surveys may capture income and consumption patterns but lack wealth data, while financial surveys provide wealth information for different samples. Constructing synthetic files that combine these variables enables richer policy analysis than either source alone permits. -Traditional imputation approaches struggle with wealth data's right-skewness, heavy tails, and non-linear relationships with demographic and economic predictors.
Wealth's heterogeneous interaction with income complicates imputation. Advani and Summers \citep{advani2020capital} demonstrated that capital gains, a key component of wealth changes, are distributed across the income spectrum with substantial volatility, finding that even individuals at the 80th income percentile have only a 1\% probability of realizing any taxable gains, while those who do receive gains show extreme variability in amounts. These characteristics fundamentally violate assumptions underpinning conventional methods like Ordinary Least Squares (OLS) and Quantile Regression, resulting in significant distortions that undermine policy analysis \citep{meinshausen2006quantile}. +The methodological challenge lies in accurately modeling the conditional distribution $P(Y|X)$ from the donor data and using it to generate plausible values of $Y$ for receiver observations. Different approaches make different assumptions about this distribution. Parametric methods like Ordinary Least Squares assume linearity and homoscedasticity, which are often violated by economic variables exhibiting heavy tails and heterogeneous relationships across the distribution \citep{meinshausen2006quantile}. Nonparametric methods like hot deck matching avoid distributional assumptions but may struggle to capture complex multivariate relationships \citep{andridge2010review}. More recent machine learning approaches, including Quantile Regression Forests and Mixture Density Networks, offer flexible alternatives that can model entire conditional distributions without restrictive parametric assumptions. -This paper demonstrates that Quantile Regression Forests (QRF) provides superior performance for wealth imputation between the Survey of Consumer Finances (SCF) and the Current Population Survey (CPS). By modelling entire conditional distributions rather than conditional means alone, QRF preserves critical distributional features of wealth data. We implement this approach through the $\texttt{microimpute}$ package, a specialised tool developed for survey data imputation that provides a complete pipeline for imputation and analysis, tailored to the dataset at hand. This package automates the comparison of four imputation methods, namely QRF, OLS, Hot Deck Matching, and Quantile Regression, automatically selecting the one achieving lowest average quantile loss to perform the final wealth impuation. +This paper presents the \texttt{microimpute} package, a Python library implementing five statistical matching methods: Hot Deck Matching, Ordinary Least Squares, Quantile Regression, Mixture Density Networks, and Quantile Regression Forests. The package provides a comprehensive framework for comparing these methods through systematic benchmarking, with automated model selection based on quantile loss performance. We demonstrate the methodology through an application to wealth imputation, transferring net worth data from the Survey of Consumer Finances to the Current Population Survey. This task exemplifies the challenges of statistical matching for heavy-tailed distributions. -The remainder of this paper is organized as follows: Section 2 reviews the statistical properties of wealth microdata and the evolution of imputation techniques in the literature, discussing the strengths and limitations of the four methods evaluated. Section 3 describes our data sources (SCF and CPS) and their characteristics. Section 4 presents the $\texttt{microimpute}$ package in detail. Section 5 presents our empirical results. 
Section 6 discusses implications and limitations, and Section 7 concludes. -Our analysis makes two key contributions: +This work provides the following contributions: \begin{itemize} - \item An open-source microimputation package that facilitates the evaluation of multiple imputation methods tailored to specific dataset needs - \item A validation framework comparing novel methodological approaches to traditional imputation methods demonstrated on statistically challenging data like wealth distributions - \item A demonstration of QRF's advantages for wealth imputation, achieving better distributional estimates and a reduction in average quantile loss compared to traditional methods -\end{itemize} \ No newline at end of file + \item An open-source implementation of a unified framework for five imputation methods, facilitating systematic comparison across methods and supporting statistical matching through accessible, user-friendly software + \item An automated benchmarking pipeline that evaluates distributional accuracy using quantile loss, enabling researchers to select the most appropriate method for their specific data characteristics + \item Empirical evidence on the relative performance of these methods for wealth imputation, confirming the advantages of Quantile Regression Forests for heavy-tailed distributions +\end{itemize} + +The remainder of this paper is organized as follows. Section 2 reviews the statistical matching problem, including its formal definition, the conditional independence assumption that underlies all matching methods, and the five imputation approaches implemented in \texttt{microimpute}. Section 3 describes our data sources. Section 4 presents the package architecture and benchmarking methodology. Section 5 reports empirical results from the wealth imputation application. Section 6 discusses implications and limitations, and Section 7 concludes. \ No newline at end of file diff --git a/paper/sections/methodology.tex b/paper/sections/methodology.tex index 263c2ed..e1d10c6 100644 --- a/paper/sections/methodology.tex +++ b/paper/sections/methodology.tex @@ -2,11 +2,11 @@ \section{Methodology}\label{sec:methodology} \subsection{$\texttt{microimpute}$ package implementation} -$\texttt{microimpute}$\footnote{Complete documentation, implementation details, and usage examples are available at https://policyengine.github.io/microimpute/.} is PolicyEngine's specialized Python framework that enables variable imputation through multiple statistical methods, providing a consistent interface for comparing and benchmarking different imputation approaches using quantile loss calculations. +$\texttt{microimpute}$\footnote{Complete documentation, implementation details, and usage examples are available at https://policyengine.github.io/microimpute/.} is a Python framework that enables variable imputation through multiple statistical methods, providing a consistent interface for comparing and benchmarking different imputation approaches using quantile loss calculations. \subsubsection{Core capabilities} -The package currently supports four primary imputation methods: Hot Deck Matching, Ordinary Least Squares Linear Regression, Quantile Regression Forests (QRF), and Quantile Regression. This approach allows researchers to systematically evaluate which technique provides the most accurate results for their specific dataset and research objectives.
+The package currently supports five primary imputation methods: Hot Deck Matching, Ordinary Least Squares Linear Regression, Quantile Regression, Quantile Regression Forests (QRF), and Mixture Density Networks (MDN). This approach allows researchers to systematically evaluate which technique provides the most accurate results for their specific dataset and research objectives. Additionally, the package is designed to be modular, allowing for easy extension with additional imputation methods in the future. \subsubsection{Key features for microimputation} @@ -20,6 +20,19 @@ \subsubsection{Key features for microimputation} \item \textbf{Autoimputation}: Provides an integrated imputation pipeline that tunes method hyperparameters to the specific datasets, compares methods, and selects the best-performing to conduct the requested imputation in a single function call. \end{enumerate} -\subsubsection{Implementation details} +\subsection{Evaluation Framework} -$\texttt{microimpute}$'s QRF implementation extends $\texttt{scikit-learn}$'s Random Forest to provide full conditional quantile estimation, enabling stochastic imputation that preserves distributional properties rather than relying solely on point estimates. OLS and QuantReg methods are implemented using $\texttt{statsmodels}$, while Matching uses the R $\texttt{StatMatch}$ package's Hot Deck Matching capabilities. The $\texttt{microimpute}$ package is designed to be modular, allowing for easy extension with additional imputation methods in the future. \ No newline at end of file +Comparing imputation methods requires metrics that assess distributional accuracy, not just point prediction. The \texttt{microimpute} package uses quantile loss as the primary evaluation metric: +\begin{equation} + \mathcal{L}_\tau = \rho_\tau(y - \hat{Q}_\tau(y|x)) = (y - \hat{Q}_\tau(y|x)) \cdot (\tau - \mathbf{1}_{y < \hat{Q}_\tau(y|x)}) +\end{equation} + +Averaging over multiple quantiles $\tau \in \{0.1, 0.2, \ldots, 0.9\}$ provides a comprehensive measure of how well the method estimates the conditional distribution across its entire range. Lower average quantile loss indicates better distributional calibration. + +Additional metrics include: +\begin{itemize} + \item \textbf{Wasserstein distance}: Measures the distance between the marginal distributions of observed and imputed values + \item \textbf{Kolmogorov-Smirnov statistic}: Tests whether observed and imputed values come from the same distribution +\end{itemize} + +These metrics enable systematic comparison across methods, allowing researchers to select the approach best suited to their specific data characteristics.
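+As a minimal sketch of this evaluation logic (not the package's internal code; the arrays below are synthetic placeholders standing in for held-out true values, a candidate method's quantile predictions, and its stochastic draws), the average quantile loss and the two marginal diagnostics can be computed as follows.
+\begin{verbatim}
+import numpy as np
+from scipy.stats import ks_2samp, wasserstein_distance
+
+rng = np.random.default_rng(0)
+taus = np.arange(0.1, 1.0, 0.1)
+y_holdout = rng.lognormal(mean=10.0, sigma=1.0, size=1000)  # held-out true values
+# Placeholder predictions: the same marginal quantiles repeated for every row
+q_hat = np.tile(np.quantile(y_holdout, taus), (y_holdout.size, 1))
+y_imputed = rng.lognormal(mean=10.0, sigma=1.0, size=1000)  # stochastic draws
+
+def quantile_loss(y_true, y_pred_q, tau):
+    """Pinball loss for predictions of the tau-th conditional quantile."""
+    u = y_true - y_pred_q
+    return np.mean(u * (tau - (u < 0).astype(float)))
+
+avg_loss = np.mean([quantile_loss(y_holdout, q_hat[:, j], t)
+                    for j, t in enumerate(taus)])
+wd = wasserstein_distance(y_holdout, y_imputed)    # distance between marginals
+ks_stat, ks_pval = ks_2samp(y_holdout, y_imputed)  # two-sample KS test
+\end{verbatim}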