/************************************************* Statistical Analysis with Missing Data: Lab Session 1 K. Welch and B. West Complete Case Analysis, Mean Imputation, Response Propensity Analysis, and Multiple Imputation using SAS Software. *************************************************/ /******************************************************************* We will use a linear regression framework to illustrate various approaches for handling incomplete data. We will use a subset of data from the Longitudinal Study of Aging (a supplement to 1984 National Health Interview Survey). For illustration purposes, we will ignore the complex design features of the LSOA, such as clustering, stratification and weighting for unequal probabilities of selection. ******************************************************************/ title1 "Missing Data Workshop. Lab Session 1."; options linesize=80 pagesize=72 pageno=1 nodate label; options formchar="|----|+|---+=|-/\<>*"; options label; libname lab "E:\lsoa"; /***************************************************************** First, take a look at the values of the original variables ******************************************************************/ title2 "LSOA Data Set"; proc contents data=lab.lsoa_86x; run; title2 "Descriptives for Variables in Original Data Set"; proc means data=lab.lsoa_86x n nmiss mean std; var numadl numadl2r poverty ownbuyr mortgage amtowed presval educ age84 racer sex ; run; title2 "Frequencies of Original Variables"; proc freq data=lab.lsoa_86x; tables poverty / missing; tables ownbuyr mortgage/ missing; tables racer / missing; tables numadl numadl2r / missing; tables educ / missing; run; /****************************************************************** Now, read the permanent LSOA data set with missing data into a temporary working data set and create new variables. Note: different types of missing values can be indicated by using either a simple . or a code such as .D *******************************************************************/ data one; set lab.lsoa_86x; /* Recode the not-applicables to differentiate from missing */ if ownbuyr=. then ownbuyr=5; if mortgage=. then mortgage=4; if ownbuyr=5 then amtowed=0; if ownbuyr=5 then presval=0; /* Create a binary ownership variable. */ own=0; if ownbuyr in (1,2) then own=1; if ownbuyr=.D then own=.; /* Take natural log transforms of number of ADLs requiring assistance in 1986 and 1984 (to improve normality), and compute the difference. Add 1 to avoid taking logarithm of zero. NUMADL= Number of restricted activities of daily living (baseline, 1984) NUMADL2R= Number of restricted activities of daily living (followup, 1986) */ logadl=log(numadl+1); logadl2r=log(numadl2r+1); diffadl=logadl2r-logadl; /* Create dummy variables for race */ black=0; if racer=2 then black=1; if racer=. then black=.; other=0; if racer=3 then other=1; if racer=. then other=.; run; /****************************************************************** Now get the number of missing values and the distributions of respondent values for the variables of interest. ********************************************************************/ options nolabel; proc means data = one n nmiss mean std; var diffadl poverty own educ age84 sex black other; title2 "Amount of Missing data and the Distributional"; title3 "Characteristics of Respondent Values"; run; /*********************************************************/ /* Analysis Method #1: Complete Case Regression Analysis */ /*********************************************************/ /* Fit a regression model predicting diffadl with a variety of demographic predictors */ proc reg data = one; model diffadl=poverty own educ logadl age84 sex black other; output out=regdat rstudent=resid; title2 "Model 1: Complete Case Method. The Default."; run; quit; /* Note that 1883 observations are excluded from the analysis! There are some variables with very few missing values. Common Approach: Substitute mean or modal values. Theoretically, this is problematic. We will assess the impact of doing this naive imputation later. */ /*********************************************************************/ /* Analysis Method #2: Unconditional Mean Imputation of Missing Data */ /*********************************************************************/ /* Create a second data set where variables with little missing data have values imputed by mean imputation. Note: no direct imputation for the response variable (Diffadl), although adding imputed values for logadl indirectly gives us an additional 21 values for Diffadl. */ data two; set one; /*Use Mode for Own (Since Mean=0.66, Mode is 1)*/ if own=. then own=1; if educ=.D then educ=9.7573719; if logadl=. then logadl=.3488198; /* Remember we have to recompute the derived variables. This will give us more values for the dependent variable, since it is the change in log of Numadl from 1984 to 1986. */ diffadl=logadl2r-logadl; run; /* Re-examine the descriptive statistics in the new data set: fewer missing values.*/ proc means data = two n nmiss mean std; var diffadl poverty own educ logadl age84 sex black other; title2 "Descriptives for Partially Imputed Data"; run; /* Refit the regression model, using the partially imputed data set. */ proc reg data = two; model diffadl=poverty own educ logadl age84 sex black other; title2 "Partially Imputed Data"; title3 "Model 2: Mean Imputation Method"; run; quit; /* Now, 1768 observations are excluded from the analysis instead of 1883. Results similar to analysis 1. */ /************************************************************************/ /* Analysis Method #3: Response Propensity Method */ /************************************************************************/ /* We will now develop the Response Propensity model to adjust the complete case analysis for missingness*/ /* First, define a missing indicator for each respondent, based on the values of Poverty and Diffadl, the only two variables that have missing data in our partially imputed data set. Variable Miss: Nonrespondent = 1, Respondent=0 */ data two; set two; miss=0; if poverty=.D or diffadl=. then miss=1; run; title "Missing Data Indicator Variable"; proc freq data=two; tables miss; run; /* Now, Fit a logistic regression model, modeling the probability that Miss = 0, for the variables Poverty, Educ, Own, and Diffadl (i.e. a respondent has no missing data for these variables)). Save predicted probabilities in a new dataset (data=three), using the variable name Predprob */ proc logistic data = two; model miss=age84 sex educ logadl black other own; output out=three p=predprob; title2 "Model the Probability of Being Non-Missing"; run; /* Use the saved predicted probabilities of having complete data to develop complete case weights, computing the weights as the reciprocals of the response probabilities. */ data four; set three; if miss=1 then delete; ccwt=1/predprob; run; /*Examine Weights and relationship of Weights to Predictors*/ proc univariate data=FOUR plot; var ccwt; histogram; title2 "Examine Response Propensity Weights"; run; proc sgplot data=FOUR; scatter y=ccwt x=age84; run; proc sgplot data=FOUR; scatter y=ccwt x=educ; run; proc sgplot data=FOUR; scatter y=ccwt x=logadl; run; proc sgplot data=FOUR; vbox ccwt / category=sex; run; proc sgplot data=FOUR; vbox ccwt / category=black; run; proc sgplot data=FOUR; vbox ccwt / category=other; run; proc sgplot data=FOUR; vbox ccwt /category=own; run; /* Refit the regression model, using the complete case weights and the surveyreg procedure to correctly handle the weights. Set the degrees of freedom for the regression model.*/ proc surveyreg data = FOUR ; model diffadl=poverty own educ logadl age84 sex black other / df=3374; weight ccwt; title2 "Regression Analysis: Weighted CC Analysis"; title3 "Based on Response Propensity Method"; run; quit; /*******************************************************************/ /* Analysis Method #4: Multiple Imputation Analysis using SAS Procs MI and MIANALYZE, Assuming Multivariate Normality for the variables to be imputed. */ /*******************************************************************/ /* We now use Proc MI to impute all of the missing data in the very first data set (one). Proc MI has different options for monotone and non-monotone missing data. Our example data have a non-monotone missing data pattern, so we use the full-data Markov-Chain Monte Carlo (MCMC) method. NOTE: Proc MI assumes a multivariate normal distribution for all variables to be imputed. The variables OWN and POVERTY are binary, and therefore clearly non-normal. For now, we ignore this problem, and continue as if all variables were normally distributed. */ /* Create a new data set, keeping only the key analysis variables from the original data set. Make sure all missing value codes are only . (not .D), as required by Proc MI. */ data five; set one; keep diffadl poverty own educ logadl logadl2r age84 sex black other ownmiss povmiss racer; /* Proc MI only likes . as missing values */ if poverty = .D then poverty = .; if educ = .D then educ = .; /* Create a dummy variable for missing on OWN, to be used later. */ if own = . then ownmiss = 1; if own in (0,1) then ownmiss = 0; /* Create a dummy variable for missing on OWN, to be used later. */ if poverty = . then povmiss = 1; if poverty in (1,2) then povmiss = 0; /* drop cases that are missing on all continuous variables (2 Cases) */ if diffadl = . and educ = . and logadl=. then delete; run; proc means data=five nmiss mean; run; /* First run Proc MI to look at the missing data patterns and check for monotonicity. */ ods trace on; proc mi data=five nimpute=0 simple ; var diffadl poverty own educ logadl logadl2r; ods exclude EMPostEstimates; title2 "Proc MI Analysis to Get Missing Data Patterns"; run; ods trace off; /* Run Proc MI to impute values for the variables with missing data. */ /* Note: 5 imputed data sets will be generated by default. Seed is a random number seed, for MCMC. The 5 imputed data sets will be saved in the data set outmi, and are indexed by the variable _Imputation_ */ proc mi data=FIVE NIMPUTE=5 out=OUTMI seed=3355; var poverty own sex black other educ age84 logadl logadl2r ; title2 "Using Proc MI to Impute Assuming Multivariate Normality"; title3 "Five Imputations"; run; /*Create Difference on Log Scale AFTER Imputation*/ data OUTMI; set OUTMI; diffadl = logadl2r - logadl; run; /* Look at the new distributions of the imputed analysis variables for the first imputation. Note that the minimum and maximum imputed values for the binary variable OWN are outside the range of 0 to 1, because we assumed multivariate normality. We also see that Poverty is outside the range 1 to 2.*/ options nolabel; proc means data = outmi n nmiss min max; where _Imputation_ = 1 ; var logadl diffadl own poverty educ; title2 "Distributional Characteristics of Respondent Values"; title3 "After Proc MI"; run; /* Now, fit the regression model by _Imputation_ Get separate parameter estimates for each imputed data set, and save the resulting estimates in a data set named outreg. Note: the noprint option means that the regression output for each imputed data set will not show up in the SAS output window. */ proc reg data=OUTMI outest=OUTREG covout noprint; by _Imputation_; model diffadl=poverty own educ logadl age84 sex black other; title2 "Regression Analysis on Normal Imputation Data Sets"; run; /* Now, use Proc MIANALYZE to perform a multiple imputation analysis using the estimated coefficients and standard errors from each of the imputed data sets. */ proc mianalyze data=OUTREG; modeleffects Intercept poverty own educ logadl age84 sex black other; title2 "Combine Multivariate Normal Estimates Using Proc MIANALYZE"; run; /******************************************************************* Analysis Method #5: Alternative Method for Imputation for the Binary Variable: OWN. Using the Imputed data sets, we now set the imputed values of OWN to missing. Then, we rerun Proc MI on each of the 5 imputed data sets, which are now monotone missing for OWN, and generate a _single_ imputed data set for each imputed data set using the Logistic Method. Then, we rerun Proc Reg and Proc MIANALYZE. **********************************************************************/ /* Reset the missing values of OWN to missing in the imputed data set*/ data six; set outmi; if ownmiss = 1 then own = .; run; proc mi data=six nimpute=1 out=outmi2 seed=3355; by _Imputation_; class own; monotone logistic (OWN = poverty educ logadl age84 sex black other / details); var poverty educ logadl age84 sex black other own; title2 "Impute Values for OWN, Using Logistic regression on the Monotone Data"; run; proc freq data=outmi2; tables _imputation_*own / nocol nopercent; title2 "Distribution of Imputed Values After Logistic Regression"; title3 "Imputed Values for OWN are now 0 or 1"; run; /* We now do the same thing for Poverty, using the data that we have already imputed for OWN.*/ data seven; set outmi2; if povmiss = 1 then poverty = .; run; proc mi data=seven nimpute=1 out=outmi3 seed=3355; by _Imputation_; class poverty; monotone logistic (poverty = own educ logadl age84 sex black other / details); var own educ logadl age84 sex black other poverty; title2 "Impute Values for Poverty, Using Logistic regression on the Monotone Data"; run; proc freq data=outmi3; tables _imputation_*poverty / nocol nopercent; title2 "Distribution of Imputed Values After Logistic Regression"; title3 "Imputed Values for Poverty are now 1 or 2"; run; proc reg data=outmi3 outest=outreg2 covout noprint; by _imputation_; model diffadl=poverty own educ logadl age84 sex black other; run; /* Now, use Proc MIANALYZE to perform a multiple imputation analysis using the estimated coefficients and standard errors from each of the imputed data sets. */ proc mianalyze data=outreg2 ; modeleffects Intercept poverty own educ logadl age84 sex black other; title2 "Multiple Imputation Analysis After Logistic Regression for OWN"; run; /************************************************************ Redo Multiple Imputation With Min and Max Set, Note: Make all missing values . Prior to running Proc MI Use . when a missing value is not to be set for a variable. Min and Max values correspond to variables in order listed. Normal imputation method is once again used. *************************************************************/ proc mi data=FIVE nimpute=5 out=OUTMI4 seed=3355 minimum = 1 0 . . . 0 . 0 0 maximum = 2 1 . . . 18 . . .; var poverty own sex black other educ age84 logadl logadl2r ; title2 "Multiple Imputation Using Multivariate Normal Assumption"; title3 "With Min and Max Specified"; run; /*Recalculate diffadl after imputation*/ data OUTMI4; set OUTMI4; diffadl = logadl2r - logadl; run; /*Check minimum and maximum values after imputation. Then fit regression model for each of the imputed datasets and use Proc Mianalyze to combine estimates from imputed data.*/ proc means data=OUTMI4 n nmiss min max mean; where _Imputation_ = 1; title2 "Descriptives on First Imputed Data Set, After Setting Min and Max"; run; proc reg data=OUTMI4 outest=OUTREG3 covout ; by _Imputation_; model diffadl=poverty own educ logadl age84 sex black other; title2 "Regression Analysis on Imputed Data"; run; proc mianalyze data=OUTREG3; modeleffects Intercept poverty own educ logadl age84 sex black other; title2 "Combine the Estimates Using Proc MIANALYZE"; title3 "Assume Multivariate Normal Distribution, but with Min and Max"; run; /**************************************************************** MIANALYZE on Parameter Estimates and Covariances from GENMOD Note that we output three datasets, which are the inputs to Proc Mianalyze. *****************************************************************/ proc genmod data=outmi; model diffadl = poverty own educ logadl age84 sex black other / covb dist=normal; by _Imputation_; ods output ParameterEstimates=gmparms ParmInfo=gmpinfo CovB=gmcovb; title2 "Proc GENMOD on Imputed Data"; run; proc print data=gmparms(obs=20); var _Imputation_ Parameter Estimate StdErr; title2 "GENMOD Model Coefficients (First Two Imputations)"; run; proc print data=gmpinfo (obs=20); title2 "GENMOD Parameter Information (First Two Imputations)"; run; proc print data=gmcovb(obs=20); var _Imputation_ RowName Prm1-Prm9; title2 "GENMOD Covariance Matrices (First Two Imputations)"; run; proc mianalyze parms=gmparms covb=gmcovb parminfo=gmpinfo; modeleffects intercept poverty own educ logadl age84 sex black other; test Poverty = Own = Educ = 0 / mult; title2 "Analysis of Coefficients from GENMOD"; run; /**************************************************************** MIANALYZE on Parameter Estimates and Covariances from LOGISTIC Use dataset=outmi3, because it contains the categorical imputed data for Poverty, with values restricted to 1 and 2. Note that we output two datasets from Proc Logistic, the Parameter estimates go into lgparms, and the covariance matrix of the parameter estimates goes into lgcovb. These are then the inputs to Proc Mianalyze. *****************************************************************/ proc logistic data=outmi3; model poverty = own educ own*educ age84 sex black other / covb ; by _Imputation_; ods output ParameterEstimates=lgparms CovB=lgcovb; title2 "Proc LOGISTIC on Imputed Data"; run; proc print data=lgparms (obs=16); title2 "LOGISTIC Model Coefficients (First Two Imputations)"; run; proc print data=lgcovb (obs=16); title2 "LOGISTIC Model Covariance Matrices (First Two Imputations)"; run; proc mianalyze parms=lgparms covb(effectvar=stacking)=lgcovb; modeleffects Intercept own educ own*educ age84 sex black other; title2 "Analysis of Coefficients from LOGISTIC"; run; /**************************************************************** MIANALYZE on Parameter Estimates and Covariances from MIXED *****************************************************************/ proc mixed data=outmi; model diffadl=poverty own educ logadl age84 sex black other/solution covb; by _Imputation_; ods output SolutionF=mixparms CovB=mixcovb; title2 "Use Proc Mixed to Analyze Imputed Data"; run; proc print data=mixparms (obs=18); var _Imputation_ Effect Estimate StdErr; title "Model Coefficients from Proc MIXED (First Two Imputations)"; run; proc print data=mixcovb (obs=18); var _Imputation_ Row Effect Col1-Col9; title "Covariance Matrices (First Two Imputations)"; run; proc mianalyze parms=mixparms covb(effectvar=rowcol)=mixcovb; modeleffects Intercept poverty own educ logadl age84 sex black other ; SES: test Poverty = OWN = EDUC = 0 / mult; RACE: test black = 0, other = 0 / mult; title "Analysis of Coefficients from Proc MIXED"; run; /**************************************************************** MIANALYZE on Parameter Estimates and Covariances from MIXED with a CLASS Statement *****************************************************************/ proc mixed data=outmi; class racer; model diffadl=poverty own educ logadl age84 sex racer/solution covb; by _Imputation_; ods output SolutionF=mixparms2; title2 "Use Proc Mixed to Analyze Imputed Data"; title2 "Including a CLASS statement for RACER"; run; proc print data=mixparms2 (obs=20); var _Imputation_ Effect RACER Estimate StdErr; title "Model Coefficients from Proc MIXED (First Two Imputations)"; run; proc mianalyze parms(classvar=full)=mixparms2; class racer; modeleffects Intercept poverty own educ logadl age84 sex racer; title "Analysis of Coefficients from Proc MIXED, with CLASS Variable"; run;