########################## Chapter 8, missing data imputation ##########################
#
# Walk-through of missing-data imputation with the `mice` package on the first
# 20 observations of the `airquality` data:
#   1. load the data and inspect the missing-data rates per column / per row,
#   2. impute with predictive mean matching (m = 5 imputed data sets),
#   3. impute with the unconditional mean (m = 1) and compare the imputed
#      values with the hand-computed mean of the observed Ozone values.

# ---- Load data ----
# The course file holds the first 20 observations of the built-in `airquality`
# data set. If the file is not available on this machine, fall back to the
# built-in data so the script still runs.
data.file <- "C:/jenn/teaching/stat472572/data/airquality20.txt"
if (file.exists(data.file)) {
  ex.data <- read.table(file = data.file, header = TRUE)
} else {
  message("'", data.file, "' not found; using airquality[1:20, ] instead.")
  ex.data <- datasets::airquality[1:20, ]
}

ex.data # notice 2 missing points for "Ozone", and 3 missing points for "Solar.R"
summary(ex.data)

# ---- Missing rates ----
# Proportion of missing values in a vector.
#   x: any atomic vector (e.g. one column of a data frame).
#   Returns a single number in [0, 1] (NaN for a zero-length vector).
pmiss <- function(x) {
  mean(is.na(x))
}

# Missing rate for each variable (column). vapply() walks the columns directly
# instead of coercing the data frame to a matrix as apply(ex.data, 2, pmiss)
# would.
vapply(ex.data, pmiss, numeric(1))

# Missing rate for each observation (row); equivalent to
# apply(ex.data, 1, pmiss).
rowMeans(is.na(ex.data))

# ---- Impute missing values using package mice ----
# Install mice only when it is not already available, rather than on every run.
if (!requireNamespace("mice", quietly = TRUE)) {
  install.packages("mice")
}
library(mice)

md.pattern(ex.data) # display missing-data patterns

## (1) method "pmm": predictive mean matching
# m = 5            number of multiple imputations (5 is also the default).
# method = "pmm"   predictive mean matching, especially useful for imputing
#                  quantitative variables that are not normally distributed.
# Type methods(mice) for a list of the available imputation methods, e.g.
#   method = "norm.predict"  (linear regression, predicted values)
#   method = "mean"          (unconditional mean imputation)
tempdata <- mice(ex.data, m = 5, method = "pmm", seed = 500)
summary(tempdata)

tempdata$imp$Ozone   # imputed values for Ozone (rows 5 and 10 are missing)
tempdata$imp$Solar.R # imputed values for Solar.R

imputedata <- mice::complete(tempdata, 1) # get the first imputed sample

## (2) method "mean": unconditional mean imputation
tempdata2 <- mice(ex.data, m = 1, method = "mean", seed = 500)
summary(tempdata2)

# Remove the two observations with Ozone missing. Use is.na(): comparing with
# the string "NA" only appears to work because which() drops the NA results.
ex.data2 <- ex.data[!is.na(ex.data$Ozone), ]
mean(ex.data2$Ozone) # 19.22

tempdata2$imp$Ozone   # imputed values for Ozone, they are 19.22
tempdata2$imp$Solar.R # imputed values for Solar.R

imputedata2 <- mice::complete(tempdata2)
imputedata2