################################################################################ ####################Chapter 5 Cluster Sampling with Equal Probability########### ################################################################################ ################################################################################ #####one stage cluster sampling, GPA data####################################### ################################################################################ ex.data <- data.frame(suite = c(1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5), gpa = c(3.08, 2.60,3.44,3.04,2.36,3.04,3.28,2.68,2.00,2.56,2.52,1.88,3.00,2.88,3.44,3.64,2.68,1.92,3.28,3.20), wt=c(20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20)) ex.data #descriptive statistics aa<-tapply(ex.data$gpa,ex.data$suite,sum) #sum gpa for each suite aa mean(aa) var(aa) #gives s_t^2 #define one-stage cluster design design1<-svydesign(id=~suite,fpc=~rep(100,20),nest=TRUE,data=ex.data) design1 ##recall srs: dsrs <- svydesign(id = ~1, fpc=rep(3078,300), data = agsrs) summary(design1) total1<-svytotal(~gpa,design1) total1 mean1<-svymean(~gpa,design1) mean1 ci1<-confint(mean1,level=.95) ci1 ci2<-confint(mean1,level=.95,df=4) ##use t-approximation ci2 #anova table, consider suites as factor with 5 levels, one-way ANOVA can be used to derive the anova talbe myfit<-lm(gpa~factor(suite),ex.data) anova(myfit) ################################################################################ #####two stage cluster sampling, Coots data##################################### ################################################################################ library(survey) library(SDaA) data(coots) attach(coots) #coots.dat come from ArnoldĄ¯s (1991) work on egg size #and volume of American coot eggs in Minnedosa, Manitoba #In this data set, we look at volumes of a subsample of eggs #in clutches (nests of eggs) with at least two eggs available for #measurement #Randomly select 2 eggs in each clutch and measure their volume #Want to estimate the mean egg volume nrow(coots) coots[1:10,] #clutch is psu #csize: number of eggs in the clutch, M_i #m_i=2 # n= 184 clutches selected ##plot data plot(volume ~ clutch, xlim = c(0, 200), pch = 19, data = coots, xlab = "Clutch Number", ylab = "Egg Volume") #Note the wide variation in the means from clutch to clutch. #eggs within the same clutch tend to be more similar #than two randomly selected eggs from different clutches, and that clustering does #not provide as much information per egg as would an SRS of eggs. ##calculate using direct formulas #split data using clutch, response interested is volume coots2<-split(volume,clutch) coots2 coots2[[1]] nclus<-length(coots2) # number of clusters selected nclus #184 clusmean<-sapply(coots2,mean) #this is yibar clusmean clusvar<-sapply(coots2,var) #this is si clusvar #split data using clutch, response interested is csize coots3<-split(csize,clutch) mi<-sapply(coots3,length) mi Mi<-sapply(coots3,mean) Mi ##unbiased estimator, suppose N is known as 10000, 1-n/N goes to 1 ##suppose M0=95000 N<-10000 tunb<-N*sum(Mi*clusmean)/nclus tunb yunb<-tunb/95000 yunb #variances ssufpc<-1-mi/Mi st2<-var(Mi*clusmean) varterm2<-sum(ssufpc*(Mi^2)*clusvar/mi) vartunb<-N^2*(1-nclus/N)*st2/nclus + (N/nclus)*varterm2 vartunb varyunb<-vartunb/(95000)^2 varyunb seyunb<-sqrt(varyunb) seyunb ##ratio estimation ybarratio<-sum(Mi*clusmean)/sum(Mi) ybarratio sr2<-var(Mi*(clusmean - ybarratio)) varybarr<-( (1 - nclus/N)*sr2/nclus + varterm2/(nclus*N) )/(mean(Mi))^2 varybarr seybarr<-sqrt(varybarr) seybarr #compare first term and second term in variance formula v1<-((1 - nclus/N)*sr2/nclus)/(mean(Mi))^2 v2<- (varterm2/(nclus*N))/(mean(Mi))^2 v1 v2 v1+v2 #textbook use ssufpc<-1, and ignore second term varybarrtext<-(sr2/nclus)/(mean(Mi))^2 varybarrtext seybarrtext<-sqrt(varybarrtext) seybarrtext list(yunb=yunb,seyunb<-seyunb, yratio=ybarratio,seybarr<-seybarr,seybarrtext=seybarrtext) ##use svymean and svytotal to calculate the estimates cootsfpc1<-rep(10000,368) #Note: 10000 is an arbitrary number, we don't have information of N cootsfpc2<-coots$csize #number of ssus in each psu ssu<-rep(1:2,184) #index of ssu design3<-svydesign(id=~clutch+ssu,fpc=~cootsfpc1+cootsfpc2,data=coots) summary(design3) svymean(~volume,design3) #ratio estimator svytotal(~volume,design3) #unbiased estimator of the total ################################################################################ #####two stage cluster sampling, API data####################################### ################################################################################ # The Academic Performance Index (API) is computed for all California # schools based on standardized testing of students. # The data sets contain information for all schools with at least 100 students # and for various probability samples of the data. # apipop contains the entire population. # apistrat contains a sample stratified by stype (Elementary/Middle/High School). # apiclus1 contains a cluster sample of school districts. # apiclus2 contains a two-stage cluster sample of schools within districts. (this is the data we used here) #apiclus2 is a sample obtained using a two-stage cluster sampling design, # 1st stage, n = 40 SRS districts were selected from N=757 districts, # 1st stage, fpc1 is a vector of N = 757 # 2nd stage, within selected district i with Mi schools, one or more schools were selected using SRS # 2nd stage, fpc2 is a vector of total number of schools in each sampled district (i.e., fpc2 = Mi). # snum: School number, dnum: District number # enroll: number of students enrolled (response variable) # there are missing data of enroll, use na.rm=TRUE, consider that these are # missing at random so as not to bias inferences. data(api) #loads several data frames that are samples from the data frame apipop. #help(api) apiclus2[1:5,] dimnames(apiclus2)[2] clu2design<-svydesign(id=~dnum+snum,fpc=~fpc1+fpc2,data=apiclus2) summary(clu2design) smean<-svymean(~enroll,clu2design,na.rm=TRUE) #ratio estimation smean #identify the schools with missing values with(apiclus2, sname[is.na(enroll)]) svytotal(~enroll, clu2design, na.rm = TRUE) #unbiased estimator #you may also use ratio estimator when M0 is known