################################################################################
####################Chapter 5 Cluster Sampling with Equal Probability###########
################################################################################

################################################################################
#####one stage cluster sampling, GPA data#######################################
################################################################################
ex.data <- data.frame(suite = c(1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5), gpa = c(3.08,
2.60,3.44,3.04,2.36,3.04,3.28,2.68,2.00,2.56,2.52,1.88,3.00,2.88,3.44,3.64,2.68,1.92,3.28,3.20),
wt=c(20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20))
ex.data
#descriptive statistics
aa<-tapply(ex.data$gpa,ex.data$suite,sum)  #sum gpa for each suite
aa
mean(aa)
var(aa) #gives s_t^2
#define one-stage cluster design
design1<-svydesign(id=~suite,fpc=~rep(100,20),nest=TRUE,data=ex.data) 
design1
##recall srs: dsrs <- svydesign(id = ~1, fpc=rep(3078,300), data = agsrs)
summary(design1)
total1<-svytotal(~gpa,design1)
total1
mean1<-svymean(~gpa,design1)
mean1
ci1<-confint(mean1,level=.95)
ci1
ci2<-confint(mean1,level=.95,df=4) ##use t-approximation
ci2

#anova table, consider suites as factor with 5 levels, one-way ANOVA can be used to derive the anova talbe
myfit<-lm(gpa~factor(suite),ex.data)
anova(myfit)


################################################################################
#####two stage cluster sampling, Coots data#####################################
################################################################################

library(survey)
library(SDaA)
data(coots)
attach(coots)

#coots.dat come from Arnold’s (1991) work on egg size
#and volume of American coot eggs in Minnedosa, Manitoba
#In this data set, we look at volumes of a subsample of eggs
#in clutches (nests of eggs) with at least two eggs available for
#measurement
#Randomly select 2 eggs in each clutch and measure their volume
#Want to estimate the mean egg volume

nrow(coots)
coots[1:10,]
#clutch is psu
#csize: number of eggs in the clutch, M_i
#m_i=2
# n= 184 clutches selected
##plot data
plot(volume ~ clutch, xlim = c(0, 200), pch = 19,
data = coots, xlab = "Clutch Number", ylab = "Egg Volume")

#Note the wide variation in the means from clutch to clutch. 
#eggs within the same clutch tend to be more similar 
#than two randomly selected eggs from different clutches, and that clustering does 
#not provide as much information per egg as would an SRS of eggs.

##calculate using direct formulas

#split data using clutch, response interested is volume
coots2<-split(volume,clutch)
coots2
coots2[[1]]
nclus<-length(coots2) # number of clusters selected
nclus  #184
clusmean<-sapply(coots2,mean) #this is yibar
clusmean
clusvar<-sapply(coots2,var) #this is si
clusvar

#split data using clutch, response interested is csize
coots3<-split(csize,clutch)
mi<-sapply(coots3,length)
mi
Mi<-sapply(coots3,mean)
Mi

##unbiased estimator, suppose N is known as 10000, 1-n/N goes to 1
##suppose M0=95000
N<-10000
tunb<-N*sum(Mi*clusmean)/nclus
tunb
yunb<-tunb/95000
yunb
#variances
ssufpc<-1-mi/Mi
st2<-var(Mi*clusmean)
varterm2<-sum(ssufpc*(Mi^2)*clusvar/mi)
vartunb<-N^2*(1-nclus/N)*st2/nclus + (N/nclus)*varterm2
vartunb
varyunb<-vartunb/(95000)^2
varyunb
seyunb<-sqrt(varyunb)
seyunb
##ratio estimation
ybarratio<-sum(Mi*clusmean)/sum(Mi)
ybarratio
sr2<-var(Mi*(clusmean - ybarratio))
varybarr<-( (1 - nclus/N)*sr2/nclus + varterm2/(nclus*N) )/(mean(Mi))^2
varybarr
seybarr<-sqrt(varybarr)
seybarr
#compare first term and second term in variance formula
v1<-((1 - nclus/N)*sr2/nclus)/(mean(Mi))^2
v2<- (varterm2/(nclus*N))/(mean(Mi))^2
v1
v2
v1+v2

#textbook use ssufpc<-1, and ignore second term
varybarrtext<-(sr2/nclus)/(mean(Mi))^2
varybarrtext
seybarrtext<-sqrt(varybarrtext)
seybarrtext

list(yunb=yunb,seyunb<-seyunb, yratio=ybarratio,seybarr<-seybarr,seybarrtext=seybarrtext)


##use svymean and svytotal to calculate the estimates 
cootsfpc1<-rep(10000,368) #Note: 10000 is an arbitrary number, we don't have information of N
cootsfpc2<-coots$csize #number of ssus in each psu
ssu<-rep(1:2,184) #index of ssu
design3<-svydesign(id=~clutch+ssu,fpc=~cootsfpc1+cootsfpc2,data=coots)
summary(design3)

svymean(~volume,design3)  #ratio estimator
svytotal(~volume,design3)  #unbiased estimator of the total


################################################################################
#####two stage cluster sampling, API data#######################################
################################################################################

# The Academic Performance Index (API) is computed for all California
# schools based on standardized testing of students.
# The data sets contain information for all schools with at least 100 students
# and for various probability samples of the data.
# apipop contains the entire population.
# apistrat contains a sample stratified by stype (Elementary/Middle/High School).
# apiclus1 contains a cluster sample of school districts.
# apiclus2 contains a two-stage cluster sample of schools within districts. (this is the data we used here)

#apiclus2 is a sample obtained using a two-stage cluster sampling design, 
# 1st stage,  n = 40 SRS districts were selected from N=757 districts,
# 1st stage, fpc1 is a vector of N = 757
# 2nd stage, within selected district i with Mi schools, one or more schools were selected using SRS
# 2nd stage, fpc2 is a vector of total number of schools in each sampled district (i.e., fpc2 = Mi).
# snum: School number, dnum: District number 
# enroll: number of students enrolled (response variable)
# there are missing data of enroll, use na.rm=TRUE, consider that these are 
# missing at random so as not to bias inferences.

data(api) #loads several data frames that are samples from the data frame apipop.
#help(api)
apiclus2[1:5,]
dimnames(apiclus2)[2]
clu2design<-svydesign(id=~dnum+snum,fpc=~fpc1+fpc2,data=apiclus2)
summary(clu2design)

smean<-svymean(~enroll,clu2design,na.rm=TRUE) #ratio estimation
smean
#identify the schools with missing values 
with(apiclus2, sname[is.na(enroll)])

svytotal(~enroll, clu2design, na.rm = TRUE)  #unbiased estimator
#you may also use ratio estimator when M0 is known