################################################################################
####################Chapter 6 Unequal Probability Sampling######################
################################################################################


##########################example of physicians in United States#################
library(SDaA)
library(survey)
data(statepop)
nrow(statepop)
statepop[1:10,]
statepop$psi <- statepop$popn/255077536 #M_0 =  255077536  #define psi_i
plot(phys ~ psi, data = statepop, xlab = expression(paste(Psi[i]," 
for County")), ylab = "Physicians in County (in thousands)")

###one stage sampling with replacement
dppswr<- svydesign(id=~1, probs=~psi, data=statepop)
est1<-svytotal(~phys,dppswr)  #total of the n estimates
est2<-as.data.frame(est1)  
tpsihat<-est2[1]/100  #avergae the n estimates to get the \hat t_phi
setpsihat<-est2[2]/100 
tpsihat
setpsihat

#direct formula
tpsihat2<-sum(statepop$phys/statepop$psi)/100
tpsihat2
vtpsihat2<-sum((statepop$phys/statepop$psi-tpsihat2)^2)/(100*99)
setpsihat2<-sqrt(vtpsihat2)
setpsihat2

list(tpsihat=tpsihat,setpsihat=setpsihat,tpsihat2=tpsihat2,setpsihat2=
setpsihat2)

###Variances: without replacement
## Horvitz-Thompson type
#use brewer methods
dppsbr<-svydesign(id=~1,  fpc=~psi, data=statepop, 
pps="brewer",variance="HT")  #default is "HT"
dppsbr
svytotal(~phys,dppsbr)

#use Hartley-Rao approximation
dppsht<- svydesign(id=~1,  fpc=~psi, data=statepop, 
pps=HR(sum(statepop$psi^2)/100)) ##use Hartley-Rao approximation
dppsht
svytotal(~phys,dppsht)

## Yates-Grundy type
dppsyg<- svydesign(id=~1,  fpc=~psi, data=statepop, 
pps=HR(sum(statepop$psi^2)/100),variance="YG")  #default is "HT"
dppsyg
svytotal(~phys,dppsyg)

##########################PPS package#################
install.packages("pps")
library(pps)

data(calif)
data(califcty)

#The data set calif is based
#on data from the 2000 U.S. census, available at the U.S. Census Bureau’s web
#site, http://www.census.gov. The file includes one record for each place in
#California. A place can be anything from a tiny town to Los Angeles. The file
#includes a county code, and a stratum code has been added. The four strata
#were derived according to county size: counties with fewer than 50,000 people
#are in stratum 1 (as are the places in those counties), with 50,000 to 300,000
#people in stratum 2, with 300,000 to 1,000,000 people in stratum 3, and the
#remaining ones are in stratum 4. The other variables in the file are total
#population (variable name population), and the number of whites (white),
#American Indians (amerind) and Hispanics (hispanic). A second data set,
#califcty, is derived from calif. It is the county-level summary of the bigger
#file, i.e., each record in califcty corresponds to a county.

####stratified sampling

table(calif$stratum) # take a look at the size of each stratum
nh <- c(40,50,40,60) # these are the sample sizes we want
calif<-calif[order(calif$stratum),] # sort by stratum
califsample <- calif[stratsrs(calif$stratum,nh),]
califsample[1:10,]
by(califsample[,2:5],califsample$stratum,summary)
by(calif[,2:5],calif$stratum,summary)

####ppswr, pps with replacement

#The function call
#is ppswr(sizes,n), where sizes is a vector of the sizes of the population
#units and n is the sample size.

county1 <- calif[calif$county==1,] # get the units in county 1, there are 20 
#places in county 1 with different population size
county1
county1sample<-county1[ppswr(county1$population,5),] # select 5 units
county1sample

####ppswor, pps without replacement, sampford and sampfordpi
#The function sampford select a PPS sample without replacement using Sampford’s
#method. The function sampfordpi computes the corresponding inclusion
#pi_i and joint inclusion probabilities  pi_{ij}.
#the Sampford ones incorporate a check for units that are
#too big and aborts if such units are found

sampford(county1$popul,5) # what if n is too big?
worsample<-county1[sampford(county1$popul,3),]
worsample
piij<-sampfordpi(county1$popul,3)
diag(piij) # these are the inclusion probabilities
wt<-1/diag(piij) # these are the weights to use for estimation