handout6

################################################################################
####################Chapter 6 Unequal Probability Sampling######################
################################################################################


##########################example of physicians in United States#################
library(SDaA)
library(survey)

## Loading required package: grid

## Loading required package: Matrix

## Loading required package: survival

## 
## Attaching package: 'survey'

## The following object is masked from 'package:graphics':
## 
##     dotchart

data(statepop)
nrow(statepop)

## [1] 100

statepop[1:10,]

##    state       county landarea    popn  phys farmpop numfarm farmacre
## 1     AL       Wilcox      889   13672     4     666     322   156950
## 2     AZ     Maricopa     9204 2209567  4320    2124    2334  1391456
## 3     AZ     Maricopa     9204 2209567  4320    2124    2334  1391456
## 4     AZ        Pinal     5370  120786    61     881     730  1958489
## 5     AR      Garland      678   76100   131     524     389    41293
## 6     AR  Mississippi      898   55060    48     955     615   488042
## 7     CA Contra_Costa      720  840585  1761     607     840   200262
## 8     CA         Kern     8142  587680   682    3307    2255  3037068
## 9     CA  Los_Angeles     4060 9053645 23677    1154    2035   280156
## 10    CA  Los_Angeles     4060 9053645 23677    1154    2035   280156
##    veterans percviet
## 1       836     20.8
## 2    262170     31.5
## 3    262170     31.5
## 4     14858     29.1
## 5     11055     21.3
## 6      5285     33.8
## 7     93838     32.5
## 8     56910     32.9
## 9    688576     27.3
## 10   688576     27.3

statepop$psi <- statepop$popn/255077536 #M_0 =  255077536  #define psi_i
plot(phys ~ psi, data = statepop, xlab = expression(paste(Psi[i]," 
for County")), ylab = "Physicians in County (in thousands)")

###one stage sampling with replacement
dppswr<- svydesign(id=~1, probs=~psi, data=statepop)
dppswr

## Independent Sampling design (with replacement)
## svydesign(id = ~1, probs = ~psi, data = statepop)

est1<-svytotal(~phys,dppswr)  #total of the n estimates
est1

##         total      SE
## phys 57030430 4140123

est2<-as.data.frame(est1) 
est2

##         total    phys
## phys 57030430 4140123

tpsihat<-est2[1]/100  #avergae the n estimates to get the \hat t_phi
setpsihat<-est2[2]/100 
tpsihat

##         total
## phys 570304.3

setpsihat

##          phys
## phys 41401.23

#direct formula
tpsihat2<-sum(statepop$phys/statepop$psi)/100
tpsihat2

## [1] 570304.3

vtpsihat2<-sum((statepop$phys/statepop$psi-tpsihat2)^2)/(100*99)
setpsihat2<-sqrt(vtpsihat2)
setpsihat2

## [1] 41401.23

list(tpsihat=tpsihat,setpsihat=setpsihat,tpsihat2=tpsihat2,setpsihat2=
setpsihat2)

## $tpsihat
##         total
## phys 570304.3
## 
## $setpsihat
##          phys
## phys 41401.23
## 
## $tpsihat2
## [1] 570304.3
## 
## $setpsihat2
## [1] 41401.23

###Variances: without replacement
## Horvitz-Thompson type
#use brewer methods
dppsbr<-svydesign(id=~1,  fpc=~psi, data=statepop, 
pps="brewer",variance="HT")  #default is "HT"
dppsbr

## Independent Sampling design
## svydesign(id = ~1, fpc = ~psi, data = statepop, pps = "brewer", 
##     variance = "HT")

svytotal(~phys,dppsbr)

##         total      SE
## phys 57030430 4132937

#use Hartley-Rao approximation
dppsht<- svydesign(id=~1,  fpc=~psi, data=statepop, 
pps=HR(sum(statepop$psi^2)/100)) ##use Hartley-Rao approximation
dppsht

## Sparse-matrix design object:
##  svydesign(id = ~1, fpc = ~psi, data = statepop, pps = HR(sum(statepop$psi^2)/100))

svytotal(~phys,dppsht)

##         total      SE
## phys 57030430 4148640

## Yates-Grundy type
dppsyg<- svydesign(id=~1,  fpc=~psi, data=statepop, 
pps=HR(sum(statepop$psi^2)/100),variance="YG")  #default is "HT"
dppsyg

## Sparse-matrix design object:
##  svydesign(id = ~1, fpc = ~psi, data = statepop, pps = HR(sum(statepop$psi^2)/100), 
##     variance = "YG")

svytotal(~phys,dppsyg)

##         total      SE
## phys 57030430 4124554

##########################PPS package#################
#install.packages("pps")
library(pps)

data(calif)
data(califcty)

#The data set calif is based
#on data from the 2000 U.S. census, available at the U.S. Census Bureau???s web
#site, http://www.census.gov. The file includes one record for each place in
#California. A place can be anything from a tiny town to Los Angeles. The file
#includes a county code, and a stratum code has been added. The four strata
#were derived according to county size: counties with fewer than 50,000 people
#are in stratum 1 (as are the places in those counties), with 50,000 to 300,000
#people in stratum 2, with 300,000 to 1,000,000 people in stratum 3, and the
#remaining ones are in stratum 4. The other variables in the file are total
#population (variable name population), and the number of whites (white),
#American Indians (amerind) and Hispanics (hispanic). A second data set,
#califcty, is derived from calif. It is the county-level summary of the bigger
#file, i.e., each record in califcty corresponds to a county.

####stratified sampling

table(calif$stratum) # take a look at the size of each stratum

## 
##   1   2   3   4 
## 179 223 291 384

nh <- c(40,50,40,60) # these are the sample sizes we want
calif<-calif[order(calif$stratum),] # sort by stratum
califsample <- calif[stratsrs(calif$stratum,nh),]
califsample[1:10,]

##      county population white amerind hispanic stratum
## 576      32       1160   975     108      108       1
## 925      47        351   304      26       14       1
## 275      18       1998  1766     105      170       1
## 51        5       2061  1943      20      116       1
## 61        6       3670  1668      42     2613       1
## 1046     54       1865  1660      80      130       1
## 923      47        136   121       5        3       1
## 1039     54       3388  3138      54      167       1
## 1044     54       4423  4041      66      372       1
## 564      32         37    33       4        0       1

by(califsample[,2:5],califsample$stratum,summary)

## califsample$stratum: 1
##    population          white            amerind          hispanic      
##  Min.   :   37.0   Min.   :   33.0   Min.   :  0.00   Min.   :   0.00  
##  1st Qu.:  302.2   1st Qu.:  263.5   1st Qu.:  9.25   1st Qu.:  24.75  
##  Median : 1597.5   Median : 1466.5   Median : 28.50   Median : 121.50  
##  Mean   : 2338.6   Mean   : 1947.6   Mean   : 59.17   Mean   : 336.07  
##  3rd Qu.: 3080.5   3rd Qu.: 2329.2   3rd Qu.: 62.25   3rd Qu.: 221.00  
##  Max.   :13541.0   Max.   :10823.0   Max.   :431.00   Max.   :2613.00  
## -------------------------------------------------------- 
## califsample$stratum: 2
##    population        white            amerind         hispanic      
##  Min.   :  149   Min.   :  124.0   Min.   :  1.0   Min.   :    3.0  
##  1st Qu.: 1253   1st Qu.:  934.8   1st Qu.: 22.0   1st Qu.:  192.2  
##  Median : 5998   Median : 3674.0   Median : 53.0   Median :  803.5  
##  Mean   :11059   Mean   : 8071.2   Mean   :138.2   Mean   : 2705.7  
##  3rd Qu.:13051   3rd Qu.:10880.2   3rd Qu.:139.0   3rd Qu.: 2091.0  
##  Max.   :79921   Max.   :68756.0   Max.   :818.0   Max.   :26425.0  
## -------------------------------------------------------- 
## califsample$stratum: 3
##    population         white          amerind          hispanic      
##  Min.   :   339   Min.   :  140   Min.   :   4.0   Min.   :   37.0  
##  1st Qu.:  1949   1st Qu.: 1500   1st Qu.:  26.0   1st Qu.:  348.5  
##  Median :  6380   Median : 3502   Median :  59.0   Median : 1052.0  
##  Mean   : 16568   Mean   :12123   Mean   : 151.6   Mean   : 3957.0  
##  3rd Qu.: 18124   3rd Qu.:15425   3rd Qu.: 172.5   3rd Qu.: 4478.2  
##  Max.   :100916   Max.   :79511   Max.   :1173.0   Max.   :24573.0  
## -------------------------------------------------------- 
## califsample$stratum: 4
##    population         white           amerind          hispanic       
##  Min.   :   748   Min.   :   372   Min.   :   6.0   Min.   :   122.0  
##  1st Qu.:  5940   1st Qu.:  3853   1st Qu.:  36.5   1st Qu.:   841.5  
##  Median : 17537   Median : 10809   Median :  87.0   Median :  3735.5  
##  Mean   : 40774   Mean   : 22996   Mean   : 321.1   Mean   : 14964.9  
##  3rd Qu.: 55831   3rd Qu.: 28993   3rd Qu.: 373.0   3rd Qu.: 15839.0  
##  Max.   :337977   Max.   :150194   Max.   :4013.0   Max.   :257097.0

by(calif[,2:5],calif$stratum,summary)

## calif$stratum: 1
##    population          white          amerind          hispanic      
##  Min.   :    5.0   Min.   :    5   Min.   :  0.00   Min.   :    0.0  
##  1st Qu.:  196.5   1st Qu.:  178   1st Qu.:  4.00   1st Qu.:   11.0  
##  Median : 1160.0   Median :  862   Median : 24.00   Median :   85.0  
##  Mean   : 2378.4   Mean   : 1909   Mean   : 58.05   Mean   :  427.6  
##  3rd Qu.: 2838.5   3rd Qu.: 2436   3rd Qu.: 58.00   3rd Qu.:  266.5  
##  Max.   :34413.0   Max.   :20341   Max.   :587.00   Max.   :18949.0  
## -------------------------------------------------------- 
## calif$stratum: 2
##    population        white          amerind          hispanic    
##  Min.   :   96   Min.   :   81   Min.   :   0.0   Min.   :    1  
##  1st Qu.: 1722   1st Qu.: 1118   1st Qu.:  19.5   1st Qu.:  189  
##  Median : 4687   Median : 3098   Median :  53.0   Median :  729  
##  Mean   :10400   Mean   : 7518   Mean   : 140.8   Mean   : 2987  
##  3rd Qu.:10533   3rd Qu.: 8465   3rd Qu.: 139.0   3rd Qu.: 2912  
##  Max.   :91565   Max.   :71727   Max.   :1802.0   Max.   :33254  
## -------------------------------------------------------- 
## calif$stratum: 3
##    population         white           amerind          hispanic     
##  Min.   :    77   Min.   :    66   Min.   :   0.0   Min.   :     4  
##  1st Qu.:  1862   1st Qu.:  1422   1st Qu.:  17.5   1st Qu.:   270  
##  Median :  5352   Median :  3593   Median :  54.0   Median :  1059  
##  Mean   : 22606   Mean   : 13807   Mean   : 212.3   Mean   :  6383  
##  3rd Qu.: 18985   3rd Qu.: 10661   3rd Qu.: 169.0   3rd Qu.:  5389  
##  Max.   :776733   Max.   :385728   Max.   :6763.0   Max.   :170520  
## -------------------------------------------------------- 
## calif$stratum: 4
##    population          white            amerind           hispanic      
##  Min.   :      3   Min.   :      3   Min.   :    0.0   Min.   :      0  
##  1st Qu.:   8192   1st Qu.:   5292   1st Qu.:   48.0   1st Qu.:   1190  
##  Median :  22927   Median :  13803   Median :  190.0   Median :   5472  
##  Mean   :  56888   Mean   :  31530   Mean   :  464.6   Mean   :  20085  
##  3rd Qu.:  55521   3rd Qu.:  32008   3rd Qu.:  445.8   3rd Qu.:  16603  
##  Max.   :3694820   Max.   :1734036   Max.   :29412.0   Max.   :1719073

####ppswr, pps with replacement

#The function call
#is ppswr(sizes,n), where sizes is a vector of the sizes of the population
#units and n is the sample size.

county1 <- calif[calif$county==1,] # get the units in county 1, there are 20 
#places in county 1 with different population size
county1

##    county population  white amerind hispanic stratum
## 1       1      72259  41148     484     6725       4
## 2       1      16444  10078      64     1312       4
## 3       1      20793   8115     269     6753       4
## 4       1     102743  60797     467    10001       4
## 5       1      57292  40587     336     6984       4
## 6       1      13837   7319     161     5774       4
## 7       1      29973  20793     220     4059       4
## 8       1       6882   3096      34      616       4
## 9       1       9470   5234      53     1433       4
## 10      1     203413  96968    1048    27409       4
## 11      1     140030  60146    1177    47850       4
## 12      1      73345  60070     444    10541       4
## 13      1      42471  22179     273    12145       4
## 14      1     399484 125013    2655    87467       4
## 15      1      10952   8607      12      325       4
## 16      1      63654  51203     210     5011       4
## 17      1      79452  40754     609    15939       4
## 18      1      21898  13865     195     5398       4
## 19      1       1332   1125      13      116       4
## 20      1      66869  20198     356    16020       4

county1sample<-county1[ppswr(county1$population,5),] # select 5 units
county1sample

##    county population white amerind hispanic stratum
## 7       1      29973 20793     220     4059       4
## 13      1      42471 22179     273    12145       4
## 20      1      66869 20198     356    16020       4
## 10      1     203413 96968    1048    27409       4
## 4       1     102743 60797     467    10001       4

####ppswor, pps without replacement, sampford and sampfordpi
#The function sampford select a PPS sample without replacement using Sampford???s
#method. The function sampfordpi computes the corresponding inclusion
#pi_i and joint inclusion probabilities  pi_{ij}.
#the Sampford ones incorporate a check for units that are
#too big and aborts if such units are found

sampford(county1$popul,5) # what if n is too big?

## Unit 14 is too big.
## Some units are too big. Aborting.

worsample<-county1[sampford(county1$popul,3),]
worsample

##    county population white amerind hispanic stratum
## 15      1      10952  8607      12      325       4
## 16      1      63654 51203     210     5011       4
## 4       1     102743 60797     467    10001       4

piij<-sampfordpi(county1$popul,3)
diag(piij) # these are the inclusion probabilities

##  [1] 0.151317925 0.034435461 0.043542723 0.215154618 0.119975457
##  [6] 0.028976129 0.062766606 0.014411630 0.019831173 0.425968157
## [11] 0.293237507 0.153592123 0.088938729 0.836561396 0.022934637
## [16] 0.133298152 0.166380821 0.045856709 0.002789348 0.140030700

wt<-1/diag(piij) # these are the weights to use for estimation