################################################################################
####################Chapter 6 Unequal Probability Sampling######################
################################################################################
##########################example of physicians in United States#################
library(SDaA)
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
data(statepop)
nrow(statepop)
## [1] 100
statepop[1:10,]
## state county landarea popn phys farmpop numfarm farmacre
## 1 AL Wilcox 889 13672 4 666 322 156950
## 2 AZ Maricopa 9204 2209567 4320 2124 2334 1391456
## 3 AZ Maricopa 9204 2209567 4320 2124 2334 1391456
## 4 AZ Pinal 5370 120786 61 881 730 1958489
## 5 AR Garland 678 76100 131 524 389 41293
## 6 AR Mississippi 898 55060 48 955 615 488042
## 7 CA Contra_Costa 720 840585 1761 607 840 200262
## 8 CA Kern 8142 587680 682 3307 2255 3037068
## 9 CA Los_Angeles 4060 9053645 23677 1154 2035 280156
## 10 CA Los_Angeles 4060 9053645 23677 1154 2035 280156
## veterans percviet
## 1 836 20.8
## 2 262170 31.5
## 3 262170 31.5
## 4 14858 29.1
## 5 11055 21.3
## 6 5285 33.8
## 7 93838 32.5
## 8 56910 32.9
## 9 688576 27.3
## 10 688576 27.3
statepop$psi <- statepop$popn/255077536 #M_0 = 255077536 #define psi_i
plot(phys ~ psi, data = statepop, xlab = expression(paste(Psi[i],"
for County")), ylab = "Physicians in County (in thousands)")
###one stage sampling with replacement
dppswr<- svydesign(id=~1, probs=~psi, data=statepop)
dppswr
## Independent Sampling design (with replacement)
## svydesign(id = ~1, probs = ~psi, data = statepop)
est1<-svytotal(~phys,dppswr) #total of the n estimates
est1
## total SE
## phys 57030430 4140123
est2<-as.data.frame(est1)
est2
## total phys
## phys 57030430 4140123
tpsihat<-est2[1]/100 #avergae the n estimates to get the \hat t_phi
setpsihat<-est2[2]/100
tpsihat
## total
## phys 570304.3
setpsihat
## phys
## phys 41401.23
#direct formula
tpsihat2<-sum(statepop$phys/statepop$psi)/100
tpsihat2
## [1] 570304.3
vtpsihat2<-sum((statepop$phys/statepop$psi-tpsihat2)^2)/(100*99)
setpsihat2<-sqrt(vtpsihat2)
setpsihat2
## [1] 41401.23
list(tpsihat=tpsihat,setpsihat=setpsihat,tpsihat2=tpsihat2,setpsihat2=
setpsihat2)
## $tpsihat
## total
## phys 570304.3
##
## $setpsihat
## phys
## phys 41401.23
##
## $tpsihat2
## [1] 570304.3
##
## $setpsihat2
## [1] 41401.23
###Variances: without replacement
## Horvitz-Thompson type
#use brewer methods
dppsbr<-svydesign(id=~1, fpc=~psi, data=statepop,
pps="brewer",variance="HT") #default is "HT"
dppsbr
## Independent Sampling design
## svydesign(id = ~1, fpc = ~psi, data = statepop, pps = "brewer",
## variance = "HT")
svytotal(~phys,dppsbr)
## total SE
## phys 57030430 4132937
#use Hartley-Rao approximation
dppsht<- svydesign(id=~1, fpc=~psi, data=statepop,
pps=HR(sum(statepop$psi^2)/100)) ##use Hartley-Rao approximation
dppsht
## Sparse-matrix design object:
## svydesign(id = ~1, fpc = ~psi, data = statepop, pps = HR(sum(statepop$psi^2)/100))
svytotal(~phys,dppsht)
## total SE
## phys 57030430 4148640
## Yates-Grundy type
dppsyg<- svydesign(id=~1, fpc=~psi, data=statepop,
pps=HR(sum(statepop$psi^2)/100),variance="YG") #default is "HT"
dppsyg
## Sparse-matrix design object:
## svydesign(id = ~1, fpc = ~psi, data = statepop, pps = HR(sum(statepop$psi^2)/100),
## variance = "YG")
svytotal(~phys,dppsyg)
## total SE
## phys 57030430 4124554
##########################PPS package#################
#install.packages("pps")
library(pps)
data(calif)
data(califcty)
#The data set calif is based
#on data from the 2000 U.S. census, available at the U.S. Census Bureau???s web
#site, http://www.census.gov. The file includes one record for each place in
#California. A place can be anything from a tiny town to Los Angeles. The file
#includes a county code, and a stratum code has been added. The four strata
#were derived according to county size: counties with fewer than 50,000 people
#are in stratum 1 (as are the places in those counties), with 50,000 to 300,000
#people in stratum 2, with 300,000 to 1,000,000 people in stratum 3, and the
#remaining ones are in stratum 4. The other variables in the file are total
#population (variable name population), and the number of whites (white),
#American Indians (amerind) and Hispanics (hispanic). A second data set,
#califcty, is derived from calif. It is the county-level summary of the bigger
#file, i.e., each record in califcty corresponds to a county.
####stratified sampling
table(calif$stratum) # take a look at the size of each stratum
##
## 1 2 3 4
## 179 223 291 384
nh <- c(40,50,40,60) # these are the sample sizes we want
calif<-calif[order(calif$stratum),] # sort by stratum
califsample <- calif[stratsrs(calif$stratum,nh),]
califsample[1:10,]
## county population white amerind hispanic stratum
## 576 32 1160 975 108 108 1
## 925 47 351 304 26 14 1
## 275 18 1998 1766 105 170 1
## 51 5 2061 1943 20 116 1
## 61 6 3670 1668 42 2613 1
## 1046 54 1865 1660 80 130 1
## 923 47 136 121 5 3 1
## 1039 54 3388 3138 54 167 1
## 1044 54 4423 4041 66 372 1
## 564 32 37 33 4 0 1
by(califsample[,2:5],califsample$stratum,summary)
## califsample$stratum: 1
## population white amerind hispanic
## Min. : 37.0 Min. : 33.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 302.2 1st Qu.: 263.5 1st Qu.: 9.25 1st Qu.: 24.75
## Median : 1597.5 Median : 1466.5 Median : 28.50 Median : 121.50
## Mean : 2338.6 Mean : 1947.6 Mean : 59.17 Mean : 336.07
## 3rd Qu.: 3080.5 3rd Qu.: 2329.2 3rd Qu.: 62.25 3rd Qu.: 221.00
## Max. :13541.0 Max. :10823.0 Max. :431.00 Max. :2613.00
## --------------------------------------------------------
## califsample$stratum: 2
## population white amerind hispanic
## Min. : 149 Min. : 124.0 Min. : 1.0 Min. : 3.0
## 1st Qu.: 1253 1st Qu.: 934.8 1st Qu.: 22.0 1st Qu.: 192.2
## Median : 5998 Median : 3674.0 Median : 53.0 Median : 803.5
## Mean :11059 Mean : 8071.2 Mean :138.2 Mean : 2705.7
## 3rd Qu.:13051 3rd Qu.:10880.2 3rd Qu.:139.0 3rd Qu.: 2091.0
## Max. :79921 Max. :68756.0 Max. :818.0 Max. :26425.0
## --------------------------------------------------------
## califsample$stratum: 3
## population white amerind hispanic
## Min. : 339 Min. : 140 Min. : 4.0 Min. : 37.0
## 1st Qu.: 1949 1st Qu.: 1500 1st Qu.: 26.0 1st Qu.: 348.5
## Median : 6380 Median : 3502 Median : 59.0 Median : 1052.0
## Mean : 16568 Mean :12123 Mean : 151.6 Mean : 3957.0
## 3rd Qu.: 18124 3rd Qu.:15425 3rd Qu.: 172.5 3rd Qu.: 4478.2
## Max. :100916 Max. :79511 Max. :1173.0 Max. :24573.0
## --------------------------------------------------------
## califsample$stratum: 4
## population white amerind hispanic
## Min. : 748 Min. : 372 Min. : 6.0 Min. : 122.0
## 1st Qu.: 5940 1st Qu.: 3853 1st Qu.: 36.5 1st Qu.: 841.5
## Median : 17537 Median : 10809 Median : 87.0 Median : 3735.5
## Mean : 40774 Mean : 22996 Mean : 321.1 Mean : 14964.9
## 3rd Qu.: 55831 3rd Qu.: 28993 3rd Qu.: 373.0 3rd Qu.: 15839.0
## Max. :337977 Max. :150194 Max. :4013.0 Max. :257097.0
by(calif[,2:5],calif$stratum,summary)
## calif$stratum: 1
## population white amerind hispanic
## Min. : 5.0 Min. : 5 Min. : 0.00 Min. : 0.0
## 1st Qu.: 196.5 1st Qu.: 178 1st Qu.: 4.00 1st Qu.: 11.0
## Median : 1160.0 Median : 862 Median : 24.00 Median : 85.0
## Mean : 2378.4 Mean : 1909 Mean : 58.05 Mean : 427.6
## 3rd Qu.: 2838.5 3rd Qu.: 2436 3rd Qu.: 58.00 3rd Qu.: 266.5
## Max. :34413.0 Max. :20341 Max. :587.00 Max. :18949.0
## --------------------------------------------------------
## calif$stratum: 2
## population white amerind hispanic
## Min. : 96 Min. : 81 Min. : 0.0 Min. : 1
## 1st Qu.: 1722 1st Qu.: 1118 1st Qu.: 19.5 1st Qu.: 189
## Median : 4687 Median : 3098 Median : 53.0 Median : 729
## Mean :10400 Mean : 7518 Mean : 140.8 Mean : 2987
## 3rd Qu.:10533 3rd Qu.: 8465 3rd Qu.: 139.0 3rd Qu.: 2912
## Max. :91565 Max. :71727 Max. :1802.0 Max. :33254
## --------------------------------------------------------
## calif$stratum: 3
## population white amerind hispanic
## Min. : 77 Min. : 66 Min. : 0.0 Min. : 4
## 1st Qu.: 1862 1st Qu.: 1422 1st Qu.: 17.5 1st Qu.: 270
## Median : 5352 Median : 3593 Median : 54.0 Median : 1059
## Mean : 22606 Mean : 13807 Mean : 212.3 Mean : 6383
## 3rd Qu.: 18985 3rd Qu.: 10661 3rd Qu.: 169.0 3rd Qu.: 5389
## Max. :776733 Max. :385728 Max. :6763.0 Max. :170520
## --------------------------------------------------------
## calif$stratum: 4
## population white amerind hispanic
## Min. : 3 Min. : 3 Min. : 0.0 Min. : 0
## 1st Qu.: 8192 1st Qu.: 5292 1st Qu.: 48.0 1st Qu.: 1190
## Median : 22927 Median : 13803 Median : 190.0 Median : 5472
## Mean : 56888 Mean : 31530 Mean : 464.6 Mean : 20085
## 3rd Qu.: 55521 3rd Qu.: 32008 3rd Qu.: 445.8 3rd Qu.: 16603
## Max. :3694820 Max. :1734036 Max. :29412.0 Max. :1719073
####ppswr, pps with replacement
#The function call
#is ppswr(sizes,n), where sizes is a vector of the sizes of the population
#units and n is the sample size.
county1 <- calif[calif$county==1,] # get the units in county 1, there are 20
#places in county 1 with different population size
county1
## county population white amerind hispanic stratum
## 1 1 72259 41148 484 6725 4
## 2 1 16444 10078 64 1312 4
## 3 1 20793 8115 269 6753 4
## 4 1 102743 60797 467 10001 4
## 5 1 57292 40587 336 6984 4
## 6 1 13837 7319 161 5774 4
## 7 1 29973 20793 220 4059 4
## 8 1 6882 3096 34 616 4
## 9 1 9470 5234 53 1433 4
## 10 1 203413 96968 1048 27409 4
## 11 1 140030 60146 1177 47850 4
## 12 1 73345 60070 444 10541 4
## 13 1 42471 22179 273 12145 4
## 14 1 399484 125013 2655 87467 4
## 15 1 10952 8607 12 325 4
## 16 1 63654 51203 210 5011 4
## 17 1 79452 40754 609 15939 4
## 18 1 21898 13865 195 5398 4
## 19 1 1332 1125 13 116 4
## 20 1 66869 20198 356 16020 4
county1sample<-county1[ppswr(county1$population,5),] # select 5 units
county1sample
## county population white amerind hispanic stratum
## 7 1 29973 20793 220 4059 4
## 13 1 42471 22179 273 12145 4
## 20 1 66869 20198 356 16020 4
## 10 1 203413 96968 1048 27409 4
## 4 1 102743 60797 467 10001 4
####ppswor, pps without replacement, sampford and sampfordpi
#The function sampford select a PPS sample without replacement using Sampford???s
#method. The function sampfordpi computes the corresponding inclusion
#pi_i and joint inclusion probabilities pi_{ij}.
#the Sampford ones incorporate a check for units that are
#too big and aborts if such units are found
sampford(county1$popul,5) # what if n is too big?
## Unit 14 is too big.
## Some units are too big. Aborting.
worsample<-county1[sampford(county1$popul,3),]
worsample
## county population white amerind hispanic stratum
## 15 1 10952 8607 12 325 4
## 16 1 63654 51203 210 5011 4
## 4 1 102743 60797 467 10001 4
piij<-sampfordpi(county1$popul,3)
diag(piij) # these are the inclusion probabilities
## [1] 0.151317925 0.034435461 0.043542723 0.215154618 0.119975457
## [6] 0.028976129 0.062766606 0.014411630 0.019831173 0.425968157
## [11] 0.293237507 0.153592123 0.088938729 0.836561396 0.022934637
## [16] 0.133298152 0.166380821 0.045856709 0.002789348 0.140030700
wt<-1/diag(piij) # these are the weights to use for estimation