handout10

library(survey)

## Loading required package: grid

## Loading required package: Matrix

## Loading required package: survival

## 
## Attaching package: 'survey'

## The following object is masked from 'package:graphics':
## 
##     dotchart

library(SDaA)
################################################################################
#####two stage cluster sampling, API data#######################################
################################################################################

# The Academic Performance Index (API) is computed for all California
# schools based on standardized testing of students.
# The data sets contain information for all schools with at least 100 students
# and for various probability samples of the data.
# apiclus2 contains a two-stage cluster sample of schools within districts. 

#apiclus2 is a sample obtained using a two-stage cluster sampling design, 
# 1st stage,  n = 40 SRS districts were selected from N=757 districts,
# 1st stage, fpc1 is a vector of N = 757
# 2nd stage, within selected district i with Mi schools, one or more schools were selected using SRS
# 2nd stage, fpc2 is a vector of total number of schools in each sampled district (i.e., fpc2 = Mi).
# snum: School number, dnum: District number 
# "api00: 2000 API, predicted by the proportions of students learning English (ell), receiving subsidized meals (meals) 
# and having moved to the school within the past year (mobility). 

data(api) #loads several data frames that are samples from the data frame apipop.
#help(api)
apiclus2[1:5,]

##              cds stype            name                      sname snum
## 1 31667796031017     E Alta-Dutch Flat Alta-Dutch Flat Elementary 3269
## 2 55751846054837     E Tenaya Elementa          Tenaya Elementary 5979
## 3 41688746043517     E Panorama Elemen        Panorama Elementary 4958
## 4 41688746043509     M   Lipman Middle              Lipman Middle 4957
## 5 41688746043491     E Brisbane Elemen        Brisbane Elementary 4956
##                      dname dnum     cname cnum flag pcttest api00 api99
## 1     Alta-Dutch Flat Elem   15    Placer   30   NA     100   821   785
## 2 Big Oak Flat-Grvlnd Unif   63  Tuolumne   54   NA     100   773   718
## 3      Brisbane Elementary   83 San Mateo   40   NA      98   600   632
## 4      Brisbane Elementary   83 San Mateo   40   NA     100   740   740
## 5      Brisbane Elementary   83 San Mateo   40   NA      98   716   711
##   target growth sch.wide comp.imp both awards meals ell yr.rnd mobility
## 1      1     36      Yes      Yes  Yes    Yes    27   0     No       14
## 2      4     55      Yes      Yes  Yes    Yes    43   0     No       12
## 3      8    -32       No       No   No     No    33   5     No        9
## 4      3      0       No       No   No     No    11   4     No        8
## 5      4      5      Yes      Yes  Yes    Yes     5   2     No        6
##   acs.k3 acs.46 acs.core pct.resp not.hsg hsg some.col col.grad grad.sch
## 1     17     20       NA       89       4  16       53       21        6
## 2     18     34       NA       98       8  33       37       15        7
## 3     19     29       NA       79       8  28       30       32        1
## 4     NA     30       24       96       5  27       35       27        6
## 5     18     28       NA       98       3  14       22       58        3
##   avg.ed full emer enroll api.stu     pw fpc1 fpc2
## 1   3.07  100    0    152     120 18.925  757    1
## 2   2.79  100    0    312     270 18.925  757    1
## 3   2.90  100    0    173     151 18.925  757    3
## 4   3.03   82   18    201     179 18.925  757    3
## 5   3.44  100    8    147     136 18.925  757    3

dimnames(apiclus2)[2]

## [[1]]
##  [1] "cds"      "stype"    "name"     "sname"    "snum"     "dname"   
##  [7] "dnum"     "cname"    "cnum"     "flag"     "pcttest"  "api00"   
## [13] "api99"    "target"   "growth"   "sch.wide" "comp.imp" "both"    
## [19] "awards"   "meals"    "ell"      "yr.rnd"   "mobility" "acs.k3"  
## [25] "acs.46"   "acs.core" "pct.resp" "not.hsg"  "hsg"      "some.col"
## [31] "col.grad" "grad.sch" "avg.ed"   "full"     "emer"     "enroll"  
## [37] "api.stu"  "pw"       "fpc1"     "fpc2"

dclus2<-svydesign(id=~dnum+snum,fpc=~fpc1+fpc2,data=apiclus2)
summary(dclus2)

## 2 - level Cluster Sampling design
## With (40, 126) clusters.
## svydesign(id = ~dnum + snum, fpc = ~fpc1 + fpc2, data = apiclus2)
## Probabilities:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.003669 0.037740 0.052840 0.042390 0.052840 0.052840 
## Population size (PSUs): 757 
## Data variables:
##  [1] "cds"      "stype"    "name"     "sname"    "snum"     "dname"   
##  [7] "dnum"     "cname"    "cnum"     "flag"     "pcttest"  "api00"   
## [13] "api99"    "target"   "growth"   "sch.wide" "comp.imp" "both"    
## [19] "awards"   "meals"    "ell"      "yr.rnd"   "mobility" "acs.k3"  
## [25] "acs.46"   "acs.core" "pct.resp" "not.hsg"  "hsg"      "some.col"
## [31] "col.grad" "grad.sch" "avg.ed"   "full"     "emer"     "enroll"  
## [37] "api.stu"  "pw"       "fpc1"     "fpc2"

#use ell, meals and mobility to predict api00
summary(svyglm(api00 ~ ell + meals + mobility, design = dclus2))

## 
## Call:
## svyglm(formula = api00 ~ ell + meals + mobility, design = dclus2)
## 
## Survey design:
## svydesign(id = ~dnum + snum, fpc = ~fpc1 + fpc2, data = apiclus2)
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 811.4907    30.2338  26.840   <2e-16 ***
## ell          -2.0592     1.3798  -1.492    0.144    
## meals        -1.7772     1.0830  -1.641    0.110    
## mobility      0.3253     0.6103   0.533    0.597    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 8363.101)
## 
## Number of Fisher Scoring iterations: 2

#A useful property of regression models is that they provide another way 
#to get domain estimates. Suppose we want the mean of api00 for each school type:
summary(svyglm(api00~stype-1, dclus2))

## 
## Call:
## svyglm(formula = api00 ~ stype - 1, dclus2)
## 
## Survey design:
## svydesign(id = ~dnum + snum, fpc = ~fpc1 + fpc2, data = apiclus2)
## 
## Coefficients:
##        Estimate Std. Error t value Pr(>|t|)    
## stypeE   692.81      29.93   23.15   <2e-16 ***
## stypeH   598.34      17.69   33.82   <2e-16 ***
## stypeM   642.35      45.09   14.25   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 17528.44)
## 
## Number of Fisher Scoring iterations: 2

summary(svyglm(api00~stype, dclus2))

## 
## Call:
## svyglm(formula = api00 ~ stype, dclus2)
## 
## Survey design:
## svydesign(id = ~dnum + snum, fpc = ~fpc1 + fpc2, data = apiclus2)
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   692.81      29.93  23.150  < 2e-16 ***
## stypeH        -94.47      28.21  -3.348  0.00188 ** 
## stypeM        -50.46      25.30  -1.994  0.05354 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 17528.44)
## 
## Number of Fisher Scoring iterations: 2

#compared to using svyby
svyby(~api00,~stype,dclus2,svymean, keep.var=TRUE)

##   stype    api00       se
## E     E 692.8104 29.92660
## H     H 598.3407 17.69417
## M     M 642.3520 45.09132