#############dental data, Fitting population-averaged models uing the gls() function in nlme#############
library(reshape2) #to change long format to wide format or change from wide format to long format
library(car)
## Loading required package: carData
library(heplots)  #test equivalence of covariance matrix
#  Function to compute correct and robust standard errors from a gls()
#  model object
library(nlme)
library(Matrix)
library(magic)
## Loading required package: abind
library(lmerTest)
## Loading required package: lme4
## 
## Attaching package: 'lme4'
## The following object is masked from 'package:nlme':
## 
##     lmList
## 
## Attaching package: 'lmerTest'
## The following object is masked from 'package:lme4':
## 
##     lmer
## The following object is masked from 'package:stats':
## 
##     step
ex.data <- read.table("~/Desktop/jenn/teaching/stat579/data/dental.dat",
                      col.names=c("obs","child","age","distance","gender"))
nrow(ex.data)
## [1] 108
head(ex.data)  #1: boys, 0: girls"
##   obs child age distance gender
## 1   1     1   8     21.0      0
## 2   2     1  10     20.0      0
## 3   3     1  12     21.5      0
## 4   4     1  14     23.0      0
## 5   5     2   8     21.0      0
## 6   6     2  10     21.5      0
ex.data2<-ex.data[ex.data$gender==0,]  #dataset with only girls
ex.data3<-ex.data[ex.data$gender==1,]  #dataset with only boys
aa<-aggregate(distance~age+gender, data=ex.data, mean)  #look at the cell means
aa
##   age gender distance
## 1   8      0 21.18182
## 2  10      0 22.22727
## 3  12      0 23.09091
## 4  14      0 24.09091
## 5   8      1 22.87500
## 6  10      1 23.81250
## 7  12      1 25.71875
## 8  14      1 27.46875
#checking correlation matrices
#### From long to wide format
d2 <- dcast(ex.data, child+gender ~ age, value.var = "distance")
d2
##    child gender    8   10   12   14
## 1      1      0 21.0 20.0 21.5 23.0
## 2      2      0 21.0 21.5 24.0 25.5
## 3      3      0 20.5 24.0 24.5 26.0
## 4      4      0 23.5 24.5 25.0 26.5
## 5      5      0 21.5 23.0 22.5 23.5
## 6      6      0 20.0 21.0 21.0 22.5
## 7      7      0 21.5 22.5 23.0 25.0
## 8      8      0 23.0 23.0 23.5 24.0
## 9      9      0 20.0 21.0 22.0 21.5
## 10    10      0 16.5 19.0 19.0 19.5
## 11    11      0 24.5 25.0 28.0 28.0
## 12    12      1 26.0 25.0 29.0 31.0
## 13    13      1 21.5 22.5 23.0 26.5
## 14    14      1 23.0 22.5 24.0 27.5
## 15    15      1 25.5 27.5 26.5 27.0
## 16    16      1 20.0 23.5 22.5 26.0
## 17    17      1 24.5 25.5 27.0 28.5
## 18    18      1 22.0 22.0 24.5 26.5
## 19    19      1 24.0 21.5 24.5 25.5
## 20    20      1 23.0 20.5 31.0 26.0
## 21    21      1 27.5 28.0 31.0 31.5
## 22    22      1 23.0 23.0 23.5 25.0
## 23    23      1 21.5 23.5 24.0 28.0
## 24    24      1 17.0 24.5 26.0 29.5
## 25    25      1 22.5 25.5 25.5 26.0
## 26    26      1 23.0 24.5 26.0 30.0
## 27    27      1 22.0 21.5 23.5 25.0
nrow(d2)
## [1] 27
d2g<-d2[1:11,3:6]  #girls with 4 columns, ages 8,10, 12, 14
d2b<-d2[12:27,3:6] #boys with 4 columns, ages 8,10, 12, 14
nrow(d2g)  #11 girls
## [1] 11
nrow(d2b)   #16 boys
## [1] 16
girls.cov<-cov(d2g)
girls.corr<-cor(d2g)
boys.cov<-cov(d2b)
boys.corr<-cor(d2b)
round(girls.cov,3)
##        8    10    12    14
## 8  4.514 3.355 4.332 4.357
## 10 3.355 3.618 4.027 4.077
## 12 4.332 4.027 5.591 5.466
## 14 4.357 4.077 5.466 5.941
round(boys.cov,3)
##        8    10    12    14
## 8  6.017 2.292 3.629 1.612
## 10 2.292 4.562 2.194 2.810
## 12 3.629 2.194 7.032 3.241
## 14 1.612 2.810 3.241 4.349
##test if the three covariance matrices are equivalent across the groups
tres <- boxM(d2[, 3:6], d2[, "gender"])
tres
## 
##  Box's M-test for Homogeneity of Covariance Matrices
## 
## data:  d2[, 3:6]
## Chi-Sq (approx.) = 17.335, df = 10, p-value = 0.06727
summary(tres)
## Summary for Box's M-test of Equality of Covariance Matrices
## 
## Chi-Sq:   17.33534 
## df:   10 
## p-value: 0.06727 
## 
## log of Covariance determinants:
##         0         1    pooled 
## 0.8947425 5.2957931 4.3826063 
## 
## Eigenvalues:
##            0         1     pooled
## 1 17.9526585 13.668410 15.2972768
## 2  0.8825295  3.914267  2.6815584
## 3  0.5451995  3.219589  2.1590642
## 4  0.2832490  1.158150  0.9038051
## 
## Statistics based on eigenvalues:
##                    0           1     pooled
## product    2.4467057 199.4957821 80.0463895
## sum       19.6636364  21.9604167 21.0417045
## precision  0.1525911   0.6654769  0.4980364
## max       17.9526585  13.6684105 15.2972768
#  Center and scale to create scatterplot matrices using the scale and
#  pairs functions

girls.mean <- apply(d2g,2,mean)
boys.mean <- apply(d2b,2,mean)

girls.sd <- sqrt(diag(girls.cov))
boys.sd <- sqrt(diag(boys.cov))

girls.centscale <- scale(d2g,center=girls.mean,scale=girls.sd)
boys.centscale <- scale(d2b,center=boys.mean,scale=boys.sd)

#  Now create the scatterplots

agelabs <- c("Age 8", "Age 10", "Age 12", "Age 14")

pdf("girlsscatter.pdf",width=8)
pairs(girls.centscale,label=agelabs,oma=c(5,5,5,5),main="Girls")
graphics.off()

pdf("boysscatter.pdf",width=8)
pairs(boys.centscale,label=agelabs,oma=c(5,5,5,5),main="Boys")
graphics.off()

#interaction plots
par(mfrow=c(1,1),oma=c(0,0,3,0),pch=42,font.sub=3,adj=0.5)
interaction.plot(ex.data$age,ex.data$gender,ex.data$distance,
  trace.label="gender",xlab="age",ylab="distance",col=c("black","red"))
mtext(side=3,outer=T,line=0,cex=1.3,"Age Distance Study")
par(adj=1)
title(sub="interact.ps")
# savePlot(filename="dentalinteraction",
#          type=c("png"),
#          device=dev.cur())

par(mfrow=c(1,1),oma=c(0,0,3,0),pch=42,font.sub=3,adj=0.5)
interaction.plot(ex.data2$age,ex.data2$child,ex.data2$distance,
                 trace.label="girls",xlab="age",ylab="distance",col=1:11)
mtext(side=3,outer=T,line=0,cex=1.3,"Age Distance Study Female")
par(adj=1)
title(sub="interact2.ps")
# savePlot(filename="dentalinteraction2",
#          type=c("png"),
#          device=dev.cur())

par(mfrow=c(1,1),oma=c(0,0,3,0),pch=42,font.sub=3,adj=0.5)
interaction.plot(ex.data3$age,ex.data3$child,ex.data3$distance,
                 trace.label="boys",xlab="age",ylab="distance",col=1:16)
mtext(side=3,outer=T,line=0,cex=1.3,"Age Distance Study Male")
par(adj=1)
title(sub="interact3.ps")
# savePlot(filename="dentalinteraction3",
#          type=c("png"),
#          device=dev.cur())



#model fitting
#  Create factor variables for use in gls()

ex.data$child <- factor(ex.data$child)
ex.data$gender <- factor(ex.data$gender)

#  First do OLS fit ignoring correlation
#  Assume group difference paramatrization
fit.ls.gdp <- lm(distance ~ gender + age+ age:gender,data=ex.data)
summary(fit.ls.gdp)
## 
## Call:
## lm(formula = distance ~ gender + age + age:gender, data = ex.data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.6156 -1.3219 -0.1682  1.3299  5.2469 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  17.3727     1.7080  10.171  < 2e-16 ***
## gender1      -1.0321     2.2188  -0.465  0.64279    
## age           0.4795     0.1522   3.152  0.00212 ** 
## gender1:age   0.3048     0.1977   1.542  0.12608    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.257 on 104 degrees of freedom
## Multiple R-squared:  0.4227, Adjusted R-squared:  0.4061 
## F-statistic: 25.39 on 3 and 104 DF,  p-value: 2.108e-12
anova(fit.ls.gdp)
## Analysis of Variance Table
## 
## Response: distance
##             Df Sum Sq Mean Sq F value    Pr(>F)    
## gender       1 140.46 140.465 27.5756 8.054e-07 ***
## age          1 235.36 235.356 46.2042 6.884e-10 ***
## gender:age   1  12.11  12.114  2.3782    0.1261    
## Residuals  104 529.76   5.094                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#  Assume separate intercept and slope by gender
fit.ls.sgp <- lm(distance ~ -1 + gender+ age:gender,data=ex.data)
summary(fit.ls.sgp)
## 
## Call:
## lm(formula = distance ~ -1 + gender + age:gender, data = ex.data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.6156 -1.3219 -0.1682  1.3299  5.2469 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## gender0      17.3727     1.7080  10.171  < 2e-16 ***
## gender1      16.3406     1.4162  11.538  < 2e-16 ***
## gender0:age   0.4795     0.1522   3.152  0.00212 ** 
## gender1:age   0.7844     0.1262   6.217 1.07e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.257 on 104 degrees of freedom
## Multiple R-squared:  0.9916, Adjusted R-squared:  0.9913 
## F-statistic:  3078 on 4 and 104 DF,  p-value: < 2.2e-16
anova(fit.ls.sgp)
## Analysis of Variance Table
## 
## Response: distance
##             Df Sum Sq Mean Sq  F value    Pr(>F)    
## gender       2  62469 31234.3 6131.797 < 2.2e-16 ***
## gender:age   2    247   123.7   24.291 2.205e-09 ***
## Residuals  104    530     5.1                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# in the following,
#  Assume separate intercept and slope by gender and fit different
#  covariance structures using ML; REML is the default, so we have to 
#  add option method="ML"


#gls: generalized least squares
#  u <- model object from fit, m <- total # individuals
#robust.cov is to calculate the correct ses
robust.cov <- function(u,m){
  form <-  formula(u)
  mf <- model.frame(form,getData(u))
  Xmat <- model.matrix(form,mf)
  Vlist <-  as.list(1:m)
  for (i in 1:m){
    Vlist[[i]] <- getVarCov(u,individual=i,type="marginal")
  }
  V <- Reduce(adiag,Vlist)
  Vinv <- solve(V)
  Sig.model <- solve(t(Xmat)%*%Vinv%*%Xmat)
  resid <- diag(residuals(u,type="response"))
  ones.list <- lapply(Vlist,FUN=function(u){matrix(1,nrow(u),ncol(u))})
  Ones <- Reduce(adiag,ones.list)
  meat <- t(Xmat)%*%Vinv%*%resid%*%Ones%*%resid%*%Vinv%*%Xmat
  Sig.robust <- Sig.model%*%meat%*%Sig.model
  se.robust <- sqrt(diag(Sig.robust))
  se.model <- sqrt(diag(Sig.model))
  return(list(Sig.model=Sig.model,se.model=se.model,Sig.robust=Sig.robust,se.robust=se.robust))
}

#  (a) Common unstructured correlation with variances changing over time
#  for both genders -- we extract components of the fit.  Note that
#  gls() defines BIC dfferently from SAS (it uses the total number of
#  observations N while SAS MIXED uses the total number of individuals m                                      
#  The weights statement makes the variances on the diagonal differ over
#  time - the default with no weight statement is that they are the same
#  for all times

fit.un <- gls(distance ~ -1 + gender + age:gender,data=ex.data,
                 correlation=corSymm(form=~1|child),
                 weights=varIdent(form=~1|age),method="ML")
##varIdent: constant variance(s), generally used to allow different variances according 
#to the levels of a classification factor.

beta.un <- coef(fit.un)
sebeta.un <- summary(fit.un)$tTable[,"Std.Error"]
V.un <- getVarCov(fit.un)  
Gamma.un <- cov2cor(V.un)
vars.un <- diag(V.un)
#  Call robust.cov to get corrected model-based SEs and compare to
#  those from gls().  Compare the results to those from SAS proc
#  mixed, which DOES compute the correct standard errors

u<-fit.un
m<-27
robust.un <- robust.cov(u,m)
sebeta.un.corrected <- robust.un$se.model
#compare 
rbind(beta.un,sebeta.un)
##             gender0    gender1 gender0:age gender1:age
## beta.un   17.425368 15.8422796  0.47636477  0.82680439
## sebeta.un  1.149879  0.9534291  0.09723354  0.08062179
sebeta.un.corrected
##     gender0     gender1 gender0:age gender1:age 
##  1.12838378  0.93560641  0.09541593  0.07911471
#marginal covariance and correlation matrices
V.un
## Marginal variance covariance matrix
##        [,1]   [,2]   [,3]   [,4]
## [1,] 5.1192 2.4409 3.6105 2.5222
## [2,] 2.4409 3.9279 2.7175 3.0624
## [3,] 3.6105 2.7175 5.9798 3.8235
## [4,] 2.5222 3.0624 3.8235 4.6180
##   Standard Deviations: 2.2626 1.9819 2.4454 2.149
Gamma.un
## Marginal variance covariance matrix
##         [,1]    [,2]    [,3]    [,4]
## [1,] 1.00000 0.54434 0.65256 0.51875
## [2,] 0.54434 1.00000 0.56072 0.71903
## [3,] 0.65256 0.56072 1.00000 0.72760
## [4,] 0.51875 0.71903 0.72760 1.00000
##   Standard Deviations: 1 1 1 1
##  In the rest of these fits, we do not calculate the corrected
##  standard errors.  We can calculate them when we settle on a final model

#  (b1) Compound symmetry 
fit.cs <- gls(distance ~ -1 + gender + age:gender,data=ex.data,
                 correlation=corCompSymm(form = ~ 1 | child),method="ML")
beta.cs <- coef(fit.cs)
sebeta.cs <- summary(fit.cs)$tTable[,"Std.Error"]
V.cs <- getVarCov(fit.cs)  
Gamma.cs <- cov2cor(V.cs)     
vars.cs <- diag(V.cs)
rbind(beta.cs,sebeta.cs)
##            gender0   gender1 gender0:age gender1:age
## beta.cs   17.37273 16.340625  0.47954545  0.78437500
## sebeta.cs  1.18365  0.981431  0.09406714  0.07799635
V.cs
## Marginal variance covariance matrix
##        [,1]   [,2]   [,3]   [,4]
## [1,] 4.9052 3.0306 3.0306 3.0306
## [2,] 3.0306 4.9052 3.0306 3.0306
## [3,] 3.0306 3.0306 4.9052 3.0306
## [4,] 3.0306 3.0306 3.0306 4.9052
##   Standard Deviations: 2.2148 2.2148 2.2148 2.2148
Gamma.cs
## Marginal variance covariance matrix
##         [,1]    [,2]    [,3]    [,4]
## [1,] 1.00000 0.61783 0.61783 0.61783
## [2,] 0.61783 1.00000 0.61783 0.61783
## [3,] 0.61783 0.61783 1.00000 0.61783
## [4,] 0.61783 0.61783 0.61783 1.00000
##   Standard Deviations: 1 1 1 1
#  (b2) Compound symmetry with variance different by gender
fit.cs2 <- gls(distance ~ -1 + gender + gender:age,data=ex.data,correlation=corCompSymm(form = ~ 1 | child),
                  weights = varIdent(form = ~ 1 | gender),method="ML")
beta.cs2 <- coef(fit.cs2)
sebeta.cs2 <- summary(fit.cs2)$tTable[,"Std.Error"]
V.cs2.girl <-  getVarCov(fit.cs2, individual=1)
V.cs2.boy <-  getVarCov(fit.cs2, individual=12)
Gamma.cs2 <- cov2cor(V.cs2.girl)
rbind(beta.cs2,sebeta.cs2)
##               gender0   gender1 gender0:age gender1:age
## beta.cs2   17.3727273 16.340625  0.47954545  0.78437500
## sebeta.cs2  0.8175862  1.163776  0.06125437  0.08719126
V.cs2.girl
## Marginal variance covariance matrix
##        [,1]   [,2]   [,3]   [,4]
## [1,] 2.8677 2.0728 2.0728 2.0728
## [2,] 2.0728 2.8677 2.0728 2.0728
## [3,] 2.0728 2.0728 2.8677 2.0728
## [4,] 2.0728 2.0728 2.0728 2.8677
##   Standard Deviations: 1.6934 1.6934 1.6934 1.6934
V.cs2.boy
## Marginal variance covariance matrix
##        [,1]   [,2]   [,3]   [,4]
## [1,] 8.4514 6.1088 6.1088 6.1088
## [2,] 6.1088 8.4514 6.1088 6.1088
## [3,] 6.1088 6.1088 8.4514 6.1088
## [4,] 6.1088 6.1088 6.1088 8.4514
##   Standard Deviations: 2.9071 2.9071 2.9071 2.9071
Gamma.cs2
## Marginal variance covariance matrix
##         [,1]    [,2]    [,3]    [,4]
## [1,] 1.00000 0.72281 0.72281 0.72281
## [2,] 0.72281 1.00000 0.72281 0.72281
## [3,] 0.72281 0.72281 1.00000 0.72281
## [4,] 0.72281 0.72281 0.72281 1.00000
##   Standard Deviations: 1 1 1 1
#  (c) AR(1)

fit.ar1 <- gls(distance ~ -1 + gender + age:gender,data=ex.data,
                  correlation=corAR1(form = ~ 1 | child),method="ML")
beta.ar1 <- coef(fit.ar1)
sebeta.ar1 <- summary(fit.ar1)$tTable[,"Std.Error"]
V.ar1 <- getVarCov(fit.ar1)  #  or corMatrix(dental.un$modelStruct$corStruct)[[1]]
Gamma.ar1 <- cov2cor(V.ar1)
vars.ar1 <- diag(V.ar1)

rbind(beta.ar1,sebeta.ar1)
##              gender0   gender1 gender0:age gender1:age
## beta.ar1   17.321720 16.591996   0.4837322   0.7695718
## sebeta.ar1  1.634509  1.355263   0.1409898   0.1169026
V.ar1
## Marginal variance covariance matrix
##        [,1]   [,2]   [,3]   [,4]
## [1,] 4.8908 2.9693 1.8027 1.0944
## [2,] 2.9693 4.8908 2.9693 1.8027
## [3,] 1.8027 2.9693 4.8908 2.9693
## [4,] 1.0944 1.8027 2.9693 4.8908
##   Standard Deviations: 2.2115 2.2115 2.2115 2.2115
Gamma.ar1
## Marginal variance covariance matrix
##         [,1]    [,2]    [,3]    [,4]
## [1,] 1.00000 0.60712 0.36859 0.22378
## [2,] 0.60712 1.00000 0.60712 0.36859
## [3,] 0.36859 0.60712 1.00000 0.60712
## [4,] 0.22378 0.36859 0.60712 1.00000
##   Standard Deviations: 1 1 1 1
#########################################################################
#
#   SAS and R use different conventions to calculate AIC and BIC.  We
#   have used ML here, in which case AIC is the same but BIC differs as
#   noted above.  If we'd used REML, both AIC and BIC values are calculated
#   differently by SAS and R using different conventions regarding the
#   number of observations and number of parameters, so are not comparable, 
#   but can be compared within a single implementation (but not across
#   SAS and R).
#
#   We use the anova() function to print out these values for the above
#   models.  Both AIC and BIC support the common compound symmetric correlation
#   model with different variance for each gender (constant across time),
#   This is as close as the models we can fit here with 
#   the generic gls() function can get to the model with different
#   compound correlation and variance for each gender we fit in SAS,
#   which was preferred among those fit in that program.
#
#   It seems that the data are suggesting that the correlation pattern
#   is close to compound symmetric, with heterogeneity of some sort 
#   between genders.
#
#########################################################################

anova(fit.un,fit.cs,fit.cs2,fit.ar1)
##         Model df      AIC      BIC    logLik   Test  L.Ratio p-value
## fit.un      1 14 447.4770 485.0269 -209.7385                        
## fit.cs      2  6 440.6391 456.7318 -214.3195 1 vs 2  9.16201  0.3288
## fit.cs2     3  7 430.6521 449.4270 -208.3261 2 vs 3 11.98695  0.0005
## fit.ar1     4  6 452.6810 468.7738 -220.3405 3 vs 4 24.02890  <.0001
#seems model fit.cs2, compound symmetry structure with differenct covariance
#matrix for different genders is the choice
#consider reduced model under this assumption but with same slope for 
#both gender
reduced <- gls(distance ~ -1 + gender + age,data=ex.data,correlation=corCompSymm(form = ~ 1 | child),
               weights = varIdent(form = ~ 1 | gender),method="ML")

anova(fit.cs2,reduced)  #stay with fit.cs2 with differenct covariance
##         Model df      AIC      BIC    logLik   Test  L.Ratio p-value
## fit.cs2     1  7 430.6521 449.4270 -208.3261                        
## reduced     2  6 436.7324 452.8252 -212.3662 1 vs 2 8.080332  0.0045
#matrix for different genders

summary(fit.cs2)
## Generalized least squares fit by maximum likelihood
##   Model: distance ~ -1 + gender + gender:age 
##   Data: ex.data 
##        AIC     BIC    logLik
##   430.6521 449.427 -208.3261
## 
## Correlation Structure: Compound symmetry
##  Formula: ~1 | child 
##  Parameter estimate(s):
##       Rho 
## 0.7228109 
## Variance function:
##  Structure: Different standard deviations per stratum
##  Formula: ~1 | gender 
##  Parameter estimates:
##       0       1 
## 1.00000 1.71672 
## 
## Coefficients:
##                 Value Std.Error   t-value p-value
## gender0     17.372727 0.8175862 21.248802       0
## gender1     16.340625 1.1637761 14.041038       0
## gender0:age  0.479545 0.0612544  7.828755       0
## gender1:age  0.784375 0.0871913  8.996028       0
## 
##  Correlation: 
##             gendr0 gendr1 gndr0:
## gender1      0.000              
## gender0:age -0.824  0.000       
## gender1:age  0.000 -0.824  0.000
## 
## Standardized residuals:
##         Min          Q1         Med          Q3         Max 
## -2.78081373 -0.63039864 -0.08111374  0.52140257  2.87744432 
## 
## Residual standard error: 1.693422 
## Degrees of freedom: 108 total; 104 residual
anova(fit.cs2)
## Denom. DF: 104 
##            numDF  F-value p-value
## gender         2 1913.362  <.0001
## gender:age     2   71.109  <.0001
#  Get the corrected standard errors
u <- fit.cs2
robust.fit.cs2<- robust.cov(u,m)
se.fit.cs2.corrected <- robust.fit.cs2$se.model
se.fit.cs2.corrected
##     gender0     gender1 gender0:age gender1:age 
##  0.80230287  1.14202137  0.06010933  0.08556137