#### Example: Comparison of Fats fat <- read.table(text=" Row fat1 fat2 fat3 fat4 1 164 178 175 155 2 172 191 186 166 3 168 197 178 149 4 177 182 171 164 5 190 185 163 170 6 176 177 176 168 ", header=TRUE) fat #### From wide to long format library(reshape2) fat.long <- melt(fat, # id.vars: ID variables # all variables to keep but not split apart on id.vars=c("Row"), # measure.vars: The source columns # (if unspecified then all other variables are measure.vars) measure.vars = c("fat1", "fat2", "fat3", "fat4"), # variable.name: Name of the destination column identifying each # original column that the measurement came from variable.name = "type", # value.name: column name for values in table value.name = "amount" ) ## naming variables manually, the variable.name and value.name not working 11/2012 #names(fat.long) <- c("Row", "type", "amount") fat.long # or as simple as: # melt(fat, "Row") #### From long to wide format fat.wide <- dcast(fat.long, Row ~ type, value.var = "amount") fat.wide #### Back to ANOVA # Calculate the mean, sd, n, and se for the four fats # The plyr package is an advanced way to apply a function to subsets of data # "Tools for splitting, applying and combining data" library(plyr) # ddply "dd" means the input and output are both data.frames fat.summary <- ddply(fat.long, "type", function(X) { data.frame( m = mean(X$amount), s = sd(X$amount), n = length(X$amount) ) } ) # standard errors fat.summary$se <- fat.summary$s/sqrt(fat.summary$n) # individual confidence limits fat.summary$ci.l <- fat.summary$m - qt(1-.05/2, df=fat.summary$n-1) * fat.summary$se fat.summary$ci.u <- fat.summary$m + qt(1-.05/2, df=fat.summary$n-1) * fat.summary$se fat.summary # Plot the data boxplot(amount~type,data=fat.long) fit.f <- aov(amount ~ type, data = fat.long) summary(fit.f) fit.f #### Multiple Comparisons # all pairwise comparisons among levels of fat # Fisher's LSD (FSD) uses "none" pairwise.t.test(fat.long$amount, fat.long$type, pool.sd = TRUE, p.adjust.method = "none") # Bonferroni 95% Individual p-values # All Pairwise Comparisons among Levels of fat pairwise.t.test(fat.long$amount, fat.long$type, pool.sd = TRUE, p.adjust.method = "bonf") #### Example from Koopmans: glabella facial tissue thickness glabella <- read.table(text=" Row cauc afam naaa 1 5.75 6.00 8.00 2 5.50 6.25 7.00 3 6.75 6.75 6.00 4 5.75 7.00 6.25 5 5.00 7.25 5.50 6 5.75 6.75 4.00 7 5.75 8.00 5.00 8 7.75 6.50 6.00 9 5.75 7.50 7.25 10 5.25 6.25 6.00 11 4.50 5.00 6.00 12 6.25 5.75 4.25 13 NA 5.00 4.75 14 NA NA 6.00 ", header=TRUE) glabella.long <- melt(glabella, id.vars=c("Row"), variable.name = "pop", value.name = "thickness", # remove NAs na.rm = TRUE ) # naming variables manually, the variable.name and value.name not working 11/2012 names(glabella.long) <- c("Row", "pop", "thickness") # another way to remove NAs: #glabella.long <- subset(glabella.long, !is.na(thickness)) # Plot the data using ggplot library(ggplot2) p <- ggplot(glabella.long, aes(x = pop, y = thickness)) # plot a reference line for the global mean (assuming no groups) p <- p + geom_hline(yintercept = mean(glabella.long$thickness), colour = "black", linetype = "dashed", size = 0.3, alpha = 0.5) # boxplot, size=.75 to stand out behind CI p <- p + geom_boxplot(size = 0.75, alpha = 0.5) # points for observed data p <- p + geom_point(position = position_jitter(w = 0.05, h = 0), alpha = 0.5) # diamond at mean for each group p <- p + stat_summary(fun.y = mean, geom = "point", shape = 18, size = 6, colour = "red", alpha = 0.8) # confidence limits based on normal distribution p <- p + stat_summary(fun.data = "mean_cl_normal", geom = "errorbar", width = .2, colour = "red", alpha = 0.8) p <- p + labs(title = "Glabella thickness") + ylab("thickness (mm)") print(p) glabella.summary <- ddply(glabella.long, "pop", function(X) { data.frame( m = mean(X$thickness), s = sd(X$thickness), n = length(X$thickness) ) } ) glabella.summary fit.g <- aov(thickness ~ pop, data = glabella.long) summary(fit.g) fit.g # Bonferroni 95% Individual p-values # All Pairwise Comparisons among Levels of glabella pairwise.t.test(glabella.long$thickness, glabella.long$pop, pool.sd = TRUE, p.adjust.method = "bonf") #### Tukey's honest significant difference method (HSD) ## Fat # Tukey 95% Individual p-values # All Pairwise Comparisons among Levels of fat TukeyHSD(fit.f) ## Glabella # Tukey 95% Individual p-values # All Pairwise Comparisons among Levels of pop TukeyHSD(fit.g) #### false discovery rate (FDR) ## Fat # FDR pairwise.t.test(fat.long$amount, fat.long$type, pool.sd = TRUE, p.adjust.method = "BH") ## Glabella # FDR pairwise.t.test(glabella.long$thickness, glabella.long$pop, pool.sd = TRUE, p.adjust.method = "BH") #### Checking Assumptions in ANOVA Problems # plot of data par(mfrow=c(2,1)) # Histogram overlaid with kernel density curve hist(fit.g$residuals, freq = FALSE, breaks = 20) points(density(fit.g$residuals), type = "l") rug(fit.g$residuals) # boxplot boxplot(fit.g$residuals, horizontal=TRUE) # QQ plot par(mfrow=c(1,1)) library(car) qqPlot(fit.g$residuals, las = 1, id.n = 8, id.cex = 1, lwd = 1, main="QQ Plot") shapiro.test(fit.g$residuals) library(nortest) ad.test(fit.g$residuals) # lillie.test(fit.g$residuals) cvm.test(fit.g$residuals) ## Test equal variance # Barlett assumes populations are normal bartlett.test(thickness ~ pop, data = glabella.long) # Levene does not assume normality, requires car package library(car) leveneTest(thickness ~ pop, data = glabella.long) # Fligner is a nonparametric test fligner.test(thickness ~ pop, data = glabella.long) #### Example from the Child Health and Development Study (CHDS) # description at http://statacumen.com/teach/ADA1/ADA1_notes_05-CHDS_desc.txt # read data from website chds <- read.csv("http://statacumen.com/teach/ADA1/ADA1_notes_05-CHDS.csv") chds$smoke <- rep(NA, nrow(chds)); # no cigs chds[(chds$m_smok == 0), "smoke"] <- "0 cigs"; # less than 1 pack (20 cigs = 1 pack) chds[(chds$m_smok > 0) & (chds$m_smok < 20),"smoke"] <- "1-19 cigs"; # at least 1 pack (20 cigs = 1 pack) chds[(chds$m_smok >= 20),"smoke"] <- "20+ cigs"; chds$smoke <- factor(chds$smoke) # histogram using ggplot p1 <- ggplot(chds, aes(x = c_bwt)) p1 <- p1 + geom_histogram(binwidth = .4) p1 <- p1 + geom_rug() p1 <- p1 + facet_grid(smoke ~ .) p1 <- p1 + labs(title = "Child birthweight vs maternal smoking") + xlab("child birthweight (lb)") #print(p1) p2 <- ggplot(chds, aes(x = c_bwt, fill=smoke)) p2 <- p2 + geom_histogram(binwidth = .4, alpha = 1/3, position="identity") p2 <- p2 + geom_rug(aes(colour = smoke), alpha = 1/3) p2 <- p2 + labs(title = "Child birthweight vs maternal smoking") + xlab("child birthweight (lb)") #print(p2) library(gridExtra) grid.arrange(grobs = list(p1, p2), ncol=1) # Plot the data using ggplot library(ggplot2) p <- ggplot(chds, aes(x = smoke, y = c_bwt)) # plot a reference line for the global mean (assuming no groups) p <- p + geom_hline(yintercept = mean(chds$c_bwt), colour = "black", linetype = "dashed", size = 0.3, alpha = 0.5) # boxplot, size=.75 to stand out behind CI p <- p + geom_boxplot(size = 0.75, alpha = 0.5) # points for observed data p <- p + geom_point(position = position_jitter(w = 0.05, h = 0), alpha = 0.2) # diamond at mean for each group p <- p + stat_summary(fun.y = mean, geom = "point", shape = 18, size = 4, colour = "red", alpha = 0.8) # confidence limits based on normal distribution p <- p + stat_summary(fun.data = "mean_cl_normal", geom = "errorbar", width = .2, colour = "red", alpha = 0.8) p <- p + labs(title = "Child birthweight vs maternal smoking") + ylab("child birthweight (lb)") print(p) library(car) par(mfrow=c(1,3)) qqPlot(subset(chds, smoke == "0 cigs" )$c_bwt, las = 1, id.n = 0, id.cex = 1, lwd = 1, main="QQ Plot, 0 cigs") qqPlot(subset(chds, smoke == "1-19 cigs")$c_bwt, las = 1, id.n = 0, id.cex = 1, lwd = 1, main="QQ Plot, 1-19 cigs") qqPlot(subset(chds, smoke == "20+ cigs" )$c_bwt, las = 1, id.n = 0, id.cex = 1, lwd = 1, main="QQ Plot, 20+ cigs") library(nortest) # 0 cigs -------------------- shapiro.test(subset(chds, smoke == "0 cigs" )$c_bwt) ad.test( subset(chds, smoke == "0 cigs" )$c_bwt) cvm.test( subset(chds, smoke == "0 cigs" )$c_bwt) # 1-19 cigs -------------------- shapiro.test(subset(chds, smoke == "1-19 cigs")$c_bwt) ad.test( subset(chds, smoke == "1-19 cigs")$c_bwt) cvm.test( subset(chds, smoke == "1-19 cigs")$c_bwt) # 20+ cigs -------------------- shapiro.test(subset(chds, smoke == "20+ cigs" )$c_bwt) ad.test( subset(chds, smoke == "20+ cigs" )$c_bwt) cvm.test( subset(chds, smoke == "20+ cigs" )$c_bwt) fit.c <- aov(c_bwt ~ smoke, data = chds) # plot of data par(mfrow=c(3,1)) # Histogram overlaid with kernel density curve hist(fit.c$residuals, freq = FALSE, breaks = 20) points(density(fit.c$residuals), type = "l") rug(fit.c$residuals) # violin plot library(vioplot) vioplot(fit.c$residuals, horizontal=TRUE, col="gray") # boxplot boxplot(fit.c$residuals, horizontal=TRUE) # QQ plot par(mfrow=c(1,1)) library(car) qqPlot(fit.c$residuals, las = 1, id.n = 0, id.cex = 1, lwd = 1, main="QQ Plot") shapiro.test(fit.c$residuals) library(nortest) ad.test(fit.c$residuals) cvm.test(fit.c$residuals) # calculate summaries chds.summary <- ddply(chds, "smoke", function(X) { data.frame( m = mean(X$c_bwt), s = sd(X$c_bwt), n = length(X$c_bwt) ) } ) chds.summary ## Test equal variance # assumes populations are normal bartlett.test(c_bwt ~ smoke, data = chds) # does not assume normality, requires car package library(car) leveneTest(c_bwt ~ smoke, data = chds) # nonparametric test fligner.test(c_bwt ~ smoke, data = chds) summary(fit.c) fit.c ## CHDS # Tukey 95% Individual p-values TukeyHSD(fit.c) library(lsmeans) #tukey comparison library(multcompView) #tukey comparison comp1<-lsmeans(fit.c, "smoke",adjust="tukey") cld(comp1, alpha=.05,Letters=letters)