#### Example: Log-linear population growth

pop <- read.table(text="
Year Pop_M
   1  170
 400  190
 800  220
1200  360
1600  545
1800  900
1850 1200
1900 1625
1950 2500
1975 3900
2000 6080
2012 7000
", header=TRUE)
pop$Pop <- 1e6 * pop$Pop_M   # Pop_M is population in millions; convert to persons
pop$PopL10 <- log10(pop$Pop)

# calculate the residuals from a simple linear regression
lm.fit <- lm(PopL10 ~ Year, data = pop)
# include those residuals in the pop table
pop$Res <- residuals(lm.fit)
pop$R10 <- 10^residuals(lm.fit)  # residuals on the original scale

library(xtable)
xtab.out <- xtable(pop, digits=c(0,0,0,0,3,3,3))
print(xtab.out, floating=FALSE, math.style.negative=TRUE)

library(ggplot2)
p1 <- ggplot(pop, aes(x = Year, y = Pop))
p1 <- p1 + geom_point()
#print(p1)

p2 <- ggplot(pop, aes(x = Year, y = PopL10))
p2 <- p2 + geom_point()
p2 <- p2 + geom_smooth(method = lm, se = FALSE)
#print(p2)

library(gridExtra)
grid.arrange(grobs = list(p1, p2), nrow=1
           , top = "Log-linear transformation: world population")

# http://www.ncbi.nlm.nih.gov/pmc/articles/PMC153045/
# Supp:
# http://www.ncbi.nlm.nih.gov/pmc/articles/PMC153045/bin/pnas_0436428100_index.html
# Supporting information for White and Seymour (2003)
# Proc. Natl. Acad. Sci. USA, 10.1073/pnas.0436428100
library(gdata)
fn <- "data/ADA1_notes_08_data_log-logScaling_BodyMassMetabolicRate_2003_WhiteSeymour.xlsx"
bm.bmr <- read.xls(fn, skip = 4, stringsAsFactors = TRUE)
bm.bmr$Log10BodyMass    <- log10(bm.bmr$BodyMass)
bm.bmr$Log10BaseMetRate <- log10(bm.bmr$BaseMetRate)
# remove a very strange group
bm.bmr <- subset(bm.bmr, !(Group == "Artiodactyla 7"))
str(bm.bmr)

# log-log scale linear regression
lm.fit <- lm(Log10BaseMetRate ~ Log10BodyMass, data = bm.bmr)
# coefficients for regression line
coef(lm.fit)

library(ggplot2)
p1 <- ggplot(subset(bm.bmr, (Genus == "")), aes(x = BodyMass, y = BaseMetRate))
p1 <- p1 + geom_point(data = subset(bm.bmr, !(Genus == "")), aes(colour = Group), alpha = 0.2)
p1 <- p1 + geom_point(size = 3)
# Using a custom function for the fitted power law on the original scale
f.org.scale <- function(BodyMass) {
  10^coef(lm.fit)[1] * BodyMass ^ coef(lm.fit)[2]
}
p1 <- p1 + stat_function(fun = f.org.scale, size = 1)
p1 <- p1 + labs(title = paste("BaseMetRate = ", signif(10^coef(lm.fit)[1], 3), " * ",
                              "BodyMass ^ ", signif(coef(lm.fit)[2], 3), sep = ""))
p1 <- p1 + scale_y_continuous(limits=c(0, 1300))
p1 <- p1 + scale_x_continuous(limits=c(0, 5000))
#print(p1)

p2 <- ggplot(subset(bm.bmr, (Genus == "")), aes(x = Log10BodyMass, y = Log10BaseMetRate))
p2 <- p2 + geom_point(data = subset(bm.bmr, !(Genus == "")), aes(colour = Group), alpha = 0.3)
p2 <- p2 + geom_point(size = 3)
p2 <- p2 + geom_smooth(method = lm, se = FALSE, fullrange = TRUE, size = 1, colour = "black")
p2 <- p2 + labs(title = paste("log10(BaseMetRate) = ", signif(coef(lm.fit)[1], 3), " + ",
                              signif(coef(lm.fit)[2], 3), " log10(BodyMass)", sep = ""))
p2 <- p2 + scale_y_continuous(limits=c(NA, log10(1300)))
p2 <- p2 + scale_x_continuous(limits=c(NA, log10(5000)))
#print(p2)

p3 <- ggplot(subset(bm.bmr, (Genus == "")), aes(x = BodyMass, y = BaseMetRate))
p3 <- p3 + geom_point(data = subset(bm.bmr, !(Genus == "")), aes(colour = Group), alpha = 0.3)
p3 <- p3 + geom_point(size = 3)
p3 <- p3 + geom_smooth(method = lm, se = FALSE, fullrange = TRUE, size = 1, colour = "black")
p3 <- p3 + labs(title = paste("log10(BaseMetRate) = ", signif(coef(lm.fit)[1], 3), " + ",
                              signif(coef(lm.fit)[2], 3), " log10(BodyMass)", sep = ""))
p3 <- p3 + scale_y_log10()  #limits=c(NA, log10(1300)))
p3 <- p3 + scale_x_log10()  #limits=c(NA, log10(5000)))
#print(p3)

library(gridExtra)
grid.arrange(grobs = list(p1, p2, p3), ncol=1
           , top = "Log-log transformation: metabolic rates")
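# Added check (not in the original notes): the fitted power law on the original
# scale and the back-transformed prediction from the log10-log10 fit should
# agree; a body mass of 100 is an arbitrary illustrative value.
c(f.org.scale(100),
  10^predict(lm.fit, data.frame(Log10BodyMass = log10(100))))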
pred.bm.bmr <- data.frame(BodyMass = 5 * c(1, 10, 100, 1000))
pred.bm.bmr$Log10BodyMass    <- log10(pred.bm.bmr$BodyMass)
pred.bm.bmr$Log10BaseMetRate <- predict(lm.fit, pred.bm.bmr)
pred.bm.bmr$BaseMetRate      <- 10^pred.bm.bmr$Log10BaseMetRate
tab.pred <- subset(pred.bm.bmr, select = c("BodyMass", "BaseMetRate"))

library(xtable)
xtab.out <- xtable(tab.pred, digits=c(0,0,2))
print(xtab.out, floating=FALSE, math.style.negative=TRUE)

#### Example: Blood loss

thyroid <- read.table(text="
weight time blood_loss
  44.3  105        503
  40.6   80        490
  69.0   86        471
  43.7  112        505
  50.3  109        482
  50.2  100        490
  35.4   96        513
  52.2  120        464
", header=TRUE)
# show the structure of the data.frame
str(thyroid)
# display the data.frame
#thyroid

library(xtable)
xtab.out <- xtable(thyroid, digits=1)
print(xtab.out, floating=FALSE, math.style.negative=TRUE)

p.corr <- cor(thyroid)
#p.corr
# initialize pvalue table with dim names
p.corr.pval <- p.corr
for (i1 in 1:ncol(thyroid)) {
  for (i2 in 1:ncol(thyroid)) {
    p.corr.pval[i1,i2] <- cor.test(thyroid[, i1], thyroid[, i2])$p.value
  }
}
#p.corr.pval

library(xtable)
xtab.out <- xtable(p.corr, digits=3)
print(xtab.out, floating=FALSE, math.style.negative=TRUE)
xtab.out <- xtable(p.corr.pval, digits=4)
print(xtab.out, floating=FALSE, math.style.negative=TRUE)

s.corr <- cor(thyroid, method = "spearman")
#s.corr
# initialize pvalue table with dim names
s.corr.pval <- p.corr
for (i1 in 1:ncol(thyroid)) {
  for (i2 in 1:ncol(thyroid)) {
    s.corr.pval[i1,i2] <- cor.test(thyroid[, i1], thyroid[, i2], method = "spearman")$p.value
  }
}
#s.corr.pval

library(xtable)
xtab.out <- xtable(s.corr, digits=3)
print(xtab.out, floating=FALSE, math.style.negative=TRUE)
xtab.out <- xtable(s.corr.pval, digits=4)
print(xtab.out, floating=FALSE, math.style.negative=TRUE)

# Plot the data using ggplot
library(ggplot2)
library(GGally)
p1 <- ggpairs(thyroid[,1:3])
print(p1)

thyroid$rank_weight     <- rank(thyroid$weight)
thyroid$rank_time       <- rank(thyroid$time)
thyroid$rank_blood_loss <- rank(thyroid$blood_loss)
p2 <- ggpairs(thyroid[,4:6])
print(p2)

lm.blood.wt <- lm(blood_loss ~ weight, data = thyroid)
lm.blood.wt
# use summary() to get t-tests of parameters (slope, intercept)
summary(lm.blood.wt)

# ggplot: Plot the data with linear regression fit and confidence bands
library(ggplot2)
p <- ggplot(thyroid, aes(x = weight, y = blood_loss))
p <- p + geom_point()
p <- p + geom_smooth(method = lm, se = FALSE)
print(p)

# Base graphics: Plot the data with linear regression fit and confidence bands
# scatterplot
plot(thyroid$weight, thyroid$blood_loss)
# regression line from lm() fit
abline(lm.blood.wt)

# ANOVA table of the simple linear regression fit
anova(lm.blood.wt)
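# Added sketch (not in the original notes): for simple linear regression the
# overall F statistic in the ANOVA table equals the squared t statistic for
# the slope; a quick check using the lm.blood.wt fit above.
t.slope <- summary(lm.blood.wt)$coefficients["weight", "t value"]
f.model <- anova(lm.blood.wt)["weight", "F value"]
c(F = f.model, t.squared = t.slope^2)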
#### function to plot normal distributions centered at the regression line
f.plot.norm <- function(x.ind = 1, lm.fit, height.line = 1, sd.mult = 4,
                        plot.col = "gray50", n.lines = 375) {
  # scales package, to include transparency for curves
  library(scales)
  # these are the x-values in the plot
  x.fit <- lm.fit$model[, 2]
  # the spread for the curve will match the estimated residual standard
  # deviation from the model fit
  sd.fit <- summary(lm.fit)$sigma
  # calculate horizontal lines for the height of a normal distribution
  # vertical steps
  lines.steps <- seq(predict(lm.fit)[x.ind] - sd.mult * sd.fit,
                     predict(lm.fit)[x.ind] + sd.mult * sd.fit,
                     length = n.lines)
  # top of curve
  lines.norm <- rep(x.fit[x.ind], n.lines) +
                height.line * dnorm(lines.steps, mean = predict(lm.fit)[x.ind], sd = sd.fit)
  # plot each line to make curve
  for(i.step in 1:length(lines.steps)) {
    y.pos <- lines.steps[i.step]
    x.pos <- lines.norm[i.step]
    segments(x0 = x.fit[x.ind], y0 = y.pos, x1 = x.pos, y1 = y.pos
           , lty = 1, col = alpha(plot.col, 0.3))
  }
}

# simulate data along a regression line
n <- 100
sd.mult <- 3
x <- rgamma(n, 3, 0.5)
eps <- rnorm(n, 0, sd.mult)
y <- 4 + -2 * x + eps
sim <- data.frame(x = x, y = y)

# plot data
plot(x, y, pch = 16, cex = 0.5)
# fit model
lm.fit <- lm(y ~ x, data = sim)
# plot fitted line
abline(lm.fit, lty = 1, col = "gray20")
# add normal curves
f.plot.norm(1, lm.fit, height.line = 8)
f.plot.norm(2, lm.fit, height.line = 8)
f.plot.norm(3, lm.fit, height.line = 8)
# show one with two standard deviations highlighted
f.plot.norm(6, lm.fit, height.line = 8, plot.col = "red")
f.plot.norm(6, lm.fit, height.line = 8, sd.mult = 2, plot.col = "skyblue")
# partially refresh points
points(x, y, pch = 16, cex = 0.5, col = alpha("black", 0.5))

# CI for beta1
sum.lm.blood.wt <- summary(lm.blood.wt)
sum.lm.blood.wt$coefficients
est.beta1 <- sum.lm.blood.wt$coefficients[2,1]
se.beta1  <- sum.lm.blood.wt$coefficients[2,2]
sum.lm.blood.wt$fstatistic
df.beta1  <- sum.lm.blood.wt$fstatistic[3]
t.crit <- qt(1-0.05/2, df.beta1)
t.crit
CI.lower <- est.beta1 - t.crit * se.beta1
CI.upper <- est.beta1 + t.crit * se.beta1
c(CI.lower, CI.upper)

# CI for the mean and PI for a new observation at weight=50
predict(lm.blood.wt, data.frame(weight=50), interval = "confidence", level = 0.95)
predict(lm.blood.wt, data.frame(weight=50), interval = "prediction", level = 0.95)

# ggplot: Plot the data with linear regression fit and confidence bands
library(ggplot2)
p <- ggplot(thyroid, aes(x = weight, y = blood_loss))
p <- p + geom_point()
p <- p + geom_smooth(method = lm, se = TRUE)
print(p)

# Base graphics: Plot the data with linear regression fit and confidence bands
# scatterplot
plot(thyroid$weight, thyroid$blood_loss)
# regression line from lm() fit
abline(lm.blood.wt)
# x values of weight for predictions of confidence bands
x.pred <- data.frame(weight = seq(min(thyroid$weight), max(thyroid$weight), length = 20))
# draw upper and lower confidence bands
lines(x.pred$weight, predict(lm.blood.wt, x.pred, interval = "confidence")[, "upr"], col = "blue")
lines(x.pred$weight, predict(lm.blood.wt, x.pred, interval = "confidence")[, "lwr"], col = "blue")
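# Added for comparison (not in the original notes): confint() reproduces the
# hand-computed t-based confidence interval for the slope beta1 above.
confint(lm.blood.wt, level = 0.95)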
#### Nonconstant variance vs sample size

dat.var.sam <- function(n, s) {
  dat <- data.frame(
    x = c(rep(0, n[1]), rep(1, n[2]), rep(2, n[3]), rep(3, n[4]), rep(4, n[5])),
    y = c(rnorm(n[1], mean = 0, sd = s[1]),
          rnorm(n[2], mean = 0, sd = s[2]),
          rnorm(n[3], mean = 0, sd = s[3]),
          rnorm(n[4], mean = 0, sd = s[4]),
          rnorm(n[5], mean = 0, sd = s[5]))
  )
  return(dat)
}

library(ggplot2)

n <- c(25, 25, 25, 25, 25)
s <- c(1, 1, 1, 1, 1)
dat <- dat.var.sam(n, s)
p1 <- ggplot(dat, aes(x = x, y = y))
p1 <- p1 + geom_point(position = position_jitter(width = 0.1))
p1 <- p1 + labs(title = "Constant variance, constant sample size")
#print(p1)

n <- c(3, 5, 10, 25, 125)
s <- c(1, 1, 1, 1, 1)
dat <- dat.var.sam(n, s)
p2 <- ggplot(dat, aes(x = x, y = y))
p2 <- p2 + geom_point(position = position_jitter(width = 0.1))
p2 <- p2 + labs(title = "Constant variance, different sample sizes")
#print(p2)

n <- c(25, 25, 25, 25, 25)
s <- c(0.1, 0.5, 1, 1.5, 3)
dat <- dat.var.sam(n, s)
p3 <- ggplot(dat, aes(x = x, y = y))
p3 <- p3 + geom_point(position = position_jitter(width = 0.1))
p3 <- p3 + labs(title = "Different variance, constant sample size")
#print(p3)

n <- c(3, 5, 10, 25, 125)
s <- c(0.1, 0.5, 1, 1.5, 3)
dat <- dat.var.sam(n, s)
p4 <- ggplot(dat, aes(x = x, y = y))
p4 <- p4 + geom_point(position = position_jitter(width = 0.1))
p4 <- p4 + labs(title = "Different variance, different sample sizes")
#print(p4)

library(gridExtra)
grid.arrange(grobs = list(p1, p2, p3, p4), nrow=2, ncol=2
           , top = "Nonconstant variance vs sample size")

#### Residual and Diagnostic Analysis of the Blood Loss Data

thyroid <- read.table(text="
weight time blood_loss
  44.3  105        503
  40.6   80        490
  69.0   86        471
  43.7  112        505
  50.3  109        482
  50.2  100        490
  35.4   96        513
  52.2  120        464
", header=TRUE)
# show the structure of the data.frame
#str(thyroid)
# display the data.frame
#thyroid

library(xtable)
xtab.out <- xtable(thyroid, digits=1)
print(xtab.out, floating=FALSE, math.style.negative=TRUE)

# create data ids
thyroid$id <- 1:nrow(thyroid)

# ggplot: Plot the data with linear regression fit and confidence bands
library(ggplot2)
p <- ggplot(thyroid, aes(x = weight, y = blood_loss, label = id))
p <- p + geom_point()
# plot labels next to points
p <- p + geom_text(hjust = 0.5, vjust = -0.5)
# plot regression line and confidence band
p <- p + geom_smooth(method = lm)
print(p)

lm.blood.wt <- lm(blood_loss ~ weight, data = thyroid)
# use summary() to get t-tests of parameters (slope, intercept)
summary(lm.blood.wt)

# plot diagnostics
par(mfrow=c(2,3))
plot(lm.blood.wt, which = c(1,4,6))

# residuals vs weight
plot(thyroid$weight, lm.blood.wt$residuals, main="Residuals vs weight")
# horizontal line at zero
abline(h = 0, col = "gray75")

# Normality of Residuals
library(car)
# qq plot for studentized resid
#   las = 1    : turns labels on y-axis to read horizontally
#   id.n = n   : labels n most extreme observations, and outputs to console
#   id.cex = 1 : is the size of those labels
#   lwd = 1    : line width
qqPlot(lm.blood.wt$residuals, las = 1, id.n = 3, main="QQ Plot")

# residuals vs order of data
plot(lm.blood.wt$residuals, main="Residuals vs Order of data")
# horizontal line at zero
abline(h = 0, col = "gray75")

# wt = 1 for all except obs 3 where wt = 0
thyroid$wt <- as.numeric(!(thyroid$id == 3))
thyroid$wt
lm.blood.wt.no3 <- lm(blood_loss ~ weight, data = thyroid, weights = wt)
# use summary() to get t-tests of parameters (slope, intercept)
summary(lm.blood.wt.no3)

# exclude obs 3
thyroid.no3 <- subset(thyroid, wt == 1)

# ggplot: Plot the data with linear regression fit and confidence bands
library(ggplot2)
p <- ggplot(thyroid.no3, aes(x = weight, y = blood_loss, label = id))
p <- p + geom_point()
# plot labels next to points
p <- p + geom_text(hjust = 0.5, vjust = -0.5)
# plot regression line and confidence band
p <- p + geom_smooth(method = lm)
print(p)

# plot diagnostics
par(mfrow=c(2,3))
plot(lm.blood.wt.no3, which = c(1,4,6))

# residuals vs weight
plot(thyroid.no3$weight, lm.blood.wt.no3$residuals[(thyroid$wt == 1)]
   , main="Residuals vs weight")
# horizontal line at zero
abline(h = 0, col = "gray75")

# Normality of Residuals
library(car)
# qq plot for studentized resid
#   las = 1    : turns labels on y-axis to read horizontally
#   id.n = n   : labels n most extreme observations, and outputs to console
#   id.cex = 1 : is the size of those labels
#   lwd = 1    : line width
qqPlot(lm.blood.wt.no3$residuals, las = 1, id.n = 3, main="QQ Plot")

# residuals vs order of data
plot(lm.blood.wt.no3$residuals, main="Residuals vs Order of data")
# horizontal line at zero
abline(h = 0, col = "gray75")

# CI for the mean and PI for a new observation at weight=50
predict(lm.blood.wt    , data.frame(weight=50), interval = "prediction")
predict(lm.blood.wt.no3, data.frame(weight=50), interval = "prediction")
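# Added sketch (not in the original notes): Cook's distance can also be listed
# numerically; with the lm.blood.wt fit above, observation 3 should stand out.
sort(cooks.distance(lm.blood.wt), decreasing = TRUE)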
#### Gesell data

gesell <- read.table(text="
id age score
 1  15    95
 2  26    71
 3  10    83
 4   9    91
 5  15   102
 6  20    87
 7  18    93
 8  11   100
 9   8   104
10  20    94
11   7   113
12   9    96
13  10    83
14  11    84
15  11   102
16  10   100
17  12   105
18  42    57
19  17   121
20  11    86
21  10   100
", header=TRUE)
# show the structure of the data.frame
#str(gesell)
# display the data.frame
#gesell

library(xtable)
xtab.out <- xtable(gesell, digits=0)
print(xtab.out, floating=FALSE, math.style.negative=TRUE)

# ggplot: Plot the data with linear regression fit and confidence bands
library(ggplot2)
p <- ggplot(gesell, aes(x = age, y = score, label = id))
p <- p + geom_point()
# plot labels next to points
p <- p + geom_text(hjust = 0.5, vjust = -0.5)
# plot regression line and confidence band
p <- p + geom_smooth(method = lm)
print(p)

lm.score.age <- lm(score ~ age, data = gesell)
# use summary() to get t-tests of parameters (slope, intercept)
summary(lm.score.age)

# plot diagnostics
par(mfrow=c(2,3))
plot(lm.score.age, which = c(1,4,6))

# residuals vs age
plot(gesell$age, lm.score.age$residuals, main="Residuals vs age")
# horizontal line at zero
abline(h = 0, col = "gray75")

# Normality of Residuals
library(car)
qqPlot(lm.score.age$residuals, las = 1, id.n = 3, main="QQ Plot")

# residuals vs order of data
plot(lm.score.age$residuals, main="Residuals vs Order of data")
# horizontal line at zero
abline(h = 0, col = "gray75")

#### Weighted Least Squares

# R code to generate data
set.seed(7)
n <- 100
# 1s, Xs uniform 0 to 100
X <- matrix(c(rep(1,n), runif(n,0,100)), ncol=2)
# intercept and slope (5, 5)
beta <- matrix(c(5,5), ncol=1)
# errors are X*norm(0,1), so variance increases with X
e <- X[,2] * rnorm(n,0,1)
# response variables
y <- X %*% beta + e
# put data into data.frame
wlsdat <- data.frame(y, x = X[,2])

# fit regression
lm.y.x <- lm(y ~ x, data = wlsdat)
# put residuals in data.frame
wlsdat$res <- lm.y.x$residuals

# ggplot: Plot the data with linear regression fit
library(ggplot2)
p <- ggplot(wlsdat, aes(x = x, y = y))
p <- p + geom_point()
p <- p + geom_smooth(method = lm, se = FALSE)
print(p)

# ggplot: Plot the residuals
library(ggplot2)
p <- ggplot(wlsdat, aes(x = x, y = res))
p <- p + geom_point()
p <- p + geom_smooth(method = lm, se = FALSE)
print(p)

# ggplot: Plot the absolute value of the residuals
library(ggplot2)
p <- ggplot(wlsdat, aes(x = x, y = abs(res)))
p <- p + geom_point()
print(p)

# fit weighted regression, weights inversely proportional to x^2
lm.y.x.wt <- lm(y ~ x, data = wlsdat, weights = x^(-2))
# put residuals and square-root weights in data.frame
wlsdat$reswt <- lm.y.x.wt$residuals
wlsdat$wt    <- lm.y.x.wt$weights^(1/2)

# ggplot: Plot the weighted residuals (residuals times square-root weights)
library(ggplot2)
p <- ggplot(wlsdat, aes(x = x, y = reswt*wt))
p <- p + geom_point()
print(p)

summary(lm.y.x)
summary(lm.y.x.wt)
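# Added sketch (not in the original notes): a side-by-side look at the OLS and
# WLS slope estimates and standard errors extracted from the two fits above.
rbind(OLS = summary(lm.y.x)$coefficients["x", 1:2],
      WLS = summary(lm.y.x.wt)$coefficients["x", 1:2])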