#### Example: Log-linear population growth
pop <- read.table(text="
Year Pop_M
   1   170
 400   190
 800   220
1200   360
1600   545
1800   900
1850  1200
1900  1625
1950  2500
1975  3900
2000  6080
2012  7000
", header=TRUE)
pop$Pop    <- 1e6 * pop$Pop_M  # convert to millions
pop$PopL10 <- log10(pop$Pop)

# calculate the residuals from a simple linear regression
lm.fit <- lm(PopL10 ~ Year, data = pop)
# include those residuals in the pop table
pop$Res <- residuals(lm.fit)
pop$R10 <- 10^residuals(lm.fit) # residuals on the original scale
library(xtable)
xtab.out <- xtable(pop, digits=c(0,0,0,0,3,3,3))
print(xtab.out, floating=FALSE, math.style.negative=TRUE)
library(ggplot2)

p1 <- ggplot(pop, aes(x = Year, y = Pop))
p1 <- p1 + geom_point()
#print(p1)

p2 <- ggplot(pop, aes(x = Year, y = PopL10))
p2 <- p2 + geom_point()
p2 <- p2 + geom_smooth(method = lm, se = FALSE)
#print(p2)

library(gridExtra)
grid.arrange(grobs = list(p1, p2), nrow=1
            , top = "Log-linear transformation: world population")
# http://www.ncbi.nlm.nih.gov/pmc/articles/PMC153045/
# Supp:
# http://www.ncbi.nlm.nih.gov/pmc/articles/PMC153045/bin/pnas_0436428100_index.html
# Supporting information for White and Seymour (2003)
#   Proc. Natl. Acad. Sci. USA, 10.1073/pnas.0436428100

library(gdata)
fn <- "data/ADA1_notes_08_data_log-logScaling_BodyMassMetabolicRate_2003_WhiteSeymour.xlsx"
bm.bmr <- read.xls(fn, skip = 4, stringsAsFactors = TRUE)
bm.bmr$Log10BodyMass <- log10(bm.bmr$BodyMass)
bm.bmr$Log10BaseMetRate <- log10(bm.bmr$BaseMetRate)

# remove a very strange group
bm.bmr <- subset(bm.bmr, !(Group == "Artiodactyla 7"))
str(bm.bmr)

# log-log scale linear regression
lm.fit <- lm(Log10BaseMetRate ~ Log10BodyMass, data = bm.bmr)
# coefficients for regression line
coef(lm.fit)

library(ggplot2)

p1 <- ggplot(subset(bm.bmr, (Genus == "")), aes(x = BodyMass, y = BaseMetRate))
p1 <- p1 + geom_point(data = subset(bm.bmr, !(Genus == "")), aes(colour = Group), alpha = 0.2)
p1 <- p1 + geom_point(size = 3)
  # Using a custom function
  f.org.scale <- function(BodyMass) { 10^coef(lm.fit)[1] * BodyMass ^ coef(lm.fit)[2]}
p1 <- p1 + stat_function(fun = f.org.scale, size = 1)
p1 <- p1 + labs(title = paste("BaseMetRate = ", signif(10^coef(lm.fit)[1], 3), " * ", "BodyMass ^ ", signif(coef(lm.fit)[2], 3), sep = ""))
p1 <- p1 + scale_y_continuous(limits=c(0, 1300))
p1 <- p1 + scale_x_continuous(limits=c(0, 5000))
#print(p1)

p2 <- ggplot(subset(bm.bmr, (Genus == "")), aes(x = Log10BodyMass, y = Log10BaseMetRate))
p2 <- p2 + geom_point(data = subset(bm.bmr, !(Genus == "")), aes(colour = Group), alpha = 0.3)
p2 <- p2 + geom_point(size = 3)
p2 <- p2 + geom_smooth(method = lm, se = FALSE, fullrange = TRUE, size = 1, colour = "black")
p2 <- p2 + labs(title = paste("log10(BaseMetRate) = ", signif(coef(lm.fit)[1], 3), " + ", signif(coef(lm.fit)[2], 3), " log10(BodyMass)", sep = ""))
p2 <- p2 + scale_y_continuous(limits=c(NA, log10(1300)))
p2 <- p2 + scale_x_continuous(limits=c(NA, log10(5000)))
#print(p2)

p3 <- ggplot(subset(bm.bmr, (Genus == "")), aes(x = BodyMass, y = BaseMetRate))
p3 <- p3 + geom_point(data = subset(bm.bmr, !(Genus == "")), aes(colour = Group), alpha = 0.3)
p3 <- p3 + geom_point(size = 3)
p3 <- p3 + geom_smooth(method = lm, se = FALSE, fullrange = TRUE, size = 1, colour = "black")
p3 <- p3 + labs(title = paste("log10(BaseMetRate) = ", signif(coef(lm.fit)[1], 3), " + ", signif(coef(lm.fit)[2], 3), " log10(BodyMass)", sep = ""))
p3 <- p3 + scale_y_log10()#limits=c(NA, log10(1300)))
p3 <- p3 + scale_x_log10()#limits=c(NA, log10(5000)))
#print(p3)

library(gridExtra)
grid.arrange(grobs = list(p1, p2, p3), ncol=1
           , top = "Log-log transformation: metabolic rates")
pred.bm.bmr <- data.frame(BodyMass = 5 * c(1, 10, 100, 1000))
pred.bm.bmr$Log10BodyMass <- log10(pred.bm.bmr$BodyMass)
pred.bm.bmr$Log10BaseMetRate <- predict(lm.fit, pred.bm.bmr)
pred.bm.bmr$BaseMetRate <- 10^pred.bm.bmr$Log10BaseMetRate

tab.pred <- subset(pred.bm.bmr, select = c("BodyMass", "BaseMetRate"))
library(xtable)
xtab.out <- xtable(tab.pred, digits=c(0,0,2))
print(xtab.out, floating=FALSE, math.style.negative=TRUE)
#### Example: Blood loss
thyroid <- read.table(text="
weight time blood_loss
  44.3  105        503
  40.6   80        490
  69.0   86        471
  43.7  112        505
  50.3  109        482
  50.2  100        490
  35.4   96        513
  52.2  120        464
", header=TRUE)

# show the structure of the data.frame
str(thyroid)
# display the data.frame
#thyroid
library(xtable)
xtab.out <- xtable(thyroid, digits=1)
print(xtab.out, floating=FALSE, math.style.negative=TRUE)
p.corr <- cor(thyroid);
#p.corr

# initialize pvalue table with dim names
p.corr.pval <- p.corr;
for (i1 in 1:ncol(thyroid)) {
  for (i2 in 1:ncol(thyroid)) {
    p.corr.pval[i1,i2] <- cor.test(thyroid[, i1], thyroid[, i2])$p.value
  }
}
#p.corr.pval
library(xtable)
xtab.out <- xtable(p.corr, digits=3)
print(xtab.out, floating=FALSE, math.style.negative=TRUE)
xtab.out <- xtable(p.corr.pval, digits=4)
print(xtab.out, floating=FALSE, math.style.negative=TRUE)
s.corr <- cor(thyroid, method = "spearman");
#s.corr

# initialize pvalue table with dim names
s.corr.pval <- p.corr;
for (i1 in 1:ncol(thyroid)) {
  for (i2 in 1:ncol(thyroid)) {
    s.corr.pval[i1,i2] <- cor.test(thyroid[, i1], thyroid[, i2],
                                   method = "spearman")$p.value
  }
}
#s.corr.pval
library(xtable)
xtab.out <- xtable(s.corr, digits=3)
print(xtab.out, floating=FALSE, math.style.negative=TRUE)
xtab.out <- xtable(s.corr.pval, digits=4)
print(xtab.out, floating=FALSE, math.style.negative=TRUE)
# Plot the data using ggplot
library(ggplot2)
library(GGally)
p1 <- ggpairs(thyroid[,1:3])
print(p1)


thyroid$rank_weight     <- rank(thyroid$weight    )
thyroid$rank_time       <- rank(thyroid$time      )
thyroid$rank_blood_loss <- rank(thyroid$blood_loss)

p2 <- ggpairs(thyroid[,4:6])
print(p2)
lm.blood.wt <- lm(blood_loss ~ weight, data = thyroid)
lm.blood.wt
# use summary() to get t-tests of parameters (slope, intercept)
summary(lm.blood.wt)
# ggplot: Plot the data with linear regression fit and confidence bands
library(ggplot2)
p <- ggplot(thyroid, aes(x = weight, y = blood_loss))
p <- p + geom_point()
p <- p + geom_smooth(method = lm, se = FALSE)
print(p)


# Base graphics: Plot the data with linear regression fit and confidence bands
# scatterplot
plot(thyroid$weight, thyroid$blood_loss)
# regression line from lm() fit
abline(lm.blood.wt)
# ANOVA table of the simple linear regression fit
anova(lm.blood.wt)
#### function to plot normal distributions centered at the regression line
f.plot.norm <- function(x.ind = 1, lm.fit, height.line = 1, sd.mult = 4, plot.col = "gray50", n.lines = 375) {
  # scales package, to include transparency for curves
  library(scales)
  # these are the x-values in the plot
  x.fit <- lm.fit$model[, 2]
  # the spread for the curve will match the estimated variance from the model fit
  sd.fit <- summary(lm.fit)$sigma
  # calculate horizontal lines for the height of a normal distribution
  # vertical steps
  lines.steps <- seq(predict(lm.fit)[x.ind] - sd.mult * sd.fit, predict(lm.fit)[x.ind] + sd.mult * sd.fit, length = n.lines)
  # top of curve
  lines.norm <- rep(x.fit[x.ind], n.lines) +
                  height.line * dnorm(lines.steps, mean = predict(lm.fit)[x.ind], sd = sd.fit)
  # plot each line to make curve
  for(i.step in 1:length(lines.steps)) {
     y.pos <- lines.steps[i.step]
     x.pos <- lines.norm[i.step]
     segments(x0 = x.fit[x.ind], y0 = y.pos, x1 = x.pos, y1 = y.pos
            , lty = 1, col = alpha(plot.col, 0.3))
  }
}

# simulate data along a regression line
n <- 100
sd.mult <- 3
x <- rgamma(n, 3, 0.5)
eps <- rnorm(n, 0, sd.mult)
y <- 4 + -2 * x + eps
sim <- data.frame(x = x, y = y)

# plot data
plot(x, y, pch = 16, cex = 0.5)

# fit model
lm.fit <- lm(y ~ x, data = sim)

# plot fitted line
abline(lm.fit, lty = 1, col = "gray20")

# add normal curves
f.plot.norm(1, lm.fit, height.line = 8)
f.plot.norm(2, lm.fit, height.line = 8)
f.plot.norm(3, lm.fit, height.line = 8)
# show one with two standard deviations highlighted
f.plot.norm(6, lm.fit, height.line = 8, plot.col = "red")
f.plot.norm(6, lm.fit, height.line = 8, sd.mult = 2, plot.col = "skyblue")
# partially refresh points
points(x, y, pch = 16, cex = 0.5, col = alpha("black", 0.5))
# CI for beta1
sum.lm.blood.wt <- summary(lm.blood.wt)
sum.lm.blood.wt$coefficients
est.beta1 <- sum.lm.blood.wt$coefficients[2,1]
se.beta1  <- sum.lm.blood.wt$coefficients[2,2]
sum.lm.blood.wt$fstatistic
df.beta1  <- sum.lm.blood.wt$fstatistic[3]
t.crit    <- qt(1-0.05/2, df.beta1)
t.crit
CI.lower  <- est.beta1 - t.crit * se.beta1
CI.upper  <- est.beta1 + t.crit * se.beta1
c(CI.lower, CI.upper)
# CI for the mean and PI for a new observation at weight=50
predict(lm.blood.wt, data.frame(weight=50), interval = "confidence", level = 0.95)
predict(lm.blood.wt, data.frame(weight=50), interval = "prediction", level = 0.95)
# ggplot: Plot the data with linear regression fit and confidence bands
library(ggplot2)
p <- ggplot(thyroid, aes(x = weight, y = blood_loss))
p <- p + geom_point()
p <- p + geom_smooth(method = lm, se = TRUE)
print(p)


# Base graphics: Plot the data with linear regression fit and confidence bands
# scatterplot
plot(thyroid$weight, thyroid$blood_loss)
# regression line from lm() fit
abline(lm.blood.wt)
# x values of weight for predictions of confidence bands
x.pred <- data.frame(weight = seq(min(thyroid$weight), max(thyroid$weight),
                      length = 20))
# draw upper and lower confidence bands
lines(x.pred$weight, predict(lm.blood.wt, x.pred,
                             interval = "confidence")[, "upr"], col = "blue")
lines(x.pred$weight, predict(lm.blood.wt, x.pred,
                             interval = "confidence")[, "lwr"], col = "blue")
#### Nonconstant variance vs sample size
dat.var.sam <- function(n, s) {
  dat <- data.frame(
           x = c(rep(0, n[1]),
                 rep(1, n[2]),
                 rep(2, n[3]),
                 rep(3, n[4]),
                 rep(4, n[5])),
           y = c(rnorm(n[1], mean = 0, sd = s[1]),
                 rnorm(n[2], mean = 0, sd = s[2]),
                 rnorm(n[3], mean = 0, sd = s[3]),
                 rnorm(n[4], mean = 0, sd = s[4]),
                 rnorm(n[5], mean = 0, sd = s[5]))
          )
  return(dat)
}

library(ggplot2)

n <- c(25, 25, 25, 25, 25)
s <- c(1, 1, 1, 1, 1)
dat <- dat.var.sam(n, s)
p1 <- ggplot(dat, aes(x = x, y = y))
p1 <- p1 + geom_point(position = position_jitter(width = 0.1))
p1 <- p1 + labs(title = "Constant variance, constant sample size")
#print(p1)

n <- c(3, 5, 10, 25, 125)
s <- c(1, 1, 1, 1, 1)
dat <- dat.var.sam(n, s)
p2 <- ggplot(dat, aes(x = x, y = y))
p2 <- p2 + geom_point(position = position_jitter(width = 0.1))
p2 <- p2 + labs(title = "Constant variance, different sample sizes")
#print(p2)

n <- c(25, 25, 25, 25, 25)
s <- c(0.1, 0.5, 1, 1.5, 3)
dat <- dat.var.sam(n, s)
p3 <- ggplot(dat, aes(x = x, y = y))
p3 <- p3 + geom_point(position = position_jitter(width = 0.1))
p3 <- p3 + labs(title = "Different variance, constant sample size")
#print(p3)

n <- c(3, 5, 10, 25, 125)
s <- c(0.1, 0.5, 1, 1.5, 3)
dat <- dat.var.sam(n, s)
p4 <- ggplot(dat, aes(x = x, y = y))
p4 <- p4 + geom_point(position = position_jitter(width = 0.1))
p4 <- p4 + labs(title = "Different variance, different sample sizes")
#print(p4)

library(gridExtra)
grid.arrange(grobs = list(p1, p2, p3, p4), nrow=2, ncol=2
            , top = "Nonconstant variance vs sample size")
#### Residual and Diagnostic Analysis of the Blood Loss Data
thyroid <- read.table(text="
weight time blood_loss
  44.3  105        503
  40.6   80        490
  69.0   86        471
  43.7  112        505
  50.3  109        482
  50.2  100        490
  35.4   96        513
  52.2  120        464
", header=TRUE)

# show the structure of the data.frame
#str(thyroid)
# display the data.frame
#thyroid
library(xtable)
xtab.out <- xtable(thyroid, digits=1)
print(xtab.out, floating=FALSE, math.style.negative=TRUE)
# create data ids
thyroid$id <- 1:nrow(thyroid)
# ggplot: Plot the data with linear regression fit and confidence bands
library(ggplot2)
p <- ggplot(thyroid, aes(x = weight, y = blood_loss, label = id))
p <- p + geom_point()
# plot labels next to points
p <- p + geom_text(hjust = 0.5, vjust = -0.5)
# plot regression line and confidence band
p <- p + geom_smooth(method = lm)
print(p)
lm.blood.wt <- lm(blood_loss ~ weight, data = thyroid)
# use summary() to get t-tests of parameters (slope, intercept)
summary(lm.blood.wt)
# plot diagnistics
par(mfrow=c(2,3))
plot(lm.blood.wt, which = c(1,4,6))

# residuals vs weight
plot(thyroid$weight, lm.blood.wt$residuals, main="Residuals vs weight")
  # horizontal line at zero
  abline(h = 0, col = "gray75")

# Normality of Residuals
library(car)
# qq plot for studentized resid
#   las    =  1 : turns labels on y-axis to read horizontally
#   id.n   =  n : labels n most extreme observations, and outputs to console
#   id.cex =  1 : is the size of those labels
#   lwd    =  1 : line width
qqPlot(lm.blood.wt$residuals, las = 1, id.n = 3, main="QQ Plot")

# residuals vs order of data
plot(lm.blood.wt$residuals, main="Residuals vs Order of data")
  # horizontal line at zero
  abline(h = 0, col = "gray75")
# wt = 1 for all except obs 3 where wt = 0
thyroid$wt <- as.numeric(!(thyroid$id == 3))
thyroid$wt
lm.blood.wt.no3 <- lm(blood_loss ~ weight, data = thyroid, weights = wt)
# use summary() to get t-tests of parameters (slope, intercept)
summary(lm.blood.wt.no3)
# exclude obs 3
thyroid.no3 <- subset(thyroid, wt == 1)
# ggplot: Plot the data with linear regression fit and confidence bands
library(ggplot2)
p <- ggplot(thyroid.no3, aes(x = weight, y = blood_loss, label = id))
p <- p + geom_point()
# plot labels next to points
p <- p + geom_text(hjust = 0.5, vjust = -0.5)
# plot regression line and confidence band
p <- p + geom_smooth(method = lm)
print(p)
# plot diagnistics
par(mfrow=c(2,3))
plot(lm.blood.wt.no3, which = c(1,4,6))

# residuals vs weight
plot(thyroid.no3$weight, lm.blood.wt.no3$residuals[(thyroid$wt == 1)]
   , main="Residuals vs weight")
  # horizontal line at zero
  abline(h = 0, col = "gray75")

# Normality of Residuals
library(car)
# qq plot for studentized resid
#   las    =  1 : turns labels on y-axis to read horizontally
#   id.n   =  n : labels n most extreme observations, and outputs to console
#   id.cex =  1 : is the size of those labels
#   lwd    =  1 : line width
qqPlot(lm.blood.wt.no3$residuals, las = 1, id.n = 3, main="QQ Plot")

# residuals vs order of data
plot(lm.blood.wt.no3$residuals, main="Residuals vs Order of data")
  # horizontal line at zero
  abline(h = 0, col = "gray75")
# CI for the mean and PI for a new observation at weight=50
predict(lm.blood.wt    , data.frame(weight=50), interval = "prediction")
predict(lm.blood.wt.no3, data.frame(weight=50), interval = "prediction")
#### Gesell data
gesell <- read.table(text="
id  age  score
 1   15     95
 2   26     71
 3   10     83
 4    9     91
 5   15    102
 6   20     87
 7   18     93
 8   11    100
 9    8    104
10   20     94
11    7    113
12    9     96
13   10     83
14   11     84
15   11    102
16   10    100
17   12    105
18   42     57
19   17    121
20   11     86
21   10    100
", header=TRUE)

# show the structure of the data.frame
#str(gesell)
# display the data.frame
#gesell
library(xtable)
xtab.out <- xtable(gesell, digits=0)
print(xtab.out, floating=FALSE, math.style.negative=TRUE)
# ggplot: Plot the data with linear regression fit and confidence bands
library(ggplot2)
p <- ggplot(gesell, aes(x = age, y = score, label = id))
p <- p + geom_point()
# plot labels next to points
p <- p + geom_text(hjust = 0.5, vjust = -0.5)
# plot regression line and confidence band
p <- p + geom_smooth(method = lm)
print(p)
lm.score.age <- lm(score ~ age, data = gesell)
# use summary() to get t-tests of parameters (slope, intercept)
summary(lm.score.age)
# plot diagnistics
par(mfrow=c(2,3))
plot(lm.score.age, which = c(1,4,6))

# residuals vs weight
plot(gesell$age, lm.score.age$residuals, main="Residuals vs age")
  # horizontal line at zero
  abline(h = 0, col = "gray75")

# Normality of Residuals
library(car)
qqPlot(lm.score.age$residuals, las = 1, id.n = 3, main="QQ Plot")

# residuals vs order of data
plot(lm.score.age$residuals, main="Residuals vs Order of data")
  # horizontal line at zero
  abline(h = 0, col = "gray75")
#### Weighted Least Squares
# R code to generate data
set.seed(7)
n <- 100
# 1s, Xs uniform 0 to 100
X <- matrix(c(rep(1,n),runif(n,0,100)), ncol=2)
# intercept and slope (5, 5)
beta <- matrix(c(5,5),ncol=1)
# errors are X*norm(0,1), so variance increases with X
e <- X[,2]*rnorm(n,0,1)
# response variables
y <- X %*% beta + e

# put data into data.frame
wlsdat <- data.frame(y, x = X[,2])

# fit regression
lm.y.x <- lm(y ~ x, data = wlsdat)

# put residuals in data.frame
wlsdat$res <- lm.y.x$residuals
# ggplot: Plot the data with linear regression fit
library(ggplot2)
p <- ggplot(wlsdat, aes(x = x, y = y))
p <- p + geom_point()
p <- p + geom_smooth(method = lm, se = FALSE)
print(p)


# ggplot: Plot the residuals
library(ggplot2)
p <- ggplot(wlsdat, aes(x = x, y = res))
p <- p + geom_point()
p <- p + geom_smooth(method = lm, se = FALSE)
print(p)
# ggplot: Plot the absolute value of the residuals
library(ggplot2)
p <- ggplot(wlsdat, aes(x = x, y = abs(res)))
p <- p + geom_point()
print(p)
# fit regression
lm.y.x.wt <- lm(y ~ x, data = wlsdat, weights = x^(-2))

# put residuals in data.frame
wlsdat$reswt <- lm.y.x.wt$residuals
wlsdat$wt <- lm.y.x.wt$weights^(1/2)
# ggplot: Plot the absolute value of the residuals
library(ggplot2)
p <- ggplot(wlsdat, aes(x = x, y = reswt*wt))
p <- p + geom_point()
print(p)
summary(lm.y.x)
summary(lm.y.x.wt)