## Parallel sets function parallelset <- function(..., freq, col="gray", border=0, layer, alpha=0.5, gap.width=0.05) { p <- data.frame(..., freq, col, border, alpha, stringsAsFactors=FALSE) n <- nrow(p) if(missing(layer)) { layer <- 1:n } p$layer <- layer np <- ncol(p) - 5 d <- p[ , 1:np, drop=FALSE] p <- p[ , -c(1:np), drop=FALSE] p$freq <- with(p, freq/sum(freq)) col <- col2rgb(p$col, alpha=TRUE) if(!identical(alpha, FALSE)) { col["alpha", ] <- p$alpha*256 } p$col <- apply(col, 2, function(x) do.call(rgb, c(as.list(x), maxColorValue = 256))) getp <- function(i, d, f, w=gap.width) { a <- c(i, (1:ncol(d))[-i]) o <- do.call(order, d[a]) x <- c(0, cumsum(f[o])) * (1-w) x <- cbind(x[-length(x)], x[-1]) gap <- cumsum( c(0L, diff(as.numeric(d[o,i])) != 0) ) gap <- gap / max(gap) * w (x + gap)[order(o),] } dd <- lapply(seq_along(d), getp, d=d, f=p$freq) par(mar = c(0, 0, 2, 0) + 0.1, xpd=TRUE ) plot(NULL, type="n",xlim=c(0, 1), ylim=c(np, 1), xaxt="n", yaxt="n", xaxs="i", yaxs="i", xlab='', ylab='', frame=FALSE) for(i in rev(order(p$layer)) ) { for(j in 1:(np-1) ) polygon(c(dd[[j]][i,], rev(dd[[j+1]][i,])), c(j, j, j+1, j+1), col=p$col[i], border=p$border[i]) } text(0, seq_along(dd), labels=names(d), adj=c(0,-2), font=2) for(j in seq_along(dd)) { ax <- lapply(split(dd[[j]], d[,j]), range) for(k in seq_along(ax)) { lines(ax[[k]], c(j, j)) text(ax[[k]][1], j, labels=names(ax)[k], adj=c(0, -0.25)) } } } # The Titanic dataset is a 4-dimensional table: Class, Sex, Age, Survived library(datasets) data(Titanic) Titanic # reshape into long data.frame library(reshape2) df.titanic <- melt(Titanic, value.name = "Freq") df.titanic # Total number of people sum(df.titanic$Freq) # create colors based on survival df.titanic$Color <- ifelse(df.titanic$Survived == "Yes", "#008888", "#330066") # subset only the adults (since there were so few children) df.titanic.adult <- subset(df.titanic, Age == "Adult") # see R code on website for function parallelset() # see help for with(), it allows temporary direct reference to columns in a data.frame # otherwise, we'd need to specify df.titanic.adult$Survived, ... with(df.titanic.adult , parallelset(Survived, Sex, Class, freq = Freq, col = Color, alpha=0.2) ) #### Single Proportion Problems # Approximate normal test for proportion, without Yates' continuity correction prop.test(21, 200, p = 0.15, correct = FALSE) # Approximate normal test for proportion, with Yates' continuity correction #prop.test(21, 200, p = 0.15) #### Example: brain hemispheres # Approximate normal test for proportion, without Yates' continuity correction prop.test(22, 53, p = 0.25, alternative = "greater", correct = FALSE) #### Example: swimming segregation ## The prop.test() does an additional adjustment, so does not match precisely ## the results in the above paragraphs # Approximate normal test for proportion, without Yates' continuity correction prop.test(1, 6, p = 0.85, correct = FALSE)$conf.int # Agresti's method prop.test(1+2, 6+4, p = 0.85, correct = FALSE)$conf.int # Exact binomial test for proportion binom.test(1, 6, p = 0.85)$conf.int # Exact binomial test for proportion binom.test(1, 6, alternative = "less", p = 0.85) #### Example: swimming segregation, raw data ## read.table options # sep = default is any white space, but our strings contain a space, # so I changed this to a comma # header = there are no column headers # stringsAsFActors = default converts strings to factors, but I want them # to just be the plain character text swim <- read.table(text=" not approved not approved not approved approved not approved not approved ", sep = ",", header=FALSE, stringsAsFactors=FALSE) # name the column names(swim) <- c("application") # show the structure of the data.frame str(swim) # display the data.frame swim # count the frequency of each categorical variable table(swim) # use the counts from table() for input in binom.text() # the help for binom.test() says x can be a vector of length 2 # giving the numbers of successes and failures, respectively # that's exactly what table(swim) gave us binom.test(table(swim), p = 0.85, alternative = "less") n <- 6 x <- 0:n p0 <- 0.85 bincdf <- pbinom(x, n, p0) cdf <- data.frame(x, bincdf) cdf #### Example: jury pool jury <- read.table(text=" Age Count CensusProp 18-19 23 0.061 20-24 96 0.150 25-29 134 0.135 30-39 293 0.217 40-49 297 0.153 50-64 380 0.182 65-99 113 0.102 ", header=TRUE) # show the structure of the data.frame str(jury) # display the data.frame jury # calculate chi-square goodness-of-fit test x.summary <- chisq.test(jury$Count, correct = FALSE, p = jury$CensusProp) # print result of test x.summary # use output in x.summary and create table x.table <- data.frame(age = jury$Age , obs = x.summary$observed , exp = x.summary$expected , res = x.summary$residuals , chisq = x.summary$residuals^2 , stdres = x.summary$stdres) x.table library(reshape2) x.table.obsexp <- melt(x.table, # id.vars: ID variables # all variables to keep but not split apart on id.vars = c("age"), # measure.vars: The source columns # (if unspecified then all other variables are measure.vars) measure.vars = c("obs", "exp"), # variable.name: Name of the destination column identifying each # original column that the measurement came from variable.name = "stat", # value.name: column name for values in table value.name = "value" ) # naming variables manually, the variable.name and value.name not working 11/2012 names(x.table.obsexp) <- c("age", "stat", "value") # Observed vs Expected counts library(ggplot2) p <- ggplot(x.table.obsexp, aes(x = age, fill = stat, weight=value)) p <- p + geom_bar(position="dodge") p <- p + labs(title = "Observed and Expected frequencies") p <- p + xlab("Age category (years)") print(p) # Contribution to chi-sq # pull out only the age and chisq columns x.table.chisq <- x.table[, c("age","chisq")] # reorder the age categories to be descending relative to the chisq statistic x.table.chisq$age <- with(x.table, reorder(age, -chisq)) p <- ggplot(x.table.chisq, aes(x = age, weight = chisq)) p <- p + geom_bar() p <- p + labs(title = "Contribution to Chi-sq statistic") p <- p + xlab("Sorted age category (years)") p <- p + ylab("Chi-sq") print(p) b.sum1 <- binom.test(jury$Count[1], sum(jury$Count), p = jury$CensusProp[1], alternative = "two.sided", conf.level = 1-0.05/7) b.sum2 <- binom.test(jury$Count[2], sum(jury$Count), p = jury$CensusProp[2], alternative = "two.sided", conf.level = 1-0.05/7) b.sum3 <- binom.test(jury$Count[3], sum(jury$Count), p = jury$CensusProp[3], alternative = "two.sided", conf.level = 1-0.05/7) b.sum4 <- binom.test(jury$Count[4], sum(jury$Count), p = jury$CensusProp[4], alternative = "two.sided", conf.level = 1-0.05/7) b.sum5 <- binom.test(jury$Count[5], sum(jury$Count), p = jury$CensusProp[5], alternative = "two.sided", conf.level = 1-0.05/7) b.sum6 <- binom.test(jury$Count[6], sum(jury$Count), p = jury$CensusProp[6], alternative = "two.sided", conf.level = 1-0.05/7) b.sum7 <- binom.test(jury$Count[7], sum(jury$Count), p = jury$CensusProp[7], alternative = "two.sided", conf.level = 1-0.05/7) # put the p-value and CI into a data.frame b.sum <- data.frame( rbind( c(b.sum1$p.value, b.sum1$conf.int) , c(b.sum2$p.value, b.sum2$conf.int) , c(b.sum3$p.value, b.sum3$conf.int) , c(b.sum4$p.value, b.sum4$conf.int) , c(b.sum5$p.value, b.sum5$conf.int) , c(b.sum6$p.value, b.sum6$conf.int) , c(b.sum7$p.value, b.sum7$conf.int) ) ) names(b.sum) <- c("p.value", "CI.lower", "CI.upper") b.sum$Age <- jury$Age b.sum$Observed <- x.table$obs/sum(x.table$obs) b.sum$CensusProp <- jury$CensusProp b.sum library(xtable) xtab.out <- xtable(b.sum[,c("Age","p.value","CI.lower","CI.upper","Observed","CensusProp")], digits=3) print(xtab.out, floating=FALSE, math.style.negative=TRUE) #### Example, vitamin C # Approximate normal test for two-proportions, without Yates' continuity correction prop.test(c(17, 31), c(139, 140), correct = FALSE) # Create the labelled table hpv <- matrix(c(164, 130, 11, 178), nrow = 2, byrow = TRUE, dimnames = list("HPV.Outcome" = c("Positive", "Negative"), "Group" = c("Cases", "Controls"))) hpv # calculate the column (margin = 2) proportions hpv.col.prop <- prop.table(hpv, margin = 2) hpv.col.prop # OR, convert to long format and use xtabs to produce the table library(reshape2) hpv.long <- melt(hpv, value.name = "Frequency") hpv.long T1 <- xtabs(Frequency ~ HPV.Outcome + Group, data = hpv.long) T1 hpv.col.prop <- prop.table(T1, margin = 2) hpv.col.prop library(reshape2) hpv.col.prop.long <- melt(hpv.col.prop, value.name = "Proportion") hpv.col.prop.long # join these two datasets to have both Freq and Prop columns library(plyr) hpv.long <- join(hpv.long, hpv.col.prop.long) hpv.long # plots are easier now that data are in long format. library(ggplot2) p <- ggplot(data = hpv.long, aes(x = Group, y = Frequency, fill = HPV.Outcome)) p <- p + geom_bar(stat="identity", position = "dodge") p <- p + theme_bw() p <- p + labs(title = "Frequency of HPV by Case/Control group") print(p) # bars, stacked library(ggplot2) p <- ggplot(data = hpv.long, aes(x = Group, y = Proportion, fill = HPV.Outcome)) p <- p + geom_bar(stat="identity") p <- p + theme_bw() p <- p + labs(title = "Proportion of HPV by Case/Control group") print(p) # bars, dodged library(ggplot2) p <- ggplot(data = hpv.long, aes(x = Group, y = Proportion, fill = HPV.Outcome)) p <- p + geom_bar(stat="identity", position = "dodge") p <- p + theme_bw() p <- p + labs(title = "Proportion of HPV by Case/Control group") p <- p + scale_y_continuous(limits = c(0, 1)) print(p) # lines are sometimes easier, especially when many categories along the x-axis library(ggplot2) p <- ggplot(data = hpv.long, aes(x = Group, y = Proportion, colour = HPV.Outcome)) p <- p + geom_hline(yintercept = c(0, 1), alpha = 1/4) p <- p + geom_point(aes(shape = HPV.Outcome)) p <- p + geom_line(aes(linetype = HPV.Outcome, group = HPV.Outcome)) p <- p + theme_bw() p <- p + labs(title = "Proportion of HPV by Case/Control group") p <- p + scale_y_continuous(limits = c(0, 1)) print(p) # Approximate normal test for two-proportions, without Yates' continuity correction prop.test(c(164, 130), c(175, 308), correct = FALSE) # one-sided test, are cases more likely to be HPV positive? prop.test(c(164, 130), c(175, 308), correct = FALSE, alternative = "greater") #### Example, President performance # McNemar's test needs data as a matrix # Presidential Approval Ratings. # Approval of the President's performance in office in two surveys, # one month apart, for a random sample of 1600 voting-age Americans. pres <- matrix(c(794, 150, 86, 570), nrow = 2, byrow = TRUE, dimnames = list("1st Survey" = c("Approve", "Disapprove"), "2nd Survey" = c("Approve", "Disapprove"))) pres mcnemar.test(pres, correct=FALSE) # => significant change (in fact, drop) in approval ratings #### Example, cancer deaths candeath <- matrix(c( 94, 418, 23, 116, 524, 34, 156, 581, 109, 138, 558, 238), nrow = 4, byrow = TRUE, dimnames = list("Age" = c("15-54", "55-64", "65-74", "75+"), "Location of death" = c("Home", "Acute Care", "Chronic care"))) candeath chisq.summary <- chisq.test(candeath, correct=FALSE) chisq.summary # The Pearson residuals chisq.summary$residuals # The sum of the squared residuals is the chi-squared statistic: chisq.summary$residuals^2 sum(chisq.summary$residuals^2) # mosaic plot library(vcd) mosaic(candeath, shade=TRUE, legend=TRUE) # association plot library(vcd) assoc(candeath, shade=TRUE) # sieve plot library(vcd) # plot expected values sieve(candeath, sievetype = "expected", shade = TRUE) # plot observed table, then label cells with observed values in the cells sieve(candeath, pop = FALSE, shade = TRUE) labeling_cells(text = candeath, gp_text = gpar(fontface = 2))(as.table(candeath)) #### Example, antismoking adverts antismokead <- matrix(c( 8, 14, 35, 21, 19, 31, 42, 78, 61, 69), nrow = 2, byrow = TRUE, dimnames = list( "Status" = c("Smoker", "Non-smoker"), "Reaction" = c("Str. Dislike", "Dislike", "Neutral", "Like", "Str. Like"))) antismokead chisq.summary <- chisq.test(antismokead, correct=FALSE) chisq.summary # All expected frequencies are at least 5 chisq.summary$expected # Contribution to chi-squared statistic chisq.summary$residuals^2 #### Example: drugs and nausea, testing for homogeneity nausea <- matrix(c(96, 70, 52, 100, 52, 33, 35, 32, 37, 48), nrow = 5, byrow = TRUE, dimnames = list("Drug" = c("PL", "CH", "DI", "PE100", "PE150"), "Result" = c("Nausea", "No Nausea"))) nausea # Sorted proportions of nausea by drug nausea.prop <- sort(nausea[,1]/rowSums(nausea)) nausea.prop # chi-sq test of association chisq.summary <- chisq.test(nausea, correct=FALSE) chisq.summary # All expected frequencies are at least 5 chisq.summary$expected nausea.table <- data.frame(Interval = rep(NA,10) , CI.lower = rep(NA,10) , CI.upper = rep(NA,10) , Z = rep(NA,10) , p.value = rep(NA,10) , sig.temp = rep(NA,10) , sig = rep(NA,10)) # row names for table nausea.table[,1] <- c("p_PL - p_CH" , "p_PL - p_DI" , "p_PL - p_PE100" , "p_PL - p_PE150" , "p_CH - p_DI" , "p_CH - p_PE100" , "p_CH - p_PE150" , "p_DI - p_PE100" , "p_DI - p_PE150" , "p_PE100 - p_PE150") # test results together in a table i.tab <- 0 for (i in 1:4) { for (j in (i+1):5) { i.tab <- i.tab + 1 nausea.summary <- prop.test(nausea[c(i,j),], correct = FALSE, conf.level = 1-0.05/10) nausea.table[i.tab, 2:6] <- c(nausea.summary$conf.int[1] , nausea.summary$conf.int[2] , sign(-diff(nausea.summary$estimate)) * nausea.summary$statistic^0.5 , nausea.summary$p.value , (nausea.summary$p.value < 0.05/10)) if (nausea.table$sig.temp[i.tab] == 1) { nausea.table$sig[i.tab] <- "*" } else { nausea.table$sig[i.tab] <- " " } } } # remove temporary sig indicator nausea.table <- subset(nausea.table, select = -sig.temp) #nausea.table library(xtable) xtab.out <- xtable(nausea.table, digits=4) print(xtab.out, floating=FALSE, math.style.negative=TRUE)