##########################################################
################ Logistic Regression#####################
##########################################################
#################Example Admission Data######################

install.packages("aod")
library(aod)

ex.data <- read.csv("C:/jenn/teaching/stat579/data/binary.csv")
nrow(ex.data)
#This dataset has a binary response (outcome, dependent) variable called admit, 1 admit, 0 no admission. 
#There are three predictor variables: gre, gpa and rank. 
#variables gre and gpa are continuous. The variable rank takes on the values 1 through 4. Institutions 
#with a rank of 1 have the highest prestige, 
#while those with a rank of 4 have the lowest. 

## view the first few rows of the data
head(ex.data)


#Descriptive Statistics
summary(ex.data)
sapply(ex.data, sd) #use sapply to apply the sd function to each variable in the dataset.
tapply(ex.data$gpa,ex.data$rank,mean)
tapply(ex.data$gre,ex.data$rank,mean)


## two-way contingency table of categorical outcome and predictors

xtabs(~admit + rank, data = ex.data)


####Logistic regression modeling
ex.data$rank <- factor(ex.data$rank) #convert rank to a factor to indicate that rank should be treated as a 
#categorical variable.

###fit data with all variables
myfit <- glm(admit ~ gre + gpa + rank, data = ex.data, family = "binomial")
summary(myfit)
anova(myfit)

confint(myfit) ## CIs using profiled log-likelihood
confint.default(myfit)   ## CIs using standard errors
myfit$deviance   #G^2
logLik(myfit) #log likelihood ratio
# -2*(-229.2587) =458.52
##GOF test

pearsonchi<-sum(residuals(myfit, type = "pearson")^2)
1 - pchisq(deviance(myfit), df.residual(myfit))
#The p-value is large indicating no evidence of lack of fit.

###prediction
myfit$fit     #fitted probabilities
plot(myfit$fit ~ gre, data=ex.data)
plot(myfit$fit ~ gpa, data=ex.data)
ggplot(ex.data,aes(x=gpa,y=admit))+geom_point()+geom_smooth(method="glm",se=FALSE,method.args = list(family="binomial"))+xlab("GPA") + ylab("Probability of Admission") 
#default se=TRUE will give confidence bands



mgre<-tapply(ex.data$gre, ex.data$rank, mean) # mean of gre by rank
mgpa<-tapply(ex.data$gpa, ex.data$rank, mean) # mean of gpa by rank
newdata1 <- with(ex.data, data.frame(gre = mgre, gpa = mgpa, rank = factor(1:4)))
newdata1
newdata1$rankP <- predict(myfit, newdata = newdata1, type = "response")
newdata1
fitted1<-predict(myfit)
fitted2<-predict(myfit,type="response")
exp(fitted1)/(1+exp(fitted1))

## odds ratios
exp(coef(myfit))

## odds ratios and 95% CI
exp(cbind(OR = coef(myfit), confint(myfit)))


#test for an overall effect of rank using the wald.test function 
wald.test(b = coef(myfit), Sigma = vcov(myfit), Terms = 4:6)

#test that the coefficient for rank=2 is equal to the coefficient for rank=3
l <- cbind(0, 0, 0, 1, -1, 0)
wald.test(b = coef(myfit), Sigma = vcov(myfit), L = l)

##comparing models

myfit0<-glm(admit ~ 1, data = ex.data, family = "binomial")
summary(myfit0)
myfit2<-glm(admit ~ gre + gpa, data = ex.data, family = "binomial")
summary(myfit2)
myfit3<-glm(admit ~ gpa+rank, data = ex.data, family = "binomial")
summary(myfit3)


anova(myfit0,myfit,test="Chisq")
qchisq(0.95,5)
pchisq(41.459,5,lower.tail = FALSE)

anova(myfit, myfit2)
qchisq(0.95,3)
pchisq(21.826,3,lower.tail = FALSE)

anova(myfit3,myfit)
qchisq(0.95,1)
pchisq(4.3578,1,lower.tail = FALSE)


#model selection
upper<-formula(~gre+gpa+rank,data=ex.data)
model.aic = step(myfit0, scope=list(lower= ~., upper= upper))
                                   

##diagnostics
residuals(myfit) # deviance residuals
residuals(myfit, "pearson") # pearson residuals

smyfit<-summary(myfit)
infv <- c(ex.data$admit,myfit$fit,hatvalues(myfit),rstandard(myfit),cooks.distance(myfit))
inf<-matrix(infv,I(smyfit$df[1]+smyfit$df[2]),5,dimnames = list(NULL,
c("y", "yhat", "lev","r","C")))
inf

#leverages
plot(hatvalues(myfit))
highleverage <- which(hatvalues(myfit) > .045)
highleverage
hatvalues(myfit)[highleverage]
ex.data[373,]


#cooks distance
plot(cooks.distance(myfit))
highcook <- which((cooks.distance(myfit)) > .05)
cooks.distance(myfit)[highcook]