## R code for the Poisson regression handout covering the shark attack ## data # download data from Simonoff's home page, read into R download.file(url="http://pages.stern.nyu.edu/~jsimonof/AnalCatData/Data/Comma_separated/floridashark.csv",destfile="shark.csv") df <- read.csv("shark.csv") # The following line applies a UNIX command to uncapitalize variable names # (I'm lazy about typing Capitals) names(df) <- unix("tr A-Z a-z", input = names(df)) # Attach the dataframe so variables can be easily accessed attach(df) # Plot rates versus year, + non-parametric fit and log-linear fit rate <- 1000000*attacks/population plot(year,rate ,xlab="Year",ylab="Rate per Million Pop.", main="Shark Attacks") smu <- supsmu(year,rate) lines(smu,lty=3) llfit <- glm( attacks ~ offset(log(population)) + year, family=poisson) predRate <- 1000000*fitted(llfit)/population # Note could also use predict(llfit,type="response") instead of fitted(llfit) lines(year,predRate,lty=2) summary(llfit) # Define a function for printing coefficients information + Wald type # confidence intervals. Also allows for transforming estimates and # intervals and profile intervals) coefTable <- function(mfit, profile=FALSE, transf = function(x) x ) { if (profile) estCi <- transf(cbind(coef(summary(mfit))[,1,drop=FALSE],confint(mfit))) else estCi <- transf(cbind(coef(summary(mfit))[,1,drop=FALSE],confint.default(mfit))) ciCols <- order(estCi[1,2:3]) + 1 ans <- cbind(estCi[,1,drop=FALSE],coef(summary(mfit))[,c(2,4)],estCi[,ciCols]) dimnames(ans)[[2]][ciCols+2] <- dimnames(ans)[[2]][4:5] ans } cat("Estimates with confidence intervals",fill=TRUE) coefTable(llfit) cat("Back-transformed estimates with confidence intervals",fill=TRUE) coefTable(llfit,transf=exp) cat("Back-transformed estimates with profile Confidence Intervals",fill=TRUE) coefTable(llfit,transf=exp,profile=TRUE) # Plot regression diagnostics par(mfrow=c(2,2)) plot.lm(llfit) # Fit 3rd degree polynomial regression, using poly() to generate orthogonal # polynomials llfitPoly <- glm( attacks ~ offset(log(population)) + poly(year,3), family=poisson) summary(llfitPoly) # Fit 3rd degree polynomial using naive method - seemed to do OK here summary(glm( attacks ~ offset(log(population)) + year + I(year^2) + I(year^3), family=poisson)) cat("Estimates from polynomial model with confidence intervals",fill=TRUE) coefTable(llfitPoly) # Plot regression diagnostics for polynomial model par(mfrow=c(2,2)) plot(llfitPoly) # Plot polynomial fit, with added non-parametric fit. plot(year,rate ,xlab="Year",ylab="Rate per Million Pop.", main="Shark Attacks") smu <- supsmu(year,rate) lines(smu,lty=1) predRate <- 1000000*fitted(llfitPoly)/population lines(year,predRate,lty=2) dev.off() # Segmented regression line upTo1985 <- (year <= 1985) * year after1985 <- (year > 1985) * year llfitSeg <- glm( attacks ~ offset(log(population)) + (year > 1985) + upTo1985 + after1985, family=poisson) # Plot segmented regression line plot(year,rate ,xlab="Year",ylab="Rate per Million Pop.", main="Shark Attacks") lines(year,predRate,lty=1) predRate <- 1000000*predict(llfitSeg,type="response")/population lines(year,predRate,lty=2) # Regression summaries summary(llfitSeg) # Coefficients with confidence intervals options(digits=4) coefTable(llfitSeg) # Fit model with both segmented and polynomial terms llfitSegPoly <- glm( attacks ~ offset(log(population)) + (year > 1985) + upTo1985 + after1985 + poly(year,3), family=poisson) summary(llfitSegPoly) # Perform Likelihood Ratio Tests to test utility of augmented model # relative to simpler models anova(llfitSeg, llfitSegPoly,test="Chi") anova(llfitPoly, llfitSegPoly,test="Chi")