## **************************************************
## Prediction using simple linear regression
## **************************************************

## --------------------------------------------------
## Read and examine the data

## 1.
ll <- read.csv('https://www.sfu.ca/~lmgonigl/materials-qm/data/lions.csv',
               stringsAsFactors=TRUE,
               strip.white=TRUE)

## 2.
head(ll)

## 3.
plot(x=ll$black, y=ll$age, xlab='Amount of black in nose', ylab='Age',
     pch=16, col='red', las=1)

## --------------------------------------------------
## Fit a linear model

## 1.
out <- lm(age~black, data=ll)
## also create a summary object (useful for quick extraction of
## parameter estimates)
summ <- summary(out)
summ

## 2.
##
## extract coefficients first
int <- coef(summ)['(Intercept)','Estimate']
bla <- coef(summ)['black',      'Estimate']
abline(a=int, b=bla, col='blue')

## note that this can be done by simply plotting the out object with
## abline:
abline(out, lwd=2, lty=2)

## The variance of the residuals for black in the nose tends to rise
## with increasing age but the trend is not severe. The residuals
## might not be quite normal, but they are not badly skewed so we are
## probably OK.

## 3.
summary(out)
## R-squared = 0.6238

## 4.
confint(out, '(Intercept)', level=0.95)
confint(out, 'black', level=0.95)

## 5.
drop1(out, test='F')

## 6.
plot(out)

## It is easier to see with these plots how the variance of the
## residuals tends to increase at higher fitted values.  A
## transformation of one or both of the variables is usually the first
## course of action.  R also has a toolkit for robust regression,
## which as the name suggests is more robust to violations of standard
## assumptions.

## --------------------------------------------------
## Prediction

## 1.
plot(x=ll$black, y=ll$age, xlab='Amount of black in nose', ylab='Age',
     pch=16, col='red', las=1)
abline(out, lwd=2, col='blue')

## 2.
new.vals <- data.frame(black=seq(min(ll$black),max(ll$black),
                                 length=20))
conf.arr <- predict(out,
                    newdata=new.vals,
                    interval='confidence',
                    level=0.95)
lines(x=new.vals[,'black'], y=conf.arr[,'lwr'], lwd=2, lty=2, col='blue')
lines(x=new.vals[,'black'], y=conf.arr[,'upr'], lwd=2, lty=2, col='blue')

## 3.
conf.arr <- predict(out,
                    newdata=new.vals,
                    interval='predict',
                    level=0.95)
lines(x=new.vals[,'black'], y=conf.arr[,'lwr'], lwd=2, lty=3, col='blue')
lines(x=new.vals[,'black'], y=conf.arr[,'upr'], lwd=2, lty=3, col='blue')

## **************************************************
## Effects of light treatment on circadian rhythms
## **************************************************

## --------------------------------------------------
## Read and examine the data

## 1.
kk <- read.csv('https://www.sfu.ca/~lmgonigl/materials-qm/data/knees.csv',
               stringsAsFactors=TRUE,
               strip.white=TRUE)

## 2.
head(kk)

## 3.
class(kk[,'treatment'])

## 4.
levels(kk[,'treatment'])
## having control first is useful, because the 'control' will then be
## used as the intercept, and all comparisons will be made relative to
## that

## 5.
levels(kk[,'treatment']) <- levels(kk[,'treatment'])[c(1,3,2)]

## 6.
plot(x=as.integer(kk[,'treatment']), y=kk[,'shift'],
     ylim=range(kk[,'shift']),
     xlim=c(0.75,3.25), pch=16, las=1, col='red', xaxt='n',
     xlab='', ylab='Shift')
axis(1, at=1:3, labels=levels(kk[,'treatment']))

## --------------------------------------------------
## Fit a linear model

## 1.
out <- lm(shift~treatment, data=kk)
## also create a summary object
summ <- summary(out)

## 2.
##
## Making the plot manually (a good learning exercise)
plot(x=jitter(as.integer(kk[,'treatment'])), y=kk[,'shift'],
     ylim=range(kk[,'shift']),
     xlim=c(0.75,3.25), pch=16, las=1, col='red', xaxt='n',
     xlab='', ylab='Shift')
axis(1, at=1:3, labels=levels(kk[,'treatment']))
## add lines
contr <- coef(summ)['(Intercept)',  'Estimate']
trt.k <- contr + coef(summ)['treatmentknee','Estimate']
trt.e <- contr + coef(summ)['treatmenteyes','Estimate']
lines(x=c(0.85,1.15), y=c(contr,contr), col='blue', lwd=2)
lines(x=c(1.85,2.15), y=c(trt.k,trt.k), col='blue', lwd=2)
lines(x=c(2.85,3.15), y=c(trt.e,trt.e), col='blue', lwd=2)

## now that you've had some practice plotting manually, try the
## 'visreg' function, which does all the hard work for you!
library(visreg)
visreg(out, xvar='treatment')

## 3.
plot(out)

## 4.
model.matrix(out)

## One of the columns must be dropped because the information in the 4
## columns is redundant in a particular way (a combination of three of
## the columns exactly equals the fourth).  By default, R drops the
## column corresponding to the first level of the categorical
## variables which, in this case, is the control.

## 5.
summ

## The mean of the first group and the differences between the second
## and third groups from the first.

## 6.
##
library(emmeans)
emmeans(out, spec='treatment')

## means are the same, because 'treatment' is the only predictor in
## the model
mean(kk[,'shift'][kk[,'treatment']=='control'])
mean(kk[,'shift'][kk[,'treatment']=='knee'])
mean(kk[,'shift'][kk[,'treatment']=='eyes'])

## standard errors are not the same
##
## control
ind <- kk[,'treatment']=='control'
sd(kk[,'shift'][ind])/sqrt(sum(ind))
## knee
ind <- kk[,'treatment']=='knee'
sd(kk[,'shift'][ind])/sqrt(sum(ind))
## eyes
ind <- kk[,'treatment']=='eyes'
sd(kk[,'shift'][ind])/sqrt(sum(ind))
##
## these are not the same as those from emmeans, because emmeans is
## using the entire model object to calculate these.

## 7.
drop1(out, test='F')

## **************************************************
## Fly sex and longevity revisited
## **************************************************

## --------------------------------------------------
## Read and examine the data

## 1.
ff <- read.csv('https://www.sfu.ca/~lmgonigl/materials-qm/data/fruitflies.csv',
               stringsAsFactors=TRUE,
               strip.white=TRUE)

## 2.
head(ff)

## 3.
class(ff[,'treatment'])

## 4.
levels(ff[,'treatment'])

## 5.
ff[,'treatment'] <- factor(ff[,'treatment'],
                           levels(ff[,'treatment'])[c(5,1:4)])

## --------------------------------------------------
## Fit a linear model

## 1.
out <- lm(longevity.days~thorax.mm+treatment, data=ff)
summ <- summary(out)
summ

## 2.
plot(out)

## 3.
out <- lm(log(longevity.days)~thorax.mm+treatment, data=ff)
summ <- summary(out)
summ

plot(out)

## 4.
library(visreg)
visreg(out, xvar='thorax.mm', by='treatment')
visreg(out, xvar='treatment', by='thorax.mm')

## 5.
summ
## '8 virgin females' differs most from the control group

## 6.
library(emmeans)
emmeans(out, spec='thorax.mm')
emmeans(out, spec='treatment')

## 7.
drop1(out, test='F')

## 8.
out.with.int <- lm(log(longevity.days)~thorax.mm*treatment, data=ff)
summ.with.int <- summary(out.with.int)
summ.with.int

plot(out.with.int)

## 9.
drop1(out.with.int, test='F')
## there is no support for an interaction

## 10.
##
## adjusted R^2 from model without interaction
summ$adj.r.squared
## and from model with interaction
summ.with.int$adj.r.squared
## not that different...