## **************************************************
## Caribbean bird immigration
## **************************************************
rm(list=ls())

## Read and examine the data
aa <- read.csv('https://www.sfu.ca/~lmgonigl/materials-qm/data/antilles.csv',
               stringsAsFactors=TRUE,
               strip.white=TRUE)
head(aa)

## 1.
hist(aa$immigration.date, las=1, xlab='Immigration Date',
     col='red')

## 2.
mean(aa$immigration.date)
median(aa$immigration.date)

## 3.
boot.1 <- sample(aa$immigration.date, replace=TRUE)
hist(boot.1, las=1,
     xlab='Boot-strap replicate of Immigration Date', col='red')

## 4.
boot.strap.median <- function()
  median(sample(aa$immigration.date, replace=TRUE))
## call function once to test it
boot.strap.median()
## now do it 10^4 times (note, the 'simplify' argument in replicate
## just tells replicate to return a vector instead of a list)
boot.vals <- replicate(1e4, boot.strap.median(), simplify=TRUE)

## 5.
hist(boot.vals, las=1,
     xlab='Median', col='red', breaks=10)

## 6.
sd(boot.vals)

## 7.
mean(boot.vals)
median(aa$immigration.date)
## The mean of the sampling distribution is larger than the true value
## of the median.  Thus, our boot-strap tells us that our boot-strap
## statistic is biased and tends to over-estimate the true median.

## 8.
quantile(boot.vals, probs=c(0.025, 0.975))

## 9. using boot package
library(boot)

calc.stat <- function(x,i)
  median(x$immigration.date[i])

## The above function is a bit odd looking - it is constructed to
## satisfy the needs of the "boot" function that we will use
## below. From the help for boot, we can find that the argument for
## the "statistic" must be a function whose first argument is the data
## and whose second argument is a vector of indices.
##
## To clarify, we could create a boot-strap sample from our data by
## first sampling random row indices from our data-set, with
## replacement:
boot.indices <- sample(1:nrow(aa),replace=TRUE)
## and next extracting the immigration.date entries for those random
## rows:
boot.immigration.dates <- aa$immigration.date[boot.indices]
## and then calculating the median of those values
median(boot.immigration.dates)
## this is the thing our function would do if we called it, passing
## our random indices in as the second argument:
calc.stat(aa,boot.indices)

z <- boot(data=aa,
          statistic=calc.stat,
          R=1e4)
boot.ci(z, type = 'bca')
boot.ci(z, type = 'perc')

## **************************************************
## Trillium fragmentation
## **************************************************
rm(list=ls())

## Read and examine the data
tt <- read.csv('https://www.sfu.ca/~lmgonigl/materials-qm/data/trillium.csv',
               stringsAsFactors=TRUE,
               strip.white=TRUE)
head(tt)

## 1.
plot(x=tt$fragsize.ha, y=tt$recruitment, las=1,
     xlab='Fragment size', ylab='Recruitment', pch=16)

## 2.
plot(x=log(tt$fragsize.ha), y=log(tt$recruitment), las=1,
     xlab='Fragment size', ylab='Recruitment', pch=16)
## log looks better

tt$log.fragsize.ha <- log(tt$fragsize.ha)
tt$log.recruitment <- log(tt$recruitment)

## 3.
out.actual <- lm(log.recruitment ~ log.fragsize.ha, data=tt)
summary(out.actual)
slope.actual <- coef(out.actual)['log.fragsize.ha']

## residual plot
plot(x=fitted(out.actual), y=resid(out.actual),
     xlab='Fitted values', ylab='Residuals', las=1, pch=16)
abline(h=0, col='red')

## qq-plot
qqnorm(resid(out.actual), pch=16)
abline(a=0, b=1, col='red')

## 4.

## first lets write a function to fit the model to premuted data and
## extract the slope

permute.and.get.slope <- function() {
  ## let's permute just the response (permuting just the predictor, or
  ## both would be equivalent)
  rr <- sample(tt$log.recruitment, replace=FALSE)
  pp <- tt$log.fragsize.ha
  out <- lm(rr ~ pp)
  coef(out)['pp']
}

## test function once
permute.and.get.slope()

## now calculate 10^4 slopes
slopes.perm <-
  replicate(1e4, permute.and.get.slope(), simplify=TRUE)

## 5.
hist(slopes.perm, xlab='Permuted slopes', las=1)
abline(v=slope.actual, col='red', lwd=2)

## 6.
frac.greater <- sum(slopes.perm>slope.actual) /
  length(slopes.perm)

## 7.
## Because we didn't have an a priori expectation about the direction
## of the effect (although, you might argue this), this is a
## two-tailed test.  Thus, our p-value is:
2*frac.greater

## **************************************************
## Vampire attack
## **************************************************
rm(list=ls())

## 1. Create new data structure
est <- rep(c('y','n'), c(15,7))
anest <- rep(c('y','n'), c(6,322))
cc <- data.frame(status=rep(c('est', 'anest'),
                            c(length(est), length(anest))),
                 bitten=c(est, anest))

## 2.
table(cc$bitten, cc$status)

## 3.
out.chi <- chisq.test(cc$status, cc$bitten)
out.chi

## to get observed and expected counts, you can use the following:
out.chi$observed
out.chi$expected

## The chi-square2 approximation to the null distribution may not be
## accurate when too many of the expected frequencies are less than 5.

## 4.
chisq.test(cc$status, cc$bitten, simulate.p.value=TRUE, B=10^4)

## 5.

## write function to calculate OR:
calc.OR <- function(x1, x2) {
  a <- sum(x1=='y')
  b <- sum(x1=='n')
  c <- sum(x2=='y')
  d <- sum(x2=='n')
  (a/c)/(b/d)
}
## apply it to our actual data
OR.act <- calc.OR(x1=est, x2=anest)
OR.act

## first write a function to create a permuted data-set and 
permute.and.calc.OR <- function() {
  est.perm <- sample(est, replace=TRUE)
  anest.perm <- sample(anest, replace=TRUE)
  calc.OR(x1=est.perm, x2=anest.perm)
}
## test by running once
permute.and.calc.OR()
## do it 10^4 times
OR.perm <- replicate(1e4, permute.and.calc.OR())
OR.perm
## note the inf values
range(OR.perm)

## let's re-phrase by adding 0.5

## write function to calculate OR:
calc.OR <- function(x1, x2) {
  a <- sum(x1=='y')+0.5
  b <- sum(x1=='n')+0.5
  c <- sum(x2=='y')+0.5
  d <- sum(x2=='n')+0.5
  (a/c)/(b/d)
}
## apply mew PR.act to our actual data
OR.act <- calc.OR(x1=est, x2=anest)
## calculate OR for 10^4 permuted data-sets
OR.perm <- replicate(1e4, permute.and.calc.OR())
## note the inf problem is gone
range(OR.perm)

## 6.
hist(OR.perm, xlab='Permuted slopes', las=1)
abline(v=OR.act, col='red', lwd=2)

## calulate se (sd of sampling distribution)
sd(OR.perm)

## 7.
quantile(OR.perm, probs=c(0.025, 0.975))

## below is an alternate version of steps 6 and 7 which doesn't add
## 0.5, but instead drops non-finite values of OR

## function to calculate OR:
calc.OR <- function(x1, x2) {
  a <- sum(x1=='y')
  b <- sum(x1=='n')
  c <- sum(x2=='y')
  d <- sum(x2=='n')
  (a/c)/(b/d)
}
## apply it to our actual data
OR.act <- calc.OR(x1=est, x2=anest)
## calculate OR for 10^4 permuted data-sets
OR.perm <- replicate(1e4, permute.and.calc.OR())
## drop non-finite values
OR.perm <- OR.perm[is.finite(OR.perm)]
## note no inf values
range(OR.perm)

## calulate se
sd(OR.perm)
## and 95% CI
quantile(OR.perm, probs=c(0.025, 0.975))