## ************************************************** ## Caribbean bird immigration ## ************************************************** rm(list=ls()) ## Read and examine the data aa <- read.csv('https://www.sfu.ca/~lmgonigl/materials-qm/data/antilles.csv', stringsAsFactors=TRUE, strip.white=TRUE) head(aa) ## 1. hist(aa$immigration.date, las=1, xlab='Immigration Date', col='red') ## 2. mean(aa$immigration.date) median(aa$immigration.date) ## 3. boot.1 <- sample(aa$immigration.date, replace=TRUE) hist(boot.1, las=1, xlab='Boot-strap replicate of Immigration Date', col='red') ## 4. boot.strap.median <- function() median(sample(aa$immigration.date, replace=TRUE)) ## call function once to test it boot.strap.median() ## now do it 10^4 times (note, the 'simplify' argument in replicate ## just tells replicate to return a vector instead of a list) boot.vals <- replicate(1e4, boot.strap.median(), simplify=TRUE) ## 5. hist(boot.vals, las=1, xlab='Median', col='red', breaks=10) ## 6. sd(boot.vals) ## 7. mean(boot.vals) median(aa$immigration.date) ## The mean of the sampling distribution is larger than the true value ## of the median. Thus, our boot-strap tells us that our boot-strap ## statistic is biased and tends to over-estimate the true median. ## 8. quantile(boot.vals, probs=c(0.025, 0.975)) ## 9. using boot package library(boot) calc.stat <- function(x,i) median(x$immigration.date[i]) ## The above function is a bit odd looking - it is constructed to ## satisfy the needs of the "boot" function that we will use ## below. From the help for boot, we can find that the argument for ## the "statistic" must be a function whose first argument is the data ## and whose second argument is a vector of indices. ## ## To clarify, we could create a boot-strap sample from our data by ## first sampling random row indices from our data-set, with ## replacement: boot.indices <- sample(1:nrow(aa),replace=TRUE) ## and next extracting the immigration.date entries for those random ## rows: boot.immigration.dates <- aa$immigration.date[boot.indices] ## and then calculating the median of those values median(boot.immigration.dates) ## this is the thing our function would do if we called it, passing ## our random indices in as the second argument: calc.stat(aa,boot.indices) z <- boot(data=aa, statistic=calc.stat, R=1e4) boot.ci(z, type = 'bca') boot.ci(z, type = 'perc') ## ************************************************** ## Trillium fragmentation ## ************************************************** rm(list=ls()) ## Read and examine the data tt <- read.csv('https://www.sfu.ca/~lmgonigl/materials-qm/data/trillium.csv', stringsAsFactors=TRUE, strip.white=TRUE) head(tt) ## 1. plot(x=tt$fragsize.ha, y=tt$recruitment, las=1, xlab='Fragment size', ylab='Recruitment', pch=16) ## 2. plot(x=log(tt$fragsize.ha), y=log(tt$recruitment), las=1, xlab='Fragment size', ylab='Recruitment', pch=16) ## log looks better tt$log.fragsize.ha <- log(tt$fragsize.ha) tt$log.recruitment <- log(tt$recruitment) ## 3. out.actual <- lm(log.recruitment ~ log.fragsize.ha, data=tt) summary(out.actual) slope.actual <- coef(out.actual)['log.fragsize.ha'] ## residual plot plot(x=fitted(out.actual), y=resid(out.actual), xlab='Fitted values', ylab='Residuals', las=1, pch=16) abline(h=0, col='red') ## qq-plot qqnorm(resid(out.actual), pch=16) abline(a=0, b=1, col='red') ## 4. ## first lets write a function to fit the model to premuted data and ## extract the slope permute.and.get.slope <- function() { ## let's permute just the response (permuting just the predictor, or ## both would be equivalent) rr <- sample(tt$log.recruitment, replace=FALSE) pp <- tt$log.fragsize.ha out <- lm(rr ~ pp) coef(out)['pp'] } ## test function once permute.and.get.slope() ## now calculate 10^4 slopes slopes.perm <- replicate(1e4, permute.and.get.slope(), simplify=TRUE) ## 5. hist(slopes.perm, xlab='Permuted slopes', las=1) abline(v=slope.actual, col='red', lwd=2) ## 6. frac.greater <- sum(slopes.perm>slope.actual) / length(slopes.perm) ## 7. ## Because we didn't have an a priori expectation about the direction ## of the effect (although, you might argue this), this is a ## two-tailed test. Thus, our p-value is: 2*frac.greater ## ************************************************** ## Vampire attack ## ************************************************** rm(list=ls()) ## 1. Create new data structure est <- rep(c('y','n'), c(15,7)) anest <- rep(c('y','n'), c(6,322)) cc <- data.frame(status=rep(c('est', 'anest'), c(length(est), length(anest))), bitten=c(est, anest)) ## 2. table(cc$bitten, cc$status) ## 3. out.chi <- chisq.test(cc$status, cc$bitten) out.chi ## to get observed and expected counts, you can use the following: out.chi$observed out.chi$expected ## The chi-square2 approximation to the null distribution may not be ## accurate when too many of the expected frequencies are less than 5. ## 4. chisq.test(cc$status, cc$bitten, simulate.p.value=TRUE, B=10^4) ## 5. ## write function to calculate OR: calc.OR <- function(x1, x2) { a <- sum(x1=='y') b <- sum(x1=='n') c <- sum(x2=='y') d <- sum(x2=='n') (a/c)/(b/d) } ## apply it to our actual data OR.act <- calc.OR(x1=est, x2=anest) OR.act ## first write a function to create a permuted data-set and permute.and.calc.OR <- function() { est.perm <- sample(est, replace=TRUE) anest.perm <- sample(anest, replace=TRUE) calc.OR(x1=est.perm, x2=anest.perm) } ## test by running once permute.and.calc.OR() ## do it 10^4 times OR.perm <- replicate(1e4, permute.and.calc.OR()) OR.perm ## note the inf values range(OR.perm) ## let's re-phrase by adding 0.5 ## write function to calculate OR: calc.OR <- function(x1, x2) { a <- sum(x1=='y')+0.5 b <- sum(x1=='n')+0.5 c <- sum(x2=='y')+0.5 d <- sum(x2=='n')+0.5 (a/c)/(b/d) } ## apply mew PR.act to our actual data OR.act <- calc.OR(x1=est, x2=anest) ## calculate OR for 10^4 permuted data-sets OR.perm <- replicate(1e4, permute.and.calc.OR()) ## note the inf problem is gone range(OR.perm) ## 6. hist(OR.perm, xlab='Permuted slopes', las=1) abline(v=OR.act, col='red', lwd=2) ## calulate se (sd of sampling distribution) sd(OR.perm) ## 7. quantile(OR.perm, probs=c(0.025, 0.975)) ## below is an alternate version of steps 6 and 7 which doesn't add ## 0.5, but instead drops non-finite values of OR ## function to calculate OR: calc.OR <- function(x1, x2) { a <- sum(x1=='y') b <- sum(x1=='n') c <- sum(x2=='y') d <- sum(x2=='n') (a/c)/(b/d) } ## apply it to our actual data OR.act <- calc.OR(x1=est, x2=anest) ## calculate OR for 10^4 permuted data-sets OR.perm <- replicate(1e4, permute.and.calc.OR()) ## drop non-finite values OR.perm <- OR.perm[is.finite(OR.perm)] ## note no inf values range(OR.perm) ## calulate se sd(OR.perm) ## and 95% CI quantile(OR.perm, probs=c(0.025, 0.975))