# Discourse on lm() and glm() models in CRD/SRS design
# 2021-09-27 CJS emmeans::CLD to multcomp::cld
# 2015-02-01 CJS Updates
# 2014-05-19 CJS First Editions
options(useFancyQuotes=FALSE) # renders summary output corrects
source("schwarz.functions.r")
#source('http://www.stat.sfu.ca/~cschwarz/Stat-650/Notes/MyPrograms/schwarz.functions.r')
library(car)
## Loading required package: carData
library(ggfortify)
## Loading required package: ggplot2
library(ggplot2)
library(gridExtra)
library(emmeans)
library(multcomp)
## Loading required package: mvtnorm
## Loading required package: survival
## Loading required package: TH.data
## Loading required package: MASS
##
## Attaching package: 'TH.data'
## The following object is masked from 'package:MASS':
##
## geyser
library(plyr)
library(readxl)
cereal <- read.csv(file.path('..',"sampledata",'cereal.csv'),
header=TRUE, as.is=TRUE,
strip.white=TRUE)
cereal[1:5,]
## name mfr type calories protein fat sodium fiber carbo
## 1 100%_Bran N C 60 4 1 130 10 5
## 2 100%_Natural_Bran Q C 110 3 5 15 2 8
## 3 All-Bran K C 80 4 1 260 9 7
## 4 All-Bran_with_Extra_Fiber K C 50 4 0 140 14 8
## 5 Almond_Delight R C 110 2 2 200 1 14
## sugars shelf potass vitamins weight cups
## 1 6 3 280 25 1 0.331
## 2 8 3 135 0 1 NA
## 3 5 3 320 25 1 0.330
## 4 0 3 330 25 1 0.500
## 5 8 3 NA 25 1 0.750
# start a plot. ggplot ONLY uses data frames.
newplot <- ggplot(cereal, aes(x=fat, y=calories))+
ggtitle("Calories vs Grams of Fat")+
xlab("Fat/serving (g)")+ylab("Calories/serving")+
geom_point()
newplot
# Fit the regression line
cereal.fit <- lm( calories ~ fat, data=cereal)
str(cereal.fit)
## List of 12
## $ coefficients : Named num [1:2] 95.13 9.81
## ..- attr(*, "names")= chr [1:2] "(Intercept)" "fat"
## $ residuals : Named num [1:77] -44.94 -34.16 -24.94 -45.13 -4.74 ...
## ..- attr(*, "names")= chr [1:77] "1" "2" "3" "4" ...
## $ effects : Named num [1:77] -921.94 -86.04 -20.37 -42.88 2.14 ...
## ..- attr(*, "names")= chr [1:77] "(Intercept)" "fat" "" "" ...
## $ rank : int 2
## $ fitted.values: Named num [1:77] 104.9 144.2 104.9 95.1 114.7 ...
## ..- attr(*, "names")= chr [1:77] "1" "2" "3" "4" ...
## $ assign : int [1:2] 0 1
## $ qr :List of 5
## ..$ qr : num [1:77, 1:2] -8.775 0.114 0.114 0.114 0.114 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:77] "1" "2" "3" "4" ...
## .. .. ..$ : chr [1:2] "(Intercept)" "fat"
## .. ..- attr(*, "assign")= int [1:2] 0 1
## ..$ qraux: num [1:2] 1.11 1.45
## ..$ pivot: int [1:2] 1 2
## ..$ tol : num 1e-07
## ..$ rank : int 2
## ..- attr(*, "class")= chr "qr"
## $ df.residual : int 75
## $ xlevels : Named list()
## $ call : language lm(formula = calories ~ fat, data = cereal)
## $ terms :Classes 'terms', 'formula' language calories ~ fat
## .. ..- attr(*, "variables")= language list(calories, fat)
## .. ..- attr(*, "factors")= int [1:2, 1] 0 1
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:2] "calories" "fat"
## .. .. .. ..$ : chr "fat"
## .. ..- attr(*, "term.labels")= chr "fat"
## .. ..- attr(*, "order")= int 1
## .. ..- attr(*, "intercept")= int 1
## .. ..- attr(*, "response")= int 1
## .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. ..- attr(*, "predvars")= language list(calories, fat)
## .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
## .. .. ..- attr(*, "names")= chr [1:2] "calories" "fat"
## $ model :'data.frame': 77 obs. of 2 variables:
## ..$ calories: int [1:77] 60 110 80 50 110 110 110 140 90 90 ...
## ..$ fat : int [1:77] 1 5 1 0 2 2 0 2 1 0 ...
## ..- attr(*, "terms")=Classes 'terms', 'formula' language calories ~ fat
## .. .. ..- attr(*, "variables")= language list(calories, fat)
## .. .. ..- attr(*, "factors")= int [1:2, 1] 0 1
## .. .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. .. ..$ : chr [1:2] "calories" "fat"
## .. .. .. .. ..$ : chr "fat"
## .. .. ..- attr(*, "term.labels")= chr "fat"
## .. .. ..- attr(*, "order")= int 1
## .. .. ..- attr(*, "intercept")= int 1
## .. .. ..- attr(*, "response")= int 1
## .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. .. ..- attr(*, "predvars")= language list(calories, fat)
## .. .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
## .. .. .. ..- attr(*, "names")= chr [1:2] "calories" "fat"
## - attr(*, "class")= chr "lm"
names(cereal.fit)
## [1] "coefficients" "residuals" "effects" "rank"
## [5] "fitted.values" "assign" "qr" "df.residual"
## [9] "xlevels" "call" "terms" "model"
# The standard methods to extract coefficients etc
summary(cereal.fit)
##
## Call:
## lm(formula = calories ~ fat, data = cereal)
##
## Residuals:
## Min 1Q Median 3Q Max
## -55.132 -5.132 4.868 14.868 45.256
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 95.132 3.141 30.285 < 2e-16 ***
## fat 9.806 2.207 4.443 3.01e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.36 on 75 degrees of freedom
## Multiple R-squared: 0.2084, Adjusted R-squared: 0.1978
## F-statistic: 19.74 on 1 and 75 DF, p-value: 3.009e-05
anova(cereal.fit) # Caution Type I only
## Analysis of Variance Table
##
## Response: calories
## Df Sum Sq Mean Sq F value Pr(>F)
## fat 1 7402.9 7402.9 19.743 3.009e-05 ***
## Residuals 75 28121.8 375.0
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
coef(cereal.fit)
## (Intercept) fat
## 95.131579 9.806005
sqrt(diag(vcov(cereal.fit))) # SE of coefficients
## (Intercept) fat
## 3.141224 2.206897
confint(cereal.fit)
## 2.5 % 97.5 %
## (Intercept) 88.873939 101.38922
## fat 5.409642 14.20237
summary(cereal.fit)$r.squared
## [1] 0.2083875
summary(cereal.fit)$sigma
## [1] 19.36381
methods(class=class(cereal.fit)) # shows methods available
## [1] add1 addterm alias
## [4] anova Anova attrassign
## [7] autoplot avPlot Boot
## [10] bootCase boxcox boxCox
## [13] brief case.names ceresPlot
## [16] coerce concordance confidenceEllipse
## [19] confint Confint cooks.distance
## [22] crPlot deltaMethod deviance
## [25] dfbeta dfbetaPlots dfbetas
## [28] dfbetasPlots drop1 dropterm
## [31] dummy.coef durbinWatsonTest effects
## [34] emm_basis extractAIC family
## [37] formula fortify hatvalues
## [40] hccm infIndexPlot influence
## [43] influencePlot initialize inverseResponsePlot
## [46] kappa labels leveneTest
## [49] leveragePlot linearHypothesis logLik
## [52] logtrans mcPlot mmp
## [55] model.frame model.matrix ncvTest
## [58] nextBoot nobs outlierTest
## [61] plot powerTransform predict
## [64] Predict print proj
## [67] qqPlot qr recover_data
## [70] residualPlot residualPlots residuals
## [73] rstandard rstudent S
## [76] show sigmaHat simulate
## [79] slotsFromS3 spreadLevelPlot summary
## [82] symbox variable.names vcov
## see '?methods' for accessing help and source code
# CAUTION - lm() gives Type I (incremental) tests - you want
# Type III (marginal) tests
# Compare the following cereal.fits
cereal.fit <- lm(calories ~ fat, data=cereal)
anova(cereal.fit)
## Analysis of Variance Table
##
## Response: calories
## Df Sum Sq Mean Sq F value Pr(>F)
## fat 1 7402.9 7402.9 19.743 3.009e-05 ***
## Residuals 75 28121.8 375.0
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(cereal.fit)
##
## Call:
## lm(formula = calories ~ fat, data = cereal)
##
## Residuals:
## Min 1Q Median 3Q Max
## -55.132 -5.132 4.868 14.868 45.256
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 95.132 3.141 30.285 < 2e-16 ***
## fat 9.806 2.207 4.443 3.01e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.36 on 75 degrees of freedom
## Multiple R-squared: 0.2084, Adjusted R-squared: 0.1978
## F-statistic: 19.74 on 1 and 75 DF, p-value: 3.009e-05
cereal.fit2 <- lm(calories ~ fat + protein, data=cereal)
anova(cereal.fit2)
## Analysis of Variance Table
##
## Response: calories
## Df Sum Sq Mean Sq F value Pr(>F)
## fat 1 7402.9 7402.9 19.6236 3.207e-05 ***
## protein 1 205.7 205.7 0.5453 0.4626
## Residuals 74 27916.1 377.2
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(cereal.fit2)
##
## Call:
## lm(formula = calories ~ fat + protein, data = cereal)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57.153 -7.308 2.383 12.538 45.765
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 98.690 5.757 17.142 < 2e-16 ***
## fat 10.154 2.263 4.486 2.6e-05 ***
## protein -1.537 2.081 -0.738 0.463
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.42 on 74 degrees of freedom
## Multiple R-squared: 0.2142, Adjusted R-squared: 0.1929
## F-statistic: 10.08 on 2 and 74 DF, p-value: 0.000134
cereal.fit3 <- lm(calories ~ protein + fat, data=cereal)
anova(cereal.fit3)
## Analysis of Variance Table
##
## Response: calories
## Df Sum Sq Mean Sq F value Pr(>F)
## protein 1 15.3 15.3 0.0404 0.8412
## fat 1 7593.4 7593.4 20.1285 2.604e-05 ***
## Residuals 74 27916.1 377.2
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(cereal.fit3)
##
## Call:
## lm(formula = calories ~ protein + fat, data = cereal)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57.153 -7.308 2.383 12.538 45.765
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 98.690 5.757 17.142 < 2e-16 ***
## protein -1.537 2.081 -0.738 0.463
## fat 10.154 2.263 4.486 2.6e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.42 on 74 degrees of freedom
## Multiple R-squared: 0.2142, Adjusted R-squared: 0.1929
## F-statistic: 10.08 on 2 and 74 DF, p-value: 0.000134
# Finding the type III SS for a model with CONTINUOUS variables
car::Anova(cereal.fit2, type=3)
## Anova Table (Type III tests)
##
## Response: calories
## Sum Sq Df F value Pr(>F)
## (Intercept) 110850 1 293.8425 < 2.2e-16 ***
## fat 7593 1 20.1285 2.604e-05 ***
## protein 206 1 0.5453 0.4626
## Residuals 27916 74
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
car::Anova(cereal.fit3, type=3)
## Anova Table (Type III tests)
##
## Response: calories
## Sum Sq Df F value Pr(>F)
## (Intercept) 110850 1 293.8425 < 2.2e-16 ***
## protein 206 1 0.5453 0.4626
## fat 7593 1 20.1285 2.604e-05 ***
## Residuals 27916 74
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Standard Diagnostic plots
library(ggfortify)
plot1 <- autoplot(cereal.fit) # ggplot
plot1
ggsave(#plot1, # see https://github.com/sinhrks/ggfortify/issues/98
file="../../MyStuff/Images/cereal-fit-lm-diagplot.png",
h=4, w=6, unit="in", dpi=300)
library(car)
residualPlots(cereal.fit)
## Test stat Pr(>|Test stat|)
## fat -1.6177 0.1100
## Tukey test -1.6177 0.1057
# Obtaining predictions
# First set up the points where you want predictions
newfat <- data.frame(fat=seq(0,6,.1))
newfat[1:5,]
## [1] 0.0 0.1 0.2 0.3 0.4
str(newfat)
## 'data.frame': 61 obs. of 1 variable:
## $ fat: num 0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 ...
# Predict the AVERAGE number of calories/serving at each fat level
# You need to specify help(predict.lm) tp see the documentation
predict.avg <- predict(cereal.fit, newdata=newfat,
se.fit=TRUE,interval="confidence")
# This creates a list that you need to restructure to make it look nice
predict.avg.df <- cbind(newfat, predict.avg$fit, se=predict.avg$se.fit)
tail(predict.avg.df)
## fat fit lwr upr se
## 56 5.5 149.0646 128.8542 169.2750 10.14527
## 57 5.6 150.0452 129.4055 170.6850 10.36079
## 58 5.7 151.0258 129.9563 172.0953 10.57652
## 59 5.8 152.0064 130.5067 173.5061 10.79245
## 60 5.9 152.9870 131.0568 174.9172 11.00857
## 61 6.0 153.9676 131.6065 176.3287 11.22487
# Add the fitted line and confidence intervals to the plot
plotfit.avgci <- newplot +
geom_line( data=predict.avg.df, aes(x=fat, y=fit))+
geom_ribbon(data=predict.avg.df,
aes(x=fat,y=NULL, ymin=lwr, ymax=upr),alpha=0.2)
plotfit.avgci
ggsave(plot=plotfit.avgci, file='../../MyStuff/Images/lm-predict-cereal-001.png',
h=4, w=6, units="in",dpi=300)
# Predict the INDIVIDUAL number of calories in each cereal
# R does not product the se for individual predictions
predict.indiv <- predict(cereal.fit, newdata=newfat,
interval="prediction")
# This creates a list that you need to restructure to make it look nice
predict.indiv.df <- cbind(newfat, predict.indiv)
tail(predict.indiv.df)
## fat fit lwr upr
## 56 5.5 149.0646 105.5162 192.6131
## 57 5.6 150.0452 106.2959 193.7946
## 58 5.7 151.0258 107.0721 194.9795
## 59 5.8 152.0064 107.8449 196.1680
## 60 5.9 152.9870 108.6143 197.3597
## 61 6.0 153.9676 109.3803 198.5549
# Add the prediction intervals to the plot
plotfit.indivci <- plotfit.avgci +
geom_ribbon(data=predict.indiv.df,
aes(x=fat,y=NULL, ymin=lwr, ymax=upr),
alpha=0.1)
plotfit.indivci
ggsave(plot=plotfit.indivci, file='../../MyStuff/Images/lm-predict-cereal-002.png',
h=4, w=6, units="in", dpi=300)
# Exercise with Birds and Butts
# Relationship between parasite load and number of butts in nest
library(readxl)
butts <- readxl::read_excel('../sampledata/bird-butts-data.xlsx',
sheet='Correlational',
col_names = TRUE,
trim_ws=TRUE,
skip=1,
.name_repair='universal')
## New names:
## * `Nest content` -> Nest.content
## * `Butts wieght` -> Butts.wieght
## * `Number of mites` -> Number.of.mites
butts[1:5,]
## # A tibble: 5 × 5
## Nest Species Nest.content Butts.wieght Number.of.mites
## <dbl> <chr> <chr> <dbl> <dbl>
## 1 1 HOSP empty 6.13 4
## 2 2 HOSP empty 3.73 30
## 3 3 HOSP eggs 0.06 84
## 4 4 HOSP eggs 8.3 2
## 5 5 HOSP eggs 0 12
# Fix the names
names(butts)[ grepl('wieght', names(butts))] <- "Butts.weight"
butts[1:5,]
## # A tibble: 5 × 5
## Nest Species Nest.content Butts.weight Number.of.mites
## <dbl> <chr> <chr> <dbl> <dbl>
## 1 1 HOSP empty 6.13 4
## 2 2 HOSP empty 3.73 30
## 3 3 HOSP eggs 0.06 84
## 4 4 HOSP eggs 8.3 2
## 5 5 HOSP eggs 0 12
# Look like we need a log-transform on the number of mites
butts$log.mites <- log(butts$Number.of.mites)
outlier <- butts[butts$log.mites<= 0,]
outlier
## # A tibble: 1 × 6
## Nest Species Nest.content Butts.weight Number.of.mites log.mites
## <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 56 HOFI eggs 4.46 1 0
prelimplotlog <- ggplot(data=butts, aes(x=Butts.weight, y=log.mites))+
ggtitle("Number of mites vs. butts weight")+
xlab("Butts weight (g)")+ylab("log(Number of mites)")+
geom_point()
prelimplotlog
fit.log.butts <- lm(log.mites ~ Butts.weight, data=butts)
anova(fit.log.butts)
## Analysis of Variance Table
##
## Response: log.mites
## Df Sum Sq Mean Sq F value Pr(>F)
## Butts.weight 1 32.394 32.394 52.295 1.574e-09 ***
## Residuals 55 34.069 0.619
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(fit.log.butts)
##
## Call:
## lm(formula = log.mites ~ Butts.weight, data = butts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.58637 -0.47091 0.04604 0.58008 1.34377
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.49066 0.12981 26.891 < 2e-16 ***
## Butts.weight -0.20276 0.02804 -7.232 1.57e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.787 on 55 degrees of freedom
## Multiple R-squared: 0.4874, Adjusted R-squared: 0.4781
## F-statistic: 52.3 on 1 and 55 DF, p-value: 1.574e-09
sf.autoplot.lm(fit.log.butts) # ggplot
## ****** sf.autoplot.lm() no longer needed *****
## use autoplot() from the ggfortify package
newbutts <- data.frame(Butts.weight=seq(min(butts$Butts.weight,na.rm=TRUE),
max(butts$Butts.weight,na.rm=TRUE),.1))
newbutts[1:5,]
## [1] 0.0 0.1 0.2 0.3 0.4
str(newbutts)
## 'data.frame': 149 obs. of 1 variable:
## $ Butts.weight: num 0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 ...
# Predict the AVERAGE number of calories/serving at each fat level
# You need to specify help(predict.lm) tp see the documentation
predict.avg <- predict(fit.log.butts, newdata=newbutts,
se.fit=TRUE,interval="confidence")
# This creates a list that you need to restructure to make it look nice
predict.avg.df <- cbind(newbutts, predict.avg$fit, se=predict.avg$se.fit)
tail(predict.avg.df)
## Butts.weight fit lwr upr se
## 144 14.3 0.5912462 -0.0900707 1.272563 0.3399709
## 145 14.4 0.5709706 -0.1156967 1.257638 0.3426407
## 146 14.5 0.5506950 -0.1413269 1.242717 0.3453126
## 147 14.6 0.5304194 -0.1669613 1.227800 0.3479866
## 148 14.7 0.5101438 -0.1925997 1.212887 0.3506626
## 149 14.8 0.4898682 -0.2182421 1.197978 0.3533406
# Add the fitted line and confidence intervals to the plot
plotfit.avgci <- prelimplotlog +
geom_line( data=predict.avg.df, aes(x=Butts.weight, y=fit))+
geom_ribbon(data=predict.avg.df,
aes(x=Butts.weight,y=NULL, ymin=lwr, ymax=upr),alpha=0.2)
plotfit.avgci
ggsave(plot=plotfit.avgci, file='../../MyStuff/Images/lm-predict-nests-001.png',
h=4, w=6, units="in", dpi=300)
# Predict the INDIVIDUAL number of calories in each cereal
# R does not product the se for individual predictions
predict.indiv <- predict(fit.log.butts, newdata=newbutts,
interval="prediction")
# This creates a list that you need to restructure to make it look nice
predict.indiv.df <- cbind(newbutts, predict.indiv)
tail(predict.indiv.df)
## Butts.weight fit lwr upr
## 144 14.3 0.5912462 -1.126881 2.309374
## 145 14.4 0.5709706 -1.149286 2.291227
## 146 14.5 0.5506950 -1.171706 2.273096
## 147 14.6 0.5304194 -1.194141 2.254980
## 148 14.7 0.5101438 -1.216593 2.236880
## 149 14.8 0.4898682 -1.239059 2.218796
# Add the prediction intervals to the plot
plotfit.indivci <- plotfit.avgci +
geom_ribbon(data=predict.indiv.df,
aes(x=Butts.weight,y=NULL, ymin=lwr, ymax=upr),
alpha=0.1)
plotfit.indivci
ggsave(plot=plotfit.indivci, file='../../MyStuff/Images/lm-predict-nests-002.png',
h=4, w=6, units="in", dpi=300)
# Plot on the regular scale by taking anti-logs
prelimplot <- ggplot(data=butts, aes(x=Butts.weight, y=Number.of.mites))+
ggtitle("Number of mites vs. butts weight")+
xlab("Butts weight (g)")+ylab("Number of mites")+
geom_point()
prelimplot
plotfit.avgci <- prelimplot +
geom_line( data=predict.avg.df, aes(x=Butts.weight, y=exp(fit)))+
geom_ribbon(data=predict.avg.df,
aes(x=Butts.weight,y=NULL, ymin=exp(lwr),
ymax=exp(upr)),alpha=0.2)
plotfit.avgci
plotfit.indivci <- plotfit.avgci +
geom_ribbon(data=predict.indiv.df,
aes(x=Butts.weight,y=NULL, ymin=exp(lwr), ymax=exp(upr)),
alpha=0.1)
plotfit.indivci
ggsave(plot=plotfit.indivci, file='../../MyStuff/Images/lm-predict-nests-003.png',
h=4, w=6, units="in", dpi=300)
# Does MEAN sugars/serving depend on display shelf?
# Declare shelf as a factor
cereal$shelfF <- factor(cereal$shelf)
cereal[1:3,]
## name mfr type calories protein fat sodium fiber carbo sugars
## 1 100%_Bran N C 60 4 1 130 10 5 6
## 2 100%_Natural_Bran Q C 110 3 5 15 2 8 8
## 3 All-Bran K C 80 4 1 260 9 7 5
## shelf potass vitamins weight cups shelfF
## 1 3 280 25 1 0.331 3
## 2 3 135 0 1 NA 3
## 3 3 320 25 1 0.330 3
str(cereal)
## 'data.frame': 77 obs. of 16 variables:
## $ name : chr "100%_Bran" "100%_Natural_Bran" "All-Bran" "All-Bran_with_Extra_Fiber" ...
## $ mfr : chr "N" "Q" "K" "K" ...
## $ type : chr "C" "C" "C" "C" ...
## $ calories: int 60 110 80 50 110 110 110 140 90 90 ...
## $ protein : int 4 3 4 4 2 2 2 3 2 3 ...
## $ fat : int 1 5 1 0 2 2 0 2 1 0 ...
## $ sodium : int 130 15 260 140 200 180 125 210 200 210 ...
## $ fiber : num 10 2 9 14 1 1.5 1 2 4 5 ...
## $ carbo : num 5 8 7 8 14 10.5 11 18 15 13 ...
## $ sugars : int 6 8 5 0 8 10 14 8 6 5 ...
## $ shelf : int 3 3 3 3 3 1 2 3 1 3 ...
## $ potass : int 280 135 320 330 NA 70 30 100 125 190 ...
## $ vitamins: int 25 0 25 25 25 25 25 25 25 25 ...
## $ weight : num 1 1 1 1 1 1 1 1.33 1 1 ...
## $ cups : num 0.331 NA 0.33 0.5 0.75 0.75 1 0.75 0.67 0.67 ...
## $ shelfF : Factor w/ 3 levels "1","2","3": 3 3 3 3 3 1 2 3 1 3 ...
# Preliminary plots
prelimplot <- ggplot(data=cereal, aes(x=shelfF, y=sugars))+
ggtitle("Compare MEAN sugar among display shelves")+
xlab("Shelf")+ylab("Sugar (g)")+
geom_point(position=position_jitter(width=0.05))+
geom_boxplot(alpha=0.2, notch=TRUE, outlier.size=-1) # -1 removes outiers from plot
prelimplot
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).
## Warning: Removed 1 rows containing missing values (geom_point).
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
ggsave(prelimplot, file='../../MyStuff/Images/lm-prelim-cereal-001.png',
h=4, w=6, units="in", dpi=300)
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).
## Warning: Removed 1 rows containing missing values (geom_point).
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
# Table of means and SD
library(plyr)
sumstat <- ddply(cereal, "shelfF", sf.simple.summary,
variable="sugars", crd=TRUE)
sumstat
## shelfF n nmiss mean sd se lcl ucl
## 1 1 20 1 5.105263 4.483237 1.0285251 2.944412 7.266114
## 2 2 21 0 9.619048 4.128876 0.9009947 7.739606 11.498490
## 3 3 36 0 6.527778 3.835817 0.6393028 5.229924 7.825632
# Fit a single factor CRD anova
shelf.fit <- lm(sugars ~ shelfF, data=cereal)
str(shelf.fit)
## List of 14
## $ coefficients : Named num [1:3] 5.11 4.51 1.42
## ..- attr(*, "names")= chr [1:3] "(Intercept)" "shelfF2" "shelfF3"
## $ residuals : Named num [1:76] -0.528 1.472 -1.528 -6.528 1.472 ...
## ..- attr(*, "names")= chr [1:76] "1" "2" "3" "4" ...
## $ effects : Named num [1:76] -61.25 13.97 -5.02 -6.45 1.55 ...
## ..- attr(*, "names")= chr [1:76] "(Intercept)" "shelfF2" "shelfF3" "" ...
## $ rank : int 3
## $ fitted.values: Named num [1:76] 6.53 6.53 6.53 6.53 6.53 ...
## ..- attr(*, "names")= chr [1:76] "1" "2" "3" "4" ...
## $ assign : int [1:3] 0 1 1
## $ qr :List of 5
## ..$ qr : num [1:76, 1:3] -8.718 0.115 0.115 0.115 0.115 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:76] "1" "2" "3" "4" ...
## .. .. ..$ : chr [1:3] "(Intercept)" "shelfF2" "shelfF3"
## .. ..- attr(*, "assign")= int [1:3] 0 1 1
## .. ..- attr(*, "contrasts")=List of 1
## .. .. ..$ shelfF: chr "contr.treatment"
## ..$ qraux: num [1:3] 1.11 1.06 1.08
## ..$ pivot: int [1:3] 1 2 3
## ..$ tol : num 1e-07
## ..$ rank : int 3
## ..- attr(*, "class")= chr "qr"
## $ df.residual : int 73
## $ na.action : 'omit' Named int 58
## ..- attr(*, "names")= chr "58"
## $ contrasts :List of 1
## ..$ shelfF: chr "contr.treatment"
## $ xlevels :List of 1
## ..$ shelfF: chr [1:3] "1" "2" "3"
## $ call : language lm(formula = sugars ~ shelfF, data = cereal)
## $ terms :Classes 'terms', 'formula' language sugars ~ shelfF
## .. ..- attr(*, "variables")= language list(sugars, shelfF)
## .. ..- attr(*, "factors")= int [1:2, 1] 0 1
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:2] "sugars" "shelfF"
## .. .. .. ..$ : chr "shelfF"
## .. ..- attr(*, "term.labels")= chr "shelfF"
## .. ..- attr(*, "order")= int 1
## .. ..- attr(*, "intercept")= int 1
## .. ..- attr(*, "response")= int 1
## .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. ..- attr(*, "predvars")= language list(sugars, shelfF)
## .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "factor"
## .. .. ..- attr(*, "names")= chr [1:2] "sugars" "shelfF"
## $ model :'data.frame': 76 obs. of 2 variables:
## ..$ sugars: int [1:76] 6 8 5 0 8 10 14 8 6 5 ...
## ..$ shelfF: Factor w/ 3 levels "1","2","3": 3 3 3 3 3 1 2 3 1 3 ...
## ..- attr(*, "terms")=Classes 'terms', 'formula' language sugars ~ shelfF
## .. .. ..- attr(*, "variables")= language list(sugars, shelfF)
## .. .. ..- attr(*, "factors")= int [1:2, 1] 0 1
## .. .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. .. ..$ : chr [1:2] "sugars" "shelfF"
## .. .. .. .. ..$ : chr "shelfF"
## .. .. ..- attr(*, "term.labels")= chr "shelfF"
## .. .. ..- attr(*, "order")= int 1
## .. .. ..- attr(*, "intercept")= int 1
## .. .. ..- attr(*, "response")= int 1
## .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. .. ..- attr(*, "predvars")= language list(sugars, shelfF)
## .. .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "factor"
## .. .. .. ..- attr(*, "names")= chr [1:2] "sugars" "shelfF"
## ..- attr(*, "na.action")= 'omit' Named int 58
## .. ..- attr(*, "names")= chr "58"
## - attr(*, "class")= chr "lm"
anova(shelf.fit) # CAUTION - Type I (incremental)
## Analysis of Variance Table
##
## Response: sugars
## Df Sum Sq Mean Sq F value Pr(>F)
## shelfF 2 220.23 110.117 6.6013 0.002316 **
## Residuals 73 1217.71 16.681
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(shelf.fit) # Not useful
##
## Call:
## lm(formula = sugars ~ shelfF, data = cereal)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.6190 -3.2109 -0.5278 3.0163 9.8947
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.105 0.937 5.449 6.55e-07 ***
## shelfF2 4.514 1.293 3.490 0.000822 ***
## shelfF3 1.423 1.158 1.228 0.223293
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.084 on 73 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.1532, Adjusted R-squared: 0.13
## F-statistic: 6.601 on 2 and 73 DF, p-value: 0.002316
# use the emmeans::emmeans() package to get the individual
# means and the pairwise comparisons and plot them
library(emmeans)
shelf.fit.emmo <- emmeans::emmeans(shelf.fit, ~shelfF)
# Where do difference in marginal means lie?
shelf.fit.cld <- multcomp::cld(shelf.fit.emmo)
shelf.fit.cld
## shelfF emmean SE df lower.CL upper.CL .group
## 1 5.11 0.937 73 3.24 6.97 1
## 3 6.53 0.681 73 5.17 7.88 1
## 2 9.62 0.891 73 7.84 11.40 2
##
## Confidence level used: 0.95
## P value adjustment: tukey method for comparing a family of 3 estimates
## significance level used: alpha = 0.05
## NOTE: Compact letter displays can be misleading
## because they show NON-findings rather than findings.
## Consider using 'pairs()', 'pwpp()', or 'pwpm()' instead.
# You can now write ggplot() to display the results or
# use my builtin functions (have a look at them)
sf.cld.plot.bar(shelf.fit.cld, "shelfF")+
ggtitle("Mean sugars/serving by shelf (with 95% ci)")+
xlab("Shelf")+ylab("Mean sugars/serving with 95% ci")
ggsave(file='../../MyStuff/Images/lm-cld-cereal-001.png',
h=4, w=6, units="in",dpi=300)
sf.cld.plot.line(shelf.fit.cld, "shelfF")+
ggtitle("Mean sugar/serving by shelf (with 95% ci)")+
xlab("Shelf")+ylab("Mean sugars/serving with 95% ci")
ggsave(file='../../MyStuff/Images/lm-cld-cereal-002.png',
h=4, w=6, units="in", dpi=300)
# Estimate all of the pairwise differences
pairs(shelf.fit.emmo)
## contrast estimate SE df t.ratio p.value
## 1 - 2 -4.51 1.29 73 -3.490 0.0023
## 1 - 3 -1.42 1.16 73 -1.228 0.4405
## 2 - 3 3.09 1.12 73 2.756 0.0199
##
## P value adjustment: tukey method for comparing a family of 3 estimates
confint(pairs(shelf.fit.emmo))
## contrast estimate SE df lower.CL upper.CL
## 1 - 2 -4.51 1.29 73 -7.608 -1.42
## 1 - 3 -1.42 1.16 73 -4.193 1.35
## 2 - 3 3.09 1.12 73 0.408 5.77
##
## Confidence level used: 0.95
## Conf-level adjustment: tukey method for comparing a family of 3 estimates
summary(pairs(shelf.fit.emmo), infer=TRUE)
## contrast estimate SE df lower.CL upper.CL t.ratio p.value
## 1 - 2 -4.51 1.29 73 -7.608 -1.42 -3.490 0.0023
## 1 - 3 -1.42 1.16 73 -4.193 1.35 -1.228 0.4405
## 2 - 3 3.09 1.12 73 0.408 5.77 2.756 0.0199
##
## Confidence level used: 0.95
## Conf-level adjustment: tukey method for comparing a family of 3 estimates
## P value adjustment: tukey method for comparing a family of 3 estimates
# Birds 'n Butts
# Is there a difference in mean weight of buts between 3 nest contents?
library(readxl)
butts <- readxl::read_excel('../sampledata/bird-butts-data.xlsx',
sheet='Correlational',
col_names=TRUE,
trim_ws=TRUE,
skip=1,
.name_repair="universal")
## New names:
## * `Nest content` -> Nest.content
## * `Butts wieght` -> Butts.wieght
## * `Number of mites` -> Number.of.mites
butts[1:5,]
## # A tibble: 5 × 5
## Nest Species Nest.content Butts.wieght Number.of.mites
## <dbl> <chr> <chr> <dbl> <dbl>
## 1 1 HOSP empty 6.13 4
## 2 2 HOSP empty 3.73 30
## 3 3 HOSP eggs 0.06 84
## 4 4 HOSP eggs 8.3 2
## 5 5 HOSP eggs 0 12
# Fix the names
names(butts)[ grepl('wieght', names(butts))] <- "Butts.weight"
butts[1:5,]
## # A tibble: 5 × 5
## Nest Species Nest.content Butts.weight Number.of.mites
## <dbl> <chr> <chr> <dbl> <dbl>
## 1 1 HOSP empty 6.13 4
## 2 2 HOSP empty 3.73 30
## 3 3 HOSP eggs 0.06 84
## 4 4 HOSP eggs 8.3 2
## 5 5 HOSP eggs 0 12
# Look like we need a log-transform on the number of mites
butts$log.mites <- log(butts$Number.of.mites)
outlier <- butts[butts$log.mites<= 0,]
outlier
## # A tibble: 1 × 6
## Nest Species Nest.content Butts.weight Number.of.mites log.mites
## <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 56 HOFI eggs 4.46 1 0
# Declare Nest.Contents as a factor
butts$Nest.contentF <- factor(butts$Nest.content,
levels=c("eggs","chicks","empty"),
order=TRUE)
butts[1:3,]
## # A tibble: 3 × 7
## Nest Species Nest.content Butts.weight Number.of.mites log.mites
## <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 1 HOSP empty 6.13 4 1.39
## 2 2 HOSP empty 3.73 30 3.40
## 3 3 HOSP eggs 0.06 84 4.43
## # … with 1 more variable: Nest.contentF <ord>
str(butts)
## tibble [57 × 7] (S3: tbl_df/tbl/data.frame)
## $ Nest : num [1:57] 1 2 3 4 5 6 7 8 9 10 ...
## $ Species : chr [1:57] "HOSP" "HOSP" "HOSP" "HOSP" ...
## $ Nest.content : chr [1:57] "empty" "empty" "eggs" "eggs" ...
## $ Butts.weight : num [1:57] 6.13 3.73 0.06 8.3 0 1.23 1.03 0 2.4 0.35 ...
## $ Number.of.mites: num [1:57] 4 30 84 2 12 7 10 44 16 32 ...
## $ log.mites : num [1:57] 1.386 3.401 4.431 0.693 2.485 ...
## $ Nest.contentF : Ord.factor w/ 3 levels "eggs"<"chicks"<..: 3 3 1 1 1 2 2 3 2 2 ...
# Fit a single factor CRD anova
nc.fit <- lm(Butts.weight ~ Nest.contentF, data=butts)
str(nc.fit)
## List of 13
## $ coefficients : Named num [1:3] 2.519 0.726 1.807
## ..- attr(*, "names")= chr [1:3] "(Intercept)" "Nest.contentF.L" "Nest.contentF.Q"
## $ residuals : Named num [1:57] 2.3592 -0.0408 -2.6837 5.5563 -2.7437 ...
## ..- attr(*, "names")= chr [1:57] "1" "2" "3" "4" ...
## $ effects : Named num [1:57] -20.83 -3.77 -7.18 5.47 -2.83 ...
## ..- attr(*, "names")= chr [1:57] "(Intercept)" "Nest.contentF.L" "Nest.contentF.Q" "" ...
## $ rank : int 3
## $ fitted.values: Named num [1:57] 3.77 3.77 2.74 2.74 2.74 ...
## ..- attr(*, "names")= chr [1:57] "1" "2" "3" "4" ...
## $ assign : int [1:3] 0 1 1
## $ qr :List of 5
## ..$ qr : num [1:57, 1:3] -7.55 0.132 0.132 0.132 0.132 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:57] "1" "2" "3" "4" ...
## .. .. ..$ : chr [1:3] "(Intercept)" "Nest.contentF.L" "Nest.contentF.Q"
## .. ..- attr(*, "assign")= int [1:3] 0 1 1
## .. ..- attr(*, "contrasts")=List of 1
## .. .. ..$ Nest.contentF: chr "contr.poly"
## ..$ qraux: num [1:3] 1.13 1.12 1.09
## ..$ pivot: int [1:3] 1 2 3
## ..$ tol : num 1e-07
## ..$ rank : int 3
## ..- attr(*, "class")= chr "qr"
## $ df.residual : int 54
## $ contrasts :List of 1
## ..$ Nest.contentF: chr "contr.poly"
## $ xlevels :List of 1
## ..$ Nest.contentF: chr [1:3] "eggs" "chicks" "empty"
## $ call : language lm(formula = Butts.weight ~ Nest.contentF, data = butts)
## $ terms :Classes 'terms', 'formula' language Butts.weight ~ Nest.contentF
## .. ..- attr(*, "variables")= language list(Butts.weight, Nest.contentF)
## .. ..- attr(*, "factors")= int [1:2, 1] 0 1
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:2] "Butts.weight" "Nest.contentF"
## .. .. .. ..$ : chr "Nest.contentF"
## .. ..- attr(*, "term.labels")= chr "Nest.contentF"
## .. ..- attr(*, "order")= int 1
## .. ..- attr(*, "intercept")= int 1
## .. ..- attr(*, "response")= int 1
## .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. ..- attr(*, "predvars")= language list(Butts.weight, Nest.contentF)
## .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "ordered"
## .. .. ..- attr(*, "names")= chr [1:2] "Butts.weight" "Nest.contentF"
## $ model :'data.frame': 57 obs. of 2 variables:
## ..$ Butts.weight : num [1:57] 6.13 3.73 0.06 8.3 0 1.23 1.03 0 2.4 0.35 ...
## ..$ Nest.contentF: Ord.factor w/ 3 levels "eggs"<"chicks"<..: 3 3 1 1 1 2 2 3 2 2 ...
## ..- attr(*, "terms")=Classes 'terms', 'formula' language Butts.weight ~ Nest.contentF
## .. .. ..- attr(*, "variables")= language list(Butts.weight, Nest.contentF)
## .. .. ..- attr(*, "factors")= int [1:2, 1] 0 1
## .. .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. .. ..$ : chr [1:2] "Butts.weight" "Nest.contentF"
## .. .. .. .. ..$ : chr "Nest.contentF"
## .. .. ..- attr(*, "term.labels")= chr "Nest.contentF"
## .. .. ..- attr(*, "order")= int 1
## .. .. ..- attr(*, "intercept")= int 1
## .. .. ..- attr(*, "response")= int 1
## .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. .. ..- attr(*, "predvars")= language list(Butts.weight, Nest.contentF)
## .. .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "ordered"
## .. .. .. ..- attr(*, "names")= chr [1:2] "Butts.weight" "Nest.contentF"
## - attr(*, "class")= chr "lm"
anova(nc.fit) # CAUTION - Type I (incremental)
## Analysis of Variance Table
##
## Response: Butts.weight
## Df Sum Sq Mean Sq F value Pr(>F)
## Nest.contentF 2 65.77 32.887 2.459 0.09505 .
## Residuals 54 722.20 13.374
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(nc.fit) # Not useful
##
## Call:
## lm(formula = Butts.weight ~ Nest.contentF, data = butts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.7708 -2.6837 -0.8036 1.1264 12.1163
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.5194 0.4963 5.077 4.88e-06 ***
## Nest.contentF.L 0.7263 0.7941 0.915 0.3644
## Nest.contentF.Q 1.8075 0.9204 1.964 0.0547 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.657 on 54 degrees of freedom
## Multiple R-squared: 0.08347, Adjusted R-squared: 0.04953
## F-statistic: 2.459 on 2 and 54 DF, p-value: 0.09505
# use the emmeans::emmeans() package to get the individual
# means and the pairwise comparisons and plot them
library(emmeans)
nc.fit.emmo <- emmeans::emmeans(nc.fit, ~Nest.contentF)
# Where do difference in marginal means lie?
nc.fit.cld <- multcomp::cld(nc.fit.emmo)
nc.fit.cld
## Nest.contentF emmean SE df lower.CL upper.CL .group
## chicks 1.04 0.977 54 -0.916 3.00 1
## eggs 2.74 0.839 54 1.062 4.43 1
## empty 3.77 0.746 54 2.274 5.27 1
##
## Confidence level used: 0.95
## P value adjustment: tukey method for comparing a family of 3 estimates
## significance level used: alpha = 0.05
## NOTE: Compact letter displays can be misleading
## because they show NON-findings rather than findings.
## Consider using 'pairs()', 'pwpp()', or 'pwpm()' instead.
# You can now write ggplot() to display the results or
# use my builtin functions (have a look at them)
sf.cld.plot.bar(nc.fit.cld, "Nest.contentF")+
ggtitle("Mean Butts.weight by nest contents(with 95% ci)")+
xlab("Nest Content")+ylab("Mean Butts.weight with 95% ci")
ggsave(file='../../MyStuff/Images/lm-cld-butts-001.png',
h=4, w=6,unit="in", dpi=300)
sf.cld.plot.line(nc.fit.cld, "Nest.contentF")+
ggtitle("Mean Butts.weight by nest content (with 95% ci)")+
xlab("Nest Content")+ylab("Mean Butts.weight with 95% ci")
ggsave(file='../../MyStuff/Images/lm-cld-butts-002.png',
h=4, w=6,unit="in", dpi=300)
# Estimate all of the pairwise differences
pairs(nc.fit.emmo)
## contrast estimate SE df t.ratio p.value
## eggs - chicks 1.70 1.29 54 1.320 0.3905
## eggs - empty -1.03 1.12 54 -0.915 0.6335
## chicks - empty -2.73 1.23 54 -2.218 0.0772
##
## P value adjustment: tukey method for comparing a family of 3 estimates
confint(pairs(nc.fit.emmo))
## contrast estimate SE df lower.CL upper.CL
## eggs - chicks 1.70 1.29 54 -1.40 4.804
## eggs - empty -1.03 1.12 54 -3.73 1.679
## chicks - empty -2.73 1.23 54 -5.69 0.237
##
## Confidence level used: 0.95
## Conf-level adjustment: tukey method for comparing a family of 3 estimates
# CRD ANCOVA
# Is the relationship between calories and fat the same for all display shelves?\\
# Declare shelf as a factor
cereal$shelfF <- factor(cereal$shelf)
cereal[1:3,]
## name mfr type calories protein fat sodium fiber carbo sugars
## 1 100%_Bran N C 60 4 1 130 10 5 6
## 2 100%_Natural_Bran Q C 110 3 5 15 2 8 8
## 3 All-Bran K C 80 4 1 260 9 7 5
## shelf potass vitamins weight cups shelfF
## 1 3 280 25 1 0.331 3
## 2 3 135 0 1 NA 3
## 3 3 320 25 1 0.330 3
str(cereal)
## 'data.frame': 77 obs. of 16 variables:
## $ name : chr "100%_Bran" "100%_Natural_Bran" "All-Bran" "All-Bran_with_Extra_Fiber" ...
## $ mfr : chr "N" "Q" "K" "K" ...
## $ type : chr "C" "C" "C" "C" ...
## $ calories: int 60 110 80 50 110 110 110 140 90 90 ...
## $ protein : int 4 3 4 4 2 2 2 3 2 3 ...
## $ fat : int 1 5 1 0 2 2 0 2 1 0 ...
## $ sodium : int 130 15 260 140 200 180 125 210 200 210 ...
## $ fiber : num 10 2 9 14 1 1.5 1 2 4 5 ...
## $ carbo : num 5 8 7 8 14 10.5 11 18 15 13 ...
## $ sugars : int 6 8 5 0 8 10 14 8 6 5 ...
## $ shelf : int 3 3 3 3 3 1 2 3 1 3 ...
## $ potass : int 280 135 320 330 NA 70 30 100 125 190 ...
## $ vitamins: int 25 0 25 25 25 25 25 25 25 25 ...
## $ weight : num 1 1 1 1 1 1 1 1.33 1 1 ...
## $ cups : num 0.331 NA 0.33 0.5 0.75 0.75 1 0.75 0.67 0.67 ...
## $ shelfF : Factor w/ 3 levels "1","2","3": 3 3 3 3 3 1 2 3 1 3 ...
# Preliminary plots
prelimplot <- ggplot(data=cereal, aes(x=fat, y=calories, color=shelfF))+
ggtitle("Compare calories vs fat by shelf")+
xlab("Calories")+ylab("Calories")+
geom_point(position=position_jitter(width=0.05))+
geom_smooth(method="lm", se=FALSE)
prelimplot
## `geom_smooth()` using formula 'y ~ x'
ggsave(prelimplot, file='../../MyStuff/Images/lm-ancova-prelim-cereal-001.png',
h=4, w=6, units="in", dpi=300)
## `geom_smooth()` using formula 'y ~ x'
# Fit a single factor CRD ancova
library(car)
shelf.fit <- lm(calories ~ shelfF + fat + shelfF:fat, data=cereal,
contrasts=list(shelfF=contr.sum))
anova(shelf.fit) # CAUTION - Type I (incremental)
## Analysis of Variance Table
##
## Response: calories
## Df Sum Sq Mean Sq F value Pr(>F)
## shelfF 2 593.2 296.6 0.7875 0.4589
## fat 1 7025.6 7025.6 18.6546 4.997e-05 ***
## shelfF:fat 2 1166.3 583.1 1.5484 0.2197
## Residuals 71 26739.6 376.6
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
car::Anova(shelf.fit, type=3) # marginal tests
## Anova Table (Type III tests)
##
## Response: calories
## Sum Sq Df F value Pr(>F)
## (Intercept) 303495 1 805.8500 < 2.2e-16 ***
## shelfF 572 2 0.7592 0.471782
## fat 2693 1 7.1495 0.009298 **
## shelfF:fat 1166 2 1.5484 0.219687
## Residuals 26740 71
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# summary table is useless
summary(shelf.fit)
##
## Call:
## lm(formula = calories ~ shelfF + fat + shelfF:fat, data = cereal,
## contrasts = list(shelfF = contr.sum))
##
## Residuals:
## Min 1Q Median 3Q Max
## -51.368 -10.278 -0.278 9.722 46.838
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 96.144 3.387 28.387 <2e-16 ***
## shelfF1 4.134 4.683 0.883 0.3803
## shelfF2 0.642 5.283 0.122 0.9036
## fat 7.666 2.867 2.674 0.0093 **
## shelfF1:fat -7.296 4.455 -1.638 0.1059
## shelfF2:fat 3.167 4.322 0.733 0.4661
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.41 on 71 degrees of freedom
## Multiple R-squared: 0.2473, Adjusted R-squared: 0.1943
## F-statistic: 4.665 on 5 and 71 DF, p-value: 0.0009712
# use the emmeans::emmeans() package to get the individual
# means and the pairwise comparisons and plot them
library(emmeans)
shelf.fit.emmo <- emmeans::emtrends(shelf.fit, ~shelfF, var="fat")
shelf.fit.cld <- multcomp::cld(shelf.fit.emmo)
shelf.fit.cld
## shelfF fat.trend SE df lower.CL upper.CL .group
## 1 0.37 5.91 71 -11.404 12.1 1
## 2 10.83 5.60 71 -0.337 22.0 1
## 3 11.79 2.78 71 6.253 17.3 1
##
## Confidence level used: 0.95
## P value adjustment: tukey method for comparing a family of 3 estimates
## significance level used: alpha = 0.05
## NOTE: Compact letter displays can be misleading
## because they show NON-findings rather than findings.
## Consider using 'pairs()', 'pwpp()', or 'pwpm()' instead.
# Fit the parallel slope model
library(car)
shelf.fit2 <- lm(calories ~ shelfF + fat, data=cereal,
contrasts=list(shelfF=contr.sum))
car::Anova(shelf.fit2, type=3) # marginal tests
## Anova Table (Type III tests)
##
## Response: calories
## Sum Sq Df F value Pr(>F)
## (Intercept) 342333 1 895.5195 < 2.2e-16 ***
## shelfF 216 2 0.2823 0.7548
## fat 7026 1 18.3785 5.457e-05 ***
## Residuals 27906 73
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# summary table is useless except for estimate of common slope
summary(shelf.fit2)
##
## Call:
## lm(formula = calories ~ shelfF + fat, data = cereal, contrasts = list(shelfF = contr.sum))
##
## Residuals:
## Min 1Q Median 3Q Max
## -53.73 -7.71 2.29 12.29 46.46
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 95.3297 3.1856 29.925 < 2e-16 ***
## shelfF1 -0.7752 3.5145 -0.221 0.826
## shelfF2 2.3802 3.3775 0.705 0.483
## fat 9.9092 2.3114 4.287 5.46e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.55 on 73 degrees of freedom
## Multiple R-squared: 0.2145, Adjusted R-squared: 0.1822
## F-statistic: 6.643 on 3 and 73 DF, p-value: 0.0004979
# use the emmeans::emmeans() package to get the individual
# means and the pairwise comparisons and plot them
library(emmeans)
shelf.fit2.emmo <- emmeans::emmeans(shelf.fit2, ~shelfF)
# Where do difference in marginal means lie?
shelf.fit2.cld <- multcomp::cld(shelf.fit2.emmo)
shelf.fit2.cld
## shelfF emmean SE df lower.CL upper.CL .group
## 3 104 3.30 73 97.2 110 1
## 1 105 4.47 73 95.7 114 1
## 2 108 4.27 73 99.2 116 1
##
## Confidence level used: 0.95
## P value adjustment: tukey method for comparing a family of 3 estimates
## significance level used: alpha = 0.05
## NOTE: Compact letter displays can be misleading
## because they show NON-findings rather than findings.
## Consider using 'pairs()', 'pwpp()', or 'pwpm()' instead.
# Birds 'n Butts
# Is there a difference in relationship between $\log{{Number of Mites}$ and $Butts Weight$ the same for all 3 nest contents?
library(readxl)
butts <- readxl::read_excel('../sampledata/bird-butts-data.xlsx',
sheet='Correlational',
col_names=TRUE,
trim_ws=TRUE,
skip=1,
.name_repair="universal")
## New names:
## * `Nest content` -> Nest.content
## * `Butts wieght` -> Butts.wieght
## * `Number of mites` -> Number.of.mites
butts[1:5,]
## # A tibble: 5 × 5
## Nest Species Nest.content Butts.wieght Number.of.mites
## <dbl> <chr> <chr> <dbl> <dbl>
## 1 1 HOSP empty 6.13 4
## 2 2 HOSP empty 3.73 30
## 3 3 HOSP eggs 0.06 84
## 4 4 HOSP eggs 8.3 2
## 5 5 HOSP eggs 0 12
# Fix the names
names(butts)[ grepl('wieght', names(butts))] <- "Butts.weight"
butts[1:5,]
## # A tibble: 5 × 5
## Nest Species Nest.content Butts.weight Number.of.mites
## <dbl> <chr> <chr> <dbl> <dbl>
## 1 1 HOSP empty 6.13 4
## 2 2 HOSP empty 3.73 30
## 3 3 HOSP eggs 0.06 84
## 4 4 HOSP eggs 8.3 2
## 5 5 HOSP eggs 0 12
# Look like we need a log-transform on the number of mites
butts$log.mites <- log(butts$Number.of.mites)
outlier <- butts[butts$log.mites<= 0,]
outlier
## # A tibble: 1 × 6
## Nest Species Nest.content Butts.weight Number.of.mites log.mites
## <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 56 HOFI eggs 4.46 1 0
# Declare Nest.Contents as a factor
butts$Nest.contentF <- factor(butts$Nest.content,
levels=c("eggs","chicks","empty"),
order=TRUE)
butts[1:3,]
## # A tibble: 3 × 7
## Nest Species Nest.content Butts.weight Number.of.mites log.mites
## <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 1 HOSP empty 6.13 4 1.39
## 2 2 HOSP empty 3.73 30 3.40
## 3 3 HOSP eggs 0.06 84 4.43
## # … with 1 more variable: Nest.contentF <ord>
str(butts)
## tibble [57 × 7] (S3: tbl_df/tbl/data.frame)
## $ Nest : num [1:57] 1 2 3 4 5 6 7 8 9 10 ...
## $ Species : chr [1:57] "HOSP" "HOSP" "HOSP" "HOSP" ...
## $ Nest.content : chr [1:57] "empty" "empty" "eggs" "eggs" ...
## $ Butts.weight : num [1:57] 6.13 3.73 0.06 8.3 0 1.23 1.03 0 2.4 0.35 ...
## $ Number.of.mites: num [1:57] 4 30 84 2 12 7 10 44 16 32 ...
## $ log.mites : num [1:57] 1.386 3.401 4.431 0.693 2.485 ...
## $ Nest.contentF : Ord.factor w/ 3 levels "eggs"<"chicks"<..: 3 3 1 1 1 2 2 3 2 2 ...
# preliminary plot
plot <- ggplot(data=butts, aes(x=Butts.weight, y=log.mites, color=Nest.contentF))+
ggtitle("Non-parallel slopes")+
geom_point( position=position_jitter(h=.1, w=.1))+
geom_smooth(method="lm", se=FALSE)
plot
## `geom_smooth()` using formula 'y ~ x'
ggsave(plot=plot, file='../../MyStuff/Images/lm-ancova-butts-001.png',
h=4, w=6,unit="in", dpi=300)
## `geom_smooth()` using formula 'y ~ x'
# Fit a single factor CRD ancova
np.fit <- lm(log.mites ~ Nest.contentF + Butts.weight + Butts.weight:Nest.contentF, data=butts,
contrasts=list(Nest.contentF=contr.sum))
car::Anova(np.fit, type=3)
## Anova Table (Type III tests)
##
## Response: log.mites
## Sum Sq Df F value Pr(>F)
## (Intercept) 288.693 1 446.7996 <2e-16 ***
## Nest.contentF 0.654 2 0.5063 0.6057
## Butts.weight 0.729 1 1.1281 0.2932
## Nest.contentF:Butts.weight 1.024 2 0.7925 0.4582
## Residuals 32.953 51
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# use the emmeans::emmeans() package to get the individual
# means and plot them
library(emmeans)
np.fit.emmo <- emmeans::emtrends(np.fit, ~Nest.contentF, var='Butts.weight')
np.fit.cld <- multcomp::cld(np.fit.emmo)
np.fit.cld
## Nest.contentF Butts.weight.trend SE df lower.CL upper.CL .group
## eggs -0.226 0.0471 51 -0.321 -0.132 1
## empty -0.197 0.0391 51 -0.276 -0.119 1
## chicks 0.119 0.2802 51 -0.443 0.682 1
##
## Confidence level used: 0.95
## P value adjustment: tukey method for comparing a family of 3 estimates
## significance level used: alpha = 0.05
## NOTE: Compact letter displays can be misleading
## because they show NON-findings rather than findings.
## Consider using 'pairs()', 'pwpp()', or 'pwpm()' instead.
# Fit a single factor CRD ancova with parallel slopes
p.fit <- lm(log.mites ~ Nest.contentF + Butts.weight, data=butts,
contrasts=list(Nest.contentF=contr.sum))
car::Anova(p.fit, type=3)
## Anova Table (Type III tests)
##
## Response: log.mites
## Sum Sq Df F value Pr(>F)
## (Intercept) 447.86 1 698.6061 < 2.2e-16 ***
## Nest.contentF 0.09 2 0.0716 0.931
## Butts.weight 30.48 1 47.5405 6.657e-09 ***
## Residuals 33.98 53
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(p.fit)
##
## Call:
## lm(formula = log.mites ~ Nest.contentF + Butts.weight, data = butts,
## contrasts = list(Nest.contentF = contr.sum))
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.54886 -0.52424 0.04152 0.55431 1.38048
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.49042 0.13206 26.431 < 2e-16 ***
## Nest.contentF1 -0.02536 0.15198 -0.167 0.868
## Nest.contentF2 -0.03067 0.17030 -0.180 0.858
## Butts.weight -0.20543 0.02979 -6.895 6.66e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8007 on 53 degrees of freedom
## Multiple R-squared: 0.4888, Adjusted R-squared: 0.4598
## F-statistic: 16.89 on 3 and 53 DF, p-value: 7.959e-08
# use the emmeans::emmeans() package to get the individual
# means and the pairwise comparisons and plot them
library(emmeans)
p.fit.emmo <- emmeans::emmeans(p.fit, ~Nest.contentF)
p.fit.cld <- multcomp::cld(p.fit.emmo)
p.fit.cld
## Nest.contentF emmean SE df lower.CL upper.CL .group
## chicks 2.89 0.220 53 2.45 3.33 1
## eggs 2.90 0.184 53 2.53 3.27 1
## empty 2.98 0.166 53 2.65 3.31 1
##
## Confidence level used: 0.95
## P value adjustment: tukey method for comparing a family of 3 estimates
## significance level used: alpha = 0.05
## NOTE: Compact letter displays can be misleading
## because they show NON-findings rather than findings.
## Consider using 'pairs()', 'pwpp()', or 'pwpm()' instead.
# Logistic ANOVA
#
# Does PROPORTION of accidents that result in fatality differ by day of the week?
# Read in the accidents dataset
radf <- read.csv(file.path("..","sampledata","Accidents","road-accidents-2010.csv"),
header=TRUE,
as.is=TRUE,
strip.white=TRUE)
dim(radf)
## [1] 154414 32
names(radf)
## [1] "Accident_Index"
## [2] "Location_Easting_OSGR"
## [3] "Location_Northing_OSGR"
## [4] "Longitude"
## [5] "Latitude"
## [6] "Police_Force"
## [7] "Accident_Severity"
## [8] "Number_of_Vehicles"
## [9] "Number_of_Casualties"
## [10] "Date"
## [11] "Day_of_Week"
## [12] "Time"
## [13] "Local_Authority_.District."
## [14] "Local_Authority_.Highway."
## [15] "X1st_Road_Class"
## [16] "X1st_Road_Number"
## [17] "Road_Type"
## [18] "Speed_limit"
## [19] "Junction_Detail"
## [20] "Junction_Control"
## [21] "X2nd_Road_Class"
## [22] "X2nd_Road_Number"
## [23] "Pedestrian_Crossing.Human_Control"
## [24] "Pedestrian_Crossing.Physical_Facilities"
## [25] "Light_Conditions"
## [26] "Weather_Conditions"
## [27] "Road_Surface_Conditions"
## [28] "Special_Conditions_at_Site"
## [29] "Carriageway_Hazards"
## [30] "Urban_or_Rural_Area"
## [31] "Did_Police_Officer_Attend_Scene_of_Accident"
## [32] "LSOA_of_Accident_Location"
# Declare day of week as a factor
radf$Day_of_WeekF <- factor(radf$Day_of_Week)
# Recode AccidentSeverity as fatal or not
xtabs(~Accident_Severity, data=radf)
## Accident_Severity
## 1 2 3
## 1731 20440 132243
library(car)
radf$fatal <- recode(radf$Accident_Severity,
' 1="yes";2:3="no" ')
xtabs(~fatal+Accident_Severity, data=radf)
## Accident_Severity
## fatal 1 2 3
## no 0 20440 132243
## yes 1731 0 0
# Need to also declare response as a factor (because it is categorical)
radf$fatalF <- factor(radf$fatal, level=c("no","yes"), order=TRUE)
# Preliminary tabulation
xtabs(~fatalF+Day_of_WeekF, data=radf)
## Day_of_WeekF
## fatalF 1 2 3 4 5 6 7
## no 16489 22231 22819 22808 22602 25214 20520
## yes 305 221 226 210 208 261 300
round(prop.table(xtabs(~fatalF+Day_of_WeekF, data=radf),2),3)
## Day_of_WeekF
## fatalF 1 2 3 4 5 6 7
## no 0.982 0.990 0.990 0.991 0.991 0.990 0.986
## yes 0.018 0.010 0.010 0.009 0.009 0.010 0.014
# Now to fit the glm model
fatal.fit <- glm(fatalF ~ Day_of_WeekF, family=binomial, data=radf)
anova(fatal.fit, test='Chi') # CAUTION Type I tests
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: fatalF
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 154413 18990
## Day_of_WeekF 6 108.98 154407 18881 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(fatal.fit) # not useful
##
## Call:
## glm(formula = fatalF ~ Day_of_WeekF, family = binomial, data = radf)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.1915 -0.1435 -0.1407 -0.1354 3.0651
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.99014 0.05779 -69.049 < 2e-16 ***
## Day_of_WeekF2 -0.62094 0.08893 -6.982 2.91e-12 ***
## Day_of_WeekF3 -0.62468 0.08836 -7.070 1.55e-12 ***
## Day_of_WeekF4 -0.69762 0.09025 -7.730 1.08e-14 ***
## Day_of_WeekF5 -0.69812 0.09050 -7.714 1.22e-14 ***
## Day_of_WeekF6 -0.58050 0.08491 -6.836 8.13e-12 ***
## Day_of_WeekF7 -0.23524 0.08198 -2.869 0.00411 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 18990 on 154413 degrees of freedom
## Residual deviance: 18881 on 154407 degrees of freedom
## AIC: 18895
##
## Number of Fisher Scoring iterations: 7
# We get the emmeans (eventhough these are logits of proportions)
fatal.fit.emmo <- emmeans::emmeans(fatal.fit, ~Day_of_WeekF)
# where do the p(fatal) differ?
fatal.fit.cld <- multcomp::cld(fatal.fit.emmo)
fatal.fit.cld
## Day_of_WeekF emmean SE df asymp.LCL asymp.UCL .group
## 5 -4.69 0.0697 Inf -4.82 -4.55 1
## 4 -4.69 0.0693 Inf -4.82 -4.55 1
## 3 -4.61 0.0668 Inf -4.75 -4.48 1
## 2 -4.61 0.0676 Inf -4.74 -4.48 1
## 6 -4.57 0.0622 Inf -4.69 -4.45 1
## 7 -4.23 0.0582 Inf -4.34 -4.11 2
## 1 -3.99 0.0578 Inf -4.10 -3.88 2
##
## Results are given on the logit (not the response) scale.
## Confidence level used: 0.95
## Results are given on the log odds ratio (not the response) scale.
## P value adjustment: tukey method for comparing a family of 7 estimates
## significance level used: alpha = 0.05
## NOTE: Compact letter displays can be misleading
## because they show NON-findings rather than findings.
## Consider using 'pairs()', 'pwpp()', or 'pwpm()' instead.
multcomp::cld(fatal.fit.emmo, type="response")
## Day_of_WeekF prob SE df asymp.LCL asymp.UCL .group
## 5 0.00912 0.000629 Inf 0.00796 0.0104 1
## 4 0.00912 0.000627 Inf 0.00797 0.0104 1
## 3 0.00981 0.000649 Inf 0.00861 0.0112 1
## 2 0.00984 0.000659 Inf 0.00863 0.0112 1
## 6 0.01025 0.000631 Inf 0.00908 0.0116 1
## 7 0.01441 0.000826 Inf 0.01288 0.0161 2
## 1 0.01816 0.001030 Inf 0.01625 0.0203 2
##
## Confidence level used: 0.95
## Intervals are back-transformed from the logit scale
## P value adjustment: tukey method for comparing a family of 7 estimates
## Tests are performed on the log odds ratio scale
## significance level used: alpha = 0.05
## NOTE: Compact letter displays can be misleading
## because they show NON-findings rather than findings.
## Consider using 'pairs()', 'pwpp()', or 'pwpm()' instead.
sf.cld.plot.bar(fatal.fit.cld, "Day_of_WeekF")+
ggtitle("logit(fatality) by day of week (with 95% ci)")+
xlab("Day of Week (1=Sunday)")+ylab("logit(fatality rate) with 95% ci")
ggsave(file='../../MyStuff/Images/glm-cld-radf-001.png',
h=4, w=6, units="in", dpi=300)
sf.cld.plot.line(fatal.fit.cld, "Day_of_WeekF")+
ggtitle("logit(fatality) by day of week (with 95% ci)")+
xlab("Day of Week (1=Sunday)")+ylab("logit(fatality rate) with 95% ci")
ggsave(file='../../MyStuff/Images/glm-cld-radf-002.png',
h=4, w=6, units="in", dpi=300)
summary(fatal.fit.emmo, type="response")
## Day_of_WeekF prob SE df asymp.LCL asymp.UCL
## 1 0.01816 0.001030 Inf 0.01625 0.0203
## 2 0.00984 0.000659 Inf 0.00863 0.0112
## 3 0.00981 0.000649 Inf 0.00861 0.0112
## 4 0.00912 0.000627 Inf 0.00797 0.0104
## 5 0.00912 0.000629 Inf 0.00796 0.0104
## 6 0.01025 0.000631 Inf 0.00908 0.0116
## 7 0.01441 0.000826 Inf 0.01288 0.0161
##
## Confidence level used: 0.95
## Intervals are back-transformed from the logit scale
# Estimate the odds ratio
pairs(fatal.fit.emmo)
## contrast estimate SE df z.ratio p.value
## 1 - 2 0.620943 0.0889 Inf 6.982 <.0001
## 1 - 3 0.624677 0.0884 Inf 7.070 <.0001
## 1 - 4 0.697622 0.0902 Inf 7.730 <.0001
## 1 - 5 0.698119 0.0905 Inf 7.714 <.0001
## 1 - 6 0.580497 0.0849 Inf 6.836 <.0001
## 1 - 7 0.235236 0.0820 Inf 2.869 0.0625
## 2 - 3 0.003733 0.0951 Inf 0.039 1.0000
## 2 - 4 0.076679 0.0968 Inf 0.792 0.9858
## 2 - 5 0.077175 0.0971 Inf 0.795 0.9855
## 2 - 6 -0.040446 0.0919 Inf -0.440 0.9995
## 2 - 7 -0.385707 0.0892 Inf -4.325 0.0003
## 3 - 4 0.072945 0.0963 Inf 0.757 0.9888
## 3 - 5 0.073442 0.0965 Inf 0.761 0.9885
## 3 - 6 -0.044179 0.0913 Inf -0.484 0.9991
## 3 - 7 -0.389441 0.0886 Inf -4.395 0.0002
## 4 - 5 0.000496 0.0983 Inf 0.005 1.0000
## 4 - 6 -0.117125 0.0931 Inf -1.257 0.8712
## 4 - 7 -0.462386 0.0905 Inf -5.110 <.0001
## 5 - 6 -0.117621 0.0934 Inf -1.259 0.8704
## 5 - 7 -0.462883 0.0907 Inf -5.101 <.0001
## 6 - 7 -0.345261 0.0852 Inf -4.054 0.0010
##
## Results are given on the log odds ratio (not the response) scale.
## P value adjustment: tukey method for comparing a family of 7 estimates
summary(pairs(fatal.fit.emmo), infer=TRUE, type="response")
## contrast odds.ratio SE df asymp.LCL asymp.UCL null z.ratio p.value
## 1 / 2 1.861 0.1655 Inf 1.432 2.418 1 6.982 <.0001
## 1 / 3 1.868 0.1650 Inf 1.439 2.423 1 7.070 <.0001
## 1 / 4 2.009 0.1813 Inf 1.540 2.621 1 7.730 <.0001
## 1 / 5 2.010 0.1819 Inf 1.539 2.625 1 7.714 <.0001
## 1 / 6 1.787 0.1517 Inf 1.391 2.295 1 6.836 <.0001
## 1 / 7 1.265 0.1037 Inf 0.994 1.611 1 2.869 0.0625
## 2 / 3 1.004 0.0954 Inf 0.758 1.328 1 0.039 1.0000
## 2 / 4 1.080 0.1045 Inf 0.812 1.436 1 0.792 0.9858
## 2 / 5 1.080 0.1049 Inf 0.811 1.438 1 0.795 0.9855
## 2 / 6 0.960 0.0882 Inf 0.732 1.259 1 -0.440 0.9995
## 2 / 7 0.680 0.0606 Inf 0.523 0.884 1 -4.325 0.0003
## 3 / 4 1.076 0.1036 Inf 0.810 1.429 1 0.757 0.9888
## 3 / 5 1.076 0.1039 Inf 0.810 1.431 1 0.761 0.9885
## 3 / 6 0.957 0.0874 Inf 0.731 1.252 1 -0.484 0.9991
## 3 / 7 0.677 0.0600 Inf 0.522 0.880 1 -4.395 0.0002
## 4 / 5 1.000 0.0983 Inf 0.749 1.337 1 0.005 1.0000
## 4 / 6 0.889 0.0829 Inf 0.676 1.171 1 -1.257 0.8712
## 4 / 7 0.630 0.0570 Inf 0.482 0.822 1 -5.110 <.0001
## 5 / 6 0.889 0.0830 Inf 0.675 1.171 1 -1.259 0.8704
## 5 / 7 0.629 0.0571 Inf 0.482 0.823 1 -5.101 <.0001
## 6 / 7 0.708 0.0603 Inf 0.551 0.910 1 -4.054 0.0010
##
## Confidence level used: 0.95
## Conf-level adjustment: tukey method for comparing a family of 7 estimates
## Intervals are back-transformed from the log odds ratio scale
## P value adjustment: tukey method for comparing a family of 7 estimates
## Tests are performed on the log odds ratio scale
# Exercise - look at proportion of fatal by weather conditions
dim(radf)
## [1] 154414 35
radf2 <- radf[ radf$Weather_Conditions %in% 1:6,]
dim(radf2)
## [1] 146024 35
# Define the weather conditons as a factor
radf2$WeatherF <- factor(radf2$Weather_Conditions)
xtabs(~fatalF + WeatherF, data=radf2)
## WeatherF
## fatalF 1 2 3 4 5 6
## no 121961 16327 3256 1184 1301 345
## yes 1435 158 25 18 13 1
round(prop.table(xtabs(~fatalF+WeatherF, data=radf2),2),3)
## WeatherF
## fatalF 1 2 3 4 5 6
## no 0.988 0.990 0.992 0.985 0.990 0.997
## yes 0.012 0.010 0.008 0.015 0.010 0.003
# Now to fit the glm model
fatal.fit <- glm(fatalF ~ WeatherF, family=binomial, data=radf2)
anova(fatal.fit, test='Chi') # CAUTION Type I tests
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: fatalF
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 146023 18075
## WeatherF 5 14.934 146018 18060 0.01065 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(fatal.fit) # not useful
##
## Call:
## glm(formula = fatalF ~ WeatherF, family = binomial, data = radf2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.1737 -0.1530 -0.1530 -0.1530 3.4195
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.44254 0.02655 -167.308 <2e-16 ***
## WeatherF2 -0.19544 0.08423 -2.320 0.0203 *
## WeatherF3 -0.42684 0.20251 -2.108 0.0351 *
## WeatherF4 0.25625 0.23897 1.072 0.2836
## WeatherF5 -0.16340 0.27999 -0.584 0.5595
## WeatherF6 -1.40101 1.00174 -1.399 0.1619
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 18075 on 146023 degrees of freedom
## Residual deviance: 18060 on 146018 degrees of freedom
## AIC: 18072
##
## Number of Fisher Scoring iterations: 8
# We get the emmeans (eventhough these are logits of proportions)
fatal.fit.emmo <- emmeans::emmeans(fatal.fit, ~WeatherF)
# where do the p(fatal) differ?
fatal.fit.cld <- multcomp::cld(fatal.fit.emmo)
fatal.fit.cld # interesting what happens
## WeatherF emmean SE df asymp.LCL asymp.UCL .group
## 6 -5.84 1.0014 Inf -7.81 -3.88 1
## 3 -4.87 0.2008 Inf -5.26 -4.48 1
## 2 -4.64 0.0799 Inf -4.79 -4.48 1
## 5 -4.61 0.2787 Inf -5.15 -4.06 1
## 1 -4.44 0.0266 Inf -4.49 -4.39 1
## 4 -4.19 0.2375 Inf -4.65 -3.72 1
##
## Results are given on the logit (not the response) scale.
## Confidence level used: 0.95
## Results are given on the log odds ratio (not the response) scale.
## P value adjustment: tukey method for comparing a family of 6 estimates
## significance level used: alpha = 0.05
## NOTE: Compact letter displays can be misleading
## because they show NON-findings rather than findings.
## Consider using 'pairs()', 'pwpp()', or 'pwpm()' instead.
sf.cld.plot.bar(fatal.fit.cld, "WeatherF")+
ggtitle("logit(fatality) by Weather Conditions (with 95% ci)")+
xlab("Weather Conditions")+ylab("logit(fatality rate) with 95% ci")
ggsave(file='../../MyStuff/Images/glm-cld-radf-weather-001.png',
h=4, w=6, units='in', dpi=300)
sf.cld.plot.line(fatal.fit.cld, "WeatherF")+
ggtitle("logit(fatality) by Weather Conditions (with 95% ci)")+
xlab("Weather Conditions")+ylab("logit(fatality rate) with 95% ci")
ggsave(file='../../MyStuff/Images/glm-cld-radf-weather-002.png',
h=4, w=6, units='in', dpi=300)
# Exercise - look at proportion of fatal by speed
dim(radf)
## [1] 154414 35
names(radf)
## [1] "Accident_Index"
## [2] "Location_Easting_OSGR"
## [3] "Location_Northing_OSGR"
## [4] "Longitude"
## [5] "Latitude"
## [6] "Police_Force"
## [7] "Accident_Severity"
## [8] "Number_of_Vehicles"
## [9] "Number_of_Casualties"
## [10] "Date"
## [11] "Day_of_Week"
## [12] "Time"
## [13] "Local_Authority_.District."
## [14] "Local_Authority_.Highway."
## [15] "X1st_Road_Class"
## [16] "X1st_Road_Number"
## [17] "Road_Type"
## [18] "Speed_limit"
## [19] "Junction_Detail"
## [20] "Junction_Control"
## [21] "X2nd_Road_Class"
## [22] "X2nd_Road_Number"
## [23] "Pedestrian_Crossing.Human_Control"
## [24] "Pedestrian_Crossing.Physical_Facilities"
## [25] "Light_Conditions"
## [26] "Weather_Conditions"
## [27] "Road_Surface_Conditions"
## [28] "Special_Conditions_at_Site"
## [29] "Carriageway_Hazards"
## [30] "Urban_or_Rural_Area"
## [31] "Did_Police_Officer_Attend_Scene_of_Accident"
## [32] "LSOA_of_Accident_Location"
## [33] "Day_of_WeekF"
## [34] "fatal"
## [35] "fatalF"
xtabs(~Speed_limit+fatalF, data=radf, exclude=NULL, na.action=na.pass)
## fatalF
## Speed_limit no yes
## 10 1 1
## 15 1 0
## 20 1543 5
## 30 99716 545
## 40 12427 164
## 50 5094 99
## 60 22852 658
## 70 11049 259
round(prop.table(xtabs(~fatalF+Speed_limit, data=radf),2),3)
## Speed_limit
## fatalF 10 15 20 30 40 50 60 70
## no 0.500 1.000 0.997 0.995 0.987 0.981 0.972 0.977
## yes 0.500 0.000 0.003 0.005 0.013 0.019 0.028 0.023
radf2 <- radf[ radf$Speed_limit >= 20,]
radf2$Speed_limitF <- factor(radf2$Speed_limit)
# Now to fit the glm model
fatal.fit <- glm(fatalF ~ Speed_limitF, family=binomial, data=radf2)
anova(fatal.fit, test='Chi') # CAUTION Type I tests
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: fatalF
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 154410 18981
## Speed_limitF 5 941.24 154405 18040 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(fatal.fit) # not useful
##
## Call:
## glm(formula = fatalF ~ Speed_limitF, family = binomial, data = radf2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.2383 -0.1962 -0.1044 -0.1044 3.3868
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.7320 0.4479 -12.797 < 2e-16 ***
## Speed_limitF30 0.5228 0.4500 1.162 0.24535
## Speed_limitF40 1.4043 0.4548 3.088 0.00202 **
## Speed_limitF50 1.7913 0.4593 3.900 9.60e-05 ***
## Speed_limitF60 2.1845 0.4497 4.858 1.19e-06 ***
## Speed_limitF70 1.9788 0.4523 4.375 1.22e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 18981 on 154410 degrees of freedom
## Residual deviance: 18040 on 154405 degrees of freedom
## AIC: 18052
##
## Number of Fisher Scoring iterations: 8
# We get the emmeans (eventhough these are logits of proportions)
fatal.fit.emmo <- emmeans::emmeans(fatal.fit, ~Speed_limitF)
# where do the p(fatal) differ?
fatal.fit.cld <- multcomp::cld(fatal.fit.emmo, type="response")
fatal.fit.cld # interesting what happens
## Speed_limitF prob SE df asymp.LCL asymp.UCL .group
## 20 0.00323 0.001442 Inf 0.00135 0.00774 1
## 30 0.00544 0.000232 Inf 0.00500 0.00591 1
## 40 0.01303 0.001010 Inf 0.01119 0.01516 2
## 50 0.01906 0.001898 Inf 0.01568 0.02316 3
## 70 0.02290 0.001407 Inf 0.02030 0.02583 34
## 60 0.02799 0.001076 Inf 0.02596 0.03018 4
##
## Confidence level used: 0.95
## Intervals are back-transformed from the logit scale
## P value adjustment: tukey method for comparing a family of 6 estimates
## Tests are performed on the log odds ratio scale
## significance level used: alpha = 0.05
## NOTE: Compact letter displays can be misleading
## because they show NON-findings rather than findings.
## Consider using 'pairs()', 'pwpp()', or 'pwpm()' instead.
sf.cld.plot.bar(fatal.fit.cld, "Speed_limitF")+
ggtitle("Prob(fatality) by Speed Limit (with 95% ci)")+
xlab("Speed limit")+ylab("Prob(fatality rate) with 95% ci")
ggsave(file='../../MyStuff/Images/glm-cld-radf-speed-001.png',
h=4, w=6, units='in', dpi=300)
sf.cld.plot.line(fatal.fit.cld, "Speed_limitF")+
ggtitle("Prob(fatality) by Speed limit (with 95% ci)")+
xlab("Speed limit")+ylab("Prob(fatality rate) with 95% ci")
ggsave(file='../../MyStuff/Images/glm-cld-radf-speed-002.png',
h=4, w=6, units='in', dpi=300)