# Discourse on lm() and glm() models in CRD/SRS design
# 2021-09-27 CJS emmeans::CLD to multcomp::cld
# 2015-02-01 CJS Updates
# 2014-05-19 CJS First Edition

options(useFancyQuotes=FALSE) # renders summary output correctly
source("schwarz.functions.r")
#source('http://www.stat.sfu.ca/~cschwarz/Stat-650/Notes/MyPrograms/schwarz.functions.r')

library(car)
## Loading required package: carData
library(ggfortify)
## Loading required package: ggplot2
library(ggplot2)
library(gridExtra)
library(emmeans)
library(multcomp)
## Loading required package: mvtnorm
## Loading required package: survival
## Loading required package: TH.data
## Loading required package: MASS
## 
## Attaching package: 'TH.data'
## The following object is masked from 'package:MASS':
## 
##     geyser
library(plyr)
library(readxl)

cereal <- read.csv(file.path('..',"sampledata",'cereal.csv'), 
          header=TRUE, as.is=TRUE,
          strip.white=TRUE)
cereal[1:5,]
##                        name mfr type calories protein fat sodium fiber carbo
## 1                 100%_Bran   N    C       60       4   1    130    10     5
## 2         100%_Natural_Bran   Q    C      110       3   5     15     2     8
## 3                  All-Bran   K    C       80       4   1    260     9     7
## 4 All-Bran_with_Extra_Fiber   K    C       50       4   0    140    14     8
## 5            Almond_Delight   R    C      110       2   2    200     1    14
##   sugars shelf potass vitamins weight  cups
## 1      6     3    280       25      1 0.331
## 2      8     3    135        0      1    NA
## 3      5     3    320       25      1 0.330
## 4      0     3    330       25      1 0.500
## 5      8     3     NA       25      1 0.750
# start a plot. ggplot ONLY uses data frames.
newplot <- ggplot(cereal, aes(x=fat, y=calories))+
   ggtitle("Calories vs Grams of Fat")+
   xlab("Fat/serving (g)")+ylab("Calories/serving")+
   geom_point()
newplot

# Fit the regression line
cereal.fit <- lm( calories ~ fat, data=cereal)
str(cereal.fit)
## List of 12
##  $ coefficients : Named num [1:2] 95.13 9.81
##   ..- attr(*, "names")= chr [1:2] "(Intercept)" "fat"
##  $ residuals    : Named num [1:77] -44.94 -34.16 -24.94 -45.13 -4.74 ...
##   ..- attr(*, "names")= chr [1:77] "1" "2" "3" "4" ...
##  $ effects      : Named num [1:77] -921.94 -86.04 -20.37 -42.88 2.14 ...
##   ..- attr(*, "names")= chr [1:77] "(Intercept)" "fat" "" "" ...
##  $ rank         : int 2
##  $ fitted.values: Named num [1:77] 104.9 144.2 104.9 95.1 114.7 ...
##   ..- attr(*, "names")= chr [1:77] "1" "2" "3" "4" ...
##  $ assign       : int [1:2] 0 1
##  $ qr           :List of 5
##   ..$ qr   : num [1:77, 1:2] -8.775 0.114 0.114 0.114 0.114 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr [1:77] "1" "2" "3" "4" ...
##   .. .. ..$ : chr [1:2] "(Intercept)" "fat"
##   .. ..- attr(*, "assign")= int [1:2] 0 1
##   ..$ qraux: num [1:2] 1.11 1.45
##   ..$ pivot: int [1:2] 1 2
##   ..$ tol  : num 1e-07
##   ..$ rank : int 2
##   ..- attr(*, "class")= chr "qr"
##  $ df.residual  : int 75
##  $ xlevels      : Named list()
##  $ call         : language lm(formula = calories ~ fat, data = cereal)
##  $ terms        :Classes 'terms', 'formula'  language calories ~ fat
##   .. ..- attr(*, "variables")= language list(calories, fat)
##   .. ..- attr(*, "factors")= int [1:2, 1] 0 1
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:2] "calories" "fat"
##   .. .. .. ..$ : chr "fat"
##   .. ..- attr(*, "term.labels")= chr "fat"
##   .. ..- attr(*, "order")= int 1
##   .. ..- attr(*, "intercept")= int 1
##   .. ..- attr(*, "response")= int 1
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. ..- attr(*, "predvars")= language list(calories, fat)
##   .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
##   .. .. ..- attr(*, "names")= chr [1:2] "calories" "fat"
##  $ model        :'data.frame':   77 obs. of  2 variables:
##   ..$ calories: int [1:77] 60 110 80 50 110 110 110 140 90 90 ...
##   ..$ fat     : int [1:77] 1 5 1 0 2 2 0 2 1 0 ...
##   ..- attr(*, "terms")=Classes 'terms', 'formula'  language calories ~ fat
##   .. .. ..- attr(*, "variables")= language list(calories, fat)
##   .. .. ..- attr(*, "factors")= int [1:2, 1] 0 1
##   .. .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. .. ..$ : chr [1:2] "calories" "fat"
##   .. .. .. .. ..$ : chr "fat"
##   .. .. ..- attr(*, "term.labels")= chr "fat"
##   .. .. ..- attr(*, "order")= int 1
##   .. .. ..- attr(*, "intercept")= int 1
##   .. .. ..- attr(*, "response")= int 1
##   .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. .. ..- attr(*, "predvars")= language list(calories, fat)
##   .. .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
##   .. .. .. ..- attr(*, "names")= chr [1:2] "calories" "fat"
##  - attr(*, "class")= chr "lm"
names(cereal.fit)
##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "xlevels"       "call"          "terms"         "model"
# The standard methods to extract coefficients etc
summary(cereal.fit)
## 
## Call:
## lm(formula = calories ~ fat, data = cereal)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -55.132  -5.132   4.868  14.868  45.256 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   95.132      3.141  30.285  < 2e-16 ***
## fat            9.806      2.207   4.443 3.01e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.36 on 75 degrees of freedom
## Multiple R-squared:  0.2084, Adjusted R-squared:  0.1978 
## F-statistic: 19.74 on 1 and 75 DF,  p-value: 3.009e-05
anova(cereal.fit) # Caution Type I only
## Analysis of Variance Table
## 
## Response: calories
##           Df  Sum Sq Mean Sq F value    Pr(>F)    
## fat        1  7402.9  7402.9  19.743 3.009e-05 ***
## Residuals 75 28121.8   375.0                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
coef(cereal.fit)
## (Intercept)         fat 
##   95.131579    9.806005
sqrt(diag(vcov(cereal.fit))) # SE of coefficients
## (Intercept)         fat 
##    3.141224    2.206897
confint(cereal.fit)
##                 2.5 %    97.5 %
## (Intercept) 88.873939 101.38922
## fat          5.409642  14.20237
summary(cereal.fit)$r.squared
## [1] 0.2083875
summary(cereal.fit)$sigma
## [1] 19.36381
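# The intervals from confint() can be reproduced by hand from the coefficients,
# their standard errors, and the residual df -- a minimal sketch of what
# confint() computes:
coef(cereal.fit) +
  outer(sqrt(diag(vcov(cereal.fit))), qt(c(0.025, 0.975), df=df.residual(cereal.fit)))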
methods(class=class(cereal.fit))  # shows methods available
##  [1] add1                addterm             alias              
##  [4] anova               Anova               attrassign         
##  [7] autoplot            avPlot              Boot               
## [10] bootCase            boxcox              boxCox             
## [13] brief               case.names          ceresPlot          
## [16] coerce              concordance         confidenceEllipse  
## [19] confint             Confint             cooks.distance     
## [22] crPlot              deltaMethod         deviance           
## [25] dfbeta              dfbetaPlots         dfbetas            
## [28] dfbetasPlots        drop1               dropterm           
## [31] dummy.coef          durbinWatsonTest    effects            
## [34] emm_basis           extractAIC          family             
## [37] formula             fortify             hatvalues          
## [40] hccm                infIndexPlot        influence          
## [43] influencePlot       initialize          inverseResponsePlot
## [46] kappa               labels              leveneTest         
## [49] leveragePlot        linearHypothesis    logLik             
## [52] logtrans            mcPlot              mmp                
## [55] model.frame         model.matrix        ncvTest            
## [58] nextBoot            nobs                outlierTest        
## [61] plot                powerTransform      predict            
## [64] Predict             print               proj               
## [67] qqPlot              qr                  recover_data       
## [70] residualPlot        residualPlots       residuals          
## [73] rstandard           rstudent            S                  
## [76] show                sigmaHat            simulate           
## [79] slotsFromS3         spreadLevelPlot     summary            
## [82] symbox              variable.names      vcov               
## see '?methods' for accessing help and source code
# CAUTION - anova() on an lm() fit gives Type I (incremental) tests - you want 
#    Type III (marginal) tests
# Compare the following fits of the cereal data
cereal.fit <- lm(calories ~ fat, data=cereal)
anova(cereal.fit)
## Analysis of Variance Table
## 
## Response: calories
##           Df  Sum Sq Mean Sq F value    Pr(>F)    
## fat        1  7402.9  7402.9  19.743 3.009e-05 ***
## Residuals 75 28121.8   375.0                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(cereal.fit)
## 
## Call:
## lm(formula = calories ~ fat, data = cereal)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -55.132  -5.132   4.868  14.868  45.256 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   95.132      3.141  30.285  < 2e-16 ***
## fat            9.806      2.207   4.443 3.01e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.36 on 75 degrees of freedom
## Multiple R-squared:  0.2084, Adjusted R-squared:  0.1978 
## F-statistic: 19.74 on 1 and 75 DF,  p-value: 3.009e-05
cereal.fit2 <- lm(calories ~ fat + protein, data=cereal)
anova(cereal.fit2)
## Analysis of Variance Table
## 
## Response: calories
##           Df  Sum Sq Mean Sq F value    Pr(>F)    
## fat        1  7402.9  7402.9 19.6236 3.207e-05 ***
## protein    1   205.7   205.7  0.5453    0.4626    
## Residuals 74 27916.1   377.2                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(cereal.fit2)
## 
## Call:
## lm(formula = calories ~ fat + protein, data = cereal)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -57.153  -7.308   2.383  12.538  45.765 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   98.690      5.757  17.142  < 2e-16 ***
## fat           10.154      2.263   4.486  2.6e-05 ***
## protein       -1.537      2.081  -0.738    0.463    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.42 on 74 degrees of freedom
## Multiple R-squared:  0.2142, Adjusted R-squared:  0.1929 
## F-statistic: 10.08 on 2 and 74 DF,  p-value: 0.000134
cereal.fit3 <- lm(calories ~ protein + fat, data=cereal)
anova(cereal.fit3)
## Analysis of Variance Table
## 
## Response: calories
##           Df  Sum Sq Mean Sq F value    Pr(>F)    
## protein    1    15.3    15.3  0.0404    0.8412    
## fat        1  7593.4  7593.4 20.1285 2.604e-05 ***
## Residuals 74 27916.1   377.2                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(cereal.fit3)
## 
## Call:
## lm(formula = calories ~ protein + fat, data = cereal)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -57.153  -7.308   2.383  12.538  45.765 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   98.690      5.757  17.142  < 2e-16 ***
## protein       -1.537      2.081  -0.738    0.463    
## fat           10.154      2.263   4.486  2.6e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.42 on 74 degrees of freedom
## Multiple R-squared:  0.2142, Adjusted R-squared:  0.1929 
## F-statistic: 10.08 on 2 and 74 DF,  p-value: 0.000134
# Finding the type III SS for a model with CONTINUOUS variables
car::Anova(cereal.fit2, type=3)
## Anova Table (Type III tests)
## 
## Response: calories
##             Sum Sq Df  F value    Pr(>F)    
## (Intercept) 110850  1 293.8425 < 2.2e-16 ***
## fat           7593  1  20.1285 2.604e-05 ***
## protein        206  1   0.5453    0.4626    
## Residuals    27916 74                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
car::Anova(cereal.fit3, type=3)
## Anova Table (Type III tests)
## 
## Response: calories
##             Sum Sq Df  F value    Pr(>F)    
## (Intercept) 110850  1 293.8425 < 2.2e-16 ***
## protein        206  1   0.5453    0.4626    
## fat           7593  1  20.1285 2.604e-05 ***
## Residuals    27916 74                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
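# For a model with no interactions, drop1() gives the same marginal
# (drop-one-term) F-tests as car::Anova(..., type=3) -- a quick cross-check:
drop1(cereal.fit2, test="F")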
# Standard Diagnostic plots

library(ggfortify)
plot1 <- autoplot(cereal.fit) # ggplot
plot1

ggsave(#plot1,  # see https://github.com/sinhrks/ggfortify/issues/98
       file="../../MyStuff/Images/cereal-fit-lm-diagplot.png",
       h=4, w=6, unit="in", dpi=300)

library(car)
residualPlots(cereal.fit)

##            Test stat Pr(>|Test stat|)
## fat          -1.6177           0.1100
## Tukey test   -1.6177           0.1057
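# The classic base-R diagnostic plots are also available directly from plot():
oldpar <- par(mfrow=c(2,2))
plot(cereal.fit)   # residuals vs fitted, QQ, scale-location, leverage
par(oldpar)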
# Obtaining predictions
# First set up the points where you want predictions
newfat <- data.frame(fat=seq(0,6,.1))
newfat[1:5,]
## [1] 0.0 0.1 0.2 0.3 0.4
str(newfat)
## 'data.frame':    61 obs. of  1 variable:
##  $ fat: num  0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 ...
# Predict the AVERAGE number of calories/serving at each fat level
# See help(predict.lm) for the documentation
predict.avg <- predict(cereal.fit, newdata=newfat, 
                       se.fit=TRUE,interval="confidence")
# This creates a list that you need to restructure to make it look nice
predict.avg.df <- cbind(newfat, predict.avg$fit, se=predict.avg$se.fit)
tail(predict.avg.df)
##    fat      fit      lwr      upr       se
## 56 5.5 149.0646 128.8542 169.2750 10.14527
## 57 5.6 150.0452 129.4055 170.6850 10.36079
## 58 5.7 151.0258 129.9563 172.0953 10.57652
## 59 5.8 152.0064 130.5067 173.5061 10.79245
## 60 5.9 152.9870 131.0568 174.9172 11.00857
## 61 6.0 153.9676 131.6065 176.3287 11.22487
# Add the fitted line and confidence intervals to the plot
plotfit.avgci <- newplot +
    geom_line( data=predict.avg.df, aes(x=fat, y=fit))+
    geom_ribbon(data=predict.avg.df, 
                aes(x=fat,y=NULL, ymin=lwr, ymax=upr),alpha=0.2)
plotfit.avgci

ggsave(plot=plotfit.avgci, file='../../MyStuff/Images/lm-predict-cereal-001.png',
       h=4, w=6, units="in",dpi=300)


# Predict the INDIVIDUAL number of calories in each cereal
# R does not produce the se for individual predictions
predict.indiv <- predict(cereal.fit, newdata=newfat, 
                        interval="prediction")
# This creates a list that you need to restructure to make it look nice
predict.indiv.df <- cbind(newfat, predict.indiv)
tail(predict.indiv.df)
##    fat      fit      lwr      upr
## 56 5.5 149.0646 105.5162 192.6131
## 57 5.6 150.0452 106.2959 193.7946
## 58 5.7 151.0258 107.0721 194.9795
## 59 5.8 152.0064 107.8449 196.1680
## 60 5.9 152.9870 108.6143 197.3597
## 61 6.0 153.9676 109.3803 198.5549
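# Although predict() does not return the se of an individual prediction, it can
# be assembled from the se of the mean and the residual standard deviation
# (a small sketch; not needed for the plots below):
pred.mean <- predict(cereal.fit, newdata=newfat, se.fit=TRUE)
se.indiv  <- sqrt(pred.mean$se.fit^2 + summary(cereal.fit)$sigma^2)
head(se.indiv)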
# Add the prediction intervals to the plot
plotfit.indivci <- plotfit.avgci +
    geom_ribbon(data=predict.indiv.df,
                aes(x=fat,y=NULL, ymin=lwr, ymax=upr),
                alpha=0.1)
plotfit.indivci

ggsave(plot=plotfit.indivci, file='../../MyStuff/Images/lm-predict-cereal-002.png',
       h=4, w=6, units="in", dpi=300)





# Exercise with Birds and Butts
# Relationship between parasite load and number of butts in nest

library(readxl)
butts <- readxl::read_excel('../sampledata/bird-butts-data.xlsx', 
                  sheet='Correlational', 
                  col_names = TRUE,
                  trim_ws=TRUE,
                  skip=1,
                  .name_repair='universal')
## New names:
## * `Nest content` -> Nest.content
## * `Butts wieght` -> Butts.wieght
## * `Number of mites` -> Number.of.mites
butts[1:5,]
## # A tibble: 5 × 5
##    Nest Species Nest.content Butts.wieght Number.of.mites
##   <dbl> <chr>   <chr>               <dbl>           <dbl>
## 1     1 HOSP    empty                6.13               4
## 2     2 HOSP    empty                3.73              30
## 3     3 HOSP    eggs                 0.06              84
## 4     4 HOSP    eggs                 8.3                2
## 5     5 HOSP    eggs                 0                 12
# Fix the names
names(butts)[ grepl('wieght', names(butts))] <- "Butts.weight"
butts[1:5,]
## # A tibble: 5 × 5
##    Nest Species Nest.content Butts.weight Number.of.mites
##   <dbl> <chr>   <chr>               <dbl>           <dbl>
## 1     1 HOSP    empty                6.13               4
## 2     2 HOSP    empty                3.73              30
## 3     3 HOSP    eggs                 0.06              84
## 4     4 HOSP    eggs                 8.3                2
## 5     5 HOSP    eggs                 0                 12
# Looks like we need a log-transform on the number of mites
butts$log.mites <- log(butts$Number.of.mites)
outlier <- butts[butts$log.mites<= 0,]
outlier
## # A tibble: 1 × 6
##    Nest Species Nest.content Butts.weight Number.of.mites log.mites
##   <dbl> <chr>   <chr>               <dbl>           <dbl>     <dbl>
## 1    56 HOFI    eggs                 4.46               1         0
prelimplotlog <- ggplot(data=butts, aes(x=Butts.weight, y=log.mites))+
     ggtitle("Number of mites vs. butts weight")+
     xlab("Butts weight (g)")+ylab("log(Number of mites)")+
     geom_point()
prelimplotlog

fit.log.butts <- lm(log.mites ~ Butts.weight, data=butts)
anova(fit.log.butts)
## Analysis of Variance Table
## 
## Response: log.mites
##              Df Sum Sq Mean Sq F value    Pr(>F)    
## Butts.weight  1 32.394  32.394  52.295 1.574e-09 ***
## Residuals    55 34.069   0.619                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(fit.log.butts)
## 
## Call:
## lm(formula = log.mites ~ Butts.weight, data = butts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.58637 -0.47091  0.04604  0.58008  1.34377 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.49066    0.12981  26.891  < 2e-16 ***
## Butts.weight -0.20276    0.02804  -7.232 1.57e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.787 on 55 degrees of freedom
## Multiple R-squared:  0.4874, Adjusted R-squared:  0.4781 
## F-statistic:  52.3 on 1 and 55 DF,  p-value: 1.574e-09
sf.autoplot.lm(fit.log.butts) # ggplot
## ****** sf.autoplot.lm() no longer needed        *****
##        use autoplot() from the ggfortify package
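# As the message notes, autoplot() from the ggfortify package now produces the
# same diagnostic plots directly:
autoplot(fit.log.butts)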
newbutts <- data.frame(Butts.weight=seq(min(butts$Butts.weight,na.rm=TRUE),
                              max(butts$Butts.weight,na.rm=TRUE),.1))
newbutts[1:5,]
## [1] 0.0 0.1 0.2 0.3 0.4
str(newbutts)
## 'data.frame':    149 obs. of  1 variable:
##  $ Butts.weight: num  0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 ...
# Predict the AVERAGE log(number of mites) at each butts weight
# See help(predict.lm) for the documentation
predict.avg <- predict(fit.log.butts, newdata=newbutts, 
                       se.fit=TRUE,interval="confidence")
# This creates a list that you need to restructure to make it look nice
predict.avg.df <- cbind(newbutts, predict.avg$fit, se=predict.avg$se.fit)
tail(predict.avg.df)
##     Butts.weight       fit        lwr      upr        se
## 144         14.3 0.5912462 -0.0900707 1.272563 0.3399709
## 145         14.4 0.5709706 -0.1156967 1.257638 0.3426407
## 146         14.5 0.5506950 -0.1413269 1.242717 0.3453126
## 147         14.6 0.5304194 -0.1669613 1.227800 0.3479866
## 148         14.7 0.5101438 -0.1925997 1.212887 0.3506626
## 149         14.8 0.4898682 -0.2182421 1.197978 0.3533406
# Add the fitted line and confidence intervals to the plot
plotfit.avgci <- prelimplotlog +
    geom_line( data=predict.avg.df, aes(x=Butts.weight, y=fit))+
    geom_ribbon(data=predict.avg.df, 
                aes(x=Butts.weight,y=NULL, ymin=lwr, ymax=upr),alpha=0.2)
plotfit.avgci

ggsave(plot=plotfit.avgci, file='../../MyStuff/Images/lm-predict-nests-001.png',
       h=4, w=6, units="in", dpi=300)


# Predict the INDIVIDUAL log(number of mites) for each nest
# R does not produce the se for individual predictions
predict.indiv <- predict(fit.log.butts, newdata=newbutts, 
                        interval="prediction")
# This creates a list that you need to restructure to make it look nice
predict.indiv.df <- cbind(newbutts, predict.indiv)
tail(predict.indiv.df)
##     Butts.weight       fit       lwr      upr
## 144         14.3 0.5912462 -1.126881 2.309374
## 145         14.4 0.5709706 -1.149286 2.291227
## 146         14.5 0.5506950 -1.171706 2.273096
## 147         14.6 0.5304194 -1.194141 2.254980
## 148         14.7 0.5101438 -1.216593 2.236880
## 149         14.8 0.4898682 -1.239059 2.218796
# Add the prediction intervals to the plot
plotfit.indivci <- plotfit.avgci +
    geom_ribbon(data=predict.indiv.df,
                aes(x=Butts.weight,y=NULL, ymin=lwr, ymax=upr),
                alpha=0.1)
plotfit.indivci

ggsave(plot=plotfit.indivci, file='../../MyStuff/Images/lm-predict-nests-002.png',
       h=4, w=6, units="in", dpi=300)


# Plot on the regular scale by taking anti-logs
prelimplot <- ggplot(data=butts, aes(x=Butts.weight, y=Number.of.mites))+
     ggtitle("Number of mites vs. butts weight")+
     xlab("Butts weight (g)")+ylab("Number of mites")+
     geom_point()
prelimplot

plotfit.avgci <- prelimplot +
    geom_line( data=predict.avg.df, aes(x=Butts.weight, y=exp(fit)))+
    geom_ribbon(data=predict.avg.df, 
                aes(x=Butts.weight,y=NULL, ymin=exp(lwr), 
                    ymax=exp(upr)),alpha=0.2)
plotfit.avgci

plotfit.indivci <- plotfit.avgci +
    geom_ribbon(data=predict.indiv.df,
                aes(x=Butts.weight,y=NULL, ymin=exp(lwr), ymax=exp(upr)),
                alpha=0.1)
plotfit.indivci

ggsave(plot=plotfit.indivci, file='../../MyStuff/Images/lm-predict-nests-003.png',
       h=4, w=6, units="in", dpi=300)
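# Note: exponentiating the log-scale fit estimates the MEDIAN number of mites.
# If an estimate of the MEAN is wanted, a common correction (assuming roughly
# lognormal errors) adds half the residual variance before exponentiating -- a sketch:
sigma2 <- summary(fit.log.butts)$sigma^2
head( exp(predict.avg.df$fit + sigma2/2) )   # approximate mean number of mites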
# Does MEAN sugars/serving depend on display shelf?

# Declare shelf as a factor
cereal$shelfF <- factor(cereal$shelf)
cereal[1:3,]
##                name mfr type calories protein fat sodium fiber carbo sugars
## 1         100%_Bran   N    C       60       4   1    130    10     5      6
## 2 100%_Natural_Bran   Q    C      110       3   5     15     2     8      8
## 3          All-Bran   K    C       80       4   1    260     9     7      5
##   shelf potass vitamins weight  cups shelfF
## 1     3    280       25      1 0.331      3
## 2     3    135        0      1    NA      3
## 3     3    320       25      1 0.330      3
str(cereal)
## 'data.frame':    77 obs. of  16 variables:
##  $ name    : chr  "100%_Bran" "100%_Natural_Bran" "All-Bran" "All-Bran_with_Extra_Fiber" ...
##  $ mfr     : chr  "N" "Q" "K" "K" ...
##  $ type    : chr  "C" "C" "C" "C" ...
##  $ calories: int  60 110 80 50 110 110 110 140 90 90 ...
##  $ protein : int  4 3 4 4 2 2 2 3 2 3 ...
##  $ fat     : int  1 5 1 0 2 2 0 2 1 0 ...
##  $ sodium  : int  130 15 260 140 200 180 125 210 200 210 ...
##  $ fiber   : num  10 2 9 14 1 1.5 1 2 4 5 ...
##  $ carbo   : num  5 8 7 8 14 10.5 11 18 15 13 ...
##  $ sugars  : int  6 8 5 0 8 10 14 8 6 5 ...
##  $ shelf   : int  3 3 3 3 3 1 2 3 1 3 ...
##  $ potass  : int  280 135 320 330 NA 70 30 100 125 190 ...
##  $ vitamins: int  25 0 25 25 25 25 25 25 25 25 ...
##  $ weight  : num  1 1 1 1 1 1 1 1.33 1 1 ...
##  $ cups    : num  0.331 NA 0.33 0.5 0.75 0.75 1 0.75 0.67 0.67 ...
##  $ shelfF  : Factor w/ 3 levels "1","2","3": 3 3 3 3 3 1 2 3 1 3 ...
# Preliminary plots
prelimplot <- ggplot(data=cereal, aes(x=shelfF, y=sugars))+
    ggtitle("Compare MEAN sugar among display shelves")+
    xlab("Shelf")+ylab("Sugar (g)")+
    geom_point(position=position_jitter(width=0.05))+
    geom_boxplot(alpha=0.2, notch=TRUE, outlier.size=-1) # -1 removes outliers from plot
prelimplot
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).
## Warning: Removed 1 rows containing missing values (geom_point).
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.

ggsave(prelimplot, file='../../MyStuff/Images/lm-prelim-cereal-001.png',
       h=4, w=6, units="in", dpi=300)
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).

## Warning: Removed 1 rows containing missing values (geom_point).
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
# Table of means and SD
library(plyr)
sumstat <- ddply(cereal, "shelfF", sf.simple.summary, 
                 variable="sugars", crd=TRUE)
sumstat
##   shelfF  n nmiss     mean       sd        se      lcl       ucl
## 1      1 20     1 5.105263 4.483237 1.0285251 2.944412  7.266114
## 2      2 21     0 9.619048 4.128876 0.9009947 7.739606 11.498490
## 3      3 36     0 6.527778 3.835817 0.6393028 5.229924  7.825632
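# An equivalent table of means and SDs can be built directly with plyr::ddply()
# without the helper function (a sketch):
ddply(cereal, "shelfF", summarize,
      n    = sum(!is.na(sugars)),
      mean = mean(sugars, na.rm=TRUE),
      sd   = sd(sugars,   na.rm=TRUE))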
# Fit a single factor CRD anova
shelf.fit <- lm(sugars ~ shelfF, data=cereal)
str(shelf.fit)
## List of 14
##  $ coefficients : Named num [1:3] 5.11 4.51 1.42
##   ..- attr(*, "names")= chr [1:3] "(Intercept)" "shelfF2" "shelfF3"
##  $ residuals    : Named num [1:76] -0.528 1.472 -1.528 -6.528 1.472 ...
##   ..- attr(*, "names")= chr [1:76] "1" "2" "3" "4" ...
##  $ effects      : Named num [1:76] -61.25 13.97 -5.02 -6.45 1.55 ...
##   ..- attr(*, "names")= chr [1:76] "(Intercept)" "shelfF2" "shelfF3" "" ...
##  $ rank         : int 3
##  $ fitted.values: Named num [1:76] 6.53 6.53 6.53 6.53 6.53 ...
##   ..- attr(*, "names")= chr [1:76] "1" "2" "3" "4" ...
##  $ assign       : int [1:3] 0 1 1
##  $ qr           :List of 5
##   ..$ qr   : num [1:76, 1:3] -8.718 0.115 0.115 0.115 0.115 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr [1:76] "1" "2" "3" "4" ...
##   .. .. ..$ : chr [1:3] "(Intercept)" "shelfF2" "shelfF3"
##   .. ..- attr(*, "assign")= int [1:3] 0 1 1
##   .. ..- attr(*, "contrasts")=List of 1
##   .. .. ..$ shelfF: chr "contr.treatment"
##   ..$ qraux: num [1:3] 1.11 1.06 1.08
##   ..$ pivot: int [1:3] 1 2 3
##   ..$ tol  : num 1e-07
##   ..$ rank : int 3
##   ..- attr(*, "class")= chr "qr"
##  $ df.residual  : int 73
##  $ na.action    : 'omit' Named int 58
##   ..- attr(*, "names")= chr "58"
##  $ contrasts    :List of 1
##   ..$ shelfF: chr "contr.treatment"
##  $ xlevels      :List of 1
##   ..$ shelfF: chr [1:3] "1" "2" "3"
##  $ call         : language lm(formula = sugars ~ shelfF, data = cereal)
##  $ terms        :Classes 'terms', 'formula'  language sugars ~ shelfF
##   .. ..- attr(*, "variables")= language list(sugars, shelfF)
##   .. ..- attr(*, "factors")= int [1:2, 1] 0 1
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:2] "sugars" "shelfF"
##   .. .. .. ..$ : chr "shelfF"
##   .. ..- attr(*, "term.labels")= chr "shelfF"
##   .. ..- attr(*, "order")= int 1
##   .. ..- attr(*, "intercept")= int 1
##   .. ..- attr(*, "response")= int 1
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. ..- attr(*, "predvars")= language list(sugars, shelfF)
##   .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "factor"
##   .. .. ..- attr(*, "names")= chr [1:2] "sugars" "shelfF"
##  $ model        :'data.frame':   76 obs. of  2 variables:
##   ..$ sugars: int [1:76] 6 8 5 0 8 10 14 8 6 5 ...
##   ..$ shelfF: Factor w/ 3 levels "1","2","3": 3 3 3 3 3 1 2 3 1 3 ...
##   ..- attr(*, "terms")=Classes 'terms', 'formula'  language sugars ~ shelfF
##   .. .. ..- attr(*, "variables")= language list(sugars, shelfF)
##   .. .. ..- attr(*, "factors")= int [1:2, 1] 0 1
##   .. .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. .. ..$ : chr [1:2] "sugars" "shelfF"
##   .. .. .. .. ..$ : chr "shelfF"
##   .. .. ..- attr(*, "term.labels")= chr "shelfF"
##   .. .. ..- attr(*, "order")= int 1
##   .. .. ..- attr(*, "intercept")= int 1
##   .. .. ..- attr(*, "response")= int 1
##   .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. .. ..- attr(*, "predvars")= language list(sugars, shelfF)
##   .. .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "factor"
##   .. .. .. ..- attr(*, "names")= chr [1:2] "sugars" "shelfF"
##   ..- attr(*, "na.action")= 'omit' Named int 58
##   .. ..- attr(*, "names")= chr "58"
##  - attr(*, "class")= chr "lm"
anova(shelf.fit) # CAUTION - Type I (incremental)
## Analysis of Variance Table
## 
## Response: sugars
##           Df  Sum Sq Mean Sq F value   Pr(>F)   
## shelfF     2  220.23 110.117  6.6013 0.002316 **
## Residuals 73 1217.71  16.681                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(shelf.fit) # Not useful
## 
## Call:
## lm(formula = sugars ~ shelfF, data = cereal)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.6190 -3.2109 -0.5278  3.0163  9.8947 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    5.105      0.937   5.449 6.55e-07 ***
## shelfF2        4.514      1.293   3.490 0.000822 ***
## shelfF3        1.423      1.158   1.228 0.223293    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.084 on 73 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.1532, Adjusted R-squared:   0.13 
## F-statistic: 6.601 on 2 and 73 DF,  p-value: 0.002316
# use the emmeans::emmeans() package to get the individual 
# means and the pairwise comparisons and plot them
library(emmeans)
shelf.fit.emmo <- emmeans::emmeans(shelf.fit, ~shelfF)

# Where do differences in marginal means lie?
shelf.fit.cld <- multcomp::cld(shelf.fit.emmo)
shelf.fit.cld
##  shelfF emmean    SE df lower.CL upper.CL .group
##  1        5.11 0.937 73     3.24     6.97  1    
##  3        6.53 0.681 73     5.17     7.88  1    
##  2        9.62 0.891 73     7.84    11.40   2   
## 
## Confidence level used: 0.95 
## P value adjustment: tukey method for comparing a family of 3 estimates 
## significance level used: alpha = 0.05 
## NOTE: Compact letter displays can be misleading
##       because they show NON-findings rather than findings.
##       Consider using 'pairs()', 'pwpp()', or 'pwpm()' instead.
# You can now write ggplot() code to display the results or
# use my built-in functions (have a look at them)
sf.cld.plot.bar(shelf.fit.cld, "shelfF")+
  ggtitle("Mean sugars/serving by shelf (with 95% ci)")+
  xlab("Shelf")+ylab("Mean sugars/serving with 95% ci")

ggsave(file='../../MyStuff/Images/lm-cld-cereal-001.png',
       h=4, w=6, units="in",dpi=300)

sf.cld.plot.line(shelf.fit.cld, "shelfF")+
  ggtitle("Mean sugar/serving by shelf (with 95% ci)")+
  xlab("Shelf")+ylab("Mean sugars/serving with 95% ci")

ggsave(file='../../MyStuff/Images/lm-cld-cereal-002.png',
       h=4, w=6, units="in", dpi=300)


# Estimate all of the pairwise differences
pairs(shelf.fit.emmo)
##  contrast estimate   SE df t.ratio p.value
##  1 - 2       -4.51 1.29 73  -3.490  0.0023
##  1 - 3       -1.42 1.16 73  -1.228  0.4405
##  2 - 3        3.09 1.12 73   2.756  0.0199
## 
## P value adjustment: tukey method for comparing a family of 3 estimates
confint(pairs(shelf.fit.emmo))
##  contrast estimate   SE df lower.CL upper.CL
##  1 - 2       -4.51 1.29 73   -7.608    -1.42
##  1 - 3       -1.42 1.16 73   -4.193     1.35
##  2 - 3        3.09 1.12 73    0.408     5.77
## 
## Confidence level used: 0.95 
## Conf-level adjustment: tukey method for comparing a family of 3 estimates
summary(pairs(shelf.fit.emmo), infer=TRUE)
##  contrast estimate   SE df lower.CL upper.CL t.ratio p.value
##  1 - 2       -4.51 1.29 73   -7.608    -1.42  -3.490  0.0023
##  1 - 3       -1.42 1.16 73   -4.193     1.35  -1.228  0.4405
##  2 - 3        3.09 1.12 73    0.408     5.77   2.756  0.0199
## 
## Confidence level used: 0.95 
## Conf-level adjustment: tukey method for comparing a family of 3 estimates 
## P value adjustment: tukey method for comparing a family of 3 estimates
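# As the NOTE above suggests, the pairwise-p-value plot is a useful alternative
# to the compact letter display:
pwpp(shelf.fit.emmo)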
# Birds 'n Butts
# Is there a difference in the mean weight of butts among the 3 nest contents?

library(readxl)
butts <- readxl::read_excel('../sampledata/bird-butts-data.xlsx', 
              sheet='Correlational', 
              col_names=TRUE, 
              trim_ws=TRUE,
              skip=1,
              .name_repair="universal")
## New names:
## * `Nest content` -> Nest.content
## * `Butts wieght` -> Butts.wieght
## * `Number of mites` -> Number.of.mites
butts[1:5,]
## # A tibble: 5 × 5
##    Nest Species Nest.content Butts.wieght Number.of.mites
##   <dbl> <chr>   <chr>               <dbl>           <dbl>
## 1     1 HOSP    empty                6.13               4
## 2     2 HOSP    empty                3.73              30
## 3     3 HOSP    eggs                 0.06              84
## 4     4 HOSP    eggs                 8.3                2
## 5     5 HOSP    eggs                 0                 12
# Fix the names
names(butts)[ grepl('wieght', names(butts))] <- "Butts.weight"
butts[1:5,]
## # A tibble: 5 × 5
##    Nest Species Nest.content Butts.weight Number.of.mites
##   <dbl> <chr>   <chr>               <dbl>           <dbl>
## 1     1 HOSP    empty                6.13               4
## 2     2 HOSP    empty                3.73              30
## 3     3 HOSP    eggs                 0.06              84
## 4     4 HOSP    eggs                 8.3                2
## 5     5 HOSP    eggs                 0                 12
# Looks like we need a log-transform on the number of mites
butts$log.mites <- log(butts$Number.of.mites)
outlier <- butts[butts$log.mites<= 0,]
outlier
## # A tibble: 1 × 6
##    Nest Species Nest.content Butts.weight Number.of.mites log.mites
##   <dbl> <chr>   <chr>               <dbl>           <dbl>     <dbl>
## 1    56 HOFI    eggs                 4.46               1         0
# Declare Nest.content as a factor
butts$Nest.contentF <- factor(butts$Nest.content,
                    levels=c("eggs","chicks","empty"),
                    ordered=TRUE)
butts[1:3,]
## # A tibble: 3 × 7
##    Nest Species Nest.content Butts.weight Number.of.mites log.mites
##   <dbl> <chr>   <chr>               <dbl>           <dbl>     <dbl>
## 1     1 HOSP    empty                6.13               4      1.39
## 2     2 HOSP    empty                3.73              30      3.40
## 3     3 HOSP    eggs                 0.06              84      4.43
## # … with 1 more variable: Nest.contentF <ord>
str(butts)
## tibble [57 × 7] (S3: tbl_df/tbl/data.frame)
##  $ Nest           : num [1:57] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Species        : chr [1:57] "HOSP" "HOSP" "HOSP" "HOSP" ...
##  $ Nest.content   : chr [1:57] "empty" "empty" "eggs" "eggs" ...
##  $ Butts.weight   : num [1:57] 6.13 3.73 0.06 8.3 0 1.23 1.03 0 2.4 0.35 ...
##  $ Number.of.mites: num [1:57] 4 30 84 2 12 7 10 44 16 32 ...
##  $ log.mites      : num [1:57] 1.386 3.401 4.431 0.693 2.485 ...
##  $ Nest.contentF  : Ord.factor w/ 3 levels "eggs"<"chicks"<..: 3 3 1 1 1 2 2 3 2 2 ...
# Fit a single factor CRD anova
nc.fit <- lm(Butts.weight ~ Nest.contentF, data=butts)
str(nc.fit)
## List of 13
##  $ coefficients : Named num [1:3] 2.519 0.726 1.807
##   ..- attr(*, "names")= chr [1:3] "(Intercept)" "Nest.contentF.L" "Nest.contentF.Q"
##  $ residuals    : Named num [1:57] 2.3592 -0.0408 -2.6837 5.5563 -2.7437 ...
##   ..- attr(*, "names")= chr [1:57] "1" "2" "3" "4" ...
##  $ effects      : Named num [1:57] -20.83 -3.77 -7.18 5.47 -2.83 ...
##   ..- attr(*, "names")= chr [1:57] "(Intercept)" "Nest.contentF.L" "Nest.contentF.Q" "" ...
##  $ rank         : int 3
##  $ fitted.values: Named num [1:57] 3.77 3.77 2.74 2.74 2.74 ...
##   ..- attr(*, "names")= chr [1:57] "1" "2" "3" "4" ...
##  $ assign       : int [1:3] 0 1 1
##  $ qr           :List of 5
##   ..$ qr   : num [1:57, 1:3] -7.55 0.132 0.132 0.132 0.132 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr [1:57] "1" "2" "3" "4" ...
##   .. .. ..$ : chr [1:3] "(Intercept)" "Nest.contentF.L" "Nest.contentF.Q"
##   .. ..- attr(*, "assign")= int [1:3] 0 1 1
##   .. ..- attr(*, "contrasts")=List of 1
##   .. .. ..$ Nest.contentF: chr "contr.poly"
##   ..$ qraux: num [1:3] 1.13 1.12 1.09
##   ..$ pivot: int [1:3] 1 2 3
##   ..$ tol  : num 1e-07
##   ..$ rank : int 3
##   ..- attr(*, "class")= chr "qr"
##  $ df.residual  : int 54
##  $ contrasts    :List of 1
##   ..$ Nest.contentF: chr "contr.poly"
##  $ xlevels      :List of 1
##   ..$ Nest.contentF: chr [1:3] "eggs" "chicks" "empty"
##  $ call         : language lm(formula = Butts.weight ~ Nest.contentF, data = butts)
##  $ terms        :Classes 'terms', 'formula'  language Butts.weight ~ Nest.contentF
##   .. ..- attr(*, "variables")= language list(Butts.weight, Nest.contentF)
##   .. ..- attr(*, "factors")= int [1:2, 1] 0 1
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:2] "Butts.weight" "Nest.contentF"
##   .. .. .. ..$ : chr "Nest.contentF"
##   .. ..- attr(*, "term.labels")= chr "Nest.contentF"
##   .. ..- attr(*, "order")= int 1
##   .. ..- attr(*, "intercept")= int 1
##   .. ..- attr(*, "response")= int 1
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. ..- attr(*, "predvars")= language list(Butts.weight, Nest.contentF)
##   .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "ordered"
##   .. .. ..- attr(*, "names")= chr [1:2] "Butts.weight" "Nest.contentF"
##  $ model        :'data.frame':   57 obs. of  2 variables:
##   ..$ Butts.weight : num [1:57] 6.13 3.73 0.06 8.3 0 1.23 1.03 0 2.4 0.35 ...
##   ..$ Nest.contentF: Ord.factor w/ 3 levels "eggs"<"chicks"<..: 3 3 1 1 1 2 2 3 2 2 ...
##   ..- attr(*, "terms")=Classes 'terms', 'formula'  language Butts.weight ~ Nest.contentF
##   .. .. ..- attr(*, "variables")= language list(Butts.weight, Nest.contentF)
##   .. .. ..- attr(*, "factors")= int [1:2, 1] 0 1
##   .. .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. .. ..$ : chr [1:2] "Butts.weight" "Nest.contentF"
##   .. .. .. .. ..$ : chr "Nest.contentF"
##   .. .. ..- attr(*, "term.labels")= chr "Nest.contentF"
##   .. .. ..- attr(*, "order")= int 1
##   .. .. ..- attr(*, "intercept")= int 1
##   .. .. ..- attr(*, "response")= int 1
##   .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. .. ..- attr(*, "predvars")= language list(Butts.weight, Nest.contentF)
##   .. .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "ordered"
##   .. .. .. ..- attr(*, "names")= chr [1:2] "Butts.weight" "Nest.contentF"
##  - attr(*, "class")= chr "lm"
anova(nc.fit) # CAUTION - Type I (incremental)
## Analysis of Variance Table
## 
## Response: Butts.weight
##               Df Sum Sq Mean Sq F value  Pr(>F)  
## Nest.contentF  2  65.77  32.887   2.459 0.09505 .
## Residuals     54 722.20  13.374                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(nc.fit) # Not useful
## 
## Call:
## lm(formula = Butts.weight ~ Nest.contentF, data = butts)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.7708 -2.6837 -0.8036  1.1264 12.1163 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       2.5194     0.4963   5.077 4.88e-06 ***
## Nest.contentF.L   0.7263     0.7941   0.915   0.3644    
## Nest.contentF.Q   1.8075     0.9204   1.964   0.0547 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.657 on 54 degrees of freedom
## Multiple R-squared:  0.08347,    Adjusted R-squared:  0.04953 
## F-statistic: 2.459 on 2 and 54 DF,  p-value: 0.09505
# use the emmeans::emmeans() package to get the individual 
# means and the pairwise comparisons and plot them
library(emmeans)
nc.fit.emmo <- emmeans::emmeans(nc.fit, ~Nest.contentF)

# Where do differences in marginal means lie?
nc.fit.cld <- multcomp::cld(nc.fit.emmo)
nc.fit.cld
##  Nest.contentF emmean    SE df lower.CL upper.CL .group
##  chicks          1.04 0.977 54   -0.916     3.00  1    
##  eggs            2.74 0.839 54    1.062     4.43  1    
##  empty           3.77 0.746 54    2.274     5.27  1    
## 
## Confidence level used: 0.95 
## P value adjustment: tukey method for comparing a family of 3 estimates 
## significance level used: alpha = 0.05 
## NOTE: Compact letter displays can be misleading
##       because they show NON-findings rather than findings.
##       Consider using 'pairs()', 'pwpp()', or 'pwpm()' instead.
# You can now write ggplot() code to display the results or
# use my built-in functions (have a look at them)
sf.cld.plot.bar(nc.fit.cld, "Nest.contentF")+
  ggtitle("Mean Butts.weight by nest contents(with 95% ci)")+
  xlab("Nest Content")+ylab("Mean Butts.weight with 95% ci")

ggsave(file='../../MyStuff/Images/lm-cld-butts-001.png',
       h=4, w=6,unit="in", dpi=300)

sf.cld.plot.line(nc.fit.cld, "Nest.contentF")+
  ggtitle("Mean Butts.weight by nest content (with 95% ci)")+
  xlab("Nest Content")+ylab("Mean Butts.weight with 95% ci")

ggsave(file='../../MyStuff/Images/lm-cld-butts-002.png',
       h=4, w=6,unit="in", dpi=300)


# Estimate all of the pairwise differences
pairs(nc.fit.emmo)
##  contrast       estimate   SE df t.ratio p.value
##  eggs - chicks      1.70 1.29 54   1.320  0.3905
##  eggs - empty      -1.03 1.12 54  -0.915  0.6335
##  chicks - empty    -2.73 1.23 54  -2.218  0.0772
## 
## P value adjustment: tukey method for comparing a family of 3 estimates
confint(pairs(nc.fit.emmo))
##  contrast       estimate   SE df lower.CL upper.CL
##  eggs - chicks      1.70 1.29 54    -1.40    4.804
##  eggs - empty      -1.03 1.12 54    -3.73    1.679
##  chicks - empty    -2.73 1.23 54    -5.69    0.237
## 
## Confidence level used: 0.95 
## Conf-level adjustment: tukey method for comparing a family of 3 estimates
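# The estimated means with Tukey comparison arrows can also be plotted directly
# from the emmeans object (a quick visual alternative to the CLD table above):
plot(nc.fit.emmo, comparisons=TRUE)+
  xlab("Mean Butts.weight")+ylab("Nest content")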
#  CRD ANCOVA

# Is the relationship between calories and fat the same for all display shelves?

# Declare shelf as a factor
cereal$shelfF <- factor(cereal$shelf)
cereal[1:3,]
##                name mfr type calories protein fat sodium fiber carbo sugars
## 1         100%_Bran   N    C       60       4   1    130    10     5      6
## 2 100%_Natural_Bran   Q    C      110       3   5     15     2     8      8
## 3          All-Bran   K    C       80       4   1    260     9     7      5
##   shelf potass vitamins weight  cups shelfF
## 1     3    280       25      1 0.331      3
## 2     3    135        0      1    NA      3
## 3     3    320       25      1 0.330      3
str(cereal)
## 'data.frame':    77 obs. of  16 variables:
##  $ name    : chr  "100%_Bran" "100%_Natural_Bran" "All-Bran" "All-Bran_with_Extra_Fiber" ...
##  $ mfr     : chr  "N" "Q" "K" "K" ...
##  $ type    : chr  "C" "C" "C" "C" ...
##  $ calories: int  60 110 80 50 110 110 110 140 90 90 ...
##  $ protein : int  4 3 4 4 2 2 2 3 2 3 ...
##  $ fat     : int  1 5 1 0 2 2 0 2 1 0 ...
##  $ sodium  : int  130 15 260 140 200 180 125 210 200 210 ...
##  $ fiber   : num  10 2 9 14 1 1.5 1 2 4 5 ...
##  $ carbo   : num  5 8 7 8 14 10.5 11 18 15 13 ...
##  $ sugars  : int  6 8 5 0 8 10 14 8 6 5 ...
##  $ shelf   : int  3 3 3 3 3 1 2 3 1 3 ...
##  $ potass  : int  280 135 320 330 NA 70 30 100 125 190 ...
##  $ vitamins: int  25 0 25 25 25 25 25 25 25 25 ...
##  $ weight  : num  1 1 1 1 1 1 1 1.33 1 1 ...
##  $ cups    : num  0.331 NA 0.33 0.5 0.75 0.75 1 0.75 0.67 0.67 ...
##  $ shelfF  : Factor w/ 3 levels "1","2","3": 3 3 3 3 3 1 2 3 1 3 ...
# Preliminary plots
prelimplot <- ggplot(data=cereal, aes(x=fat, y=calories, color=shelfF))+
    ggtitle("Compare calories vs fat by shelf")+
    xlab("Calories")+ylab("Calories")+
    geom_point(position=position_jitter(width=0.05))+
    geom_smooth(method="lm", se=FALSE)
prelimplot
## `geom_smooth()` using formula 'y ~ x'

ggsave(prelimplot, file='../../MyStuff/Images/lm-ancova-prelim-cereal-001.png',
       h=4, w=6, units="in", dpi=300)
## `geom_smooth()` using formula 'y ~ x'
# Fit a single factor CRD ancova
library(car)
shelf.fit <- lm(calories ~ shelfF + fat + shelfF:fat, data=cereal,
                contrasts=list(shelfF=contr.sum))
anova(shelf.fit) # CAUTION - Type I (incremental)
## Analysis of Variance Table
## 
## Response: calories
##            Df  Sum Sq Mean Sq F value    Pr(>F)    
## shelfF      2   593.2   296.6  0.7875    0.4589    
## fat         1  7025.6  7025.6 18.6546 4.997e-05 ***
## shelfF:fat  2  1166.3   583.1  1.5484    0.2197    
## Residuals  71 26739.6   376.6                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
car::Anova(shelf.fit, type=3) # marginal tests
## Anova Table (Type III tests)
## 
## Response: calories
##             Sum Sq Df  F value    Pr(>F)    
## (Intercept) 303495  1 805.8500 < 2.2e-16 ***
## shelfF         572  2   0.7592  0.471782    
## fat           2693  1   7.1495  0.009298 ** 
## shelfF:fat    1166  2   1.5484  0.219687    
## Residuals    26740 71                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# summary table is useless
summary(shelf.fit)
## 
## Call:
## lm(formula = calories ~ shelfF + fat + shelfF:fat, data = cereal, 
##     contrasts = list(shelfF = contr.sum))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -51.368 -10.278  -0.278   9.722  46.838 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   96.144      3.387  28.387   <2e-16 ***
## shelfF1        4.134      4.683   0.883   0.3803    
## shelfF2        0.642      5.283   0.122   0.9036    
## fat            7.666      2.867   2.674   0.0093 ** 
## shelfF1:fat   -7.296      4.455  -1.638   0.1059    
## shelfF2:fat    3.167      4.322   0.733   0.4661    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.41 on 71 degrees of freedom
## Multiple R-squared:  0.2473, Adjusted R-squared:  0.1943 
## F-statistic: 4.665 on 5 and 71 DF,  p-value: 0.0009712
# use emmeans::emtrends() to get the shelf-specific slopes (fat trends)
# and compare them
library(emmeans)
shelf.fit.emmo <- emmeans::emtrends(shelf.fit, ~shelfF, var="fat")
shelf.fit.cld <- multcomp::cld(shelf.fit.emmo)
shelf.fit.cld
##  shelfF fat.trend   SE df lower.CL upper.CL .group
##  1           0.37 5.91 71  -11.404     12.1  1    
##  2          10.83 5.60 71   -0.337     22.0  1    
##  3          11.79 2.78 71    6.253     17.3  1    
## 
## Confidence level used: 0.95 
## P value adjustment: tukey method for comparing a family of 3 estimates 
## significance level used: alpha = 0.05 
## NOTE: Compact letter displays can be misleading
##       because they show NON-findings rather than findings.
##       Consider using 'pairs()', 'pwpp()', or 'pwpm()' instead.
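# The shelf-specific slopes estimated by emtrends() can also be compared pairwise:
pairs(shelf.fit.emmo)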
# Fit the parallel slope model
library(car)
shelf.fit2 <- lm(calories ~ shelfF + fat, data=cereal,
                contrasts=list(shelfF=contr.sum))
car::Anova(shelf.fit2, type=3) # marginal tests
## Anova Table (Type III tests)
## 
## Response: calories
##             Sum Sq Df  F value    Pr(>F)    
## (Intercept) 342333  1 895.5195 < 2.2e-16 ***
## shelfF         216  2   0.2823    0.7548    
## fat           7026  1  18.3785 5.457e-05 ***
## Residuals    27906 73                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# summary table is useless except for estimate of common slope
summary(shelf.fit2)
## 
## Call:
## lm(formula = calories ~ shelfF + fat, data = cereal, contrasts = list(shelfF = contr.sum))
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -53.73  -7.71   2.29  12.29  46.46 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  95.3297     3.1856  29.925  < 2e-16 ***
## shelfF1      -0.7752     3.5145  -0.221    0.826    
## shelfF2       2.3802     3.3775   0.705    0.483    
## fat           9.9092     2.3114   4.287 5.46e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.55 on 73 degrees of freedom
## Multiple R-squared:  0.2145, Adjusted R-squared:  0.1822 
## F-statistic: 6.643 on 3 and 73 DF,  p-value: 0.0004979
# use the emmeans::emmeans() package to get the individual 
# means and the pairwise comparisons and plot them
library(emmeans)
shelf.fit2.emmo <- emmeans::emmeans(shelf.fit2, ~shelfF)

# Where do differences in marginal means lie?
shelf.fit2.cld <- multcomp::cld(shelf.fit2.emmo)
shelf.fit2.cld
##  shelfF emmean   SE df lower.CL upper.CL .group
##  3         104 3.30 73     97.2      110  1    
##  1         105 4.47 73     95.7      114  1    
##  2         108 4.27 73     99.2      116  1    
## 
## Confidence level used: 0.95 
## P value adjustment: tukey method for comparing a family of 3 estimates 
## significance level used: alpha = 0.05 
## NOTE: Compact letter displays can be misleading
##       because they show NON-findings rather than findings.
##       Consider using 'pairs()', 'pwpp()', or 'pwpm()' instead.
# Birds 'n Butts
# Is the relationship between log(Number of mites) and Butts weight the same for all 3 nest contents?


library(readxl)
butts <- readxl::read_excel('../sampledata/bird-butts-data.xlsx', 
                            sheet='Correlational', 
                            col_names=TRUE,
                            trim_ws=TRUE,
                            skip=1,
                            .name_repair="universal")
## New names:
## * `Nest content` -> Nest.content
## * `Butts wieght` -> Butts.wieght
## * `Number of mites` -> Number.of.mites
butts[1:5,]
## # A tibble: 5 × 5
##    Nest Species Nest.content Butts.wieght Number.of.mites
##   <dbl> <chr>   <chr>               <dbl>           <dbl>
## 1     1 HOSP    empty                6.13               4
## 2     2 HOSP    empty                3.73              30
## 3     3 HOSP    eggs                 0.06              84
## 4     4 HOSP    eggs                 8.3                2
## 5     5 HOSP    eggs                 0                 12
# Fix the names
names(butts)[ grepl('wieght', names(butts))] <- "Butts.weight"
butts[1:5,]
## # A tibble: 5 × 5
##    Nest Species Nest.content Butts.weight Number.of.mites
##   <dbl> <chr>   <chr>               <dbl>           <dbl>
## 1     1 HOSP    empty                6.13               4
## 2     2 HOSP    empty                3.73              30
## 3     3 HOSP    eggs                 0.06              84
## 4     4 HOSP    eggs                 8.3                2
## 5     5 HOSP    eggs                 0                 12
# Looks like we need a log-transform on the number of mites
butts$log.mites <- log(butts$Number.of.mites)
outlier <- butts[butts$log.mites<= 0,]
outlier
## # A tibble: 1 × 6
##    Nest Species Nest.content Butts.weight Number.of.mites log.mites
##   <dbl> <chr>   <chr>               <dbl>           <dbl>     <dbl>
## 1    56 HOFI    eggs                 4.46               1         0
# Declare Nest.content as a factor
butts$Nest.contentF <- factor(butts$Nest.content,
                    levels=c("eggs","chicks","empty"),
                    ordered=TRUE)
butts[1:3,]
## # A tibble: 3 × 7
##    Nest Species Nest.content Butts.weight Number.of.mites log.mites
##   <dbl> <chr>   <chr>               <dbl>           <dbl>     <dbl>
## 1     1 HOSP    empty                6.13               4      1.39
## 2     2 HOSP    empty                3.73              30      3.40
## 3     3 HOSP    eggs                 0.06              84      4.43
## # … with 1 more variable: Nest.contentF <ord>
str(butts)
## tibble [57 × 7] (S3: tbl_df/tbl/data.frame)
##  $ Nest           : num [1:57] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Species        : chr [1:57] "HOSP" "HOSP" "HOSP" "HOSP" ...
##  $ Nest.content   : chr [1:57] "empty" "empty" "eggs" "eggs" ...
##  $ Butts.weight   : num [1:57] 6.13 3.73 0.06 8.3 0 1.23 1.03 0 2.4 0.35 ...
##  $ Number.of.mites: num [1:57] 4 30 84 2 12 7 10 44 16 32 ...
##  $ log.mites      : num [1:57] 1.386 3.401 4.431 0.693 2.485 ...
##  $ Nest.contentF  : Ord.factor w/ 3 levels "eggs"<"chicks"<..: 3 3 1 1 1 2 2 3 2 2 ...
# preliminary plot
plot <- ggplot(data=butts, aes(x=Butts.weight, y=log.mites, color=Nest.contentF))+
   ggtitle("Non-parallel slopes")+
   geom_point( position=position_jitter(h=.1, w=.1))+
   geom_smooth(method="lm", se=FALSE)
plot
## `geom_smooth()` using formula 'y ~ x'

ggsave(plot=plot, file='../../MyStuff/Images/lm-ancova-butts-001.png',
       h=4, w=6,unit="in", dpi=300)
## `geom_smooth()` using formula 'y ~ x'
# Fit a single factor CRD ancova
np.fit <- lm(log.mites ~ Nest.contentF + Butts.weight + Butts.weight:Nest.contentF, data=butts,
              contrasts=list(Nest.contentF=contr.sum))
car::Anova(np.fit, type=3)
## Anova Table (Type III tests)
## 
## Response: log.mites
##                             Sum Sq Df  F value Pr(>F)    
## (Intercept)                288.693  1 446.7996 <2e-16 ***
## Nest.contentF                0.654  2   0.5063 0.6057    
## Butts.weight                 0.729  1   1.1281 0.2932    
## Nest.contentF:Butts.weight   1.024  2   0.7925 0.4582    
## Residuals                   32.953 51                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# use emmeans::emtrends() to get the nest-content-specific slopes
# and plot them
library(emmeans)
np.fit.emmo <- emmeans::emtrends(np.fit, ~Nest.contentF, var='Butts.weight')
np.fit.cld <- multcomp::cld(np.fit.emmo)
np.fit.cld
##  Nest.contentF Butts.weight.trend     SE df lower.CL upper.CL .group
##  eggs                      -0.226 0.0471 51   -0.321   -0.132  1    
##  empty                     -0.197 0.0391 51   -0.276   -0.119  1    
##  chicks                     0.119 0.2802 51   -0.443    0.682  1    
## 
## Confidence level used: 0.95 
## P value adjustment: tukey method for comparing a family of 3 estimates 
## significance level used: alpha = 0.05 
## NOTE: Compact letter displays can be misleading
##       because they show NON-findings rather than findings.
##       Consider using 'pairs()', 'pwpp()', or 'pwpm()' instead.
# Fit a single factor CRD ancova with parallel slopes
p.fit <- lm(log.mites ~ Nest.contentF + Butts.weight, data=butts,
              contrasts=list(Nest.contentF=contr.sum))
car::Anova(p.fit, type=3)
## Anova Table (Type III tests)
## 
## Response: log.mites
##               Sum Sq Df  F value    Pr(>F)    
## (Intercept)   447.86  1 698.6061 < 2.2e-16 ***
## Nest.contentF   0.09  2   0.0716     0.931    
## Butts.weight   30.48  1  47.5405 6.657e-09 ***
## Residuals      33.98 53                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(p.fit)
## 
## Call:
## lm(formula = log.mites ~ Nest.contentF + Butts.weight, data = butts, 
##     contrasts = list(Nest.contentF = contr.sum))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.54886 -0.52424  0.04152  0.55431  1.38048 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     3.49042    0.13206  26.431  < 2e-16 ***
## Nest.contentF1 -0.02536    0.15198  -0.167    0.868    
## Nest.contentF2 -0.03067    0.17030  -0.180    0.858    
## Butts.weight   -0.20543    0.02979  -6.895 6.66e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8007 on 53 degrees of freedom
## Multiple R-squared:  0.4888, Adjusted R-squared:  0.4598 
## F-statistic: 16.89 on 3 and 53 DF,  p-value: 7.959e-08
# use the emmeans::emmeans() package to get the individual 
# means and the pairwise comparisons and plot them
library(emmeans)
p.fit.emmo <- emmeans::emmeans(p.fit, ~Nest.contentF)
p.fit.cld <- multcomp::cld(p.fit.emmo)
p.fit.cld
##  Nest.contentF emmean    SE df lower.CL upper.CL .group
##  chicks          2.89 0.220 53     2.45     3.33  1    
##  eggs            2.90 0.184 53     2.53     3.27  1    
##  empty           2.98 0.166 53     2.65     3.31  1    
## 
## Confidence level used: 0.95 
## P value adjustment: tukey method for comparing a family of 3 estimates 
## significance level used: alpha = 0.05 
## NOTE: Compact letter displays can be misleading
##       because they show NON-findings rather than findings.
##       Consider using 'pairs()', 'pwpp()', or 'pwpm()' instead.
# Logistic ANOVA
#
# Does the PROPORTION of accidents that result in a fatality differ by day of the week?

# Read in the accidents dataset
radf <- read.csv(file.path("..","sampledata","Accidents","road-accidents-2010.csv"),
                header=TRUE,
                as.is=TRUE,
                strip.white=TRUE)
dim(radf)
## [1] 154414     32
names(radf)
##  [1] "Accident_Index"                             
##  [2] "Location_Easting_OSGR"                      
##  [3] "Location_Northing_OSGR"                     
##  [4] "Longitude"                                  
##  [5] "Latitude"                                   
##  [6] "Police_Force"                               
##  [7] "Accident_Severity"                          
##  [8] "Number_of_Vehicles"                         
##  [9] "Number_of_Casualties"                       
## [10] "Date"                                       
## [11] "Day_of_Week"                                
## [12] "Time"                                       
## [13] "Local_Authority_.District."                 
## [14] "Local_Authority_.Highway."                  
## [15] "X1st_Road_Class"                            
## [16] "X1st_Road_Number"                           
## [17] "Road_Type"                                  
## [18] "Speed_limit"                                
## [19] "Junction_Detail"                            
## [20] "Junction_Control"                           
## [21] "X2nd_Road_Class"                            
## [22] "X2nd_Road_Number"                           
## [23] "Pedestrian_Crossing.Human_Control"          
## [24] "Pedestrian_Crossing.Physical_Facilities"    
## [25] "Light_Conditions"                           
## [26] "Weather_Conditions"                         
## [27] "Road_Surface_Conditions"                    
## [28] "Special_Conditions_at_Site"                 
## [29] "Carriageway_Hazards"                        
## [30] "Urban_or_Rural_Area"                        
## [31] "Did_Police_Officer_Attend_Scene_of_Accident"
## [32] "LSOA_of_Accident_Location"
# Declare day of week as a factor
radf$Day_of_WeekF <- factor(radf$Day_of_Week)

# Recode Accident_Severity as fatal (code 1) or not (codes 2 and 3)
xtabs(~Accident_Severity, data=radf)
## Accident_Severity
##      1      2      3 
##   1731  20440 132243
library(car)
radf$fatal <- recode(radf$Accident_Severity,
              ' 1="yes";2:3="no"  ')
xtabs(~fatal+Accident_Severity, data=radf)
##      Accident_Severity
## fatal      1      2      3
##   no       0  20440 132243
##   yes   1731      0      0
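# An equivalent base-R recode (a sketch only; it re-creates the same fatal
# variable as the car::recode() call above):
radf$fatal <- ifelse(radf$Accident_Severity == 1, "yes", "no")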
# The response also needs to be declared as a factor (because it is categorical)
radf$fatalF <- factor(radf$fatal, levels=c("no","yes"), ordered=TRUE)

# Preliminary tabulation
xtabs(~fatalF+Day_of_WeekF, data=radf)
##       Day_of_WeekF
## fatalF     1     2     3     4     5     6     7
##    no  16489 22231 22819 22808 22602 25214 20520
##    yes   305   221   226   210   208   261   300
round(prop.table(xtabs(~fatalF+Day_of_WeekF, data=radf),2),3)
##       Day_of_WeekF
## fatalF     1     2     3     4     5     6     7
##    no  0.982 0.990 0.990 0.991 0.991 0.990 0.986
##    yes 0.018 0.010 0.010 0.009 0.009 0.010 0.014
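# A quick preliminary check of the same question with a chi-square test on the
# two-way table (a sketch; the glm fit below gives the same kind of evidence
# plus estimates and pairwise comparisons):
chisq.test(xtabs(~fatalF+Day_of_WeekF, data=radf))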
# Now to fit the glm model
fatal.fit <- glm(fatalF ~ Day_of_WeekF, family=binomial, data=radf)

anova(fatal.fit, test='Chi') # CAUTION Type I tests
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: fatalF
## 
## Terms added sequentially (first to last)
## 
## 
##              Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                        154413      18990              
## Day_of_WeekF  6   108.98    154407      18881 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
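# With only one term in the model the sequential (Type I) and marginal tests
# coincide, but for models with several terms the marginal (Type III) tests
# from car::Anova() are usually what is wanted, as in the lm() fits earlier.
# A sketch (output not shown):
car::Anova(fatal.fit, type=3, test.statistic="LR")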
summary(fatal.fit) # not very useful - coefficients are differences in logit(p) from day 1
## 
## Call:
## glm(formula = fatalF ~ Day_of_WeekF, family = binomial, data = radf)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.1915  -0.1435  -0.1407  -0.1354   3.0651  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -3.99014    0.05779 -69.049  < 2e-16 ***
## Day_of_WeekF2 -0.62094    0.08893  -6.982 2.91e-12 ***
## Day_of_WeekF3 -0.62468    0.08836  -7.070 1.55e-12 ***
## Day_of_WeekF4 -0.69762    0.09025  -7.730 1.08e-14 ***
## Day_of_WeekF5 -0.69812    0.09050  -7.714 1.22e-14 ***
## Day_of_WeekF6 -0.58050    0.08491  -6.836 8.13e-12 ***
## Day_of_WeekF7 -0.23524    0.08198  -2.869  0.00411 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 18990  on 154413  degrees of freedom
## Residual deviance: 18881  on 154407  degrees of freedom
## AIC: 18895
## 
## Number of Fisher Scoring iterations: 7
# We get the emmeans (even though these are logits of proportions)
fatal.fit.emmo <- emmeans::emmeans(fatal.fit, ~Day_of_WeekF)

# where do the p(fatal) differ?
fatal.fit.cld <- multcomp::cld(fatal.fit.emmo)
fatal.fit.cld
##  Day_of_WeekF emmean     SE  df asymp.LCL asymp.UCL .group
##  5             -4.69 0.0697 Inf     -4.82     -4.55  1    
##  4             -4.69 0.0693 Inf     -4.82     -4.55  1    
##  3             -4.61 0.0668 Inf     -4.75     -4.48  1    
##  2             -4.61 0.0676 Inf     -4.74     -4.48  1    
##  6             -4.57 0.0622 Inf     -4.69     -4.45  1    
##  7             -4.23 0.0582 Inf     -4.34     -4.11   2   
##  1             -3.99 0.0578 Inf     -4.10     -3.88   2   
## 
## Results are given on the logit (not the response) scale. 
## Confidence level used: 0.95 
## Results are given on the log odds ratio (not the response) scale. 
## P value adjustment: tukey method for comparing a family of 7 estimates 
## significance level used: alpha = 0.05 
## NOTE: Compact letter displays can be misleading
##       because they show NON-findings rather than findings.
##       Consider using 'pairs()', 'pwpp()', or 'pwpm()' instead.
multcomp::cld(fatal.fit.emmo, type="response")
##  Day_of_WeekF    prob       SE  df asymp.LCL asymp.UCL .group
##  5            0.00912 0.000629 Inf   0.00796    0.0104  1    
##  4            0.00912 0.000627 Inf   0.00797    0.0104  1    
##  3            0.00981 0.000649 Inf   0.00861    0.0112  1    
##  2            0.00984 0.000659 Inf   0.00863    0.0112  1    
##  6            0.01025 0.000631 Inf   0.00908    0.0116  1    
##  7            0.01441 0.000826 Inf   0.01288    0.0161   2   
##  1            0.01816 0.001030 Inf   0.01625    0.0203   2   
## 
## Confidence level used: 0.95 
## Intervals are back-transformed from the logit scale 
## P value adjustment: tukey method for comparing a family of 7 estimates 
## Tests are performed on the log odds ratio scale 
## significance level used: alpha = 0.05 
## NOTE: Compact letter displays can be misleading
##       because they show NON-findings rather than findings.
##       Consider using 'pairs()', 'pwpp()', or 'pwpm()' instead.
sf.cld.plot.bar(fatal.fit.cld, "Day_of_WeekF")+
  ggtitle("logit(fatality) by day of week (with 95% ci)")+
  xlab("Day of Week (1=Sunday)")+ylab("logit(fatality rate) with 95% ci")

ggsave(file='../../MyStuff/Images/glm-cld-radf-001.png',
       h=4, w=6, units="in", dpi=300)

sf.cld.plot.line(fatal.fit.cld, "Day_of_WeekF")+
  ggtitle("logit(fatality) by day of week (with 95% ci)")+
  xlab("Day of Week (1=Sunday)")+ylab("logit(fatality rate) with 95% ci")

ggsave(file='../../MyStuff/Images/glm-cld-radf-002.png',
       h=4, w=6, units="in", dpi=300)


summary(fatal.fit.emmo, type="response")
##  Day_of_WeekF    prob       SE  df asymp.LCL asymp.UCL
##  1            0.01816 0.001030 Inf   0.01625    0.0203
##  2            0.00984 0.000659 Inf   0.00863    0.0112
##  3            0.00981 0.000649 Inf   0.00861    0.0112
##  4            0.00912 0.000627 Inf   0.00797    0.0104
##  5            0.00912 0.000629 Inf   0.00796    0.0104
##  6            0.01025 0.000631 Inf   0.00908    0.0116
##  7            0.01441 0.000826 Inf   0.01288    0.0161
## 
## Confidence level used: 0.95 
## Intervals are back-transformed from the logit scale
# Estimate the odds ratio
pairs(fatal.fit.emmo)
##  contrast  estimate     SE  df z.ratio p.value
##  1 - 2     0.620943 0.0889 Inf   6.982  <.0001
##  1 - 3     0.624677 0.0884 Inf   7.070  <.0001
##  1 - 4     0.697622 0.0902 Inf   7.730  <.0001
##  1 - 5     0.698119 0.0905 Inf   7.714  <.0001
##  1 - 6     0.580497 0.0849 Inf   6.836  <.0001
##  1 - 7     0.235236 0.0820 Inf   2.869  0.0625
##  2 - 3     0.003733 0.0951 Inf   0.039  1.0000
##  2 - 4     0.076679 0.0968 Inf   0.792  0.9858
##  2 - 5     0.077175 0.0971 Inf   0.795  0.9855
##  2 - 6    -0.040446 0.0919 Inf  -0.440  0.9995
##  2 - 7    -0.385707 0.0892 Inf  -4.325  0.0003
##  3 - 4     0.072945 0.0963 Inf   0.757  0.9888
##  3 - 5     0.073442 0.0965 Inf   0.761  0.9885
##  3 - 6    -0.044179 0.0913 Inf  -0.484  0.9991
##  3 - 7    -0.389441 0.0886 Inf  -4.395  0.0002
##  4 - 5     0.000496 0.0983 Inf   0.005  1.0000
##  4 - 6    -0.117125 0.0931 Inf  -1.257  0.8712
##  4 - 7    -0.462386 0.0905 Inf  -5.110  <.0001
##  5 - 6    -0.117621 0.0934 Inf  -1.259  0.8704
##  5 - 7    -0.462883 0.0907 Inf  -5.101  <.0001
##  6 - 7    -0.345261 0.0852 Inf  -4.054  0.0010
## 
## Results are given on the log odds ratio (not the response) scale. 
## P value adjustment: tukey method for comparing a family of 7 estimates
summary(pairs(fatal.fit.emmo), infer=TRUE, type="response")
##  contrast odds.ratio     SE  df asymp.LCL asymp.UCL null z.ratio p.value
##  1 / 2         1.861 0.1655 Inf     1.432     2.418    1   6.982  <.0001
##  1 / 3         1.868 0.1650 Inf     1.439     2.423    1   7.070  <.0001
##  1 / 4         2.009 0.1813 Inf     1.540     2.621    1   7.730  <.0001
##  1 / 5         2.010 0.1819 Inf     1.539     2.625    1   7.714  <.0001
##  1 / 6         1.787 0.1517 Inf     1.391     2.295    1   6.836  <.0001
##  1 / 7         1.265 0.1037 Inf     0.994     1.611    1   2.869  0.0625
##  2 / 3         1.004 0.0954 Inf     0.758     1.328    1   0.039  1.0000
##  2 / 4         1.080 0.1045 Inf     0.812     1.436    1   0.792  0.9858
##  2 / 5         1.080 0.1049 Inf     0.811     1.438    1   0.795  0.9855
##  2 / 6         0.960 0.0882 Inf     0.732     1.259    1  -0.440  0.9995
##  2 / 7         0.680 0.0606 Inf     0.523     0.884    1  -4.325  0.0003
##  3 / 4         1.076 0.1036 Inf     0.810     1.429    1   0.757  0.9888
##  3 / 5         1.076 0.1039 Inf     0.810     1.431    1   0.761  0.9885
##  3 / 6         0.957 0.0874 Inf     0.731     1.252    1  -0.484  0.9991
##  3 / 7         0.677 0.0600 Inf     0.522     0.880    1  -4.395  0.0002
##  4 / 5         1.000 0.0983 Inf     0.749     1.337    1   0.005  1.0000
##  4 / 6         0.889 0.0829 Inf     0.676     1.171    1  -1.257  0.8712
##  4 / 7         0.630 0.0570 Inf     0.482     0.822    1  -5.110  <.0001
##  5 / 6         0.889 0.0830 Inf     0.675     1.171    1  -1.259  0.8704
##  5 / 7         0.629 0.0571 Inf     0.482     0.823    1  -5.101  <.0001
##  6 / 7         0.708 0.0603 Inf     0.551     0.910    1  -4.054  0.0010
## 
## Confidence level used: 0.95 
## Conf-level adjustment: tukey method for comparing a family of 7 estimates 
## Intervals are back-transformed from the log odds ratio scale 
## P value adjustment: tukey method for comparing a family of 7 estimates 
## Tests are performed on the log odds ratio scale
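# Because the fatality probabilities are small, the odds ratios above are
# close to risk ratios. If differences on the probability scale are wanted
# instead, a sketch using emmeans::regrid() to move the grid to the response
# scale before comparing (output not shown):
pairs(emmeans::regrid(fatal.fit.emmo))   # tukey-adjusted differences in p(fatal)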
# Exercise - look at the proportion of fatal accidents by weather conditions
dim(radf)
## [1] 154414     35
radf2 <- radf[ radf$Weather_Conditions %in% 1:6,]
dim(radf2)
## [1] 146024     35
# Define the weather conditions as a factor
radf2$WeatherF <- factor(radf2$Weather_Conditions)
xtabs(~fatalF + WeatherF, data=radf2)
##       WeatherF
## fatalF      1      2      3      4      5      6
##    no  121961  16327   3256   1184   1301    345
##    yes   1435    158     25     18     13      1
round(prop.table(xtabs(~fatalF+WeatherF, data=radf2),2),3)
##       WeatherF
## fatalF     1     2     3     4     5     6
##    no  0.988 0.990 0.992 0.985 0.990 0.997
##    yes 0.012 0.010 0.008 0.015 0.010 0.003
# Now to fit the glm model
fatal.fit <- glm(fatalF ~ WeatherF, family=binomial, data=radf2)

anova(fatal.fit, test='Chi') # CAUTION Type I tests
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: fatalF
## 
## Terms added sequentially (first to last)
## 
## 
##          Df Deviance Resid. Df Resid. Dev Pr(>Chi)  
## NULL                    146023      18075           
## WeatherF  5   14.934    146018      18060  0.01065 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(fatal.fit) # not useful
## 
## Call:
## glm(formula = fatalF ~ WeatherF, family = binomial, data = radf2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.1737  -0.1530  -0.1530  -0.1530   3.4195  
## 
## Coefficients:
##             Estimate Std. Error  z value Pr(>|z|)    
## (Intercept) -4.44254    0.02655 -167.308   <2e-16 ***
## WeatherF2   -0.19544    0.08423   -2.320   0.0203 *  
## WeatherF3   -0.42684    0.20251   -2.108   0.0351 *  
## WeatherF4    0.25625    0.23897    1.072   0.2836    
## WeatherF5   -0.16340    0.27999   -0.584   0.5595    
## WeatherF6   -1.40101    1.00174   -1.399   0.1619    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 18075  on 146023  degrees of freedom
## Residual deviance: 18060  on 146018  degrees of freedom
## AIC: 18072
## 
## Number of Fisher Scoring iterations: 8
# We get the emmeans (even though these are logits of proportions)
fatal.fit.emmo <- emmeans::emmeans(fatal.fit, ~WeatherF)

# where do the p(fatal) differ?
fatal.fit.cld <- multcomp::cld(fatal.fit.emmo)
fatal.fit.cld  # interesting what happens
##  WeatherF emmean     SE  df asymp.LCL asymp.UCL .group
##  6         -5.84 1.0014 Inf     -7.81     -3.88  1    
##  3         -4.87 0.2008 Inf     -5.26     -4.48  1    
##  2         -4.64 0.0799 Inf     -4.79     -4.48  1    
##  5         -4.61 0.2787 Inf     -5.15     -4.06  1    
##  1         -4.44 0.0266 Inf     -4.49     -4.39  1    
##  4         -4.19 0.2375 Inf     -4.65     -3.72  1    
## 
## Results are given on the logit (not the response) scale. 
## Confidence level used: 0.95 
## Results are given on the log odds ratio (not the response) scale. 
## P value adjustment: tukey method for comparing a family of 6 estimates 
## significance level used: alpha = 0.05 
## NOTE: Compact letter displays can be misleading
##       because they show NON-findings rather than findings.
##       Consider using 'pairs()', 'pwpp()', or 'pwpm()' instead.
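# The letters lump all six weather conditions into one group even though the
# overall test was (weakly) significant; no single tukey-adjusted pairwise
# comparison is strong enough on its own. A pairwise p-value plot, as the NOTE
# suggests, shows where the (weak) evidence lies. A sketch (plot not shown):
emmeans::pwpp(fatal.fit.emmo)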
sf.cld.plot.bar(fatal.fit.cld, "WeatherF")+
  ggtitle("logit(fatality) by Weather Conditions (with 95% ci)")+
  xlab("Weather Conditions")+ylab("logit(fatality rate) with 95% ci")

ggsave(file='../../MyStuff/Images/glm-cld-radf-weather-001.png',
    h=4, w=6, units='in', dpi=300)

sf.cld.plot.line(fatal.fit.cld, "WeatherF")+
  ggtitle("logit(fatality) by Weather Conditions (with 95% ci)")+
  xlab("Weather Conditions")+ylab("logit(fatality rate) with 95% ci")

ggsave(file='../../MyStuff/Images/glm-cld-radf-weather-002.png',
    h=4, w=6, units='in', dpi=300)
# Exercise - look at the proportion of fatal accidents by speed limit
dim(radf)
## [1] 154414     35
names(radf)
##  [1] "Accident_Index"                             
##  [2] "Location_Easting_OSGR"                      
##  [3] "Location_Northing_OSGR"                     
##  [4] "Longitude"                                  
##  [5] "Latitude"                                   
##  [6] "Police_Force"                               
##  [7] "Accident_Severity"                          
##  [8] "Number_of_Vehicles"                         
##  [9] "Number_of_Casualties"                       
## [10] "Date"                                       
## [11] "Day_of_Week"                                
## [12] "Time"                                       
## [13] "Local_Authority_.District."                 
## [14] "Local_Authority_.Highway."                  
## [15] "X1st_Road_Class"                            
## [16] "X1st_Road_Number"                           
## [17] "Road_Type"                                  
## [18] "Speed_limit"                                
## [19] "Junction_Detail"                            
## [20] "Junction_Control"                           
## [21] "X2nd_Road_Class"                            
## [22] "X2nd_Road_Number"                           
## [23] "Pedestrian_Crossing.Human_Control"          
## [24] "Pedestrian_Crossing.Physical_Facilities"    
## [25] "Light_Conditions"                           
## [26] "Weather_Conditions"                         
## [27] "Road_Surface_Conditions"                    
## [28] "Special_Conditions_at_Site"                 
## [29] "Carriageway_Hazards"                        
## [30] "Urban_or_Rural_Area"                        
## [31] "Did_Police_Officer_Attend_Scene_of_Accident"
## [32] "LSOA_of_Accident_Location"                  
## [33] "Day_of_WeekF"                               
## [34] "fatal"                                      
## [35] "fatalF"
xtabs(~Speed_limit+fatalF, data=radf, exclude=NULL, na.action=na.pass)
##            fatalF
## Speed_limit    no   yes
##          10     1     1
##          15     1     0
##          20  1543     5
##          30 99716   545
##          40 12427   164
##          50  5094    99
##          60 22852   658
##          70 11049   259
round(prop.table(xtabs(~fatalF+Speed_limit, data=radf),2),3)
##       Speed_limit
## fatalF    10    15    20    30    40    50    60    70
##    no  0.500 1.000 0.997 0.995 0.987 0.981 0.972 0.977
##    yes 0.500 0.000 0.003 0.005 0.013 0.019 0.028 0.023
# Drop the 10 and 15 mph limits, which have only a handful of accidents
radf2 <- radf[ radf$Speed_limit >= 20,]
radf2$Speed_limitF <- factor(radf2$Speed_limit)

# Now to fit the glm model
fatal.fit <- glm(fatalF ~ Speed_limitF, family=binomial, data=radf2)

anova(fatal.fit, test='Chi') # CAUTION Type I tests
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: fatalF
## 
## Terms added sequentially (first to last)
## 
## 
##              Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                        154410      18981              
## Speed_limitF  5   941.24    154405      18040 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(fatal.fit) # not useful
## 
## Call:
## glm(formula = fatalF ~ Speed_limitF, family = binomial, data = radf2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.2383  -0.1962  -0.1044  -0.1044   3.3868  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -5.7320     0.4479 -12.797  < 2e-16 ***
## Speed_limitF30   0.5228     0.4500   1.162  0.24535    
## Speed_limitF40   1.4043     0.4548   3.088  0.00202 ** 
## Speed_limitF50   1.7913     0.4593   3.900 9.60e-05 ***
## Speed_limitF60   2.1845     0.4497   4.858 1.19e-06 ***
## Speed_limitF70   1.9788     0.4523   4.375 1.22e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 18981  on 154410  degrees of freedom
## Residual deviance: 18040  on 154405  degrees of freedom
## AIC: 18052
## 
## Number of Fisher Scoring iterations: 8
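# Speed limit is arguably an ordered, roughly numeric dose, so an alternative
# analysis (a sketch only, not pursued further here; the object name
# fatal.fit.num is just for illustration) is to fit it as a continuous
# covariate and estimate how the log-odds of a fatality change per 10 mph:
fatal.fit.num <- glm(fatalF ~ I(Speed_limit/10), family=binomial, data=radf2)
summary(fatal.fit.num)$coefficients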
# We get the emmeans (even though these are logits of proportions)
fatal.fit.emmo <- emmeans::emmeans(fatal.fit, ~Speed_limitF)

# where do the p(fatal) differ?
fatal.fit.cld <- multcomp::cld(fatal.fit.emmo, type="response")
fatal.fit.cld  # interesting what happens
##  Speed_limitF    prob       SE  df asymp.LCL asymp.UCL .group
##  20           0.00323 0.001442 Inf   0.00135   0.00774  1    
##  30           0.00544 0.000232 Inf   0.00500   0.00591  1    
##  40           0.01303 0.001010 Inf   0.01119   0.01516   2   
##  50           0.01906 0.001898 Inf   0.01568   0.02316    3  
##  70           0.02290 0.001407 Inf   0.02030   0.02583    34 
##  60           0.02799 0.001076 Inf   0.02596   0.03018     4 
## 
## Confidence level used: 0.95 
## Intervals are back-transformed from the logit scale 
## P value adjustment: tukey method for comparing a family of 6 estimates 
## Tests are performed on the log odds ratio scale 
## significance level used: alpha = 0.05 
## NOTE: Compact letter displays can be misleading
##       because they show NON-findings rather than findings.
##       Consider using 'pairs()', 'pwpp()', or 'pwpm()' instead.
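# A pairwise p-value matrix is a compact alternative to the letters for seeing
# exactly which speed limits differ. A sketch (output not shown):
emmeans::pwpm(fatal.fit.emmo)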
sf.cld.plot.bar(fatal.fit.cld, "Speed_limitF")+
  ggtitle("Prob(fatality) by Speed Limit (with 95% ci)")+
  xlab("Speed limit")+ylab("Prob(fatality rate) with 95% ci")

ggsave(file='../../MyStuff/Images/glm-cld-radf-speed-001.png',
    h=4, w=6, units='in', dpi=300)

sf.cld.plot.line(fatal.fit.cld, "Speed_limitF")+
  ggtitle("Prob(fatality) by Speed limit (with 95% ci)")+
  xlab("Speed limit")+ylab("Prob(fatality rate) with 95% ci")

ggsave(file='../../MyStuff/Images/glm-cld-radf-speed-002.png',
    h=4, w=6, units='in', dpi=300)