# Helper functions that are commonly used in my course notes

# 2020-05-24 CJS fixed sf.autoplot.lmer to use broom::augment() rather than fortify()
# 2018-10-27 CJS fixed plot.cld/plot.bar for new way to extract y range
# 2018-03-26 CJS fixed plot.cld to recognize 'rate" from glm()
# 2018-01-15 CJS fixed plot.cld to recognize emmeans now that emmeans package is called emmeans package
# 2017-10-08 CJS fixed plot.cld to recognize emmeans, prob, odds.ratio from lm() and glm()
# 2016-11-24 CJS removed sf.autoplot.lm and sf.autplot.glm  as the autoplot() from ggfortify now works properly
# 2015-07-19 CJS fixed ggCatterpillar; but we now have a problem that arrangeGrob no longer returns a 
#                graphic object and you need to use grid.view() to display hit preceeded by a grid.newpage() - Groan
#                http://stackoverflow.com/questions/31463445/how-do-i-get-rid-of-random-background-grid-from-arrangegrob
# 2015-07-07 CJS updated plot.cld to recognize different CLD from lm() and glm()
# 2015-07-07 CJS added diagnostic plots for glm()
# 2014-11-29 CJS added sf.autopla.lmer() for diagnostic plots for lmer() models
# 2014-04-19 CJS sf.simple.summary


sf.simple.summary <- function(df, variable, crd=FALSE, conflevel=0.95){
  # Return simple statistics on 'variable' in the dataframe 'df'
  # If this is a crd design, also compute se and confidence intervals
  # This is commonly used in a ddply() function to get separate statistics for
  # each group.
  #
  data  <- df[,variable]
  n     <- length(data)
  nmiss <- sum(is.na(data)) # number of missing values
  mean  <- mean(data, na.rm=TRUE)
  sd    <- sd  (data, na.rm=TRUE)
  res   <- c(n, nmiss, mean, sd)
  res.names <- c("n","nmiss","mean","sd")
  # If this a CRD, compute the se and confint using a simple linear model
   if(crd){
     fit   <- lm(data ~1)
     se    <- sqrt(diag(vcov(fit)))
     ci    <- confint(fit, level=conflevel)
     res   <- c(res, se, ci)
     res.names <- c(res.names, "se","lcl",'ucl')
  }
  # put together all of the information
  names(res) <- res.names
  return(res)
}


# Create residual plots from lm() objects.
# See http://librestats.com/2012/06/11/autoplot-graphical-methods-with-ggplot2/
#     http://stackoverflow.com/questions/17059099/saving-grid-arrange-plot-to-file

# See http://rpubs.com/sinhrks/plot_lm for another autoplot function
#  2015-08-31 Added yintercept=0 to geom_hline objects

sf.autoplot.lm <- function(...){
   cat("****** sf.autoplot.lm() no longer needed        *****\n")
   cat("       use autoplot() from the ggfortify package\n")
}

# sf.autoplot.lm no longer needed as autoplot() from ggfortify package now works
#sf.autoplot.lm <- function(model, ..., which=c(1:3, 5), mfrow=c(2,2)){
#  require(ggplot2) 
#  require(grid)
#  require(gridExtra)
#  df <- fortify(model)
#  df <- cbind(df, rows=1:nrow(df))
# 
#  # residuals vs fitted
#  g1 <- ggplot(df, aes(.fitted, .resid)) +
#    geom_point()  +
#    geom_smooth(se=FALSE) +
#    geom_hline(linetype=2, size=.2, yintercept=0) +
#      scale_x_continuous("Fitted Values") +
#      scale_y_continuous("Residual") +
#      ggtitle("Residuals vs Fitted")
# 
#  # normal qq
#  a <- quantile(df$.stdresid, c(0.25, 0.75))
#  b <- qnorm(c(0.25, 0.75))
#  slope <- diff(a)/diff(b)
#  int <- a[1] - slope * b[1]
#  g2 <- ggplot(df, aes(sample=.stdresid)) +
#    stat_qq() +
#    geom_abline(slope=slope, intercept=int) +
#      xlab("Theoretical Quantiles") +
#      ylab("Standardized Residuals") +
#      ggtitle("Normal Q-Q")
# 
#  # scale-location
#  g3 <- ggplot(df, aes(.fitted, sqrt(abs(.stdresid)))) +
#    geom_point() +
#    geom_smooth(se=FALSE) +
#      scale_x_continuous("Fitted Values") +
#      scale_y_continuous("Root of |Standardized Residuals|") +
#      ggtitle("Scale-Location")
# 
#  # cook's distance
#  g4 <-  ggplot(df, aes(rows, .cooksd, ymin=0, ymax=.cooksd)) +
#    geom_point() + geom_linerange() +
#      scale_x_continuous("Observation Number") +
#      scale_y_continuous("Cook's distance") +
#      ggtitle("Cook's Distance")
# 
#  # residuals vs leverage
#  g5 <- ggplot(df, aes(.hat, .stdresid)) +
#    geom_point() +
#    geom_smooth(se=FALSE) +
#    geom_hline(linetype=2, size=.2, yintercept=0) +
#      scale_x_continuous("Leverage") +
#      scale_y_continuous("Standardized Residuals") +
#      ggtitle("Residuals vs Leverage")
# 
#  # cooksd vs leverage
#  g6 <- ggplot(df, aes(.hat, .cooksd)) +
#    geom_point() +
#    geom_smooth(se=FALSE) +
#      scale_x_continuous("Leverage") +
#      scale_y_continuous("Cook's distance") +
#      ggtitle("Cook's dist vs Leverage")
# 
#  #browser()
#  plots <- list(g1, g2, g3, g4, g5, g6)
#  plots.subset <- plots[which]
#  plots.subset$ncol <- mfrow[2]
#  plots.subset$nrow <- mfrow[1]
#  
#  gridplots <- do.call(arrangeGrob, plots.subset)
#  gridplots  # return the final object
#}

# Create residual and other diagnostic plots from lmer() objects.
sf.autoplot.lmer <- function(model, ..., which=TRUE, mfrow=c(2,2)){
  # which = TRUE implies select all plots; specify a vector if only want some of the plots
  require(broom.mixed)
  require(ggplot2) 
  require(grid)
  require(gridExtra)
  require(lattice)
  require(plyr)
  
  ggCaterpillar <- function(re, QQ=TRUE, likeDotplot=TRUE) {
    # Create Caterpillar plots
    # Refer to http://stackoverflow.com/questions/13847936/in-r-plotting-random-effects-from-lmer-lme4-package-using-qqmath-or-dotplot
    # We modified it to access the name of the random effect for use in the plots
    # http://stackoverflow.com/questions/9950144/access-lapply-index-names-inside-fun
    
    require(ggplot2)
    f <- function(i, allre) {
      re_name <- names(allre)[i] # name of the random effect
      x   <- allre[[i]]
      pv   <- attr(x, "postVar")
      cols <- 1:(dim(pv)[1])
      se   <- unlist(lapply(cols, function(i) sqrt(pv[i, i, ])))
      ord  <- unlist(lapply(x, order)) + rep((0:(ncol(x) - 1)) * nrow(x), each=nrow(x))
      pDf  <- data.frame(y=unlist(x)[ord],
                         ci=1.96*se[ord],
                         nQQ=rep(qnorm(ppoints(nrow(x))), ncol(x)),
                         ID=factor(rep(rownames(x), ncol(x))[ord], levels=rownames(x)[ord]),
                         ind=gl(ncol(x), nrow(x), labels=names(x)))
      
      if(QQ) {  ## normal QQ-plot
        p <- ggplot(pDf, aes(nQQ, y))
        p <- p + facet_wrap(~ ind, scales="free")
        p <- p + xlab("Standard normal quantiles") + ylab("Random effect quantiles")
      } else {  ## caterpillar dotplot
        p <- ggplot(pDf, aes(ID, y)) + coord_flip()
        if(likeDotplot) {  ## imitate dotplot()  - same scales for random effects
          p <- p + facet_wrap(~ ind)
        } else {           ## different scales for random effects
          p <- p + facet_grid(ind ~ ., scales="free_y")
        }
        p <- p + xlab("Levels") + ylab("Random effects") + ggtitle(paste("Caterpillar Plot of ", re_name))
      }
      
      p <- p + theme(legend.position="none")
      p <- p + geom_hline(yintercept=0)
      p <- p + geom_errorbar(aes(ymin=y-ci, ymax=y+ci), width=0, colour="black")
      p <- p + geom_point(aes(size=1.2), colour="blue") 
      return(p)
    }
    res<- lapply(seq_along(re), f, allre=re)
    names(res) <- names(re)
    res
  }
  
  
  df <- broom::augment(model)
  #df <- fortify(model)
  df <- cbind(df, rows=1:nrow(df))
  
  # residuals vs fitted
  g1 <- ggplot(df, aes(x=.fitted, y=.resid)) +
    geom_point()  +
    geom_smooth(se=FALSE) +
    geom_hline(yintercept=0, linetype=2, size=.2) +
    scale_x_continuous("Fitted Values") +
    scale_y_continuous("Residual") +
    ggtitle("Residuals vs Fitted")
  
  # normal qq on residuals
  a <- quantile(df$.resid, c(0.25, 0.75))
  b <- qnorm(c(0.25, 0.75))
  slope <- diff(a)/diff(b)
  int <- a[1] - slope * b[1]
  g2 <- ggplot(df, aes(sample=.resid)) +
    stat_qq() +
    geom_abline(slope=slope, intercept=int) +
    xlab("Theoretical Quantiles") +
    ylab("Residuals") +
    ggtitle("Normal Q-Q on residuals")
  
  # caterpillar plots on the all of the random effects
  cat_plot <- ggCaterpillar( ranef(model, condVar=TRUE),  QQ=FALSE, likeDotplot=FALSE)
  
  plots <- list(g1=g1, g2=g2)
  cat_names <- names(cat_plot)
  l_ply(cat_names, function(name){
    # add the caterpiller plots to the list of plots
    #browser()
    plots <<- c(plots, cat_plot[name])
  })
  plots.subset <- plots[which]
  plots.subset$ncol <- mfrow[2]
  #plots.subset$nrow <- mfrow[1]
  # browser()
  gridplots <- do.call(arrangeGrob, plots.subset)
  gridplots  # return the final object
}


sf.autoplot.glm <- function(...){
   cat("****** sf.autoplot.glm() no longer needed        *****\n")
   cat("       use autoplot() from the ggfortify package\n")
}


sf.cld.plot.bar<- function(cld.obj, variable, order=TRUE, whereCLD=0.20, ciwidth=0.2){
  # Create a ggplot object of the cld as a bar graph
  # You can add axes labels as needed after plot creating
  #    cld.obj  - cld object created by emmeans
  #    variable - name of grouping varible (usually the first column in the cld.obj
  #    order    - plot bars sorted from smallest to largest)
  #    whereCLD - where should the cld letters be plotted as a proportion of y axis from bottom
  #
  # See where the lower and upper confidence limits are from lm() and glm() objects respectively
  # 
  # Typical usage is
  #      fit <- lm( Y ` x1 + x2 ...)
  #      fit.emmo <- emmeans::emmeans(fit, ~x1)
  #      fit.cld  <- emmeans::CLD(fit.emmo)
  #      myplot <- sf.cld.plot.bar(fit.cld, "x1")
  #      myplot <- myplot + ggtitle() + xlab()+ ylab() + ...

  lcl.col <- which(grepl('lower.CL', names(cld.obj)) | grepl('asymp.LCL', names(cld.obj)) )
  ucl.col <- which(grepl('upper.CL', names(cld.obj)) | grepl('asymp.UCL', names(cld.obj)) )
  xvar <- which(grepl("lsmean",   names(cld.obj)) | grepl("prob",  names(cld.obj)) |
                grepl("emmean",   names(cld.obj)) | grepl("rate",  names(cld.obj)) |
                grepl("response", names(cld.obj)) | grepl('trend', names(cld.obj))  )
  cld.obj$y <- cld.obj[, xvar]
  #browser()
  if(order) { cld.obj$gf <- factor(cld.obj[,variable], cld.obj[,variable])} # sorted levels 
  if(!order){ cld.obj$gf <- cld.obj[,variable]}
  require(ggplot2)
  plot <- ggplot(cld.obj, aes(x=gf,y=y), environment=environment())+
             geom_bar(stat="identity", alpha=0.5)+
             geom_errorbar( aes(ymax=cld.obj[,lcl.col], ymin=cld.obj[,ucl.col]), width=ciwidth)
  
  # Extract the range of the y axis to decide where to annotate the cld values
  # See end if https://stackoverflow.com/questions/7705345/how-can-i-extract-plot-axes-ranges-for-a-ggplot2-object
  yrange <- ggplot_build(plot)$layout$panel_scales_y[[1]]$range$range
  plot <- plot +
          annotate("text", 
            x=cld.obj$gf,
            y=yrange[1]+whereCLD*sum(c(-1,1)*yrange),
            label=cld.obj$".group", angle=-90,vjust=1)
  plot
} # end of sf.cld.plot.bar


sf.cld.plot.line<- function(cld.obj, variable, order=TRUE, whereCLD=0.20, ciwidth=0.20){
  # Create a ggplot object of the cld as a line graph
  # You can add axes labels as needed after plot creating
  #    cld.obj  - cld object created by emmeans
  #    variable - name of grouping varible (usually the first column in the cld.obj
  #    order    - plot bars sorted from smallest to largest)
  #    whereCLD - where should the cld letters be plotted as a proportion of y axis from bottom
  # See where the lower and upper confidence limits are from lm() and glm() objects respectively
  #
  # Typical usage is
  #      fit <- lm( Y ` x1 + x2 ...)
  #      fit.emmo <- emmeans::emmeans(fit, ~x1)
  #      fit.cld  <- emmeans::CLD(fit.emmo)
  #      myplot <- sf.cld.plot.bar(fit.cld, "x1")
  #      myplot <- myplot + ggtitle() + xlab()+ ylab() + ...

  lcl.col <- which(grepl('lower.CL', names(cld.obj)) | grepl('asymp.LCL', names(cld.obj)) )
  ucl.col <- which(grepl('upper.CL', names(cld.obj)) | grepl('asymp.UCL', names(cld.obj)) )
  
  if(order) { cld.obj$gf <- factor(cld.obj[,variable], cld.obj[,variable])} # sorted levels 
  if(!order){ cld.obj$gf <- cld.obj[,variable]}
  xvar <- which(grepl("lsmean",   names(cld.obj)) | grepl("prob",  names(cld.obj)) |
                grepl("emmean",   names(cld.obj)) | grepl('rate',  names(cld.obj)) |
                grepl("response", names(cld.obj)) | grepl('trend', names(cld.obj))  )
  cld.obj$y <- cld.obj[, xvar]
  require(ggplot2)
  plot <- ggplot(cld.obj, aes(x=gf,y=y), environment=environment())+
             geom_line(aes(group=1))+
             geom_errorbar( aes(ymax=cld.obj[,lcl.col], ymin=cld.obj[,ucl.col]), width=ciwidth)
  
  # Extract the range of the y axis to decide where to annotate the cld values
  # See end if https://stackoverflow.com/questions/7705345/how-can-i-extract-plot-axes-ranges-for-a-ggplot2-object
  yrange <- ggplot_build(plot)$layout$panel_scales_y[[1]]$range$range
  plot <- plot +
          annotate("text", 
            x=cld.obj$gf,
            y=yrange[1]+whereCLD*sum(c(-1,1)*yrange),
            label=cld.obj$".group", angle=-90,vjust=1)
  plot
} # end of sf.cld.plot.line