#------------ Split - Apply - Combine # 2015-01-31 CJS Update to only use plyr rather than Base R function # 2014-05-20 CJS First Edition options(useFancyQuotes=FALSE) # renders summary output corrects #source("schwarz.functions.r") source('http://www.stat.sfu.ca/~cschwarz/Stat-650/Notes/MyPrograms/schwarz.functions.r') library(car) library(ggplot2) library(gridExtra) library(emmeans) library(plyr) # Get the data sets ready cereal <- read.csv(file.path("..","sampledata",'cereal.csv'), header=TRUE, as.is=TRUE, strip.white=TRUE) cereal[1:5,] accidents <- read.csv(file.path("..","sampledata",'road-accidents-2010.csv'), header=TRUE, as.is=TRUE, strip.white=TRUE) # Convert date to internal date format accidents$mydate <- as.Date(accidents$Date, format="%d/%m/%Y") # Create the fatality variable accidents$Fatality <- accidents$Accident_Severity == 1 #-------------------------------------------------------- # Passing a function to ddply either pre-defined or inline mysummary <- function(x){ # compute the mean fat and calories and their ratio mean.fat = mean(x$fat, na.rm=TRUE) mean.calories=mean(x$calories, na.rm=TRUE) ratio = mean.calories / mean.fat data.frame(mean.fat, mean.calories, ratio, stringsAsFactors=FALSE) } mysummary(cereal) # now for each shelf report <- plyr::ddply(cereal, "shelf", mysummary) report # Pass the function as an anonymous function; report <- plyr::ddply(cereal, "shelf", function(x){ # compute the mean fat and calories and their ratio mean.fat = mean(x$fat, na.rm=TRUE) mean.calories=mean(x$calories, na.rm=TRUE) ratio = mean.calories / mean.fat data.frame(mean.fat, mean.calories, ratio, stringsAsFactors=FALSE) }) report #----------------------------------------- # Exercise sumstats <- plyr::ddply(cereal, "shelf", function(x) { result <- lm(calories ~ fat, data=x) # notice use of x intercept <- coef(result)[1] slope <- coef(result)[2] rmse <- summary(result)$sigma res <- data.frame(intercept, slope, rmse, stringsAsFactors=FALSE) return(res) }) sumstats #----------------------------------------- # Exercise naccidents <- plyr::ddply(accidents, "mydate", plyr::summarize, freq=length(mydate), pfatal=mean(Fatality), mean.weather=mean(Weather_Conditions), dow=format(mydate, "%w")[1]) naccidents[1:5,] naccidents <- plyr::ddply(accidents, "mydate", function(x){ freq <- nrow(x) mean.weather <- mean(x$Weather_Conditions) pfatal <- mean(x$Fatality) dow=format(x$mydate, "%w")[1] res <- data.frame(freq, mean.weather, pfatal, dow, stringsAsFactors=FALSE) return(res) }) naccidents[1:5,] newplot <- ggplot(data=naccidents, aes(x=mydate, y=freq ))+ ggtitle("Number of accidents by date with mean weather coded")+ geom_point( aes(size=mean.weather))+ geom_smooth(method="loess", color="red", se=FALSE) newplot newplot <- ggplot(data=naccidents, aes(x=mean.weather, y=freq))+ geom_point( ) + geom_smooth(method="loess", color="red", se=FALSE) + ggtitle("Number of accidents by mean weather coded") newplot newplot <- ggplot(data=naccidents, aes(x=dow, y=freq))+ geom_boxplot( ) + geom_jitter(aes(size=pfatal, color=mean.weather), position=position_jitter(w=.3, h=.0))+ ggtitle("Number of accident by day of the week")+ xlab("Day of the week. 0=Sunday") + ylab("Number of accidents in a day") newplot #----------------------------------------- # More advanced features of plyr() package # Passing additonal arguments to the function in plyr routine sumstat <- function(x, var){ # Compute some summary statistics for a data frame values <- x[,var] # extract the variable values n <- length(values) nmiss <- sum(is.na(values)) mean <- mean(values, na.rm=TRUE) sd <- sd(values, na.rm=TRUE) res <- data.frame(n,nmiss,mean,sd, stringsAsFactors=FALSE) return(res) } # end of sumstat sumstat(cereal, "calories") sumstat(cereal, "weight") plyr::ddply(cereal, "shelf", sumstat, var="calories") plyr::ddply(cereal, "shelf", sumstat, var="weight") #---------------------------- # Write a function that finds the mean, se.mean and 95% ci based on normal # theory my.simple.summary <- function(df, variable, conflevel=0.95){ # Return simple statistics on 'variable' in the dataframe 'df' # This is commonly used in a ddply() function to get separate statistics for # each group. # data <- df[,variable] n <- length(data) nmiss <- sum(is.na(data)) # number of missing values mean <- mean(data, na.rm=TRUE) sd <- sd (data, na.rm=TRUE) # compute the se and confint using a simple linear model fit <- lm(data ~1) se <- sqrt(diag(vcov(fit))) ci <- confint(fit, level=conflevel) res <- data.frame(variable=variable, n=n, nmiss=nmiss, mean=mean, sd =sd, se =se, conflevel=conflevel, lcl=ci[1], ucl=ci[2]) row.names(res) <- NULL res } my.simple.summary(cereal, "fat") my.simple.summary(cereal, "fat", conflevel=.90) plyr::ddply(cereal, "shelf", my.simple.summary, variable="fat") plyr::ddply(cereal, "shelf", my.simple.summary, variable="fat", conflevel=0.90) plyr::ddply(cereal, "shelf", my.simple.summary, variable="weight") #----------------------------------------------------- # dlply() and ldply() sumstat <- function(x, Yvar, Xvar){ # Do the plot (use aes_string) plot <- ggplot(data=x, aes_string(x=Xvar, y=Yvar))+ ggtitle(paste("Scatterplot of ", Yvar, " vs. ", Xvar," for shelf ", x$shelf[1], sep=""))+ geom_point(position=position_jitter(h=.1, w=.1))+ geom_smooth(method="lm", se=FALSE) fit <- lm( x[,Yvar] ~ x[, Xvar], data=x) list(plot=plot, fit=fit) } res<- sumstat(cereal, "calories", "fat") length(res) res <- plyr::dlply(cereal, "shelf", sumstat, Yvar="calories", Xvar="fat") length(res) res[[1]] res[[2]] res[[3]] se.summary <- plyr::ldply(res, function(x){ # x is now the list of the flot and the fit slope <- summary(x$fit)$coefficients[2,] slope }) se.summary # display all of the plots plyr::l_ply(res, function(x){ plot(x$plot) # needed because within a function }) # Extract the plots and put into a single file pdf('summary.pdf', h=4, w=6 ) plyr::l_ply(res, function(x){ plot(x$plot) # needed because within a function }) dev.off() # Extract residuals and plot for each shelf using faceting resid <- ldply(res, function(x){ data.frame(resid= resid(x$fit), fitted=fitted(x$fit)) }) head(resid) resid.plot <- ggplot(data=resid, aes(x=fitted, y=resid))+ ggtitle("Residual plots")+ geom_point()+ geom_hline(yintercept=0)+ facet_wrap(~shelf, ncol=2, scales="free") resid.plot #ggsave(resid.plot, # file=file.path("..","..","MyStuff","Images","ldply-001.png"), h=6, w=6, units="in", dpi=300) #----------------------------------------- # Exercise III to find the ratio of accidents on weekdays to weekends by month accidents <- read.csv('../sampledata/road-accidents-2010.csv', header=TRUE, as.is=TRUE, strip.white=TRUE) accidents$mydate <- as.Date(accidents$Date, format="%d/%m/%Y") accidents$month <- as.numeric(format(accidents$mydate, "%m")) # get the month for each accident date accidents[1:5, c("mydate","month")] mysummary <- function(accidents){ # Compute the number of weekend and weekdays in the month (excluding holidays) # Compute the number of accidents on weekend/weekdays # Report the two ratio. DaysOfMonth <- unique(accidents$mydate) DaysOfWeeks <- format(DaysOfMonth, "%w") nDays <- length(DaysOfMonth) nWeekdays <- sum(DaysOfWeeks %in% 1:5) nWeekends <- sum(DaysOfWeeks %in% c(0,6)) AccDaysOfWeek <- format(accidents$mydate, "%w") nAccTotal <- length(accidents$mydate) nAccWeekdays <- sum(AccDaysOfWeek %in% 1:5) nAccWeekends <- sum(AccDaysOfWeek %in% c(0,6)) rAccWdWe <- nAccWeekdays/nAccWeekends rDaysWdWe <- nWeekdays/nWeekends res <- data.frame(nDays, nWeekdays, nWeekends, nAccTotal, nAccWeekdays, nAccWeekends, rDaysWdWe, rAccWdWe, stringsAsFactors=FALSE) return(res) } testdata <- subset(accidents, accidents$month == 1) dim(testdata) mysummary(testdata) results <- plyr::ddply(accidents, "month", mysummary) results newplot <- ggplot(data=results, aes(x=month, y=rDaysWdWe))+ ggtitle("Comparing accidents on weekends/weekdays")+ xlab("Month")+ylab("Ratio Weekends/Weekdays")+ geom_line(group=1, color="blue", linetype=2)+ geom_line(aes(y=rAccWdWe,group=1)) newplot ggsave(newplot, file="../../MyStuff/Images/accidents-ratio-wd-we-001.png", h=4, w=6, units="in", dpi=300) newplot <- ggplot(data=results, aes(x=month, y=rAccWdWe/rDaysWdWe))+ ggtitle("Comparing accidents on weekends/weekdays")+ xlab("Month")+ylab("Ratio Weekends/Weekdays")+ geom_line(group=1, color="blue", linetype=2) newplot ggsave(newplot, file="../../MyStuff/Images/accidents-ratio-wd-we-002.png", h=4, w=6, units="in", dpi=300) #----------------------------------------------- # running in parallel doParallel <- TRUE # should I set up parallel processing if(doParallel) { library(doMC) # for parallel model fitting # see http://viktoriawagner.weebly.com/blog/five-steps-to-parallel-computing-in-r detectCores() cl <- makeCluster(4) # Need to export some libraries to the cluster # see http://stackoverflow.com/questions/18981932/logging-with-plyr-in-parallel-true clusterEvalQ(cl, library(unmarked)) registerDoMC(5) } Sys.time() res <- plyr::ddply(cereal, "shelf", plyr::summarize, mean=sum(log(shelf[1]*(1:10000000)), na.rm=TRUE), .parallel=FALSE) Sys.time() Sys.time() res <- plyr::ddply(cereal, "shelf", plyr::summarize, mean=prod(shelf[1]*(1:100000000), na.rm=TRUE), .parallel=doParallel) Sys.time() if(doParallel) stopCluster(cl) # stop parallel processing # More advanced features of plyr() package # Row/column operations mat <- matrix(1:30, nrow=6) mat # Now some operations on rows or columns aaply(mat, 1, sum) aaply(mat, 2, mean) aaply(mat, 1, function(x){ res <- prod(sin(x)) return(res) })