# Writing Functions. # 2015-02-01 CJS split into separate section # 2014-04-20 CJS First Edition options(useFancyQuotes=FALSE) # renders summary output corrects #source("schwarz.functions.r") source('http://www.stat.sfu.ca/~cschwarz/Stat-650/Notes/MyPrograms/schwarz.functions.r') library(boot) library(car) library(ggplot2) library(gridExtra) library(plyr) cereal <- read.csv(file.path('..','sampledata','cereal.csv'), header=TRUE, as.is=TRUE, strip.white=TRUE) cereal[1:5,] # Create a simple function to compute the cv # my.cv <- function( x ){ # Compute the coefficient of variation my.mean <- mean(x) my.sd <- sd(x) cv <- my.sd / my.mean names(cv) <- "cv" return(cv) } # end of my.cv my.cv my.cv(cereal$calories) my.cv(cereal$weight) # Adding complexity. Remove the NA before computaton my.cv <- function( x ,remove.na=FALSE){ # Compute the coefficient of variation # after removing na's my.mean <- mean(x, na.rm=remove.na) my.sd <- sd(x, na.rm=remove.na) cv <- my.sd / my.mean names(cv) <- 'cv' return(cv) } # end of my.cv my.cv(cereal$calories) my.cv(cereal$weight) my.cv(cereal$weight, remove.na=TRUE) # Adding complexity. Remove the NA before computaton # Understanding na.rm=na.rm my.cv <- function( x ,na.rm=FALSE){ # Compute the coefficient of variation # after removing na's my.mean <- mean(x, na.rm=na.rm) my.sd <- sd(x, na.rm=na.rm) cv <- my.sd / my.mean names(cv) <- 'cv' return(cv) } # end of my.cv my.cv(cereal$calories) my.cv(cereal$weight) my.cv(cereal$weight, na.rm=TRUE) # Adding complexity. More arguments to the function my.cv <- function( x , chop=0, na.rm=FALSE){ # Compute the coefficient of variation # after removing a fraction 'chop' from top and bottom and # dealing with na's newx <- x[ x >= quantile(x, probs=chop, na.rm=TRUE) & x <= quantile(x, probs=1-chop,na.rm=TRUE)] my.mean <- mean(newx, na.rm=na.rm) my.sd <- sd(newx, na.rm=na.rm) cv <- my.sd / my.mean names(cv) <- 'cv' return(cv) } # end of my.cv my.cv(cereal$calories) my.cv(cereal$weight) my.cv(cereal$weight, na.rm=TRUE) my.cv(cereal$weight, na.rm=TRUE, chop=0.10) #------------------------------------------------------ # Functions - dealing with missing values a simple summary # Function to make a nice report my.summary <- function(x){ # Compute the number, number missing, mean, sd, cv # with all the NA removed nobs <- length(x) nmiss <- sum(is.na(x)) mean <- mean(x, na.rm=TRUE) sd <- sd(x, na.rm=TRUE) cv <- sd/mean res <- data.frame(nobs,nmiss,mean,sd,cv, stringsAsFactors=FALSE) return(res) }# end of my.summary my.summary(cereal$calories) my.summary(cereal$weight) # ------------------------------- Exercise II ------------------------- my.line <- function(df){ # do a regression of Calories vs Fat and return # the slope, its se, and the 95% ci on the slope fit <- lm(calories ~ fat, data=df) slope <- coef(fit)[2] slope.se <- sqrt(diag(vcov(fit)))[2] slope.ci <- confint(fit)[2,] lcl <- slope.ci[1] ucl <- slope.ci[2] res <- data.frame(slope, slope.se, lcl, ucl, stringsAsFactors=FALSE) return(res) } # end my.line my.line(cereal) # ----- Exercise III # Add the X and Y variable to the call my.line2 <- function(df, Xname, Yname){ # do a regression of Y vs X and return # the slope, its se, and the 95% ci on the slope fit <- lm(df[,Yname] ~ df[,Xname], data=df) slope <- coef(fit)[2] slope.se <- sqrt(diag(vcov(fit)))[2] slope.ci <- confint(fit)[2,] lcl <- slope.ci[1] ucl <- slope.ci[2] res <- data.frame(X=Xname, Y=Yname, slope, slope.se, lcl, ucl) return(res) } # end my.line2 my.line2(cereal, Xname="fat", Yname="calories") my.line2(cereal, Xname="protein", Yname="calories") # ----- Exercise IV # Add the X and Y variable to the call # Return a list with the plot, the fit object, and the summary table my.line3 <- function(df, Xname, Yname){ # Get the plot plot <- ggplot(data=df, aes_string(x=Xname, y=Yname))+ ggtitle(paste("Plot of ", Yname," vs ", Xname, sep=""))+ geom_point( position=position_jitter(h=.1, w=.1))+ geom_smooth(method="lm", se=FALSE) # do a regression of Y vs X and return # the slope, its se, and the 95% ci on the slope fit <- lm(df[,Yname] ~ df[,Xname], data=df) slope <- coef(fit)[2] slope.se <- sqrt(diag(vcov(fit)))[2] slope.ci <- confint(fit)[2,] lcl <- slope.ci[1] ucl <- slope.ci[2] res <- data.frame(X=Xname, Y=Yname, slope, slope.se, lcl, ucl) return(list(plot=plot, fit=fit, res=res)) } # end my.line2 my.line3(cereal, Xname="fat", Yname="calories") my.line3(cereal, Xname="protein", Yname="calories") # ---- ------------------------------------------------------------------ # Passing functions as arguments # Separate fits for each shelf ddply(cereal, "shelf", my.line) ddply(cereal, "shelf", my.line2, Xname="fat", Yname="calories") results <- dlply(cereal, "shelf", my.line3, Xname="fat", Yname="calories") names(results) results[[1]] #----------------------------------------------------------------------- # Bootstrapping Example ratio.meanY.meanX <- function(df, ind, Y,X, na.rm=FALSE){ # Compute the ratio of the mean of Y to mean of X potentially removing missing values res <- mean(df[ind,X],na.rm=na.rm)/ mean(df[ind,Y],na.rm=na.rm) names(res) <- "ratio" return(res) } ratio.meanY.meanX(df=cereal,1:nrow(cereal), X="fat",Y="calories", na.rm=TRUE ) library(boot) bootres <- boot(cereal, ratio.meanY.meanX, R=100, X="fat",Y="calories", na.rm=TRUE ) bootres str(bootres) bootres$t0 bootres$t quantile(bootres$t, prob=c(0.25, .975)) # Bootstrapping Exercise r2YX <- function(df, ind, Y,X){ # Compute the regression of Y on X and then find R2 fit <- lm( df[ind,Y] ~ df[ind, X]) res <- summary(fit)$r.squared names(res) <- "R2" return(res) } #r2YX # Test out on full data fit <- lm( calories ~ fat, data=cereal) summary(fit)$r.squared r2YX(cereal, 1:nrow(cereal), Y="calories", X="fat") bootres <- boot(cereal, r2YX, R=100, X="fat",Y="calories" ) bootres str(bootres) bootres$t0 bootres$t[1:10] quantile(bootres$t, prob=c(0.25, .975))