options(useFancyQuotes=FALSE) # renders summary output corrects source("schwarz.functions.r") #source('http://www.stat.sfu.ca/~cschwarz/Stat-650/Notes/MyPrograms/schwarz.functions.r') # This is a quick demo of using Rstudio x <- 1:10 x plot(x,x) #-------------------------------------------------------- # This script will read in the cereal data set, # do a simple listing, # fit a regression line, # draw a scatter plot and add the line to the plot # do a single factor crd anova # get the compact letter display # make some plots # load required libraries library(ggplot2) library(emmeans) # Read in the cereal data from a csv file cereal <- read.csv('../SampleData/cereal.csv', header=TRUE, as.is=TRUE, strip.white=TRUE) cereal3 <- read.table("http://lib.stat.cmu.edu/datasets/1993.expo/cereal", header=FALSE, as.is=TRUE, strip.white=TRUE) names(cereal3) <- c('Name','mfr','type','Calories','protein','Fat','sodium','fiber','carbo', 'sugars','shelf','potass','vitamins','weight','cups') # Define new variables and factors (for categorical variables). CHeck the structure of the data frame cereal$shelfF <- factor(cereal$shelf) cereal$Calories.fr.Protein <- cereal$protein * 4; str(cereal) # List the first few records cereal[1:5,] # List some variables cereal$calories cereal[,"calories"] cereal$fat cereal[1:5,c("name","fat","calories")] # Make a basic scatter plot plotbasic <- ggplot(data=cereal, aes(x=fat, y=calories))+ ggtitle("Calories vs Fat in cereals")+ xlab("Grams of Fat")+ylab("Calories/serving")+ geom_point() plotbasic ggsave(plotbasic, file='cal-vs-fat1.png', h=4, w=6, units="in", dpi=300) plotbasic2 <- ggplot(data=cereal, aes(x=fat, y=calories))+ ggtitle("Calories vs Fat in cereals")+ xlab("Grams of Fat")+ylab("Calories/serving")+ geom_jitter() plotbasic2 ggsave(plotbasic, file='cal-vs-fat2.png', h=4, w=6, units="in", dpi=300) # Fit a regression between calories and grams of fat fit.calories.fat <- lm( calories ~ fat, data=cereal) summary(fit.calories.fat) anova(fit.calories.fat) # careful Type I SS coef(fit.calories.fat) sqrt(diag(vcov(fit.calories.fat))) # extract the SE confint(fit.calories.fat) # confidence intervals on parameters names(summary(fit.calories.fat)) summary(fit.calories.fat)$r.squared summary(fit.calories.fat)$sigma class(fit.calories.fat) methods(class=class(fit.calories.fat)) # Add the fitted line to the scatter plot; and save plotline <- plotbasic2 + geom_abline(intercept=coef(fit.calories.fat)[1], slope =coef(fit.calories.fat)[2]) plotline ggsave(plot=plotline, file="cal-vs-fat3.png", h=4, w=6, units="in", dpi=300) # Or, if you don't want' to do the actual fit, use ggplot directly plot.calories.fat <- ggplot(data=cereal, aes(x=fat, y=calories)) + geom_jitter(shape=1) + # Use hollow circles geom_smooth(method=lm, # Add linear regression line se=FALSE) # Don't add shaded confidence region plot.calories.fat # Do a simple single factor ANOVA # Is the mean number of calories the same for all shelves # Need to use a FACTOR variable for the categorical variable fit.sugars.shelf <- lm( sugars ~ shelfF, data=cereal) anova(fit.sugars.shelf) # Estimate the marginal means along with confidence limits and Tukey multiple comparison. fit.sugars.shelf.emmo <- emmeans::emmeans(fit.sugars.shelf, ~shelfF) fit.sugars.shelf.cld <- multcomp::cld(fit.sugars.shelf.emmo, adjust='tukey') fit.sugars.shelf.cld cld.plot <- sf.cld.plot.bar(fit.sugars.shelf.cld, "shelfF", order=FALSE) cld.plot ggsave(cld.plot, file="fat-vs-shelf.png",h=4, w=6, units="in", dpi=200) # Estimate the pairwise differences pairs(fit.sugars.shelf.emmo) # an alternate way to look at pairwise comparisons pwpp(fit.sugars.shelf.emmo) #-------------------------------------------------------- # Fun with vectors age <- c(56, 56, 28, 23, 22) height <- c(185, 162, 185, 167, 190) f.names <- c('Carl', "Lois", 'Matthew', 'Marianne', 'David') over.30 <- c(T, T, F, F, F) # AVOID using T/F for TRUE/FALSE odd <- c(2.3, 'Carl') # surprising, but look at result! length(age) length(family) # number of elements not lengths of elements str(age) # what is the structure of age? str(f.names) # The c() function is very versatile # The c() function is very versatile ah <- c(age, height) ah age0age <- c(age, 0, age) age0age length(age0age) odd <- c(f.names, over.30) # ?? odd #-------------------------------------------------------- # Dataframes age <- c(56, 56, 28, 23, 22) height <- c(185, 162, 185, 167, 190) f.names <- c('Carl', "Lois", 'Matthew', 'Marianne', 'David') over.30 <- c(TRUE, TRUE, FALSE, FALSE, FALSE) schwarz <- data.frame( f.names, age, height, over.30, stringsAsFactors=FALSE) schwarz str(schwarz) length(schwarz) # number of vectors, not length of vectors dim(schwarz) nrow(schwarz) ncol(schwarz) names(schwarz) # Most commonly created from data. cereal <- read.csv('../sampledata/cereal.csv', header=TRUE, as.is=TRUE, strip.white=TRUE) str(cereal) # this function is VERY useful when things seem to go wrong length(cereal) # number of vectors, not length of vectors dim(cereal) nrow(cereal) ncol(cereal) names(cereal) # How to refer to parts of data frame names(cereal) cereal$name cereal$calories cereal[ , "calories"] # first index missing = ALL rows #calories # doesn't work because vector is hidden with(cereal, calories) # careful of case. cereal[1,] cereal[1:5,] cereal[, 1] cereal[, 1:5] cereal[1:4, 1:5] # Adding removing variables from data frames cereal$CalPerGramFat <- cereal$calories / cereal$fat cereal$CalPerGramFat # some interesting values! cereal$CalPerGramFat <- NULL # removes this variable from df #-------------------------------------------------------- # More fun with vectors # Operations with vectors age <- c(56, 56, 28, 23, 22) age.next.year <- age + 1 yob <- 2013- age # element by element operations if same length # Operations on vectors x <- c(0.5, 1, 1.5, 2, 4, 6, 8, 9, 10, 12) length(x) str(x) sqrt(x) # function applied to EACH element # Other useful functions range(x) mean(x) sd(x) median(x) summary(x) sum(x) # Compare the min() and pmin() functions min(x, 3) pmin(x, 3) # Exercise mean(cereal$calories) max(cereal$fat) min(cereal$fat) range(cereal$fat) mean(cereal$weight) mean(cereal$weight, na.rm=TRUE) #-------------------------------------------------------- # Simple increments help(":") # be sure to put operators in quotes for the help() function 5:10 10.2:3 5:10-1 # careful : is evaluated prior to arithmetic seq(1, 100, 10) seq(to=100, from=1, by=10) # replicate things x <- c(5, 6, 7) help(rep) rep(TRUE, 10) rep(x, times=2) rep(x, length.out=8) rep(x, each=2) #-------------------------------------------------------- # Indexing x <- c(5:9, 12:15, 34:37) x # Simple indexing x[2] # note the use of SQUARE brackets for indexing x[c(2,3,7)] # the index can also be a vector #x[2,3,7] # oops, not a proper index for a vector n <- 10 x[n] # indices can be variables. What does this mean? inx <- c(3, 5, 7) #x(inx) # Oops wrong types of brackets x[inx] # using selection vector select <- x >6 & x < 10 select sum(select) x[select] # Indexing x <- c(5:9, 12:15, 34:37) x # Simple replacement indexing x x[2] <- 100 # note the use of SQUARE brackets for indexing x x[c(2,3,7)] <- 200 # the index can also be a vector x x[c(2,3,7)] <- c(200,201) # notice the warning message x n <- 10 x[n] <- 500 # indices can be variables. What does this mean? inx <- c(8,9,10) x[inx] <- c(300, 400) x # using selection vector select <- x >6 & x < 10 select sum(select) x[select] <- -1 x sum(cereal== -1, na.rm=TRUE) sum(is.na(cereal)) # Dropping elements x x[-2] # note the use of SQUARE brackets for indexing x[-c(2,3,7)] # the index can also be a vector n <- -10 x[n] # indices can be variables. What does this mean? inx <- c(3, 5, 7) x[-inx] # Using logical vectors to select elements x <- c(5:9, 12:15, 34:37) x # Selecting elements where entry is TRUE x > 10 x[ x>10 ] x[ x > 10 & x < 20] x[ x %% 2 == 0] # %note the use of == to test for equality x[ x>10 ] <- 500 x # We want to rename "Fiber" to "Fibre" # Avoid using explicit index (i.e. names(cereal)[9] <-"Fibre" as not robust names(cereal) names(cereal)[grepl("fiber",names(cereal))] <- "fibre" names(cereal) # Select certain cereal manufacturers cereal[ cereal$mfr %in% c("P","A"),] # don't forget the last comma cereal[grep("Bran", cereal$name),] # don't forget the last comma #--------------------------------------------------------------------- # Fun with functions # concatenating objects together, especially to make a vector limits <- c(0, 100) ggplot(data=cereal, aes(x=fat, y=calories))+ geom_point()+ ylim(c(0,100)) # Generating sequences seq(1,10,2) seq(1, by=2, length.out=10 ) # Generating all possible combinations expand.grid( sex=c("m","f"), age=c(10,20,30), stringsAsFactors=FALSE) # checking for and counting number of missing values; selecting rows without missing values is.na(cereal$weight) sum(is.na(cereal$weight)) sum(!is.na(cereal$weight)) select <- is.na(cereal$weight) cereal[ !select,] # max and parallel maximum x <- c(1,2,3,4,5,6) max(x) pmax(3, x) # finding the set of unique values unique(cereal$type) # xtabs - counting and checking xtabs(~type, data=cereal, exclude=NULL, na.action=na.pass) xtabs(~type+cups, data=cereal, exclude=NULL, na.action=na.pass) # pasting text together paste("Analysis of ", nrow(cereal), ' breakfast cereals', sep="") ggplot(data=cereal, aes(x=fat, y=calories))+ geom_point()+ ggtitle(paste("Analysis of ", nrow(cereal), ' breakfast cereals', sep="")) # pattern matching - Googleis your friend select <- grepl("bran", cereal$name) # exact match cereal[select,] select <- grepl("bran", cereal$name, ignore.case=TRUE) cereal[select,] select <- grepl("^bran", cereal$name, ignore.case=TRUE) cereal[select,] # start with bran select <- grepl("bran$", cereal$name, ignore.case=TRUE) cereal[select,] # end with bran # dealing with strings toupper(cereal$name) tolower(cereal$name) trimws(cereal$name) substr(cereal$name, 1, 4) substring(cereal$name, 4) substr(cereal$name, 1, -1 +regexpr("_", cereal$name, fixed=TRUE)) substr(cereal$name, 1, pmax(5,-1 +regexpr("_", cereal$name, fixed=TRUE))) gsub("-","_", cereal$name) # sorting sort(cereal$name) cereal[ order(cereal$name), ] # reorder a data frame cereal[ order(cereal$calories, cereal$name),] # file path that is device independent file.path("..","sampledata") # merging and combining data frames byear <- data.frame(name =c('Carl', 'Lois', 'Matthew', 'Marianne', 'David'), byear=c( 1956, 1956, 1986, 1990, 1991)) bcity <- data.frame(name =c('Carl', 'Lois', 'Matthew'), city =c('Wpg', 'Brandon', 'Wpg')) wcity <- data.frame(name =c('Matthew', 'Marianne', 'David'), city =c('Ottawa', 'Vancouver', 'Victoria')) # cbind must be used with caution cbind(bcity, wcity) # merge - careful of non-matches merge(byear, bcity) merge(byear, bcity, all=TRUE) merge(byear, bcity, all.y=TRUE) # multiple merges - Google is your friend Reduce(function(...){merge(..., all=TRUE)}, list(byear, bcity, wcity))