# Different ways to read in data options(useFancyQuotes=FALSE) # renders summary output corrects #source("schwarz.functions.r") source('http://www.stat.sfu.ca/~cschwarz/Stat-650/Notes/MyPrograms/schwarz.functions.r') # load required libraries library(readxl) library(readxl) # Read in the cereal data from a csv file cereal <- read.csv(file.path('..','sampledata','cereal.csv'), header=TRUE, as.is=TRUE, strip.white=TRUE) head(cereal) str(cereal) #------------------------------------------------------- # Reading an Excel workbook library(readxl) cereal2 <- readxl::read_excel(file.path('..','sampledata','ALLofDATA.xls'), sheet='cereal', col_names=TRUE, trim_ws=TRUE, skip=7, .name_repair="universal") head(cereal2) str(cereal2) # notice that cereal2 is a tibble and not a data frame # tibbles vs. data.frames # These are mostly interchangeable except for print() method and subsetting # single columns. # see help(package=tibble) and vignettes for more details df1 <- data.frame(v1=c("a", "b"), v2=c(1,2), stringsAsFactors=FALSE) tib1 <- tibble::tibble (v1=c("a", "b"), v2=c(1,2)) # notice the _ # compare the output from df1 tib1 # compare the output from df1$xx tib1$xx # compare the output from df1 [,"v1"] tib1[,"v1"] # first is a vector; second is a tibble with 1 columns # some legacy code gets upset with the latter behaviour # you can force a tibble to be a data frame using df2 <- as.data.frame(tib1) #------------------------------------------------------- # Reading white space delimited table from the Internet cereal4 <- read.table("http://lib.stat.cmu.edu/datasets/1993.expo/cereal", header=FALSE, as.is=TRUE, strip.white=TRUE) names(cereal4) <- c('Name','mfr','type','Calories','protein','Fat','sodium','fiber','carbo', 'sugars','shelf','potass','vitamins','weight','cups') head(cereal4) str(cereal4) # Reading small table stored with the script type.code.csv <- textConnection(" type, code C , Cold Cereal H , Hot Cereal ") type.code <- read.csv(type.code.csv, header=TRUE, strip.white=FALSE, as.is=TRUE) head(type.code) str(type.code) type.code$type == "C" type.code$type # oops forgot to strip white space type.code.csv <- textConnection(" type, code C , Cold Cereal H , Hot Cereal ") type.code <- read.csv(type.code.csv, header=TRUE, strip.white=TRUE, as.is=TRUE) head(type.code) str(type.code) type.code$type == "C" type.code$type #------------------------------------------------------- # Correcting variable names sample.csv <- textConnection(" Bird #, Wieght, Length mm, Mass (g) 1, 100, 101, 102 2, 200, 201, 202") sample <- read.csv(sample.csv, header=TRUE, strip.white=TRUE, as.is=TRUE) head(sample) str(sample) sample$Bird.. sample.csv <- textConnection(" Bird #, Wieght, Length mm, Mass (g) 1, 100, 101, 102 2, 200, 201, 202") sample <- read.csv(sample.csv, header=TRUE, strip.white=TRUE, as.is=TRUE, check.names=FALSE) head(sample) str(sample) sample$Bird.. sample$"Bird #" # Using the names function to convert variable names sample2 <- sample names(sample2) names(sample2) <- c("Bird","Weight","Length","Mass") head(sample2) # you can access individual names, but this is fragile # in case the order of the columns changes sample2 <- sample names(sample2) names(sample2)[2] <- c("Weight") head(sample2) # you can access individual names through a selection process # be paranoid about change sample2 <- sample names(sample2) select <- grepl("Wieght", names(sample2)) select sum(select) names(sample2)[select] names(sample2)[select] <- c("Weight") head(sample2) # the plyr package also has a useful function # you can rename all or selected columns sample3 <- sample head(sample3) sample3 <- plyr::rename(sample3, c("Bird #"="Bird", "Wieght"="Weight", "Length mm"="Length", "Mass (g)"="Mass")) head(sample3) #----------------------------------------------------------- # Exercise library(readxl) butts <- readxl::read_excel(file.path('..','sampledata','bird-butts-data.xlsx'), sheet='Correlational', col_names=TRUE, trim_ws = TRUE, skip=1, .name_repair="universal") butts[1:5,] dim(butts) str(butts) # Or, save the sheet from the Excel file and read the csv file butts <- read.csv(file.path("..","sampledata","bird-butts-data-correlational.csv"), header=TRUE, skip=1, as.is=TRUE, strip.white=TRUE) butts[1:5,] dim(butts) str(butts) # Fix the names select <- grepl('wieght', names(butts)) select sum(select) names(butts)[select] names(butts)[ select] <- "Butts.weight" butts[1:5,] # Or use the rename butts <- plyr::rename(butts, c("Butts.wieght"="Butts.weight")) head(butts)