# Dates and Times.
# 2015-01-31 CJS Updates and minor revisions (use POSIXct instead of POSIXlt)
# 2014-04-20 CJS First Edition

options(useFancyQuotes=FALSE) # renders summary output corrects
#source("schwarz.functions.r")
source('http://www.stat.sfu.ca/~cschwarz/Stat-650/Notes/MyPrograms/schwarz.functions.r')

library(boot)
library(car)
library(ggplot2)
library(gridExtra)
library(emmeans)
library(plyr)

library(lubridate)
library(nycflights13)

#------------------- Dates (without times) ------------------------

# Create a virtual csv file for the test data in this exercise

testDates.csv <- textConnection("
d1c,   d2c,    d3c
23aug01, 1970-07-10, 13/07/1956
14sep01, 1972-07-11, 14/07/1956
30feb01, 1972-13-12, 15/07/1956
1mar2001,72-08-14,   16/07/1956") # example of inputing data inline

my.dates <- read.csv(testDates.csv, header=TRUE, # notice no quotes around virtal file name 
            as.is=TRUE, strip.white=TRUE)
my.dates
str(my.dates)

# Convert from character to internal Date representation
my.dates$d1 <- as.Date(my.dates$d1c, format="%d%b%y")
my.dates$d1
as.numeric(my.dates$d1)

my.dates$d2 <- as.Date(my.dates$d2c, format="%Y-%m-%d")
my.dates$d2
as.numeric(my.dates$d2)

my.dates$d3 <- as.Date(my.dates$d3c, format="%d/%m/%Y")
my.dates$d3
as.numeric(my.dates$d3)

# arithmetic operations on dates and time
my.dates$d3 + 20  # adds 20 days

mean(my.dates$d3)
as.numeric(mean(my.dates$d3))


seq(from=my.dates$d3[1], by="3 weeks", length.out=3)

seq(from=my.dates$d3[1], by="3 weeks", length.out=3) #sequen
seq(from=my.dates$d3[1], by="2 months", length.out=3)
seq(from=my.dates$d3[1], by="1 year", length.out=3)


# Extracting parts of the date
# Caution - results of format are always character
format(my.dates$d3, "%d")
#format(my.dates$d3, "%d") + 1  # oops
as.numeric(format(my.dates$d3, "%d"))

# Extract day of the week (0=Sunday)
as.numeric(format(my.dates$d3, "%w"))

# Julian day (number of days since 1 Jan of that year) 001-366
as.numeric(format(my.dates$d3, "%j"))

# Other useful functions from the lubridate package
ymd("2017-01-31")
ymd("17/01/31")  # careful - how do you know if 1900 or 2000?
dmy(31012017)

# putting together multiple fields
library(nycflights13)
head(nycflights13::flights)
data(flights)
head(flights[, c("year","month","day")])
flights$Date <- lubridate::make_date(year =flights$year,
                                     month=flights$month,
                                     day  =flights$day)
head(flights[, c("year","month","day","Date")])


# Extracting parts of a date using lubridate functions rather than format()
lubridate::year("2017-01-31")
lubridate::month("2017-01-31")
lubridate::day("2017-01-31")


#-------- Exercise 1
naccidents <- read.csv(
    file.path('..','sampledata','road-accidents-2010-summary.csv'), header=TRUE,
    as.is=TRUE, strip.white=TRUE)
naccidents[1:5,]
str(naccidents)

# Convert date to internal date format
naccidents$mydate <- as.Date(naccidents$Date, format="%d/%m/%Y")
sum(is.na(naccidents$mydate))
naccidents[1:5,]
str(naccidents)


# Plot the number of acidents by day of the year
plotnacc <- ggplot(data=naccidents, aes(x=mydate, y=naccidents))+
     ggtitle("Number of accidents/day by date")+
     xlab("Date")+ylab("Number of accidents/day")+
     geom_point()+
     geom_smooth()
plotnacc
ggsave(plot=plotnacc, 
  file=file.path("..","..","MyStuff","Images","accidents-summary-by-date.png"), 
  h=4, w=6, units="in", dpi=300)

# look at proportion of fatalities
naccidents$pfatal <- naccidents$nfatal /
                     naccidents$naccidents
plotpfatal <- ggplot(data=naccidents, aes(x=mydate, y=pfatal))+
     ggtitle("P(fatal) by date")+
     xlab("Date")+ylab("p(fatal)")+
     geom_point()+
     geom_smooth()
plotpfatal
ggsave(plot=plotpfatal, 
  file=file.path("..","..","MyStuff","Images","accidents-summary-by-date2.png"), 
  h=4, w=6, units="in", dpi=300)

# melt the data set and plot
plotdata <- reshape2::melt(naccidents,
                id.var="mydate",
                measure.var=c("naccidents","pfatal"),
                variable.name="Measure",
                value.name="value")
head(plotdata,n=2)
tail(plotdata,n=2)

plotboth <- ggplot(data=plotdata, aes(x=mydate, y=value))+
     ggtitle("P(fatal) and # accidents by date")+
     xlab("Date")+ylab("p(fatal) or # accidents")+
     geom_point()+
     geom_smooth()+
     facet_wrap(~Measure, ncol=1, scales="free_y")
plotboth
ggsave(plot=plotboth, 
  file=file.path("..","..","MyStuff","Images","accidents-summary-by-date3.png"), 
  h=6, w=6, units="in", dpi=300)


naccidents$weekday <- format(naccidents$mydate, format="%w")
naccidents[1:10,]

plotnacc2 <- ggplot(data=naccidents, aes(x=weekday, y=naccidents))+
  ggtitle("Number of accidents/day by day of the week")+
  xlab("Day of the week")+ylab("Number of accidents/day")+
  geom_point(position=position_jitter(w=0.2))+
  geom_boxplot(notch=TRUE, alpha=0.2, outlier.size=-1, outlier.colour="red")
plotnacc2
ggsave(plot=plotnacc2, 
  file=file.path("..","..","MyStuff","Images","accidents-summary-by-weekday.png"),
  h=4, w=6, units="in", dpi=300)


#-------- Exercise 2
# The accident data
accidents <- read.csv(
     file.path('..','sampledata','Accidents','road-accidents-2010.csv'), 
     header=TRUE,
     as.is=TRUE, strip.white=TRUE)
accidents[1:5,]
str(accidents)

# Convert date to internal date format
accidents$mydate <- as.Date(accidents$Date, format="%d/%m/%Y")
sum(is.na(accidents$mydate))
accidents[1:5,]
str(accidents)


# tabulate number of accidents on each date
# The summarize function from the plyr package is very useful for group processing
library(plyr)
naccidents <- ddply(accidents, "mydate", summarize,
                    freq=length(Accident_Index))
naccidents[1:5,]
str(naccidents)


# Plot the number of acidents by day of the year
plotnacc <- ggplot(data=naccidents, aes(x=mydate, y=freq))+
     ggtitle("Number of accidents/day by date")+
     xlab("Date")+ylab("Number of accidents/day")+
     geom_point()+
     geom_smooth()
plotnacc
ggsave(plot=plotnacc, 
  file=file.path("..","..","MyStuff","Images","accidents-by-date.png"), 
  h=4, w=6, units="in", dpi=300)


# Extract day of the week
# Leave in character form so that box plots work properly
naccidents$weekday <- format(naccidents$mydate, format="%w")
naccidents[1:10,]

plotnacc2 <- ggplot(data=naccidents, aes(x=weekday, y=freq))+
  ggtitle("Number of accidents/day by day of the week")+
  xlab("Day of the week")+ylab("Number of accidents/day")+
  geom_point(position=position_jitter(w=0.2))+
  geom_boxplot(notch=TRUE, alpha=0.2, outlier.size=-1, outlier.colour="red")
plotnacc2
ggsave(plot=plotnacc2, 
  file=file.path("..","..","MyStuff","Images","accidents-by-weekday.png"),
  h=4, w=6, units="in", dpi=300)


# Repeat for proportion of fatalities
names(accidents)
unique(accidents$Accident_Severity)
library(car)
accidents$Fatality <- car::recode(accidents$Accident_Severity,
              ' 1=1; 2:hi=0')
accidents[1:5, c("Accident_Severity", "Fatality")]

xtabs(~Fatality + Accident_Severity, data=accidents)

# find the proportion of fatalities.
# Note that the mean of a 0/1 variable is a proportion
library(plyr)
pfatal.df <- ddply(accidents, "mydate", summarize,
            freq=length(mydate),
            pfatal=mean(Fatality))
pfatal.df[1:5,]

# Plot the number of acidents by day of the year
plotpfatal <- ggplot(data=pfatal.df, aes(x=mydate, y=pfatal))+
     ggtitle("P(fatal) by date")+
     xlab("Date")+ylab("P(fatal)")+
     geom_point()+
     geom_smooth()
plotpfatal
ggsave(plot=plotpfatal, 
  file=file.path("..","..","MyStuff","Images","accidents-pfatal-by-date.png"),
  h=4, w=6, units="in", dpi=300)


# Extract day of the week
# Leave as character so that the boxplots work properly
pfatal.df$weekday <- format(pfatal.df$mydate, format="%w") # leave as character
pfatal.df[1:10,]

plotpfatal2 <- ggplot(data=pfatal.df, aes(x=weekday, y=pfatal))+
  ggtitle("P(fatal) by day of the week")+
  xlab("Day of the week")+ylab("P(fatal)")+
  geom_point(position=position_jitter(w=0.2))+
  geom_boxplot(notch=TRUE, alpha=0.2, outlier.size=-1)
plotpfatal2
ggsave(plot=plotpfatal2, 
  file=file.path("..","..","MyStuff","Images","accidents-pfatal-by-weekday.png"),
  h=4, w=6, units="in", dpi=300)


#-------------------------------------
# Exercise IV
# Number of flights by day from nyc
# Notice how you can read a zip file directly

flights<- read.csv(
   unz(file.path('..','sampledata','nycflights13','flights.csv.zip'), "flights.csv"), 
   header=TRUE, as.is=TRUE, strip.white=TRUE)
flights<- readr::read_csv(file.path('..','sampledata','nycflights13','flights.csv.zip'))

head(flights)

# make date by combining the columns
flights$Date <- lubridate::make_date(year=flights$year,
                                     month=flights$month,
                                     day  =flights$day)
nflights <- plyr::ddply(flights, "Date", plyr::summarize,
                        n.flights=length(Date))
ggplot(data=nflights, aes(x=Date, y=n.flights))+
  ggtitle("Number of flights leaving NYC")+
  geom_point()+
  geom_line()

# Classify dates by day of the week
nflights$dow <- wday(nflights$Date) # notice 1=Monday - see help(wday)

ggplot(data=nflights, aes(x=as.factor(dow), y=n.flights))+
  ggtitle("Number of flights by day of week")+
  geom_point(position=position_jitter(w=.2))+
  geom_boxplot(alpha=.20, outlier.size=0)


###########################################################################
###########################################################################
###########################################################################
###########################################################################
###########################################################################

#------------------- Dates and Times in R -------------------
# Use the POSIXct (constant time with tz=UTC) representation when every possible
# Create a virtual file with the test data
testDateTimes.csv <- textConnection("
dt1c,   dt2c   
2013-10-23 10:23, 2012-07-13 1:23:00 
2013-11-23 25:23, 2012-07-14 3:23:00 
2013-12-23 13:23, 2012-07-15 5:23:10 
2013-12-24 10:62, 2013-07-15 7:23:23 ") # example of inputing data inline

my.dt <- read.csv(testDateTimes.csv, header=TRUE,  # notice no quotes around virtual file name
            as.is=TRUE, strip.white=TRUE)
my.dt
str(my.dt) 

# Convert from character to internal DateTime representation
my.dt$dt1 <- as.POSIXct(my.dt$dt1c, 
              format="%Y-%m-%d %H:%M", tz="UTC") 
my.dt$dt1
as.numeric(my.dt$dt1)
str(my.dt)

# Look what happens if you don't specify a time zone
my.dt$dt1b <- as.POSIXct(my.dt$dt1c, 
              format="%Y-%m-%d %H:%M") 
my.dt$dt1b
as.numeric(my.dt$dt1b)
str(my.dt)

# read in the the other data values
my.dt$dt2 <- as.POSIXct(my.dt$dt2c, 
              format="%Y-%m-%d %H:%M", tz="UTC") 
my.dt$dt2
as.numeric(my.dt$dt2)


# Arithmetic operations allowed
mean(my.dt$dt2)
as.numeric(mean(my.dt$dt2))

# sequences of dates etc
seq(from=my.dt$dt2[1], by="3 weeks", length.out=3)


# extract the various bits from the date-time format
as.numeric(format(my.dt$dt2, "%d"))
as.numeric(format(my.dt$dt2, "%H"))


# Beware of daylight saving changes
# In BC, this occurred at 2013-11-03 at 2:00 am
# Compare the behaviour of 
mytime <- as.POSIXct("2013-11-03 01:57:00", tz="UTC")
mytime
seq(mytime, by='1 min', length.out=10)
as.numeric(seq(mytime, by='1 min', length.out=10))

mytime <- as.POSIXct("2013-11-03 01:57:00")
mytime
seq(mytime, by='1 min', length.out=10)
as.numeric(seq(mytime, by='1 min', length.out=10))


# But be careful of daylight savings time problems
# In BC, this occurred at 2013-11-03 at 2:00 am

mytime <- as.POSIXct("2013-11-03 01:57:00", tz="UTC")
as.numeric(format(seq(mytime, by='1 min', length.out=10), "%H"))

mytime <- as.POSIXct("2013-11-03 01:57:00")
as.numeric(format(seq(mytime, by='1 min', length.out=10),"%H"))


# useful functions from lubridate
arrive1 <- lubridate::ymd_hms("2011-06-04 12:00:00", tz = "Pacific/Auckland")
arrive1
arrive2 <- lubridate::ymd_hms("2011-06-04 12:00:00") # default tz="UTC"
arrive2
arrive1-arrive2

lubridate::second("2011-06-04 12:01:02")

# time intervals
vacation.interval <- lubridate::interval(lubridate::ymd("2017-07-01"),
                                        lubridate::ymd('2017-07-22'))
vacation.interval

# Other functions that work with intervals include int_start, int_end, int_flip,
#    int_shift, int_aligns, union, intersect, setdiff, and %within%.
int_length(vacation.interval) # in seconds
int_length(vacation.interval)/(24*3600) # in days


# Dealing with time zone if you must
# List of current time zone recognized by R
OlsonNames()


#---------------- Exercise with the accident data -----------------
# The accident data
accidents <- read.csv(
  file.path('..','sampledata','Accidents','road-accidents-2010.csv'),
  header=TRUE,
  as.is=TRUE, strip.white=TRUE)

# Convert date to internal date format
accidents$mydate <- as.Date(accidents$Date, format="%d/%m/%Y")

accidents$DateTime <- paste(accidents$Date, accidents$Time)
accidents[1:5,c("Date","Time","DateTime")]

# Convert date & time to internal DateTime format
accidents$mydt <- as.POSIXct(accidents$DateTime,
                  format="%d/%m/%Y %H:%M", tz="UTC")
accidents[1:5,c("Date","Time","DateTime","mydt")]
str(accidents)

# look at missing values
sum(is.na(accidents$mydt))
accidents[is.na(accidents$mydt),c("Accident_Index","Date","Time","mydt")]

# Plot the time of the accidents by the minute it occurred
# Extract the minute of the accident
accidents$min <- as.numeric(format(accidents$mydt, "%M"))
accidents[1:5, c("Date","Time","mydt","min")]

plotaccmin <- ggplot(data=accidents, aes(x=min))+
  ggtitle("Minute of when accidents occur")+
  xlab("Minute of the hour")+ylab("Number of accidents")+
  geom_histogram(binwidth=1, alpha=0.2)
plotaccmin
ggsave(plot=plotaccmin, 
  file=file.path("..","..","MyStuff","Images","accidents-time-by-minute.png"),
  h=4, w=6, units="in", dpi=300)


# find the proportion of fatalities by the hour of the day
# We currently ignore any effects of different daylight hours.
# Note that the mean of a 0/1 variable is a proportion
names(accidents)
unique(accidents$Accident_Severity)
accidents$Fatality <- accidents$Accident_Severity==1
xtabs(~Fatality + Accident_Severity, data=accidents)

accidents$hour <- as.numeric(format(accidents$mydt, "%H"))
xtabs(~hour, data=accidents)
sum(is.na(accidents$hour))
accidents[1:5, c("Date","Time","mydt","Fatality","hour")]

library(plyr)
pfatal.df <- ddply(accidents, "hour", summarize,
              pfatal=mean(Fatality))
pfatal.df[1:5,]

plotpfatalhour <- ggplot(data=pfatal.df, aes(x=hour, y=pfatal))+
  ggtitle("Proportion of accidents by hour of the day")+
  xlab("Hour of the day")+ylab("P(fatal)")+
  geom_point()+
  geom_line()
plotpfatalhour
ggsave(plot=plotpfatalhour,
  file=file.path("..","..","MyStuff","Images","accidents-pfatal-by-hour.png"),
  h=4, w=6, units="in", dpi=300)


#-------------------------------------
# Exercise II
# Number of flights by day from nyc
# Notice how you can read a zip file directly

flights<- read.csv(
  unz(file.path('..','sampledata','nycflights13','flights.csv.zip'), "flights.csv"), 
  header=TRUE, as.is=TRUE, strip.white=TRUE)
flights<- readr::read_csv(file.path('..','sampledata','nycflights13','flights.csv.zip'))

head(flights)

# make date by combining the columns
flights$DateTime <- lubridate::make_datetime(year=flights$year,
                                     month=flights$month,
                                     day  =flights$day,
                                     hour =floor(flights$dep_time / 100),
                                     min  = flights$dep_time %% 100)
head(flights[,c("year","month","day","dep_time","DateTime")])

flights$hour <- lubridate::hour(flights$DateTime)
flights$min  <- lubridate::minute(flights$DateTime)

ggplot(data=flights, aes(x=hour))+
  ggtitle("Number of flights by hour of the day")+
  geom_histogram(breaks=0:24)

ggplot(data=flights, aes(x=min))+
  ggtitle("Number of flights by hour of the day")+
  geom_histogram(breaks=0:60)


#########################################################################
#########################################################################
#########################################################################
#########################################################################

#--------------------------------------------------------------------
# Duration data

library(lubridate)
testDuration.csv <- textConnection("
du1c,   du2c   
10:23, 1:23:00 
25:23, 3:23:00 
13:23, 5:23:10 
10:62, 7:23:23 ") # example of inputing data inline

my.du <- read.csv(testDuration.csv, header=TRUE,  # notice no quotes around virtual file name
            as.is=TRUE, strip.white=TRUE)
my.du
str(my.du) 

# Notice that first duration is ambiguous, is it HH:MM or MM:SS?
my.du$du1a <- lubridate::ms(my.du$du1c)
my.du$du1a
as.numeric(my.du$du1a)

my.du$du1b <- lubridate::hm(my.du$du1c)
my.du$du1b
as.numeric(my.du$du1b)

my.du$du2 <- lubridate::hms(my.du$du2c)
my.du$du2
as.numeric(my.du$du2)

mean(my.du$du1a)  #???????
mean(as.duration(my.du$du1a))


#-------------------------------------
# Exercise I
# Number of flights by day from nyc
# Notice how you can read a zip file directly

flights<- read.csv(
  unz(file.path('..','sampledata','nycflights13','flights.csv.zip'), "flights.csv"), 
  header=TRUE,
  as.is=TRUE, strip.white=TRUE)
flights<- readr::read_csv(file.path('..','sampledata','nycflights13','flights.csv.zip'))

head(flights)

# Create duration data
flights$dep_delay2 <- lubridate::dminutes(flights$dep_delay)
flights$arr_delay2 <- lubridate::dminutes(flights$arr_delay)

head(flights[,c("year","month","day","dep_delay","dep_delay2","arr_delay","arr_delay2")])

# Hmmm bug in lubridate package when plotting
# See https://github.com/tidyverse/ggplot2/issues/2414
ggplot(data=flights, aes(x=dep_delay2@.Data))+
  ggtitle("Departure delays")+
  geom_histogram()

ggplot(data=flights, aes(x=air_time, y=dep_delay2@.Data))+
  ggtitle("Relationship between air time and departure delay")+
  geom_point()


#########################################################################
#########################################################################
#########################################################################
#########################################################################

#--------------------------------------------------------------------
# Time of day data

# is the first column hh:mm? 

library(hms)
testTod.csv <- textConnection("
tod1c,   tod2c   
10:23, 1:23:00 
25:23, 3:23:00 
13:23, 5:23:10 
10:62, 7:23:23 ") # example of inputing data inline

my.tod <- read.csv(testTod.csv, header=TRUE,  # notice no quotes around virtual file name
               as.is=TRUE, strip.white=TRUE)
my.tod
str(my.tod) 

# convert to time of day
my.tod$tod1 <- hms::parse_hm(my.tod$tod1c) 
my.tod$tod2 <- hms::parse_hm(my.tod$tod2c) 
my.tod

# does averaging work? properly
as.hms(mean(my.tod$tod2))
as.hms(mean( c(hms::parse_hm("23:50"), hms::parse_hm("00:20"))))

my.tod$tod2[1]-my.tod$tod2[4]
my.tod$tod2[4]-my.tod$tod2[1]
hms::parse_hm("23:50") - hms::parse_hm("00:20")
hms::parse_hm("00:20") - hms::parse_hm("23:50") 


#------------------------------------------------------------------------------------
# Make accident summary by date

# The accident data
accidents <- read.csv(
  file.path('..','sampledata','Accidents','road-accidents-2010.csv'),
  header=TRUE,
  as.is=TRUE, strip.white=TRUE)
accidents[1:5,]

accident.summary <- plyr::ddply(accidents, "Date", plyr::summarize,
                                naccidents=length(Date),
                                nfatal    =sum(Accident_Severity==1))
accident.summary
write.csv(accident.summary, "accident-summary.csv", row.names=FALSE)