Austin Bikes
Austin Bike Data Exploration
The data set and its description can be found here.
The GitHub page with the cleaned-up data and presentation is here.
Libraries I might use
library(ISLR)
library(tidyverse)
library(randomForest)
library(gbm)
library(MASS)
library(lubridate)
library(zoo)
library(GGally)
library(e1071) #SVM and NB library
Loading in the Data
bikedata = read.csv("Austin_MetroBike_Trips.csv")
weatherdata = read.csv("austindaily.csv")
bikedata
weatherdata
Data Cleaning
Weather Data
weatherclean = filter(weatherdata, year(X1938.06.01)>2012) # X1938.06.01 is the date column as read.csv named it; keep 2013 onward
weatherclean = weatherclean[,c(1,2)]
colnames(weatherclean) <- c("Date","AvgTemp")
weatherclean$Date = as.Date(strptime(weatherclean$Date, format = '%Y-%m-%d'))
summary(weatherclean)
Date AvgTemp
Min. :2013-01-01 Min. :-9.20
1st Qu.:2015-02-12 1st Qu.:15.20
Median :2017-03-26 Median :22.00
Mean :2017-03-26 Mean :20.86
3rd Qu.:2019-05-07 3rd Qu.:27.50
Max. :2021-06-20 Max. :34.00
NA's :88
But we can see we have 88 dates with missing temperature values. I would like to impute each missing value as the average of the observations around it, but is that a sound strategy? Let's punch the same share of holes into the complete data and see how far off that imputation lands.
set.seed(123)
weathercleannn=na.omit(weatherclean)
fakeweather = weathercleannn
nas = sample (1:nrow(weathercleannn), nrow(weathercleannn)*0.02934703) # The same rate of missing values as in our original data set
fakeweather$AvgTemp[nas] <- NA
fakeweather$AvgTemp = (na.locf(fakeweather$AvgTemp) + rev(na.locf(rev(fakeweather$AvgTemp))))/2 # Fills in missing values with the average of the weather values on either side
mean((weathercleannn$AvgTemp[nas]-fakeweather$AvgTemp[nas])^2)
[1] 5.109943
Great! An MSE of about 5.1 (an RMSE of roughly 2.3 °C) keeps us in a reasonable range of the true temperature, so we can apply this method to our dataset.
weatherclean$AvgTemp = (na.locf(weatherclean$AvgTemp) + rev(na.locf(rev(weatherclean$AvgTemp))))/2
weatherclean
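As a cross-check (my own addition, not the method used above), zoo::na.approx linearly interpolates across the same artificial gaps; for single-day holes it reduces to the same neighbor average, so the error should land in a similar range.
approxweather = weathercleannn
approxweather$AvgTemp[nas] <- NA
approxweather$AvgTemp = na.approx(approxweather$AvgTemp, rule = 2) # linear interpolation; rule = 2 carries the end values
mean((weathercleannn$AvgTemp[nas] - approxweather$AvgTemp[nas])^2)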
Bike Data
bikeclean = bikedata[,c(2,4,5,7,9,10)]
bikeclean[bikeclean==""] <- NA
bikeclean$Checkout.Time=hms(bikeclean$Checkout.Time)
bikeclean$Checkout.Date = as.Date(bikeclean$Checkout.Date,format='%m/%d/%Y')
summary((bikeclean))
Membership.Type Checkout.Date Checkout.Time Checkout.Kiosk Return.Kiosk Trip.Duration.Minutes
Length:1342066 Min. :2013-12-21 Min. :0S Length:1342066 Length:1342066 Min. : 0.00
Class :character 1st Qu.:2015-12-11 1st Qu.:12H 3M 46S Class :character Class :character 1st Qu.: 6.00
Mode :character Median :2017-09-21 Median :15H 12M 0S Mode :character Mode :character Median : 13.00
Mean :2017-06-08 Mean :14H 46M 47.7450237171652S Mean : 30.59
3rd Qu.:2018-08-27 3rd Qu.:18H 12M 0S 3rd Qu.: 28.00
Max. :2021-02-28 Max. :23H 59M 56S Max. :34238.00
Checkout Kiosk
sort(unique(bikeclean$Checkout.Kiosk))
[1] "10th & Red River" "10th/Red River" "11th & Salina"
[4] "11th & Salina " "11th & San Jacinto" "11th/Congress @ The Texas Capitol"
[7] "11th/Salina " "11th/San Jacinto" "12th/San Jacinto @ State Capitol Visitors Garage"
[10] "13th & San Antonio" "13th/San Antonio" "16th/San Antonio"
[13] "17th & Guadalupe" "17th/Guadalupe" "21st & Speedway @PCL"
[16] "21st & University" "21st/Guadalupe" "21st/Speedway @ PCL"
[19] "21st/University" "22nd & Pearl" "22nd/Pearl"
[22] "23rd & Rio Grande" "23rd & San Jacinto @ DKR Stadium" "23rd/Rio Grande"
[25] "23rd/San Jacinto @ DKR Stadium" "26th/Nueces" "28th/Rio Grande"
[28] "2nd & Congress" "2nd/Congress" "2nd/Lavaca @ City Hall"
[31] "3rd & West" "3rd/Nueces" "3rd/Trinity @ The Convention Center"
[34] "3rd/West" "4th & Congress" "4th/Congress"
[37] "4th/Guadalupe @ Republic Square" "4th/Neches @ MetroRail Downtown" "4th/Sabine"
[40] "5th & Bowie" "5th & Campbell" "5th & San Marcos"
[43] "5th/Bowie" "5th/Campbell" "5th/Guadalupe @ Republic Square"
[46] "6th & Chalmers" "6th & Chalmers " "6th & Congress"
[49] "6th & Navasota St." "6th/Brazos" "6th/Chalmers "
[52] "6th/Congress" "6th/Lavaca" "6th/Trinity"
[55] "6th/West" "8th & Congress" "8th & Guadalupe"
[58] "8th & Lavaca" "8th/Congress" "8th/Lavaca"
[61] "8th/Red River" "8th/San Jacinto" "9th/Henderson"
[64] "ACC - Rio Grande & 12th" "ACC - West & 12th" "ACC - West & 12th Street"
[67] "Barton Springs & Riverside" "Barton Springs @ Kinney Ave" "Barton Springs Pool"
[70] "Barton Springs/Bouldin @ Palmer Auditorium" "Barton Springs/Kinney" "Boardwalk West"
[73] "Brazos & 6th" "Bullock Museum @ Congress & MLK" "Capital Metro HQ - East 5th at Broadway"
[76] "Capitol Station / Congress & 11th" "cesar Chavez/Congress" "Cesar Chavez/Congress"
[79] "City Hall / Lavaca & 2nd" "Congress & Cesar Chavez" "Convention Center / 3rd & Trinity"
[82] "Convention Center / 4th St. @ MetroRail" "Convention Center/ 3rd & Trinity" "Customer Service"
[85] "Davis at Rainey Street" "Dean Keeton & Speedway" "Dean Keeton & Speedway "
[88] "Dean Keeton & Whitis" "Dean Keeton/Speedway" "Dean Keeton/Speedway "
[91] "Dean Keeton/Whitis" "East 11th St. & San Marcos" "East 11th St. at Victory Grill"
[94] "East 11th Street at Victory Grill" "East 11th/San Marcos" "East 11th/Victory Grill"
[97] "East 2nd & Pedernales" "East 2nd/Pedernales" "East 4th & Chicon"
[100] "East 4th/Chicon" "East 5th/Broadway @ Capital Metro HQ" "East 5th/Shady @ Eastside Bus Plaza"
[103] "East 5th/Shady Ln" "East 6th & Pedernales St." "East 6th at Robert Martinez"
[106] "East 6th/Medina" "East 6th/Pedernales" "East 6th/Robert T. Martinez"
[109] "East 7th & Pleasant Valley" "Eeyore's 2017" "Eeyore's 2018"
[112] "Electric Drive/Sandra Muraida Way @ Pfluger Ped Bridge" "Guadalupe & 21st" "Guadalupe & 6th"
[115] "Guadalupe/West Mall @ University Co-op" "Henderson & 9th" "Hollow Creek & Barton Hills"
[118] "Hollow Creek/Barton Hills" "Lake Austin & Enfield" "Lake Austin Blvd @ Deep Eddy"
[121] "Lake Austin Blvd/Deep Eddy" "Lake Austin/Enfield" "Lakeshore & Pleasant Valley"
[124] "Lakeshore @ Austin Hostel" "Lakeshore/Austin Hostel" "Lakeshore/Pleasant Valley"
[127] "Lavaca & 6th" "Long Center @ South 1st & Riverside" "Main Office"
[130] "MapJam at French Legation" "MapJam at Hops & Grain Brewery" "MapJam at Pan Am Park"
[133] "MapJam at Scoot Inn" "Marketing Event" "Medina & East 6th"
[136] "Mobile Station" "Mobile Station @ Bike Fest" "Mobile Station @ Boardwalk Opening Ceremony"
[139] "Mobile Station @ Unplugged" "MoPac Pedestrian Bridge @ Veterans Drive" "Nash Hernandez @ RBJ South"
[142] "Nash Hernandez/East @ RBJ South" "Nueces & 26th" "Nueces & 3rd"
[145] "Nueces @ 3rd" "Palmer Auditorium" "Pease Park"
[148] "Pfluger Bridge @ W 2nd Street" "Plaza Saltillo" "Rainey @ River St"
[151] "Rainey St @ Cummings" "Rainey/Cummings" "Rainey/Davis"
[154] "Re-branding" "Red River & 8th Street" "Red River @ LBJ Library"
[157] "Red River/Cesar Chavez @ The Fairmont" "Repair Shop" "Republic Square"
[160] "Republic Square @ 5th & Guadalupe" "Republic Square @ Federal Courthouse Plaza" "Republic Square @ Guadalupe & 4th St."
[163] "Rio Grande & 28th" "Riverside @ S. Lamar" "Riverside/South Lamar"
[166] "Rosewood & Angelina" "Rosewood & Chicon" "Rosewood/Angelina"
[169] "Rosewood/Chicon" "San Jacinto & 8th Street" "Shop"
[172] "South 1st/Riverside @ Long Center" "South Congress & Academy" "South Congress & Barton Springs at the Austin American-Statesman"
[175] "South Congress & Elizabeth" "South Congress & James" "South Congress @ Bouldin Creek"
[178] "South Congress/Academy" "South Congress/Barton Springs @ The Austin American-Statesman" "South Congress/Elizabeth"
[181] "South Congress/James" "State Capitol @ 14th & Colorado" "State Capitol Visitors Garage @ San Jacinto & 12th"
[184] "State Parking Garage @ Brazos & 18th" "Sterzing at Barton Springs" "Sterzing/Barton Springs"
[187] "Stolen" "Toomey Rd @ South Lamar" "Trinity & 6th Street"
[190] "UT West Mall @ Guadalupe" "Veterans/Atlanta @ MoPac Ped Bridge" "Waller & 6th St."
[193] "West & 6th St." "Zilker Park" "Zilker Park at Barton Springs & William Barton Drive"
[196] "Zilker Park West"
Oh no, we have a lot of cleaning to do here: the same kiosk shows up under several different spellings.
tmp = tolower(bikeclean$Checkout.Kiosk)
tmp = gsub("\\.", " ",tmp)
tmp = gsub("&", "/",tmp)
tmp = gsub("street", "",tmp)
tmp = gsub(" st ", "",tmp)
tmp = gsub(" at ", "@",tmp)
tmp = gsub("@", "/",tmp)
tmp = gsub(" ", "",tmp)
tmp = data.frame(tmp)
tmp = separate(data = tmp, col = 1, into = c("1st","2nd","3rd","4th"), sep = "[^[:alnum:]]+")
Expected 4 pieces. Missing pieces filled with `NA` in 1323839 rows [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
tmp[is.na(tmp)] <- "ZZZZZ" # Sort doesn't like NA values, so I'm replacing them with "ZZZZZ" so they sort to the end of the row
tmp = data.frame(t(apply(tmp,1,sort))) # Sorts by row
tmp = unite(tmp, united, sep="/")
tmp = gsub("/ZZZZZ", "",tmp$united)
tmp = gsub("3rd/theconventioncenter/trinity","3rd/conventioncenter/trinity",tmp)
tmp = gsub("slamar","southlamar",tmp)
tmp = gsub("bartonsprings/kinneyave","bartonsprings/kinney",tmp)
tmp = gsub("branding/re","rebranding",tmp)
tmp = gsub("guadalupe/op/universityco/westmall","guadalupe/utwestmall",tmp)
tmp = gsub("east6th/roberttmartinez","east6th/robertmartinez",tmp)
bikeclean$Checkout.Kiosk = tmp
Return Kiosk
tmp = tolower(bikeclean$Return.Kiosk)
tmp = gsub("\\.", " ",tmp)
tmp = gsub("&", "/",tmp)
tmp = gsub("street", "",tmp)
tmp = gsub(" st ", "",tmp)
tmp = gsub(" at ", "@",tmp)
tmp = gsub("@", "/",tmp)
tmp = gsub(" ", "",tmp)
tmp = data.frame(tmp)
tmp = separate(data = tmp, col = 1, into = c("1st","2nd","3rd","4th"), sep = "[^[:alnum:]]+")
Expected 4 pieces. Missing pieces filled with `NA` in 1323892 rows [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
tmp[is.na(tmp)] <- "ZZZZZ" # Sort doesn't like NA values, so I'm replacing them with "ZZZZZ" so they sort to the end of the row
tmp = data.frame(t(apply(tmp,1,sort))) # Sorts by row
tmp = unite(tmp, united, sep="/")
tmp = gsub("/ZZZZZ", "",tmp$united)
tmp = gsub("3rd/theconventioncenter/trinity","3rd/conventioncenter/trinity",tmp)
tmp = gsub("slamar","southlamar",tmp)
tmp = gsub("bartonsprings/kinneyave","bartonsprings/kinney",tmp)
tmp = gsub("branding/re","rebranding",tmp)
tmp = gsub("guadalupe/op/universityco/westmall","guadalupe/utwestmall",tmp)
tmp = gsub("east6th/roberttmartinez","east6th/robertmartinez",tmp)
Let’s compare the drop off and pickup groups to see if we need to recode anything.
comparison = data.frame(sort(unique(tmp)),c(sort(unique(bikeclean$Checkout.Kiosk)), NA, NA, NA,NA))
colnames(comparison) <- c("Return", "Checkout")
comparison$Return[!comparison$Return %in% comparison$Checkout]
[1] "acl2019dropoff" "earthdayatx2017" "fantasyzilker" "mainshop" "missing"
comparison$Checkout[!comparison$Checkout %in% comparison$Return]
[1] "eeyore/s2018" NA NA NA NA
"mainshop" should be recoded to "shop".
tmp = gsub("mainshop","shop",tmp)
bikeclean$Return.Kiosk = tmp
Membership Types
I want to encode these as the following:
Single Trip
“$1 Pay by Trip Fall Special”,“Single Trip”,“Try Before You Buy Special”,“$1 Pay by Trip Winter Special”,“RideScout Single Ride”,“Single Trip”,“Single Trip Ride”, “Pay-as-you-ride”,“Single Trip (Pay-as-you-ride)”,“24-Hour Kiosk (Austin B-cycle)”,“24 Hour Walk Up Pass”,“Walk Up”
Daily
“24-Hour-Online (Austin B-cycle)”,“24-Hour Membership (Austin B-cycle)”,“Explorer”,“Explorer ($8 plus tax)”
3-Day
“3-Day Explorer”,“Weekender”,“3-Day Weekender”,“Weekender ($15 plus tax)”
Weekly
“7-Day”, “7-Day Membership (Austin B-cycle)”
Monthly
“Local30 ($11 plus tax)”,“Local30”,“Local31”
Annual
“Annual”, “Annual Member”, “Annual Membership”, “Annual Membership (Austin B-cycle)”, “Annual Pass”, “Annual Pass (30 minute)”, “Annual Plus”, “Annual Plus Membership”, “Local365”, “Local365 ($80 plus tax)”, “Local365- 1/2 off Anniversary Special”, “Local365 Youth (age 13-17 riders)”, “Local365 Youth (age 13-17 riders)- 1/2 off Special”, “Local365 Youth with helmet (age 13-17 riders)”, “Local365+Guest Pass”, “Local365+Guest Pass- 1/2 off Anniversary Special”, “Membership: pay once one-year commitment”
Student
“HT Ram Membership”, “Semester Membership”, “Semester Membership (Austin B-cycle)”, “UT Student Membership”, “U.T. Student Membership”
Event
“ACL 2019 Pass”,“ACL Weekend Pass Special (Austin B-cycle)”,“FunFunFun Fest 3 Day Pass”
Share
“Annual (Broward B-cycle)”,“Annual (Denver B-cycle)”,“Annual (Kansas City B-cycle)”,“Annual (Nashville B-cycle)”,“Annual (San Antonio B-cycle)”,“Annual Member (Houston B-cycle)”,“Annual Membership (Charlotte B-cycle)”,“Annual Membership (GREENbike)”,“Denver B-cycle Founder”,“Heartland Pass (Annual Pay)”,“Madtown Monthly”,“Republic Rider”,“Annual (Cincy Red Bike)”,“Annual (Omaha B-cycle)”,“Annual (Madison B-cycle)”,“Annual (Denver Bike Sharing)”,“Annual Membership (Fort Worth Bike Sharing)”,“Heartland Pass (Monthly Pay)”,“Republic Rider (Annual)”
Other
“Aluminum Access”, “Founding Member”, “Founding Member (Austin B-cycle)”
Missing
NA, “PROHIBITED”, “RESTRICTED”
bikeclean$Membership.Type[is.na(bikeclean$Membership.Type)] = "missing"
bikeclean$Membership.Type[which(bikeclean$Membership.Type %in% c("PROHIBITED", "RESTRICTED"))]="missing"
bikeclean$Membership.Type[which(bikeclean$Membership.Type %in% c("3-Day Explorer","Weekender","3-Day Weekender","Weekender ($15 plus tax)"))]="3 Day"
bikeclean$Membership.Type[which(bikeclean$Membership.Type %in% c("$1 Pay by Trip Fall Special","Single Trip","Try Before You Buy Special","$1 Pay by Trip Winter Special","RideScout Single Ride","Single Trip ","Single Trip Ride", "Pay-as-you-ride","Single Trip (Pay-as-you-ride)","24-Hour Kiosk (Austin B-cycle)", "24 Hour Walk Up Pass","Walk Up"))]="Single Trip"
bikeclean$Membership.Type[which(bikeclean$Membership.Type %in% c("24-Hour-Online (Austin B-cycle)","24-Hour Membership (Austin B-cycle)","Explorer","Explorer ($8 plus tax)"))]="Daily"
bikeclean$Membership.Type[which(bikeclean$Membership.Type%in%c("7-Day","7-Day Membership (Austin B-cycle)","7-Day Membership (Austin B-cycle)"))]="Weekly"
bikeclean$Membership.Type[which(bikeclean$Membership.Type%in%c("Local30 ($11 plus tax)","Local30","Local31"))]="Monthly"
bikeclean$Membership.Type[which(bikeclean$Membership.Type%in%c("Annual ","Annual Membership ","Annual Pass","Annual Plus","Local365","Local365 ($80 plus tax)","Local365 Youth (age 13-17 riders)- 1/2" ,"Local365+Guest Pass","Annual","Annual Member","Annual Membership","Annual Membership (Austin B-cycle)","Annual Pass (30 minute)","Annual Plus Membership","Local365- 1/2 off Anniversary Special","Local365 Youth (age 13-17 riders)", "Local365 Youth with helmet (age 13-17 riders)","Local365+Guest Pass- 1/2 off Anniversary Special","Membership: pay once one-year commitment","Local365 Youth (age 13-17 riders)- 1/2 off Special"))]="Annual"
bikeclean$Membership.Type[which(bikeclean$Membership.Type%in%c("HT Ram Membership","Semester Membership","UT Student Membership","Semester Membership (Austin B-cycle)","U.T. Student Membership","U.T. Student Membership"))]="Student"
bikeclean$Membership.Type[which(bikeclean$Membership.Type%in%c("ACL 2019 Pass","ACL Weekend Pass Special (Austin B-cycle)","FunFunFun Fest 3 Day Pass"))]="Event"
bikeclean$Membership.Type[which(bikeclean$Membership.Type%in%c("Aluminum Access","Founding Member (Austin B-cycle)","Founding Member"))]="Other"
bikeclean$Membership.Type[which(bikeclean$Membership.Type%in%c("Annual (Broward B-cycle)","Annual (Denver B-cycle)","Annual (Kansas City B-cycle)","Annual (Nashville B-cycle)","Annual (San Antonio B-cycle)","Annual Member (Houston B-cycle)","Annual Membership (Charlotte B-cycle)","Annual Membership (GREENbike)","Denver B-cycle Founder","Heartland Pass (Annual Pay)","Madtown Monthly","Republic Rider","Annual (Cincy Red Bike)","Annual (Omaha B-cycle)","Annual (Madison B-cycle)","Annual (Denver Bike Sharing)","Annual Membership (Fort Worth Bike Sharing)","Heartland Pass (Monthly Pay)","Republic Rider (Annual)", "Annual Membership (Indy - Pacers Bikeshare )", "Annual (Boulder B-cycle)"))]="Share"
sort(unique(bikeclean$Membership.Type))
[1] "3 Day" "Annual" "Daily" "Event" "missing" "Monthly" "Other" "Share" "Single Trip" "Student"
[11] "Weekly"
Looking good! I might try to predict our missing membership types if I have time; a rough sketch of one way to do that follows.
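A minimal, hypothetical sketch (not run as part of this analysis): treat the labelled trips as training data and guess the "missing" labels from trip length and checkout hour with naive Bayes from e1071. The kiosk columns are left out of this sketch only because of their high cardinality.
known = subset(bikeclean, Membership.Type != "missing")
unknown = subset(bikeclean, Membership.Type == "missing")
known$hour = hour(known$Checkout.Time) # checkout hour as a numeric feature
unknown$hour = hour(unknown$Checkout.Time)
known$Membership.Type = factor(known$Membership.Type)
nb.member = naiveBayes(Membership.Type ~ Trip.Duration.Minutes + hour, data = known)
table(predict(nb.member, newdata = unknown)) # predicted class counts for the missing trips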
Merging our Datasets
merged = merge(bikeclean,weatherclean,by.x = "Checkout.Date",by.y ="Date")
saveRDS(merged,file = 'merged.rds')
Now I don’t have to run everything above this EVERY time I need to start over.
merged = readRDS("merged.rds")
merged$Membership.Type=factor(merged$Membership.Type)
Data Visualization
ggplot(merged)+
geom_point(aes(x = Checkout.Date, y=round(Trip.Duration.Minutes/60)))+
geom_hline(yintercept=1,color = "red")
Trips are expected to be at most 1 hour in length; any longer and there is a fine for each minute you ride over. As time has gone on, more people aren't checking their bikes in on time; a quick year-by-year check is below.
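A rough way to quantify that (my own addition, using the 60-minute cutoff):
merged %>%
  group_by(year = year(Checkout.Date)) %>%
  summarise(share_late = mean(Trip.Duration.Minutes > 60)) # share of trips over the hour limit, by year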
I wonder what’s up with the weird gaps in 2016.
ggplot(merged[which(year(merged$Checkout.Date)==2016),])+
geom_point(aes(x = Checkout.Date, y=round(Trip.Duration.Minutes/60)))
ggplot(merged[which(year(merged$Checkout.Date)==2016 & month(merged$Checkout.Date)==4),])+
geom_point(aes(x = Checkout.Date, y=round(Trip.Duration.Minutes/60)))
merged[which(year(merged$Checkout.Date)==2016 & month(merged$Checkout.Date)==4),]
Great, no data for that month.
merged[which(year(merged$Checkout.Date)==2016 & month(merged$Checkout.Date)==12),]
Same with December.
ggplot(merged)+
geom_bar(aes(x=factor(month(Checkout.Date))), fill = "blue")+
facet_wrap(~year(Checkout.Date))
We can see there is a cyclical nature to the number of trips taken per month. Intuitively, there are more riders when the weather is nice.
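To loosely check the weather link (my own addition), plot monthly trip counts against the monthly mean temperature:
merged %>%
  group_by(month = floor_date(Checkout.Date, "month")) %>%
  summarise(trips = n(), avg_temp = mean(AvgTemp)) %>%
  ggplot(aes(x = avg_temp, y = trips)) +
  geom_point() # warmer months should sit higher if the intuition holds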
summary(merged$AvgTemp)
Min. 1st Qu. Median Mean 3rd Qu. Max.
-9.20 17.30 22.40 21.81 27.20 34.00
ggplot(merged)+
geom_histogram(aes(x= AvgTemp), bins = 45, fill = "blue")
Most people ride between 12 and 32 degrees Celsius.
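Roughly quantifying that (my own addition):
mean(merged$AvgTemp >= 12 & merged$AvgTemp <= 32) # share of trips taken on days in that temperature band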
merged %>%
group_by(Checkout.Kiosk) %>%
summarise(Departing_Bikes = length(Checkout.Kiosk)) %>% arrange(.,desc(Departing_Bikes))
merged %>%
ggplot() +
geom_bar(aes(x=Checkout.Kiosk), fill = "red")+
theme(axis.title.x=element_blank(), axis.text.x=element_blank(), axis.ticks.x=element_blank())
Bikes leave the campus Speedway kiosk at a very high rate.
merged %>%
ggplot() +
geom_bar(aes(x=Return.Kiosk), fill = "blue")+
theme(axis.title.x=element_blank(), axis.text.x=element_blank(), axis.ticks.x=element_blank())
merged %>%
group_by(Return.Kiosk) %>%
summarise(Departing_Bikes = length(Return.Kiosk)) %>% arrange(.,desc(Departing_Bikes))
Even more are returned there. Before looking at this day by day, a quick overall per-kiosk imbalance check is below.
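A hypothetical sketch (my own addition) of which kiosks gain or lose bikes on net over the whole period:
checkouts = merged %>% count(kiosk = Checkout.Kiosk) %>% rename(n_out = n)
returns = merged %>% count(kiosk = Return.Kiosk) %>% rename(n_in = n)
full_join(checkouts, returns, by = "kiosk") %>%
  mutate(net = coalesce(n_in, 0L) - coalesce(n_out, 0L)) %>% # positive = bikes pile up at this kiosk
  arrange(desc(abs(net)))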
Daily Differential Requiring Human Intervention
checkout = merged %>% group_by(Checkout.Date, Checkout.Kiosk) %>% tally() %>% spread(Checkout.Date, n)
checkout = checkout[-which(checkout$Checkout.Kiosk == "eeyore/s2018"),]
return = merged %>% group_by(Checkout.Date, Return.Kiosk) %>% tally() %>% spread(Checkout.Date, n)
return = return[-which(!return$Return.Kiosk %in% checkout$Checkout.Kiosk),]
checkout[is.na(checkout)] = 0
return[is.na(return)]= 0
diff = data.frame(return$Return.Kiosk,return[,-1]-checkout[,-1])
diff
daily_diff = data.frame(colSums(abs(diff[,-1])))
daily_diff <- rownames_to_column(daily_diff, "Date")
daily_diff$Date = gsub( "X", "", daily_diff$Date)
daily_diff$Date = as.Date(strptime(daily_diff$Date, format = '%Y.%m.%d'))
colnames(daily_diff) = c("Date","Diff")
daily_diff
ggplot(daily_diff)+
geom_bar(aes(x=Date, y = Diff), stat = "identity")
daily_diff = merge(daily_diff,weatherclean,by.x = "Date",by.y ="Date")
daily_diff$month = month(daily_diff$Date)
xgrid = seq(-10,50,.001)
ggplot(daily_diff) +
geom_point(aes(x = AvgTemp, y = Diff, color =factor(month(Date))))
This seems like one of those things we would want to predict so that we can know staffing levels. Let’s try to predict it with a random forest.
Random Forest
head(daily_diff)
set.seed(123)
train = sample (1:nrow(daily_diff), nrow(daily_diff)/2)
rf.dd= randomForest(Diff ~ Date + AvgTemp,data=daily_diff, subset=train)
yhat.rf = predict(rf.dd ,newdata=daily_diff[- train ,])
mean((yhat.rf-daily_diff[-train, "Diff"])^2)
[1] 2518.162
ggplot()+
geom_point(aes(x=daily_diff$Diff[-train],y = yhat.rf))+
geom_abline(color = "red")
summary(daily_diff)
Date Diff AvgTemp month
Min. :2013-12-21 Min. : 6.0 Min. :-9.20 Min. : 1.000
1st Qu.:2015-09-24 1st Qu.: 90.0 1st Qu.:15.00 1st Qu.: 3.000
Median :2017-08-26 Median :118.0 Median :22.10 Median : 6.000
Mean :2017-08-03 Mean :131.3 Mean :20.84 Mean : 6.393
3rd Qu.:2019-05-29 3rd Qu.:154.0 3rd Qu.:27.60 3rd Qu.: 9.000
Max. :2021-02-28 Max. :726.0 Max. :34.00 Max. :12.000
Not terrible, but there are some dates where the model gets it pretty wrong; the largest misses are worth a quick look below.
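A quick look at the largest test-set misses (my own addition):
worst = daily_diff[-train, ]
worst$resid = worst$Diff - yhat.rf
head(worst[order(-abs(worst$resid)), c("Date", "Diff", "resid")]) # dates the forest misses by the most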
Let’s try boosting.
set.seed(12)
boost.dd=gbm(Diff~AvgTemp+month(Date)+year(Date)+day(Date),data=daily_diff[train ,], distribution="gaussian",n.trees=5000, shrinkage = .01,interaction.depth = 3)
best.iter <- gbm.perf(boost.dd, method = "OOB", plot = FALSE)
OOB generally underestimates the optimal number of iterations although predictive performance is reasonably competitive. Using cv_folds>1 when calling gbm usually results in improved predictive performance.
yhat.boost=predict (boost.dd ,newdata =daily_diff[-train ,], n.trees=best.iter)
mean((yhat.boost-daily_diff[-train, "Diff"])^2)
[1] 3167.117
ggplot()+
geom_point(aes(x=daily_diff$Diff[-train],y = yhat.boost))+
geom_abline(color = "red")
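The warning above suggests cross-validation for picking the number of trees. A sketch of the same boosted fit with cv.folds (not run here, so the numbers may differ):
set.seed(12)
boost.cv = gbm(Diff~AvgTemp+month(Date)+year(Date)+day(Date), data=daily_diff[train ,],
  distribution="gaussian", n.trees=5000, shrinkage=.01, interaction.depth=3,
  cv.folds=5) # 5-fold CV instead of relying on the OOB estimate
best.iter.cv <- gbm.perf(boost.cv, method = "cv", plot = FALSE)
yhat.cv = predict(boost.cv, newdata=daily_diff[-train ,], n.trees=best.iter.cv)
mean((yhat.cv-daily_diff[-train, "Diff"])^2)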
daily_diff[daily_diff$Diff>300, 'Date']
[1] "2014-03-12" "2014-03-14" "2014-03-15" "2014-10-03" "2014-10-04" "2014-10-05" "2014-10-10" "2014-10-11" "2014-10-12" "2015-03-13" "2015-03-14"
[12] "2015-03-15" "2015-03-16" "2015-03-17" "2015-03-18" "2015-03-19" "2015-03-20" "2015-10-02" "2015-10-03" "2015-10-04" "2015-10-09" "2015-10-10"
[23] "2015-10-11" "2016-01-23" "2016-01-30" "2016-01-31" "2016-03-12" "2016-03-13" "2016-03-16" "2016-03-19" "2016-09-30" "2016-10-01" "2016-10-02"
[34] "2016-10-07" "2016-10-08" "2016-10-09" "2017-03-13" "2017-03-14" "2017-03-15" "2017-03-16" "2017-03-17" "2017-03-18" "2017-10-06" "2017-10-07"
[45] "2017-10-08" "2017-10-13" "2017-10-14" "2017-10-15" "2018-02-19" "2018-03-01" "2018-03-02" "2018-03-06" "2018-03-08" "2018-03-09" "2018-03-10"
[56] "2018-03-11" "2018-03-13" "2018-03-14" "2018-03-15" "2018-03-16" "2018-03-17" "2018-03-18" "2018-03-20" "2018-03-21" "2018-03-22" "2018-03-23"
[67] "2018-03-28" "2018-03-29" "2018-04-02" "2018-04-04" "2018-04-06" "2018-04-10" "2018-04-11" "2018-04-12" "2018-04-13" "2018-04-17" "2018-04-18"
[78] "2018-04-19" "2018-04-20" "2018-04-23" "2018-04-26" "2018-04-27" "2018-05-01" "2018-05-03" "2018-05-08" "2018-10-05" "2018-10-06" "2018-10-07"
[89] "2018-10-13"
ANN Try
library(neuralnet)
library(NeuralNetTools)
daily_diff$year = year(daily_diff$Date)
daily_diff$day = day(daily_diff$Date)
dd.scaled <- as.data.frame(scale(daily_diff[,-1])) # standardize every numeric column except Date
min.diff <- min(daily_diff$Diff) # keep the range so errors can be mapped back to raw counts later
max.diff <- max(daily_diff$Diff)
dd.scaled
dd.scaled$Diff <- scale(daily_diff$Diff, center = min.diff, scale = max.diff - min.diff) # min-max scale the response to [0,1] for the network
dd.split <- sample(dim(daily_diff)[1],dim(daily_diff)[1]/2 )
# Train-test split
dd.train.scaled <- dd.scaled[dd.split, ]
dd.test.scaled <- dd.scaled[-dd.split, ]
generate.full.fmla<- function(df, response){ # builds the formula: response ~ all remaining columns
names(df)[!(names(df) == response)]%>%
paste(.,collapse = "+")%>%
paste(response, "~", .)%>%
formula(.)
}
dd.nn.fmla <- generate.full.fmla(dd.train.scaled, "Diff")
dd.nn.5.3 <- neuralnet(dd.nn.fmla
, data=dd.train.scaled
, hidden=c(5,3)
, linear.output=TRUE)
dd.nn.8 <- neuralnet(dd.nn.fmla
, data=dd.train.scaled
, hidden=8
, linear.output=TRUE)
rf.dd= randomForest(dd.nn.fmla,data=dd.train.scaled)
fitness.measures <- function(test, model, response){
data.frame(y = test[, response], yhat = predict(model, newdata = test))%>%
summarize(MSE = sum((y-yhat)^2)/n(), MAD = sum(abs(y - yhat))/n())%>%
mutate(RMSE = sqrt(MSE))
}
fitness.measures(dd.test.scaled, dd.nn.5.3, "Diff")
fitness.measures(dd.test.scaled, dd.nn.8, "Diff")
fitness.measures(dd.test.scaled, rf.dd, "Diff")
models = list()
for (i in 3:12){
nn <- neuralnet(dd.nn.fmla
, data=dd.train.scaled
, hidden= i
, linear.output=TRUE)
models[[i]] <- nn
}
mse = list()
for (i in 3:12) {
mse[i] <- (fitness.measures(dd.test.scaled, models[[i]], "Diff")[1] * ((max.diff-min.diff)**2)) # slots 3-12: single-layer nets; rescale MSE back to the original Diff units
}
mse[1] <- fitness.measures(dd.test.scaled, dd.nn.5.3, "Diff")[1] * ((max.diff-min.diff)**2) # slot 1: the (5,3) network
mse[2] <- fitness.measures(dd.test.scaled, rf.dd, "Diff")[1] * ((max.diff-min.diff)**2) # slot 2: the random forest
mse
[[1]]
[1] 4037.12
[[2]]
[1] 3321.5
[[3]]
[1] 4496.449
[[4]]
[1] 4440.882
[[5]]
[1] 4367.814
[[6]]
[1] 4364.864
[[7]]
[1] 4650.306
[[8]]
[1] 4138.011
[[9]]
[1] 3994.522
[[10]]
[1] 3924.122
[[11]]
[1] 4297.862
[[12]]
[1] 4095.124
NeuralNetTools::garson(models[[10]])
NeuralNetTools::plotnet(models[[10]])
Trip Duration
ggplot(merged)+
geom_point(aes(x=Checkout.Kiosk, y = Trip.Duration.Minutes ))+
theme(axis.title.x=element_blank(), axis.text.x=element_blank(), axis.ticks.x=element_blank())
Some stations have no long trips associated with them at all, while others have TONS of them; a per-kiosk breakdown is below.
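To put numbers on that (my own addition), the share of over-an-hour trips by checkout kiosk:
merged %>%
  group_by(Checkout.Kiosk) %>%
  summarise(trips = n(), share_over_hour = mean(Trip.Duration.Minutes > 60)) %>%
  arrange(desc(share_over_hour))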
ggplot(merged)+
geom_point(aes(y=Membership.Type, x = Trip.Duration.Minutes ))+
geom_vline(xintercept = 60, color = "red")
Single trips run long the most often, but there are a lot of late trips across the board.
merged$late = ifelse(merged$Trip.Duration.Minutes > 60,1 ,0)
ggplot(merged)+
geom_bar(aes(y = Membership.Type,fill = factor(late)))
ggplot(merged)+
geom_bar(aes(x = Checkout.Kiosk,fill = factor(late)))+
theme(axis.title.x=element_blank(), axis.text.x=element_blank(), axis.ticks.x=element_blank())
head(merged)
temp_merged = merged#[year(merged$Checkout.Date)>2018,]
temp_merged$Checkout.Kiosk = factor(temp_merged$Checkout.Kiosk)
temp_merged$hour = hour(temp_merged$Checkout.Time)
temp_merged$Membership.Type = factor(temp_merged$Membership.Type)
set.seed(123)
train = sample (1:nrow(temp_merged), nrow(temp_merged)/2)
set.seed(12)
boost.merged=gbm(Trip.Duration.Minutes~Membership.Type+hour + Checkout.Kiosk+AvgTemp,data=temp_merged[train ,], distribution="gaussian",n.trees=500, shrinkage = .1)
summary(boost.merged)
gbm.perf(boost.merged, method = "OOB")
OOB generally underestimates the optimal number of iterations although predictive performance is reasonably competitive. Using cv_folds>1 when calling gbm usually results in improved predictive performance.
[1] 34
attr(,"smoother")
Call:
loess(formula = object$oobag.improve ~ x, enp.target = min(max(4,
length(x)/10), 50))
Number of Observations: 500
Equivalent Number of Parameters: 39.85
Residual Standard Error: 4.938
pred = predict.gbm(object = boost.merged,newdata = temp_merged[-train,], n.trees = 34)
mean((pred - temp_merged$Trip.Duration.Minutes[-train])^2)
[1] 54794.91
ggplot()+
geom_point(aes(x=temp_merged$Trip.Duration.Minutes[-train], y = pred))+
geom_abline()
head(temp_merged)
boost.merged=gbm(late~Membership.Type+hour + Checkout.Kiosk+AvgTemp,data=temp_merged[train ,], distribution="bernoulli",n.trees=500, shrinkage = .1)
summary(boost.merged)
prob = predict(boost.merged, newdata = temp_merged[-train ,], type = "response")
Using 500 trees...
yhat = ifelse(prob >.5,1,0)
sum(yhat)
[1] 1
hist(prob)
yhat = ifelse(prob >.5,1,0)
cm = t(table(yhat, temp_merged$late[-train])) # rows are the actual late flag, columns the prediction
cm
   yhat
         0      1
  0 103124      1
  1  12738      0
sum(diag(cm))/sum(cm) #Accuracy
[1] 0.8900512
cm[1,1]/sum(cm[1, c(1,2)]) # True negative rate: share of on-time trips correctly predicted
[1] 0.9999903
cm[2,2]/sum(cm[2, c(1,2)]) # True positive rate: share of late trips correctly predicted
[1] 0
With a 0.5 cutoff the model predicts essentially no late trips, so lower the threshold and trade some accuracy for actually catching them.
yhat = ifelse(prob >.15,1,0)
cm = t(table(yhat, temp_merged$late[-train]))
cm
   yhat
        0     1
  0 74522 28603
  1  3786  8952
sum(diag(cm))/sum(cm) #Accuracy
[1] 0.7204543
cm[1,1]/sum(cm[1, c(1,2)]) # True negative rate: share of on-time trips correctly predicted
[1] 0.7226376
cm[2,2]/sum(cm[2, c(1,2)]) # True positive rate: share of late trips correctly predicted
[1] 0.7027791
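Rather than hand-picking 0.15, a small scan over cutoffs (my own addition) makes the accuracy vs. late-trip recall trade-off explicit:
cutoffs = seq(0.05, 0.50, by = 0.05)
sapply(cutoffs, function(cut) {
  pred = ifelse(prob > cut, 1, 0)
  c(cutoff = cut,
    accuracy = mean(pred == temp_merged$late[-train]), # overall correct rate at this cutoff
    late_recall = mean(pred[temp_merged$late[-train] == 1] == 1)) # share of late trips actually caught
})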