import numpy as np
import matplotlib.pyplot as plt
import folium
import pandas as pd
from IPython.display import display
from PIL import Image
# Citi Bike trip data for October 2019.
df = pd.read_csv("201910-citibike-tripdata.csv")
df.head()
df.dtypes
# Focus on date and time: parse the raw timestamp strings into datetime columns.
from datetime import datetime
df['stoptime_dt'] = pd.to_datetime(df['stoptime'])
df['starttime_dt'] = pd.to_datetime(df['starttime'])
df.head()
# Day of the week of each trip, derived from its stop time.
# pandas numbering: Mon = 0; Tue = 1; Wed = 2; Thu = 3; Fri = 4; Sat = 5; Sun = 6
df['weekday'] = df['stoptime_dt'].dt.weekday
df.head()
# Keep the weekday as a string label (int64 -> str); later filters compare against '1', '6', ...
df['weekday'] = df['weekday'].map(str)
df['weekday'].dtypes
# Count trips per weekday to see which day of the week is the busiest riding day.
weekday = df.groupby('weekday').size()
p = weekday.plot.barh(figsize=(6, 5), color='#a4afb0', width=0.6, grid=True)
p.set_xlabel("Total Number of Riders", labelpad=20, size=14)
p.set_ylabel("Weekday", labelpad=20, size=14)
More people use Citi Bike on weekdays than on weekends. Max: Tuesday; Min: Sunday. I'll use Tuesday and Sunday as samples to test my hypothesis.
# Hour of the day at which each trip ended, as a zero-padded string ('00'-'23').
df['hh'] = df['stoptime_dt'].dt.strftime('%H')
df.head()
# Trips that ended on a Tuesday (weekday '1') with a stop hour of 08 or 09,
# i.e. the 8-10 am window. The mask is computed once and reused.
tuesday_rush_mask = (df['weekday'] == '1') & df['hh'].isin(['08', '09'])
# .copy() gives df1 its own data, so columns added to it later are not
# written onto a filtered slice of df (the SettingWithCopy situation).
df1 = df[tuesday_rush_mask].copy()
df1.head()
#Geodataframe
import geopandas
from shapely.geometry import Point
import warnings
warnings.simplefilter('ignore')
# Point geometry (x = longitude, y = latitude) for every trip's end station,
# built in one vectorised call instead of zipping and wrapping row by row.
end_points_tuesday = geopandas.points_from_xy(
    df1['end station longitude'],
    df1['end station latitude'],
)
# assign() returns a new frame, so the new column is never written onto a
# filtered slice of df; df1 is rebound and still carries 'endcoordinates'.
df1 = df1.assign(endcoordinates=end_points_tuesday)
gdf_Tuesday = geopandas.GeoDataFrame(df1, geometry='endcoordinates')
gdf_Tuesday.head()
# Export only the columns needed for the Tuesday rush-hour analysis.
gdf_Tuesday_rushhour = gdf_Tuesday[[
    'tripduration',
    'end station id',
    'end station name',
    'stoptime_dt',
    'starttime_dt',
    'weekday',
    'hh',
    'endcoordinates',
    'usertype',
    'birth year',
]]
gdf_Tuesday_rushhour.to_csv('gdf_Tuesday_rush_hour.csv')
# Arrivals per end station: one group per (latitude, longitude, station name).
arriving = gdf_Tuesday.groupby(
    ['end station latitude', 'end station longitude', 'end station name']
).size()
arriving.head(10)
# Which stations are the busiest during rush hour on weekdays?
# Sort the arrival counts from largest to smallest.
arriving_sort = arriving.sort_values(ascending=False)
arriving_sort.head(5)
# Top 5 busiest stations at rush hour on Tuesday:
## Broadway & E 22 St
## E 47 St & Park Ave
## Pershing Square North
## West St & Chambers St
## E 24 St & Park Ave S
# Print the (latitude, longitude, name) keys of up to ten busiest stations.
# Slicing the index avoids an IndexError when fewer than ten stations exist.
for station_key in arriving_sort.index[:10]:
    print(station_key)
len(arriving_sort)
# Plot the Tuesday rush-hour arrival points on a map centred on Manhattan.
# Unpack each (latitude, longitude, name) index key together with its count.
# .items() pairs every key with its value directly, so no integer lookup on
# the non-integer index (deprecated positional fallback) is needed.
latitude = []
longitude = []
number = []
for (lat, long, _station_name), total in arriving_sort.items():
    latitude.append(lat)
    longitude.append(long)
    number.append(total)
m = folium.Map(location=[40.76727216, -73.99392888], tiles='cartodbpositron', zoom_start=12)
# One circle per station; the radius scales with the arrival count and the
# popup shows that count (passed as an explicit string).
for lat, long, total in zip(latitude, longitude, number):
    folium.CircleMarker(
        location=[lat, long],
        radius=total / 100,
        color='#ce6dbd',
        fill_color='#ce6dbd',
        popup=str(total),
    ).add_to(m)
display(m)
# Save the map as a standalone HTML file.
m.save('html_map_output_Tuesday_rushhour.html')
# Who rides during rush hour on Tuesday? Count arrivals per rider birth year.
riders_Tuesday = gdf_Tuesday.groupby(['birth year']).size()
riders_Tuesday.head(10)
riders_Tuesday_sort = riders_Tuesday.sort_values(ascending=False)
# Horizontal bar chart of the ten most frequent birth years.
riders_Tuesday_sort.head(10).plot.barh(
    figsize=(6, 5), color='#de7ad0', width=0.6, grid=True)
# Most frequent birth years: 1990; 1989; 1992
# Ages in 2019: 29; 30; 27
# Count trips per exact trip duration value.
trip_T = gdf_Tuesday.groupby(['tripduration']).size()
trip_T.head(10)
trip_T_sort = trip_T.sort_values(ascending=False)
# Horizontal bar chart of the five most frequent trip durations.
trip_T_sort.head(5).plot.barh(
    figsize=(6, 5), color='#de7ad0', width=0.6, grid=True)
# In this case, most rides are around 5 minutes.
New York City, NY - Nov. 22, 2019 - New Yorkers are fans of cycling. According to a survey by NYC DOT, nearly eight hundred thousand New Yorkers (793,000) ride a bicycle regularly. Citi Bike, one of the world’s largest bike-share systems, surpassed 60 million trips by June 2018 since the system launched in 2013. Recently, Citi Bike released its riding data, totaling around 2.1 million rides from last month. Including date, time, and arrival stations, the riding data provides an opportunity to investigate where people head during morning rush hours on weekdays and which stations are the busiest during rush hours.
According to an average of the riding data provided by Citi Bike, there are approximately 68,000 trips every day. Over the month, more people use Citi Bike on Tuesday than on any other day (Chart 1-1). And during rush hours (from 8:00 am to 10:00 am, and from 5:00 pm to 7:00 pm), over 40% of people choose a bicycle as their primary mode of commuting to work (Chart 1-2). The proximity of a Citi Bike station to the workplace is a factor that riders consider when choosing how to get to work.
Key findings from the Citi Bike data analysis show the following about arrival points during rush hours on a weekday morning (Sample: Tuesday morning, 8 am to 10 am, October 2019):
1. In the morning, people head in different directions across the city, but more people ride to Midtown Manhattan and Downtown Manhattan, where there are commercial office buildings and more companies.
2. Bicycle commuters are clustered at stations that are close to business centers in New York City (Map 1-1). This pattern reveals several hubs related to the businesses where people work. Top 5 busiest stations at rush hour on Tuesday: (1) Broadway & E 22 St; (2) E 47 St & Park Ave; (3) Pershing Square North; (4) West St & Chambers St; (5) E 24 St & Park Ave S. These stations are near banks and financial centers. For example, big-name banking enterprises are clustered around E 47 St & Park Ave.
3. Among these daily bicycle commuters, most are around 30 years old. These young people make up the dominant part of the business world. Even though subways are accessible from their workplaces, they are more willing to try this healthy commuting mode.
[1] Motivate International, Inc. (n.d.). About Citi Bike: Company, History, Motivate. Retrieved from https://www.citibikenyc.com/about.
[2] NYC DOT. (2019). Cycling In The City. CYCLING IN THE CITY. Retrieved from https://www1.nyc.gov/html/dot/downloads/pdf/cycling-in-the-city.pdf
[3] Shah, V. (2018, May 25). Citi Bike 2017 Analysis. Retrieved from https://towardsdatascience.com/citi-bike-2017-analysis-efd298e6c22c.
# Comparison sample: trips that ended on a Sunday (weekday '6') with a stop
# hour of 08 or 09, i.e. the 8-10 am window.
sunday_morning_mask = (df['weekday'] == '6') & df['hh'].isin(['08', '09'])
# .copy() gives df2 its own data, so the column added below is not written
# onto a filtered slice of df (the SettingWithCopy situation).
df2 = df[sunday_morning_mask].copy()
df2.head()
# Point geometry (x = longitude, y = latitude) for every trip's end station,
# built in one vectorised call instead of zipping and wrapping row by row.
df2['endcoordinates'] = geopandas.points_from_xy(
    df2['end station longitude'],
    df2['end station latitude'],
)
gdf_Smorning = geopandas.GeoDataFrame(df2, geometry='endcoordinates')
gdf_Smorning.head()
# Export only the columns needed for the Sunday-morning analysis.
gdf_Sunday_Morning = gdf_Smorning[[
    'end station id',
    'end station name',
    'stoptime_dt',
    'starttime_dt',
    'weekday',
    'hh',
    'endcoordinates',
    'usertype',
    'birth year',
]]
gdf_Sunday_Morning.to_csv('gdf_Sunday_Morning.csv')
# Arrivals per end station on Sunday morning: one group per
# (latitude, longitude, station name).
arriving1 = gdf_Smorning.groupby(
    ['end station latitude', 'end station longitude', 'end station name']
).size()
arriving1.head(10)
# Which stations are the busiest between 8:00 and 10:00 am on Sunday?
# Sort the arrival counts from largest to smallest.
arriving1_sort = arriving1.sort_values(ascending=False)
arriving1_sort.head(5)
# Top 5 busiest stations between 8:00 and 10:00 am on Sunday:
# W 21 St & 6 Ave
# Pershing Square North
# W 20 St & 11 Ave
# Broadway & W 60 St
# E 17 St & Broadway
## But the arriving points are not in a clustered pattern as on Tuesday.
# Print the (latitude, longitude, name) keys of up to ten busiest stations.
# Slicing the index avoids an IndexError when fewer than ten stations exist.
for station_key in arriving1_sort.index[:10]:
    print(station_key)
len(arriving1_sort)
# Plot the Sunday-morning arrival points on a map centred on Manhattan.
# Unpack each (latitude, longitude, name) index key together with its count.
# .items() pairs every key with its value directly, so no integer lookup on
# the non-integer index (deprecated positional fallback) is needed.
latitude1 = []
longitude1 = []
number1 = []
for (lat1, long1, _station_name), total1 in arriving1_sort.items():
    latitude1.append(lat1)
    longitude1.append(long1)
    number1.append(total1)
m_Smorning = folium.Map(location=[40.76727216, -73.99392888], tiles='cartodbpositron', zoom_start=12)
# One circle per station; the radius scales with the arrival count and the
# popup shows that count (passed as an explicit string).
for lat1, long1, total1 in zip(latitude1, longitude1, number1):
    folium.CircleMarker(
        location=[lat1, long1],
        radius=total1 / 100,
        color='#2ca02c',
        fill_color='#2ca02c',
        popup=str(total1),
    ).add_to(m_Smorning)
display(m_Smorning)
# Save the Sunday-morning map as a standalone HTML file.
m_Smorning.save('html_map_output_Sunday_8to10.html')
# No cluster on Sunday morning (8-10 am)
# Sunday, whole day: every trip that ended on a Sunday (weekday '6').
# .copy() gives df3 its own data, so the column added below is not written
# onto a filtered slice of df (the SettingWithCopy situation).
df3 = df[df['weekday'] == '6'].copy()
df3.head()
# Point geometry (x = longitude, y = latitude) for every trip's end station,
# built in one vectorised call instead of zipping and wrapping row by row.
df3['endcoordinates'] = geopandas.points_from_xy(
    df3['end station longitude'],
    df3['end station latitude'],
)
gdf_Sun = geopandas.GeoDataFrame(df3, geometry='endcoordinates')
gdf_Sun.head()
# Export only the columns needed for the whole-day Sunday analysis.
gdf_Sunday_whole = gdf_Sun[[
    'end station id',
    'end station name',
    'stoptime_dt',
    'starttime_dt',
    'weekday',
    'hh',
    'endcoordinates',
    'usertype',
    'birth year',
]]
gdf_Sunday_whole.to_csv('gdf_Sunday_wholeday.csv')
# Arrivals per end station over the whole of Sunday: one group per
# (latitude, longitude, station name).
arriving2 = gdf_Sun.groupby(
    ['end station latitude', 'end station longitude', 'end station name']
).size()
arriving2.head(10)
# Which stations are the busiest over the whole day on Sunday?
# Sort the arrival counts from largest to smallest.
arriving2_sort = arriving2.sort_values(ascending=False)
arriving2_sort.head(5)
# Top 5 busiest stations on Sunday:
# W 21 St & 6 Ave
# Broadway & W 60 St
# Broadway & E 14 St
# Central Park S & 6 Ave
# E 17 St & Broadway
# Print the (latitude, longitude, name) keys of up to ten busiest stations.
# Slicing the index avoids an IndexError when fewer than ten stations exist.
for station_key in arriving2_sort.index[:10]:
    print(station_key)
len(arriving2_sort)
# Plot the Sunday (whole day) arrival points on a map centred on Manhattan.
# Unpack each (latitude, longitude, name) index key together with its count.
# .items() pairs every key with its value directly, so no integer lookup on
# the non-integer index (deprecated positional fallback) is needed.
latitude2 = []
longitude2 = []
number2 = []
for (lat2, long2, _station_name), total2 in arriving2_sort.items():
    latitude2.append(lat2)
    longitude2.append(long2)
    number2.append(total2)
m_Sun = folium.Map(
    location=[40.76727216, -73.99392888],
    tiles='cartodbpositron',
    zoom_start=12,
)
# One circle per station; the radius scales with the arrival count (divided
# by 150 here, versus 100 on the morning maps) and the popup shows the count.
for lat2, long2, total2 in zip(latitude2, longitude2, number2):
    folium.CircleMarker(
        location=[lat2, long2],
        radius=total2 / 150,
        color='#2ca02c',
        fill_color='#2ca02c',
        popup=str(total2),
    ).add_to(m_Sun)
display(m_Sun)
# Save the whole-day Sunday map as a standalone HTML file.
m_Sun.save('html_map_output_Sunday_wholeday.html')
# Sunday arrivals per hour of the day ('hh' is the stop hour as a string).
busyhour = gdf_Sun.groupby('hh').size()
p1 = busyhour.plot.barh(figsize=(12, 10), color='#2ca02c', width=0.6, grid=True)
p1.set_xlabel("Total Number of Riders", labelpad=20, size=14)
p1.set_ylabel("Hour_time of the day", labelpad=20, size=14)
# Rank the hours by arrival count, busiest first.
busyhour_sorting = busyhour.sort_values(ascending=False)
busyhour_sorting.head(5)
# Busiest time slots: 16:00 - 18:00
# Who rides during the day on Sunday? Count arrivals per rider birth year.
riders_Sunday = gdf_Sun.groupby(['birth year']).size()
riders_Sunday.head(10)
riders_Sunday_sort = riders_Sunday.sort_values(ascending=False)
# Horizontal bar chart of the ten most frequent birth years.
riders_Sunday_sort.head(10).plot.barh(
    figsize=(6, 5), color='#2ca02c', width=0.6, grid=True)
# Most frequent birth years: 1969; 1990; 1992
# Ages in 2019: 50; 29; 27
# Count trips per exact trip duration value.
trip_S = gdf_Sun.groupby(['tripduration']).size()
trip_S.head(10)
trip_S_sort = trip_S.sort_values(ascending=False)
# Horizontal bar chart of the five most frequent trip durations.
trip_S_sort.head(5).plot.barh(
    figsize=(6, 5), color='#2ca02c', width=0.6, grid=True)