# Avi Drucker / May 27 2024
# Module 5 - Tutorial - Time Series
# Notebook setup: one import per line (the export had fused them together).
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn

seaborn.set()  # apply seaborn's default plot styling to matplotlib figures

# `%matplotlib inline` is an IPython/Jupyter magic, not Python syntax.
# Issue it through the IPython API when a kernel is present, and skip it
# when this file is run as a plain Python script.
try:
    get_ipython().run_line_magic("matplotlib", "inline")
except NameError:
    pass

# --- Numpy datetime ---
# A 0-d array holding a single day-resolution datetime64 value.
# (Bare expressions below display their value when run as notebook cells.)
date = np.array('2015-07-04', dtype=np.datetime64)
date

# creates 12 consecutive dates starting from July 4, 2015
date + np.arange(12)

# day-based datetime
np.datetime64('2015-07-04')

# minute-based datetime
# NOTE: datetime64 is timezone-naive (NumPy >= 1.11); the value is stored
# exactly as written and is not shifted to the local machine's time zone.
np.datetime64('2015-07-04 12:00')

# --- Pandas datetime ---
# Parse a free-form date string into a pandas Timestamp.
# (Bare expressions below display their value when run as notebook cells.)
date = pd.to_datetime("4th of July, 2015")
date

# output day of the week
date.strftime('%A')

# create 12 consecutive dates starting from July 4, 2015
date + pd.to_timedelta(np.arange(12), 'D')

# make index using timestamp
index = pd.DatetimeIndex(['2014-07-04', '2014-08-04',
                          '2015-07-04', '2015-08-04'])
# a series is a singular column from a dataframe
data = pd.Series([0, 1, 2, 3], index=index)
data

# can slice date index similarly to regular index slicing
# includes ending index
data['2014-07-04':'2015-07-04']

# get rows from 2015
data['2015']

# will convert different formats into datetime
dates = pd.to_datetime([datetime(2015, 7, 3), '4th of July 2015',
                        '2015-Jul-6', '07-07-2015', '20150708'])
dates

# assigned a 'day' frequency
dates.to_period('D')

# calculate # of days between 07-03-2015 and each date
dates - dates[0]

# create days between start date and end date
# default frequency is 'day'
pd.date_range('2015-07-03', '2015-07-10')

# create days from start date with periods (how many)
pd.date_range('2015-07-03', periods=8)

# make range using hourly frequency
# default start at 00:00 (midnight)
# ('h' replaces the alias 'H', which is deprecated since pandas 2.2)
pd.date_range('2015-07-03', periods=8, freq='h')

# create period frequency by month
pd.period_range('2015-07', periods=8, freq='M')

# create only hour frequencies (no date attached)
# seconds also included
pd.timedelta_range(0, periods=10, freq='h')

# create 2 hour 30 minute intervals
# ('2h30min' replaces the deprecated spelling '2H30T')
pd.timedelta_range(0, periods=9, freq='2h30min')

# create business day offsets
from pandas.tseries.offsets import BDay
pd.date_range('2015-07-01', periods=5, freq=BDay())

# --- Visualize time series data ---
# Dataset: FremontBridgeBicycle.csv (1.53 MB download)

# make dates the index
# format dates as we load the data
# The file name must be a quoted string; the bare name
# FremontBridgeBicycle.csv raises NameError.
filename = "FremontBridgeBicycle.csv"
data = pd.read_csv(filename, index_col="Date", parse_dates=True)
data.head()
data.describe()

# rename columns with shorter name
# create a new column 'Total' with combined values of west and east columns
data.columns = ['West', 'East']
data['Total'] = data['West'] + data['East']

# line plot of the raw data (DataFrame.plot() draws lines by default)
data.plot()
plt.ylabel('Hourly Bicycle Count')

# hourly intervals are too narrow to make sense of data
# aggregate as weekly data instead
weekly = data.resample('W').sum()
weekly.plot(style=[':', '--', '-'])  # line styles for each feature
plt.ylabel('Weekly Bicycle Count')

# Observation: People tend to bike more in the summers than in the winters
# see what an average day looks like
# Group every row by its time of day (ignoring the date) and average.
by_time = data.groupby(data.index.time).mean()

# 6 ticks spaced 4 hours apart, expressed in seconds (4 * 60 * 60 = 14400)
hourly_ticks = 4 * 60 * 60 * np.arange(6)
by_time.plot(xticks=hourly_ticks, style=[':', '--', '-'])

# Observation: Bike usage peaks around 8am and 5pm
# What does bike usage look like by day of the week?
# dayofweek numbers the days 0 (Monday) through 6 (Sunday); average per day.
by_weekday = data.groupby(data.index.dayofweek).mean()

# Replace the numeric day index with readable labels, Monday first.
by_weekday.index = ['Mon', 'Tue', 'Wed', 'Thur', 'Fri', 'Sat', 'Sun']
by_weekday.plot(style=[':', '--', '-'])

# Observation: Bike usage is highest during weekdays and drops off on weekends
# set conditions to show hourly trend on weekdays vs weekends
# weekday < 5 selects Monday-Friday; everything else is labelled 'Weekend'.
weekend = np.where(data.index.weekday < 5, 'Weekday', 'Weekend')

# Average by (weekday/weekend label, time of day).
by_time = data.groupby([weekend, data.index.time]).mean()

# Two side-by-side panels: weekdays on the left, weekends on the right.
# `hourly_ticks` is the tick array defined earlier in this file.
fig, ax = plt.subplots(1, 2, figsize=(14, 5))
by_time.loc['Weekday'].plot(
    ax=ax[0],
    title='Weekdays',
    xticks=hourly_ticks,
    style=[':', '--', '-'],
)
# The trailing semicolon suppresses the cell's text output in a notebook.
by_time.loc['Weekend'].plot(
    ax=ax[1],
    title='Weekends',
    xticks=hourly_ticks,
    style=[':', '--', '-'],
);