Avi Drucker / May 27 2024

Module 5 - Tutorial - Time Series

import pandas as pdimport numpy as npimport seaborn; seaborn.set()import matplotlib.pyplot as pltfrom datetime import datetime%matplotlib inline

Numpy datetime

# A single day stored as a 0-d NumPy array with a datetime64 dtype
date = np.array('2015-07-04', dtype=np.datetime64)
date

# Create 12 consecutive dates starting from July 4, 2015
# (adding integers to a day-resolution datetime64 advances it by whole days)
date + np.arange(12)

# Day-based datetime: the unit is inferred from the string's precision
np.datetime64('2015-07-04')

# Minute-based datetime: the unit is inferred from the string's precision
# NOTE: datetime64 is timezone-naive in current NumPy; the value is stored
# exactly as written and is not shifted to the local computer's time zone
np.datetime64('2015-07-04 12:00')

Pandas datetime

# Parse a free-form date string into a pandas Timestamp
date = pd.to_datetime("4th of July, 2015")
date

# Output the day of the week (full weekday name)
date.strftime('%A')

# Create 12 consecutive dates starting from July 4, 2015
# by adding offsets of 0..11 days to the Timestamp
date + pd.to_timedelta(np.arange(12), 'D')

# Make an index out of timestamps
index = pd.DatetimeIndex(['2014-07-04', '2014-08-04',
                          '2015-07-04', '2015-08-04'])

# A Series is a single labelled column (what one DataFrame column looks like)
data = pd.Series([0, 1, 2, 3], index=index)
data

# A date index can be sliced much like a regular index;
# unlike positional slicing, the ending label is included
data['2014-07-04':'2015-07-04']

# Get the rows from 2015 (passing only a year selects every entry in it)
data['2015']

# to_datetime converts a mix of differently formatted inputs
# (datetime objects and several string layouts) into one DatetimeIndex
dates = pd.to_datetime([
    datetime(2015, 7, 3),
    '4th of July 2015',
    '2015-Jul-6',
    '07-07-2015',
    '20150708',
])
dates

# Convert the timestamps to periods with a 'day' frequency
dates.to_period('D')

# Calculate the number of days between the first date (07-03-2015)
# and each date in the index
dates - dates[0]

# Create every day between a start date and an end date (both included)
# The default frequency is one day
pd.date_range('2015-07-03', '2015-07-10')

# Create days from a start date plus a number of periods (how many to make)
pd.date_range('2015-07-03', periods=8)

# Make a range using an hourly frequency
# With no time given, the range starts at 00:00 (midnight)
# 'h' is the current hourly alias; uppercase 'H' is deprecated in recent pandas
pd.date_range('2015-07-03', periods=8, freq='h')

# Create a range of periods with a monthly frequency
pd.period_range('2015-07', periods=8, freq='M')

# Create a range of durations one hour apart (no date attached);
# the displayed values also include minutes and seconds
# 'h' is the current hourly alias; uppercase 'H' is deprecated in recent pandas
pd.timedelta_range(0, periods=10, freq='h')

# Create 2 hour 30 minute intervals by combining frequency codes
# 'h' and 'min' are the current aliases; 'H' and 'T' are deprecated
pd.timedelta_range(0, periods=9, freq='2h30min')

# Create a range that advances by business-day offsets (weekends are skipped)
from pandas.tseries.offsets import BDay

pd.date_range('2015-07-01', periods=5, freq=BDay())

Visualize time series data

FremontBridgeBicycle.csv

1.53 MB - Download

# Make the dates the index, and parse them into datetimes as the data loads
# The file name must be a quoted string, not a bare name
filename = 'FremontBridgeBicycle.csv'
data = pd.read_csv(filename, index_col="Date", parse_dates=True)
data.head()

#summary statistics for the loaded data
data.describe()

# Rename the columns with shorter names
# NOTE(review): assumes the CSV has exactly two data columns, in west/east
# order; confirm against the file
data.columns = ['West', 'East']

# Create a new column 'Total' with the combined west and east values
data['Total'] = data.eval('West + East')

# Plot the raw data (DataFrame.plot draws a line plot by default, not bars)
data.plot()
plt.ylabel('Hourly Bicycle Count')

# Hourly intervals are too narrow to make sense of the data,
# so aggregate into weekly totals instead
weekly = data.resample('W').sum()

# One line style per column
weekly.plot(style=[':', '--', '-'])
plt.ylabel('Weekly Bicycle Count')

People tend to bike more in the summer than in the winter.

# See what an average day looks like: mean count for each time of day
by_time = data.groupby(data.index.time).mean()

# 6 tick positions spaced 4 hours apart, expressed in seconds (0, 14400, ...)
hourly_ticks = 4 * 60 * 60 * np.arange(6)
by_time.plot(xticks=hourly_ticks, style=[':', '--', '-'])

Bike usage peaks around 8am and 5pm

# What does bike usage look like by day of the week?
by_weekday = data.groupby(data.index.dayofweek).mean()

# dayofweek numbers Monday as 0, so relabel the index in that order
by_weekday.index = ['Mon', 'Tue', 'Wed', 'Thur', 'Fri', 'Sat', 'Sun']
by_weekday.plot(style=[':', '--', '-'])

Bike usage is highest during weekdays and drops off on weekends

# Label every row 'Weekday' (Mon-Fri, weekday < 5) or 'Weekend' so the
# hourly trend can be compared between the two
weekend = np.where(data.index.weekday < 5, 'Weekday', 'Weekend')

# Mean count for each (label, time of day) pair
by_time = data.groupby([weekend, data.index.time]).mean()

# Two side-by-side panels: weekday profile on the left, weekend on the right
fig, ax = plt.subplots(1, 2, figsize=(14, 5))
by_time.loc['Weekday'].plot(ax=ax[0], title='Weekdays',
                            xticks=hourly_ticks, style=[':', '--', '-'])
# The trailing semicolon suppresses the notebook's text output for the cell
by_time.loc['Weekend'].plot(ax=ax[1], title='Weekends',
                            xticks=hourly_ticks, style=[':', '--', '-']);

Back to the course outline

Module 5 - Tutorial - Time Series

Numpy datetime

Pandas datetime

Visualize time series data

Runtimes (1)