Avi Drucker / May 27 2024
Module 5 - Tutorial - Time Series
import pandas as pd
import numpy as np
import seaborn; seaborn.set()
import matplotlib.pyplot as plt
from datetime import datetime
%matplotlib inline
Numpy datetime
#store a single calendar date as a 0-d NumPy array of datetime64 values
date = np.array('2015-07-04', dtype=np.datetime64)
date
#adding the integers 0..11 yields 12 consecutive dates starting from July 4, 2015
day_offsets = np.arange(12)
date + day_offsets
#a date-only string gives a day-based datetime64
np.datetime64('2015-07-04')
#a string with hours and minutes gives a minute-based datetime64
#NOTE: current NumPy treats datetime64 as timezone-naive, so no local time zone is applied
np.datetime64('2015-07-04 12:00')
Pandas datetime
#parse a free-form date string into a pandas Timestamp
date = pd.to_datetime("4th of July, 2015")
date
#format the timestamp as its full weekday name
date.strftime('%A')
#add day offsets 0..11 to get 12 consecutive dates starting from July 4, 2015
day_steps = pd.to_timedelta(np.arange(12), unit='D')
date + day_steps
#build a timestamp index and attach it to a Series
#(a Series is a single column of a dataframe)
index = pd.DatetimeIndex(
    ['2014-07-04', '2014-08-04', '2015-07-04', '2015-08-04']
)
data = pd.Series([0, 1, 2, 3], index=index)
data
#date labels slice like ordinary labels; the end label is included
data.loc['2014-07-04':'2015-07-04']
#passing only a year selects every row from that year
data.loc['2015']
#to_datetime accepts a mix of datetime objects and differently formatted strings
dates = pd.to_datetime(
    [datetime(2015, 7, 3), '4th of July 2015', '2015-Jul-6', '07-07-2015', '20150708']
)
dates
#convert the timestamps to periods with a 'day' frequency
dates.to_period('D')
#subtracting the first date gives the elapsed time from 07-03-2015 to each date
dates - dates[0]
#create every day between a start date and an end date (both included)
#default frequency is 'day'
pd.date_range('2015-07-03', '2015-07-10')
#create a fixed number of days (periods) counting from the start date
pd.date_range('2015-07-03', periods=8)
#make range using hourly frequency
#a date-only start begins at 00:00 (midnight)
#lowercase 'h' is used because the uppercase 'H' alias is deprecated in pandas 2.2+
pd.date_range('2015-07-03', periods=8, freq='h')
#create monthly periods ('M' is still the monthly alias for periods)
pd.period_range('2015-07', periods=8, freq='M')
#create durations (no date attached) that increase in one-hour steps
#each value is shown with hours, minutes and seconds
#lowercase 'h' is used because the uppercase 'H' alias is deprecated in pandas 2.2+
pd.timedelta_range(0, periods=10, freq='h')
#create 2 hour 30 minute intervals
#'min' replaces the deprecated 'T' alias for minutes
pd.timedelta_range(0, periods=9, freq='2h30min')
#step through business days only, using a BDay offset as the frequency
from pandas.tseries.offsets import BDay
business_day = BDay()
pd.date_range('2015-07-01', periods=5, freq=business_day)
Visualize time series data
Data file: FremontBridgeBicycle.csv (1.53 MB download)
#make dates the index
#load the CSV with the "Date" column as the index
#parse_dates=True makes pandas parse the index values as dates while loading
#the file name must be a quoted string; a bare name raises NameError
filename = "FremontBridgeBicycle.csv"
data = pd.read_csv(filename, index_col="Date", parse_dates=True)
#preview the first rows, then the summary statistics
data.head()
data.describe()
#replace the column labels with shorter names
#NOTE: this assignment requires the frame to have exactly two columns;
#presumably they are ordered west then east - verify against the CSV header
data.columns = ['West', 'East']
#add a 'Total' column holding the row-wise sum of the west and east columns
data['Total'] = data['West'] + data['East']
#line plot of every column (DataFrame.plot draws lines by default)
data.plot()
plt.ylabel('Hourly Bicycle Count')
#hourly intervals are too narrow to make sense of data
#so sum the counts into weekly totals instead
line_styles = [':', '--', '-']  #one line style per column
weekly = data.resample('W').sum()
weekly.plot(style=line_styles)
plt.ylabel('Weekly Bicycle Count')
People tend to bike more in the summers than in the winters
#average the counts by time of day to see what a typical day looks like
time_of_day = data.index.time
by_time = data.groupby(time_of_day).mean()
#tick positions in seconds: 6 ticks spaced 4 hours apart
seconds_per_hour = 60 * 60
hourly_ticks = 4 * seconds_per_hour * np.arange(6)
by_time.plot(xticks=hourly_ticks, style=[':', '--', '-'])
Bike usage peaks around 8am and 5pm
#average the counts by day of the week
day_labels = ['Mon', 'Tue', 'Wed', 'Thur', 'Fri', 'Sat', 'Sun']
by_weekday = data.groupby(data.index.dayofweek).mean()
#swap the numeric day-of-week index for readable labels
by_weekday.index = day_labels
by_weekday.plot(style=[':', '--', '-'])
Bike usage is highest during weekdays and drops off on weekends
#label each row 'Weekday' when its weekday number is below 5, otherwise 'Weekend'
weekend = np.where(data.index.weekday < 5, 'Weekday', 'Weekend')
#average the counts by (label, time of day)
by_time = data.groupby([weekend, data.index.time]).mean()
#draw the two hourly profiles side by side
fig, ax = plt.subplots(1, 2, figsize=(14, 5))
panels = (('Weekday', 'Weekdays'), ('Weekend', 'Weekends'))
for axis, (label, heading) in zip(ax, panels):
    by_time.loc[label].plot(ax=axis, title=heading,
                            xticks=hourly_ticks, style=[':', '--', '-'])