Micah P. Dombrowski / Mar 18 2020
COVID-19 Exploratory Data Analysis
(Almost) Everything You Want To Know About COVID-19.
These visualizations were made by Devakumar kp. Original notebook is here.
#hide
# essential libraries
import json
import random
from urllib.request import urlopen
# storage and analysis
import numpy as np
import pandas as pd
# visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff
import folium
# color palette: hex codes reused by the charts further down
cnf = '#393e46' # confirmed - dark grey
dth = '#ff2e63' # death - red
rec = '#21bf73' # recovered - green
act = '#fe9801' # active case - orange
# converter
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
# hide warnings
import warnings
warnings.filterwarnings('ignore')
# html embedding
from IPython.display import Javascript
from IPython.core.display import display, HTML
8.0s
Python
#hide
# download the cleaned time series; the 'Date' column is parsed to datetimes on read
url = 'https://raw.githubusercontent.com/imdevskp/covid_19_jhu_data_web_scrap_and_cleaning/master/covid_19_clean_complete.csv'
full_table = pd.read_csv(filepath_or_buffer=url, parse_dates=['Date'])
full_table.head()
0.5s
Python
#hide
# names of the four case-count columns
cases = ['Confirmed', 'Deaths', 'Recovered', 'Active']

# active cases are whatever is left of the confirmed count after deaths and recoveries
full_table['Active'] = full_table['Confirmed'].sub(full_table['Deaths']).sub(full_table['Recovered'])

# fold the 'Mainland China' label into plain 'China'
full_table['Country/Region'] = full_table['Country/Region'].replace({'Mainland China': 'China'})

# missing provinces become empty strings, missing counts become zero
full_table['Province/State'] = full_table['Province/State'].fillna('')
full_table[cases] = full_table[cases].fillna(0)
0.1s
Python
#hide
# rows reported for the two cruise ships
ship = full_table[
    full_table['Province/State'].str.contains('Grand Princess')
    | full_table['Province/State'].str.contains('Diamond Princess cruise ship')
]

# China and the rest of the world (ROW)
china = full_table[full_table['Country/Region']=='China']
row = full_table[full_table['Country/Region']!='China']

# snapshot of the most recent date present in the data
# (reset_index() keeps the old row labels in an 'index' column)
full_latest = full_table[full_table['Date'] == full_table['Date'].max()].reset_index()
china_latest = full_latest[full_latest['Country/Region']=='China']
row_latest = full_latest[full_latest['Country/Region']!='China']

# latest snapshot condensed to one row per country (per province for China).
# The count columns are selected with a list: selecting several GroupBy columns
# with a bare tuple ([... 'A', 'B']) is deprecated and raises on pandas 2.x.
full_latest_grouped = full_latest.groupby('Country/Region')[['Confirmed', 'Deaths', 'Recovered', 'Active']].sum().reset_index()
china_latest_grouped = china_latest.groupby('Province/State')[['Confirmed', 'Deaths', 'Recovered', 'Active']].sum().reset_index()
row_latest_grouped = row_latest.groupby('Country/Region')[['Confirmed', 'Deaths', 'Recovered', 'Active']].sum().reset_index()
0.3s
Python
World-Wide Totals
#hide
# highest value of each count per (country, province) pair.
# Columns are selected with a list; tuple selection on a GroupBy raises on pandas 2.x.
temp = full_table.groupby(['Country/Region', 'Province/State'])[['Confirmed', 'Deaths', 'Recovered', 'Active']].max()
# temp.style.background_gradient(cmap='Reds')
0.1s
Python
#hide_input
# world-wide totals per date (list selection; tuple selection raises on pandas 2.x)
temp = full_table.groupby('Date')[['Confirmed', 'Deaths', 'Recovered', 'Active']].sum().reset_index()
# keep only the most recent date and render it as a styled one-row table
temp = temp[temp['Date'] == temp['Date'].max()].reset_index(drop=True)
temp.style.background_gradient(cmap='Pastel1')
0.7s
Python
Progression of Virus Over Time
#hide_input
# Embed the hosted Flourish bar-chart-race (visualisation 1571387) as raw HTML;
# the <script> tag loads Flourish's embed.js, so rendering needs network access.
# https://app.flourish.studio/visualisation/1571387/edit
HTML('''<div class="flourish-embed flourish-bar-chart-race" data-src="visualisation/1571387"><script src="https://public.flourish.studio/resources/embed.js"></script></div>''')
0.1s
Python
Cumulative Outcomes
#hide
# daily world totals of the three outcomes
# (list selection; tuple selection on a GroupBy raises on pandas 2.x)
temp = full_table.groupby('Date')[['Recovered', 'Deaths', 'Active']].sum().reset_index()
# long format: one row per (Date, Case) so plotly can stack one area per case type
temp = temp.melt(id_vars="Date", value_vars=['Recovered', 'Deaths', 'Active'],
                 var_name='Case', value_name='Count')
fig = px.area(temp, x="Date", y="Count", color='Case',
              title='Cases over time', color_discrete_sequence = [rec, dth, act])
fig.update_layout(margin=dict(t=80,l=0,r=0,b=0))
fig #.write_image('covid-eda-2-1.png')
3.3s
Python
Loading viewer…
Recovery and Mortality Rate
#hide
# daily world totals, restricted to the columns the ratios need. A bare
# groupby('Date').sum() would also aggregate the string and coordinate columns.
temp = full_table.groupby('Date')[['Confirmed', 'Deaths', 'Recovered']].sum().reset_index()
# adding two more columns: deaths and recoveries per 100 confirmed cases
temp['No. of Deaths to 100 Confirmed Cases'] = round(temp['Deaths']/temp['Confirmed'], 3)*100
temp['No. of Recovered to 100 Confirmed Cases'] = round(temp['Recovered']/temp['Confirmed'], 3)*100
# temp['No. of Recovered to 1 Death Case'] = round(temp['Recovered']/temp['Deaths'], 3)
# long format: one row per (Date, Ratio) so each ratio becomes its own line
temp = temp.melt(id_vars='Date', value_vars=['No. of Deaths to 100 Confirmed Cases', 'No. of Recovered to 100 Confirmed Cases'],
                 var_name='Ratio', value_name='Value')
fig = px.line(temp, x="Date", y="Value", color='Ratio', log_y=True,
              title='Recovery and Mortality Rate Over The Time',
              color_discrete_sequence=[dth, rec])
fig.update_layout(legend=dict(orientation="h", y=1, x=0,
                              xanchor="left", yanchor="top"),
                  margin=dict(t=80,l=0,r=0,b=0))
fig
1.1s
Python
Loading viewer…
No. of Places To Which COVID-19 spread
#hide
# per date, count the distinct Chinese provinces that report at least one confirmed case
china_with_cases = china[china['Confirmed']!=0]
c_spread = china_with_cases.groupby('Date')['Province/State'].unique().apply(len)
c_spread = c_spread.reset_index()
fig = px.line(
    c_spread,
    x='Date',
    y='Province/State',
    text='Province/State',
    title='Number of Provinces/States/Regions of China<br>to which COVID-19 spread over the time',
    color_discrete_sequence=[cnf,dth, rec],
)
fig.update_traces(textposition='top center')
fig.update_layout(margin={'t': 80, 'l': 0, 'r': 0, 'b': 0})
fig #.write_image('covid-eda-3-1.png')
1.3s
Python
Loading viewer…
# per date, count the distinct countries/regions that report at least one confirmed case
rows_with_cases = full_table[full_table['Confirmed']!=0]
spread = rows_with_cases.groupby('Date')['Country/Region'].unique().apply(len)
spread = spread.reset_index()
fig = px.line(
    spread,
    x='Date',
    y='Country/Region',
    text='Country/Region',
    title='Number of Countries/Regions<br>to which COVID-19 spread over the time',
    color_discrete_sequence=[cnf,dth, rec],
)
fig.update_traces(textposition='top center')
fig.update_layout(margin={'t': 80, 'l': 0, 'r': 0, 'b': 0})
fig #.write_image('covid-eda-3-2.png')
1.2s
Python
Loading viewer…
Maps
#hide
# Confirmed: world map coloured by each country's latest confirmed count
fig = px.choropleth(
    full_latest_grouped,
    title='Countries with Confirmed Cases',
    locations="Country/Region",
    locationmode='country names',
    hover_name="Country/Region",
    color="Confirmed",
    color_continuous_scale="aggrnyl",
    range_color=[1,7000],
)
# hide the colour bar and set the margins in one layout update
fig.update_layout(coloraxis_showscale=False, margin={'t': 80, 'l': 0, 'r': 0, 'b': 0})
fig #.write_image('covid-eda-1-1.png')
0.9s
Python
Loading viewer…
#hide
# Deaths: world map restricted to countries that reported at least one death
countries_with_deaths = full_latest_grouped[full_latest_grouped['Deaths']>0]
fig = px.choropleth(
    countries_with_deaths,
    title='Countries with Deaths Reported',
    locations="Country/Region",
    locationmode='country names',
    hover_name="Country/Region",
    color="Deaths",
    color_continuous_scale="agsunset",
    range_color=[1,50],
)
# hide the colour bar and set the margins in one layout update
fig.update_layout(coloraxis_showscale=False, margin={'t': 80, 'l': 0, 'r': 0, 'b': 0})
fig #.write_image('covid-eda-1-2.png')
1.3s
Python
Loading viewer…
Top 20 Countries
#hide
# Short alias for the per-country latest totals. This is a plain assignment,
# not a copy: a column added to flg also appears on full_latest_grouped.
flg = full_latest_grouped
flg.head()
0.1s
Python
#hide
# the twenty countries with the most confirmed cases, re-sorted ascending for the plot
top_confirmed = flg.sort_values('Confirmed', ascending=False).head(20)
top_confirmed = top_confirmed.sort_values('Confirmed', ascending=True)
# x-axis runs a fixed 10000 past the largest value in the data
x_limit = max(flg['Confirmed'])+10000
fig = px.bar(
    top_confirmed,
    x="Confirmed", y="Country/Region", orientation='h',
    text='Confirmed', title='Confirmed Cases',
    width=700, height=700, range_x=[0, x_limit],
)
fig.update_traces(marker_color=cnf, opacity=0.6, textposition='outside')
fig.update_layout(margin={'t': 80, 'l': 0, 'r': 0, 'b': 0})
fig #.write_image('covid-eda-4-1.png')
1.6s
Python
Loading viewer…
#hide
# the twenty countries with the most deaths, re-sorted ascending for the plot
top_deaths = flg.sort_values('Deaths', ascending=False).head(20)
top_deaths = top_deaths.sort_values('Deaths', ascending=True)
# x-axis runs a fixed 500 past the largest value in the data
x_limit = max(flg['Deaths'])+500
fig = px.bar(
    top_deaths,
    x="Deaths", y="Country/Region", orientation='h',
    text='Deaths', title='Deaths',
    width=700, height=700, range_x=[0, x_limit],
)
fig.update_traces(marker_color=dth, opacity=0.6, textposition='outside')
fig.update_layout(margin={'t': 80, 'l': 0, 'r': 0, 'b': 0})
fig #.write_image('covid-eda-4-2.png')
0.9s
Python
Loading viewer…
#hide
# the twenty countries with the most recoveries, re-sorted ascending for the plot
top_recovered = flg.sort_values('Recovered', ascending=False).head(20)
top_recovered = top_recovered.sort_values('Recovered', ascending=True)
# x-axis runs a fixed 10000 past the largest value in the data
x_limit = max(flg['Recovered'])+10000
fig = px.bar(
    top_recovered,
    x="Recovered", y="Country/Region", orientation='h',
    text='Recovered', title='Recovered',
    width=700, height=700, range_x=[0, x_limit],
)
fig.update_traces(marker_color=rec, opacity=0.6, textposition='outside')
fig.update_layout(margin={'t': 80, 'l': 0, 'r': 0, 'b': 0})
fig #.write_image('covid-eda-4-3.png')
1.2s
Python
Loading viewer…
#hide
# the twenty countries with the most active cases, re-sorted ascending for the plot
top_active = flg.sort_values('Active', ascending=False).head(20)
top_active = top_active.sort_values('Active', ascending=True)
# x-axis runs a fixed 3000 past the largest value in the data
x_limit = max(flg['Active'])+3000
fig = px.bar(
    top_active,
    x="Active", y="Country/Region", orientation='h',
    text='Active', title='Active',
    width=700, height=700, range_x=[0, x_limit],
)
fig.update_traces(marker_color=act, opacity=0.6, textposition='outside')
fig.update_layout(margin={'t': 80, 'l': 0, 'r': 0, 'b': 0})
fig #.write_image('covid-eda-4-4.png')
1.3s
Python
Loading viewer…
#hide
# deaths per 100 confirmed cases, stored as a new column on flg
flg['Mortality Rate'] = (flg['Deaths'] / flg['Confirmed'] * 100).round(2)
# (Only countries with more than 100 cases are considered)
temp = flg[flg['Confirmed']>100].sort_values('Mortality Rate', ascending=False)
# fifteen highest rates, re-sorted ascending for the plot
worst_rates = temp.sort_values('Mortality Rate', ascending=False).head(15)
worst_rates = worst_rates.sort_values('Mortality Rate', ascending=True)
fig = px.bar(
    worst_rates,
    x="Mortality Rate", y="Country/Region", orientation='h',
    text='Mortality Rate', title='No. of Deaths Per 100 Confirmed Case',
    width=700, height=600, range_x=[0, 8],
)
fig.update_traces(marker_color=act, opacity=0.6, textposition='outside')
fig.update_layout(margin={'t': 80, 'l': 0, 'r': 0, 'b': 0})
fig #.write_image('covid-eda-4-5.png')
1.3s
Python
Loading viewer…
Composition of Cases
#hide_input
# treemap nested country -> province, with box area given by confirmed cases
by_confirmed = full_latest.sort_values(by='Confirmed', ascending=False).reset_index(drop=True)
fig = px.treemap(
    by_confirmed,
    path=["Country/Region", "Province/State"],
    values="Confirmed",
    title='Number of Confirmed Cases',
    height=700,
    color_discrete_sequence=px.colors.qualitative.Prism,
)
# print label, text and value on the boxes
fig.update_traces(textinfo='label+text+value')
fig.update_layout(margin={'t': 80, 'l': 0, 'r': 0, 'b': 0})
fig #.write_image('covid-eda-8-1.png')
1.3s
Python
Loading viewer…
# treemap nested country -> province, with box area given by reported deaths
by_deaths = full_latest.sort_values(by='Deaths', ascending=False).reset_index(drop=True)
fig = px.treemap(
    by_deaths,
    path=["Country/Region", "Province/State"],
    values="Deaths",
    title='Number of Deaths reported',
    height=700,
    color_discrete_sequence=px.colors.qualitative.Prism,
)
# print label, text and value on the boxes
fig.update_traces(textinfo='label+text+value')
fig.update_layout(margin={'t': 80, 'l': 0, 'r': 0, 'b': 0})
fig #.write_image('covid-eda-8-2.png')
1.4s
Python
Loading viewer…
Epidemic Span
Note: In the graph, the last day is shown as one day after the last date on which new confirmed cases were reported in the Country/Region.
#hide_input
from datetime import timedelta

# first date
# ----------
# earliest date on which each country reports a confirmed case
first_date = full_table[full_table['Confirmed']>0]
first_date = first_date.groupby('Country/Region')['Date'].agg(['min']).reset_index()
# first_date.head()

# last date
# ---------
# day-over-day change of the per-country totals
# (list selection; tuple selection on a GroupBy raises on pandas 2.x)
last_date = full_table.groupby(['Country/Region', 'Date'])[['Confirmed', 'Deaths', 'Recovered']]
last_date = last_date.sum().diff().reset_index()
# diff() runs straight down the frame, so the first row of every country holds a
# difference against the previous country's last row; blank those rows out
mask = last_date['Country/Region'] != last_date['Country/Region'].shift(1)
last_date.loc[mask, ['Confirmed', 'Deaths', 'Recovered']] = np.nan
# latest date on which the confirmed count rose
last_date = last_date[last_date['Confirmed']>0]
last_date = last_date.groupby('Country/Region')['Date'].agg(['max']).reset_index()
# last_date.head()

# first_last
# ----------
# Join on the country name, not on row position: a country with no recorded
# increase is absent from last_date, and a positional concat would then pair
# every later country with another country's date.
first_last = first_date.merge(last_date, on='Country/Region', how='left')
# a country with no recorded increase gets its first date as its last date
first_last['max'] = first_last['max'].fillna(first_last['min'])
# added 1 more day, which will show the next day as the day on which last case appeared
first_last['max'] = first_last['max'] + timedelta(days=1)
# no. of days
first_last['Days'] = first_last['max'] - first_last['min']
# task column as country
first_last['Task'] = first_last['Country/Region']
# rename columns to the Task/Start/Finish names the gantt helper is given
first_last.columns = ['Country/Region', 'Start', 'Finish', 'Days', 'Task']
# sort by no. of days
first_last = first_last.sort_values('Days')
# first_last.head()

# visualization
# --------------
# produce one random hex colour per country
clr = ["#"+''.join([random.choice('0123456789ABC') for _ in range(6)]) for _ in range(len(first_last))]
#plot
fig = ff.create_gantt(first_last, index_col='Country/Region', colors=clr,
                      show_colorbar=False, bar_width=0.2, showgrid_x=True,
                      showgrid_y=True, height=1600, title=('Gantt Chart'))
# the layout update below replaces the height passed to create_gantt
fig.update_layout(margin=dict(t=80,l=0,r=0,b=0),
                  autosize=False,width=700,height=5000)
fig #.write_image('covid-eda-9-1.png')
3.5s
Python
Loading viewer…
China vs. Not China
#hide
# In China: day-over-day change of the national totals
# (list selection; tuple selection on a GroupBy raises on pandas 2.x)
temp = china.groupby('Date')[['Confirmed', 'Deaths', 'Recovered']].sum().diff()
temp = temp.reset_index()
# long format: one row per (Date, variable) so each count gets its own bar
temp = temp.melt(id_vars="Date",
                 value_vars=['Confirmed', 'Deaths', 'Recovered'])
fig = px.bar(temp, x="Date", y="value", color='variable',
             title='In China',
             color_discrete_sequence=[cnf, dth, rec])
fig.update_layout(barmode='group')
fig.update_layout(margin=dict(t=80,l=0,r=0,b=0))
fig #.write_image('covid-eda-10-1.png')
1.4s
Python
Loading viewer…
# ROW: day-over-day change of the totals outside China
# (list selection; tuple selection on a GroupBy raises on pandas 2.x)
temp = row.groupby('Date')[['Confirmed', 'Deaths', 'Recovered']].sum().diff()
temp = temp.reset_index()
# long format: one row per (Date, variable) so each count gets its own bar
temp = temp.melt(id_vars="Date",
                 value_vars=['Confirmed', 'Deaths', 'Recovered'])
fig = px.bar(temp, x="Date", y="value", color='variable',
             title='Outside China',
             color_discrete_sequence=[cnf, dth, rec])
fig.update_layout(barmode='group')
fig.update_layout(margin=dict(t=80,l=0,r=0,b=0))
fig #.write_image('covid-eda-10-2.png')
1.7s
Python
Loading viewer…
#hide
def from_china_or_not(row):
    """Label a record as belonging to China or to the rest of the world.

    Args:
        row: Record supporting ``row['Country/Region']`` lookup, such as a
            DataFrame row or a dict.

    Returns:
        'From China' when the 'Country/Region' entry equals 'China' exactly,
        otherwise 'Outside China'.
    """
    return 'From China' if row['Country/Region'] == 'China' else 'Outside China'
# tag every record as China / not China, working on a copy of full_table
temp = full_table.copy()
temp['Region'] = temp.apply(from_china_or_not, axis=1)
# day-over-day change per region
# (list selection; tuple selection on a GroupBy raises on pandas 2.x)
temp = temp.groupby(['Region', 'Date'])[['Confirmed', 'Deaths', 'Recovered']]
temp = temp.sum().diff().reset_index()
# diff() runs straight down the frame, so the first row of each region holds a
# difference against the previous region's last row; blank those rows out
mask = temp['Region'] != temp['Region'].shift(1)
temp.loc[mask, ['Confirmed', 'Deaths', 'Recovered']] = np.nan
fig = px.bar(temp, x='Date', y='Confirmed', color='Region', barmode='group',
             text='Confirmed', title='Confirmed', color_discrete_sequence= [cnf, dth, rec])
fig.update_traces(textposition='outside')
fig.update_layout(margin=dict(t=80,l=0,r=0,b=0))
fig #.write_image('covid-eda-10-3.png')
1.6s
Python
Loading viewer…
# daily deaths by region; the bar labels show the plotted Deaths value
# (they were previously taken from the Confirmed column)
fig = px.bar(temp, x='Date', y='Deaths', color='Region', barmode='group',
             text='Deaths', title='Deaths', color_discrete_sequence= [cnf, dth, rec])
fig.update_traces(textposition='outside')
fig.update_traces(textangle=-90)
fig.update_layout(margin=dict(t=80,l=0,r=0,b=0))
fig #.write_image('covid-eda-10-4.png')
1.3s
Python
Loading viewer…
#hide
# per (Date, Country/Region), the largest value reported by any of its provinces
# (list selection; tuple selection on a GroupBy raises on pandas 2.x)
gdf = full_table.groupby(['Date', 'Country/Region'])[['Confirmed', 'Deaths', 'Recovered']].max()
gdf = gdf.reset_index()
# China only, in long format so each case type gets its own facet
temp = gdf[gdf['Country/Region']=='China'].reset_index()
temp = temp.melt(id_vars='Date', value_vars=['Confirmed', 'Deaths', 'Recovered'],
                 var_name='Case', value_name='Count')
fig = px.bar(temp, x="Date", y="Count", color='Case', facet_col="Case",
             title='China', color_discrete_sequence=[cnf, dth, rec])
fig.update_layout(margin=dict(t=80,l=0,r=0,b=0))
fig #.write_image('covid-eda-10-5.png')
1.5s
Python
Loading viewer…
# everything outside China, summed per date. Only the count columns are
# aggregated; a bare .sum() would also sum the string 'Country/Region' column.
temp = gdf[gdf['Country/Region']!='China'].groupby('Date')[['Confirmed', 'Deaths', 'Recovered']].sum().reset_index()
# long format so each case type gets its own facet
temp = temp.melt(id_vars='Date', value_vars=['Confirmed', 'Deaths', 'Recovered'],
                 var_name='Case', value_name='Count')
fig = px.bar(temp, x="Date", y="Count", color='Case', facet_col="Case",
             title='ROW', color_discrete_sequence=[cnf, dth, rec])
fig.update_layout(margin=dict(t=80,l=0,r=0,b=0))
fig #.write_image('covid-eda-10-6.png')
1.3s
Python
Loading viewer…
Data By Country
Top 50 Countries By Confirmed Cases
#hide_input
# fifty countries with the most confirmed cases, renumbered from zero
temp_f = (
    full_latest_grouped
    .sort_values(by='Confirmed', ascending=False)
    .head(50)
    .reset_index(drop=True)
)
temp_f.style.background_gradient(cmap='Reds')
0.6s
Python
Top 25 Countries By Deaths Reported
#hide_input
# Rank by deaths over ALL countries and only then keep 25. Previously the rows
# came from the top 50 by confirmed cases and were cut to 25 before sorting, so
# a country with many deaths but fewer confirmed cases could be left out.
temp_flg = full_latest_grouped.loc[full_latest_grouped['Deaths']>0, ['Country/Region', 'Deaths']]
temp_flg = temp_flg.sort_values('Deaths', ascending=False).head(25)
temp_flg.reset_index(drop=True).style.background_gradient(cmap='Reds')
0.5s
Python
Top 25 Chinese Provinces By Confirmed Cases
#hide_input
# Chinese provinces ranked by confirmed cases
temp_f = china_latest_grouped[['Province/State', 'Confirmed', 'Deaths', 'Recovered']]
temp_f = temp_f.sort_values(by='Confirmed', ascending=False)
# keep 25 rows so the table matches its "Top 25" heading, the way the other
# ranking tables apply head(50) / head(25)
temp_f = temp_f.head(25)
temp_f = temp_f.reset_index(drop=True)
temp_f.style.background_gradient(cmap='Pastel1_r')
0.6s
Python
Related Work
https://www.kaggle.com/imdevskp/mers-outbreak-analysis
https://www.kaggle.com/imdevskp/sars-2003-outbreak-analysis
https://www.kaggle.com/imdevskp/western-africa-ebola-outbreak-analysis