Avi Drucker / May 27 2024
Module 2 - Tutorial - Explore Data
Libraries have pre-defined code for other functions that are not included in basic Python. Once a library has been imported, any of its functions can be used throughout the entire notebook.
%%sh
pip3 install --upgrade pandas openpyxl &> /dev/null
12.1s
# import libraries
import pandas as pd
import numpy as np
0.7s
print(pd.__version__)
0.4s
Load data
csv files
Stands for "comma separated values"; it is a plain text file where each value is separated by some delimiter (usually commas but can be tabs, semicolons, spaces, etc.)
smallgradesh.csv
23.53 KBDownload# load csv file data with headers
#location = "smallgradesh.csv" # uncomment this line for Jupyter Notebook
location = smallgradesh.csv # comment out this line for Jupyter Notebook
df = pd.read_csv(location)
0.2s
df.head()
0.0s
df.tail(3)
0.0s
smallgrades.csv
23.52 KBDownload# load data without headers
# location2 = "smallgrades.csv" # uncomment this in Jupyter Notebook
location2 = smallgrades.csv # comment out this in Jupyter Notebook
df_nohead = pd.read_csv(location2, header=None) # try w/o header=None
df_nohead.head()
0.2s
# add headers during data load
df_during = pd.read_csv(location2, names=['Name', 'Grade'])
df_during.head()
0.0s
# add headers after data load
df_nohead.columns = ['Name', 'Grade']
df_nohead.head()
0.0s
# create data from scratch
names = ['Bob','Jessica','Mary','John','Mel']
grades = [76,95,77,78,99]
GradeList = list(zip(names,grades))
GradeList
0.0s
# export csv files
df = pd.DataFrame(data = GradeList, columns=['Names','Grades'])
df.to_csv('studentgrades.csv',index=False,header=False)
df
0.0s
Excel files
gradedata.xlsx
122.16 KBDownload# import Excel file
location = gradedata.xlsx
# location = "gradedata.xlsx"
df = pd.read_excel(location, engine='openpyxl') # overwrites the info from the df variable in the examples above
df.head()
1.5s
# save dataframe as Excel file
# using GradeList from above
names = ['Bob','Jessica','Mary','John','Mel']
grades = [76,95,77,78,99]
GradeList = list(zip(names,grades))
df = pd.DataFrame(data = GradeList, columns=['Names','Grades'])
writer = pd.ExcelWriter('dataframe_01.xlsx')
df.to_excel(writer, sheet_name='Sheet1')
writer.save()
0.0s
# multiple sheets
df = pd.DataFrame(data = GradeList, columns=['Names','Grades'])
writer = pd.ExcelWriter('dataframe_02.xlsx')
df.to_excel(writer, sheet_name='Sheet1')
df.to_excel(writer, sheet_name='Sheet2')
writer.save()
0.0s
Exploratory Analysis
gradedata.csv
148.77 KBDownload# load gradedata.csv file
# location = "gradedata.csv"
location = gradedata.csv
df = pd.read_csv(location)
df.head()
0.3s
# show the number of rows and columns in a dataframe
df.shape
0.0s
# show the column names in the dataset
df.columns
# or print(df.columns)
0.0s
# show the data type of each column
df.dtypes
0.0s
# number of non-NA values
# len(df) would count rows including NA values
df.count()
0.0s
df['hours'].max()
0.0s
df['hours'].min()
0.0s
df['hours'].sum()
0.0s
df['hours'].mean()
0.0s
df['hours'].median()
0.0s
df['hours'].mode()
0.0s
df['hours'].value_counts()
0.0s
# standard deviation
df['hours'].std()
0.0s
# descriptive statistics
df.describe()
0.1s
# doesn't do anything on its own
df.groupby('gender')
0.0s
# add a stats or math function to it
df['hours'].groupby(df['gender']).mean()
# mean of multiple columns
# df[['hours', 'exercise']].groupby(df['gender']).mean()
0.0s
# use two columns to groupby
df.groupby(['gender', 'age']).mean()
0.1s
# pivot table default function is mean
pd.pivot_table(df, index=['gender'])
0.0s
pd.pivot_table(df, values=['hours'], index=['gender'])
0.0s
# unique values in a column
df['age'].unique()
0.0s
gradedatamissing.csv
150.25 KBDownload# find missing values
filename = gradedatamissing.csv # comment out this in Jupyter Notebook
#filename = "gradedatamissing.csv" # uncomment this in Jupyter Notebook
df_missing = pd.read_csv(filename)
# df.head()
0.2s
# total missing values
df_missing.isnull().sum()
0.0s
# show rows with missing values
missing = df_missing['exercise'].isnull()
# missing will only show True/False values
df_missing.loc[missing]
0.1s
Finding duplicate rows
names = ['Jessica','John','Bob','Jessica','Mary','John','Mel','Mel']
grades = [95,78,76,95,77,78,99,100]
GradeList = list(zip(names,grades))
df = pd.DataFrame(data = GradeList, columns=['Names', 'Grades'])
df
0.1s
# boolean values for if there's another row with the exact values in each column
dupe = df.duplicated()
# duplicate of Jessica, 95; John, 78
# returns false on first instance of duplicate row
dupe
0.0s
df.loc[dupe]
0.0s
Choosing Rows
colA = [10, 20, 40, 50]
colB = ['no', 'yes', 'yes', 'no']
A_B = list(zip(colA, colB))
df_A = pd.DataFrame(data=A_B, columns=['A', 'B'])
df_A
0.0s
index = ['a', 'b', 'c', 'd']
colA = [10, 20, 40, 50]
colB = ['no', 'yes', 'yes', 'no']
df_B = pd.DataFrame(data=A_B, columns=['A', 'B'], index=index)
df_B
0.0s
# loc: label based indexing
df_A.loc[0]
0.0s
# iloc: select row by index(position) number
df_A.iloc[0]
0.0s
# loc: label based indexing
df_B.loc['b']
0.0s
df_B.iloc[1]
0.0s
# let's see how it gets tricky
df_C = df_A.copy()
df_C
0.0s
index = [2,1,0,3]
colA = [10, 20, 40, 50]
colB = ['no', 'yes', 'yes', 'no']
df_D = pd.DataFrame(data=A_B, columns=['A', 'B'], index=index)
df_D
0.0s
# works like python slicing; give me index 0 through the index before 3
df_C.iloc[0:3]
0.0s
# give me the index label '0' and everything in between and include the label '3'
df_C.loc[0:3]
0.0s
# gave index position 0 through the index position before 3
df_D.iloc[0:3]
0.0s
# gave the label '0' and all the rows that are in between and ending with the row with label '3'
df_D.loc[0:3]
0.0s
Bonus Code!
Below are some code snippets for advanced tasks
Load multiple data files
data1.xlsx
7.87 KBDownloaddata2.xlsx
7.88 KBDownloaddata3.xlsx
7.86 KBDownload# import glob # uncomment this in Jupyter Notebook
data_files = [data1.xlsx, data2.xlsx, data3.xlsx] # comment out this line in Jupyter Notebook
all_data = pd.DataFrame()
for f in data_files: # comment out this line in Jupyter Notebook
# for f in glob.glob("data*.xlsx"): # uncomment this in Jupyter Notebook
df = pd.read_excel(f)
all_data = all_data.append(df, ignore_index=True)
all_data.count() # each data file had 100 rows
0.8s
Create random data
from numpy import random
names = ['Bob','Jessica','Mary','John','Mel']
random.seed(500)
random_names = [names[random.randint(low=0,high=len(names))]
for i in range(1000)]
births = [random.randint(low=0,high=1000)
for i in range(1000)]
BabyDataSet = list(zip(random_names,births))
df = pd.DataFrame(data = BabyDataSet, columns=['Names', 'Births'])
df.head()
0.0s
Appendix
This repo is mounted by: Python