Module 2 - Tutorial - Explore Data

Libraries contain pre-written code for functions that are not included in base Python. Once a library has been imported, any of its functions can be used throughout the rest of the notebook.

# import libraries
import pandas as pd
import numpy as np
print(pd.__version__)
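Once imported, a library's functions are called through its alias; for example, a quick sketch using numpy's mean:
# call a numpy function through its alias
np.mean([76, 95, 77, 78, 99])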

Load data

CSV files

Stands for "comma-separated values"; a CSV is a plain-text file where each value is separated by a delimiter (usually a comma, but tabs, semicolons, spaces, etc. are also used).
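If the delimiter is not a comma, the sep argument tells read_csv what to split on. A minimal sketch, assuming a hypothetical tab-delimited file named grades.tsv:
# grades.tsv is a hypothetical example file; sep specifies the delimiter
df_tab = pd.read_csv("grades.tsv", sep="\t")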

smallgradesh.csv
# load csv file data with headers
#location = "smallgradesh.csv" # uncomment this line for Jupyter Notebooks
location = smallgradesh.csv # comment out this line for Jupyter Notebooks
df = pd.read_csv(location)
df.head()
# df.head?
df.tail(3)
smallgrades.csv
# load data without headers
# location2 = "smallgrades.csv" # uncomment this in Jupyter Notebook
location2 = smallgrades.csv # comment out this in Jupyter Notebook
df_nohead = pd.read_csv(location2, header=None) # try w/o header=None
df_nohead.head()
# add headers during data load
df_during = pd.read_csv(location2, names=['Name', 'Grade'])
df_during.head()
# add headers after data load
df_nohead.columns = ['Name', 'Grade']
df_nohead.head()
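A single column can also be renamed with rename instead of replacing the whole list; a small sketch using the headers just assigned (the new name 'Score' is only an example):
# rename one column; returns a new dataframe unless inplace=True is used
df_nohead.rename(columns={'Grade': 'Score'}).head()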
# create data from scratch
names = ['Bob','Jessica','Mary','John','Mel']
grades = [76,95,77,78,99]
GradeList = list(zip(names,grades))
GradeList
# export csv files
df = pd.DataFrame(data = GradeList, columns=['Names','Grades'])
df.to_csv('studentgrades.csv',index=False,header=False)
df
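Since the file above was written with header=False, reading it back requires passing the column names again; a minimal sketch assuming studentgrades.csv was just created by the cell above:
# read the exported file back in, re-adding the column names
pd.read_csv('studentgrades.csv', header=None, names=['Names','Grades']).head()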

Excel files

gradedata.xlsx
# import Excel file
location = gradedata.xlsx # comment out this line in Jupyter Notebook
# location = "gradedata.xlsx" # uncomment this in Jupyter Notebook
df = pd.read_excel(location, engine='openpyxl') # overwrites the info from the df variable in the examples above
df.head()
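read_excel loads the first sheet by default; a specific sheet can be chosen with the sheet_name argument. A short sketch using the same workbook (position 0 is the first sheet):
# read the first sheet explicitly by position
df_sheet = pd.read_excel(location, sheet_name=0, engine='openpyxl')
df_sheet.head()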
# save dataframe as Excel file
# using GradeList from above
names = ['Bob','Jessica','Mary','John','Mel']
grades = [76,95,77,78,99]
GradeList = list(zip(names,grades))
df = pd.DataFrame(data = GradeList, columns=['Names','Grades'])
writer = pd.ExcelWriter('dataframe_01.xlsx')
df.to_excel(writer, sheet_name='Sheet1')
writer.close() # writes the file; ExcelWriter.save() was removed in newer pandas
# multiple sheets
df = pd.DataFrame(data = GradeList, columns=['Names','Grades'])
writer = pd.ExcelWriter('dataframe_02.xlsx')
df.to_excel(writer, sheet_name='Sheet1')
df.to_excel(writer, sheet_name='Sheet2')
writer.close()
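To load the workbook just written back in, sheet_name=None returns a dictionary of DataFrames, one per sheet; a minimal sketch:
# read every sheet into a dict keyed by sheet name
sheets = pd.read_excel('dataframe_02.xlsx', sheet_name=None)
sheets.keys()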

Exploratory Analysis

gradedata.csv
# load gradedata.csv file
# location = "gradedata.csv"
location = gradedata.csv
df = pd.read_csv(location)
df.head()
# show the number of rows and columns in a dataframe
df.shape
# show the column names in the dataset
df.columns
# or print(df.columns)
# show the data type of each column
df.dtypes
# number of non-NA values
# len(df) would count rows including NA values
df.count()
df['hours'].max()
df['hours'].min()
df['hours'].sum()
df['hours'].mean()
df['hours'].median()
df['hours'].mode()
df['hours'].value_counts()
# standard deviation
df['hours'].std()
# descriptive statistics
df.describe()
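By default describe() only summarizes numeric columns; include='all' adds counts, unique values, and top frequencies for the text columns as well (non-applicable statistics show as NaN). A short sketch:
# summary statistics for every column, not just the numeric ones
df.describe(include='all')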
# groupby on its own only creates a grouped object; nothing is computed yet
df.groupby('gender')
# apply a statistical or math function to the grouped data
df['hours'].groupby(df['gender']).mean()
# mean of multiple columns
# df[['hours', 'exercise']].groupby(df['gender']).mean()
# use two columns to groupby
df.groupby(['gender', 'age']).mean(numeric_only=True) # numeric_only skips text columns (required in newer pandas)
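Several statistics can be computed per group at once with agg; a minimal sketch on the hours column:
# multiple aggregations for each gender group
df['hours'].groupby(df['gender']).agg(['mean', 'median', 'std'])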
# pivot table default function is mean
pd.pivot_table(df, index=['gender'])
pd.pivot_table(df, values=['hours'], index=['gender'])
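The aggregation can be changed from the default mean through the aggfunc argument; a short sketch using the median:
# pivot table with a different aggregation function
pd.pivot_table(df, values=['hours'], index=['gender'], aggfunc='median')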
# unique values in a column
df['age'].unique()
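If only the number of distinct values is needed rather than the values themselves, nunique returns the count directly:
# how many distinct ages appear in the data
df['age'].nunique()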
gradedatamissing.csv
# find missing values
filename = gradedatamissing.csv # comment out this in Jupyter Notebook
#filename = "gradedatamissing.csv" # uncomment this in Jupyter Notebook
df_missing = pd.read_csv(filename)
# df_missing.head()
# total missing values
df_missing.isnull().sum()
# show rows with missing values
missing = df_missing['exercise'].isnull()
# missing will only show True/False values
df_missing.loc[missing]
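Once the missing rows have been found, they can be dropped or filled; a minimal sketch of both options (neither result is saved back to df_missing here):
# drop any row that contains a missing value
df_missing.dropna().count()
# or fill missing exercise values with the column mean
df_missing['exercise'].fillna(df_missing['exercise'].mean()).count()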

Finding duplicate rows

names = ['Jessica','John','Bob','Jessica','Mary','John','Mel','Mel']
grades = [95,78,76,95,77,78,99,100]
GradeList = list(zip(names,grades))
df = pd.DataFrame(data = GradeList, columns=['Names', 'Grades'])
df
# True for each row that is an exact duplicate of an earlier row (all columns match)
dupe = df.duplicated()
# Jessica, 95 and John, 78 each appear twice
# the first occurrence of a duplicated row returns False
dupe
df.loc[dupe]
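To remove the duplicated rows instead of just flagging them, drop_duplicates keeps the first occurrence of each row by default; a short sketch:
# keep only the first occurrence of each duplicated row
df.drop_duplicates()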

Choosing Rows

colA = [10, 20, 40, 50]
colB = ['no', 'yes', 'yes', 'no']
A_B = list(zip(colA, colB))
df_A = pd.DataFrame(data=A_B, columns=['A', 'B'])
df_A
index = ['a', 'b', 'c', 'd']
colA = [10, 20, 40, 50]
colB = ['no', 'yes', 'yes', 'no']
df_B = pd.DataFrame(data=A_B, columns=['A', 'B'], index=index)
df_B
# loc: label based indexing
df_A.loc[0]
# iloc: select row by index(position) number
df_A.iloc[0]
# loc: label based indexing
df_B.loc['b']
df_B.iloc[1]
# let's see how it gets tricky
df_C = df_A.copy()
df_C
index = [2,1,0,3]
colA = [10, 20, 40, 50]
colB = ['no', 'yes', 'yes', 'no']
df_D = pd.DataFrame(data=A_B, columns=['A', 'B'], index=index)
df_D
# iloc works like Python slicing: positions 0 up to (but not including) 3
df_C.iloc[0:3]
# loc slices by label: the row labeled 0 through the row labeled 3, inclusive
df_C.loc[0:3]
# positions 0 up to (but not including) 3, regardless of the labels
df_D.iloc[0:3]
# starts at the row labeled 0 and ends at the row labeled 3, including everything in between
df_D.loc[0:3]
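loc also accepts a Boolean condition, which is often more practical than positions or labels for choosing rows; a minimal sketch on df_A:
# select only the rows where column B equals 'yes'
df_A.loc[df_A['B'] == 'yes']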

Bonus Code!

Below are some code snippets for more advanced tasks.

Load multiple data files

data1.xlsx
data2.xlsx
data3.xlsx
# import glob # uncomment this in Jupyter Notebook
data_files = [data1.xlsx, data2.xlsx, data3.xlsx] # comment out this line in Jupyter Notebook
frames = []
for f in data_files: # comment out this line in Jupyter Notebook
# for f in glob.glob("data*.xlsx"): # uncomment this in Jupyter Notebook
    frames.append(pd.read_excel(f))

all_data = pd.concat(frames, ignore_index=True) # DataFrame.append was removed in newer pandas
all_data.count() # each data file had 100 rows

Create random data

from numpy import random
names = ['Bob','Jessica','Mary','John','Mel']
random.seed(500)
random_names = [names[random.randint(low=0, high=len(names))] for i in range(1000)]
births = [random.randint(low=0, high=1000) for i in range(1000)]
BabyDataSet = list(zip(random_names,births))
df = pd.DataFrame(data = BabyDataSet, columns=['Names', 'Births'])
df.head()
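As a quick sanity check on the generated data, value_counts shows how often each name was drawn; a short sketch:
# how many times each name appears in the random data
df['Names'].value_counts()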
