Module 2 - Tutorial - Explore Data

Libraries contain pre-written code for functions that are not included in base Python. Once a library has been imported, any of its functions can be used throughout the rest of the notebook.

# import libraries
import pandas as pd
import numpy as np
print(pd.__version__)
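Once imported, a library's functions are called through its alias; for example, a quick sketch using numpy's mean:
# call a numpy function through its alias
np.mean([76, 95, 77, 78, 99])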

Load data

CSV files

Stands for "comma-separated values"; a CSV is a plain-text file where each value is separated by a delimiter (usually a comma, but tabs, semicolons, spaces, etc. are also used).
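If the delimiter is not a comma, the sep argument tells read_csv what to split on. A minimal sketch, assuming a hypothetical tab-delimited file named grades.tsv:
# grades.tsv is a hypothetical example file; sep specifies the delimiter
df_tab = pd.read_csv("grades.tsv", sep="\t")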

smallgradesh.csv
# load csv file data with headers
#location = "smallgradesh.csv" # uncomment this line for Jupyter Notebooks
location = smallgradesh.csv # comment out this line for Jupyter Notebooks
df = pd.read_csv(location)
df.head()
# df.head?
df.tail(3)
smallgrades.csv
# load data without headers
# location2 = "smallgrades.csv" # uncomment this in Jupyter Notebook
location2 = smallgrades.csv # comment out this in Jupyter Notebook
df_nohead = pd.read_csv(location2, header=None) # try w/o header=None
df_nohead.head()
# add headers during data load
df_during = pd.read_csv(location2, names=['Name', 'Grade'])
df_during.head()
# add headers after data load
df_nohead.columns = ['Name', 'Grade']
df_nohead.head()
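A single column can also be renamed with rename instead of replacing the whole list; a small sketch using the headers just assigned (the new name 'Score' is only an example):
# rename one column; returns a new dataframe unless inplace=True is used
df_nohead.rename(columns={'Grade': 'Score'}).head()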
# create data from scratch
names = ['Bob','Jessica','Mary','John','Mel']
grades = [76,95,77,78,99]
GradeList = list(zip(names,grades))
GradeList
# export csv files
df = pd.DataFrame(data = GradeList, columns=['Names','Grades'])
df.to_csv('studentgrades.csv',index=False,header=False)
df
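Since the file above was written with header=False, reading it back requires passing the column names again; a minimal sketch assuming studentgrades.csv was just created by the cell above:
# read the exported file back in, re-adding the column names
pd.read_csv('studentgrades.csv', header=None, names=['Names','Grades']).head()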

Excel files

gradedata.xlsx
# import Excel file
location = gradedata.xlsx # comment out this line in Jupyter Notebook
# location = "gradedata.xlsx" # uncomment this in Jupyter Notebook
df = pd.read_excel(location, engine='openpyxl') # overwrites the info from the df variable in the examples above
df.head()
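read_excel loads the first sheet by default; a specific sheet can be chosen with the sheet_name argument. A short sketch using the same workbook (position 0 is the first sheet):
# read the first sheet explicitly by position
df_sheet = pd.read_excel(location, sheet_name=0, engine='openpyxl')
df_sheet.head()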
# save dataframe as Excel file
# using GradeList from above
names = ['Bob','Jessica','Mary','John','Mel']
grades = [76,95,77,78,99]
GradeList = list(zip(names,grades))
df = pd.DataFrame(data = GradeList, columns=['Names','Grades'])
writer = pd.ExcelWriter('dataframe_01.xlsx')
df.to_excel(writer, sheet_name='Sheet1')
writer.close() # writes the file; ExcelWriter.save() was removed in newer pandas
# multiple sheets
df = pd.DataFrame(data = GradeList, columns=['Names','Grades'])
writer = pd.ExcelWriter('dataframe_02.xlsx')
df.to_excel(writer, sheet_name='Sheet1')
df.to_excel(writer, sheet_name='Sheet2')
writer.close()
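To load the workbook just written back in, sheet_name=None returns a dictionary of DataFrames, one per sheet; a minimal sketch:
# read every sheet into a dict keyed by sheet name
sheets = pd.read_excel('dataframe_02.xlsx', sheet_name=None)
sheets.keys()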

Exploratory Analysis

gradedata.csv
# load gradedata.csv file
# location = "gradedata.csv"
location = gradedata.csv
df = pd.read_csv(location)
df.head()
# show the number of rows and columns in a dataframe
df.shape
# show the column names in the dataset
df.columns
# or print(df.columns)
# show the data type of each column
df.dtypes
# number of non-NA values
# len(df) would count rows including NA values
df.count()
df['hours'].max()
df['hours'].min()
df['hours'].sum()
df['hours'].mean()
df['hours'].median()
df['hours'].mode()
df['hours'].value_counts()
# standard deviation
df['hours'].std()
# descriptive statistics
df.describe()
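By default describe() only summarizes numeric columns; include='all' adds counts, unique values, and top frequencies for the text columns as well (non-applicable statistics show as NaN). A short sketch:
# summary statistics for every column, not just the numeric ones
df.describe(include='all')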
# groupby on its own only creates a grouped object; nothing is computed yet
df.groupby('gender')
# apply a statistical or math function to the grouped data
df['hours'].groupby(df['gender']).mean()
# mean of multiple columns
# df[['hours', 'exercise']].groupby(df['gender']).mean()
# use two columns to groupby
df.groupby(['gender', 'age']).mean(numeric_only=True) # numeric_only skips text columns (required in newer pandas)
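Several statistics can be computed per group at once with agg; a minimal sketch on the hours column:
# multiple aggregations for each gender group
df['hours'].groupby(df['gender']).agg(['mean', 'median', 'std'])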
# pivot table default function is mean
pd.pivot_table(df, index=['gender'])
pd.pivot_table(df, values=['hours'], index=['gender'])
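The aggregation can be changed from the default mean through the aggfunc argument; a short sketch using the median:
# pivot table with a different aggregation function
pd.pivot_table(df, values=['hours'], index=['gender'], aggfunc='median')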
# unique values in a column
df['age'].unique()
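If only the number of distinct values is needed rather than the values themselves, nunique returns the count directly:
# how many distinct ages appear in the data
df['age'].nunique()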
gradedatamissing.csv
# find missing values
filename = gradedatamissing.csv # comment out this in Jupyter Notebook
#filename = "gradedatamissing.csv" # uncomment this in Jupyter Notebook
df_missing = pd.read_csv(filename)
# df_missing.head()
# total missing values
df_missing.isnull().sum()
# show rows with missing values
missing = df_missing['exercise'].isnull()
# missing will only show True/False values
df_missing.loc[missing]
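Once the missing rows have been found, they can be dropped or filled; a minimal sketch of both options (neither result is saved back to df_missing here):
# drop any row that contains a missing value
df_missing.dropna().count()
# or fill missing exercise values with the column mean
df_missing['exercise'].fillna(df_missing['exercise'].mean()).count()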

Finding duplicate rows

names = ['Jessica','John','Bob','Jessica','Mary','John','Mel','Mel']
grades = [95,78,76,95,77,78,99,100]
GradeList = list(zip(names,grades))
df = pd.DataFrame(data = GradeList, columns=['Names', 'Grades'])
df
# True for each row that is an exact duplicate of an earlier row (all columns match)
dupe = df.duplicated()
# Jessica, 95 and John, 78 each appear twice
# the first occurrence of a duplicated row returns False
dupe
df.loc[dupe]
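To remove the duplicated rows instead of just flagging them, drop_duplicates keeps the first occurrence of each row by default; a short sketch:
# keep only the first occurrence of each duplicated row
df.drop_duplicates()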

Choosing Rows

colA = [10, 20, 40, 50]
colB = ['no', 'yes', 'yes', 'no']
A_B = list(zip(colA, colB))
df_A = pd.DataFrame(data=A_B, columns=['A', 'B'])
df_A
index = ['a', 'b', 'c', 'd']
colA = [10, 20, 40, 50]
colB = ['no', 'yes', 'yes', 'no']
df_B = pd.DataFrame(data=A_B, columns=['A', 'B'], index=index)
df_B
# loc: label based indexing
df_A.loc[0]
# iloc: select row by index(position) number
df_A.iloc[0]
# loc: label based indexing
df_B.loc['b']
df_B.iloc[1]
# let's see how it gets tricky
df_C = df_A.copy()
df_C
index = [2,1,0,3]
colA = [10, 20, 40, 50]
colB = ['no', 'yes', 'yes', 'no']
df_D = pd.DataFrame(data=A_B, columns=['A', 'B'], index=index)
df_D
# iloc works like Python slicing: positions 0 up to (but not including) 3
df_C.iloc[0:3]
# loc slices by label: the row labeled 0 through the row labeled 3, inclusive
df_C.loc[0:3]
# positions 0 up to (but not including) 3, regardless of the labels
df_D.iloc[0:3]
# starts at the row labeled 0 and ends at the row labeled 3, including everything in between
df_D.loc[0:3]
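loc also accepts a Boolean condition, which is often more practical than positions or labels for choosing rows; a minimal sketch on df_A:
# select only the rows where column B equals 'yes'
df_A.loc[df_A['B'] == 'yes']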

Bonus Code!

Below are some code snippets for more advanced tasks.

Load multiple data files

data1.xlsx
data2.xlsx
data3.xlsx
# import glob # uncomment this in Jupyter Notebook
data_files = [data1.xlsx, data2.xlsx, data3.xlsx] # comment out this line in Jupyter Notebook
frames = []
for f in data_files: # comment out this line in Jupyter Notebook
# for f in glob.glob("data*.xlsx"): # uncomment this in Jupyter Notebook
    frames.append(pd.read_excel(f))

all_data = pd.concat(frames, ignore_index=True) # DataFrame.append was removed in newer pandas
all_data.count() # each data file had 100 rows

Create random data

from numpy import random
names = ['Bob','Jessica','Mary','John','Mel']
random.seed(500)
random_names = [names[random.randint(low=0, high=len(names))] for i in range(1000)]
births = [random.randint(low=0, high=1000) for i in range(1000)]
BabyDataSet = list(zip(random_names,births))
df = pd.DataFrame(data = BabyDataSet, columns=['Names', 'Births'])
df.head()
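As a quick sanity check on the generated data, value_counts shows how often each name was drawn; a short sketch:
# how many times each name appears in the random data
df['Names'].value_counts()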
