Avi Drucker / May 27 2024

Module 3 - Tutorial - Data Manipulation

import pandas as pdimport numpy as np

Delete Duplicate Data

#create duplicate datanames = ['Jessica','John','Bob','Jessica','Mary','John','Mel','Mel']grades = [95,78,76,95,77,78,99,100]GradeList = list(zip(names,grades))df = pd.DataFrame(data = GradeList, columns=['Names', 'Grades'])df

dupe = df.duplicated() #creates list of True/False valuesdf[dupe] #shows rows where duplicated is True

#drops duplicate rows, default is to keep first observationnodupedf = df.drop_duplicates()nodupedf.count()

#keep last observationdf.drop_duplicates(['Names'], keep='last')

Select Rows by a Condition

#create fake datanames = ['Bob','Jessica','Mary','John','Mel']grades = [76,-2,77,78,101]GradeList = list(zip(names,grades))df = pd.DataFrame(data = GradeList, columns=['Names', 'Grades'])df

#find rows equal to 101df.loc[df['Grades'] == 101] 

#finds rows where the grade is less than or equal to 100df.loc[df['Grades'] <= 100] 

#multiple conditionsdf.loc[(df['Grades'] >= 60) & (df['Grades'] < 100)] #between 60 and 100

#replacing values with new valuedf.loc[df['Grades'] <= 77, 'Grades'] = 100df

Missing Data

gradedatamissing.csv

150.25 KBDownload

location = gradedatamissing.csv # note: in Jupyter Notebook we must wrap file names in quotesdf = pd.read_csv(location)df.head()

#Selecting rows with no missing age or genderdf[df['age'].notnull() & df['gender'].notnull()]

#drop rows with any missing datadf_no_missing = df.dropna()df_no_missing

#replace empty cells with 0df.fillna(0)

#replace empty cells with average of columndf["grade"].fillna(df["grade"].mean(), inplace=True)

Binning Data

gradedata.csv

148.77 KBDownload

filename = gradedata.csvdf = pd.read_csv(filename)df.head()

df['grade'].dtypes

#Define bins as 0-59, 60-69, 70-79, 80-89, 90-100bins = [0, 60, 70, 80, 90, 100]# Create names for the four groupsgroup_names = ['F', 'D', 'C', 'B', 'A']#make new column with letter gradesdf['lettergrade'] = pd.cut(df['grade'], bins, labels=group_names)df.head()

pd.value_counts(df['lettergrade']) # number of observations per letter grade

df.groupby('lettergrade')['hours'].mean() #mean of hours studied per letter grade

#apply functions to datadf['grade'] = df['grade'].apply(lambda x: int(x)) #turns float type grades to int typedf.head()

Add and Remove Columns

names = ['Bob','Jessica','Mary','John','Mel']grades = [76,95,77,78,99]bsdegress = [1,1,0,0,1]msdegrees = [2,1,0,0,0]phddegrees = [0,1,0,0,0]GradeList = list(zip(names,grades,bsdegress,msdegrees,phddegrees))df = pd.DataFrame(data = GradeList, columns=['Names','Grades','BS','MS','PhD'])df

#make a new column with each value = 0df['HighSchool']=0#make new column with all NaN valuesdf['Preschool'] = np.nan#manually add data for new columnd = ([0,1,0,1,0])s = pd.Series(d, index= df.index)df['DriversLicense'] = sdf

#drop a columndf.drop('PhD', axis=1)

#drop completely empty columns (all NaN/null values)df.dropna(axis=1, how='all')

Make new columns

#using functionsfilename = gradedata.csvdf = pd.read_csv(filename)def singlename(fn, ln):    return fn + " " + lndf['fullname'] = singlename(df['fname'], df['lname'])df.head()

#create column based on binary conditionaldf['isFailing'] = np.where(df['grade']<70, 'yes', 'no')df.tail()

#make numeric value column based on string value column#create a function that will return 1 if female and 0 if maledef score_to_numeric(x):    if x=='female':        return 1    if x=='male':        return 0    df['gender_val'] = df['gender'].apply(score_to_numeric)df.tail()

Sort Values

#still using gradedata.csv#df.head() if neededdf.sort_values('hours') #default is ascending=True#does not change structure of df

Calculate and Remove Outliers

#still using gradedata.csv#check df.head() if needed#Standard Deviation Methodmeangrade = df['grade'].mean()stdgrade = df['grade'].std()toprange = meangrade + stdgrade * 1.96botrange = meangrade - stdgrade * 1.96newdf = df.copy() #to not mess up the original dfnewdf = newdf.drop(newdf[newdf['grade'] > toprange].index)newdf = newdf.drop(newdf[newdf['grade'] < botrange].index)newdf.head()

#Interquartile Range Methodq1 = df['grade'].quantile(.25)q3 = df['grade'].quantile(.75)iqr = q3-q1toprange = q3 + iqr * 1.5botrange = q1 - iqr * 1.5newdf = df.copy()newdf = newdf.drop(newdf[newdf['grade'] > toprange].index)newdf = newdf.drop(newdf[newdf['grade'] < botrange].index)newdf.head()

Back to the course outline

Appendix

priesterkc/Data-Analytics-Lessons

This repo is mounted by: Python

Module 3 - Tutorial - Data Manipulation

Delete Duplicate Data

Select Rows by a Condition

Missing Data

Binning Data

Add and Remove Columns

Make new columns

Sort Values

Calculate and Remove Outliers

Appendix

Runtimes (1)