Module 3 - Tutorial - Data Manipulation

import pandas as pd
import numpy as np

Delete Duplicate Data

#create duplicate data
names = ['Jessica','John','Bob','Jessica','Mary','John','Mel','Mel']
grades = [95,78,76,95,77,78,99,100]
GradeList = list(zip(names,grades))
df = pd.DataFrame(data = GradeList, columns=['Names', 'Grades'])
df
dupe = df.duplicated() #creates list of True/False values
df[dupe] #shows rows where duplicated is True
#drops duplicate rows, default is to keep first observation
nodupedf = df.drop_duplicates()
nodupedf.count()
#keep last observation
df.drop_duplicates(['Names'], keep='last')

Select Rows by a Condition

#create fake data
names = ['Bob','Jessica','Mary','John','Mel']
grades = [76,-2,77,78,101]
GradeList = list(zip(names,grades))
df = pd.DataFrame(data = GradeList, columns=['Names', 'Grades'])
df
#find rows equal to 101
df.loc[df['Grades'] == 101] 
#finds rows where the grade is less than or equal to 100
df.loc[df['Grades'] <= 100] 
#multiple conditions
df.loc[(df['Grades'] >= 60) & (df['Grades'] < 100)] #between 60 and 100
#replacing values with new value
df.loc[df['Grades'] <= 77, 'Grades'] = 100
df

Missing Data

df = pd.read_csv("datasets/gradedatamissing.csv")
df.head()
#Selecting rows with no missing age or gender
df[df['age'].notnull() & df['gender'].notnull()]
#drop rows with any missing data
df_no_missing = df.dropna()
df_no_missing
#replace empty cells with 0
df.fillna(0)
#replace empty cells with average of column
df["grade"].fillna(df["grade"].mean(), inplace=True)

Binning Data

filename = "datasets/gradedata.csv"
df = pd.read_csv(filename)
df.head()
df['grade'].dtypes
#Define bins as 0-59, 60-69, 70-79, 80-89, 90-100
bins = [0, 60, 70, 80, 90, 100]
# Create names for the four groups
group_names = ['F', 'D', 'C', 'B', 'A']
#make new column with letter grades
df['lettergrade'] = pd.cut(df['grade'], bins, labels=group_names)
df.head()
pd.value_counts(df['lettergrade']) # number of observations per letter grade
df.groupby('lettergrade')['hours'].mean() #mean of hours studied per letter grade
#apply functions to data
df['grade'] = df['grade'].apply(lambda x: int(x)) #turns float type grades to int type
df.head()

Add and Remove Columns

names = ['Bob','Jessica','Mary','John','Mel']
grades = [76,95,77,78,99]
bsdegress = [1,1,0,0,1]
msdegrees = [2,1,0,0,0]
phddegrees = [0,1,0,0,0]
GradeList = list(zip(names,grades,bsdegress,msdegrees,phddegrees))
df = pd.DataFrame(data = GradeList, columns=['Names','Grades','BS','MS','PhD'])
df
#make a new column with each value = 0
df['HighSchool']=0
#make new column with all NaN values
df['Preschool'] = np.nan
#manually add data for new column
d = ([0,1,0,1,0])
s = pd.Series(d, index= df.index)
df['DriversLicense'] = s
df
#drop a column
df.drop('PhD', axis=1)
#drop completely empty columns (all NaN/null values)
df.dropna(axis=1, how='all')
Make new columns
#using functions
filename = "datasets/gradedata.csv"
df = pd.read_csv(filename)
def singlename(fn, ln):
    return fn + " " + ln
df['fullname'] = singlename(df['fname'], df['lname'])
df.head()
#create column based on binary conditional
df['isFailing'] = np.where(df['grade']<70, 'yes', 'no')
df.tail()
#make numeric value column based on string value column
#create a function that will return 1 if female and 0 if male
def score_to_numeric(x):
    if x=='female':
        return 1
    if x=='male':
        return 0
    
df['gender_val'] = df['gender'].apply(score_to_numeric)
df.tail()

Sort Values

#still using gradedata.csv
#df.head() if needed
df.sort_values('hours') #default is ascending=True
#does not change structure of df

Calculate and Remove Outliers

#still using gradedata.csv
#check df.head() if needed
#Standard Deviation Method
meangrade = df['grade'].mean()
stdgrade = df['grade'].std()
toprange = meangrade + stdgrade * 1.96
botrange = meangrade - stdgrade * 1.96
newdf = df.copy() #to not mess up the original df
newdf = newdf.drop(newdf[newdf['grade'] > toprange].index)
newdf = newdf.drop(newdf[newdf['grade'] < botrange].index)
newdf.head()
#Interquartile Range Method
q1 = df['grade'].quantile(.25)
q3 = df['grade'].quantile(.75)
iqr = q3-q1
toprange = q3 + iqr * 1.5
botrange = q1 - iqr * 1.5
newdf = df.copy()
newdf = newdf.drop(newdf[newdf['grade'] > toprange].index)
newdf = newdf.drop(newdf[newdf['grade'] < botrange].index)
newdf.head()

Appendix

Runtimes (1)