Avi Drucker / May 27 2024
Module 3 - Tutorial - Data Manipulation
import pandas as pdimport numpy as np1.2s
Delete Duplicate Data
#create duplicate datanames = ['Jessica','John','Bob','Jessica','Mary','John','Mel','Mel']grades = [95,78,76,95,77,78,99,100]GradeList = list(zip(names,grades))df = pd.DataFrame(data = GradeList, columns=['Names', 'Grades'])df0.1s
dupe = df.duplicated() #creates list of True/False valuesdf[dupe] #shows rows where duplicated is True0.0s
#drops duplicate rows, default is to keep first observationnodupedf = df.drop_duplicates()nodupedf.count()0.0s
#keep last observationdf.drop_duplicates(['Names'], keep='last')0.1s
Select Rows by a Condition
#create fake datanames = ['Bob','Jessica','Mary','John','Mel']grades = [76,-2,77,78,101]GradeList = list(zip(names,grades))df = pd.DataFrame(data = GradeList, columns=['Names', 'Grades'])df0.0s
#find rows equal to 101df.loc[df['Grades'] == 101] 0.1s
#finds rows where the grade is less than or equal to 100df.loc[df['Grades'] <= 100] 0.0s
#multiple conditionsdf.loc[(df['Grades'] >= 60) & (df['Grades'] < 100)] #between 60 and 1000.0s
#replacing values with new valuedf.loc[df['Grades'] <= 77, 'Grades'] = 100df0.0s
Missing Data
gradedatamissing.csv
150.25 KBDownloadlocation = gradedatamissing.csv # note: in Jupyter Notebook we must wrap file names in quotesdf = pd.read_csv(location)df.head()0.2s
#Selecting rows with no missing age or genderdf[df['age'].notnull() & df['gender'].notnull()]#drop rows with any missing datadf_no_missing = df.dropna()df_no_missing#replace empty cells with 0df.fillna(0)#replace empty cells with average of columndf["grade"].fillna(df["grade"].mean(), inplace=True)Binning Data
gradedata.csv
148.77 KBDownloadfilename = gradedata.csvdf = pd.read_csv(filename)df.head()df['grade'].dtypes#Define bins as 0-59, 60-69, 70-79, 80-89, 90-100bins = [0, 60, 70, 80, 90, 100]# Create names for the four groupsgroup_names = ['F', 'D', 'C', 'B', 'A']#make new column with letter gradesdf['lettergrade'] = pd.cut(df['grade'], bins, labels=group_names)df.head()pd.value_counts(df['lettergrade']) # number of observations per letter gradedf.groupby('lettergrade')['hours'].mean() #mean of hours studied per letter grade#apply functions to datadf['grade'] = df['grade'].apply(lambda x: int(x)) #turns float type grades to int typedf.head()Add and Remove Columns
names = ['Bob','Jessica','Mary','John','Mel']grades = [76,95,77,78,99]bsdegress = [1,1,0,0,1]msdegrees = [2,1,0,0,0]phddegrees = [0,1,0,0,0]GradeList = list(zip(names,grades,bsdegress,msdegrees,phddegrees))df = pd.DataFrame(data = GradeList, columns=['Names','Grades','BS','MS','PhD'])df#make a new column with each value = 0df['HighSchool']=0#make new column with all NaN valuesdf['Preschool'] = np.nan#manually add data for new columnd = ([0,1,0,1,0])s = pd.Series(d, index= df.index)df['DriversLicense'] = sdf#drop a columndf.drop('PhD', axis=1)#drop completely empty columns (all NaN/null values)df.dropna(axis=1, how='all')Make new columns
#using functionsfilename = gradedata.csvdf = pd.read_csv(filename)def singlename(fn, ln): return fn + " " + lndf['fullname'] = singlename(df['fname'], df['lname'])df.head()#create column based on binary conditionaldf['isFailing'] = np.where(df['grade']<70, 'yes', 'no')df.tail()#make numeric value column based on string value column#create a function that will return 1 if female and 0 if maledef score_to_numeric(x): if x=='female': return 1 if x=='male': return 0 df['gender_val'] = df['gender'].apply(score_to_numeric)df.tail()Sort Values
#still using gradedata.csv#df.head() if neededdf.sort_values('hours') #default is ascending=True#does not change structure of dfCalculate and Remove Outliers
#still using gradedata.csv#check df.head() if needed#Standard Deviation Methodmeangrade = df['grade'].mean()stdgrade = df['grade'].std()toprange = meangrade + stdgrade * 1.96botrange = meangrade - stdgrade * 1.96newdf = df.copy() #to not mess up the original dfnewdf = newdf.drop(newdf[newdf['grade'] > toprange].index)newdf = newdf.drop(newdf[newdf['grade'] < botrange].index)newdf.head()#Interquartile Range Methodq1 = df['grade'].quantile(.25)q3 = df['grade'].quantile(.75)iqr = q3-q1toprange = q3 + iqr * 1.5botrange = q1 - iqr * 1.5newdf = df.copy()newdf = newdf.drop(newdf[newdf['grade'] > toprange].index)newdf = newdf.drop(newdf[newdf['grade'] < botrange].index)newdf.head()Appendix
This repo is mounted by: Python