Module 9 - Tutorial - Decision Trees

Back to the course outline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
location = 'titanic.xls'
df = pd.read_excel(location)
df.head()

Clean the data

#find columns that have missing values
df.isnull().sum()
#fill missing ages with the mean age for each survival status, sex, and passenger class group
#groupby().transform('mean') returns a Series of group means aligned to the original rows
df['age'] = df['age'].fillna(df.groupby(['survived', 'sex', 'pclass'])['age'].transform('mean'))
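
If the transform pattern above is unfamiliar, here is a tiny hypothetical example (made-up data, not from the Titanic file) of what it does:

#hypothetical two-group example of the groupby/transform fill used above
toy = pd.DataFrame({'grp': ['a', 'a', 'b'], 'val': [1.0, np.nan, 3.0]})
toy['val'] = toy['val'].fillna(toy.groupby('grp')['val'].transform('mean'))
#the missing value in group 'a' is replaced by that group's mean, 1.0
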
#only 2 missing values so we'll fill with most common embarkation point
df['embarked'].value_counts()
#fill the missing embarkation values with 'S' (Southampton), the most common port
df['embarked'] = df['embarked'].fillna('S')
df.isnull().sum()
#drop columns that will not be used as model features
modeldf = df.drop(['name','ticket','fare', 'cabin', 'boat', 'body', 'home.dest'], axis=1)
#columns left in our dataframe
modeldf.columns

Create dummy variables for categorical values

#dummy variables for passenger class and embarkation port
#get_dummies automatically drops the original columns it encodes
modeldf = pd.get_dummies(data=modeldf, columns=['pclass','embarked'])
#modeldf.head()
#change sex values to binary
#female=0, male=1
modeldf['sex'] = modeldf['sex'].map({'female':0, 'male':1})
#modeldf.head()
#create new column based on number of family members
#drop sibsp and parch columns
modeldf['family_num'] = modeldf['sibsp'] + modeldf['parch']
modeldf.drop(['sibsp', 'parch'], axis=1, inplace=True)
#modeldf.head()
#flag passengers travelling alone: 1 if no family members aboard, 0 otherwise
modeldf['TravelAlone'] = np.where(modeldf['family_num'] > 0, 0, 1)
#modeldf.head()
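
Before building the model, one optional sanity check (not part of the original walkthrough) is to confirm that every remaining column is numeric or a 0/1 indicator:

#every remaining column should now be numeric or boolean
modeldf.dtypes
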

Build a Decision Tree

#extract target variable
#make copy of 'survived' column
y = modeldf['survived']
#copy of modeldf without 'survived' column
X = modeldf.drop(['survived'], axis=1)
#80% for training data, 20% for test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=15)
#assign a decision tree classifier to the model variable
#(avoid calling the variable 'tree', which would shadow the sklearn tree module imported above)
model = tree.DecisionTreeClassifier()
#fit the model on the training data
#passing arguments when creating the classifier can help prevent overfitting (see the sketch after the classification report below)
model.fit(X_train, y_train)
#accuracy score of the model on the training data
model.score(X_train, y_train)
#run the predictions on the test data
y_predict = model.predict(X_test)
#accuracy score of the model on the test data
model.score(X_test, y_test)
#confusion matrix: compare actual outcomes with the model's predictions
pd.DataFrame(
    confusion_matrix(y_test, y_predict),
    columns=['Predicted Not Survival', 'Predicted Survival'],
    index=['True Not Survival', 'True Survival']
)
#the precision column shows the model is better at predicting passengers who did not survive
print(classification_report(y_test, y_predict))
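
The comment before fitting noted that passing arguments to the classifier can help prevent overfitting. Below is a minimal sketch of that idea; the max_depth and min_samples_leaf values are illustrative assumptions, not tuned for this data, and the plot simply reuses the tree module and matplotlib imported at the top.

#a constrained tree; the depth and leaf-size limits here are illustrative, not tuned
pruned = tree.DecisionTreeClassifier(max_depth=4, min_samples_leaf=10, random_state=15)
pruned.fit(X_train, y_train)
#training accuracy usually drops, but the gap between training and test accuracy should shrink
print(pruned.score(X_train, y_train))
print(pruned.score(X_test, y_test))
#the smaller tree is also easier to visualize
plt.figure(figsize=(16, 8))
tree.plot_tree(pruned, feature_names=list(X.columns), class_names=['not survived', 'survived'], filled=True)
plt.show()
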

Back to the course outline
