Avi Drucker / May 27 2024
Module 8 - Tutorial - Logistic Regression
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
location = 'titanic.xls'
df = pd.read_excel(location)
df.head()
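Before plotting, it can also help to look at the column types and basic summary statistics. This is an optional check, not part of the original notebook:
#column dtypes and non-null counts
df.info()
#summary statistics for the numeric columns
df.describe()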
Explore the data through visualizations
#bar chart of survival status count
sns.countplot(x='survived', data=df)
#survival status by sex
sns.barplot(x='sex', y='survived', data=df)
#survival rate by passenger class
sns.barplot(x='pclass', y='survived', data=df)
#plot age by survival status
plt.figure(figsize=(10,6))
ax = sns.kdeplot(df['age'][df['survived'] == 1], #passengers that survived
                 color="darkturquoise",
                 fill=True,
                 label='Survived')
sns.kdeplot(df['age'][df['survived'] == 0], #passengers that did not survive
            color="lightcoral",
            fill=True,
            label='Died')
plt.legend()
plt.title("Density Plot of Age for Survived vs Deceased Population")
ax.set(xlabel='Age')
#plt.show()
Handle missing values
#find columns that have missing values
df.isnull().sum()
Let's clean up 'age' and 'embarked'
#rows where the age is missing
missing_age = df.loc[df['age'].isnull()]
missing_age.head()
#get index numbers of missing rows - we'll use this later
mals = list(missing_age.index)
#table of avg age of passenger by survival status, sex, and passenger class
df.groupby(['survived', 'sex', 'pclass'])['age'].mean()
#fill missing age values with the average age for that survival status, sex, and passenger class
df['age'] = df['age'].fillna(df.groupby(['survived', 'sex', 'pclass'])['age'].transform('mean'))
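The transform('mean') call returns a Series aligned to the original rows (one group mean per passenger), which is why it can be passed straight to fillna. An optional check, not in the original:
#group means broadcast back to every row - same length and index as df
df.groupby(['survived', 'sex', 'pclass'])['age'].transform('mean').head()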
#verify filled missing values
df.iloc[mals].head()
#verify there are no more missing age values
df.isnull().sum()
#missing values for 'embarked'
embark = df.loc[df['embarked'].isnull()]
embark
#save index for missing values to verify later
embarkls = list(embark.index)
#only 2 missing values so we'll fill with most common embarkation point
df['embarked'].value_counts()
#fill missing values with the most common port
df['embarked'] = df['embarked'].fillna('S')
#check that they're filled
df.iloc[embarkls]
df.isnull().sum()
Get rid of columns that we don't want to use in the model
modeldf = df.drop(['name','ticket','fare', 'cabin', 'boat', 'body', 'home.dest'], axis=1)
#columns left in our dataframe
modeldf.columns
Create dummy variables for categorical values
#dummy variables for passenger class and embarkation port
#with the columns argument, get_dummies automatically drops the original columns it created dummies from
modeldf = pd.get_dummies(data=modeldf, columns=['pclass','embarked'])
modeldf.head()
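An optional variation, not used in this tutorial: passing drop_first=True to get_dummies keeps one fewer dummy per column, which avoids perfectly redundant predictors. For comparison only (run on df so modeldf is untouched):
#same call with drop_first=True - one dummy level per column is dropped
pd.get_dummies(data=df[['pclass', 'embarked']], columns=['pclass', 'embarked'], drop_first=True).head()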
#change sex values to binary
#female=0, male=1
modeldf['sex'] = modeldf['sex'].map({'female':0, 'male':1})
modeldf.head()
#create new column based on number of family members
#drop sibsp and parch columns
modeldf['family_num'] = modeldf['sibsp'] + modeldf['parch']
modeldf.drop(['sibsp', 'parch'], axis=1, inplace=True)
modeldf.head()
#flag passengers with no family members aboard as traveling alone
modeldf['TravelAlone'] = np.where(modeldf['family_num'] > 0, 0, 1)
modeldf.head()
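As a quick sanity check (not in the original), we can see how many passengers the new flag marks as traveling alone:
#count of passengers traveling alone (1) vs with family (0)
modeldf['TravelAlone'].value_counts()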
Logistic Regression
Split data into train and test
#extract target variable
#make copy of 'survived' column
y = modeldf['survived']
#copy of modeldf without 'survived' column
X = modeldf.drop(['survived'], axis=1)
#80% for training data, 20% for test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=15)
#build logistic regression model
LogReg = LogisticRegression()
LogReg.fit(X_train, y_train)
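Depending on the scikit-learn version, the default lbfgs solver may warn that it did not converge within its default 100 iterations on this data. If that happens, a common fix (not part of the original tutorial) is to allow more iterations:
#if a ConvergenceWarning appears, give the solver more iterations
LogReg = LogisticRegression(max_iter=1000)
LogReg.fit(X_train, y_train)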
#accuracy score of model using training data
LogReg.score(X_train, y_train)
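For context (an optional check, not in the original), compare this against the accuracy of always predicting the majority class:
#baseline: accuracy of always predicting the most common class in the training data
y_train.value_counts(normalize=True).max()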
#generate prediction values
y_pred = LogReg.predict(X_test)
#confusion matrix shows which values the model predicted correctly vs incorrectly
cm = pd.DataFrame(
confusion_matrix(y_test, y_pred),
columns=['Predicted Not Survival', 'Predicted Survival'],
index=['True Not Survival', 'True Survival']
)
cm
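The same matrix can be shown as a heatmap using the seaborn import we already have (optional sketch):
#heatmap of the confusion matrix
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.ylabel('Actual')
plt.xlabel('Predicted')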
#accuracy score of model on test data
LogReg.score(X_test, y_test)
#from the precision column, the model is better at predicting passengers who did not survive
print(classification_report(y_test, y_pred))
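To connect the report back to the confusion matrix (an optional check, not in the original), precision and recall for the 'survived' class can be computed directly from its cells:
#precision and recall for the positive (survived) class, from the confusion matrix
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
precision = tp / (tp + fp)
recall = tp / (tp + fn)
precision, recall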
Sources:
https://mashimo.wordpress.com/2018/03/31/logistic-regression-using-sklearn/
https://www.kaggle.com/mnassrib/titanic-logistic-regression-with-python/notebook
https://datascienceplus.com/would-you-survive-the-titanic-getting-started-in-python/
https://towardsdatascience.com/predicting-the-survival-of-titanic-passengers-30870ccc7e8