Module 7 - Tutorial - Linear Regression
import numpy as npimport pandas as pdimport matplotlib.pyplot as pltimport sklearn #library for predictive modeling%matplotlib inlinefrom sklearn.datasets import load_boston#load_boston is a sample dataset from the scikit-learn libraryboston = load_boston()Available Datasets
load_boston([return_X_y]) Load and return the boston house-prices dataset (regression).
load_iris([return_X_y]) Load and return the iris dataset (classification).
load_diabetes([return_X_y]) Load and return the diabetes dataset (regression).
load_digits([n_class, return_X_y]) Load and return the digits dataset (classification).
load_linnerud([return_X_y]) Load and return the linnerud dataset (multivariate regression).
load_wine([return_X_y]) Load and return the wine dataset (classification).
load_breast_cancer([return_X_y]) Load and return the breast cancer wisconsin dataset (classification).
#data = dataset, target = dependent variable, feature_names = column headers, DESCR = data dictionaryboston.keys()#506 rows, 13 columnsboston.data.shape#list of column names from sample Boston datasetprint(boston.feature_names)#description of the sample Boston housing datasetprint(boston.DESCR)#make a dataframe from the sample Boston housing datasetbos = pd.DataFrame(boston.data)bos.head()#add the list of column names from the sample Boston housing dataset to the dataframebos.columns = boston.feature_namesbos.head()#verifying the first 5 rows in the dependent variable datatsetboston.target[:5]#assigning dependent variable to column named "Price"bos['PRICE'] = boston.targetbos.head()#compare number of rooms to home priceplt.scatter(bos['RM'], bos['PRICE']) plt.xlabel("Average number of rooms per dwelling (RM)") plt.ylabel("Housing Price") plt.title("Relationship between RM and Price") plt.show()from sklearn.linear_model import LinearRegression#make dataframe that only contains predictive featuresX = bos.drop('PRICE', axis = 1)#assign linear regression function to a variablelm = LinearRegression()lmImportant functions to keep in mind while fitting a linear regression model are:
lm.fit() -> fits a linear model
lm.predict() -> Predict Y using the linear model with estimated coefficients
lm.score() -> Returns the coefficient of determination (R^2). A measure of how well observed outcomes are replicated by the model, as the proportion of total variation of outcomes explained by the model.
#same as statsmodel; fit will calculate linear regression model based on values of predictive featureslm.fit(X, bos['PRICE'])#y-intercept for the linear regression formulaprint('Estimated intercept coefficient:', lm.intercept_)#number of predictive featuresprint('Number of coefficients:', len(lm.coef_))#show the slope (weight) for each predictive featurepd.DataFrame(list(zip(X.columns, lm.coef_)), columns = ['features', 'estimatedCoefficients'])#show the first 5 values that the model predictedlm.predict(X)[0:5]#score determines how accurate the model predictedlm.score(X, bos['PRICE'])#visual comparison between the true price of a house and the price the model predictedplt.scatter(bos['PRICE'], lm.predict(X))plt.xlabel("Price")plt.ylabel("Predicted Price")plt.title("Price vs Predicted Price")plt.show()Randomize train and test data
Training data should be randomized so that the model can learn from a diverse set of predictive features. This helps it to have better predictive ability when it is evaluated on a test set (data that it has not seen before).
# Imported explicitly: a bare `import sklearn` does not guarantee that the
# sklearn.model_selection submodule has been loaded.
from sklearn.model_selection import train_test_split

# test_size default = 0.25; here 33% of the rows are held out for testing
# train_test_split gives back 4 variables:
#   2 variables for X (the predictive features) - training and testing
#   2 variables for y (the target) - training and testing
# random_state is fixed so the shuffle (and therefore the split) is reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    X, bos['PRICE'], test_size=0.33, random_state=5
)
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

# calculate the model using the training data only
lm.fit(X_train, Y_train)

# predictions for training and test data
pred_train = lm.predict(X_train)
pred_test = lm.predict(X_test)

# evaluate the accuracy (R^2) of the model on training vs test data
print(lm.score(X_train, Y_train))
print(lm.score(X_test, Y_test))