Nick Doiron / Jul 30 2019
Remix of Python by Nextjournal
ELI5 and AOC Tweets
import sys; sys.version.split()[0]
'3.6.8'
pip install xgboost scikit-learn eli5 numpy
import csv
from random import shuffle
import numpy as np

positives = []
negatives = []
rowcutoff = 5000

with open('bset_automl_2.csv') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    index = -1
    for line in csv_reader:
        # skipping header row
        index += 1
        if index > 0:
            if line[1] == 'True':
                positives.append(line)
            else:
                negatives.append(line)

# even numbers of positives and negatives
# if we don't have enough for 50% positive, negatives will fill to rowcutoff
datarows = positives[:int(rowcutoff / 2)]
datarows += negatives[:(rowcutoff - len(datarows))]
shuffle(datarows)
print(datarows[0])
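The CSV itself isn't shown in the notebook; from the loader above, each row appears to be [tweet text, 'True'/'False' flag] under a header row. A quick peek at the first two rows (a sketch, assuming that two-column layout):

import csv

# peek at the header and first data row to confirm the assumed layout:
# column 0 = tweet text, column 1 = 'True' for the known-weird tweets
with open('bset_automl_2.csv') as f:
    reader = csv.reader(f)
    header = next(reader)
    first = next(reader)
print(header)
print(first[1], first[0][:80])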
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegressionCV, RidgeClassifierCV

x = []
y = []
testcutoff = int(0.85 * len(datarows))
for line in datarows:
    x.append(line[0])
    # label 0 = 'known weird' (flagged True), label 1 = 'less weird'
    if line[1] == 'True':
        y.append(0)
    else:
        y.append(1)
x = np.array(x)
y = np.array(y)

tfid = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
x = tfid.fit_transform(x).todense()

gnb = RidgeClassifierCV()
gnb.fit(x[:testcutoff], y[:testcutoff])
y_predicted = gnb.predict(x[testcutoff:])
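One thing worth knowing before interpreting results: RidgeClassifierCV is a least-squares classifier with no predict_proba, so its confidence comes from decision_function. A quick look at the held-out margins (a sketch, not in the original notebook):

# signed distance from the decision boundary for each held-out tweet;
# negative values lean toward class 0 = 'known weird'
margins = gnb.decision_function(x[testcutoff:])
print(margins[:5])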
from sklearn.metrics import classification_report

print(classification_report(y[testcutoff:], y_predicted, target_names=['known weird', 'less weird']))
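The report above gives per-class precision and recall; for raw counts of which class gets mistaken for which, scikit-learn's confusion_matrix works on the same arrays (a sketch, not part of the original notebook):

from sklearn.metrics import confusion_matrix

# rows = true class, columns = predicted class,
# in label order [0 = 'known weird', 1 = 'less weird']
print(confusion_matrix(y[testcutoff:], y_predicted))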
import eli5

eli5.explain_prediction(gnb, datarows[51][0], vec=tfid, target_names=['known weird', 'less weird'])
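explain_prediction shows which words pushed this one tweet toward a class; to see the strongest words across the whole model, eli5 also offers explain_weights, reusing the same vectorizer and class names (a sketch under those assumptions):

# top global coefficients rather than a single tweet's contributions
eli5.explain_weights(gnb, vec=tfid, top=20,
                     target_names=['known weird', 'less weird'])

In a notebook these render as HTML tables; in a plain terminal, wrapping either call in eli5.format_as_text(...) gives printable output.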