Nick Doiron / Jul 30 2019
Remix of Python by Nextjournal

ELI5 and AOC Tweets

import sys; sys.version.split()[0]
'3.6.8'
pip install xgboost scikit-learn eli5 numpy

bset_automl_2.csv
import csv
from random import shuffle

import numpy as np

positives = []
negatives = []
rowcutoff = 5000

with open(bset_automl_2.csv) as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    index = -1
    for line in csv_reader:
        # skipping header row
        index += 1
        if index > 0:
            if line[1] == 'True':
                positives.append(line)
            else:
                negatives.append(line)

# even numbers of positives and negatives
# if we don't have enough for 50% positive, negatives will fill to rowcutoff
datarows = positives[:int(rowcutoff / 2)]
datarows += negatives[:(rowcutoff - len(datarows))]
shuffle(datarows)
print(datarows[0])
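
An optional sanity check (not in the original cell) that the 50/50 balancing behaved as intended:

print(len(positives), 'positives,', len(negatives), 'negatives in the file')
print(len(datarows), 'rows kept for modeling')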
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import RidgeClassifierCV

x = []
y = []
# hold out the last 15% of the shuffled rows for testing
testcutoff = int(0.85 * len(datarows))

# label 0 = 'known weird' (rows marked True), label 1 = 'less weird'
for line in datarows:
    x.append(line[0])
    if line[1] == 'True':
        y.append(0)
    else:
        y.append(1)

x = np.array(x)
y = np.array(y)

tfid = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
# note: the vectorizer is fit on all rows, train and test alike, before the split below
x = tfid.fit_transform(x).todense()
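
An optional peek at what the vectorizer kept (a sketch; get_feature_names is the scikit-learn call of this vintage):

print(len(tfid.get_feature_names()), 'terms survived the max_df and stop-word filters')
print(x.shape)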

# RidgeClassifierCV: a ridge-regression classifier that cross-validates its own regularization strength
clf = RidgeClassifierCV()
clf.fit(x[:testcutoff], y[:testcutoff])

y_predicted = clf.predict(x[testcutoff:])
from sklearn.metrics import classification_report
print(classification_report(y[testcutoff:], y_predicted, target_names=['known weird', 'less weird']))
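
The per-class report can be paired with a confusion matrix (an optional sketch) to see where the errors land:

from sklearn.metrics import confusion_matrix
# rows are true labels, columns are predictions, in [known weird, less weird] order
print(confusion_matrix(y[testcutoff:], y_predicted))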
import eli5

eli5.explain_prediction(clf, datarows[51][0], vec=tfid, target_names=['known weird', 'less weird'])
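
Beyond a single tweet, eli5 can also summarize the model globally; a minimal sketch with the same estimator and vectorizer (top caps how many terms are shown per class):

eli5.explain_weights(clf, vec=tfid, top=20, target_names=['known weird', 'less weird'])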