Nick Doiron / Aug 12 2019
Remix of Python by Nextjournal
ELI5 and GPT-2
# Notebook cell: show the running interpreter's version number, i.e. the
# first whitespace-separated token of sys.version.
import sys

sys.version.split()[0]
'3.6.8'
pip install xgboost scikit-learn eli5 numpy pytorch-transformers
# Standard library.
import csv
from random import shuffle

# Numerics and the classical classifiers / metrics.
import numpy as np
from sklearn.linear_model import LogisticRegressionCV, RidgeClassifierCV
from sklearn.metrics import classification_report
from xgboost import XGBClassifier

# GPT-2 tokenizer and model, loaded from the pretrained 'gpt2' checkpoint.
import torch
from pytorch_transformers import GPT2Model, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')

# ELI5 explanation tooling (LIME text explainer).
import eli5
from eli5.lime import TextExplainer
0.9s
Python
# Load the labelled rows from the CSV, build a shuffled sample of at most
# `rowcutoff` rows that is as close to half positive as the data allows, and
# split it into parallel text (`x`) and label (`y`) arrays.
positives = []
negatives = []
rowcutoff = 2000

# The file name has to be a string literal: written bare, Python evaluates it
# as attribute access on an undefined name and raises NameError before any
# file is opened. newline='' is what the csv module asks for so that quoted
# fields containing line breaks are parsed correctly.
with open('bset_automl_2.csv', newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    next(csv_reader, None)  # skipping header row
    for line in csv_reader:
        if not line:
            # csv.reader yields [] for blank lines; they carry no label.
            continue
        if line[1] == 'True':
            positives.append(line)
        else:
            negatives.append(line)

# even numbers of positives and negatives
# if we don't have enough for 50% positive, negatives will fill to rowcutoff
datarows = positives[:rowcutoff // 2]
datarows += negatives[:rowcutoff - len(datarows)]
shuffle(datarows)

# The first 85% of the shuffled rows are used for training, the rest for
# evaluation.
testcutoff = int(0.85 * len(datarows))

# Column 0 is the text; column 1 is the flag. Rows flagged 'True' get label 0,
# every other row gets label 1.
x = np.array([row[0] for row in datarows])
y = np.array([0 if row[1] == 'True' else 1 for row in datarows])
0.4s
Python
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import BaseEstimator, VectorizerMixin

classifier = LogisticRegressionCV()
known_vectors = {}
# Module-level debug switch: when True, transform() echoes every text it embeds.
printStuff = False


class V(VectorizerMixin):
    """Turn raw text into one fixed-length vector per row using GPT-2.

    Each row is tokenized with the module-level ``tokenizer``, run through the
    module-level ``model``, and the resulting per-token vectors are averaged
    into a single vector for the row.
    """

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is nothing to learn here."""
        return self

    def transform(self, X):
        """Return a NumPy array holding one averaged vector per row of ``X``.

        ``X`` is an iterable of strings. An empty ``X`` yields an empty array.
        """
        xout = []
        # Only forward passes are needed, so skip autograd bookkeeping.
        with torch.no_grad():
            for row in X:
                if printStuff:
                    print(row)
                input_ids = torch.tensor([tokenizer.encode(row)])
                # First model output, first (only) sequence in the batch.
                # Presumably the final hidden states, shaped
                # (tokens, hidden size) -- confirm against the GPT2Model docs.
                words = model(input_ids)[0][0]
                # Average over the token axis in float64, replacing the
                # element-by-element Python accumulation with a single
                # vectorized reduction.
                xout.append(words.double().mean(dim=0).tolist())
        return np.array(xout)


vectorizer = V()
pipe = make_pipeline(vectorizer, classifier)
# Keep the per-row echo switched off while training, then fit the pipeline
# on the rows that come before the train/test cutoff.
printStuff = False
pipe.fit(x[:testcutoff], y[:testcutoff])
Pipeline(memo... verbose=0))])
# Class probabilities for the held-out rows (everything from the cutoff on).
y_predicted = pipe.predict_proba(x[testcutoff:])
# Disabled report, kept for reference. Note that y_predicted holds
# probabilities rather than predicted class labels.
# print(classification_report(y[testcutoff:], y_predicted, target_names=['known weird', 'less weird']))
# LIME-based explainer that treats pipe.predict_proba as a black box.
te = TextExplainer(random_state=101, n_samples=500)

# this isn't a real Tweet. I needed content and was too lazy to open a tab
te.fit(
    'Green new deal is the best bro, bring it on',
    pipe.predict_proba,
)
te.show_prediction(target_names=['known weird', 'less weird'])

# Turn the per-row echo on so the texts passed to the vectorizer are printed.
printStuff = True

# this is a real Tweet
te.fit(
    'She needs a speech writer! She continually says one thing then contradicts herself.',
    pipe.predict_proba,
)
te.show_prediction(target_names=['known weird', 'less weird'])