Nick Doiron / Aug 12 2019
Remix of Python by Nextjournal

ELI5 and GPT-2

# Notebook cell: evaluates to the first whitespace-separated field of
# sys.version (the interpreter version number). The value is neither stored
# nor printed, so it is only visible when run as a notebook cell.
import sys; sys.version.split()[0]

pip install xgboost scikit-learn eli5 numpy pytorch-transformers
import csv
from random import shuffle

import numpy as np

from sklearn.linear_model import LogisticRegressionCV, RidgeClassifierCV
from xgboost import XGBClassifier

from sklearn.metrics import classification_report
import torch
from pytorch_transformers import GPT2Tokenizer, GPT2Model
# Load the pretrained 'gpt2' tokenizer and model once, at module level; both
# are read as globals by V.transform further down.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')
import eli5
from eli5.lime import TextExplainer
# Rows whose second column is the string 'True' go to `positives`; every other
# row goes to `negatives`.
positives = []
negatives = []
# Upper bound on the number of rows kept for training and testing combined.
rowcutoff = 2000

with open(
    # NOTE(review): the file argument is missing in the source (most likely
    # lost when the notebook was exported); supply the CSV path here.
) as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    # Skip the header row; the default avoids StopIteration on an empty file.
    next(csv_reader, None)
    for line in csv_reader:
        if line[1] == 'True':
            positives.append(line)
        else:
            negatives.append(line)

# Aim for even numbers of positives and negatives. If there are not enough
# positives for 50%, negatives fill the remainder up to rowcutoff.
datarows = positives[:rowcutoff // 2]
datarows += negatives[:rowcutoff - len(datarows)]
shuffle(datarows)

# Index separating the training slice (first 85%) from the held-out slice.
testcutoff = int(0.85 * len(datarows))

# x holds the text from column 0; y holds the label, where 'True' rows become
# class 0 and all other rows become class 1.
x = np.array([line[0] for line in datarows])
y = np.array([0 if line[1] == 'True' else 1 for line in datarows])
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import BaseEstimator, VectorizerMixin

# Classifier placed after the vectorizer in the pipeline built below.
classifier = LogisticRegressionCV()

# NOTE(review): never read or written in the visible code — presumably meant
# as a cache of already-computed vectors; confirm or remove.
known_vectors = {}
# Debug flag checked inside V.transform and toggled later in the script.
printStuff = False

class V(VectorizerMixin):
    """Vectorizer that represents a text as the mean of its GPT-2 token vectors.

    Relies on the module-level ``tokenizer`` and ``model`` objects and on the
    module-level ``printStuff`` debug flag. It keeps no fitted state.
    """

    def fit(self, X, y=None):
        """Do nothing and return ``self``; present so the object fits a pipeline.

        Args:
            X: Ignored.
            y: Ignored.

        Returns:
            This instance.
        """
        return self

    def transform(self, X):
        """Convert each text in ``X`` into one averaged GPT-2 vector.

        Args:
            X: Iterable of strings.

        Returns:
            ``np.ndarray`` with one row per input text; each row is the
            element-wise mean of the model's per-token output vectors.
        """
        xout = []
        for row in X:
            if printStuff:
                # NOTE(review): the body of this branch was missing from the
                # source; printing the row is the assumed debug action.
                print(row)
            input_ids = torch.tensor([tokenizer.encode(row)])
            # Inference only, so skip autograd bookkeeping.
            with torch.no_grad():
                # First model output, first (only) batch entry; assumed to be
                # one vector per token, as the original nested loop implied.
                words = model(input_ids)[0][0]
            # Average over tokens. Cast to float64 first so the result matches
            # the original accumulation in Python floats.
            average_word_vector = words.double().mean(dim=0).tolist()
            # Without this append the method could only return an empty array.
            xout.append(average_word_vector)
        return np.array(xout)

vectorizer = V()

# Chain the vectorizer and the classifier so the pipeline accepts raw strings
# and exposes predict_proba, which is what TextExplainer is given below.
pipe = make_pipeline(vectorizer, classifier)

# Train on the rows before testcutoff with debug printing switched off.
# NOTE(review): the fit call was truncated in the source to
# "[:testcutoff], y[:testcutoff])"; it is reconstructed here.
printStuff = False
pipe.fit(x[:testcutoff], y[:testcutoff])
# (notebook cell output, kept for reference) Pipeline(memo... verbose=0))])

# Class probabilities for the held-out rows after testcutoff.
y_predicted = pipe.predict_proba(x[testcutoff:])
# NOTE(review): the report below is disabled; y_predicted holds probabilities,
# while classification_report expects predicted labels.
# print(classification_report(y[testcutoff:], y_predicted, target_names=['known weird', 'less weird']))

te = TextExplainer(random_state=101, n_samples=500)
# this isn't a real Tweet. I needed content and was too lazy to open a tab
te.fit('Green new deal is the best bro, bring it on', pipe.predict_proba)
te.show_prediction(target_names=['known weird', 'less weird'])

# Switch on the debug flag read by V.transform before the second explanation.
printStuff = True
# this is a real Tweet
te.fit('She needs a speech writer! She continually says one thing then contradicts herself.', pipe.predict_proba)
te.show_prediction(target_names=['known weird', 'less weird'])