ELI5 and GPT-2

import sys

# Display the interpreter's version number (first whitespace-separated token
# of sys.version).
sys.version.split()[0]

0.2s

Python

'3.6.8'

pip install xgboost scikit-learn eli5 numpy pytorch-transformers

108.3s

Bash in Python

import csv
from random import shuffle

import numpy as np

from sklearn.linear_model import LogisticRegressionCV, RidgeClassifierCV
from xgboost import XGBClassifier

from sklearn.metrics import classification_report

1.7s

Python

import torch
from pytorch_transformers import GPT2Tokenizer, GPT2Model
# Load the pretrained 'gpt2' tokenizer and base model by name. Both are kept as
# module-level globals and are read by V.transform further down.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')

28.4s

Python

import eli5
from eli5.lime import TextExplainer

0.6s

Python

bset_automl_2.csv

0.9s

Python

# Build the text/label arrays used for training and evaluation.
#
# Rows are read from a two-column CSV (text, flag), split by whether the flag
# column equals the string 'True', capped at `rowcutoff` rows in total,
# shuffled, and turned into numpy arrays `x` (texts) and `y` (labels).
positives = []
negatives = []
rowcutoff = 2000

# The file name must be a string literal (the bare name raised NameError).
# newline='' is what the csv module documents for files passed to csv.reader,
# so quoted fields containing line breaks are parsed correctly.
with open('bset_automl_2.csv', newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    # skipping header row (the default avoids StopIteration on an empty file)
    next(csv_reader, None)
    for line in csv_reader:
        # Blank or truncated rows have no flag column to inspect.
        if len(line) < 2:
            continue
        if line[1] == 'True':
            positives.append(line)
        else:
            negatives.append(line)

# even numbers of positives and negatives
# if we don't have enough for 50% positive, negatives will fill to rowcutoff
datarows = positives[:rowcutoff // 2]
datarows += negatives[:rowcutoff - len(datarows)]

shuffle(datarows)

# The first 85% of the shuffled rows are used for training, the rest held out.
testcutoff = int(0.85 * len(datarows))

x = np.array([line[0] for line in datarows])
# Rows flagged 'True' get label 0, everything else label 1.
y = np.array([0 if line[1] == 'True' else 1 for line in datarows])

0.9s

Python

0.4s

Python

from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import BaseEstimator, VectorizerMixin

# Final estimator of the pipeline assembled at the end of this cell.
classifier = LogisticRegressionCV()

# NOTE(review): known_vectors is never read or written in the visible part of
# this notebook - looks like a leftover cache; confirm before removing.
known_vectors = {}
# Debug switch: when True, V.transform prints every text it is asked to embed.
printStuff = False

class V(VectorizerMixin):
    """Turn raw texts into fixed-length vectors by averaging GPT-2 outputs.

    Relies on the module-level ``tokenizer`` and ``model`` objects and on the
    module-level ``printStuff`` debug flag.
    """

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is nothing to learn here."""
        return self

    def transform(self, X):
        """Embed every text in ``X``.

        Each text is tokenized, passed through the model, and the per-token
        output vectors are averaged into a single vector.

        Args:
            X: Iterable of strings.

        Returns:
            A float64 ``np.ndarray`` with one row per input text.
        """
        features = []
        # Inference only: without no_grad() every forward pass would also
        # record an autograd graph that is never used.
        with torch.no_grad():
            for row in X:
                if printStuff:
                    print(row)
                input_ids = torch.tensor([tokenizer.encode(row)])
                # First model output, first (and only) batch entry: one vector
                # per token.
                token_vectors = model(input_ids)[0][0]
                # Mean over the token axis, computed in double precision to
                # match the float64 accumulation of the previous Python loop.
                features.append(token_vectors.double().mean(dim=0).tolist())
        return np.array(features)

vectorizer = V()

# Chain the embedding step and the classifier so the pipeline accepts raw
# strings directly (pipe.fit / pipe.predict_proba are called on text below).
pipe = make_pipeline(vectorizer, classifier)

0.4s

Python

# Keep V.transform from printing every training text.
printStuff = False
# Train on the first `testcutoff` rows; the remaining rows are held out.
pipe.fit(x[:testcutoff], y[:testcutoff])

337.3s

Python

Pipeline(memo... verbose=0))])

# Class probabilities for the held-out rows (everything after `testcutoff`).
y_predicted = pipe.predict_proba(x[testcutoff:])

# classification_report expects hard labels, not probabilities, so pick the
# most probable class for each row before scoring.
y_predicted_labels = pipe.classes_[np.argmax(y_predicted, axis=1)]
print(classification_report(
    y[testcutoff:],
    y_predicted_labels,
    # Pin both labels so target_names still lines up if the held-out slice
    # happens to contain only one class.
    labels=[0, 1],
    target_names=['known weird', 'less weird'],
))

58.4s

Python

# eli5 text explainer with a fixed seed so repeated runs are comparable.
# NOTE(review): n_samples is presumably the number of perturbed texts generated
# per explanation - confirm against the eli5 docs.
te = TextExplainer(random_state=101, n_samples=500)

0.4s

Python

# this isn't a real Tweet. I needed content and was too lazy to open a tab
# Fit the explainer around this one text, using the pipeline's predict_proba
# as the black-box model, then render the explanation.
te.fit('Green new deal is the best bro, bring it on', pipe.predict_proba)
te.show_prediction(target_names=['known weird', 'less weird'])

677.7s

Python

# Turn on debug printing: V.transform will now print every text it embeds
# while the explainer calls pipe.predict_proba.
printStuff = True
# this is a real Tweet
te.fit('She needs a speech writer! She continually says one thing then contradicts herself.', pipe.predict_proba)
te.show_prediction(target_names=['known weird', 'less weird'])

83.0s

Python