Anon / Aug 22 2019

HTML Test

from fastai import *
from fastai.text import *
def pad_to(x:Collection[str], pad_til = 128) -> Collection[str]:
    """Return a new list with the tokens of `x`, right-padded with `PAD`.

    `PAD` is appended until the list holds at least `pad_til` items. Input
    that already contains `pad_til` or more tokens is copied as-is; nothing
    is ever truncated.
    """
    padded = list(x)
    # Grow the copy one PAD at a time until it reaches the target length.
    while len(padded) < pad_til:
        padded.append(PAD)
    return padded
# Tokenizer with the pre/post rule lists spelled out explicitly; `pad_to` is
# passed as the last post rule.
tokenizer = Tokenizer(
    SpacyTokenizer,
    'en',
    pre_rules=[fix_html, replace_rep, replace_wrep, spec_add_spaces, rm_useless_spaces],
    post_rules=[replace_all_caps, deal_caps, pad_to],
    n_cpus=1,
)

# Processors handed to the TextList: tokenize with the tokenizer above, then
# numericalize.
processor = [TokenizeProcessor(tokenizer=tokenizer), NumericalizeProcessor()]

# Build the text list from the 'comment' column of the CSV.
data = TextList.from_csv(
    '/.nextjournal/data-named/QmQhy5wz8vnWVwg6mDTW2TtRqiGL3MehRYJPmNWiP6hbbt/',
    'reddit_POL_BAL_FULL.csv',
    cols='comment',
    processor=processor,
)

# Split on the 'valid' column, take labels from column 0, and build the
# databunch; then display a sample batch.
data = (
    data
    .split_from_df(col='valid')
    .label_from_df(cols=0)
    .databunch()
)
data.show_batch()

# This is the HTML object