Anon / Aug 22 2019
Remix of PyTorch Template by Nextjournal
HTML Test
from fastai import *
from fastai.text import *
def pad_to(x: Collection[str], pad_til: int = 128) -> Collection[str]:
    """Right-pad a token sequence with ``PAD`` until it holds ``pad_til`` items.

    Args:
        x: Tokens to pad. Any iterable is accepted; it is copied, not mutated.
        pad_til: Minimum number of items in the returned list.

    Returns:
        A new list containing the items of ``x`` followed by as many ``PAD``
        entries as are needed to reach ``pad_til``. Inputs that already hold
        ``pad_til`` or more items are returned as a plain copy: nothing is
        appended and nothing is truncated.
    """
    padded = list(x)
    missing = pad_til - len(padded)
    # PAD is only needed (and only looked up) when there is something to add.
    if missing > 0:
        padded.extend([PAD] * missing)
    return padded
# Build the tokenizer with spaCy's English model and explicit rule lists.
# pad_to (defined in this file) is the last post rule in the list.
tokenizer = Tokenizer(
    SpacyTokenizer,
    'en',
    pre_rules=[fix_html, replace_rep, replace_wrep, spec_add_spaces, rm_useless_spaces],
    post_rules=[replace_all_caps, deal_caps, pad_to],
    n_cpus=1,
)

# Processors handed to the TextList: tokenize first, then numericalize.
processor = [TokenizeProcessor(tokenizer=tokenizer), NumericalizeProcessor()]

# Load the text from the 'comment' column of the CSV using the custom processors.
data = TextList.from_csv(
    '/.nextjournal/data-named/QmQhy5wz8vnWVwg6mDTW2TtRqiGL3MehRYJPmNWiP6hbbt/',
    'reddit_POL_BAL_FULL.csv',
    cols='comment',
    processor=processor,
)

# Split using the 'valid' column, label from column 0, then build the DataBunch.
data = data.split_from_df(col='valid').label_from_df(cols=0).databunch()
data.show_batch() # This is the HTML object