# Tax Docs

```python id=f2121371-fc2e-4f1e-ad07-865fb5614a47
import sys; sys.version.split()[0]
```

```bash id=c7049b4d-de9c-4ccf-adfd-3143b41ee341
apt-get update -y
apt-get install -y poppler-utils tesseract-ocr imagemagick
```

[Tax Clearance Certificates.pdf][nextjournal#file#832b2ef6-d6dc-4b7f-a625-211a5594b067]

```bash id=77bf0f39-123c-4e8f-b41e-a674414902df
pip3 install pillow pytesseract
```

```python id=beeefed8-db38-4720-845b-e181f4f3c9d1
from os import system, listdir

# -p keeps mkdir quiet when the directory survives from an earlier run.
system("mkdir -p extract")
```

```python id=5cdfe618-d14d-4fb8-87b8-7aec3fcb02e0
# Extract every image embedded in the uploaded PDF, OCR each one, and collect
# an EIN fragment, a date line and an address block per PDF page into
# `companies` (keyed by the page-number part of the extracted file name).
import shlex

try:
    from PIL import Image
except ImportError:
    import Image
import pytesseract

# system("mkdir ./extract")

months = ['January', 'February', 'March', 'April', 'May', 'June', 'July',
          'August', 'September', 'October', 'November', 'December']

filename = [reference][nextjournal#reference#6667b138-f953-4cef-87ac-b27415f25def]
print("looking up " + filename)

# Start from a clean slate; -f keeps rm quiet when no PNGs exist yet.
system("rm -f extract/*.png")
# -p puts the page number into each output file name. shlex.quote protects
# spaces and every other shell metacharacter in the path, not just spaces.
system("pdfimages -p -png " + shlex.quote(filename) + " extract/")

images = listdir("./extract")
companies = {}

for img in images:
    if not img.endswith('.png'):
        continue

    # -rotate -90
    system("convert ./extract/" + img + " -negate " + " ./extract/rot_" + img)

    # Close the image file as soon as OCR is done.
    with Image.open('./extract/rot_' + img) as myimg:
        # myimg = Image.open('./extract/' + img)
        txt = pytesseract.image_to_string(myimg)

    # The second dash-separated field of the file name identifies the record.
    corp = img.split('-')[1]
    if corp not in companies:
        companies[corp] = {
            "ein": None,
            "address": None,
            "date": None
        }

    lines = txt.split("\n")

    if '/000' in txt:
        for line in lines:
            if '/000' in line:
                # Keep the (up to) three characters in front of "/000".
                # Clamp the start so a match near column 0 cannot wrap around
                # to a negative index and yield an empty/wrong slice.
                pos = line.index('/000')
                companies[corp]["ein"] = line[max(0, pos - 3):pos]
            # NOTE(review): the original indentation was lost in the export;
            # the month scan is assumed to run for every line of a page that
            # contains "/000" -- confirm against the live notebook.
            if any(month in line for month in months):
                companies[corp]["date"] = line

    if 'NJ' in txt:
        print(img)
        for pos, line in enumerate(lines):
            if 'NJ' in line:
                # The first line mentioning "NJ" plus up to two lines above it.
                # Clamp the start so a hit in the first two lines does not
                # produce a negative (wrap-around) slice start.
                companies[corp]["address"] = lines[max(0, pos - 2):pos + 1]
                break
```

```python id=e2746a74-5229-4127-8a2c-a7f5c5588fa2
print(companies["001"])
print(companies["002"])
print("\n")
```
[nextjournal#file#832b2ef6-d6dc-4b7f-a625-211a5594b067]: [nextjournal#reference#6667b138-f953-4cef-87ac-b27415f25def]: <#nextjournal#reference#6667b138-f953-4cef-87ac-b27415f25def>
This notebook was exported from https://nextjournal.com/a/MLAWmux4umwhry2S6u3V6?change-id=CfVJmiuoSNZQsRF9J2MujK ```edn nextjournal-metadata {:article {:settings nil, :nodes {"5cdfe618-d14d-4fb8-87b8-7aec3fcb02e0" {:compute-ref #uuid "f0f2c248-2c6d-4995-be18-bdbe87c2ac93", :exec-duration 26746, :id "5cdfe618-d14d-4fb8-87b8-7aec3fcb02e0", :kind "code", :output-log-lines {:stdout 4}, :runtime [:runtime "bab9f562-df84-4d17-9df0-d2b0a87cb385"]}, "6667b138-f953-4cef-87ac-b27415f25def" {:id "6667b138-f953-4cef-87ac-b27415f25def", :kind "reference", :link [:output "832b2ef6-d6dc-4b7f-a625-211a5594b067" nil]}, "77bf0f39-123c-4e8f-b41e-a674414902df" {:compute-ref #uuid "5bf60bb5-3300-4a8d-8217-7559e6c6e8b7", :exec-duration 5120, :id "77bf0f39-123c-4e8f-b41e-a674414902df", :kind "code", :output-log-lines {:stdout 12}, :runtime [:runtime "bab9f562-df84-4d17-9df0-d2b0a87cb385"]}, "832b2ef6-d6dc-4b7f-a625-211a5594b067" {:id "832b2ef6-d6dc-4b7f-a625-211a5594b067", :kind "file"}, "bab9f562-df84-4d17-9df0-d2b0a87cb385" {:environment [:environment {:article/nextjournal.id #uuid "5b45e08b-5b96-413e-84ed-f03b5b65bd66", :change/nextjournal.id #uuid "5df5e18c-0be4-4d8d-b099-6ce55ca12cf4", :node/id "0149f12a-08de-4f3d-9fd3-4b7a665e8624"}], :id "bab9f562-df84-4d17-9df0-d2b0a87cb385", :kind "runtime", :language "python", :type :nextjournal}, "beeefed8-db38-4720-845b-e181f4f3c9d1" {:compute-ref #uuid "5138e1e9-de39-4733-a43c-b6257632c33a", :exec-duration 417, :id "beeefed8-db38-4720-845b-e181f4f3c9d1", :kind "code", :output-log-lines {:stdout 2}, :runtime [:runtime "bab9f562-df84-4d17-9df0-d2b0a87cb385"]}, "c7049b4d-de9c-4ccf-adfd-3143b41ee341" {:compute-ref #uuid "b9102306-ac71-4590-adf9-11c01d4d6e39", :exec-duration 28061, :id "c7049b4d-de9c-4ccf-adfd-3143b41ee341", :kind "code", :output-log-lines {:stdout 460}, :runtime [:runtime "bab9f562-df84-4d17-9df0-d2b0a87cb385"]}, "e2746a74-5229-4127-8a2c-a7f5c5588fa2" {:compute-ref #uuid "c416b2b1-6902-4be0-a045-34b269541a94", :exec-duration 
416, :id "e2746a74-5229-4127-8a2c-a7f5c5588fa2", :kind "code", :output-log-lines {:stdout 5}, :runtime [:runtime "bab9f562-df84-4d17-9df0-d2b0a87cb385"], :stdout-collapsed? false}, "f2121371-fc2e-4f1e-ad07-865fb5614a47" {:compute-ref #uuid "7b3d567e-fb1c-4cde-9017-0662e9a07b62", :exec-duration 306, :id "f2121371-fc2e-4f1e-ad07-865fb5614a47", :kind "code", :name "", :output-log-lines {}, :refs (), :runtime [:runtime "bab9f562-df84-4d17-9df0-d2b0a87cb385"]}}, :nextjournal/id #uuid "02d6af86-b0fd-4387-a191-cc8b599539f5", :article/change {:nextjournal/id #uuid "5e743ba3-18ab-423e-be8e-ee5fda3319b6"}}} ```