Dainius Jocas / Aug 10 2022 / Published
Text analysis
;; Dependency map for the notebook runtime (deps.edn format).
;; Kept one entry per line: `;;` comments run to end of line, so placing them
;; inline on a single-line map would comment out every entry that follows.
{:deps
 {org.clojure/clojure {:mvn/version "1.10.3"}
  ;; compliment is used for autocompletion
  ;; add your libs here (and restart the runtime to pick up changes)
  compliment/compliment {:mvn/version "0.3.9"}
  lt.jocas/lucene-custom-analyzer {:mvn/version "1.0.21"}
  lt.jocas/lucene-text-analysis {:mvn/version "1.0.17"}
  org.clojure/tools.deps.alpha
  {:git/url "https://github.com/clojure/tools.deps.alpha.git"
   :sha "d0b33e0d346736aa985c150145b332f97b92135e"}}}
;; Extensible Data Notation
# Install Graphviz, which provides the `dot` binary that draw-text shells out to.
apt-get update
# -y answers the confirmation prompt automatically; the cell has no interactive stdin.
apt-get install -y graphviz
# 12.1s -- presumably the notebook cell's execution time
;; Setup notebook helpers
;; `require` is a function, so each libspec must be quoted; otherwise the
;; vector is evaluated and the namespace symbols fail to resolve.
(require '[lucene.custom.text-analysis :as analysis])
(require '[lucene.custom.analyzer :as custom-analyzer])
(require '[clojure.java.io :as io])
(require '[clojure.java.shell :as sh])

(defn draw-text
  "Analyzes the text, converts the token stream into a dot program, and
  shells out to graphviz's `dot` to render it as a PNG.

  Arguments:
    text          - the string to analyze
    analyzer-conf - configuration passed to `custom-analyzer/create`

  The image is written to results/text-graph.png (the results/ directory is
  created when missing). Image will be previewed below.

  Returns nil. Throws ex-info carrying :exit and :err when `dot` exits with a
  non-zero status."
  [text analyzer-conf]
  (let [analyzer    (custom-analyzer/create analyzer-conf)
        dot-program (analysis/text->graph text analyzer)
        out-file    "results/text-graph.png"]
    ;; `dot -o` does not create missing directories, so make sure results/ exists.
    (io/make-parents out-file)
    (let [{:keys [exit err]} (sh/sh "dot" "-Tpng" "-o" out-file :in dot-program)]
      ;; Surface rendering failures instead of silently leaving a stale or missing image.
      (when-not (zero? exit)
        (throw (ex-info (str "graphviz `dot` failed with exit code " exit)
                        {:exit exit :err err})))))
  nil)
;; 0.5s -- presumably the notebook cell's execution time
;; Example: render the token graph for "foo bar baz" using a pattern-replace
;; char filter, the standard tokenizer, and a chain of four token filters.
(draw-text
  "foo bar baz"
  {:tokenizer     :standard
   :char-filters  [{:patternReplace {:pattern     "foo"
                                     :replacement "aaa"}}]
   :token-filters [:uppercase
                   :reverseString
                   {:edgeNGram {:minGramSize      1
                                :maxGramSize      5
                                :preserveOriginal true}}
                   {:shingle {:minShingleSize 2
                              :maxShingleSize 4
                              :tokenSeparator "_"}}]})
;; 0.2s -- presumably the notebook cell's execution time