Dainius Jocas / Aug 10 2022 / Published
Text analysis
;; Dependency map for the notebook runtime.
{:deps {org.clojure/clojure {:mvn/version "1.10.3"}
;; compliment is used for autocompletion
;; add your libs here (and restart the runtime to pick up changes)
compliment/compliment {:mvn/version "0.3.9"}
;; Lucene helper libraries; presumably these provide the lucene.custom.*
;; namespaces required by the notebook code (TODO confirm).
lt.jocas/lucene-custom-analyzer {:mvn/version "1.0.21"}
lt.jocas/lucene-text-analysis {:mvn/version "1.0.17"}
;; tools.deps.alpha is fetched from git and pinned to an exact commit via :sha.
org.clojure/tools.deps.alpha {:git/url "https://github.com/clojure/tools.deps.alpha.git"
:sha "d0b33e0d346736aa985c150145b332f97b92135e"}}}
Extensible Data Notation
# Refresh the package index, then install Graphviz (the notebook shells out to its `dot` executable).
apt-get update
# -y answers the confirmation prompt automatically so the install cannot stall in a non-interactive cell.
apt-get install -y graphviz
12.1s
; Setup notebook helpers
; Libspecs must be quoted: `require` is a function, so an unquoted vector would
; be evaluated and its namespace symbols resolved (and fail) before the call.
(require '[lucene.custom.text-analysis :as analysis])
(require '[lucene.custom.analyzer :as custom-analyzer])
(require '[clojure.java.io :as io])
(require '[clojure.java.shell :as sh])
(defn draw-text
  "Analyzes the text, converts the token stream into a dot program and
  shells out to Graphviz to render it as a PNG image.

  Arguments:
    text          - the string to analyze.
    analyzer-conf - analyzer configuration, passed as-is to
                    `custom-analyzer/create`.
    output-path   - optional destination of the PNG; defaults to
                    \"results/text-graph.png\". Missing parent directories
                    are created.

  With the default path the image is stored in the results/ directory and
  will be previewed below the cell.

  Returns nil. Throws `ex-info` (with :exit, :err and :output-path in the
  data map) when `dot` exits with a non-zero status."
  ([text analyzer-conf]
   (draw-text text analyzer-conf "results/text-graph.png"))
  ([text analyzer-conf output-path]
   (let [target (str output-path)
         analyzer (custom-analyzer/create analyzer-conf)
         dot-program (analysis/text->graph text analyzer)]
     ;; dot does not create missing directories for its -o target.
     (io/make-parents target)
     ;; The dot program is fed to Graphviz on stdin via :in.
     (let [{:keys [exit err]} (sh/sh "dot" "-Tpng" "-o" target :in dot-program)]
       ;; Surface rendering failures instead of silently leaving a stale or
       ;; missing image behind.
       (when-not (zero? exit)
         (throw (ex-info "Graphviz `dot` failed to render the token graph"
                         {:exit exit :err err :output-path target}))))
     ;; Return nil explicitly so the shell result map is not the cell value.
     nil)))
0.5s
;; Example: draw the token graph for a short text using an analyzer that
;; combines one char filter, the standard tokenizer and four token filters.
(let [char-filters  [{:patternReplace {:pattern "foo"
                                       :replacement "aaa"}}]
      ;; Filters are listed in the same order as in the configuration vector.
      token-filters [:uppercase
                     :reverseString
                     {:edgeNGram {:minGramSize 1 :maxGramSize 5 :preserveOriginal true}}
                     {:shingle {:minShingleSize 2 :maxShingleSize 4 :tokenSeparator "_"}}]
      analyzer-conf {:tokenizer :standard
                     :char-filters char-filters
                     :token-filters token-filters}]
  (draw-text "foo bar baz" analyzer-conf))
0.2s