Phil Cooper / Apr 22 2023 / Published

Aggregation

{:deps {org.clojure/clojure   {:mvn/version "1.10.3"}
        ;; compliment is used for autocompletion
        ;; add your libs here (and restart the runtime to pick up changes)
        compliment/compliment {:mvn/version "0.3.9"}
        redux/redux           {:mvn/version "0.1.4"}
        org.clojure/data.json {:mvn/version "2.2.0"}}}

(require '[clojure.pprint :refer [pprint]]         '[clojure.data.json :as json])

Multi-index by sorted vectors

A helper to set the index. There is just one level of indexing, with a vector of the index values as the key.

TODO: does not verify_integrity i.e. does not enforce uniqueness

(def data
  [{:a 1 :b 3 :i :ia :c 99}
   {:a 2 :b 3 :i :ib :c 34}])

;; like pandas set-index
(defn set-index
  "Indexes `data` (a collection of maps) by the columns named in `keyset`.

  Returns a sorted map whose keys are vectors of the `keyset` values of each
  row and whose values are the rows with those columns removed. The index
  column names are kept as `:keys` in the result's metadata. Rows that share
  an index overwrite each other (uniqueness is not enforced)."
  [data keyset]
  (let [index-of (apply juxt keyset)
        entry    (fn [row] [(index-of row) (apply dissoc row keyset)])]
    (with-meta
      (into (sorted-map) (map entry data))
      {:keys keyset})))

(def indexed-data (set-index data [:i :b]))

indexed-data

Now let's reset the index, like pandas `df.reset_index(drop=False)` (for `drop=True`, a simple `(vals data)` should suffice).

(defn reset-index
  "Turns the index of an indexed collection back into ordinary columns.

  `data` is a map from index vectors to row maps whose metadata carries the
  index column names under `:keys`. Returns a vector of row maps in the
  iteration order of `data`.
  Like: pandas df.reset_index(drop=False)"
  [data]
  (let [key-names (:keys (meta data))]
    (into []
          (map (fn [[key-vals row]]
                 (merge (zipmap key-names key-vals) row)))
          data)))

(reset-index indexed-data)

; note that the dates are unordered
(def data
  [{:date "2020-01-02" :item "Socks" :region "North" :sales 110 :cost 100}
   {:date "2020-01-01" :item "Pants" :region "North" :sales 51 :cost 51}
   ;; 2 entries for same date/item/region
   {:date "2020-01-01" :item "Pants" :region "North" :sales 51 :cost 31}
   {:date "2020-01-02" :item "Pants" :region "North" :sales 112 :cost 92}
   {:date "2020-01-03" :item "Pants" :region "North" :sales 99 :cost 100}
   {:date "2020-01-01" :item "Socks" :region "North" :sales 100 :cost 80}
   {:date "2020-01-03" :item "Socks" :region "North" :sales 200 :cost 199}
   {:date "2020-01-01" :item "Pants" :region "South" :sales 9 :cost 9}
   {:date "2020-01-02" :item "Pants" :region "South" :sales 33 :cost 40}
   {:date "2020-01-03" :item "Pants" :region "South" :sales 20 :cost 30}
   {:date "2020-01-01" :item "Socks" :region "South" :sales 10 :cost 8}
   {:date "2020-01-03" :item "Socks" :region "South" :sales 20 :cost 19}])

;; Sum the value columns per index in a single pass: every row becomes an
;; [index-vector value-map] pair that is merged into a sorted map with `+`.
(time
  (let [index-cols [:item :date]
        value-cols [:sales :cost]]
    (->> data
         (map (fn [row]
                [(mapv row index-cols) (select-keys row value-cols)]))
         (reduce
           (fn [acc [index vals]]
             (update acc index (partial merge-with +) vals))
           ;; take the index names from index-cols rather than repeating the
           ;; literal, so reset-index stays correct if the columns change
           (with-meta (sorted-map) {:keys index-cols}))
         (reset-index))))

;; Same single-pass aggregation, but the result is left in its indexed form
;; (a sorted map keyed by index vectors); reset-index is deliberately not applied.
(time
  (let [index-cols [:item :date]
        value-cols [:sales :cost]
        index-of   (apply juxt index-cols)]
    (reduce (fn [totals row]
              (update totals
                      (index-of row)
                      #(merge-with + % (select-keys row value-cols))))
            (with-meta (sorted-map) {:keys index-cols})
            data)))

;; Group the rows by index first, then collapse every group with agg-fn.
(time
  (let [index-cols [:item :date]
        value-cols [:sales :cost]
        agg-fn     +
        index-of   (apply juxt index-cols)
        summarise  (fn [rows]
                     (apply merge-with agg-fn
                            (map #(select-keys % value-cols) rows)))]
    (reset-index
      (into (with-meta (sorted-map) {:keys index-cols})
            (map (fn [[k rows]] [k (summarise rows)]))
            ;; is this extra iteration through the list too costly?
            (group-by index-of data)))))

;; Group by index, aggregate the value columns as plain vectors (column-wise
;; reduce), then turn the aggregated vector back into a row map.
(time
  (let [index-cols [:item :date]
        value-cols [:sales :cost]
        agg-fn     +]
    (->> data
         (group-by (apply juxt index-cols))  ; is this extra iteration through the list too costly?
         (mapv (fn [[k group]]
                 [k (->> group
                         (mapv (apply juxt value-cols))
                         ;; transpose the group and reduce each column
                         (apply mapv (fn sum [& col] (reduce agg-fn col)))
                         ;; reset-index merges row *maps*, so name the columns again
                         (zipmap value-cols))]))
         ;; reset-index reads the index names from :keys (not :index)
         (into (with-meta (sorted-map) {:keys index-cols :columns value-cols}))
         (reset-index))))

;; Column-wise sum of a vector of rows: transpose with mapv, add up each column.
(let [sum-col (fn [& col] (reduce + col))
      rows    [[1 2] [10 20]]]
  (apply mapv sum-col rows))

(defn update-in-sorted
  "'Updates' a value in a nested associative structure, where ks is a
  sequence of keys and f is a function that will take the old value
  and any supplied args and return the new value, and returns a new
  nested structure.

  Like `update-in`, except that a missing level is created as a sorted map
  (default comparator) whenever its parent is sorted; under an unsorted
  parent a hash-map is created as usual."
  ([m ks f & args]
   (let [up (fn up [m [k & more]]
              (if more
                ;; an existing child is reused; a new child inherits sortedness
                (assoc m k (up (or (get m k)
                                   (when (sorted? m) (sorted-map)))
                               more))
                (assoc m k (apply f (get m k) args))))]
     (up m ks))))

(defn assoc-in-sorted
  "Associates a value in a nested associative structure, where ks is a
  sequence of keys and v is the new value and returns a new nested structure.

  Like `assoc-in`, except that a missing level is created as a sorted map
  (default comparator) whenever its parent is sorted, at every depth."
  [m [k & ks] v]
  (if ks
    ;; recurse through assoc-in-sorted (not assoc-in) so that levels below
    ;; the second are sorted as well
    (assoc m k (assoc-in-sorted (or (get m k)
                                    (when (sorted? m) (sorted-map)))
                                ks v))
    (assoc m k v)))

;; Sort so that equal [item date] rows are adjacent, partition into runs and
;; sum :sales per run; yields [[date item] total] pairs.
(let [item-date (juxt :item :date)]
  (for [rows (partition-by item-date (sort-by item-date data))]
    [((juxt :date :item) (first rows))
     (reduce + (map :sales rows))]))

;; Same idea, but every run becomes a map {:item .. :date .. :sales total}.
(let [index-cols [:item :date]]
  (for [rows (partition-by #(select-keys % index-cols)
                           (sort-by (apply juxt index-cols) data))]
    (assoc (select-keys (first rows) index-cols)
           :sales (reduce + (map :sales rows)))))

Redux

See Henry Garner's redux homepage.

TODO: check whether tesser does the same

(require '[redux.core :refer [facet fuse with-xform]])

;; total sales and cost
(defn data-facets
  "Sums each of the `facets` (functions of a row, e.g. column keywords) over
  `data` in one pass using redux's `facet`, and returns a map from facet to
  its total."
  [facets data]
  (let [totals (transduce identity (facet + facets) data)]
    (into {} (map vector facets totals))))

(data-facets [:sales :cost] data)

;; rows ordered by date
(->> data (sort-by :date))

;; rows bucketed by date
(->> data (group-by :date))

;; Subtotals of sales and cost per date, ordered by date.
(->> data
     (group-by :date)
     (map (fn [[date rows]] [date (data-facets [:sales :cost] rows)]))
     ;; A sorted map keeps the dates ordered whatever their number; sorting
     ;; the pairs and pouring them into {} only preserves order while the map
     ;; is small enough to stay an array-map.
     (into (sorted-map)))

;; Transpose the previous REPL result (*1): {row {col v}} -> {col {row v}}.
(reduce (fn [acc [path v]] (assoc-in acc path v))
        {}
        (for [[row-key row] *1
              [col-key val] row]
          [[col-key row-key] val]))

;; The same in one expression: subtotal per date, then transpose.
(let [subtotaled (into (sorted-map)
                       (map (fn [[date rows]]
                              [date (data-facets [:sales :cost] rows)]))
                       (group-by :date data))
      transposed (reduce (fn [acc [path v]] (assoc-in acc path v))
                         {}
                         (for [[row-key row] subtotaled
                               [col-key val] row]
                           [[col-key row-key] val]))]
  transposed)

;; Total sales per date, accumulated into a sorted map.
(reduce (fn [totals {:keys [date sales]}]
          (update totals date (fnil + 0) sales))
        (sorted-map)
        data)

(defn make-rf
  "Returns a reducing function that adds `(col row)` to the running total
  stored under the row's :date in the accumulator map."
  [col]
  (fn [acc row]
    (let [date   (:date row)
          amount (col row)]
      (update acc date (fnil + 0) amount))))

(reduce (make-rf :sales) (sorted-map) data)

(defn rf-date-val
  "Reducing function over [date value] pairs: adds the value to the total
  kept under the date in `acc` (a missing total counts as 0)."
  [acc pair]
  (let [[date val] pair]
    (update acc date (fnil + 0) val)))

(reduce rf-date-val
        (sorted-map)
        (map (juxt :date :sales) data))

(defn tf-date-val
  "Reducing function (for use with `transduce`) that sums values by date.

  - init:       an empty sorted map
  - step:       takes a [date val] pair and adds val to the total for date
  - completion: transposes the sorted {date total} map into two parallel
                sequences, dates and totals, ordered by date. With no input
                at all it returns two empty vectors."
  ([] (sorted-map))
  ([acc]
   ;; (apply map vector {}) would call (map vector), which is a transducer,
   ;; not data, so the empty accumulator is handled explicitly.
   (if (seq acc)
     (apply map vector acc)
     [[] []]))
  ([acc [date val]] (update acc date (fnil + 0) val)))

;; sales per date
(->> data
     (transduce (map (juxt :date :sales)) tf-date-val))

;; sales and cost per date in a single pass, one facet per column
(let [date-with (fn [col] (juxt :date col))]
  (transduce identity
             (facet tf-date-val (mapv date-with [:sales :cost]))
             data))

;; Plotly figure with one trace per column: x = dates, y = totals per date.
(def plotly-data
  (let [cols   [:sales :cost]
        totals (transduce identity
                          (facet tf-date-val (map (partial juxt :date) cols))
                          data)
        traces (map (fn [[col dates values]] {:x dates :y values :name col})
                    (map cons cols totals))]
    (assoc ^{:nextjournal/viewer :plotly} {:layout {:title "Sum by type"}}
           :data traces)))

 ;; Same figure rendered inline, without a layout.
 (let [cols   [:sales :cost]
       totals (transduce identity
                         (facet tf-date-val (map (partial juxt :date) cols))
                         data)
       traces (map (fn [[col dates values]] {:x dates :y values :name col})
                   (map cons cols totals))]
   (assoc ^{:nextjournal/viewer :plotly} {} :data traces))

Vega-lite

 "https://vega.github.io/vega-lite/examples/"

See the Vega-Lite examples gallery for inspiration.

We can perform the aggregation in Vega-Lite itself. For total sales by region:

;; Line chart: sum of sales per date, one line per region.
^{:nextjournal/viewer :vega-lite}
{"$schema"  "https://vega.github.io/schema/vega-lite/v5.json"
 "title"    "total sales by region"
 "data"     {"values" data}
 "mark"     "line"
 "encoding" {"x"     {"field" "date" "type" "temporal"}
             "y"     {"field" "sales" "aggregate" "sum" "type" "quantitative"}
             "color" {"field" "region" "type" "nominal"}}
 :width     500}

;; Line chart: sum of sales per date, one line per item, one row per region.
^{:nextjournal/viewer :vega-lite}
{;; use the same schema version as the other specs in this notebook
 "$schema"  "https://vega.github.io/schema/vega-lite/v5.json"
 "title"    "total sales by Item"
 "data"     {"values" data}
 "mark"     "line"
 "encoding" {:x     {"field" "date" "type" "temporal"}
             :y     {"field" "sales" "aggregate" "sum" "type" "quantitative"}
             :color {:field "item" :type "nominal"}
             :row   {:field "region" :type "nominal" :title "Region"}}
 :width     500}

;; Layered repeat: one line for sales and one for cost.
^{:nextjournal/viewer :vega-lite}
{"$schema" "https://vega.github.io/schema/vega-lite/v5.json"
 :title    "Total sales and cost"
 :data     {:values data}
 :repeat   {:layer [:sales :cost]}
 :spec     {:mark     "line"
            :encoding {:x     {:field "date" :type "temporal"}
                       :y     {:field {:repeat :layer} :aggregate "sum" :type "quantitative"}
                       :color {:datum {:repeat :layer} :type "nominal"}}}
 :width    500}

;; Fold sales/cost into rows; one chart row per region, coloured by type.
^{:nextjournal/viewer :vega-lite}
{"$schema"  "https://vega.github.io/schema/vega-lite/v5.json"
 :title     "Total sales and cost"
 :data      {:values data}
 :transform [{:fold [:sales :cost] :as ["Type"]}]
 :mark      "line"
 :encoding  {:x     {:field "date" :type "temporal"}
             :y     {:field :value :aggregate "sum" :type "quantitative"}
             :row   {:field "region" :type "nominal"}
             :color {:field "Type" :type "nominal"}}
 :width     500}

;; Same fold; one chart row per type, coloured by region.
^{:nextjournal/viewer :vega-lite}
{"$schema"  "https://vega.github.io/schema/vega-lite/v5.json"
 :title     "Total sales and cost"
 :data      {:values data}
 :transform [{:fold [:sales :cost] :as ["Type"]}]
 :mark      "line"
 :encoding  {:x     {:field "date" :type "temporal"}
             :y     {:field :value :aggregate "sum" :type "quantitative"}
             :row   {:field "Type" :type "nominal"}
             :color {:field "region" :type "nominal"}}
 :width     500}

;; Fold sales/cost into key/value rows, then facet by region: one line chart
;; per region with one line per folded column.
^{:nextjournal/viewer :vega-lite}
{"$schema"  "https://vega.github.io/schema/vega-lite/v5.json"
 :title     "Total sales and cost"
 :data      {:values data}
 ;; `transform` must be an array of transform objects; fold emits the
 ;; default output fields "key" and "value"
 :transform [{:fold [:sales :cost]}]
 ;; an inner :spec is only valid together with :facet (or :repeat), so the
 ;; row split lives here instead of in the inner encoding
 :facet     {:row {:field "region" :type "nominal"}}
 :spec      {:width    500
             :mark     "line"
             :encoding {:x     {:field "date" :type "temporal"}
                        :y     {:field "value" :aggregate "sum" :type "quantitative"}
                        :color {:field "key" :type "nominal"}}}}

Plotly(.js)

;; Link to the Plotly.js documentation (the label used to say "Vega-Lite examples").
^{:nextjournal/viewer :hiccup}
[:p [:a {:href "https://plotly.com/javascript/"} "Plotly.js examples"]]

;; One Plotly trace per column: x = dates, y = totals per date.
(def plotly-data
  (let [cols   [:sales :cost]
        totals (transduce identity
                          (facet tf-date-val (map (partial juxt :date) cols))
                          data)]
    (map (fn [[col dates values]] {:x dates :y values :name col})
         (map cons cols totals))))

^{:nextjournal/viewer :plotly}
{:layout {:title "Totals"}
 :data   plotly-data}

;; Three stacked subplots with hand-placed annotations used as subplot titles.
^{:nextjournal/viewer :plotly}
{:data   [{:x [1 2 3] :y [1 3 2] :name "foo" :type "scatter" :mode "lines"}
          {:x [1 2 3] :y [2 1 3] :name "bar" :type "scatter" :mode "lines"
           :xaxis "x2" :yaxis "y2"}
          {:x [1 2 3] :y [10 6 7] :name "baz" :type "scatter" :mode "lines"
           :xaxis "x3" :yaxis "y3"}]
 :layout {:title       "HI here!"
          :height      600
          :grid        {:rows 3 :columns 1}
          :yaxis       {:domain [0.67 1]}
          :yaxis2      {:domain [0.33 0.57]}
          :yaxis3      {:domain [0 0.23]}
          :annotations [{:x 0.5 :y 1
                         :xref "paper" :yref "paper"
                         :font {:size 16 :color "black"}
                         :showarrow false
                         ;; NOTE(review): :text-valign is not a documented
                         ;; annotation attribute - :yanchor or :valign may
                         ;; have been intended; confirm
                         :text-valign "bottom"
                         :text "me first"}
                        {:x 0.5 :y 0.57
                         :xref "paper" :yref "paper"
                         :font {:size 16 :color "black"}
                         :showarrow false
                         :text "me too"}
                        {:x 0.5 :y 0.23
                         :xref "paper" :yref "paper"
                         ;; "darkpurple" is not a CSS colour name; "purple" is
                         :font {:size 16 :color "purple"}
                         :showarrow false
                         :text "me three"}]}}

New Datastructures

Juxt blog on new datastructures

Subplots

Let's try to get plots by region

Note that vega(-lite) and d3 provide their own data manipulation functionality

(defn tf-date-val
  "Reducing function (for use with `transduce`) that sums values into a
  nested sorted map.

  - init:       an empty sorted map
  - step:       takes a sequence whose first element is the value and whose
                remaining elements are the key path; adds the value to the
                total stored at that path (via update-in-sorted)
  - completion: returns the accumulated map unchanged"
  ([] (sorted-map))
  ([acc] acc)
  ([acc item]
   (let [[amount & path] item]
     (update-in-sorted acc path (fnil + 0) amount))))

;; Totals per date for each column, printed as one nested map per column.
(let [cols    [:sales :cost]
      groups  [:date]
      row->kv (fn [col] (apply juxt col groups))]
  (pprint
    (transduce identity
               (facet tf-date-val (map row->kv cols))
               data)))

;; Totals per region and date, printed as a map from column to nested totals.
(let [cols    [:sales :cost]
      groups  [:region :date]
      row->kv (fn [col] (apply juxt col groups))
      totals  (transduce identity
                         (facet tf-date-val (map row->kv cols))
                         data)]
  (pprint (into {} (map vector cols totals))))

;; One subplot per column (sales, cost), one line per region in each subplot.
(let [cols       [:sales :cost]
      groups     [:region :date]
      colors     {"North" "rgb(0,0,255)" "South" "rgb(255,0,0)"}
      ;; [col {region {date total}}] -> one trace per region
      data-lines (fn [[_col regions]]
                   (for [[region dt-val] regions]
                     (let [[dates vals] (apply map vector dt-val)]
                       {:x dates :y vals :name region
                        :line {:color (colors region)}})))]
  (->> data
       (transduce identity
                  (facet tf-date-val (map #(apply juxt (cons % groups)) cols)))
       (map vector cols)
       (map data-lines)
       ;; pair each column's traces with an axis suffix ("" = first subplot);
       ;; only the first subplot contributes legend entries
       (mapcat (fn subplots [subplot lines]
                 (map #(assoc %
                              :xaxis (str "x" subplot)
                              :yaxis (str "y" subplot)
                              :showlegend (= subplot ""))
                      lines))
               ["" "2" "3"])
       (assoc ^{:nextjournal/viewer :plotly}
              {:layout {;:grid {:rows 2 :columns 1}
                        :yaxis       {:domain [0 0.4]}
                        :yaxis2      {:domain [0.5 0.9]}
                        ;; Plotly expects an array of annotation objects
                        :annotations []
                        :height      (* 2 300)
                        :title       "One subplot per column (sales or cost)"}}
              :data)
       ;; requires extra work still to create subplot titles as annotations
       ))

;; Same subplot figure laid out with a Plotly grid; printed instead of rendered.
(let [cols          [:sales :cost]
      groups        [:region :date]
      colors        {"North" "rgb(0,0,255)" "South" "rgb(255,0,0)"}
      ;; [region {date total}] -> one trace
      region-trace  (fn [[region dt-val]]
                      (let [[dates vals] (apply map vector dt-val)]
                        {:x dates :y vals :name region
                         :line {:color (colors region)}}))
      data-lines    (fn [[_col regions]] (map region-trace regions))
      totals        (transduce identity
                               (facet tf-date-val (map #(apply juxt % groups) cols))
                               data)
      per-col-lines (map data-lines (map vector cols totals))
      ;; attach the axis pair identified by `suffix` to every trace
      on-subplot    (fn [suffix lines]
                      (map #(assoc % :xaxis (str "x" suffix) :yaxis (str "y" suffix))
                           lines))
      traces        (mapcat on-subplot ["" "2" "3"] per-col-lines)]
  (pprint
    (assoc ^{:nextjournal/viewer :plotly}
           {:layout {:grid  {:rows 2 :columns 1}
                     :title "One subplot per column (sales or cost)"}}
           :data traces)))

;; Totals nested three levels deep (region -> item -> date), paired with
;; their column name and printed.
(let [cols    [:sales :cost]
      groups  [:region :item :date]
      row->kv (fn [col] (apply juxt col groups))
      totals  (transduce identity
                         (facet tf-date-val (map row->kv cols))
                         data)]
  (pprint (map vector cols totals)))

;; Plain assoc-in on a sorted map: only the top level comes out sorted.
(reduce (fn [m [path v]] (assoc-in m path v))
        (sorted-map)
        [[[:a :b] 1]
         [[:c :x] 4]
         [[:c :a] 3]
         [[:b :c] 2]])

Improve sorted aggregation

`assoc-in` and `update-in` are fine, but when you start with a sorted-map and add nested layers they do not create sorted maps for the new layers (i.e. only the first level remains sorted).

Let's fix that:

;; The same insertions through assoc-in-sorted, pretty-printed.
(pprint
  (reduce (fn [m [path v]] (assoc-in-sorted m path v))
          (sorted-map)
          [[[:a :b] 1]
           [[:c :x] 4]
           [[:c :a] 3]
           [[:b :c] 2]]))