Aggregation

{:deps {org.clojure/clojure {:mvn/version "1.10.3"}
        ;; compliment is used for autocompletion
        ;; add your libs here (and restart the runtime to pick up changes)
        compliment/compliment {:mvn/version "0.3.9"}
        redux/redux {:mvn/version "0.1.4"}
        org.clojure/data.json {:mvn/version "2.2.0"}}}
Extensible Data Notation
(require '[clojure.pprint :refer [pprint]]
         '[clojure.data.json :as json])
1.3s

Multi-index by sorted vectors

Something to set the index. There is just one level of indexing, with a vector as the key.

TODO: does not verify integrity (cf. pandas `verify_integrity`), i.e. it does not enforce uniqueness of the index.

;; Two small sample rows (maps with keys :a :b :i :c) used to try out indexing.
(def data [{:a 1 :b 3 :i :ia :c 99}
           {:a 2 :b 3 :i :ib :c 34}])
;; like pandas set-index
(defn set-index
  "Indexes a collection of row maps by the columns named in `keyset`.

  Returns a sorted map whose keys are vectors holding each row's values for
  `keyset` (in that order) and whose values are the rows with those columns
  removed. The index column names are kept as {:keys keyset} metadata on the
  result. Rows sharing an index vector overwrite each other (last one wins)."
  [data keyset]
  (let [index-of (apply juxt keyset)
        ->entry  (fn [row]
                   [(index-of row) (apply dissoc row keyset)])]
    (-> (into (sorted-map) (map ->entry) data)
        (with-meta {:keys keyset}))))
;; Index the sample rows by the [:i :b] key vector, then show the result.
(def indexed-data (set-index data [:i :b]))
indexed-data
0.1s

Now let's reset the index, like pandas `df.reset_index(drop=False)` (for `drop=True` a simple `(vals data)` should suffice).

(defn reset-index
  "Inverse of set-index: turns an indexed map back into a vector of flat rows.

  `data` maps index vectors to row maps and carries the index column names
  under :keys in its metadata. Each index vector is zipped with those names
  and merged with its row (row values win on a key clash).
  Like: pandas df.reset_index(drop=False)"
  [data]
  (let [index-names (:keys (meta data))
        flatten-row (fn [[index-vals row]]
                      (merge (zipmap index-names index-vals) row))]
    (into [] (map flatten-row) data)))
  
;; Round trip: turn the index of indexed-data back into ordinary columns.
(reset-index indexed-data)
0.0s
; Sample sales records, one map per row with :date :item :region :sales :cost.
; Note that the dates are unordered, and that one date/item/region
; combination appears twice.
(def data [
           {:date "2020-01-02" :item "Socks" :region "North" :sales 110 :cost 100}
           {:date "2020-01-01" :item "Pants" :region "North" :sales 51 :cost 51}
           {:date "2020-01-01" :item "Pants" :region "North" :sales 51 :cost 31} ; 2 entries for same date/item/region
           {:date "2020-01-02" :item "Pants" :region "North" :sales 112 :cost 92}
           {:date "2020-01-03" :item "Pants" :region "North" :sales 99 :cost 100}
           {:date "2020-01-01" :item "Socks" :region "North" :sales 100 :cost 80}
           {:date "2020-01-03" :item "Socks" :region "North" :sales 200 :cost 199}
           {:date "2020-01-01" :item "Pants" :region "South" :sales 9 :cost 9}
           {:date "2020-01-02" :item "Pants" :region "South" :sales 33 :cost 40}
           {:date "2020-01-03" :item "Pants" :region "South" :sales 20 :cost 30}
           {:date "2020-01-01" :item "Socks" :region "South" :sales 10 :cost 8}
           {:date "2020-01-03" :item "Socks" :region "South" :sales 20 :cost 19}
           ])
0.1s
;; Sum the value columns per [:item :date] with a single reduce over the rows,
;; then flatten the sorted result back into row maps.
(time
 (let [index-cols [:item :date]
       value-cols [:sales :cost]]
   (->> data
        ;; split every row into [index-vector value-map]
        (map (fn [row]
               [(mapv row index-cols) (select-keys row value-cols)]))
        ;; merge-with treats a missing (nil) accumulator entry as empty, so the
        ;; first row for an index is stored as-is and later rows are added to it
        (reduce
         (fn [acc [index row-vals]]
           (update acc index (partial merge-with +) row-vals))
         ;; reset-index reads the index column names from the :keys metadata;
         ;; use the binding so the two can never drift apart
         (with-meta (sorted-map) {:keys index-cols}))
        (reset-index))))
0.4s
;; Same aggregation as a plain reduce, but the result is left as the sorted,
;; indexed map (index vector -> summed value map) instead of being flattened.
(time
 (let [index-cols [:item :date]
       value-cols [:sales :cost]
       add-row    (fn [totals row]
                    (let [index    (mapv row index-cols)
                          row-vals (select-keys row value-cols)]
                      (update totals index #(merge-with + % row-vals))))]
   (reduce add-row
           (with-meta (sorted-map) {:keys index-cols})
           data)))
0.3s
;; Aggregation via group-by: group the rows by index vector, sum the value
;; columns inside each group, collect into a sorted indexed map and flatten.
;; Open question from the author: is the extra pass through the list made by
;; group-by too costly?
(time
 (let [index-cols [:item :date]
       value-cols [:sales :cost]
       agg-fn     +
       index-of   (apply juxt index-cols)
       summarise  (fn [rows]
                    (apply merge-with agg-fn
                           (map #(select-keys % value-cols) rows)))
       totals     (into (with-meta (sorted-map) {:keys index-cols})
                        (for [[index rows] (group-by index-of data)]
                          [index (summarise rows)]))]
   (reset-index totals)))
0.3s
;; Aggregation via group-by where each group is reduced column-wise on plain
;; value vectors (one vector per row, one position per value column).
;; Open question from the author: is the extra pass through the list made by
;; group-by too costly?
(time
 (let [index-cols [:item :date]
       value-cols [:sales :cost]
       agg-fn     +]
   (->> data
        (group-by (apply juxt index-cols))
        (mapv (fn [[k group]]
                [k (->> group
                        ;; rows -> vectors of values, in value-cols order
                        (mapv (apply juxt value-cols))
                        ;; transpose and aggregate each column
                        (apply mapv (fn sum [& col] (reduce agg-fn col)))
                        ;; name the aggregated columns again: reset-index merges
                        ;; each row as a map, and a bare 2-element vector would
                        ;; be conj'ed as a single map entry instead
                        (zipmap value-cols))]))
        ;; reset-index looks up the index column names under :keys
        (into (with-meta (sorted-map) {:keys index-cols}))
        (reset-index))))
0.3s
;; Column-wise sum of a vector of rows: mapv walks the rows in parallel, so
;; each call receives one column; the result here is [11 22].
(apply mapv (fn [& col] (reduce + col)) [[1 2][10 20]])
0.1s
(defn update-in-sorted
  "Like clojure.core/update-in: replaces the value found under the key path
  `ks` in the nested associative structure `m` with (apply f old-value args)
  and returns the new structure.

  The difference is in how missing levels are created: when the parent map is
  sorted, the new child level is a sorted-map; otherwise the child starts from
  nil, so assoc creates an ordinary map."
  [m ks f & args]
  (letfn [(descend [node [k & more]]
            (let [current (get node k)]
              (if more
                ;; intermediate level: recurse into the existing child, or
                ;; into a fresh one that inherits sortedness from its parent
                (assoc node k
                       (descend (or current
                                    (when (sorted? node) (sorted-map)))
                                more))
                ;; last key: apply f to whatever is there (possibly nil)
                (assoc node k (apply f current args)))))]
    (descend m ks)))
(defn assoc-in-sorted
  "Like clojure.core/assoc-in: associates `v` under the key path in the nested
  associative structure `m` and returns the new structure.

  Missing levels inherit sortedness from their parent: under a sorted map a
  new sorted-map is created, otherwise the level starts from nil and assoc
  creates an ordinary map. This holds at every depth of the path."
  [m [k & ks] v]
  (if ks
    ;; recurse through assoc-in-sorted itself (not core assoc-in), otherwise
    ;; only the first newly created level would be sorted
    (assoc m k (assoc-in-sorted (or (get m k)
                                    (when (sorted? m) (sorted-map)))
                                ks v))
    (assoc m k v)))
0.0s
;; Total sales per item/date using sort + partition instead of a map:
;; rows are sorted by [:item :date], cut into runs with equal [:item :date],
;; and each run becomes [[date item] summed-sales].
(let [item+date (juxt :item :date)]
  (for [run (partition-by item+date (sort-by item+date data))]
    [((juxt :date :item) (first run))
     (reduce + (map :sales run))]))
0.0s
;; Same sort + partition aggregation, but each run is returned as a row map
;; {:item .. :date .. :sales total} rather than a key/value pair.
(let [index-of #(select-keys % [:item :date])]
  (->> (sort-by (juxt :item :date) data)
       (partition-by index-of)
       (map (fn [run]
              (assoc (index-of (first run))
                     :sales (reduce + (map :sales run)))))))
0.0s

Redux

See Henry Garner's redux homepage.

TODO: check whether tesser does the same

(require '[redux.core :refer [facet fuse with-xform]])
0.2s
;; total sales and cost
(defn data-facets
  "Sums each of the given `facets` (functions of a row, e.g. keywords) over
  `data` in a single pass using redux `facet` with `+`, and returns a map from
  each facet to its total."
  [facets data]
  (let [totals (transduce identity (facet + facets) data)]
    (zipmap facets totals)))
;; Grand totals of :sales and :cost over all rows.
(data-facets [:sales :cost] data)
0.0s
;; Rows ordered by their :date string.
(sort-by :date data)
0.0s
;; Map from each :date to the vector of rows carrying that date.
(group-by :date data)
0.1s
;; Subtotals of :sales and :cost per date, as a map date -> {:sales .. :cost ..}
;; ordered by date.
(->> data
  (group-by :date)
  (map (fn [[date rows]] [date (data-facets [:sales :cost] rows)]))
  ;; collect into a sorted-map: sorting first and then pouring into {} only
  ;; keeps the order while the map is small enough to stay an array-map
  (into (sorted-map)))
0.0s
;; Transpose the previous REPL result (*1), a map row-key -> {col-key value},
;; into col-key -> {row-key value}.
(reduce (fn [acc [path v]] (assoc-in acc path v))
        {}
        (for [[row-key row] *1
              [col-key v]   row]
          [[col-key row-key] v]))
0.1s
;; Subtotal per date and transpose in one expression:
;; date -> {:sales .. :cost ..} becomes column -> {date total}.
(let [subtotal   (fn [[date rows]]
                   [date (data-facets [:sales :cost] rows)])
      subtotaled (into (sorted-map) (map subtotal) (group-by :date data))
      transposed (reduce (fn [acc [path v]] (assoc-in acc path v))
                         {}
                         (for [[row-key row] subtotaled
                               [col-key v]   row]
                           [[col-key row-key] v]))]
  transposed)
0.1s
;; Total :sales per :date, accumulated into a sorted map with a plain reduce.
(let [add (fnil + 0)]          ; a date seen for the first time starts at 0
  (reduce (fn [totals {:keys [date sales]}]
            (update totals date add sales))
          (sorted-map)
          data))
0.0s
(defn make-rf
  "Builds a reducing function that adds (col row) to the running total stored
  in the accumulator under the row's :date. Dates not seen before start at 0."
  [col]
  (let [add (fnil + 0)]
    (fn [totals row]
      (update totals (:date row) add (col row)))))
;; Total :sales per date, using the reducing function built by make-rf.
(reduce (make-rf :sales) (sorted-map) data)
0.0s
(defn rf-date-val
  "Reducing function over [date value] pairs: adds the value to the total kept
  under that date in `acc`. A date not seen before starts at 0."
  [acc [date amount]]
  (let [add (fnil + 0)]
    (update acc date add amount)))
;; Project every row to a [date sales] pair, then sum the pairs per date.
(->> (map (juxt :date :sales) data)
  (reduce rf-date-val (sorted-map)))
0.0s
(defn tf-date-val
  "Reducing function, usable with transduce, that sums [date value] pairs by
  date.

  Arities:
    []                 - initial accumulator, an empty sorted-map
    [acc]              - completion: turns the date -> total map into two
                         parallel sequences, (dates totals), ordered by date
    [acc [date val]]   - step: adds val to the total for date (starting at 0)

  With no input at all, completion returns two empty vectors."
  ([] (sorted-map))
  ([acc]
   ;; (apply map vector {}) would call (map vector) and hand back a transducer
   ;; instead of data, so the empty case is handled explicitly
   (if (seq acc)
     (apply map vector acc)
     (list [] [])))
  ([acc [date val]] (update acc date (fnil + 0) val)))
0.0s
;; Sum :sales by :date in one transduce; the mapping transducer turns each row
;; into a [date sales] pair before tf-date-val accumulates it.
(transduce (map (juxt :date :sales)) tf-date-val data)
0.0s
;; Run tf-date-val once per column in a single pass: redux `facet` feeds each
;; row through every [date column] projection and collects one result per
;; projection.
(let [date-with (fn [col] (juxt :date col))]
  (transduce identity
             (facet tf-date-val (map date-with [:sales :cost]))
             data))
0.0s
;; Plotly figure (tagged for the Nextjournal plotly viewer) with one trace per
;; value column: x holds the dates, y the per-date totals.
(def plotly-data
  (let [cols   [:sales :cost]
        totals (transduce identity
                          (facet tf-date-val (map #(juxt :date %) cols))
                          data)
        traces (map (fn [col [dates values]]
                      {:x dates :y values :name col})
                    cols
                    totals)]
    (assoc ^{:nextjournal/viewer :plotly} {:layout {:title "Sum by type"}}
           :data traces)))
0.0s
 ;; Same per-column traces rendered directly, without a layout.
 (let [cols   [:sales :cost]
       totals (transduce identity
                         (facet tf-date-val (map #(juxt :date %) cols))
                         data)
       traces (map (fn [col [dates values]]
                     {:x dates :y values :name col})
                   cols
                   totals)]
   (assoc ^{:nextjournal/viewer :plotly} {} :data traces))
0.1s

Vega-lite

 "https://vega.github.io/vega-lite/examples/"
0.0s

Hi there see some vega-lite examples

We can perform the aggregation in Vega itself. For total sales by region:

;; Vega-Lite line chart over the raw rows: x is the date, y is sales with the
;; "sum" aggregate applied by Vega-Lite, and one coloured line per region.
^{:nextjournal/viewer :vega-lite}
{
  "$schema" "https://vega.github.io/schema/vega-lite/v5.json",
 "title" "total sales by region"
 "data" {
    "values" data
          }
 "mark"  "line"
  "encoding" {
    "x" {"field" "date", "type" "temporal"},
    "y" {"field" "sales", "aggregate" "sum" "type" "quantitative"},
         "color" {"field" "region","type" "nominal"}}
 :width 500
 }
0.1s
;; Vega-Lite line chart of summed sales per date, one coloured line per item
;; and one row of charts per region.
^{:nextjournal/viewer :vega-lite}
{
 ;; same schema version as every other spec in this notebook
 "$schema" "https://vega.github.io/schema/vega-lite/v5.json"
 "title" "total sales by Item"
  "data" {
    "values" data
          }
 "mark"  "line"
  "encoding" {
    :x {"field" "date", "type" "temporal"},
    :y {"field" "sales", "aggregate" "sum" "type" "quantitative"},
    :color {:field "item", :type "nominal"}
    :row {:field "region" :type "nominal" :title "Region"}}
 :width 500
 }
0.1s
;; Vega-Lite spec using :repeat with :layer over [:sales :cost]: the inner
;; :spec refers to the repeated field via {:repeat :layer}, both for the summed
;; y value and for the colour datum.
^{:nextjournal/viewer :vega-lite}
{
 "$schema" "https://vega.github.io/schema/vega-lite/v5.json"
 :title "Total sales and cost"
 :data {:values data}
 :repeat {:layer [:sales :cost]}
 :spec {
   :mark  "line"
   :encoding {:x {:field "date", :type "temporal"},
              :y {:field {:repeat :layer}, :aggregate "sum" :type "quantitative"}
              :color {:datum {:repeat :layer}, :type "nominal"}}}
 :width 500
 }
0.0s
;; Vega-Lite spec that folds the :sales and :cost columns into key/value rows
;; (the key field is named "Type", the value field is read as :value), then
;; plots the summed value per date: one chart row per region, coloured by Type.
^{:nextjournal/viewer :vega-lite}
{
 "$schema" "https://vega.github.io/schema/vega-lite/v5.json"
 :title "Total sales and cost"
 :data {:values data}
 :transform [{:fold [:sales :cost] :as ["Type"]}]
 :mark "line"
 :encoding {:x {:field "date",:type "temporal"}
            :y {:field :value :aggregate "sum" :type "quantitative"}
            :row {:field "region" :type "nominal"}
            :color {:field "Type" :type "nominal"}
            }
 :width 500
 }
0.0s
;; Same fold of :sales and :cost into "Type"/value rows, with the two channels
;; swapped: one chart row per Type, coloured by region.
^{:nextjournal/viewer :vega-lite}
{
 "$schema" "https://vega.github.io/schema/vega-lite/v5.json"
 :title "Total sales and cost"
 :data {:values data}
 :transform [{:fold [:sales :cost] :as ["Type"]}]
 :mark "line"
 :encoding {:x {:field "date",:type "temporal"}
            :y {:field :value :aggregate "sum" :type "quantitative"}
            :row {:field "Type" :type "nominal"}
            :color {:field "region" :type "nominal"}
            }
 :width 500
 }
0.1s
;; Fold :sales and :cost using the default output field names ("key" and
;; "value"), then plot the summed value per date: one chart row per region,
;; coloured by the folded key.
^{:nextjournal/viewer :vega-lite}
{
 "$schema" "https://vega.github.io/schema/vega-lite/v5.json"
 :title "Total sales and cost"
 :data {:values data}
 ;; Vega-Lite expects :transform to be an array of transform objects
 :transform [{:fold [:sales :cost]}]
 ;; :mark and :encoding sit at the top level: a nested :spec is only valid
 ;; together with :facet or :repeat, and the :row channel below already
 ;; splits the chart by region
 ;:facet {:row {:field :region}}
 :mark  "line"
 :encoding {:x {:field "date", :type "temporal" },
            :y {:field "value",:aggregate "sum" :type "quantitative"}
            :row {:field "region" :type "nominal"}
            :color {:field "key" :type "nominal"}}
 :width 500
 }
0.1s

Plotly(.js)

;; Plotly traces only (no layout): one {:x dates :y totals :name column} map
;; per value column.
(def plotly-data
  (let [cols   [:sales :cost]
        totals (transduce identity
                          (facet tf-date-val (map #(juxt :date %) cols))
                          data)]
    (map (fn [col [dates values]]
           {:x dates :y values :name col})
         cols
         totals)))
;; Figure for the Nextjournal plotly viewer: the traces computed above plus a
;; layout that only sets the title.
^{:nextjournal/viewer :plotly}
{:layout {:title "Totals"}
 :data plotly-data}
0.1s

;; Hand-written Plotly figure with three stacked subplots. Each trace is bound
;; to its own x/y axis pair, the three y-axis domains split the figure height,
;; and each annotation is placed (in paper coordinates) at the top edge of one
;; y-axis domain so it acts as a subplot title.
^{:nextjournal/viewer :plotly}
{:data [{:x [1 2 3] :y [1 3 2] :name "foo" :type "scatter" :mode "lines"}
        {:x [1 2 3] :y [2 1 3] :name "bar" :type "scatter" :mode "lines"
         :xaxis "x2" :yaxis "y2"}
        {:x [1 2 3] :y [10 6 7] :name "baz" :type "scatter" :mode "lines"
         :xaxis "x3" :yaxis "y3"}]
 :layout {:title "HI here!"
          :height 600
          :grid {:rows 3 :columns 1}
          :yaxis {:domain [0.67 1]}
          :yaxis2 {:domain [0.33 0.57]}
          :yaxis3 {:domain [0 0.23]}
          :annotations [{:x 0.5 :y 1
                         :xref "paper" :yref "paper"
                         :font {:size 16 :color "black"}
                         :showarrow false
                         ;; NOTE(review): :text-valign does not look like a
                         ;; Plotly annotation attribute (:yanchor / :valign
                         ;; exist) - confirm what was intended
                         :text-valign "bottom"
                         :text "me first"}
                        {:x 0.5 :y 0.57
                         :xref "paper" :yref "paper"
                         :font {:size 16 :color "black"}
                         :showarrow false
                         :text "me too"}
                        {:x 0.5 :y 0.23
                         :xref "paper" :yref "paper"
                         ;; "darkpurple" is not a CSS colour name; "purple" is
                         :font {:size 16 :color "purple"}
                         :showarrow false
                         :text "me three"}]}}
0.1s

New Datastructures

Juxt blog on new datastructures

Subplots

Let's try to get plots by region

Note that vega(-lite) and d3 provide their own data manipulation functionality

(defn tf-date-val
  "Reducing function, usable with transduce, that sums values into a nested
  sorted map.

  Arities:
    []                    - initial accumulator, an empty sorted-map
    [acc]                 - completion, returns the accumulator unchanged
    [acc [amount & path]] - step: each input is a sequence whose first element
                            is the value and whose remaining elements are the
                            key path; the value is added (starting from 0) at
                            that path via update-in-sorted"
  ([] (sorted-map))
  ([acc] acc)
  ([acc [amount & path]]
   (update-in-sorted acc path (fnil + 0) amount)))
0.0s
;; Per-column totals grouped by date: every column gets its own projection
;; [column-value date], and the result of the single pass is pretty-printed.
(let [cols   [:sales :cost]
      groups [:date]
      keyed  (fn [col] (apply juxt col groups))]
  (pprint
   (transduce identity
              (facet tf-date-val (map keyed cols))
              data)))
0.5s
;; Per-column totals nested by region and then date, collected into a map
;; column -> nested totals and pretty-printed.
(let [cols   [:sales :cost]
      groups [:region :date]
      keyed  (fn [col] (apply juxt col groups))
      totals (transduce identity
                        (facet tf-date-val (map keyed cols))
                        data)]
  (pprint (into {} (map vector cols totals))))
0.5s

;; Plotly figure with one subplot per value column (sales, cost) and one line
;; per region inside each subplot.
(let [cols [:sales :cost]
      groups [:region :date]
      colors {"North" "rgb(0,0,255)" "South" "rgb(255,0,0)"}
      ;; turns one [col region->date->total] pair into one trace per region
      data-lines (fn [[_col regions]]
                   (for [[region dt-val] regions]
                     ;; split the date -> total map into parallel x / y seqs
                     (let [[dates totals] (apply map vector dt-val)]
                       {:x dates :y totals :name region
                        :line {:color (colors region)}})))]
  (->> data
    (transduce identity
      (facet tf-date-val (map #(apply juxt (cons % groups)) cols)))
    (map vector cols)
    (map data-lines)
    ;; pair each column's traces with an axis suffix ("" = first subplot);
    ;; mapcat stops at the shorter collection, so unused suffixes are ignored.
    ;; The legend is only shown for the first subplot to avoid duplicates.
    (mapcat (fn subplots [subplot lines]
              (map #(assoc %
                      :xaxis (str "x" subplot)
                      :yaxis (str "y" subplot)
                      :showlegend (= subplot "")) lines))
          ["" "2" "3"])
    (assoc ^{:nextjournal/viewer :plotly}
      {:layout {;:grid {:rows 2 :columns 1}
                :yaxis {:domain [0 0.4]}
                :yaxis2 {:domain [0.5 0.9]}
                ;; Plotly expects a list of annotation objects here; it stays
                ;; empty until subplot titles are added
                :annotations []
                :height (* 2 300)
                :title "One subplot per column (sales or cost)"}}
      :data)
    ;; requires extra work still to create subplot titles as annotations
    ))
0.1s
;; Same figure built with a Plotly :grid layout instead of explicit axis
;; domains; the resulting figure map is pretty-printed rather than rendered.
(let [cols       [:sales :cost]
      groups     [:region :date]
      colors     {"North" "rgb(0,0,255)" "South" "rgb(255,0,0)"}
      keyed      (fn [col] (apply juxt col groups))
      ;; one trace for a region from its date -> total map
      trace      (fn [[region dt-val]]
                   (let [[dates totals] (apply map vector dt-val)]
                     {:x dates :y totals :name region
                      :line {:color (colors region)}}))
      data-lines (fn [[_col regions]] (map trace regions))
      ;; bind every trace of one column to the axis pair with this suffix
      on-subplot (fn [subplot lines]
                   (map #(assoc % :xaxis (str "x" subplot) :yaxis (str "y" subplot))
                        lines))
      totals     (transduce identity
                            (facet tf-date-val (map keyed cols))
                            data)
      traces     (mapcat on-subplot
                         ["" "2" "3"]
                         (map data-lines (map vector cols totals)))]
  (pprint
   (assoc ^{:nextjournal/viewer :plotly}
          {:layout {:grid {:rows 2 :columns 1}
                    :title "One subplot per column (sales or cost)"}}
          :data traces)))
0.6s
;; Per-column totals nested three levels deep (region, item, date), printed as
;; [column nested-totals] pairs.
(let [cols   [:sales :cost]
      groups [:region :item :date]
      keyed  (fn [col] (apply juxt col groups))
      totals (transduce identity
                        (facet tf-date-val (map keyed cols))
                        data)]
  (pprint (map vector cols totals)))
0.5s
;; Build a nested map with core assoc-in, starting from a sorted-map.
(reduce (fn [m [path v]] (assoc-in m path v))
        (sorted-map)
        [[[:a :b] 1]
         [[:c :x] 4]
         [[:c :a] 3]
         [[:b :c] 2]])
0.0s

Improve sorted aggregation

assoc-in and update-in are fine, but when starting with a sorted-map and adding layers, they do not create sorted-maps for the new layers (i.e. only the first level remains sorted).

Let's fix that:

;; Build the same nested map with assoc-in-sorted and pretty-print it.
(pprint
 (reduce (fn [m [path v]] (assoc-in-sorted m path v))
         (sorted-map)
         [[[:a :b] 1]
          [[:c :x] 4]
          [[:c :a] 3]
          [[:b :c] 2]]))
0.3s
Runtimes (1)