Aggregation
{:deps {org.clojure/clojure {:mvn/version "1.10.3"}
;; compliment is used for autocompletion
;; add your libs here (and restart the runtime to pick up changes)
compliment/compliment {:mvn/version "0.3.9"}
redux/redux {:mvn/version "0.1.4"}
org.clojure/data.json {:mvn/version "2.2.0"}}}
;; `require` is a function, so each libspec must be quoted; otherwise the
;; vectors are evaluated and the namespace symbols fail to resolve.
(require '[clojure.pprint :refer [pprint]]
         '[clojure.data.json :as json])
Multi-index by sorted vectors
A function to set the index. There is just one level of indexing, with a vector of the index-column values as the key.
TODO: does not verify_integrity i.e. does not enforce uniqueness
;; Two small sample rows used to demonstrate set-index / reset-index.
(def data [{:a 1 :b 3 :i :ia :c 99}
{:a 2 :b 3 :i :ib :c 34}])
;; like pandas set-index
(defn set-index
  "Indexes `data` (a collection of row maps) by the columns named in `keyset`.

  Returns a sorted map whose keys are vectors of each row's `keyset` values
  and whose values are the rows with those columns removed. The index column
  names are attached as metadata under :keys. Rows that share the same index
  values overwrite one another (the last one wins); uniqueness is not checked."
  [data keyset]
  (with-meta
    (->> data
         (map (juxt
               ;; index key: vector of the row's values for the index columns
               (apply juxt keyset)
               ;; remaining row: must be a function of the row, hence #(...)
               #(apply dissoc % keyset)))
         (into (sorted-map)))
    {:keys keyset}))
;; Index the sample rows by the :i and :b columns.
(def indexed-data (set-index data [:i :b]))
;; Evaluate the var so the indexed result is shown.
indexed-data
Now let's reset the index, like pandas `df.reset_index(drop=False)` (for `drop=True` a simple `(vals data)` should suffice).
(defn reset-index
  "Flattens an indexed collection back into a vector of plain row maps.

  `data` maps index-value vectors to row maps and carries the index column
  names in its metadata under :keys. Each index vector is zipped with those
  names and merged with its row (row values win on a name clash).
  Like: pandas df.reset_index(drop=False)"
  [data]
  (let [index-cols (-> data meta :keys)]
    (into []
          (map (fn [[index-vals columns]]
                 (merge (zipmap index-cols index-vals) columns)))
          data)))
;; Round trip: turn the index of indexed-data back into ordinary columns.
(reset-index indexed-data)
; note that the dates are unordered
;; Sample sales rows used by the aggregation examples; this rebinds `data`.
(def data [
{:date "2020-01-02" :item "Socks" :region "North" :sales 110 :cost 100}
{:date "2020-01-01" :item "Pants" :region "North" :sales 51 :cost 51}
{:date "2020-01-01" :item "Pants" :region "North" :sales 51 :cost 31} ; 2 entries for same date/item/region
{:date "2020-01-02" :item "Pants" :region "North" :sales 112 :cost 92}
{:date "2020-01-03" :item "Pants" :region "North" :sales 99 :cost 100}
{:date "2020-01-01" :item "Socks" :region "North" :sales 100 :cost 80}
{:date "2020-01-03" :item "Socks" :region "North" :sales 200 :cost 199}
{:date "2020-01-01" :item "Pants" :region "South" :sales 9 :cost 9}
{:date "2020-01-02" :item "Pants" :region "South" :sales 33 :cost 40}
{:date "2020-01-03" :item "Pants" :region "South" :sales 20 :cost 30}
{:date "2020-01-01" :item "Socks" :region "South" :sales 10 :cost 8}
{:date "2020-01-03" :item "Socks" :region "South" :sales 20 :cost 19}
])
;; Sum the value columns per index key in a single reduce pass, then flatten
;; the index back into columns.
(time
 (let [index-cols [:item :date]
       value-cols [:sales :cost]]
   (->> data
        (map (fn [row]
               [(mapv row index-cols) (select-keys row value-cols)]))
        (reduce
         (fn [acc [index vals]]
           ;; (merge-with + nil vals) is vals, so the first row of a group seeds its total
           (update acc index (partial merge-with +) vals))
         ;; reset-index reads the column names from :keys; derive them from
         ;; index-cols so the two cannot drift apart
         (with-meta (sorted-map) {:keys index-cols}))
        (reset-index))))
;; Same aggregation as above, but the result is left as the indexed sorted map:
;; the reset-index step is disabled with the #_ discard reader macro.
(time
 (let [index-cols [:item :date]
       value-cols [:sales :cost]]
   (->> data
        (map (fn [row]
               [(mapv row index-cols) (select-keys row value-cols)]))
        (reduce
         (fn [acc [index vals]]
           (update acc index (partial merge-with +) vals))
         (with-meta (sorted-map) {:keys index-cols}))
        #_(reset-index))))
;; Aggregate by grouping first, then merging each group's value columns with agg-fn.
(time
 (let [index-cols [:item :date]
       value-cols [:sales :cost]
       agg-fn +]
   (->> data
        (group-by (apply juxt index-cols)) ; is this extra iteration through the list too costly?
        (map (fn [[k group]]
               [k (->> group
                       ;; keep only the value columns of every row in the group
                       (map #(select-keys % value-cols))
                       (apply merge-with agg-fn))]))
        (into (with-meta (sorted-map) {:keys index-cols}))
        (reset-index))))
;; Aggregate by grouping, summing the value columns positionally as vectors,
;; and naming the sums again before flattening the index.
(time
 (let [index-cols [:item :date]
       value-cols [:sales :cost]
       agg-fn +]
   (->> data
        (group-by (apply juxt index-cols)) ; is this extra iteration through the list too costly?
        (mapv (fn [[k group]]
                [k (->> group
                        (mapv (apply juxt value-cols))
                        ;; column-wise reduction over the group's value vectors
                        (apply mapv (fn sum [& col] (reduce agg-fn col)))
                        ;; reset-index merges each row as a map, so label the sums
                        (zipmap value-cols))]))
        ;; reset-index reads the index names from :keys (not :index)
        (into (with-meta (sorted-map) {:keys index-cols :columns value-cols}))
        (reset-index)
        #_(apply mapv (fn sum [& col] (reduce agg-fn col))))))
;; Column-wise sum of equal-length rows; evaluates to [11 22].
(apply mapv (fn [& col] (reduce + col)) [[1 2][10 20]])
(defn update-in-sorted
  "Variant of `update-in` that keeps sortedness when it has to create levels.

  Walks the key path `ks` through the nested associative structure `m` and
  replaces the value at the end of the path with `(apply f old-value args)`.
  When an intermediate level is missing, a sorted-map is created if the level
  that contains it is sorted; otherwise a plain map is created."
  ([m ks f & args]
   (letfn [(step [node [k & more]]
             (if more
               (let [child (or (get node k)
                               ;; a new level inherits sortedness from its parent
                               (when (sorted? node) (sorted-map)))]
                 (assoc node k (step child more)))
               (assoc node k (apply f (get node k) args))))]
     (step m ks))))
(defn assoc-in-sorted
  "Variant of `assoc-in` that keeps sortedness when it has to create levels.

  Associates `v` at the key path given by the second argument inside the
  nested associative structure `m` and returns the new structure. When an
  intermediate level is missing, a sorted-map is created if the level that
  contains it is sorted; otherwise a plain map is created."
  [m [k & ks] v]
  (if ks
    ;; Recurse into this function (not core assoc-in) so that every newly
    ;; created level, not just the second one, inherits sortedness.
    (assoc m k (assoc-in-sorted (or (get m k)
                                    (when (sorted? m) (sorted-map)))
                                ks v))
    (assoc m k v)))
;; Sort so equal [:item :date] rows are adjacent, split into runs of equal
;; keys, and emit a [[date item] total-sales] pair per run.
(let [group-key (juxt :item :date)]
  (->> data
       (sort-by group-key)
       (partition-by group-key)
       (map (fn [rows]
              [((juxt :date :item) (first rows))
               (reduce + (map :sales rows))]))))
;; Same sort/partition approach, but each result is a row map holding the
;; group's :item and :date plus its summed :sales.
(->> data
     (sort-by (juxt :item :date))
     ;; partition-by needs a function of the row, hence the #(...) literal
     (partition-by #(select-keys % [:item :date]))
     (map (fn [x] (-> (select-keys (first x) [:item :date])
                      (assoc :sales (transduce (map :sales) + x))))))
Redux
See Henry Garner's redux homepage.
TODO: check whether tesser does the same
;; The libspec must be quoted because `require` is a function.
(require '[redux.core :refer [facet fuse with-xform]])
;; total sales and cost
(defn data-facets
  "Sums each of `facets` (functions of a row, e.g. column keywords) over `data`
  by reducing with redux's `facet` around `+`, and returns a map pairing each
  facet with its total."
  [facets data]
  (let [totals (transduce identity (facet + facets) data)]
    (zipmap facets totals)))
;; Grand totals of :sales and :cost over all rows.
(data-facets [:sales :cost] data)
;; Rows ordered by date.
(sort-by :date data)
;; Rows bucketed by date.
(group-by :date data)
;; Totals of :sales and :cost per date, ordered by date.
(->> data
     (group-by :date)
     ;(vals)
     (map (fn [[date rows]] [date (data-facets [:sales :cost] rows)]))
     ;; Collect into a sorted map: sorting the pairs and pouring them into {}
     ;; only keeps the order while the result is small enough to be an array map.
     (into (sorted-map)))
;; Transpose the previous REPL result (*1) from {row-key {col-key val}} into
;; {col-key {row-key val}}.
(reduce (fn [acc [path v]] (assoc-in acc path v))
        {}
        (for [[row-key row] *1
              [col-key v] row]
          [[col-key row-key] v]))
;; Subtotal :sales and :cost per date, then transpose the result so the
;; outer key is the column and the inner key is the date.
(let [totals-by-date (into (sorted-map)
                           (map (fn [[date rows]]
                                  [date (data-facets [:sales :cost] rows)]))
                           (group-by :date data))]
  (reduce (fn [acc [path total]] (assoc-in acc path total))
          {}
          (for [[date totals] totals-by-date
                [col total] totals]
            [[col date] total])))
;; Running total of :sales per date, accumulated into a date-ordered map.
(reduce (fn [totals {:keys [date sales]}]
          (update totals date (fnil + 0) sales))
        (sorted-map)
        data)
(defn make-rf
  "Builds a reducing function that adds `(col row)` to the accumulator entry
  keyed by the row's :date, treating a missing entry as 0."
  [col]
  (let [add (fnil + 0)]
    (fn [acc row]
      (let [date (:date row)]
        (assoc acc date (add (get acc date) (col row)))))))
;; Total :sales per date using the generated reducing function.
(reduce (make-rf :sales) (sorted-map) data)
(defn rf-date-val
  "Reducing step over [date val] pairs: adds `val` to the accumulator entry
  for `date`, treating a missing entry as 0."
  [acc [date val]]
  (let [so-far (get acc date)]
    (assoc acc date (+ (if (nil? so-far) 0 so-far) val))))
;; Project every row to a [date sales] pair and total the pairs per date.
(reduce rf-date-val
        (sorted-map)
        (map (juxt :date :sales) data))
(defn tf-date-val
  "Reducing function, usable with `transduce`, that sums values by date.

  - 0 args: the initial accumulator, an empty sorted map.
  - 1 arg (completion): turns the date -> total map into two parallel
    sequences, the dates and their totals. For an empty accumulator it returns
    two empty vectors.
  - 2 args (step): adds `val` to the total for `date`, treating a missing
    total as 0."
  ([] (sorted-map))
  ([acc]
   ;; (apply map vector {}) would collapse to (map vector), i.e. a transducer
   ;; rather than data, so the empty case is handled explicitly.
   (if (seq acc)
     (apply map vector acc)
     [[] []]))
  ([acc [date val]] (update acc date (fnil + 0) val)))
;; Sum :sales per date; the completing arity of tf-date-val reshapes the
;; totals into a sequence of dates and a sequence of sums.
(transduce (map (juxt :date :sales)) tf-date-val data)
;; The same aggregation for :sales and :cost together via redux's facet;
;; each juxt builds the [date value] pair that tf-date-val consumes.
(transduce identity
(facet tf-date-val [(juxt :date :sales)
(juxt :date :cost)])
data)
;; Plotly figure with one trace per column: per-date totals of :sales and :cost.
(def plotly-data
  (let [cols [:sales :cost]]
    (->> data
         (transduce identity
                    (facet tf-date-val (map (partial juxt :date) cols)))
         ;; prefix each [dates values] result with its column name
         (map cons cols)
         (map (fn [[col dates values]] {:x dates :y values :name col}))
         ;; The viewer map is metadata (^) on the figure map. Without the ^ it
         ;; is a separate argument and assoc gets an odd number of key/values.
         (assoc ^{:nextjournal/viewer :plotly} {:layout {:title "Sum by type"}} :data))))
;; Same traces rendered inline, without a layout.
(let [cols [:sales :cost]]
  (->> data
       (transduce identity
                  (facet tf-date-val (map (partial juxt :date) cols)))
       (map cons cols)
       (map (fn [[col dates values]] {:x dates :y values :name col}))
       ;; the viewer map must be metadata (^) on the figure map, not an extra argument
       (assoc ^{:nextjournal/viewer :plotly} {} :data)))
Vega-lite
"https://vega.github.io/vega-lite/examples/"
See the Vega-Lite examples gallery linked above.
We can perform the aggregation in Vega-Lite itself. For total sales by region:
;; Line chart of summed sales per date, one line per region.
;; The viewer selection is metadata (^) on the spec map.
^{:nextjournal/viewer :vega-lite}
{"$schema" "https://vega.github.io/schema/vega-lite/v5.json",
 "title" "total sales by region"
 "data" {"values" data}
 "mark" "line"
 "encoding" {"x" {"field" "date", "type" "temporal"},
             "y" {"field" "sales", "aggregate" "sum" "type" "quantitative"},
             "color" {"field" "region", "type" "nominal"}}
 :width 500}
;; Line chart of summed sales per date, coloured by item, one row per region.
;; The viewer selection is metadata (^) on the spec map.
^{:nextjournal/viewer :vega-lite}
{;; same schema version as the other specs in this notebook
 "$schema" "https://vega.github.io/schema/vega-lite/v5.json"
 "title" "total sales by Item"
 "data" {"values" data}
 "mark" "line"
 "encoding" {:x {"field" "date", "type" "temporal"},
             :y {"field" "sales", "aggregate" "sum" "type" "quantitative"},
             :color {:field "item", :type "nominal"}
             :row {:field "region" :type "nominal" :title "Region"}}
 :width 500}
;; Layered chart built with :repeat, one layer each for :sales and :cost.
;; The viewer selection is metadata (^) on the spec map.
^{:nextjournal/viewer :vega-lite}
{"$schema" "https://vega.github.io/schema/vega-lite/v5.json"
 :title "Total sales and cost"
 :data {:values data}
 :repeat {:layer [:sales :cost]}
 :spec {:mark "line"
        :encoding {:x {:field "date", :type "temporal"},
                   :y {:field {:repeat :layer}, :aggregate "sum" :type "quantitative"}
                   :color {:datum {:repeat :layer}, :type "nominal"}}}
 :width 500}
;; Fold :sales and :cost into key/value rows; one chart row per region,
;; coloured by the folded column name ("Type").
;; The viewer selection is metadata (^) on the spec map.
^{:nextjournal/viewer :vega-lite}
{"$schema" "https://vega.github.io/schema/vega-lite/v5.json"
 :title "Total sales and cost"
 :data {:values data}
 :transform [{:fold [:sales :cost] :as ["Type"]}]
 :mark "line"
 :encoding {:x {:field "date", :type "temporal"}
            :y {:field :value :aggregate "sum" :type "quantitative"}
            :row {:field "region" :type "nominal"}
            :color {:field "Type" :type "nominal"}}
 :width 500}
;; Fold :sales and :cost into key/value rows; one chart row per folded column
;; name ("Type"), coloured by region.
;; The viewer selection is metadata (^) on the spec map.
^{:nextjournal/viewer :vega-lite}
{"$schema" "https://vega.github.io/schema/vega-lite/v5.json"
 :title "Total sales and cost"
 :data {:values data}
 :transform [{:fold [:sales :cost] :as ["Type"]}]
 :mark "line"
 :encoding {:x {:field "date", :type "temporal"}
            :y {:field :value :aggregate "sum" :type "quantitative"}
            :row {:field "Type" :type "nominal"}
            :color {:field "region" :type "nominal"}}
 :width 500}
;; Fold :sales and :cost with the default "key"/"value" output fields;
;; one chart row per region, coloured by the folded column name.
;; The viewer selection is metadata (^) on the spec map.
^{:nextjournal/viewer :vega-lite}
{"$schema" "https://vega.github.io/schema/vega-lite/v5.json"
 :title "Total sales and cost"
 :data {:values data}
 ;; :transform is a list of transforms, as in the other specs in this notebook
 :transform [{:fold [:sales :cost]}]
 ;:facet {:row {:field :region}}
 ;; NOTE(review): a top-level :spec normally accompanies :facet or :repeat;
 ;; with :facet commented out, confirm that this still renders.
 :spec {:mark "line"
        :encoding {:x {:field "date", :type "temporal"},
                   :y {:field "value", :aggregate "sum" :type "quantitative"}
                   :row {:field "region" :type "nominal"}
                   :color {:field "key" :type "nominal"}}}
 :width 500}
Plotly(.js)
;; Link to the Plotly.js documentation; the viewer selection is metadata (^)
;; on the hiccup vector.
^{:nextjournal/viewer :hiccup}
[:p [:a {:href "https://plotly.com/javascript/"} "Plotly.js examples"]]
;; One Plotly trace per column, each holding that column's per-date totals.
(def plotly-data
  (let [cols [:sales :cost]
        per-col-rf (facet tf-date-val (map (partial juxt :date) cols))
        summed (transduce identity per-col-rf data)]
    (map (fn [col [dates values]]
           {:x dates :y values :name col})
         cols
         summed)))
;; Render the traces; the viewer selection is metadata (^) on the figure map.
^{:nextjournal/viewer :plotly}
{:layout {:title "Totals"}
 :data plotly-data}
;; Hand-written example of three stacked subplots with titles placed as
;; paper-referenced annotations. The viewer selection is metadata (^) on the
;; figure map.
^{:nextjournal/viewer :plotly}
{:data [{:x [1 2 3] :y [1 3 2] :name "foo" :type "scatter" :mode "lines"}
        {:x [1 2 3] :y [2 1 3] :name "bar" :type "scatter" :mode "lines"
         :xaxis "x2" :yaxis "y2"}
        {:x [1 2 3] :y [10 6 7] :name "baz" :type "scatter" :mode "lines"
         :xaxis "x3" :yaxis "y3"}]
 :layout {:title "HI here!"
          :height 600
          :grid {:rows 3 :columns 1}
          ;; each y axis gets its own vertical band of the figure
          :yaxis {:domain [0.67 1]}
          :yaxis2 {:domain [0.33 0.57]}
          :yaxis3 {:domain [0 0.23]}
          ;; annotation y values match the top of each band above
          :annotations [{:x 0.5 :y 1
                         :xref "paper" :yref "paper"
                         :font {:size 16 :color "black"}
                         :showarrow false
                         ;; NOTE(review): Plotly's annotation attribute appears to be
                         ;; `valign`; confirm that :text-valign has any effect
                         :text-valign "bottom"
                         :text "me first"}
                        {:x 0.5 :y 0.57
                         :xref "paper" :yref "paper"
                         :font {:size 16 :color "black"}
                         :showarrow false
                         :text "me too"}
                        {:x 0.5 :y 0.23
                         :xref "paper" :yref "paper"
                         ;; NOTE(review): "darkpurple" does not look like a standard
                         ;; CSS colour name; confirm it renders as intended
                         :font {:size 16 :color "darkpurple"}
                         :showarrow false
                         :text "me three"}]}}
New Datastructures
Juxt blog on new datastructures
Subplots
Let's try to get plots by region
Note that vega(-lite) and d3 provide their own data manipulation functionality
(defn tf-date-val
  "Reducing function, usable with `transduce`, that sums a value under a
  nested key path.

  - 0 args: the initial accumulator, an empty sorted map.
  - 1 arg (completion): returns the accumulator unchanged.
  - 2 args (step): the item is a sequence whose first element is the amount
    and whose remaining elements are the key path; the amount is added at
    that path via `update-in-sorted`, treating a missing total as 0."
  ([] (sorted-map))
  ([acc] acc)
  ([acc [amount & path]]
   (let [add (fnil + 0)]
     (update-in-sorted acc path add amount))))
;; Per-column totals keyed by date, pretty-printed. The plotting steps are
;; disabled with the #_ discard reader macro.
(let [cols [:sales :cost]
      groups [:date]]
  (->> data
       (transduce identity
                  ;; for each column build a fn returning [value & group-keys]
                  (facet tf-date-val (map #(apply juxt (cons % groups)) cols)))
       ;(map cons cols)
       #_(map (fn [[col dates values]] {:x dates :y values :name col}))
       #_(assoc ^{:nextjournal/viewer :plotly} {:layout {:title "Sum by type"}} :data)
       pprint))
;; Per-column totals nested by region and then date, collected into a map
;; keyed by column and pretty-printed. The plotting steps are disabled with #_.
(let [cols [:sales :cost]
      groups [:region :date]]
  (->> data
       (transduce identity
                  ;; for each column build a fn returning [value & group-keys]
                  (facet tf-date-val (map #(apply juxt (cons % groups)) cols)))
       (map vector cols)
       (into {})
       #_(map (fn [[col dates values]] {:x dates :y values :name col}))
       #_(assoc ^{:nextjournal/viewer :plotly} {:layout {:title "Sum by type"}} :data)
       pprint))
;; One subplot per column (sales, cost); within each subplot one line per region.
(let [cols [:sales :cost]
      groups [:region :date]
      colors {"North" "rgb(0,0,255)" "South" "rgb(255,0,0)"}
      ;; turns [col {region {date total}}] into one trace per region
      data-lines (fn [[col regions]]
                   (for [[region dt-val] regions]
                     (let [[dates vals] (apply map vector dt-val)]
                       {:x dates :y vals :name region
                        :line {:color (colors region)}})))]
  (->> data
       (transduce identity
                  ;; for each column build a fn returning [value & group-keys]
                  (facet tf-date-val (map #(apply juxt (cons % groups)) cols)))
       (map vector cols)
       ;(into {})
       #_(map (fn [[col dates values]] {:x dates :y values :name col}))
       #_(assoc ^{:nextjournal/viewer :plotly} {:layout {:title "Sum by type"}} :data)
       (map data-lines)
       ;(mapcat (fn subplots [[xaxis lines]](map #(assoc % :xasis xaxis) lines))) ["x" "x2" "x3"])
       ;; pair each column's traces with an axis suffix; only the first
       ;; subplot shows legend entries
       (mapcat (fn subplots [subplot lines]
                 (map #(assoc %
                              :xaxis (str "x" subplot)
                              :yaxis (str "y" subplot)
                              :showlegend (= subplot "")) lines))
               ["" "2" "3"])
       ;; the viewer map is metadata (^) on the figure map
       (assoc ^{:nextjournal/viewer :plotly}
              {:layout {;:grid {:rows 2 :columns 1}
                        :yaxis {:domain [0 0.4]}
                        :yaxis2 {:domain [0.5 0.9]}
                        :annotations {}
                        :height (* 2 300)
                        :title "One subplot per column (sales or cost)"}}
              :data)
       ;; requires extra work still to create subplot titles as annotations
       ;pprint
       ))
;; Same subplot construction using a layout grid; the resulting figure map is
;; pretty-printed.
(let [cols [:sales :cost]
      groups [:region :date]
      colors {"North" "rgb(0,0,255)" "South" "rgb(255,0,0)"}
      ;; turns [col {region {date total}}] into one trace per region
      data-lines (fn [[col regions]]
                   (for [[region dt-val] regions]
                     (let [[dates vals] (apply map vector dt-val)]
                       {:x dates :y vals :name region
                        :line {:color (colors region)}})))]
  (->> data
       (transduce identity
                  ;; for each column build a fn returning [value & group-keys]
                  (facet tf-date-val (map #(apply juxt (cons % groups)) cols)))
       (map vector cols)
       ;(into {})
       #_(map (fn [[col dates values]] {:x dates :y values :name col}))
       #_(assoc ^{:nextjournal/viewer :plotly} {:layout {:title "Sum by type"}} :data)
       (map data-lines)
       ;(mapcat (fn subplots [[xaxis lines]](map #(assoc % :xasis xaxis) lines))) ["x" "x2" "x3"])
       ;; pair each column's traces with an axis suffix
       (mapcat (fn subplots [subplot lines]
                 (map #(assoc % :xaxis (str "x" subplot) :yaxis (str "y" subplot)) lines))
               ["" "2" "3"])
       ;; the viewer map is metadata (^) on the figure map
       (assoc ^{:nextjournal/viewer :plotly}
              {:layout {:grid {:rows 2 :columns 1}
                        :title "One subplot per column (sales or cost)"}}
              :data)
       pprint
       ))
;; Per-column totals nested by region, item and date, pretty-printed.
;; The plotting steps are disabled with the #_ discard reader macro.
(let [cols [:sales :cost]
      groups [:region :item :date]]
  (->> data
       (transduce identity
                  ;; for each column build a fn returning [value & group-keys]
                  (facet tf-date-val (map #(apply juxt (cons % groups)) cols)))
       (map vector cols)
       #_(map (fn [[col dates values]] {:x dates :y values :name col}))
       #_(assoc ^{:nextjournal/viewer :plotly} {:layout {:title "Sum by type"}} :data)
       pprint))
;; Core assoc-in starting from a sorted map: the nested maps it creates are
;; ordinary maps, so only the top level is sorted.
(-> (sorted-map)
(assoc-in [:a :b] 1)
(assoc-in [:c :x] 4)
(assoc-in [:c :a] 3)
(assoc-in [:b :c] 2))
Improve sorted aggregation
`assoc-in` and `update-in` are fine, but when starting with a sorted-map and adding layers they do not create sorted-maps for the new layers (i.e. only the first level remains sorted).
Let's fix that:
;; The same insertions through assoc-in-sorted, pretty-printed for comparison.
(-> (sorted-map)
(assoc-in-sorted [:a :b] 1)
(assoc-in-sorted [:c :x] 4)
(assoc-in-sorted [:c :a] 3)
(assoc-in-sorted [:b :c] 2)
pprint)