Alan / Feb 19 2019
Remix of Clojure by Nextjournal
Data Science Tutorial for Beginners
Data Science Tutorial for Beginners
;; Dependency map for this notebook's Clojure runtime: Clojure itself, data.csv,
;; fastmath, huri, math.combinatorics and tools.deps.alpha (fetched from git).
;; NOTE(review): fastmath and huri are pinned to -SNAPSHOT versions, which can
;; change between resolutions; consider pinning released versions for
;; reproducibility (confirm which releases are available).
{:deps {org.clojure/clojure {:mvn/version "1.10.0"} org.clojure/data.csv {:mvn/version "0.1.4"} generateme/fastmath {:mvn/version "1.3.0-SNAPSHOT"} huri {:mvn/version "0.10.0-SNAPSHOT"} org.clojure/math.combinatorics {:mvn/version "0.1.4"} org.clojure/tools.deps.alpha {:git/url "https://github.com/clojure/tools.deps.alpha.git" :sha "f6c080bd0049211021ea59e516d1785b08302515"}}}
deps.edn
Extensible Data Notation
;; Refer the public vars of clojure.tools.deps.alpha.repl into the current
;; namespace, then report the version of the running Clojure.
(use 'clojure.tools.deps.alpha.repl) (clojure-version)
"1.10.0"
# Show the entries of the working directory whose names contain "csv".
ls | grep csv
;; Read the whole of combats.csv into one string to eyeball its raw contents.
(slurp "combats.csv")
2["First_pokemon,Second_pokemon,Winner
266,298,298
702,701,701
191,668,668
237,683,683
151,231,151
657,752,657
192,134,134
73,545,545
220,763,763
302,31,31
442,130,130
701,624,701
15,283,283
151,87,151
269,462,269
763,448,448
143,263,263
365,240,240
499,774,499
563,578,563
440,687,687
373,441,441
654,671,671
121,355,121
609,649,649
270,708,270
460,568,568
626,767,767
139,451,139
694,747,694
455,549,549
181,301,301
208,184,184
536,365,536
260,381,381
471,269,269
621,287,287
406,134,406
592,503,503
564,242,242
579,242,242
200,719,200
376,135,376
76,441,441
667,645,645
475,400,475
3,737,737
435,45,435
231,146,146
510,715,715
456,260,260
164,751,751
424,754,424
258,779,258
360,208,208
723,646,646
384,533,533
734,673,673
141,721,141
244,741,244
509,217,509
633,44,633
595,719,595
606,40,606
368,598,368
746,584,746
350,187,350
303,198,303
169,701,701
569,115,115
231,172,172
71,440,71
659,610,610
570,557,557
712,40,712
451,352,451
661,95,95
540,4,540
130,319,130
125,418,418
309,276,276
579,612,612
716,80,80
482,535,535
376,37,37
6,328,328
343,159,159
509,574,574
546,530,546
486,229,229
686,213,213
304,386,386
514,732,732
161,757,161
150,265,265
436,701,701
221,265,265
436,603,603
242,365,242
792,251,792
607,295,607
439,209,209
657,360,657
436,40,40
325,408,408
472,89,472
71,777,71
47,615,47
5,670,5
793,314,314
195,461,195
493,568,493
400,341,341
63,297,63
541,446,541
141,152,141
54,115,115
378,161,161
224,316,316
311,398,398
298,35,298
356,132,356
103,370,103
503,453,503
600,598,600
445,45,445
360,548,548
303,133,133
51,331,51
171,559,171
758,129,758
6,255,6
698,154,154
92,491,491
381,73,381
354,271,271
219,49,219
736,244,244
683,245,245
533,452,533
200,588,588
403,636,403
56,561,561
54,119,119
629,141,629
547,779,547
380,547,547
407,307,307
153,559,153
306,268,306
89,382,89
54,32,32
172,745,745
16,110,110
537,734,537
278,94,278
342,643,643
435,165,165
638,691,691
734,718,718
298,781,298
42,113,42
97,316,97
416,397,397
483,400,483
275,155,155
154,461,154
309,4,4
539,30,30
545,266,545
283,340,340
232,254,254
384,427,427
161,781,161
496,420,496
268,159,159
473,593,473
550,475,550
199,785,785
385,732,732
488,196,488
30,587,30
36,494,494
513,184,184
129,106,129
6,604,6
137,315,137
83,509,509
585,534,585
743,57,57
220,723,723
764,665,764
708,439,708
28,545,545
23,796,23
698,147,147
306,635,635
158,666,158
55,750,750
513,439,513
769,264,264
232,499,499
23,623,23
97,250,250
324,544,544
517,10,517
586,461,461
779,349,349
669,226,226
623,53,623
538,718,718
438,242,438
523,759,523
14,103,103
378,152,152
478,361,478
137,698,137
354,739,739
233,684,684
291,605,291
96,264,264
730,137,137
129,744,129
611,107,611
58,560,560
658,146,658
625,753,625
524,27,524
794,474,794
365,391,391
592,585,592
512,586,512
526,424,424
70,456,70
540,288,540
347,389,347
689,128,128
211,701,701
336,385,336
594,80,80
785,490,490
425,308,425
27,731,731
459,399,459
421,494,421
578,768,768
63,82,63
257,221,257
4,150,4
546,333,546
308,544,544
213,442,213
575,302,302
209,645,209
92,768,92
116,672,116
132,69,69
568,396,568
644,218,644
321,163,163
17,164,164
715,190,715
51,57,51
237,448,448
580,676,580
789,726,726
420,718,420
83,499,499
526,159,159
115,56,56
358,636,358
54,532,532
534,798,534
768,179,768
94,724,724
795,437,795
313,432,432
305,251,305
269,289,269
593,733,733
568,428,428
290,287,287
762,733,762
217,100,100
668,87,668
165,49,165
123,771,771
261,154,154
225,640,640
313,103,103
106,505,505
453,614,614
107,48,48
722,590,590
559,431,431
141,727,727
490,239,490
490,340,340
373,128,128
3,730,3
502,91,502
714,104,714
788,276,276
459,16,16
432,143,432
724,2,724
648,533,648
250,357,250
155,439,155
447,20,20
366,371,366
337,770,337
358,765,358
198,110,198
152,760,760
164,577,164
174,646,646
438,59,59
614,443,614
789,127,127
504,421,421
565,214,214
158,635,635
386,21,21
408,404,404
349,440,349
410,295,410
83,569,569
249,411,249
173,609,173
770,572,572
51,473,473
452,106,106
550,769,550
525,740,525
464,762,762
147,798,798
572,45,572
569,741,569
193,28,28
247,445,247
511,280,280
435,44,44
794,359,794
196,758,758
470,410,410
631,164,164
451,790,451
211,442,442
773,582,773
403,59,59
401,413,413
228,679,679
753,196,196
578,434,434
210,40,40
38,716,716
18,458,458
538,742,538
702,627,702
403,412,403
517,136,136
465,89,465
505,502,505
601,644,644
459,84,84
548,525,525
707,521,521
14,62,62
728,501,728
182,246,246
741,117,741
29,431,431
719,574,574
209,477,477
492,37,37
425,43,425
207,284,284
749,512,512
539,153,153
152,159,159
738,18,738
182,579,579
69,302,69
173,713,713
488,199,488
683,506,683
2,619,2
735,255,735
70,800,70
120,67,67
738,610,610
68,397,397
540,714,540
184,197,184
577,58,58
750,29,29
391,640,391
211,737,211
462,694,694
337,147,147
459,644,644
615,652,615
786,180,786
247,594,594
536,720,536
729,586,586
715,80,715
516,740,516
440,525,525
174,410,410
634,424,424
252,797,797
317,294,317
463,790,790
770,7,7
110,783,110
454,469,469
241,731,241
688,190,688
271,21,271
738,789,738
476,223,476
574,283,574
621,551,551
723,617,723
1,679,679
595,131,595
433,339,339
247,452,247
19,709,709
410,278,410
103,142,103
37,337,337
453,29,453
73,113,113
374,87,374
634,784,784
681,790,681
181,765,181
584,498,498
586,590,590
732,1,732
478,524,478
734,19,19
634,348,634
567,755,567
354,82,354
417,783,417
420,85,420
610,215,610
350,160,350
122,588,588
159,77,159
398,598,398
300,704,704
508,293,508
763,685,685
428,203,428
509,199,509
106,592,106
550,394,550
310,48,48
80,314,314
463,203,463
484,9,9
700,49,700
437,533,533
73,485,73
627,314,314
216,657,216
126,617,126
632,707,707
692,331,692
11,309,309
483,22,483
665,170,170
540,337,540
39,299,299
761,762,762
441,1,441
434,137,137
700,433,700
52,379,379
158,171,158
769,374,769
789,690,690
393,255,255
307,741,307
528,492,528
212,24,24
99,173,99
58,253,58
226,289,226
365,609,365
409,532,409
647,294,647
15,756,756
196,774,196
265,68,265
564,749,749
750,248,248
142,107,142
427,404,427
454,302,302
323,574,574
262,67,67
710,434,710
291,666,291
233,445,233
126,494,494
252,108,108
69,212,212
120,342,120
224,173,173
695,94,94
672,355,672
409,254,409
195,607,607
528,327,327
127,81,127
698,336,336
422,705,705
432,397,432
409,376,409
582,424,424
521,268,521
304,708,708
385,492,492
560,70,560
545,149,149
68,362,362
672,732,732
632,14,632
641,341,341
365,428,428
594,425,425
764,529,764
397,161,397
111,670,670
219,738,219
408,647,647
516,107,107
622,736,736
212,128,212
685,648,648
406,420,420
501,689,689
533,211,533
413,580,413
74,277,277
460,713,713
450,8,8
376,196,196
292,510,510
282,186,282
384,699,699
295,741,741
346,790,346
269,187,269
45,323,323
89,778,778
151,238,151
774,667,774
266,301,301
602,375,375
31,490,490
92,182,92
588,390,588
476,434,476
360,426,426
101,21,21
267,182,267
245,72,72
10,17,17
110,425,425
244,177,177
209,465,465
778,760,778
192,721,192
340,110,110
555,443,443
538,82,538
123,434,123
597,627,597
409,56,409
458,301,301
65,644,65
584,319,319
262,210,210
116,492,116
75,480,480
38,302,302
750,656,656
453,174,453
375,509,375
791,203,791
617,266,617
480,541,541
425,750,425
325,276,276
385,157,157
331,648,648
566,34,34
86,502,502
214,435,214
551,778,551
300,354,300
438,630,438
209,397,209
582,666,582
52,780,780
492,417,417
519,581,581
11,771,771
725,340,340
286,313,313
79,633,633
435,83,435
565,55,55
313,754,313
247,685,247
698,215,698
76,567,567
451,21,451
132,473,132
500,215,500
4,13,4
111,188,111
39,519,519
129,303,129
306,39,306
752,595,595
699,458,699
708,456,708
102,304,102
399,239,239
37,13,13
498,282,498
649,489,649
535,189,535
147,189,147
107,32,32
688,350,350
559,540,540
28,44,44
277,168,168
285,328,328
650,652,650
638,393,393
365,233,233
215,290,215
576,231,576
335,333,335
79,392,79
181,41,181
136,546,136
208,685,685
634,480,480
481,690,481
262,47,47
87,59,59
637,695,695
712,74,712
696,274,274
485,101,101
484,661,661
721,631,631
354,257,257
485,605,605
27,798,798
604,131,131
336,532,532
298,179,179
297,394,394
260,358,260
429,475,429
546,416,546
174,224,174
65,173,65
793,247,793
530,437,437
154,650,154
627,150,150
47,334,334
146,660,660
348,105,348
712,575,712
602,728,728
523,461,523
347,526,526
579,373,579
2,214,214
10,596,596
531,762,531
430,516,430
305,400,305
158,258,158
528,38,528
54,600,600
424,728,424
232,333,232
162,114,162
59,118,59
632,742,742
488,708,708
180,690,690
129,423,423
162,439,162
592,267,267
727,219,727
707,764,707
537,525,537
798,672,672
6,47,6
421,225,421
703,732,732
743,248,248
773,454,773
286,680,286
541,302,541
210,686,686
503,62,503
423,734,423
280,524,280
116,681,116
365,680,680
330,664,664
462,463,463
242,325,242
131,340,340
713,794,794
234,667,234
270,772,772
200,393,393
181,351,181
546,350,350
665,39,39
475,402,475
383,722,383
401,580,580
123,609,123
502,123,502
98,537,537
565,716,716
512,100,512
240,93,93
646,468,646
89,470,89
438,284,438
125,31,125
570,222,570
350,512,512
643,42,643
560,316,316
715,343,715
126,622,126
278,184,184
604,130,130
558,573,573
411,220,411
132,181,132
592,748,748
709,260,709
515,598,598
576,530,576
512,205,512
268,458,268
633,113,633
368,101,101
371,61,61
248,339,339
256,30,30
691,593,691
118,349,349
216,30,216
540,545,540
664,13,13
292,52,52
120,256,256
178,456,178
148,343,343
693,68,68
547,719,547
93,738,93
664,398,398
347,589,589
222,568,568
349,425,349
166,559,166
50,306,306
361,716,716
226,509,226
2,594,2
50,134,134
459,111,111
667,96,667
260,215,215
380,694,694
496,485,496
310,153,153
349,663,349
516,130,516
783,681,783
657,355,355
549,100,549
462,690,690
112,465,465
117,370,370
549,54,549
478,593,478
497,280,280
766,591,766
677,17,677
251,259,251
53,494,494
746,494,494
475,723,475
305,238,305
288,524,288
751,45,751
291,470,291
154,325,154
638,359,638
604,233,233
19,369,19
526,203,526
129,695,129
215,719,719
518,118,518
127,287,127
559,214,214
762,86,762
557,501,557
282,605,282
307,149,307
425,390,425
612,731,731
387,311,311
143,116,116
340,268,340
14,162,162
570,492,570
549,127,549
484,212,212
524,222,524
400,151,151
375,255,255
738,176,738
116,490,490
2,394,394
369,505,505
283,254,254
545,241,545
49,268,268
522,177,522
699,225,225
160,398,398
579,691,69",{1}]
(require '[clojure.data.csv :as csv] '[clojure.java.io :as io] '[fastmath.stats :as stats])
(defn csv-reader
  "Reads the CSV file at `filename` and returns all of its rows, exactly as
  produced by `csv/read-csv`, as a fully realized sequence. The rows are
  forced with `doall` before `with-open` closes the reader, so the result is
  safe to use after this function returns."
  [filename]
  (with-open [reader (io/reader filename)]
    (doall (csv/read-csv reader))))

(defn strip
  "Returns the characters of `coll` joined into a string, with every
  character that occurs in `chars` removed."
  [coll chars]
  ;; Build the lookup set once rather than once per character of `coll`.
  (let [unwanted (set chars)]
    (apply str (remove unwanted coll))))

(defn data->maps
  "Converts parsed CSV `data`, whose first row is the header, into a realized
  sequence of maps: one map per remaining row, keyed by the header cells.
  Each header cell has spaces, periods and commas stripped and is then turned
  into a keyword."
  [data]
  (let [header (->> (first data)
                    (map #(strip % " .,"))
                    (map keyword))]
    (doall (map #(zipmap header %) (rest data)))))

(defn csv->maps
  "Reads the CSV file at `filename` and returns its rows as a sequence of
  maps keyed by the cleaned-up header keywords (see `data->maps`)."
  [filename]
  (data->maps (csv-reader filename)))
user/csv->maps
;; Load pokemon.csv as a sequence of maps keyed by the cleaned-up header keywords.
(def pokemon (csv->maps "pokemon.csv"))
user/pokemon
;; Inspect the first row; at this point every value is still a string.
(first pokemon)
12{:SpDef"65":Type1"Grass":Attack"49":Speed"45":Type2"Poison":SpAtk"65":#"1":Defense"49":Name"Bulbasaur":Generation"1":HP"45":Legendary"False"}
;; Re-bind `pokemon` with the numeric columns parsed from strings into
;; integers; all other columns are carried over untouched.
(def pokemon
  (let [int-keys   [:SpDef :Attack :Speed :SpAtk :# :Defense :Generation :HP]
        ;; Returns a map holding only the numeric columns of `row`, parsed.
        parse-ints (fn [row]
                     (reduce-kv (fn [acc k v]
                                  (assoc acc k (Integer/parseInt v)))
                                {}
                                (select-keys row int-keys)))]
    (map (fn [row] (merge row (parse-ints row)))
         pokemon)))
user/pokemon
;; Inspect the first row again to confirm the numeric columns are now integers.
(first pokemon)
12{:SpDef65:Type1"Grass":Attack49:Speed45:Type2"Poison":SpAtk65:#1:Defense49:Name"Bulbasaur":Generation1:HP45:Legendary"False"}
;; `nilify` refers to clojure.string/blank? by its fully qualified name, so
;; load the namespace explicitly instead of relying on some other library
;; having loaded it first.
(require 'clojure.string)

(defn nilify
  "Returns nil when `v` is a blank string (nothing but whitespace, or empty);
  returns `v` unchanged otherwise, including every non-string value."
  [v]
  (if (and (string? v) (clojure.string/blank? v))
    nil
    v))
user/nilify
(defn upvals
  "Returns a map with the same keys as `m` in which every value `v` has been
  replaced by `(f v)`."
  [m f]
  (reduce-kv (fn [acc key value]
               (assoc acc key (f value)))
             {}
             m))

;; Replace blank string values with nil in every row.
(def pokemon
  (map #(upvals % nilify) pokemon))
user/pokemon
(require '[huri.core :as h])
;; Size of the dataset. The result below is [800 12]; presumably [rows columns],
;; since 12 matches the number of columns listed by h/cols.
(h/size pokemon)
2[800,12]
;; List the column keys of the dataset.
(h/cols pokemon)
12(:SpDef,:Type1,:Attack,:Speed,:Type2,:SpAtk,:#,:Defense,:Name,:Generation,:HP,:Legendary)
(defn nilcounter
  "Maps every column of `df` (as listed by `h/cols`) to the entry that
  `h/distribution` reports for nil in that column; the value is nil when the
  column's distribution has no nil entry.
  NOTE(review): judging by the `h/distribution` result shown elsewhere in this
  notebook, the values are relative frequencies rather than absolute counts,
  despite the function's name - confirm."
  [df]
  (let [columns (h/cols df)]
    (zipmap columns
            (for [column columns]
              ((h/distribution column df) nil)))))
user/nilcounter
;; clojure.pprint is referenced by its fully qualified name below; load it
;; explicitly rather than relying on the REPL having loaded it already.
(require 'clojure.pprint)

;; Print the per-column nil summary as a one-row table.
(clojure.pprint/print-table [(nilcounter pokemon)])
(require '[clojure.math.combinatorics :as combinatorics])
(defn cross-corr
  "Returns one map per ordered pair of columns drawn from `numcols` (pairs of
  a column with itself included), shaped as {:keys [a b] :corr c}.
  For a column paired with itself `c` is the literal 1. Otherwise the two
  columns are selected from every row of `df`, turned column-oriented with
  `h/col-oriented`, and their value sequences passed to `stats/correlation`."
  [df numcols]
  (for [a numcols
        b numcols]
    {:keys [a b]
     :corr (if (= a b)
             1
             (->> df
                  (map #(select-keys % [a b]))
                  h/col-oriented
                  vals
                  (apply stats/correlation)))}))
user/cross-corr
(defn heatmap
  "Builds a Plotly figure map containing a single heatmap trace with the given
  `x`, `y` and `z` values, a two-stop green colour scale, and a fixed
  600x600 layout."
  [x y z]
  (let [green-scale [[0 "#24ff24"] [1 "#004b00"]]
        trace       {:type       "heatmap"
                     :x          x
                     :y          y
                     :z          z
                     :colorscale green-scale}
        layout      {:autosize false
                     :width    600
                     :height   600}]
    {:data   [trace]
     :layout layout}))
user/heatmap
;; The columns treated as numeric in the correlation analysis below.
(def numcols [:SpDef :Attack :Speed :SpAtk :# :Defense :Generation :HP])
user/numcols
;; Correlation heatmap over the numeric columns. `cross-corr` yields one entry
;; per ordered pair of columns, so the flat :corr sequence is cut into rows of
;; (count numcols) values. Deriving the row length from `numcols` keeps the
;; matrix square if the column list ever changes.
(heatmap numcols
         numcols
         (->> (cross-corr pokemon numcols)
              h/col-oriented
              :corr
              (partition (count numcols))))
2{:data[1]:layout{3}}
(.plot js/Nextjournal plotly↩)
;; Plotly figure: the :Speed and :Defense columns plotted against the :# column
;; as two semi-transparent scatter traces.
(let [ids   (map :# pokemon)
      trace (fn [column label]
              {:x       ids
               :y       (map column pokemon)
               :type    "scatter"
               :name    label
               :opacity 0.6})]
  {:data   [(trace :Speed "Speed")
            (trace :Defense "Defense")]
   :layout {:autosize false
            :width    600
            :height   600}})
2{:data[2]:layout{3}}
(.plot js/Nextjournal speed-defense↩)
(defn scatter
  "Builds a Plotly figure map with one markers-only scatter trace: `kx` applied
  to every row of `df` gives the x values, `ky` the y values. The layout is a
  fixed 600x600."
  [df kx ky]
  (let [xs    (map kx df)
        ys    (map ky df)
        trace {:type "scatter"
               :mode "markers"
               :x    xs
               :y    ys}]
    {:data   [trace]
     :layout {:autosize false
              :width    600
              :height   600}}))
user/scatter
;; Scatter figure of :Attack (x) against :Defense (y) over all rows.
(scatter pokemon :Attack :Defense)
2{:data[1]:layout{3}}
(.plot js/Nextjournal attkdfs↩)
(defn hist
  "Builds a Plotly figure map with one histogram trace whose x values are `kx`
  applied to every row of `df`. The layout is a fixed 600x600."
  [df kx]
  (let [values (map kx df)
        trace  {:type "histogram"
                :x    values}]
    {:data   [trace]
     :layout {:autosize false
              :width    600
              :height   600}}))
user/hist
;; Histogram figure of the :Speed column.
(hist pokemon :Speed)
2{:data[1]:layout{3}}
(.plot js/Nextjournal histspeed↩)
;; Roll the rows up by :Type1 with `count`; per the result below this gives the
;; number of rows for each :Type1 value.
(h/rollup :Type1 count pokemon)
18{"Poison"28"Grass"70"Electric"44"Fighting"27"Ice"24"Ground"32"Fairy"17"Fire"52"Dark"31"Water"112"Bug"69"Rock"44"Flying"4"Ghost"32"Dragon"32"Normal"98"Psychic"57"Steel"27}
;; Plotly figure: one box trace of :Attack per :Legendary value. The rollup is
;; looked up under the string keys "False" and "True".
(let [attack-by-legendary (h/rollup :Legendary identity :Attack pokemon)
      box-trace           (fn [label]
                            {:y    (attack-by-legendary label)
                             :type "box"
                             :name label})]
  {:data   [(box-trace "False")
            (box-trace "True")]
   :layout {:autosize false
            :width    600
            :height   600}})
2{:data[2]:layout{3}}
(.plot js/Nextjournal box↩)
;; Runtime type of each value in the first row.
(map type (vals (first pokemon)))
12(java.lang.Integer,java.lang.String,java.lang.Integer,java.lang.Integer,java.lang.String,java.lang.Integer,java.lang.Integer,java.lang.Integer,java.lang.String,java.lang.Integer,java.lang.Integer,java.lang.String)
;; Index the rows by :Name, keeping only the :Name, :Attack and :Defense
;; entries of each row. Rows whose :Name is nil end up under the nil key.
(group-by :Name (map #(select-keys % [:Name :Attack :Defense]) pokemon))
800{nil[1]"Darkrai"[1]"Uxie"[1]"Arbok"[1]"Azurill"[1]"Pawniard"[1]"Gourgeist Small Size"[1]"Fearow"[1]"Exeggcute"[1]"Kyurem Black Kyurem"[1]"Pumpkaboo Super Size"[1]"Altaria"[1]"Venomoth"[1]"Chansey"[1]"Mandibuzz"[1]"Nincada"[1]"Makuhita"[1]"Dedenne"[1]"Pidgeot"[1]"Seadra"[1]780 more…}
;; Rows that have a non-nil :Type2.
(def pokemon-clean
  (filter (comp some? :Type2) pokemon))
user/pokemon-clean
;; Distribution of :Type2 among the rows that have one; per the result below
;; the values are fractions of the total.
(h/distribution :Type2 pokemon-clean)
18{"Poison"0.0821256038647343"Grass"0.060386473429951695"Electric"0.014492753623188408"Fighting"0.06280193236714976"Ice"0.03381642512077295"Ground"0.08454106280193238"Fairy"0.055555555555555566"Fire"0.028985507246376815"Dark"0.04830917874396136"Water"0.03381642512077295"Bug"0.007246376811594204"Rock"0.03381642512077295"Flying"0.2342995169082126"Ghost"0.03381642512077295"Dragon"0.04347826086956522"Normal"0.009661835748792272"Psychic"0.07971014492753624"Steel"0.053140096618357495}
(defn fill-nil
  "Returns `row` with `sub` stored under key `k` when the current value for
  `k` is nil (the key being absent counts as nil); otherwise returns `row`
  unchanged. Only nil is replaced - a `false` value is kept.
  The lookup goes through `get`, so `k` may be any kind of map key (keyword,
  string, number, ...), not only one that can be called as a function."
  [row k sub]
  (if (nil? (get row k))
    (assoc row k sub)
    row))
user/fill-nil
;; Re-bind `pokemon` with every nil :Type2 replaced by the string "empty".
(def pokemon (map #(fill-nil % :Type2 "empty") pokemon))
user/pokemon
;; Plotly figure with three stacked panels (:Attack, :Defense, :Speed), each
;; plotted against the :# column. The second and third traces are assigned to
;; their own x/y axes so the 3x1 grid gives every trace an independent panel.
(let [ids   (map :# pokemon)
      trace (fn [column label axes]
              (merge {:y    (map column pokemon)
                      :x    ids
                      :type "scatter"
                      :name label}
                     axes))]
  {:data   [(trace :Attack "Attack" {})
            (trace :Defense "Defense" {:xaxis "x2" :yaxis "y2"})
            (trace :Speed "Speed" {:xaxis "x3" :yaxis "y3"})]
   :layout {:autosize false
            :width    600
            :height   600
            :grid     {:rows     3
                       :columns  1
                       :pattern  "independent"
                       :roworder "bottom to top"}}})
2{:data[3]:layout{4}}
(.plot js/Nextjournal multiline↩)
;; Same :Speed histogram figure, with {:enabled true} set under :cumulative on
;; its first (and only) trace.
(assoc-in (hist pokemon :Speed) [:data 0 :cumulative] {:enabled true})
2{:data[1]:layout{3}}
(.plot js/Nextjournal histcum↩)
;; :HP value of the first row.
(->> pokemon first :HP)