Alan / Feb 19 2019
Remix of Clojure by Nextjournal

Data Science Tutorial for Beginners

;; Dependency map for this notebook (contents of deps.edn).
;; Maven artifacts are pinned by :mvn/version; fastmath and huri are pinned to
;; SNAPSHOT versions, so the resolved code may change between runs.
;; tools.deps.alpha is taken from git at a fixed :sha; the notebook loads its
;; `clojure.tools.deps.alpha.repl` namespace in a later cell.
{:deps
 {org.clojure/clojure {:mvn/version "1.10.0"}
  org.clojure/data.csv {:mvn/version "0.1.4"}
  generateme/fastmath {:mvn/version "1.3.0-SNAPSHOT"}
  huri {:mvn/version "0.10.0-SNAPSHOT"}
  org.clojure/math.combinatorics {:mvn/version "0.1.4"}
  org.clojure/tools.deps.alpha
  {:git/url "https://github.com/clojure/tools.deps.alpha.git"
   :sha "f6c080bd0049211021ea59e516d1785b08302515"}}}
deps.edn
Extensible Data Notation
combats.csv
pokemon.csv
tests.csv
(use 'clojure.tools.deps.alpha.repl)
(clojure-version)
"1.10.0"
ls | grep csv
(slurp "combats.csv")
2["First_pokemon,Second_pokemon,Winner 266,298,298 702,701,701 191,668,668 237,683,683 151,231,151 657,752,657 192,134,134 73,545,545 220,763,763 302,31,31 442,130,130 701,624,701 15,283,283 151,87,151 269,462,269 763,448,448 143,263,263 365,240,240 499,774,499 563,578,563 440,687,687 373,441,441 654,671,671 121,355,121 609,649,649 270,708,270 460,568,568 626,767,767 139,451,139 694,747,694 455,549,549 181,301,301 208,184,184 536,365,536 260,381,381 471,269,269 621,287,287 406,134,406 592,503,503 564,242,242 579,242,242 200,719,200 376,135,376 76,441,441 667,645,645 475,400,475 3,737,737 435,45,435 231,146,146 510,715,715 456,260,260 164,751,751 424,754,424 258,779,258 360,208,208 723,646,646 384,533,533 734,673,673 141,721,141 244,741,244 509,217,509 633,44,633 595,719,595 606,40,606 368,598,368 746,584,746 350,187,350 303,198,303 169,701,701 569,115,115 231,172,172 71,440,71 659,610,610 570,557,557 712,40,712 451,352,451 661,95,95 540,4,540 130,319,130 125,418,418 309,276,276 579,612,612 716,80,80 482,535,535 376,37,37 6,328,328 343,159,159 509,574,574 546,530,546 486,229,229 686,213,213 304,386,386 514,732,732 161,757,161 150,265,265 436,701,701 221,265,265 436,603,603 242,365,242 792,251,792 607,295,607 439,209,209 657,360,657 436,40,40 325,408,408 472,89,472 71,777,71 47,615,47 5,670,5 793,314,314 195,461,195 493,568,493 400,341,341 63,297,63 541,446,541 141,152,141 54,115,115 378,161,161 224,316,316 311,398,398 298,35,298 356,132,356 103,370,103 503,453,503 600,598,600 445,45,445 360,548,548 303,133,133 51,331,51 171,559,171 758,129,758 6,255,6 698,154,154 92,491,491 381,73,381 354,271,271 219,49,219 736,244,244 683,245,245 533,452,533 200,588,588 403,636,403 56,561,561 54,119,119 629,141,629 547,779,547 380,547,547 407,307,307 153,559,153 306,268,306 89,382,89 54,32,32 172,745,745 16,110,110 537,734,537 278,94,278 342,643,643 435,165,165 638,691,691 734,718,718 298,781,298 42,113,42 97,316,97 416,397,397 483,400,483 275,155,155 154,461,154 309,4,4 539,30,30 
545,266,545 283,340,340 232,254,254 384,427,427 161,781,161 496,420,496 268,159,159 473,593,473 550,475,550 199,785,785 385,732,732 488,196,488 30,587,30 36,494,494 513,184,184 129,106,129 6,604,6 137,315,137 83,509,509 585,534,585 743,57,57 220,723,723 764,665,764 708,439,708 28,545,545 23,796,23 698,147,147 306,635,635 158,666,158 55,750,750 513,439,513 769,264,264 232,499,499 23,623,23 97,250,250 324,544,544 517,10,517 586,461,461 779,349,349 669,226,226 623,53,623 538,718,718 438,242,438 523,759,523 14,103,103 378,152,152 478,361,478 137,698,137 354,739,739 233,684,684 291,605,291 96,264,264 730,137,137 129,744,129 611,107,611 58,560,560 658,146,658 625,753,625 524,27,524 794,474,794 365,391,391 592,585,592 512,586,512 526,424,424 70,456,70 540,288,540 347,389,347 689,128,128 211,701,701 336,385,336 594,80,80 785,490,490 425,308,425 27,731,731 459,399,459 421,494,421 578,768,768 63,82,63 257,221,257 4,150,4 546,333,546 308,544,544 213,442,213 575,302,302 209,645,209 92,768,92 116,672,116 132,69,69 568,396,568 644,218,644 321,163,163 17,164,164 715,190,715 51,57,51 237,448,448 580,676,580 789,726,726 420,718,420 83,499,499 526,159,159 115,56,56 358,636,358 54,532,532 534,798,534 768,179,768 94,724,724 795,437,795 313,432,432 305,251,305 269,289,269 593,733,733 568,428,428 290,287,287 762,733,762 217,100,100 668,87,668 165,49,165 123,771,771 261,154,154 225,640,640 313,103,103 106,505,505 453,614,614 107,48,48 722,590,590 559,431,431 141,727,727 490,239,490 490,340,340 373,128,128 3,730,3 502,91,502 714,104,714 788,276,276 459,16,16 432,143,432 724,2,724 648,533,648 250,357,250 155,439,155 447,20,20 366,371,366 337,770,337 358,765,358 198,110,198 152,760,760 164,577,164 174,646,646 438,59,59 614,443,614 789,127,127 504,421,421 565,214,214 158,635,635 386,21,21 408,404,404 349,440,349 410,295,410 83,569,569 249,411,249 173,609,173 770,572,572 51,473,473 452,106,106 550,769,550 525,740,525 464,762,762 147,798,798 572,45,572 569,741,569 193,28,28 247,445,247 
511,280,280 435,44,44 794,359,794 196,758,758 470,410,410 631,164,164 451,790,451 211,442,442 773,582,773 403,59,59 401,413,413 228,679,679 753,196,196 578,434,434 210,40,40 38,716,716 18,458,458 538,742,538 702,627,702 403,412,403 517,136,136 465,89,465 505,502,505 601,644,644 459,84,84 548,525,525 707,521,521 14,62,62 728,501,728 182,246,246 741,117,741 29,431,431 719,574,574 209,477,477 492,37,37 425,43,425 207,284,284 749,512,512 539,153,153 152,159,159 738,18,738 182,579,579 69,302,69 173,713,713 488,199,488 683,506,683 2,619,2 735,255,735 70,800,70 120,67,67 738,610,610 68,397,397 540,714,540 184,197,184 577,58,58 750,29,29 391,640,391 211,737,211 462,694,694 337,147,147 459,644,644 615,652,615 786,180,786 247,594,594 536,720,536 729,586,586 715,80,715 516,740,516 440,525,525 174,410,410 634,424,424 252,797,797 317,294,317 463,790,790 770,7,7 110,783,110 454,469,469 241,731,241 688,190,688 271,21,271 738,789,738 476,223,476 574,283,574 621,551,551 723,617,723 1,679,679 595,131,595 433,339,339 247,452,247 19,709,709 410,278,410 103,142,103 37,337,337 453,29,453 73,113,113 374,87,374 634,784,784 681,790,681 181,765,181 584,498,498 586,590,590 732,1,732 478,524,478 734,19,19 634,348,634 567,755,567 354,82,354 417,783,417 420,85,420 610,215,610 350,160,350 122,588,588 159,77,159 398,598,398 300,704,704 508,293,508 763,685,685 428,203,428 509,199,509 106,592,106 550,394,550 310,48,48 80,314,314 463,203,463 484,9,9 700,49,700 437,533,533 73,485,73 627,314,314 216,657,216 126,617,126 632,707,707 692,331,692 11,309,309 483,22,483 665,170,170 540,337,540 39,299,299 761,762,762 441,1,441 434,137,137 700,433,700 52,379,379 158,171,158 769,374,769 789,690,690 393,255,255 307,741,307 528,492,528 212,24,24 99,173,99 58,253,58 226,289,226 365,609,365 409,532,409 647,294,647 15,756,756 196,774,196 265,68,265 564,749,749 750,248,248 142,107,142 427,404,427 454,302,302 323,574,574 262,67,67 710,434,710 291,666,291 233,445,233 126,494,494 252,108,108 69,212,212 120,342,120 
224,173,173 695,94,94 672,355,672 409,254,409 195,607,607 528,327,327 127,81,127 698,336,336 422,705,705 432,397,432 409,376,409 582,424,424 521,268,521 304,708,708 385,492,492 560,70,560 545,149,149 68,362,362 672,732,732 632,14,632 641,341,341 365,428,428 594,425,425 764,529,764 397,161,397 111,670,670 219,738,219 408,647,647 516,107,107 622,736,736 212,128,212 685,648,648 406,420,420 501,689,689 533,211,533 413,580,413 74,277,277 460,713,713 450,8,8 376,196,196 292,510,510 282,186,282 384,699,699 295,741,741 346,790,346 269,187,269 45,323,323 89,778,778 151,238,151 774,667,774 266,301,301 602,375,375 31,490,490 92,182,92 588,390,588 476,434,476 360,426,426 101,21,21 267,182,267 245,72,72 10,17,17 110,425,425 244,177,177 209,465,465 778,760,778 192,721,192 340,110,110 555,443,443 538,82,538 123,434,123 597,627,597 409,56,409 458,301,301 65,644,65 584,319,319 262,210,210 116,492,116 75,480,480 38,302,302 750,656,656 453,174,453 375,509,375 791,203,791 617,266,617 480,541,541 425,750,425 325,276,276 385,157,157 331,648,648 566,34,34 86,502,502 214,435,214 551,778,551 300,354,300 438,630,438 209,397,209 582,666,582 52,780,780 492,417,417 519,581,581 11,771,771 725,340,340 286,313,313 79,633,633 435,83,435 565,55,55 313,754,313 247,685,247 698,215,698 76,567,567 451,21,451 132,473,132 500,215,500 4,13,4 111,188,111 39,519,519 129,303,129 306,39,306 752,595,595 699,458,699 708,456,708 102,304,102 399,239,239 37,13,13 498,282,498 649,489,649 535,189,535 147,189,147 107,32,32 688,350,350 559,540,540 28,44,44 277,168,168 285,328,328 650,652,650 638,393,393 365,233,233 215,290,215 576,231,576 335,333,335 79,392,79 181,41,181 136,546,136 208,685,685 634,480,480 481,690,481 262,47,47 87,59,59 637,695,695 712,74,712 696,274,274 485,101,101 484,661,661 721,631,631 354,257,257 485,605,605 27,798,798 604,131,131 336,532,532 298,179,179 297,394,394 260,358,260 429,475,429 546,416,546 174,224,174 65,173,65 793,247,793 530,437,437 154,650,154 627,150,150 47,334,334 146,660,660 
348,105,348 712,575,712 602,728,728 523,461,523 347,526,526 579,373,579 2,214,214 10,596,596 531,762,531 430,516,430 305,400,305 158,258,158 528,38,528 54,600,600 424,728,424 232,333,232 162,114,162 59,118,59 632,742,742 488,708,708 180,690,690 129,423,423 162,439,162 592,267,267 727,219,727 707,764,707 537,525,537 798,672,672 6,47,6 421,225,421 703,732,732 743,248,248 773,454,773 286,680,286 541,302,541 210,686,686 503,62,503 423,734,423 280,524,280 116,681,116 365,680,680 330,664,664 462,463,463 242,325,242 131,340,340 713,794,794 234,667,234 270,772,772 200,393,393 181,351,181 546,350,350 665,39,39 475,402,475 383,722,383 401,580,580 123,609,123 502,123,502 98,537,537 565,716,716 512,100,512 240,93,93 646,468,646 89,470,89 438,284,438 125,31,125 570,222,570 350,512,512 643,42,643 560,316,316 715,343,715 126,622,126 278,184,184 604,130,130 558,573,573 411,220,411 132,181,132 592,748,748 709,260,709 515,598,598 576,530,576 512,205,512 268,458,268 633,113,633 368,101,101 371,61,61 248,339,339 256,30,30 691,593,691 118,349,349 216,30,216 540,545,540 664,13,13 292,52,52 120,256,256 178,456,178 148,343,343 693,68,68 547,719,547 93,738,93 664,398,398 347,589,589 222,568,568 349,425,349 166,559,166 50,306,306 361,716,716 226,509,226 2,594,2 50,134,134 459,111,111 667,96,667 260,215,215 380,694,694 496,485,496 310,153,153 349,663,349 516,130,516 783,681,783 657,355,355 549,100,549 462,690,690 112,465,465 117,370,370 549,54,549 478,593,478 497,280,280 766,591,766 677,17,677 251,259,251 53,494,494 746,494,494 475,723,475 305,238,305 288,524,288 751,45,751 291,470,291 154,325,154 638,359,638 604,233,233 19,369,19 526,203,526 129,695,129 215,719,719 518,118,518 127,287,127 559,214,214 762,86,762 557,501,557 282,605,282 307,149,307 425,390,425 612,731,731 387,311,311 143,116,116 340,268,340 14,162,162 570,492,570 549,127,549 484,212,212 524,222,524 400,151,151 375,255,255 738,176,738 116,490,490 2,394,394 369,505,505 283,254,254 545,241,545 49,268,268 522,177,522 699,225,225 
160,398,398 579,691,69",{1}]
(require '[clojure.data.csv :as csv]
         '[clojure.java.io :as io]
         '[fastmath.stats :as stats])
(defn csv-reader
  "Reads the CSV file at `filename` and returns all of its rows.

  The rows come from `csv/read-csv` and are fully realized with `doall`
  before `with-open` closes the underlying reader, so the returned
  sequence can be consumed safely after this function returns."
  [filename]
  (with-open [rdr (io/reader filename)]
    (->> rdr
         csv/read-csv
         doall)))

(defn strip
  "Returns a string built from the elements of `coll` with every element
  that appears in `chars` removed.

  `coll`  - a string (or any seq of characters) to filter.
  `chars` - a string (or collection) of characters to drop."
  [coll chars]
  (let [unwanted? (set chars)]
    (->> coll
         (remove unwanted?)
         (apply str))))

(defn data->maps
  "Turns row-oriented CSV data into a realized sequence of maps.

  The first row of `data` is treated as the header: each header cell has
  spaces, periods and commas removed (via `strip`) and is converted to a
  keyword. Every remaining row is zipped with those keywords."
  [data]
  (let [header (->> (first data)
                    (map #(keyword (strip % " .,"))))]
    (doall
      (map (fn [row] (zipmap header row))
           (rest data)))))

(defn csv->maps
  "Reads the CSV file at `filename` with `csv-reader` and converts it to a
  sequence of keyword-keyed row maps with `data->maps`."
  [filename]
  (-> filename
      csv-reader
      data->maps))
user/csv->maps
;; Load pokemon.csv as a sequence of row maps. At this point every value is
;; still a string, as the `(first pokemon)` output in the next cell shows.
(def pokemon (csv->maps "pokemon.csv"))
user/pokemon
(first pokemon)
12{:SpDef"65":Type1"Grass":Attack"49":Speed"45":Type2"Poison":SpAtk"65":#"1":Defense"49":Name"Bulbasaur":Generation"1":HP"45":Legendary"False"}
;; Re-bind `pokemon` with the numeric columns parsed from strings to integers.
;; Non-numeric columns (e.g. :Name, :Type1, :Type2, :Legendary) are left as-is.
;; Integer/parseInt throws NumberFormatException on a blank or non-numeric
;; value, so this assumes the numeric columns are fully populated.
(def pokemon
  (let [numeric-keys [:SpDef :Attack
                      :Speed :SpAtk
                      :# :Defense
                      :Generation :HP]
        ;; Build a map holding only the numeric columns of `row`, parsed.
        parse-numeric (fn [row]
                        (reduce-kv
                          (fn [acc k v]
                            (assoc acc k (Integer/parseInt v)))
                          {}
                          (select-keys row numeric-keys)))]
    ;; Overlay the parsed values on top of the original row.
    (map (fn [row] (merge row (parse-numeric row)))
         pokemon)))
user/pokemon
(first pokemon)
12{:SpDef65:Type1"Grass":Attack49:Speed45:Type2"Poison":SpAtk65:#1:Defense49:Name"Bulbasaur":Generation1:HP45:Legendary"False"}
(defn nilify
  "Returns nil when `v` is a blank string (empty or whitespace only, per
  `clojure.string/blank?`); otherwise returns `v` unchanged. Non-string
  values, including nil, pass straight through."
  [v]
  (cond
    (not (string? v))         v
    (clojure.string/blank? v) nil
    :else                     v))
user/nilify
(defn upvals
  "Returns a new map with the same keys as `m`, where each value `v` has
  been replaced by `(f v)`. The result is always built up from an empty
  map literal."
  [m f]
  (letfn [(step [acc k v]
            (assoc acc k (f v)))]
    (reduce-kv step {} m)))

;; Re-bind `pokemon` with every blank string value in every row replaced by nil.
(def pokemon
  (map #(upvals % nilify) pokemon))
user/pokemon
(require '[huri.core :as h])
(h/size pokemon)
2[800,12]
(h/cols pokemon)
12(:SpDef,:Type1,:Attack,:Speed,:Type2,:SpAtk,:#,:Defense,:Name,:Generation,:HP,:Legendary)
(defn nilcounter
  "Returns a map from each column of `df` (as listed by `h/cols`) to the
  entry stored under the key nil in that column's `h/distribution`.

  NOTE(review): judging by the `h/distribution` output elsewhere in this
  notebook, the values are proportions rather than raw counts, and a
  column with no nil values maps to nil - confirm against huri's docs."
  [df]
  (let [columns (h/cols df)]
    (zipmap columns
            (for [col columns]
              ((h/distribution col df) nil)))))
user/nilcounter
(clojure.pprint/print-table [(nilcounter pokemon)])
(require '[clojure.math.combinatorics :as combinatorics])
(defn cross-corr
  "Computes pairwise correlations between the columns in `numcols`.

  Returns a lazy sequence with one map per ordered pair in the cartesian
  product `numcols` x `numcols`:
    :keys - the pair as a vector
    :corr - 1 when both columns are the same, otherwise the result of
            `stats/correlation` applied to the two columns' values taken
            from the rows of `df`."
  [df numcols]
  (for [pair (combinatorics/cartesian-product numcols numcols)]
    {:keys (vec pair)
     :corr (if (apply = pair)
             ;; A column against itself: skip the computation.
             1
             (->> df
                  (map #(select-keys % pair))
                  h/col-oriented
                  vals
                  (apply stats/correlation)))}))
user/cross-corr
(defn heatmap
  "Builds a Plotly figure spec (a map with :data and :layout) containing a
  single heatmap trace.

  `x`, `y` - axis labels/values for the trace.
  `z`      - the grid of cell values.

  The trace uses a fixed two-stop green colorscale and the layout is a
  fixed, non-autosized 600x600 canvas."
  [x y z]
  (let [trace  {:x x
                :y y
                :z z
                :type "heatmap"
                :colorscale [[0 "#24ff24"] [1 "#004b00"]]}
        layout {:autosize false :width 600 :height 600}]
    {:data [trace]
     :layout layout}))
user/heatmap
;; Columns used for the correlation heatmap below: both axes are labelled
;; with this vector and `cross-corr` pairs every column with every other.
(def numcols [:SpDef :Attack :Speed :SpAtk 
              :# :Defense :Generation :HP])
user/numcols
;; Correlation heatmap of the numeric columns.
;; `cross-corr` yields one entry per ordered column pair; the :corr values are
;; pulled out as a flat sequence and cut into rows to form the square matrix.
;; The row width is derived from `numcols` rather than hard-coded, so the
;; matrix keeps the right shape if the column list changes.
(heatmap numcols
         numcols
         (->> (cross-corr pokemon numcols)
              h/col-oriented
              :corr
              (partition (count numcols))))
2{:data[1]:layout{3}}
(.plot js/Nextjournal plotly)
;; Figure spec with two overlaid scatter traces - Speed and Defense - plotted
;; against the :# column of each row, on a fixed 600x600 canvas.
(let [ids   (map :# pokemon)
      ;; One semi-transparent scatter trace for column `k`, labelled `label`.
      trace (fn [k label]
              {:x ids
               :y (map k pokemon)
               :type "scatter"
               :name label
               :opacity 0.6})]
  {:data [(trace :Speed "Speed")
          (trace :Defense "Defense")]
   :layout {:autosize false :width 600 :height 600}})
2{:data[2]:layout{3}}
(.plot js/Nextjournal speed-defense)
(defn scatter
  "Builds a Plotly figure spec with a single markers-only scatter trace.

  `df` - a sequence of rows.
  `kx` - function (typically a keyword) giving each row's x value.
  `ky` - function (typically a keyword) giving each row's y value.

  The layout is a fixed, non-autosized 600x600 canvas."
  [df kx ky]
  (let [xs    (map kx df)
        ys    (map ky df)
        trace {:x xs
               :y ys
               :mode "markers"
               :type "scatter"}]
    {:data [trace]
     :layout {:autosize false :width 600 :height 600}}))
user/scatter
(scatter pokemon :Attack :Defense)
2{:data[1]:layout{3}}
(.plot js/Nextjournal attkdfs)
(defn hist
  "Builds a Plotly figure spec with a single histogram trace.

  `df` - a sequence of rows.
  `kx` - function (typically a keyword) giving the value to bin per row.

  The layout is a fixed, non-autosized 600x600 canvas."
  [df kx]
  (let [trace {:x (map kx df)
               :type "histogram"}]
    {:data [trace]
     :layout {:autosize false :width 600 :height 600}}))
user/hist
(hist pokemon :Speed)
2{:data[1]:layout{3}}
(.plot js/Nextjournal histspeed)
(h/rollup :Type1 count pokemon)
18{"Poison"28"Grass"70"Electric"44"Fighting"27"Ice"24"Ground"32"Fairy"17"Fire"52"Dark"31"Water"112"Bug"69"Rock"44"Flying"4"Ghost"32"Dragon"32"Normal"98"Psychic"57"Steel"27}
;; Box plots of :Attack split by the :Legendary flag.
;; NOTE(review): `h/rollup` presumably returns a map from each :Legendary
;; value ("False"/"True" strings in this dataset) to that group's :Attack
;; values - confirm against huri's docs.
(let [attack-by-flag (h/rollup :Legendary identity :Attack pokemon)
      box-trace      (fn [flag]
                       {:y (attack-by-flag flag)
                        :type "box"
                        :name flag})]
  {:data [(box-trace "False")
          (box-trace "True")]
   :layout {:autosize false :width 600 :height 600}})
2{:data[2]:layout{3}}
(.plot js/Nextjournal box)
(map type (vals (first pokemon)))
12(java.lang.Integer,java.lang.String,java.lang.Integer,java.lang.Integer,java.lang.String,java.lang.Integer,java.lang.Integer,java.lang.Integer,java.lang.String,java.lang.Integer,java.lang.Integer,java.lang.String)
(group-by :Name (map #(select-keys % [:Name :Attack :Defense]) pokemon))
800{nil[1]"Darkrai"[1]"Uxie"[1]"Arbok"[1]"Azurill"[1]"Pawniard"[1]"Gourgeist Small Size"[1]"Fearow"[1]"Exeggcute"[1]"Kyurem Black Kyurem"[1]"Pumpkaboo Super Size"[1]"Altaria"[1]"Venomoth"[1]"Chansey"[1]"Mandibuzz"[1]"Nincada"[1]"Makuhita"[1]"Dedenne"[1]"Pidgeot"[1]"Seadra"[1]780 more…}
;; Rows of `pokemon` that have a non-nil :Type2 value.
(def pokemon-clean
  (remove (fn [row] (nil? (:Type2 row)))
          pokemon))
user/pokemon-clean
(h/distribution :Type2 pokemon-clean)
18{"Poison"0.0821256038647343"Grass"0.060386473429951695"Electric"0.014492753623188408"Fighting"0.06280193236714976"Ice"0.03381642512077295"Ground"0.08454106280193238"Fairy"0.055555555555555566"Fire"0.028985507246376815"Dark"0.04830917874396136"Water"0.03381642512077295"Bug"0.007246376811594204"Rock"0.03381642512077295"Flying"0.2342995169082126"Ghost"0.03381642512077295"Dragon"0.04347826086956522"Normal"0.009661835748792272"Psychic"0.07971014492753624"Steel"0.053140096618357495}
(defn fill-nil
  "Returns `row` with `sub` stored under `k` when `(k row)` is nil (the key
  is missing or maps to nil); otherwise returns `row` unchanged.

  `k` is invoked as a function on `row`, so it is expected to be a keyword
  (or another callable key)."
  [row k sub]
  (cond-> row
    (nil? (k row)) (assoc k sub)))
user/fill-nil
;; Re-bind `pokemon` so that rows without a :Type2 get the placeholder "empty".
(def pokemon
  (map (fn [row] (fill-nil row :Type2 "empty"))
       pokemon))
user/pokemon
;; Three stacked line plots (Attack, Defense, Speed) against the :# column.
;; Each trace after the first is assigned its own x/y axis pair so the
;; layout's 3x1 grid with the "independent" pattern places it in its own row.
(let [ids   (map :# pokemon)
      ;; Scatter trace for column `k`; `axes` carries optional axis overrides.
      trace (fn [k label axes]
              (merge {:y (map k pokemon)
                      :x ids
                      :type "scatter"
                      :name label}
                     axes))]
  {:data [(trace :Attack "Attack" {})
          (trace :Defense "Defense" {:xaxis "x2" :yaxis "y2"})
          (trace :Speed "Speed" {:xaxis "x3" :yaxis "y3"})]
   :layout {:autosize false :width 600 :height 600
            :grid {:rows 3 :columns 1
                   :pattern "independent"
                   :roworder "bottom to top"}}})
2{:data[3]:layout{4}}
(.plot js/Nextjournal multiline)
;; Cumulative histogram of :Speed: take the plain histogram spec and enable
;; the :cumulative option on its first (only) trace.
(-> (hist pokemon :Speed)
    (assoc-in [:data 0 :cumulative] {:enabled true}))
2{:data[1]:layout{3}}
(.plot js/Nextjournal histcum)
(->> pokemon first :HP)