({:tag :p,
  :content
  ("--- title: \"Dataset manipulation API for tech.ml.dataset library\" author: GenerateMe date: \""
   {:tag :code, :content ("r Sys.Date()")}
   "\" output:  html"
   "_"
   "document:  highlight: pygments  theme: simplex  toc: true  toc"
   "_"
   "depth: 4  toc"
   "_"
   "float:  smooth"
   "_"
   "scroll: true  collapsed: true  includes:  in"
   "_"
   "header: better"
   "_"
   "tables.html  smart: false  pdf"
   "_"
   "document:  highlight: tango  md"
   "_"
   "document:")}
 {:tag :pre, :content ({:tag :code, :content ("    variant: gfm\n")})}
 {:tag :hr}
 {:tag :pre,
  :content
  ({:tag :code,
    :content
    (".github-corner:hover.octo-arm{animation:octocat-wave 560ms ease-in-out}@keyframes octocat-wave{0%,100%{transform:rotate(0)}20%,60%{transform:rotate(-25deg)}40%,80%{transform:rotate(10deg)}}@media (max-width:500px){.github-corner:hover .octo-arm{animation:none}.github-corner .octo-arm{animation:octocat-wave 560ms ease-in-out}}\ntable { width: auto !important; margin-right: auto; margin-left: 2%; }\nh1, h2, h3, h4, h5, h6 { padding-top: 16px; border-bottom: 1px solid; }\npre.sourceCode { position: relative; }\npre.clojure::before {\n    content: \"Clojure\";\n    opacity: 0.2;\n    font-size: 150%;\n    position: absolute;\n    text-align: right;\n    right: 8px;\n    bottom: 0px;\n}\npre.r::before {\n    content: \"R\";\n    opacity: 0.2;\n    font-size: 150%;\n    position: absolute;\n    text-align: right;\n    right: 8px;\n    bottom: 0px;\n}\n"),
    :attrs {:class "{css echo=FALSE}"}})}
 {:tag :a,
  :attrs
  {:href "https://github.com/scicloj/tablecloth",
   :class "github-corner",
   :aria-label "View source on GitHub"},
  :content
  ({:tag :svg,
    :attrs
    {:width "80",
     :height "80",
     :viewbox "0 0 250 250",
     :style
     "fill:#d9230f; color:#fff; position: absolute; top: 0; border: 0; right: 0;",
     :aria-hidden "true"},
    :content
    ({:tag :path,
      :attrs {:d "M0,0 L115,115 L130,115 L142,142 L250,250 L250,0 Z"},
      :content nil}
     {:tag :path,
      :attrs
      {:d
       "M128.3,109.0 C113.8,99.7 119.0,89.6 119.0,89.6 C122.0,82.7 120.5,78.6 120.5,78.6 C119.2,72.0 123.4,76.3 123.4,76.3 C127.3,80.9 125.5,87.3 125.5,87.3 C122.9,97.6 130.6,101.9 134.4,103.2",
       :fill "currentColor",
       :style "transform-origin: 130px 106px;",
       :class "octo-arm"},
      :content nil}
     {:tag :path,
      :attrs
      {:d
       "M115.0,115.0 C114.9,115.1 118.7,116.5 119.8,115.4 L133.7,101.6 C136.9,99.2 139.9,98.4 142.2,98.6 C133.8,88.0 127.5,74.4 143.8,58.0 C148.5,53.4 154.0,51.2 159.7,51.0 C160.3,49.4 163.2,43.6 171.4,40.1 C171.4,40.1 176.1,42.5 178.8,56.2 C183.1,58.6 187.2,61.8 190.9,65.4 C194.5,69.0 197.7,73.2 200.1,77.6 C213.8,80.2 216.3,84.9 216.3,84.9 C212.7,93.1 206.9,96.0 205.4,96.6 C205.1,102.4 203.0,107.8 198.3,112.5 C181.9,128.9 168.3,122.5 157.7,114.1 C157.9,116.9 156.7,120.9 152.7,124.9 L141.0,136.5 C139.8,137.7 141.6,141.9 141.8,141.8 Z",
       :fill "currentColor",
       :class "octo-body"},
      :content nil})})}
 {:tag :pre,
  :content
  ({:tag :code,
    :content
    ("find_nrepl_port_up <- function() {\n    wd <- getwd()\n    while(wd != dirname(wd)) {\n        f <- paste0(wd,\"/.nrepl-port\")\n        if(file.exists(f)) return(paste0(\"@\",f))\n        wd <- dirname(wd)\n        f <- NULL\n    }\n}\nport_file <- find_nrepl_port_up()\nif(is.null(port_file)) stop(\"nREPL port not found\")\nlibrary(knitr)\nknitr_one_string <- knitr:::one_string\nnrepl_cmd  <- \"rep\"\nopts_chunk$set(comment=NA, highlight=TRUE)\nknit_engines$set(clojure = function(options) {\n    rep_params <- if(isTRUE(options$stdout_only)) {\n                      \"--print 'out,1,%{out}' --print 'value,1,' -p\"\n                  } else {\n                      \"-p\"\n                  }\n    code <- paste(rep_params, port_file, shQuote(knitr_one_string(options$code)))\n    out <- if (options$eval) {\n               if (options$message) message('running: ', nrepl_cmd, ' ', code)\n               tryCatch(\n                   system2(nrepl_cmd, code, stdout = TRUE, stderr = TRUE, env = options$engine.env),\n                   error = function(e) {\n                       if (!options$error) stop(e)\n                       paste('Error in running command', nrepl_cmd)\n                   }\n               )\n           } else ''\n    if (!options$error && !is.null(attr(out, 'status'))) stop(knitr_one_string(out))\n    engine_output(options, options$code, out)})\n"),
    :attrs {:class "{r setup, include=FALSE}"}})}
 {:tag :pre,
  :content
  ({:tag :code,
    :content
    ("(def tech-ml-version (get-in (read-string (slurp \"deps.edn\")) [:deps 'techascent/tech.ml.dataset :mvn/version]))\n"),
    :attrs {:class "{clojure include=FALSE}"}})}
 {:tag :pre,
  :content
  ({:tag :code,
    :content ("tech-ml-version\n"),
    :attrs {:class "{clojure results=\"asis\"}"}})}
 {:tag :h2, :content ("Introduction")}
 {:tag :p,
  :content
  ({:tag :a,
    :attrs {:href "https://github.com/techascent/tech.ml.dataset"},
    :content ("tech.ml.dataset")}
   " is a great and fast library which brings columnar datasets to Clojure. Chris Nuernberger has been working on this library for the last year as a part of the bigger "
   {:tag :code, :content ("tech.ml")}
   " stack.")}
 {:tag :p,
  :content
  ("I've started to test the library and help fix uncovered bugs. My main goal was to compare its functionality with established standards from other platforms. I focused on R solutions: "
   {:tag :a, :attrs {:href "https://dplyr.tidyverse.org/"}, :content ("dplyr")}
   ", "
   {:tag :a, :attrs {:href "https://tidyr.tidyverse.org/"}, :content ("tidyr")}
   " and "
   {:tag :a,
    :attrs {:href "https://rdatatable.gitlab.io/data.table/"},
    :content ("data.table")}
   ".")}
 {:tag :p,
  :content
  ("While converting the examples I worked out how to reorganize existing "
   {:tag :code, :content ("tech.ml.dataset")}
   " functions into a simple-to-use API. The main goals were:")}
 {:tag :ul,
  :content
  ({:tag :li,
    :content
    ("Focus on dataset manipulation functionality, leaving other parts of "
     {:tag :code, :content ("tech.ml")}
     " (pipelines, datatypes, readers, ML, etc.) aside.")}
   {:tag :li,
    :content
    ("Single entry point for common operations - one function dispatching on given arguments.")}
   {:tag :li,
    :content
    ({:tag :code, :content ("group-by")}
     " results in a special kind of dataset - a dataset containing subsets created after grouping as a column.")}
   {:tag :li,
    :content
    ("Most operations recognize regular dataset and grouped dataset and process data accordingly.")}
   {:tag :li,
    :content ("One function form to enable thread-first on dataset.")})}
 {:tag :p,
  :content
  ("If you want to know more about "
   {:tag :code, :content ("tech.ml.dataset")}
   " and "
   {:tag :code, :content ("tech.ml.datatype")}
   " please refer to their documentation:")}
 {:tag :ul,
  :content
  ({:tag :li,
    :content
    ({:tag :a,
      :attrs
      {:href
       "https://github.com/techascent/tech.datatype/blob/master/docs/cheatsheet.md"},
      :content ("Datatype")})}
   {:tag :li,
    :content
    ({:tag :a,
      :attrs
      {:href
       "https://github.com/techascent/tech.datatype/blob/master/docs/datetime.md"},
      :content ("Date/time")})}
   {:tag :li,
    :content
    ({:tag :a,
      :attrs
      {:href
       "https://github.com/techascent/tech.ml.dataset/blob/master/docs/walkthrough.md"},
      :content ("Dataset")})})}
 {:tag :p,
  :content
  ({:tag :a,
    :attrs {:href "https://github.com/scicloj/tablecloth"},
    :content ("SOURCE CODE")})}
 {:tag :p,
  :content
  ("Join the discussion on "
   {:tag :a,
    :attrs
    {:href
     "https://clojurians.zulipchat.com/#narrow/stream/236259-tech.2Eml.2Edataset.2Edev/topic/api"},
    :content ("Zulip")})}
 {:tag :p,
  :content
  ("Let's require main namespace and define dataset used in most examples:")}
 {:tag :pre,
  :content
  ({:tag :code,
    :content
    ("(require '[tablecloth.api :as api])\n(def DS (api/dataset {:V1 (take 9 (cycle [1 2]))\n                      :V2 (range 1 10)\n                      :V3 (take 9 (cycle [0.5 1.0 1.5]))\n                      :V4 (take 9 (cycle [\"A\" \"B\" \"C\"]))}))\n"),
    :attrs {:class "{clojure results=\"hide\"}"}})}
 {:tag :pre,
  :content
  ({:tag :code,
    :content ("DS\n"),
    :attrs {:class "{clojure results=\"asis\"}"}})}
 {:tag :h2, :content ("Functionality")}
 {:tag :h3, :content ("Dataset")}
 {:tag :p,
  :content
  ("Dataset is a special type which can be considered as a map of columns implemented around "
   {:tag :code, :content ("tech.ml.datatype")}
   " library. Each column can be considered as a named sequence of typed data. Supported types include integers, floats, strings, booleans, date/time, objects etc.")}
 {:tag :h4, :content ("Dataset creation")}
 {:tag :p,
  :content
  ("A dataset can be created from various types of Clojure structures and files:")}
 {:tag :ul,
  :content
  ({:tag :li, :content ("single values")}
   {:tag :li, :content ("sequence of maps")}
   {:tag :li, :content ("map of sequences or values")}
   {:tag :li,
    :content
    ("sequence of columns "
     "("
     "taken from other dataset or created manually"
     ")")}
   {:tag :li,
    :content
    ("sequence of pairs: "
     {:tag :code, :content ("[string column-data]")}
     " or "
     {:tag :code, :content ("[keyword column-data]")})}
   {:tag :li, :content ("array of native arrays")}
   {:tag :li,
    :content
    ("file types: raw/gzipped csv/tsv, json, xls"
     "("
     "x"
     ")"
     " taken from local file system or URL")}
   {:tag :li, :content ("input stream")})}
 {:tag :p, :content ({:tag :code, :content ("api/dataset")} " accepts:")}
 {:tag :ul,
  :content
  ({:tag :li, :content ("data")}
   {:tag :li,
    :content
    ("options "
     "("
     "see documentation of "
     {:tag :code, :content ("tech.ml.dataset/->dataset")}
     " function for full list"
     ")"
     ":"
     {:tag :ul,
      :content
      ({:tag :li,
        :content
        ({:tag :code, :content (":dataset-name")} " - name of the dataset")}
       {:tag :li,
        :content
        ({:tag :code, :content (":num-rows")}
         " - number of rows to read from file")}
       {:tag :li,
        :content
        ({:tag :code, :content (":header-row?")}
         " - indication if first row in file is a header")}
       {:tag :li,
        :content
        ({:tag :code, :content (":key-fn")}
         " - function applied to column names "
         "("
         "eg. "
         {:tag :code, :content ("keyword")}
         ", to convert column names to keywords"
         ")")}
       {:tag :li,
        :content ({:tag :code, :content (":separator")} " - column separator")}
       {:tag :li,
        :content
        ({:tag :code, :content (":single-value-column-name")}
         " - name of the column when single value is provided")}
       {:tag :li,
        :content
        ({:tag :code, :content (":column-names")}
         " - in case you want to name columns - only works for sequential input "
         "("
         "arrays"
         ")")}
       {:tag :li,
        :content
        ({:tag :code, :content (":layout")}
         " - for numerical, native array of arrays - treat entries "
         {:tag :code, :content (":as-rows")}
         " or "
         {:tag :code, :content (":as-columns")}
         " "
         "("
         "default"
         ")")})})})}
 {:tag :hr}
 {:tag :p, :content ("Empty dataset.")}
 {:tag :pre,
  :content
  ({:tag :code, :content ("(api/dataset)\n"), :attrs {:class "{clojure}"}})}
 {:tag :hr}
 {:tag :p, :content ("Dataset from single value.")}
 {:tag :pre,
  :content
  ({:tag :code,
    :content ("(api/dataset 999)\n"),
    :attrs {:class "{clojure results=\"asis\"}"}})}
 {:tag :hr}
 {:tag :p,
  :content ("Set column name for single value. Also set the dataset name.")}
 {:tag :pre,
  :content
  ({:tag :code,
    :content
    ("(api/dataset 999 {:single-value-column-name \"my-single-value\"})\n(api/dataset 999 {:single-value-column-name \"\"\n                  :dataset-name \"Single value\"})\n"),
    :attrs {:class "{clojure results=\"asis\"}"}})}
 {:tag :hr}
 {:tag :p,
  :content
  ("Sequence of pairs "
   "("
   "first = column name, second = value"
   "("
   "s"
   ")"
   ")"
   ".")}
 {:tag :pre,
  :content
  ({:tag :code,
    :content ("(api/dataset [[:A 33] [:B 5] [:C :a]])\n"),
    :attrs {:class "{clojure results=\"asis\"}"}})}
 {:tag :hr}
 {:tag :p,
  :content ("Non-sequential values are repeated row-count number of times.")}
 {:tag :pre,
  :content
  ({:tag :code,
    :content ("(api/dataset [[:A [1 2 3 4 5 6]] [:B \"X\"] [:C :a]])\n"),
    :attrs {:class "{clojure results=\"asis\"}"}})}
 {:tag :hr}
 {:tag :p,
  :content
  ("Dataset created from map "
   "("
   "keys = column names, vals = value"
   "("
   "s"
   ")"
   ")"
   ". Works the same as sequence of pairs.")}
 {:tag :pre,
  :content
  ({:tag :code,
    :content
    ("(api/dataset {:A 33})\n(api/dataset {:A [1 2 3]})\n(api/dataset {:A [3 4 5] :B \"X\"})\n"),
    :attrs {:class "{clojure results=\"asis\"}"}})}
 {:tag :hr}
 {:tag :p, :content ("You can put any value inside a column.")}
 {:tag :pre,
  :content
  ({:tag :code,
    :content ("(api/dataset {:A [[3 4 5] [:a :b]] :B \"X\"})\n"),
    :attrs {:class "{clojure results=\"asis\"}"}})}
 {:tag :hr}
 ...)
