(ns witan.phyrexian.ons-ingester
  (:require [clojure.java.io :as io]
            [clojure.string :as str]
            [clojure.set :as st]
            [clojure.data.csv :as data-csv]
            [clojure.core.matrix.dataset :as ds]
            [schema.coerce :as coerce]
            [schema.core :as s]
            [schema.utils :as su]))

(defn open-csv
  "Reads the CSV file at `filename` and returns all of its rows as a vector.
   The rows are fully realised before the reader is closed."
  [filename]
  (with-open [rdr (io/reader filename)]
    ;; vec forces the lazy seq from read-csv while the reader is still open
    (vec (data-csv/read-csv rdr))))

(defn schema-coercion
  "Builds a coercer for `schema` using the string coercion matcher and
   applies it to `data`, returning whatever the coercer returns."
  [schema data]
  (let [coerce-fn (coerce/coercer schema coerce/string-coercion-matcher)]
    (coerce-fn data)))

(defn- read-token
  "Reads `s` as a single Clojure form with reader evaluation disabled.
   Returns nil when `s` cannot be read (empty cell, malformed token,
   non-string value, or a form that would require evaluation)."
  [s]
  (binding [*read-eval* false]
    (try
      (read-string s)
      (catch Exception _ nil))))

(defn gather-by-year
  "Turns the year columns of one wide row into a seq of long-format rows.

   `keys` is a collection of keys to carry over from `m` unchanged.
   `m` is a map of column keyword -> cell string.

   For every entry of `m` whose key name reads as an integer and whose
   value reads as a floating-point number (e.g. \"10.5\"; an integer-looking
   \"10\" does not qualify), emits the selected `keys` merged with
   {:year <key name as string> :popn <original cell string>}.
   Entries that do not qualify, including cells that cannot be read at all,
   are skipped. Returns a (possibly empty) lazy seq."
  [keys m]
  (let [keep-keys (select-keys m keys)]
    (keep (fn [[k v]]
            ;; the key is checked first so non-year columns never have
            ;; their value read
            (when (and (integer? (read-token (name k)))
                       (float? (read-token v)))
              (merge keep-keys
                     {:year (name k)
                      :popn v})))
          m)))

;; Schema describing one long-format output row: an area code string,
;; an integer age, an integer year and a population figure.
;; NOTE(review): `double` here is clojure.core/double rather than an `s/`
;; schema; confirm the schema library validates and string-coerces it as
;; intended.
(def onsSchema
  {:gss.code s/Str
   :age s/Int
   :year s/Int
   :popn double})

(defn load-csv
  "Takes the filename of a CSV and returns a lazy seq of maps, one per
   data row. The first row is treated as the header: each header cell is
   converted to a keyword and used as the key for the cell in the same
   position of every following row. Cells without a matching header (or
   headers without a matching cell) are dropped by zipmap."
  [filename]
  (let [[header & file-data] (open-csv filename)
        header-keys (mapv keyword header)]
    (map #(zipmap header-keys %) file-data)))

(defn is-90-or-over?
  "True when the row's :AGE_GROUP is exactly the string \"90 and over\"."
  [m]
  (let [age-group (get m :AGE_GROUP)]
    (= "90 and over" age-group)))

(defn year-equals-total?
  "True when the row's :AGE_GROUP is exactly the string \"All ages\"."
  [m]
  (-> m
      :AGE_GROUP
      (= "All ages")))

(defn coerce-by-over-90
  "Replaces the :AGE_GROUP of a \"90 and over\" row with the string \"90\";
   any other row is returned unchanged."
  [row]
  (cond-> row
    (is-90-or-over? row) (assoc :AGE_GROUP "90")))

(def scrub-data
  "Transducer that cleans raw rows: rewrites the \"90 and over\" age group
   to \"90\", drops the \"All ages\" rows, then renames :AGE_GROUP to :age
   and :AREA_CODE to :gss.code."
  (let [renames {:AGE_GROUP :age :AREA_CODE :gss.code}]
    (comp
     (map coerce-by-over-90)
     (remove year-equals-total?)
     (map (fn [row] (st/rename-keys row renames))))))

(defn format-data
  "Returns a transducer that expands each row into per-year rows keeping
   :gss.code and :age, coerces every resulting row against `schema`, and
   drops the rows whose coercion result is a schema error."
  [schema]
  (let [by-year (fn [row] (gather-by-year [:gss.code :age] row))
        coerce-row (partial schema-coercion schema)]
    (comp
     (mapcat by-year)
     (map coerce-row)
     (remove su/error?))))

(defn process-ons-data
  "Loads the CSV at `filename`, runs every row through the scrub-data and
   (format-data schema) transducers, and returns the results as a vector."
  [filename schema]
  (let [pipeline (comp scrub-data (format-data schema))
        rows (load-csv filename)]
    (into [] pipeline rows)))
