(ns splendid.ml.text)

;;
;; ##Text Normalization
;;

(defn construct-replacement-map
  "Takes a map `rm` and returns a map. The keys of `rm` are Strings of one or more chars.
  If a key `k` is one char then nothing happens to it.
  If `k` has more than one char, then `k` and its corresponding value `v`
  are removed in the returned map.
  But each of its chars is added as key/value pair, always using `v` as the value.
  See also: `replace-chars`"
  [rm]
  (let [char-groups (filter #(> (count (key %)) 1)
                            rm)
        maps (map (fn [[group replacement]]
                    (let [single-chars (map str group)]
                      (zipmap single-chars (repeat (count single-chars) replacement))))
                  char-groups)
        rm (apply dissoc rm (map first char-groups))
        rm (apply merge rm maps)]
    rm))

(defn ^String replace-chars
  "Takes a String `s` and a `replacement-map` which maps from one-char Strings to Strings.
  Replaces each occurence of all keys from the `replacement-map` with the
  corresponding value in `s` and returns the result.
  Example:
    (replace-chars \"ML macht André Spaß!\" {\"ß\" \"ss\", \"é\" \"e\"}
    ==> \"ML macht Andre Spass!\"

    (replace-chars \"àáâȧạ!\" (construct-replacement-map {\"àáâȧạ\" \"a\"}))
    ==> \"aaaaa\""
  [s replacement-map]
  (let [sb (StringBuilder. (count s))]
    (doseq [c (map str s)]
      (if (contains? replacement-map c)
        (.append sb ^String (get replacement-map c))
        (.append sb ^String c)))
    (str sb)))

(def norm-map
  (construct-replacement-map
   (hash-map
    "ä" "ae"
    "ö" "oe"
    "ü" "ue"
    "ß" "ss"
    "âàáǎąãɐåⱥȧạāă" "a"
    "êèéěęẽɘɇėëẹēĕ" "e"
    "îìíǐįĩıïịīĭ"   "i"
    "ôòóǒǫõøȯxőọōŏ" "o"
    "ûùúǔųũůʉűụūŭ"  "u"
    )))

(defn normalize-text [^String text]
  (-> text
      .trim
      .toLowerCase
      (replace-chars norm-map)
      .trim))

;;
;; Text segmentation
;;

(defn split-at-non-alphanum
  "Splits `text` at chars that are neither digits, nor alphabetic,
  thus keeping only alphanumeric unicode chars."
  [text]
  (remove empty? (clojure.string/split text #"[^\p{N}\p{L}]+")))

(defn split-into-words
  "Splits `text` at whitespace, punctuation or control chars."
  [text]
  (remove empty? (clojure.string/split text #"[\p{Z}\p{P}\p{C}]+")))

(defn into-n-tuple
  "Partitions `words` into an `n`-tuple.
  Do this if you want to train bigrams or trigrams."
  [n words]
  (map #(clojure.string/join " " %)
       (partition n 1 words)))


(defn keep-words
  "Takes a seq of `words` and a Set of Strings `word-set`.
  Keeps only those `words` that occur in `word-set`."
  [words word-set]
  (filter word-set words))

(defn remove-words
  "Takes a seq of `words` and a Set of Strings `word-set`.
  Removes all entries from `words` that are also in `word-set`."
  [words word-set]
  (remove word-set words))


(defrecord Text [id text tags])

;; (Text. "c:/temp/1234.txt" "Hello world!" #{:spam :training})

(defmethod print-method Text [{:keys [id text tags]} ^java.io.Writer w]
  (let [c (count text)]
    (.write w
            (format "#<Text id=%s, tags=%s, text(len %d)=\"%s\">"
                    (pr-str id)
                    (pr-str tags)
                    c
                    (if (<= c 35)
                      text
                      (str (subs text 0 35) "[…]"))))))
