(ns leafgrabber.free-text.combine
  (:use [cascalog.api :only (hfs-textline lfs-textline deffilterop defbufferop union <- ?<- ?-)]
        [clojure.contrib.string :only (as-str)]
        [clojure.contrib.generic.math-functions :only (log abs exp sqr sqrt)]
        [clojure.contrib.str-utils :only (re-split)]
        [clojure.data.json :only (read-json json-str)]
        [cascalog-commons.core :only (with-standard-conf)])
  (:require [leafgrabber.free-text.query :as qry]
            [leafgrabber.free-text.utils :as ftu])
  )

;; Buffer operation: averages the first field of every tuple in the group.
;; Each first field is parsed with read-string, the parsed values are summed,
;; and the sum divided by the number of tuples is emitted as a single result.
;; NOTE(review): read-string is applied to data read from files; confirm the
;; inputs are trusted.
(defbufferop get-avg-value
  [tuples]
  (let [parsed (map (comp read-string first) tuples)
        total (reduce + 0 parsed)]
    (list (/ total (count tuples)))))

(defn average-ext-values
  "Write the original extractor rows of a file together with one extra
   averaged row per uuid.

   Both files live under <qry/results-dir>/<ext-group>/<date-str>/.
   Fields 0, 1 and 2 of each input line (as returned by ftu/get-field)
   are used as uuid, extractor name and value.  The averaged rows carry
   the group name as their extractor name.

   ext-group - the extractor group; converted with as-str to locate the
               directory and to label the averaged rows

   date-str  - the date string which further locates the file

   in-file   - the name of the input file

   out-file  - the name of the output file; deleted via ftu/hadoop-delete
               before the union of both queries is written to it"
  [ext-group date-str in-file out-file]
  (let [group-name (as-str ext-group)
        root (str qry/results-dir "/" group-name "/" date-str)
        source-path (str root "/" in-file)
        target-path (str root "/" out-file)
        ;; one row per uuid holding the average of its values
        averages (with-standard-conf
                   (<- [?uuid ?avg-ext ?avg-val]
                       ((hfs-textline source-path) ?line)
                       (ftu/get-field ?line 0 :> ?uuid)
                       (ftu/get-field ?line 2 :> ?ext-val)
                       (identity group-name :> ?avg-ext)
                       (get-avg-value ?ext-val :> ?avg-val)))
        ;; the input rows, passed through unchanged
        originals (with-standard-conf
                    (<- [?uuid ?orig-ext ?orig-val]
                        ((hfs-textline source-path) ?line)
                        (ftu/get-field ?line 0 :> ?uuid)
                        (ftu/get-field ?line 1 :> ?orig-ext)
                        (ftu/get-field ?line 2 :> ?orig-val)))]
    (println)
    (println "Adding average of extractor values")
    (println)
    (ftu/hadoop-delete target-path)
    (with-standard-conf
      (?- (hfs-textline target-path) (union averages originals)))))

(defn unify-ext-vals
  "Get the union of two extractor-results files.

   Both inputs and the output live under
   <qry/results-dir>/<group-str>/<date-str>/.  Fields 0, 1 and 2 of each
   line (as returned by ftu/get-field) are emitted as uuid, extractor
   name and value.

   group-str - the name of the group (or single extractor) that
               locates the files to merge

   date-str  - the date string that further locates the files

   file-1    - the first file to merge

   file-2    - the second file to merge

   out-file  - where to put the results; deleted via ftu/hadoop-delete
               before the union is written"
  [group-str date-str file-1 file-2 out-file]
  (let [base-dir (str qry/results-dir "/" group-str "/" date-str)
        dir-1 (str base-dir "/" file-1)
        dir-2 (str base-dir "/" file-2)
        out-dir (str base-dir "/" out-file)
        query-1 (with-standard-conf
                  (<- [?uuid ?ext ?val]
                      ((hfs-textline dir-1) ?line)
                      (ftu/get-field ?line 0 :> ?uuid)
                      (ftu/get-field ?line 1 :> ?ext)
                      (ftu/get-field ?line 2 :> ?val)))
        ;; The output vars must be the ones the predicates bind: the query
        ;; previously declared ?gold-ext/?gold-val but bound ?ext/?val,
        ;; leaving its declared outputs unbound.
        query-2 (with-standard-conf
                  (<- [?uuid ?gold-ext ?gold-val]
                      ((hfs-textline dir-2) ?line)
                      (ftu/get-field ?line 0 :> ?uuid)
                      (ftu/get-field ?line 1 :> ?gold-ext)
                      (ftu/get-field ?line 2 :> ?gold-val)))]
    (println)
    (println "Adding gold standard to query with average")
    (println)
    (ftu/hadoop-delete out-dir)
    (with-standard-conf
      (?- (hfs-textline out-dir) (union query-1 query-2)))))

(defn unify-7-files
  "Union the lines of seven fixed input files from the hard-coded data
   directory into signals.6.gold.union in that same directory.  The
   output is deleted via ftu/hadoop-delete before it is rewritten."
  []
  (let [base-dir "/apps/extract/poi/UnitedStates/other/hotels_restaurants/data/"
        in-names ["4square.count.norm"
                  "crossrefs.norm"
                  "crosswalk.reqs.filtered.norm"
                  "crosswalks.norm"
                  "read.reqs.norm"
                  "review.count"
                  "longtail.density.good"]
        line-query (fn [path]
                     (<- [?line] ((hfs-textline path) ?line)))
        ;; doall: build every query up front, before the output is deleted
        queries (doall (map #(line-query (str base-dir %)) in-names))
        out-file (str base-dir "signals.6.gold.union")]
    (ftu/hadoop-delete out-file)
    (with-standard-conf
      (?- (hfs-textline out-file) (apply union queries)))))

;; Buffer operation: reports whether the gold and test extractor values of a
;; group lie within 0.1 of each other (exclusive on both sides).
;; Each tuple is read as [gold-ext test-ext ext val]: the two extractor names
;; are taken from the first tuple, then the first tuple whose ext (index 2)
;; matches each name supplies the value (index 3), parsed with read-string.
;; Emits a single boolean.
(defbufferop compare-exts
  [tuples]
  (let [[gold-ext test-ext] (first tuples)
        value-for (fn [ext-name]
                    (let [match (first (filter #(= (nth % 2) ext-name) tuples))]
                      (read-string (nth match 3))))
        gold-value (value-for gold-ext)
        test-value (value-for test-ext)
        delta (- gold-value test-value)]
    (list (< -0.1 delta 0.1))))

(defn compare-to-gold
  "Compare the results of a numeric extractor (e.g. place-rank) with a
   numeric gold standard extractor, and write the comparison rows
   together with the original rows.

   For every uuid one extra row is produced whose value is the result of
   compare-exts for the gold and test extractors.

   group-str  - The extractor group containing the numeric extractors,
                used to locate the directory containing everything

   date-str   - The date, used to locate the right subdirectory

   in-file    - The file within the subdirectory that contains all
                the data

   test-ext   - The name of the extractor to be tested

   gold-ext   - The name of the gold standard extractor

   out-file   - The name of the file to store the result of the
                comparison; deleted via ftu/hadoop-delete first

   result-ext - The name of the 'extractor' that contains the result"
  [group-str date-str in-file test-ext gold-ext out-file result-ext]
  (let [root (str qry/results-dir "/" group-str "/" date-str)
        source-path (str root "/" in-file)
        target-path (str root "/" out-file)
        ;; the input rows, passed through unchanged
        passthrough (with-standard-conf
                      (<- [?uuid ?ext ?val]
                          ((hfs-textline source-path) ?line)
                          (ftu/get-field ?line 0 :> ?uuid)
                          (ftu/get-field ?line 1 :> ?ext)
                          (ftu/get-field ?line 2 :> ?val)))
        ;; one comparison row per uuid, labelled with result-ext
        comparison (with-standard-conf
                     (<- [?uuid ?result-ext ?comp-val]
                         ((hfs-textline source-path) ?line)
                         (ftu/get-field ?line 0 :> ?uuid)
                         (ftu/get-field ?line 1 :> ?ext)
                         (ftu/get-field ?line 2 :> ?val)
                         (identity result-ext :> ?result-ext)
                         (compare-exts gold-ext test-ext ?ext ?val :> ?comp-val)))]
    (println)
    (println "Adding difference between test extractor and gold standard")
    (println)
    (ftu/hadoop-delete target-path)
    (with-standard-conf
      (?- (hfs-textline target-path) (union comparison passthrough)))))

(defn get-checkin-count
  "Parse json with read-json and return the value found at
   [:response :venue :stats :checkinsCount], or nil when that path is
   absent.  Any Exception raised while parsing is swallowed and treated
   as an empty map, so unparsable input also yields nil."
  [json]
  (-> (try
        (read-json json)
        (catch Exception _ {}))
      (get-in [:response :venue :stats :checkinsCount])))

(defn make-foursquare-data
  "Get check-in count data in the same format as if it were derived
   from an extractor: uuid, the constant extractor name
   \"foursquare_checkins\", and the check-in count taken from the JSON
   in field 1 of each line.

   The source is foursquare/json.data
   The destination is foursquare/checkin.count
   (both under qry/results-dir; the destination is deleted first)."
  []
  (let [source-path (str qry/results-dir "/foursquare/json.data")
        target-path (str qry/results-dir "/foursquare/checkin.count")]
    (ftu/hadoop-delete target-path)
    (with-standard-conf
      (?<- (hfs-textline target-path) [?uuid ?ext ?checkin-count]
           ((hfs-textline source-path) ?line)
           (ftu/get-field ?line 0 :> ?uuid)
           (ftu/get-field ?line 1 :> ?json)
           (get-checkin-count ?json :> ?checkin-count)
           (identity "foursquare_checkins" :> ?ext)))))

(defn get-foursquare-from-files
  "Read each file in in-files and build one map from the first
   tab-separated field of every line to the check-in count extracted
   (via get-checkin-count) from the second field.  Files are merged in
   order, so entries from later files replace earlier ones with the same
   key.  Returns an empty map when there are no files."
  [in-files]
  (letfn [(line->pair [line]
            (let [fields (re-split #"\t" line)]
              (vector (first fields) (get-checkin-count (second fields)))))
          (file->counts [path]
            (->> (slurp path)
                 (re-split #"\n")
                 (map line->pair)
                 flatten
                 (apply hash-map)))]
    (reduce (fn [merged path]
              (merge merged (file->counts path)))
            {}
            in-files)))

(defn normalize-count
  "Normalise a raw count given as a string: trims it, parses it as an
   integer, computes log(1 + count) / denom and caps the result at 1.0."
  [raw-val denom]
  (let [cnt (Integer/parseInt (.trim raw-val))
        scaled (/ (log (+ 1 cnt)) denom)]
    (if-not (> scaled 1.0)
      scaled
      1.0)))

(defn normalize-q
  "Rewrite the file at in-dir into out-dir with field 2 of every line
   replaced by its normalize-count value for the given denom.  Fields 0
   and 1 (uuid and extractor name) are carried through.

   out-dir is deleted via ftu/hadoop-delete first.  The query is run
   with a :trap tap on trap-dir and with :distinct disabled."
  [in-dir out-dir trap-dir denom]
  (ftu/hadoop-delete out-dir)
  (with-standard-conf
    (?<- (hfs-textline out-dir) [?uuid ?ext ?norm-val]
         ((hfs-textline in-dir) ?line)
         (ftu/get-field ?line 0 :> ?uuid)
         (ftu/get-field ?line 1 :> ?ext)
         (ftu/get-field ?line 2 :> ?raw-val)
         (normalize-count ?raw-val denom :> ?norm-val)
         (:trap (hfs-textline trap-dir))
         (:distinct false))))

;; Buffer operation: pivots a group of [extractor-name value] tuples into a
;; single seven-column row.  Columns follow the fixed order of extractor names
;; below; a name missing from the group yields 0.  When a name occurs more
;; than once, the last value seen wins.
(defbufferop matricify
  [tuples]
  (let [by-name (into {} (map (fn [t] [(first t) (second t)]) tuples))
        columns ["foursquare_checkins"
                 "crossrefs"
                 "crosswalks"
                 "crosswalk_reqs"
                 "read_reqs"
                 "lt_density"
                 "review_count"]]
    (list (vec (map #(get by-name % 0) columns)))))

;; Filter operation: keeps a tuple unless its value equals the number 0 or the
;; string "0.0".
(deffilterop positive?
  [gold-val]
  (not (or (= gold-val 0)
           (= gold-val "0.0"))))

(defn matricify-q
  "Pivot the per-extractor rows of base-dir/in-file into one row per
   uuid with seven value columns (see matricify) and write the result to
   base-dir/out-file.

   Both base-dir/out-file and base-dir/trap-file are deleted via
   ftu/hadoop-delete first.  The query is run with a :trap tap on
   base-dir/trap-file and with :distinct disabled."
  [base-dir in-file out-file trap-file]
  (let [path-in (fn [file-name] (str base-dir "/" file-name))
        source-path (path-in in-file)
        target-path (path-in out-file)
        trap-path (path-in trap-file)]
    (ftu/hadoop-delete target-path)
    (ftu/hadoop-delete trap-path)
    (?<- (hfs-textline target-path) [?uuid ?gold ?xref ?xwalk ?xwalkreq ?read ?lta-dens ?rvw_cnt]
         ((hfs-textline source-path) ?line)
         (ftu/get-trimmed-field ?line 0 :> ?uuid)
         (ftu/get-trimmed-field ?line 1 :> ?ext)
         (ftu/get-field ?line 2 :> ?value)
         (matricify ?ext ?value :> ?gold ?xref ?xwalk ?xwalkreq ?read ?lta-dens ?rvw_cnt)
         ;(positive? ?gold)
         (:trap (hfs-textline trap-path))
         (:distinct false))))

(defn weight-values
  "Weighted average of the numeric fields of a tab-separated line.
   The first two fields are skipped; the remaining ones are parsed with
   read-string, multiplied pairwise with weight-vector, summed, and
   divided by the sum of the weights."
  [line weight-vector]
  (let [fields (re-split #"\t" line)
        signals (map read-string (drop 2 fields))
        weight-total (reduce + weight-vector)
        weighted-sum (reduce + (map * signals weight-vector))]
    (/ weighted-sum weight-total)))

(defn combine-signals-q
  "Get a weighted combination of signals.

   Reads hadoop-dir/matrix-file and, for every line, emits fields 0 and
   1 (uuid and gold value) plus the weight-values combination of the
   line's remaining fields under weight-vector.

   The output file is named \"test.\" followed by the weights joined
   with \".\".  It is written under hadoop-dir, copied to local-dir via
   ftu/hadoop-copy-results, and then deleted from hadoop-dir again.

   hadoop-dir    - directory holding the matrix file and the temporary
                   output
   matrix-file   - name of the input file inside hadoop-dir
   local-dir     - directory that receives the copied result
   weight-vector - the weights, in signal-column order"
  [hadoop-dir matrix-file local-dir weight-vector]
  (let [out-file (apply str "test." (interpose "." weight-vector))
        in-file (str hadoop-dir "/" matrix-file)
        hadoop-out-file (str hadoop-dir "/" out-file)
        local-out-file (str local-dir "/" out-file)]
    ;; clear any stale output before running the job
    (ftu/hadoop-delete hadoop-out-file)
    (?<- (hfs-textline hadoop-out-file) [?uuid ?gold ?combined]
         ((hfs-textline in-file) ?line)
         (ftu/get-field ?line 0 :> ?uuid)
         (ftu/get-field ?line 1 :> ?gold)
         (weight-values ?line weight-vector :> ?combined)
         (:distinct false))
    (ftu/hadoop-copy-results hadoop-out-file local-out-file)
    ;; the copy under hadoop-dir is only a staging area
    (ftu/hadoop-delete hadoop-out-file)))

(defn evaluate-sort
  "Estimate how often the test values order pairs of rows the same way
   the gold values do.

   file holds tab-separated lines whose fields 1 and 2 are the gold and
   test values (parsed with read-string).  comparison_count random pairs
   of lines are drawn with rand-nth; a pair counts as a match when
   (gold-1 > gold-2) and (test-1 > test-2) have the same truth value.
   Returns the fraction of matches as a float."
  [file comparison_count]
  (let [rows (re-split #"\n" (slurp file))
        field (fn [fields idx] (read-string (nth fields idx)))]
    (loop [remaining comparison_count
           agreed 0]
      (if-not (= remaining 0)
        (let [row-a (re-split #"\t" (rand-nth rows))
              row-b (re-split #"\t" (rand-nth rows))
              gold-a (field row-a 1)
              test-a (field row-a 2)
              gold-b (field row-b 1)
              test-b (field row-b 2)
              same-order? (= (> gold-a gold-b) (> test-a test-b))]
          (recur (dec remaining)
                 (if same-order? (inc agreed) agreed)))
        (float (/ agreed comparison_count))))))

(defn mean-and-std
  "Return [mean spread] of numbers, both as floats.

   NOTE: despite the name, the second value is the mean absolute
   deviation from the mean (average of |x - mean|), not the standard
   deviation."
  [numbers]
  (let [n (count numbers)
        mean (/ (reduce + numbers) n)
        abs-dev-total (reduce + 0 (map #(abs (- % mean)) numbers))]
    [(float mean) (float (/ abs-dev-total n))]))

;; Unimplemented stub: the body is empty, so every call returns nil
;; regardless of the arguments.
(defn evaluate-pairs
  [pairs comparison_count]
  )

(defn new-evaluate-sort-helper
  "Score how closely the test values track the gold values over randomly
   sampled pairs of rows.

   split-lines is a collection of [gold test] pairs.  For each of
   comparison_count random pairs of rows, the gold difference and the
   test difference are each divided by the spread reported by
   mean-and-std for their column.  A pair contributes
   closeness = 1 / exp(|gold-delta - test-delta|), weighted by
   |(gold-delta + test-delta) / 2|.  Returns the weighted mean closeness
   as a float."
  [split-lines comparison_count]
  (let [[_ gold-std] (mean-and-std (map first split-lines))
        [_ test-std] (mean-and-std (map second split-lines))]
    (loop [remaining comparison_count
           weighted-total 0
           weight-total 0]
      (if-not (= remaining 0)
        (let [[gold-a test-a] (rand-nth split-lines)
              [gold-b test-b] (rand-nth split-lines)
              gold-delta (/ (- gold-a gold-b) gold-std)
              test-delta (/ (- test-a test-b) test-std)
              weight (abs (/ (+ gold-delta test-delta) 2))
              closeness (/ 1 (exp (abs (- gold-delta test-delta))))]
          (recur (dec remaining)
                 (+ weighted-total (* closeness weight))
                 (+ weight-total weight)))
        (float (/ weighted-total weight-total))))))

(defn split_fields
  "Read file and return a lazy sequence with one [field-1 field-2] pair
   per line, where the fields are tab-separated (0-based) and each is
   parsed with read-string."
  [file]
  (letfn [(parse-line [line]
            (let [fields (re-split #"\t" line)]
              [(read-string (nth fields 1))
               (read-string (nth fields 2))]))]
    (map parse-line (re-split #"\n" (slurp file)))))

(defn new-evaluate-sort
  "Run new-evaluate-sort-helper on the [gold test] pairs parsed from
   file (tab-separated fields 1 and 2 of every line), using
   comparison_count random comparisons."
  [file comparison_count]
  ;; split_fields performs the slurp/split/read-string parsing that used to
  ;; be duplicated here.
  (new-evaluate-sort-helper (split_fields file) comparison_count))

(defn root-mean-sq-helper
  "Root-mean-square difference between the first and second element of
   every pair in split-lines: sqrt(sum((gold - test)^2) / count)."
  [split-lines]
  (let [squared-diffs (map (fn [[gold-num test-num]]
                             (sqr (- gold-num test-num)))
                           split-lines)
        total (reduce + 0 squared-diffs)]
    (sqrt (/ total (count split-lines)))))

(defn root-mean-sq
  "Root-mean-square difference between the gold and test values stored
   in tab-separated fields 1 and 2 of every line of file."
  [file]
  ;; split_fields performs the slurp/split/read-string parsing that used to
  ;; be duplicated here.
  (root-mean-sq-helper (split_fields file)))

(defn process-rank
  "Append the shared rank (low-rank + high-rank) / 2 to every tuple of
   in-process with conj, and return those tuples followed by result."
  [result in-process low-rank high-rank]
  (let [shared-rank (/ (+ low-rank high-rank) 2)
        ranked (map (fn [tuple] (conj tuple shared-rank)) in-process)]
    (concat ranked result)))

(defn add-rank-field
  "Append a rank to every tuple of table, which must already be sorted
   on the field at index sorted-field.

   Ranks start at 1.  Consecutive tuples with equal values in that field
   form a tie group and all receive the average of the group's first and
   last rank.  The result lists the tuples in the reverse of their input
   order."
  [sorted-field table]
  (let [tie-groups (partition-by #(nth % sorted-field) table)
        rank-group (fn [[ranked first-rank] group]
                     (let [last-rank (+ first-rank (count group) -1)]
                       ;; reversed so that the output order is the full
                       ;; reverse of the input order
                       [(process-rank ranked (reverse group) first-rank last-rank)
                        (+ last-rank 1)]))
        [ranked _] (reduce rank-group [() 1] tie-groups)]
    ranked))

(defn spearman_helper
  "Spearman rank correlation of the [first second] pairs in split-lines,
   computed as 1 - 6 * sum(d^2) / (n * (n^2 - 1)), where d is the
   difference between a row's rank by first element and its rank by
   second element (ties share their average rank, see add-rank-field).
   Returns a float."
  [split-lines]
  (let [n (count split-lines)
        ;; each row becomes [first second rank-by-first rank-by-second]
        doubly-ranked (->> split-lines
                           (sort-by first)
                           (add-rank-field 0)
                           (sort-by second)
                           (add-rank-field 1))
        sum-d-squared (reduce + 0 (map #(sqr (- (nth % 2) (nth % 3)))
                                       doubly-ranked))]
    (float (- 1 (/ (* 6 sum-d-squared)
                   (* n (- (sqr n) 1)))))))

(defn spearman_coefficient
  "Spearman rank correlation between the values in tab-separated fields
   1 and 2 of every line of file."
  [file]
  (-> file
      split_fields
      spearman_helper))