(ns leafgrabber.free-text.extractor
  (:use [clojure.contrib.str-utils :only (re-split)]
        [clojure.java.io :only (resource)]
        [clojure.data.json :only (json-str write-json read-json)]
        [clojure.string :only (trim)])
  (:require [clojure.contrib.str-utils2 :as cst]
            [leafgrabber.free-text.utils :as ftu]
            [leafgrabber.free-text.tables :as tbl]
            [leafgrabber.page :as pge]
            [leafgrabber.register :as reg]
            [leafgrabber.xpath :as xpt]
            [opennlp.nlp :as nlp]
            ))

(def tokenize (nlp/make-tokenizer (resource "en-token.bin")))
(def pos-tag (nlp/make-pos-tagger (resource "en-pos-maxent.bin")))

(defn list-pairs
  [in-list text]
  (if ; re-split doesn't really split if the match is at the end of the string
      (and (= (count in-list) 1)
           (not (= (count (first in-list)) (count text))))
    (list (list (first in-list) ""))
    (loop [ret nil left in-list]

      (if (< (count left) 2)
        ret
        (recur (concat ret (list (list (first left) (second left))))
               (rest left)))
      )))

(defn parse-patterns
  "turn a list of patterns of the form ['<pref> CORE <suff>' <value> <conf>?]
   into a list of the form [<pref> <suff> <value> <conf>]"
  [patterns core-rx]
  (map #(let [shards (re-split #"CORE" (str (first %)))
              value (second %)
              conf (if (= (count %) 3) (nth % 2) 1)
              prefix (if-not (empty? (first shards))
                       (re-pattern (str "(?is)" (first shards) "$")))
              suffix (if-not (empty? (second shards))
                       (re-pattern (str "(?is)^" (second shards))))]
          (vector prefix core-rx suffix value conf))
       patterns)
  )

(def default-filter
  #{"select" "a" "script" "input" "label"
    "div[@id='BPmainRightCol']"
    "div[@id='advanced_search']"
    "div[@id='ft']"
    "div[@id='other_category_recommendations']"
    "div[@id='popularfrom']"
    "div[@id='hotel-list-topic']"
    "div[@id='related_questions']"
    "div[@id='SearchControlMainDiv']"
    "div[@id='treeview_body_restaurants']"
    "div[@class='item cg_featured']"
    "div[@class='spmightlike']"})

(defn page-to-filtered-html-features
  [page filter-xpath only-xpath content-only?]
  (let [dom (:dom @page)
        html-fn (if content-only? xpt/content xpt/to-html)
        ignored-nodes (xpt/xpath filter-xpath dom)]
    (doseq [node ignored-nodes] (xpt/unlink node))
    (let [nodes (if only-xpath
                  (xpt/xpath only-xpath dom)
                  (xpt/children dom))]
      (if (not (empty? nodes))
        (apply str (cons " -BREAK- " (interpose " -BREAK- " (map html-fn nodes))))
        ""))
    ))

(defn page-to-filtered-html
  "Runs an extractor against a page object"
  [extractor page]
  (let [filter-xpath (:filter-xpath extractor)
        only-xpath (:only-xpath extractor)
        content-only? (:content-only extractor)]
    (page-to-filtered-html-features page filter-xpath only-xpath content-only?)
 ))

(defn url-to-filtered-html
  [url filter-xpath only-xpath content-only?]
  (when-let [page (pge/url->page (trim url))]
    (page-to-filtered-html-features page filter-xpath only-xpath content-only?))
  )

(defn extract-from-page
  "Runs an extractor against a page object"
  [ext page]
  (let [extractor (ext tbl/*extractor-table*)
        html (page-to-filtered-html extractor page)
        classifier (:classifier extractor)
        value (apply classifier (list html))]
    {ext (read-json value)}))

(defn add-ext-group
  [group exts]
  (tbl/add-group group exts)
  (reg/register-extractor-group group exts extract-from-page))

(defn get-enum-value
  [value pref-find core-find suff-find]
  (if (number? value)
    (let [pref-groups (if (vector? pref-find) (rest pref-find))
          core-groups (if (vector? core-find) (rest core-find))
          suff-groups (if (vector? suff-find) (rest suff-find))
          all-groups (concat pref-groups core-groups suff-groups)]
      (nth all-groups (- value 1)))
    value)
  )

(defn eval-single-enum-context
  [prefix core suffix patterns default default-conf]
  (loop [patterns-left patterns]
    (if (empty? patterns-left)
      (hash-map default default-conf)
      (let [pattern (first patterns-left)
            pref-rx (nth pattern 0)
            pref-find (if pref-rx (re-find pref-rx prefix))
            core-rx (nth pattern 1)
            core-find (re-find core-rx core)
            suff-rx (nth pattern 2)
            suff-find (if suff-rx (re-find suff-rx suffix))
            value (nth pattern 3)
            conf (nth pattern 4)]
        (cond (and pref-rx (not pref-find)) (recur (rest patterns-left))
              (and suff-rx (not suff-find)) (recur (rest patterns-left))
              true (hash-map (get-enum-value value pref-find core-find suff-find) conf))
        ))
  ))

(defn combine-enum-values
  "each member of values is a singleton map: {value conf}"
  [values out-map]
  (if (empty? values)
    out-map
    (let [first-val (first values)
          kvp (first first-val)
          value (key kvp)
          conf (val kvp)]
      (if (not value)
        (recur (rest values) out-map)
        (recur (rest values) (assoc out-map value (+ conf (get out-map value 0))))))
   ))

(defn add-boundaries
  [regex nobreak]
  (let [regex1 (if (and (not nobreak)
                        (not-any? #(= (first (str regex)) %) '(\: \?)))
                 (str "\\b(?:" regex ")")
                 (str "(?:" regex ")"))
        regex2 (if (and (not nobreak)
                        (not-any? #(= (last (str regex)) %) '(\: \?)))
                 (str regex1 "\\b")
                 regex1)]
    (re-pattern regex2))
  )

(defn affix-pairs
  [core-rx text nobreak]
  (list-pairs (re-split (add-boundaries core-rx nobreak) text) text))

(defn cores
  [core-rx text nobreak]
  (map #(if (vector? %)
          (first %)
          %)
       (re-seq (add-boundaries core-rx nobreak) text)))

(defn enum-values
  [affix-pairs cores patterns default default-conf]
  (map #(eval-single-enum-context (first %1) %2 (second %1) patterns default default-conf) affix-pairs cores))

(defn make-enum-classifier
  [core-rx patterns default default-conf nobreak]
  (fn
    [text]
    (let [affix-pairs (affix-pairs core-rx text nobreak)
          cores (cores core-rx text nobreak)]
      (if affix-pairs
        (let [enum-values (enum-values affix-pairs cores patterns default default-conf)]
          (json-str
           (combine-enum-values enum-values {})))
        "{\"no-evidence\":1}")
      ))
    )

(defn make-enum-validator
  [core-rx patterns default default-conf context-size nobreak]
  (fn
    [text]
    (let [affix-pairs (list-pairs (re-split (add-boundaries core-rx nobreak) text) text)
          cores (map #(if (vector? %) (first %) %) (re-seq (add-boundaries core-rx nobreak) text))]
      (if affix-pairs
        (map (fn [affix-pair core]
               (json-str (vector (eval-single-enum-context (first affix-pair)
                                                           core
                                                           (second affix-pair)
                                                           patterns
                                                           default
                                                           default-conf)
                                 (ftu/remove-breaks
                                  (cst/tail (first affix-pair) context-size))
                                 core
                                 (ftu/remove-breaks
                                  (cst/take (second affix-pair) context-size))
                                 )))
             affix-pairs cores)
        '("[{\"no-evidence\":1},null,null,null]"))
      )))

(defn make-enum-preprocessor
  "Make a function that takes a url and returns the string to work on"
  [filter-xpath only-xpath content-only?]
  (fn [url]
    (url-to-filtered-html url filter-xpath only-xpath content-only?))
  )

(defn sum-maps
  [map1 map2]
  (if (empty? map1)
    map2
    (recur (rest map1)
           (let [kvp (first map1)
                 key (first kvp)
                 value (second kvp)]
             (assoc map2 key (+ value (get map2 key 0)))))
    ))

(defn sum-enum-ext-agg-helper
  [values ret]
  (if (empty? values)
    ret
    (recur (rest values) (sum-maps (read-json (first values)) ret))
   ))

(defn sum-enum-ext-aggregator
  [values]
  (json-str (sum-enum-ext-agg-helper values {})))

(defn enum-comparer
  [re-value test-value]
  (cond (= re-value test-value) true
        ((read-json re-value) (keyword test-value)) true
        true false)
  )

(defn enum-regex-errors
  [specs]
  (let [spec-keys (keys specs)
        good-keys '(:name :core :context :patterns :default :filter :content-only
                          :comment :nobreak :normalize :only-xpath)
        bad-keys (mapcat (fn [spec-key]
                           (if (not-any? #(= % spec-key) good-keys)
                             (list spec-key)))
                         spec-keys)
        bad-key (first bad-keys)]
    (cond (not-any? #(= % :name) spec-keys) "no name given"
          (not-any? #(= % :core) spec-keys) (str "no core given for " (:name specs))
          (not-any? #(= % :default) spec-keys) (str "no default given for " (:name specs))
          bad-key (str "unknown field " bad-key " in " (:name specs)))
  ))

(defn add-enum-regex
  [specs]
  (let [error (enum-regex-errors specs)
        name (:name specs)
        core-rx (re-pattern (str #"(?is)" (:core specs)))
        context-size (or (:context specs) 40)
        parsed-patterns (parse-patterns (:patterns specs) core-rx)
        default (:default specs)
        filter (:filter specs)
        filter-xpath (apply str (rest (apply str (map #(str "|.//" %) (or filter default-filter)))))
        only-xpath (:only-xpath specs)
        content-only (:content-only specs)
        nobreak (:nobreak specs)
        list-def? (or (vector? default) (list? default))
        def-val (if list-def? (first default) default)
        def-conf (if list-def?
                   (if (= (count default) 2)
                     (second default)
                     1)
                   1)]
    (if error
      (println error)
      (do (tbl/add-extractor name
                             {:core core-rx
                              :patterns parsed-patterns
                              :default def-val
                              :default-conf def-conf
                              :classifier (make-enum-classifier core-rx parsed-patterns def-val def-conf nobreak)
                              :validator (make-enum-validator core-rx parsed-patterns def-val def-conf context-size nobreak)
                              :normalize (:normalize specs)
                              :aggregator sum-enum-ext-aggregator
                              :comparer enum-comparer
                              :filter-xpath filter-xpath
                              :only-xpath only-xpath
                              :content-only content-only
                              :preprocess (make-enum-preprocessor filter-xpath only-xpath content-only)
                              :preprockey ["enum" filter content-only]})
          (reg/register-extractor-group name (list name) extract-from-page)
          )
      )
    ))

(defn remove-tags-old
  [enum-value]
  (let [raw-val (first enum-value)
        untag-val (.trim (apply str (interpose " " (take-nth 2 (re-split #"[- ]" raw-val)))))
        ;untag-val raw-val
        ]
    (hash-map untag-val (second enum-value))
    )
  )

(defn remove-tags
  [enum-value]
  (loop [ret {}
         togo enum-value]
    (if (empty? togo)
      ret
      (let [raw-val (.trim (first (first togo)))
            count (second (first togo))
            untag-val (if (= raw-val "no-evidence")
                        raw-val
                        (apply str (interpose " " (take-nth 2 (re-split #"[_ ]" raw-val)))))]
        (recur (assoc ret untag-val count) (rest togo))
    ))))

(defn make-tag-validator
  [core-rx patterns default default-conf context-size])

(defn make-tag-preprocessor
  [filter-xpath only-xpath content-only?]
  (fn [url]
    (let [filtered-html (url-to-filtered-html url filter-xpath only-xpath content-only?)]
      (if (and filtered-html (not (= filtered-html "")))
        (let [tagged-html (pos-tag (tokenize filtered-html))]
          (apply str (interpose " " (map #(apply str (interpose "_" %)) tagged-html))))))
    )
  )

(defn make-tag-classifier
  [core-rx patterns default default-conf nobreak]
  (fn
    [text]
    (let [affix-pairs (affix-pairs core-rx text nobreak)
          cores (cores core-rx text nobreak)]
      (if affix-pairs
        (let [enum-values (enum-values affix-pairs cores patterns default default-conf)
              tag-values (map remove-tags enum-values)]
          (json-str
           (combine-enum-values tag-values {})))
        "{\"no-evidence\":1}")
      ))
    )


(defn add-tag-regex
  [specs]
  (let [error (enum-regex-errors specs)
        name (:name specs)
        core-rx (re-pattern (str "(?is)" (:core specs)))
        context-size (or (:context specs) 40)
        parsed-patterns (parse-patterns (:patterns specs) core-rx)
        default (:default specs)
        filter (:filter specs)
        filter-xpath (apply str (rest (apply str (map #(str "|.//" %) (or filter default-filter)))))
        only-xpath (:only-xpath specs)
        content-only (:content-only specs)
        nobreak (:nobreak specs)
        list-def? (or (vector? default) (list? default))
        def-val (if list-def? (first default) default)
        def-conf (if list-def?
                   (if (= (count default) 2)
                     (second default)
                     1)
                   1)]
    (if error
      (println error)
      (do (tbl/add-extractor name
                             {:core core-rx
                              :patterns parsed-patterns
                              :default def-val
                              :default-conf def-conf
                              :classifier (make-tag-classifier core-rx parsed-patterns def-val def-conf nobreak)
                              :validator (make-enum-validator core-rx parsed-patterns def-val def-conf context-size nobreak)
                              :aggregator sum-enum-ext-aggregator
                              :comparer enum-comparer
                              :filter-xpath filter-xpath
                              :only-xpath only-xpath
                              :content-only content-only
                              :preprocess (make-tag-preprocessor filter-xpath only-xpath content-only)
                              :preprockey ["tag" filter content-only]})
          (reg/register-extractor-group name (list name) extract-from-page)
          )
      )
    ))

(defn extract-from-string
  "Get a value for an extractor for a single string"
  [ext-key string]
  (let [extractor (ext-key tbl/*extractor-table*)
        classifier (:classifier extractor)
        value (apply classifier (list string))]
    value
  ))
