;;----------------------------------------------------------------------------
;; Copyright 2011 Factual, Inc.
;; All Rights Reserved.
;;
;; This is UNPUBLISHED PROPRIETARY SOURCE CODE of Factual, Inc.
;; Factual, Inc. reserves all rights in the source code as
;; delivered. The source code and all contents of this file, may not be
;; used, copied, modified, distributed, sold, disclosed to third parties
;; or duplicated in any form, in whole or in part, for any purpose,
;; without the prior written permission of Factual, Inc.
;;----------------------------------------------------------------------------

(ns leafgrabber.core
  (:use cascalog.api
        [clojure.data.json :only [json-str]]
        [clojure.tools.logging :only [warn debug]]
        [clj-time
         [core :only [now]]
         [format :only [unparse formatters]]]
        [let-else :only [let?]]
        [leafgrabber.register :only [extractor-groups]]
        [jackknife.def :only [defnk]])
  (:require [leafgrabber
             [page :as page]
             [clean :as clean]]
            [cascalog [ops :as c] [vars :as v]]
            [cascalog-commons.core :as cc]
            [clojure.string :as s]
            [juiceful.utils :as u]
            [clojure.tools.cli :as cl]

            ;; force extractors to register themselves
            [leafgrabber.extract
             base
             classify
             authority
             sitegrab]
            [leafgrabber.free-text.author]

            )
  (:import [java.io File])
  (:gen-class
   :methods [#^{:static true} [extractApi [String String String] String]]))


(defn combine-maps
  [maps]
  (if (map? maps)
    maps
    (into {} maps)))

(defn extract-to-map
  "Run the named extractor groups on page to produce map.
   Each extractor within a group can return a map or a seq of maps.
   All maps for each extractor and for all extractors in the group
   and for all groups in group-names-str are combined into one map.

   group-names is a collection of zero or more group names.
   If group-names is empty or missing, the base extractors are run.

   Registered groups can be a collection of fns to apply to pages, or a collection of
   extractor names and a single fn of arity two to apply to extractor names and pages.

   Throws an exception if any of the group names are not registered."

  [page group-names]

  (let [group-names (or (seq group-names)
                         ["base"])

         look-up-group
         (fn [group-name]
           (let [m (re-find #"^:(.*)" group-name)
                 k (if m
                     (keyword (m 1))
                     group-name)]
             (or (@extractor-groups k)
                 (throw (RuntimeException. (str "unregistered extractor group: " group-name))))))

         group-tuples (map look-up-group group-names)

         run-one-extractor-group
         (fn [group-tuple]
           (let [[extractors f] group-tuple
                  page-fn (if f
                            #(f % page)
                            #(% page))
                  maps (map #(combine-maps (page-fn %))
                            extractors)]
             (combine-maps maps)))]

    (combine-maps
     (map run-one-extractor-group group-tuples))))

(defn remove-map-nils
  "Remove keys from a map whose values are falsey. Doesn't share structure, so not especially efficient."
  [map]
  (into {} (filter second map)))

(defnk extract-to-datastore-map
  "Run all extractors on page to produce map, then make map Datastore 2.0 compatible."

  [page group-names]

  (let [input-meta (remove-map-nils
                    {:origin "LEAF_GRABBER"
                     :type "STANDARD"
                     :sourceUrl (:url @page)
                     :processedDate (unparse (:basic-date formatters)
                                             (now))})
        ex (extract-to-map page group-names)
        name (or (ex "hcard:org")
                 (ex "og:title"))
        payload (if name
                  (merge (sorted-map "_header" name) ex)
                  ex)]
    (remove-map-nils
     {:uuid (:uuid @page)
      :payload payload
      :inputMeta input-meta})))

(defn extract
  "Make a page from url, then run all extractors in group-names-str on the page
   to produce a map, then convert the map to a JSON string.
   If the uuid argument is non-nil, include it in the map and JSON.

   group-names is a collection of zero or more group names."

  [url group-names uuid]

  (page/use-public-dcache-server)
  (let? [page (page/url->page url :uuid uuid)
         :else "cache-miss"]
    (json-str
     (extract-to-datastore-map page group-names))))

(defn separate-group-names
  [group-names-str]
  (let? [group-names-str (not-empty group-names-str) :else []]
    (s/split (s/trim group-names-str) #"\s+")))

(defn -extractApi
  [url group-names-str uuid]
  (extract url (separate-group-names group-names-str) uuid))

(defn join-paths [prefix-path suffix-path]
  (str (File. prefix-path suffix-path)))

(defn extract-to-json-gen
  "Fetches and parses 'url', and runs extractors.
   Returns a status string, a JSON of extractor results, and the length of the page."
  [url uuid group-names max-len]
  (debug (str "URL: " url))
  (let? [page (page/url->page url :uuid uuid :max-len max-len)
         :else ["cache-miss" nil 0]
         c (:len @page)]
    (if (:overlimit @page)
      ["overlimit" nil c]
      ["json"
       (json-str
        (extract-to-datastore-map page group-names))
       c])))

(defnk extract-from-urls
  "Takes a directory of tabbed URL files (url-dir), the positions of the URL and UUID and
   the extractor group names, as optional keyword args :url-pos, :uuid-pos and :groups,
   fetches html from dcache, converts to xhtml, runs the appropriate extractors on the xhtml,
   and writes JSON strings with extractor names/values and URL to subdir 'json' of out-dir.

   URLs not in dcache are written to subdir 'cache-miss'.
   Other errors are written to subdir 'trap'.
   Counts of each type of output are written to subdir 'count'.

   If :url-pos is missing, position zero is used.
   If :uuid-pos is missing the uuids are not extracted.

   :groups is a collection of zero or more extractor group names,
   e.g. \"base\", \"classify\" or \"authority\".
   If :groups is empty or missing the base extractors are used.

   url-dir can be either a file or a directory.
   If :path is present, it is prepended to url-dir and out-dir, with a separator character
   inserted if necessary.

   If :config is present, it's a map of hadoop job configs that override our \"standard\" configs.

   If :max-len is present, pages greater than max-len characters are written to an overlimit subdir,
   with no extractions done."

  [url-dir out-dir :path "" :url-pos 0 :uuid-pos nil :groups [] :config {} :max-len nil]
  (time
   (cc/with-standard-conf
     config
     (cascalog.io/with-fs-tmp [_ tmp-dir]
       (let [extr-tap (hfs-seqfile tmp-dir)
             url-path (join-paths path url-dir)
             url-tap (hfs-textline url-path)
             out-path (join-paths path out-dir)
             error-tap (cc/hfs-template
                        (hfs-textline out-path)
                        "%s"
                        "?status")
             tapify #(hfs-textline (join-paths out-path %))
             json-tap (tapify "json")
             count-tap (tapify "count")
             trap-tap (tapify "trap")]
         (?<- extr-tap
              [?url !json ?status ?len]
              (url-tap ?line)
              (cc/parse-on-indices [url-pos uuid-pos] ?line :> ?url !uuid)
              (extract-to-json-gen ?url !uuid groups max-len :> ?status !json ?len)
              (:trap trap-tap)
              (:distinct false))
         (?- json-tap
             (<- [?json]
                 (extr-tap _ ?json _ _)
                 (:distinct false))
             error-tap
             (<- [?url ?status ?len]
                 (extr-tap ?url _ ?status ?len)
                 (not= ?status "json")
                 (:distinct false))
             count-tap
             (<- [?status ?count]
                 (extr-tap _ _ ?status _)
                 (c/count ?count))))))))

(defn classify-page [url max-len]
  (debug (str "URL: " url))
  (u/debug-time
   "classify"
   (let? [page (page/url->page url :max-len max-len)
          :else ["cache-miss" 0]
          c (:len @page)]
     (if (:overlimit @page)
       ["overlimit" c]
       [(:page_class (extract-to-map page ["classify"])) c]))))

(defn make-class-query
  [url-dir url-pos max-len trap-path]

  (let [url-tap (hfs-textline url-dir)
        trap-tap (hfs-textline trap-path)]
    (<- [?line ?class ?len]
        (url-tap ?line)
        (cc/parse-on-indices [url-pos] ?line :> ?url)
        (classify-page ?url max-len :> ?class ?len)
        (:trap trap-tap)
        (:distinct false))))

(defnk classify-pages
  "Run the page classifier on URLs in url-dir at field position given by the keyword param :url-pos (which defaults to 0).
   Input lines are written to subdirectories {leaf,directory,cache-miss,overlimit,trap}."

  [url-dir out-dir :path "" :url-pos 0 :config {} :max-len nil]
  (time
   (cc/with-standard-conf
     config
     (let [url-path (join-paths path url-dir)
           out-path (join-paths path out-dir)
           out-tap (cc/hfs-template
                    (hfs-textline out-path)
                    "%s"
                    "?class")
           trap-path (join-paths out-path "trap")]
       (let [class-query (make-class-query url-path url-pos max-len trap-path)]
         (?<- out-tap
              [?line ?class ?len]
              (class-query ?line ?class ?len)
              (:distinct false)))))))

(defn has-exclude-xpath-gen
  [url]
  (let [pipeline-map (clean/get-pipeline-map url)
        xp (:exclude-xpath pipeline-map)]
    [(if xp
       "excludexp"
       "noexcludexp")]))

(defnk find-urls-with-exclude-xpath
  "For each URL in url-dir at field position given by the keyword param :url-pos (which defaults to 0),
   determine whether there is an :exclude-xpath in the pipeline map for that URL.
   Input lines are written to subdirectories {excludexp,noexcludexp,trap}."

  [url-dir out-dir :path "" :url-pos 0 :config {}]
  (cc/with-standard-conf
    config
    (let [url-path (join-paths path url-dir)
          url-tap (hfs-textline url-path)
          out-path (join-paths path out-dir)
          out-tap (cc/hfs-template
                   (hfs-textline out-path)
                   "%s"
                   "?excludexp")
          trap-path (join-paths out-path "trap")
          trap-tap (hfs-textline trap-path)]
      (?<- out-tap
           [?line ?excludexp]
           (url-tap ?line)
           (cc/parse-on-indices [url-pos] ?line :> ?url)
           (has-exclude-xpath-gen ?url :> ?excludexp)
           (:distinct false)
           (:trap trap-tap)))))

(defnk extract-from-json
  "Get values from JSONs in in-dir from the fields described by keys, as separate output tuples,
   which are written as tab-delimited records to out-dir.
   json-keys are keywords or sequences of keywords that are paths through nested maps to the field.
   When values are sequences, the output tuples are made by cartesian product of the value sequences.

   :path is an optional prefix to in-dir and out-dir.

   When :limit is specified, it is the maximum number of output records to produce per input line.

   The number of output fields is :num-fields if specified, otherwise is the same as the number of keys.
   This allows for seqs of seqs, which get spread across output fields.

   If :include-json? is truthy, include the entire JSON as the last output field."

  [in-dir out-dir json-keys :path "" :limit nil :num-fields nil :include-json? nil]
  (let [in-tap (cc/json-datasource-keys
                (join-paths path in-dir)
                json-keys
                :limit limit
                :num-fields num-fields
                :include-json? include-json?)
        out-tap (hfs-textline (join-paths path out-dir))]
    (?- out-tap in-tap)))


(defnk find-failed-records
  "Find join keys in dir-1 that aren't in dir-2.
   Path is prefix for dir-1, dir-2 and out-dir."
  [dir-1 dir-2 out-dir :path "" :join-pos-1 0 :join-pos-2 0]
  (time
   (cc/with-standard-conf
     (let [in-1 (hfs-textline (join-paths path dir-1))
           in-2 (hfs-textline (join-paths path dir-2))
           out-tap (hfs-textline (join-paths path out-dir))
           in-1-query (<- [?join]
                        (in-1 ?line)
                        (cc/parse-on-indices [join-pos-1] ?line :> ?join)
                        (:distinct false))
           in-2-query (<- [?join]
                        (in-2 ?line)
                        (cc/parse-on-indices [join-pos-2] ?line :> ?join)
                        (:distinct false))]
       (?<- out-tap
            [?join]
            (in-1-query ?join)
            (in-2-query ?join :> false))))))


(defn -main [& args]
  (let [nil-or-num #(when-not (nil? %) (Integer. %))
        m (cl/cli args
                  (cl/optional ["-i" "--in" "Path to a directory or file of URLs"])
                  (cl/optional ["-o" "--out" "Output path for JSON"])
                  (cl/optional ["-u" "--url-pos" "Field position of URL" :default 0] #(Integer. %))
                  (cl/optional ["-d" "--uuid-pos" "Field position of UUID"] nil-or-num)
                  (cl/optional ["-x" "--max-len" "Maximum length of page"] nil-or-num)
                  (cl/optional ["-g" "--groups" "Extractor group names" :default ""] separate-group-names)
                  (cl/optional ["-m" "--mappers" "Number of map tasks"] nil-or-num)
                  (cl/optional ["-c" "--classify-pages" "Use classify-pages rather than extract-from-urls"])
                  (cl/optional ["--urls-with-excludexp" "Use find-urls-with-exclude-xpath rather than extract-from-urls"])
                  (cl/optional ["--url" "A single URL to extract from"])
                  (cl/optional ["--uuid" "A UUID to include in the extracted JSON"])
                  (cl/optional ["-p" "--public-dcache" "Use dcache.factual.com rather than localhost:8050"]))]

    (when (:public-dcache m)
      (page/use-public-dcache-server))
    (cond (and (:in m)
               (:out m))

          (let [f (cond
                   (:classify-pages m) classify-pages
                   (:urls-with-excludexp m) find-urls-with-exclude-xpath
                   :else extract-from-urls)]
            (apply f
                   (:in m)
                   (:out m)
                   :config (select-keys m [:mappers])
                   (apply concat
                          (select-keys m [:groups :url-pos :uuid-pos :max-len]))))
          ,
          (:url m)
          (apply extract (map m [:url :groups :uuid])),
          ,
          :else
          "Either [-i and -o] or --url is required."
          )))
