
(ns leafgrabber.free-text.query
  (:use [cascalog.api :only (lfs-textline hfs-textline defmapop defbufferop with-job-conf deffilterop defmapcatop <- ?<-)]
        [cascalog-commons.core :only (with-standard-conf)]
        [clojure.tools.logging :only (warn debug)]
        [clojure.contrib.string :only (as-str)]
        [clojure.contrib.str-utils :only (re-split)]
        [clojure.data.json :only (json-str read-json)]
        )
  (:require [leafgrabber.free-text
             [utils :as ftu]
             [author :as aut]
             [tables :as tbl]]
            [leafgrabber.core :as lgc]
            [leafgrabber.page :as pge]
            [leafgrabber.xpath :as xpt]
            [cascalog.ops :as cop]
            [clojure.tools.cli :as cl]
            )
  (:gen-class)
  )

;; ---------------------------------------------------------------------------
;; File-system locations used by the free-text flows.
;; ---------------------------------------------------------------------------

;; uuid/url source read by run-ft when :source is "sample".
;; Alternative that was used at some point (hotel sample):
;;   "/apps/extract/poi/UnitedStates/other/hotels_restaurants/hotels/sample.02.filtered"
(def sample-uuid-url-dir "/apps/extract/poi/UnitedStates/other/TEST_2011_05_04_DailyDedupeRun/restaurants.sample.0.5/leafgrabber.urls/merged")

;; Filtered hotel sample urls.
(def hotel-sample-url-dir "/apps/extract/poi/UnitedStates/other/hotels_restaurants/hotels/sample.02.filtered")

;; Filtered hotel leaf urls.
(def hotel-leaf-url-dir "/apps/extract/poi/UnitedStates/other/hotels_restaurants/hotels/filtered/leaf")

;; Currently the same path as sample-uuid-url-dir.
;; Alternative that was used for local testing:
;;   "/tmp/hadoop-davidg/test-urls"
(def uuid-url-input-dir "/apps/extract/poi/UnitedStates/other/TEST_2011_05_04_DailyDedupeRun/restaurants.sample.0.5/leafgrabber.urls/merged")

;; "auth" subset of the restaurant leafgrabber urls.
(def auth-uuid-url-dir "/apps/extract/poi/UnitedStates/other/TEST_2011_05_04_DailyDedupeRun/restaurants/leafgrabber.urls/auth")

;; uuid/url source read by run-ft when :source is "full".
(def full-uuid-url-dir "/apps/extract/poi/UnitedStates/other/TEST_2011_05_04_DailyDedupeRun/restaurants/leafgrabber.urls/filtered")

;; Parent directory of the auth/filtered url directories above.
(def full-uuid-url-home "/apps/extract/poi/UnitedStates/other/TEST_2011_05_04_DailyDedupeRun/restaurants/leafgrabber.urls")

;; Default root under which run-ft writes <ext-or-group>/<date>/... results.
(def results-dir "/apps/extract/poi/UnitedStates/other/hotels_restaurants/free-text")

;; Summaries input; used as the summary side of the join in step-raw-ext-join-qa.
(def uuid-name-input-dir "/apps/extract/poi/UnitedStates/other/TEST_2011_05_04_DailyDedupeRun/summaries")

;; Merged leafgrabber data for the restaurant sample.
(def lg-merge-dir "/apps/extract/poi/UnitedStates/other/TEST_2011_05_04_DailyDedupeRun/restaurants.sample.0.5/merged-data")

(defn copy-uuid-urls
  "Copy the uuid and url of every line read from source-tap into uuid-url-tap.

   uuid-url-tap - sink tap that receives the [uuid url] tuples
   source-tap   - source tap yielding one text line per record; the uuid and
                  url are pulled out of each line with ftu/uuid-from-tabbed
                  and ftu/url-from-tabbed

   Duplicate pairs are kept (:distinct false). A short banner naming the sink
   is printed to stdout before the query runs."
  [uuid-url-tap source-tap]
  (with-standard-conf
    ;; a single `do` form keeps the macro call shape the same as before
    ;; (it previously wrapped one empty `let`)
    (do
      (println)
      (println (str "copying uuid-url pairs to " uuid-url-tap))
      (println)
      (?<- uuid-url-tap [?uuid ?url]
           (source-tap ?line)
           (ftu/uuid-from-tabbed ?line :> ?uuid)
           (ftu/url-from-tabbed ?line :> ?url)
           (:distinct false)))))

(defn extractor-value
  "Run one extractor's classifier over a string.

   string-val - the text handed to the classifier
   ext        - extractor name; (keyword ext) is the key looked up in
                tbl/*extractor-table*

   Returns a two-element list: the extractor name as a string followed by
   whatever the classifier returned."
  [string-val ext]
  (let [table-key (keyword ext)
        classify (:classifier (table-key tbl/*extractor-table*))
        result (classify string-val)]
    (list (as-str ext) result)))

(defn ext-by-filter
  "Group extractor names by the :preprockey of their entry in
   tbl/*extractor-table*.

   Returns a map from :preprockey value to a list of the extractors sharing
   it. Each extractor is consed onto its group, so a group lists its members
   in reverse order of appearance in exts."
  [exts]
  (reduce (fn [grouped ext]
            (let [preproc-key (:preprockey (tbl/*extractor-table* (keyword ext)))]
              (update-in grouped [preproc-key] #(cons ext %))))
          {}
          exts))

(defn extract-from-url
  "Preprocess url with the extractor stored under ext-key in
   tbl/*extractor-table* and classify the resulting page.

   Calls pge/use-public-dcache-server first and prints ext-key to stdout.
   Returns the classifier's result, or the string
   \"page not found in dcache\" when the preprocessor yields nil/false."
  [ext-key url]
  (pge/use-public-dcache-server)
  (let [{preprocess :preprocess, classify :classifier} (ext-key tbl/*extractor-table*)
        page (preprocess url)]
    (println ext-key)
    (if-not page
      "page not found in dcache"
      (classify page))))

(defmapcatop add-extractor-values
  "Emit one (extractor-name value) tuple per extractor in exts for url.

   Extractors are grouped with ext-by-filter; the :preprocess function of the
   first extractor in each group is applied to url once, and every extractor
   in that group classifies the result (or \"\" when preprocessing returned
   nil/false). max-len is accepted but not used."
  [url exts max-len]
  (mapcat (fn [[_ group-exts]]
            (let [preprocess (:preprocess (tbl/*extractor-table* (keyword (first group-exts))))
                  html (or (preprocess url) "")]
              (map #(extractor-value html %) group-exts)))
          (ext-by-filter exts)))

(defmapcatop trivial-op
  "Smoke-test operation: ignores both arguments and always emits the single
   tuple (\"hello\" \"world\")."
  [url exts]
  (list (list "hello" "world")))

(defn raw-ext-val-q
  "get the raw extractor values for the uuids in the src tap

   sink-tap  - receives [uuid url ext ext-raw-val] tuples
   src-tap   - lines whose field 0 is the uuid and field 1 the url
               (as read by ftu/get-field)
   trap-tap  - registered as the query's :trap
   exts      - extractor names handed to add-extractor-values
   config    - passed as the first form inside with-standard-conf
   max-count - forwarded to add-extractor-values (which does not use it)

   Prints a banner naming the sink before running. Duplicates are kept."
  [sink-tap src-tap trap-tap exts config max-count]
  (println)
  (println "Building raw extractor values: " sink-tap)
  (println)
  ;; empty comment form: evaluates to nil and has no effect
  (comment)
  ;; NOTE(review): presumably with-standard-conf treats `config` as extra job
  ;; configuration -- confirm against cascalog-commons; elsewhere in this file
  ;; the macro is called with a body only.
  (with-standard-conf
    config
    (?<- sink-tap [?uuid ?url ?ext ?ext-raw-val]
         (src-tap ?line)
         (ftu/get-field ?line 0 :> ?uuid)
         (ftu/get-field ?line 1 :> ?url)
         (add-extractor-values ?url exts max-count :> ?ext ?ext-raw-val)
         (:distinct false)
         (:trap trap-tap)))
  )

(defn trivial-q
  "Smoke-test query: feeds every line of src-tap through trivial-op and
   writes the resulting [hello world] tuples to sink-tap."
  [sink-tap src-tap]
  (with-standard-conf
    (?<- sink-tap [?hello ?world]
         (src-tap ?line)
         ;; NOTE(review): trivial-op declares two parameters (url, exts) but
         ;; only ?line is supplied here -- confirm this still runs.
         (trivial-op ?line :> ?hello ?world)))
  )

(defn get-mode-value
  "Pick the most frequent value out of a JSON object of value -> count.

   raw-val is parsed with read-json. Entries keyed :no-evidence are skipped.
   The entry with the strictly highest count wins; when another entry ties
   the current highest count the result falls back to \"no-evidence\" until a
   strictly higher count is seen. Returns the winner as a string, or
   \"no-evidence\" when nothing qualifies."
  [raw-val]
  (let [counts (read-json raw-val)
        [_ winner] (reduce (fn [[best-count _ :as best] [candidate n]]
                             (cond
                               (= candidate :no-evidence) best
                               (> n best-count) [n candidate]
                               (= n best-count) [n "no-evidence"]
                               :else best))
                           [0 "no-evidence"]
                           counts)]
    (as-str winner)))

(defn pick-one-val-q
  "Reduce each raw extractor value to its single most common value.

   raw-ext-tap - lines with fields uuid, url, ext, ext-raw-val (0..3)
   sink-tap    - receives [uuid url ext mode-val], where mode-val is
                 get-mode-value applied to ext-raw-val

   Prints a banner naming the sink before running."
  [sink-tap raw-ext-tap]
  (println)
  (println "Pick the most common value: " sink-tap)
  (println)
  (with-standard-conf
    (?<- sink-tap [?uuid ?url ?ext ?mode-val]
         (raw-ext-tap ?line)
         (ftu/get-field ?line 0 :> ?uuid)
         (ftu/get-field ?line 1 :> ?url)
         (ftu/get-field ?line 2 :> ?ext)
         (ftu/get-field ?line 3 :> ?ext-raw-val)
         (get-mode-value ?ext-raw-val :> ?mode-val)))
  )

(defbufferop aggregate-extractor-values
  "Aggregate a group of (ext value) tuples with the extractor's :aggregator.

   The extractor is taken from the first tuple's first field; its aggregator
   is called once with the sequence of all second fields. Emits a single
   result."
  [tuples]
  (let [ext-key (keyword (ffirst tuples))
        aggregate (:aggregator (ext-key tbl/*extractor-table*))
        raw-values (map second tuples)]
    (list (aggregate raw-values))))

(defn agg-ext-val-q
  "aggregate the values from an extractor

   raw-ext-tap - lines with fields uuid, url, ext, ext-raw-val (0..3)
   sink-tap    - receives [uuid ext agg-ext-val], produced by the
                 aggregate-extractor-values buffer

   Prints a banner naming the sink before running."
  [sink-tap raw-ext-tap]
  (println)
  (println "Aggregating extractor values: " sink-tap)
  (println)
  (with-standard-conf
    (?<- sink-tap [?uuid ?ext ?agg-ext-val]
         (raw-ext-tap ?line)
         (ftu/get-field ?line 0 :> ?uuid)
         ;; ?url is extracted but not part of the output
         (ftu/get-field ?line 1 :> ?url)
         (ftu/get-field ?line 2 :> ?ext)
         (ftu/get-field ?line 3 :> ?ext-raw-val)
         ;; presumably grouped per ?uuid/?ext by Cascalog -- TODO confirm
         (aggregate-extractor-values ?ext ?ext-raw-val :> ?agg-ext-val)))
  )

(defn total-agg-val-q
  "aggregate the values from an extractor across all UUIDs

   agg-ext-tap - lines whose field 1 is the extractor and field 2 its
                 per-uuid aggregated value
   sink-tap    - receives [ext agg-val], produced by the
                 aggregate-extractor-values buffer

   Prints the same banner text as agg-ext-val-q before running."
  [sink-tap agg-ext-tap]
  (println)
  (println "Aggregating extractor values: " sink-tap)
  (println)
  (with-standard-conf
    (?<- sink-tap [?ext ?agg-val]
         (agg-ext-tap ?line)
         (ftu/get-field ?line 1 :> ?ext)
         (ftu/get-field ?line 2 :> ?agg-ext-val)
         (aggregate-extractor-values ?ext ?agg-ext-val :> ?agg-val)))
  )

(defbufferop aggregate-numeric-values-old
  "Older numeric aggregation: sum the values of a group of (ext raw-val)
   tuples and pass the sum to the extractor's :normalize function.

   A raw value counts only when it is a non-empty string of digits; anything
   else (including nil and the empty string) contributes 0. The extractor is
   taken from the first tuple's first field. Emits a single result."
  [tuples]
  (let [ext-key (keyword (first (first tuples)))
        extractor (ext-key tbl/*extractor-table*)
        normalizer (:normalize extractor)
        values (map (fn [tuple]
                      (let [raw-val (second tuple)]
                        ;; require at least one digit: the previous \"no
                        ;; non-digit\" test let \"\" through to parseInt,
                        ;; which throws
                        (if (and (string? raw-val) (re-matches #"\d+" raw-val))
                          (Integer/parseInt raw-val)
                          0)))
                    tuples)]
    (list (normalizer (apply + values)))))

(defbufferop aggregate-numeric-values
  "Sum the numeric values of a group of (ext raw-val) tuples and pass the sum
   to the extractor's :normalize function.

   A raw value is numeric only when it is a non-empty string of digits;
   anything else (including nil and the empty string) is treated as \"NA\".
   Emits \"NA\" when no value in the group is numeric; otherwise non-numeric
   values count as 0 in the sum. The extractor is taken from the first
   tuple's first field. Emits a single result."
  [tuples]
  (let [values (map (fn [tuple]
                      (let [raw-val (second tuple)]
                        ;; require at least one digit: the previous \"no
                        ;; non-digit\" test let \"\" through to parseInt,
                        ;; which throws
                        (if (and (string? raw-val) (re-matches #"\d+" raw-val))
                          (Integer/parseInt raw-val)
                          "NA")))
                    tuples)]
    (if (every? #(= "NA" %) values)
      (list "NA")
      (let [ext-key (keyword (first (first tuples)))
            extractor (ext-key tbl/*extractor-table*)
            normalizer (:normalize extractor)
            num-values (map #(if (= % "NA") 0 %) values)]
        (list (normalizer (apply + num-values)))))))

(defn agg-sum-val-q
  "aggregate numeric values from an extractor by summing them

   pick-one-tap - lines with fields uuid, url, ext, value (0..3)
   sink-tap     - receives [uuid ext sum-val], produced by the
                  aggregate-numeric-values buffer

   Prints a banner naming the sink before running."
  [sink-tap pick-one-tap]
  (println)
  (println "Summing extractor values: " sink-tap)
  (println)
  (with-standard-conf
    (?<- sink-tap [?uuid ?ext ?sum-val]
         (pick-one-tap ?line)
         (ftu/get-field ?line 0 :> ?uuid)
         ;; ?url is extracted but not part of the output
         (ftu/get-field ?line 1 :> ?url)
         (ftu/get-field ?line 2 :> ?ext)
         (ftu/get-field ?line 3 :> ?ext-raw-val)
         (aggregate-numeric-values ?ext ?ext-raw-val :> ?sum-val)))
  )

(defbufferop make-agg-ext-json
  "Flatten a group of tuples into alternating keys and values, build a
   hash-map from them and emit it as one JSON string."
  [tuples]
  (->> tuples
       flatten
       (apply hash-map)
       json-str
       list))

(defn json-agg-val-q
  "put aggregated extractor values into json form

   agg-ext-tap - lines with fields uuid, ext, agg-ext-val (0..2)
   sink-tap    - receives [uuid json], where json is built by the
                 make-agg-ext-json buffer from the (ext, agg-ext-val) pairs

   Prints a banner naming the sink before running."
  [sink-tap agg-ext-tap]
  (println)
  (println "Putting aggregated extractor values into json form: " sink-tap)
  (println)
  (with-standard-conf
    (?<- sink-tap [?uuid ?json]
         (agg-ext-tap ?line)
         (ftu/get-field ?line 0 :> ?uuid)
         (ftu/get-field ?line 1 :> ?ext)
         (ftu/get-field ?line 2 :> ?agg-ext-val)
         (make-agg-ext-json ?ext ?agg-ext-val :> ?json)))
  )

(defn extractor-qa-value
  "Run an extractor's :validator over a page and pair each accepted item
   with the extractor name.

   page - preprocessed page handed to the validator
   ext  - extractor name; (keyword ext) is the key looked up in
          tbl/*extractor-table*

   Returns a lazy sequence of (extractor-name item) lists, one per validator
   item for which (ftu/get-json-item item 1) is truthy. Items failing that
   check are omitted; they used to be returned as empty lists, which the
   mapcat-based callers passed straight through as empty tuples."
  [page ext]
  (let [extractor ((keyword ext) tbl/*extractor-table*)
        validator (:validator extractor)
        ext-name (as-str ext)]
    (for [item (validator page)
          :when (ftu/get-json-item item 1)]
      (list ext-name item))))

(defmapcatop add-extractor-qa-values
  "Emit the QA tuples produced by extractor-qa-value for every extractor in
   exts, given a url.

   Extractors are grouped with ext-by-filter; the :preprocess function of the
   first extractor in each group is applied to url once. A group whose page
   comes back empty contributes nothing."
  [url exts]
  (mapcat (fn [[_ group-exts]]
            (let [preprocess (:preprocess (tbl/*extractor-table* (keyword (first group-exts))))
                  page (preprocess url)]
              (when (seq page)
                (mapcat #(extractor-qa-value page %) group-exts))))
          (ext-by-filter exts)))

(defn add-extractor-qa-values-fn
  "Plain-function counterpart of the add-extractor-qa-values operation.

   Groups exts with ext-by-filter, preprocesses url once per group using the
   first extractor's :preprocess function, and concatenates the results of
   extractor-qa-value for every extractor of each group whose page is not
   empty."
  [url exts]
  (letfn [(group-values [group-entry]
            (let [group-exts (second group-entry)
                  head-extractor (tbl/*extractor-table* (keyword (first group-exts)))
                  preprocess (:preprocess head-extractor)
                  page (preprocess url)]
              (when-not (empty? page)
                (mapcat (fn [ext] (extractor-qa-value page ext)) group-exts))))]
    (mapcat group-values (ext-by-filter exts))))

(defn raw-ext-qa-q
  "Compute the QA (validator) values for every uuid/url line in src-tap.

   sink-tap - receives [uuid url ext ext-val-val] tuples
   src-tap  - lines whose field 0 is the uuid and field 1 the url
   trap-tap - registered as the query's :trap
   exts     - extractor names handed to add-extractor-qa-values

   Duplicates are kept."
  [sink-tap src-tap trap-tap exts]
  (with-standard-conf
    (?<- sink-tap [?uuid ?url ?ext ?ext-val-val]
         (src-tap ?line)
         (ftu/get-field ?line 0 :> ?uuid)
         (ftu/get-field ?line 1 :> ?url)
         (add-extractor-qa-values ?url exts :> ?ext ?ext-val-val)
         (:distinct false)
         (:trap trap-tap)))
  )

(defn raw-ext-join-qa-q
  "Join the QA extractor output with the summaries on uuid.

   sink-dir   - output directory; deleted with ftu/hadoop-delete first
   ext-qa-dir - lines with fields uuid, url, ext, att, ext-val-val (0..4)
   summ-dir   - lines with fields uuid, name, addr (0..2)

   Writes [uuid name addr url ext att ext-val-val] lines to sink-dir. Both
   inputs bind ?uuid, which is what joins them."
  [sink-dir ext-qa-dir summ-dir]
  (println)
  (println "Joining name and address: " sink-dir)
  (println)
  (ftu/hadoop-delete sink-dir)
  (with-standard-conf
    (?<- (hfs-textline sink-dir) [?uuid ?name ?addr ?url ?ext ?att ?ext-val-val]
         ((hfs-textline ext-qa-dir) ?qa-line)
         (ftu/get-field ?qa-line 0 :> ?uuid)
         (ftu/get-field ?qa-line 1 :> ?url)
         (ftu/get-field ?qa-line 2 :> ?ext)
         (ftu/get-field ?qa-line 3 :> ?att)
         (ftu/get-field ?qa-line 4 :> ?ext-val-val)
         ((hfs-textline summ-dir) ?summ-line)
         (ftu/get-field ?summ-line 0 :> ?uuid)
         (ftu/get-field ?summ-line 1 :> ?name)
         (ftu/get-field ?summ-line 2 :> ?addr)))
  )

(defn drop-no-evidence
  "Given an (ext raw-json) tuple, parse the JSON and return
   (ext parsed-map) with the :no-evidence entry removed."
  [tuple]
  (let [[ext raw-json] tuple
        evidence (-> raw-json
                     read-json
                     (dissoc :no-evidence))]
    (list ext evidence)))

(defn hash-map-from-tuples
  "Build a map from a sequence of (key value) tuples.

   Tuples whose key is nil or false are skipped; when a key occurs more than
   once the last value wins."
  [tuples]
  (reduce (fn [acc [k v]]
            (if k
              (assoc acc k v)
              acc))
          {}
          tuples))

(defbufferop make-ext-json
  "Each tuple holds ext and ext-raw-val for a single uuid/url pair.

   Strips :no-evidence from every raw value (drop-no-evidence), discards
   extractors left with an empty value, and emits the remaining
   ext -> value map as one JSON string."
  [tuples]
  (->> tuples
       (map drop-no-evidence)
       (remove #(empty? (second %)))
       hash-map-from-tuples
       json-str
       list))

(defmapop make-full-json
  "Parse json, add :factual_id (the uuid) and :_source (the url) to it and
   return the result re-encoded as a JSON string."
  [uuid url json]
  (-> json
      read-json
      (assoc :factual_id uuid :_source url)
      json-str))

(defn json-ext-val-q
  "Collect the raw extractor values of each uuid/url pair into one JSON
   object.

   src-tap   - lines with fields uuid, url, ext, ext-raw-val (0..3)
   sink-tap  - receives [uuid url json], where json is built by the
               make-ext-json buffer from the (ext, ext-raw-val) pairs
   error-tap - registered as the query's :trap

   Prints a banner before running. Duplicates are kept."
  [sink-tap src-tap error-tap]
  (println)
  (println "Putting extractor values into json")
  (println)
  (with-standard-conf
    (?<- sink-tap [?uuid ?url ?json]
         (src-tap ?line)
         (ftu/get-field ?line 0 :> ?uuid)
         (ftu/get-field ?line 1 :> ?url)
         (ftu/get-field ?line 2 :> ?ext)
         (ftu/get-field ?line 3 :> ?ext-raw-val)
         (make-ext-json ?ext ?ext-raw-val :> ?json)
         (:trap error-tap)
         (:distinct false))))

(defmapop make-thrift-json
  "Wrap a JSON payload in the thrift-style envelope and return it as a JSON
   string.

   The envelope holds the parsed payload, the uuid, a processing state of
   \"UNPROCESSED\", ftu/date-millis as the input date, and input metadata
   (origin \"FREE_TEXT\", type \"STANDARD\", the source url, and one
   {:scores value} entry per payload field)."
  [uuid url payload-json]
  (let [payload (read-json payload-json)
        field-metas (apply hash-map
                           (mapcat (fn [entry]
                                     (list (first entry) {:scores (second entry)}))
                                   payload))
        input-meta (hash-map :origin "FREE_TEXT"
                             :type "STANDARD"
                             :sourceUrl url
                             :fieldMetas field-metas)]
    (json-str (hash-map :payload payload
                        :uuid uuid
                        :processingState "UNPROCESSED"
                        :inputDate ftu/date-millis
                        :inputMeta input-meta))))

(defn thrift-ext-val-q
  "Convert each [uuid url json] line into a thrift-style JSON record.

   src-tap  - lines with fields uuid, url, json (0..2)
   sink-tap - receives one field per line: the output of make-thrift-json

   Prints a banner naming the sink before running. Duplicates are kept."
  [sink-tap src-tap]
  (println)
  (println "Putting extractor values into thrift json " sink-tap)
  (println)
  (with-standard-conf
    (?<- sink-tap [?thrift-json]
         (src-tap ?line)
         (ftu/get-field ?line 0 :> ?uuid)
         (ftu/get-field ?line 1 :> ?url)
         (ftu/get-field ?line 2 :> ?json)
         (make-thrift-json ?uuid ?url ?json :> ?thrift-json)
         (:distinct false)))
  )

(defn step-raw-ext-val-many
  "Step: compute raw extractor values for all exts.

   Converts every extractor in exts to a string and runs raw-ext-val-q from
   uuid-url-tap into raw-ext-tap, with trap-tap, config and max-count passed
   through."
  [exts uuid-url-tap raw-ext-tap trap-tap config max-count]
  (raw-ext-val-q raw-ext-tap
                 uuid-url-tap
                 trap-tap
                 (map as-str exts)
                 config
                 max-count))

(defn step-json-ext-val-many
  "Step: put the raw-ext-val output into json format.

   Source:      raw.ext.val  (raw-ext-tap)
   Destination: json.ext.val (json-ext-tap)
   error-tap is used as the query's trap."
  [raw-ext-tap json-ext-tap error-tap]
  (json-ext-val-q json-ext-tap raw-ext-tap error-tap))

(defn step-thrift-ext-val-many
  "Step: run thrift-ext-val-q, reading from raw-ext-tap and writing the
   thrift-style json records to thrift-ext-tap."
  [raw-ext-tap thrift-ext-tap]
  (thrift-ext-val-q thrift-ext-tap raw-ext-tap))

(defn step-pick-one-val-many
  "Step: pick the value with the highest count.

   Runs pick-one-val-q from raw-ext-tap into pick-one-tap."
  [raw-ext-tap pick-one-tap]
  (pick-one-val-q pick-one-tap raw-ext-tap))

(defn step-agg-ext-val-many
  "Step: aggregate the raw extractor values.

   Runs agg-ext-val-q from raw-ext-tap into agg-ext-tap."
  [raw-ext-tap agg-ext-tap]
  (agg-ext-val-q agg-ext-tap raw-ext-tap))

(defn step-total-agg-val-many
  "Step: aggregate the already-aggregated values across all uuids.

   Runs total-agg-val-q from agg-ext-tap into total-agg-tap."
  [agg-ext-tap total-agg-tap]
  (total-agg-val-q total-agg-tap agg-ext-tap))

(defn step-agg-sum-val-many
  "Step: sum the picked values. Only take this step for numeric extractors.

   Runs agg-sum-val-q from pick-one-tap into agg-ext-tap."
  [pick-one-tap agg-ext-tap]
  (agg-sum-val-q agg-ext-tap pick-one-tap))

(defn step-json-agg-val-many
  "Step: put the agg-ext-val output into json format.

   Source:      agg.ext.val  (agg-ext-tap)
   Destination: json.agg.val (json-agg-tap)"
  [agg-ext-tap json-agg-tap]
  (json-agg-val-q json-agg-tap agg-ext-tap))

(defn step-raw-ext-qa-many
  "Step: compute the QA values for all exts.

   Converts every extractor in exts to a string and runs raw-ext-qa-q from
   uuid-url-tap into raw-ext-tap, using trap-tap as the trap."
  [exts uuid-url-tap raw-ext-tap trap-tap]
  (raw-ext-qa-q raw-ext-tap
                uuid-url-tap
                trap-tap
                (map as-str exts)))

(defn step-raw-ext-join-qa
  "Step: join the QA output for att/date with the summaries.

   Works under <results-dir>/<att>/<date>: reads raw.ext.qa, joins it with
   uuid-name-input-dir and writes raw.ext.qa.join."
  [att date]
  (let [base-dir (str results-dir "/" (as-str att) "/" date)]
    (raw-ext-join-qa-q (str base-dir "/raw.ext.qa.join")
                       (str base-dir "/raw.ext.qa")
                       uuid-name-input-dir)))

(defn run-ft-error
  "Validate an argument map intended for run-ft.

   Returns nil when no problem is found, otherwise a string describing the
   first problem detected. Checks, in order: arg-map is a map; every key is
   either a recognized option or has a name starting with x; exactly one of
   :ext / :group is given, is a keyword and is known to the extractor tables;
   every extractor of the group exists; :dir is a string; :flow and
   :filesystem hold one of their allowed values."
  [arg-map]
  (if (not (map? arg-map))
    "Argument to run-ft must be a map"
    (let [known-keys #{:ext :group :dir :date :flow :source :filesystem
                       :results-dir :sum-agg :config :max-count}
          ;; Collect the offending keys themselves. The previous `some` call
          ;; returned the predicate's value, so the message always said
          ;; "Unrecognized argument true".
          bad-args (filter #(and (not (= (first (as-str %)) \x))
                                 (not (contains? known-keys %)))
                           (keys arg-map))
          ext (:ext arg-map)
          group (:group arg-map)
          group-exts (tbl/*extractor-groups* group)
          bad-exts (filter #(not (tbl/*extractor-table* %)) group-exts)
          dir (:dir arg-map)
          flow (:flow arg-map)
          filesystem (:filesystem arg-map)]
      (cond (seq bad-args) (str "Unrecognized argument " (first bad-args) " to run-ft")
            (and ext group) ":ext and :group must not both be specified"
            (and (nil? ext) (nil? group)) "either :ext or :group must be specified"
            (and ext (not (keyword? ext))) "value of :ext must be a keyword"
            (and ext (not (tbl/*extractor-table* ext))) "unrecognized extractor"
            (and group (not (keyword? group))) "value of :group must be a keyword"
            (and group (not (tbl/*extractor-groups* group))) "unrecognized group"
            (not (empty? bad-exts)) (str "group contains bad extractor(s): " (apply str (interpose " " bad-exts)))
            (and dir (not (string? dir))) "value of :dir must be a string YYYY_MM_DD"
            (and flow (not (contains? #{"production" "qa" "sum"} flow))) "value of :flow must be \"production\", \"qa\" or \"sum\""
            (and filesystem (not (contains? #{"hadoop" "local"} filesystem))) "value of :filesystem must be \"hadoop\" or \"local\""))))

(defn get-dir-and-tap
  "Return [dir tap] for the sub-directory spec-dir of base-dir.

   dir is base-dir and spec-dir joined with a slash. tap is an hfs-textline
   tap on dir when file-system is \"hadoop\", otherwise an lfs-textline tap."
  [base-dir spec-dir file-system]
  (let [dir (str base-dir "/" spec-dir)
        make-tap (if (= file-system "hadoop") hfs-textline lfs-textline)]
    [dir (make-tap dir)]))

(defn run-ft-sum
  "Run the \"sum\" flow under base-dir.

   Optionally (do-copy) refreshes src.urls from source-dir, then produces, in
   order: raw.ext.val (raw extractor values), pick.one (most common value per
   url/extractor) and agg.ext.val (summed values). Each output directory is
   deleted right before the step that writes it."
  [exts base-dir file-system source-dir do-copy config max-count]
  (let [dir-and-tap #(get-dir-and-tap base-dir % file-system)
        clear! #(ftu/file-delete % file-system)
        [_ trap-tap] (dir-and-tap "error.tap")
        [uuid-url-dir uuid-url-tap] (dir-and-tap "src.urls")
        [raw-ext-dir raw-ext-tap] (dir-and-tap "raw.ext.val")
        [pick-one-dir pick-one-tap] (dir-and-tap "pick.one")
        [agg-ext-dir agg-ext-tap] (dir-and-tap "agg.ext.val")]
    (when do-copy
      (ftu/hadoop-delete uuid-url-dir)
      (copy-uuid-urls uuid-url-tap (hfs-textline source-dir)))
    (clear! raw-ext-dir)
    (step-raw-ext-val-many exts uuid-url-tap raw-ext-tap trap-tap config max-count)
    (clear! pick-one-dir)
    (step-pick-one-val-many raw-ext-tap pick-one-tap)
    (clear! agg-ext-dir)
    (step-agg-sum-val-many pick-one-tap agg-ext-tap)))

(defn run-ft-qa
  "Run the \"qa\" flow under base-dir.

   Optionally (do-copy) refreshes src.urls from source-dir, deletes
   raw.ext.qa and recomputes it with step-raw-ext-qa-many. config and
   max-count are accepted for signature parity with the other flows but are
   not used."
  [exts base-dir file-system source-dir do-copy config max-count]
  (let [dir-and-tap #(get-dir-and-tap base-dir % file-system)
        [_ trap-tap] (dir-and-tap "error.tap")
        [uuid-url-dir uuid-url-tap] (dir-and-tap "src.urls")
        [raw-ext-qa-dir raw-ext-qa-tap] (dir-and-tap "raw.ext.qa")]
    (when do-copy
      (ftu/hadoop-delete uuid-url-dir)
      (copy-uuid-urls uuid-url-tap (hfs-textline source-dir)))
    (ftu/file-delete raw-ext-qa-dir file-system)
    (step-raw-ext-qa-many exts uuid-url-tap raw-ext-qa-tap trap-tap)))

(defn run-ft-prod
  "Run the \"production\" flow under base-dir.

   Optionally (do-copy) refreshes src.urls from source-dir, then produces, in
   order: raw.ext.val, json.ext.val and thrift.ext.val. Each output directory
   is deleted right before the step that writes it; the error.tap directory
   is deleted before the raw step and again before the json step."
  [exts base-dir file-system source-dir do-copy config max-count]
  (let [dir-and-tap #(get-dir-and-tap base-dir % file-system)
        clear! #(ftu/file-delete % file-system)
        [trap-dir trap-tap] (dir-and-tap "error.tap")
        [uuid-url-dir uuid-url-tap] (dir-and-tap "src.urls")
        [raw-ext-dir raw-ext-tap] (dir-and-tap "raw.ext.val")
        [json-ext-dir json-ext-tap] (dir-and-tap "json.ext.val")
        [thrift-ext-dir thrift-ext-tap] (dir-and-tap "thrift.ext.val")]
    (println "in run-ft-prod")
    (when do-copy
      (ftu/hadoop-delete uuid-url-dir)
      (copy-uuid-urls uuid-url-tap (hfs-textline source-dir)))
    ;; raw extractor values
    (clear! raw-ext-dir)
    (clear! trap-dir)
    (step-raw-ext-val-many exts uuid-url-tap raw-ext-tap trap-tap config max-count)
    ;; one json object per uuid/url
    (clear! json-ext-dir)
    (clear! trap-dir)
    (step-json-ext-val-many raw-ext-tap json-ext-tap trap-tap)
    ;; thrift-style records
    (clear! thrift-ext-dir)
    (step-thrift-ext-val-many json-ext-tap thrift-ext-tap)))

(defn run-ft-agg
  "Run the \"agg\" flow under base-dir.

   Optionally (do-copy) refreshes src.urls from source-dir, then produces, in
   order: raw.ext.val, agg.ext.val (per-uuid aggregation) and total.agg.val
   (aggregation across uuids). Each output directory is deleted right before
   the step that writes it; the error.tap directory is deleted before the
   raw step and again before the aggregation step."
  [exts base-dir file-system source-dir do-copy config max-count]
  (let [dir-and-tap #(get-dir-and-tap base-dir % file-system)
        clear! #(ftu/file-delete % file-system)
        [trap-dir trap-tap] (dir-and-tap "error.tap")
        [uuid-url-dir uuid-url-tap] (dir-and-tap "src.urls")
        [raw-ext-dir raw-ext-tap] (dir-and-tap "raw.ext.val")
        [agg-ext-dir agg-ext-tap] (dir-and-tap "agg.ext.val")
        [total-agg-dir total-agg-tap] (dir-and-tap "total.agg.val")]
    (when do-copy
      (ftu/hadoop-delete uuid-url-dir)
      (copy-uuid-urls uuid-url-tap (hfs-textline source-dir)))
    (clear! raw-ext-dir)
    (clear! trap-dir)
    (step-raw-ext-val-many exts uuid-url-tap raw-ext-tap trap-tap config max-count)
    (clear! agg-ext-dir)
    (clear! trap-dir)
    (step-agg-ext-val-many raw-ext-tap agg-ext-tap)
    (clear! total-agg-dir)
    (step-total-agg-val-many agg-ext-tap total-agg-tap)))

(defn run-ft
  "Entry point for the free-text flows.

   arg-map options:
     :ext         - single extractor keyword (used when :group is not given)
     :group       - key into tbl/*extractor-groups*
     :date        - date sub-directory (default ftu/date-str)
     :results-dir - results root (default results-dir)
     :source      - \"sample\", \"full\" or an explicit source directory
                    (default \"sample\")
     :filesystem  - \"hadoop\" or \"local\" (default \"hadoop\")
     :flow        - \"production\", \"qa\", \"sum\" or \"agg\"
                    (default \"production\")
     :config      - passed through to the flow (default {})
     :max-count   - passed through to the flow
     :no-copy     - when truthy, do not refresh src.urls from the source

   Output goes under <results-dir>/<group-or-ext>/<date>. The source urls are
   copied only when :no-copy is not set and the file system is not
   \"local\". An option that is missing or nil gets its default. An
   unrecognized :flow logs a warning and returns nil without running
   anything."
  [arg-map]
  (let [{:keys [ext group no-copy max-count]} arg-map
        ;; `:or` defaults in map destructuring only apply to absent keys, so
        ;; callers passing e.g. {:date nil} used to end up with a nil date;
        ;; `or` covers both the absent and the nil case.
        config (or (:config arg-map) {})
        date (or (:date arg-map) ftu/date-str)
        res-dir (or (:results-dir arg-map) results-dir)
        file-system (or (:filesystem arg-map) "hadoop")
        flow (or (:flow arg-map) "production")
        source (or (:source arg-map) "sample")
        exts (or (tbl/*extractor-groups* group) (set (list ext)))
        ext-or-group (if group (as-str group) (as-str ext))
        base-dir (str res-dir "/" ext-or-group "/" date)
        source-dir (cond (= source "sample") sample-uuid-url-dir
                         (= source "full") full-uuid-url-dir
                         true source)
        do-copy (not (or no-copy (= file-system "local")))
        run-flow (cond (= flow "production") run-ft-prod
                       (= flow "qa") run-ft-qa
                       (= flow "sum") run-ft-sum
                       (= flow "agg") run-ft-agg)]
    (if run-flow
      (run-flow exts base-dir file-system source-dir do-copy config max-count)
      ;; previously an unknown flow fell through the cond silently
      (warn (str "run-ft: unrecognized flow \"" flow "\"; nothing was run")))))

(defn old-run-ft
  "Legacy driver, superseded by run-ft.

   This version simply stopped working. run-ft does all the same things,
   except it works.

   Validates arg-map with run-ft-error and returns the error string if there
   is one. Otherwise derives the working directories under
   <results-dir>/<dir>/<date>, copies the source urls when no :date was given
   on hadoop, and runs the \"production\", \"sum\" or \"qa\" flow. Several
   production steps are disabled with (comment ...) forms."
  [arg-map]
  (let [error (run-ft-error arg-map)]
    (if error error
        (let [{ext :ext group :group date :date source :source flow :flow
               file-system :filesystem res-dir :results-dir config :config
               max-count :max-count
               :or {flow "production" file-system "hadoop" res-dir results-dir config {}}} arg-map
               exts (or (tbl/*extractor-groups* group) (set (list ext)))
              dir (cond (:dir arg-map) (:dir arg-map)
                        group (as-str group)
                        ext (as-str ext))
              use-date (or date ftu/date-str)
              source-dir (cond (or (nil? source) (= source "sample")) sample-uuid-url-dir
                               (= source "full") full-uuid-url-dir
                               true source)
              base-dir (str res-dir "/" dir "/" use-date)
              [trap-dir trap-tap] (get-dir-and-tap base-dir "error.trap" file-system)
              [uuid-url-dir uuid-url-tap] (get-dir-and-tap base-dir "src.urls" file-system)
              [raw-ext-dir raw-ext-tap] (get-dir-and-tap base-dir "raw.ext.val" file-system)
              [raw-ext-qa-dir raw-ext-qa-tap] (get-dir-and-tap base-dir "raw.ext.qa" file-system)
              [agg-ext-dir agg-ext-tap] (get-dir-and-tap base-dir "agg.ext.val" file-system)
              [json-ext-dir json-ext-tap] (get-dir-and-tap base-dir "json.ext.val" file-system)
              [thrift-ext-dir thrift-ext-tap] (get-dir-and-tap base-dir "thrift.ext.val" file-system)
              [json-agg-dir json-agg-tap] (get-dir-and-tap base-dir "json.agg.val" file-system)
              [pick-one-dir pick-one-tap] (get-dir-and-tap base-dir "pick.one.val" file-system)]
          ;; a fresh run (no explicit date) on hadoop re-copies the source urls
          (if (and (nil? date) (= file-system "hadoop"))
            (do
              (ftu/hadoop-delete uuid-url-dir)
              (copy-uuid-urls uuid-url-tap (hfs-textline source-dir))))
          (if (and (= file-system "hadoop")
                   (= (ftu/hadoop-ls (str base-dir "/src.urls")) -1))
            (println "Source URLs not found. Run again without specifying a date")
            (cond (= flow "production")
              (do
                (if (= file-system "hadoop")
                  (do
                    (ftu/hadoop-delete raw-ext-dir)
                    (ftu/hadoop-delete trap-dir)))
                (step-raw-ext-val-many exts uuid-url-tap raw-ext-tap trap-tap config max-count)
                (comment (if (= file-system "hadoop")
                           (ftu/hadoop-delete json-ext-dir)))
                (comment (step-json-ext-val-many raw-ext-tap json-ext-tap))
                ;; the bare (comment) forms below evaluate to nil; the forms
                ;; following them on the same line are live code
                (comment) (if (= file-system "hadoop")
                            (ftu/hadoop-delete thrift-ext-dir))
                (comment) (step-thrift-ext-val-many raw-ext-tap thrift-ext-tap)
                (comment (if (= file-system "hadoop")
                           (ftu/hadoop-delete pick-one-dir)))
                (comment (step-pick-one-val-many raw-ext-tap pick-one-tap))
                (comment (if (= file-system "hadoop")
                           (ftu/hadoop-delete agg-ext-dir)))
                (comment (step-agg-ext-val-many raw-ext-tap agg-ext-tap))
                (comment (if (= file-system "hadoop")
                           (ftu/hadoop-delete json-agg-dir)))
                (comment (step-json-agg-val-many agg-ext-tap json-agg-tap)))

              (= flow "sum")
              (do
                (comment "Get the raw values for each extractor")
                (if (= file-system "hadoop")
                    (ftu/hadoop-delete raw-ext-dir))
                (step-raw-ext-val-many exts uuid-url-tap raw-ext-tap trap-tap config max-count)
                (comment "Get just one (numeric) value for each url/extractor")
                (if (= file-system "hadoop")
                  (ftu/hadoop-delete pick-one-dir))
                (step-pick-one-val-many raw-ext-tap pick-one-tap)
                (comment "Apply the normalizer for each extractor")
                (if (= file-system "hadoop")
                  (ftu/hadoop-delete agg-ext-dir))
                (step-agg-sum-val-many pick-one-tap agg-ext-tap))

              (= flow "qa")
              (do
                (println "qa flow on extractors " exts)
                (if (= file-system "hadoop")
                  (ftu/hadoop-delete raw-ext-qa-dir)
                  (ftu/local-delete raw-ext-qa-dir))
                ;; step-raw-ext-qa-many takes four arguments; the trap tap
                ;; was missing here, which made this call fail on arity
                (step-raw-ext-qa-many exts uuid-url-tap raw-ext-qa-tap trap-tap)
                ;(step-raw-ext-join-qa dir use-date)
                )

              )
            )
          ))))

(defn run-single-ext
  "Run run-ft for a single extractor with run-ft's default flow and source.

   ext  - extractor keyword
   date - optional date sub-directory; when omitted or nil the :date key is
          left out so that run-ft applies its own default

   The original parameter vector was [ext ? [date]], which made all three
   arguments mandatory. That three-argument form (ext, an ignored value, a
   sequence whose first element is the date) is still accepted."
  ([ext] (run-single-ext ext nil))
  ([ext date]
     (run-ft (merge {:ext ext} (when date {:date date}))))
  ([ext _ [date]]
     (run-single-ext ext date)))

(defn run-single-ext-full
  "Run run-ft for a single extractor on the \"full\" source.

   ext  - extractor keyword
   date - optional date sub-directory; when omitted or nil the :date key is
          left out so that run-ft applies its own default

   The original parameter vector was [ext ? [date]], which made all three
   arguments mandatory. That three-argument form (ext, an ignored value, a
   sequence whose first element is the date) is still accepted."
  ([ext] (run-single-ext-full ext nil))
  ([ext date]
     (run-ft (merge {:ext ext :source "full"} (when date {:date date}))))
  ([ext _ [date]]
     (run-single-ext-full ext date)))

(defn run-qa-single-ext
  "Run the \"qa\" flow of run-ft for a single extractor.

   ext  - extractor keyword
   date - optional date sub-directory; when omitted or nil the :date key is
          left out so that run-ft applies its own default

   The original parameter vector was [ext ? [date]], which made all three
   arguments mandatory. That three-argument form (ext, an ignored value, a
   sequence whose first element is the date) is still accepted."
  ([ext] (run-qa-single-ext ext nil))
  ([ext date]
     (run-ft (merge {:ext ext :flow "qa"} (when date {:date date}))))
  ([ext _ [date]]
     (run-qa-single-ext ext date)))

(defn run-qa-single-ext-full
  "Run the \"qa\" flow of run-ft for a single extractor on the \"full\"
   source.

   ext  - extractor keyword
   date - optional date sub-directory; when omitted or nil the :date key is
          left out so that run-ft applies its own default

   The original parameter vector was [ext ? [date]], which made all three
   arguments mandatory. That three-argument form (ext, an ignored value, a
   sequence whose first element is the date) is still accepted."
  ([ext] (run-qa-single-ext-full ext nil))
  ([ext date]
     (run-ft (merge {:ext ext :flow "qa" :source "full"} (when date {:date date}))))
  ([ext _ [date]]
     (run-qa-single-ext-full ext date)))

(defn run-group
  "Run run-ft for an extractor group with run-ft's default flow and source.

   group - extractor group keyword
   date  - optional date sub-directory; when omitted or nil the :date key is
           left out so that run-ft applies its own default

   The original parameter vector was [group ? [date]], which made all three
   arguments mandatory. That three-argument form (group, an ignored value, a
   sequence whose first element is the date) is still accepted."
  ([group] (run-group group nil))
  ([group date]
     (run-ft (merge {:group group} (when date {:date date}))))
  ([group _ [date]]
     (run-group group date)))

(defn run-group-full
  "Run run-ft for an extractor group on the \"full\" source.

   group - extractor group keyword
   date  - optional date sub-directory; when omitted or nil the :date key is
           left out so that run-ft applies its own default

   The original parameter vector was [group ? [date]], which made all three
   arguments mandatory. That three-argument form (group, an ignored value, a
   sequence whose first element is the date) is still accepted."
  ([group] (run-group-full group nil))
  ([group date]
     (run-ft (merge {:group group :source "full"} (when date {:date date}))))
  ([group _ [date]]
     (run-group-full group date)))

(defn run-qa-group
  "Run the \"qa\" flow of run-ft for an extractor group.

   group - extractor group keyword
   date  - optional date sub-directory; when omitted or nil the :date key is
           left out so that run-ft applies its own default

   The original parameter vector was [group ? [date]], which made all three
   arguments mandatory. That three-argument form (group, an ignored value, a
   sequence whose first element is the date) is still accepted."
  ([group] (run-qa-group group nil))
  ([group date]
     (run-ft (merge {:group group :flow "qa"} (when date {:date date}))))
  ([group _ [date]]
     (run-qa-group group date)))

(defn run-qa-group-full
  "Run the \"qa\" flow of run-ft for an extractor group on the \"full\"
   source.

   group - extractor group keyword
   date  - optional date sub-directory; when omitted or nil the :date key is
           left out so that run-ft applies its own default

   The original parameter vector was [group ? [date]], which made all three
   arguments mandatory. That three-argument form (group, an ignored value, a
   sequence whose first element is the date) is still accepted."
  ([group] (run-qa-group-full group nil))
  ([group date]
     (run-ft (merge {:group group :flow "qa" :source "full"} (when date {:date date}))))
  ([group _ [date]]
     (run-qa-group-full group date)))

(defn -main
  "Command-line entry point.

   Parses the options with clojure.tools.cli, turns the --ext value (or, when
   no --ext is given, the --group value) into a keyword, wraps a --mappers
   value into :config {:mappers <Integer>}, prints the resulting argument map
   and hands it to run-ft."
  [& args]
  (let [parsed (cl/cli args
                       (cl/optional ["-e" "--ext" "attribute to run"])
                       (cl/optional ["-g" "--group" "group of attributes to run"])
                       (cl/optional ["-dr" "--dir" "results subdirectory"])
                       (cl/optional ["-dt" "--date" "date subdirectory"])
                       (cl/optional ["-f" "--flow" "flow: production or qa"])
                       (cl/optional ["-s" "--source" "source: sample or full"])
                       (cl/optional ["-fs" "--filesystem" "file system: hadoop or local"])
                       (cl/optional ["-r" "--results-dir" "results directory"])
                       (cl/optional ["-m" "--mappers" "number of mappers"]))
        ;; exactly one of :ext / :group is keywordized; :ext wins when present
        selector (if (:ext parsed) :ext :group)
        keyed (assoc parsed selector (keyword (selector parsed)))
        mappers (:mappers keyed)
        run-args (if mappers
                   (assoc keyed :config {:mappers (Integer. mappers)})
                   keyed)]
    (println run-args)
    (run-ft run-args)))
