(ns com.kahui.spiders.dianping.commands
  (:import [java.util.concurrent Executors ThreadPoolExecutor TimeUnit ArrayBlockingQueue]
   [java.util.concurrent ThreadPoolExecutor$CallerRunsPolicy]
   [com.cdg.kahui.craft.etl.utils FileSystemUtils]
   [com.cdg.kahui.craft.etl.hadoop.avro AvorUtils AvorUtils$SizeFilterFunction AvorUtils$NoSuffixFileNameFunction AvorUtils$AutoUnCompressFunction]
   [com.google.common.io ByteStreams]
   [org.jsoup Jsoup]
   [java.net URL]
   [org.apache.hadoop.fs Path])
  (:require [com.kahui.spiders.dianping.jsoup-parser :as parser]
   [clojure.data.json :as json]
   [com.kahui.spiders.dianping.commons :as commons]
   [com.kahui.spiders.dianping.tools :as tools]
   [com.kahui.spiders.tools.utils :as utils]
   [clj-http.client :as http-client])
  (:use [com.kahui.spiders.dianping.spider]
   [com.kahui.spiders.tools.utils]
   [clojure.tools.logging :only (debug info error warn)]
   [clojure.tools.cli :as cli]
   [clojure.java.io :as io]))

(defn- usage-with-exit
  "Prints `msg` (only when it is truthy) and then `banner` to *out*, and finally
  terminates the JVM with `exit-code`. This function does not return."
  [exit-code banner msg]
  (when msg
    (println msg))
  (println banner)
  (System/exit exit-code))

; Forward declaration (placeholder) so the name exists before its definition below.
(declare next-dp-id-range)

(defn spider-dianping-store
  "Downloads Dianping store (merchant) data.

  `args` is the raw command-line argument sequence, parsed with `cli/cli`.
  --threads and --data-dir are required. The ids to download come from
  --start-id (inclusive) / --end-id (exclusive) when both are present, otherwise
  from --id-file (one id per line). When --proxy is given, a proxy pool is bound
  to *proxy-pool* for the duration of the download.

  Prints the usage banner and exits the JVM when --help is given or a required
  option is missing. Shuts down the agent thread pools before returning the
  string Finish."
  [args]
  (let [to-int #(Integer. %)
        [options _ banner] (cli/cli args "下载大众点评的商户数据"
                                    ["-h" "--help" "帮助" :default false :flag true]
                                    ["-s" "--start-id" "id-range:开始id(包含)" :parse-fn to-int]
                                    ["-e" "--end-id" "id-range:结束id(不包含)" :parse-fn to-int]
                                    ["--id-file" "id-file:id文件,每行一个"]
                                    ["-t" "--threads" "线程个数" :parse-fn to-int]
                                    ["-c" "--cookie" "cookie文件(可选)"]
                                    ["-d" "--data-dir" "下载数据存储的目录"]
                                    ["-p" "--proxy" "代理文件(可选)"]
                                    ["--cool-down-ms" "cool down毫秒数" :default (* 30 60 1000) :parse-fn to-int])
        {:keys [start-id end-id threads cookie data-dir proxy id-file cool-down-ms]} options
        ;; The cookie file is read before any option validation happens.
        cookie-content (when cookie (slurp cookie))
        ;; Each id source returns nil when its inputs are absent, otherwise a map
        ;; with the id sequence under :seq and, optionally, a resource under :close.
        ids-from-range (fn [from to]
                         (when (and from to)
                           {:seq (range from to 1)}))
        ids-from-file (fn [path]
                        (when path
                          (let [rdr (io/reader path)]
                            (println "load ids from file " path)
                            {:seq (line-seq rdr) :close rdr})))]
    (when (:help options)
      (usage-with-exit 0 banner nil))
    (when (or (nil? threads) (nil? data-dir))
      (usage-with-exit 1 banner "The threads and data-dir must be set."))
    (when (and (nil? id-file)
               (or (nil? start-id) (nil? end-id)))
      (usage-with-exit 1 banner "The start-id,end-id or id-file must be set."))

    (binding [*proxy-pool* (when-not (nil? proxy)
                             (create-proxy-pool proxy cool-down-ms nil nil))]
      ;; The id range takes precedence over the id file when both are supplied.
      (let [{ids-seq :seq to-close :close} (or (ids-from-range start-id end-id)
                                               (ids-from-file id-file))]
        (when (nil? ids-seq)
          (throw (RuntimeException. "no ids-seq")))
        (try
          (batch-down ids-seq threads cookie-content data-dir)
          (finally
            ;; Only the file-backed source carries a reader that must be closed.
            (when to-close
              (println "close" to-close)
              (.close to-close))))))
    (info "Shutdowning agents ")
    (shutdown-agents)
    "Finish"))

(defn op-store-es
  "Applies `op-fn` to the lines of the Dianping store data on a bounded thread
  pool (used to write store data into ElasticSearch).

  es-url:     ElasticSearch URL; passed as the first argument to `op-fn`
  store-file: store data path, opened through
              FileSystemUtils/getGlobFilesInputSupplier
  last-lc:    line-counter threshold as a string, or nil for 0; a line is
              submitted only when its counter value is greater than this number
  threads:    core pool size as a string, or nil for 1 (the maximum pool size
              is twice this value)
  op-fn:      function of [es-url line]; an Exception thrown by it is logged
              and does not stop the run

  The line counter is 2 for the first line, 3 for the second, and so on, so
  last-lc = N skips the first N-1 lines. Progress is logged whenever the
  counter is a multiple of 5000.
  NOTE(review): the counter being one ahead of the real line number is kept
  as-is so that values taken from earlier progress logs resume at the same
  line; confirm whether that offset is intended.

  Blocks until every submitted task has finished. The pool is shut down even
  when opening or reading the input fails."
  ([es-url store-file last-lc threads op-fn]
   (let [last-lc-number (if (nil? last-lc) 0 (Integer/parseInt last-lc))
         threads-number (if (nil? threads) 1 (Integer/parseInt threads))
         thread-pool (ThreadPoolExecutor. threads-number (* 2 threads-number) 1 TimeUnit/SECONDS (ArrayBlockingQueue. 100))]
     ;; When the 100-slot queue is full the submitting (reader) thread runs the
     ;; task itself, which throttles reading instead of rejecting work.
     (.setRejectedExecutionHandler thread-pool (ThreadPoolExecutor$CallerRunsPolicy.))
     (try
       (with-open [store-file-f (io/reader (.getInput (FileSystemUtils/getGlobFilesInputSupplier ^String store-file)))]
         ;; Pair every line with its counter value (starting at 2, see docstring).
         (doseq [[lc line] (map vector (iterate inc 2) (line-seq store-file-f))]
           (when (> lc last-lc-number)
             (.execute thread-pool (fn []
                                     (try
                                       (op-fn es-url line)
                                       (catch Exception e
                                         (error e "Post line " line))))))
           (when (= (mod lc 5000) 0)
             (info "lines:" lc))))
       (info "Waiting post agent finish")
       (finally
         ;; Always stop accepting work; without this, a failure while reading
         ;; would leave the pool's worker threads alive indefinitely.
         (.shutdown thread-pool)))
     (.awaitTermination thread-pool Integer/MAX_VALUE TimeUnit/MINUTES)
     (info "Waiting post agent finish..Done"))))

(defn put-dianping-store-es
  "Writes Dianping store data into ElasticSearch.

  `arg` is a positional argument sequence:
    arg 0: ElasticSearch URL
    arg 1: Dianping store data file
    arg 2: line number to skip up to (passed to `op-store-es` as last-lc)
    arg 3: number of threads
    arg 4: operation type, one of `put-store` or `update-store-confidence`

  Records without a shopID under :script-data are not posted.
  Throws IllegalArgumentException when the operation type is not supported."
  [arg]
  (let [url (first arg)
        store-file (second arg)
        last-lc (nth arg 2)
        threads (nth arg 3)
        op-type (nth arg 4)
        op-fns {
                ; Add (index) the store document, enriched with its confidence.
                "put-store"
                (fn [es-url json-str]
                  (let [json-obj (json/read-json json-str)
                        shopId (:shopID (:script-data json-obj))
                        confidence (tools/calc-store-confidence json-obj)
                        confidence (if (nil? confidence) 0 confidence)
                        rq-body (json/write-str (merge json-obj {:kahui-confidence confidence}))]
                    (when-not (nil? shopId)
                      (http-client/post es-url {:body rq-body}))))
                ; Update only the confidence of an existing store document.
                "update-store-confidence"
                (fn [es-url json-str]
                  (let [json-obj (json/read-json json-str)
                        shopId (:shopID (:script-data json-obj))
                        confidence (tools/calc-store-confidence json-obj)
                        confidence (if (nil? confidence) 0 confidence)
                        rq-body (json/write-str {:doc {:kahui-confidence confidence}})
                        es-url (str es-url "/" shopId "/_update")]
                    (when-not (nil? shopId)
                      (http-client/post es-url {:body rq-body}))))}
        op-fn (get op-fns op-type)]
    ;; Fail fast: a nil op-fn would otherwise fail on every single line inside
    ;; the worker pool and only show up as one logged error per line.
    (when (nil? op-fn)
      (throw (IllegalArgumentException.
               (str "Unknown operation type: " op-type
                    ", expected one of: " (vec (keys op-fns))))))
    (op-store-es url store-file last-lc threads op-fn)))

(defn merge-file
  "Merges files by copying their bytes into one destination.
  arg 0: destination file
  arg 1: input file; wildcards are supported
  "
  [arg]
  (let [^String dest-file (first arg)
        ^String input-file (second arg)]
    (println "Merge " input-file " to " dest-file)
    ;; The suppliers are created only after the start message has been printed.
    (let [source (FileSystemUtils/getGlobFilesInputSupplier input-file)
          sink (FileSystemUtils/getOutputSupplier dest-file)]
      (ByteStreams/copy source sink))
    (println "Merge " input-file " to " dest-file " finished.")))

(defn merge-file-avro
  "Merges Avro files found in the sub-directories of a root directory.
  arg 0: destination file
  arg 1: root directory of the input files
  arg 2: file wildcard pattern, applied inside each sub-directory
  "
  [arg]
  (let [dest-file (first arg)
        input-root (second arg)
        input-pattern (nth arg 2)]
    (println "Merge " input-root " to " dest-file " input-pattern: " input-pattern)
    ;; Only direct children that are directories are merged; plain files are ignored.
    (doseq [dir (.listFiles (io/file input-root))
            :when (.isDirectory dir)
            :let [input-file (str dir "/" input-pattern)]]
      (println "Merge " input-file " to " dest-file " begin")
      (AvorUtils/mergeFiles input-file
                            dest-file
                            (AvorUtils$SizeFilterFunction. 100)
                            (AvorUtils$NoSuffixFileNameFunction. true)
                            AvorUtils$AutoUnCompressFunction/INSTANCE)
      (println "Merge " input-file " to " dest-file " finished."))))

(defn list-store-id
  "Lists the shopID of the store records in the input file and writes them, one
  per line, to the destination file.
  arg 0: input file (opened through FileSystemUtils/getGlobFilesInputSupplier);
         every line is parsed as one JSON document
  arg 1: destination file

  Records that have no shopID under script-data are skipped, so the output
  never contains a nil entry."
  [arg]
  (let [input-file (first arg)
        dest-file (second arg)
        input (FileSystemUtils/getGlobFilesInputSupplier ^String input-file)]
    (with-open [r (io/reader (.getInput input))
                w (io/writer dest-file)]
      (binding [*out* w]
        (doseq [line (line-seq r)]
          ;; get-in is nil-safe: a record without \"script-data\" yields nil
          ;; instead of failing on a nil function call.
          (let [shop-id (get-in (json/read-str line) ["script-data" "shopID"])]
            (when-not (nil? shop-id)
              (println shop-id))))))))


(defn parse-failed-id
  "Extracts the ids of pages whose HTML parsing failed from the files in the
  error directory written by the dianping.mr.parser MR job.
  input-file: the input file
  dest-file:  the output file
  Each input line is parsed as JSON; its :name value, when present, is written
  as one line of the output.
  "
  ([arg]
   (parse-failed-id (first arg) (second arg)))
  ([input-file dest-file]
   (with-open [r (io/reader (.getInput (FileSystemUtils/getGlobFilesInputSupplier ^String input-file)))
               w (io/writer dest-file)]
     (binding [*out* w]
       (doseq [line (line-seq r)
               :let [id (:name (json/read-json line))]
               :when id]
         (println id))))))

(defn next-dp-id-range
  "Computes the id range for the next Dianping crawl.

  `args` are command-line style arguments parsed with `cli/cli`:
    --dp-fresh-shop-url  URL of the Dianping newest-shops page,
                         e.g. http://www.dianping.com/freshshop
    --archive-dir        directory holding the already downloaded store data
    --out-file           optional file the result is written to
    -h, --help           print the usage banner and exit

  Takes the id of the newest shop listed on the `dp-fresh-shop-url` page and the
  largest id found in the file names under `archive-dir`, and uses them as the
  start and end of the next crawl: [archive-max-id, dp-fresh-id].

  Returns the vector [max-archive-id latest-shop-id]. When --out-file is given,
  the two ids are also written to it separated by a tab.
  Prints the usage banner and exits when --help is given or a required option
  is missing. Throws IllegalStateException when either id cannot be determined
  or is 0."
  ([args]
   (let [[options _ banner] (cli/cli args "取得下一批点评爬取的ID区间"
                                     ["-h" "--help" "帮助" :default false :flag true]
                                     ["--dp-fresh-shop-url" "点评最新商户URL地址"]
                                     ["--archive-dir" "卡惠爬取点评的数据目录"]
                                     ["--out-file" "结果的输出文件,可选"])
         {:keys [dp-fresh-shop-url archive-dir out-file]} options]
     (when (:help options)
       (usage-with-exit 0 banner nil))
     (when (some nil? [dp-fresh-shop-url archive-dir])
       (usage-with-exit 1 banner "The dp-fresh-shop-url and archive-dir must be set."))
     (let [dp-fresh-shop-doc (Jsoup/parse (URL. dp-fresh-shop-url) 10000)
           latest-shop (first (.select dp-fresh-shop-doc "div.shopname-list ul li span.shopname a"))
           ;; Stay nil-safe when the page has no matching link or the href does
           ;; not match, so the IllegalStateException below reports the problem.
           latest-shop-id-str (when latest-shop
                                (utils/re-match-group #"/shop/(\d+)$" (.attr latest-shop "href") 1))
           latest-shop-id (when-not (nil? latest-shop-id-str)
                            (Integer. latest-shop-id-str))
           fs (FileSystemUtils/getFileSystemByPath archive-dir)
           ;; Ids are taken from the names of regular files only; names that do
           ;; not match the pattern are ignored.
           archive-file-ids (for [status (.listStatus fs (Path. archive-dir))
                                  :when (.isFile status)
                                  :let [id (utils/re-match-group #".+-\d+-(\d+).*" (.getName (.getPath status)) 1)]
                                  :when (not (nil? id))]
                              (Integer. id))
           max-archive-id (when (seq archive-file-ids)
                            (reduce max archive-file-ids))]
       (when (some #(or (nil? %) (= 0 %)) [max-archive-id latest-shop-id])
         (throw (IllegalStateException. (str "Can't find the max-archive-id or latest-shop-id [" max-archive-id "," latest-shop-id "]"))))
       (when-not (nil? out-file)
         (with-open [w (io/writer out-file)]
           (.write w (str max-archive-id "\t" latest-shop-id))))
       [max-archive-id latest-shop-id]))))
