;;; 解析大众点评页面需要的一些工具

(ns com.kahui.spiders.dianping.tools
  (:import [javax.script ScriptEngineManager]
           [org.apache.commons.lang StringEscapeUtils]
           [com.cdg.kahui.craft.etl.poi Dianping2POIFunc WGS84ToGCJ02Func DianpingMapBar2GCJ02Func]
           [org.elasticsearch.index.query QueryBuilders]
           [com.cdg.kahui.craft.service.basic.es ESClientFactory])
  (:require [clojure.data.json :as json]
            [clojure.string :as string]
            [clojure.tools.logging :as logging]
            [com.kahui.spiders.dianping.commons :as commons])
  (:use [clojure.java.io :as io]))

;; Shared script engine looked up under the name "js". The lookup yields
;; null (nil) when no engine is registered under that name, so callers
;; should not assume the value is always usable.
(def engine (.getEngineByName (ScriptEngineManager.) "js"))
(defn- parse-js-without-exception
  "Evaluates the given JavaScript expression text with the shared engine,
  assigning its value to a variable named ret inside freshly created
  bindings. Every element of ret that is a java.util.Map is round-tripped
  through JSON (write-str then read-str) and the resulting maps are merged
  into a single map, which is returned.

  Any Exception raised during evaluation or conversion is logged together
  with commons/*src* and the original js text; in that case the value of
  the logging call is returned (presumably nil - confirm)."
  [js]
  (let [scope (.createBindings engine)
        statement (str "ret=" js)]
    (try
      (.eval engine statement scope)
      ;; reduce is eager, so the whole conversion runs inside the try.
      (->> (.get scope "ret")
           (filter (fn [item] (instance? java.util.Map item)))
           (map (fn [item] (json/read-str (json/write-str item))))
           (reduce merge {}))
      (catch Exception e
        (logging/error (str e) "src:" commons/*src* " str:" js)))))

(defn- to-int
  "Converts s to an integer by taking its string form, parsing that as a
  double and truncating to an int. Returns nil when the string form is
  empty (which includes s being nil). Text that is not a valid number
  makes the parse throw NumberFormatException."
  [s]
  (let [text (str s)]
    (when (seq text)
      (.intValue (Double/valueOf text)))))

(defn- get-val-or-nil
  "Invokes m as a function on k and returns the result, or returns nil
  without invoking anything when m itself is nil or false."
  [m k]
  (when m
    (m k)))

(defn- extract-script-data
  "Builds the shop summary map from the data parsed out of the scripts.

  dp-data and dp-data-require are string-keyed maps. Returns nil when both
  are empty; otherwise returns a map with keyword keys where the numeric
  fields are converted with to-int, :poi is the first non-nil/non-false
  value among config.poi (from dp-data-require) and mapData.poi (from
  dp-data), and :mapType is config.mapType from dp-data-require."
  [dp-data dp-data-require]
  (when (or (seq dp-data) (seq dp-data-require))
    (let [int-field (fn [field-name] (to-int (dp-data field-name)))
          poi-candidates [(get-in dp-data-require ["config" "poi"])
                          (get-in dp-data ["mapData" "poi"])]]
      {:cityCnName (dp-data "cityCnName")
       :cityEnName (dp-data "cityEnName")
       :shopGroupID (int-field "shopGroupID")
       :shopGroupName (dp-data "shopGroupName")
       :categoryID (int-field "categoryID")
       :cityID (int-field "cityID")
       :regionID (int-field "regionID")
       :shopType (int-field "shopType")
       :shopID (int-field "shopID")
       :poi (some identity poi-candidates)
       :mapType (get-in dp-data-require ["config" "mapType"])})))

(defn- extract-by-regex
  "Searches the strings in t with regex and returns the first capture
  group of the first match, taken from the earliest string that yields a
  truthy value. Returns nil when nothing matches. The regex is expected
  to contain at least one capture group."
  [regex t]
  (->> t
       (map (fn [text] (re-seq regex text)))
       (some (comp second first))))

(defn- refind-value
  "Returns pre-value unchanged when it is not nil. Otherwise searches
  script-tag-coll with regex (via extract-by-regex) and, when something is
  found, returns value-convert applied to it; returns nil when nothing is
  found."
  [pre-value regex script-tag-coll value-convert]
  (if-not (nil? pre-value)
    pre-value
    (when-let [found (extract-by-regex regex script-tag-coll)]
      (value-convert found))))

(defn- clean-js-express
  "Normalises a captured JavaScript expression. Returns nil for nil input
  and for text that, once trimmed, does not start with an opening brace.
  Otherwise returns the trimmed text with a stray trailing
  close-paren-then-close-brace tail removed, if one is present."
  [js]
  (when js
    (let [trimmed (.trim js)]
      (when (.startsWith trimmed "{")
        ;; The pattern targets a ")" followed by optional line break /
        ;; whitespace and a final "}" at the very end of the text.
        (let [tail-matcher (re-matcher #"\)\n?\r?\s*\}$" trimmed)]
          (if (.find tail-matcher)
            (.replaceAll tail-matcher "")
            trimmed))))))

(defn- parse-js-seq
  "Cleans every expression in js-seq with clean-js-express, drops the ones
  that come back nil, evaluates each survivor wrapped in square brackets
  with parse-js-without-exception, and merges all resulting maps into one
  map (an empty map when there is nothing to merge)."
  [js-seq]
  (->> js-seq
       (keep clean-js-express)
       (map (fn [expr] (parse-js-without-exception (str "[" expr "]"))))
       (reduce merge {})))

(defn- extract-by-script
  "Uses the JavaScript engine to parse the arguments of the DP.data(...)
  and DP.require(...) calls found in the script text. Returns a map with
  :data (merged result of the DP.data arguments) and :require-config
  (merged result of the DP.require arguments)."
  [script]
  (let [captured-args (fn [pattern] (map second (re-seq pattern script)))
        ;; NOTE(review): the dot after DP is unescaped in both patterns, so
        ;; it matches any character, not only a literal dot - confirm intent.
        data-json (parse-js-seq (captured-args #"(?s)DP.data\((.+?)\);"))
        require-json (parse-js-seq (captured-args #"(?s)DP.require\((.+?)\);"))]
    {:data data-json :require-config require-json}))

(defn- unesacpe-unicode
  "Decodes Java-style escape sequences in s, but only when s contains a
  unicode escape marker (a backslash followed by u or U). Strings without
  such a marker are returned unchanged. Returns nil when s is nil."
  [^String s]
  (when s
    ;; Both index checks compare against 0. The previous code called >=
    ;; with a single argument for the lowercase marker, which is always
    ;; true, so every string was unescaped regardless of its content.
    (if (or (>= (.indexOf s "\\U") 0)
            (>= (.indexOf s "\\u") 0))
      (StringEscapeUtils/unescapeJava s)
      s)))



(defn parse-script
  "Parses the data carried in a collection of script-tag texts.

  Each script is run through extract-by-script; the :data and
  :require-config results are merged across scripts and turned into a
  summary map by extract-script-data (an empty map when that yields nil).
  For a fixed set of keys, a value that is still nil is then looked up
  again directly in the raw script text with a regular expression.
  Returns the summary map with those keys set (possibly to nil)."
  [script-tag-coll]
  (let [parsed (map extract-by-script script-tag-coll)
        dp-data (reduce merge {} (map :data parsed))
        dp-require (reduce merge {} (map :require-config parsed))
        base (or (extract-script-data dp-data dp-require) {})
        ;; Guard against script evaluation failures: fall back to regular
        ;; expressions to fetch the required values once more.
        fallbacks [[:cityID #"[^\w]*cityID:\s*\"(\d+)\"," to-int]
                   [:shopID #"[^\w]*shopID:\s*(\d+)," to-int]
                   [:shopType #"[^\w]*shopType:\s*\"(\d+)\"," to-int]
                   [:cityCnName #"[^\w]*cityCnName:\s*\"(.+)\"," identity]
                   [:cityEnName #"[^\w]*cityEnName:\s*\"(.+)\"," identity]
                   [:poi #"[^\w]*poi:\s*\"(.+)\"," identity]
                   [:shopGroupName #"[^\w]*shopGroupName:\s*\"(.+)\"," unesacpe-unicode]
                   [:shopGroupID #"[^\w]*shopGroupID:\s*(\d+)," to-int]]]
    (reduce (fn [acc [field pattern convert]]
              (assoc acc field
                     (refind-value (get base field) pattern script-tag-coll convert)))
            base
            fallbacks)))

(defn calc-store-confidence
  "Computes the confidence score of a Dianping store as the sum of four
  parts, truncated to an int. Returns 0 when store is nil or false.

  1. Flags: 100 points when :card or :team-buying is non-nil and
     non-empty. (The original rule text also mentions a coupon flag; only
     these two keys are examined here.)
  2. Star grade (:grade, matched against the exact grade labels):
     one star 10, two stars 20, three stars 30, near-four stars 38,
     four stars 40, near-five stars 48, five stars 50, anything else 0.
  3. Branches: the count is read from :branch-office with a pattern;
     10 or more branches give 30, otherwise count * 3; 0 when the
     pattern does not match.
  4. Comments plus photos (:comment-count + :photo-count, nil counted
     as 0): total <= 0 gives 0, <= 20 gives 2, <= 40 gives 4,
     >= 2000 gives 20, otherwise 5 + (total - 40) / 130."
  [store]
  (if-not store
    0
    (let [present? (fn [flag] (and (not (nil? flag)) (pos? (count flag))))
          as-count (fn [v] (if (nil? v) 0 (Integer/parseInt (str v))))
          ;; card / group-buying flags
          flag-score (if (some present? [(:card store) (:team-buying store)]) 100 0)
          ;; star grade
          grade-score (get {"一星商户" 10
                            "二星商户" 20
                            "三星商户" 30
                            "准四星商户" 38
                            "四星商户" 40
                            "准五星商户" 48
                            "五星商户" 50}
                           (:grade store)
                           0)
          ;; number of branches, capped at 30 points
          branch-text (:branch-office store)
          branch-score (if-let [found (re-seq #"其它(\d+)家" (if (nil? branch-text) "" branch-text))]
                         (min 30 (* 3 (Integer/parseInt (second (first found)))))
                         0)
          ;; comments + photos
          activity (+ (as-count (:comment-count store))
                      (as-count (:photo-count store)))
          activity-score (cond
                           (<= activity 0) 0
                           (<= activity 20) 2
                           (<= activity 40) 4
                           (>= activity 2000) 20
                           :else (+ 5 (/ (- activity 40) 130)))]
      (int (+ flag-score grade-score branch-score activity-score)))))


;; Lookup table from a map-type key (a string) to the converter object used
;; for that map type. parse-dp-store-poi looks entries up by the string form
;; of the shop's mapType and uses the "unknown" entry when mapType is nil.
;; Each entry holds its own converter instance.
;; NOTE(review): the meaning of the numeric map-type codes is not visible
;; here - confirm against the data source.
(def poifuncs {"1" (Dianping2POIFunc.)
               "2" (DianpingMapBar2GCJ02Func.)
               "4" (Dianping2POIFunc.)
               "7" (Dianping2POIFunc.)
               "unknown" (DianpingMapBar2GCJ02Func.)})

(defn parse-dp-store-poi
  "Resolves the coordinates of a Dianping store.

  store-json is the parsed store record (the original note says it comes
  from json/read-json); its :script-data entry supplies :poi and :mapType.
  The converter is chosen from poifuncs by the string form of :mapType,
  or by the unknown entry when :mapType is nil. Returns a map with :lat
  and :lon (described by the original author as doubles), or nil when
  :script-data is missing, :poi is nil or empty, no converter is
  registered for the map type, or the converter returns nil."
  [store-json]
  (when-let [script-data (:script-data store-json)]
    (let [poi-text (:poi script-data)
          raw-type (:mapType script-data)
          type-key (if (nil? raw-type) "unknown" (str raw-type))]
      ;; type-key can never be nil at this point, so only the poi text
      ;; needs the nil / empty check.
      (when (and (not (nil? poi-text))
                 (pos? (count poi-text)))
        (when-let [converter (poifuncs type-key)]
          (when-let [point (.apply converter poi-text)]
            {:lat (.getLat point) :lon (.getLng point)}))))))

(defn find-dianping-brand-city
  "Looks up in ES the city name and city ID for the given brand id.

  Runs a term query on script-data.shopGroupID against the
  dianping-nobrand index (type store), waiting up to 10000 (presumably
  milliseconds - confirm) for the response. When there is at least one
  hit, returns a map built from the first hit containing :cityCnName
  and/or :cityID for whichever of those fields is present (nil when
  neither is); returns nil when there are no hits."
  [es-client brand-id]
  (let [brand-query (QueryBuilders/termQuery "script-data.shopGroupID" brand-id)
        search (doto
                 (.prepareSearch es-client (into-array String ["dianping-nobrand"]))
                 (.setTypes (into-array String ["store"]))
                 (.setQuery brand-query)
                 (.addFields (into-array String ["script-data.cityCnName" "script-data.cityID"])))
        response (.actionGet (.execute search) 10000)
        hits (.getHits response)]
    (when (pos? (.getTotalHits hits))
      (let [fields (.getFields (first hits))]
        (reduce merge
                (for [[field-name result-key] [["script-data.cityCnName" :cityCnName]
                                               ["script-data.cityID" :cityID]]]
                  (when-let [field (.get fields field-name)]
                    {result-key (.getValue field)})))))))

(defn find-dinaping-brand-city-from-csv
  "Reads a csv file and writes a csv enriched with the city name and
  city ID.

  The first input line is treated as the header and is written out with
  two extra column names appended. For every following line, the third
  comma-separated column is used as the brand id for
  find-dianping-brand-city, and the line is written out followed by the
  city name and city ID that lookup returned (empty when it returned
  none). The reader, the writer and the ES client are all closed when
  the function finishes."
  [input-csv output-csv]
  (with-open [csv-reader (io/reader input-csv)
              csv-writer (io/writer output-csv)
              es-client (ESClientFactory/createTransportClient "es-kahui0" (into-array String ["12.0.0.200:9300"]))]
    (let [lines (line-seq csv-reader)
          header (first lines)]
      (.write csv-writer (str header ",DP_CITY_CN_NAME,DP_CITY_ID\n"))
      (doseq [row (rest lines)]
        (let [brand-id (nth (.split row ",") 2 nil)
              city (find-dianping-brand-city es-client brand-id)]
          (.write csv-writer
                  (str row "," (:cityCnName city) "," (:cityID city) "\n")))))))

