;;; 处理点评的商户数据,抽取poi
(ns com.kahui.spiders.dianping.mr.poi
  (:require [clojure.string :as str]
            [clojure.data.json :as json]
            [clojure.core.reducers :as r]
            [clojure.tools.logging :as logging]
            [parkour.conf :as conf]
            [parkour.fs :as fs]
            [parkour.mapreduce :as mr]
            [parkour.graph :as pg]
            [parkour.tool :as tool]
            [parkour.io.text :as text]
            [parkour.io.dseq :as dseq]
            [parkour.io.dux :as dux]
            [com.kahui.spiders.dianping.tools :as tools])
  (:import [org.apache.hadoop.io Text LongWritable NullWritable])
  (:gen-class))

(defn extract
  "Extract the POI record from one line of Dianping shop data.

  `input` is a JSON string (or nil). It is parsed with keyword keys; the
  shop id is read from `:shopID` under `:script-data`, and the location is
  whatever `tools/parse-dp-store-poi` returns for the parsed object.

  Returns the tuple `[nil json]`, where `json` is the JSON encoding of
  `{:shopID <shop id> :loc <location>}`. Returns nil when the input is nil
  or blank, cannot be parsed as JSON, or when either the shop id or the
  location is missing."
  [input]
  (when-not (str/blank? input)
    ;; `read-str` with `:key-fn keyword` is the replacement for the
    ;; deprecated `read-json`. A malformed line is logged and skipped so a
    ;; single bad record does not fail the whole task.
    (when-let [json-obj (try
                          (json/read-str input :key-fn keyword)
                          (catch Exception e
                            (logging/warn e "Skipping record: input is not valid JSON")
                            nil))]
      (let [shop-id (get-in json-obj [:script-data :shopID])
            loc     (tools/parse-dp-store-poi json-obj)]
        ;; Both fields are required; a partial record is dropped.
        (when-not (or (nil? shop-id) (nil? loc))
          [nil (json/write-str {:shopID shop-id :loc loc})])))))

(defn mapper
  "Map task function: applies `extract` to every element of `input` and
  drops the nil results, returning a reducible collection of the remaining
  tuples. The `::mr/source-as :vals` metadata requests that only the input
  values be supplied."
  {::mr/source-as :vals}
  [input]
  (r/remove nil? (r/map extract input)))


(defn mr
  "Configure and run the POI extraction job.

  Sets the JVM-reuse property on `conf`, feeds `lines` through `mapper`, and
  writes text output to the `store-poi` path under `workdir`. The job is
  executed under the name \"poi\"; the first element of the value returned
  by `pg/execute` is returned."
  [conf workdir lines]
  (let [out-path (fs/path workdir)
        dsink (text/dsink (fs/path out-path "store-poi"))]
    ;; The Hadoop property is `mapred.job.reuse.jvm.num.tasks` (plural); the
    ;; previous key lacked the trailing `s`, so the setting was ignored.
    ;; Per Hadoop's mapred-default documentation, -1 means no limit on the
    ;; number of tasks run per JVM.
    (conf/assoc! conf "mapred.job.reuse.jvm.num.tasks" -1)
    (-> (pg/input lines)
        (pg/map #'mapper)
        (pg/output dsink)
        (pg/execute conf "poi")
        first)))

(defn tool
  "Command-line entry for the job. The first element of `args` is the work
  directory; the remaining elements are input paths, which are combined into
  a text dseq and handed to `mr` together with `conf`."
  [conf & args]
  (let [workdir (first args)
        inpaths (next args)
        lines   (apply text/dseq inpaths)]
    (mr conf workdir lines)))

(defn -main
  "Program entry point: runs `tool` through `tool/run` with the command-line
  arguments and exits the process with the value it returns."
  [& args]
  (let [exit-code (tool/run tool args)]
    (System/exit exit-code)))
