;;----------------------------------------------------------------------------
;; Copyright 2011 Factual, Inc.
;; All Rights Reserved.
;;
;; This is UNPUBLISHED PROPRIETARY SOURCE CODE of Factual, Inc.
;; Factual, Inc. reserves all rights in the source code as
;; delivered. The source code and all contents of this file, may not be
;; used, copied, modified, distributed, sold, disclosed to third parties
;; or duplicated in any form, in whole or in part, for any purpose,
;; without the prior written permission of Factual, Inc.
;;----------------------------------------------------------------------------

(ns leafgrabber.extract.base
  (:use [leafgrabber.clean :only [drop-nodes]]
        [leafgrabber.register :only [register-extractor-group]])
  (:require [leafgrabber.xpath :as x]
            [leafgrabber.page :as page]
            [clojure.string :as s]
            [juiceful.utils :as u])
  (:import [org.w3c.dom Node]))


(page/def-get :img
  #(drop-nodes ".//img" (:dom @%)))

(defn get-dom [page]
  (get-img page) ; forces dropping img nodes from dom
  (:dom @page))


(def node-joiner "@@")
(def max-label-size 40)
(def max-value-size 150)

;; TODO: Are these names correct with the #s?
(def val-node-names #{"#a" "#span" "#p" "#div" "#td" "#dd"})


(defn re-pats
  "Make a regex pattern from all the args, converted to strings."
  [& args]
  (re-pattern (apply str args)))

(defn node-as-label
  [node]
  (second
   (re-find (re-pats "^([^:]{0," max-label-size "}[^: ])\\s*:$")
            (x/content node))))

(defn get-val-node
  [following-text label-node]
  (let [ancestors (reverse (x/xpath ".//ancestor::*" following-text))
        val-node (some #(if (x/after? % label-node)
                          (when (val-node-names (x/node-name %)) %)
                          :break)
                       ancestors)]
    (when-not (= :break val-node)
      val-node)))

(defn val-node-cand
  [label-node following-text]
  (let [ftnode-content (x/content following-text)]
    (when-not (or (re-find #":$" ftnode-content)
                  (> (count ftnode-content)
                     max-value-size))
      (if-let [val-node (get-val-node following-text label-node)]
        val-node
        following-text))))

;; TODO: see FieldGrabber::Cleanable decode_entities()
(defn decode-entities [s]
  s)

(defn replace-and-cleanup
  [s regex indicator]
  (-> s
      s/trim
      (s/replace regex
                 indicator)
      s/trim
      (s/replace (re-pats "(\\s*" indicator "\\s*)+")
                 indicator)
      s/trim
      (s/replace-first (re-pats "^" indicator "\\s*")
                       "")
      (s/replace-first (re-pats "\\s*" indicator "$")
                       "")
      decode-entities
      ))

(defn node-pretty-content
  [node]
  (if (x/text? node)
    (x/content node)
    ;; TODO: handle various tags better? eg 'a'?
    ;; http://www.yellowpages.com/wheeling-wv/mip/robert-l-joseph-md-inc-18276758
    (replace-and-cleanup (x/inner-html node)
                         #"<[^<>]*?>"
                         node-joiner)))

(defn pretty-content
  [node-or-seq]
  (cond
   (instance? Node node-or-seq)
   (node-pretty-content node-or-seq)

   (seq? node-or-seq)
   (let [vals (map pretty-content node-or-seq)]
     (if (> (count vals) 1)
       vals
       (first vals)))))

(defn value-as-string
  [val]
  (if (or (seq? val)
          (instance? Node val))
    (pretty-content val)
    (s/trim (str val))))

(defn label-ok?
  [label-str]
  (assert (string? label-str))
  (let [label-size (count label-str)]
    (and (<= label-size max-label-size)
         ;; label can be short in chinese: http://businessdirectory.esdlife.com/gourmet-restaurant/CH-TC/findCompInfo.php?pages=2&location=32&merName=
         (or (> label-size 2)
             (re-find #"(?i)[^a-z0-9\s]" label-str)))))

(defn value-ok?
  [value-str]
  (let [val-size (count value-str)]
    (and (<= val-size max-value-size)
         (> val-size 0))))

(defn register-value
  ([label val]
     (register-value label val nil))
  ([label val nodes-to-drop]
     (let [label-trimmed (s/trim (s/replace-first label #":$" ""))
           val-str (value-as-string val)
           key-val (when (and (label-ok? label-trimmed)
                              (value-ok? val-str))
                     ;; TODO: This will override previous values for this label, unlike old LG. Is this acceptable?
                     {label-trimmed val-str})]
       (drop-nodes nodes-to-drop)
       key-val)))

(defn pull-one-labeled
  [label-node following-text-node]
  (when-let [label (node-as-label label-node)]
    (when-let [val-node (val-node-cand label-node following-text-node)]
      (register-value label val-node
                      [label-node val-node]))))

(defn has-label?
  [node]
  (re-find (re-pattern (str "^[^:]{1," max-label-size "}:(?: |$)")
                       (x/content node))))

(defn pull-colon-led-after-label
  [label node label-node]
  (let [node-content (x/content node)]
    (if-let [m (re-find #"^: +([^ ].{0,150})$" node-content)]
      (register-value label (m 1)
                      [label-node, node])
      (when (= ":" node-content)
        (when-let [val-node (x/at "./following::text()[1]" node)]
          (when (and (not (has-label? val-node))
                     (< (count (x/content val-node))
                        50))
            (register-value label val-node
                            [label-node, node, val-node])))))))

(defn pull-colon-led-node
  [node]
  (when-let [label-node (x/at "./preceding::text()[1]" node)]
    (let [label (pretty-content label-node)
          c (count label)]
      (when (and (> c 0)
                 (<= c max-label-size))
        (pull-colon-led-after-label label node label-node)))))

(defn pull-self-labeled
  [node]
  (condp re-find (s/trim (x/content node))

    (re-pats "^([^:]{0," max-label-size "})\\s*(?:(?<=[[:alpha:]]):(?!/)|: )\\s*(.{0," max-value-size "})$")
    :>>
    (fn [m]
      (register-value (s/trim (m 1))
                      (s/trim (m 2))))

    #"^:"
    (pull-colon-led-node node)))

(defn pull-labeled
  "Return map of name/vals that are extracted from colon-delimited fields."
  [page]
  (let [dom (get-dom page)

        ;; don't need ./self::text() because we always start with a Document node, unlike the Ruby fieldgrabber
        ;; ./descendant-or-self:: not consistent order!!
;;;     all-text-nodes (search "./self::text()|.//text()" dom)

        all-text-nodes (x/xpath ".//text()" dom)

        pairs (partition 2 1 all-text-nodes)

        ;; labels can't contain '. ' -- a sentence-like structure
        ;; skip if does not have trailing or space-followed colon:
        name-vals
        (for [pair pairs
              :let [[label-node following-text-node] pair
                    lnode-content (x/content label-node)]
              :when (and (<= (count lnode-content)
                             (+ max-label-size max-value-size))
                         (u/has? lnode-content \:)
                         (not (re-find #"[.] .+:" lnode-content)))]
          (or (pull-one-labeled label-node following-text-node)
              (pull-self-labeled label-node)))]
    ;; nils in name-vals will be skipped
    (conj name-vals
          (pull-self-labeled (last all-text-nodes)))))


(defn pull-og [page]
  (let [dom (get-dom page)
        ;; http://www.yelp.com/biz/the-blue-room-cambridge
        ;; http://akron.canton.diningguide.com/data/d101655.htm
        og-nodes (x/xpath ".//head/meta[contains(@property,'og:') or contains(@property,'fb:')]"
                          dom)]
    (map #(register-value (x/attr "property" %)
                          (x/attr "content" %))
         og-nodes)))


(defn pull-by-class [node sel sub-sels]
  ;; TODO: pick up value-title title idiom, as in country-name in http://www.cylex.at/verkehrssicherheit.html
  (if-let [subnode (x/at-css (str \. sel)
                             node)]
    (for [sub-sel sub-sels]
      (register-value (str "hcard:" sel \: sub-sel)
                      (x/at-css (str \. sub-sel)
                                subnode))))  )

(defn pull-addr-aux [node]
  (pull-by-class node "adr" ["post-office-box" "extended-address" "street-address" "locality" "region" "postal-code" "country-name"]))

(defn pull-geo-aux [node]
  (pull-by-class node "geo" ["latitude" "longitude"]))

(def vcard-selectors
  ["fn" "n" "org" "url" "email" "nickname" "category" "organization-name" "organization-unit" "role" "title"])

(defn get-type [node]
  (if-let [type-node (x/at-css ".type" node)]
    (or (x/attr "title" type-node)
        (x/content type-node))))

(defn pull-vcard-aux [vcard-node]
  (concat

   (for [sel vcard-selectors]
     (register-value (str "hcard:" sel)
                     (x/at-css (str \. sel)
                               vcard-node)))

   (map (fn [tel-node]
          (let [label (if (= "fax" (get-type tel-node))
                        "fax" "tel")]
            (register-value (str "hcard:" label)
                            (or (x/at-css ".value" tel-node)
                                tel-node))))
        (x/css ".tel" vcard-node))

   (if-let [url-node (x/at-css ".url" vcard-node)]
     [(register-value "hcard:url_href"
                      (x/at "./@href" url-node))])

   (pull-addr-aux vcard-node)
   (pull-geo-aux vcard-node)))

(defn pull-vcard [page]
  (let [dom (get-dom page)]
    (if-let [vcard-node (x/at-css ".vcard" dom)]
      (pull-vcard-aux vcard-node))))


(defn pull-img-coords [page]
  (let [img-nodes (get-img page)
        img-sources (remove nil?
                            (map #(x/attr "src" %)
                                 img-nodes))
        map-srcs (filter #(re-find #"http://maps?[.]\w+[.]com" %)
                          img-sources)]
    (if (= 1 (count map-srcs))
      ;; eg http://11870.com/pro/60629
      (if-let [m (re-find #"\bmarkers=([^;&,]+),([^;&,]+)"
                          (first map-srcs))]
        (list
         (register-value "img:map:latitude"
                         (m 1))
         (register-value "img:map:longitude"
                         (m 2)))))))

(defn match-json-latlng [javascripts]
  (let [lat-m (re-find #"(?i)\blat(?:itude)?[\"']?\s*:\s*[\"']?\s*([0-9.-]+)"
                       javascripts)
        lng-m (re-find #"(?i)\blo?ng(?:itude)?[\"']?\s*:\s*[\"']?\s*([0-9.-]+)"
                       javascripts)]
    (when (and lat-m lng-m)
      [(lat-m 1) (lng-m 1)])))

(defn pull-latlngs-aux [page]
  (let [script-nodes (:script @page)
        javascripts (s/join "\n"
                            (map x/content script-nodes))
        latlngs (map #(do [(% 1) (% 2)])
                     (re-seq #"LatLng\(\s*([0-9.-]+),\s*([0-9.-]+)\s*\)"
                             javascripts))
        latlng (match-json-latlng javascripts)]
    (if latlng
      (cons latlng latlngs)
      latlngs)))

(defn pull-javascript-coords [page]
  ;; eg
  ;; http://www.75vn.com/companydirectory/auto/yeu12914.html
  ;; http://map.evesta.jp/ceremony/e816750.html
  (let [latlngs (distinct
                 (pull-latlngs-aux page))]
    (if (= 1 (count latlngs))
      (let [[lat lng] (first latlngs)]
        (list
         (register-value "javascript:latitude"
                         lat)
         (register-value "javascript:longitude"
                         lng))))))

(defn pull-coords [page]
  (concat
   (pull-img-coords page)
   (pull-javascript-coords page)
   ))


(register-extractor-group
 "base"
 [pull-og
  pull-vcard
  pull-coords

  ;; pull-labeled
  ])
