;;----------------------------------------------------------------------------
;; Copyright 2011 Factual, Inc.
;; All Rights Reserved.
;;
;; This is UNPUBLISHED PROPRIETARY SOURCE CODE of Factual, Inc.
;; Factual, Inc. reserves all rights in the source code as
;; delivered. The source code and all contents of this file, may not be
;; used, copied, modified, distributed, sold, disclosed to third parties
;; or duplicated in any form, in whole or in part, for any purpose,
;; without the prior written permission of Factual, Inc.
;;----------------------------------------------------------------------------

(ns leafgrabber.clean
  (:use [clojure.java.io :only [resource]]
        [clojure.tools.logging :only [warn debug]])
  (:require [leafgrabber.xpath :as x]
            [clojure.string :as s])
  (:import [java.net URL]
           [com.factual.jagave.web UrlUtils]))


(defn drop-nodes
  "Unlink nodes from their document and return them.

   One-argument form: unlinks every node in the given seq.
   Two-argument form: evaluates the xpath xp against dom and unlinks
   every node it selects.

   Removing nodes can leave consecutive text nodes behind."
  ([nodes]
     (doseq [n nodes]
       (x/unlink n))
     nodes)
  ([xp dom]
     (drop-nodes (x/xpath xp dom))))

(defn drop-ignored-nodes
  "Unlink every comment, style, script, link, object and wbr node found
   under dom. Returns the nodes that were selected for removal."
  [dom]
  (let [ignored-xpath ".//comment()|.//style|.//script|.//link|.//object|.//wbr"]
    (drop-nodes ignored-xpath dom)))

(defn cleanup-text-nodes
  "Normalize dom, then trim the content of every text node whose content
   differs from its whitespace-normalized form. Only leading and trailing
   whitespace is trimmed. Mutates dom in place and returns nil."
  [dom]
  ;; Normalize before querying; presumably this merges adjacent text nodes
  ;; left behind by earlier node removal -- confirm against the dom type.
  (.normalize dom)
  (doseq [text-node (x/xpath ".//text()[normalize-space(.)!=.]" dom)]
    (let [trimmed (s/trim (x/content text-node))]
      (x/setContent text-node trimmed))))

(def pipeline-maps
  "Map from domain name to collection of pattern maps for the domain. Each pattern map
   has zero or more of these key/values:

   :pattern (a regex that matches the \"path\" + \"query\" part of the URL)
   :exclude-xpath (zero or more xpaths separated by | for nodes to remove from the DOM)
   :page-class (either \"leaf\" or \"directory\")

   Temporarily these pipeline maps are loaded from a file in the resources directory.
   Later they will be fetched from the pipelines API and will be cached per domain.

   If the resource is missing or cannot be read, a warning that includes the
   cause is logged and the value falls back to an empty map."
  (let [maps-file "pipelines-patterns.txt"]
    (try
      (read-string
       (slurp (resource maps-file)))
      (catch Exception e
        ;; Include the exception in the message so a missing resource can be
        ;; told apart from a malformed one; previously the cause was dropped.
        (warn (str "Invalid " maps-file " resource: " e))
        {}))))

(defn get-pipeline-map
  "Return the first pipeline map registered for the top private domain of
   url that either has no :pattern or whose :pattern is found in the
   path-plus-query part of the URL or in the whole URL.

   Returns nil when the domain has no pipeline maps. When the domain has
   maps but none of them match, logs a warning and returns nil."
  [url]
  (let [parsed (URL. url)
        host (UrlUtils/getTopPrivateDomain url)
        query (.getQuery parsed)
        path (.getPath parsed)
        ;; path followed by ?query when a query string is present
        relative-url (if query
                       (str path \? query)
                       (str path))
        candidates (pipeline-maps host)
        matches? (fn [candidate]
                   (let [p (:pattern candidate)]
                     (or (not p)
                         (re-find p relative-url)
                         (re-find p url))))]
    (when candidates
      (if-let [hit (some #(when (matches? %) %) candidates)]
        hit
        (warn (str "All pipelines patterns failed to match for URL " url))))))

(defn apply-pipeline-map
  "Look up the pipeline map for the page's :url. When that map carries an
   :exclude-xpath, unlink the nodes it selects from the page's :dom, or log
   a warning if it selects nothing. The pipeline map (possibly nil) is then
   stored under :pipeline in the page, so that e.g. its :page-class value
   is available to the page classifier.

   page is an atom holding the page map; returns the result of the swap!."
  [page]
  (let [url (:url @page)
        pipeline-map (get-pipeline-map url)]
    (when-let [xp (:exclude-xpath pipeline-map)]
      (let [excluded (x/xpath xp (:dom @page))]
        (if (seq excluded)
          (drop-nodes excluded)
          (warn (str "Failed to find nodes for exclude-xpath " xp " for URL " url)))))
    (swap! page assoc :pipeline pipeline-map)))

(defn clean-page
  "Clean the DOM held by page (an atom holding the page map):

   1. apply the pipeline map for the page,
   2. capture the script and link nodes,
   3. drop the ignored nodes and clean up the text nodes,
   4. store the captured nodes under :script and :links.

   Returns page itself."
  [page]
  (apply-pipeline-map page)
  (let [dom (:dom @page)
        ;; these must be captured before drop-ignored-nodes removes them
        scripts (x/xpath ".//script" dom)
        links (x/xpath ".//link" dom)]
    (doto dom
      (drop-ignored-nodes)
      (cleanup-text-nodes))
    (swap! page assoc :script scripts :links links)
    page))
