(ns tendril.core
  "Scrape a set of URLs as politely as possible, giving full flexibility over
  what to do with the scraped results. Effectively, the scraper's only job is to
  fetch the content from the provided URLs without spamming, and hand off the
  results to a provided function."
  (:require [cemerick.url :as cemurl]

            [clojure.core.async :as async]
            [clojure.string :as st]

            [hub.http.client :as client]
            [hub.cache :as cache]

            [tidy.core :as tidy]

            [com.stuartsierra.component :as component :refer [Lifecycle]]
            [org.bovinegenius.exploding-fish :as uri]
            [taoensso.timbre :as log]))

;;; Declarations

;; Forward declaration: the Scraper record's `start` calls this function, which
;; is defined in the Private section further down the file.
(declare start-queue-workers!)

;; Passed as the second argument to `client/client` when the scraper starts.
;; NOTE(review): presumably the minimum delay in milliseconds between requests
;; -- confirm against hub.http.client.
(def ^:private default-politeness-ms 2500)
;; Number of queue workers spawned when `scraper` is called without an explicit
;; `n-workers` argument.
(def ^:private default-n-workers 2)

;; Lifecycle component owning the scraping pipeline.
;;
;; Fields:
;;   response-ch - channel onto which the queue workers put each fetched
;;                 response (with :url assoc'd on).
;;   n-workers   - number of queue workers to spawn on start.
;;
;; Keys added by `start` (and removed again by `stop`):
;;   :enqueue-ch  - buffered channel that `scrape!` puts [scraper url] pairs on.
;;   :worker-ch   - unbuffered channel the queue workers consume from.
;;   :web-cache   - started cache component handed to the HTTP client.
;;   :http-client - started HTTP client component used by the workers.
;;   :started?    - true while the component is running.
;;
;; Both `start` and `stop` are idempotent: calling them on a component that is
;; already in the requested state returns it unchanged.
(defrecord Scraper [response-ch n-workers]
  Lifecycle
  (start [this]
    (if (:started? this)
      this
      ;; Start the sub-components *before* spawning any go-blocks, so that a
      ;; failure while starting the HTTP client does not leave a feeder pipe and
      ;; workers parked on channels nobody holds a reference to.
      (let [web-cache (component/start (cache/cache))
            http-client (component/start
                         (client/client web-cache default-politeness-ms))
            ;; 2^16 slots of buffering for pending URLs.
            enqueue-ch (async/chan (long (Math/pow 2 16)))
            worker-ch (async/chan)]
        ;; Items on enqueue-ch are [scraper url] pairs, so `(comp uri/host second)`
        ;; yields the URL's host for each item.
        ;; NOTE(review): presumably feeder-pipe uses the host to spread requests
        ;; across hosts -- confirm against tidy.core.
        (tidy/feeder-pipe enqueue-ch worker-ch (comp uri/host second))
        (start-queue-workers! n-workers worker-ch)
        (assoc this
               :enqueue-ch enqueue-ch
               :worker-ch worker-ch
               :web-cache web-cache
               :http-client http-client
               :started? true))))
  (stop [this]
    (if-not (:started? this)
      this
      (do (async/close! (:enqueue-ch this))
          (async/close! (:worker-ch this))
          ;; Tear down in reverse dependency order: the HTTP client was built on
          ;; top of the cache, so it must be stopped before the cache is.
          (component/stop (:http-client this))
          (component/stop (:web-cache this))
          (dissoc this
                  :started? :enqueue-ch :worker-ch
                  :http-client :web-cache)))))

;;; Public

(defn scraper
  "Builds an unstarted Scraper component.

  `response-ch` is the channel the queue workers put fetched responses on.
  `n-workers` is the number of queue workers to spawn on start; when omitted
  it falls back to `default-n-workers`."
  ([response-ch]
   (scraper response-ch default-n-workers))
  ([response-ch n-workers]
   (map->Scraper {:response-ch response-ch
                  :n-workers n-workers})))

(defn scrape!
  "Queues `url` to be fetched by the started scraper `this`.

  Asserts that the scraper has been started and that `url` is non-empty, then
  asynchronously puts the pair `[this url]` on the scraper's enqueue channel.
  Returns whatever `async/put!` returns."
  [this url]
  (assert (and (:started? this) (not-empty url)))
  (let [{:keys [enqueue-ch]} this]
    (async/put! enqueue-ch [this url])))

;;; Private

(defn- start-queue-workers!
  "Spawns `n` go-block workers that consume `[scraper url]` pairs from `ch`.

  For each pair a worker fetches `url` through the scraper's :http-client and,
  when a response is produced, puts it (with :url assoc'd on) onto the
  scraper's :response-ch. A worker exits once `ch` is closed and drained.

  An exception raised while handling one pair is logged and the worker moves on
  to the next pair, so a single bad URL cannot permanently take a worker out of
  the pool. Returns nil."
  [n ch]
  (dotimes [_ n]
    (async/go
      (loop []
        (when-let [job (async/<! ch)]
          ;; The try is scoped to a single job; `recur` sits outside it because
          ;; Clojure does not allow recur across a try boundary.
          (try
            (let [[this url] job]
              ;; A nil response (fetch channel closed without a value) is skipped.
              (when-let [response (async/<! (client/get (:http-client this) url))]
                (async/>! (:response-ch this) (assoc response :url url))))
            (catch Exception e
              (log/error e "Error occurred in scraper queue worker.")))
          (recur)))
      (log/debug "Scraper queue worker exited."))))
