(ns tendril.core
  "Scrape a set of URLs as politely as possible, giving full flexibility over
  what to do with the scraped results. Effectively, the scraper's only job is to
  fetch the content from the provided URLs without spamming, and hand off the
  results to a provided function.

  Uses clojure.core.async as the underlying concurrency mechanism."
  (:require [cemerick.url :as cemurl]

            [clojure.core.async :as async]
            [clojure.string :as st]

            [hub.http.client :as client]
            [hub.cache :as cache]

            [tidy.core :as tidy]

            [com.stuartsierra.component :as component :refer [Lifecycle]]
            [org.bovinegenius.exploding-fish :as uri]
            [taoensso.timbre :as log]))

;;; Declarations

;; Forward declaration: the worker bootstrap is defined near the end of the
;; file but is invoked from the Scraper record's `start`.
(declare start-queue-workers!)

(def ^:private default-politeness-ms
  "Default for the scraper's 'politeness' option, in milliseconds."
  2500)

(def ^:private default-n-workers
  "Default for the scraper's 'workers' option."
  2)

(def ^:private default-buf-size
  "Default for the scraper's 'buf-size' option: 2^16 items."
  65536)

(def ^:private default-dropping-buffer?
  "Default for the scraper's 'dropping-buffer?' option."
  false)

(def ^:private default-web-cache-size
  "Default for the scraper's 'web-cache-size' option."
  64)

(def ^:private default-force-enqueue-size
  "NOTE(review): not referenced anywhere in the visible part of this file;
  confirm whether it is still needed."
  1024)

;; Component-managed scraper.
;;
;; Fields (all supplied at construction time):
;;   response-ch       channel that workers put fetched responses onto
;;   n-workers         number of worker go-blocks started by `start`
;;   politeness        passed through to the http client
;;   buf-size          size of the enqueue channel's buffer (also handed to
;;                     the feeder pipe)
;;   dropping-buffer?  when truthy the enqueue channel uses a dropping buffer
;;   web-cache-size    size handed to the web cache component
;;
;; `start` and `stop` are idempotent: they return `this` untouched when the
;; record is already in the requested state (tracked via :started?).
(defrecord Scraper [response-ch n-workers politeness buf-size dropping-buffer? web-cache-size]
  Lifecycle
  (start [this]
    (if (:started? this)
      this
      (do
        ;; Use the logger (as `stop` does) rather than printing to stdout.
        (log/debug "starting tendril")
        (let [enqueue-ch (if dropping-buffer?
                           (async/chan (async/dropping-buffer buf-size))
                           (async/chan buf-size))
              worker-ch (async/chan)
              web-cache (component/start (cache/cache web-cache-size))
              http-client (component/start (client/client web-cache politeness))]
          ;; Queue items are [scraper url] pairs, so `second` picks the url
          ;; and `uri/host` extracts the key the feeder pipe is given.
          (tidy/feeder-pipe enqueue-ch worker-ch (comp uri/host second) buf-size)
          (start-queue-workers! n-workers worker-ch)
          (log/debug "started tendril")
          (assoc this
                 :stats (atom {:enqueued 0})
                 :enqueue-ch enqueue-ch
                 :worker-ch worker-ch
                 :web-cache web-cache
                 :http-client http-client
                 :started? true)))))
  (stop [this]
    (if-not (:started? this)
      this
      (do (log/debug "stopping tendril")
          ;; Close the queues first so no new work reaches the workers.
          (async/close! (:enqueue-ch this))
          (async/close! (:worker-ch this))
          ;; Tear down in reverse start order: the http client was built on
          ;; top of the web cache, so it must be stopped before the cache.
          (component/stop (:http-client this))
          (component/stop (:web-cache this))
          (log/debug "stopped tendril")
          (dissoc this
                  :started? :enqueue-ch :worker-ch
                  :http-client :web-cache :stats)))))

;;; Public

(defn scraper
  "Build a Scraper record (not yet started) that delivers fetched responses
  onto 'response-ch'.

  Optional keyword arguments:

  - 'workers': how many concurrent go-blocks dequeue items and fetch their
  content. Must be a positive number.

  - 'politeness': number of ms to wait between fetches to the same host. Must
  be a number.

  - 'buf-size': how many items the underlying core.async enqueue channel may
  hold.

  - 'dropping-buffer?': when true the enqueue channel uses a dropping-buffer,
  so items arriving once 'buf-size' items are queued are dropped.

  - 'web-cache-size': maximum number of responses cached in memory by the
  cache backing the http-client component.

  Throws AssertionError when 'workers' or 'politeness' fail the checks above."
  [response-ch & {:keys [workers politeness buf-size dropping-buffer? web-cache-size]
                  :or {workers default-n-workers
                       politeness default-politeness-ms
                       buf-size default-buf-size
                       dropping-buffer? default-dropping-buffer?
                       web-cache-size default-web-cache-size}}]
  (assert (and (number? workers) (pos? workers) (number? politeness)))
  (map->Scraper {:response-ch response-ch
                 :n-workers workers
                 :politeness politeness
                 :buf-size buf-size
                 :dropping-buffer? dropping-buffer?
                 :web-cache-size web-cache-size}))

(defn scrape!
  "Queue 'url' on the started scraper 'this' by putting a [this url] pair onto
  its enqueue channel, then bump the :enqueued stat if the put was accepted.

  With a dropping buffer the counter is capped at the scraper's buf-size.

  Asserts that the scraper is started and that 'url' is non-empty. Returns the
  channel of the go-block performing the put."
  [this url]
  (assert (and (:started? this) (not-empty url)))
  (let [bump (if (:dropping-buffer? this)
               ;; Cap the counter at the buffer capacity.
               (fn [c] (min (:buf-size this) (inc c)))
               inc)]
    (async/go
      (when (async/>! (:enqueue-ch this) [this url])
        (swap! (:stats this) update-in [:enqueued] bump)))))

(defn drain!
  "Discard items waiting on the scraper's enqueue channel, decrementing the
  :enqueued stat once per discarded item.

  - With no 'n', or 'n' equal to 0, keeps discarding until no item arrives
  within 25 ms or the channel is closed.

  - With a positive 'n', discards at most 'n' items.

  Asserts that the scraper is started. Returns the channel of the go-block
  doing the draining; it closes when draining finishes."
  ([this] (drain! this 0))
  ([this n]
   (assert (:started? this))
   (let [enqueue-ch (:enqueue-ch this)]
     (async/go
       (loop [c 0]
         ;; Check the limit BEFORE taking: taking first would pull one item
         ;; past the limit off the queue and lose it without updating stats.
         (when (or (zero? n) (< c n))
           (let [[v ch] (async/alts! [(async/timeout 25) enqueue-ch])]
             ;; v is nil when the timeout fired or the channel was closed.
             (when (and v (= ch enqueue-ch))
               (swap! (:stats this) update-in [:enqueued] dec)
               (recur (inc c))))))))))

(defn stats
  "Return the current value held in the scraper's :stats atom.

  Asserts that the scraper is started."
  [this]
  (assert (:started? this))
  (-> this :stats deref))

;;; Private

(defn- start-queue-workers!
  "Start 'n' go-block workers that consume [scraper url] pairs from 'ch'.

  For each pair a worker decrements the scraper's :enqueued stat, fetches the
  url through the scraper's http client and, when a response comes back, puts
  it (with :url added) onto the scraper's response channel.

  A worker exits when 'ch' yields nil or when an Exception is caught; the
  exception is logged and that worker is not restarted."
  [n ch]
  (dotimes [_ n]
    (async/go
      (try
        (loop [job (async/<! ch)]
          (when job
            (let [[scraper url] job]
              (swap! (:stats scraper) update-in [:enqueued] dec)
              (when-let [response (async/<! (client/get (:http-client scraper) url))]
                (async/>! (:response-ch scraper) (assoc response :url url)))
              (recur (async/<! ch)))))
        (catch Exception e
          (log/error e "Error occurred in scraper queue worker.")))
      (log/debug "Scraper queue worker exited."))))
