(ns ^:no-doc onyx.peer.task-lifecycle
  (:require [clojure.core.async :refer [alts!! <!! >!! <! >! poll! timeout chan close! thread go]]
            [com.stuartsierra.component :as component]
            [onyx.schema :as os]
            [onyx.static.planning :as planning :refer [find-task]]
            [onyx.static.uuid :as uuid]
            [onyx.extensions :as extensions]
            [onyx.checkpoint :as checkpoint]
            [onyx.compression.nippy :refer [checkpoint-compress checkpoint-decompress]]
            [onyx.flow-conditions.fc-routing :as r]
            [onyx.lifecycles.lifecycle-compile :as lc]
            [onyx.log.commands.common :as common]
            [onyx.log.entry :as entry]
            [onyx.log.replica]
            [onyx.messaging.common :as mc]
            [onyx.messaging.messenger-state :as ms]
            [onyx.messaging.protocols.messenger :as m]
            [onyx.messaging.protocols.publisher :as pub]
            [onyx.messaging.protocols.endpoint-status :as endpoint-status]
            [onyx.messaging.protocols.subscriber :as sub]
            [onyx.messaging.protocols.status-publisher :as status-pub]
            [onyx.monitoring.measurements :refer [emit-latency emit-latency-value]]
            [onyx.monitoring.metrics-monitoring :as metrics-monitoring]
            [onyx.peer.constants :refer [initialize-epoch]]
            [onyx.peer.task-compile :as c]
            [onyx.peer.coordinator :as coordinator :refer [new-peer-coordinator]]
            [onyx.peer.read-batch :as read-batch]
            [onyx.peer.operation :as operation]
            [onyx.peer.resume-point :as res]
            [onyx.peer.status :refer [merge-statuses]]
            ;[onyx.peer.visualization :as viz]
            [onyx.peer.window-state :as ws]
            [onyx.peer.transform :as transform :refer [apply-fn]]
            [onyx.protocol.task-state :as t
             :refer [advance advanced? exec get-context get-event
                     get-input-pipeline get-lifecycle evict-peer!
                     get-messenger get-output-pipeline get-replica
                     get-windows-state goto-next-batch! goto-next-iteration!
                     goto-recover! heartbeat! killed? next-epoch!
                     next-replica! new-iteration? log-state reset-event!
                     sealed? set-context! set-windows-state! set-sealed!
                     set-replica! set-coordinator!  set-messenger! set-epoch!
                     update-event!]]
            [onyx.plugin.messaging-output :as mo]
            [onyx.plugin.protocols.input :as oi]
            [onyx.plugin.protocols.output :as oo]
            [onyx.plugin.protocols.plugin :as op]
            [onyx.windowing.window-compile :as wc]
            [onyx.static.default-vals :refer [arg-or-default]]
            [onyx.static.logging :as logger]
            [onyx.state.state-extensions :as state-extensions]
            [onyx.static.util :refer [ms->ns deserializable-exception]]
            [onyx.types :refer [->Results ->MonitorEvent ->MonitorEventLatency]]
            [schema.core :as s]
            [taoensso.timbre :refer [debug info error warn trace fatal]])
  (:import [org.agrona.concurrent IdleStrategy SleepingIdleStrategy BackoffIdleStrategy]
           [java.util.concurrent TimeUnit]
           [java.util.concurrent.atomic AtomicLong]
           [java.util.concurrent.locks LockSupport]))

(s/defn start-lifecycle?
  "Calls `start-fn` with `event` and returns whatever it returned. When the
  returned value has no truthy `:start-lifecycle?` entry, an info message is
  logged first."
  [event start-fn]
  (let [result (start-fn event)]
    (if (:start-lifecycle? result)
      result
      (do
        (info (:onyx.core/log-prefix event)
              "Peer chose not to start the task yet. Backing off and retrying...")
        result))))

(defn input-task?
  "True when the task map held under `:onyx.core/task-map` in `event` has
  `:onyx/type` equal to `:input`."
  [event]
  (-> event :onyx.core/task-map :onyx/type (= :input)))

(defn output-task?
  "True when the task map held under `:onyx.core/task-map` in `event` has
  `:onyx/type` equal to `:output`."
  [event]
  (= :output (get-in event [:onyx.core/task-map :onyx/type])))

(defn function-task?
  "True when the task map held under `:onyx.core/task-map` in `event` has
  `:onyx/type` equal to `:function`."
  [event]
  (let [{task-type :onyx/type} (:onyx.core/task-map event)]
    (= :function task-type)))

; (s/defn flow-retry-segments :- Event
;   [{:keys [onyx.core/task-state onyx.core/state onyx.core/messenger 
;            onyx.core/monitoring onyx.core/results] :as event} 
;   (doseq [root (:retries results)]
;     (when-let [site (peer-site task-state (:completion-id root))]
;       (emit-latency :peer-retry-segment
;                     monitoring
;                     #(extensions/internal-retry-segment messenger (:id root) site))))
;   event)

;; Starts a fresh iteration: clears the context, resets the event, stamps the
;; event with a newly generated `:onyx.core/lifecycle-id`, then advances.
;; NOTE(review): the post-condition looks `:event` up on the returned state by
;; keyword; if the state object does not support keyword lookup this check is
;; always satisfied -- confirm against the state implementation.
(s/defn next-iteration
  [state]
  {:post [(empty? (:onyx.core/batch (:event %)))]}
  (let [cleared (reset-event! (set-context! state nil))
        stamp-lifecycle-id (fn [event]
                             (assoc event :onyx.core/lifecycle-id (uuid/random-uuid)))]
    (advance (update-event! cleared stamp-lifecycle-id))))

(defn prepare-batch
  "Calls `oo/prepare-batch` on the output pipeline with the current event,
  replica and messenger. Advances the state when that call returns a truthy
  value; otherwise returns the state unchanged so the step is retried."
  [state]
  (let [prepared? (oo/prepare-batch (get-output-pipeline state)
                                    (get-event state)
                                    (get-replica state)
                                    (get-messenger state))]
    (cond-> state
      prepared? advance)))

(defn write-batch
  "Calls `oo/write-batch` on the output pipeline with the current event,
  replica and messenger. Advances the state when that call returns a truthy
  value; otherwise returns the state unchanged so the step is retried."
  [state]
  (let [written? (oo/write-batch (get-output-pipeline state)
                                 (get-event state)
                                 (get-replica state)
                                 (get-messenger state))]
    (cond-> state
      written? advance)))

(defn handle-exception
  "Reacts to an exception `e` thrown inside lifecycle `lifecycle`.

  When `exception-action` is `:restart`, logs a warning and puts
  `[:restart-vpeer id]` on `group-ch`. For any other action, logs a warning,
  writes the exception to `log` as an `:exception` chunk for `job-id`, and
  puts a `:kill-job` log entry on `outbox-ch`."
  [task-info log e lifecycle exception-action group-ch outbox-ch id job-id]
  (let [;; Default to original exception if Onyx didn't wrap the original exception
        inner (or (.getCause ^Throwable e) e)]
    (if (= exception-action :restart)
      (let [msg (format "Caught exception inside task lifecycle %s. Rebooting the task." lifecycle)]
        (warn (logger/merge-error-keys inner task-info id msg))
        (>!! group-ch [:restart-vpeer id]))
      (let [msg (format "Handling uncaught exception thrown inside task lifecycle %s. Killing the job." lifecycle)
            kill-entry (entry/create-log-entry :kill-job {:job job-id})]
        ;; The kill path logs the outer exception but persists the inner one.
        (warn (logger/merge-error-keys e task-info id msg))
        (extensions/write-chunk log :exception (deserializable-exception inner {}) job-id)
        (>!! outbox-ch kill-entry)))))

(defn merged-statuses
  "Builds this peer's own status map, appends the values of every endpoint
  status reported by the messenger's publishers, and reduces the collection
  with `merge-statuses`."
  [state]
  (let [own-status {:ready? true
                    :replica-version (t/replica-version state)
                    :checkpointing? (not (checkpoint/complete? (:onyx.core/storage (get-event state))))
                    :epoch (t/epoch state)
                    :heartbeat (System/nanoTime)
                    :min-epoch (t/epoch state)}
        endpoint-status-vals (comp (mapcat (comp endpoint-status/statuses pub/endpoint-status))
                                   (map val))]
    (merge-statuses
     (into [own-status] endpoint-status-vals (m/publishers (get-messenger state))))))

(defn input-poll-barriers
  "Polls the state's messenger via `m/poll`, then advances the state."
  [state]
  (let [messenger (get-messenger state)]
    (m/poll messenger)
    (advance state)))

(defn coordinator-peer-id->peer-id
  "Returns the second element when `peer-id` is a vector; any other value is
  returned as is."
  [peer-id]
  (if (vector? peer-id)
    (second peer-id)
    peer-id))

(defn check-upstream-heartbeats
  "Evicts (via `evict-peer!`) every peer in the subscriber's status publishers
  whose heartbeat plus `liveness-timeout-ns` is earlier than the current
  `System/nanoTime`, skipping publishers that are blocked. Advances the
  resulting state."
  [state liveness-timeout-ns]
  (let [now (System/nanoTime)
        ;; A blocked publisher is not at fault for missing heartbeats, so it
        ;; is never treated as timed out.
        timed-out? (fn [[_ spub]]
                     (and (not (status-pub/blocked? spub))
                          (> now (+ (status-pub/get-heartbeat spub)
                                    liveness-timeout-ns))))
        status-pubs (sub/status-pubs (m/subscriber (get-messenger state)))
        stale-peer-ids (map key (filter timed-out? status-pubs))]
    (advance (reduce evict-peer! state stale-peer-ids))))

(defn offer-heartbeats
  "Runs `heartbeat!` on the state and advances the result."
  [state]
  (-> state
      heartbeat!
      advance))

(defn checkpoint-input
  "Takes a checkpoint from the input pipeline, compresses it, records its byte
  length in the `:checkpoint-size` monitoring counter, writes it to storage
  under the `:input` checkpoint type for the current replica version and
  epoch, logs the write, and advances the state."
  [state]
  (let [{:keys [onyx.core/storage onyx.core/monitoring onyx.core/tenancy-id
                onyx.core/job-id onyx.core/task-id onyx.core/slot-id]} (get-event state)
        snapshot (oi/checkpoint (get-input-pipeline state))
        payload (checkpoint-compress snapshot)
        replica-version (t/replica-version state)
        epoch (t/epoch state)]
    (.set ^AtomicLong (:checkpoint-size monitoring) (alength payload))
    (checkpoint/write-checkpoint storage tenancy-id job-id replica-version
                                 epoch task-id slot-id :input payload)
    (info "Checkpointed input" job-id replica-version epoch task-id slot-id :input)
    (advance state)))

(defn checkpoint-state
  "Exports every window state into a map keyed by window id, compresses it,
  records its byte length in the `:checkpoint-size` monitoring counter, writes
  it to storage under the `:windows` checkpoint type for the current replica
  version and epoch, logs the write, and advances the state."
  [state]
  (let [{:keys [onyx.core/storage onyx.core/monitoring onyx.core/tenancy-id
                onyx.core/job-id onyx.core/task-id onyx.core/slot-id]} (get-event state)
        window-id->export (into {}
                                (map (juxt ws/window-id ws/export-state))
                                (get-windows-state state))
        payload (checkpoint-compress window-id->export)
        replica-version (t/replica-version state)
        epoch (t/epoch state)]
    (.set ^AtomicLong (:checkpoint-size monitoring) (alength payload))
    (checkpoint/write-checkpoint storage tenancy-id job-id replica-version
                                 epoch task-id slot-id :windows payload)
    (info "Checkpointed state" job-id replica-version epoch task-id slot-id :windows)
    (advance state)))

(defn checkpoint-output
  "Takes a checkpoint from the output pipeline, compresses it, records its
  byte length in the `:checkpoint-size` monitoring counter, writes it to
  storage under the `:output` checkpoint type for the current replica version
  and epoch, logs the write, and advances the state."
  [state]
  (let [{:keys [onyx.core/storage onyx.core/monitoring onyx.core/tenancy-id
                onyx.core/job-id onyx.core/task-id onyx.core/slot-id]} (get-event state)
        snapshot (oo/checkpoint (get-output-pipeline state))
        payload (checkpoint-compress snapshot)
        replica-version (t/replica-version state)
        epoch (t/epoch state)]
    (.set ^AtomicLong (:checkpoint-size monitoring) (alength payload))
    (checkpoint/write-checkpoint storage tenancy-id job-id replica-version
                                 epoch task-id slot-id :output payload)
    (info "Checkpointed output" job-id replica-version epoch task-id slot-id :output)
    (advance state)))

(defn completed?
  "Returns what `sub/completed?` reports for the subscriber of the state's
  messenger."
  [state]
  (-> state
      get-messenger
      m/subscriber
      sub/completed?))

(defn try-seal-job!
  "Marks the state as sealed when the subscriber reports completion and the
  state is not sealed yet. When the event carries triggers,
  `ws/assign-windows` is run with the `:job-completed` event type before the
  state is sealed. In every other case the state is returned unchanged."
  [state]
  (if (and (completed? state)
           (not (sealed? state)))
    (let [{:keys [onyx.core/triggers]} (get-event state)]
      (-> (cond-> state
            ;; Triggers only need notifying when there are any.
            (seq triggers) (ws/assign-windows :job-completed))
          (set-sealed! true)))
    state))

(defn synced?
  "Asks the input pipeline (for `:input` tasks) or the output pipeline (for
  `:output` tasks) whether it is synced for the current epoch. Every other
  task type is always considered synced."
  [state]
  (case (:onyx/type (:onyx.core/task-map (get-event state)))
    :input (oi/synced? (get-input-pipeline state) (t/epoch state))
    :output (oo/synced? (get-output-pipeline state) (t/epoch state))
    true))

(defn input-function-seal-barriers?
  "Decides what to do after polling, for input and function tasks.

  - Subscriber not blocked: jump to the next batch via `goto-next-batch!`.
  - Subscriber blocked but the task is not synced: return the state unchanged
    so this step is retried.
  - Subscriber blocked and synced: move to the next epoch, try to seal the
    job, store the barrier options, source peers and publishers in the
    context, and advance."
  [state]
  (let [messenger (get-messenger state)
        subscriber (m/subscriber messenger)]
    (cond
      (not (sub/blocked? subscriber))
      (goto-next-batch! state)

      ;; we need to wait until we're synced
      (not (synced? state))
      state

      :else
      (let [sealed-state (try-seal-job! (next-epoch! state))
            ;; Built only after the epoch bump and seal attempt above.
            barrier-context {:barrier-opts {:completed? (completed? state)}
                             :src-peers (sub/src-peers subscriber)
                             :publishers (m/publishers messenger)}]
        (advance (set-context! sealed-state barrier-context))))))

(defn output-seal-barriers?
  "Decides what to do after polling, for output tasks.

  - Subscriber not blocked: jump to the next batch via `goto-next-batch!`.
  - Subscriber blocked but the task is not synced: return the state unchanged
    so this step is retried.
  - Subscriber blocked and synced: move to the next epoch, try to seal the
    job, store the source peers in the context, and advance. No publishers or
    barrier options are stored, because output tasks send no barriers
    downstream."
  [state]
  (let [subscriber (m/subscriber (get-messenger state))]
    (cond
      (not (sub/blocked? subscriber))
      (goto-next-batch! state)

      (not (synced? state))
      state

      :else
      (let [sealed-state (try-seal-job! (next-epoch! state))
            status-context {:src-peers (sub/src-peers subscriber)}]
        (advance (set-context! sealed-state status-context))))))

(defn offer-barriers
  "Offers a barrier, using the context's `:barrier-opts`, to the publishers
  listed under `:publishers` in the state's context. Publishers whose offer
  returned a positive value are dropped; when none remain the state advances,
  otherwise the remaining publishers are stored back in the context so the
  step can be retried. Asserts that the publisher collection is not empty."
  [state]
  (let [messenger (get-messenger state)
        {:keys [barrier-opts publishers] :as context} (get-context state)
        _ (assert (not (empty? publishers)))
        ;; Pair each publisher with the result of its offer, drop the pairs
        ;; whose result is positive, and keep just the publisher.
        offer-xf (comp (map (fn [pub]
                              [(m/offer-barrier messenger pub barrier-opts)
                               pub]))
                       (remove (comp pos? first))
                       (map second))
        ;; NOTE(review): `sequence` realises its result incrementally, so the
        ;; offers (side effects) for later publishers may only happen when the
        ;; stored sequence is consumed on a following call -- confirm intended.
        remaining-pubs (sequence offer-xf publishers)]
    (if (empty? remaining-pubs)
      (advance state)
      (set-context! state (assoc context :publishers remaining-pubs)))))

(defn barrier-status-opts
  "Builds the option map sent with a barrier status: `:checkpointing?` and
  `:min-epoch` come from `merged-statuses`; `:drained?` is true for non-input
  tasks and otherwise is whatever the input pipeline reports via
  `oi/completed?`."
  [state]
  (let [{:keys [checkpointing? min-epoch]} (merged-statuses state)
        drained? (or (not (input-task? (get-event state)))
                     (oi/completed? (get-input-pipeline state)))]
    {:checkpointing? checkpointing?
     :min-epoch min-epoch
     :drained? drained?}))

(defn offer-barrier-status
  "Offers this peer's barrier status (the `barrier-status-opts` map plus
  `:event :next-barrier`) to the peers listed under `:src-peers` in the
  state's context. Peers whose offer returned a positive value are dropped;
  when none remain the state advances, otherwise the remaining peers are
  stored back in the context so the step can be retried. Asserts that the
  source peer collection is not empty, using the replica as the assertion
  message."
  [state]
  (let [messenger (get-messenger state)
        {:keys [src-peers] :as context} (get-context state)
        _ (assert (not (empty? src-peers)) (get-replica state))
        opts (assoc (barrier-status-opts state) :event :next-barrier)
        ;; Pair each source peer with the result of its offer, drop the pairs
        ;; whose result is positive, and keep just the peer id.
        offer-xf (comp (map (fn [src-peer-id]
                              [(sub/offer-barrier-status! (m/subscriber messenger) src-peer-id opts)
                               src-peer-id]))
                       (remove (comp pos? first))
                       (map second))
        ;; NOTE(review): `sequence` realises its result incrementally, so the
        ;; offers (side effects) for later peers may only happen when the
        ;; stored sequence is consumed on a following call -- confirm intended.
        remaining-peers (sequence offer-xf src-peers)]
    (if (empty? remaining-peers)
      (advance state)
      (set-context! state (assoc context :src-peers remaining-peers)))))

(defn unblock-subscribers
  "Calls `sub/unblock!` on the messenger's subscriber, clears the context, and
  advances the state."
  [state]
  (let [subscriber (m/subscriber (get-messenger state))]
    (sub/unblock! subscriber)
    (-> state
        (set-context! nil)
        advance)))

;; Re-enable to prevent CPU burn?
; (defn backoff-when-drained! [event]
;   (Thread/sleep (arg-or-default :onyx.peer/drained-back-off (:peer-opts event))))

(defn assign-windows
  "Runs `ws/assign-windows` on the state with the `:new-segment` event type
  and advances the result."
  [state]
  (-> state
      (ws/assign-windows :new-segment)
      advance))

(defn build-lifecycle-invoke-fn
  "Compiles the lifecycle functions registered for `lifecycle-kw` on `event`.
  When compilation yields a function, returns a state transition that applies
  it to the event and advances; otherwise returns nil."
  [event lifecycle-kw]
  (when-let [compiled (lc/compile-lifecycle-functions event lifecycle-kw)]
    (fn [state]
      (-> state
          (update-event! compiled)
          advance))))

(defn recover-input
  "Recovers the input pipeline. Unless the context is already flagged
  `:recovered?`, the stored checkpoint is looked up for the context's
  `:recover-coordinates`, logged, and handed to `oi/recover!`. Once the
  pipeline reports synced for the current epoch, the context is cleared and
  the state advances; until then the context is flagged `:recovered? true`."
  [state]
  (let [context (get-context state)
        pipeline (get-input-pipeline state)]
    (when-not (:recovered? context)
      (let [event (get-event state)
            stored (res/recover-input event (:recover-coordinates context))]
        (info (:onyx.core/log-prefix event) "Recover pipeline checkpoint:" stored)
        (oi/recover! pipeline (t/replica-version state) stored)))
    (if-not (oi/synced? pipeline (t/epoch state))
      ;; ensure we don't try to recover input again before synced
      (set-context! state (assoc context :recovered? true))
      (advance (set-context! state nil)))))

(defn recover-state
  "Recovers window state. Looks up the windows for the context's
  `:recover-coordinates` via `res/recover-windows`, installs them as the
  state's windows state, runs `ws/assign-windows` with the `:recovered` event
  type, and advances."
  [state]
  (let [event (get-event state)
        {:keys [recover-coordinates]} (get-context state)
        recovered-windows (res/recover-windows event recover-coordinates)]
    (-> state
        (set-windows-state! recovered-windows)
        ;; Notify triggers that we have recovered our windows
        (ws/assign-windows :recovered)
        (advance))))

(defn recover-output
  "Recovers the output pipeline. Unless the context is already flagged
  `:recovered?`, `oo/recover!` is called with a nil checkpoint (see the
  comment in the body). Once the pipeline reports synced for the current
  epoch, the context is cleared and the state advances; until then the
  context is flagged `:recovered? true`."
  [state]
  (let [context (get-context state)
        output-pipeline (get-output-pipeline state)]
    (when-not (:recovered? context)
      (let [event (get-event state)
            ;; output recovery is not currently used by any output plugin.
            ;; checkpoints must currently exist for all state slots.
            stored nil] ; (res/recover-output event recover-coordinates)
        (info (:onyx.core/log-prefix event) "Recover output pipeline checkpoint:" stored)
        (oo/recover! output-pipeline (t/replica-version state) stored)))
    (if-not (oo/synced? output-pipeline (t/epoch state))
      ;; ensure we don't try to recover output again before synced
      (set-context! state (assoc context :recovered? true))
      (advance (set-context! state nil)))))

(defn poll-recover-input-function
  "Polls the subscriber. When it is both blocked and recovered, moves to the
  next epoch, stores the recovery coordinates, barrier options, source peers
  and publishers in the context, and advances. Otherwise the state is
  returned unchanged so polling is retried."
  [state]
  (let [messenger (get-messenger state)
        subscriber (m/subscriber messenger)]
    (sub/poll! subscriber)
    (if-not (and (sub/blocked? subscriber)
                 (sub/recovered? subscriber))
      state
      (let [next-state (next-epoch! state)
            ;; Built only after the epoch bump above.
            recovery-context {:recover-coordinates (sub/get-recover subscriber)
                              :recovered? false
                              :barrier-opts {:recover-coordinates (sub/get-recover subscriber)
                                             :completed? false}
                              :src-peers (sub/src-peers subscriber)
                              :publishers (m/publishers messenger)}]
        (advance (set-context! next-state recovery-context))))))

(defn poll-recover-output
  "Polls the subscriber. When it is both blocked and recovered, moves to the
  next epoch, stores the recovery coordinates and source peers in the
  context, and advances. Otherwise the state is returned unchanged so polling
  is retried."
  [state]
  (let [subscriber (m/subscriber (get-messenger state))]
    (sub/poll! subscriber)
    (if-not (and (sub/blocked? subscriber)
                 (sub/recovered? subscriber))
      state
      (let [next-state (next-epoch! state)
            ;; Built only after the epoch bump above.
            recovery-context {:recovered? false
                              :recover-coordinates (sub/get-recover subscriber)
                              :src-peers (sub/src-peers subscriber)}]
        (advance (set-context! next-state recovery-context))))))

;; Debug switch, off by default.
(def DEBUG false)

(defn iteration
  "Drives the state machine. Calls `exec` on the state once, then keeps
  calling `exec` for as long as the state reports `advanced?` and the
  remaining budget `n` is positive. The budget starts at `n-iters` and is
  decremented whenever `new-iteration?` is true for the state. Returns the
  state at which the loop stopped."
  [state n-iters]
  ;(when DEBUG (viz/update-monitoring! state-machine))
  (loop [state (exec state) n n-iters]
    ;(log-state state)
    ; (when (zero? (rand-int 10000)) 
    ;   (log-state state))
    (if (and (advanced? state) (pos? n))
      (recur (exec state) ;; we could unroll exec loop a bit
             ;; Evaluated after the `exec` call on the line above.
             (if (new-iteration? state)
               (dec n)
               n))
      state)))

;; Iteration budget that `run-task-lifecycle!` passes to `iteration` as
;; `n-iters` between successive reads of the replica atom.
(def task-iterations 1)

(defn run-task-lifecycle!
  "The main task run loop.

  While the replica read from `:onyx.core/replica-atom` equals the previously
  seen replica and the state is not killed, runs `iteration` with
  `task-iterations` and re-reads the atom. When the replica changed (or the
  state is killed), `next-replica!` is applied; if the resulting state is
  killed it is returned, otherwise the loop continues with it.

  Any Throwable is caught: the action is `:kill` when the exception's ex-data
  has a truthy `:kill-job?`, otherwise it is whatever `exception-action-fn`
  returns for the event, lifecycle and exception. `handle-exception-fn` is
  then called with the lifecycle, the action and the exception, and the
  `state` originally passed to this function is returned."
  [state handle-exception-fn exception-action-fn]
  (try
    (let [{:keys [onyx.core/replica-atom] :as event} (get-event state)]
      (loop [state state
             prev-replica-val (get-replica state)
             replica-val @replica-atom]
        (debug (:onyx.core/log-prefix event) "new task iteration")
        (if (and (= replica-val prev-replica-val)
                 (not (killed? state)))
          ;; Replica unchanged: keep processing, then look at the atom again.
          (recur (iteration state task-iterations) replica-val @replica-atom)
          (let [next-state (next-replica! state replica-val)]
            (if (killed? next-state)
              (do
                (info (:onyx.core/log-prefix event) "Fell out of task lifecycle loop")
                next-state)
              ;; Passing the same replica twice makes the next pass take the
              ;; processing branch above.
              (recur next-state replica-val replica-val))))))
    (catch Throwable e
      ;; `state` here is the function argument, not the loop binding.
      (let [lifecycle (get-lifecycle state)
            action (if (:kill-job? (ex-data e))
                     :kill
                     (exception-action-fn (get-event state) lifecycle e))]
        (handle-exception-fn lifecycle action e))
      state)))

(defn instantiate-plugin
  "Builds the plugin named by `:onyx/plugin` in the event's task map.

  For `:onyx/language :java` the plugin name is passed to
  `operation/instantiate-plugin-instance`. Otherwise the keyword is resolved
  as a namespaced var and called with `event`; nil is returned when the
  keyword has no namespace or the var cannot be resolved."
  [{:keys [onyx.core/task-map] :as event}]
  (let [plugin-kw (:onyx/plugin task-map)]
    (if (= :java (:onyx/language task-map))
      (operation/instantiate-plugin-instance (name plugin-kw) event)
      (let [ns-part (namespace plugin-kw)
            fn-part (name plugin-kw)]
        (when (and ns-part fn-part)
          (when-let [builder (ns-resolve (symbol ns-part) (symbol fn-part))]
            (builder event)))))))

;; Component holding the job and task definitions read from the log.
;; `start` reads each chunk for `job-id` (and `task-id` where needed) and
;; stores it on the record; `stop` sets the same keys back to nil.
(defrecord TaskInformation
           [log job-id task-id workflow catalog task flow-conditions windows triggers lifecycles metadata]
  component/Lifecycle
  (start [this]
    ;; Arguments are evaluated in order, so the chunks are read in the order
    ;; listed here before any of them is stored.
    (assoc this
           :catalog (extensions/read-chunk log :catalog job-id)
           :task (extensions/read-chunk log :task job-id task-id)
           :flow-conditions (extensions/read-chunk log :flow-conditions job-id)
           :windows (extensions/read-chunk log :windows job-id)
           :triggers (extensions/read-chunk log :triggers job-id)
           :workflow (extensions/read-chunk log :workflow job-id)
           :lifecycles (extensions/read-chunk log :lifecycles job-id)
           :metadata (extensions/read-chunk log :job-metadata job-id)
           :resume-point (extensions/read-chunk log :resume-point job-id task-id)))
  (stop [this]
    (reduce (fn [record k] (assoc record k nil))
            this
            [:workflow :catalog :task :flow-conditions :windows
             :triggers :lifecycles :metadata :resume-point])))

(defn new-task-information
  "Creates a `TaskInformation` record from the `:log`, `:job-id`, `:task-id`
  and `:id` entries of `peer` merged with `task` (entries in `task` win)."
  [peer task]
  (-> (merge peer task)
      (select-keys [:log :job-id :task-id :id])
      map->TaskInformation))

(defn compile-apply-fn
  "Returns a state transition that runs the event's `:onyx.core/fn` through
  `transform/apply-fn`, using the batch applier when the task map sets
  `:onyx/batch-fn?` and the single-segment applier otherwise."
  [event]
  (let [user-fn (:onyx.core/fn event)
        batch-fn? (:onyx/batch-fn? (:onyx.core/task-map event))
        applier (if batch-fn?
                  transform/apply-fn-batch
                  transform/apply-fn-single)]
    (fn [state]
      (transform/apply-fn applier user-fn state))))

;; Table of task lifecycle stages, grouped into the `:recover` and
;; `:processing` phases. Each entry names the stage (`:lifecycle`), the task
;; types it applies to (`:type`), its phase, an optional `:blockable?` flag, a
;; human-readable `:doc`, and a `:builder` that takes the event map and
;; returns the state transition function for that stage. `build-task-fns`
;; selects the entries matching a task and invokes their builders.
(def lifecycles
  {:recover [{:lifecycle :lifecycle/poll-recover
              :builder (fn [event]
                         (if (output-task? event)
                           poll-recover-output
                           poll-recover-input-function))
              :type #{:input :function :output}
              :doc "Poll the messenger for the first recovery barrier sent by the coordinator. Once it has received the first barrier, it advances to the next state."
              :blockable? true
              :phase :recover}
             {:lifecycle :lifecycle/offer-barriers
              :doc "Offers the next barrier to downstream tasks. Once it succeeds in offering the barrier to all downstream tasks, it advances to the next state."
              :phase :recover
              :type #{:input :function}
              :blockable? true
              :builder (fn [_] offer-barriers)}
             {:lifecycle :lifecycle/offer-barrier-status
              :type #{:input :function :output}
              :doc "Offers the peer's current status up to upstream peers. Once it succeeds in offering the status to all upstream tasks, it advances to the next state."
              :phase :recover
              :blockable? true
              :builder (fn [_] offer-barrier-status)}
             {:lifecycle :lifecycle/recover-input
              :doc "Reads the checkpoint from durable storage and then supplies the checkpoint to the input plugin recover! method. Advance to the next state."
              :phase :recover
              :type #{:input}
              :blockable? false
              :builder (fn [_] recover-input)}
             {:lifecycle :lifecycle/recover-state
              :doc "Reads the checkpoint from durable storage and then supplies the checkpoint to recover the window and trigger states. Advance to the next state."
              :phase :recover
              :blockable? false
              :type #{:windowed}
              :builder (fn [_] recover-state)}
             {:lifecycle :lifecycle/recover-output
              :type #{:output}
              :phase :recover
              :doc "Reads the checkpoint from durable storage and then supplies the checkpoint to the output plugin recover! method. Advance to the next state."
              :blockable? false
              :builder (fn [_] recover-output)}
             {:lifecycle :lifecycle/unblock-subscribers
              :type #{:input :function :output}
              :phase :recover
              :doc "Unblock the messenger subscriptions, allowing messages to be read by the task. Advance to the next state."
              :blockable? false
              :builder (fn [_] unblock-subscribers)}]
   :processing [{:lifecycle :lifecycle/next-iteration
                 :type #{:input :function :output}
                 :doc "Resets the event map to start a new iteration in the processing phase. Advance to the next state."
                 :phase :processing
                 :blockable? false
                 :builder (fn [_] next-iteration)}
                {:lifecycle :lifecycle/input-poll-barriers
                 :type #{:input}
                 :doc "Poll messenger subscriptions for new barriers. Advance to the next state."
                 :phase :processing
                 :blockable? false
                 :builder (fn [_] input-poll-barriers)}
                {:lifecycle :lifecycle/check-publisher-heartbeats
                 :doc "Check whether upstream has timed out directly after subscriber poll. Evict if timeout has been met. Advance to the next state."
                 :type #{:input}
                 :phase :processing
                 :blockable? false
                 :builder (fn [event]
                            (let [timeout (ms->ns (arg-or-default :onyx.peer/publisher-liveness-timeout-ms
                                                                  (:onyx.core/peer-opts event)))]
                              (fn [state] (check-upstream-heartbeats state timeout))))}
                {:lifecycle :lifecycle/seal-barriers?
                 :type #{:input :function}
                 :doc "Check whether barriers have been received from all upstream sources. If all barriers have been received, advance to checkpoint states, otherwise advance to :lifecycle/before-read-batch."
                 :phase :processing
                 :blockable? false
                 :builder (fn [_] input-function-seal-barriers?)}
                {:lifecycle :lifecycle/seal-barriers?
                 :builder (fn [_] output-seal-barriers?)
                 :type #{:output}
                 :doc "Check whether barriers have been received from all upstream sources. If all barriers have been received, advance to checkpoint states, otherwise advance to :lifecycle/before-read-batch."
                 :blockable? false
                 :phase :processing}
                {:lifecycle :lifecycle/checkpoint-input
                 :builder (fn [_] checkpoint-input)
                 :type #{:input}
                 :doc "Start checkpoint of input state. Advance to the next state."
                 :blockable? true
                 :phase :processing}
                {:lifecycle :lifecycle/checkpoint-state
                 :builder (fn [_] checkpoint-state)
                 :type #{:windowed}
                 :doc "Start checkpoint of window and trigger states. Advance to the next state."
                 :blockable? true
                 :phase :processing}
                {:lifecycle :lifecycle/checkpoint-output
                 :builder (fn [_] checkpoint-output)
                 :doc "Start checkpoint of output state. Advance to the next state."
                 :type #{:output}
                 :blockable? true
                 :phase :processing}
                {:lifecycle :lifecycle/offer-barriers
                 :builder (fn [_] offer-barriers)
                 :type #{:input :function}
                 :doc "Offers the next barrier to downstream tasks. Once it succeeds in offering the barrier to all downstream tasks, it advances to the next state."
                 :blockable? true
                 :phase :processing}
                {:lifecycle :lifecycle/offer-barrier-status
                 :type #{:input :function :output}
                 :builder (fn [_] offer-barrier-status)
                 :doc "Offers the peer's current status up to upstream peers. Once it succeeds in offering the status to all upstream tasks, it advances to the next state."
                 :blockable? true
                 :phase :processing}
                {:lifecycle :lifecycle/unblock-subscribers
                 :doc "Unblock the messenger subscriptions, allowing messages to be read by the task. Advance to the next state."
                 :phase :processing
                 :type #{:input :function :output}
                 :blockable? false
                 :builder (fn [_] unblock-subscribers)}
                {:lifecycle :lifecycle/before-batch
                 :type #{:input :function :output}
                 :doc "Call all `:lifecycle/before-batch` fns supplied via lifecycle calls maps. Advance to the next state."
                 :phase :processing
                 :blockable? false
                 :builder (fn [event] (build-lifecycle-invoke-fn event :lifecycle/before-batch))}
                {:lifecycle :lifecycle/read-batch
                 :type #{:input}
                 :phase :processing
                 :doc "Poll input source for messages, placing these messages in `:onyx.core/batch` in the event map. Advance to the next state."
                 :blockable? false
                 :builder (fn [_] read-batch/read-input-batch)}
                {:lifecycle :lifecycle/read-batch
                 :type #{:function :output}
                 :phase :processing
                 :blockable? false
                 :builder (fn [_] read-batch/read-function-batch)}
                {:lifecycle :lifecycle/check-publisher-heartbeats
                 :doc "Check whether upstream has timed out directly after subscriber poll. Evict if timeout has been met. Advance to the next state."
                 :type #{:function :output}
                 :phase :processing
                 :blockable? false
                 :builder (fn [event]
                            (let [timeout (ms->ns (arg-or-default :onyx.peer/publisher-liveness-timeout-ms
                                                                  (:onyx.core/peer-opts event)))]
                              (fn [state] (check-upstream-heartbeats state timeout))))}
                {:lifecycle :lifecycle/after-read-batch
                 :type #{:input :function :output}
                 :phase :processing
                 :doc "Call all `:lifecycle/after-read-batch` fns supplied via lifecycle calls maps. Advance to the next state."
                 :builder (fn [event] (build-lifecycle-invoke-fn event :lifecycle/after-read-batch))}
                {:lifecycle :lifecycle/apply-fn
                 :type #{:input :function :output}
                 :phase :processing
                 :doc "Call `:onyx/fn` supplied for this task on each segment in `:onyx.core/batch`, placing the results in `:onyx.core/results`. Advance to the next state."
                 :blockable? false
                 :builder compile-apply-fn}
                {:lifecycle :lifecycle/after-apply-fn
                 :type #{:input :function :output}
                 :phase :processing
                 :doc "Call all `:lifecycle/after-apply-fn` fns supplied via lifecycle calls maps. Advance to the next state."
                 :blockable? false
                 :builder (fn [event] (build-lifecycle-invoke-fn event :lifecycle/after-apply-fn))}
                {:lifecycle :lifecycle/assign-windows
                 :type #{:windowed}
                 :phase :processing
                 :doc "Update windowed aggregation states, and call any trigger functions. Advance to the next state."
                 :builder (fn [_] assign-windows)}
                {:lifecycle :lifecycle/prepare-batch
                 :type #{:input :function :output}
                 :phase :processing
                 :doc "Prepare batch for emission to downstream tasks or output mediums. The prepare-batch method is called on any plugins. prepare-batch is useful when output mediums may reject offers of segments, where write-batch may have to retry writes multiple times. Advance if the plugin prepare-batch method returns true, otherwise idle and retry prepare-batch."
                 :blockable? true
                 :builder (fn [_] prepare-batch)}
                {:lifecycle :lifecycle/write-batch
                 :type #{:input :function :output}
                 :doc "Write :onyx.core/results to output medium or message :onyx.core/results to downstream peers. write-batch will be called on any plugins. Advance to the next state if write-batch returns true, otherwise idle and retry write-batch."
                 :phase :processing
                 :blockable? true
                 :builder (fn [_] write-batch)}
                {:lifecycle :lifecycle/after-batch
                 :type #{:input :function :output}
                 :doc "Call all `:lifecycle/after-batch` fns supplied via lifecycle calls maps. Advance to the next state."
                 :phase :processing
                 :blockable? false
                 :builder (fn [event] (build-lifecycle-invoke-fn event :lifecycle/after-batch))}
                {:lifecycle :lifecycle/offer-heartbeats
                 :type #{:input :function :output}
                 :doc "Offer heartbeat messages to peers if it has been `:onyx.peer/heartbeat-ms` milliseconds since the previous heartbeats were sent. Set state to :lifecycle/next-iteration to perform the next task-lifecycle iteration."
                 :phase :processing
                 :blockable? false
                 :builder (fn [_] offer-heartbeats)}]})

(defn build-task-fns
  "Returns a vector of the lifecycle definitions that apply to this task, each
  with a `:fn` entry produced by calling the definition's `:builder` on `event`.

  The task's type set is `#{(:onyx/type task-map)}`, extended with `:windowed`
  when the event carries any windows or triggers. A definition applies when its
  `:type` set shares at least one member with that set. Definitions are taken
  from `(:recover lifecycles)` followed by `(:processing lifecycles)`."
  [{:keys [onyx.core/task-map onyx.core/windows onyx.core/triggers] :as event}]
  (let [base-types #{(:onyx/type task-map)}
        task-types (if (or (seq windows) (seq triggers))
                     (conj base-types :windowed)
                     base-types)
        applies? (fn [definition]
                   (not-empty (clojure.set/intersection task-types (:type definition))))
        with-fn (fn [definition]
                  (assoc definition :fn ((:builder definition) event)))]
    (into []
          (comp (filter applies?) (map with-fn))
          (concat (:recover lifecycles) (:processing lifecycles)))))

;; Test hook: tests use this to detect when a task stop is called.
(defn stop-flag!
  "Deliberate no-op; takes no arguments and returns nil."
  []
  nil)

(defn timed-out-subscribers
  "Returns a lazy seq of the peer ids, taken from the `pub/statuses` of every
  publisher, whose `:heartbeat` plus `timeout-ms` is earlier than the current
  `System/nanoTime`.

  NOTE(review): the comparison is against `System/nanoTime`, so `timeout-ms`
  presumably has to be expressed in nanoseconds despite its name; confirm
  against callers."
  [publishers timeout-ms]
  (let [now (System/nanoTime)]
    (for [publisher publishers
          entry (pub/statuses publisher)
          :let [[_ status] entry]
          :when (< (+ (:heartbeat status) timeout-ms) now)]
      (key entry))))

(defn all-heartbeat-times
  "Collects every heartbeat value known to `messenger`: the `:heartbeat` of each
  status held by its publishers, plus `status-pub/get-heartbeat` of each
  status-pub held by its subscriber.

  The subscriber-side values are conj'd onto the publisher-side seq, so they
  end up at the front of the result."
  [messenger]
  (let [publisher-heartbeats (for [publisher (m/publishers messenger)
                                   status (vals (pub/statuses publisher))]
                               (:heartbeat status))
        status-pubs (vals (sub/status-pubs (m/subscriber messenger)))
        subscriber-heartbeats (map status-pub/get-heartbeat status-pubs)]
    (into publisher-heartbeats subscriber-heartbeats)))

;; Mutable state machine that drives one task's lifecycle.
;;
;; `lifecycle-fns` holds the lifecycle functions and `lifecycle-names` their
;; keywords; `idx` is the position `exec` will invoke next. `recover-idx`,
;; `iteration-idx` and `batch-idx` are the positions the goto-* methods jump to,
;; and `nstates` is the position at which `advance` wraps around.
;;
;; Fields marked ^:unsynchronized-mutable are updated in place with set!; this
;; type adds no synchronization of its own. Every mutator returns `this` so that
;; calls can be threaded with `->`.
;;
;; NOTE: `new-state-machine` converts both liveness timeouts with ms->ns before
;; passing them in, so despite the -ms suffix they hold nanoseconds here.
(deftype TaskStateMachine [monitoring
                           subscriber-liveness-timeout-ms
                           publisher-liveness-timeout-ms
                           input-pipeline
                           output-pipeline
                           ^IdleStrategy idle-strategy
                           ^int recover-idx
                           ^int iteration-idx
                           ^int batch-idx
                           ^int nstates
                           #^"[Lclojure.lang.Keyword;" lifecycle-names
                           #^"[Lclojure.lang.IFn;" lifecycle-fns
                           ^:unsynchronized-mutable ^int idx
                           ^:unsynchronized-mutable ^java.lang.Boolean advanced
                           ^:unsynchronized-mutable sealed
                           ^:unsynchronized-mutable replica
                           ^:unsynchronized-mutable messenger
                           messenger-group
                           ^:unsynchronized-mutable coordinator
                           init-event
                           ^:unsynchronized-mutable event
                           ^:unsynchronized-mutable windows-state
                           ^:unsynchronized-mutable context
                           ^:unsynchronized-mutable replica-version
                           ^:unsynchronized-mutable epoch
                           heartbeat-ns
                           ^:unsynchronized-mutable last-heartbeat
                           ^:unsynchronized-mutable evicted]
  t/PTaskStateMachine
  (start [this] this)
  ;; Stops, in order: the coordinator, the messenger, the input and output
  ;; pipelines, and the checkpoint storage found on the current event.
  (stop [this scheduler-event]
    (stop-flag!)
    (when coordinator (coordinator/stop coordinator scheduler-event))
    (when messenger (component/stop messenger))
    (when input-pipeline (op/stop input-pipeline event))
    (when output-pipeline (op/stop output-pipeline event))
    (some-> event :onyx.core/storage checkpoint/stop)
    this)
  ;; True when either the task-level or the peer-level kill flag is set.
  (killed? [this]
    (or @(:onyx.core/task-kill-flag event) @(:onyx.core/kill-flag event)))
  (new-iteration? [this]
    (= idx iteration-idx))
  ;; Whether the most recent lifecycle call moved the machine to another state.
  (advanced? [this]
    advanced)
  (get-lifecycle [this]
    (aget lifecycle-names idx))
  ;; When more than `heartbeat-ns` has elapsed since the last heartbeat: polls
  ;; and offers heartbeats on every publisher, offers a barrier status to every
  ;; source peer of the subscriber, records heartbeat latencies, and evicts
  ;; peers whose publisher status has timed out. Otherwise does nothing.
  (heartbeat! [this]
    (let [curr-time (System/nanoTime)]
      (if (> curr-time (+ last-heartbeat heartbeat-ns))
        ;; send our status back upstream, and heartbeat
        (let [heartbeat-timer ^com.codahale.metrics.Timer (:last-heartbeat-timer monitoring)
              pubs (m/publishers messenger)
              sub (m/subscriber messenger)
              _ (run! pub/poll-heartbeats! pubs)
              _ (run! pub/offer-heartbeat! pubs)
              opts (assoc (barrier-status-opts this) :event :heartbeat)]
          (->> (sub/src-peers sub)
               (run! (fn [peer-id]
                       (sub/offer-barrier-status! sub peer-id opts))))
          (set! last-heartbeat curr-time)
          (run! (fn [hb]
                  (.update heartbeat-timer (- (System/nanoTime) hb) TimeUnit/NANOSECONDS))
                (all-heartbeat-times messenger))
          ;; check if downstream peers are still up
          (->> (timed-out-subscribers pubs subscriber-liveness-timeout-ms)
               (reduce evict-peer! this)))
        this)))
  ;; Logs a summary of the current state at info level.
  (log-state [this]
    (let [task-map (:onyx.core/task-map event)]
      (info "Task state"
            {:type (:onyx/type task-map)
             :name (:onyx/name task-map)
             :slot (:onyx.core/slot-id event)
             :id (:onyx.core/id event) 
             :lifecycle (get-lifecycle this)
             :adv? advanced
             :rv replica-version
             :e epoch
             :n-pubs (count (m/publishers messenger))
             :batch (:onyx.core/batch event)
             :results (:onyx.core/results event)}))
    this)
  (set-context! [this new-context]
    (set! context new-context)
    this)
  (get-context [this]
    context)
  (set-sealed! [this new-sealed]
    (set! sealed new-sealed)
    this)
  (sealed? [this]
    sealed)
  (get-input-pipeline [this]
    input-pipeline)
  (get-output-pipeline [this]
    output-pipeline)
  ;; Moves the machine onto `new-replica`. Three outcomes:
  ;;  - same allocation version for this job: only the coordinator and replica
  ;;    are updated;
  ;;  - this peer is killed or no longer allocated to this job/task: the task
  ;;    kill flag is set and nothing else changes;
  ;;  - otherwise (a new allocation version): the messenger is transitioned,
  ;;    pending checkpoint work is cancelled, the evicted set is cleared and the
  ;;    machine jumps back to the recover state.
  (next-replica! [this new-replica]
    (if (= replica new-replica)
      this
      (let [{:keys [onyx.core/job-id onyx.core/task-id]} event
            old-version (get-in replica [:allocation-version job-id])
            new-version (get-in new-replica [:allocation-version job-id])]
        (cond (= old-version new-version)
              (-> this
                  (set-coordinator! (coordinator/next-state coordinator replica new-replica))
                  (set-replica! new-replica))

              (let [allocated (common/peer->allocated-job (:allocations new-replica) (:onyx.core/id event))]
                (or (killed? this)
                    (not= task-id (:task allocated))
                    (not= job-id (:job allocated))))
              ;; Manually hit the kill switch early since we've been
              ;; reallocated and we want to escape ASAP
              (do
                (reset! (:onyx.core/task-kill-flag event) true)
                this)

              :else
              (let [next-messenger (ms/next-messenger-state! messenger event replica new-replica)]
                (checkpoint/cancel! (:onyx.core/storage event))
                (set! evicted #{})
                (-> this
                    (set-sealed! false)
                    (set-messenger! next-messenger)
                    (set-coordinator! (coordinator/next-state coordinator replica new-replica))
                    (set-replica! new-replica)
                    (reset-event!)
                    (goto-recover!)))))))
  (set-windows-state! [this new-windows-state]
    (set! windows-state new-windows-state)
    this)
  (get-windows-state [this]
    windows-state)
  ;; Stores the replica; when this job's allocation version differs from the
  ;; tracked replica-version, the epoch is reset to `initialize-epoch` and the
  ;; tracked version is updated.
  (set-replica! [this new-replica]
    (set! replica new-replica)
    (let [new-version (get-in new-replica [:allocation-version (:onyx.core/job-id event)])]
      (when-not (= new-version replica-version)
        (set-epoch! this initialize-epoch)
        (set! replica-version new-version)))
    this)
  (get-replica [this]
    replica)
  (set-event! [this new-event]
    (set! event new-event)
    this)
  ;; Emits a :leave-cluster log entry for `peer-id` on the outbox channel, at
  ;; most once per peer id until `evicted` is cleared by `next-replica!`.
  (evict-peer! [this peer-id]
    (let [{:keys [onyx.core/log-prefix onyx.core/id onyx.core/log
                  onyx.core/outbox-ch]} event]
      ;; If we're not up, don't emit a log message. We're probably dead too.
      (when (and (extensions/connected? log)
                 (not (get evicted peer-id)))
        ;; The evicted set is keyed by the id as received, before conversion.
        (set! evicted (conj evicted peer-id))
        (let [peer-id (coordinator-peer-id->peer-id peer-id)
              entry {:fn :leave-cluster
                     :peer-parent id
                     :args {:id peer-id
                            :group-id (get-in replica [:groups-reverse-index peer-id])}}]
          (info log-prefix "Peer timed out with no heartbeats. Emitting leave cluster." entry)
          (>!! outbox-ch entry))))
    this)
  ;; Restores the event the machine was constructed with.
  (reset-event! [this]
    (set! event init-event)
    this)
  (update-event! [this f]
    (set! event (f event))
    this)
  (get-event [this] event)
  ;; Sets the epoch both here and on the messenger.
  (set-epoch! [this new-epoch]
    (set! epoch new-epoch)
    (m/set-epoch! messenger new-epoch)
    this)
  (next-epoch! [this]
    (set-epoch! this (inc epoch)))
  (epoch [this]
    epoch)
  (replica-version [this]
    replica-version)
  (set-messenger! [this new-messenger]
    (set! messenger new-messenger)
    this)
  (get-messenger [this]
    messenger)
  (set-coordinator! [this next-coordinator]
    (set! coordinator next-coordinator)
    this)
  ;; Jumps to the recover state, clearing the context and resetting the event.
  (goto-recover! [this]
    (set! idx recover-idx)
    (-> this
        (set-context! nil)
        (reset-event!)))
  ;; Jumps to the next-iteration state. Returns `this`, like the other goto-*
  ;; methods, so that the call can be threaded (it used to return the result
  ;; of the set! form instead).
  (goto-next-iteration! [this]
    (set! idx iteration-idx)
    this)
  ;; Jumps to the first batch state and marks the machine as advanced.
  (goto-next-batch! [this]
    (set! advanced true)
    (set! idx batch-idx)
    this)
  (get-coordinator [this]
    coordinator)
  ;; Runs the lifecycle function at the current index. `advanced` is cleared
  ;; first so the function can signal progress by setting it. When no progress
  ;; was made the machine idles with a work count of 0 and heartbeats instead.
  (exec [this]
    (set! advanced false)
    (let [task-fn (aget lifecycle-fns idx)
          next-state (task-fn this)]
      (if advanced
        (do
          (.idle idle-strategy 1)
          next-state)
        (do (.idle idle-strategy 0)
            (heartbeat! next-state)))))
  ;; Moves to the following state, wrapping to the next-iteration state once
  ;; the end of the lifecycle array is reached.
  (advance [this]
    (let [new-idx ^int (unchecked-add-int idx 1)]
      (set! advanced true)
      (if (= new-idx nstates)
        (goto-next-iteration! this)
        (set! idx new-idx))
      this)))

(defn lookup-lifecycle-idx
  "Returns the zero-based position of the first entry in `lifecycles` whose
  `:lifecycle` equals `name`, or nil when there is no such entry."
  [lifecycles name]
  (first
   (keep-indexed (fn [position entry]
                   (when (= name (:lifecycle entry))
                     position))
                 lifecycles)))

(defn wrap-lifecycle-metrics
  "Returns the lifecycle's `:fn`, wrapped with timing when `monitoring` has an
  entry under the lifecycle's `:lifecycle` keyword.

  The wrapper calls the original fn, then calls the monitoring fn with the
  resulting state and the elapsed `System/nanoTime` difference, and returns the
  resulting state. Without a monitoring entry the original fn is returned
  unchanged."
  [monitoring lifecycle]
  (let [lifecycle-fn (:fn lifecycle)
        report-fn (get monitoring (:lifecycle lifecycle))]
    (if-not report-fn
      lifecycle-fn
      (fn [state]
        (let [started-at (System/nanoTime)
              result (lifecycle-fn state)]
          (report-fn result (unchecked-subtract (System/nanoTime) started-at))
          result)))))

(defn lookup-batch-start-index
  "Returns, as an int, the index at which a batch starts: the position of
  `:lifecycle/before-batch` when present, otherwise that of
  `:lifecycle/read-batch`."
  [lifecycles]
  ;; before-batch may have been stripped, in which case read-batch is the
  ;; first batch fn.
  (int (some (fn [lifecycle-name]
               (lookup-lifecycle-idx lifecycles lifecycle-name))
             [:lifecycle/before-batch :lifecycle/read-batch])))

(defn new-state-machine
  "Builds the initial TaskStateMachine for `event`.

  The lifecycle entries come from `build-task-fns` (those with a `:fn`), each
  fn wrapped by `wrap-lifecycle-metrics`. The machine starts at the recover
  index (0), on the starting replica for `peer-config`, with a freshly built
  messenger, `event` as both the initial and the current event, an empty
  evicted set, and the current `System/nanoTime` as the last heartbeat time.
  Heartbeat and liveness settings are read from `peer-config` and converted
  with `ms->ns`."
  [event peer-config messenger-group coordinator]
  (let [{:keys [onyx.core/input-plugin onyx.core/output-plugin
                onyx.core/monitoring onyx.core/id]} event
        starting-replica (onyx.log.replica/starting-replica peer-config)
        starting-version (:replica-version starting-replica)
        active-lifecycles (filter :fn (build-task-fns event))
        lifecycle-names (into-array clojure.lang.Keyword
                                    (map :lifecycle active-lifecycles))
        lifecycle-fns (into-array clojure.lang.IFn
                                  (map (fn [lifecycle]
                                         (wrap-lifecycle-metrics monitoring lifecycle))
                                       active-lifecycles))
        recover-position (int 0)
        iteration-position (int (lookup-lifecycle-idx active-lifecycles :lifecycle/next-iteration))
        batch-position (lookup-batch-start-index active-lifecycles)
        heartbeat-interval-ns (ms->ns (arg-or-default :onyx.peer/heartbeat-ms peer-config))
        task-messenger (m/build-messenger peer-config messenger-group monitoring id)
        backoff (BackoffIdleStrategy. 5
                                      5
                                      (arg-or-default :onyx.peer/idle-min-sleep-ns peer-config)
                                      (arg-or-default :onyx.peer/idle-max-sleep-ns peer-config))
        initial-windows (c/event->windows-states event)
        subscriber-timeout (ms->ns (arg-or-default :onyx.peer/subscriber-liveness-timeout-ms peer-config))
        publisher-timeout (ms->ns (arg-or-default :onyx.peer/publisher-liveness-timeout-ms peer-config))]
    ;; Positional arguments must follow the TaskStateMachine field order.
    (->TaskStateMachine monitoring
                        subscriber-timeout
                        publisher-timeout
                        input-plugin
                        output-plugin
                        backoff
                        recover-position
                        iteration-position
                        batch-position
                        (count lifecycle-fns)
                        lifecycle-names
                        lifecycle-fns
                        recover-position   ; idx: start in the recover state
                        false              ; advanced
                        false              ; sealed
                        starting-replica
                        task-messenger
                        messenger-group
                        coordinator
                        event              ; init-event
                        event              ; current event
                        initial-windows
                        nil                ; context
                        starting-version
                        initialize-epoch
                        heartbeat-interval-ns
                        (System/nanoTime)  ; last-heartbeat
                        #{})))

;; NOTE: currently, if task doesn't start before the liveness timeout, the peer will be killed
;; peer should probably be heartbeating here
(defn backoff-until-task-start!
  "Blocks the calling thread until `(start-lifecycle? event start-fn)` is truthy
  or either kill flag on `event` is set, sleeping for
  `:onyx.peer/peer-not-ready-back-off` between attempts.

  The back-off is read from `:onyx.core/opts` when the event has it, otherwise
  from `:onyx.core/peer-opts`, which is the key `compile-task` stores the peer
  options under. Reading only `:onyx.core/opts` meant the lookup ran against
  nil for events built by `compile-task`, so a configured back-off was ignored."
  [{:keys [onyx.core/kill-flag onyx.core/task-kill-flag
           onyx.core/opts onyx.core/peer-opts] :as event} start-fn]
  ;; The setting cannot change while we wait, so resolve it once.
  (let [back-off (arg-or-default :onyx.peer/peer-not-ready-back-off (or opts peer-opts))]
    (while (and (not (or @kill-flag @task-kill-flag))
                (not (start-lifecycle? event start-fn)))
      (Thread/sleep back-off))))

(defn start-task-lifecycle!
  "Runs `run-task-lifecycle!` for `state` inside a core.async `thread` and
  returns the channel that `thread` yields."
  [state handle-exception-fn exception-action-fn]
  (thread
    (run-task-lifecycle! state handle-exception-fn exception-action-fn)))

(defn take-final-state!!
  "Blocking take from the component's `:task-lifecycle-ch`; returns the value
  taken."
  [component]
  (let [lifecycle-ch (:task-lifecycle-ch component)]
    (<!! lifecycle-ch)))

(defn compile-task
  "Builds the initial event map for a task from the peer/task component map.

  Windows are narrowed with `wc/filter-windows` for this task's name, and
  triggers are narrowed to those whose `:trigger/window-id` belongs to one of
  the remaining windows. The resulting map is then passed through
  `c/task-params->event-map`, `c/flow-conditions->event-map` and
  `c/task->event-map`, in that order."
  [{:keys [task-information job-id task-id id monitoring log replica-origin
           replica opts outbox-ch group-ch task-kill-flag kill-flag]}]
  (let [{:keys [workflow catalog task flow-conditions resume-point
                windows triggers lifecycles metadata]} task-information
        task-name (:name task)
        log-prefix (logger/log-prefix task-information)
        task-map (find-task catalog task-name)
        task-windows (vec (wc/filter-windows windows task-name))
        task-window-ids (into #{} (map :window/id) task-windows)
        task-triggers (filterv (fn [trigger]
                                 (task-window-ids (:trigger/window-id trigger)))
                               triggers)]
    (info log-prefix "Compiling lifecycle")
    (-> {:onyx.core/id id
         :onyx.core/tenancy-id (:onyx/tenancy-id opts)
         :onyx.core/job-id job-id
         :onyx.core/task-id task-id
         :onyx.core/slot-id (get-in replica-origin [:task-slot-ids job-id task-id id])
         :onyx.core/task task-name
         :onyx.core/catalog catalog
         :onyx.core/workflow workflow
         :onyx.core/windows task-windows
         :onyx.core/triggers task-triggers
         :onyx.core/flow-conditions flow-conditions
         :onyx.core/lifecycles lifecycles
         :onyx.core/metadata metadata
         :onyx.core/task-map task-map
         :onyx.core/serialized-task task
         :onyx.core/log log
         :onyx.core/monitoring monitoring
         :onyx.core/task-information task-information
         :onyx.core/outbox-ch outbox-ch
         :onyx.core/group-ch group-ch
         :onyx.core/task-kill-flag task-kill-flag
         :onyx.core/kill-flag kill-flag
         :onyx.core/peer-opts opts
         :onyx.core/fn (operation/resolve-task-fn task-map)
         :onyx.core/resume-point resume-point
         :onyx.core/replica-atom replica
         :onyx.core/log-prefix log-prefix}
        (c/task-params->event-map)
        (c/flow-conditions->event-map)
        (c/task->event-map))))

(defn build-input-pipeline
  "For an `:input` task, instantiates the task's plugin and starts it with
  `op/start`. Returns nil for every other task type."
  [{:keys [onyx.core/task-map] :as event}]
  (when (= :input (:onyx/type task-map))
    (-> (instantiate-plugin event)
        (op/start event))))

(defn build-output-pipeline
  "Starts and returns the output pipeline for the task: the task's own plugin
  for an `:output` task, otherwise a messenger output built by
  `mo/new-messenger-output`."
  [{:keys [onyx.core/task-map] :as event}]
  (let [output-task? (= :output (:onyx/type task-map))
        pipeline (if output-task?
                   (instantiate-plugin event)
                   (mo/new-messenger-output event))]
    (op/start pipeline event)))

;; Component wrapping one running task on a peer.
;;
;; `start` compiles the task event, waits until the task may start, runs the
;; before-task-start lifecycle fns, builds monitoring, pipelines, coordinator,
;; storage and the task state machine, and finally spawns the task lifecycle
;; thread. Each stage has its own catch, which reports the failure through
;; `handle-exception` under the matching lifecycle name and returns the
;; component unchanged.
;;
;; `stop` sets the kill flag, waits for the lifecycle thread's final state,
;; stops that state, and runs the after-task-stop lifecycle fns.
(defrecord TaskLifeCycle
           [id log messenger-group job-id task-id replica group-ch log-prefix
            kill-flag outbox-ch completion-ch peer-group opts task-kill-flag
            scheduler-event task-information replica-origin]

  component/Lifecycle
  (start [component]
    (let [handle-exception-fn (fn [lifecycle action e]
                                (handle-exception task-information log e lifecycle
                                                  action group-ch outbox-ch id job-id))]
      (try
        (let [log-prefix (logger/log-prefix task-information)
              event (compile-task component)
              exception-action-fn (lc/compile-lifecycle-handle-exception-functions event)
              start?-fn (lc/compile-start-task-functions event)
              before-task-start-fn (or (lc/compile-lifecycle-functions event :lifecycle/before-task-start) identity)
              after-task-stop-fn (or (lc/compile-lifecycle-functions event :lifecycle/after-task-stop) identity)]
          (try
            (info log-prefix "Warming up task lifecycle" (:onyx.core/serialized-task event))
            (backoff-until-task-start! event start?-fn)
            (try
              (let [event (before-task-start-fn event)]
                (try
                  (let [task-monitoring (component/start (metrics-monitoring/new-task-monitoring event))
                        event (assoc event :onyx.core/monitoring task-monitoring)
                        input-pipeline (build-input-pipeline event)
                        output-pipeline (build-output-pipeline event)
                        {:keys [workflow resume-point]} task-information
                        coordinator (new-peer-coordinator workflow resume-point
                                                          log messenger-group
                                                          task-monitoring opts
                                                          id job-id group-ch)
                        ;; TODO, move storage into group. Both S3 transfer manager and ZooKeeper conn can be re-used
                        storage (if (= :zookeeper (or (:onyx.peer/storage opts) :zookeeper))
                                  ;; reuse group zookeeper connection
                                  (:onyx.core/log event)
                                  (onyx.checkpoint/storage opts task-monitoring))
                        event (assoc event
                                     :onyx.core/input-plugin input-pipeline
                                     :onyx.core/output-plugin output-pipeline
                                     :onyx.core/monitoring task-monitoring
                                     :onyx.core/storage storage)
                        ;; Validate before the state machine is built and the
                        ;; lifecycle thread is spawned. A failure after the
                        ;; spawn would return a component without :event or
                        ;; :task-lifecycle-ch, so `stop` could never join the
                        ;; already running thread.
                        _ (s/validate os/Event event)
                        state (new-state-machine event opts messenger-group coordinator)
                        _ (info log-prefix "Enough peers are active, starting the task")
                        task-lifecycle-ch (start-task-lifecycle! state handle-exception-fn exception-action-fn)]
                    (assoc component
                           :event event
                           :state state
                           :task-monitoring task-monitoring
                           :log-prefix log-prefix
                           :task-information task-information
                           :after-task-stop-fn after-task-stop-fn
                           :task-kill-flag task-kill-flag
                           :kill-flag kill-flag
                           :task-lifecycle-ch task-lifecycle-ch
                           ;; atom for storing peer test state in property test
                           :holder (atom nil)))
                  (catch Throwable e
                    (let [lifecycle :lifecycle/initializing
                          action (exception-action-fn event lifecycle e)]
                      (handle-exception-fn lifecycle action e)
                      component))))
              (catch Throwable e
                (let [lifecycle :lifecycle/before-task-start
                      action (exception-action-fn event lifecycle e)]
                  (handle-exception-fn lifecycle action e))
                component))
            (catch Throwable e
              (let [lifecycle :lifecycle/start-task?
                    action (exception-action-fn event lifecycle e)]
                (handle-exception-fn lifecycle action e))
              component)))
        (catch Throwable e
          ;; kill job as errors are unrecoverable if thrown in the compile stage
          (handle-exception-fn :lifecycle/compiling :kill e)
          component))))

  (stop [component]
    ;; :task-information is only meaningful after a successful start; when the
    ;; task name cannot be found, set-up is reported as failed.
    (if (:name (:task (:task-information component)))
      (info (:log-prefix component) "Stopping task lifecycle.")
      (warn (:log-prefix component) "Stopping task lifecycle, failed to initialize task set up."))

    ;; :event is only present when `start` got as far as spawning the task
    ;; lifecycle thread.
    (when-let [event (:event component)]
      (debug (:log-prefix component) "Stopped task. Waiting to fall out of task loop.")
      (reset! (:kill-flag component) true)
      (some-> component :task-monitoring component/stop)
      (when-let [final-state (take-final-state!! component)]
        (t/stop final-state (:scheduler-event component))
        (reset! (:task-kill-flag component) true))
      (when-let [f (:after-task-stop-fn component)]
        ;; do we want after-task-stop to fire before seal / job completion, at
        ;; the risk of it firing more than once?
        ;; we may need an extra lifecycle function which can be used for job completion, 
        ;; but not cleaning up resources
        (f event)))
    (assoc component
           :event nil
           :state nil
           :holder nil
           :log-prefix nil
           :task-information nil
           :task-kill-flag nil
           :kill-flag nil
           :task-lifecycle-ch nil)))

(defn task-lifecycle
  "Creates a TaskLifeCycle record from the merge of the `peer` and `task` maps;
  on key conflicts the value from `task` wins."
  [peer task]
  (-> (merge peer task)
      (map->TaskLifeCycle)))
