(ns beagle.phrases
  (:gen-class)
  (:require [clojure.string :as s]
            [clojure.java.io :as io]
            [jsonista.core :as json]
            [beagle.validator :as validator]
            [beagle.annotation-merger :as merger]
            [beagle.dictionary-optimizer :as optimizer]
            [beagle.text-analysis :as text-analysis])
  (:import (java.util UUID)
           (org.apache.lucene.document Document FieldType Field)
           (org.apache.lucene.index IndexOptions)
           (org.apache.lucene.monitor Monitor MonitorQuery HighlightsMatch MonitorConfiguration
                                      MonitorQuerySerializer HighlightsMatch$Hit)
           (org.apache.lucene.search PhraseQuery MatchAllDocsQuery)
           (org.apache.lucene.util BytesRef)))

(defn match->annotation
  "Expands a single highlight `match` into a lazy sequence of annotation maps,
  one map per hit reported by the match.

  The query metadata is looked up in `monitor` by the match's query id.
  The annotation `:type` is the metadata value stored under \"_type\" when
  present, otherwise `type-name`. `:text` is the substring of `text` between
  the hit's start and end offsets."
  [text monitor type-name ^HighlightsMatch match]
  (for [[_ field-hits] (.getHits match)
        ;; metadata is resolved once per entry of the hits map, as before
        :let [query-meta (.getMetadata (.getQuery monitor (.getQueryId match)))]
        hit field-hits
        :let [begin (.-startOffset ^HighlightsMatch$Hit hit)
              end (.-endOffset ^HighlightsMatch$Hit hit)]]
    {:text          (subs text begin end)
     :type          (or (get query-meta "_type") type-name)
     :dict-entry-id (.getQueryId match)
     :meta          (into {} query-meta)
     :begin-offset  begin
     :end-offset    end}))

;; Field type for the text field that `annotate-text` adds to a document:
;; tokenized, indexed with docs, freqs, positions and offsets, and with
;; term vectors (including their offsets) enabled.
(def ^FieldType field-type
  (let [ft (FieldType.)]
    (.setTokenized ft true)
    (.setIndexOptions ft IndexOptions/DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
    (.setStoreTermVectors ft true)
    (.setStoreTermVectorOffsets ft true)
    ft))

(defn annotate-text
  "Matches `text` against the queries registered in `monitor` and returns a
  lazy sequence of annotation maps built by `match->annotation`.

  The text is wrapped in a one-field document; the field name is resolved
  from `analysis-conf` and `default-analysis-conf`. `type-name` is passed
  through to `match->annotation` as the fallback annotation type."
  [^String text ^Monitor monitor analysis-conf ^String type-name default-analysis-conf]
  (let [field-name (text-analysis/get-field-name analysis-conf default-analysis-conf)
        field (Field. ^String field-name text field-type)
        doc (doto (Document.)
              (.add field))
        highlights (.match monitor doc (HighlightsMatch/MATCHER))
        matches (.getMatches highlights)]
    (mapcat (fn [match]
              (match->annotation (.get doc field-name) monitor type-name match))
            matches)))

(defn prepare-synonyms
  "Returns a lazy sequence with one dictionary entry per synonym of `dict-entry`.

  Each derived entry is a copy of `dict-entry` where:
  - `:text` is the synonym,
  - `:synonyms` is removed,
  - `:id` is a freshly generated random UUID string,
  - `:meta` additionally carries `:synonym? \"true\"` and `:query-id query-id`."
  [query-id {:keys [synonyms] :as dict-entry}]
  (let [template (dissoc dict-entry :synonyms)]
    (for [synonym synonyms]
      (-> template
          (assoc :text synonym
                 :id (str (UUID/randomUUID)))
          (update :meta assoc :synonym? "true" :query-id query-id)))))

(defn save-queries-in-monitor
  "Registers `monitor-queries` (an Iterable of monitor queries) in `monitor`.

  Any Exception thrown during registration is caught and its stack trace is
  printed; it is not rethrown. Returns nil."
  [^Monitor monitor monitor-queries]
  (let [^Iterable queries monitor-queries]
    (try
      (.register monitor queries)
      (catch Exception ex
        (.printStackTrace ex)))))

(defn phrase->strings
  "Returns a Java String array with the token strings produced from the
  `:text` of `dict-entry`, using the analyzer resolved from `dict-entry`
  and `default-analysis-conf`."
  [dict-entry default-analysis-conf]
  (->> (text-analysis/get-string-analyzer dict-entry default-analysis-conf)
       (text-analysis/text->token-strings (:text dict-entry))
       (into-array String)))

(defn dict-entry->monitor-query
  "Builds a `MonitorQuery` wrapping a `PhraseQuery` for `dict-entry`.

  - The query id is the entry's `:id`, or `idx` as a string when the id is missing.
  - The phrase terms come from `phrase->strings`; the field name is resolved
    from the entry and `default-analysis-conf`.
  - The query string is the entry's `:text`.
  - The metadata is the entry's `:meta` with every key converted via `name`;
    when the entry has a `:type` it is added under the \"_type\" key."
  [{:keys [id text meta type] :as dict-entry} default-analysis-conf idx]
  (let [stringify-key (fn [acc k v] (assoc acc (name k) v))
        raw-meta (cond-> meta
                   type (assoc :_type type))
        metadata (reduce-kv stringify-key {} raw-meta)
        query-id (or id (str idx))
        field-name (text-analysis/get-field-name dict-entry default-analysis-conf)
        phrase (PhraseQuery. field-name
                             (phrase->strings dict-entry default-analysis-conf))]
    (MonitorQuery. query-id phrase text metadata)))

(defn dict-entries->monitor-queries
  "Converts `dict-entries` into one flat lazy sequence of monitor queries.

  For every entry the sequence contains the query for the entry itself,
  followed by one query per synonym. An entry without an `:id` gets its
  position in `dict-entries` (as a string) as the query id, and that same id
  is what the synonym entries record as their `:query-id`."
  [dict-entries default-analysis-conf]
  (mapcat (fn [entry position]
            (let [query-id (or (:id entry) (str position))
                  main-query (dict-entry->monitor-query entry default-analysis-conf position)
                  synonym-queries (map (fn [synonym-entry]
                                         (dict-entry->monitor-query synonym-entry default-analysis-conf nil))
                                       (prepare-synonyms query-id entry))]
              (cons main-query synonym-queries)))
          dict-entries
          (range)))

(defn prepare-monitor
  "Converts `dict-entries` to monitor queries and registers them in `monitor`."
  [monitor dict-entries default-analysis-conf]
  (->> (dict-entries->monitor-queries dict-entries default-analysis-conf)
       (save-queries-in-monitor monitor)))

;; MonitorQuerySerializer that stores a monitor query as a UTF-8 JSON object
;; with the keys "query-id", "query" (the query string) and "metadata".
;;
;; NOTE(review): deserialization does not rebuild the original query; it
;; always substitutes a MatchAllDocsQuery. Presumably acceptable because
;; `create-monitor` passes a nil index path — confirm before relying on
;; deserialized queries for matching.
(def monitor-query-serializer
  (reify MonitorQuerySerializer
    (serialize [_ query]
      (BytesRef.
        (json/write-value-as-string
          {"query-id" (.getId query)
           "query"    (.getQueryString query)
           "metadata" (.getMetadata query)})))
    (deserialize [_ binary-value]
      ;; A BytesRef is a (bytes, offset, length) slice, so reading the raw
      ;; `.bytes` array would include bytes outside the slice whenever the
      ;; ref points into a larger shared buffer. `utf8ToString` decodes
      ;; exactly the referenced range.
      (let [dq (json/read-value (.utf8ToString ^BytesRef binary-value))]
        (MonitorQuery. (get dq "query-id")
                       (MatchAllDocsQuery.)
                       (get dq "query")
                       (get dq "metadata"))))))

(defn create-monitor
  "Creates a new `Monitor` using the analyzer resolved from `analysis-conf`
  and `default-analysis-conf`.

  The monitor configuration is given a nil index path together with
  `monitor-query-serializer`."
  [analysis-conf default-analysis-conf]
  (let [analyzer (text-analysis/get-string-analyzer analysis-conf default-analysis-conf)
        config (doto (MonitorConfiguration.)
                 (.setIndexPath nil monitor-query-serializer))]
    (Monitor. analyzer config)))

(defn setup-monitors
  "Groups `dictionary` entries by `text-analysis/conf->analyzers` and creates
  one populated monitor per group.

  Returns a vector of maps `{:analysis-conf ... :monitor ...}`, where
  `:analysis-conf` holds the `text-analysis/analysis-keys` selected from the
  first entry of the group."
  [dictionary default-analysis-conf]
  (let [entries-by-analyzer (group-by text-analysis/conf->analyzers dictionary)
        build-monitor (fn [entries]
                        (let [conf (select-keys (first entries) text-analysis/analysis-keys)
                              mon (create-monitor conf default-analysis-conf)]
                          (prepare-monitor mon entries default-analysis-conf)
                          {:analysis-conf conf :monitor mon}))]
    (reduce-kv (fn [monitors _ entries]
                 (conj monitors (build-monitor entries)))
               []
               entries-by-analyzer)))

(defn synonym-annotation?
  "True when the annotation's `:meta` has the string \"true\" under the
  \"synonym?\" key."
  [annotation]
  (-> annotation
      (get :meta)
      (get "synonym?")
      (= "true")))

(defn meta-type?
  "True when the annotation's `:meta` holds a string under the \"_type\" key."
  [annotation]
  (-> annotation
      (get :meta)
      (get "_type")
      string?))

(defn post-process
  "Normalizes an annotation after matching.

  - For a synonym annotation, `:dict-entry-id` is replaced by the value stored
    in `:meta` under \"query-id\".
  - When `:meta` carries a string \"_type\" entry, that entry is removed from `:meta`."
  [annotation]
  (let [relinked (if (synonym-annotation? annotation)
                   (assoc annotation :dict-entry-id (get-in annotation [:meta "query-id"]))
                   annotation)]
    ;; the type check is made against the incoming annotation, not the relinked one
    (if (meta-type? annotation)
      (update relinked :meta dissoc "_type")
      relinked)))

(defn annotator
  "Creates an annotator function for a given dictionary.
  Params:
  - dictionary: a list of dictionary entries as described in `beagle.schema`
  Options:
  - type-name: a string, defaults to \"PHRASE\" when blank
  - validate-dictionary?: if set to true then the dictionary is passed to the validator first, default false
  - optimize-dictionary?: if set to true then the dictionary is optimized before the monitors are created, default false
  - tokenizer: a keyword one of #{:standard :whitespace}, default :standard

  The returned function takes a text plus the option `merge-annotations?`.
  It returns an empty vector for blank text; otherwise the post-processed
  annotations from all monitors, passed through
  `merger/merge-same-type-annotations` when `merge-annotations?` is truthy."
  [dictionary & {:keys [type-name validate-dictionary? optimize-dictionary? tokenizer]}]
  (when validate-dictionary?
    (validator/validate-dictionary dictionary))
  (let [entries (cond-> dictionary
                  optimize-dictionary? optimizer/optimize)
        label (if (s/blank? type-name) "PHRASE" type-name)
        base-conf {:tokenizer tokenizer}
        monitors (setup-monitors entries base-conf)
        annotate-with (fn [text {:keys [monitor analysis-conf]}]
                        (annotate-text text monitor analysis-conf label base-conf))]
    (fn [text & {:keys [merge-annotations?]}]
      (if (s/blank? text)
        []
        (let [annotations (->> monitors
                               (mapcat (fn [monitor-entry] (annotate-with text monitor-entry)))
                               (map post-process))]
          (cond-> annotations
            merge-annotations? merger/merge-same-type-annotations))))))
