(ns bionlp.umls
  "Utility to work with UMLS data source e.g. Read UMLS data and create lookup tries for 
   entity resolution to UMLS cui based on exact-match with text.

   Tool uses UMLS Lexical Variant Generator (LvgCmd) to generate spelling variants so it
   needs to be installed for the lookups to work."
  (:require [clojure.java.io :as io]
            [clojure.string :as str]
            [opennlp.nlp :refer :all]
            [bionlp.util :refer :all])
  (:import [gov.nih.nlm.nls.lvg.Api LvgCmdApi]))

(def apicmd (LvgCmdApi. "-f:b" "resources/data.config.lvg"))
(def acronym-expn (LvgCmdApi. "-f:a" "resources/data.config.lvg"))
(def variant-gen (LvgCmdApi. "-f:G" "resources/data.config.lvg"))
(def syn-gen (LvgCmdApi. "-f:y" "resources/data.config.lvg"))
(def pos-tag (make-pos-tagger "resources/models/en-pos-maxent.bin"))
(def tui-map {:disease '("T019" "T020" "T037" "T046" "T047" "T048" "T184" "T190" "T191")
              :drug '("T195" "T200" "T109" "T197" "T121")
              :gene '("T116" "T087" "T028" "T114" "T086")})

(defn generate-variants
  "Uses Lvg fruitful variant generator to generate variants for given text"
  [txt]
  (map #(second (str/split % #"[|]")) (str/split (.MutateToString variant-gen txt) #"\n")))

(defn generate-syns
  "Uses Lvg Synonym generator to generate variants for given text"
  [txt]
  (map #(second (str/split % #"[|]")) (str/split (.MutateToString syn-gen txt) #"\n")))

(defn synonymous-variants
  "Generates possible variations to supplied list of tokens
  `tokens` - tokens associated with text to generate synonymous variants for"
  [tokens]
  (loop [tokens tokens
         result '()]
    (if (not (empty? tokens))
      (let [expanded (filter #(not (nil? %)) (concat (list (first tokens)) (into #{} (generate-variants (first tokens))) (into #{} (generate-syns (first tokens)))))]
        (recur (rest tokens)
               (if (empty? result)
                 expanded
                 (for [r result
                       exp expanded]
                   (str r " " exp)))))
      result)))

(defn line->stype
  "Parse provided line from the UMLS MRSTY file into a map.
   MRSTY file provides the semantic type information for concepts in UMLS.
   The MRSTY file contains following columns in order specified - 
   CUI - Concept unique identifier 
   TUI - unique identifier of semantic type
   STN - semantic type tree number
   STY - semantic type
   ATUI - unique identifier of attribute
   CVF - content view flag"
  [line]
  (let [[cui tui stn sty atui cvf] (str/split line #"[|]")]
    {:cui cui
     :tui tui
     :stn stn
     :sty sty
     :atui atui
     :cvf cvf}))

(defn line->concept
  "Parse provided line from the UMLS MRCONSO file into a map.
   MRCONSO file provides concept information in UMLS.
   The MRCONSO file contains following columns in order specified -
   CUI - Unique identifier of concept
   LAT - Language of term
   TS - Term Status
   LUI - Unique identifier for term
   STT - String type
   SUI - Unique identifier for string
   ISPREF - Indicates whether AUI is preferred
   AUI - Unique identifier for atom
   SAUI - Source asserted atom identifier
   SCUI - Source asserted concept identifier
   SDUI - Source asserted descriptor identifier
   SAB - Source Abbreviation
   TTY - Term type in source
   CODE - Unique identifier or code for string in source
   STR - String
   SRL - Source restriction level
   SUPRESS - Suppressible flag
   CVF - Content view flag"
  [line]
  (let [[cui lat ts lui stt sui ispref aui saui scui sdui sab tty code str & rest] (str/split line #"[|]")]
    {:cui cui
     :lat lat
     :ts ts
     :lui lui
     :stt stt
     :sui sui
     :ispref ispref
     :aui aui
     :scui scui
     :sab sab
     :tty tty
     :str str
     :code code}))

(defn load-semtypes
  "Looks for MRSTY.RRF file in resources folder and lazy loads it"
  []
  (if (.exists (io/file "resources/MRSTY.RRF"))
    (->> (io/file "resources/MRSTY.RRF")
         (io/reader)
         (line-seq)
         (map #(line->stype %)))))

(defn load-concepts
  "Looks for MRCONSO.RRF file in resources folder and lazy loads it"
  []
  (if (.exists (io/file "resources/MRCONSO.RRF"))
    (->> (io/file "resources/MRCONSO.RRF")
         (io/reader)
         (line-seq)
         (map #(line->concept %)))))

(defn create-semtype-trie
  "Builds a semantic look up trie of concepts filtered based on supplied tuis.
   `tuis` - TUIs of semantic types to filter the concepts on.
   Returns a map containing concept trie keyed on concept CUIs"
  [tuis]
  (let [tuis (if (keyword? tuis)
               (tuis tui-map)
               tuis)]
    (reduce (fn [trie x]
              (assoc-in trie (:cui x) {:tui ((comp to-array concat)
                                             (:tui (get-in trie (:cui x)))
                                             (to-array [(Integer/parseInt (subs (:tui x) 1))]))}))
            {}
            (if (empty? tuis)
              (load-semtypes)
              (->> (load-semtypes)
                   (filter #(some (fn [t] (= t (:tui %))) tuis)))))))

(defn create-concept-lookup-trie
  "Builds a trie of concepts filtered by given list of tuis or semantic groups.
   `tuis` - TUIs of semantic types to filter the concept on. `tuis` can be sent-in as a
   list of TUIs e.g. '(\"T047\" \"T184\") or it could be one of following keywords with 
   pre-selected list of TUIs:

   |keyword    |semantic types     |
   |-----------|-------------------|
   |:disease   |acab, anab, cgab, inpo, mobd, dsyn, neop, sosy |
   |:drug      |antb, clnd, orch, inch, phsu |
   |:gene      |aapp, amas, gngm, nnon, nusq |

   * for umls semantic group reference - https://metamap.nlm.nih.gov/Docs/SemanticTypes_2018AB.txt
   `source-exclude-list` can be provided as a list of source abbreviations of sources to exclude.
   E.g. For human disease, you might want to skip the SNOMEDCT_VET source for veterinary concepts.

   Returns a trie of concepts keyed by concept string"
  ([tuis]
   (let [sem-trie (create-semtype-trie tuis)]
     (create-concept-lookup-trie tuis sem-trie nil)))
  ([tuis sem-trie]
   (create-concept-lookup-trie tuis sem-trie nil))
  ([tuis sem-trie source-exclude-list]
   (let [source-exclude-list (into #{} source-exclude-list)
         tuis (into #{}
                    (map #(Integer/parseInt (subs % 1))
                         (if (keyword? tuis)
                           (tuis tui-map)
                           tuis)))]
     (reduce (fn [trie concept]
               (if (or
                    (:cui (get-in sem-trie (:cui concept)))
                    (contains? source-exclude-list (:sab concept))
                    (empty? (clojure.set/intersection (into #{} (:tui (get-in sem-trie (:cui concept)))) tuis)))
                 trie
                 (assoc-in trie (:str concept) (merge (get-in trie (:str concept)) {:cui (:cui concept)}))))
             {}
             (load-concepts)))))

(defn lookup-concept
  "Perform exact-match on supplied `line` or an inflected/un-inflected version of the text.
   
   `concept-trie` - concept trie to use. create using `create-concept-lookup-trie`
   `line` - text to match
   `partial-match` - implement partial match by incrementally ignoring the first word in text sequence and matching rest against the trie.

   Returns matched CUI if found otherwise returns nil."
  ([concept-trie line]
   (lookup-concept concept-trie line false false))
  
  ([concept-trie line expand-acronym? partial-match?]
   (let [check-match (fn [txt]
                       (if txt
                         (or (:cui (get-in concept-trie txt))
                             (:cui (get-in concept-trie (str/lower-case txt)))
                             (:cui (get-in concept-trie (str/capitalize txt)))
                             (:cui (get-in concept-trie (str/upper-case txt)))
                             (:cui (get-in concept-trie (str/replace txt #"[-]" " "))))))]
     (loop [tokens (str/split line #"\s+")]
       (if (not (empty? tokens))
         (let [txt (str/trim (str/join " " tokens))
               found (or ;Check direct match
                         (check-match txt)

                         ;Check by dropping first token for text with tokens between 3 and 4.
                         (if (and (>= (count tokens) 3) (<= (count tokens) 4))
                           (check-match (str/trim (str/join " " (rest tokens)))))

                         ;Try to expand acronym if only one word token
                         ;In case of multiple matches - we need a way to sort by most relevant expansions first.
                         (if (and expand-acronym? (= (count tokens) 1))
                           (apply (make-fn or) (map #(check-match (second (str/split % #"[|]"))) (str/split (.MutateToString  acronym-expn txt) #"\n"))))

                         ;Convert plural noun tags to singular and check match
                         ;Also check against spelling variants and synonyms of token for matches [disabled for now]
                         (let [uninflected-tokens
                               (keep-indexed
                                (fn [idx elem]
                                  (if (= "NNS" (second elem))
                                    [idx (into #{}
                                               (map #(second (str/split % #"[|]"))
                                                    (str/split (.MutateToString apicmd (first elem)) #"\n")))]))
                                (into [] (pos-tag tokens)))]
                            (or
                             (apply (make-fn or) (filter #(not (nil? %))
                                                         (flatten (map (fn [[idx uninf]]
                                                                         (map #(check-match (str/join " " (assoc tokens idx %))) uninf)) uninflected-tokens))))
                             (if (<= (count tokens) 3)
                               (apply (make-fn or) (filter #(not (nil? %)) (map (fn [variant] (check-match variant)) (synonymous-variants tokens))))))))]
           (or found
               (and partial-match?
                    (recur (into [] (rest tokens)))))))))))

