(ns morri.tss-cpg
  (:require [honeysql.core :as sql]
            [honeysql.helpers :refer :all]
            [morri.meth450k.common.command-line :as cli]
            [morri.meth450k.common.db-utils :as db-utils]
            [morri.meth450k.common.illumina450k-db :as i450k-db]
            [morri.meth450k.common.ucsc-db :as ucsc-db]
            [morri.meth450k.common.utils :as utils]
            [morri.one-tx-per-gene :as one-tx]
            [morri.unique-tss :as tss]
            [morri.lib :refer [tprn]]
            [pallet.thread-expr :refer [when->]])
  (:gen-class))

;; The goal here is to take a list of transcripts and return the last
;; Illumina 450k CpG before the TSS of each transcript, or to take the
;; 450k CpGs and return, for each one, the closest TSS (smallest
;; absolute distance) among the given transcripts.
;; Output is CSV with four columns: cpg, gene-symbol, tx-id, dist-to-tss.

;; Functions for cpg->tss

(defn closest-tss-query
  "Builds the parameterised base query used to rank transcripts in
  `table` by distance from a genomic position.

  Returns a one-element vector holding the SQL string.  The SQL has
  three `?` placeholders, in order: the position (used for '+' strand
  rows as `position - txStart`), the position again (used for other
  rows as `txEnd - position`), and the chromosome.  Rows come back
  ordered by ascending absolute `dist_to_tss`.  `table` may be a
  keyword, symbol or string; only its name is interpolated."
  [table]
  (let [template (str "\nselect name,\n"
                      "IF(strand = '+', ? - cast(txStart as signed), cast(txEnd as signed) - ?)\n"
                      "as dist_to_tss\n"
                      "from %s where chrom = ?\n"
                      "order by abs(dist_to_tss) asc\n")
        sql-text (format template (name table))]
    (vector sql-text)))

(defn closest-tss
  "Runs `base-query` (as produced by `closest-tss-query`) for the CpG
  described by `chr` and `mapinfo`, keeps only rows whose `:name` is in
  `prefered-tx-set`, and returns the first such row, which is the
  nearest one given the query's ordering.

  Prints a message with the CpG location and returns nil when no
  row survives the filter."
  [prefered-tx-set base-query {:keys [chr mapinfo]}]
  (let [candidates (->> (conj base-query mapinfo mapinfo (utils/require-chr chr))
                        ucsc-db/ucsc-query
                        (filter (fn [row] (prefered-tx-set (:name row)))))]
    (if (seq candidates)
      (first candidates)
      (do
        (println "Can't find closest tss to"
                 (utils/ucsc chr mapinfo (+ 2 mapinfo)))
        ;; Same result as taking `first` of an empty seq.
        nil))))

(defn all-450k-cpgs
  "Queries the Illumina 450k table for the `ilmnID`, `mapinfo` and `chr`
  columns of every CpG.  When `cpg-limit` is truthy a LIMIT clause with
  that value is added; otherwise the query is unrestricted."
  [cpg-limit]
  (let [unbounded (from (select :ilmnID :mapinfo :chr) i450k-db/i450k-table)
        query-map (if cpg-limit
                    (limit unbounded cpg-limit)
                    unbounded)]
    (i450k-db/i450k-query (sql/format query-map))))

(defn cpg->tss
  "Writes a CSV to `output-file` mapping each 450k CpG to the closest
  TSS among `transcripts` in the `gene-model` table.

  CpGs lacking `:mapinfo` or `:chr`, and CpGs for which `closest-tss`
  finds nothing, are left out.  Each row is
  [cpg gene-symbol tx-id dist-to-tss], preceded by a header row;
  `id->gene-symbol` maps a transcript id to its gene symbol and
  `cpg-limit` is passed through to `all-450k-cpgs`."
  [{:keys [transcripts output-file
           gene-model cpg-limit id->gene-symbol]}]
  (let [wanted-tx (set transcripts)
        cpgs (utils/show-progress 500 (all-450k-cpgs cpg-limit))
        tss-query (closest-tss-query gene-model)
        column-names ["cpg" "gene-symbol" "tx-id" "dist-to-tss"]
        ;; Returns a CSV row for one CpG, or nil when it must be skipped.
        cpg->row (fn [cpg]
                   (when (and (:mapinfo cpg) (:chr cpg))
                     (when-let [hit (closest-tss wanted-tx tss-query cpg)]
                       (let [cpg-id (:ilmnID cpg)
                             tx-id (:name hit)
                             distance (:dist_to_tss hit)
                             symbol-name (id->gene-symbol tx-id)]
                         [cpg-id symbol-name tx-id distance]))))
        rows (keep cpg->row cpgs)]
    ;; conj on a lazy seq prepends, so the header ends up first.
    (utils/csv-write output-file (conj rows column-names))))

;; Functions for tss->cpg

(defn tss->cpg
  "Writes a CSV to `output-file` mapping each transcript in
  `transcripts` to the CpG returned by `i450k-db/last-cpg` for its TSS.

  For every transcript id the TSS is looked up in `gene-model` via
  `tss/get-tx-tss`; `i450k-db/last-cpg` is expected to return a
  [cpg dist-to-tss] pair.  Transcripts for which no CpG is found are
  skipped.  Each row is [cpg gene-symbol tx-id dist-to-tss], preceded
  by a header row; `id->gene-symbol` maps a transcript id to its gene
  symbol."
  [{:keys [transcripts output-file gene-model id->gene-symbol]}]
  (let [header ["cpg" "gene-symbol" "tx-id" "dist-to-tss"]
        last-cpg-output
        (for [tx transcripts
              :let [tss-map (tss/get-tx-tss gene-model tx)
                    [last-cpg dist-to-tss] (i450k-db/last-cpg tss-map)]
              :when last-cpg
              ;; Look the symbol up only for transcripts that produce a
              ;; row, mirroring cpg->tss.
              :let [gene-symbol (id->gene-symbol tx)]]
          ;; Emit the CpG id bound above.  The namespaced
          ;; i450k-db/last-cpg is the lookup function itself and must
          ;; not be written into the "cpg" column.
          [last-cpg gene-symbol tx dist-to-tss])]
    ;; conj on a lazy seq prepends, so the header ends up first.
    (utils/csv-write output-file (conj last-cpg-output header))))

;; Accepted values for the --direction option.  -main dispatches on
;; these: :tss-to-cpg runs tss->cpg and :cpg-to-tss runs cpg->tss.
;; The set is also printed in the option's help text.
(def valid-directions #{:tss-to-cpg :cpg-to-tss})

;; Accepted values for the --gene-model option.  The chosen keyword is
;; used as the table name in closest-tss-query and selects the
;; gene-symbol lookup in gs-lookup-fn.  The set is also printed in the
;; option's help text.
(def gene-model-options #{:wgEncodeGencodeBasicV17
                          :knownGene})

;; Command-line option specification handed to cli/parse-command-line
;; in -main.  Each entry lists the switch(es), a description, and
;; optional :default / :flag / :parse-fn settings.
;; NOTE(review): the exact spec format is defined by
;; morri.meth450k.common.command-line, which is not visible here.
(def options-config
  [["-h" "--help"
    "Connect transcripts to the nearest upstream cpg or
cpgs to the nearest transcript"
    :default false :flag true]
   ;; Which mapping to run; must be one of valid-directions.
   ["-d" "--direction" (str "Select from " valid-directions)
    :parse-fn (cli/validate valid-directions)]
   ;; -main reads this file with utils/csv-read and flattens the result
   ;; into the list of transcript ids.
   ["-tx" "--transcript-file" "File with list of transcripts"]
   ["-f" "--output-file" "File for csv output"]
   ;; Defaults to false, in which case all-450k-cpgs adds no LIMIT.
   ["--cpg-limit" "CpG Limit for testing cpg-to-tss" :default false
    :parse-fn cli/ensure-int]
   ;; Must be one of gene-model-options; defaults to :knownGene.
   ["-g" "--gene-model"
    (str "Database for gene model, choose from " gene-model-options)
    :default :knownGene
    :parse-fn (cli/validate gene-model-options)]])

(defn gs-lookup-fn
  "Returns a function that maps a transcript id to its gene symbol for
  the given `gene-model`.

  The returned function calls the model-specific lookup from
  `morri.one-tx-per-gene` and takes the second element of its result.
  Throws (via `case`) when `gene-model` is neither
  :wgEncodeGencodeBasicV17 nor :knownGene."
  [gene-model]
  (let [raw-lookup (case gene-model
                     :knownGene one-tx/lookup-gene-symbol-ucsc
                     :wgEncodeGencodeBasicV17 one-tx/lookup-gene-symbol-gencode)]
    (comp second raw-lookup)))

(defn -main
  "Command-line entry point.

  Parses `args` against `options-config`, reads the transcript ids from
  the --transcript-file CSV, attempts to create indexes on txStart,
  txEnd and (txStart, txEnd) of the gene-model table in the \"ucsc\"
  database, then runs `tss->cpg` or `cpg->tss` according to --direction."
  [& args]
  (let [parsed (cli/parse-command-line args options-config)
        tx-ids (flatten (utils/csv-read (:transcript-file parsed)))
        gene-model (:gene-model parsed)
        symbol-lookup (gs-lookup-fn gene-model)
        job (assoc parsed
              :transcripts tx-ids
              :id->gene-symbol symbol-lookup)]
    ;; One index per column set, created in the same order as before.
    (doseq [columns [[:txStart] [:txEnd] [:txStart :txEnd]]]
      (db-utils/try-create-index (db-utils/mysql-db "ucsc") gene-model columns))
    (case (:direction job)
      :cpg-to-tss (cpg->tss job)
      :tss-to-cpg (tss->cpg job))))
