(ns bcbio.variation.ensemble.prep
  "Prepare variant inputs for ensemble calling approaches.
   Creates normalized, bgzipped tabix indexed inputs from VCF files."
  (:require [bcbio.run.fsp :as fsp]
            [bcbio.run.itx :as itx]
            [clojure.core.strint :refer [<<]]
            [clojure.java.io :as io]
            [clojure.string :as string]
            [me.raynes.fs :as fs]))

(defn unique-work-file
  "Create a work file with unique name in case of shared base names.

   Finds the shortest run of trailing path components of `orig-file` that no
   other entry of `all-files` also ends with, joins those components with
   \"-\" and passes the result to `fsp/add-file-part` along with `ext` and
   `work-dir`.

   Candidate suffixes are compared as \"/\"-separated paths, so inputs that
   share both file name and parent directory name (a/x/f.vcf and b/x/f.vcf)
   keep extending until they differ instead of both resolving to x-f.vcf.
   The search is capped at the number of components in `orig-file`, so it
   terminates even when another file ends with the whole of `orig-file`."
  [orig-file ext all-files work-dir]
  (let [cmp-files (remove #(= % orig-file) all-files)
        parts (reverse (string/split orig-file #"/"))
        max-parts (count parts)
        unique-file (loop [i 1]
                      (let [cur-parts (reverse (take i parts))
                            ;; Compare using the real path separator; the "-" joined
                            ;; form is only used for the generated name.
                            cur-path (string/join "/" cur-parts)]
                        (if (or (>= i max-parts)
                                (not-any? #(.endsWith % cur-path) cmp-files))
                          (string/join "-" cur-parts)
                          (recur (inc i)))))]
    (fsp/add-file-part unique-file ext work-dir)))

(defn- tabix-index-vcf
  "Tabix index input VCF inside a transactional directory.

   The index path is `bgzip-file` with a `.tbi` suffix. It is (re)built when
   `itx/needs-run?` reports it as missing work or `itx/up-to-date?` reports it
   as stale relative to `bgzip-file`. Returns the index path in all cases."
  [bgzip-file]
  (let [tabix-file (str bgzip-file ".tbi")]
    (when (or (itx/needs-run? tabix-file) (not (itx/up-to-date? tabix-file bgzip-file)))
      (itx/with-tx-file [tx-tabix-file tabix-file]
        ;; Symlink the bgzipped input to the transactional index path minus its
        ;; final extension and index the symlink -- presumably so the .tbi is
        ;; written at tx-tabix-file (NOTE(review): confirm tabix output naming).
        (let [tx-bgzip-file (fsp/file-root tx-tabix-file)
              full-bgzip-file (str (fs/file bgzip-file))]
          (itx/check-run (<< "ln -s ~{full-bgzip-file} ~{tx-bgzip-file}"))
          (itx/check-run (<< "bcftools tabix -p vcf ~{tx-bgzip-file}")))))
    tabix-file))

(defn bgzip-index-vcf
  "Prepare a VCF file for positional query with bgzip and tabix indexing.

   Options (keyword arguments):
     :remove-nopass? - keep only records passing `bcftools view -f 'PASS,.'`
                       and add a -passonly suffix to the output name.
     :remove-orig?   - remove `vcf-file` after compression; ignored when
                       `vcf-file` already ends in .gz.
     :dir            - write the output into this directory instead of next
                       to the name derived from the input.
     :orig-files     - when supplied, the output name is derived through
                       `unique-work-file` against these files.

   Inputs ending in .gz are decompressed before being bgzipped so the result
   is a single bgzip stream rather than a compressed file compressed again.

   Returns the path to the bgzipped output, after tabix indexing it."
  [vcf-file & {:keys [remove-orig? remove-nopass? dir orig-files]}]
  (let [gzipped? (.endsWith vcf-file ".gz")
        out-orig (str (fsp/file-root
                       (if orig-files
                         (unique-work-file vcf-file "qprep" orig-files nil)
                         vcf-file))
                      (if remove-nopass? "-passonly" "")
                      ".vcf.gz")
        out-file (if dir (str (io/file dir (fs/base-name out-orig))) out-orig)]
    (cond
      remove-nopass?
      (itx/run-cmd out-file "bcftools view -f 'PASS,.' ~{vcf-file} | bgzip -c > ~{out-file}")
      ;; Already compressed input: running bgzip directly over it would
      ;; double-compress and leave a file tabix cannot index.
      gzipped?
      (itx/run-cmd out-file "gunzip -c ~{vcf-file} | bgzip -c > ~{out-file}")
      :else
      (itx/run-cmd out-file "bgzip -c ~{vcf-file} > ~{out-file}"))
    (when (and (not gzipped?) remove-orig?)
      (fsp/remove-path vcf-file))
    (tabix-index-vcf out-file)
    out-file))

(defn region->samstr
  "Render a region as `chrom:start-end`, with `:start` incremented by one
   (presumably converting a 0-based start to a 1-based coordinate)."
  [region]
  (let [chrom (:chrom region)
        start (inc (:start region))
        end (:end region)]
    (format "%s:%s-%s" chrom start end)))

(defn region->safestr
  "Render a region as `chrom_start_end` using the coordinates unchanged;
   used in this file to build output file names."
  [region]
  (apply format "%s_%s_%s" ((juxt :chrom :start :end) region)))

(defn region->freebayes
  "Render a region as `chrom:start..end` using the coordinates unchanged."
  [region]
  (let [fields (map #(get region %) [:chrom :start :end])]
    (apply format "%s:%s..%s" fields)))

(defn norm-bgzip
  "Normalize and bgzip/tabix index a VCF input file in a defined region.

   Bgzips and indexes `vcf-file`, extracts `region` with bcftools, pipes the
   records through vcfallelicprimitives and writes the bgzipped result into
   `out-dir` under the input's base name plus .gz. Returns that output path
   after tabix indexing it."
  [vcf-file region out-dir]
  (let [prep-vcf-file (bgzip-index-vcf vcf-file)
        gz-name (str (fs/base-name vcf-file) ".gz")
        out-file (-> (io/file out-dir gz-name) str)]
    (itx/run-cmd out-file
             "bcftools view -r ~{(region->samstr region)} ~{prep-vcf-file} | "
             "vcfallelicprimitives | "
             "bgzip -c /dev/stdin > ~{out-file}")
    ;; doto indexes the file and hands back the same path.
    (doto out-file tabix-index-vcf)))

(defn gatk-mem
  "Simple calculation of GATK memory requirements based on input samples.

   Returns a memory string chosen by the number of items in `xs`: the first
   tier whose exclusive upper bound exceeds the count, or \"16g\" at 3000
   items and above."
  [xs]
  (let [n (count xs)
        ;; [exclusive upper bound on item count, memory setting]
        tiers [[250 "2g"] [500 "4g"] [1000 "6g"] [1500 "8g"]
               [2000 "10g"] [2500 "12g"] [3000 "14g"]]]
    (or (some (fn [[limit mem]] (when (< n limit) mem)) tiers)
        "16g")))

(defmulti create-union
  "Create a minimal union file with inputs from multiple variant callers in the given region.
   Dispatches on the first argument converted to a keyword."
  (comp keyword first list))

(defmethod create-union :bcftools
  ^{:doc "Use bcftools isec and custom awk command to handle merge of multiple files"}
  ;; Builds union-<chrom>_<start>_<end>.vcf.gz in out-dir from a minimal VCF
  ;; header plus the `bcftools isec -n +1` records for the region, reshaped
  ;; into 8-column VCF lines by awk and piped through vcfcreatemulti and bgzip.
  ;; ref-file is accepted for the shared create-union signature but not used.
  ;; Returns the path to the bgzipped, tabix indexed union file.
  [_ vcf-files ref-file region out-dir]
  (let [out-file (str (io/file out-dir (str "union-" (region->safestr region) ".vcf.gz")))
        vcf-files-str (string/join " " vcf-files)
        vcf-header "echo -e '##fileformat=VCFv4.1\\n#CHROM\\tPOS\\tID\\tREF\\tALT\\tQUAL\\tFILTER\\tINFO'"
        isec-to-vcf "awk -F'\\t' '{ print $1 FS $2 FS \".\" FS $3 FS $4 FS \".\" FS \".\" FS \".\"}'"]
    (itx/run-cmd out-file
                 "cat <(~{vcf-header}) "
                 "<(bcftools isec -n +1 -r ~{(region->samstr region)} ~{vcf-files-str} | ~{isec-to-vcf}) | "
                 "vcfcreatemulti | bgzip -c > ~{out-file}")
    ;; The pipeline above already writes bgzipped output, so only the index is
    ;; needed; compressing out-file again would leave an unreadable file.
    (tabix-index-vcf out-file)
    out-file))
