(ns leafgrabber.free-text.utils
  (:use [clojure.contrib.duck-streams :only (read-lines)]
        [clojure.contrib.str-utils :only (re-split)]
        [clojure.data.json :only (read-json json-str)]
        [cascalog.api :only (hfs-textline <- defmapcatop)]
        [clj-time.core :only (now)]
        [clj-time.format :only (unparse formatter)])
  (:require [clojure.string :as str]
            [opennlp.nlp :as nlp]
            [opennlp.treebank :as tbk])
  )

;; Builds a nested map from the treebank parser's first result for `str`.
;; Each node carries :cat (copied from the result's :tag) plus one of:
;;   :dtrs - child nodes, when :chunk is a seq or a single map
;;   :lex  - the chunk itself, when :chunk is a string
;; If :chunk is none of those (e.g. nil or a vector, for which `seq?` is
;; false), no `cond` branch matches and the function returns nil.
;;
;; The parameter is named `str`, which shadows clojure.core/str inside
;; this function body.
;;
;; NOTE(review): a new treebank parser is created from the model file on
;; every call, including every recursive call - presumably expensive; confirm
;; whether it should be created once and reused.
;; NOTE(review): the recursive calls pass chunk values (maps) back in as
;; `str`, so they are handed to the treebank parser again. It is not clear
;; from this file that the parser accepts such input or that its results
;; expose :tag/:chunk directly - verify against the opennlp.treebank API.
(defn parse-tree
  [str]
  (let [treebank-parser (tbk/make-treebank-parser "models/en-parser-chunking.bin")
        ;; The parser is given a one-element vector; only its first result is used.
        chunk-tree (first (treebank-parser (vector str)))
        tag (:tag chunk-tree)
        chunk (:chunk chunk-tree)
        tree1 {:cat tag}]
    (cond (seq? chunk) (assoc tree1 :dtrs (map parse-tree chunk))
          (map? chunk) (assoc tree1 :dtrs (list (parse-tree chunk)))
          (string? chunk) (assoc tree1 :lex chunk))
  ))

(defn hadoop-delete
  "Remove `dir` from Hadoop by running `hadoop fs -rmr <dir>`.
  Prints a \"deleting ...\" line, blocks until the command exits, then
  pauses for a further two seconds. Returns nil."
  [dir]
  (let [rmr-process (.exec (Runtime/getRuntime) (str "hadoop fs -rmr " dir))]
    ;; The progress line is printed once the command has been launched.
    (println (str "deleting " dir))
    (.waitFor rmr-process)
    (Thread/sleep 2000)))

(defn local-delete
  "Remove `dir` from the local filesystem by running `rm -r <dir>`.
  Blocks until `rm` has exited, then pauses for a further two seconds
  (kept from the earlier behavior so callers see the same timing floor).
  Returns nil.

  The command is passed to Runtime.exec as a single string, which splits
  it on whitespace, so paths containing spaces are not supported."
  [dir]
  (let [rm-process (.exec (Runtime/getRuntime) (str "rm -r " dir))]
    ;; Wait for rm to finish rather than assuming the fixed sleep is long
    ;; enough; a large directory can take longer than two seconds.
    (.waitFor rm-process)
    (Thread/sleep 2000)))

(defn file-delete
  "Delete `dir` using `hadoop-delete` when `filesystem` equals \"hadoop\";
  any other value falls through to `local-delete`."
  [dir filesystem]
  (let [delete-fn (if (= filesystem "hadoop")
                    hadoop-delete
                    local-delete)]
    (delete-fn dir)))

(defn hadoop-copy-results
  "Copy a Hadoop directory to a local file by running
  `hadoop fs -getmerge <hadoop-dir> <local-file>`.
  Blocks until the command exits and returns its exit status."
  [hadoop-dir local-file]
  (-> (Runtime/getRuntime)
      (.exec (str "hadoop fs -getmerge " hadoop-dir " " local-file))
      (.waitFor)))

(defn get-from-hadoop
  "Get the contents of a Hadoop output directory.

  Runs `hadoop fs -cat <hadoop-dir>/p*` (no shell is involved, so the `p*`
  pattern reaches hadoop unexpanded) and returns a fully realized seq of
  lines: the command's stdout when it exits with status 0, otherwise its
  stderr. An empty stream yields an empty seq.

  stdout is read to the end before waiting for the process. Waiting first
  can deadlock once the output exceeds the OS pipe buffer, because the
  child blocks on write while the parent blocks in waitFor."
  [hadoop-dir]
  (let [command (str "hadoop fs -cat " hadoop-dir "/p*")
        process (.exec (Runtime/getRuntime) command)
        ;; Read every line of a stream as UTF-8 and close it afterwards.
        ;; line-seq yields nil for an empty stream, so normalize to ().
        drain (fn [^java.io.InputStream stream]
                (with-open [rdr (java.io.BufferedReader.
                                 (java.io.InputStreamReader. stream "UTF-8"))]
                  (or (doall (line-seq rdr)) ())))
        out-lines (drain (.getInputStream process))]
    (if (= 0 (.waitFor process))
      out-lines
      ;; NOTE: stderr is only read after exit; a command that writes more
      ;; than a pipe buffer's worth to stderr could still stall.
      (drain (.getErrorStream process)))))

(defn hadoop-ls
  "Run `hadoop fs -ls <dir>`, block until it exits, and return the exit
  status. The listing output itself is neither read nor returned."
  [dir]
  (-> (Runtime/getRuntime)
      (.exec (str "hadoop fs -ls " dir))
      (.waitFor)))

(defn not-empty?
  "True when `z` contains at least one element; false for nil or an
  empty collection/string."
  [z]
  (boolean (seq z)))

(defn remove-breaks
  "Return `text` with every tab, newline and carriage-return character
  removed (nothing is substituted in their place).

  Uses clojure.string/replace directly instead of splitting on the
  separators and re-joining the pieces."
  [text]
  (str/replace text #"\t|\n|\r" ""))

(defn get-field
  "Return the tab-separated field of `line` at zero-based position `idx`,
  or the string \"NONE\" when the line has no field at that position.

  Looking the field up with `get` and a default also covers a negative
  `idx`, which previously passed the size check and then threw
  IndexOutOfBoundsException."
  [line idx]
  (get (str/split line #"\t") idx "NONE"))

(defn get-trimmed-field
  "Return the tab-separated field of `line` at zero-based position `idx`
  with leading and trailing whitespace trimmed. Throws if the line has no
  field at that position."
  [line idx]
  (let [columns (str/split line #"\t")
        raw-field (columns idx)]
    (.trim raw-field)))

(defn url-from-tabbed
  "Return the third tab-separated field (index 2) of `line` via
  `get-field`. Meant for UUID/URL files whose lines may carry an md5
  between the UUID and the URL."
  [line]
  (let [url-index 2]
    (get-field line url-index)))

(defn uuid-from-tabbed
  "Return the first tab-separated field of `line`, or nil when splitting
  the line yields no fields."
  [line]
  (let [[uuid] (str/split line #"\t")]
    uuid))

;; Returns the Cascalog query built by `<-`; nothing in this function
;; executes it. The query emits one ?url per text line under `input-dir`
;; whose first tab-separated field equals `uuid`.
(defn urls-from-uuid
  "get all the urls for a uuid"
  [input-dir uuid]
  (<- [?url] 
      ;; Source: each text line read from input-dir is bound to ?line.
      ((hfs-textline input-dir) ?line)
      ;; ?url is field index 2 of the line ("NONE" when absent, per get-field).
      (url-from-tabbed ?line :> ?url)
      ;; ?uuid is the first field of the line.
      (uuid-from-tabbed ?line :> ?uuid)
      ;; Keep only the lines whose uuid matches the requested one.
      (= ?uuid uuid)
  ))

(defn get-json-item
  "Parse the JSON text `jstr`, look up `idx` in the parsed value, and
  return the item found there. Strings, nil, true and false are returned
  unchanged; any other value is re-serialized to a JSON string."
  [jstr idx]
  (let [parsed (read-json jstr)
        item (parsed idx)]
    (cond
      (string? item) item
      (nil? item)    item
      (true? item)   item
      (false? item)  item
      :else          (json-str item))))

;; Date stamp formatted as yyyy_MM_dd. Computed once, when this
;; namespace is loaded, and not refreshed afterwards.
(def date-str
  (-> (formatter "yyyy_MM_dd")
      (unparse (now))))

;; Millisecond value of `(now)`, captured once when this namespace is
;; loaded (from its own call to `now`).
(def date-millis
  (let [load-time (now)]
    (.getMillis load-time)))

(defn exec-command
  "Run `command` through `bash -c`, echo the command to *out*, and return
  a fully realized seq of output lines: the command's stdout when it exits
  with status 0, otherwise its stderr. An empty stream yields an empty seq.

  stdout is read to the end before waiting for the process. Waiting first
  can deadlock once the output exceeds the OS pipe buffer, because the
  child blocks on write while the parent blocks in waitFor."
  [command]
  (let [^"[Ljava.lang.String;" argv (into-array String ["bash" "-c" command])
        process (.exec (Runtime/getRuntime) argv)
        ;; Read every line of a stream as UTF-8 and close it afterwards.
        ;; line-seq yields nil for an empty stream, so normalize to ().
        drain (fn [^java.io.InputStream stream]
                (with-open [rdr (java.io.BufferedReader.
                                 (java.io.InputStreamReader. stream "UTF-8"))]
                  (or (doall (line-seq rdr)) ())))]
    (println command)
    (let [out-lines (drain (.getInputStream process))]
      (if (= 0 (.waitFor process))
        out-lines
        ;; NOTE: stderr is only read after exit; a command that writes more
        ;; than a pipe buffer's worth to stderr could still stall.
        (drain (.getErrorStream process))))))