(ns antistock.db.features
  (:refer-clojure :exclude [distinct group-by update])
  (:require [antistock.db.quotes :as quotes]
            [antistock.json :as json]
            [clj-time.core :as time]
            [clj-time.coerce :refer [to-date-time to-sql-time]]
            [clojure.data.csv :as csv]
            [clojure.java.io :as io]
            [clojure.string :as str]
            [datumbazo.core :refer :all :exclude [columns]]))

(def key-features
  "Key columns of the machine learning features: the quote id and the
  date."
  [:quote-id
   :date])

(def price-features
  "Price related machine learning features, in column order."
  [:open :close :high :low
   :volume :adj-close :daily-return])

(def twitter-features
  "Twitter derived machine learning features, in column order."
  [;; Tweet features; :tday is the divisor applied to the rest of this
   ;; group when the features are scaled.
   :tday
   :rtid :sentneg :sentneut :sentpos :sentvneg :sentvpos
   :tgeo :thtg :tid :turl :tusm :uflw :ufrn
   ;; Twitter user features; :tuday is the divisor applied to the rest
   ;; of this group when the features are scaled.
   :tuday
   :rtu :uid])

(def window-features
  "Features that are selected over a window of days: the price features
  followed by the twitter features."
  (concat price-features
          twitter-features))

(def all-features
  "All machine learning features: the key features followed by the
  window features."
  (concat key-features
          window-features))

(defn- constrain-subgraph
  "Return a WHERE clause, combined with :and, that restricts a query to
  the subgraph of `quotes` between `start` and `end`:

  - `start`  keeps rows with :twitter.tweets.created-at >= `start`
  - `end`    keeps rows with :twitter.tweets.created-at < `end`
  - `quotes` keeps rows whose :quote-id is one of the :id values

  Returns nil when `start` and `end` are not given and `quotes` is
  empty."
  [& {:keys [quotes start end]}]
  (let [since (when start
                `(>= :twitter.tweets.created-at ~(to-date-time start)))
        until (when end
                `(< :twitter.tweets.created-at ~(to-date-time end)))
        among (when (seq quotes)
                `(in :quote-id ~(map :id quotes)))
        conditions (remove nil? [since until among])]
    ;; At least one condition exists exactly when one of the options
    ;; was supplied.
    (when (seq conditions)
      (where `(and ~@conditions) :and))))

(defn- truncate-created-at
  "Return a `date_trunc` expression over :tweets.created-at that uses
  `precision`, or \"day\" when `precision` is nil or false."
  [precision]
  (let [unit (or precision "day")]
    `(date_trunc ~unit :tweets.created-at)))

(defn- group-and-order-by
  "Return a clause that groups by `columns` and then orders by the same
  `columns`."
  [& columns]
  (let [grouping (apply group-by columns)
        ordering (apply order-by columns)]
    (chain-state [grouping ordering])))

(defn number-of-retweets
  "Return the RTID feature (number of retweets in G), counted per quote
  and per `precision` interval.

  The optional map takes `:quotes`, `:start` and `:end` to restrict the
  subgraph and `:precision` for the date truncation.

  NOTE(review): rows are kept when :tweets.in-reply-to-status-id is not
  null; confirm that this is the intended definition of a retweet."
  [db & [{:keys [end precision quotes start]}]]
  (let [bucket (as (truncate-created-at precision) :date)
        total (as '(count :*) :rtid)]
    (select db [:quote-id bucket total]
      (from :twitter.tweets)
      (join :twitter.tweets-quotes.tweet-id :twitter.tweets.id)
      (where '(is-not-null :tweets.in-reply-to-status-id))
      (constrain-subgraph :quotes quotes :start start :end end)
      (group-and-order-by 1 2))))

(defn number-distinct-users-with-retweets
  "Return the RTU feature (number of different users that have
  re-tweeted in G), counted per quote and per `precision` interval.

  The optional map takes `:quotes`, `:start` and `:end` to restrict the
  subgraph and `:precision` for the date truncation.

  NOTE(review): rows are kept when :tweets.in-reply-to-status-id is not
  null; confirm that this is the intended definition of a retweet."
  [db & [{:keys [end precision quotes start]}]]
  (let [bucket (as (truncate-created-at precision) :date)
        users (as '(count distinct :tweets.user-id) :rtu)]
    (select db [:quote-id bucket users]
      (from :twitter.tweets)
      (join :twitter.tweets-quotes.tweet-id :twitter.tweets.id)
      (where '(is-not-null :tweets.in-reply-to-status-id))
      (constrain-subgraph :quotes quotes :start start :end end)
      (group-and-order-by 1 2))))

(defn number-of-tweets-with-geolocation
  "Return the TGEO feature (number of tweets with geo location in G),
  counted per quote and per `precision` interval.

  A row counts when :location is not null and does not match the
  whitespace-only pattern. The optional map takes `:quotes`, `:start`
  and `:end` to restrict the subgraph and `:precision` for the date
  truncation."
  [db & [{:keys [end precision quotes start]}]]
  (let [bucket (as (truncate-created-at precision) :date)
        total (as '(count *) :tgeo)
        ;; The operator is not a readable symbol, so it is built as a
        ;; keyword at runtime.
        not-matching (keyword "!~")]
    (select db [:quote-id bucket total]
      (from :twitter.tweets)
      (join :twitter.tweets-quotes.tweet-id :twitter.tweets.id)
      (join :twitter.users.id :twitter.tweets.user-id)
      (constrain-subgraph :quotes quotes :start start :end end)
      (where `(and (is-not-null :location)
                   (~not-matching :location "^[[:space:]]*$"))
             :and)
      (group-and-order-by 1 2))))

(defn number-of-tweets
  "Return the TID feature (number of tweets in G), counted per quote
  and per `precision` interval.

  The optional map takes `:quotes`, `:start` and `:end` to restrict the
  subgraph and `:precision` for the date truncation."
  [db & [{:keys [end precision quotes start]}]]
  (let [bucket (as (truncate-created-at precision) :date)
        total (as '(count *) :tid)]
    (select db [:quote-id bucket total]
      (from :twitter.tweets)
      (join :twitter.tweets-quotes.tweet-id :twitter.tweets.id)
      (constrain-subgraph :quotes quotes :start start :end end)
      (group-and-order-by 1 2))))

(defn number-of-tweets-mentioning-user
  "Return the TUSM feature (number of tweets that mention any user in
  G), counted per quote and per `precision` interval.

  The optional map takes `:quotes`, `:start` and `:end` to restrict the
  subgraph and `:precision` for the date truncation.

  NOTE(review): the query joins the tweet's author but applies no
  filter on mentions; confirm that it counts what TUSM is meant to be."
  [db & [{:keys [end precision quotes start]}]]
  (let [bucket (as (truncate-created-at precision) :date)
        total (as '(count *) :tusm)]
    (select db [:quote-id bucket total]
      (from :twitter.tweets)
      (join :twitter.tweets-quotes.tweet-id :twitter.tweets.id)
      (join :twitter.users.id :twitter.tweets.user-id)
      (constrain-subgraph :quotes quotes :start start :end end)
      (group-and-order-by 1 2))))

(defn avg-number-of-friends
  "Return the UFRN feature (average number of friends for user that
  posted in G), averaged per quote and per `precision` interval.

  The optional map takes `:quotes`, `:start` and `:end` to restrict the
  subgraph and `:precision` for the date truncation."
  [db & [{:keys [end precision quotes start]}]]
  (let [bucket (as (truncate-created-at precision) :date)
        friends (as '(avg :friends-count) :ufrn)]
    (select db [:quote-id bucket friends]
      (from :twitter.tweets)
      (join :twitter.tweets-quotes.tweet-id :twitter.tweets.id)
      (join :twitter.users.id :twitter.tweets.user-id)
      (constrain-subgraph :quotes quotes :start start :end end)
      (group-and-order-by 1 2))))

(defn number-of-hash-tags-in-tweets
  "Return the THTG feature (number of hash tags used in all the tweets
  in G), counted per quote and per `precision` interval.

  The count is over the rows produced by joining
  :twitter.hash-tags-tweets to the tweets of each quote. The optional
  map takes `:quotes`, `:start` and `:end` to restrict the subgraph and
  `:precision` for the date truncation."
  [db & [{:keys [end precision quotes start]}]]
  (let [bucket (as (truncate-created-at precision) :date)
        total (as '(count *) :thtg)]
    (select db [:quote-id bucket total]
      (from :twitter.tweets)
      (join :twitter.tweets-quotes.tweet-id :twitter.tweets.id)
      (join :twitter.hash-tags-tweets.tweet-id :twitter.tweets-quotes.tweet-id)
      (constrain-subgraph :quotes quotes :start start :end end)
      (group-and-order-by 1 2))))

(defn number-of-tweets-with-urls
  "Return the TURL feature (number of tweets with urls in G), counted
  per quote and per `precision` interval.

  The count is over the rows produced by joining :twitter.links-tweets
  to the tweets of each quote. The optional map takes `:quotes`,
  `:start` and `:end` to restrict the subgraph and `:precision` for the
  date truncation."
  [db & [{:keys [end precision quotes start]}]]
  (let [bucket (as (truncate-created-at precision) :date)
        total (as '(count *) :turl)]
    (select db [:quote-id bucket total]
      (from :twitter.tweets)
      (join :twitter.tweets-quotes.tweet-id :twitter.tweets.id)
      (join :twitter.links-tweets.tweet-id :twitter.tweets-quotes.tweet-id)
      (constrain-subgraph :quotes quotes :start start :end end)
      (group-and-order-by 1 2))))

(defn avg-number-of-followers-for-users
  "Return the UFLW feature (average number of followers for user that
  posted in G), averaged per quote and per `precision` interval.

  The optional map takes `:quotes`, `:start` and `:end` to restrict the
  subgraph and `:precision` for the date truncation."
  [db & [{:keys [end precision quotes start]}]]
  (let [bucket (as (truncate-created-at precision) :date)
        followers (as '(avg :followers-count) :uflw)]
    (select db [:quote-id bucket followers]
      (from :twitter.tweets)
      (join :twitter.tweets-quotes.tweet-id :twitter.tweets.id)
      (join :twitter.users.id :twitter.tweets.user-id)
      (constrain-subgraph :quotes quotes :start start :end end)
      (group-and-order-by 1 2))))

(defn number-of-distinct-users
  "Return the UID feature (number of different users that posted in G),
  counted per quote and per `precision` interval.

  The optional map takes `:quotes`, `:start` and `:end` to restrict the
  subgraph and `:precision` for the date truncation."
  [db & [{:keys [end precision quotes start]}]]
  (let [bucket (as (truncate-created-at precision) :date)
        users (as '(count distinct :twitter.tweets.user-id) :uid)]
    (select db [:quote-id bucket users]
      (from :twitter.tweets)
      (join :twitter.tweets-quotes.tweet-id :twitter.tweets.id)
      (join :twitter.users.id :twitter.tweets.user-id)
      (constrain-subgraph :quotes quotes :start start :end end)
      (group-and-order-by 1 2))))

(defn number-of-tweets-in-interval
  "Return the number of tweets in each `precision` interval.

  The optional map takes `:quotes`, `:start` and `:end` to restrict the
  tweets that are counted.

  NOTE(review): the `:quotes` restriction filters on :quote-id, but this
  query does not join :twitter.tweets-quotes; confirm that the column is
  available on :twitter.tweets."
  [db precision & [{:keys [quotes start end] :as opts}]]
  (let [bucket (as (truncate-created-at precision) :date)]
    (select db [bucket '(count :*)]
      (from :twitter.tweets)
      (constrain-subgraph :quotes quotes :start start :end end)
      (group-by 1))))

(defn prices
  "Return the prices for `quotes` between `start` (inclusive) and `end`
  (exclusive). Each restriction is only applied when the corresponding
  option is given; `quotes` are matched by their :id."
  [db & [{:keys [end precision quotes start]}]]
  (let [since (when start
                `(>= :date ~(to-date-time start)))
        until (when end
                `(< :date ~(to-date-time end)))
        among (when (seq quotes)
                `(in :quote-id ~(map :id quotes)))
        conditions (remove nil? [since until among])]
    (select db [:quote-id :date :open :close :high :low :volume :adj-close]
      (from :prices)
      ;; No WHERE clause at all when nothing restricts the result.
      (when (seq conditions)
        (where `(and ~@conditions))))))

(defn- join-features
  "Return a LEFT JOIN clause on `feature-2` that matches rows whose
  quote-id and date equal those of `feature-1`."
  [feature-1 feature-2]
  (let [qualified (fn [feature column]
                    (keyword (str (name feature) "." column)))]
    (join feature-2
          `(on (and (= ~(qualified feature-1 "quote-id")
                       ~(qualified feature-2 "quote-id"))
                    (= ~(qualified feature-1 "date")
                       ~(qualified feature-2 "date"))))
          :type :left)))

(defn feature-alias
  "Return the alias keyword for `feature` at `day`: the last
  dot-separated segment of the feature name, a dash and then `day`,
  e.g. :scaled-features.close and 3 give :close-3."
  [day feature]
  (let [segments (str/split (name feature) #"\.")
        column (last segments)]
    (keyword (str column "-" day))))

(defn- feature-window
  "Return one aliased expression per day in [0, `days`) and per feature
  in `features`: the feature lagged by that day over `window`, with 0
  substituted via coalesce. Expressions are ordered by day first and
  by feature second."
  [window days features]
  (mapcat (fn [day]
            (map (fn [feature]
                   (as `(coalesce (over (lag ~feature ~day) ~window) 0)
                       (feature-alias day feature)))
                 features))
          (range 0 days)))

(defn features
  "Return the machine learning features from :ml.features, ordered by
  quote id and then by date descending.

  The optional map restricts the rows: `:quotes` by the :id of each
  quote, `:start` as an inclusive lower date bound and `:end` as an
  exclusive upper date bound."
  [db & [{:keys [quotes start end] :as opts}]]
  (let [for-quotes (when (seq quotes)
                     (where `(in :features.quote-id ~(map :id quotes)) :and))
        since (when start
                (where `(>= :features.date ~(to-sql-time start)) :and))
        until (when end
                (where `(< :features.date ~(to-sql-time end)) :and))]
    (select db [:*]
      (from :ml.features)
      for-quotes
      since
      until
      (order-by :features.quote-id (desc :features.date)))))

(defn scaled-features
  "Return the scaled machine learning features, ordered by quote id and
  then by date descending.

  Key and price columns are selected as they are. Every tweet feature
  is divided by :tday and every Twitter user feature by :tuday, with
  the divisor cast to real; :tday and :tuday themselves are selected
  unscaled. `opts` is passed on to `features`.

  NOTE(review): nothing here guards against a divisor of 0; confirm
  that :tday and :tuday are never 0 in :ml.features."
  [db & [{:keys [quotes start end] :as opts}]]
  (let [scale-by (fn [divisor feature]
                   (as (list '/ feature (list 'cast divisor :real))
                       feature))
        ;; Tweet features
        tweet-features [:rtid :sentneg :sentneut :sentpos :sentvneg
                        :sentvpos :tgeo :thtg :tid :turl :tusm :uflw
                        :ufrn]
        ;; Twitter user features
        user-features [:rtu :uid]
        exprs (-> [:date :quote-id
                   :open :close :high :low :volume
                   :adj_close :daily_return]
                  (conj :tday)
                  (into (map #(scale-by :tday %) tweet-features))
                  (conj :tuday)
                  (into (map #(scale-by :tuday %) user-features)))]
    (select db exprs
      (from (as (features db opts) :features))
      (order-by :quote-id (desc :date)))))

(defn min-feature-date
  "Return the min date of the machine learning features as a date
  time, or nil when the query yields no date. `opts` is accepted but
  not used."
  [db & opts]
  (let [rows @(select db [(as '(min :date) :date)]
                (from :ml.features))]
    (some-> rows first :date to-date-time)))

(defn max-feature-date
  "Return the max date of the machine learning features as a date
  time, or nil when the query yields no date. `opts` is accepted but
  not used."
  [db & opts]
  (let [rows @(select db [(as '(max :date) :date)]
                (from :ml.features))]
    (some-> rows first :date to-date-time)))

(defn feature-start-time
  "Return the feature start time: the :start of `opts` converted to a
  date time when that is available, otherwise the min feature date
  found in `db`."
  [db & [opts]]
  (if-let [requested (to-date-time (:start opts))]
    requested
    (min-feature-date db opts)))

(defn feature-end-time
  "Return the feature end time: the :end of `opts` converted to a date
  time when that is available, otherwise one day after the max feature
  date found in `db`.

  Returns nil when neither is available, i.e. no :end was given and
  `max-feature-date` found no date."
  [db & [opts]]
  (or (to-date-time (:end opts))
      ;; max-feature-date returns nil when there is no feature date, so
      ;; only add the extra day when a date was found instead of handing
      ;; nil to time/plus.
      (some-> (max-feature-date db opts)
              (time/plus (time/days 1)))))

(defn scaled-features-in-window
  "Return the scaled machine learning features in a window of N days.

  The optional map takes:

  - `:days`   the window size, 7 when not given
  - `:quotes` the quotes to restrict the features to
  - `:start`  inclusive lower date bound, resolved via
              `feature-start-time`
  - `:end`    exclusive upper date bound, resolved via
              `feature-end-time`

  Each row has the key features plus, for every day of the window and
  every window feature, the value lagged by that day within the rows of
  the same quote ordered by date."
  [db & [{:keys [days quotes start end] :as opts}]]
  (let [days (or days 7)
        start (feature-start-time db opts)
        end (feature-end-time db opts)
        ;; The inner query starts `days` earlier than the requested
        ;; start, presumably so that the lagged values of the first
        ;; requested rows are available. `start` is nil when no :start
        ;; was given and there is no feature date, so only shift it when
        ;; it is present.
        lookback-start (some-> start (time/minus (time/days days)))]
    (select db [:*]
      (from (as (select db (concat key-features
                                   (feature-window :w days window-features))
                  (from (as (scaled-features
                             db
                             {:start lookback-start
                              :end end
                              :quotes quotes})
                            :scaled-features))
                  (window (as '(partition-by
                                :scaled-features.quote-id
                                (order-by :scaled-features.date)) :w))
                  (order-by :quote-id (desc :date)))
                :scaled-features-in-window))
      ;; The outer bounds cut the look-back rows off again.
      (when start
        (where `(>= :date ~(to-sql-time start)) :and))
      (when end
        (where `(< :date ~(to-sql-time end)) :and)))))
