(ns antistock.db.features
  (:refer-clojure :exclude [distinct group-by update])
  (:require [antistock.db.quotes :as quotes]
            [antistock.json :as json]
            [clj-time.coerce :refer [to-date-time to-sql-time]]
            [clojure.data.csv :as csv]
            [clojure.java.io :as io]
            [clojure.string :as str]
            [datumbazo.core :refer :all :exclude [columns]]))

(def window-features
  "Names of the features that `features` expands over the lag window.
  The order of this vector is the order of the generated columns."
  [;; columns selected from the prices table
   :adj-close :close :high :low :open
   ;; aliases produced by the tweet/user feature queries
   :rtid :rtu :tgeo :thtg :tid :turl :tusm :uflw :ufrn :uid
   ;; column selected from the prices table
   :volume])

(def all-features
  "Lazy sequence of `:quote-id`, `:date` and `:ttot` followed by every
  entry of `window-features`."
  (lazy-cat [:quote-id :date :ttot] window-features))

(defn- constrain-subgraph
  "Return a WHERE clause (combined with :and into any existing condition)
  that restricts a tweets query to a subgraph, or nil when no constraint
  is given.

  Keyword arguments:
    :quotes - collection of maps; rows are limited to :quote-id values
              found under the :id key of each map
    :start  - inclusive lower bound on :twitter.tweets.created-at
    :end    - exclusive upper bound on :twitter.tweets.created-at

  `start` and `end` are coerced with `to-date-time`."
  [& {:keys [quotes start end]}]
  (let [conditions
        (remove nil?
                [(when start
                   `(>= :twitter.tweets.created-at ~(to-date-time start)))
                 (when end
                   `(< :twitter.tweets.created-at ~(to-date-time end)))
                 ;; (seq coll) is the idiomatic non-empty check
                 (when (seq quotes)
                   `(in :quote-id ~(map :id quotes)))])]
    ;; The condition list is non-empty exactly when at least one of
    ;; quotes/start/end was supplied, so no separate guard is needed.
    (when (seq conditions)
      (where `(and ~@conditions) :and))))

(defn- truncate-created-at
  "Return a `date_trunc` expression over :tweets.created-at.
  `precision` is the truncation unit; \"day\" is used when it is nil or
  false."
  [precision]
  (let [unit (if precision precision "day")]
    `(date_trunc ~unit :tweets.created-at)))

(defn- group-and-order-by
  "Return a clause that applies GROUP BY and then ORDER BY, both over the
  same `columns`."
  [& columns]
  (let [grouping (apply group-by columns)
        ordering (apply order-by columns)]
    (chain-state [grouping ordering])))

(defn number-of-retweets
  "Select the RTID feature: the number of retweeted tweets per quote and
  truncated creation date.

  Options: `:quotes`, `:start` and `:end` restrict the subgraph,
  `:precision` is the date truncation unit (\"day\" when omitted)."
  [db & [{:keys [end precision quotes start]}]]
  (let [bucket (as (truncate-created-at precision) :date)
        retweets (as '(count :*) :rtid)]
    (select db [:quote-id bucket retweets]
      (from :twitter.tweets)
      (join :twitter.tweets-quotes.tweet-id :twitter.tweets.id)
      ;; only tweets flagged as retweeted are counted
      (where '(= :tweets.retweeted true))
      (constrain-subgraph :quotes quotes :start start :end end)
      (group-and-order-by 1 2))))

(defn number-distinct-users-with-retweets
  "Select the RTU feature: the number of distinct user ids among retweeted
  tweets, per quote and truncated creation date.

  Options: `:quotes`, `:start` and `:end` restrict the subgraph,
  `:precision` is the date truncation unit (\"day\" when omitted)."
  [db & [{:keys [end precision quotes start]}]]
  (let [bucket (as (truncate-created-at precision) :date)
        retweeters (as '(count distinct :tweets.user-id) :rtu)]
    (select db [:quote-id bucket retweeters]
      (from :twitter.tweets)
      (join :twitter.tweets-quotes.tweet-id :twitter.tweets.id)
      ;; only tweets flagged as retweeted are considered
      (where '(= :tweets.retweeted true))
      (constrain-subgraph :quotes quotes :start start :end end)
      (group-and-order-by 1 2))))

(defn number-of-tweets-with-geolocation
  "Select the TGEO feature: the number of tweets whose joined user row has
  a non-null, non-blank :location, per quote and truncated creation date.

  Options: `:quotes`, `:start` and `:end` restrict the subgraph,
  `:precision` is the date truncation unit (\"day\" when omitted)."
  [db & [{:keys [end precision quotes start]}]]
  (let [bucket (as (truncate-created-at precision) :date)
        located (as '(count *) :tgeo)
        ;; the `!~` operator cannot be written as a literal keyword
        not-matching (keyword "!~")]
    (select db [:quote-id bucket located]
      (from :twitter.tweets)
      (join :twitter.tweets-quotes.tweet-id :twitter.tweets.id)
      (join :twitter.users.id :twitter.tweets.user-id)
      (constrain-subgraph :quotes quotes :start start :end end)
      ;; location must be present and must not consist of whitespace only
      (where `(and (is-not-null :location)
                   (~not-matching :location "^[[:space:]]*$"))
             :and)
      (group-and-order-by 1 2))))

(defn number-of-tweets
  "Select the TID feature: the number of tweets per quote and truncated
  creation date.

  Options: `:quotes`, `:start` and `:end` restrict the subgraph,
  `:precision` is the date truncation unit (\"day\" when omitted)."
  [db & [{:keys [end precision quotes start]}]]
  (let [bucket (as (truncate-created-at precision) :date)
        tweets (as '(count *) :tid)]
    (select db [:quote-id bucket tweets]
      (from :twitter.tweets)
      (join :twitter.tweets-quotes.tweet-id :twitter.tweets.id)
      (constrain-subgraph :quotes quotes :start start :end end)
      (group-and-order-by 1 2))))

(defn number-of-tweets-mentioning-user
  "Select the TUSM feature (described as the number of tweets that mention
  any user), per quote and truncated creation date.

  Options: `:quotes`, `:start` and `:end` restrict the subgraph,
  `:precision` is the date truncation unit (\"day\" when omitted)."
  [db & [{:keys [end precision quotes start]}]]
  ;; NOTE(review): the query has no predicate on mentions; it counts every
  ;; tweet that joins to its author's user row. Confirm this is the
  ;; intended TUSM definition.
  (let [bucket (as (truncate-created-at precision) :date)
        mentions (as '(count *) :tusm)]
    (select db [:quote-id bucket mentions]
      (from :twitter.tweets)
      (join :twitter.tweets-quotes.tweet-id :twitter.tweets.id)
      (join :twitter.users.id :twitter.tweets.user-id)
      (constrain-subgraph :quotes quotes :start start :end end)
      (group-and-order-by 1 2))))

(defn avg-number-of-friends
  "Select the UFRN feature: the average :friends-count over the tweets
  joined to their user rows, per quote and truncated creation date.

  Options: `:quotes`, `:start` and `:end` restrict the subgraph,
  `:precision` is the date truncation unit (\"day\" when omitted)."
  [db & [{:keys [end precision quotes start]}]]
  (let [bucket (as (truncate-created-at precision) :date)
        mean-friends (as '(avg :friends-count) :ufrn)]
    (select db [:quote-id bucket mean-friends]
      (from :twitter.tweets)
      (join :twitter.tweets-quotes.tweet-id :twitter.tweets.id)
      (join :twitter.users.id :twitter.tweets.user-id)
      (constrain-subgraph :quotes quotes :start start :end end)
      (group-and-order-by 1 2))))

(defn number-of-hash-tags-in-tweets
  "Select the THTG feature: the number of rows joined from
  twitter.hash-tags-tweets, per quote and truncated creation date.

  Options: `:quotes`, `:start` and `:end` restrict the subgraph,
  `:precision` is the date truncation unit (\"day\" when omitted)."
  [db & [{:keys [end precision quotes start]}]]
  (let [bucket (as (truncate-created-at precision) :date)
        hash-tags (as '(count *) :thtg)]
    (select db [:quote-id bucket hash-tags]
      (from :twitter.tweets)
      (join :twitter.tweets-quotes.tweet-id :twitter.tweets.id)
      (join :twitter.hash-tags-tweets.tweet-id :twitter.tweets-quotes.tweet-id)
      (constrain-subgraph :quotes quotes :start start :end end)
      (group-and-order-by 1 2))))

(defn number-of-tweets-with-urls
  "Select the TURL feature (described as the number of tweets with urls),
  per quote and truncated creation date.

  Options: `:quotes`, `:start` and `:end` restrict the subgraph,
  `:precision` is the date truncation unit (\"day\" when omitted)."
  [db & [{:keys [end precision quotes start]}]]
  ;; NOTE(review): count(*) counts joined twitter.links-tweets rows, so a
  ;; tweet with several link rows would presumably be counted more than
  ;; once. Confirm against the schema and the intended TURL definition.
  (let [bucket (as (truncate-created-at precision) :date)
        links (as '(count *) :turl)]
    (select db [:quote-id bucket links]
      (from :twitter.tweets)
      (join :twitter.tweets-quotes.tweet-id :twitter.tweets.id)
      (join :twitter.links-tweets.tweet-id :twitter.tweets-quotes.tweet-id)
      (constrain-subgraph :quotes quotes :start start :end end)
      (group-and-order-by 1 2))))

(defn avg-number-of-followers-for-users
  "Select the UFLW feature: the average :followers-count over the tweets
  joined to their user rows, per quote and truncated creation date.

  Options: `:quotes`, `:start` and `:end` restrict the subgraph,
  `:precision` is the date truncation unit (\"day\" when omitted)."
  [db & [{:keys [end precision quotes start]}]]
  (let [bucket (as (truncate-created-at precision) :date)
        mean-followers (as '(avg :followers-count) :uflw)]
    (select db [:quote-id bucket mean-followers]
      (from :twitter.tweets)
      (join :twitter.tweets-quotes.tweet-id :twitter.tweets.id)
      (join :twitter.users.id :twitter.tweets.user-id)
      (constrain-subgraph :quotes quotes :start start :end end)
      (group-and-order-by 1 2))))

(defn number-of-distinct-users
  "Select the UID feature: the number of distinct tweet user ids, per
  quote and truncated creation date.

  Options: `:quotes`, `:start` and `:end` restrict the subgraph,
  `:precision` is the date truncation unit (\"day\" when omitted)."
  [db & [{:keys [end precision quotes start]}]]
  (let [bucket (as (truncate-created-at precision) :date)
        posters (as '(count distinct :twitter.tweets.user-id) :uid)]
    (select db [:quote-id bucket posters]
      (from :twitter.tweets)
      (join :twitter.tweets-quotes.tweet-id :twitter.tweets.id)
      (join :twitter.users.id :twitter.tweets.user-id)
      (constrain-subgraph :quotes quotes :start start :end end)
      (group-and-order-by 1 2))))

(defn number-of-tweets-in-interval
  "Select the number of tweets per `precision` interval of the creation
  date. The count column is left unaliased.

  Options: `:quotes`, `:start` and `:end` are forwarded to the subgraph
  restriction."
  [db precision & [{:keys [quotes start end] :as opts}]]
  ;; NOTE(review): unlike the per-quote feature queries, this one does not
  ;; join twitter.tweets-quotes, so a `:quotes` restriction on :quote-id
  ;; presumably only works if twitter.tweets exposes that column. Confirm.
  (let [bucket (as (truncate-created-at precision) :date)
        restriction (constrain-subgraph :quotes quotes :start start :end end)]
    (select db [bucket '(count :*)]
      (from :twitter.tweets)
      restriction
      (group-by 1))))

(defn prices
  "Select the price rows (:quote-id, :date, :open, :close, :high, :low,
  :volume, :adj-close) from the prices table.

  Options:
    :quotes - collection of maps; rows are limited to :quote-id values
              found under the :id key of each map
    :start  - inclusive lower bound on :date
    :end    - exclusive upper bound on :date

  `start` and `end` are coerced with `to-date-time`. `:precision` is
  accepted so the same options map can be shared with the other feature
  queries, but it is not used here. Without any option no WHERE clause is
  added."
  [db & [{:keys [end precision quotes start]}]]
  (let [conditions
        (remove nil?
                [(when start `(>= :date ~(to-date-time start)))
                 (when end `(< :date ~(to-date-time end)))
                 ;; (seq coll) is the idiomatic non-empty check
                 (when (seq quotes)
                   `(in :quote-id ~(map :id quotes)))])]
    (select db [:quote-id :date :open :close :high :low :volume :adj-close]
      (from :prices)
      ;; The condition list is non-empty exactly when at least one of
      ;; quotes/start/end was supplied.
      (when (seq conditions)
        (where `(and ~@conditions))))))

(defn- join-features
  "LEFT JOIN `feature-2` to `feature-1` on rows where both the `quote-id`
  and the `date` columns of the two relations are equal."
  [feature-1 feature-2]
  (let [qualified (fn [feature field]
                    (keyword (str (name feature) "." field)))
        left-quote (qualified feature-1 "quote-id")
        right-quote (qualified feature-2 "quote-id")
        left-date (qualified feature-1 "date")
        right-date (qualified feature-2 "date")]
    (join feature-2
          `(on (and (= ~left-quote ~right-quote)
                    (= ~left-date ~right-date)))
          :type :left)))

(defn feature-alias
  "Return the keyword `<feature>-<day>`, where `<feature>` is the last
  dot-separated segment of the name of `feature`."
  [day feature]
  (let [segments (str/split (name feature) #"\.")
        ;; str/split returns a vector, so peek yields its last segment
        base (peek segments)]
    (keyword (str base "-" day))))

(defn- feature-window
  "Return a lazy sequence of aliased select expressions, one per day in
  [0, `days`) and per entry of `features`, day-major. Each expression is
  the feature lagged by that many rows over `window`, coalesced to 0, and
  aliased via `feature-alias`."
  [window days features]
  (mapcat (fn [day]
            (map (fn [feature]
                   (as `(coalesce (over (lag ~feature ~day) ~window) 0)
                       (feature-alias day feature)))
                 features))
          (range days)))

(defn features
  "Select the machine learning feature rows.

  Every feature query is bound as a common table expression and LEFT
  JOINed to the prices on quote id and date; the total tweet count
  (:ttot) is joined on date only. Each entry of `window-features` is
  emitted once per lag day over a window partitioned by quote id and
  ordered by price date. Rows are ordered by quote id and descending
  price date.

  Options:
    :days   - number of lag days per feature (1 when omitted)
    :quotes, :start, :end - forwarded to the individual feature queries
              through `opts`; the total tweet count only receives
              :start and :end"
  [db & [{:keys [days quotes start end] :as opts}]]
  (let [lag-days (or days 1)
        interval {:start start :end end}
        projection (concat
                    [:prices.quote-id
                     (as '(coalesce :ttot.count 0) :ttot)
                     (as '(to-char :prices.date "YYYY-MM-DD") :date)]
                    (feature-window :w lag-days window-features))
        ;; named window :w used by every lagged feature column
        per-quote-by-date (as '(partition-by
                                :prices.quote-id
                                (order-by :prices.date)) :w)]
    (with db [:prices (prices db opts)
              :rtid (number-of-retweets db opts)
              :rtu (number-distinct-users-with-retweets db opts)
              :tgeo (number-of-tweets-with-geolocation db opts)
              :thtg (number-of-hash-tags-in-tweets db opts)
              :tid (number-of-tweets db opts)
              :ttot (number-of-tweets-in-interval db "day" interval)
              :turl (number-of-tweets-with-urls db opts)
              :tusm (number-of-tweets-mentioning-user db opts)
              :uflw (avg-number-of-followers-for-users db opts)
              :ufrn (avg-number-of-friends db opts)
              :uid (number-of-distinct-users db opts)]
      (select db projection
        (from :prices)
        (join-features :prices :tgeo)
        (join-features :prices :thtg)
        (join-features :prices :tid)
        (join-features :prices :rtid)
        (join-features :prices :rtu)
        (join-features :prices :turl)
        (join-features :prices :tusm)
        (join-features :prices :uflw)
        (join-features :prices :ufrn)
        (join-features :prices :uid)
        ;; the total tweet count has no quote id, so it joins on date only
        (join :ttot.date :prices.date :type :left)
        (window per-quote-by-date)
        (order-by :prices.quote-id (desc :prices.date))))))
