/*
 * Decompiled with CFR 0.152.
 */
package de.jetwick.snacktory;

import de.jetwick.snacktory.JResult;
import de.jetwick.snacktory.OutputFormatter;
import de.jetwick.snacktory.SHelper;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.Set;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Heuristic extractor that picks the most "article-like" element of an HTML
 * document and fills a {@link JResult} with its text plus page metadata
 * (title, description, canonical URL, image, RSS, video, favicon, keywords).
 *
 * <p>Candidate elements are scored by {@link #getWeight(Element)} using the
 * class/id patterns below, the amount of own text, and the shape of the
 * direct children. While scoring, a {@code gravityScore} attribute is written
 * onto elements of the parsed document, i.e. the document passed in is
 * mutated.
 */
public class ArticleTextExtractor {
    private static final Logger logger = LoggerFactory.getLogger(ArticleTextExtractor.class);
    // Class/id prefixes that make an element an unlikely article container (small penalty).
    private static final Pattern UNLIKELY = Pattern.compile("^(com(bx|ment|munity)|dis(qus|cuss)|e(xtra|[-]?mail)|foot|header|menu|re(mark|ply)|rss|sh(are|outbox)|sponsora(d|ll|gegate|rchive|ttachment)|(pag(er|ination))|popup|print|login|si(debar|gn|ngle))");
    // Class/id fragments that hint at real content (bonus).
    private static final Pattern POSITIVE = Pattern.compile("(^(body|content|h?entry|main|page|post|text|blog|story|haupt))|arti(cle|kel)|instapaper_body");
    // Class/id fragments that hint at boilerplate (large penalty).
    private static final Pattern NEGATIVE = Pattern.compile("nav($|igation)|user|com(ment|bx)|(^com-)|contact|foot|masthead|(me(dia|ta))|outbrain|promo|related|scroll|(sho(utbox|pping))|sidebar|sponsor|tags|tool|widget|player");
    // Inline style fragments of elements that are not displayed.
    private static final Pattern NEGATIVE_STYLE = Pattern.compile("hidden|display: ?none");
    private static final String IMAGE_CAPTION = "caption";
    // Name of the attribute used to store the per-element score in the DOM.
    private static final String SCORE_ATTRIBUTE = "gravityScore";
    // Title segments (lower-cased, trimmed) that are dropped by cleanTitle.
    private static final Set<String> IGNORED_TITLE_PARTS = ArticleTextExtractor.immutableSet("hacker news", "facebook");
    // Tags that may become the "best match" element. Exact names only.
    private static final Set<String> CANDIDATE_TAGS = ArticleTextExtractor.immutableSet("p", "div", "td", "h1", "h2");
    // Heading tags that earn a bonus when a container also holds several paragraphs.
    private static final Set<String> HEADING_TAGS = ArticleTextExtractor.immutableSet("h1", "h2", "h3", "h4", "h5", "h6");
    private static final OutputFormatter DEFAULT_FORMATTER = new OutputFormatter();

    /** Builds an unmodifiable, insertion-ordered set from the given values. */
    private static Set<String> immutableSet(String ... values) {
        return Collections.unmodifiableSet(new LinkedHashSet<String>(Arrays.asList(values)));
    }

    /**
     * Extracts content from raw HTML into a fresh {@link JResult} using the
     * default formatter.
     *
     * @param html the HTML source; must not be empty
     * @return the populated result
     * @throws IllegalArgumentException if {@code html} is empty
     */
    public JResult extractContent(String html) throws Exception {
        return this.extractContent(new JResult(), html);
    }

    /**
     * Extracts content from raw HTML into the given result using the default
     * formatter.
     *
     * @param res  the result object to populate
     * @param html the HTML source; must not be empty
     * @return {@code res}
     * @throws IllegalArgumentException if {@code html} is empty
     */
    public JResult extractContent(JResult res, String html) throws Exception {
        return this.extractContent(res, html, DEFAULT_FORMATTER);
    }

    /**
     * Parses the HTML with Jsoup and extracts content into the given result.
     *
     * @param res       the result object to populate
     * @param html      the HTML source; must not be empty
     * @param formatter formatter used to turn the best element into text
     * @return {@code res}
     * @throws IllegalArgumentException if {@code html} is empty
     */
    public JResult extractContent(JResult res, String html, OutputFormatter formatter) throws Exception {
        if (html.isEmpty()) {
            throw new IllegalArgumentException("html string is empty!?");
        }
        return this.extractContent(res, Jsoup.parse(html), formatter);
    }

    /**
     * Extracts metadata and the main text from an already parsed document.
     *
     * <p>The document is modified: script/style elements are removed by
     * {@link #prepareDocument(Document)} and score attributes are written to
     * elements.
     *
     * @param res       the result object to populate
     * @param doc       the parsed document; must not be null
     * @param formatter formatter used to turn the best element into text
     * @return {@code res}
     * @throws NullPointerException if {@code doc} is null
     */
    public JResult extractContent(JResult res, Document doc, OutputFormatter formatter) throws Exception {
        if (doc == null) {
            throw new NullPointerException("missing document");
        }
        res.setTitle(this.extractTitle(doc));
        res.setDescription(this.extractDescription(doc));
        res.setCanonicalUrl(this.extractCanonicalUrl(doc));
        this.prepareDocument(doc);

        // Pick the candidate with the highest weight; stop early once a
        // candidate is clearly good enough (weight above 200).
        Element bestMatchElement = null;
        int maxWeight = 0;
        for (Element candidate : this.getNodes(doc)) {
            int currentWeight = this.getWeight(candidate);
            if (currentWeight > maxWeight) {
                maxWeight = currentWeight;
                bestMatchElement = candidate;
                if (maxWeight > 200) {
                    break;
                }
            }
        }

        if (bestMatchElement != null) {
            Element imgEl = this.determineImageSource(bestMatchElement);
            if (imgEl != null) {
                res.setImageUrl(SHelper.replaceSpaces(imgEl.attr("src")));
            }
            String text = this.removeTitleFromText(formatter.getFormattedText(bestMatchElement), res.getTitle());
            // Only accept text that is longer than the title itself.
            if (text.length() > res.getTitle().length()) {
                res.setText(text);
            }
        }

        // An image declared in the page metadata overrides the one found in the article body.
        String imageUrl = this.extractImageUrl(doc);
        if (!imageUrl.isEmpty()) {
            res.setImageUrl(imageUrl);
        }
        res.setRssUrl(this.extractRssUrl(doc));
        res.setVideoUrl(this.extractVideoUrl(doc));
        res.setFaviconUrl(this.extractFaviconUrl(doc));
        res.setKeywords(this.extractKeywords(doc));
        return res;
    }

    /**
     * Determines the page title. Tries, in order: the cleaned document title,
     * {@code head title}, {@code meta[name=title]} and
     * {@code meta[property=og:title]}; the first non-empty value wins.
     *
     * @param doc the parsed document
     * @return the title, possibly empty
     */
    protected String extractTitle(Document doc) {
        String title = this.cleanTitle(doc.title());
        if (title.isEmpty()) {
            title = SHelper.innerTrim(doc.select("head title").text());
        }
        if (title.isEmpty()) {
            title = SHelper.innerTrim(doc.select("head meta[name=title]").attr("content"));
        }
        if (title.isEmpty()) {
            title = SHelper.innerTrim(doc.select("head meta[property=og:title]").attr("content"));
        }
        return title;
    }

    /**
     * Reads the canonical URL from {@code link[rel=canonical]}, falling back
     * to {@code meta[property=og:url]}.
     *
     * @param doc the parsed document
     * @return the URL, possibly empty
     */
    protected String extractCanonicalUrl(Document doc) {
        String url = SHelper.replaceSpaces(doc.select("head link[rel=canonical]").attr("href"));
        if (url.isEmpty()) {
            url = SHelper.replaceSpaces(doc.select("head meta[property=og:url]").attr("content"));
        }
        return url;
    }

    /**
     * Reads the page description from {@code meta[name=description]}, falling
     * back to {@code meta[property=og:description]}.
     *
     * @param doc the parsed document
     * @return the description, possibly empty
     */
    protected String extractDescription(Document doc) {
        String description = SHelper.innerTrim(doc.select("head meta[name=description]").attr("content"));
        if (description.isEmpty()) {
            description = SHelper.innerTrim(doc.select("head meta[property=og:description]").attr("content"));
        }
        return description;
    }

    /**
     * Reads the comma-separated {@code meta[name=keywords]} content. A single
     * pair of surrounding square brackets is stripped before splitting.
     *
     * @param doc the parsed document
     * @return the keywords in document order, or an empty collection if none
     */
    protected Collection<String> extractKeywords(Document doc) {
        String content = SHelper.innerTrim(doc.select("head meta[name=keywords]").attr("content"));
        if (content == null) {
            return Collections.emptyList();
        }
        if (content.startsWith("[") && content.endsWith("]")) {
            content = content.substring(1, content.length() - 1);
        }
        String[] split = content.split("\\s*,\\s*");
        // Splitting an empty string yields a single empty element: treat as "no keywords".
        if (split.length > 1 || !split[0].equals("")) {
            return Arrays.asList(split);
        }
        return Collections.emptyList();
    }

    /**
     * Reads an image URL from the page metadata. Tries
     * {@code meta[property=og:image]}, then {@code link[rel=image_src]}, then
     * {@code meta[name=thumbnail]}.
     *
     * @param doc the parsed document
     * @return the URL, possibly empty
     */
    protected String extractImageUrl(Document doc) {
        String imageUrl = SHelper.replaceSpaces(doc.select("head meta[property=og:image]").attr("content"));
        if (imageUrl.isEmpty()) {
            imageUrl = SHelper.replaceSpaces(doc.select("link[rel=image_src]").attr("href"));
        }
        if (imageUrl.isEmpty()) {
            imageUrl = SHelper.replaceSpaces(doc.select("head meta[name=thumbnail]").attr("content"));
        }
        return imageUrl;
    }

    /**
     * Reads the RSS feed URL from a {@code link[rel=alternate]} of type
     * {@code application/rss+xml}.
     *
     * @param doc the parsed document
     * @return the URL, possibly empty
     */
    protected String extractRssUrl(Document doc) {
        return SHelper.replaceSpaces(doc.select("link[rel=alternate]").select("link[type=application/rss+xml]").attr("href"));
    }

    /**
     * Reads the video URL from {@code meta[property=og:video]}.
     *
     * @param doc the parsed document
     * @return the URL, possibly empty
     */
    protected String extractVideoUrl(Document doc) {
        return SHelper.replaceSpaces(doc.select("head meta[property=og:video]").attr("content"));
    }

    /**
     * Reads the favicon URL from {@code link[rel=icon]}, falling back to links
     * whose {@code rel} starts with "shortcut" or ends with "icon".
     *
     * @param doc the parsed document
     * @return the URL, possibly empty
     */
    protected String extractFaviconUrl(Document doc) {
        String faviconUrl = SHelper.replaceSpaces(doc.select("head link[rel=icon]").attr("href"));
        if (faviconUrl.isEmpty()) {
            faviconUrl = SHelper.replaceSpaces(doc.select("head link[rel^=shortcut],link[rel$=icon]").attr("href"));
        }
        return faviconUrl;
    }

    /**
     * Computes the weight of a candidate element from its class name, id,
     * inline style, own text length and direct children.
     *
     * @param e the candidate element
     * @return the weight; higher means more likely to be the article body
     */
    protected int getWeight(Element e) {
        String className = e.className();
        String id = e.id();
        int weight = 0;
        if (POSITIVE.matcher(className).find()) {
            weight += 35;
        }
        if (POSITIVE.matcher(id).find()) {
            weight += 40;
        }
        if (UNLIKELY.matcher(className).find()) {
            weight -= 20;
        }
        if (UNLIKELY.matcher(id).find()) {
            weight -= 20;
        }
        if (NEGATIVE.matcher(className).find()) {
            weight -= 50;
        }
        if (NEGATIVE.matcher(id).find()) {
            weight -= 50;
        }
        String style = e.attr("style");
        if (style != null && !style.isEmpty() && NEGATIVE_STYLE.matcher(style).find()) {
            weight -= 50;
        }
        // Ten points per 100 characters of the element's own text.
        weight += (int)Math.round((double)e.ownText().length() / 100.0 * 10.0);
        weight += this.weightChildNodes(e);
        return weight;
    }

    /**
     * Computes the weight contributed by the direct children of {@code e}.
     * Children with fewer than 20 characters of own text are ignored. As a
     * side effect, score attributes of {@code div}/{@code p} children are
     * updated.
     *
     * @param e the candidate element
     * @return the summed child weight
     */
    protected int weightChildNodes(Element e) {
        int weight = 0;
        boolean captionChildSeen = false;
        int longParagraphs = 0;
        // NOTE(review): this tests the parent (e), not the child, so the bonus is
        // granted once per qualifying child of a caption-like parent. Kept as-is
        // to preserve scoring; confirm whether "child" was intended.
        boolean parentIsCaption = e.id().contains(IMAGE_CAPTION) || e.className().contains(IMAGE_CAPTION);

        for (Element child : e.children()) {
            String ownText = child.ownText();
            int ownTextLength = ownText.length();
            if (ownTextLength < 20) {
                continue;
            }
            if (ownTextLength > 200) {
                weight += Math.max(50, ownTextLength / 10);
            }
            if (parentIsCaption) {
                weight += 30;
            }
            String tag = child.tagName();
            if (tag.equals("h1") || tag.equals("h2")) {
                weight += 30;
                continue;
            }
            if (!tag.equals("div") && !tag.equals("p")) {
                continue;
            }
            weight += this.calcWeightForChild(child, e, ownText);
            if (tag.equals("p") && ownTextLength > 50) {
                ++longParagraphs;
            }
            if (child.className().equalsIgnoreCase(IMAGE_CAPTION)) {
                captionChildSeen = true;
            }
        }

        if (captionChildSeen) {
            weight += 30;
        }
        // Several substantial paragraphs: reward the container, its headings
        // and (via the score attribute) each paragraph child.
        if (longParagraphs >= 2) {
            for (Element subEl : e.children()) {
                String tag = subEl.tagName();
                if (HEADING_TAGS.contains(tag)) {
                    weight += 20;
                }
                if ("p".equals(tag)) {
                    this.addScore(subEl, 30);
                }
            }
            weight += 60;
        }
        return weight;
    }

    /**
     * Reads the score stored in the element's {@code gravityScore} attribute.
     *
     * @param el the element; may be null
     * @return the stored score, or 0 if the element is null or the attribute
     *         is missing or not an integer
     */
    public int getScore(Element el) {
        if (el == null) {
            return 0;
        }
        try {
            return Integer.parseInt(el.attr(SCORE_ATTRIBUTE));
        }
        catch (NumberFormatException ignored) {
            // Attribute absent or not numeric: the element has no score yet.
            return 0;
        }
    }

    /**
     * Adds {@code score} to the score currently stored on the element.
     *
     * @param el    the element to update
     * @param score the amount to add (may be negative)
     */
    public void addScore(Element el, int score) {
        this.setScore(el, this.getScore(el) + score);
    }

    /**
     * Stores {@code score} in the element's {@code gravityScore} attribute.
     *
     * @param el    the element to update
     * @param score the score to store
     */
    public void setScore(Element el, int score) {
        el.attr(SCORE_ATTRIBUTE, Integer.toString(score));
    }

    /**
     * Scores a single child by its own text and adds that value to the child's
     * stored score. Text containing more than five occurrences (in total) of
     * {@code &quot;}, {@code &lt;}, {@code &gt;} or {@code px} is penalised
     * with -30; otherwise the value is one point per 25 characters.
     *
     * @param child   the child element whose stored score is updated
     * @param e       the parent element (not used by this implementation)
     * @param ownText the child's own text
     * @return the value that was added to the child's score
     */
    public int calcWeightForChild(Element child, Element e, String ownText) {
        int markupHits = SHelper.count(ownText, "&quot;")
                + SHelper.count(ownText, "&lt;")
                + SHelper.count(ownText, "&gt;")
                + SHelper.count(ownText, "px");
        int val;
        if (markupHits > 5) {
            val = -30;
        } else {
            val = (int)Math.round((double)ownText.length() / 25.0);
        }
        this.addScore(child, val);
        return val;
    }

    /**
     * Picks the most relevant {@code img} inside {@code el}; if {@code el}
     * contains none, the images inside its parent are considered instead.
     * Images are weighted by declared size, alt/title length and whether the
     * parent link is {@code nofollow}; each time a new best image is found the
     * multiplier applied to later images is halved.
     *
     * @param el the element to search
     * @return the chosen image element, or null if none has a positive weight
     */
    public Element determineImageSource(Element el) {
        Elements els = el.select("img");
        if (els.isEmpty()) {
            Element container = el.parent();
            if (container == null) {
                // No parent to widen the search to: nothing to pick from.
                return null;
            }
            els = container.select("img");
        }

        int maxWeight = 0;
        Element maxNode = null;
        double score = 1.0;
        for (Element e : els) {
            String sourceUrl = e.attr("src");
            if (sourceUrl.isEmpty() || this.isAdImage(sourceUrl)) {
                continue;
            }
            int weight = ArticleTextExtractor.dimensionWeight(e.attr("height"))
                    + ArticleTextExtractor.dimensionWeight(e.attr("width"));
            if (e.attr("alt").length() > 35) {
                weight += 20;
            }
            if (e.attr("title").length() > 35) {
                weight += 20;
            }
            Element parent = e.parent();
            if (parent != null) {
                String rel = parent.attr("rel");
                if (rel != null && rel.contains("nofollow")) {
                    weight -= 40;
                }
            }
            weight = (int)((double)weight * score);
            if (weight > maxWeight) {
                maxWeight = weight;
                maxNode = e;
                score /= 2.0;
            }
        }
        return maxNode;
    }

    /**
     * Weighs a declared image dimension: +20 above 50, -20 below 50, and 0 for
     * exactly 50 or for a missing/non-numeric value.
     */
    private static int dimensionWeight(String attributeValue) {
        try {
            int size = Integer.parseInt(attributeValue);
            if (size > 50) {
                return 20;
            }
            if (size < 50) {
                return -20;
            }
            return 0;
        }
        catch (NumberFormatException ignored) {
            // Dimension not declared or not a plain integer (e.g. "100%").
            return 0;
        }
    }

    /**
     * Prepares the document for scoring by removing script and style elements.
     *
     * @param doc the document to clean in place
     */
    protected void prepareDocument(Document doc) {
        this.removeScriptsAndStyles(doc);
    }

    /**
     * Removes every element below (and including) {@code body} whose
     * lower-cased class name or id matches the NEGATIVE pattern.
     *
     * @param doc the document to clean in place
     */
    protected void stripUnlikelyCandidates(Document doc) {
        for (Element child : doc.select("body").select("*")) {
            String className = child.className().toLowerCase();
            String id = child.id().toLowerCase();
            if (NEGATIVE.matcher(className).find() || NEGATIVE.matcher(id).find()) {
                child.remove();
            }
        }
    }

    /** Removes all {@code script} and {@code style} elements and returns the same document. */
    private Document removeScriptsAndStyles(Document doc) {
        for (Element script : doc.getElementsByTag("script")) {
            script.remove();
        }
        for (Element style : doc.getElementsByTag("style")) {
            style.remove();
        }
        return doc;
    }

    /** Logs a one-line description of the element (debug helper). */
    private void print(Element child) {
        this.print("", child, "");
    }

    /** Logs a one-line description of the element with a prefix (debug helper). */
    private void print(String add, Element child) {
        this.print(add, child, "");
    }

    /** Logs node name, id, class and text of the element, wrapped by the given prefix and suffix. */
    private void print(String add1, Element child, String add2) {
        // Explicit Object[] keeps this call valid for SLF4J versions with and without varargs.
        logger.info("{} {} id={} class={} text={} {}", new Object[]{add1, child.nodeName(), child.id(), child.className(), child.text(), add2});
    }

    /** Treats a URL as an ad image when it contains the substring "ad" at least twice. */
    private boolean isAdImage(String imageUrl) {
        return SHelper.count(imageUrl, "ad") >= 2;
    }

    /**
     * Hook for stripping the title from the extracted text. This
     * implementation returns {@code text} unchanged.
     *
     * @param text  the extracted article text
     * @param title the page title
     * @return {@code text}
     */
    public String removeTitleFromText(String text, String title) {
        return text;
    }

    /**
     * Splits the title on the given delimiter and returns the longest piece,
     * with right-pointing double angle quotes replaced by spaces and trimmed.
     *
     * @param title     the title to split
     * @param delimeter the delimiter, interpreted as a regular expression by
     *                  {@link String#split(String)}
     * @return the longest piece, or an empty string if there is none
     */
    private String doTitleSplits(String title, String delimeter) {
        String largeText = "";
        for (String piece : title.split(delimeter)) {
            if (piece.length() > largeText.length()) {
                largeText = piece;
            }
        }
        largeText = largeText.replace("&raquo;", " ");
        // U+00BB is the actual character; the second literal is the same character
        // mis-decoded (UTF-8 bytes read as Mac Roman) and is kept for compatibility.
        largeText = largeText.replace("\u00bb", " ");
        largeText = largeText.replace("\u00ac\u00aa", " ");
        return largeText.trim();
    }

    /**
     * Collects the candidate elements ({@code p}, {@code div}, {@code td},
     * {@code h1}, {@code h2}) below {@code body} in document order and seeds
     * each with a score attribute that starts at 100 and is halved (integer
     * division) for every further candidate.
     *
     * @param doc the parsed document
     * @return the candidates in document order
     */
    public Collection<Element> getNodes(Document doc) {
        Set<Element> nodes = new LinkedHashSet<Element>(64);
        int score = 100;
        for (Element el : doc.select("body").select("*")) {
            // Exact tag match: a substring test would also accept tags such as <i>.
            if (!CANDIDATE_TAGS.contains(el.tagName())) {
                continue;
            }
            nodes.add(el);
            this.setScore(el, score);
            score /= 2;
        }
        return nodes;
    }

    /**
     * Cleans a {@code |}-separated title: drops segments listed in
     * IGNORED_TITLE_PARTS and, under the counter condition below, a trailing
     * segment that is shorter than what has been collected so far. The
     * remaining segments are re-joined with {@code |} and whitespace-trimmed.
     *
     * @param title the raw title
     * @return the cleaned title, possibly empty
     */
    public String cleanTitle(String title) {
        String[] parts = title.split("\\|");
        StringBuilder res = new StringBuilder();
        // Counts the segments kept so far (not the loop index).
        int kept = 0;
        for (String part : parts) {
            if (IGNORED_TITLE_PARTS.contains(part.toLowerCase().trim())) {
                continue;
            }
            if (kept == parts.length - 1 && res.length() > part.length()) {
                continue;
            }
            if (kept > 0) {
                res.append("|");
            }
            res.append(part);
            ++kept;
        }
        return SHelper.innerTrim(res.toString());
    }
}

