/*
 * Decompiled with CFR 0.152.
 */
package com.aliasi.chunk;

import com.aliasi.chunk.Chunk;
import com.aliasi.chunk.ChunkTagHandlerAdapter2;
import com.aliasi.chunk.Chunking;
import com.aliasi.chunk.CompiledEstimator;
import com.aliasi.chunk.TokenShapeChunker;
import com.aliasi.chunk.TokenShapeDecoder;
import com.aliasi.chunk.TrainableEstimator;
import com.aliasi.corpus.ObjectHandler;
import com.aliasi.tokenizer.TokenCategorizer;
import com.aliasi.tokenizer.Tokenizer;
import com.aliasi.tokenizer.TokenizerFactory;
import com.aliasi.util.AbstractExternalizable;
import com.aliasi.util.Compilable;
import com.aliasi.util.ObjectToCounterMap;
import com.aliasi.util.Strings;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Set;

/**
 * Accumulates tagged token sequences from {@link Chunking} training
 * examples and compiles them into a {@link TokenShapeChunker}.
 *
 * <p>Training data is gathered through {@link #handle(Chunking)}: the
 * text of each chunking is tokenized with the configured tokenizer
 * factory, every token is given an out/begin/in tag, and the resulting
 * tokens and tags are appended to two parallel lists held by this
 * instance. Nothing is passed to the underlying estimator until
 * {@link #compileTo(ObjectOutput)} is called.
 */
public class TrainTokenShapeChunker
implements ObjectHandler<Chunking>,
Compilable {
    // Always false and not read anywhere in this class.
    private final boolean mValidateTokenizer = false;
    private final int mKnownMinTokenCount;
    private final int mMinTokenCount;
    private final int mMinTagCount;
    private final TokenCategorizer mTokenCategorizer;
    private final TokenizerFactory mTokenizerFactory;
    private final TrainableEstimator mTrainableEstimator;
    // Parallel lists: mTagList.get(i) is the tag of mTokenList.get(i).
    private final List<String> mTokenList = new ArrayList<String>();
    private final List<String> mTagList = new ArrayList<String>();
    static final Chunk[] EMPTY_CHUNK_ARRAY = new Chunk[0];

    /**
     * Constructs a trainer with a known-token threshold of 8 and
     * minimum token and tag counts of 1.
     *
     * @param categorizer categorizer used to replace infrequent tokens
     * @param factory tokenizer factory used to tokenize training text
     */
    public TrainTokenShapeChunker(TokenCategorizer categorizer, TokenizerFactory factory) {
        this(categorizer, factory, 8, 1, 1);
    }

    /**
     * Constructs a trainer with explicit thresholds.
     *
     * @param categorizer categorizer used to replace infrequent tokens
     * @param factory tokenizer factory used to tokenize training text
     * @param knownMinTokenCount tokens seen fewer than this many times in
     *     the training data are replaced by their category at compile time
     * @param minTokenCount second argument handed to the estimator's
     *     {@code prune} call at compile time
     * @param minTagCount first argument handed to the estimator's
     *     {@code prune} call at compile time
     */
    public TrainTokenShapeChunker(TokenCategorizer categorizer, TokenizerFactory factory, int knownMinTokenCount, int minTokenCount, int minTagCount) {
        this.mTokenCategorizer = categorizer;
        this.mTokenizerFactory = factory;
        this.mKnownMinTokenCount = knownMinTokenCount;
        this.mMinTokenCount = minTokenCount;
        this.mMinTagCount = minTagCount;
        this.mTrainableEstimator = new TrainableEstimator(categorizer);
    }

    /**
     * Appends one tagged token sequence to the accumulated training data.
     *
     * <p>The whole sequence is validated before anything is stored, so a
     * rejected call leaves the accumulated data exactly as it was.
     *
     * @param tokens tokens of the sequence
     * @param whitespaces whitespaces of the sequence; not used by this method
     * @param tags tags parallel to {@code tokens}
     * @throws IllegalArgumentException if {@code tokens} and {@code tags}
     *     differ in length
     * @throws NullPointerException if any token or tag is {@code null}
     */
    void handle(String[] tokens, String[] whitespaces, String[] tags) {
        if (tokens.length != tags.length) {
            String msg = "Tokens and tags must be same length. Found tokens.length=" + tokens.length + " tags.length=" + tags.length;
            throw new IllegalArgumentException(msg);
        }
        // Validate everything up front; storing while validating would leave
        // a truncated sequence behind when a null shows up part-way through.
        for (int i = 0; i < tokens.length; ++i) {
            if (tokens[i] == null || tags[i] == null) {
                String msg = "Tags and tokens must not be null. Found tokens[" + i + "]=" + tokens[i] + " tags[" + i + "]=" + tags[i];
                throw new NullPointerException(msg);
            }
        }
        mTokenList.addAll(Arrays.asList(tokens));
        mTagList.addAll(Arrays.asList(tags));
    }

    /**
     * Converts a chunking into a tagged token sequence and adds it to the
     * training data.
     *
     * <p>Chunks are sorted with {@link Chunk#TEXT_ORDER_COMPARATOR}. Text
     * before, between and after the chunks is tokenized and tagged with the
     * out tag; text inside a chunk is tagged with the begin prefix plus the
     * chunk type for its first token and the in prefix plus the chunk type
     * for the rest.
     *
     * <p>NOTE(review): the text between two chunks is taken from the end of
     * the previous chunk to the start of the next one, so this appears to
     * assume the sorted chunks do not overlap - confirm against callers.
     *
     * @param chunking training example to add
     */
    @Override
    public void handle(Chunking chunking) {
        CharSequence cSeq = chunking.charSequence();
        char[] cs = Strings.toCharArray(cSeq);
        Set<Chunk> chunkSet = chunking.chunkSet();
        Chunk[] chunks = chunkSet.toArray(EMPTY_CHUNK_ARRAY);
        Arrays.sort(chunks, Chunk.TEXT_ORDER_COMPARATOR);

        List<String> tokenList = new ArrayList<String>();
        List<String> whiteList = new ArrayList<String>();
        List<String> tagList = new ArrayList<String>();

        // End offset of the previously processed chunk (0 before the first).
        int pos = 0;
        for (Chunk nextChunk : chunks) {
            String type = nextChunk.type();
            int start = nextChunk.start();
            int end = nextChunk.end();
            outTag(cs, pos, start, tokenList, whiteList, tagList, mTokenizerFactory);
            chunkTag(cs, start, end, type, tokenList, whiteList, tagList, mTokenizerFactory);
            pos = end;
        }
        // Trailing text after the last chunk (or the whole text if there are no chunks).
        outTag(cs, pos, cSeq.length(), tokenList, whiteList, tagList, mTokenizerFactory);

        String[] toks = tokenList.toArray(Strings.EMPTY_STRING_ARRAY);
        String[] whites = whiteList.toArray(Strings.EMPTY_STRING_ARRAY);
        String[] tags = tagList.toArray(Strings.EMPTY_STRING_ARRAY);
        handle(toks, whites, tags);
    }

    /**
     * Writes a compiled form of this trainer to the given output by
     * writing an {@link Externalizer} wrapping this instance.
     *
     * @param objOut output to which the compiled form is written
     * @throws IOException if writing to the output fails
     */
    @Override
    public void compileTo(ObjectOutput objOut) throws IOException {
        objOut.writeObject(new Externalizer(this));
    }

    /**
     * Replaces, in place, every token that occurs fewer than the
     * known-token threshold times in {@code tokens} with the category
     * assigned to it by the token categorizer.
     *
     * @param tokens tokens to rewrite; modified in place
     */
    void replaceUnknownsWithCategories(String[] tokens) {
        // Counts are taken over the original tokens before any replacement.
        ObjectToCounterMap<String> counter = new ObjectToCounterMap<String>();
        for (String token : tokens) {
            counter.increment(token);
        }
        for (int i = 0; i < tokens.length; ++i) {
            if (counter.getCount(tokens[i]) < mKnownMinTokenCount) {
                tokens[i] = mTokenCategorizer.categorize(tokens[i]);
            }
        }
    }

    /**
     * Tokenizes the slice {@code cs[start, end)} and appends every token
     * with the out tag.
     *
     * <p>The whitespace preceding the first token is appended first, then
     * one whitespace after each token, so this call adds one more
     * whitespace than tokens.
     *
     * @param cs underlying characters
     * @param start index of the first character of the slice
     * @param end index one past the last character of the slice
     * @param tokenList list to which tokens are appended
     * @param whiteList list to which whitespaces are appended
     * @param tagList list to which tags are appended
     * @param factory tokenizer factory used to tokenize the slice
     */
    static void outTag(char[] cs, int start, int end, List<String> tokenList, List<String> whiteList, List<String> tagList, TokenizerFactory factory) {
        Tokenizer tokenizer = factory.tokenizer(cs, start, end - start);
        whiteList.add(tokenizer.nextWhitespace());
        String nextToken;
        while ((nextToken = tokenizer.nextToken()) != null) {
            tokenList.add(nextToken);
            tagList.add(ChunkTagHandlerAdapter2.OUT_TAG);
            whiteList.add(tokenizer.nextWhitespace());
        }
    }

    /**
     * Tokenizes the chunk slice {@code cs[start, end)} and appends its
     * tokens, tagging the first with the begin prefix plus {@code type}
     * and each later token with the in prefix plus {@code type}.
     *
     * <p>Only the whitespaces between tokens of the chunk are appended;
     * whitespace around the chunk is left to the surrounding
     * {@link #outTag} calls.
     *
     * <p>The first token is appended without a null check, so a slice that
     * yields no tokens contributes a {@code null} token with a begin tag.
     *
     * @param cs underlying characters
     * @param start index of the first character of the chunk
     * @param end index one past the last character of the chunk
     * @param type chunk type appended to the tag prefixes
     * @param tokenList list to which tokens are appended
     * @param whiteList list to which whitespaces are appended
     * @param tagList list to which tags are appended
     * @param factory tokenizer factory used to tokenize the slice
     */
    static void chunkTag(char[] cs, int start, int end, String type, List<String> tokenList, List<String> whiteList, List<String> tagList, TokenizerFactory factory) {
        Tokenizer tokenizer = factory.tokenizer(cs, start, end - start);
        String firstToken = tokenizer.nextToken();
        tokenList.add(firstToken);
        tagList.add(ChunkTagHandlerAdapter2.BEGIN_TAG_PREFIX + type);
        while (true) {
            // Whitespace is read before the token it precedes and is only
            // kept when such a token actually exists.
            String nextWhitespace = tokenizer.nextWhitespace();
            String nextToken = tokenizer.nextToken();
            if (nextToken == null) {
                break;
            }
            tokenList.add(nextToken);
            whiteList.add(nextWhitespace);
            tagList.add(ChunkTagHandlerAdapter2.IN_TAG_PREFIX + type);
        }
    }

    /**
     * Checks whether re-tokenizing the text rebuilt from the given tokens
     * and whitespaces reproduces exactly those tokens and whitespaces.
     *
     * @param toks expected tokens
     * @param whitespaces expected whitespaces; must have one more element
     *     than {@code toks} for the result to be {@code true}
     * @param tokenizerFactory tokenizer factory used to re-tokenize
     * @return {@code true} if the lengths fit and every token and
     *     whitespace produced by the tokenizer equals the expected one
     */
    static boolean consistentTokens(String[] toks, String[] whitespaces, TokenizerFactory tokenizerFactory) {
        if (toks.length + 1 != whitespaces.length) {
            return false;
        }
        char[] cs = getChars(toks, whitespaces);
        Tokenizer tokenizer = tokenizerFactory.tokenizer(cs, 0, cs.length);
        if (!whitespaces[0].equals(tokenizer.nextWhitespace())) {
            return false;
        }
        for (int i = 0; i < toks.length; ++i) {
            String token = tokenizer.nextToken();
            if (token == null || !toks[i].equals(token)) {
                return false;
            }
            if (!whitespaces[i + 1].equals(tokenizer.nextWhitespace())) {
                return false;
            }
        }
        return true;
    }

    /**
     * Rebuilds the text from the given tokens and whitespaces and returns
     * the tokens produced by tokenizing it with this trainer's tokenizer
     * factory.
     *
     * @param toks tokens used to rebuild the text
     * @param whitespaces whitespaces used to rebuild the text
     * @return tokens found in the rebuilt text
     */
    List<String> tokenization(String[] toks, String[] whitespaces) {
        List<String> tokList = new ArrayList<String>();
        // The whitespaces are collected only because tokenize() requires a sink for them.
        List<String> whiteList = new ArrayList<String>();
        char[] cs = getChars(toks, whitespaces);
        Tokenizer tokenizer = mTokenizerFactory.tokenizer(cs, 0, cs.length);
        tokenizer.tokenize(tokList, whiteList);
        return tokList;
    }

    /**
     * Concatenates {@code whitespaces[0], toks[0], whitespaces[1], toks[1],
     * ...} followed by the last element of {@code whitespaces}.
     *
     * <p>No lengths are checked here: {@code whitespaces} must be non-empty
     * and at least as long as {@code toks}, otherwise an
     * {@link ArrayIndexOutOfBoundsException} results.
     *
     * @param toks tokens to interleave
     * @param whitespaces whitespaces to interleave
     * @return characters of the concatenated text
     */
    static char[] getChars(String[] toks, String[] whitespaces) {
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < toks.length; ++i) {
            sb.append(whitespaces[i]);
            sb.append(toks[i]);
        }
        sb.append(whitespaces[whitespaces.length - 1]);
        return Strings.toCharArray(sb);
    }

    /**
     * Serialization proxy that writes the trained model and, when read
     * back, produces a {@link TokenShapeChunker}.
     */
    static class Externalizer
    extends AbstractExternalizable {
        private static final long serialVersionUID = 142720610674437597L;
        final TrainTokenShapeChunker mChunker;

        /** Constructs an externalizer with no chunker; its chunker field is {@code null}. */
        public Externalizer() {
            this(null);
        }

        /**
         * Constructs an externalizer that writes the given trainer.
         *
         * @param chunker trainer whose model is to be written
         */
        public Externalizer(TrainTokenShapeChunker chunker) {
            this.mChunker = chunker;
        }

        /**
         * Reads a tokenizer factory, a token categorizer and a compiled
         * estimator, in that order, and assembles a chunker from them.
         *
         * @param in input from which the three objects are read
         * @return a new {@link TokenShapeChunker}
         * @throws ClassNotFoundException if the class of a read object cannot be found
         * @throws IOException if reading from the input fails
         */
        @Override
        public Object read(ObjectInput in) throws ClassNotFoundException, IOException {
            TokenizerFactory factory = (TokenizerFactory)in.readObject();
            TokenCategorizer categorizer = (TokenCategorizer)in.readObject();
            CompiledEstimator estimator = (CompiledEstimator)in.readObject();
            // 1000.0 is the fixed third argument of the decoder; its meaning
            // is defined by TokenShapeDecoder, not by this class.
            TokenShapeDecoder decoder = new TokenShapeDecoder(estimator, categorizer, 1000.0);
            return new TokenShapeChunker(factory, decoder);
        }

        /**
         * Trains the estimator on the accumulated data and writes the
         * tokenizer factory, the token categorizer and the compiled
         * estimator, in that order.
         *
         * <p>NOTE(review): every call feeds the full accumulated data to
         * the same estimator again; if the estimator accumulates counts,
         * compiling the same trainer twice would count the data twice -
         * confirm against TrainableEstimator.
         *
         * @param objOut output to which the model is written
         * @throws IOException if writing to the output fails
         */
        @Override
        public void writeExternal(ObjectOutput objOut) throws IOException {
            TrainTokenShapeChunker chunker = this.mChunker;
            TrainableEstimator estimator = chunker.mTrainableEstimator;
            String[] tokens = chunker.mTokenList.toArray(Strings.EMPTY_STRING_ARRAY);
            String[] tags = chunker.mTagList.toArray(Strings.EMPTY_STRING_ARRAY);

            // First pass uses the raw tokens; the second pass uses the same
            // array after infrequent tokens were replaced by their categories.
            estimator.handle(tokens, tags);
            chunker.replaceUnknownsWithCategories(tokens);
            estimator.handle(tokens, tags);

            estimator.prune(chunker.mMinTagCount, chunker.mMinTokenCount);
            estimator.smoothTags(1);

            // Write order must match the read order in read(ObjectInput).
            AbstractExternalizable.compileOrSerialize(chunker.mTokenizerFactory, objOut);
            AbstractExternalizable.compileOrSerialize(chunker.mTokenCategorizer, objOut);
            estimator.compileTo(objOut);
        }
    }
}

