package edu.berkeley.nlp.util.experiments;

import edu.berkeley.nlp.tokenizer.PTBLineLexer;
import edu.berkeley.nlp.util.Filter;
import edu.berkeley.nlp.util.IOUtil;
import edu.berkeley.nlp.util.Iterators;
import edu.berkeley.nlp.util.Method;
import edu.berkeley.nlp.util.StringUtils;
import fig.basic.IOUtils;
import fig.basic.Option;
import fig.exec.Execution;
import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;

/* loaded from: input_file:edu/berkeley/nlp/util/experiments/SentenceIterable.class */
/**
 * Streams sentences, each represented as a list of token strings, out of a set of text files.
 *
 * <p>The input files are either the explicit list handed to {@link #SentenceIterable(List)} or, when no
 * list was supplied, the files under {@link #dataRoot} selected by {@link #prefix} and {@link #extension}.
 * Sentences longer than {@link #maxSentenceLength} tokens are skipped and at most
 * {@link #maxNumSentences} sentences are produced per iteration.
 *
 * <p>As a {@link Runnable} (see {@link #main(String[])}) the class writes every input file back out with
 * one space-joined sentence per line into {@link MainRunOptions#outDir}.
 */
public class SentenceIterable implements Iterable<List<String>>, Runnable {

    /** Directory that is scanned for input files when no explicit file list was given. */
    @Option(required = true)
    public String dataRoot;

    @Option(gloss = "Prefix of files to consider")
    public String prefix = "";

    @Option(gloss = "Extension of files to consider. If you pass .gz as a suffix, we will unzip them with gzip")
    public String extension = ".txt";

    /** Upper bound on the number of sentences produced by one iterator (or written per file by {@link #run()}). */
    @Option(gloss = "Maximum number of sentences")
    public int maxNumSentences = Integer.MAX_VALUE;

    /** When set, lines are tokenized with {@link PTBLineLexer}; otherwise they are split on whitespace. */
    @Option(gloss = "Tokenize each sentence")
    public boolean tokenize = false;

    /** When set, every token is lower-cased in place before the sentence is returned. */
    @Option(gloss = "Lowercase data")
    public boolean lowercase = false;

    /** NOTE(review): this option is not read anywhere in this class. */
    @Option(gloss = "How many sentences to buffer")
    public int bufferSize = 1;

    /** Sentences with more tokens than this are filtered out. */
    @Option(gloss = "Maximum sentence length in tokens; longer sentences are skipped")
    public int maxSentenceLength = Integer.MAX_VALUE;

    /**
     * When set, sentences are obtained from {@code DocumentSentenceSegmenter} and the {@link #tokenize}
     * flag is not consulted.
     */
    @Option(gloss = "Do we need to segment sentences first. Forces tokenize=true")
    public boolean sentenceSegment = false;

    /** Explicit input files; {@code null} when the files should be discovered under {@link #dataRoot}. */
    private Iterable<File> files;

    /** Options that only matter for the command-line entry point. */
    private static final MainRunOptions mainOpts = new MainRunOptions();

    /** Command-line options used by {@link SentenceIterable#run()}. */
    public static class MainRunOptions {

        @Option(required = true, gloss = "Where to put data one-sentence per-line")
        public String outDir;

        @Option(gloss = "Extension of output data")
        public String outExtension = ".tok";
    }

    /**
     * Iterator that walks a sequence of files and yields the sentences of each file in turn.
     * Files that contribute no sentences are skipped transparently.
     */
    public class MyIterator implements Iterator<List<String>> {
        /** Remaining input files. */
        Iterator<File> fileIt;
        /** Sentences of the file currently being read. */
        Iterator<List<String>> curLinesIt = Iterators.emptyIterator();

        /**
         * @param it the files to read, in the order in which their sentences should be produced
         */
        public MyIterator(Iterator<File> it) {
            this.fileIt = it;
        }

        @Override
        public boolean hasNext() {
            return queueNext();
        }

        /**
         * Returns the next sentence.
         *
         * @throws IllegalStateException if no sentence remains. {@link Iterator} specifies
         *         {@code NoSuchElementException} here; the historical exception type is kept so that
         *         existing callers keep working.
         */
        @Override
        public List<String> next() {
            if (queueNext()) {
                return this.curLinesIt.next();
            }
            throw new IllegalStateException();
        }

        /**
         * Advances to the next file that still has sentences.
         *
         * <p>Written as a loop rather than recursively so that a long run of empty files cannot
         * exhaust the call stack.
         *
         * @return {@code true} if {@link #curLinesIt} has a sentence ready, {@code false} if all files are consumed
         */
        private boolean queueNext() {
            while (!this.curLinesIt.hasNext()) {
                if (!this.fileIt.hasNext()) {
                    return false;
                }
                this.curLinesIt = nextFileLineIterator();
            }
            return true;
        }

        /**
         * Consumes the next file from {@link #fileIt} and returns an iterator over its sentences,
         * applying lower-casing when the enclosing instance asks for it.
         */
        private Iterator<List<String>> nextFileLineIterator() {
            File file = this.fileIt.next();
            Iterator<List<String>> sentences;
            if (SentenceIterable.this.sentenceSegment) {
                // The segmenter is only needed on this path, so it is only constructed here.
                sentences = new DocumentSentenceSegmenter().getSentences(file).iterator();
            } else {
                sentences = lineSentences(file);
            }
            if (SentenceIterable.this.lowercase) {
                sentences = new Iterators.TransformingIterator(sentences, new Method<List<String>, List<String>>() {
                    @Override
                    public List<String> call(List<String> sentence) {
                        // Lower-cases in place; the same list instance is handed on.
                        for (int i = 0; i < sentence.size(); i++) {
                            sentence.set(i, sentence.get(i).toLowerCase());
                        }
                        return sentence;
                    }
                });
            }
            return sentences;
        }

        /**
         * Treats every line of {@code file} as one sentence and turns it into tokens, either with
         * {@link PTBLineLexer} or by splitting on whitespace, depending on the enclosing instance's
         * {@code tokenize} flag (read for every line).
         *
         * @throws RuntimeException wrapping the {@link IOException} if the file cannot be opened or a
         *         line cannot be tokenized; previously the JVM was terminated with exit status 0
         */
        private Iterator<List<String>> lineSentences(File file) {
            try {
                Iterator<String> lines = IOUtils.lineIterator(file.getAbsolutePath());
                final PTBLineLexer lexer = new PTBLineLexer();
                return new Iterators.TransformingIterator(lines, new Method<String, List<String>>() {
                    @Override
                    public List<String> call(String line) {
                        if (!SentenceIterable.this.tokenize) {
                            return splitOnWhitespace(line);
                        }
                        try {
                            return lexer.tokenizeLine(line);
                        } catch (IOException e) {
                            throw new RuntimeException("Could not tokenize line: " + line, e);
                        }
                    }
                });
            } catch (IOException e) {
                throw new RuntimeException("Could not read " + file.getAbsolutePath(), e);
            }
        }

        @Override
        public void remove() {
            throw new UnsupportedOperationException();
        }
    }

    /**
     * Creates an iterable over an explicit list of files instead of the files under {@link #dataRoot}.
     *
     * @param list the files whose sentences should be produced
     */
    public SentenceIterable(List<File> list) {
        this.files = list;
    }

    /** Creates an iterable whose input files are discovered under {@link #dataRoot}. */
    public SentenceIterable() {
    }

    public void setTokenize(boolean tokenize) {
        this.tokenize = tokenize;
    }

    public void setPrefix(String prefix) {
        this.prefix = prefix;
    }

    public void setExtension(String extension) {
        this.extension = extension;
    }

    public boolean isLowercase() {
        return this.lowercase;
    }

    public void setLowercase(boolean lowercase) {
        this.lowercase = lowercase;
    }

    /**
     * Returns a fresh iterator over the sentences of all input files, honoring
     * {@link #maxSentenceLength} and {@link #maxNumSentences}.
     */
    @Override
    public Iterator<List<String>> iterator() {
        return boundedSentences(inputFiles().iterator());
    }

    /**
     * Writes each input file to {@code outDir/<input file name><outExtension>} with one sentence per
     * line, tokens joined by a single space. The length filter and the sentence cap are applied per file.
     */
    @Override
    public void run() {
        Iterable<File> inputs = inputFiles();
        IOUtils.createNewDirIfNotExistsEasy(mainOpts.outDir);
        for (File input : inputs) {
            String outPath = new File(mainOpts.outDir, input.getName() + mainOpts.outExtension).getAbsolutePath();
            Iterator<List<String>> sentences = boundedSentences(Collections.singletonList(input).iterator());
            IOUtils.writeLinesHard(outPath, Iterators.fillList(new Iterators.TransformingIterator(sentences, new Method<List<String>, String>() {
                @Override
                public String call(List<String> sentence) {
                    return StringUtils.join((List<?>) sentence, " ");
                }
            })));
        }
    }

    /** Command-line entry point; option parsing and invocation of {@link #run()} are delegated to {@link Execution}. */
    public static void main(String[] args) {
        Execution.run(args, new SentenceIterable(), mainOpts);
    }

    /**
     * Returns the files to read: the explicit list given at construction time if there is one,
     * otherwise the files under {@link #dataRoot} accepted by the prefix/extension filter.
     */
    private Iterable<File> inputFiles() {
        if (this.files != null) {
            return this.files;
        }
        return IOUtils.getFilesUnder(this.dataRoot, IOUtil.getFileFilter(this.prefix, this.extension));
    }

    /**
     * Wraps the sentences of the given files so that over-long sentences are dropped and the total
     * number of sentences is capped.
     */
    private Iterator<List<String>> boundedSentences(Iterator<File> fileIterator) {
        return Iterators.maxLengthIterator(Iterators.filter(new MyIterator(fileIterator), new Filter<List<String>>() {
            @Override
            public boolean accept(List<String> sentence) {
                return sentence.size() <= SentenceIterable.this.maxSentenceLength;
            }
        }), this.maxNumSentences);
    }

    /**
     * Splits a line on runs of whitespace.
     *
     * <p>{@code String.split} returns {@code [""]} for an empty line and keeps an empty first element
     * when the line starts with whitespace; that empty element is not a token and is dropped here.
     *
     * @return a fixed-size list of the non-empty tokens of {@code line}, in order
     */
    private static List<String> splitOnWhitespace(String line) {
        String[] tokens = line.split("\\s+");
        if (tokens.length > 0 && tokens[0].isEmpty()) {
            tokens = Arrays.copyOfRange(tokens, 1, tokens.length);
        }
        return Arrays.asList(tokens);
    }
}
