package edu.stanford.nlp.process;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.objectbank.TokenizerFactory;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.Timing;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.StringReader;
import java.io.Writer;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Pattern;

/* loaded from: input_file:edu/stanford/nlp/process/PTBTokenizer.class */
public class PTBTokenizer<T extends HasWord> extends AbstractTokenizer<T> {
    private boolean tokenizeCRs;
    private boolean invertible;
    private boolean suppressEscaping;
    private PTBLexer lexer;
    private LexedTokenFactory<T> tokenFactory;

    /* loaded from: input_file:edu/stanford/nlp/process/PTBTokenizer$PTBTokenizerFactory.class */
    /**
     * Factory producing {@link PTBTokenizer} instances that all share the same
     * configuration flags and token factory.
     *
     * <p>The three flags are stored verbatim and handed to every tokenizer this factory
     * creates; their concrete effect is decided by {@code PTBLexer}, which is not
     * visible from this class.
     *
     * @param <T> type of token returned by the created tokenizers
     */
    public static class PTBTokenizerFactory<T extends HasWord> implements TokenizerFactory<T> {
        /** Passed to each new tokenizer as its {@code tokenizeCRs} flag. */
        protected boolean tokenizeCRs;
        /** Passed to each new tokenizer as its {@code invertible} flag. */
        protected boolean invertible;
        /** Passed to each new tokenizer as its {@code suppressEscaping} flag. */
        protected boolean suppressEscaping;
        /** Token factory passed to each new tokenizer. */
        protected LexedTokenFactory<T> factory;

        /**
         * Creates a {@link Word} factory with all flags off.
         *
         * @return a new factory backed by a {@code WordTokenFactory}
         */
        public static PTBTokenizerFactory<Word> newPTBTokenizerFactory() {
            return newPTBTokenizerFactory(false);
        }

        /**
         * Creates a {@link Word} factory.
         *
         * @param z value for {@code tokenizeCRs}; the other two flags are {@code false}
         * @return a new factory backed by a {@code WordTokenFactory}
         */
        public static PTBTokenizerFactory<Word> newPTBTokenizerFactory(boolean z) {
            return new PTBTokenizerFactory<>(z, new WordTokenFactory());
        }

        /**
         * Creates a factory with a caller-supplied token factory.
         *
         * @param z value for {@code tokenizeCRs}; {@code invertible} and
         *     {@code suppressEscaping} are {@code false}
         * @param lexedTokenFactory token factory handed to every created tokenizer
         */
        public PTBTokenizerFactory(boolean z, LexedTokenFactory<T> lexedTokenFactory) {
            this(z, false, false, lexedTokenFactory);
        }

        /**
         * Creates a {@link CoreLabel} factory.
         *
         * @param z value for {@code tokenizeCRs}
         * @param z2 value for {@code invertible}; {@code suppressEscaping} is {@code false}
         * @return a new factory backed by a {@code CoreLabelTokenFactory}
         */
        public static PTBTokenizerFactory<CoreLabel> newPTBTokenizerFactory(boolean z, boolean z2) {
            return new PTBTokenizerFactory<>(z, z2, new CoreLabelTokenFactory());
        }

        /**
         * Creates a {@link Word} factory with every flag given explicitly.
         *
         * @param z value for {@code tokenizeCRs}
         * @param z2 value for {@code invertible}
         * @param z3 value for {@code suppressEscaping}
         * @return a new factory backed by a {@code WordTokenFactory}
         */
        public static PTBTokenizerFactory<Word> newPTBTokenizerFactory(boolean z, boolean z2, boolean z3) {
            return new PTBTokenizerFactory<>(z, z2, z3, new WordTokenFactory());
        }

        /** Sets {@code tokenizeCRs} and {@code invertible}; {@code suppressEscaping} is {@code false}. */
        private PTBTokenizerFactory(boolean z, boolean z2, LexedTokenFactory<T> lexedTokenFactory) {
            this(z, z2, false, lexedTokenFactory);
        }

        /** Canonical constructor: every other constructor and static factory ends up here. */
        private PTBTokenizerFactory(boolean z, boolean z2, boolean z3, LexedTokenFactory<T> lexedTokenFactory) {
            this.tokenizeCRs = z;
            this.invertible = z2;
            this.suppressEscaping = z3;
            this.factory = lexedTokenFactory;
        }

        /**
         * Returns the tokenizer for {@code reader} viewed as a plain iterator.
         *
         * @param reader source of the text to tokenize
         * @return the same object {@link #getTokenizer(Reader)} would return
         */
        @Override // edu.stanford.nlp.objectbank.IteratorFromReaderFactory
        public Iterator<T> getIterator(Reader reader) {
            return getTokenizer(reader);
        }

        /**
         * Builds a new tokenizer over {@code reader} using this factory's settings.
         *
         * @param reader source of the text to tokenize
         * @return a freshly constructed {@link PTBTokenizer}
         */
        @Override // edu.stanford.nlp.objectbank.TokenizerFactory
        public Tokenizer<T> getTokenizer(Reader reader) {
            // Parameterised explicitly: the raw-type construction forced an unchecked conversion.
            return new PTBTokenizer<T>(reader, this.tokenizeCRs, this.invertible, this.suppressEscaping, this.factory);
        }
    }

    /**
     * Creates a {@link Word} tokenizer over {@code reader} with {@code tokenizeCRs} off.
     *
     * @param reader source of the text to tokenize
     * @return a new tokenizer
     */
    public static PTBTokenizer<Word> newPTBTokenizer(Reader reader) {
        return newPTBTokenizer(reader, false);
    }

    /**
     * Creates a {@link Word} tokenizer over {@code reader}.
     *
     * @param reader source of the text to tokenize
     * @param z stored as the tokenizer's {@code tokenizeCRs} flag
     *     (presumably whether line ends become tokens; confirm against PTBLexer)
     * @return a new tokenizer backed by a {@code WordTokenFactory}
     */
    public static PTBTokenizer<Word> newPTBTokenizer(Reader reader, boolean z) {
        return new PTBTokenizer<>(reader, z, new WordTokenFactory());
    }

    /**
     * Creates a {@link CoreLabel} tokenizer over {@code reader}.
     *
     * @param reader source of the text to tokenize
     * @param z stored as the tokenizer's {@code tokenizeCRs} flag
     * @param z2 stored as the tokenizer's {@code invertible} flag
     * @return a new tokenizer backed by a {@code CoreLabelTokenFactory}
     */
    public static PTBTokenizer<CoreLabel> newPTBTokenizer(Reader reader, boolean z, boolean z2) {
        return new PTBTokenizer<>(reader, z, z2, new CoreLabelTokenFactory());
    }

    /**
     * Creates a tokenizer with {@code invertible} and {@code suppressEscaping} off.
     *
     * @param reader source of the text to tokenize
     * @param z value for {@code tokenizeCRs}
     * @param lexedTokenFactory factory used to build the tokens
     */
    public PTBTokenizer(Reader reader, boolean z, LexedTokenFactory<T> lexedTokenFactory) {
        this(reader, z, false, lexedTokenFactory);
    }

    /** Sets {@code tokenizeCRs} (z) and {@code invertible} (z2); {@code suppressEscaping} is {@code false}. */
    private PTBTokenizer(Reader reader, boolean z, boolean z2, LexedTokenFactory<T> lexedTokenFactory) {
        this(reader, z, z2, false, lexedTokenFactory);
    }

    /**
     * Canonical constructor: stores the flags (z = tokenizeCRs, z2 = invertible,
     * z3 = suppressEscaping) and the token factory, then builds the lexer.
     */
    private PTBTokenizer(Reader reader, boolean z, boolean z2, boolean z3, LexedTokenFactory<T> lexedTokenFactory) {
        this.tokenizeCRs = z;
        this.tokenFactory = lexedTokenFactory;
        this.invertible = z2;
        this.suppressEscaping = z3;
        // Must run after the fields above are assigned: setSource reads all of them.
        setSource(reader);
    }

    /**
     * Pulls the next token from the underlying lexer.
     *
     * <p>Any exception thrown by the lexer is treated as the end of the token stream:
     * the inherited {@code nextToken} is cleared and {@code null} is returned.
     *
     * <p>Decompiler note: the access modifier was widened from {@code protected}; it is
     * left {@code public} here so existing callers keep compiling.
     *
     * @return whatever the lexer produced, or {@code null} if the lexer threw
     */
    @Override // edu.stanford.nlp.process.AbstractTokenizer
    @SuppressWarnings("unchecked") // the lexer's element type is not expressed generically; T is assumed
    public T getNext() {
        try {
            // Cast to T, not HasWord: a HasWord value is not assignable to T.
            return (T) this.lexer.next();
        } catch (Exception e) {
            // Deliberate best effort: a lexer failure ends iteration rather than propagating.
            this.nextToken = null;
            return null;
        }
    }

    /**
     * Replaces the lexer with a new one reading from {@code reader}.
     *
     * <p>An invertible tokenizer builds its lexer from the {@code invertible} and
     * {@code tokenizeCRs} flags only; otherwise the lexer receives the token factory,
     * {@code tokenizeCRs} and {@code suppressEscaping}.
     *
     * @param reader new source of text
     */
    public final void setSource(Reader reader) {
        this.lexer = this.invertible
                ? new PTBLexer(reader, this.invertible, this.tokenizeCRs)
                : new PTBLexer(reader, this.tokenFactory, this.tokenizeCRs, this.suppressEscaping);
    }

    /**
     * Runs {@code str} through a {@code PTB2TextLexer} and concatenates every piece the
     * lexer returns.
     *
     * <p>If the lexer throws an {@link IOException} the stack trace is printed and the
     * text converted so far is returned. The failure ends the conversion; it is not
     * retried.
     *
     * @param str text to convert; must not be {@code null}
     * @return the concatenated lexer output
     */
    public static String ptb2Text(String str) {
        StringBuilder text = new StringBuilder(str.length());
        PTB2TextLexer lexer = new PTB2TextLexer(new StringReader(str));
        // The try wraps the whole loop: with the catch inside the loop a lexer that keeps
        // failing would be called again and again without ever terminating.
        try {
            for (String piece = lexer.next(); piece != null; piece = lexer.next()) {
                text.append(piece);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return text.toString();
    }

    /**
     * Streams {@code reader} through a {@code PTB2TextLexer}, writing each piece the
     * lexer returns to {@code writer}.
     *
     * <p>The writer is neither flushed nor closed here.
     *
     * @param reader source of the text to convert
     * @param writer destination for the converted text
     * @return number of pieces written
     * @throws IOException if the lexer or the writer fails
     */
    public static int ptb2Text(Reader reader, Writer writer) throws IOException {
        PTB2TextLexer lexer = new PTB2TextLexer(reader);
        int written = 0;
        for (String piece = lexer.next(); piece != null; piece = lexer.next()) {
            written++;
            writer.write(piece);
        }
        return written;
    }

    /**
     * Command-line driver for the reverse conversion: feeds each input through
     * {@link #ptb2Text(Reader, Writer)} and reports throughput on standard error.
     *
     * <p>With no input files, standard input is converted to standard output. Otherwise
     * each file named in {@code list} is converted, either to standard output (when
     * {@code list2} is {@code null}) or to the file at the same index of {@code list2}.
     *
     * @param list input file specifications, possibly empty
     * @param list2 output file names parallel to {@code list}, or {@code null} for stdout
     * @param str character encoding name used for reading and writing
     * @throws IOException if a file cannot be opened, read or written
     */
    private static void untok(List<String> list, List<String> list2, String str) throws IOException {
        Timing timing = new Timing();
        int i = 0;
        boolean toStdout = list2 == null;
        if (list.isEmpty()) {
            PrintWriter stdout = new PrintWriter(System.out, true);
            i = ptb2Text(new InputStreamReader(System.in, str), stdout);
            // ptb2Text uses write(), which autoflush does not cover; flush explicitly.
            stdout.flush();
        } else {
            for (int idx = 0; idx < list.size(); idx++) {
                try (BufferedReader in = IOUtils.readReaderFromString(list.get(idx), str)) {
                    PrintWriter out = toStdout
                            ? new PrintWriter(System.out, true)
                            : new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(list2.get(idx)), str)), true);
                    try {
                        i += ptb2Text(in, out);
                    } finally {
                        if (toStdout) {
                            // Never close System.out: later files still have to be written to it.
                            out.flush();
                        } else {
                            out.close();
                        }
                    }
                }
            }
        }
        System.err.println("PTBTokenizer untokenized " + i + " tokens at " + new DecimalFormat("0.00").format(i / (timing.stop() / 1000.0d)) + " tokens per second.");
    }

    /**
     * Joins {@code list} with {@code StringUtils.join} and converts the result with
     * {@link #ptb2Text(String)}.
     *
     * @param list token strings (presumably joined with single spaces; confirm against StringUtils.join)
     * @return the converted text
     */
    public static String ptb2Text(List<String> list) {
        return ptb2Text(StringUtils.join(list));
    }

    /**
     * Collects {@code word()} from every element of {@code list} and converts the
     * resulting strings with {@link #ptb2Text(List)}.
     *
     * @param list tokens whose words are to be converted; must not be {@code null}
     * @return the converted text
     */
    public static String labelList2Text(List<? extends HasWord> list) {
        // Typed and presized: the raw ArrayList forced an unchecked conversion at the call below.
        List<String> words = new ArrayList<>(list.size());
        for (HasWord label : list) {
            words.add(label.word());
        }
        return ptb2Text(words);
    }

    /**
     * Command-line driver for tokenization: feeds each input through {@code tokReader}
     * and reports throughput on standard error.
     *
     * <p>With no input files, standard input is tokenized to standard output. Otherwise
     * each file named in {@code list} is tokenized, either to standard output (when
     * {@code list2} is {@code null}) or to the file at the same index of {@code list2}.
     *
     * @param list input file specifications, possibly empty
     * @param list2 output file names parallel to {@code list}, or {@code null} for stdout
     * @param str character encoding name used for reading and writing
     * @param pattern token pattern that switches printing on, or {@code null}
     * @param pattern2 token pattern that switches printing off, or {@code null}
     * @param z {@code tokenizeCRs} flag for the tokenizer
     * @param z2 whether to print tokens space-separated, keeping line breaks
     * @param z3 whether to print each token's {@code toString()} instead of its word
     * @throws IOException if a file cannot be opened or closed
     */
    private static void tok(List<String> list, List<String> list2, String str, Pattern pattern, Pattern pattern2, boolean z, boolean z2, boolean z3) throws IOException {
        Timing timing = new Timing();
        int i = 0;
        boolean toStdout = list2 == null;
        if (list.isEmpty()) {
            PrintWriter stdout = new PrintWriter(System.out, true);
            i = tokReader(new InputStreamReader(System.in, str), stdout, pattern, pattern2, z, z2, z3);
            // Autoflush only fires on println; a last line written with print() needs this.
            stdout.flush();
        } else {
            for (int idx = 0; idx < list.size(); idx++) {
                // try-with-resources closes the reader even when tokenization throws.
                try (BufferedReader in = IOUtils.readReaderFromString(list.get(idx), str)) {
                    PrintWriter out = toStdout
                            ? new PrintWriter(System.out, true)
                            : new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(list2.get(idx)), str)), true);
                    try {
                        i += tokReader(in, out, pattern, pattern2, z, z2, z3);
                    } finally {
                        if (toStdout) {
                            // System.out stays open for the remaining files; just push out buffered text.
                            out.flush();
                        } else {
                            out.close();
                        }
                    }
                }
            }
        }
        System.err.println("PTBTokenizer tokenized " + i + " tokens at " + new DecimalFormat("0.00").format(i / (timing.stop() / 1000.0d)) + " tokens per second.");
    }

    /**
     * Tokenizes {@code reader} with an invertible {@link CoreLabel} tokenizer and prints
     * the selected tokens to {@code printWriter}.
     *
     * <p>A token matching {@code pattern} turns printing on and a token matching
     * {@code pattern2} turns it off; neither kind of token is printed itself. Printing
     * starts enabled only when {@code pattern} is {@code null}. Every token is counted,
     * whether or not it is printed.
     *
     * @param reader source of the text to tokenize
     * @param printWriter destination for the printed tokens
     * @param pattern token pattern that switches printing on, or {@code null}
     * @param pattern2 token pattern that switches printing off, or {@code null}
     * @param z {@code tokenizeCRs} flag for the tokenizer
     * @param z2 when {@code true}, tokens are space-separated and a token equal to
     *     {@code PTBLexer.cr} ends the line; when {@code false}, one token per line
     * @param z3 when {@code true}, print the token's {@code toString()} instead of its word
     * @return number of tokens read
     */
    private static int tokReader(Reader reader, PrintWriter printWriter, Pattern pattern, Pattern pattern2, boolean z, boolean z2, boolean z3) {
        PTBTokenizer<CoreLabel> tokenizer = newPTBTokenizer(reader, z, true);
        boolean printing = (pattern == null);
        boolean atLineStart = true;
        int seen = 0;
        // The counter lives in the loop header so every 'continue' below still counts the token.
        for (; tokenizer.hasNext(); seen++) {
            CoreLabel token = (CoreLabel) tokenizer.next();
            String text = token.word();

            if (pattern != null && pattern.matcher(text).matches()) {
                printing = true;
                continue;
            }
            if (pattern2 != null && pattern2.matcher(text).matches()) {
                printing = false;
                continue;
            }
            if (!printing) {
                continue;
            }

            // The patterns were tested against the plain word; only now switch to the full dump.
            if (z3) {
                text = token.toString();
            }
            if (!z2) {
                printWriter.println(text);
                continue;
            }
            if (PTBLexer.cr.equals(text)) {
                atLineStart = true;
                printWriter.println();
                continue;
            }
            if (!atLineStart) {
                printWriter.print(" ");
            }
            atLineStart = false;
            printWriter.print(text);
        }
        return seen;
    }

    /**
     * Returns a {@link Word} tokenizer factory with all flags off.
     *
     * @return the result of {@code PTBTokenizerFactory.newPTBTokenizerFactory()}
     */
    public static TokenizerFactory<Word> factory() {
        return PTBTokenizerFactory.newPTBTokenizerFactory();
    }

    /**
     * Returns a {@link Word} tokenizer factory.
     *
     * @param z forwarded to {@code PTBTokenizerFactory.newPTBTokenizerFactory(boolean)},
     *     where it becomes the {@code tokenizeCRs} flag
     * @return the new factory
     */
    public static TokenizerFactory<Word> factory(boolean z) {
        return PTBTokenizerFactory.newPTBTokenizerFactory(z);
    }

    /**
     * Returns a tokenizer factory that builds tokens with a caller-supplied token factory.
     *
     * @param z value for the factory's {@code tokenizeCRs} flag
     * @param lexedTokenFactory token factory handed to every created tokenizer
     * @param <T> type of token produced
     * @return the new factory
     */
    public static <T extends HasWord> TokenizerFactory<T> factory(boolean z, LexedTokenFactory<T> lexedTokenFactory) {
        // Parameterised explicitly: the raw-type construction forced an unchecked conversion.
        return new PTBTokenizerFactory<T>(z, lexedTokenFactory);
    }

    /**
     * Returns a {@link CoreLabel} tokenizer factory.
     *
     * @param z forwarded as the {@code tokenizeCRs} flag
     * @param z2 forwarded as the {@code invertible} flag
     * @return the result of {@code PTBTokenizerFactory.newPTBTokenizerFactory(z, z2)}
     */
    public static TokenizerFactory<CoreLabel> factory(boolean z, boolean z2) {
        return PTBTokenizerFactory.newPTBTokenizerFactory(z, z2);
    }

    /**
     * Returns a {@link Word} tokenizer factory with every flag given explicitly.
     *
     * @param z forwarded as the {@code tokenizeCRs} flag
     * @param z2 forwarded as the {@code invertible} flag
     * @param z3 forwarded as the {@code suppressEscaping} flag
     * @return the result of {@code PTBTokenizerFactory.newPTBTokenizerFactory(z, z2, z3)}
     */
    public static TokenizerFactory<Word> factory(boolean z, boolean z2, boolean z3) {
        return PTBTokenizerFactory.newPTBTokenizerFactory(z, z2, z3);
    }

    /**
     * Command-line entry point: tokenizes (or, with {@code -untok}, untokenizes) the
     * named files, or standard input when no file is given.
     *
     * <p>Options, which must precede the file names:
     * <ul>
     *   <li>{@code -nl}: turn on the tokenizer's {@code tokenizeCRs} flag</li>
     *   <li>{@code -preserveLines}: as {@code -nl}, and print tokens space-separated per line</li>
     *   <li>{@code -dump}: print each token's {@code toString()} instead of its word</li>
     *   <li>{@code -ioFileList}: the named files list "input [output]" pairs, one per
     *       line; a missing output defaults to the input name plus {@code .tok}</li>
     *   <li>{@code -charset NAME}: encoding for all reading and writing (default utf-8)</li>
     *   <li>{@code -parseInside REGEX}: only print tokens between matching open and close tags</li>
     *   <li>{@code -untok}: run the reverse conversion instead of tokenizing</li>
     *   <li>{@code -h}, {@code -help}, {@code --help}: print usage and exit</li>
     * </ul>
     *
     * @param strArr command-line arguments
     * @throws IOException if a file cannot be opened, read or written
     */
    public static void main(String[] strArr) throws IOException {
        int i = 0;
        String charset = "utf-8";
        Pattern insideBegin = null;
        Pattern insideEnd = null;
        boolean tokenizeNL = false;
        boolean preserveLines = false;
        boolean ioFileList = false;
        boolean dump = false;
        boolean untokenize = false;

        // startsWith rather than charAt(0): an empty argument ends option parsing instead of throwing.
        while (i < strArr.length && strArr[i].startsWith("-")) {
            String option = strArr[i];
            if ("-nl".equals(option)) {
                tokenizeNL = true;
            } else if ("-preserveLines".equals(option)) {
                preserveLines = true;
                tokenizeNL = true;
            } else if ("-dump".equals(option)) {
                dump = true;
            } else if ("-ioFileList".equals(option)) {
                ioFileList = true;
            } else if ("-charset".equals(option) && i < strArr.length - 1) {
                i++;
                charset = strArr[i];
            } else if ("-parseInside".equals(option) && i < strArr.length - 1) {
                i++;
                try {
                    insideBegin = Pattern.compile("<(?:" + strArr[i] + ")[^>]*?>");
                    insideEnd = Pattern.compile("</(?:" + strArr[i] + ")[^>]*?>");
                } catch (java.util.regex.PatternSyntaxException e) {
                    // Still best effort (everything gets printed), but tell the user why.
                    System.err.println("Ignoring -parseInside, invalid regex: " + e.getMessage());
                    insideBegin = null;
                    insideEnd = null;
                }
            } else if ("-untok".equals(option)) {
                untokenize = true;
            } else {
                if ("-h".equals(option) || "-help".equals(option) || "--help".equals(option)) {
                    System.err.println("usage: java edu.stanford.nlp.process.PTBTokenizer [options]* filename*");
                    System.err.println("  options: -nl|-preserveLines|-dump|-ioFileList|-charset|-parseInside|-h");
                    return;
                }
                System.err.println("Unknown option: " + option);
            }
            i++;
        }

        List<String> inputFiles = new ArrayList<>();
        List<String> outputFiles = null;
        if (ioFileList) {
            outputFiles = new ArrayList<>();
            for (int j = i; j < strArr.length; j++) {
                try (BufferedReader fileList = new BufferedReader(new InputStreamReader(new FileInputStream(strArr[j]), charset))) {
                    // Read until end of file; the previous loop had no exit condition at all.
                    for (String line = fileList.readLine(); line != null; line = fileList.readLine()) {
                        String entry = line.trim();
                        if (entry.isEmpty()) {
                            // A blank line would otherwise be queued as an empty file name.
                            continue;
                        }
                        String[] fields = entry.split("\\s+");
                        inputFiles.add(fields[0]);
                        if (fields.length > 1) {
                            outputFiles.add(fields[1]);
                        } else {
                            outputFiles.add(fields[0] + ".tok");
                        }
                    }
                }
            }
        } else {
            inputFiles.addAll(Arrays.asList(strArr).subList(i, strArr.length));
        }

        if (untokenize) {
            untok(inputFiles, outputFiles, charset);
        } else {
            tok(inputFiles, outputFiles, charset, insideBegin, insideEnd, tokenizeNL, preserveLines, dump);
        }
    }
}
