package com.johnsnowlabs.nlp.annotators.tokenizer.wordpiece;

import com.johnsnowlabs.nlp.annotators.common.IndexedToken;
import com.johnsnowlabs.nlp.annotators.common.Sentence;
import java.text.Normalizer;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;
import scala.Predef$;
import scala.collection.immutable.Nil$;
import scala.collection.immutable.StringOps;
import scala.collection.immutable.StringOps$;
import scala.collection.mutable.ArrayBuffer;
import scala.collection.mutable.ArrayBuffer$;
import scala.reflect.ClassTag$;
import scala.reflect.ScalaSignature;

/* compiled from: BasicTokenizer.scala */
@ScalaSignature(bytes = "\u0006\u000194Q!\u0001\u0002\u0001\u00119\u0011aBQ1tS\u000e$vn[3oSj,'O\u0003\u0002\u0004\t\u0005Iqo\u001c:ea&,7-\u001a\u0006\u0003\u000b\u0019\t\u0011\u0002^8lK:L'0\u001a:\u000b\u0005\u001dA\u0011AC1o]>$\u0018\r^8sg*\u0011\u0011BC\u0001\u0004]2\u0004(BA\u0006\r\u00031Qw\u000e\u001b8t]><H.\u00192t\u0015\u0005i\u0011aA2p[N\u0011\u0001a\u0004\t\u0003!Mi\u0011!\u0005\u0006\u0002%\u0005)1oY1mC&\u0011A#\u0005\u0002\u0007\u0003:L(+\u001a4\t\u0011Y\u0001!\u0011!Q\u0001\na\tQbY1tKN+gn]5uSZ,7\u0001\u0001\t\u0003!eI!AG\t\u0003\u000f\t{w\u000e\\3b]\")A\u0004\u0001C\u0001;\u00051A(\u001b8jiz\"\"A\b\u0011\u0011\u0005}\u0001Q\"\u0001\u0002\t\u000fYY\u0002\u0013!a\u00011!)!\u0005\u0001C\u0001G\u0005a\u0011n],iSR,7\u000f]1dKR\u0011\u0001\u0004\n\u0005\u0006K\u0005\u0002\rAJ\u0001\u0005G\"\f'\u000f\u0005\u0002\u0011O%\u0011\u0001&\u0005\u0002\u0005\u0007\"\f'\u000fC\u0003+\u0001\u0011\u00051&A\u0005jg\u000e{g\u000e\u001e:pYR\u0011\u0001\u0004\f\u0005\u0006K%\u0002\rA\n\u0005\u0006]\u0001!\taL\u0001\u000bSN$vNR5mi\u0016\u0014HC\u0001\r1\u0011\u0015)S\u00061\u0001'\u0011\u0015\u0011\u0004\u0001\"\u00014\u00035I7\u000fU;oGR,\u0018\r^5p]R\u0011\u0001\u0004\u000e\u0005\u0006KE\u0002\rA\n\u0005\u0006m\u0001!\taN\u0001\rgR\u0014\u0018\u000e]!dG\u0016tGo\u001d\u000b\u0003q}\u0002\"!\u000f\u001f\u000f\u0005AQ\u0014BA\u001e\u0012\u0003\u0019\u0001&/\u001a3fM&\u0011QH\u0010\u0002\u0007'R\u0014\u0018N\\4\u000b\u0005m\n\u0002\"\u0002!6\u0001\u0004A\u0014\u0001\u0002;fqRDQA\u0011\u0001\u0005\u0002\r\u000b\u0011\"[:DQ&tWm]3\u0015\u0005a!\u0005\"B\u0013B\u0001\u00041\u0003\"\u0002$\u0001\t\u00039\u0015!\u00038pe6\fG.\u001b>f)\tA\u0004\nC\u0003A\u000b\u0002\u0007\u0001\bC\u0003K\u0001\u0011\u00051*\u0001\u0005u_.,g.\u001b>f)\taU\u000bE\u0002\u0011\u001b>K!AT\t\u0003\u000b\u0005\u0013(/Y=\u0011\u0005A\u001bV\"A)\u000b\u0005I3\u0011AB2p[6|g.\u0003\u0002U#\na\u0011J\u001c3fq\u0016$Gk\\6f]\")a+\u0013a\u0001/\u0006A1/\u001a8uK:\u001cW\r\u0005\u0002Q1&\u0011\u0011,\u0015\u0002\t'\u
0016tG/\u001a8dK\u001eA1LAA\u0001\u0012\u0003AA,\u0001\bCCNL7\rV8lK:L'0\u001a:\u0011\u0005}if\u0001C\u0001\u0003\u0003\u0003E\t\u0001\u00030\u0014\u0005u{\u0001\"\u0002\u000f^\t\u0003\u0001G#\u0001/\t\u000f\tl\u0016\u0013!C\u0001G\u0006YB\u0005\\3tg&t\u0017\u000e\u001e\u0013he\u0016\fG/\u001a:%I\u00164\u0017-\u001e7uIE*\u0012\u0001\u001a\u0016\u00031\u0015\\\u0013A\u001a\t\u0003O2l\u0011\u0001\u001b\u0006\u0003S*\f\u0011\"\u001e8dQ\u0016\u001c7.\u001a3\u000b\u0005-\f\u0012AC1o]>$\u0018\r^5p]&\u0011Q\u000e\u001b\u0002\u0012k:\u001c\u0007.Z2lK\u00124\u0016M]5b]\u000e,\u0007")
/* loaded from: input_file:com/johnsnowlabs/nlp/annotators/tokenizer/wordpiece/BasicTokenizer.class */
public class BasicTokenizer {
    private final boolean caseSensitive;

    public boolean isWhitespace(char c) {
        return c == ' ' || c == '\t' || c == '\n' || c == '\r' || Character.isWhitespace(c);
    }

    public boolean isControl(char c) {
        if (c == '\t' || c == '\n' || c == '\r') {
            return false;
        }
        return Character.isISOControl(c);
    }

    public boolean isToFilter(char c) {
        return c == 0 || c == 65533 || isControl(c);
    }

    public boolean isPunctuation(char c) {
        if (c >= '!' && c <= '/') {
            return true;
        }
        if (c >= ':' && c <= '@') {
            return true;
        }
        if (c >= '[' && c <= '`') {
            return true;
        }
        if (c >= '{' && c <= '~') {
            return true;
        }
        try {
            String name = Character.getName(c);
            return (name != null ? name : "").contains("PUNCTUATION");
        } catch (Exception unused) {
            return false;
        }
    }

    public String stripAccents(String str) {
        return Normalizer.normalize(str, Normalizer.Form.NFD).replaceAll("\\p{InCombiningDiacriticalMarks}+", "");
    }

    public boolean isChinese(char c) {
        return (c >= 19968 && c <= 40959) || (c >= 13312 && c <= 19903) || ((c >= 0 && c <= 42719) || ((c >= 42752 && c <= 46911) || ((c >= 46912 && c <= 47135) || ((c >= 47136 && c <= 52911) || ((c >= 63744 && c <= 64255) || (c >= 63488 && c <= 64031))))));
    }

    public String normalize(String str) {
        String mkString = new StringOps(Predef$.MODULE$.augmentString((String) new StringOps(Predef$.MODULE$.augmentString(stripAccents(str.trim()))).filter(new BasicTokenizer$$anonfun$1(this)))).mkString("");
        return this.caseSensitive ? mkString : mkString.toLowerCase();
    }

    public IndexedToken[] tokenize(Sentence sentence) {
        int i;
        ArrayBuffer arrayBuffer = (ArrayBuffer) ArrayBuffer$.MODULE$.apply(Nil$.MODULE$);
        String content = sentence.content();
        int i2 = 0;
        while (true) {
            int i3 = i2;
            if (i3 >= content.length()) {
                return (IndexedToken[]) arrayBuffer.toArray(ClassTag$.MODULE$.apply(IndexedToken.class));
            }
            while (i3 < content.length() && isWhitespace(StringOps$.MODULE$.apply$extension(Predef$.MODULE$.augmentString(content), i3)) && !isPunctuation(StringOps$.MODULE$.apply$extension(Predef$.MODULE$.augmentString(content), i3))) {
                i3++;
            }
            int i4 = i3;
            while (true) {
                i = i4;
                if (i >= content.length() || isToFilter(StringOps$.MODULE$.apply$extension(Predef$.MODULE$.augmentString(content), i)) || isPunctuation(StringOps$.MODULE$.apply$extension(Predef$.MODULE$.augmentString(content), i)) || isChinese(StringOps$.MODULE$.apply$extension(Predef$.MODULE$.augmentString(content), i)) || isWhitespace(StringOps$.MODULE$.apply$extension(Predef$.MODULE$.augmentString(content), i))) {
                    break;
                }
                i4 = i + 1;
            }
            if (i > i3) {
                append$1(i3, i, sentence, arrayBuffer, content);
            }
            if (i < content.length() && (isPunctuation(StringOps$.MODULE$.apply$extension(Predef$.MODULE$.augmentString(content), i)) || isChinese(StringOps$.MODULE$.apply$extension(Predef$.MODULE$.augmentString(content), i)))) {
                append$1(i, i + 1, sentence, arrayBuffer, content);
            }
            i2 = i + 1;
        }
    }

    private final void append$1(int i, int i2, Sentence sentence, ArrayBuffer arrayBuffer, String str) {
        Predef$.MODULE$.assert(i2 > i);
        String normalize = normalize(str.substring(i, i2));
        if (normalize.isEmpty()) {
            return;
        }
        arrayBuffer.append(Predef$.MODULE$.wrapRefArray(new IndexedToken[]{new IndexedToken(normalize, i + sentence.start(), (i2 - 1) + sentence.start())}));
    }

    public BasicTokenizer(boolean z) {
        this.caseSensitive = z;
    }
}
