package com.johnsnowlabs.nlp.annotators.tokenizer.bpe;

import com.johnsnowlabs.nlp.annotators.common.IndexedToken;
import java.nio.charset.Charset;
import scala.Array$;
import scala.None$;
import scala.Option;
import scala.Predef$;
import scala.Some;
import scala.Tuple2;
import scala.collection.GenIterable;
import scala.collection.TraversableLike;
import scala.collection.TraversableOnce;
import scala.collection.generic.TraversableForwarder;
import scala.collection.immutable.IndexedSeq$;
import scala.collection.immutable.Map;
import scala.collection.immutable.Map$;
import scala.collection.immutable.StringOps;
import scala.collection.mutable.ArrayOps;
import scala.collection.mutable.ListBuffer;
import scala.collection.mutable.ListBuffer$;
import scala.math.Numeric$IntIsIntegral$;
import scala.reflect.ClassTag$;
import scala.reflect.ScalaSignature;
import scala.runtime.BoxesRunTime;
import scala.runtime.IntRef;
import scala.runtime.RichInt$;
import scala.util.matching.Regex;

/* compiled from: Gpt2Tokenizer.scala */
@ScalaSignature(bytes = "\u0006\u0001\u0005mc\u0001\u0002\u000e\u001c\u0001!B\u0011\"\f\u0001\u0003\u0002\u0003\u0006IAL#\t\u0013\u0019\u0003!\u0011!Q\u0001\n\u001dC\u0005\"C%\u0001\u0005\u0003\u0005\u000b\u0011\u0002&N\u0011!q\u0005A!A!\u0002\u0013y\u0005\u0002\u0003*\u0001\u0005\u0003\u0005\u000b\u0011B \t\u000bM\u0003A\u0011\u0001+\t\u000fm\u0003!\u0019!C\u00059\"1a\f\u0001Q\u0001\nuCqa\u0018\u0001C\u0002\u0013\u0005\u0003\r\u0003\u0004e\u0001\u0001\u0006I!\u0019\u0005\bK\u0002\u0011\r\u0011\"\u0003g\u0011\u0019q\u0007\u0001)A\u0005O\"9q\u000e\u0001b\u0001\n\u0013\u0001\bBB9\u0001A\u0003%q\tC\u0003s\u0001\u0011\u00053\u000fC\u0004w\u0001\t\u0007I\u0011A<\t\u000f\u0005\u0005\u0001\u0001)A\u0005q\"9\u00111\u0001\u0001\u0005B\u0005\u0015\u0001bBA\u0011\u0001\u0011\u0005\u00111\u0005\u0005\u000e\u0003W\u0001\u0001\u0013aA\u0001\u0002\u0013%\u0011QF'\b\u0013\u0005=2$!A\t\u0002\u0005Eb\u0001\u0003\u000e\u001c\u0003\u0003E\t!a\r\t\rM3B\u0011AA\u001e\u0011%\tiDFI\u0001\n\u0003\ty\u0004C\u0005\u0002VY\t\n\u0011\"\u0001\u0002X\tiq\t\u001d;3)>\\WM\\5{KJT!\u0001H\u000f\u0002\u0007\t\u0004XM\u0003\u0002\u001f?\u0005IAo\\6f]&TXM\u001d\u0006\u0003A\u0005\n!\"\u00198o_R\fGo\u001c:t\u0015\t\u00113%A\u0002oYBT!\u0001J\u0013\u0002\u0019)|\u0007N\\:o_^d\u0017MY:\u000b\u0003\u0019\n1aY8n\u0007\u0001\u0019\"\u0001A\u0015\u0011\u0005)ZS\"A\u000e\n\u00051Z\"\u0001\u0004\"qKR{7.\u001a8ju\u0016\u0014\u0018AB7fe\u001e,7\u000f\u0005\u00030qm\u0012eB\u0001\u00197!\t\tD'D\u00013\u0015\t\u0019t%\u0001\u0004=e>|GO\u0010\u0006\u0002k\u0005)1oY1mC&\u0011q\u0007N\u0001\u0007!J,G-\u001a4\n\u0005eR$aA'ba*\u0011q\u0007\u000e\t\u0005yuzt(D\u00015\u0013\tqDG\u0001\u0004UkBdWM\r\t\u0003_\u0001K!!\u0011\u001e\u0003\rM#(/\u001b8h!\ta4)\u0003\u0002Ei\t\u0019\u0011J\u001c;\n\u00055Z\u0013!\u0002<pG\u0006\u0014\u0007\u0003B\u00189\u007f\tK!AR\u0016\u0002\u001bM\u0004XmY5bYR{7.\u001a8t!\tQ3*\u0003\u0002M7\ti1\u000b]3dS\u0006dGk\\6f]NL!!S\u0016\u0002+A\fGmV5uQN+g\u000e^3oG\u0016$vn[3ogB\u0011A\bU\u0005\u0003#R
\u0012qAQ8pY\u0016\fg.A\u0007qe\u0016\u0004XM\u001c3TiJLgnZ\u0001\u0007y%t\u0017\u000e\u001e \u0015\rU3v\u000bW-[!\tQ\u0003\u0001C\u0003.\r\u0001\u0007a\u0006C\u0003G\r\u0001\u0007q\tC\u0003J\r\u0001\u0007!\nC\u0004O\rA\u0005\t\u0019A(\t\u000fI3\u0001\u0013!a\u0001\u007f\u0005)\"-\u001f;fgR{WK\\5d_\u0012,W*\u00199qS:<W#A/\u0011\t=B$iP\u0001\u0017Ef$Xm\u001d+p+:L7m\u001c3f\u001b\u0006\u0004\b/\u001b8hA\u0005\t\u0002O]3qK:$gi\u001c:QS\u0016\u001cW-\u00133\u0016\u0003\u0005\u00042\u0001\u00102@\u0013\t\u0019GG\u0001\u0004PaRLwN\\\u0001\u0013aJ,\u0007/\u001a8e\r>\u0014\b+[3dK&#\u0007%\u0001\u0007eK\u000e|G-\u001a:W_\u000e\f'-F\u0001h!\u0011AWNQ \u000e\u0003%T!A[6\u0002\u0013%lW.\u001e;bE2,'B\u000175\u0003)\u0019w\u000e\u001c7fGRLwN\\\u0005\u0003s%\fQ\u0002Z3d_\u0012,'OV8dC\n\u0004\u0013\u0001F;oS\u000e|G-\u001a+p\u0005f$X-T1qa&tw-F\u0001H\u0003U)h.[2pI\u0016$vNQ=uK6\u000b\u0007\u000f]5oO\u0002\nQ\u0003\u001d:f!J|7-Z:t)>\\WM\u001c$pe\n\u0003X\r\u0006\u0002@i\")Qo\u0004a\u0001\u007f\u0005)Ao\\6f]\u0006a1\u000f\u001d7jiB\u000bG\u000f^3s]V\t\u0001\u0010\u0005\u0002z}6\t!P\u0003\u0002|y\u0006AQ.\u0019;dQ&twM\u0003\u0002~i\u0005!Q\u000f^5m\u0013\ty(PA\u0003SK\u001e,\u00070A\u0007ta2LG\u000fU1ui\u0016\u0014h\u000eI\u0001\u0010i>\\WM\\5{KN+(\rV3yiR1\u0011qAA\r\u0003;\u0001R\u0001PA\u0005\u0003\u001bI1!a\u00035\u0005\u0015\t%O]1z!\u0011\ty!!\u0006\u000e\u0005\u0005E!bAA\n?\u000511m\\7n_:LA!a\u0006\u0002\u0012\ta\u0011J\u001c3fq\u0016$Gk\\6f]\"1\u00111\u0004\nA\u0002}\nA\u0001^3yi\"1\u0011q\u0004\nA\u0002\t\u000b1\"\u001b8eKb|eMZ:fi\u0006aA-Z2pI\u0016$vn[3ogR\u0019q(!\n\t\u000f\u0005\u001d2\u00031\u0001\u0002*\u00051Ao\\6f]N\u0004B\u0001PA\u0005\u0005\u0006\u00192/\u001e9fe\u0012\u001a\b/Z2jC2$vn[3ogV\t!*A\u0007HaR\u0014Dk\\6f]&TXM\u001d\t\u0003UY\u00192AFA\u001b!\ra\u0014qG\u0005\u0004\u0003s!$AB!osJ+g\r\u0006\u0002\u00022\u0005YB\u0005\\3tg&t\u0017\u000e\u001e\u0013he\u0016\fG/\u001a:%I\u00164\u0017-\u001e7uIQ*\"!!\u0011+\u0007=\u000b\u0019e\u000b\u0002\u0002FA!\u0011qIA)\u001b
\t\tIE\u0003\u0003\u0002L\u00055\u0013!C;oG\",7m[3e\u0015\r\ty\u0005N\u0001\u000bC:tw\u000e^1uS>t\u0017\u0002BA*\u0003\u0013\u0012\u0011#\u001e8dQ\u0016\u001c7.\u001a3WCJL\u0017M\\2f\u0003m!C.Z:tS:LG\u000fJ4sK\u0006$XM\u001d\u0013eK\u001a\fW\u000f\u001c;%kU\u0011\u0011\u0011\f\u0016\u0004\u007f\u0005\r\u0003")
/* loaded from: input_file:com/johnsnowlabs/nlp/annotators/tokenizer/bpe/Gpt2Tokenizer.class */
/**
 * BPE tokenizer specialisation that works on a byte-to-symbol alphabet.
 *
 * <p>The constructor builds a table that assigns a one-character string to every byte value.
 * {@link #preProcessTokenForBpe(String)} rewrites a token into that alphabet before merges are
 * applied, and {@link #decodeTokens(int[])} performs the inverse: vocabulary ids are turned back
 * into symbols, symbols into bytes, and the bytes are decoded as UTF-8.
 *
 * <p>This source is decompiled from Scala; the {@code $anonfun$...} static methods are the bodies
 * of the Scala lambdas and are only meant to be called from within this class.
 */
public class Gpt2Tokenizer extends BpeTokenizer {
    /** Integer value (0..256) to its one-character symbol; built in the constructor. */
    private final Map<Object, String> bytesToUnicodeMapping;
    /** {@code Some(str)} when the constructor's {@code str} argument is non-empty, else {@code None}. */
    private final Option<String> prependForPieceId;
    /** Inverse of the vocabulary: piece id to piece text. */
    private final Map<Object, String> decoderVocab;
    /** Inverse of {@link #bytesToUnicodeMapping}: symbol to integer value. */
    private final Map<String, Object> unicodeToByteMapping;
    /** Regex used by {@link #tokenizeSubText(String, int)} to cut text into pre-tokens. */
    private final Regex splitPattern;

    /** Synthetic accessor that lets the lambda bodies reach the superclass' special tokens. */
    private /* synthetic */ SpecialTokens super$specialTokens() {
        return super.specialTokens();
    }

    private Map<Object, String> bytesToUnicodeMapping() {
        return this.bytesToUnicodeMapping;
    }

    /**
     * @return the optional prefix string configured at construction time; empty strings are
     *     represented as {@code None}
     */
    @Override // com.johnsnowlabs.nlp.annotators.tokenizer.bpe.BpeTokenizer
    public Option<String> prependForPieceId() {
        return this.prependForPieceId;
    }

    private Map<Object, String> decoderVocab() {
        return this.decoderVocab;
    }

    private Map<String, Object> unicodeToByteMapping() {
        return this.unicodeToByteMapping;
    }

    /**
     * Rewrites a token into the byte-symbol alphabet.
     *
     * <p>The token is first encoded as UTF-8 and every resulting byte (taken as an unsigned value
     * 0..255) is replaced by its symbol from the byte-to-symbol table. Working on bytes rather
     * than on UTF-16 chars keeps this method the exact inverse of {@link #decodeTokens(int[])},
     * which maps symbols back to bytes and decodes them as UTF-8. It also means characters
     * outside Latin-1 no longer fail the table lookup, because every byte value has an entry.
     * For pure ASCII tokens the result is the same as a per-character lookup.
     *
     * @param str the token text; must not be {@code null}
     * @return the token expressed in byte symbols, one symbol per UTF-8 byte
     */
    @Override // com.johnsnowlabs.nlp.annotators.tokenizer.bpe.BpeTokenizer
    public String preProcessTokenForBpe(String str) {
        byte[] utf8 = str.getBytes(Charset.forName("UTF-8"));
        StringBuilder symbols = new StringBuilder(utf8.length);
        for (byte b : utf8) {
            // Java bytes are signed; mask to get the 0..255 key used by the table.
            symbols.append(bytesToUnicodeMapping().apply(BoxesRunTime.boxToInteger(b & 0xFF)));
        }
        return symbols.toString();
    }

    /** @return the regex used to split raw text into pre-tokens */
    public Regex splitPattern() {
        return this.splitPattern;
    }

    /**
     * Splits {@code str} with {@link #splitPattern()} and wraps every match in an
     * {@link IndexedToken}.
     *
     * <p>If no prefix string is configured and {@code str} does not already start with a space,
     * a single space is prepended before matching. Token positions are computed from the match
     * positions in that (possibly padded) string plus {@code i}; the end position is inclusive
     * ({@code match.end() + i - 1}).
     * NOTE(review): when the space is prepended the reported positions are relative to the padded
     * string, i.e. shifted by one compared to {@code str} - confirm callers expect this.
     *
     * @param str text to split
     * @param i offset added to every match position
     * @return one token per regex match, in match order
     */
    @Override // com.johnsnowlabs.nlp.annotators.tokenizer.bpe.BpeTokenizer
    public IndexedToken[] tokenizeSubText(String str, int i) {
        return (IndexedToken[]) splitPattern().findAllMatchIn((prependForPieceId().isDefined() || str.startsWith(" ")) ? str : new StringBuilder(1).append(" ").append(str).toString()).map(match -> {
            return new IndexedToken(match.matched(), match.start() + i, (match.end() + i) - 1);
        }).toArray(ClassTag$.MODULE$.apply(IndexedToken.class));
    }

    /**
     * Converts piece ids back into text.
     *
     * <p>Steps, in order: look every id up in the inverse vocabulary; drop pieces that the
     * special-token set contains; concatenate the remaining pieces; map every character of the
     * result to its integer value through the symbol-to-byte table; narrow each value to a
     * {@code byte}; decode the byte sequence as UTF-8.
     *
     * <p>Both lookups use {@code Map.apply}, so an id missing from the vocabulary or a character
     * missing from the symbol table makes the lookup fail with an exception.
     *
     * @param iArr piece ids to decode
     * @return the decoded text
     */
    public String decodeTokens(int[] iArr) {
        return new String((byte[]) ((TraversableOnce) ((TraversableLike) new StringOps(Predef$.MODULE$.augmentString(new ArrayOps.ofRef(Predef$.MODULE$.refArrayOps((Object[]) new ArrayOps.ofRef(Predef$.MODULE$.refArrayOps((Object[]) new ArrayOps.ofInt(Predef$.MODULE$.intArrayOps(iArr)).map(obj -> {
            return $anonfun$decodeTokens$1(this, BoxesRunTime.unboxToInt(obj));
        }, Array$.MODULE$.canBuildFrom(ClassTag$.MODULE$.apply(String.class))))).filter(str -> {
            return BoxesRunTime.boxToBoolean($anonfun$decodeTokens$2(this, str));
        }))).mkString(""))).map(obj2 -> {
            return BoxesRunTime.boxToInteger($anonfun$decodeTokens$3(this, BoxesRunTime.unboxToChar(obj2)));
        }, Predef$.MODULE$.fallbackStringCanBuildFrom())).map(obj3 -> {
            return BoxesRunTime.boxToByte($anonfun$decodeTokens$4(BoxesRunTime.unboxToInt(obj3)));
        }, IndexedSeq$.MODULE$.canBuildFrom())).toArray(ClassTag$.MODULE$.Byte()), Charset.forName("UTF-8"));
    }

    /** Lambda body: turns a code point value into the one-character string for that char. */
    public static final /* synthetic */ String $anonfun$bytesToUnicodeMapping$2(int i) {
        return BoxesRunTime.boxToCharacter((char) i).toString();
    }

    /**
     * Former fold step of {@code preProcessTokenForBpe}: appends the symbol for char {@code c}
     * to {@code str}. It is no longer called by {@link #preProcessTokenForBpe(String)} and is
     * kept only because it is a public member. The lookup fails for chars that are not keys of
     * the byte-to-symbol table.
     */
    public static final /* synthetic */ String $anonfun$preProcessTokenForBpe$1(Gpt2Tokenizer gpt2Tokenizer, String str, char c) {
        return new StringBuilder(0).append(str).append(gpt2Tokenizer.bytesToUnicodeMapping().apply(BoxesRunTime.boxToInteger(c))).toString();
    }

    /** Lambda body: piece id to piece text via the inverse vocabulary. */
    public static final /* synthetic */ String $anonfun$decodeTokens$1(Gpt2Tokenizer gpt2Tokenizer, int i) {
        return (String) gpt2Tokenizer.decoderVocab().apply(BoxesRunTime.boxToInteger(i));
    }

    /** Lambda body: {@code true} for pieces that are not contained in the special tokens. */
    public static final /* synthetic */ boolean $anonfun$decodeTokens$2(Gpt2Tokenizer gpt2Tokenizer, String str) {
        return !gpt2Tokenizer.super$specialTokens().contains(str);
    }

    /** Lambda body: symbol character to its integer value via the symbol-to-byte table. */
    public static final /* synthetic */ int $anonfun$decodeTokens$3(Gpt2Tokenizer gpt2Tokenizer, char c) {
        return BoxesRunTime.unboxToInt(gpt2Tokenizer.unicodeToByteMapping().apply(BoxesRunTime.boxToCharacter(c).toString()));
    }

    /** Lambda body: narrowing cast from int to byte. */
    public static final /* synthetic */ byte $anonfun$decodeTokens$4(int i) {
        return (byte) i;
    }

    /**
     * Builds the tokenizer and its lookup tables.
     *
     * @param map forwarded unchanged to the {@code BpeTokenizer} constructor (first argument)
     * @param map2 forwarded unchanged to the {@code BpeTokenizer} constructor (second argument);
     *     presumably the piece-to-id vocabulary that {@code vocab()} returns - TODO confirm
     * @param specialTokens forwarded unchanged to the {@code BpeTokenizer} constructor
     * @param z forwarded unchanged to the {@code BpeTokenizer} constructor (fourth argument)
     * @param str prefix string; stored as {@code Some(str)} when non-empty, otherwise
     *     {@code None}
     */
    public Gpt2Tokenizer(Map<Tuple2<String, String>, Object> map, Map<String, Object> map2, SpecialTokens specialTokens, boolean z, String str) {
        super(map, map2, specialTokens, z);
        // Values that keep their own code point as symbol: range(33, 127), range(161, 173), range(174, 256).
        ListBuffer $plus$plus = ListBuffer$.MODULE$.range(BoxesRunTime.boxToInteger(33), BoxesRunTime.boxToInteger(127), Numeric$IntIsIntegral$.MODULE$).$plus$plus(ListBuffer$.MODULE$.range(BoxesRunTime.boxToInteger(161), BoxesRunTime.boxToInteger(173), Numeric$IntIsIntegral$.MODULE$)).$plus$plus(ListBuffer$.MODULE$.range(BoxesRunTime.boxToInteger(174), BoxesRunTime.boxToInteger(256), Numeric$IntIsIntegral$.MODULE$));
        // Parallel list of symbol code points; starts as a copy of the value list.
        ListBuffer clone = $plus$plus.clone();
        IntRef create = IntRef.create(0);
        // Every value in 0..256 (inclusive) not covered above gets the next code point from 256 upwards.
        RichInt$.MODULE$.to$extension0(Predef$.MODULE$.intWrapper(0), 256).foreach$mVc$sp(i -> {
            if ($plus$plus.contains(BoxesRunTime.boxToInteger(i))) {
                return;
            }
            $plus$plus.$plus$eq(BoxesRunTime.boxToInteger(i));
            clone.$plus$eq(BoxesRunTime.boxToInteger(256 + create.elem));
            create.elem++;
        });
        // Pair each value with the one-character string of its symbol code point.
        this.bytesToUnicodeMapping = ((TraversableForwarder) $plus$plus.zip((GenIterable) clone.map(obj -> {
            return $anonfun$bytesToUnicodeMapping$2(BoxesRunTime.unboxToInt(obj));
        }, ListBuffer$.MODULE$.canBuildFrom()), ListBuffer$.MODULE$.canBuildFrom())).toMap(Predef$.MODULE$.$conforms());
        this.prependForPieceId = new StringOps(Predef$.MODULE$.augmentString(str)).nonEmpty() ? new Some(str) : None$.MODULE$;
        // Swap key and value of the vocabulary so ids can be looked up when decoding.
        this.decoderVocab = (Map) super.vocab().map(tuple2 -> {
            return new Tuple2(BoxesRunTime.boxToInteger(tuple2._2$mcI$sp()), tuple2._1());
        }, Map$.MODULE$.canBuildFrom());
        // Swap key and value of the byte-to-symbol table for the decoding direction.
        this.unicodeToByteMapping = (Map) bytesToUnicodeMapping().map(tuple22 -> {
            return new Tuple2(tuple22._2(), BoxesRunTime.boxToInteger(tuple22._1$mcI$sp()));
        }, Map$.MODULE$.canBuildFrom());
        this.splitPattern = new StringOps(Predef$.MODULE$.augmentString("'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+")).r();
    }
}
