package com.johnsnowlabs.nlp.annotators.tokenizer.bpe;

import com.johnsnowlabs.nlp.annotators.common.IndexedToken;
import com.johnsnowlabs.nlp.annotators.common.TokenPiece;
import com.johnsnowlabs.nlp.annotators.tokenizer.moses.MosesTokenizer;
import com.johnsnowlabs.nlp.annotators.tokenizer.normalizer.MosesPunctNormalizer;
import com.johnsnowlabs.nlp.annotators.tokenizer.normalizer.MosesPunctNormalizer$;
import java.text.Normalizer;
import java.util.Locale;
import scala.Array$;
import scala.MatchError;
import scala.Option;
import scala.Predef$;
import scala.Some;
import scala.Tuple2;
import scala.collection.TraversableOnce;
import scala.collection.immutable.Map;
import scala.collection.immutable.StringOps;
import scala.collection.mutable.ArrayOps;
import scala.reflect.ClassTag$;
import scala.reflect.ScalaSignature;
import scala.runtime.BoxesRunTime;

/* compiled from: XlmTokenizer.scala */
@ScalaSignature(bytes = "\u0006\u0001\u0005Ec!\u0002\r\u001a\u0001})\u0003\"\u0003\u0016\u0001\u0005\u0003\u0005\u000b\u0011\u0002\u0017D\u0011%!\u0005A!A!\u0002\u0013)e\tC\u0005H\u0001\t\u0005\t\u0015!\u0003I\u0017\"AA\n\u0001B\u0001B\u0003%Q\n\u0003\u0005Q\u0001\t\u0005\t\u0015!\u0003>\u0011!\t\u0006A!A!\u0002\u0013i\u0005\"\u0002*\u0001\t\u0003\u0019\u0006\"B.\u0001\t\u0003a\u0006bB0\u0001\u0005\u0004%\t\u0001\u0019\u0005\u0007O\u0002\u0001\u000b\u0011B1\t\u000f!\u0004!\u0019!C\u0001S\"1\u0001\u000f\u0001Q\u0001\n)DQ!\u001d\u0001\u0005\nIDQ\u0001\u001f\u0001\u0005BeD\u0011\"!\u0003\u0001\u0005\u0004%\t%a\u0003\t\u0011\u0005M\u0001\u0001)A\u0005\u0003\u001bAaA\u0007\u0001\u0005B\u0005UqACA\u00123\u0005\u0005\t\u0012A\u0010\u0002&\u0019I\u0001$GA\u0001\u0012\u0003y\u0012q\u0005\u0005\u0007%N!\t!a\f\t\u0013\u0005E2#%A\u0005\u0002\u0005M\u0002\"CA%'E\u0005I\u0011AA&\u0011%\tyeEI\u0001\n\u0003\t\u0019D\u0001\u0007YY6$vn[3oSj,'O\u0003\u0002\u001b7\u0005\u0019!\r]3\u000b\u0005qi\u0012!\u0003;pW\u0016t\u0017N_3s\u0015\tqr$\u0001\u0006b]:|G/\u0019;peNT!\u0001I\u0011\u0002\u00079d\u0007O\u0003\u0002#G\u0005a!n\u001c5og:|w\u000f\\1cg*\tA%A\u0002d_6\u001c\"\u0001\u0001\u0014\u0011\u0005\u001dBS\"A\r\n\u0005%J\"\u0001\u0004\"qKR{7.\u001a8ju\u0016\u0014\u0018AB7fe\u001e,7o\u0001\u0001\u0011\t52\u0014\b\u0011\b\u0003]Q\u0002\"a\f\u001a\u000e\u0003AR!!M\u0016\u0002\rq\u0012xn\u001c;?\u0015\u0005\u0019\u0014!B:dC2\f\u0017BA\u001b3\u0003\u0019\u0001&/\u001a3fM&\u0011q\u0007\u000f\u0002\u0004\u001b\u0006\u0004(BA\u001b3!\u0011Q4(P\u001f\u000e\u0003IJ!\u0001\u0010\u001a\u0003\rQ+\b\u000f\\33!\tic(\u0003\u0002@q\t11\u000b\u001e:j]\u001e\u0004\"AO!\n\u0005\t\u0013$aA%oi&\u0011!\u0006K\u0001\u0006m>\u001c\u0017M\u0019\t\u0005[Yj\u0004)\u0003\u0002EQ\u0005i1\u000f]3dS\u0006dGk\\6f]N\u0004\"aJ%\n\u0005)K\"!D*qK\u000eL\u0017\r\u001c+pW\u0016t7/\u0003\u0002HQ\u0005)\u0002/\u00193XSRD7+\u001a8uK:\u001cW\rV8lK:\u001c\bC\u0001\u001eO\u0013\ty%GA\u0004C_>dW-\u00198\u0002\t1\fgnZ\u0001\u001bI>d
un^3sG\u0006\u001cX-\u00118e%\u0016lwN^3BG\u000e,g\u000e^\u0001\u0007y%t\u0017\u000e\u001e \u0015\u000fQ+fk\u0016-Z5B\u0011q\u0005\u0001\u0005\u0006U\u001d\u0001\r\u0001\f\u0005\u0006\t\u001e\u0001\r!\u0012\u0005\u0006\u000f\u001e\u0001\r\u0001\u0013\u0005\b\u0019\u001e\u0001\n\u00111\u0001N\u0011\u001d\u0001v\u0001%AA\u0002uBq!U\u0004\u0011\u0002\u0003\u0007Q*\u0001\rm_^,'oY1tK\u0006sGMU3n_Z,\u0017iY2f]R$\"!P/\t\u000byC\u0001\u0019A\u001f\u0002\u000b%t\u0007/\u001e;\u0002\u001f5|7/Z:O_Jl\u0017\r\\5{KJ,\u0012!\u0019\t\u0003E\u0016l\u0011a\u0019\u0006\u0003In\t!B\\8s[\u0006d\u0017N_3s\u0013\t17M\u0001\u000bN_N,7\u000fU;oGRtuN]7bY&TXM]\u0001\u0011[>\u001cXm\u001d(pe6\fG.\u001b>fe\u0002\na\"\\8tKN$vn[3oSj,'/F\u0001k!\tYg.D\u0001m\u0015\ti7$A\u0003n_N,7/\u0003\u0002pY\nqQj\\:fgR{7.\u001a8ju\u0016\u0014\u0018aD7pg\u0016\u001cHk\\6f]&TXM\u001d\u0011\u0002\u001b5|7/Z:QSB,G.\u001b8f)\t\u0019h\u000fE\u0002;ivJ!!\u001e\u001a\u0003\u000b\u0005\u0013(/Y=\t\u000b]l\u0001\u0019A\u001f\u0002\tQ,\u0007\u0010^\u0001\u0010i>\\WM\\5{KN+(\rV3yiR)!0a\u0001\u0002\u0006A\u0019!\b^>\u0011\u0005q|X\"A?\u000b\u0005yl\u0012AB2p[6|g.C\u0002\u0002\u0002u\u0014A\"\u00138eKb,G\rV8lK:DQa\u001e\bA\u0002uBa!a\u0002\u000f\u0001\u0004\u0001\u0015aC5oI\u0016DxJ\u001a4tKR\f\u0001#\u00199qK:$gi\u001c:QS\u0016\u001cW-\u00133\u0016\u0005\u00055\u0001\u0003\u0002\u001e\u0002\u0010uJ1!!\u00053\u0005\u0019y\u0005\u000f^5p]\u0006\t\u0012\r\u001d9f]\u00124uN\u001d)jK\u000e,\u0017\n\u001a\u0011\u0015\t\u0005]\u0011q\u0004\t\u0005uQ\fI\u0002E\u0002}\u00037I1!!\b~\u0005)!vn[3o!&,7-\u001a\u0005\u0007\u0003C\t\u0002\u0019A>\u0002\u0011%tG\rV8lK:\fA\u0002\u00177n)>\\WM\\5{KJ\u0004\"aJ\n\u0014\u0007M\tI\u0003E\u0002;\u0003WI1!!\f3\u0005\u0019\te.\u001f*fMR\u0011\u0011QE\u0001\u001cI1,7o]5oSR$sM]3bi\u0016\u0014H\u0005Z3gCVdG\u000f\n\u001b\u0016\u0005\u0005U\"fA'\u00028-\u0012\u0011\u0011\b\t\u0005\u0003w\t)%\u0004\u0002\u0002>)!\u0011qHA!\u0003%)hn\u00195fG.,GMC\u0002\u0002DI\n!\"\u00198o_R\fG/[8o\u0013\u0011\t9%!\u001
0\u0003#Ut7\r[3dW\u0016$g+\u0019:jC:\u001cW-A\u000e%Y\u0016\u001c8/\u001b8ji\u0012:'/Z1uKJ$C-\u001a4bk2$H%N\u000b\u0003\u0003\u001bR3!PA\u001c\u0003m!C.Z:tS:LG\u000fJ4sK\u0006$XM\u001d\u0013eK\u001a\fW\u000f\u001c;%m\u0001")
/* loaded from: input_file:com/johnsnowlabs/nlp/annotators/tokenizer/bpe/XlmTokenizer.class */
public class XlmTokenizer extends BpeTokenizer {
    private final boolean doLowercaseAndRemoveAccent;
    private final MosesPunctNormalizer mosesNormalizer;
    private final MosesTokenizer mosesTokenizer;
    private final Option<String> appendForPieceId;

    public String lowercaseAndRemoveAccent(String str) {
        return new ArrayOps.ofChar(Predef$.MODULE$.charArrayOps((char[]) new ArrayOps.ofChar(Predef$.MODULE$.charArrayOps(Normalizer.normalize(str.toLowerCase(), Normalizer.Form.NFD).toCharArray())).filter(obj -> {
            return BoxesRunTime.boxToBoolean($anonfun$lowercaseAndRemoveAccent$1(BoxesRunTime.unboxToChar(obj)));
        }))).mkString().toLowerCase();
    }

    public MosesPunctNormalizer mosesNormalizer() {
        return this.mosesNormalizer;
    }

    public MosesTokenizer mosesTokenizer() {
        return this.mosesTokenizer;
    }

    private String[] mosesPipeline(String str) {
        return mosesTokenizer().tokenize(mosesNormalizer().removeNonPrintingChar(mosesNormalizer().normalize(str)));
    }

    @Override // com.johnsnowlabs.nlp.annotators.tokenizer.bpe.BpeTokenizer
    public IndexedToken[] tokenizeSubText(String str, int i) {
        String[] mosesPipeline = mosesPipeline(str);
        String lowercaseAndRemoveAccent = this.doLowercaseAndRemoveAccent ? lowercaseAndRemoveAccent(new ArrayOps.ofRef(Predef$.MODULE$.refArrayOps(mosesPipeline)).mkString(" ")) : new ArrayOps.ofRef(Predef$.MODULE$.refArrayOps(mosesPipeline)).mkString(" ");
        String lowercaseAndRemoveAccent2 = this.doLowercaseAndRemoveAccent ? lowercaseAndRemoveAccent(str) : str;
        return (IndexedToken[]) new ArrayOps.ofRef(Predef$.MODULE$.refArrayOps(lowercaseAndRemoveAccent.split(" "))).map(str2 -> {
            int indexOf = lowercaseAndRemoveAccent2.indexOf(str2);
            return new IndexedToken(str2, i + indexOf, ((i + indexOf) + str2.length()) - 1);
        }, Array$.MODULE$.canBuildFrom(ClassTag$.MODULE$.apply(IndexedToken.class)));
    }

    @Override // com.johnsnowlabs.nlp.annotators.tokenizer.bpe.BpeTokenizer
    public Option<String> appendForPieceId() {
        return this.appendForPieceId;
    }

    @Override // com.johnsnowlabs.nlp.annotators.tokenizer.bpe.BpeTokenizer
    public TokenPiece[] bpe(IndexedToken indexedToken) {
        String[] strArr;
        String preProcessTokenForBpe = preProcessTokenForBpe(indexedToken.token());
        String[] strArr2 = (String[]) ((TraversableOnce) new StringOps(Predef$.MODULE$.augmentString(preProcessTokenForBpe)).map(obj -> {
            return $anonfun$bpe$1(BoxesRunTime.unboxToChar(obj));
        }, Predef$.MODULE$.fallbackStringCanBuildFrom())).toArray(ClassTag$.MODULE$.apply(String.class));
        Tuple2<String, String>[] bytePairs = getBytePairs(strArr2);
        if (new ArrayOps.ofRef(Predef$.MODULE$.refArrayOps(bytePairs)).isEmpty()) {
            strArr = new String[]{preProcessTokenForBpe};
        } else {
            int length = bytePairs.length - 1;
            Tuple2<String, String> tuple2 = bytePairs[bytePairs.length - 1];
            if (tuple2 == null) {
                throw new MatchError(tuple2);
            }
            bytePairs[length] = new Tuple2<>((String) tuple2._1(), new StringBuilder(4).append((String) tuple2._2()).append("</w>").toString());
            int length2 = strArr2.length - 1;
            strArr2[length2] = new StringBuilder(4).append(strArr2[length2]).append("</w>").toString();
            strArr = (String[]) new ArrayOps.ofRef(Predef$.MODULE$.refArrayOps(performMerges(strArr2, bytePairs))).map(str -> {
                return str.replace("</w>", "");
            }, Array$.MODULE$.canBuildFrom(ClassTag$.MODULE$.apply(String.class)));
        }
        return getTokenPieces(indexedToken, strArr, preProcessTokenForBpe);
    }

    public static final /* synthetic */ boolean $anonfun$lowercaseAndRemoveAccent$1(char c) {
        return Character.getType(c) != 6;
    }

    public static final /* synthetic */ String $anonfun$bpe$1(char c) {
        return BoxesRunTime.boxToCharacter(c).toString();
    }

    /* JADX WARN: 'super' call moved to the top of the method (can break code semantics) */
    public XlmTokenizer(Map<Tuple2<String, String>, Object> map, Map<String, Object> map2, SpecialTokens specialTokens, boolean z, String str, boolean z2) {
        super(map, map2, specialTokens, z);
        this.doLowercaseAndRemoveAccent = z2;
        Predef$.MODULE$.require(str != null ? str.equals("en") : "en" == 0, () -> {
            return "Only English is supported currently.";
        });
        this.mosesNormalizer = new MosesPunctNormalizer(MosesPunctNormalizer$.MODULE$.$lessinit$greater$default$1());
        this.mosesTokenizer = new MosesTokenizer(str);
        this.appendForPieceId = new Some("</w>");
    }
}
