/*
 * Decompiled with CFR 0.152.
 */
package com.johnsnowlabs.nlp.annotators.tokenizer.wordpiece;

import com.johnsnowlabs.nlp.annotators.common.IndexedToken;
import com.johnsnowlabs.nlp.annotators.common.Sentence;
import com.johnsnowlabs.nlp.annotators.tokenizer.wordpiece.BasicTokenizer$;
import java.text.Normalizer;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;
import scala.Function1;
import scala.Predef$;
import scala.Serializable;
import scala.collection.Seq;
import scala.collection.immutable.Nil$;
import scala.collection.immutable.StringOps;
import scala.collection.immutable.StringOps$;
import scala.collection.mutable.ArrayBuffer;
import scala.collection.mutable.ArrayBuffer$;
import scala.reflect.ClassTag$;
import scala.reflect.ScalaSignature;
import scala.runtime.BoxesRunTime;

@ScalaSignature(bytes="\u0006\u0001I4Qa\u0004\t\u0001-qA\u0001b\t\u0001\u0003\u0002\u0003\u0006I!\n\u0005\u0006Q\u0001!\t!\u000b\u0005\u0006[\u0001!\tA\f\u0005\u0006i\u0001!\t!\u000e\u0005\u0006o\u0001!\t\u0001\u000f\u0005\u0006u\u0001!\ta\u000f\u0005\u0006{\u0001!\tA\u0010\u0005\u0006\u0019\u0002!\t!\u0014\u0005\u0006\u001f\u0002!\t\u0001\u0015\u0005\u0006%\u0002!\taU\u0004\tEB\t\t\u0011#\u0001\u0017G\u001aAq\u0002EA\u0001\u0012\u00031B\rC\u0003)\u0019\u0011\u0005Q\rC\u0004g\u0019E\u0005I\u0011A4\u0003\u001d\t\u000b7/[2U_.,g.\u001b>fe*\u0011\u0011CE\u0001\no>\u0014H\r]5fG\u0016T!a\u0005\u000b\u0002\u0013Q|7.\u001a8ju\u0016\u0014(BA\u000b\u0017\u0003)\tgN\\8uCR|'o\u001d\u0006\u0003/a\t1A\u001c7q\u0015\tI\"$\u0001\u0007k_\"t7O\\8xY\u0006\u00147OC\u0001\u001c\u0003\r\u0019w.\\\n\u0003\u0001u\u0001\"AH\u0011\u000e\u0003}Q\u0011\u0001I\u0001\u0006g\u000e\fG.Y\u0005\u0003E}\u0011a!\u00118z%\u00164\u0017!D2bg\u0016\u001cVM\\:ji&4Xm\u0001\u0001\u0011\u0005y1\u0013BA\u0014 \u0005\u001d\u0011un\u001c7fC:\fa\u0001P5oSRtDC\u0001\u0016-!\tY\u0003!D\u0001\u0011\u0011\u001d\u0019#\u0001%AA\u0002\u0015\nA\"[:XQ&$Xm\u001d9bG\u0016$\"!J\u0018\t\u000bA\u001a\u0001\u0019A\u0019\u0002\t\rD\u0017M\u001d\t\u0003=IJ!aM\u0010\u0003\t\rC\u0017M]\u0001\nSN\u001cuN\u001c;s_2$\"!\n\u001c\t\u000bA\"\u0001\u0019A\u0019\u0002\u0015%\u001cHk\u001c$jYR,'\u000f\u0006\u0002&s!)\u0001'\u0002a\u0001c\u0005i\u0011n\u001d)v]\u000e$X/\u0019;j_:$\"!\n\u001f\t\u000bA2\u0001\u0019A\u0019\u0002\u0019M$(/\u001b9BG\u000e,g\u000e^:\u0015\u0005}R\u0005C\u0001!H\u001d\t\tU\t\u0005\u0002C?5\t1I\u0003\u0002EI\u00051AH]8pizJ!AR\u0010\u0002\rA\u0013X\rZ3g\u0013\tA\u0015J\u0001\u0004TiJLgn\u001a\u0006\u0003\r~AQaS\u0004A\u0002}\nA\u0001^3yi\u0006I\u0011n]\"iS:,7/\u001a\u000b\u0003K9CQ\u0001\r\u0005A\u0002E\n\u0011B\\8s[\u0006d\u0017N_3\u0015\u0005}\n\u0006\"B&\n\u0001\u0004y\u0014\u0001\u0003;pW\u0016t\u0017N_3\u0015\u0005Qk\u0006c\u0001\u0010V/&\u0011ak\b\u0002\u0006\u0003J\u0014\u0018-\u001f\t\u00031nk\u0011!\u0017\u
0006\u00035R\taaY8n[>t\u0017B\u0001/Z\u00051Ie\u000eZ3yK\u0012$vn[3o\u0011\u0015q&\u00021\u0001`\u0003!\u0019XM\u001c;f]\u000e,\u0007C\u0001-a\u0013\t\t\u0017L\u0001\u0005TK:$XM\\2f\u00039\u0011\u0015m]5d)>\\WM\\5{KJ\u0004\"a\u000b\u0007\u0014\u00051iB#A2\u00027\u0011bWm]:j]&$He\u001a:fCR,'\u000f\n3fM\u0006,H\u000e\u001e\u00132+\u0005A'FA\u0013jW\u0005Q\u0007CA6q\u001b\u0005a'BA7o\u0003%)hn\u00195fG.,GM\u0003\u0002p?\u0005Q\u0011M\u001c8pi\u0006$\u0018n\u001c8\n\u0005Ed'!E;oG\",7m[3e-\u0006\u0014\u0018.\u00198dK\u0002")
public class BasicTokenizer {
    private final boolean caseSensitive;

    public static boolean $lessinit$greater$default$1() {
        return BasicTokenizer$.MODULE$.$lessinit$greater$default$1();
    }

    public boolean isWhitespace(char c) {
        return c == ' ' || c == '\t' || c == '\n' || c == '\r' || Character.isWhitespace(c);
    }

    public boolean isControl(char c) {
        if (c == '\t' || c == '\n' || c == '\r') {
            return false;
        }
        return Character.isISOControl(c);
    }

    public boolean isToFilter(char c) {
        char cp = c;
        return cp == '\u0000' || cp == '\ufffd' || this.isControl(c);
    }

    public boolean isPunctuation(char c) {
        boolean bl;
        char cp = c;
        if (cp >= '!' && cp <= '/' || cp >= ':' && cp <= '@' || cp >= '[' && cp <= '`' || cp >= '{' && cp <= '~') {
            return true;
        }
        try {
            String string;
            String charCategory;
            String string2 = charCategory = Character.getName(c);
            String string3 = string2 != null ? (string = string2) : "";
            String charCategoryString = string3;
            bl = charCategoryString.contains("PUNCTUATION");
        }
        catch (Exception exception) {
            bl = false;
        }
        return bl;
    }

    public String stripAccents(String text) {
        return Normalizer.normalize(text, Normalizer.Form.NFD).replaceAll("\\p{InCombiningDiacriticalMarks}+", "");
    }

    public boolean isChinese(char c) {
        char c2 = c;
        return c2 >= '\u4e00' && c2 <= '\u9fff' || c2 >= '\u3400' && c2 <= '\u4dbf' || c2 >= '\u20000' && c2 <= '\u2a6df' || c2 >= '\u2a700' && c2 <= '\u2b73f' || c2 >= '\u2b740' && c2 <= '\u2b81f' || c2 >= '\u2b820' && c2 <= '\u2ceaf' || c2 >= '\uf900' && c2 <= '\ufaff' || c2 >= '\u2f800' && c2 <= '\u2fa1f';
    }

    public String normalize(String text) {
        String result = new StringOps(Predef$.MODULE$.augmentString((String)new StringOps(Predef$.MODULE$.augmentString(this.stripAccents(text.trim()))).filter((Function1 & java.io.Serializable & Serializable)c -> BoxesRunTime.boxToBoolean((boolean)BasicTokenizer.$anonfun$normalize$1(this, BoxesRunTime.unboxToChar((Object)c)))))).mkString("");
        return this.caseSensitive ? result : result.toLowerCase();
    }

    public IndexedToken[] tokenize(Sentence sentence) {
        ArrayBuffer tokens = (ArrayBuffer)ArrayBuffer$.MODULE$.apply((Seq)Nil$.MODULE$);
        String s = sentence.content();
        int i = 0;
        while (i < s.length()) {
            int end;
            while (i < s.length() && this.isWhitespace(StringOps$.MODULE$.apply$extension(Predef$.MODULE$.augmentString(s), i)) && !this.isPunctuation(StringOps$.MODULE$.apply$extension(Predef$.MODULE$.augmentString(s), i))) {
                ++i;
            }
            for (end = i; !(end >= s.length() || this.isToFilter(StringOps$.MODULE$.apply$extension(Predef$.MODULE$.augmentString(s), end)) || this.isPunctuation(StringOps$.MODULE$.apply$extension(Predef$.MODULE$.augmentString(s), end)) || this.isChinese(StringOps$.MODULE$.apply$extension(Predef$.MODULE$.augmentString(s), end)) || this.isWhitespace(StringOps$.MODULE$.apply$extension(Predef$.MODULE$.augmentString(s), end))); ++end) {
            }
            if (end > i) {
                this.append$1(i, end, s, sentence, tokens);
            }
            if (end < s.length() && (this.isPunctuation(StringOps$.MODULE$.apply$extension(Predef$.MODULE$.augmentString(s), end)) || this.isChinese(StringOps$.MODULE$.apply$extension(Predef$.MODULE$.augmentString(s), end)))) {
                this.append$1(end, end + 1, s, sentence, tokens);
            }
            i = end + 1;
        }
        return (IndexedToken[])tokens.toArray(ClassTag$.MODULE$.apply(IndexedToken.class));
    }

    public static final /* synthetic */ boolean $anonfun$normalize$1(BasicTokenizer $this, char c) {
        return !$this.isToFilter(c);
    }

    private final void append$1(int start, int end, String s$1, Sentence sentence$1, ArrayBuffer tokens$1) {
        block0: {
            Predef$.MODULE$.assert(end > start);
            String text = s$1.substring(start, end);
            String normalized = this.normalize(text);
            if (normalized.isEmpty()) break block0;
            IndexedToken token = new IndexedToken(normalized, start + sentence$1.start(), end - 1 + sentence$1.start());
            tokens$1.append((Seq)Predef$.MODULE$.wrapRefArray((Object[])new IndexedToken[]{token}));
        }
    }

    public BasicTokenizer(boolean caseSensitive) {
        this.caseSensitive = caseSensitive;
    }
}

