/*
 * Decompiled with CFR 0.152.
 */
package com.johnsnowlabs.nlp.annotators;

import com.johnsnowlabs.nlp.Annotation;
import com.johnsnowlabs.nlp.AnnotatorModel;
import com.johnsnowlabs.nlp.AnnotatorType$;
import com.johnsnowlabs.nlp.HasSimpleAnnotate;
import com.johnsnowlabs.nlp.annotators.common.IndexedToken;
import com.johnsnowlabs.nlp.annotators.common.Sentence;
import com.johnsnowlabs.nlp.annotators.common.SentenceSplit$;
import com.johnsnowlabs.nlp.annotators.common.TokenizedSentence;
import com.johnsnowlabs.nlp.annotators.common.TokenizedWithSentence$;
import java.io.Serializable;
import org.apache.spark.ml.param.BooleanParam;
import org.apache.spark.ml.param.IntParam;
import org.apache.spark.ml.param.Param;
import org.apache.spark.ml.param.ParamPair;
import org.apache.spark.ml.param.ParamValidators$;
import org.apache.spark.ml.util.Identifiable;
import org.apache.spark.ml.util.Identifiable$;
import org.apache.spark.sql.expressions.UserDefinedFunction;
import scala.Array$;
import scala.Function1;
import scala.MatchError;
import scala.Predef$;
import scala.collection.Seq;
import scala.collection.Seq$;
import scala.collection.immutable.StringOps;
import scala.collection.mutable.ArrayOps;
import scala.package$;
import scala.reflect.ClassTag$;
import scala.reflect.ScalaSignature;
import scala.runtime.BoxesRunTime;
import scala.runtime.IntRef;
import scala.runtime.java8.JFunction1;
import scala.util.matching.Regex;

@ScalaSignature(bytes="\u0006\u0001\u0005Ed\u0001\u0002\u0011\"\u0001)B\u0001\u0002\u000e\u0001\u0003\u0006\u0004%\t%\u000e\u0005\t\u0007\u0002\u0011\t\u0011)A\u0005m!)A\t\u0001C\u0001\u000b\"9q\t\u0001b\u0001\n\u0003B\u0005BB(\u0001A\u0003%\u0011\nC\u0004Q\u0001\t\u0007I\u0011I)\t\rY\u0003\u0001\u0015!\u0003S\u0011\u0015!\u0005\u0001\"\u0001X\u0011\u001dA\u0006A1A\u0005\u0002eCa\u0001\u001b\u0001!\u0002\u0013Q\u0006\"B5\u0001\t\u0003Q\u0007\"B7\u0001\t\u0003)\u0004b\u00028\u0001\u0005\u0004%\ta\u001c\u0005\u0007g\u0002\u0001\u000b\u0011\u00029\t\u000bQ\u0004A\u0011A;\t\u000bi\u0004A\u0011A>\t\u000fq\u0004!\u0019!C\u0001{\"9\u00111\u0001\u0001!\u0002\u0013q\bbBA\u0003\u0001\u0011\u0005\u0011q\u0001\u0005\b\u0003#\u0001A\u0011AA\n\u0011!\t)\u0002\u0001b\u0001\n\u0003i\bbBA\f\u0001\u0001\u0006IA \u0005\b\u00033\u0001A\u0011AA\u000e\u0011\u001d\ty\u0002\u0001C\u0001\u0003'A\u0001\"!\t\u0001\u0005\u0004%\ta\u001c\u0005\b\u0003G\u0001\u0001\u0015!\u0003q\u0011\u001d\t)\u0003\u0001C\u0001\u0003OAa!a\u000b\u0001\t\u0003Y\bbBA\u0017\u0001\u0011\u0005\u0011q\u0006\u0005\b\u00037\u0002A\u0011AA/\u0011\u001d\t\t\u0007\u0001C!\u0003G\u0012aBU3hKb$vn[3oSj,'O\u0003\u0002#G\u0005Q\u0011M\u001c8pi\u0006$xN]:\u000b\u0005\u0011*\u0013a\u00018ma*\u0011aeJ\u0001\rU>Dgn\u001d8po2\f'm\u001d\u0006\u0002Q\u0005\u00191m\\7\u0004\u0001M\u0019\u0001aK\u0019\u0011\u00071js&D\u0001$\u0013\tq3E\u0001\bB]:|G/\u0019;pe6{G-\u001a7\u0011\u0005A\u0002Q\"A\u0011\u0011\u00071\u0012t&\u0003\u00024G\t\t\u0002*Y:TS6\u0004H.Z!o]>$\u0018\r^3\u0002\u0007ULG-F\u00017!\t9\u0004I\u0004\u00029}A\u0011\u0011\bP\u0007\u0002u)\u00111(K\u0001\u0007yI|w\u000e\u001e \u000b\u0003u\nQa]2bY\u0006L!a\u0010\u001f\u0002\rA\u0013X\rZ3g\u0013\t\t%I\u0001\u0004TiJLgn\u001a\u0006\u0003\u007fq\nA!^5eA\u00051A(\u001b8jiz\"\"a\f$\t\u000bQ\u001a\u0001\u0019\u0001\u001c\u0002'=,H\u000f];u\u0003:tw\u000e^1u_J$\u0016\u0010]3\u0016\u0003%\u0003\"AS&\u000e\u0003\u0001I!\u0001T'\u0003\u001b\u0005sgn\u001c;bi>\u0014H+\u001f9f\u0013\tq5E\u0001\fICN|U\u000f\u001e9vi\u0006sgn\u001c;bi>\u0014H+\u001f9f\u0003QyW\u000f\u001e9vi\u0006sgn\u001c;bi>\u0014H+\u001f9fA\u0005\u0019\u0012N\u001c9vi\u0006sgn\u001c;bi>\u0014H+\u001f9fgV\t!\u000bE\u0002T)&k\u0011\u0001P\u0005\u0003+r\u0012Q!\u0011:sCf\fA#\u001b8qkR\feN\\8uCR|'\u000fV=qKN\u0004C#A\u0018\u0002\u000fA\fG\u000f^3s]V\t!\fE\u0002\\MZj\u0011\u0001\u0018\u0006\u0003;z\u000bQ\u0001]1sC6T!a\u00181\u0002\u00055d'BA1c\u0003\u0015\u0019\b/\u0019:l\u0015\t\u0019G-\u0001\u0004ba\u0006\u001c\u0007.\u001a\u0006\u0002K\u0006\u0019qN]4\n\u0005\u001dd&!\u0002)be\u0006l\u0017\u0001\u00039biR,'O\u001c\u0011\u0002\u0015M,G\u000fU1ui\u0016\u0014h\u000e\u0006\u0002KW\")An\u0003a\u0001m\u0005)a/\u00197vK\u0006Qq-\u001a;QCR$XM\u001d8\u0002\u0017Q|Gj\\<fe\u000e\f7/Z\u000b\u0002aB\u00111,]\u0005\u0003er\u0013ABQ8pY\u0016\fg\u000eU1sC6\fA\u0002^8M_^,'oY1tK\u0002\nab]3u)>dun^3sG\u0006\u001cX\r\u0006\u0002Km\")An\u0004a\u0001oB\u00111\u000b_\u0005\u0003sr\u0012qAQ8pY\u0016\fg.\u0001\bhKR$v\u000eT8xKJ\u001c\u0017m]3\u0016\u0003]\f\u0011\"\\5o\u0019\u0016tw\r\u001e5\u0016\u0003y\u0004\"aW@\n\u0007\u0005\u0005AL\u0001\u0005J]R\u0004\u0016M]1n\u0003)i\u0017N\u001c'f]\u001e$\b\u000eI\u0001\rg\u0016$X*\u001b8MK:<G\u000f\u001b\u000b\u0004\u0015\u0006%\u0001B\u00027\u0014\u0001\u0004\tY\u0001E\u0002T\u0003\u001bI1!a\u0004=\u0005\rIe\u000e^\u0001\rO\u0016$X*\u001b8MK:<G\u000f[\u000b\u0003\u0003\u0017\t\u0011\"\\1y\u0019\u0016tw\r\u001e5\u0002\u00155\f\u0007\u0010T3oORD\u0007%\u0001\u0007tKRl\u0015\r\u001f'f]\u001e$\b\u000eF\u0002K\u0003;Aa\u0001\\\fA\u0002\u0005-\u0011\u0001D4fi6\u000b\u0007\u0010T3oORD\u0017A\u00049pg&$\u0018n\u001c8bY6\u000b7o[\u0001\u0010a>\u001c\u0018\u000e^5p]\u0006dW*Y:lA\u0005\t2/\u001a;Q_NLG/[8oC2l\u0015m]6\u0015\u0007)\u000bI\u0003C\u0003m7\u0001\u0007q/A\thKR\u0004vn]5uS>t\u0017\r\\'bg.\fQ\u0003^1h/&$\b\u000eU8tSRLwN\\1m\u001b\u0006\u001c8\u000e\u0006\u0003\u00022\u0005=\u0003CBA\u001a\u0003{\t\u0019E\u0004\u0003\u00026\u0005ebbA\u001d\u00028%\tQ(C\u0002\u0002<q\nq\u0001]1dW\u0006<W-\u0003\u0003\u0002@\u0005\u0005#aA*fc*\u0019\u00111\b\u001f\u0011\t\u0005\u0015\u00131J\u0007\u0003\u0003\u000fR1!!\u0013\"\u0003\u0019\u0019w.\\7p]&!\u0011QJA$\u0005E!vn[3oSj,GmU3oi\u0016t7-\u001a\u0005\b\u0003#j\u0002\u0019AA*\u0003%\u0019XM\u001c;f]\u000e,7\u000f\u0005\u0004\u00024\u0005u\u0012Q\u000b\t\u0005\u0003\u000b\n9&\u0003\u0003\u0002Z\u0005\u001d#\u0001C*f]R,gnY3\u0002\u0007Q\fw\r\u0006\u0003\u00022\u0005}\u0003bBA)=\u0001\u0007\u00111K\u0001\tC:tw\u000e^1uKR!\u0011QMA7!\u0019\t\u0019$!\u0010\u0002hA\u0019A&!\u001b\n\u0007\u0005-4E\u0001\u0006B]:|G/\u0019;j_:Dq!a\u001c \u0001\u0004\t)'A\u0006b]:|G/\u0019;j_:\u001c\b")
public class RegexTokenizer
extends AnnotatorModel<RegexTokenizer>
implements HasSimpleAnnotate<RegexTokenizer> {
    private final String uid;
    private final String outputAnnotatorType;
    private final String[] inputAnnotatorTypes;
    private final Param<String> pattern;
    private final BooleanParam toLowercase;
    private final IntParam minLength;
    private final IntParam maxLength;
    private final BooleanParam positionalMask;

    @Override
    public UserDefinedFunction dfAnnotate() {
        return HasSimpleAnnotate.dfAnnotate$(this);
    }

    public String uid() {
        return this.uid;
    }

    @Override
    public String outputAnnotatorType() {
        return this.outputAnnotatorType;
    }

    @Override
    public String[] inputAnnotatorTypes() {
        return this.inputAnnotatorTypes;
    }

    public Param<String> pattern() {
        return this.pattern;
    }

    public RegexTokenizer setPattern(String value) {
        return (RegexTokenizer)this.set(this.pattern(), value);
    }

    public String getPattern() {
        return (String)this.$(this.pattern());
    }

    public BooleanParam toLowercase() {
        return this.toLowercase;
    }

    public RegexTokenizer setToLowercase(boolean value) {
        return (RegexTokenizer)this.set((Param)this.toLowercase(), BoxesRunTime.boxToBoolean((boolean)value));
    }

    public boolean getToLowercase() {
        return BoxesRunTime.unboxToBoolean((Object)this.$((Param)this.toLowercase()));
    }

    public IntParam minLength() {
        return this.minLength;
    }

    public RegexTokenizer setMinLength(int value) {
        return (RegexTokenizer)this.set((Param)this.minLength(), BoxesRunTime.boxToInteger((int)value));
    }

    public int getMinLength() {
        return BoxesRunTime.unboxToInt((Object)this.$((Param)this.minLength()));
    }

    public IntParam maxLength() {
        return this.maxLength;
    }

    public RegexTokenizer setMaxLength(int value) {
        return (RegexTokenizer)this.set((Param)this.maxLength(), BoxesRunTime.boxToInteger((int)value));
    }

    public int getMaxLength() {
        return BoxesRunTime.unboxToInt((Object)this.$((Param)this.maxLength()));
    }

    public BooleanParam positionalMask() {
        return this.positionalMask;
    }

    public RegexTokenizer setPositionalMask(boolean value) {
        return (RegexTokenizer)this.set((Param)this.positionalMask(), BoxesRunTime.boxToBoolean((boolean)value));
    }

    public boolean getPositionalMask() {
        return BoxesRunTime.unboxToBoolean((Object)this.$((Param)this.positionalMask()));
    }

    public Seq<TokenizedSentence> tagWithPositionalMask(Seq<Sentence> sentences) {
        return (Seq)sentences.map((Function1 & Serializable & scala.Serializable)text -> {
            Regex re = new StringOps(Predef$.MODULE$.augmentString((String)this.$(this.pattern()))).r();
            String _content = BoxesRunTime.unboxToBoolean((Object)this.$((Param)this.toLowercase())) ? text.content().toLowerCase() : text.content();
            int[] _mask = new int[_content.length()];
            IndexedToken[] tokens = (IndexedToken[])new ArrayOps.ofRef(Predef$.MODULE$.refArrayOps((Object[])new ArrayOps.ofRef(Predef$.MODULE$.refArrayOps((Object[])re.split((CharSequence)_content))).map((Function1 & Serializable & scala.Serializable)token -> new IndexedToken((String)token, RegexTokenizer.calculateIndex$1("begin", _mask, _content, token), RegexTokenizer.calculateIndex$1("end", _mask, _content, token)), Array$.MODULE$.canBuildFrom(ClassTag$.MODULE$.apply(IndexedToken.class))))).filter((Function1 & Serializable & scala.Serializable)t -> BoxesRunTime.boxToBoolean((boolean)RegexTokenizer.$anonfun$tagWithPositionalMask$4(this, t)));
            return new TokenizedSentence(tokens, text.index());
        }, Seq$.MODULE$.canBuildFrom());
    }

    public Seq<TokenizedSentence> tag(Seq<Sentence> sentences) {
        return (Seq)sentences.map((Function1 & Serializable & scala.Serializable)text -> {
            IntRef curPos = IntRef.create((int)0);
            Regex re = new StringOps(Predef$.MODULE$.augmentString((String)this.$(this.pattern()))).r();
            String str = BoxesRunTime.unboxToBoolean((Object)this.$((Param)this.toLowercase())) ? text.content().toLowerCase() : text.content();
            IndexedToken[] tokens = (IndexedToken[])new ArrayOps.ofRef(Predef$.MODULE$.refArrayOps((Object[])new ArrayOps.ofRef(Predef$.MODULE$.refArrayOps((Object[])re.split((CharSequence)str))).map((Function1 & Serializable & scala.Serializable)token -> {
                void var3_3;
                IndexedToken indexedTokens = new IndexedToken((String)token, text.start() + curPos$1.elem, text.start() + curPos$1.elem + token.length() - 1);
                curPos$1.elem += token.length() + 1;
                return var3_3;
            }, Array$.MODULE$.canBuildFrom(ClassTag$.MODULE$.apply(IndexedToken.class))))).filter((Function1 & Serializable & scala.Serializable)t -> BoxesRunTime.boxToBoolean((boolean)RegexTokenizer.$anonfun$tag$3(this, t)));
            return new TokenizedSentence(tokens, text.index());
        }, Seq$.MODULE$.canBuildFrom());
    }

    @Override
    public Seq<Annotation> annotate(Seq<Annotation> annotations) {
        Seq<Sentence> sentences = SentenceSplit$.MODULE$.unpack(annotations);
        Seq<TokenizedSentence> tokenized = this.getPositionalMask() ? this.tagWithPositionalMask(sentences) : this.tag(sentences);
        return TokenizedWithSentence$.MODULE$.pack(tokenized);
    }

    private static final int calculateIndex$1(String indexType, int[] mask, String text, String token) {
        int n;
        int tokenBeginIndex = text.substring(new ArrayOps.ofInt(Predef$.MODULE$.intArrayOps(mask)).indexOf((Object)BoxesRunTime.boxToInteger((int)0)), text.length()).indexOf(token) + new ArrayOps.ofInt(Predef$.MODULE$.intArrayOps(mask)).indexOf((Object)BoxesRunTime.boxToInteger((int)0));
        String string = indexType;
        if ("begin".equals(string)) {
            n = tokenBeginIndex;
        } else if ("end".equals(string)) {
            int endIndex = tokenBeginIndex + token.length();
            package$.MODULE$.Range().apply(0, endIndex).foreach$mVc$sp((Function1)(JFunction1.mcVI.sp & Serializable & scala.Serializable)i -> {
                mask$1[i] = 1;
            });
            n = endIndex == 0 ? endIndex : endIndex - 1;
        } else {
            throw new MatchError((Object)string);
        }
        return n;
    }

    public static final /* synthetic */ boolean $anonfun$tagWithPositionalMask$4(RegexTokenizer $this, IndexedToken t) {
        return new StringOps(Predef$.MODULE$.augmentString(t.token())).nonEmpty() && t.token().length() >= BoxesRunTime.unboxToInt((Object)$this.$((Param)$this.minLength())) && $this.get((Param)$this.maxLength()).forall((Function1)(JFunction1.mcZI.sp & Serializable & scala.Serializable)m -> t.token().length() <= m);
    }

    public static final /* synthetic */ boolean $anonfun$tag$3(RegexTokenizer $this, IndexedToken t) {
        return new StringOps(Predef$.MODULE$.augmentString(t.token())).nonEmpty() && t.token().length() >= BoxesRunTime.unboxToInt((Object)$this.$((Param)$this.minLength())) && $this.get((Param)$this.maxLength()).forall((Function1)(JFunction1.mcZI.sp & Serializable & scala.Serializable)m -> t.token().length() <= m);
    }

    public RegexTokenizer(String uid) {
        this.uid = uid;
        HasSimpleAnnotate.$init$(this);
        this.outputAnnotatorType = AnnotatorType$.MODULE$.TOKEN();
        this.inputAnnotatorTypes = (String[])((Object[])new String[]{AnnotatorType$.MODULE$.DOCUMENT()});
        this.pattern = new Param((Identifiable)this, "pattern", "regex pattern used for tokenizing");
        this.toLowercase = new BooleanParam((Identifiable)this, "toLowercase", "Indicates whether to convert all characters to lowercase before tokenizing.\n");
        this.minLength = new IntParam((Identifiable)this, "minLength", "minimum token length (>= 0)", ParamValidators$.MODULE$.gtEq(0.0));
        this.maxLength = new IntParam((Identifiable)this, "maxLength", "maximum token length (>= 1)", ParamValidators$.MODULE$.gtEq(1.0));
        this.positionalMask = new BooleanParam((Identifiable)this, "positionalMask", "Using a positional mask to guarantee the incremental progression of the tokenization.");
        this.setDefault((Seq)Predef$.MODULE$.wrapRefArray((Object[])new ParamPair[]{this.inputCols().$minus$greater((Object)new String[]{AnnotatorType$.MODULE$.DOCUMENT()}), this.outputCol().$minus$greater((Object)"regexToken"), this.toLowercase().$minus$greater((Object)BoxesRunTime.boxToBoolean((boolean)false)), this.minLength().$minus$greater((Object)BoxesRunTime.boxToInteger((int)1)), this.pattern().$minus$greater((Object)"\\s+"), this.positionalMask().$minus$greater((Object)BoxesRunTime.boxToBoolean((boolean)false))}));
    }

    public RegexTokenizer() {
        this(Identifiable$.MODULE$.randomUID("RegexTokenizer"));
    }
}

