package com.johnsnowlabs.nlp.annotators.ws;

import com.johnsnowlabs.nlp.AnnotatorApproach;
import com.johnsnowlabs.nlp.AnnotatorType$;
import com.johnsnowlabs.nlp.annotators.common.TaggedSentence;
import com.johnsnowlabs.nlp.annotators.pos.perceptron.AveragedPerceptron;
import com.johnsnowlabs.nlp.annotators.pos.perceptron.PerceptronTrainingUtils;
import com.johnsnowlabs.nlp.annotators.pos.perceptron.PerceptronUtils;
import com.johnsnowlabs.nlp.annotators.pos.perceptron.TrainingPerceptronLegacy;
import com.johnsnowlabs.nlp.annotators.pos.perceptron.TrainingPerceptronLegacy$;
import org.apache.spark.ml.PipelineModel;
import org.apache.spark.ml.param.BooleanParam;
import org.apache.spark.ml.param.DoubleParam;
import org.apache.spark.ml.param.IntParam;
import org.apache.spark.ml.param.Param;
import org.apache.spark.ml.param.ParamPair;
import org.apache.spark.ml.util.Identifiable$;
import org.apache.spark.ml.util.MLReader;
import org.apache.spark.sql.Dataset;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Array$;
import scala.Option;
import scala.Predef$;
import scala.collection.immutable.Map;
import scala.collection.immutable.Nil$;
import scala.collection.mutable.ArrayOps;
import scala.collection.mutable.Map$;
import scala.reflect.ClassTag$;
import scala.reflect.ScalaSignature;
import scala.runtime.BoxesRunTime;

/* compiled from: WordSegmenterApproach.scala */
@ScalaSignature(bytes = "\u0006\u0001\u0005mg\u0001\u0002\u0014(\u0001IB\u0001b\u0011\u0001\u0003\u0006\u0004%\t\u0005\u0012\u0005\t%\u0002\u0011\t\u0011)A\u0005\u000b\")1\u000b\u0001C\u0001)\")1\u000b\u0001C\u0001/\"9\u0001\f\u0001b\u0001\n\u0003\"\u0005BB-\u0001A\u0003%Q\tC\u0004[\u0001\t\u0007I\u0011A.\t\r)\u0004\u0001\u0015!\u0003]\u0011\u001dY\u0007A1A\u0005\u00021Da\u0001\u001d\u0001!\u0002\u0013i\u0007bB9\u0001\u0005\u0004%\t\u0001\u001c\u0005\u0007e\u0002\u0001\u000b\u0011B7\t\u000fM\u0004!\u0019!C\u0001i\"1\u0001\u0010\u0001Q\u0001\nUDq!\u001f\u0001C\u0002\u0013\u0005!\u0010\u0003\u0004\u007f\u0001\u0001\u0006Ia\u001f\u0005\b\u007f\u0002\u0011\r\u0011\"\u0001{\u0011\u001d\t\t\u0001\u0001Q\u0001\nmD\u0001\"a\u0001\u0001\u0005\u0004%\ta\u0017\u0005\b\u0003\u000b\u0001\u0001\u0015!\u0003]\u0011\u001d\t9\u0001\u0001C\u0001\u0003\u0013Aq!!\u0005\u0001\t\u0003\t\u0019\u0002C\u0004\u0002 \u0001!\t!!\t\t\u000f\u0005\u0015\u0002\u0001\"\u0001\u0002(!9\u0011\u0011\u0007\u0001\u0005\u0002\u0005M\u0002bBA\u001f\u0001\u0011\u0005\u0011q\b\u0005\b\u0003\u0007\u0002A\u0011AA#\u0011\u001d\tI\u0005\u0001C\u0001\u0003\u0017Bq!!\u0014\u0001\t\u0003\ny\u0005C\u0005\u0002\f\u0002\u0011\r\u0011\"\u0011\u0002\u000e\"A\u0011\u0011\u0014\u0001!\u0002\u0013\ty\tC\u0005\u0002\u001c\u0002\u0011\r\u0011\"\u0011\u0002\u001e\"A\u0011Q\u0015\u0001!\u0002\u0013\tyjB\u0004\u0002(\u001eB\t!!+\u0007\r\u0019:\u0003\u0012AAV\u0011\u0019\u00196\u0005\"\u0001\u0002F\"I\u0011qY\u0012\u0002\u0002\u0013%\u0011\u0011\u001a\u0002\u0016/>\u0014HmU3h[\u0016tG/\u001a:BaB\u0014x.Y2i\u0015\tA\u0013&\u0001\u0002xg*\u0011!fK\u0001\u000bC:tw\u000e^1u_J\u001c(B\u0001\u0017.\u0003\rqG\u000e\u001d\u0006\u0003]=\nAB[8i]Ntwn\u001e7bENT\u0011\u0001M\u0001\u0004G>l7\u0001A\n\u0004\u0001MZ\u0004c\u0001\u001b6o5\t1&\u0003\u00027W\t\t\u0012I\u001c8pi\u0006$xN]!qaJ|\u0017m\u00195\u0011\u0005aJT\"A\u0014\n\u0005i:#AE,pe\u0012\u001cVmZ7f]R,'/T8eK2\u0004\"\u0001P!\u000e\u0003uR!AP 
\u0002\u0015A,'oY3qiJ|gN\u0003\u0002AS\u0005\u0019\u0001o\\:\n\u0005\tk$a\u0006)fe\u000e,\u0007\u000f\u001e:p]R\u0013\u0018-\u001b8j]\u001e,F/\u001b7t\u0003\r)\u0018\u000eZ\u000b\u0002\u000bB\u0011ai\u0014\b\u0003\u000f6\u0003\"\u0001S&\u000e\u0003%S!AS\u0019\u0002\rq\u0012xn\u001c;?\u0015\u0005a\u0015!B:dC2\f\u0017B\u0001(L\u0003\u0019\u0001&/\u001a3fM&\u0011\u0001+\u0015\u0002\u0007'R\u0014\u0018N\\4\u000b\u00059[\u0015\u0001B;jI\u0002\na\u0001P5oSRtDCA+W!\tA\u0004\u0001C\u0003D\u0007\u0001\u0007Q\tF\u0001V\u0003-!Wm]2sSB$\u0018n\u001c8\u0002\u0019\u0011,7o\u0019:jaRLwN\u001c\u0011\u0002\rA|7oQ8m+\u0005a\u0006cA/i\u000b6\taL\u0003\u0002`A\u0006)\u0001/\u0019:b[*\u0011\u0011MY\u0001\u0003[2T!a\u00193\u0002\u000bM\u0004\u0018M]6\u000b\u0005\u00154\u0017AB1qC\u000eDWMC\u0001h\u0003\ry'oZ\u0005\u0003Sz\u0013Q\u0001U1sC6\fq\u0001]8t\u0007>d\u0007%A\u0006o\u0013R,'/\u0019;j_:\u001cX#A7\u0011\u0005us\u0017BA8_\u0005!Ie\u000e\u001e)be\u0006l\u0017\u0001\u00048Ji\u0016\u0014\u0018\r^5p]N\u0004\u0013A\u00054sKF,XM\\2z)\"\u0014Xm\u001d5pY\u0012\f1C\u001a:fcV,gnY=UQJ,7\u000f[8mI\u0002\n!#Y7cS\u001e,\u0018\u000e^=UQJ,7\u000f[8mIV\tQ\u000f\u0005\u0002^m&\u0011qO\u0018\u0002\f\t>,(\r\\3QCJ\fW.A\nb[\nLw-^5usRC'/Z:i_2$\u0007%\u0001\u000bf]\u0006\u0014G.\u001a*fO\u0016DHk\\6f]&TXM]\u000b\u0002wB\u0011Q\f`\u0005\u0003{z\u0013ABQ8pY\u0016\fg\u000eU1sC6\fQ#\u001a8bE2,'+Z4fqR{7.\u001a8ju\u0016\u0014\b%A\u0006u_2{w/\u001a:dCN,\u0017\u0001\u0004;p\u0019><XM]2bg\u0016\u0004\u0013a\u00029biR,'O\\\u0001\ta\u0006$H/\u001a:oA\u0005a1/\u001a;Q_N\u001cu\u000e\\;n]R!\u00111BA\u0007\u001b\u0005\u0001\u0001BBA\b+\u0001\u0007Q)A\u0003wC2,X-\u0001\btKRt\u0015\n^3sCRLwN\\:\u0015\t\u0005-\u0011Q\u0003\u0005\b\u0003\u001f1\u0002\u0019AA\f!\u0011\tI\"a\u0007\u000e\u0003-K1!!\bL\u0005\rIe\u000e^\u0001\u0016g\u0016$hI]3rk\u0016t7-\u001f+ie\u0016\u001c\bn\u001c7e)\u0011\tY!a\t\t\u000f\u0005=q\u00031\u0001\u0002\u0018\u0005)2/\u001a;B[\nLw-^5usRC'/Z:i_2$G\u0003BA\u0006\u0003SAq!a\u0004\u0019\u0001\u0004\tY\
u0003\u0005\u0003\u0002\u001a\u00055\u0012bAA\u0018\u0017\n1Ai\\;cY\u0016\fqc]3u\u000b:\f'\r\\3SK\u001e,\u0007\u0010V8lK:L'0\u001a:\u0015\t\u0005-\u0011Q\u0007\u0005\b\u0003\u001fI\u0002\u0019AA\u001c!\u0011\tI\"!\u000f\n\u0007\u0005m2JA\u0004C_>dW-\u00198\u0002\u001dM,G\u000fV8M_^,'oY1tKR!\u00111BA!\u0011\u001d\tyA\u0007a\u0001\u0003o\t!b]3u!\u0006$H/\u001a:o)\u0011\tY!a\u0012\t\r\u0005=1\u00041\u0001F\u000399W\r\u001e(Ji\u0016\u0014\u0018\r^5p]N,\"!a\u0006\u0002\u000bQ\u0014\u0018-\u001b8\u0015\u000b]\n\t&!\u001f\t\u000f\u0005MS\u00041\u0001\u0002V\u00059A-\u0019;bg\u0016$\b\u0007BA,\u0003O\u0002b!!\u0017\u0002`\u0005\rTBAA.\u0015\r\tiFY\u0001\u0004gFd\u0017\u0002BA1\u00037\u0012q\u0001R1uCN,G\u000f\u0005\u0003\u0002f\u0005\u001dD\u0002\u0001\u0003\r\u0003S\n\t&!A\u0001\u0002\u000b\u0005\u00111\u000e\u0002\u0004?\u0012\n\u0014\u0003BA7\u0003g\u0002B!!\u0007\u0002p%\u0019\u0011\u0011O&\u0003\u000f9{G\u000f[5oOB!\u0011\u0011DA;\u0013\r\t9h\u0013\u0002\u0004\u0003:L\b\"CA>;A\u0005\t\u0019AA?\u0003E\u0011XmY;sg&4X\rU5qK2Lg.\u001a\t\u0007\u00033\ty(a!\n\u0007\u0005\u00055J\u0001\u0004PaRLwN\u001c\t\u0005\u0003\u000b\u000b9)D\u0001a\u0013\r\tI\t\u0019\u0002\u000e!&\u0004X\r\\5oK6{G-\u001a7\u0002'=,H\u000f];u\u0003:tw\u000e^1u_J$\u0016\u0010]3\u0016\u0005\u0005=\u0005\u0003BA\u0006\u0003#KA!a%\u0002\u0016\ni\u0011I\u001c8pi\u0006$xN\u001d+za\u0016L1!a&,\u0005YA\u0015m](viB,H/\u00118o_R\fGo\u001c:UsB,\u0017\u0001F8viB,H/\u00118o_R\fGo\u001c:UsB,\u0007%A\nj]B,H/\u00118o_R\fGo\u001c:UsB,7/\u0006\u0002\u0002 
B)\u0011\u0011DAQ\u000b&\u0019\u00111U&\u0003\u000b\u0005\u0013(/Y=\u0002)%t\u0007/\u001e;B]:|G/\u0019;peRK\b/Z:!\u0003U9vN\u001d3TK\u001elWM\u001c;fe\u0006\u0003\bO]8bG\"\u0004\"\u0001O\u0012\u0014\u000f\r\ni+a-\u0002@B!\u0011\u0011DAX\u0013\r\t\tl\u0013\u0002\u0007\u0003:L(+\u001a4\u0011\u000b\u0005U\u00161X+\u000e\u0005\u0005]&bAA]A\u0006!Q\u000f^5m\u0013\u0011\ti,a.\u0003+\u0011+g-Y;miB\u000b'/Y7t%\u0016\fG-\u00192mKB!\u0011\u0011DAa\u0013\r\t\u0019m\u0013\u0002\r'\u0016\u0014\u0018.\u00197ju\u0006\u0014G.\u001a\u000b\u0003\u0003S\u000b1B]3bIJ+7o\u001c7wKR\u0011\u00111\u001a\t\u0005\u0003\u001b\f9.\u0004\u0002\u0002P*!\u0011\u0011[Aj\u0003\u0011a\u0017M\\4\u000b\u0005\u0005U\u0017\u0001\u00026bm\u0006LA!!7\u0002P\n1qJ\u00196fGR\u0004")
/* loaded from: input_file:com/johnsnowlabs/nlp/annotators/ws/WordSegmenterApproach.class */
/**
 * Trainable annotator ("approach") that fits a {@link WordSegmenterModel} with the averaged
 * perceptron training routines mixed in from {@link PerceptronTrainingUtils}.
 *
 * <p>Consumes {@code DOCUMENT} annotations and produces {@code TOKEN} annotations.
 *
 * <p>Parameters and their defaults, as registered in the constructor:
 * <ul>
 *   <li>{@code posCol} - column of Array of POS tags that match tokens (no default is set here)</li>
 *   <li>{@code nIterations} - number of training iterations (default 5)</li>
 *   <li>{@code frequencyThreshold} - how many times at least a tag must appear on a word for the
 *       word to be marked as frequent (default 20)</li>
 *   <li>{@code ambiguityThreshold} - coverage ratio used to mark a word as frequent
 *       (default 0.97)</li>
 *   <li>{@code enableRegexTokenizer} - whether to use RegexTokenizer before segmentation
 *       (default false)</li>
 *   <li>{@code toLowercase} - whether to lowercase before tokenizing; only used when
 *       {@code enableRegexTokenizer} is true (default false)</li>
 *   <li>{@code pattern} - regex used for tokenizing; only used when
 *       {@code enableRegexTokenizer} is true (default {@code "\\s+"})</li>
 * </ul>
 *
 * <p>NOTE(review): this source was recovered from Scala-compiled bytecode. The trait forwarders
 * below assume {@code PerceptronTrainingUtils} / {@code PerceptronUtils} are interfaces with
 * default methods (consistent with the static {@code PerceptronUtils.$init$} call in the
 * constructor) - confirm against the compiled traits.
 */
public class WordSegmenterApproach extends AnnotatorApproach<WordSegmenterModel> implements PerceptronTrainingUtils {
    private final String uid;
    private final String description;
    private final Param<String> posCol;
    private final IntParam nIterations;
    private final IntParam frequencyThreshold;
    private final DoubleParam ambiguityThreshold;
    private final BooleanParam enableRegexTokenizer;
    private final BooleanParam toLowercase;
    private final Param<String> pattern;
    private final String outputAnnotatorType;
    private final String[] inputAnnotatorTypes;

    // Deliberately not final: these three fields are written only through the trait-generated
    // `_setter_` methods below, and Java rejects assignments to final fields outside a constructor.
    private Logger logger;
    private String[] START;
    private String[] END;

    /** Returns the reader supplied by the companion object. */
    public static MLReader<WordSegmenterApproach> read() {
        return WordSegmenterApproach$.MODULE$.read();
    }

    /**
     * Loads an instance from {@code str} through the companion object.
     *
     * @param str location handed to the companion's {@code load}
     * @return whatever the companion's {@code load} returns (typed {@code Object} in this source)
     */
    public static Object load(String str) {
        return WordSegmenterApproach$.MODULE$.load(str);
    }

    /**
     * Forwards to the implementation inherited from {@link PerceptronTrainingUtils}.
     * An unqualified call here would resolve to this very method and never terminate.
     */
    @Override // com.johnsnowlabs.nlp.annotators.pos.perceptron.PerceptronTrainingUtils
    public TaggedSentence[] generatesTagBook(Dataset<?> dataset) {
        return PerceptronTrainingUtils.super.generatesTagBook(dataset);
    }

    /**
     * Forwards to the implementation inherited from {@link PerceptronTrainingUtils}.
     * An unqualified call here would resolve to this very method and never terminate.
     */
    @Override // com.johnsnowlabs.nlp.annotators.pos.perceptron.PerceptronTrainingUtils
    public Map<String, String> buildTagBook(TaggedSentence[] taggedSentenceArr, int i, double d) {
        return PerceptronTrainingUtils.super.buildTagBook(taggedSentenceArr, i, d);
    }

    /**
     * Forwards to the implementation inherited from {@link PerceptronTrainingUtils}.
     * An unqualified call here would resolve to this very method and never terminate.
     */
    @Override // com.johnsnowlabs.nlp.annotators.pos.perceptron.PerceptronTrainingUtils
    public AveragedPerceptron trainPerceptron(int i, TrainingPerceptronLegacy trainingPerceptronLegacy, TaggedSentence[] taggedSentenceArr, Map<String, String> map) {
        return PerceptronTrainingUtils.super.trainPerceptron(i, trainingPerceptronLegacy, taggedSentenceArr, map);
    }

    /**
     * Forwards to the inherited trait implementation (declared on {@code PerceptronUtils},
     * reached here through the direct super-interface {@link PerceptronTrainingUtils}).
     */
    @Override // com.johnsnowlabs.nlp.annotators.pos.perceptron.PerceptronUtils
    public String normalized(String str) {
        return PerceptronTrainingUtils.super.normalized(str);
    }

    /**
     * Forwards to the inherited trait implementation (declared on {@code PerceptronUtils},
     * reached here through the direct super-interface {@link PerceptronTrainingUtils}).
     */
    @Override // com.johnsnowlabs.nlp.annotators.pos.perceptron.PerceptronUtils
    public Map<String, Object> getFeatures(int i, String str, String[] strArr, String str2, String str3) {
        return PerceptronTrainingUtils.super.getFeatures(i, str, strArr, str2, str3);
    }

    /** Returns the logger installed by the trait initialiser. */
    @Override // com.johnsnowlabs.nlp.annotators.pos.perceptron.PerceptronTrainingUtils
    public Logger logger() {
        return this.logger;
    }

    /** Trait-generated setter backing {@link #logger()}; invoked from the constructor. */
    @Override // com.johnsnowlabs.nlp.annotators.pos.perceptron.PerceptronTrainingUtils
    public void com$johnsnowlabs$nlp$annotators$pos$perceptron$PerceptronTrainingUtils$_setter_$logger_$eq(Logger logger) {
        this.logger = logger;
    }

    /** Returns the {@code START} value stored through the trait-generated setter. */
    @Override // com.johnsnowlabs.nlp.annotators.pos.perceptron.PerceptronUtils
    public String[] START() {
        return this.START;
    }

    /** Returns the {@code END} value stored through the trait-generated setter. */
    @Override // com.johnsnowlabs.nlp.annotators.pos.perceptron.PerceptronUtils
    public String[] END() {
        return this.END;
    }

    /** Trait-generated setter backing {@link #START()}. */
    @Override // com.johnsnowlabs.nlp.annotators.pos.perceptron.PerceptronUtils
    public void com$johnsnowlabs$nlp$annotators$pos$perceptron$PerceptronUtils$_setter_$START_$eq(String[] strArr) {
        this.START = strArr;
    }

    /** Trait-generated setter backing {@link #END()}. */
    @Override // com.johnsnowlabs.nlp.annotators.pos.perceptron.PerceptronUtils
    public void com$johnsnowlabs$nlp$annotators$pos$perceptron$PerceptronUtils$_setter_$END_$eq(String[] strArr) {
        this.END = strArr;
    }

    /** Returns the unique identifier this instance was constructed with. */
    public String uid() {
        return this.uid;
    }

    /** Returns the fixed description string, {@code "Word segmentation"}. */
    @Override // com.johnsnowlabs.nlp.AnnotatorApproach
    public String description() {
        return this.description;
    }

    /** Param: column of Array of POS tags that match tokens. */
    public Param<String> posCol() {
        return this.posCol;
    }

    /** Param: number of iterations in training. */
    public IntParam nIterations() {
        return this.nIterations;
    }

    /** Param: how many times at least a tag on a word to be marked as frequent. */
    public IntParam frequencyThreshold() {
        return this.frequencyThreshold;
    }

    /** Param: how much percentage of total amount of words are covered to be marked as frequent. */
    public DoubleParam ambiguityThreshold() {
        return this.ambiguityThreshold;
    }

    /** Param: whether to use RegexTokenizer before segmentation. */
    public BooleanParam enableRegexTokenizer() {
        return this.enableRegexTokenizer;
    }

    /** Param: whether to lowercase before tokenizing (only used with the regex tokenizer). */
    public BooleanParam toLowercase() {
        return this.toLowercase;
    }

    /** Param: regex pattern used for tokenizing (only used with the regex tokenizer). */
    public Param<String> pattern() {
        return this.pattern;
    }

    /**
     * Sets {@code posCol}.
     *
     * @param str name of the POS-tag column
     * @return this instance, for chaining
     */
    public WordSegmenterApproach setPosColumn(String str) {
        return (WordSegmenterApproach) set(posCol(), str);
    }

    /**
     * Sets {@code nIterations}.
     *
     * @param i number of training iterations
     * @return this instance, for chaining
     */
    public WordSegmenterApproach setNIterations(int i) {
        return (WordSegmenterApproach) set(nIterations(), BoxesRunTime.boxToInteger(i));
    }

    /**
     * Sets {@code frequencyThreshold}.
     *
     * @param i minimum tag count for a word to be marked as frequent
     * @return this instance, for chaining
     */
    public WordSegmenterApproach setFrequencyThreshold(int i) {
        return (WordSegmenterApproach) set(frequencyThreshold(), BoxesRunTime.boxToInteger(i));
    }

    /**
     * Sets {@code ambiguityThreshold}.
     *
     * @param d coverage ratio used to mark a word as frequent
     * @return this instance, for chaining
     */
    public WordSegmenterApproach setAmbiguityThreshold(double d) {
        return (WordSegmenterApproach) set(ambiguityThreshold(), BoxesRunTime.boxToDouble(d));
    }

    /**
     * Sets {@code enableRegexTokenizer}.
     *
     * @param z whether to use RegexTokenizer before segmentation
     * @return this instance, for chaining
     */
    public WordSegmenterApproach setEnableRegexTokenizer(boolean z) {
        return (WordSegmenterApproach) set(enableRegexTokenizer(), BoxesRunTime.boxToBoolean(z));
    }

    /**
     * Sets {@code toLowercase}.
     *
     * @param z whether to lowercase all characters before tokenizing
     * @return this instance, for chaining
     */
    public WordSegmenterApproach setToLowercase(boolean z) {
        return (WordSegmenterApproach) set(toLowercase(), BoxesRunTime.boxToBoolean(z));
    }

    /**
     * Sets {@code pattern}.
     *
     * @param str regex pattern used for tokenizing
     * @return this instance, for chaining
     */
    public WordSegmenterApproach setPattern(String str) {
        return (WordSegmenterApproach) set(pattern(), str);
    }

    /** Returns the current value of {@code nIterations}. */
    public int getNIterations() {
        return BoxesRunTime.unboxToInt($(nIterations()));
    }

    /**
     * Trains a perceptron on {@code dataset} and wraps it in a {@link WordSegmenterModel}.
     *
     * <p>Steps: extract tagged sentences, build the tag book from them using
     * {@code frequencyThreshold} and {@code ambiguityThreshold}, collect the distinct tags,
     * run {@code trainPerceptron} for {@code nIterations}, then copy the tokenizer settings
     * ({@code enableRegexTokenizer}, {@code toLowercase}, {@code pattern}) onto the model.
     *
     * @param dataset training data handed to {@code generatesTagBook}
     * @param option  optional enclosing pipeline; not read by this implementation
     * @return the configured model
     */
    @Override // com.johnsnowlabs.nlp.AnnotatorApproach
    public WordSegmenterModel train(Dataset<?> dataset, Option<PipelineModel> option) {
        TaggedSentence[] taggedSentences = generatesTagBook(dataset);
        Map<String, String> tagBook = buildTagBook(
                taggedSentences,
                BoxesRunTime.unboxToInt($(frequencyThreshold())),
                BoxesRunTime.unboxToDouble($(ambiguityThreshold())));

        // Model construction and the iteration count are evaluated before the training inputs,
        // matching the evaluation order of the original single-expression form.
        WordSegmenterModel model = new WordSegmenterModel();
        int iterations = BoxesRunTime.unboxToInt($(nIterations()));

        // Flatten every sentence's tags into one array, then de-duplicate.
        Object[] allTags = (Object[]) new ArrayOps.ofRef(Predef$.MODULE$.refArrayOps(taggedSentences)).flatMap(
                taggedSentence -> new ArrayOps.ofRef($anonfun$train$1(taggedSentence)),
                Array$.MODULE$.canBuildFrom(ClassTag$.MODULE$.apply(String.class)));
        String[] distinctTags = (String[]) new ArrayOps.ofRef(Predef$.MODULE$.refArrayOps(allTags)).distinct();

        TrainingPerceptronLegacy initialModel = new TrainingPerceptronLegacy(
                distinctTags,
                tagBook,
                Map$.MODULE$.apply(Nil$.MODULE$),
                TrainingPerceptronLegacy$.MODULE$.$lessinit$greater$default$4());
        AveragedPerceptron trained = trainPerceptron(iterations, initialModel, taggedSentences, tagBook);

        return model.setModel(trained)
                .setEnableRegexTokenizer(BoxesRunTime.unboxToBoolean($(enableRegexTokenizer())))
                .setToLowercase(BoxesRunTime.unboxToBoolean($(toLowercase())))
                .setPattern((String) $(pattern()));
    }

    /** Returns the output annotator type, {@code AnnotatorType.TOKEN}. */
    @Override // com.johnsnowlabs.nlp.HasOutputAnnotatorType
    public String outputAnnotatorType() {
        return this.outputAnnotatorType;
    }

    /** Returns the required input annotator types: a single {@code AnnotatorType.DOCUMENT}. */
    @Override // com.johnsnowlabs.nlp.HasInputAnnotationCols
    public String[] inputAnnotatorTypes() {
        return this.inputAnnotatorTypes;
    }

    // The synthetic bridge `train(Dataset, Option)` emitted by the decompiler is intentionally not
    // declared: it has the same erasure as `train` above (a duplicate-method error in Java source)
    // and the compiler generates the bridge itself.

    /** Lambda body lifted by the Scala compiler: wraps a sentence's tags for {@code flatMap}. */
    public static final /* synthetic */ Object[] $anonfun$train$1(TaggedSentence taggedSentence) {
        return Predef$.MODULE$.refArrayOps(taggedSentence.tags());
    }

    /**
     * Creates an approach with the given identifier, runs the trait initialisers, declares all
     * params and registers their defaults.
     *
     * @param str unique identifier for this instance
     */
    public WordSegmenterApproach(String str) {
        this.uid = str;
        PerceptronUtils.$init$(this);
        com$johnsnowlabs$nlp$annotators$pos$perceptron$PerceptronTrainingUtils$_setter_$logger_$eq(LoggerFactory.getLogger("PerceptronApproachUtils"));
        this.description = "Word segmentation";
        this.posCol = new Param<>(this, "posCol", "column of Array of POS tags that match tokens");
        this.nIterations = new IntParam(this, "nIterations", "Number of iterations in training, converges to better accuracy");
        this.frequencyThreshold = new IntParam(this, "frequencyThreshold", "How many times at least a tag on a word to be marked as frequent");
        this.ambiguityThreshold = new DoubleParam(this, "ambiguityThreshold", "How much percentage of total amount of words are covered to be marked as frequent");
        this.enableRegexTokenizer = new BooleanParam(this, "enableRegexTokenizer", "Whether to use RegexTokenizer before segmentation. Useful for multilingual text");
        this.toLowercase = new BooleanParam(this, "toLowercase", "Indicates whether to convert all characters to lowercase before tokenizing. Used only when enableRegexTokenizer is true");
        this.pattern = new Param<>(this, "pattern", "regex pattern used for tokenizing. Used only when enableRegexTokenizer is true");
        setDefault(Predef$.MODULE$.wrapRefArray(new ParamPair[]{
                nIterations().$minus$greater(BoxesRunTime.boxToInteger(5)),
                frequencyThreshold().$minus$greater(BoxesRunTime.boxToInteger(20)),
                ambiguityThreshold().$minus$greater(BoxesRunTime.boxToDouble(0.97d)),
                enableRegexTokenizer().$minus$greater(BoxesRunTime.boxToBoolean(false)),
                toLowercase().$minus$greater(BoxesRunTime.boxToBoolean(false)),
                pattern().$minus$greater("\\s+")}));
        this.outputAnnotatorType = AnnotatorType$.MODULE$.TOKEN();
        this.inputAnnotatorTypes = new String[]{AnnotatorType$.MODULE$.DOCUMENT()};
    }

    /** Creates an approach with a random identifier prefixed {@code "WORD_SEGMENTER"}. */
    public WordSegmenterApproach() {
        this(Identifiable$.MODULE$.randomUID("WORD_SEGMENTER"));
    }
}
