/*
 * Decompiled with CFR 0.152.
 */
package com.johnsnowlabs.nlp.training;

import com.johnsnowlabs.ml.crf.CrfDataset;
import com.johnsnowlabs.ml.crf.DatasetMetadata;
import com.johnsnowlabs.ml.crf.Instance;
import com.johnsnowlabs.ml.crf.InstanceLabels;
import com.johnsnowlabs.ml.crf.TextSentenceLabels;
import com.johnsnowlabs.nlp.annotators.common.TaggedSentence;
import com.johnsnowlabs.nlp.annotators.common.TokenPieceEmbeddings;
import com.johnsnowlabs.nlp.annotators.common.TokenPieceEmbeddings$;
import com.johnsnowlabs.nlp.annotators.common.WordpieceEmbeddingsSentence;
import com.johnsnowlabs.nlp.annotators.ner.crf.DictionaryFeatures$;
import com.johnsnowlabs.nlp.annotators.ner.crf.FeatureGenerator;
import com.johnsnowlabs.nlp.embeddings.WordEmbeddingsBinaryIndexer$;
import com.johnsnowlabs.nlp.embeddings.WordEmbeddingsReader;
import com.johnsnowlabs.nlp.embeddings.WordEmbeddingsTextIndexer$;
import com.johnsnowlabs.nlp.embeddings.WordEmbeddingsWriter;
import com.johnsnowlabs.nlp.training.CoNLL;
import com.johnsnowlabs.nlp.training.CoNLL$;
import com.johnsnowlabs.nlp.training.CoNLLDocument;
import com.johnsnowlabs.nlp.util.io.ExternalResource;
import com.johnsnowlabs.nlp.util.io.ReadAs$;
import com.johnsnowlabs.storage.RocksDBConnection;
import com.johnsnowlabs.storage.RocksDBConnection$;
import java.io.File;
import java.io.Serializable;
import scala.Array$;
import scala.Enumeration;
import scala.Function0;
import scala.Function1;
import scala.MatchError;
import scala.None$;
import scala.Option;
import scala.Predef$;
import scala.Tuple2;
import scala.Tuple3;
import scala.collection.GenIterable;
import scala.collection.Seq;
import scala.collection.Seq$;
import scala.collection.TraversableLike;
import scala.collection.TraversableOnce;
import scala.collection.mutable.ArrayOps;
import scala.reflect.ClassTag$;
import scala.reflect.ScalaSignature;
import scala.runtime.BoxedUnit;
import scala.runtime.BoxesRunTime;
import scala.runtime.java8.JFunction0;

@ScalaSignature(bytes="\u0006\u0001\u0005=e\u0001\u0002\n\u0014\u0001qA\u0001b\t\u0001\u0003\u0002\u0003\u0006I\u0001\n\u0005\t_\u0001\u0011\t\u0011)A\u0005a!A1\u0007\u0001B\u0001B\u0003%A\u0007\u0003\u00058\u0001\t\u0005\t\u0015!\u00039\u0011!)\u0005A!A!\u0002\u00131\u0005\"\u0002'\u0001\t\u0003i\u0005bB+\u0001\u0005\u0004%IA\u0016\u0005\u00075\u0002\u0001\u000b\u0011B,\t\u0013m\u0003\u0001\u0019!a\u0001\n\u0013a\u0006\"C2\u0001\u0001\u0004\u0005\r\u0011\"\u0003e\u0011%Q\u0007\u00011A\u0001B\u0003&Q\fC\u0004l\u0001\t\u0007I\u0011\u00027\t\r]\u0004\u0001\u0015!\u0003n\u0011\u0015A\b\u0001\"\u0003z\u0011\u001d\tY\u0004\u0001C\u0005\u0003{Aq!a\u0018\u0001\t\u0003\t\t\u0007C\u0005\u0002x\u0001\t\n\u0011\"\u0001\u0002z\t\u00112i\u001c(M\u0019J\u0002\u0004g\r(feJ+\u0017\rZ3s\u0015\t!R#\u0001\u0005ue\u0006Lg.\u001b8h\u0015\t1r#A\u0002oYBT!\u0001G\r\u0002\u0019)|\u0007N\\:o_^d\u0017MY:\u000b\u0003i\t1aY8n\u0007\u0001\u0019\"\u0001A\u000f\u0011\u0005y\tS\"A\u0010\u000b\u0003\u0001\nQa]2bY\u0006L!AI\u0010\u0003\r\u0005s\u0017PU3g\u0003I9xN\u001d3F[\n,G\rZ5oON4\u0015\u000e\\3\u0011\u0005\u0015bcB\u0001\u0014+!\t9s$D\u0001)\u0015\tI3$\u0001\u0004=e>|GOP\u0005\u0003W}\ta\u0001\u0015:fI\u00164\u0017BA\u0017/\u0005\u0019\u0019FO]5oO*\u00111fH\u0001\u0014o>\u0014H-R7cK\u0012$\u0017N\\4t\u001d\u0012KWn\u001d\t\u0003=EJ!AM\u0010\u0003\u0007%sG/A\u0005o_Jl\u0017\r\\5{KB\u0011a$N\u0005\u0003m}\u0011qAQ8pY\u0016\fg.\u0001\tf[\n,G\rZ5oON4uN]7biB\u0011\u0011(\u0011\b\u0003u}j\u0011a\u000f\u0006\u0003yu\n!![8\u000b\u0005y*\u0012\u0001B;uS2L!\u0001Q\u001e\u0002\rI+\u0017\rZ!t\u0013\t\u00115IA\u0003WC2,X-\u0003\u0002E?\tYQI\\;nKJ\fG/[8o\u0003i\u0001xn]:jE2,W\t\u001f;fe:\fG\u000eR5di&|g.\u0019:z!\rqr)S\u0005\u0003\u0011~\u0011aa\u00149uS>t\u0007C\u0001\u001eK\u0013\tY5H\u0001\tFqR,'O\\1m%\u0016\u001cx.\u001e:dK\u00061A(\u001b8jiz\"bA\u0014)R%N#\u0006CA(\u0001\u001b\u0005\u0019\u0002\"B\u0012\u0007\u0001\u0004!\u0003\"B\u0018\u0007\u0001\u0004\u0001\u0004\"B\u001a\u0007\u0001\u0004!\u0004\"B\
u001c\u0007\u0001\u0004A\u0004\"B#\u0007\u0001\u00041\u0015!\u00038feJ+\u0017\rZ3s+\u00059\u0006CA(Y\u0013\tI6CA\u0003D_:cE*\u0001\u0006oKJ\u0014V-\u00193fe\u0002\nab^8sI\u0016k'-\u001a3eS:<7/F\u0001^!\tq\u0016-D\u0001`\u0015\t\u0001W#\u0001\u0006f[\n,G\rZ5oONL!AY0\u0003)]{'\u000fZ#nE\u0016$G-\u001b8hgJ+\u0017\rZ3s\u0003I9xN\u001d3F[\n,G\rZ5oON|F%Z9\u0015\u0005\u0015D\u0007C\u0001\u0010g\u0013\t9wD\u0001\u0003V]&$\bbB5\u000b\u0003\u0003\u0005\r!X\u0001\u0004q\u0012\n\u0014aD<pe\u0012,UNY3eI&twm\u001d\u0011\u0002\u0005\u0019<W#A7\u0011\u00059,X\"A8\u000b\u0005A\f\u0018aA2sM*\u0011!o]\u0001\u0004]\u0016\u0014(B\u0001;\u0016\u0003)\tgN\\8uCR|'o]\u0005\u0003m>\u0014\u0001CR3biV\u0014XmR3oKJ\fGo\u001c:\u0002\u0007\u0019<\u0007%A\tsKN|GN^3F[\n,G\rZ5oON$2A_A\n!\u0015Y\u0018\u0011AA\u0004\u001d\tahP\u0004\u0002({&\t\u0001%\u0003\u0002\u0000?\u00059\u0001/Y2lC\u001e,\u0017\u0002BA\u0002\u0003\u000b\u00111aU3r\u0015\tyx\u0004\u0005\u0003\u0002\n\u0005=QBAA\u0006\u0015\r\tia]\u0001\u0007G>lWn\u001c8\n\t\u0005E\u00111\u0002\u0002\u001c/>\u0014H\r]5fG\u0016,UNY3eI&twm]*f]R,gnY3\t\u000f\u0005Ua\u00021\u0001\u0002\u0018\u0005I1/\u001a8uK:\u001cWm\u001d\t\u0006w\u0006\u0005\u0011\u0011\u0004\t\u0005\u00037\t)D\u0004\u0003\u0002\u001e\u0005Eb\u0002BA\u0010\u0003_qA!!\t\u0002.9!\u00111EA\u0016\u001d\u0011\t)#!\u000b\u000f\u0007\u001d\n9#C\u0001\u001b\u0013\tA\u0012$\u0003\u0002\u0017/%\u0011A/F\u0005\u0004\u0003\u001b\u0019\u0018\u0002BA\u001a\u0003\u0017\t\u0011\"\u00118o_R\fG/\u001a3\n\t\u0005]\u0012\u0011\b\u0002\u0012!>\u001cH+Y4hK\u0012\u001cVM\u001c;f]\u000e,'\u0002BA\u001a\u0003\u0017\t1B]3bI\u0012\u000bG/Y:fiR!\u0011qHA.!\u0015Y\u0018\u0011AA!!%q\u00121IA$\u0003+\n9!C\u0002\u0002F}\u0011a\u0001V;qY\u0016\u001c\u0004\u0003BA%\u0003#j!!a\u0013\u000b\u0007A\fiEC\u0002\u0002P]\t!!\u001c7\n\t\u0005M\u00131\n\u0002\u0013)\u0016DHoU3oi\u0016t7-\u001a'bE\u0016d7\u000f\u0005\u0003\u0002\n\u0005]\u0013\u0002BA-\u0003\u0017\u0011a\u0002V1hO\u0016$7+\u001a8uK:\u001cW\r\u0003\u0004\u0002^
=\u0001\r!S\u0001\u0003KJ\faB]3bI:+'\u000fR1uCN,G\u000f\u0006\u0004\u0002d\u0005%\u00141\u000e\t\u0005\u0003\u0013\n)'\u0003\u0003\u0002h\u0005-#AC\"sM\u0012\u000bG/Y:fi\"1\u0011Q\f\tA\u0002%C\u0011\"!\u001c\u0011!\u0003\u0005\r!a\u001c\u0002\u00115,G/\u00193bi\u0006\u0004BAH$\u0002rA!\u0011\u0011JA:\u0013\u0011\t)(a\u0013\u0003\u001f\u0011\u000bG/Y:fi6+G/\u00193bi\u0006\f\u0001D]3bI:+'\u000fR1uCN,G\u000f\n3fM\u0006,H\u000e\u001e\u00133+\t\tYH\u000b\u0003\u0002p\u0005u4FAA@!\u0011\t\t)a#\u000e\u0005\u0005\r%\u0002BAC\u0003\u000f\u000b\u0011\"\u001e8dQ\u0016\u001c7.\u001a3\u000b\u0007\u0005%u$\u0001\u0006b]:|G/\u0019;j_:LA!!$\u0002\u0004\n\tRO\\2iK\u000e\\W\r\u001a,be&\fgnY3")
public class CoNLL2003NerReader {
    // Embedding dimension passed to the constructor; used as the length of the
    // zero-filled vector built per token in resolveEmbeddings.
    private final int wordEmbeddingsNDims;
    // CoNLL reader created in the constructor and used by readDataset to load documents.
    private final CoNLL nerReader;
    // Embeddings lookup. Only assigned inside the constructor when an embeddings file
    // was given and its ".db" file exists; otherwise it keeps its default (null) value.
    private WordEmbeddingsReader wordEmbeddings;
    // Feature generator built in the constructor from the optional external dictionary.
    private final FeatureGenerator fg;

    /**
     * Accessor for the CoNLL reader created in the constructor.
     *
     * @return the reader held in the {@code nerReader} field
     */
    private CoNLL nerReader() {
        return nerReader;
    }

    /**
     * Accessor for the embeddings reader.
     *
     * @return the current value of the {@code wordEmbeddings} field; this is {@code null}
     *         when the constructor did not assign a reader
     */
    private WordEmbeddingsReader wordEmbeddings() {
        return wordEmbeddings;
    }

    /**
     * Setter for the embeddings reader (Scala-style {@code var} setter name).
     *
     * @param x$1 the reader to store in the {@code wordEmbeddings} field
     */
    private void wordEmbeddings_$eq(WordEmbeddingsReader x$1) {
        wordEmbeddings = x$1;
    }

    /**
     * Accessor for the feature generator created in the constructor.
     *
     * @return the generator held in the {@code fg} field
     */
    private FeatureGenerator fg() {
        return fg;
    }

    /**
     * Builds one {@link WordpieceEmbeddingsSentence} per input sentence.
     *
     * <p>Each sentence is paired with its position in {@code sentences}; every indexed
     * tagged word of the sentence is looked up in the embeddings reader and wrapped in a
     * {@link TokenPieceEmbeddings}. The sentence position is passed as the second
     * constructor argument of the resulting {@link WordpieceEmbeddingsSentence}.
     *
     * <p>The lookup dereferences {@code wordEmbeddings()}, so this method fails with a
     * {@link NullPointerException} if the constructor never assigned a reader.
     *
     * @param sentences the tagged sentences to resolve
     * @return one embeddings sentence per input sentence, in input order
     */
    private Seq<WordpieceEmbeddingsSentence> resolveEmbeddings(Seq<TaggedSentence> sentences) {
        return (Seq)((TraversableLike)sentences.zipWithIndex(Seq$.MODULE$.canBuildFrom())).map((Function1 & Serializable & scala.Serializable)x0$1 -> {
            Tuple2 tuple2 = x0$1;
            if (tuple2 == null) {
                // Mirrors the Scala pattern match on (sentence, index).
                throw new MatchError((Object)tuple2);
            }
            TaggedSentence s = (TaggedSentence)tuple2._1();
            int idx = tuple2._2$mcI$sp();
            TokenPieceEmbeddings[] tokens = (TokenPieceEmbeddings[])new ArrayOps.ofRef(Predef$.MODULE$.refArrayOps((Object[])s.indexedTaggedWords())).map((Function1 & Serializable & scala.Serializable)token -> {
                Option<float[]> vectorOption = this.wordEmbeddings().lookup(token.word());
                // The zero-filled array is sized from this instance's wordEmbeddingsNDims.
                // `this` is the only scope that declares that field; no `$this` exists here.
                // NOTE(review): the zero array is presumably the fallback vector when the
                // lookup returns no value - confirm against TokenPieceEmbeddings.apply.
                return TokenPieceEmbeddings$.MODULE$.apply(token.word(), token.word(), -1, true, vectorOption, (float[])Array$.MODULE$.fill(this.wordEmbeddingsNDims, (Function0)(JFunction0.mcF.sp & Serializable & scala.Serializable)() -> 0.0f, ClassTag$.MODULE$.Float()), token.begin(), token.end());
            }, Array$.MODULE$.canBuildFrom(ClassTag$.MODULE$.apply(TokenPieceEmbeddings.class)));
            return new WordpieceEmbeddingsSentence(tokens, idx);
        }, Seq$.MODULE$.canBuildFrom());
    }

    /**
     * Reads the CoNLL documents behind {@code er} and lines up, per sentence, the NER
     * labels, the POS-tagged sentence and the sentence with resolved embeddings.
     *
     * <p>Labels come from each document's {@code nerTagged()} sentences, POS sentences
     * from {@code posTagged()}; the two sequences are combined positionally with
     * {@code zip}.
     *
     * @param er the external resource handed to the CoNLL reader
     * @return a sequence of (labels, POS-tagged sentence, embeddings sentence) triples
     */
    private Seq<Tuple3<TextSentenceLabels, TaggedSentence, WordpieceEmbeddingsSentence>> readDataset(ExternalResource er) {
        Seq<CoNLLDocument> documents = this.nerReader().readDocs(er);

        // NER side: flatten the per-document sentences, then wrap each tag array.
        Seq nerSentences = (Seq)documents.flatMap((Function1 & Serializable & scala.Serializable)doc -> doc.nerTagged(), Seq$.MODULE$.canBuildFrom());
        Seq sentenceLabels = (Seq)((TraversableLike)nerSentences).map((Function1 & Serializable & scala.Serializable)nerSentence -> new TextSentenceLabels((Seq<String>)Predef$.MODULE$.wrapRefArray((Object[])nerSentence.tags())), Seq$.MODULE$.canBuildFrom());

        // POS side: flatten the sentences and attach embeddings.
        Seq posSentences = (Seq)documents.flatMap((Function1 & Serializable & scala.Serializable)doc -> doc.posTagged(), Seq$.MODULE$.canBuildFrom());
        Seq<WordpieceEmbeddingsSentence> embedded = this.resolveEmbeddings((Seq<TaggedSentence>)posSentences);

        // Positional pairing: labels zip (pos zip embeddings).
        GenIterable posWithEmbeddings = (GenIterable)posSentences.zip(embedded, Seq$.MODULE$.canBuildFrom());
        Object zipped = sentenceLabels.zip(posWithEmbeddings, Seq$.MODULE$.canBuildFrom());

        return (Seq)((TraversableLike)zipped).map((Function1 & Serializable & scala.Serializable)pair -> {
            Tuple2 outer = pair;
            if (outer == null) {
                throw new MatchError((Object)outer);
            }
            TextSentenceLabels sentenceLabel = (TextSentenceLabels)outer._1();
            Tuple2 inner = (Tuple2)outer._2();
            if (inner == null) {
                // The failed match is reported on the outer pair.
                throw new MatchError((Object)outer);
            }
            TaggedSentence posSentence = (TaggedSentence)inner._1();
            WordpieceEmbeddingsSentence embeddingSentence = (WordpieceEmbeddingsSentence)inner._2();
            return new Tuple3((Object)sentenceLabel, (Object)posSentence, (Object)embeddingSentence);
        }, Seq$.MODULE$.canBuildFrom());
    }

    /**
     * Reads a NER dataset from {@code er} and converts it to a {@link CrfDataset}.
     *
     * <p>When {@code metadata} is empty, the feature generator builds the dataset from
     * the read lines. Otherwise each line is turned into an {@link Instance} using the
     * supplied metadata, and its labels are mapped to ids through the metadata's
     * {@code label2Id}; labels missing from that map become {@code -1}.
     *
     * @param er       the external resource to read
     * @param metadata optional existing dataset metadata
     * @return the resulting CRF dataset
     */
    public CrfDataset readNerDataset(ExternalResource er, Option<DatasetMetadata> metadata) {
        Seq<Tuple3<TextSentenceLabels, TaggedSentence, WordpieceEmbeddingsSentence>> rows = this.readDataset(er);

        if (metadata.isEmpty()) {
            // No metadata supplied: delegate dataset construction to the feature generator.
            return this.fg().generateDataset((TraversableOnce<Tuple3<TextSentenceLabels, TaggedSentence, WordpieceEmbeddingsSentence>>)rows);
        }

        Seq labeledInstances = (Seq)rows.map((Function1 & Serializable & scala.Serializable)row -> {
            Instance features = this.fg().generate((TaggedSentence)row._2(), (WordpieceEmbeddingsSentence)row._3(), (DatasetMetadata)metadata.get());
            Seq textLabels = ((TextSentenceLabels)row._1()).labels();
            Seq labelIds = (Seq)textLabels.map((Function1 & Serializable & scala.Serializable)l -> BoxesRunTime.boxToInteger((int)CoNLL2003NerReader.$anonfun$readNerDataset$2(metadata, l)), Seq$.MODULE$.canBuildFrom());
            InstanceLabels instanceLabels = new InstanceLabels((Seq<Object>)labelIds);
            return new Tuple2((Object)instanceLabels, (Object)features);
        }, Seq$.MODULE$.canBuildFrom());

        return new CrfDataset((Seq<Tuple2<InstanceLabels, Instance>>)labeledInstances, (DatasetMetadata)metadata.get());
    }

    /**
     * Compiler-generated default for the second parameter ({@code metadata}) of
     * {@link #readNerDataset(ExternalResource, Option)}.
     *
     * @return {@code None}, i.e. no metadata
     */
    public Option<DatasetMetadata> readNerDataset$default$2() {
        return None$.MODULE$;
    }

    /**
     * Lambda body lifted out of {@code readNerDataset}: resolves one text label to its id.
     *
     * @param metadata$1 the metadata option; {@code get()} is called on it unconditionally
     * @param l          the label text to resolve
     * @return the id stored in the metadata's {@code label2Id} for {@code l},
     *         or {@code -1} when the label is not present
     */
    public static final /* synthetic */ int $anonfun$readNerDataset$2(Option metadata$1, String l) {
        DatasetMetadata resolvedMetadata = (DatasetMetadata)metadata$1.get();
        Object labelId = resolvedMetadata.label2Id().getOrElse((Object)l, (Function0)(JFunction0.mcI.sp & Serializable & scala.Serializable)() -> -1);
        return BoxesRunTime.unboxToInt((Object)labelId);
    }

    /**
     * Creates a reader for CoNLL NER data with optional word embeddings.
     *
     * <p>When {@code wordEmbeddingsFile} is non-null, the file must exist. A RocksDB
     * connection is obtained for {@code wordEmbeddingsFile + ".db"}; if that path does
     * not exist afterwards, the embeddings file is indexed into it using the text or
     * binary indexer according to {@code embeddingsFormat}. The embeddings reader is
     * assigned only if the {@code .db} path exists at that point; otherwise the
     * {@code wordEmbeddings} field is left unassigned.
     *
     * @param wordEmbeddingsFile         path of the embeddings file, or {@code null} for none
     * @param wordEmbeddingsNDims        embedding dimension
     * @param normalize                  passed through to {@link WordEmbeddingsReader}
     * @param embeddingsFormat           {@code ReadAs.TEXT} or {@code ReadAs.BINARY}
     * @param possibleExternalDictionary optional dictionary resource for the feature generator
     * @throws IllegalArgumentException if {@code wordEmbeddingsFile} is given but does not
     *                                  exist (raised by Scala's {@code require})
     * @throws MatchError               if indexing is needed and {@code embeddingsFormat}
     *                                  is neither TEXT nor BINARY
     */
    public CoNLL2003NerReader(String wordEmbeddingsFile, int wordEmbeddingsNDims, boolean normalize, Enumeration.Value embeddingsFormat, Option<ExternalResource> possibleExternalDictionary) {
        this.wordEmbeddingsNDims = wordEmbeddingsNDims;
        this.nerReader = new CoNLL("document", "sentence", "token", "pos", CoNLL$.MODULE$.apply$default$5(), CoNLL$.MODULE$.apply$default$6(), CoNLL$.MODULE$.apply$default$7(), CoNLL$.MODULE$.apply$default$8(), CoNLL$.MODULE$.apply$default$9(), CoNLL$.MODULE$.apply$default$10());

        if (wordEmbeddingsFile != null) {
            Predef$.MODULE$.require(new File(wordEmbeddingsFile).exists());

            String dbPath = wordEmbeddingsFile + ".db";
            RocksDBConnection connection = RocksDBConnection$.MODULE$.getOrCreate(dbPath);
            File dbFile = new File(dbPath);

            if (!dbFile.exists()) {
                // Index the raw embeddings into the database according to the declared format.
                // NOTE(review): the meaning of the `false, ..., 5000, 5000` writer arguments is
                // not visible here - see WordEmbeddingsWriter.
                Enumeration.Value textFormat = ReadAs$.MODULE$.TEXT();
                boolean isText = textFormat == null ? embeddingsFormat == null : textFormat.equals(embeddingsFormat);
                if (isText) {
                    WordEmbeddingsTextIndexer$.MODULE$.index(wordEmbeddingsFile, new WordEmbeddingsWriter(connection, false, wordEmbeddingsNDims, 5000, 5000));
                } else {
                    Enumeration.Value binaryFormat = ReadAs$.MODULE$.BINARY();
                    boolean isBinary = binaryFormat == null ? embeddingsFormat == null : binaryFormat.equals(embeddingsFormat);
                    if (!isBinary) {
                        throw new MatchError((Object)embeddingsFormat);
                    }
                    WordEmbeddingsBinaryIndexer$.MODULE$.index(wordEmbeddingsFile, new WordEmbeddingsWriter(connection, false, wordEmbeddingsNDims, 5000, 5000));
                }
            }

            // Re-check on disk: the reader is only wired up when the database path exists.
            if (dbFile.exists()) {
                this.wordEmbeddings_$eq(new WordEmbeddingsReader(connection, normalize, wordEmbeddingsNDims, 1000));
            }
        }

        this.fg = new FeatureGenerator(DictionaryFeatures$.MODULE$.read(possibleExternalDictionary));
    }
}

