package com.yahoo.vespa.indexinglanguage.linguistics;

import com.yahoo.document.annotation.Annotation;
import com.yahoo.document.annotation.AnnotationTypes;
import com.yahoo.document.annotation.Span;
import com.yahoo.document.annotation.SpanList;
import com.yahoo.document.annotation.SpanTree;
import com.yahoo.document.datatypes.StringFieldValue;
import com.yahoo.language.Linguistics;
import com.yahoo.language.LinguisticsCase;
import com.yahoo.language.process.StemMode;
import com.yahoo.language.process.Token;
import com.yahoo.text.Text;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

/* loaded from: input_file:com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.class */
/**
 * Annotates string field values with linguistics information: the input is tokenized
 * with the configured {@link Linguistics} implementation and a span tree named
 * "linguistics" carrying term annotations is attached to the field value.
 */
public class LinguisticsAnnotator {

    /** Name of the span tree this annotator reads and writes. */
    private static final String SPAN_TREE_NAME = "linguistics";

    private final Linguistics factory;
    private final AnnotatorConfig config;

    /**
     * Counts how many times each (lowercased) term has been annotated so far and
     * enforces an upper bound per term.
     */
    public static class TermOccurrences {
        final Map<String, Integer> termOccurrences = new HashMap<>();
        final int maxOccurrences;

        /**
         * @param maxOccurrences the maximum number of times a single term may be accepted
         */
        public TermOccurrences(int maxOccurrences) {
            this.maxOccurrences = maxOccurrences;
        }

        /**
         * Registers one more occurrence of the given term unless its limit is already reached.
         * Terms are compared after lowercasing with {@link LinguisticsCase#toLowerCase}.
         *
         * @param term the term about to be annotated
         * @return {@code true} if the occurrence was counted (the term was below the limit),
         *         {@code false} if the limit was already reached; the count is then left unchanged
         */
        boolean termCountBelowLimit(String term) {
            String key = LinguisticsCase.toLowerCase(term);
            int seen = this.termOccurrences.getOrDefault(key, 0);
            if (seen >= this.maxOccurrences) {
                return false;
            }
            this.termOccurrences.put(key, seen + 1);
            return true;
        }
    }

    /**
     * @param linguistics the linguistics implementation that supplies the tokenizer
     * @param annotatorConfig the settings (language, stem mode, limits) used when annotating
     */
    public LinguisticsAnnotator(Linguistics linguistics, AnnotatorConfig annotatorConfig) {
        this.factory = linguistics;
        this.config = annotatorConfig;
    }

    /**
     * Tokenizes the given field value and attaches a "linguistics" span tree to it.
     * A field value that already has such a span tree is left untouched.
     *
     * @param stringFieldValue the field value to annotate
     * @return {@code true} if the value already had a "linguistics" span tree or one with at
     *         least one annotation was attached; {@code false} if no annotation was produced
     *         (in which case no span tree is set)
     * @throws IllegalArgumentException if the tokenizer returns a token whose offset or extent
     *         lies outside the field's string
     */
    public boolean annotate(StringFieldValue stringFieldValue) {
        if (stringFieldValue.getSpanTree(SPAN_TREE_NAME) != null) {
            return true;
        }

        String text = stringFieldValue.getString();
        int maxTokenizeLength = this.config.getMaxTokenizeLength();
        // Only the leading part of an over-long string is handed to the tokenizer.
        String input = text.length() <= maxTokenizeLength
                ? text
                : Text.substringByCodepoints(text, 0, maxTokenizeLength);

        StemMode stemMode = this.config.getStemMode();
        Iterable<Token> tokens = this.factory.getTokenizer()
                .tokenize(input, this.config.getLanguage(), stemMode, this.config.getRemoveAccents());

        TermOccurrences termOccurrences = new TermOccurrences(this.config.getMaxTermOccurrences());
        int maxTokenLength = this.config.getMaxTokenLength();
        SpanTree spanTree = new SpanTree(SPAN_TREE_NAME);
        for (Token token : tokens) {
            // Token bounds are validated against the full field string, not the truncated input.
            addAnnotationSpan(text, spanTree.spanList(), token, stemMode, termOccurrences, maxTokenLength);
        }

        if (spanTree.numAnnotations() == 0) {
            return false;
        }
        stringFieldValue.setSpanTree(spanTree);
        return true;
    }

    /**
     * Creates a TERM annotation for the given term.
     *
     * @param str the term to index
     * @param str2 the original text the term was derived from
     * @return an annotation without a value when the term equals the original text,
     *         otherwise an annotation carrying the term as its value
     */
    public static Annotation termAnnotation(String str, String str2) {
        return str.equals(str2)
                ? new Annotation(AnnotationTypes.TERM)
                : new Annotation(AnnotationTypes.TERM, new StringFieldValue(str));
    }

    /**
     * Annotates the span with the term if the term is no longer than {@code maxTokenLength}
     * and its occurrence limit has not been reached.
     */
    private static void addAnnotation(Span span, String term, String orig,
                                      TermOccurrences termOccurrences, int maxTokenLength) {
        if (term.length() <= maxTokenLength && termOccurrences.termCountBelowLimit(term)) {
            span.annotate(termAnnotation(term, orig));
        }
    }

    /**
     * Adds spans and term annotations for one token. A non-special token with components is
     * replaced by its components (recursively); a non-special token that is not indexable is
     * skipped.
     *
     * @param input the string the token offsets refer to
     * @param parent the span list that receives the new span
     * @param token the token to annotate
     * @param stemMode with {@link StemMode#ALL} the token string, the lowercased original and
     *        all distinct stems are annotated; otherwise only the token string
     * @param termOccurrences per-term occurrence counter shared across the whole field
     * @param maxTokenLength terms longer than this are not annotated
     * @throws IllegalArgumentException if the token starts at or beyond the end of
     *         {@code input}, or extends past its end
     */
    private static void addAnnotationSpan(String input, SpanList parent, Token token, StemMode stemMode,
                                          TermOccurrences termOccurrences, int maxTokenLength) {
        if (!token.isSpecialToken()) {
            int numComponents = token.getNumComponents();
            if (numComponents > 0) {
                for (int c = 0; c < numComponents; c++) {
                    addAnnotationSpan(input, parent, token.getComponent(c), stemMode,
                                      termOccurrences, maxTokenLength);
                }
                return;
            }
            if (!token.isIndexable()) {
                return;
            }
        }

        if (token.getOffset() >= input.length()) {
            throw new IllegalArgumentException(token + " has offset " + token.getOffset()
                    + ", which is outside the bounds of the input string '" + input + "'");
        }
        if (token.getOffset() + token.getOrig().length() > input.length()) {
            throw new IllegalArgumentException(token + " has offset " + token.getOffset()
                    + ", which makes it overflow the bounds of the input string; " + input);
        }

        String orig = token.getOrig();
        String term = token.getTokenString();

        if (stemMode != StemMode.ALL) {
            if (term == null || term.trim().isEmpty() || term.length() > maxTokenLength) {
                return;
            }
            // The span is only created once the term is known to be accepted.
            if (termOccurrences.termCountBelowLimit(term)) {
                parent.span((int) token.getOffset(), orig.length()).annotate(termAnnotation(term, orig));
            }
            return;
        }

        // StemMode.ALL: the span is always created, then annotated with every distinct form.
        Span span = parent.span((int) token.getOffset(), orig.length());
        String lowercasedOrig = LinguisticsCase.toLowerCase(orig);
        if (term != null) {
            addAnnotation(span, term, orig, termOccurrences, maxTokenLength);
            if (!term.equals(lowercasedOrig)) {
                addAnnotation(span, lowercasedOrig, orig, termOccurrences, maxTokenLength);
            }
        }
        for (int s = 0; s < token.getNumStems(); s++) {
            String stem = token.getStem(s);
            // Skip stems that duplicate a form already handled above.
            if (!stem.equals(lowercasedOrig) && !stem.equals(term)) {
                addAnnotation(span, stem, orig, termOccurrences, maxTokenLength);
            }
        }
    }
}
