/*
 * Decompiled with CFR 0.152.
 */
package com.yahoo.vespa.indexinglanguage.linguistics;

import ai.vespa.sampling.ProbabilisticSampleRate;
import com.yahoo.document.DocumentId;
import com.yahoo.document.annotation.Annotation;
import com.yahoo.document.annotation.AnnotationTypes;
import com.yahoo.document.annotation.Span;
import com.yahoo.document.annotation.SpanList;
import com.yahoo.document.annotation.SpanTree;
import com.yahoo.document.datatypes.FieldValue;
import com.yahoo.document.datatypes.StringFieldValue;
import com.yahoo.language.Linguistics;
import com.yahoo.language.LinguisticsCase;
import com.yahoo.language.process.StemMode;
import com.yahoo.language.process.Token;
import com.yahoo.language.process.Tokenizer;
import com.yahoo.text.Text;
import com.yahoo.vespa.indexinglanguage.expressions.InvalidInputException;
import com.yahoo.vespa.indexinglanguage.linguistics.AnnotatorConfig;
import java.util.HashMap;
import java.util.Map;
import java.util.logging.Filter;
import java.util.logging.LogRecord;
import java.util.logging.Logger;

/**
 * Annotates string field values with a "linguistics" span tree whose annotations are the terms
 * produced by tokenizing the string with the tokenizer of the configured {@link Linguistics}.
 *
 * <p>Warnings logged by this class pass through a sampling filter installed in the static
 * initializer, so not every warning is necessarily emitted.</p>
 */
public class LinguisticsAnnotator {

    private static final Logger log = Logger.getLogger(LinguisticsAnnotator.class.getName());

    /** When the environment variable parses as {@code true}, the binary-data check is skipped entirely. */
    private static final boolean binaryCheckDisabled =
            Boolean.parseBoolean(System.getenv("VESPA_DISABLE_LINGUISTICS_BINARY_CHECK"));

    /** Name of the span tree this class reads and writes. */
    private static final String LINGUISTICS_TREE_NAME = "linguistics";

    /** The Unicode replacement character, U+FFFD, counted when looking for likely binary data. */
    private static final int REPLACEMENT_CHARACTER = 0xFFFD;

    static {
        // Wrap whatever filter is already installed on the logger, so that it is still consulted.
        log.setFilter(new RateLimitingLogFilter(log.getFilter()));
    }

    private final Linguistics factory;
    private final AnnotatorConfig config;

    /**
     * Creates an annotator.
     *
     * @param factory the linguistics implementation supplying the tokenizer
     * @param config  the limits and modes applied while annotating
     */
    public LinguisticsAnnotator(Linguistics factory, AnnotatorConfig config) {
        this.factory = factory;
        this.config = config;
    }

    /**
     * Tokenizes the given string and attaches the resulting terms as a "linguistics" span tree.
     *
     * <p>If the value already has a "linguistics" span tree it is left untouched. If the string has
     * more chars than the configured max tokenize length, only the prefix returned by
     * {@code Text.substringByCodepoints(string, 0, maxTokenizeLength)} is checked and tokenized.</p>
     *
     * @param text                  the value to annotate
     * @param docId                 id of the owning document, used in log messages only; may be {@code null}
     * @param isReindexingOperation whether this is part of reindexing; decides whether likely binary
     *                              data is skipped with a warning or rejected with an exception
     * @return {@code true} if the value has a "linguistics" span tree when this returns (pre-existing
     *         or newly set), {@code false} if the input was skipped as likely binary data or if
     *         tokenization yielded no annotations
     * @throws InvalidInputException    if the input is classified as binary data and this is not a
     *                                  reindexing operation
     * @throws IllegalArgumentException if a token reports a position outside the string
     */
    public boolean annotate(StringFieldValue text, DocumentId docId, boolean isReindexingOperation) {
        if (text.getSpanTree(LINGUISTICS_TREE_NAME) != null) {
            return true; // already annotated
        }
        Tokenizer tokenizer = this.factory.getTokenizer();
        String input = text.getString().length() <= this.config.getMaxTokenizeLength()
                ? text.getString()
                : Text.substringByCodepoints(text.getString(), 0, this.config.getMaxTokenizeLength());
        if (this.checkLikelyBinaryData(input, docId, isReindexingOperation)) {
            return false;
        }
        Iterable<Token> tokens = tokenizer.tokenize(input, this.config.asLinguisticsParameters());
        TermOccurrences termOccurrences = new TermOccurrences(this.config.getMaxTermOccurrences());
        SpanTree tree = new SpanTree(LINGUISTICS_TREE_NAME);
        for (Token token : tokens) {
            // Token positions are validated against the full string, not the (possibly truncated) input.
            addAnnotationSpan(text.getString(), tree.spanList(), token, this.config.getStemMode(),
                              this.config.getLowercase(), termOccurrences, this.config.getMaxTokenLength());
        }
        if (tree.numAnnotations() == 0) {
            return false;
        }
        text.setSpanTree(tree);
        return true;
    }

    /**
     * Annotates the given value with no document id and not as a reindexing operation.
     *
     * @see #annotate(StringFieldValue, DocumentId, boolean)
     */
    boolean annotate(StringFieldValue text) {
        return this.annotate(text, null, false);
    }

    /**
     * Creates a TERM annotation for a term.
     *
     * @param term         the term to index
     * @param originalTerm the text the term was derived from
     * @return an annotation without a value if the two are equal, otherwise an annotation carrying
     *         {@code term} as its value
     */
    public static Annotation termAnnotation(String term, String originalTerm) {
        if (term.equals(originalTerm)) {
            return new Annotation(AnnotationTypes.TERM);
        }
        return new Annotation(AnnotationTypes.TERM, new StringFieldValue(term));
    }

    /**
     * Annotates the given span with the term, unless the term is longer than {@code maxTokenLength}
     * or has reached its occurrence limit.
     */
    private static void addAnnotation(Span here, String term, String orig, TermOccurrences termOccurrences,
                                      int maxTokenLength) {
        if (term.length() > maxTokenLength) {
            return;
        }
        if (termOccurrences.termCountBelowLimit(term)) {
            here.annotate(termAnnotation(term, orig));
        }
    }

    /**
     * Adds the span and annotation(s) for one token to {@code parent}.
     *
     * <p>A non-special token with components is replaced by its components (recursively), and a
     * non-special token which is not indexable is dropped. Special tokens skip both of these checks.
     * With {@link StemMode#ALL} the token string, the (optionally lowercased) original and every
     * stem are added as distinct annotations on one span; otherwise only the token string is added.</p>
     *
     * @throws IllegalArgumentException if the token's offset, or offset plus original length, is
     *                                  beyond the end of {@code input}
     */
    private static void addAnnotationSpan(String input, SpanList parent, Token token, StemMode mode,
                                          boolean lowercase, TermOccurrences termOccurrences, int maxTokenLength) {
        if (!token.isSpecialToken()) {
            if (token.getNumComponents() > 0) {
                for (int i = 0; i < token.getNumComponents(); ++i) {
                    addAnnotationSpan(input, parent, token.getComponent(i), mode, lowercase,
                                      termOccurrences, maxTokenLength);
                }
                return;
            }
            if (!token.isIndexable()) {
                return;
            }
        }
        if (token.getOffset() >= (long) input.length()) {
            throw new IllegalArgumentException(String.valueOf(token) + " has offset " + token.getOffset() + ", which is outside the bounds of the input string '" + input + "'");
        }
        if (token.getOffset() + (long) token.getOrig().length() > (long) input.length()) {
            throw new IllegalArgumentException(String.valueOf(token) + " has offset " + token.getOffset() + ", which makes it overflow the bounds of the input string; " + input);
        }
        if (mode == StemMode.ALL) {
            // The span is created up front and shared by all annotations of this token.
            Span where = parent.span((int) token.getOffset(), token.getOrig().length());
            String indexableOriginal = lowercase ? LinguisticsCase.toLowerCase(token.getOrig()) : token.getOrig();
            String term = token.getTokenString();
            if (term != null) {
                addAnnotation(where, term, token.getOrig(), termOccurrences, maxTokenLength);
                if (!term.equals(indexableOriginal)) {
                    addAnnotation(where, indexableOriginal, token.getOrig(), termOccurrences, maxTokenLength);
                }
            }
            for (int i = 0; i < token.getNumStems(); ++i) {
                String stem = token.getStem(i);
                if (stem.equals(indexableOriginal) || stem.equals(term)) {
                    continue; // already considered above
                }
                addAnnotation(where, stem, token.getOrig(), termOccurrences, maxTokenLength);
            }
        } else {
            String term = token.getTokenString();
            if (term == null || term.trim().isEmpty()) {
                return;
            }
            if (term.length() > maxTokenLength) {
                return;
            }
            // Here the span is only created when the term is actually annotated.
            if (termOccurrences.termCountBelowLimit(term)) {
                parent.span((int) token.getOffset(), token.getOrig().length())
                      .annotate(termAnnotation(term, token.getOrig()));
            }
        }
    }

    /**
     * Decides whether the text should be treated as binary data, based on how many Unicode
     * replacement characters (U+FFFD) it contains.
     *
     * <p>The text is classified as binary only when the count exceeds both the configured absolute
     * limit and the configured ratio of the text length.</p>
     *
     * @return {@code true} if the text is classified as binary and this is a reindexing operation
     *         (a warning is logged), {@code false} if the text is not classified as binary or the
     *         check is disabled
     * @throws InvalidInputException if the text is classified as binary and this is not a
     *                               reindexing operation
     */
    private boolean checkLikelyBinaryData(String text, DocumentId docId, boolean isReindexingOperation) {
        if (binaryCheckDisabled) {
            return false;
        }
        double maxRatio = this.config.getMaxReplacementCharactersRatio();
        int maxCharacters = this.config.getMaxReplacementCharacters();
        // With these settings the check is treated as switched off, so skip scanning the text.
        if (maxRatio >= 1.0 && (maxCharacters < 0 || maxCharacters == Integer.MAX_VALUE)) {
            return false;
        }
        long replacementCharCount = text.chars().filter(c -> c == REPLACEMENT_CHARACTER).count();
        boolean aboveCountLimit = replacementCharCount > (long) maxCharacters;
        boolean aboveRatioLimit = (double) replacementCharCount > (double) text.length() * maxRatio;
        if (!aboveCountLimit || !aboveRatioLimit) {
            return false;
        }
        String reason = "Some text of length %d is classified as binary data as it contains %d Unicode replacement characters. (max-replacement-character-ratio=%d%%, max-replacement-characters=%d)".formatted(text.length(), replacementCharCount, (int) Math.round(maxRatio * 100.0), maxCharacters);
        if (isReindexingOperation) {
            String docIdString = docId != null ? docId.toString() : "<unknown>";
            log.warning("Skipping tokenization of '%s' while reindexing: %s. ".formatted(docIdString, reason));
            return true;
        }
        throw new InvalidInputException(reason);
    }

    /**
     * Log filter which lets a record through only if the sampler accepts it and the previously
     * installed filter, if any, also accepts it.
     */
    private static final class RateLimitingLogFilter implements Filter {

        private final ProbabilisticSampleRate sampleRate = ProbabilisticSampleRate.withSystemDefaults(0.1);
        private final Filter prevFilter;

        RateLimitingLogFilter(Filter prevFilter) {
            this.prevFilter = prevFilter;
        }

        @Override
        public boolean isLoggable(LogRecord lr) {
            // The sampler is consulted first, for every record, before delegating.
            return this.sampleRate.shouldSample() && (this.prevFilter == null || this.prevFilter.isLoggable(lr));
        }
    }

    /** Counts accepted occurrences per lowercased term, up to a fixed maximum per term. */
    private static class TermOccurrences {

        final Map<String, Integer> termOccurrences = new HashMap<>();
        final int maxOccurrences;

        public TermOccurrences(int maxOccurrences) {
            this.maxOccurrences = maxOccurrences;
        }

        /**
         * Registers one more occurrence of the term if its count is still below the maximum.
         *
         * @return {@code true} if the occurrence was counted, {@code false} if the term (compared
         *         in lowercased form) had already reached the maximum
         */
        boolean termCountBelowLimit(String term) {
            String lowerCasedTerm = LinguisticsCase.toLowerCase(term);
            int occurrences = this.termOccurrences.getOrDefault(lowerCasedTerm, 0);
            if (occurrences >= this.maxOccurrences) {
                return false;
            }
            this.termOccurrences.put(lowerCasedTerm, occurrences + 1);
            return true;
        }
    }
}

