/*
 * Decompiled with CFR 0.152.
 */
package com.yahoo.language.opennlp;

import com.huaban.analysis.jieba.JiebaSegmenter;
import com.huaban.analysis.jieba.SegToken;
import com.yahoo.language.Language;
import com.yahoo.language.LinguisticsCase;
import com.yahoo.language.process.Normalizer;
import com.yahoo.language.process.SpecialTokenRegistry;
import com.yahoo.language.process.StemMode;
import com.yahoo.language.process.Token;
import com.yahoo.language.process.TokenType;
import com.yahoo.language.process.Tokenizer;
import com.yahoo.language.process.Transformer;
import com.yahoo.language.simple.SimpleNormalizer;
import com.yahoo.language.simple.SimpleToken;
import com.yahoo.language.simple.SimpleTokenType;
import com.yahoo.language.simple.SimpleTokenizer;
import com.yahoo.language.simple.SimpleTransformer;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import opennlp.tools.stemmer.Stemmer;
import opennlp.tools.stemmer.snowball.SnowballStemmer;

public class OpenNlpTokenizer
implements Tokenizer {
    private final Mode mode;
    private final boolean snowballStemmingForEnglish;
    private final boolean createCjkGrams;
    private final Normalizer normalizer;
    private final Transformer transformer;
    private final Optional<JiebaSegmenter> chineseSegmenter;
    private final SimpleTokenizer simpleTokenizer;
    private final SpecialTokenRegistry specialTokenRegistry;

    public OpenNlpTokenizer() {
        this((Normalizer)new SimpleNormalizer(), (Transformer)new SimpleTransformer());
    }

    public OpenNlpTokenizer(Normalizer normalizer, Transformer transformer) {
        this(Mode.query, normalizer, transformer, false, false, false);
    }

    public OpenNlpTokenizer(Mode mode, Normalizer normalizer, Transformer transformer, boolean snowballStemmingForEnglish, boolean cjk, boolean createCjkGrams) {
        this(mode, normalizer, transformer, snowballStemmingForEnglish, cjk, createCjkGrams, new SpecialTokenRegistry(List.of()));
    }

    public OpenNlpTokenizer(Mode mode, Normalizer normalizer, Transformer transformer, boolean snowballStemmingForEnglish, boolean cjk, boolean createCjkGrams, SpecialTokenRegistry specialTokenRegistry) {
        this(mode, normalizer, transformer, snowballStemmingForEnglish, cjk ? Optional.of(new JiebaSegmenter()) : Optional.empty(), createCjkGrams, specialTokenRegistry);
    }

    public OpenNlpTokenizer(Mode mode, Normalizer normalizer, Transformer transformer, boolean snowballStemmingForEnglish, Optional<JiebaSegmenter> jiebaSegmenter, boolean createCjkGrams, SpecialTokenRegistry specialTokenRegistry) {
        this.mode = mode;
        this.normalizer = normalizer;
        this.transformer = transformer;
        this.snowballStemmingForEnglish = snowballStemmingForEnglish;
        this.chineseSegmenter = jiebaSegmenter;
        this.createCjkGrams = createCjkGrams;
        this.specialTokenRegistry = specialTokenRegistry;
        this.simpleTokenizer = new SimpleTokenizer(normalizer, transformer, specialTokenRegistry);
    }

    public Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents) {
        if (this.chineseSegmenter.isPresent() && (language == Language.CHINESE_SIMPLIFIED || language == Language.CHINESE_TRADITIONAL)) {
            return this.segmentChinese(input);
        }
        Stemmer stemmer = this.stemmerFor(language, stemMode);
        if (stemmer == null) {
            return this.simpleTokenizer.tokenize(input, language, stemMode, removeAccents);
        }
        return this.simpleTokenizer.tokenize(input, token -> this.processToken((String)token, language, stemMode, removeAccents, stemmer));
    }

    private Iterable<Token> segmentChinese(String input) {
        if (input.isEmpty()) {
            return List.of();
        }
        ArrayList<Token> tokens = new ArrayList<Token>();
        JiebaSegmenter.SegMode jiebaMode = this.mode == Mode.index && this.createCjkGrams ? JiebaSegmenter.SegMode.INDEX : JiebaSegmenter.SegMode.SEARCH;
        for (SegToken token : this.chineseSegmenter.get().process(input, jiebaMode)) {
            int nextCode = token.word.codePointAt(0);
            TokenType tokenType = SimpleTokenType.valueOf((int)nextCode);
            String originToken = input.substring(token.startOffset, token.startOffset + token.word.length());
            SimpleToken simpleToken = new SimpleToken(originToken).setOffset((long)token.startOffset).setType(tokenType).setTokenString(token.word);
            tokens.add((Token)simpleToken);
        }
        return tokens;
    }

    private String processToken(String token, Language language, StemMode stemMode, boolean removeAccents, Stemmer stemmer) {
        token = this.normalizer.normalize(token);
        token = LinguisticsCase.toLowerCase((String)token);
        if (removeAccents) {
            token = this.transformer.accentDrop(token, language);
        }
        if (stemMode != StemMode.NONE) {
            token = stemmer.stem((CharSequence)token).toString();
        }
        return token;
    }

    private Stemmer stemmerFor(Language language, StemMode stemMode) {
        if (language == null || stemMode == StemMode.NONE) {
            return null;
        }
        if (language == Language.ENGLISH && !this.snowballStemmingForEnglish) {
            return null;
        }
        SnowballStemmer.ALGORITHM algorithm = this.algorithmFor(language);
        if (algorithm == null) {
            return null;
        }
        return new SnowballStemmer(algorithm);
    }

    private SnowballStemmer.ALGORITHM algorithmFor(Language language) {
        return switch (language) {
            case Language.ARABIC -> SnowballStemmer.ALGORITHM.ARABIC;
            case Language.CATALAN -> SnowballStemmer.ALGORITHM.CATALAN;
            case Language.DANISH -> SnowballStemmer.ALGORITHM.DANISH;
            case Language.DUTCH -> SnowballStemmer.ALGORITHM.DUTCH;
            case Language.ENGLISH -> SnowballStemmer.ALGORITHM.ENGLISH;
            case Language.FINNISH -> SnowballStemmer.ALGORITHM.FINNISH;
            case Language.FRENCH -> SnowballStemmer.ALGORITHM.FRENCH;
            case Language.GERMAN -> SnowballStemmer.ALGORITHM.GERMAN;
            case Language.GREEK -> SnowballStemmer.ALGORITHM.GREEK;
            case Language.HUNGARIAN -> SnowballStemmer.ALGORITHM.HUNGARIAN;
            case Language.INDONESIAN -> SnowballStemmer.ALGORITHM.INDONESIAN;
            case Language.IRISH -> SnowballStemmer.ALGORITHM.IRISH;
            case Language.ITALIAN -> SnowballStemmer.ALGORITHM.ITALIAN;
            case Language.NORWEGIAN_BOKMAL -> SnowballStemmer.ALGORITHM.NORWEGIAN;
            case Language.NORWEGIAN_NYNORSK -> SnowballStemmer.ALGORITHM.NORWEGIAN;
            case Language.PORTUGUESE -> SnowballStemmer.ALGORITHM.PORTUGUESE;
            case Language.ROMANIAN -> SnowballStemmer.ALGORITHM.ROMANIAN;
            case Language.RUSSIAN -> SnowballStemmer.ALGORITHM.RUSSIAN;
            case Language.SPANISH -> SnowballStemmer.ALGORITHM.SPANISH;
            case Language.SWEDISH -> SnowballStemmer.ALGORITHM.SWEDISH;
            case Language.TURKISH -> SnowballStemmer.ALGORITHM.TURKISH;
            default -> null;
        };
    }

    OpenNlpTokenizer withMode(Mode mode) {
        return new OpenNlpTokenizer(mode, this.normalizer, this.transformer, this.snowballStemmingForEnglish, this.chineseSegmenter, this.createCjkGrams, this.specialTokenRegistry);
    }

    public static enum Mode {
        index,
        query;

    }
}

