package com.yahoo.language.simple;

import com.yahoo.language.LinguisticsCase;
import com.yahoo.language.process.LinguisticsParameters;
import com.yahoo.language.process.Normalizer;
import com.yahoo.language.process.SpecialTokenRegistry;
import com.yahoo.language.process.SpecialTokens;
import com.yahoo.language.process.StemMode;
import com.yahoo.language.process.Token;
import com.yahoo.language.process.TokenScript;
import com.yahoo.language.process.TokenType;
import com.yahoo.language.process.Tokenizer;
import com.yahoo.language.process.Transformer;
import com.yahoo.language.simple.kstem.KStemmer;
import java.util.ArrayList;
import java.util.List;
import java.util.function.Function;
import java.util.logging.Level;
import java.util.logging.Logger;

/* loaded from: input_file:com/yahoo/language/simple/SimpleTokenizer.class */
/**
 * A tokenizer which splits a string into tokens by looking at the {@link TokenType} that
 * {@link SimpleTokenType#valueOf} reports for each code point, and which tags every token with a
 * {@link TokenScript} derived from {@link SimpleTokenScript#valueOf}.
 *
 * <p>When tokenizing with {@link LinguisticsParameters}, each token string is normalized with the
 * configured {@link Normalizer}, then optionally lowercased, accent-stripped with the configured
 * {@link Transformer}, and stemmed with a {@link KStemmer}.
 */
public class SimpleTokenizer implements Tokenizer {

    private static final Logger log = Logger.getLogger(SimpleTokenizer.class.getName());

    /** Code point of the virtual space that is examined once the end of the input is reached. */
    private static final int SPACE_CODE = 32;

    private final Normalizer normalizer;
    private final Transformer transformer;
    private final KStemmer stemmer = new KStemmer();

    /** Stored at construction; no method visible in this class reads it. */
    private final SpecialTokenRegistry specialTokenRegistry;

    /** Creates a tokenizer using a {@link SimpleNormalizer} and a {@link SimpleTransformer}. */
    public SimpleTokenizer() {
        this(new SimpleNormalizer(), new SimpleTransformer());
    }

    /** Creates a tokenizer using the given normalizer and a {@link SimpleTransformer}. */
    public SimpleTokenizer(Normalizer normalizer) {
        this(normalizer, new SimpleTransformer());
    }

    /** Creates a tokenizer using the given normalizer and transformer and an empty special token registry. */
    public SimpleTokenizer(Normalizer normalizer, Transformer transformer) {
        // An explicit type witness is used instead of a cast: in cast context List.of() would be
        // inferred as List<Object>, which cannot be cast to List<SpecialTokens>.
        this(normalizer, transformer, new SpecialTokenRegistry(List.<SpecialTokens>of()));
    }

    /**
     * Creates a tokenizer from explicitly supplied collaborators.
     *
     * @param normalizer           applied first to every token string in {@link #tokenize(String, LinguisticsParameters)}
     * @param transformer          used for accent dropping when the parameters request it
     * @param specialTokenRegistry retained by this instance
     */
    public SimpleTokenizer(Normalizer normalizer, Transformer transformer, SpecialTokenRegistry specialTokenRegistry) {
        this.normalizer = normalizer;
        this.transformer = transformer;
        this.specialTokenRegistry = specialTokenRegistry;
    }

    /**
     * Tokenizes the input, producing each token string by normalizing the original text and then
     * applying lowercasing, accent removal and stemming as requested by the given parameters.
     *
     * @param str                   the text to tokenize
     * @param linguisticsParameters decides which processing steps are applied to each token string
     * @return the tokens of the input, in input order
     * @throws NullPointerException if {@code str} is null
     */
    @Override
    public Iterable<Token> tokenize(String str, LinguisticsParameters linguisticsParameters) {
        return tokenize(str, token -> processToken(token, linguisticsParameters));
    }

    /**
     * Tokenizes the input, using the given function to turn the original text of each token into
     * its token string.
     *
     * <p>Every character of the input ends up in exactly one token: the emitted substrings are
     * contiguous and start at offset 0. Offsets are indexes into {@code str} (UTF-16 code units).
     *
     * @param str      the text to tokenize
     * @param function invoked once per token with the original substring; its result becomes the token string
     * @return an immutable empty list for an empty input, otherwise a list of tokens in input order
     * @throws NullPointerException if {@code str} is null
     */
    public Iterable<Token> tokenize(String str, Function<String, String> function) {
        if (str.isEmpty()) {
            return List.of();
        }

        List<Token> tokens = new ArrayList<>();
        int nextCode = str.codePointAt(0);
        // prevType only changes at a token boundary; it is what the boundary test compares against.
        TokenType prevType = SimpleTokenType.valueOf(nextCode);
        // tokenType and tokenScript are what the token being accumulated will be labelled with.
        TokenType tokenType = prevType;
        TokenScript tokenScript = SimpleTokenScript.valueOf(nextCode);

        int start = 0;
        // 'next' may become equal to str.length(): on that final iteration a virtual space is
        // examined instead of a real character, giving the trailing token a chance to be emitted.
        // NOTE(review): this assumes SimpleTokenType classifies a space as non-indexable - confirm.
        for (int next = Character.charCount(nextCode); next <= str.length(); next += Character.charCount(nextCode)) {
            nextCode = next < str.length() ? str.codePointAt(next) : SPACE_CODE;
            TokenType nextType = SimpleTokenType.valueOf(nextCode);
            TokenScript nextScript = SimpleTokenScript.valueOf(nextCode);

            if (isAtTokenBoundary(prevType, nextType)) {
                String original = str.substring(start, next);
                tokens.add(new SimpleToken(original).setOffset(start)
                                                    .setType(tokenType)
                                                    .setTokenString(function.apply(original))
                                                    .setScript(tokenScript));
                // Start accumulating a new token at the character just examined.
                start = next;
                prevType = nextType;
                tokenType = nextType;
                tokenScript = nextScript;
            } else {
                tokenType = determineType(tokenType, nextType);
                tokenScript = determineScript(tokenScript, nextScript);
            }
        }
        return tokens;
    }

    /**
     * Returns whether a token ends between a character of type {@code prevType} and one of type
     * {@code nextType}: true if either is an indexable symbol, or if either is not indexable.
     */
    private boolean isAtTokenBoundary(TokenType prevType, TokenType nextType) {
        if (prevType == TokenType.INDEXABLE_SYMBOL || nextType == TokenType.INDEXABLE_SYMBOL) {
            return true;
        }
        return !prevType.isIndexable() || !nextType.isIndexable();
    }

    /** Returns ALPHABETIC if the next character is alphabetic, otherwise the type accumulated so far. */
    private TokenType determineType(TokenType tokenType, TokenType nextType) {
        if (nextType == TokenType.ALPHABETIC) {
            return TokenType.ALPHABETIC;
        }
        return tokenType;
    }

    /** Returns LATIN if the next character is Latin, otherwise the script accumulated so far. */
    private TokenScript determineScript(TokenScript tokenScript, TokenScript nextScript) {
        if (nextScript == TokenScript.LATIN) {
            return TokenScript.LATIN;
        }
        return tokenScript;
    }

    /**
     * Produces the token string for the given original token text: normalizes it, then lowercases,
     * drops accents and stems it, each step only when enabled by the given parameters.
     */
    private String processToken(String token, LinguisticsParameters parameters) {
        log.log(Level.FINEST, () -> "processToken '" + token + "'");

        String current = normalizer.normalize(token);
        if (parameters.lowercase()) {
            current = LinguisticsCase.toLowerCase(current);
        }
        if (parameters.removeAccents()) {
            current = transformer.accentDrop(current, parameters.language());
        }
        if (parameters.stemMode() != StemMode.NONE) {
            // Lambdas may only capture effectively final locals, so the values before and after
            // stemming are held in their own single-assignment variables.
            String unstemmed = current;
            String stemmed = stemmer.stem(unstemmed);
            log.log(Level.FINEST, () -> "stem '" + unstemmed + "' to '" + stemmed + "'");
            current = stemmed;
        }

        String processed = current;
        log.log(Level.FINEST, () -> "processed token is: " + processed);
        return processed;
    }
}
