/*
 * Decompiled with CFR 0.152.
 */
package com.robrua.nlp.bert;

import com.robrua.nlp.bert.Tokenizer;
import java.util.Arrays;
import java.util.Map;
import java.util.Objects;
import java.util.stream.Stream;

/**
 * Tokenizer that breaks each word of a sequence into vocabulary sub-word pieces using a greedy
 * longest-match-first search.
 *
 * <p>The first piece of a word is looked up in the vocabulary as-is; every later piece is looked
 * up (and emitted) with a {@code "##"} prefix. A word that is longer than the configured maximum,
 * or that cannot be covered completely by vocabulary pieces, is replaced by a single unknown
 * token.
 */
public class WordpieceTokenizer extends Tokenizer {
    private static final int DEFAULT_MAX_CHARACTERS_PER_WORD = 200;
    private static final String DEFAULT_UNKNOWN_TOKEN = "[UNK]";
    /** Marker prepended to every piece that does not start at the beginning of its word. */
    private static final String CONTINUATION_PREFIX = "##";

    private final int maxCharactersPerWord;
    private final String unknownToken;
    private final Map<String, Integer> vocabulary;

    /**
     * Creates a tokenizer that uses {@code "[UNK]"} as the unknown token and a limit of 200
     * characters per word.
     *
     * @param vocabulary map whose keys are the known word pieces; must not be {@code null}
     * @throws NullPointerException if {@code vocabulary} is {@code null}
     */
    public WordpieceTokenizer(Map<String, Integer> vocabulary) {
        this(vocabulary, DEFAULT_UNKNOWN_TOKEN, DEFAULT_MAX_CHARACTERS_PER_WORD);
    }

    /**
     * Creates a tokenizer with an explicit unknown token and word-length limit.
     *
     * @param vocabulary map whose keys are the known word pieces; must not be {@code null}
     * @param unknownToken token emitted in place of a word that cannot be split
     * @param maxCharactersPerToken words with more {@code char}s than this are replaced by
     *     {@code unknownToken} without consulting the vocabulary
     * @throws NullPointerException if {@code vocabulary} is {@code null}
     */
    public WordpieceTokenizer(Map<String, Integer> vocabulary, String unknownToken, int maxCharactersPerToken) {
        // Fail fast here rather than with an anonymous NullPointerException on the first lookup.
        this.vocabulary = Objects.requireNonNull(vocabulary, "vocabulary");
        this.unknownToken = unknownToken;
        this.maxCharactersPerWord = maxCharactersPerToken;
    }

    /**
     * Splits one word into vocabulary pieces.
     *
     * @param token a single word
     * @return the word's pieces in order, or a stream holding only the unknown token when the
     *     word is too long or cannot be fully covered by vocabulary pieces; an empty stream for
     *     an empty word
     */
    private Stream<String> splitToken(String token) {
        int length = token.length();
        if (length > this.maxCharactersPerWord) {
            return Stream.of(this.unknownToken);
        }

        Stream.Builder<String> pieces = Stream.builder();
        int start = 0;
        while (start < length) {
            // Greedy search: try the longest remaining substring first, shrinking from the right.
            String match = null;
            int end = length;
            for (; end > start; --end) {
                String candidate = token.substring(start, end);
                if (start > 0) {
                    candidate = CONTINUATION_PREFIX + candidate;
                }
                if (this.vocabulary.containsKey(candidate)) {
                    match = candidate;
                    break;
                }
            }
            if (match == null) {
                // Nothing in the vocabulary starts at this position. The whole word is reported
                // as unknown and any pieces matched so far are discarded, which is how the
                // reference BERT WordPiece tokenizer treats an unsplittable word.
                return Stream.of(this.unknownToken);
            }
            pieces.accept(match);
            start = end;
        }
        return pieces.build();
    }

    /**
     * Runs the full pipeline for one sequence: {@code whitespaceTokenize} produces the words and
     * each word is then split into pieces.
     */
    private String[] tokenizeSequence(String sequence) {
        Stream<String> words = WordpieceTokenizer.whitespaceTokenize(sequence);
        return words.flatMap(this::splitToken).toArray(String[]::new);
    }

    /**
     * Tokenizes a single sequence.
     *
     * @param sequence the text to tokenize
     * @return the word pieces of every word of {@code sequence}, in order
     */
    @Override
    public String[] tokenize(String sequence) {
        return this.tokenizeSequence(sequence);
    }

    /**
     * Tokenizes several sequences independently.
     *
     * @param sequences the texts to tokenize
     * @return one array of word pieces per input sequence, in input order
     */
    @Override
    public String[][] tokenize(String ... sequences) {
        return Arrays.stream(sequences)
            .map(sequence -> this.tokenizeSequence(sequence))
            .toArray(String[][]::new);
    }
}

