package com.github.tjake.jlama.model.llama;

import com.github.tjake.jlama.safetensors.tokenizer.BPETokenizer;
import java.nio.file.Path;
import java.util.Map;
import java.util.Optional;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

/* loaded from: input_file:com/github/tjake/jlama/model/llama/LlamaTokenizer.class */
/**
 * {@link BPETokenizer} specialisation used for Llama models.
 *
 * <p>Adds three behaviours on top of the base tokenizer:
 * <ul>
 *   <li>byte-fallback encoding/decoding, based on the vocabulary id of the {@code "<0x00>"} token;</li>
 *   <li>removal of {@code <s>} / {@code </s>} markers and replacement of the
 *       {@link #SPIECE_UNDERLINE} marker with a plain space when rendering tokens;</li>
 *   <li>code-point remapping through {@code alteredBytes} when the model is flagged as legacy
 *       and byte fallback is disabled.</li>
 * </ul>
 */
public class LlamaTokenizer extends BPETokenizer {
    /** Marker character that is rendered as a regular space by {@link #postProcessToken(String)}. */
    static final String SPIECE_UNDERLINE = "▁";

    /** Matches {@code <s>} and {@code </s>}; compiled once because token post-processing runs per token. */
    private static final Pattern SENTENCE_MARKERS = Pattern.compile("</?s>");

    /** Vocabulary id of the {@code "<0x00>"} token, or {@code -1} when the vocabulary has no such token. */
    private final int byteFallbackEncodingOffset;

    /**
     * Loads the tokenizer from {@code path} and records where the byte-fallback tokens start.
     *
     * @param path location handed to the {@link BPETokenizer} constructor
     */
    public LlamaTokenizer(Path path) {
        super(path);
        // -1 is the "not present" sentinel; the cast is kept because the map's generic type is declared elsewhere.
        this.byteFallbackEncodingOffset = ((Long) getModel().vocabLookup.getOrDefault("<0x00>", -1L)).intValue();
    }

    /**
     * Maps a raw byte to its token id by adding the unsigned byte value to the byte-fallback offset.
     * When the offset is negative (no {@code "<0x00>"} token was found) the unsigned byte value
     * itself is returned.
     *
     * @param b the byte to encode
     * @return the token id for {@code b}
     */
    @Override
    protected long encodeCharacterAsToken(byte b) {
        return Byte.toUnsignedLong(b) + Math.max(this.byteFallbackEncodingOffset, 0);
    }

    /**
     * Decodes a token id that falls inside the 256-entry byte-fallback range back into a character.
     *
     * @param id the token id to inspect
     * @return the decoded character, or {@link Optional#empty()} when byte fallback is disabled,
     *         the offset is not positive, or {@code id} lies outside the byte-fallback range
     */
    @Override
    protected Optional<Character> maybeDecodeTokenAsCharacter(long id) {
        // NOTE(review): an offset of exactly 0 is rejected here, while encodeCharacterAsToken
        // treats 0 as a usable base and -1 is the "missing" sentinel. Confirm whether "<0x00>"
        // can ever have id 0 before relaxing this check.
        if (!this.model.byteFallback || this.byteFallbackEncodingOffset <= 0) {
            return Optional.empty();
        }
        long byteValue = id - this.byteFallbackEncodingOffset;
        if (byteValue < 0 || byteValue >= 256) {
            return Optional.empty();
        }
        return Optional.of((char) byteValue);
    }

    /**
     * Prepares raw input text for tokenization: applies the model's normalizer when one is
     * configured, then remaps code points through {@code alteredBytes} for legacy models
     * that do not use byte fallback.
     *
     * @param str the text to prepare
     * @return the prepared text
     */
    @Override
    protected String preProcess(String str) {
        String text = str;
        if (this.model.normalizer() != null) {
            text = this.model.normalizer().normalize(text);
        }
        if (usesLegacyCodePointMapping()) {
            text = remapCodePoints(text, alteredBytes);
        }
        return text;
    }

    /**
     * Finalises fully decoded text by stripping leading whitespace.
     *
     * @param str the decoded text
     * @return {@code str} without leading whitespace
     */
    @Override
    protected String postProcess(String str) {
        return str.stripLeading();
    }

    /**
     * Converts a single decoded token into display text: substitutes the model's unknown token
     * for {@code null}, removes {@code <s>} / {@code </s>} markers, turns
     * {@link #SPIECE_UNDERLINE} into a space and, for legacy models without byte fallback,
     * reverses the {@code alteredBytes} code-point mapping.
     *
     * @param str the decoded token text, possibly {@code null}
     * @return the display form of the token
     */
    @Override
    protected String postProcessToken(String str) {
        String token = (str != null) ? str : this.model.unkToken;
        // Literal replace() gives the same result as the regex form because the marker has no metacharacters.
        String text = SENTENCE_MARKERS.matcher(token).replaceAll("").replace(SPIECE_UNDERLINE, " ");
        if (usesLegacyCodePointMapping()) {
            text = remapCodePoints(text, alteredBytes.inverse());
        }
        return text;
    }

    /** Returns whether code points must be translated through {@code alteredBytes}. */
    private boolean usesLegacyCodePointMapping() {
        return this.model.isLegacy() && !this.model.byteFallback;
    }

    /** Rebuilds {@code text} with every code point replaced by its entry in {@code mapping}, if any. */
    private static String remapCodePoints(String text, Map<Integer, Integer> mapping) {
        return text.codePoints()
                .map(codePoint -> mapping.getOrDefault(codePoint, codePoint))
                .mapToObj(Character::toString)
                .collect(Collectors.joining());
    }
}
