/*
 * Decompiled with CFR 0.152.
 */
package com.github.tjake.jlama.model.llama;

import com.github.tjake.jlama.safetensors.tokenizer.BPETokenizer;
import java.nio.file.Path;
import java.util.Optional;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

/**
 * {@link BPETokenizer} specialization for Llama-style vocabularies.
 *
 * <p>On top of the base tokenizer this class:
 * <ul>
 *   <li>maps raw bytes to and from the run of 256 token ids that starts at the id of the
 *       {@code "<0x00>"} vocabulary entry (byte fallback);</li>
 *   <li>applies the model's normalizer, when one is configured, before encoding;</li>
 *   <li>removes {@code <s>} / {@code </s>} markers and turns the U+2581 marker into a plain
 *       space when decoding tokens.</li>
 * </ul>
 */
public class LlamaTokenizer extends BPETokenizer {
    /** U+2581 (LOWER ONE EIGHTH BLOCK); replaced by a space when a token is decoded. */
    static final String SPIECE_UNDERLINE = "\u2581";

    /** Matches the literal markers {@code <s>} and {@code </s>}; compiled once, reused per token. */
    private static final Pattern SENTENCE_MARKER = Pattern.compile("</?s>");

    /** Number of consecutive token ids treated as raw byte values. */
    private static final int BYTE_TOKEN_COUNT = 256;

    /** Vocabulary id of {@code "<0x00>"}, or {@code -1} when the vocabulary has no such entry. */
    private final int byteFallbackEncodingOffset;

    /**
     * Creates the tokenizer and looks up the id of the {@code "<0x00>"} vocabulary entry.
     *
     * @param modelRoot passed through to the {@link BPETokenizer} constructor
     */
    public LlamaTokenizer(Path modelRoot) {
        super(modelRoot);
        // The result cast is a no-op if vocabLookup is declared with type parameters and is
        // required if it is seen as a raw map; the arguments need no casts in either case.
        this.byteFallbackEncodingOffset =
                ((Long) this.getModel().vocabLookup.getOrDefault("<0x00>", -1L)).intValue();
    }

    /**
     * Maps a byte to a token id.
     *
     * @param c the byte to encode; interpreted as unsigned (0..255)
     * @return the unsigned byte value plus the {@code "<0x00>"} id, or the bare unsigned byte
     *         value when the vocabulary has no {@code "<0x00>"} entry
     */
    @Override
    protected long encodeCharacterAsToken(byte c) {
        return Byte.toUnsignedLong(c) + (long) Math.max(this.byteFallbackEncodingOffset, 0);
    }

    /**
     * Maps a token id back to a byte value when it lies in the byte-fallback id range.
     *
     * @param id the token id to inspect
     * @return the char with value {@code id - offset} when byte fallback is enabled, the
     *         vocabulary contains {@code "<0x00>"}, and {@code id} is one of the 256 ids starting
     *         at that entry; otherwise an empty {@link Optional}
     */
    @Override
    protected Optional<Character> maybeDecodeTokenAsCharacter(long id) {
        // -1 is the "not found" sentinel set in the constructor, so 0 is a legitimate offset
        // and must be accepted here (the encode side already produces ids for it).
        if (!this.model.byteFallback || this.byteFallbackEncodingOffset < 0) {
            return Optional.empty();
        }
        long first = this.byteFallbackEncodingOffset;
        if (id < first || id >= first + BYTE_TOKEN_COUNT) {
            return Optional.empty();
        }
        return Optional.of(Character.valueOf((char) (id - first)));
    }

    /**
     * Prepares text for encoding.
     *
     * <p>Runs the model's normalizer when one is present. For legacy models without byte
     * fallback, each code point is then substituted through {@code alteredBytes}; code points
     * with no entry are kept unchanged.
     *
     * @param sentence the raw input text
     * @return the text to hand to the encoder
     */
    @Override
    protected String preProcess(String sentence) {
        if (this.model.normalizer() != null) {
            sentence = this.model.normalizer().normalize(sentence);
        }
        if (this.model.isLegacy() && !this.model.byteFallback) {
            sentence = sentence.codePoints()
                    .map(c -> (Integer) alteredBytes.getOrDefault(c, c))
                    .mapToObj(Character::toString)
                    .collect(Collectors.joining());
        }
        return sentence;
    }

    /**
     * Finalizes a fully decoded string by stripping its leading whitespace.
     *
     * @param sentence the decoded text
     * @return {@code sentence} without leading whitespace
     */
    @Override
    protected String postProcess(String sentence) {
        return sentence.stripLeading();
    }

    /**
     * Cleans up a single decoded token.
     *
     * <p>A {@code null} token is replaced by the model's unknown token. {@code <s>} and
     * {@code </s>} are removed and every U+2581 becomes a space. For legacy models without byte
     * fallback, each code point is finally substituted through the inverse of
     * {@code alteredBytes}, undoing the substitution done in {@link #preProcess(String)}.
     *
     * @param decoded the decoded token text, possibly {@code null}
     * @return the cleaned token text
     */
    @Override
    protected String postProcessToken(String decoded) {
        if (decoded == null) {
            decoded = this.model.unkToken;
        }
        decoded = SENTENCE_MARKER.matcher(decoded).replaceAll("");
        // Literal replacement: no regex is needed for a fixed one-character marker.
        decoded = decoded.replace(SPIECE_UNDERLINE, " ");
        if (this.model.isLegacy() && !this.model.byteFallback) {
            decoded = decoded.codePoints()
                    .map(c -> (Integer) alteredBytes.inverse().getOrDefault(c, c))
                    .mapToObj(Character::toString)
                    .collect(Collectors.joining());
        }
        return decoded;
    }
}

