package org.springframework.ai.transformer.splitter;

import com.knuddels.jtokkit.Encodings;
import com.knuddels.jtokkit.api.Encoding;
import com.knuddels.jtokkit.api.EncodingRegistry;
import com.knuddels.jtokkit.api.EncodingType;
import com.knuddels.jtokkit.api.IntArrayList;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import org.springframework.util.Assert;

/* loaded from: input_file:org/springframework/ai/transformer/splitter/TokenTextSplitter.class */
/**
 * A {@link TextSplitter} that splits text into chunks measured in tokens.
 *
 * <p>The text is encoded with the {@code CL100K_BASE} encoding obtained from a lazily
 * initialised JTokkit registry. Chunks of at most {@code chunkSize} tokens are decoded
 * back to text and, when possible, shortened so that they end on the last
 * {@code '.'}, {@code '?'}, {@code '!'} or newline found in the chunk.
 *
 * <p>Instances are configured either through the constructors or through
 * {@link #builder()}.
 */
public class TokenTextSplitter extends TextSplitter {
    private static final int DEFAULT_CHUNK_SIZE = 800;
    private static final int MIN_CHUNK_SIZE_CHARS = 350;
    private static final int MIN_CHUNK_LENGTH_TO_EMBED = 5;
    private static final int MAX_NUM_CHUNKS = 10000;
    private static final boolean KEEP_SEPARATOR = true;

    private final EncodingRegistry registry;
    private final Encoding encoding;

    /** Maximum number of tokens taken from the input for a single chunk. */
    private final int chunkSize;

    /** A chunk is cut at its last punctuation mark only if that mark lies beyond this character index. */
    private final int minChunkSizeChars;

    /** Chunks whose trimmed length does not exceed this value are discarded. */
    private final int minChunkLengthToEmbed;

    /** Upper bound on the number of non-blank chunks processed by the main splitting loop. */
    private final int maxNumChunks;

    /** When {@code false}, line separators inside a chunk are replaced by a single space. */
    private final boolean keepSeparator;

    /**
     * Fluent builder for {@link TokenTextSplitter}. Every option starts at the same
     * default that the no-argument constructor uses.
     */
    public static final class Builder {
        private int chunkSize = DEFAULT_CHUNK_SIZE;
        private int minChunkSizeChars = MIN_CHUNK_SIZE_CHARS;
        private int minChunkLengthToEmbed = MIN_CHUNK_LENGTH_TO_EMBED;
        private int maxNumChunks = MAX_NUM_CHUNKS;
        private boolean keepSeparator = KEEP_SEPARATOR;

        private Builder() {
        }

        /**
         * Sets the maximum number of tokens per chunk.
         * @param chunkSize the chunk size in tokens
         * @return this builder
         */
        public Builder withChunkSize(int chunkSize) {
            this.chunkSize = chunkSize;
            return this;
        }

        /**
         * Sets the character index a punctuation mark must exceed before a chunk is cut at it.
         * @param minChunkSizeChars the threshold in characters
         * @return this builder
         */
        public Builder withMinChunkSizeChars(int minChunkSizeChars) {
            this.minChunkSizeChars = minChunkSizeChars;
            return this;
        }

        /**
         * Sets the length a trimmed chunk must exceed to be included in the result.
         * @param minChunkLengthToEmbed the threshold in characters
         * @return this builder
         */
        public Builder withMinChunkLengthToEmbed(int minChunkLengthToEmbed) {
            this.minChunkLengthToEmbed = minChunkLengthToEmbed;
            return this;
        }

        /**
         * Sets the maximum number of chunks processed by the main splitting loop.
         * @param maxNumChunks the chunk limit
         * @return this builder
         */
        public Builder withMaxNumChunks(int maxNumChunks) {
            this.maxNumChunks = maxNumChunks;
            return this;
        }

        /**
         * Sets whether line separators are kept inside chunks.
         * @param keepSeparator {@code true} to keep them, {@code false} to replace them with spaces
         * @return this builder
         */
        public Builder withKeepSeparator(boolean keepSeparator) {
            this.keepSeparator = keepSeparator;
            return this;
        }

        /**
         * Creates the splitter from the values collected so far.
         * @return a new {@link TokenTextSplitter}
         */
        public TokenTextSplitter build() {
            return new TokenTextSplitter(this.chunkSize, this.minChunkSizeChars, this.minChunkLengthToEmbed,
                    this.maxNumChunks, this.keepSeparator);
        }
    }

    /** Creates a splitter that uses the default value for every setting. */
    public TokenTextSplitter() {
        this(DEFAULT_CHUNK_SIZE, MIN_CHUNK_SIZE_CHARS, MIN_CHUNK_LENGTH_TO_EMBED, MAX_NUM_CHUNKS, KEEP_SEPARATOR);
    }

    /**
     * Creates a splitter with default sizes and the given separator handling.
     * @param keepSeparator {@code true} to keep line separators inside chunks
     */
    public TokenTextSplitter(boolean keepSeparator) {
        this(DEFAULT_CHUNK_SIZE, MIN_CHUNK_SIZE_CHARS, MIN_CHUNK_LENGTH_TO_EMBED, MAX_NUM_CHUNKS, keepSeparator);
    }

    /**
     * Creates a fully configured splitter.
     * @param chunkSize maximum number of tokens per chunk
     * @param minChunkSizeChars character index a punctuation mark must exceed before a chunk is cut at it
     * @param minChunkLengthToEmbed length a trimmed chunk must exceed to be included in the result
     * @param maxNumChunks maximum number of chunks processed by the main splitting loop
     * @param keepSeparator {@code true} to keep line separators inside chunks
     */
    public TokenTextSplitter(int chunkSize, int minChunkSizeChars, int minChunkLengthToEmbed, int maxNumChunks,
            boolean keepSeparator) {
        this.registry = Encodings.newLazyEncodingRegistry();
        this.encoding = this.registry.getEncoding(EncodingType.CL100K_BASE);
        this.chunkSize = chunkSize;
        this.minChunkSizeChars = minChunkSizeChars;
        this.minChunkLengthToEmbed = minChunkLengthToEmbed;
        this.maxNumChunks = maxNumChunks;
        this.keepSeparator = keepSeparator;
    }

    /**
     * Returns a builder pre-populated with the default settings.
     * @return a new {@link Builder}
     */
    public static Builder builder() {
        return new Builder();
    }

    /**
     * Splits the text using the chunk size configured on this instance.
     * @param text the text to split
     * @return the resulting chunks
     */
    @Override
    protected List<String> splitText(String text) {
        return doSplit(text, this.chunkSize);
    }

    /**
     * Splits {@code text} into chunks of at most {@code chunkSize} tokens.
     *
     * <p>Each window of tokens is decoded to text. If the last {@code '.'}, {@code '?'},
     * {@code '!'} or newline in that text lies beyond {@code minChunkSizeChars}, the text
     * is cut right after it. The (possibly shortened) text is re-encoded to determine how
     * many tokens to drop from the front of the remaining input. Once {@code maxNumChunks}
     * non-blank chunks have been processed, whatever is left is emitted as one final chunk
     * with line separators replaced by spaces.
     * @param text the text to split; {@code null} or blank text yields an empty list
     * @param chunkSize maximum number of tokens per chunk
     * @return a mutable list of chunks, each longer than {@code minChunkLengthToEmbed} after trimming
     * @throws IllegalArgumentException if {@code text} is not blank and {@code chunkSize} is not positive
     */
    protected List<String> doSplit(String text, int chunkSize) {
        if (text == null || text.trim().isEmpty()) {
            return new ArrayList<>();
        }
        // A zero-sized window never consumes any token, so the loop below would spin forever.
        Assert.isTrue(chunkSize > 0, "Chunk size must be positive");

        List<Integer> tokens = getEncodedTokens(text);
        List<String> chunks = new ArrayList<>();
        int numChunks = 0;

        while (!tokens.isEmpty() && numChunks < this.maxNumChunks) {
            List<Integer> chunk = tokens.subList(0, Math.min(chunkSize, tokens.size()));
            String chunkText = decodeTokens(chunk);

            // Whitespace-only windows are dropped and do not count towards the chunk limit.
            if (chunkText.trim().isEmpty()) {
                tokens = tokens.subList(chunk.size(), tokens.size());
                continue;
            }

            int lastPunctuation = Math.max(chunkText.lastIndexOf('.'),
                    Math.max(chunkText.lastIndexOf('?'),
                            Math.max(chunkText.lastIndexOf('!'), chunkText.lastIndexOf('\n'))));
            if (lastPunctuation != -1 && lastPunctuation > this.minChunkSizeChars) {
                // Keep the punctuation mark itself as the last character of the chunk.
                chunkText = chunkText.substring(0, lastPunctuation + 1);
            }

            String chunkToAppend = this.keepSeparator
                    ? chunkText.trim()
                    : chunkText.replace(System.lineSeparator(), " ").trim();
            if (chunkToAppend.length() > this.minChunkLengthToEmbed) {
                chunks.add(chunkToAppend);
            }

            // Re-encoding the text need not give the same token count as the window it was
            // decoded from; clamp so the index can never run past the remaining tokens.
            int consumed = Math.min(getEncodedTokens(chunkText).size(), tokens.size());
            tokens = tokens.subList(consumed, tokens.size());
            numChunks++;
        }

        // Tokens remain only when the chunk limit stopped the loop: emit them as one chunk.
        if (!tokens.isEmpty()) {
            String remainder = decodeTokens(tokens).replace(System.lineSeparator(), " ").trim();
            if (remainder.length() > this.minChunkLengthToEmbed) {
                chunks.add(remainder);
            }
        }
        return chunks;
    }

    /**
     * Encodes text into its token ids.
     * @param text the text to encode; must not be {@code null}
     * @return the token ids as a boxed list
     */
    private List<Integer> getEncodedTokens(String text) {
        Assert.notNull(text, "Text must not be null");
        return this.encoding.encode(text).boxed();
    }

    /**
     * Decodes token ids back into text.
     * @param tokens the token ids to decode; must not be {@code null}
     * @return the decoded text
     */
    private String decodeTokens(List<Integer> tokens) {
        Assert.notNull(tokens, "Tokens must not be null");
        IntArrayList tokenArray = new IntArrayList(tokens.size());
        tokens.forEach(tokenArray::add);
        return this.encoding.decode(tokenArray);
    }
}
