/*
 * Decompiled with CFR 0.152.
 */
package ai.vespa.language.chunker;

import com.yahoo.language.process.CharacterClasses;
import com.yahoo.language.process.Chunker;
import com.yahoo.text.UnicodeString;
import java.util.ArrayList;
import java.util.List;

/**
 * A chunker which splits a text into chunks of approximately a given length.
 *
 * The chunk length is taken from the first chunker argument when one is given,
 * and defaults to {@value #defaultChunkLength} otherwise. The actual chunk length
 * is adjusted so that the chunks of a text become about equally long, and, for
 * non-CJK languages, chunk ends are moved slightly to avoid splitting in the middle
 * of a run of letters or digits.
 */
public class FixedLengthChunker
implements Chunker {
    private static final int defaultChunkLength = 1000;
    private final CharacterClasses characters = new CharacterClasses();

    /**
     * Splits the given text into chunks. The result is computed through the cache of the
     * given context, keyed on this chunker, the text, the chunk length and whether the
     * language is CJK.
     *
     * @param inputText the text to split
     * @param context the context supplying the chunker arguments, the language and the cache
     * @return the chunks of the text, in order
     * @throws IllegalArgumentException if the first argument is not an integer, or is not positive
     */
    @Override
    public List<Chunker.Chunk> chunk(String inputText, Chunker.Context context) {
        int chunkLength = context.arguments().isEmpty() ? defaultChunkLength : asInteger(context.arguments().get(0));
        if (chunkLength <= 0) {
            // A non-positive length makes the target length arithmetic below degenerate
            // (division by zero, NaN, negative targets), so reject it up front.
            throw new IllegalArgumentException("Expected a positive chunk length argument to the fixed-length chunker, got '" + chunkLength + "'");
        }
        boolean isCjk = context.getLanguage().isCjk();
        return context.computeCachedValueIfAbsent(new CacheKey(this, inputText, chunkLength, isCjk),
                                                  () -> new ChunkComputer(inputText, chunkLength, isCjk).chunk());
    }

    /**
     * Parses the given chunker argument as an integer.
     *
     * @throws IllegalArgumentException if the argument is not a valid integer; the parse failure is kept as cause
     */
    private int asInteger(String s) {
        try {
            return Integer.parseInt(s);
        }
        catch (NumberFormatException e) {
            throw new IllegalArgumentException("Expected a chunk length integer argument to the fixed-length chunker, got '" + s + "'", e);
        }
    }

    /** The key under which the chunks computed for a given input are cached in the context. */
    private record CacheKey(FixedLengthChunker chunker, String inputText, int chunkLength, boolean isCjk) {
    }

    /** Holds the state of chunking a single text. An instance is meant to be used for one call to chunk(). */
    private class ChunkComputer {
        final UnicodeString text;
        /** The length at which a chunk may first end. */
        final int targetLength;
        final boolean isCjk;
        /** Below this length a chunk only ends where both the current and the next character are non-letters. */
        final int softMaxLength;
        /** At this length a chunk ends unconditionally. */
        final int hardMaxLength;
        final List<Chunker.Chunk> chunks = new ArrayList<>();
        /** The position in the text of the character currently being processed. */
        int index = 0;

        public ChunkComputer(String text, int chunkLength, boolean isCjk) {
            this.text = new UnicodeString(text);
            this.isCjk = isCjk;
            // chunkCount / targetChunkCount * chunkLength is the text length divided by the
            // rounded-up number of chunks, such that the chunks become about equally long
            // instead of leaving a short last chunk. For an empty text this is NaN, which
            // casts to a target length of 0.
            // NOTE(review): text.length() counts UTF-16 units while chunk() counts one per
            // appended code point - presumably these only differ for text containing
            // supplementary characters; confirm which unit the chunk length is meant to be in.
            double chunkCount = text.length() / (double) chunkLength;
            int targetChunkCount = (int) Math.ceil(chunkCount);
            this.targetLength = (int) Math.ceil(chunkCount / targetChunkCount * chunkLength);
            this.softMaxLength = (int) Math.round(targetLength * 1.05);
            this.hardMaxLength = (int) Math.round(targetLength * 1.1);
        }

        /** Walks through the text once, and returns the chunks collected along the way. */
        List<Chunker.Chunk> chunk() {
            StringBuilder currentChunk = new StringBuilder();
            int currentLength = 0;
            while (index < text.length()) {
                // The current character is always added before deciding whether the chunk ends here
                currentChunk.appendCodePoint(text.codePointAt(index));
                currentLength++;
                if (endOfChunk(currentLength)) {
                    chunks.add(new Chunker.Chunk(currentChunk.toString()));
                    currentChunk.setLength(0);
                    currentLength = 0;
                }
                index = nextIndex();
            }
            if (currentLength > 0) { // whatever remains becomes the last chunk
                chunks.add(new Chunker.Chunk(currentChunk.toString()));
            }
            return chunks;
        }

        /**
         * Returns whether the current chunk should end after the character at the current index.
         *
         * @param currentLength the length of the current chunk, including the current character
         */
        private boolean endOfChunk(int currentLength) {
            if (currentLength < targetLength) {
                return false;
            }
            if (isCjk) { // no attempt is made to find a boundary: end exactly at the target length
                return true;
            }
            if (currentLength < softMaxLength) {
                // Only end where neither this nor the next character is a letter or digit
                return !isLetter(index) && !isLetter(nextIndex());
            }
            if (currentLength < hardMaxLength) {
                // Past the soft max it suffices that this character is not a letter or digit
                return !isLetter(index);
            }
            return true; // hard max reached: end regardless of what the character is
        }

        int charAt(int index) {
            return text.codePointAt(index);
        }

        /** Returns whether there is a letter or digit at the given index. Indexes past the end return false. */
        boolean isLetter(int index) {
            if (index >= text.length()) {
                return false;
            }
            return characters.isLetterOrDigit(charAt(index));
        }

        /** Returns the index following the current index, as given by the text. */
        int nextIndex() {
            return text.nextIndex(index);
        }
    }
}

