package edu.berkeley.nlp.lm.io;

import cern.colt.matrix.impl.AbstractFormatter;
import edu.berkeley.nlp.lm.ConfigOptions;
import edu.berkeley.nlp.lm.WordIndexer;
import edu.berkeley.nlp.lm.collections.Iterators;
import edu.berkeley.nlp.lm.util.Logger;
import edu.berkeley.nlp.lm.util.LongRef;
import java.io.File;
import java.io.FilenameFilter;
import java.io.IOException;
import java.util.Arrays;
import java.util.Iterator;
import org.apache.lucene.search.suggest.FileDictionary;

/* loaded from: input_file:berkeleylm-1.1.2.jar:edu/berkeley/nlp/lm/io/GoogleLmReader.class */
/**
 * Reads n-gram counts from a directory tree and feeds them, one n-gram order at a time, to an
 * {@link NgramOrderedLmReaderCallback}.
 *
 * <p>Layout expected by this reader, as enforced by the code below:
 * <ul>
 *   <li>the root directory contains one sub-directory per n-gram order, each with a name ending in
 *       {@code "gms"}; the sub-directories are processed in sorted path order, and the position in
 *       that order (not the directory name) determines the n-gram order;</li>
 *   <li>the first sub-directory must contain exactly one file named {@code vocab_cs.gz}, which is
 *       used both to populate the word indexer and as the source of the order-1 counts;</li>
 *   <li>the sub-directory for order {@code n > 1} contains files whose names match
 *       {@code <n>gm-\d+(.gz)?}.</li>
 * </ul>
 *
 * <p>Each data line has the form {@code word_1 word_2 ... word_n<TAB>count}, with words separated
 * by single spaces and {@code count} parseable as a {@code long}.
 *
 * @param <W> word type of the {@link WordIndexer} that maps word strings to integer ids
 */
public class GoogleLmReader<W> implements LmReader<LongRef, NgramOrderedLmReaderCallback<LongRef>> {
    private static final String START_SYMBOL = "<S>";
    private static final String END_SYMBOL = "</S>";
    private static final String UNK_SYMBOL = "<UNK>";
    /** Name of the vocabulary file expected in the first n-gram directory. */
    private static final String sortedVocabFile = "vocab_cs.gz";
    private final String rootDir;
    private final WordIndexer<W> wordIndexer;

    /**
     * Creates a reader over the given root directory.
     *
     * @param str           path of the root directory holding the {@code *gms} sub-directories
     * @param wordIndexer   indexer that receives every word encountered while reading
     * @param configOptions accepted for interface compatibility; not used by this class
     */
    public GoogleLmReader(String str, WordIndexer<W> wordIndexer, ConfigOptions configOptions) {
        this.rootDir = str;
        this.wordIndexer = wordIndexer;
    }

    /**
     * Reads every n-gram order found under the root directory, lowest order first. After each
     * order the callback's {@code handleNgramOrderFinished} is invoked with the 1-based order, and
     * {@code cleanup} is invoked once at the very end.
     *
     * @param ngramOrderedLmReaderCallback receiver of the parsed n-grams and their counts
     * @throws RuntimeException if a directory cannot be listed, the vocabulary file is missing, or
     *                          a data file cannot be read
     * @throws IllegalArgumentException if a data line is malformed (this includes the
     *                          {@link NumberFormatException} raised for an unparseable count)
     */
    @Override // edu.berkeley.nlp.lm.io.LmReader
    public void parse(NgramOrderedLmReaderCallback<LongRef> ngramOrderedLmReaderCallback) {
        File[] orderDirs = new File(this.rootDir).listFiles(new FilenameFilter() {
            @Override // java.io.FilenameFilter
            public boolean accept(File dir, String name) {
                return name.endsWith("gms");
            }
        });
        // listFiles returns null (rather than an empty array) when the path is not a directory or
        // an I/O error occurs; fail with a message instead of a NullPointerException in sort().
        if (orderDirs == null) {
            throw new RuntimeException("Could not list n-gram directories under " + this.rootDir
                    + " (path does not exist, is not a directory, or is not readable)");
        }
        Arrays.sort(orderDirs);
        for (int order = 0; order < orderDirs.length; order++) {
            readOrder(orderDirs[order], order, ngramOrderedLmReaderCallback);
            ngramOrderedLmReaderCallback.handleNgramOrderFinished(order + 1);
        }
        ngramOrderedLmReaderCallback.cleanup();
    }

    /**
     * Reads all data files for a single n-gram order.
     *
     * @param orderDir directory holding the files of this order
     * @param order    zero-based n-gram order (0 means unigrams)
     * @param callback receiver of the parsed n-grams
     */
    private void readOrder(File orderDir, final int order, NgramOrderedLmReaderCallback<LongRef> callback) {
        final String fileRegex = (order + 1) + "gm-\\d+(.gz)?";
        File[] ngramFiles = orderDir.listFiles(new FilenameFilter() {
            @Override // java.io.FilenameFilter
            public boolean accept(File dir, String name) {
                // Unigram counts come from the count-sorted vocabulary file itself.
                return order == 0 ? name.equals(sortedVocabFile) : name.matches(fileRegex);
            }
        });
        // A "*gms" entry that is a plain file (or unreadable) yields null here.
        if (ngramFiles == null) {
            throw new RuntimeException("Could not list n-gram files in " + orderDir
                    + " (not a directory, or not readable)");
        }
        if (order == 0) {
            if (ngramFiles.length != 1) {
                throw new RuntimeException("Could not find expected vocab file " + sortedVocabFile);
            }
            // Index the vocabulary before any n-gram line is parsed.
            addToIndexer(this.wordIndexer, ngramFiles[0].getPath());
        } else if (ngramFiles.length == 0) {
            Logger.warn("Did not find any files matching expected regex " + fileRegex);
        }
        Arrays.sort(ngramFiles);
        Logger.startTrack("Reading ngrams of order " + (order + 1), new Object[0]);
        for (File ngramFile : ngramFiles) {
            readNgramFile(ngramFile, order, callback);
        }
        Logger.endTrack();
    }

    /**
     * Parses every line of one data file and forwards it to the callback, logging progress every
     * 10000 lines.
     *
     * @param ngramFile file to read
     * @param order     zero-based n-gram order of the lines in the file
     * @param callback  receiver of the parsed n-grams
     */
    private void readNgramFile(File ngramFile, int order, NgramOrderedLmReaderCallback<LongRef> callback) {
        Logger.startTrack("Reading ngrams from file " + ngramFile, new Object[0]);
        try {
            int lineNumber = 0;
            for (String line : Iterators.able(IOUtils.lineIterator(ngramFile.getPath()))) {
                if (lineNumber % 10000 == 0) {
                    Logger.logs("Line " + lineNumber);
                }
                lineNumber++;
                parseLine(line.trim(), order, callback);
            }
            Logger.endTrack();
        } catch (IOException e) {
            // NOTE(review): AbstractFormatter.DEFAULT_ROW_SEPARATOR looks like a decompiler
            // substitution for a plain string literal (presumably "\n") - confirm before replacing.
            throw new RuntimeException("Could not read file " + ngramFile + AbstractFormatter.DEFAULT_ROW_SEPARATOR, e);
        }
    }

    /**
     * Parses one {@code words<TAB>count} line and passes the word ids, the count and the raw word
     * string to the callback.
     *
     * @param line     trimmed input line
     * @param order    zero-based n-gram order; the line must contain exactly {@code order + 1}
     *                 space-separated words
     * @param callback receiver of the parsed n-gram
     * @throws IllegalArgumentException if the line has no tab, has the wrong number of words, or
     *                                  its count is not a valid {@code long}
     */
    private void parseLine(String line, int order, NgramOrderedLmReaderCallback<LongRef> callback) {
        int tabIndex = line.indexOf('\t');
        if (tabIndex < 0) {
            throw new IllegalArgumentException("Expected a tab between n-gram and count, but found none in line: \"" + line + "\"");
        }
        String words = line.substring(0, tabIndex);
        int[] ngram = new int[order + 1];
        int wordStart = 0;
        int wordCount = 0;
        while (true) {
            if (wordCount == ngram.length) {
                throw new IllegalArgumentException("Expected " + ngram.length + " word(s) but found more in line: \"" + line + "\"");
            }
            // Search only the word portion: a space located after the tab must never be taken
            // for a word boundary.
            int wordEnd = words.indexOf(' ', wordStart);
            if (wordEnd < 0) {
                wordEnd = words.length();
            }
            ngram[wordCount] = this.wordIndexer.getOrAddIndexFromString(words.substring(wordStart, wordEnd));
            wordCount++;
            if (wordEnd == words.length()) {
                break;
            }
            wordStart = wordEnd + 1;
        }
        // Without this check a short line would be reported with trailing zero ids.
        if (wordCount != ngram.length) {
            throw new IllegalArgumentException("Expected " + ngram.length + " word(s) but found " + wordCount + " in line: \"" + line + "\"");
        }
        long count = Long.parseLong(line.substring(tabIndex + 1));
        callback.call(ngram, 0, ngram.length, new LongRef(count), words);
    }

    /**
     * Adds the first field of every line of the given vocabulary file to the indexer, in file
     * order, and then registers the start, end and unknown-word symbols.
     *
     * <p>A warning is logged when the file is not named {@code vocab_cs.gz}; the file is read
     * regardless.
     *
     * @param wordIndexer indexer to populate
     * @param str         path of the vocabulary file
     * @param <W>         word type of the indexer
     * @throws RuntimeException if the file cannot be read
     */
    public static <W> void addToIndexer(WordIndexer<W> wordIndexer, String str) {
        if (!new File(str).getName().equals(sortedVocabFile)) {
            Logger.warn("You have specified that " + str + " is the count-sorted vocab file for Google n-grams, but it is usually named " + sortedVocabFile);
        }
        try {
            for (String line : Iterators.able(IOUtils.lineIterator(str))) {
                // NOTE(review): FileDictionary.DEFAULT_FIELD_DELIMITER looks like a decompiler
                // substitution for a plain string literal (presumably "\t") - confirm before replacing.
                wordIndexer.getOrAddIndexFromString(line.split(FileDictionary.DEFAULT_FIELD_DELIMITER)[0]);
            }
            addSpecialSymbols(wordIndexer);
        } catch (IOException e) {
            throw new RuntimeException("Could not read vocab file " + str, e);
        } catch (NumberFormatException e2) {
            throw new RuntimeException(e2);
        }
    }

    /**
     * Ensures the start, end and unknown-word symbols are present in the indexer and registers
     * them as its special symbols.
     */
    private static <W> void addSpecialSymbols(WordIndexer<W> wordIndexer) {
        wordIndexer.setStartSymbol(wordIndexer.getWord(wordIndexer.getOrAddIndexFromString(START_SYMBOL)));
        wordIndexer.setEndSymbol(wordIndexer.getWord(wordIndexer.getOrAddIndexFromString(END_SYMBOL)));
        wordIndexer.setUnkSymbol(wordIndexer.getWord(wordIndexer.getOrAddIndexFromString(UNK_SYMBOL)));
    }
}
