/*
 * Decompiled with CFR 0.152.
 */
package hex.tfidf;

import java.util.regex.Pattern;

import water.MRTask;
import water.fvec.Chunk;
import water.fvec.NewChunk;
import water.parser.BufferedString;

/**
 * Map/reduce task that tokenizes documents into one output row per word.
 *
 * <p>For every non-NA row of the document-content column, the content is split on runs of
 * whitespace and each resulting non-empty word is appended to the output together with that
 * row's document ID: output chunk 0 receives the ID, output chunk 1 receives the word.
 */
public class TfIdfPreprocessorTask
extends MRTask<TfIdfPreprocessorTask> {
    /** Regular expression separating words in a document: one or more whitespace characters. */
    private static final String WORDS_DELIMITER_REGEX = "\\s+";
    /** Delimiter compiled once, so the regex is not re-parsed for every processed row. */
    private static final Pattern WORDS_DELIMITER = Pattern.compile(WORDS_DELIMITER_REGEX);

    /** Index (into the input chunks) of the column holding document IDs. */
    private final int _docIdsColIdx;
    /** Index (into the input chunks) of the column holding document contents. */
    private final int _docContentsColIdx;

    /**
     * Creates the task.
     *
     * @param docIdsColIdx      index of the input column with document IDs
     * @param docContentsColIdx index of the input column with document contents
     */
    public TfIdfPreprocessorTask(int docIdsColIdx, int docContentsColIdx) {
        _docIdsColIdx = docIdsColIdx;
        _docContentsColIdx = docContentsColIdx;
    }

    /**
     * Splits every document in the given chunks into words and writes one
     * (document ID, word) pair per word.
     *
     * @param cs  input chunks; the document-ID and document-content columns are selected by the
     *            indices passed to the constructor
     * @param ncs output chunks; {@code ncs[0]} receives document IDs, {@code ncs[1]} receives words
     */
    @Override
    public void map(Chunk[] cs, NewChunk[] ncs) {
        Chunk inputDocumentIds = cs[_docIdsColIdx];
        Chunk inputDocs = cs[_docContentsColIdx];
        NewChunk outputDocumentIds = ncs[0];
        NewChunk outputTokens = ncs[1];

        for (int row = 0; row < inputDocs._len; ++row) {
            // Rows without document content contribute no tokens.
            if (inputDocs.isNA(row)) {
                continue;
            }
            String document = inputDocs.atStr(new BufferedString(), row).toString();
            // NOTE(review): the ID is read without an NA check - confirm the ID column
            // cannot contain NAs for rows that have content.
            long documentId = inputDocumentIds.at8(row);

            for (String word : WORDS_DELIMITER.split(document)) {
                // split() yields an empty first element when the document starts with
                // whitespace (and a single empty element for an empty document);
                // an empty string is not a word, so it must not be emitted as a token.
                if (word.isEmpty()) {
                    continue;
                }
                outputDocumentIds.addNum(documentId);
                outputTokens.addStr(word);
            }
        }
    }
}

