package org.apdplat.word.analysis;

import java.math.BigInteger;
import java.util.List;
import org.apdplat.word.segmentation.Word;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:org/apdplat/word/analysis/SimHashPlusHammingDistanceTextSimilarity.class */
/**
 * Text similarity based on SimHash fingerprints compared by Hamming distance.
 *
 * <p>Each text (already segmented into words) is reduced to a fixed-width bit
 * fingerprint. The similarity score is {@code 1 - hammingDistance / bitCount},
 * so identical fingerprints score {@code 1.0} and fully opposite fingerprints
 * score {@code 0.0}.
 *
 * <p>This class is not synchronized; changing the bit count through
 * {@link #setHashBitCount(int)} while a score is being computed on another
 * thread may yield fingerprints of different lengths (reported as score 0).
 */
public class SimHashPlusHammingDistanceTextSimilarity extends TextSimilarity {
    private static final Logger LOGGER = LoggerFactory.getLogger(SimHashPlusHammingDistanceTextSimilarity.class);

    /** Fingerprint width, in bits, used when the caller does not specify one. */
    private static final int DEFAULT_HASH_BIT_COUNT = 128;

    /** Multiplier applied to the seed value of every word hash. */
    private static final BigInteger HASH_MULTIPLIER = BigInteger.valueOf(1000003L);

    /** Number of bits in each SimHash fingerprint; always positive. */
    private int hashBitCount;

    /** Creates an instance that uses 128-bit fingerprints. */
    public SimHashPlusHammingDistanceTextSimilarity() {
        this.hashBitCount = DEFAULT_HASH_BIT_COUNT;
    }

    /**
     * Creates an instance with a custom fingerprint width.
     *
     * @param i number of bits in the fingerprint; must be positive
     * @throws IllegalArgumentException if {@code i} is zero or negative
     */
    public SimHashPlusHammingDistanceTextSimilarity(int i) {
        this.hashBitCount = requirePositive(i);
    }

    /**
     * Returns the fingerprint width.
     *
     * @return number of bits in each fingerprint
     */
    public int getHashBitCount() {
        return this.hashBitCount;
    }

    /**
     * Changes the fingerprint width used by subsequent score computations.
     *
     * @param i number of bits in the fingerprint; must be positive
     * @throws IllegalArgumentException if {@code i} is zero or negative
     */
    public void setHashBitCount(int i) {
        this.hashBitCount = requirePositive(i);
    }

    /**
     * Rejects bit counts that cannot produce a fingerprint. A non-positive
     * width would otherwise fail later with an unrelated array-size or
     * arithmetic error in the middle of scoring.
     */
    private static int requirePositive(int bitCount) {
        if (bitCount <= 0) {
            throw new IllegalArgumentException("hashBitCount must be positive: " + bitCount);
        }
        return bitCount;
    }

    /**
     * Scores two segmented texts by comparing their SimHash fingerprints.
     *
     * @param list  words of the first text
     * @param list2 words of the second text
     * @return {@code 1 - hammingDistance / fingerprintLength}, a value in
     *         {@code [0, 1]}; {@code 0.0} if the two fingerprints turn out to
     *         have different lengths
     */
    @Override // org.apdplat.word.analysis.TextSimilarity
    protected double scoreImpl(List<Word> list, List<Word> list2) {
        // Inherited helper, defined in TextSimilarity (not visible here);
        // presumably assigns the word weights that simHash() reads below.
        taggingWeightWithWordFrequency(list, list2);
        String fingerprint1 = simHash(list);
        String fingerprint2 = simHash(list2);
        int distance = hammingDistance(fingerprint1, fingerprint2);
        if (distance == -1) {
            LOGGER.error("文本1：{}", list);
            LOGGER.error("文本2：{}", list2);
            LOGGER.error("文本1SimHash值：{}", fingerprint1);
            LOGGER.error("文本2SimHash值：{}", fingerprint2);
            LOGGER.error("文本1和文本2的SimHash值长度不相等，不能计算汉明距离");
            return 0.0d;
        }
        int bitLength = fingerprint1.length();
        // Divide in floating point: int / int would truncate the ratio to 0
        // for every distance smaller than the fingerprint length.
        double score = 1.0d - (distance / (double) bitLength);
        if (LOGGER.isDebugEnabled()) {
            LOGGER.debug("文本1：{}", list);
            LOGGER.debug("文本2：{}", list2);
            LOGGER.debug("文本1SimHash值：{}", fingerprint1);
            LOGGER.debug("文本2SimHash值：{}", fingerprint2);
            LOGGER.debug("hashBitCount：{}", this.hashBitCount);
            LOGGER.debug("SimHash值之间的汉明距离：{}", distance);
            LOGGER.debug("文本1和文本2的相似度分值：1 - {} / (double){}={}", distance, bitLength, score);
        }
        return score;
    }

    /**
     * Builds the SimHash fingerprint of a word list.
     *
     * <p>Every word votes on every bit position: its weight is added when the
     * word's hash has that bit set and subtracted otherwise. A word whose
     * weight is {@code null} votes with weight 1. A position whose total is
     * non-negative becomes {@code '1'}, otherwise {@code '0'}.
     *
     * @param words words to fingerprint
     * @return a string of {@code '0'}/{@code '1'} characters, one per bit,
     *         with bit 0 (least significant) first
     */
    private String simHash(List<Word> words) {
        int bitCount = this.hashBitCount;
        float[] votes = new float[bitCount];
        for (Word word : words) {
            float weight = word.getWeight() == null ? 1.0f : word.getWeight().floatValue();
            BigInteger wordHash = hash(word.getText());
            for (int bit = 0; bit < bitCount; bit++) {
                // hash() never returns a negative value, so testBit(bit) is
                // exactly "bit is set in the magnitude".
                if (wordHash.testBit(bit)) {
                    votes[bit] += weight;
                } else {
                    votes[bit] -= weight;
                }
            }
        }
        StringBuilder fingerprint = new StringBuilder(bitCount);
        for (float vote : votes) {
            fingerprint.append(vote >= 0.0f ? '1' : '0');
        }
        return fingerprint.toString();
    }

    /**
     * Hashes a single word.
     *
     * <p>The value is derived from the first character (shifted left by 7 and
     * multiplied by {@link #HASH_MULTIPLIER}), XOR-ed with the sum of all
     * character codes, masked to {@code hashBitCount} bits and finally XOR-ed
     * with the word length. All operands are non-negative, so the result is
     * non-negative as well.
     *
     * @param text word to hash; may be {@code null} or empty
     * @return the hash, or zero for a {@code null} or empty word
     */
    private BigInteger hash(String text) {
        if (text == null || text.isEmpty()) {
            return BigInteger.ZERO;
        }
        long charSum = 0;
        for (int k = 0; k < text.length(); k++) {
            charSum += text.charAt(k);
        }
        // 2^hashBitCount - 1: keeps only the low hashBitCount bits.
        BigInteger mask = BigInteger.ONE.shiftLeft(this.hashBitCount).subtract(BigInteger.ONE);
        return BigInteger.valueOf((long) text.charAt(0) << 7)
                .multiply(HASH_MULTIPLIER)
                .xor(BigInteger.valueOf(charSum))
                .and(mask)
                .xor(BigInteger.valueOf(text.length()));
    }

    /**
     * Counts the positions at which two equal-length strings differ.
     *
     * @param str  first string
     * @param str2 second string
     * @return the number of differing positions, or {@code -1} if the strings
     *         have different lengths
     */
    private int hammingDistance(String str, String str2) {
        int length = str.length();
        if (length != str2.length()) {
            return -1;
        }
        int differing = 0;
        for (int pos = 0; pos < length; pos++) {
            if (str.charAt(pos) != str2.charAt(pos)) {
                differing++;
            }
        }
        return differing;
    }

    /**
     * Small demo: scores a few sample sentence pairs and prints the results.
     * All scores are computed first and printed afterwards so that any log
     * output from scoring does not interleave with the result lines.
     *
     * @param strArr ignored
     */
    public static void main(String[] strArr) throws Exception {
        SimHashPlusHammingDistanceTextSimilarity similarity = new SimHashPlusHammingDistanceTextSimilarity();
        double shoppingVsShopping = similarity.similarScore("我爱购物", "我爱购物");
        double shoppingVsReading = similarity.similarScore("我爱购物", "我爱读书");
        double shoppingVsHacker = similarity.similarScore("我爱购物", "他是黑客");
        double readingVsReading = similarity.similarScore("我爱读书", "我爱读书");
        double readingVsHacker = similarity.similarScore("我爱读书", "他是黑客");
        double hackerVsHacker = similarity.similarScore("他是黑客", "他是黑客");
        System.out.println("我爱购物 和 我爱购物 的相似度分值：" + shoppingVsShopping);
        System.out.println("我爱购物 和 我爱读书 的相似度分值：" + shoppingVsReading);
        System.out.println("我爱购物 和 他是黑客 的相似度分值：" + shoppingVsHacker);
        System.out.println("我爱读书 和 我爱读书 的相似度分值：" + readingVsReading);
        System.out.println("我爱读书 和 他是黑客 的相似度分值：" + readingVsHacker);
        System.out.println("他是黑客 和 他是黑客 的相似度分值：" + hackerVsHacker);
    }
}
