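# After comparing the alternatives, the Jaccard coefficient turned out to be the better fit for measuring the duplication rate of text.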
# NOTE: scipy.spatial.distance.pdist can also compute the Jaccard value
# directly, but it requires both inputs to have the same length, so the
# coefficient is computed by hand below instead.


def Jaccrad(model: str, reference: str, tokenizer=None) -> float:
    """Return the Jaccard similarity of two texts.

    Both texts are split into tokens, the tokens are de-duplicated into
    sets, and the result is ``|A & B| / |A | B|``.

    Args:
        model: The candidate sentence.
        reference: The source sentence the candidate is compared against.
        tokenizer: Optional callable that takes a string and returns an
            iterable of hashable tokens. When omitted, ``jieba.cut`` is used.

    Returns:
        A float in ``[0.0, 1.0]``. Two texts that both produce no tokens are
        treated as identical and score ``1.0`` (instead of dividing by zero).
    """
    if tokenizer is None:
        # Imported lazily so callers that supply their own tokenizer do not
        # need jieba installed.
        import jieba

        tokenizer = jieba.cut  # jieba's default (accurate) mode

    # Sets drop repeated tokens; the coefficient only looks at distinct ones.
    grams_reference = set(tokenizer(reference))
    grams_model = set(tokenizer(model))

    union_size = len(grams_reference | grams_model)
    if union_size == 0:
        # Neither text produced a token: define the similarity as 1.0.
        return 1.0
    return len(grams_reference & grams_model) / union_size


if __name__ == '__main__':
    a = "香农在信息论中提出的信息熵定义为自信息的期望"
    b = "香农在信息论中提出的信息熵定义为自信息的期望"
    jaccard_coefficient = Jaccrad(a, b)
    print(jaccard_coefficient)