simhash是locality sensitive hash(局部敏感哈希)的一种,最早由Moses Charikar在《similarity estimation techniques from rounding algorithms》一文中提出。Google就是基于此算法实现网页文件查重的。
3、提取原始文本中的特征,一般采用各种分词的方式。比如对于"the cat sat on the mat",采用两两分词的方式得到如下结果:{"th", "he", "e ", " c", "ca", "at", "t ", " s", "sa", " o", "on", "n ", " t", " m", "ma"}
4、使用传统的32位hash函数计算各个word的hashcode,比如:"th".hash = -502157718,"he".hash = -369049682,……
a) 然后使用 F' 的高 p_i 位检索,找出 T_i 中高 p_i 位相同的集合
b) 在检索出的集合中比较剩余的 f − p_i 位,找出海明距离小于等于 k 的指纹
#!/usr/bin/python #-*- coding:utf-8 -*- from __future__ import division,unicode_literals import sys import re import hashlib import collections import datetime reload(sys) sys.setdefaultencoding('utf-8') import codecs import itertools lib_newsfp_file = sys.argv[1] #读入库中存储的所有新闻 result_file = sys.argv[2] test_news_fp = {} lib_news_fp = {} bucket = collections.defaultdict(set) offsets = [] def cacu_frequent(list1): frequent = {} for i in list1: if i not in frequent: frequent[i] = 0 frequent[i] += 1 return frequent def load_lib_newsfp_file(): global lib_news_fp fin =,'r','utf-8') for line in fin: lines = line.strip() if len(lines) == 0: continue Arr = lines.split(' ') if len(Arr) < 3: continue lib_news_fp[Arr[0]] = Arr[3] def get_near_dups(check_value): ans = set() for key in get_keys(int(check_value)): dups = bucket[key] for dup in dups: total_value,url = dup.split(',',1) if isSimilar(int(check_value),int(total_value)) == True: ans.add(url) break #与一条重复 退出查找 if ans: break return list(ans) def ini_Index(): global bucket getoffsets() print offsets objs = [(str(url),str(values)) for url,values in lib_news_fp.items()] for i,q in enumerate(objs): addindex(*q) def addindex(url,value): global bucket for key in get_keys(int(value)): v = '%d,%s' % (int(value),url) bucket[key].add(v) def deleteindex(url,value): global bucket for key in get_keys(int(value)): v = '%d,%s' %(int(value),url) if v in bucket[key]: bucket[key].remove(v) def getoffsets(f = 64 , k = 4): global offsets offsets = [f // (k + 1) * i for i in range(k + 1)] def get_keys(value, f = 64): for i, offset in enumerate(offsets): if i == (len(offsets) - 1): m = 2 ** (f - offset) - 1 else: m = 2 ** (offsets[i + 1] - offset) - 1 c = value >> offset & m yield '%x:%x' % (c , i) def bucket_size(): return len(bucket) def isSimilar(value1,value2,n = 4,f = 64): ans = 0 x = (value1 ^ value2) &((1 << f) - 1) while x and (ans <= n): ans += 1 x &= x - 1 if ans <= n: return True return False def load_test_file(): global 
test_news_fp for line in sys.stdin: features = [] result = line.strip().split(' ') url = result[0] content = result[2].split() title = result[1].split() features.extend(content) features.extend(title) total_features = cacu_frequent(features) test_news_fp[url] = build_by_features(total_features) def load_test_newsfp_file(): global test_news_fp for line in sys.stdin: lines = line.strip() if len(lines) == 0: continue Arr = lines.split(' ') if len(Arr) < 3: continue test_news_fp[Arr[0]] = Arr[3] def build_by_features(features,f=64,hashfunc=None): v = [0]*f masks = [1 << i for i in range(f+f)] if hashfunc is None: def _hashfunc(x): return int(hashlib.md5(x).hexdigest(),16) hashfunc = _hashfunc if isinstance(features,dict): total_features = features.items() else: total_features = features for fea in total_features: if isinstance(fea,basestring): h = hashfunc(fea.encode('utf-8')) w = 1 else: h = hashfunc(fea[0].encode('utf-8')) w = fea[1] for i in range(f): v[i] += w if h & masks[i+32] else -w ans = 0 for i in range(f): if v[i] >= 0: ans |= masks[i] return ans sum = 0 def process(): global test_news_fp global sum fout =,'w','utf-8') load_lib_newsfp_file() # load_test_file() ini_Index() check_features = test_news_fp.items() lib_features = lib_news_fp.items() i = 0 for check_fp in check_features: # print i ans = [] ans = get_near_dups(check_fp[1]) if ans: for url in ans: output_str = str(check_fp[0])+' '+str(url) fout.write(output_str+' ') #break #print check_fp[0],'is duplicate' sum = sum + 1 #del test_news_fp[check_fp[0]] print i i += 1 fout.close() if __name__ == '__main__': # process() begin = load_test_newsfp_file() # load_test_file() # getoffsets() # print offsets # load_lib_newsfp_file() process() end = print '耗时:',end - begin,' 重复新闻数:',sum,' 准确率: ', sum/2589
Jaccard index是用来计算相似性,也就是距离的一种度量标准。假如有集合A、B,那么 $J(A,B) = \dfrac{|A \cap B|}{|A \cup B|}$,也就是说,集合A,B的Jaccard系数等于A,B中共同拥有的元素数与A,B总共拥有的元素数的比例。很显然,Jaccard系数值区间为[0,1]。
那么对集合A、B,hmin(A) = hmin(B)成立的条件是A ∪ B 中具有最小哈希值的元素也在 A ∩ B 中。这里假设h(x)是一个良好的哈希函数,具有很好的均匀性,能够把不同的元素映射成不同的整数。
所以有,Pr[hmin(A) = hmin(B)] = J(A,B),即集合A和B的相似度为集合A、B经过hash后最小哈希值相等的概率。
1 #!/usr/bin/python 2 #-* coding:utf-8 -*- 3 4 4 import sys 5 5 import re 6 6 import hashlib 7 7 import collections 8 8 import datetime 9 9 import codecs 10 10 11 11 reload(sys) 12 12 sys.setdefaultencoding('utf-8') 13 13 14 14 import threading 15 15 from Queue import Queue 16 16 queue = Queue() 17 17 thread_flag_list = [0,0,0,0,0] 18 18 19 19 res_file = sys.argv[1] 20 20 21 21 news_list = [] 22 22 def load(): 23 23 global news_list 24 24 for line in sys.stdin: 25 25 line = line.strip() 26 26 if len(line) == 0: 27 27 continue 28 28 Arr = line.split(' ') 29 29 30 30 if len(Arr) < 3: 31 31 continue 32 32 33 33 url = Arr[0] 34 34 title = Arr[1] 35 35 content = Arr[2] 36 36 37 37 term_list = content.split(' ') 38 38 term_set = set(term_list) 39 39 news_list.append([url,term_set]) 40 40 41 41 42 42 def calculate(news_f,news_s): 43 43 set1 = news_f[1] 44 44 set2 = news_s[1] 45 45 46 46 set_join = set1 & set2 47 47 set_union = set1 | set2 48 48 49 49 simi_value = float(len(set_join))/float(len(set_union)) 50 50 return simi_value 51 51 52 52 def run_thread(start_id,thread_id): 53 53 global queue 54 54 global thread_flag_list 55 55 news_first = news_list[start_id] 56 56 for i in range(start_id+1,len(news_list)): 57 57 news_second = news_list[i] 58 58 simi_value = calculate(news_first,news_second) 59 59 if simi_value > 0.8: 60 60 url1 = news_first[0] 61 61 url2 = news_second[0] 62 62 output_str = url1+' '+url2+' '+str(simi_value) 63 63 queue.put(output_str) 64 64 thread_flag_list[thread_id] = 0#标记线程结束 65 65 66 66 def process(): 67 67 global queue 68 68 global thread_flag_list 69 69 fout =,'w','utf-8') 70 70 id_max = len(news_list) 71 71 id_now = 0 72 72 while True: 73 73 run_flag = False 74 74 thread_list = [] 75 75 for i in range(0,len(thread_flag_list)): 76 76 if thread_flag_list[i] == 0: 77 77 if id_now == id_max: 78 continue 79 79 thread_flag_list[i] = 1 80 80 print 'now run is:',id_now 81 81 82 82 thread = threading.Thread(target=run_thread,args=(id_now,i)) 83 83 
thread_list.append(thread) 84 84 85 85 id_now = id_now + 1 86 86 else: 87 87 run_flag = True 88 88 89 89 for thread in thread_list: 90 90 thread.setDaemon(True) 91 91 thread.start() 92 92 93 93 while not queue.empty(): 94 94 elem = queue.get() 95 95 print elem 96 96 fout.write(elem+' ') 97 97 98 98 if run_flag != True and id_now == id_max: 99 99 break 100 100 101 101 fout.close() 102 102 103 103 if __name__ == '__main__': 104 104 load() 105 105 print 'load done' 106 106 process() 107 107