• [爬虫学习笔记]基于 SimHash 的去重复处理模块ContentSeen的构建




            SimHash 算法由 Moses Charikar 提出;Google 的 Manku 等人在论文“Detecting Near-Duplicates for Web Crawling”中将其用于解决亿万级别网页的去重任务。

            SimHash作为locality sensitive hash(局部敏感哈希)的一种:

            其主要思想是降维,将高维的特征向量映射成低维的特征向量,通过两个向量的Hamming Distance来确定文章是否重复或者高度近似。

            其中,Hamming Distance,又称汉明距离。在信息论中,两个等长字符串之间的汉明距离是两个字符串对应位置上不同字符的个数,也就是说,它就是将一个字符串变换成另外一个字符串所需要替换的字符个数。例如:1011101 与 1001001 之间的汉明距离是 2。至于我们常说的字符串编辑距离(还允许插入和删除字符),则可以看作汉明距离的一般形式。





    using System;
    using System.Collections.Generic;
    using System.Linq;
    namespace Crawler.Common
    {
        /// <summary>
        /// Computes 32-bit SimHash fingerprints of strings and compares them by
        /// Hamming distance.
        /// </summary>
        public class SimHashAnalyser
        {
            /// <summary>Number of bits in a fingerprint.</summary>
            private const int HashSize = 32;

            /// <summary>
            /// Fingerprints both strings with the chosen tokeniser and returns their likeness.
            /// </summary>
            /// <param name="needle">First text to compare.</param>
            /// <param name="haystack">Second text to compare.</param>
            /// <param name="type">Tokeniser used to split both texts before hashing.</param>
            /// <returns>A value from 0 (all 32 bits differ) to 1 (identical fingerprints).</returns>
            /// <exception cref="ArgumentNullException">Either text is null.</exception>
            public static float GetLikenessValue(string needle, string haystack, TokeniserType type = TokeniserType.Overlapping)
            {
                var needleSimHash = GetSimHash(needle, type);
                var hayStackSimHash = GetSimHash(haystack, type);
                return GetLikenessValue(needleSimHash, hayStackSimHash);
            }

            /// <summary>
            /// Converts the Hamming distance between two fingerprints into a likeness ratio.
            /// </summary>
            /// <param name="needleSimHash">First fingerprint.</param>
            /// <param name="hayStackSimHash">Second fingerprint.</param>
            /// <returns>(HashSize - distance) / HashSize, i.e. a value from 0 to 1.</returns>
            public static float GetLikenessValue(int needleSimHash, int hayStackSimHash)
            {
                return (HashSize - GetHammingDistance(needleSimHash, hayStackSimHash)) / (float)HashSize;
            }

            /// <summary>Hashes every token to a 32-bit value, eagerly.</summary>
            private static IEnumerable<int> DoHashTokens(IEnumerable<string> tokens)
            {
                return tokens.Select(token => StableHash(token)).ToList();
            }

            /// <summary>
            /// 32-bit FNV-1a hash over the UTF-16 code units of <paramref name="token"/>
            /// (low byte first).
            /// </summary>
            /// <remarks>
            /// Used instead of string.GetHashCode(), whose result is not guaranteed to be the
            /// same across processes, runtime versions or platforms; fingerprints built on it
            /// could not be compared between crawler runs. Fingerprint values therefore differ
            /// from those of a GetHashCode-based implementation.
            /// </remarks>
            private static int StableHash(string token)
            {
                unchecked
                {
                    uint hash = 2166136261; // FNV-1a 32-bit offset basis
                    foreach (var c in token)
                    {
                        hash = (hash ^ (byte)(c & 0xFF)) * 16777619; // 16777619 = FNV 32-bit prime
                        hash = (hash ^ (byte)(c >> 8)) * 16777619;
                    }

                    return (int)hash;
                }
            }

            /// <summary>Counts the bit positions in which the two values differ.</summary>
            private static int GetHammingDistance(int firstValue, int secondValue)
            {
                // Reinterpret as unsigned so that clearing the lowest set bit never underflows.
                var differing = unchecked((uint)(firstValue ^ secondValue));
                var distance = 0;
                while (differing != 0)
                {
                    differing &= differing - 1; // clears the lowest set bit
                    distance++;
                }

                return distance;
            }

            /// <summary>Returns true when bit <paramref name="pos"/> (0 = least significant) of <paramref name="b"/> is set.</summary>
            private static bool IsBitSet(int b, int pos)
            {
                return (b & (1 << pos)) != 0;
            }

            /// <summary>
            /// Computes the fingerprint of <paramref name="input"/> using the overlapping tokeniser.
            /// </summary>
            /// <param name="input">Text to fingerprint.</param>
            /// <returns>The 32-bit SimHash fingerprint.</returns>
            /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
            public static int GetSimHash(string input)
            {
                return GetSimHash(input, TokeniserType.Overlapping);
            }

            /// <summary>
            /// Computes the fingerprint of <paramref name="input"/>: the text is tokenised, every
            /// token is hashed, and each fingerprint bit is set when more token hashes have that
            /// bit set than clear.
            /// </summary>
            /// <param name="input">Text to fingerprint.</param>
            /// <param name="tokeniserType">
            /// <see cref="TokeniserType.Overlapping"/> selects <see cref="OverlappingStringTokeniser"/>;
            /// any other value selects <see cref="FixedSizeStringTokeniser"/>. Both are created with
            /// their default sizes.
            /// </param>
            /// <returns>The 32-bit SimHash fingerprint; 0 when the input yields no tokens.</returns>
            /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
            public static int GetSimHash(string input, TokeniserType tokeniserType)
            {
                ITokeniser tokeniser;
                if (tokeniserType == TokeniserType.Overlapping)
                {
                    tokeniser = new OverlappingStringTokeniser();
                }
                else
                {
                    tokeniser = new FixedSizeStringTokeniser();
                }

                var hashedTokens = DoHashTokens(tokeniser.Tokenise(input));

                // One signed counter per bit; a new int[] is already zero-filled.
                var vector = new int[HashSize];
                foreach (var value in hashedTokens)
                {
                    for (var j = 0; j < HashSize; j++)
                    {
                        if (IsBitSet(value, j))
                        {
                            vector[j] += 1;
                        }
                        else
                        {
                            vector[j] -= 1;
                        }
                    }
                }

                var fingerprint = 0;
                for (var i = 0; i < HashSize; i++)
                {
                    if (vector[i] > 0)
                    {
                        // OR rather than += so that bit 31 cannot overflow in a checked build.
                        fingerprint |= 1 << i;
                    }
                }

                return fingerprint;
            }
        }

        /// <summary>Splits a string into the tokens that get hashed for a fingerprint.</summary>
        public interface ITokeniser
        {
            /// <summary>Returns the tokens of <paramref name="input"/> in order of appearance.</summary>
            IEnumerable<string> Tokenise(string input);
        }

        /// <summary>
        /// Cuts the input into consecutive, non-overlapping chunks of a fixed length;
        /// the last chunk may be shorter.
        /// </summary>
        public class FixedSizeStringTokeniser : ITokeniser
        {
            private readonly ushort _tokensize;

            /// <summary>Creates a tokeniser producing chunks of <paramref name="tokenSize"/> characters.</summary>
            /// <param name="tokenSize">Chunk length; must be between 2 and 127 inclusive.</param>
            /// <exception cref="ArgumentException"><paramref name="tokenSize"/> is outside 2..127.</exception>
            public FixedSizeStringTokeniser(ushort tokenSize = 5)
            {
                if (tokenSize < 2 || tokenSize > 127)
                {
                    throw new ArgumentException("Token 不能超出范围");
                }

                _tokensize = tokenSize;
            }

            /// <summary>Returns the consecutive chunks of <paramref name="input"/>.</summary>
            /// <param name="input">Text to split; an empty string yields no chunks.</param>
            /// <returns>The chunks in order; only the last one can be shorter than the chunk length.</returns>
            /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
            public IEnumerable<string> Tokenise(string input)
            {
                if (input == null)
                {
                    throw new ArgumentNullException(nameof(input));
                }

                var chunks = new List<string>();
                for (var offset = 0; offset < input.Length; offset += _tokensize)
                {
                    // Substring instead of Skip/Take: the LINQ form rescans the prefix on every chunk.
                    var length = Math.Min(_tokensize, input.Length - offset);
                    chunks.Add(input.Substring(offset, length));
                }

                return chunks;
            }
        }

        /// <summary>
        /// Produces fixed-length chunks where each chunk shares its first
        /// <c>overlapSize</c> characters with the end of the previous one.
        /// </summary>
        public class OverlappingStringTokeniser : ITokeniser
        {
            private readonly ushort _chunkSize;
            private readonly ushort _overlapSize;

            /// <summary>Creates a tokeniser with the given chunk length and overlap.</summary>
            /// <param name="chunkSize">Length of every chunk.</param>
            /// <param name="overlapSize">Characters shared by neighbouring chunks; must be smaller than <paramref name="chunkSize"/>.</param>
            /// <exception cref="ArgumentException"><paramref name="chunkSize"/> is not greater than <paramref name="overlapSize"/>.</exception>
            public OverlappingStringTokeniser(ushort chunkSize = 4, ushort overlapSize = 3)
            {
                if (chunkSize <= overlapSize)
                {
                    throw new ArgumentException("Chunck 必须大于 overlap");
                }

                _overlapSize = overlapSize;
                _chunkSize = chunkSize;
            }

            /// <summary>Returns the overlapping chunks of <paramref name="input"/>.</summary>
            /// <param name="input">Text to split.</param>
            /// <returns>
            /// No tokens for an empty string; the whole input as a single token when it is shorter
            /// than the chunk length; otherwise every full-length chunk starting at multiples of
            /// (chunkSize - overlapSize), including the one that ends exactly at the end of the input.
            /// </returns>
            /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
            public IEnumerable<string> Tokenise(string input)
            {
                if (input == null)
                {
                    throw new ArgumentNullException(nameof(input));
                }

                var result = new List<string>();
                if (input.Length == 0)
                {
                    return result;
                }

                if (input.Length < _chunkSize)
                {
                    // Without this, all short inputs would share the empty-token fingerprint 0.
                    result.Add(input);
                    return result;
                }

                var step = _chunkSize - _overlapSize;
                var lastStart = input.Length - _chunkSize;

                // "<=" so that the chunk ending at the last character is not dropped.
                for (var position = 0; position <= lastStart; position += step)
                {
                    result.Add(input.Substring(position, _chunkSize));
                }

                return result;
            }
        }

        /// <summary>Selects the tokeniser used when computing a fingerprint.</summary>
        public enum TokeniserType
        {
            /// <summary>Use <see cref="OverlappingStringTokeniser"/>.</summary>
            Overlapping,

            // NOTE(review): the member list was missing from the source text; "FixedSize" is a
            // reconstructed name for the non-overlapping option - confirm against the original.
            /// <summary>Use <see cref="FixedSizeStringTokeniser"/>.</summary>
            FixedSize
        }
    }


    // Compare two nearly identical sentences and print their likeness as a percentage.
    const string first = "the cat sat on the mat.";
    const string second = "the cat sat on a mat.";
    float similarity = SimHashAnalyser.GetLikenessValue(first, second);
    Console.WriteLine("相似度: {0}%", similarity * 100);


    相似度: 78.125%
    using Crawler.Common;
    namespace Crawler.Processing
    {
        /// <summary>
        /// Every fetched page first goes through the Content Seen module, which decides whether
        /// the page's content matches that of a page that has already been downloaded; if it
        /// does, the page is not sent on for further processing.
        /// This class supplies the two primitives for that decision: a fingerprint of the page
        /// and a similarity score between two fingerprints.
        /// </summary>
        public class ContentSeen
        {
            /// <summary>
            /// Returns the SimHash fingerprint of <paramref name="html"/>, as computed by
            /// <see cref="SimHashAnalyser.GetSimHash(string)"/>.
            /// </summary>
            /// <param name="html">Page content to fingerprint.</param>
            public static int GetFingerPrint(string html)
            {
                return SimHashAnalyser.GetSimHash(html);
            }

            /// <summary>
            /// Returns the likeness of two fingerprints, as computed by
            /// <see cref="SimHashAnalyser.GetLikenessValue(int, int)"/>.
            /// </summary>
            /// <param name="print1">First fingerprint.</param>
            /// <param name="print2">Second fingerprint.</param>
            public static float Similarity(int print1, int print2)
            {
                return SimHashAnalyser.GetLikenessValue(print1, print2);
            }
        }
    }
  • 相关阅读:
    Atitit 索引技术--位图索引
    Atitit View事件分发机制
    Atitit 代码复用的理解attilax总结
    Atitit 深入理解软件的本质 attilax总结 软件三原则"三次原则"是DRY原则和YAGNI原则的折
    Atitit事件代理机制原理 基于css class的事件代理
    Atitit  图像处理Depixelizing Pixel Art像素风格画的矢量化
    Atitit Mysql查询优化器 存取类型 范围存取类型 索引存取类型 AND or的分析
    Atitit常见的标准化组织与规范数量jcp ecma iso
    Atitit sql计划任务与查询优化器--统计信息模块
    Atitti 存储引擎支持的国内点与特性attilax总结
  • 原文地址:https://www.cnblogs.com/WayneShao/p/5869609.html
Copyright © 2020-2023  润新知