• c#-SimHash匹配相似-算法


    使用场景:Google 的 simhash 算法

     //经过大量测试,simhash 适合比较较长的文本:例如 500 字以上时效果都不错,海明距离(Hamming distance)小于 3 的文本基本都相似,误判率也比较低。
    
     //根据我的经验,如果假定 N 是每个块的大小,M 是相邻块之间重叠的字符数,那么 N = 4、M = 3 是最好的选择(即代码中的 OverlappingStringTokeniser(4, 3))。
    

      

        /// <summary>
        /// Scores how alike two strings are by reducing each one to a 32-bit
        /// SimHash fingerprint and comparing the fingerprints bit by bit.
        /// </summary>
        /// <remarks>
        /// NOTE(review): token hashes come from <c>string.GetHashCode()</c>, which is
        /// not guaranteed to be stable across processes or runtime versions, so
        /// fingerprints should presumably not be persisted - confirm before storing them.
        /// </remarks>
        public class SimHashAnalyser : IAnalyser
        {
            // Width of a fingerprint in bits; matches the width of an int token hash.
            private const int HashSize = 32;

            /// <summary>
            /// Compares the SimHash fingerprints of the two inputs.
            /// </summary>
            /// <param name="needle">First text to compare.</param>
            /// <param name="haystack">Second text to compare.</param>
            /// <returns>
            /// The fraction of fingerprint bits that agree: 1.0 when the fingerprints
            /// are identical, 0.0 when every bit differs.
            /// </returns>
            public float GetLikenessValue(string needle, string haystack)
            {
                var differingBits = GetHammingDistance(
                    this.DoCalculateSimHash(needle),
                    this.DoCalculateSimHash(haystack));

                return (HashSize - differingBits) / (float)HashSize;
            }

            // Maps every token to its int hash code, preserving order.
            private static IEnumerable<int> DoHashTokens(IEnumerable<string> tokens)
            {
                foreach (var token in tokens)
                {
                    yield return token.GetHashCode();
                }
            }

            // Counts the bit positions at which the two values differ.
            private static int GetHammingDistance(int firstValue, int secondValue)
            {
                // Work on the unsigned bit pattern so "remaining - 1" can never underflow.
                var remaining = unchecked((uint)(firstValue ^ secondValue));
                var count = 0;
                while (remaining != 0)
                {
                    // Clears the lowest set bit; one iteration per differing bit.
                    remaining &= remaining - 1;
                    count++;
                }
                return count;
            }

            // True when bit number pos (0 = least significant) of b is 1.
            private static bool IsBitSet(int b, int pos)
            {
                return ((b >> pos) & 1) == 1;
            }

            // Builds the fingerprint: every token hash votes +1/-1 on each bit
            // position, and a bit ends up set when its vote total is positive.
            private int DoCalculateSimHash(string input)
            {
                // 4-character chunks, each overlapping the previous one by 3 characters.
                ITokeniser tokeniser = new OverlappingStringTokeniser(4, 3);

                // A freshly allocated int array is already zero-filled.
                var votes = new int[HashSize];

                foreach (var hash in DoHashTokens(tokeniser.Tokenise(input)))
                {
                    for (var bit = 0; bit < HashSize; bit++)
                    {
                        votes[bit] += IsBitSet(hash, bit) ? 1 : -1;
                    }
                }

                var fingerprint = 0;
                for (var bit = 0; bit < HashSize; bit++)
                {
                    if (votes[bit] > 0)
                    {
                        fingerprint |= 1 << bit;
                    }
                }
                return fingerprint;
            }
        }
    
    
    
        /// <summary>
        /// Contract for components that compare two strings and report how alike they are.
        /// </summary>
        public interface IAnalyser
        {
            /// <summary>
            /// Computes a likeness score for the two given strings.
            /// </summary>
            /// <param name="needle">First text to compare.</param>
            /// <param name="haystack">Second text to compare.</param>
            /// <returns>
            /// The likeness score. The SimHashAnalyser implementation in this file
            /// returns a fraction between 0 and 1, where 1 means identical fingerprints.
            /// </returns>
            float GetLikenessValue(string needle, string haystack);
        }
    
        /// <summary>
        /// Contract for components that break a string into a sequence of string tokens.
        /// </summary>
        public interface ITokeniser
        {
            /// <summary>
            /// Splits <paramref name="input"/> into tokens.
            /// </summary>
            /// <param name="input">The text to split.</param>
            /// <returns>The tokens produced from the input.</returns>
            IEnumerable<string> Tokenise(string input);
        }
    
        /// <summary>
        /// Splits a string into consecutive, non-overlapping chunks of a fixed length.
        /// The final chunk is shorter when the input length is not a multiple of the
        /// token size.
        /// </summary>
        public class FixedSizeStringTokeniser : ITokeniser
        {
            // Chunk length in characters; always assigned by the constructor.
            private readonly ushort tokensize;

            /// <summary>
            /// Creates a tokeniser that produces chunks of <paramref name="tokenSize"/> characters.
            /// </summary>
            /// <param name="tokenSize">Chunk length; must be between 2 and 127 inclusive.</param>
            /// <exception cref="ArgumentException">
            /// Thrown when <paramref name="tokenSize"/> is outside the range 2..127.
            /// </exception>
            public FixedSizeStringTokeniser(ushort tokenSize)
            {
                if (tokenSize < 2 || tokenSize > 127)
                {
                    throw new ArgumentException("Token 不能超出范围");
                }
                this.tokensize = tokenSize;
            }

            /// <summary>
            /// Cuts <paramref name="input"/> into chunks of the configured size, in order.
            /// </summary>
            /// <param name="input">The text to split; must not be null.</param>
            /// <returns>
            /// The chunks in input order; an empty list when the input is empty.
            /// </returns>
            /// <exception cref="ArgumentNullException">
            /// Thrown when <paramref name="input"/> is null.
            /// </exception>
            public IEnumerable<string> Tokenise(string input)
            {
                if (input == null)
                {
                    throw new ArgumentNullException("input");
                }

                var chunks = new List<string>();
                for (int offset = 0; offset < input.Length; offset += this.tokensize)
                {
                    // Substring copies only the chunk itself. The previous
                    // Skip(offset).Take(n) form re-walked the string from the start
                    // for every chunk, making tokenisation quadratic.
                    int length = Math.Min(this.tokensize, input.Length - offset);
                    chunks.Add(input.Substring(offset, length));
                }
                return chunks;
            }
        }
    
    
        /// <summary>
        /// Splits a string into fixed-length chunks where each chunk shares its
        /// leading characters with the tail of the previous chunk.
        /// </summary>
        public class OverlappingStringTokeniser : ITokeniser
        {
            // Length of every chunk, in characters; always assigned by the constructor.
            private readonly ushort chunkSize;

            // Number of characters each chunk shares with the previous chunk.
            private readonly ushort overlapSize;

            /// <summary>
            /// Creates a tokeniser producing chunks of <paramref name="chunkSize"/>
            /// characters that overlap by <paramref name="overlapSize"/> characters.
            /// </summary>
            /// <param name="chunkSize">Chunk length; must be greater than <paramref name="overlapSize"/>.</param>
            /// <param name="overlapSize">Characters shared between neighbouring chunks.</param>
            /// <exception cref="ArgumentException">
            /// Thrown when <paramref name="chunkSize"/> is not greater than
            /// <paramref name="overlapSize"/> (the window would never advance).
            /// </exception>
            public OverlappingStringTokeniser(ushort chunkSize, ushort overlapSize)
            {
                if (chunkSize <= overlapSize)
                {
                    throw new ArgumentException("Chunck 必须大于 overlap");
                }
                this.overlapSize = overlapSize;
                this.chunkSize = chunkSize;
            }

            /// <summary>
            /// Slides a window of the configured chunk size across
            /// <paramref name="input"/>, advancing by (chunk size - overlap size)
            /// characters each step, and returns every full-length window.
            /// </summary>
            /// <param name="input">The text to split; must not be null.</param>
            /// <returns>
            /// The full-length chunks in input order; an empty list when the input is
            /// shorter than one chunk.
            /// </returns>
            /// <exception cref="ArgumentNullException">
            /// Thrown when <paramref name="input"/> is null.
            /// </exception>
            public IEnumerable<string> Tokenise(string input)
            {
                if (input == null)
                {
                    throw new ArgumentNullException("input");
                }

                var result = new List<string>();

                // The constructor guarantees chunkSize > overlapSize, so step >= 1.
                int step = this.chunkSize - this.overlapSize;

                // Last index at which a full chunk still fits. The bound is inclusive:
                // using "<" here dropped the final chunk (e.g. "abcd" with chunk size 4
                // produced no tokens at all).
                int lastStart = input.Length - this.chunkSize;

                for (int position = 0; position <= lastStart; position += step)
                {
                    result.Add(input.Substring(position, this.chunkSize));
                }
                return result;
            }
        }
    

      

    使用:

        // Placeholder sample texts to compare; substitute real content.
        const string SampleNeedle = "中国香港 2013………………";
        const string SampleHayStack = "中国香港………………";

        // Score the pair and scale the result for display as a percentage.
        float score = new SimHashAnalyser().GetLikenessValue(SampleNeedle, SampleHayStack);
        float percentage = score * 100;

        Console.Clear();
        Console.WriteLine("Likeness: {0}%", percentage);

        // Keep the console window open until a key is pressed.
        Console.ReadKey();
    

      

     SimHash for c#

  • 相关阅读:
    sql 查询重复数据 删除重复数据
    echarts 仪表板指针点击事件
    Java调用webservice 天气预报
    性能优化高手 一站通关从设计到交付的性能问题
    element-ui 添加空白表格
    Linux文件管理
    Linux第五周
    Linux第四周
    Linux第三周
    Linux第二周
  • 原文地址:https://www.cnblogs.com/zengxiangzhan/p/3311114.html
Copyright © 2020-2023  润新知