• 数据结构1:Hashtable原理


    面试场景1

    面试官:你了解过数据结构吗?

    我:平时会用到的有数组、List、Hashtable、Dictionary

    面试官:你知道他们的原理吗?

    我:心里默默地说一句“会用不就行了吗...怎么问题那么多”,嘴上却说“我不太清楚,但是我知道 List、Hashtable、Dictionary 线程不安全”,一顿瞎吹...

    面试官:给你台机器,实现一个线程安全的MyList

    我:OMG...

    以上面试场景,是我曾经真实遇到过的,非常遗憾和尴尬,我那次面试也基本就到此为止了,结果不出意外的就没有下文了。

    下面是我总结的数据结构方面的一点内容,欢迎大家指正(point out mistakes)。

    一.Hashtable

    // Usage sample: create a Hashtable with the parameterless constructor.
    Hashtable h = new Hashtable();
    
    // Add inserts a new key/value pair; per the Add/Insert source quoted below,
    // it throws an ArgumentException when the key is already present.
    h.Add("a", "a");
    h.Add("b", "a");
    // Indexer read: looks the key up (see the indexer getter quoted below).
    var a=h["b"];
    // Indexer write: goes through Insert(key, value, false), so an existing
    // key is overwritten instead of raising an exception.
    h["b"] = "c";

      // Other lookup/removal members; their implementations are not part of
      // this excerpt.
      h.Contains("a");
      h.Remove("a");
      h.ContainsKey("b");

    构造函数:

    // Creates an empty hashtable by chaining to the (capacity, loadFactor)
    // constructor with an initial capacity of zero and a load factor of 1.0.
    public Hashtable()
        : this(0, 1.0f)
    {
    }
     
     
    // Creates a hashtable sized for roughly 'capacity' entries.
    //
    // capacity   - expected number of entries; must be non-negative. Supplying
    //              a good estimate avoids resize operations while adding.
    // loadFactor - maximum ratio of entries to buckets, in [0.1, 1.0]. Smaller
    //              values trade memory for faster average lookups.
    //
    // Throws ArgumentOutOfRangeException for an invalid capacity or load
    // factor, and ArgumentException when the required bucket count would
    // exceed Int32.MaxValue.
    public Hashtable(int capacity, float loadFactor) {
        if (capacity < 0) {
            throw new ArgumentOutOfRangeException("capacity", Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
        }
        // Written as a negated range test on purpose: a NaN load factor fails
        // both comparisons and is therefore rejected as well.
        if (!(loadFactor >= 0.1f && loadFactor <= 1.0f)) {
            throw new ArgumentOutOfRangeException("loadFactor", Environment.GetResourceString("ArgumentOutOfRange_HashtableLoadFactor", .1, 1.0));
        }
        Contract.EndContractBlock();

        // The caller's factor is scaled by 0.72, which perf work identified as
        // the optimal load factor for this table.
        this.loadFactor = 0.72f * loadFactor;

        // Number of buckets needed to hold 'capacity' entries at that factor.
        double requestedBuckets = capacity / this.loadFactor;
        if (requestedBuckets > Int32.MaxValue) {
            throw new ArgumentException(Environment.GetResourceString("Arg_HTCapacityOverflow"));
        }

        // Never go below InitialSize; otherwise round up to a prime.
        int bucketCount;
        if (requestedBuckets > InitialSize) {
            bucketCount = HashHelpers.GetPrime((int)requestedBuckets);
        }
        else {
            bucketCount = InitialSize;
        }

        // 'buckets' is the slot array that stores the key/value entries.
        buckets = new bucket[bucketCount];

        // Entry count at which Insert triggers an expansion.
        loadsize = (int)(this.loadFactor * bucketCount);
        isWriterInProgress = false;

        // With the current algorithm the load limit is always below the
        // bucket count.
        Contract.Assert( loadsize < bucketCount, "Invalid hashtable loadsize!");
    }

    // The hash table data.
    // This cannot be serialised
    //
    // One slot of the table: holds a key/value pair together with the key's
    // cached hash code.
    private struct bucket {
        // The stored key. In the code below, a null key marks a slot that is
        // empty, and a key equal to the 'buckets' array reference is treated as
        // an available slot that once contained an entry.
        public Object key;
        // The value associated with 'key'.
        public Object val;
        public int hash_coll;   // Store hash code; sign bit means there was a collision.
    }
    
    // The slot array backing the hashtable; replaced wholesale by rehash().
    private bucket[] buckets;  
    // Adds a new key/value pair to this hashtable by forwarding to Insert in
    // "add" mode. Insert throws ArgumentNullException when the key is null and
    // ArgumentException when the key is already present.
    public virtual void Add(Object key, Object value)
    {
        Insert(key, value, add: true);
    }
    
    // Inserts an entry into this hashtable. This method is called from the Set
    // and Add methods.
    //
    // key    - the key to store; must not be null (ArgumentNullException).
    // nvalue - the value to associate with the key.
    // add    - true for Add semantics: an existing key raises
    //          ArgumentException. false for indexer-set semantics: an existing
    //          key has its value overwritten.
    //
    // Throws InvalidOperationException if no slot could be found, which should
    // never happen while the load factor and the probe function are sane.
    [ReliabilityContract(Consistency.WillNotCorruptState, Cer.MayFail)]
    private void Insert (Object key, Object nvalue, bool add) {
    
        if (key == null) {
            throw new ArgumentNullException("key", Environment.GetResourceString("ArgumentNull_Key"));
        }
        Contract.EndContractBlock();
        if (count >= loadsize) {
            // The entry count reached the load limit: grow the bucket array.
            expand();
        }
        else if(occupancy > loadsize && count > 100) {
            // Too many slots carry the collision bit. The parameterless
            // rehash() overload is not part of this excerpt; presumably it
            // rebuilds the table at its current size - TODO confirm.
            rehash();
        }
        
        uint seed;  // h1(key): the key's non-negative hash code
        uint incr;  // h2(key): probe step added to the slot index after each collision
        // Assume we only have one thread writing concurrently.  Modify
        // buckets to contain new data, as long as we insert in the right order.
        uint hashcode = InitHash(key, buckets.Length, out seed, out incr);
        int  ntry = 0;
        int emptySlotNumber = -1; // We use the empty slot number to cache the first empty slot. We chose to reuse slots
        // create by remove that have the collision bit set over using up new slots.
        int bucketNumber = (int) (seed % (uint)buckets.Length);  // first slot index probed for this key
        do {
    
            // Set emptySlot number to current bucket if it is the first available bucket that we have seen
            // that once contained an entry and also has had a collision.
            // We need to search this entire collision chain because we have to ensure that there are no 
            // duplicate entries in the table.
            // (hash_coll < 0 is the same as testing the collision/sign bit.)
            if (emptySlotNumber == -1 && (buckets[bucketNumber].key == buckets) && (buckets[bucketNumber].hash_coll < 0))
                emptySlotNumber = bucketNumber;
    
            // Insert the key/value pair into this bucket if this bucket is empty and has never contained an entry
            // OR
            // This bucket once contained an entry but there has never been a collision
            //
            // A null key means the slot is free and can take the new entry. Otherwise the slot is
            // occupied: it is either an update of the same key (handled below) or a collision, in
            // which case probing continues at index + incr.
            if ((buckets[bucketNumber].key == null) ||
                (buckets[bucketNumber].key == buckets && ((buckets[bucketNumber].hash_coll & unchecked(0x80000000))==0))) {
    
                // If we have found an available bucket that has never had a collision, but we've seen an available
                // bucket in the past that has the collision bit set, use the previous bucket instead
                if (emptySlotNumber != -1) // Reuse slot
                    bucketNumber = emptySlotNumber;
    
                isWriterInProgress = true;                    
                buckets[bucketNumber].val = nvalue;   // store the value in the chosen slot
                buckets[bucketNumber].key  = key;
                // hashcode has its sign bit cleared by InitHash, so OR-ing it in
                // keeps any collision bit already recorded on this slot.
                buckets[bucketNumber].hash_coll |= (int) hashcode;
                count++;
                UpdateVersion();
                isWriterInProgress = false;   
    
                return;
            }
    
            // The current bucket is in use
            // OR
            // it is available, but has had the collision bit set and we have already found an available bucket
            //
            // hash_coll is negative once the collision bit is set, so the bit is masked off before
            // comparing hash codes; the keys themselves are compared only when the hashes match.
            if (((buckets[bucketNumber].hash_coll & 0x7FFFFFFF) == hashcode) &&
                KeyEquals (buckets[bucketNumber].key, key)) {
                if (add) {
                    // Add semantics: the key already exists, so this is an error.
                    throw new ArgumentException(Environment.GetResourceString("Argument_AddingDuplicate__", buckets[bucketNumber].key, key));
                }
          
                // Set semantics: overwrite the value of the existing key.
                isWriterInProgress = true;                    
                buckets[bucketNumber].val = nvalue;
                UpdateVersion();                    
                isWriterInProgress = false; 
                
                return;
            }
    
            // The current bucket is full, and we have therefore collided.  We need to set the collision bit
            // UNLESS
            // we have remembered an available slot previously.
            if (emptySlotNumber == -1) {// We don't need to set the collision bit here since we already have an empty slot
                if( buckets[bucketNumber].hash_coll >= 0 ) {
                    buckets[bucketNumber].hash_coll |= unchecked((int)0x80000000);
                    occupancy++;
                }
            }
    
            // This slot belongs to a different key: advance by incr, wrapping at the table length.
            bucketNumber = (int) (((long)bucketNumber + incr)% (uint)buckets.Length);
        } while (++ntry < buckets.Length);
    
        // This code is here if and only if there were no buckets without a collision bit set in the entire table
        if (emptySlotNumber != -1)
        {
            // We pretty much have to insert in this order.  Don't set hash
            // code until the value & key are set appropriately.
           
            isWriterInProgress = true;                    
            buckets[emptySlotNumber].val = nvalue;
            buckets[emptySlotNumber].key  = key;
            buckets[emptySlotNumber].hash_coll |= (int) hashcode;
            count++;
            UpdateVersion();                
            isWriterInProgress = false;     
    
            return;
        }
    
        // If you see this assert, make sure load factor & count are reasonable.
        // Then verify that our double hash function (h2, described at top of file)
        // meets the requirements described above. You should never see this assert.
        Contract.Assert(false, "hash table insert failed!  Load factor too high, or our double hashing function is incorrect.");
        throw new InvalidOperationException(Environment.GetResourceString("InvalidOperation_HashInsertFailed"));
    }
    
    // InitHash implements classic double hashing
    // (see http://en.wikipedia.org/wiki/Double_hashing):
    //
    //     H(key, i) = h1(key) + i * h2(key, hashSize)
    //
    // Correctness only requires that the probe increment
    //   a. is non-zero, and
    //   b. is relatively prime to the table size, so that every slot is probed
    //      before the sequence wraps onto slots already visited.
    // Table sizes are chosen to be prime, so any 0 < incr < hashSize works;
    // "incr = 1 + (seed % (hashSize - 1))" would already be correct.
    //
    // Real-world keys are often non-uniform ("mostly sequential" runs that pack
    // into clusters), so the increment should be large even for small hash
    // values. The seed is therefore multiplied by HashPrime (101) first. As
    // long as hashSize - 1 is not a multiple of HashPrime (enforced in
    // GetPrime), incr can take every value from 1 to hashSize - 1. The choice
    // of 101 was largely arbitrary.
    //
    // key      - the key to hash.
    // hashsize - the current number of buckets.
    // seed     - receives h1(key), the key's hash code with the sign bit cleared.
    // incr     - receives h2(key, hashsize); callers add it on every probe.
    // Returns the same non-negative hash code that is written to seed.
    private uint InitHash(Object key, int hashsize, out uint seed, out uint incr) {
        // The sign bit is reserved for the collision flag in bucket.hash_coll,
        // so the stored hash code must never use it.
        uint positiveHash = (uint) GetHash(key) & 0x7FFFFFFF;
        seed = positiveHash;

        // Restriction: incr MUST lie in [1, hashsize - 1] for the modular
        // arithmetic to visit every bucket exactly once within hashsize
        // iterations. Note the modulus is hashsize - 1, not hashsize.
        // If this h2(key) calculation changes, putEntry must change too.
        uint probeModulus = (uint)hashsize - 1;
        incr = (uint)(1 + ((seed * HashPrime) % probeModulus));

        return positiveHash;
    }
    
    // Grows the bucket array. Insert calls this once the entry count reaches
    // the load limit. The new size comes from HashHelpers.ExpandPrime (quoted
    // further below), and the existing entries are redistributed using their
    // cached hash codes (forceNewHashCode is false).
    private void expand()
    {
        rehash(HashHelpers.ExpandPrime(buckets.Length), false);
    }
    
    // Rebuilds the table into a fresh bucket array of 'newsize' slots.
    //
    // newsize          - number of slots in the replacement array.
    // forceNewHashCode - true to recompute each key's hash via GetHash; false
    //                    to reuse the hash cached in bucket.hash_coll.
    private void rehash( int newsize, bool forceNewHashCode ) 
    {
        // The collision counter is rebuilt while entries are re-inserted below
        // (putEntry increments it).
        occupancy = 0;

        // Internal state is left untouched until the new array is fully
        // populated. This serves two purposes:
        //   1) concurrent readers see valid hashtable contents at all times;
        //   2) an OutOfMemoryException during this allocation cannot leave the
        //      table half-updated.
        bucket[] newBuckets = new bucket[newsize];

        // Copy every live entry from the old array into the new one.
        for (int slot = 0; slot < buckets.Length; slot++) {
            bucket entry = buckets[slot];

            // Skip slots with a null key, and slots whose key is the 'buckets'
            // array reference itself.
            if ((entry.key == null) || (entry.key == buckets)) {
                continue;
            }

            int rawHash = forceNewHashCode ? GetHash(entry.key) : entry.hash_coll;
            // Strip the collision (sign) bit before re-inserting.
            putEntry(newBuckets, entry.key, entry.val, rawHash & 0x7FFFFFFF);
        }

        // The new array is complete - swap it in together with the derived state.
        Thread.BeginCriticalRegion();
        isWriterInProgress = true;
        buckets = newBuckets;
        loadsize = (int)(loadFactor * newsize);
        UpdateVersion();
        isWriterInProgress = false;
        Thread.EndCriticalRegion();

        // The minimum hashtable size is 3 and the maximum loadFactor is 0.72.
        Contract.Assert(loadsize < newsize, "Our current implementaion means this is not possible.");
    }
        
    // Places one entry into 'newBuckets' during a rehash, probing with the
    // same double-hashing scheme as InitHash/Insert.
    //
    // newBuckets - the array being populated.
    // key/nvalue - the entry to store.
    // hashcode   - the key's hash with the collision (sign) bit cleared. The
    //              hash is not recomputed here; only the slot index is.
    private void putEntry (bucket[] newBuckets, Object key, Object nvalue, int hashcode)
    {
        Contract.Assert(hashcode >= 0, "hashcode >= 0");  // make sure collision bit (sign bit) wasn't set.

        uint tableLength = (uint)newBuckets.Length;
        uint seed = (uint) hashcode;
        uint incr = (uint)(1 + ((seed * HashPrime) % (tableLength - 1)));
        int slot = (int) (seed % tableLength);

        while (true) {
            Object occupant = newBuckets[slot].key;

            // The slot is unused: store the entry here and stop.
            if ((occupant == null) || (occupant == buckets)) {
                newBuckets[slot].val = nvalue;
                newBuckets[slot].key = key;
                newBuckets[slot].hash_coll |= hashcode;
                return;
            }

            // The slot is taken: flag it as collided (sign bit) the first time
            // only, and count it towards occupancy.
            if (newBuckets[slot].hash_coll >= 0) {
                newBuckets[slot].hash_coll |= unchecked((int)0x80000000);
                occupancy++;
            }

            // Step to the next probe position, wrapping at the table length.
            slot = (int) (((long)slot + incr) % tableLength);
        }
    }
    HashHelpers中的ExpandPrime和GetPrime方法:
    // Returns the size a hashtable of 'oldSize' buckets should grow to: the
    // result of GetPrime for twice the old size, capped at
    // MaxPrimeArrayLength.
    public static int ExpandPrime(int oldSize)
    {
        int doubled = 2 * oldSize;

        // Common case: the doubled size fits under the cap, or the table is
        // already at/over the cap. The (uint) cast keeps the comparison
        // meaningful even when 2 * oldSize overflowed to a negative int.
        if ((uint)doubled <= MaxPrimeArrayLength || MaxPrimeArrayLength <= oldSize)
        {
            return GetPrime(doubled);
        }

        // Otherwise let the table grow to the maximum possible size
        // (~2G elements) before a capacity overflow is reported.
        Contract.Assert( MaxPrimeArrayLength == GetPrime(MaxPrimeArrayLength), "Invalid MaxPrimeArrayLength");
        return MaxPrimeArrayLength;
    }
    
    // Returns a prime that is at least 'min'.
    //
    // Throws ArgumentException when min is negative. If no suitable prime is
    // found below Int32.MaxValue, min itself is returned.
    public static int GetPrime(int min) 
    {
        if (min < 0)
            throw new ArgumentException(Environment.GetResourceString("Arg_HTCapacityOverflow"));
        Contract.EndContractBlock();

        // Fast path: the first entry of the precomputed table that is >= min.
        foreach (int tabulated in primes)
        {
            if (tabulated >= min) return tabulated;
        }

        // Beyond the predefined table: test odd candidates one by one, and
        // reject primes p where (p - 1) is a multiple of Hashtable.HashPrime.
        // NOTE(review): this search can return a value larger than
        // MaxPrimeArrayLength; the article's author flagged this as a possible
        // oversight - confirm whether callers rely on the cap.
        int candidate = min | 1;
        while (candidate < Int32.MaxValue)
        {
            if (IsPrime(candidate) && ((candidate - 1) % Hashtable.HashPrime != 0))
                return candidate;
            candidate += 2;
        }
        return min;
    }

    为啥可以这么写:

    // Read through the indexer getter (quoted below).
    var a=h["b"];
    // Write through the indexer setter, which calls Insert(key, value, false).
    h["b"] = "c";

    看源码:

    // Gets or sets the value associated with the given key.
    //
    // get: returns the stored value, or null when the key is not found.
    //      Throws ArgumentNullException when key is null.
    // set: forwards to Insert(key, value, false), so a missing key is added
    //      and an existing key has its value overwritten.
    public virtual Object this[Object key] {
        get {
            if (key == null) {
                throw new ArgumentNullException("key", Environment.GetResourceString("ArgumentNull_Key"));
            }
            Contract.EndContractBlock();
    
            uint seed; // h1(key): the key's non-negative hash code
            uint incr; // h2(key): probe step used when the initial slot belongs to another key
    
            
            // Take a snapshot of buckets, in case another thread does a resize
            bucket[] lbuckets = buckets;
            uint hashcode = InitHash(key, lbuckets.Length, out seed, out incr);  // same hashing as Insert
            int  ntry = 0;
    
            bucket b;
            int bucketNumber = (int) (seed % (uint)lbuckets.Length);                
            do
            {
                int currentversion;
    
                //     A read operation on hashtable has three steps:
                //        (1) calculate the hash and find the slot number.
                //        (2) compare the hashcode, if equal, go to step 3. Otherwise end.
                //        (3) compare the key, if equal, go to step 4. Otherwise end.
                //        (4) return the value contained in the bucket.
                //     After step 3 and before step 4. A writer can kick in a remove the old item and add a new one 
                //     in the same bucket. So in the reader we need to check if the hash table is modified during above steps.
                //
                // Writers (Insert, Remove, Clear) will set 'isWriterInProgress' flag before it starts modifying 
                // the hashtable and will clear the flag when it is done.  When the flag is cleared, the 'version'
                // will be increased.  We will repeat the reading if a writer is in progress or done with the modification 
                // during the read.
                //
                // Our memory model guarantee if we pick up the change in bucket from another processor, 
                // we will see the 'isWriterProgress' flag to be true or 'version' is changed in the reader.
                //                    
                int spinCount = 0;
                do {
                    // this is volatile read, following memory accesses can not be moved ahead of it.
                    currentversion = version;
                    b = lbuckets[bucketNumber];                        
    
                    // The contention between reader and writer shouldn't happen frequently.
                    // But just in case this will burn CPU, yield the control of CPU if we spinned a few times.
                    // 8 is just a random number I pick. 
                    if( (++spinCount) % 8 == 0 ) {   
                        Thread.Sleep(1);   // 1 means we are yielding control to all threads, including low-priority ones.
                    }
                    // Retry the slot read while a writer is updating/resizing the table, or when the
                    // version changed underneath us; this acts as the reader's lightweight lock.
                } while ( isWriterInProgress || (currentversion != version) );

                // A null key means this probe reached an unused slot: the key is not in the table.
                if (b.key == null) {
                    return null;
                }
                // hash_coll is negative when the collision bit is set, so mask the bit off before
                // comparing hash codes; on a full match return the stored value.
                if (((b.hash_coll & 0x7FFFFFFF) == hashcode) &&
                    KeyEquals (b.key, key))
                    return b.val;  
                // Not our key: move to the next probe position.
                bucketNumber = (int) (((long)bucketNumber + incr)% (uint)lbuckets.Length);
                // Keep probing only while the slot just examined had its collision bit set and
                // fewer than lbuckets.Length slots have been tried.
            } while (b.hash_coll < 0 && ++ntry < lbuckets.Length);
            return null;
        }
    
        set {
            Insert(key, value, false);
        }
    }

    好了,到此为止吧,Hashtable的原理应该大致了解了吧,基本上就是用一个结构体数组,加上对key做双重哈希(double hashing)来实现的。

    个人想法:buckets数组扩容时,需要重新计算所有key的索引,旧数组的数据需要挪到新数组,旧数组就是gc需要管理的内存垃圾,这个数组较大时,会存储到大对象堆吗?这个不太了解,暂时留个问题吧,待以后再了解后再做记录吧

    以上是 .NET Framework 4.8 的部分源码,跟 .NET 6 的实现有微小的区别。

    参考:

    https://referencesource.microsoft.com/#mscorlib/system/collections/hashtable.cs,10fefb6e0ae510dd

  • 相关阅读:
    BZOJ 3260: 跳 (组合恒等式)
    BZOJ 1924 [SDOI 2010] 所驼门王的宝藏 (优化建图+tarjan+最长链)
    BZOJ 3451Normal (点分治+FFT)
    BZOJ 2213: [Poi2011]Difference (DP)
    BZOJ 3251 树上三角形 (暴力)
    [HNOI2008]明明的烦恼
    HDU 1521 排列组合
    [HNOI2008]神奇的国度
    [HNOI2008]遥远的行星
    [HNOI2001]软件开发
  • 原文地址:https://www.cnblogs.com/liuqiyun/p/15901297.html
Copyright © 2020-2023  润新知