二分答案 + 滚动哈希寻找重复子串

1044. Longest Duplicate Substring

本题的high level的思想是二分搜索这样的substring最多能有多长。对于一个固定的len，如果能在S里找到两处相同的子串滑窗，那么我们就可以对len的搜索往上调整；反之，我们就必须将len的搜索往下调。

于是本题转化为：如何快速在S里找到两处相同的、长度都是len的滑窗。比较直观的想法是把每一处滑窗对应的字符串都放入集合，如果发现集合中已经有一个相同的substring，就意味着出现了duplicate。但是当len很大的时候，将字符串放入集合时内置的hash化操作很耗时。于是rolling hash是比较常见的解决方案。

Rolling hash的基本思想就是将abcd转化为26进制的0123。将一个数作为key放入集合很轻松。除此之外，对于abcdef这样的字符串，如果已经知道了[abcd]这个子串的hash key是0123，那么再求相邻的[bcde]的hash key 1234就非常高效，只需要将之前的key的最高位数字去掉（a对应的0）、左移一位、加上最低位数字（e对应的4），用O(1)的时间就能完成。

总结一下，rolling hash的思想，就是将两个字符串的比较，转化为hash化之后的两个26进制数的比较。但是当len很大时，这个数可能很大，我们通常不得不对一个大数取模再作为hash key。这样就会有两个不同的字符串，但是会对应同一个hash key（26进制数取模后的结果）。针对这种hash collision的情况，我们可以调整base和mod来规避。或者使用两套hash规则来生成两个key，那么不同字符串拥有两个相同key的概率就会大大降低。

// LeetCode 1044 - Longest Duplicate Substring.
//
// Binary-searches the length of the longest substring that occurs at least
// twice (occurrences may overlap). For a fixed length, check() slides a
// window over the string, keeps a polynomial rolling hash of every window,
// and confirms each hash hit with a real character comparison, so a hash
// collision can never be reported as a duplicate.
class Solution {
public:
    typedef long long LL;

    // Modulus of the rolling hash. It is a prime, hence coprime to the
    // base 26: with a power-of-two modulus 26^32 would be congruent to 0
    // and every window longer than 32 characters would be hashed from its
    // last 32 characters only.
    LL mod = 1000000007LL;

    // Window length -> start index of a later occurrence of a verified
    // repeated substring of that length. Written by check().
    std::unordered_map<int, int> mp;

    // Returns a longest substring of S that occurs at least twice, or ""
    // when no substring repeats.
    std::string longestDupSubstring(std::string S) {
        const int n = static_cast<int>(S.size());
        // Feasibility is monotone in the length: 1 1 1 0 0 0 (1 = some
        // duplicate of that length exists). Find the first infeasible
        // length in [1, n]; length n itself can never repeat, so r = n is
        // a valid upper bound and check(S, n) is never needed.
        int l = 1, r = n;
        while (l < r) {
            const int mid = l + (r - l) / 2;
            if (check(S, mid)) l = mid + 1;
            else r = mid;
        }
        const int len = l - 1;
        // Nothing repeats; also avoids reading a stale mp[0] entry.
        if (len <= 0) return "";
        // len >= 1 means check(S, len) succeeded during this call and
        // stored the start index of the duplicate in mp[len].
        return S.substr(mp[len], len);
    }

    // Returns true iff some substring of s with length k occurs at least
    // twice; on success mp[k] holds the start index of the later
    // occurrence. Lengths outside [0, s.size()] yield false.
    bool check(std::string& s, int k) {
        const int n = static_cast<int>(s.size());
        if (k < 0 || k > n) return false;

        const LL kBase = 26;
        // Maps any intermediate value to its residue in [0, mod).
        auto norm = [this](LL x) { return ((x % mod) + mod) % mod; };

        LL cur = 0;      // hash of the current window
        LL weight = 1;   // kBase^k mod `mod`: weight of the char leaving the window
        for (int i = 0; i < k; ++i) {
            cur = norm(cur * kBase + (s[i] - 'a'));
            weight = weight * kBase % mod;
        }

        // hash -> start index of every earlier window with that hash.
        std::unordered_multimap<LL, int> seen;
        seen.emplace(cur, 0);

        for (int i = k; i < n; ++i) {
            // Append s[i] and drop s[i - k] in O(1).
            cur = norm(cur * kBase + (s[i] - 'a') - (s[i - k] - 'a') * weight);
            const int start = i - k + 1;

            // Equal hashes are only candidates: compare the characters.
            const auto range = seen.equal_range(cur);
            for (auto it = range.first; it != range.second; ++it) {
                const std::size_t prev = static_cast<std::size_t>(it->second);
                if (s.compare(prev, static_cast<std::size_t>(k), s,
                              static_cast<std::size_t>(start),
                              static_cast<std::size_t>(k)) == 0) {
                    mp[k] = start;
                    return true;
                }
            }
            seen.emplace(cur, start);
        }
        return false;
    }
};

718. Maximum Length of Repeated Subarray

// LeetCode 718 - Maximum Length of Repeated Subarray.
//
// Binary-searches the length of the longest contiguous run that appears in
// both arrays. For a fixed length, check() hashes every window of A and of
// B with a polynomial rolling hash and confirms each hash hit element by
// element, so a hash collision can never be reported as a common subarray.
class Solution {
public:
    typedef long long LL;

    const int mod = 1000000007;  // prime modulus of the rolling hash
    const int base = 113;        // polynomial base

    // Returns x^k modulo `mod` by binary exponentiation; yields 1 for k <= 0.
    LL qPow(LL x, LL k) {
        LL res = 1;
        while (k > 0) {
            if (k & 1) {
                res = res * x % mod;
            }
            x = x * x % mod;
            k >>= 1;
        }
        return res;
    }

    // Returns the length of the longest subarray common to A and B
    // (0 when they share no element).
    int findLength(std::vector<int>& A, std::vector<int>& B) {
        // Feasibility is monotone in the length; find the first infeasible
        // length in [1, min(|A|, |B|) + 1] and step back by one.
        int l = 1;
        int r = static_cast<int>(std::min(A.size(), B.size())) + 1;
        while (l < r) {
            const int mid = l + (r - l) / 2;
            if (check(A, B, mid)) l = mid + 1;
            else r = mid;
        }
        return l - 1;
    }

    // Returns true iff A and B share a contiguous run of exactly k elements.
    // k == 0 is trivially true; k < 0 or k longer than either array is false.
    bool check(std::vector<int>& A, std::vector<int>& B, int k) {
        const int lenA = static_cast<int>(A.size());
        const int lenB = static_cast<int>(B.size());
        if (k < 0 || k > lenA || k > lenB) return false;
        if (k == 0) return true;

        const std::vector<LL> hashesA = windowHashes(A, k);
        const std::vector<LL> hashesB = windowHashes(B, k);
        // Fast membership filter over A's window hashes.
        const std::unordered_set<LL> knownA(hashesA.begin(), hashesA.end());

        for (int sb = 0; sb < static_cast<int>(hashesB.size()); ++sb) {
            if (knownA.count(hashesB[sb]) == 0) continue;
            // Equal hashes are only candidates: compare the elements.
            for (int sa = 0; sa < static_cast<int>(hashesA.size()); ++sa) {
                if (hashesA[sa] == hashesB[sb] &&
                    std::equal(A.begin() + sa, A.begin() + sa + k, B.begin() + sb)) {
                    return true;
                }
            }
        }
        return false;
    }

private:
    // Returns the hash of every length-k window of v, indexed by the
    // window's start position. Requires 1 <= k <= v.size(). Every value is
    // reduced to [0, mod), so negative elements hash consistently.
    std::vector<LL> windowHashes(const std::vector<int>& v, int k) {
        const LL m = mod;
        const LL b = base;
        const int len = static_cast<int>(v.size());
        // Weight of the element that leaves the window; loop-invariant,
        // so it is computed once instead of on every slide.
        const LL top = qPow(b, k - 1);

        std::vector<LL> out;
        out.reserve(static_cast<std::size_t>(len - k + 1));

        LL h = 0;
        for (int i = 0; i < k; ++i) {
            h = ((h * b + v[i]) % m + m) % m;
        }
        out.push_back(h);

        for (int i = k; i < len; ++i) {
            const LL leaving = (v[i - k] % m + m) % m;
            h = ((h - leaving * top) % m + m) % m;   // drop v[i - k]
            h = ((h * b + v[i]) % m + m) % m;        // shift and append v[i]
            out.push_back(h);
        }
        return out;
    }
};

相关阅读:
uva11572 Unique Snowflakes
codeforces#333 div2 B. Approximating a Constant Range
uva11134 Fabled Rooks
吐槽。。。
uva 1605 Building for UN
uva 120 Stacks of Flapjacks
uva1152 4 Values whose Sum is 0
uva817 According to Bartjens
uva11214 Guarding the Chessboard
无标题
原文地址：https://www.cnblogs.com/betaa/p/13382944.html

二分答案 + 滚动哈希 寻找重复子串

二分答案 + 滚动哈希寻找重复子串