还是直接上代码吧:
#include <stdlib.h>

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <fstream>
#include <iostream>
#include <list>
#include <string>
#include <vector>

// Number of buckets in the hash table; hash1() always returns a value below this.
static const int hashtable_length = 6151;
// Modulus used by the secondary (length-based) hash, hash2().
static const int hashtable_compare = 769;

namespace {

// Packs `count` (1..4) bytes into a 32-bit word, least-significant byte first.
// Building the word byte by byte avoids the misaligned / aliasing read that a
// cast to `unsigned int*` would perform, and gives the same result on every
// platform regardless of endianness.
std::uint32_t pack_bytes_le(const char* bytes, std::size_t count)
{
    std::uint32_t word = 0;
    for (std::size_t i = 0; i < count; ++i) {
        const std::uint32_t byte = static_cast<unsigned char>(bytes[i]);
        word |= byte << (8 * i);
    }
    return word;
}

}  // namespace

// Primary hash: selects the bucket for a NUL-terminated string.
//
// The string is consumed in 4-byte little-endian words that are summed modulo
// hashtable_length. A final partial word (1-3 bytes) is zero-padded and folded
// in as well, so short strings no longer all collapse into bucket 0.
//
// Returns a value in [0, hashtable_length).
unsigned int hash1(const char* str)
{
    const std::uint32_t modulus = static_cast<std::uint32_t>(hashtable_length);
    std::size_t remaining = std::strlen(str);
    std::uint32_t sum = 0;

    while (remaining > 0) {
        const std::size_t chunk = remaining < 4 ? remaining : 4;
        // Unsigned addition may wrap before the modulo; that is well defined
        // and keeps the result deterministic.
        sum = (sum + pack_bytes_le(str, chunk)) % modulus;
        str += chunk;
        remaining -= chunk;
    }
    return sum;
}

// Secondary hash intended as a cheap pre-filter when comparing two strings.
// It is simply the string length modulo hashtable_compare.
//
// Returns a value in [0, hashtable_compare).
unsigned int hash2(const char* str)
{
    return static_cast<unsigned int>(std::strlen(str) % hashtable_compare);
}

// Reports whether `str` is already stored in bucket `l`.
//
// The length of `str` is computed once and compared against each entry's
// stored size (O(1)) before the bytes are compared. Equal strings always have
// equal length, so this accepts exactly the same entries as a hash2() check
// followed by strcmp(), without re-measuring every stored string.
bool find_in_bucket(std::list<std::string>& l, const char* str)
{
    const std::size_t len = std::strlen(str);
    for (const std::string& entry : l) {
        if (entry.size() == len && std::memcmp(entry.data(), str, len) == 0) {
            return true;
        }
    }
    return false;
}

// Appends `str` to bucket `l` unless it is already present.
//
// Returns the bucket's size after insertion, or -1 if `str` was a duplicate
// and nothing was inserted.
int insert_in_bucket(std::list<std::string>& l, const char* str)
{
    if (find_in_bucket(l, str)) {
        return -1;
    }
    l.push_back(str);
    return static_cast<int>(l.size());
}

// Looks `str` up in the table `v`.
// `v` must contain at least hashtable_length buckets, because the bucket
// index comes straight from hash1().
bool find_in_hashtable(std::vector<std::list<std::string> >& v, const char* str)
{
    return find_in_bucket(v[hash1(str)], str);
}

// Inserts `str` into the table `v`; same return value as insert_in_bucket().
// `v` must contain at least hashtable_length buckets.
int insert_in_hashtable(std::vector<std::list<std::string> >& v, const char* str)
{
    return insert_in_bucket(v[hash1(str)], str);
}

// Builds a hash table from the lines of D:\input.txt, then looks up every line
// of D:\test.txt, writing the results to D:\log.txt.
//
// Returns 0 on success, -1 if the input file cannot be opened, -2 if the test
// file cannot be opened.
int main()
{
    std::vector<std::list<std::string> > hashtable(hashtable_length);

    // Backslashes are doubled: a single '\t' in a path literal is a TAB, and
    // '\l' / '\i' are not valid escape sequences.
    std::ofstream log_file("D:\\log.txt");

    // std::string lines grow as needed, so an over-long line can neither
    // overflow a fixed buffer nor silently stop the read loop.
    std::string line;

    int max_of_bucket = -1;
    {
        std::ifstream input_file("D:\\input.txt");
        if (!input_file) {
            return -1;
        }
        while (std::getline(input_file, line)) {
            // Store each line in the table and track the longest bucket seen.
            const int len = insert_in_hashtable(hashtable, line.c_str());
            if (len > max_of_bucket) {
                max_of_bucket = len;
            }
            log_file << "hashkey = " << hash1(line.c_str()) << " length = " << len << " " << '\n';
        }
    }
    log_file << "max_of_bucket = " << max_of_bucket << '\n';

    // Lookup phase. A separate stream object is used so the end-of-file state
    // left behind by the first read loop cannot affect this open (pre-C++11
    // libraries do not reset that state in open()).
    std::ifstream test_file("D:\\test.txt");
    if (!test_file) {
        return -2;
    }
    while (std::getline(test_file, line)) {
        if (find_in_hashtable(hashtable, line.c_str())) {
            log_file << "Found it ! ";
        } else {
            log_file << "Missed it ! ";
        }
    }

    return 0;
}