2013-09-14 15:34:16
用后缀数组求一个字符串中重复出现的最长的子串。
- 用C++中的string类可以很方便地进行操作,需将后缀数组保存在vector<string>,如下面代码中的string版本所示,但这样就会因为<string>有很大的开销;
- 直接用字符指针指向后缀字符串的首地址,可以节省很大的空间,如下面代码中的char *版本所示.
- 注意使用char *版本时,用qsort函数最后缀字符串数组排序,需要提供comp函数,该函数的写法如下:
1 int pStrcmp(const void *p,const void *q) 2 { 3 return ( strcmp( *(char **)p , *(char **)q ) ); 4 }
更多关于该函数的说明,详见博文http://www.cnblogs.com/youngforever/articles/3321469.html。
代码(string版本):
1 #include <iostream> 2 #include <cassert> 3 #include <string> //用string类模板,必须包含该头文件 4 #include <vector> //用vector模板,必须包含该头文件 5 #include <algorithm> //用sort函数,必须包含该头文件 6 using namespace std; 7 8 //获取两个字符串的最长公共子串 9 size_t GetLCS(const string &str1,const string &str2) 10 { 11 size_t len1 = str1.length(); 12 size_t len2 = str2.length(); 13 14 size_t end = len1 < len2 ? len1 : len2; 15 16 size_t index = 0; 17 size_t commenLen = 0; 18 19 for (index = 0;index < end;++index) 20 { 21 if (str1.at(index) == str2.at(index)) 22 { 23 ++commenLen; 24 } 25 else 26 { 27 break; 28 } 29 } 30 31 return commenLen; 32 } 33 34 //获取字符串中重复出现且最长的子串 35 size_t FindLongeStringApearTwice(const string &srcStr,string &subStr) 36 { 37 vector<string> vecStr; 38 39 size_t index; 40 size_t end = srcStr.length(); 41 42 string tmpStr; 43 size_t tmpLen = 0; 44 size_t maxLen = 0; 45 46 for ( index = 0;index < end;++index) //生成后缀数组 47 { 48 tmpStr = srcStr.substr(index,(end - 1 - index)); 49 vecStr.push_back(tmpStr); 50 //tmpStr.clear(); //此处的清楚是不必要的,可以删去 51 } 52 53 sort(vecStr.begin(),vecStr.end()); //对后缀字符串数组排序 54 55 vector<string>::iterator iter; 56 57 for (iter = vecStr.begin();(iter + 1) != vecStr.end();++iter) //求相邻string的LCS 58 { 59 tmpLen = GetLCS(*iter,*(iter + 1)); 60 61 if(tmpLen > maxLen) 62 { 63 maxLen = tmpLen; 64 subStr = (*iter).substr(0,maxLen); 65 } 66 } 67 68 return maxLen; 69 } 70 71 72 73 74 //测试FindLongeStringApearTwice 75 void TestDriver() 76 { 77 string strArray[] = {"0123456","yyabcdabjcabceg","abcbcbcabc","hello,li mei! hello,li lei!"}; 78 size_t arrayLength = 4; 79 80 string srcStr; 81 string subStr; 82 size_t maxLen = 0; 83 84 for (size_t index = 0;index < arrayLength;++index) 85 { 86 //srcStr.clear(); //此处的清楚是不必要的,可以删去 87 srcStr = strArray[index]; 88 maxLen = FindLongeStringApearTwice(srcStr,subStr); 89 90 cout<<"the source string is : "<<srcStr<<endl; 91 cout<<"the longest sub string is : "<<subStr<<endl; 92 cout<<"the max length is : "<<maxLen<<endl; 93 cout<<endl; 94 } 95 } 96 97 int main() 98 { 99 TestDriver(); 100 return 0; 101 }
代码(char *版本):
1 #include <iostream> 2 #include <cassert> 3 #include <string> 4 #include <vector> 5 #include <algorithm> 6 using namespace std; 7 8 size_t GetLCS(const char *str1,const char *str2) 9 { 10 assert(str1 != NULL && str2 != NULL); 11 12 size_t len1 = strlen(str1); 13 size_t len2 = strlen(str2); 14 15 size_t end = len1 < len2 ? len1 : len2; 16 17 size_t index = 0; 18 size_t commenLen = 0; 19 20 for (index = 0;index < end;++index) 21 { 22 if ( *(str1 + index) == *(str2 + index) ) 23 { 24 ++commenLen; 25 } 26 else 27 { 28 break; 29 } 30 } 31 32 return commenLen; 33 } 34 35 typedef char * pCHAR; 36 37 int pStrcmp(const void *p,const void *q) 38 { 39 return ( strcmp( *(char **)p , *(char **)q ) ); 40 } 41 42 void DisplayString(const char *pStr) 43 { 44 size_t index = 0; 45 46 while (*(pStr + index)) 47 { 48 cout<<*(pStr + index); 49 ++index; 50 } 51 cout<<endl; 52 } 53 54 size_t FindLongestStringApearTwice(const char *pSrcStr,char *&pSubStr) 55 { 56 assert(pSrcStr != NULL && pSubStr != NULL); 57 58 const size_t lenOfSrcStr = strlen(pSrcStr); 59 pCHAR *pSuffixArray = new pCHAR[lenOfSrcStr]; 60 61 size_t index; 62 size_t end = strlen(pSrcStr); 63 64 size_t tmpLen = 0; 65 size_t maxLen = 0; 66 67 for ( index = 0;index < end;++index) 68 { 69 pSuffixArray[index] = (char *)pSrcStr + index; 70 //DisplayString(pSuffixArray[index]); 71 } 72 73 qsort(pSuffixArray,lenOfSrcStr,sizeof(pCHAR),pStrcmp); 74 /* 75 for ( index = 0;index < end;++index ) 76 { 77 DisplayString(pSuffixArray[index]); 78 }*/ 79 80 for (index = 0;index + 1 < end;++index) 81 { 82 tmpLen = GetLCS(pSuffixArray[index],pSuffixArray[index + 1]); 83 84 if(tmpLen > maxLen) 85 { 86 maxLen = tmpLen; 87 pSubStr = pSuffixArray[index]; 88 } 89 } 90 91 return maxLen; 92 } 93 94 95 void TestDriver() 96 { 97 pCHAR strArray[] = {"0123456","yyabcdabjcabceg","abcbcbcabc","hello,li mei! hello,li lei!"}; 98 size_t arrayLength = 4; 99 100 pCHAR srcStr; 101 pCHAR subStr; 102 size_t maxLen = 0; 103 104 for (size_t index = 0;index < arrayLength;++index) 105 { 106 srcStr = strArray[index]; 107 maxLen = FindLongestStringApearTwice(srcStr,subStr); 108 109 cout<<"the source string is : "<<srcStr<<endl; 110 cout<<"the longest sub string is : "; 111 112 for (size_t i = 0;i < maxLen;++i) 113 { 114 cout<<*(subStr + i); 115 } 116 cout<<endl; 117 118 cout<<"the max length is : "<<maxLen<<endl; 119 cout<<endl; 120 } 121 } 122 123 124 int main() 125 { 126 TestDriver(); 127 return 0; 128 }
测试结果:
the source string is : 0123456 the longest sub string is : the max length is : 0 the source string is : yyabcdabjcabceg the longest sub string is : abc the max length is : 3 the source string is : abcbcbcabc the longest sub string is : bcbc the max length is : 4 the source string is : hello,li mei! hello,li lei! the longest sub string is : hello,li the max length is : 9 请按任意键继续. . .