20190529更新
1 增加测试用例
2 修复中文查找可能导致越界的bug
3 strstrs改为不使用二分查找(效率会慢一些,但匹配结果相对可控),推荐使用strstrs_ext
==================================================================================
20190529:windows上建议使用strstrs_ext,linux上在数据不匹配的场景好像strstrs_normal更快一点。我把测试效率代码附上,有需要的可以自己验证。
从我自己测试的效率对比推测,linux上(由glibc提供的)strstr应该不是普通的暴力匹配法,网上“strstr只是暴力匹配”的说法并不准确。
==================================================================================
平时项目中有时需要在一个字符串中同时搜索两个或更多的关键字。例如:将字符串"ab|cd#ef|"按竖线或者井号做分隔。
如果是大项目,一般会采用正则表达式做处理。但有时只是写个小程序,不想为此引入一个正则库,所以我自己写了一个支持多关键字的字符串查找函数strstrs。
函数说明:
#include <stdio.h>
// <windows.h> only exists on Windows. Nothing declared in this header needs
// it (size_t comes from <stdio.h>, IN is defined below), so it is pulled in
// only where it is available; this keeps the header usable on Linux as well.
#ifdef _WIN32
#include <windows.h>
#endif

// IN is an empty "input parameter" annotation; define it when no other
// header (e.g. the Windows SDK) has already done so.
#ifndef IN
#define IN
#endif

// Search a string for any of several keywords (1..nCnt keywords).
//
// strToFind    string to search in; must not be NULL
// strKeywords  array of keyword strings; must not be NULL. Individual
//              elements must not be NULL either, but may be the empty
//              string ("")
// nCnt         number of keywords in strKeywords
// pFound       receives the position of the matched keyword inside
//              strKeywords (0-based); must not be NULL
//
// Return value:
//   1. if any keyword is the empty string, strToFind is returned
//   2. if no keyword is found, NULL is returned
//   3. if a keyword is found, its 0-based index in strKeywords is reported.
//      NOTE(review): the original note described the return value itself as
//      that index, but the return type is a pointer -- presumably the index
//      goes to *pFound and the returned pointer is the match position inside
//      strToFind; confirm against the implementation.

// Implemented with a hash plus binary search.
// NOTE(review): the 2019-05-29 changelog says strstrs no longer uses binary
// search -- confirm and update this description.
const char *strstrs(const char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound);
// Implemented with a hash plus chaining; this is the recommended variant.
const char *strstrs_ext(const char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound);
// Implemented by searching for each keyword in turn.
const char *strstrs_normal(const char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound);

// The remaining overloads exist purely for caller convenience (different
// const-ness of the haystack and of the keyword array).
char *strstrs(IN char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound);
char *strstrs_ext(IN char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound);
char *strstrs_normal(IN char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound);

char *strstrs(IN char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound);
char *strstrs_ext(IN char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound);
char *strstrs_normal(IN char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound);

const char *strstrs(const char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound);
// pFound is an output pointer here as in every other overload; it was
// previously declared as a plain int, which did not match the definition.
const char *strstrs_ext(const char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound);
const char *strstrs_normal(const char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound);

// Test driver; nStep selects the variant: 0 strstrs, 1 strstrs_ext, 2 strstrs_normal.
// (The spelling "tets" is kept as-is so existing callers/definitions still link.)
void tets_strstrs(int nStep);
函数实现及相应测试代码:
// stdafx.cpp : source file that includes just the standard includes // sqlite_test.pch will be the pre-compiled header // stdafx.obj will contain the pre-compiled type information #include "stdafx.h" #include <assert.h> #include <stdlib.h> #include <time.h> #include <stdio.h> // TODO: reference any additional headers you need in STDAFX.H // and not in this file const char *strstrs(const char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound) { return strstrs(const_cast<char *>(strToFind), strKeywords, nCnt, pFound); } const char *strstrs_ext(const char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound) { return strstrs_ext(const_cast<char *>(strToFind), strKeywords, nCnt, pFound); } const char *strstrs_normal(const char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound) { return strstrs_normal(const_cast<char *>(strToFind), strKeywords, nCnt, pFound); } const char *strstrs(const char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound) { return strstrs(const_cast<char *>(strToFind), (const char **)strKeywords, nCnt, pFound); } const char *strstrs_ext(const char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound) { return strstrs_ext(const_cast<char *>(strToFind), (const char **)strKeywords, nCnt, pFound); } const char *strstrs_normal(const char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound) { return strstrs_normal(const_cast<char *>(strToFind), (const char **)strKeywords, nCnt, pFound); } char *strstrs(IN char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound) { return strstrs(const_cast<char *>(strToFind), (const char **)strKeywords, nCnt, pFound); } char *strstrs_ext(IN char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound) { return strstrs_ext(const_cast<char *>(strToFind), (const char **)strKeywords, nCnt, pFound); } char *strstrs_normal(IN char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound) { return strstrs_normal(const_cast<char *>(strToFind), 
(const char **)strKeywords, nCnt, pFound); } typedef struct tagKeyPos { const char *m_str; size_t m_nIdx; size_t m_strLen; }KeyPos; int __strstrs_cmp(const void *p1, const void *p2) { const KeyPos *pLeft = (KeyPos *)p1, *pRight = (KeyPos *)p2; int nCmp = strcmp(pLeft->m_str, pRight->m_str); if (nCmp == 0) { return pLeft->m_nIdx - pRight->m_nIdx; } return nCmp; } /* //lower_bound KeyPos *__strstrs_find_first(KeyPos *pRealBeg, KeyPos *pRealEnd, size_t *pKeyLenArr, KeyPos *pKey) { KeyPos *pBeg = pRealBeg; KeyPos *pEnd = pRealEnd; KeyPos *pEqal = NULL; while (pBeg != pEnd) { pEqal = pBeg + (pEnd - pBeg) / 2; int nCmp = memcmp( pEqal->m_str, pKey->m_str, pEqal->m_strLen ); if (nCmp == 0) { //若相等,则往前找,直至找到最后一个相等的元素 while (pEqal != pBeg) { pEqal--; if (memcmp( pEqal->m_str, pKey->m_str, pEqal->m_strLen )) { return pEqal + 1; } } return pBeg; } else if (nCmp > 0) { //中值比目标值大 pEnd = pEqal; } else { //中值比目标值小 pBeg = pEqal + 1; } } return pRealEnd; } */ KeyPos *__strstrs_find_first(KeyPos *pRealBeg, KeyPos *pRealEnd, size_t *pKeyLenArr, KeyPos *pKey) { KeyPos *pBeg = pRealBeg; KeyPos *pEnd = pRealEnd; while (pBeg != pEnd) { int nCmp = memcmp( pBeg->m_str, pKey->m_str, pBeg->m_strLen ); if (nCmp == 0) { return pBeg; } ++pBeg; } return pRealEnd; } char *strstrs(char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound) { //作者:皇家救星 创建于:2016-10-19 //有bug请发送邮件至89475049@qq.com 邮件主题注明:strstrs问题反馈 //异常参数判断 assert(strToFind != NULL); assert(strKeywords != NULL); assert(pFound != NULL); assert(nCnt > 0); //记录各个关键字首字符到集合中 后面判断用 bool mpFirstChar[256] = {0}; //这里如果用位图,可以节省不少空间 for (size_t i = 0; i < nCnt; i++) { //linux和win的char类型定义不一样 这里统一强制转换一下 assert(strKeywords[i] != NULL); //使用unsigned char 确保char类型是负数时强制转换不会超过256而越界 mpFirstChar[(unsigned char)strKeywords[i][0]] = true; if (strKeywords[i][0] == '