为了让用户能够更清晰地编写采集规则,特意先把采集回来的源码精简之后再进行处理,比如要
删除 <style> </style>、<script> </script>、注释等内容,并去除标签中的垃圾属性等。如果使用正则表达式,对于大一点的网页
数据,匹配并替换为空一次竟然要几百毫秒,吓死人,故自己写了个函数,同样大小的数据 1-3 毫秒即可完成。
- <span><span style="color: #000000;">
- //------这上面几段垃圾html代码删不掉了,请无视它,自动跑进去的
- class BaseLib
- {
- public:
- BaseLib();
- static QString removeOf(QString strData,QString strBegin,QString strEnd,bool bIgnoreCase = false,bool bAllStr = false);
- };
- </span>
- </span>
- QString BaseLib::removeOf(QString strData,QString strBegin,QString strEnd,bool bIgnoreCase,bool bAllStr)
- {
- int nPos = 0;
- int nStartPos = -1;
- int nEndPos = -1;
- int nBeginLen = strBegin.length();
- int nEndLen = strEnd.length();
- if(bIgnoreCase == true){
- QString strNewData = strData.toUpper();
- strBegin = strBegin.toUpper();
- strEnd = strEnd.toUpper();
- while(1){
- nStartPos = strNewData.indexOf(strBegin,nPos);
- if(nStartPos == -1){
- break;
- }
- nEndPos = strNewData.indexOf(strEnd,nStartPos + nBeginLen);
- if(nEndPos == -1){
- break;
- }
- nEndPos = nEndPos - nStartPos + nEndLen;
- if(bAllStr == false){
- strNewData.remove(nStartPos + nBeginLen,nEndPos - nBeginLen - nEndLen);
- strData.remove(nStartPos + nBeginLen,nEndPos - nBeginLen - nEndLen);
- nPos = nStartPos + nEndPos - nBeginLen - nEndLen;
- }else{
- strNewData.remove(nStartPos,nEndPos);
- strData.remove(nStartPos,nEndPos);
- nPos = nStartPos;
- }
- }
- }else{
- while(1){
- nStartPos = strData.indexOf(strBegin,nPos);
- if(nStartPos == -1){
- break;
- }
- nEndPos = strData.indexOf(strEnd,nStartPos + nBeginLen);
- if(nEndPos == -1){
- break;
- }
- nEndPos = nEndPos - nStartPos + nEndLen;
- if(bAllStr == false){
- strData.remove(nStartPos + nBeginLen,nEndPos - nBeginLen - nEndLen);
- nPos = nStartPos + nEndPos - nBeginLen - nEndLen;
- }else{
- strData.remove(nStartPos,nEndPos);
- nPos = nStartPos;
- }
- }
- }
- return strData;
- }
参数1 :传递数据
参数2 :传递要处理的数据开头
参数3 :传递要处理的数据结尾
参数4 :bool值,如果填写true则忽略大小写,false,不忽略大小写(速度更快点),可以不填写,默认false
参数5 :bool值,是否连同开头和结尾一起删除:true 删除包含数据开头和数据结尾在内的全部数据,false 只删除开头和结尾之间的数据(保留数据开头和结尾),可以不填写,默认false
(参数4,5可不填写)
例子:
- QString str = "<head><title>this is a test</title></head>";
- QString strData = BaseLib::removeOf(strData,"<title>","</title>",false,true);
返回:
- <head></head>
字符串处理使用的是 Qt 的 QString 类,使用 MFC 的人可以替换成 CString
c++标准库string版(比QString稍微快点)
- string BaseLib::removeOf(string strData,string strBegin,string strEnd,bool bIgnoreCase,bool bAllStr)
- {
- int nPos = 0;
- int nStartPos = -1;
- int nEndPos = -1;
- int nBeginLen = strBegin.length();
- int nEndLen = strEnd.length();
- if(bIgnoreCase == true){
- string strNewData = strData;
- for(int i=0;i<(int)strNewData.size();++i){
- strNewData[i]=toupper(strNewData[i]);
- }
- for(int i=0;i<(int)strBegin.size();++i){
- strBegin[i]=toupper(strBegin[i]);
- }
- for(int i=0;i<(int)strEnd.size();++i){
- strEnd[i]=toupper(strEnd[i]);
- }
- while(1){
- nStartPos = strNewData.find(strBegin,nPos);
- if(nStartPos == -1){
- break;
- }
- nEndPos = strNewData.find(strEnd,nStartPos + nBeginLen);
- if(nEndPos == -1){
- break;
- }
- nEndPos = nEndPos - nStartPos + nEndLen;
- if(bAllStr == false){
- strNewData.erase(nStartPos + nBeginLen,nEndPos - nBeginLen - nEndLen);
- strData.erase(nStartPos + nBeginLen,nEndPos - nBeginLen - nEndLen);
- nPos = nStartPos + nEndPos - nBeginLen - nEndLen;
- }else{
- strNewData.erase(nStartPos,nEndPos);
- strData.erase(nStartPos,nEndPos);
- nPos = nStartPos;
- }
- }
- }else{
- while(1){
- nStartPos = strData.find(strBegin,nPos);
- if(nStartPos == -1){
- break;
- }
- nEndPos = strData.find(strEnd,nStartPos + nBeginLen);
- if(nEndPos == -1){
- break;
- }
- nEndPos = nEndPos - nStartPos + nEndLen;
- if(bAllStr == false){
- strData.erase(nStartPos + nBeginLen,nEndPos - nBeginLen - nEndLen);
- nPos = nStartPos + nEndPos - nBeginLen - nEndLen;
- }else{
- strData.erase(nStartPos,nEndPos);
- nPos = nStartPos;
- }
- }
- }
- return strData;
- }