#ifndef _Document_H_040410_
#define _Document_H_040410_

#include <string>

/// Plain record pairing a document id with an integer offset.
/// NOTE(review): presumably the offset locates the document inside a raw
/// data file -- confirm against the code that fills this struct.
struct DocIdx
{
    int docid;
    int offset;
};

// Retained from the original header: translation units that include this
// file may rely on unqualified standard-library names being visible.
using namespace std;

/// Holds the fields of a single document record together with a few text
/// helpers. All data members are public and are accessed directly.
class CDocument
{
public:
    int m_nDocId;               ///< Set to -1 by the constructor.
    int m_nPos;                 ///< Set to -1 by the constructor.
    int m_nLength;              ///< Set to 0 by the constructor.
    std::string m_sChecksum;

    std::string m_sUrl;
    std::string m_sRecord;      ///< A record including a HEAD, a header and body.
    std::string m_sHead;
    std::string m_sHeader;
    std::string m_sBody;

    std::string m_sBodyNoTags;  ///< NOTE(review): presumably m_sBody with markup removed -- confirm.

public:
    CDocument();
    ~CDocument();

    /// Declared const, so it cannot modify this object's members.
    /// NOTE(review): the accompanying implementation is a stub that always
    /// returns true and does not inspect `content`.
    bool ParseRecord(std::string &content) const;

    /// Declared const, so it cannot modify this object's members.
    /// NOTE(review): the accompanying implementation is a stub that always
    /// returns true and does not inspect `body`.
    bool CleanBody(std::string &body) const;

    /// Removes <...> tag spans from the NUL-terminated buffer `s`, in place.
    void RemoveTags(char *s);
};

#endif /* _Document_H_040410_ */
1 /*Document handling 2 */ 3 4 #include "Document.h" 5 6 CDocument::CDocument() 7 { 8 m_nDocId = -1; 9 m_nPos = -1; 10 m_nLength = 0; 11 m_sChecksum = ""; 12 13 m_sUrl = ""; 14 } 15 16 CDocument::~CDocument() 17 { 18 } 19 20 bool CDocument::ParseRecord(string &content) const 21 { 22 return true; 23 } 24 25 bool CDocument::CleanBody(string &body) const 26 { 27 return true; 28 } 29 30 //把 <...> 删掉 31 void CDocument::RemoveTags(char *s) 32 { 33 int intag; 34 char *p, *q; 35 36 if (!s || !*s) return; 37 38 for (p=q=s, intag=0; *q; q++) { 39 switch (*q){ 40 case '<': 41 intag = 1; 42 *p++ = ' '; 43 break; 44 case '>': 45 intag = 0; 46 break; 47 default: 48 if (!intag) { 49 *p++ = *q; 50 } 51 break; 52 } 53 } 54 55 *p = '\0'; 56 57 /* second method 58 char *d = s; 59 while (*s) { 60 if (*s == '<') { 61 while (*s && *s!='>') s++; 62 if( *s == '\0') break; 63 s++; 64 continue; 65 } 66 67 *d++ = *s++; 68 } 69 *d = 0; 70 */ 71 }