1 #include <iostream> 2 #include <fstream> 3 #include <cstring> 4 #include "Md5.h" 5 #include "Url.h" 6 #include "Document.h" 7 8 using namespace std; 9 10 int main(int argc, char* argv[]) {/* 11 * DocIndex.cpp 12 * Created on: 2011-11-9 13 * function: 14 * 将一个原始网页库进行索引, 15 * 生成网页索引文件Doc.idx 16 * 和URL索引文件Url.idx 17 */ 18 19 ifstream ifs("Tianwang.raw.1078930288"); 20 if (!ifs) { 21 cout << "不能打开原始网页库<Tianwang.raw.******>" << endl; 22 return -1; 23 } 24 25 ofstream ofsUrl("Url.idx", ios::in | ios::out | ios::trunc | ios::binary); 26 if (!ofsUrl) { 27 cout << "不能打开或者创建URL索引文件<Url.idx>" << endl; 28 cout << "error open file " << endl; 29 } 30 31 ofstream ofsDoc("Doc.idx", ios::in | ios::out | ios::trunc | ios::binary); 32 if (!ofsDoc) { 33 cout << "error open file " << endl; 34 } 35 36 ofstream ofsDocId2Url("DocId2Url.idx", 37 ios::in | ios::out | ios::trunc | ios::binary); 38 if (!ofsDocId2Url) { 39 cout << "error open file " << endl; 40 } 41 42 int cnt = 0; 43 string strLine, strPage; 44 CUrl iUrl; 45 CDocument iDocument; 46 CMD5 iMD5; 47 48 int nOffset = ifs.tellg(); //得到文件读指针距该文件头的字节数 49 cout << "tellg() is:" << nOffset << endl; 50 while (getline(ifs, strLine)) { 51 if (strLine[0] == '\0' || strLine[0] == '#' || strLine[0] == '\n') { 52 nOffset = ifs.tellg(); 53 continue; 54 } 55 56 if (!strncmp(strLine.c_str(), "version: 1.0", 12)) { 57 if (!getline(ifs, strLine)) 58 break; 59 60 if (!strncmp(strLine.c_str(), "url: ", 4)) { 61 iUrl.m_sUrl = strLine.substr(5);//保存url 62 iMD5.GenerateMD5((unsigned char*) iUrl.m_sUrl.c_str(), 63 iUrl.m_sUrl.size()); 64 iUrl.m_sChecksum = iMD5.ToString(); 65 66 } else { 67 continue; 68 } 69 70 while (getline(ifs, strLine)) {//保存文件长度信息 71 if (!strncmp(strLine.c_str(), "length: ", 8)) { 72 sscanf(strLine.substr(8).c_str(), "%d", 73 &(iDocument.m_nLength)); 74 break; 75 } 76 } 77 78 getline(ifs, strLine); 79 80 iDocument.m_nDocId = cnt; 81 iDocument.m_nPos = nOffset; 82 char *pContent = new char[iDocument.m_nLength + 1]; 83 84 memset(pContent, 0, iDocument.m_nLength + 1); 85 ifs.read(pContent, iDocument.m_nLength); 86 iMD5.GenerateMD5((unsigned char*) pContent, iDocument.m_nLength); 87 iDocument.m_sChecksum = iMD5.ToString(); 88 89 delete[] pContent; 90 91 ofsUrl << iUrl.m_sChecksum;//MD5值 92 ofsUrl << "\t" << iDocument.m_nDocId << endl;//doc的Id 93 94 ofsDoc << iDocument.m_nDocId;//文件偏移位置到MD5的映射 95 ofsDoc << "\t" << iDocument.m_nPos; 96 //ofsDoc << "\t" << iDocument.m_nLength ; 97 ofsDoc << "\t" << iDocument.m_sChecksum << endl; 98 99 ofsDocId2Url << iDocument.m_nDocId;//文件编号到url的映射 100 ofsDocId2Url << "\t" << iUrl.m_sUrl << endl; 101 102 cnt++; 103 } 104 105 nOffset = ifs.tellg(); 106 107 } 108 109 ofsDoc << cnt; 110 ofsDoc << "\t" << nOffset << endl; 111 112 return (0); 113 }