• DocIndex


#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

#include "Md5.h"
#include "Url.h"
#include "Document.h"
      7 
      8 using namespace std;
      9 
     10 int main(int argc, char* argv[]) {/*
     11  * DocIndex.cpp
     12  * Created on: 2011-11-9
     13  *   function:
     14  *   将一个原始网页库进行索引,
     15  *   生成网页索引文件Doc.idx
     16  *   和URL索引文件Url.idx
     17  */
     18 
     19     ifstream ifs("Tianwang.raw.1078930288");
     20     if (!ifs) {
     21         cout << "不能打开原始网页库<Tianwang.raw.******>" << endl;
     22         return -1;
     23     }
     24 
     25     ofstream ofsUrl("Url.idx", ios::in | ios::out | ios::trunc | ios::binary);
     26     if (!ofsUrl) {
     27         cout << "不能打开或者创建URL索引文件<Url.idx>" << endl;
     28         cout << "error open file " << endl;
     29     }
     30 
     31     ofstream ofsDoc("Doc.idx", ios::in | ios::out | ios::trunc | ios::binary);
     32     if (!ofsDoc) {
     33         cout << "error open file " << endl;
     34     }
     35 
     36     ofstream ofsDocId2Url("DocId2Url.idx",
     37             ios::in | ios::out | ios::trunc | ios::binary);
     38     if (!ofsDocId2Url) {
     39         cout << "error open file " << endl;
     40     }
     41 
     42     int cnt = 0;
     43     string strLine, strPage;
     44     CUrl iUrl;
     45     CDocument iDocument;
     46     CMD5 iMD5;
     47 
     48     int nOffset = ifs.tellg(); //得到文件读指针距该文件头的字节数
     49     cout << "tellg() is:" << nOffset << endl;
     50     while (getline(ifs, strLine)) {
     51         if (strLine[0] == '\0' || strLine[0] == '#' || strLine[0] == '\n') {
     52             nOffset = ifs.tellg();
     53             continue;
     54         }
     55 
     56         if (!strncmp(strLine.c_str(), "version: 1.0", 12)) {
     57             if (!getline(ifs, strLine))
     58                 break;
     59 
     60             if (!strncmp(strLine.c_str(), "url: ", 4)) {
     61                 iUrl.m_sUrl = strLine.substr(5);//保存url
     62                 iMD5.GenerateMD5((unsigned char*) iUrl.m_sUrl.c_str(),
     63                         iUrl.m_sUrl.size());
     64                 iUrl.m_sChecksum = iMD5.ToString();
     65 
     66             } else {
     67                 continue;
     68             }
     69 
     70             while (getline(ifs, strLine)) {//保存文件长度信息
     71                 if (!strncmp(strLine.c_str(), "length: ", 8)) {
     72                     sscanf(strLine.substr(8).c_str(), "%d",
     73                             &(iDocument.m_nLength));
     74                     break;
     75                 }
     76             }
     77 
     78             getline(ifs, strLine);
     79 
     80             iDocument.m_nDocId = cnt;
     81             iDocument.m_nPos = nOffset;
     82             char *pContent = new char[iDocument.m_nLength + 1];
     83 
     84             memset(pContent, 0, iDocument.m_nLength + 1);
     85             ifs.read(pContent, iDocument.m_nLength);
     86             iMD5.GenerateMD5((unsigned char*) pContent, iDocument.m_nLength);
     87             iDocument.m_sChecksum = iMD5.ToString();
     88 
     89             delete[] pContent;
     90 
     91             ofsUrl << iUrl.m_sChecksum;//MD5值
     92             ofsUrl << "\t" << iDocument.m_nDocId << endl;//doc的Id
     93 
     94             ofsDoc << iDocument.m_nDocId;//文件偏移位置到MD5的映射
     95             ofsDoc << "\t" << iDocument.m_nPos;
     96             //ofsDoc << "\t" << iDocument.m_nLength ;
     97             ofsDoc << "\t" << iDocument.m_sChecksum << endl;
     98 
     99             ofsDocId2Url << iDocument.m_nDocId;//文件编号到url的映射
    100             ofsDocId2Url << "\t" << iUrl.m_sUrl << endl;
    101 
    102             cnt++;
    103         }
    104 
    105         nOffset = ifs.tellg();
    106 
    107     }
    108 
    109     ofsDoc << cnt;
    110     ofsDoc << "\t" << nOffset << endl;
    111 
    112     return (0);
    113 }
  • 相关阅读:
    Vector-Constructors
    C++:多维数组的动态分配(new)和释放(delete)
    C++:多维数组的动态分配(new)和释放(delete)
    COM_利用GetWallpaper()获取墙纸路径
    COM_利用GetWallpaper()获取墙纸路径
    COM 技术相关概念
    COM 技术相关概念
    全排列与next_permutation
    全排列与next_permutation
    屏蔽MFC程序中的ESC键和ENTER键关闭窗口
  • 原文地址:https://www.cnblogs.com/kakamilan/p/2581260.html
Copyright © 2020-2023  润新知