• data filter 去掉HTML文件中的所有标记


    编写一个C++程序来读取文件,过滤掉所有的标记,将过滤掉标记后的内容输出到一个新文件中。

    1. 从文件中读取一个字符

    2. 确定字符是否是HTML标记的一部分

    3. 打印出所有不是HTML标记的字符

    /* --------------------------------------------
     * This program reads an HTML file and writes
     * the text without the tags to a new file.
     * --------------------------------------------*/
    
    #include <iostream> // Required for cin, cout, cerr
    #include <fstream>  // Required for ifstream, ofstream
    #include <string>   // Required for string
    #include <cstdlib>  // Required for exit
    
    using namespace std;
    
    /*
     * Entry point: strips HTML tags from a file.
     *
     * Prompts on standard input for an input file name and an output file
     * name, then copies the input to the output character by character,
     * dropping everything from a '<' up to and including the next '>'.
     *
     * This is a two-state filter only: it has no notion of comments, quoted
     * attribute values, scripts, or character entities, so a '>' inside an
     * attribute value ends the tag early and entities are copied verbatim.
     *
     * Returns 0 on success; terminates with exit status 1 if either file
     * cannot be opened.
     */
    int main()
    {
        // Visual separator printed between the interactive steps.
        const char* const divider =
            "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n";

        // Prompt user for name of input file
        std::cout << "Enter the name of the input file : \n( *.*, such as : demo.html ) \n";
        std::cout << "Make sure the file is under current project file ! \n";
        std::string infile;
        std::cin >> infile;

        std::cout << divider;

        // Prompt user for name of output file
        std::cout << "Enter the name of the output file :  ";
        std::string outfile;
        std::cin >> outfile;

        // Open the input file; nothing can be done without it.
        std::ifstream html;
        html.open(infile.c_str());
        if (html.fail())
        {
            std::cout << divider;
            std::cerr << "Error opening input file" << std::endl;
            exit(1);
        }

        // Open the output file; report failure instead of silently
        // discarding the filtered text and claiming success.
        std::ofstream htmltext;
        htmltext.open(outfile.c_str());
        if (htmltext.fail())
        {
            std::cout << divider;
            std::cerr << "Error opening output file" << std::endl;
            exit(1);
        }

        // true  -> currently in plain text (characters are copied)
        // false -> currently inside a tag (characters are dropped)
        bool text_state = true;
        char ch;

        // Testing the result of get() stops the loop on end-of-file and on
        // any other read failure, so a stream error cannot spin forever.
        while (html.get(ch))
        {
            if (text_state)
            {
                if (ch == '<')
                {
                    // Beginning of a tag: switch to the tag state.
                    text_state = false;
                }
                else
                {
                    // Still text: write it to the output file.
                    htmltext << ch;
                }
            }
            else if (ch == '>')
            {
                // End of the tag: switch back to the text state.
                text_state = true;
            }
        }

        html.close();
        htmltext.close();

        std::cout << divider;
        std::cout << "Success transformed ! \n";
        std::cout << "Look for " << outfile << " in current file.\n";
        std::cout << divider;

        return 0;
    }
    

    之后就可以拿个HTML文件试试了。不过这个程序只是简单地把所有标记过滤掉,还有待完善:如果标记之外还有很多无关内容(例如脚本或样式代码),效果就不尽如人意。建议用典型的HTML文件测试,如:

    <html>
    
    <head>
    <title>我的第一个 HTML 页面</title>
    </head>
    
    <body>
    <p>body 元素的内容会显示在浏览器中。</p>
    <p>title 元素的内容会显示在浏览器的标题栏中。</p>
    </body>
    
    </html>
    


  • 相关阅读:
    Error和Exception的区别
    当try和finally都包含return时的执行顺序
    String,StringBuffer处理字符串的区别
    使用idea对XML的增删改查
    IO流,字节流复制文件,字符流+缓冲复制文件
    MySQL同步故障:" Slave_SQL_Running:No" 主从同步的从表进行了写操作
    常用MQ的对比冷知识
    Redis-避免缓存穿透
    Docker容器与虚拟化技术——部署KVM虚拟化平台
    HTML日记 第三篇 关于图片的冷知识(附带一些浮动的基础知识)
  • 原文地址:https://www.cnblogs.com/Genesis2018/p/9079829.html
Copyright © 2020-2023  润新知