• 抓取网页文本内容


    抓取网页文本时使用的是 WebRequest 类,这里以 http://novel.hongxiu.com/a/1036665/10425842.html 为例。

    代码如下:

    using System;
    using System.Collections.Generic;
    using System.ComponentModel;
    using System.Data;
    using System.Drawing;
    using System.IO;
    using System.Net;
    using System.Text;
    using System.Text.RegularExpressions;
    using System.Windows.Forms;
    
    namespace 网页抓取
    {
        /// <summary>
        /// Form that downloads the page whose URL is held in <c>label1</c>, strips the
        /// markup tags, and extracts the passage between two fixed marker strings.
        /// The passage is written to a text file and shown in <c>richTextBox1</c>.
        /// </summary>
        public partial class Form1 : Form
        {
            // Matches one markup tag. "(.|\n)" lets the tag body span line breaks and
            // the lazy "+?" stops at the first closing '>'.
            private static readonly Regex TagPattern = new Regex("<(.|\n)+?>");

            // Marker strings that delimit the wanted passage in the stripped page text.
            private const string StartMarker = "红|袖|言|情|小|说";
            private const string EndMarker = "但是什么?";

            // Characters kept counting from the first character of the end marker.
            // The marker is five characters long, so one character after it is kept too.
            // NOTE(review): value carried over unchanged from the original "+6" - confirm it is intended.
            private const int EndMarkerSpan = 6;

            // Verbatim string: in a regular literal "\x1" is a hex escape (U+0001),
            // which would produce an invalid path.
            private const string OutputPath = @"E:\x1.txt";

            /// <summary>
            /// Creates the form and runs the designer-generated initialization.
            /// </summary>
            public Form1()
            {
                InitializeComponent();
            }

            /// <summary>
            /// Downloads the page at the URL in <c>label1.Text</c>, removes its tags, cuts out
            /// the text from <see cref="StartMarker"/> through <see cref="EndMarker"/>, saves
            /// it to <see cref="OutputPath"/> and displays it in <c>richTextBox1</c>.
            /// </summary>
            /// <remarks>
            /// The request is synchronous, so the calling thread blocks until the download
            /// finishes. Network and file errors are not caught here and propagate to the caller.
            /// If either marker is missing, a message box is shown and nothing is written.
            /// </remarks>
            public void zhuaqu()
            {
                string html;
                WebRequest request = WebRequest.Create(label1.Text);

                // Dispose the response and reader even when reading fails, so the
                // underlying connection is always released.
                using (WebResponse response = request.GetResponse())
                using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.UTF8))
                {
                    html = reader.ReadToEnd();
                }

                string text = thtxt(html);

                // Ordinal comparison: the markers are fixed literals, not linguistic text.
                int start = text.IndexOf(StartMarker, StringComparison.Ordinal);

                // Look for the end marker only after the start marker so the span can
                // never have a negative length.
                int end = start < 0 ? -1 : text.IndexOf(EndMarker, start, StringComparison.Ordinal);

                if (start < 0 || end < 0)
                {
                    MessageBox.Show("未在网页文本中找到要截取的起止标记。");
                    return;
                }

                // Clamp so the extra trailing characters never run past the end of the text.
                int length = Math.Min(end - start + EndMarkerSpan, text.Length - start);
                string excerpt = text.Substring(start, length);

                using (StreamWriter writer = new StreamWriter(OutputPath))
                {
                    writer.WriteLine(excerpt);
                }

                richTextBox1.Text = excerpt;
            }

            /// <summary>
            /// Click handler for <c>button1</c>: starts the download-and-extract run.
            /// </summary>
            private void button1_Click(object sender, EventArgs e)
            {
                zhuaqu();
            }

            /// <summary>
            /// Removes markup tags from an HTML string.
            /// </summary>
            /// <param name="Html">HTML source text; null or empty yields an empty string.</param>
            /// <returns>
            /// The input with every tag match removed and any leftover '>' characters
            /// dropped. Whitespace is left untouched.
            /// </returns>
            private string thtxt(string Html)
            {
                if (string.IsNullOrEmpty(Html))
                {
                    return string.Empty;
                }

                string text = TagPattern.Replace(Html, "");

                // NOTE(review): the original also called Replace("<", "<"), which is a no-op
                // (possibly an HTML entity lost when the listing was published); it is omitted.
                // Stray '>' characters that survive tag removal are dropped, as before.
                text = text.Replace(">", "");
                return text;
            }
        }
    }

    运行效果

  • 相关阅读:
    Codeforces Round #603 (Div. 2)
    【bzoj1997】[Hnoi2010]Planar(平面图+2-sat)
    【poj3207】Ikki's Story IV
    【HDU1814】Peaceful Commission(2-sat+暴力染色)
    Educational Codeforces Round 77 (Rated for Div. 2)
    【hdu3311】Dig The Wells(斯坦纳树+dp)
    [USACO3.3] A Game
    [TJOI2013] 单词
    [USACO3.3] Home on the Range
    [NOI2011] 阿狸的打字机
  • 原文地址:https://www.cnblogs.com/happinesshappy/p/4579410.html
Copyright © 2020-2023  润新知