• .NET2.0抓取网页全部链接(转)


     

    .NET2.0抓取网页全部链接

    作者:清清月儿

    主页:http://blog.csdn.net/21aspnet/           时间:2007.4.18 

    该方法经过对各大门户网站的测试,抓取率为100%!
    后台代码:

    using System;
    using System.Data;
    using System.Configuration;
    using System.Web;
    using System.Web.Security;
    using System.Web.UI;
    using System.Web.UI.WebControls;
    using System.Web.UI.WebControls.WebParts;
    using System.Web.UI.HtmlControls;
    using System.Text.RegularExpressions;
    using System.Net;
    using System.IO;
    using System.Collections;

    /// <summary>
    /// Code-behind for Default.aspx: downloads the page whose address is typed into
    /// <c>TextBox1</c> and lists every distinct absolute <c>http://</c> link found in
    /// its source in <c>TextBox2</c>, one per line.
    /// </summary>
    public partial class _Default : System.Web.UI.Page
    {
        /// <summary>
        /// Matches absolute http:// URLs: one or more dot-separated host labels followed
        /// by an optional path/query made of word characters and <c>- ./?%&amp;=</c>.
        /// Built once because the pattern never changes between requests.
        /// </summary>
        private static readonly Regex LinkPattern = new Regex(
            @"http://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?",
            RegexOptions.IgnoreCase);

        /// <summary>
        /// Page load handler. The page needs no initialisation on first load or on
        /// postback, so the method is intentionally empty; it is kept so that any
        /// existing event wiring to it continues to resolve.
        /// </summary>
        protected void Page_Load(object sender, EventArgs e)
        {
        }

        /// <summary>
        /// Click handler for the extract button: fetches the URL in <c>TextBox1</c> and
        /// writes the distinct links found in the response to <c>TextBox2</c>.
        /// </summary>
        /// <remarks>
        /// If the input is not an absolute http/https URL the output box is simply left
        /// empty. Network failures (<see cref="WebException"/>) are not caught here and
        /// propagate to the page's normal error handling, as before.
        /// </remarks>
        protected void Button1_Click(object sender, EventArgs e)
        {
            TextBox2.Text = "";

            // Reject empty/malformed input and non-web schemes up front instead of
            // failing with UriFormatException / InvalidCastException further down.
            Uri target;
            if (!Uri.TryCreate(this.TextBox1.Text.Trim(), UriKind.Absolute, out target)
                || (target.Scheme != Uri.UriSchemeHttp && target.Scheme != Uri.UriSchemeHttps))
            {
                return;
            }

            TextBox2.Text = ExtractLinks(DownloadPage(target));
        }

        /// <summary>Downloads the resource at <paramref name="address"/> and returns its body as text.</summary>
        /// <param name="address">Absolute http/https address to fetch.</param>
        /// <returns>The complete response body.</returns>
        /// <exception cref="WebException">The request failed or the server returned an error status.</exception>
        private static string DownloadPage(Uri address)
        {
            // NOTE(review): the address comes straight from user input, so this page can
            // be used to make the server request internal hosts (SSRF). Restrict the
            // allowed hosts before exposing it outside a trusted network.
            WebRequest request = WebRequest.Create(address);

            // 'using' guarantees the connection and stream are released even when
            // reading the body throws.
            using (WebResponse response = request.GetResponse())
            using (StreamReader reader = new StreamReader(response.GetResponseStream()))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>
        /// Finds every link in <paramref name="html"/> and returns the distinct ones in
        /// order of first appearance, each followed by a newline.
        /// </summary>
        /// <param name="html">Page source to scan.</param>
        /// <returns>Newline-terminated list of distinct links; empty when none are found.</returns>
        private static string ExtractLinks(string html)
        {
            // Keys are the links already emitted. The original kept a list for this but
            // never added to it, so duplicates were never filtered out.
            Hashtable seen = new Hashtable();
            System.Text.StringBuilder output = new System.Text.StringBuilder();

            foreach (Match match in LinkPattern.Matches(html))
            {
                string link = match.Value;
                if (seen.ContainsKey(link))
                {
                    continue;
                }

                seen.Add(link, null);
                output.Append(link).Append("\n");
            }

            return output.ToString();
        }
    }

    前台
    <%@ Page Language="C#" AutoEventWireup="true"  CodeFile="Default.aspx.cs" Inherits="_Default" %>
    <%-- Markup for the link-extraction page. Its code-behind class is _Default in Default.aspx.cs. --%>

    <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

    <html xmlns="http://www.w3.org/1999/xhtml" >
    <head runat="server">
        <title>抓取网页所有链接</title>
       
    </head>
    <body >
        <form id="form1" runat="server">
        <div>
            <%-- TextBox1: the address of the page to scan, read by Button1_Click. --%>
            <asp:TextBox ID="TextBox1" runat="server" Width="481px"></asp:TextBox>
            <%-- Button1: posts back and runs Button1_Click in the code-behind. --%>
            <asp:Button ID="Button1" runat="server" OnClick="Button1_Click" Text="提取" />
            <br />
            <%-- TextBox2: multi-line output box that Button1_Click fills with the extracted links. --%>
            <asp:TextBox ID="TextBox2" runat="server" Height="304px" TextMode="MultiLine" Width="524px"></asp:TextBox></div>
        </form>
    </body>
    </html>

    点评:
    精髓所在: string p = @"http://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?";

  • 相关阅读:
    智器SmartQ T7实体店试用体验
    BI笔记之SSAS库Process的几种方案
    PowerTip of the Day from powershell.com上周汇总(八)
    PowerTip of the Day2010071420100716 summary
    PowerTip of the Day from powershell.com上周汇总(十)
    PowerTip of the Day from powershell.com上周汇总(六)
    重新整理Cellset转Datatable
    自动加密web.config配置节批处理
    与DotNet数据对象结合的自定义数据对象设计 (二) 数据集合与DataTable
    在VS2003中以ClassLibrary工程的方式管理Web工程.
  • 原文地址:https://www.cnblogs.com/ami/p/720621.html
Copyright © 2020-2023  润新知