• 正则抓取页面信息


    using System;
    using System.Collections.Generic;
    using System.ComponentModel;
    using System.Data;
    using System.Drawing;
    using System.Linq;
    using System.Text;
    using System.Windows.Forms;
    using System.Net;
    using System.IO;
    using System.Text.RegularExpressions;
    using System.Xml.Linq;

    namespace CollectingInformation
    {
        /// <summary>
        /// Main form of the scraper. One button downloads a page, extracts a
        /// title / user name / image URL triple with regular expressions and
        /// appends it to an XML file; the other button renders that XML file
        /// as an HTML table.
        /// </summary>
        public partial class Form1 : Form
        {
            /// <summary>Name of the root element of the XML store.</summary>
            private const string RootElementName = "InfoBy58";

            /// <summary>Charset used when the response does not declare a usable one.</summary>
            private const string FallbackCharset = "gb2312";

            /// <summary>Lazy "any characters, including newlines" regex fragment.</summary>
            private const string AnyText = @"[\s\S]*?";

            // Both expressions are constant, so they are compiled once instead of on
            // every button click.

            /// <summary>Cuts the page down to the part between the first &lt;h1&gt; and the next &lt;h2&gt;.</summary>
            private static readonly Regex HeaderSectionRegex = new Regex(
                "<h1>" + "(" + AnyText + ")" + "<h2>",
                RegexOptions.Compiled | RegexOptions.IgnoreCase);

            /// <summary>
            /// Captures, in order: the &lt;h1&gt; text (group 1), the value following
            /// <c>username:</c> (group 2) and the <c>src</c> of the next &lt;img&gt; (group 3).
            /// </summary>
            private static readonly Regex UserInfoRegex = new Regex(
                "<h1>(.*)</h1>" + AnyText + "username:'(.*)'" + AnyText + "<img src='(.*)'" + AnyText,
                RegexOptions.Compiled | RegexOptions.IgnoreCase);

            public Form1()
            {
                InitializeComponent();
            }

            /// <summary>XML file, next to the executable, that accumulates the scraped records.</summary>
            private readonly string XMLPath = Path.Combine(Application.StartupPath, "58.xml");

            /// <summary>HTML file, next to the executable, produced by the export button.</summary>
            private readonly string HTMLPath = Path.Combine(Application.StartupPath, "58.html");

            /// <summary>
            /// Downloads the page whose URL is in <c>textBox1</c>, extracts one record
            /// from it and appends that record to the XML store. Nothing is written
            /// when the page does not contain exactly one matching record. Any failure
            /// is reported to the user in a message box.
            /// </summary>
            private void btnOK_Click(object sender, EventArgs e)
            {
                string pagePath = textBox1.Text.Trim();
                try
                {
                    EnsureStoreExists();

                    string pageSource = DownloadPage(pagePath);

                    // Narrow the search to the header section of the page first.
                    Match section = HeaderSectionRegex.Match(pageSource);
                    if (!section.Success)
                    {
                        return;
                    }

                    // Only an unambiguous (single) hit is stored.
                    MatchCollection records = UserInfoRegex.Matches(section.Value);
                    if (records.Count != 1)
                    {
                        return;
                    }

                    AppendUserInfo(records[0]);
                }
                catch (Exception ex)
                {
                    MessageBox.Show(ex.Message);
                }
            }

            /// <summary>
            /// Renders every <c>UserInfo</c> record of the XML store as a row of an HTML
            /// table and writes the result to <see cref="HTMLPath"/>, replacing any
            /// previous export. Any failure is reported to the user in a message box.
            /// </summary>
            private void btnExport_Click(object sender, EventArgs e)
            {
                try
                {
                    XElement store = LoadStore();

                    // WriteAllText creates or truncates the file and closes it again,
                    // so no handle is left open between exports. It writes UTF-8.
                    File.WriteAllText(HTMLPath, BuildHtmlReport(store));
                }
                catch (Exception ex)
                {
                    MessageBox.Show(ex.Message);
                }
            }

            /// <summary>Creates an empty XML store if the file does not exist yet.</summary>
            private void EnsureStoreExists()
            {
                if (!File.Exists(XMLPath))
                {
                    new XElement(RootElementName).Save(XMLPath);
                }
            }

            /// <summary>Loads the XML store and returns its root element.</summary>
            /// <exception cref="InvalidDataException">The file has no <c>InfoBy58</c> root element.</exception>
            private XElement LoadStore()
            {
                XElement store = XDocument.Load(XMLPath).Element(RootElementName);
                if (store == null)
                {
                    throw new InvalidDataException(
                        "'" + XMLPath + "' does not contain an <" + RootElementName + "> root element.");
                }
                return store;
            }

            /// <summary>Appends one scraped record to the XML store and saves it.</summary>
            /// <param name="record">A successful match of <see cref="UserInfoRegex"/>.</param>
            private void AppendUserInfo(Match record)
            {
                XElement store = LoadStore();
                store.Add(new XElement("UserInfo",
                    new XElement("Title", record.Groups[1].Value),
                    new XElement("Name", record.Groups[2].Value),
                    // "Tel" holds the src of the captured <img>, not a phone number string.
                    new XElement("Tel", record.Groups[3].Value)));
                store.Document.Save(XMLPath);
            }

            /// <summary>Downloads <paramref name="url"/> and returns the response body as text.</summary>
            private static string DownloadPage(string url)
            {
                WebRequest request = WebRequest.Create(url);

                // ContentType is declared on WebResponse, so no cast to HttpWebResponse
                // is needed; the using blocks release the connection even on failure.
                using (WebResponse response = request.GetResponse())
                using (Stream body = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(body, ResolveEncoding(response.ContentType)))
                {
                    return reader.ReadToEnd();
                }
            }

            /// <summary>
            /// Picks the text encoding named by the <c>charset</c> parameter of a
            /// Content-Type header value, falling back to gb2312 when the parameter is
            /// missing or names an encoding this runtime does not know.
            /// </summary>
            private static Encoding ResolveEncoding(string contentType)
            {
                string charset = null;
                if (!string.IsNullOrEmpty(contentType))
                {
                    // e.g. "text/html; charset=utf-8"
                    foreach (string parameter in contentType.Split(';'))
                    {
                        string[] pair = parameter.Split('=');
                        if (pair.Length == 2
                            && pair[0].Trim().Equals("charset", StringComparison.OrdinalIgnoreCase))
                        {
                            charset = pair[1].Trim().Trim('"', '\'');
                            break;
                        }
                    }
                }

                if (!string.IsNullOrEmpty(charset))
                {
                    try
                    {
                        return Encoding.GetEncoding(charset);
                    }
                    catch (ArgumentException)
                    {
                        // Unknown charset name: use the fallback below.
                    }
                }

                return Encoding.GetEncoding(FallbackCharset);
            }

            /// <summary>Builds the HTML document that lists every <c>UserInfo</c> child of <paramref name="store"/>.</summary>
            private static string BuildHtmlReport(XElement store)
            {
                StringBuilder html = new StringBuilder();
                html.Append("<html>");
                // The file is written as UTF-8; declare it so the headers render correctly.
                html.Append("<head><meta charset=\"utf-8\"></head>");
                html.Append("<body>");
                html.Append("<table border=\"1\">");
                html.Append("<tr>");
                html.Append("<th>标题</th>");
                html.Append("<th>联系人</th>");
                html.Append("<th>电话</th>");
                html.Append("</tr>");

                foreach (XElement item in store.Elements("UserInfo"))
                {
                    html.Append("<tr>");
                    html.Append("<td>" + EncodedChild(item, "Title") + "</td>");
                    html.Append("<td>" + EncodedChild(item, "Name") + "</td>");
                    html.Append("<td><img src='" + EncodedChild(item, "Tel") + "'/></td>");
                    html.Append("</tr>");
                }

                html.Append("</table>");
                html.Append("</body></html>");
                return html.ToString();
            }

            /// <summary>
            /// Returns the HTML-encoded value of the named child element, or null when
            /// the child is absent.
            /// </summary>
            private static string EncodedChild(XElement parent, string childName)
            {
                // The stored values were cut out of raw page markup, so they may already
                // contain entities. Decoding first keeps "&amp;" from turning into
                // "&amp;amp;", and encoding afterwards keeps scraped text from being
                // interpreted as markup in the report.
                string raw = (string)parent.Element(childName);
                return WebUtility.HtmlEncode(WebUtility.HtmlDecode(raw));
            }
        }
    }

  • 相关阅读:
    ASP.NET网站发布注意事项
    jQuery中使用$.ajax提交表单
    DataTable类Clone及Copy方法的区别
    DataList控件使用初步
    一个非常标准的Java连接Oracle数据库的示例代码
    详细解析用C#写的小游戏《彩色连珠》(附源代码)
    VS 2010 复制代码到word出现乱码解决办法
    Java之简单的图片动态显示(实现类似GIF动画效果)
    Class.forName(String driverClassName)加载JDBC驱动程序时,底层都做了些什么???
    Java设计好看的窗体必加的代码(使用内置皮肤控件):
  • 原文地址:https://www.cnblogs.com/contain/p/3285699.html
Copyright © 2020-2023  润新知