• 正则 挖网站表格复习


    using System;
    using System.Collections.Generic;
    using System.IO;
    using System.Linq;
    using System.Net;
    using System.Text;
    using System.Text.RegularExpressions;
    using System.Web;
    using System.Web.UI;
    using System.Web.UI.WebControls;
    using System.Xml;
    namespace WebApplication19
    {
        /// <summary>
        /// Selects which kind of table cell is extracted from a row.
        /// The member names are intentionally lowercase: the result of
        /// <c>ToString()</c> is inserted into a regular expression as the
        /// HTML tag name, so renaming a member changes what is matched.
        /// </summary>
        public enum SearchRange
        {
            /// <summary>Header cells (<c>&lt;th&gt;</c>); numeric value 0.</summary>
            th,

            /// <summary>Data cells (<c>&lt;td&gt;</c>); numeric value 1.</summary>
            td
        }
        /// <summary>
        /// Web Forms page that, on every load, downloads the page at
        /// <see cref="SourceUrl"/> and uses regular expressions to split the
        /// first table inside the "BOC_main publish" div into header cells
        /// and data-row cells.
        /// </summary>
        public partial class WebForm1 : System.Web.UI.Page
        {
            /// <summary>
            /// Value returned by <see cref="getHtml"/>. Nothing in this class
            /// assigns it, so it stays null unless it is set from outside.
            /// </summary>
            public string MKT;

            // Page that is downloaded and scraped on every load.
            private const string SourceUrl =
                "http://srh.bankofchina.com/search/whpj/search.jsp?erectDate=2001-11-01&nothing=2016-11-04&pjname=1316&page=4";

            // Each pattern captures the element's inner HTML in the named group
            // "inner". [\s\S] is used instead of '.' so the content may span
            // several lines. The match is lazy and therefore ends at the FIRST
            // closing tag; an element nested inside one of the same kind would
            // cut the capture short.
            private static readonly Regex DivRegex = new Regex(
                @"<div\s[^>]*?class=""BOC_main publish""[^>]*>(?<inner>[\s\S]*?)</div>");

            private static readonly Regex TableRegex = new Regex(
                @"<table\b[^>]*>(?<inner>[\s\S]*?)</table>");

            // \b keeps "<tr" from matching longer tag names that merely start with "tr".
            private static readonly Regex RowRegex = new Regex(
                @"<tr\b[^>]*>(?<inner>[\s\S]*?)</tr>");

            /// <summary>
            /// Downloads the source page, extracts the rows of its first table
            /// and splits them into header cells and data cells.
            /// </summary>
            /// <returns>
            /// The current value of <see cref="MKT"/>. Download or read failures
            /// are logged and do not propagate; in that case, or when the page
            /// contains no rows, <see cref="MKT"/> is returned unchanged.
            /// </returns>
            private string getHtml()
            {
                List<string> trList = new List<string>();
                try
                {
                    string content;
                    using (WebClient wc = new WebClient())
                    using (Stream stream = wc.OpenRead(SourceUrl))
                    using (StreamReader sr = new StreamReader(stream, Encoding.UTF8))
                    {
                        content = sr.ReadToEnd();
                    }

                    // Narrow the search step by step: div -> table -> rows.
                    string divContent = FirstInnerHtml(DivRegex, content);
                    string tableContent = FirstInnerHtml(TableRegex, divContent);

                    foreach (Match rowMatch in RowRegex.Matches(tableContent))
                    {
                        trList.Add(rowMatch.Groups["inner"].Value);
                    }
                }
                catch (Exception ex)
                {
                    // Best effort: a failed download must not break the page,
                    // but the failure is recorded instead of being dropped.
                    System.Diagnostics.Trace.TraceError("getHtml failed: {0}", ex);
                }

                if (trList.Count == 0)
                {
                    // Nothing was extracted, so there is no header row to read.
                    return MKT;
                }

                // trList[0] is treated as the header row; every following row
                // is treated as a data row.
                // NOTE(review): the parsed cells are not consumed anywhere yet
                // and MKT is never assigned - confirm what this page is meant
                // to output.
                List<string> thList = GET_TH_OR_TD_LIST(SearchRange.th, trList[0]);
                List<List<string>> tdsList = new List<List<string>>();
                for (int i = 1; i < trList.Count; i++)
                {
                    tdsList.Add(GET_TH_OR_TD_LIST(SearchRange.td, trList[i]));
                }

                return MKT;
            }

            /// <summary>
            /// Returns the "inner" group of the first match of
            /// <paramref name="regex"/> in <paramref name="html"/>, or an empty
            /// string when there is no match.
            /// </summary>
            private static string FirstInnerHtml(Regex regex, string html)
            {
                Match match = regex.Match(html);
                return match.Success ? match.Groups["inner"].Value : string.Empty;
            }

            /// <summary>
            /// Extracts the inner HTML of every cell of the requested kind from
            /// one table row.
            /// </summary>
            /// <param name="range">
            /// Cell kind to extract; its name is used verbatim as the tag name.
            /// </param>
            /// <param name="row">Inner HTML of a single table row.</param>
            /// <returns>
            /// The cell contents in document order; an empty list when
            /// <paramref name="row"/> is null or empty or contains no such cell.
            /// </returns>
            private List<string> GET_TH_OR_TD_LIST(SearchRange range,string row)
            {
                List<string> contentList = new List<string>();
                if (string.IsNullOrEmpty(row))
                {
                    return contentList;
                }

                string tag = range.ToString();
                // \b stops "<th" from also matching "<thead".
                string tdPatern = $@"<{tag}\b[^>]*>(?<tdCell>[\s\S]*?)</{tag}>";
                foreach (Match match in Regex.Matches(row, tdPatern))
                {
                    contentList.Add(match.Groups["tdCell"].Value);
                }

                return contentList;
            }

            /// <summary>
            /// Page load handler: runs the scrape once per request.
            /// </summary>
            protected void Page_Load(object sender, EventArgs e)
            {
                getHtml();
            }
        }
    }
  • 相关阅读:
    Visual Studio 必备神器
    MVC中用Jpaginate分页 So easy!(兼容ie家族)
    仿花田:相亲网站 意中人 已在GitHub上开源
    仿花田:内部相亲网站 意中人(Asp.net MVC,Bootstrap2)
    Myeclipse最全快捷键
    SVG Viewer 3.0安装发现SVG Viewer License.txt无法介入写入,安装失败
    SVG报错error on line 39 at column 26: Namespace prefix xlink for href on script is not defined
    SVG可缩放矢量图形
    Google帮助IE浏览器实现对SVG支持
    JS代码判断IE6,IE7,IE8,IE9的函数代码
  • 原文地址:https://www.cnblogs.com/kexb/p/6035938.html
Copyright © 2020-2023  润新知