• 使用 C# 采集目标网页


    经常做网站的朋友可能会用到一个叫做“火车头”的采集工具,它可以方便地把网页上的重复数据抓取下来。有时我们需要自己抓取页面数据,而火车头存在一些限制、无法满足需求,这时可以使用 C# 的正则表达式配合字符串处理,自己实现类似火车头的采集功能。

    using System.IO;
    using System.Net;
    using System.Text;
    using System;
    using System.Text.RegularExpressions;
    using System.Collections.Generic;
    
    
    namespace testtaobao {
        /// <summary>
        /// Minimal page-scraping helper: downloads a page, flattens its source text
        /// and extracts the fragments that sit between two delimiters.
        /// </summary>
        public class caiji
        {
            #region Download page content
            /// <summary>
            /// Downloads the resource at <paramref name="url"/> and decodes the
            /// response body as text.
            /// </summary>
            /// <param name="url">Address of the page to download.</param>
            /// <param name="code">Name of the page encoding, for example GB2312.</param>
            /// <returns>The page source.</returns>
            /// <remarks>
            /// Any exception raised while creating the request, receiving the response
            /// or resolving the encoding propagates to the caller unchanged, with its
            /// original stack trace.
            /// </remarks>
            public string gethtml(string url,string code){
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
                // Request timeout in milliseconds.
                request.Timeout = 30000;
                // Ask for a fresh copy instead of a cached one.
                request.Headers.Set("Pragma", "no-cache");

                // Dispose the response, its stream and the reader even when reading
                // fails, so the underlying connection is always released.
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (Stream streamReceive = response.GetResponseStream())
                {
                    Encoding encoding = Encoding.GetEncoding(code);
                    using (StreamReader streamReader = new StreamReader(streamReceive, encoding))
                    {
                        return streamReader.ReadToEnd();
                    }
                }
            }
            #endregion
            #region Strip line breaks
            /// <summary>
            /// Flattens page source so that it is easier to match: removes every
            /// double-quote character and every line-break character.
            /// </summary>
            /// <param name="HtmlCode">HTML source; may be null or empty.</param>
            /// <returns>
            /// The source without double quotes and line breaks, or an empty string
            /// when <paramref name="HtmlCode"/> is null or empty.
            /// </returns>
            public string ReplaceEnter(string HtmlCode)
            {
                if (string.IsNullOrEmpty(HtmlCode))
                {
                    return "";
                }

                // Remove '\r' and '\n' separately so that "\r\n", bare "\n" and bare
                // "\r" line endings are all stripped; '.' in a regex does not match
                // '\n', so a leftover line break would stop a pattern from spanning it.
                return HtmlCode.Replace("\"", "")
                               .Replace("\r", "")
                               .Replace("\n", "");
            }
            #endregion
            #region Run a regular expression
            /// <summary>
            /// Runs a regular expression over a string and returns all matches.
            /// </summary>
            /// <param name="RegexString">Regular expression pattern.</param>
            /// <param name="RemoteStr">Text to search, typically HTML source.</param>
            /// <returns>Every match of the pattern in <paramref name="RemoteStr"/>.</returns>
            public MatchCollection GetRegValue(string RegexString, string RemoteStr)
            {
                // Multiline only changes how ^ and $ behave (they match at line
                // boundaries); it does not make '.' match line breaks.
                Regex r = new Regex(RegexString, RegexOptions.Multiline);
                return r.Matches(RemoteStr);
            }
            #endregion


            #region Extract target strings
            /// <summary>
            /// Collects every fragment of <paramref name="scstr"/> that lies between
            /// <paramref name="fstr"/> and the nearest following <paramref name="estr"/>.
            /// </summary>
            /// <param name="fstr">Text that precedes the target string.</param>
            /// <param name="estr">Text that follows the target string.</param>
            /// <param name="scstr">Source string to search.</param>
            /// <returns>The fragments found, in order of appearance, without the delimiters.</returns>
            /// <remarks>
            /// Both delimiters are inserted into the regular expression as-is, so any
            /// regex metacharacters they contain are interpreted as regex syntax; pass
            /// them through <see cref="Regex.Escape(string)"/> to match them literally.
            /// </remarks>
            public List<string> getstr(string fstr, string estr, string scstr) {
                // Capture the content in a named group instead of deleting the
                // delimiters from the whole match afterwards: deleting would also
                // remove delimiter text that legitimately occurs inside the content.
                string regstr = fstr + @"(?<caijiTarget>.*?)" + estr;
                List<string> rlist = new List<string>();

                foreach (Match match in GetRegValue(regstr, scstr))
                {
                    rlist.Add(match.Groups["caijiTarget"].Value);
                }
                return rlist;
            }
            #endregion
        }
    }
  • 相关阅读:
    将Apache2.4手动安装成Windows的服务
    [译文]PHP千年虫(y2k compliance)
    Apache2.4 authz_core_module模块使用
    Cannot start session without errors, please check errors given in your PHP and/or webserver log file and configure your PHP installation properly.错误
    [转载]开启debug调试模式
    thinkphp 去掉URL 里面的index.php
    在WINDOWS下安装PEAR
    php5.5.15注释问题PHP Deprecated: Comments starting with '#' are deprecated in *.ini 警告解决办法
    Maven 与 IntelliJ IDEA 的完美结合
    JavaRebel 2.0 发布,一个JVM插件
  • 原文地址:https://www.cnblogs.com/lijurui/p/2703221.html
Copyright © 2020-2023  润新知