• asp.net 网页抓取内容


    网页抓取代码

    复制代码
    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Web;
    //
    using System.Net;
    using System.IO;
    using System.Text.RegularExpressions;
    using System.Text;
    
    namespace WSYL.Web.Common
    {
        /// <summary>
        /// Downloads a fixed web page and extracts steamship information from
        /// the table cells that match a known markup pattern.
        /// </summary>
        public static class GetSteamShipInfo
        {
            // Placeholder for the page to crawl; it must be replaced with a real
            // absolute URL before GetWebSite can issue a request.
            private const string CrawlUrl = @"网址";

            // The response body is decoded with this charset.
            private const string HtmlCharset = "utf-8";

            // Upper bound (ms) for connecting and for each read, so that a stalled
            // page cannot block the caller indefinitely.
            private const int RequestTimeoutMilliseconds = 30000;

            // Table cell that carries the wanted value; group 1 is the cell text.
            private static readonly Regex InfoCellRegex = new Regex(
                "<td align=\"left\" bgcolor=\"#EEEEEE\">([^<]*)</td>",
                RegexOptions.IgnoreCase | RegexOptions.Multiline);

            // Any HTML tag, including tags that span several lines.
            private static readonly Regex HtmlTagRegex = new Regex("<(.|\n)+?>");

            // One or more consecutive whitespace characters.
            private static readonly Regex WhitespaceRunRegex = new Regex(@"\s+");

            /// <summary>
            /// Fetches the crawl page and returns the requested piece of information.
            /// </summary>
            /// <param name="steamshipname">
            /// Steamship name. It is only validated here; the request URL is a fixed
            /// constant and does not currently include this value.
            /// </param>
            /// <param name="itype">
            /// 0 = text of the first matching cell; 1 = the next matching cell with
            /// tags stripped; 2 = both joined with "/". Any other value returns the
            /// complete downloaded HTML.
            /// </param>
            /// <returns>
            /// <c>null</c> when <paramref name="steamshipname"/> is null or blank;
            /// an empty string when no non-empty matching cell is found; otherwise
            /// the value selected by <paramref name="itype"/>.
            /// </returns>
            public static string GetWebSite(string steamshipname, int itype)
            {
                if (steamshipname == null || steamshipname.Trim().Length == 0)
                {
                    return null;
                }

                // Step 1: download the page.
                string respHtml = DownloadHtml(CrawlUrl);

                // Step 2: locate the first cell that matches the known markup.
                Match firstCell = InfoCellRegex.Match(respHtml);
                string firstValue = firstCell.Groups[1].Value;
                if (firstValue.Length == 0)
                {
                    return "";
                }

                // Step 3: pick the requested piece(s).
                switch (itype)
                {
                    case 0:
                        return firstValue;
                    case 1:
                        return StripHtml(firstCell.NextMatch().Value);
                    case 2:
                        return firstValue + "/" + StripHtml(firstCell.NextMatch().Value);
                    default:
                        // Unknown types have always yielded the raw page; kept for
                        // backward compatibility with existing callers.
                        return respHtml;
                }
            }

            /// <summary>
            /// Performs an HTTP GET on <paramref name="url"/> and returns the body
            /// decoded with <see cref="HtmlCharset"/>.
            /// </summary>
            /// <param name="url">Absolute URL of the page to download.</param>
            /// <returns>The complete response body as text.</returns>
            private static string DownloadHtml(string url)
            {
                HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
                req.Method = "GET";
                req.Timeout = RequestTimeoutMilliseconds;
                req.ReadWriteTimeout = RequestTimeoutMilliseconds;

                Encoding htmlEncoding = Encoding.GetEncoding(HtmlCharset);

                // Dispose the response and the reader so the underlying connection
                // is released even when reading fails.
                using (HttpWebResponse resp = (HttpWebResponse)req.GetResponse())
                using (StreamReader sr = new StreamReader(resp.GetResponseStream(), htmlEncoding))
                {
                    return sr.ReadToEnd();
                }
            }

            /// <summary>
            /// Removes HTML tags, escapes any remaining angle brackets and collapses
            /// runs of whitespace into a single space.
            /// </summary>
            /// <param name="strHtml">Markup to clean.</param>
            /// <returns>The cleaned, trimmed text.</returns>
            private static string StripHtml(string strHtml)
            {
                string strOutput = HtmlTagRegex.Replace(strHtml, "");

                // Angle brackets that were not part of a complete tag are escaped.
                strOutput = strOutput.Replace("<", "&lt;");
                strOutput = strOutput.Replace(">", "&gt;");

                // Collapse every run of whitespace into one space.
                strOutput = WhitespaceRunRegex.Replace(strOutput, " ");
                return strOutput.Trim();
            }
        }
    }
    复制代码
    走在通往梦想国度的路上,加油!
  • 相关阅读:
    aop 切面编程
    动态代理模式
    idea 从接口方法 跳转到 实现类 对应的方法
    2019年的某一天
    javaweb . 页面登出 操作
    Could not find acceptable representation报错
    window下mysql character_set_server修改不生效问题
    git 命令推送
    spring boot 2.x + elasticsearch+mybatis-plus
    java8的Consumer函数式接口
  • 原文地址:https://www.cnblogs.com/hs8888/p/5520564.html
Copyright © 2020-2023  润新知