• asp.net 利用HttpWebRequest自动获取网页编码并获取网页源代码


         /// <summary>
        /// Downloads the page at <paramref name="url"/> and decodes its body with
        /// <paramref name="encoding"/>. Gzip-compressed bodies are decompressed first.
        /// </summary>
        /// <param name="url">Absolute URL of the page to fetch.</param>
        /// <param name="encoding">Encoding used to decode the response body.</param>
        /// <returns>
        /// The decoded body, or <see cref="string.Empty"/> when an argument is null, the
        /// request fails, the status is not 200 OK, or the declared Content-Length is 1 MiB
        /// or more. Redirects are not followed, so a 3xx answer also yields an empty string.
        /// </returns>
        public static string GetHtml(string url, Encoding encoding)
        {
            // Unusable arguments give the same "empty string" result as any other failure,
            // without relying on an exception being thrown and swallowed further down.
            if (string.IsNullOrEmpty(url) || encoding == null)
            {
                return string.Empty;
            }

            try
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
                request.Timeout = 20000;
                request.AllowAutoRedirect = false;

                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                {
                    // ContentLength is -1 when the server does not declare a length, so such
                    // responses pass this check and are read in full.
                    if (response.StatusCode != HttpStatusCode.OK || response.ContentLength >= 1024 * 1024)
                    {
                        return string.Empty;
                    }

                    using (Stream raw = response.GetResponseStream())
                    {
                        Stream body = raw;
                        if ("gzip".Equals(response.ContentEncoding, StringComparison.OrdinalIgnoreCase))
                        {
                            body = new GZipStream(raw, CompressionMode.Decompress);
                        }

                        // Disposing the reader disposes the gzip wrapper (if any) and the raw
                        // stream; the enclosing using blocks then release the response itself.
                        using (StreamReader reader = new StreamReader(body, encoding))
                        {
                            return reader.ReadToEnd();
                        }
                    }
                }
            }
            catch (Exception)
            {
                // Deliberate best effort: callers rely on an empty string for every failure
                // (malformed URL, network error, timeout, corrupt gzip data, ...).
                return string.Empty;
            }
        }
        /// <summary>
        /// Fetches the page at <paramref name="url"/> and determines the name of its character set.
        /// </summary>
        /// <param name="url">Absolute URL of the page to inspect.</param>
        /// <returns>
        /// The first <c>charset=...</c> value found in the page body; otherwise the charset
        /// reported by the response's Content-Type header; otherwise the body name of
        /// <see cref="Encoding.Default"/>. The default is also returned when the request fails,
        /// the status is not 200 OK, or the declared Content-Length is 1 MiB or more.
        /// </returns>
        public static string GetEncoding(string url)
        {
            try
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
                request.Timeout = 20000;
                request.AllowAutoRedirect = false;

                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                {
                    if (response.StatusCode == HttpStatusCode.OK && response.ContentLength < 1024 * 1024)
                    {
                        string html;
                        using (Stream raw = response.GetResponseStream())
                        {
                            Stream body = raw;
                            if ("gzip".Equals(response.ContentEncoding, StringComparison.OrdinalIgnoreCase))
                            {
                                body = new GZipStream(raw, CompressionMode.Decompress);
                            }

                            // The charset declaration itself is plain ASCII, so an ASCII decode
                            // is enough to locate it whatever the page's real encoding is.
                            using (StreamReader reader = new StreamReader(body, Encoding.ASCII))
                            {
                                html = reader.ReadToEnd();
                            }
                        }

                        // Accepts both content="text/html; charset=gb2312" and <meta charset="utf-8">:
                        // an optional opening quote is skipped before the name is captured.
                        Match declared = Regex.Match(
                            html,
                            @"charset\s*=\s*[""']?(?<charset>[-a-zA-Z0-9_.:]+)",
                            RegexOptions.IgnoreCase);
                        if (declared.Success)
                        {
                            return declared.Groups["charset"].Value;
                        }

                        if (!string.IsNullOrEmpty(response.CharacterSet))
                        {
                            return response.CharacterSet;
                        }
                    }
                }
            }
            catch (Exception)
            {
                // Deliberate best effort: any failure (malformed URL, network error, timeout,
                // corrupt gzip data, ...) falls through to the default below.
            }

            return Encoding.Default.BodyName;
        }
     using System; 
     using System.Net; 
     using System.Text; 
     using System.Text.RegularExpressions; 
      
     /// <summary>
     /// Console sample: downloads a few pages, detects each page's encoding from the charset
     /// declared in its HTML, and prints the detected encoding together with the page title.
     /// </summary>
     class Program
     {
       // Fetches the page's HTML, choosing the encoding from the charset declared in the page.
       static string GetHtml(string url)
       {
         return GetHtml(url, null);
       }

       // Fetches the page's HTML using the given encoding; a null encoding means "auto-detect".
       static string GetHtml(string url, Encoding encoding)
       {
         byte[] buf;
         using (WebClient client = new WebClient())
         {
           buf = client.DownloadData(url);
         }
         if (encoding != null) return encoding.GetString(buf);

         // First pass: decode as UTF-8 just to read the charset declaration, then decode the
         // same bytes again only if the page declares a different, known encoding.
         string html = Encoding.UTF8.GetString(buf);
         encoding = GetEncoding(html);
         // Compare code pages rather than references: GetEncoding is not guaranteed to hand
         // back the very same instance as Encoding.UTF8.
         if (encoding == null || encoding.CodePage == Encoding.UTF8.CodePage) return html;
         return encoding.GetString(buf);
       }

       // Extracts the encoding declared by the first charset=... in the HTML.
       // Returns null when no charset is declared or its name is not a supported encoding.
       static Encoding GetEncoding(string html)
       {
         // An optional opening quote is skipped so that <meta charset="utf-8"> is recognised
         // as well as content="text/html; charset=utf-8".
         string pattern = @"(?i)\bcharset\s*=\s*[""']?(?<charset>[-a-zA-Z_0-9]+)";
         Match match = Regex.Match(html, pattern);
         if (!match.Success) return null;
         try { return Encoding.GetEncoding(match.Groups["charset"].Value); }
         catch (ArgumentException) { return null; }
       }

       // Extracts the trimmed text of the first <title> element; empty string when there is none.
       static string GetTitle(string html)
       {
         // (?s) lets '.' span line breaks; the optional group skips any attributes on <title>.
         string pattern = @"(?si)<title(?:\s+(?:""[^""]*""|'[^']*'|[^""'>])*)?>(?<title>.*?)</title>";
         return Regex.Match(html, pattern).Groups["title"].Value.Trim();
       }

       // Prints the detected encoding and the title of the page at the given URL.
       static void PrintEncodingAndTitle(string url)
       {
         string html = GetHtml(url);
         Console.WriteLine("[{0}] [{1}]", GetEncoding(html), GetTitle(html));
       }

       // Program entry point.
       static void Main()
       {
         PrintEncodingAndTitle("http://www.msdn.net/");
         PrintEncodingAndTitle("http://www.cnblogs.com/");
         PrintEncodingAndTitle("http://www.cnblogs.com/skyiv/");
         PrintEncodingAndTitle("http://www.csdn.net/");
         PrintEncodingAndTitle("http://news.163.com/");
       }
     }
     /* 程序输出: 
     [] [MSDN: Microsoft Developer Network] 
     [System.Text.UTF8Encoding] [博客园 - 程序员的网上家园] 
     [System.Text.UTF8Encoding] [空间/IV - 博客园] 
     [System.Text.UTF8Encoding] [CSDN.NET - 中国最大的IT技术社区,为IT专业技术人员提供最全面的信息传播和服务平台] 
     [System.Text.DBCSCodePageEncoding] [新闻中心_网易新闻] 
     */
  • 相关阅读:
    工作日时间,每10分钟执行一次磁盘空间检查,一旦发现任何分区利用率高 于80%,就发送邮件报警
    编写脚本,使用for和while分别实现192.168.0.0/24网段内,地址是否能够ping通,若ping通则输出"success!",若ping不通则输出"fail!"
    显示统计占用系统内存最多的进程,并排序
    总结IP配置方法
    总结ip分类以及每个分类可以分配的IP数量
    总结描述TCP三次握手四次挥手
    描述TCP和UDP区别
    简述osi七层模型和TCP/IP五层模型
    创建一个至少有两个PV组成的大小为20G的名为testvg的VG;要求PE大小 为16MB, 而后在卷组中创建大小为5G的逻辑卷testlv;挂载至/users目录
    【转载】Centos升级gcc至5.4.0
  • 原文地址:https://www.cnblogs.com/yeminglong/p/5132987.html
Copyright © 2020-2023  润新知