• 提取网页中的超链接(C#)


    using System;
    using System.Xml;
    using System.Text;
    using System.Net;
    using System.IO;
    using System.Collections;
    using System.Text.RegularExpressions;

    /// <summary>
    /// Console tool that downloads a web page, extracts the absolute hyperlinks
    /// found in its HTML source and writes them to <c>HyperLinks.xml</c> in the
    /// current directory.
    /// </summary>
    public class App
    {
        // Absolute http/https URLs: scheme, dotted host name, optional path/query.
        // NOTE(review): the path character class contains a literal space, so text
        // following a URL may be captured as part of it -- kept as in the original.
        private static readonly Regex LinkRegex = new Regex(
            @"https?://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?",
            RegexOptions.IgnoreCase);

        // Optional scheme, then the host part (everything up to the first '/', '?'
        // or '#'), whose last label must be one of the recognised suffixes.
        // Group 1 captures the suffix without the leading dot.
        private static readonly Regex DomainSuffixRegex = new Regex(
            @"^(?:[a-z][a-z0-9+.\-]*://)?[^/?#]*\.(com|net|cn|org|gov)(?=[:/?#]|$)",
            RegexOptions.IgnoreCase);

        /// <summary>
        /// Program entry point: prompts for a page address, downloads the page,
        /// extracts its hyperlinks and writes them to the XML file.
        /// </summary>
        public static void Main()
        {
            Console.Write("请输入一个网页地址:");
            string strURL = Console.ReadLine();

            // ReadLine returns null at end of input; treat that like an empty answer.
            if (strURL != null)
            {
                strURL = strURL.Trim();
            }

            if (string.IsNullOrEmpty(strURL))
            {
                Console.WriteLine("网页地址不能为空。");
                return;
            }

            // Only prepend a scheme when none is present. StartsWith is safe for
            // inputs shorter than the prefix and ignores the scheme's casing.
            if (!strURL.StartsWith("http://", StringComparison.OrdinalIgnoreCase) &&
                !strURL.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
            {
                strURL = "http://" + strURL;
            }

            Console.WriteLine("正在获取页面代码,请稍侯...");
            string strCode = GetPageSource(strURL);

            Console.WriteLine("正在提取超链接,请稍侯...");
            ArrayList alLinks = GetHyperLinks(strCode);

            Console.WriteLine("正在写入文件,请稍侯...");
            WriteToXml(strURL, alLinks);
        }

        /// <summary>
        /// Downloads the HTML source of the given page.
        /// </summary>
        /// <param name="URL">Absolute http or https address of the page.</param>
        /// <returns>The response body decoded as GB2312 text.</returns>
        private static string GetPageSource(string URL)
        {
            Uri uri = new Uri(URL);

            HttpWebRequest hwReq = (HttpWebRequest)WebRequest.Create(uri);

            // Configure the request BEFORE it is sent; values assigned after
            // GetResponse() cannot influence a request that already went out.
            hwReq.Method = "GET";
            hwReq.KeepAlive = false;

            // NOTE(review): the encoding is hard-coded to GB2312 as in the original;
            // pages served in another charset will decode non-ASCII text incorrectly.
            // Depending on the runtime, this code page may need an encoding provider
            // to be registered -- confirm for the target framework.
            using (HttpWebResponse hwRes = (HttpWebResponse)hwReq.GetResponse())
            using (StreamReader reader = new StreamReader(
                hwRes.GetResponseStream(), Encoding.GetEncoding("GB2312")))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>
        /// Extracts the distinct absolute http/https URLs found in a piece of HTML.
        /// </summary>
        /// <param name="htmlCode">HTML source to scan.</param>
        /// <returns>
        /// A sorted <see cref="ArrayList"/> of unique URL strings (duplicates are
        /// detected with a case-sensitive comparison).
        /// </returns>
        private static ArrayList GetHyperLinks(string htmlCode)
        {
            ArrayList al = new ArrayList();

            // Hash-based membership test instead of rescanning the list per match.
            Hashtable seen = new Hashtable();

            foreach (Match match in LinkRegex.Matches(htmlCode))
            {
                string url = match.Value;
                if (!seen.ContainsKey(url))
                {
                    seen.Add(url, null);
                    al.Add(url);
                }
            }

            al.Sort();

            return al;
        }

        /// <summary>
        /// Writes the extracted links to <c>HyperLinks.xml</c>, one element per
        /// link, where the element name is the link's domain suffix (see
        /// <see cref="GetDomain"/>) and the element text is the URL.
        /// </summary>
        /// <param name="strURL">Address of the page the links were taken from.</param>
        /// <param name="alHyperLinks">URL strings to write.</param>
        private static void WriteToXml(string strURL, ArrayList alHyperLinks)
        {
            // "using" closes (and thereby flushes) the file even if writing fails.
            using (XmlTextWriter writer = new XmlTextWriter("HyperLinks.xml", Encoding.UTF8))
            {
                writer.Formatting = Formatting.Indented;
                writer.WriteStartDocument(false);
                writer.WriteDocType("HyperLinks", null, "urls.dtd", null);

                // "--" is not allowed inside an XML comment and makes WriteComment
                // throw, so break up runs of hyphens coming from the URL.
                string safeUrl = Regex.Replace(strURL, "-(?=-)", "- ");
                writer.WriteComment("提取自" + safeUrl + "的超链接");

                // NOTE(review): two nested <HyperLinks> elements are written, exactly
                // as in the original; confirm against urls.dtd whether that is intended.
                writer.WriteStartElement("HyperLinks");
                writer.WriteStartElement("HyperLinks", null);
                writer.WriteAttributeString("DateTime", DateTime.Now.ToString());

                foreach (string link in alHyperLinks)
                {
                    writer.WriteElementString(GetDomain(link), null, link);
                }

                writer.WriteEndElement();
                writer.WriteEndElement();
            }
        }

        /// <summary>
        /// Returns the domain suffix of a URL's host.
        /// </summary>
        /// <param name="strURL">URL to inspect; the scheme is optional.</param>
        /// <returns>
        /// The suffix as written in the URL, without the dot ("com", "net", "cn",
        /// "org" or "gov", matched case-insensitively), or "other" when the host
        /// does not end in one of them.
        /// </returns>
        private static string GetDomain(string strURL)
        {
            if (string.IsNullOrEmpty(strURL))
            {
                return "other";
            }

            // Only the host is examined, so a ".net/" inside the path is ignored and
            // a URL without a trailing slash is still recognised.
            Match m = DomainSuffixRegex.Match(strURL);

            return m.Success ? m.Groups[1].Value : "other";
        }
    }

  • 相关阅读:
    240. Search a 2D Matrix II
    436. Find Right Interval
    378. Kth Smallest Element in a Sorted Matrix
    278. First Bad Version
    374. Guess Number Higher or Lower
    207. Course Schedule
    Java enum的用法详解
    Android中RelativeLayout各个属性 android:layout_alignParentLeft=”true”找不到有时候
    android:layout_gravity和android:gravity的区别
    Android 相对布局 扩展
  • 原文地址:https://www.cnblogs.com/superstar/p/2011292.html
Copyright © 2020-2023  润新知