• 提取网页中的超级链接


    using System;
    using System.Xml;
    using System.Text;
    using System.Net;
    using System.IO;
    using System.Collections;
    using System.Text.RegularExpressions;

    public class App
    {
    /// <summary>
    /// Console entry point: prompts for a web page address, downloads the page,
    /// extracts the hyperlinks it contains and writes them to an XML file.
    /// </summary>
    public static void Main()
    {
        string strCode;
        ArrayList alLinks;

        Console.Write("请输入一个网页地址:");
        string strURL = Console.ReadLine();

        // ReadLine() returns null at end of input, and a blank line carries no address;
        // neither can be turned into a usable URL, so stop here instead of crashing later.
        if (strURL == null || strURL.Trim().Length == 0)
        {
            Console.WriteLine("未输入网页地址。");
            return;
        }

        strURL = strURL.Trim();

        // StartsWith never throws on input shorter than the prefix (Substring(0,7) did),
        // and an address that already names the https scheme must not get "http://" prepended.
        bool hasScheme = strURL.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
            || strURL.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
        if (!hasScheme)
        {
            strURL = @"http://" + strURL;
        }

        Console.WriteLine("正在获取页面代码,请稍侯...");
        strCode = GetPageSource(strURL);

        Console.WriteLine("正在提取超链接,请稍侯...");
        alLinks = GetHyperLinks(strCode);

        Console.WriteLine("正在写入文件,请稍侯...");
        WriteToXml(strURL, alLinks);
    }

    /// <summary>
    /// Downloads the resource at the given address and returns its body as text.
    /// </summary>
    /// <param name="URL">Absolute address of the page to fetch.</param>
    /// <returns>The response body decoded with the GB2312 encoding.</returns>
    static string GetPageSource(string URL)
    {
        Uri uri = new Uri(URL);

        HttpWebRequest hwReq = (HttpWebRequest)WebRequest.Create(uri);

        // Configure the request before GetResponse(): options assigned after the
        // request has been sent cannot influence it.
        hwReq.Method = "GET";
        hwReq.KeepAlive = false;

        // Dispose the response, its stream and the reader even when reading fails,
        // so the underlying connection is always released.
        using (HttpWebResponse hwRes = (HttpWebResponse)hwReq.GetResponse())
        using (Stream body = hwRes.GetResponseStream())
        using (StreamReader reader = new StreamReader(body, System.Text.Encoding.GetEncoding("GB2312")))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Extracts the distinct http:// addresses that occur in a piece of HTML text.
    /// </summary>
    /// <param name="htmlCode">Text to scan; null or empty yields an empty list.</param>
    /// <returns>
    /// An ArrayList of strings holding each matched address once, ordered by
    /// ArrayList.Sort() with its default comparer.
    /// </returns>
    static ArrayList GetHyperLinks(string htmlCode)
    {
        ArrayList al = new ArrayList();

        if (string.IsNullOrEmpty(htmlCode))
        {
            return al;
        }

        string strRegex = @"http://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?";
        Regex r = new Regex(strRegex, RegexOptions.IgnoreCase);

        // Track addresses already collected in a hash table so duplicate filtering
        // is one lookup per match instead of a scan over the whole result list.
        Hashtable seen = new Hashtable();

        foreach (Match match in r.Matches(htmlCode))
        {
            string url = match.Value;

            if (!seen.ContainsKey(url))
            {
                seen.Add(url, null);
                al.Add(url);
            }
        }

        al.Sort();

        return al;
    }

    /// <summary>
    /// Writes the extracted addresses to the file "HyperLinks.xml" in the current directory.
    /// </summary>
    /// <param name="strURL">Address of the page the links came from; recorded in an XML comment.</param>
    /// <param name="alHyperLinks">Addresses (strings) to write, one element each.</param>
    static void WriteToXml(string strURL, ArrayList alHyperLinks)
    {
        // XML comments may not contain "--"; separate adjacent hyphens so an address
        // such as "http://a--b.com" cannot make WriteComment reject the text.
        string commentUrl = strURL == null ? "" : Regex.Replace(strURL, "-(?=-)", "- ");

        // The using block closes the file even when one of the writes below throws.
        using (XmlTextWriter writer = new XmlTextWriter("HyperLinks.xml", Encoding.UTF8))
        {
            writer.Formatting = Formatting.Indented;
            writer.WriteStartDocument(false);
            writer.WriteDocType("HyperLinks", null, "urls.dtd", null);
            writer.WriteComment("提取自" + commentUrl + "的超链接");

            // NOTE(review): the root element contains a second element of the same name.
            // Kept unchanged because readers of this file (and urls.dtd) may depend on
            // that shape - confirm before flattening.
            writer.WriteStartElement("HyperLinks");
            writer.WriteStartElement("HyperLinks", null);
            writer.WriteAttributeString("DateTime", DateTime.Now.ToString());

            foreach (string str in alHyperLinks)
            {
                // The element name is the domain suffix label; the element text is the address.
                string title = GetDomain(str);
                string body = str;
                writer.WriteElementString(title, null, body);
            }

            writer.WriteEndElement();
            writer.WriteEndElement();

            writer.Flush();
        }
    }

    /// <summary>
    /// Classifies an address by the last label of its host name.
    /// </summary>
    /// <param name="strURL">Address to classify; the scheme prefix is optional.</param>
    /// <returns>
    /// "com", "net", "cn", "org" or "gov" (in the casing used by the address) when the
    /// host ends with that label; otherwise "other".
    /// </returns>
    static string GetDomain(string strURL)
    {
        if (string.IsNullOrEmpty(strURL))
        {
            return "other";
        }

        // Look only at the host part (everything before the first '/', '?' or '#'):
        //  - a suffix that merely appears inside the path is not the site's domain;
        //  - the suffix may be followed by a port, a path, or nothing at all, so an
        //    address without a trailing slash is classified too.
        string strRegex = @"^(?:[a-z][a-z0-9+.\-]*://)?[^/?#]*\.(com|net|cn|org|gov)(?::\d+)?(?=[/?#]|$)";

        Match m = Regex.Match(strURL, strRegex, RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);

        if (!m.Success)
        {
            return "other";
        }

        return m.Groups[1].Value;
    }

  • 相关阅读:
    C文件读写函数介绍(转)
    字节存储排序:大端和小端的判别及转换
    vc++上的MFC的对象序列化和反序列化
    unicode下各种类型转换,CString,string,char*,int,char[]
    CString与std::string unicode下相互转化
    VS2010每次编译都重新编译整个工程的解决方案
    Windows下用C语言获取进程cpu使用率,内存使用,IO情况
    hadoop 安装
    python---pyc pyo文件详解
    C 高级编程 2 内存管理
  • 原文地址:https://www.cnblogs.com/zhangpengshou/p/1699886.html
Copyright © 2020-2023  润新知