using System;
using System.Collections.Generic;
using System.IO;
using System.Net;
using System.Text.RegularExpressions;
namespace ConsoleApplication1
{
class Program
{
    /// <summary>
    /// Finds absolute http:// URLs inside downloaded page source.
    /// </summary>
    private static readonly Regex UrlPattern =
        new Regex(@"http://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?");

    /// <summary>
    /// Recognises links that carry one of the file extensions we do not want to crawl.
    /// The dot is escaped so that it only matches a literal '.', not any character.
    /// </summary>
    private static readonly Regex SkippedExtensionPattern =
        new Regex(@"\.(png|jpg|gif|bmp|js|css|xls|doc|pdf|chw|exe|mp3|mp4|avi|swf|xml)");

    /// <summary>
    /// Entry point: crawls starting from a single seed URL, then waits for a key press
    /// so the console window stays open.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        // URLs that still have to be parsed.
        List<string> list = new List<string>();
        // URLs that have already been parsed.
        List<string> listCount = new List<string>();
        list.Add("http://www.baidu.com");
        ReadHtml(list, listCount);
        Console.ReadLine();
    }

    /// <summary>
    /// Downloads every page in <paramref name="list"/> that has not been parsed yet,
    /// prints the URLs found in it, and keeps crawling the discovered URLs.
    /// </summary>
    /// <remarks>
    /// The crawl is iterative (a work queue) rather than recursive, so its depth is not
    /// limited by the call stack, and links found on every page are followed, not only
    /// those found on the last page of a batch. A page that fails to download or parse
    /// is reported on standard error and skipped. The crawl has no page limit: it ends
    /// only when no unvisited link is left.
    /// </remarks>
    /// <param name="list">Seed URLs to parse. The list itself is not modified.</param>
    /// <param name="listCount">
    /// URLs that have already been parsed. Each URL is appended to this list just before
    /// it is requested, so failed pages are recorded too and are never retried.
    /// </param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="list"/> or <paramref name="listCount"/> is null.
    /// </exception>
    public static void ReadHtml(List<string> list, List<string> listCount)
    {
        if (list == null)
        {
            throw new ArgumentNullException("list");
        }
        if (listCount == null)
        {
            throw new ArgumentNullException("listCount");
        }

        // Hash-based mirror of listCount so the "already parsed?" check is O(1).
        HashSet<string> visited = new HashSet<string>(listCount);
        // Copying the seeds also keeps us safe if list and listCount are the same object.
        Queue<string> pending = new Queue<string>(list);

        while (pending.Count > 0)
        {
            string url = pending.Dequeue();
            if (!visited.Add(url))
            {
                // Already parsed (or already attempted) earlier.
                continue;
            }

            // Record the URL before fetching so a failing page is not requested again.
            listCount.Add(url);

            try
            {
                string html = DownloadPage(url);
                foreach (string link in ExtractLinks(html, url))
                {
                    Console.WriteLine(link);
                    pending.Enqueue(link);
                }
                Console.WriteLine("----------------------解析完一个页面!--------------------");
            }
            catch (Exception ex)
            {
                // Best-effort crawl: report the failure and move on to the next URL.
                Console.Error.WriteLine(url + " : " + ex.Message);
            }
        }
    }

    /// <summary>
    /// Fetches the page at <paramref name="url"/> and returns its body decoded as GB2312.
    /// The response, its stream and the reader are all disposed before returning.
    /// </summary>
    /// <param name="url">Address of the page to download.</param>
    /// <returns>The full response body as text.</returns>
    private static string DownloadPage(string url)
    {
        WebRequest request = WebRequest.Create(url);
        using (WebResponse response = request.GetResponse())
        using (Stream body = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(body, System.Text.Encoding.GetEncoding("GB2312")))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Collects the http:// URLs in <paramref name="html"/> that are worth crawling:
    /// links containing the current page's own URL and links with a skipped file
    /// extension are left out.
    /// </summary>
    /// <param name="html">Source text of the page.</param>
    /// <param name="pageUrl">URL the page was downloaded from.</param>
    /// <returns>Matching URLs in document order; duplicates are kept.</returns>
    private static List<string> ExtractLinks(string html, string pageUrl)
    {
        List<string> links = new List<string>();
        foreach (Match match in UrlPattern.Matches(html))
        {
            string candidate = match.Value;
            // Plain substring test: the page URL is data, not a regex pattern.
            bool pointsBackToPage = candidate.Contains(pageUrl);
            if (pointsBackToPage || SkippedExtensionPattern.IsMatch(candidate))
            {
                continue;
            }
            links.Add(candidate);
        }
        return links;
    }
}
}