毕业设计准备做搜索方面,所以开始写爬虫程序,想法是这样,从一个网站开始,抓内容,分析页面,获取页面所有链接,将链接放到UrlList列表,然后索引,一直不断循环。这星期一直在学习多线程,下面是抓取页面内容的代码,先做个备忘先。
开始事件,以及线程函数
private void Start_Click(object sender, EventArgs e)
{
startUrl = StartUrl.Text.Trim();
//获取网页内容的线程数
if (!string.IsNullOrEmpty(ThreadCount.Text.Trim()))
{
threadCount = int.Parse(ThreadCount.Text.Trim());
}
else
{
threadCount = 1;
}
//获取链接线程数
if (!string.IsNullOrEmpty(GetUrlThreadCount.Text.Trim()))
{
getUrlThreadCount = int.Parse(GetUrlThreadCount.Text.Trim());
}
else
{
getUrlThreadCount = 1;
}
if (startUrl == null)
{
MessageBox.Show("请输入链接地址");
return;
}
else
{
Regex re = new Regex(@"http(s)?://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?");
if (!re.Match(startUrl).Success)
{
MessageBox.Show("链接格式错误");
return;
}
else
{
urllist.Url.Add(startUrl);
urllist.IsDownload.Add(Encrypt.MD5EncryptStr(startUrl),0);
}
}
urllist.Url.Add("http://www.hao123.com");
urllist.IsDownload.Add(Encrypt.MD5EncryptStr("http://www.hao123.com"), 0);
urllist.Url.Add("http://www.zhku.com");
urllist.IsDownload.Add(Encrypt.MD5EncryptStr("http://www.zhku.com"), 0);
urllist.Url.Add("http://www.sina.com");
urllist.IsDownload.Add(Encrypt.MD5EncryptStr("http://www.sina.com"), 0);
urllist.Url.Add("http://www.zhku.edu.cn");
urllist.IsDownload.Add(Encrypt.MD5EncryptStr("http://www.zhku.edu.cn"), 0);
urllist.Url.Add("http://www.39.net");
urllist.IsDownload.Add(Encrypt.MD5EncryptStr("http://www.39.net"), 0);
urllist.Url.Add("http://www.cnblogs.com");
urllist.IsDownload.Add(Encrypt.MD5EncryptStr("http://www.cnblogs.com"), 0);
urllist.Url.Add("http://www.google.com");
urllist.IsDownload.Add(Encrypt.MD5EncryptStr("http://www.google.com"), 0);
Thread[] threadPool = new Thread[threadCount];
int count=3;
for (int i = 0; i < count; i++)
{
threadPool[i] = new Thread(new ParameterizedThreadStart(GetPageContent));
threadPool[i].Start(i);
}
while (true)
{
if (!threadPool[0].IsAlive && !threadPool[1].IsAlive && !threadPool[2].IsAlive)
{
listBox1.DataSource = PageContentList.PageContents;
threadPool[0].Abort();
threadPool[1].Abort();
threadPool[2].Abort();
break;
}
}
}
public void GetPageContent(object startindex)
{
int start = (int)startindex;
lock(urllist)
{
Monitor.Enter(urllist);
int n = 0;
if (urllist.IsDownload.Count < threadCount)
{
n = urllist.IsDownload.Count;
}
else
{
n = threadCount;
}
for (int i = start; i < urllist.IsDownload.Count; i = i + n)
{
if (i > urllist.IsDownload.Count)
{
break;
}
string key = Encrypt.MD5EncryptStr(urllist.Url[i]);
if ((int)urllist.IsDownload[key] == 0)
{
urllist.IsDownload[key] = 1;
string sb = null;
WebClient client = new WebClient();
Byte[] read = new Byte[1024];
read = client.DownloadData(urllist.Url[i].Trim().ToString());
System.Text.Encoding encoder = System.Text.Encoding.GetEncoding("GB2312");
sb += encoder.GetString(read, 0, read.Length);
lock (PageContentList)
{
Monitor.Enter(PageContentList);
PageContentList.PageContents.Add(sb);
PageContentList.IsAnalyse.Add(Encrypt.MD5EncryptStr(sb), 0);
PageContentList.IsIndexed.Add(Encrypt.MD5EncryptStr(sb), 0);
Monitor.Pulse(PageContentList);
Monitor.Exit(PageContentList);
}
}
}
Monitor.Pulse(urllist);
Monitor.Exit(urllist);
}
Thread.Sleep(500);
}
{
startUrl = StartUrl.Text.Trim();
//获取网页内容的线程数
if (!string.IsNullOrEmpty(ThreadCount.Text.Trim()))
{
threadCount = int.Parse(ThreadCount.Text.Trim());
}
else
{
threadCount = 1;
}
//获取链接线程数
if (!string.IsNullOrEmpty(GetUrlThreadCount.Text.Trim()))
{
getUrlThreadCount = int.Parse(GetUrlThreadCount.Text.Trim());
}
else
{
getUrlThreadCount = 1;
}
if (startUrl == null)
{
MessageBox.Show("请输入链接地址");
return;
}
else
{
Regex re = new Regex(@"http(s)?://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?");
if (!re.Match(startUrl).Success)
{
MessageBox.Show("链接格式错误");
return;
}
else
{
urllist.Url.Add(startUrl);
urllist.IsDownload.Add(Encrypt.MD5EncryptStr(startUrl),0);
}
}
urllist.Url.Add("http://www.hao123.com");
urllist.IsDownload.Add(Encrypt.MD5EncryptStr("http://www.hao123.com"), 0);
urllist.Url.Add("http://www.zhku.com");
urllist.IsDownload.Add(Encrypt.MD5EncryptStr("http://www.zhku.com"), 0);
urllist.Url.Add("http://www.sina.com");
urllist.IsDownload.Add(Encrypt.MD5EncryptStr("http://www.sina.com"), 0);
urllist.Url.Add("http://www.zhku.edu.cn");
urllist.IsDownload.Add(Encrypt.MD5EncryptStr("http://www.zhku.edu.cn"), 0);
urllist.Url.Add("http://www.39.net");
urllist.IsDownload.Add(Encrypt.MD5EncryptStr("http://www.39.net"), 0);
urllist.Url.Add("http://www.cnblogs.com");
urllist.IsDownload.Add(Encrypt.MD5EncryptStr("http://www.cnblogs.com"), 0);
urllist.Url.Add("http://www.google.com");
urllist.IsDownload.Add(Encrypt.MD5EncryptStr("http://www.google.com"), 0);
Thread[] threadPool = new Thread[threadCount];
int count=3;
for (int i = 0; i < count; i++)
{
threadPool[i] = new Thread(new ParameterizedThreadStart(GetPageContent));
threadPool[i].Start(i);
}
while (true)
{
if (!threadPool[0].IsAlive && !threadPool[1].IsAlive && !threadPool[2].IsAlive)
{
listBox1.DataSource = PageContentList.PageContents;
threadPool[0].Abort();
threadPool[1].Abort();
threadPool[2].Abort();
break;
}
}
}
public void GetPageContent(object startindex)
{
int start = (int)startindex;
lock(urllist)
{
Monitor.Enter(urllist);
int n = 0;
if (urllist.IsDownload.Count < threadCount)
{
n = urllist.IsDownload.Count;
}
else
{
n = threadCount;
}
for (int i = start; i < urllist.IsDownload.Count; i = i + n)
{
if (i > urllist.IsDownload.Count)
{
break;
}
string key = Encrypt.MD5EncryptStr(urllist.Url[i]);
if ((int)urllist.IsDownload[key] == 0)
{
urllist.IsDownload[key] = 1;
string sb = null;
WebClient client = new WebClient();
Byte[] read = new Byte[1024];
read = client.DownloadData(urllist.Url[i].Trim().ToString());
System.Text.Encoding encoder = System.Text.Encoding.GetEncoding("GB2312");
sb += encoder.GetString(read, 0, read.Length);
lock (PageContentList)
{
Monitor.Enter(PageContentList);
PageContentList.PageContents.Add(sb);
PageContentList.IsAnalyse.Add(Encrypt.MD5EncryptStr(sb), 0);
PageContentList.IsIndexed.Add(Encrypt.MD5EncryptStr(sb), 0);
Monitor.Pulse(PageContentList);
Monitor.Exit(PageContentList);
}
}
}
Monitor.Pulse(urllist);
Monitor.Exit(urllist);
}
Thread.Sleep(500);
}
链接列表类,页面内容类
public class UrlList
{
private List<string> url = new List<string>();
private Hashtable isDownload = new Hashtable();
/// <summary>
/// 下载链接
/// </summary>
public List<string> Url
{
get { return url; }
set { url = value; }
}
/// <summary>
/// 是否为已下载链接0为否,1为是
/// </summary>
public Hashtable IsDownload
{
get { return isDownload; }
set { isDownload = value; }
}
}
public class PageContent
{
private List<string> pageContents = new List<string>();
private Hashtable isAnalyse = new Hashtable();
private Hashtable isIndexed = new Hashtable();
/// <summary>
/// 页面内容列表
/// </summary>
public List<string> PageContents
{
get { return pageContents; }
set { pageContents = value; }
}
/// <summary>
/// 是否分析了页面,0为否,1为是
/// </summary>
public Hashtable IsAnalyse
{
get { return isAnalyse; }
set { isAnalyse = value; }
}
/// <summary>
/// 是否对页面进行索引,0为否,1为是
/// </summary>
public Hashtable IsIndexed
{
get { return isIndexed; }
set { isIndexed = value; }
}
}
{
private List<string> url = new List<string>();
private Hashtable isDownload = new Hashtable();
/// <summary>
/// 下载链接
/// </summary>
public List<string> Url
{
get { return url; }
set { url = value; }
}
/// <summary>
/// 是否为已下载链接0为否,1为是
/// </summary>
public Hashtable IsDownload
{
get { return isDownload; }
set { isDownload = value; }
}
}
public class PageContent
{
private List<string> pageContents = new List<string>();
private Hashtable isAnalyse = new Hashtable();
private Hashtable isIndexed = new Hashtable();
/// <summary>
/// 页面内容列表
/// </summary>
public List<string> PageContents
{
get { return pageContents; }
set { pageContents = value; }
}
/// <summary>
/// 是否分析了页面,0为否,1为是
/// </summary>
public Hashtable IsAnalyse
{
get { return isAnalyse; }
set { isAnalyse = value; }
}
/// <summary>
/// 是否对页面进行索引,0为否,1为是
/// </summary>
public Hashtable IsIndexed
{
get { return isIndexed; }
set { isIndexed = value; }
}
}
上面的页面都可以抓取到,可是点击开始后就好卡,不知道什么原因,各路高人帮忙看看,指点一下。小弟感激不尽!