• 多线程的一点东西。。


    毕业设计准备做搜索方面的课题,所以开始写爬虫程序。思路是这样的:从一个网站开始,抓取页面内容,分析页面,获取页面中的所有链接,将链接放入 UrlList 列表,然后建立索引,如此不断循环。这星期一直在学习多线程,下面是抓取页面内容的代码,先做个备忘。

    开始事件,以及线程函数

            private void Start_Click(object sender, EventArgs e)
            {
                startUrl 
    = StartUrl.Text.Trim();
                
    //获取网页内容的线程数
                if (!string.IsNullOrEmpty(ThreadCount.Text.Trim()))
                {
                    threadCount 
    = int.Parse(ThreadCount.Text.Trim());
                }
                
    else
                {
                    threadCount 
    = 1;
                }
                
    //获取链接线程数
                if (!string.IsNullOrEmpty(GetUrlThreadCount.Text.Trim()))
                {
                    getUrlThreadCount 
    = int.Parse(GetUrlThreadCount.Text.Trim());
                }
                
    else
                {
                    getUrlThreadCount 
    = 1;
                }

                
    if (startUrl == null)
                {
                    MessageBox.Show(
    "请输入链接地址");
                    
    return;
                }
                
    else
                {
                    Regex re 
    = new Regex(@"http(s)?://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?");
                    
    if (!re.Match(startUrl).Success)
                    {
                        MessageBox.Show(
    "链接格式错误");
                        
    return;
                    }
                    
    else
                    {
                        urllist.Url.Add(startUrl);
                        urllist.IsDownload.Add(Encrypt.MD5EncryptStr(startUrl),
    0);
                    }
                }


                urllist.Url.Add(
    "http://www.hao123.com");
                urllist.IsDownload.Add(Encrypt.MD5EncryptStr(
    "http://www.hao123.com"), 0);

                urllist.Url.Add(
    "http://www.zhku.com");
                urllist.IsDownload.Add(Encrypt.MD5EncryptStr(
    "http://www.zhku.com"), 0);

                urllist.Url.Add(
    "http://www.sina.com");
                urllist.IsDownload.Add(Encrypt.MD5EncryptStr(
    "http://www.sina.com"), 0);

                urllist.Url.Add(
    "http://www.zhku.edu.cn");
                urllist.IsDownload.Add(Encrypt.MD5EncryptStr(
    "http://www.zhku.edu.cn"), 0);

                urllist.Url.Add(
    "http://www.39.net");
                urllist.IsDownload.Add(Encrypt.MD5EncryptStr(
    "http://www.39.net"), 0);

                urllist.Url.Add(
    "http://www.cnblogs.com");
                urllist.IsDownload.Add(Encrypt.MD5EncryptStr(
    "http://www.cnblogs.com"), 0);

                urllist.Url.Add(
    "http://www.google.com");
                urllist.IsDownload.Add(Encrypt.MD5EncryptStr(
    "http://www.google.com"), 0);

                Thread[] threadPool 
    = new Thread[threadCount];
                
    int count=3;

                
    for (int i = 0; i < count; i++)
                {
                    threadPool[i] 
    = new Thread(new ParameterizedThreadStart(GetPageContent));
                    threadPool[i].Start(i);
                }

                           
    while (true)
                {
                    
    if (!threadPool[0].IsAlive && !threadPool[1].IsAlive && !threadPool[2].IsAlive)
                    {
                        listBox1.DataSource 
    = PageContentList.PageContents;
                        threadPool[
    0].Abort();
                        threadPool[
    1].Abort();
                        threadPool[
    2].Abort();
                        
    break;
                    }
                }
            }

            
    /// <summary>
    /// Worker entry point: downloads the pages assigned to this worker and appends
    /// their text to <c>PageContentList</c>.
    /// </summary>
    /// <remarks>
    /// Work is split by striding through the URL list: the worker with index
    /// <paramref name="startindex"/> takes entries start, start + n, start + 2n, ...
    /// where n is the smaller of <c>threadCount</c> and the list size.
    /// The URL list is locked only while entries are claimed; the downloads run
    /// outside the lock so that workers actually execute in parallel.
    /// </remarks>
    /// <param name="startindex">Boxed <see cref="int"/>: zero-based index of this worker.</param>
    public void GetPageContent(object startindex)
    {
        int start = (int)startindex;
        List<string> claimedUrls = new List<string>();

        // Phase 1: claim this worker's share of URLs. Plain lock only - pairing it
        // with a second Monitor.Enter would leave the monitor held forever if an
        // exception skipped the matching Monitor.Exit.
        lock (urllist)
        {
            int total = Math.Min(urllist.Url.Count, urllist.IsDownload.Count);
            // Clamp to at least 1 so a zero thread count cannot produce a zero stride.
            int stride = Math.Max(1, Math.Min(threadCount, total));

            for (int i = start; i < total; i += stride)
            {
                string key = Encrypt.MD5EncryptStr(urllist.Url[i]);
                object state = urllist.IsDownload[key];

                if (state is int && (int)state == 0)
                {
                    // Mark as taken before releasing the lock so no other worker picks it up.
                    urllist.IsDownload[key] = 1;
                    claimedUrls.Add(urllist.Url[i]);
                }
            }
        }

        // Phase 2: download outside the lock.
        System.Text.Encoding encoder = System.Text.Encoding.GetEncoding("GB2312");

        foreach (string url in claimedUrls)
        {
            string content;
            try
            {
                using (WebClient client = new WebClient())
                {
                    byte[] raw = client.DownloadData(url.Trim());
                    content = encoder.GetString(raw, 0, raw.Length);
                }
            }
            catch (WebException)
            {
                // One unreachable site must not kill the worker (or the process).
                // Put the flag back to 0 so the URL is not recorded as downloaded.
                lock (urllist)
                {
                    urllist.IsDownload[Encrypt.MD5EncryptStr(url)] = 0;
                }
                continue;
            }

            string contentKey = Encrypt.MD5EncryptStr(content);

            lock (PageContentList)
            {
                PageContentList.PageContents.Add(content);

                // Two pages can have identical content; Hashtable.Add would throw on
                // the repeated key, so register the flags only once.
                if (!PageContentList.IsAnalyse.ContainsKey(contentKey))
                {
                    PageContentList.IsAnalyse.Add(contentKey, 0);
                }
                if (!PageContentList.IsIndexed.ContainsKey(contentKey))
                {
                    PageContentList.IsIndexed.Add(contentKey, 0);
                }
            }
        }
    }

    链接列表类,页面内容类

        public class UrlList
        {
            
    private List<string> url = new List<string>();
            
    private Hashtable isDownload = new Hashtable();

            
    /// <summary>
            
    /// 下载链接
            
    /// </summary>
            public List<string> Url
            {
                
    get { return url; }
                
    set { url = value; }
            }

            
    /// <summary>
            
    /// 是否为已下载链接0为否,1为是
            
    /// </summary>
            public Hashtable IsDownload
            {
                
    get { return isDownload; }
                
    set { isDownload = value; }
            }
        }

        
    /// <summary>
    /// Holds downloaded page text together with per-page analysis and indexing flags.
    /// </summary>
    public class PageContent
    {
        /// <summary>
        /// Creates an instance with an empty content list and empty flag tables.
        /// </summary>
        public PageContent()
        {
            PageContents = new List<string>();
            IsAnalyse = new Hashtable();
            IsIndexed = new Hashtable();
        }

        /// <summary>
        /// List of page contents.
        /// </summary>
        public List<string> PageContents { get; set; }

        /// <summary>
        /// Whether a page has been analysed: 0 means no, 1 means yes.
        /// </summary>
        public Hashtable IsAnalyse { get; set; }

        /// <summary>
        /// Whether a page has been indexed: 0 means no, 1 means yes.
        /// </summary>
        public Hashtable IsIndexed { get; set; }
    }

    上面的页面都可以抓取到,可是点击开始后界面就很卡,不知道是什么原因,请各路高人帮忙看看,指点一下。小弟感激不尽!

  • 相关阅读:
    better-scroll 的基本使用
    JavaScript模式(2):函数
    JavaScript模式(1):字面量和构造函数
    模电非基础01——从一种常见的防反接,上电缓启动,过压保护电路集成电路讲解再到MOS管常用技巧讲解
    数字电路基础那些事2——组合逻辑:从异或门到半加器与全加器+比较器
    数字电路基础那些事1——组合逻辑:从译码器到编码器
    入门音响电路 —— 从扬声器原理开始讲起
    从多谐振荡器详细解析到555定时器基本电路(控制LED闪烁)
    用HAL库结合STM cube编写代码控制stm32f103c8t6来驱动减速电机实现慢快逐步切换转动
    mac搭建php开发环境(mac+apache+mysql+php)并且安装zend framework1框架
  • 原文地址:https://www.cnblogs.com/coolkiss/p/1374857.html
Copyright © 2020-2023  润新知