• 蜘蛛程序(广度优先,c#多线程版本)


    多线程这里,我主要是使用全局变量来控制当前线程数量:每个线程一启动就进行原子操作,增加当前活动线程数量;线程结束时再进行原子操作,减少当前活动线程数量。当队列为空并且活动线程数为0时,认为任务完成,退出循环。如果队列为空但是有活动线程,则主线程休眠,然后再次判断条件。队列和线程均不为空,或队列不空而线程为空时,判断线程数量并决定是否开启新线程抓取。

     

    /*

    * XssScan.cs,云舒,070704下午

    */

    using System;

    using System.Threading;

    using System.Collections;

    using System.Collections.Generic;

    using System.Text;

    using Winista.Text.HtmlParser;

    using Winista.Text.HtmlParser.Data;

    namespace Ph4nt0m.XssScan

    {

        public class XssScan

        {

            public static string        domain;

            public static Hashtable        url_hash;

            public static string[]        excempt_file;

            public static Int32            max_thread = 10;

            public static Int32            current_thread = 0;

            public static Int32            time_out;

            public static Queue            pre_url;

        

            public static int Main(string[] args)

            {

                string base_url;

                // 检查参数个数

                if ( args.Length != 2 )

                {

                    Console.WriteLine( "Usage: XssScan.exe   <url>   <sleep>" );

                    return -1;

                }

                base_url = args[0];

                time_out = Int32.Parse(args[1]);

                url_hash = new Hashtable();

                pre_url = new Queue(100);

                // 不进行分析的文件

                excempt_file = new string[] { ".exe", ".rar", ".zip", ".tar", ".gz",

                                                ".pdf", ".swf", ".jpg", ".png", ".gif", ".bmp",

                                                ".mp3", ".mp4", ".rm", ".rmvb", ".smil", ".wma",

                                                ".pl", ".c", ".cpp"

                                            };

                // 处理url,方便下面截取基地址

                if (base_url.StartsWith("http://"))/

                {

                    base_url = base_url.Substring(7);

                }

                if (base_url.EndsWith("/") )

                {

                    base_url = base_url.TrimEnd('/');

                }

                else if (base_url.EndsWith("\"))

                {

                    base_url = base_url.TrimEnd('\');

                }

                // 按照/字符分割url,获取域名.

                // 域名取得较长,若包含目录,则可以防止抓取到上层目录,此处包含了纯域名后面的目录

                if (base_url.IndexOf('/') == -1)

                {

                    domain = base_url;

                }

                else

                {

                    string[] domains = base_url.Split('/');

                    // 是文件还是目录

                    if (domains[domains.Length - 1].IndexOf('.') != -1)

                    {

                        Int32 pos = base_url.LastIndexOf('/');

                        domain = base_url.Substring(0, pos);

                    }

                    else

                    {

                        domain = base_url;

                    }

                }

                base_url = "http://"/ + base_url;

                domain = "http://"/ + domain;

                //Console.WriteLine("base_url: ", base_url);

                //Console.WriteLine( "domain: ", domain);

                // 将基URL加入到队列并开始抓取

                //pre_url.Enqueue(base_url);

                Ph4nt0m.XssScan.Parser parser_base = new Ph4nt0m.XssScan.Parser(base_url);

                parser_base.GetLinksFromUrl();

                while ( true )

                {

                    // 没有活动线程且队列为空,则说明抓取完成

                    if ( current_thread == 0 && pre_url.Count == 0 )

                    {

                        break;

                    }

                    // 队列为空但是有活动线程则主线程休眠,然后再次判断条件

                    if (pre_url.Count == 0)

                    {

                        Thread.Sleep(100);

                        continue;

                    }

                    // 队列线程均不为空,或队列不空线程为空,判断线程数量并决定是否开启新线程抓取

                    if (current_thread < max_thread)

                    {

                        string current_url = (string)pre_url.Dequeue();

                        Ph4nt0m.XssScan.Parser parser = new Ph4nt0m.XssScan.Parser(current_url);

                        Thread work_thread = new Thread(new ThreadStart(parser.GetLinksFromUrl));

                        work_thread.Start();

                    }

                    else

                    {

                        Console.WriteLine("休眠主线程,当前线程数量为: ", current_thread);

                    }

                    Thread.Sleep(time_out);

                }

                Console.WriteLine("All done.\nThere are links:", url_hash.Count);

                foreach (string key in url_hash.Keys)

                {

                    //Console.WriteLine(key);

                }

                return 0;

            }

        }

    }

    执行工作的类,C#比较变态的是不能给线程启动的函数传递参数,所以无奈之下我使用构造函数来设置变量,也就是需要抓取的URL地址:

    代码:

    /*

    * Parser.cs,云舒,070704下午

    */

    using System;

    using System.Collections;

    using System.Collections.Generic;

    using System.Text;

    using System.Threading;

    using Winista.Text.HtmlParser;

    using Winista.Text.HtmlParser.Data;

    namespace Ph4nt0m.XssScan

    {

        public class Parser

        {

            private string _url;

            public Parser(string url)

            {

                this._url = url;

            }

            /// <summary>

            /// 从给定的链接中获取网页内容,并获取该网页内的所有站内链接

            /// </summary>

            /// <param name="url">url地址,string</param>

            public void GetLinksFromUrl( )

            {

                // 递增当前线程数量

                Interlocked.Increment(ref XssScan.current_thread);

                Winista.Text.HtmlParser.Parser parser;

                PageData page_data;

                Hashtable tmp_hash = new Hashtable();

                string url = CheckUrl(_url);

                parser = new Winista.Text.HtmlParser.Parser(new Uri(url));

                // 解析html

                page_data = parser.GetAllOutLinks(1, true);

            

                Int32 excempt_found = 0;

                foreach (LinkData link_data in page_data.OutLinks)

                {

                    //Console.WriteLine( "[DEBUG1]: ", link_data.Url );

                    // 跳过二进制文件

                    foreach (string ext_name in XssScan.excempt_file)

                    {

                        if (link_data.Url.EndsWith(ext_name))

                        {

                            //Console.WriteLine("[DEBUG2]: igonre ", link_data.Url, ext_name);

                            excempt_found = 1;

                            break;

                        }

                    }

                    

                    if( excempt_found == 1 )

                    {

                        // 重置标记

                        excempt_found = 0;

                        continue;

                    }

                    //Console.WriteLine("[DEBUG3]: ", link_data.Url);

                    // 是否抓出边界,这里先判断边界,尽可能少的遍历hash

                    // 仔细权衡,感觉检查边界放在这里更节省资源

                    if (link_data.Url.StartsWith(XssScan.domain))

                    {

                        // URL记录中是否已经包含了此URL

                        if (!XssScan.url_hash.ContainsKey(link_data.Url))

                        {

                            Console.WriteLine("Url: ", link_data.Url);

                            try

                            {

                                // 加锁,确保只有一个线程能写URL HASH和队列

                                Monitor.Enter(this);

                                // 将URL加入到全局的URL列表中

                                XssScan.url_hash[link_data.Url] = "true";

                                // 将URL加入到确保同时只有一个线程更新URL队列

                                XssScan.pre_url.Enqueue(link_data.Url);

                            }

                            finally

                            {

                                // 释放锁

                                Monitor.Exit(this);

                            }

                        }

                    }

                }

                // 减少当前线程数量

                Interlocked.Decrement(ref XssScan.current_thread);

            }

            private static string CheckUrl(string url)

            {

                // 在url后面加上/字符,这个似乎是winista.HtmlParser的bug,如果url包含目录而没有以/结尾,它会

                // 认为这个是文件。

                if (!url.EndsWith("/"))

                {

                    string[] tokens = url.Split('/');

                    Int32 count = tokens.Length;

                    // .ddd和ddd.均当作目录处理

                    if (tokens[count - 1].StartsWith(".") || tokens[count - 1].EndsWith("."))

                    {

                        url += '/';

                    }

                    // 如果中间不包含"."也当成目录,加上"/"字符

                    else if (tokens[count - 1].IndexOf('.') == -1)

                    {

                        url += '/';

                    }

                }

                //Console.WriteLine(url);

                return url;

            }

        }

    }

  • 相关阅读:
    [转]c#匿名类
    MVC中的验证码
    js常用方法
    centos6.x一直停留在进度条的问题
    使用linux flock文件锁实现任务锁定避免计划任务程序冲突
    nginx访问日志的几个统计命令
    centos安装tidy扩展
    用alert打印js对象
    laravel中的管道设计模式
    CentOS查看每个进程的网络流量
  • 原文地址:https://www.cnblogs.com/0000/p/1580201.html
Copyright © 2020-2023  润新知