.net HttpCrawler

using HtmlAgilityPack;
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks;

namespace HttpCrawler
{
    /// <summary>
    /// Console crawler: downloads the CSDN .NET forum index, follows every thread
    /// link found in the thread table, and keeps the threads that contain at least
    /// one post whose nickname span reads "sp1234". The elapsed time of the whole
    /// run is written to the console.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Entry point. Builds the list of thread links from the forum index, scans the
        /// linked pages in parallel (up to 20 at a time), and prints the elapsed time.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            Stopwatch sw = new Stopwatch();
            sw.Start();

            // SelectSingleNode returns null when nothing matches (e.g. the page layout
            // changed), so fall back to an empty row set instead of dereferencing null.
            var table = GetHtml("http://bbs.csdn.net/forums/DotNET/").DocumentNode
                .SelectSingleNode("//table[@class='table_list parent_forum ']");
            var rows = table != null
                ? table.Elements("tr").Skip(1) // Skip(1): the first row is skipped, as in the original query
                : Enumerable.Empty<HtmlNode>();

            var titles = from row in rows
                         let td = row.Element("td")
                         where td != null
                         let a = td.Descendants("a").FirstOrDefault()
                         where a != null
                         // Attributes["href"] is null for an anchor without href; filter it
                         // here rather than crashing on .Value.
                         let hrefAttribute = a.Attributes["href"]
                         where hrefAttribute != null
                         select new
                         {
                             href = hrefAttribute.Value,
                             text = a.InnerText
                         };

            var pages = from t in titles
                            .AsParallel().WithDegreeOfParallelism(20)
                        let path = "http://bbs.csdn.net" + t.href
                        // SelectNodes returns null (not an empty collection) when no node
                        // matches, so the null check must come before enumerating.
                        let nicks = GetHtml(path).DocumentNode.SelectNodes("//span[@class='name2nick']")
                        where nicks != null && nicks.Any(nick => nick.InnerText == "sp1234")
                        select new
                        {
                            title = t.text,
                            href = path
                        };

            // ToList forces the deferred (parallel) query to run so the stopwatch
            // measures the actual downloads.
            var results = pages.ToList();
            sw.Stop();

            // NOTE(review): the label below says "time without concurrency", but the query
            // above runs through AsParallel; the text is kept as-is - confirm the intended wording.
            Console.WriteLine("不加并发的时间:"+sw.ElapsedMilliseconds);
            Console.ReadKey();
        }

        /// <summary>
        /// Downloads <paramref name="url"/> synchronously, decodes the response bytes
        /// as UTF-8, and parses the text into an <see cref="HtmlDocument"/>.
        /// </summary>
        /// <param name="url">Absolute URL of the page to fetch.</param>
        /// <returns>The parsed HTML document.</returns>
        static HtmlDocument GetHtml(string url)
        {
            string content;
            // WebClient is IDisposable; dispose it so each of the many parallel
            // downloads releases its resources promptly.
            using (var client = new WebClient())
            {
                content = Encoding.UTF8.GetString(client.DownloadData(url));
            }

            var doc = new HtmlDocument();
            doc.LoadHtml(content);
            return doc;
        }
    }
}

相关阅读:
LN : leetcode 217 Contains Duplicate
LN : leetcode 53 Maximum Subarray
day23——删除数据、更改数据、索引
day22——创建表、增加数据、查询数据
day21——游标、mysql连接池、设计表结构
day20——安装客户端、数据库连接、mysql事务、mysql操作数据
day19——常用正则表达式、re正则对象和正则匹配效率比较、编译正则对象
day20——re的matche方法和search方法、re的split,findall,finditer方法、re的matche对象
day18——json
day17——logging、os模块、commands模块、sys模块

原文地址：https://www.cnblogs.com/c-x-a/p/7792750.html