.NET Core: Crawling Pages with DotnetSpider


    This article shows how to crawl web pages quickly with DotnetSpider while writing only a small amount of code.

    1. Install and reference DotnetSpider via NuGet

    Right-click the project -> choose "Manage NuGet Packages" -> search for "DotnetSpider" -> install "DotnetSpider.Core", and also install "DotnetSpider.Extension".

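    The same packages can also be added from the command line with the dotnet CLI; a minimal sketch, assuming the package IDs listed above and leaving the versions unpinned:

        dotnet add package DotnetSpider.Core
        dotnet add package DotnetSpider.Extension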

    2. Storing data with an EntityPipeline

    The framework's built-in ConsoleEntityPipeline writes results to the console, and pipelines for Excel, MySQL, MongoDB and others are provided as well. They live in the DotnetSpider.Extension.Pipeline namespace, where you can browse the other EntityPipeline implementations. To implement your own storage logic, inherit from EntityPipeline, for example (a sketch of registering one of the built-in pipelines follows this example):

        public class StoragePipeline : EntityPipeline
        {
            private static readonly object FileLock = new object();

            protected override int Process(List<IBaseEntity> items, dynamic sender = null)
            {
                if (items == null) return 0;
                string path = "./web.txt";
                foreach (var data in items)
                {
                    // Append each extracted entity to web.txt as one JSON object per line.
                    // File.AppendText creates the file if it does not exist yet.
                    lock (FileLock)
                    {
                        using (var streamWriter = File.AppendText(path))
                        {
                            streamWriter.Write(Newtonsoft.Json.JsonConvert.SerializeObject(data));
                            streamWriter.WriteLine();
                        }
                    }
                }
                return items.Count;
            }
        }
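
    To use one of the built-in pipelines instead, register it in the spider's OnInit method (shown in the next section). The lines below are only a sketch: they assume that MySqlEntityPipeline in DotnetSpider.Extension.Pipeline accepts a standard MySQL connection string, as it did in the library versions these namespaces belong to, so adjust the class choice and the connection string to your own environment.

        // Inside OnInit, instead of (or in addition to) AddPipeline(new StoragePipeline()):
        AddPipeline(new ConsoleEntityPipeline());   // print every extracted entity to the console
        // Assumed constructor: MySqlEntityPipeline(string connectionString)
        AddPipeline(new MySqlEntityPipeline(
            "Database='stackoverflow';Data Source=localhost;User ID=root;Password=;Port=3306;SslMode=None;"));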

    3. Implementing the spider class

    Inherit from EntitySpider to implement the spider class, and inherit from BaseEntity to define the entity that will hold the extracted data. Entity properties must be marked with the [Column] attribute; the [Field] attribute carries the XPath expression used to extract the content, and [ReplaceFormatter] attributes perform string replacements on the extracted value before it is assigned to the property. The [Entity] attribute's expression selects each item node (here, every child of the div with id "questions"), and each [Field] expression is evaluated relative to that node. For example:

        private class SpiderWeb : EntitySpider
        {
            protected override void OnInit(params string[] arguments)
            {
                var listRequest = new List<Request>();
                // Build one request per results page; the "page" entry is passed along
                // as an environment value so the entity can record which page it came from.
                for (int page = 1; page < 500; page++)
                {
                    listRequest.Add(new Request(
                        string.Format("https://stackoverflow.com/questions/tagged/python?sort=frequent&page={0}&pagesize=15", page),
                        new Dictionary<string, dynamic> { { "page", page } }));
                }
                AddRequests(listRequest);
                AddEntityType<StackoverflowSearchEntry>();
                //AddPipeline(new ConsoleEntityPipeline());
                AddPipeline(new StoragePipeline());
            }

            [Schema("stackoverflow", "stackoverflow_search_entity_model")]
            [Entity(Expression = "//div[@id='questions']/*", Type = SelectorType.XPath)]
            class StackoverflowSearchEntry : BaseEntity
            {
                [Column]
                [Field(Expression = "page", Type = SelectorType.Enviroment)]
                public string Page { get; set; }

                [Column]
                [Field(Expression = ".//div[@class='summary']/h3/a")]
                [ReplaceFormatter(NewValue = "", OldValue = "<em>")]
                [ReplaceFormatter(NewValue = "", OldValue = "</em>")]
                public string Title { get; set; }

                [Column]
                [Field(Expression = ".//div[@class='summary']/h3/a/@href")]
                public string Url { get; set; }

                [Column]
                [Field(Expression = ".//div[@class='summary']/div[1]")]
                public string description { get; set; }

                // The full text of the matched element.
                [Column]
                [Field(Expression = ".", Option = FieldOptions.InnerText)]
                public string PlainText { get; set; }
            }
        }

    4. Complete DotnetSpider example code

    using DotnetSpider.Downloader;
    using DotnetSpider.Extension;
    using DotnetSpider.Extension.Model;
    using DotnetSpider.Extension.Pipeline;
    using DotnetSpider.Extraction;
    using DotnetSpider.Extraction.Model;
    using DotnetSpider.Extraction.Model.Attribute;
    using DotnetSpider.Extraction.Model.Formatter;
    using System;
    using System.Collections.Generic;
    using System.IO;
    using System.Runtime.CompilerServices;
    namespace SpiderContent
    {
        class Program
        {
            static void Main(string[] args)
            {
                Console.WriteLine("Hello World!");
                var spider = new SpiderWeb();
                // Delay between requests in milliseconds, to avoid crawling too fast.
                spider.SleepTime = 1000;
                spider.Run();
                Console.ReadKey();
            }
           
            private class SpiderWeb : EntitySpider
            {
                protected override void OnInit(params string[] arguments)
                {
                    var listRequest = new List<Request>();
                    // Build one request per results page; the "page" entry is passed along
                    // as an environment value so the entity can record which page it came from.
                    for (int page = 1; page < 500; page++)
                    {
                        listRequest.Add(new Request(
                            string.Format("https://stackoverflow.com/questions/tagged/python?sort=frequent&page={0}&pagesize=15", page),
                            new Dictionary<string, dynamic> { { "page", page } }));
                    }
                    AddRequests(listRequest);
                    AddEntityType<StackoverflowSearchEntry>();
                    //AddPipeline(new ConsoleEntityPipeline());
                    AddPipeline(new StoragePipeline());
                }

                [Schema("stackoverflow", "stackoverflow_search_entity_model")]
                [Entity(Expression = "//div[@id='questions']/*", Type = SelectorType.XPath)]
                class StackoverflowSearchEntry : BaseEntity
                {
                    [Column]
                    [Field(Expression = "page", Type = SelectorType.Enviroment)]
                    public string Page { get; set; }

                    [Column]
                    [Field(Expression = ".//div[@class='summary']/h3/a")]
                    [ReplaceFormatter(NewValue = "", OldValue = "<em>")]
                    [ReplaceFormatter(NewValue = "", OldValue = "</em>")]
                    public string Title { get; set; }

                    [Column]
                    [Field(Expression = ".//div[@class='summary']/h3/a/@href")]
                    public string Url { get; set; }

                    [Column]
                    [Field(Expression = ".//div[@class='summary']/div[1]")]
                    public string description { get; set; }

                    // The full text of the matched element.
                    [Column]
                    [Field(Expression = ".", Option = FieldOptions.InnerText)]
                    public string PlainText { get; set; }
                }
            }
        }
        public class StoragePipeline : EntityPipeline
        {
            private static readonly object FileLock = new object();

            protected override int Process(List<IBaseEntity> items, dynamic sender = null)
            {
                if (items == null) return 0;
                string path = "./web.txt";
                foreach (var data in items)
                {
                    // Append each extracted entity to web.txt as one JSON object per line.
                    // File.AppendText creates the file if it does not exist yet.
                    lock (FileLock)
                    {
                        using (var streamWriter = File.AppendText(path))
                        {
                            streamWriter.Write(Newtonsoft.Json.JsonConvert.SerializeObject(data));
                            streamWriter.WriteLine();
                        }
                    }
                }
                return items.Count;
            }
        }
    }
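
    When the spider finishes, each extracted entity has been appended to web.txt as one JSON object per line. The small console program below is a minimal sketch for inspecting those results afterwards; it is a separate throwaway project and assumes that web.txt sits in the working directory and that the JSON property names match the entity class above.

    using System;
    using System.IO;
    using Newtonsoft.Json.Linq;

    class DumpResults
    {
        static void Main(string[] args)
        {
            // Read the file written by StoragePipeline: one JSON object per line.
            foreach (var line in File.ReadLines("./web.txt"))
            {
                if (string.IsNullOrWhiteSpace(line)) continue;

                var record = JObject.Parse(line);
                Console.WriteLine("{0} -> {1}", record["Title"], record["Url"]);
            }
        }
    }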

    Original post: https://www.cnblogs.com/fireicesion/p/16809554.html