本文主要介绍如何通过DotnetSpider编写少量代码,快速地实现网页的抓取。
1、 通过Nuget安装引用DotnetSpider
项目上右键 -》选择"管理Nuget程序包" -》搜索"DotnetSpider" -》点击"DotnetSpider.Core"安装,还要安装"DotnetSpider.Extension"。
2、数据存储EntityPipeline
可以使用框架提供的ConsoleEntityPipeline实现控制台输出,框架还支持Excel、MySQL、MongoDB等存储方式,这些实现都位于命名空间DotnetSpider.Extension.Pipeline下,可以在该命名空间下查看其它EntityPipeline的实现类。也可以继承EntityPipeline类来实现自己的存储逻辑,例如:
/// <summary>
/// Entity pipeline that appends every received entity to a local text file,
/// one JSON document per line.
/// </summary>
public class StoragePipeline : EntityPipeline
{
    /// <summary>File that receives the serialized entities.</summary>
    private const string OutputPath = "./web.txt";

    // Dedicated lock object: locking on `this` would let any outside code holding a
    // reference to the pipeline contend for the same monitor.
    private readonly object _writeLock = new object();

    /// <summary>
    /// Serializes each entity with Json.NET and appends it as one line to the output file.
    /// </summary>
    /// <param name="items">Entities to store; may be null.</param>
    /// <param name="sender">Not used by this implementation.</param>
    /// <returns>
    /// The number of entities written, or 0 when <paramref name="items"/> is null or empty.
    /// </returns>
    protected override int Process(List<IBaseEntity> items, dynamic sender = null)
    {
        // Nothing to write: return early so an empty batch does not touch the file.
        if (items == null || items.Count == 0)
        {
            return 0;
        }

        lock (_writeLock)
        {
            // File.AppendText creates the file when it is missing, so no separate
            // File.Create call is needed. The previous File.Create call left its
            // FileStream undisposed, which kept the file locked for the append.
            using (var writer = File.AppendText(OutputPath))
            {
                foreach (var item in items)
                {
                    writer.WriteLine(Newtonsoft.Json.JsonConvert.SerializeObject(item));
                }
            }
        }

        return items.Count;
    }
}
3、爬虫类的实现
继承EntitySpider类来实现爬虫类,继承BaseEntity类来实现爬虫实体。实体属性要加上[Column]标签,通过[Field]标签编写XPath表达式提取内容,通过[ReplaceFormatter]标签实现内容格式的替换,最终的值会赋给对应的实体属性,例如:
/// <summary>
/// Spider that requests the paged list of Stack Overflow questions tagged "python"
/// and hands the extracted entries to <see cref="StoragePipeline"/>.
/// </summary>
private class SpiderWeb : EntitySpider
{
    /// <summary>
    /// Queues the page requests and registers the entity type and the pipeline.
    /// </summary>
    /// <param name="arguments">Not used by this implementation.</param>
    protected override void OnInit(params string[] arguments)
    {
        var requests = new List<Request>();

        // Build one request per result page, for page numbers 1 through 499.
        for (int pageNumber = 1; pageNumber < 500; pageNumber++)
        {
            var url = string.Format("https://stackoverflow.com/questions/tagged/python?sort=frequent&page={0}&pagesize=15", pageNumber);

            // The page number is attached to the request under the "page" key.
            var properties = new Dictionary<string, dynamic> { { "page", pageNumber } };

            requests.Add(new Request(url, properties));
        }

        AddRequests(requests);
        AddEntityType<StackoverflowSearchEntry>();

        // Alternative pipeline, currently disabled:
        //AddPipeline(new ConsoleEntityPipeline());
        AddPipeline(new StoragePipeline());
    }

    /// <summary>
    /// One entry of the question list; each child element of the div with
    /// id "questions" is matched as an entity.
    /// </summary>
    [Schema("stackoverflow", "stackoverflow_search_entity_model")]
    [Entity(Expression = "//div[@id='questions']/*", Type = SelectorType.XPath)]
    class StackoverflowSearchEntry : BaseEntity
    {
        /// <summary>
        /// Value selected by the "page" environment expression
        /// (presumably the value attached to the request in OnInit; confirm against the library).
        /// </summary>
        [Column]
        [Field(Expression = "page", Type = SelectorType.Enviroment)]
        public string Page { get; set; }

        /// <summary>Content of the summary heading link, with the em tags replaced by empty strings.</summary>
        [Column]
        [Field(Expression = ".//div[@class='summary']/h3/a")]
        [ReplaceFormatter(NewValue = "", OldValue = "<em>")]
        [ReplaceFormatter(NewValue = "", OldValue = "</em>")]
        public string Title { get; set; }

        /// <summary>The href attribute of the summary heading link.</summary>
        [Column]
        [Field(Expression = ".//div[@class='summary']/h3/a/@href")]
        public string Url { get; set; }

        /// <summary>Content of the first div inside the summary block.</summary>
        [Column]
        [Field(Expression = ".//div[@class='summary']/div[1]")]
        public string description { get; set; }

        /// <summary>The complete matched content of the entry, selected with the InnerText option.</summary>
        [Column]
        [Field(Expression = ".", Option = FieldOptions.InnerText)]
        public string PlainText { get; set; }
    }
}
4、DotnetSpider使用完整代码
using DotnetSpider.Downloader;
using DotnetSpider.Extension;
using DotnetSpider.Extension.Model;
using DotnetSpider.Extension.Pipeline;
using DotnetSpider.Extraction;
using DotnetSpider.Extraction.Model;
using DotnetSpider.Extraction.Model.Attribute;
using DotnetSpider.Extraction.Model.Formatter;
using System;
using System.Collections.Generic;
using System.IO;
using System.Runtime.CompilerServices;
namespace SpiderContent
{
/// <summary>
/// Console entry point that configures and runs the Stack Overflow spider.
/// </summary>
class Program
{
    /// <summary>
    /// Creates the spider, sets its sleep time, runs it, then waits for a key press.
    /// </summary>
    /// <param name="args">Command-line arguments; not used.</param>
    static void Main(string[] args)
    {
        Console.WriteLine("Hello World!");

        // Interval between fetches, so that pages are not requested too frequently.
        var crawler = new SpiderWeb { SleepTime = 1000 };
        crawler.Run();

        // Wait for a key press before the process exits.
        Console.ReadKey();
    }

    /// <summary>
    /// Spider that requests the paged list of Stack Overflow questions tagged "python"
    /// and hands the extracted entries to <see cref="StoragePipeline"/>.
    /// </summary>
    private class SpiderWeb : EntitySpider
    {
        /// <summary>
        /// Queues the page requests and registers the entity type and the pipeline.
        /// </summary>
        /// <param name="arguments">Not used by this implementation.</param>
        protected override void OnInit(params string[] arguments)
        {
            var requests = new List<Request>();

            // Build one request per result page, for page numbers 1 through 499.
            for (int pageNumber = 1; pageNumber < 500; pageNumber++)
            {
                var url = string.Format("https://stackoverflow.com/questions/tagged/python?sort=frequent&page={0}&pagesize=15", pageNumber);

                // The page number is attached to the request under the "page" key.
                var properties = new Dictionary<string, dynamic> { { "page", pageNumber } };

                requests.Add(new Request(url, properties));
            }

            AddRequests(requests);
            AddEntityType<StackoverflowSearchEntry>();

            // Alternative pipeline, currently disabled:
            //AddPipeline(new ConsoleEntityPipeline());
            AddPipeline(new StoragePipeline());
        }

        /// <summary>
        /// One entry of the question list; each child element of the div with
        /// id "questions" is matched as an entity.
        /// </summary>
        [Schema("stackoverflow", "stackoverflow_search_entity_model")]
        [Entity(Expression = "//div[@id='questions']/*", Type = SelectorType.XPath)]
        class StackoverflowSearchEntry : BaseEntity
        {
            /// <summary>
            /// Value selected by the "page" environment expression
            /// (presumably the value attached to the request in OnInit; confirm against the library).
            /// </summary>
            [Column]
            [Field(Expression = "page", Type = SelectorType.Enviroment)]
            public string Page { get; set; }

            /// <summary>Content of the summary heading link, with the em tags replaced by empty strings.</summary>
            [Column]
            [Field(Expression = ".//div[@class='summary']/h3/a")]
            [ReplaceFormatter(NewValue = "", OldValue = "<em>")]
            [ReplaceFormatter(NewValue = "", OldValue = "</em>")]
            public string Title { get; set; }

            /// <summary>The href attribute of the summary heading link.</summary>
            [Column]
            [Field(Expression = ".//div[@class='summary']/h3/a/@href")]
            public string Url { get; set; }

            /// <summary>Content of the first div inside the summary block.</summary>
            [Column]
            [Field(Expression = ".//div[@class='summary']/div[1]")]
            public string description { get; set; }

            /// <summary>The complete matched content of the entry, selected with the InnerText option.</summary>
            [Column]
            [Field(Expression = ".", Option = FieldOptions.InnerText)]
            public string PlainText { get; set; }
        }
    }
}
/// <summary>
/// Entity pipeline that appends every received entity to a local text file,
/// one JSON document per line.
/// </summary>
public class StoragePipeline : EntityPipeline
{
    /// <summary>File that receives the serialized entities.</summary>
    private const string OutputPath = "./web.txt";

    // Dedicated lock object: locking on `this` would let any outside code holding a
    // reference to the pipeline contend for the same monitor.
    private readonly object _writeLock = new object();

    /// <summary>
    /// Serializes each entity with Json.NET and appends it as one line to the output file.
    /// </summary>
    /// <param name="items">Entities to store; may be null.</param>
    /// <param name="sender">Not used by this implementation.</param>
    /// <returns>
    /// The number of entities written, or 0 when <paramref name="items"/> is null or empty.
    /// </returns>
    protected override int Process(List<IBaseEntity> items, dynamic sender = null)
    {
        // Nothing to write: return early so an empty batch does not touch the file.
        if (items == null || items.Count == 0)
        {
            return 0;
        }

        lock (_writeLock)
        {
            // File.AppendText creates the file when it is missing, so no separate
            // File.Create call is needed. The previous File.Create call left its
            // FileStream undisposed, which kept the file locked for the append.
            using (var writer = File.AppendText(OutputPath))
            {
                foreach (var item in items)
                {
                    writer.WriteLine(Newtonsoft.Json.JsonConvert.SerializeObject(item));
                }
            }
        }

        return items.Count;
    }
}
}