创建项目
scrapy startproject shebao
items.py
import scrapy


class ShebaoItem(scrapy.Item):
    """Container for one Q&A entry scraped from the site."""
    title = scrapy.Field()    # entry title
    content = scrapy.Field()  # entry body text
    url = scrapy.Field()      # page URL the entry came from
    number = scrapy.Field()   # entry number shown on the page
创建CrawlSpider，使用模板crawl
scrapy genspider -t crawl SB www.bjrbj.gov.cn
SB.py
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from shebao.items import ShebaoItem


class SbSpider(CrawlSpider):
    """CrawlSpider for the Q&A list on www.bjrbj.gov.cn.

    Follows pagination links from the list page and parses every
    detail page into a :class:`ShebaoItem`.
    """
    name = 'SB'
    allowed_domains = ['www.bjrbj.gov.cn']
    start_urls = ['http://www.bjrbj.gov.cn/mzhd/list_more.htm?stid=-1&ps=10&ishot=0&pn=1&ps=10']

    rules = (
        # BUG FIX: the original patterns used "d+" (a literal "d" then "+"),
        # which never matches a page number — the digit class needs "\d+".
        # Pagination links: follow but do not parse.
        Rule(LinkExtractor(allow=r'&pn=\d+'), follow=True),
        # Detail pages: escape the dot so ".htm" is matched literally.
        Rule(LinkExtractor(allow=r'/mzhd/detail_\d+\.htm'), callback='parse_item'),
    )

    def parse_item(self, response):
        """Extract one Q&A entry from a detail page and yield it.

        Uses ``extract_first()`` instead of ``extract()[0]`` so a page
        missing one of the nodes yields ``None`` for that field rather
        than crashing the crawl with an IndexError.
        """
        item = ShebaoItem()
        item['title'] = response.xpath('//div[@class="xx_neirong"]/h1/text()').extract_first()
        # entry number
        item['number'] = response.xpath('//p[@class="jz_p1"]/text()').extract_first()
        item['content'] = response.xpath('//p[@class="jz_p2"]/text()').extract_first()
        # source link
        item['url'] = response.url

        yield item
pipelines.py
import json


class ShebaoPipeline(object):
    """Write each scraped item to ``shebao.json`` as JSON Lines.

    The original implementation appended ``", "`` after every object,
    producing a file that no JSON parser could load; one object per
    line (JSON Lines) is both loadable and streamable.
    """

    def __init__(self):
        # Explicit UTF-8: ensure_ascii=False emits raw non-ASCII
        # characters, which would raise UnicodeEncodeError on platforms
        # whose default file encoding cannot represent them.
        self.filename = open("shebao.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        """Serialize *item* as one JSON line and pass it through."""
        text = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.filename.write(text)
        return item

    def close_spider(self, spider):
        """Flush and close the output file when the spider finishes."""
        self.filename.close()
settings.py
# Scrapy project identity.
BOT_NAME = 'shebao'

# Where Scrapy discovers spiders, and where `genspider` creates new ones.
SPIDER_MODULES = ['shebao.spiders']
NEWSPIDER_MODULE = 'shebao.spiders'


# Enable the JSON-writing pipeline; 300 is its ordering priority
# (lower numbers run first when several pipelines are enabled).
ITEM_PIPELINES = {
    'shebao.pipelines.ShebaoPipeline': 300,
}

# Redirect all log output at DEBUG level and above into a file
# instead of the console.
LOG_FILE = "dg.log"
LOG_LEVEL = "DEBUG"
执行
scrapy crawl SB