• Scraping consultation questions from the Human Resources and Social Security Bureau


    Create the project

    scrapy startproject shebao
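
    For reference, scrapy startproject generates Scrapy's standard project skeleton, roughly:

        shebao/
            scrapy.cfg
            shebao/
                __init__.py
                items.py
                middlewares.py
                pipelines.py
                settings.py
                spiders/
                    __init__.py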

    items.py

    import scrapy


    class ShebaoItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        title = scrapy.Field()
        content = scrapy.Field()
        url = scrapy.Field()
        number = scrapy.Field()
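
    A ShebaoItem behaves like a dict, which is what the pipeline below relies on when it calls dict(item). A quick interactive sketch:

        >>> from shebao.items import ShebaoItem
        >>> item = ShebaoItem()
        >>> item['title'] = 'example question'
        >>> dict(item)
        {'title': 'example question'}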

    Create a CrawlSpider from the crawl template

    scrapy genspider -t crawl SB www.bjrbj.gov.cn

    SB.py

    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    from shebao.items import ShebaoItem

    class SbSpider(CrawlSpider):
        name = 'SB'
        allowed_domains = ['www.bjrbj.gov.cn']
        start_urls = ['http://www.bjrbj.gov.cn/mzhd/list_more.htm?stid=-1&ishot=0&pn=1&ps=10']

        rules = (
            # follow pagination links (pn is the page-number query parameter)
            Rule(LinkExtractor(allow=r'&pn=\d+'), follow=True),
            # hand each question detail page to parse_item
            Rule(LinkExtractor(allow=r'/mzhd/detail_\d+\.htm'), callback='parse_item'),
        )

        def parse_item(self, response):
            item = ShebaoItem()
            item['title'] = response.xpath('//div[@class="xx_neirong"]/h1/text()').extract()[0]
            # reference number
            item['number'] = response.xpath('//p[@class="jz_p1"]/text()').extract()[0]
            item['content'] = response.xpath('//p[@class="jz_p2"]/text()').extract()[0]
            # page URL
            item['url'] = response.url

            yield item
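
    The two allow patterns drive the whole crawl: the first follows the pagination links, the second routes each detail page to parse_item. Note that \d+ (with the backslash) is required to match digits; a bare d+ would only match literal letter d's. The patterns can be sanity-checked in scrapy shell before a full run (a sketch; the matched links depend on the live page):

        scrapy shell 'http://www.bjrbj.gov.cn/mzhd/list_more.htm?stid=-1&ishot=0&pn=1&ps=10'
        >>> from scrapy.linkextractors import LinkExtractor
        >>> LinkExtractor(allow=r'/mzhd/detail_\d+\.htm').extract_links(response)[:5]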

    pipelines.py

    import json

    class ShebaoPipeline(object):

        def __init__(self):
            # utf-8 matters because ensure_ascii=False writes Chinese text as-is
            self.filename = open("shebao.json", "w", encoding="utf-8")

        def process_item(self, item, spider):
            text = json.dumps(dict(item), ensure_ascii=False) + ",\n"
            self.filename.write(text)
            return item

        def close_spider(self, spider):
            self.filename.close()
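
    As written, shebao.json is a sequence of comma-terminated JSON objects, not one valid JSON document, so json.load cannot read it back directly. A common alternative is JSON Lines, one object per line; a sketch (not part of the original post, and the class name ShebaoJsonLinesPipeline is made up):

        import json

        class ShebaoJsonLinesPipeline(object):
            """Hypothetical variant of the pipeline that writes JSON Lines."""

            def open_spider(self, spider):
                self.file = open("shebao.jl", "w", encoding="utf-8")

            def process_item(self, item, spider):
                # one JSON object per line; no trailing comma needed
                self.file.write(json.dumps(dict(item), ensure_ascii=False) + "\n")
                return item

            def close_spider(self, spider):
                self.file.close()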

    settings.py
    BOT_NAME = 'shebao'

    SPIDER_MODULES = ['shebao.spiders']
    NEWSPIDER_MODULE = 'shebao.spiders'


    ITEM_PIPELINES = {
        'shebao.pipelines.ShebaoPipeline': 300,
    }

    LOG_FILE = "dg.log"
    LOG_LEVEL = "DEBUG"
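
    Depending on how hard you want to hit the site, two standard Scrapy settings can throttle the crawl; these are assumed additions on my part, not settings from the original post:

        # optional politeness settings (not in the original post)
        DOWNLOAD_DELAY = 1                  # wait 1 second between requests
        CONCURRENT_REQUESTS_PER_DOMAIN = 8  # cap parallel requests to the domain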

    Run the crawl

    scrapy crawl SB
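
    Scrapy's built-in feed export can also capture the items without the custom pipeline; -o appends the scraped items to the named file (the filename here is arbitrary):

        scrapy crawl SB -o shebao_items.json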

     
  • Original post: https://www.cnblogs.com/wanglinjie/p/9231519.html