• Scraping consultation questions from the Human Resources and Social Security Bureau


    Create the project

    scrapy startproject shebao
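
    Running startproject generates the standard Scrapy skeleton; a rough sketch of the layout (middlewares.py may be absent in older Scrapy versions, and the spider file SB.py is only added by the genspider step below):

        shebao/
            scrapy.cfg
            shebao/
                __init__.py
                items.py
                middlewares.py
                pipelines.py
                settings.py
                spiders/
                    __init__.py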

    items.py

    import scrapy


    class ShebaoItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        title = scrapy.Field()
        content = scrapy.Field()
        url = scrapy.Field()
        number = scrapy.Field()
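
    A scrapy.Item behaves much like a dict restricted to the declared fields, which is what the pipeline later relies on via dict(item). A minimal sketch (the sample values are made up, and it assumes the shebao package is importable, e.g. from the project root):

        from shebao.items import ShebaoItem

        # fields can be set at construction time or by key
        item = ShebaoItem(title="example title")
        item['content'] = "example content"

        print(dict(item))       # {'title': 'example title', 'content': 'example content'}
        # item['typo'] = "x"    # would raise KeyError: undeclared field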

    Create a CrawlSpider using the crawl template

    scrapy genspider -t crawl SB www.bjrbj.gov.cn

    SB.py

    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    from shebao.items import ShebaoItem


    class SbSpider(CrawlSpider):
        name = 'SB'
        allowed_domains = ['www.bjrbj.gov.cn']
        start_urls = ['http://www.bjrbj.gov.cn/mzhd/list_more.htm?stid=-1&ps=10&ishot=0&pn=1&ps=10']

        rules = (
            # follow the pagination links
            Rule(LinkExtractor(allow=r'&pn=\d+'), follow=True),
            # parse each question detail page
            Rule(LinkExtractor(allow=r'/mzhd/detail_\d+\.htm'), callback='parse_item'),
            #Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
        )

        def parse_item(self, response):
            item = ShebaoItem()
            # question title
            item['title'] = response.xpath('//div[@class="xx_neirong"]/h1/text()').extract()[0]
            # question number
            item['number'] = response.xpath('//p[@class="jz_p1"]/text()').extract()[0]
            # question content
            item['content'] = response.xpath('//p[@class="jz_p2"]/text()').extract()[0]
            # page URL
            item['url'] = response.url

            yield item
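
    The two rules split the crawl: the first one follows the pagination links (&pn=<page>) without a callback, the second sends every question detail page to parse_item. The regexes can be sanity-checked outside Scrapy; a small sketch (the sample URLs below are hypothetical, only the patterns come from the rules):

        import re

        list_url   = "http://www.bjrbj.gov.cn/mzhd/list_more.htm?stid=-1&ps=10&ishot=0&pn=2"
        detail_url = "http://www.bjrbj.gov.cn/mzhd/detail_123456.htm"   # made-up id

        print(bool(re.search(r'&pn=\d+', list_url)))                   # True -> followed for pagination
        print(bool(re.search(r'/mzhd/detail_\d+\.htm', detail_url)))   # True -> handed to parse_item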

    pipelines.py

    import json


    class ShebaoPipeline(object):

        def __init__(self):
            # open the output file once when the spider starts
            self.filename = open("shebao.json", "w", encoding="utf-8")

        def process_item(self, item, spider):
            # one JSON object per line, with a trailing comma
            text = json.dumps(dict(item), ensure_ascii=False) + ",\n"
            self.filename.write(text)
            return item

        def close_spider(self, spider):
            self.filename.close()
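
    Because every record is written as a JSON object followed by ",\n", shebao.json is not a valid JSON document as a whole. A minimal sketch for reading it back line by line (file name as used above):

        import json

        with open("shebao.json", encoding="utf-8") as f:
            # strip the trailing comma from each line before parsing
            items = [json.loads(line.rstrip().rstrip(",")) for line in f if line.strip()]

        print(len(items), items[:1])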

    settings.py

    BOT_NAME = 'shebao'

    SPIDER_MODULES = ['shebao.spiders']
    NEWSPIDER_MODULE = 'shebao.spiders'


    ITEM_PIPELINES = {
        'shebao.pipelines.ShebaoPipeline': 300,
    }

    LOG_FILE = "dg.log"
    LOG_LEVEL = "DEBUG"
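
    Only the pipeline and logging settings are shown above. Depending on the Scrapy version, two more settings are often needed for a crawl like this; treat them as assumptions rather than part of the original configuration:

        # assumed additions, not shown in the original post
        ROBOTSTXT_OBEY = False   # newer project templates default this to True
        DOWNLOAD_DELAY = 1       # throttle requests to the government site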

    Run the spider

    scrapy crawl SB
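
    As an aside, if the goal is only a JSON file, Scrapy's built-in feed export can replace the custom pipeline and produces a valid JSON array directly, for example:

        scrapy crawl SB -o shebao.json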

     
  • Original article: https://www.cnblogs.com/wanglinjie/p/9231519.html