• Scrapy framework


    A few common Scrapy commands (a concrete sequence is sketched below):
      scrapy startproject xxxx          # create a new project
      scrapy genspider xxx www.ooo.com  # generate a spider named xxx for the given domain
      scrapy crawl xxx                  # run the spider named xxx
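    For example, the qiubai project used in the rest of these notes would be set up roughly like this (the genspider start domain is simply whichever site you intend to crawl):
      scrapy startproject qiubai01
      cd qiubai01
      scrapy genspider qiubai www.qiushibaike.com
      scrapy crawl qiubai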
    Pipeline-based persistent storage:
      1. Parse the data.
      2. Define the corresponding fields in the item class.
      3. Instantiate an item object inside the parse method.
      4. Store the parsed data in the item object.
      5. Submit the item object to the pipeline with yield item.
      6. Receive the data in process_item and persist it there.
      7. Enable the pipeline in the settings file (steps 2 and 7 are sketched after the spider code below).
    # -*- coding: utf-8 -*-
    import scrapy
    from qiubai01.items import Qiubai01Item


    class QiubaiSpider(scrapy.Spider):
        name = 'qiubai'
        # allowed_domains = ['www.xxx.com']
        start_urls = ['https://www.qiushibaike.com/text/']

        # def parse(self, response):
        #     # response is the response object for the request
        #     div_list = response.xpath('//*[@id="content-left"]/div')
        #     for div in div_list:
        #         # author = div.xpath("./div[1]/a[2]/h2/text()")[0].extract()
        #         # the line below is an equivalent way to write it
        #         author = div.xpath("./div[1]/a[2]/h2/text()").extract_first()
        #         content = div.xpath('./a/div/span//text()').extract()
        #         content = "".join(content)
        #         print(author)
        #         print()
        #         print(content)

        # terminal-based storage:  scrapy crawl qiubai -o qiushi.csv
        # def parse(self, response):
        #     # response is the response object for the request
        #     div_list = response.xpath('//*[@id="content-left"]/div')
        #     all_data_list = []
        #     for div in div_list:
        #         # author = div.xpath("./div[1]/a[2]/h2/text()")[0].extract()
        #         # the line below is an equivalent way to write it
        #         author = div.xpath("./div[1]/a[2]/h2/text()").extract_first()
        #         content = div.xpath('./a/div/span//text()').extract()
        #         content = "".join(content)
        #         dic = {}
        #         dic['author'] = author
        #         dic['content'] = content
        #         all_data_list.append(dic)
        #     return all_data_list

        # pipeline-based persistent storage

        def parse(self, response):
            """
            1. Parse the data.
            2. Define the corresponding fields in the item class.
            3. Instantiate an item object inside the parse method.
            4. Store the parsed data in the item object.
            5. Submit the item object to the pipeline with yield item.
            6. Receive the data in process_item and persist it there.
            7. Enable the pipeline in the settings file.
            """
            # response is the response object for the request
            div_list = response.xpath('//*[@id="content-left"]/div')

            for div in div_list:
                # author = div.xpath("./div[1]/a[2]/h2/text()")[0].extract()
                # the line below is an equivalent way to write it
                author = div.xpath("./div[1]/a[2]/h2/text()").extract_first()
                content = div.xpath('./a/div/span//text()').extract()
                content = "".join(content)
                item = Qiubai01Item()
                item['author'] = author
                item['content'] = content
                # submit the item to the pipeline
                yield item
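    The spider above covers steps 1 and 3-5. A minimal sketch of the item class (step 2) and the settings entry that enables the pipeline (step 7); the class and field names follow the spider above, while the priority value 300 is just the usual template default:

    # qiubai01/items.py
    import scrapy


    class Qiubai01Item(scrapy.Item):
        # one Field per attribute assigned in the spider
        author = scrapy.Field()
        content = scrapy.Field()

    # qiubai01/settings.py
    ITEM_PIPELINES = {
        'qiubai01.pipelines.Qiubai01Pipeline': 300,  # lower number = higher priority
    }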

     Full-site data crawling (crawling every page of a listing):

    1. Parse the data pages (the spider)

    # -*- coding: utf-8 -*-
    import scrapy
    from choutipro.items import ChoutiproItem


    class ChoutiSpider(scrapy.Spider):
        name = 'chouti'
        # allowed_domains = ['https://dig.chouti.com/']
        start_urls = ['https://dig.chouti.com/all/hot/recent/1']

        # page counter and URL template used to generate requests for the next pages
        pageNum = 1
        url = "https://dig.chouti.com/all/hot/recent/%s"

        def parse(self, response):
            div_list = response.xpath('//*[@id="content-list"]/div')
            for div in div_list:
                # sample nodes the relative xpaths below were derived from:
                # //*[@id="newsContent26168656"]/div[1]/a[1]
                # //*[@id="newsContent26168656"]/div[1]
                # //*[@id="newsContent26168656"]
                content = div.xpath('./div[4]/div[1]/a[1]/text()').extract_first()
                author = div.xpath('./div[4]/div[2]/a[4]/b/text()').extract_first()
                item = ChoutiproItem()
                item['author'] = author
                item['content'] = content
                yield item
            # manually request the next page and reuse parse as the callback
            if self.pageNum <= 120:
                self.pageNum += 1
                new_url = self.url % str(self.pageNum)
                print(new_url)
                yield scrapy.Request(url=new_url, callback=self.parse)

    2. Item class definition

    import scrapy


    class ChoutiproItem(scrapy.Item):
        # define the fields for your item here like:
        author = scrapy.Field()
        content = scrapy.Field()

    3. Pipeline file

    class ChoutiproPipeline(object):
        file = None

        def open_spider(self, spider):
            # called once when the spider starts: open the output file
            self.file = open("./test.txt", "a", encoding='utf-8')

        def process_item(self, item, spider):
            # author = item['author']
            content = item['content'].strip()
            try:
                self.file.write(content + '\n')
            except Exception:
                pass
            return item

        def close_spider(self, spider):
            # called once when the spider closes: release the file handle
            self.file.close()
    # -*- coding: utf-8 -*-

    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

    import pymysql


    class Qiubai01Pipeline(object):
        fp = None

        def open_spider(self, spider):
            self.fp = open('./qiubai.txt', 'w', encoding='utf-8')

        def process_item(self, item, spider):
            author = item['author']
            content = item['content']
            print(type(author), type(content))
            try:
                self.fp.write(author + ":" + content + '\n')
            except Exception:
                pass
            # return the item so the next pipeline (MySql01Pipeline) also receives it
            return item

        def close_spider(self, spider):
            self.fp.close()


    class MySql01Pipeline(object):
        conn = None
        cursor = None

        def open_spider(self, spider):
            self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='', password='', db='spider')

        def process_item(self, item, spider):
            self.cursor = self.conn.cursor()
            try:
                # parameterized query avoids quoting and injection problems
                self.cursor.execute('insert into qiubqi values (%s, %s)', (item['author'], item['content']))
                self.conn.commit()
            except Exception:
                self.conn.rollback()
            return item

        def close_spider(self, spider):
            self.cursor.close()
            self.conn.close()
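    Because the file above defines two pipeline classes, both have to be registered in ITEM_PIPELINES for both to run, and each process_item must return the item so the next pipeline receives it. A sketch of the settings entry, assuming the standard project layout; the priority numbers are assumptions (lower runs first):

    # qiubai01/settings.py
    ITEM_PIPELINES = {
        'qiubai01.pipelines.Qiubai01Pipeline': 300,  # write to qiubai.txt first
        'qiubai01.pipelines.MySql01Pipeline': 301,   # then insert into MySQL
    }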

     4. Log level handling

      Set LOG_LEVEL = 'ERROR' in settings.py, or set LOG_FILE = 'log.txt'; either works (a snippet follows below).
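    A minimal sketch of the corresponding lines in the project's settings.py (the log file name is arbitrary):

    # settings.py
    LOG_LEVEL = 'ERROR'       # only print log messages at ERROR level or above
    # LOG_FILE = './log.txt'  # or: redirect all log output to a file instead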

    5. Passing data between requests (request meta): when the fields of one item are spread across different pages, the partially-filled item has to travel with the request to the detail page; the main change is passing it through the meta argument of scrapy.Request and reading it back inside the callback, as sketched below.
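    A minimal sketch of request meta passing; the spider name, URLs, and xpaths here are illustrative, not from the original project:

    # illustrative spider: the list page has the title, the detail page has the
    # body, so the half-filled dict rides along in the request's meta and is
    # finished in the second callback
    import scrapy


    class NewsSpider(scrapy.Spider):
        name = 'news_demo'  # hypothetical spider name
        start_urls = ['https://www.example.com/list']

        def parse(self, response):
            for li in response.xpath('//ul/li'):
                item = {'title': li.xpath('./a/text()').extract_first()}
                detail_url = li.xpath('./a/@href').extract_first()
                # attach the partially-filled item to the request
                yield scrapy.Request(url=detail_url,
                                     callback=self.parse_detail,
                                     meta={'item': item})

        def parse_detail(self, response):
            # take the item back out of meta and fill the remaining field
            item = response.meta['item']
            item['content'] = "".join(response.xpath('//div[@class="content"]//text()').extract())
            yield item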
