A few common Scrapy commands:
scrapy startproject xxxx            (create a new project)
scrapy genspider xxx www.ooo.com    (generate a spider named xxx for the given start domain)
scrapy crawl xxx                    (run the spider named xxx)
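For instance, the qiubai project shown in the next section could be scaffolded and run like this. The project name qiubai01 and spider name qiubai come from the code below; the exact sequence is the usual workflow rather than something spelled out in the original notes:

scrapy startproject qiubai01
cd qiubai01
scrapy genspider qiubai www.qiushibaike.com
scrapy crawl qiubai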
Pipeline-based persistent storage:
1. Parse the data.
2. Define the relevant fields in the item class.
3. Instantiate an item object in the parse method.
4. Store the parsed data in the item object.
5. Submit the item object to the pipeline with yield item.
6. Receive the data in process_item and persist it there.
7. Enable the pipeline in the settings file (see the settings sketch right after this list).
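For step 7, the pipeline must be registered in settings.py, otherwise process_item is never called. A minimal sketch, assuming the qiubai01 project layout used in the code below (300 is just the conventional priority value; lower numbers run earlier):

# settings.py of the qiubai01 project (sketch)
ITEM_PIPELINES = {
    'qiubai01.pipelines.Qiubai01Pipeline': 300,
}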
# -*- coding: utf-8 -*-
import scrapy
from qiubai01.items import Qiubai01Item


class QiubaiSpider(scrapy.Spider):
    name = 'qiubai'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.qiushibaike.com/text/']

    # def parse(self, response):
    #     # response is the response object for the request
    #     div_list = response.xpath('//*[@id="content-left"]/div')
    #     for div in div_list:
    #         # author = div.xpath('./div[1]/a[2]/h2/text()')[0].extract()
    #         # the line below is an equivalent way to write it
    #         author = div.xpath('./div[1]/a[2]/h2/text()').extract_first()
    #         content = div.xpath('./a/div/span//text()').extract()
    #         content = "".join(content)
    #         print(author)
    #         print()
    #         print(content)

    # terminal-command-based storage: scrapy crawl qiubai -o qiushi.csv
    # def parse(self, response):
    #     # response is the response object for the request
    #     div_list = response.xpath('//*[@id="content-left"]/div')
    #     all_data_list = []
    #     for div in div_list:
    #         # author = div.xpath('./div[1]/a[2]/h2/text()')[0].extract()
    #         # the line below is an equivalent way to write it
    #         author = div.xpath('./div[1]/a[2]/h2/text()').extract_first()
    #         content = div.xpath('./a/div/span//text()').extract()
    #         content = "".join(content)
    #         dic = {}
    #         dic['author'] = author
    #         dic['content'] = content
    #         all_data_list.append(dic)
    #     return all_data_list

    # pipeline-based persistent storage

    def parse(self, response):
        """
        1. Parse the data.
        2. Define the relevant fields in the item class.
        3. Instantiate an item object in the parse method.
        4. Store the parsed data in the item object.
        5. Submit the item object to the pipeline with yield item.
        6. Receive the data in process_item and persist it there.
        7. Enable the pipeline in the settings file.
        """
        # response is the response object for the request
        div_list = response.xpath('//*[@id="content-left"]/div')

        for div in div_list:
            # author = div.xpath('./div[1]/a[2]/h2/text()')[0].extract()
            # the line below is an equivalent way to write it
            author = div.xpath('./div[1]/a[2]/h2/text()').extract_first()
            content = div.xpath('./a/div/span//text()').extract()
            content = "".join(content)
            item = Qiubai01Item()
            item['author'] = author
            item['content'] = content
            # submit the item to the pipeline
            yield item
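The spider above imports Qiubai01Item, whose definition is not shown in these notes. A minimal sketch of what the project's items.py would need to contain, mirroring the author and content fields assigned in parse:

# items.py of the qiubai01 project (sketch)
import scrapy


class Qiubai01Item(scrapy.Item):
    author = scrapy.Field()
    content = scrapy.Field()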
Full-site data crawling (crawling every page of a listing):
1. Spider that parses the data pages
# -*- coding: utf-8 -*-
import scrapy
from choutipro.items import ChoutiproItem


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    # allowed_domains = ['https://dig.chouti.com/']
    start_urls = ['https://dig.chouti.com/all/hot/recent/1']

    pageNum = 1
    url = "https://dig.chouti.com/all/hot/recent/%s"

    def parse(self, response):
        div_list = response.xpath('//*[@id="content-list"]/div')
        for div in div_list:
            # //*[@id="newsContent26168656"]/div[1]/a[1]
            # //*[@id="newsContent26168656"]/div[1]
            # //*[@id="newsContent26168656"]
            content = div.xpath('./div[4]/div[1]/a[1]/text()').extract_first()
            author = div.xpath('./div[4]/div[2]/a[4]/b/text()').extract_first()
            item = ChoutiproItem()
            item['author'] = author
            item['content'] = content
            yield item

        # full-site crawling: keep requesting the next page until page 120
        if self.pageNum <= 120:
            self.pageNum += 1
            new_url = self.url % str(self.pageNum)
            print(new_url)
            yield scrapy.Request(url=new_url, callback=self.parse)
2. Item class definition
import scrapy


class ChoutiproItem(scrapy.Item):
    # define the fields for your item here like:
    author = scrapy.Field()
    content = scrapy.Field()
3. Pipeline file
class ChoutiproPipeline(object):
    file = None

    def open_spider(self, spider):
        self.file = open("./test.txt", "a", encoding='utf-8')

    def process_item(self, item, spider):
        # author = item['author']
        content = item['content'].strip()
        try:
            self.file.write(content + ' ')
        except:
            pass
        return item

    def close_spider(self, spider):
        self.file.close()
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import pymysql


class Qiubai01Pipeline(object):
    fp = None

    def open_spider(self, spider):
        self.fp = open('./qiubai.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        author = item['author']
        content = item['content']
        print(type(author), type(content))
        try:
            self.fp.write(author + ":" + content)
        except:
            pass
        return item

    def close_spider(self, spider):
        self.fp.close()


class MySql01Pipeline(object):
    conn = None
    cursor = None

    def open_spider(self, spider):
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='', password='', db='spider')

    def process_item(self, item, spider):
        self.cursor = self.conn.cursor()
        try:
            self.cursor.execute('insert into qiubqi values ("%s","%s")' % (item['author'], item['content']))
            self.conn.commit()
        except:
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
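Both pipeline classes above only run once they are registered in settings.py. A minimal sketch for the qiubai01 project (the numbers are priorities; lower values run first, so the text-file pipeline runs before the MySQL one):

# settings.py of the qiubai01 project (sketch)
ITEM_PIPELINES = {
    'qiubai01.pipelines.Qiubai01Pipeline': 300,   # writes qiubai.txt
    'qiubai01.pipelines.MySql01Pipeline': 301,    # inserts rows into MySQL
}

As a side note, building the INSERT statement with % string formatting is fragile; letting pymysql do the escaping with cursor.execute('insert into qiubqi values (%s, %s)', (item['author'], item['content'])) is the safer form.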
4. Log level handling
In settings.py, set LOG_LEVEL = 'ERROR' to print only error-level messages, or set LOG_FILE = 'log.txt' to redirect the log to a file; either works.
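Both options are plain assignments in settings.py; a sketch:

# settings.py (sketch)
LOG_LEVEL = 'ERROR'      # only show error-level log output in the console
# LOG_FILE = 'log.txt'   # or write the full log to a file instead of the console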
5. Request parameter passing: when the data you need for one item is spread across different pages, pass data along with the follow-up request; in practice you attach it through the meta argument of scrapy.Request and read it back from response.meta in the callback, which also means the callback has to be written to accept that extra data. A sketch follows below.
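A minimal sketch of the pattern, reusing ChoutiproItem from above purely for illustration; MetaDemoSpider, parse_detail, detail_url and the detail-page xpaths are hypothetical names invented for this example, not part of the original notes:

import scrapy
from choutipro.items import ChoutiproItem  # reused here only for illustration


class MetaDemoSpider(scrapy.Spider):
    # hypothetical spider showing how to pass an item between callbacks
    name = 'meta_demo'
    start_urls = ['https://dig.chouti.com/all/hot/recent/1']

    def parse(self, response):
        for div in response.xpath('//*[@id="content-list"]/div'):
            item = ChoutiproItem()
            item['author'] = div.xpath('./div[4]/div[2]/a[4]/b/text()').extract_first()
            # assumed: each entry links to a detail page whose URL sits in the href attribute
            detail_url = div.xpath('./div[4]/div[1]/a[1]/@href').extract_first()
            # hand the half-filled item to the next callback through meta
            yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={'item': item})

    def parse_detail(self, response):
        # read the item back out of response.meta and finish filling it in
        item = response.meta['item']
        item['content'] = response.xpath('//title/text()').extract_first()  # placeholder xpath
        yield item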