步骤1.建立工程和Spider
scrapy startproject BaiduStocks
cd BaiduStocks
scrapy genspider stocks baidu.com
步骤2.编写爬虫Spider
配置stocks.py文件
修改返回页面的处理
修改对新增url爬取请求的处理
import scrapy
import re


class StocksSpider(scrapy.Spider):
    """Crawl stock codes from Eastmoney's listing page, then scrape each
    stock's detail page on Baidu Gupiao and yield one info dict per stock."""

    name = "stocks"
    start_urls = ['http://quote.eastmoney.com/stocklist.html']

    def parse(self, response):
        """Find every link whose href contains a stock code
        (``sh``/``sz`` followed by 6 digits) and schedule a request
        for that stock's detail page."""
        for href in response.css('a::attr(href)').extract():
            try:
                # First match like "sh600000" / "sz000001" in the href.
                stock = re.findall(r"[s][hz]\d{6}", href)[0]
                url = 'https://gupiao.baidu.com/stock/' + stock + '.html'
                yield scrapy.Request(url, callback=self.parse_stock)
            except IndexError:
                # Link carries no stock code — skip it.
                continue

    def parse_stock(self, response):
        """Parse one stock detail page into a dict of field name -> value."""
        infoDict = {}
        stockInfo = response.css('.stock-bets')
        name = stockInfo.css('.bets-name').extract()[0]
        keyList = stockInfo.css('dt').extract()
        valueList = stockInfo.css('dd').extract()
        for i in range(len(keyList)):
            # Strip the "<dt ...>" / "</dt>" markup around the field name.
            key = re.findall(r'>.*</dt>', keyList[i])[0][1:-5]
            try:
                # Numeric value up to the closing "</dd>" tag.
                val = re.findall(r'\d+\.?.*</dd>', valueList[i])[0][0:-5]
            except IndexError:
                val = '--'
            infoDict[key] = val
        # Stock name = text before "(" plus the code inside the tag.
        infoDict.update(
            {'股票名称': re.findall(r'\s.*\(', name)[0].split()[0] +
                         re.findall(r'>.*<', name)[0][1:-1]})
        yield infoDict
步骤3.编写Pipelines.py文件
定义对爬取项(Scraped Item)的处理类
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


class BaidustocksPipeline(object):
    """Default pipeline generated by Scrapy; passes items through unchanged."""

    def process_item(self, item, spider):
        return item


class BaidustocksInfoPipeline(object):
    """Write every scraped item to BaiduStockInfo.txt, one dict per line.

    Name matches the 'BaiduStocks.pipelines.BaidustocksInfoPipeline' entry
    registered in ITEM_PIPELINES in settings.py.
    """

    def open_spider(self, spider):
        # Called once when the spider starts: open the output file.
        self.f = open('BaiduStockInfo.txt', 'w')

    def close_spider(self, spider):
        # Called once when the spider finishes: release the file handle.
        self.f.close()

    def process_item(self, item, spider):
        try:
            line = str(dict(item)) + '\n'
            self.f.write(line)
        except Exception:
            # Best effort: a malformed item must not abort the crawl.
            pass
        return item
配置ITEM_PIPELINES选项
修改settings.py
# settings.py: register the pipeline that writes items to disk.
# 300 is the execution-order priority (lower runs earlier).
ITEM_PIPELINES = {
    'BaiduStocks.pipelines.BaidustocksInfoPipeline': 300,
}