Environment: CentOS 6 + Python 3
Install: pip3 install scrapy
Error: src/twisted/test/raiser.c:4:20: error: Python.h: No such file or directory
src/twisted/test/raiser.c:6:6: error: #error Python headers needed to compile C extensions, please install development version of Python.
error: command 'gcc' failed with exit status 1
Fix: install the Python header files and static libraries (python-devel).
yum search python3 | grep devel    # find the devel package for python3
yum install -y python34-devel.x86_64    # install python34-devel.x86_64
pip3 install scrapy    # succeeds this time
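A quick import check confirms the install before moving on (the version printed will differ by environment):

python3 -c "import scrapy; print(scrapy.__version__)"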
1. Create the project
cd /home/chaoge/mypython/crawler/
scrapy startproject myscrapy
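startproject generates roughly the following layout (names can vary slightly between Scrapy versions); the files edited below live in the inner myscrapy/ package:

myscrapy/
    scrapy.cfg
    myscrapy/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py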
vi items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class MyscrapyItem(scrapy.Item):
    # define the fields for your item here like:
    # position name
    positionName = scrapy.Field()
    # position link
    positionLink = scrapy.Field()
    # position type
    positionType = scrapy.Field()
    # number of openings
    peopleNum = scrapy.Field()
    # work location
    workLocation = scrapy.Field()
    # publish time
    publishTime = scrapy.Field()
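A Scrapy Item behaves like a dict, which is why the pipeline below can call dict(item) directly; a throwaway illustration, not part of the project files:

# illustration only: Item fields are assigned and read like dict keys
from myscrapy.items import MyscrapyItem

item = MyscrapyItem()
item['positionName'] = 'Test Engineer'
print(dict(item))    # {'positionName': 'Test Engineer'}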
vi pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import json


class MyscrapyPipeline(object):
    def __init__(self):
        self.filename = open("tencent.json", "wb")

    def process_item(self, item, spider):
        # write each item as one JSON object per line
        text = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.filename.write(text.encode("utf-8"))
        return item

    def close_spider(self, spider):
        self.filename.close()
vi settings.py
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # 'Accept-Language': 'en',
}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'myscrapy.middlewares.MyscrapySpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'myscrapy.middlewares.MyscrapyDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'myscrapy.pipelines.MyscrapyPipeline': 300,
}

2. Create the spider class
cd myscrapy/myscrapy/spiders
scrapy genspider tencent "tencent.com"
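genspider only writes a stub; before editing it looks roughly like this (template output differs slightly between Scrapy versions):

# spiders/tencent.py as generated (approximate)
import scrapy


class TencentSpider(scrapy.Spider):
    name = 'tencent'
    allowed_domains = ['tencent.com']
    start_urls = ['http://tencent.com/']

    def parse(self, response):
        pass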
vi tencent.py
# -*- coding: utf-8 -*-
import scrapy
from myscrapy.items import MyscrapyItem


class TencentSpider(scrapy.Spider):
    name = 'tencent'
    allowed_domains = ['tencent.com']
    url = "http://hr.tencent.com/position.php?&start="
    offset = 0
    #start_urls = ['http://tencent.com/']
    start_urls = [url + str(offset)]

    def parse(self, response):
        for each in response.xpath("//tr[@class='even']|//tr[@class='odd']"):
            # initialize an item object
            item = MyscrapyItem()
            # position name
            item['positionName'] = each.xpath("./td[1]/a/text()").extract()[0]
            # position link
            item['positionLink'] = each.xpath("./td[1]/a/@href").extract()[0]
            # position type
            item['positionType'] = each.xpath("./td[2]/text()").extract()[0]
            # number of openings
            item['peopleNum'] = each.xpath("./td[3]/text()").extract()[0]
            # work location
            item['workLocation'] = each.xpath("./td[4]/text()").extract()[0]
            # publish time
            item['publishTime'] = each.xpath("./td[5]/text()").extract()[0]
            # hand the item over to the pipeline
            yield item

        if self.offset < 50:
            self.offset += 10
            # send a new request back to the scheduler: it is queued,
            # dequeued and handed to the downloader
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
        else:
            print("end.")

3. Run the spider
scrapy crawl tencent
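When the crawl finishes, tencent.json holds one JSON object per line (the format written by the pipeline above); a small sketch for reading it back:

# read_tencent.py - minimal sketch, assumes one JSON object per line
import json

with open("tencent.json", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if line:
            job = json.loads(line)
            print(job["positionName"], job["workLocation"])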
Simulated login
# -*- coding: utf-8 -*-
import scrapy


class RenrenspiderSpider(scrapy.Spider):
    name = 'renrenspider'
    allowed_domains = ['renren.com']
    #start_urls = ['http://renren.com/']

    def start_requests(self):
        url = 'http://www.renren.com/PLogin.do'
        # POST the login form; on success parse_page receives the logged-in response
        yield scrapy.FormRequest(
            url=url,
            formdata={"email": "XXXX@163.com", "password": "XXXXXX"},
            callback=self.parse_page,
        )

    def parse_page(self, response):
        # save the page returned after login for inspection
        with open("info.html", "wb") as filename:
            filename.write(response.body)
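If the login page carries hidden inputs (tokens, timestamps and so on), scrapy.FormRequest.from_response can copy them from the form automatically instead of posting to PLogin.do by hand. A minimal alternative sketch; it assumes the login form sits on the Renren home page and uses the same email/password field names, which is not verified here:

# alternative sketch using FormRequest.from_response
import scrapy


class RenrenFormSpider(scrapy.Spider):
    name = 'renrenform'
    allowed_domains = ['renren.com']
    start_urls = ['http://www.renren.com/']

    def parse(self, response):
        # from_response keeps any hidden form fields and overrides only these two
        yield scrapy.FormRequest.from_response(
            response,
            formdata={"email": "XXXX@163.com", "password": "XXXXXX"},
            callback=self.parse_page,
        )

    def parse_page(self, response):
        with open("info.html", "wb") as filename:
            filename.write(response.body)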
Downloading images:
Error at run time: File "/usr/lib64/python3.4/site-packages/scrapy/pipelines/images.py", line 15, in <module>
from PIL import Image
ImportError: No module named 'PIL'
Fix: pip3 install pillow
vi items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class DouyuItem(scrapy.Item):
    # define the fields for your item here like:
    nickname = scrapy.Field()
    imagelink = scrapy.Field()
    imagepath = scrapy.Field()

vi pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import os

import scrapy
from scrapy.utils.project import get_project_settings
from scrapy.pipelines.images import ImagesPipeline


class DouyuPipeline(ImagesPipeline):
    # read the image storage directory from settings
    IMAGES_STORE = get_project_settings().get('IMAGES_STORE')

    def get_media_requests(self, item, info):
        # hand each avatar URL to the ImagesPipeline machinery for download
        image_url = item['imagelink']
        yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        # results is a list of (success, info) tuples, one per request
        image_path = [x['path'] for ok, x in results if ok]
        # rename the downloaded file to <nickname>.jpg
        os.rename(self.IMAGES_STORE + "/" + image_path[0],
                  self.IMAGES_STORE + "/" + item['nickname'] + ".jpg")
        item['imagepath'] = self.IMAGES_STORE + "/" + item['nickname'] + ".jpg"
        return item
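The pipeline reads IMAGES_STORE from the project settings, so the douyu project's settings.py has to register the pipeline and define that path; a minimal sketch (the directory is only an example):

# settings.py (douyu project) - relevant entries only
ITEM_PIPELINES = {
    'douyu.pipelines.DouyuPipeline': 300,
}
# directory where ImagesPipeline saves downloaded files
IMAGES_STORE = '/home/chaoge/mypython/crawler/douyu/images'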
vi spiders/douyuavatar.py
# -*- coding: utf-8 -*-
import json

import scrapy
from douyu.items import DouyuItem


class DouyuavatarSpider(scrapy.Spider):
    name = 'douyuavatar'
    allowed_domains = ['capi.douyucdn.cn']
    url = "http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset="
    offset = 0
    start_urls = [url + str(offset)]

    def parse(self, response):
        # convert the JSON response body into a Python structure
        data = json.loads(response.text)['data']
        for value in data:
            item = DouyuItem()
            item['nickname'] = value['nickname']
            item['imagelink'] = value['vertical_src']
            yield item

        if self.offset < 50:
            self.offset += 20
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
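Run it the same way as the previous project:

scrapy crawl douyuavatar

The avatars land in IMAGES_STORE and are then renamed to <nickname>.jpg by item_completed above.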