• python3 + Scrapy: a Qutoutiao news crawler example


    Project overview

    Crawl Qutoutiao news (http://home.qutoutiao.net/pages/home.html). Specifically:
    1. List pages (JSON): title, introduction, cover image, source, and publish time
    2. Detail pages (HTML): full content and images

    Directory structure
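    The layout below is reconstructed from the modules referenced in this post:

    qutoutiao/
    ├── scrapy.cfg
    └── qutoutiao/
        ├── __init__.py
        ├── items.py
        ├── imagepipelines.py
        ├── middlewares.py
        ├── pipelines.py
        ├── qttconfig.py
        ├── qttutils.py
        ├── settings.py
        └── spiders/
            ├── __init__.py
            └── qutoutiaos.py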

    Generated data file - a single record
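    A hypothetical single record (all values made up; the fields match QutoutiaoItem defined below, with cover and content_images rewritten to local paths by the image pipelines):

    {
        "source_name": "...",
        "title": "...",
        "url": "http://html2.qktoutiao.com/detail/...",
        "introduction": "...",
        "cover": [".../data/2018/04/18/0a79c461.jpg"],
        "publish_time": "2018-04-18 10:00:00",
        "cid": "255",
        "content": "<div class=\"content\">...</div>",
        "content_images": [[".../data/2018/04/18/4fd093d.jpg", "http://.../origin.jpg"]]
    }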


    Main code overview

    Spider:
    # crawls the Qutoutiao list and detail pages
    qutoutiao.spiders.qutoutiaos.QutoutiaosSpider
    Pipelines:
    # cover image handling class
    qutoutiao.imagepipelines.CoverImagePipeline
    # content image handling class
    qutoutiao.imagepipelines.ContentImagePipeline
    # data handling class
    qutoutiao.pipelines.QutoutiaoPipeline
    Middlewares:
    # request header setup class - only the User-Agent is set here
    qutoutiao.middlewares.RandomUserAgent
    # proxy setup class
    qutoutiao.middlewares.RandomProxy

    Custom modules:
    # configuration file
    qutoutiao.qttconfig.py
    # utility class
    qutoutiao.qttutils.QttUtils

    Create the project

    cd /home/chaoge/mypython/crawler/

    scrapy startproject qutoutiao

    Create the spider class (i.e. qutoutiao.spiders.qutoutiaos.QutoutiaosSpider):

    cd qutoutiao/qutoutiao/spiders
    scrapy genspider qutoutiaos "api.1sapp.com"

    Run

    scrapy crawl qutoutiaos

    #scrapy crawl qutoutiaos --nolog               # run without log output
    #scrapy crawl qutoutiaos -o qutoutiaos.json    # export the scraped items to qutoutiaos.json (-o writes items, not the log)

    Implementation

    qutoutiao.qttconfig.py
    # Crawl target domain (Qutoutiao)
    DOMAIN = 'http://home.qutoutiao.net/pages/home.html'

    # Data storage path
    DATA_STORE = '/home/chaoge/mypython/crawler/qutoutiao/data'

    # List example: http://api.1sapp.com/content/outList?cid=255&tn=1&page=1&limit=10
    # List API
    LIST_API = 'http://api.1sapp.com/content/outList?'
    # Records per list request
    LIST_LIMIT = 10
    # Categories
    CATEGORY_INFO = [
        {"cid": 255, "name": "推荐"},  # recommended
        {"cid": 1,   "name": "热点"},  # trending
        {"cid": 6,   "name": "娱乐"},  # entertainment
        {"cid": 5,   "name": "养生"},  # health
        {"cid": 2,   "name": "搞笑"},  # funny
        {"cid": 7,   "name": "科技"},  # technology
        {"cid": 8,   "name": "生活"},  # lifestyle
        {"cid": 10,  "name": "财经"},  # finance
        {"cid": 9,   "name": "汽车"},  # cars
    ]
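    For clarity, this is how a start URL is assembled from the config above - a minimal sketch mirroring the code in the spider:

    from qutoutiao import qttconfig as QttConfig

    # Build the list-API URL for the first category (cid=255), page 1
    category = QttConfig.CATEGORY_INFO[0]
    url = QttConfig.LIST_API + "cid=%s&tn=1&page=1&limit=%s" % (category['cid'], QttConfig.LIST_LIMIT)
    print(url)  # http://api.1sapp.com/content/outList?cid=255&tn=1&page=1&limit=10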


    qutoutiao.qttutils.py
    # -*- coding: utf-8 -*-
    # Qutoutiao utility class

    import time
    import os
    import shutil
    from qutoutiao import qttconfig as QttConfig

    class QttUtils:

        # Get the storage path for the current date
        #
        # @param  [string] action ['remove' deletes the directory, default 'create']
        # @return [string] path/year/month/day/*
        @staticmethod
        def getStorePath(action='create'):
            localtimes = time.localtime()
            year = time.strftime("%Y", localtimes)
            month = time.strftime("%m", localtimes)
            day = time.strftime("%d", localtimes)
            store_path = QttConfig.DATA_STORE + "/%s/%s/%s" % (year, month, day)

            # Remove the directory tree
            if os.path.exists(store_path) and action == 'remove':
                shutil.rmtree(store_path)

            # Create the directory tree (including parents)
            if not os.path.exists(store_path) and action == 'create':
                os.makedirs(store_path)

            return store_path
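    A quick usage sketch (the date directory depends on when it is run):

    from qutoutiao.qttutils import QttUtils

    path = QttUtils.getStorePath()           # creates and returns e.g. .../data/2018/04/18
    QttUtils.getStorePath(action='remove')   # removes that day's directory tree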


    qutoutiao.settings.py
    # -*- coding: utf-8 -*-
    
    # Scrapy settings for qutoutiao project
    #
    # For simplicity, this file contains only settings considered important or
    # commonly used. You can find more settings consulting the documentation:
    #
    #     https://doc.scrapy.org/en/latest/topics/settings.html
    #     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
    #     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
    
    BOT_NAME = 'qutoutiao'
    
    SPIDER_MODULES = ['qutoutiao.spiders']
    NEWSPIDER_MODULE = 'qutoutiao.spiders'
    
    # Log file
    #LOG_FILE = "qutoutiao.log"
    # Log level
    #LOG_LEVEL = "DEBUG"
    
    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    #USER_AGENT = 'qutoutiao (+http://www.yourdomain.com)'
    
    # Obey robots.txt rules
    ROBOTSTXT_OBEY = True
    
    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    #CONCURRENT_REQUESTS = 32
    
    # Configure a delay for requests for the same website (default: 0)
    # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
    # See also autothrottle settings and docs
    #DOWNLOAD_DELAY = 3
    # The download delay setting will honor only one of:
    #CONCURRENT_REQUESTS_PER_DOMAIN = 16
    #CONCURRENT_REQUESTS_PER_IP = 16
    
    # Disable cookies (enabled by default)
    #COOKIES_ENABLED = False
    
    # Disable Telnet Console (enabled by default)
    #TELNETCONSOLE_ENABLED = False
    
    # Override the default request headers:
    #DEFAULT_REQUEST_HEADERS = {
    #    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
    #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    #   'Accept-Language': 'en',
    #}
    
    # Enable or disable spider middlewares
    # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
    SPIDER_MIDDLEWARES = {
    #    'qutoutiao.middlewares.QutoutiaoSpiderMiddleware': 543,
         # Disable OffsiteMiddleware so requests are no longer restricted to the
         # spider's allowed_domains (the detail pages live on other domains)
         'scrapy.spidermiddlewares.offsite.OffsiteMiddleware': None,
    }
    
    # Enable or disable downloader middlewares
    # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
    DOWNLOADER_MIDDLEWARES = {
       'qutoutiao.middlewares.RandomUserAgent': 100,
       'qutoutiao.middlewares.RandomProxy': 200,
    }
    # User-Agent pool used by the RandomUserAgent middleware
    # (values only - the header name must not be part of the string)
    USER_AGENTS = [
            'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
            'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
            'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0',
            'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko',
            'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
            'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11',
            'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
            'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
    ]
    # Proxy pool used by the RandomProxy middleware
    PROXIES = [
            {'ip_port':'121.42.140.113:16816','user_password':'username-xxxx:password-xxxx'},
            {'ip_port':'117.90.137.181:9000'},
            {'ip_port':'117.90.2.151:9000'},
            {'ip_port':'114.235.23.147:9000'},
    ]
    
    
    # Enable or disable extensions
    # See https://doc.scrapy.org/en/latest/topics/extensions.html
    #EXTENSIONS = {
    #    'scrapy.extensions.telnet.TelnetConsole': None,
    #}
    
    # Configure item pipelines
    # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    ITEM_PIPELINES = {
        'qutoutiao.imagepipelines.CoverImagePipeline': 301,    # cover image download
        'qutoutiao.imagepipelines.ContentImagePipeline': 302,  # content image download
        'qutoutiao.pipelines.QutoutiaoPipeline': 400,          # data processing
    }
    # Image storage path
    IMAGES_STORE = "/home/chaoge/mypython/crawler/qutoutiao/tmp/images"
    # Thumbnail settings
    #IMAGES_THUMBS = {
    #    'small': (50, 50),
    #    'big': (270, 270),
    #}
    # Ignore images smaller than 110x110 (note: the setting names are IMAGES_MIN_*, plural)
    IMAGES_MIN_HEIGHT = 110
    IMAGES_MIN_WIDTH = 110
    # Enable and configure the AutoThrottle extension (disabled by default)
    # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
    #AUTOTHROTTLE_ENABLED = True
    # The initial download delay
    #AUTOTHROTTLE_START_DELAY = 5
    # The maximum download delay to be set in case of high latencies
    #AUTOTHROTTLE_MAX_DELAY = 60
    # The average number of requests Scrapy should be sending in parallel to
    # each remote server
    #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
    # Enable showing throttling stats for every response received:
    #AUTOTHROTTLE_DEBUG = False
    
    # Enable and configure HTTP caching (disabled by default)
    # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
    #HTTPCACHE_ENABLED = True
    #HTTPCACHE_EXPIRATION_SECS = 0
    #HTTPCACHE_DIR = 'httpcache'
    #HTTPCACHE_IGNORE_HTTP_CODES = []
    #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'


    qutoutiao.items.py
    # -*- coding: utf-8 -*-
    
    # Define here the models for your scraped items
    #
    # See documentation in:
    # https://doc.scrapy.org/en/latest/topics/items.html
    
    import scrapy
    
    class QutoutiaoItem(scrapy.Item):
        # define the fields for your item here like:

        # article id
        aid = scrapy.Field()
        # source
        source_name = scrapy.Field()
        # title
        title = scrapy.Field()
        # detail page url
        url = scrapy.Field()
        # introduction
        introduction = scrapy.Field()
        # cover images
        cover = scrapy.Field()
        # publish time
        publish_time = scrapy.Field()
        # category id
        cid = scrapy.Field()
        # content
        content = scrapy.Field()
        # images inside the content
        content_images = scrapy.Field()
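    Items behave like dicts; a minimal usage sketch with made-up values:

    from qutoutiao.items import QutoutiaoItem

    item = QutoutiaoItem()
    item['title'] = 'example title'                    # hypothetical value
    item['cover'] = ['http://example.com/cover.jpg']   # hypothetical value
    print(dict(item))  # {'title': 'example title', 'cover': ['http://example.com/cover.jpg']}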


    qutoutiao.middlewares.py
    # -*- coding: utf-8 -*-

    import random
    import base64
    from qutoutiao.settings import USER_AGENTS
    from qutoutiao.settings import PROXIES

    # Random User-Agent
    class RandomUserAgent(object):
        def process_request(self, request, spider):
            useragent = random.choice(USER_AGENTS)
            request.headers.setdefault('User-Agent', useragent)
            #request.headers.setdefault('Host', 'html2.qktoutiao.com')
            #request.headers.setdefault('Referer', 'http://home.qutoutiao.net/pages/home.html')

    # Random proxy
    class RandomProxy(object):
        def process_request(self, request, spider):
            proxy = random.choice(PROXIES)
            request.meta['proxy'] = 'http://' + proxy['ip_port']
            # Proxy that requires username/password authentication
            if 'user_password' in proxy and proxy['user_password']:
                # b64encode returns bytes; decode to str so the header becomes
                # 'Basic dXNlcjpwYXNz' rather than "Basic b'dXNlcjpwYXNz'"
                base64_user_password = base64.b64encode(proxy['user_password'].encode('utf-8')).decode('ascii')
                request.headers['Proxy-Authorization'] = 'Basic ' + base64_user_password
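    As a sanity check on the Basic-auth encoding (the credentials are made up):

    import base64

    creds = 'username-xxxx:password-xxxx'  # hypothetical proxy credentials
    token = base64.b64encode(creds.encode('utf-8')).decode('ascii')
    print('Proxy-Authorization: Basic ' + token)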

    qutoutiao.imagepipelines.py
    # -*- coding: utf-8 -*-

    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

    import scrapy
    from scrapy.utils.project import get_project_settings
    from scrapy.pipelines.images import ImagesPipeline
    import os
    from qutoutiao.qttutils import QttUtils

    # Cover image download
    class CoverImagePipeline(ImagesPipeline):
        # Read the constant from settings
        IMAGES_STORE = get_project_settings().get('IMAGES_STORE')

        # Schedule the image downloads
        def get_media_requests(self, item, info):
            cover_images = item['cover']
            if cover_images:
                for image_url in cover_images:
                    yield scrapy.Request(url=image_url)

        # Called once all image requests for the item have completed
        def item_completed(self, results, item, info):
            image_paths = [x['path'] for ok, x in results if ok]
            # Get the custom storage path
            store_path = QttUtils.getStorePath()
            coverImages = []
            # Move the downloaded images to the custom path
            for image_path in image_paths:
                file_name = os.path.split(str(image_path))
                new_image = store_path + "/" + file_name[1]
                coverImages.append(new_image)
                os.rename(self.IMAGES_STORE + "/" + image_path, new_image)
            item['cover'] = coverImages
            return item

    # Content image download
    class ContentImagePipeline(ImagesPipeline):
        # Read the constant from settings
        IMAGES_STORE = get_project_settings().get('IMAGES_STORE')

        # Schedule the image downloads
        def get_media_requests(self, item, info):
            content_images = item['content_images']
            if content_images:
                for image_url in content_images:
                    yield scrapy.Request(image_url)

        # Called once all image requests for the item have completed
        def item_completed(self, results, item, info):
            image_info = [(x['path'], x['url']) for ok, x in results if ok]
            # Get the custom storage path
            store_path = QttUtils.getStorePath()
            contentImages = []
            # Move the images and keep their original source urls
            for image_path, image_source in image_info:
                file_name = os.path.split(str(image_path))
                new_image = store_path + "/" + file_name[1]
                contentImages.append((new_image, image_source))
                os.rename(self.IMAGES_STORE + "/" + image_path, new_image)
            item['content_images'] = contentImages
            return item
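    For reference, the results argument of item_completed is a list of (success, info) two-tuples produced by the images pipeline; roughly (values illustrative):

    results = [
        (True, {'url': 'http://example.com/1.jpg',   # original request url
                'path': 'full/0a79c461a4062ac383dc4fade7bc09f1384a3910.jpg',  # relative to IMAGES_STORE
                'checksum': 'b9628c4ab9b595f72f280b90c4fd093d'}),
        # failed downloads appear as (False, <Twisted Failure>)
    ]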


    qutoutiao.pipelines.py
    # -*- coding: utf-8 -*-

    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

    import json
    from qutoutiao.qttutils import QttUtils

    class QutoutiaoPipeline(object):
        def __init__(self):
            # Get the custom storage path
            store_path = QttUtils.getStorePath()
            json_path = store_path + "/" + "qutoutiao.json"
            self.filename = open(json_path, "wb")

        def process_item(self, item, spider):
            # One JSON object per line (JSON Lines)
            text = json.dumps(dict(item), ensure_ascii=False) + "\n"
            self.filename.write(text.encode("utf-8"))
            return item

        def close_spider(self, spider):
            self.filename.close()
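    Because each record is one JSON object per line (JSON Lines), the file can be read back line by line; a minimal sketch with a hypothetical date path:

    import json

    json_path = '/home/chaoge/mypython/crawler/qutoutiao/data/2018/04/18/qutoutiao.json'  # hypothetical
    with open(json_path, encoding='utf-8') as f:
        for line in f:
            record = json.loads(line)
            print(record['title'], record['publish_time'])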


    qutoutiao.spiders.qutoutiaos.py
    # -*- coding: utf-8 -*-
    # web site: http://home.qutoutiao.net/pages/home.html

    import scrapy
    # Alternative: crawl via the CrawlSpider and Rule classes
    #from scrapy.spiders import CrawlSpider, Rule
    #from scrapy.linkextractors import LinkExtractor
    from qutoutiao.items import QutoutiaoItem
    import json
    import re
    from qutoutiao import qttconfig as QttConfig

    #class QutoutiaosSpider(CrawlSpider):
    class QutoutiaosSpider(scrapy.Spider):
        name = 'qutoutiaos'
        allowed_domains = ['api.1sapp.com']

        # Start urls: one list-API url per category
        start_urls = []
        categoryInfo = QttConfig.CATEGORY_INFO
        limit = QttConfig.LIST_LIMIT
        for value in categoryInfo:
            url = QttConfig.LIST_API + "cid=%s&tn=1&page=1&limit=%s" % (str(value['cid']), str(limit))
            start_urls.append(url)

        # Link extraction rules for the CrawlSpider alternative
        #pageLink = LinkExtractor(allow=(r"start=\d+"))
        #rules = [
        #    # Follow links matched by pageLink and parse them with parseQtt
        #    Rule(pageLink, callback="parseQtt", follow=True),
        #]

        def parse(self, response):
            response_url = response.url
            # Extract the category id back out of the url
            searchObj = re.search(r'(.*)cid=(\d+)', response_url)
            cid = searchObj and searchObj.group(2) or 0

            data = json.loads(response.text)['data']['data']

            for value in data:
                # Initialize the item
                item = QutoutiaoItem()
                # source
                item['source_name'] = value['source_name']
                # title
                item['title'] = value['title']
                # detail page url
                url = item['url'] = value['url']
                #url = url[0:url.find('?')]
                # introduction
                item['introduction'] = value['introduction']
                # cover images
                item['cover'] = value['cover']
                # publish time
                item['publish_time'] = value['publish_time']
                # category id
                item['cid'] = cid

                # Crawl the detail page
                yield scrapy.Request(url=item['url'], meta={'meta_item': item}, callback=self.detail_parse)

        # Detail page
        def detail_parse(self, response):
            # Retrieve the item carried in the request meta
            meta_item = response.meta['meta_item']
            # Extract the content block
            content_selector = response.xpath('//div[@class="content"]')
            meta_item['content_images'] = content_selector.xpath('//img/@src|//img/@data-src').extract()
            meta_item['content'] = content_selector.extract()[0]
            yield meta_item
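    For reference, parse() expects the list API to return a JSON envelope shaped roughly like this (field values are illustrative, inferred from the keys read above):

    {
        "data": {
            "data": [
                {
                    "source_name": "...",
                    "title": "...",
                    "url": "http://html2.qktoutiao.com/detail/...",
                    "introduction": "...",
                    "cover": ["http://.../cover1.jpg"],
                    "publish_time": "2018-04-18 10:00:00"
                }
            ]
        }
    }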


  • Original post: https://www.cnblogs.com/fonyer/p/8871445.html