    Scrapy crawler framework: study notes

    1. Rough workflow of the framework

    1. The engine takes a URL from the scheduler for the next page to crawl
    2. The engine wraps the URL in a Request and hands it to the downloader
    3. The downloader fetches the resource and wraps it in a Response
    4. The spider parses the Response
    5. Parsed-out items (Item) are handed to the item pipeline for further processing
    6. Parsed-out links (URL) are handed back to the scheduler to wait for crawling (see the minimal sketch below)
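
    A minimal sketch of how those steps map onto code, using a hypothetical example site
    (the URL and XPaths below are placeholders, not from these notes): everything the spider
    yields as a dict/Item goes to the item pipeline, and every Request it yields goes back
    to the scheduler.

    import scrapy


    class FlowDemoSpider(scrapy.Spider):
        name = 'flow_demo'
        # the engine feeds the start URL into the scheduler
        start_urls = ['https://example.com/list/1']

        def parse(self, response):
            # anything yielded as a dict/Item is sent on to the item pipeline
            for row in response.xpath('//div[@class="row"]'):
                yield {'text': row.xpath('string(.)').extract_first()}
            # anything yielded as a Request goes back to the scheduler for the next round
            next_url = response.xpath('//a[@rel="next"]/@href').extract_first()
            if next_url:
                yield scrapy.Request(response.urljoin(next_url), callback=self.parse)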

    2. Installation and usage

    1. pip install scrapy -i mirror-URL (see the basic crawler notes for details)
    2. Usage:
    3. cd into the directory where the project should be created
    4. Create a project: scrapy startproject myfirst (your_project_name)
    5. Create a spider: scrapy genspider spider_name start_domain
    6. Run the spider: scrapy crawl spider_name (or start it from a script, as sketched below)
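
    Besides the scrapy crawl command, a spider can also be started from a plain Python script;
    a minimal sketch, assuming it is run from the project root so the project's settings.py can
    be located (the spider name 'ttxd' is the one from the example further down):

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    # load the project's settings.py and run the spider registered as 'ttxd'
    process = CrawlerProcess(get_project_settings())
    process.crawl('ttxd')
    process.start()  # blocks until the crawl is finished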

    3. Example (scraping novels)

    Scraping novels from 81中文网 (道神 and 替天行道)
    

    ttxd.py

    # -*- coding: utf-8 -*-
    import scrapy
    
    
    class TtxdSpider(scrapy.Spider):
        name = 'ttxd'
        allowed_domains = ['zwdu.com']
        # start from chapter 1 and crawl the novel 替天行道
        start_urls = ['https://www.zwdu.com/book/28364/9673844.html']
    
        # for testing: start from one of the last chapters
        # start_urls = ['https://www.zwdu.com/book/28364/19653880.html']
    
        def parse(self, response):
            title = response.xpath('//h1/text()').extract_first()
            content = ''.join(response.xpath('//div[@id="content"]/text()').extract()).replace('    ', '\n')
    
            yield {
                'title': title,
                'content': content
            }   
    
            next_url = response.xpath('//div[@class="bottem1"]/a[3]/@href').extract_first()
            # base_url = 'https://www.zwdu.com/{}'.format(next_url)
            if next_url.find('.html') != -1:
                yield scrapy.Request(response.urljoin(next_url))
    
    

    ds.py

    ds.py uses the CrawlSpider template.
    To create this kind of spider, run: scrapy genspider -t crawl spider_name allowed_domain
    
    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    
    
    class DsSpider(CrawlSpider):
        name = 'ds'
        allowed_domains = ['zwdu.com']
        start_urls = ['https://www.zwdu.com/book/8725/']
    
        rules = (
            Rule(LinkExtractor(restrict_xpaths=r'//div[@id="list"]//dd[2]'), callback='parse_item', follow=True),
            Rule(LinkExtractor(restrict_xpaths=r'//div[@class="bottem1"]/a[3]'), callback='parse_item', follow=True),
        )
    
        def parse_item(self, response):
            title = response.xpath('//h1/text()').extract_first()
            content = ''.join(response.xpath('//div[@id="content"]/text()').extract()).replace('    ', '\n')
    
            yield {
                'title': title,
                'content': content
            }
    
    
    

    main.py

    from scrapy.cmdline import execute
    
    execute(['scrapy','crawl','ttxd'])
    # execute(['scrapy','crawl','ds'])
    

    pipelines.py

    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    
    
    class XiaoshuoPipeline(object):
        def open_spider(self, spider):
            # self.file = open('ttxd.txt', 'w', encoding='utf-8')
            self.file = open('ds.txt', 'w', encoding='utf-8')
    
        def process_item(self, item, spider):
            title = item['title']
            content = item['content']
            # info = '\n-------'+title+'------\n'+content+'\n'
            info = title+'\n'
            self.file.write(info)
            self.file.flush()
            return item
    
        def close_spider(self, spider):
            self.file.close()
    
    

    settings.py

    # -*- coding: utf-8 -*-
    
    # Scrapy settings for xiaoshuo project
    #
    # For simplicity, this file contains only settings considered important or
    # commonly used. You can find more settings consulting the documentation:
    #
    #     https://docs.scrapy.org/en/latest/topics/settings.html
    #     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
    #     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
    
    BOT_NAME = 'xiaoshuo'
    
    SPIDER_MODULES = ['xiaoshuo.spiders']
    NEWSPIDER_MODULE = 'xiaoshuo.spiders'
    
    
    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36'
    
    # Obey robots.txt rules
    ROBOTSTXT_OBEY = False
    
    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    #CONCURRENT_REQUESTS = 32
    
    # Configure a delay for requests for the same website (default: 0)
    # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
    # See also autothrottle settings and docs
    DOWNLOAD_DELAY = 2
    # The download delay setting will honor only one of:
    #CONCURRENT_REQUESTS_PER_DOMAIN = 16
    #CONCURRENT_REQUESTS_PER_IP = 16
    
    # Disable cookies (enabled by default)
    #COOKIES_ENABLED = False
    
    # Disable Telnet Console (enabled by default)
    #TELNETCONSOLE_ENABLED = False
    
    # Override the default request headers:
    #DEFAULT_REQUEST_HEADERS = {
    #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    #   'Accept-Language': 'en',
    #}
    
    # Enable or disable spider middlewares
    # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
    #SPIDER_MIDDLEWARES = {
    #    'xiaoshuo.middlewares.XiaoshuoSpiderMiddleware': 543,
    #}
    
    # Enable or disable downloader middlewares
    # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
    #DOWNLOADER_MIDDLEWARES = {
    #    'xiaoshuo.middlewares.XiaoshuoDownloaderMiddleware': 543,
    #}
    
    # Enable or disable extensions
    # See https://docs.scrapy.org/en/latest/topics/extensions.html
    #EXTENSIONS = {
    #    'scrapy.extensions.telnet.TelnetConsole': None,
    #}
    
    # Configure item pipelines
    # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    ITEM_PIPELINES = {
       'xiaoshuo.pipelines.XiaoshuoPipeline': 300,
    }
    
    # Enable and configure the AutoThrottle extension (disabled by default)
    # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
    #AUTOTHROTTLE_ENABLED = True
    # The initial download delay
    #AUTOTHROTTLE_START_DELAY = 5
    # The maximum download delay to be set in case of high latencies
    #AUTOTHROTTLE_MAX_DELAY = 60
    # The average number of requests Scrapy should be sending in parallel to
    # each remote server
    #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
    # Enable showing throttling stats for every response received:
    #AUTOTHROTTLE_DEBUG = False
    
    # Enable and configure HTTP caching (disabled by default)
    # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
    #HTTPCACHE_ENABLED = True
    #HTTPCACHE_EXPIRATION_SECS = 0
    #HTTPCACHE_DIR = 'httpcache'
    #HTTPCACHE_IGNORE_HTTP_CODES = []
    #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
    
    

    4. Logging in with scrapy

    Plain form-data (POST) login
    
    # -*- coding: utf-8 -*-
    import scrapy
    
    
    class Login1Spider(scrapy.Spider):
        name = 'login1'
        allowed_domains = ['zengqiang.club']
        # start_urls = ['http://zengqiang.club/admin']
    
        def start_requests(self):
            url = 'http://www.zengqiang.club/admin/login'
            form_data = {
                'username': '曾强',
                'password': 'ZQZ981004'
            }
            # use scrapy.FormRequest() to send a POST request
            yield scrapy.FormRequest(url, formdata=form_data, callback=self.parse)
    
        def parse(self, response):
            # print(response.text)
            next_url = 'http://www.zengqiang.club/admin/blogs'
            yield scrapy.Request(next_url,callback=self.parse_info)
    
        def parse_info(self,response):
            titles = response.xpath('//tr//td[2]/text()').extract()
            for title in titles:
                print(title)
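
    When the login page itself carries hidden form fields (CSRF tokens and the like),
    scrapy.FormRequest.from_response can pre-fill them from the HTML form. A minimal sketch,
    assuming a hypothetical login page and field names (not one of the sites in these notes):

    import scrapy


    class FormLoginSketchSpider(scrapy.Spider):
        name = 'login_from_response'
        # hypothetical login page
        start_urls = ['http://www.example.com/login']

        def parse(self, response):
            # fields not listed in formdata (hidden inputs, tokens) are taken from the page's form
            yield scrapy.FormRequest.from_response(
                response,
                formdata={'username': 'user', 'password': 'pass'},
                callback=self.after_login,
            )

        def after_login(self, response):
            print(response.status)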
    
    Cookie login
    
    # -*- coding: utf-8 -*-
    import scrapy
    
    
    class Login2Spider(scrapy.Spider):
        name = 'login2'
        allowed_domains = ['zengqiang.club']
        # start_urls = ['http://zengqiang.club/']
    
        def start_requests(self):
            url = 'http://www.zengqiang.club/admin/blogs'
            cookie_str = 'JSESSIONID=CBB390075280E7FA8BB4B7A3A7890D94;'
            cookies = {
                'JSESSIONID': 'CBB390075280E7FA8BB4B7A3A7890D94'
            }
            # useful when the cookie string contains many key=value pairs
            # cookies = {}
            # for cookie in cookie_str.split(';'):
            #     # key,value = cookie.split('=',1)
            #     key, value = cookie.split('=',1)
            #     cookies[key.strip()] = value.strip()
    
            yield scrapy.Request(url,cookies=cookies,callback=self.parse)
    
        def parse(self, response):
            print(response.text)
    
    
    Login that requires a captcha (赶集网 / ganji.com example)
    
    # -*- coding: utf-8 -*-
    import scrapy,re
    
    
    class Login3Spider(scrapy.Spider):
        name = 'login3'
        allowed_domains = ['ganji.com']
        start_urls = ['https://passport.ganji.com/login.php']
    
        def parse(self, response):
            hash_code = re.findall(r'"__hash__":"(.+)"', response.text)[0]
            image_url = response.xpath('//img[@class="login-img-checkcode"]/@data-url').extract_first()
            print(hash_code, '\n', image_url)
    
            yield scrapy.Request(image_url,callback=self.parse_info,meta={'hash_code':hash_code})
    
        def parse_info(self,response):
            hash_code = response.request.meta['hash_code']
            print(hash_code)
            with open('yzm.jpg', 'wb') as f:
                f.write(response.body)

            code = input('Enter the captcha: ')
    
            form_data = {
                "username": "17784476955",
                "password": "ZQZ981004",
                "setcookie": "0",
                "checkCode": code,
                "next": "/",
                "source": "passport",
                "__hash__": hash_code
            }
    
            login_url = 'https://passport.ganji.com/login.php'
            yield scrapy.FormRequest(login_url,formdata=form_data,callback=self.login)
    
        def login(self,response):
            print(response.text)
            user_info_url = 'http://www.ganji.com/vip'
            yield scrapy.Request(user_info_url,callback=self.login_check)
    
        def login_check(self,response):
            print(response.text)
    
    

    5. Using proxies with scrapy

    Create a proxy middleware.
    Enable the custom middleware in settings.py so requests go through the proxy.
    
    # proxymiddleware.py
    
    class ProxyMiddleware(object):
    
        def process_request(self, request, spider):
            # request.meta['proxy']='http://ip:port'
            # request.meta['proxy']='http://user:password@ip:port'
    
            request.meta['proxy'] = 'http://222.95.240.159:3000'
    
    # enable it in settings.py
    # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
    DOWNLOADER_MIDDLEWARES = {
       # 'zol.middlewares.ZolDownloaderMiddleware': 543,
       'zol.proxymiddleware.ProxyMiddleware': 301
    }
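
    If several proxies are available, the same kind of middleware can rotate through them.
    A minimal sketch, assuming a hypothetical list of working proxies (the addresses below
    are placeholders):

    # proxymiddleware.py -- rotating variant (sketch)
    import random


    class RandomProxyMiddleware(object):
        # placeholder proxy addresses; replace with real ones
        PROXY_LIST = [
            'http://111.111.111.111:8080',
            'http://222.222.222.222:3128',
        ]

        def process_request(self, request, spider):
            # pick a random proxy for each outgoing request
            request.meta['proxy'] = random.choice(self.PROXY_LIST)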
    

    6. Example: scraping zol desktop wallpapers

    wallpaper.py

    # -*- coding: utf-8 -*-
    import scrapy
    
    
    class WallpaperSpider(scrapy.Spider):
        name = 'wallpaper'
        allowed_domains = ['zol.com.cn']
        start_urls = ['http://desk.zol.com.cn/bizhi/8672_106957_2.html']
    
        def parse(self, response):
            image_url = response.xpath('//img[@id="bigImg"]/@src').extract()
            image_name = response.xpath('string(//h3)').extract_first()
    
            yield {
                'image_urls': image_url,
                'image_name': image_name
            }
    
            next_url = response.xpath('//a[@id="pageNext"]/@href').extract_first()
            if next_url.find('.html') != -1:
                yield scrapy.Request(response.urljoin(next_url), callback=self.parse)
    

    pipelines.py

    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    from scrapy.pipelines.images import ImagesPipeline
    import scrapy
    
    class ZolPipeline(object):
        def process_item(self, item, spider):
            return item
    
    
    class ImagePipeline(ImagesPipeline):
        def get_media_requests(self, item, info):
            for image_url in item['image_urls']:
                yield scrapy.Request(image_url, meta={'image_name':item['image_name']})
    
        def file_path(self, request, response=None, info=None):
            filename = request.meta['image_name'].strip().replace('\n\t\t', '').replace('/', '_') + '.jpg'
            return filename
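
    Optionally, ImagesPipeline's item_completed hook can be overridden to drop items whose
    image failed to download. A sketch of such a method (not part of the original pipeline),
    meant to sit inside the ImagePipeline class above; it needs DropItem imported from
    scrapy.exceptions:

        # requires: from scrapy.exceptions import DropItem
        def item_completed(self, results, item, info):
            # results is a list of (success, image_info_or_failure) tuples
            image_paths = [x['path'] for ok, x in results if ok]
            if not image_paths:
                raise DropItem('image download failed: %s' % item.get('image_name'))
            return item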
    

    settings.py

    from fake_useragent import UserAgent
    
    
    BOT_NAME = 'zol'
    
    SPIDER_MODULES = ['zol.spiders']
    NEWSPIDER_MODULE = 'zol.spiders'
    
    
    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    USER_AGENT = UserAgent().random
    
    # Obey robots.txt rules
    ROBOTSTXT_OBEY = False
    
    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    #CONCURRENT_REQUESTS = 32
    
    # Configure a delay for requests for the same website (default: 0)
    # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
    # See also autothrottle settings and docs
    DOWNLOAD_DELAY = 0
    
    
    ITEM_PIPELINES = {
       # 'zol.pipelines.ZolPipeline': 300,
       # 'scrapy.contrib.pipeline.images.ImagesPipeline': 300,  # this old import path does not work
       # 'scrapy.pipelines.images.ImagesPipeline': 300,
       'zol.pipelines.ImagePipeline': 300,
    }
    IMAGES_STORE = 'F:/PythonProjects/study/爬虫学习/scrapy框架/zol/zol/img'
    

    7. Using selenium with scrapy

    selenium is used to scrape data from sites that load content through Ajax requests.
    This example fetches the page source of 瓜子二手车 (guazi.com), an Ajax-rendered page.
    The browser is closed only once the crawl has finished; while the program is still running it stays open.
    

    guazi.py

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy import signals
    from selenium import webdriver
    
    
    class GuaziSpider(scrapy.Spider):
        name = 'guazi'
        allowed_domains = ['guazi.com']
        start_urls = ['https://www.guazi.com/cd/buy/']
    
        @classmethod
        def from_crawler(cls, crawler, *args, **kwargs):
            spider = super(GuaziSpider, cls).from_crawler(crawler, *args, **kwargs)
            spider.driver = webdriver.Chrome()
            crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
            return spider
    
        def spider_closed(self, spider):
            spider.driver.quit()
            # print('spider finished!')

        # with the rendered page source in the response, parse it here
        def parse(self, response):
            print(response.text)
    
    

    middlewares.py

    from selenium import webdriver
    from scrapy.http import HtmlResponse
    
    
    class SeleniumMiddleware(object):
    
        # if the driver were created here in __init__, the browser would not be closed when the program ends
        # def __init__(self):
        #     self.driver = webdriver.Chrome()
    
        def process_request(self, request, spider):
            url = request.url
            # driver = webdriver.Chrome()
            # self.driver.get(url)
            # html = self.driver.page_source
    
            # use the driver attached to the spider
            spider.driver.get(url)
            html = spider.driver.page_source
            # print(html)
            return HtmlResponse(url=url, body=html, request=request, encoding='utf-8')
    
    

    settings.py (omitted)
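
    The omitted settings.py mainly needs to enable the middleware above. A minimal sketch,
    assuming the project module is named guazi and the class lives in guazi/middlewares.py
    (adjust the dotted path to the actual project layout):

    # settings.py (sketch)
    DOWNLOADER_MIDDLEWARES = {
        # send every request through the selenium middleware defined above
        'guazi.middlewares.SeleniumMiddleware': 543,
    }
    ROBOTSTXT_OBEY = False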

    8. scrapy example

    Scrape second-hand housing listings from 链家 (lianjia.com): location, price, size, and so on.
    Save the data to a MongoDB database and a MySQL database.
    

    lianjia.py

    # -*- coding: utf-8 -*-
    import scrapy
    
    
    class LianjiSpider(scrapy.Spider):
        name = 'lianjia'
        allowed_domains = ['lianjia.com']
        # for testing, crawl only the first two pages (60 listings)
        start_urls = ['https://cd.lianjia.com/ershoufang/cd/pg{}/'.format(num) for num in range(1, 3)]
        # test with a single page
        # start_urls = ['https://cd.lianjia.com/ershoufang/cd/pg1/']
    
        def parse(self, response):
            # print(response.url)
            urls = response.xpath('//div[@class="info clear"]//div[@class="title"]/a/@href').extract()
            for url in urls:
                yield scrapy.Request(url, callback=self.parse_info)
    
        def parse_info(self, response):
            # print(response.url)
            # listing title
            title = response.xpath('//h1/text()').extract_first()
            # total price
            total_price = response.xpath(
                'concat(//div[@class="price "]//span[@class="total"]/text(),//div[@class="price "]//span[@class="unit"]/span/text())').extract_first()
            # price per square meter
            unitPriceValue = response.xpath('string(//span[@class="unitPriceValue"])').extract_first()
            # location
            areaName = response.xpath(
                'concat(//div[@class="areaName"]//span[2],//div[@class="areaName"]/a)').extract_first()
            # residential community
            village = response.xpath('//div[@class="communityName"]/a[1]/text()').extract_first()

            # floor plan
            hu_xing = response.xpath('//div[@class="base"]//ul/li[1]/text()').extract_first()
            # floor
            lou_ceng = response.xpath('//div[@class="base"]//ul/li[2]/text()').extract_first()
            # area
            area = response.xpath('//div[@class="base"]//ul/li[3]/text()').extract_first()
            # property ownership term
            chan_quan = response.xpath('//div[@class="transaction"]//ul/li[last()-2]/span[2]/text()').extract_first()
    
            yield {
                'title': title,
                'total_price': total_price,
                'unitPriceValue': unitPriceValue,
                'areaName': areaName,
                'village': village,
                'hu_xing': hu_xing,
                'lou_ceng': lou_ceng,
                'area': area,
                'chan_quan': chan_quan
            }
    
    

    pipelines.py

    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    import pymongo
    import pymysql
    
    
    # class LianjiaHourseMongoPipeline(object):
    #
    #     def open_spider(self, spider):
    #         self.client = pymongo.MongoClient()
    #
    #     def process_item(self, item, spider):
    #         self.client.lianjia.ershoufang.insert(item)
    #         return item
    #
    #     def close_spider(self, spider):
    #         self.client.close()
    
    
    class LianjiaHourseMysqlPipeline(object):
    
        def open_spider(self, spider):
            self.client = pymysql.connect(host='localhost', port=3306, user='root', password='ZQZ981004', db='python',
                                          charset='utf8')
            self.cursor = self.client.cursor()
    
        def process_item(self, item, spider):
            args = [
                item['title'],
                item['total_price'],
                item['unitPriceValue'],
                item['areaName'],
                item['village'],
                item['hu_xing'],
                item['lou_ceng'],
                item['area'],
                item['chan_quan']
            ]
            sql = 'insert into lianjia_ershoufang values (0,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
            self.cursor.execute(sql, args)
            self.client.commit()
            return item
    
        def close_spider(self, spider):
            self.cursor.close()
            self.client.close()
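
    The MySQL pipeline above assumes a table named lianjia_ershoufang with an auto-increment
    id followed by nine text columns already exists. A one-off creation sketch (column names
    and types are assumptions chosen to match the insert statement, not taken from the notes):

    # create_table.py (sketch) -- run once before crawling
    import pymysql

    client = pymysql.connect(host='localhost', port=3306, user='root',
                             password='ZQZ981004', db='python', charset='utf8')
    cursor = client.cursor()
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS lianjia_ershoufang (
            id INT AUTO_INCREMENT PRIMARY KEY,
            title VARCHAR(255), total_price VARCHAR(64), unitPriceValue VARCHAR(64),
            areaName VARCHAR(255), village VARCHAR(255), hu_xing VARCHAR(64),
            lou_ceng VARCHAR(64), area VARCHAR(64), chan_quan VARCHAR(64)
        ) DEFAULT CHARSET=utf8
    """)
    client.commit()
    cursor.close()
    client.close()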
    
    

    settings.py (omitted)

    Materials

    Link: https://pan.baidu.com/s/10e8PphvR7Um0-WPAylw8Yw

    Extraction code: h8i8
