• Scrapy in practice: crawling user details from a community site


    get_cookies.py

    from selenium import webdriver
    from pymongo import MongoClient
    # from segmentfault import settings
    import time
    import settings
    
    class GetCookies(object):
        def __init__(self):
            # Initialize components
            # Configure webdriver options
            self.opt = webdriver.ChromeOptions()
            # self.opt.add_argument("--headless")
            # Load the account list from settings
            self.user_list = settings.USER_LIST
            # Set up the MongoDB connection
            self.client = MongoClient(settings.MONGO_URI)
            self.db = self.client[settings.MONGO_DB]
            self.collection = self.db["cookies"]
    
        def get_cookies(self,username,password):
            """
    
            :param username:
            :param password:
            :return: cookies
            """
            # 使用webdriver选项创建driver
            driver = webdriver.Chrome(executable_path="/Users/Hank/scrapy/segmentfault/segmentfault/chromedriver",options=self.opt)
            driver.get("https://segmentfault.com/user/login")
            driver.find_element_by_name("username").send_keys(username)
            driver.find_element_by_name("password").send_keys(password)
            driver.find_element_by_xpath("//button[@type='submit']").click()
            time.sleep(2)
            driver.get("https://segmentfault.com/u/luwangmeilun/users/following")
            # After logging in, grab the page cookies
            cookies = driver.get_cookies()
            driver.quit()
    
            return cookies
    
        def format_cookies(self,cookies):
            """
    
            :param cookies:
            The list returned by driver.get_cookies() has the form:
            [{'domain': 'segmentfault.com', 'httpOnly': False, 'name': 'PHPSESSID',
            'path': '/', 'secure': False, 'value': 'web2~5grmfa89j12eksub8hja3bvaq4'},
            {'domain': '.segmentfault.com', 'expiry': 1581602940, 'httpOnly': False,
            'name': 'Hm_lvt_e23800c454aa573c0ccb16b52665ac26', 'path': '/', 'secure': False,
            'value': '1550066940'},
            {'domain': '.segmentfault.com', 'httpOnly': False,
            'name': 'Hm_lpvt_e23800c454aa573c0ccb16b52665ac26',
            'path': '/', 'secure': False, 'value': '1550066940'},
            {'domain': '.segmentfault.com', 'expiry': 1550067000, 'httpOnly': False,
            'name': '_gat', 'path': '/', 'secure': False, 'value': '1'},
            {'domain': '.segmentfault.com', 'expiry': 1550153340, 'httpOnly': False,
            'name': '_gid', 'path': '/', 'secure': False, 'value': 'GA1.2.783265084.1550066940'},
            {'domain': '.segmentfault.com', 'expiry': 1613138940, 'httpOnly': False, 'name': '_ga',
            'path': '/', 'secure': False, 'value': 'GA1.2.1119166665.1550066940'}]
            Only the name and value of each entry need to be kept.
    
            :return: dict mapping cookie name to cookie value
            """
            c = dict()
            for item in cookies:
                c[item['name']] = item['value']
    
            return c
    
        def save(self):
            print("开始获取Cookies....")
            # 从用户列表中获取用户名与密码,分别登陆获取cookies
            for username,password in self.user_list:
                cookies = self.get_cookies(username,password)
                f_cookies = self.format_cookies(cookies)
                print("insert cookie:{}".format(f_cookies))
                # Insert the formatted cookies into MongoDB
                self.collection.insert_one(f_cookies)
    
            # s = db[self.collection].find()
            # for i in s:
            #     print(i)
    
    
    if __name__ == '__main__':
    
        cookies = GetCookies()
        for i in range(20):
            cookies.save()
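
    To make the cookie handling above concrete, here is a minimal standalone sketch (no Selenium or MongoDB needed) of what format_cookies does: it reduces the list returned by driver.get_cookies() to a plain name/value dict, which is the shape stored in the "cookies" collection. The file name and the sample entries (taken from the docstring above) are illustrative only.

    # format_cookies_demo.py -- hypothetical standalone sketch, not part of the project
    def format_cookies(cookies):
        # Keep only the name/value pair of each Selenium cookie dict
        return {item['name']: item['value'] for item in cookies}

    if __name__ == '__main__':
        raw = [
            {'domain': 'segmentfault.com', 'httpOnly': False, 'name': 'PHPSESSID',
             'path': '/', 'secure': False, 'value': 'web2~5grmfa89j12eksub8hja3bvaq4'},
            {'domain': '.segmentfault.com', 'expiry': 1550153340, 'httpOnly': False,
             'name': '_gid', 'path': '/', 'secure': False, 'value': 'GA1.2.783265084.1550066940'},
        ]
        print(format_cookies(raw))
        # -> {'PHPSESSID': 'web2~5grmfa89j12eksub8hja3bvaq4', '_gid': 'GA1.2.783265084.1550066940'}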
    

    items.py

    # -*- coding: utf-8 -*-
    
    # Define here the models for your scraped items
    #
    # See documentation in:
    # https://doc.scrapy.org/en/latest/topics/items.html
    
    import scrapy
    
    
    class SegmentfaultItem(scrapy.Item):
        # define the fields for your item here like:
        # Personal attributes
        # Name
        name = scrapy.Field()
        # Reputation
        rank = scrapy.Field()
        # School
        school = scrapy.Field()
        # Major
        majors = scrapy.Field()
        # Company
        company = scrapy.Field()
        # Job title
        job = scrapy.Field()
        # Blog
        blog = scrapy.Field()
        # Social activity data
        # Number of users followed
        following = scrapy.Field()
        # Number of followers
        fans = scrapy.Field()
        # Number of answers
        answers = scrapy.Field()
        # Number of questions
        questions = scrapy.Field()
        # Number of articles
        articles = scrapy.Field()
        # Number of lives (talks)
        lives = scrapy.Field()
        # Number of badges
        badges = scrapy.Field()
        # Skill attributes
        # Number of likes received
        like = scrapy.Field()
        # Skill tags
        skills = scrapy.Field()
        # Registration date
        register_date = scrapy.Field()
        # Q&A statistics
        # Highest score among the user's answers
        answers_top_score = scrapy.Field()
        # Title of the question answered by the top-voted answer
        answers_top_title = scrapy.Field()
        # Tags of that question
        answers_top_tags = scrapy.Field()
        # Content of that question
        answers_top_question = scrapy.Field()
        # Content of the top-voted answer itself
        answers_top_content = scrapy.Field()
    

    pipelines.py

    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    import pymongo
    
    class SegmentfaultPipeline(object):
        # Name of the MongoDB collection to write to
        collection_name = 'userinfo'
    
        def __init__(self,mongo_uri,mongo_db):
            self.mongo_uri = mongo_uri
            self.mongo_db = mongo_db
    
        # Read the MongoDB connection settings defined in settings.py via the crawler
        @classmethod
        def from_crawler(cls,crawler):
            return cls(
                mongo_uri = crawler.settings.get('MONGO_URI'),
                mongo_db = crawler.settings.get('MONGO_DB','segmentfault')
            )
    
        # Connect to MongoDB when the spider starts
        def open_spider(self,spider):
            self.client = pymongo.MongoClient(self.mongo_uri)
            self.db = self.client[self.mongo_db]
    
        # Disconnect from MongoDB when the spider closes
        def close_spider(self,spider):
            self.client.close()
    
        # Insert each item into the database
        def process_item(self, item, spider):
            self.db[self.collection_name].insert_one(dict(item))
            return item
    

    settings.py

    # -*- coding: utf-8 -*-
    
    # Scrapy settings for segmentfault project
    #
    # For simplicity, this file contains only settings considered important or
    # commonly used. You can find more settings consulting the documentation:
    #
    #     https://doc.scrapy.org/en/latest/topics/settings.html
    #     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
    #     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
    
    BOT_NAME = 'segmentfault'
    
    SPIDER_MODULES = ['segmentfault.spiders']
    NEWSPIDER_MODULE = 'segmentfault.spiders'
    
    
    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    
    # Obey robots.txt rules
    ROBOTSTXT_OBEY = False
    
    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    CONCURRENT_REQUESTS = 100
    
    # Configure a delay for requests for the same website (default: 0)
    # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
    # See also autothrottle settings and docs
    # DOWNLOAD_DELAY = 2
    # The download delay setting will honor only one of:
    # CONCURRENT_REQUESTS_PER_DOMAIN = 32
    # CONCURRENT_REQUESTS_PER_IP = 32
    
    # Disable cookies (enabled by default)
    # COOKIES_ENABLED = False
    
    # Disable Telnet Console (enabled by default)
    #TELNETCONSOLE_ENABLED = False
    
    RETRY_ENABLED = False
    
    REDIRECT_ENABLED = False
    
    DOWNLOAD_TIMEOUT = 5
    
    # HTTPALLOW
    
    # Override the default request headers:
    #DEFAULT_REQUEST_HEADERS = {
    #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    #   'Accept-Language': 'en',
    #}
    
    
    # Enable or disable spider middlewares
    # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
    SPIDER_MIDDLEWARES = {
       'segmentfault.middlewares.SegmentfaultSpiderMiddleware': 543,
    }
    
    # Enable or disable downloader middlewares
    # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
    DOWNLOADER_MIDDLEWARES = {
       # 'segmentfault.middlewares.SegmentfaultHttpProxyMiddleware': 543,
       'segmentfault.middlewares.SegmentfaultUserAgentMiddleware':643,
       'segmentfault.middlewares.SegmentfaultCookiesMiddleware':743,
       'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None,
       'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
       # 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware':None,
    
    }
    
    # Enable or disable extensions
    # See https://doc.scrapy.org/en/latest/topics/extensions.html
    #EXTENSIONS = {
    #    'scrapy.extensions.telnet.TelnetConsole': None,
    #}
    
    # Configure item pipelines
    # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    ITEM_PIPELINES = {
       'segmentfault.pipelines.SegmentfaultPipeline': 300,
    }
    
    # Enable and configure the AutoThrottle extension (disabled by default)
    # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
    # AUTOTHROTTLE_ENABLED = True
    # # The initial download delay
    # AUTOTHROTTLE_START_DELAY = 5
    # # The maximum download delay to be set in case of high latencies
    # AUTOTHROTTLE_MAX_DELAY = 60
    # # The average number of requests Scrapy should be sending in parallel to
    # # each remote server
    # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
    # # Enable showing throttling stats for every response received:
    # AUTOTHROTTLE_DEBUG = False
    
    # Enable and configure HTTP caching (disabled by default)
    # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
    #HTTPCACHE_ENABLED = True
    #HTTPCACHE_EXPIRATION_SECS = 0
    #HTTPCACHE_DIR = 'httpcache'
    #HTTPCACHE_IGNORE_HTTP_CODES = []
    #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
    
    # MongoDB configuration
    MONGO_URI = 'localhost:27017'
    MONGO_DB = 'segmentfault'
    
    # Account list used for logging in to fetch cookies
    USER_LIST = [
       ("798549150@qq.com","guoqing1010"),
       ("learnscrapy@163.com","guoqing1010"),
    ]
    
    # Proxy list
    PROXY_LIST = [
       'http://115.182.212.169:8080',
       'http://121.61.25.149:9999',
       'http://180.118.247.189:9000',
       'http://115.151.3.12:9999',
       'http://183.154.213.160:9000',
       'http://113.128.9.106:9999',
       'http://124.42.68.152:90',
       'http://49.70.48.50:9999',
       'http://113.128.11.172:9999',
       'http://111.177.177.40:9999',
       'http://59.62.83.253:9999',
       'http://39.107.84.185:8123',
       'http://124.94.195.107:9999',
       'http://111.177.160.132:9999',
       'http://120.25.203.182:7777'
    ]
    
    USER_AGENT_LIST = [
       'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
       'Opera/8.0 (Windows NT 5.1; U; en)',
       'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
       'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50',
       'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
       'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
       'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
       'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
       'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
       'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
       'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)',
       'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
       'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
       'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)',
       'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',
       'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36'
    ]
    

    userinfo.py

    # -*- coding: utf-8 -*-
    import scrapy
    import time
    from scrapy import Request
    from pymongo import MongoClient
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider,Rule
    from scrapy.http import FormRequest
    from segmentfault.items import SegmentfaultItem
    
    
    class UserinfoSpider(CrawlSpider):
        name = 'userinfo'
        allowed_domains = ['segmentfault.com']
        start_urls = ['https://segmentfault.com/u/mybigbigcat/users/following']
    
        rules = (
            # User profile pages: follow them and parse with parse_item (see the matching sketch after this file listing)
            Rule(LinkExtractor(allow=r'/u/\w+$'),callback='parse_item',follow=True),
            # Pages listing the users someone follows: follow them to collect more profile URLs
            # Rule(LinkExtractor(allow=r'/users/followed$'),follow=True),
            # Pages listing a user's followers: follow them to collect more profile URLs
            Rule(LinkExtractor(allow=r'/users/following$'),follow=True),
            # Follow the remaining paginated list pages
            # Rule(LinkExtractor(allow=r'/users/[followed|following]?page=\d+'),follow=True),
        )
    
        def start_requests(self):
            # Fetch one cookie document from MongoDB and attach it to the initial request
            client = MongoClient(self.crawler.settings['MONGO_URI'])
            db = client[self.crawler.settings['MONGO_DB']]
            cookies_collection = db.cookies
            # Take a single cookie document
            cookies = cookies_collection.find_one()
            # Drop MongoDB's internal _id field so only real cookie keys remain
            cookies.pop('_id', None)
            # The 'Hm_lpvt_e23800c454aa573c0ccb16b52665ac26' cookie is a 10-digit Unix timestamp, so refresh it with the current time
            cookies['Hm_lpvt_e23800c454aa573c0ccb16b52665ac26'] = str(int(time.time()))
    
            return [Request("https://segmentfault.com",
                            cookies=cookies,
                            meta={'cookiejar':1},
                            callback=self.after_login)]
    
        # After logging in, start crawling from start_urls
        def after_login(self,response):
            for url in self.start_urls:
                return self.make_requests_from_url(url)
        # def after_login(self,response):
        #     yield Request(self.start_urls[0],
        #                    meta={'cookiejar':response.meta['cookiejar']},
        #                    callback=self.parse_item)
    
        def parse_item(self, response):
            """
            :param response:
            :return:
            """
            item = SegmentfaultItem()
            # 个人属性模块
            profile_head = response.css('.profile__heading')
            # 姓名
            item['name'] = profile_head.css('h2[class*=name]::text').re_first(r'w+')
            # 声望
            item['rank'] = profile_head.css('.profile__rank-btn > span::text').extract_first()
            # 学校专业信息
            school_info = profile_head.css('.profile__school::text').extract()
            if school_info:
                # 学校
                item['school'] = school_info[0]
                # 专业
                item['majors'] = school_info[1].strip()
            else:
                item['school'] = ''
                item['majors'] = ''
            # 公司职位信息
            company_info = profile_head.css('.profile__company::text').extract()
            if company_info:
                # 公司
                item['company'] = company_info[0]
                # 职位
                item['job'] = company_info[1].strip()
            else:
                item['company'] = ''
                item['job'] = ''
            # 个人博客
            item['blog'] = profile_head.css('a[class*=other-item-link]::attr(href)').extract_first()
    
            # Statistics panel section
            profile_active = response.xpath("//div[@class='col-md-2']")
            # Number of users followed
            item['following'] = profile_active.css('div[class*=info] a > .h5::text').re(r'\d+')[0]
            # Number of followers
            item['fans'] = profile_active.css('div[class*=info] a > .h5::text').re(r'\d+')[1]
            # Number of answers
            item['answers'] = profile_active.css('a[href*=answer] .count::text').re_first(r'\d+')
            # Number of questions
            item['questions'] = profile_active.css('a[href*=questions] .count::text').re_first(r'\d+')
            # Number of articles
            item['articles'] = profile_active.css('a[href*=articles] .count::text').re_first(r'\d+')
            # Number of lives (talks)
            item['lives'] = profile_active.css('a[href*=lives] .count::text').re_first(r'\d+')
            # Number of badges
            item['badges'] = profile_active.css('a[href*=badges] .count::text').re_first(r'\d+')
            # URL of the badge detail page
            badge_url = profile_active.css('a[href*=badges]::attr(href)').extract_first()
    
            # Skills panel section
            profile_skill = response.xpath("//div[@class='col-md-3']")
            # List of skill tags
            item['skills'] = profile_skill.css('.tag::text').re(r'\w+')
            # Number of likes received
            item['like'] = profile_skill.css('.authlist').re_first(r'获得 (\d+) 次点赞')
            # Registration date
            item['register_date'] = profile_skill.css('.profile__skill--other p::text').extract_first()
            # if register_time:
            #     item['register_date'] = ''.join(re.findall(r'\d+',register_time))
            # else:
            #     item['register_date'] = ''
    
            # Output statistics section
            profile_work = response.xpath("//div[@class='col-md-7']")
            # Highest score among the user's answers
            item['answers_top_score'] = profile_work.css('#navAnswer .label::text').re_first(r'\d+')
            # Title of the question answered by the top-voted answer
            item['answers_top_title'] = profile_work.css('#navAnswer div[class*=title-warp] > a::text').extract_first()
            # URL of that question
            answer_url = profile_work.css('#navAnswer div[class*=title-warp] > a::attr(href)').extract_first()

            # Pass the URLs that still need to be crawled, together with the item, to the follow-up callbacks
            request = scrapy.Request(
                # URL of the question detail page
                url=response.urljoin(answer_url),
                meta={
                # The item is carried along in meta
                'item':item,
                # URL of the badge detail page
                'badge_url':response.urljoin(badge_url)},
                # Continue processing in parse_answer
                callback=self.parse_answer)
            yield request
    
        def parse_answer(self,response):
            # Retrieve the item passed through meta
            item = response.meta['item']
            # Retrieve the badge detail page URL passed through meta
            badge_url = response.meta['badge_url']
            # List of the question's tags
            item['answers_top_tags'] = response.css('.question__title--tag .tag::text').re(r'\w+')
            # First collect the strings that make up the question body
            question_content = response.css('.widget-question__item p').re(r'>(.*?)<')
            # Join them and store in the item
            item['answers_top_question'] = ''.join(question_content)
            # First collect the strings that make up the answer body
            answer_content = response.css('.qa-answer > article .answer').re(r'>(.*?)<')
            # Join them and store in the item
            item['answers_top_content'] = ''.join(answer_content)
    
            # After the question page is scraped, go on to the badge page, passing the updated item along
            request = scrapy.Request(url=badge_url,
                                     meta={'item':item},
                                     callback=self.parse_badge)
            yield request
    
        def parse_badge(self,response):
            item = response.meta['item']
            badge_name = response.css('span.badge span::text').extract()
            badge_count = response.css('span[class*=badges-count]::text').re(r'\d+')
            name_count = {}
            for i in range(len(badge_count)):
                name_count[badge_name[i]] = badge_count[i]
            item['badges'] = name_count
            yield item
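
    As a quick sanity check on the crawl rules in userinfo.py, the following minimal sketch shows which URL paths the two active LinkExtractor patterns are intended to match. The sample paths are illustrative; LinkExtractor applies the regex to the full extracted URL, but the idea is the same.

    # rule_regex_check.py -- hypothetical sketch of what the two active rules should match
    import re

    profile_pattern = re.compile(r'/u/\w+$')              # user profile pages -> parse_item
    following_pattern = re.compile(r'/users/following$')  # "following" list pages -> follow only

    samples = [
        '/u/mybigbigcat',                  # profile page
        '/u/mybigbigcat/users/following',  # following list page
        '/u/mybigbigcat/articles',         # matched by neither rule
    ]

    for path in samples:
        if profile_pattern.search(path):
            print(path, '-> parse_item')
        elif following_pattern.search(path):
            print(path, '-> follow for more profile links')
        else:
            print(path, '-> ignored by the rules')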
    

    middlewares.py

    # -*- coding: utf-8 -*-
    
    # Define here the models for your spider middleware
    #
    # See documentation in:
    # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
    import random
    import re
    import datetime
    import scrapy
    import logging
    import time
    from scrapy.conf import settings
    from pymongo import MongoClient
    from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware
    import pymongo
    logger = logging.getLogger(__name__)
    
    
    class SegmentfaultSpiderMiddleware(object):
        """
        处理Item中保存的三种类型注册日期数据:
        1. 注册于 2015年12月12日
        2. 注册于 3 天前
        3. 注册于 5 小时前
        """
    
        def process_spider_output(self,response,result,spider):
    
            """
            输出response时调用此方法处理item中register_date
            :param response:
            :param result: 包含item
            :param spider:
            :return:处理过注册日期的item
            """
            for item in result:
                # 判断获取的数据是否是scrapy.item类型
                if isinstance(item,scrapy.Item):
                    # 获取当前时间
                    now = datetime.datetime.now()
                    register_date = item['register_date']
                    logger.info("获取注册日志格式为{}".format(register_date))
                    # 提取注册日期字符串,如'注册于2015年12月12日' => '20151212'
                    day = ''.join(re.findall(r'd+',register_date))
                    # 如果提取数字字符串长度大于4位,则为'注册于2015年12月12日'形式
                    if len(day) > 4:
                        date = day
                    # 如果‘时’在提取的字符串中,则为'注册于8小时前'形式
                    elif '时' in register_date:
                        d = now - datetime.timedelta(hours=int(day))
                        date = d.strftime("%Y%m%d")
                    # 最后一种情况就是'注册于3天前'形式
                    else:
                        d = now - datetime.timedelta(days=int(day))
                        date = d.strftime("%Y%m%d")
    
                    # 更新register_date值
                    item['register_date'] = date
                yield item
    
    
    class SegmentfaultHttpProxyMiddleware(object):
        # Not all methods need to be defined. If a method is not defined,
        # scrapy acts as if the downloader middleware does not modify the
        # passed objects.
        def __init__(self):
            self.proxy_list = settings['PROXY_LIST']
    
        def process_request(self, request, spider):
            proxy = random.choice(self.proxy_list)
            logger.info('Using proxy: {}'.format(proxy))
            request.meta['proxy'] = proxy
    
    
    class SegmentfaultUserAgentMiddleware(object):
        def __init__(self):
            self.useragent_list = settings['USER_AGENT_LIST']
    
        def process_request(self,request,spider):
            user_agent = random.choice(self.useragent_list)
    
            # logger.info('Using USER-AGENT: {}'.format(user_agent))
            request.headers['User-Agent'] = user_agent
    
    
    
    class SegmentfaultCookiesMiddleware(object):
        client = MongoClient(settings['MONGO_URI'])
        db = client[settings['MONGO_DB']]
        collection = db['cookies']
    
        def get_cookies(self):
            """
            随机获取cookies
            :return:
            """
            cookies = random.choice([cookie for cookie in self.collection.find()])
            # 将不需要的"_id"与"_gat"参数删除
            cookies.pop('_id')
            cookies.pop('_gat')
            # 将"Hm_lpvt_e23800c454aa573c0ccb16b52665ac26"填充当前时间
            cookies['Hm_lpvt_e23800c454aa573c0ccb16b52665ac26'] = str(int(time.time()))
            return cookies
    
        def remove_cookies(self,cookies):
            """
            Delete cookies that are no longer valid
            :param cookies:
            :return:
            """
            # Pop a random key/value pair from the cookies; the result is a tuple
            i = cookies.popitem()
            # Delete the matching cookie document
            try:
                logger.info("Removing cookies {}".format(cookies))
                self.collection.remove({i[0]:i[1]})
            except Exception as e:
                logger.info("No such cookies:{}".format(cookies))
    
        def process_request(self,request,spider):
            """
            为每一个request添加一个cookie
            :param request:
            :param spider:
            :return:
            """
            cookies = self.get_cookies()
            request.cookies = cookies
    
        def process_response(self,request,response,spider):
            """
            对于登录失效的情况,可能会重定向到登录页面,这时添加新的cookies继续,将请求放回调度器
            :param request:
            :param response:
            :param spider:
            :return:
            """
            if response.status in [301,302]:
                logger.info("Redirect response:{}".format(response))
                redirect_url = response.headers['location']
                if b'/user/login' in redirect_url:
                    logger.info("Cookies失效")
    
                    # 请求失败,重新获取一个cookie,添加到request,并停止后续中间件处理此request,将此request放入调度器
                    new_cookie = self.get_cookies()
                    logger.info("获取新cookie:{}".format(new_cookie))
                    # 删除旧cookies
                    self.remove_cookies(request.cookies)
                    request.cookies = new_cookie
                return request
            #
            return response
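
    The registration-date handling in SegmentfaultSpiderMiddleware can be illustrated with a minimal standalone sketch (hypothetical file name; the fixed "now" is only there to make the expected output reproducible):

    # normalize_register_date_demo.py -- standalone sketch of the normalization in
    # SegmentfaultSpiderMiddleware.process_spider_output
    import re
    import datetime

    def normalize_register_date(register_date, now=None):
        now = now or datetime.datetime.now()
        # Collect all digits, e.g. '注册于 2015年12月12日' -> '20151212'
        day = ''.join(re.findall(r'\d+', register_date))
        if len(day) > 4:                     # absolute date form
            return day
        elif '时' in register_date:          # 'registered N hours ago' form
            return (now - datetime.timedelta(hours=int(day))).strftime("%Y%m%d")
        else:                                # 'registered N days ago' form
            return (now - datetime.timedelta(days=int(day))).strftime("%Y%m%d")

    if __name__ == '__main__':
        fixed_now = datetime.datetime(2019, 2, 13, 22, 0)
        for s in ['注册于 2015年12月12日', '注册于 3 天前', '注册于 5 小时前']:
            print(s, '->', normalize_register_date(s, now=fixed_now))
        # 注册于 2015年12月12日 -> 20151212
        # 注册于 3 天前 -> 20190210
        # 注册于 5 小时前 -> 20190213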
    

    run.py

    from scrapy import cmdline
    # from segmentfault.get_cookies import GetCookies
    from get_cookies import GetCookies
    
    if __name__ == '__main__':
        cookies = GetCookies()
        cookies.save()
        name = 'userinfo'
        cmd = 'scrapy crawl {}'.format(name)
        cmdline.execute(cmd.split())
    