• Scrapy full-site scraping practice: Xinpianchang (xpc)


    # spider file
    # -*- coding: utf-8 -*-
    import scrapy
    import re
    from scrapy import Request
    import json
    import string
    import random
    from xpc.items import PostItem, CommentItem, CopyItem  # multiple item classes
    
    def strip(s):
        # strip s if it exists; otherwise return an empty string
        if s:
            return s.strip()
        return ""
    
    
    # scrapy.Request and scrapy.FormRequest keep cookies between requests by default.
    # For the simulated login itself, skip the Scrapy framework and use the requests module directly.
    cookies = dict(
        Authorization='4F635191B0602B5D3B06024483B0602AAF8B06023C2F6259656D'
    )
    # The cookie above is returned by the site; log in first and copy it out of the browser.
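    # A minimal sketch of fetching that cookie with the requests library; the
    # login URL and form field names below are assumptions, not the site's
    # documented API, so adjust them to the real login flow before use.
    # import requests
    #
    # def fetch_auth_cookies(username, password):
    #     session = requests.Session()
    #     session.post('https://www.xinpianchang.com/login',  # hypothetical endpoint
    #                  data={'account': username, 'password': password})
    #     # everything the server set during login, e.g. Authorization
    #     return session.cookies.get_dict()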
    
    # generate a 26-character session id of lowercase letters and digits
    def gen_sessionid():
        return "".join(random.choices(string.ascii_lowercase + string.digits, k=26))
    
    
    class XinpianchangSpider(scrapy.Spider):
        name = 'XinPianChang'
        allowed_domains = ['xinpianchang.com', 'openapi-vtom.vmovier.com']
        start_urls = ['https://www.xinpianchang.com/channel/index/sort-like?from=tabArticle']
        # Suppose the crawl starts from page 21: the request must then carry cookies, and the
        # cookie set above no longer works on its own because the site returns four cookies.
        # In that case, override the start_requests method (see the commented version below).
        # start_urls = ['https://www.xinpianchang.com/channel/index/type-/sort-like/duration_type-0/resolution_type-/page-21']
        page_count = 0
    
        # Override the parent class's start_requests, which by default sends GET requests to the URLs in start_urls
        # def start_requests(self):
        #     for url in self.start_urls:
        #         # data = {
        #         #     "kw": "cat"
        #         # }
        #         # to send a POST request instead, use FormRequest:
        #         # yield scrapy.FormRequest(url=url, callback=self.parse, formdata=data)
        #
        #         c = cookies.copy()
        #         c.update(PHPSESSID=gen_sessionid(),
        #                  SERVER_ID='b52601c8-285bdd26',
        #                  channel_page='apU%3D')
        #         yield Request(url, cookies=c, dont_filter=True)
    
    
        def parse(self, response):
            # from scrapy.shell import inspect_response
            # inspect_response(response, self)
            self.page_count += 1
            if self.page_count >= 100:
                cookies.update(PHPSESSID=gen_sessionid())
                self.page_count = 0
    
            url_list = response.xpath('//ul[@class="video-list"]/li/@data-articleid').extract()
            for pid in url_list:
                detail_url = 'https://www.xinpianchang.com/a{}?from=ArticleList'.format(pid)
                # print(detail_url)
                request = response.follow(detail_url, callback=self.parse_post)
                request.meta['pid'] = pid
                yield request  # request the work's detail page
    
            pages = response.xpath('//div[@class="page"]/a/@href').extract()
            for page_url in pages:
                # print("列表页翻页url", page_url)  # page_url是一个相对路径,不完整的
                yield response.follow(page_url, self.parse, cookies=cookies)
    
        def parse_post(self, response):
            pid = response.meta['pid']
            post = PostItem()
            post['pid'] = pid
            post['title'] = response.xpath('//div[@class="title-wrap"]/h3/text()').get()
            # video_url = 'https://openapi-vtom.vmovier.com/v3/video/5E34203E92450?expand=resource&usage=xpc_web'
            # response.text holds the raw page source
            vid = re.findall('vid: "(.*?)",', response.text)[0]
            # print(vid)
            video_url = 'https://openapi-vtom.vmovier.com/v3/video/{}?expand=resource&usage=xpc_web'.format(vid)
            cates = response.xpath('//span[@class="cate v-center"]/a/text()').extract()
            post['category'] = ''.join([cate.strip() for cate in cates])
            post['create_time'] = response.xpath('//span[contains(@class,"update-time")]/i/text()').get()
            post['play_count'] = response.xpath('//i[contains(@class,"play-counts")]/text()').get()
            desc_lst = response.xpath('//p[contains(@class,"desc")]//text()').extract()
            post['desc'] = ' '.join([i.strip() for i in desc_lst])
    
            # request this video_url; note the extra hop here
            request = Request(video_url, callback=self.parse_video)
            # pass the post gathered so far to the next callback via meta (request meta passing)
            request.meta['post'] = post
            yield request
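            # Note: on Scrapy 1.7+ the same hand-off can use cb_kwargs, which keeps
            # the payload out of the middleware-visible meta dict, e.g.:
            # yield Request(video_url, callback=self.parse_video,
            #               cb_kwargs={'post': post})
            # (parse_video would then take post as a keyword argument.)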
    
            # comment API URL, e.g. https://app.xinpianchang.com/comments?resource_id=10664352&type=article&page=1&per_page=24
            comment_url = "https://app.xinpianchang.com/comments?resource_id={}&type=article&page=1&per_page=24".format(
                pid)
            request = Request(comment_url, callback=self.parse_comment)
            # pass the pid to the next callback via meta
            request.meta['pid'] = pid
            yield request
    
            # creator page links
            creator_list = response.xpath('//div[@class="filmplay-creator right-section"]/ul[@class="creator-list"]/li')
            # cid = response.xpath('//div[@class="filmplay-creator right-section"]/ul[@class="creator-list"]/li/a/@data-userid')
            for creator in creator_list:
                cid = creator.xpath('./a/@data-userid').get()
                composer_url = 'https://www.xinpianchang.com/u{}?from=articleList'.format(cid)
                request = response.follow(composer_url, self.parse_composer)
                request.meta['cid'] = cid
                # keep merged cookies from piling up in the request headers after the cookie jar updates
                request.meta['dont_merge_cookies'] = True
                yield request
    
                # creator-to-work mapping
                cr = CopyItem()
                cr['pid'] = pid
                cr['cid'] = cid
                cr['pcid'] = pid + cid
                cr['role'] = creator.xpath('./div[@class="creator-info"]/span/text()').get()
                # print("cr", cr)
                yield cr
    
        def parse_video(self, response):  # this response body is JSON
            post = response.meta['post']
            # parse the returned JSON first
            result = json.loads(response.text)
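            # The payload is assumed to look roughly like
            # {"data": {"resource": {"default": {"url": "..."}}}} given the key
            # path below; inspect a live response if the API shape changes.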
            post['video_url'] = result['data']['resource']['default']['url']
            # hand the finished item straight to the pipelines
            yield post
    
        def parse_comment(self, response):
            result = json.loads(response.text)
            for c in result['data']['list']:
                comment = CommentItem()
                comment['uname'] = c['userInfo']['username']
                comment['user_id'] = c['userInfo']['id']
                # comment['user_page'] = c['userInfo']['web_url']
                comment['content'] = c['content']
                comment['content_id'] = c['id']
                # print(comment)
                yield comment
    
            # follow the next page if there is one
            if result['data']['next_page_url']:
                next_page = 'https://app.xinpianchang.com' + result['data']['next_page_url']
                # print("next_page", next_page)
                yield response.follow(next_page, self.parse_comment)
    
        def parse_composer(self, response):
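            # Left unimplemented in the original. A minimal sketch, assuming a
            # ComposerItem with cid/name fields and this XPath (both are
            # assumptions, not verified against the live page):
            # composer = ComposerItem()
            # composer['cid'] = response.meta['cid']
            # composer['name'] = response.xpath(
            #     '//p[contains(@class, "creator-name")]/text()').get()
            # yield composer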
            pass
    # settings file
    # -*- coding: utf-8 -*-
    
    # Scrapy settings for xpc project
    #
    # For simplicity, this file contains only settings considered important or
    # commonly used. You can find more settings consulting the documentation:
    #
    #     https://docs.scrapy.org/en/latest/topics/settings.html
    #     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
    #     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
    
    BOT_NAME = 'xpc'
    
    SPIDER_MODULES = ['xpc.spiders']
    NEWSPIDER_MODULE = 'xpc.spiders'
    
    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    # USER_AGENT = 'xpc (+http://www.yourdomain.com)'
    
    # Obey robots.txt rules
    ROBOTSTXT_OBEY = False
    
    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    # CONCURRENT_REQUESTS = 32
    
    # Configure a delay for requests for the same website (default: 0)
    # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
    # See also autothrottle settings and docs
    # DOWNLOAD_DELAY = 3
    # The download delay setting will honor only one of:
    # CONCURRENT_REQUESTS_PER_DOMAIN = 16
    # CONCURRENT_REQUESTS_PER_IP = 16
    
    # Disable cookies (enabled by default)
    # Set COOKIES_ENABLED = True when passing custom cookies on each request;
    # set it to False to fall back on the static cookie from settings.
    COOKIES_ENABLED = True
    COOKIES_DEBUG = True  # log detailed cookie information
    
    # Disable Telnet Console (enabled by default)
    # TELNETCONSOLE_ENABLED = False
    
    # Override the default request headers:
    DEFAULT_REQUEST_HEADERS = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)',
    }
    
    # Enable or disable spider middlewares
    # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
    # SPIDER_MIDDLEWARES = {
    #    'xpc.middlewares.XpcSpiderMiddleware': 543,
    # }
    
    # Enable or disable downloader middlewares
    # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
    # DOWNLOADER_MIDDLEWARES = {
    #    'xpc.middlewares.XpcDownloaderMiddleware': 543,
    # }
    
    # Enable or disable extensions
    # See https://docs.scrapy.org/en/latest/topics/extensions.html
    # EXTENSIONS = {
    #    'scrapy.extensions.telnet.TelnetConsole': None,
    # }
    
    # Configure item pipelines
    # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    ITEM_PIPELINES = {
       'xpc.pipelines.XpcPipeline': 300,  # lower number runs first (higher priority)
       # 'xpc.pipelines.MysqlPipeline': 301,
       # 'xpc.pipelines.RedisPipeline': 302,
    }
    
    # Enable and configure the AutoThrottle extension (disabled by default)
    # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
    # AUTOTHROTTLE_ENABLED = True
    # The initial download delay
    # AUTOTHROTTLE_START_DELAY = 5
    # The maximum download delay to be set in case of high latencies
    # AUTOTHROTTLE_MAX_DELAY = 60
    # The average number of requests Scrapy should be sending in parallel to
    # each remote server
    # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
    # Enable showing throttling stats for every response received:
    # AUTOTHROTTLE_DEBUG = False
    
    # Enable and configure HTTP caching (disabled by default)
    # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
    HTTPCACHE_ENABLED = False   # True caches visited pages so repeat requests never hit the network
    # HTTPCACHE_ENABLED = True
    
    # HTTPCACHE_EXPIRATION_SECS = 0
    # HTTPCACHE_DIR = 'httpcache'
    # HTTPCACHE_IGNORE_HTTP_CODES = []
    # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
    
    # log level: INFO, DEBUG, ERROR
    LOG_LEVEL = 'DEBUG'
    # items file
    # -*- coding: utf-8 -*-
    import scrapy
    
    class PostItem(scrapy.Item):
        # stores video post info
        # custom attribute: with several tables, each item carries a table_name
        table_name = 'posts'
    
        # the data fields
        pid = scrapy.Field()
        title = scrapy.Field()
        category = scrapy.Field()
        create_time = scrapy.Field()
        play_count = scrapy.Field()
        desc = scrapy.Field()
        video_url = scrapy.Field()
    
    
    class CommentItem(scrapy.Item):
        # stores comment info
        table_name = 'comments'
        content_id = scrapy.Field()
        pid = scrapy.Field()
        cid = scrapy.Field()
        uname = scrapy.Field()
        user_id = scrapy.Field()
        content = scrapy.Field()
        user_page = scrapy.Field()
    

    class CopyItem(scrapy.Item):
        # creator-to-work mapping (copyright info)
        table_name = 'copyrights'
        pcid = scrapy.Field()  # primary key of the table
        pid = scrapy.Field()
        cid = scrapy.Field()
        role = scrapy.Field()
    # pipelines file
    # -*- coding: utf-8 -*-
    
    import csv
    import json  # needed to serialize items for Redis
    from xpc.items import PostItem, CommentItem, CopyItem
    import pymysql
    from redis import Redis
    import os
    
    class XpcPipeline(object):
        def __init__(self):
            # csv file saved alongside this module
            store_file = os.path.dirname(__file__) + '/xpc.csv'
            # open the output file
            self.file = open(store_file, 'w', newline="")
            # csv writer
            self.writer = csv.writer(self.file)

        def open_spider(self, spider):
            print("pipeline: spider started......")

        # handle several different item types in one pipeline
        def process_item(self, item, spider):
            if isinstance(item, PostItem):
                print("post info:", item)
            elif isinstance(item, CommentItem):
                print("comment info:", item)
            elif isinstance(item, CopyItem):
                print("copyright info:", item)
            return item  # pass the item on to the next pipeline class

        def close_spider(self, spider):
            print("pipeline: spider finished......")
            self.file.close()  # close the csv file opened in __init__


    # database pipeline (MySQL)
    class MysqlPipeline(object):
        conn = None
        cursor = None

        def open_spider(self, spider):
            self.conn = pymysql.Connect(
                host='127.0.0.1',
                port=3306,
                user='root',
                password='',
                db='test_db',
                charset='utf8'
            )
            print("database connected")

        def process_item(self, item, spider):
            self.cursor = self.conn.cursor()
            try:
                # parameterized query; the original interpolated values into the
                # SQL string, which is unsafe. Note the author/content fields come
                # from a different demo and don't exist on the xpc items.
                self.cursor.execute(
                    'insert into test_db values(%s, %s)',
                    (item['author'], item['content']))
                self.conn.commit()
            except Exception as e:
                print("database insert failed:", e)
                print("rolling back")
                self.conn.rollback()
            return item

        def close_spider(self, spider):
            print("closing database connection")
            self.cursor.close()
            self.conn.close()


    # database pipeline (Redis)
    class RedisPipeline(object):
        conn = None

        def open_spider(self, spider):
            self.conn = Redis(
                host='127.0.0.1',
                port=6379
            )
            print("Redis connected")

        def process_item(self, item, spider):
            dic = {
                "author": item["author"],
                "content": item["content"]
            }
            # redis-py cannot lpush a dict directly, so serialize it first
            self.conn.lpush("item_queue", json.dumps(dic))
            return item

        def close_spider(self, spider):
            print("closing Redis connection")
            self.conn.close()
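
    # The table_name attribute on each item invites one generic SQL pipeline in
    # place of per-item branches. A minimal sketch, assuming every item's field
    # names match its table's column names (a schema assumption, not verified),
    # that builds a parameterized INSERT from the populated fields:
    class GenericMysqlPipeline(object):
        def open_spider(self, spider):
            self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root',
                                        password='', db='test_db', charset='utf8')

        def process_item(self, item, spider):
            keys = list(item.keys())  # only the fields actually filled in
            sql = 'insert into {} ({}) values ({})'.format(
                item.table_name, ', '.join(keys), ', '.join(['%s'] * len(keys)))
            with self.conn.cursor() as cursor:
                cursor.execute(sql, [item[k] for k in keys])
            self.conn.commit()
            return item

        def close_spider(self, spider):
            self.conn.close()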
  • Source: https://www.cnblogs.com/kenD/p/12304304.html