• macOS下python3通过scrapy框架重新生成不得姐网站视频采集过程日志


    1.搭建虚拟python3环境(Virtualenvwrapper)

    2.安装scrapy

    前提先安装好pip,setuptools,然后安装以下模块

     pip install lxml

    pip install twisted

    pip install pyopenssl

    windows下需要安装pywin32(pip install pywin32)

    最后安装scrapy

    3.通过scrapy生成scrapy spider 工程模版

    scrapy startproject <project_name> [project_dir]

    如:

    scrapy startproject budejie

    4.生成spider模块

    scrapy genspider [options] <name> <domain>

    如:

    cd budejie 

    scrapy genspider getbudejievideo budejie.com

    5.修改spider模块(getbudejievideo.py)

      

    # -*- coding: utf-8 -*-
    import os
    import urllib
    # `import urllib` alone does not expose urllib.request in Python 3;
    # parse() calls urllib.request.urlretrieve, so the submodule must be imported.
    import urllib.request

    import scrapy
    from lxml import etree


    # urlretrieve()的回调函数,显示当前的下载进度
    # a为已经下载的数据块
    # b为数据块大小
    # c为远程文件的大小
    # myper=0

    def jindu(a, b, c):
        """urlretrieve() reporthook: print the current download progress.

        a: number of data blocks transferred so far
        b: size of one data block, in bytes
        c: total size of the remote file (urlretrieve passes -1 when unknown)

        Returns True once the download reaches 100%, otherwise None.
        """
        if not a:
            # first callback (zero blocks transferred) = connection just opened
            print("连接打开")
        # BUGFIX: the original tested `c < 0` only, so c == 0 (zero/unknown
        # length) fell through to the division below and raised
        # ZeroDivisionError; `c <= 0` covers both cases.
        if c <= 0:
            print("要下载的文件大小为0")
        else:
            per = 100 * a * b / c
            # the last block is usually partial, so the estimate can exceed 100%
            if per > 100:
                per = 100
            print(" 当前下载进度为:" + '%.2f%%' % per, end='')
            if per == 100:
                return True


    ''' def __init__(self):
    self.headers = {
    # 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    # 'Accept-Encoding': 'gzip, deflate',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
    }
    '''


    class GetbudejievideoSpider(scrapy.Spider):
        """Spider that downloads every video on www.budejie.com/video.

        parse() saves each page's .mp4 files into a local ./videos directory
        (via urllib.request.urlretrieve with the module-level `jindu` progress
        hook) and follows the "下一页" pager link until it wraps back to page 1.
        """

        name = 'getbudejievideo'
        allowed_domains = ['budejie.com']
        # CONSISTENCY FIX: the original start_urls omitted the `www.` host that
        # __init_url below uses; keep both on the same host.
        start_urls = ['http://www.budejie.com/video']

        # base url used to build each next-page request
        __init_url = "http://www.budejie.com/video"
        # 1-based count of pages processed so far (used for log output only)
        cur_page = 1

        def getVideoList(self, html):
            """Parse one page of HTML.

            Returns (video_divs, next_page) where video_divs is the list of
            <div data-mp4=...> elements and next_page is the pager href (or an
            empty list when no pager link exists). Returns (None, None) when
            lxml fails to parse the page.
            """
            try:
                data = etree.HTML(html)
                # each video: <div class="j-video-c"><div data-mp4="..."></div>
                video_urls = data.xpath('//div[@class="j-video-c"]/div[@data-mp4]')
                # pager link: <a href="2" class="pagenxt">下一页</a>
                next_page = data.xpath('//a[@class="pagenxt"]')
                if next_page:
                    next_page = next_page[0].get('href')
                return video_urls, next_page
            except Exception:
                print('lxml parse failed')
                return None, None

        def parse(self, response):
            """Download all videos on this page, then follow the next-page link."""
            print('*' * 100)
            print(type(response))

            # directory that receives the downloaded .mp4 files
            path = os.path.join(os.path.abspath(os.path.curdir), 'videos')
            # makedirs(exist_ok=True) is race-free, unlike exists()+mkdir()
            os.makedirs(path, exist_ok=True)

            # CONSISTENCY FIX: reuse getVideoList instead of duplicating its
            # xpath logic inline (the original repeated both expressions here).
            video_urls, nextpage = self.getVideoList(response.text)
            if not video_urls:
                return

            # download every video on the current page
            for v in video_urls:
                video_url = v.get('data-mp4')
                print('下载:{}'.format(video_url))
                # save under the remote file's basename
                p = os.path.join(path, video_url.split('/')[-1])
                print(p)
                # skip files that were already downloaded on a previous run
                if not os.path.exists(p):
                    try:
                        urllib.request.urlretrieve(video_url, p, jindu)
                    except Exception:
                        print(" 下载文件:{}失败".format(video_url))

            # follow pagination; the site's last page links back to page "1",
            # which is the stop condition
            if nextpage:
                if nextpage == '1':
                    return
                nextpage_url = self.__init_url + '/' + nextpage
                self.cur_page += 1
                print(' 下载第{}页视频数据:{}'.format(self.cur_page, nextpage_url))
                # yield a new Request so scrapy keeps calling parse() on the
                # following pages via this generator
                yield scrapy.Request(nextpage_url, callback=self.parse)
    6.修改配置文件settings.py中的
    。。。
    #以下为模拟浏览器验证
    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    # USER_AGENT = 'budejie (+http://www.budejie.com)'
    USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'

    # Obey robots.txt rules 跳过robots验证
    ROBOTSTXT_OBEY = False
    。。。
    7.通过scrapy开启爬虫数据采集
    scrapy crawl getbudejievideo
    8.测试成功。

     

     

  • 相关阅读:
    Eclipse下,修改MAVEN 中央仓库地址,解决maven下载慢问题
    C语言中头文件string的用法
    Linux中变量$#,$@,$0,$1,$2,$*,$$,$?的含义
    Curl
    LDAP是什么
    Linux网络基本网络配置
    vim
    request,session,cookie的比较
    J2EE开发过程中遇到的问题
    实现弹出登录窗口
  • 原文地址:https://www.cnblogs.com/it-tsz/p/8902834.html
Copyright © 2020-2023  润新知