macOS: a log of rebuilding the budejie.com video-scraping process with Python 3 and the Scrapy framework


    1. Set up a virtual Python 3 environment (virtualenvwrapper)

    Reference: http://www.cnblogs.com/it-tsz/p/pyhton.html
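
    A minimal sketch of that setup, assuming virtualenvwrapper is installed via pip; the environment name scrapy_env is just an example, and the path of virtualenvwrapper.sh may differ per install:

    pip3 install virtualenvwrapper
    export WORKON_HOME=~/.virtualenvs
    source /usr/local/bin/virtualenvwrapper.sh
    mkvirtualenv -p python3 scrapy_env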

    2. Install Scrapy

    Prerequisites: pip and setuptools must already be installed. Then install the following modules:

    pip install lxml
    pip install twisted
    pip install pyopenssl

    On Windows, pywin32 is also required (pip install pywin32).

    Finally, install Scrapy itself:

    pip install scrapy
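
    A quick sanity check that the toolchain is in place (prints the installed Scrapy version):

    scrapy version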

    3. Generate a Scrapy spider project template

    scrapy startproject <project_name> [project_dir]

    For example:

    scrapy startproject budejie
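
    This produces the standard Scrapy project skeleton:

    budejie/
        scrapy.cfg            # deploy configuration file
        budejie/              # the project's Python module
            __init__.py
            items.py          # item definitions
            middlewares.py    # spider and downloader middlewares
            pipelines.py      # item pipelines
            settings.py       # project settings
            spiders/          # where spider modules live
                __init__.py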

    4. Generate the spider module

    scrapy genspider [options] <name> <domain>

    For example:

    cd budejie 

    scrapy genspider getbudejievideo budejie.com
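
    genspider drops a minimal skeleton into budejie/spiders/getbudejievideo.py; with the default template it looks roughly like this (the exact output varies slightly across Scrapy versions):

    # -*- coding: utf-8 -*-
    import scrapy


    class GetbudejievideoSpider(scrapy.Spider):
        name = 'getbudejievideo'
        allowed_domains = ['budejie.com']
        start_urls = ['http://budejie.com/']

        def parse(self, response):
            pass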

    5. Edit the spider module (getbudejievideo.py):

    # -*- coding: utf-8 -*-
    import scrapy
    import os
    import urllib.request
    from lxml import etree


    # Progress callback for urlretrieve(); prints the current download progress.
    # a: number of blocks downloaded so far
    # b: size of each block
    # c: total size of the remote file
    def jindu(a, b, c):
        if not a:
            print("Connection opened")
        if c < 0:
            print("The file to download has size 0")
        else:
            per = 100 * a * b / c
            if per > 100:
                per = 100
            print(" Current download progress: " + '%.2f%%' % per, end='')
            if per == 100:
                return True


    ''' def __init__(self):
        self.headers = {
            # 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            # 'Accept-Encoding': 'gzip, deflate',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
        }
    '''


    class GetbudejievideoSpider(scrapy.Spider):
        name = 'getbudejievideo'
        allowed_domains = ['budejie.com']
        start_urls = ['http://budejie.com/video']

        cur_page = 1

        def parse(self, response):
            print('*' * 100)

            # Create the directory the videos are saved into
            path = os.path.join(os.path.abspath(os.path.curdir), 'videos')
            if not os.path.exists(path):
                os.mkdir(path)

            # Collect the URLs of all videos on the current page
            try:
                data = etree.HTML(response.text)
                video_urls = data.xpath('//div[@class="j-video-c"]/div[@data-mp4]')

                # <a href="2" class="pagenxt">next page</a>
                nextpage = data.xpath('//a[@class="pagenxt"]')
                if nextpage:
                    nextpage = nextpage[0].get('href')
            except Exception:
                print('lxml parse failed------------------------------')
                return

            if not video_urls:
                return

            # Download the video file behind each video URL on this page
            for v in video_urls:
                video_url = v.get('data-mp4')
                print('Downloading: {}'.format(video_url))
                p = os.path.join(path, video_url.split('/')[-1])

                print(p)

                if not os.path.exists(p):
                    try:
                        urllib.request.urlretrieve(video_url, p, jindu)
                    except Exception:
                        print(" Failed to download file: {}".format(video_url))

            # Check whether there is a next page
            if nextpage:
                if nextpage == '1':
                    return
                nextpage_url = self.start_urls[0] + '/' + nextpage
                self.cur_page += 1
                print(' Downloading page {} videos: {}'.format(self.cur_page, nextpage_url))
                # Yield a new Request so parse() is called back on the next page
                yield scrapy.Request(nextpage_url, callback=self.parse)
            else:
                return
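
    Before a full crawl, the XPath expressions can be sanity-checked interactively with scrapy shell; the selectors below are the same ones parse() uses:

    scrapy shell 'http://budejie.com/video'
    >>> response.xpath('//div[@class="j-video-c"]/div[@data-mp4]/@data-mp4').extract()
    >>> response.xpath('//a[@class="pagenxt"]/@href').extract_first()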
    6. Edit the following options in the project's settings.py:

    ...
    # The following makes requests look like they come from a real browser
    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    # USER_AGENT = 'budejie (+http://www.budejie.com)'
    USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'

    # Obey robots.txt rules -- set to False here to skip the robots.txt check
    ROBOTSTXT_OBEY = False
    ...
    7. Start the crawl with Scrapy:

    scrapy crawl getbudejievideo
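
    Settings can also be overridden per run with Scrapy's -s flag; for example, to cut down log output:

    scrapy crawl getbudejievideo -s LOG_LEVEL=INFO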
    8. Tested successfully.

    Through this case I learned more about Scrapy, XPath and related topics, and the hands-on process sharpened my ability to analyze and solve problems. Keep going!

     

    Version without the external lxml XPath library: the spider module (getbudejievideo.py) is modified as follows, using Scrapy's built-in selectors plus the re module:

    # -*- coding: utf-8 -*-
    import scrapy
    import os
    import urllib.request
    import re


    # Progress callback for urlretrieve(); prints the current download progress.
    # a: number of blocks downloaded so far
    # b: size of each block
    # c: total size of the remote file
    def jindu(a, b, c):
        if not a:
            print("Connection opened")
        if c < 0:
            print("The file to download has size 0")
        else:
            per = 100 * a * b / c
            if per > 100:
                per = 100
            print(" Current download progress: " + '%.2f%%' % per, end='')
            if per == 100:
                return True


    class GetbudejievideoSpider(scrapy.Spider):
        name = 'getbudejievideo'
        allowed_domains = ['www.budejie.com']
        start_urls = ['http://www.budejie.com/video']

        cur_page = 1

        def parse(self, response):
            print('Downloading url: {}'.format(response.url))

            # Create the directory the videos are saved into
            path = os.path.join(os.path.abspath(os.path.curdir), 'videos')
            if not os.path.exists(path):
                os.mkdir(path)

            # Collect the URLs of all videos on the current page
            print('-' * 100)
            try:
                video_urls = response.xpath('//div[@class="j-video-c"]/div[@data-mp4]').extract()
                v_urls = []
                for i in video_urls:
                    v = re.findall(r'data-mp4="(.*?)"', i, re.M)
                    if v:
                        v_urls.append(v[0])

                # <a href="2" class="pagenxt">next page</a>
                nextpage = response.xpath('//a[@class="pagenxt"]').extract()
                if nextpage:
                    nextpage = re.findall(r'href="(.*?)"', nextpage[0])
            except Exception:
                print('parse failed:')
                return

            if not v_urls:
                return

            # Download the video file behind each video URL on this page
            for v in v_urls:
                print('Downloading: {}'.format(v))
                p = os.path.join(path, v.split('/')[-1])

                print(p)

                if not os.path.exists(p):
                    try:
                        urllib.request.urlretrieve(v, p, jindu)
                    except Exception:
                        print(" Failed to download file: {}".format(v))

            # Check whether there is a next page
            if nextpage:
                if nextpage[0] == '1':
                    return

                nextpage_url = self.start_urls[0] + '/' + nextpage[0]
                print(nextpage_url)

                self.cur_page += 1
                print(' Downloading page {} videos: {}'.format(self.cur_page, nextpage_url))
                # Yield a new Request so parse() is called back on the next page
                yield scrapy.Request(nextpage_url, callback=self.parse)
            else:
                return
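
    A design note: the regex post-processing above can be avoided entirely, because Scrapy's selectors can address attributes directly. A minimal sketch using the same selectors:

    v_urls = response.xpath('//div[@class="j-video-c"]/div[@data-mp4]/@data-mp4').extract()
    nextpage = response.xpath('//a[@class="pagenxt"]/@href').extract_first()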

    Auxiliary debug module (debug.py):

    from scrapy import cmdline

    if __name__ == '__main__':
        cmdline.execute('scrapy crawl getbudejievideo'.split(' '))
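
    With debug.py in the project root, the spider can be launched (and stepped through in an IDE debugger) by running this file directly instead of typing scrapy crawl in a terminal.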
