1.搭建虚拟python3环境(Virtualenvwrapper)
2.安装scrapy
前提先安装好pip,setuptools,然后安装以下模块
pip install lxml
pip install twisted
pip install pyopenssl
windows下需要安装pywin32(pip install pywin32)
最后安装scrapy(pip install scrapy)
3.通过scrapy生成scrapy spider 工程模版
scrapy startproject <project_name> [project_dir]
如:
scrapy startproject budejie
4.生成spider模块
scrapy genspider [options] <name> <domain>
如:
cd budejie
scrapy genspider getbudejievideo budejie.com
5.修改spider模块(getbudejievideo.py)
# -*- coding: utf-8 -*-
import os
import urllib
import urllib.request

import scrapy
from lxml import etree
def jindu(a, b, c):
    """Print the current download progress; meant as a ``urlretrieve`` report hook.

    Args:
        a: Number of data blocks downloaded so far.
        b: Size of one data block.
        c: Size of the remote file. Zero or negative means the size is not
            usable for a percentage, so only a notice is printed.

    Returns:
        True once the computed progress reaches 100%, otherwise None.
    """
    if not a:
        print("连接打开")
    # `<=` rather than `<`: a size of exactly 0 would otherwise divide by zero.
    if c <= 0:
        print("要下载的文件大小为0")
        return None
    # The last block is usually only partly filled, so a * b can overshoot c.
    per = min(100 * a * b / c, 100)
    print(" 当前下载进度为:" + '%.2f%%' % per, end='')
    if per == 100:
        return True
    return None
''' def __init__(self):
self.headers = {
# 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
# 'Accept-Encoding': 'gzip, deflate',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
}
'''
class GetbudejievideoSpider(scrapy.Spider):
    """Spider that saves the mp4 files referenced on the budejie.com video pages.

    Each listing page is scanned for elements carrying a ``data-mp4``
    attribute, the referenced files are stored in a ``videos`` directory under
    the current working directory, and the "next page" link is then followed.
    """

    name = 'getbudejievideo'
    allowed_domains = ['budejie.com']
    start_urls = ['http://budejie.com/video']

    # Base URL that the relative "next page" href is appended to.
    __init_url = "http://www.budejie.com/video"
    # Number of the listing page being processed, used only for the log line.
    cur_page = 1

    def getVideoList(self, html):
        """Extract the video elements and the next-page href from one page.

        Args:
            html: Markup of a listing page.

        Returns:
            A ``(video_urls, next_page)`` tuple. ``video_urls`` is the list of
            elements that carry a ``data-mp4`` attribute. ``next_page`` is the
            ``href`` of the first ``a.pagenxt`` anchor, or an empty list when
            the page has no such anchor. ``(None, None)`` if parsing failed.
        """
        try:
            data = etree.HTML(html)
            video_urls = data.xpath('//div[@class="j-video-c"]/div[@data-mp4]')
            # The pager anchor looks like: <a href="2" class="pagenxt">...</a>
            next_page = data.xpath('//a[@class="pagenxt"]')
            if next_page:
                next_page = next_page[0].get('href')
            return video_urls, next_page
        except Exception:
            # Best effort: an unparsable page is reported, not raised.
            print('lxml parse failed')
            return None, None

    def _download_video(self, video_url, dest):
        """Download ``video_url`` to ``dest`` without leaving a partial file there."""
        # Write to a side file first: a truncated download at ``dest`` would be
        # mistaken for a finished one by the existence check in ``parse``.
        partial = dest + '.part'
        try:
            # NOTE(review): urlretrieve is synchronous, so the crawl waits for
            # every file; Scrapy's FilesPipeline would avoid that.
            urllib.request.urlretrieve(video_url, partial, jindu)
            os.replace(partial, dest)
        except Exception:
            # One failed video must not abort the remaining downloads.
            print(" 下载文件:{}失败".format(video_url))
            try:
                os.remove(partial)
            except OSError:
                # Nothing was written, or the side file is already gone.
                pass

    def parse(self, response):
        """Download every video on the page, then request the next page.

        Args:
            response: Scrapy response for one listing page.

        Yields:
            A ``scrapy.Request`` for the next listing page, if there is one.
        """
        print('*' * 100)
        print(type(response))

        # Directory the video files are saved to.
        path = os.path.join(os.path.abspath(os.path.curdir), 'videos')
        # exist_ok avoids failing when the directory is created concurrently.
        os.makedirs(path, exist_ok=True)

        # Collect the video elements and the next-page href of this page.
        try:
            video_urls, nextpage = self.getVideoList(response.text)
        except Exception:
            # getVideoList reports parse errors itself; this guards the
            # access to response.text, which the original also protected.
            print('lxml parse failed------------------------------')
            return
        if not video_urls:
            return

        # Download each video of this page unless it is already on disk.
        for v in video_urls:
            video_url = v.get('data-mp4')
            print('下载:{}'.format(video_url))
            p = os.path.join(path, video_url.split('/')[-1])
            print(p)
            if not os.path.exists(p):
                self._download_video(video_url, p)

        # Follow the pager. NOTE(review): an href of '1' presumably means the
        # pager wrapped around to the first page - confirm against the site.
        if not nextpage or nextpage == '1':
            return
        nextpage_url = self.__init_url + '/' + nextpage
        self.cur_page += 1
        print(' 下载第{}页视频数据:{}'.format(self.cur_page, nextpage_url))
        # Yielding the request makes Scrapy call parse again for the new page.
        yield scrapy.Request(nextpage_url, callback=self.parse)
6.修改配置文件settings.py中的以下配置项
。。。
# 以下配置用于模拟浏览器访问
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'budejie (+http://www.budejie.com)'
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
# Obey robots.txt rules 跳过robots.txt验证
ROBOTSTXT_OBEY = False
。。。
7.通过scrapy开启爬虫数据采集
scrapy crawl getbudejievideo
8.测试成功。