Python爬虫实战项目
1. LOL所有英雄皮肤下载
from fake_useragent import UserAgent
import requests, json, os
# 爬取网页所有英雄的皮肤图片
# https://lol.qq.com/data/info-heros.shtml
# 获取英雄id
def get_heroList():
    """Fetch the ids of all heroes from the hero-list endpoint.

    Returns:
        A list holding the ``heroId`` value of every entry found under the
        ``hero`` key of the JSON response, or ``None`` when the request or
        the parsing fails (a failure message is printed in that case).
    """
    url = 'https://game.gtimg.cn/images/lol/act/img/js/heroList/hero_list.js'
    headers = {
        'User-Agent': UserAgent().chrome
    }
    try:
        response = requests.get(url, headers=headers)
        hero_list = json.loads(response.text)
        return [hero['heroId'] for hero in hero_list['hero']]
    # Only the failures this code can actually produce are handled, so that
    # KeyboardInterrupt / SystemExit are no longer swallowed.
    except (requests.RequestException, ValueError, KeyError, TypeError):
        print('获取英雄id失败')
        return None
# 根据英雄id获取英雄皮肤名称和图片下载地址
def get_skinNames(id):
    """Fetch skin names and image URLs for one hero.

    Args:
        id: Hero id, interpolated into the per-hero ``.js`` endpoint URL.

    Returns:
        A ``(skinnames, skin_urls)`` tuple of two parallel lists, or ``None``
        when the request or the parsing fails (a message is printed).
    """
    url = 'https://game.gtimg.cn/images/lol/act/img/js/hero/{}.js'.format(id)
    headers = {
        'User-Agent': UserAgent().chrome
    }
    try:
        response = requests.get(url, headers=headers)
        hero = json.loads(response.text)
        # The last element of ``skins`` is deliberately left out, and entries
        # with an empty ``mainImg`` are skipped because there is nothing to
        # download for them.
        skins = [skin for skin in hero['skins'][:-1] if skin['mainImg'] != '']
        skinnames = [skin['name'] for skin in skins]
        skin_urls = [skin['mainImg'] for skin in skins]
    # Narrow handler: interrupts and programming errors are not hidden.
    except (requests.RequestException, ValueError, KeyError, TypeError):
        print('获取英雄皮肤名称失败')
        return None
    return skinnames, skin_urls
# 根据名称,下载图片保存文件夹
def downloadImg(skinnames, skin_urls):
    """Download every skin image into a folder named after the first skin.

    Args:
        skinnames: Skin names; the first one is used as the folder name.
        skin_urls: Image URLs, parallel to ``skinnames``.

    Each image is written as ``<folder>/<skin name>.jpg``; ``/`` inside a
    skin name is replaced by ``_`` so it cannot act as a path separator.
    """
    # Nothing to do (and no folder name available) for an empty skin list.
    if not skinnames:
        return
    headers = {
        'User-Agent': UserAgent().chrome
    }
    folder = skinnames[0]
    os.makedirs(folder, exist_ok=True)
    for skinname, skin_url in zip(skinnames, skin_urls):
        try:
            response = requests.get(skin_url, headers=headers)
        except requests.RequestException:
            print(skinname + ' 下载失败')
            # One failed image should not abort the remaining skins.
            continue
        target = os.path.join(folder, skinname.replace('/', '_') + '.jpg')
        with open(target, 'wb') as f:
            f.write(response.content)
if __name__ == '__main__':
    # Entry point: download the skins of every hero and show the progress.
    # get_heroList() returns None on failure (and has already printed a
    # message); fall back to an empty list so the loop is simply skipped.
    hero_ids = get_heroList() or []
    total = len(hero_ids)
    for i, hero_id in enumerate(hero_ids, start=1):
        skins = get_skinNames(hero_id)
        # get_skinNames() returns None on failure; skip that hero instead of
        # crashing on tuple unpacking.
        if skins is not None:
            skinnames, skin_urls = skins
            downloadImg(skinnames, skin_urls)
        # The literal was split over two source lines (the escape sequence
        # was lost). With end='' a carriage return is presumably what was
        # intended, so the progress text is redrawn on one console line.
        print('\r下载进度:' + str(i) + '/' + str(total), end='')
2. 音乐下载软件
import requests, json, re
from tkinter import Tk, Button, Entry, StringVar, Radiobutton, Frame
from tkinter import messagebox
# 说明:
# 爬取网站:https://music.zhuolin.wang/
# ajax异步请求
# 下载的歌曲在软件所在目录下
# 根据输入找到歌曲信息
def get_musicInfo(query, sourse):
    """Search the music API and collect the id, name and artist of each hit.

    Args:
        query: Search text sent as the ``name`` form field.
        sourse: Platform identifier sent as the ``source`` form field.

    Returns:
        A ``(music_ids, music_names, music_singers)`` tuple of parallel
        lists built from the ``id``, ``name`` and ``artist`` keys of every
        element of the JSON response.
    """
    url = 'https://music.zhuolin.wang/api.php?'
    data = {
        'types': 'search',
        'count': '5',
        'source': sourse,
        'pages': '1',
        'name': query
    }
    headers = {
        'Accept': 'text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Content-Length': '37',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Cookie': '',
        'Host': 'music.zhuolin.wang',
        'Origin': 'https://music.zhuolin.wang',
        'Referer': 'https://music.zhuolin.wang/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest'
    }
    response = requests.post(url, headers=headers, data=data)
    songs = json.loads(response.text)
    music_ids = [song['id'] for song in songs]
    music_names = [song['name'] for song in songs]
    music_singers = [song['artist'] for song in songs]
    print(music_ids)
    print(music_names)
    print(music_singers)
    # The return statement used to be commented out, so the function
    # returned None and search_music() failed when unpacking the result.
    return music_ids, music_names, music_singers
# 根据id获取歌曲下载链接
def get_downloadUrl(music_id, name, singer, sourse):
    """Ask the music API for a song's download URL and start the download.

    Args:
        music_id: Song id sent as the ``id`` form field.
        name: Song name, forwarded to ``downloadMusic``.
        singer: Singer name, forwarded to ``downloadMusic``.
        sourse: Platform identifier sent as the ``source`` form field.

    When the response contains no ``http:`` URL an information dialog is
    shown instead of downloading.
    """
    url = 'https://music.zhuolin.wang/api.php?'
    data = {
        'types': 'url',
        'id': music_id,
        'source': sourse
    }
    headers = {
        'Accept': 'text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Content-Length': '37',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Cookie': '',
        'Host': 'music.zhuolin.wang',
        'Origin': 'https://music.zhuolin.wang',
        'Referer': 'https://music.zhuolin.wang/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest'
    }
    response = requests.post(url, data=data, headers=headers)
    print(response.text)
    match = re.search(r'http:(.+)",', response.text)
    if match is not None:
        # Use the captured group so the trailing '",' delimiter matched by
        # the pattern is not left at the end of the URL, then drop the
        # backslashes used to escape '/' in the response text. (The original
        # literal '\' was an unterminated string; it must be '\\'.)
        downloadurl = ('http:' + match.group(1)).replace('\\', '')
        downloadMusic(downloadurl, name, singer)
    else:
        messagebox.showinfo('抱歉', '该歌曲暂不提供下载,请您更换其他平台下载')
# 下载歌曲到本地
def downloadMusic(url, name, singer):
    """Download a song and save it as ``<name>-<singer>.mp3``.

    Args:
        url: Direct download URL of the song.
        name: Song name, used in the file name and the dialogs.
        singer: Singer name, used in the file name and the success dialog.

    The file is written relative to the current working directory. A dialog
    reports success or failure.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36'
    }
    # '/' would be interpreted as a directory separator by open().
    filename = (name + '-' + singer + '.mp3').replace('/', '_')
    try:
        response = requests.get(url, headers=headers)
        with open(filename, 'wb') as f:
            f.write(response.content)
    # Only network and file-system errors count as a failed download.
    except (requests.RequestException, OSError):
        messagebox.showinfo('抱歉', name + ' 下载失败')
    else:
        messagebox.showinfo('恭喜', name + '-' + singer + ' 下载完成')
# 点击搜索执行
def search_music():
    """Handle a click on the search button.

    Reads the query and the selected platform from the module-level widgets,
    looks the songs up with ``get_musicInfo`` and fills the five result rows
    (``value1..5`` / ``entry1..5`` / ``button1..5``). Every populated row's
    button is rebound to download that row's song.

    Returns:
        ``False`` when the query is empty or nothing was found, otherwise
        ``None``.
    """
    query = entry.get()
    sourse = v.get()
    if query == '':
        messagebox.showinfo('提示', '请输入内容!')
        return False
    found = get_musicInfo(query, sourse)
    # Guard against a lookup that produced nothing at all.
    if not found or not found[0]:
        messagebox.showinfo('抱歉', '没有找到相关歌曲')
        return False
    songs = list(zip(*found))
    rows = (
        (value1, entry1, button1),
        (value2, entry2, button2),
        (value3, entry3, button3),
        (value4, entry4, button4),
        (value5, entry5, button5),
    )
    for index, (value, row_entry, row_button) in enumerate(rows):
        row_entry['textvariable'] = value
        if index >= len(songs):
            # Fewer results than rows: clear what an earlier search may have
            # left behind instead of raising IndexError.
            value.set('')
            row_button['command'] = tishi
            continue
        music_id, name, singers = songs[index]
        music_id = str(music_id)
        name = str(name)
        # Only the first element of the artist value is displayed.
        singer = str(singers[0]) if singers else ''
        value.set(name + ' ' + singer)
        # Default arguments bind the current row's values to the callback.
        row_button['command'] = (
            lambda i=music_id, n=name, s=singer: download(i, n, s)
        )
# 没有搜索之前点击下载按钮的提示
def tishi():
    """Tell the user that a search is required before downloading."""
    messagebox.showinfo(title='提示', message='请先进行搜索')
# 点击下载按钮执行(有点多余,可以去掉直接用get_downloadUrl)
def download(id, name, singer):
    """Download one song from the platform currently selected in the GUI.

    Args:
        id: Song id passed on to ``get_downloadUrl``.
        name: Song name passed on to ``get_downloadUrl``.
        singer: Singer name passed on to ``get_downloadUrl``.
    """
    # The platform is read at click time from the radio-button variable.
    get_downloadUrl(id, name, singer, v.get())
if __name__ == '__main__':
    # Build the fixed-size 500x400 main window, centred on the screen.
    root = Tk()
    win_width = root.winfo_screenwidth()
    win_height = root.winfo_screenheight()
    root.geometry('500x400+' + str(int(win_width / 2 - 250)) + '+' + str(int(win_height / 2 - 200)))
    root.minsize(500, 400)
    root.maxsize(500, 400)
    root.title('音乐下载器-敲出一片天')

    # Search box with a hint text, plus the search button.
    query = StringVar()
    query.set('歌名+歌手更准确哦')
    entry = Entry(root, width=21, font=('隶书', 20), foreground='orange',
                  borderwidth=3, insertbackground='red', textvariable=query)
    entry.place(relx=0.05, rely=0.1)
    button = Button(root, width=8, text='搜索', font=('隶书', 18), bg='orange', fg='white', command=search_music)
    button.place(relx=0.7, rely=0.09)

    # Platform selection; the chosen value is sent as the API's "source".
    v = StringVar()
    v.set('netease')
    r1 = Radiobutton(text='网易', value='netease', font=('隶书', 18), fg='orange', variable=v)
    r2 = Radiobutton(text='qq', value='tencent', font=('隶书', 18), fg='orange', variable=v)
    r3 = Radiobutton(text='酷狗', value='kugou', font=('隶书', 18), fg='orange', variable=v)
    r4 = Radiobutton(text='百度', value='baidu', font=('隶书', 18), fg='orange', variable=v)
    r1.place(relx=0.08, rely=0.2)
    r2.place(relx=0.28, rely=0.2)
    r3.place(relx=0.48, rely=0.2)
    r4.place(relx=0.68, rely=0.2)

    # Result area: five rows, each an entry plus a download button.
    # Each entry is bound to its own StringVar; previously all five were
    # bound to ``query`` and therefore mirrored the search box.
    # Until a search has run, the buttons only show a hint (tishi).
    frame = Frame(root, height=250, width=420, bd=1, relief="groove", bg='gray')
    frame.place(relx=0.06, rely=0.3)

    value1 = StringVar()
    entry1 = Entry(frame, width=21, font=('隶书', 15), bg='gray', relief="flat",
                   borderwidth=3, textvariable=value1)
    entry1.place(relx=0.05, rely=0.04)
    button1 = Button(frame, width=8, text='下载', font=('隶书', 12), bg='gray', fg='black', command=tishi)
    button1.place(relx=0.7, rely=0.04)

    value2 = StringVar()
    entry2 = Entry(frame, width=21, font=('隶书', 15), relief="flat", bg='gray',
                   borderwidth=3, textvariable=value2)
    entry2.place(relx=0.05, rely=0.24)
    button2 = Button(frame, width=8, text='下载', font=('隶书', 12), bg='gray', fg='black', command=tishi)
    button2.place(relx=0.7, rely=0.24)

    value3 = StringVar()
    entry3 = Entry(frame, width=21, font=('隶书', 15), bg='gray', relief="flat",
                   borderwidth=3, textvariable=value3)
    entry3.place(relx=0.05, rely=0.44)
    button3 = Button(frame, width=8, text='下载', font=('隶书', 12), bg='gray', fg='black', command=tishi)
    button3.place(relx=0.7, rely=0.44)

    value4 = StringVar()
    entry4 = Entry(frame, width=21, font=('隶书', 15), bg='gray', relief="flat",
                   borderwidth=3, textvariable=value4)
    entry4.place(relx=0.05, rely=0.64)
    button4 = Button(frame, width=8, text='下载', font=('隶书', 12), bg='gray', fg='black', command=tishi)
    button4.place(relx=0.7, rely=0.64)

    value5 = StringVar()
    entry5 = Entry(frame, width=21, font=('隶书', 15), bg='gray', relief="flat",
                   borderwidth=3, textvariable=value5)
    entry5.place(relx=0.05, rely=0.84)
    button5 = Button(frame, width=8, text='下载', font=('隶书', 12), bg='gray', fg='black', command=tishi)
    button5.place(relx=0.7, rely=0.84)

    root.mainloop()
3. B站视频下载
import requests
import re
import json
from tkinter import *
from tkinter import messagebox
# 获得播放页面代码,获取我们需要的数据,转为json数据
def get_html_one(url):
    """Fetch a play page and extract its video URL, audio URL and title.

    Args:
        url: Address of the play page.

    Returns:
        ``(video_url, audio_url, title)`` on success. On any failure the
        user is notified and ``(None, None, None)`` is returned, so callers
        that unpack three values can test ``video_url`` for ``None``.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
    }
    try:
        # The request is inside the try block so a network error is reported
        # the same way as an unparsable page.
        response = requests.get(url, headers=headers)
        title = re.findall(r'<title data-vue-meta="true">(.+)_.+</title>', response.text)
        match = re.search(r'"data":.+,"session"', response.text)
        # Strip the leading '"data":' (7 chars) and the trailing
        # ',"session"' (10 chars) to leave just the JSON object.
        text = json.loads(match.group()[7:-10])
        video_url = text['dash']['video'][0]['baseUrl']
        audio_url = text['dash']['audio'][0]['baseUrl']
        return video_url, audio_url, title[0]
    except (requests.RequestException, AttributeError, ValueError,
            KeyError, IndexError, TypeError):
        print('该视频不支持下载')
        info.set('该视频不支持下载')
        messagebox.showinfo('提示', '该视频不支持下载')
        return None, None, None
# 下载合集
def get_html_more(url):
    """Read the part list of a multi-part video from its play page.

    Args:
        url: Address of the play page.

    Returns:
        ``(cids, names)``: the ``cid`` (as a string) and the ``part`` name of
        every entry in ``videoData.pages`` of the page's initial state.

    Side effect: the page title is written to the ``video_title`` variable.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
    }
    response = requests.get(url, headers=headers)
    title = re.findall(r'<title data-vue-meta="true">(.+)_.+</title>', response.text)
    video_title.set(title[0])
    # The '(' must be escaped; unescaped it opens a group that is never
    # closed and re.search raises re.error.
    match = re.search(r'window.__INITIAL_STATE__=.+;\(function', response.text)
    # Strip 'window.__INITIAL_STATE__=' (25 chars) and ';(function'
    # (10 chars) to leave just the JSON object.
    text = json.loads(match.group()[25:-10])
    cids = []
    names = []
    # Named ``page`` so the module-level ``info`` variable is not shadowed.
    for page in text['videoData']['pages']:
        cids.append(str(page['cid']))
        names.append(page['part'])
    return cids, names
# 下载视频和音频到本地
def download_one(video_url, audio_url, title):
    """Download a video stream and an audio stream to local files.

    Args:
        video_url: URL of the video stream, saved as ``<title>.mp4``.
        audio_url: URL of the audio stream, saved as ``<title>.mp3``.
        title: Base name of the two files; also used in the status messages.

    Progress and the result are printed, written to the ``info`` variable
    and shown in a dialog.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0',
        'Referer': 'https://www.bilibili.com/video/',
        'Origin': 'https://www.bilibili.com',
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, sdch, br',
        'Accept-Language': 'zh-CN,zh;q=0.8'
    }
    print(title + ' 开始下载')
    # '/' would be interpreted as a directory separator by open().
    basename = title.replace('/', '_')
    try:
        video_response = requests.get(video_url, headers=headers)
        audio_response = requests.get(audio_url, headers=headers)
        # An HTTP error body must not be saved as media and reported as a
        # successful download.
        video_response.raise_for_status()
        audio_response.raise_for_status()
        with open(basename + '.mp4', 'wb') as f:
            f.write(video_response.content)
        with open(basename + '.mp3', 'wb') as f:
            f.write(audio_response.content)
    except (requests.RequestException, OSError):
        print(title + ' 下载失败')
        info.set(title + ' 下载失败')
        messagebox.showinfo('抱歉', title + ' 下载失败')
        return
    print(title + ' 下载完成')
    info.set(title + ' 下载完成')
    messagebox.showinfo('恭喜', title + ' 下载完成')
# 下载合集
def download_more(cids, names, url):
    """Download every part of a multi-part video.

    Args:
        cids: One id per part; only determines how many parts exist.
        names: Part names, parallel to ``cids``; used as file titles.
        url: Address of the video's play page, without a part selector.
    """
    for page, (_cid, name) in enumerate(zip(cids, names), start=1):
        # Build each part's address from the unchanged base URL. Appending
        # to ``url`` itself produced '...?p1?p2?p3' after a few iterations.
        # NOTE(review): the part selector is assumed to be '?p=<n>'.
        page_url = '{}?p={}'.format(url, page)
        streams = get_html_one(page_url)
        # get_html_one() has already notified the user when it fails.
        if not streams or streams[0] is None:
            continue
        video_url, audio_url, _title = streams
        download_one(video_url, audio_url, name)
        print('=========================================')
# 点击搜索
def serach():
    """Handle a click on the search button.

    Builds the play-page URL from the id typed into ``entry``, fetches the
    page and shows its title in ``video_title``. On success the download
    button is rebound to download that URL with the mode selected at search
    time; otherwise a dialog reports an invalid id.
    """
    button1.config(state="active")
    baseurl = 'https://www.bilibili.com/video/{}'
    url = baseurl.format(entry.get())
    flag = v.get()
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
    }
    try:
        response = requests.get(url, headers=headers)
        # IndexError here means the page has no matching <title> element.
        title = re.findall(r'<title data-vue-meta="true">(.+)_.+</title>', response.text)[0]
    except (requests.RequestException, IndexError):
        messagebox.showinfo('提示', '您输入的视频id不正确')
        return
    # This title is what the site serves for a video that does not exist.
    if title == '视频去哪了呢?':
        messagebox.showinfo('提示', '您输入的视频id不正确')
        return
    video_title.set(title)
    button1['command'] = lambda: download(url, flag)
# 点击下载
def download(url, flag):
    """Handle a click on the download button.

    Args:
        url: Address of the play page found by the last search.
        flag: ``0`` downloads the single video; any other value downloads
            every part of the collection.

    The download button is disabled while this runs; it is enabled again by
    the next search.
    """
    button1.config(state="disable")
    if flag == 0:
        streams = get_html_one(url)
        # Covers both a bare None and a tuple of None values.
        if not streams or streams[0] is None:
            return
        video_url, audio_url, title = streams
        download_one(video_url, audio_url, title)
    else:
        cids, names = get_html_more(url)
        download_more(cids, names, url)
    print('下载完成,感谢您的使用')
    info.set('下载完成,感谢您的使用')
def tishi():
    """Tell the user that a search is required before downloading."""
    messagebox.showinfo(title='提示', message='请先进行搜索')
if __name__ == '__main__':
    # Build the main window, centred on the screen.
    root = Tk()
    win_width = root.winfo_screenwidth()
    win_height = root.winfo_screenheight()
    root.geometry('400x270+' + str(int(win_width / 2 - 200)) + '+' + str(int(win_height / 2 - 135)))
    root.minsize(400, 250)
    root.maxsize(400, 250)
    root.title('小破站下载器-敲出一片天')

    # Video id input with a hint text, plus the search button.
    video_id = StringVar()
    video_id.set('请输入视频ID')
    entry = Entry(root, width=19, font=('隶书', 20), foreground='orange',
                  borderwidth=3, insertbackground='red', textvariable=video_id)
    entry.place(relx=0.02, rely=0.1)
    button = Button(root, width=7, text='搜索', font=('隶书', 18), bg='orange', fg='white', command=serach)
    button.place(relx=0.72, rely=0.09)

    # Download mode: 0 = single video, 1 = whole collection.
    v = IntVar()
    v.set(0)
    r1 = Radiobutton(text='单个视频', value=0, font=('隶书', 18), fg='orange', variable=v)
    r2 = Radiobutton(text='视频合集', value=1, font=('隶书', 18), fg='orange', variable=v)
    r1.place(relx=0.05, rely=0.25)
    r2.place(relx=0.45, rely=0.25)

    # Title of the video found by the search, and the download button,
    # which only shows a hint (tishi) until a search has succeeded.
    video_title = StringVar()
    video_title.set('视频标题')
    entry1 = Entry(root, width=30, font=('隶书', 15), fg='black', bg='#F0F0F0', relief='flat',
                   borderwidth=3, insertbackground='red', textvariable=video_title)
    entry1.place(relx=0.06, rely=0.4)
    button1 = Button(root, width=8, text='开始下载', font=('隶书', 12), bg='gray', fg='black', command=tishi)
    button1.place(relx=0.7, rely=0.4)

    # Status line for the download result.
    info = StringVar()
    info.set('下载结果')
    entry_info = Entry(root, width=30, font=('隶书', 15), fg='red', bg='#F0F0F0', relief='flat',
                       borderwidth=3, textvariable=info)
    entry_info.place(relx=0.2, rely=0.6)

    # The literal was split over two source lines (a lost '\n' escape),
    # which is a syntax error; the line break is restored as an escape.
    label = Label(root, text='下载过程可能会出现无响应情况\n下载完就好了', width=30, font=('隶书', 15), fg='black', bg='#F0F0F0',
                  relief='flat',
                  borderwidth=3)
    label.place(relx=0.06, rely=0.8)

    root.mainloop()
4. Python爬虫框架Scrapy爬取B站排行榜数据并保存到MongoDB数据库
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class BiliItem(scrapy.Item):
    """Container for one scraped ranking entry."""

    # Not filled by the spider in this file; presumably reserved for the
    # document id added by MongoDB on insert (TODO confirm).
    _id = scrapy.Field()
    # Video title text.
    title = scrapy.Field()
    # Play-count text.
    play_num = scrapy.Field()
    # Uploader name text.
    up_name = scrapy.Field()
    # Ranking score text.
    score = scrapy.Field()
bili.py
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
import scrapy
from bilibili.bili.bili.items import BiliItem
class BiliRankeSpider(scrapy.Spider):
    """Spider that scrapes the entries of the ranking page in ``start_urls``."""

    name = 'bili_ranke'
    allowed_domains = ['bilibili.com']
    start_urls = ['https://www.bilibili.com/ranking/all/0/0/3']

    def parse(self, response):
        """Yield one ``BiliItem`` per ranking entry found in ``response``.

        The four columns are extracted independently and combined by
        position, so the number of items equals the shortest column.
        """
        # (item field, XPath) pairs, evaluated in this order.
        columns = (
            ('title', '//div[@class="info"]//a[@class="title"]/text()'),
            ('play_num', '//div[@class="detail"]/span[@class="data-box"][1]/text()'),
            ('up_name', '//div[@class="detail"]/a/span[@class="data-box"][1]/text()'),
            ('score', '//div[@class="pts"]/div/text()'),
        )
        extracted = [response.xpath(query).extract() for _, query in columns]
        for row in zip(*extracted):
            item = BiliItem()
            for (field, _), text in zip(columns, row):
                item[field] = text
            yield item
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo
class MoviesPipeline(object):
    """Item pipeline that stores every scraped item in MongoDB."""

    def open_spider(self, spider):
        """Create the MongoDB client (default connection settings)."""
        self.client = pymongo.MongoClient()

    def process_item(self, item, spider):
        """Insert ``item`` into ``bilibili.ranke`` and hand it on unchanged."""
        collection = self.client.bilibili.ranke
        collection.insert_one(item)
        return item

    def close_spider(self, spider):
        """Close the MongoDB client created in ``open_spider``."""
        self.client.close()
附(MongoDB数据库python基本操作)
import pymongo
# Connect to the database.
# Default connection settings:
client = pymongo.MongoClient()
# Custom address:
# client = pymongo.MongoClient('ip', port)

# Select the database.
person = client.person
# Select the collection.
student = person.student

# --- Working with the data ---
# The commented examples use the current collection API
# (count_documents / insert_one / delete_many).

# Find all documents
# result = student.find()
# for r in result:
#     print(r)
# print(result.next())

# Filter
# result = student.find({"age": 20})
# for r in result:
#     print(r)

# Sort
# result = student.find().sort("age", 1)
# result = student.find().sort("age", pymongo.ASCENDING)
# for r in result:
#     print(r)

# Paging (limit / offset)
# result = student.find().limit(3)
# for r in result:
#     print(r)
#
# result = student.find().limit(3).skip(2)
# for r in result:
#     print(r)

# Count
# result = student.count_documents({})
# print(result)

# Insert
# data = {"name": '曾强', 'age': 22}
# student.insert_one(data)
# result = student.count_documents({})
# print(result)

# Delete
# data = {"name": 'zq2', 'age': 20}
# student.delete_many(data)

# Update
data = {"name": "zq1"}
result = student.find_one(data)
print(result)
# find_one() returns None when nothing matches; only update an existing
# document instead of failing on the item assignment.
if result is not None:
    result["country"] = "中国"
    # update_one() replaces the deprecated Collection.update(); only the
    # field that actually changed is sent in $set.
    student.update_one(data, {'$set': {"country": result["country"]}})
以上项目我都在bilibili上录有视频,看不明白可以去看一下视频,我的B站名:敲出一片天_bili