• python爬虫实战项目


    python爬虫实战项目

    1. LOL所有英雄皮肤下载

    from fake_useragent import UserAgent
    import requests, json, os
    
    # 爬取网页所有英雄的皮肤图片
    # https://lol.qq.com/data/info-heros.shtml
    
    
    # 获取英雄id
    def get_heroList():
        """Return the ids of every hero in the downloaded hero list.

        Returns:
            A list with the ``heroId`` value of each entry under the ``hero``
            key of the JSON document, or ``None`` when the request or the
            parsing fails (a message is printed in that case).
        """
        url = 'https://game.gtimg.cn/images/lol/act/img/js/heroList/hero_list.js'
        headers = {
            'User-Agent': UserAgent().chrome
        }
        try:
            # A timeout keeps the script from hanging forever on a dead connection.
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            hero_list = json.loads(response.text)
            return [hero['heroId'] for hero in hero_list['hero']]
        except (requests.RequestException, ValueError, KeyError, TypeError):
            # Only network, JSON and schema errors are expected here;
            # KeyboardInterrupt and SystemExit are no longer swallowed.
            print('获取英雄id失败')
            return None
    
    
    # 根据英雄id获取英雄皮肤名称和图片下载地址
    def get_skinNames(id):
        """Return the skin names and image URLs of one hero.

        Args:
            id: Hero id, inserted into the per-hero JSON URL.

        Returns:
            A ``(skinnames, skin_urls)`` tuple of two parallel lists, or
            ``None`` when the request or the parsing fails (a message is
            printed in that case).
        """
        url = 'https://game.gtimg.cn/images/lol/act/img/js/hero/{}.js'.format(id)
        headers = {
            'User-Agent': UserAgent().chrome
        }
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            hero = json.loads(response.text)
            skinnames = []
            skin_urls = []
            # The last entry of 'skins' is skipped and entries without a main
            # image are ignored, exactly as before.
            for skin in hero['skins'][:-1]:
                if skin['mainImg'] != '':
                    skinnames.append(skin['name'])
                    skin_urls.append(skin['mainImg'])
            return skinnames, skin_urls
        except (requests.RequestException, ValueError, KeyError, TypeError):
            # Only network, JSON and schema errors are expected here.
            print('获取英雄皮肤名称失败')
            return None
    
    
    # 根据名称,下载图片保存文件夹
    def downloadImg(skinnames, skin_urls):
        """Download every skin image into a folder named after the first skin.

        Args:
            skinnames: Skin names; the first one is used as the folder name.
            skin_urls: Image URLs, parallel to ``skinnames``.

        Returns:
            None. Does nothing when ``skinnames`` is empty. Stops at the first
            image that cannot be downloaded, after printing a message.
        """
        if not skinnames:
            # Nothing to download and no name to build the folder from.
            return
        headers = {
            'User-Agent': UserAgent().chrome
        }
        # '/' would be read as a path separator, so it is replaced in the
        # folder name the same way it is replaced in the file names.
        folder = skinnames[0].replace('/', '_')
        os.makedirs(folder, exist_ok=True)
        for skinname, skin_url in zip(skinnames, skin_urls):
            try:
                response = requests.get(skin_url, headers=headers, timeout=10)
                # Avoid saving an HTTP error page as a .jpg file.
                response.raise_for_status()
            except requests.RequestException:
                print(skinname + ' 下载失败')
                return
            path = os.path.join(folder, skinname.replace('/', '_') + '.jpg')
            with open(path, 'wb') as f:
                f.write(response.content)
    
    
    if __name__ == '__main__':
        # Download the skins of every hero and show a single-line progress counter.
        hero_ids = get_heroList()
        # get_heroList() returns None on failure; treat that as an empty list.
        total = len(hero_ids) if hero_ids else 0
        for i, hero_id in enumerate(hero_ids or [], start=1):
            result = get_skinNames(hero_id)
            # get_skinNames() returns None on failure; skip that hero instead
            # of crashing while unpacking.
            if result:
                skinnames, skin_urls = result
                if skinnames:
                    downloadImg(skinnames, skin_urls)
            # '\r' moves back to the start of the line so the counter is
            # overwritten in place; flush because no newline is printed.
            print('\r下载进度:' + str(i) + '/' + str(total), end='', flush=True)
    
    

    2. 音乐下载软件

    import requests, json, re
    from tkinter import Tk, Button, Entry, StringVar, Radiobutton, Frame
    from tkinter import messagebox
    
    
    # 说明:
    # 爬取网站:https://music.zhuolin.wang/
    # ajax异步请求
    # 下载的歌曲在软件所在目录下
    
    
    # 根据输入找到歌曲信息
    def get_musicInfo(query, sourse):
        """Search the music API and return the matching songs.

        Args:
            query: Search text typed by the user.
            sourse: Value sent as the ``source`` form field (the selected
                music platform).

        Returns:
            A ``(music_ids, music_names, music_singers)`` tuple of three
            parallel lists built from the ``id``, ``name`` and ``artist``
            keys of each entry in the JSON response.
        """
        music_ids = []
        music_names = []
        music_singers = []
        url = 'https://music.zhuolin.wang/api.php?'
        data = {
            'types': 'search',
            'count': '5',
            'source': sourse,
            'pages': '1',
            'name': query
        }
        headers = {
            'Accept': 'text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Content-Length': '37',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Cookie': '',
            'Host': 'music.zhuolin.wang',
            'Origin': 'https://music.zhuolin.wang',
            'Referer': 'https://music.zhuolin.wang/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36',
            'X-Requested-With': 'XMLHttpRequest'
        }
        response = requests.post(url, headers=headers, data=data, timeout=10)
        songs = json.loads(response.text)
        for song in songs:
            music_ids.append(song['id'])
            music_names.append(song['name'])
            music_singers.append(song['artist'])

        # The caller unpacks three lists, so the result must be returned
        # (it used to be printed only, leaving the caller with None).
        return music_ids, music_names, music_singers
    
    
    # 根据id获取歌曲下载链接
    def get_downloadUrl(music_id, name, singer, sourse):
        """Look up the download link of a song and start the download.

        Args:
            music_id: Song id sent as the ``id`` form field.
            name: Song name, passed on to ``downloadMusic``.
            singer: Singer name, passed on to ``downloadMusic``.
            sourse: Value sent as the ``source`` form field.

        Returns:
            None. Shows a message box when the response holds no ``http:`` link.
        """
        url = 'https://music.zhuolin.wang/api.php?'
        data = {
            'types': 'url',
            'id': music_id,
            'source': sourse
        }
        headers = {
            'Accept': 'text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Content-Length': '37',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Cookie': '',
            'Host': 'music.zhuolin.wang',
            'Origin': 'https://music.zhuolin.wang',
            'Referer': 'https://music.zhuolin.wang/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36',
            'X-Requested-With': 'XMLHttpRequest'
        }
        response = requests.post(url, data=data, headers=headers)
        print(response.text)
        downloadurl = re.search(r'http:(.+)",', response.text)
        if downloadurl is not None:
            # Remove the backslashes (presumably JSON-escaped "\/" slashes;
            # confirm against a live response), then the '",' delimiter that
            # the pattern always matches at the end of the link.
            downloadurl = downloadurl.group().replace('\\', '').rstrip('",')
            downloadMusic(downloadurl, name, singer)
        else:
            messagebox.showinfo('抱歉', '该歌曲暂不提供下载,请您更换其他平台下载')
    
    
    # 下载歌曲到本地
    def downloadMusic(url, name, singer):
        """Download a song and save it as ``<name>-<singer>.mp3``.

        The file is written with a relative path, i.e. into the current
        working directory. A message box reports success or failure.

        Args:
            url: Direct link to the audio file.
            name: Song name, used in the file name and in the messages.
            singer: Singer name, used in the file name and in the messages.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36'
        }
        try:
            response = requests.get(url, headers=headers, timeout=30)
            # Avoid saving an HTTP error page as an .mp3 file.
            response.raise_for_status()
            with open(name + '-' + singer + '.mp3', 'wb') as f:
                f.write(response.content)
        except (requests.RequestException, OSError):
            # Network failures and file-system errors (e.g. characters that
            # are invalid in a file name) are the expected failure modes.
            messagebox.showinfo('抱歉', name + ' 下载失败')
            return
        messagebox.showinfo('恭喜', name + '-' + singer + ' 下载完成')
    
    
    # 点击搜索执行
    def search_music():
        """Run a search and bind the results to the five result rows.

        Reads the query from ``entry`` and the platform from ``v``. Each
        result row shows ``<name>   <first singer>`` and its button downloads
        that song. Rows without a result are cleared and their button falls
        back to the "search first" hint.

        Returns:
            ``False`` when the query is empty or nothing was found,
            otherwise ``None``.
        """
        query = entry.get()
        sourse = v.get()
        if query == '':
            messagebox.showinfo('提示', '请输入内容!')
            return False
        result = get_musicInfo(query, sourse)
        if not result or not result[0]:
            # No usable result (None or empty lists): nothing to bind.
            messagebox.showinfo('提示', '没有找到相关歌曲')
            return False
        music_ids, music_names, music_singers = result
        songs = list(zip(music_ids, music_names, music_singers))

        # Update the content and the button action of every result row.
        rows = (
            (value1, entry1, button1),
            (value2, entry2, button2),
            (value3, entry3, button3),
            (value4, entry4, button4),
            (value5, entry5, button5),
        )
        for index, (value, row_entry, row_button) in enumerate(rows):
            row_entry['textvariable'] = value
            if index >= len(songs):
                # Fewer results than rows: do not leave a stale song behind.
                value.set('')
                row_button['command'] = tishi
                continue
            music_id, name, singers = songs[index]
            music_id = str(music_id)
            name = str(name)
            singer = str(singers[0]) if singers else ''
            value.set(name + '   ' + singer)
            # Default arguments bind the current song to this button.
            row_button['command'] = (
                lambda i=music_id, n=name, s=singer: download(i, n, s)
            )
    
    
    # 没有搜索之前点击下载按钮的提示
    def tishi():
        """Tell the user that a search has to be run before downloading."""
        messagebox.showinfo(title='提示', message='请先进行搜索')
    
    
    # 点击下载按钮执行(有点多余,可以去掉直接用get_downloadUrl)
    def download(id, name, singer):
        """Download the given song from the platform currently selected in ``v``."""
        get_downloadUrl(id, name, singer, v.get())
    
    
    if __name__ == '__main__':
        # Build the downloader window: a search box, four platform radio
        # buttons and a frame with five result rows (entry + download button).
        # get_musicInfo('嘲笑声','tencent')
        # get_downloadUrl('0030tRLQ1e4mCn','嘲笑声','Big Daddy','tencent')

        root = Tk()
        win_width = root.winfo_screenwidth()
        win_height = root.winfo_screenheight()
        # 500x400 window, centred on the screen.
        root.geometry('500x400+' + str(int(win_width / 2 - 250)) + '+' + str(int(win_height / 2 - 200)))
        # Equal minimum and maximum size: the window cannot be resized.
        root.minsize(500, 400)
        root.maxsize(500, 400)
        root.title('音乐下载器-敲出一片天')
        # get_downloadUrl('64561','单车(Live)','陈奕迅')

        # Text of the search box, pre-filled with a hint.
        query = StringVar()
        query.set('歌名+歌手更准确哦')

        # Entry options reference: https://www.cnblogs.com/monsteryang/p/6575877.html

        entry = Entry(root, width=21, font=('隶书', 20), foreground='orange',
                      borderwidth=3, insertbackground='red', textvariable=query)
        entry.place(relx=0.05, rely=0.1)

        button = Button(root, width=8, text='搜索', font=('隶书', 18), bg='orange', fg='white', command=search_music)
        button.place(relx=0.7, rely=0.09)

        # Selected platform; the value is sent to the API as 'source'.
        v = StringVar()
        v.set('netease')
        r1 = Radiobutton(text='网易', value='netease', font=('隶书', 18), fg='orange', variable=v)
        r2 = Radiobutton(text='qq', value='tencent', font=('隶书', 18), fg='orange', variable=v)
        r3 = Radiobutton(text='酷狗', value='kugou', font=('隶书', 18), fg='orange', variable=v)
        r4 = Radiobutton(text='百度', value='baidu', font=('隶书', 18), fg='orange', variable=v)
        r1.place(relx=0.08, rely=0.2)
        r2.place(relx=0.28, rely=0.2)
        r3.place(relx=0.48, rely=0.2)
        r4.place(relx=0.68, rely=0.2)

        # Container of the five result rows.
        frame = Frame(root, height=250, width=420, bd=1, relief="groove", bg='gray')
        frame.place(relx=0.06, rely=0.3)

        # Result rows 1-5. search_music() rebinds each entry to its own
        # valueN variable and each button to a download action; until then
        # the buttons only show the "search first" hint.
        # NOTE(review): the entries start bound to `query`, so they mirror the
        # search box until the first search - confirm this is intended.
        value1 = StringVar()
        entry1 = Entry(frame, width=21, font=('隶书', 15), bg='gray', relief="flat",
                       borderwidth=3, textvariable=query)
        entry1.place(relx=0.05, rely=0.04)
        button1 = Button(frame, width=8, text='下载', font=('隶书', 12), bg='gray', fg='black', command=tishi)
        button1.place(relx=0.7, rely=0.04)

        value2 = StringVar()
        entry2 = Entry(frame, width=21, font=('隶书', 15), relief="flat", bg='gray',
                       borderwidth=3, textvariable=query)
        entry2.place(relx=0.05, rely=0.24)
        button2 = Button(frame, width=8, text='下载', font=('隶书', 12), bg='gray', fg='black', command=tishi)
        button2.place(relx=0.7, rely=0.24)

        value3 = StringVar()
        entry3 = Entry(frame, width=21, font=('隶书', 15), bg='gray', relief="flat",
                       borderwidth=3, textvariable=query)
        entry3.place(relx=0.05, rely=0.44)
        button3 = Button(frame, width=8, text='下载', font=('隶书', 12), bg='gray', fg='black', command=tishi)
        button3.place(relx=0.7, rely=0.44)

        value4 = StringVar()
        entry4 = Entry(frame, width=21, font=('隶书', 15), bg='gray', relief="flat",
                       borderwidth=3, textvariable=query)
        entry4.place(relx=0.05, rely=0.64)
        button4 = Button(frame, width=8, text='下载', font=('隶书', 12), bg='gray', fg='black', command=tishi)
        button4.place(relx=0.7, rely=0.64)

        value5 = StringVar()
        entry5 = Entry(frame, width=21, font=('隶书', 15), bg='gray', relief="flat",
                       borderwidth=3, textvariable=query)
        entry5.place(relx=0.05, rely=0.84)
        button5 = Button(frame, width=8, text='下载', font=('隶书', 12), bg='gray', fg='black', command=tishi)
        button5.place(relx=0.7, rely=0.84)

        root.mainloop()
    
    

    3. b站视频下载

    import requests
    import re
    import json
    from tkinter import *
    from tkinter import messagebox
    
    
    # 获得播放页面代码,获取我们需要的数据,转为json数据
    def get_html_one(url):
        """Extract the video URL, audio URL and title from a play page.

        The page source is searched for the ``"data":...,"session"`` span,
        which is parsed as JSON to read the first ``dash`` video and audio
        ``baseUrl`` values.

        Args:
            url: Address of the play page.

        Returns:
            A ``(video_url, audio_url, title)`` tuple. When the page cannot be
            parsed the user is notified and ``(None, None, None)`` is returned,
            so callers that unpack three values keep working.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
        }
        response = requests.get(url, headers=headers)
        try:
            title = re.findall(r'<title data-vue-meta="true">(.+)_.+</title>', response.text)
            match = re.search(r'"data":.+,"session"', response.text)
            text = match.group()
            # Drop the leading '"data":' (7 chars) and the trailing
            # ',"session"' (10 chars) to keep only the JSON value.
            text = json.loads(text[7:-10])
            video_url = text['dash']['video'][0]['baseUrl']
            audio_url = text['dash']['audio'][0]['baseUrl']
            return video_url, audio_url, title[0]
        except (AttributeError, ValueError, KeyError, IndexError, TypeError):
            # No regex match, invalid JSON or an unexpected schema.
            print('该视频不支持下载')
            info.set('该视频不支持下载')
            messagebox.showinfo('提示', '该视频不支持下载')
            return None, None, None
    
    
    # 下载合集
    def get_html_more(url):
        """Return the cid and part name of every page of a video collection.

        Also sets ``video_title`` to the title found in the page.

        Args:
            url: Address of the play page.

        Returns:
            A ``(cids, names)`` tuple of two parallel lists taken from
            ``videoData.pages`` of the page's ``window.__INITIAL_STATE__`` JSON.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
        }
        response = requests.get(url, headers=headers)
        title = re.findall(r'<title data-vue-meta="true">(.+)_.+</title>', response.text)
        video_title.set(title[0])
        # The '(' must be escaped: unescaped it opens a group that is never
        # closed and re.search() raises re.error on every call.
        match = re.search(r'window.__INITIAL_STATE__=.+;\(function', response.text)
        # Drop 'window.__INITIAL_STATE__=' (25 chars) and ';(function' (10 chars).
        text = json.loads(match.group()[25:-10])
        cids = []
        names = []
        # `page` instead of `info`, which is the name of a module-level variable.
        for page in text['videoData']['pages']:
            cids.append(str(page['cid']))
            names.append(page['part'])
        return cids, names
    
    
    # 下载视频和音频到本地
    def download_one(video_url, audio_url, title):
        """Download a video stream and an audio stream to local files.

        The streams are saved as ``<title>.mp4`` and ``<title>.mp3`` using
        relative paths. Progress and the result are printed, written to
        ``info`` and shown in a message box.

        Args:
            video_url: Address of the video stream.
            audio_url: Address of the audio stream.
            title: Base name of the two output files.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0',
            'Referer': 'https://www.bilibili.com/video/',
            'Origin': 'https://www.bilibili.com',
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate, sdch, br',
            'Accept-Language': 'zh-CN,zh;q=0.8'
        }
        print(title + ' 开始下载')

        try:
            video_response = requests.get(video_url, headers=headers)
            audio_response = requests.get(audio_url, headers=headers)
            # Avoid saving an HTTP error body as a media file.
            video_response.raise_for_status()
            audio_response.raise_for_status()
            with open(title + '.mp4', 'wb') as f:
                f.write(video_response.content)
            with open(title + '.mp3', 'wb') as f:
                f.write(audio_response.content)
        except (requests.RequestException, OSError):
            # Network failures (including an invalid or missing URL) and
            # file-system errors are the expected failure modes.
            print(title + ' 下载失败')
            info.set(title + ' 下载失败')
            messagebox.showinfo('抱歉', title + ' 下载失败')
            return
        print(title + ' 下载完成')
        info.set(title + ' 下载完成')
        messagebox.showinfo('恭喜', title + ' 下载完成')
    
    
    # 下载合集
    def download_more(cids, names, url):
        """Download every page of a video collection.

        Args:
            cids: One id per page; only the number of entries is used.
            names: Part names, parallel to ``cids``; used as file names.
            url: Address of the collection's play page, without a page query.
        """
        for page, (_cid, name) in enumerate(zip(cids, names), start=1):
            # Build each page address from the base URL. Appending to `url`
            # itself produced '...?p1?p2?p3' after a few iterations, and the
            # page number belongs in a 'p=<n>' query parameter.
            page_url = '{}?p={}'.format(url, page)
            result = get_html_one(page_url)
            # get_html_one() reports a failure with None values; skip that
            # page instead of crashing or downloading from a missing URL.
            if result and result[0] is not None:
                video_url, audio_url, _title = result
                download_one(video_url, audio_url, name)
            print('=========================================')
    
    
    # 点击搜索
    def serach():
        """Look up the video id typed by the user and arm the download button.

        Reads the id from ``entry`` and the single/collection choice from
        ``v``. On success the page title is shown in ``video_title`` and
        ``button1`` is bound to ``download`` for that page; otherwise a
        message box reports that the id is wrong.
        """
        button1.config(state="active")
        baseurl = 'https://www.bilibili.com/video/{}'
        video_id = entry.get()
        url = baseurl.format(video_id)
        flag = v.get()
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
        }
        try:
            response = requests.get(url, headers=headers)
        except requests.RequestException:
            # Only the request itself is guarded; programming errors in the
            # rest of the callback are no longer hidden behind this message.
            messagebox.showinfo('提示', '您输入的视频id不正确')
            return
        title = re.findall(r'<title data-vue-meta="true">(.+)_.+</title>', response.text)
        # No title at all, or the placeholder title of a missing video.
        if not title or title[0] == '视频去哪了呢?':
            messagebox.showinfo('提示', '您输入的视频id不正确')
            return
        video_title.set(title[0])
        button1['command'] = lambda: download(url, flag)
    
    
    # 点击下载
    def download(url, flag):
        """Download a single video or a whole collection.

        Args:
            url: Address of the play page.
            flag: ``0`` for a single video; any other value downloads every
                page of the collection.
        """
        button1.config(state="disable")
        if flag == 0:
            result = get_html_one(url)
            # A failed lookup is reported either as None or as a tuple of
            # None values; check before unpacking so neither form crashes.
            if not result or result[0] is None:
                return
            video_url, audio_url, title = result
            download_one(video_url, audio_url, title)
        else:
            cids, names = get_html_more(url)
            download_more(cids, names, url)

        print('下载完成,感谢您的使用')
        info.set('下载完成,感谢您的使用')
    
    
    def tishi():
        """Tell the user that a search has to be run before downloading."""
        messagebox.showinfo(title='提示', message='请先进行搜索')
    
    
    if __name__ == '__main__':
        # Build the downloader window: id entry + search button, single/collection
        # radio buttons, title row with the download button, result row and a hint.
        root = Tk()
        win_width = root.winfo_screenwidth()
        win_height = root.winfo_screenheight()
        # Centre the window on the screen.
        root.geometry('400x270+' + str(int(win_width / 2 - 200)) + '+' + str(int(win_height / 2 - 135)))
        # Equal minimum and maximum size: the window cannot be resized.
        # NOTE(review): the geometry asks for a height of 270 while the limits
        # are 250 - confirm which one is intended.
        root.minsize(400, 250)
        root.maxsize(400, 250)
        root.title('小破站下载器-敲出一片天')

        # Text of the id entry, pre-filled with a hint.
        video_id = StringVar()
        video_id.set('请输入视频ID')

        entry = Entry(root, width=19, font=('隶书', 20), foreground='orange',
                      borderwidth=3, insertbackground='red', textvariable=video_id)
        entry.place(relx=0.02, rely=0.1)

        button = Button(root, width=7, text='搜索', font=('隶书', 18), bg='orange', fg='white', command=serach)
        button.place(relx=0.72, rely=0.09)

        # 0 = single video, 1 = video collection; read by serach().
        v = IntVar()
        v.set(0)
        r1 = Radiobutton(text='单个视频', value=0, font=('隶书', 18), fg='orange', variable=v)
        r2 = Radiobutton(text='视频合集', value=1, font=('隶书', 18), fg='orange', variable=v)
        r1.place(relx=0.05, rely=0.25)
        r2.place(relx=0.45, rely=0.25)

        # Title of the video found by the last search.
        video_title = StringVar()
        video_title.set('视频标题')
        entry1 = Entry(root, width=30, font=('隶书', 15), fg='black', bg='#F0F0F0', relief='flat',
                       borderwidth=3, insertbackground='red', textvariable=video_title)
        entry1.place(relx=0.06, rely=0.4)

        # Shows the "search first" hint until serach() binds a real download.
        button1 = Button(root, width=8, text='开始下载', font=('隶书', 12), bg='gray', fg='black', command=tishi)
        button1.place(relx=0.7, rely=0.4)

        # Status line written by the download functions.
        info = StringVar()
        info.set('下载结果')
        entry_info = Entry(root, width=30, font=('隶书', 15), fg='red', bg='#F0F0F0', relief='flat',
                           borderwidth=3, textvariable=info)
        entry_info.place(relx=0.2, rely=0.6)

        # Two-line hint; the line break has to be written as '\n' because a
        # quoted string cannot span two source lines.
        label = Label(root, text='下载过程可能会出现无响应情况\n下载完就好了', width=30, font=('隶书', 15), fg='black', bg='#F0F0F0',
                      relief='flat',
                      borderwidth=3)
        label.place(relx=0.06, rely=0.8)

        root.mainloop()
    
    

    4. Python爬虫框架Scrapy爬取B站排行榜数据并保存到MongoDB数据库

    items.py

    # -*- coding: utf-8 -*-
    
    # Define here the models for your scraped items
    #
    # See documentation in:
    # https://docs.scrapy.org/en/latest/topics/items.html
    
    import scrapy
    
    
    class BiliItem(scrapy.Item):
        """Scrapy item describing one entry of the scraped ranking page."""
        # Declared so the item accepts an `_id` key; presumably filled in when
        # the item is stored in MongoDB - confirm against the pipeline.
        _id = scrapy.Field()
        # Text of the entry's title link.
        title = scrapy.Field()
        # Text of the first data box of the entry (named "play_num" by the spider).
        play_num = scrapy.Field()
        # Text of the data box inside the uploader link.
        up_name = scrapy.Field()
        # Text of the entry's points box.
        score = scrapy.Field()
    
    
    

    bili.py

    # -*- coding: utf-8 -*-
    # -*- coding: utf-8 -*-
    import scrapy
    from bilibili.bili.bili.items import BiliItem
    
    
    class BiliRankeSpider(scrapy.Spider):
        """Spider that turns the rows of the ranking page into ``BiliItem`` objects."""

        name = 'bili_ranke'
        allowed_domains = ['bilibili.com']
        start_urls = ['https://www.bilibili.com/ranking/all/0/0/3']

        def parse(self, response):
            """Yield one ``BiliItem`` per title/play count/uploader/score group."""

            def texts(xpath):
                # All text nodes selected by the expression, as a list of strings.
                return response.xpath(xpath).extract()

            # The four columns are extracted separately and walked in lockstep;
            # zip() stops at the shortest column.
            rows = zip(
                texts('//div[@class="info"]//a[@class="title"]/text()'),
                texts('//div[@class="detail"]/span[@class="data-box"][1]/text()'),
                texts('//div[@class="detail"]/a/span[@class="data-box"][1]/text()'),
                texts('//div[@class="pts"]/div/text()'),
            )
            for title, play_num, up_name, score in rows:
                yield BiliItem(
                    title=title,
                    play_num=play_num,
                    up_name=up_name,
                    score=score,
                )
    
    
    

    pipelines.py

    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    import pymongo
    
    class MoviesPipeline(object):
        """Item pipeline that stores every item in the ``bilibili.ranke`` collection."""

        def open_spider(self, spider):
            """Create the MongoDB client (default connection settings)."""
            self.client = pymongo.MongoClient()

        def process_item(self, item, spider):
            """Insert the item into the collection and pass it on unchanged."""
            collection = self.client['bilibili']['ranke']
            collection.insert_one(item)
            return item

        def close_spider(self, spider):
            """Close the MongoDB client."""
            self.client.close()
    

    附(MongoDB数据库python基本操作)

    import pymongo
    
    # Connect to the database server
    # Default connection settings
    client = pymongo.MongoClient()
    # Custom host and port
    # client = pymongo.MongoClient('ip',port)

    # Select the database
    person = client['person']
    # Select the collection (table)
    student = person['student']
    
    #操作数据
    # 查找所有信息
    # result = student.find()
    # for r in result:
    #     print(r)
    
    # print(result.next())
    
    # 筛选
    # result = student.find({"age":20})
    # for r in result:
    #     print(r)
    
    # 排序
    # result = student.find().sort("age",1)
    # result = student.find().sort("age",pymongo.ASCENDING)
    # for r in result:
    #     print(r)
    
    # 分页(偏移)
    # result = student.find().limit(3)
    # for r in result:
    #     print(r)
    #
    #
    # result = student.find().limit(3).skip(2)
    # for r in result:
    #     print(r)
    
    # 统计
    # result = student.find().count()
    # print(result)
    
    # 增加数据
    # data = {"name":'曾强','age':22}
    # student.insert(data)
    # result = student.count()
    # print(result)
    
    # 删除数据
    # data = {"name":'zq2','age':20}
    # student.remove(data)
    
    # Update: add a "country" field to the first document matching `data`
    data = {"name":"zq1"}
    result = student.find_one(data)
    print(result)
    # find_one() returns None when nothing matches, so only update on a hit.
    if result is not None:
        result["country"] = "中国"
        # Collection.update() was removed in PyMongo 4; update_one() changes
        # the first matching document. Only the new field is sent instead of
        # the whole document (which also contained the immutable _id).
        student.update_one(data, {'$set': {"country": result["country"]}})
    

    以上项目我都在bilibili上录有视频,看不明白可以去看一下视频,我的B站名:敲出一片天_bili

    版权声明:本文为博主原创文章,转载请附上博文链接!
  • 相关阅读:
    plsql记住登录密码
    java之通过反射,来获得某对象的所有方法(类方法提取器)
    java之RTTI和反射的理解
    Thinking in java之正则表达式小例子
    java正则表达式之java小爬虫
    【ACM】Binary String Matching
    PHP var_export
    PHP FPM
    【ACM】阶乘之和
    【ACM】最少乘法次数
  • 原文地址:https://www.cnblogs.com/zq98/p/15028039.html
Copyright © 2020-2023  润新知