• Douyu Live Real-Time Data Scraping    Tags: django, web scraping    2017-05-30 23:46


Approach

    1. Parse the URL

    (screenshot: page parsing)

    2. Use the scraping workhorses bs4 and regular expressions to pull out the information we want;
    3. Write it to the database and save a local copy

    Django admin display and local CSV (the latter looks too rough to post)

    (screenshot: partial data shown in the Django admin)

    *To save the CSV locally, just run DySpyder().summary_data180()*

    Straight to the code:

    # -*- coding: utf-8 -*-
    import os
    import re
    import django
    import urllib.request as ur
    
    class DySpyder:
    
        def __init__(self):
            pass
    
        def open_url(self, url):
            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
            req = ur.Request(url=url, headers=headers)  # on Python 2 this would be urllib2.Request
            response = ur.urlopen(req)  # on Python 2: urllib2.urlopen
            return response.read().decode('utf-8')
    
        def tv_spyder(self):
            url = "https://www.douyu.com/directory/all"
            data = self.open_url(url)
            from bs4 import BeautifulSoup
            cate = ['', 'serach_lastli', 'last', 'lastserach_lastli']  # class attribute variants of the <li> room items (note the site's own 'serach' spelling)
            soup1 = BeautifulSoup(data, 'html.parser')
            soup = soup1.find("ul", id='live-list-contentbox')
            res = []
            for c in cate:
                tmp = soup.findAll('li', c)
                res.extend(tmp)
            return res
    
        def set_data(self, x):
            import datetime
            res = {}
            # room title
            title0 = str(x.find("h3").next_element)
            spans = x.findAll(["span"])
            # basic info attached to the link: tag, streamer name, viewer count
            tag, dy_name, dy_num = tuple([s.next_element for s in spans][2:5])
            pattern = r'''.*<img data-original=(.*?) height="163" src=(.*?) width="283"/>.*'''
            # the URLs of the cover image and the preview gif
            img, gif = re.findall(pattern, repr(x))[0]
            p2 = r'''.*a class="play-list-link" (.*?)=(.*?) (.*?)=(.*?) (.*?)=(.*?) (.*?)=(.*?) (.*?)=(.*?) (.*?)=(.*?) (.*?)=(.*?) (.*?)=(.*?)>.*'''
            t1 = re.findall(p2, repr(x))[0]  # avoid shadowing the parameter x
            # copy the attribute key/value pairs from the link tag (the last pair is skipped)
            for i in range(len(t1) // 2 - 1):
                res.setdefault(t1[2 * i], t1[2 * i + 1])
            res.setdefault("dt", datetime.datetime.today())
            res.setdefault('tag', tag)
            res.setdefault('dy_name', dy_name)
            res.setdefault('dy_num', dy_num)
            res.setdefault('title0', title0)
            res.setdefault('img', img)
            res.setdefault('gif', gif)
    
            return res
    
        def summary_data180(self):
            l = [self.set_data(x) for x in self.tv_spyder()]
            import pandas as pd
            df_tmp = pd.DataFrame(l)
            df_tmp.to_csv(r"C:\Users\lenovo\Desktop\dy180.csv")  # raw string: '\U' in a plain string is a Unicode-escape error on Python 3
            return df_tmp
    
        # print(DySpyder().summary_data180())
    
        def main(self):
            os.environ.setdefault("DJANGO_SETTINGS_MODULE", "minicms.settings")
            django.setup()
    
            from tv.models import Info
            from django.utils import timezone
    
            df = self.summary_data180()
            print(df.columns)  # columns come out in alphabetical order; check before indexing by position
            import numpy as np
            array2 = np.array(df)
            for i in range(len(df)):
                Info.objects.create(data_rid=array2[i][0],
                                    data_rpos=array2[i][1],
                                    data_sid=array2[i][2],
                                    data_sub_rt=array2[i][3],
                                    data_tid=array2[i][4],
                                    dt=timezone.now(),  # overwrite the scrape timestamp
                                    dy_name=array2[i][6],
                                    dy_num=array2[i][7],
                                    gif=array2[i][8],
                                    href=array2[i][9],
                                    img=array2[i][10],
                                    tag=array2[i][11],
                                    target=array2[i][12],
                                    title0=array2[i][13]
                                    )
            print("执行完毕")
    
    
    dyspyder = DySpyder()
    #dyspyder.main()
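
    If you just want the CSV, a minimal standalone run might look like this (a sketch, assuming the regexes above still match the current page layout; summary_data180() does not touch Django, so no setup() is needed):

    if __name__ == "__main__":
        df = DySpyder().summary_data180()
        print(df.shape)   # roughly 180 rows, one per room on the directory page
        print(df.head())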
    
    

    Next steps

    • Keep updating over time: push a snapshot to the database every 10 min automatically, so patterns in tags or streamers can be mined (see the sketch after this list)
    • Add data from Huya, Zhanqi and Longzhu
    • Store the images in the database and keep tuning a custom real-time page; build normalized recommendations across multiple live platforms
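
    A minimal sketch of that 10-minute loop (a hypothetical helper of mine; a real deployment would use cron or Celery beat instead of sleep):

    import time

    def run_forever(interval_sec=600):
        # take a snapshot every 10 minutes by re-running the scrape + insert
        spider = DySpyder()
        while True:
            try:
                spider.main()  # scrape the directory page and insert rows into Info
            except Exception as e:  # the page layout changes often; keep looping
                print("scrape failed:", e)
            time.sleep(interval_sec)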

    Django model template

    from django.db import models
    
    # Create your models here.
    
    class Info(models.Model):
        data_rid = models.CharField("Room ID", max_length=20)
        data_rpos = models.CharField("", max_length=20)
        data_sid = models.CharField("", max_length=20)
        data_sub_rt = models.CharField("", max_length=20)
        data_tid = models.CharField("", max_length=20)
        dt = models.DateTimeField("Timestamp")
        dy_name = models.CharField("Account name", max_length=50)
        dy_num = models.CharField("Viewer count", max_length=20)
        gif = models.CharField("GIF", max_length=120)
        href = models.CharField("Room URL", max_length=20)
        img = models.CharField("IMG URL", max_length=120)
        tag = models.CharField("Tag", max_length=120)
        target = models.CharField("Target", max_length=20)
        title0 = models.CharField("Title", max_length=120)
    
        def __str__(self):
            return self.dy_name + "_" + self.title0
    
        class Meta:
            verbose_name = 'Douyu snapshot info'
            verbose_name_plural = 'Douyu snapshot info (180 rows)'
    
    class ImgTools(models.Model):
        img_url = models.URLField(verbose_name="Remote URL")
        dt = models.DateTimeField("Timestamp")
        data_rid = models.CharField("Room ID", max_length=20)
        upload_to = models.URLField(verbose_name="Local path")
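
    The admin display mentioned above comes from registering the model; a minimal admin.py for the tv app might look like this (the chosen columns are my assumption, not from the original post):

    from django.contrib import admin
    from tv.models import Info

    @admin.register(Info)
    class InfoAdmin(admin.ModelAdmin):
        # columns shown in the change list; adjust to taste
        list_display = ("dy_name", "title0", "tag", "dy_num", "dt")
        search_fields = ("dy_name", "title0")
        list_filter = ("tag",)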

    TXT scraping update

    import re

    def find_min(nums):
        # first index at which the chapter numbers start ascending
        for i in range(len(nums) - 1):  # range(len(nums)) would overrun on nums[i+1]
            if nums[i + 1] > nums[i]:
                return i, nums[i]
    
    def set_urls(book_id):
        # assumes an open_url(url) helper like DySpyder.open_url above is in scope
        url = "http://www.biqudu.com/" + book_id + "/"
        pattern = r".*<dd> <a href=(.*?)>(.*?)</a></dd>.*"
        import pandas as pd
        import numpy as np
        # this does not handle multi-volume books yet; to be improved later
        df1 = pd.DataFrame(np.array(re.findall(pattern, open_url(url))), columns=["url", "title"])
        df1["num"] = [int(re.findall(r".*/(.*?)\.html", x)[0]) for x in df1["url"]]
        # drop the leading non-ascending entries (the "latest chapters" block at the top of the page)
        start_index = find_min(df1["num"])[0]
        return df1[start_index:]
    
    
    # scrape a single novel chapter page and return its text content div
    def detail():
        url = "http://www.biqudu.com/21_21470/1394112.html"
        data = open_url(url)
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(data, 'html.parser')
        content = soup.findAll('div', id="content")[0]
        return content
    
    # print(detail())
    
    
    # Django view: render the scraped chapter into a template
    # (needs `from django.shortcuts import render` at the top of views.py)
    def test(request):
        content = detail()
        return render(request, "base_test.html", {"content": content})
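
    Gluing the two together, a hypothetical dump_book helper could walk the chapter list and write one .txt file (a sketch, assuming the hrefs captured by set_urls are quoted site-relative paths and open_url is in scope):

    from bs4 import BeautifulSoup

    def dump_book(book_id, out_path):
        df = set_urls(book_id)
        with open(out_path, "w", encoding="utf-8") as f:
            for url, title in zip(df["url"], df["title"]):
                html = open_url("http://www.biqudu.com" + url.strip('"'))
                soup = BeautifulSoup(html, "html.parser")
                content = soup.find("div", id="content")
                f.write(title + "\n\n" + content.get_text() + "\n\n")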
    

    *I'll be putting my energy into WeChat mini programs for a while, so the scraper is on hold. ==== END ====*
