• python_爬虫_微信公众号抓取


    目前卡在视频部分,公众号的视频来源是腾讯视频,播放和下载地址都是加密的,目前vid及vkey均已经获取,但使用爬虫得到的结果永远是403、405,尚未解决。

    考虑方法:

       selenium当页面加载后查看广告用时,等广告加载时间过去后再点击视频,再由网页中获取加载的视频地址进行下载,明天测试看有无效果

    import requests,pymysql
    import json,jsonpath,random,re,time,datetime,os,imghdr
    from lxml import etree
    from selenium import webdriver
    from urllib import request
    import ssl
    ssl._create_default_https_context = ssl._create_unverified_context
    
    '''
    Warning: refreshing too often, or paging through the article list too
    frequently in a short window, gets the account temporarily banned.
    '''
    # --------------------
    user_info = {'username':'####@163.com','password':'####'}  # mp.weixin.qq.com login credentials (redacted)
    base_url = 'https://mp.weixin.qq.com/'
    base_headers = {
        # Desktop Chrome UA so the platform serves the normal web pages
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    query_list = [ # official accounts to crawl: fakeid identifies the account, nickname is only for log output
        {'fakeid':'MzIzOTQ0MTUwMA==','nickname':'Sir电影'},
        {'fakeid':'MzIxODc5MzM4NQ==','nickname':'鱼Sir电影'},
    ]
    table = 'p_weixin' # MySQL table written by Mydb()
    key = 'title,author,js_name,publish_time,images,vedios' # column list for Mydb()'s INSERT ("vedios" spelling matches the schema)
    
    # --------------------
    
    def get_cookie(): # 登陆并获取cookie值
        driver = webdriver.Chrome(executable_path=r'/Applications/Google Chrome.app/chromedriver')
        driver.get(base_url)
        time.sleep(2) # 页面缓冲
    
        driver.find_element_by_name('account').clear()
        driver.find_element_by_name('account').send_keys(user_info['username'])
        driver.find_element_by_name('password').clear()
        driver.find_element_by_name('password').send_keys(user_info['password'])
    
        driver.find_element_by_class_name('icon_checkbox').click()
        driver.find_element_by_class_name('btn_login').click()
        time.sleep(25) # 等待手机扫码
    
        c_total = driver.get_cookies()
        cookies = {} # cookies存储
        for i in c_total:
            cookies[i['name']] = i['value']
        return cookies
    
        
    def get_info(): # 获取公众号名称、总页数、token、fakeid等信息,
        cookies = get_cookie()
    
        res_token = requests.get(base_url,cookies=cookies,headers=base_headers)
        token = re.compile(r'token=(d+)').findall(str(res_token.url))
    
        for query in query_list: # 从列表里控制要爬取多少个公众号
            url = 'https://mp.weixin.qq.com/cgi-bin/appmsg' # 公众号里面的电影
    
            fakeid = query['fakeid']
            appmsg = {
                'token': token,
                'lang': 'zh_CN',
                'f': 'json',
                'ajax': '1',
                'random': random.random(),
                'action': 'list_ex',
                'begin': '0',
                'count': '5',
                'fakeid': fakeid,
                'type': '9',
            }
            res_cnt = requests.get(url,params=appmsg,cookies=cookies) # 看总共多少页 一页16篇,begin 0~4为一页
            res_cnt = json.loads(res_cnt.text)
    
            for cnt in range(0,res_cnt['app_msg_cnt'],5): # 循环所有分页
                appmsg['begin'] = cnt # 当前的分页值
                response = requests.get(url,params=appmsg,cookies=cookies)
                
                data_list = json.loads(response.text)
                for data in data_list['app_msg_list']: # 对当前页里的信息进行提取
                    yield [data['title'],data['link']]
                time.sleep(random.randrange(30,41)) # 设置间隔
    
            print('公众号:%s,共有文章%s' % (query['nickname'],res_cnt['app_msg_cnt'])) # 返回页数,这里应该单独输入,不需要存储
                
    def get_news():
    # def get_news(url): # 获取文章,这里可以用协程?也需要换agent
        '''视频、音频、图片'''
        print('-'*40)
        url = 'https://mp.weixin.qq.com/s?src=3&timestamp=1533397256&ver=1&signature=RbnX4tUBODpql9qsvp4jJRDrtHc-LSXXm9gSM*BNY*PTRKHJ2bUyeKkGPlpKGGsnKl4IyaxubTPPWv6jQzhm52M7qFY5*BJ8dEugb4XPUcLRSs8U-4Bb9ab9mso2NWDq0*RwRzZ2*zZ6r1YyQtNjpg=='
        res = request.Request(url,headers=base_headers)
        response = request.urlopen(res)
    
        re_data = response.read().decode()
        data = etree.HTML(re_data)
        title = get_try(data, '//h2[@id="activity-name"]/text()')  # 标题
        author = get_try(data, '//div[@id="meta_content"]//span[@class="rich_media_meta rich_media_meta_text"]//text()')  # 作者
        js_name = get_try(data, '//div[@id="meta_content"]//span[@class="rich_media_meta rich_media_meta_text"]//text()')  # 公众号名称
        publish_time = re.compile(r'var publish_time.*?"(.*?)"').findall(re_data)[0]  # 发布时间 昨天、前天、今天、1周前
    
        images_list = []  # 图片
        vedio_list = []  # 音视频
    
        # 还有图片、视频、音频地址
        js_content = data.xpath('//div[@id="js_content"]//p//text()|//div[@id="js_content"]//p//img/@data-src|//div[@id="js_content"]//p//iframe/@data-src|//mpvoice')
        for i in range(len(js_content)):
            if '' == js_content[i] or '' == js_content[i]:
                js_content[i] = ''
            elif isinstance(js_content[i], etree._Element):  # 音频
                res = js_content[i].xpath('//mpvoice/@voice_encode_fileid')[0]
                js_content[i] = 'https://res.wx.qq.com/voice/getvoice?mediaid={}'.format(res)
                vedio_list.append(js_content[i])
            elif 'pic' in js_content[i]:  # 图片
                images_list.append(js_content[i])
            elif 'v.qq' in js_content[i]:  # 视频
                vedio_json = 'https://h5vv.video.qq.com/getinfo?callback=txplayerJsonpCallBack_getinfo_24936&otype=json&vid={}' # 包括视频播放地址的json文件
                url = vedio_json.format(js_content[i].split('vid=')[-1].split('&')[0])
                js_content[i] = url
                vedio_list.append(js_content[i])
            else:
                js_content[i] = '<p>%s</p>' % js_content[i]
    
        get_video(vedio_list) # 从视频、音频json文件地址分析出下载路径
    
    
    
        print('-' * 30)
    
        total_data = {
            'title': title,
            'author': author,
            'js_name': js_name,
            'publish_time': publish_time,
            'js_content': js_content,
            'images': images_list,
            'vedios': vedio_list
        }
        # Down(total_data)  # 下载函数
    
    def get_try(data, fangfa):  # 把try except封到一起
        try:
            res = data.xpath(fangfa)[0].strip()
            return res
        except Exception as e:
            return '暂无'
    
    def get_video(url_list): # 获取视频 如果是音频,直接下载,否则转到视频界面处理
        print('获取音、视频路径列表',url_list)
        for base_url in url_list:
            if 'voice' in base_url:
                pass
                # voice_name = base_url.split('=')[-1][-10:]
                # request.urlretrieve(base_url,'./'+voice_name+'.mp3') # 目前mp3可以下载没有加密,后续需要转到Down函数统一处理
            else:
                print('视频的json文件地址',base_url)
                res = request.Request(base_url,headers=base_headers)
                response = request.urlopen(res)
    
                video_json = re.compile(r'txplayerJsonpCallBack_getinfo_24936((.*))',re.S).search(response.read().decode()).group(1)
                video_data = json.loads(video_json)
    
                title = jsonpath.jsonpath(video_data,'$..vl.vi..ti')[0]
                vid = jsonpath.jsonpath(video_data,'$..vl.vi..lnk')[0]
                vkey = jsonpath.jsonpath(video_data,'$..vl.vi..fvkey')[0]
                fn = jsonpath.jsonpath(video_data,'$..vl.vi..fn')[0]
                url_list = jsonpath.jsonpath(video_data,'$..vl.vi..ul.ui')[0]
    
                full_url = 'http://ugcsjy.qq.com/'+vid+'.p712.1.mp4?vkey='+vkey
                print('下载路径',full_url)
    
                try:
                    base_headers['Host'] = 'ugcbsy.qq.com'
                    v_response = requests.get(full_url,headers = base_headers)
                    print(base_headers)
                    print(v_response.status_code)
                except Exception as e:
                    print('该下载路径下载失败',e)
    
    def Down(data):  # 先下载 下载之后还需要改路径名称 按时间建立文件夹
        # 检测是否存在、下载、改类型
        # 视频现在路径有问题,先搞定音频和图片名称和路径
        for i in [data['images'],data['vedios']]:
            for img in i:
                img_name = img.split('/')[-2][-10:]  # 下载后的图片名称
    
                down_path = '../download/公众号_图片音视频/{}'.format(img_name)  # 下载路径
    
                path = os.listdir('../download/公众号_图片音视频/')
                path = ','.join(path) # 把当前所需要下载的文章资源路径拼接在一起,便于使用re.search方法判断
    
                if re.search(img_name + '.*', path): # 正则匹配后缀名,避免判断时因为本地文件已经改了后缀找不到
                    print('文件已存在', '-', img_name)
                else:
                    request.urlretrieve(img, down_path)  # 下载
                    end_name = imghdr.what(down_path)  # 后缀名称
                    if end_name: # imghdr只能查看图片类,视频不能判断,所以判断是否为空
                        os.rename(down_path, down_path + '.' + end_name)
                    print('已下载成功', '-', down_path)
    
    
    def Mydb(data): # 下载后再存数据库
        db = pymysql.connect('127.0.0.1','root','123456','PaChong',charset='utf8')
        curosr = db.cursor()
    
        value = ','.join(['%s']*len(data))
        sql = 'insert into {}({}) VALUES({})'.format(table,key,value)
        data = data.values()
    
        curosr.execute(sql,data)
    
    def main(): # entry point: run the whole crawl from here
        # NOTE(review): get_news is currently defined with no parameters (its
        # one-argument signature is commented out), so get_news(info[-1])
        # raises TypeError as written — confirm which signature is intended.
        start = datetime.datetime.now() # crawl start time
        for info in get_info(): # yields [title, link] per article (after login/token setup)
            get_news(info[-1]) # fetch the article page by its link
        end = datetime.datetime.now() # crawl end time
    
        print('-'*30)
        print('总用时',end-start)
        print('-'*30)
    
    if __name__ == '__main__':
        # main()  # full pipeline (login + listing) — disabled while debugging
        get_news()  # currently testing article extraction against the built-in sample URL
  • 相关阅读:
    JVM和HotSpot
    java中的四种引用类型
    垃圾回收与算法
    Full GC
    JVM内存结构
    事务不同的隔离级别实现原理
    事务的隔离级别
    jQuery后续和 前端框架Bootstrap
    jQuery
    BOM和DOM操作
  • 原文地址:https://www.cnblogs.com/hejianlong/p/9420695.html
Copyright © 2020-2023  润新知