• 一点资讯 视频抓取 phantomjs


    # _*_ coding: utf-8 _*_
    
    """
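    Approach:
    1. Load the list pages with PhantomJS, simulating a browser click. 2. For each link, scrape only the first page (9-10 items) and de-duplicate by title.
    3. Run as a scheduled task, once a day at 8 o'clock.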
    """
    
    import MySQLdb
    import redis
    import sys
    import os
    import re
    import urllib
    import requests
    import time
    import hashlib
    import traceback
    import urlparse
    import random
    import signal
    # import multiprocessing
    import matplotlib
    # Select the non-interactive "Agg" backend before any later import loads matplotlib.
    # NOTE(review): presumably required because moviepy is imported below on a
    # machine without a display -- confirm.
    matplotlib.use("Agg")
    import shutil
    import socket # used to bound how long image downloads may stall
    # Default timeout, in seconds, for sockets created without an explicit timeout.
    socket.setdefaulttimeout(30)
    import multiprocessing
    from config import IConfig
    from video_list import ydzx_url_list
    from bs4 import BeautifulSoup
    from upload_images import UploadFile
    from moviepy.editor import VideoFileClip
    from selenium import webdriver
    from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
    
    # Python 2 only: reloading sys makes setdefaultencoding available again, and
    # UTF-8 then becomes the implicit codec for the str()/unicode conversions
    # performed on scraped text throughout this file.
    reload(sys)
    sys.setdefaultencoding('utf-8')
    
    class WxpnVideo(multiprocessing.Process):
    
        def __init__(self):
            """Load every configuration section and open the Redis, MySQL and OSS handles.

            The multiprocessing.Process initialiser is invoked last, after all
            attributes used by the other methods have been assigned.
            """
            # Redis connection (holds the list-page queue and the seen-title set).
            redis_conf = IConfig.load('resource.redis')
            self.redisConf = redis_conf
            self.redisServer = redis.Redis(
                host=redis_conf['host'],
                port=redis_conf['port'],
                db=redis_conf['db'],
                password=redis_conf['passwd'],
            )

            # MySQL connection and a single shared cursor.
            mysql_conf = IConfig.load('resource.mysql')
            self.dbConfig = mysql_conf
            connect_kwargs = dict(
                user=mysql_conf['user'],
                passwd=mysql_conf['password'],
                db=mysql_conf['dbname'],
                host=mysql_conf['host'],
                charset="utf8",
                use_unicode=True,
            )
            self.conn = MySQLdb.connect(**connect_kwargs)
            self.conn.ping(True)
            self.cursor = self.conn.cursor()

            # Headers sent with article-page requests.
            self.headers = dict([
                ('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'),
                ('Host', 'www.yidianzixun.com'),
                ('X-Requested-With', 'XMLHttpRequest'),
                ('Connection', 'keep-alive'),
                ('Accept', '*/*'),
                ('Accept-Encoding', 'gzip, deflate'),
                ('Accept-Language', 'zh-CN,zh;q=0.9'),
            ])

            self.domain = IConfig.load('resource.domain')
            self.apiConf = IConfig.load('resource.apiurl')

            # Redis set names: pending list-page URLs / md5 of already stored titles.
            self.key_video_list = 'wxpn:video:list'
            self.key_title = 'wxpn:video:title'

            store_conf = IConfig.load('resource.store')
            self.storeConfig = store_conf
            self.thumb_path = store_conf['images_path']

            # OSS credentials; 'endponit' is the spelling of the configuration key.
            oss_conf = IConfig.load('resource.oss')
            self.ossConf = oss_conf
            self.key_id = oss_conf['access_key_id']
            self.key_secret = oss_conf['access_key_secret']
            self.endponit = oss_conf['endponit']

            uploader = UploadFile()
            self.img_upload = uploader
            self.auth = uploader.auth_oss(self.key_id, self.key_secret)

            video_conf = IConfig.load('resource.apiurl')
            self.videoConf = video_conf
            # NOTE(review): this string attribute has the same name as the
            # video_publish method of this class and therefore hides that method
            # on instances.
            self.video_publish = video_conf['video_publish_api']
            self.ydzx_page_api = video_conf['ydzx_page_api']

            # Reference point for the run-time budget checked in run().
            self.start_time = int(time.time())
            multiprocessing.Process.__init__(self)
    
        def store_video_list_redis(self, video_list):
            """Add each entry of *video_list* to the Redis set ``self.key_video_list``.

            Entries already present in the set are not added again.

            Returns:
                ``False`` when *video_list* is empty or otherwise falsy; ``None``
                after the entries have been processed.
            """
            if not video_list:
                return False

            queue_key = self.key_video_list
            for list_url in video_list:
                already_queued = self.redisServer.sismember(queue_key, list_url)
                if already_queued:
                    continue
                self.redisServer.sadd(queue_key, list_url)
    
        def get_video_para(self):
            """Yield ``(title_md5, doc_id)`` pairs for videos that have not been stored yet.

            List-page URLs are popped from the Redis set ``self.key_video_list`` until
            it is empty. Each page is rendered with PhantomJS and parsed with
            BeautifulSoup; every title whose MD5 digest is not yet a member of
            ``self.key_title`` is yielded together with the ``data-docid`` attribute
            of the matching link.

            Yields:
                tuple: ``(psw, itemid)`` where ``psw`` is the hex MD5 of the UTF-8
                encoded title and ``itemid`` is the document id of the item.
            """
            # The capabilities never change, so build them once instead of per link.
            dcap = dict(DesiredCapabilities.PHANTOMJS)
            dcap["phantomjs.page.settings.userAgent"] = (
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"
            )

            while True:
                # Pop directly and stop on None: a separate "is it empty?" check
                # could be invalidated by another consumer before the pop.
                link = self.redisServer.spop(self.key_video_list)
                if link is None:
                    break
                print(link)

                text = None
                driver = None
                try:
                    driver = webdriver.PhantomJS(desired_capabilities=dcap, executable_path='/usr/local/phantomjs/bin/phantomjs')

                    # Random pauses before and after loading the page.
                    time.sleep(random.randrange(3, 8))
                    driver.get(link)
                    time.sleep(random.randrange(2, 6))

                    text = driver.page_source
                except Exception:
                    print(traceback.format_exc())
                finally:
                    # Always shut the browser down, including when loading failed;
                    # otherwise every failing link leaves a phantomjs process behind.
                    if driver is not None:
                        try:
                            driver.service.process.send_signal(signal.SIGTERM)
                            driver.quit()
                        except Exception:
                            print(traceback.format_exc())

                if text is None:
                    continue

                soup = BeautifulSoup(text, 'lxml')
                title_list = soup.select('div.channel-news div.doc-title')
                itemid_list = soup.select('div.channel-news a.style-content-middle')

                if not (title_list and itemid_list):
                    print('一点资讯视频列表{0}页链接请求失败,请及时查看原因'.format(link))
                    continue

                try:
                    for title, anchor in zip(title_list, itemid_list):
                        # Encode explicitly rather than relying on the process-wide
                        # default encoding; stripping after encoding keeps the digest
                        # identical to the one produced by str(title.text).strip().
                        psw = hashlib.md5(title.text.encode('utf-8').strip()).hexdigest()

                        print(title.text)
                        itemid = anchor['data-docid']

                        if not self.redisServer.sismember(self.key_title, psw):
                            yield psw, itemid
                except Exception:
                    # A malformed item abandons the rest of this page only.
                    print(traceback.format_exc())
    
        def time_cycle(self,origin_time):
            """Convert the publish label of an article into a Unix timestamp.

            Supported labels are '昨天' (yesterday), relative labels made of a number
            followed by '个月' (months, counted as 30 days), '天' (days), '小时' (hours)
            or '分' (minutes), and absolute dates formatted as ``%Y.%m.%d``.

            Args:
                origin_time: the label text.

            Returns:
                int: seconds since the epoch, or ``None`` when the label cannot be
                interpreted (the traceback is printed in that case).
            """
            now = int(time.time())

            # (unit marker, seconds per unit). Each branch must test for its own
            # marker; an empty marker would match every label.
            units = (
                ('个月', 30 * 24 * 3600),
                ('天', 24 * 3600),
                ('小时', 3600),
                ('分', 60),
            )

            try:
                label = origin_time.strip()
                if label == '昨天':
                    return now - 24 * 3600

                for marker, seconds in units:
                    if marker in label:
                        amount = re.findall(r'(\d+)\s*' + marker, label)[0]
                        return now - int(amount) * seconds

                return int(time.mktime(time.strptime(label, "%Y.%m.%d")))
            except Exception:
                print(traceback.format_exc())
                return None
    
        def download_video(self, psw, itemid):
            """Fetch one article, copy its video and thumbnail to OSS and register it in the CMS.

            Steps: request the article page, extract title / video URL / poster /
            source / publish label, download and upload the thumbnail, download the
            video and check that moviepy can open it, upload the video, insert the
            topic, content and video rows, remember the title digest in Redis and
            call the publish API.

            Args:
                psw: hex MD5 of the list-page title; its first 20 characters name the
                    thumbnail file and the whole digest is added to ``self.key_title``
                    once the rows are stored.
                itemid: document id appended to the article URL.

            Returns:
                ``False`` when processing is abandoned (request failed, page is not a
                video, video unreadable, upload rejected, database error); ``None``
                when the end of the method is reached.

            Raises:
                Exception: parsing, download and upload errors that are not handled
                    here propagate to the caller.
            """
            now = int(time.time())

            url = 'http://www.yidianzixun.com/article/' + itemid
            print(url)
            self.headers['Referer'] = url
            try:
                res = requests.get(url=url, headers=self.headers, timeout=60)
            except Exception:
                # Without a response there is nothing to parse.
                print('小链接连接失败')
                return False
            print(res.status_code)

            if res.status_code != 200:
                print('一点资讯视频主链接请求失败,请及时查看原因')
                return False

            soup = BeautifulSoup(res.text, 'lxml')
            title = soup.select('div.left-wrapper > h2')[0].text

            video_tags = soup.select('div.video-wrapper > video')
            video_src = video_tags[0].get('src') if video_tags else None
            if not video_src:
                # Plain article: stop here instead of failing on the poster lookup.
                print('此篇为文章,不是视频')
                return False
            thumb_src = video_tags[0]['poster']

            meta_links = soup.select('body.page-article .left-wrapper > .meta > a')
            meta_spans = soup.select('body.page-article .left-wrapper > .meta > span')
            if meta_links:
                source = meta_links[0].text
            else:
                source = re.sub('来源:', '', str(meta_spans[0].text))
            publishtime = meta_spans[0].text

            # time_cycle returns None for labels it cannot interpret, so the
            # fallback has to cover that case as well as an exception.
            try:
                timestamp = self.time_cycle(str(publishtime))
            except Exception:
                timestamp = None
            if timestamp is None:
                timestamp = now

            # Thumbnail: pick the extension from the 'wx_fmt' query parameter if present.
            img_url_query = urlparse.parse_qs(urlparse.urlparse(thumb_src).query, True)
            if 'wx_fmt' in img_url_query:
                ext_name = '.' + img_url_query['wx_fmt'][0]
            else:
                ext_name = '.png'

            thumb_p = self.thumb_path + 'video/thumb'
            if not os.path.exists(thumb_p):
                # makedirs also creates the parent 'video' directory when missing.
                os.makedirs(thumb_p)

            file_name = psw[:20] + ext_name
            img_down_local_path = thumb_p + '/' + file_name
            urllib.urlretrieve(thumb_src, img_down_local_path)

            if os.path.exists(img_down_local_path):
                self.img_upload.upload_to_oss(self.auth, self.endponit, self.ossConf['video_thumb_path'], file_name, img_down_local_path)

            thumb_src = self.domain['img_url_oss'] + 'Cmstop/ydzx/' + file_name
            psw_thumb = hashlib.md5(str(thumb_src)).hexdigest()

            # The digits must be matched with \d; a bare 'd' only matches the letter.
            duration_match = re.search(r'"duration":(\d+)', res.text)
            playtime = duration_match.group(1) if duration_match else None

            video_file = str(video_src).split('/')[-1]
            video_path = self.thumb_path + 'video/' + video_file

            # Bound the download; without a timeout a stalled server blocks the run.
            video_res = requests.get(video_src, timeout=60).content
            with open(video_path, 'wb') as f:
                f.write(video_res)

            # Opening the file with moviepy serves as a validity check.
            clip = None
            try:
                clip = VideoFileClip(video_path)
                print(clip.duration)
            except Exception:
                print(traceback.format_exc())
                return False
            finally:
                # Release the reader when this moviepy version offers close().
                close_clip = getattr(clip, 'close', None)
                if close_clip is not None:
                    close_clip()

            video_name = video_file[10:]
            status = self.img_upload.upload_to_oss(self.auth, self.endponit, self.ossConf['video_path'], video_name, video_path)
            if status != 'success':
                return False

            print('视频上传成功')
            video_link = self.domain['img_url_oss'] + 'Cmstop/video/ydzx/' + video_name
            video_id = video_file[10:-4]

            # Resolved before the inserts because get_article_sourceid commits on
            # its own, which would otherwise split the transaction below.
            sourceid = self.get_article_sourceid(source)

            # One transaction for the three rows: either all are stored or none,
            # and later statements never run with ids from a failed insert.
            try:
                self.cursor.execute(
                    "insert into cmstop_comment_topic(title, description, thumb, created, url_md5, url) values(%s, '', %s, %s, %s, '')",
                    (title, thumb_src, now, psw_thumb))
                topicid = self.cursor.lastrowid

                self.cursor.execute(
                    "insert into cmstop_content(topicid, sourceid, catid, modelid, title, subtitle, source_title, source_link, weight, status, created, score, published, thumb, createdby) "
                    "values(%s, %s, %s, %s, %s, %s, %s, %s, 60, %s, %s, %s, %s, %s, %s)",
                    (topicid, sourceid, 47, 4, title, None, title, '', 3, now, 0, timestamp, thumb_src, 0))
                lastrowid = self.cursor.lastrowid

                self.cursor.execute(
                    "insert into cmstop_video(contentid, video, playtime, author, video_id, aid) values(%s, %s, %s, %s, %s, %s)",
                    (lastrowid, video_link, playtime, source, video_id, 0))
                self.conn.commit()
            except Exception:
                print(traceback.format_exc())
                self.conn.rollback()
                return False

            # Only mark the title as done after the rows are safely stored.
            self.redisServer.sadd(self.key_title, psw)

            api_url = self.video_publish + str(lastrowid)
            try:
                urllib.urlopen(api_url).read()
            except Exception:
                print('connect failed')
    
        def get_article_sourceid(self, source, medias=None):
            """Return the ``cmstop_source.sourceid`` for *source*, inserting the row if needed.

            Args:
                source: name of the source; surrounding whitespace is stripped.
                medias: optional collection of UTF-8 encoded names. When the encoded
                    *source* is in it, ``has_signed_contract`` is treated as 1,
                    otherwise as 0.

            Returns:
                The id of the existing or newly inserted row, or ``0`` when the row
                is missing and the insert fails.
            """
            source = source.strip()
            sourceid = 0

            has_signed_contract = 0
            # set(medias) accepts lists as well as sets.
            if medias and source.encode('utf-8') in set(medias):
                has_signed_contract = 1

            # source is scraped text, so it is passed as a bound parameter and
            # never spliced into the SQL string.
            result = self.cursor.execute(
                'select `sourceid`, `name`, `has_signed_contract` from `cmstop_source` where `name`=%s',
                (source,))

            if result:
                data = self.cursor.fetchone()
                sourceid = data[0]

                # Keep the stored flag in sync with the value computed above.
                if data[2] != has_signed_contract:
                    try:
                        self.cursor.execute(
                            'update `cmstop_source` set `has_signed_contract`=%s where sourceid=%s',
                            (has_signed_contract, sourceid))
                        self.conn.commit()
                    except Exception:
                        print(traceback.format_exc())
                        self.conn.rollback()
            else:
                try:
                    self.cursor.execute(
                        'insert into `cmstop_source`(`name`, `logo`, `url`, `initial`, `has_signed_contract`) '
                        'values(%s, %s, %s, %s, %s)',
                        (source, '', '', '', has_signed_contract))
                    self.conn.commit()
                    sourceid = self.cursor.lastrowid
                except Exception:
                    print(traceback.format_exc())
                    self.conn.rollback()

            return sourceid
    
        def run(self):
            """Process entry point: scrape every queued list page under a lock file.

            Kills leftover phantomjs processes, takes the lock named by the
            'resource.lock' configuration, queues ``ydzx_url_list`` in Redis and hands
            every ``(psw, itemid)`` produced by ``get_video_para`` to
            ``download_video``. The loop stops once the process has been alive for
            three hours. The working directory ``<thumb_path>video`` is emptied and
            the lock removed on every exit path.

            Returns:
                ``False`` when the lock could not be taken; ``None`` otherwise.
            """
            max_runtime = 10800  # run-time budget: 3 hours, in seconds

            os.system('pkill phantomjs')

            lockConf = IConfig.load('resource.lock')
            # Absolute path, so a later change of working directory cannot make
            # the removal below miss the file.
            lock_file = os.path.abspath(lockConf['lock_path_ydzx'])

            try:
                # O_EXCL makes "check and create" a single atomic step, so two runs
                # started together cannot both take the lock.
                os.close(os.open(lock_file, os.O_CREAT | os.O_EXCL | os.O_WRONLY))
            except OSError:
                if os.path.exists(lock_file):
                    print('lock file exists')
                else:
                    print(traceback.format_exc())
                return False

            try:
                self.store_video_list_redis(ydzx_url_list)

                for psw, itemid in self.get_video_para():
                    print(psw)

                    if int(time.time()) - self.start_time >= max_runtime:
                        break

                    try:
                        self.download_video(psw=psw, itemid=itemid)
                        time.sleep(random.uniform(2, 8))
                        os.system('pkill ffmpeg-osx-v3.2.4')
                    except Exception:
                        # One failing video must not stop the remaining ones.
                        print(traceback.format_exc())
            finally:
                # Runs on normal completion, on the time limit and on unexpected
                # errors alike; a leftover lock would block every later run.
                try:
                    self.del_file(self.thumb_path + 'video')
                except Exception:
                    print(traceback.format_exc())
                if os.path.exists(lock_file):
                    os.remove(lock_file)
    
        def video_publish(self, min_contentid=3430851, max_contentid=3528920):
            """Call the publish API for every ``cmstop_video`` row in a content-id range.

            NOTE(review): ``__init__`` assigns the publish API URL to an instance
            attribute that is also called ``video_publish``. On instances that
            attribute hides this method, so it can only be reached through the
            class, e.g. ``WxpnVideo.video_publish(obj)``.

            Args:
                min_contentid: lowest content id to publish (inclusive).
                max_contentid: highest content id to publish (inclusive).
            """
            sql = 'select contentid from cmstop_video where contentid<=%s and contentid>=%s'
            self.cursor.execute(sql, (max_contentid, min_contentid))

            for row in self.cursor.fetchall():
                # self.video_publish is the API URL string set in __init__.
                api_url = self.video_publish + str(row[0])
                try:
                    resp = urllib.urlopen(api_url)
                    try:
                        resp.read()
                    finally:
                        resp.close()
                except Exception:
                    print('connect failed')
    
        def del_file(self, path):
            """Remove everything inside the directory *path*, keeping the directory itself.

            Sub-directories are removed recursively; files and symbolic links are
            unlinked. The working directory of the process is left untouched, and a
            *path* that is not an existing directory is ignored.

            Args:
                path: directory to empty.
            """
            if not os.path.isdir(path):
                return

            for name in os.listdir(path):
                # Build full paths instead of chdir-ing into the directory, which
                # would silently change the meaning of every relative path used later.
                entry = os.path.join(path, name)
                if os.path.isdir(entry) and not os.path.islink(entry):
                    shutil.rmtree(entry)
                else:
                    # Regular files and links; rmtree refuses symbolic links.
                    os.remove(entry)
    
    if __name__ == '__main__':
        # Launch one scraper process and block until it has finished.
        worker = WxpnVideo()
        worker.start()
        worker.join()
  • 相关阅读:
    二分图最大匹配的König定理及其证明
    HDOJ 2389 Rain on your Parade
    HDOJ 1083 Courses
    HDOJ 2063 过山车
    POJ 1469 COURSES
    UESTC 1817 Complete Building the Houses
    POJ 3464 ACM Computer Factory
    POJ 1459 Power Network
    HDOJ 1532 Drainage Ditches
    HDU 1017 A Mathematical Curiosity
  • 原文地址:https://www.cnblogs.com/19921019yy/p/9355369.html
Copyright © 2020-2023  润新知