一点资讯视频抓取 phantomjs

# _*_ coding: utf-8 _*_

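"""
Approach:
1. List pages are rendered with PhantomJS to simulate clicks.
2. Only the first page of each link (9-10 items) is crawled; items are de-duplicated by title.
3. A scheduled job runs the crawler once a day at 8:00.
"""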

import MySQLdb
import redis
import sys
import os
import re
import urllib
import requests
import time
import hashlib
import traceback
import urlparse
import random
import signal
# import multiprocessing
import matplotlib
matplotlib.use("Agg")
import shutil
import socket #图片下载延迟的
socket.setdefaulttimeout(30)
import multiprocessing
from config import IConfig
from video_list import ydzx_url_list
from bs4 import BeautifulSoup
from upload_images import UploadFile
from moviepy.editor import VideoFileClip
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

# Python 2 idiom: reload ``sys`` and switch the interpreter-wide default
# string encoding to UTF-8.
# NOTE(review): the str() calls applied to scraped text further down
# presumably rely on this default to handle non-ASCII titles - confirm
# before removing or porting to Python 3 (where neither call exists).
reload(sys)
sys.setdefaultencoding('utf-8')

class WxpnVideo(multiprocessing.Process):
    """Crawler process that imports videos from www.yidianzixun.com.

    Work flow (see ``run``):

    1. The list-page URLs from ``ydzx_url_list`` are queued in a redis set.
    2. Each list page is rendered with PhantomJS and yields one
       ``(title_md5, docid)`` pair per video whose title has not been
       processed yet.
    3. Each article page is fetched, its thumbnail and video are uploaded
       to OSS, rows are inserted into MySQL and the publish API is called.
    """

    # Stop picking up new videos once the process has run this long (3 hours).
    MAX_RUNTIME_SECONDS = 10800

    PHANTOMJS_PATH = '/usr/local/phantomjs/bin/phantomjs'
    PHANTOMJS_USER_AGENT = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"
    )

    # (unit marker, seconds per unit) pairs tried in this order by
    # ``time_cycle`` when parsing relative timestamps such as "3小时前".
    _RELATIVE_UNITS = (
        ('天', 24 * 3600),
        ('小时', 3600),
        ('分', 60),
        ('个月', 30 * 24 * 3600),
    )

    def __init__(self):
        """Load configuration and open the redis / MySQL / OSS handles."""
        multiprocessing.Process.__init__(self)

        self.redisConf = IConfig.load('resource.redis')
        self.redisServer = redis.Redis(
            host=self.redisConf['host'],
            port=self.redisConf['port'],
            db=self.redisConf['db'],
            password=self.redisConf['passwd'])

        self.dbConfig = IConfig.load('resource.mysql')
        self.conn = MySQLdb.connect(
            user=self.dbConfig['user'],
            passwd=self.dbConfig['password'],
            db=self.dbConfig['dbname'],
            host=self.dbConfig['host'],
            charset="utf8",
            use_unicode=True)

        self.conn.ping(True)
        self.cursor = self.conn.cursor()

        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
            'Host': 'www.yidianzixun.com',
            'X-Requested-With': 'XMLHttpRequest',
            'Connection': 'keep-alive',
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
        }

        self.domain = IConfig.load('resource.domain')
        self.apiConf = IConfig.load('resource.apiurl')

        # Redis sets: pending list-page URLs / md5 of already imported titles.
        self.key_video_list = 'wxpn:video:list'
        self.key_title = 'wxpn:video:title'

        self.storeConfig = IConfig.load('resource.store')
        self.thumb_path = self.storeConfig['images_path']

        self.ossConf = IConfig.load('resource.oss')
        self.key_id = self.ossConf['access_key_id']
        self.key_secret = self.ossConf['access_key_secret']
        # The misspelling 'endponit' is the actual config key and attribute
        # name; it is kept so existing configuration keeps working.
        self.endponit = self.ossConf['endponit']

        self.img_upload = UploadFile()
        self.auth = self.img_upload.auth_oss(self.key_id, self.key_secret)

        # Same 'resource.apiurl' section as self.apiConf; loaded once.
        self.videoConf = self.apiConf
        # NOTE: this attribute has the same name as the video_publish()
        # method defined below and therefore shadows it on instances.
        self.video_publish = self.videoConf['video_publish_api']
        self.ydzx_page_api = self.videoConf['ydzx_page_api']

        self.start_time = int(time.time())

    def store_video_list_redis(self, video_list):
        """Queue every list-page URL of ``video_list`` in redis.

        Returns ``False`` when ``video_list`` is empty, ``None`` otherwise.
        """
        if not video_list:
            return False

        # sadd ignores members that are already in the set, so no separate
        # membership check is needed.
        for per_list in video_list:
            self.redisServer.sadd(self.key_video_list, per_list)

    def _fetch_page_source(self, link):
        """Render ``link`` with PhantomJS and return the page source.

        Returns ``None`` when the page could not be loaded. The browser is
        always shut down, including when loading fails.
        """
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        dcap["phantomjs.page.settings.userAgent"] = self.PHANTOMJS_USER_AGENT

        driver = None
        try:
            driver = webdriver.PhantomJS(desired_capabilities=dcap,
                                         executable_path=self.PHANTOMJS_PATH)

            # Random pauses before and after the request.
            time.sleep(random.randrange(3, 8))
            driver.get(link)
            time.sleep(random.randrange(2, 6))

            return driver.page_source
        except Exception:
            print(traceback.format_exc())
            return None
        finally:
            if driver is not None:
                try:
                    # Signal the phantomjs child process first, then quit.
                    driver.service.process.send_signal(signal.SIGTERM)
                    driver.quit()
                except Exception:
                    print(traceback.format_exc())

    def get_video_para(self):
        """Yield ``(title_md5, docid)`` for every not yet imported video.

        List-page URLs are popped from the redis queue until it is empty.
        Titles whose md5 is already in ``self.key_title`` are skipped.
        """
        while True:
            # spop returns None once the set is empty.
            link = self.redisServer.spop(self.key_video_list)
            if link is None:
                break
            print(link)

            text = self._fetch_page_source(link)
            if text is None:
                continue

            soup = BeautifulSoup(text, 'lxml')
            title_list = soup.select('div.channel-news div.doc-title')
            itemid_list = soup.select('div.channel-news a.style-content-middle')

            if not (title_list and itemid_list):
                print('一点资讯视频列表{0}页链接请求失败,请及时查看原因'.format(link))
                continue

            try:
                for title, anchor in zip(title_list, itemid_list):
                    # Keep the exact hashing expression: the digests are the
                    # de-duplication keys already stored in redis.
                    m = hashlib.md5()
                    m.update(str(title.text).strip())
                    psw = m.hexdigest()

                    print(title.text)
                    itemid = anchor['data-docid']

                    if not self.redisServer.sismember(self.key_title, psw):
                        yield psw, itemid
            except Exception:
                print(traceback.format_exc())

    def time_cycle(self, origin_time):
        """Convert a publish-time label into a Unix timestamp.

        Supported inputs: '昨天', '前天', '<n>天...', '<n>小时...',
        '<n>分...', '<n>个月...' (all relative to now) and absolute dates
        formatted as ``YYYY.MM.DD``.

        Returns the timestamp as ``int``, or ``None`` (after printing the
        traceback) when the label cannot be parsed.
        """
        now = int(time.time())

        try:
            label = origin_time.strip()

            if label == '昨天':
                return now - 24 * 3600
            if label == '前天':
                return now - 2 * 24 * 3600

            for unit, seconds in self._RELATIVE_UNITS:
                found = re.search(r'(\d+)\s*' + unit, label)
                if found:
                    return now - int(found.group(1)) * seconds

            return int(time.mktime(time.strptime(label, "%Y.%m.%d")))
        except Exception:
            print(traceback.format_exc())
            return None

    def _store_thumbnail(self, psw, thumb_src):
        """Download the poster image, upload it to OSS, return its OSS URL."""
        img_url_query = urlparse.parse_qs(urlparse.urlparse(thumb_src).query, True)
        if 'wx_fmt' in img_url_query:
            ext_name = '.' + img_url_query['wx_fmt'][0]
        else:
            ext_name = '.png'

        thumb_p = self.thumb_path + 'video/thumb'
        if not os.path.exists(thumb_p):
            # makedirs: the intermediate 'video' directory may not exist yet.
            os.makedirs(thumb_p)

        file_name = psw[:20] + ext_name
        img_down_local_path = thumb_p + '/' + file_name
        urllib.urlretrieve(thumb_src, img_down_local_path)

        if os.path.exists(img_down_local_path):
            images_path = self.ossConf['video_thumb_path']
            self.img_upload.upload_to_oss(self.auth, self.endponit, images_path, file_name, img_down_local_path)

        return self.domain['img_url_oss'] + 'Cmstop/ydzx/' + file_name

    def _download_file(self, url, path):
        """Stream ``url`` to ``path``; raises on HTTP or network errors."""
        resp = requests.get(url, stream=True, timeout=60)
        try:
            resp.raise_for_status()
            with open(path, 'wb') as f:
                for chunk in resp.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        f.write(chunk)
        finally:
            resp.close()

    def _insert_video_records(self, title, thumb_src, psw_thumb, now, sourceid,
                              timestamp, video_link, playtime, source, video_id):
        """Insert the topic, content and video rows for one video.

        The three inserts are committed together. Returns the id of the new
        ``cmstop_content`` row, or ``None`` when any insert failed (the
        transaction is rolled back in that case).
        """
        try:
            sql = "insert into cmstop_comment_topic(title, description, thumb, created, url_md5, url) values(%s, '', %s, %s, %s, '')"
            params = (title, thumb_src, now, psw_thumb)
            self.cursor.execute(sql, params)
            topicid = self.cursor.lastrowid

            self.cursor.execute("""
                        insert into cmstop_content(topicid, sourceid, catid, modelid, title, subtitle, source_title, source_link, weight, status, created, score, published, thumb, createdby)
                        values(%s, %s, %s, %s, %s, %s, %s, %s, 60, %s, %s, %s, %s, %s, %s)
                        """, (topicid, sourceid, 47, 4, title, None, title, '', 3, now, 0, timestamp, thumb_src, 0))
            lastrowid = self.cursor.lastrowid

            sql = "insert into cmstop_video(contentid, video, playtime, author, video_id, aid) values(%s, %s, %s, %s, %s, %s)"
            print(sql)
            params = (lastrowid, video_link, playtime, source, video_id, 0)
            self.cursor.execute(sql, params)

            self.conn.commit()
            return lastrowid
        except Exception:
            print(traceback.format_exc())
            self.conn.rollback()
            return None

    def download_video(self, psw, itemid):
        """Import the video of article ``itemid`` into the CMS.

        ``psw`` is the md5 of the video title; it names the local thumbnail
        file and is recorded in redis once the import succeeded.

        Returns ``False`` when the article cannot be fetched, is not a
        video, or the download / upload / database step fails; ``None``
        after a completed import. Missing page elements (title, poster,
        meta line) raise, which ``run`` catches and logs.
        """
        now = int(time.time())

        url = 'http://www.yidianzixun.com/article/' + itemid
        print(url)
        self.headers['Referer'] = url
        try:
            res = requests.get(url=url, headers=self.headers, timeout=60)
        except Exception:
            print('小链接连接失败')
            return False
        print(res.status_code)

        if res.status_code != 200:
            print('一点资讯视频主链接请求失败,请及时查看原因')
            return False

        soup = BeautifulSoup(res.text, 'lxml')

        title = soup.select('div.left-wrapper > h2')[0].text

        video_tags = soup.select('div.video-wrapper > video')
        video_src = video_tags[0].get('src') if video_tags else None
        if not video_src:
            print('此篇为文章,不是视频')
            return False
        thumb_src = video_tags[0]['poster']

        meta_links = soup.select('body.page-article .left-wrapper > .meta > a')
        meta_spans = soup.select('body.page-article .left-wrapper > .meta > span')
        if meta_links:
            source = meta_links[0].text
        else:
            # No link in the meta line: use the first span without its label.
            source = re.sub('来源：', '', str(meta_spans[0].text))

        publishtime = meta_spans[0].text
        try:
            timestamp = self.time_cycle(str(publishtime))
        except Exception:
            timestamp = None
        if timestamp is None:
            # time_cycle reports unparseable labels by returning None.
            timestamp = now

        thumb_src = self._store_thumbnail(psw, thumb_src)
        m = hashlib.md5()
        m.update(str(thumb_src))
        psw_thumb = m.hexdigest()

        duration_match = re.search(r'"duration":(\d+)', str(res.text))
        playtime = duration_match.group(1) if duration_match else None

        video_file = str(video_src).split('/')[-1]
        video_path = self.thumb_path + 'video/' + video_file
        try:
            self._download_file(video_src, video_path)
        except Exception:
            print(traceback.format_exc())
            return False

        # Opening the file with moviepy verifies that it is a readable video.
        try:
            clip = VideoFileClip(video_path)
            print(clip.duration)
        except Exception:
            print(traceback.format_exc())
            return False

        video_name = video_file[10:]
        if os.path.exists(video_path):
            images_path = self.ossConf['video_path']
            status = self.img_upload.upload_to_oss(self.auth, self.endponit, images_path, video_name, video_path)

            print('')
            if status != 'success':
                return False

        print('视频上传成功')
        video_link = self.domain['img_url_oss'] + 'Cmstop/video/ydzx/' + video_name

        # Resolved before the inserts because it commits on its own.
        sourceid = self.get_article_sourceid(source)
        video_id = video_file[10:-4]

        lastrowid = self._insert_video_records(
            title, thumb_src, psw_thumb, now, sourceid,
            timestamp, video_link, playtime, source, video_id)
        if lastrowid is None:
            # Not marked as processed, so a later run can retry this title.
            return False

        self.redisServer.sadd(self.key_title, psw)

        api_url = self.video_publish + str(lastrowid)
        try:
            resp = urllib.urlopen(api_url)
            resp.read()
            resp.close()
        except Exception:
            print('connect failed')

    def get_article_sourceid(self, source, medias=None):
        """Return the ``cmstop_source`` id for ``source``, creating the row
        when it does not exist yet.

        ``medias`` is an optional collection of UTF-8 encoded source names;
        a source contained in it is stored with ``has_signed_contract`` = 1,
        every other source with 0. Returns 0 when a new row could not be
        inserted.
        """
        source = source.strip()
        sourceid = 0

        # Parameterized: the source name is scraped from a web page.
        result = self.cursor.execute(
            'select `sourceid`, `name`, `has_signed_contract` from `cmstop_source` where `name`=%s',
            (source,))
        has_signed_contract = 0

        if medias and source.encode('utf-8') in medias:
            has_signed_contract = 1

        if result:
            data = self.cursor.fetchone()
            sourceid = data[0]

            if data[2] != has_signed_contract:
                try:
                    self.cursor.execute("""
                        update `cmstop_source` set `has_signed_contract`=%s where sourceid=%s
                        """, (has_signed_contract, sourceid))
                    self.conn.commit()
                except Exception:
                    print(traceback.format_exc())
                    self.conn.rollback()
        else:
            try:
                self.cursor.execute("""
                    insert into `cmstop_source`(`name`, `logo`, `url`, `initial`, `has_signed_contract`)
                    values(%s, %s, %s, %s, %s)
                    """, (source, '', '', '', has_signed_contract))
                self.conn.commit()
                sourceid = self.cursor.lastrowid
            except Exception:
                print(traceback.format_exc())
                self.conn.rollback()

        return sourceid

    def _release_lock(self, lock_file):
        """Remove the lock file (or directory) if it exists."""
        if os.path.isdir(lock_file):
            shutil.rmtree(lock_file, ignore_errors=True)
        elif os.path.exists(lock_file):
            os.remove(lock_file)

    def run(self):
        """Process entry point: crawl all queued list pages once.

        Returns ``False`` immediately when the lock file already exists.
        The download directory is emptied and the lock file removed when
        the crawl ends, including when it ends with an exception. After
        ``MAX_RUNTIME_SECONDS`` the process cleans up and exits.
        """
        os.system('pkill phantomjs')

        lockConf = IConfig.load('resource.lock')
        lock_file = lockConf['lock_path_ydzx']

        if os.path.exists(lock_file):
            print('lock file exists')
            return False
        open(lock_file, 'a').close()

        timed_out = False
        try:
            self.store_video_list_redis(ydzx_url_list)

            for psw, itemid in self.get_video_para():
                print(psw)

                if int(time.time()) - self.start_time >= self.MAX_RUNTIME_SECONDS:
                    timed_out = True
                    break

                try:
                    self.download_video(psw=psw, itemid=itemid)

                    time.sleep(random.uniform(2, 8))

                    os.system('pkill ffmpeg-osx-v3.2.4')
                except Exception:
                    print(traceback.format_exc())
        finally:
            try:
                self.del_file(self.thumb_path + 'video')
            except Exception:
                print(traceback.format_exc())
            self._release_lock(lock_file)

        if timed_out:
            os._exit(0)

    def video_publish(self):
        """Call the publish API for a fixed range of ``cmstop_video`` rows.

        NOTE: ``__init__`` stores the publish API URL in an instance
        attribute that is also named ``video_publish``, so on instances
        this method is shadowed and only reachable as
        ``WxpnVideo.video_publish(instance)``. Inside this method
        ``self.video_publish`` is that URL string.
        """
        sql = 'select contentid from cmstop_video where contentid<=3528920 and contentid>=3430851'
        self.cursor.execute(sql)
        data = self.cursor.fetchall()

        for num in data:
            api_url = self.video_publish + str(num[0])
            try:
                resp = urllib.urlopen(api_url)
                resp.read()
                resp.close()
            except Exception:
                print('connect failed')

    def del_file(self, path):
        """Delete every file and sub-directory inside ``path``.

        ``path`` itself is kept and the working directory of the process is
        left unchanged.
        """
        for name in os.listdir(path):
            target = os.path.join(path, name)

            if os.path.isfile(target) or os.path.islink(target):
                os.remove(target)
            else:
                shutil.rmtree(target)

if __name__ == '__main__':
    # Run a single crawler process and block until it has finished.
    crawler = WxpnVideo()
    crawler.start()
    crawler.join()
相关阅读:
二分图最大匹配的König定理及其证明
 HDOJ 2389 Rain on your Parade
HDOJ 1083 Courses
HDOJ 2063 过山车
 POJ 1469 COURSES
UESTC 1817 Complete Building the Houses
POJ 3464 ACM Computer Factory
POJ 1459 Power Network
HDOJ 1532 Drainage Ditches
HDU 1017 A Mathematical Curiosity
原文地址：https://www.cnblogs.com/19921019yy/p/9355369.html
一点资讯 视频抓取 phantomjs

一点资讯视频抓取 phantomjs