• 【Course Chapter Update】New crawler source code for the 猫影 project


    Background

            I have an introductory Flask course on iMooc (慕课): click here to view the course. The video website the course originally used for teaching is no longer in service, so to make it easier for everyone to keep learning, I have found a new video source. A serious note: this code is for learning and demonstration only; please use it responsibly and do not put any load on the source website.


    Example code

    Create a new Python file named movie2.py with the following code:

    # -*- coding: utf-8 -*-
    from application import app,db
    import requests,os,time,hashlib,json,re
    from bs4 import BeautifulSoup
    from common.libs.DataHelper import getCurrentTime
    from urllib.parse import urlparse
    from common.models.movie import Movie
    import logging
    from flask.logging import default_handler
    '''
    Example commands (a sketch of a matching manager.py dispatcher follows the listing):
    python manager.py runjob -m movie2 -a list
    python manager.py runjob -m movie2 -a parse
    '''
    class JobTask():
        def __init__(self):
            ## run the job in debug mode
            app.config['DEBUG'] = True
            logging_format = logging.Formatter(
                '%(levelname)s %(asctime)s %(filename)s:%(funcName)s L%(lineno)s %(message)s')
            default_handler.setFormatter(logging_format)
            self.source = "2345movie"
            self.url = {
                "num" : 3,
                "url" : "https://dianying.2345.com/list/-------#d#.html",
                "path" : "/tmp/%s/" %( self.source )
            }
    
        '''
        Step 1: fetch the list-page HTML, parse it to get each item's detail url and other info,
                then fetch the detail HTML for every detail url.
        Step 2: parse the detail HTML.
        '''
        def run(self,params):
            act = params['act']
            self.date = getCurrentTime( frm = "%Y%m%d")
            if act == "list":
                self.getList()
                self.parseInfo()
            elif act == "parse":
                self.parseInfo()
    
        '''
        Fetch the list pages
        '''
        def getList(self):
            config = self.url
            path_root = config['path'] + self.date
            path_list = path_root + "/list"
            path_info = path_root + "/info"
            path_json = path_root + "/json"
            path_vid = path_root + "/vid"
            self.makeSuredirs( path_root )
            self.makeSuredirs( path_list )
            self.makeSuredirs( path_info )
            self.makeSuredirs( path_json )
            self.makeSuredirs( path_vid )
    
            pages = range( 1,config['num'] + 1 )
            for idx in pages:
                tmp_path = path_list + "/" + str( idx )
                tmp_url = config['url'].replace("#d#",str( idx ) )
                app.logger.info( "get list : " +  tmp_url )
                if os.path.exists( tmp_path ):
                    continue
    
                tmp_content = self.getHttpContent( tmp_url )
                self.saveContent( tmp_path,tmp_content )
                time.sleep(0.3)
    
            for idx in os.listdir( path_list ):
                tmp_content = self.getContent( path_list + "/" + str( idx ) )
                items_data = self.parseList( tmp_content )
                if not items_data:
                    continue
    
                for item in items_data:
                    app.logger.info("----------------")
                    app.logger.info( item )
                    tmp_json_path = path_json + "/" + item['hash']
                    tmp_info_path = path_info + "/" + item['hash']
                    tmp_vid_path = path_vid + "/" + item['hash']
                    if not os.path.exists( tmp_json_path ):
                        self.saveContent( tmp_json_path, json.dumps( item,ensure_ascii=False ) )
    
                    if not os.path.exists(tmp_info_path):
                        tmp_content = self.getHttpContent( item['url'] )
                        self.saveContent(  tmp_info_path,tmp_content )
    
                    if not os.path.exists( tmp_vid_path ):
                        tmp_content = self.getHttpContent( item['vid_url'] )
                        self.saveContent(  tmp_vid_path,tmp_content )
    
                    time.sleep( 0.3 )
    
    
        def parseList(self,content):
            data = []
            config = self.url
            url_info = urlparse( config['url'] )
            url_domain = url_info[0] + "://" + url_info[1]
    
            tmp_soup = BeautifulSoup( str(content),"html.parser" )
            tmp_list = tmp_soup.select( "div#contentList ul li" )
            for tmp_item in tmp_list:
                tmp_target = tmp_item.select("div.li-pic a.aPlayBtn")
                if not tmp_target:
                    continue
                tmp_name = tmp_target[0]['title']
                tmp_href = tmp_target[0]['href']
                if "https:" not in tmp_href and "//" in tmp_href:
                    tmp_href = "https:%s" %( tmp_href )
                tmp_vid_url = "" ## the list page no longer exposes the download url, so it will be fetched from the detail page
                tmp_data = {
                    "name" : tmp_name,
                    "url" : tmp_href,
                    "vid_url" : tmp_vid_url,
                    "hash" :  hashlib.md5( tmp_href.encode("utf-8") ).hexdigest()
                }
                data.append( tmp_data )
    
            return data
    
        '''
        Parse the detail info
        '''
        def parseInfo(self):
            config = self.url
            path_root = config['path'] + self.date
            path_info = path_root + "/info"
            path_json = path_root + "/json"
            path_vid = path_root + "/vid"
            for filename in os.listdir(  path_info ):
                tmp_json_path = path_json + "/" + filename
                tmp_info_path = path_info + "/" + filename
                tmp_vid_path = path_vid + "/" + filename
                tmp_data = json.loads( self.getContent( tmp_json_path) )
                app.logger.info( tmp_info_path )
                tmp_content = self.getContent( tmp_info_path )
                tmp_soup = BeautifulSoup( tmp_content,"html.parser")
                try:
                    ## the page has no publish date, so fall back to today's date
                    tmp_pub_date = self.date
                    tmp_desc = tmp_soup.select( "div.txtIntroCon div.wholeTxt ul.newIntro li.extend .pHide" )[0].getText()
                    tmp_classify = tmp_soup.select( "div.txtIntroCon div.wholeTxt ul.txtList li.li_3 div.emTit-l" )[2].getText()
                    tmp_actor = tmp_soup.select( "div.txtIntroCon div.wholeTxt ul.txtList li.liActor div.emTit-l" )[1].getText()
                    tmp_pic_list = tmp_soup.select("div.posterPlaceholder div.pic img ")
                    tmp_pics = []
                    for tmp_pic in tmp_pic_list:
                        tmp_pics.append(  "https:" +  tmp_pic['src'] )
    
                    # get the download link directly from the current page
                    #tmp_download_content = self.getContent( tmp_vid_path  )
                    #tmp_vid_soup = BeautifulSoup( tmp_download_content ,"html.parser")
                    tmp_download_list = tmp_soup.select( "div.txtIntroCon div.series div.series-con div.series-con-i a" )
                    tmp_magnet_url = ""
                    if tmp_download_list:
                        tmp_magnet_url = tmp_download_list[0]['href']
    
    
                    tmp_data['pub_date'] = tmp_pub_date
                    tmp_data['desc'] = tmp_desc.strip()
                    tmp_data['classify'] = tmp_classify.strip()
                    tmp_data['actor'] = tmp_actor.strip()
                    tmp_data['magnet_url'] = tmp_magnet_url
                    tmp_data['source'] = self.source
                    tmp_data['created_time'] = tmp_data['updated_time'] = getCurrentTime()
                    if tmp_pics:
                        tmp_data['cover_pic'] = tmp_pics[0]
                        tmp_data['pics'] = json.dumps( tmp_pics )
    
                    tmp_movie_info = Movie.query.filter_by( hash  = tmp_data['hash']).first()
                    if tmp_movie_info:
                        continue
    
                    tmp_model_movie = Movie( **tmp_data )
                    db.session.add( tmp_model_movie )
                    db.session.commit()
                except Exception as e:
                    app.logger.info( e )
                    continue
            return True
    
        def getHttpContent(self,url):
            try:
                headers = {
                    'Content-Type': 'text/html;charset=utf-8',
                    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36',
                    'Referer': "https://dianying.2345.com/list/",
                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
                }
                # a 10-second timeout keeps the job from hanging on a stalled connection
                r = requests.get(url, headers=headers, timeout=10)
                if r.status_code != 200 :
                    return None
    
                return r.text
    
            except Exception:
                return None
    
        def saveContent(self,path,content):
            if content:
                with open( path,mode="w+",encoding="utf-8" ) as f:
                    if not isinstance( content,str ):
                        content = content.decode("utf-8")
                    f.write( content )
    
        def getContent(self,path):
            if os.path.exists( path ):
                with open( path,"r",encoding="utf-8" ) as f:
                    return f.read()
    
            return ''
    
        def makeSuredirs(self,path):
            if not os.path.exists( path ):
                os.makedirs( path )
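
    The listing imports getCurrentTime from common.libs.DataHelper, a small helper that ships with the course project. If you are following along without that file, a minimal sketch of what the job relies on, namely a current-time formatter with an optional frm format string, could look like this (the default format below is an assumption, not the course's original code):

    # common/libs/DataHelper.py -- minimal sketch, assumed to mirror the course helper
    import datetime

    def getCurrentTime(frm="%Y-%m-%d %H:%M:%S"):
        # format the current local time, e.g. getCurrentTime(frm="%Y%m%d") -> "20220409"
        return datetime.datetime.now().strftime(frm)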

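    The Movie model comes from common.models.movie, which is defined elsewhere in the course project. Judging from the fields the job fills in, a Flask-SQLAlchemy sketch of the columns it would need might look like the following; the column types and lengths are assumptions, not the project's actual schema:

    # common/models/movie.py -- field list inferred from the job above, types are guesses
    from application import db

    class Movie(db.Model):
        __tablename__ = "movie"
        id = db.Column(db.Integer, primary_key=True)
        hash = db.Column(db.String(32), index=True)          # md5 of the detail url, used for dedup
        name = db.Column(db.String(200), default="")
        url = db.Column(db.String(500), default="")           # detail-page url
        vid_url = db.Column(db.String(500), default="")
        magnet_url = db.Column(db.String(1000), default="")
        cover_pic = db.Column(db.String(500), default="")
        pics = db.Column(db.Text)                              # JSON-encoded list of poster urls
        desc = db.Column(db.Text)
        classify = db.Column(db.String(100), default="")
        actor = db.Column(db.String(500), default="")
        pub_date = db.Column(db.String(20), default="")
        source = db.Column(db.String(50), default="")
        created_time = db.Column(db.DateTime)                  # the job passes a formatted time string
        updated_time = db.Column(db.DateTime)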

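    The example command in the module docstring assumes the course project's manager.py exposes a runjob command that loads the named job module and hands the -a argument to JobTask().run(). If your copy of the project does not have that command yet, a minimal stand-in is sketched below; the jobs.tasks package path and the argument names are assumptions about the project layout, not part of the original post:

    # manager.py -- minimal stand-in for the course's "runjob" command (assumed layout)
    import argparse
    from importlib import import_module

    def runjob(module_name, act):
        # import e.g. jobs/tasks/movie2.py and run its JobTask with the requested action
        job_module = import_module("jobs.tasks." + module_name)
        job_module.JobTask().run({"act": act})

    if __name__ == "__main__":
        parser = argparse.ArgumentParser(description="run a background job")
        parser.add_argument("command", choices=["runjob"])
        parser.add_argument("-m", "--module", required=True, help="job module name, e.g. movie2")
        parser.add_argument("-a", "--act", required=True, help="action: list or parse")
        args = parser.parse_args()
        runjob(args.module, args.act)

    With a dispatcher like this in place, python manager.py runjob -m movie2 -a list downloads the list and detail pages and then parses them into the database, while -a parse only re-parses detail pages already cached under /tmp/2345movie/<date>/.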

  • Original post: https://www.cnblogs.com/apanly/p/16110195.html