爬取mtime网目标城市的热映电影

>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

声明：仅学习参考

版本：verison_0

效果图：

说明：city_name需要手动输入，即当前城市的url需要手动构造

　　　version_1待实现的功能：（1）通过输入城市名称，即可对应于对应url，而无需手动构造

　　　　　　　　　　　　　　（2）多线程（在循环电影数据的时候，可构造一个线程，通过for的方式，生成多线程，注意设置为守护线程，join）

流程：（1）输入城市名字（注意热门城市，是China_Beijing字样，非热门城市是China_Hubei_Province_Huanggang字样）

　　　（2）获取当前目录下的所有上映的热门电影（如果不需要提取图片等信息，可直接抓取“<script type="text/javascript">var hotplaySvList”下的信息，包括id,url和title

　　　（3）通过获取到的url和id构造详情页请求，获取主演，类型，描述，热门评论等信息

　　　（4）票房或评分等信息在ajax请求（Movie.api....）中，通过id可构造相应的请求

注意点：（1）在每次请求前带上referer

　　　　（2）请求的频率不宜过快，且应该随机

　　　　（3）输入city_name后注意首次发起的请求是否成功，如果不成功，说明url没有构造好，应该在网站上找出规律，如.../China_Hubei_Province_Huanggang字样

源码：

from urllib import parse,error
import requests
from lxml import etree
import re
import json
from pprint import pprint
import time,random


class MtimeSpider(object):
    def __init__(self):
        self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"
    
    def parse_ajax_content(self,response):
        totalboxoffice = re.findall(r'"TotalBoxOffice":"(.*?)"',string=response,flags=re.S)
        totalboxoffice = totalboxoffice[0] if totalboxoffice else None
        totalboxofficeunit = re.findall(r'"TotalBoxOfficeUnit":"(.*?)"',string=response,flags=re.S) # 获取单位
        totalboxofficeunit = totalboxofficeunit[0] if totalboxofficeunit else None
        if totalboxoffice is not None and totalboxofficeunit is not None:
            totalboxoffice = totalboxoffice+totalboxofficeunit
        ratingfinall = re.findall(r'"RatingFinal":(.*?),',string=response,flags=re.S)
        ratingfinall = ratingfinall[0] if ratingfinall else None
        return totalboxoffice,ratingfinall

    def parse_movie_detail(self,response):
        html = etree.HTML(response)
        # 获取分类和上映日期
        cate = str()
        category = html.xpath("//div[contains(@class,'db_head')]/div[last()]/a/text()")
        for str_ in category[0:-1]:
            cate += "/"+str_
        showdata = category[-1]
        # 获取描述
        desc = html.xpath('//p[contains(@class,"mt6 lh18")]/text()')
        description = desc[0] if desc else None
        # 获取主演
        main_actor= html.xpath("//dl[contains(@class,'main_actor')]/dd/a/img/@alt")
        return cate,showdata,description,main_actor

    
    def parse_hotplay(self,hot_play_list,referer):
        items_list = list()
        for hot_play in hot_play_list:
            item = dict()
            item['title'] = hot_play['Title']
            item['href'] = hot_play['Url']
            # 获取到单个电影的分类，描述，主演，总票房和评分
            ## 获取电影详情页的响应
            ### 获取分类，上映时间，描述，主演
            movie_html_str = self.get_request(url=hot_play['Url'],referer=referer)
            category,showdata,description,main_actor = self.parse_movie_detail(movie_html_str)
            item["category"] = category
            item['showdata'] = showdata
            item['description'] = description
            item['main_actor'] = main_actor
            items_list.append(item)
            # 获取总票房和评分
            ajax_url = "http://service.library.mtime.com/Movie.api?Ajax_CallBack=true&Ajax_CallBackType=Mtime.Library.Services&Ajax_CallBackMethod=GetMovieOverviewRating&Ajax_CallBackArgument0={}".format(hot_play['Id'])
            ajax_content = self.get_request(url=ajax_url,referer=hot_play['Url'])
            totalboxoffice,ratingfinall = self.parse_ajax_content(response=ajax_content)
            item['totalboxoffice'] = totalboxoffice
            item['ratingfinall'] = ratingfinall
            time.sleep(random.random()) 
        return items_list
            
    def parse_city_request(self,response):
        useful_info = re.findall(pattern=r'hotplaySvLists+=s+(.*?);',string=response,flags=re.S)
        useful_info = useful_info[0] if useful_info else None
        return json.loads(s=useful_info,encoding='utf-8')

    def get_request(self,url,referer=None):
        headers = {
            "User-Agent":self.user_agent,
            "Referer":referer,
        }
        try:
            response = requests.get(url=url,headers=headers)
            if response.status_code == 200 and response.text is not None:
                return response.content.decode()
        except error.HTTPError as e:
            print(e)
        
    def run(self,city_name):
        # 获取当前城市正在热映的电影
        url = "http://theater.mtime.com/China_{}/".format(city_name)
        city_html_str = self.get_request(url,referer='http://www.mtime.com/')
        hot_play_list = self.parse_city_request(city_html_str)   # 拿到当前城市的所有热映电影，返回列表
        # 进入电影详情页，获取电影标题，分类，描述，主演，总票房
        items_list = self.parse_hotplay(hot_play_list,referer=url)
        pprint(items_list)

    
if __name__=="__main__":
    # city_name = input("请输入城市（拼音形式，首字母大写）:")
    city_name = "Huangshi"
    obj = MtimeSpider()
    obj.run(city_name=city_name)

<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

相关阅读:
OpenCvSharp 识别颜色
 树莓派 Raspberry Pi 4，.net core 3.0 ,Avalonia UI 开发
 iis频繁奔溃，分析dump
prerender.io 搜索引擎优化部署成windows服务实现开机自动开启服务
 ionic3包还原使用yarn命令执行步骤（解决ionic3懒加载报找不到 module的错误）
firebreath 在谷歌和火狐浏览器下的调试以及打包
 ASP.NET Web API 2 对 CORS 的支持
 .net mvc通过ucenter和 discuz的整合，nopcommerce ucenter 插件的方式实现
 EF + Mysql
Jenkins持续集成 & .NET
原文地址：https://www.cnblogs.com/nuochengze/p/13124526.html