>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
声明:仅学习参考
版本:verison_0
效果图:
说明:city_name需要手动输入,即当前城市的url需要手动构造
version_1待实现的功能:(1)通过输入城市名称,即可对应于对应url,而无需手动构造
(2)多线程(在循环电影数据的时候,可构造一个线程,通过for的方式,生成多线程,注意设置为守护线程,join)
流程:(1)输入城市名字(注意热门城市,是China_Beijing字样,非热门城市是China_Hubei_Province_Huanggang字样)
(2)获取当前目录下的所有上映的热门电影(如果不需要提取图片等信息,可直接抓取“<script type="text/javascript">var hotplaySvList”下的信息,包括id,url和title
(3)通过获取到的url和id构造详情页请求,获取主演,类型,描述,热门评论等信息
(4)票房或评分等信息在ajax请求(Movie.api....)中,通过id可构造相应的请求
注意点:(1)在每次请求前带上referer
(2)请求的频率不宜过快,且应该随机
(3)输入city_name后注意首次发起的请求是否成功,如果不成功,说明url没有构造好,应该在网站上找出规律,如.../China_Hubei_Province_Huanggang字样
源码:
from urllib import parse,error import requests from lxml import etree import re import json from pprint import pprint import time,random class MtimeSpider(object): def __init__(self): self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36" def parse_ajax_content(self,response): totalboxoffice = re.findall(r'"TotalBoxOffice":"(.*?)"',string=response,flags=re.S) totalboxoffice = totalboxoffice[0] if totalboxoffice else None totalboxofficeunit = re.findall(r'"TotalBoxOfficeUnit":"(.*?)"',string=response,flags=re.S) # 获取单位 totalboxofficeunit = totalboxofficeunit[0] if totalboxofficeunit else None if totalboxoffice is not None and totalboxofficeunit is not None: totalboxoffice = totalboxoffice+totalboxofficeunit ratingfinall = re.findall(r'"RatingFinal":(.*?),',string=response,flags=re.S) ratingfinall = ratingfinall[0] if ratingfinall else None return totalboxoffice,ratingfinall def parse_movie_detail(self,response): html = etree.HTML(response) # 获取分类和上映日期 cate = str() category = html.xpath("//div[contains(@class,'db_head')]/div[last()]/a/text()") for str_ in category[0:-1]: cate += "/"+str_ showdata = category[-1] # 获取描述 desc = html.xpath('//p[contains(@class,"mt6 lh18")]/text()') description = desc[0] if desc else None # 获取主演 main_actor= html.xpath("//dl[contains(@class,'main_actor')]/dd/a/img/@alt") return cate,showdata,description,main_actor def parse_hotplay(self,hot_play_list,referer): items_list = list() for hot_play in hot_play_list: item = dict() item['title'] = hot_play['Title'] item['href'] = hot_play['Url'] # 获取到单个电影的分类,描述,主演,总票房和评分 ## 获取电影详情页的响应 ### 获取分类,上映时间,描述,主演 movie_html_str = self.get_request(url=hot_play['Url'],referer=referer) category,showdata,description,main_actor = self.parse_movie_detail(movie_html_str) item["category"] = category item['showdata'] = showdata item['description'] = description item['main_actor'] = main_actor items_list.append(item) # 获取总票房和评分 ajax_url = "http://service.library.mtime.com/Movie.api?Ajax_CallBack=true&Ajax_CallBackType=Mtime.Library.Services&Ajax_CallBackMethod=GetMovieOverviewRating&Ajax_CallBackArgument0={}".format(hot_play['Id']) ajax_content = self.get_request(url=ajax_url,referer=hot_play['Url']) totalboxoffice,ratingfinall = self.parse_ajax_content(response=ajax_content) item['totalboxoffice'] = totalboxoffice item['ratingfinall'] = ratingfinall time.sleep(random.random()) return items_list def parse_city_request(self,response): useful_info = re.findall(pattern=r'hotplaySvLists+=s+(.*?);',string=response,flags=re.S) useful_info = useful_info[0] if useful_info else None return json.loads(s=useful_info,encoding='utf-8') def get_request(self,url,referer=None): headers = { "User-Agent":self.user_agent, "Referer":referer, } try: response = requests.get(url=url,headers=headers) if response.status_code == 200 and response.text is not None: return response.content.decode() except error.HTTPError as e: print(e) def run(self,city_name): # 获取当前城市正在热映的电影 url = "http://theater.mtime.com/China_{}/".format(city_name) city_html_str = self.get_request(url,referer='http://www.mtime.com/') hot_play_list = self.parse_city_request(city_html_str) # 拿到当前城市的所有热映电影,返回列表 # 进入电影详情页,获取电影标题,分类,描述,主演,总票房 items_list = self.parse_hotplay(hot_play_list,referer=url) pprint(items_list) if __name__=="__main__": # city_name = input("请输入城市(拼音形式,首字母大写):") city_name = "Huangshi" obj = MtimeSpider() obj.run(city_name=city_name)
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<