Code:
import json
import re
from multiprocessing import Pool

import requests
from requests.exceptions import RequestException

basic_url = 'http://maoyan.com/board/4?offset=%d'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36',
}
file = open("maoyan_movies.txt", 'a', encoding="utf-8")


def get_page(url):
    """Fetch a page and return its HTML text, or None on failure."""
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == requests.codes.ok:
            return response.text
        return None
    except RequestException:
        return None


def parse_page(content):
    """Yield one dict per movie entry (<dd> block) found in the page HTML."""
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>'
        r'.*?<img data-src="(.*?)"'
        r'.*?class="name"><a.*?>(.*?)</a>'
        r'.*?class="star">(.*?)</p>'
        r'.*?class="releasetime">(.*?)</p>'
        r'.*?class="score"><i class="integer">(.*?)</i><i class="fraction">(.*?)</i>'
        r'.*?</dd>', re.S)
    items = pattern.findall(content)
    for item in items:
        yield {
            'id': item[0],
            'image': item[1],
            'name': item[2].strip(),
            'actor': item[3].strip()[3:],   # drop the "主演:" prefix
            'releasetime': item[4][5:],     # drop the "上映时间:" prefix
            'score': item[5] + item[6],     # integer part + fraction part
        }


def save_to_file(content):
    """Append one movie record as a JSON line to the output file."""
    json.dump(content, file, ensure_ascii=False)
    file.write('\n')


def get_page_movies(offset):
    '''
    Fetch the movie information on one page.
    offset is used to build the full page URL, in steps of 10.
    '''
    step = 10
    url = basic_url % (step * offset)
    html = get_page(url)
    for movie_info in parse_page(html):
        save_to_file(movie_info)


# Fetch the info of Maoyan's top 100 movies: rank, poster URL, title, cast, release date, score
def get_top_100_movies():
    offset_list = [i for i in range(10)]
    pool = Pool(processes=4)
    pool.map(get_page_movies, offset_list)
    pool.close()
    pool.join()


if __name__ == "__main__":
    get_top_100_movies()
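As a quick sanity check of the regular expression and the field slicing, the snippet below (assumed to run in the same module as the code above) feeds parse_page a hand-written <dd> fragment shaped like the markup the pattern expects; the tag attributes and field values are illustrative, not copied from the live page:

# Minimal sanity check for parse_page; the sample HTML is a hypothetical
# fragment built to match the regex, not real Maoyan markup.
sample_html = (
    '<dd>'
    '<i class="board-index board-index-1">1</i>'
    '<img data-src="http://example.com/poster.jpg">'
    '<p class="name"><a href="/films/1">霸王别姬</a></p>'
    '<p class="star">主演:张国荣,张丰毅,巩俐</p>'
    '<p class="releasetime">上映时间:1993-01-01</p>'
    '<p class="score"><i class="integer">9.</i><i class="fraction">6</i></p>'
    '</dd>'
)

for movie in parse_page(sample_html):
    print(movie)
# Expected output, one dict per <dd> block:
# {'id': '1', 'image': 'http://example.com/poster.jpg', 'name': '霸王别姬',
#  'actor': '张国荣,张丰毅,巩俐', 'releasetime': '1993-01-01', 'score': '9.6'}

Running the full script appends records like the one printed above to maoyan_movies.txt, one JSON object per line.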