1 import requests
2 import re, json
3
4
5 # 请求网页,返回响应体
6 def get_one_page(url):
7 headers = {
8 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
9 }
10 response = requests.get(url, headers=headers)
11 if response.status_code == 200:
12 return response.text
13 return None
14
15
16 # 使用正则将需要的数据剥离
17 def parse_one_page(html):
18 pattern = re.compile(
19 '<dd>.*?board-index.*?>(.*?)</i>.*?data-src="(.*?)"'
20 + '.*?name.*?a.*?>(.*?)</a>.*?star.*?>(.*?)</p>'
21 + '.*?releasetime.*?>(.*?)</p>.*?integer.*?>(.*?)</i>'
22 + '.*?fraction.*?>(.*?)</i>.*?</dd>',
23 re.S)
24 res = re.findall(pattern, html)
25 for i in res:
26 yield {
27 '排名': i[0],
28 '封面': i[1],
29 '影片名称': i[2],
30 '主演': i[3].strip()[3:],
31 '上映时间': i[4].strip()[5:],
32 '评分': i[5].strip() + i[6].strip(),
33 }
34
35
36 # 将结果写入到文件
37 def write_to_file(content):
38 with open('猫眼电影排行TOP100.txt', 'a', encoding='utf-8') as f:
39 f.write(json.dumps(content, ensure_ascii=False) + '
')
40
41
42 def main(offset):
43 url = 'https://maoyan.com/board/4?offset=' + str(offset)
44 html = get_one_page(url)
45 for item in parse_one_page(html):
46 print(item)
47 write_to_file(item)
48
49
50 if __name__ == '__main__':
51 for i in range(10):
52 main(offset=i * 10)
问题:
转载地址 https://www.jianshu.com/p/86d66257de41