# 导入我们需要的模块 import re import requests # 一、获取网页内容 # (1)声明目标url,就是爬取的网站地址 base_url = "http://maoyan.com/board" # (2)模仿浏览器 headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'} # (3)发起请求 response = requests.get(base_url, headers=headers) # (4)接收响应的数据 html = response.text # (5)将接收的数据写入 with open("maoyan.html", 'w', encoding='utf-8') as f: f.write(html) # 2.提取数据 # (1)缩小范围(通过正则获取数据) pattern = re.compile(r'<dd>.*?</dd>',re.S) movie_list = pattern.findall(html) # (2) 分别拿取每部电影中的数据 for movie in movie_list: # print(movie) # 获取排名信息 pattern = re.compile(r'<i class="board-index board-index-[d]*">(d{1,2})</i>') index = pattern.findall(movie)[0] index = '排名:' + index print(index) # 获取电影名称信息 pattern = re.compile(r'title="(.*?)"') title = pattern.findall(movie)[0] title = '电影名称:' + title print(title) # 获取图片信息 pattern = re.compile(r'<img data-src="(.*?)@') img = pattern.findall(movie)[0] img = '图片:' + img print(img) # 获取主演信息 pattern = re.compile(r'<p class="star">([wW]*?)</p>') star = pattern.findall(movie)[0].strip() print(star) # 获取上映时间信息 pattern = re.compile(r'<p class="releasetime">(.*?)</p>') releaseTime = pattern.findall(movie)[0] print(releaseTime) # 获取评分信息 pattern = re.compile(r'<p class="score"><i class="integer">(d+.)</i><i class="fraction">(d)</i></p> ') score = pattern.findall(movie) score = '评分:' + score[0][0]+score[0][1] print(score) # 将所有信息拼接成字符串 result = index + ' ' + title + ' ' + img + ' ' + star + ' ' + releaseTime + ' ' + score + ' ' # 将所有信息存入文档 with open("maoyan.txt", 'a+', encoding='utf-8') as f: f.write(result)