1.需求描述
爬取 hao6v 电影网的数据：先用 XPath 解析列表页，获取每部电影详情页的 URL；再逐个请求并解析这些详情页，提取所需的电影信息（片名、年代、IMDb 评分、片长、导演、主演）。
页面如下:
2.实现代码
# Author:Logan
import requests
from lxml import etree
# HTTP headers passed to every requests.get() call in this file.
# The User-Agent is a desktop Chrome 78 (Windows 10) string -- presumably so
# the site serves the scraper the same pages as a regular browser (TODO confirm).
HEADERS = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}
def get_detail_urls(url):
    """Fetch a listing page and return the detail-page URLs it links to.

    Args:
        url: Address of the listing page to download.

    Returns:
        A list with one absolute URL per ``<a href>`` found directly under
        a ``<li>`` of ``<ul class="list">``, in document order. The list is
        empty when the page contains no such links.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    from urllib.parse import urljoin

    # Without a timeout requests waits forever on a stalled connection.
    response = requests.get(url, headers=HEADERS, timeout=10)
    # Fail loudly on 4xx/5xx instead of parsing an error page as a listing.
    response.raise_for_status()

    html = etree.HTML(response.text)
    hrefs = html.xpath("//ul[@class='list']/li/a/@href")
    # Resolve against the listing URL so relative hrefs become fetchable;
    # absolute hrefs keep pointing at the same address.
    return [urljoin(url, href) for href in hrefs]
def parse_detail_page(detail_url):
    """Download one movie detail page and extract its key fields.

    The fields are read from the text nodes of the ``<p>`` elements inside
    ``<div id="endText">``; each field is introduced by a label that starts
    with the "◎" marker.

    Args:
        detail_url: Address of the movie detail page.

    Returns:
        A dict describing the movie. ``'title'`` is always present (``None``
        when the page has no title link). ``'year'``, ``'IMDBscore'``,
        ``'duration'``, ``'direction'`` (strings) and ``'actors'`` (list of
        strings) are present only when the matching label is found.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Without a timeout requests waits forever on a stalled connection.
    response = requests.get(detail_url, headers=HEADERS, timeout=10)
    response.raise_for_status()
    # The page is decoded as GBK; replace undecodable bytes so that a single
    # bad byte does not abort parsing of the whole page.
    html_str = response.content.decode('GBK', errors='replace')
    html = etree.HTML(html_str)

    # Dictionary holding the movie information.
    movie = dict()

    # Movie title: tolerate a missing node instead of raising IndexError.
    titles = html.xpath("//div[@id='endText']/strong/a/text()")
    movie['title'] = titles[0] if titles else None

    # Single-line fields: label prefix -> key in the result dict.
    simple_fields = (
        ("◎年 代", 'year'),
        ("◎IMDb评分", 'IMDBscore'),
        ("◎片 长", 'duration'),
        ("◎导 演", 'direction'),
    )
    actors_label = "◎主 演"

    infos = [text.strip() for text in html.xpath("//div[@id='endText']/p/text()")]
    for index, info in enumerate(infos):
        if info.startswith(actors_label):
            # The cast spans several text nodes: the rest of the label line
            # plus every following line up to the next "◎" label.
            candidates = [info[len(actors_label):].strip()]
            for follower in infos[index + 1:]:
                if follower.startswith("◎"):
                    break
                candidates.append(follower)
            # Drop whitespace-only nodes so the list holds names only.
            movie['actors'] = [name for name in candidates if name]
            continue

        for label, key in simple_fields:
            if info.startswith(label):
                movie[key] = info[len(label):].strip()
                break

    return movie
def main():
    """Scrape the movie listing pages and print the parsed records.

    For each listing page, collect the detail-page URLs, parse every detail
    page, and print the resulting list of movie dicts. Only page 1 is
    visited at present, because the loop runs over ``range(1, 2)``.
    """
    url_template = 'http://www.hao6v.net/dy/index_{}.html'
    for page_no in range(1, 2):
        # Step 1: build the listing URL. The first page has no numeric
        # suffix (index.html); later pages are index_<n>.html.
        page_url = (
            url_template.replace('_{}', '')
            if page_no == 1
            else url_template.format(page_no)
        )
        # Steps 2 and 3: fetch the detail URLs, then parse each detail page.
        movie_detail_info = [
            parse_detail_page(detail_url)
            for detail_url in get_detail_urls(page_url)
        ]
        print(movie_detail_info)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
运行结果如下: