#coding:utf-8
from urllib.parse import urljoin

import requests
from lxml import etree
# Site root that scraped hrefs are combined with to form full URLs.
BASE_DOMAIN = "http://www.8080s.net/"

# Address of a single listing page (page 2). None of the functions visible
# in this file read this module-level name; they use their own ``url``.
url = "http://www.8080s.net/dm/list/----14--p2"

# HTTP headers sent with every request; only a browser User-Agent is set.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/79.0.3945.130 Safari/537.36"
    ),
}
def get_detail_urls(url, timeout=10):
    """Collect the detail-page URLs linked from one listing page.

    Args:
        url: URL of the listing page to download.
        timeout: Seconds to wait for the server before ``requests`` gives
            up, so a stalled connection cannot hang the crawl forever.

    Returns:
        A list of absolute URLs, one for every ``href`` found on ``<a>``
        elements under ``<ul class="me1 clearfix">``. The list is empty
        when the page contains no such links.

    Raises:
        requests.exceptions.RequestException: If the download fails or
            times out.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    html = etree.HTML(response.text)
    hrefs = html.xpath("//ul[@class='me1 clearfix']//a/@href")
    # urljoin copes with root-relative ("/x"), relative ("x") and already
    # absolute hrefs; plain string concatenation with BASE_DOMAIN (which
    # ends in "/") produced "//x" or a doubled host in those cases.
    return [urljoin(BASE_DOMAIN, href) for href in hrefs]
# Fetch the data from a detail (content) page
def parse_detail_page(url, timeout=10):
    """Download one detail page and extract its title and update text.

    Args:
        url: URL of the detail page to download.
        timeout: Seconds to wait for the server before ``requests`` gives
            up, so a stalled connection cannot hang the crawl forever.

    Returns:
        A dict with two keys, each holding a list of strings:
        ``'title'`` - text nodes that are direct children of
        ``<div class="info">`` elements;
        ``'update'`` - every text node found under ``<span class="tip">``
        elements.
        A list is empty when its query matches nothing.

    Raises:
        requests.exceptions.RequestException: If the download fails or
            times out.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    html = etree.HTML(response.text)
    return {
        # ``text()`` selects text nodes. The previous step ``/text`` looked
        # for child elements literally named <text>, so the title was
        # always an empty list.
        'title': html.xpath("//div[@class='info']/text()"),
        'update': html.xpath("//span[@class='tip']//text()"),
    }
# Fetch the data from the listing pages
def spider(first_page=2, last_page=8):
    """Crawl a range of listing pages and gather every detail record.

    Each listing page is fetched with ``get_detail_urls``; every URL it
    yields is passed to ``parse_detail_page``. Each resulting record is
    printed as soon as it is parsed and also collected.

    Args:
        first_page: Number of the first listing page to crawl (inclusive).
        last_page: Number of the last listing page to crawl (inclusive).
            The defaults cover pages 2 through 8, the range that used to
            be hard-coded.

    Returns:
        A list of the parsed records, in crawl order. Previously the list
        was built and then discarded.
    """
    base_url = "http://www.8080s.net/dm/list/----14--p{}"
    movies = []
    for page in range(first_page, last_page + 1):
        # Named page_url so it does not shadow the module-level ``url``.
        page_url = base_url.format(page)
        for detail_url in get_detail_urls(page_url):
            movie = parse_detail_page(detail_url)
            movies.append(movie)
            print(movie)
    return movies
# Start the crawl only when this file is executed directly as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    spider()