import re
from urllib.parse import urljoin

from lxml import etree
import requests


baseurl = 'https://www.dytt8.net'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    'Referer': 'https://www.dytt8.net/html/gndy/dyzz/index.html'
}

# Seconds to wait for the server before giving up; without a timeout a
# stalled connection would block the crawl forever.
REQUEST_TIMEOUT = 15

# Listing page URL; ``{}`` is replaced by the page number.
_LIST_URL = 'https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'

# (label as it appears on the detail page, key used in the result dict)
_FIELD_LABELS = (
    ('◎片 名', 'pianming'),              # title
    ('◎年 代', 'niandai'),               # year
    ('◎产 地', 'chandi'),                # country/region of origin
    ('◎类 别', 'leixing'),               # genre
    ('◎上映日期', 'shangyingshijian'),   # release date
    ('◎豆瓣评分', 'doubanpingfen'),      # Douban rating
    ('◎片 长', 'pianchang'),             # running time
    ('◎标 签', 'biaoqian'),              # tags
)


def _label_pattern(label):
    """Compile a regex matching *label* at the start of a text node.

    Whitespace inside the label (and before it) is optional and may be any
    Unicode whitespace, so differences in padding between label characters
    do not cause a field to be missed.
    """
    chars = ''.join(label.split())
    return re.compile(r'\s*' + r'\s*'.join(re.escape(ch) for ch in chars))


_FIELD_PATTERNS = tuple(
    (_label_pattern(label), key) for label, key in _FIELD_LABELS
)


def agent(ur):
    """Fetch *ur* and return the page parsed into an lxml HTML tree.

    The body is taken from ``resp.text``, i.e. decoded with whatever
    encoding requests selects for the response.
    """
    resp = requests.get(ur, headers=headers, timeout=REQUEST_TIMEOUT)
    return etree.HTML(resp.text)


def movie_url_list(html):
    """Return the ``href`` of every link inside a ``table.tbspan`` element."""
    return html.xpath("//table[@class='tbspan']//a/@href")


def parse_info(info, rule):
    """Remove every occurrence of *rule* from *info* and strip whitespace."""
    return info.replace(rule, '').strip()


def xiangqingye(url):
    """Scrape one movie detail page and return its fields as a dict.

    The page bytes are decoded as GBK; undecodable bytes are dropped rather
    than raising ``UnicodeDecodeError``, so one bad byte does not abort the
    crawl. Every text node under ``div#Zoom`` is checked against the known
    field labels, and the remainder of a matching node becomes the value.

    Returns:
        dict mapping the keys listed in ``_FIELD_LABELS`` to strings. Only
        fields found on the page are present.
    """
    resp = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    text = resp.content.decode('gbk', errors='ignore')
    html = etree.HTML(text)

    movie = {}
    for info in html.xpath('//div[@id="Zoom"]//text()'):
        for pattern, key in _FIELD_PATTERNS:
            matched = pattern.match(info)
            if matched:
                # Strip the label exactly as it was written on the page.
                movie[key] = parse_info(info, matched.group(0))
                # A text node carries at most one field; stop at the first
                # label that matches.
                break
    return movie


def alldata(pages=range(1, 2)):
    """Crawl the listing pages and return the details of every movie found.

    Args:
        pages: iterable of listing page numbers to crawl. Defaults to the
            first page only.

    Returns:
        list of dicts as produced by ``xiangqingye``, in page order.
    """
    movies = []
    for page in pages:
        listing = agent(_LIST_URL.format(page))
        for href in movie_url_list(listing):
            # urljoin resolves site-relative hrefs against baseurl and
            # leaves absolute hrefs untouched.
            movies.append(xiangqingye(urljoin(baseurl, href)))
    return movies


if __name__ == '__main__':
    print(alldata())