• 14 Scraping movie info from Movie Paradise (电影天堂)


      1 """电影天堂爬虫"""
      2 
      3 
      4 import requests
      5 from lxml import etree
      6 
      7 BASE_DOMAIN = 'https://dytt8.net/'
      8 HEADERS = {
      9     'User-Agent': 'Mozilla/5.0'
     10 }

    def get_detail_urls(url):
        """Fetch a list page and return the links to all movie detail pages on it."""
        response = requests.get(url, headers=HEADERS)
        # The site is GBK-encoded and contains a few characters GBK cannot
        # represent, so ignore decoding errors.
        text = response.content.decode('GBK', errors='ignore')
        html = etree.HTML(text)
        # Each movie on the list page links to its detail page from a table
        # of class 'tbspan'; make the relative links absolute.
        detail_urls = html.xpath("//table[@class='tbspan']//a/@href")
        return [BASE_DOMAIN + u for u in detail_urls]

    def parse_detail_page(detail_url):
        """Scrape one detail page and return the movie's information as a dict."""
        movie = {}      # collected movie information
        r = requests.get(detail_url, headers=HEADERS)
        text = r.content.decode('GBK', errors='ignore')
        html = etree.HTML(text)
        title = html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")[0]
        movie['title'] = title
        zoomE = html.xpath("//div[@id='Zoom']")[0]
        imgs = zoomE.xpath(".//img/@src")
        movie['cover'] = imgs[0]
        # Not every detail page includes a screenshot, so indexing imgs[1]
        # unconditionally can raise IndexError.
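        # Guarded version (an addition; the original left this commented out):
        # store a screenshot only when a second image actually exists.
        if len(imgs) > 1:
            movie['screenshot'] = imgs[1]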

        def parse_info(info, rule):
            # Strip the '◎...' label and surrounding whitespace,
            # e.g. parse_info("◎年  代 2020", "◎年  代") -> "2020".
            return info.replace(rule, "").strip()

        # Every text node inside the Zoom div; each labeled field is one
        # line starting with '◎'.
        infos = zoomE.xpath(".//text()")
        actors = []
        for index, info in enumerate(infos):
            if info.startswith("◎年  代"):
                movie['year'] = parse_info(info, "◎年  代")
            elif info.startswith("◎产  地"):
                movie['country'] = parse_info(info, "◎产  地")
            elif info.startswith("◎类  别"):
                movie['category'] = parse_info(info, "◎类  别")
            elif info.startswith("◎豆瓣评分"):
                movie['douban_rating'] = parse_info(info, "◎豆瓣评分")
            elif info.startswith("◎片  长"):
                movie['duration'] = parse_info(info, "◎片  长")
            elif info.startswith("◎导  演"):
                movie['director'] = parse_info(info, "◎导  演")
            elif info.startswith("◎主  演"):
                # First line of the cast list
                info = parse_info(info, "◎主  演")
                actors = [info]
                # The cast spans several lines; keep collecting until the
                # next '◎'-labeled field begins.
                for x in range(index + 1, len(infos)):
                    actor = infos[x].strip()
                    if actor.startswith("◎"):
                        break
                    # Skip blank lines between cast members.
                    if actor != "":
                        actors.append(actor)
                movie['actors'] = actors

        # Download-link texts from the table, plus the first link URL under
        # the Zoom div.
        movie['download'] = html.xpath("//div[@id='Zoom']//tbody//a/text()")
        movie['magnet'] = html.xpath("//div[@id='Zoom']//a/@href")[0]

        return movie

    def spider():
        """Crawl the first two list pages and parse every movie on them."""
        base_url = "https://dytt8.net/html/gndy/dyzz/list_23_{}.html"
        movies = []     # all scraped movie dicts
        count = 0       # number of movies scraped
        for x in range(1, 3):
            # One list page per iteration
            url = base_url.format(x)
            detail_urls = get_detail_urls(url)
            for detail_url in detail_urls:
                # Parse every detail page on this list page
                movies.append(parse_detail_page(detail_url))
                count += 1
            print(movies)
            print(count)
        return movies


    if __name__ == '__main__':
        spider()
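
Since spider() returns the collected list, the results can be persisted rather than only printed. A minimal sketch, not part of the original code; the save_movies helper and the movies.json file name are assumptions:

    import json

    def save_movies(movies, path='movies.json'):
        """Write the scraped movie dicts to a UTF-8 JSON file."""
        # ensure_ascii=False keeps the Chinese titles readable in the file.
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(movies, f, ensure_ascii=False, indent=2)

    # Usage: save_movies(spider())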

    From a Python web-scraping video course

  • Original post: https://www.cnblogs.com/sruzzg/p/13083894.html