• 14 Scraping movie info from Movie Paradise (电影天堂)


      1 """电影天堂爬虫"""
      2 
      3 
      4 import requests
      5 from lxml import etree
      6 
      7 BASE_DOMAIN = 'https://dytt8.net/'
      8 HEADERS = {
      9     'User-Agent': 'Mozilla/5.0'
     10 }

    def get_detail_urls(url):
        """Fetch a list page and return the links to all movie detail pages on it."""
        response = requests.get(url, headers=HEADERS)
        # The site is GBK-encoded and contains a few characters GBK cannot
        # represent, so ignore decoding errors.
        text = response.content.decode('GBK', errors='ignore')
        html = etree.HTML(text)
        # Each movie on the list page links to its detail page from a table
        # of class 'tbspan'; make the relative links absolute.
        detail_urls = html.xpath("//table[@class='tbspan']//a/@href")
        return [BASE_DOMAIN + u for u in detail_urls]

    def parse_detail_page(detail_url):
        """Scrape one detail page and return the movie's information as a dict."""
        movie = {}      # collected movie information
        r = requests.get(detail_url, headers=HEADERS)
        text = r.content.decode('GBK', errors='ignore')
        html = etree.HTML(text)
        title = html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")[0]
        movie['title'] = title
        zoomE = html.xpath("//div[@id='Zoom']")[0]
        imgs = zoomE.xpath(".//img/@src")
        movie['cover'] = imgs[0]
        # Not every detail page includes a screenshot, so indexing imgs[1]
        # unconditionally can raise IndexError.
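        # Guarded version (an addition; the original left this commented out):
        # store a screenshot only when a second image actually exists.
        if len(imgs) > 1:
            movie['screenshot'] = imgs[1]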

        def parse_info(info, rule):
            # Strip the '◎...' label and surrounding whitespace,
            # e.g. parse_info("◎年  代 2020", "◎年  代") -> "2020".
            return info.replace(rule, "").strip()

        # Every text node inside the Zoom div; each labeled field is one
        # line starting with '◎'.
        infos = zoomE.xpath(".//text()")
        actors = []
        for index, info in enumerate(infos):
            if info.startswith("◎年  代"):
                movie['year'] = parse_info(info, "◎年  代")
            elif info.startswith("◎产  地"):
                movie['country'] = parse_info(info, "◎产  地")
            elif info.startswith("◎类  别"):
                movie['category'] = parse_info(info, "◎类  别")
            elif info.startswith("◎豆瓣评分"):
                movie['douban_rating'] = parse_info(info, "◎豆瓣评分")
            elif info.startswith("◎片  长"):
                movie['duration'] = parse_info(info, "◎片  长")
            elif info.startswith("◎导  演"):
                movie['director'] = parse_info(info, "◎导  演")
            elif info.startswith("◎主  演"):
                # First line of the cast list
                info = parse_info(info, "◎主  演")
                actors = [info]
                # The cast spans several lines; keep collecting until the
                # next '◎'-labeled field begins.
                for x in range(index + 1, len(infos)):
                    actor = infos[x].strip()
                    if actor.startswith("◎"):
                        break
                    # Skip blank lines between cast members.
                    if actor != "":
                        actors.append(actor)
                movie['actors'] = actors

        # Download-link texts from the table, plus the first link URL under
        # the Zoom div.
        movie['download'] = html.xpath("//div[@id='Zoom']//tbody//a/text()")
        movie['magnet'] = html.xpath("//div[@id='Zoom']//a/@href")[0]

        return movie

    def spider():
        """Crawl the first two list pages and parse every movie on them."""
        base_url = "https://dytt8.net/html/gndy/dyzz/list_23_{}.html"
        movies = []     # all scraped movie dicts
        count = 0       # number of movies scraped
        for x in range(1, 3):
            # One list page per iteration
            url = base_url.format(x)
            detail_urls = get_detail_urls(url)
            for detail_url in detail_urls:
                # Parse every detail page on this list page
                movies.append(parse_detail_page(detail_url))
                count += 1
            print(movies)
            print(count)
        return movies


    if __name__ == '__main__':
        spider()
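
Since spider() returns the collected list, the results can be persisted rather than only printed. A minimal sketch, not part of the original code; the save_movies helper and the movies.json file name are assumptions:

    import json

    def save_movies(movies, path='movies.json'):
        """Write the scraped movie dicts to a UTF-8 JSON file."""
        # ensure_ascii=False keeps the Chinese titles readable in the file.
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(movies, f, ensure_ascii=False, indent=2)

    # Usage: save_movies(spider())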

    From a Python web-scraping video course

  • Original post: https://www.cnblogs.com/sruzzg/p/13083894.html