• Python 爬虫系列04-电影天堂链接爬虫


    学习的第四个爬虫

      

    from lxml import etree
    import requests
    # Site root; prepended to the hrefs scraped from the listing pages.
    BASE_D = "http://www.dytt8.net"

    # Request headers sent with every HTTP call made by this script.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:62.0) "
            "Gecko/20100101 Firefox/62.0"
        ),
    }
    def get_detail_urls(url):
        """Return the detail-page URLs linked from one listing page.

        Args:
            url: URL of a listing page to download.

        Returns:
            A list of URLs, one per ``href`` found on an ``<a>`` element
            inside a ``<table class="tbspan">``, each prefixed with
            ``BASE_D``.

        Raises:
            requests.RequestException: if the download fails or exceeds the
                timeout.
        """
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(url, headers=headers, timeout=10)
        html = etree.HTML(response.text)
        hrefs = html.xpath("//table[@class='tbspan']//a/@href")
        # Build a real list (not a one-shot ``map`` iterator) so the result
        # can be inspected, measured or iterated more than once.
        return [BASE_D + href for href in hrefs]
    def _parse_movie_infos(infos):
        """Extract the labelled metadata fields from a list of text nodes.

        Each entry of *infos* is checked for a leading ``◎`` label.  Single
        line fields are stored under their own key; the cast and the synopsis
        continue over the following entries until the next section marker.

        Args:
            infos: sequence of strings (the text nodes of the detail block).

        Returns:
            A dict containing whichever of ``year``, ``country``,
            ``category``, ``douban_rating``, ``duration``, ``director``,
            ``actors`` (list of str) and ``profile`` (str) were found.
        """
        simple_fields = (
            ("◎年  代", "year"),
            ("◎产  地", "country"),
            ("◎类  别", "category"),
            # Stored under its own key so it no longer overwrites the genre.
            ("◎豆瓣评分", "douban_rating"),
            ("◎片  长", "duration"),
            ("◎导  演", "director"),
        )
        actors_label = "◎主  演"
        # Matched without a trailing space; ``strip()`` removes any padding.
        profile_label = "◎简  介"

        fields = {}
        for index, info in enumerate(infos):
            if info.startswith(actors_label):
                # The first actor shares the label's line; the rest follow,
                # one per text node, until the next "◎" field starts.
                actors = [info.replace(actors_label, "").strip()]
                for line in infos[index + 1:]:
                    actor = line.strip()
                    if actor.startswith("◎"):
                        break
                    if actor:
                        actors.append(actor)
                fields['actors'] = actors
            elif info.startswith(profile_label):
                # Collect the synopsis paragraphs that follow the label.
                # NOTE(review): assumes the synopsis is followed by another
                # "◎" field or a "【...】" section header -- confirm against
                # the live page markup.
                paragraphs = []
                for line in infos[index + 1:]:
                    text = line.strip()
                    if text.startswith(("◎", "【")):
                        break
                    if text:
                        paragraphs.append(text)
                fields['profile'] = "".join(paragraphs)
            else:
                for label, key in simple_fields:
                    if info.startswith(label):
                        fields[key] = info.replace(label, "").strip()
                        break
        return fields


    def parse_detail_page(url):
        """Download one movie detail page and return its data as a dict.

        Args:
            url: URL of the detail page.

        Returns:
            A dict with ``title``, ``cover`` and ``download_url`` (each
            ``None`` when the corresponding element is absent from the page)
            plus the metadata keys produced by ``_parse_movie_infos``.

        Raises:
            requests.RequestException: if the download fails or exceeds the
                timeout.
        """
        response = requests.get(url, headers=headers, timeout=10)
        # Decode explicitly as GBK; undecodable bytes are replaced instead of
        # aborting the whole page with UnicodeDecodeError.
        text = response.content.decode('gbk', errors='replace')
        html = etree.HTML(text)

        movie = {}
        titles = html.xpath(
            "//div[@class='title_all']//font[@color='#07519a']/text()"
        )
        movie['title'] = titles[0] if titles else None

        zoom_nodes = html.xpath("//div[@id='Zoom']")
        if zoom_nodes:
            zoom = zoom_nodes[0]
            imgs = zoom.xpath(".//img/@src")
            # The first image in the block is used as the cover.
            movie['cover'] = imgs[0] if imgs else None
            movie.update(_parse_movie_infos(zoom.xpath(".//text()")))
        else:
            movie['cover'] = None

        download_urls = html.xpath("//td[@bgcolor='#fdfddf']/a/@href")
        movie['download_url'] = download_urls[0] if download_urls else None
        return movie
    def spider(start_page=1, end_page=7):
        """Crawl a range of listing pages and collect every movie found.

        Each listing page is expanded into its detail-page URLs, every detail
        page is parsed, and the resulting dict is printed as it is scraped.

        Args:
            start_page: first listing page number to fetch (inclusive).
            end_page: last listing page number to fetch (inclusive).

        Returns:
            A list of the movie dicts, in the order they were scraped.
        """
        base_url = "http://www.dytt8.net/html/gndy/dyzz/list_23_{}.html"
        movies = []
        for page in range(start_page, end_page + 1):
            for detail_url in get_detail_urls(base_url.format(page)):
                movie = parse_detail_page(detail_url)
                movies.append(movie)
                print(movie)
        # Hand the collected results back so callers can use them.
        return movies
    # Start the crawl only when this file is executed as a script, so that
    # importing it does not trigger any network requests.
    if __name__ == "__main__":
        spider()
  • 相关阅读:
    C#多线程(16):手把手教你撸一个工作流
    C#多线程(15):任务基础③
    C#多线程(14):任务基础②
    C#多线程(13):任务基础①
    C#多线程(12):线程池
    C#多线程(11):线程等待
    C#多线程(10):读写锁
    C#多线程(9):多阶段并行线程
    C#多线程(8):线程完成数
    C#多线程(7):手动线程通知
  • 原文地址:https://www.cnblogs.com/kingle-study/p/9916157.html
Copyright © 2020-2023  润新知