• Crawler 11: scraping complex pages with lxml, 电影天堂 (dytt8.net)

The script below walks the paginated movie-list pages of 电影天堂, follows each detail link, and parses the "◎" metadata block of every movie into a dict using requests + lxml (XPath).


    import requests
    from lxml import etree

    # 电影天堂 site root; the list pages link to detail pages with relative paths
    url_domain = "https://www.dytt8.net"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
    }
    def get_detail_urls(url):
        """Fetch one list page and return the absolute URLs of every movie detail page on it."""
        response = requests.get(url, headers=headers)
        # The site serves GBK-encoded pages; ignore bytes that fail to decode
        text = response.content.decode("gbk", "ignore")
        html = etree.HTML(text)
        # Each movie row is a table.tbspan; its second <a> carries the detail link
        detail_urls = html.xpath("//table[@class='tbspan']//a[2]/@href")
        detail_urls = map(lambda path: url_domain + path, detail_urls)
        return detail_urls
    
    def parse_info(info, rule):
        """Strip a field label such as '◎年  代' from an info line and return the bare value."""
        return info.replace(rule, "").strip()
    
    def parse_detail_url(url="https://www.dytt8.net/html/gndy/dyzz/20200306/59787.html"):
        """Parse one detail page into a dict of movie metadata."""
        movie = {}
        response = requests.get(url, headers=headers)
        text = response.content.decode("gbk", "ignore")
        html = etree.HTML(text)
        # The title sits in the blue <font> inside div.title_all
        title = html.xpath("//div[@class='title_all']//font[@color='#07519a']//text()")[0]
        movie['title'] = title
        # div#Zoom holds the "◎" metadata lines; ".//text()" keeps the query relative to
        # that div (a leading "//" would search the whole document again)
        zoomE = html.xpath("//div[@id='Zoom']")[0]
        infos = zoomE.xpath(".//text()")
        # Walk the text lines and map each "◎" label to a dict key
        for index, info in enumerate(infos):
            if info.startswith("◎年  代"):
                info=parse_info(info,"◎年  代")
                movie['year']=info
            elif info.startswith("◎产  地"):
                info = parse_info(info, "◎产  地")
                movie['country'] = info
            elif info.startswith("◎类  别"):
                info = parse_info(info, "◎类  别")
                movie['category'] = info
            elif info.startswith("◎上映日期"):
                info = parse_info(info, "◎上映日期")
                movie['date'] = info
            elif info.startswith("◎片  长"):
                info = parse_info(info, "◎片  长")
                movie['time'] = info
            elif info.startswith("◎豆瓣评分"):
                info = parse_info(info, "◎豆瓣评分")
                movie['score'] = info
            elif info.startswith("◎导  演"):
                info = parse_info(info, "◎导  演")
                movie['director'] = info
            elif info.startswith("◎主  演"):
                # The cast spans several lines; collect them until the next "◎" field starts
                info = parse_info(info, "◎主  演")
                actors = [info]
                for x in range(index+1,len(infos)):
                    actor=infos[x].strip()
                    if actor.startswith("◎"):
                        break
                    actors.append(actor)
                movie['actors']=actors
        return movie
    
    def spider():
        # list_23_{} is the paginated movie-list section; crawl the first 7 pages
        base_url = "https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html"
        movies = []
        for x in range(1, 8):
            url = base_url.format(x)
            detail_urls = get_detail_urls(url)
            for detail_url in detail_urls:
                movie = parse_detail_url(detail_url)
                movies.append(movie)
                print(movie)
        return movies
    
    if __name__ == '__main__':
        spider()
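
The spider above only prints each movie dict as it is scraped; nothing is persisted. A minimal sketch of an alternative main block that saves the collected list to a JSON file (the filename movies.json is just an illustrative choice):

    import json

    if __name__ == '__main__':
        movies = spider()
        # ensure_ascii=False keeps the Chinese titles and names readable in the output file
        with open("movies.json", "w", encoding="utf-8") as f:
            json.dump(movies, f, ensure_ascii=False, indent=2)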
    

      
