• Python 爬虫示例,方便日后参考


    参考网址:https://zhuanlan.zhihu.com/p/32037625

    def getOneMoviesInfo(Mid,url):
        """Scrape one movie page and append its details to the output text file.

        The page at ``url`` is downloaded and queried with fixed XPath
        expressions. One '|'-separated record is appended to the output file
        with the fields, in order: Mid, name, brief, year, date, director,
        three writers, five stars, picture. Missing values are written as the
        string 'NULL'.

        Args:
            Mid: identifier of the movie; written as the first field and
                echoed to stdout.
            url: address of the movie page to download.

        Returns:
            None. When the page has no title node, a failure message is
            printed and nothing is written.
        """
        import requests
        from lxml import etree

        # NOTE(review): the path in the published listing had lost its
        # backslashes (and "\t" had become a tab); restored here to the
        # evident Windows desktop path -- confirm before running.
        out_path = r'C:\Users\yuqiao\Desktop\testSpider.txt'

        # Common XPath prefixes shared by several queries below.
        header = '//*[@id="main"]/section/div[1]/div/div/section'
        info = header + '/div[2]/section'
        media = '//*[@id="media_v4"]/div[2]'

        # A timeout keeps one unresponsive page from hanging the whole run.
        data = requests.get(url, timeout=30).text   # download the website
        s = etree.HTML(data)                        # parse the HTML

        name_nodes = s.xpath(info + '/div[1]/span/a/h2/text()')
        if not name_nodes:
            print("Mid = %s , failed for a lack of TMDB id "%Mid)
            return
        name = name_nodes[0]

        # Store the first poster URL itself rather than the list's repr.
        picture_nodes = s.xpath(header + '/div[1]/div[1]/img/@src')
        picture = picture_nodes[0] if picture_nodes else 'NULL'

        year_nodes = s.xpath(info + '/div[1]/span/span/text()')
        year = year_nodes[0].strip("(").strip().strip(")") if year_nodes else 'NULL'

        # The date is taken from the second text node of the first list item.
        date_nodes = s.xpath(media + '/div[2]/div/section/div[1]/div/section[1]/ul/li[1]/text()')
        date = date_nodes[1].strip() if len(date_nodes) > 1 else 'NULL'

        # Line breaks inside the overview are replaced by a literal "\n" so the
        # record stays on one line (reconstructed from a garbled literal --
        # TODO confirm against the author's original).
        brief_nodes = s.xpath(info + '/div[2]/div/p/text()')
        brief = brief_nodes[0].replace("\n", "\\n") if brief_nodes else 'NULL'

        director = 'NULL'
        writers = []
        for item in s.xpath(info + '/div[2]/ol/li'):   # all main creators
            creator_name = item.xpath('./p[1]/a/text()')
            creator_job = item.xpath('./p[2]/text()')
            if not creator_name or not creator_job:
                # Skip incomplete entries instead of discarding the people
                # that were already collected.
                continue
            if 'Director' in creator_job[0]:
                director = creator_name[0]
            elif 'Screenplay' in creator_job[0] or 'Writer' in creator_job[0]:
                writers.append(creator_name[0])

        stars = []
        for item in s.xpath(media + '/div[1]/div/div/section[1]/ol/li'):
            star = item.xpath('./p[1]/a/text()')
            if star:                                   # entries without a name are skipped
                stars.append(star[0])

        # Fix the column count: exactly 3 writers and 5 stars, padded with 'NULL'.
        writers = (writers + ['NULL'] * 3)[:3]
        stars = (stars + ['NULL'] * 5)[:5]

        fields = [Mid, name, brief, year, date, director] + writers + stars + [picture]
        with open(out_path,'a',encoding='utf-8') as f:
            f.write("|".join(str(field) for field in fields) + "\n")
        print(Mid)
        print(name)
        
    #______________________________________________________ main script __________________________________________________________
    import time

    # NOTE(review): the path in the published listing had lost its backslashes
    # (and "\t" had become a tab); restored here to the evident Windows desktop
    # path -- confirm before running.
    output_path = r'C:\Users\yuqiao\Desktop\testSpider.txt'

    # Truncate the output file so every run starts from an empty file.
    with open(output_path,'w',encoding='utf-8') as f:
        f.write("")

    # Query-string suffix; not used by the code visible here (presumably meant
    # to be appended to each movie URL -- verify).
    language = '?language=zh-CN'

    # NOTE(review): this input path also appears to have lost its backslashes,
    # but the original separators cannot be recovered from the listing, so it
    # is left exactly as published -- fix before running.
    with open(r'D:gitiyeMovieMidURL.txt', "rt",encoding='utf-8') as in_file:
        lines = in_file.read().split("\n")

    # Half-open range of line indexes to process (e.g. 51, 61 covers 51~60).
    first_index, last_index = 9124, 9125
    if len(lines) < last_index:
        print('input has only %d lines; requested range %d-%d is truncated'
              % (len(lines), first_index, last_index))
    # Slicing never raises, so a short input file simply yields fewer lines.
    for line in lines[first_index:last_index]:
        print(line)

    print('finished')
    
    
    
  • 相关阅读:
    【.NET】VS2013创建Windows服务与调试服务
    【JS】处理数据四舍五入(tofixed与round的区别详解)
    【微信小程序】 基础语义笔记2:基本组件、获取节点信息
    面向对象和面向过程的优点和缺点
    【微信小程序】 基础语义笔记1:配置、页面文件、组件。
    【微信小程序】 wxParse组件
    zookeeper 碎片知识点
    zookeeper 基本概念
    RocketMQ 知识点
    单例模式---双层检验锁+volatile
  • 原文地址:https://www.cnblogs.com/YuQiao0303/p/9277666.html
Copyright © 2020-2023  润新知