• 豆瓣最佳影评-星级转换


    总的来说,爬取豆瓣信息不算难,因为网上的教程一抓一大把;但是自己写的代码还是和别人的不一样,特别是自己一个一个想出来、一个一个敲出来的那种酸爽感觉。

    import requests
    from bs4 import BeautifulSoup
    from selenium import webdriver
    import time
    from lxml import etree
    import csv
    
    # Output CSV shared by the whole script. It is opened once, in append
    # mode, so rows written by earlier runs are kept. The handle is never
    # closed explicitly; it stays open for the lifetime of the process.
    # (Original author's note: this approach is slower than `with open` --
    # unverified.)
    file = open('douban1.csv','a',newline='',encoding='utf-8')
    writer = csv.writer(file)
    # Write the header only when the file is still empty. Previously the
    # header row was appended on every run, leaving duplicate header lines
    # in the middle of the data.
    if file.seek(0, 2) == 0:
        writer.writerow(['星级','内容'])
    
    def id(url):
        """Return the review ids listed on the page at ``url``.

        The page is downloaded with ``requests`` and parsed with ``lxml``;
        the result is the list of ``data-rid`` attribute values of every
        ``div`` that is a direct child of a ``div`` with class ``main-bd``.

        Note: the function name shadows the built-in ``id``. It is kept
        unchanged because ``sel`` calls it by this name.
        """
        response = requests.get(url)
        tree = etree.HTML(response.text)
        return tree.xpath('//div[@class="main-bd"]/div/@data-rid')
    
    
    def next_url(url):
        """Find the "next page" link on ``url`` and scrape the page it points to.

        The page is downloaded and the ``href`` of the link inside
        ``div.paginator > span.next`` is looked up. When a link is found it
        is made absolute by prefixing ``https://movie.douban.com`` and passed
        to ``sel``. When no link is found (for example on the last page) the
        function returns without doing anything, instead of failing with an
        ``IndexError``.

        Note: ``sel`` calls this function and this function calls ``sel``, so
        every additional page adds to the call stack.

        Returns:
            None.
        """
        rsp = requests.get(url)
        html = etree.HTML(rsp.text)
        hrefs = html.xpath('//div[@class="paginator"]/span[@class="next"]/a/@href')
        if not hrefs:
            # No "next" link on this page: nothing more to crawl.
            return
        sel('https://movie.douban.com' + hrefs[0])
    
    def star(url,i):
        """Return the header ``span`` class values for review ``i`` on ``url``.

        The page is downloaded again and queried for the ``class`` attribute
        of every ``span`` inside the ``main-hd`` header of the review whose
        ``data-cid`` equals ``i``. ``i`` must be a string because it is
        concatenated into the XPath expression.

        The returned list is what ``translate`` expects as its argument.
        """
        response = requests.get(url)
        tree = etree.HTML(response.text)
        prefix = '//div[@data-cid="'
        suffix = '"]/div[@class="main review-item"]/header[@class="main-hd"]/span/@class'
        return tree.xpath(prefix + i + suffix)
    
    
    def translate(star):
        """Convert a rating CSS class into its Chinese star label.

        Args:
            star: sequence of class strings (``sel`` passes the result of
                ``star()``); only the first element is examined.

        Returns:
            '五星', '四星', '三星', '二星' or '一星' for the ``allstarNN``
            classes, '默认好评' when the first class is ``main-meta`` (the
            review carries no star rating), and ``None`` when ``star`` is
            empty or its first element is not a recognised class.
        """
        labels = {
            'allstar50 main-title-rating': '五星',
            'allstar40 main-title-rating': '四星',
            'allstar30 main-title-rating': '三星',
            'allstar20 main-title-rating': '二星',
            'allstar10 main-title-rating': '一星',
            'main-meta': '默认好评',  # some reviews have no star rating
        }
        if not star:
            # Nothing was scraped; treat it like an unrecognised class
            # instead of raising IndexError.
            return None
        return labels.get(star[0])
    
    def sel(url):
        """Scrape every review on the page at ``url`` and append it to the CSV.

        The page is opened in Chrome through Selenium because the full review
        text only appears after the "expand" link of a review is clicked. For
        each review id returned by ``id(url)`` the function:

        1. scrolls the window and clicks the review's expand link,
        2. waits two seconds for the expanded text to load,
        3. takes the text of the last ``review-content clearfix`` block in
           the page and strips line breaks, tabs and non-breaking spaces,
        4. looks up the rating with ``star`` / ``translate``,
        5. prints the ``(rating, text)`` pair and writes it with the
           module-level ``writer``.

        The browser is shut down even if an error occurs, and afterwards
        ``next_url(url)`` continues with the following page.

        Differences from the previous version: the review text is written in
        full (it used to be cut at the first ``*``), a review whose expanded
        text cannot be found is skipped, and a missing or unrecognised rating
        is written as an empty string instead of crashing.

        Returns:
            None.
        """
        # NOTE(review): the driver path in the original listing had lost its
        # backslashes ("D:PythonScriptschromedriver.exe"). It is restored
        # here on the assumption that it was the usual Python\Scripts
        # directory -- confirm against the local installation.
        brow = webdriver.Chrome(r"D:\Python\Scripts\chromedriver.exe")
        try:
            brow.get(url)
            for index, rid in enumerate(id(url)):
                box1 = brow.find_element_by_xpath(
                    '//div[@class="short-content"]/a[@id="toggle-' + rid + '-copy"]')
                # Scroll so the expand link can receive the click. The offset
                # must come from the review's position on the page, not from
                # the id string. 1200 px per review is the author's
                # experimentally found value and is not universal.
                brow.execute_script("window.scrollTo(0," + str(index * 1200) + ")")
                box1.click()  # click the expand link
                # Wait for the expanded text; without the pause page_source
                # still holds the collapsed markup.
                time.sleep(2)
                soup = BeautifulSoup(brow.page_source, "lxml")
                blocks = soup.find_all('div', class_='review-content clearfix')
                if not blocks:
                    # Nothing was expanded: skip rather than reuse the text
                    # of the previous review.
                    continue
                # Presumably the last block is the review just expanded,
                # since reviews are expanded in page order -- verify.
                content = blocks[-1].get_text().strip()
                # Remove line breaks, tabs and non-breaking spaces.
                content = (content.replace('\n', '').replace('\t', '')
                           .replace('\xa0', '').replace('\r', ''))
                star1 = star(url, rid)  # e.g. ['allstar50 main-title-rating']
                star2 = translate(star1) if star1 else None
                if star2 is None:
                    star2 = ''  # rating missing or not recognised
                params = (star2, content)
                print(params)
                writer.writerow(params)  # append the row to the CSV
        finally:
            # Always shut the browser and driver down, even on errors.
            brow.quit()
        next_url(url)  # continue with the next page
    
    
    if __name__=='__main__':
        # Script entry point: start scraping from the "best reviews" page.
        start_url = 'https://movie.douban.com/review/best/'
        sel(start_url)
    
    '''
    selenium.common.exceptions.WebDriverException: Message: unknown error: Element <a href="javascript:;" id="toggle-9590829-copy" class="unfold" title="...">展开</a> is not clickable at point (120, 586). Other element would receive the click: <div class="review-content clearfix" data-author="夜第七章" data-url="https://movie.douban.com/review/9592082/" data-original="1">...</div>
      (Session info: chrome=54.0.2840.99)
      (Driver info: chromedriver=2.27.440174 (e97a722caafc2d3a8b807ee115bfb307f7d2cfd9),platform=Windows NT 10.0.14393 x86_64)
      错误原因:选中的元素不是input,无法聚焦(从报错信息看,点击会落到遮挡它的其他元素上);解决办法:使用sleep等待,并用window.scrollTo(0,x)滚动页面
    '''
    

      

  • 相关阅读:
    C++内联函数
    C++类中创建线程
    windows下搭建Redis集群
    tcpdump截帧工具使用
    使用gdb调试应用程序
    工作之用
    primecoin服务常用命令和参数说明
    Windows mysql默认字符集修改
    primecoin在ubuntu16.04上部署服务:
    ubuntu磁盘分配和挂载
  • 原文地址:https://www.cnblogs.com/fodalaoyao/p/10474958.html
Copyright © 2020-2023  润新知