• Python 爬取汽车领域问答语料(自用)


    #coding=utf-8
    
    import time
    import requests
    from lxml import etree
    from pymongo import MongoClient
    from selenium import webdriver
    
    # MongoDB target for the scraped documents. "IP" and the empty
    # user/password pair are placeholders; substitute real values before running.
    client = MongoClient("IP", 27017)
    db = client["Automobile"]
    collection = db["wenda_autohome"]
    db.authenticate("","")

    # Chrome session used to render the pages and click the "more" links.
    # The path separators were missing from this raw string as published
    # (r"D:chromedriver_win32chromedriver.exe"); restored here. Point it at the
    # local chromedriver binary.
    driver = webdriver.Chrome(executable_path=r"D:\chromedriver_win32\chromedriver.exe")
    
    def splist(l, s):
        """Split the sequence *l* into consecutive slices of length *s*.

        The last slice is shorter when ``len(l)`` is not a multiple of *s*.
        An empty *l* yields an empty list. *s* must be a positive integer.
        """
        # Step straight to each slice start instead of testing every index.
        return [l[start:start + s] for start in range(0, len(l), s)]
    
    # Visit every topic id in the range, scrape the question page and store one
    # MongoDB document per page that has both a title and at least one answer.
    for i in range(36726, 40202):
        # Example of a full URL: https://wenda.autohome.com.cn/topic/detail/40195
        url = 'https://wenda.autohome.com.cn/topic/detail/' + str(i)

        time.sleep(1)  # pause between page loads
        driver.get(url)
        tree = etree.HTML(driver.page_source)

        question = tree.xpath("//h1[@class='card-title']/text()")
        answer_list = tree.xpath("//a[@class='text']/text()")
        if not question or not answer_list:
            continue

        for n, raw_text in enumerate(answer_list, 1):
            # Fixed-offset trim of the text node; presumably strips layout
            # padding around the snippet -- TODO confirm against the page markup.
            snippet = raw_text[41:-37]
            answer_list[n-1] = snippet
            if not snippet.endswith('...'):
                continue

            # The snippet ends with an ellipsis: open the n-th reply's "more"
            # link and read the answer text from the page that opens.
            more_xpath = "//div[@class='card-reply-wrap'][" + str(n) + "]//a[@class='more']"
            try:
                driver.find_element_by_xpath(more_xpath).click()

                tree_answer = etree.HTML(driver.page_source)
                answer_part = tree_answer.xpath("//div[@class='answer-content']/div/div[@class='ahe__area ahe__block ahe__text']/p/text()")
                answer_list[n-1] = ''.join(answer_part)

                # Go back to the question page before handling the next reply.
                time.sleep(1)
                driver.get(url)
            except Exception as e:
                # Best effort: keep the trimmed snippet and move on.
                print(e)

        keywords = tree.xpath("//ul[@class='card-tag-list']/li/text()")
        description_list = tree.xpath("//div[@class='ahe__area ahe__block ahe__text']/p/text()")
        description = ''.join(description_list)

        # Praise-count texts grouped two at a time (presumably like/dislike
        # pairs per answer -- verify against the page).
        zancai = tree.xpath("//span[@class='js-praise-count']/text()")
        zancai_list = splist(zancai, 2)

        # The 'discription' key spelling is kept as-is: it is the stored field name.
        dc = {}
        dc['keywords'] = keywords
        dc['question'] = question[0]
        dc['discription'] = description
        dc['answer'] = answer_list
        dc['zancai'] = zancai_list
        dc['url'] = url
        # NOTE(review): Collection.insert is deprecated in PyMongo 3 and removed
        # in PyMongo 4; switch to insert_one when upgrading the driver.
        collection.insert(dc)

    # Close the browser window once, after all pages have been processed;
    # closing it inside the loop would break the next driver.get call.
    driver.close()
  • 相关阅读:
    01 网络基础
    01 ansible的基本介绍
    10 面向对象的编程
    03 docker容器镜像基础
    09 异常处理
    08 输入输出
    07 数据结构
    02 docker的基本用法
    01 docker容器技术基础入门
    06 字符串
  • 原文地址:https://www.cnblogs.com/zhangtianyuan/p/9430435.html
Copyright © 2020-2023  润新知