• Python 获取页面文字信息


    # -*- coding: utf-8 -*-
    
    from selenium import webdriver
    import time, re,requests,os,time,random,traceback
    import urllib.request,threading
    from bs4 import BeautifulSoup
    import html.parser
    from tkinter import *
    from tkinter import ttk
    import tkinter.messagebox 
    
    
    def getHtml(questionId,page):
        """Scrape one page of answers for a Zhihu question and save the long ones.

        Loads ``https://www.zhihu.com/question/<questionId>/answers/updated?page=<page>``
        in headless Chrome, scrolls down the page in steps, then writes the text
        of every ``RichContent-inner`` element longer than 300 characters to
        ``<questionId>/page_<page>.txt`` (UTF-8). The file starts with a
        ``start`` line and each answer is followed by a separator line.

        Args:
            questionId: Question identifier as a string; it is concatenated into
                the URL and used as the output directory name.
            page: Page number; converted with ``str()`` for the URL and file name.

        Returns:
            The ``BeautifulSoup`` parse of the page source taken after scrolling.

        Raises:
            Whatever Selenium or the file write raises; the browser is shut
            down before the exception propagates.
        """
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--start-maximized')  # maximized window; per the original author, element lookup fails without it
        chrome_options.add_argument('--disable-infobars')  # hide the "controlled by automated software" banner
        chrome_options.add_argument('--incognito')  # incognito (private) mode
        chrome_options.add_argument('--headless')  # run without a visible browser window

        # NOTE(review): `executable_path=` and `find_elements_by_class_name` are
        # Selenium 3-style calls - confirm against the installed Selenium version.
        driver = webdriver.Chrome(executable_path = "chromedriver",options=chrome_options)  # start the browser
        try:
            driver.get("https://www.zhihu.com/question/"+questionId+"/answers/updated?page="+str(page))  # open the page to scrape

            # Simulate a user scrolling: 12 steps of 1000px with a 3s pause each,
            # then jump to the bottom (presumably so lazily loaded answers render).
            for i in range(12):
                print(''+str(i)+'次点击')
                driver.execute_script("window.scrollTo(0, "+str(1000 * i)+");")
                time.sleep(3)
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

            # Parse the fully scrolled page; this is the function's return value.
            result_soup = BeautifulSoup(driver.page_source, 'html.parser')

            pieces = ["start\n"]
            for answer in driver.find_elements_by_class_name("RichContent-inner"):
                answer_text = answer.text  # read once; each access asks the browser again
                if len(answer_text) > 300:
                    pieces.append(answer_text + "\n-----------我是分隔符------\n")

            # The target directory must already exist (main() creates it via mkdir()).
            with open(questionId +"/page_"+str(page)+".txt", 'w',encoding="utf-8") as zhpage:
                zhpage.write("".join(pieces))
            print("爬取回答页面成功!!!")
            return result_soup
        finally:
            # Always shut the browser down, even when scraping or writing fails;
            # main() swallows exceptions and would otherwise leak one Chrome per page.
            driver.quit()
    
    def readTxt(path):
        """Return the entire contents of the UTF-8 text file at ``path``.

        Args:
            path: Path of the file to read.

        Returns:
            The file's text as a single string.

        Raises:
            OSError: If the file cannot be opened.
            UnicodeDecodeError: If the content is not valid UTF-8.
        """
        # The context manager closes the handle even if read() raises.
        with open(path, 'r', encoding='utf-8') as f:
            return f.read()
            
    
    def main(questionId,startPage,endPage):
        """Scrape pages ``startPage`` up to (not including) ``endPage`` of a question.

        Ensures the output directory named after ``questionId`` exists, then
        fetches each page with ``getHtml`` and pauses 5-7 seconds afterwards.
        A failure on one page is printed and does not stop the remaining pages.

        Args:
            questionId: Question identifier; also the output directory name.
            startPage: First page number to fetch.
            endPage: Page number at which to stop (exclusive, as in ``range``).
        """
        mkdir([questionId])
        pause_seconds = range(5, 8)
        page_numbers = range(startPage, endPage)
        for page_no in page_numbers:
            try:
                getHtml(questionId, page_no)
                # Random delay between pages.
                time.sleep(random.choice(pause_seconds))
            except Exception:
                # Best effort: report the failure and move on to the next page.
                traceback.print_exc()
    
    def mkdir(paths):
        """Create every directory in ``paths`` that does not exist yet.

        Missing parent directories are created as well. A path that already
        exists (as a directory or as a file) is left untouched.

        Args:
            paths: Iterable of directory paths.
        """
        for path in paths:
            if not os.path.exists(path):
                # exist_ok covers the case where the directory appears between
                # the check above and this call.
                os.makedirs(path, exist_ok=True)
    
    def getanswer():
        """Button callback: read the form fields and start scraping.

        Reads the question id and the page bounds from the module-level Tk
        variables ``var_id``, ``var_start`` and ``var_end`` and passes them to
        ``main``. Entry contents are text, so the page bounds are converted to
        ``int`` before they reach ``range()`` in ``main``. The end page is
        exclusive there. Invalid input is reported in an error dialog instead
        of raising inside the Tk callback.
        """
        questionId = str(var_id.get()).strip()
        if not questionId:
            # An empty id would become an empty directory name and a broken URL.
            tkinter.messagebox.showerror('输入错误', '请输入问题标识')
            return
        try:
            start = int(var_start.get())
            end = int(var_end.get())
        except (TypeError, ValueError):
            tkinter.messagebox.showerror('输入错误', '开始页和结束页必须是整数')
            return
        main(questionId,start,end)
    
    if __name__ == "__main__":
        # Script entry point: scrape pages 101-199 of question 308829198.
        main("308829198", 101, 200)
    
    
    # Build the small input form: question id, start page, end page and a
    # button that triggers getanswer().
    tk = Tk()
    tk.title('获取知乎问题所有答案')
    tk.geometry('600x150')  # fixed 600x150 window

    # NOTE(review): this frame is created but no later line in this block uses it.
    frame = Frame(tk)
    # Hint label showing where the question id sits inside a Zhihu URL.
    Label(tk,text='问题标识:(例:https://www.zhihu.com/question/324405640/answer/720532471中的324405640 )',width=200,anchor=W, justify=LEFT).place(x=10,y=10)
    # Backing variable for the question-id entry; read by getanswer().
    var_id = Variable()
    question_id = Entry(tk,textvariable=var_id,width=30)
    question_id.place(x=10,y=40)

    # Start-page field, pre-filled with 1.
    Label(tk,text='开始页:').place(x=230,y=40)
    var_start = Variable()
    # `e` receives the result of .place(), not the Entry widget itself.
    e = Entry(tk, textvariable=var_start,width=10).place(x=290,y=40)
    var_start.set(1)


    # End-page field, pre-filled with 10.
    Label(tk,text='结束页:').place(x=360,y=40)
    var_end = Variable()
    e = Entry(tk, textvariable=var_end,width=10).place(x=420,y=40)
    var_end.set(10)

    # Clicking the button runs getanswer() with the current field values.
    Button(tk, text="获取答案", command=getanswer).place(x=200,y=80)
    # NOTE(review): the event-loop call below is commented out, so nothing in
    # this block starts Tk's event loop - confirm whether that is intentional.
    #tk.mainloop()
  • 相关阅读:
    腾讯云发布“创新成长快线”,首期向创业者赠送10亿分钟实时音视频时长
    Tencent Kona JDK11正式开源,腾讯大数据将持续贡献Java生态发展
    腾讯视频云勇夺云端视频转码大赛多项第一
    分享一些常用的开源博客社区网站
    分享些发表技术类文章的平台
    Linux之ps命令基本使用
    彻底卸载 Oracle11g r2 教程(亲测有效,已重装过)
    Oracle11g R2 安装教程(非常详细 )
    苹果CMS搭建影视网站教程
    JSON 基本使用
  • 原文地址:https://www.cnblogs.com/liangblog/p/11396397.html
Copyright © 2020-2023  润新知