• Clicking "展开阅读全文 ∨" (expand full text) links: JS-assisted Selenium crawling

The snippets below evolve through three iterations: a Selenium loop that scrolls the page and clicks each link, a one-line JavaScript snippet that clicks every matching link at once, and a final script that injects that JavaScript (keeping the scroll loop as a fallback) and then parses the expanded page with pyquery.


    from selenium import webdriver
    import time
    import random
    from bs4 import BeautifulSoup
    
    browser = webdriver.Chrome()
    url = 'https://so.gushiwen.org/shiwenv_ee16df5673bc.aspx'
    browser.get(url)
    
    # count how many "展开阅读全文 ∨" (expand) links the page starts with
    # (find_elements_by_link_text is the Selenium 3 API; Selenium 4 uses find_elements(By.LINK_TEXT, ...))
    ck_l_ori_len = len(browser.find_elements_by_link_text('展开阅读全文 ∨'))
    ck_l_ori_ok = 0
    try:
        for isc in range(100):
            if ck_l_ori_ok == ck_l_ori_len:
                break
            time.sleep(1)
            # scroll down in 100px steps; jumping straight to scrollHeight can leave links off-screen
            js = 'window.scrollTo(0,100*{})'.format(isc)
            browser.execute_script(js)
            ck_l = browser.find_elements_by_link_text('展开阅读全文 ∨')
            for i in ck_l:
                try:
                    i.click()
                    ck_l_ori_ok += 1
                except Exception as e:
                    print(e)
    except Exception as e:
        print('window.scrollTo-->', e)
    
    # ck_l=browser.find_elements_by_link_text('展开阅读全文 ∨')
    # for i in ck_l:
    #     try:
    #         i.click()
    #     except Exception as e:
    #         print(e)
    
    
    # XPath noted for reference (unused below)
    xp_l = ['//*[@id="fanyi967"]/div/div[3]/a', ]
    
    myhtml = r'D:\myhtml\{}gushiwen.tmp.html'.format(random.randint(123, 999))
    with open(myhtml, 'w', encoding='utf-8') as fw:
        fw.write(browser.page_source)
    # placeholder SQL for a later step: persist the parsed page into a parent_url table
    sql = 'INSERT INTO parent_url (page_title,page_url,children_url) VALUES '
    with open(myhtml, 'r', encoding='utf-8') as myhtml_o:
        bs = BeautifulSoup(myhtml_o, 'html.parser')
        # TODO: extract fields from `bs`; the pyquery version further below does the actual parsing
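
As a possible simplification (a sketch, not part of the original script), each link could be scrolled into view right before clicking it instead of stepping the whole page:

    # sketch: bring each expand link into view, then click it
    for a in browser.find_elements_by_link_text('展开阅读全文 ∨'):
        try:
            browser.execute_script('arguments[0].scrollIntoView();', a)
            a.click()
        except Exception as e:
            print(e)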

Alternatively, all of the clicking can be done with one line of JavaScript run inside the page (for example pasted into the browser console): walk every <a> tag and click the ones whose text is "展开阅读全文 ∨".

    a_=document.getElementsByTagName('a');le=a_.length;for(i=0;i<le;i++){if(a_[i].text=='展开阅读全文 ∨'){a_[i].click()}}
    

      

A safer variant wraps each click in try/catch so one failing link does not abort the whole loop:

    a_=document.getElementsByTagName('a');le=a_.length;for(i=0;i<le;i++){if(a_[i].text=='展开阅读全文 ∨'){try{a_[i].click()}catch(err){console.log(err)}}}
    

      

In Python, the same one-liner can be injected with execute_script; the scroll-and-click loop is kept only as a fallback in case the script injection fails:

    from selenium import webdriver
    import time
    import random
    
    browser = webdriver.Chrome()
    url = 'https://so.gushiwen.org/shiwenv_ee16df5673bc.aspx'
    browser.get(url)
    
    # the scroll-and-click loop from the first version is kept only as the fallback below
    
    js = "a_=document.getElementsByTagName('a');le=a_.length;for(i=0;i<le;i++){if(a_[i].text=='展开阅读全文 ∨'){try{a_[i].click()}catch(err){console.log(err)}}}"
    try:
        browser.execute_script(js)
    except Exception as e:
        print(e)
        ck_l_ori_len = len(browser.find_elements_by_link_text('展开阅读全文 ∨'))
        ck_l_ori_ok = 0
        try:
            for isc in range(100):
                if ck_l_ori_ok == ck_l_ori_len:
                    break
                time.sleep(1)
                # scroll down in 100px steps
                js = 'window.scrollTo(0,100*{})'.format(isc)
                browser.execute_script(js)
                ck_l = browser.find_elements_by_link_text('展开阅读全文 ∨')
                for i in ck_l:
                    try:
                        i.click()
                        ck_l_ori_ok += 1
                    except Exception as e:
                        print(e)
        except Exception as e:
            print('window.scrollTo-->', e)

The final version combines the JS click-all (with the scroll-and-click fallback) and then parses the fully expanded page with pyquery:

    from selenium import webdriver
    import time
    import random
    from pyquery import PyQuery as pq
    
    browser = webdriver.Chrome()
    url = 'https://so.gushiwen.org/shiwenv_ee16df5673bc.aspx'
    browser.get(url)
    
    js = "a_=document.getElementsByTagName('a');le=a_.length;for(i=0;i<le;i++){if(a_[i].text=='展开阅读全文 ∨'){try{a_[i].click()}catch(err){console.log(err)}}}"
    try:
        browser.execute_script(js)
    except Exception as e:
        print(e)
        ck_l_ori_len = len(browser.find_elements_by_link_text('展开阅读全文 ∨'))
        ck_l_ori_ok = 0
        try:
            for isc in range(100):
                if ck_l_ori_ok == ck_l_ori_len:
                    break
                time.sleep(1)
                # scroll down in 100px steps
                js = 'window.scrollTo(0,100*{})'.format(isc)
                browser.execute_script(js)
                ck_l = browser.find_elements_by_link_text('展开阅读全文 ∨')
                for i in ck_l:
                    try:
                        i.click()
                        ck_l_ori_ok += 1
                    except Exception as e:
                        print(e)
        except Exception as e:
            print('window.scrollTo-->', e)
    
    # parse the fully expanded page; pyquery serializes nodes with an xmlns attribute,
    # so strip it from every extracted HTML fragment
    doc = pq(browser.page_source)
    r_k, r_v = 'xmlns="http://www.w3.org/1999/xhtml"', ''
    article_ = doc('.left>:nth-child(2).sons>.cont>.contson').html().replace(r_k, r_v)
    title_d = {'h1': doc('.left>:nth-child(2).sons>.cont>:nth-child(2)').html().replace(r_k, r_v)}
    author_d = {'h3': doc('.left>:nth-child(2).sons>.cont>:nth-child(3)').text()}
    translation_ = doc('.left>:nth-child(4)>.contyishang>:nth-child(2)').html().replace(r_k, r_v)
    explanation_ = doc('.left>:nth-child(4)>.contyishang>:nth-child(3)').html().replace(r_k, r_v)
    refer_ = doc('.left>:nth-child(4)>.cankao').html().replace(r_k, r_v)
    
    author_img_url = doc('.left>.sonspic>.cont>.divimg>:nth-child(1)').html().split('src="')[-1].split('"')[0]
    
    # the extracted fields can now be persisted; see the sketch below
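
One possible next step (not in the original post) is to persist the extracted fields. The sketch below assumes it runs right after the script above, so url, title_d, author_d, article_, translation_, explanation_, refer_ and author_img_url are already defined; the output file name gushiwen_result.json is just an illustrative choice.

    import json

    # collect the pyquery results into one record
    result = {
        'page_url': url,
        'title': title_d['h1'],
        'author': author_d['h3'],
        'article': article_,
        'translation': translation_,
        'explanation': explanation_,
        'reference': refer_,
        'author_img_url': author_img_url,
    }

    # hypothetical output path; adjust as needed
    with open('gushiwen_result.json', 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)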
    

      

  • Original post: https://www.cnblogs.com/rsapaper/p/8933564.html