• spider csdn博客和quantstart文章


    spider csdn博客和quantstart文章

    功能

    1. 提取csdn博客文章
    2. 提取quantstart.com 博客文章, Michael Halls-Moore 创办的网站

    特色功能就是: 想把原来文章里的格式/样式(段落, 标题等等排版信息)保留到word文档里. 不仅仅是把文本提取出来.

    目前能够全部得到文章信息. 而且博文里的段落和小标题信息也都保留了下来

    TODO:
    把它们写入word文档, 同时也要保留段落以及样式信息.
    (转码到docx的部分留到以后实现)

    遇到的问题以及解决办法

    csdn博文用常规的requests库, 得到的是乱码.
    多次尝试了指定编码, 也没有得到正解.
    无奈只好改用selenium这个大笨蛋获取.

    如果读者朋友知道requests的实现方案, 请不吝赐教.

    关于用webdriver定位页面元素的技巧:
    用wait...until 组合手段来解决: 既能解决无法定位元素的问题, 也能方便地设定超时等待时间.

    WebDriverWait(browser, timeOut).until(lambda x: x.title)
    WebDriverWait(browser, timeOut).until(lambda x: x.page_source)

    WebDriverWait(browser, timeOut).until(lambda x: x.find_element_by_class_name('abc'))

    截图

    代码

    v 0.2

    
    
    # --- module setup: imports, browser options, logging -------------------
    import os
    import random
    import re
    import time
    import logging

    import requests
    from lxml import etree
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    # Reusable non-greedy wildcard fragment for the regexes below.
    # NOTE: '.*?' is the *lazy* (non-greedy) form — the original comment
    # calling it "greedy" was wrong.
    anys = '.*?'

    # Launch options for the 360se (Chromium-based) browser.
    # The path below lost its backslashes when this post was scraped; restored
    # here as a raw string. TODO confirm the install path on the target machine.
    chrome_options = webdriver.ChromeOptions()
    chrome_options.binary_location = (
        r'C:\Users\Administrator\AppData\Roaming\360se6\Application\360se.exe'
    )
    chrome_options.add_argument('--lang=zh-CN')  # force a zh-CN UI locale

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s : %(message)s',
    )
    logger = logging.getLogger(__name__)
    
    #%%
    
    def get_it_wait_untill(browser, element_func='title', sleep_time=80, arg=''):
        '''
        Wait (up to ``sleep_time`` seconds) for a webdriver attribute or
        locator result and return it.

        Parameters
        ----------
        browser : selenium webdriver instance
        element_func : str
            Name of the attribute/method to read off the driver, e.g.
            ``'title'``, ``'page_source'``, ``'find_element_by_id'``.
        sleep_time : int
            Timeout in seconds passed to ``WebDriverWait``.
        arg : str
            Argument forwarded to ``element_func`` when it is a locator method.

        Example:
            >>> get_it_wait_untill(browser, 'title')
            >>> get_it_wait_untill(browser, 'page_source')
            >>> get_it_wait_untill(browser, element_func='find_element_by_id',
                            arg='content_views',
                            )
            >>> get_it_wait_untill(browser, element_func='find_element_by_xpath',
                            arg='//section[@class="content article-content"]',
                            )
        '''
        # BUG FIX: the original compared str(type(...)) against the literals
        # "<class 'str'>" / "<class 'method'>" and left ``element`` unbound
        # (UnboundLocalError) for any other type.  ``callable()`` covers both
        # cases robustly.
        wait = WebDriverWait(browser, sleep_time)
        if callable(getattr(browser, element_func)):
            # Locator methods (find_element_by_*) need their argument.
            return wait.until(lambda drv: getattr(drv, element_func)(arg))
        # Plain properties such as ``title`` or ``page_source``.
        return wait.until(lambda drv: getattr(drv, element_func))
    
    #%%
    
    def get_quantstart_article(
            url = 'https://www.quantstart.com/articles/Beginners-Guide-to-Quantitative-Trading'
            ,
            sleep_time=40
            ,
            kernel='requests', # or 'selenium'
            xpath = '//section[@class="content article-content"]'
            ,
            ):
        '''
        Fetch one QuantStart article (blog by Michael Halls-Moore) and return
        its body as an HTML fragment starting at the first ``<p>``, so the
        paragraph/heading markup survives for a later docx conversion.

        Parameters
        ----------
        url : str
            Article address.
        sleep_time : int
            Timeout (seconds) for selenium waits; unused by the requests path.
        kernel : str
            'requests' (fast, lxml-based) or 'selenium' (browser-based).
        xpath : str
            Locator of the article body container.

        Returns
        -------
        str or False
            The article HTML fragment, or False on any failure.
        '''
        logger.info(f'当前的url: {url}')

        if kernel=='requests':
            headers = {
                    'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
                    }
            try:
                # BUG FIX: the original called requests.get(url, params=headers),
                # which appended the UA string to the query string instead of
                # sending it as an HTTP header.
                resp = requests.get(url, headers=headers)
                if resp.status_code==200:
                    logger.info(f'get网络请求的响应ok!!!, 响应的编码是{resp.encoding}')
                    html = resp.text
                    tree = etree.HTML(html)
                    article_tree = tree.xpath(xpath)[0]
                    article_unicode = etree.tounicode(article_tree)
                    # Keep everything from the first <p> onwards, dropping the
                    # <section> wrapper itself.
                    article_unicode = re.findall('<section.*?(<p>.*?)</section>',
                                   article_unicode, re.S)[0]
                    logger.info(article_unicode[-100:])
                    return article_unicode.strip()
                logger.info(f'responce status code: {resp.status_code}')
                logger.error('requests.get()网络连接/网络请求不正常!!!')
                return False
            except Exception as ee:
                logger.error(f'Exception: {ee}')
                return False

        elif kernel=='selenium':
            browser = webdriver.Chrome(options=chrome_options)
            browser.get(url)
            title = get_it_wait_untill(browser, 'title')
            logger.info(f'网页的标题是: {title} ')
            html = get_it_wait_untill(browser, 'page_source')  # may take a while

            article_webelement = browser.find_element_by_xpath(xpath)
            article_text = article_webelement.text
            logger.info('网页源码的长度和博文的长度分别是: {1} {0}'.
                  format(len(article_text), len(html))
                  )

            # SYNTAX FIX: the backslash line-continuations were stripped when
            # this code was scraped; parentheses restore the concatenation.
            # The article body sits between the closing style/form tags and
            # the book-card sidebar.
            pattern = ('</style></form>'
                       '(.*?)'
                       '<div class="col-md-4 book-card order-md-1">')
            a = re.findall(pattern, html, re.S)[0]
            a = re.findall(f'{anys}(<p>{anys})</section>{anys}', a, re.S)[0]
            return a
        else:
            logger.error('出错了!!! 你必须指定正确的内核. 或者selenium 或者requests!')
            return False
    
    
    
    
    
    #%%
    def get_csdn_blog(
            url='https://blog.csdn.net/Lili_0820/article/details/70155949'
            ,
            sleep_time=40
            ,
            ):
        '''
        Scrape one CSDN blog post with selenium and return its body as an
        HTML fragment starting at the first ``<p>`` inside ``#content_views``,
        preserving paragraph/heading markup for a later docx conversion.

        Parameters
        ----------
        url : str
            Blog post address.
        sleep_time : int
            Timeout (seconds) for the explicit waits on title / page source.

        Returns
        -------
        str
            Inner HTML of the article body.

        Notes
        -----
        Plain ``requests`` returns mojibake for CSDN regardless of the
        encoding tried, hence the selenium approach.
        '''
        logger.info(f'当前的url: {url}')
        browser = webdriver.Chrome(options=chrome_options)
        try:
            browser.get(url)
            # Sticky implicit wait: element look-ups retry for up to 200s
            # before raising.  Only needs to be set once per session.
            browser.implicitly_wait(200)

            # wait...until replaces the busy-wait loop of earlier drafts and
            # guarantees the page is loaded before we read it.
            title = WebDriverWait(browser, sleep_time).until(lambda x: x.title)
            logger.info(f'提取网页标题: {title}')
            html= WebDriverWait(browser, sleep_time).until(lambda x: x.page_source)

            content = browser.find_element_by_id('content_views')
            text = content.text
            logger.info('网页源码的长度和博文的长度分别是: {1} {0}'.
                  format(len(text), len(html))
                  )

            # SYNTAX FIX: the backslash line-continuations were stripped when
            # this code was scraped; parentheses restore the concatenation.
            pattern = ('id="content_views" class="markdown_views.*?>'
                       '(.*?)'
                       '<link href="https://csdnimg.cn/release/'
                       'phoenix/mdeditor/markdown_views')
            a = re.findall(pattern, html, re.S)[0]
            a = re.findall(f'{anys}(<p>{anys})</div>{anys}', a, re.S)[0]
            return a
        finally:
            # BUG FIX: the original only closed the browser on success,
            # leaking it whenever a wait or regex extraction raised.
            browser.close()
    
    
    if __name__=='__main__':
        # Manual smoke tests — uncomment one call at a time to exercise a
        # scraper against the live site.
        pass
    #    blog2r=get_quantstart_article()
    #    blog2r=get_quantstart_article(kernel='requests')
    #    blog2r=get_quantstart_article(url='http://www.ba', kernel='requests')
    #    blog2s=get_quantstart_article(kernel='selenium')
    #    blog2=get_quantstart_article(kernel='req')
        
    #    blog1 = get_csdn_blog(sleep_time=80)
    
    

    v 0.1

    
    
    # --- module setup: imports, browser options, logging -------------------
    import os
    import random
    import re
    import time
    import logging

    import requests
    from lxml import etree
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    # Reusable non-greedy wildcard fragment for the regexes below.
    # NOTE: '.*?' is the *lazy* (non-greedy) form — the original comment
    # calling it "greedy" was wrong.
    anys = '.*?'

    # Launch options for the 360se (Chromium-based) browser.
    # The path below lost its backslashes when this post was scraped; restored
    # here as a raw string. TODO confirm the install path on the target machine.
    chrome_options = webdriver.ChromeOptions()
    chrome_options.binary_location = (
        r'C:\Users\Administrator\AppData\Roaming\360se6\Application\360se.exe'
    )
    chrome_options.add_argument('--lang=zh-CN')  # force a zh-CN UI locale

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s : %(message)s',
    )
    logger = logging.getLogger(__name__)
    
    
    
    
    
    def get_quantstart_article(
            url = 'https://www.quantstart.com/articles/Beginners-Guide-to-Quantitative-Trading'
            ,
            sleep_time=40
            ,
            kernel='requests', # or 'selenium'
            xpath = '//section[@class="content article-content"]'
            ,
            ):
        '''
        Fetch one QuantStart article (blog by Michael Halls-Moore) and return
        its body as an HTML fragment starting at the first ``<p>``.

        Parameters
        ----------
        url : str
            Article address.
        sleep_time : int
            Fixed sleep (seconds) after loading in the selenium path; unused
            by the requests path.
        kernel : str
            'requests' (fast, lxml-based) or 'selenium' (browser-based).
        xpath : str
            Locator of the article body container.

        Returns
        -------
        str or False
            The article HTML fragment, or False on any failure.
        '''
        logger.info(f'当前的url: {url}')

        if kernel=='requests':
            headers = {
                    'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
                    }
            try:
                # BUG FIX: the original called requests.get(url, params=headers),
                # which appended the UA string to the query string instead of
                # sending it as an HTTP header.
                resp = requests.get(url, headers=headers)
                if resp.status_code==200:
                    logger.info(f'get网络请求的响应ok!!!, 响应的编码是{resp.encoding}')
                    html = resp.text
                    tree = etree.HTML(html)
                    article_tree = tree.xpath(xpath)[0]
                    article_unicode = etree.tounicode(article_tree)
                    # Keep everything from the first <p> onwards, dropping the
                    # <section> wrapper itself.
                    article_unicode = re.findall('<section.*?(<p>.*?)</section>',
                                   article_unicode, re.S)[0]
                    logger.info(article_unicode[-100:])
                    return article_unicode.strip()
                logger.info(f'responce status code: {resp.status_code}')
                logger.error('requests.get()网络连接/网络请求不正常!!!')
                return False
            except Exception as ee:
                logger.error(f'Exception: {ee}')
                return False

        elif kernel=='selenium':
            browser = webdriver.Chrome(options=chrome_options)
            browser.get(url)

            title = browser.title
            print('网页的标题是: ', title)
            html = browser.page_source # may take a while
            # v0.1 uses a crude fixed sleep; v0.2 replaces this with
            # WebDriverWait-based explicit waits.
            time.sleep(sleep_time)
            article_webelement = browser.find_element_by_xpath(xpath)
            article_text = article_webelement.text
            print('网页源码的长度和博文的长度分别是: {1} {0}'.
                  format(len(article_text), len(html))
                  )

            # SYNTAX FIX: the backslash line-continuations were stripped when
            # this code was scraped; parentheses restore the concatenation.
            pattern = ('</style></form>'
                       '(.*?)'
                       '<div class="col-md-4 book-card order-md-1">')
            a = re.findall(pattern, html, re.S)[0]
            a = re.findall(f'{anys}(<p>{anys})</section>{anys}', a, re.S)[0]
            return a
        else:
            logger.error('出错了!!! 你必须指定正确的内核. 或者selenium 或者requests!')
            return False
    
    
    
    
    
    #%%
    def get_csdn_blog(
            url='https://blog.csdn.net/Lili_0820/article/details/70155949'
            ,
            sleep_time=40
            ,
            ):
        '''
        Scrape one CSDN blog post with selenium and return its body as an
        HTML fragment starting at the first ``<p>`` inside ``#content_views``.

        Parameters
        ----------
        url : str
            Blog post address.
        sleep_time : int
            Timeout (seconds) for the explicit waits on title / page source.

        Returns
        -------
        str
            Inner HTML of the article body.

        Notes
        -----
        Plain ``requests`` returns mojibake for CSDN regardless of the
        encoding tried, hence the selenium approach.
        '''
        logger.info(f'当前的url: {url}')
        browser = webdriver.Chrome(options=chrome_options)
        try:
            browser.get(url)
            # Sticky implicit wait: element look-ups retry for up to 200s
            # before raising.  Only needs to be set once per session.
            browser.implicitly_wait(200)

            # wait...until replaces the busy-wait loop of earlier drafts and
            # guarantees the page is loaded before we read it.
            title = WebDriverWait(browser, sleep_time).until(lambda x: x.title)
            logger.info(f'提取网页标题: {title}')
            html= WebDriverWait(browser, sleep_time).until(lambda x: x.page_source)

            content = browser.find_element_by_id('content_views')
            text = content.text
            logger.info('网页源码的长度和博文的长度分别是: {1} {0}'.
                  format(len(text), len(html))
                  )

            # SYNTAX FIX: the backslash line-continuations were stripped when
            # this code was scraped; parentheses restore the concatenation.
            pattern = ('id="content_views" class="markdown_views.*?>'
                       '(.*?)'
                       '<link href="https://csdnimg.cn/release/'
                       'phoenix/mdeditor/markdown_views')
            a = re.findall(pattern, html, re.S)[0]
            a = re.findall(f'{anys}(<p>{anys})</div>{anys}', a, re.S)[0]
            return a
        finally:
            # BUG FIX: the original only closed the browser on success,
            # leaking it whenever a wait or regex extraction raised.
            browser.close()
    
    
    if __name__=='__main__':
        # Manual smoke tests — the quantstart calls are kept commented out;
        # only the CSDN scrape runs when the script is executed directly.
    #    blog2r=get_quantstart_article()
    #    blog2r=get_quantstart_article(kernel='requests')
    #    blog2r=get_quantstart_article(url='http://www.ba', kernel='requests')
    #    blog2s=get_quantstart_article(kernel='selenium')
    #    blog2=get_quantstart_article(kernel='req')
        
        blog1 = get_csdn_blog(sleep_time=80)
    
    
  • 相关阅读:
    第十五周作业
    十三周作业
    第十三周上机作业
    第十二周作业
    第十二周作业
    第十一周作业
    第十一周上机作业
    第十次上机作业
    第九周上机作业
    安卓第一周
  • 原文地址:https://www.cnblogs.com/duan-qs/p/12049934.html
Copyright © 2020-2023  润新知