• Crawling the domestic section of NetEase News with Selenium and headless Chrome


    Page analysis

     First, let's take a look at the page we want to crawl.

    Check the page source: you will find that the content is rendered dynamically by JavaScript.
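
    A quick way to confirm this: fetch the page with plain requests and check whether the news-item markup is already there (a minimal sketch; the news_title class is the one used in the XPath later in this post):

    import requests

    raw = requests.get('https://news.163.com/domestic/',
                       headers={'User-Agent': 'Mozilla/5.0'}).text
    # If the list is injected by JavaScript, the per-item markup (the news_title
    # divs parsed later) will not show up in the raw HTML.
    print('news_title' in raw)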

    So we use Selenium with headless Chrome to crawl it.

    1 Load the site and scroll to the bottom; you will notice a "load more" button.

     2 Simulate a click on it, then scroll to the bottom again, and the whole page is loaded.

    Sample code
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.common.by import By
    from time import sleep
    from lxml import etree
    import os
    import requests
    
    # Use headless Chrome to render the dynamically loaded JS
    def main():
        # Create a headless browser instance
        chrome_options = Options()
        # Run Chrome in headless mode
        chrome_options.add_argument('--headless')
        # Needed when running on Windows
        chrome_options.add_argument('--disable-gpu')
        browser = webdriver.Chrome(options=chrome_options)
        # Set a 10-second implicit wait
        browser.implicitly_wait(10)
        browser.get(url)
        sleep(1)
        # Scroll to the bottom of the page
        browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
        # Click the "load more" button
        browser.find_element(By.CSS_SELECTOR, '.load_more_btn').click()
        sleep(1)
        # Scroll to the bottom again
        browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
        # Grab the rendered page source
        source = browser.page_source
        browser.quit()
        # Save the rendered HTML, then parse it
        with open('xinwen.html', 'w', encoding='utf-8') as f:
            f.write(source)
        parse_page(source)
    
    # Parse the news list page
    def parse_page(html):
        # Build an lxml etree from the HTML
        tree = etree.HTML(html)
        new_lst = tree.xpath('//div[@class="ndi_main"]/div')
        for one_new in new_lst:
            title = one_new.xpath('.//div[@class="news_title"]/h3/a/text()')[0]
            link = one_new.xpath('.//div[@class="news_title"]/h3/a/@href')[0]
            write_in(title, link)
    
    # Fetch each article and write it to a file
    def write_in(title, link):
        print('Writing article: {}'.format(title))
        response = requests.get(url=link, headers=headers)
        tree = etree.HTML(response.text)
        content_lst = tree.xpath('//div[@class="post_text"]//p')
        title = title.replace('?', '')
        with open('new/' + title + '.txt', 'a+', encoding='utf-8') as f:
            for one_content in content_lst:
                if one_content.text:
                    con = one_content.text.strip()
                    f.write(con + '\n')
    
    
    if __name__ == '__main__':
        url = 'https://news.163.com/domestic/'
        headers = {"User-Agent": 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'}
        if not os.path.exists('new'):
            os.mkdir('new')
        main()
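
    One note on the design: the fixed sleep(1) calls work, but they can be flaky on a slow connection. A sketch of a more robust alternative, assuming the button keeps the .load_more_btn selector (browser and By are the ones from the script above):

    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    # Wait up to 10 seconds for the "load more" button to become clickable,
    # then click it, instead of sleeping for a fixed amount of time.
    btn = WebDriverWait(browser, 10).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, '.load_more_btn')))
    btn.click()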

     The result:

    Open any one of the generated txt files:

    Scrapy version
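
    The Scrapy version splits the same logic over a spider, a downloader middleware (which drives the headless browser), and an item pipeline. Assuming the project was created with scrapy startproject happy1, the files below sit in the usual layout:

    happy1/
        scrapy.cfg
        happy1/
            items.py
            middlewares.py
            pipelines.py
            settings.py
            spiders/
                wangyi.py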

    wangyi.py

    # -*- coding: utf-8 -*-
    import scrapy
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from happy1.items import Happy1Item
    
    class WangyiSpider(scrapy.Spider):
        name = 'wangyi'
        # allowed_domains = ['https://news.163.com/domestic/']
        start_urls = ['http://news.163.com/domestic/']
    
        def __init__(self):
            # Create a headless browser instance
            chrome_options = Options()
            # Run Chrome in headless mode
            chrome_options.add_argument('--headless')
            # Needed when running on Windows
            chrome_options.add_argument('--disable-gpu')
            # Instantiate the browser object once and reuse it for the whole spider
            self.bro = webdriver.Chrome(options=chrome_options)
    
        def parse(self, response):
            new_lst = response.xpath('//div[@class="ndi_main"]/div')
            for one_new in new_lst:
                item = Happy1Item()
                title = one_new.xpath('.//div[@class="news_title"]/h3/a/text()')[0].extract()
                link = one_new.xpath('.//div[@class="news_title"]/h3/a/@href')[0].extract()
                item['title'] = title
                yield scrapy.Request(url=link, callback=self.parse_detail, meta={'item': item})
    
        def parse_detail(self, response):
            item = response.meta['item']
            content_list = response.xpath('//div[@class="post_text"]//p/text()').extract()
            item['content'] = content_list
            yield item
    
        # Close the browser when the spider finishes
        def close(self, spider):
            print('Spider finished')
            self.bro.quit()

    pipelines.py
    class Happy1Pipeline(object):
        def __init__(self):
            self.fp = None
    
        def open_spider(self, spider):
            print('Spider started')
    
        def process_item(self, item, spider):
            title = item['title'].replace('?', '')
            self.fp = open('news/' + title + '.txt', 'a+', encoding='utf-8')
            for one in item['content']:
                self.fp.write(one.strip() + '\n')
            self.fp.close()
            return item

    items.py
    import scrapy
    
    
    class Happy1Item(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        title = scrapy.Field()
        content = scrapy.Field()

    middlewares.py
        def process_response(self, request, response, spider):
            if request.url in ['http://news.163.com/domestic/']:
                spider.bro.get(url=request.url)
                time.sleep(1)
                spider.bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
                spider.bro.find_element(By.CSS_SELECTOR, '.load_more_btn').click()
                time.sleep(1)
                spider.bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
                page_text = spider.bro.page_source
                return HtmlResponse(url=spider.bro.current_url, body=page_text, encoding='utf-8', request=request)
            else:
                return response
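
    For the snippet above to actually run, middlewares.py also needs a few imports, and the method lives inside the downloader middleware class that settings.py enables below; a minimal sketch of that surrounding context:

    import time

    from scrapy.http import HtmlResponse
    from selenium.webdriver.common.by import By


    class Happy1DownloaderMiddleware(object):
        # Only process_response (shown above) is customised; the other methods
        # generated by `scrapy startproject` can stay unchanged.
        def process_response(self, request, response, spider):
            ...
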
    settings.py
    DOWNLOADER_MIDDLEWARES = {
       'happy1.middlewares.Happy1DownloaderMiddleware': 543,
    }
    
    ITEM_PIPELINES = {
       'happy1.pipelines.Happy1Pipeline': 300,
    }
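
    Two settings not shown above may also matter for a crawl like this (these are assumptions, not part of the original project): disabling robots.txt enforcement and setting a browser-like User-Agent. The spider is then run from the project root with scrapy crawl wangyi.

    # Assumed additions to settings.py, not from the original post:
    ROBOTSTXT_OBEY = False
    USER_AGENT = 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'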

     The result

    Summary:

    1 Most of the work is really about simulating browser actions.

    2 There are other ways to handle dynamically loaded JS as well; one of them is sketched after this list.

    3 There are many ways to write a crawler; the point is to pick the one that suits you.

    4 My own code is still pretty rough.
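
    The sketch below illustrates point 2: find the XHR request that fills the news list in the browser's Network tab and call it directly with requests, skipping the browser entirely. The URL here is only a placeholder; the real endpoint has to be copied from the developer tools.

    import requests

    # Hypothetical placeholder URL: use the actual XHR endpoint from the Network tab.
    api_url = 'https://news.163.com/placeholder/news-list-api'
    resp = requests.get(api_url, headers={'User-Agent': 'Mozilla/5.0'})
    # Endpoints like this usually return JSON (sometimes wrapped in a JSONP
    # callback), which can be parsed without rendering any JavaScript.
    print(resp.text[:200])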

  • Original post: https://www.cnblogs.com/xiaozx/p/10744604.html