• 爬取知名社区技术文章_article_3


    爬虫主逻辑处理,获取字段,获取主url和子url

    #!/usr/bin/python3
    # -*- coding: utf-8 -*-
    
    import scrapy
    from scrapy.http import Request
    from urllib import parse
    from JobBole.items import JobboleItem, ArticleItemLoader
    
    
    class ExampleSpider(scrapy.Spider):
        """Spider that crawls article listing and detail pages on blog.jobbole.com.

        ``parse`` walks the paginated listing pages and schedules one request
        per article; ``analysie_go`` extracts the article fields into an item.
        """

        name = 'jobbole'
        # allowed_domains = ['example.com']
        # Listing page to start crawling from.
        start_urls = ['http://blog.jobbole.com/all-posts/']

        def parse(self, response):
            """Parse a listing page.

            Yields one ``Request`` per article detail page (carrying the
            thumbnail URL via ``meta``), then follows the pagination link
            back into this method.
            """
            # Every thumbnail anchor on the listing page: one per article.
            article_nodes = response.css('.post.floated-thumb .post-thumb a')
            for node in article_nodes:
                img_url = node.css('img::attr(src)').extract_first('')
                cont_url = node.css('::attr(href)').extract_first('')
                # Guard: with an empty href, urljoin(response.url, '') would
                # resolve to the listing page itself and schedule a useless
                # self-request, so skip such nodes outright.
                if not cont_url:
                    continue
                # urljoin handles relative hrefs; meta forwards the thumbnail
                # URL to the detail-page callback.
                yield Request(url=parse.urljoin(response.url, cont_url),
                              meta={'img_url': img_url},
                              callback=self.analysie_go)
            # Follow the "next page" link, if present, and re-enter parse().
            next_page_url = response.css('.next.page-numbers::attr(href)').extract_first('')
            if next_page_url:
                yield Request(url=parse.urljoin(response.url, next_page_url), callback=self.parse)

        def analysie_go(self, response):
            """Parse an article detail page and yield a populated item.

            Uses ``ArticleItemLoader`` to collect every field; the thumbnail
            URL arrives through ``response.meta`` from :meth:`parse`.
            """
            # Thumbnail URL forwarded from the listing page ('0' if absent).
            img_url = response.meta.get('img_url', '0')
            loader = ArticleItemLoader(item=JobboleItem(), response=response)
            loader.add_value('img_url', img_url)
            loader.add_value('cont_url', response.url)
            # cont_id is derived from the URL by the item pipeline/loader.
            loader.add_value('cont_id', response.url)
            loader.add_css('title', '.entry-header h1::text')
            loader.add_css('publish_time', '.entry-meta-hide-on-mobile::text')
            loader.add_xpath('cont', '//div[@class="entry"]//text()')
            loader.add_css('link_num', '.vote-post-up h10::text')
            loader.add_css('collection_num', '.bookmark-btn::text')
            loader.add_css('comment_num', '.post-adds a span::text')
            # Hand the collected fields over to the item pipeline.
            yield loader.load_item()
    

      

  • 相关阅读:
    pandas,对dataFrame中某一个列的数据进行处理
    pandas的简单使用
    pandas,读取或存储DataFrames的数据到mysql中
    pandas,pd.ExcelWriter保存结果到已存在的excel文件中
    用命令让vbox的虚拟硬盘文件转换成vmware的vmdk
    vbox磁盘空间如何扩容
    Linux ext3/ext4数据恢复
    VirtualBox安装64位系统
    ubuntu 12.04启用休眠
    美化你的GRUB,全面支持中文(菜单、提示、帮助)适用7.04-9.04
  • 原文地址:https://www.cnblogs.com/2bjiujiu/p/7233314.html
Copyright © 2020-2023  润新知