• python+scrapy爬取知乎日报全站文章


    # -*- coding: utf-8 -*-
    
    # Define here the models for your scraped items
    #
    # See documentation in:
    # http://doc.scrapy.org/en/latest/topics/items.html
    
    import scrapy
    
    
    class ZhihudailyItem(scrapy.Item):
        """Scrapy item declaring the fields collected for one article.

        Every attribute is a plain ``scrapy.Field()`` with no metadata.
        """
        # NOTE(review): field meanings below are inferred from their names --
        # confirm against the code that populates the item.
        # presumably the publication date of the article
        date=scrapy.Field()
        # presumably the article headline
        title=scrapy.Field()
        # presumably the address of the article page
        url=scrapy.Field()
        # presumably the article body text
        content=scrapy.Field()
    

      

    #!/usr/bin/python
    #coding:utf-8
    import scrapy
    
    class ZhihudailySpider(scrapy.Spider):
        """Crawl the zhihudaily.ahorn.me index pages and print each linked article.

        ``parse`` handles an index page and schedules one request per article
        link; ``parse_url`` handles an article page and prints its headline and
        paragraph text to stdout.
        """

        name = 'zhihudaily'

        # The attribute was misspelled ``allowd_domains``, so Scrapy ignored it.
        # The index host is listed too, otherwise the offsite filter would drop
        # the paginated index requests.
        # NOTE(review): article links are presumed to live under zhihu.com --
        # confirm, since links to any other host are now filtered out.
        allowed_domains = ['zhihu.com', 'zhihudaily.ahorn.me']

        # Index pages 1..499 are enumerated once here. Previously every index
        # response re-yielded requests for pages 2..499, leaving the duplicate
        # filter to discard them again and again.
        start_urls = [
            "http://zhihudaily.ahorn.me/page/%d" % page for page in range(1, 500)]

        def parse(self, response):
            """Yield a request for the first link found in each article entry.

            :param response: response for one index page.
            :returns: generator of ``scrapy.Request`` objects whose callback is
                ``parse_url``.
            """
            for post in response.xpath("//div[@class='post']"):
                for entry in post.xpath("./div/div"):
                    hrefs = entry.xpath("./a/@href").extract()
                    if not hrefs:
                        # Entry without a direct <a href> child: nothing to
                        # follow (this used to raise IndexError).
                        continue
                    yield scrapy.Request(hrefs[0], callback=self.parse_url)

        def parse_url(self, response):
            """Print the headline and paragraph text of one article page.

            :param response: response for one article page.
            """
            titles = response.xpath(
                "//h1[@class='headline-title']/text()").extract()
            if not titles:
                # Page without the expected headline node: report and skip
                # instead of failing with IndexError.
                self.log("no headline found on %s" % response.url)
                return

            # Single-argument print(...) is valid on both Python 2 and 3 and
            # emits the same text as the former ``print "标题:", title``.
            print(u"标题: " + titles[0])
            print("*************************************************************************")
            for paragraph in response.xpath(
                    "//div[@class='content']/p/text()").extract():
                print(paragraph)
  • 相关阅读:
    git把dev部分提交过的内容合并到master
    shell命令修改文件内容
    js时间格式化
    js判断对象是否为空
    JS数组遍历方法
    批量修改文件后缀
    curl实现put请求
    lumen伪静态路由设置示例
    nginx client_body_buffer_size
    nginx模块开发
  • 原文地址:https://www.cnblogs.com/tmyyss/p/4551974.html
Copyright © 2020-2023  润新知