• scrapy项目3


    # -*- coding: utf-8 -*-
    import scrapy
    
    #导入items
    from tencent.items import TencentItem
    
    class HrSpider(scrapy.Spider):
        """Crawl Tencent HR job listings and follow pagination until the last page."""
        name = 'hr'
        allowed_domains = ['tencent.com']
        start_urls = ['https://hr.tencent.com/position.php']

        def parse(self, response):
            """Yield one TencentItem per job row, then request the next page."""
            # Slice off the header row (first) and the pagination row (last).
            tr_list = response.xpath("//table[@class='tablelist']/tr")[1:-1]

            for tr in tr_list:
                # Keys must match the fields declared on TencentItem.
                item = TencentItem()
                item["title"] = tr.xpath("./td[1]/a/text()").extract_first()
                item["position"] = tr.xpath("./td[4]/text()").extract_first()
                item["publish_date"] = tr.xpath("./td[5]/text()").extract_first()
                yield item

            # On the last page the "next" link is inert:
            #   <a href="javascript:;" class="noactive" id="next">下一页</a>
            next_url = response.xpath("//a[@id='next']/@href").extract_first()
            # BUGFIX: the inert href is "javascript:;", so the old check
            # `next_url != "javascript"` was always true and the spider kept
            # requesting an invalid URL. Use a prefix check and guard against
            # the link being absent entirely.
            if next_url and not next_url.startswith("javascript"):
                yield scrapy.Request(
                    response.urljoin(next_url),
                    callback=self.parse
                )

    pipelines.py

    # mogodb数据库
    # from pymongo import MongoClient
    # client = MongoClient()
    # collection = client["tencent"]["hr"]
    #导入items
    from tencent.items import TencentItem
    class TencentPipeline(object):
        """Pipeline handling items emitted by the Tencent HR spider."""

        def process_item(self, item, spider):
            """Print Tencent items; pass every item through unchanged."""
            # Only act on items from the Tencent spider — other item types
            # (ChoutiItem, JdItem, ...) fall through to the return below.
            if isinstance(item, TencentItem):
                print(item)
            # collection.insert(dict(item))  # enable to persist into MongoDB
            return item

    items.py

    import scrapy
    
    # 可以定义多个item对应不同的爬虫项目字段,比如怕京东,抽屉,汽车之家
    #然后再pipelines中做判断
    class TencentItem(scrapy.Item): # scrapy.Item behaves like a dict
        """Fields for one Tencent HR job posting; keys used by HrSpider.parse."""
        # define the fields for your item here like:
        # name = scrapy.Field()
        num = scrapy.Field()
        title = scrapy.Field()   # scrapy.Field() is itself a plain dict
        position = scrapy.Field()
        publish_date = scrapy.Field()
    
    
    class ChoutiItem(scrapy.Item):
        """Fields for a Chouti crawler; pipelines can dispatch on item type."""
        # define the fields for your item here like:
        # name = scrapy.Field()
        title = scrapy.Field()
        position = scrapy.Field()
        publish_date = scrapy.Field()
    
    
    class JdItem(scrapy.Item):
        """Fields for a JD crawler; pipelines can dispatch on item type."""
        # define the fields for your item here like:
        # name = scrapy.Field()
        title = scrapy.Field()
        position = scrapy.Field()
        publish_date = scrapy.Field()

    settings.py

    # Suppress scrapy's INFO/DEBUG output; only warnings and errors are logged.
    LOG_LEVEL = "WARNING"
    
    
    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'

     项目地址:https://github.com/CH-chen/tencent

  • 相关阅读:
    mongo复制集
    s10d71_分组查询_分页_cookie_session_装饰器
    s10_part3_django_ORM_查询相关_非常重要
    s10_part3_django_html模板_view_model
    s10_part3_django_project_library
    记录替换calico为flannel的过程
    kubelet证书过期解决方法
    css
    ubuntu 20.04 ibus添加五笔输入法
    马哥k8s
  • 原文地址:https://www.cnblogs.com/chvv/p/10332460.html
Copyright © 2020-2023  润新知