• python-scrapy: Crawling the Tencent Careers Site


    After working through the basics of Python web scraping, I was eager to try it on something real. I read a few other people's blog posts, ran some experiments of my own, and below I walk through using the Scrapy framework to crawl Tencent's recruitment site (hr.tencent.com) and save the data to MongoDB.

    First, with Scrapy installed, open cmd in the directory where the project should live and run: scrapy startproject <project name>. For this demo the project is named tecent.

    Next comes the code itself, starting with the spider class:

    import scrapy
    from bs4 import BeautifulSoup
    from tecent import items
    
    class tecentSpider(scrapy.Spider):
    
        # spider name (used by `scrapy crawl`)
        name = 'tecentSpider'
        # crawl scope
        allowed_domains = ["hr.tencent.com"]
        # entry page
        start_url = 'https://hr.tencent.com/position.php'
        # prefix for turning the relative links on the pages into absolute URLs
        get_url = 'https://hr.tencent.com/'
        # listing URL with a page offset (kept for reference; the offset is appended in parse_city_page)
        base_url = 'https://hr.tencent.com/position.php?&start='
    
        def start_requests(self):
            yield scrapy.Request(self.start_url, self.parse)
    
        def parse(self, response):
            html = BeautifulSoup(response.text, 'lxml')
            # city filter links
            add_list = html.select('#additems > a')
            # job-type filter links
            type_list = html.select('#searchrow3 > .pl9 > a')
            for add in add_list:
                # skip the "全部" (All) link
                if add.get_text() != '全部':
                    itemAdd = items.cityItem()
                    itemAdd['addressUrl'] = add.attrs.get('href')
                    itemAdd['addressName'] = add.get_text()
                    yield itemAdd
                    yield scrapy.Request(self.get_url + add.attrs.get('href'), self.parse_city_page)
    
            for job_type in type_list:
                if job_type.get_text() != '全部':
                    itemType = items.typeItem()
                    itemType['typeUrl'] = job_type.attrs.get('href')
                    itemType['typeName'] = job_type.get_text()
                    yield itemType
    
        def parse_city_page(self, response):
            html = BeautifulSoup(response.text, 'lxml')
            page_list = html.select('.pagenav > a')
            # the second-to-last pager link holds the last page number
            max_page = int(page_list[-2].get_text()) + 1
            city_url = self.get_url + html.select('#searchrow2 > #additems > .active')[0].attrs.get('href')
            for i in range(1, max_page):
                # build the URL of each listing page (10 postings per page)
                url = city_url + '&start=' + str((i - 1) * 10)
                yield scrapy.Request(url, self.parse_page_data)
    
        def parse_page_data(self, response):
            html = BeautifulSoup(response.text, 'lxml')
            tr_list = html.select('.tablelist > tr')
            for tr in tr_list:
                # skip the header ('h') and footer ('f') rows of the listing table
                if tr.attrs.get('class', [''])[0] not in ('h', 'f'):
                    item = items.TecentItem()
                    item['name'] = tr.select('.square > a')[0].get_text()
                    item['type'] = tr.select('td')[1].get_text()
                    item['personNum'] = tr.select('td')[2].get_text()
                    item['address'] = tr.select('td')[3].get_text()
                    item['time'] = tr.select('td')[4].get_text()
                    item['pageUrl'] = self.get_url + tr.select('.square > a')[0].attrs.get('href')
                    yield item
                    yield scrapy.Request(item['pageUrl'], self.parse_item_detail)
    
        def parse_item_detail(self, response):
            html = BeautifulSoup(response.text, 'lxml')
            tr_list = html.select('.tablelist > tr')
            item = items.TecentItemDetail()
            item['title'] = tr_list[0].select('td')[0].get_text()
            # cells read like "工作地点：深圳"; keep the part after the full-width colon
            item['address'] = tr_list[1].select('td')[0].get_text().split('：')[-1]
            item['type'] = tr_list[1].select('td')[1].get_text().split('：')[-1]
            item['num'] = tr_list[1].select('td')[2].get_text().split('：')[-1]
            item['task'] = tr_list[2].select('.squareli')[0].get_text()
            item['require'] = tr_list[3].select('.squareli')[0].get_text()
            yield item
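
    As a quick sanity check of the pagination arithmetic in parse_city_page (the page count and the lid value below are purely illustrative): the second-to-last link in the .pagenav block holds the highest page number, and each listing page advances the start query parameter by 10.

    # illustrative only: a city whose pager ends at page 4 produces start offsets 0, 10, 20, 30
    city_url = 'https://hr.tencent.com/position.php?lid=XXXX'  # hypothetical city link
    max_page = 4 + 1
    for i in range(1, max_page):
        print(city_url + '&start=' + str((i - 1) * 10))
    # prints ...&start=0, ...&start=10, ...&start=20, ...&start=30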

    The spider above has five methods: start_requests plus four parse callbacks, each yielding its own item type. I parse the HTML with BeautifulSoup, which I find pleasant to use, but this is really a matter of personal taste (see the sketch below for the same thing with Scrapy's own selectors).
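
    If you would rather not pull in an extra dependency, Scrapy's built-in selectors do the same job. Below is a minimal sketch of the city-link part of parse rewritten with response.css, assuming the same page structure (only this one callback is shown):

    def parse(self, response):
        # same extraction as above, but with response.css instead of BeautifulSoup
        # (.get() and .attrib need a reasonably recent Scrapy; older versions use
        #  .extract_first() and .xpath('@href') instead)
        for add in response.css('#additems > a'):
            name = add.css('::text').get()
            if name != '全部':  # skip the "All" link
                itemAdd = items.cityItem()
                itemAdd['addressUrl'] = add.attrib.get('href')
                itemAdd['addressName'] = name
                yield itemAdd
                yield scrapy.Request(self.get_url + add.attrib.get('href'), self.parse_city_page)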

    Next come the item definitions. scrapy startproject already generates an items.py, so we only need to fill it in:

    # -*- coding: utf-8 -*-
    
    # Define here the models for your scraped items
    #
    # See documentation in:
    # https://doc.scrapy.org/en/latest/topics/items.html
    
    import scrapy
    
    # job posting listed on a search-results page
    class TecentItem(scrapy.Item):
        # position name
        name = scrapy.Field()
        # position category
        type = scrapy.Field()
        # number of openings
        personNum = scrapy.Field()
        # location
        address = scrapy.Field()
        # publish date
        time = scrapy.Field()
        # URL of the detail page
        pageUrl = scrapy.Field()
    
    # recruiting city
    class cityItem(scrapy.Item):
        # listing URL for the city
        addressUrl = scrapy.Field()
        # city name
        addressName = scrapy.Field()
    
    # job category
    class typeItem(scrapy.Item):
        # listing URL for the category
        typeUrl = scrapy.Field()
        # category name
        typeName = scrapy.Field()
    
    # detail of a single posting
    class TecentItemDetail(scrapy.Item):
        # posting title
        title = scrapy.Field()
        # work location
        address = scrapy.Field()
        # position category
        type = scrapy.Field()
        # number of openings
        num = scrapy.Field()
        # responsibilities
        task = scrapy.Field()
        # requirements
        require = scrapy.Field()

    The items in items.py should be defined around whatever you actually want to scrape; I define four here, and the comments explain each field. As the quick check below shows, an item behaves much like a dict, which is exactly what the pipeline relies on later.
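
    A quick check of how these items behave (the values are made up for illustration): an Item supports dict-style assignment and converts cleanly to a plain dict, which is what the pipeline does before writing to MongoDB.

    from tecent import items
    
    item = items.cityItem()
    item['addressUrl'] = 'position.php?lid=XXXX'  # hypothetical href
    item['addressName'] = '深圳'
    print(dict(item))  # {'addressUrl': 'position.php?lid=XXXX', 'addressName': '深圳'}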

    Scrapy then passes every yielded item to pipelines.py for processing. How you handle it there depends on your own needs; in this demo the pipeline writes everything to MongoDB:

    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    import pymongo
    from scrapy.utils.project import get_project_settings
    # pipeline that saves the scraped items
    class TecentPipeline(object):
    
        def __init__(self):
            self.settings = get_project_settings()
            # connect to MongoDB
            self.client = pymongo.MongoClient(host=self.settings['MONGO_HOST'], port=self.settings['MONGO_PORT'])
            # uncomment if the database requires authentication
            # self.client.admin.authenticate(self.settings['MONGO_USER'], self.settings['MONGO_PSW'])
            self.db = self.client[self.settings['MONGO_DB']]
    
        def process_item(self, item, spider):
            data = dict(item)
            # pick the target collection based on the item class
            if item.__class__.__name__ == 'TecentItem':
                self.coll = self.db[self.settings['MONGO_COLL_ITEM']]
            if item.__class__.__name__ == 'TecentItemDetail':
                self.coll = self.db[self.settings['MONGO_COLL_ITEM_DETAIL']]
            if item.__class__.__name__ == 'cityItem':
                self.coll = self.db[self.settings['MONGO_COLL_ADDRESS']]
            if item.__class__.__name__ == 'typeItem':
                self.coll = self.db[self.settings['MONGO_COLL_TYPE']]
    
            # insert() is deprecated/removed in recent pymongo versions; use insert_one()
            self.coll.insert_one(data)
            return item
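
    The pipeline above opens a MongoDB connection in __init__ and never closes it, and the chain of if statements grows with every new item type. A slightly tidier variant is sketched below (the class name TecentMongoPipeline is made up for this sketch; it is not part of the original project): it uses the open_spider/close_spider hooks and a class-name-to-setting mapping. If you swap it in, point ITEM_PIPELINES at tecent.pipelines.TecentMongoPipeline instead.

    import pymongo
    from scrapy.utils.project import get_project_settings
    
    class TecentMongoPipeline(object):
    
        # map each item class name to the settings key holding its collection name
        COLL_KEYS = {
            'TecentItem': 'MONGO_COLL_ITEM',
            'TecentItemDetail': 'MONGO_COLL_ITEM_DETAIL',
            'cityItem': 'MONGO_COLL_ADDRESS',
            'typeItem': 'MONGO_COLL_TYPE',
        }
    
        def open_spider(self, spider):
            self.settings = get_project_settings()
            self.client = pymongo.MongoClient(host=self.settings['MONGO_HOST'],
                                              port=self.settings['MONGO_PORT'])
            self.db = self.client[self.settings['MONGO_DB']]
    
        def close_spider(self, spider):
            # release the connection when the crawl finishes
            self.client.close()
    
        def process_item(self, item, spider):
            coll_name = self.settings[self.COLL_KEYS[item.__class__.__name__]]
            self.db[coll_name].insert_one(dict(item))
            return item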

    Note: every parse callback in the spider yields its item after filling in the fields. This is essential: without the yield, the item never arrives at process_item in pipelines.py. The minimal sketch below shows what the engine does with each yielded object.
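
    To make the mechanism concrete, here is a minimal sketch (a hypothetical callback, not one of the methods above): the callback is a generator, and Scrapy iterates over everything it yields, sending Items to the item pipelines and Requests back to the scheduler.

    def parse_example(self, response):
        item = items.cityItem()
        item['addressName'] = '深圳'  # illustrative value
        yield item  # handed to pipelines.py (process_item)
        yield scrapy.Request(self.start_url, callback=self.parse)  # queued for crawling
        # without these yields, nothing would ever reach process_item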

    Finally, some configuration is needed: the MongoDB connection details and enabling the pipeline. This all lives in settings.py:

    # -*- coding: utf-8 -*-
    
    # Scrapy settings for tecent project
    #
    # For simplicity, this file contains only settings considered important or
    # commonly used. You can find more settings consulting the documentation:
    #
    #     https://doc.scrapy.org/en/latest/topics/settings.html
    #     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
    #     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
    
    BOT_NAME = 'tecent'
    
    SPIDER_MODULES = ['tecent.spiders']
    NEWSPIDER_MODULE = 'tecent.spiders'
    
    
    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    #USER_AGENT = 'tecent (+http://www.yourdomain.com)'
    
    # Obey robots.txt rules
    ROBOTSTXT_OBEY = True
    
    MONGO_HOST = "127.0.0.1"  # MongoDB host
    MONGO_PORT = 27017  # port
    MONGO_DB = "tecent"  # database name
    MONGO_COLL_TYPE = "tecent_type"
    MONGO_COLL_ADDRESS = "tecent_address"
    MONGO_COLL_ITEM = "tecent_item"
    MONGO_COLL_ITEM_DETAIL = "tecent_item_detail"
    
    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    #CONCURRENT_REQUESTS = 32
    
    # Configure a delay for requests for the same website (default: 0)
    # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
    # See also autothrottle settings and docs
    #DOWNLOAD_DELAY = 3
    # The download delay setting will honor only one of:
    #CONCURRENT_REQUESTS_PER_DOMAIN = 16
    #CONCURRENT_REQUESTS_PER_IP = 16
    
    # Disable cookies (enabled by default)
    #COOKIES_ENABLED = False
    
    # Disable Telnet Console (enabled by default)
    #TELNETCONSOLE_ENABLED = False
    
    # Override the default request headers:
    #DEFAULT_REQUEST_HEADERS = {
    #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    #   'Accept-Language': 'en',
    #}
    
    # Enable or disable spider middlewares
    # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
    #SPIDER_MIDDLEWARES = {
    #    'tecent.middlewares.TecentSpiderMiddleware': 543,
    #}
    
    # Enable or disable downloader middlewares
    # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
    #DOWNLOADER_MIDDLEWARES = {
    #    'tecent.middlewares.TecentDownloaderMiddleware': 543,
    #}
    
    # Enable or disable extensions
    # See https://doc.scrapy.org/en/latest/topics/extensions.html
    #EXTENSIONS = {
    #    'scrapy.extensions.telnet.TelnetConsole': None,
    #}
    
    # Configure item pipelines
    # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    ITEM_PIPELINES = {
        'tecent.pipelines.TecentPipeline': 1,
    }
    
    # Enable and configure the AutoThrottle extension (disabled by default)
    # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
    #AUTOTHROTTLE_ENABLED = True
    # The initial download delay
    #AUTOTHROTTLE_START_DELAY = 5
    # The maximum download delay to be set in case of high latencies
    #AUTOTHROTTLE_MAX_DELAY = 60
    # The average number of requests Scrapy should be sending in parallel to
    # each remote server
    #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
    # Enable showing throttling stats for every response received:
    #AUTOTHROTTLE_DEBUG = False
    
    # Enable and configure HTTP caching (disabled by default)
    # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
    #HTTPCACHE_ENABLED = True
    #HTTPCACHE_EXPIRATION_SECS = 0
    #HTTPCACHE_DIR = 'httpcache'
    #HTTPCACHE_IGNORE_HTTP_CODES = []
    #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

    That's essentially all the code. The last step is a small launcher script, begin.py; if it doesn't exist, create it in the project directory (the same place as scrapy.cfg, so the project settings are picked up):

    from scrapy import cmdline
    
    cmdline.execute(['scrapy', 'crawl', 'tecentSpider'])
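
    This is equivalent to running scrapy crawl tecentSpider from the project root. If you prefer not to go through cmdline, a CrawlerProcess-based launcher works too (a sketch relying on the same project settings):

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings
    
    # the spider is looked up by its name, so settings.py (SPIDER_MODULES) must be loadable
    process = CrawlerProcess(get_project_settings())
    process.crawl('tecentSpider')
    process.start()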

    Finally, the overall project structure (sketched below):
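
    A rough sketch of the finished layout, assuming begin.py sits next to scrapy.cfg and the spider lives in its own file under spiders/ (the file name tecentSpider.py is just my choice, nothing enforces it):

    tecent/
    ├── scrapy.cfg
    ├── begin.py                  # the launcher above
    └── tecent/
        ├── __init__.py
        ├── items.py              # the four item classes
        ├── middlewares.py
        ├── pipelines.py          # TecentPipeline
        ├── settings.py           # MongoDB settings, ITEM_PIPELINES
        └── spiders/
            ├── __init__.py
            └── tecentSpider.py   # the spider class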

    The code is for reference only; I'm still learning, so any feedback is welcome!
