• Crawling Lagou.com with Scrapy


    1. Using templates

    When generating a spider, Scrapy also lets you specify which template to build it from.

    The default command for creating a spider file:

    scrapy genspider <spider name> <domain>

    You can list the available templates with the scrapy genspider --list command:

    $ scrapy genspider --list
    Available templates:
      basic
      crawl
      csvfeed
      xmlfeed

    Generate the Lagou spider file from the crawl template:

    $ scrapy genspider -t crawl lagou www.lagou.com
    
    Created spider 'lagou' using template 'crawl' in module:
      ArticleSpider.spiders.lagou
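
    For reference, the stub that the crawl template generates looks roughly like this before any editing (a sketch; the exact skeleton varies slightly across Scrapy versions):

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule


    class LagouSpider(CrawlSpider):
        name = 'lagou'
        allowed_domains = ['www.lagou.com']
        start_urls = ['http://www.lagou.com/']

        rules = (
            # Placeholder rule to be replaced with real URL patterns
            Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
        )

        def parse_item(self, response):
            i = {}
            # i['name'] = response.xpath('//div[@id="name"]').extract()
            return i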

    2. Writing lagou.py

    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    from ..items import LagouJobItemLoader, LagouJobItem
    from ..utils.common import get_md5
    from datetime import datetime
    
    
    class LagouSpider(CrawlSpider):
        name = 'lagou'
        allowed_domains = ['www.lagou.com']
        start_urls = ['https://www.lagou.com/']
    
        rules = (
            # Rule(LinkExtractor(allow=r'zhaopin/.*'), follow=True),
            # Rule(LinkExtractor(allow=r'gongsi/j\d+\.html'), follow=True),
            Rule(LinkExtractor(allow=r'jobs/\d+\.html'), callback='parse_job', follow=True),
        )
    
        # These two CrawlSpider hooks can be overridden here if needed
        # def parse_start_url(self, response):
        #     return []
        #
        # def process_results(self, response, results):
        #     return results
    
        def parse_job(self, response):
            # Parse a Lagou job posting page
            item_loader = LagouJobItemLoader(item=LagouJobItem(), response=response)
            item_loader.add_css('title', '.job-name::attr(title)')
            item_loader.add_value('url', response.url)
            item_loader.add_value('url_object_id', get_md5(response.url))
            item_loader.add_css('salary', '.job_request .salary::text')
            item_loader.add_xpath('job_city', '//*[@class="job_request"]/p/span[2]/text()')
            item_loader.add_xpath('work_years', '//*[@class="job_request"]/p/span[3]/text()')
            item_loader.add_xpath('degree_need', '//*[@class="job_request"]/p/span[4]/text()')
            item_loader.add_xpath('job_type', '//*[@class="job_request"]/p/span[5]/text()')
    
            item_loader.add_css('tags', '.position-label li::text')
            item_loader.add_css('publish_time', '.publish_time::text')
            item_loader.add_css('job_advantage', '.job-advantage p::text')
            item_loader.add_css('job_desc', '.job_bt div')
            item_loader.add_css('job_addr', '.work_addr')
            item_loader.add_css('company_name', '#job_company dt a img::attr(alt)')
            item_loader.add_css('company_url', '#job_company dt a::attr(href)')
            item_loader.add_value('crawl_time', datetime.now())
    
            job_item = item_loader.load_item()
    
            return job_item
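
    The spider imports get_md5 from utils/common.py, which the original post doesn't list; a minimal sketch of what that helper needs to do (turn a URL into a fixed-length hex digest usable as a row ID):

    # utils/common.py -- minimal sketch; the real helper may differ
    import hashlib


    def get_md5(url):
        if isinstance(url, str):
            url = url.encode("utf-8")   # md5 works on bytes, not str
        return hashlib.md5(url).hexdigest()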

    3. Writing items.py

    import scrapy
    from scrapy.loader import ItemLoader
    from scrapy.loader.processors import MapCompose, TakeFirst, Join
    from w3lib.html import remove_tags
    from ArticleSpider.settings import SQL_DATE_FORMAT, SQL_DATETIME_FORMAT
    
    
    def remove_splash(value):
        # Strip the "/" separator and surrounding whitespace (e.g. from the job city)
        return value.replace("/", "").strip()
    
    
    def handle_jobaddr(value):
        addr_list = value.split("\n")
        # addr = []
        # for item in addr_list:
        #     if item.strip() != "查看地图":
        #         addr.append(item.strip())
        # return ''.join(addr)
        # Shorter version of the loop above
        addr_list = [item.strip() for item in addr_list if item.strip() != "查看地图"]
        return ''.join(addr_list)
    
    
    class LagouJobItemLoader(ItemLoader):
        default_output_processor = TakeFirst()
    
    
    class LagouJobItem(scrapy.Item):    # Lagou job posting item
        title = scrapy.Field()
        url = scrapy.Field()
        url_object_id = scrapy.Field()
        salary = scrapy.Field(
            input_processor=MapCompose(remove_splash),
        )
        job_city = scrapy.Field(
            input_processor=MapCompose(remove_splash),
        )
        work_years = scrapy.Field(
            input_processor=MapCompose(remove_splash),
        )
        degree_need = scrapy.Field(
            input_processor=MapCompose(remove_splash),
        )
        job_type = scrapy.Field()
        publish_time = scrapy.Field()
        job_advantage = scrapy.Field()
        job_desc = scrapy.Field()
        job_addr = scrapy.Field(
            input_processor=MapCompose(remove_tags, handle_jobaddr),
        )
        company_name = scrapy.Field()
        company_url = scrapy.Field()
        tags = scrapy.Field(
            input_processor=Join(",")
        )
        crawl_time = scrapy.Field()
    
        def get_insert_sql(self):
            insert_sql = """
                insert into lagou_job(title,url,url_object_id,salary,job_city,work_years,degree_need,
                job_type,publish_time,job_advantage,job_desc,job_addr,company_name,company_url,
                tags,crawl_time) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                ON DUPLICATE KEY UPDATE salary=VALUES(salary), job_desc=VALUES(job_desc)
            """
            params = (
                self["title"], self["url"], self["url_object_id"], self["salary"], self["job_city"], self["work_years"], self["degree_need"],
                self["job_type"], self["publish_time"], self["job_advantage"], self["job_desc"], self["job_addr"], self["company_name"],
                self["company_url"], self["tags"], self["crawl_time"].strftime(SQL_DATETIME_FORMAT),
            )
    
            return insert_sql, params
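
    The two cleanup functions are easy to check in isolation; a quick sanity test with made-up sample values:

    # Sanity-check the input processors with made-up sample values
    print(remove_splash("3k-6k /"))                 # -> "3k-6k"
    print(handle_jobaddr("上海 - 浦东新区\n查看地图"))   # -> "上海 - 浦东新区"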

    4. Writing pipelines.py

    # -*- coding: utf-8 -*-
    
    from twisted.enterprise import adbapi
    import MySQLdb
    import MySQLdb.cursors
    import json
    import codecs   # like open(), but handles most of the encoding work for you
    from scrapy.exporters import JsonItemExporter
    from scrapy.pipelines.images import ImagesPipeline
    
    
    class MysqlTwistedPipline(object):
    
        def __init__(self, dbpool):
            self.dbpool = dbpool
    
        @classmethod
        def from_settings(cls, settings):
            dbparms = dict(
                host=settings["MYSQL_HOST"],
                db=settings["MYSQL_DBNAME"],
                user=settings["MYSQL_USER"],
                password=settings["MYSQL_PASSWORD"],
                charset='utf8',
                cursorclass=MySQLdb.cursors.DictCursor,
                use_unicode=True,
            )
            dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)
    
            return cls(dbpool)
    
        def process_item(self, item, spider):
            # Run the MySQL insert asynchronously on Twisted's connection pool
            query = self.dbpool.runInteraction(self.do_insert, item)
            query.addErrback(self.handle_error)   # handle insert errors
            return item   # keep items flowing to any later pipelines
    
        def handle_error(self, failure):
            # Handle errors from the asynchronous insert
            print(failure)
    
        def do_insert(self, cursor, item):
            # Run the actual INSERT statement built by the item
            insert_sql, params = item.get_insert_sql()
            cursor.execute(insert_sql, params)
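
    One refinement worth considering (not in the original post) is passing the item and spider into the errback so the log shows which record failed; Twisted forwards any extra addErrback arguments after the Failure object:

    # Inside MysqlTwistedPipline -- a variant with more error context
    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error, item, spider)
        return item

    def handle_error(self, failure, item, spider):
        # Extra positional args given to addErrback arrive after the failure
        spider.logger.error("insert failed for %s: %s", item.get("url"), failure)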

    5. Editing settings.py

    ITEM_PIPELINES = {
       'ArticleSpider.pipelines.MysqlTwistedPipline': 300,
    }
    
    
    MYSQL_HOST = "127.0.0.1"
    MYSQL_DBNAME = "article_spider"
    MYSQL_USER = "root"
    MYSQL_PASSWORD = "123"
    
    SQL_DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"
    SQL_DATE_FORMAT = "%Y-%m-%d"
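
    The INSERT ... ON DUPLICATE KEY UPDATE statement in get_insert_sql() assumes a lagou_job table whose unique key is url_object_id. The original post doesn't show the schema, but a one-off setup script might look like this (column sizes are guesses):

    # One-off table setup -- a sketch; the actual DDL isn't shown in the post
    import MySQLdb

    conn = MySQLdb.connect(host="127.0.0.1", user="root", password="123",
                           db="article_spider", charset="utf8")
    cursor = conn.cursor()
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS lagou_job (
            url_object_id varchar(50) NOT NULL PRIMARY KEY,
            title varchar(100) NOT NULL,
            url varchar(300) NOT NULL,
            salary varchar(20),
            job_city varchar(20),
            work_years varchar(100),
            degree_need varchar(30),
            job_type varchar(20),
            publish_time varchar(20),
            job_advantage varchar(1000),
            job_desc longtext,
            job_addr varchar(100),
            company_name varchar(100),
            company_url varchar(300),
            tags varchar(100),
            crawl_time datetime
        ) DEFAULT CHARSET=utf8
    """)
    conn.commit()
    conn.close()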