• Scraping 51job job listings with Python's Scrapy


    Today our instructor covered Scrapy, Python's crawler framework, and walked us through a small crawler that scrapes job listings from 51job and saves them to a database.

    Environment: Python 3.6 with the PyCharm editor.
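
    Before any of this runs, the Scrapy scaffolding has to exist. As a sketch (the project name jobspiders is my assumption, inferred from the relative import `from ..items import JobspidersItem` below), the usual CLI steps are:

        scrapy startproject jobspiders
        cd jobspiders
        scrapy genspider jobsspider search.51job.com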

    The spider itself:

    import scrapy
    from ..items import JobspidersItem
    
    class JobsspiderSpider(scrapy.Spider):
        name = 'jobsspider'
        # The original had a full URL here, which breaks Scrapy's offsite
        # filter; allowed_domains should hold the bare domain only.
        allowed_domains = ['search.51job.com']
        start_urls = [
            'https://search.51job.com/list/010000,000000,0000,01,9,99,python,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=']
    
        def parse(self, response):
            # Each <div class="el"> in the result table is one job row.
            # A shorter equivalent: response.xpath('//div[@class="el"]')
            currentPageItems = response.xpath('/html/body/div[@class="dw_wp"]/div[@class="dw_table"]/div[@class="el"]')
            print(currentPageItems)
    
            for jobItem in currentPageItems:
                print('----', jobItem)
                jobspidersItem = JobspidersItem()
    
                # Note the trailing space in "t1 ": it matches the class
                # attribute exactly as it appears in 51job's page source.
                jobPosition = jobItem.xpath('p[@class="t1 "]/span/a/text()').extract()
                if jobPosition:
                    jobspidersItem['jobPosition'] = jobPosition[0].strip()
    
                jobCompany = jobItem.xpath('span[@class="t2"]/a/text()').extract()
                if jobCompany:
                    jobspidersItem['jobCompany'] = jobCompany[0].strip()
    
                jobArea = jobItem.xpath('span[@class="t3"]/text()').extract()
                if jobArea:
                    jobspidersItem['jobArea'] = jobArea[0].strip()
    
                jobSale = jobItem.xpath('span[@class="t4"]/text()').extract()
                if jobSale:
                    jobspidersItem['jobSale'] = jobSale[0].strip()
    
                jobDate = jobItem.xpath('span[@class="t5"]/text()').extract()
                if jobDate:
                    jobspidersItem['jobDate'] = jobDate[0].strip()
    
                yield jobspidersItem  # hand the item off to the item pipelines
    
            # Grab the next-page link; the last li.bk anchor is "next page".
            nextPageURL = response.xpath('//li[@class="bk"]/a/@href').extract()
            print(nextPageURL)
            if nextPageURL:
                url = response.urljoin(nextPageURL[-1])
                print('url', url)
                # Request the next page and parse it with this same method.
                # dont_filter=False is the default: duplicate URLs get skipped.
                yield scrapy.Request(url, callback=self.parse, dont_filter=False)
            else:
                print('Done - no next page')
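
    A handy way to sanity-check these XPath expressions before a full crawl is Scrapy's interactive shell; a minimal session against the first start_urls page might look like this:

        scrapy shell "https://search.51job.com/list/010000,000000,0000,01,9,99,python,2,1.html"
        >>> response.xpath('//div[@class="el"]')
        >>> response.xpath('//li[@class="bk"]/a/@href').extract()

    The spider itself is then started from the project root with scrapy crawl jobsspider.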

    items.py: declare the five item fields

    import scrapy
    
    
    class JobspidersItem(scrapy.Item):
        # define the fields for your item here like:
        jobPosition = scrapy.Field()  # job title
        jobCompany = scrapy.Field()   # company name
        jobArea = scrapy.Field()      # work location
        jobSale = scrapy.Field()      # salary
        jobDate = scrapy.Field()      # posting date
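
    A scrapy.Item behaves like a dictionary restricted to its declared fields, which is why the spider assigns by key. A quick illustration:

        item = JobspidersItem()
        item['jobPosition'] = 'Python developer'  # OK: declared field
        print(dict(item))               # {'jobPosition': 'Python developer'}
        # item['salary'] = '15k'        # would raise KeyError: not declared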

    pipelines.py: the console output pipeline

    class JobspidersPipeline(object):
        def process_item(self, item, spider):
            print('Position:', item['jobPosition'])
            print('Company:', item['jobCompany'])
            print('Location:', item['jobArea'])
            print('Salary:', item['jobSale'])
            print('Posted:', item['jobDate'])
            print('----------------------------')
            return item
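
    Neither pipeline does anything until it is registered in settings.py. A minimal sketch, assuming the project package is named jobspiders and the MySQL pipeline class keeps the name JobspidersPipeline inside pipelinesmysql.py (lower numbers run earlier):

        ITEM_PIPELINES = {
            'jobspiders.pipelines.JobspidersPipeline': 300,
            'jobspiders.pipelinesmysql.JobspidersPipeline': 400,
        }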

    pipelinesmysql.py: write the data to MySQL. The first line imports a database helper class we wrapped up in an earlier lesson.

    from week5_day04.dbutil import dbutil
    
    # Homework: a custom pipeline that saves the complete scraped data to MySQL
    class JobspidersPipeline(object):
        def process_item(self, item, spider):
            dbu = dbutil.MYSQLdbUtil()
            dbu.getConnection()  # open the connection and start a transaction
    
            try:
                # Insert one row per item; the t_job columns mirror the item fields
                sql = "insert into t_job (jobname,jobcompany,jobarea,jobsale,jobdata) values (%s,%s,%s,%s,%s)"
                dbu.execute(sql, (item['jobPosition'], item['jobCompany'], item['jobArea'], item['jobSale'], item['jobDate']), True)
                dbu.commit()
                print('Inserted into the database!')
            except Exception:
                dbu.rollback()
                dbu.commit()  # this helper class expects a commit after the rollback
            finally:
                dbu.close()
            return item
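
    The dbutil helper is course-specific, so for readers without it, here is a rough equivalent written directly against the pymysql driver. The table and column names are copied from the SQL above; the connection credentials are placeholders, and the open_spider/close_spider hooks let one connection serve the whole crawl:

        import pymysql
    
        class MySQLJobsPipeline(object):
            """Hypothetical stand-in for the dbutil-based pipeline above."""
    
            def open_spider(self, spider):
                # Placeholder credentials -- replace with your own.
                self.conn = pymysql.connect(host='localhost', user='root',
                                            password='secret', database='jobs',
                                            charset='utf8mb4')
    
            def process_item(self, item, spider):
                sql = ("insert into t_job (jobname,jobcompany,jobarea,jobsale,jobdata) "
                       "values (%s,%s,%s,%s,%s)")
                try:
                    with self.conn.cursor() as cursor:
                        cursor.execute(sql, (item.get('jobPosition'), item.get('jobCompany'),
                                             item.get('jobArea'), item.get('jobSale'),
                                             item.get('jobDate')))
                    self.conn.commit()
                except Exception:
                    self.conn.rollback()
                return item
    
            def close_spider(self, spider):
                self.conn.close()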

    Final result: the scraped items print to the console page by page and land as rows in the t_job table.

    This most basic 51job crawler is a good entry point into learning the Scrapy framework; it is a genuinely handy tool.
