• scrapy -- BOSS Zhipin (boss直聘)


      Hi everyone. It has been a while since my last Scrapy crawling example. A couple of days ago a colleague mentioned that sites such as Lagou and BOSS Zhipin are hard to scrape, so yesterday afternoon I started crawling the Python-crawler job listings on BOSS Zhipin. It turned out to be much simpler than I expected.

    Problems to solve:

      Most of the content on BOSS Zhipin is loaded statically, with a small amount loaded dynamically.

      1. Statically loaded: the company details and the job description (Figure 1_1)

      2. Dynamically loaded: the search box on the home page, where we search for "python爬虫" (Figure 1_2)

    Approach:

      1. Static content: regular Scrapy extraction (easy)

      2. Dynamic content: drive the search box with Selenium, then hand the resulting URL back to Scrapy (easy)

                      Figure (1_1): job-detail page (statically loaded)

                      Figure (1_2): home-page search box (dynamically loaded)

     As usual, here are screenshots of the crawl results; feel free to give it a try yourselves:

    (3) The main part

    3_1. Fields to extract: items.py

    import scrapy
    
    class BossItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        #pass
        job_title = scrapy.Field()
        salary    = scrapy.Field()
        address   = scrapy.Field()
        job_time  = scrapy.Field()
        education = scrapy.Field()
        company   = scrapy.Field()
        company_info= scrapy.Field()
        detail_text = scrapy.Field()
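
    For reference, the fields above line up with what the spider extracts in 3_5 and with FEED_EXPORT_FIELDS in 3_4. A quick illustration (the values here are made up, not scraped): an item is filled like a dict, and converting it back to a plain dict is exactly what the MongoDB pipeline in 3_3 does before inserting it.

    from Boss.items import BossItem

    item = BossItem()
    item['job_title'] = u'python爬虫工程师'   # hypothetical sample value
    item['salary']    = u'15k-25k'            # hypothetical sample value
    post = dict(item)                         # what MongoDBPipeline stores
    print(post)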

    3_2. Setting the proxy: middlewares.py

    from scrapy import signals


    class BossSpiderMiddleware(object):
        # Not all methods need to be defined. If a method is not defined,
        # scrapy acts as if the spider middleware does not modify the
        # passed objects.
        # Note: despite its name, this class is enabled under
        # DOWNLOADER_MIDDLEWARES in settings.py so that process_request()
        # below can attach the proxy to every outgoing request.

        def __init__(self, ip=''):
            self.ip = ip

        def process_request(self, request, spider):
            # route every request through the (hard-coded) HTTP proxy
            print('http://10.240.252.16:911')
            request.meta['proxy'] = 'http://10.240.252.16:911'
    
        @classmethod
        def from_crawler(cls, crawler):
            # This method is used by Scrapy to create your spiders.
            s = cls()
            crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
            return s
    
        def process_spider_input(self, response, spider):
            # Called for each response that goes through the spider
            # middleware and into the spider.
    
            # Should return None or raise an exception.
            return None
    
        def process_spider_output(self, response, result, spider):
            # Called with the results returned from the Spider, after
            # it has processed the response.
    
            # Must return an iterable of Request, dict or Item objects.
            for i in result:
                yield i
    
        def process_spider_exception(self, response, exception, spider):
            # Called when a spider or process_spider_input() method
            # (from other spider middleware) raises an exception.
    
            # Should return either None or an iterable of Response, dict
            # or Item objects.
            pass
    
        def process_start_requests(self, start_requests, spider):
            # Called with the start requests of the spider, and works
            # similarly to the process_spider_output() method, except
            # that it doesn’t have a response associated.
    
            # Must return only requests (not items).
            for r in start_requests:
                yield r
    
        def spider_opened(self, spider):
            spider.logger.info('Spider opened: %s' % spider.name)
    
    
    class BossDownloaderMiddleware(object):
        # Not all methods need to be defined. If a method is not defined,
        # scrapy acts as if the downloader middleware does not modify the
        # passed objects.
    
        @classmethod
        def from_crawler(cls, crawler):
            # This method is used by Scrapy to create your spiders.
            s = cls()
            crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
            return s
    
        def process_request(self, request, spider):
            # Called for each request that goes through the downloader
            # middleware.
    
            # Must either:
            # - return None: continue processing this request
            # - or return a Response object
            # - or return a Request object
            # - or raise IgnoreRequest: process_exception() methods of
            #   installed downloader middleware will be called
            return None
    
        def process_response(self, request, response, spider):
            # Called with the response returned from the downloader.
    
            # Must either;
            # - return a Response object
            # - return a Request object
            # - or raise IgnoreRequest
            return response
    
        def process_exception(self, request, exception, spider):
            # Called when a download handler or a process_request()
            # (from other downloader middleware) raises an exception.
    
            # Must either:
            # - return None: continue processing this exception
            # - return a Response object: stops process_exception() chain
            # - return a Request object: stops process_exception() chain
            pass
    
        def spider_opened(self, spider):
            spider.logger.info('Spider opened: %s' % spider.name)
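
    The proxy above is hard-coded. If you have several proxies available, a small variant of the same idea picks one at random per request. This is only a sketch and is not part of the original project; the addresses in PROXY_POOL are placeholders for proxies you actually control, and the class would be registered under DOWNLOADER_MIDDLEWARES just like BossSpiderMiddleware.

    import random

    PROXY_POOL = [                      # placeholder addresses
        'http://10.240.252.16:911',
        'http://10.240.252.17:911',
    ]

    class RandomProxyMiddleware(object):
        def process_request(self, request, spider):
            # attach a randomly chosen proxy to every outgoing request
            request.meta['proxy'] = random.choice(PROXY_POOL)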

    3_3. Saving the data to MongoDB: pipelines.py

    import scrapy
    import pymongo
    from scrapy.item import Item
    
    class BossPipeline(object):
        def process_item(self, item, spider):
            return item
    
    class MongoDBPipeline(object):    # store scraped items in MongoDB
        @classmethod
        def from_crawler(cls,crawler):
            cls.DB_URL = crawler.settings.get("MONGO_DB_URL",'mongodb://localhost:27017/')
            cls.DB_NAME = crawler.settings.get("MONGO_DB_NAME",'scrapy_data')
            return cls()
    
        def open_spider(self,spider):
            self.client = pymongo.MongoClient(self.DB_URL)
            self.db     = self.client[self.DB_NAME]
    
        def close_spider(self,spider):
            self.client.close()
    
        def process_item(self, item, spider):
            collection = self.db[spider.name]
            post = dict(item) if isinstance(item, Item) else item
            collection.insert_one(post)   # insert() is deprecated in pymongo 3+
    
            return item
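
    After a crawl you can check what actually landed in MongoDB from a plain Python shell. A minimal sketch, assuming the MONGO_DB_URL and MONGO_DB_NAME values from the settings below and the spider name boss (the pipeline uses spider.name as the collection name):

    import pymongo

    client = pymongo.MongoClient('mongodb://localhost:27017/')   # MONGO_DB_URL
    db = client['boss_detail']                                   # MONGO_DB_NAME
    collection = db['boss']                                      # named after the spider

    print(collection.count_documents({}))        # how many postings were stored
    for doc in collection.find().limit(3):       # peek at a few documents
        print(doc['job_title'], doc['salary'])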

    3_4.settings.py

    MONGO_DB_URL = 'mongodb://localhost:27017/'
    MONGO_DB_NAME = 'boss_detail'
    
    USER_AGENT ={       # pool of browser User-Agent strings; the spider picks one at random
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    }
    
    FEED_EXPORT_FIELDS = ['job_title','salary','address','job_time','education','company','company_info']
    
    # Obey robots.txt rules
    ROBOTSTXT_OBEY = False
    
    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    CONCURRENT_REQUESTS = 10
    
    # See also autothrottle settings and docs
    DOWNLOAD_DELAY = 0.5
    
    # Disable cookies (enabled by default)
    COOKIES_ENABLED = False
    
    DOWNLOADER_MIDDLEWARES = {
        #'Boss.middlewares.BossDownloaderMiddleware': 543,
        # the scrapy.contrib.* path is deprecated; use the current module path
        'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 543,
        'Boss.middlewares.BossSpiderMiddleware': 123,   # sets request.meta['proxy']
    }
    
    ITEM_PIPELINES = {
        # HttpProxyMiddleware is a downloader middleware, not an item pipeline,
        # so it does not belong here.
        'Boss.pipelines.MongoDBPipeline': 300,
    }
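
    The USER_AGENT pool above is imported directly by the spider in 3_5, which picks one string from it for its headers. If you would rather rotate the User-Agent on every request, one option (a sketch, not part of the original project) is a tiny extra downloader middleware added to middlewares.py and registered in DOWNLOADER_MIDDLEWARES:

    import random
    from Boss.settings import USER_AGENT

    class RandomUserAgentMiddleware(object):
        def process_request(self, request, spider):
            # pick a different browser identity for each request
            request.headers['User-Agent'] = random.choice(list(USER_AGENT))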

    3_5. spiders/boss.py

    #-*- coding:utf-8 -*-
    import random
    import re
    import time

    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from selenium import webdriver

    from Boss.items import BossItem
    from Boss.settings import USER_AGENT

    # a single shared Chrome instance, used only to drive the home-page search
    driver = webdriver.Chrome()
    
    class BossSpider(scrapy.Spider):
        name = 'boss'
        allowed_domains = ['www.zhipin.com']
        start_urls = ['http://www.zhipin.com/']
    
        headers = {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Content-Length': '11',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Host': 'www.zhipin.com',
            'Origin': 'www.zhipin.com',
            'Referer': 'http://www.zhipin.com/',
            'User-Agent': random.choice(list(USER_AGENT)),   # USER_AGENT in settings.py is a set of UA strings
            'X-Requested-With': 'XMLHttpRequest',
        }
    
        def start_requests(self):
            driver.get(
                self.start_urls[0]
                )
            time.sleep(3)
    
            # type "python爬虫" into the search box and submit the query
            driver.find_element_by_name('query').send_keys(u'python爬虫')
            time.sleep(3)
            driver.find_element_by_class_name('btn-search').click()
            time.sleep(3)

            new_url = driver.current_url   # the URL of the result page after the redirect
            yield scrapy.Request(new_url)
    
        def parse(self, response):
            # extract links to the individual job-detail pages
            links = LinkExtractor(restrict_css="div.info-primary>h3>a")
            link = links.extract_links(response)
            for each_link in link:
                yield scrapy.Request(each_link.url,callback=self.job_detail)
    
            #sels = LinkExtractor(restrict_css='div.page')
            #yield scrapy.Request(sels.extract_links(response)[0].url,callback=self.parse)
    
        def job_detail(self, response):
            spiderItem = BossItem()
            # the fields we want to extract
            spiderItem['job_title']     = response.css('div.job-primary.detail-box div.name h1::text').extract()[0]
            salar  = response.css('div.job-primary.detail-box span.badge ::text').extract()[0]
            spiderItem['salary']        = re.findall(r'(\d.*?)\n', salar)[0]  # pull just the salary figure out of the badge text
            spiderItem['address']       = response.css('div.job-primary.detail-box p::text').extract()[0]
            spiderItem['job_time']      = response.css('div.job-primary.detail-box p::text').extract()[1]
            spiderItem['education']     = response.css('div.job-primary.detail-box p::text').extract()[2]
            spiderItem['company']       = response.css('div.job-primary.detail-box div.info-company h3.name a::text').extract()[0]
            spiderItem['company_info']  = response.css('div.job-primary.detail-box div.info-company>p::text').extract()[0]

            detail = response.css('div.job-sec div.text ::text').extract()
            details = ''.join(detail).replace(' ', '')  # join the text fragments into one string and drop the spaces
            spiderItem['detail_text']   = details

            print(spiderItem)
            yield spiderItem
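
    One loose end: the module-level Chrome instance is never shut down. A small addition you may want (a sketch; closed() is a hook Scrapy calls when the spider finishes) quits the browser once the crawl is done, and the project is then run as usual with "scrapy crawl boss":

        def closed(self, reason):
            # quit the shared Chrome instance once the crawl is finished
            driver.quit()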
  • Original article: https://www.cnblogs.com/eilinge/p/9810038.html