【飞谷六期】Crawler Project 4


    After a few days of fumbling around, I finished the crawler part, mostly by imitating existing examples.

    I still don't really understand a lot of the underlying principles; much of it was copied as-is, so there is more to learn.

    Looking at the directory structure, only the .py files matter; the .pyc files are generated at runtime and can be ignored.

    items.py: defines the data you want to export

    pipelines.py: writes the exported data out

    settings.py: tells the program which components are needed to pass the data along (e.g. which pipeline to use)

    __init__.py: haven't really used it yet

    (The above is just my own tentative understanding.)
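
    For reference, this is roughly the project layout that scrapy startproject generates (the spider file name below is just an example name, not the real one; my page project also has a FilePipelines.py next to the default pipelines.py, as the settings further down show):

    zhaopin_page/
        scrapy.cfg                  # project entry point / deploy config
        zhaopin_page/
            __init__.py
            items.py
            pipelines.py            # in my case replaced by FilePipelines.py
            settings.py
            spiders/
                __init__.py
                page_spider.py      # example name for the spider module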

    In the end I scraped job listings from Zhilian Zhaopin (zhaopin.com); the points to watch out for are all in the comments.

    The key code for collecting the job links is the link spider below; all the other files are the defaults generated when the project was created.

    #encoding:utf-8
    from scrapy.spider import BaseSpider
    from scrapy.http import FormRequest, Request
    from scrapy.selector import HtmlXPathSelector
    import os
    import sys
    import datetime
    import re
    
    class ZhaoPinSpider(BaseSpider):
        name = "zhaopin"
        allowed_domains = ["zhaopin.com"]
        # adding pd=1 to the URL shows only jobs posted today; the city parameter is always fixed to 全国 (nationwide)
        zlzp_urlpatten = "http://sou.zhaopin.com/jobs/searchresult.ashx?pd=1&jl={CITY}&kw={KEYWORD}&p={CURR_PAGE}"
        
        def __init__(self):
            self.headers = {
                            'User-Agent':'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:41.0) Gecko/20100101 Firefox/41.0',
                            'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                            'Accept-Language':'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
                            'Connection':'keep-alive'
                            }   
            self.start_urls = self.set_url()
        
        # set_url builds the list of search URLs to crawl
        def set_url(self):
            url_list = []
            # the keyword list (would normally come from a config file); run one search per keyword
            keys = '大数据,hadoop,hive,hbase,spark,storm,sqoop,pig'
            for keyword in keys.split(','):
                url = self.zlzp_urlpatten
                url = url.format(CITY='全国', KEYWORD=keyword, CURR_PAGE=1)
                url_list.append(url)
            return url_list
        
        def start_requests(self): # must return an iterable containing the first Requests the spider will crawl
            for url in self.start_urls:
                yield FormRequest(url,
                                    headers = self.headers,
                                    callback = self.parse) # parse() is used as the callback
        
        def parse(self, response):
            hxs = HtmlXPathSelector(response)
            keyword = hxs.select('//div[@class="search"]//input[@name="KeyWord"]/@value').extract()[0]
            keyword = keyword.encode('utf-8')
            url = self.zlzp_urlpatten
            # find the total number of result pages
            pageInfo = hxs.select('//div[@class="pagesDown"]//button/@onclick').extract()
            if pageInfo: # note: pageInfo cannot be found when there is only one page
                pageInfo = pageInfo[0]
                pattern = re.compile('.*?value,(.*?),.*', re.S)
                findPageNum = re.search(pattern, pageInfo)
                pageNum = int(findPageNum.group(1))
            else:
                pageNum = 1
            for curPage in range(1,pageNum + 1):
                each_url = url.format(CITY='全国', KEYWORD=keyword, CURR_PAGE=curPage)
                yield Request(each_url,callback=self.get_joburls_bypage)
        
        def get_joburls_bypage(self, response):
            hxs = HtmlXPathSelector(response)
            links = hxs.select('//td[@class="zwmc"]//a/@href').extract()
            # everything found is from today, so write it out directly
            for link in links:
                if(link != 'http://e.zhaopin.com/products/1/detail.do'): # this link shows up on some pages; skip it
                    open('../output/link_output/link.txt', 'ab').write(link+'\n')
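
    To make the page-count regex above less magical: the onclick attribute of the paging button contains the total page count between the first and second commas after "value". The attribute value below is made up purely for illustration (I did not keep the real one), but it has the shape the regex expects:

    import re

    # hypothetical onclick value, only to illustrate the regex above
    pageInfo = "setPage(this.value,90,1,'');"
    pattern = re.compile('.*?value,(.*?),.*', re.S)
    findPageNum = re.search(pattern, pageInfo)
    if findPageNum:
        print findPageNum.group(1)   # prints 90, i.e. the total number of pages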

    The code for the page project, which fetches the detailed job information:

    settings

    # Scrapy settings for zhaopin_page project
    #
    # For simplicity, this file contains only the most important settings by
    # default. All the other settings are documented here:
    #
    #     http://doc.scrapy.org/topics/settings.html
    #
    
    BOT_NAME = 'zhaopin_page'
    BOT_VERSION = '1.0'
    
    SPIDER_MODULES = ['zhaopin_page.spiders']
    NEWSPIDER_MODULE = 'zhaopin_page.spiders'
    USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION)
    ITEM_PIPELINES = {'zhaopin_page.FilePipelines.PagePipeline':5}

    FilePipelines

    # encoding: utf-8
    import traceback
    import datetime
    import sys
    reload(sys)
    sys.setdefaultencoding( "utf-8" )  # Python 2 hack: make utf-8 the default encoding
    sys.path.append("../../../")
    
    class PagePipeline(object):
        
        # write the parsed item out to a text file, one file per job posting
        def process_item(self, item, spider):
            fname =  '../output/page_output/' + item['file_id'] + '.txt'
            try:
                # join all fields with the separator and append a crawl timestamp at the end
                fields = [item['web_id'], item['job_url'], item['job_name'], item['job_location'],
                          item['job_desc'], item['edu'], item['gender'], item['language'],
                          item['major'], item['work_years'], item['salary'], item['company_name'],
                          item['company_desc'], item['company_address'], item['company_worktype'],
                          item['company_scale'], item['company_prop'], item['company_website'],
                          self.getCurrentTimestamp()]
                outfile = open(fname, 'wb')
                outfile.write(self.getJobFieldSpt().join(fields))
                outfile.close()
            except Exception as e:
                print "ERROR GEN FILE!! >>> " + fname
                print traceback.format_exc()
    
        def getCurrentTimestamp(self):
            # return the current timestamp as a formatted string
            return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    
        def getJobFieldSpt(self):
            # field separator between fields in the generated job files: ASCII code 1 (chr(1)), the same as Hive's default field delimiter
            return chr(1)
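
    Since chr(1) is an invisible character, the generated files are hard to eyeball. A small sketch (not part of the project code) to sanity-check one output file; the file name is just an example:

    # split one generated file on the Hive default delimiter (\x01 == chr(1))
    record = open('../output/page_output/123456.txt', 'rb').read()
    fields = record.split(chr(1))
    print len(fields)      # should be 19: 18 item fields plus the timestamp
    for f in fields:
        print f[:50]       # first 50 characters of each field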

    items

    # encoding: utf-8
    
    from scrapy.item import Item, Field
    
    # Item class holding the content of one job posting
    class PageItem(Item):
        # site identifier
        web_id = Field()
        # generated file name
        file_id = Field()
        # source URL of the job posting
        job_url = Field()
        # job title
        job_name = Field()
        # work location
        job_location = Field()
        # job description
        job_desc = Field()
        # education requirement
        edu = Field()
        # gender requirement
        gender = Field()
        # language requirement
        language = Field()
        # major (field of study) requirement
        major = Field()
        # years of work experience
        work_years = Field()
        # salary range
        salary = Field()
        # posting date
        job_datetime = Field()
        # company name
        company_name = Field()
        # company introduction
        company_desc = Field()
        # company address
        company_address = Field()
        # industry
        company_worktype = Field()
        # company size
        company_scale = Field()
        # company ownership type
        company_prop = Field()
        # company website
        company_website = Field()
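
    One thing worth knowing about Scrapy Items: they behave like dicts, but only the fields declared above can be set, so a typo in a field name fails immediately instead of silently creating a new key. A quick illustration (not part of the project code):

    from zhaopin_page.items import PageItem

    item = PageItem()
    item['job_name'] = 'hadoop engineer'   # fine, job_name is a declared Field
    print item['job_name']
    try:
        item['job_title'] = 'x'            # raises KeyError: job_title is not declared
    except KeyError as e:
        print 'undeclared field:', e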

    spider

    # encoding: utf-8
    
    from scrapy.spider import BaseSpider
    from scrapy.http import FormRequest, Request
    from scrapy.selector import HtmlXPathSelector
    from zhaopin_page import items
    import traceback
    import sys
    import datetime
    import re
    
    # Spider that crawls the individual job detail pages
    class ZhaoPinPageSpider(BaseSpider):
        name = "page"    
        start_urls = []
        
        def __init__(self):        
            self.start_urls = self.set_url()
        
        # read the list of links to crawl from link.txt into an array
        def set_url(self):
            url_list = []
            link_file = open('../output/link_output/link.txt', 'r')
            loops = 0
            for each_link in link_file:
                # strip carriage returns and line feeds
                each_link = each_link.replace('\r', '')
                each_link = each_link.replace('\n', '')
                url_list.append(each_link)
                loops+=1
                if (loops == 100):   # only take the first 100 links per run
                    break
            link_file.close()
            return url_list
        
        def parse(self, response):
            try:
                # the numeric id at the end of the URL (used as the output file name)
                file_id = response.url.split("/")[-1].split(".")[0]
                hxs = HtmlXPathSelector(response)
                
                # grab the content of the top banner
                title = ''  # job title
                companyName = '' # company name
                basicInfo = hxs.select('//div[@class="fixed-inner-box"]').extract()[0] # there are two of these on the page; the second is a clone, so take the first
                pattern = re.compile('.*?<h1>(.*?)</h1>.*?<a.*?>(.*?)</a>.*?', re.S)
                findBasicInfo = re.search(pattern, basicInfo)
                if findBasicInfo:
                    title = findBasicInfo.group(1).strip()  # job title
                    companyName = findBasicInfo.group(2).strip() # company name
                
                
                # grab the basic company info in the left column; a single regex is not reliable here because some pages have incomplete info, e.g. http://jobs.zhaopin.com/297851037250005.htm
                companySize = ''  # company size
                companyType = ''  # company ownership type
                companyLine = ''  # industry
                companyHost = ''  # company homepage
                companyAddress = ''  # company address
                companyInfo = hxs.select('//div[@class="company-box"]').extract()[0].encode('utf-8') # extract() returns a list even though there is only one match, so take [0]
                if(companyInfo.find('公司规模:')>-1):
                    companySize = companyInfo.split('公司规模:</span>')[1]
                    companySize = companySize.split('<strong>')[1]
                    companySize = companySize.split('</strong>')[0].strip()
                if(companyInfo.find('公司性质:')>-1):
                    companyType = companyInfo.split('公司性质:</span>')[1]
                    companyType = companyType.split('<strong>')[1]
                    companyType = companyType.split('</strong>')[0].strip()
                if(companyInfo.find('公司行业:')>-1):
                    companyLine = companyInfo.split('公司行业:</span>')[1]
                    companyLine = companyLine.split('<strong>')[1]
                    companyLine = companyLine.split('</a>')[0]
                    companyLine = companyLine.split('>')[1].strip()
                if(companyInfo.find('公司主页:')>-1):
                    companyHost = companyInfo.split('公司主页:</span>')[1]
                    companyHost = companyHost.split('<strong>')[1]
                    companyHost = companyHost.split('</a>')[0]
                    companyHost = companyHost.split('>')[1].strip()
                if(companyInfo.find('公司地址:')>-1):
                    companyAddress = companyInfo.split('公司地址:</span>')[1]
                    companyAddress = companyAddress.split('<strong>')[1]
                    companyAddress = companyAddress.split('</strong>')[0].strip()
                
                
                # grab the job requirements block in the middle of the page
                salary = '' # monthly salary; these variables must be initialized first, otherwise the code below can fail when a field is missing
                address = '' # work location
                jobDateTime = '' # posting date
                jobCategory = '' # job nature (full-time etc.)
                experience = '' # work experience
                education = '' # minimum education
                numberInNeed = '' # number of openings
                jobType = '' # job category
                jobRequirementInfo = hxs.select('/html/body/div[4]/div[1]/ul').extract()[0]
                pattern = re.compile('.*?<strong>(.*?)</strong>\n.*?<strong>.*?<a.*?>(.*?)</a>\n.*?<strong>.*?<span.*?>(.*?)</span>\n.*?<strong>(.*?)</strong>\n.*?<strong>(.*?)</strong>\n.*?<strong>(.*?)</strong>\n.*?<strong>(.*?)</strong>\n.*?<strong>.*?target.*?>(.*?)</a>',re.S) # no spaces or tabs are allowed before each part, otherwise the pattern won't match
                findJobRequirementInfo = re.search(pattern, jobRequirementInfo)
                if findJobRequirementInfo:
                    salary = findJobRequirementInfo.group(1).strip() # monthly salary
                    address = findJobRequirementInfo.group(2).strip() # work location
                    jobDateTime = findJobRequirementInfo.group(3).strip() # posting date
                    jobCategory = findJobRequirementInfo.group(4).strip() # job nature
                    experience = findJobRequirementInfo.group(5).strip() # work experience
                    education = findJobRequirementInfo.group(6).strip() # minimum education
                    numberInNeed = findJobRequirementInfo.group(7).strip() # number of openings
                    jobType = findJobRequirementInfo.group(8).strip() # job category
                
                # grab the two description blocks (job description and company introduction)
                detailInfo = hxs.select('//div[@class="tab-inner-cont"]').extract()
                jobDescribe = detailInfo[0]
                companyDescribe = detailInfo[1]
                
                
                pattern = re.compile('<.*?>|&nbsp',re.S)  # strip HTML tags and &nbsp entities
                jobDescribe = re.sub(pattern,'',jobDescribe).strip()  # job description
                companyDescribe = re.sub(pattern,'',companyDescribe).strip()  # company introduction
                companySize = re.sub(pattern,'',companySize).strip()
                companyType = re.sub(pattern,'',companyType).strip()
                companyLine = re.sub(pattern,'',companyLine).strip()
                companyHost = re.sub(pattern,'',companyHost).strip()
                companyAddress = re.sub(pattern,'',companyAddress).strip() 
                salary = re.sub(pattern,'',salary).strip()
                address = re.sub(pattern,'',address).strip()
                jobDateTime = re.sub(pattern,'',jobDateTime).strip()
                jobCategory = re.sub(pattern,'',jobCategory).strip()
                experience = re.sub(pattern,'',experience).strip()
                education = re.sub(pattern,'',education).strip()
                numberInNeed = re.sub(pattern,'',numberInNeed).strip()
                jobType = re.sub(pattern,'',jobType).strip()
                title = re.sub(pattern,'',title).strip()
                companyName = re.sub(pattern,'',companyName).strip()
                
                
                data = items.PageItem()
                data['web_id'] = "zhaopin"
                data['file_id'] = file_id
                data['job_url'] = response.url
                data['job_name'] = title
                data['job_desc'] = jobDescribe
                data['gender'] = ""
                data['major'] = ""
                data['company_name'] = companyName
                data['job_datetime'] = jobDateTime
                data['job_location'] = address
                data['work_years'] = experience
                data['edu'] = education
                data['salary'] = salary
                data['company_desc'] = companyDescribe
                data['company_address'] = companyAddress
                data['company_website'] = companyHost
                data['language'] = ""
                data['company_worktype'] = companyLine
                data['company_prop'] = companyType
                data['company_scale'] = companySize
                
                # update the crawl status in the task table (disabled in this version)
                #self.jobsTool.updateCrulInfo(ConfigPropObj.liepin_webid, response.url, 1, "")
                return data
            except Exception as e:
                print "ERROR PARSE"
                print response.url
                print traceback.format_exc()