import logging

import scrapy

from jobspider.items import JobspiderItem


def _stripped(text):
    """Return *text* without surrounding whitespace; pass ``None`` through."""
    return text.strip() if text is not None else None


class JobSpider(scrapy.Spider):
    """Scrape job listings from a Zhaopin search-result page.

    The start URL searches for the keyword ``java``. Every result table on the
    page is turned into one ``JobspiderItem`` carrying the job name, company
    name, salary and workplace.
    """

    name = "job_spider"
    start_urls = [
        "http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E9%80%89%E6%8B%A9%E5%9C%B0%E5%8C%BA&kw=java&isadv=0&sg=df4b40a6bfaf49c08ef0cb9e8e2181f2&p=1"
    ]

    def parse(self, response):
        """Yield one ``JobspiderItem`` per job table found in *response*.

        Text values are stripped of surrounding whitespace. ``companyname``,
        ``salary`` and ``workplace`` are ``None`` when the corresponding cell
        has no matching text node; ``jobname`` is ``''`` in that case.
        """
        jobs = response.xpath(
            '//div[@id="newlist_list_content_table"]/table[@class="newlist"]'
        )
        # The first matching table is deliberately skipped.
        # NOTE(review): presumably it is the column-header table -- confirm
        # against the live page.
        for job in jobs[1:]:
            item = JobspiderItem()
            # `//text()` gathers the text nodes of every <a> in the cell,
            # including whitespace-only nodes, so the joined title is stripped.
            item['jobname'] = ''.join(
                job.xpath('.//td[@class="zwmc"]/div/a//text()').extract()
            ).strip()
            item['companyname'] = _stripped(
                job.xpath('.//td[@class="gsmc"]/a/text()').extract_first()
            )
            item['salary'] = _stripped(
                job.xpath('.//td[@class="zwyx"]/text()').extract_first()
            )
            item['workplace'] = _stripped(
                job.xpath('.//td[@class="gzdd"]/text()').extract_first()
            )
            yield item
爬取智联招聘。
智联 HTML 结构:
<div class="newlist_list_content" id="newlist_list_content_table"> <table class="newlist" width="853" cellspacing="0" cellpadding="0"> <tr> <td class="zwmc" style=" 250px;"> <input name="vacancyid" data-monitor="CZ751712970J00017764214|3" value="CZ751712970J00017764214_719_1_03_409__1_" onclick="zlapply.uncheckAll('allvacancyid')" type="checkbox"> <div style=" 224px;* 218px; _200px; float: left"> <a style="font-weight: bold" par="ssidkey=y&ss=409&ff=03&sg=df4b40a6bfaf49c08ef0cb9e8e2181f2&so=3" href="http://jobs.zhaopin.com/CZ751712970J00017764214.htm" target="_blank"><b>java</b>开发工程师 </a><a href="http://e.zhaopin.com/products/1/detail.do" target="_blank" title="点击“顶”字,了解更多"><img src="/assets/images/top.png" border="0" align="absmiddle"> <img src="/assets/images/jp.gif" border="0" align="absmiddle"></a> </div> </td> <td style=" 60px;" class="fk_lv"><span>64%</span></td> <td class="gsmc"><a href="http://company.zhaopin.com/CZ751712970.htm" target="_blank">北京中科网联信息技术研究院(有限合伙)</a> <a href="http://company.zhaopin.com/CZ751712970.htm" target="_blank" style="vertical-align: top;"><img src="//img03.zhaopin.cn/IHRNB/img/souvip1002.png" alt="1002" class="icon_vip" border="0" align="absmiddle"></a></td> <td class="zwyx">4001-6000</td> <td class="gzdd">郑州</td> <td class="gxsj"><span>置顶</span><a class="newlist_list_xlbtn" href="javascript:;"></a></td> </tr> <tr style="display: none" class="newlist_tr_detail"> <td style="line-height: 0;" colspan="6" width="833px"> <div class="newlist_detail"> <div class="clearfix"> <ul> <li class="newlist_deatil_two"><span>地点:郑州</span><span>公司性质:民营</span><span>经验:1-3年</span><span>学历:不限</span><span>职位月薪:4001-6000元/月</span></li><li class="newlist_deatil_last">...<b>Java</b>开发经验,熟悉J2EE体系结构,并能熟悉掌握SSH等开源框架; 3. 能熟练掌握和开发Web Service、SOAP、Socket、NIO等开发技术,对http、tcp、udp协议有一定的了解; 4. 
精通Ajax、<b>Java</b>Script、HTML5等前...</li> </ul> <dl> <dt> <a href="javascript:zlapply.searchjob.ajaxApplyBrig1('CZ751712970J00017764214_719','ssi','_1_03_409__2_');searchMonitor.logSingleApplyData('CZ751712970J00017764214|3');"> <img src="/assets/images/newlist_sqimg_03.jpg"> </a> </dt> <dd><a href="javascript:zlapply.searchjob.saveOne('CZ751712970J00017764214_719');"><img src="/assets/images/newlist_scimg_06.jpg"></a></dd> </dl> </div> </div> </td></tr> </table> </div>