# -*- coding: utf-8 -*-
import scrapy
from ..items import JobscrawlerQianchengwuyouItem
import datetime


class QianchengSpiderSpider(scrapy.Spider):
    """Crawl 51job search results for a fixed set of keywords.

    Each entry of ``start_urls`` is a search-result page for one keyword; the
    entry at the same index in ``start_url_tags`` is the label stored on every
    item scraped from that search.  Listing pages are walked through their
    "next page" link and every job posting is fetched and parsed by
    ``detail_parse``.
    """

    name = 'qiancheng_spider'
    # allowed_domains = ['qq.com']
    # NOTE(review): the query strings originally read ``cotype=99°reefrom=99``,
    # which looks like ``&degreefrom`` with ``&deg`` decoded as an HTML entity;
    # the parameter separator has been restored.  Confirm against the site.
    start_urls = [
        # Data analyst
        'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%2595%25B0%25E6%258D%25AE%25E5%2588%2586%25E6%259E%2590%25E5%25B8%2588,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
        # Artificial intelligence
        'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E4%25BA%25BA%25E5%25B7%25A5%25E6%2599%25BA%25E8%2583%25BD,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
        # Algorithm engineer
        'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E7%25AE%2597%25E6%25B3%2595%25E5%25B7%25A5%25E7%25A8%258B%25E5%25B8%2588,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
        # Deep learning
        'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%25B7%25B1%25E5%25BA%25A6%25E5%25AD%25A6%25E4%25B9%25A0,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
        # Data mining
        'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%2595%25B0%25E6%258D%25AE%25E6%258C%2596%25E6%258E%2598,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
        # Machine learning
        'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%259C%25BA%25E5%2599%25A8%25E5%25AD%25A6%25E4%25B9%25A0,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
    ]
    # Labels paired by position with ``start_urls``.
    start_url_tags = [
        "数据分析师",
        "人工智能",
        "算法工程师",
        "深度学习",
        "数据挖掘",
        "机器学习",
    ]

    def __init__(self, *args, **kwargs):
        """Initialise the spider and remember today's date for every item.

        Positional and keyword arguments are forwarded to ``scrapy.Spider`` so
        that spider arguments supplied by the framework keep working.
        """
        super().__init__(*args, **kwargs)
        # Local date (YYYY-MM-DD) at spider start-up, stamped on each item.
        self.record_date = datetime.datetime.now().strftime('%Y-%m-%d')

    def start_requests(self):
        """Yield one request per start URL, carrying its tag in ``meta``."""
        for url, tag in zip(self.start_urls, self.start_url_tags):
            yield scrapy.Request(
                url,
                callback=self.parse,
                meta={'tag': tag},
                dont_filter=True,
            )

    def parse(self, response):
        """Parse a search-result page.

        Yields a request for every job posting found in the listing and, when
        a "next page" link is present, a request for the following listing
        page with the same tag.
        """
        tag = response.meta['tag']
        for row in response.xpath('//div[@class="el"]'):
            # Rows without the title cell are not job entries.
            if not row.xpath('./p[@class="t1 "]'):
                continue
            url = row.xpath('./p[@class="t1 "]//a/@href').extract_first()
            title = row.xpath('./p[@class="t1 "]//a/text()').extract_first()
            if url is None:
                # No link to follow; building a Request from None would raise
                # and abort the rest of this page, including pagination.
                continue
            # NOTE(review): none of the configured tags equals '算法', so this
            # filter currently never triggers — confirm the intended tag.
            if tag == '算法' and '算法' not in (title or ''):
                continue
            yield scrapy.Request(
                url,
                callback=self.detail_parse,
                meta={'tag': tag},
                dont_filter=True,
            )

        next_page_url = response.xpath('//a[@id="rtNext"]/@href').extract_first()
        # Follow pagination only when a next-page link actually exists.
        if next_page_url is not None:
            yield scrapy.Request(
                response.urljoin(next_page_url),
                callback=self.parse,
                meta={'tag': tag},
                dont_filter=True,
            )

    @staticmethod
    def _joined_text(response, xpath, sep=""):
        """Return all strings selected by *xpath*, joined by *sep* and stripped."""
        return sep.join(response.xpath(xpath).extract()).strip()

    def detail_parse(self, response):
        """Parse a job posting page into a ``JobscrawlerQianchengwuyouItem``.

        Pages without a job title are skipped with a warning.  Other fields
        that cannot be found are stored as empty strings.
        """
        job_name = response.xpath('//div[@class = "cn"]/h1/text()').extract_first()
        if job_name is None:
            self.logger.warning('Skipping %s: no job title found', response.url)
            return

        item = JobscrawlerQianchengwuyouItem()
        item['job_tag'] = response.meta['tag']
        item['job_url'] = response.url
        item['record_date'] = self.record_date

        # Job name, description, salary, benefits, experience and education
        # requirements.
        item['job_name'] = job_name.strip()
        item['job_info'] = self._joined_text(
            response, '//div[@class = "bmsg job_msg inbox"]//text()')
        item['job_salary'] = self._joined_text(
            response, '//div[@class = "cn"]/strong/text()')
        item['job_welfare'] = self._joined_text(
            response, '//span[@class="sp4"]/text()', sep=",")
        # The summary line is split into several text nodes; the second and
        # third ones are stored as experience and education respectively.
        summary_parts = response.xpath('//p[@class="msg ltype"]/text()').extract()
        item['job_exp_require'] = (
            summary_parts[1].strip() if len(summary_parts) > 1 else "")
        item['job_edu_require'] = (
            summary_parts[2].strip() if len(summary_parts) > 2 else "")

        # Company name, industry, nature, head-count, address, overview and
        # financing stage.
        company_name = response.xpath(
            '//div[@class = "com_msg"]//p/text()').extract_first()
        item['company_name'] = (company_name or "").strip()
        item['company_industry'] = self._joined_text(
            response, '//span[@class = "i_trade"]/..//text()')
        item['company_nature'] = self._joined_text(
            response, '//span[@class = "i_flag"]/../text()')
        item['company_people'] = self._joined_text(
            response, '//span[@class = "i_people"]/../text()')
        item['company_location'] = ""
        item['company_overview'] = self._joined_text(
            response, '//div[@class = "tmsg inbox"]//text()')
        item['company_financing_stage'] = ""
        yield item