• python3 crawler: scraping job postings from Tencent Social Recruitment (hr.tencent.com)


    # -*- coding: utf-8 -*-
    # author: zxy
    # Date: 2018-9-23

    from lxml import etree
    import requests

    BASE_DOMAIN = "https://hr.tencent.com/"
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/67.0.3396.99 Safari/537.36'
    }

    def parse_detail_page(url):
        # Fetch one job-detail page and extract the fields of interest.
        position = {}
        response = requests.get(url, headers=HEADERS)
        html = etree.HTML(response.text)
        work_name = html.xpath("//tr[@class='h']/td/text()")[0]
        work_place = html.xpath("//tr[@class='c bottomline']/td[1]/text()")[0]
        work_category = html.xpath("//tr[@class='c bottomline']/td[2]/text()")[0]
        work_lack_number = html.xpath("//tr[@class='c bottomline']/td[3]/text()")[0]
        # The two <ul class="squareli"> blocks hold the duties and the requirements.
        more_infos = html.xpath("//ul[@class='squareli']")
        work_duty = more_infos[0].xpath(".//text()")
        work_require = more_infos[1].xpath(".//text()")

        position['work_name'] = work_name
        position['work_place'] = work_place
        position['work_category'] = work_category
        position['work_lack_number'] = work_lack_number
        position['work_duty'] = work_duty
        position['work_require'] = work_require

        return position

    def get_detail_urls(url):
        # Fetch one list page and turn its relative detail links into absolute URLs.
        response = requests.get(url, headers=HEADERS)
        html = etree.HTML(response.text)
        links = html.xpath("//tr[@class='even']//a/@href")
        return [BASE_DOMAIN + link for link in links]

    def spider():
        base_url = "https://hr.tencent.com/position.php?keywords=python&lid=0&tid=0&start={}#a"
        positions = []
        for x in range(0, 4):  # first 4 list pages (the full listing ran to 43 pages)
            url = base_url.format(x * 10)  # each list page holds 10 positions
            for detail_url in get_detail_urls(url):
                position = parse_detail_page(detail_url)
                positions.append(position)
                # Append each position to the output file as soon as it is parsed.
                with open('tecentRecruit.txt', 'a', encoding='utf-8') as f:
                    for key, value in position.items():
                        if key in ('work_duty', 'work_require'):
                            # These values are lists of text fragments; join them first.
                            f.write('{}: {}'.format(key, ''.join(value)))
                        else:
                            f.write(key + ': ' + value)
                        f.write('\n')
                    f.write('\n' * 3)
        return positions

    if __name__ == '__main__':
        spider()

    The resulting tecentRecruit.txt is shown in a screenshot in the original post (image not reproduced here).
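
If you would rather keep the structured records than a flat text dump, the positions list that spider() builds and returns can also be serialized to JSON. A minimal sketch, assuming the code above; the save_positions helper and the tecentRecruit.json filename are illustrative additions, not part of the original post:

    import json

    def save_positions(positions, path='tecentRecruit.json'):
        # Illustrative helper, not in the original script.
        # ensure_ascii=False keeps the Chinese field values human-readable.
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(positions, f, ensure_ascii=False, indent=2)

    # usage: save_positions(spider())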

  • Original post: https://www.cnblogs.com/z-712/p/9693729.html