# -*- coding: utf-8 -*-
# @ModuleName: 3. Free resume template scraper
# @Function:
# @Author: merry
# @Time: 2021/1/18 17:02
import requests
from lxml import etree
import os

# Define the request headers (a browser User-Agent)
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'
}

url = 'https://sc.chinaz.com/jianli/free.html'
# Fetch the listing page source
response = requests.get(url, headers=headers)
# Set the response encoding
response.encoding = 'utf-8'
new_response = response.text
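
# Optional hardening (not part of the original script): fail fast if the
# listing page did not load; requests raises here for 4xx/5xx status codes.
response.raise_for_status()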

# Parse the page with etree to pull out the detail-page hyperlinks
tree = etree.HTML(new_response)
# Select the big div wrapping each resume on the current page
get_url_list = tree.xpath('//div[@id="container"]/div')
# Create the folder the resumes are saved into
if not os.path.exists('./doc'):
    os.mkdir('./doc')
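# (An equivalent one-liner, if preferred: os.makedirs('./doc', exist_ok=True))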

# Iterate over the big divs
for url_li in get_url_list:
    # Detail-page url of the resume
    doc_url = 'https:' + url_li.xpath('./a/@href')[0]
    # Name of the resume
    doc_name = url_li.xpath('./a/img/@alt')[0]
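    # Template titles come straight from the page and may contain characters
    # that are invalid in filenames (e.g. '/'); a minimal sanitization step,
    # added here as an assumed-useful safeguard:
    import re  # would normally sit with the top-level imports
    doc_name = re.sub(r'[\\/:*?"<>|]', '_', doc_name)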
    # Request the detail page
    response = requests.get(doc_url, headers=headers)
    # Set the response encoding
    response.encoding = 'utf-8'
    new_response = response.text
    # Parse with etree
    tree = etree.HTML(new_response)
    # Build the target filename
    filename = f'./doc/{doc_name}.rar'
    # Select the first download link in each mirror list by class attribute
    li_list = tree.xpath('//div[@class="clearfix mt20 downlist"]//li[1]')
    # Iterate over the download links
    for li in li_list:
        # Pull the hyperlink out of the link's href attribute
        get_down_url = li.xpath('./a/@href')[0]
        # Request the document download link as binary content
        doc = requests.get(get_down_url, headers=headers).content
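        # Note: .content buffers the whole archive in memory; for large files,
        # requests' stream=True with iter_content() would be gentler, but the
        # original single-shot download is kept here.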
        # Save it into the folder
        with open(filename, 'wb') as fp:
            fp.write(doc)
        print(f'\033[32mCrawled ----{doc_name}---- done')
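
# The script stops at the first listing page. Later pages appear to follow a
# 'free_N.html' pattern (an assumption -- confirm in a browser first):
#
# for page in range(2, 6):
#     page_url = f'https://sc.chinaz.com/jianli/free_{page}.html'
#     # ...then repeat the fetch/parse/download steps above for page_url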