day51——爬虫(一)

 1 #!/usr/bin/env python
 2 # -*- coding: utf-8 -*-
 3 # @Time    : 2018/1/11 22:07
 4 # @Author  : lingxiangxiang
 5 # @File    : demon1.py
 6 '''爬虫阿铭linux教程，保存为本地的pdf文件'''
 7 # 需要知道三招教你做人
 8 # pdfkit.from_string("hello world", "1.pdf")
 9 # pdfkit.from_url("www.baidu.com", "2.pdf")
10 # pdfkit.from_file("hello.html", "3.pdf")
11 import re
12 
13 import os
14 
15 import pdfkit
16 import requests
17 
18 
19 if not os.path.exists("aminglinux"):
20     os.mkdir("aminglinux")
21 os.chdir("aminglinux")
22 
23 url = "http://www.apelearn.com/study_v2/"
24 s = requests.session()
25 text = s.get(url).text
26 print(text)
27 reg = re.compile(r'<li class="toctree-l1"><a class="reference internal" href="(.*)">.*</a></li>')
28 result = reg.findall(text)
29 res = list(set(result))
30 pdfUrl = "http://www.apelearn.com/study_v2/"
31 for i in res:
32     url = "{0}{1}".format(pdfUrl, i)
33     pdfFileName = i.replace("html", "pdf")
34     print(pdfFileName)
35     try:
36         pdfkit.from_url(url, pdfFileName)
37     except:
38         continue

相关阅读:
FreePbx
ntpdate和date
Linux系统/dev/mapper目录浅谈
利用rsync做全平台备份
windows 共享给 linux
Linux、UNIX设置开机自动运行命令
JNU周练1019
JNU周练1013
2013/7/30 JNU周练
二叉树遍历

原文地址：https://www.cnblogs.com/yangjinbiao/p/8276268.html