#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2018/1/11 22:07
# @Author  : lingxiangxiang
# @File    : demon1.py
"""Crawl the Aming Linux tutorial and save each chapter as a local PDF file.

The script downloads the tutorial index page, collects the first-level
table-of-contents links, and renders every linked page to a PDF inside the
``aminglinux`` directory (created in the current working directory).
"""
# The three pdfkit entry points worth knowing:
# pdfkit.from_string("hello world", "1.pdf")
# pdfkit.from_url("www.baidu.com", "2.pdf")
# pdfkit.from_file("hello.html", "3.pdf")
import os
import re
import sys

import pdfkit
import requests


# Index page of the tutorial; chapter hrefs on it are relative to this URL.
BASE_URL = "http://www.apelearn.com/study_v2/"
# Directory (relative to the starting working directory) receiving the PDFs.
OUTPUT_DIR = "aminglinux"
# Seconds to wait for the index page before giving up instead of hanging.
REQUEST_TIMEOUT = 30

# First-level table-of-contents entries.  The href capture stops at the
# closing quote and the link text is matched lazily, so several <li> items
# on one physical line are still matched one by one.
CHAPTER_LINK_RE = re.compile(
    r'<li class="toctree-l1"><a class="reference internal" '
    r'href="([^"]+)">.*?</a></li>'
)


def extract_chapter_links(html):
    """Return the unique chapter hrefs found in *html*, in document order."""
    seen = set()
    links = []
    for href in CHAPTER_LINK_RE.findall(html):
        if href not in seen:
            seen.add(href)
            links.append(href)
    return links


def pdf_name_for(href):
    """Return the PDF file name for *href* by swapping only its extension.

    Unlike a plain ``str.replace("html", "pdf")`` this leaves any other
    "html" substring of the name untouched and also copes with hrefs that
    carry a fragment or no ``.html`` suffix at all.
    """
    return os.path.splitext(href)[0] + ".pdf"


def main():
    """Download the tutorial index and convert every chapter page to a PDF.

    Raises:
        requests.RequestException: if the index page cannot be fetched or
            the server answers with an HTTP error status.
    """
    if not os.path.isdir(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)
    # PDFs are written with relative names, so work from inside OUTPUT_DIR.
    os.chdir(OUTPUT_DIR)

    response = requests.get(BASE_URL, timeout=REQUEST_TIMEOUT)
    # Fail loudly on 4xx/5xx rather than scraping an error page for links.
    response.raise_for_status()

    chapter_links = extract_chapter_links(response.text)
    if not chapter_links:
        sys.stderr.write("no chapter links found at {0}\n".format(BASE_URL))
        return

    for href in chapter_links:
        page_url = "{0}{1}".format(BASE_URL, href)
        pdf_file_name = pdf_name_for(href)
        print(pdf_file_name)
        try:
            pdfkit.from_url(page_url, pdf_file_name)
        except Exception as exc:
            # Best effort: one page that fails to render must not abort the
            # remaining chapters, but the failure is reported instead of
            # being silently dropped.  KeyboardInterrupt/SystemExit are not
            # caught, so the crawl can still be interrupted.
            sys.stderr.write(
                "failed to convert {0}: {1}\n".format(page_url, exc)
            )


if __name__ == "__main__":
    main()