• day51——爬虫(一)


     1 #!/usr/bin/env python
     2 # -*- coding: utf-8 -*-
     3 # @Time    : 2018/1/11 22:07
     4 # @Author  : lingxiangxiang
     5 # @File    : demon1.py
     6 '''爬虫阿铭linux教程,保存为本地的pdf文件'''
     7 # 需要知道三招教你做人
     8 # pdfkit.from_string("hello world", "1.pdf")
     9 # pdfkit.from_url("www.baidu.com", "2.pdf")
    10 # pdfkit.from_file("hello.html", "3.pdf")
    11 import re
    12 
    13 import os
    14 
    15 import pdfkit
    16 import requests
    17 
    18 
    19 if not os.path.exists("aminglinux"):
    20     os.mkdir("aminglinux")
    21 os.chdir("aminglinux")
    22 
    23 url = "http://www.apelearn.com/study_v2/"
    24 s = requests.session()
    25 text = s.get(url).text
    26 print(text)
    27 reg = re.compile(r'<li class="toctree-l1"><a class="reference internal" href="(.*)">.*</a></li>')
    28 result = reg.findall(text)
    29 res = list(set(result))
    30 pdfUrl = "http://www.apelearn.com/study_v2/"
    31 for i in res:
    32     url = "{0}{1}".format(pdfUrl, i)
    33     pdfFileName = i.replace("html", "pdf")
    34     print(pdfFileName)
    35     try:
    36         pdfkit.from_url(url, pdfFileName)
    37     except:
    38         continue
  • 相关阅读:
    FreePbx
    ntpdate和date
    Linux系统/dev/mapper目录浅谈
    利用rsync做全平台备份
    windows 共享给 linux
    Linux、UNIX设置开机自动运行命令
    JNU周练1019
    JNU周练1013
    2013/7/30 JNU周练
    二叉树遍历
  • 原文地址:https://www.cnblogs.com/yangjinbiao/p/8276268.html
Copyright © 2020-2023  润新知