• 爬取廖雪峰老师的博客


    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    
    import pdfkit
    import os
    import re
    import time
    import sys
    import random
    sys.path.append('../' )
    from mytools import mail
    import logging
    import requests
    from bs4 import BeautifulSoup
    from fake_useragent import UserAgent
    
    # Environment setup: make sure the log directory exists before the file
    # handler below tries to open a file inside it.
    log_path = '/home/jiangwenwen/python/log/'
    if not os.path.exists(log_path):
        os.makedirs(log_path)

    # Logger setup: INFO and above go to a UTF-8 encoded log file, each record
    # tagged with timestamp, source path, line number and level.
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s')
    # Build the log file path from log_path instead of repeating the directory,
    # so the directory created above and the one written to cannot drift apart.
    file_handler = logging.FileHandler(os.path.join(log_path, 'liaoxuefeng.log'), encoding='utf-8')
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)
    
    # A single User-Agent value is drawn from this object when the headers are
    # built below; the same headers dict is then reused for the index requests.
    ua = UserAgent()

    # HTTP headers sent when fetching a wiki index page.
    headers = dict([
        ("Host", "www.liaoxuefeng.com"),
        ("User-Agent", ua.random),
        ("Referer", "https://www.liaoxuefeng.com/wiki/1252599548343744"),
    ])

    # JavaScript run inside the rendered page (lazy-loading workaround): once the
    # DOM is ready, copy every element's data-src attribute into its src attribute.
    run_script = "$(function () { $('[data-src]').each(function () { $(this).attr('src', $(this).attr('data-src')); })})"

    # Options passed to pdfkit when rendering a page to PDF.
    options = dict()
    # Wait some milliseconds for javascript finish (default 200)
    options['--javascript-delay'] = '5000'
    options['--run-script'] = run_script
    
    
    def save_pdf(url, category, timeout=30):
        """Download every chapter linked from one wiki index page as a PDF.

        The index page at ``url`` is fetched and each ``<a>`` element with the
        class ``x-wiki-index-item`` is rendered with pdfkit into
        ``/home/jiangwenwen/liaoxuefeng/<category>/<title>.pdf``. Files that
        already exist are skipped, so the function can be re-run to resume.
        After each successful download it sleeps for a random 720-1200 seconds.

        Any exception is logged with its traceback and then passed to
        ``mail.sendMail``; nothing is re-raised to the caller.

        Args:
            url: URL of the wiki index page.
            category: Name of the sub-directory the PDFs are stored in.
            timeout: Seconds to wait for each HTTP request made with
                ``requests`` before giving up (keeps a stalled connection
                from hanging the whole run).
        """
        try:
            # Record the outgoing IP address in the log.
            response = requests.get('https://httpbin.org/ip', timeout=timeout)
            logger.info('Your IP is {0}'.format(response.json()['origin']))

            response = requests.get(url, headers=headers, timeout=timeout)
            # Fail loudly on an HTTP error instead of parsing an error page
            # that contains no chapter links and reporting nothing.
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            # Create the target directory once; it does not change per link.
            file_path = "/home/jiangwenwen/liaoxuefeng/" + category + "/"
            if not os.path.exists(file_path):
                os.makedirs(file_path)

            for child in soup.find_all("a", class_="x-wiki-index-item"):
                href = child.get('href')
                # .string is None when the anchor wraps more than one node;
                # fall back to the concatenated text in that case.
                title = child.string or child.get_text()
                if not href or not title:
                    logger.warning('Skipping index entry without href or title: %r', child)
                    continue

                # Download link (kept separate from the ``url`` argument).
                page_url = "https://www.liaoxuefeng.com" + href

                # Path separators in a title would be taken as directories, so
                # swap them for their full-width look-alikes (U+FF0F, U+FF3C).
                # NOTE(review): the replacement characters were unreadable in
                # the original line; full-width forms are assumed - confirm.
                safe_title = title.replace(u'/', u'\uff0f').replace(u'\\', u'\uff3c')

                # Absolute path of the output file.
                file_name = file_path + safe_title + ".pdf"

                # Only download when the file is not there yet.
                if not os.path.exists(file_name):
                    pdfkit.from_url(page_url, file_name, options=options)
                    logger.info(file_name + u'下载成功')
                    time.sleep(random.randint(720, 1200))
        except Exception as e:
            # Log first so the traceback is kept even if sending mail fails.
            logger.exception(str(e))
            mail.sendMail('廖雪峰的官方网站:' + str(e))
    
    
    # Wikis to download, as (index page URL, output sub-directory) pairs.
    # They are processed one after another in the order listed:
    # Java, Python, JavaScript, SQL, git.
    _wiki_targets = (
        ("https://www.liaoxuefeng.com/wiki/1252599548343744", "java"),
        ("https://www.liaoxuefeng.com/wiki/1016959663602400", "python"),
        ("https://www.liaoxuefeng.com/wiki/1022910821149312", "JavaScript"),
        ("https://www.liaoxuefeng.com/wiki/1177760294764384", "sql"),
        ("https://www.liaoxuefeng.com/wiki/896043488029600", "git"),
    )

    for _index_url, _category in _wiki_targets:
        save_pdf(_index_url, _category)

    # Written once all wikis have been processed.
    logger.info('下载成功!!!')
    
    
    
    
    
  • 相关阅读:
    bzoj 4548 小奇的糖果
    CF1151F Sonya and Informatics
    loj 2392「JOISC 2017 Day 1」烟花棒
    loj 2336「JOI 2017 Final」绳
    luogu P3620 [APIO/CTSC 2007]数据备份
    bzoj 4771 七彩树
    CF765F Souvenirs
    bzoj 3145 [Feyat cup 1.5]Str
    luogu P4482 [BJWC2018]Border 的四种求法
    第五章例题
  • 原文地址:https://www.cnblogs.com/jiangwenwen1/p/12188092.html
Copyright © 2020-2023  润新知