• python- www.thisamericanlife.org转pdf


    环境安装

    pip install  requests
    pip install  beautifulsoup4
    pip install  pdfkit
    
    
    $ sudo apt-get install wkhtmltopdf  # ubuntu
    $ sudo yum install wkhtmltopdf      # centos
    

    脚本

    #!/usr/bin/env python3.5
    # -*- coding: utf-8 -*-
    # @Time    : 2019/11/18 下午10:48
    # @Author  : yon
    # @Email   : xxx@qq.com
    # @File    : day1.py.py
    
    import os
    import re
    import time
    import logging
    import pdfkit
    from bs4 import BeautifulSoup
    import requests
    
    
    # Download one This American Life transcript page and render its
    # <article> element to a PDF with pdfkit (a wrapper around wkhtmltopdf).

    # Browser-like headers sent with the page request.
    # NOTE(review): presumably needed so the site serves the normal page to a
    # script -- not verified here.
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36',
        'Referer': 'https://www.google.com/',
    }
    # Options forwarded to wkhtmltopdf by pdfkit.
    options = {
        'page-size': 'Letter',
        'encoding': "UTF-8",
        'custom-header': [
            ('Accept-Encoding', 'gzip')
        ]
    }

    # A timeout keeps the script from hanging forever on a stalled connection.
    resp = requests.get(
        'https://www.thisamericanlife.org/687/transcript',
        headers=headers,
        timeout=30,
    )
    # Fail loudly on 4xx/5xx instead of rendering an error page into the PDF.
    resp.raise_for_status()

    soup = BeautifulSoup(resp.content, "html.parser")
    body = soup.find("article")
    if body is None:
        # Without this guard str(None) would be rendered as a PDF that
        # contains only the text "None".
        raise SystemExit("no <article> element found in the downloaded page")
    all1 = str(body)
    # Pass the options defined above; previously they were built but never
    # used, so the page size and UTF-8 encoding settings had no effect.
    pdfkit.from_string(all1, "/home/yon/Desktop/tt.pdf", options=options)
    
    

    另外一种写法

    import os
    import re
    import time
    import logging
    import requests
    import urllib.request
    import os
    import stat
    import pdfkit
    from bs4 import BeautifulSoup
    
    # headers = {
    #     # 'Accept': 'application/json, text/javascript, */*; q=0.01',
    #     'Accept': '*/*',
    #     'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-US;q=0.7',
    #     'Cache-Control': 'no-cache',
    #     'accept-encoding': 'gzip, deflate, br',
    #     'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36',
    #     'Referer': 'https://www.google.com/'
    # }
    #
    #
    # resp = requests.get('https://www.thisamericanlife.org/687/transcript', headers=headers)
    #
    # html = resp.content
    # with open("thisaericanlife.html", 'wb') as f:
    #     f.write(html)
    
    # Parse a previously saved copy of the transcript page and render the
    # children of its <article> element to a PDF.

    # Open in binary mode inside a `with` block: the handle is closed
    # deterministically, and BeautifulSoup receives raw bytes (as it does when
    # given resp.content) instead of text decoded with the locale's default
    # encoding.
    with open("thisaericanlife.html", "rb") as saved_page:
        soup = BeautifulSoup(saved_page, "html.parser")

    article = soup.article
    if article is None:
        # Give a clear message instead of an AttributeError on `.contents`.
        raise SystemExit("no <article> element found in thisaericanlife.html")

    print(article.contents)
    print("类型")

    # Concatenate the serialized child nodes in one pass.
    html = "".join(str(node) for node in article.contents)

    print(html)

    # The fragment carries no <meta charset>, so tell wkhtmltopdf explicitly
    # that the input string is UTF-8.
    pdfkit.from_string(html, "/home/baixiaoxu/desk/tt.pdf", options={'encoding': "UTF-8"})
    
    
  • 相关阅读:
    重写与重载的区别
    UDP模式与TCP模式的区别
    什么是GC?为什么会有GC?
    centos 7-8 安装 ms sql server 2019
    Phaser3 游戏开发入门——自定义构建Phaser库
    Visual Studio 下C#编译器在解析属性名时如果增加一个get_[您的另一个已经包含在类中属性名]的属性会报错,微软大哥这是什么鬼?
    Visual Studio 2015 Update 3 ISO
    react项目中引用amap
    js 截取网址中的某一段字符串
    解决react下找不到原生高德地图AMap类的问题
  • 原文地址:https://www.cnblogs.com/g2thend/p/11893161.html
Copyright © 2020-2023  润新知