抓取播客翻译
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# get_transcript.py
"""Download the episode transcripts from https://podcast.duolingo.com/spanish.

The script walks the paginated episode index, follows every episode link it
finds on each index page and stores the text captured from the episode page
in ``transcript/<episode>.txt``.  Episodes whose file already exists are
skipped, so re-running the script only fetches what is still missing.
"""

# Attributes of a ``requests`` response used or worth knowing here:
#   response.encoding     text encoding of the response
#   response.status_code  HTTP status code
#       200  success
#       4xx  client error  -> 404 Page Not Found
#       5xx  server error
import os
import re

import requests

main = 'https://podcast.duolingo.com/spanish'  # first page of the episode index
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
}

TIMEOUT = 30  # seconds; without a timeout requests.get() may block forever
OUTPUT_DIR = 'transcript'

# Raw strings keep the backslash of ``\s``; the non-greedy groups stop at the
# first closing delimiter instead of swallowing several tags on one line.
_EPISODE_LINK = re.compile(r'entry-title">\s*<a href="(.*?)" rel')
_TRANSCRIPT_PARAGRAPH = re.compile(r'strong>(.*?)</strong>(.*?)</p>')


def _index_page_url(number):
    """Return the URL of index page *number* (1-based)."""
    # Page 1 is the main page itself; later pages append their number,
    # e.g. 'https://podcast.duolingo.com/spanish2'.
    return main if number == 1 else main + str(number)


def _write_transcript(filename, html):
    """Write every transcript paragraph captured from *html* to *filename*."""
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    # Explicit UTF-8: the locale default (e.g. gbk on Windows) cannot encode
    # characters such as '\xf1' and raises UnicodeEncodeError.
    with open(filename, 'w+', encoding='utf-8') as fp:
        for groups in _TRANSCRIPT_PARAGRAPH.findall(html):
            for text in groups:
                fp.write(text)
                # NOTE(review): the separator is a single space, so the whole
                # transcript lands on one line; possibly meant to be '\n'.
                fp.write(' ')


def download_transcripts():
    """Fetch index pages 1-9 and save the transcript of each new episode.

    Stops at the first index page that answers 404.  An episode whose request
    does not return 200 is skipped without creating a file, so that a later
    run can retry it.

    Raises:
        requests.exceptions.RequestException: on connection errors or when a
            request exceeds ``TIMEOUT``.
    """
    site_root = main[:-len('spanish')]  # 'https://podcast.duolingo.com/'

    for number in range(1, 10):
        page = _index_page_url(number)
        r = requests.get(page, headers=headers, timeout=TIMEOUT)
        print('{page} with status code {status}.'.format(page=page, status=r.status_code))
        if r.status_code == 404:  # no more index pages
            print('404 Page Not Found!')
            break

        for href in _EPISODE_LINK.findall(r.text):
            # Drops the first two characters of the href -- presumably a
            # leading './'; verify against the page markup.
            title = href[2:]
            episode = site_root + title
            filename = OUTPUT_DIR + '/' + title + '.txt'
            if os.path.exists(filename):
                print(filename, 'existed!')
                continue

            req = requests.get(episode, headers=headers, timeout=TIMEOUT)
            print('{episode} with status code {status}.'.format(episode=episode, status=req.status_code))
            if req.status_code != 200:
                # Writing the file anyway would leave an empty transcript that
                # the existence check above would then skip on every later run.
                print(filename, 'skipped!')
                continue

            _write_transcript(filename, req.text)
            print(filename, 'added!')


if __name__ == '__main__':
    download_transcripts()
结果:
注意事项:
1、以上代码是在 Ubuntu 系统上实现的,如果使用 Windows,需要进行一些修改,例如将路径分隔符"/"换成"\";又因为 Python 中转义字符以"\"开头,所以在字符串里写路径时要写成双反斜杠"\\"。
2、以上代码可以在 Ubuntu 系统中运行,但我在 Windows 中运行时出现了"UnicodeEncodeError: 'gbk' codec can't encode character '\xf1' in position 30: illegal multibyte sequence"错误。原因是中文 Windows 下 open() 默认使用系统编码 gbk,无法写入西班牙语中的 ñ('\xf1')等字符,因此需要修改代码,在第49行打开文本文件时指明编码:"with open(filename, 'w+', encoding='utf-8') as fp:",亲测可运行。