抓取播客翻译
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# get_transcript.py
"""
Download the episode transcripts from https://podcast.duolingo.com/spanish.

Every listing page is scanned for episode links.  Each episode page is then
fetched, its "<strong>speaker</strong> text</p>" fragments are extracted and
written to transcript/<episode>.txt.  Episodes whose file already exists are
skipped, so the script can be re-run to pick up only new episodes.
"""
# Reminder on the response attributes:
#   requests.encoding     encoding
#   requests.status_code  status code
#       200  success
#       4xx  client error -> 404 Page Not Found
#       5xx  server error
import requests
import re
import os

main = 'https://podcast.duolingo.com/spanish'  # landing page (also listing page 1)
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
}

# Site root: the landing page URL without its trailing 'spanish' segment.
SITE_ROOT = main[:-len('spanish')]
OUTPUT_DIR = 'transcript'
MAX_PAGES = 9          # listing pages tried before giving up
REQUEST_TIMEOUT = 30   # seconds; without a timeout requests can block forever

# Raw strings keep '\s' a regex escape instead of an invalid string escape.
# '[^"]*' and '.*?' stop at the first closing delimiter, so several links or
# paragraphs on one physical line are matched separately.
EPISODE_LINK_RE = re.compile(r'entry-title">\s*<a href="([^"]*)" rel')
TRANSCRIPT_LINE_RE = re.compile(r'strong>(.*?)</strong>(.*?)</p>')


def page_url(index):
    """Return the URL of listing page *index* (1-based)."""
    # Page 1 is the landing page itself; later pages append their number,
    # e.g. 'https://podcast.duolingo.com/spanish2'.
    return main if index == 1 else main + str(index)


def episode_titles(listing_html):
    """Return the episode names linked from the HTML of a listing page."""
    # The first two characters of each href are dropped
    # (presumably a leading './' -- confirm against the site's markup).
    return [href[2:] for href in EPISODE_LINK_RE.findall(listing_html)]


def transcript_text(episode_html):
    """Return the transcript contained in the HTML of an episode page.

    Each matched paragraph contributes the bold part followed by the rest of
    the paragraph and a blank line.
    """
    return ''.join(
        speaker + speech + '\n\n'
        for speaker, speech in TRANSCRIPT_LINE_RE.findall(episode_html)
    )


def save_transcript(title):
    """Fetch the episode called *title* and write its transcript to disk.

    Nothing is fetched when the target file already exists, and nothing is
    written when the episode request fails, so a later run can retry it.
    """
    # os.path.join picks the right separator on both Linux and Windows.
    filename = os.path.join(OUTPUT_DIR, title + '.txt')
    if os.path.exists(filename):
        print(filename, 'existed!')
        return

    episode = SITE_ROOT + title  # episode URL
    req = requests.get(episode, headers=headers, timeout=REQUEST_TIMEOUT)
    print('{episode} with status code {status}.'.format(episode=episode, status=req.status_code))
    if not req.ok:
        # Writing an (empty) file for an error page would make the
        # "already exists" check above skip this episode forever.
        print(filename, 'skipped!')
        return

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    # Explicit UTF-8: the platform default (e.g. gbk on Chinese Windows)
    # cannot encode Spanish characters such as 'ñ'.
    with open(filename, 'w', encoding='utf-8') as fp:
        fp.write(transcript_text(req.text))
    print(filename, 'added!')


def download_transcripts(max_pages=MAX_PAGES):
    """Walk up to *max_pages* listing pages and save every new transcript.

    Stops early at the first listing page that answers 404.
    """
    for index in range(1, max_pages + 1):
        page = page_url(index)
        r = requests.get(page, headers=headers, timeout=REQUEST_TIMEOUT)
        print('{page} with status code {status}.'.format(page=page, status=r.status_code))
        if r.status_code == 404:  # no more listing pages
            print('404 Page Not Found!')
            break
        for title in episode_titles(r.text):
            save_transcript(title)


if __name__ == '__main__':
    download_transcripts()
结果:
注意事项:
1、以上是在Ubuntu系统实现的,如果使用Windows的话需要进行一些修改,如将路径分隔符"/"转换为"\",而因为Python中转义字符由"\"符号开头,所以在写路径时要写双反斜杠"\\"(也可以用os.path.join拼接路径,这样就不必手动区分系统)。
2、以上代码可以在Ubuntu系统中运行,但是我在Windows中运行时出现了"UnicodeEncodeError: 'gbk' codec can't encode character '\xf1' in position 30: illegal multibyte sequence"错误,需要对代码进行修改,在第49行打开文本文件时需要指明编码:"with open(filename, 'w+', encoding='utf-8') as fp:",亲测可运行。