noonjuan

抓取播客翻译

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# get_transcript.py

"""
一个自动从https://podcast.duolingo.com/spanish中下载transcripts的程序
"""

# requests.encoding 编码
# requests.status_code 状态码
#     200 成功
#    4xx 客户端错误 -> 404 Page Not Found
#    5xx 服务器错误

import requests
import re
import os

# Walk the paginated episode listing of the Duolingo Spanish podcast and save
# each episode's transcript as transcript/<episode-slug>.txt. Episodes whose
# transcript file already exists are skipped, so the script can be re-run.

main = 'https://podcast.duolingo.com/spanish'  # landing page, also listing page 1
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
}
REQUEST_TIMEOUT = 30  # seconds; without a timeout requests.get can block forever
OUTPUT_DIR = 'transcript'

# Raw strings so that "\s" reaches the regex engine instead of being treated
# as an (invalid) Python string escape. Compiled once, outside the loops.
EPISODE_LINK_RE = re.compile(r'entry-title">\s*<a href="(.*)" rel')
TRANSCRIPT_LINE_RE = re.compile(r'strong>(.*)</strong>(.*)</p>')

for i in range(1, 10):  # listing pages 1..9; stops early on the first 404
    if i == 1:  # the first listing page is the landing page itself
        page = main
    else:  # later pages are 'https://podcast.duolingo.com/spanish2', 'spanish3', ...
        page = main + str(i)
    r = requests.get(page, headers=headers, timeout=REQUEST_TIMEOUT)
    print('{page} with status code {status}.'.format(page=page, status=r.status_code))

    if r.status_code == 404:  # no more listing pages
        print('404 Page Not Found!')
        break

    hrefs = EPISODE_LINK_RE.findall(r.text)  # every episode link on this page

    for h in hrefs:
        # Drop the first two characters of the href; presumably the links are
        # relative ('./<slug>') -- verify against the live page markup.
        title = h[2:]
        episode = main[:-7] + title  # site root (main minus 'spanish') + slug
        filename = os.path.join(OUTPUT_DIR, title + '.txt')
        if os.path.exists(filename):
            print(filename, 'existed!')
            continue
        req = requests.get(episode, headers=headers, timeout=REQUEST_TIMEOUT)
        print('{episode} with status code {status}.'.format(episode=episode, status=req.status_code))
        if req.status_code != 200:
            # Do not create a file for a failed download: an empty transcript
            # would be reported as "existed!" on later runs and never retried.
            continue
        os.makedirs(OUTPUT_DIR, exist_ok=True)
        # Transcripts contain non-ASCII characters (e.g. 'ñ'); an explicit
        # UTF-8 encoding avoids UnicodeEncodeError where the platform default
        # is a legacy code page such as gbk on Windows.
        with open(filename, 'w', encoding='utf-8') as fp:
            for lines in TRANSCRIPT_LINE_RE.findall(req.text):
                # Each match is (text inside <strong>, rest of the paragraph);
                # write both parts followed by a blank line.
                fp.write(''.join(lines))
                fp.write('\n\n')
            print(filename, 'added!')

 

结果:  

 

 

注意事项:

  1、以上是在ubuntu系统实现的,如果使用windows的话需要进行一些修改,如将"/"转换为"\",而因为python中转义字符由"\"符号开头,所以在写路径时要写双斜杠"\\"。

  2、以上代码可以在ubuntu系统中运行,但是我在windows中运行时出现了"UnicodeEncodeError: 'gbk' codec can't encode character '\xf1' in position 30: illegal multibyte sequence"错误,需要对代码进行修改,在第49行打开文本文件时需要指明编码:"with open(filename, 'w+', encoding='utf-8') as fp:",亲测可运行。

分类:

技术点:

相关文章: