下载、备份豆瓣广播

这是一个用于备份豆瓣广播的爬虫。用国内的网站是很没安全感的，不小心说错了话，账号一秒就没了。其实很多时候账号是不重要的，重要的是自己发过的内容，这些都是自己的劳动心血啊，也是情感回忆。给自己留一条后路吧。

直接上干货

github项目地址:Backup-Douban-Broadcast 备份豆瓣广播

python3源代码

所需的库:1、requests 2、lxml

#!/usr/bin/python3
from lxml import etree
import requests
import time



# Request headers that make the crawler look like a logged-in browser session.
# Replace both values with the ones copied from your own browser.
headers = {
    # The key must be spelled 'User-Agent' exactly: a misspelled key is sent as
    # an unrelated custom header and requests keeps its own default User-Agent.
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
    'Cookie':'bid=HcsfdfDgjY; ll="118283"; _ga=GA1.2.698252965426060945; gr_user_id=55fea3dd-24665146bb-be056d20662f; __utmv=30149280.6427; _vwo_uuid_v2=FA20618BAACsfs063B0F50F1614561268fa58eacb69817156a2; push_doumail_num=0; push_noty_num=0; viewed="26462816"; ap=1; __utmc=30145680; ct=y; ps=y; _gid=GA1.2.654610.1526725718; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1526752769%2C%22https%3A%2F%2Faccounts.douban.com%2Fsafety%2Funlock_sms%2Fresetpassword%3Fconfirmation%3D5f656742cffec30%26alias%3D%22%5D; _pk_ses.100001.8cb4=*; __utma=30149280.698252912.1506060945.1526737379.1526752773.89; __utmz=30149280.1526752773.89.44.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/safety/unlock_sms/resetpassword; __utmt=1; dbcl2="64279887:1ntQKZ/e4dU"; ck=P2O4; _pk_id.100001.8cb4=e7e1a240646f34ee.1506738033.106.1526752928.1526737419.; __utmb=30149280.5.10.1526752773'
}

# Fetch one page of broadcasts, parse it as HTML and back it up.
def getWeb(page):
    """Download one page of broadcasts and append them to ``douban.txt``.

    Requests page ``page`` of the hard-coded user's status list with the
    module-level ``headers``, extracts the broadcast texts and their
    timestamps via XPath, echoes every (timestamp, text) pair to stdout and
    appends it to ``douban.txt``.

    Args:
        page: Page number substituted into the ``p`` query parameter.
    """
    url = 'https://www.douban.com/people/yekingyan/statuses?p=%s' % page
    webData = requests.get(url,headers=headers).text
    s = etree.HTML(webData)
    # Pause between requests: according to the author, going too fast gets the
    # account locked (not banned) by Douban, and every unlock SMS costs money.
    time.sleep(2)

    # Broadcast texts and their timestamps, extracted with lxml.
    says = s.xpath('//*[@id="content"]/div/div[1]/div[3]/div/div/div/div[2]/div[1]/blockquote/p/text()')
    times = s.xpath('//*[@id="content"]/div/div[1]/div[3]/div/div/div/div[2]/div[2]/span/@title')

    # zip() pairs the two lists positionally and stops at the shorter one.
    entries = list(zip(times,says))
    if not entries:
        return

    # Open the output once per page instead of once per entry. The encoding is
    # explicit because, per the author, the script otherwise stops when it
    # meets a rare character.
    with open('douban.txt','a',encoding='utf-8') as f:
        for time1,say in entries:
            print(time1)
            print(say)
            print('')

            # One record: timestamp line, text line, blank separator line.
            f.write(time1)
            f.write('\n')
            f.write(say)
            f.write('\n')
            f.write('\n')


# Start from a clean output file: truncate any previous backup and write a
# hint about the expected encoding. utf-8 matches the encoding used when
# entries are appended later.
with open('douban.txt','wt',encoding='utf-8') as f:
    # The trailing blank line keeps the hint separate from the first entry
    # appended by getWeb; without it the first timestamp is glued to the hint.
    f.write("If you see a garbled file,make sure the file is encoded as utf-8.\n\n")

# Ask how many pages to back up, then fetch pages 1..pageNumber in order.
pageNumber = int(input(u"请输入要备份的页数，为豆瓣前几页的页数\n（如若全部备份，请直接输入一个较大数）："))
for i in range(1,pageNumber + 1):
    getWeb(i)

python2源代码

所需的库:1、requests 2、lxml

改自上面的python3代码，测试过，可用。最好还是学python3吧，未来趋势，毕竟python2到2020年就不维护了。

#!/usr/bin/python2
#coding:utf-8

from lxml import etree
import requests
import time
import sys

# Python 2 only: sys.setdefaultencoding is removed from the sys module during
# interpreter start-up, so the module is reloaded to get it back before the
# interpreter-wide default string encoding is switched to UTF-8.
# NOTE(review): presumably this is what lets getWeb write unicode text to a
# file opened without an explicit encoding - confirm before removing.
reload(sys)
sys.setdefaultencoding('UTF-8')


# Request headers that make the crawler look like a logged-in browser session.
# Replace both values with the ones copied from your own browser.
headers = {
    # The key must be spelled 'User-Agent' exactly: a misspelled key is sent as
    # an unrelated custom header and requests keeps its own default User-Agent.
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
    'Cookie':'bid=HcsfdfDgjY; ll="118283"; _ga=GA1.2.698252965426060945; gr_user_id=55fea3dd-24665146bb-be056d20662f; __utmv=30149280.6427; _vwo_uuid_v2=FA20618BAACsfs063B0F50F1614561268fa58eacb69817156a2; push_doumail_num=0; push_noty_num=0; viewed="26462816"; ap=1; __utmc=30145680; ct=y; ps=y; _gid=GA1.2.654610.1526725718; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1526752769%2C%22https%3A%2F%2Faccounts.douban.com%2Fsafety%2Funlock_sms%2Fresetpassword%3Fconfirmation%3D5f656742cffec30%26alias%3D%22%5D; _pk_ses.100001.8cb4=*; __utma=30149280.698252912.1506060945.15267465479.1526752773.89; __utmz=30149280.1526752773.89.44.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/safety/unlock_sms/resetpassword; __utmt=1; dbcl2="64279887:1ntQKZ/e4dU"; ck=P2O4; _pk_id.100001.8cb4=e7e1a240646f34ee.1506738033.106.1526752928.1526737419.; __utmb=3015680.5.10.1526752773'
}


# Fetch one page of broadcasts, parse it as HTML and back it up.
def getWeb(page):
    """Download one page of broadcasts and append them to ``douban.txt``.

    Requests page ``page`` of the hard-coded user's status list with the
    module-level ``headers``, extracts the broadcast texts and their
    timestamps via XPath, echoes every (timestamp, text) pair to stdout and
    appends it to ``douban.txt``.

    Args:
        page: Page number substituted into the ``p`` query parameter.
    """
    url = 'https://www.douban.com/people/yekingyan/statuses?p=%s' % page
    webData = requests.get(url,headers=headers).text
    s = etree.HTML(webData)
    # Pause between requests: according to the author, going too fast gets the
    # account locked (not banned) by Douban, and every unlock SMS costs money.
    time.sleep(2)

    # Broadcast texts and their timestamps, extracted with lxml.
    says = s.xpath('//*[@id="content"]/div/div[1]/div[3]/div/div/div/div[2]/div[1]/blockquote/p/text()')
    times = s.xpath('//*[@id="content"]/div/div[1]/div[3]/div/div/div/div[2]/div[2]/span/@title')

    # zip() pairs the two lists positionally and stops at the shorter one.
    entries = list(zip(times,says))
    if not entries:
        return

    # Open the output once per page instead of once per entry.
    with open('douban.txt','a') as f:
        for time1,say in entries:
            # A single parenthesised argument prints the same under Python 2.
            print(time1)
            print(say)
            print('')

            # One record: timestamp line, text line, blank separator line.
            f.write(time1)
            f.write('\n')
            f.write(say)
            f.write('\n')
            f.write('\n')


# Start from a clean output file: truncate any previous backup and write a
# hint about the expected encoding.
with open('douban.txt','wt') as f:
    # The trailing blank line keeps the hint separate from the first entry
    # appended by getWeb; without it the first timestamp is glued to the hint.
    f.write("If you see a garbled file,make sure the file is encoded as utf-8.\n\n")

# Ask how many pages to back up, then fetch pages 1..pageNumber in order.
# raw_input returns the typed text as-is; Python 2's input() would evaluate it
# as an expression first.
pageNumber = int(raw_input(u"请输入要备份的页数，为豆瓣前几页的页数\n（如若全部备份，请直接输入一个较大数）："))
for i in range(1,pageNumber + 1):
    getWeb(i)

使用说明

直接复制的代码是不能用的，因为脚本里下面的'User-Agent'和 'Cookie'信息还是我的~~（我现在cookie变了，你别想做坏事）~~，需要把headers数据改成你自己的。

headers = {
    # The key must be spelled 'User-Agent' exactly: a misspelled key is sent as
    # an unrelated custom header and requests keeps its own default User-Agent.
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
    'Cookie':'bid=HcsfdfDgjY; ll="118283"; _ga=GA1.2.698252965426060945; gr_user_id=55fea3dd-24665146bb-be056d20662f; __utmv=30149280.6427; _vwo_uuid_v2=FA20618BAACsfs063B0F50F1614561268fa58eacb69817156a2; push_doumail_num=0; push_noty_num=0; viewed="26462816"; ap=1; __utmc=30145680; ct=y; ps=y; _gid=GA1.2.654610.1526725718; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1526752769%2C%22https%3A%2F%2Faccounts.douban.com%2Fsafety%2Funlock_sms%2Fresetpassword%3Fconfirmation%3D5f656742cffec30%26alias%3D%22%5D; _pk_ses.100001.8cb4=*; __utma=30149280.698252912.1506060945.15267465479.1526752773.89; __utmz=30149280.1526752773.89.44.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/safety/unlock_sms/resetpassword; __utmt=1; dbcl2="64279887:1ntQKZ/e4dU"; ck=P2O4; _pk_id.100001.8cb4=e7e1a240646f34ee.1506738033.106.1526752928.1526737419.; __utmb=3015680.5.10.1526752773'
}

怎么获取'User-Agent'和 'Cookie'呢？

1、首先你要在浏览器登录你的豆瓣账号。

2、请用火狐或chrome等浏览器用F12打开开发者模式

下图演示的是chrome浏览器的操作

下载、备份豆瓣广播

如内容空白，请保持开发者模式，再刷新一下页面。

上面绿色框的就是需要获取的内容。

3、替换脚本中的User-Agent和Cookie

4、运行并输入要备份的页数

enjoy