【发布时间】:2012-11-01 14:57:27
【问题描述】:
我使用这个脚本来获取某个目录的所有文件更新列表。然后我解析该列表以获取我在该目录中一直处于活动状态的时间段列表。这样我就可以快速了解我在该项目上花费了多少时间,并知道要向我的客户收取什么费用。
我写了一个小python脚本,改编自:https://github.com/jncraton/PythonDropboxUploader
我添加了底部函数来从https://www.dropbox.com/events?ns=false&n=50检索特定事件页面
我在 2 个月前使用过该脚本,它运行良好,但现在我收到 403:禁止错误:
eventSrc = self.browser.open(req).read()
可能 DropBox 试图阻止像我这样的爬虫,以促使程序员改用他们的 API,但不幸的是,该 API 不支持列出事件。
谁能帮我让它重新工作?
这是创建连接的python代码:
import mechanize
import urllib
import re
import json
class DropboxConnection:
""" Creates a connection to Dropbox """
email = ""
password = ""
root_ns = ""
token = ""
browser = None
def __init__(self, email, password):
self.email = email
self.password = password
self.login()
self.get_constants()
def login(self):
""" Login to Dropbox and return mechanize browser instance """
# Fire up a browser using mechanize
self.browser = mechanize.Browser()
self.browser.set_handle_equiv(False)
self.browser.set_handle_redirect(True)
self.browser.set_handle_referer(True)
self.browser.set_handle_robots(False)
self.browser.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:14.0) Gecko/20120722 Firefox/14.0.1')]
# Browse to the login page
self.browser.open('https://www.dropbox.com/login')
# Enter the username and password into the login form
isLoginForm = lambda l: l.action == "https://www.dropbox.com/login" and l.method == "POST"
try:
self.browser.select_form(predicate=isLoginForm)
except:
self.browser = None
raise(Exception('Unable to find login form'))
self.browser['login_email'] = self.email
self.browser['login_password'] = self.password
self.browser['t'] = "1230"
# Send the form
response = self.browser.submit()
def get_constants(self):
""" Load constants from page """
home_src = self.browser.open('https://www.dropbox.com/home').read()
try:
self.root_ns = re.findall(r"root_ns: (\d+)", home_src)[0]
self.token = re.findall(r"TOKEN: '(.+)'", home_src)[0]
except:
raise(Exception("Unable to find constants for AJAX requests"))
def upload_file(self, local_file, remote_dir, remote_file):
""" Upload a local file to Dropbox """
if(not self.is_logged_in()):
raise(Exception("Can't upload when not logged in"))
self.browser.open('https://www.dropbox.com/')
# Add our file upload to the upload form
isUploadForm = lambda u: u.action == "https://dl-web.dropbox.com/upload" and u.method == "POST"
try:
self.browser.select_form(predicate=isUploadForm)
except:
raise(Exception('Unable to find upload form'))
self.browser.form.find_control("dest").readonly = False
self.browser.form.set_value(remote_dir, "dest")
self.browser.form.add_file(open(local_file, "rb"), "", remote_file)
# Submit the form with the file
self.browser.submit()
def get_dir_list(self, remote_dir):
""" Get file info for a directory """
if(not self.is_logged_in()):
raise(Exception("Can't download when not logged in"))
req_vars = "ns_id=" + self.root_ns + "&referrer=&t=" + self.token
req = urllib2.Request('https://www.dropbox.com/browse' + remote_dir, data=req_vars)
req.add_header('Referer', 'https://www.dropbox.com/home' + remote_dir)
dir_info = json.loads(self.browser.open(req).read())
dir_list = {}
for item in dir_info['file_info']:
# Eliminate directories
if(item[0] == False):
# get local filename
absolute_filename = item[3]
local_filename = re.findall(r".*\/(.*)", absolute_filename)[0]
# get file URL and add it to the dictionary
file_url = item[8]
dir_list[local_filename] = file_url
return dir_list
def get_download_url(self, remote_dir, remote_file):
""" Get the URL to download a file """
return self.get_dir_list(remote_dir)[remote_file]
def download_file(self, remote_dir, remote_file, local_file):
""" Download a file and save it locally """
fh = open(local_file, "wb")
fh.write(self.browser.open(self.get_download_url(remote_dir, remote_file)).read())
fh.close()
def is_logged_in(self):
""" Checks if a login has been established """
if(self.browser):
return True
else:
return False
def getEventsPage(self, n):
if(not self.is_logged_in()):
raise(Exception("Can't get event page when not logged in"))
url = 'https://www.dropbox.com/next_events'
values = {'cur_page': n, 'ns_id': 'false'}
data = urllib.urlencode(values)
req = mechanize.Request(url, data)
# print url + '?' + data
eventSrc = self.browser.open(req).read()
return eventSrc
这是解析事件页面的循环:
from dbupload import DropboxConnection
from getpass import getpass
from bs4 import BeautifulSoup
import re
import parsedatetime.parsedatetime as pdt
import parsedatetime.parsedatetime_consts as pdc
c = pdc.Constants()
p = pdt.Calendar(c)
email = "myemail@gmail.com" # raw_input("Enter Dropbox email address:")
password = getpass("Enter Dropbox password:")
dateFile = open('all_file_updates.txt', "wb")
try:
# Create the connection
conn = DropboxConnection(email, password)
except:
print("Connection failed")
else:
print("Connection succesful")
n = 250
found = 0
while(n >= 0):
eventsPageSrc = conn.getEventsPage(n)
soup = BeautifulSoup(eventsPageSrc)
table = soup.find("table", {"id": "events"})
for row in table.findAll('tr'):
link = row.find("a", href=re.compile('^https://dl-web.dropbox.com/get/ProjectName'))
if(link != None):
dateString = row.find("td", attrs={'class': 'modified'}).string
date = p.parse(dateString)
dateFile.write('Date: ' + str(date) + ' file: ' + link.string + '\n')
found = found + 1
n = n - 1
print 'page: ' + str(n) + ' Total found: ' + str(found)
【问题讨论】:
-
你确定你给了它正确的用户名/密码组合吗?
-
是的,当我在事件 url 中将 https 更改为 http 时,返回的 HTML 包含我的 DropBox 文件夹列表。大概和这个有关:stackoverflow.com/questions/12743873/…
-
问题:为什么不直接使用Dropbox API?
-
我找不到获取 API 中所有文件更新列表的方法。它支持上传和下载文件以及获取某个文件的修订版,但我没有看到对我想要做的事情的支持:获取所有更新的列表。更新以 RSS 提要的形式提供,但没有足够的历史记录(约 200 页)
标签: python mechanize dropbox http-status-code-403 scrape