第一个模块:模拟登录新浪(Sina)微博。创建 weiboLogin.py 文件,输入以下代码:
- #! /usr/bin/env python
- # -*- coding: utf-8 -*-
- import sys
- import urllib
- import urllib2
- import cookielib
- import base64
- import re
- import json
- import hashlib
class weiboLogin:
    """Log in to Sina Weibo through its SSO endpoints (Python 2, urllib2).

    The login flow implemented here is:

    1. fetch ``servertime`` and ``nonce`` from ``prelogin.php``;
    2. hash the password together with those two values;
    3. POST the login form to ``login.php``;
    4. open the redirect URL found in the response so the session cookies
       end up in the shared cookie jar.
    """

    # A cookie-aware opener is installed globally when the class is defined,
    # so every later urllib2.urlopen() call (including those made by other
    # modules) reuses the cookies obtained during login.
    cj = cookielib.LWPCookieJar()
    cookie_support = urllib2.HTTPCookieProcessor(cj)
    opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
    urllib2.install_opener(opener)

    # Template of the form posted to login.php. 'servertime', 'nonce', 'su'
    # and 'sp' are filled in on a per-login copy; this dict is never mutated.
    postdata = {
        'entry': 'weibo',
        'gateway': '1',
        'from': '',
        'savestate': '7',
        'userticket': '1',
        'ssosimplelogin': '1',
        'vsnf': '1',
        'vsnval': '',
        'su': '',
        'service': 'miniblog',
        'servertime': '',
        'nonce': '',
        'pwencode': 'wsse',
        'sp': '',
        'encoding': 'UTF-8',
        'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack',
        'returntype': 'META',
    }

    # The prelogin URL asks for a JSONP-style callback, so the JSON payload
    # is expected between the outermost parentheses of the response.
    _PRELOGIN_JSON = re.compile(r'\((.*)\)')
    # The login response carries the follow-up URL as location.replace('...').
    _REDIRECT_URL = re.compile(r"location\.replace\('(.*?)'\)")

    @staticmethod
    def _as_bytes(value):
        """Return *value* as a byte string, UTF-8 encoding text input."""
        if isinstance(value, bytes):
            return value
        return value.encode('utf-8')

    def _parse_prelogin(self, data):
        """Extract ``(servertime, nonce)`` from a prelogin response body.

        Returns None (after printing an error) when the body does not
        contain a parenthesised JSON object with both keys.
        """
        try:
            json_data = self._PRELOGIN_JSON.search(data).group(1)
            fields = json.loads(json_data)
            return str(fields['servertime']), fields['nonce']
        except (AttributeError, KeyError, TypeError, ValueError):
            # AttributeError: no match; ValueError: invalid JSON;
            # KeyError/TypeError: JSON without the expected fields.
            print('Get severtime error!')
            return None

    def _extract_redirect(self, text):
        """Return the URL inside ``location.replace('...')`` or None."""
        match = self._REDIRECT_URL.search(text)
        if match is None:
            return None
        return match.group(1)

    def get_servertime(self):
        """Request prelogin.php and return ``(servertime, nonce)``.

        Returns None when the response cannot be parsed. Network errors
        raised by urllib2 propagate to the caller.
        """
        url = 'http://login.sina.com.cn/sso/prelogin.php?entry=weibo&callback=sinaSSOController.preloginCallBack&su=dW5kZWZpbmVk&client=ssologin.js(v1.3.18)&_=1329806375939'
        data = urllib2.urlopen(url).read()
        return self._parse_prelogin(data)

    def get_pwd(self, pwd, servertime, nonce):
        """Return the hashed password sent in the ``sp`` form field.

        Computes ``sha1(sha1(sha1(pwd)) + servertime + nonce)`` where every
        intermediate digest is its hexadecimal string. Text (unicode) input
        is UTF-8 encoded first, so non-ASCII passwords no longer depend on
        the interpreter's implicit ASCII conversion.
        """
        pwd1 = hashlib.sha1(self._as_bytes(pwd)).hexdigest()
        pwd2 = hashlib.sha1(self._as_bytes(pwd1)).hexdigest()
        salted = pwd2 + servertime + nonce
        return hashlib.sha1(self._as_bytes(salted)).hexdigest()

    def get_user(self, username):
        """Return the user name as sent in the ``su`` form field.

        The name is percent-quoted and then base64 encoded. ``b64encode``
        is used instead of ``encodestring(...)[:-1]`` because the latter
        inserts a newline after every 76 output characters, which corrupts
        long (for example percent-encoded Chinese) user names.
        """
        quoted = urllib.quote(self._as_bytes(username))
        encoded = base64.b64encode(self._as_bytes(quoted))
        if not isinstance(encoded, str):
            encoded = encoded.decode('ascii')
        return encoded

    def login(self, username, pwd):
        """Log in with *username* / *pwd* and print the outcome.

        Prints 'Login success!' when the redirect URL from the login
        response could be opened, 'Login error!' when it could not, and
        'get servertime error!' when the prelogin step failed. Always
        returns None. Errors raised while posting the login form itself
        propagate to the caller.
        """
        url = 'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.3.18)'
        try:
            prelogin = self.get_servertime()
        except Exception:
            # Best-effort boundary: any network failure during prelogin is
            # reported the same way as an unparsable response.
            prelogin = None
        if prelogin is None:
            print('get servertime error!')
            return
        servertime, nonce = prelogin

        # Work on a copy so the class-level template stays a dict (login can
        # be called again) and never holds the password hash.
        form = dict(weiboLogin.postdata)
        form['servertime'] = servertime
        form['nonce'] = nonce
        form['su'] = self.get_user(username)
        form['sp'] = self.get_pwd(pwd, servertime, nonce)

        headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux i686; rv:8.0) Gecko/20100101 Firefox/8.0 Chrome/20.0.1132.57 Safari/536.11'}
        req = urllib2.Request(
            url=url,
            data=urllib.urlencode(form),
            headers=headers,
        )
        text = urllib2.urlopen(req).read()

        login_url = self._extract_redirect(text)
        if login_url is None:
            print('Login error!')
            return
        try:
            # Opening the redirect URL lets the server set the session
            # cookies in the shared cookie jar.
            urllib2.urlopen(login_url)
        except Exception:
            print('Login error!')
            return
        print('Login success!')
然后创建main.py文件,输入以下代码:
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- import weiboLogin
- import urllib
- import urllib2
# Placeholders: replace both values with real Weibo credentials before running.
username, pwd = '你的微博用户名', '你的微博密码'

# One client object drives the whole login sequence.
WBLogin = weiboLogin.weiboLogin()
WBLogin.login(username, pwd)
注意:若登录失败,可能是因为你的账号在登录时需要输入验证码。可以先在网页上登录该账号确认一下;在账号设置里可以把某些地区设置为登录时不需要输入验证码。
参考:http://www.douban.com/note/201767245/
接下来,考虑实现抓取微博的内容。
此时遇到一个困难:抓取指定 URL 的微博页面时,初始只显示 15 条微博,其余内容是延迟加载的(即 Ajax 中常说的 lazy load)。也就是说,滚动条第一次拖到最下面时会加载第二部分,再次拖到最下面时会加载第三部分,此时一个页面的微博才算完整。所以,要获取一个微博页面的全部微博,需要对这个页面发起三次请求。创建 getWeiboPage.py 文件,相应代码如下:
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- import urllib
- import urllib2
- import sys
- import time
# HACK (Python 2 only): site.py deletes sys.setdefaultencoding at startup, so
# sys has to be reloaded to get it back. Switching the default encoding to
# UTF-8 makes implicit unicode -> str conversions (for example writing unicode
# text to a file opened with file()/open()) use UTF-8 instead of ASCII.
# The guard keeps the module importable on interpreters that have no
# reload() builtin and already default to UTF-8.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding('utf-8')
class getWeiboPage:
    """Download one Weibo profile page in the three requests it is split into.

    Each request's raw response is written to ``./output/textN`` and its
    backslash-escape-decoded form to ``./output/resultN`` (N = 1, 2, 3).
    Requests go through ``urllib2.urlopen``, so they use whatever opener is
    installed globally.
    """

    # Default query parameters. 'page' is an int because the page methods do
    # arithmetic on it. Per-request values are applied to a copy, so this
    # template only changes when get_msg() records the uid.
    body = {
        '__rnd': '',
        '_k': '',
        '_t': '0',
        'count': '50',
        'end_id': '',
        'max_id': '',
        'page': 1,
        'pagebar': '',
        'pre_page': '0',
        'uid': '',
    }
    # uids collected by get_uid(); shared by all instances.
    uid_list = []
    # Encoding used by writefile() for text (unicode) content.
    charset = 'utf8'

    def get_msg(self, uid):
        """Fetch all three parts of the profile page of user *uid*."""
        getWeiboPage.body['uid'] = uid
        url = self.get_url(uid)
        self.get_firstpage(url)
        self.get_secondpage(url)
        self.get_thirdpage(url)

    def get_firstpage(self, url):
        """Fetch the initially visible part; writes text1 / result1."""
        page = getWeiboPage.body['page']
        self._fetch_part(url, 1, {'pre_page': page - 1})

    def get_secondpage(self, url):
        """Fetch the first lazily loaded part; writes text2 / result2.

        'end_id' and 'max_id' are left at their (empty) defaults.
        """
        page = getWeiboPage.body['page']
        self._fetch_part(url, 2, {'count': '15', 'pagebar': '0', 'pre_page': page})

    def get_thirdpage(self, url):
        """Fetch the second lazily loaded part; writes text3 / result3."""
        page = getWeiboPage.body['page']
        self._fetch_part(url, 3, {'count': '15', 'pagebar': '1', 'pre_page': page})

    def get_url(self, uid):
        """Return the profile URL of user *uid* (a string)."""
        url = 'http://weibo.com/' + uid + '?from=otherprofile&wvr=3.6&loc=tagweibo'
        return url

    def get_uid(self, filename, delay=1):
        """Append every non-blank line of *filename* to ``uid_list``.

        Each uid is stripped of surrounding whitespace (including the line
        ending, which would otherwise end up inside request URLs), printed,
        and followed by a pause of *delay* seconds.
        """
        with open(filename) as fread:
            for line in fread:
                uid = line.strip()
                if not uid:
                    continue
                getWeiboPage.uid_list.append(uid)
                print(uid)
                time.sleep(delay)

    def writefile(self, filename, content):
        """Write *content* to *filename*, creating its directory if needed.

        Byte strings are written unchanged; text (unicode) is encoded with
        ``charset`` first, so the result does not depend on the
        interpreter's default encoding.
        """
        import os  # local import: os is not among this file's module imports

        directory = os.path.dirname(filename)
        if directory and not os.path.isdir(directory):
            os.makedirs(directory)
        if not isinstance(content, bytes):
            content = content.encode(getWeiboPage.charset)
        with open(filename, 'wb') as fw:
            fw.write(content)

    def _page_url(self, url, overrides):
        """Return *url* with the default parameters plus *overrides* appended.

        The parameters are joined with '&' when *url* already has a query
        string and with '?' otherwise, so they never run into the last
        existing parameter.
        """
        params = dict(getWeiboPage.body)
        params.update(overrides)
        if url.endswith('?') or url.endswith('&'):
            separator = ''
        elif '?' in url:
            separator = '&'
        else:
            separator = '?'
        return url + separator + urllib.urlencode(params)

    @staticmethod
    def _decode_escapes(text):
        """Decode backslash escapes (such as \\uXXXX) in the byte string *text*.

        SECURITY: this replaces ``eval("u'''" + text + "'''")``, which
        executed the HTTP response as Python source. A response containing
        ``'''`` could break out of the literal and run arbitrary code. The
        ``unicode_escape`` codec interprets the escapes without evaluating
        anything.
        """
        return text.decode('unicode_escape')

    def _fetch_part(self, url, index, overrides):
        """Request one part of the page and store its raw and decoded text."""
        request = urllib2.Request(self._page_url(url, overrides))
        text = urllib2.urlopen(request).read()
        self.writefile('./output/text%d' % index, text)
        self.writefile('./output/result%d' % index, self._decode_escapes(text))
在刚刚的main.py中加入相应内容,完整内容为:
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
import weiboLogin
# The page-fetching class lives in getWeiboPage.py as class getWeiboPage;
# importing "getWeiboMsg" fails because no such module is created.
import getWeiboPage
import urllib
import urllib2

# Placeholders: replace both values with real Weibo credentials before running.
username = '你的微博用户名'
pwd = '你的微博密码'

# Log in first so the requests below are sent with the session cookies.
WBLogin = weiboLogin.weiboLogin()
WBLogin.login(username, pwd)

# Fetch the three parts of one profile page; each call writes its output
# files under ./output/.
WBmsg = getWeiboPage.getWeiboPage()
url = 'http://weibo.com/1624087025?from=otherprofile&wvr=3.6&loc=tagweibo'
WBmsg.get_firstpage(url)
WBmsg.get_secondpage(url)
WBmsg.get_thirdpage(url)