# -*- coding:utf-8 -*-
# *************************************
# Program:   first example for learning how web spiders work
# Version:   1.0
# Author:    Silence
# Language:  Python 2.7 (written so that it also runs on Python 3)
# Date:      2014-03-15
# Purpose:   download every requested page of a Baidu Tieba URL and store
#            each page as an html file
# *************************************

import re
import string  # not used below; kept so the original import list stays intact

try:  # Python 2
    import urllib2
    from urllib2 import HTTPError, URLError
except ImportError:  # Python 3: the same API lives in urllib.request / urllib.error
    import urllib.request as urllib2
    from urllib.error import HTTPError, URLError


def baidu_tieba(url, begin_page, end_page):
    """Download pages ``begin_page``..``end_page`` (inclusive) and save them.

    Page ``i`` is fetched from ``url + str(i)`` and written, as raw bytes, to
    a file named ``i`` zero-padded to five digits plus ``.html`` (for example
    ``00001.html``), relative to the current working directory.

    A page that cannot be fetched or stored is reported on stdout and
    skipped; the remaining pages are still processed.

    Args:
        url: URL prefix to which the page number is appended.
        begin_page: first page number to download.
        end_page: last page number to download (inclusive).

    Returns:
        None.
    """
    for page in range(begin_page, end_page + 1):
        file_name = str(page).zfill(5) + '.html'
        print('正在下载第 %s 个网页,并将其存储为 %s .....' % (page, file_name))

        try:
            response = urllib2.urlopen(url + str(page))
            try:
                content = response.read()
            finally:
                # Release the connection even if read() fails.
                response.close()
        except HTTPError as e:
            print('亲,你给的地址出问题了。')
            if hasattr(e, 'reason'):
                print('Code: %s ;Reason %s' % (e.code, e.reason))
            # Nothing was downloaded for this page: do not write a file.
            continue
        except URLError as e:
            # Non-HTTP failures (DNS, refused connection, missing resource)
            # carry a reason but no status code.
            print('亲,你给的地址出问题了。')
            print('Reason %s' % (e.reason,))
            continue

        try:
            # Binary mode: read() returns raw bytes that must be stored as-is.
            with open(file_name, 'wb') as out:
                out.write(content)
        except IOError:
            print('存储网页 %s 出错!' % file_name)


def _append_page_param(url):
    """Return ``url`` completed so that it ends with an empty ``pn=`` parameter.

    A URL that already ends in ``pn=`` is returned unchanged. Otherwise the
    parameter is added with ``?`` when the URL has no query string yet and
    with ``&`` when it already has one.
    """
    if re.match(r'.+pn=$', url) is not None:
        return url
    if '?' not in url:
        return url + '?pn='
    if url.endswith(('?', '&')):
        return url + 'pn='
    return url + '&pn='


def main():
    """Prompt for the Tieba URL and the page range, then download the pages."""
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3

    entered = str(read_line('请输入贴吧的地址,去掉pn=后面的数字:\n')).strip()
    # Tieba only shows "pn=" after logging in and clicking a page number,
    # so complete the URL here when the user did not include it.
    bdurl = _append_page_param(entered)
    if bdurl != entered:
        print(bdurl)

    begin_page = int(read_line('请输入开始的页数:\n'))
    end_page = int(read_line('请输入终点的页数:\n'))

    baidu_tieba(bdurl, begin_page, end_page)


if __name__ == '__main__':
    main()