#!/usr/bin/python #-*-coding:utf-8-*- # 简易采集爬虫 # 1.采集Yahoo!Answers,parseData函数修改一下,可以采集任何网站 # 2.需要sqlite3或者pysqlite支持 # 3.可以在DreamHost.com空间上面运行 # 4.可以修改User-Agent冒充搜索引擎蜘蛛 # 5.可以设置暂停的时间,控制采集速度 # 6.采集Yahoo会被封IP数小时,所以这个采集用处不大 # Author: Lukin<mylukin@gmail.com> # Date : 2008-09-25 # 导入采集需要用到的模块 import re, sys, time import httplib, os.path as osp from urlparse import urlparse # 使用sqite数据库,为了兼容DreamHost.com的空间,只能这么写了 try : import sqlite3 as sqlite except ImportError: from pysqlite2 import dbapi2 as sqlite # 采集速度控制,单位秒 sleep = 0 # 数据库路径 dbname = \'./database.db\' # 设置提交的header头 headers = {"Accept": "*/*","Referer": "http://answers.yahoo.com/","User-Agent": "Mozilla/5.0+(compatible;+Googlebot/2.1;++http://www.google.com/bot.html)"} # 连接服务器 dl = httplib.HTTPConnection(\'answers.yahoo.com\') # 连接数据库 conn = sqlite.connect(osp.abspath(dbname)) # 创建数据库 def createDatabase(): global conn,dbname; if osp.isfile(osp.abspath(dbname)) : return c = conn.cursor() # 创建url列表存放表 c.execute(\'\'\'CREATE TABLE IF NOT EXISTS [collect]([cid] INTEGER PRIMARY KEY,[curl] TEXT,[state] INTEGER DEFAULT \'0\',UNIQUE([curl]));\'\'\') c.execute(\'\'\'CREATE INDEX IF NOT EXISTS [collect_idx_state] ON [collect]([state]);\'\'\') # 创建分类表 c.execute(\'\'\'CREATE TABLE IF NOT EXISTS [sorts]([sortid] INTEGER PRIMARY KEY,[sortname] TEXT,[sortpath] TEXT,[sortfoot] INTEGER DEFAULT \'0\',[sortnum] INTEGER DEFAULT \'0\',UNIQUE([sortpath]));\'\'\') c.execute(\'\'\'CREATE INDEX IF NOT EXISTS [sorts_idx_sortname] ON [sorts]([sortname]);\'\'\') c.execute(\'\'\'CREATE INDEX IF NOT EXISTS [sorts_idx_sortfoot] ON [sorts]([sortfoot]);\'\'\') # 创建文章表 c.execute(\'\'\'CREATE TABLE IF NOT EXISTS [article]([aid] INTEGER PRIMARY KEY,[sortid] INTEGER DEFAULT \'0\',[hits] INTEGER DEFAULT \'0\',[title] TEXT,[path] TEXT,[question] TEXT,[banswer] TEXT,[oanswer] TEXT,UNIQUE([path]));\'\'\') c.execute(\'\'\'CREATE INDEX IF NOT EXISTS [article_idx_sortid] ON [article]([sortid]);\'\'\') # 事物提交 conn.commit() c.close() # 执行采集 def collect(url="http://answers.yahoo.com/"): global dl,error,headers; R = 0 print "GET:",url urls = urlparse(url); path = urls[2]; if urls[4]!=\'\' : path += \'?\' + urls[4] dl.request(method="GET", url=path, headers=headers); rs = dl.getresponse() if rs.status==200 : R = parseData(rs.read(),url); else : print "3 seconds, try again ..."; time.sleep(3) dl.request(method="GET", url=path, headers=headers); rs = dl.getresponse() if rs.status==200 : R = parseData(rs.read(),url); else : print "3 seconds, try again ..."; time.sleep(3) dl.request(method="GET", url=path, headers=headers); rs = dl.getresponse() if rs.status==200 : R = parseData(rs.read(),url); else : print "Continue to collect ..." R = 3 # 更新记录 updateOneUrl(url,R) # 返回结果 return R # 处理采集到的数据 def parseData(html,url): global dl,conn; R = 2; c = conn.cursor() # 格式化html代码 format = formatURL(clearBlank(html),url) # 取出所有的连接 urls = re.findall(r\'\'\'(<a[^>]*?href="([^"]+)"[^>]*?>)|(<a[^>]*?href=\'([^\']+)\'[^>]*?>)\'\'\',format,re.I) if urls != None : i = 0 # 循环所有的连接 for regs in urls : # 得到一个单一的url sUrl = en2chr(regs[1].strip()) # 判断url是否符合规则,符合,则插入数据库 if re.search(\'http(.*?)/(dir|question)/index(.*?)\',sUrl,re.I) != None : if re.search(\'http(.*?)/dir/index(.*?)\',sUrl,re.I) != None: if sUrl.find(\'link=list\') == -1 and sUrl.find(\'link=over\') == -1 : sUrl+= \'&link=over\' else: sUrl = sUrl.replace(\'link=list\',\'link=over\') if sUrl[-11:]==\'link=mailto\' : continue try : c.execute(\'INSERT INTO [collect]([curl])VALUES(?);\',(sUrl,)) i = i + 1 except sqlite.IntegrityError : pass if i>0 : print "Message: %d get a new URL." % (i,) # 截取数据 if re.search(\'http(.*)/question/index(.*)\',url,re.I) != None : sortfoot = 0 # 自动创建分类和分类关系 guide = sect(format,\'<ol id="yan-breadcrumbs">\',\'</ol>\',\'(<li>(.*?)Home(.*?)</li>)\') aGuide = re.findall(\'<a[^>]*href="[^"]*"[^>]*>(.*?)</a>\',guide,re.I) if aGuide != None : sortname = "" for sortname in aGuide : sortname = sortname.strip() sortpath = en2path(sortname) # 查询分类是否存在 c.execute(\'SELECT [sortid],[sortname] FROM [sorts] WHERE [sortpath]=? LIMIT 0,1;\',(sortpath,)) row = c.fetchone(); # 分类不存在,添加分类 if row==None : c.execute(\'INSERT INTO [sorts]([sortname],[sortpath],[sortfoot])VALUES(?,?,?);\',(sortname,sortpath,sortfoot)) sortfoot = c.lastrowid else: sortfoot = row[0] # 标题 title = sect(format,\'<h1 class="subject">\',\'</h1>\') # 最佳答案 BestAnswer = sect(format,\'(<h2><span>Best Answer</span>(.*?)</h2>(.*?)<div class="content">)\',\'(</div>)\') # 最佳答案不存在,则不采集 if BestAnswer != None : # 文章路径 path = en2path(sortname + \'-\' + title.strip()) # 问题 adddata = sect(format,\'<div class="additional-details">\',\'</div>\') content = sect(format,\'(<h1 class="subject">(.*?)<div class="content">)\',\'(</div>)\') if adddata != None : content += \'<br/>\' + adddata # 其他回答 OtherAnswer = \'\' for regs in re.findall(\'<div class="qa-container">(.+?)<div class="utils-container">\',format): if regs.find(\'<h2>\') == -1 and regs.find(\'</h2>\') == -1 : a1 = sect(regs,\'<div class="content">\',\'</div>\') a2 = sect(regs,\'<div class="reference">\',\'</div>\') OtherAnswer+= \'<div class="oAnswer">\' + a1 if a2 != None : OtherAnswer+= \'<div class="reference">\' + a2 + \'</div>\' OtherAnswer+= \'</div>\' # 判断采集成功 if title != None and content != None : # 将数据写入到数据 try : c.execute(\'INSERT INTO [article]([sortid],[title],[path],[question],[banswer],[oanswer])VALUES(?,?,?,?,?,?);\',(sortfoot,title,path,content,BestAnswer,OtherAnswer)) print "Message:%s.html" % (path,) R = 1 except sqlite.IntegrityError : pass # 提交写入数据库 conn.commit(); c.close() return R # 取得一条URL def getOneUrl(): global conn; c = conn.cursor() c.execute(\'SELECT [curl] FROM [collect] WHERE [state] IN(0,3) LIMIT 0,1;\') row = c.fetchone(); c.close() if row==None : return "" return row[0].encode(\'utf-8\') # 更新一条记录的状态 def updateOneUrl(url,state): global conn; c = conn.cursor() c.execute(\'UPDATE [collect] SET [state]=? WHERE [curl]=?;\',(state,url)) conn.commit(); c.close() # 清除html代码里的多余空格 def clearBlank(html): if len(html) == 0 : return \'\' html = re.sub(\'\r|\n|\t\',\'\',html) while html.find(" ")!=-1 or html.find(\' \')!=-1 : html = html.replace(\' \',\' \').replace(\' \',\' \') return html # 格式化url def formatURL(html,url): urls = re.findall(\'\'\'(<a[^>]*?href="([^"]+)"[^>]*?>)|(<a[^>]*?href=\'([^\']+)\'[^>]*?>)\'\'\',html,re.I) if urls == None : return html for regs in urls : html = html.replace(regs[0],matchURL(regs[0],url)) return html # 格式化单个url def matchURL(tag,url): urls = re.findall(\'\'\'(.*)(src|href)=(.+?)( |/>|>).*|(.*)url\(([^\)]+)\)\'\'\',tag,re.I) if urls == None : return tag else : if urls[0][5] == \'\' : urlQuote = urls[0][2] else: urlQuote = urls[0][5] if len(urlQuote) > 0 : cUrl = re.sub(\'\'\'[\'"]\'\'\',\'\',urlQuote) else : return tag urls = urlparse(url); scheme = urls[0]; if scheme!=\'\' : scheme+=\'://\' host = urls[1]; host = scheme + host if len(host)==0 : return tag path = osp.dirname(urls[2]); if path==\'/\' : path = \'\'; if cUrl.find("#")!=-1 : cUrl = cUrl[:cUrl.find("#")] # 判断类型 if re.search(\'\'\'^(http|https|ftp):(//|\\\\)(([\w/\\\+\-~`@:%])+\.)+([\w/\\\.\=\?\+\-~`@\':!%#]|(&)|&)+\'\'\',cUrl,re.I) != None : # http开头的url类型要跳过 return tag elif cUrl[:1] == \'/\' : # 绝对路径 cUrl = host + cUrl elif cUrl[:3]==\'../\' : # 相对路径 while cUrl[:3]==\'../\' : cUrl = cUrl[3:] if len(path) > 0 : path = osp.dirname(path) elif cUrl[:2]==\'./\' : cUrl = host + path + cUrl[1:] elif cUrl.lower()[:7]==\'mailto:\' or cUrl.lower()[:11]==\'javascript:\' : return tag else : cUrl = host + path + \'/\' + cUrl R = tag.replace(urlQuote,\'"\' + cUrl + \'"\') return R # html代码截取函数 def sect(html,start,end,cls=\'\'): if len(html)==0 : return ; # 正则表达式截取 if start[:1]==chr(40) and start[-1:]==chr(41) and end[:1]==chr(40) and end[-1:]==chr(41) : reHTML = re.search(start + \'(.*?)\' + end,html,re.I) if reHTML == None : return reHTML = reHTML.group() intStart = re.search(start,reHTML,re.I).end() intEnd = re.search(end,reHTML,re.I).start() R = reHTML[intStart:intEnd] # 字符串截取 else : # 取得开始字符串的位置 intStart = html.lower().find(start.lower()) # 如果搜索不到开始字符串,则直接返回空 if intStart == -1 : return # 取得结束字符串的位置 intEnd = html[intStart+len(start):].lower().find(end.lower()) # 如果搜索不到结束字符串,也返回为空 if intEnd == -1 : return # 开始和结束字符串都有了,可以开始截取了 R = html[intStart+len(start):intStart+intEnd+len(start)] # 清理内容 if cls != \'\' : R = clear(R,cls) # 返回截取的字符 return R # 正则清除 def clear(html,regexs): if regexs == \'\' : return html for regex in regexs.split(chr(10)): regex = regex.strip() if regex != \'\' : if regex[:1]==chr(40) and regex[-1:]==chr(41): html = re.sub(regex,\'\',html,re.I|re.S) else : html = html.replace(regex,\'\') return html # 格式化为路径 def en2path(enStr): return re.sub(\'[\W]+\',\'-\',en2chr(enStr),re.I|re.U).strip(\'-\') # 替换实体为正常字符 def en2chr(enStr): return enStr.replace(\'&\',\'&\') # ------------------------------------- 开始执行程序 ------------------------------------------- # 首先创建数据库 createDatabase() # 开始采集 loops = 0 while True: if loops>0 : url = getOneUrl() if url == "" : loops = 0 else : loops = collect(url) else : loops = collect() # 暂停 time.sleep(sleep) if loops==0 : break # 关闭HTTP连接 dl.close() # 退出程序 sys.exit()