原文:http://wiki.woodpecker.org.cn/moin/zspy
1. PycURL
Pycurl http://pycurl.sourceforge.net/
外部libcurl的接口,C写的,比urllib快,功能强.支持设置最大重定向次数,可防止陷入循环重定向(rewrite)陷阱. 用于做网络爬虫,抓网页.
从 http://pycurl.sourceforge.net/download/ 下载 pycurl-ssl-7.16.4.win32-py2.5.exe 安装.
参考文献1,测试代码
# Treat an in-memory buffer like a file. "from cStringIO import StringIO"
# also works and should perform somewhat better.
import StringIO

import pycurl

html = StringIO.StringIO()

c = pycurl.Curl()
try:
    c.setopt(pycurl.URL, 'http://www.baidu.com')

    # Write callback: every received chunk is passed to html.write.
    c.setopt(pycurl.WRITEFUNCTION, html.write)

    c.setopt(pycurl.FOLLOWLOCATION, 1)

    # Maximum number of redirects; guards against redirect traps.
    c.setopt(pycurl.MAXREDIRS, 5)

    # Run the transfer; blocks until it has finished.
    c.perform()

    # Prints the HTTP status code and the effective URL,
    # e.g. "200 http://www.baidu.com".
    print("%s %s" % (c.getinfo(pycurl.HTTP_CODE),
                     c.getinfo(pycurl.EFFECTIVE_URL)))
finally:
    # Release the handle even when the transfer raised.
    c.close()

# Uncomment to dump the HTML of the Baidu home page.
# print(html.getvalue())
然后看看多线程,http://pycurl.cvs.sourceforge.net/pycurl/pycurl/tests/ 有很多例子,还可做参考http://pycurl.sourceforge.net/doc/curlmultiobject.html
我自己改写了一个:)
1 #!/usr/bin/env python
2 #coding=utf-8
3
4 import threading
5 import pycurl
6 from cStringIO import StringIO
7
class UrlOpen(threading.Thread):
    """Download web pages asynchronously on a background thread.

    All transfers are driven by a single ``pycurl.CurlMulti`` object; each
    successfully finished transfer is handed to the callback registered for
    it in :meth:`add`.
    """

    def __init__(self):
        super(UrlOpen, self).__init__()
        self.opener = pycurl.CurlMulti()
        # Easy handles currently attached to ``self.opener``.
        self.handle_list = []

    def add(self, url, recall, writer=None):
        """Register ``url`` for download.

        Args:
            url: address to fetch.
            recall: callable invoked with the finished ``pycurl.Curl`` handle
                once the transfer succeeded.
            writer: object whose ``write`` method receives the response
                body. A fresh ``StringIO`` is created when omitted.
        """
        if writer is None:
            # Create the buffer per call: a ``StringIO()`` default argument
            # would be evaluated once and shared by every download.
            writer = StringIO()

        c = pycurl.Curl()

        # Extra attributes so the callback can reach them through the handle.
        c.url = url
        c.content = writer
        c.recall = recall
        c.setopt(c.URL, url)
        c.setopt(c.WRITEFUNCTION, c.content.write)

        self.handle_list.append(c)
        self.opener.add_handle(c)

    def _remove(self, c):
        """Detach handle ``c`` from the multi object, close it and forget it."""
        # Detach first, then close.
        self.opener.remove_handle(c)
        c.close()
        self.handle_list.remove(c)

    def run(self):
        """Drive the transfers and dispatch the completion callbacks.

        NOTE(review): the outer loop has no exit condition, so this thread
        (and a ``join()`` on it) never finishes on its own.
        """
        num_handle = len(self.handle_list)
        while 1:
            ret = self.opener.select(10.0)
            if ret == -1:
                continue
            while 1:
                num_handle_pre = num_handle
                ret, num_handle = self.opener.perform()
                # Only look for finished transfers when the number of
                # active connections changed.
                if num_handle != num_handle_pre:
                    result = self.opener.info_read()
                    print(result)
                    for i in result[1]:
                        # Success.
                        i.http_code = i.getinfo(i.HTTP_CODE)
                        self._remove(i)
                        i.recall(i)
                    for i in result[2]:
                        # Failure (should be logged). Each entry is a
                        # (handle, error number, message) tuple, so only the
                        # handle itself is passed on.
                        self._remove(i[0])

                if ret != pycurl.E_CALL_MULTI_PERFORM:
                    break
61
# Shared downloader thread; created by the first urlopen() call.
_opener = None


def urlopen(*arg, **key):
    """Queue a download on the shared ``UrlOpen`` thread.

    All arguments are forwarded unchanged to ``UrlOpen.add``. The first call
    creates the thread, registers the request and then starts the thread;
    later calls only register their request.
    """
    global _opener
    if _opener is not None:
        _opener.add(*arg, **key)
        return
    _opener = UrlOpen()
    _opener.add(*arg, **key)
    _opener.start()
71
def show(x):
    """Print the downloaded body stored on the finished handle ``x``."""
    body = x.content.getvalue()
    print(body)
if __name__ == "__main__":
    # Demo: fetch one page, print it through show(), then wait on the
    # downloader thread.
    start_url = "http://www.baidu.com/"
    urlopen(start_url, show)
    _opener.join()
又封装了一个异步打开网页的类和函数
#coding=utf-8
import threading
from cStringIO import StringIO
import pycurl
"""
Asyn open url
Author:zsp007@gmail.com
2008-1-25 17:14
"""
from Queue import Queue,Empty
class UrlOpen(threading.Thread):
    """Download web pages asynchronously on a background thread.

    :meth:`add` only configures a handle and puts it on a queue; the worker
    in :meth:`run` attaches queued handles to its ``pycurl.CurlMulti``
    object and dispatches the per-request callbacks.
    """

    def __init__(self):
        super(UrlOpen, self).__init__()
        self.opener = pycurl.CurlMulti()
        # Handles currently attached to ``self.opener``.
        self.handle_list = []
        # Configured handles waiting to be attached by run() (capacity 2048).
        self.waiting = Queue(2048)

    def add(self, url, recall, catch=None, writer=StringIO):
        """Queue ``url`` for download.

        Args:
            url: address to fetch; ``unicode`` values are sent UTF-8 encoded.
            recall: callable invoked with the finished ``pycurl.Curl`` handle
                after a successful transfer.
            catch: callable invoked as ``catch(curl, error_no, desp)`` when
                the transfer failed. The default prints the URL and error.
            writer: zero-argument factory for the object that collects the
                response body through its ``write`` method.
        """
        if catch is None:
            def catch(curl, error_no, desp):
                print("Url:%s\nError:%s - %s" % (curl.url, error_no, desp))

        c = pycurl.Curl()
        # Extra attributes so the callbacks can reach them through the handle.
        c.url = url
        c.content = writer()
        c.recall = recall
        c.catch = catch
        # isinstance (rather than an exact type match) also covers
        # subclasses of unicode.
        c.setopt(c.URL, url.encode('utf-8') if isinstance(url, unicode) else url)
        c.setopt(c.WRITEFUNCTION, c.content.write)
        c.setopt(pycurl.CONNECTTIMEOUT, 30)
        c.setopt(pycurl.MAXREDIRS, 3)
        c.setopt(pycurl.TIMEOUT, 300)
        c.setopt(pycurl.FOLLOWLOCATION, 1)
        self.waiting.put(c)

    def _add(self, c):
        """Attach handle ``c`` to the multi object and track it."""
        self.handle_list.append(c)
        self.opener.add_handle(c)

    def _pull(self):
        """Attach every handle currently waiting in the queue, without blocking."""
        while True:
            try:
                self._add(self.waiting.get_nowait())
            except Empty:
                break

    def _remove(self, c):
        """Detach handle ``c`` from the multi object, close it and forget it."""
        # Detach first, then close.
        self.opener.remove_handle(c)
        c.close()
        self.handle_list.remove(c)

    def run(self):
        """Drive the transfers and dispatch success/failure callbacks.

        NOTE(review): the outer loop has no exit condition; when nothing is
        pending it blocks on the queue, so ``join()`` on this thread does
        not return on its own.
        NOTE(review): the listing this was restored from had lost its
        indentation; the nesting of the two ``_pull()`` calls below is a
        reconstruction - confirm against the original.
        """
        num_handle = 0
        while 1:
            if self.handle_list:
                ret = self.opener.select(1.0)
                if ret >= 0:
                    while 1:
                        num_handle_pre = num_handle
                        ret, num_handle = self.opener.perform()
                        # Only look for finished transfers when the number
                        # of active connections changed.
                        if num_handle != num_handle_pre:
                            result = self.opener.info_read()
                            for i in result[1]:
                                # Success.
                                i.http_code = i.getinfo(i.HTTP_CODE)
                                i.recall(i)
                                self._remove(i)
                            for i in result[2]:
                                # Failure: report it through the failure
                                # callback. i is a tuple such as
                                # (<pycurl.Curl object at 0x00C04C80>, 6,
                                #  'Could not resolve host: www.msn.com
                                #  (Domain name not found)')
                                i[0].catch(*i)
                                self._remove(i[0])
                        if ret != pycurl.E_CALL_MULTI_PERFORM:
                            break
                        self._pull()
                self._pull()
            else:
                # Nothing in flight: block until a new request arrives.
                self._add(self.waiting.get())
# Shared downloader thread; created and started by the first urlopen() call.
_opener = None


def urlopen(*arg, **key):
    """Queue a download on the shared ``UrlOpen`` thread.

    The thread is created and started on first use; all arguments are
    forwarded unchanged to ``UrlOpen.add``.
    """
    global _opener
    downloader = _opener
    if downloader is None:
        downloader = UrlOpen()
        _opener = downloader
        downloader.start()
    downloader.add(*arg, **key)
import time
if __name__ == "__main__":
link = [\'http://www.baidu.com/\', \'http://www.sina.com.cn\',
\'http://www.qq.com\', \'http://www.sohu.com\', \'http://www.163.com/\',
\'http://www.ifeng.com/\', \'http://www.cctv.com/default.shtml\',
\'http://www.xinhuanet.com/\', \'http://www.people.com.cn/\',
\'http://cn.msn.com/\', \'http://www.google.cn/\', \'http://cn.yahoo.com/\',
\'http://www.amazon.cn/?source=2009hao123famousdaohang\',
\'http://www.chinamobile.com/\', \'http://www.pconline.com.cn/\',
\'http://www.chinahr.com/\', \'http://www.gov.cn/\',
\'http://www.zhcw.com/\', \'http://www.autohome.com.cn/\',
\'http://www.zhaopin.com/Market/hao123.jsp\',
\'http://fund.eastmoney.com/\', \'http://www.eastmoney.com/\',
\'http://www.xiaonei.com/\', \'http://www.soufun.com/\',
\'http://www.51.com/\', \'http://www.rayli.com.cn/\',
\'http://youa.baidu.com/\', \'http://www.360.cn/\',
\'http://www.ctrip.com/\', \'http://www.xcar.com.cn/\',
\'http://www.qq163.com\', \'http://www.samsung.com/\',
\'http://www.zol.com.cn/\', \'http://www.taobao.com/\',
\'http://www.icbc.com.cn/\', \'http://www.sto.cn\',
\'http://www.dianping.com\', \'http://www.gougou.com\',
\'http://www.ct10000.com\', \'http://www.anjuke.com/?&pi=H-1\',
\'http://www.360buy.com/union/union_default.asp?union_Id=75\',
\'http://tl.sohu.com/?rcc_id=061f93406c7a77d6a6e4c8647b09fb56\',
\'http://www.51job.com/default.php?code=gb2312\',
\'http://central.dangdang.com/league/leagueref.asp?from=P-227107a&backurl=http://a.oadz.com/link/C/51/52648/ZzZIg.TXwwIV69FJbh3yJe4H7WI_/a/898?home.dangdang.com\',
\'http://www.jiayuan.com/st/?id=3237&url=http://www.jiayuan.com\']
link +=[\'http://www.qidian.com/\', \'http://www.readnovel.com/\',
\'http://www.hongxiu.com/\', \'http://www.bookge.com/\',
\'http://www.jjwxc.net/\', \'http://hjsm.tom.com/\',
\'http://www.4yt.net/\', \'http://www.cuiweiju.com/\',
\'http://book.sina.com.cn/\', \'http://www.xxsy.net/\',
\'http://www.wansong.net/\', \'http://www.myfreshnet.com/\',
\'http://www.fmx.cn/\', \'http://www.xs8.cn/\',
\'http://www.rongshuxia.com/\', \'http://www.booksky.org/\',
\'http://www.zhulang.com/\', \'http://www.3320.net/\',
\'http://www.17k.com/\', \'http://www.xhsd.net/\',
\'http://www.qukanshu.com/\', \'http://www.fbook.net/\',
\'http://www.duyidu.com/\', \'http://www.soso999.com/\',
\'http://www.junzitang.com/\', \'http://msn.hongxiu.com/\',
\'http://www.yuanwen.com/\', \'http://top.baidu.com/book.html\',
\'http://www.lcread.com/\', \'http://www.sodu.com.cn/\',
\'http://www.cc222.com/\', \'http://www.feiku.com/\',
\'http://book.hqdoor.com/\', \'http://book.sooyuu.com/\',
\'http://www.52eshu.com/\', \'http://bbs.91txt.com/\',
\'http://book.qq.com/\', \'http://book.sohu.com/\',
\'http://www.baidu.com/search/guoxue/dir/fenlei.html\',
\'http://wind.yinsha.com/\', \'http://www.duzhe.com/\',
\'http://www.storychina.cn/\', \'http://www.shigeku.org/\',
\'http://www.goodmood.cn/\', \'http://www.nlc.gov.cn/\',
\'http://www.qnwz.cn/\', \'http://wenxue.xilu.com/\']
link +=[\'http://www.ganji.com/\', \'http://www.58.com/\',
\'http://www.baixing.com/\', \'http://www.263.com/\',
\'http://www.kuxun.cn/\', \'http://www.mangocity.com/\',
\'http://www.qunar.com/\', \'http://www.dianping.com/\',
\'http://www.fantong.com/\', \'http://www.55bbs.com/\',
\'http://www.19lou.com/\', \'http://www.koubei.com/\',
\'http://www.nike.com.cn/\', \'http://www.li-ning.com.cn/\',
\'http://www.bosideng.com/\', \'http://www.pirateship.com.cn/\',
\'http://www.goelia.com.cn/\', \'http://www.adidas.com/\',
\'http://www.converse.com.cn/\', \'http://www.romon.com/index.php\',
\'http://www.youngor.com/\', \'http://www.etam.com.cn\',
\'http://www.heilanhome.com/\', \'http://www.mizuno.com.cn/\',
\'http://www.goldlion-china.com/\', \'http://www.phland.com.cn/\',
\'http://www.betu.com.hk/\', \'http://www.puma.com.cn/\',
\'http://www.anta.com/\', \'http://www.pierrecardin.com.cn/\',
\'http://www.bobdog.com.cn/\', \'http://www.idaphne.com/\',
\'http://www.e-giordano.com/\', \'http://www.361sport.com/\',
\'http://www.levi.com.cn/\', \'http://www.lee.com.cn/\',
\'http://www.shanshan.com/\', \'http://www.semir.com\',
\'http://www.versace.com/flash.html\', \'http://www.k-boxing.com/\',
\'http://only.nzn.cn/\', \'http://www.pb89.com/%20\',
\'http://www.aimer.com.cn/\', \'http://www.balenciaga.com\',
\'http://www.ordifen.com.cn/\', \'http://www.ochirly.com/\',
\'http://www.uggaustralia.com/\', \'http://www.jshyx.com/\',
\'http://www.givenchy.com/default.php\',
\'http://www.thenorthface.com.cn/\', \'http://www.tissot.com.hk/\',
\'http://www.azona.com.hk/\', \'http://www.3suisses.com.cn/\',
\'http://www.valentino.it/\', \'http://www.yishion.com.cn/\',
\'http://www.chowtaiseng.com/\', \'http://www.tsljewellery.com/\',
\'http://www.jeanswest.com/\', \'http://www.baoxiniao.com.cn/\',
\'http://www.qsyr.com/%20\', \'http://www.septwolves.com/\',
\'http://www.baleno.com.hk/\', \'http://www.belle.com.cn/\',
\'http://www.teenmix.com.cn/\', \'http://www.fairwhale.com.cn/\',
\'http://www.swatch.com.cn/\', \'http://www.staccato.com/\',
\'http://www.daphne.com.cn/\', \'http://www.c-banner.com/\',
\'http://www.xtep.com.cn/\', \'http://www1.jeanswest.com.cn/\',
\'http://www.kappa.com.cn/\', \'http://www.laofengxiang.com/\',
\'http://www.cnhqt.com/\', \'http://www.tatashoes.com.cn/\',
\'http://www.robinhood.com.cn/\', \'http://www.doublestar.com.cn/\',
\'http://www.ozarkgear.com.cn/\', \'http://www.aokang.com.cn/\',
\'http://www.ctf.com.cn/\', \'http://www.crpttan.com/\',
\'http://www.calvinklein.com/\', \'http://www.citizen.com.cn/\',
\'http://www.longines.com/\', \'http://www.jackjonescn.net/\',
\'http://www.famoustone.com/\', \'http://www.kfc.com.cn/\',
\'http://www.bjyoshinoya.com.cn/\', \'http://www.starbucks.cn/\',
\'http://www.icoke.cn/\', \'http://www.mengniu.com.cn/\',
\'http://www.mcdonalds.com.cn/\', \'http://www.yonghe.com.cn/\',
\'http://www.ubccn.com/\', \'http://www.dicos.com.cn/\',
\'http://www.yili.com/\', \'http://www.pizzahut.com.cn/\',
\'http://www.quanjude.com.cn/direct.php\', \'http://www.nescafe.com.cn/\',
\'http://www.masterkong.com.cn/\', \'http://www.heinz.com.cn/\',
\'http://www.origus.com/\', \'http://www.xfy.com.cn/\',
\'http://www.haagendazs.com.cn/\', \'http://www.wyeth.com.cn/\',
\'http://www.moutaichina.com/index.asp\', \'http://www.tsingtao.com.cn/\',
\'http://www.meadjohnson.com.cn/\', \'http://www.dumex.com.cn/\',
\'http://www.wuliangye.com.cn/\', \'http://www.zkungfu.com/\',
\'http://www.dovechocolate.com.cn/\',
\'http://www.ganso.com.cn/%20%20%20\', \'http://www.beingmate.com/\',
\'http://www.waffleboy.com.cn/\', \'http://www.holiland.com.cn/\',
\'http://www.goldenjaguar.com/\', \'http://www.huiyuan.com.cn/%20%20%20\',
\'http://www.hsufuchifoods.com/%20%20%20%20\',
\'http://www.maybellinechina.com/\', \'http://www.dabao.com/\',
\'http://www.lorealchina.com/\', \'http://www.shiseidochina.com/\',
\'http://www.esteelauder.com.cn/\',
\'http://www.avon.com.cn/PRSuite/home/home.jsp\',
\'http://www.tjoy.biz/\',
\'http://www.lancome.com.cn/_zh/_cn/index.aspx\',
\'http://www.kose.co.jp/\', \'http://www.h2oplus.com.hk/\',
\'http://www.yuesai.com.cn/\', \'http://www.nivea.com.cn/\',
\'http://www.chanel.com/\',
\'http://www.clinique.com.cn/index.tmpl?ngextredir=1\',
\'http://www.ponds.com.cn/\', \'http://www.vichy.com.cn/\',
\'http://www.efu.org.cn/\',
\'http://www.laneigechina.com/Front-Page/index2.jsp\',
\'http://www.olay.com.cn/\', \'http://www.guerlain.com.cn/\',
\'http://www.aupres-shiseido.com.cn/\',
\'http://www.dior.com/pcd/International/JSP/Home/prehomeFlash.jsp\',
\'http://www.herborist.com.cn/\', \'http://www.dhc.net.cn/\',
\'http://www.ysl.com/\', \'http://www.kose.com.cn/\',
\'http://www.liangfei.com/\', \'http://www.tayoi.com/\',
\'http://www.chcedo.com/\', \'http://www.head-shoulders.com.cn/\',
\'http://www.slek.com.cn/\', \'http://www.mentholatum.com.cn/\',
\'http://www.pg.com.cn/\', \'http://www.china-ajjj.com/\',
\'http://www.rejoice.com.cn/\', \'http://www.cnnice.com/\',
\'http://www.watsons.com.cn/\', \'http://www.unilever.com.cn/\',
\'http://www.ikea.com/cn/zh/\', \'http://www.pantene.com.cn/\',
\'http://www.colgate.com.cn/app/Colgate/CN/HomePage.cvsp\',
\'http://www.auchan.com.cn/\', \'http://www.c-bons.com.cn/\',
\'http://www.carrefour.com.cn/\', \'http://www.vs.com.cn/\',
\'http://www.crest.com.cn/\', \'http://www.tongrentang.com/\',
\'http://www.amway.com.cn/index.aspx\', \'http://www.wal-martchina.com/\',
\'http://www.tupperware.com.cn/\', \'http://www.ourlotus.com/\',
\'http://www.skyworth.com/cn/\', \'http://www.sony.com.cn/\',
\'http://www.siemens.com.cn/\', \'http://www.gree.com.cn/\',
\'http://www.shinco.com/\', \'http://www.midea.com.cn/\',
\'http://www.samsung.com.cn/\', \'http://www.hitachi-shha.com.cn/\',
\'http://www.electrolux.com.cn/\', \'http://www.toshiba.com.cn/\',
\'http://www.panasonic.com.cn/\', \'http://www.canon.com.cn/\',
\'http://www.tcl.com/\', \'http://www.lg.com.cn/\',
\'http://cn.changhong.com/\', \'http://www.haier.com/\',
\'http://www.philips.com.cn/\', \'http://www.konka.com/\',
\'http://www.rsd.com.cn/\', \'http://www.supor.com.cn/\',
\'http://www.fotile.com/\', \'http://www.cnsuning.com/\',
\'http://www.sharp.cn/\', \'http://www.galanz.com.cn/\',
\'http://www.chinamacro.cn/\', \'http://www.robam.com/\',
\'http://www.gome.com.cn/\', \'http://www.joyoung.com.cn/\',
\'http://www.staccato.com/\', \'http://www.meiling.com/\',
\'http://www.fushibao.com/\', \'http://www.sacon.cn/\',
\'http://www.yongle.com.cn/\', \'http://www.xinfei.com/\']
# Crawl bookkeeping used by the show() callback and the driver code.
dumped = set()       # URLs that have already been scheduled
number = 0           # count of pages reported so far
begin = time.time()  # start time for the "cost time" output
def show(c):
    """Report a finished download and queue up to ten of its outgoing links.

    Prints a running counter, the elapsed time, the body length and the URL
    of the finished handle ``c``. It then scans the body backwards for
    ``<a href="http://`` anchors and schedules each new link with
    ``urlopen``, using this function as the callback again. Scanning stops
    at the first link that is already in ``dumped`` or after ten links.
    """
    global number
    number += 1
    content = c.content.getvalue()
    print("%s . cost time %s htmllen %s"
          % (number, time.time() - begin, len(content)))
    print(c.url)

    anchor = '<a href="http://'
    begin_pos = None
    count = 1
    while True:
        # Search backwards; each round only looks before the previous hit.
        pos = content.rfind(anchor, None, begin_pos)
        if pos == -1:
            break
        begin_pos = pos
        # The URL starts right after '<a href="' (9 characters).
        end = content.find('"', pos + 13)
        if end == -1:
            # Unterminated attribute: slicing up to -1 would take nearly the
            # rest of the page as the URL, so skip this anchor.
            continue
        url = content[pos + 9:end]
        if url in dumped:
            return
        dumped.add(url)
        print("\tadd %s" % url)
        urlopen(url, show)
        count += 1
        if count > 10:
            break
# Drop duplicate seed URLs.
link = set(link)
print("total link %s" % len(link))
# Mark every seed as seen before any download is started. The show()
# callback runs on the downloader thread and could otherwise re-queue a
# seed that has not been marked yet.
dumped.update(link)
for i in link:
    urlopen(i, show)
# NOTE(review): UrlOpen.run() loops without an exit condition, so this
# join() is not expected to return on its own - confirm.
_opener.join()
print("cost time %s" % (time.time() - begin))
1.1. 相关文献
-
PycURL简单学习 http://blog.donews.com/limodou/archive/2005/11/28/641257.aspx
-
python中的pycurl模块学习 https://forum.eviloctal.com/read.php?tid=27337