//生命太短 我用Python!
//Python真是让一直用c++的村里孩子长知识了!
这个仅仅是一个测试,成功抓取了某网站1000多张图片。
下一步要做一个大新闻 大工程
# -*- coding: utf-8 -*-
# (PEP 263: the declaration above only takes effect when it sits on the first
# or second line of the file.)
#
# Test scraper: walks index/detail pages, picks .jpg/.jpeg <img> tags out of
# the HTML with regular expressions and saves each page's images into a
# directory named after the page title.  Runs on both Python 2 and Python 3.

import contextlib
import os
import re

try:  # Python 2
    from urllib import urlretrieve
    from urllib2 import Request, urlopen
except ImportError:  # Python 3
    from urllib.request import Request, urlopen, urlretrieve

# Running total of images downloaded by getImg across all pages.
CNT = 0

# Seconds to wait for a server response before giving up.
REQUEST_TIMEOUT = 5

# Browser-like headers actually sent by getHtml.
REQUEST_HEADERS = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 6.1; rv:28.0) '
                   'Gecko/20100101 Firefox/28.0'),
}

# Fuller alternative header set (adapted from a pythontab.com snippet); it is
# not sent by default.
# NOTE(review): getHtml does not decompress responses, so drop
# 'Accept-Encoding' if you switch to these headers - confirm against the site.
ALT_REQUEST_HEADERS = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 '
                   '(KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'),
    'Accept': 'text/html;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'gzip',
    'Connection': 'close',
    # If pages still cannot be fetched, add a 'Referer' entry set to the
    # target site's host.  (The original stored None here, which is not a
    # usable header value.)
}

# The site addresses appear as '---' in the original (presumably redacted by
# the author); fill them in before running.
PAGE_URL_TEMPLATE = '---%s.html'  # numbered detail page, used by getAll
SITE_PREFIX = '---'               # prepended to links found on an index page
# The original driver formatted a bare '---' (no placeholder), which raises
# TypeError.  NOTE(review): confirm where the page number belongs in the URL.
INDEX_URL_TEMPLATE = '---%s'

# Compiled once at import time instead of on every call.  Character classes
# that exclude the closing quote replace the original greedy '.+', which
# swallowed several tags when they shared one line.
_LINK_RE = re.compile(r'<a href="([^"]+)" target=')
_NEXT_RE = re.compile(r"<a href='[^']+pai/([^']+?)\.html'")
_TITLE_RE = re.compile(r'<title>(.+?)</title>')
_IMAGE_RES = (
    re.compile(r'<img src="([^"]{0,80}\.jpg)" border="0"'),
    re.compile(r'src="([^"]{0,80}\.jpeg)" border'),
    re.compile(r"<img src='([^']{0,80}\.jpg)'"),
    re.compile(r"<img src='([^']{0,80}\.jpeg)'"),  # original had typo '.jepg'
)

# Characters that are path separators or illegal in Windows file names.
_UNSAFE_NAME_CHARS = re.compile(r'[\\/:*?"<>|]')


def _toText(value):
    """Return *value* as text, decoding byte strings as UTF-8.

    Undecodable bytes are replaced rather than raising, so one stray byte in
    a page does not abort the crawl.
    """
    if isinstance(value, bytes):
        return value.decode('utf-8', 'replace')
    return value


def _safeDirName(name):
    """Replace characters that cannot appear in a directory name with '_'."""
    return _UNSAFE_NAME_CHARS.sub('_', name)


def _processPage(html):
    """Save the images of one fetched page and return a status message.

    Returns 'Error' when *html* is False (the fetch failed), otherwise the
    message produced by getImg.
    """
    if html is False:
        return 'Error'
    return getImg(getName(html), html)


def getHtml(url):
    """Fetch *url* with browser-like headers and return the raw body.

    Returns the response body exactly as read from the socket, or False when
    the request fails with an I/O error (URLError, HTTPError and socket
    timeouts all derive from IOError, which is OSError on Python 3).  A
    malformed URL still raises ValueError.
    """
    request = Request(url, None, REQUEST_HEADERS)
    try:
        # closing() guarantees the connection is released on both Pythons.
        with contextlib.closing(
                urlopen(request, None, REQUEST_TIMEOUT)) as resp:
            return resp.read()
    except IOError as err:
        print('Http Error %s' % err)
        return False


def getAllUrl(html):
    """Return every href of the form <a href="..." target=...> in *html*."""
    return _LINK_RE.findall(_toText(html))


def getNext(html):
    """Return the page id captured from the first <a href='...pai/ID.html'>.

    Raises IndexError when *html* contains no such link.
    """
    return _NEXT_RE.findall(_toText(html))[0]


def getName(html):
    """Return the text of the first <title> element in *html*.

    Raises IndexError when *html* has no single-line <title>...</title>.
    """
    return _TITLE_RE.findall(_toText(html))[0]


def getImg(name, html):
    """Download every .jpg/.jpeg image referenced in *html*.

    Images go into './<name>-[<count>p]' (count = number of image links
    found) as 0.jpg, 1.jpg, ...  If that directory already exists nothing is
    downloaded.  An image that cannot be fetched is reported and skipped
    instead of aborting the page.  The global CNT grows by one per saved
    image.

    Returns a text status message.
    """
    global CNT

    page = _toText(html)
    imglist = []
    for pattern in _IMAGE_RES:
        imglist.extend(pattern.findall(page))

    title = _toText(name)
    # os.path.join keeps the original '.\name' layout on Windows and also
    # works on POSIX, where a literal backslash is not a separator.
    local = os.path.join(
        os.curdir, u'%s-[%sp]' % (_safeDirName(title), len(imglist)))
    if os.path.exists(local):
        return local + u'was existed'
    os.mkdir(local)

    saved = 0
    for imgurl in imglist:
        print(imgurl)
        try:
            urlretrieve(imgurl, os.path.join(local, u'%s.jpg' % saved))
        except (IOError, ValueError) as err:
            # IOError: network failure; ValueError: not an absolute URL.
            print('skip %s: %s' % (imgurl, err))
            continue
        saved += 1
        CNT += 1

    return u'%s: get %s pthoto(s)' % (title, saved)


def getAll(num, start=164680):
    """Crawl *num* numbered detail pages, counting down from *start*.

    Pages that cannot be fetched are reported as 'Error' and skipped.
    Returns a summary with the global image total CNT.
    """
    for nxt in range(start, start - num, -1):
        print(nxt)
        print(_processPage(getHtml(PAGE_URL_TEMPLATE % nxt)))
    return 'done! %s photos!' % str(CNT)


def getAll_update(index):
    """Crawl every page linked from the index page at URL *index*.

    Returns a summary with the number of images saved by this call.  An
    index page that cannot be fetched is treated as having no links.
    """
    before = CNT
    indexHtml = getHtml(index)
    urls = getAllUrl(indexHtml) if indexHtml is not False else []
    for url in urls:
        print(_processPage(getHtml(SITE_PREFIX + url)))
    return 'done! %s photos!' % str(CNT - before)


# Other entry points the author experimented with:
#   print(getAll(10))
#   print(getNext(getHtml('---')))
#   print(getAll_update('---'))
if __name__ == '__main__':
    # Guarded so that importing this module does not start a crawl.
    for page in range(3, 50):
        print(getAll_update(INDEX_URL_TEMPLATE % page))
要点一:通过设置请求头(header,主要是 User-Agent)把请求伪装成浏览器,避免被网站拒绝。
要点二:正则表达式,参考 http://www.cnblogs.com/huxi/archive/2010/07/04/1771073.html (我也是刚刚学)
基本都是一路百度写出来的