# coding=utf-8
"""Scrape Python job listings from 51job.com and append them to 51job.txt.

Ported to Python 3: the ``reload(sys)`` / ``sys.setdefaultencoding('utf-8')``
hack and the ``urllib2`` module are Python-2-only; ``urllib.request`` replaces
them and text is handled as unicode ``str`` throughout.
"""
__author__ = "carry"

import re
import urllib.request


def get_content(page):
    """Fetch the HTML source of one 51job search-result page.

    :param page: 1-based page number of the search results.
    :return: page source as a unicode str (decoded from GBK).
    """
    headers = {
        # 'Host': 'search.51job.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0',
        # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        # 'Connection': 'keep-alive'
    }
    url = ('http://search.51job.com/list/000000,000000,0000,00,9,99,python,2,'
           + str(page) + '.html')
    req = urllib.request.Request(url, headers=headers)
    # Context manager closes the connection deterministically (the original
    # never closed the response object).
    with urllib.request.urlopen(req) as resp:
        response = resp.read()
    # 51job serves GBK-encoded pages; decode once at the I/O boundary.
    return response.decode('gbk')


def get(html):
    """Extract job rows from a result page.

    :param html: page source as produced by ``get_content``.
    :return: list of 5-tuples of str:
        (title, company, city, salary, publish_date).
    """
    # re.S lets '.' match newlines, since each row spans several HTML lines.
    reg = re.compile(
        r'class="t1 ">.*? <a target="_blank" title="(.*?)".*? <span class="t2">'
        r'<a target="_blank" title="(.*?)".*?<span class="t3">(.*?)</span>'
        r'.*?<span class="t4">(.*?)</span>.*? <span class="t5">(.*?)</span>',
        re.S)
    return reg.findall(html)


def main():
    """Crawl result pages 1-10 and append the parsed rows to 51job.txt."""
    # Open the output file once per run (the original re-opened it for every
    # record); mode 'a' keeps the original append-across-runs behaviour.
    with open('51job.txt', 'a', encoding='utf-8') as f:
        for page in range(1, 11):
            print(u"正在爬取第" + str(page) + "页数据...")
            html = get_content(page)
            for item in get(html):
                f.write('\t'.join(item) + '\n')
                # Bug fix: the separator previously lacked a trailing newline,
                # so the next record started on the same line as the dashes.
                f.write("-----------------------------------------------------\n")


if __name__ == "__main__":
    main()