# Source: http://www.cnblogs.com/Dadio/p/5513594.html
# 这个是爬P站的代码 — a Pixiv (spotlight) crawler script copied from the blog post above; not yet reviewed by the original note's author.
"""Download every illustration in a pixiv "Spotlight" article (via the
spotlight.pics mirror).

Usage: python script.py <spotlight-number> <save-root>

Reconstructed from a mangled one-line paste: the original text had the whole
program collapsed onto a single line with escaped quotes and was not valid
Python.  Behavioural fixes: HTTP success is now tested with
``response.status_code`` instead of comparing ``str(response)`` against the
literal ``'<Response [200]>'``, the one ``requests.get`` call that lacked a
timeout now uses ``timeout=60`` like the others, and the intro file is written
with a ``with`` block so the handle is always closed.

NOTE(review): Windows-specific throughout (``os.system("cls")``, backslash
paths) — kept as in the original.
"""
from bs4 import BeautifulSoup
import requests
from PIL import Image
from io import BytesIO
import os
import codecs
import sys

headers = {
    'Accept': 'text/html',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Referer': "",
    # Placeholder kept from the original: fill in your own browser's UA string.
    'User-Agent': "此处为浏览器的user-agent",
}
order = 1  # 1-based index of the illustration currently being processed


def _save_name(url):
    """Return the local path an image URL is saved under.

    The file name is the last 15 characters of the slash-stripped URL,
    appended to the global ``file_path`` (the article's folder).
    """
    return file_path + (url.replace('/', ""))[-15:]


def _download_pages(url, ext, fmt):
    """Download pages 2..N of a multi-page illustration.

    url -- URL of page 0 (…p0.<ext>); rewritten page by page in the loop
    ext -- URL extension, 'png' or 'jpg' (used to bump the page number)
    fmt -- PIL save format, 'png' or 'jpeg'

    Stops at the first non-200 response, i.e. one past the last page.
    150 is an arbitrary upper bound inherited from the original code.
    """
    for i in range(150):
        url = url.replace("p%d.%s" % (i, ext), "p%d.%s" % (i + 1, ext))
        os.system("cls")  # clear console (Windows only)
        print("********正在下载第%d张..." % (i + 2))
        if os.path.exists(_save_name(url)):
            continue  # already on disk; keep probing the next page
        data = requests.get(url, headers=headers, timeout=60)
        if data.status_code != 200:
            break  # ran past the last page of this work
        Image.open(BytesIO(data.content)).save(_save_name(url), fmt)
        print("********已下载第%d张" % (i + 2))


def getpic(src, href, mode=""):
    """Download one illustration (all of its pages when mode == 'mul').

    src  -- thumbnail URL taken from the article page
    href -- link to the artwork page, sent as the Referer header
    mode -- "mul" for a multi-page artwork, "" (default) for a single image

    Reads module globals ``number_of_file``, ``order``, ``file_path`` and
    mutates the shared ``headers`` dict (Referer).  GIFs are skipped.
    """
    os.system("cls")  # clear console (Windows only)
    print("共有%d个文件需要下载" % number_of_file)
    if src[-3:] == "gif":
        # Original comment: 使用gif来保存静态图片的都是邪教
        # ("saving still images as gif is heresy") — skip gifs entirely.
        return
    headers['Referer'] = href  # pixiv image servers require a Referer
    ispng = False
    # Turn the thumbnail URL into the original-resolution URL.
    url = src.replace("_master1200", "")
    url = url.replace(url[20:40], "img-original")
    print('正在下载第%d个...' % order)
    if mode == 'mul':
        print("该文件含有多张图:")
    if os.path.exists(_save_name(url)):
        print('已下载第%d个' % order)
        return
    # Probe as jpg first; a non-200 response means the original is a png.
    data = requests.get(url, headers=headers, timeout=60)
    if data.status_code != 200:
        ispng = True
        url = url.replace("jpg", "png")
    if mode == 'mul':
        ext, fmt = ('png', 'png') if ispng else ('jpg', 'jpeg')
        print("********正在下载第1张")
        if not os.path.exists(_save_name(url)):
            # NOTE(review): as in the original, page 1 is saved without a
            # status-code check; a bad response makes Image.open raise.
            data = requests.get(url, headers=headers, timeout=60)
            Image.open(BytesIO(data.content)).save(_save_name(url), fmt)
            print("********已下载第1张")
        _download_pages(url, ext, fmt)
    else:
        if ispng:
            if os.path.exists(_save_name(url)):
                print('已下载第%d个' % order)
                return
            data = requests.get(url, headers=headers, timeout=60)
            if data.status_code == 200:
                Image.open(BytesIO(data.content)).save(_save_name(url), 'png')
                print('已下载第%d个' % order)
        else:
            # The jpg probe above already succeeded; save its payload.
            Image.open(BytesIO(data.content)).save(_save_name(url), 'jpeg')
            print('已下载第%d个' % order)


number = sys.argv[1]  # spotlight article number
file_path = sys.argv[2] + '\\Picture\\'  # change this to alter the save root
url_save = "http://spotlight.pics/zh/a/%s" % number
wb = requests.get(url_save, headers=headers, timeout=60)
wb_data = BeautifulSoup(wb.text, 'lxml')
# Strip every character that is illegal in a Windows directory name
# (per Windows naming rules), plus newlines and spaces.
title = wb_data.h2.string
for ch in ('\n', ':', '?', '"', ' ', '<', '>', '|', '*', '/', '\\'):
    title = title.replace(ch, "")
file_path = file_path + title + "\\"
# First run for this article: create its folder and save the intro text.
if not os.path.exists(file_path):
    introduce = str(wb_data.h2.next_sibling.next_sibling.next_element)
    os.mkdir(file_path)
    with codecs.open(file_path + "介绍.txt", "w", "utf-8") as f:
        f.write("特辑号:%s\n" % number + introduce)
divs = wb_data.body.select('div[class="illust-wrap"]')
number_of_file = len(divs)
# From here on we request images, not HTML.
headers['Accept'] = 'image/webp,image/*,*/*;q=0.8'
for div in divs:
    parent_class = str(div.a.parent['class'])
    if parent_class != "['ugoira-player', 'ui-scroll-view']":
        if parent_class == "['illust-multi-page-wrap']":
            getpic(div.img['src'], div.a['href'], "mul")
        else:
            getpic(div.img['src'], div.a['href'])
    # TODO: decide how to handle ugoira (animated) works
    order += 1