import urllib
import urllib2
import re
import time
 
# Directory prefix (with trailing slash): the input text file is opened from
# here and the downloaded images are written here.
path="G:/123/"
# Name of the text file under `path` that is scanned for image addresses.
path_file="1.txt"
 
def geturllist(text):
    """Return every ``https:`` ... ``hd.jpg`` image address found in *text*.

    Args:
        text: Contents to scan. UTF-8 encoded bytes are decoded first; text
            that is already decoded is used as-is.

    Returns:
        A list of the matched address strings in order of appearance.
        Duplicates are kept. Matching is greedy within a whitespace-free
        run, so a match extends to the last ``hd.jpg`` in that run.
    """
    # Decode byte strings only: calling .decode() on already-decoded text
    # fails on Python 3 and triggers an implicit ASCII encode on Python 2.
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    # \S (instead of "anything but a space") ends a match at newlines and
    # tabs, so addresses on separate lines are not merged into one match.
    # The dot is escaped so only a literal "hd.jpg" suffix qualifies.
    urllist = re.findall(r'https:\S*hd\.jpg', text)
    return urllist
 
# Read the saved text file and collect every candidate image address.
with open(path + path_file, "r") as f:
    textall = f.read()
urllist = geturllist(textall)

# Remove duplicate addresses while keeping first-seen order, so the numbered
# output files are assigned deterministically (dict key order is arbitrary
# on Python 2).
urllistonly = []
seen = set()
for url in urllist:
    if url not in seen:
        seen.add(url)
        urllistonly.append(url)

for i, pic in enumerate(urllistonly):
    # NOTE(review): this strips every '002' substring from the address --
    # presumably residue of an escape sequence in the source text; confirm
    # it cannot damage addresses that legitimately contain '002'.
    pic = pic.replace('002', '')
    # Turn the backslash-separated form into a regular URL: the scheme
    # prefix first, then every remaining backslash becomes a forward slash.
    pic = pic.replace('https:\\\\', 'http://')
    pic = pic.replace('\\', '/')
    # Save each image as <index>.jpg next to the input file.
    urllib.urlretrieve(pic, path + str(i) + ".jpg")
print('End!')
 
 
注意
https:\\pic2.com\50\vc9ddaa4c92da.jpg
这种用反斜杠分隔的地址不是有效的 URL,urllib 无法从中解析出主机名,所以会报错:no host given。
 
http://pic2.com/50/vc9ddaa4c92da.jpg
把反斜杠全部替换成正斜杠(并把开头改成 http://)之后,就不报错了。
 
 

相关文章:

  • 2022-12-23
  • 2021-11-19
  • 2021-10-25
  • 2022-12-23
  • 2021-10-19
  • 2021-06-29
猜你喜欢
  • 2022-12-23
  • 2021-08-05
  • 2022-12-23
  • 2021-06-10
  • 2021-06-28
  • 2021-07-20
相关资源
相似解决方案