import os#导入操作系统模块
from urllib.request import urlretrieve#下载url对应的文件
from urllib.request import urlopen #打开url,得到网页源代码
from bs4 import BeautifulSoup #bs库,对源代码进行各种操作
downloadDirectory = "downloaded" # save fetched files under the "downloaded" folder
baseUrl = "http://pythonscraping.com"#########################################
#将任意链接转换成absolute URL——清理和标准化
def getAbsoluteURL(baseUrl, source):
    """Normalize *source* into an absolute URL rooted at *baseUrl*.

    Strips a leading ``www.`` from the host, resolves scheme-less and
    relative links against *baseUrl*, and returns ``None`` for any URL
    that lives outside *baseUrl*'s site (so we never download off-site
    assets).

    :param baseUrl: site root, e.g. ``"http://pythonscraping.com"``
    :param source: raw ``src`` value taken from the page
    :return: cleaned absolute URL string, or ``None`` if off-site
    """
    if source.startswith("http://www."):
        url = "http://" + source[11:]
    elif source.startswith("https://www."):
        # same www-stripping for https assets (original code mangled these)
        url = "https://" + source[12:]
    elif source.startswith(("http://", "https://")):
        url = source
    elif source.startswith("www."):
        url = "http://" + source[4:]
    else:
        # relative path: resolve against the site root
        url = baseUrl + "/" + source
    # Compare host+path with the scheme removed, so http/https both pass,
    # and require a *prefix* match: a URL that merely embeds the host name
    # somewhere in its path (e.g. http://evil.com/pythonscraping.com) is
    # still rejected.
    bareBase = baseUrl.split("://", 1)[-1]
    bareUrl = url.split("://", 1)[-1]
    if not bareUrl.startswith(bareBase):
        return None
    return url
#新建一个文件夹,存放下载文件
def getDownloadPath(baseUrl, absoluteUrl, downloadDirectory):
    """Map *absoluteUrl* to a local file path under *downloadDirectory*
    and make sure the target directory exists.

    The site-root portion of the URL is stripped so the remote path
    structure is mirrored beneath *downloadDirectory*.

    :param baseUrl: site root that was used to build *absoluteUrl*
    :param absoluteUrl: cleaned URL returned by ``getAbsoluteURL``
    :param downloadDirectory: local folder that receives the mirror tree
    :return: local path the file should be saved to
    """
    # NOTE(review): replace() strips "www." anywhere in the URL, not just
    # the host prefix — kept for compatibility with existing mirrors.
    path = absoluteUrl.replace("www.", "")
    path = path.replace(baseUrl, "")
    path = downloadDirectory + path
    directory = os.path.dirname(path)
    # dirname is "" when the URL is the bare site root; makedirs("") would
    # raise, so guard it. exist_ok avoids the exists()/makedirs() race.
    if directory:
        os.makedirs(directory, exist_ok=True)
    return path
# Fetch the front page, collect every tag that carries a src attribute,
# and download each asset that resolves to an on-site URL.
html = urlopen("http://www.pythonscraping.com")
# Explicit parser: silences bs4's "no parser specified" warning and makes
# parsing deterministic across machines.
bsObj = BeautifulSoup(html, "html.parser")
downloadList = bsObj.findAll(src=True)  # img, script, iframe, ... anything with src

for download in downloadList:
    fileUrl = getAbsoluteURL(baseUrl, download["src"])
    if fileUrl is not None:
        print(fileUrl)
        try:
            urlretrieve(fileUrl, getDownloadPath(baseUrl, fileUrl, downloadDirectory))
        except OSError as e:
            # urllib errors subclass OSError; one broken asset (404,
            # timeout) must not abort the whole crawl.
            print("Failed to download", fileUrl, "-", e)
相关文章:
- python爬虫下载壁纸图片 2021-11-23
- Python网络爬虫 - 下载图片 2021-11-30
- python爬虫-图片批量下载 2021-09-18
- python 爬虫下载图片方法 2021-05-22
- 爬虫_图片下载 2021-12-10
- python图片爬虫 - 批量下载unsplash图片 2021-09-18
- Python爬虫之下载网页图片 2021-12-08
- python爬虫之图片下载APP 2.0 2021-05-25