目标:豆瓣读书,
下载页面中的书籍图片。
import urllib.request
import re #使用正则表达式
def getJpg(date):
    """Find ``img`` tags with a ``.jpg`` source and a following ``alt`` text.

    Args:
        date: HTML source of the page, as a string.

    Returns:
        A list with one 3-tuple of strings per match:
        ``(img_src, gap, alt_part)``.  ``img_src`` is the fragment
        ``img src="http....jpg"``; ``gap`` is any text (it may span several
        lines) between that fragment and the line carrying the ``alt``
        attribute; ``alt_part`` is the single-line text ending in
        ``alt="..."``.  An empty list is returned when nothing matches.
    """
    # The dot in front of "jpg" is escaped so only a literal ".jpg" suffix
    # matches; an unescaped dot would accept any character there.
    pattern = r'(img src="http.+?\.jpg")([\s\S]*?)(.+?.alt=".+?.")'
    return re.findall(pattern, date)
def downLoad(jpgUrl, sTitle, n,
             save_dir='C:\\Users\\74172\\source\\repos\\Python\\spidertest1\\images\\book.douban'):
    """Download one image and save it as ``<save_dir>/<sTitle>.jpg``.

    Any error raised while downloading or saving is printed and not
    re-raised, so one failed image does not stop the caller's loop.  The
    completion message is printed in every case.

    Args:
        jpgUrl: URL of the image to fetch.
        sTitle: Book title, used as the file name (without extension).
        n: Sequence number of the image; only used in the printed message.
        save_dir: Directory the image is written into.  It must already
            exist.
    """
    import os  # function-scope import: the file's import block lacks `os`

    # Characters that are not allowed in Windows file names (and the path
    # separators everywhere) would make the save fail, so replace them.
    safe_title = re.sub(r'[\\/:*?"<>|]', '_', sTitle)
    target = os.path.join(save_dir, '%s.jpg' % safe_title)
    try:
        urllib.request.urlretrieve(jpgUrl, target)
    except Exception as e:
        # Best-effort download: report the problem and carry on.
        print(e)
    finally:
        print('图片%s下载操作完成' % n)
def getTitle(date):
    """Return every ``title="...">`` fragment found in the HTML.

    Args:
        date: HTML source of the page, as a string.

    Returns:
        A list of the matched fragments, each of the form ``title="X">``
        (attribute name, quotes and the closing ``>`` included).  Empty
        when the page has no such fragment.
    """
    # A lone "." between the quotes would only match one-character titles;
    # accept any non-empty run of characters up to the closing quote instead.
    return re.findall(r'title="[^"]+">', date)
if __name__ == '__main__':
    # Fetch the Douban books front page and download every cover image that
    # getJpg() finds, naming each file after the image's alt text.
    url = 'https://book.douban.com/'
    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(url) as res:
        date = res.read().decode('utf-8')
    date_jpg = getJpg(date)
    # Images are numbered from 1 in the progress output.
    for n, jpginfo in enumerate(date_jpg, start=1):
        # Search the matched fragments themselves rather than the repr of the
        # tuple: repr() escapes quotes, which would leak backslashes into
        # titles that contain an apostrophe.
        url_match = re.search(r'http.+?\.jpg', jpginfo[0])
        title_match = re.search(r'alt="(.+?.)"', jpginfo[2])
        if url_match is None or title_match is None:
            # Nothing usable in this match; skip it instead of crashing.
            continue
        jpg_url = url_match.group(0)
        print(n, '--- url -->', jpg_url)
        downLoad(jpg_url, title_match.group(1), n)
又做了一些修改,并将书名写入 txt 文件中。
import urllib.request
import re #使用正则表达式
def getJpg(html):
    """Return the ``.jpg`` image URLs of all ``img`` tags that carry an ``alt``.

    Exactly one URL is produced per pattern match, so the result has one
    entry per matched ``img``/``alt`` pair, in page order.

    Args:
        html: HTML source of the page, as a string.

    Returns:
        A list of URL strings (``http...jpg``); empty when nothing matches.
    """
    pattern = r'(img src="http.+?\.jpg")([\s\S]*?)(.+?.alt=".+?.")'
    prefix = 'img src="'
    # Group 1 always has the form  img src="<url>"  so the URL is whatever
    # sits between the prefix and the closing quote.  Taking it from the
    # group (instead of re-scanning a stringified result list) keeps the
    # full attribute value and ignores URLs that appear in the other groups.
    return [m.group(1)[len(prefix):-1] for m in re.finditer(pattern, html)]
def downLoad(jpgUrl, sTitle, n,
             save_dir='C:/Users/74172/source/repos/Python/spidertest1/images/book.douban'):
    """Download one image and save it as ``<save_dir>/<sTitle>.jpg``.

    Errors from the download are not caught here: they propagate to the
    caller after the completion message has been printed.

    Args:
        jpgUrl: URL of the image to fetch.
        sTitle: Book title, used as the file name (without extension) and
            shown in the printed message.
        n: Sequence number of the image; accepted for the caller's
            convenience and not used inside the function.
        save_dir: Directory the image is written into.  It must already
            exist.
    """
    # Characters that are not allowed in Windows file names (and the path
    # separators everywhere) would make the save fail, so replace them.
    safe_title = re.sub(r'[\\/:*?"<>|]', '_', sTitle)
    target = '%s/%s.jpg' % (save_dir, safe_title)
    try:
        urllib.request.urlretrieve(jpgUrl, target)
    finally:
        print('图片---%s----下载操作完成' % sTitle)
def getTitle(html):
    """Return the ``alt`` text of every ``.jpg`` image, still in its quotes.

    Exactly one title is produced per pattern match, in page order.

    Args:
        html: HTML source of the page, as a string.

    Returns:
        A list of strings such as ``'"Some Title"'`` -- each title keeps the
        surrounding double quotes of the ``alt`` attribute.  Empty when
        nothing matches.
    """
    pattern = r'(img src="http.+?\.jpg")([\s\S]*?)(.+?.alt=".+?.")'
    # Group 3 always contains  alt="..."  so the inner search cannot fail.
    # Reading the group directly (instead of re-scanning a stringified result
    # list) keeps titles containing an apostrophe free of repr() backslashes.
    return [
        re.search(r'alt=(".+?.")', m.group(3)).group(1)
        for m in re.finditer(pattern, html)
    ]
def writeTxt(imageTitle, filename=None):
    """Append one book title, followed by a newline, to a UTF-8 text file.

    Args:
        imageTitle: The title to record.
        filename: Path of the text file.  When omitted, the name is derived
            from the module-level ``url`` by dropping its first 8 and last 5
            characters and adding ``.txt`` (``https://book.douban.com/``
            gives ``book.douban.txt``).

    Raises:
        OSError: If the file cannot be opened or written.
    """
    if filename is None:
        filename = url[8:-5] + '.txt'
    # `with` closes the file on every path and, unlike a manual
    # try/finally around open(), cannot touch an unbound handle when the
    # open itself fails.
    with open(filename, "a", encoding="utf-8") as f:
        f.write(imageTitle + '\n')
if __name__ == '__main__':
    # Fetch the Douban books front page, download each cover image and log
    # its title to a text file.
    url = 'https://book.douban.com/'
    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(url) as res:
        html = res.read().decode('utf-8')
    urlJpgs = getJpg(html)
    imageTitle = getTitle(html)
    # zip() stops at the shorter list, so a length mismatch between URLs and
    # titles cannot raise IndexError.
    for n, (urlJpg, quoted_title) in enumerate(zip(urlJpgs, imageTitle)):
        # Titles arrive wrapped in double quotes; strip one from each end.
        title = quoted_title[1:-1]
        print(n, '--- url -->', urlJpg)
        downLoad(urlJpg, title, n)
        writeTxt(title)