Python基础(十三)爬虫demo
美女图片网
访问分析网站
需要从下图中找到当前页
再从下图中拿出图片地址
具体代码如下:
import urllib.request
import os
import re
def url_open(url, proxy='116.62.134.173:9999'):
    """Fetch *url* and return the raw response body as bytes.

    A browser-like User-Agent header is sent so the site does not
    reject the scraper.

    :param url: address to fetch
    :param proxy: ``'host:port'`` HTTP proxy to route the request
        through, or ``None``/empty for a direct connection.  The
        default keeps the original hard-coded proxy for backward
        compatibility.
    :return: response body as ``bytes``
    """
    handlers = []
    if proxy:
        handlers.append(urllib.request.ProxyHandler({'http': proxy}))
    opener = urllib.request.build_opener(*handlers)
    opener.addheaders = [('User-Agent',
                          'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36')]
    # Close the response deterministically instead of leaking the socket.
    with opener.open(url) as response:
        return response.read()
def get_page(url):
    """Return the current page number shown on *url*, as a string.

    The site marks the current page in its pager as
    ``<a class="this" href="javascript:void(0)" >N</a>``; N is extracted
    from that element.

    :param url: page whose pager is inspected
    :return: the page number as a string of 1-3 digits
    :raises ValueError: if the pager element is not present in the HTML
        (previously this crashed with an AttributeError on ``None``).
    """
    html = url_open(url).decode('utf-8')
    # Capture the digits directly.  The old two-step extraction used the
    # pattern [1-9]|[1-9]\d, whose left alternative matches first, so any
    # multi-digit page number was truncated to its first digit.
    match = re.search(
        r'<a class="this" href="javascript:void\(0\)" >(\d{1,3})</a>', html)
    if match is None:
        raise ValueError('current-page marker not found in %s' % url)
    return match.group(1)
def find_imgs(url):
    """Return the list of magazine-cover image URLs found on *url*.

    Matches addresses of the form
    ``https://img.ugirls.tv/uploads/magazine/cover/<hash>_cover_web_l.jpg``
    as they appear in ``<img class="magazine_img" src="...">`` tags.

    :param url: listing page to scan
    :return: list of image URL strings (possibly empty)
    """
    html = url_open(url).decode('utf-8')
    # Non-greedy .+? so a single match can never span two URLs that
    # happen to sit on the same line of HTML (greedy .+ could swallow
    # everything up to the LAST ".jpg" on the line).
    img_addrs = re.findall(
        r'https://img\.ugirls\.tv/uploads/magazine/cover/.+?\.jpg', html)
    print(img_addrs)
    return img_addrs
def save_imgs(img_addrs):
    """Download each image URL in *img_addrs* into the current directory.

    The local file name is the last path component of the URL, e.g.
    ``.../e5b7...db55_cover_web_l.jpg`` is saved as
    ``e5b7...db55_cover_web_l.jpg``.  A failed download or write is
    reported and skipped so one bad image does not abort the batch.
    """
    for addr in img_addrs:
        name = addr.rsplit('/', 1)[-1]
        try:
            data = url_open(addr)
            with open(name, 'wb') as out:
                out.write(data)
        except OSError as err:
            print(err)
            continue
def download_mm(folder='pic', pages=2):
    """Download *pages* consecutive listing pages of cover images.

    Creates (or reuses) *folder* under the current directory, then for
    each page creates a per-page subdirectory named ``<folder><page_num>``
    and saves that page's images into it.  A page whose subdirectory
    already exists (e.g. from an interrupted earlier run) is skipped.

    :param folder: directory that will hold the per-page subdirectories
    :param pages: how many pages to fetch, starting from the site's
        current page
    """
    url = "https://www.ugirls.com/Content/"
    page_num = int(get_page(url))
    try:
        os.mkdir(folder)
    except FileExistsError:
        # Narrowed from a bare except: only "already exists" is expected.
        print('目录已存在')
    finally:
        os.chdir(folder)
    for _ in range(pages):
        page_folder = folder + str(page_num)
        try:
            os.mkdir(page_folder)
            os.chdir(page_folder)
        except FileExistsError:
            # Directory left over from an earlier (possibly interrupted)
            # run: skip this page.  Bug fix: advance page_num before
            # continuing — the old code skipped the increment at the loop
            # bottom and retried the SAME page on every remaining pass.
            print('目录已存在')
            page_num += 1
            continue
        page_url = url + "Page-" + str(page_num) + ".html"
        save_imgs(find_imgs(page_url))
        os.chdir(os.pardir)
        page_num += 1
# Script entry point: download two pages of covers into ./pic by default.
if __name__ == '__main__':
    download_mm()