ttzzyy

Python基础(十三)爬虫demo

美女图片网
访问分析网站

需要从页面 HTML 中找到标记当前页码的元素(原文此处配有截图)

再从页面 HTML 中取出图片地址(原文此处配有截图)

具体代码如下:

import urllib.request
import os
import re


def url_open(url):
    """Fetch *url* through an HTTP proxy and return the raw response bytes.

    A desktop-browser User-Agent header is sent so the target site does not
    reject the request as an obvious bot.

    NOTE(review): the proxy address is hard-coded and was presumably taken
    from a public proxy list — it is likely dead by now; replace it (or drop
    the ProxyHandler) before running.
    """
    proxy_support = urllib.request.ProxyHandler({'http': '116.62.134.173:9999'})
    opener = urllib.request.build_opener(proxy_support)
    opener.addheaders = [('User-Agent',
                          'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36')]
    # Context manager closes the response; the original leaked it open.
    with opener.open(url) as response:
        return response.read()


# Extract the current page number from the pagination markup, e.g.
# <a class="this" href="javascript:void(0)" >5</a>
def get_page(url):
    """Return the current page number shown on *url*, as a 1-3 digit string.

    Raises ValueError when the pagination marker cannot be found, instead of
    the original's opaque AttributeError on ``None``.
    """
    html = url_open(url).decode('utf-8')
    # Capture the digits in one pass.  The original used a second search
    # with pattern '[1-9]|[1-9]\d', whose alternation order made it return
    # only the FIRST digit of any multi-digit page number.
    match = re.search(r'<a class="this" href="javascript:void\(0\)" >(\d{1,3})</a>', html)
    if match is None:
        raise ValueError('current page marker not found in page HTML')
    return match.group(1)


def find_imgs(url):
    """Return the magazine-cover image URLs found in the HTML of *url*.

    Matched markup looks like:
    <img class="magazine_img" src="https://img.ugirls.tv/uploads/magazine/cover/e5b7...web_l.jpg" alt="...">
    """
    html = url_open(url).decode('utf-8')
    # Non-greedy '.+?' so two URLs on the same HTML line are not fused into
    # a single bogus match, which the original greedy '.+' did.
    img_addrs = re.findall(r'https://img\.ugirls\.tv/uploads/magazine/cover/.+?\.jpg', html)
    print(img_addrs)
    return img_addrs


def save_imgs(img_addrs):
    """Download every image URL in *img_addrs* into the current directory.

    The file name is the last path component of each URL, e.g.
    https://img.ugirls.tv/uploads/magazine/cover/e5b7..._web_l.jpg
    is written as e5b7..._web_l.jpg.  Failed downloads are reported and
    skipped so one bad URL does not abort the batch.
    """
    for addr in img_addrs:
        _, _, filename = addr.rpartition('/')
        try:
            data = url_open(addr)
            with open(filename, 'wb') as out:
                out.write(data)
        except OSError as reason:
            print(reason)

# Download *pages* consecutive listing pages of cover images, starting at
# the site's current page, into one sub-folder per page under *folder*.
def download_mm(folder='pic', pages=2):
    """Crawl the listing site and save cover images to disk.

    folder -- root download directory, created if missing.
    pages  -- how many consecutive listing pages to fetch.
    """
    url = "https://www.ugirls.com/Content/"
    page_num = int(get_page(url))
    # exist_ok replaces the original bare 'except:' around os.mkdir, which
    # would have swallowed any error, not just "already exists".
    os.makedirs(folder, exist_ok=True)
    os.chdir(folder)

    for offset in range(pages):
        # Derive each page from the offset.  The original mutated page_num
        # but forgot to advance it on the 'directory exists' skip path, so
        # it retried the same page on every remaining iteration.
        current = page_num + offset
        sub_folder = folder + str(current)
        try:
            os.mkdir(sub_folder)
        except FileExistsError:
            # Likely left over from an interrupted earlier run: skip it.
            print('目录已存在')
            continue
        os.chdir(sub_folder)
        page_url = url + "Page-" + str(current) + ".html"
        save_imgs(find_imgs(page_url))
        os.chdir(os.pardir)

# Script entry point: run the crawler with its default settings.
if __name__ == '__main__':
    download_mm()


分类:

技术点:

相关文章:

  • 2022-02-05
  • 2021-12-02
  • 2022-01-03
  • 2021-09-02
  • 2021-08-10
猜你喜欢
  • 2021-12-09
  • 2022-03-04
  • 2021-12-01
  • 2021-11-29
  • 2022-02-09
  • 2021-12-29
  • 2021-10-10
相关资源
相似解决方案