huipengbo

1. In this post I'd like to introduce an image crawler I wrote. Put simply, it automatically downloads the images you want from a web page.

2. The target is the 涨姿势 site at http://www.zhangzishi.cc/. Our goal is to crawl all the images in the site's welfare (福利社) section.

3. The welfare section lives at http://www.zhangzishi.cc/category/welfare. Grabbing the images boils down to collecting the URL of every image on the site. Step by step:

A. Open the URL and fetch the HTML

def url_open(url):
    # Spoof a browser User-Agent so the site serves the page normally
    req = urllib.request.Request(url)
    req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36')
    response = urllib.request.urlopen(req)
    html = response.read()
    print('url_open')
    return html
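A quick way to sanity-check this function is to fetch the listing page used throughout this post and decode it (this snippet assumes the site is reachable):

html = url_open('http://www.zhangzishi.cc/category/welfare').decode('utf-8')
print(html[:200])  # peek at the first 200 characters of the page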

B. Extract the article links from the HTML; this returns a list of page URLs

def page_htmls(url,count):
    html = url_open(url).decode('utf-8')
    pages = []
    # Scan for anchors of the form: a target="_blank" href="...html"
    a = html.find('a target="_blank" href=')
    i = 0
    while a != -1:
        i += 1
        b = html.find('.html',a,a+200)
        if b != -1:
            # a+24 skips the attribute text plus the opening quote
            pages.append(html[a+24:b+5])
        else:
            b = a + 24
        a = html.find('a target="_blank" href=',b)
        if i == count:
            break
    for each in pages:
        print(each)
    return pages
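The same extraction can be written more compactly with a regular expression. This is only an equivalent sketch, assuming the anchors keep the exact a target="_blank" href="..." shape scanned above; page_htmls_re is a hypothetical name:

import re

def page_htmls_re(url, count):
    # Capture the href value of every matching anchor, then truncate to count
    html = url_open(url).decode('utf-8')
    pages = re.findall(r'a target="_blank" href="(.*?\.html)"', html)
    return pages[:count]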

C. Extract the image URLs from each linked page; I used two approaches here

def find_imgs(url):
    html = url_open(url).decode('utf-8')
    imgs = []

    # Look for every img src="...jpg"; some URLs are protocol-relative
    a = html.find('img src=')
    while a != -1:
        b = html.find('.jpg',a,a+100)
        if b != -1:
            if html[a+9:b+4].find('http') == -1:
                # Protocol-relative URL such as //example.com/a.jpg
                imgs.append('http:'+html[a+9:b+4])
            else:
                imgs.append(html[a+9:b+4])
        else:
            b = a + 9
        a = html.find('img src=',b)
    return imgs




def imgurl_get(url):
    html = url_open(url).decode('utf-8')
    imgurls = []
    # Anchor on the inline style this site puts on article images
    a = html.find('color: #555555;" src=')
    while a != -1:
        b = html.find('.jpg',a,a+100)
        if b != -1:
            imgurls.append('http:'+html[a+22:b+4])
        else:
            b = a + 22
        a = html.find('color: #555555;" src=',b)

    return imgurls
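Both scanners can likewise be collapsed into a regular expression. The sketch below is a rough equivalent of find_imgs, assuming the markup matches the exact img src="...jpg" pattern searched for above; find_imgs_re is a hypothetical name:

import re

def find_imgs_re(url):
    html = url_open(url).decode('utf-8')
    urls = re.findall(r'img src="([^"]*?\.jpg)"', html)
    # Prepend the scheme to protocol-relative URLs such as //example.com/a.jpg
    return [u if 'http' in u else 'http:' + u for u in urls]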

D. Download each image URL to a file

def save_imgs(folder,imgs):
    for ea in imgs:
        # Use the last path segment of the URL as the local file name
        filename = ea.split('/')[-1]
        with open(filename,'wb') as f:
            img = url_open(ea)
            f.write(img)
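One weakness of save_imgs is that a single failed request aborts the whole run, and images sharing a file name silently overwrite each other. Here is a minimal hardened sketch; the skip-on-error policy and the save_imgs_safe name are my additions, not part of the original:

import os
import urllib.error

def save_imgs_safe(folder, imgs):
    for ea in imgs:
        filename = ea.split('/')[-1]
        if os.path.exists(filename):
            continue  # skip a file we already downloaded
        try:
            img = url_open(ea)
        except urllib.error.URLError:
            continue  # skip unreachable images instead of crashing
        with open(filename, 'wb') as f:
            f.write(img)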
            
def download_mm(folder='H:\\xxoo2',page_count = 100,count = 100):
    main_url = 'http://www.zhangzishi.cc/category/welfare'
    main_urls = []
    # Build the URL of every listing page: .../page/1, .../page/2, ...
    for i in range(count):
        main_urls.append(main_url+'/page/'+str(i+1))
    os.makedirs(folder, exist_ok=True)  # don't crash if the folder already exists
    os.chdir(folder)
    for url in main_urls:
        htmls = page_htmls(url,page_count)
        for page in htmls:
            imgurls = imgurl_get(page)
            save_imgs(folder,imgurls)
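Before committing to 100 listing pages, the pipeline can be exercised with small counts; the folder path here is only an example:

download_mm(folder='H:\\test_run', page_count=5, count=2)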

E. Start the download

def download__img(folder='H:\\xxoo',page_count=100):
    main_url = 'http://www.zhangzishi.cc/category/welfare'
    os.makedirs(folder, exist_ok=True)  # don't crash if the folder already exists
    os.chdir(folder)
    htmls = page_htmls(main_url,page_count)
    for page in htmls:
        imgs_url = find_imgs(page)
        save_imgs(folder,imgs_url)

if __name__ == '__main__':
    download_mm()
    #download__img()

F. Download results

And here is the complete code:

import urllib.request
import os

def url_open(url):
    # Spoof a browser User-Agent so the site serves the page normally
    req = urllib.request.Request(url)
    req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36')
    response = urllib.request.urlopen(req)
    html = response.read()
    print('url_open')
    return html

def page_htmls(url,count):
    html = url_open(url).decode('utf-8')
    pages = []
    # Scan for anchors of the form: a target="_blank" href="...html"
    a = html.find('a target="_blank" href=')
    i = 0
    while a != -1:
        i += 1
        b = html.find('.html',a,a+200)
        if b != -1:
            # a+24 skips the attribute text plus the opening quote
            pages.append(html[a+24:b+5])
        else:
            b = a + 24
        a = html.find('a target="_blank" href=',b)
        if i == count:
            break
    for each in pages:
        print(each)
    return pages
def find_imgs(url):
    html = url_open(url).decode('utf-8')
    imgs = []

    # Look for every img src="...jpg"; some URLs are protocol-relative
    a = html.find('img src=')
    while a != -1:
        b = html.find('.jpg',a,a+100)
        if b != -1:
            if html[a+9:b+4].find('http') == -1:
                # Protocol-relative URL such as //example.com/a.jpg
                imgs.append('http:'+html[a+9:b+4])
            else:
                imgs.append(html[a+9:b+4])
        else:
            b = a + 9
        a = html.find('img src=',b)
    return imgs




def imgurl_get(url):
    html = url_open(url).decode('utf-8')
    imgurls = []
    # Anchor on the inline style this site puts on article images
    a = html.find('color: #555555;" src=')
    while a != -1:
        b = html.find('.jpg',a,a+100)
        if b != -1:
            imgurls.append('http:'+html[a+22:b+4])
        else:
            b = a + 22
        a = html.find('color: #555555;" src=',b)

    return imgurls

def save_imgs(folder,imgs):
    for ea in imgs:
        # Use the last path segment of the URL as the local file name
        filename = ea.split('/')[-1]
        with open(filename,'wb') as f:
            img = url_open(ea)
            f.write(img)
            
def download_mm(folder='H:\\xxoo2',page_count = 100,count = 100):
    main_url = 'http://www.zhangzishi.cc/category/welfare'
    main_urls = []
    # Build the URL of every listing page: .../page/1, .../page/2, ...
    for i in range(count):
        main_urls.append(main_url+'/page/'+str(i+1))
    os.makedirs(folder, exist_ok=True)  # don't crash if the folder already exists
    os.chdir(folder)
    for url in main_urls:
        htmls = page_htmls(url,page_count)
        for page in htmls:
            imgurls = imgurl_get(page)
            save_imgs(folder,imgurls)


        
def download__img(folder='H:\\xxoo',page_count=100):
    main_url = 'http://www.zhangzishi.cc/category/welfare'
    os.makedirs(folder, exist_ok=True)  # don't crash if the folder already exists
    os.chdir(folder)
    htmls = page_htmls(main_url,page_count)
    for page in htmls:
        imgs_url = find_imgs(page)
        save_imgs(folder,imgs_url)
       
if __name__ == '__main__':
    download_mm()
    #download__img()
