keer2345

使用正则表达式

# -*- coding: utf-8 -*-
import urllib
import re


def get_content(url):
    """Fetch the raw body of the page at ``url``.

    Args:
        url: Address of the page to download.

    Returns:
        The response body exactly as read from the connection; it is not
        decoded here.
    """
    # urlopen lives in urllib on Python 2 and in urllib.request on Python 3.
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen

    html = urlopen(url)
    try:
        return html.read()
    finally:
        # Release the connection even when read() raises.
        html.close()


def get_images(info):
    """Download every post image referenced in a Tieba page.

    Each ``.jpg`` address found on a ``class="BDE_Image"`` image tag is saved
    as ``images/<n>.jpg``, where ``n`` counts the matches from 0 in page
    order. The ``images`` directory is created when it does not exist yet.

    Args:
        info: HTML source of the page, as returned by ``get_content``.
    """
    # Sample of the markup being matched:
    #   <img class="BDE_Image" src="http://imgsrc.baidu.com/forum/w%3D580/
    #   sign=269396684d4a20a4311e3ccfa0539847/0aa95edf8db1cb132cd1f269df54564e92584b15.jpg"
    #   pic_ext="jpeg" width="510" height="765">
    import os
    # urlretrieve lives in urllib on Python 2 and in urllib.request on Python 3.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    # On Python 3 the page arrives as bytes, which a text pattern cannot
    # search; on Python 2 bytes is str, so this branch is skipped.
    if isinstance(info, bytes) and not isinstance(info, str):
        info = info.decode("utf-8", "replace")

    regex = r'class="BDE_Image" src="(.+?\.jpg)"'
    images = re.findall(regex, info)
    if not images:
        return

    # urlretrieve does not create missing parent directories.
    if not os.path.isdir("images"):
        os.makedirs("images")

    for i, image in enumerate(images):
        urlretrieve(image, "images/%s.jpg" % i)


# Script driver: fetch the Tieba thread page, then save the images found in it.
url = "http://tieba.baidu.com/p/2772656630"
info = get_content(url)  # page source handed to the image extractor
get_images(info)

使用BeautifulSoup

# -*- coding: utf-8 -*-
import urllib
from bs4 import BeautifulSoup


def get_content(url):
    """Fetch the raw body of the page at ``url``.

    Args:
        url: Address of the page to download.

    Returns:
        The response body exactly as read from the connection; it is not
        decoded here.
    """
    # urlopen lives in urllib on Python 2 and in urllib.request on Python 3.
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen

    html = urlopen(url)
    try:
        return html.read()
    finally:
        # Release the connection even when read() raises.
        html.close()


def get_images(info):
    """Download every post image found in a Tieba page using BeautifulSoup.

    For each element carrying the ``BDE_Image`` class, its ``src`` is printed
    and then saved as ``images/bs4-<nnn>.jpg``, where ``nnn`` is the element's
    1-based position on the page, zero-padded to three digits. Elements
    without a ``src`` are printed but not downloaded. The ``images`` directory
    is created when it does not exist yet.

    Args:
        info: HTML source of the page, as returned by ``get_content``.
    """
    import os
    # urlretrieve lives in urllib on Python 2 and in urllib.request on Python 3.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(info, "html.parser")
    images = soup.find_all(class_="BDE_Image")

    # urlretrieve does not create missing parent directories.
    if images and not os.path.isdir("images"):
        os.makedirs("images")

    for i, image in enumerate(images, 1):
        image_add = image.get('src')
        # Single-argument print call: same output on Python 2 and Python 3.
        print("%s -- %s" % (i, image_add))
        if not image_add:
            # Tag without a src attribute: nothing to download.
            continue
        urlretrieve(image_add, "images/bs4-%03d.jpg" % i)


# Script driver: fetch the Tieba thread page, then save the images found in it.
url = "http://tieba.baidu.com/p/2772656630"
info = get_content(url)  # page source handed to the image extractor
get_images(info)

分类:

技术点:

相关文章: