# Approach 1: using regular expressions
# -*- coding: utf-8 -*-
import urllib
import re
def get_content(url):
    """Fetch the content of the web page at ``url``.

    Args:
        url: Address of the page to download.

    Returns:
        The response body exactly as returned by ``read()`` on the opened
        URL handle.
    """
    html = urllib.urlopen(url)
    try:
        # Read inside try/finally so the handle is closed even when the
        # read fails part-way through (the original leaked it in that case).
        return html.read()
    finally:
        html.close()
def get_images(info):
    """Download every ``BDE_Image`` JPEG referenced in ``info``.

    The images of interest look like this in the page source::

        <img class="BDE_Image" src="http://imgsrc.baidu.com/forum/w%3D580/sign=.../0aa9...4b15.jpg"
             pic_ext="jpeg" width="510" height="765">

    Each matched URL is saved as ``images/<n>.jpg``, where ``<n>`` counts
    from 0 in the order the URLs appear in ``info``.

    Args:
        info: HTML source of the page to scan.
    """
    import os

    # Restrict the capture to characters that can belong to ONE attribute
    # value. The previous ``.+?`` was allowed to step over the closing quote
    # of a non-.jpg src and swallow the following markup up to the next
    # '.jpg"', producing a bogus "URL". Newlines stay excluded, as before.
    pattern = re.compile(r'class="BDE_Image" src="([^"\n]+?\.jpg)"')
    images = pattern.findall(info)

    # Only touch the filesystem when there is something to save; without the
    # target directory urlretrieve fails on the first image.
    if images and not os.path.isdir("images"):
        os.makedirs("images")

    for index, image in enumerate(images):
        urllib.urlretrieve(image, "images/%s.jpg" % index)
# Script entry point: download the page at `url`, then hand its source to
# get_images() so the images it references are saved locally.
url = "http://tieba.baidu.com/p/2772656630"
info = get_content(url)  # page source as returned by get_content()
get_images(info)
# Approach 2: using BeautifulSoup
# -*- coding: utf-8 -*-
import urllib
from bs4 import BeautifulSoup
def get_content(url):
    """Fetch the content of the web page at ``url``.

    Args:
        url: Address of the page to download.

    Returns:
        The response body exactly as returned by ``read()`` on the opened
        URL handle.
    """
    html = urllib.urlopen(url)
    try:
        # Read inside try/finally so the handle is closed even when the
        # read fails part-way through (the original leaked it in that case).
        return html.read()
    finally:
        html.close()
def get_images(info):
    """Download every element with class ``BDE_Image`` found in ``info``.

    For each matching element its 1-based number and ``src`` value are
    printed, then the image is saved as ``images/bs4-NNN.jpg`` where ``NNN``
    is that number zero-padded to three digits (001, 002, ..., 010, ...).

    Args:
        info: HTML source of the page to parse.
    """
    import os

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed on the machine.
    soup = BeautifulSoup(info, "html.parser")
    images = soup.find_all(class_="BDE_Image")

    # Only touch the filesystem when there is something to save; without the
    # target directory urlretrieve fails on the first image.
    if images and not os.path.isdir("images"):
        os.makedirs("images")

    for i, image in enumerate(images, 1):
        image_add = image.get('src')
        # Single pre-formatted argument: prints "<n> -- <url>" just like the
        # original print statement did.
        print("%s -- %s" % (i, image_add))
        # %03d does the zero padding that used to be done by hand, which
        # required flipping the counter between int and str each iteration.
        urllib.urlretrieve(image_add, "images/bs4-%03d.jpg" % i)
# Script entry point: download the page at `url`, then hand its source to
# get_images() so the images it references are saved locally.
url = "http://tieba.baidu.com/p/2772656630"
info = get_content(url)  # page source as returned by get_content()
get_images(info)