python3.3 爬虫小样例

第一个爬虫小样例是仿照大神的博客（http://blog.csdn.net/pleasecallmewhy/article/details/8927832）改写成 python3.3 版本的，算是我的入门爬虫吧。

第一个爬虫小样例：

import urllib.request as request
import urllib.parse as parse
import string
# Author banner shown once when the script starts.
banner = """
+++++++++++++++++++++++
  学校：超神学院
  专业：德玛班
  姓名：德玛之力
  version: python3.2
+++++++++++++++++=++++
     """
print(banner)
def baidu_tieba(url, begin_page, end_page, save_dir='f:/test/'):
    """Download a run of numbered pages and save each one as an HTML file.

    For every number ``i`` from ``begin_page`` to ``end_page`` (inclusive)
    the resource at ``url + str(i)`` is fetched and its raw bytes are
    written to ``<save_dir>/<i zero-padded to 5 digits>.html``.

    Args:
        url: URL prefix; the number is appended to it unchanged.
        begin_page: First number to fetch.
        end_page: Last number to fetch (inclusive).
        save_dir: Directory that receives the files; created when missing.
            Defaults to the path this script originally hard-coded.

    Raises:
        urllib.error.URLError: If a page cannot be fetched.
        OSError: If the directory or a file cannot be written.
    """
    # Imported locally because this script does not import os at the top.
    import os

    os.makedirs(save_dir, exist_ok=True)
    for i in range(begin_page, end_page + 1):
        file_name = os.path.join(save_dir, str(i).zfill(5) + '.html')
        print('正在下载第' + str(i) + '个页面, 并保存为' + file_name)
        # Close the response as soon as the body has been read.
        with request.urlopen(url + str(i)) as response:
            page_bytes = response.read()
        # The with-block closes the file; no explicit close() is needed.
        with open(file_name, 'wb') as page_file:
            page_file.write(page_bytes)
if __name__ == "__main__":
    # Run the downloader for the numbers 1 to 3 appended to the Tieba prefix.
    baidu_tieba("http://tieba.baidu.com/p/", 1, 3)

>>> 

+++++++++++++++++++++++
  学校：超神学院
  专业：德玛班
  姓名：德玛之力
  version: python3.2
+++++++++++++++++=++++
     
正在下载第1个页面, 并保存为f:/test/00001.html
正在下载第2个页面, 并保存为f:/test/00002.html
正在下载第3个页面, 并保存为f:/test/00003.html

这个是我写的一个爬取多个页面的png图像代码

import urllib.request as request
import urllib.parse as parse
import string
import re
import os
import urllib.error as error
# Author banner printed once at start-up, before any download begins.
banner_text = """
+++++++++++++++++++++++
  学校：超神学院
  专业：德玛班
  姓名：德玛之力
  version: python3.2
+++++++++++++++++=++++
     """
print(banner_text)
def baidu_tieba(url, begin_page, end_page, save_dir='f:/test/'):
    """Download numbered pages, saving each page and the PNG images it links.

    For every number ``i`` from ``begin_page`` to ``end_page`` (inclusive)
    the resource at ``url + str(i)`` is fetched.  Each ``<img src="...">``
    whose address starts with ``http://`` and ends with ``.png`` is
    downloaded into ``<save_dir>/<i>/<n>.png``, where ``n`` is a counter
    that keeps running across pages.  The raw page is then written to
    ``<save_dir>/<i zero-padded to 5 digits>.html``.

    Args:
        url: URL prefix; the number is appended to it unchanged.
        begin_page: First number to fetch.
        end_page: Last number to fetch (inclusive).
        save_dir: Directory that receives pages and image folders; created
            when missing.  Defaults to the path this script originally
            hard-coded.

    Raises:
        urllib.error.URLError: If a page cannot be fetched.  A failed image
            download is only reported with a message and skipped.
        UnicodeDecodeError: If a page body is not valid GBK.
        OSError: If a directory or file cannot be written.
    """
    # Compile both patterns once instead of on every loop iteration.
    img_src_pattern = re.compile('<img src="(.+?)"')
    # The dot before "png" is escaped so only a real ".png" suffix matches.
    png_url_pattern = re.compile(r'^http://.*\.png$')
    count = 1  # running image number, shared by all pages
    for i in range(begin_page, end_page + 1):
        file_name = os.path.join(save_dir, str(i).zfill(5) + '.html')
        print('正在下载第' + str(i) + '个页面, 并保存为' + file_name)
        # Close the response as soon as the body has been read.
        with request.urlopen(url + str(i)) as response:
            page_bytes = response.read()
        # One sub-folder per page holds the images found on that page.
        new_path = os.path.join(save_dir, str(i))
        os.makedirs(new_path, exist_ok=True)
        page_data = page_bytes.decode('GBK')
        for image in img_src_pattern.findall(page_data):
            if not png_url_pattern.match(image):
                continue
            try:
                with request.urlopen(image) as image_response:
                    image_data = image_response.read()
            except error.URLError:
                print('Download failed')
                continue
            # Joined with '/' so the printed path keeps its original form.
            image_path = new_path + '/' + str(count) + '.png'
            count += 1
            print(image_path)
            with open(image_path, 'wb') as image_file:
                image_file.write(image_data)
        # The with-block closes the file; no explicit close() is needed.
        with open(file_name, 'wb') as page_file:
            page_file.write(page_bytes)
if __name__ == "__main__":
    # Same Tieba prefix and 1..3 range as the first sample, passed by keyword.
    baidu_tieba(
        url="http://tieba.baidu.com/p/",
        begin_page=1,
        end_page=3,
    )

>>> 

+++++++++++++++++++++++
  学校：超神学院
  专业：德玛班
  姓名：德玛之力
  version: python3.2
+++++++++++++++++=++++
     
正在下载第1个页面, 并保存为f:/test/00001.html
f:/test/1/1.png
f:/test/1/2.png
f:/test/1/3.png
f:/test/1/4.png
f:/test/1/5.png
f:/test/1/6.png
f:/test/1/7.png
f:/test/1/8.png
f:/test/1/9.png
正在下载第2个页面, 并保存为f:/test/00002.html
f:/test/2/10.png
正在下载第3个页面, 并保存为f:/test/00003.html
f:/test/3/11.png
f:/test/3/12.png
f:/test/3/13.png
f:/test/3/14.png
f:/test/3/15.png
f:/test/3/16.png
f:/test/3/17.png
f:/test/3/18.png
f:/test/3/19.png
f:/test/3/20.png
f:/test/3/21.png
f:/test/3/22.png

样例3，用BeautifulSoup爬取淘宝首页的文件夹中的汉字，比re好用，基本不要写正则，也能够嵌套正则

BeautifulSoup4.* 中文文档 http://www.crummy.com/software/BeautifulSoup/bs4/doc/

import urllib
import urllib.request as request
from bs4 import BeautifulSoup
def taobao(url):
    """Print the text of every ``<h3>`` heading on the page at ``url``.

    The response body is decoded as GBK before it is parsed.

    Args:
        url: Address of the page to fetch.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
        UnicodeDecodeError: If the body is not valid GBK.
    """
    # Close the response as soon as the body has been read.
    with request.urlopen(url) as response:
        html = response.read()
    # Decode once and hand the text to the parser.  Re-encoding to UTF-8
    # first would only make BeautifulSoup detect the encoding again.
    data = html.decode('gbk')
    # Name the parser so the result does not depend on which optional
    # parsers happen to be installed.
    soup = BeautifulSoup(data, 'html.parser')
    for heading in soup.find_all('h3'):
        # get_text() also handles headings with several child nodes, for
        # which .string is None and "None" used to be printed.
        print(heading.get_text())
if __name__ == '__main__':
    # Print the author banner, then list the <h3> headings of the Taobao
    # home page.
    print("""
+++++++++++++++++++++++
  学校：超神学院
  专业：德玛班
  姓名：德玛之力
  version: python3.2
+++++++++++++++++=++++
     """)
    url = 'http://www.taobao.com/?spm=a310q.2219005.1581860521.1.b9kUd4'
    taobao(url)

>>> 

+++++++++++++++++++++++
  学校：超神学院
  专业：德玛班
  姓名：德玛之力
  version: python3.2
+++++++++++++++++=++++
     
便民服务
服装内衣
鞋包配饰
运动户外
珠宝手表
手机数码
家电办公
护肤彩妆
母婴用品
家纺居家
家具建材
美食特产
日用百货
汽车摩托
文化娱乐
本地生活
虚拟服务
万能的淘宝
None
热卖单品
更新日志