[Crawler] Scraping Sogou's Lexicons
A classmate working on search-query intent analysis needed word segmentation, so he asked me to write a crawler for the Sogou lexicon site. I had always scraped pages with regex matching before, and had long wanted to try Beautiful Soup, so this was a good chance to experience how powerful it is. The script crawls the Sogou lexicon home page first (level 1), then each category's second-level page, and finally downloads every lexicon file under that category into a folder named "1" in the same directory as the script. I'm still a Python novice; if this helps you, feel free to take it.
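If you haven't used Beautiful Soup before, the core idea is just a couple of calls. Here is a minimal sketch of the level-1 extraction the full script below performs; it targets the tag id dict_category_show and the class names from the 2017-era page layout, which may have changed since:

# Minimal sketch: pull the top-level category titles from the dict index (Python 2).
from bs4 import BeautifulSoup
import urllib

html = urllib.urlopen("http://pinyin.sogou.com/dict/").read()
soup = BeautifulSoup(html, "html.parser")
for cate in soup.find(id="dict_category_show").find_all("div", class_="dict_category_list"):
    # Each category block carries a title link; no regex needed.
    title = cate.find(class_="dict_category_list_title").find("a").contents[0]
    print title.encode("utf-8")

Compared with regex matching, these selectors survive whitespace and attribute-order changes in the markup, which is exactly where hand-written patterns tend to break.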
#coding=utf-8
'''
Created on 2017-04-06

Crawl the Sogou lexicon index (level 1) and every category page (level 2),
then download each lexicon's .scel file (level 3) into the local folder "1".

@author: lenovo
'''
from bs4 import BeautifulSoup
import os
import sys
import urllib
def callbackfunc(blocknum, blocksize, totalsize):
    '''Progress hook passed to urllib.urlretrieve.
    @blocknum:  number of data blocks transferred so far
    @blocksize: size of each block in bytes
    @totalsize: total size of the remote file in bytes
    '''
    if totalsize <= 0:
        return  # some servers omit Content-Length; nothing to report
    percent = 100.0 * blocknum * blocksize / totalsize
    if percent > 100:
        percent = 100
    sys.stdout.write("\r%6.2f%%" % percent)
    sys.stdout.flush()
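# urlretrieve calls the hook once per transferred block, so a call like
#     urllib.urlretrieve(url, "foo.scel", callbackfunc)
# keeps rewriting a single console line (e.g. " 42.00%") until the file is done.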
if __name__ == "__main__":
    BaseUrl = "http://pinyin.sogou.com"
    HomePageUrl = "http://pinyin.sogou.com/dict/"
    SaveDir = "1"  # lexicon files are saved into ./1 next to the script
    if not os.path.isdir(SaveDir):
        os.makedirs(SaveDir)
    html = urllib.urlopen(HomePageUrl).read()
    soup = BeautifulSoup(html, "html.parser")
    # Level 1: the top-level category blocks on the index page.
    categories = soup.find(id="dict_category_show").find_all('div', class_='dict_category_list')
    fc = 0  # level-1 categories seen
    sc = 0  # level-2 sub-categories seen
    tc = 0  # level-3 lexicon files downloaded
    for ii in categories:
        fc += 1
        print "Level 1 :" + ii.find(class_='dict_category_list_title').find('a').contents[0]
        # Level 2: every sub-category link inside this category block.
        for k in ii.find(class_='catewords').find_all('a'):
            secondclass = k.contents[0]
            secondUrl = BaseUrl + "%s" % (k['href'])
            print " " * 4 + "Level 2 :" + secondclass  # + " " * 8 + secondUrl
            sc += 1
            soup2 = BeautifulSoup(urllib.urlopen(secondUrl).read(), "html.parser")
            # The second-to-last <span> in the pager holds the last page number.
            totalpagenum = soup2.find(id='dict_page_list').find('ul').find_all('span')[-2].a.contents[0]
            for pageind in range(1, int(totalpagenum) + 1):
                pageUrl = "%s/default/%d" % (secondUrl.replace("?rf=dictindex", ""), pageind)
                soup2 = BeautifulSoup(urllib.urlopen(pageUrl).read(), "html.parser")
                # Level 3: each lexicon entry on this page and its download link.
                for kk in soup2.find_all('div', class_='dict_detail_block'):
                    thirdclass = kk.find(class_='detail_title').find('a').contents[0]
                    thirdUrl = kk.find(class_='dict_dl_btn').a['href']
                    print " " * 8 + "Level 3 :" + thirdclass + " " * 10 + "Downloading....."
                    tc += 1
                    filename = os.path.join(SaveDir, "%s-%s.scel" % (secondclass, thirdclass))
                    urllib.urlretrieve(thirdUrl.encode('utf8'), filename, callbackfunc)
    print "Totals: %d categories, %d sub-categories, %d lexicons" % (fc, sc, tc)