[Crawler] Scraping Sogou's Lexicons
A classmate working on search-query intent analysis needed word segmentation, so he asked me to write a crawler for the Sogou lexicon site. I had always scraped pages with regex matching before, and had long wanted to try Beautiful Soup, so this was a good chance to experience how powerful it is. The script crawls the Sogou lexicon home page first (level 1), then each category's second-level page, and finally downloads every lexicon file under that category into a folder named "1" in the same directory as the script. I'm still a Python novice; if this helps you, feel free to take it.
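If you haven't used Beautiful Soup before, the core idea is just a couple of calls. Here is a minimal sketch of the level-1 extraction the full script below performs; it targets the tag id dict_category_show and the class names from the 2017-era page layout, which may have changed since:

# Minimal sketch: pull the top-level category titles from the dict index (Python 2).
from bs4 import BeautifulSoup
import urllib

html = urllib.urlopen("http://pinyin.sogou.com/dict/").read()
soup = BeautifulSoup(html, "html.parser")
for cate in soup.find(id="dict_category_show").find_all("div", class_="dict_category_list"):
    # Each category block carries a title link; no regex needed.
    title = cate.find(class_="dict_category_list_title").find("a").contents[0]
    print title.encode("utf-8")

Compared with regex matching, these selectors survive whitespace and attribute-order changes in the markup, which is exactly where hand-written patterns tend to break.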
#coding=utf-8
'''
Created on 2017-04-06

Crawl the Sogou lexicon index (level 1) and every category page (level 2),
then download each lexicon's .scel file (level 3) into the local folder "1".

@author: lenovo
'''
from bs4 import BeautifulSoup
import os
import sys
import urllib
def callbackfunc(blocknum, blocksize, totalsize):
    '''Progress hook passed to urllib.urlretrieve.
    @blocknum:  number of data blocks transferred so far
    @blocksize: size of each block in bytes
    @totalsize: total size of the remote file in bytes
    '''
    if totalsize <= 0:
        return  # some servers omit Content-Length; nothing to report
    percent = 100.0 * blocknum * blocksize / totalsize
    if percent > 100:
        percent = 100
    sys.stdout.write("\r%6.2f%%" % percent)
    sys.stdout.flush()
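# urlretrieve calls the hook once per transferred block, so a call like
#     urllib.urlretrieve(url, "foo.scel", callbackfunc)
# keeps rewriting a single console line (e.g. " 42.00%") until the file is done.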
if __name__ == "__main__":
    BaseUrl = "http://pinyin.sogou.com"
    HomePageUrl = "http://pinyin.sogou.com/dict/"
    SaveDir = "1"  # lexicon files are saved into ./1 next to the script
    if not os.path.isdir(SaveDir):
        os.makedirs(SaveDir)
    html = urllib.urlopen(HomePageUrl).read()
    soup = BeautifulSoup(html, "html.parser")
    # Level 1: the top-level category blocks on the index page.
    categories = soup.find(id="dict_category_show").find_all('div', class_='dict_category_list')
    fc = 0  # level-1 categories seen
    sc = 0  # level-2 sub-categories seen
    tc = 0  # level-3 lexicon files downloaded
    for ii in categories:
        fc += 1
        print "Level 1 :" + ii.find(class_='dict_category_list_title').find('a').contents[0]
        # Level 2: every sub-category link inside this category block.
        for k in ii.find(class_='catewords').find_all('a'):
            secondclass = k.contents[0]
            secondUrl = BaseUrl + "%s" % (k['href'])
            print " " * 4 + "Level 2 :" + secondclass  # + " " * 8 + secondUrl
            sc += 1
            soup2 = BeautifulSoup(urllib.urlopen(secondUrl).read(), "html.parser")
            # The second-to-last <span> in the pager holds the last page number.
            totalpagenum = soup2.find(id='dict_page_list').find('ul').find_all('span')[-2].a.contents[0]
            for pageind in range(1, int(totalpagenum) + 1):
                pageUrl = "%s/default/%d" % (secondUrl.replace("?rf=dictindex", ""), pageind)
                soup2 = BeautifulSoup(urllib.urlopen(pageUrl).read(), "html.parser")
                # Level 3: each lexicon entry on this page and its download link.
                for kk in soup2.find_all('div', class_='dict_detail_block'):
                    thirdclass = kk.find(class_='detail_title').find('a').contents[0]
                    thirdUrl = kk.find(class_='dict_dl_btn').a['href']
                    print " " * 8 + "Level 3 :" + thirdclass + " " * 10 + "Downloading....."
                    tc += 1
                    filename = os.path.join(SaveDir, "%s-%s.scel" % (secondclass, thirdclass))
                    urllib.urlretrieve(thirdUrl.encode('utf8'), filename, callbackfunc)
    print "Totals: %d categories, %d sub-categories, %d lexicons" % (fc, sc, tc)