打算分以下几个部分进行:
1. 用python写一个爬虫爬取网易新闻
2. 用分词工具对爬下来的文字进行处理, 形成语料库
3. 根据TF-IDF, 自动找出新闻的关键词
4. 根据TF-IDF, 实现相似新闻推荐
Step 1a
今天一天都在弄python爬虫, 花了好大力气才写出一个勉强可用的版本
1 # -*- coding: utf-8 -* 2 3 import re, urllib, sys 4 import pyodbc 5 6 newsLink = set()##获取的所有新闻 7 processLink = set()##正在处理的新闻 8 newLink = set()##新读取的新闻 9 viewedLink = set()##已经读取过的新闻 10 11 ##打开输入的链接, 用正则表达式找出新页面中其他的链接, 并添加到全局set中 12 def getNewsLink(link): 13 ##print link 14 if(link in viewedLink): 15 return 16 viewedLink.add(link) 17 content = "" 18 try:##这一步可能会抛出异常 19 content = urllib.urlopen(link).read().decode('gbk').encode('utf-8') 20 except: 21 info=sys.exc_info() 22 print info[0],":",info[1] 23 print "caused by link : ", link 24 m = re.findall(r"news\.163\.com/\d{2}/\d{4}/\d{2}/\w+\.html",content,re.M)##网易新闻链接格式为http://news.163.com/14/0621/12/9V8V9AL60001124J.html 25 for i in m: 26 url = "http://" + i 27 newLink.add(url) 28 newsLink.add(url) 29 print "crawled %d page, get %d link"%(len(viewedLink), len(newsLink)) 30 31 ##将读取到的新闻ID存入数据库中 32 def saveNewsIDtoDB(): 33 newsID = dict() 34 for link in newsLink: 35 ID = link[31:47] 36 newsID[ID] = link##截取其中新闻ID 37 conn = pyodbc.connect('DRIVER={SQL Server};SERVER=STEVEN-PC\\MSSQLSERVER_R2;DATABASE=TF-IDF;UID=sa;PWD=123456') 38 cursor = conn.cursor() 39 for (ID, url) in newsID.items(): 40 sql = "INSERT INTO News(NewsID, Url) VALUES ('%s','%s')"%(ID, url) 41 try: 42 cursor.execute(sql) 43 except: 44 info=sys.exc_info() 45 print info[0],":",info[1] 46 print "caused by sql : ", sql 47 conn.commit() 48 conn.close() 49 print "total get %d news ID"%(len(newsID)) 50 51 ##读取指定数量的新闻 52 def readNews(count): 53 processLink = set() 54 processLink.add("http://news.163.com/") 55 while(len(newsLink) < count): 56 for link in processLink: 57 getNewsLink(link) 58 processLink = newLink.copy() 59 newLink.clear() 60 61 readNews(10000) 62 saveNewsIDtoDB()