wuhen1

跟随udacity的cs101课程学习,今天学完了unit 3,写了一个爬虫程序:

import urllib2

def get_next_target(page):
    """Find the first '<a href=' link in *page*.

    Scans for the literal substring '<a href=' and extracts the text
    between the following pair of double quotes.

    Returns:
        (url, end_quote) where end_quote is the index of the closing
        quote (resume scanning from there), or (None, 0) when no link
        marker remains in the page.
    """
    # Bug fix: the pasted original had stray backslashes before the
    # quotes (\'...\'), which is a syntax error in Python.
    start_link = page.find('<a href=')
    if start_link == -1:
        return None, 0
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    url = page[start_quote + 1:end_quote]
    return url, end_quote

def get_all_links(page):
    """Collect every '<a href=' URL found in *page*, in order of appearance."""
    found = []
    url, endpos = get_next_target(page)
    # Keep consuming the page from just past each link until no more remain.
    while url:
        found.append(url)
        page = page[endpos:]
        url, endpos = get_next_target(page)
    return found

def crawl_web(seed):
    """Depth-first crawl starting from *seed*.

    Pops the most recently discovered URL first (stack behaviour via
    list.pop from the end), so the traversal is depth-first.

    Returns the list of URLs actually fetched, in fetch order.
    """
    tocrawl = [seed]
    crawled = []
    while tocrawl:
        link = tocrawl.pop()
        if link not in crawled:
            # Fetch the page source and queue its outgoing links.
            source = urllib2.urlopen(link).read()
            tocrawl += get_all_links(source)
            crawled.append(link)
    return crawled


# Seed URL for the crawl.
# Bug fix: the pasted original had stray backslashes before the quotes
# (\'...\'), which is a syntax error in Python.
link = 'http://www.udacity.com/cs101x/index.html'
##page = urllib2.urlopen('http://www.udacity.com/cs101x/index.html').read()
##links=get_all_links(page)
# print(...) with a single argument behaves the same in Python 2 and 3.
print(crawl_web(link))

注意的几点:

1. 抓取url里的html代码,要用到urllib2包里的urllib2.urlopen(link).read() 函数

2. 这里用的是pop(从列表末尾取出,相当于栈),故而是深度优先搜索(DFS)

下面是一个广搜的爬虫例子,只需要改一下crawl_web(),添加一个next_tocrawl变量,记录下一层的点:

def crawl_web(seed):
    """Breadth-first crawl starting from *seed*.

    Links discovered on the current level are accumulated in
    next_tocrawl and only promoted to tocrawl once the current level
    is exhausted, giving level-by-level (BFS) order.

    Returns the list of URLs actually fetched, in fetch order.
    """
    tocrawl = [seed]
    crawled = []
    next_tocrawl = []
    while len(tocrawl) > 0:
        link = tocrawl.pop()
        if link not in crawled:
            page = urllib2.urlopen(link).read()
            next_tocrawl = next_tocrawl + get_all_links(page)
            crawled.append(link)
        # Bug fix: the original misspelled 'tocrawl' as 'tocrwal' on the
        # next two lines, which raised NameError at runtime.
        if len(tocrawl) == 0:
            tocrawl, next_tocrawl = next_tocrawl, []
    return crawled

分类:

技术点:

相关文章: