1 #!/usr/bin/env python 2 3 import cStringIO # 4 import formatter # 5 from htmllib import HTMLParser # We use various classes in these modules for parsing HTML. 6 import httplib # We only need an exception from this module 7 import os # This provides various file system functions 8 import sys # We are just using argv for command-line arguments 9 import urllib # We only need the urlretrieve()function for downloading Web pages 10 import urlparse # We use the urlparse()and urljoin()functions for URL manipulation 11 12 class Retriever(object): 13 __slots__ = (\'url\',\'file\') 14 15 def __init__(self,url): 16 self.url, self.file = self.get_file(url) 17 18 def get_file(self, url, default=\'index.html\'): 19 \'Create usable local filename from URL\' 20 parsed = urlparse.urlparse(url) # ParseResult(scheme=\'http\', netloc=\'www.baidu.com\', path=\'\', params=\'\', query=\'\', fragment=\'\') 21 host = parsed.netloc.split(\'@\')[-1].split(\':\')[0] # \'www.baidu.com\' 22 filepath = \'%s%s\' % (host,parsed.path) # \'www.baidu.com\' 23 if not os.path.splitext(parsed.path)[1]: # \'\' 24 filepath = os.path.join(filepath, default) # \'www.baidu.com\\index.html\' 25 linkdir = os.path.dirname(filepath) # \'www.baidu.com\' 26 if not os.path.isdir(linkdir): # False 27 if os.path.exists(linkdir): # False 28 os.unlink(linkdir) 29 os.makedirs(linkdir) # make a directory named by link directory on the hard disc 30 return url, filepath 31 32 def download(self): 33 \'Download URL to specific name file\' 34 try: 35 retval = urllib.urlretrieve(self.url, self.file) 36 except (IOError, httplib.InvalidURL) as e: 37 retval = ((\'*** ERROR:bad URL "%s": %s\' % (self.url,e)),) 38 return retval 39 40 def parse_links(self): 41 \'Parse out the links found in downloaded HTML file\' 42 f = open(self.file, \'r\') 43 data = f.read() 44 f.close() 45 parser = HTMLParser(formatter.AbstractFormatter(formatter.DumbWriter(cStringIO.StringIO()))) 46 parser.feed(data) 47 parser.close() 48 return 
parser.anchorlist 49 50 class Crawler(object): 51 count = 0 # the number of objects downloaded from the internet 52 53 def __init__(self, url): 54 self.q = [url] # a queue of links to download 55 self.seen = set() # a set containing all the links that we have seen(downloaded) already 56 parsed = urlparse.urlparse(url) 57 host = parsed.netloc.split(\'@\')[-1].split(\':\')[0] 58 self.dom = \'.\'.join(host.split(\'.\')[-2:]) # \'b.a.i.d.u\' 59 60 def get_page(self, url, media=False): 61 \'Download page & parse links, add to queue if nec\' 62 r = Retriever(url) 63 fname = r.download()[0] # \'www.baidu.com\\index.html\' 64 if fname[0] == \'*\': # \'w\' 65 print fname, \'... skipping parse\' 66 return 67 Crawler.count += 1 # 1 68 print \'\n(\', Crawler.count, \')\' # (1) 69 print \'URL:\', url # URL: http://www.baidu.com 70 print \'FILE:\', fname # FILE: www.baidu.com\\index.html 71 self.seen.add(url) # set([\'http://www.baidu.com\']) 72 ftype = os.path.splitext(fname)[1] # \'.html\' 73 if ftype not in (\'.htm\', \'.html\'): # False 74 return 75 76 for link in r.parse_links(): 77 if link.startswith(\'mailto:\'): # False 78 print \'... discarded, mailto link\' 79 continue 80 if not media: # False 81 ftype = os.path.splitext(link)[1] 82 if ftype in (\'.mp3\',\'.mp4\',\'.m4v\',\'.wav\'): 83 print \'... discarded, media file\' 84 continue 85 if not link.startswith(\'http://\'): # False 86 link = urlparse.urljoin(url, link) 87 print \'*\', link, 88 if link not in self.seen: # True 89 if self.dom not in link: # False 90 print \'... discarded, not in domain\' 91 else: 92 if link not in self.q: 93 self.q.append(link) 94 print \'... new, added to Q\' 95 else: 96 print \'... discarded, already in Q\' 97 else: 98 print \'... 
discarded, already processed\' 99 100 def go(self, media=False): 101 \'Process next page in queue (if any)\' 102 while self.q: 103 url = self.q.pop() 104 self.get_page(url, media) 105 106 def main(): 107 if len(sys.argv) > 1: 108 url = sys.argv[1] 109 else: 110 try: 111 url = raw_input(\'Enter starting URL:\') 112 except(KeyboardInterrupt, EOFError): 113 url = \'\' 114 if not url: 115 return 116 if not url.startswith(\'http://\') and not url.startswith(\'ftp://\'): 117 url = \'http://%s/\' % url 118 robot = Crawler(url) 119 robot.go() 120 121 if __name__ == \'__main__\': 122 main()
# "相关文章:" ("Related articles:") — stray footer text from the web page this file was scraped from; kept as a comment so the module stays valid Python.