1 #!/usr/bin/env python 2 3 import cStringIO # 4 import formatter # 5 from htmllib import HTMLParser # We use various classes in these modules for parsing HTML. 6 import httplib # We only need an exception from this module 7 import os # This provides various file system functions 8 import sys # We are just using argv for command-line arguments 9 import urllib # We only need the urlretrieve()function for downloading Web pages 10 import urlparse # We use the urlparse()and urljoin()functions for URL manipulation 11 12 class Retriever(object): 13 __slots__ = (\'url\',\'file\') 14 15 def __init__(self,url): 16 self.url, self.file = self.get_file(url) 17 18 def get_file(self, url, default=\'index.html\'): 19 \'Create usable local filename from URL\' 20 parsed = urlparse.urlparse(url) # ParseResult(scheme=\'http\', netloc=\'www.baidu.com\', path=\'\', params=\'\', query=\'\', fragment=\'\') 21 host = parsed.netloc.split(\'@\')[-1].split(\':\')[0] # \'www.baidu.com\' 22 filepath = \'%s%s\' % (host,parsed.path) # \'www.baidu.com\' 23 if not os.path.splitext(parsed.path)[1]: # \'\' 24 filepath = os.path.join(filepath, default) # \'www.baidu.com\\index.html\' 25 linkdir = os.path.dirname(filepath) # \'www.baidu.com\' 26 if not os.path.isdir(linkdir): # False 27 if os.path.exists(linkdir): # False 28 os.unlink(linkdir) 29 os.makedirs(linkdir) # make a directory named by link directory on the hard disc 30 return url, filepath 31 32 def download(self): 33 \'Download URL to specific name file\' 34 try: 35 retval = urllib.urlretrieve(self.url, self.file) 36 except (IOError, httplib.InvalidURL) as e: 37 retval = ((\'*** ERROR:bad URL "%s": %s\' % (self.url,e)),) 38 return retval 39 40 def parse_links(self): 41 \'Parse out the links found in downloaded HTML file\' 42 f = open(self.file, \'r\') 43 data = f.read() 44 f.close() 45 parser = HTMLParser(formatter.AbstractFormatter(formatter.DumbWriter(cStringIO.StringIO()))) 46 parser.feed(data) 47 parser.close() 48 return 
parser.anchorlist 49 50 class Crawler(object): 51 count = 0 # the number of objects downloaded from the internet 52 53 def __init__(self, url): 54 self.q = [url] # a queue of links to download 55 self.seen = set() # a set containing all the links that we have seen(downloaded) already 56 parsed = urlparse.urlparse(url) 57 host = parsed.netloc.split(\'@\')[-1].split(\':\')[0] 58 self.dom = \'.\'.join(host.split(\'.\')[-2:]) # \'b.a.i.d.u\' 59 60 def get_page(self, url, media=False): 61 \'Download page & parse links, add to queue if nec\' 62 r = Retriever(url) 63 fname = r.download()[0] # \'www.baidu.com\\index.html\' 64 if fname[0] == \'*\': # \'w\' 65 print fname, \'... skipping parse\' 66 return 67 Crawler.count += 1 # 1 68 print \'\n(\', Crawler.count, \')\' # (1) 69 print \'URL:\', url # URL: http://www.baidu.com 70 print \'FILE:\', fname # FILE: www.baidu.com\\index.html 71 self.seen.add(url) # set([\'http://www.baidu.com\']) 72 ftype = os.path.splitext(fname)[1] # \'.html\' 73 if ftype not in (\'.htm\', \'.html\'): # False 74 return 75 76 for link in r.parse_links(): 77 if link.startswith(\'mailto:\'): # False 78 print \'... discarded, mailto link\' 79 continue 80 if not media: # False 81 ftype = os.path.splitext(link)[1] 82 if ftype in (\'.mp3\',\'.mp4\',\'.m4v\',\'.wav\'): 83 print \'... discarded, media file\' 84 continue 85 if not link.startswith(\'http://\'): # False 86 link = urlparse.urljoin(url, link) 87 print \'*\', link, 88 if link not in self.seen: # True 89 if self.dom not in link: # False 90 print \'... discarded, not in domain\' 91 else: 92 if link not in self.q: 93 self.q.append(link) 94 print \'... new, added to Q\' 95 else: 96 print \'... discarded, already in Q\' 97 else: 98 print \'... 
discarded, already processed\' 99 100 def go(self, media=False): 101 \'Process next page in queue (if any)\' 102 while self.q: 103 url = self.q.pop() 104 self.get_page(url, media) 105 106 def main(): 107 if len(sys.argv) > 1: 108 url = sys.argv[1] 109 else: 110 try: 111 url = raw_input(\'Enter starting URL:\') 112 except(KeyboardInterrupt, EOFError): 113 url = \'\' 114 if not url: 115 return 116 if not url.startswith(\'http://\') and not url.startswith(\'ftp://\'): 117 url = \'http://%s/\' % url 118 robot = Crawler(url) 119 robot.go() 120 121 if __name__ == \'__main__\': 122 main()
# "相关文章:" ("Related articles:") — stray footer text from the web page this file was scraped from; kept as a comment so the module stays valid Python.