# -*- coding: utf-8 -*-
"""Scrape the Douban Books Top 250 listing and append it to a text file.

Each listing page is downloaded, cut into one HTML fragment per book with
regular expressions, reduced to a small dict per book, and appended to
``bookList.txt`` in the current working directory.

The module runs on both Python 2 and Python 3: text is handled as unicode
throughout and the output file is always written as UTF-8, so no
interpreter-wide default-encoding override is needed.
"""
import io
import re
import sys

import requests

# Listing-page URL template; the driver requests offsets 0, 25, ..., 225.
PAGE_URL = 'http://book.douban.com/top250?start={}'
PAGE_SIZE = 25
TOTAL_BOOKS = 250

OUTPUT_FILE = 'bookList.txt'
REQUEST_TIMEOUT = 10  # seconds; without a timeout requests may block forever

# One <table width="100%"> element wraps each book on a listing page.
_BOOK_BLOCK_RE = re.compile(r'<table width="100%">(.*?)</table>', re.S)
_TITLE_RE = re.compile(r'<a href=.*?>(.*?)</a>', re.S)
_AUTHOR_RE = re.compile(r'<p class="pl">(.*?)</p>', re.S)
_DISCUSS_RE = re.compile(r'<span class="pl">\((.*?)\)</span>', re.S)
_SCORE_RE = re.compile(r'<span class="rating_nums">(.*?)</span>', re.S)

# Markup and whitespace stripped from the captured title / rating-count text.
_TITLE_NOISE_RE = re.compile(r'( |\n|<br/>|</?span.*?>)')
_COUNT_NOISE_RE = re.compile(r'( |\n|<br/>)')


def _first_group(pattern, text):
    """Return group 1 of the first match of *pattern* in *text*, or u''.

    Returning an empty string instead of raising keeps one malformed book
    fragment from aborting the whole page.
    """
    match = pattern.search(text)
    return match.group(1) if match else u''


class Spider(object):
    """Fetch, parse and store the books shown on Douban Top 250 pages."""

    def __init__(self):
        print(u'开始爬取豆瓣图书top250的内容。。。。。。')

    def getSourceCode(self, url, timeout=REQUEST_TIMEOUT):
        """Download *url* and return the response body as text.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The decoded response body.

        Raises:
            requests.RequestException: If the request fails or times out.
        """
        response = requests.get(url, timeout=timeout)
        return response.text

    def getEveryBookContent(self, sourceCode):
        """Split a listing page into one HTML fragment per book.

        Args:
            sourceCode: HTML text of a listing page.

        Returns:
            A list with the inner HTML of every ``<table width="100%">``
            element found; empty when there are none.
        """
        return _BOOK_BLOCK_RE.findall(sourceCode)

    def getBookInfo(self, eachBookContent):
        """Extract the fields of a single book from its HTML fragment.

        Args:
            eachBookContent: One fragment as returned by
                ``getEveryBookContent``.

        Returns:
            A dict with the keys ``title``, ``author``, ``discussNum`` and
            ``score``. A field whose markup is absent from the fragment is
            returned as an empty string.
        """
        title = _first_group(_TITLE_RE, eachBookContent)
        discuss = _first_group(_DISCUSS_RE, eachBookContent)
        return {
            # The link text carries layout whitespace and <span>/<br/> tags.
            'title': _TITLE_NOISE_RE.sub('', title),
            'author': _first_group(_AUTHOR_RE, eachBookContent),
            # Text found between the parentheses of <span class="pl">(...)</span>.
            'discussNum': _COUNT_NOISE_RE.sub('', discuss),
            'score': _first_group(_SCORE_RE, eachBookContent),
        }

    def saveBookInfo(self, bookList, path=OUTPUT_FILE):
        """Append every book in *bookList* to the output file.

        Args:
            bookList: Iterable of dicts as produced by ``getBookInfo``.
            path: File to append to; created if it does not exist.
        """
        # io.open gives the same text-mode/encoding semantics on Python 2
        # and 3, and the context manager closes the file even on error.
        with io.open(path, 'a', encoding='utf-8') as output:
            for each in bookList:
                output.write(u'书 名:\t {}\n'.format(each['title']))
                output.write(u'作 者:\t {}\n'.format(each['author']))
                output.write(u'评论数:\t {}\n'.format(each['discussNum']))
                output.write(u'评 分:\t {}\n\n'.format(each['score']))

    def start(self, url):
        """Fetch one listing page, parse its books and append them to disk.

        Args:
            url: Address of the listing page to process.
        """
        sourceCode = self.getSourceCode(url)
        fragments = self.getEveryBookContent(sourceCode)
        bookList = [self.getBookInfo(fragment) for fragment in fragments]
        self.saveBookInfo(bookList)


def main():
    """Process every listing page (offsets 0 through 225 in steps of 25)."""
    douban = Spider()
    for offset in range(0, TOTAL_BOOKS, PAGE_SIZE):
        douban.start(PAGE_URL.format(offset))
    return 0


if __name__ == '__main__':
    sys.exit(main())