Douban Movie TOP250 and Book TOP250 Crawlers
I recently started playing with Python, and while learning about web scraping I crawled the Douban Movie TOP250 and Book TOP250 lists on a whim. This post records the process.
Movie TOP250 Crawler
import requests
from bs4 import BeautifulSoup
import time


def getlist(list_url):
    # Be polite: wait two seconds before each page request.
    time.sleep(2)
    res = requests.get(list_url)
    soup = BeautifulSoup(res.text, 'html.parser')
    movie_list = soup.select('.grid_view li')
    for m in movie_list:
        rank = m.select('em')[0].text
        score = m.select('.rating_num')[0].text
        title = m.select('.title')[0].text
        direct = m.select('.info .bd p')[0].text.strip()
        actor = '\n主演:'.join(direct.split(' 主演:'))
        director = '年代:'.join(actor.split(' '))
        # Not every movie has a one-line quote (.inq).
        if m.select('.inq'):
            comments = m.select('.inq')[0].text.strip()
        else:
            comments = 'None'
        movie.append(
            '排名: ' + rank + '\n'
            + '评分: ' + score + '\n'
            + '片名: ' + title + '\n'
            + director + '\n'
            + '评论: ' + comments + '\n'
            + '\n')
    # Follow the "next page" link recursively until the last page.
    if soup.select('.next a'):
        asoup = soup.select('.next a')[0]['href']
        next_page = seed_url + asoup
        getlist(next_page)
    else:
        print('结束')
    return movie


def write(movies):
    with open('movie.txt', 'w', encoding='utf8') as m:
        for a in movies:
            m.write(a)


def main():
    write(getlist(seed_url))


if __name__ == '__main__':
    seed_url = 'https://movie.douban.com/top250'
    movie = []
    main()
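The selectors above follow the structure of the list page: each movie is an li under .grid_view, with the rank in an em tag, the score in .rating_num, the title in .title, and the optional one-line quote in .inq. As a quick sanity check of that logic, here is a minimal sketch run against a made-up HTML fragment (the fragment only imitates the class structure getlist() relies on; it is not Douban's real markup):

from bs4 import BeautifulSoup

# Made-up fragment that only mimics the classes getlist() expects.
html = '''
<ol class="grid_view">
  <li>
    <em>1</em>
    <div class="info">
      <span class="title">肖申克的救赎</span>
      <div class="bd"><p>导演: Frank Darabont 主演: Tim Robbins</p></div>
      <span class="rating_num">9.7</span>
      <span class="inq">希望让人自由。</span>
    </div>
  </li>
</ol>
'''

soup = BeautifulSoup(html, 'html.parser')
m = soup.select('.grid_view li')[0]
print(m.select('em')[0].text)                   # rank
print(m.select('.rating_num')[0].text)          # score
print(m.select('.title')[0].text)               # title
print(m.select('.info .bd p')[0].text.strip())  # director / cast line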
Book TOP250 Crawler
import bs4
import requests
import re
from bs4 import BeautifulSoup
from operator import itemgetter


def getHtmlText(url):
    try:
        r = requests.get(url)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return ""


def parserText(text, book_list):
    soup = BeautifulSoup(text, 'html.parser')
    # Each book on the list page sits in its own <table width="100%">.
    for table in soup('table', {'width': '100%'}):
        if isinstance(table, bs4.element.Tag):
            tds = table.find('tr')('td')
            divs = tds[1]('div')
            content = {}
            for div in divs:
                if isinstance(div, bs4.element.Tag):
                    if div.find('a'):
                        name = div.find('a').attrs['title']
                        content.update({"书名": name})
                    if div.select('.rating_nums'):
                        score = div.select('.rating_nums')[0].text
                        content.update({"评分": score})
                    if div.select('.pl'):
                        people_num = div.select('.pl')[0].text
                        regex = re.compile(r'[\d]{1,10}')
                        content.update({"评价人数": regex.findall(people_num)[0]})
            ps = tds[1]('p')
            for p in ps:
                if isinstance(p, bs4.element.Tag):
                    if p.attrs['class'][0] == 'quote':
                        description = p.find('span').string
                        content.update({"介绍": description})
                    if p.attrs['class'][0] == 'pl':
                        author = p.string
                        content.update({"作者信息": author})
            book_list.append(content)
    # Recurse into the next page if there is one.
    next_books = soup.find('span', {'class': 'next'})
    if next_books.find('a'):
        a = next_books.find('a').attrs['href']
        text = getHtmlText(a)
        parserText(text, book_list)
    return book_list


def sortedBookTop250(book_list):
    tmp = sorted(book_list, key=itemgetter('评分'), reverse=True)
    for i in range(len(tmp)):
        tmp[i].update({"排名": i + 1})
    return tmp


def writeToFile(book_list):
    with open('good_books.txt', 'w', encoding='utf8') as book_file:
        for book in book_list:
            for key, value in book.items():
                book_file.write(f'{key}:{value}\n')
            book_file.write('\n')


def main():
    text = getHtmlText(seed_url)
    book_list = parserText(text, books)
    writeToFile(sortedBookTop250(book_list))


if __name__ == '__main__':
    seed_url = "https://book.douban.com/top250"
    books = []
    main()
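One detail worth noting in sortedBookTop250: the scores scraped from the page are strings like '9.6', and because the TOP250 scores all share the same single-digit 'x.y' format, sorting them as strings happens to give the same order as sorting numerically. A tiny sketch with two invented entries shows the sort-then-rank step:

from operator import itemgetter

# Two invented entries, just to illustrate the sort-then-rank behavior.
sample = [{"书名": "书A", "评分": "8.9"}, {"书名": "书B", "评分": "9.4"}]
ranked = sorted(sample, key=itemgetter('评分'), reverse=True)
for i in range(len(ranked)):
    ranked[i].update({"排名": i + 1})
print(ranked)  # 书B (9.4) gets 排名 1, 书A (8.9) gets 排名 2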
Summary
- Click to view my GitHub
- Click to view my personal blog
- Make a little progress every day; don't expect overnight success.
The code above is pasted as-is. These are two very simple scripts built mainly on the requests and BeautifulSoup libraries; feel free to take them, or grab movies.txt and good_books.txt straight from my GitHub.
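If you have never used these two libraries, both scripts reduce to the same pattern: fetch a page with requests, hand the HTML to BeautifulSoup, and pull out elements with CSS selectors or tag lookups. A minimal, self-contained sketch of that pattern (the URL and the h1 selector here are placeholders, not part of the crawlers above):

import requests
from bs4 import BeautifulSoup

# Placeholder URL and selector; swap in whatever page and elements you care about.
resp = requests.get('https://example.com')
resp.raise_for_status()
soup = BeautifulSoup(resp.text, 'html.parser')
for h1 in soup.select('h1'):
    print(h1.text.strip())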