1.爬虫入门必备知识
爬取网站:https://movie.douban.com/top250?start=225&filter=
2.爬虫思路讲解:
a) 了解翻页url的变化规律
第一页:https://movie.douban.com/top250?start=0&filter=
第二页:https://movie.douban.com/top250?start=25&filter=
b) 了解每一页提取内容定位:
每一页包含25部电影
c) 了解如何提取每部电影的详细信息
3.完整代码:
#!/usr/bin/env python
#-*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient
# Browser-like request headers so Douban does not reject the spider's requests.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/80.0.3987.149 Safari/537.36'
    ),
}
class SpiderDouban(object):
    """Scrape one page of the Douban Top-250 list and store movies in MongoDB.

    One instance handles exactly one result page URL; duplicates already in
    the database are skipped.
    """

    def __init__(self, url):
        # URL of a single result page, e.g. .../top250?start=25&filter=
        self.url = url

    def get_collection(self):
        """Return the MongoDB collection used for storage (db 'spider', coll 'douban')."""
        client = MongoClient('localhost', 27017)
        database = client.spider
        collection = database.douban
        return collection

    def get_reponse(self):
        # NOTE: method name typo ('reponse') kept for backward compatibility.
        """Fetch self.url and return the decoded HTML, or the string 'None' on failure."""
        try:
            # timeout added so a stalled connection cannot hang the spider forever
            response = requests.get(self.url, headers=headers, timeout=10)
            # BUG FIX: raise_for_status was referenced without calling it,
            # so HTTP error statuses (403/404/...) were silently ignored.
            response.raise_for_status()
            response.encoding = response.apparent_encoding
            html = response.text
        except Exception:
            # Keep the original sentinel string so downstream code behaves
            # the same on failure.
            html = 'None'
        return html

    def get_soup(self, html):
        """Parse *html* into a BeautifulSoup tree (html.parser, xml as fallback)."""
        try:
            soup = BeautifulSoup(html, 'html.parser')
        except Exception:
            soup = BeautifulSoup(html, 'xml')
        return soup

    def get_items(self, soup):
        """Return the list of <li> movie entries on the page (25 per page)."""
        items = soup.select('div.article>ol>li')
        return items

    def get_item_content(self, item):
        """Extract title/people/star/comment from one <li> movie entry.

        Each field falls back to the string 'None' when not found, matching
        the original best-effort behavior.
        """
        # BUG FIX: the original selectors repeated the full page path
        # ('div.article>ol>li>div ...') while searching *inside* an <li>,
        # so they could never match and people/star/comment were always
        # 'None'. Select relative to the item instead.
        try:
            head = item.select('div.hd')[0].text.strip()
        except (IndexError, AttributeError):
            head = 'None'
        try:
            people = item.select("div.bd p[class='']")[0].text.strip().replace(' ', '')
        except (IndexError, AttributeError):
            people = 'None'
        try:
            star = item.select('div.bd div.star')[0].text.strip().replace('\n', ' ')
        except (IndexError, AttributeError):
            star = 'None'
        try:
            comment = item.select('div.bd p.quote')[0].text.strip()
        except (IndexError, AttributeError):
            comment = 'None'
        content = {
            'head': head,
            'people': people,
            'star': star,
            'comment': comment,
        }
        return content

    def start(self):
        """Scrape the page and insert each previously-unseen item into MongoDB."""
        collection = self.get_collection()
        html = self.get_reponse()
        soup = self.get_soup(html)
        items = self.get_items(soup)
        for item in items:
            content = self.get_item_content(item)
            # find_one(content) matches the full document, i.e. exact duplicate.
            if collection.find_one(content):
                print('\033[1;31m该item已经在数据库中,不进行存储\033[0m')
            else:
                collection.insert_one(content)
                print('\033[1;32m该item是新的, 进行存储\033[0m')
if __name__ == '__main__':
    # Douban Top-250 spans 10 pages of 25 movies (start=0, 25, ..., 225).
    base = 'https://movie.douban.com/top250?start={num}&filter='
    urls = [base.format(num=offset) for offset in range(0, 250, 25)]
    for page, url in enumerate(urls, start=1):
        print('\033[1;33m开始爬取第{page}页\033[0m'.format(page=page))
        spider = SpiderDouban(url)
        spider.start()