Because Python code runs directly without a compile step, it works well as a scripting language
- Script: a simple Python program
- Program: a larger, more complex Python script
Crawler: a program that automatically fetches information from the internet
Value of a crawler
Analyze the crawled data and provide data-analysis services on top of it
Crawler architecture
Crawler scheduler: starts, runs, and monitors the crawl
1. URL manager
2. HTML downloader
3. HTML parser
Run flow
URL manager
- URL manager: manages the set of URLs waiting to be crawled and the set of URLs already crawled
- Prevents duplicate crawling and circular crawling
Steps
- Add a new URL to the to-crawl set
- Check whether the URL to be added is already in the container
- If it is a new URL, add it to the manager
- Get a URL to crawl (first check whether any URLs are waiting to be crawled)
- After crawling, move the URL from the to-crawl set to the crawled set
URL storage options
1. Memory (small projects)
Store the URLs in a set(): a set deduplicates automatically
2. Relational database (personal projects that need persistent storage; see the sketch after this list)
MySQL
urls(url, is_crawled)
# is_crawled marks whether the URL has already been crawled
3. Cache database (high-performance, large projects)
Redis
to-crawl set
crawled set
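A minimal sketch of the relational-database option, using Python's built-in sqlite3 as a stand-in for MySQL. The table follows the urls(url, is_crawled) schema above; the database file name and the helper functions are made up for the example.
import sqlite3

conn = sqlite3.connect('urls.db')
conn.execute('CREATE TABLE IF NOT EXISTS urls (url TEXT PRIMARY KEY, is_crawled INTEGER DEFAULT 0)')

def add_new_url(url):
    # the PRIMARY KEY on url rejects duplicates, like set() does in memory
    conn.execute('INSERT OR IGNORE INTO urls (url, is_crawled) VALUES (?, 0)', (url,))
    conn.commit()

def get_new_url():
    # take one uncrawled url and mark it as crawled
    row = conn.execute('SELECT url FROM urls WHERE is_crawled = 0 LIMIT 1').fetchone()
    if row is None:
        return None
    conn.execute('UPDATE urls SET is_crawled = 1 WHERE url = ?', (row[0],))
    conn.commit()
    return row[0]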
HTML downloader
A tool that downloads the web page behind an internet URL to the local machine
Stores the page as HTML in a local file or an in-memory string
1. urllib2
Official Python base module; supports cookie handling and proxy handling
2. requests
Third-party Python package; more powerful (a short sketch follows)
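The notes only name requests without showing it; a rough sketch of the same download with it (requests must be installed separately, and the URL is just the example used below):
import requests  # third-party: pip install requests

response = requests.get('http://www.baidu.com')
# 200 means the request succeeded
print(response.status_code)
# decoded page content as a string
content = response.text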
urllib2
Three ways to use it
1. The simplest: urllib2.urlopen(url)
import urllib2
# make the request directly
response = urllib2.urlopen('http://www.baidu.com')
# get the status code; 200 means the request succeeded
print response.getcode()
# read the content
cont = response.read()
2. Adding data and an HTTP header
Pass the url, data, and header to the urllib2.Request class, then pass the Request object to urlopen as the request
Compared with method 1, the url argument becomes a Request object
import urllib2
# create a Request object
request = urllib2.Request(url)
# add POST data (add_data takes a url-encoded string)
request.add_data('a=1')
# add an HTTP header so the request looks like a Mozilla/5.0 browser
request.add_header('User-Agent', 'Mozilla/5.0')
# same as method 1
response = urllib2.urlopen(request)
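Under Python 3 the same pattern moves to urllib.request and urllib.parse; a minimal sketch (the url and the a=1 payload are just placeholders carried over from the snippet above):
from urllib import request, parse

url = 'http://www.baidu.com'
# url-encode the form data; Request sends it as the POST body
data = parse.urlencode({'a': '1'}).encode('utf-8')
req = request.Request(url, data=data, headers={'User-Agent': 'Mozilla/5.0'})
response = request.urlopen(req)
print(response.getcode())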
3. Adding handlers for special scenarios
(This snippet is Python 2; the Python 3 version appears in the project example below)
# cookie handling
import urllib2, cookielib
# create a cookie container
cj = cookielib.CookieJar()
# wrap the cookie jar in an HTTPCookieProcessor and pass it to build_opener
# create an opener
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
# install the opener on urllib2
urllib2.install_opener(opener)
# access pages with cookie-aware urllib2
response = urllib2.urlopen("http://www.baidu.com/")
Project example (Python 3)
from http import cookiejar
import urllib.request

url = "http://www.baidu.com"

print("Method 1")
response1 = urllib.request.urlopen(url)
print(response1.getcode())
print(len(response1.read()))

print("Method 2")
request = urllib.request.Request(url)
request.add_header("user-agent", "Mozilla/5.0")
response2 = urllib.request.urlopen(request)
print(response2.getcode())
print(len(response2.read()))

print("Method 3")
cj = cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
urllib.request.install_opener(opener)
response3 = urllib.request.urlopen(url)
print(response3.getcode())
print(cj)
print(response3.read())
HTML parser
A tool that extracts the valuable data from a web page
Several kinds of HTML parsers
BeautifulSoup is a third-party parser; it can use either the built-in html.parser or lxml
Structured parsing
Loads the whole document into a DOM (Document Object Model) tree and parses it as a structure
The officially defined model for parsing a web page (a small navigation sketch follows)
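As a rough illustration of the DOM-tree idea, BeautifulSoup lets you walk the parsed tree by tag name; the tiny HTML string here is made up for the example.
from bs4 import BeautifulSoup

doc = '<html><body><p class="intro">hello</p></body></html>'
soup = BeautifulSoup(doc, 'html.parser')
# every tag becomes a node in the tree, and children are reachable by name
p = soup.html.body.p
print(p.name)        # p
print(p['class'])    # ['intro']  (class is multi-valued, so a list)
print(p.get_text())  # hello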
BeautifulSoup4
1. Installation
conda install beautifulsoup4
Test whether the installation succeeded
import bs4
print(bs4)
Syntax
1. Create a BeautifulSoup object; the document string is loaded into a DOM tree
Search the DOM tree with
- find_all (returns all matching nodes)
- find (returns only the first matching node)
Access a node's name, text, and attributes
Example
from bs4 import BeautifulSoup
# create a BeautifulSoup object from an HTML document string
soup = BeautifulSoup(
    html_doc,                # HTML document string
    'html.parser',           # HTML parser
    from_encoding='utf-8')   # document encoding
Search nodes (find_all, find)
# signature: find_all(name, attrs, string)
# find all nodes whose tag is a
soup.find_all('a')
# find all a nodes whose link has the form /view/123.html (the regex form needs import re)
soup.find_all('a', href='/view/123.html')
soup.find_all('a', href=re.compile(r'/view/\d+\.html'))
# find all div nodes whose class is abc and whose text is Python
soup.find_all('div', class_='abc', string='Python')
# class_: the trailing underscore avoids a clash with the Python keyword class
Access node information
# given the node: <a href='1.html'>Python</a>
# get the tag name of the node
node.name
# get the href attribute of the a node
node['href']
# get the link text of the a node
node.get_text()
Complete example
https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/
from bs4 import BeautifulSoup
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse\'s story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc, 'html.parser', from_encoding='utf-8')
print('Get all the links')
links = soup.find_all('a')
for link in links:
    print(link.name, link['href'], link.get_text())
print('Get the link for lacie')
link_node1 = soup.find('a', href='http://example.com/lacie')
print(link_node1.name, link_node1['href'], link_node1.get_text())
# fuzzy match on "ill"
import re
print('Regex match')
link_node2 = soup.find('a', href=re.compile(r"ill"))
print(link_node2.name, link_node2['href'], link_node2.get_text())
# get the content of the node with a given class
print('Get the paragraph text of p')
p_node = soup.find('p', class_="title")
print(p_node.name, p_node.get_text())
Crawler development example
1. Determine the target: decide which page information to crawl
2. Analyze the target: determine the URL format to limit the crawl scope
3. Analyze the format of the data to extract
4. Analyze the page encoding and specify it when parsing
5. Write the code
6. Run the crawler
Target: the Baidu Baike "Python" entry and about 1000 related entry pages - title and summary
Analysis
Entry page:
https://baike.baidu.com/item/Python/407313.html
URL format:
- Entry page URL: /view/125370.html
Data format:
- Title:
<dd class="lemmaWgt-lemmaTitle-title"><h1>***</h1></dd>
- Summary:
<div class="lemma-summary">***</div>
Page encoding:
UTF-8
Source code
Directory structure
- spider_main (crawler scheduler)
- html_downloader (page downloader)
- url_manager (URL manager)
- html_parser (HTML parser)
- html_outputer (collects and outputs the crawled data)
spider_main.py
# coding: utf-8
from baidubk import url_manager, html_downloader, html_outputer, html_parser


class SpiderMain(object):
    def __init__(self):
        # wire up the url manager, downloader, parser, and outputer
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutput()

    def craw(self, root_url):
        count = 1
        self.urls.add_new_url(root_url)
        # while there are still urls waiting to be crawled:
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print('craw %d : %s' % (count, new_url))
                # let the downloader fetch the page
                html_cont = self.downloader.download(new_url)
                # let the parser turn the page into a new url list and the page data
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
                if count == 10:
                    break
                count = count + 1
            except:
                print('craw failed')
        self.outputer.output_html()


# main entry point
if __name__ == "__main__":
    root_url = "https://baike.baidu.com/item/Python/407313.html"
    obj_spider = SpiderMain()
    # craw() starts the crawler
    obj_spider.craw(root_url)
url_manager.py
# coding: utf-8


class UrlManager(object):
    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    # add a single new url to the manager
    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    # add several new urls to the manager
    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    # check whether the manager still holds urls waiting to be crawled
    def has_new_url(self):
        return len(self.new_urls) != 0

    # get one url waiting to be crawled
    def get_new_url(self):
        # pop removes and returns one url from the set
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
html_parser.py
# coding: utf-8
from bs4 import BeautifulSoup
import re
from urllib import parse


class HtmlParser(object):
    def _get_new_urls(self, page_url, soup):
        new_urls = set()
        links = soup.find_all('a', href=re.compile(r"[\s\S]*"))
        for link in links:
            new_url = link['href']
            # urljoin combines the page url and the relative link into one complete url
            new_full_url = parse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    # extract the title and summary of the page
    def _get_new_data(self, page_url, soup):
        res_data = {}
        res_data['url'] = page_url
        # <dd class="lemmaWgt-lemmaTitle-title">
        title_node = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find('h1')
        res_data['title'] = title_node.get_text()
        # <div class="lemma-summary" label-module="lemmaSummary">
        summary_node = soup.find('div', class_='lemma-summary')
        res_data['summary'] = summary_node.get_text()
        return res_data

    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            print('page_url is None or html_cont is None')
            return None
        # the downloader returns bytes, so decode them before parsing
        html_conts = html_cont.decode('utf-8')
        soup = BeautifulSoup(html_conts, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data
html_downloader.py
# coding: utf-8
import urllib.request


class HtmlDownloader(object):
    def download(self, url):
        print('Here is download(), url:', url)
        if url is None:
            print('But url is None')
            return None
        print('URL is not None')
        response = urllib.request.urlopen(url)
        print("Get response is over")
        if response.getcode() != 200:
            print("response.getcode() != 200")
            return None
        return response.read()
html_outputer.py
# coding: utf-8


class HtmlOutput(object):
    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        # write the collected data out as a simple HTML table
        fout = open('output.html', 'w', encoding='utf-8')
        fout.write("<html>")
        fout.write("<body>")
        fout.write("<table>")
        for data in self.datas:
            fout.write("<tr>")
            fout.write("<td>%s</td>" % data['url'])
            fout.write("<td>%s</td>" % data['title'])
            fout.write("<td>%s</td>" % data['summary'])
            fout.write("</tr>")
        fout.write("</table>")
        fout.write("</body>")
        fout.write("</html>")
        fout.close()