Scraping the Bilibili Rankings and Storing Them in MySQL
Purpose
Bilibili is the site I spend the most time on, and I was recently assigned a web-scraping course project: pick a website, scrape it, then store the data and visualize it.
Site structure
Target site: the Bilibili rankings
Page layout
- First, decide what to extract: the title, play count, comment count, uploader (UP主), and composite score
- Then locate that information in the page source
Each ranking page contains a list of rank-item entries like the one below; everything we want sits inside them
<li class="rank-item">
<div class="num">3</div>
<div class="content">
<div class="img">
<a href="//bangumi.bilibili.com/anime/28016" target="_blank">
<div class="lazy-img cover"><img alt="女高中生的虚度日常" src=""></div>
</a>
<!---->
</div>
<div class="info">
<a href="//bangumi.bilibili.com/anime/28016" target="_blank"class="title">
女高中生的虚度日常
</a>
<div class="bangumi-info">全12话</div>
<div class="detail">
<span class="data-box">
<i class="b-icon play"></i>
3491.1万
</span>
<span class="data-box">
<i class="b-icon view"></i>
74.3万
</span>
<span class="data-box">
<i class="fav">
</i>
176.4万
</span></div>
<div class="pts">
<div>2218000</div>综合得分
</div>
</div>
</div>
</li>
1. The title is in the a tag with class title
2. The play count, comment count, and uploader are in the span tags with class data-box
3. The composite score is in the div with class pts
The corresponding parsing code:
def getPage(url):  # scrape a single ranking page; the core of the crawler
    spider = Spider(url)
    spider.setSoup()
    itemList = spider.findTagByAttrs('li', 'rank-item')
    pageContentList = []
    for item in itemList:
        pageContentItem = []
        # 1. the title sits in the a tag with class "title"
        for title in item.find_all('a', 'title'):
            pageContentItem.append(title.string)
        # 2. the data-box spans hold play count, comment count and uploader, in order
        for playnum in item.find_all('span', 'data-box'):
            pattern = r">([^<]+)<"
            n = re.findall(pattern, str(playnum))[0]
            pageContentItem.append(n)
        # 3. the composite score is the inner div of the "pts" div
        pageContentItem.append(item.find_all('div', 'pts')[0].div.string)
        pageContentList.append(pageContentItem)
    return pageContentList
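Each entry in the returned list has the shape [title, playnum, commentnum, author, score]. A quick sanity check (a sketch; if Bilibili has changed the page markup since this was written, the selectors come back empty and so will the list):

rows = getPage('https://www.bilibili.com/ranking/all/0/0/3')
for row in rows[:3]:
    print(row)  # e.g. a title followed by stat strings such as '3491.1万'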
URL structure
Observe how the link parameters change, using
https://www.bilibili.com/ranking/all/0/0/3
as an example. Experimenting shows what each part of the path means: ranking selects the ranking pages; all chooses between the all-site list and the original-content (origin) list; the first 0 selects the category; the second 0 selects all submissions versus recent submissions; and the trailing 3 restricts the list to videos posted within the last three days. From these observed rules, the code below generates the links. Only the all-site and original rankings follow this pattern, so I haven't scraped the other list types yet.
def getURLFormBilibili():  # build the URL for every ranking variant
    date = {
        1: '日排行',
        3: '三日排行',
        7: '周排行',
        30: '月排行'
    }
    areatype = {
        0: '全站',
        1: '动画',
        168: '国漫相关',
        3: '音乐',
        129: '舞蹈',
        4: '游戏',
        36: '科技',
        188: '数码',
        160: '生活',
        119: '鬼畜',
        155: '时尚',
        5: '娱乐',
        181: '影视'
    }
    ranktype = {
        'all': '全站',
        'origin': '原创'
    }
    submit = {
        '0': '全部投稿',
        '1': '近期投稿'
    }
    urlDict = {}  # maps a human-readable title to its ranking URL
    for ranktypeItem in ranktype.keys():
        for areatypeItem in areatype.keys():
            for submitItem in submit.keys():
                for dateTypeItem in date.keys():
                    title = ranktype[ranktypeItem] + '_' + areatype[areatypeItem] + '_' + submit[submitItem] + '_' + date[dateTypeItem]
                    destinationUrl = 'https://www.bilibili.com/ranking/{}/{}/{}/{}'.format(ranktypeItem, areatypeItem, submitItem, dateTypeItem)
                    urlDict[title] = destinationUrl
    return urlDict
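The four nested loops yield 2 × 13 × 2 × 4 = 208 title/URL pairs. On Python 3.7+ dicts keep insertion order, so a peek at the first pairs looks like this (a sketch of the expected output):

urls = getURLFormBilibili()
for title, url in list(urls.items())[:2]:
    print(title, '->', url)
# 全站_全站_全部投稿_日排行 -> https://www.bilibili.com/ranking/all/0/0/1
# 全站_全站_全部投稿_三日排行 -> https://www.bilibili.com/ranking/all/0/0/3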
Saving to MySQL
I use the pymysql library here; it installs with pip, so I won't belabor that. For convenience I wrapped the database access in a class.
class MysqlConnect:  # small helper around the database connection
    def __init__(self):
        pass
    def getConnect(self):
        db = pymysql.connect(
            host='localhost', user='root', passwd='your-password',
            port=3306, db='bilibilirank', charset='utf8'
            # port must be an int
            # charset must be 'utf8', not 'utf-8'
        )
        return db
    def insertInfo(self, sql):
        db = self.getConnect()
        cursor = db.cursor()
        try:
            cursor.execute(sql)
            db.commit()
            print("succeeded...")
        except:
            print("failed...")
            db.rollback()
    def queryOutCome(self, sql):
        # get a database connection
        db = self.getConnect()
        # create a cursor object with cursor()
        cursor = db.cursor()
        result = None  # stays None if the query fails
        try:
            # execute the sql statement
            cursor.execute(sql)
            result = cursor.fetchone()
        except:
            # print the exception with the traceback module
            traceback.print_exc()
            # roll back on error
            db.rollback()
        finally:
            # always close the connection
            db.close()
        return result
    def getCreateTableSql(self, tableName):  # build the create-table statement
        sql = '''
        create table `{}` (
            id int(11) auto_increment primary key,
            title char(100) NOT NULL UNIQUE,
            playnum char(100) NOT NULL,
            commentnum char(100) NOT NULL,
            author char(100) NOT NULL,
            score char(100) NOT NULL
        ) ENGINE=innodb DEFAULT CHARSET=utf8;
        '''.format(tableName)
        return sql
    def getInsertToTableSql(self, tableName, title, playnum, commentnum, author, score):
        sql = '''
        insert into `{}` values(null, '{}', '{}', '{}', '{}', '{}');
        '''.format(tableName, title, playnum, commentnum, author, score)
        return sql
    def createTable(self, tableName, sql):  # drop any stale copy, then create the table
        db = self.getConnect()
        cursor = db.cursor()
        cursor.execute("drop table if exists `%s`" % tableName)
        cursor.execute(sql)
        db.close()
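A caveat about getInsertToTableSql: splicing the scraped strings straight into the SQL breaks as soon as a title contains a single quote, and it is open to SQL injection. A safer variant (my sketch, not part of the original class) hands the values to pymysql as parameters so the driver escapes them; only the table name still has to be interpolated, since placeholders work for values only:

def insertRowSafely(mysqlconnect, tableName, row):
    # row is [title, playnum, commentnum, author, score]
    sql = "insert into `{}` values (null, %s, %s, %s, %s, %s)".format(tableName)
    db = mysqlconnect.getConnect()
    try:
        with db.cursor() as cursor:
            cursor.execute(sql, row)
        db.commit()
    finally:
        db.close()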
Scraping the data
Scrape the rankings page by page and save each one to the database:
if __name__ == "__main__":
    # start scraping
    urlDict = getURLFormBilibili()  # title -> URL for every ranking variant
    mysqlconnect = MysqlConnect()  # database helper
    for urlName in urlDict:
        print("processing the " + urlName + " page...")
        url = urlDict[urlName]
        tableName = urlName  # one table per ranking variant
        createsql = mysqlconnect.getCreateTableSql(tableName)
        mysqlconnect.createTable(tableName, createsql)
        pageList = getPage(url)
        for contentItem in pageList:
            insertsql = mysqlconnect.getInsertToTableSql(tableName, contentItem[0], contentItem[1], contentItem[2], contentItem[3], contentItem[4])
            print(insertsql)
            mysqlconnect.insertInfo(insertsql)
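One practical note: the loop above fires 208 requests at the site back to back. A short pause between pages keeps the scraper polite (the one-second figure is my own choice, not from the original code):

import time

for urlName in urlDict:
    pageList = getPage(urlDict[urlName])
    # ... create the table and insert the rows as above ...
    time.sleep(1)  # wait a second before requesting the next ranking page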
Results
Full source code
import requests
import re
from bs4 import BeautifulSoup
import pymysql
import traceback
class Spider:  # a thin wrapper around the usual scraping steps
    def __init__(self, url):
        self.url = url
    def getHTML(self):  # fetch the raw html of the page
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.103 Safari/537.36'}
        try:
            response = requests.get(url=self.url, headers=headers, timeout=20)
            response.raise_for_status()
            response.encoding = response.apparent_encoding
            return response.text
        except:
            return "failed to fetch the page"
    def setSoup(self):  # build the BeautifulSoup object
        html = self.getHTML()
        self.soup = BeautifulSoup(html, 'html.parser')
    def findTag(self, tagName):  # find tags by name
        return self.soup.find_all(tagName)
    def findTagByAttrs(self, tagName, attrs):  # find tags by name and CSS class
        return self.soup.find_all(tagName, attrs)
    def getBeautifyHTML(self):
        return self.soup.prettify()
def getPage(url):  # scrape a single ranking page; the core of the crawler
    spider = Spider(url)
    spider.setSoup()
    itemList = spider.findTagByAttrs('li', 'rank-item')
    pageContentList = []
    for item in itemList:
        pageContentItem = []
        # 1. the title sits in the a tag with class "title"
        for title in item.find_all('a', 'title'):
            pageContentItem.append(title.string)
        # 2. the data-box spans hold play count, comment count and uploader, in order
        for playnum in item.find_all('span', 'data-box'):
            pattern = r">([^<]+)<"
            n = re.findall(pattern, str(playnum))[0]
            pageContentItem.append(n)
        # 3. the composite score is the inner div of the "pts" div
        pageContentItem.append(item.find_all('div', 'pts')[0].div.string)
        pageContentList.append(pageContentItem)
    return pageContentList
def getURLFormBilibili():  # build the URL for every ranking variant
    date = {
        1: '日排行',
        3: '三日排行',
        7: '周排行',
        30: '月排行'
    }
    areatype = {
        0: '全站',
        1: '动画',
        168: '国漫相关',
        3: '音乐',
        129: '舞蹈',
        4: '游戏',
        36: '科技',
        188: '数码',
        160: '生活',
        119: '鬼畜',
        155: '时尚',
        5: '娱乐',
        181: '影视'
    }
    ranktype = {
        'all': '全站',
        'origin': '原创'
    }
    submit = {
        '0': '全部投稿',
        '1': '近期投稿'
    }
    urlDict = {}  # maps a human-readable title to its ranking URL
    for ranktypeItem in ranktype.keys():
        for areatypeItem in areatype.keys():
            for submitItem in submit.keys():
                for dateTypeItem in date.keys():
                    title = ranktype[ranktypeItem] + '_' + areatype[areatypeItem] + '_' + submit[submitItem] + '_' + date[dateTypeItem]
                    destinationUrl = 'https://www.bilibili.com/ranking/{}/{}/{}/{}'.format(ranktypeItem, areatypeItem, submitItem, dateTypeItem)
                    urlDict[title] = destinationUrl
    return urlDict
class MysqlConnect:  # small helper around the database connection
    def __init__(self):
        pass
    def getConnect(self):
        db = pymysql.connect(
            host='localhost', user='root', passwd='your-password',
            port=3306, db='bilibilirank', charset='utf8'
            # port must be an int
            # charset must be 'utf8', not 'utf-8'
        )
        return db
    def insertInfo(self, sql):
        db = self.getConnect()
        cursor = db.cursor()
        try:
            cursor.execute(sql)
            db.commit()
            print("succeeded...")
        except:
            print("failed...")
            db.rollback()
    def queryOutCome(self, sql):
        # get a database connection
        db = self.getConnect()
        # create a cursor object with cursor()
        cursor = db.cursor()
        result = None  # stays None if the query fails
        try:
            # execute the sql statement
            cursor.execute(sql)
            result = cursor.fetchone()
        except:
            # print the exception with the traceback module
            traceback.print_exc()
            # roll back on error
            db.rollback()
        finally:
            # always close the connection
            db.close()
        return result
    def getCreateTableSql(self, tableName):  # build the create-table statement
        sql = '''
        create table `{}` (
            id int(11) auto_increment primary key,
            title char(100) NOT NULL UNIQUE,
            playnum char(100) NOT NULL,
            commentnum char(100) NOT NULL,
            author char(100) NOT NULL,
            score char(100) NOT NULL
        ) ENGINE=innodb DEFAULT CHARSET=utf8;
        '''.format(tableName)
        return sql
    def getInsertToTableSql(self, tableName, title, playnum, commentnum, author, score):
        sql = '''
        insert into `{}` values(null, '{}', '{}', '{}', '{}', '{}');
        '''.format(tableName, title, playnum, commentnum, author, score)
        return sql
    def createTable(self, tableName, sql):  # drop any stale copy, then create the table
        db = self.getConnect()
        cursor = db.cursor()
        cursor.execute("drop table if exists `%s`" % tableName)
        cursor.execute(sql)
        db.close()
if __name__ == "__main__":
    # start scraping
    urlDict = getURLFormBilibili()  # title -> URL for every ranking variant
    mysqlconnect = MysqlConnect()  # database helper
    for urlName in urlDict:
        print("processing the " + urlName + " page...")
        url = urlDict[urlName]
        tableName = urlName  # one table per ranking variant
        createsql = mysqlconnect.getCreateTableSql(tableName)
        mysqlconnect.createTable(tableName, createsql)
        pageList = getPage(url)
        for contentItem in pageList:
            insertsql = mysqlconnect.getInsertToTableSql(tableName, contentItem[0], contentItem[1], contentItem[2], contentItem[3], contentItem[4])
            print(insertsql)
            mysqlconnect.insertInfo(insertsql)