python爬虫实战项目
1. LOL所有英雄皮肤下载
from fake_useragent import UserAgent
import requests, json, os
# 爬取网页所有英雄的皮肤图片
# https://lol.qq.com/data/info-heros.shtml
# 获取英雄id
def get_heroList():
    """Fetch the LOL hero list from the static API and return all hero ids.

    Returns:
        list | None: hero ids on success, None when the request/parse fails.
    """
    url = 'https://game.gtimg.cn/images/lol/act/img/js/heroList/hero_list.js'
    headers = {'User-Agent': UserAgent().chrome}
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        # Despite the .js extension, the endpoint serves plain JSON.
        data = response.json()
        return [hero['heroId'] for hero in data['hero']]
    except (requests.RequestException, ValueError, KeyError):
        # Narrowed from the original bare `except:` so real bugs surface.
        print('获取英雄id失败')
        return None
# 根据英雄id获取英雄皮肤名称和图片下载地址
def get_skinNames(id):
    """Fetch skin names and main-image URLs for one hero id.

    Args:
        id: a hero id as returned by get_heroList().

    Returns:
        tuple[list, list]: (skin names, image URLs), always two parallel
        lists so callers can unpack safely; both empty on failure.
    """
    url = 'https://game.gtimg.cn/images/lol/act/img/js/hero/{}.js'.format(id)
    headers = {'User-Agent': UserAgent().chrome}
    skinnames = []
    skin_urls = []
    try:
        response = requests.get(url, headers=headers, timeout=10)
        data = response.json()
        # [:-1] skips the final skins entry — presumably a non-skin record
        # in the API payload; TODO confirm against the live response.
        for skin in data['skins'][:-1]:
            if skin['mainImg'] != '':
                skinnames.append(skin['name'])
                skin_urls.append(skin['mainImg'])
    except (requests.RequestException, ValueError, KeyError):
        # Bug fix: the original returned bare None here, which crashed the
        # caller's `skinnames, skin_urls = get_skinNames(id)` unpacking.
        print('获取英雄皮肤名称失败')
    return skinnames, skin_urls
# 根据名称,下载图片保存文件夹
def downloadImg(skinnames, skin_urls):
    """Download every skin image into a folder named after the default skin.

    Args:
        skinnames: skin display names; the first one (the hero's default
            skin, i.e. the hero name) doubles as the folder name.
        skin_urls: image URLs parallel to skinnames.
    """
    if not skinnames:
        # Bug fix: the original raised IndexError on an empty list.
        return
    headers = {'User-Agent': UserAgent().chrome}
    # Sanitize: '/' in a name would be treated as a path separator.
    foldername = skinnames[0].replace('/', '_')
    os.makedirs(foldername, exist_ok=True)
    for skinname, skin_url in zip(skinnames, skin_urls):
        try:
            response = requests.get(skin_url, headers=headers, timeout=10)
        except requests.RequestException:
            # Bug fix: one failed image used to abort the whole hero via
            # `return`; skip it and keep downloading the rest instead.
            print(skinname + ' 下载失败')
            continue
        with open(foldername + '/' + skinname.replace('/', '_') + '.jpg', 'wb') as f:
            f.write(response.content)
# print(filename + \' 下载完成\')
if __name__ == '__main__':
    # Bug fix: get_heroList() returns None on failure; the original then
    # crashed iterating it. Fall back to an empty list.
    hero_ids = get_heroList() or []
    for i, hero_id in enumerate(hero_ids, start=1):
        skinnames, skin_urls = get_skinNames(hero_id)
        downloadImg(skinnames, skin_urls)
        # '\r' keeps the progress counter on one console line.
        print('\r下载进度:' + str(i) + '/' + str(len(hero_ids)), end='')
2. 音乐下载软件
import requests, json, re
from tkinter import Tk, Button, Entry, StringVar, Radiobutton, Frame
from tkinter import messagebox
# 说明:
# 爬取网站:https://music.zhuolin.wang/
# ajax异步请求
# 下载的歌曲在软件所在目录下
# 根据输入找到歌曲信息
def get_musicInfo(query, sourse):
    """Search the zhuolin.wang music API and return the top matches.

    Args:
        query: search keywords (song name, optionally plus artist).
        sourse: platform key — 'netease' / 'tencent' / 'kugou' / 'baidu'.

    Returns:
        tuple: (music_ids, music_names, music_singers), three parallel lists.
    """
    music_ids = []
    music_names = []
    music_singers = []
    url = 'https://music.zhuolin.wang/api.php?'
    data = {
        'types': 'search',
        'count': '5',
        'source': sourse,
        'pages': '1',
        'name': query
    }
    # Bug fix: the original hard-coded 'Content-Length': '37', which is wrong
    # for variable-length queries — requests computes the real length itself.
    headers = {
        'Accept': 'text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Cookie': '',
        'Host': 'music.zhuolin.wang',
        'Origin': 'https://music.zhuolin.wang',
        'Referer': 'https://music.zhuolin.wang/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest'
    }
    response = requests.post(url, headers=headers, data=data)
    results = json.loads(response.text)
    for song in results:
        music_ids.append(song['id'])
        music_names.append(song['name'])
        music_singers.append(song['artist'])
    # Bug fix: the return was commented out, so search_music() crashed
    # unpacking the implicit None.
    return music_ids, music_names, music_singers
# 根据id获取歌曲下载链接
def get_downloadUrl(music_id, name, singer, sourse):
    """Resolve a song's real download URL and hand it to downloadMusic().

    Shows a messagebox when the platform exposes no downloadable URL.

    Args:
        music_id: platform-specific song id.
        name: song title (used for the saved filename).
        singer: artist name (used for the saved filename).
        sourse: platform key — 'netease' / 'tencent' / 'kugou' / 'baidu'.
    """
    url = 'https://music.zhuolin.wang/api.php?'
    data = {
        'types': 'url',
        'id': music_id,
        'source': sourse
    }
    headers = {
        'Accept': 'text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Cookie': '',
        'Host': 'music.zhuolin.wang',
        'Origin': 'https://music.zhuolin.wang',
        'Referer': 'https://music.zhuolin.wang/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest'
    }
    response = requests.post(url, data=data, headers=headers)
    # The body embeds the URL as JSON-escaped text, e.g. "url":"http:\/\/..."
    # Bug fix: capture non-greedily and use group(1) — the original used the
    # full greedy match via .group(), which left a trailing '",' on the URL
    # (and could span past the url field).
    match = re.search(r'(http:.+?)",', response.text)
    if match is not None:
        downloadurl = match.group(1).replace('\\', '')
        downloadMusic(downloadurl, name, singer)
    else:
        messagebox.showinfo('抱歉', '该歌曲暂不提供下载,请您更换其他平台下载')
# 下载歌曲到本地
def downloadMusic(url, name, singer):
    """Download the song at *url* into the current directory as 'name-singer.mp3'.

    Pops a messagebox reporting success or failure.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36'
    }
    # Sanitize: '/' in a song or artist name would be treated as a directory.
    filename = (name + '-' + singer + '.mp3').replace('/', '_')
    try:
        response = requests.get(url, headers=headers, timeout=30)
        with open(filename, 'wb') as f:
            f.write(response.content)
        messagebox.showinfo('恭喜', name + '-' + singer + ' 下载完成')
    except Exception:
        # Narrowed from a bare `except:` (which also swallowed
        # KeyboardInterrupt); kept best-effort behavior otherwise.
        messagebox.showinfo('抱歉', name + ' 下载失败')
# 点击搜索执行
def search_music():
    """Search with the current query/platform and wire results into the UI.

    Fills the five result rows (entry1..5 / button1..5) and binds each
    download button to its song. Rows beyond the number of results are
    cleared and rebound to the "search first" prompt.
    """
    query = entry.get()
    sourse = v.get()
    if query == '':
        messagebox.showinfo('提示', '请输入内容!')
        return False
    music_ids, music_names, music_singers = get_musicInfo(query, sourse)
    rows = [
        (value1, entry1, button1),
        (value2, entry2, button2),
        (value3, entry3, button3),
        (value4, entry4, button4),
        (value5, entry5, button5),
    ]
    for idx, (row_value, row_entry, row_button) in enumerate(rows):
        if idx < len(music_ids):
            # Bug fix: the original always indexed 0..4 and crashed with
            # IndexError when the API returned fewer than five results.
            song_id = str(music_ids[idx])
            song_name = str(music_names[idx])
            song_singer = str(music_singers[idx][0])
            row_value.set(song_name + ' ' + song_singer)
            row_entry['textvariable'] = row_value
            # Default arguments pin the current values; a plain closure would
            # late-bind and make every button download the last song.
            row_button['command'] = \
                lambda i=song_id, n=song_name, s=song_singer: download(i, n, s)
        else:
            row_value.set('')
            row_entry['textvariable'] = row_value
            row_button['command'] = tishi
# 没有搜索之前点击下载按钮的提示
def tishi():
    """Remind the user to run a search before clicking a download button."""
    messagebox.showinfo('提示', '请先进行搜索')
# 点击下载按钮执行(有点多余,可以去掉直接用get_downloadUrl)
def download(id, name, singer):
    """Resolve and download one song using the currently selected platform."""
    # Read the platform radio-button value at click time and delegate.
    get_downloadUrl(id, name, singer, v.get())
if __name__ == '__main__':
    # Manual smoke tests for the API helpers:
    # get_musicInfo('嘲笑声','tencent')
    # get_downloadUrl('0030tRLQ1e4mCn','嘲笑声','Big Daddy','tencent')
    root = Tk()
    # Center the fixed-size 500x400 window on the screen.
    win_width = root.winfo_screenwidth()
    win_height = root.winfo_screenheight()
    root.geometry('500x400+' + str(int(win_width / 2 - 250)) + '+' + str(int(win_height / 2 - 200)))
    root.minsize(500, 400)
    root.maxsize(500, 400)
    root.title('音乐下载器-敲出一片天')
    # get_downloadUrl('64561','单车(Live)','陈奕迅')
    # Search box; the default text acts as a placeholder hint.
    query = StringVar()
    query.set('歌名+歌手更准确哦')
    # Entry option reference: https://www.cnblogs.com/monsteryang/p/6575877.html
    entry = Entry(root, width=21, font=('隶书', 20), foreground='orange',
                  borderwidth=3, insertbackground='red', textvariable=query)
    entry.place(relx=0.05, rely=0.1)
    button = Button(root, width=8, text='搜索', font=('隶书', 18), bg='orange', fg='white', command=search_music)
    button.place(relx=0.7, rely=0.09)
    # Platform selector; search_music()/download() read it via v.get().
    v = StringVar()
    v.set('netease')
    r1 = Radiobutton(text='网易', value='netease', font=('隶书', 18), fg='orange', variable=v)
    r2 = Radiobutton(text='qq', value='tencent', font=('隶书', 18), fg='orange', variable=v)
    r3 = Radiobutton(text='酷狗', value='kugou', font=('隶书', 18), fg='orange', variable=v)
    r4 = Radiobutton(text='百度', value='baidu', font=('隶书', 18), fg='orange', variable=v)
    r1.place(relx=0.08, rely=0.2)
    r2.place(relx=0.28, rely=0.2)
    r3.place(relx=0.48, rely=0.2)
    r4.place(relx=0.68, rely=0.2)
    # Result area: five rows of (StringVar, Entry, Button); search_music()
    # rewires the textvariables and button commands after each search.
    frame = Frame(root, height=250, width=420, bd=1, relief="groove", bg='gray')
    frame.place(relx=0.06, rely=0.3)
    value1 = StringVar()
    entry1 = Entry(frame, width=21, font=('隶书', 15), bg='gray', relief="flat",
                   borderwidth=3, textvariable=query)
    entry1.place(relx=0.05, rely=0.04)
    # Buttons start wired to tishi() (prompt) until a search succeeds.
    button1 = Button(frame, width=8, text='下载', font=('隶书', 12), bg='gray', fg='black', command=tishi)
    button1.place(relx=0.7, rely=0.04)
    value2 = StringVar()
    entry2 = Entry(frame, width=21, font=('隶书', 15), relief="flat", bg='gray',
                   borderwidth=3, textvariable=query)
    entry2.place(relx=0.05, rely=0.24)
    button2 = Button(frame, width=8, text='下载', font=('隶书', 12), bg='gray', fg='black', command=tishi)
    button2.place(relx=0.7, rely=0.24)
    value3 = StringVar()
    entry3 = Entry(frame, width=21, font=('隶书', 15), bg='gray', relief="flat",
                   borderwidth=3, textvariable=query)
    entry3.place(relx=0.05, rely=0.44)
    button3 = Button(frame, width=8, text='下载', font=('隶书', 12), bg='gray', fg='black', command=tishi)
    button3.place(relx=0.7, rely=0.44)
    value4 = StringVar()
    entry4 = Entry(frame, width=21, font=('隶书', 15), bg='gray', relief="flat",
                   borderwidth=3, textvariable=query)
    entry4.place(relx=0.05, rely=0.64)
    button4 = Button(frame, width=8, text='下载', font=('隶书', 12), bg='gray', fg='black', command=tishi)
    button4.place(relx=0.7, rely=0.64)
    value5 = StringVar()
    entry5 = Entry(frame, width=21, font=('隶书', 15), bg='gray', relief="flat",
                   borderwidth=3, textvariable=query)
    entry5.place(relx=0.05, rely=0.84)
    button5 = Button(frame, width=8, text='下载', font=('隶书', 12), bg='gray', fg='black', command=tishi)
    button5.place(relx=0.7, rely=0.84)
    root.mainloop()
3. b站视频下载
import requests
import re
import json
from tkinter import *
from tkinter import messagebox
# 获得播放页面代码,获取我们需要的数据,转为json数据
def get_html_one(url):
    """Fetch a bilibili play page and extract its dash stream URLs and title.

    Args:
        url: full video page URL.

    Returns:
        tuple: (video_url, audio_url, title), or (None, None, None) when the
        page exposes no downloadable dash streams.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
    }
    response = requests.get(url, headers=headers)
    try:
        title = re.findall(r'<title data-vue-meta="true">(.+)_.+</title>', response.text)
        # The playinfo JSON is embedded inline between '"data":' and ',"session"'.
        match = re.search(r'"data":.+,"session"', response.text)
        text = match.group()
        # Strip the 7-char '"data":' prefix and 10-char ',"session"' suffix.
        playinfo = json.loads(text[7:-10])
        video_url = playinfo['dash']['video'][0]['baseUrl']
        audio_url = playinfo['dash']['audio'][0]['baseUrl']
        return video_url, audio_url, title[0]
    except Exception:
        print('该视频不支持下载')
        info.set('该视频不支持下载')
        messagebox.showinfo('提示', '该视频不支持下载')
        # Bug fix: return a 3-tuple so callers can still unpack the result;
        # the original returned bare None and crashed at the call site.
        return None, None, None
# 下载合集
def get_html_more(url):
    """Fetch a multi-part video page and return each part's cid and name.

    Also pushes the page title into the video_title StringVar for the UI.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
    }
    page = requests.get(url, headers=headers)
    title = re.findall(r'<title data-vue-meta="true">(.+)_.+</title>', page.text)
    video_title.set(title[0])
    # The initial-state JSON sits between 'window.__INITIAL_STATE__=' (25
    # chars) and ';(function' (10 chars).
    state_match = re.search(r'window.__INITIAL_STATE__=.+;\(function', page.text)
    state = json.loads(state_match.group()[25:-10])
    parts = state['videoData']['pages']
    cids = [str(part['cid']) for part in parts]
    names = [part['part'] for part in parts]
    return cids, names
# 下载视频和音频到本地
def download_one(video_url, audio_url, title):
    """Download the video and audio streams to '<title>.mp4' / '<title>.mp3'.

    Reports progress/result on stdout, the info StringVar and a messagebox.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0',
        # Referer/Origin are required or bilibili's CDN rejects the request.
        'Referer': 'https://www.bilibili.com/video/',
        'Origin': 'https://www.bilibili.com',
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, sdch, br',
        'Accept-Language': 'zh-CN,zh;q=0.8'
    }
    print(title + ' 开始下载')
    # Sanitize: '/' in a title would be treated as a path separator.
    safe_title = title.replace('/', '_')
    try:
        video_response = requests.get(video_url, headers=headers)
        audio_response = requests.get(audio_url, headers=headers)
        with open(safe_title + '.mp4', 'wb') as f:
            f.write(video_response.content)
        # NOTE(review): the dash audio stream is saved with a .mp3 extension
        # even though it may not be mp3-encoded — kept from the original.
        with open(safe_title + '.mp3', 'wb') as f:
            f.write(audio_response.content)
    except Exception:
        # Narrowed from a bare `except:`.
        print(title + ' 下载失败')
        info.set(title + ' 下载失败')
        messagebox.showinfo('抱歉', title + ' 下载失败')
        return
    print(title + ' 下载完成')
    info.set(title + ' 下载完成')
    messagebox.showinfo('恭喜', title + ' 下载完成')
# 下载合集
def download_more(cids, names, url):
    """Download every part of a multi-part video.

    Args:
        cids: part cids (only the count is used; kept for interface).
        names: per-part titles used as output file names.
        url: base video page URL.
    """
    for i in range(len(cids)):
        # Bug fix: the original rebound `url` itself each iteration,
        # producing url?p1?p2?p3...; it also omitted the '=' that the
        # page parameter needs (?p=2).
        page_url = url + '?p={}'.format(i + 1)
        result = get_html_one(page_url)
        # Tolerate both failure shapes (bare None or (None, None, None)).
        if not result or result[0] is None:
            continue
        video_url, audio_url, title = result
        download_one(video_url, audio_url, names[i])
        print('=========================================')
# 点击搜索
def serach():
    """Look up the entered video id, show its title, and arm the download button.

    (Misspelled name kept intentionally — the search Button is wired to
    `serach` at startup.)
    """
    button1.config(state="active")
    baseurl = 'https://www.bilibili.com/video/{}'
    video_id = entry.get()
    url = baseurl.format(video_id)
    # NOTE: the single/collection choice is captured at search time, not
    # at download-click time.
    flag = v.get()
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
    }
    try:
        response = requests.get(url, headers=headers)
        title = re.findall(r'<title data-vue-meta="true">(.+)_.+</title>', response.text)
        # Bug fix: guard empty findall (the original indexed title[0]
        # unconditionally); bilibili's "missing video" page means a bad id.
        if not title or title[0] == '视频去哪了呢?':
            messagebox.showinfo('提示', '您输入的视频id不正确')
            return
        video_title.set(title[0])
        button1['command'] = lambda: download(url, flag)
    except Exception:
        # Narrowed from a bare `except:`.
        messagebox.showinfo('提示', '您输入的视频id不正确')
        return
# 点击下载
def download(url, flag):
    """Download the searched video: flag 0 = single video, 1 = full collection."""
    # Bug fix: tkinter's valid states are 'normal'/'active'/'disabled';
    # the original passed 'disable', which raises TclError.
    button1.config(state="disabled")
    if flag == 0:
        result = get_html_one(url)
        # Tolerate both failure shapes (bare None or (None, None, None)).
        if not result or result[0] is None:
            return
        video_url, audio_url, title = result
        download_one(video_url, audio_url, title)
    else:
        cids, names = get_html_more(url)
        download_more(cids, names, url)
    # NOTE(review): the button is never re-enabled after a download —
    # a second download requires searching again; confirm this is intended.
    print('下载完成,感谢您的使用')
    info.set('下载完成,感谢您的使用')
def tishi():
    """Remind the user to run a search before clicking the download button."""
    messagebox.showinfo('提示', '请先进行搜索')
if __name__ == '__main__':
    root = Tk()
    # Center the fixed-size window on the screen.
    win_width = root.winfo_screenwidth()
    win_height = root.winfo_screenheight()
    root.geometry('400x270+' + str(int(win_width / 2 - 200)) + '+' + str(int(win_height / 2 - 135)))
    root.minsize(400, 250)
    root.maxsize(400, 250)
    root.title('小破站下载器-敲出一片天')
    # Video-id input; the default text acts as a placeholder hint.
    video_id = StringVar()
    video_id.set('请输入视频ID')
    entry = Entry(root, width=19, font=('隶书', 20), foreground='orange',
                  borderwidth=3, insertbackground='red', textvariable=video_id)
    entry.place(relx=0.02, rely=0.1)
    button = Button(root, width=7, text='搜索', font=('隶书', 18), bg='orange', fg='white', command=serach)
    button.place(relx=0.72, rely=0.09)
    # 0 = single video, 1 = collection; serach() reads this via v.get().
    v = IntVar()
    v.set(0)
    r1 = Radiobutton(text='单个视频', value=0, font=('隶书', 18), fg='orange', variable=v)
    r2 = Radiobutton(text='视频合集', value=1, font=('隶书', 18), fg='orange', variable=v)
    r1.place(relx=0.05, rely=0.25)
    r2.place(relx=0.45, rely=0.25)
    # Read-only-ish display of the found video title.
    video_title = StringVar()
    video_title.set('视频标题')
    entry1 = Entry(root, width=30, font=('隶书', 15), fg='black', bg='#F0F0F0', relief='flat',
                   borderwidth=3, insertbackground='red', textvariable=video_title)
    entry1.place(relx=0.06, rely=0.4)
    # Starts wired to tishi() (prompt); serach() rebinds it to download().
    button1 = Button(root, width=8, text='开始下载', font=('隶书', 12), bg='gray', fg='black', command=tishi)
    button1.place(relx=0.7, rely=0.4)
    # Status line updated by the download functions.
    info = StringVar()
    info.set('下载结果')
    entry_info = Entry(root, width=30, font=('隶书', 15), fg='red', bg='#F0F0F0', relief='flat',
                       borderwidth=3, textvariable=info)
    entry_info.place(relx=0.2, rely=0.6)
    # Downloads run on the UI thread, so the window may appear frozen.
    label = Label(root, text='下载过程可能会出现无响应情况\n下载完就好了', width=30, font=('隶书', 15), fg='black', bg='#F0F0F0',
                  relief='flat',
                  borderwidth=3)
    label.place(relx=0.06, rely=0.8)
    root.mainloop()
4. Python 爬虫框架 scrapy 爬取B站排行榜数据并保存到 MongoDB 数据库
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class BiliItem(scrapy.Item):
    """Item holding one bilibili ranking entry."""
    # _id is declared so MongoDB's driver can write the generated ObjectId
    # back onto the item without a KeyError on an undeclared field.
    _id = scrapy.Field()
    # Video title.
    title = scrapy.Field()
    # Play count as displayed on the ranking page (string, not parsed).
    play_num = scrapy.Field()
    # Uploader (UP主) name.
    up_name = scrapy.Field()
    # Ranking score (综合得分).
    score = scrapy.Field()
bili.py
# -*- coding: utf-8 -*-
import scrapy
from bilibili.bili.bili.items import BiliItem
class BiliRankeSpider(scrapy.Spider):
    """Spider for bilibili's all-category ranking page."""
    name = 'bili_ranke'
    allowed_domains = ['bilibili.com']
    start_urls = ['https://www.bilibili.com/ranking/all/0/0/3']

    def parse(self, response):
        """Yield one BiliItem per ranked video with title, plays, uploader, score."""
        titles = response.xpath('//div[@class="info"]//a[@class="title"]/text()').extract()
        play_nums = response.xpath('//div[@class="detail"]/span[@class="data-box"][1]/text()').extract()
        up_names = response.xpath('//div[@class="detail"]/a/span[@class="data-box"][1]/text()').extract()
        scores = response.xpath('//div[@class="pts"]/div/text()').extract()
        # zip truncates to the shortest list. NOTE(review): if a selector
        # misses a node in the middle of the page the four lists can fall
        # out of alignment — verify the selectors against the live markup.
        for title, play_num, up_name, score in zip(titles, play_nums, up_names, scores):
            item = BiliItem()
            item['title'] = title
            item['play_num'] = play_num
            item['up_name'] = up_name
            item['score'] = score
            yield item
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don\'t forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo
class MoviesPipeline(object):
    """Store scraped items in MongoDB (database 'bilibili', collection 'ranke').

    NOTE(review): the class name says 'Movies' but it handles bilibili ranking
    items — renaming would also require updating the ITEM_PIPELINES setting,
    so it is kept as-is here.
    """
    def open_spider(self, spider):
        # Connect to the default local MongoDB instance (localhost:27017).
        self.client = pymongo.MongoClient()

    def process_item(self, item, spider):
        # BiliItem declares an _id field, so pymongo can set the generated
        # ObjectId on the item without a KeyError.
        self.client.bilibili.ranke.insert_one(item)
        return item

    def close_spider(self, spider):
        self.client.close()
附(MongoDB数据库python基本操作)
import pymongo
# --- Connecting ---
# Default local connection (localhost:27017)
client = pymongo.MongoClient()
# Custom host/port:
# client = pymongo.MongoClient('ip', port)
# Select a database
person = client.person
# Select a collection
student = person.student

# --- Query examples (left commented out, as in the tutorial) ---
# Find everything:
# result = student.find()
# for r in result:
#     print(r)
# print(result.next())
# Filter:
# result = student.find({"age": 20})
# for r in result:
#     print(r)
# Sort:
# result = student.find().sort("age", 1)
# result = student.find().sort("age", pymongo.ASCENDING)
# for r in result:
#     print(r)
# Paging (limit / skip offset):
# result = student.find().limit(3)
# for r in result:
#     print(r)
# result = student.find().limit(3).skip(2)
# for r in result:
#     print(r)
# Counting (cursor.count() is deprecated; prefer count_documents):
# result = student.count_documents({})
# print(result)
# Insert (insert() was removed in pymongo 4 — use insert_one):
# data = {"name": '曾强', 'age': 22}
# student.insert_one(data)
# result = student.count_documents({})
# print(result)
# Delete (remove() was removed in pymongo 4 — use delete_one):
# data = {"name": 'zq2', 'age': 20}
# student.delete_one(data)

# --- Update ---
data = {"name": "zq1"}
result = student.find_one(data)
print(result)
# Bug fix: find_one returns None when no document matches; the original
# then crashed assigning into it.
if result is not None:
    result["country"] = "中国"
    # Collection.update() was deprecated in pymongo 3 and removed in 4;
    # update_one is the modern single-document equivalent.
    student.update_one(data, {'$set': result})
以上项目我都在bilibili上录有视频,看不明白可以去看一下视频,我的B站名:敲出一片天_bili