python爬虫实战项目
1. LOL所有英雄皮肤下载
from fake_useragent import UserAgent
import requests, json, os
# 爬取网页所有英雄的皮肤图片
# https://lol.qq.com/data/info-heros.shtml
# 获取英雄id
def get_heroList():
    """Fetch the LOL hero list from the static API and return all hero ids.

    Returns:
        list | None: hero ids on success, None when the request/parse fails.
    """
    url = 'https://game.gtimg.cn/images/lol/act/img/js/heroList/hero_list.js'
    headers = {'User-Agent': UserAgent().chrome}
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        # Despite the .js extension, the endpoint serves plain JSON.
        data = response.json()
        return [hero['heroId'] for hero in data['hero']]
    except (requests.RequestException, ValueError, KeyError):
        # Narrowed from the original bare `except:` so real bugs surface.
        print('获取英雄id失败')
        return None
# 根据英雄id获取英雄皮肤名称和图片下载地址
def get_skinNames(id):
    """Fetch skin names and main-image URLs for one hero id.

    Args:
        id: a hero id as returned by get_heroList().

    Returns:
        tuple[list, list]: (skin names, image URLs), always two parallel
        lists so callers can unpack safely; both empty on failure.
    """
    url = 'https://game.gtimg.cn/images/lol/act/img/js/hero/{}.js'.format(id)
    headers = {'User-Agent': UserAgent().chrome}
    skinnames = []
    skin_urls = []
    try:
        response = requests.get(url, headers=headers, timeout=10)
        data = response.json()
        # [:-1] skips the final skins entry — presumably a non-skin record
        # in the API payload; TODO confirm against the live response.
        for skin in data['skins'][:-1]:
            if skin['mainImg'] != '':
                skinnames.append(skin['name'])
                skin_urls.append(skin['mainImg'])
    except (requests.RequestException, ValueError, KeyError):
        # Bug fix: the original returned bare None here, which crashed the
        # caller's `skinnames, skin_urls = get_skinNames(id)` unpacking.
        print('获取英雄皮肤名称失败')
    return skinnames, skin_urls
# 根据名称,下载图片保存文件夹
def downloadImg(skinnames, skin_urls):
    """Download every skin image into a folder named after the default skin.

    Args:
        skinnames: skin display names; the first one (the hero's default
            skin, i.e. the hero name) doubles as the folder name.
        skin_urls: image URLs parallel to skinnames.
    """
    if not skinnames:
        # Bug fix: the original raised IndexError on an empty list.
        return
    headers = {'User-Agent': UserAgent().chrome}
    # Sanitize: '/' in a name would be treated as a path separator.
    foldername = skinnames[0].replace('/', '_')
    os.makedirs(foldername, exist_ok=True)
    for skinname, skin_url in zip(skinnames, skin_urls):
        try:
            response = requests.get(skin_url, headers=headers, timeout=10)
        except requests.RequestException:
            # Bug fix: one failed image used to abort the whole hero via
            # `return`; skip it and keep downloading the rest instead.
            print(skinname + ' 下载失败')
            continue
        with open(foldername + '/' + skinname.replace('/', '_') + '.jpg', 'wb') as f:
            f.write(response.content)
# print(filename + \' 下载完成\')
if __name__ == '__main__':
    # Bug fix: get_heroList() returns None on failure; the original then
    # crashed iterating it. Fall back to an empty list.
    hero_ids = get_heroList() or []
    for i, hero_id in enumerate(hero_ids, start=1):
        skinnames, skin_urls = get_skinNames(hero_id)
        downloadImg(skinnames, skin_urls)
        # '\r' keeps the progress counter on one console line.
        print('\r下载进度:' + str(i) + '/' + str(len(hero_ids)), end='')
2. 音乐下载软件
import requests, json, re
from tkinter import Tk, Button, Entry, StringVar, Radiobutton, Frame
from tkinter import messagebox
# 说明:
# 爬取网站:https://music.zhuolin.wang/
# ajax异步请求
# 下载的歌曲在软件所在目录下
# 根据输入找到歌曲信息
def get_musicInfo(query, sourse):
    """Search the zhuolin.wang music API and return the top matches.

    Args:
        query: search keywords (song name, optionally plus artist).
        sourse: platform key — 'netease' / 'tencent' / 'kugou' / 'baidu'.

    Returns:
        tuple: (music_ids, music_names, music_singers), three parallel lists.
    """
    music_ids = []
    music_names = []
    music_singers = []
    url = 'https://music.zhuolin.wang/api.php?'
    data = {
        'types': 'search',
        'count': '5',
        'source': sourse,
        'pages': '1',
        'name': query
    }
    # Bug fix: the original hard-coded 'Content-Length': '37', which is wrong
    # for variable-length queries — requests computes the real length itself.
    headers = {
        'Accept': 'text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Cookie': '',
        'Host': 'music.zhuolin.wang',
        'Origin': 'https://music.zhuolin.wang',
        'Referer': 'https://music.zhuolin.wang/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest'
    }
    response = requests.post(url, headers=headers, data=data)
    results = json.loads(response.text)
    for song in results:
        music_ids.append(song['id'])
        music_names.append(song['name'])
        music_singers.append(song['artist'])
    # Bug fix: the return was commented out, so search_music() crashed
    # unpacking the implicit None.
    return music_ids, music_names, music_singers
# 根据id获取歌曲下载链接
def get_downloadUrl(music_id, name, singer, sourse):
    """Resolve a song's real download URL and hand it to downloadMusic().

    Shows a messagebox when the platform exposes no downloadable URL.

    Args:
        music_id: platform-specific song id.
        name: song title (used for the saved filename).
        singer: artist name (used for the saved filename).
        sourse: platform key — 'netease' / 'tencent' / 'kugou' / 'baidu'.
    """
    url = 'https://music.zhuolin.wang/api.php?'
    data = {
        'types': 'url',
        'id': music_id,
        'source': sourse
    }
    headers = {
        'Accept': 'text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Cookie': '',
        'Host': 'music.zhuolin.wang',
        'Origin': 'https://music.zhuolin.wang',
        'Referer': 'https://music.zhuolin.wang/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest'
    }
    response = requests.post(url, data=data, headers=headers)
    # The body embeds the URL as JSON-escaped text, e.g. "url":"http:\/\/..."
    # Bug fix: capture non-greedily and use group(1) — the original used the
    # full greedy match via .group(), which left a trailing '",' on the URL
    # (and could span past the url field).
    match = re.search(r'(http:.+?)",', response.text)
    if match is not None:
        downloadurl = match.group(1).replace('\\', '')
        downloadMusic(downloadurl, name, singer)
    else:
        messagebox.showinfo('抱歉', '该歌曲暂不提供下载,请您更换其他平台下载')
# 下载歌曲到本地
def downloadMusic(url, name, singer):
    """Download the song at *url* into the current directory as 'name-singer.mp3'.

    Pops a messagebox reporting success or failure.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36'
    }
    # Sanitize: '/' in a song or artist name would be treated as a directory.
    filename = (name + '-' + singer + '.mp3').replace('/', '_')
    try:
        response = requests.get(url, headers=headers, timeout=30)
        with open(filename, 'wb') as f:
            f.write(response.content)
        messagebox.showinfo('恭喜', name + '-' + singer + ' 下载完成')
    except Exception:
        # Narrowed from a bare `except:` (which also swallowed
        # KeyboardInterrupt); kept best-effort behavior otherwise.
        messagebox.showinfo('抱歉', name + ' 下载失败')
# 点击搜索执行
def search_music():
    """Search with the current query/platform and wire results into the UI.

    Fills the five result rows (entry1..5 / button1..5) and binds each
    download button to its song. Rows beyond the number of results are
    cleared and rebound to the "search first" prompt.
    """
    query = entry.get()
    sourse = v.get()
    if query == '':
        messagebox.showinfo('提示', '请输入内容!')
        return False
    music_ids, music_names, music_singers = get_musicInfo(query, sourse)
    rows = [
        (value1, entry1, button1),
        (value2, entry2, button2),
        (value3, entry3, button3),
        (value4, entry4, button4),
        (value5, entry5, button5),
    ]
    for idx, (row_value, row_entry, row_button) in enumerate(rows):
        if idx < len(music_ids):
            # Bug fix: the original always indexed 0..4 and crashed with
            # IndexError when the API returned fewer than five results.
            song_id = str(music_ids[idx])
            song_name = str(music_names[idx])
            song_singer = str(music_singers[idx][0])
            row_value.set(song_name + ' ' + song_singer)
            row_entry['textvariable'] = row_value
            # Default arguments pin the current values; a plain closure would
            # late-bind and make every button download the last song.
            row_button['command'] = \
                lambda i=song_id, n=song_name, s=song_singer: download(i, n, s)
        else:
            row_value.set('')
            row_entry['textvariable'] = row_value
            row_button['command'] = tishi
# 没有搜索之前点击下载按钮的提示
def tishi():
    """Remind the user to run a search before clicking a download button."""
    messagebox.showinfo('提示', '请先进行搜索')
# 点击下载按钮执行(有点多余,可以去掉直接用get_downloadUrl)
def download(id, name, singer):
    """Resolve and download one song using the currently selected platform."""
    # Read the platform radio-button value at click time and delegate.
    get_downloadUrl(id, name, singer, v.get())
if __name__ == '__main__':
    # Manual smoke tests for the API helpers:
    # get_musicInfo('嘲笑声','tencent')
    # get_downloadUrl('0030tRLQ1e4mCn','嘲笑声','Big Daddy','tencent')
    root = Tk()
    # Center the fixed-size 500x400 window on the screen.
    win_width = root.winfo_screenwidth()
    win_height = root.winfo_screenheight()
    root.geometry('500x400+' + str(int(win_width / 2 - 250)) + '+' + str(int(win_height / 2 - 200)))
    root.minsize(500, 400)
    root.maxsize(500, 400)
    root.title('音乐下载器-敲出一片天')
    # get_downloadUrl('64561','单车(Live)','陈奕迅')
    # Search box; the default text acts as a placeholder hint.
    query = StringVar()
    query.set('歌名+歌手更准确哦')
    # Entry option reference: https://www.cnblogs.com/monsteryang/p/6575877.html
    entry = Entry(root, width=21, font=('隶书', 20), foreground='orange',
                  borderwidth=3, insertbackground='red', textvariable=query)
    entry.place(relx=0.05, rely=0.1)
    button = Button(root, width=8, text='搜索', font=('隶书', 18), bg='orange', fg='white', command=search_music)
    button.place(relx=0.7, rely=0.09)
    # Platform selector; search_music()/download() read it via v.get().
    v = StringVar()
    v.set('netease')
    r1 = Radiobutton(text='网易', value='netease', font=('隶书', 18), fg='orange', variable=v)
    r2 = Radiobutton(text='qq', value='tencent', font=('隶书', 18), fg='orange', variable=v)
    r3 = Radiobutton(text='酷狗', value='kugou', font=('隶书', 18), fg='orange', variable=v)
    r4 = Radiobutton(text='百度', value='baidu', font=('隶书', 18), fg='orange', variable=v)
    r1.place(relx=0.08, rely=0.2)
    r2.place(relx=0.28, rely=0.2)
    r3.place(relx=0.48, rely=0.2)
    r4.place(relx=0.68, rely=0.2)
    # Result area: five rows of (StringVar, Entry, Button); search_music()
    # rewires the textvariables and button commands after each search.
    frame = Frame(root, height=250, width=420, bd=1, relief="groove", bg='gray')
    frame.place(relx=0.06, rely=0.3)
    value1 = StringVar()
    entry1 = Entry(frame, width=21, font=('隶书', 15), bg='gray', relief="flat",
                   borderwidth=3, textvariable=query)
    entry1.place(relx=0.05, rely=0.04)
    # Buttons start wired to tishi() (prompt) until a search succeeds.
    button1 = Button(frame, width=8, text='下载', font=('隶书', 12), bg='gray', fg='black', command=tishi)
    button1.place(relx=0.7, rely=0.04)
    value2 = StringVar()
    entry2 = Entry(frame, width=21, font=('隶书', 15), relief="flat", bg='gray',
                   borderwidth=3, textvariable=query)
    entry2.place(relx=0.05, rely=0.24)
    button2 = Button(frame, width=8, text='下载', font=('隶书', 12), bg='gray', fg='black', command=tishi)
    button2.place(relx=0.7, rely=0.24)
    value3 = StringVar()
    entry3 = Entry(frame, width=21, font=('隶书', 15), bg='gray', relief="flat",
                   borderwidth=3, textvariable=query)
    entry3.place(relx=0.05, rely=0.44)
    button3 = Button(frame, width=8, text='下载', font=('隶书', 12), bg='gray', fg='black', command=tishi)
    button3.place(relx=0.7, rely=0.44)
    value4 = StringVar()
    entry4 = Entry(frame, width=21, font=('隶书', 15), bg='gray', relief="flat",
                   borderwidth=3, textvariable=query)
    entry4.place(relx=0.05, rely=0.64)
    button4 = Button(frame, width=8, text='下载', font=('隶书', 12), bg='gray', fg='black', command=tishi)
    button4.place(relx=0.7, rely=0.64)
    value5 = StringVar()
    entry5 = Entry(frame, width=21, font=('隶书', 15), bg='gray', relief="flat",
                   borderwidth=3, textvariable=query)
    entry5.place(relx=0.05, rely=0.84)
    button5 = Button(frame, width=8, text='下载', font=('隶书', 12), bg='gray', fg='black', command=tishi)
    button5.place(relx=0.7, rely=0.84)
    root.mainloop()
3. b站视频下载
import requests
import re
import json
from tkinter import *
from tkinter import messagebox
# 获得播放页面代码,获取我们需要的数据,转为json数据
def get_html_one(url):
    """Fetch a bilibili play page and extract its dash stream URLs and title.

    Args:
        url: full video page URL.

    Returns:
        tuple: (video_url, audio_url, title), or (None, None, None) when the
        page exposes no downloadable dash streams.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
    }
    response = requests.get(url, headers=headers)
    try:
        title = re.findall(r'<title data-vue-meta="true">(.+)_.+</title>', response.text)
        # The playinfo JSON is embedded inline between '"data":' and ',"session"'.
        match = re.search(r'"data":.+,"session"', response.text)
        text = match.group()
        # Strip the 7-char '"data":' prefix and 10-char ',"session"' suffix.
        playinfo = json.loads(text[7:-10])
        video_url = playinfo['dash']['video'][0]['baseUrl']
        audio_url = playinfo['dash']['audio'][0]['baseUrl']
        return video_url, audio_url, title[0]
    except Exception:
        print('该视频不支持下载')
        info.set('该视频不支持下载')
        messagebox.showinfo('提示', '该视频不支持下载')
        # Bug fix: return a 3-tuple so callers can still unpack the result;
        # the original returned bare None and crashed at the call site.
        return None, None, None
# 下载合集
def get_html_more(url):
    """Fetch a multi-part video page and return each part's cid and name.

    Also pushes the page title into the video_title StringVar for the UI.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
    }
    page = requests.get(url, headers=headers)
    title = re.findall(r'<title data-vue-meta="true">(.+)_.+</title>', page.text)
    video_title.set(title[0])
    # The initial-state JSON sits between 'window.__INITIAL_STATE__=' (25
    # chars) and ';(function' (10 chars).
    state_match = re.search(r'window.__INITIAL_STATE__=.+;\(function', page.text)
    state = json.loads(state_match.group()[25:-10])
    parts = state['videoData']['pages']
    cids = [str(part['cid']) for part in parts]
    names = [part['part'] for part in parts]
    return cids, names
# 下载视频和音频到本地
def download_one(video_url, audio_url, title):
    """Download the video and audio streams to '<title>.mp4' / '<title>.mp3'.

    Reports progress/result on stdout, the info StringVar and a messagebox.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0',
        # Referer/Origin are required or bilibili's CDN rejects the request.
        'Referer': 'https://www.bilibili.com/video/',
        'Origin': 'https://www.bilibili.com',
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, sdch, br',
        'Accept-Language': 'zh-CN,zh;q=0.8'
    }
    print(title + ' 开始下载')
    # Sanitize: '/' in a title would be treated as a path separator.
    safe_title = title.replace('/', '_')
    try:
        video_response = requests.get(video_url, headers=headers)
        audio_response = requests.get(audio_url, headers=headers)
        with open(safe_title + '.mp4', 'wb') as f:
            f.write(video_response.content)
        # NOTE(review): the dash audio stream is saved with a .mp3 extension
        # even though it may not be mp3-encoded — kept from the original.
        with open(safe_title + '.mp3', 'wb') as f:
            f.write(audio_response.content)
    except Exception:
        # Narrowed from a bare `except:`.
        print(title + ' 下载失败')
        info.set(title + ' 下载失败')
        messagebox.showinfo('抱歉', title + ' 下载失败')
        return
    print(title + ' 下载完成')
    info.set(title + ' 下载完成')
    messagebox.showinfo('恭喜', title + ' 下载完成')
# 下载合集
def download_more(cids, names, url):
    """Download every part of a multi-part video.

    Args:
        cids: part cids (only the count is used; kept for interface).
        names: per-part titles used as output file names.
        url: base video page URL.
    """
    for i in range(len(cids)):
        # Bug fix: the original rebound `url` itself each iteration,
        # producing url?p1?p2?p3...; it also omitted the '=' that the
        # page parameter needs (?p=2).
        page_url = url + '?p={}'.format(i + 1)
        result = get_html_one(page_url)
        # Tolerate both failure shapes (bare None or (None, None, None)).
        if not result or result[0] is None:
            continue
        video_url, audio_url, title = result
        download_one(video_url, audio_url, names[i])
        print('=========================================')
# 点击搜索
def serach():
    """Look up the entered video id, show its title, and arm the download button.

    (Misspelled name kept intentionally — the search Button is wired to
    `serach` at startup.)
    """
    button1.config(state="active")
    baseurl = 'https://www.bilibili.com/video/{}'
    video_id = entry.get()
    url = baseurl.format(video_id)
    # NOTE: the single/collection choice is captured at search time, not
    # at download-click time.
    flag = v.get()
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
    }
    try:
        response = requests.get(url, headers=headers)
        title = re.findall(r'<title data-vue-meta="true">(.+)_.+</title>', response.text)
        # Bug fix: guard empty findall (the original indexed title[0]
        # unconditionally); bilibili's "missing video" page means a bad id.
        if not title or title[0] == '视频去哪了呢?':
            messagebox.showinfo('提示', '您输入的视频id不正确')
            return
        video_title.set(title[0])
        button1['command'] = lambda: download(url, flag)
    except Exception:
        # Narrowed from a bare `except:`.
        messagebox.showinfo('提示', '您输入的视频id不正确')
        return
# 点击下载
def download(url, flag):
    """Download the searched video: flag 0 = single video, 1 = full collection."""
    # Bug fix: tkinter's valid states are 'normal'/'active'/'disabled';
    # the original passed 'disable', which raises TclError.
    button1.config(state="disabled")
    if flag == 0:
        result = get_html_one(url)
        # Tolerate both failure shapes (bare None or (None, None, None)).
        if not result or result[0] is None:
            return
        video_url, audio_url, title = result
        download_one(video_url, audio_url, title)
    else:
        cids, names = get_html_more(url)
        download_more(cids, names, url)
    # NOTE(review): the button is never re-enabled after a download —
    # a second download requires searching again; confirm this is intended.
    print('下载完成,感谢您的使用')
    info.set('下载完成,感谢您的使用')
def tishi():
    """Remind the user to run a search before clicking the download button."""
    messagebox.showinfo('提示', '请先进行搜索')
if __name__ == '__main__':
    root = Tk()
    # Center the fixed-size window on the screen.
    win_width = root.winfo_screenwidth()
    win_height = root.winfo_screenheight()
    root.geometry('400x270+' + str(int(win_width / 2 - 200)) + '+' + str(int(win_height / 2 - 135)))
    root.minsize(400, 250)
    root.maxsize(400, 250)
    root.title('小破站下载器-敲出一片天')
    # Video-id input; the default text acts as a placeholder hint.
    video_id = StringVar()
    video_id.set('请输入视频ID')
    entry = Entry(root, width=19, font=('隶书', 20), foreground='orange',
                  borderwidth=3, insertbackground='red', textvariable=video_id)
    entry.place(relx=0.02, rely=0.1)
    button = Button(root, width=7, text='搜索', font=('隶书', 18), bg='orange', fg='white', command=serach)
    button.place(relx=0.72, rely=0.09)
    # 0 = single video, 1 = collection; serach() reads this via v.get().
    v = IntVar()
    v.set(0)
    r1 = Radiobutton(text='单个视频', value=0, font=('隶书', 18), fg='orange', variable=v)
    r2 = Radiobutton(text='视频合集', value=1, font=('隶书', 18), fg='orange', variable=v)
    r1.place(relx=0.05, rely=0.25)
    r2.place(relx=0.45, rely=0.25)
    # Read-only-ish display of the found video title.
    video_title = StringVar()
    video_title.set('视频标题')
    entry1 = Entry(root, width=30, font=('隶书', 15), fg='black', bg='#F0F0F0', relief='flat',
                   borderwidth=3, insertbackground='red', textvariable=video_title)
    entry1.place(relx=0.06, rely=0.4)
    # Starts wired to tishi() (prompt); serach() rebinds it to download().
    button1 = Button(root, width=8, text='开始下载', font=('隶书', 12), bg='gray', fg='black', command=tishi)
    button1.place(relx=0.7, rely=0.4)
    # Status line updated by the download functions.
    info = StringVar()
    info.set('下载结果')
    entry_info = Entry(root, width=30, font=('隶书', 15), fg='red', bg='#F0F0F0', relief='flat',
                       borderwidth=3, textvariable=info)
    entry_info.place(relx=0.2, rely=0.6)
    # Downloads run on the UI thread, so the window may appear frozen.
    label = Label(root, text='下载过程可能会出现无响应情况\n下载完就好了', width=30, font=('隶书', 15), fg='black', bg='#F0F0F0',
                  relief='flat',
                  borderwidth=3)
    label.place(relx=0.06, rely=0.8)
    root.mainloop()
4. Python 爬虫框架 scrapy 爬取B站排行榜数据并保存到 MongoDB 数据库
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class BiliItem(scrapy.Item):
    """Item holding one bilibili ranking entry."""
    # _id is declared so MongoDB's driver can write the generated ObjectId
    # back onto the item without a KeyError on an undeclared field.
    _id = scrapy.Field()
    # Video title.
    title = scrapy.Field()
    # Play count as displayed on the ranking page (string, not parsed).
    play_num = scrapy.Field()
    # Uploader (UP主) name.
    up_name = scrapy.Field()
    # Ranking score (综合得分).
    score = scrapy.Field()
bili.py
# -*- coding: utf-8 -*-
import scrapy
from bilibili.bili.bili.items import BiliItem
class BiliRankeSpider(scrapy.Spider):
    """Spider for bilibili's all-category ranking page."""
    name = 'bili_ranke'
    allowed_domains = ['bilibili.com']
    start_urls = ['https://www.bilibili.com/ranking/all/0/0/3']

    def parse(self, response):
        """Yield one BiliItem per ranked video with title, plays, uploader, score."""
        titles = response.xpath('//div[@class="info"]//a[@class="title"]/text()').extract()
        play_nums = response.xpath('//div[@class="detail"]/span[@class="data-box"][1]/text()').extract()
        up_names = response.xpath('//div[@class="detail"]/a/span[@class="data-box"][1]/text()').extract()
        scores = response.xpath('//div[@class="pts"]/div/text()').extract()
        # zip truncates to the shortest list. NOTE(review): if a selector
        # misses a node in the middle of the page the four lists can fall
        # out of alignment — verify the selectors against the live markup.
        for title, play_num, up_name, score in zip(titles, play_nums, up_names, scores):
            item = BiliItem()
            item['title'] = title
            item['play_num'] = play_num
            item['up_name'] = up_name
            item['score'] = score
            yield item
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don\'t forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo
class MoviesPipeline(object):
    """Store scraped items in MongoDB (database 'bilibili', collection 'ranke').

    NOTE(review): the class name says 'Movies' but it handles bilibili ranking
    items — renaming would also require updating the ITEM_PIPELINES setting,
    so it is kept as-is here.
    """
    def open_spider(self, spider):
        # Connect to the default local MongoDB instance (localhost:27017).
        self.client = pymongo.MongoClient()

    def process_item(self, item, spider):
        # BiliItem declares an _id field, so pymongo can set the generated
        # ObjectId on the item without a KeyError.
        self.client.bilibili.ranke.insert_one(item)
        return item

    def close_spider(self, spider):
        self.client.close()
附(MongoDB数据库python基本操作)
import pymongo
# --- Connecting ---
# Default local connection (localhost:27017)
client = pymongo.MongoClient()
# Custom host/port:
# client = pymongo.MongoClient('ip', port)
# Select a database
person = client.person
# Select a collection
student = person.student

# --- Query examples (left commented out, as in the tutorial) ---
# Find everything:
# result = student.find()
# for r in result:
#     print(r)
# print(result.next())
# Filter:
# result = student.find({"age": 20})
# for r in result:
#     print(r)
# Sort:
# result = student.find().sort("age", 1)
# result = student.find().sort("age", pymongo.ASCENDING)
# for r in result:
#     print(r)
# Paging (limit / skip offset):
# result = student.find().limit(3)
# for r in result:
#     print(r)
# result = student.find().limit(3).skip(2)
# for r in result:
#     print(r)
# Counting (cursor.count() is deprecated; prefer count_documents):
# result = student.count_documents({})
# print(result)
# Insert (insert() was removed in pymongo 4 — use insert_one):
# data = {"name": '曾强', 'age': 22}
# student.insert_one(data)
# result = student.count_documents({})
# print(result)
# Delete (remove() was removed in pymongo 4 — use delete_one):
# data = {"name": 'zq2', 'age': 20}
# student.delete_one(data)

# --- Update ---
data = {"name": "zq1"}
result = student.find_one(data)
print(result)
# Bug fix: find_one returns None when no document matches; the original
# then crashed assigning into it.
if result is not None:
    result["country"] = "中国"
    # Collection.update() was deprecated in pymongo 3 and removed in 4;
    # update_one is the modern single-document equivalent.
    student.update_one(data, {'$set': result})
以上项目我都在bilibili上录有视频,看不明白可以去看一下视频,我的B站名:敲出一片天_bili