czqczq

一、选题背景

由于音乐版权问题,很多歌曲分散在各个平台的音乐播放器中,这让许多用户十分困扰,常常找不到想听的音乐资源。因此,本项目希望帮助使用网易云音乐的伙伴们更好地找到各个平台的资源,听到更多自己喜欢的歌。

 

二、网络爬虫设计方案

网络爬虫名称:“网易云音乐歌单”

内容与数据分析特征:该爬虫主要获取网易云音乐热门歌单的数据(歌单标题、标签、播放量、收藏量、歌曲数、评论数及歌曲名称)进行分析。

三、主题页面的结构特征分析

全部歌单 - 歌单 - 网易云音乐 (163.com)

 

 

 

 

四、网络爬虫程序设计

1.数据爬取与采集

import csv
import time

import requests
from bs4 import BeautifulSoup

# Crawl the "hot" playlist index pages of the 欧美 category and append one row
# per playlist (detail-page path, title, play count, creator) to playlist.csv.

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}

# Open the output file once for the whole crawl instead of once per row.
# csv.writer quotes fields, so a comma inside a title or user name no longer
# breaks the four-column layout of the file.
with open('playlist.csv', 'a+', encoding='utf-8-sig', newline='') as f:
    writer = csv.writer(f)
    for i in range(0, 1330, 35):
        print(i)
        time.sleep(2)  # pause between page requests
        page_url = 'https://music.163.com/discover/playlist/?cat=欧美&order=hot&limit=35&offset=' + str(i)
        try:
            response = requests.get(url=page_url, headers=headers, timeout=10)
            response.raise_for_status()
        except requests.RequestException as exc:
            # One failed page should not abort the whole crawl.
            print('skipped', page_url, exc)
            continue
        soup = BeautifulSoup(response.text, 'html.parser')
        # Tags that carry the detail-page address and the title of each playlist
        ids = soup.select('.dec a')
        # Tags that carry the index-page information of each playlist
        lis = soup.select('#m-pl-container li')
        print(len(lis))
        # zip() stops at the shorter list, so a page on which the two selectors
        # match a different number of elements cannot raise IndexError.
        for link, item in zip(ids, lis):
            counters = item.select('.nb')
            paragraphs = item.select('p')
            creators = paragraphs[1].select('a') if len(paragraphs) > 1 else []
            if not counters or not creators:
                # Incomplete entry: skip it rather than stop the crawl.
                continue
            # Detail-page address of the playlist
            url = link.get('href', '')
            # Playlist title
            title = link.get('title', '')
            # Playlist play count
            play = counters[0].get_text()
            # Name of the user who contributed the playlist
            user = creators[0].get_text()
            print(url, title, play, user)
            writer.writerow([url, title, play, user])

 

 

 

import csv
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Visit the detail page of every playlist listed in playlist.csv, append the
# playlist summary to music_message.csv and its song names to music_name.csv.

# on_bad_lines='skip' replaces error_bad_lines=False, which was removed in
# pandas 2.0; malformed rows are still dropped silently.
df = pd.read_csv('playlist.csv', header=None, on_bad_lines='skip', names=['url', 'title', 'play', 'user'])

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}


def text_of(soup, selector, index=0, default=''):
    """Return the text of the index-th element matching selector, or default."""
    matches = soup.select(selector)
    return matches[index].get_text() if len(matches) > index else default


# Both output files are opened once for the whole crawl instead of once per
# row / per song. csv.writer quotes fields that contain commas.
with open('music_message.csv', 'a+', encoding='utf-8-sig', newline='') as message_file, \
        open('music_name.csv', 'a+', encoding='utf-8-sig', newline='') as name_file:
    message_writer = csv.writer(message_file)
    name_writer = csv.writer(name_file)
    for i in df['url'].dropna():
        time.sleep(2)  # pause between page requests
        url = 'https://music.163.com' + i
        try:
            response = requests.get(url=url, headers=headers, timeout=10)
            response.raise_for_status()
        except requests.RequestException as exc:
            # One unreachable playlist should not abort the whole crawl.
            print('skipped', url, exc)
            continue
        soup = BeautifulSoup(response.text, 'html.parser')
        if not soup.select('h2'):
            # No title element: the page is not a usable playlist page.
            print('skipped', url)
            continue
        # Playlist title (commas removed, as in the summary text below)
        title = text_of(soup, 'h2').replace(',', '')
        # Tags joined with '-'; an untagged playlist yields '' instead of
        # raising IndexError.
        tag = '-'.join(p.get_text() for p in soup.select('.u-tag i'))
        # Playlist description, flattened to a single line
        text = text_of(soup, '#album-desc-more').replace('\n', '').replace(',', '')
        # Number of users who collected the playlist, without the parentheses
        collection = text_of(soup, '#content-operation i', index=1).replace('(', '').replace(')', '')
        # Playlist play count
        play = text_of(soup, '.s-fc6')
        # Number of songs in the playlist
        songs = text_of(soup, '#playlist-track-count')
        # Number of comments on the playlist
        comments = text_of(soup, '#cnt_comment_count')
        print(title, tag, text, collection, play, songs, comments)
        message_writer.writerow([title, tag, text, collection, play, songs, comments])
        # One song name per line
        name_writer.writerows([j.get_text()] for j in soup.select('.f-hide li a'))

 

 

 

2.数据可视化

 

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Histogram of the natural log of the playlist play counts.
df = pd.read_csv('music_message_4.csv', header=None)
# Column 4 is treated as the play count. Non-numeric cells become NaN and are
# dropped together with non-positive values, because log(0) is -inf and a
# non-finite value makes hist() fail.
plays = pd.to_numeric(df[4], errors='coerce')
log_plays = np.log(plays[plays > 0])
# Font family and size used by the figure
plt.rcParams['font.sans-serif'] = ['STXihei']
plt.rcParams['font.size'] = 12
plt.rcParams['axes.unicode_minus'] = False
# Figure and the single axes
fig = plt.figure(figsize=(16, 8), dpi=80)
ax = plt.subplot(1, 1, 1)
ax.patch.set_color('white')
# Hide the top/right spines, draw the left/bottom ones in dark grey
ax.spines['right'].set_color('none')
ax.spines['top'].set_color('none')
ax.spines['left'].set_color((64/255, 64/255, 64/255))
ax.spines['bottom'].set_color((64/255, 64/255, 64/255))
# No tick marks on either axis
ax.xaxis.set_ticks_position('none')
ax.yaxis.set_ticks_position('none')
# Draw the histogram
ax.hist(log_plays, bins=30, alpha=0.7, color=(255/255, 153/255, 0/255))
ax.set_title('华语歌单播放数量分布情况', fontsize=20)
# Show the figure
plt.show()

 

 

 

 

 

 

import pandas as pd
import matplotlib.pyplot as plt

# Horizontal bar chart of the ten song titles that occur most often.
df = pd.read_csv('music_message_3.csv', header=None, names=['title'], encoding='utf-8-sig')
# value_counts() counts every title and sorts by frequency in one step,
# replacing the groupby / agg / reset_index / sort_values chain.
top = df['title'].value_counts().head(10)
# barh draws the first row at the bottom, so reverse the order to get the
# most frequent title at the top of the chart.
data = pd.Series(top.to_numpy()[::-1], index=list(top.index[::-1]))
# Font family and size used by the figure
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
plt.rcParams['font.size'] = 10
plt.rcParams['axes.unicode_minus'] = False
# Figure and the single axes
fig = plt.figure(figsize=(16, 8), dpi=80)
ax = plt.subplot(1, 1, 1)
ax.patch.set_color('white')
# Hide the top/right spines, draw the left/bottom ones in dark grey
ax.spines['right'].set_color('none')
ax.spines['top'].set_color('none')
ax.spines['left'].set_color((64/255, 64/255, 64/255))
ax.spines['bottom'].set_color((64/255, 64/255, 64/255))
# No tick marks on either axis
ax.xaxis.set_ticks_position('none')
ax.yaxis.set_ticks_position('none')
# Draw the bars
data.plot.barh(ax=ax, width=0.7, alpha=0.7, color=(16/255, 152/255, 168/255))
# Title
ax.set_title('网易云音乐华语歌单歌曲 TOP10', fontsize=18, fontweight='light')
# Print each count just right of its bar (offsets are in data units)
for x, y in enumerate(data.values):
    plt.text(y+3.5, x-0.12, '%s' % y, ha='center')
# Show the figure
plt.show()

 

 

 

 

 

 

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def parse_count(raw):
    """Convert a count such as 3456, '3456' or '12万' to an int.

    A trailing '万' means "ten thousand". The extracted source replaced the
    empty string with '0000', which inserts '0000' between every character;
    the '万' suffix was presumably lost and is restored here.

    Raises ValueError when the text is not a number.
    """
    text = str(raw).strip()
    if text.endswith('万'):
        return round(float(text[:-1]) * 10000)
    return int(float(text))


def main():
    """Plot a histogram of the natural log of the playlist collection counts."""
    df = pd.read_csv('music_message_4.csv', header=None)
    # Column 3 is treated as the collection count. Empty cells are dropped, and
    # non-positive values are removed because log(0) is -inf, which makes
    # hist() fail.
    counts = np.array([parse_count(i) for i in df[3].dropna()], dtype=float)
    log_counts = np.log(counts[counts > 0])
    # Font family and size used by the figure
    plt.rcParams['font.sans-serif'] = ['STXihei']
    plt.rcParams['font.size'] = 12
    plt.rcParams['axes.unicode_minus'] = False
    # Figure and the single axes
    fig = plt.figure(figsize=(16, 8), dpi=80)
    ax = plt.subplot(1, 1, 1)
    ax.patch.set_color('white')
    # Hide the top/right spines, draw the left/bottom ones in dark grey
    ax.spines['right'].set_color('none')
    ax.spines['top'].set_color('none')
    ax.spines['left'].set_color((64/255, 64/255, 64/255))
    ax.spines['bottom'].set_color((64/255, 64/255, 64/255))
    # No tick marks on either axis
    ax.xaxis.set_ticks_position('none')
    ax.yaxis.set_ticks_position('none')
    # Draw the histogram
    ax.hist(log_counts, bins=30, alpha=0.7, color=(21/255, 47/255, 71/255))
    ax.set_title('华语歌单收藏数量分布情况', fontsize=20)
    # Show the figure
    plt.show()


if __name__ == '__main__':
    main()

 

 

 

 

 

 

import squarify
import pandas as pd
import matplotlib.pyplot as plt

def count_tags(tag_strings, top=10):
    """Return the `top` most frequent tags as (tag, count) pairs.

    Each item of tag_strings is a '-'-joined tag list such as 'a-b'. Items
    that are not strings (for example NaN from an empty CSV cell) and empty
    tags are ignored. Pairs are ordered by descending count; tags with equal
    counts keep the order in which they were first seen.
    """
    from collections import Counter

    counts = Counter()
    for joined in tag_strings:
        if not isinstance(joined, str):
            continue
        counts.update(tag for tag in joined.split('-') if tag)
    return counts.most_common(top)


def main():
    """Draw a treemap of the ten most frequent playlist tags."""
    df = pd.read_csv('music_message_4.csv', header=None)
    # Column 1 is treated as the '-'-joined tag list of each playlist
    top_tags = count_tags(df[1])
    if not top_tags:
        print('no tags found')
        return
    name = [tag for tag, _ in top_tags]
    income = [num for _, num in top_tags]
    # One colour per rectangle
    colors = ['#993333', '#CC9966',  '#333333', '#663366', '#003366', '#009966', '#FF6600', '#FF0033', '#009999', '#333366']
    plot = squarify.plot(sizes=income, label=name, color=colors[:len(income)], alpha=1, value=income, edgecolor='white', linewidth=1.5)
    # Font settings. NOTE(review): these are applied after the treemap has
    # been created, as in the original, so the size may not affect the labels
    # that already exist - confirm the intended order.
    plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
    plt.rcParams['axes.unicode_minus'] = False
    plt.rc('font', size=6)
    # Title
    plot.set_title('网易云音乐华语歌单标签图', fontsize=13, fontweight='light')
    # Hide the axes
    plt.axis('off')
    # Hide the ticks on the top and right borders
    plt.tick_params(top=False, right=False)
    # Show the figure
    plt.show()


if __name__ == '__main__':
    main()

 

 

 五、总结

网易云音乐的使用还是非常火爆的,以上是对网易云爬虫的一次愉快的探索之旅~

分类:

技术点:

相关文章: