【发布时间】:2021-09-30 22:48:59
【问题描述】:
我正在尝试获取价值约 4.5 年的 Spotify 前 200 名排行榜的音频特征数据。这是针对68个国家+全球排名的,总共大约2000万条记录。我正在使用所有这些数据查询 SQL Lite 数据库。 这是为一个数据分析项目做准备,我目前将我的范围限制在每月的第三个星期五,因为我可以最快的时间为图表提取一整天的音频功能是 15.8 分钟。那是 18.5 天的直接处理才能得到全部 1701 天。
有谁知道我有什么方法可以让这更快吗?我目前正在为每首曲目调用 spotipy.audio_features() 函数ID。该函数仅限于 100 个 id,我不确定这是否会快得多。
这是处理前的示例条目:
column_names = ['title', 'rank', 'date', 'artist', 'url', 'region', 'chart', 'trend', 'streams']
('You Were Right', 179, '2017-01-20', 'RÜFÜS DU SOL', 'https://open.spotify.com/track/77lqbary6vt1DSc1MBN6sx', 'Australia', 'top200', 'NEW_ENTRY', 14781)
并经过处理:
column_names = ['title', 'rank', 'date', 'artist', 'url', 'region', 'chart', 'trend', 'streams', 'track_id', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature']
('You Were Right', 179, '2017-01-20', 'RÜFÜS DU SOL', 'https://open.spotify.com/track/77lqbary6vt1DSc1MBN6sx', 'Australia', 'top200', 'NEW_ENTRY', 14781, '77lqbary6vt1DSc1MBN6sx', 0.708, 0.793, 5, -5.426, 0, 0.0342, 0.0136, 0.00221, 0.118, 0.734, 122.006, 239418, 4)
完整脚本:
import sqlite3
import os
import spotipy
import numpy as np
import pandas as pd
from spotipy.oauth2 import SpotifyClientCredentials
from requests.exceptions import ReadTimeout
from datetime import datetime
"""Gets the third Friday of each month and checks that the date exists in the database."""
def date_range_checker(cursor, start_date, end_date):
# Put in the range for that year. It's till 2021.
date_range = pd.date_range(start_date, end_date ,freq='WOM-3FRI')
cursor.execute("""SELECT DISTINCT Date(date) FROM charts""")
sql_date_fetch = cursor.fetchall()
sql_dates = [r[0] for r in sql_date_fetch]
validated_dates = []
for date in date_range:
# print(str(date)[0:-9])
if str(date)[0:-9] in sql_dates:
validated_dates.append(str(date)[0:-9])
return validated_dates
"""Connects to the database. For each date in validated_dates, it queries all the records with that date.
Then splits the track IDs from the Spotify link into a new list of tuples. Then for each tuple in that list, it calls the Spotify API with the track ID.
Finally it creates a numpy array for the entire list so the csv converter can be used."""
def main():
now_start = datetime.now()
start_time = now_start.strftime("%H:%M:%S")
print(f'Main Function - start time: {start_time}')
""""This script queries """
print("working on it...")
dbname = 'charts.db'
if os.path.exists(dbname):
db = sqlite3.connect(dbname, isolation_level=None)
cursor = db.cursor()
""""Selects 3rd friday of the month because it takes about 15.8 minutes per day. That's 14.2 hours total to get one friday a month for all 4.5 years.
Or 18.6 full days of processing for every single day for all 1701 days.
Fridays are a preferable release day in the industry. Cite this later."""
# Date range list created and checked in this function
validated_dates = date_range_checker(cursor, '2017-02-01', '2017-12-31') # change year here
column_names = ['title', 'rank', 'date', 'artist', 'url', 'region', 'chart', 'trend', 'streams', 'track_id', 'danceability',
'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
'duration_ms', 'time_signature']
for date_chosen in validated_dates:
cursor.execute("""SELECT * FROM charts WHERE Date("date") = ?""", (date_chosen,))
db_result = cursor.fetchall()
data_with_track_ids = []
final_data = []
# Splits ID from Spotify link.
for entry in db_result:
track_id = entry[4].split('/')[-1]
entry += (track_id,)
data_with_track_ids.append(entry)
print("I've got all the track IDs. Will start calls to Spotify API now.")
# Calls to spotify with the new extracted track_id
for entry in data_with_track_ids:
track_id = entry[-1]
try:
audio_features = spotify.audio_features(track_id)
except ReadTimeout:
print('Spotify timed out... trying again...')
audio_features = spotify.audio_features(track_id)
entry += (audio_features[0]['danceability'], audio_features[0]['energy'], audio_features[0]['key'],
audio_features[0]['loudness'], audio_features[0]['mode'], audio_features[0]['speechiness'], audio_features[0]['acousticness'],
audio_features[0]['instrumentalness'], audio_features[0]['liveness'],
audio_features[0]['valence'], audio_features[0]['tempo'], audio_features[0]['duration_ms'], audio_features[0]['time_signature'])
final_data.append(entry)
np_data = np.array(final_data)
my_dataframe = pd.DataFrame(np_data, columns=column_names)
my_dataframe.to_csv(f'spotify_csv_data/spotify_top_200 {date_chosen}.csv')
now_end = datetime.now()
end_time = now_end.strftime("%H:%M:%S")
print(f'Main Function - Start time: {start_time}. End time: {end_time}.')
print(f'The date {date_chosen} took {now_end - now_start} to run.')
db.close()
if __name__ == "__main__":
now_start = datetime.now()
start_time = now_start.strftime("%H:%M:%S")
print(f'Script - start time: {start_time}')
os.environ['SPOTIPY_CLIENT_ID'] = 'ENTER YOUR CLIENT_ID'
os.environ['SPOTIPY_CLIENT_SECRET'] = 'ENTER YOUR CLIENT_SECRET'
# Allows for retries. Seems to be enough that it doesn't crash.
spotify = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials(), requests_timeout=10, retries=10)
"""Leave above set."""
main()
now_end = datetime.now()
end_time = now_end.strftime("%H:%M:%S")
print(f'Script - Start time: {start_time}. End time: {end_time}.')
print(f'This script took {now_end - now_start} to run.\n')
【问题讨论】:
标签: python performance spotify spotipy