- 使用来自Kaggle: Brooklyn NY Schools 的数据
- 分别计算各组条形的数值容易出错。最好在同一个数据框中完成计算,重塑数据框后再绘图,这样可以确保每个条形都被绘制在正确的分组中。
- 由于未提供数据,因此从宽格式数字数据开始,然后对数据框进行清理和整形。
- 已在 python 3.8、pandas 1.3.1 和 matplotlib 3.4.2 中测试
导入、加载和清理 DataFrame
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import numpy as np
# data
data = {'BOY': [11.0, 11.0, 11.0, 11.0, 11.0, 8.0, 11.0, 14.0, 12.0, 13.0, 11.0, 14.0, 10.0, 9.0, 10.0, 10.0, 10.0, 12.0, 12.0, 13.0, 12.0, 11.0, 9.0, 12.0, 16.0, 12.0, 12.0, 12.0, 15.0, 10.0, 10.0, 10.0, 8.0, 11.0, 12.0, 14.0, 10.0, 8.0, 11.0, 12.0, 14.0, 12.0, 13.0, 15.0, 13.0, 8.0, 8.0, 11.0, 10.0, 11.0, 13.0, 11.0, 13.0, 15.0, 10.0, 8.0, 10.0, 9.0, 8.0, 11.0, 13.0, 11.0, 8.0, 11.0, 15.0, 11.0, 12.0, 17.0, 12.0, 11.0, 18.0, 14.0, 15.0, 16.0, 7.0, 11.0, 15.0, 16.0, 13.0, 13.0, 13.0, 0.0, 11.0, 15.0, 14.0, 11.0, 13.0, 16.0, 14.0, 12.0, 8.0, 13.0, 13.0, 14.0, 7.0, 10.0, 16.0, 10.0, 13.0, 10.0, 14.0, 8.0, 16.0, 13.0, 12.0, 14.0, 12.0, 14.0, 16.0, 15.0, 13.0, 13.0, 10.0, 14.0, 8.0, 10.0, 10.0, 11.0, 12.0, 10.0, 12.0, 14.0, 17.0, 13.0, 14.0, 16.0, 15.0, 13.0, 16.0, 9.0, 16.0, 15.0, 11.0, 11.0, 15.0, 14.0, 12.0, 15.0, 11.0, 16.0, 14.0, 14.0, 15.0, 14.0, 14.0, 14.0, 16.0, 15.0, 12.0, 12.0, 14.0, 15.0, 13.0, 14.0, 13.0, 17.0, 14.0, 13.0, 14.0, 13.0, 13.0, 12.0, 10.0, 15.0, 14.0, 12.0, 12.0, 14.0, 12.0, 14.0, 13.0, 15.0, 13.0, 14.0, 14.0, 12.0, 11.0, 15.0, 14.0, 14.0, 10.0], 'EOY': [16.0, 16.0, 16.0, 14.0, 10.0, 14.0, 16.0, 14.0, 15.0, 15.0, 15.0, 11.0, 11.0, 15.0, 10.0, 14.0, 17.0, 14.0, 9.0, 15.0, 14.0, 16.0, 14.0, 13.0, 11.0, 13.0, 12.0, 14.0, 15.0, 13.0, 14.0, 15.0, 12.0, 19.0, 9.0, 13.0, 11.0, 14.0, 17.0, 17.0, 14.0, 13.0, 14.0, 10.0, 16.0, 15.0, 12.0, 11.0, 12.0, 14.0, 15.0, 10.0, 15.0, 14.0, 14.0, 15.0, 18.0, 15.0, 10.0, 10.0, 15.0, 15.0, 13.0, 15.0, 19.0, 13.0, 18.0, 20.0, 21.0, 17.0, 18.0, 17.0, 18.0, 17.0, 12.0, 16.0, 15.0, 18.0, 19.0, 17.0, 20.0, 11.0, 18.0, 19.0, 11.0, 12.0, 17.0, 20.0, 17.0, 15.0, 13.0, 18.0, 14.0, 17.0, 12.0, 12.0, 16.0, 12.0, 14.0, 15.0, 14.0, 10.0, 20.0, 13.0, 18.0, 20.0, 11.0, 20.0, 17.0, 20.0, 13.0, 17.0, 15.0, 18.0, 14.0, 13.0, 13.0, 18.0, 10.0, 13.0, 12.0, 18.0, 20.0, 20.0, 16.0, 18.0, 15.0, 20.0, 22.0, 18.0, 21.0, 18.0, 18.0, 18.0, 17.0, 16.0, 19.0, 16.0, 20.0, 19.0, 19.0, 20.0, 20.0, 14.0, 18.0, 20.0, 20.0, 18.0, 16.0, 21.0, 20.0, 
18.0, 15.0, 14.0, 17.0, 19.0, 21.0, 14.0, 18.0, 15.0, 18.0, 21.0, 19.0, 17.0, 16.0, 16.0, 15.0, 20.0, 19.0, 16.0, 21.0, 17.0, 19.0, 15.0, 18.0, 20.0, 18.0, 20.0, 18.0, 16.0, 16.0]}
# proficiency categories, lowest to highest, and the score edges that delimit them
labels = ['Remedial', 'Below Proficient', 'Proficient', 'Advanced']
bins = [1, 11, 13, 15, np.inf]
# build the wide frame, then swap each numeric score column for its category label
# NOTE: with right=True the first interval is (1, 11], so a score of 1 or less
# matches no bin and becomes NaN
df = pd.DataFrame(data)
df = df.assign(
    BOY=pd.cut(x=df['BOY'], bins=bins, labels=labels, right=True),
    EOY=pd.cut(x=df['EOY'], bins=bins, labels=labels, right=True),
)
# wide -> long: one row per (test period, proficiency level) observation
dfm = pd.melt(df, var_name='Tested', value_name='Proficiency')
# make the column an ordered categorical so the levels keep the order given in `labels`
dfm['Proficiency'] = pd.Categorical(dfm['Proficiency'], categories=labels, ordered=True)
Groupby、百分比计算以及为绘图重塑数据
# count the rows at each proficiency level within each test period
level_counts = dfm.groupby('Tested')['Proficiency'].value_counts()
# move the proficiency level out of the index into a column; the rename covers
# the case where that index level comes back unnamed as 'level_1'
dfg = level_counts.reset_index(level=1, name='Size').rename(columns={'level_1': 'Proficiency'})
# percent of each test period's rows; the denominator counts every row of that period
tested_totals = dfm['Tested'].value_counts()
dfg['percent'] = (dfg['Size'] / tested_totals * 100).round(1)
# reshape for plotting: one row per proficiency level, one column per test period
dfp = dfg.reset_index().pivot(index='Proficiency', columns='Tested', values='percent')
# display(dfp)
Tested BOY EOY
Proficiency
Remedial 34.8 9.9
Below Proficient 28.7 12.7
Proficient 27.1 25.4
Advanced 8.8 51.9
绘图
# grouped bar chart: one group per proficiency level, one bar per test period
ax = dfp.plot(kind='bar', figsize=(15, 5), rot=0, color=['orchid', 'teal'])

# formatting
# the plotted values are already percentages on a 0-100 scale, so the default
# PercentFormatter only appends the '%' sign to the y tick labels
ax.yaxis.set_major_formatter(mtick.PercentFormatter())
ax.set_ylabel('Students at Proficiency Level', fontsize=18)
ax.set_xlabel('')
ax.set_title('Bushwick Middle Change in Proficiency Levels', fontsize=25)
# size the tick labels through tick_params rather than re-setting the label objects
ax.tick_params(axis='x', labelsize=25)
ax.tick_params(axis='y', labelsize=15)
ax.legend(fontsize=25)

# add bar labels: one container per plotted column (Axes.bar_label needs matplotlib >= 3.4)
for container in ax.containers:
    ax.bar_label(container, fmt='%.1f%%', label_type='edge', fontsize=12)

# pad the spacing between the number and the edge of the figure
ax.margins(y=0.2)