import matplotlib.pyplot as plt
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
# Douban book ratings: load the spreadsheet, turn the free-text price column
# into numbers, then plot the rating/price distributions and the publishers of
# the highest-rated books.

# Currency markers and annotations found inside the raw "单价" strings.
# They are removed in this order, so "NTD" must come before "NT"
# (otherwise "NTD" would leave a stray "D" behind).
PRICE_NOISE_TOKENS = (
    "元",
    "CNY",
    "GBP",
    "NTD",
    "NT",
    " 臺幣",
    "圓",
    "$",
    "TWD",
    "(全三册)",
)

# How many of the highest-rated books feed the publisher chart.
TOP_N = 100


def strip_price_units(prices: pd.Series) -> pd.Series:
    """Return *prices* with every token in ``PRICE_NOISE_TOKENS`` removed.

    Args:
        prices: Series of price strings (missing values are passed through
            by the ``.str`` accessor).

    Returns:
        A new Series of strings; the input is not modified.

    ``regex=False`` is passed explicitly because some tokens contain regex
    metacharacters ("$", "(", ")") and pandas versions before 2.0 default to
    regex matching, where "(全三册)" is a capture group and the parentheses
    would be left in the string.
    """
    cleaned = prices
    for token in PRICE_NOISE_TOKENS:
        cleaned = cleaned.str.replace(token, "", regex=False)
    return cleaned


def top_rated(
    frame: pd.DataFrame, n: int = TOP_N, column: str = "评分"
) -> pd.DataFrame:
    """Return the *n* rows of *frame* with the largest values in *column*.

    Rows come back ordered from highest to lowest value; if *frame* has fewer
    than *n* rows, all of them are returned.
    """
    return frame.nlargest(n, column)


if __name__ == "__main__":
    # SimHei is selected so the Chinese column names render in the plots.
    plt.rcParams["font.sans-serif"] = ["SimHei"]

    books = pd.read_excel("豆瓣读书.xlsx")  # load the data
    books.info()
    print(books)

    # reset_index returns a new frame, so the result has to be assigned;
    # drop=True avoids adding the old index as an extra column.
    books = books.reset_index(drop=True)

    # Convert the price column to a numeric dtype. astype raises if a value
    # still contains non-numeric text, which surfaces unhandled formats.
    books["单价"] = strip_price_units(books["单价"]).astype("float")
    books.info()

    # Ratings sorted from highest to lowest (original row labels are kept).
    books_rank = books["评分"].sort_values(ascending=False)
    print(books_rank)

    # Shows which interval most ratings fall into.
    # NOTE(review): distplot is deprecated in newer seaborn releases; switch
    # to displot/histplot once the minimum seaborn version allows it.
    sns.distplot(books["评分"], bins=10, rug=True)

    # Price versus rating.
    sns.relplot(x="单价", y="评分", data=books)

    # Same scatter again without the row(s) carrying the maximum price, so a
    # single extreme value does not compress the rest of the x axis.
    dp_max = books[books["单价"] == books["单价"].max()].index.values
    del_data = books.drop(dp_max)
    sns.relplot(x="单价", y="评分", data=del_data)

    # The previous `books_rank[30]` was a lookup by index *label* (the rating
    # of the row labelled 30), not the rating at rank 30, which is why the
    # row count looked arbitrary. nlargest selects by rank directly.
    top100 = top_rated(books, TOP_N)
    print(top100)

    # Sum of ratings per publisher among the top-rated books.
    sns.catplot(x="出版社", y="评分", data=top100, kind="bar", estimator=sum)

    # Needed for the figures to appear when this runs as a plain script.
    plt.show()