import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
# Load the Beijing PM readings (the file name spans 2010-01-01 to 2015-12-31)
# and preview the first three rows.
csv_path = '/Users/PycharmProjects/newwork/BeijingPM20100101_20151231.csv'
df = pd.read_csv(csv_path)
df.head(3)
No year month day hour season PM_Dongsi PM_Dongsihuan PM_Nongzhanguan PM_US Post DEWP HUMI PRES TEMP cbwd Iws precipitation Iprec
0 1 2010 1 1 0 4 NaN NaN NaN NaN -21.0 43.0 1021.0 -11.0 NW 1.79 0.0 0.0
1 2 2010 1 1 1 4 NaN NaN NaN NaN -21.0 47.0 1020.0 -12.0 NW 4.92 0.0 0.0
2 3 2010 1 1 2 4 NaN NaN NaN NaN -21.0 43.0 1019.0 -11.0 NW 6.71 0.0 0.0
# Merge the separate year/month/day/hour columns into a single hourly index.
# NOTE(review): recent pandas releases deprecate this keyword-field form of the
# constructor in favour of PeriodIndex.from_fields - confirm against the
# installed version before upgrading.
period = pd.PeriodIndex(
    year=df['year'],
    month=df['month'],
    day=df['day'],
    hour=df['hour'],
    freq='H',
)
type(period)
pandas.core.indexes.period.PeriodIndex
# Show the combined hourly index; pandas abbreviates the middle of a long index.
print(period)
PeriodIndex(['2010-01-01 00:00', '2010-01-01 01:00', '2010-01-01 02:00',
             '2010-01-01 03:00', '2010-01-01 04:00', '2010-01-01 05:00',
             '2010-01-01 06:00', '2010-01-01 07:00', '2010-01-01 08:00',
             '2010-01-01 09:00',
             ...
             '2015-12-31 14:00', '2015-12-31 15:00', '2015-12-31 16:00',
             '2015-12-31 17:00', '2015-12-31 18:00', '2015-12-31 19:00',
             '2015-12-31 20:00', '2015-12-31 21:00', '2015-12-31 22:00',
             '2015-12-31 23:00'],
            dtype='period[H]', length=52584, freq='H')
# Attach the combined hourly periods as a new 'datatime' column so every row
# carries one unified time value, then preview the first five rows.
df['datatime'] = period
df.head(5)
No year month day hour season PM_Dongsi PM_Dongsihuan PM_Nongzhanguan PM_US Post DEWP HUMI PRES TEMP cbwd Iws precipitation Iprec datatime
0 1 2010 1 1 0 4 NaN NaN NaN NaN -21.0 43.0 1021.0 -11.0 NW 1.79 0.0 0.0 2010-01-01 00:00
1 2 2010 1 1 1 4 NaN NaN NaN NaN -21.0 47.0 1020.0 -12.0 NW 4.92 0.0 0.0 2010-01-01 01:00
2 3 2010 1 1 2 4 NaN NaN NaN NaN -21.0 43.0 1019.0 -11.0 NW 6.71 0.0 0.0 2010-01-01 02:00
3 4 2010 1 1 3 4 NaN NaN NaN NaN -21.0 55.0 1019.0 -14.0 NW 9.84 0.0 0.0 2010-01-01 03:00
4 5 2010 1 1 4 4 NaN NaN NaN NaN -20.0 51.0 1018.0 -12.0 NW 12.97 0.0 0.0 2010-01-01 04:00
df.set_index('datatime',inplace=True)    # make 'datatime' the row index (modifies df in place)
df.head(5)
No year month day hour season PM_Dongsi PM_Dongsihuan PM_Nongzhanguan PM_US Post DEWP HUMI PRES TEMP cbwd Iws precipitation Iprec
datatime
2010-01-01 00:00 1 2010 1 1 0 4 NaN NaN NaN NaN -21.0 43.0 1021.0 -11.0 NW 1.79 0.0 0.0
2010-01-01 01:00 2 2010 1 1 1 4 NaN NaN NaN NaN -21.0 47.0 1020.0 -12.0 NW 4.92 0.0 0.0
2010-01-01 02:00 3 2010 1 1 2 4 NaN NaN NaN NaN -21.0 43.0 1019.0 -11.0 NW 6.71 0.0 0.0
2010-01-01 03:00 4 2010 1 1 3 4 NaN NaN NaN NaN -21.0 55.0 1019.0 -14.0 NW 9.84 0.0 0.0
2010-01-01 04:00 5 2010 1 1 4 4 NaN NaN NaN NaN -20.0 51.0 1018.0 -12.0 NW 12.97 0.0 0.0
# Plot every non-missing hourly 'PM_US Post' reading against its position in
# the series (the x axis is a plain 0..n-1 counter, not the timestamps).
data = df['PM_US Post'].dropna()
_x, _y = data.index, data.values
plt.figure(figsize=(20, 8), dpi=80)
plt.plot(range(len(_x)), _y)
# The hourly series is too long to analyse comfortably, so down-sample it to
# 7-day means (mean() skips NaN values).
#
# Resampling a PeriodIndex with a multi-day rule does not bin correctly: the
# output previously shown for this cell has one populated row followed by
# all-NaN rows. Recent pandas also deprecates resampling on a PeriodIndex.
# Cast the index to timestamps first; the isinstance guard keeps this cell
# working if the index is already datetime-based.
if isinstance(df.index, pd.PeriodIndex):
    df.index = df.index.to_timestamp()
# Average numeric columns only: the string column 'cbwd' cannot be averaged
# and makes mean() raise on pandas >= 2.0.
df = df.select_dtypes(include='number').resample('7D').mean()
df.head(5)
No year month day hour season PM_Dongsi PM_Dongsihuan PM_Nongzhanguan PM_US Post DEWP HUMI PRES TEMP Iws precipitation Iprec
datatime
2010-01-01 372.5 2010.0 1.0 16.0 11.5 4.0 NaN NaN NaN 90.40367 -17.013441 46.449597 1028.009409 -6.162634 41.227325 0.015054 0.177688
2010-01-08 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2010-01-15 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2010-01-22 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2010-01-29 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
# Re-extract 'PM_US Post' from the down-sampled frame, dropping empty bins,
# and open a wide figure for it.
data = df['PM_US Post'].dropna()
_x, _y = data.index, data.values
plt.figure(figsize=(20, 8), dpi=80)
<Figure size 1600x640 with 0 Axes>




<Figure size 1600x640 with 0 Axes>
# Size the figure in the same cell that draws on it. The figure opened in the
# previous cell is displayed empty ("0 Axes") in a notebook, so without this the
# plot would land on a default-sized figure. plt.gcf() reuses the current
# figure when one is still open (plain-script run) and creates one otherwise.
fig = plt.gcf()
fig.set_size_inches(20, 8)
fig.set_dpi(80)
plt.plot(range(len(_x)),_y)
# Label every 10th point with its time bin, rotated so the labels do not overlap.
plt.xticks(range(0,len(_x),10),list(_x)[::10],rotation=45)
plt.show()

pandas练习

两组数据对比:

# Compare two PM series - 'PM_US Post' and 'PM_Nongzhanguan' - as 7-day means.
df = pd.read_csv('/Users/marvinking/PycharmProjects/newwork/BeijingPM20100101_20151231.csv')

# Assemble one timestamp per row from the year/month/day/hour columns.
# A DatetimeIndex is used instead of a PeriodIndex because resampling a
# PeriodIndex with a multi-day rule does not bin correctly and is deprecated in
# recent pandas, as is the PeriodIndex(year=..., month=...) constructor.
df['datatime'] = pd.to_datetime(df[['year', 'month', 'day', 'hour']])
df.set_index('datatime', inplace=True)

# Average numeric columns only: the string column 'cbwd' cannot be averaged and
# makes mean() raise on pandas >= 2.0. mean() skips NaN; a bin with no readings
# stays NaN and shows up as a gap in the line.
df = df.select_dtypes(include='number').resample('7D').mean()
data = df['PM_US Post']
data_cn = df['PM_Nongzhanguan']

# Both series come from the same resampled frame, so they share one index;
# the labels are formatted as YYYYMMDD strings for the x axis.
_x = [i.strftime("%Y%m%d") for i in data.index]
_x_china = [i.strftime("%Y%m%d") for i in data_cn.index]
_y = data.values
_y_china = data_cn.values

plt.figure(figsize=(20, 8), dpi=80)
plt.plot(range(len(_x)), _y, label="US_POST", alpha=0.7)
plt.plot(range(len(_x_china)), _y_china, label="CN_POST", alpha=0.7)

# Label every 10th weekly bin, rotated so the dates do not overlap.
plt.xticks(range(0, len(_x_china), 10), _x_china[::10], rotation=45)

plt.legend(loc="best")

plt.show()

pandas练习


相关文章: