import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
s = pd.Series([1,3,5,np.nan,6,8])
s
0 1.0
1 3.0
2 5.0
3 NaN
4 6.0
5 8.0
dtype: float64
len(s)
6
s.describe()
count 5.000000
mean 4.600000
std 2.701851
min 1.000000
25% 3.000000
50% 5.000000
75% 6.000000
max 8.000000
dtype: float64
dates = pd.date_range('20190101',periods=6)
dates
DatetimeIndex(['2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04',
'2019-01-05', '2019-01-06'],
dtype='datetime64[ns]', freq='D')
df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=list('ABCD'))
df
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
| 2019-01-01 |
0.317575 |
0.330549 |
0.055375 |
-1.284453 |
| 2019-01-02 |
-0.272201 |
-0.770618 |
0.217658 |
1.349425 |
| 2019-01-03 |
0.842686 |
-0.973354 |
0.596166 |
0.215889 |
| 2019-01-04 |
0.252868 |
-0.778050 |
-0.324255 |
-1.378140 |
| 2019-01-05 |
0.281876 |
1.143140 |
-0.781625 |
-1.245069 |
| 2019-01-06 |
-0.468258 |
-1.682376 |
1.494058 |
1.262588 |
df.describe()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
| count |
6.000000 |
6.000000 |
6.000000 |
6.000000 |
| mean |
0.159091 |
-0.455118 |
0.209563 |
-0.179960 |
| std |
0.468160 |
1.014914 |
0.786212 |
1.293573 |
| min |
-0.468258 |
-1.682376 |
-0.781625 |
-1.378140 |
| 25% |
-0.140934 |
-0.924528 |
-0.229347 |
-1.274607 |
| 50% |
0.267372 |
-0.774334 |
0.136517 |
-0.514590 |
| 75% |
0.308650 |
0.055257 |
0.501539 |
1.000913 |
| max |
0.842686 |
1.143140 |
1.494058 |
1.349425 |
df.shape
(6, 4)
df2 = pd.DataFrame({'A':1.,
'B':pd.Timestamp('20190101'),
'C':pd.Series(1,index=list(range(4)),dtype='float32'),
'D':np.array([3]*4,dtype='int32'),
'E':pd.Categorical(["test","train","test","train"]),
'F':'foo'})
df2
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
E |
F |
| 0 |
1.0 |
2019-01-01 |
1.0 |
3 |
test |
foo |
| 1 |
1.0 |
2019-01-01 |
1.0 |
3 |
train |
foo |
| 2 |
1.0 |
2019-01-01 |
1.0 |
3 |
test |
foo |
| 3 |
1.0 |
2019-01-01 |
1.0 |
3 |
train |
foo |
df2.dtypes
A float64
B datetime64[ns]
C float32
D int32
E category
F object
dtype: object
df.head()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
| 2019-01-01 |
0.317575 |
0.330549 |
0.055375 |
-1.284453 |
| 2019-01-02 |
-0.272201 |
-0.770618 |
0.217658 |
1.349425 |
| 2019-01-03 |
0.842686 |
-0.973354 |
0.596166 |
0.215889 |
| 2019-01-04 |
0.252868 |
-0.778050 |
-0.324255 |
-1.378140 |
| 2019-01-05 |
0.281876 |
1.143140 |
-0.781625 |
-1.245069 |
df.tail()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
| 2019-01-02 |
-0.272201 |
-0.770618 |
0.217658 |
1.349425 |
| 2019-01-03 |
0.842686 |
-0.973354 |
0.596166 |
0.215889 |
| 2019-01-04 |
0.252868 |
-0.778050 |
-0.324255 |
-1.378140 |
| 2019-01-05 |
0.281876 |
1.143140 |
-0.781625 |
-1.245069 |
| 2019-01-06 |
-0.468258 |
-1.682376 |
1.494058 |
1.262588 |
df.index
DatetimeIndex(['2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04',
'2019-01-05', '2019-01-06'],
dtype='datetime64[ns]', freq='D')
df.columns
Index(['A', 'B', 'C', 'D'], dtype='object')
df.values
array([[ 0.31757454, 0.33054893, 0.05537508, -1.28445319],
[-0.27220143, -0.77061807, 0.21765843, 1.34942538],
[ 0.84268621, -0.97335385, 0.59616646, 0.21588867],
[ 0.25286828, -0.77804969, -0.32425479, -1.37813964],
[ 0.28187609, 1.14314031, -0.78162546, -1.24506887],
[-0.4682577 , -1.68237556, 1.49405812, 1.26258772]])
df.describe()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
| count |
6.000000 |
6.000000 |
6.000000 |
6.000000 |
| mean |
0.159091 |
-0.455118 |
0.209563 |
-0.179960 |
| std |
0.468160 |
1.014914 |
0.786212 |
1.293573 |
| min |
-0.468258 |
-1.682376 |
-0.781625 |
-1.378140 |
| 25% |
-0.140934 |
-0.924528 |
-0.229347 |
-1.274607 |
| 50% |
0.267372 |
-0.774334 |
0.136517 |
-0.514590 |
| 75% |
0.308650 |
0.055257 |
0.501539 |
1.000913 |
| max |
0.842686 |
1.143140 |
1.494058 |
1.349425 |
df2.describe(include='all')
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
E |
F |
| count |
4.0 |
4 |
4.0 |
4.0 |
4 |
4 |
| unique |
NaN |
1 |
NaN |
NaN |
2 |
1 |
| top |
NaN |
2019-01-01 00:00:00 |
NaN |
NaN |
train |
foo |
| freq |
NaN |
4 |
NaN |
NaN |
2 |
4 |
| first |
NaN |
2019-01-01 00:00:00 |
NaN |
NaN |
NaN |
NaN |
| last |
NaN |
2019-01-01 00:00:00 |
NaN |
NaN |
NaN |
NaN |
| mean |
1.0 |
NaN |
1.0 |
3.0 |
NaN |
NaN |
| std |
0.0 |
NaN |
0.0 |
0.0 |
NaN |
NaN |
| min |
1.0 |
NaN |
1.0 |
3.0 |
NaN |
NaN |
| 25% |
1.0 |
NaN |
1.0 |
3.0 |
NaN |
NaN |
| 50% |
1.0 |
NaN |
1.0 |
3.0 |
NaN |
NaN |
| 75% |
1.0 |
NaN |
1.0 |
3.0 |
NaN |
NaN |
| max |
1.0 |
NaN |
1.0 |
3.0 |
NaN |
NaN |
df.T
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
2019-01-01 00:00:00 |
2019-01-02 00:00:00 |
2019-01-03 00:00:00 |
2019-01-04 00:00:00 |
2019-01-05 00:00:00 |
2019-01-06 00:00:00 |
| A |
0.317575 |
-0.272201 |
0.842686 |
0.252868 |
0.281876 |
-0.468258 |
| B |
0.330549 |
-0.770618 |
-0.973354 |
-0.778050 |
1.143140 |
-1.682376 |
| C |
0.055375 |
0.217658 |
0.596166 |
-0.324255 |
-0.781625 |
1.494058 |
| D |
-1.284453 |
1.349425 |
0.215889 |
-1.378140 |
-1.245069 |
1.262588 |
df.sort_index(axis=1,ascending=True)
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
| 2019-01-01 |
0.317575 |
0.330549 |
0.055375 |
-1.284453 |
| 2019-01-02 |
-0.272201 |
-0.770618 |
0.217658 |
1.349425 |
| 2019-01-03 |
0.842686 |
-0.973354 |
0.596166 |
0.215889 |
| 2019-01-04 |
0.252868 |
-0.778050 |
-0.324255 |
-1.378140 |
| 2019-01-05 |
0.281876 |
1.143140 |
-0.781625 |
-1.245069 |
| 2019-01-06 |
-0.468258 |
-1.682376 |
1.494058 |
1.262588 |
df.sort_values(by='B')
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
| 2019-01-06 |
-0.468258 |
-1.682376 |
1.494058 |
1.262588 |
| 2019-01-03 |
0.842686 |
-0.973354 |
0.596166 |
0.215889 |
| 2019-01-04 |
0.252868 |
-0.778050 |
-0.324255 |
-1.378140 |
| 2019-01-02 |
-0.272201 |
-0.770618 |
0.217658 |
1.349425 |
| 2019-01-01 |
0.317575 |
0.330549 |
0.055375 |
-1.284453 |
| 2019-01-05 |
0.281876 |
1.143140 |
-0.781625 |
-1.245069 |
df['A']
2019-01-01 0.317575
2019-01-02 -0.272201
2019-01-03 0.842686
2019-01-04 0.252868
2019-01-05 0.281876
2019-01-06 -0.468258
Freq: D, Name: A, dtype: float64
df[0:3]
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
| 2019-01-01 |
0.317575 |
0.330549 |
0.055375 |
-1.284453 |
| 2019-01-02 |
-0.272201 |
-0.770618 |
0.217658 |
1.349425 |
| 2019-01-03 |
0.842686 |
-0.973354 |
0.596166 |
0.215889 |
df['20190101':'20190103']
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
| 2019-01-01 |
0.317575 |
0.330549 |
0.055375 |
-1.284453 |
| 2019-01-02 |
-0.272201 |
-0.770618 |
0.217658 |
1.349425 |
| 2019-01-03 |
0.842686 |
-0.973354 |
0.596166 |
0.215889 |
df.loc[dates[0]]
A 0.317575
B 0.330549
C 0.055375
D -1.284453
Name: 2019-01-01 00:00:00, dtype: float64
dates[0]
Timestamp('2019-01-01 00:00:00', freq='D')
df.loc[:,['A','B']]
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
| 2019-01-01 |
0.317575 |
0.330549 |
| 2019-01-02 |
-0.272201 |
-0.770618 |
| 2019-01-03 |
0.842686 |
-0.973354 |
| 2019-01-04 |
0.252868 |
-0.778050 |
| 2019-01-05 |
0.281876 |
1.143140 |
| 2019-01-06 |
-0.468258 |
-1.682376 |
df.loc['20190104',['A','B']].shape
(2,)
df.loc[dates[0],'A']
0.3175745430028141
df.at[dates[0],'A']
0.3175745430028141
df.at[dates[0],'A']
df.iat[1,2]
0.21765843113729494
df.iloc[3]
A 0.252868
B -0.778050
C -0.324255
D -1.378140
Name: 2019-01-04 00:00:00, dtype: float64
df.iloc[3,1]
-0.7780496857268889
df.iloc[[1,2,4],[0,2]]
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
C |
| 2019-01-02 |
-0.272201 |
0.217658 |
| 2019-01-03 |
0.842686 |
0.596166 |
| 2019-01-05 |
0.281876 |
-0.781625 |
df.iloc[1:3,:]
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
| 2019-01-02 |
-0.272201 |
-0.770618 |
0.217658 |
1.349425 |
| 2019-01-03 |
0.842686 |
-0.973354 |
0.596166 |
0.215889 |
df.iloc[1:3]
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
| 2019-01-02 |
-0.272201 |
-0.770618 |
0.217658 |
1.349425 |
| 2019-01-03 |
0.842686 |
-0.973354 |
0.596166 |
0.215889 |
df[df.A>0]
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
| 2019-01-01 |
0.317575 |
0.330549 |
0.055375 |
-1.284453 |
| 2019-01-03 |
0.842686 |
-0.973354 |
0.596166 |
0.215889 |
| 2019-01-04 |
0.252868 |
-0.778050 |
-0.324255 |
-1.378140 |
| 2019-01-05 |
0.281876 |
1.143140 |
-0.781625 |
-1.245069 |
df[df>0]
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
| 2019-01-01 |
0.317575 |
0.330549 |
0.055375 |
NaN |
| 2019-01-02 |
NaN |
NaN |
0.217658 |
1.349425 |
| 2019-01-03 |
0.842686 |
NaN |
0.596166 |
0.215889 |
| 2019-01-04 |
0.252868 |
NaN |
NaN |
NaN |
| 2019-01-05 |
0.281876 |
1.143140 |
NaN |
NaN |
| 2019-01-06 |
NaN |
NaN |
1.494058 |
1.262588 |
df3 = df.copy()
df3['E']=['one','two','three','four','three','one']
df3
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
E |
| 2019-01-01 |
0.317575 |
0.330549 |
0.055375 |
-1.284453 |
one |
| 2019-01-02 |
-0.272201 |
-0.770618 |
0.217658 |
1.349425 |
two |
| 2019-01-03 |
0.842686 |
-0.973354 |
0.596166 |
0.215889 |
three |
| 2019-01-04 |
0.252868 |
-0.778050 |
-0.324255 |
-1.378140 |
four |
| 2019-01-05 |
0.281876 |
1.143140 |
-0.781625 |
-1.245069 |
three |
| 2019-01-06 |
-0.468258 |
-1.682376 |
1.494058 |
1.262588 |
one |
df3[df3['E'].isin(['one','two'])]
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
E |
| 2019-01-01 |
0.317575 |
0.330549 |
0.055375 |
-1.284453 |
one |
| 2019-01-02 |
-0.272201 |
-0.770618 |
0.217658 |
1.349425 |
two |
| 2019-01-06 |
-0.468258 |
-1.682376 |
1.494058 |
1.262588 |
one |
s1= pd.Series([1,2,3,4,5,6],index=pd.date_range('20190102',periods=6))
s1
2019-01-02 1
2019-01-03 2
2019-01-04 3
2019-01-05 4
2019-01-06 5
2019-01-07 6
Freq: D, dtype: int64
df['F']=s1
df
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
F |
| 2019-01-01 |
0.317575 |
0.330549 |
0.055375 |
-1.284453 |
NaN |
| 2019-01-02 |
-0.272201 |
-0.770618 |
0.217658 |
1.349425 |
1.0 |
| 2019-01-03 |
0.842686 |
-0.973354 |
0.596166 |
0.215889 |
2.0 |
| 2019-01-04 |
0.252868 |
-0.778050 |
-0.324255 |
-1.378140 |
3.0 |
| 2019-01-05 |
0.281876 |
1.143140 |
-0.781625 |
-1.245069 |
4.0 |
| 2019-01-06 |
-0.468258 |
-1.682376 |
1.494058 |
1.262588 |
5.0 |
df.at[dates[0],'A']=0
df.iat[0,1]=0
df.loc[:,'D']=np.array([5]*len(df))
df
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
F |
| 2019-01-01 |
0.000000 |
0.000000 |
0.055375 |
5 |
NaN |
| 2019-01-02 |
-0.272201 |
-0.770618 |
0.217658 |
5 |
1.0 |
| 2019-01-03 |
0.842686 |
-0.973354 |
0.596166 |
5 |
2.0 |
| 2019-01-04 |
0.252868 |
-0.778050 |
-0.324255 |
5 |
3.0 |
| 2019-01-05 |
0.281876 |
1.143140 |
-0.781625 |
5 |
4.0 |
| 2019-01-06 |
-0.468258 |
-1.682376 |
1.494058 |
5 |
5.0 |
df4 = df.copy()
df4[df4>0]=-df4
df4
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
F |
| 2019-01-01 |
0.000000 |
0.000000 |
-0.055375 |
-5 |
NaN |
| 2019-01-02 |
-0.272201 |
-0.770618 |
-0.217658 |
-5 |
-1.0 |
| 2019-01-03 |
-0.842686 |
-0.973354 |
-0.596166 |
-5 |
-2.0 |
| 2019-01-04 |
-0.252868 |
-0.778050 |
-0.324255 |
-5 |
-3.0 |
| 2019-01-05 |
-0.281876 |
-1.143140 |
-0.781625 |
-5 |
-4.0 |
| 2019-01-06 |
-0.468258 |
-1.682376 |
-1.494058 |
-5 |
-5.0 |
df1 = df.reindex(index=dates[0:4],columns=list(df.columns)+['E'])
df1.loc[dates[0]:dates[1],'E']=1
df1
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
F |
E |
| 2019-01-01 |
0.000000 |
0.000000 |
0.055375 |
5 |
NaN |
1.0 |
| 2019-01-02 |
-0.272201 |
-0.770618 |
0.217658 |
5 |
1.0 |
1.0 |
| 2019-01-03 |
0.842686 |
-0.973354 |
0.596166 |
5 |
2.0 |
NaN |
| 2019-01-04 |
0.252868 |
-0.778050 |
-0.324255 |
5 |
3.0 |
NaN |
df1.dropna(how='any')
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
F |
E |
| 2019-01-02 |
-0.272201 |
-0.770618 |
0.217658 |
5 |
1.0 |
1.0 |
df1.fillna(value=5)
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
F |
E |
| 2019-01-01 |
0.000000 |
0.000000 |
0.055375 |
5 |
5.0 |
1.0 |
| 2019-01-02 |
-0.272201 |
-0.770618 |
0.217658 |
5 |
1.0 |
1.0 |
| 2019-01-03 |
0.842686 |
-0.973354 |
0.596166 |
5 |
2.0 |
5.0 |
| 2019-01-04 |
0.252868 |
-0.778050 |
-0.324255 |
5 |
3.0 |
5.0 |
pd.isna(df1)
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
F |
E |
| 2019-01-01 |
False |
False |
False |
False |
True |
False |
| 2019-01-02 |
False |
False |
False |
False |
False |
False |
| 2019-01-03 |
False |
False |
False |
False |
False |
True |
| 2019-01-04 |
False |
False |
False |
False |
False |
True |
df.mean()
A 0.106162
B -0.510209
C 0.209563
D 5.000000
F 3.000000
dtype: float64
df.mean(1)
2019-01-01 1.263844
2019-01-02 1.034968
2019-01-03 1.493100
2019-01-04 1.430113
2019-01-05 1.928678
2019-01-06 1.868685
Freq: D, dtype: float64
s = pd.Series([1,3,5,np.nan,6,8],index=dates).shift(2)
s
2019-01-01 NaN
2019-01-02 NaN
2019-01-03 1.0
2019-01-04 3.0
2019-01-05 5.0
2019-01-06 NaN
Freq: D, dtype: float64
df.sub(s,axis='index')
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
F |
| 2019-01-01 |
NaN |
NaN |
NaN |
NaN |
NaN |
| 2019-01-02 |
NaN |
NaN |
NaN |
NaN |
NaN |
| 2019-01-03 |
-0.157314 |
-1.973354 |
-0.403834 |
4.0 |
1.0 |
| 2019-01-04 |
-2.747132 |
-3.778050 |
-3.324255 |
2.0 |
0.0 |
| 2019-01-05 |
-4.718124 |
-3.856860 |
-5.781625 |
0.0 |
-1.0 |
| 2019-01-06 |
NaN |
NaN |
NaN |
NaN |
NaN |
df
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
F |
| 2019-01-01 |
0.000000 |
0.000000 |
0.055375 |
5 |
NaN |
| 2019-01-02 |
-0.272201 |
-0.770618 |
0.217658 |
5 |
1.0 |
| 2019-01-03 |
0.842686 |
-0.973354 |
0.596166 |
5 |
2.0 |
| 2019-01-04 |
0.252868 |
-0.778050 |
-0.324255 |
5 |
3.0 |
| 2019-01-05 |
0.281876 |
1.143140 |
-0.781625 |
5 |
4.0 |
| 2019-01-06 |
-0.468258 |
-1.682376 |
1.494058 |
5 |
5.0 |
df.apply(np.cumsum)
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
F |
| 2019-01-01 |
0.000000 |
0.000000 |
0.055375 |
5 |
NaN |
| 2019-01-02 |
-0.272201 |
-0.770618 |
0.273034 |
10 |
1.0 |
| 2019-01-03 |
0.570485 |
-1.743972 |
0.869200 |
15 |
3.0 |
| 2019-01-04 |
0.823353 |
-2.522022 |
0.544945 |
20 |
6.0 |
| 2019-01-05 |
1.105229 |
-1.378881 |
-0.236680 |
25 |
10.0 |
| 2019-01-06 |
0.636971 |
-3.061257 |
1.257378 |
30 |
15.0 |
df.apply(lambda x:x.max()-x.min())
A 1.310944
B 2.825516
C 2.275684
D 0.000000
F 4.000000
dtype: float64
df['F'].value_counts()
5.0 1
4.0 1
3.0 1
2.0 1
1.0 1
Name: F, dtype: int64
s = pd.Series(np.random.randint(0,7,size=10))
s
0 1
1 1
2 1
3 2
4 3
5 2
6 6
7 0
8 4
9 6
dtype: int32
s.dtype
dtype('int32')
s.shape
(10,)
s.value_counts()
1 3
6 2
2 2
4 1
3 1
0 1
dtype: int64
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
s.dtype
dtype('O')
s.str
<pandas.core.strings.StringMethods at 0x22f0313d160>
s.str.lower()
0 a
1 b
2 c
3 aaba
4 baca
5 NaN
6 caba
7 dog
8 cat
dtype: object
df = pd.DataFrame(np.random.randn(10,4))
df
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
0 |
1 |
2 |
3 |
| 0 |
-0.936960 |
-0.210650 |
1.887069 |
0.128005 |
| 1 |
0.657660 |
0.254801 |
-0.092190 |
-1.957160 |
| 2 |
-0.920141 |
-2.259964 |
0.083965 |
0.371865 |
| 3 |
-0.755403 |
0.633426 |
0.090949 |
-0.626964 |
| 4 |
0.170052 |
1.164537 |
-1.193706 |
1.391785 |
| 5 |
0.084560 |
-1.513814 |
0.069032 |
0.099851 |
| 6 |
-0.683663 |
0.031873 |
-0.062998 |
0.523253 |
| 7 |
-0.926594 |
0.125286 |
-1.894089 |
-0.449402 |
| 8 |
0.610722 |
0.329156 |
0.025149 |
-2.673445 |
| 9 |
0.336673 |
1.205792 |
-1.346179 |
0.214389 |
pieces = [df[:3],df[3:7],df[7:]]
pieces
[ 0 1 2 3
0 -0.936960 -0.210650 1.887069 0.128005
1 0.657660 0.254801 -0.092190 -1.957160
2 -0.920141 -2.259964 0.083965 0.371865,
0 1 2 3
3 -0.755403 0.633426 0.090949 -0.626964
4 0.170052 1.164537 -1.193706 1.391785
5 0.084560 -1.513814 0.069032 0.099851
6 -0.683663 0.031873 -0.062998 0.523253,
0 1 2 3
7 -0.926594 0.125286 -1.894089 -0.449402
8 0.610722 0.329156 0.025149 -2.673445
9 0.336673 1.205792 -1.346179 0.214389]
pd.concat(pieces)
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
0 |
1 |
2 |
3 |
| 0 |
-0.936960 |
-0.210650 |
1.887069 |
0.128005 |
| 1 |
0.657660 |
0.254801 |
-0.092190 |
-1.957160 |
| 2 |
-0.920141 |
-2.259964 |
0.083965 |
0.371865 |
| 3 |
-0.755403 |
0.633426 |
0.090949 |
-0.626964 |
| 4 |
0.170052 |
1.164537 |
-1.193706 |
1.391785 |
| 5 |
0.084560 |
-1.513814 |
0.069032 |
0.099851 |
| 6 |
-0.683663 |
0.031873 |
-0.062998 |
0.523253 |
| 7 |
-0.926594 |
0.125286 |
-1.894089 |
-0.449402 |
| 8 |
0.610722 |
0.329156 |
0.025149 |
-2.673445 |
| 9 |
0.336673 |
1.205792 |
-1.346179 |
0.214389 |
concat
- axis=1列合并;axis=0行合并(default)
- join='inner'只保留所有对象共有的索引/列;join='outer'返回合并后的所有索引/列(默认)
- ignore_index=True会自动连续生成索引值;ignore_index=False则不改变原有的索引值
- verify_integrity会检查当前合并是否重复索引
- keys会为拼接的各个对象生成分层(多级)索引,便于区分数据来源
- names会为keys生成的多级索引的各层级命名。
pd.concat(pieces,axis=1)
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
0 |
1 |
2 |
3 |
0 |
1 |
2 |
3 |
0 |
1 |
2 |
3 |
| 0 |
-0.936960 |
-0.210650 |
1.887069 |
0.128005 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
| 1 |
0.657660 |
0.254801 |
-0.092190 |
-1.957160 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
| 2 |
-0.920141 |
-2.259964 |
0.083965 |
0.371865 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
| 3 |
NaN |
NaN |
NaN |
NaN |
-0.755403 |
0.633426 |
0.090949 |
-0.626964 |
NaN |
NaN |
NaN |
NaN |
| 4 |
NaN |
NaN |
NaN |
NaN |
0.170052 |
1.164537 |
-1.193706 |
1.391785 |
NaN |
NaN |
NaN |
NaN |
| 5 |
NaN |
NaN |
NaN |
NaN |
0.084560 |
-1.513814 |
0.069032 |
0.099851 |
NaN |
NaN |
NaN |
NaN |
| 6 |
NaN |
NaN |
NaN |
NaN |
-0.683663 |
0.031873 |
-0.062998 |
0.523253 |
NaN |
NaN |
NaN |
NaN |
| 7 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
-0.926594 |
0.125286 |
-1.894089 |
-0.449402 |
| 8 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
0.610722 |
0.329156 |
0.025149 |
-2.673445 |
| 9 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
0.336673 |
1.205792 |
-1.346179 |
0.214389 |
pd.concat(pieces,join='outer',axis=1)
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
0 |
1 |
2 |
3 |
0 |
1 |
2 |
3 |
0 |
1 |
2 |
3 |
| 0 |
-0.936960 |
-0.210650 |
1.887069 |
0.128005 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
| 1 |
0.657660 |
0.254801 |
-0.092190 |
-1.957160 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
| 2 |
-0.920141 |
-2.259964 |
0.083965 |
0.371865 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
| 3 |
NaN |
NaN |
NaN |
NaN |
-0.755403 |
0.633426 |
0.090949 |
-0.626964 |
NaN |
NaN |
NaN |
NaN |
| 4 |
NaN |
NaN |
NaN |
NaN |
0.170052 |
1.164537 |
-1.193706 |
1.391785 |
NaN |
NaN |
NaN |
NaN |
| 5 |
NaN |
NaN |
NaN |
NaN |
0.084560 |
-1.513814 |
0.069032 |
0.099851 |
NaN |
NaN |
NaN |
NaN |
| 6 |
NaN |
NaN |
NaN |
NaN |
-0.683663 |
0.031873 |
-0.062998 |
0.523253 |
NaN |
NaN |
NaN |
NaN |
| 7 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
-0.926594 |
0.125286 |
-1.894089 |
-0.449402 |
| 8 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
0.610722 |
0.329156 |
0.025149 |
-2.673445 |
| 9 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
0.336673 |
1.205792 |
-1.346179 |
0.214389 |
left = pd.DataFrame({'key':['foo','bar'],'lval':[1,2]})
right = pd.DataFrame({'key':['foo','bar'],'rval':[4,5]})
left
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
right
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
pd.merge(left,right,on='key')
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
key |
lval |
rval |
| 0 |
foo |
1 |
4 |
| 1 |
bar |
2 |
5 |
left = pd.DataFrame({'key':['foo','foo'],'lval':[1,2]})
right = pd.DataFrame({'key':['foo','foo'],'rval':[4,5]})
left
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
right
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
pd.merge(left,right,on='key')
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
key |
lval |
rval |
| 0 |
foo |
1 |
4 |
| 1 |
foo |
1 |
5 |
| 2 |
foo |
2 |
4 |
| 3 |
foo |
2 |
5 |
df = pd.DataFrame(np.random.randn(8,4),columns=list('ABCD'))
df
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
| 0 |
0.862756 |
-1.436692 |
0.367396 |
1.033803 |
| 1 |
-0.732357 |
-0.350199 |
-0.116083 |
-2.435210 |
| 2 |
0.316582 |
0.468616 |
0.433001 |
-0.443120 |
| 3 |
-0.189939 |
-2.437137 |
0.126893 |
-2.273711 |
| 4 |
0.913514 |
-0.752727 |
-1.651140 |
1.156839 |
| 5 |
-0.314581 |
1.296585 |
0.579130 |
-0.871556 |
| 6 |
0.361473 |
0.687854 |
-1.044602 |
0.233138 |
| 7 |
0.045199 |
2.176608 |
-0.258569 |
-1.018576 |
s = df.iloc[3]
df.append(s,ignore_index=False)
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
| 0 |
0.862756 |
-1.436692 |
0.367396 |
1.033803 |
| 1 |
-0.732357 |
-0.350199 |
-0.116083 |
-2.435210 |
| 2 |
0.316582 |
0.468616 |
0.433001 |
-0.443120 |
| 3 |
-0.189939 |
-2.437137 |
0.126893 |
-2.273711 |
| 4 |
0.913514 |
-0.752727 |
-1.651140 |
1.156839 |
| 5 |
-0.314581 |
1.296585 |
0.579130 |
-0.871556 |
| 6 |
0.361473 |
0.687854 |
-1.044602 |
0.233138 |
| 7 |
0.045199 |
2.176608 |
-0.258569 |
-1.018576 |
| 3 |
-0.189939 |
-2.437137 |
0.126893 |
-2.273711 |
“分组依据”是指涉及以下一个或多个步骤的过程:
根据某些标准将数据分成组
独立地将函数应用于每个组
将结果组合成数据结构
df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
....: 'foo', 'bar', 'foo', 'foo'],
....: 'B' : ['one', 'one', 'two', 'three',
....: 'two', 'two', 'one', 'three'],
....: 'C' : np.random.randn(8),
....: 'D' : np.random.randn(8)})
df
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
| 0 |
foo |
one |
0.982988 |
-0.386029 |
| 1 |
bar |
one |
1.632482 |
-0.327520 |
| 2 |
foo |
two |
0.149619 |
-0.138297 |
| 3 |
bar |
three |
-1.480397 |
1.105690 |
| 4 |
foo |
two |
0.647044 |
-1.097276 |
| 5 |
bar |
two |
-0.675596 |
0.250176 |
| 6 |
foo |
one |
0.437309 |
1.031742 |
| 7 |
foo |
three |
0.434659 |
1.197695 |
df.groupby('A').sum()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
C |
D |
| A |
|
|
| bar |
-0.523512 |
1.028346 |
| foo |
2.651618 |
0.607834 |
df.groupby('A').max()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
B |
C |
D |
| A |
|
|
|
| bar |
two |
1.632482 |
1.105690 |
| foo |
two |
0.982988 |
1.197695 |
df.groupby(['A','B']).sum()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
|
C |
D |
| A |
B |
|
|
| bar |
one |
1.632482 |
-0.327520 |
| three |
-1.480397 |
1.105690 |
| two |
-0.675596 |
0.250176 |
| foo |
one |
1.420297 |
0.645713 |
| three |
0.434659 |
1.197695 |
| two |
0.796663 |
-1.235574 |
tuples = list(zip(*[['bar', 'bar', 'baz', 'baz',
....: 'foo', 'foo', 'qux', 'qux'],
....: ['one', 'two', 'one', 'two',
....: 'one', 'two', 'one', 'two']]))
tuples
[('bar', 'one'),
('bar', 'two'),
('baz', 'one'),
('baz', 'two'),
('foo', 'one'),
('foo', 'two'),
('qux', 'one'),
('qux', 'two')]
zip() 函数将多个可迭代对象中对应位置的元素打包成一个个元组;在Python 2中直接返回元组列表,而在Python 3中返回惰性迭代器以节省内存。
index = pd.MultiIndex.from_tuples(tuples,names=['first','second'])
index
MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],
labels=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]],
names=['first', 'second'])
df = pd.DataFrame(np.random.randn(8,2),index=index,columns=['A','B'])
df
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
|
A |
B |
| first |
second |
|
|
| bar |
one |
-0.752364 |
0.982241 |
| two |
-0.626690 |
1.358099 |
| baz |
one |
0.342360 |
-0.618870 |
| two |
-0.399448 |
-0.500175 |
| foo |
one |
-0.746328 |
-0.244752 |
| two |
1.556458 |
-1.340098 |
| qux |
one |
-0.569393 |
-0.340625 |
| two |
0.513714 |
0.151477 |
stacked = df.stack()
stacked
first second
bar one A -0.752364
B 0.982241
two A -0.626690
B 1.358099
baz one A 0.342360
B -0.618870
two A -0.399448
B -0.500175
foo one A -0.746328
B -0.244752
two A 1.556458
B -1.340098
qux one A -0.569393
B -0.340625
two A 0.513714
B 0.151477
dtype: float64
stacked.unstack()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
|
A |
B |
| first |
second |
|
|
| bar |
one |
-0.752364 |
0.982241 |
| two |
-0.626690 |
1.358099 |
| baz |
one |
0.342360 |
-0.618870 |
| two |
-0.399448 |
-0.500175 |
| foo |
one |
-0.746328 |
-0.244752 |
| two |
1.556458 |
-1.340098 |
| qux |
one |
-0.569393 |
-0.340625 |
| two |
0.513714 |
0.151477 |
stacked.unstack(1)
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
second |
one |
two |
| first |
|
|
|
| bar |
A |
-0.752364 |
-0.626690 |
| B |
0.982241 |
1.358099 |
| baz |
A |
0.342360 |
-0.399448 |
| B |
-0.618870 |
-0.500175 |
| foo |
A |
-0.746328 |
1.556458 |
| B |
-0.244752 |
-1.340098 |
| qux |
A |
-0.569393 |
0.513714 |
| B |
-0.340625 |
0.151477 |
stacked.unstack(0)
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
first |
bar |
baz |
foo |
qux |
| second |
|
|
|
|
|
| one |
A |
-0.752364 |
0.342360 |
-0.746328 |
-0.569393 |
| B |
0.982241 |
-0.618870 |
-0.244752 |
-0.340625 |
| two |
A |
-0.626690 |
-0.399448 |
1.556458 |
0.513714 |
| B |
1.358099 |
-0.500175 |
-1.340098 |
0.151477 |
df = pd.DataFrame({'A' : ['one', 'one', 'two', 'three'] * 3,
.....: 'B' : ['A', 'B', 'C'] * 4,
.....: 'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
.....: 'D' : np.random.randn(12),
.....: 'E' : np.random.randn(12)})
df
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
E |
| 0 |
one |
A |
foo |
0.346892 |
-0.615216 |
| 1 |
one |
B |
foo |
0.808681 |
0.161598 |
| 2 |
two |
C |
foo |
-1.324783 |
-0.088082 |
| 3 |
three |
A |
bar |
-0.227795 |
-0.180022 |
| 4 |
one |
B |
bar |
1.130028 |
-0.361439 |
| 5 |
one |
C |
bar |
0.510629 |
-1.466063 |
| 6 |
two |
A |
foo |
0.379503 |
0.008279 |
| 7 |
three |
B |
foo |
0.921087 |
-0.148614 |
| 8 |
one |
C |
foo |
0.332222 |
-0.127428 |
| 9 |
one |
A |
bar |
-0.784876 |
-0.736117 |
| 10 |
two |
B |
bar |
-0.793957 |
1.705022 |
| 11 |
three |
C |
bar |
-0.898485 |
1.038166 |
pd.pivot_table(df,values='D',index=['A','B'],columns=['C'])
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
C |
bar |
foo |
| A |
B |
|
|
| one |
A |
-0.784876 |
0.346892 |
| B |
1.130028 |
0.808681 |
| C |
0.510629 |
0.332222 |
| three |
A |
-0.227795 |
NaN |
| B |
NaN |
0.921087 |
| C |
-0.898485 |
NaN |
| two |
A |
NaN |
0.379503 |
| B |
-0.793957 |
NaN |
| C |
NaN |
-1.324783 |
rng = pd.date_range('1/1/2012', periods=100, freq='S')
rng
DatetimeIndex(['2012-01-01 00:00:00', '2012-01-01 00:00:01',
'2012-01-01 00:00:02', '2012-01-01 00:00:03',
'2012-01-01 00:00:04', '2012-01-01 00:00:05',
'2012-01-01 00:00:06', '2012-01-01 00:00:07',
'2012-01-01 00:00:08', '2012-01-01 00:00:09',
'2012-01-01 00:00:10', '2012-01-01 00:00:11',
'2012-01-01 00:00:12', '2012-01-01 00:00:13',
'2012-01-01 00:00:14', '2012-01-01 00:00:15',
'2012-01-01 00:00:16', '2012-01-01 00:00:17',
'2012-01-01 00:00:18', '2012-01-01 00:00:19',
'2012-01-01 00:00:20', '2012-01-01 00:00:21',
'2012-01-01 00:00:22', '2012-01-01 00:00:23',
'2012-01-01 00:00:24', '2012-01-01 00:00:25',
'2012-01-01 00:00:26', '2012-01-01 00:00:27',
'2012-01-01 00:00:28', '2012-01-01 00:00:29',
'2012-01-01 00:00:30', '2012-01-01 00:00:31',
'2012-01-01 00:00:32', '2012-01-01 00:00:33',
'2012-01-01 00:00:34', '2012-01-01 00:00:35',
'2012-01-01 00:00:36', '2012-01-01 00:00:37',
'2012-01-01 00:00:38', '2012-01-01 00:00:39',
'2012-01-01 00:00:40', '2012-01-01 00:00:41',
'2012-01-01 00:00:42', '2012-01-01 00:00:43',
'2012-01-01 00:00:44', '2012-01-01 00:00:45',
'2012-01-01 00:00:46', '2012-01-01 00:00:47',
'2012-01-01 00:00:48', '2012-01-01 00:00:49',
'2012-01-01 00:00:50', '2012-01-01 00:00:51',
'2012-01-01 00:00:52', '2012-01-01 00:00:53',
'2012-01-01 00:00:54', '2012-01-01 00:00:55',
'2012-01-01 00:00:56', '2012-01-01 00:00:57',
'2012-01-01 00:00:58', '2012-01-01 00:00:59',
'2012-01-01 00:01:00', '2012-01-01 00:01:01',
'2012-01-01 00:01:02', '2012-01-01 00:01:03',
'2012-01-01 00:01:04', '2012-01-01 00:01:05',
'2012-01-01 00:01:06', '2012-01-01 00:01:07',
'2012-01-01 00:01:08', '2012-01-01 00:01:09',
'2012-01-01 00:01:10', '2012-01-01 00:01:11',
'2012-01-01 00:01:12', '2012-01-01 00:01:13',
'2012-01-01 00:01:14', '2012-01-01 00:01:15',
'2012-01-01 00:01:16', '2012-01-01 00:01:17',
'2012-01-01 00:01:18', '2012-01-01 00:01:19',
'2012-01-01 00:01:20', '2012-01-01 00:01:21',
'2012-01-01 00:01:22', '2012-01-01 00:01:23',
'2012-01-01 00:01:24', '2012-01-01 00:01:25',
'2012-01-01 00:01:26', '2012-01-01 00:01:27',
'2012-01-01 00:01:28', '2012-01-01 00:01:29',
'2012-01-01 00:01:30', '2012-01-01 00:01:31',
'2012-01-01 00:01:32', '2012-01-01 00:01:33',
'2012-01-01 00:01:34', '2012-01-01 00:01:35',
'2012-01-01 00:01:36', '2012-01-01 00:01:37',
'2012-01-01 00:01:38', '2012-01-01 00:01:39'],
dtype='datetime64[ns]', freq='S')
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
ts
2012-01-01 00:00:00 225
2012-01-01 00:00:01 354
2012-01-01 00:00:02 438
2012-01-01 00:00:03 440
2012-01-01 00:00:04 9
2012-01-01 00:00:05 179
2012-01-01 00:00:06 396
2012-01-01 00:00:07 200
2012-01-01 00:00:08 413
2012-01-01 00:00:09 490
2012-01-01 00:00:10 37
2012-01-01 00:00:11 57
2012-01-01 00:00:12 33
2012-01-01 00:00:13 388
2012-01-01 00:00:14 44
2012-01-01 00:00:15 95
2012-01-01 00:00:16 8
2012-01-01 00:00:17 1
2012-01-01 00:00:18 307
2012-01-01 00:00:19 332
2012-01-01 00:00:20 20
2012-01-01 00:00:21 84
2012-01-01 00:00:22 309
2012-01-01 00:00:23 308
2012-01-01 00:00:24 67
2012-01-01 00:00:25 245
2012-01-01 00:00:26 180
2012-01-01 00:00:27 9
2012-01-01 00:00:28 126
2012-01-01 00:00:29 232
...
2012-01-01 00:01:10 409
2012-01-01 00:01:11 355
2012-01-01 00:01:12 70
2012-01-01 00:01:13 266
2012-01-01 00:01:14 118
2012-01-01 00:01:15 325
2012-01-01 00:01:16 214
2012-01-01 00:01:17 3
2012-01-01 00:01:18 143
2012-01-01 00:01:19 28
2012-01-01 00:01:20 56
2012-01-01 00:01:21 120
2012-01-01 00:01:22 99
2012-01-01 00:01:23 102
2012-01-01 00:01:24 71
2012-01-01 00:01:25 464
2012-01-01 00:01:26 489
2012-01-01 00:01:27 404
2012-01-01 00:01:28 356
2012-01-01 00:01:29 197
2012-01-01 00:01:30 390
2012-01-01 00:01:31 345
2012-01-01 00:01:32 115
2012-01-01 00:01:33 377
2012-01-01 00:01:34 388
2012-01-01 00:01:35 39
2012-01-01 00:01:36 406
2012-01-01 00:01:37 408
2012-01-01 00:01:38 410
2012-01-01 00:01:39 256
Freq: S, Length: 100, dtype: int32
# Rebind `rng` to a 5-day daily range starting 2012-03-06.
# NOTE(review): this new range is not used by the timezone examples below,
# which still operate on the second-frequency `ts` built earlier.
rng = pd.date_range('3/6/2012 00:00', periods=5, freq='D')
rng
DatetimeIndex(['2012-03-06', '2012-03-07', '2012-03-08', '2012-03-09',
'2012-03-10'],
dtype='datetime64[ns]', freq='D')
# Attach a timezone to the naive timestamps: localize `ts` to UTC.
ts_utc = ts.tz_localize('UTC')
ts_utc
2012-01-01 00:00:00+00:00 225
2012-01-01 00:00:01+00:00 354
2012-01-01 00:00:02+00:00 438
2012-01-01 00:00:03+00:00 440
2012-01-01 00:00:04+00:00 9
2012-01-01 00:00:05+00:00 179
2012-01-01 00:00:06+00:00 396
2012-01-01 00:00:07+00:00 200
2012-01-01 00:00:08+00:00 413
2012-01-01 00:00:09+00:00 490
2012-01-01 00:00:10+00:00 37
2012-01-01 00:00:11+00:00 57
2012-01-01 00:00:12+00:00 33
2012-01-01 00:00:13+00:00 388
2012-01-01 00:00:14+00:00 44
2012-01-01 00:00:15+00:00 95
2012-01-01 00:00:16+00:00 8
2012-01-01 00:00:17+00:00 1
2012-01-01 00:00:18+00:00 307
2012-01-01 00:00:19+00:00 332
2012-01-01 00:00:20+00:00 20
2012-01-01 00:00:21+00:00 84
2012-01-01 00:00:22+00:00 309
2012-01-01 00:00:23+00:00 308
2012-01-01 00:00:24+00:00 67
2012-01-01 00:00:25+00:00 245
2012-01-01 00:00:26+00:00 180
2012-01-01 00:00:27+00:00 9
2012-01-01 00:00:28+00:00 126
2012-01-01 00:00:29+00:00 232
...
2012-01-01 00:01:10+00:00 409
2012-01-01 00:01:11+00:00 355
2012-01-01 00:01:12+00:00 70
2012-01-01 00:01:13+00:00 266
2012-01-01 00:01:14+00:00 118
2012-01-01 00:01:15+00:00 325
2012-01-01 00:01:16+00:00 214
2012-01-01 00:01:17+00:00 3
2012-01-01 00:01:18+00:00 143
2012-01-01 00:01:19+00:00 28
2012-01-01 00:01:20+00:00 56
2012-01-01 00:01:21+00:00 120
2012-01-01 00:01:22+00:00 99
2012-01-01 00:01:23+00:00 102
2012-01-01 00:01:24+00:00 71
2012-01-01 00:01:25+00:00 464
2012-01-01 00:01:26+00:00 489
2012-01-01 00:01:27+00:00 404
2012-01-01 00:01:28+00:00 356
2012-01-01 00:01:29+00:00 197
2012-01-01 00:01:30+00:00 390
2012-01-01 00:01:31+00:00 345
2012-01-01 00:01:32+00:00 115
2012-01-01 00:01:33+00:00 377
2012-01-01 00:01:34+00:00 388
2012-01-01 00:01:35+00:00 39
2012-01-01 00:01:36+00:00 406
2012-01-01 00:01:37+00:00 408
2012-01-01 00:01:38+00:00 410
2012-01-01 00:01:39+00:00 256
Freq: S, Length: 100, dtype: int32
ts_utc.tz_convert('US/Eastern')
2011-12-31 19:00:00-05:00 225
2011-12-31 19:00:01-05:00 354
2011-12-31 19:00:02-05:00 438
2011-12-31 19:00:03-05:00 440
2011-12-31 19:00:04-05:00 9
2011-12-31 19:00:05-05:00 179
2011-12-31 19:00:06-05:00 396
2011-12-31 19:00:07-05:00 200
2011-12-31 19:00:08-05:00 413
2011-12-31 19:00:09-05:00 490
2011-12-31 19:00:10-05:00 37
2011-12-31 19:00:11-05:00 57
2011-12-31 19:00:12-05:00 33
2011-12-31 19:00:13-05:00 388
2011-12-31 19:00:14-05:00 44
2011-12-31 19:00:15-05:00 95
2011-12-31 19:00:16-05:00 8
2011-12-31 19:00:17-05:00 1
2011-12-31 19:00:18-05:00 307
2011-12-31 19:00:19-05:00 332
2011-12-31 19:00:20-05:00 20
2011-12-31 19:00:21-05:00 84
2011-12-31 19:00:22-05:00 309
2011-12-31 19:00:23-05:00 308
2011-12-31 19:00:24-05:00 67
2011-12-31 19:00:25-05:00 245
2011-12-31 19:00:26-05:00 180
2011-12-31 19:00:27-05:00 9
2011-12-31 19:00:28-05:00 126
2011-12-31 19:00:29-05:00 232
...
2011-12-31 19:01:10-05:00 409
2011-12-31 19:01:11-05:00 355
2011-12-31 19:01:12-05:00 70
2011-12-31 19:01:13-05:00 266
2011-12-31 19:01:14-05:00 118
2011-12-31 19:01:15-05:00 325
2011-12-31 19:01:16-05:00 214
2011-12-31 19:01:17-05:00 3
2011-12-31 19:01:18-05:00 143
2011-12-31 19:01:19-05:00 28
2011-12-31 19:01:20-05:00 56
2011-12-31 19:01:21-05:00 120
2011-12-31 19:01:22-05:00 99
2011-12-31 19:01:23-05:00 102
2011-12-31 19:01:24-05:00 71
2011-12-31 19:01:25-05:00 464
2011-12-31 19:01:26-05:00 489
2011-12-31 19:01:27-05:00 404
2011-12-31 19:01:28-05:00 356
2011-12-31 19:01:29-05:00 197
2011-12-31 19:01:30-05:00 390
2011-12-31 19:01:31-05:00 345
2011-12-31 19:01:32-05:00 115
2011-12-31 19:01:33-05:00 377
2011-12-31 19:01:34-05:00 388
2011-12-31 19:01:35-05:00 39
2011-12-31 19:01:36-05:00 406
2011-12-31 19:01:37-05:00 408
2011-12-31 19:01:38-05:00 410
2011-12-31 19:01:39-05:00 256
Freq: S, Length: 100, dtype: int32
# Quarterly PeriodIndex from 1990Q1 through 2000Q4; freq 'Q-NOV' means
# quarters of a fiscal year ending in November.
prng = pd.period_range('1990Q1', '2000Q4', freq='Q-NOV')
prng
PeriodIndex(['1990Q1', '1990Q2', '1990Q3', '1990Q4', '1991Q1', '1991Q2',
'1991Q3', '1991Q4', '1992Q1', '1992Q2', '1992Q3', '1992Q4',
'1993Q1', '1993Q2', '1993Q3', '1993Q4', '1994Q1', '1994Q2',
'1994Q3', '1994Q4', '1995Q1', '1995Q2', '1995Q3', '1995Q4',
'1996Q1', '1996Q2', '1996Q3', '1996Q4', '1997Q1', '1997Q2',
'1997Q3', '1997Q4', '1998Q1', '1998Q2', '1998Q3', '1998Q4',
'1999Q1', '1999Q2', '1999Q3', '1999Q4', '2000Q1', '2000Q2',
'2000Q3', '2000Q4'],
dtype='period[Q-NOV]', freq='Q-NOV')
# Random-normal Series indexed by the quarterly periods.
ts = pd.Series(np.random.randn(len(prng)), prng)
ts
1990Q1 1.796304
1990Q2 0.659808
1990Q3 -0.647755
1990Q4 1.846486
1991Q1 0.488348
1991Q2 1.830351
1991Q3 -1.658804
1991Q4 0.585780
1992Q1 -0.596026
1992Q2 -1.900346
1992Q3 -0.066638
1992Q4 0.419037
1993Q1 0.055711
1993Q2 -2.103900
1993Q3 0.229944
1993Q4 0.317348
1994Q1 -0.776638
1994Q2 -0.241438
1994Q3 -0.587104
1994Q4 0.825772
1995Q1 2.444721
1995Q2 0.803142
1995Q3 0.494378
1995Q4 -0.984900
1996Q1 -0.431641
1996Q2 0.766768
1996Q3 -1.176313
1996Q4 0.339700
1997Q1 -1.523029
1997Q2 0.512173
1997Q3 1.359914
1997Q4 0.564407
1998Q1 0.354859
1998Q2 -0.493561
1998Q3 0.514986
1998Q4 -0.156142
1999Q1 1.047135
1999Q2 0.648944
1999Q3 -1.581937
1999Q4 0.261181
2000Q1 -0.809498
2000Q2 1.102175
2000Q3 0.424905
2000Q4 -0.775245
Freq: Q-NOV, dtype: float64
# Convert the quarterly index to timestamps: take the quarter-end month
# ('M', 'e'), step one month forward (+ 1), switch to hourly frequency at
# the start of that month ('H', 's'), then add 9 hours — i.e. 09:00 on the
# first day of the month after each quarter end (see the head() output).
ts.index = (prng.asfreq('M', 'e') + 1).asfreq('H', 's') + 9
ts.head()
1990-03-01 09:00 1.796304
1990-06-01 09:00 0.659808
1990-09-01 09:00 -0.647755
1990-12-01 09:00 1.846486
1991-03-01 09:00 0.488348
Freq: H, dtype: float64
# Small frame with a letter-grade column, used below to demonstrate
# pandas categorical dtype.
df = pd.DataFrame({"id":[1,2,3,4,5,6], "raw_grade":['a', 'b', 'b', 'a', 'a', 'e']})
df
   id raw_grade
0   1         a
1   2         b
2   3         b
3   4         a
4   5         a
5   6         e
# Convert the raw letter grades to categorical dtype; the three distinct
# letters (a, b, e) become the categories.
df["grade"] = df["raw_grade"].astype("category")
df["grade"]
0 a
1 b
2 b
3 a
4 a
5 e
Name: grade, dtype: category
Categories (3, object): [a, b, e]
# Rename the categories to descriptive names (a -> very good, b -> good,
# e -> very bad). Direct assignment to `Series.cat.categories` was
# deprecated and removed in pandas 2.0; `rename_categories` is the
# supported spelling and produces the same result shown below.
df["grade"] = df["grade"].cat.rename_categories(["very good", "good", "very bad"])
df["grade"]
0 very good
1 good
2 good
3 very good
4 very good
5 very bad
Name: grade, dtype: category
Categories (3, object): [very good, good, very bad]
# Reorder and extend the category set. The two unused categories
# ("bad", "medium") are kept and appear as zero-count groups later.
df["grade"] = df["grade"].cat.set_categories(["very bad", "bad", "medium", "good", "very good"])
df["grade"]
0 very good
1 good
2 good
3 very good
4 very good
5 very bad
Name: grade, dtype: category
Categories (5, object): [very bad, bad, medium, good, very good]
df.sort_values(by="grade")
   id raw_grade      grade
5   6         e   very bad
1   2         b       good
2   3         b       good
0   1         a  very good
3   4         a  very good
4   5         a  very good
df.groupby("grade").size()
grade
very bad 1
bad 0
medium 0
good 2
very good 3
dtype: int64
# 1000 daily random-normal observations starting 2000-01-01, for plotting.
ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000))
ts
2000-01-01 0.585074
2000-01-02 0.605786
2000-01-03 0.697632
2000-01-04 -0.783338
2000-01-05 1.150780
2000-01-06 -0.718491
2000-01-07 0.696745
2000-01-08 0.270574
2000-01-09 0.657496
2000-01-10 -2.613661
2000-01-11 -1.978929
2000-01-12 0.325563
2000-01-13 0.286470
2000-01-14 -0.315502
2000-01-15 0.487703
2000-01-16 -1.830420
2000-01-17 0.847074
2000-01-18 -2.363392
2000-01-19 0.139429
2000-01-20 -0.512045
2000-01-21 0.209301
2000-01-22 -0.202987
2000-01-23 -0.605512
2000-01-24 0.113967
2000-01-25 -0.546799
2000-01-26 1.758145
2000-01-27 0.299658
2000-01-28 -0.614838
2000-01-29 0.262877
2000-01-30 0.021676
...
2002-08-28 0.222110
2002-08-29 -1.846013
2002-08-30 -0.094660
2002-08-31 1.281895
2002-09-01 -1.072053
2002-09-02 0.503427
2002-09-03 -0.499512
2002-09-04 -1.080912
2002-09-05 -0.780288
2002-09-06 -0.537608
2002-09-07 -0.991904
2002-09-08 0.159327
2002-09-09 0.224638
2002-09-10 2.063388
2002-09-11 1.217366
2002-09-12 0.603689
2002-09-13 0.832689
2002-09-14 -1.788089
2002-09-15 -2.183370
2002-09-16 -0.759798
2002-09-17 -0.836241
2002-09-18 0.298536
2002-09-19 1.969939
2002-09-20 -0.688728
2002-09-21 -0.964116
2002-09-22 -1.279596
2002-09-23 0.357739
2002-09-24 1.253534
2002-09-25 -0.798673
2002-09-26 -1.023241
Freq: D, Length: 1000, dtype: float64
ts.cumsum()
2000-01-01 0.585074
2000-01-02 1.190860
2000-01-03 1.888493
2000-01-04 1.105155
2000-01-05 2.255935
2000-01-06 1.537445
2000-01-07 2.234190
2000-01-08 2.504764
2000-01-09 3.162260
2000-01-10 0.548599
2000-01-11 -1.430329
2000-01-12 -1.104767
2000-01-13 -0.818296
2000-01-14 -1.133798
2000-01-15 -0.646095
2000-01-16 -2.476516
2000-01-17 -1.629442
2000-01-18 -3.992834
2000-01-19 -3.853405
2000-01-20 -4.365450
2000-01-21 -4.156149
2000-01-22 -4.359136
2000-01-23 -4.964649
2000-01-24 -4.850682
2000-01-25 -5.397481
2000-01-26 -3.639336
2000-01-27 -3.339678
2000-01-28 -3.954516
2000-01-29 -3.691639
2000-01-30 -3.669963
...
2002-08-28 -12.290664
2002-08-29 -14.136676
2002-08-30 -14.231337
2002-08-31 -12.949442
2002-09-01 -14.021495
2002-09-02 -13.518068
2002-09-03 -14.017579
2002-09-04 -15.098492
2002-09-05 -15.878779
2002-09-06 -16.416387
2002-09-07 -17.408292
2002-09-08 -17.248965
2002-09-09 -17.024327
2002-09-10 -14.960940
2002-09-11 -13.743574
2002-09-12 -13.139885
2002-09-13 -12.307196
2002-09-14 -14.095285
2002-09-15 -16.278655
2002-09-16 -17.038453
2002-09-17 -17.874694
2002-09-18 -17.576157
2002-09-19 -15.606219
2002-09-20 -16.294946
2002-09-21 -17.259062
2002-09-22 -18.538658
2002-09-23 -18.180919
2002-09-24 -16.927385
2002-09-25 -17.726058
2002-09-26 -18.749299
Freq: D, Length: 1000, dtype: float64
ts.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x22f031729e8>

# Four columns of random normals sharing the date index of `ts`.
# (Removed the pasted IPython continuation prompt ".....: " that made the
# original two-line statement invalid Python.)
df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index,
                  columns=['A', 'B', 'C', 'D'])
df
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
| 2000-01-01 |
-0.434770 |
1.797170 |
0.354820 |
-0.166193 |
| 2000-01-02 |
0.420233 |
0.823251 |
0.468701 |
-0.582949 |
| 2000-01-03 |
0.674668 |
1.032230 |
1.134346 |
0.696656 |
| 2000-01-04 |
0.948684 |
0.188295 |
-0.604249 |
-0.062779 |
| 2000-01-05 |
-1.381247 |
-0.360335 |
-0.346491 |
1.072618 |
| 2000-01-06 |
0.492510 |
1.924341 |
0.522605 |
0.293788 |
| 2000-01-07 |
-1.129093 |
0.063874 |
0.099183 |
0.557496 |
| 2000-01-08 |
1.142263 |
-0.309192 |
1.140049 |
1.007656 |
| 2000-01-09 |
1.751273 |
-0.747153 |
0.795127 |
-0.480155 |
| 2000-01-10 |
-1.519661 |
-1.187734 |
0.417908 |
-0.675147 |
| 2000-01-11 |
-0.096192 |
1.095308 |
0.094648 |
1.485943 |
| 2000-01-12 |
0.109115 |
-0.213535 |
-0.927250 |
1.189941 |
| 2000-01-13 |
-0.787367 |
-0.919787 |
1.286709 |
0.894471 |
| 2000-01-14 |
-0.584850 |
0.794088 |
0.533716 |
-0.159539 |
| 2000-01-15 |
-1.352332 |
-0.880446 |
0.041934 |
0.002573 |
| 2000-01-16 |
0.317933 |
0.957925 |
0.813780 |
0.952499 |
| 2000-01-17 |
0.950317 |
0.162642 |
-0.018575 |
-0.940598 |
| 2000-01-18 |
-2.021125 |
1.592108 |
0.219355 |
-1.300103 |
| 2000-01-19 |
-0.673145 |
-1.852674 |
-0.492845 |
0.070786 |
| 2000-01-20 |
-0.562802 |
-0.504083 |
0.980132 |
-0.079636 |
| 2000-01-21 |
0.693927 |
0.276601 |
-0.502267 |
1.824789 |
| 2000-01-22 |
0.240543 |
-0.049004 |
0.051460 |
-1.093965 |
| 2000-01-23 |
0.159181 |
0.559377 |
0.353952 |
-1.750909 |
| 2000-01-24 |
-1.009695 |
-0.169914 |
2.214441 |
-1.301680 |
| 2000-01-25 |
0.741394 |
-0.206067 |
-1.250305 |
-2.021061 |
| 2000-01-26 |
-1.050527 |
-0.448726 |
0.744841 |
0.559876 |
| 2000-01-27 |
-0.268987 |
0.755171 |
-0.865320 |
-0.077159 |
| 2000-01-28 |
-1.445525 |
-0.443887 |
0.048399 |
0.295317 |
| 2000-01-29 |
-0.348641 |
-0.570866 |
0.446533 |
-0.745215 |
| 2000-01-30 |
-0.803883 |
0.719817 |
0.035095 |
-0.057671 |
| ... |
... |
... |
... |
... |
| 2002-08-28 |
-1.400857 |
-1.993967 |
-0.563839 |
-0.553431 |
| 2002-08-29 |
-0.860120 |
-0.252746 |
-0.585336 |
0.083630 |
| 2002-08-30 |
0.677218 |
0.113083 |
-0.507485 |
-1.247440 |
| 2002-08-31 |
1.901913 |
0.124469 |
-0.482948 |
0.093981 |
| 2002-09-01 |
1.728861 |
1.909778 |
-1.206848 |
-1.324399 |
| 2002-09-02 |
1.419153 |
-1.000495 |
-0.117854 |
-0.630926 |
| 2002-09-03 |
0.716920 |
-0.831795 |
2.443522 |
-0.247801 |
| 2002-09-04 |
-0.886588 |
-0.487240 |
0.476527 |
1.273604 |
| 2002-09-05 |
-2.361533 |
-0.074533 |
-1.095040 |
0.087406 |
| 2002-09-06 |
-1.225924 |
-0.444836 |
0.378192 |
-0.785585 |
| 2002-09-07 |
-1.064395 |
0.046003 |
0.148525 |
0.393557 |
| 2002-09-08 |
-0.294659 |
0.912430 |
-0.795767 |
0.064672 |
| 2002-09-09 |
0.276846 |
0.993007 |
-0.493192 |
0.673319 |
| 2002-09-10 |
1.676072 |
0.102106 |
-1.286082 |
-1.454404 |
| 2002-09-11 |
2.124521 |
0.069451 |
0.495054 |
0.148496 |
| 2002-09-12 |
0.821348 |
-0.880714 |
0.933978 |
1.869043 |
| 2002-09-13 |
-0.890738 |
-1.263920 |
0.128660 |
-0.282550 |
| 2002-09-14 |
-1.097484 |
0.652124 |
0.702043 |
-0.552927 |
| 2002-09-15 |
0.161343 |
0.157393 |
0.851718 |
-1.265120 |
| 2002-09-16 |
0.865516 |
-1.196734 |
-0.985248 |
-1.472387 |
| 2002-09-17 |
-0.539248 |
1.388908 |
-0.870515 |
-0.671165 |
| 2002-09-18 |
1.154511 |
0.879535 |
-0.249820 |
-0.393302 |
| 2002-09-19 |
1.237163 |
0.668046 |
0.917817 |
0.300664 |
| 2002-09-20 |
-0.187801 |
0.173142 |
-0.225307 |
2.142230 |
| 2002-09-21 |
0.517452 |
-0.547158 |
1.587477 |
-0.922776 |
| 2002-09-22 |
0.424784 |
0.696831 |
1.340258 |
1.252117 |
| 2002-09-23 |
-0.687751 |
-0.006990 |
-0.607220 |
0.709964 |
| 2002-09-24 |
-1.811347 |
0.200485 |
2.117700 |
-0.468944 |
| 2002-09-25 |
-0.431668 |
-0.385997 |
0.303936 |
0.817534 |
| 2002-09-26 |
0.678959 |
1.061957 |
1.252870 |
0.735550 |
1000 rows × 4 columns
# Per-column cumulative sums (four random walks), then plot all columns on
# one axes with the legend placed automatically.
df = df.cumsum()
plt.figure(); df.plot(); plt.legend(loc='best')
<matplotlib.legend.Legend at 0x22f052c62b0>
<Figure size 432x288 with 0 Axes>
