import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
s = pd.Series([1,3,5,np.nan,6,8])
s
0 1.0
1 3.0
2 5.0
3 NaN
4 6.0
5 8.0
dtype: float64
len(s)
6
s.describe()
count 5.000000
mean 4.600000
std 2.701851
min 1.000000
25% 3.000000
50% 5.000000
75% 6.000000
max 8.000000
dtype: float64
dates = pd.date_range('20190101',periods=6)
dates
DatetimeIndex(['2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04',
'2019-01-05', '2019-01-06'],
dtype='datetime64[ns]', freq='D')
df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=list('ABCD'))
df
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
| 2019-01-01 |
0.317575 |
0.330549 |
0.055375 |
-1.284453 |
| 2019-01-02 |
-0.272201 |
-0.770618 |
0.217658 |
1.349425 |
| 2019-01-03 |
0.842686 |
-0.973354 |
0.596166 |
0.215889 |
| 2019-01-04 |
0.252868 |
-0.778050 |
-0.324255 |
-1.378140 |
| 2019-01-05 |
0.281876 |
1.143140 |
-0.781625 |
-1.245069 |
| 2019-01-06 |
-0.468258 |
-1.682376 |
1.494058 |
1.262588 |
df.describe()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
| count |
6.000000 |
6.000000 |
6.000000 |
6.000000 |
| mean |
0.159091 |
-0.455118 |
0.209563 |
-0.179960 |
| std |
0.468160 |
1.014914 |
0.786212 |
1.293573 |
| min |
-0.468258 |
-1.682376 |
-0.781625 |
-1.378140 |
| 25% |
-0.140934 |
-0.924528 |
-0.229347 |
-1.274607 |
| 50% |
0.267372 |
-0.774334 |
0.136517 |
-0.514590 |
| 75% |
0.308650 |
0.055257 |
0.501539 |
1.000913 |
| max |
0.842686 |
1.143140 |
1.494058 |
1.349425 |
df.shape
(6, 4)
df2 = pd.DataFrame({'A':1.,
'B':pd.Timestamp('20190101'),
'C':pd.Series(1,index=list(range(4)),dtype='float32'),
'D':np.array([3]*4,dtype='int32'),
'E':pd.Categorical(["test","train","test","train"]),
'F':'foo'})
df2
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
E |
F |
| 0 |
1.0 |
2019-01-01 |
1.0 |
3 |
test |
foo |
| 1 |
1.0 |
2019-01-01 |
1.0 |
3 |
train |
foo |
| 2 |
1.0 |
2019-01-01 |
1.0 |
3 |
test |
foo |
| 3 |
1.0 |
2019-01-01 |
1.0 |
3 |
train |
foo |
df2.dtypes
A float64
B datetime64[ns]
C float32
D int32
E category
F object
dtype: object
df.head()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
| 2019-01-01 |
0.317575 |
0.330549 |
0.055375 |
-1.284453 |
| 2019-01-02 |
-0.272201 |
-0.770618 |
0.217658 |
1.349425 |
| 2019-01-03 |
0.842686 |
-0.973354 |
0.596166 |
0.215889 |
| 2019-01-04 |
0.252868 |
-0.778050 |
-0.324255 |
-1.378140 |
| 2019-01-05 |
0.281876 |
1.143140 |
-0.781625 |
-1.245069 |
df.tail()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
| 2019-01-02 |
-0.272201 |
-0.770618 |
0.217658 |
1.349425 |
| 2019-01-03 |
0.842686 |
-0.973354 |
0.596166 |
0.215889 |
| 2019-01-04 |
0.252868 |
-0.778050 |
-0.324255 |
-1.378140 |
| 2019-01-05 |
0.281876 |
1.143140 |
-0.781625 |
-1.245069 |
| 2019-01-06 |
-0.468258 |
-1.682376 |
1.494058 |
1.262588 |
df.index
DatetimeIndex(['2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04',
'2019-01-05', '2019-01-06'],
dtype='datetime64[ns]', freq='D')
df.columns
Index(['A', 'B', 'C', 'D'], dtype='object')
df.values
array([[ 0.31757454, 0.33054893, 0.05537508, -1.28445319],
[-0.27220143, -0.77061807, 0.21765843, 1.34942538],
[ 0.84268621, -0.97335385, 0.59616646, 0.21588867],
[ 0.25286828, -0.77804969, -0.32425479, -1.37813964],
[ 0.28187609, 1.14314031, -0.78162546, -1.24506887],
[-0.4682577 , -1.68237556, 1.49405812, 1.26258772]])
df.describe()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
| count |
6.000000 |
6.000000 |
6.000000 |
6.000000 |
| mean |
0.159091 |
-0.455118 |
0.209563 |
-0.179960 |
| std |
0.468160 |
1.014914 |
0.786212 |
1.293573 |
| min |
-0.468258 |
-1.682376 |
-0.781625 |
-1.378140 |
| 25% |
-0.140934 |
-0.924528 |
-0.229347 |
-1.274607 |
| 50% |
0.267372 |
-0.774334 |
0.136517 |
-0.514590 |
| 75% |
0.308650 |
0.055257 |
0.501539 |
1.000913 |
| max |
0.842686 |
1.143140 |
1.494058 |
1.349425 |
df2.describe(include='all')
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
E |
F |
| count |
4.0 |
4 |
4.0 |
4.0 |
4 |
4 |
| unique |
NaN |
1 |
NaN |
NaN |
2 |
1 |
| top |
NaN |
2019-01-01 00:00:00 |
NaN |
NaN |
train |
foo |
| freq |
NaN |
4 |
NaN |
NaN |
2 |
4 |
| first |
NaN |
2019-01-01 00:00:00 |
NaN |
NaN |
NaN |
NaN |
| last |
NaN |
2019-01-01 00:00:00 |
NaN |
NaN |
NaN |
NaN |
| mean |
1.0 |
NaN |
1.0 |
3.0 |
NaN |
NaN |
| std |
0.0 |
NaN |
0.0 |
0.0 |
NaN |
NaN |
| min |
1.0 |
NaN |
1.0 |
3.0 |
NaN |
NaN |
| 25% |
1.0 |
NaN |
1.0 |
3.0 |
NaN |
NaN |
| 50% |
1.0 |
NaN |
1.0 |
3.0 |
NaN |
NaN |
| 75% |
1.0 |
NaN |
1.0 |
3.0 |
NaN |
NaN |
| max |
1.0 |
NaN |
1.0 |
3.0 |
NaN |
NaN |
df.T
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
2019-01-01 00:00:00 |
2019-01-02 00:00:00 |
2019-01-03 00:00:00 |
2019-01-04 00:00:00 |
2019-01-05 00:00:00 |
2019-01-06 00:00:00 |
| A |
0.317575 |
-0.272201 |
0.842686 |
0.252868 |
0.281876 |
-0.468258 |
| B |
0.330549 |
-0.770618 |
-0.973354 |
-0.778050 |
1.143140 |
-1.682376 |
| C |
0.055375 |
0.217658 |
0.596166 |
-0.324255 |
-0.781625 |
1.494058 |
| D |
-1.284453 |
1.349425 |
0.215889 |
-1.378140 |
-1.245069 |
1.262588 |
df.sort_index(axis=1,ascending=True)
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
| 2019-01-01 |
0.317575 |
0.330549 |
0.055375 |
-1.284453 |
| 2019-01-02 |
-0.272201 |
-0.770618 |
0.217658 |
1.349425 |
| 2019-01-03 |
0.842686 |
-0.973354 |
0.596166 |
0.215889 |
| 2019-01-04 |
0.252868 |
-0.778050 |
-0.324255 |
-1.378140 |
| 2019-01-05 |
0.281876 |
1.143140 |
-0.781625 |
-1.245069 |
| 2019-01-06 |
-0.468258 |
-1.682376 |
1.494058 |
1.262588 |
df.sort_values(by='B')
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
| 2019-01-06 |
-0.468258 |
-1.682376 |
1.494058 |
1.262588 |
| 2019-01-03 |
0.842686 |
-0.973354 |
0.596166 |
0.215889 |
| 2019-01-04 |
0.252868 |
-0.778050 |
-0.324255 |
-1.378140 |
| 2019-01-02 |
-0.272201 |
-0.770618 |
0.217658 |
1.349425 |
| 2019-01-01 |
0.317575 |
0.330549 |
0.055375 |
-1.284453 |
| 2019-01-05 |
0.281876 |
1.143140 |
-0.781625 |
-1.245069 |
df['A']
2019-01-01 0.317575
2019-01-02 -0.272201
2019-01-03 0.842686
2019-01-04 0.252868
2019-01-05 0.281876
2019-01-06 -0.468258
Freq: D, Name: A, dtype: float64
df[0:3]
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
| 2019-01-01 |
0.317575 |
0.330549 |
0.055375 |
-1.284453 |
| 2019-01-02 |
-0.272201 |
-0.770618 |
0.217658 |
1.349425 |
| 2019-01-03 |
0.842686 |
-0.973354 |
0.596166 |
0.215889 |
df['20190101':'20190103']
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
| 2019-01-01 |
0.317575 |
0.330549 |
0.055375 |
-1.284453 |
| 2019-01-02 |
-0.272201 |
-0.770618 |
0.217658 |
1.349425 |
| 2019-01-03 |
0.842686 |
-0.973354 |
0.596166 |
0.215889 |
df.loc[dates[0]]
A 0.317575
B 0.330549
C 0.055375
D -1.284453
Name: 2019-01-01 00:00:00, dtype: float64
dates[0]
Timestamp('2019-01-01 00:00:00', freq='D')
df.loc[:,['A','B']]
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
| 2019-01-01 |
0.317575 |
0.330549 |
| 2019-01-02 |
-0.272201 |
-0.770618 |
| 2019-01-03 |
0.842686 |
-0.973354 |
| 2019-01-04 |
0.252868 |
-0.778050 |
| 2019-01-05 |
0.281876 |
1.143140 |
| 2019-01-06 |
-0.468258 |
-1.682376 |
df.loc['20190104',['A','B']].shape
(2,)
df.loc[dates[0],'A']
0.3175745430028141
df.at[dates[0],'A']
0.3175745430028141
df.at[dates[0],'A']
df.iat[1,2]
0.21765843113729494
df.iloc[3]
A 0.252868
B -0.778050
C -0.324255
D -1.378140
Name: 2019-01-04 00:00:00, dtype: float64
df.iloc[3,1]
-0.7780496857268889
df.iloc[[1,2,4],[0,2]]
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
C |
| 2019-01-02 |
-0.272201 |
0.217658 |
| 2019-01-03 |
0.842686 |
0.596166 |
| 2019-01-05 |
0.281876 |
-0.781625 |
df.iloc[1:3,:]
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
| 2019-01-02 |
-0.272201 |
-0.770618 |
0.217658 |
1.349425 |
| 2019-01-03 |
0.842686 |
-0.973354 |
0.596166 |
0.215889 |
df.iloc[1:3]
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
| 2019-01-02 |
-0.272201 |
-0.770618 |
0.217658 |
1.349425 |
| 2019-01-03 |
0.842686 |
-0.973354 |
0.596166 |
0.215889 |
df[df.A>0]
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
| 2019-01-01 |
0.317575 |
0.330549 |
0.055375 |
-1.284453 |
| 2019-01-03 |
0.842686 |
-0.973354 |
0.596166 |
0.215889 |
| 2019-01-04 |
0.252868 |
-0.778050 |
-0.324255 |
-1.378140 |
| 2019-01-05 |
0.281876 |
1.143140 |
-0.781625 |
-1.245069 |
df[df>0]
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
| 2019-01-01 |
0.317575 |
0.330549 |
0.055375 |
NaN |
| 2019-01-02 |
NaN |
NaN |
0.217658 |
1.349425 |
| 2019-01-03 |
0.842686 |
NaN |
0.596166 |
0.215889 |
| 2019-01-04 |
0.252868 |
NaN |
NaN |
NaN |
| 2019-01-05 |
0.281876 |
1.143140 |
NaN |
NaN |
| 2019-01-06 |
NaN |
NaN |
1.494058 |
1.262588 |
df3 = df.copy()
df3['E']=['one','two','three','four','three','one']
df3
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
E |
| 2019-01-01 |
0.317575 |
0.330549 |
0.055375 |
-1.284453 |
one |
| 2019-01-02 |
-0.272201 |
-0.770618 |
0.217658 |
1.349425 |
two |
| 2019-01-03 |
0.842686 |
-0.973354 |
0.596166 |
0.215889 |
three |
| 2019-01-04 |
0.252868 |
-0.778050 |
-0.324255 |
-1.378140 |
four |
| 2019-01-05 |
0.281876 |
1.143140 |
-0.781625 |
-1.245069 |
three |
| 2019-01-06 |
-0.468258 |
-1.682376 |
1.494058 |
1.262588 |
one |
df3[df3['E'].isin(['one','two'])]
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
E |
| 2019-01-01 |
0.317575 |
0.330549 |
0.055375 |
-1.284453 |
one |
| 2019-01-02 |
-0.272201 |
-0.770618 |
0.217658 |
1.349425 |
two |
| 2019-01-06 |
-0.468258 |
-1.682376 |
1.494058 |
1.262588 |
one |
s1= pd.Series([1,2,3,4,5,6],index=pd.date_range('20190102',periods=6))
s1
2019-01-02 1
2019-01-03 2
2019-01-04 3
2019-01-05 4
2019-01-06 5
2019-01-07 6
Freq: D, dtype: int64
df['F']=s1
df
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
F |
| 2019-01-01 |
0.317575 |
0.330549 |
0.055375 |
-1.284453 |
NaN |
| 2019-01-02 |
-0.272201 |
-0.770618 |
0.217658 |
1.349425 |
1.0 |
| 2019-01-03 |
0.842686 |
-0.973354 |
0.596166 |
0.215889 |
2.0 |
| 2019-01-04 |
0.252868 |
-0.778050 |
-0.324255 |
-1.378140 |
3.0 |
| 2019-01-05 |
0.281876 |
1.143140 |
-0.781625 |
-1.245069 |
4.0 |
| 2019-01-06 |
-0.468258 |
-1.682376 |
1.494058 |
1.262588 |
5.0 |
df.at[dates[0],'A']=0
df.iat[0,1]=0
df.loc[:,'D']=np.array([5]*len(df))
df
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
F |
| 2019-01-01 |
0.000000 |
0.000000 |
0.055375 |
5 |
NaN |
| 2019-01-02 |
-0.272201 |
-0.770618 |
0.217658 |
5 |
1.0 |
| 2019-01-03 |
0.842686 |
-0.973354 |
0.596166 |
5 |
2.0 |
| 2019-01-04 |
0.252868 |
-0.778050 |
-0.324255 |
5 |
3.0 |
| 2019-01-05 |
0.281876 |
1.143140 |
-0.781625 |
5 |
4.0 |
| 2019-01-06 |
-0.468258 |
-1.682376 |
1.494058 |
5 |
5.0 |
df4 = df.copy()
df4[df4>0]=-df4
df4
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
F |
| 2019-01-01 |
0.000000 |
0.000000 |
-0.055375 |
-5 |
NaN |
| 2019-01-02 |
-0.272201 |
-0.770618 |
-0.217658 |
-5 |
-1.0 |
| 2019-01-03 |
-0.842686 |
-0.973354 |
-0.596166 |
-5 |
-2.0 |
| 2019-01-04 |
-0.252868 |
-0.778050 |
-0.324255 |
-5 |
-3.0 |
| 2019-01-05 |
-0.281876 |
-1.143140 |
-0.781625 |
-5 |
-4.0 |
| 2019-01-06 |
-0.468258 |
-1.682376 |
-1.494058 |
-5 |
-5.0 |
df1 = df.reindex(index=dates[0:4],columns=list(df.columns)+['E'])
df1.loc[dates[0]:dates[1],'E']=1
df1
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
F |
E |
| 2019-01-01 |
0.000000 |
0.000000 |
0.055375 |
5 |
NaN |
1.0 |
| 2019-01-02 |
-0.272201 |
-0.770618 |
0.217658 |
5 |
1.0 |
1.0 |
| 2019-01-03 |
0.842686 |
-0.973354 |
0.596166 |
5 |
2.0 |
NaN |
| 2019-01-04 |
0.252868 |
-0.778050 |
-0.324255 |
5 |
3.0 |
NaN |
df1.dropna(how='any')
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
F |
E |
| 2019-01-02 |
-0.272201 |
-0.770618 |
0.217658 |
5 |
1.0 |
1.0 |
df1.fillna(value=5)
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
F |
E |
| 2019-01-01 |
0.000000 |
0.000000 |
0.055375 |
5 |
5.0 |
1.0 |
| 2019-01-02 |
-0.272201 |
-0.770618 |
0.217658 |
5 |
1.0 |
1.0 |
| 2019-01-03 |
0.842686 |
-0.973354 |
0.596166 |
5 |
2.0 |
5.0 |
| 2019-01-04 |
0.252868 |
-0.778050 |
-0.324255 |
5 |
3.0 |
5.0 |
pd.isna(df1)
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
F |
E |
| 2019-01-01 |
False |
False |
False |
False |
True |
False |
| 2019-01-02 |
False |
False |
False |
False |
False |
False |
| 2019-01-03 |
False |
False |
False |
False |
False |
True |
| 2019-01-04 |
False |
False |
False |
False |
False |
True |
df.mean()
A 0.106162
B -0.510209
C 0.209563
D 5.000000
F 3.000000
dtype: float64
df.mean(1)
2019-01-01 1.263844
2019-01-02 1.034968
2019-01-03 1.493100
2019-01-04 1.430113
2019-01-05 1.928678
2019-01-06 1.868685
Freq: D, dtype: float64
s = pd.Series([1,3,5,np.nan,6,8],index=dates).shift(2)
s
2019-01-01 NaN
2019-01-02 NaN
2019-01-03 1.0
2019-01-04 3.0
2019-01-05 5.0
2019-01-06 NaN
Freq: D, dtype: float64
df.sub(s,axis='index')
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
F |
| 2019-01-01 |
NaN |
NaN |
NaN |
NaN |
NaN |
| 2019-01-02 |
NaN |
NaN |
NaN |
NaN |
NaN |
| 2019-01-03 |
-0.157314 |
-1.973354 |
-0.403834 |
4.0 |
1.0 |
| 2019-01-04 |
-2.747132 |
-3.778050 |
-3.324255 |
2.0 |
0.0 |
| 2019-01-05 |
-4.718124 |
-3.856860 |
-5.781625 |
0.0 |
-1.0 |
| 2019-01-06 |
NaN |
NaN |
NaN |
NaN |
NaN |
df
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
F |
| 2019-01-01 |
0.000000 |
0.000000 |
0.055375 |
5 |
NaN |
| 2019-01-02 |
-0.272201 |
-0.770618 |
0.217658 |
5 |
1.0 |
| 2019-01-03 |
0.842686 |
-0.973354 |
0.596166 |
5 |
2.0 |
| 2019-01-04 |
0.252868 |
-0.778050 |
-0.324255 |
5 |
3.0 |
| 2019-01-05 |
0.281876 |
1.143140 |
-0.781625 |
5 |
4.0 |
| 2019-01-06 |
-0.468258 |
-1.682376 |
1.494058 |
5 |
5.0 |
df.apply(np.cumsum)
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
F |
| 2019-01-01 |
0.000000 |
0.000000 |
0.055375 |
5 |
NaN |
| 2019-01-02 |
-0.272201 |
-0.770618 |
0.273034 |
10 |
1.0 |
| 2019-01-03 |
0.570485 |
-1.743972 |
0.869200 |
15 |
3.0 |
| 2019-01-04 |
0.823353 |
-2.522022 |
0.544945 |
20 |
6.0 |
| 2019-01-05 |
1.105229 |
-1.378881 |
-0.236680 |
25 |
10.0 |
| 2019-01-06 |
0.636971 |
-3.061257 |
1.257378 |
30 |
15.0 |
df.apply(lambda x:x.max()-x.min())
A 1.310944
B 2.825516
C 2.275684
D 0.000000
F 4.000000
dtype: float64
df['F'].value_counts()
5.0 1
4.0 1
3.0 1
2.0 1
1.0 1
Name: F, dtype: int64
s = pd.Series(np.random.randint(0,7,size=10))
s
0 1
1 1
2 1
3 2
4 3
5 2
6 6
7 0
8 4
9 6
dtype: int32
s.dtype
dtype('int32')
s.shape
(10,)
s.value_counts()
1 3
6 2
2 2
4 1
3 1
0 1
dtype: int64
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
s.dtype
dtype('O')
s.str
<pandas.core.strings.StringMethods at 0x22f0313d160>
s.str.lower()
0 a
1 b
2 c
3 aaba
4 baca
5 NaN
6 caba
7 dog
8 cat
dtype: object
df = pd.DataFrame(np.random.randn(10,4))
df
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
0 |
1 |
2 |
3 |
| 0 |
-0.936960 |
-0.210650 |
1.887069 |
0.128005 |
| 1 |
0.657660 |
0.254801 |
-0.092190 |
-1.957160 |
| 2 |
-0.920141 |
-2.259964 |
0.083965 |
0.371865 |
| 3 |
-0.755403 |
0.633426 |
0.090949 |
-0.626964 |
| 4 |
0.170052 |
1.164537 |
-1.193706 |
1.391785 |
| 5 |
0.084560 |
-1.513814 |
0.069032 |
0.099851 |
| 6 |
-0.683663 |
0.031873 |
-0.062998 |
0.523253 |
| 7 |
-0.926594 |
0.125286 |
-1.894089 |
-0.449402 |
| 8 |
0.610722 |
0.329156 |
0.025149 |
-2.673445 |
| 9 |
0.336673 |
1.205792 |
-1.346179 |
0.214389 |
pieces = [df[:3],df[3:7],df[7:]]
pieces
[ 0 1 2 3
0 -0.936960 -0.210650 1.887069 0.128005
1 0.657660 0.254801 -0.092190 -1.957160
2 -0.920141 -2.259964 0.083965 0.371865,
0 1 2 3
3 -0.755403 0.633426 0.090949 -0.626964
4 0.170052 1.164537 -1.193706 1.391785
5 0.084560 -1.513814 0.069032 0.099851
6 -0.683663 0.031873 -0.062998 0.523253,
0 1 2 3
7 -0.926594 0.125286 -1.894089 -0.449402
8 0.610722 0.329156 0.025149 -2.673445
9 0.336673 1.205792 -1.346179 0.214389]
pd.concat(pieces)
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
0 |
1 |
2 |
3 |
| 0 |
-0.936960 |
-0.210650 |
1.887069 |
0.128005 |
| 1 |
0.657660 |
0.254801 |
-0.092190 |
-1.957160 |
| 2 |
-0.920141 |
-2.259964 |
0.083965 |
0.371865 |
| 3 |
-0.755403 |
0.633426 |
0.090949 |
-0.626964 |
| 4 |
0.170052 |
1.164537 |
-1.193706 |
1.391785 |
| 5 |
0.084560 |
-1.513814 |
0.069032 |
0.099851 |
| 6 |
-0.683663 |
0.031873 |
-0.062998 |
0.523253 |
| 7 |
-0.926594 |
0.125286 |
-1.894089 |
-0.449402 |
| 8 |
0.610722 |
0.329156 |
0.025149 |
-2.673445 |
| 9 |
0.336673 |
1.205792 |
-1.346179 |
0.214389 |
concat
- axis=1列合并;axis=0行合并(default)
- join='inner'只保留所有对象共有的索引/列;join='outer'返回合并后的所有索引/列(默认)
- ignore_index=True会自动连续生成索引值;ignore_index=False则不改变原有的索引值
- verify_integrity会检查当前合并是否重复索引
- keys会为拼接的各个对象生成分层(多级)索引,便于区分数据来源
- names会为keys生成的多级索引的各层级命名。
pd.concat(pieces,axis=1)
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
0 |
1 |
2 |
3 |
0 |
1 |
2 |
3 |
0 |
1 |
2 |
3 |
| 0 |
-0.936960 |
-0.210650 |
1.887069 |
0.128005 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
| 1 |
0.657660 |
0.254801 |
-0.092190 |
-1.957160 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
| 2 |
-0.920141 |
-2.259964 |
0.083965 |
0.371865 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
| 3 |
NaN |
NaN |
NaN |
NaN |
-0.755403 |
0.633426 |
0.090949 |
-0.626964 |
NaN |
NaN |
NaN |
NaN |
| 4 |
NaN |
NaN |
NaN |
NaN |
0.170052 |
1.164537 |
-1.193706 |
1.391785 |
NaN |
NaN |
NaN |
NaN |
| 5 |
NaN |
NaN |
NaN |
NaN |
0.084560 |
-1.513814 |
0.069032 |
0.099851 |
NaN |
NaN |
NaN |
NaN |
| 6 |
NaN |
NaN |
NaN |
NaN |
-0.683663 |
0.031873 |
-0.062998 |
0.523253 |
NaN |
NaN |
NaN |
NaN |
| 7 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
-0.926594 |
0.125286 |
-1.894089 |
-0.449402 |
| 8 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
0.610722 |
0.329156 |
0.025149 |
-2.673445 |
| 9 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
0.336673 |
1.205792 |
-1.346179 |
0.214389 |
pd.concat(pieces,join='outer',axis=1)
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
0 |
1 |
2 |
3 |
0 |
1 |
2 |
3 |
0 |
1 |
2 |
3 |
| 0 |
-0.936960 |
-0.210650 |
1.887069 |
0.128005 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
| 1 |
0.657660 |
0.254801 |
-0.092190 |
-1.957160 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
| 2 |
-0.920141 |
-2.259964 |
0.083965 |
0.371865 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
| 3 |
NaN |
NaN |
NaN |
NaN |
-0.755403 |
0.633426 |
0.090949 |
-0.626964 |
NaN |
NaN |
NaN |
NaN |
| 4 |
NaN |
NaN |
NaN |
NaN |
0.170052 |
1.164537 |
-1.193706 |
1.391785 |
NaN |
NaN |
NaN |
NaN |
| 5 |
NaN |
NaN |
NaN |
NaN |
0.084560 |
-1.513814 |
0.069032 |
0.099851 |
NaN |
NaN |
NaN |
NaN |
| 6 |
NaN |
NaN |
NaN |
NaN |
-0.683663 |
0.031873 |
-0.062998 |
0.523253 |
NaN |
NaN |
NaN |
NaN |
| 7 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
-0.926594 |
0.125286 |
-1.894089 |
-0.449402 |
| 8 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
0.610722 |
0.329156 |
0.025149 |
-2.673445 |
| 9 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
0.336673 |
1.205792 |
-1.346179 |
0.214389 |
left = pd.DataFrame({'key':['foo','bar'],'lval':[1,2]})
right = pd.DataFrame({'key':['foo','bar'],'rval':[4,5]})
left
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
right
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
pd.merge(left,right,on='key')
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
key |
lval |
rval |
| 0 |
foo |
1 |
4 |
| 1 |
bar |
2 |
5 |
left = pd.DataFrame({'key':['foo','foo'],'lval':[1,2]})
right = pd.DataFrame({'key':['foo','foo'],'rval':[4,5]})
left
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
right
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
pd.merge(left,right,on='key')
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
key |
lval |
rval |
| 0 |
foo |
1 |
4 |
| 1 |
foo |
1 |
5 |
| 2 |
foo |
2 |
4 |
| 3 |
foo |
2 |
5 |
df = pd.DataFrame(np.random.randn(8,4),columns=list('ABCD'))
df
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
| 0 |
0.862756 |
-1.436692 |
0.367396 |
1.033803 |
| 1 |
-0.732357 |
-0.350199 |
-0.116083 |
-2.435210 |
| 2 |
0.316582 |
0.468616 |
0.433001 |
-0.443120 |
| 3 |
-0.189939 |
-2.437137 |
0.126893 |
-2.273711 |
| 4 |
0.913514 |
-0.752727 |
-1.651140 |
1.156839 |
| 5 |
-0.314581 |
1.296585 |
0.579130 |
-0.871556 |
| 6 |
0.361473 |
0.687854 |
-1.044602 |
0.233138 |
| 7 |
0.045199 |
2.176608 |
-0.258569 |
-1.018576 |
s = df.iloc[3]
df.append(s,ignore_index=False)
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
| 0 |
0.862756 |
-1.436692 |
0.367396 |
1.033803 |
| 1 |
-0.732357 |
-0.350199 |
-0.116083 |
-2.435210 |
| 2 |
0.316582 |
0.468616 |
0.433001 |
-0.443120 |
| 3 |
-0.189939 |
-2.437137 |
0.126893 |
-2.273711 |
| 4 |
0.913514 |
-0.752727 |
-1.651140 |
1.156839 |
| 5 |
-0.314581 |
1.296585 |
0.579130 |
-0.871556 |
| 6 |
0.361473 |
0.687854 |
-1.044602 |
0.233138 |
| 7 |
0.045199 |
2.176608 |
-0.258569 |
-1.018576 |
| 3 |
-0.189939 |
-2.437137 |
0.126893 |
-2.273711 |
“分组依据”是指涉及以下一个或多个步骤的过程:
根据某些标准将数据分成组
独立地将函数应用于每个组
将结果组合成数据结构
df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
....: 'foo', 'bar', 'foo', 'foo'],
....: 'B' : ['one', 'one', 'two', 'three',
....: 'two', 'two', 'one', 'three'],
....: 'C' : np.random.randn(8),
....: 'D' : np.random.randn(8)})
df
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
| 0 |
foo |
one |
0.982988 |
-0.386029 |
| 1 |
bar |
one |
1.632482 |
-0.327520 |
| 2 |
foo |
two |
0.149619 |
-0.138297 |
| 3 |
bar |
three |
-1.480397 |
1.105690 |
| 4 |
foo |
two |
0.647044 |
-1.097276 |
| 5 |
bar |
two |
-0.675596 |
0.250176 |
| 6 |
foo |
one |
0.437309 |
1.031742 |
| 7 |
foo |
three |
0.434659 |
1.197695 |
df.groupby('A').sum()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
C |
D |
| A |
|
|
| bar |
-0.523512 |
1.028346 |
| foo |
2.651618 |
0.607834 |
df.groupby('A').max()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
B |
C |
D |
| A |
|
|
|
| bar |
two |
1.632482 |
1.105690 |
| foo |
two |
0.982988 |
1.197695 |
df.groupby(['A','B']).sum()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
|
C |
D |
| A |
B |
|
|
| bar |
one |
1.632482 |
-0.327520 |
| three |
-1.480397 |
1.105690 |
| two |
-0.675596 |
0.250176 |
| foo |
one |
1.420297 |
0.645713 |
| three |
0.434659 |
1.197695 |
| two |
0.796663 |
-1.235574 |
tuples = list(zip(*[['bar', 'bar', 'baz', 'baz',
....: 'foo', 'foo', 'qux', 'qux'],
....: ['one', 'two', 'one', 'two',
....: 'one', 'two', 'one', 'two']]))
tuples
[('bar', 'one'),
('bar', 'two'),
('baz', 'one'),
('baz', 'two'),
('foo', 'one'),
('foo', 'two'),
('qux', 'one'),
('qux', 'two')]
zip() 函数将多个可迭代对象中对应位置的元素打包成一个个元组;在Python 2中直接返回元组列表,而在Python 3中返回惰性迭代器以节省内存。
index = pd.MultiIndex.from_tuples(tuples,names=['first','second'])
index
MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],
labels=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]],
names=['first', 'second'])
df = pd.DataFrame(np.random.randn(8,2),index=index,columns=['A','B'])
df
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
|
A |
B |
| first |
second |
|
|
| bar |
one |
-0.752364 |
0.982241 |
| two |
-0.626690 |
1.358099 |
| baz |
one |
0.342360 |
-0.618870 |
| two |
-0.399448 |
-0.500175 |
| foo |
one |
-0.746328 |
-0.244752 |
| two |
1.556458 |
-1.340098 |
| qux |
one |
-0.569393 |
-0.340625 |
| two |
0.513714 |
0.151477 |
stacked = df.stack()
stacked
first second
bar one A -0.752364
B 0.982241
two A -0.626690
B 1.358099
baz one A 0.342360
B -0.618870
two A -0.399448
B -0.500175
foo one A -0.746328
B -0.244752
two A 1.556458
B -1.340098
qux one A -0.569393
B -0.340625
two A 0.513714
B 0.151477
dtype: float64
stacked.unstack()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
|
A |
B |
| first |
second |
|
|
| bar |
one |
-0.752364 |
0.982241 |
| two |
-0.626690 |
1.358099 |
| baz |
one |
0.342360 |
-0.618870 |
| two |
-0.399448 |
-0.500175 |
| foo |
one |
-0.746328 |
-0.244752 |
| two |
1.556458 |
-1.340098 |
| qux |
one |
-0.569393 |
-0.340625 |
| two |
0.513714 |
0.151477 |
stacked.unstack(1)
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
second |
one |
two |
| first |
|
|
|
| bar |
A |
-0.752364 |
-0.626690 |
| B |
0.982241 |
1.358099 |
| baz |
A |
0.342360 |
-0.399448 |
| B |
-0.618870 |
-0.500175 |
| foo |
A |
-0.746328 |
1.556458 |
| B |
-0.244752 |
-1.340098 |
| qux |
A |
-0.569393 |
0.513714 |
| B |
-0.340625 |
0.151477 |
stacked.unstack(0)
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
first |
bar |
baz |
foo |
qux |
| second |
|
|
|
|
|
| one |
A |
-0.752364 |
0.342360 |
-0.746328 |
-0.569393 |
| B |
0.982241 |
-0.618870 |
-0.244752 |
-0.340625 |
| two |
A |
-0.626690 |
-0.399448 |
1.556458 |
0.513714 |
| B |
1.358099 |
-0.500175 |
-1.340098 |
0.151477 |
df = pd.DataFrame({'A' : ['one', 'one', 'two', 'three'] * 3,
.....: 'B' : ['A', 'B', 'C'] * 4,
.....: 'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
.....: 'D' : np.random.randn(12),
.....: 'E' : np.random.randn(12)})
df
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
E |
| 0 |
one |
A |
foo |
0.346892 |
-0.615216 |
| 1 |
one |
B |
foo |
0.808681 |
0.161598 |
| 2 |
two |
C |
foo |
-1.324783 |
-0.088082 |
| 3 |
three |
A |
bar |
-0.227795 |
-0.180022 |
| 4 |
one |
B |
bar |
1.130028 |
-0.361439 |
| 5 |
one |
C |
bar |
0.510629 |
-1.466063 |
| 6 |
two |
A |
foo |
0.379503 |
0.008279 |
| 7 |
three |
B |
foo |
0.921087 |
-0.148614 |
| 8 |
one |
C |
foo |
0.332222 |
-0.127428 |
| 9 |
one |
A |
bar |
-0.784876 |
-0.736117 |
| 10 |
two |
B |
bar |
-0.793957 |
1.705022 |
| 11 |
three |
C |
bar |
-0.898485 |
1.038166 |
pd.pivot_table(df,values='D',index=['A','B'],columns=['C'])
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
C |
bar |
foo |
| A |
B |
|
|
| one |
A |
-0.784876 |
0.346892 |
| B |
1.130028 |
0.808681 |
| C |
0.510629 |
0.332222 |
| three |
A |
-0.227795 |
NaN |
| B |
NaN |
0.921087 |
| C |
-0.898485 |
NaN |
| two |
A |
NaN |
0.379503 |
| B |
-0.793957 |
NaN |
| C |
NaN |
-1.324783 |
rng = pd.date_range('1/1/2012', periods=100, freq='S')
rng
DatetimeIndex(['2012-01-01 00:00:00', '2012-01-01 00:00:01',
'2012-01-01 00:00:02', '2012-01-01 00:00:03',
'2012-01-01 00:00:04', '2012-01-01 00:00:05',
'2012-01-01 00:00:06', '2012-01-01 00:00:07',
'2012-01-01 00:00:08', '2012-01-01 00:00:09',
'2012-01-01 00:00:10', '2012-01-01 00:00:11',
'2012-01-01 00:00:12', '2012-01-01 00:00:13',
'2012-01-01 00:00:14', '2012-01-01 00:00:15',
'2012-01-01 00:00:16', '2012-01-01 00:00:17',
'2012-01-01 00:00:18', '2012-01-01 00:00:19',
'2012-01-01 00:00:20', '2012-01-01 00:00:21',
'2012-01-01 00:00:22', '2012-01-01 00:00:23',
'2012-01-01 00:00:24', '2012-01-01 00:00:25',
'2012-01-01 00:00:26', '2012-01-01 00:00:27',
'2012-01-01 00:00:28', '2012-01-01 00:00:29',
'2012-01-01 00:00:30', '2012-01-01 00:00:31',
'2012-01-01 00:00:32', '2012-01-01 00:00:33',
'2012-01-01 00:00:34', '2012-01-01 00:00:35',
'2012-01-01 00:00:36', '2012-01-01 00:00:37',
'2012-01-01 00:00:38', '2012-01-01 00:00:39',
'2012-01-01 00:00:40', '2012-01-01 00:00:41',
'2012-01-01 00:00:42', '2012-01-01 00:00:43',
'2012-01-01 00:00:44', '2012-01-01 00:00:45',
'2012-01-01 00:00:46', '2012-01-01 00:00:47',
'2012-01-01 00:00:48', '2012-01-01 00:00:49',
'2012-01-01 00:00:50', '2012-01-01 00:00:51',
'2012-01-01 00:00:52', '2012-01-01 00:00:53',
'2012-01-01 00:00:54', '2012-01-01 00:00:55',
'2012-01-01 00:00:56', '2012-01-01 00:00:57',
'2012-01-01 00:00:58', '2012-01-01 00:00:59',
'2012-01-01 00:01:00', '2012-01-01 00:01:01',
'2012-01-01 00:01:02', '2012-01-01 00:01:03',
'2012-01-01 00:01:04', '2012-01-01 00:01:05',
'2012-01-01 00:01:06', '2012-01-01 00:01:07',
'2012-01-01 00:01:08', '2012-01-01 00:01:09',
'2012-01-01 00:01:10', '2012-01-01 00:01:11',
'2012-01-01 00:01:12', '2012-01-01 00:01:13',
'2012-01-01 00:01:14', '2012-01-01 00:01:15',
'2012-01-01 00:01:16', '2012-01-01 00:01:17',
'2012-01-01 00:01:18', '2012-01-01 00:01:19',
'2012-01-01 00:01:20', '2012-01-01 00:01:21',
'2012-01-01 00:01:22', '2012-01-01 00:01:23',
'2012-01-01 00:01:24', '2012-01-01 00:01:25',
'2012-01-01 00:01:26', '2012-01-01 00:01:27',
'2012-01-01 00:01:28', '2012-01-01 00:01:29',
'2012-01-01 00:01:30', '2012-01-01 00:01:31',
'2012-01-01 00:01:32', '2012-01-01 00:01:33',
'2012-01-01 00:01:34', '2012-01-01 00:01:35',
'2012-01-01 00:01:36', '2012-01-01 00:01:37',
'2012-01-01 00:01:38', '2012-01-01 00:01:39'],
dtype='datetime64[ns]', freq='S')
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
ts
2012-01-01 00:00:00 225
2012-01-01 00:00:01 354
2012-01-01 00:00:02 438
2012-01-01 00:00:03 440
2012-01-01 00:00:04 9
2012-01-01 00:00:05 179
2012-01-01 00:00:06 396
2012-01-01 00:00:07 200
2012-01-01 00:00:08 413
2012-01-01 00:00:09 490
2012-01-01 00:00:10 37
2012-01-01 00:00:11 57
2012-01-01 00:00:12 33
2012-01-01 00:00:13 388
2012-01-01 00:00:14 44
2012-01-01 00:00:15 95
2012-01-01 00:00:16 8
2012-01-01 00:00:17 1
2012-01-01 00:00:18 307
2012-01-01 00:00:19 332
2012-01-01 00:00:20 20
2012-01-01 00:00:21 84
2012-01-01 00:00:22 309
2012-01-01 00:00:23 308
2012-01-01 00:00:24 67
2012-01-01 00:00:25 245
2012-01-01 00:00:26 180
2012-01-01 00:00:27 9
2012-01-01 00:00:28 126
2012-01-01 00:00:29 232
...
2012-01-01 00:01:10 409
2012-01-01 00:01:11 355
2012-01-01 00:01:12 70
2012-01-01 00:01:13 266
2012-01-01 00:01:14 118
2012-01-01 00:01:15 325
2012-01-01 00:01:16 214
2012-01-01 00:01:17 3
2012-01-01 00:01:18 143
2012-01-01 00:01:19 28
2012-01-01 00:01:20 56
2012-01-01 00:01:21 120
2012-01-01 00:01:22 99
2012-01-01 00:01:23 102
2012-01-01 00:01:24 71
2012-01-01 00:01:25 464
2012-01-01 00:01:26 489
2012-01-01 00:01:27 404
2012-01-01 00:01:28 356
2012-01-01 00:01:29 197
2012-01-01 00:01:30 390
2012-01-01 00:01:31 345
2012-01-01 00:01:32 115
2012-01-01 00:01:33 377
2012-01-01 00:01:34 388
2012-01-01 00:01:35 39
2012-01-01 00:01:36 406
2012-01-01 00:01:37 408
2012-01-01 00:01:38 410
2012-01-01 00:01:39 256
Freq: S, Length: 100, dtype: int32
# Rebind `rng` to a 5-day daily range starting 2012-03-06.
# NOTE(review): this new range is not used by the timezone examples below,
# which still operate on the second-frequency `ts` built earlier.
rng = pd.date_range('3/6/2012 00:00', periods=5, freq='D')
rng
DatetimeIndex(['2012-03-06', '2012-03-07', '2012-03-08', '2012-03-09',
'2012-03-10'],
dtype='datetime64[ns]', freq='D')
# Attach a timezone to the naive timestamps: localize `ts` to UTC.
ts_utc = ts.tz_localize('UTC')
ts_utc
2012-01-01 00:00:00+00:00 225
2012-01-01 00:00:01+00:00 354
2012-01-01 00:00:02+00:00 438
2012-01-01 00:00:03+00:00 440
2012-01-01 00:00:04+00:00 9
2012-01-01 00:00:05+00:00 179
2012-01-01 00:00:06+00:00 396
2012-01-01 00:00:07+00:00 200
2012-01-01 00:00:08+00:00 413
2012-01-01 00:00:09+00:00 490
2012-01-01 00:00:10+00:00 37
2012-01-01 00:00:11+00:00 57
2012-01-01 00:00:12+00:00 33
2012-01-01 00:00:13+00:00 388
2012-01-01 00:00:14+00:00 44
2012-01-01 00:00:15+00:00 95
2012-01-01 00:00:16+00:00 8
2012-01-01 00:00:17+00:00 1
2012-01-01 00:00:18+00:00 307
2012-01-01 00:00:19+00:00 332
2012-01-01 00:00:20+00:00 20
2012-01-01 00:00:21+00:00 84
2012-01-01 00:00:22+00:00 309
2012-01-01 00:00:23+00:00 308
2012-01-01 00:00:24+00:00 67
2012-01-01 00:00:25+00:00 245
2012-01-01 00:00:26+00:00 180
2012-01-01 00:00:27+00:00 9
2012-01-01 00:00:28+00:00 126
2012-01-01 00:00:29+00:00 232
...
2012-01-01 00:01:10+00:00 409
2012-01-01 00:01:11+00:00 355
2012-01-01 00:01:12+00:00 70
2012-01-01 00:01:13+00:00 266
2012-01-01 00:01:14+00:00 118
2012-01-01 00:01:15+00:00 325
2012-01-01 00:01:16+00:00 214
2012-01-01 00:01:17+00:00 3
2012-01-01 00:01:18+00:00 143
2012-01-01 00:01:19+00:00 28
2012-01-01 00:01:20+00:00 56
2012-01-01 00:01:21+00:00 120
2012-01-01 00:01:22+00:00 99
2012-01-01 00:01:23+00:00 102
2012-01-01 00:01:24+00:00 71
2012-01-01 00:01:25+00:00 464
2012-01-01 00:01:26+00:00 489
2012-01-01 00:01:27+00:00 404
2012-01-01 00:01:28+00:00 356
2012-01-01 00:01:29+00:00 197
2012-01-01 00:01:30+00:00 390
2012-01-01 00:01:31+00:00 345
2012-01-01 00:01:32+00:00 115
2012-01-01 00:01:33+00:00 377
2012-01-01 00:01:34+00:00 388
2012-01-01 00:01:35+00:00 39
2012-01-01 00:01:36+00:00 406
2012-01-01 00:01:37+00:00 408
2012-01-01 00:01:38+00:00 410
2012-01-01 00:01:39+00:00 256
Freq: S, Length: 100, dtype: int32
ts_utc.tz_convert('US/Eastern')
2011-12-31 19:00:00-05:00 225
2011-12-31 19:00:01-05:00 354
2011-12-31 19:00:02-05:00 438
2011-12-31 19:00:03-05:00 440
2011-12-31 19:00:04-05:00 9
2011-12-31 19:00:05-05:00 179
2011-12-31 19:00:06-05:00 396
2011-12-31 19:00:07-05:00 200
2011-12-31 19:00:08-05:00 413
2011-12-31 19:00:09-05:00 490
2011-12-31 19:00:10-05:00 37
2011-12-31 19:00:11-05:00 57
2011-12-31 19:00:12-05:00 33
2011-12-31 19:00:13-05:00 388
2011-12-31 19:00:14-05:00 44
2011-12-31 19:00:15-05:00 95
2011-12-31 19:00:16-05:00 8
2011-12-31 19:00:17-05:00 1
2011-12-31 19:00:18-05:00 307
2011-12-31 19:00:19-05:00 332
2011-12-31 19:00:20-05:00 20
2011-12-31 19:00:21-05:00 84
2011-12-31 19:00:22-05:00 309
2011-12-31 19:00:23-05:00 308
2011-12-31 19:00:24-05:00 67
2011-12-31 19:00:25-05:00 245
2011-12-31 19:00:26-05:00 180
2011-12-31 19:00:27-05:00 9
2011-12-31 19:00:28-05:00 126
2011-12-31 19:00:29-05:00 232
...
2011-12-31 19:01:10-05:00 409
2011-12-31 19:01:11-05:00 355
2011-12-31 19:01:12-05:00 70
2011-12-31 19:01:13-05:00 266
2011-12-31 19:01:14-05:00 118
2011-12-31 19:01:15-05:00 325
2011-12-31 19:01:16-05:00 214
2011-12-31 19:01:17-05:00 3
2011-12-31 19:01:18-05:00 143
2011-12-31 19:01:19-05:00 28
2011-12-31 19:01:20-05:00 56
2011-12-31 19:01:21-05:00 120
2011-12-31 19:01:22-05:00 99
2011-12-31 19:01:23-05:00 102
2011-12-31 19:01:24-05:00 71
2011-12-31 19:01:25-05:00 464
2011-12-31 19:01:26-05:00 489
2011-12-31 19:01:27-05:00 404
2011-12-31 19:01:28-05:00 356
2011-12-31 19:01:29-05:00 197
2011-12-31 19:01:30-05:00 390
2011-12-31 19:01:31-05:00 345
2011-12-31 19:01:32-05:00 115
2011-12-31 19:01:33-05:00 377
2011-12-31 19:01:34-05:00 388
2011-12-31 19:01:35-05:00 39
2011-12-31 19:01:36-05:00 406
2011-12-31 19:01:37-05:00 408
2011-12-31 19:01:38-05:00 410
2011-12-31 19:01:39-05:00 256
Freq: S, Length: 100, dtype: int32
# Quarterly PeriodIndex from 1990Q1 through 2000Q4; freq 'Q-NOV' means
# quarters of a fiscal year ending in November.
prng = pd.period_range('1990Q1', '2000Q4', freq='Q-NOV')
prng
PeriodIndex(['1990Q1', '1990Q2', '1990Q3', '1990Q4', '1991Q1', '1991Q2',
'1991Q3', '1991Q4', '1992Q1', '1992Q2', '1992Q3', '1992Q4',
'1993Q1', '1993Q2', '1993Q3', '1993Q4', '1994Q1', '1994Q2',
'1994Q3', '1994Q4', '1995Q1', '1995Q2', '1995Q3', '1995Q4',
'1996Q1', '1996Q2', '1996Q3', '1996Q4', '1997Q1', '1997Q2',
'1997Q3', '1997Q4', '1998Q1', '1998Q2', '1998Q3', '1998Q4',
'1999Q1', '1999Q2', '1999Q3', '1999Q4', '2000Q1', '2000Q2',
'2000Q3', '2000Q4'],
dtype='period[Q-NOV]', freq='Q-NOV')
# Random-normal Series indexed by the quarterly periods.
ts = pd.Series(np.random.randn(len(prng)), prng)
ts
1990Q1 1.796304
1990Q2 0.659808
1990Q3 -0.647755
1990Q4 1.846486
1991Q1 0.488348
1991Q2 1.830351
1991Q3 -1.658804
1991Q4 0.585780
1992Q1 -0.596026
1992Q2 -1.900346
1992Q3 -0.066638
1992Q4 0.419037
1993Q1 0.055711
1993Q2 -2.103900
1993Q3 0.229944
1993Q4 0.317348
1994Q1 -0.776638
1994Q2 -0.241438
1994Q3 -0.587104
1994Q4 0.825772
1995Q1 2.444721
1995Q2 0.803142
1995Q3 0.494378
1995Q4 -0.984900
1996Q1 -0.431641
1996Q2 0.766768
1996Q3 -1.176313
1996Q4 0.339700
1997Q1 -1.523029
1997Q2 0.512173
1997Q3 1.359914
1997Q4 0.564407
1998Q1 0.354859
1998Q2 -0.493561
1998Q3 0.514986
1998Q4 -0.156142
1999Q1 1.047135
1999Q2 0.648944
1999Q3 -1.581937
1999Q4 0.261181
2000Q1 -0.809498
2000Q2 1.102175
2000Q3 0.424905
2000Q4 -0.775245
Freq: Q-NOV, dtype: float64
# Convert the quarterly index to timestamps: take the quarter-end month
# ('M', 'e'), step one month forward (+ 1), switch to hourly frequency at
# the start of that month ('H', 's'), then add 9 hours — i.e. 09:00 on the
# first day of the month after each quarter end (see the head() output).
ts.index = (prng.asfreq('M', 'e') + 1).asfreq('H', 's') + 9
ts.head()
1990-03-01 09:00 1.796304
1990-06-01 09:00 0.659808
1990-09-01 09:00 -0.647755
1990-12-01 09:00 1.846486
1991-03-01 09:00 0.488348
Freq: H, dtype: float64
# Small frame with a letter-grade column, used below to demonstrate
# pandas categorical dtype.
df = pd.DataFrame({"id":[1,2,3,4,5,6], "raw_grade":['a', 'b', 'b', 'a', 'a', 'e']})
df
   id raw_grade
0   1         a
1   2         b
2   3         b
3   4         a
4   5         a
5   6         e
# Convert the raw letter grades to categorical dtype; the three distinct
# letters (a, b, e) become the categories.
df["grade"] = df["raw_grade"].astype("category")
df["grade"]
0 a
1 b
2 b
3 a
4 a
5 e
Name: grade, dtype: category
Categories (3, object): [a, b, e]
# Rename the categories to descriptive names (a -> very good, b -> good,
# e -> very bad). Direct assignment to `Series.cat.categories` was
# deprecated and removed in pandas 2.0; `rename_categories` is the
# supported spelling and produces the same result shown below.
df["grade"] = df["grade"].cat.rename_categories(["very good", "good", "very bad"])
df["grade"]
0 very good
1 good
2 good
3 very good
4 very good
5 very bad
Name: grade, dtype: category
Categories (3, object): [very good, good, very bad]
# Reorder and extend the category set. The two unused categories
# ("bad", "medium") are kept and appear as zero-count groups later.
df["grade"] = df["grade"].cat.set_categories(["very bad", "bad", "medium", "good", "very good"])
df["grade"]
0 very good
1 good
2 good
3 very good
4 very good
5 very bad
Name: grade, dtype: category
Categories (5, object): [very bad, bad, medium, good, very good]
df.sort_values(by="grade")
   id raw_grade      grade
5   6         e   very bad
1   2         b       good
2   3         b       good
0   1         a  very good
3   4         a  very good
4   5         a  very good
df.groupby("grade").size()
grade
very bad 1
bad 0
medium 0
good 2
very good 3
dtype: int64
# 1000 daily random-normal observations starting 2000-01-01, for plotting.
ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000))
ts
2000-01-01 0.585074
2000-01-02 0.605786
2000-01-03 0.697632
2000-01-04 -0.783338
2000-01-05 1.150780
2000-01-06 -0.718491
2000-01-07 0.696745
2000-01-08 0.270574
2000-01-09 0.657496
2000-01-10 -2.613661
2000-01-11 -1.978929
2000-01-12 0.325563
2000-01-13 0.286470
2000-01-14 -0.315502
2000-01-15 0.487703
2000-01-16 -1.830420
2000-01-17 0.847074
2000-01-18 -2.363392
2000-01-19 0.139429
2000-01-20 -0.512045
2000-01-21 0.209301
2000-01-22 -0.202987
2000-01-23 -0.605512
2000-01-24 0.113967
2000-01-25 -0.546799
2000-01-26 1.758145
2000-01-27 0.299658
2000-01-28 -0.614838
2000-01-29 0.262877
2000-01-30 0.021676
...
2002-08-28 0.222110
2002-08-29 -1.846013
2002-08-30 -0.094660
2002-08-31 1.281895
2002-09-01 -1.072053
2002-09-02 0.503427
2002-09-03 -0.499512
2002-09-04 -1.080912
2002-09-05 -0.780288
2002-09-06 -0.537608
2002-09-07 -0.991904
2002-09-08 0.159327
2002-09-09 0.224638
2002-09-10 2.063388
2002-09-11 1.217366
2002-09-12 0.603689
2002-09-13 0.832689
2002-09-14 -1.788089
2002-09-15 -2.183370
2002-09-16 -0.759798
2002-09-17 -0.836241
2002-09-18 0.298536
2002-09-19 1.969939
2002-09-20 -0.688728
2002-09-21 -0.964116
2002-09-22 -1.279596
2002-09-23 0.357739
2002-09-24 1.253534
2002-09-25 -0.798673
2002-09-26 -1.023241
Freq: D, Length: 1000, dtype: float64
ts.cumsum()
2000-01-01 0.585074
2000-01-02 1.190860
2000-01-03 1.888493
2000-01-04 1.105155
2000-01-05 2.255935
2000-01-06 1.537445
2000-01-07 2.234190
2000-01-08 2.504764
2000-01-09 3.162260
2000-01-10 0.548599
2000-01-11 -1.430329
2000-01-12 -1.104767
2000-01-13 -0.818296
2000-01-14 -1.133798
2000-01-15 -0.646095
2000-01-16 -2.476516
2000-01-17 -1.629442
2000-01-18 -3.992834
2000-01-19 -3.853405
2000-01-20 -4.365450
2000-01-21 -4.156149
2000-01-22 -4.359136
2000-01-23 -4.964649
2000-01-24 -4.850682
2000-01-25 -5.397481
2000-01-26 -3.639336
2000-01-27 -3.339678
2000-01-28 -3.954516
2000-01-29 -3.691639
2000-01-30 -3.669963
...
2002-08-28 -12.290664
2002-08-29 -14.136676
2002-08-30 -14.231337
2002-08-31 -12.949442
2002-09-01 -14.021495
2002-09-02 -13.518068
2002-09-03 -14.017579
2002-09-04 -15.098492
2002-09-05 -15.878779
2002-09-06 -16.416387
2002-09-07 -17.408292
2002-09-08 -17.248965
2002-09-09 -17.024327
2002-09-10 -14.960940
2002-09-11 -13.743574
2002-09-12 -13.139885
2002-09-13 -12.307196
2002-09-14 -14.095285
2002-09-15 -16.278655
2002-09-16 -17.038453
2002-09-17 -17.874694
2002-09-18 -17.576157
2002-09-19 -15.606219
2002-09-20 -16.294946
2002-09-21 -17.259062
2002-09-22 -18.538658
2002-09-23 -18.180919
2002-09-24 -16.927385
2002-09-25 -17.726058
2002-09-26 -18.749299
Freq: D, Length: 1000, dtype: float64
ts.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x22f031729e8>

# Four columns of random normals sharing the date index of `ts`.
# (Removed the pasted IPython continuation prompt ".....: " that made the
# original two-line statement invalid Python.)
df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index,
                  columns=['A', 'B', 'C', 'D'])
df
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
A |
B |
C |
D |
| 2000-01-01 |
-0.434770 |
1.797170 |
0.354820 |
-0.166193 |
| 2000-01-02 |
0.420233 |
0.823251 |
0.468701 |
-0.582949 |
| 2000-01-03 |
0.674668 |
1.032230 |
1.134346 |
0.696656 |
| 2000-01-04 |
0.948684 |
0.188295 |
-0.604249 |
-0.062779 |
| 2000-01-05 |
-1.381247 |
-0.360335 |
-0.346491 |
1.072618 |
| 2000-01-06 |
0.492510 |
1.924341 |
0.522605 |
0.293788 |
| 2000-01-07 |
-1.129093 |
0.063874 |
0.099183 |
0.557496 |
| 2000-01-08 |
1.142263 |
-0.309192 |
1.140049 |
1.007656 |
| 2000-01-09 |
1.751273 |
-0.747153 |
0.795127 |
-0.480155 |
| 2000-01-10 |
-1.519661 |
-1.187734 |
0.417908 |
-0.675147 |
| 2000-01-11 |
-0.096192 |
1.095308 |
0.094648 |
1.485943 |
| 2000-01-12 |
0.109115 |
-0.213535 |
-0.927250 |
1.189941 |
| 2000-01-13 |
-0.787367 |
-0.919787 |
1.286709 |
0.894471 |
| 2000-01-14 |
-0.584850 |
0.794088 |
0.533716 |
-0.159539 |
| 2000-01-15 |
-1.352332 |
-0.880446 |
0.041934 |
0.002573 |
| 2000-01-16 |
0.317933 |
0.957925 |
0.813780 |
0.952499 |
| 2000-01-17 |
0.950317 |
0.162642 |
-0.018575 |
-0.940598 |
| 2000-01-18 |
-2.021125 |
1.592108 |
0.219355 |
-1.300103 |
| 2000-01-19 |
-0.673145 |
-1.852674 |
-0.492845 |
0.070786 |
| 2000-01-20 |
-0.562802 |
-0.504083 |
0.980132 |
-0.079636 |
| 2000-01-21 |
0.693927 |
0.276601 |
-0.502267 |
1.824789 |
| 2000-01-22 |
0.240543 |
-0.049004 |
0.051460 |
-1.093965 |
| 2000-01-23 |
0.159181 |
0.559377 |
0.353952 |
-1.750909 |
| 2000-01-24 |
-1.009695 |
-0.169914 |
2.214441 |
-1.301680 |
| 2000-01-25 |
0.741394 |
-0.206067 |
-1.250305 |
-2.021061 |
| 2000-01-26 |
-1.050527 |
-0.448726 |
0.744841 |
0.559876 |
| 2000-01-27 |
-0.268987 |
0.755171 |
-0.865320 |
-0.077159 |
| 2000-01-28 |
-1.445525 |
-0.443887 |
0.048399 |
0.295317 |
| 2000-01-29 |
-0.348641 |
-0.570866 |
0.446533 |
-0.745215 |
| 2000-01-30 |
-0.803883 |
0.719817 |
0.035095 |
-0.057671 |
| ... |
... |
... |
... |
... |
| 2002-08-28 |
-1.400857 |
-1.993967 |
-0.563839 |
-0.553431 |
| 2002-08-29 |
-0.860120 |
-0.252746 |
-0.585336 |
0.083630 |
| 2002-08-30 |
0.677218 |
0.113083 |
-0.507485 |
-1.247440 |
| 2002-08-31 |
1.901913 |
0.124469 |
-0.482948 |
0.093981 |
| 2002-09-01 |
1.728861 |
1.909778 |
-1.206848 |
-1.324399 |
| 2002-09-02 |
1.419153 |
-1.000495 |
-0.117854 |
-0.630926 |
| 2002-09-03 |
0.716920 |
-0.831795 |
2.443522 |
-0.247801 |
| 2002-09-04 |
-0.886588 |
-0.487240 |
0.476527 |
1.273604 |
| 2002-09-05 |
-2.361533 |
-0.074533 |
-1.095040 |
0.087406 |
| 2002-09-06 |
-1.225924 |
-0.444836 |
0.378192 |
-0.785585 |
| 2002-09-07 |
-1.064395 |
0.046003 |
0.148525 |
0.393557 |
| 2002-09-08 |
-0.294659 |
0.912430 |
-0.795767 |
0.064672 |
| 2002-09-09 |
0.276846 |
0.993007 |
-0.493192 |
0.673319 |
| 2002-09-10 |
1.676072 |
0.102106 |
-1.286082 |
-1.454404 |
| 2002-09-11 |
2.124521 |
0.069451 |
0.495054 |
0.148496 |
| 2002-09-12 |
0.821348 |
-0.880714 |
0.933978 |
1.869043 |
| 2002-09-13 |
-0.890738 |
-1.263920 |
0.128660 |
-0.282550 |
| 2002-09-14 |
-1.097484 |
0.652124 |
0.702043 |
-0.552927 |
| 2002-09-15 |
0.161343 |
0.157393 |
0.851718 |
-1.265120 |
| 2002-09-16 |
0.865516 |
-1.196734 |
-0.985248 |
-1.472387 |
| 2002-09-17 |
-0.539248 |
1.388908 |
-0.870515 |
-0.671165 |
| 2002-09-18 |
1.154511 |
0.879535 |
-0.249820 |
-0.393302 |
| 2002-09-19 |
1.237163 |
0.668046 |
0.917817 |
0.300664 |
| 2002-09-20 |
-0.187801 |
0.173142 |
-0.225307 |
2.142230 |
| 2002-09-21 |
0.517452 |
-0.547158 |
1.587477 |
-0.922776 |
| 2002-09-22 |
0.424784 |
0.696831 |
1.340258 |
1.252117 |
| 2002-09-23 |
-0.687751 |
-0.006990 |
-0.607220 |
0.709964 |
| 2002-09-24 |
-1.811347 |
0.200485 |
2.117700 |
-0.468944 |
| 2002-09-25 |
-0.431668 |
-0.385997 |
0.303936 |
0.817534 |
| 2002-09-26 |
0.678959 |
1.061957 |
1.252870 |
0.735550 |
1000 rows × 4 columns
# Per-column cumulative sums (four random walks), then plot all columns on
# one axes with the legend placed automatically.
df = df.cumsum()
plt.figure(); df.plot(); plt.legend(loc='best')
<matplotlib.legend.Legend at 0x22f052c62b0>
<Figure size 432x288 with 0 Axes>
