Pandas库常用函数和操作

1. DataFrame 处理缺失值 dropna()

df2.dropna(axis=0, how='any', subset=[u'ToC'], inplace=True)

把在ToC列有缺失值的行去掉

补充：还可以用df.fillna()来把缺失值替换为某个特殊标记

df = df.fillna("missing")  # 用字符串替代
df = df.fillna(df.mean())  # 用均值或者其它描述性统计值替代

2. 根据某维度计算重复的行 duplicated()、value_counts()

print df.duplicated(['name']).value_counts()  # 如果不指定列，默认会判断所有列
"""
输出:
False    11118
True       664
表示有664行是重复的
"""

duplicated()方法返回一个布尔型的Series，显示各行是否为重复行，非重复行显示为False，重复行显示为True

value_counts()方法统计数组或序列所有元素出现次数，对某一列统计可以直接用df.column_name.value_counts()

3. 去重 drop_duplicates()

df.drop_duplicates(['name'], keep='last', inplace=True)
"""
keep : {‘first’, ‘last’, False}, default ‘first’
    first : Drop duplicates except for the first occurrence.
    last : Drop duplicates except for the last occurrence.
    False : Drop all duplicates.
"""

4. 拼接

(1) 拼接列 merge()

result = pd.merge(left, right, on='name', how='inner')
"""
其它参数：
pd.merge(left, right, how='inner', on=None, left_on=None, right_on=None, left_index=False, right_index=False, sort=False, suffixes=('_x', '_y'), copy=True, indicator=False)

Examples
>>> A              >>> B
    lkey value         rkey value
0   foo  1         0   foo  5
1   bar  2         1   bar  6
2   baz  3         2   qux  7
3   foo  4         3   bar  8

>>> A.merge(B, left_on='lkey', right_on='rkey', how='outer')
   lkey  value_x  rkey  value_y
0  foo   1        foo   5
1  foo   4        foo   5
2  bar   2        bar   6
3  bar   2        bar   8
4  baz   3        NaN   NaN
5  NaN   NaN      qux   7
"""

其它参考：Merge, join, and concatenate

(2) 拼接行

def concat_by_row(data_dir, fout):
    dfs = []
    for filename in os.listdir(data_dir):
        dfs.append(pd.read_excel(os.path.join(data_dir, filename)))
        print(dfs[-1].shape, filename)
    df = pd.concat(dfs, axis=0, ignore_index=True)  # axis=0按行拼接；axis=1按列拼接
    print(df.shape)
    df.to_excel(fout, index=False)

5. 找出在某一特定维度为空值的所有行

bool_arr = df.name.notnull()
print bool_arr.value_counts()
for idx, value in bool_arr.iteritems():
    if not value:
        print '\n', idx, value
        print df.iloc[idx]

6. 指定dataframe的维度及顺序；保存数据csv文件

res = {'name':[], 'buss':[], 'label':[]}
with codecs.open(fname, encoding='utf8') as fr:
    for idx, line in enumerate(fr):
        item = json.loads(line)
        res['name'].append(item['name'])        
        res['buss'].append(item['buss']) 
        res['label'].append(item['label'])
df = pd.DataFrame(res, columns=['name', 'buss', 'label'])
df.to_csv('data/xxx.csv', index=False, encoding='utf-8')

7. 保存到文件

7.1 读写excel/csv格式文件

import pandas as pd

def dataframe_read_and_write(fin, fout):  # 读取fin文件，添加一列"新应答"
    df = pd.read_excel(fin)
    # df = read_csv(fin, encoding='utf-8')
    print df.head()

    fields = [u"序号", u"问题描述", u"原始应答", u"新应答"]
    data_out = defaultdict(list)

    for idx, row in df.iterrows():
        try:
            row = row.to_dict()
            new_answer = "xxxxxx"
            for field in fields[:-1]:
                data_out[field].append(row[field])
            data_out[fields[-1]].append(new_answer)
        except Exception as error:
            print "Error line", idx, error

    df_out = pd.DataFrame(data_out, columns=fields)
    df_out.to_excel(fout, sheet_name="Sheet1", index=False, header=True)
    # df_out.to_csv(fout, index=False, encoding="utf-8")


if __name__ == '__main__':
    dataframe_read_and_write(fin="data/tmp.xlsx", fout="data/tmp_out.xlsx")

7.2 将多张DataFrame表写入到同一个excel文件的不同sheet中

import pandas as pd
writer = pd.ExcelWriter('foo.xlsx')
df.to_excel(writer, 'Data 0')
df.to_excel(writer, 'Data 1')
writer.save()

7.3 向一个已经存在的excel文件中写入一张新sheet；如果文件不存在则创建一个新文件再写入

import pandas
from openpyxl import load_workbook


def add_new_sheet(df, fout, sheet_name='Sheet1', columns=None):
    if fout and os.path.exists(fout):
        book = load_workbook(fout)
        writer = pd.ExcelWriter(fout, engine='openpyxl')
        writer.book = book
        writer.sheets = dict((ws.title, ws) for ws in book.worksheets)
    else:
        writer = pd.ExcelWriter(fout)
    df.to_excel(writer, sheet_name=sheet_name, columns=columns, index=False)
    writer.save()
    
    
add_new_sheet(df, fout='Masterfile.xlsx', sheet_name="Main", columns=['Diff1', 'Diff2'])

参考：官方解决方案https://github.com/pandas-dev/pandas/issues/3441

7.4 读取excel文本中的多个sheet

import xlrd

workbook = xlrd.open_workbook(fin)
for sheet in workbook.sheets():
    df = pd.read_excel(fin, sheet_name=sheet.name, index_col=None)

8. 排序

def sort_dataframe(df, fields_to_sort, fout=None):
    df = df.sort_values(by=fields_to_sort, ascending=True)
    if fout:
        df.to_excel(fout, index=False)
    return df

df = pd.read_excel(data_file)
sort_dataframe(df, fields_to_sort=["column_A", "column_B"], fout=data_file)

df = pd.read_excel(data_file)  # note: index改变，需要从文件重新读取，才会是有序的，后面遍历df的时候才不会出问题
print(df.head(10))

9. 轴标签重命名 df.rename()（列重命名、行index重命名）

>>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
>>> df.rename(index=str, columns={"A": "a", "B": "c"})
   a  c
0  1  4
1  2  5
2  3  6

>>> df.rename(index=str, columns={"A": "a", "C": "c"})
   a  B
0  1  4
1  2  5
2  3  6

Using axis-style parameters

>>> df.rename(str.lower, axis='columns')
   a  b
0  1  4
1  2  5
2  3  6

>>> df.rename({1: 2, 2: 4}, axis='index')
   A  B
0  1  4
2  2  5
4  3  6

参数说明：

Signature: df.rename(mapper=None, index=None, columns=None, axis=None, copy=True, inplace=False, level=None)
Docstring:
Alter axes labels.

Function / dict values must be unique (1-to-1). Labels not contained in
a dict / Series will be left as-is. Extra labels listed don't throw an
error.

See the :ref:`user guide <basics.rename>` for more.

Parameters
----------
mapper, index, columns : dict-like or function, optional
    dict-like or functions transformations to apply to
    that axis' values. Use either ``mapper`` and ``axis`` to
    specify the axis to target with ``mapper``, or ``index`` and
    ``columns``.
axis : int or str, optional
    Axis to target with ``mapper``. Can be either the axis name
    ('index', 'columns') or number (0, 1). The default is 'index'.
copy : boolean, default True
    Also copy underlying data
inplace : boolean, default False
    Whether to return a new DataFrame. If True then value of copy is
    ignored.
level : int or level name, default None
    In case of a MultiIndex, only rename labels in the specified
    level.

Returns
-------
renamed : DataFrame

See Also
--------
pandas.DataFrame.rename_axis

df.rename()参数说明