from io import StringIO

import numpy as np
import pandas as pd
# Demo: restricting which columns and rows read_csv loads.
data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3'
# Baseline: load every column and every row.
d = pd.read_csv(StringIO(data))
# usecols filters columns: only the selected columns are parsed, which can
# greatly speed up parsing and reduce memory usage.
d = pd.read_csv(StringIO(data), usecols=lambda x: x.upper() in ['COL1', 'COL3'])
# skiprows filters rows: the callable receives the 0-based line index and
# returns True for lines to skip, so here every odd-numbered line is skipped.
d = pd.read_csv(StringIO(data), skiprows=lambda x: x % 2 != 0)
# Demo: specifying column data types.
# Note the last data row is one field short (three values for four columns).
data = 'a,b,c,d\n1,2,3,4\n5,6,7,8\n9,10,11'
# A single dtype applies to every column.
df = pd.read_csv(StringIO(data), dtype=object)
print(df)
# A dict sets the dtype per column; column 'a' is not listed, so it is inferred.
df = pd.read_csv(StringIO(data), dtype={'b': object, 'c': np.float64, 'd': 'string'})
# Deliberately not named `type`: rebinding the builtin made the later
# `.apply(type)` calls in this script fail with "'Series' object is not callable".
col_dtypes = df.dtypes
print(col_dtypes)
# Demo: converters transform column values while the data is being loaded.
data = "col_1\n1\n2\n'A'\n4.22"
# Every value of col_1 is passed through `str`, so the column holds strings.
df = pd.read_csv(StringIO(data), converters={'col_1': str})
print(df)
# Count the Python type of each element in the column.
r = df['col_1'].apply(type).value_counts()
print(r)
# Alternative: load without converters, then force a numeric dtype.
# errors='coerce' turns values that cannot be parsed (here "'A'") into NaN.
df2 = pd.read_csv(StringIO(data))
df2['col_1'] = pd.to_numeric(df2['col_1'], errors='coerce')
# Print the result instead of discarding it; a bare expression statement
# shows nothing when this file is run as a script.
coerced_type_counts = df2['col_1'].apply(type).value_counts()
print(coerced_type_counts)
# Demo: a column with mixed types (the data has to reach a certain size
# before the effect shows up).
# One million integers with two strings in the middle.
col_1 = list(range(500000)) + ['a', 'b'] + list(range(500000))
df = pd.DataFrame({'col_1': col_1})
# Round-trip through a CSV file in the current working directory.
df.to_csv('foo.csv')
# NOTE(review): pandas is expected to emit a DtypeWarning here because the
# column mixes integers and strings - confirm against the pandas IO guide.
mixed_df = pd.read_csv('foo.csv')
# Show which Python types ended up in the column and the resulting dtype.
print(mixed_df['col_1'].apply(type).value_counts())
print(mixed_df['col_1'].dtype)