对于文件中的每个数据框,您似乎都
- 按
id、title 列分组数据
- 现在,对每组
a 列中的数据求和
没有必要为任务创建完整矩阵,partial 步骤也是如此。
我不确定,一个文件中存在多少个 id、title 的独特组合,或者全部存在。一个安全的步骤是批量处理文件,保存结果,然后合并所有结果
看起来像,
import pandas as pd
import numpy as np
import string
def gen_random_data(N, M):
# N = 100
# M = 10
titles = np.apply_along_axis(lambda x: ''.join(x), 1, np.random.choice(list(string.ascii_lowercase), 3*M).reshape(-1, 3))
titles = np.random.choice(titles, N)
_id = np.random.choice(np.arange(M) + 1, N)
val = np.random.randint(M, size=(N,))
df = pd.DataFrame(np.vstack((_id, titles, val)).T, columns=['id', 'title', 'a'])
df = df.astype({'id': np.int64, 'title': str, 'a': np.int64})
return df
def combine_results(grplist):
# stitch into one dataframe
comb_df = pd.concat(dflist, axis=1)
# Sum over common axes i.e. id, titles
comb_df = comb_df.apply(lambda row: np.nansum(row), axis=1)
# Return a data frame with sum of a's
return comb_df.to_frame('sum_of_a')
totalfiles = 10
batch = 2
filelist = []
for counter,nfiles in enumerate(range(0, totalfiles, batch)):
# Read data from files. generate random data
dflist = [gen_random_data(100, 2) for _ in range(nfiles)]
# Process the data in memory
dflist = [_.groupby(['id', 'title']).agg(['sum']) for _ in dflist]
collection = combine_results(dflist)
# write intermediate results to file and repeat the process for the rest of the files
intermediate_result_file_name = f'resfile_{counter}'
collection.to_parquet(intermediate_result_file_name, index=True)
filelist.append(intermediate_result_file_name)
# Combining result files.
collection = [pd.read_parquet(file) for file in filelist]
totalresult = combine_results(collection)