下面给出该问题的另一种解决方案。这里我们遍历不同的 Code 值,并为每个 Price 查找合适的 New Price。与原来的方式相比,它在时间和内存上都应当更高效。还可以通过进一步优化和/或使用 numba 来继续提高效率。
import numpy as np
import pandas as pd
def get_all_new(pd_series, result):
    """Record the sorted distinct values of one group in ``result``.

    Args:
        pd_series: Series of values for a single group. Its ``name`` is
            used as the key under which the values are stored.
        result: Mutable mapping updated in place; after the call
            ``result[pd_series.name]`` holds an ascending NumPy array of
            the distinct values in ``pd_series``.

    Returns:
        None. The output is delivered through ``result``.
    """
    # Sorting once here is what lets the lookup side use binary search.
    result[pd_series.name] = np.sort(pd_series.unique())
def find_new_group(pd_series, sorted_arrays):
    """Look up the new price for every old price of one group.

    For each value in ``pd_series`` the result holds the smallest entry
    of ``sorted_arrays[pd_series.name]`` that is greater than or equal to
    it, or NaN when no such entry exists.

    Args:
        pd_series: Series of numeric prices for a single group; its
            ``name`` selects the candidate array.
        sorted_arrays: Mapping from group name to an ascending array of
            candidate new prices.

    Returns:
        A float Series with the same index and name as ``pd_series``.

    Raises:
        KeyError: If ``sorted_arrays`` has no entry for ``pd_series.name``.
    """
    candidates = np.asarray(sorted_arrays[pd_series.name])
    prices = pd_series.to_numpy()

    # One vectorized binary search for the whole group instead of a Python
    # call per element. The default side='left' means an exact match maps
    # to itself.
    positions = np.searchsorted(candidates, prices)

    # A position equal to candidates.size means "nothing large enough";
    # those rows stay NaN. Masking also keeps an empty candidate array safe.
    found = positions < candidates.size
    new_prices = np.full(prices.shape, np.nan)
    new_prices[found] = candidates[positions[found]]

    return pd.Series(new_prices, index=pd_series.index, name=pd_series.name)
def find_new(value, sorted_array):
    """Return the smallest element of ``sorted_array`` that is >= ``value``.

    Args:
        value: The value to look up.
        sorted_array: Ascending NumPy array of candidates.

    Returns:
        The matching element, or ``None`` when every candidate is smaller
        than ``value`` (including when ``sorted_array`` is empty).
    """
    # The default side='left' yields the first index whose element is
    # >= value, so an exact match returns that same element.
    pos = np.searchsorted(sorted_array, value)
    if pos < sorted_array.size:
        return sorted_array[pos]
    # NOTE(review): the original author left open whether this fallback
    # should be None or the input value itself; None is kept.
    return None
if __name__ == '__main__':
    # Demo: build two random price tables that share the codes 'X' and 'Y',
    # then attach to every row of df1 the matching price from df2.
    N1, N2, M1, M2 = 5, 5, 5, 5
    df1 = pd.DataFrame(
        {'Code': ['X'] * N1 + ['Y'] * N2,
         'Price': np.random.randint(1, 100, N1 + N2) / 10})
    df2 = pd.DataFrame(
        {'Code': ['X'] * M1 + ['Y'] * M2,
         'Price': np.random.randint(1, 100, M1 + M2) / 10})
    print(df1)
    print(df2)

    all_new = dict()
    # Collect all new prices for every Code. This relies on groupby-apply
    # handing each group over with its Code as the Series name, which
    # get_all_new uses as the dictionary key.
    df2.groupby('Code')['Price'].apply(lambda x: get_all_new(x, all_new))

    # Find the appropriate new price for every old price.
    # group_keys=False keeps the result on df1's original row index; on
    # pandas >= 2.0 the default would prepend 'Code' to the index and the
    # column assignment below would no longer align with df1.
    df1['New Price'] = df1.groupby('Code', group_keys=False)['Price'].apply(
        lambda x: find_new_group(x, all_new))
    print(df1)
输出:
Code Price
0 X 7.8
1 X 6.6
2 X 3.2
3 X 0.3
4 X 4.7
5 Y 0.5
6 Y 1.1
7 Y 8.9
8 Y 6.7
9 Y 0.5
Code Price
0 X 6.9
1 X 4.6
2 X 2.3
3 X 7.6
4 X 2.4
5 Y 0.8
6 Y 3.4
7 Y 0.4
8 Y 4.2
9 Y 9.6
Code Price New Price
0 X 7.8 NaN
1 X 6.6 6.9
2 X 3.2 4.6
3 X 0.3 2.3
4 X 4.7 6.9
5 Y 0.5 0.8
6 Y 1.1 3.4
7 Y 8.9 9.6
8 Y 6.7 9.6
9 Y 0.5 0.8
将 N1, N2, M1, M2 都设置为下列取值时,代码的计时结果:
100_000 - 518 ms ± 2.25 ms per loop (mean ± std. dev. of 7 runs, 1 loop each),
1_000_000 - 5.29 s ± 72.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each).