由于您的标签取自一个较小的整数范围,使用下面的 np.bincount (pp) 应该可以获得相当可观的加速。或者,您也可以通过创建布尔掩码 (p2) 来加快查找。这种做法——和您的原始代码一样——允许把 np.sum 换成 math.fsum,从而保证结果精确到机器精度 (p3)。另外,我们还可以用 Pythran 编译它,再获得约 40% 的加速 (p4)。
在我的机器上,numba 方案 (mx) 的速度与 pp 差不多,不过也许是我的用法不对。
import numpy as np
import math
from subsum import pflat
MAXIND = 120_000
def OP():
    """Baseline: sum the entries of ``C`` whose label in ``A`` occurs in ``b``.

    Reads the module-level arrays ``A``, ``C`` and ``b``.  This is the
    reference implementation the other variants are compared against.
    """
    selected = np.isin(A, b)
    # The builtin ``sum`` (not ``np.sum``) is part of the baseline being
    # measured, so it is kept as-is.
    return sum(C[selected])
def pp():
    """Sum the selected entries of ``C`` via a weighted ``np.bincount``.

    Reads the module-level ``A``, ``C``, ``b`` and ``MAXIND``.  The weighted
    bincount yields one total per label; the totals of the labels listed in
    ``b`` are then added up.
    """
    labels = A.reshape(-1)
    weights = C.reshape(-1)
    per_label_total = np.bincount(labels, weights=weights, minlength=MAXIND)
    # ``np.unique`` drops repeated labels in ``b`` so that no per-label total
    # is counted more than once.
    return per_label_total[np.unique(b)].sum()
def p2():
    """Sum the selected entries of ``C`` using a boolean lookup table.

    Reads the module-level ``A``, ``C``, ``b`` and ``MAXIND``.  A mask of
    length ``MAXIND`` flags every label present in ``b``; indexing it with
    ``A`` gives a boolean array shaped like ``A`` that selects from ``C``.
    """
    wanted = np.zeros(MAXIND, dtype=bool)
    wanted[b] = True
    hits = wanted[A]
    return C[hits].sum()
def p3():
    """Like the boolean-mask lookup, but add the values with ``math.fsum``.

    Reads the module-level ``A``, ``C``, ``b`` and ``MAXIND``.  ``math.fsum``
    is used instead of an array ``.sum()`` to avoid accumulating rounding
    error in the total.
    """
    is_wanted = np.zeros(MAXIND, dtype=bool)
    is_wanted[b] = True
    selected_values = C[is_wanted[A]]
    return math.fsum(selected_values)
def p4():
    """Run the compiled ``pflat`` helper from ``subsum`` on the flattened inputs.

    Reads the module-level ``A``, ``C``, ``b`` and ``MAXIND``.  The arrays are
    passed as 1-D views because ``pflat`` is exported for 1-D arguments.
    """
    flat_labels = A.ravel()
    flat_values = C.ravel()
    return pflat(flat_labels, flat_values, b, MAXIND)
import numba as nb
@nb.njit(parallel=True, fastmath=True)
def nb_ss(A, C, b):
    """Numba kernel: add up ``C[i, j]`` wherever ``A[i, j]`` is a label in ``b``.

    Args:
        A: 2-D array of labels (indexed as ``A[i, j]``).
        C: array of values indexed the same way as ``A``.
        b: labels to select; repeated labels collapse in the set.

    Returns:
        The accumulated float total.
    """
    wanted = set(b)
    n_rows = A.shape[0]
    n_cols = A.shape[1]
    # Named ``total`` rather than ``sum`` so the builtin is not shadowed.
    total = 0.0
    # Rows are distributed with ``prange``; ``total += ...`` is the
    # reduction carried across the parallel iterations.
    for i in nb.prange(n_rows):
        for j in range(n_cols):
            if A[i, j] in wanted:
                total += C[i, j]
    return total
def mx():
    """Numba variant: call the jitted ``nb_ss`` on the module-level ``A``, ``C``, ``b``."""
    return nb_ss(A, C, b)
# Benchmark inputs: integer labels ``A`` and float values ``C`` share the
# shape ``sh``; ``b`` holds 1000 randomly drawn labels (repeats possible).
sh = (100_000, 100)
A = np.random.randint(0, MAXIND, size=sh)
C = np.random.random(size=sh)
b = np.random.randint(0, MAXIND, size=1000)

# Sanity check: every variant should print (nearly) the same total.
print(*(variant() for variant in (OP, pp, p2, p3, p4, mx)))
from timeit import timeit
# Print the mean time per call in milliseconds: ``timeit`` returns the total
# seconds for ``reps`` calls, and each multiplier equals 1000 / reps.
for label, variant, reps, to_ms in (
    ("OP", OP, 4, 250),
    ("pp", pp, 10, 100),
    ("p2", p2, 10, 100),
    ("p3", p3, 10, 100),
    ("p4", p4, 10, 100),
    ("mx", mx, 10, 100),
):
    print(label, timeit(variant, number=reps) * to_ms)
pythran 模块的代码:
[subsum.py]
import numpy as np
#pythran export pflat(int[:], float[:], int[:], int)
def pflat(A, C, b, MAXIND):
    """Sum the entries of ``C`` whose label in ``A`` occurs in ``b``.

    Args:
        A: 1-D integer array of labels, each used as an index below ``MAXIND``.
        C: 1-D float array of values, aligned element-wise with ``A``.
        b: 1-D integer array of labels to select; repeats are harmless.
        MAXIND: length of the boolean lookup table.

    Returns:
        The sum of ``C[k]`` over all ``k`` with ``A[k]`` present in ``b``.
    """
    # Lookup table: wanted[label] is True exactly for the labels in ``b``.
    wanted = np.zeros(MAXIND, bool)
    wanted[b] = True
    selected = C[wanted[A]]
    return selected.sum()
编译非常简单,只需运行 pythran subsum.py 即可。
示例运行:
41330.15849965791 41330.15849965748 41330.15849965747 41330.158499657475 41330.15849965791 41330.158499657446
OP 1963.3807722493657
pp 53.23419079941232
p2 21.8758742994396
p3 26.829131800332107
p4 12.988955597393215
mx 52.37018179905135