Python 解决方案:
import numpy as np
import time
def compute(A, AU):
A[:,:] = 0
# Put every 8 columns in AU into A
for i in range(A.shape[1]):
A[:, i//8] = np.bitwise_or(A[:, i//8], np.left_shift(AU[:, i], i % 8))
colm = int(20000/8)
rows = 10000
cols = int(colm*8)
AU = np.random.randint(2,size=(rows, cols),dtype=np.int8)
start_time = time.time()
A = np.empty((rows,colm), dtype=np.uint8)
start_time = time.time()
compute(A, AU)
end_time = time.time()
print(end_time - start_time)
在 1/2 秒内打包位
C 中的相同代码:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
int main(int argc, char* argv[]) {
int colm = 200000/8;
int rows = 10000;
int cols = colm*8;
unsigned char *A = (unsigned char *)malloc(rows * colm * sizeof(unsigned char));
unsigned char *AU = (unsigned char *)malloc(rows * cols * sizeof(unsigned char));
int i, j;
clock_t begin;
clock_t end;
double time_spent;
begin = clock();
// Create AU
for (i = 0; i < rows; i++)
for (j = 0; j < cols; j++)
*(AU + i*cols + j) = (unsigned char) (rand() & 0x01);
end = clock();
time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
printf("%lf seconds to create AU\n", time_spent);
begin = clock();
// Create a zeroed out A
for (i = 0; i < rows; i++)
for (j = 0; j < colm; j++)
*(A + i*colm + j) = (unsigned char) 0;
end = clock();
time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
printf("%lf seconds to create A\n", time_spent);
begin = clock();
// Pack into bits
for (i = 0; i < rows; i++)
for (j = 0; j < colm; j++) {
int au_idx = i*cols + j*8;
for (int k=0; k<8; k++)
*(A + i*colm + j) |= *(AU + au_idx + k) << k;
}
end = clock();
time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
printf("%lf seconds to pack\n", time_spent);
free(A);
free(AU);
return 0;
}
使用 colm=200,000 进行测试。位打包需要 0.27 秒,而 Jérôme Richard 提供的优化 Python 版本需要 0.64 秒。对 rand() 的调用非常昂贵,并且大大增加了整体运行时间。在内存方面,C 版本的峰值为 2GB,而 Python 为 4.2GB。进一步的代码优化和并行化肯定会减少运行时间。
Julia version:
using Random
colm = 200000÷8
rows = 30000
cols = colm*8
AU = zeros(UInt8, (rows, cols))
rand!(AU)
AU .&= 0x01
A = zeros(UInt8, (rows, colm))
function compute(A, AU)
for i in 1:size(A)[2]
start_col = (i-1) << 3
@views A[:, i] .= AU[:, start_col + 1] .|
(AU[:, start_col + 2] .<< 1) .|
(AU[:, start_col + 3] .<< 2) .|
(AU[:, start_col + 4] .<< 3) .|
(AU[:, start_col + 5] .<< 4) .|
(AU[:, start_col + 6] .<< 5) .|
(AU[:, start_col + 7] .<< 6) .|
(AU[:, start_col + 8] .<< 7)
end
end
@time compute(A, AU)
Julia 在性能方面的扩展性很好。 colm=25,000 和 rows=30,000 的结果:
Language Total Run Time (secs) Bit Packing Time (secs) Peak Memory (GB)
Python 22.1 3.0 6
Julia 11.7 1.2 6