【发布时间】:2021-04-05 19:42:39
【问题描述】:
我决定自己研究 OpenCL 并为 TEA 算法编写一个蛮力密码,我对 OpenCL 的理解正确吗?你能在速度方面改进一些东西吗?我犯了什么错误?
我按周期准备前 5 个字节,其余 3 个字节由内核整理,255 个线程,每个线程 65535
在主程序中:
for (int x5 = KEY[0]; x5 >= 0; x5--) {
KEY[0]=x5;
for (int x4 = KEY[1]; x4 >= 0; x4--) {
KEY[1]=x4;
for (int x3 = KEY[2]; x3 >= 0; x3--) {
KEY[2]=x3;
for (int x2 = KEY[3]; x2 >= 0; x2--) {
KEY[3]=x2;
for (int x = KEY[4]; x >= 0; x--) {
KEY[4]=x;
ret = clEnqueueWriteBuffer(command_queue, key_mem_obj, CL_TRUE, 0,
8 * sizeof(int), KEY, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, cadr_mem_obj, CL_TRUE, 0,
1 * sizeof(int), CADR, 0, NULL, NULL);
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&key_mem_obj);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&cadr_mem_obj);
NDRange = 0x0100;
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
&NDRange, NULL, 0, NULL, NULL);
if (ret != CL_SUCCESS) {
break;
}
ret = clEnqueueReadBuffer(command_queue, cadr_mem_obj, CL_TRUE, 0,
1 * sizeof(int), CADR, 0, NULL, NULL);
if (CADR[0]>0) {
uint16_t k=CADR[0];
ret = clEnqueueReadBuffer(command_queue, retc_mem_obj, CL_TRUE, 0,
524280 * sizeof(int), RETC, 0, NULL, NULL);
for ((i = 0); i < k; i++) {
Form1->Memo1->Lines->BeginUpdate();
Form1->Memo1->Lines->Add(IntToHex(RETC[i*8],2)+IntToHex(RETC[i*8+1],2)+
IntToHex(RETC[i*8+2],2)+IntToHex(RETC[i*8+3],2)+IntToHex(RETC[i*8+4],2)+
IntToHex(RETC[i*8+5],2)+IntToHex(RETC[i*8+6],2)+IntToHex(RETC[i*8+7],2));
Form1->Memo1->Lines->EndUpdate();
Form1->Label6->Caption=IntToStr(Form1->Memo1->Lines->Count-1);
}
CADR[0]=0;
}
KEY2[0]=KEY[0];
KEY2[1]=KEY[1];
KEY2[2]=KEY[2];
KEY2[3]=KEY[3];
KEY2[4]=KEY[4];
KEY2[5]=KEY[5];
KEY2[6]=KEY[6];
KEY2[7]=KEY[7];
if(Terminated){
break;
}
}
KEY[4]=0xFF;
}
KEY[3]=0xFF;
}
KEY[2]=0xFF;
}
KEY[1]=0xFF;
}
KEY[0]=0xFF;`
内核:
#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
__kernel void brute(__global const int *KEY, __global const int *DAT, __global int
*CADR,__global int *RETC)
{
int i = get_global_id(0);
ushort Data[2];
ushort Key[4];
Key[0]=(KEY[0]<<8)+KEY[1];
Key[1]=(KEY[2]<<8)+KEY[3];
// Key[2]=(KEY[4]<<8)+KEY[5];
Key[3]=(KEY[6]<<8)+KEY[7];
Key[2] = (KEY[4]<<8) + i;
for (int j=0xFFFF; j>=0; j--){
Key[3]=j;
Data[0]=(DAT[0]<<8)+DAT[1];
Data[1]=(DAT[2]<<8)+DAT[3];
ushort delta = 0x9e37;
ushort sum = (delta<<5);
for (uint n = 0;n < 32; ++n){
Data[1]-=(((Data[0])+Key[2])^(Data[0]+sum)^((Data[0]>>5)+Key[3]));
Data[0]-=(((Data[1]<<4)+Key[0])^(Data[1]+sum)^(Data[1]+Key[1]));
sum -= delta;
}
if ((Data[0]==0x0000) && (Data[1]==0x0000)){
int a=CADR[0];
atomic_inc(CADR);
RETC[a*8]=(Key[0] >> 8);
RETC[a*8+1]=(Key[0] & 0xFF);
RETC[a*8+2]=(Key[1] >> 8);
RETC[a*8+3]=(Key[1] & 0xFF);
RETC[a*8+4]=(Key[2] >> 8);
RETC[a*8+5]=(Key[2] & 0xFF);
RETC[a*8+6]=(Key[3] >> 8);
RETC[a*8+7]=(Key[3] & 0xFF);
}
}
}
【问题讨论】:
-
这也很大程度上取决于您的硬件;这是关于您的特定实现(我们无法在您的硬件上测试)的问题,还是关于制作通用实现(在任何硬件上都有效)的问题?
-
我只是想澄清一下线程,据我了解有256、512(每个显卡都有自己的),我选择了编写代码256的最佳选项,这是KEY [5] 256 个线程同时启动并使用常规循环遍历 KEY [6] 和 KEY [7] = 65536 个键(如果写得不正确,我很抱歉,我使用谷歌翻译)
-
我理解正确,如果我这样做,我会挤出显卡的最大值吗?
size_t work_group_size; clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &work_group_size, NULL); NDRange = 0x010000; ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,&NDRange, &work_group_size, 0, NULL, NULL);
标签: opencl