【发布时间】:2021-10-12 02:54:05
【问题描述】:
我对多线程了解不多,也不知道为什么会发生这种情况,所以我直奔主题。
我正在处理一个图像并将图像分成 4 个部分并将每个部分传递给每个线程(基本上我传递每个部分的第一和最后一个像素行的索引)。例如,如果图像有 1000 行,每个线程将处理其中的 250 行。我可以详细介绍我的实现以及我想要实现的目标,以防它对您有所帮助。现在,我提供线程执行的代码,以防您发现发生这种情况的原因。我不知道这是否相关,但在这两种情况下(1 个线程或 4 个线程),该过程大约需要 15 毫秒,pfUMap 和 pbUMap 是无序映射。
void jacobiansThread(int start, int end,vector<float> &sJT,vector<float> &sJTJ) {
uchar* rgbPointer;
float* depthPointer;
float* sdfPointer;
float* dfdxPointer; float* dfdyPointer;
float fov = radians(45.0);
float aspect = 4.0 / 3.0;
float focal = 1 / (glm::tan(fov / 2));
float fu = focal * cols / 2 / aspect;
float fv = focal * rows / 2;
float strictFu = focal / aspect;
float strictFv = focal;
vector<float> pixelJacobi(6, 0);
for (int y = start; y <end; y++) {
rgbPointer = sceneImage.ptr<uchar>(y);
depthPointer = depthBuffer.ptr<float>(y);
dfdxPointer = dfdx.ptr<float>(y);
dfdyPointer = dfdy.ptr<float>(y);
sdfPointer = sdf.ptr<float>(y);
for (int x = roiX.x; x <roiX.y; x++) {
float deltaTerm;// = deltaPointer[x];
float raw = sdfPointer[x];
if (raw > 8.0)continue;
float dirac = (1.0f / float(CV_PI)) * (1.2f / (raw * 1.44f * raw + 1.0f));
deltaTerm = dirac;
vec3 rgb(rgbPointer[x * 3], rgbPointer[x * 3+1], rgbPointer[x * 3+2]);
vec3 bin = rgbToBin(rgb, numberOfBins);
int indexOfColor = bin.x * numberOfBins * numberOfBins + bin.y * numberOfBins + bin.z;
float s3 = glfwGetTime();
float pF = pfUMap[indexOfColor];
float pB = pbUMap[indexOfColor];
float heavisideTerm;
heavisideTerm = HEAVISIDE(raw);
float denominator = (heavisideTerm * pF + (1 - heavisideTerm) * pB) + 0.000001;
float commonFirstTerm = -(pF - pB) / denominator * deltaTerm;
if (pF == pB)continue;
vec3 pixel(x, y, depthPointer[x]);
float dfdxTerm = dfdxPointer[x];
float dfdyTerm = -dfdyPointer[x];
if (pixel.z == 1) {
cv::Point c = findClosestContourPoint(cv::Point(x, y), dfdxTerm, -dfdyTerm, abs(raw));
if (c.x == -1)continue;
pixel = vec3(c.x, c.y, depthBuffer.at<float>(cv::Point(c.x, c.y)));
}
vec3 point3D = pixel;
pixelToViewFast(point3D, cols, rows, strictFu, strictFv);
float Xc = point3D.x; float Xc2 = Xc * Xc; float Yc = point3D.y; float Yc2 = Yc * Yc; float Zc = point3D.z; float Zc2 = Zc * Zc;
pixelJacobi[0] = dfdyTerm * ((fv * Yc2) / Zc2 + fv) + (dfdxTerm * fu * Xc * Yc) / Zc2;
pixelJacobi[1] = -dfdxTerm * ((fu * Xc2) / Zc2 + fu) - (dfdyTerm * fv * Xc * Yc) / Zc2;
pixelJacobi[2] = -(dfdyTerm * fv * Xc) / Zc + (dfdxTerm * fu * Yc) / Zc;
pixelJacobi[3] = -(dfdxTerm * fu) / Zc;
pixelJacobi[4] = -(dfdyTerm * fv) / Zc;
pixelJacobi[5] = (dfdyTerm * fv * Yc) / Zc2 + (dfdxTerm * fu * Xc) / Zc2;
float weightingTerm = -1.0 / log(denominator);
for (int i = 0; i < 6; i++) {
pixelJacobi[i] *= commonFirstTerm;
sJT[i] += pixelJacobi[i];
}
for (int i = 0; i < 6; i++) {
for (int j = i; j < 6; j++) {
sJTJ[i * 6 + j] += weightingTerm * pixelJacobi[i] * pixelJacobi[j];
}
}
}
}
}
这是我调用每个线程的部分:
vector<std::thread> myThreads;
float step = (roiY.y - roiY.x) / numberOfThreads;
vector<vector<float>> tsJT(numberOfThreads, vector<float>(6, 0));
vector<vector<float>> tsJTJ(numberOfThreads, vector<float>(36, 0));
for (int i = 0; i < numberOfThreads; i++) {
int start = roiY.x+i * step;
int end = start + step;
if (end > roiY.y)end = roiY.y;
myThreads.push_back(std::thread(&pwp3dV2::jacobiansThread, this,start,end,std::ref(tsJT[i]), std::ref(tsJTJ[i])));
}
vector<float> sJT(6, 0);
vector<float> sJTJ(36, 0);
for (int i = 0; i < numberOfThreads; i++)myThreads[i].join();
其他说明
为了测量时间,我在第二个代码片段之前和之后使用了 glfwGetTime()。测量结果各不相同,但正如我所提到的,这两种实现的平均值约为 15 毫秒。
【问题讨论】:
-
@463035818_is_not_a_number 我在问为什么使用 4 个并行处理 250 行的线程与使用一个处理 1000 行的线程一样快。
-
15ms 不算多,创建和加入线程会增加开销
-
std::thread 构造函数返回的事实并不意味着线程已经开始处理数据。除其他外,这取决于调度程序。常见的解决方案是让线程保持在后台运行,并在需要时向它们发送数据,而不是每次有工作要做时调用 std::thread 构造函数来创建一个新线程。
-
是的,这就是托马斯所说的
-
“基本离散”是的。它可能与调度程序时间片或其他原因有关。无论如何,这是一些定时器中断的间隔。 15.625 毫秒是 1/64 秒。一些 Windows 以不同的计时器中断间隔运行(大约 10 或 1 毫秒)。如果您没有在 Windows 上运行,但获得了约 15 毫秒(31、46、62...)的可见粒度,那会让我感到惊讶。用一个什么都不做的愚蠢的for循环来测试这个,看看你能达到什么时间差。
标签: c++ multithreading opencv optimization parallel-processing