我将 OpenMP 的并行 for 构造应用于以下代码段(来自霍夫变换):
// Hough-transform accumulation loop: iterates over pixel coordinates stored
// interleaved in useful_pixels (pairs at i, i+1), stepping i by 2.
for (int i = 0; i < sz; i+=2){
// For each pixel, sweep `limit` angle steps of width `step`.
for(k = 0; k < limit; k++){
theta1 = a - v + ((double) k * step);
theta2 = b - v + ((double) k * step);
// h->trigo appears to be an interleaved lookup table (trigo[2k], trigo[2k+1]
// — presumably cos/sin per the reproducer below; confirm against real code).
rho1 = useful_pixels[i]*(h->trigo[(2*k) + 1]) + useful_pixels[i+1] * (h->trigo[2*k]);
rho2 = useful_pixels[i]* (h->trigo[2*k]) - useful_pixels[i+1]*(h->trigo[(2*k) + 1]);
// Each (i,k) pair writes two distinct slots of hold_index, so no two
// iterations write the same element.
hold_index[(i*limit) + (2*k)] = index(rows,cols,rho1,theta1);
// NOTE(review): this call passes 3 args (rows,rho2,theta2) while the line
// above passes 4 (rows,cols,rho1,theta1) — looks like `cols` was dropped
// when pasting; verify against the original code.
hold_index[(i*limit) + (2*k)+ 1] = index(rows,rho2,theta2);
}
}
我认为不会有竞争条件,因为每个线程在每次迭代中都写入 hold_index(int 指针)中互不相同的索引。有人能指出瓶颈所在吗?在处理 300 个视频帧时,并行版本反而比串行版本慢了约 50%。
在原始代码中,线程是在一个函数内创建的,而该函数在 while 循环中针对视频的每一帧被调用一次。耗时是在 Linux 上用 time 实用程序测量的。
串行:32,768 s 并行:45,238 s
编译命令: g++ stack.cpp -o test_code -std=c++11 -fopenmp
补充:以下是一个可复现的最小示例:
#include <iostream>
#include <vector>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <ctime>
#include <math.h>
#include <omp.h>
#define PI 3.14159265358979323846
using namespace std;
int main()
{
    // Minimal reproducer for the Hough-transform OpenMP slowdown question:
    // 300 simulated frames, each running the parallel accumulation loop.

    // 640 doubles: interleaved lookup table, trigo[i]=cos, trigo[i+1]=sin.
    // The original wrote malloc(64 * sizeof(double) / 0.1) — numerically the
    // same 5120 bytes, but via a floating-point division; spell it plainly.
    double *trigo = (double *) malloc(640 * sizeof(double));
    double thetaDeg = 0.0;
    for (int i = 0; i < 640; i += 2)
    {
        double thetaRadian = (thetaDeg * PI) / 180.0;
        trigo[i]     = cos(thetaRadian);
        trigo[i + 1] = sin(thetaRadian);
        thetaDeg += 0.1;
    }

    vector<int>::size_type sz = 3534;
    // One accumulator buffer reused across all frames. BUG in the original:
    // free(hold_index) sat inside the frame loop while the malloc was outside,
    // giving a use-after-free on frame 2 and repeated double-frees. Allocate
    // once here, free once after the loop.
    int *hold_index = (int *) malloc(320 * sz * sizeof(int));
    vector<int> pix;

    for (int h = 0; h < 300; h++) {
        // Build the fake pixel stream for this frame.
        for (int i = 0; i < (int)(sz / 2); i++) {
            for (int j = 0; j < 2; j++) {
                pix.push_back(i);
                pix.push_back(j);
            }
        }

        // BUG in the original: memset was called with only two arguments,
        // which does not compile; the intent is to zero the accumulator.
        memset(hold_index, 0, 320 * sz * sizeof(int));

        //double hTime = omp_get_wtime();
        #pragma omp parallel shared(hold_index)
        {
            // No private() clause needed: all scratch variables are declared
            // inside the loop body, so each thread gets its own copies. The
            // original listed private(rho1,theta1,theta2) but omitted rho2,
            // leaving it shared — a data race.
            #pragma omp for
            for (int i = 0; i < (int)sz; i += 2) {
                for (int k = 0; k < 320; k++) {
                    double theta1 = 29 + ((double) k * 0.1);
                    double theta2 = 119 + ((double) k * 0.1);
                    int rho1 = pix[i] * (trigo[(2 * k) + 1]) + pix[i + 1] * (trigo[2 * k]);
                    int rho2 = pix[i] * (trigo[2 * k]) - pix[i + 1] * (trigo[(2 * k) + 1]);
                    // Results are unused placeholders in this reproducer,
                    // exactly as in the original; silence unused warnings.
                    (void) theta1; (void) theta2; (void) rho1; (void) rho2;
                    // Each (i,k) pair writes two distinct slots — no overlap
                    // between iterations/threads.
                    hold_index[(i * 320) + (2 * k)]     = 10;
                    hold_index[(i * 320) + (2 * k) + 1] = 20;
                }
            }
        }
        //hTime = omp_get_wtime() - hTime;
        pix.clear();
    }

    free(hold_index);
    free(trigo);  // original leaked the lookup table
}