我将 OpenMP 的并行 for 构造应用于以下代码段(来自霍夫变换):
// Hough-transform accumulation loop: iterates over pixel coordinates stored
// interleaved in useful_pixels (pairs at i, i+1), stepping i by 2.
for (int i = 0; i < sz; i+=2){
// For each pixel, sweep `limit` angle steps of width `step`.
for(k = 0; k < limit; k++){
theta1 = a - v + ((double) k * step);
theta2 = b - v + ((double) k * step);
// h->trigo appears to be an interleaved lookup table (trigo[2k], trigo[2k+1]
// — presumably cos/sin per the reproducer below; confirm against real code).
rho1 = useful_pixels[i]*(h->trigo[(2*k) + 1]) + useful_pixels[i+1] * (h->trigo[2*k]);
rho2 = useful_pixels[i]* (h->trigo[2*k]) - useful_pixels[i+1]*(h->trigo[(2*k) + 1]);
// Each (i,k) pair writes two distinct slots of hold_index, so no two
// iterations write the same element.
hold_index[(i*limit) + (2*k)] = index(rows,cols,rho1,theta1);
// NOTE(review): this call passes 3 args (rows,rho2,theta2) while the line
// above passes 4 (rows,cols,rho1,theta1) — looks like `cols` was dropped
// when pasting; verify against the original code.
hold_index[(i*limit) + (2*k)+ 1] = index(rows,rho2,theta2);
}
}
我认为不会有竞争条件,因为每个线程在每次迭代中都写入 hold_index(int 指针)中互不相同的索引。有人能指出瓶颈所在吗?在处理 300 个视频帧时,并行版本反而比串行版本慢了约 50%。
在原始代码中,线程是在一个函数内创建的,而该函数在 while 循环中针对视频的每一帧被调用一次。耗时是在 Linux 上用 time 实用程序测量的。
串行:32,768 s 并行:45,238 s
编译命令: g++ stack.cpp -o test_code -std=c++11 -fopenmp
补充:以下是一个可复现的最小示例:
#include <iostream>
#include <vector>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <ctime>
#include <math.h>
#include <omp.h>
#define PI 3.14159265358979323846
using namespace std;
int main()
{
    // Minimal reproducer for the Hough-transform OpenMP slowdown question:
    // 300 simulated frames, each running the parallel accumulation loop.

    // 640 doubles: interleaved lookup table, trigo[i]=cos, trigo[i+1]=sin.
    // The original wrote malloc(64 * sizeof(double) / 0.1) — numerically the
    // same 5120 bytes, but via a floating-point division; spell it plainly.
    double *trigo = (double *) malloc(640 * sizeof(double));
    double thetaDeg = 0.0;
    for (int i = 0; i < 640; i += 2)
    {
        double thetaRadian = (thetaDeg * PI) / 180.0;
        trigo[i]     = cos(thetaRadian);
        trigo[i + 1] = sin(thetaRadian);
        thetaDeg += 0.1;
    }

    vector<int>::size_type sz = 3534;
    // One accumulator buffer reused across all frames. BUG in the original:
    // free(hold_index) sat inside the frame loop while the malloc was outside,
    // giving a use-after-free on frame 2 and repeated double-frees. Allocate
    // once here, free once after the loop.
    int *hold_index = (int *) malloc(320 * sz * sizeof(int));
    vector<int> pix;

    for (int h = 0; h < 300; h++) {
        // Build the fake pixel stream for this frame.
        for (int i = 0; i < (int)(sz / 2); i++) {
            for (int j = 0; j < 2; j++) {
                pix.push_back(i);
                pix.push_back(j);
            }
        }

        // BUG in the original: memset was called with only two arguments,
        // which does not compile; the intent is to zero the accumulator.
        memset(hold_index, 0, 320 * sz * sizeof(int));

        //double hTime = omp_get_wtime();
        #pragma omp parallel shared(hold_index)
        {
            // No private() clause needed: all scratch variables are declared
            // inside the loop body, so each thread gets its own copies. The
            // original listed private(rho1,theta1,theta2) but omitted rho2,
            // leaving it shared — a data race.
            #pragma omp for
            for (int i = 0; i < (int)sz; i += 2) {
                for (int k = 0; k < 320; k++) {
                    double theta1 = 29 + ((double) k * 0.1);
                    double theta2 = 119 + ((double) k * 0.1);
                    int rho1 = pix[i] * (trigo[(2 * k) + 1]) + pix[i + 1] * (trigo[2 * k]);
                    int rho2 = pix[i] * (trigo[2 * k]) - pix[i + 1] * (trigo[(2 * k) + 1]);
                    // Results are unused placeholders in this reproducer,
                    // exactly as in the original; silence unused warnings.
                    (void) theta1; (void) theta2; (void) rho1; (void) rho2;
                    // Each (i,k) pair writes two distinct slots — no overlap
                    // between iterations/threads.
                    hold_index[(i * 320) + (2 * k)]     = 10;
                    hold_index[(i * 320) + (2 * k) + 1] = 20;
                }
            }
        }
        //hTime = omp_get_wtime() - hTime;
        pix.clear();
    }

    free(hold_index);
    free(trigo);  // original leaked the lookup table
}