我正在使用clang编译以下代码:
clang++ -std=c++11 -emit-llvm -c -S $1 --cuda-gpu-arch=sm_30
。这将生成vectoradd-cuda-nvptx64-nvidia-cuda-sm_30.ll
和vectoradd.ll
文件。我的目标是对内核运行某个 LLVM 分析 pass,该 pass 可能会对内核进行插桩。因此我想把分析/插桩之后的 IR 重新链接进可执行文件,但我不确定该如何操作。当我尝试用 llvm-link
链接这些 .ll 文件时,出现错误 Linking globals named '_Z9vectoraddPiS_S_i': symbol multiply defined!
。我不太确定如何实现这一目标,如能提供任何帮助将不胜感激。
#define THREADS_PER_BLOCK 512

/// Element-wise vector add: C[i] = A[i] + B[i] for i in [0, N).
/// Expects a 1-D launch with at least N total threads; extra threads
/// past the tail are rejected by the guard below.
__global__ void vectoradd(int *A, int *B, int *C, int N) {
  const int idx = blockDim.x * blockIdx.x + threadIdx.x;
  if (idx >= N) return;  // guard: grid rarely divides N evenly
  C[idx] = A[idx] + B[idx];
}
/// Host driver: fills two N-element vectors, adds them on the device,
/// and copies the result back. Returns 0 on success, 1 on a CUDA error.
/// NOTE(review): assumes <vector>, <cstdio> and the CUDA runtime header
/// are included above this snippet — confirm in the full file.
int main(int argc, char **argv) {
  int N = 10000, *d_A, *d_B, *d_C;
  /// allocate host memory
  std::vector<int> A(N);
  std::vector<int> B(N);
  std::vector<int> C(N);
  /// allocate device memory
  cudaMalloc((void **) &d_A, N * sizeof(int));
  cudaMalloc((void **) &d_B, N * sizeof(int));
  cudaMalloc((void **) &d_C, N * sizeof(int));
  /// populate host data
  for (int i = 0; i < N; ++i) {  // int index: avoids signed/unsigned compare with N
    A[i] = i;
    B[i] = i;
  }
  /// copy to device
  cudaMemcpy(d_A, &A[0], N * sizeof(int), cudaMemcpyHostToDevice);
  // BUG FIX: the size argument was missing here — the direction enum was
  // being passed where the byte count belongs.
  cudaMemcpy(d_B, &B[0], N * sizeof(int), cudaMemcpyHostToDevice);
  dim3 block(THREADS_PER_BLOCK, 1, 1);
  dim3 grid((N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK, 1);  // ceil-div
  vectoradd<<<grid, block>>>(d_A, d_B, d_C, N);
  // Launch-configuration errors only surface via cudaGetLastError();
  // execution errors surface at the synchronize below.
  cudaError_t err = cudaGetLastError();
  if (err == cudaSuccess) err = cudaDeviceSynchronize();
  // BUG FIX: the source pointer (d_C) and the size argument were missing.
  cudaMemcpy(&C[0], d_C, N * sizeof(int), cudaMemcpyDeviceToHost);
  /// release device memory (was leaked before)
  cudaFree(d_A);
  cudaFree(d_B);
  cudaFree(d_C);
  if (err != cudaSuccess) {
    fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    return 1;
  }
  return 0;
}