// Host-side render pass: allocates two device scratch buffers, launches
// filterCircles and then kernelRenderCircles over a 2D grid of 16x16-thread
// blocks sized from image->width / image->height.
// NOTE(review): the closing brace of this function is not part of this snippet.
void CudaRenderer::render() {
// 16x16 threads per block; each grid dimension is the ceiling division of the
// image dimension by the block dimension, so the grid covers the whole image.
dim3 blockDim(16, 16, 1);
dim3 gridDim((image->width + blockDim.x - 1) / blockDim.x,
(image->height + blockDim.y - 1) / blockDim.y);
// Single `int` declarator list: filteredCircles and lastIndices are int*,
// while sz is a plain int holding the number of blocks in the grid.
int *filteredCircles,
*lastIndices,
sz = gridDim.y * gridDim.x ;
// Device allocations: room for sz * 2000 ints in filteredCircles and sz ints
// in lastIndices; lastIndices is zero-filled before the first launch.
// NOTE(review): none of the CUDA return codes below are checked, so a failed
// allocation or launch would go unnoticed here.
cudaMalloc((void **)&filteredCircles, sizeof(int) * sz * 2000);
cudaMalloc((void **)&lastIndices, sizeof(int) * sz);
cudaMemset(lastIndices, 0, sizeof(int) * sz);
filterCircles<<<gridDim, blockDim>>>(filteredCircles, lastIndices);
// BUG: lastIndices was obtained from cudaMalloc, so it is a device pointer.
// Indexing it in this host-side loop dereferences device memory on the CPU,
// which is invalid (this is the reported segmentation fault). The kernel
// launch above is also asynchronous with respect to the host, so nothing here
// waits for filterCircles to finish. To inspect the values, copy them into a
// host buffer with cudaMemcpy(..., cudaMemcpyDeviceToHost) and print that.
// NOTE(review): the loop reads 10 entries regardless of sz.
for (int i = 0; i < 10; ++i)
printf("lastIndices[%d] = %d\n", i, lastIndices[i]);
kernelRenderCircles<<<gridDim, blockDim>>>(filteredCircles, lastIndices);
// NOTE(review): only filteredCircles is released; no cudaFree(lastIndices)
// appears in the visible part of this function.
cudaFree(filteredCircles);
cudaDeviceSynchronize();
在添加打印语句之前,代码可以正常编译,但渲染结果不正确。为了检查 lastIndices,我添加了上面的打印语句,之后程序每次运行都会出现段错误(segmentation fault)。我哪里做错了?
我认为您想要做的是在 filterCircles 运行之后打印 lastIndices 的值。为此,您需要明白两点:
A:GPU 上的 `__global__` 函数(内核)相对于 CPU 是异步执行的。
B:您无法在主机(CPU)上直接访问(解引用)设备(GPU)指针。
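诀窍是将打印部分的代码更改如下:
旧代码:
    for (int i = 0; i < 10; ++i)
        printf("lastIndices[%d] = %d\n", i, lastIndices[i]);
新代码(先把设备上的数据拷贝到主机缓冲区,再打印主机端的副本;cudaMemcpy 会等待之前启动的内核执行完毕):
    int *hostLastIndices = (int *)malloc(sizeof(int) * sz);
    cudaMemcpy(hostLastIndices, lastIndices, sizeof(int) * sz, cudaMemcpyDeviceToHost);
    for (int i = 0; i < 10 && i < sz; ++i)
        printf("lastIndices[%d] = %d\n", i, hostLastIndices[i]);
    free(hostLastIndices);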