O código abaixo sempre imprime "Olá do início" antes de qualquer outra coisa, e "Olá do fim" depois de qualquer outra coisa. Por que isso?
Código:
#include <stdio.h>
#include <stdlib.h>
#include <vector>
#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
__global__ void add1InGPU( int *devArr, int n, int nx )
{
int ix = blockIdx.x * blockDim.x + threadIdx.x;
int iy = blockIdx.y * blockDim.y + threadIdx.y;
int id = nx * iy + ix;
int whateverNum = 5;
if (id == whateverNum) printf("Hello from the start\n");
if ( id < n ) {
devArr[id] += 1;
printf("blockIdx.x: %d, threadIdx.x: %d, blockIdx.y: %d, threadIdx.y: %d, processing index %d\n", blockIdx.x, threadIdx.x, blockIdx.y, threadIdx.y, id );
}
else {
printf("blockIdx.x: %d, threadIdx.x: %d, blockIdx.y: %d, threadIdx.y: %d, skipped\n", blockIdx.x, threadIdx.x, blockIdx.y, threadIdx.y);
}
if (id == whateverNum) printf("Hello from the end\n");
}
int main(void)
{
int* d_arr = NULL;
cudaMalloc(&d_arr, 16 * sizeof(int));
cudaMemset(d_arr, 0, 16 * sizeof(int));
int nElem = 16;
dim3 block( 2, 2 );
dim3 grid( 2, 2 );
// dim3 grid( ( nElem + block.x - 1 ) / block.x );
add1InGPU<<<grid, block>>> ( d_arr, nElem, 4 );
cudaDeviceSynchronize();
int *h_arr = (int*)malloc(16 * sizeof(int));
cudaMemcpy(h_arr, d_arr, 10 * sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(d_arr);
volatile int _ = 0;
for (int i = 0; i < 10; i++) {
_ += h_arr[i];
}
cudaDeviceReset();
return 0;
}
Saída:
Hello from the start
blockIdx.x: 1, threadIdx.x: 0, blockIdx.y: 0, threadIdx.y: 0, processing index 2
blockIdx.x: 1, threadIdx.x: 1, blockIdx.y: 0, threadIdx.y: 0, processing index 3
blockIdx.x: 1, threadIdx.x: 0, blockIdx.y: 0, threadIdx.y: 1, processing index 6
blockIdx.x: 1, threadIdx.x: 1, blockIdx.y: 0, threadIdx.y: 1, processing index 7
blockIdx.x: 0, threadIdx.x: 0, blockIdx.y: 1, threadIdx.y: 0, processing index 8
blockIdx.x: 0, threadIdx.x: 1, blockIdx.y: 1, threadIdx.y: 0, processing index 9
blockIdx.x: 0, threadIdx.x: 0, blockIdx.y: 1, threadIdx.y: 1, processing index 12
blockIdx.x: 0, threadIdx.x: 1, blockIdx.y: 1, threadIdx.y: 1, processing index 13
blockIdx.x: 1, threadIdx.x: 0, blockIdx.y: 1, threadIdx.y: 0, processing index 10
blockIdx.x: 1, threadIdx.x: 1, blockIdx.y: 1, threadIdx.y: 0, processing index 11
blockIdx.x: 1, threadIdx.x: 0, blockIdx.y: 1, threadIdx.y: 1, processing index 14
blockIdx.x: 1, threadIdx.x: 1, blockIdx.y: 1, threadIdx.y: 1, processing index 15
blockIdx.x: 0, threadIdx.x: 0, blockIdx.y: 0, threadIdx.y: 0, processing index 0
blockIdx.x: 0, threadIdx.x: 1, blockIdx.y: 0, threadIdx.y: 0, processing index 1
blockIdx.x: 0, threadIdx.x: 0, blockIdx.y: 0, threadIdx.y: 1, processing index 4
blockIdx.x: 0, threadIdx.x: 1, blockIdx.y: 0, threadIdx.y: 1, processing index 5
Hello from the end