Eu tenho um CreateBasePopulation do kernel CUDA onde eu uso printf para imprimir valores de struct dentro do kernel. No entanto, nenhuma saída está sendo impressa quando eu executo o kernel, aqui está o código relevante:
__global__ void CreateBasePopulation(Population* pop, int pop_num, int input_num, int output_num) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx >= pop_num) {
return;
}
Network* net = &pop->Networks[idx];
net->num_neurons = input_num + output_num;
net->num_connections = input_num * output_num;
net->fitness = 0.0f;
curandState state;
curand_init(clock64(), idx, 0, &state);
for (int i = 0; i < output_num; ++i) {
net->Neurons[i].type = 2;
net->Neurons[i].bias = (2.0f * sqrtf((float)input_num) * curand_uniform(&state)) - sqrtf((float)input_num);
net->Neurons[i].output = 0.0f;
net->Neurons[i].input_sum = 0.0f;
printf("%f\n", net->Neurons[i].bias);
}
for (int i = 0; i < input_num; ++i) {
net->Neurons[i].type = 0;
net->Neurons[i].bias = 0.0f;
net->Neurons[i].output = 0.0f;
net->Neurons[i].input_sum = 0.0f;
for (int j = 0; j < output_num; ++j) {
int offset = j + (output_num * i);
net->Connections[offset].from = i;
net->Connections[offset].to = j;
net->Connections[offset].innovationid = offset;
net->Connections[offset].enabled = true;
net->Connections[offset].weight = (2.0f * curand_uniform(&state)) - 1.0f;
printf("Weight [%d]: %f\n", offset, net->Connections[offset].weight);
}
}
}
Eu também tentei alocar memória assim;
....
curandState state;
curand_init(clock64(), idx, 0, &state);
cudaMalloc((void**)&(net->Neurons), sizeof(Neuron) * net->num_neurons);
cudaMalloc((void**)&(net->Connections), sizeof(Connection) * net->num_connections);
....
Mas não dá nenhuma saída (tentei copiar o dispositivo para o host e havia números muito grandes e muito pequenos, então parece que há um problema com o gerenciamento de memória)
Também função principal;
int main() {
int population_size = 1024;
int input_num = 10;
int output_num = 5;
Population* d_population;
cudaMalloc(&d_population, sizeof(Population));
Network* d_networks;
cudaMalloc(&d_networks, sizeof(Network) * population_size);
cudaMemcpy(&(d_population->Networks), &d_networks, sizeof(Network*), cudaMemcpyHostToDevice);
int threadsPerBlock = 512;
int blocks = (population_size + threadsPerBlock - 1) / threadsPerBlock;
CreateBasePopulation<<<blocks, threadsPerBlock>>>(d_population, population_size, input_num, output_num);
cudaDeviceSynchronize();
std::cout << "Population created successfully!" << std::endl;
cudaFree(d_networks);
cudaFree(d_population);
return 0;
}
Também estruturas;
struct Connection {
int innovationid;
int from;
int to;
float weight;
bool enabled;
};
struct Neuron {
int type; //0 = input, 1 = hidden, 2 = output
float input_sum; // Sum of inputs into neuron
float bias;
float output; // Activated output
};
struct Network {
Connection* Connections;
Neuron* Neurons;
int num_connections;
int num_neurons;
float fitness;
};
struct Population {
Network* Networks;
int num_networks;
int generation_id;
};