cuda copy data which dynamic malloc in kernel from device memory - cuda

I met a problem about using cudaMemcpy with cudaMemcpyDeviceToHost.
There is a struct which have a pointer int* a, It will malloc in the kernel function.
And then I need copy this int* a to host memory.
My question is: I didn't know how it can not work by using cudaMemcpy.
There my codes:
#include <cuda_runtime.h>
#include <stdio.h>
typedef struct { int n, m; int *a; } myst;
__global__ void xthread(myst *st)
{
unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
myst *mst = &st[idx];
mst->n = idx;
mst->m = idx+1;
mst->a = (int *)malloc((mst->m)*sizeof(int));
mst->a[0] = idx;
}
int main(int argc,char **argv)
{
dim3 dimGrid(1);
dim3 dimBlock(2);
myst *mst = NULL;
myst *hst = (myst *)malloc(2 * sizeof(myst));
cudaMalloc(&mst, 2 * sizeof(myst));
xthread<<<dimGrid, dimBlock>>>(mst);
cudaDeviceSynchronize();
cudaMemcpy(&hst[0],&mst[0],sizeof(myst),cudaMemcpyDeviceToHost);
cudaMemcpy(&hst[1],&mst[1],sizeof(myst),cudaMemcpyDeviceToHost);
int *pInt1 = (int *)malloc((hst[0].m)*sizeof(int)) ;
int *pInt2 = (int *)malloc((hst[1].m)*sizeof(int)) ;
cudaMemcpy(pInt1, hst[0].a, (hst[0].m)*sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(pInt2, hst[1].a, (hst[1].m)*sizeof(int), cudaMemcpyDeviceToHost);
printf("%d\t%d\t%d\n",hst[0].n,hst[0].m, pInt1[0]);
printf("%d\t%d\t%d\n",hst[1].n,hst[1].m, pInt2[0]);
free(pInt1);
free(pInt2);
return 0;
}
The codes will go warning about "Cuda API error detected: cudaMemcpy returned (0xb)"
I saw a similar question : copy data which is allocated in device from device to host
But it seem that can not solve my problem.
Thx.

Alright, I work it out with a stupid way (-.-!!).
While return form the kernel function, I count how many space I have to malloc in Host and Device, and cudaMalloc again a big space . Next, in other kernel function named ythread, copy the data which in the Heap to the big space.
typedef struct { int n, m; int *a; } myst;
__global__ void xthread(myst *st) {
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
myst *mst = &st[idx];
mst->n = idx;
mst->m = idx + 1;
mst->a = (int *) malloc((mst->m) * sizeof(int));
for (int i = 0; i < mst->m; i++) {
mst->a[i] = idx + 900 + i * 10;
}
}
__global__ void ythread(myst *st, int *total_a) {
unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
myst *mst = &st[idx];
int offset=0;
for(int i=0; i<idx; i++) {
offset += st[i].m;
}
for(int i=0; i<mst->m; i++) {
total_a[offset+i] = mst->a[i];
}
}
int main(int argc,char **argv) {
dim3 dimGrid(1);
dim3 dimBlock(2);
myst *mst = NULL;
cudaMalloc((void**)&mst, dimBlock.x * sizeof(myst));
xthread<<<dimGrid, dimBlock>>>(mst);
cudaDeviceSynchronize();
myst *hst = (myst *)malloc(dimBlock.x * sizeof(myst));
cudaMemcpy(hst, mst, dimBlock.x*sizeof(myst),cudaMemcpyDeviceToHost);
int t_size = 0;
for(int i=0; i<dimBlock.x; i++) {
t_size += hst[i].m;
}
printf("t_size:%d\n", t_size);
int * t_a_h = (int *)malloc(t_size*sizeof(int));
int * t_a_d = NULL;
cudaMalloc((void**)&t_a_d, t_size*sizeof(int));
ythread<<<dimGrid, dimBlock>>>(mst, t_a_d);
cudaDeviceSynchronize();
cudaMemcpy(t_a_h, t_a_d, t_size*sizeof(int),cudaMemcpyDeviceToHost);
for(int i=0; i<t_size; i++) {
printf("t_a_h[%d]:%d\n", i, t_a_h[i]);
}
free(t_a_h);
cudaFree(mst);
cudaFree(t_a_d);
return 0;
}
Emmmmmm, it work, but I think there is a better way to solve this problem.

Related

How to keep track of executed CUDA blocks?

Just for the sake of testing my understanding of things, I decided to modify the vector addition found in the CUDA samples so that the kernel quits after a specific time and is then re-launched to complete. The way I achieve the "timeout" is by having a pinned variable that the host sets to 1 after some time. Within the kernel, a check of this variable is performed to determine whether execution should continue. If the thread continues its execution it is marked as complete. In order to test that each thread executes just once, I've modified the addition to C[i] = C[i] + B[i] This all works as expected; the device code looks as follows:
/* Function
* Internal device function used for getting the current thread's global ID
* regardless of the block/grid configuration. It assumes that the
* grid and block are 3 dimensional.
*
* #return: The thread's global ID
*/
static __device__ int get_global_idx()
{
int blockId = blockIdx.x
+ blockIdx.y * gridDim.x
+ gridDim.x * gridDim.y * blockIdx.z;
int threadId = blockId * (blockDim.x * blockDim.y * blockDim.z)
+ (threadIdx.z * (blockDim.x * blockDim.y))
+ (threadIdx.y * blockDim.x)
+ threadIdx.x;
return threadId;
}
/* Function
* Device function that determines if the current thread should continue execution.
* A check should be used on the return value. If the timeout has not been set
* and the thread has not previously executed the index at the thread's ID in the
* thread_ids array is set to 1 to indicate it was allowed to proceed.
*
* #param thread_ids: A pointer to the array with a size that matches the max number
* of threads that will be spawned
*
* #param time_out: Memory mapped variable used by the host to signal the kernel when
* execution should suspend
*
* #return: A boolean value indicating whether the current thread should continue or not
*/
__device__ bool continue(unsigned int *thread_ids, volatile unsigned int *time_out)
{
if(*time_out == 1){
return false;
}
int tid = get_global_idx();
if(thread_ids[tid] == 1)
{
return false;
}
thread_ids[tid] = 1;
return true;
}
__global__ void
vectorAdd(const float *A, const float *B, float *C, long numElements, unsigned int *thread_ids, volatile unsigned int *timeout)
{
if(!continue(thread_ids, timeout))
{
return;
}
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
/* C[i] = A[i] + B[i]; */
C[i] = C[i] + B[i]; //Modifed from above
}
}
I considered how this may fail if __syncthreads() was used. So I decided to do block level suspension. Based on my understanding, I thought this would be simple. Keep track of if a block has started, and count how many threads have executed for that block and only suspend when all threads of an already started block have completed and deny any threads who's block has not started. So I used a struct and modified the continue function as follows:
typedef struct block_info_t{
int started; /* Initialized to zero before any kernel launch */
unsigned int thread_count;
}block_info;
__device__ bool continue(unsigned int *thread_ids, volatile unsigned int *time_out, block_info *b_info)
{
int bid = blockIdx.x + gridDim.x * (blockIdx.y + gridDim.z * blockIdx.z);
unsigned int bsize = blockDim.x * blockDim.y * blockDim.z;
if(*time_out == 1 && b_info[bid].started == 0)
{
return false;
}
if(b_info[bid].thread_count == bsize)
{
return false;
}
b_info[bid].started = 1;
atomicInc(&b_info[bid].thread_count, bsize);
return true;
}
This does not work, when I execute verification on the host (h_B[i] - h_C[i]) I don't get a consistent zero result. Which means that some threads somehow managed to execute multiple times. Any ideas how/why this is happening with the latter attempt? Thanks.
I don't care about performance at this point; just trying to understand what is really happening.
EDIT
Here is the complete code, compile with nvcc file_name.cu and execute program_name <vector-length>.
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef struct block_info_t{
int started; /* Initialized to zero before any kernel launch */
unsigned int thread_count;
}block_info;
__device__ bool continue_execution(volatile unsigned int *time_out, block_info *b_info)
{
int bid = blockIdx.x + gridDim.x * (blockIdx.y + gridDim.z * blockIdx.z);
unsigned int bsize = blockDim.x * blockDim.y * blockDim.z;
if(*time_out == 1 && b_info[bid].started == 0)
{
return false;
}
if(b_info[bid].thread_count == bsize)
{
return false;
}
b_info[bid].started = 1;
atomicInc(&b_info[bid].thread_count, bsize);
return true;
}
__global__ void
vectorAdd(const float *A, const float *B, float *C, long numElements, volatile unsigned int *time_out, block_info *b_info)
{
if(!continue_execution(time_out, b_info))
{
return;
}
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
//C[i] = A[i] + B[i];
C[i] = C[i] + B[i]; //Modified from above
}
}
void computation_complete(int *complete, int block_amt, block_info *h_block_info)
{
size_t i;
for(i = 0; i < block_amt; i++)
{
if(h_block_info[i].started == 1)
{
continue;
}
break;
}
*complete = (i == block_amt) ? 1 : 0;
}
int main(int argc, char *argv[])
{
if(argc != 2)
{
fprintf(stderr, "usage: <program-name> <vector-length>\n");
exit(EXIT_FAILURE);
}
// Print the vector length to be used, and compute its size
long numElements = strtol(argv[1], NULL, 10);
size_t size = numElements * sizeof(float);
printf("[Vector addition of %d elements]\n", numElements);
float *h_A = (float *)malloc(size);
float *h_B = (float *)malloc(size);
float *h_C = (float *)malloc(size);
// Initialize the host input vectors
for (int i = 0; i < numElements; ++i)
{
h_A[i] = rand()/(float)RAND_MAX;
h_B[i] = rand()/(float)RAND_MAX;
h_C[i] = 0.0;
}
float *d_A = NULL;
cudaMalloc((void **)&d_A, size);
float *d_B = NULL;
cudaMalloc((void **)&d_B, size);
float *d_C = NULL;
cudaMalloc((void **)&d_C, size);
cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_C, h_C, size, cudaMemcpyHostToDevice);
int threadsPerBlock = 256;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
size_t block_info_bytes = blocksPerGrid * sizeof(struct block_info_t);
block_info *h_block_info = (struct block_info_t *)malloc(block_info_bytes);
for(int i = 0; i < blocksPerGrid; i++)
{
h_block_info[i].started = 0;
h_block_info[i].thread_count = 0;
}
block_info *d_block_info = NULL;
cudaMalloc(&d_block_info, block_info_bytes);
cudaMemcpy(d_block_info, h_block_info, block_info_bytes, cudaMemcpyHostToDevice);
volatile unsigned int *timeout = NULL;
cudaHostAlloc((void **)&timeout, sizeof(volatile unsigned int), cudaHostAllocMapped);
*timeout = 0;
double quantum = 0.0001 * 1000000.0;
double initial_quantum = quantum;
int complete = 0;
/* Here the kernel launch is looped until all blocks are complete */
while(complete == 0)
{
vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements, timeout, d_block_info);
usleep(quantum);
*timeout = 1;
cudaDeviceSynchronize();
cudaMemcpy(h_block_info, d_block_info, block_info_bytes, cudaMemcpyDeviceToHost);
computation_complete(&complete, blocksPerGrid, h_block_info);
if(complete == 0)
{
quantum = quantum + initial_quantum;
*timeout = 0;
}
}
cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
// Verify that the result vector is correct
for (int i = 0; i < numElements; ++i)
{
if (fabs(h_B[i] - h_C[i]) > 1e-5)
{
fprintf(stderr, "Result verification failed at element %d!\n", i);
exit(EXIT_FAILURE);
}
}
printf("Test PASSED\n");
// Free device global memory
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
free(h_A);
free(h_B);
free(h_C);
cudaDeviceReset();
return 0;
}
You have a race condition in your continue_execution routine. Consider the following scenario:
warp0 of a threadblock enters the continue_execution routine. At the moment that it checks the variables *time_out and b_info[bid].started it witnesses those to be 0 and 0 respectively. So it proceeds to the next if test.
warp1 of the same threadblock enters the continue_execution routine (let's say slightly later), and it witnesses the variables to be 1 and 0 respectively. So it returns false and causes the warp1 threads to exit.
warp0 continues on and eventually sets b_info[bid].started to 1, and then updates the thread_count. It then returns true and proceeds with the vector add.
I could continue with this, but I think if you consider the above 3 items carefully you will realize it is a case you did not account for. Your implicit expectation is that every thread would read a coherent (i.e. the same across a given threadblock) value for *time_out. But this is not guaranteed by your code, and if it fails to do so, then we end up with some threadblocks where some threads have completed their work and some have not.
So how could we fix this? The above description should point the way. One possible approach is to guarantee that for any given threadblock, that every thread gets the same value for *time_out whether it be 1 or 0. One possible solution would be to make the following changes to the beginning of your vectorAdd kernel:
__shared__ volatile unsigned int my_time_out;
if (!threadIdx.x) my_time_out = *time_out;
__syncthreads();
if(!continue_execution(&my_time_out, b_info))
with those changes, we ensure that every thread in a block gets a coherent view of the time out variable, and according to my testing, the problem is resolved:
$ cat t100.cu
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef struct block_info_t{
int started; /* Initialized to zero before any kernel launch */
unsigned int thread_count;
}block_info;
__device__ bool continue_execution(volatile unsigned int *time_out, block_info *b_info)
{
int bid = blockIdx.x + gridDim.x * (blockIdx.y + gridDim.z * blockIdx.z);
unsigned int bsize = blockDim.x * blockDim.y * blockDim.z;
if(*time_out == 1 && b_info[bid].started == 0)
{
return false;
}
if(b_info[bid].thread_count == bsize)
{
return false;
}
b_info[bid].started = 1;
atomicInc(&b_info[bid].thread_count, bsize);
return true;
}
__global__ void
vectorAdd(const float *A, const float *B, float *C, long numElements, volatile unsigned int *time_out, block_info *b_info)
{
#ifdef USE_FIX
__shared__ volatile unsigned int my_time_out;
if (!threadIdx.x) my_time_out = *time_out;
__syncthreads();
if(!continue_execution(&my_time_out, b_info))
#else
if(!continue_execution(time_out, b_info))
#endif
{
return;
}
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
//C[i] = A[i] + B[i];
C[i] = C[i] + B[i]; //Modified from above
}
}
void computation_complete(int *complete, int block_amt, block_info *h_block_info)
{
size_t i;
for(i = 0; i < block_amt; i++)
{
if(h_block_info[i].started == 1)
{
continue;
}
break;
}
*complete = (i == block_amt) ? 1 : 0;
}
int main(int argc, char *argv[])
{
if(argc != 2)
{
fprintf(stderr, "usage: <program-name> <vector-length>\n");
exit(EXIT_FAILURE);
}
// Print the vector length to be used, and compute its size
long numElements = strtol(argv[1], NULL, 10);
size_t size = numElements * sizeof(float);
printf("[Vector addition of %ld elements]\n", numElements);
float *h_A = (float *)malloc(size);
float *h_B = (float *)malloc(size);
float *h_C = (float *)malloc(size);
// Initialize the host input vectors
for (int i = 0; i < numElements; ++i)
{
h_A[i] = rand()/(float)RAND_MAX;
h_B[i] = rand()/(float)RAND_MAX;
h_C[i] = 0.0;
}
float *d_A = NULL;
cudaMalloc((void **)&d_A, size);
float *d_B = NULL;
cudaMalloc((void **)&d_B, size);
float *d_C = NULL;
cudaMalloc((void **)&d_C, size);
cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_C, h_C, size, cudaMemcpyHostToDevice);
int threadsPerBlock = 256;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
size_t block_info_bytes = blocksPerGrid * sizeof(struct block_info_t);
block_info *h_block_info = (struct block_info_t *)malloc(block_info_bytes);
for(int i = 0; i < blocksPerGrid; i++)
{
h_block_info[i].started = 0;
h_block_info[i].thread_count = 0;
}
block_info *d_block_info = NULL;
cudaMalloc(&d_block_info, block_info_bytes);
cudaMemcpy(d_block_info, h_block_info, block_info_bytes, cudaMemcpyHostToDevice);
volatile unsigned int *timeout = NULL;
cudaHostAlloc((void **)&timeout, sizeof(volatile unsigned int), cudaHostAllocMapped);
*timeout = 0;
double quantum = 0.0001 * 1000000.0;
double initial_quantum = quantum;
int complete = 0;
/* Here the kernel launch is looped until all blocks are complete */
while(complete == 0)
{
vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements, timeout, d_block_info);
usleep(quantum);
*timeout = 1;
cudaDeviceSynchronize();
cudaMemcpy(h_block_info, d_block_info, block_info_bytes, cudaMemcpyDeviceToHost);
computation_complete(&complete, blocksPerGrid, h_block_info);
if(complete == 0)
{
quantum = quantum + initial_quantum;
*timeout = 0;
}
}
cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
// Verify that the result vector is correct
for (int i = 0; i < numElements; ++i)
{
if (fabs(h_B[i] - h_C[i]) > 1e-5)
{
fprintf(stderr, "Result verification failed at element %d!\n", i);
exit(EXIT_FAILURE);
}
}
printf("Test PASSED\n");
// Free device global memory
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
free(h_A);
free(h_B);
free(h_C);
cudaDeviceReset();
return 0;
}
$ nvcc -arch=sm_61 -o t100 t100.cu
$ ./t100 327678
[Vector addition of 327678 elements]
Result verification failed at element 0!
$ nvcc -arch=sm_61 -o t100 t100.cu -DUSE_FIX
$ ./t100 327678
[Vector addition of 327678 elements]
Test PASSED
$ ./t100 327678
[Vector addition of 327678 elements]
Test PASSED
$ ./t100 327678
[Vector addition of 327678 elements]
Test PASSED
$
One other change I made to your code was in this line:
printf("[Vector addition of %d elements]\n", numElements);
this has no bearing on the problem, but your format specifier does not match your variable type. Fix by changing to %ld.

Can't find out why cudaMemcpy fails using curand library functions

I'm trying to initialize in a random manner the weights ( stored as floats ) of a neural network using CURAND functions.
I first initialize the neural netwotk with some values and after that I attempt to copy the two matrices in the nn struct ( nn stands for neural network ), that should store the weight values ( nn.wih, and nn.who ) into the Device memory.
Then I call a function that should randomize the matrices' values (assignRandomWeight), which launches two kernels that holds curand functions.
Finally I try to copy the resulting matrices back to the host memory through a cudaMemcpy call, but at this point I get the error "an illegal memory access was encountered".
I tried to print the values of the Device copy matrices of wih and who, which are d_wih and d_who. They seems to be correct; I left in the code two functions usefull for debugging :
checkCudaError can be called to check the last cudaError_t string message
showValues is useful to print the values of a Device allcated arraay
I extracted a sample of my code that compile and presents the same error, plese help me out
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <time.h>
#include<cuda.h>
#include <curand.h>
#include <curand_kernel.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
struct TNeuralNetwork {
int input_neurons;
int hidden_neurons;
int output_neurons;
float *wih; //first layer of weights (from input layer to hidden layer)
float *who; //second layer of weights (from hidden layer to output layer)
float *wih_old; //for the momentum
float *who_old; //for the momentum
float *erro;
float *errh;
float l; //learning rate
float m; //momentum
float *i; //values into input neurons
float *h; //values into hidden neurons
float *o; //values into output neurons
};
__host__ void checkCudaError(char *str);
__global__ void showValues(float *d_v, int dim);
__global__ void init_rand(unsigned int seed, curandState_t state_wih);
__global__ void generateRandomValues(curandState_t state_wih, float *wih, float *who, int inp, int hid, int out);
void assignRandomWeight(struct TNeuralNetwork *nn, float *d_wih, float *d_who);
void initNeuralNetwork(struct TNeuralNetwork *nn, int bands, int nlabel);
int main(int argc, char **argv) {
struct TNeuralNetwork nn;
//Declare Device variables
float *d_wih;
float *d_who;
unsigned int v;
cudaError_t cudaStatus;
initNeuralNetwork(&nn, 102, 10);
//Allocate Device Memory
v = (nn.input_neurons + 1)*(nn.hidden_neurons);
cudaMalloc((void**)&d_wih, (nn.input_neurons + 1)*(nn.hidden_neurons)*sizeof(float));
checkCudaError("malloc1");
//cudaMalloc((void**)&d_who, (nn.hidden_neurons + 1)*nn.output_neurons * sizeof(float));
//checkCudaError("malloc2");
for (int i = 0; i < (nn.input_neurons + 1); i++){
for (int j = 0; j < nn.hidden_neurons; j++){
nn.wih[i*nn.hidden_neurons + j] = 0;
}
}
for (int i = 0; i < (nn.hidden_neurons + 1); i++){
for (int j = 0; j < nn.output_neurons; j++){
nn.who[i*nn.output_neurons + j] = 0;
}
}
cudaMemcpy(d_wih, nn.wih, (nn.input_neurons + 1)*(nn.hidden_neurons)*sizeof(float), cudaMemcpyHostToDevice);
checkCudaError("memcpy0");
//showValues << <v, 1 >> >(d_wih, v); TEST
//cudaMemcpy(d_who, nn.who, (nn.hidden_neurons + 1)*nn.output_neurons*sizeof(float), cudaMemcpyHostToDevice);
//checkCudaError("memcpy0.1");
assignRandomWeight(&nn, d_wih, d_who);
cudaMemcpy(nn.wih, d_wih, (nn.input_neurons + 1)*(nn.hidden_neurons)*sizeof(float), cudaMemcpyDeviceToHost);
//showValues << <v, 1 >> >(d_wih, v); TEST
checkCudaError("memcpy1");
//cudaMemcpy(nn.who, d_who, (nn.hidden_neurons + 1)*nn.output_neurons*sizeof(float), cudaMemcpyDeviceToHost);
//checkCudaError("memcpy2");
//printf("WIH:\n");
//for (int i = 0; i < (nn.input_neurons + 1); i++){
// for (int j = 0; j < (nn.hidden_neurons); j++){
// printf("%.12f\t", nn.wih[i*(nn.hidden_neurons) + j]);
// }
// printf("\n\n");
//}
//printf("WHO:\n");
//for (int i = 0; i < (nn.hidden_neurons + 1); i++){
// for (int j = 0; j < nn.output_neurons; j++){
// printf("%.12f\t", nn.wih[i*nn.output_neurons + j]);
// }
// printf("\n\n");
//}
cudaFree(d_wih);
cudaFree(d_who);
return 0;
}
__host__ void checkCudaError(char *str){
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess){
printf("Cuda Error at %s: %s \n", str, cudaGetErrorString(err));
exit(-1);
}
}
__global__ void showValues(float *d_v, int dim){
int tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid < dim){
printf("elemento[%d] = %.4f\n", tid, d_v[tid]);
}
}
__global__ void init_rand(unsigned int seed, curandState_t state_wih){
int tid = blockIdx.x*blockDim.x + threadIdx.x;
curand_init(seed, 0, tid, &state_wih);
}
__global__ void generateRandomValues(curandState_t state_wih, float *wih, float *who, int inp, int hid, int out){
int tid = (blockIdx.x)*(blockDim.x) + threadIdx.x;
printf("%.7f", (float)curand(&state_wih + tid));
if (tid <= (inp + 1)*hid){
wih[tid] = (float)curand_uniform(&state_wih + tid);
printf("%.7f", wih[tid]);
}
if (tid <= (hid + 1)*out){
who[tid] = (float)curand_uniform(&state_wih + tid);
printf("%.7f", who[tid]);
}
}
void initNeuralNetwork(struct TNeuralNetwork *nn, int bands, int nlabel) {
nn->input_neurons = bands;
nn->output_neurons = nlabel;
//nn->hidden_neurons = (int)((bands + nlabel)/2.0f);
nn->hidden_neurons = (int)((bands + nlabel)*2.0f / 3.0f);
nn->l = 0.001;
nn->m = 0.2;
nn->wih = (float*)malloc((bands + 1)*(nn->hidden_neurons) * sizeof(float)); //+1 for the bias
nn->who = (float*)malloc((nn->hidden_neurons + 1)*nlabel * sizeof(float));//+1 for the bias
nn->wih_old = (float*)malloc((bands + 1)*(nn->hidden_neurons) * sizeof(float)); //+1 for the bias
nn->who_old = (float*)malloc((nn->hidden_neurons + 1)*nlabel * sizeof(float));//+1 for the bias
nn->i = (float*)malloc(bands * sizeof(float));
nn->h = (float*)malloc(nn->hidden_neurons * sizeof(float));
nn->o = (float*)malloc(nlabel * sizeof(float));
nn->errh = (float*)malloc(nn->hidden_neurons * sizeof(float));
nn->erro = (float*)malloc(nlabel * sizeof(float));
memset(nn->wih_old, 0, (bands + 1)*(nn->hidden_neurons) * sizeof(float));
memset(nn->who_old, 0, (nn->hidden_neurons + 1)*nlabel * sizeof(float));
}
//curand
void assignRandomWeight(struct TNeuralNetwork *nn, float *d_wih, float *d_who) {
cudaError_t cudaStatus;
curandState_t state_wih;
srand(time(NULL));
unsigned int seed = rand();
//Alloco la matrice di curandState_t per la randomizzaione, in uscita dalla funzione non mi servirà più
cudaMalloc((void**)&state_wih, (nn->input_neurons + 1)*(nn->hidden_neurons)* sizeof(curandState_t));
dim3 gridSize(ceil((double)((nn->input_neurons + 1)*(nn->hidden_neurons)) / 32));
dim3 blockSize(32);
init_rand << < gridSize, blockSize >> >(seed, state_wih);
generateRandomValues << < gridSize, blockSize >> >(state_wih, d_wih, d_who, nn->input_neurons, nn->hidden_neurons, nn->output_neurons);
}
"Incorrect Indexing" will produce out-of-bounds memory access within the kernel. The CUDA runtime will destroy your context at the point where the error occurred within the kernel, after which no CUDA operations which rely the the context can be performed. The cudaMemcpycall fails because your context has been destroyed. There is no way to avoid this.
NVIDIA supply a utility called cuda-memcheck with the CUDA toolkit. Use that instead to diagnose what is going wrong with your kernel.
I'v found my error:
I was making a bad use of the "curandState_t" type variable in the assignRandomWight function, I had to use a pointer.
this is the correct version:
void assignRandomWeight(struct TNeuralNetwork *nn, float *d_wih, float *d_who) {
cudaError_t cudaStatus;
curandState_t *state_wih;
srand(time(NULL));
unsigned int seed = rand();
//Alloco la matrice di curandState_t per la randomizzaione, in uscita dalla funzione non mi servirà più
cudaMalloc((void**)&state_wih, (nn->input_neurons + 1)*(nn->hidden_neurons)* sizeof(curandState_t));
dim3 gridSize(ceil((double)((nn->input_neurons + 1)*(nn->hidden_neurons)) / 32));
dim3 blockSize(32);
init_rand << < gridSize, blockSize >> >(seed, state_wih);
generateRandomValues << < gridSize, blockSize >> >(state_wih, d_wih, d_who, nn->input_neurons, nn->hidden_neurons, nn->output_neurons);
}
and the correct version for the two kernels:
__global__ void generateRandomValues( curandState_t *state_wih, float *wih, float *who, int inp, int hid, int out){
int tid = (blockIdx.x)*(blockDim.x) + threadIdx.x;
if (tid<=(inp+1)*hid ){
printf("\ncasual : %f", (float)curand_uniform(&state_wih[tid]));
wih[tid] = (float)curand_uniform(&state_wih[tid]);
}
if (tid<=(hid+1)*out){
who[tid] = (float)curand_uniform(&state_wih[tid]);
}
}
__global__ void init_rand(unsigned int seed, curandState_t *state_wih){
int tid = blockIdx.x*blockDim.x + threadIdx.x;
curand_init(seed, tid, 0, &state_wih[tid]);
}

Cuda program not working

i'm a beginner in cuda programming. I'm trying an own easy code but it's not working and I don't know what else to do.
My code:
#include <mpi.h>
#include <cuda.h>
#include <stdio.h>
#include <sys/wait.h>
// Prototypes
__global__ void helloWorld(char*);
__device__ int getGlobalIdx_2D_2D();
// Host function
int main(int argc, char** argv)
{
unsigned int i, N, gridX, gridY, blockX, blockY;
N = 4096000;
char *str = (char *) malloc(N*sizeof(char));
for(i=0; i < N; i++) str[i]='c';
MPI_Init (&argc, &argv);
char *d_str;
size_t size = (size_t) N*sizeof(char);
cudaMalloc((void**)&d_str, size);
cudaMemcpy(d_str, str, size, cudaMemcpyHostToDevice);
gridX = 100;
gridY = 10;
blockX = blockY = 64;
dim3 dimGrid(gridX, gridY); // 4096 chars per block
dim3 dimBlock(blockX, blockY); // one thread per character, 2D
printf("dimGrid(%d, %d)\t", gridX, gridY);
printf("dimBlock(%d, %d)\t", blockX, blockY);
helloWorld<<< dimGrid, dimBlock >>>(d_str);
cudaMemcpy(str, d_str, size, cudaMemcpyDeviceToHost);
cudaThreadSynchronize();
MPI_Barrier (MPI_COMM_WORLD);
cudaFree(d_str);
printf("\nRes:\n");
for(i = 0; i < N; i++) printf("\t[%u] %c\n", i, str[i]);
MPI_Finalize ();
free(str);
return 0.0;
}
// Device kernel
__global__ void helloWorld(char* str)
{
// determine where in the thread grid we are
int pos = getGlobalIdx_2D_2D();
if (pos % 2 == 0) str[pos] -= 2;
else str[pos] += 8;
}
__device__ int getGlobalIdx_2D_2D()
{
int blockId = blockIdx.x + blockIdx.y * gridDim.x;
int threadId = blockId * (blockDim.x * blockDim.y) +
(threadIdx.y * blockDim.x) + threadIdx.x;
return threadId;
}
My desired output is: jajajajajajaja... x4096000
I've read that '%' operation is not efficient, but I don't think is the problem there.
Thanks!
You are performing absolutely no CUDA error checking, it is really beneficial to do so. Once you enable it you can find that block dimensions 64 x 64 are invalid as it results into 4096 threads within one block, which is not a valid configuration.

maximum supported size for cub library

Does anyone know what is the maximum supported size for cub::scan ? I got core dump for input sizes over 500 million. I wanted to make sure I'm not doing anything wrong...
Here is my code:
#define CUB_STDERR
#include <stdio.h>
#include "cub/util_allocator.cuh"
#include "cub/device/device_scan.cuh"
#include <sys/time.h>
using namespace cub;
bool g_verbose = false; // Whether to display input/output to console
CachingDeviceAllocator g_allocator(true); // Caching allocator for device memory
typedef int mytype;
/**
* Solve inclusive-scan problem
*/
static void solve(mytype *h_in, mytype *h_cpu, int n)
{
mytype inclusive = 0;
for (int i = 0; i < n; ++i) {
inclusive += h_in[i];
h_cpu[i] = inclusive;
}
}
static int compare(mytype *h_cpu, mytype *h_o, int n)
{
for (int i = 0; i < n; i++) {
if (h_cpu[i] != h_o[i]) {
return i + 1;
}
}
return 0;
}
/**
* Main
*/
int main(int argc, char** argv)
{
cudaSetDevice(0);
struct timeval start, end;
int num_items = 1073741824;
const int repetitions = 5;
mytype *h_in, *h_out, *h_cpu;
const int size = num_items * sizeof(mytype);
// Allocate host arrays
h_in = (mytype *)malloc(size);
h_out = (mytype *)malloc(size);
h_cpu = (mytype *)malloc(size);
// Initialize problem and solution
for (int i = 0; i < num_items; i++) {
h_in[i] = i;
h_out[i] = 0;
h_cpu[i] = 0;
}
solve(h_in, h_cpu, num_items);
// Allocate problem device arrays
mytype *d_in = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(mytype) * num_items));
// Initialize device input
CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(mytype) * num_items, cudaMemcpyHostToDevice));
// Allocate device output array
mytype *d_out = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(mytype) * num_items));
// Allocate temporary storage
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
CubDebugExit(DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items));
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
// Run
gettimeofday(&start, NULL);
for (long i = 0; i < repetitions; i++)
DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
cudaThreadSynchronize();
gettimeofday(&end, NULL);
double ctime = end.tv_sec + end.tv_usec / 1000000.0 - start.tv_sec - start.tv_usec / 1000000.0;
cudaMemcpy(h_out, d_out, sizeof(mytype) * num_items, cudaMemcpyDeviceToHost);
int cmp = compare(h_cpu, h_out, num_items);
printf("%d\t", num_items);
if (!cmp)
printf("\t%7.4fs \n", ctime);
printf("\n");
if (h_in) delete[] h_in;
if (h_out) delete[] h_out;
if (h_cpu) delete[] h_cpu;
if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
printf("\n\n");
return 0;
}
The problem is here:
const int size = num_items * sizeof(mytype);
And it can be fixed by changing it to:
const size_t size = num_items * sizeof(mytype);
The value of num_items in the code is over 1 Billion. When we multiply that by sizeof(mytype) we are multiplying it by 4, so the result is over 4 Billion. This value cannot be stored in an int variable. If you try to use it anyway like that, then your subsequent host code will do bad things. This problem (the core dump) actually has nothing to do with CUDA. The code would core dump if you removed all the CUB elements.
When I modify the line of code above, and compile for the correct GPU (e.g. -arch=sm_35 in my case, or -arch=sm_52 for a Titan X GPU), then I get the correct answer (and no seg fault/core dump).
In general, the correct starting point when chasing a seg fault/core dump type error, is to recognize that this error arises from host code and you should attempt to localize the exact line of source code that is generating this error. This can be done trivially/tediously by putting many printf statements in your code, until you identify the line of your code after which you don't see any printf output, or by using a host code debugger, such as gdb on linux.
Also note that this code as written will require slightly more than 12GB of memory on the host, and slightly more than 8GB of memory on the GPU, so it will only run properly in such settings.
For reference, here is the fixed code (based on what OP posted here):
#define CUB_STDERR
#include <stdio.h>
#include "cub/util_allocator.cuh"
#include "cub/device/device_scan.cuh"
#include <sys/time.h>
using namespace cub;
bool g_verbose = false; // Whether to display input/output to console
CachingDeviceAllocator g_allocator(true); // Caching allocator for device memory
typedef int mytype;
/**
* Solve inclusive-scan problem
*/
static void solve(mytype *h_in, mytype *h_cpu, int n)
{
mytype inclusive = 0;
for (int i = 0; i < n; ++i) {
inclusive += h_in[i];
h_cpu[i] = inclusive;
}
}
static int compare(mytype *h_cpu, mytype *h_o, int n)
{
for (int i = 0; i < n; i++) {
if (h_cpu[i] != h_o[i]) {
return i + 1;
}
}
return 0;
}
/**
* Main
*/
int main(int argc, char** argv)
{
cudaSetDevice(0);
struct timeval start, end;
int num_items = 1073741824;
const int repetitions = 5;
mytype *h_in, *h_out, *h_cpu;
const size_t size = num_items * sizeof(mytype);
// Allocate host arrays
h_in = (mytype *)malloc(size);
h_out = (mytype *)malloc(size);
h_cpu = (mytype *)malloc(size);
// Initialize problem and solution
for (int i = 0; i < num_items; i++) {
h_in[i] = i;
h_out[i] = 0;
h_cpu[i] = 0;
}
solve(h_in, h_cpu, num_items);
// Allocate problem device arrays
mytype *d_in = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(mytype) * num_items));
// Initialize device input
CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(mytype) * num_items, cudaMemcpyHostToDevice));
// Allocate device output array
mytype *d_out = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(mytype) * num_items));
// Allocate temporary storage
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
CubDebugExit(DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items));
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
// Run
gettimeofday(&start, NULL);
for (long i = 0; i < repetitions; i++)
DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
cudaThreadSynchronize();
gettimeofday(&end, NULL);
double ctime = end.tv_sec + end.tv_usec / 1000000.0 - start.tv_sec - start.tv_usec / 1000000.0;
cudaMemcpy(h_out, d_out, sizeof(mytype) * num_items, cudaMemcpyDeviceToHost);
int cmp = compare(h_cpu, h_out, num_items);
printf("%d\t", num_items);
if (!cmp)
printf("\t%7.4fs \n", ctime);
printf("\n");
if (h_in) delete[] h_in;
if (h_out) delete[] h_out;
if (h_cpu) delete[] h_cpu;
if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
printf("\n\n");
return 0;
}

Cuda call won't allocate more than 8 threads per block, regardless of specification

I am creating a parallel version of the Sieve of Eratosthenes in c++. The problem is my kernel call (reduce0) seems to only ever assign 8 threads per block instead of the 256 I specify. Since even the first CUDA version allows 512 threads per block, there must be some error in my code for it. Any help would be appreciated.
#include <iostream>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <cutil.h>
//#include <sieve_kernel.cu>
using namespace std;
////////////////////////////////////////////////////
int psum(int arg[], double n);
int call_kernel(int primes[], int n);
int findsmallest(int arg[], int f, double n);
int sieve(int n);
__global__ void reduce0(int *g_idata, int *g_odata);
////////////////////////////////////////////////////
int main(){
int n = pow((double) 2, 8);
int total = sieve(n);
cout << "# primes" << endl << total << endl;
return 0;
}
///////////////////////////////////////////////////
__global__ void reduce0(int *g_idata, int *g_odata) {
extern __shared__ int sdata[];
// each thread loads one element from global to shared mem
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
sdata[tid] = g_idata[i];
__syncthreads();
// do reduction in shared mem
for (int s = 1; s < blockDim.x; s *= 2) { // step = s x 2
if (tid % (s*2) == 0) { // only threadIDs divisible by the step participate
sdata[tid] += sdata[tid + s];
}
__syncthreads();
}
// write result for this block to global mem
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
/////////////////////////////////////////////////////
int call_kernel(int *primes, int n){
// Allocate and copy device arrays
int *g_idevice;
int *g_odevice;
int size = n * sizeof(int);
cudaMalloc(&g_idevice, size);
cudaMemcpy(g_idevice, primes, size, cudaMemcpyHostToDevice);
cudaMalloc(&g_odevice, size);
// Specify grid/block dimenstions and invoke the kernel
dim3 dimGrid(1,1);
dim3 dimBlock(256,1);
reduce0<<<dimGrid, dimBlock>>>(g_idevice, g_odevice);
// Copy device data back to primes
cudaMemcpy(primes, g_odevice, size, cudaMemcpyDeviceToHost);
//for (int i = 0; i < n; i++) {
// cout << i << " " << primes[i] << endl;
//}
int total = primes[0];
cudaFree(g_idevice);
cudaFree(g_odevice);
return total;
}
/////////////////////////////////////////////////////////////////////
int findsmallest(int arg[], int f, double n){
int i = f;
while(arg[i]!= 1 && i < n) {
i++;
}
return i;
}
//////////////////////////////////////////////////////////////////////
int psum(int arg[], double n){
int total = 0;
int i = 2;
while(i < n){
if(arg[i] == 1){
total = total + 1;
}
i++;
}
return total;
}
/////////////////////////////////////////////////////////////////////////
int sieve(int n){
int* primes = NULL;
int mult = 0;
int k = 2;
int i; int total;
//primes = new int[n];
primes = new int[256];
for(i = 0; i < n; i++){
primes[i] = 1;
}
primes[0] = primes[1] = 0;
while (k * k < n){
mult = k * k;
while (mult < n) {
primes[mult] = 0;
mult = mult + k;
}
k = findsmallest(primes,k+1, n);
}
total = call_kernel(primes, n);
//delete [] primes;
//primes = NULL;
return total;
}
Your kernel is using dynamically allocated shared memory, but the kernel launch does not include any allocation, so the result is the kernel will be aborting because of illegal memory operations on that shared memory buffer. You should find it works if you modify this part of call_kernel as follows:
// Specify grid/block dimenstions and invoke the kernel
dim3 dimGrid(1,1);
dim3 dimBlock(256,1);
size_t shmsize = size_t(dimBlock.x * dimBlock.y * dimBlock.z) * sizeof(int);
reduce0<<<dimGrid, dimBlock, shmsize>>>(g_idevice, g_odevice);
If you had of included some basic error checking around the function call, perhaps like this:
reduce0<<<dimGrid, dimBlock>>>(g_idevice, g_odevice);
if (cudaPeekAtLastError() != cudaSuccess) {
cout << "kernel launch error: " << cudaGetErrorString(cudaGetLastError()) << endl;
}
// Copy device data back to primes
cudaError_t err = cudaMemcpy(primes, g_odevice, size, cudaMemcpyDeviceToHost);
if (err != cudaSuccess) {
cout << "CUDA error: " << cudaGetErrorString(err) << endl;
}
it would have been immediately obvious that the kernel launch or execution was failing with an error.