How to keep track of executed CUDA blocks? - cuda

Just for the sake of testing my understanding of things, I decided to modify the vector addition found in the CUDA samples so that the kernel quits after a specific time and is then re-launched to complete. The way I achieve the "timeout" is by having a pinned variable that the host sets to 1 after some time. Within the kernel, a check of this variable is performed to determine whether execution should continue. If the thread continues its execution it is marked as complete. In order to test that each thread executes just once, I've modified the addition to C[i] = C[i] + B[i] This all works as expected; the device code looks as follows:
/* Function
* Internal device function used for getting the current thread's global ID
* regardless of the block/grid configuration. It assumes that the
* grid and block are 3 dimensional.
*
* #return: The thread's global ID
*/
static __device__ int get_global_idx()
{
int blockId = blockIdx.x
+ blockIdx.y * gridDim.x
+ gridDim.x * gridDim.y * blockIdx.z;
int threadId = blockId * (blockDim.x * blockDim.y * blockDim.z)
+ (threadIdx.z * (blockDim.x * blockDim.y))
+ (threadIdx.y * blockDim.x)
+ threadIdx.x;
return threadId;
}
/* Function
* Device function that determines if the current thread should continue execution.
* A check should be used on the return value. If the timeout has not been set
* and the thread has not previously executed the index at the thread's ID in the
* thread_ids array is set to 1 to indicate it was allowed to proceed.
*
* #param thread_ids: A pointer to the array with a size that matches the max number
* of threads that will be spawned
*
* #param time_out: Memory mapped variable used by the host to signal the kernel when
* execution should suspend
*
* #return: A boolean value indicating whether the current thread should continue or not
*/
__device__ bool continue(unsigned int *thread_ids, volatile unsigned int *time_out)
{
if(*time_out == 1){
return false;
}
int tid = get_global_idx();
if(thread_ids[tid] == 1)
{
return false;
}
thread_ids[tid] = 1;
return true;
}
__global__ void
vectorAdd(const float *A, const float *B, float *C, long numElements, unsigned int *thread_ids, volatile unsigned int *timeout)
{
if(!continue(thread_ids, timeout))
{
return;
}
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
/* C[i] = A[i] + B[i]; */
C[i] = C[i] + B[i]; //Modifed from above
}
}
I considered how this may fail if __syncthreads() was used. So I decided to do block level suspension. Based on my understanding, I thought this would be simple. Keep track of if a block has started, and count how many threads have executed for that block and only suspend when all threads of an already started block have completed and deny any threads who's block has not started. So I used a struct and modified the continue function as follows:
typedef struct block_info_t{
int started; /* Initialized to zero before any kernel launch */
unsigned int thread_count;
}block_info;
__device__ bool continue(unsigned int *thread_ids, volatile unsigned int *time_out, block_info *b_info)
{
int bid = blockIdx.x + gridDim.x * (blockIdx.y + gridDim.z * blockIdx.z);
unsigned int bsize = blockDim.x * blockDim.y * blockDim.z;
if(*time_out == 1 && b_info[bid].started == 0)
{
return false;
}
if(b_info[bid].thread_count == bsize)
{
return false;
}
b_info[bid].started = 1;
atomicInc(&b_info[bid].thread_count, bsize);
return true;
}
This does not work, when I execute verification on the host (h_B[i] - h_C[i]) I don't get a consistent zero result. Which means that some threads somehow managed to execute multiple times. Any ideas how/why this is happening with the latter attempt? Thanks.
I don't care about performance at this point; just trying to understand what is really happening.
EDIT
Here is the complete code, compile with nvcc file_name.cu and execute program_name <vector-length>.
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef struct block_info_t{
int started; /* Initialized to zero before any kernel launch */
unsigned int thread_count;
}block_info;
__device__ bool continue_execution(volatile unsigned int *time_out, block_info *b_info)
{
int bid = blockIdx.x + gridDim.x * (blockIdx.y + gridDim.z * blockIdx.z);
unsigned int bsize = blockDim.x * blockDim.y * blockDim.z;
if(*time_out == 1 && b_info[bid].started == 0)
{
return false;
}
if(b_info[bid].thread_count == bsize)
{
return false;
}
b_info[bid].started = 1;
atomicInc(&b_info[bid].thread_count, bsize);
return true;
}
__global__ void
vectorAdd(const float *A, const float *B, float *C, long numElements, volatile unsigned int *time_out, block_info *b_info)
{
if(!continue_execution(time_out, b_info))
{
return;
}
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
//C[i] = A[i] + B[i];
C[i] = C[i] + B[i]; //Modified from above
}
}
void computation_complete(int *complete, int block_amt, block_info *h_block_info)
{
size_t i;
for(i = 0; i < block_amt; i++)
{
if(h_block_info[i].started == 1)
{
continue;
}
break;
}
*complete = (i == block_amt) ? 1 : 0;
}
int main(int argc, char *argv[])
{
if(argc != 2)
{
fprintf(stderr, "usage: <program-name> <vector-length>\n");
exit(EXIT_FAILURE);
}
// Print the vector length to be used, and compute its size
long numElements = strtol(argv[1], NULL, 10);
size_t size = numElements * sizeof(float);
printf("[Vector addition of %d elements]\n", numElements);
float *h_A = (float *)malloc(size);
float *h_B = (float *)malloc(size);
float *h_C = (float *)malloc(size);
// Initialize the host input vectors
for (int i = 0; i < numElements; ++i)
{
h_A[i] = rand()/(float)RAND_MAX;
h_B[i] = rand()/(float)RAND_MAX;
h_C[i] = 0.0;
}
float *d_A = NULL;
cudaMalloc((void **)&d_A, size);
float *d_B = NULL;
cudaMalloc((void **)&d_B, size);
float *d_C = NULL;
cudaMalloc((void **)&d_C, size);
cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_C, h_C, size, cudaMemcpyHostToDevice);
int threadsPerBlock = 256;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
size_t block_info_bytes = blocksPerGrid * sizeof(struct block_info_t);
block_info *h_block_info = (struct block_info_t *)malloc(block_info_bytes);
for(int i = 0; i < blocksPerGrid; i++)
{
h_block_info[i].started = 0;
h_block_info[i].thread_count = 0;
}
block_info *d_block_info = NULL;
cudaMalloc(&d_block_info, block_info_bytes);
cudaMemcpy(d_block_info, h_block_info, block_info_bytes, cudaMemcpyHostToDevice);
volatile unsigned int *timeout = NULL;
cudaHostAlloc((void **)&timeout, sizeof(volatile unsigned int), cudaHostAllocMapped);
*timeout = 0;
double quantum = 0.0001 * 1000000.0;
double initial_quantum = quantum;
int complete = 0;
/* Here the kernel launch is looped until all blocks are complete */
while(complete == 0)
{
vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements, timeout, d_block_info);
usleep(quantum);
*timeout = 1;
cudaDeviceSynchronize();
cudaMemcpy(h_block_info, d_block_info, block_info_bytes, cudaMemcpyDeviceToHost);
computation_complete(&complete, blocksPerGrid, h_block_info);
if(complete == 0)
{
quantum = quantum + initial_quantum;
*timeout = 0;
}
}
cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
// Verify that the result vector is correct
for (int i = 0; i < numElements; ++i)
{
if (fabs(h_B[i] - h_C[i]) > 1e-5)
{
fprintf(stderr, "Result verification failed at element %d!\n", i);
exit(EXIT_FAILURE);
}
}
printf("Test PASSED\n");
// Free device global memory
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
free(h_A);
free(h_B);
free(h_C);
cudaDeviceReset();
return 0;
}

You have a race condition in your continue_execution routine. Consider the following scenario:
warp0 of a threadblock enters the continue_execution routine. At the moment that it checks the variables *time_out and b_info[bid].started it witnesses those to be 0 and 0 respectively. So it proceeds to the next if test.
warp1 of the same threadblock enters the continue_execution routine (let's say slightly later), and it witnesses the variables to be 1 and 0 respectively. So it returns false and causes the warp1 threads to exit.
warp0 continues on and eventually sets b_info[bid].started to 1, and then updates the thread_count. It then returns true and proceeds with the vector add.
I could continue with this, but I think if you consider the above 3 items carefully you will realize it is a case you did not account for. Your implicit expectation is that every thread would read a coherent (i.e. the same across a given threadblock) value for *time_out. But this is not guaranteed by your code, and if it fails to do so, then we end up with some threadblocks where some threads have completed their work and some have not.
So how could we fix this? The above description should point the way. One possible approach is to guarantee that for any given threadblock, that every thread gets the same value for *time_out whether it be 1 or 0. One possible solution would be to make the following changes to the beginning of your vectorAdd kernel:
__shared__ volatile unsigned int my_time_out;
if (!threadIdx.x) my_time_out = *time_out;
__syncthreads();
if(!continue_execution(&my_time_out, b_info))
with those changes, we ensure that every thread in a block gets a coherent view of the time out variable, and according to my testing, the problem is resolved:
$ cat t100.cu
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef struct block_info_t{
int started; /* Initialized to zero before any kernel launch */
unsigned int thread_count;
}block_info;
__device__ bool continue_execution(volatile unsigned int *time_out, block_info *b_info)
{
int bid = blockIdx.x + gridDim.x * (blockIdx.y + gridDim.z * blockIdx.z);
unsigned int bsize = blockDim.x * blockDim.y * blockDim.z;
if(*time_out == 1 && b_info[bid].started == 0)
{
return false;
}
if(b_info[bid].thread_count == bsize)
{
return false;
}
b_info[bid].started = 1;
atomicInc(&b_info[bid].thread_count, bsize);
return true;
}
__global__ void
vectorAdd(const float *A, const float *B, float *C, long numElements, volatile unsigned int *time_out, block_info *b_info)
{
#ifdef USE_FIX
__shared__ volatile unsigned int my_time_out;
if (!threadIdx.x) my_time_out = *time_out;
__syncthreads();
if(!continue_execution(&my_time_out, b_info))
#else
if(!continue_execution(time_out, b_info))
#endif
{
return;
}
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
//C[i] = A[i] + B[i];
C[i] = C[i] + B[i]; //Modified from above
}
}
void computation_complete(int *complete, int block_amt, block_info *h_block_info)
{
size_t i;
for(i = 0; i < block_amt; i++)
{
if(h_block_info[i].started == 1)
{
continue;
}
break;
}
*complete = (i == block_amt) ? 1 : 0;
}
int main(int argc, char *argv[])
{
if(argc != 2)
{
fprintf(stderr, "usage: <program-name> <vector-length>\n");
exit(EXIT_FAILURE);
}
// Print the vector length to be used, and compute its size
long numElements = strtol(argv[1], NULL, 10);
size_t size = numElements * sizeof(float);
printf("[Vector addition of %ld elements]\n", numElements);
float *h_A = (float *)malloc(size);
float *h_B = (float *)malloc(size);
float *h_C = (float *)malloc(size);
// Initialize the host input vectors
for (int i = 0; i < numElements; ++i)
{
h_A[i] = rand()/(float)RAND_MAX;
h_B[i] = rand()/(float)RAND_MAX;
h_C[i] = 0.0;
}
float *d_A = NULL;
cudaMalloc((void **)&d_A, size);
float *d_B = NULL;
cudaMalloc((void **)&d_B, size);
float *d_C = NULL;
cudaMalloc((void **)&d_C, size);
cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_C, h_C, size, cudaMemcpyHostToDevice);
int threadsPerBlock = 256;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
size_t block_info_bytes = blocksPerGrid * sizeof(struct block_info_t);
block_info *h_block_info = (struct block_info_t *)malloc(block_info_bytes);
for(int i = 0; i < blocksPerGrid; i++)
{
h_block_info[i].started = 0;
h_block_info[i].thread_count = 0;
}
block_info *d_block_info = NULL;
cudaMalloc(&d_block_info, block_info_bytes);
cudaMemcpy(d_block_info, h_block_info, block_info_bytes, cudaMemcpyHostToDevice);
volatile unsigned int *timeout = NULL;
cudaHostAlloc((void **)&timeout, sizeof(volatile unsigned int), cudaHostAllocMapped);
*timeout = 0;
double quantum = 0.0001 * 1000000.0;
double initial_quantum = quantum;
int complete = 0;
/* Here the kernel launch is looped until all blocks are complete */
while(complete == 0)
{
vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements, timeout, d_block_info);
usleep(quantum);
*timeout = 1;
cudaDeviceSynchronize();
cudaMemcpy(h_block_info, d_block_info, block_info_bytes, cudaMemcpyDeviceToHost);
computation_complete(&complete, blocksPerGrid, h_block_info);
if(complete == 0)
{
quantum = quantum + initial_quantum;
*timeout = 0;
}
}
cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
// Verify that the result vector is correct
for (int i = 0; i < numElements; ++i)
{
if (fabs(h_B[i] - h_C[i]) > 1e-5)
{
fprintf(stderr, "Result verification failed at element %d!\n", i);
exit(EXIT_FAILURE);
}
}
printf("Test PASSED\n");
// Free device global memory
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
free(h_A);
free(h_B);
free(h_C);
cudaDeviceReset();
return 0;
}
$ nvcc -arch=sm_61 -o t100 t100.cu
$ ./t100 327678
[Vector addition of 327678 elements]
Result verification failed at element 0!
$ nvcc -arch=sm_61 -o t100 t100.cu -DUSE_FIX
$ ./t100 327678
[Vector addition of 327678 elements]
Test PASSED
$ ./t100 327678
[Vector addition of 327678 elements]
Test PASSED
$ ./t100 327678
[Vector addition of 327678 elements]
Test PASSED
$
One other change I made to your code was in this line:
printf("[Vector addition of %d elements]\n", numElements);
this has no bearing on the problem, but your format specifier does not match your variable type. Fix by changing to %ld.

Related

cuda copy data which dynamic malloc in kernel from device memory

I met a problem about using cudaMemcpy with cudaMemcpyDeviceToHost.
There is a struct which have a pointer int* a, It will malloc in the kernel function.
And then I need copy this int* a to host memory.
My question is: I didn't know how it can not work by using cudaMemcpy.
There my codes:
#include <cuda_runtime.h>
#include <stdio.h>
typedef struct { int n, m; int *a; } myst;
__global__ void xthread(myst *st)
{
unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
myst *mst = &st[idx];
mst->n = idx;
mst->m = idx+1;
mst->a = (int *)malloc((mst->m)*sizeof(int));
mst->a[0] = idx;
}
int main(int argc,char **argv)
{
dim3 dimGrid(1);
dim3 dimBlock(2);
myst *mst = NULL;
myst *hst = (myst *)malloc(2 * sizeof(myst));
cudaMalloc(&mst, 2 * sizeof(myst));
xthread<<<dimGrid, dimBlock>>>(mst);
cudaDeviceSynchronize();
cudaMemcpy(&hst[0],&mst[0],sizeof(myst),cudaMemcpyDeviceToHost);
cudaMemcpy(&hst[1],&mst[1],sizeof(myst),cudaMemcpyDeviceToHost);
int *pInt1 = (int *)malloc((hst[0].m)*sizeof(int)) ;
int *pInt2 = (int *)malloc((hst[1].m)*sizeof(int)) ;
cudaMemcpy(pInt1, hst[0].a, (hst[0].m)*sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(pInt2, hst[1].a, (hst[1].m)*sizeof(int), cudaMemcpyDeviceToHost);
printf("%d\t%d\t%d\n",hst[0].n,hst[0].m, pInt1[0]);
printf("%d\t%d\t%d\n",hst[1].n,hst[1].m, pInt2[0]);
free(pInt1);
free(pInt2);
return 0;
}
The codes will go warning about "Cuda API error detected: cudaMemcpy returned (0xb)"
I saw a similar question : copy data which is allocated in device from device to host
But it seem that can not solve my problem.
Thx.
Alright, I work it out with a stupid way (-.-!!).
While return form the kernel function, I count how many space I have to malloc in Host and Device, and cudaMalloc again a big space . Next, in other kernel function named ythread, copy the data which in the Heap to the big space.
typedef struct { int n, m; int *a; } myst;
__global__ void xthread(myst *st) {
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
myst *mst = &st[idx];
mst->n = idx;
mst->m = idx + 1;
mst->a = (int *) malloc((mst->m) * sizeof(int));
for (int i = 0; i < mst->m; i++) {
mst->a[i] = idx + 900 + i * 10;
}
}
__global__ void ythread(myst *st, int *total_a) {
unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
myst *mst = &st[idx];
int offset=0;
for(int i=0; i<idx; i++) {
offset += st[i].m;
}
for(int i=0; i<mst->m; i++) {
total_a[offset+i] = mst->a[i];
}
}
int main(int argc,char **argv) {
dim3 dimGrid(1);
dim3 dimBlock(2);
myst *mst = NULL;
cudaMalloc((void**)&mst, dimBlock.x * sizeof(myst));
xthread<<<dimGrid, dimBlock>>>(mst);
cudaDeviceSynchronize();
myst *hst = (myst *)malloc(dimBlock.x * sizeof(myst));
cudaMemcpy(hst, mst, dimBlock.x*sizeof(myst),cudaMemcpyDeviceToHost);
int t_size = 0;
for(int i=0; i<dimBlock.x; i++) {
t_size += hst[i].m;
}
printf("t_size:%d\n", t_size);
int * t_a_h = (int *)malloc(t_size*sizeof(int));
int * t_a_d = NULL;
cudaMalloc((void**)&t_a_d, t_size*sizeof(int));
ythread<<<dimGrid, dimBlock>>>(mst, t_a_d);
cudaDeviceSynchronize();
cudaMemcpy(t_a_h, t_a_d, t_size*sizeof(int),cudaMemcpyDeviceToHost);
for(int i=0; i<t_size; i++) {
printf("t_a_h[%d]:%d\n", i, t_a_h[i]);
}
free(t_a_h);
cudaFree(mst);
cudaFree(t_a_d);
return 0;
}
Emmmmmm, it work, but I think there is a better way to solve this problem.

Can't find out why cudaMemcpy fails using curand library functions

I'm trying to initialize in a random manner the weights ( stored as floats ) of a neural network using CURAND functions.
I first initialize the neural netwotk with some values and after that I attempt to copy the two matrices in the nn struct ( nn stands for neural network ), that should store the weight values ( nn.wih, and nn.who ) into the Device memory.
Then I call a function that should randomize the matrices' values (assignRandomWeight), which launches two kernels that holds curand functions.
Finally I try to copy the resulting matrices back to the host memory through a cudaMemcpy call, but at this point I get the error "an illegal memory access was encountered".
I tried to print the values of the Device copy matrices of wih and who, which are d_wih and d_who. They seems to be correct; I left in the code two functions usefull for debugging :
checkCudaError can be called to check the last cudaError_t string message
showValues is useful to print the values of a Device allcated arraay
I extracted a sample of my code that compile and presents the same error, plese help me out
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <time.h>
#include<cuda.h>
#include <curand.h>
#include <curand_kernel.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
struct TNeuralNetwork {
int input_neurons;
int hidden_neurons;
int output_neurons;
float *wih; //first layer of weights (from input layer to hidden layer)
float *who; //second layer of weights (from hidden layer to output layer)
float *wih_old; //for the momentum
float *who_old; //for the momentum
float *erro;
float *errh;
float l; //learning rate
float m; //momentum
float *i; //values into input neurons
float *h; //values into hidden neurons
float *o; //values into output neurons
};
__host__ void checkCudaError(char *str);
__global__ void showValues(float *d_v, int dim);
__global__ void init_rand(unsigned int seed, curandState_t state_wih);
__global__ void generateRandomValues(curandState_t state_wih, float *wih, float *who, int inp, int hid, int out);
void assignRandomWeight(struct TNeuralNetwork *nn, float *d_wih, float *d_who);
void initNeuralNetwork(struct TNeuralNetwork *nn, int bands, int nlabel);
int main(int argc, char **argv) {
struct TNeuralNetwork nn;
//Declare Device variables
float *d_wih;
float *d_who;
unsigned int v;
cudaError_t cudaStatus;
initNeuralNetwork(&nn, 102, 10);
//Allocate Device Memory
v = (nn.input_neurons + 1)*(nn.hidden_neurons);
cudaMalloc((void**)&d_wih, (nn.input_neurons + 1)*(nn.hidden_neurons)*sizeof(float));
checkCudaError("malloc1");
//cudaMalloc((void**)&d_who, (nn.hidden_neurons + 1)*nn.output_neurons * sizeof(float));
//checkCudaError("malloc2");
for (int i = 0; i < (nn.input_neurons + 1); i++){
for (int j = 0; j < nn.hidden_neurons; j++){
nn.wih[i*nn.hidden_neurons + j] = 0;
}
}
for (int i = 0; i < (nn.hidden_neurons + 1); i++){
for (int j = 0; j < nn.output_neurons; j++){
nn.who[i*nn.output_neurons + j] = 0;
}
}
cudaMemcpy(d_wih, nn.wih, (nn.input_neurons + 1)*(nn.hidden_neurons)*sizeof(float), cudaMemcpyHostToDevice);
checkCudaError("memcpy0");
//showValues << <v, 1 >> >(d_wih, v); TEST
//cudaMemcpy(d_who, nn.who, (nn.hidden_neurons + 1)*nn.output_neurons*sizeof(float), cudaMemcpyHostToDevice);
//checkCudaError("memcpy0.1");
assignRandomWeight(&nn, d_wih, d_who);
cudaMemcpy(nn.wih, d_wih, (nn.input_neurons + 1)*(nn.hidden_neurons)*sizeof(float), cudaMemcpyDeviceToHost);
//showValues << <v, 1 >> >(d_wih, v); TEST
checkCudaError("memcpy1");
//cudaMemcpy(nn.who, d_who, (nn.hidden_neurons + 1)*nn.output_neurons*sizeof(float), cudaMemcpyDeviceToHost);
//checkCudaError("memcpy2");
//printf("WIH:\n");
//for (int i = 0; i < (nn.input_neurons + 1); i++){
// for (int j = 0; j < (nn.hidden_neurons); j++){
// printf("%.12f\t", nn.wih[i*(nn.hidden_neurons) + j]);
// }
// printf("\n\n");
//}
//printf("WHO:\n");
//for (int i = 0; i < (nn.hidden_neurons + 1); i++){
// for (int j = 0; j < nn.output_neurons; j++){
// printf("%.12f\t", nn.wih[i*nn.output_neurons + j]);
// }
// printf("\n\n");
//}
cudaFree(d_wih);
cudaFree(d_who);
return 0;
}
__host__ void checkCudaError(char *str){
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess){
printf("Cuda Error at %s: %s \n", str, cudaGetErrorString(err));
exit(-1);
}
}
__global__ void showValues(float *d_v, int dim){
int tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid < dim){
printf("elemento[%d] = %.4f\n", tid, d_v[tid]);
}
}
__global__ void init_rand(unsigned int seed, curandState_t state_wih){
int tid = blockIdx.x*blockDim.x + threadIdx.x;
curand_init(seed, 0, tid, &state_wih);
}
__global__ void generateRandomValues(curandState_t state_wih, float *wih, float *who, int inp, int hid, int out){
int tid = (blockIdx.x)*(blockDim.x) + threadIdx.x;
printf("%.7f", (float)curand(&state_wih + tid));
if (tid <= (inp + 1)*hid){
wih[tid] = (float)curand_uniform(&state_wih + tid);
printf("%.7f", wih[tid]);
}
if (tid <= (hid + 1)*out){
who[tid] = (float)curand_uniform(&state_wih + tid);
printf("%.7f", who[tid]);
}
}
void initNeuralNetwork(struct TNeuralNetwork *nn, int bands, int nlabel) {
nn->input_neurons = bands;
nn->output_neurons = nlabel;
//nn->hidden_neurons = (int)((bands + nlabel)/2.0f);
nn->hidden_neurons = (int)((bands + nlabel)*2.0f / 3.0f);
nn->l = 0.001;
nn->m = 0.2;
nn->wih = (float*)malloc((bands + 1)*(nn->hidden_neurons) * sizeof(float)); //+1 for the bias
nn->who = (float*)malloc((nn->hidden_neurons + 1)*nlabel * sizeof(float));//+1 for the bias
nn->wih_old = (float*)malloc((bands + 1)*(nn->hidden_neurons) * sizeof(float)); //+1 for the bias
nn->who_old = (float*)malloc((nn->hidden_neurons + 1)*nlabel * sizeof(float));//+1 for the bias
nn->i = (float*)malloc(bands * sizeof(float));
nn->h = (float*)malloc(nn->hidden_neurons * sizeof(float));
nn->o = (float*)malloc(nlabel * sizeof(float));
nn->errh = (float*)malloc(nn->hidden_neurons * sizeof(float));
nn->erro = (float*)malloc(nlabel * sizeof(float));
memset(nn->wih_old, 0, (bands + 1)*(nn->hidden_neurons) * sizeof(float));
memset(nn->who_old, 0, (nn->hidden_neurons + 1)*nlabel * sizeof(float));
}
//curand
void assignRandomWeight(struct TNeuralNetwork *nn, float *d_wih, float *d_who) {
cudaError_t cudaStatus;
curandState_t state_wih;
srand(time(NULL));
unsigned int seed = rand();
//Alloco la matrice di curandState_t per la randomizzaione, in uscita dalla funzione non mi servirà più
cudaMalloc((void**)&state_wih, (nn->input_neurons + 1)*(nn->hidden_neurons)* sizeof(curandState_t));
dim3 gridSize(ceil((double)((nn->input_neurons + 1)*(nn->hidden_neurons)) / 32));
dim3 blockSize(32);
init_rand << < gridSize, blockSize >> >(seed, state_wih);
generateRandomValues << < gridSize, blockSize >> >(state_wih, d_wih, d_who, nn->input_neurons, nn->hidden_neurons, nn->output_neurons);
}
"Incorrect Indexing" will produce out-of-bounds memory access within the kernel. The CUDA runtime will destroy your context at the point where the error occurred within the kernel, after which no CUDA operations which rely the the context can be performed. The cudaMemcpycall fails because your context has been destroyed. There is no way to avoid this.
NVIDIA supply a utility called cuda-memcheck with the CUDA toolkit. Use that instead to diagnose what is going wrong with your kernel.
I'v found my error:
I was making a bad use of the "curandState_t" type variable in the assignRandomWight function, I had to use a pointer.
this is the correct version:
void assignRandomWeight(struct TNeuralNetwork *nn, float *d_wih, float *d_who) {
cudaError_t cudaStatus;
curandState_t *state_wih;
srand(time(NULL));
unsigned int seed = rand();
//Alloco la matrice di curandState_t per la randomizzaione, in uscita dalla funzione non mi servirà più
cudaMalloc((void**)&state_wih, (nn->input_neurons + 1)*(nn->hidden_neurons)* sizeof(curandState_t));
dim3 gridSize(ceil((double)((nn->input_neurons + 1)*(nn->hidden_neurons)) / 32));
dim3 blockSize(32);
init_rand << < gridSize, blockSize >> >(seed, state_wih);
generateRandomValues << < gridSize, blockSize >> >(state_wih, d_wih, d_who, nn->input_neurons, nn->hidden_neurons, nn->output_neurons);
}
and the correct version for the two kernels:
__global__ void generateRandomValues( curandState_t *state_wih, float *wih, float *who, int inp, int hid, int out){
int tid = (blockIdx.x)*(blockDim.x) + threadIdx.x;
if (tid<=(inp+1)*hid ){
printf("\ncasual : %f", (float)curand_uniform(&state_wih[tid]));
wih[tid] = (float)curand_uniform(&state_wih[tid]);
}
if (tid<=(hid+1)*out){
who[tid] = (float)curand_uniform(&state_wih[tid]);
}
}
__global__ void init_rand(unsigned int seed, curandState_t *state_wih){
int tid = blockIdx.x*blockDim.x + threadIdx.x;
curand_init(seed, tid, 0, &state_wih[tid]);
}

Cuda program not working

i'm a beginner in cuda programming. I'm trying an own easy code but it's not working and I don't know what else to do.
My code:
#include <mpi.h>
#include <cuda.h>
#include <stdio.h>
#include <sys/wait.h>
// Prototypes
__global__ void helloWorld(char*);
__device__ int getGlobalIdx_2D_2D();
// Host function
int main(int argc, char** argv)
{
unsigned int i, N, gridX, gridY, blockX, blockY;
N = 4096000;
char *str = (char *) malloc(N*sizeof(char));
for(i=0; i < N; i++) str[i]='c';
MPI_Init (&argc, &argv);
char *d_str;
size_t size = (size_t) N*sizeof(char);
cudaMalloc((void**)&d_str, size);
cudaMemcpy(d_str, str, size, cudaMemcpyHostToDevice);
gridX = 100;
gridY = 10;
blockX = blockY = 64;
dim3 dimGrid(gridX, gridY); // 4096 chars per block
dim3 dimBlock(blockX, blockY); // one thread per character, 2D
printf("dimGrid(%d, %d)\t", gridX, gridY);
printf("dimBlock(%d, %d)\t", blockX, blockY);
helloWorld<<< dimGrid, dimBlock >>>(d_str);
cudaMemcpy(str, d_str, size, cudaMemcpyDeviceToHost);
cudaThreadSynchronize();
MPI_Barrier (MPI_COMM_WORLD);
cudaFree(d_str);
printf("\nRes:\n");
for(i = 0; i < N; i++) printf("\t[%u] %c\n", i, str[i]);
MPI_Finalize ();
free(str);
return 0.0;
}
// Device kernel
__global__ void helloWorld(char* str)
{
// determine where in the thread grid we are
int pos = getGlobalIdx_2D_2D();
if (pos % 2 == 0) str[pos] -= 2;
else str[pos] += 8;
}
__device__ int getGlobalIdx_2D_2D()
{
int blockId = blockIdx.x + blockIdx.y * gridDim.x;
int threadId = blockId * (blockDim.x * blockDim.y) +
(threadIdx.y * blockDim.x) + threadIdx.x;
return threadId;
}
My desired output is: jajajajajajaja... x4096000
I've read that '%' operation is not efficient, but I don't think is the problem there.
Thanks!
You are performing absolutely no CUDA error checking, it is really beneficial to do so. Once you enable it you can find that block dimensions 64 x 64 are invalid as it results into 4096 threads within one block, which is not a valid configuration.

maximum supported size for cub library

Does anyone know what is the maximum supported size for cub::scan ? I got core dump for input sizes over 500 million. I wanted to make sure I'm not doing anything wrong...
Here is my code:
#define CUB_STDERR
#include <stdio.h>
#include "cub/util_allocator.cuh"
#include "cub/device/device_scan.cuh"
#include <sys/time.h>
using namespace cub;
bool g_verbose = false; // Whether to display input/output to console
CachingDeviceAllocator g_allocator(true); // Caching allocator for device memory
typedef int mytype;
/**
* Solve inclusive-scan problem
*/
static void solve(mytype *h_in, mytype *h_cpu, int n)
{
mytype inclusive = 0;
for (int i = 0; i < n; ++i) {
inclusive += h_in[i];
h_cpu[i] = inclusive;
}
}
static int compare(mytype *h_cpu, mytype *h_o, int n)
{
for (int i = 0; i < n; i++) {
if (h_cpu[i] != h_o[i]) {
return i + 1;
}
}
return 0;
}
/**
* Main
*/
int main(int argc, char** argv)
{
cudaSetDevice(0);
struct timeval start, end;
int num_items = 1073741824;
const int repetitions = 5;
mytype *h_in, *h_out, *h_cpu;
const int size = num_items * sizeof(mytype);
// Allocate host arrays
h_in = (mytype *)malloc(size);
h_out = (mytype *)malloc(size);
h_cpu = (mytype *)malloc(size);
// Initialize problem and solution
for (int i = 0; i < num_items; i++) {
h_in[i] = i;
h_out[i] = 0;
h_cpu[i] = 0;
}
solve(h_in, h_cpu, num_items);
// Allocate problem device arrays
mytype *d_in = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(mytype) * num_items));
// Initialize device input
CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(mytype) * num_items, cudaMemcpyHostToDevice));
// Allocate device output array
mytype *d_out = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(mytype) * num_items));
// Allocate temporary storage
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
CubDebugExit(DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items));
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
// Run
gettimeofday(&start, NULL);
for (long i = 0; i < repetitions; i++)
DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
cudaThreadSynchronize();
gettimeofday(&end, NULL);
double ctime = end.tv_sec + end.tv_usec / 1000000.0 - start.tv_sec - start.tv_usec / 1000000.0;
cudaMemcpy(h_out, d_out, sizeof(mytype) * num_items, cudaMemcpyDeviceToHost);
int cmp = compare(h_cpu, h_out, num_items);
printf("%d\t", num_items);
if (!cmp)
printf("\t%7.4fs \n", ctime);
printf("\n");
if (h_in) delete[] h_in;
if (h_out) delete[] h_out;
if (h_cpu) delete[] h_cpu;
if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
printf("\n\n");
return 0;
}
The problem is here:
const int size = num_items * sizeof(mytype);
And it can be fixed by changing it to:
const size_t size = num_items * sizeof(mytype);
The value of num_items in the code is over 1 Billion. When we multiply that by sizeof(mytype) we are multiplying it by 4, so the result is over 4 Billion. This value cannot be stored in an int variable. If you try to use it anyway like that, then your subsequent host code will do bad things. This problem (the core dump) actually has nothing to do with CUDA. The code would core dump if you removed all the CUB elements.
When I modify the line of code above, and compile for the correct GPU (e.g. -arch=sm_35 in my case, or -arch=sm_52 for a Titan X GPU), then I get the correct answer (and no seg fault/core dump).
In general, the correct starting point when chasing a seg fault/core dump type error, is to recognize that this error arises from host code and you should attempt to localize the exact line of source code that is generating this error. This can be done trivially/tediously by putting many printf statements in your code, until you identify the line of your code after which you don't see any printf output, or by using a host code debugger, such as gdb on linux.
Also note that this code as written will require slightly more than 12GB of memory on the host, and slightly more than 8GB of memory on the GPU, so it will only run properly in such settings.
For reference, here is the fixed code (based on what OP posted here):
#define CUB_STDERR
#include <stdio.h>
#include "cub/util_allocator.cuh"
#include "cub/device/device_scan.cuh"
#include <sys/time.h>
using namespace cub;
bool g_verbose = false; // Whether to display input/output to console
CachingDeviceAllocator g_allocator(true); // Caching allocator for device memory
typedef int mytype;
/**
* Solve inclusive-scan problem
*/
static void solve(mytype *h_in, mytype *h_cpu, int n)
{
mytype inclusive = 0;
for (int i = 0; i < n; ++i) {
inclusive += h_in[i];
h_cpu[i] = inclusive;
}
}
static int compare(mytype *h_cpu, mytype *h_o, int n)
{
for (int i = 0; i < n; i++) {
if (h_cpu[i] != h_o[i]) {
return i + 1;
}
}
return 0;
}
/**
* Main
*/
int main(int argc, char** argv)
{
cudaSetDevice(0);
struct timeval start, end;
int num_items = 1073741824;
const int repetitions = 5;
mytype *h_in, *h_out, *h_cpu;
const size_t size = num_items * sizeof(mytype);
// Allocate host arrays
h_in = (mytype *)malloc(size);
h_out = (mytype *)malloc(size);
h_cpu = (mytype *)malloc(size);
// Initialize problem and solution
for (int i = 0; i < num_items; i++) {
h_in[i] = i;
h_out[i] = 0;
h_cpu[i] = 0;
}
solve(h_in, h_cpu, num_items);
// Allocate problem device arrays
mytype *d_in = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(mytype) * num_items));
// Initialize device input
CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(mytype) * num_items, cudaMemcpyHostToDevice));
// Allocate device output array
mytype *d_out = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(mytype) * num_items));
// Allocate temporary storage
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
CubDebugExit(DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items));
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
// Run
gettimeofday(&start, NULL);
for (long i = 0; i < repetitions; i++)
DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
cudaThreadSynchronize();
gettimeofday(&end, NULL);
double ctime = end.tv_sec + end.tv_usec / 1000000.0 - start.tv_sec - start.tv_usec / 1000000.0;
cudaMemcpy(h_out, d_out, sizeof(mytype) * num_items, cudaMemcpyDeviceToHost);
int cmp = compare(h_cpu, h_out, num_items);
printf("%d\t", num_items);
if (!cmp)
printf("\t%7.4fs \n", ctime);
printf("\n");
if (h_in) delete[] h_in;
if (h_out) delete[] h_out;
if (h_cpu) delete[] h_cpu;
if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
printf("\n\n");
return 0;
}

the Kernel delay increase by increasing the blocksPerGrid and threadsPerBlock in CUDA VecAdd example, what does it mean?

when i tested the following example, i found that by increasing the blocksPerGrid and threadsPerBlock the Kernel delay increase
such that if
int threadsPerBlock = 1;
int blocksPerGrid = 1;
blocksPerGrid and threadsPerBlock equal 1 the delay of the kernel = .0072 ms
but when i make the following it the delay become higher = .049 ms
int threadsPerBlock = 1024;
int blocksPerGrid = (N+threadsPerBlock-1) / threadsPerBlock;
where
N = 50000; //the no. of array elements
on the following the complete VecAdd example. you can test it
// Includes
#include <stdio.h>
#include <cutil_inline.h>
#include <shrQATest.h>
// Variables
float* h_A;
float* h_B;
float* h_C;
float* d_A;
float* d_B;
float* d_C;
bool noprompt = false;
// Functions
void CleanupResources(void);
void RandomInit(float*, int);
void ParseArguments(int, char**);
// Device code
__global__ void VecAdd(const float* A, const float* B, float* C, int N)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < N)
C[i] = A[i] + B[i];
}
// Host code
int main(int argc, char** argv)
{
shrQAStart(argc, argv);
cudaEvent_t event1, event2;
cudaEventCreate(&event1);
cudaEventCreate(&event2);
printf("Vector Addition\n");
int N = 50000;
size_t size = N * sizeof(float);
ParseArguments(argc, argv);
// Allocate input vectors h_A and h_B in host memory
h_A = (float*)malloc(size);
if (h_A == 0) CleanupResources();
h_B = (float*)malloc(size);
if (h_B == 0) CleanupResources();
h_C = (float*)malloc(size);
if (h_C == 0) CleanupResources();
// Initialize input vectors
RandomInit(h_A, N);
RandomInit(h_B, N);
// Allocate vectors in device memory
cutilSafeCall( cudaMalloc((void**)&d_A, size) );
cutilSafeCall( cudaMalloc((void**)&d_B, size) );
cutilSafeCall( cudaMalloc((void**)&d_C, size) );
// Copy vectors from host memory to device memory
cutilSafeCall( cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice) );
cutilSafeCall( cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice) );
// Invoke kernel
int threadsPerBlock = 1024;
int blocksPerGrid = (N+threadsPerBlock-1) / threadsPerBlock;
cudaEventRecord(event1, 0);
VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
cudaEventRecord(event2, 0);
cudaEventSynchronize(event1); //optional
cudaEventSynchronize(event2);
float dt_ms;
cudaEventElapsedTime(&dt_ms, event1, event2);
printf("delay_time = %f\n", dt_ms);
cutilCheckMsg("kernel launch failure");
#ifdef _DEBUG
cutilSafeCall( cutilDeviceSynchronize() );
#endif
// Copy result from device memory to host memory
// h_C contains the result in host memory
cutilSafeCall( cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost) );
// Verify result
int i;
for (i = 0; i < N; ++i) {
float sum = h_A[i] + h_B[i];
if (fabs(h_C[i] - sum) > 1e-5)
break;
}
CleanupResources();
shrQAFinishExit(argc, (const char **)argv, (i==N) ? QA_PASSED : QA_FAILED);
}
void CleanupResources(void)
{
// Free device memory
if (d_A)
cudaFree(d_A);
if (d_B)
cudaFree(d_B);
if (d_C)
cudaFree(d_C);
// Free host memory
if (h_A)
free(h_A);
if (h_B)
free(h_B);
if (h_C)
free(h_C);
cutilDeviceReset();
}
// Allocates an array with random float entries.
void RandomInit(float* data, int n)
{
for (int i = 0; i < n; ++i)
data[i] = rand() / (float)RAND_MAX;
}
// Parse program arguments
void ParseArguments(int argc, char** argv)
{
for (int i = 0; i < argc; ++i) {
if (strcmp(argv[i], "--noprompt") == 0 ||
strcmp(argv[i], "-noprompt") == 0)
{
noprompt = true;
break;
}
}
}
can any one explain for me what does it mean?
In case 1, a kernel of size 1 thread is launched and performs 2 reads and 1 write operation. In case 2, a kernel of size 50176 threads are launched and perform 100,000 reads and 50,000 writes operations. Increasing the workload by 50,000 increased execution time by ~7x. The work done by the two launches is significantly different.