Processing Shared Work Queue Using CUDA Atomic Operations and Grid Synchronization

I’m trying to write a kernel whose threads iteratively process items in a work queue. My understanding is that I should be able to do this by using atomic operations to manipulate the work queue (i.e., grab work items from the queue and insert new work items into the queue), and using grid synchronization via cooperative groups to ensure all threads are at the same iteration (I ensure the number of thread blocks doesn’t exceed the device capacity for the kernel). However, sometimes I observe that work items are skipped or processed multiple times during an iteration.
The following code is a working example that shows this. An array of size input_len is created, holding work items 0 to input_len - 1. The processWorkItems kernel processes these items for max_iter iterations. Each work item can put itself and its previous and next work items into the work queue, but the marked array is used to ensure that, during an iteration, each work item is added to the work queue at most once. In the end, the sum of the values in histogram should equal input_len * max_iter, and no value in histogram should be greater than 1. But I occasionally observe that both of these criteria are violated in the output, which implies that the atomic operations and/or the synchronization are not working as I expect. I would appreciate it if someone could point out the flaws in my reasoning and/or implementation. My OS is Ubuntu 18.04, CUDA version is 10.1, and I've run experiments on P100, V100, and RTX 2080 Ti GPUs and observed similar behavior.
The command I use for compiling for RTX 2080 Ti:
nvcc -O3 -o atomicsync atomicsync.cu --gpu-architecture=compute_75 -rdc=true
Some inputs and outputs of runs on RTX 2080 Ti:
./atomicsync 50 1000 1000
Skipped 0.01% of items. 5 extra item processing.
./atomicsync 500 1000 1000
Skipped 0.00% of items. 6 extra item processing.
./atomicsync 5000 1000 1000
Skipped 0.00% of items. 14 extra item processing.
atomicsync.cu:
#include <stdio.h>
#include <cooperative_groups.h>
#define checkCudaErrors(val) check ( (val), #val, __FILE__, __LINE__ )
template< typename T >
void check(T result, char const *const func, const char *const file, int const line)
{
if (result)
{
fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line, static_cast<unsigned int>(result), cudaGetErrorString(result), func);
cudaDeviceReset();
exit(EXIT_FAILURE);
}
}
__device__ inline void addWorkItem(int input_len, int item, int item_adder, int iter, int *queue, int *queue_size, int *marked) {
int already_marked = atomicExch(&marked[item], 1);
if(already_marked == 0) {
int idx = atomicAdd(&queue_size[iter + 1], 1);
queue[(iter + 1) * input_len + idx] = item;
}
}
__global__ void processWorkItems(int input_len, int max_iter, int *histogram, int *queue, int *queue_size, int *marked) {
auto grid = cooperative_groups::this_grid();
const int items_per_block = (input_len + gridDim.x - 1) / gridDim.x;
for(int iter = 0; iter < max_iter; ++iter) {
while(true) {
// Grab work item to process
int idx = atomicSub(&queue_size[iter], 1);
--idx;
if(idx < 0) {
break;
}
int item = queue[iter * input_len + idx];
// Keep track of processed work items
++histogram[iter * input_len + item];
// Add previous, self, and next work items to work queue
if(item > 0) {
addWorkItem(input_len, item - 1, item, iter, queue, queue_size, marked);
}
addWorkItem(input_len, item, item, iter, queue, queue_size, marked);
if(item + 1 < input_len) {
addWorkItem(input_len, item + 1, item, iter, queue, queue_size, marked);
}
}
__threadfence_system();
grid.sync();
// Reset marked array for next iteration
for(int i = 0; i < items_per_block; ++i) {
if(blockIdx.x * items_per_block + i < input_len) {
marked[blockIdx.x * items_per_block + i] = 0;
}
}
__threadfence_system();
grid.sync();
}
}
int main(int argc, char* argv[])
{
int input_len = atoi(argv[1]);
int max_iter = atoi(argv[2]);
int num_blocks = atoi(argv[3]);
// A histogram to keep track of work items that have been processed in each iteration
int histogram_host[input_len * max_iter];
memset(histogram_host, 0, sizeof(int) * input_len * max_iter);
int *histogram_device;
checkCudaErrors(cudaMalloc(&histogram_device, sizeof(int) * input_len * max_iter));
checkCudaErrors(cudaMemcpy(histogram_device, histogram_host, sizeof(int) * input_len * max_iter, cudaMemcpyHostToDevice));
// Size of the work queue for each iteration
int queue_size_host[max_iter + 1];
queue_size_host[0] = input_len;
memset(&queue_size_host[1], 0, sizeof(int) * max_iter);
int *queue_size_device;
checkCudaErrors(cudaMalloc(&queue_size_device, sizeof(int) * (max_iter + 1)));
checkCudaErrors(cudaMemcpy(queue_size_device, queue_size_host, sizeof(int) * (max_iter + 1), cudaMemcpyHostToDevice));
// Work queue
int queue_host[input_len * (max_iter + 1)];
for(int i = 0; i < input_len; ++i) {
queue_host[i] = i;
}
memset(&queue_host[input_len], 0, sizeof(int) * input_len * max_iter);
int *queue_device;
checkCudaErrors(cudaMalloc(&queue_device, sizeof(int) * input_len * (max_iter + 1)));
checkCudaErrors(cudaMemcpy(queue_device, queue_host, sizeof(int) * input_len * (max_iter + 1), cudaMemcpyHostToDevice));
// An array used to keep track of work items already added to the work queue to
// avoid multiple additions of a work item in the same iteration
int marked_host[input_len];
memset(marked_host, 0, sizeof(int) * input_len);
int *marked_device;
checkCudaErrors(cudaMalloc(&marked_device, sizeof(int) * input_len));
checkCudaErrors(cudaMemcpy(marked_device, marked_host, sizeof(int) * input_len, cudaMemcpyHostToDevice));
const dim3 threads(1, 1, 1);
const dim3 blocks(num_blocks, 1, 1);
processWorkItems<<<blocks, threads>>>(input_len, max_iter, histogram_device, queue_device, queue_size_device, marked_device);
checkCudaErrors(cudaDeviceSynchronize());
checkCudaErrors(cudaMemcpy(histogram_host, histogram_device, sizeof(int) * input_len * max_iter, cudaMemcpyDeviceToHost));
int extra = 0;
double deficit = 0;
for(int i = 0; i < input_len; ++i) {
int cnt = 0;
for(int iter = 0; iter < max_iter; ++iter) {
if(histogram_host[iter * input_len + i] > 1) {
++extra;
}
cnt += histogram_host[iter * input_len + i];
}
deficit += max_iter - cnt;
}
printf("Skipped %.2f%% of items. %d extra item processing.\n", deficit / (input_len * max_iter) * 100, extra);
checkCudaErrors(cudaFree(histogram_device));
checkCudaErrors(cudaFree(queue_device));
checkCudaErrors(cudaFree(queue_size_device));
checkCudaErrors(cudaFree(marked_device));
return 0;
}

You may wish to read how to do a cooperative grid kernel launch in the programming guide or study any of the CUDA sample codes (e.g. reductionMultiBlockCG, and there are others) that use a grid sync.
You're doing it incorrectly. You cannot launch a cooperative grid with ordinary <<<...>>> launch syntax. Because of that, there is no reason to assume that the grid.sync() in your kernel is working correctly.
It's easy to see the grid sync is not working in your code by running it under cuda-memcheck. When you do that the results will get drastically worse.
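The key change is to launch through cudaLaunchCooperativeKernel with an array of pointers to the kernel arguments, roughly like this (a sketch of just the launch call; dimGrid/dimBlock must respect the cooperative-launch occupancy limits, and the complete modified code follows):
void *kernelArgs[] = { &input_len, &max_iter, &histogram_device, &queue_device, &queue_size_device, &marked_device };
// replaces the ordinary <<<...>>> launch; all blocks must be able to be co-resident
checkCudaErrors(cudaLaunchCooperativeKernel((void*)processWorkItems, dimGrid, dimBlock, kernelArgs));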
When I modify your code to do a proper cooperative launch, I have no issues on Tesla V100:
$ cat t1811.cu
#include <stdio.h>
#include <cooperative_groups.h>
#define checkCudaErrors(val) check ( (val), #val, __FILE__, __LINE__ )
template< typename T >
void check(T result, char const *const func, const char *const file, int const line)
{
if (result)
{
fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line, static_cast<unsigned int>(result), cudaGetErrorString(result), func);
cudaDeviceReset();
exit(EXIT_FAILURE);
}
}
__device__ inline void addWorkItem(int input_len, int item, int item_adder, int iter, int *queue, int *queue_size, int *marked) {
int already_marked = atomicExch(&marked[item], 1);
if(already_marked == 0) {
int idx = atomicAdd(&queue_size[iter + 1], 1);
queue[(iter + 1) * input_len + idx] = item;
}
}
__global__ void processWorkItems(int input_len, int max_iter, int *histogram, int *queue, int *queue_size, int *marked) {
auto grid = cooperative_groups::this_grid();
const int items_per_block = (input_len + gridDim.x - 1) / gridDim.x;
for(int iter = 0; iter < max_iter; ++iter) {
while(true) {
// Grab work item to process
int idx = atomicSub(&queue_size[iter], 1);
--idx;
if(idx < 0) {
break;
}
int item = queue[iter * input_len + idx];
// Keep track of processed work items
++histogram[iter * input_len + item];
// Add previous, self, and next work items to work queue
if(item > 0) {
addWorkItem(input_len, item - 1, item, iter, queue, queue_size, marked);
}
addWorkItem(input_len, item, item, iter, queue, queue_size, marked);
if(item + 1 < input_len) {
addWorkItem(input_len, item + 1, item, iter, queue, queue_size, marked);
}
}
__threadfence_system();
grid.sync();
// Reset marked array for next iteration
for(int i = 0; i < items_per_block; ++i) {
if(blockIdx.x * items_per_block + i < input_len) {
marked[blockIdx.x * items_per_block + i] = 0;
}
}
__threadfence_system();
grid.sync();
}
}
int main(int argc, char* argv[])
{
int input_len = atoi(argv[1]);
int max_iter = atoi(argv[2]);
int num_blocks = atoi(argv[3]);
// A histogram to keep track of work items that have been processed in each iteration
int *histogram_host = new int[input_len * max_iter];
memset(histogram_host, 0, sizeof(int) * input_len * max_iter);
int *histogram_device;
checkCudaErrors(cudaMalloc(&histogram_device, sizeof(int) * input_len * max_iter));
checkCudaErrors(cudaMemcpy(histogram_device, histogram_host, sizeof(int) * input_len * max_iter, cudaMemcpyHostToDevice));
// Size of the work queue for each iteration
int queue_size_host[max_iter + 1];
queue_size_host[0] = input_len;
memset(&queue_size_host[1], 0, sizeof(int) * max_iter);
int *queue_size_device;
checkCudaErrors(cudaMalloc(&queue_size_device, sizeof(int) * (max_iter + 1)));
checkCudaErrors(cudaMemcpy(queue_size_device, queue_size_host, sizeof(int) * (max_iter + 1), cudaMemcpyHostToDevice));
// Work queue
int *queue_host = new int[input_len * (max_iter + 1)];
for(int i = 0; i < input_len; ++i) {
queue_host[i] = i;
}
memset(&queue_host[input_len], 0, sizeof(int) * input_len * max_iter);
int *queue_device;
checkCudaErrors(cudaMalloc(&queue_device, sizeof(int) * input_len * (max_iter + 1)));
checkCudaErrors(cudaMemcpy(queue_device, queue_host, sizeof(int) * input_len * (max_iter + 1), cudaMemcpyHostToDevice));
// An array used to keep track of work items already added to the work queue to
// avoid multiple additions of a work item in the same iteration
int marked_host[input_len];
memset(marked_host, 0, sizeof(int) * input_len);
int *marked_device;
checkCudaErrors(cudaMalloc(&marked_device, sizeof(int) * input_len));
checkCudaErrors(cudaMemcpy(marked_device, marked_host, sizeof(int) * input_len, cudaMemcpyHostToDevice));
const dim3 threads(1, 1, 1);
const dim3 blocks(num_blocks, 1, 1);
int dev = 0;
int supportsCoopLaunch = 0;
checkCudaErrors(cudaDeviceGetAttribute(&supportsCoopLaunch, cudaDevAttrCooperativeLaunch, dev));
if (!supportsCoopLaunch) {printf("Cooperative Launch is not supported on this machine configuration. Exiting."); return 0;}
/// This will launch a grid that can maximally fill the GPU, on the default stream with kernel arguments
int numBlocksPerSm = 0;
// Number of threads my_kernel will be launched with
int numThreads = threads.x;
cudaDeviceProp deviceProp;
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev));
checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, processWorkItems, numThreads, 0));
// launch
void *kernelArgs[] = { &input_len, &max_iter, &histogram_device, &queue_device, &queue_size_device, &marked_device};
dim3 dimBlock = dim3(numThreads,1,1);
num_blocks = min(num_blocks, deviceProp.multiProcessorCount*numBlocksPerSm);
dim3 dimGrid(num_blocks, 1, 1);
printf("launching %d blocks\n", dimGrid.x);
checkCudaErrors(cudaLaunchCooperativeKernel((void*)processWorkItems, dimGrid, dimBlock, kernelArgs));
// processWorkItems<<<blocks, threads>>>(input_len, max_iter, histogram_device, queue_device, queue_size_device, marked_device);
checkCudaErrors(cudaDeviceSynchronize());
checkCudaErrors(cudaMemcpy(histogram_host, histogram_device, sizeof(int) * input_len * max_iter, cudaMemcpyDeviceToHost));
int extra = 0;
double deficit = 0;
for(int i = 0; i < input_len; ++i) {
int cnt = 0;
for(int iter = 0; iter < max_iter; ++iter) {
if(histogram_host[iter * input_len + i] > 1) {
++extra;
}
cnt += histogram_host[iter * input_len + i];
}
deficit += max_iter - cnt;
}
printf("Skipped %.2f%% of items. %d extra item processing.\n", deficit / (input_len * max_iter) * 100, extra);
checkCudaErrors(cudaFree(histogram_device));
checkCudaErrors(cudaFree(queue_device));
checkCudaErrors(cudaFree(queue_size_device));
checkCudaErrors(cudaFree(marked_device));
return 0;
}
$ nvcc -o t1811 t1811.cu -arch=sm_70 -std=c++11 -rdc=true
$ cuda-memcheck ./t1811 50 1000 5000
========= CUDA-MEMCHECK
launching 2560 blocks
Skipped 0.00% of items. 0 extra item processing.
========= ERROR SUMMARY: 0 errors
$ cuda-memcheck ./t1811 50 1000 1000
========= CUDA-MEMCHECK
launching 1000 blocks
Skipped 0.00% of items. 0 extra item processing.
========= ERROR SUMMARY: 0 errors
$ ./t1811 50 1000 5000
launching 2560 blocks
Skipped 0.00% of items. 0 extra item processing.
$ ./t1811 50 1000 1000
launching 1000 blocks
Skipped 0.00% of items. 0 extra item processing.
$ ./t1811 50 1000 1000
launching 1000 blocks
Skipped 0.00% of items. 0 extra item processing.
$
I'm not suggesting the above code is defect free or suitable for any particular purpose. It is mostly your code. I've modified it just to demonstrate the concepts mentioned.
As an aside, I changed a few of your large stack-based memory allocations to heap based. I don't recommend trying to create large stack-based arrays such as this:
int histogram_host[input_len * max_iter];
In my opinion it's better to do:
int *histogram_host = new int[input_len * max_iter];
As your input command-line parameters become larger, this may become an issue depending on the machine characteristics. This doesn't have much to do with CUDA, however. I've not tried to address every instance of this pattern in your code.
Although not relevant to this particular question, grid sync has other requirements for successful use as well. These are covered in the programming guide and include, but are not limited to:
platform support (e.g. OS, GPU, etc.)
kernel sizing requirements (total number of threads or threadblocks launched)
The programming guide contains convenient, boiler-plate code that may be used to satisfy these requirements.
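A minimal sketch of that boilerplate (assuming device 0 and the processWorkItems kernel above; the modified code earlier in this answer does essentially the same checks):
int dev = 0, supportsCoopLaunch = 0, numBlocksPerSm = 0;
int threadsPerBlock = 1;  // matches the 1-thread blocks used in this example
cudaDeviceProp prop;
checkCudaErrors(cudaDeviceGetAttribute(&supportsCoopLaunch, cudaDevAttrCooperativeLaunch, dev));
checkCudaErrors(cudaGetDeviceProperties(&prop, dev));
checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, processWorkItems, threadsPerBlock, 0));
// a cooperative grid must not exceed the number of blocks that can be co-resident on the device
int maxCoopBlocks = prop.multiProcessorCount * numBlocksPerSm;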


How to keep track of executed CUDA blocks?

Just for the sake of testing my understanding of things, I decided to modify the vector addition found in the CUDA samples so that the kernel quits after a specific time and is then re-launched to complete. The way I achieve the "timeout" is by having a pinned variable that the host sets to 1 after some time. Within the kernel, this variable is checked to determine whether execution should continue. If a thread continues its execution, it is marked as complete. In order to test that each thread executes just once, I've modified the addition to C[i] = C[i] + B[i]. This all works as expected; the device code looks as follows:
/* Function
* Internal device function used for getting the current thread's global ID
* regardless of the block/grid configuration. It assumes that the
* grid and block are 3 dimensional.
*
* #return: The thread's global ID
*/
static __device__ int get_global_idx()
{
int blockId = blockIdx.x
+ blockIdx.y * gridDim.x
+ gridDim.x * gridDim.y * blockIdx.z;
int threadId = blockId * (blockDim.x * blockDim.y * blockDim.z)
+ (threadIdx.z * (blockDim.x * blockDim.y))
+ (threadIdx.y * blockDim.x)
+ threadIdx.x;
return threadId;
}
/* Function
* Device function that determines if the current thread should continue execution.
* A check should be used on the return value. If the timeout has not been set
* and the thread has not previously executed the index at the thread's ID in the
* thread_ids array is set to 1 to indicate it was allowed to proceed.
*
* #param thread_ids: A pointer to the array with a size that matches the max number
* of threads that will be spawned
*
* #param time_out: Memory mapped variable used by the host to signal the kernel when
* execution should suspend
*
* #return: A boolean value indicating whether the current thread should continue or not
*/
__device__ bool continue(unsigned int *thread_ids, volatile unsigned int *time_out)
{
if(*time_out == 1){
return false;
}
int tid = get_global_idx();
if(thread_ids[tid] == 1)
{
return false;
}
thread_ids[tid] = 1;
return true;
}
__global__ void
vectorAdd(const float *A, const float *B, float *C, long numElements, unsigned int *thread_ids, volatile unsigned int *timeout)
{
if(!continue(thread_ids, timeout))
{
return;
}
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
/* C[i] = A[i] + B[i]; */
C[i] = C[i] + B[i]; //Modifed from above
}
}
I considered how this may fail if __syncthreads() was used, so I decided to do block-level suspension. Based on my understanding, I thought this would be simple: keep track of whether a block has started, count how many threads have executed for that block, only suspend when all threads of an already-started block have completed, and deny any threads whose block has not started. So I used a struct and modified the continue function as follows:
typedef struct block_info_t{
int started; /* Initialized to zero before any kernel launch */
unsigned int thread_count;
}block_info;
__device__ bool continue(unsigned int *thread_ids, volatile unsigned int *time_out, block_info *b_info)
{
int bid = blockIdx.x + gridDim.x * (blockIdx.y + gridDim.z * blockIdx.z);
unsigned int bsize = blockDim.x * blockDim.y * blockDim.z;
if(*time_out == 1 && b_info[bid].started == 0)
{
return false;
}
if(b_info[bid].thread_count == bsize)
{
return false;
}
b_info[bid].started = 1;
atomicInc(&b_info[bid].thread_count, bsize);
return true;
}
This does not work: when I execute the verification on the host (h_B[i] - h_C[i]), I don't get a consistent zero result, which means that some threads somehow managed to execute multiple times. Any ideas how/why this is happening with the latter attempt? Thanks.
I don't care about performance at this point; just trying to understand what is really happening.
EDIT
Here is the complete code, compile with nvcc file_name.cu and execute program_name <vector-length>.
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef struct block_info_t{
int started; /* Initialized to zero before any kernel launch */
unsigned int thread_count;
}block_info;
__device__ bool continue_execution(volatile unsigned int *time_out, block_info *b_info)
{
int bid = blockIdx.x + gridDim.x * (blockIdx.y + gridDim.z * blockIdx.z);
unsigned int bsize = blockDim.x * blockDim.y * blockDim.z;
if(*time_out == 1 && b_info[bid].started == 0)
{
return false;
}
if(b_info[bid].thread_count == bsize)
{
return false;
}
b_info[bid].started = 1;
atomicInc(&b_info[bid].thread_count, bsize);
return true;
}
__global__ void
vectorAdd(const float *A, const float *B, float *C, long numElements, volatile unsigned int *time_out, block_info *b_info)
{
if(!continue_execution(time_out, b_info))
{
return;
}
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
//C[i] = A[i] + B[i];
C[i] = C[i] + B[i]; //Modified from above
}
}
void computation_complete(int *complete, int block_amt, block_info *h_block_info)
{
size_t i;
for(i = 0; i < block_amt; i++)
{
if(h_block_info[i].started == 1)
{
continue;
}
break;
}
*complete = (i == block_amt) ? 1 : 0;
}
int main(int argc, char *argv[])
{
if(argc != 2)
{
fprintf(stderr, "usage: <program-name> <vector-length>\n");
exit(EXIT_FAILURE);
}
// Print the vector length to be used, and compute its size
long numElements = strtol(argv[1], NULL, 10);
size_t size = numElements * sizeof(float);
printf("[Vector addition of %d elements]\n", numElements);
float *h_A = (float *)malloc(size);
float *h_B = (float *)malloc(size);
float *h_C = (float *)malloc(size);
// Initialize the host input vectors
for (int i = 0; i < numElements; ++i)
{
h_A[i] = rand()/(float)RAND_MAX;
h_B[i] = rand()/(float)RAND_MAX;
h_C[i] = 0.0;
}
float *d_A = NULL;
cudaMalloc((void **)&d_A, size);
float *d_B = NULL;
cudaMalloc((void **)&d_B, size);
float *d_C = NULL;
cudaMalloc((void **)&d_C, size);
cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_C, h_C, size, cudaMemcpyHostToDevice);
int threadsPerBlock = 256;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
size_t block_info_bytes = blocksPerGrid * sizeof(struct block_info_t);
block_info *h_block_info = (struct block_info_t *)malloc(block_info_bytes);
for(int i = 0; i < blocksPerGrid; i++)
{
h_block_info[i].started = 0;
h_block_info[i].thread_count = 0;
}
block_info *d_block_info = NULL;
cudaMalloc(&d_block_info, block_info_bytes);
cudaMemcpy(d_block_info, h_block_info, block_info_bytes, cudaMemcpyHostToDevice);
volatile unsigned int *timeout = NULL;
cudaHostAlloc((void **)&timeout, sizeof(volatile unsigned int), cudaHostAllocMapped);
*timeout = 0;
double quantum = 0.0001 * 1000000.0;
double initial_quantum = quantum;
int complete = 0;
/* Here the kernel launch is looped until all blocks are complete */
while(complete == 0)
{
vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements, timeout, d_block_info);
usleep(quantum);
*timeout = 1;
cudaDeviceSynchronize();
cudaMemcpy(h_block_info, d_block_info, block_info_bytes, cudaMemcpyDeviceToHost);
computation_complete(&complete, blocksPerGrid, h_block_info);
if(complete == 0)
{
quantum = quantum + initial_quantum;
*timeout = 0;
}
}
cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
// Verify that the result vector is correct
for (int i = 0; i < numElements; ++i)
{
if (fabs(h_B[i] - h_C[i]) > 1e-5)
{
fprintf(stderr, "Result verification failed at element %d!\n", i);
exit(EXIT_FAILURE);
}
}
printf("Test PASSED\n");
// Free device global memory
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
free(h_A);
free(h_B);
free(h_C);
cudaDeviceReset();
return 0;
}
You have a race condition in your continue_execution routine. Consider the following scenario:
warp0 of a threadblock enters the continue_execution routine. At the moment that it checks the variables *time_out and b_info[bid].started it witnesses those to be 0 and 0 respectively. So it proceeds to the next if test.
warp1 of the same threadblock enters the continue_execution routine (let's say slightly later), and it witnesses the variables to be 1 and 0 respectively. So it returns false and causes the warp1 threads to exit.
warp0 continues on and eventually sets b_info[bid].started to 1, and then updates the thread_count. It then returns true and proceeds with the vector add.
I could continue with this, but I think if you consider the above 3 items carefully you will realize it is a case you did not account for. Your implicit expectation is that every thread would read a coherent (i.e. the same across a given threadblock) value for *time_out. But this is not guaranteed by your code, and if that doesn't happen, we end up with some threadblocks where some threads have completed their work and some have not.
So how could we fix this? The above description should point the way. One possible approach is to guarantee that, for any given threadblock, every thread gets the same value for *time_out, whether it be 1 or 0. One possible solution would be to make the following changes to the beginning of your vectorAdd kernel:
__shared__ volatile unsigned int my_time_out;
if (!threadIdx.x) my_time_out = *time_out;
__syncthreads();
if(!continue_execution(&my_time_out, b_info))
With those changes, we ensure that every thread in a block gets a coherent view of the time-out variable, and according to my testing, the problem is resolved:
$ cat t100.cu
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef struct block_info_t{
int started; /* Initialized to zero before any kernel launch */
unsigned int thread_count;
}block_info;
__device__ bool continue_execution(volatile unsigned int *time_out, block_info *b_info)
{
int bid = blockIdx.x + gridDim.x * (blockIdx.y + gridDim.z * blockIdx.z);
unsigned int bsize = blockDim.x * blockDim.y * blockDim.z;
if(*time_out == 1 && b_info[bid].started == 0)
{
return false;
}
if(b_info[bid].thread_count == bsize)
{
return false;
}
b_info[bid].started = 1;
atomicInc(&b_info[bid].thread_count, bsize);
return true;
}
__global__ void
vectorAdd(const float *A, const float *B, float *C, long numElements, volatile unsigned int *time_out, block_info *b_info)
{
#ifdef USE_FIX
__shared__ volatile unsigned int my_time_out;
if (!threadIdx.x) my_time_out = *time_out;
__syncthreads();
if(!continue_execution(&my_time_out, b_info))
#else
if(!continue_execution(time_out, b_info))
#endif
{
return;
}
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
//C[i] = A[i] + B[i];
C[i] = C[i] + B[i]; //Modified from above
}
}
void computation_complete(int *complete, int block_amt, block_info *h_block_info)
{
size_t i;
for(i = 0; i < block_amt; i++)
{
if(h_block_info[i].started == 1)
{
continue;
}
break;
}
*complete = (i == block_amt) ? 1 : 0;
}
int main(int argc, char *argv[])
{
if(argc != 2)
{
fprintf(stderr, "usage: <program-name> <vector-length>\n");
exit(EXIT_FAILURE);
}
// Print the vector length to be used, and compute its size
long numElements = strtol(argv[1], NULL, 10);
size_t size = numElements * sizeof(float);
printf("[Vector addition of %ld elements]\n", numElements);
float *h_A = (float *)malloc(size);
float *h_B = (float *)malloc(size);
float *h_C = (float *)malloc(size);
// Initialize the host input vectors
for (int i = 0; i < numElements; ++i)
{
h_A[i] = rand()/(float)RAND_MAX;
h_B[i] = rand()/(float)RAND_MAX;
h_C[i] = 0.0;
}
float *d_A = NULL;
cudaMalloc((void **)&d_A, size);
float *d_B = NULL;
cudaMalloc((void **)&d_B, size);
float *d_C = NULL;
cudaMalloc((void **)&d_C, size);
cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_C, h_C, size, cudaMemcpyHostToDevice);
int threadsPerBlock = 256;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
size_t block_info_bytes = blocksPerGrid * sizeof(struct block_info_t);
block_info *h_block_info = (struct block_info_t *)malloc(block_info_bytes);
for(int i = 0; i < blocksPerGrid; i++)
{
h_block_info[i].started = 0;
h_block_info[i].thread_count = 0;
}
block_info *d_block_info = NULL;
cudaMalloc(&d_block_info, block_info_bytes);
cudaMemcpy(d_block_info, h_block_info, block_info_bytes, cudaMemcpyHostToDevice);
volatile unsigned int *timeout = NULL;
cudaHostAlloc((void **)&timeout, sizeof(volatile unsigned int), cudaHostAllocMapped);
*timeout = 0;
double quantum = 0.0001 * 1000000.0;
double initial_quantum = quantum;
int complete = 0;
/* Here the kernel launch is looped until all blocks are complete */
while(complete == 0)
{
vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements, timeout, d_block_info);
usleep(quantum);
*timeout = 1;
cudaDeviceSynchronize();
cudaMemcpy(h_block_info, d_block_info, block_info_bytes, cudaMemcpyDeviceToHost);
computation_complete(&complete, blocksPerGrid, h_block_info);
if(complete == 0)
{
quantum = quantum + initial_quantum;
*timeout = 0;
}
}
cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
// Verify that the result vector is correct
for (int i = 0; i < numElements; ++i)
{
if (fabs(h_B[i] - h_C[i]) > 1e-5)
{
fprintf(stderr, "Result verification failed at element %d!\n", i);
exit(EXIT_FAILURE);
}
}
printf("Test PASSED\n");
// Free device global memory
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
free(h_A);
free(h_B);
free(h_C);
cudaDeviceReset();
return 0;
}
$ nvcc -arch=sm_61 -o t100 t100.cu
$ ./t100 327678
[Vector addition of 327678 elements]
Result verification failed at element 0!
$ nvcc -arch=sm_61 -o t100 t100.cu -DUSE_FIX
$ ./t100 327678
[Vector addition of 327678 elements]
Test PASSED
$ ./t100 327678
[Vector addition of 327678 elements]
Test PASSED
$ ./t100 327678
[Vector addition of 327678 elements]
Test PASSED
$
One other change I made to your code was in this line:
printf("[Vector addition of %d elements]\n", numElements);
This has no bearing on the problem, but your format specifier does not match your variable type. Fix it by changing to %ld.
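That is, as in the listing above:
printf("[Vector addition of %ld elements]\n", numElements);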

cufftSetStream causes garbage output. Am I doing something wrong?

According to the docs, the cufftSetStream() function
Associates a CUDA stream with a cuFFT plan. All kernel launches made during plan execution are now done through the associated stream [...until...] the stream is changed with another call to cufftSetStream().
Unfortunately, the results turn into garbage. Here is an example that demonstrates this by performing a bunch of transforms two ways: once with each stream having its own dedicated plan, and once with a single plan being reused as the documentation above indicates. The former behaves as expected; the reused/cufftSetStream approach has errors in most of the transforms. This was observed on the two cards I've tried (GTX 750 Ti, Titan X) on CentOS 7 Linux with CUDA compilation tools release 7.0, V7.0.27 and release 7.5, V7.5.17.
EDIT: see the "FIX" comments below for one way to fix things.
#include <cufft.h>
#include <stdexcept>
#include <iostream>
#include <numeric>
#include <vector>
#define ck(cmd) if ( cmd) { std::cerr << "error at line " << __LINE__ << std::endl;exit(1);}
__global__
void fill_input(cufftComplex * buf, int batch,int nbins,int stride,int seed)
{
for (int i = blockDim.y * blockIdx.y + threadIdx.y; i< batch;i += gridDim.y*blockDim.y)
for (int j = blockDim.x * blockIdx.x + threadIdx.x; j< nbins;j += gridDim.x*blockDim.x)
buf[i*stride + j] = make_cuFloatComplex( (i+seed)%101 - 50,(j+seed)%41-20);
}
__global__
void check_output(const float * buf1,const float * buf2,int batch, int nfft, int stride, int * errors)
{
for (int i = blockDim.y * blockIdx.y + threadIdx.y; i< batch;i += gridDim.y*blockDim.y) {
for (int j = blockDim.x * blockIdx.x + threadIdx.x; j< nfft;j += gridDim.x*blockDim.x) {
float e=buf1[i*stride+j] - buf2[i*stride+j];
if (e*e > 1) // gross error
atomicAdd(errors,1);
}
}
}
void demo(bool reuse_plan)
{
if (reuse_plan)
std::cout << "Reusing the same fft plan with multiple stream via cufftSetStream ... ";
else
std::cout << "Giving each stream its own dedicated fft plan ... ";
int nfft = 1024;
int batch = 1024;
int nstreams = 8;
int nbins = nfft/2+1;
int nit=100;
size_t inpitch,outpitch;
std::vector<cufftComplex*> inbufs(nstreams);
std::vector<float*> outbufs(nstreams);
std::vector<float*> checkbufs(nstreams);
std::vector<cudaStream_t> streams(nstreams);
std::vector<cufftHandle> plans(nstreams);
for (int i=0;i<nstreams;++i) {
ck( cudaStreamCreate(&streams[i]));
ck( cudaMallocPitch((void**)&inbufs[i],&inpitch,nbins*sizeof(cufftComplex),batch) );
ck( cudaMallocPitch((void**)&outbufs[i],&outpitch,nfft*sizeof(float),batch));
ck( cudaMallocPitch((void**)&checkbufs[i],&outpitch,nfft*sizeof(float),batch) );
if (i==0 || reuse_plan==false)
ck ( cufftPlanMany(&plans[i],1,&nfft,&nbins,1,inpitch/sizeof(cufftComplex),&nfft,1,outpitch/sizeof(float),CUFFT_C2R,batch) );
}
// fill the input buffers and FFT them to get a baseline for comparison
for (int i=0;i<nstreams;++i) {
fill_input<<<20,dim3(32,32)>>>(inbufs[i],batch,nbins,inpitch/sizeof(cufftComplex),i);
ck (cudaGetLastError());
if (reuse_plan) {
ck (cufftExecC2R(plans[0],inbufs[i],checkbufs[i]));
}else{
ck (cufftExecC2R(plans[i],inbufs[i],checkbufs[i]));
ck( cufftSetStream(plans[i],streams[i]) ); // only need to set the stream once
}
ck( cudaDeviceSynchronize());
}
// allocate a buffer for the error count
int * errors;
cudaMallocHost((void**)&errors,sizeof(int)*nit);
memset(errors,0,sizeof(int)*nit);
/* FIX: an event can protect the plan internal buffers
by serializing access to the plan
cudaEvent_t ev;
cudaEventCreateWithFlags(&ev,cudaEventDisableTiming);
*/
// perform the FFTs and check the outputs on streams
for (int it=0;it<nit;++it) {
int k = it % nstreams;
ck( cudaStreamSynchronize(streams[k]) ); // make sure any prior kernels have completed
if (reuse_plan) {
// FIX: ck(cudaStreamWaitEvent(streams[k],ev,0 ) );
ck(cufftSetStream(plans[0],streams[k]));
ck(cufftExecC2R(plans[0],inbufs[k],outbufs[k]));
// FIX: ck(cudaEventRecord(ev,streams[k] ) );
}else{
ck(cufftExecC2R(plans[k],inbufs[k],outbufs[k]));
}
check_output<<<100,dim3(32,32),0,streams[k]>>>(outbufs[k],checkbufs[k],batch,nfft,outpitch/sizeof(float),&errors[it]);
ck (cudaGetLastError());
}
ck(cudaDeviceSynchronize());
// report number of errors
int errcount=0;
for (int it=0;it<nit;++it)
if (errors[it])
++errcount;
std::cout << errcount << " of " << nit << " transforms had errors\n";
for (int i=0;i<nstreams;++i) {
cudaFree(inbufs[i]);
cudaFree(outbufs[i]);
cudaStreamDestroy(streams[i]);
if (i==0 || reuse_plan==false)
cufftDestroy(plans[i]);
}
}
int main(int argc,char ** argv)
{
demo(false);
demo(true);
return 0;
}
Typical output
Giving each stream its own dedicated fft plan ... 0 of 100 transforms had errors
Reusing the same fft plan with multiple stream via cufftSetStream ... 87 of 100 transforms had errors
In order to reuse plans the way you want, you need to manage the cuFFT work area manually.
Each plan has space for intermediate calculation results. If you want to use the same plan handle for two or more concurrent plan executions, you need to provide a temporary buffer for each concurrent cufftExec* call.
You can do this by using cufftSetWorkArea - please have a look at section 3.7 in the cuFFT documentation. Section 2.2 would also help in understanding how it works.
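In outline, the required changes (excerpted from the worked example below; wk_areas is a per-stream buffer array added there) come down to this:
// at setup time: query the scratch size, allocate one private work area per stream,
// and disable the plan's single built-in work area
size_t ws;
ck(cufftGetSize(plans[0], &ws));
for (int i = 0; i < nstreams; i++)
    ck(cudaMalloc(&(wk_areas[i]), ws));
ck(cufftSetAutoAllocation(plans[0], 0));
// before each execution on stream k: point the plan at that stream's work area
ck(cufftSetStream(plans[0], streams[k]));
ck(cufftSetWorkArea(plans[0], wk_areas[k]));
ck(cufftExecC2R(plans[0], inbufs[k], outbufs[k]));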
Here's a worked example showing the changes to your code for this:
$ cat t1241.cu
#include <cufft.h>
#include <stdexcept>
#include <iostream>
#include <numeric>
#include <vector>
#define ck(cmd) if ( cmd) { std::cerr << "error at line " << __LINE__ << std::endl;exit(1);}
__global__
void fill_input(cufftComplex * buf, int batch,int nbins,int stride,int seed)
{
for (int i = blockDim.y * blockIdx.y + threadIdx.y; i< batch;i += gridDim.y*blockDim.y)
for (int j = blockDim.x * blockIdx.x + threadIdx.x; j< nbins;j += gridDim.x*blockDim.x)
buf[i*stride + j] = make_cuFloatComplex( (i+seed)%101 - 50,(j+seed)%41-20);
}
__global__
void check_output(const float * buf1,const float * buf2,int batch, int nfft, int stride, int * errors)
{
for (int i = blockDim.y * blockIdx.y + threadIdx.y; i< batch;i += gridDim.y*blockDim.y) {
for (int j = blockDim.x * blockIdx.x + threadIdx.x; j< nfft;j += gridDim.x*blockDim.x) {
float e=buf1[i*stride+j] - buf2[i*stride+j];
if (e*e > 1) // gross error
atomicAdd(errors,1);
}
}
}
void demo(bool reuse_plan)
{
if (reuse_plan)
std::cout << "Reusing the same fft plan with multiple stream via cufftSetStream ... ";
else
std::cout << "Giving each stream its own dedicated fft plan ... ";
int nfft = 1024;
int batch = 1024;
int nstreams = 8;
int nbins = nfft/2+1;
int nit=100;
size_t inpitch,outpitch;
std::vector<cufftComplex*> inbufs(nstreams);
std::vector<float*> outbufs(nstreams);
std::vector<float*> checkbufs(nstreams);
std::vector<cudaStream_t> streams(nstreams);
std::vector<cufftHandle> plans(nstreams);
// if plan reuse, set up independent work areas
std::vector<char *> wk_areas(nstreams);
for (int i=0;i<nstreams;++i) {
ck( cudaStreamCreate(&streams[i]));
ck( cudaMallocPitch((void**)&inbufs[i],&inpitch,nbins*sizeof(cufftComplex),batch) );
ck( cudaMallocPitch((void**)&outbufs[i],&outpitch,nfft*sizeof(float),batch));
ck( cudaMallocPitch((void**)&checkbufs[i],&outpitch,nfft*sizeof(float),batch) );
if (i==0 || reuse_plan==false)
ck ( cufftPlanMany(&plans[i],1,&nfft,&nbins,1,inpitch/sizeof(cufftComplex),&nfft,1,outpitch/sizeof(float),CUFFT_C2R,batch) );
}
if (reuse_plan){
size_t ws;
ck(cufftGetSize(plans[0], &ws));
for (int i = 0; i < nstreams; i++)
ck(cudaMalloc(&(wk_areas[i]), ws));
ck(cufftSetAutoAllocation(plans[0], 0));
ck(cufftSetWorkArea(plans[0], wk_areas[0]));
}
// fill the input buffers and FFT them to get a baseline for comparison
for (int i=0;i<nstreams;++i) {
fill_input<<<20,dim3(32,32)>>>(inbufs[i],batch,nbins,inpitch/sizeof(cufftComplex),i);
ck (cudaGetLastError());
if (reuse_plan) {
ck (cufftExecC2R(plans[0],inbufs[i],checkbufs[i]));
}else{
ck (cufftExecC2R(plans[i],inbufs[i],checkbufs[i]));
ck( cufftSetStream(plans[i],streams[i]) ); // only need to set the stream once
}
ck( cudaDeviceSynchronize());
}
// allocate a buffer for the error count
int * errors;
cudaMallocHost((void**)&errors,sizeof(int)*nit);
memset(errors,0,sizeof(int)*nit);
// perform the FFTs and check the outputs on streams
for (int it=0;it<nit;++it) {
int k = it % nstreams;
ck( cudaStreamSynchronize(streams[k]) ); // make sure any prior kernels have completed
if (reuse_plan) {
ck(cufftSetStream(plans[0],streams[k]));
ck(cufftSetWorkArea(plans[0], wk_areas[k])); // update work area pointer in plan
ck(cufftExecC2R(plans[0],inbufs[k],outbufs[k]));
}else{
ck(cufftExecC2R(plans[k],inbufs[k],outbufs[k]));
}
check_output<<<100,dim3(32,32),0,streams[k]>>>(outbufs[k],checkbufs[k],batch,nfft,outpitch/sizeof(float),&errors[it]);
ck (cudaGetLastError());
}
ck(cudaDeviceSynchronize());
// report number of errors
int errcount=0;
for (int it=0;it<nit;++it)
if (errors[it])
++errcount;
std::cout << errcount << " of " << nit << " transforms had errors\n";
for (int i=0;i<nstreams;++i) {
cudaFree(inbufs[i]);
cudaFree(outbufs[i]);
cudaFree(wk_areas[i]);
cudaStreamDestroy(streams[i]);
if (i==0 || reuse_plan==false)
cufftDestroy(plans[i]);
}
}
int main(int argc,char ** argv)
{
demo(false);
demo(true);
return 0;
}
$ nvcc -o t1241 t1241.cu -lcufft
$ ./t1241
Giving each stream its own dedicated fft plan ... 0 of 100 transforms had errors
Reusing the same fft plan with multiple stream via cufftSetStream ... 0 of 100 transforms had errors
$

CUDA idx doesn't index matrices correctly

I have the following kernel in cuda:
__global__ void pagerank(Node *ingoing, Node *outgoing, int N) {
int j;
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if ((idx > 0) && (idx < N)){
//for(j=0;j<N;j++){
// outgoing[j].p_t1=ingoing[j].p_t1;
//}
outgoing[idx].p_t1=ingoing[idx].p_t1;
}
}
This doesn't work. The following works:
__global__ void pagerank(Node *ingoing, Node *outgoing, int N) {
int j;
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if ((idx > 0) && (idx < N)){
for(j=0;j<N;j++){
outgoing[j].p_t1=ingoing[j].p_t1;
}
//outgoing[idx].p_t1=ingoing[idx].p_t1;
}
}
What is wrong? Why doesn't idx index the matrices correctly?
The whole code is below; it isn't easy to follow all of it. The thing is that when I print the outgoing[idx].p_t1 fields at the end of the main function, they print 0s when I do
outgoing[idx].p_t1=ingoing[idx].p_t1;
but they are correct when I do
for(j=0;j<N;j++){
outgoing[j].p_t1=ingoing[j].p_t1;
}
What's wrong?
/******************** Includes - Defines ****************/
#include "pagerank_serial.h"
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <assert.h>
#include <string.h>
#include <sys/time.h>
#include <fcntl.h>
#include <cuda.h>
#include "string.h"
/******************** Defines ****************/
// Number of nodes
int N;
// Convergence threshold and algorithm's parameter d
double threshold, d;
// Table of node's data
Node *Nodes;
__global__ void pagerank(Node *ingoing, Node *outgoing, int N) {
int j;
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if ((idx > 0) && (idx < N)){
for(j=0;j<N;j++){
outgoing[j].p_t1=ingoing[j].p_t1;
}
//outgoing[idx].p_t1=ingoing[idx].p_t1;
}
}
/***** Read graph connections from txt file *****/
void Read_from_txt_file(char* filename)
{
FILE *fid;
int from_idx, to_idx;
int temp_size;
fid = fopen(filename, "r");
if (fid == NULL){
printf("Error opening data file\n");
}
while (!feof(fid))
{
if (fscanf(fid,"%d\t%d\n", &from_idx,&to_idx))
{
Nodes[from_idx].con_size++;
temp_size = Nodes[from_idx].con_size;
//Nodes[from_idx].To_id =(int*) realloc(Nodes[from_idx].To_id, temp_size * sizeof(int));
Nodes[from_idx].To_id[temp_size - 1] = to_idx;
}
}
//printf("End of connections insertion!\n");
fclose(fid);
}
/***** Read P vector from txt file*****/
void Read_P_from_txt_file()
{
FILE *fid;
double temp_P;
int index = 0;
fid = fopen("P.txt", "r");
if (fid == NULL){printf("Error opening the Probabilities file\n");}
while (!feof(fid))
{
// P's values are double!
if (fscanf(fid," double sum = 0;%lf\n", &temp_P))
{
Nodes[index].p_t1 = temp_P;
index++;
}
}
//printf("End of P insertion!");
fclose(fid);
}
/***** Read E vector from txt file*****/
void Read_E_from_txt_file()
{
FILE *fid;
double temp_E;
int index = 0;
fid = fopen("E.txt", "r");
if (fid == NULL)
printf("Error opening the E file\n");
while (!feof(fid))
{
// E's values are double!
if (fscanf(fid,"%lf\n", &temp_E))
{
Nodes[index].e = temp_E;
index++;
}
}
//printf("End of E insertion!");
fclose(fid);
}
/***** Create P and E with equal probability *****/
void Random_P_E()
{
int i;
// Sum of P (it must be =1)
double sum_P_1 = 0;
// Sum of E (it must be =1)
double sum_E_1 = 0;
// Arrays initialization
for (i = 0; i < N; i++)
{
Nodes[i].p_t0 = 0;
Nodes[i].p_t1 = 1;
Nodes[i].p_t1 = (double) Nodes[i].p_t1 / N;
sum_P_1 = sum_P_1 + Nodes[i].p_t1;
Nodes[i].e = 1;
Nodes[i].e = (double) Nodes[i].e / N;
sum_E_1 = sum_E_1 + Nodes[i].e;
}
// Assert sum of probabilities is =1
// Print sum of P (it must be =1)
//printf("Sum of P = %f\n",sum_P_1);
// Exit if sum of P is !=1
assert(sum_P_1 = 1);
//printf("\n");
// Print sum of E (it must be =1)
//printf("Sum of E = %f\n",sum_E_1);
// Exit if sum of Pt0 is !=1
assert(sum_E_1 = 1);
}
/***** Main function *****/
int main(int argc, char** argv)
{
int blockSize; // The launch configurator returned block size
int minGridSize; // The minimum grid size needed to achieve the maximum occupancy for a full device launch
int gridSize; // The actual grid size needed, based on input size
// Check input arguments
if (argc < 5)
{
printf("Error in arguments! Three arguments required: graph filename, N, threshold and d\n");
return 0;
}
// get arguments
char filename[256];
strcpy(filename, argv[1]);
N = atoi(argv[2]);
threshold = atof(argv[3]);
d = atof(argv[4]);
int i;
// a constant value contributed by all nodes with connectivity = 0
// it's going to be added to every node's new probability
// Allocate memory for N nodes
Nodes = (Node*) malloc(N * sizeof(Node));
for (i = 0; i < N; i++)
{
Nodes[i].con_size = 0;
//Nodes[i].To_id = (int*) malloc(sizeof(int));
}
Read_from_txt_file(filename);
// set random probabilities
Random_P_E();
Node *h_ingoing;
Node *h_outgoing;
h_ingoing = Nodes;
h_outgoing = (Node *)calloc(N, sizeof *h_outgoing);
Node *d_ingoing;
Node *d_outgoing;
cudaMalloc(&d_ingoing, N * sizeof *d_ingoing);
cudaMalloc(&d_outgoing, N * sizeof *d_outgoing);
cudaMemcpy(d_ingoing, h_ingoing, N * sizeof *h_ingoing, cudaMemcpyHostToDevice);
cudaMemcpy(d_outgoing, h_outgoing, N * sizeof *h_outgoing, cudaMemcpyHostToDevice);
float time;
cudaEvent_t begin, end;
cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, pagerank, 0, N);
// Round up according to array size
gridSize = (N + blockSize - 1) / blockSize;
printf("Gridsize, blockzise : %d , %d \n", gridSize, blockSize);
cudaEventCreate(&begin);
cudaEventCreate(&end);
cudaEventRecord(begin, 0);
pagerank<<<gridSize, blockSize>>>(d_ingoing, d_outgoing, N, threshold, d);
cudaEventRecord(end, 0);
cudaEventSynchronize(end);
cudaEventElapsedTime(&time, begin, end);
cudaMemcpy(h_outgoing, d_outgoing, N * sizeof *h_outgoing, cudaMemcpyDeviceToHost);
printf("%f\n", time) ;
printf("\n");
// Print final probabilitities
for (i = 0; i <100; i++)
{
printf("P_t1[%d] = %f\n",i,h_outgoing[i].p_t1);
}
printf("\n");
printf("End of program!\n");
return (EXIT_SUCCESS);
}
When you say that they print 0s, I assume you are referring to all entries and not just index 0. Indeed, index 0 is not processed by the first version of your code, as ((idx > 0) && (idx < N)) is false for idx = 0.
Going further, your code is missing the definition of the Node type, which is necessary to get a better understanding of what could go wrong.
Depending on the size of Node, its contents, and the structure packing you are using in compilation, it might be that the Node size on the host side differs from the Node size on the device. Verifying that with printf, or with a debugger, would be useful.
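For example, a quick check (a hypothetical sketch, since the Node definition isn't shown in the question) would be to print the struct size on both sides and compare:
// hypothetical helper, assuming the same Node definition is visible to host and device code
__global__ void printNodeSize() {
    printf("device sizeof(Node) = %u\n", (unsigned)sizeof(Node));
}
// host side, before launching the real kernel:
//   printf("host   sizeof(Node) = %u\n", (unsigned)sizeof(Node));
//   printNodeSize<<<1,1>>>();
//   cudaDeviceSynchronize();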
Also, you do not seem to be checking for launch errors. You definitely want to add a cudaPeekAtLastError and a cudaDeviceSynchronize after your kernel call to make sure no error occurred (any other CUDA Runtime API call may also return errors that your code does not check).
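For example, a sketch of that pattern using the kernel from the question:
pagerank<<<gridSize, blockSize>>>(d_ingoing, d_outgoing, N);
cudaError_t err = cudaPeekAtLastError();       // reports launch/configuration errors
if (err != cudaSuccess) printf("launch error: %s\n", cudaGetErrorString(err));
err = cudaDeviceSynchronize();                 // reports errors raised while the kernel ran
if (err != cudaSuccess) printf("kernel error: %s\n", cudaGetErrorString(err));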
EDIT
Trying to reproduce, I wrote the following, as close as possible to your code. I don't have a card with sufficient memory, hence the smaller node count.
typedef struct
{
double p_t0;
double p_t1;
double e;
int To_id[460];
int con_size;
} Node ;
__global__ void pagerank(Node* ingoing, Node* outgoing, int N)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x ;
if ((idx > 0) && (idx < N))
outgoing[idx].p_t1 = ingoing[idx].p_t1;
}
#include <cstdlib>
#define cudaCheck(a) { cudaError_t cuerr = a ; if (cuerr != cudaSuccess) { printf("[ERROR # %s : %d ] : (%d) - %s\n", __FILE__, __LINE__, cuerr, cudaGetErrorString(cuerr)) ; ::exit(1) ; } }
int main()
{
// int N = 916428 ; // does not fit on my GPU
int N = 400000 ;
int blockSize;
int minGridSize;
int gridSize;
Node* Nodes = (Node*)malloc(N * sizeof (Node)) ;
for (int i = 0 ; i < N ; ++i)
Nodes[i].p_t1 = (double)i+1;
Node* h_ingoing = Nodes;
Node* h_outgoing = (Node*)calloc(N, sizeof *h_outgoing) ;
Node* d_ingoing ;
Node* d_outgoing ;
cudaCheck (cudaMalloc(&d_ingoing, N * sizeof *d_ingoing));
cudaCheck (cudaMalloc(&d_outgoing, N * sizeof *d_outgoing));
cudaCheck (cudaMemcpy (d_ingoing, h_ingoing, N * sizeof *h_ingoing, cudaMemcpyHostToDevice));
cudaCheck (cudaMemcpy (d_outgoing, h_outgoing, N * sizeof *h_outgoing, cudaMemcpyHostToDevice));
float time;
cudaEvent_t begin, end ;
//blockSize = 256 ;
cudaOccupancyMaxPotentialBlockSize<> (&minGridSize, &blockSize, pagerank, 0, N) ;
gridSize = (N + blockSize -1) / blockSize ;
printf ("Configuration = <<< %d , %d >>>\n", gridSize, blockSize) ;
cudaCheck (cudaEventCreate (&begin)) ;
cudaCheck (cudaEventCreate (&end)) ;
cudaCheck (cudaEventRecord (begin, 0)) ;
pagerank <<< gridSize, blockSize >>> (d_ingoing, d_outgoing, N) ;
cudaCheck (cudaEventRecord (end, 0)) ;
cudaCheck (cudaEventSynchronize (end)) ;
cudaCheck (cudaMemcpy (h_outgoing, d_outgoing, N * sizeof *h_outgoing, cudaMemcpyDeviceToHost)) ;
for (int i = 0 ; i < 100 ; ++i)
{
printf ("P_t1[%d] = %f\n", i, h_outgoing[i].p_t1) ;
}
for (int i = 0 ; i < N ; ++i)
{
if (h_outgoing[i].p_t1 != (double)(i+1))
printf ("Error # %d : %lf <> %lf\n", i, h_outgoing[i].p_t1, (double)(i+1));
}
return 0 ;
}
Except at index 0, which, as noted at the beginning of this answer, is never processed because of the (idx > 0) test, every output is correct.

Unspecified launch failure after cudaDeviceSynchronize() call when program starts. But no errors using step-through debugging. CUDA

I've spent several hours struggling with an unspecified launch failure.
I've come up with a tiny task for myself in order to understand how shared memory works.
The task is to divide the array [1, 2, 3, ..., N] into K groups of (N / K) elements and find the sum of each group. (The difference between the current and previous element of the array equals 1.)
I was planning to use N threads in the grid, divided between K blocks, so every threadblock contains (N / K) threads. Thus one threadblock could be used to compute the sum of one group. I also wanted to allocate the shared memory dynamically.
When I start the program I get an unspecified launch failure after the cudaDeviceSynchronize() call. But when I try step-through debugging, everything is OK and works fine.
What am I doing wrong? (Visual Studio 2012 Professional, Compute Capability 2.1) I would really appreciate any help.
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define CUDA_CALL(x) do { if((x) != cudaSuccess) { \
printf("Error at %s:%d\n",__FILE__,__LINE__); \
printf("%s\n",cudaGetErrorString(x)); \
system("pause"); \
return EXIT_FAILURE;}} while(0)
extern __shared__ double shrd[];
__global__ void kernel(double * a){
size_t threadID_block = blockDim.x * threadIdx.y + threadIdx.x;
size_t blockID_global = (gridDim.x * blockIdx.y + blockIdx.x );
size_t threadID_global = blockID_global * blockDim.x * blockDim.y + threadID_block;
double * temp = &shrd[blockID_global * blockDim.x * blockDim.y];
temp[threadID_block] = static_cast<double>(threadID_global);
__syncthreads();
if (threadID_block == 0){
a[blockID_global] = 0.0;
for (size_t index = 0; index < blockDim.x * blockDim.y; index++){
a[blockID_global] += temp[index];
}
}
}
int main(){
int devNum = 0;
CUDA_CALL(cudaGetDevice(&devNum));
CUDA_CALL(cudaSetDevice(devNum));
dim3 gridSize(2,2,1);
dim3 blockSize(4,4,1);
double * dev_a = NULL;
size_t length = gridSize.x * gridSize.y ;
size_t byteSize = length * sizeof(double);
CUDA_CALL(cudaMalloc(&dev_a,byteSize));
size_t shmem_perBlock = blockSize.x * blockSize.y * sizeof(double);
kernel <<< gridSize, blockSize, shmem_perBlock >>> (dev_a);
CUDA_CALL(cudaGetLastError());
CUDA_CALL(cudaDeviceSynchronize());
double * a = new double [length];
CUDA_CALL(cudaMemcpy(a,dev_a,byteSize,cudaMemcpyDeviceToHost));
for (size_t index = 0; index < length; index++){
printf("%.3f\n",a[index]);
}
printf("\n");
CUDA_CALL(cudaFree(dev_a));
CUDA_CALL(cudaDeviceReset());
delete[]a;
system("pause");
return 0;
}
If you are on Kepler or later, first read this:
http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/
Otherwise, if you are pre-Kepler, read this:
http://developer.download.nvidia.com/compute/cuda/1.1-Beta/x86_website/projects/reduction/doc/reduction.pdf
There are some fundamentals you are missing in terms of CUDA programming. I have given you a template of your code below; it is there to clarify some of these fundamentals. Do not expect it to be optimized, as I am expecting you to program the parallel reduction yourself. This will get you started with an understanding of how to use shared memory.
Good Luck!
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define N 10000
#define K 100
#define CUDA_CALL(x) do { if((x) != cudaSuccess) { \
printf("Error at %s:%d\n",__FILE__,__LINE__); \
printf("%s\n",cudaGetErrorString(x)); \
system("pause"); \
return EXIT_FAILURE;}} while(0)
__global__ void kernel(double* a, double* results){
extern __shared__ double shared[];
size_t tid, tid_local, stride;
tid = blockDim.x*blockIdx.x+threadIdx.x; //thread id within all blocks
tid_local = threadIdx.x; //thread id within a block
stride = blockDim.x*gridDim.x; //total number of threads
double *start = &a[K*blockIdx.x]; //each block processes its own K-element chunk of a
shared[tid_local]=start[tid_local]; //copy K elements into shared memory
__syncthreads();
//Perform Parallel reduction, you will have to implement this
//After parallel reduction, result should be in shared[0]
//for demonstration I made the code serial for each block on thread 0.
//This is for demonstration only.
double sum=0;
if(tid_local==0){
for(int i=0; i<K; i++){
sum+=shared[i];
}
results[blockIdx.x]=sum; //write this block's sum to the results array
}
}
int main(){
int devNum = 0;
CUDA_CALL(cudaGetDevice(&devNum));
CUDA_CALL(cudaSetDevice(devNum));
double * dev_a = NULL;
double * dev_results=NULL;
CUDA_CALL(cudaMalloc(&dev_a, N*sizeof(double) ));
CUDA_CALL(cudaMalloc(&dev_results, (N/K)*sizeof(double)));
//copy dev_a onto GPU (this is the array you are summing).
dim3 block_size(K, 1, 1);
dim3 grid_size (N/K, 1, 1);
size_t shmem_perBlock = K * sizeof(double);
kernel <<< grid_size, block_size, shmem_perBlock >>> (dev_a, dev_results);
CUDA_CALL(cudaGetLastError());
CUDA_CALL(cudaDeviceSynchronize());
//copy dev_results back to CPU, this is your result.
CUDA_CALL(cudaFree(dev_a));
CUDA_CALL(cudaFree(dev_results));
system("pause");
return 0;
}
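For reference, the reduction that the template leaves as an exercise could look something like this (my sketch, not part of the original answer; it replaces the serial loop on thread 0 and handles a block size such as K = 100 that is not a power of two):
// tree reduction in shared memory; assumes the copy into shared[] and the
// __syncthreads() above have already happened
for (unsigned int s = blockDim.x; s > 1; ) {
    unsigned int half = (s + 1) / 2;           // round up so odd sizes keep their middle element
    if (tid_local < s - half)
        shared[tid_local] += shared[tid_local + half];
    __syncthreads();
    s = half;
}
if (tid_local == 0)
    results[blockIdx.x] = shared[0];           // per-block sum goes to the results array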

Shared memory mutex with CUDA - adding to a list of items

My problem is the following: I have an image in which I detect some points of interest using the GPU. The detection is a heavyweight test in terms of processing; however, only about 1 in 25 points passes the test on average. The final stage of the algorithm is to build up a list of the points. On the CPU this would be implemented as:
forall pixels x,y
{
if(test_this_pixel(x,y))
vector_of_coordinates.push_back(Vec2(x,y));
}
On the GPU I have each CUDA block processing 16x16 pixels. The problem is that I need to do something special to eventually have a single consolidated list of points in global memory. At the moment I am trying to generate a local list of points in shared memory per block which eventually will be written to global memory. I am trying to avoid sending anything back to the CPU because there are more CUDA stages after this.
I was expecting that I could use atomic operations to implement the push_back function on shared memory. However I am unable to get this working. There are two issues. The first annoying issue is that I am constantly running into the following compiler crash: "nvcc error : 'ptxas' died with status 0xC0000005 (ACCESS_VIOLATION)" when using atomic operations. It is hit or miss whether I can compile something. Does anyone know what causes this?
The following kernel will reproduce the error:
__global__ void gpu_kernel(int w, int h, RtmPoint *pPoints, int *pCounts)
{
__shared__ unsigned int test;
atomicInc(&test, 1000);
}
Secondly, my code, which includes a mutex lock on shared memory, hangs the GPU, and I don't understand why:
__device__ void lock(unsigned int *pmutex)
{
while(atomicCAS(pmutex, 0, 1) != 0);
}
__device__ void unlock(unsigned int *pmutex)
{
atomicExch(pmutex, 0);
}
__global__ void gpu_kernel_non_max_suppress(int w, int h, RtmPoint *pPoints, int *pCounts)
{
__shared__ RtmPoint localPoints[64];
__shared__ int localCount;
__shared__ unsigned int mutex;
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
int threadid = threadIdx.y * blockDim.x + threadIdx.x;
int blockid = blockIdx.y * gridDim.x + blockIdx.x;
if(threadid==0)
{
localCount = 0;
mutex = 0;
}
__syncthreads();
if(x<w && y<h)
{
if(some_test_on_pixel(x,y))
{
RtmPoint point;
point.x = x;
point.y = y;
// this is a local push_back operation
lock(&mutex);
if(localCount<64) // we should never get >64 points per block
localPoints[localCount++] = point;
unlock(&mutex);
}
}
__syncthreads();
if(threadid==0)
pCounts[blockid] = localCount;
if(threadid<localCount)
pPoints[blockid * 64 + threadid] = localPoints[threadid];
}
In the example code at this site, the author manages to successfully use atomic operations on shared memory, so I am confused as to why my case does not work. If I comment out the lock and unlock lines, the code runs OK, but obviously adds to the list incorrectly.
I would appreciate some advice about why this problem is happening and also perhaps if there is a better solution to achieving the goal, since I am concerned anyway about the performance issues with using atomic operations or mutex locks.
I suggest using a prefix sum to implement that part to increase parallelism. To do that you need to use a shared array. Basically, a prefix sum will turn an array (1,1,0,1) into (0,1,2,2), i.e., it will calculate an in-place running exclusive sum so that you get per-thread write indices.
__shared__ uint8_t vector[NUMTHREADS];
....
bool emit = (x<w && y<h);
emit = emit && some_test_on_pixel(x,y);
__syncthreads();
scan(emit, vector);
if (emit) {
pPoints[blockid * 64 + vector[TID]] = point;
}
prefix-sum example:
template <typename T>
__device__ uint32 scan(T mark, T *output) {
#define GET_OUT (pout?output:values)
#define GET_INP (pin?output:values)
__shared__ T values[numWorkers];
int pout=0, pin=1;
int tid = threadIdx.x;
values[tid] = mark;
__syncthreads();
for( int offset=1; offset < numWorkers; offset *= 2) {
pout = 1 - pout; pin = 1 - pout;
__syncthreads();
if ( tid >= offset) {
GET_OUT[tid] = (GET_INP[tid-offset]) +( GET_INP[tid]);
}
else {
GET_OUT[tid] = GET_INP[tid];
}
__syncthreads();
}
if(!pout)
output[tid] =values[tid];
__syncthreads();
return output[numWorkers-1];
#undef GET_OUT
#undef GET_INP
}
Based on the recommendations here, I include the code that I used in the end. It uses 16x16 pixel blocks. Note that I am now writing the data out in one global array without breaking it up. I used the global atomicAdd function to compute a base address for each set of results. Since this only gets called once per block, I did not find too much of a slowdown, while I gained a lot more convenience by doing this. I'm also avoiding shared buffers for the input and output of prefixsum. pGlobalCount is set to zero prior to the kernel call.
#define BLOCK_THREADS 256
__device__ int prefixsum(int threadid, int data)
{
__shared__ int temp[BLOCK_THREADS*2];
int pout = 0;
int pin = 1;
if(threadid==BLOCK_THREADS-1)
temp[0] = 0;
else
temp[threadid+1] = data;
__syncthreads();
for(int offset = 1; offset<BLOCK_THREADS; offset<<=1)
{
pout = 1 - pout;
pin = 1 - pin;
if(threadid >= offset)
temp[pout * BLOCK_THREADS + threadid] = temp[pin * BLOCK_THREADS + threadid] + temp[pin * BLOCK_THREADS + threadid - offset];
else
temp[pout * BLOCK_THREADS + threadid] = temp[pin * BLOCK_THREADS + threadid];
__syncthreads();
}
return temp[pout * BLOCK_THREADS + threadid];
}
__global__ void gpu_kernel(int w, int h, RtmPoint *pPoints, int *pGlobalCount)
{
__shared__ int write_base;
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
int threadid = threadIdx.y * blockDim.x + threadIdx.x;
int valid = 0;
if(x<w && y<h)
{
if(test_pixel(x,y))
{
valid = 1;
}
}
int index = prefixsum(threadid, valid);
if(threadid==BLOCK_THREADS-1)
{
int total = index + valid;
if(total>64)
total = 64; // global output buffer is limited to 64 points per block
write_base = atomicAdd(pGlobalCount, total); // get a location to write them out
}
__syncthreads(); // ensure write_base is valid for all threads
if(valid)
{
RtmPoint point;
point.x = x;
point.y = y;
if(index<64)
pPoints[write_base + index] = point;
}
}