CUDA graph does not run as expected

I'm using the following code to learn how to use CUDA graphs. The parameter NSTEP is set to 1000, and the parameter NKERNEL is set to 20. The kernel function shortKernel has three parameters and performs a simple calculation.
#include <cuda_runtime.h>
#include <iostream>

#define N 131072 // tuned such that kernel takes a few microseconds
#define NSTEP 1000
#define NKERNEL 20
#define BLOCKS 256
#define THREADS 512

#define CHECK(call) \
do { \
    const cudaError_t error_code = call; \
    if (error_code != cudaSuccess) { \
        printf("CUDA Error\n"); \
        printf("    File: %s\n", __FILE__); \
        printf("    Line: %d\n", __LINE__); \
        printf("    Error code: %d\n", error_code); \
        printf("    Error text: %s\n", cudaGetErrorString(error_code)); \
        exit(1); \
    } \
} while (0)

__global__ void shortKernel(float *out_d, float *in_d, int i) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < N) out_d[idx] = 1.23 * in_d[idx] + i;
}

void test2() {
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    cudaSetDevice(0);
    float x_host[N], y_host[N];
    // initialize x and y arrays on the host
    for (int i = 0; i < N; i++) {
        x_host[i] = 2.0f;
        y_host[i] = 2.0f;
    }
    float *x, *y, *z;
    CHECK(cudaMalloc((void**)&x, N*sizeof(float)));
    CHECK(cudaMalloc((void**)&y, N*sizeof(float)));
    CHECK(cudaMalloc((void**)&z, N*sizeof(float)));
    cudaMemcpy(x, x_host, sizeof(float) * N, cudaMemcpyHostToDevice);
    cudaEvent_t begin, end;
    CHECK(cudaEventCreate(&begin));
    CHECK(cudaEventCreate(&end));
    // start recording
    cudaEventRecord(begin, stream);
    bool graphCreated = false;
    cudaGraph_t graph;
    cudaGraphExec_t instance;
    // Run graphs
    for (int istep = 0; istep < NSTEP; istep++) {
        if (!graphCreated) {
            cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
            for (int ikrnl = 0; ikrnl < NKERNEL; ikrnl++) {
                shortKernel<<<BLOCKS, THREADS, 0, stream>>>(y, x, ikrnl);
            }
            cudaStreamEndCapture(stream, &graph);
            cudaGraphNode_t *nodes = NULL;
            size_t num_nodes = 0;
            CHECK(cudaGraphGetNodes(graph, nodes, &num_nodes));
            std::cout << "Num of nodes in the graph: " << num_nodes
                      << std::endl;
            CHECK(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));
            graphCreated = true;
        }
        CHECK(cudaGraphLaunch(instance, stream));
        cudaStreamSynchronize(stream);
    } // End run graphs
    cudaEventRecord(end, stream);
    cudaEventSynchronize(end);
    float time_ms = 0;
    cudaEventElapsedTime(&time_ms, begin, end);
    std::cout << "CUDA Graph - CUDA Kernel overall time: " << time_ms << " ms" << std::endl;
    cudaMemcpy(y_host, y, sizeof(float) * N, cudaMemcpyDeviceToHost);
    for (int i = 0; i < N; i++) {
        std::cout << "res " << y_host[i] << std::endl;
    }
    // Free memory
    cudaFree(x);
    cudaFree(y);
}

int main() {
    test2();
    std::cout << "end" << std::endl;
    return 0;
}
The results I expected are as follows:
res 2.46
res 3.46
res 4.46
res 5.46
res 6.46
...
However, the actual results look like this:
res 21.46
res 21.46
res 21.46
res 21.46
res 21.46
res 21.46
...
It seems that every kernel's parameter i is set to NKERNEL-1. I am very confused by this; could someone offer an explanation? Thanks!
I changed the for loop as follows:
// Run graphs
for (int istep = 0; istep < NSTEP; istep++) {
    if (!graphCreated) {
        cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
        for (int ikrnl = 0; ikrnl < NKERNEL; ikrnl++) {
            if (ikrnl == 0)
                shortKernel<<<BLOCKS, THREADS, 0, stream>>>(y, x, 0);
            else if (ikrnl == 1)
                shortKernel<<<BLOCKS, THREADS, 0, stream>>>(y, x, 1);
            else if (ikrnl == 2)
                shortKernel<<<BLOCKS, THREADS, 0, stream>>>(y, x, 2);
            else
                shortKernel<<<BLOCKS, THREADS, 0, stream>>>(y, x, ikrnl);
        }
        cudaStreamEndCapture(stream, &graph);
        cudaGraphNode_t *nodes = NULL;
        size_t num_nodes = 0;
        CHECK(cudaGraphGetNodes(graph, nodes, &num_nodes));
        std::cout << "Num of nodes in the graph: " << num_nodes
                  << std::endl;
        CHECK(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));
        graphCreated = true;
    }
    CHECK(cudaGraphLaunch(instance, stream));
    cudaStreamSynchronize(stream);
} // End run graphs
However, the results are still the same:
res 21.46
res 21.46
res 21.46
res 21.46
res 21.46
res 21.46
...

The results are expected and correct.
Every time you run the graph, this entire for-loop gets executed:
for (int ikrnl = 0; ikrnl < NKERNEL; ikrnl++) {
    shortKernel<<<BLOCKS, THREADS, 0, stream>>>(y, x, ikrnl);
}
After the first iteration of that for-loop, the results will all be 2.46, after the second iteration the results will all be 3.46, and after the 20th iteration (ikrnl = 19) the results will all be 21.46.
Every time you run the graph, you will get that same result.
Expecting any kind of variation in the result such as this:
res 2.46
res 3.46
res 4.46
res 5.46
res 6.46
is completely illogical, because every thread is doing precisely the same thing. Every thread starts with the same value in x and does the same calculation on it. There is no reason to expect any difference between y[0] and y[1], for example.
Rather than trying to wade through CUDA graphs, it's clear you don't yet have a good grasp of what the kernel is doing. My suggestion would be to write an ordinary CUDA code that calls that kernel just once, without any CUDA graph usage, and study the output. After that, you can put a for-loop around the kernel and watch the result after every iteration. You don't need CUDA graphs to understand what is going on here.
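As a minimal sketch of that experiment (reusing the kernel, launch configuration, and initial data from the question; illustrative only, not part of the original answer):

#include <cuda_runtime.h>
#include <iostream>
#define N 131072
#define BLOCKS 256
#define THREADS 512

__global__ void shortKernel(float *out_d, float *in_d, int i) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < N) out_d[idx] = 1.23 * in_d[idx] + i;
}

int main() {
    static float host[N];
    for (int i = 0; i < N; i++) host[i] = 2.0f;
    float *x, *y;
    cudaMalloc(&x, N * sizeof(float));
    cudaMalloc(&y, N * sizeof(float));
    cudaMemcpy(x, host, N * sizeof(float), cudaMemcpyHostToDevice);
    for (int i = 0; i < 20; i++) {  // same role as the NKERNEL loop
        shortKernel<<<BLOCKS, THREADS>>>(y, x, i);
        cudaMemcpy(host, y, N * sizeof(float), cudaMemcpyDeviceToHost);
        // x never changes, so every element of y is 1.23*2.0 + i = 2.46 + i
        std::cout << "after i=" << i << ": y[0]=" << host[0]
                  << " y[1]=" << host[1] << std::endl;
    }
    cudaFree(x); cudaFree(y);
    return 0;
}

After the last iteration (i = 19) every element is 2.46 + 19 = 21.46, which is exactly the output you observed.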

Related

CUDA Graph Problem: Results not computed for the first iteration

I am trying to utilize CUDA Graphs for the computation of Fast Fourier Transform (FFT) using CUDA's cuFFT APIs.
I modified the sample FFT code present on GitHub into the following FFT code using CUDA Graphs:
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "device_functions.h"
#include <iostream>
#include <cufft.h>

// Complex data type
typedef float2 Complex;
static __device__ inline Complex ComplexScale(Complex, float);
static __device__ inline Complex ComplexMul(Complex, Complex);
static __global__ void ComplexPointwiseMulAndScale(Complex*, const Complex*, int, float);

#define CUDA_CALL( call ) \
{ \
    cudaError_t result = call; \
    if ( cudaSuccess != result ) \
        std::cerr << "CUDA error " << result << " in " << __FILE__ << ":" << __LINE__ << ": " << cudaGetErrorString( result ) << " (" << #call << ")" << std::endl; \
}

#define CUDA_FFT_CALL( call ) \
{ \
    cufftResult result = call; \
    if ( CUFFT_SUCCESS != result ) \
        std::cerr << "FFT error " << result << " in " << __FILE__ << ":" << __LINE__ << ": " << result << std::endl; \
}

// The filter size is assumed to be a number smaller than the signal size
#define SIGNAL_SIZE 10
#define FILTER_KERNEL_SIZE 4

static __device__ inline Complex ComplexScale(Complex a, float s)
{
    Complex c;
    c.x = s * a.x;
    c.y = s * a.y;
    return c;
}

// Complex multiplication
static __device__ inline Complex ComplexMul(Complex a, Complex b)
{
    Complex c;
    c.x = a.x * b.x - a.y * b.y;
    c.y = a.x * b.y + a.y * b.x;
    return c;
}

// Complex pointwise multiplication
static __global__ void ComplexPointwiseMulAndScale(Complex* a, const Complex* b, int size, float scale)
{
    const int numThreads = blockDim.x * gridDim.x;
    const int threadID = blockIdx.x * blockDim.x + threadIdx.x;
    for (int i = threadID; i < size; i += numThreads)
    {
        a[i] = ComplexScale(ComplexMul(a[i], b[i]), scale);
    }
}

int main()
{
    printf("[simpleCUFFT] is starting...\n");
    int minRadius = FILTER_KERNEL_SIZE / 2;
    int maxRadius = FILTER_KERNEL_SIZE - minRadius;
    int padded_data_size = SIGNAL_SIZE + maxRadius;

    // Allocate HOST memories
    Complex* h_signal = (Complex*)malloc(sizeof(Complex) * SIGNAL_SIZE); // host signal
    Complex* h_filter_kernel = (Complex*)malloc(sizeof(Complex) * FILTER_KERNEL_SIZE); // host filter
    Complex* h_padded_signal = (Complex*)malloc(sizeof(Complex) * padded_data_size); // host padded signal
    Complex* h_padded_filter_kernel = (Complex*)malloc(sizeof(Complex) * padded_data_size); // host padded filter kernel
    Complex* h_convolved_signal = (Complex*)malloc(sizeof(Complex) * padded_data_size); // to store convolution RESULTS
    memset(h_convolved_signal, 0, padded_data_size * sizeof(Complex));

    // Allocate DEVICE memories
    Complex* d_signal; // device signal
    cudaMalloc((void**)&d_signal, sizeof(Complex) * padded_data_size);
    Complex* d_filter_kernel;
    cudaMalloc((void**)&d_filter_kernel, sizeof(Complex) * padded_data_size); // device kernel

    // CUDA GRAPH
    bool graphCreated = false;
    cudaGraph_t graph;
    cudaGraphExec_t instance;
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // CUFFT plan
    cufftHandle plan;
    CUDA_FFT_CALL(cufftPlan1d(&plan, padded_data_size, CUFFT_C2C, 1));
    cufftSetStream(plan, stream); // bind plan to the stream

    // Initialize the memory for the signal
    for (unsigned int i = 0; i < SIGNAL_SIZE; ++i)
    {
        h_signal[i].x = rand() / (float)RAND_MAX;
        h_signal[i].y = 0;
    }
    // Initialize the memory for the filter
    for (unsigned int i = 0; i < FILTER_KERNEL_SIZE; ++i)
    {
        h_filter_kernel[i].x = rand() / (float)RAND_MAX;
        h_filter_kernel[i].y = 0;
    }

    // REPEAT 3 times
    int nRepeatationsNeeded = 3;
    for (int repeatations = 0; repeatations < nRepeatationsNeeded; repeatations++)
    {
        std::cout << "\n\n" << "Repeatation ------ " << repeatations << std::endl;
        if (!graphCreated)
        {
            // Start Graph Recording
            CUDA_CALL(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
            // Pad data
            CUDA_CALL(cudaMemcpyAsync(h_padded_signal + 0, h_signal, SIGNAL_SIZE * sizeof(Complex), cudaMemcpyHostToHost, stream));
            memset(h_padded_signal + SIGNAL_SIZE, 0, (padded_data_size - SIGNAL_SIZE) * sizeof(Complex));
            //CUDA_CALL(cudaMemsetAsync(h_padded_signal + SIGNAL_SIZE, 0, (padded_data_size - SIGNAL_SIZE) * sizeof(Complex), stream));
            CUDA_CALL(cudaMemcpyAsync(h_padded_filter_kernel + 0, h_filter_kernel + minRadius, maxRadius * sizeof(Complex), cudaMemcpyHostToHost, stream));
            /*CUDA_CALL(cudaMemsetAsync(h_padded_filter_kernel + maxRadius, 0, (padded_data_size - FILTER_KERNEL_SIZE) * sizeof(Complex), stream));*/
            memset(h_padded_filter_kernel + maxRadius, 0, (padded_data_size - FILTER_KERNEL_SIZE) * sizeof(Complex));
            CUDA_CALL(cudaMemcpyAsync(h_padded_filter_kernel + padded_data_size - minRadius, h_filter_kernel, minRadius * sizeof(Complex), cudaMemcpyHostToHost, stream));
            // MemCpy H to D
            CUDA_CALL(cudaMemcpyAsync(d_signal, h_padded_signal, sizeof(Complex) * padded_data_size, cudaMemcpyHostToDevice, stream)); // signal
            CUDA_CALL(cudaMemcpyAsync(d_filter_kernel, h_padded_filter_kernel, sizeof(Complex) * padded_data_size, cudaMemcpyHostToDevice, stream)); // kernel
            // Compute FFT
            CUDA_FFT_CALL(cufftExecC2C(plan, (cufftComplex*)d_signal, (cufftComplex*)d_signal, CUFFT_FORWARD)); // transform signal
            CUDA_FFT_CALL(cufftExecC2C(plan, (cufftComplex*)d_filter_kernel, (cufftComplex*)d_filter_kernel, CUFFT_FORWARD)); // transform kernel
            ComplexPointwiseMulAndScale<<<64, 1, 0, stream>>>(d_signal, d_filter_kernel, padded_data_size, 1.0f / padded_data_size); // multiply and normalize
            CUDA_CALL(cudaGetLastError());
            CUDA_FFT_CALL(cufftExecC2C(plan, (cufftComplex*)d_signal, (cufftComplex*)d_signal, CUFFT_INVERSE)); // transform signal back
            // Copy device memory to host
            CUDA_CALL(cudaMemcpyAsync(h_convolved_signal, d_signal, sizeof(Complex) * padded_data_size, cudaMemcpyDeviceToHost, stream));
            // End Graph Recording
            CUDA_CALL(cudaStreamEndCapture(stream, &graph));
            CUDA_CALL(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));
            graphCreated = true;
        }
        else
        {
            CUDA_CALL(cudaGraphLaunch(instance, stream));
            CUDA_CALL(cudaStreamSynchronize(stream));
        }
        // Verify results
        for (int i = 0; i < SIGNAL_SIZE; i++)
            std::cout << "index: " << i << ", fft: " << h_convolved_signal[i].x << std::endl;
    }

    // Destroy CUFFT context
    cufftDestroy(plan);
    // Cleanup memory
    cudaStreamDestroy(stream);
    free(h_signal);
    free(h_filter_kernel);
    free(h_padded_signal);
    free(h_padded_filter_kernel);
    cudaFree(d_signal);
    cudaFree(d_filter_kernel);
    return 0;
}
PROBLEM: In the output of the above program, the values of the result are all ZEROS for the first iteration. How can I resolve this?
The results are zero for the first iteration, because for the first iteration, the work is all issued in capture mode.
In capture mode, no CUDA work actually gets done. From here:
When a stream is being captured, work launched into the stream is not enqueued for execution.
I pointed you to this same area of the documentation in a comment to your last question. You might wish to read the entire programming guide section on graphs, and there are also blogs available.
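One way to restructure the loop accordingly (a sketch assuming the same variables and macros as the code above, not a tested drop-in): capture and instantiate once, then launch the instantiated graph on every iteration, including the first.

if (!graphCreated)
{
    CUDA_CALL(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
    // ... record the same padding, copies, FFTs, and kernel as above ...
    CUDA_CALL(cudaStreamEndCapture(stream, &graph));
    CUDA_CALL(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));
    graphCreated = true;
}
// Launch unconditionally: work recorded during capture was never executed,
// so the first iteration also needs to launch the instantiated graph.
CUDA_CALL(cudaGraphLaunch(instance, stream));
CUDA_CALL(cudaStreamSynchronize(stream));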

In CUDA, is it possible to write a dense array from a sparse array with an expected sequence?

There is an array1 whose entries are 0 or 1 (one entry per thread in a thread block):
bool array1[]: [1, 1, 0, 0, 1, 1]
Each thread in the thread block accesses array1 using threadIdx.x.
I need to build a shared, dense array2, where each value is the ID of a thread whose array1 entry is 1:
__shared__ bool array2[] (thread ID): [0, 1, 4, 5]
It seems that, at a minimum, I need an atomicAdd() operation to index array2. But even with atomicAdd(), I think it is hard to make array2 come out in the above sequence (0, 1, 4, 5).
Is it possible to make array2 from array1 in CUDA (for each thread block)?
You can use coalesced groups. Suppose the Boolean that was read is threadIsIn:
#include <cooperative_groups.h>
namespace cg = cooperative_groups;

uint32_t tid = threadIdx.x;
const uint32_t warpLength = 32;
uint32_t warpIdx = tid / warpLength;
if (threadIsIn) {
    auto active = cg::coalesced_threads();
    uint32_t idx = active.thread_rank() + warpIdx * warpLength;
    array2[idx] = tid;
}
Edit
A solution with multiple warps in a block: the first warp of the block prepares the shared array for the rest of the warps in the block, which means the other warps must wait for the first warp to finish.
cg::thread_block block = cg::this_thread_block();
uint32_t tid = threadIdx.x;
const uint32_t warpLength = 32;
uint32_t warpIdx = tid / warpLength;
uint32_t startIdx = 0;
uint32_t tidToWrite = tid;
uint32_t maxItr = blockSize / warpLength; // blockSize: threads per block, assumed defined by the caller
uint32_t itr = 0;
uint32_t arr1Idx = tid; // assumed: this thread's read position in arr1 (not declared in the original snippet)
while (warpIdx == 0 && itr < maxItr) {
    auto warp = cg::coalesced_threads();
    auto warpMask = warp.ballot(threadIsIn); // the tid'th bit is set to 1 if threadIsIn is true for tid
    uint32_t trueThreadsSize = __popc(warpMask); // counts the number of bits that are set to 1
    if (threadIsIn) {
        auto active = cg::coalesced_threads();
        // active.size() has the same value as trueThreadsSize
        array2[startIdx + active.thread_rank()] = tidToWrite;
    }
    startIdx += trueThreadsSize;
    tidToWrite += warpLength;
    ++itr;
    arr1Idx += warpLength;
    threadIsIn = arr1[arr1Idx];
}
block.sync();
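For reference, the same single-warp compaction can be written with raw warp intrinsics instead of cooperative groups (a sketch; warpCompact and its parameters are illustrative names, not from the answer above):

__device__ void warpCompact(bool threadIsIn, uint32_t tid,
                            uint32_t *array2, uint32_t startIdx) {
    unsigned mask = __ballot_sync(0xFFFFFFFFu, threadIsIn); // one bit per lane
    if (threadIsIn) {
        unsigned lane = tid % 32;
        // rank = number of lower lanes that are also "in"
        unsigned rank = __popc(mask & ((1u << lane) - 1u));
        array2[startIdx + rank] = tid;
    }
}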
This is in a general category of problems called stream compaction. The canonical approach is to perform a prefix sum (scan operation) on a processed version of your data (converting the kept values to 1, the discarded values to 0), then use that prefix sum as the index to write to, in the output array.
CUB provides a convenient block-level scan operation, so we don't have to write our own. Thereafter, the indexed copy is trivial:
$ cat t1465.cu
#include <cub/cub.cuh>
#include <iostream>
#include <cstdlib>

const int nTPB = 1024;
const int ds = nTPB;

__global__ void BlockCompactKernel(bool *data, int *result, int *data_size)
{
    // Specialize BlockScan for a 1D block of nTPB threads on type int
    typedef cub::BlockScan<int, nTPB> BlockScan;
    // Allocate shared memory for BlockScan
    __shared__ typename BlockScan::TempStorage temp_storage;
    // Obtain a segment of consecutive items that are blocked across threads
    int scan_data[1];
    // load data
    bool tmp = data[threadIdx.x];
    // process data
    scan_data[0] = (tmp) ? 1 : 0;
    // scan data
    // Collectively compute the block-wide exclusive prefix sum
    BlockScan(temp_storage).ExclusiveSum(scan_data, scan_data);
    // indexed copy
    if (tmp) result[scan_data[0]] = threadIdx.x;
    // optional: return result size
    if (threadIdx.x == nTPB-1) *data_size = scan_data[0] + ((tmp) ? 1 : 0);
}

int main(){
    bool *d_data, *data = new bool[ds];
    int data_size, *d_data_size, *d_result, *result = new int[ds];
    cudaMalloc(&d_data_size, sizeof(d_data_size[0]));
    cudaMalloc(&d_result, ds*sizeof(d_result[0]));
    for (int i = 0; i < ds; i++) data[i] = (rand() > (RAND_MAX/2)) ? true : false;
    std::cout << "Original data:" << std::endl;
    for (int i = 0; i < ds; i++) std::cout << (int)data[i] << ",";
    cudaMalloc(&d_data, ds*sizeof(d_data[0]));
    cudaMemcpy(d_data, data, ds*sizeof(d_data[0]), cudaMemcpyHostToDevice);
    BlockCompactKernel<<<1,nTPB>>>(d_data, d_result, d_data_size);
    cudaMemcpy(&data_size, d_data_size, sizeof(d_data_size[0]), cudaMemcpyDeviceToHost);
    cudaMemcpy(result, d_result, data_size*sizeof(d_result[0]), cudaMemcpyDeviceToHost);
    std::cout << std::endl << "Compacted data:" << std::endl;
    for (int i = 0; i < data_size; i++) std::cout << result[i] << ",";
    std::cout << std::endl;
}
$ nvcc -o t1465 t1465.cu
$ cuda-memcheck ./t1465
========= CUDA-MEMCHECK
Original data:
1,0,1,1,1,0,0,1,0,1,0,1,0,1,1,1,1,1,0,1,0,0,0,1,0,0,0,0,1,0,1,1,1,0,1,1,0,1,0,1,1,1,0,1,0,0,1,1,0,1,1,0,0,1,1,0,0,0,0,0,0,1,1,1,0,1,0,1,1,1,1,0,0,1,1,1,0,1,1,0,1,0,0,1,1,0,0,1,0,1,1,1,1,1,0,1,0,1,1,1,0,0,1,1,0,1,1,0,1,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,1,1,1,0,1,0,0,0,1,0,0,1,0,1,0,1,0,1,0,1,0,0,1,1,1,1,1,1,0,0,1,0,0,0,0,1,1,1,0,0,1,0,1,0,1,0,1,0,0,1,1,0,0,0,1,1,0,1,1,0,1,0,1,1,0,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,0,1,1,0,1,0,1,0,1,0,0,0,0,1,0,1,1,1,0,1,1,0,0,1,1,0,0,1,0,0,0,1,1,0,1,0,0,0,1,0,1,0,0,1,1,1,0,0,1,1,1,0,1,0,1,1,1,0,1,0,0,1,1,0,0,0,1,1,1,0,1,0,0,0,1,0,1,0,0,0,1,1,1,1,0,1,0,1,1,1,1,0,1,1,0,1,1,1,0,1,0,0,1,0,0,1,0,0,0,1,1,0,1,0,1,1,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,0,0,1,1,1,0,1,0,1,0,0,0,1,1,1,0,1,0,0,0,1,1,0,0,1,1,0,0,1,1,0,0,1,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,1,1,0,0,1,0,0,1,1,1,1,0,1,0,0,1,0,0,0,1,0,0,1,0,1,1,0,0,1,1,1,0,0,0,1,0,1,1,1,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,1,1,1,1,1,1,0,1,0,1,1,1,1,1,0,0,1,1,1,0,0,0,1,0,1,1,1,0,0,0,0,1,1,0,0,1,1,1,0,1,0,0,1,1,1,1,0,0,1,1,1,1,1,0,1,1,1,0,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,0,1,0,0,1,1,1,1,0,1,1,1,1,0,1,0,1,1,1,1,0,0,1,0,1,0,1,0,0,0,0,0,0,1,1,1,1,0,0,1,0,0,1,1,0,0,1,1,1,0,0,1,1,1,1,1,0,1,0,0,1,0,1,1,0,0,1,1,1,0,1,1,0,0,1,1,1,0,0,0,1,1,0,1,1,0,0,0,1,1,1,1,0,1,1,0,0,0,0,1,1,1,1,1,0,0,0,1,1,1,0,0,1,0,1,1,0,0,1,0,1,1,1,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1,1,0,1,0,0,0,0,0,0,1,1,0,0,0,1,1,0,1,0,1,0,1,1,0,1,1,0,0,1,1,1,1,1,0,1,0,1,0,1,0,0,0,0,0,1,0,1,1,0,1,0,0,1,0,1,0,1,1,1,1,1,0,0,1,1,0,1,0,0,1,0,0,1,1,0,0,1,0,0,1,0,1,0,1,1,1,0,1,1,1,0,0,0,1,1,1,1,1,1,1,0,0,0,0,0,0,0,1,1,1,0,0,0,0,1,0,0,0,1,1,0,1,1,0,0,1,1,0,1,1,1,1,0,1,1,0,0,0,1,0,1,0,0,1,1,0,1,1,0,1,0,0,1,0,1,0,0,0,0,1,0,1,1,0,1,1,0,1,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,1,1,1,1,0,0,1,0,1,0,0,1,1,1,1,0,1,1,1,0,1,0,0,0,1,0,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,0,0,1,1,1,0,1,0,0,1,0,0,0,0,0,1,1,0,1,1,1,0,0,1,1,1,0,1,1,1,1,1,0,1,1,1,1,0,0,1,0,0,0,0,0,1,0,0,1,1,0,1,1,0,0,0,0,1,0,1,0,1,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,1,1,1,0,0,1,1,0,1,0,1,1,1,0,1,1,1,0,0,1,0,1,0,0,1,0,1,1,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,1,1,1,1,0,
Compacted data:
0,2,3,4,7,9,11,13,14,15,16,17,19,23,28,30,31,32,34,35,37,39,40,41,43,46,47,49,50,53,54,61,62,63,65,67,68,69,70,73,74,75,77,78,80,83,84,87,89,90,91,92,93,95,97,98,99,102,103,105,106,108,110,116,119,123,124,125,126,128,132,135,137,139,141,143,146,147,148,149,150,151,154,159,160,161,164,166,168,170,173,174,178,179,181,182,184,186,187,189,190,191,192,193,195,196,197,198,199,200,201,202,203,204,207,208,210,212,214,219,221,222,223,225,226,229,230,233,237,238,240,244,246,249,250,251,254,255,256,258,260,261,262,264,267,268,272,273,274,276,280,282,286,287,288,289,291,293,294,295,296,298,299,301,302,303,305,308,311,315,316,318,320,321,329,330,331,332,333,337,338,343,349,350,352,353,356,357,358,360,362,366,367,368,370,374,375,378,379,382,383,386,391,392,397,398,401,402,403,404,407,410,411,412,413,415,418,422,425,427,428,431,432,433,437,439,440,441,448,450,455,457,458,459,460,461,462,464,466,467,468,469,470,473,474,475,479,481,482,483,488,489,492,493,494,496,499,500,501,502,505,506,507,508,509,511,512,513,515,516,517,518,519,520,521,522,524,525,526,527,528,529,531,534,535,536,537,539,540,541,542,544,546,547,548,549,552,554,556,563,564,565,566,569,572,573,576,577,578,581,582,583,584,585,587,590,592,593,596,597,598,600,601,604,605,606,610,611,613,614,618,619,620,621,623,624,629,630,631,632,633,637,638,639,642,644,645,648,650,651,652,653,658,662,667,668,670,677,678,682,683,685,687,689,690,692,693,696,697,698,699,700,702,704,706,712,714,715,717,720,722,724,725,726,727,728,731,732,734,737,740,741,744,747,749,751,752,753,755,756,757,761,762,763,764,765,766,767,775,776,777,782,786,787,789,790,793,794,796,797,798,799,801,802,806,808,811,812,814,815,817,820,822,827,829,830,832,833,835,836,839,847,851,852,853,854,855,858,860,863,864,865,866,868,869,870,872,876,878,879,880,881,882,883,884,885,886,887,888,890,891,895,896,897,899,902,908,909,911,912,913,916,917,918,920,921,922,923,924,926,927,928,929,932,938,941,942,944,945,950,952,954,955,961,964,968,973,975,976,977,980,981,983,985,986,987,989,990,991,994,996,999,1001,1002,1004,1008,1011,1014,1019,1020,1021,1022,
========= ERROR SUMMARY: 0 errors
$

CUB reduction using 2D grid of blocks

I'm trying to make a sum using the CUB reduction method.
The big problem is:
I'm not sure how to return the values of each block to the host when using 2-dimensional grids.
#include <iostream>
#include <math.h>
#include <cub/block/block_reduce.cuh>
#include <cub/block/block_load.cuh>
#include <cub/block/block_store.cuh>
#include <iomanip>

#define nat 1024
#define BLOCK_SIZE 32
#define GRID_SIZE 32

struct frame
{
    int natm;
    char title[100];
    float conf[nat][3];
};

using namespace std;
using namespace cub;

__global__
void add(frame* s, float L, float rc, float* blocksum)
{
    int i = blockDim.x*blockIdx.x + threadIdx.x;
    int j = blockDim.y*blockIdx.y + threadIdx.y;
    float E=0.0, rij, dx, dy, dz;
    // Your calculations first so that each thread holds its result
    dx = fabs(s->conf[j][0] - s->conf[i][0]);
    dy = fabs(s->conf[j][1] - s->conf[i][1]);
    dz = fabs(s->conf[j][2] - s->conf[i][2]);
    dx = dx - round(dx/L)*L;
    dy = dy - round(dy/L)*L;
    dz = dz - round(dz/L)*L;
    rij = sqrt(dx*dx + dy*dy + dz*dz);
    if ((rij <= rc) && (rij > 0.0))
        {E = (4*((1/pow(rij,12))-(1/pow(rij,6))));}
    // E = 1.0;
    __syncthreads();
    // Block wise reduction so that one thread in each block holds sum of thread results
    typedef cub::BlockReduce<float, BLOCK_SIZE, BLOCK_REDUCE_RAKING, BLOCK_SIZE> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    float aggregate = BlockReduce(temp_storage).Sum(E);
    if (threadIdx.x == 0 && threadIdx.y == 0)
        blocksum[blockIdx.x*blockDim.y + blockIdx.y] = aggregate;
}

int main(void)
{
    frame * state = (frame*)malloc(sizeof(frame));
    float *blocksum = (float*)malloc(GRID_SIZE*GRID_SIZE*sizeof(float));
    state->natm = nat; // initializing the number of atoms
    char name[] = "estado1";
    strcpy(state->title,name);
    for (int i = 0; i < nat; i++) {
        state->conf[i][0] = i;
        state->conf[i][1] = i;
        state->conf[i][2] = i;
    }
    frame * d_state;
    float *d_blocksum;
    cudaMalloc((void**)&d_state, sizeof(frame));
    cudaMalloc((void**)&d_blocksum, ((GRID_SIZE*GRID_SIZE)*sizeof(float)));
    cudaMemcpy(d_state, state, sizeof(frame), cudaMemcpyHostToDevice);
    dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);
    dim3 gridBlock(GRID_SIZE,GRID_SIZE);
    add<<<gridBlock,dimBlock>>>(d_state, 3000, 15, d_blocksum);
    cudaError_t status = cudaMemcpy(blocksum, d_blocksum, ((GRID_SIZE*GRID_SIZE)*sizeof(float)), cudaMemcpyDeviceToHost);
    float Etotal = 0.0;
    for (int k = 0; k < GRID_SIZE*GRID_SIZE; k++){
        Etotal += blocksum[k];
    }
    cout << endl << "energy: " << Etotal << endl;
    if (cudaSuccess != status)
    {
        cout << cudaGetErrorString(status) << endl;
    }
    // Free memory
    cudaFree(d_state);
    cudaFree(d_blocksum);
    return cudaThreadExit();
}
What is happening is that if the value of GRID_SIZE is the same as BLOCK_SIZE, as written above, the calculation is correct. But if I change the value of GRID_SIZE, the result goes wrong, which leads me to think that the error is in this line:
blocksum[blockIdx.x*blockDim.y + blockIdx.y] = aggregate;
The idea here is to return a 1D array which contains the sum of each block.
I do not intend to change the BLOCK_SIZE value, but the value of GRID_SIZE depends on the system I'm looking at; I intend to use values greater than 32 (always multiples of that).
I looked for examples that use a 2D grid with CUB, but did not find any.
I am really new to CUDA programming; maybe I'm making a mistake.
Edit: I put in the complete code.
For comparison, when I calculate these exact values with a serial program, it gives me energy: -297,121
Probably the main issue is that your output indexing is not correct. Here's a reduced version of your code demonstrating correct results for arbitrary GRID_SIZE:
$ cat t1360.cu
#include <stdio.h>
#include <cub/cub.cuh>

#define BLOCK_SIZE 32
#define GRID_SIZE 25

__global__
void add(float* blocksum)
{
    float E = 1.0;
    // Block wise reduction so that one thread in each block holds sum of thread results
    typedef cub::BlockReduce<float, BLOCK_SIZE, cub::BLOCK_REDUCE_RAKING, BLOCK_SIZE> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    float aggregate = BlockReduce(temp_storage).Sum(E);
    __syncthreads();
    if (threadIdx.x == 0 && threadIdx.y == 0)
        blocksum[blockIdx.y*gridDim.x + blockIdx.x] = aggregate;
}

int main(){
    float *d_result, *h_result;
    h_result = (float *)malloc(GRID_SIZE*GRID_SIZE*sizeof(float));
    cudaMalloc(&d_result, GRID_SIZE*GRID_SIZE*sizeof(float));
    dim3 grid = dim3(GRID_SIZE,GRID_SIZE);
    dim3 block = dim3(BLOCK_SIZE, BLOCK_SIZE);
    add<<<grid, block>>>(d_result);
    cudaMemcpy(h_result, d_result, GRID_SIZE*GRID_SIZE*sizeof(float), cudaMemcpyDeviceToHost);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {printf("cuda error: %s\n", cudaGetErrorString(err)); return -1;}
    float result = 0;
    for (int i = 0; i < GRID_SIZE*GRID_SIZE; i++) result += h_result[i];
    if (result != (float)(GRID_SIZE*GRID_SIZE*BLOCK_SIZE*BLOCK_SIZE)) printf("mismatch, should be: %f, was: %f\n", (float)(GRID_SIZE*GRID_SIZE*BLOCK_SIZE*BLOCK_SIZE), result);
    else printf("Success\n");
    return 0;
}
$ nvcc -o t1360 t1360.cu
$ ./t1360
Success
$
The important change I made to your kernel code was in the output indexing:
blocksum[blockIdx.y*gridDim.x + blockIdx.x] = aggregate;
We want a simulated 2D index into an array that has width and height of GRID_SIZE consisting of one float quantity per point. Therefore the width of this array is given by gridDim.x (not blockDim). The gridDim variable gives the dimensions of the grid in terms of blocks - and this lines up exactly with how our results array is set up.
Your posted code will fail whenever GRID_SIZE and BLOCK_SIZE are different, because of this mixup between blockDim and gridDim: if GRID_SIZE is smaller than BLOCK_SIZE, cuda-memcheck will show illegal accesses, and if GRID_SIZE is larger than BLOCK_SIZE, the indexing error will cause blocks to overwrite each other's values in the output array.
Also note that float operations typically only have around 5 decimal digits of precision. So small differences in the 5th or 6th decimal place may be attributable to order of operations differences when doing floating-point arithmetic. You can prove this to yourself by switching to double arithmetic.
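As a concrete check of that indexing (with hypothetical values): for GRID_SIZE = 25, the block at blockIdx = (3, 2) writes to blocksum[2*25 + 3] = blocksum[53]. The original expression blockIdx.x*blockDim.y + blockIdx.y would compute 3*32 + 2 = 98 instead, and the last block (24, 24) would compute 24*32 + 24 = 792, past the end of the 625-element blocksum array, which is the illegal access described above.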

cuDNN Status Not Supported when trying to use FFT Convolution

I am trying to use the cuDNN library to do an FFT convolution. The code runs when I use the Winograd convolution or the cuDNN method that selects the fastest convolution method, but when I try to run with the FFT convolution method, it does not work.
I set the forward method to FFT convolution myself.
I checked the documents and my input is in NCHW format as required for the FFT convolution. From the docs:
CUDNN_CONVOLUTION_FWD_ALGO_FFT
xDesc Format Support: NCHW HW-packed
yDesc Format Support: NCHW HW-packed
The error "CUDNN_STATUS_NOT_SUPPORTED" happens during the cudnnGetConvolutionForwardWorkspaceSize function call.
What is happening to cause this error when I use the FFT convolution vs. best or Winograd?
For reference, I am using CUDA 9.1 and cuDNN 7. I compile and run with the following commands on Ubuntu 16.04: nvcc -arch=sm_35 -std=c++11 -O2 -lcudnn FFT_cuDNN.cu -o conv `pkg-config --cflags --libs opencv`; ./conv TF.png
#include <cudnn.h>
#include <cassert>
#include <cstdlib>
#include <iostream>
#include <opencv2/opencv.hpp>
#include <opencv2/dnn.hpp>

using namespace cv;
using namespace cv::dnn;

#define checkCUDNN(expression) \
{ \
    cudnnStatus_t status = (expression); \
    if (status != CUDNN_STATUS_SUCCESS) { \
        std::cerr << "Error on line " << __LINE__ << ": " \
                  << cudnnGetErrorString(status) << std::endl; \
        std::exit(EXIT_FAILURE); \
    } \
}

cv::Mat load_image_NCHW(const char* image_path)
{
    cv::Mat image = cv::imread(image_path, cv::IMREAD_COLOR);
    image.convertTo(image, CV_32FC3);
    cv::normalize(image, image, 0, 1, cv::NORM_MINMAX);
    cv::Mat inputBlob = blobFromImage(image, 1.0f, cv::Size(image.rows, image.cols), cv::Scalar(0,0,0));
    return inputBlob;
}

void save_image(const char* output_filename,
                float* buffer,
                int height,
                int width) {
    cv::Mat output_image(height, width, CV_32FC3, buffer);
    // Make negative values zero.
    cv::threshold(output_image,
                  output_image,
                  /*threshold=*/0,
                  /*maxval=*/0,
                  cv::THRESH_TOZERO);
    cv::normalize(output_image, output_image, 0.0, 255.0, cv::NORM_MINMAX);
    output_image.convertTo(output_image, CV_8UC3);
    cv::imwrite(output_filename, output_image);
    std::cerr << "Wrote output to " << output_filename << std::endl;
}

int main(int argc, const char* argv[]) {
    if (argc < 2) {
        std::cerr << "usage: conv <image> [gpu=0] [sigmoid=0]" << std::endl;
        std::exit(EXIT_FAILURE);
    }
    int gpu_id = (argc > 2) ? std::atoi(argv[2]) : 0;
    std::cerr << "GPU: " << gpu_id << std::endl;
    bool with_sigmoid = (argc > 3) ? std::atoi(argv[3]) : 0;
    std::cerr << "With sigmoid: " << std::boolalpha << with_sigmoid << std::endl;
    // Load the image
    cv::Mat image = load_image_NCHW(argv[1]);
    int imgH = 600;
    int imgW = 561;
    int inC = 3;
    // Set GPU to use
    cudaSetDevice(gpu_id);
    // Create the cuDNN handle
    cudnnHandle_t cudnn;
    checkCUDNN(cudnnCreate(&cudnn));
    // Need a descriptor for
    // the input, kernel, and convolution
    cudnnTensorDescriptor_t input_descriptor;
    checkCUDNN(cudnnCreateTensorDescriptor(&input_descriptor));
    checkCUDNN(cudnnSetTensor4dDescriptor(input_descriptor,
                                          /*format=*/CUDNN_TENSOR_NCHW,
                                          /*dataType=*/CUDNN_DATA_FLOAT,
                                          /*batch_size=*/1,
                                          /*channels=*/inC,
                                          /*image_height=*/imgH,
                                          /*image_width=*/imgW));
    cudnnFilterDescriptor_t kernel_descriptor;
    checkCUDNN(cudnnCreateFilterDescriptor(&kernel_descriptor));
    checkCUDNN(cudnnSetFilter4dDescriptor(kernel_descriptor,
                                          /*dataType=*/CUDNN_DATA_FLOAT,
                                          /*format=*/CUDNN_TENSOR_NCHW,
                                          /*out_channels=*/3,
                                          /*in_channels=*/inC,
                                          /*kernel_height=*/3,
                                          /*kernel_width=*/3));
    cudnnConvolutionDescriptor_t convolution_descriptor;
    checkCUDNN(cudnnCreateConvolutionDescriptor(&convolution_descriptor));
    checkCUDNN(cudnnSetConvolution2dDescriptor(convolution_descriptor,
                                               /*pad_height=*/1,
                                               /*pad_width=*/1,
                                               /*vertical_stride=*/1,
                                               /*horizontal_stride=*/1,
                                               /*dilation_height=*/1,
                                               /*dilation_width=*/1,
                                               /*mode=*/CUDNN_CROSS_CORRELATION,
                                               /*computeType=*/CUDNN_DATA_FLOAT));
    // Need to compute the output size
    int batch_size{0}, channels{0}, height{0}, width{0};
    checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convolution_descriptor,
                                                     input_descriptor,
                                                     kernel_descriptor,
                                                     &batch_size,
                                                     &channels,
                                                     &height,
                                                     &width));
    std::cerr << "Output Image: " << height << " x " << width << " x " << channels
              << std::endl;
    // Need an output descriptor
    cudnnTensorDescriptor_t output_descriptor;
    checkCUDNN(cudnnCreateTensorDescriptor(&output_descriptor));
    checkCUDNN(cudnnSetTensor4dDescriptor(output_descriptor,
                                          /*format=*/CUDNN_TENSOR_NCHW,
                                          /*dataType=*/CUDNN_DATA_FLOAT,
                                          /*batch_size=*/1,
                                          /*channels=*/3,
                                          /*image_height=*/imgH,
                                          /*image_width=*/imgW));
    // Need to define the forward algorithm
    cudnnConvolutionFwdAlgo_t convolution_algorithm = CUDNN_CONVOLUTION_FWD_ALGO_FFT;
    // Have to compute the workspace size
    size_t workspace_bytes{0};
    checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnn,
                                                       input_descriptor,
                                                       kernel_descriptor,
                                                       convolution_descriptor,
                                                       output_descriptor,
                                                       convolution_algorithm,
                                                       &workspace_bytes));
    std::cerr << "Workspace size: " << (workspace_bytes / 1048576.0) << "MB"
              << std::endl;
    assert(workspace_bytes > 0);
    // Allocate the memory needed for the workspace
    void* d_workspace{nullptr};
    cudaMalloc(&d_workspace, workspace_bytes);
    // Allocate memory for the batch of images
    // and copy from host to device
    int image_bytes = batch_size * channels * height * width * sizeof(float);
    float* d_input{nullptr};
    cudaMalloc(&d_input, image_bytes);
    cudaMemcpy(d_input, image.ptr<float>(0), image_bytes, cudaMemcpyHostToDevice);
    // Allocate memory for the output images
    // and zero it on the device
    float* d_output{nullptr};
    cudaMalloc(&d_output, image_bytes);
    cudaMemset(d_output, 0, image_bytes);
    // clang-format off
    const float kernel_template[3][3] = {
        {1,  1, 1},
        {1, -8, 1},
        {1,  1, 1}
    };
    // clang-format on
    float h_kernel[3][3][3][3];
    for (int kernel = 0; kernel < 3; ++kernel) {
        for (int channel = 0; channel < 3; ++channel) {
            for (int row = 0; row < 3; ++row) {
                for (int column = 0; column < 3; ++column) {
                    h_kernel[kernel][channel][row][column] = kernel_template[row][column];
                }
            }
        }
    }
    float* d_kernel{nullptr};
    cudaMalloc(&d_kernel, sizeof(h_kernel));
    cudaMemcpy(d_kernel, h_kernel, sizeof(h_kernel), cudaMemcpyHostToDevice);
    // Perform actual convolution
    const float alpha = 1.0f, beta = 0.0f;
    checkCUDNN(cudnnConvolutionForward(cudnn,
                                       &alpha,
                                       input_descriptor,
                                       d_input,
                                       kernel_descriptor,
                                       d_kernel,
                                       convolution_descriptor,
                                       convolution_algorithm,
                                       d_workspace,
                                       workspace_bytes,
                                       &beta,
                                       output_descriptor,
                                       d_output));
    // If we wish to use sigmoid activation
    if (with_sigmoid) {
        cudnnActivationDescriptor_t activation_descriptor;
        checkCUDNN(cudnnCreateActivationDescriptor(&activation_descriptor));
        checkCUDNN(cudnnSetActivationDescriptor(activation_descriptor,
                                                CUDNN_ACTIVATION_SIGMOID,
                                                CUDNN_PROPAGATE_NAN,
                                                /*relu_coef=*/0));
        checkCUDNN(cudnnActivationForward(cudnn,
                                          activation_descriptor,
                                          &alpha,
                                          output_descriptor,
                                          d_output,
                                          &beta,
                                          output_descriptor,
                                          d_output));
        cudnnDestroyActivationDescriptor(activation_descriptor);
    }
    // Move results to host
    float* h_output = new float[image_bytes];
    cudaMemcpy(h_output, d_output, image_bytes, cudaMemcpyDeviceToHost);
    save_image("cudnn-out.png", h_output, height, width);
    // Free memory
    delete[] h_output;
    cudaFree(d_kernel);
    cudaFree(d_input);
    cudaFree(d_output);
    cudaFree(d_workspace);
    cudnnDestroyTensorDescriptor(input_descriptor);
    cudnnDestroyTensorDescriptor(output_descriptor);
    cudnnDestroyFilterDescriptor(kernel_descriptor);
    cudnnDestroyConvolutionDescriptor(convolution_descriptor);
    cudnnDestroy(cudnn);
}
I figured it out. From the docs: "xDesc's feature map height + 2 * convDesc's zero-padding height must equal 256 or less" and "xDesc's feature map width + 2 * convDesc's zero-padding width must equal 256 or less".
When I initially read it, I was under the impression that the zero-padding height meant kernelH - 1, whereas it actually refers to the total padded image height/width.
My image was too large. If I resize, it works, i.e.:
cv::Mat inputBlob = blobFromImage(image, 1.0f, cv::Size(100,100), cv::Scalar(0,0,0));
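One way to encode that limit defensively (a sketch based on the quoted docs; padH/padW mirror the question's convolution descriptor, and the fallback algorithm is only an illustrative choice):

// Guard for the FFT algorithm's size limit: feature map height/width
// plus twice the zero-padding must be 256 or less.
int padH = 1, padW = 1; // same padding as set on convolution_descriptor
cudnnConvolutionFwdAlgo_t algo = CUDNN_CONVOLUTION_FWD_ALGO_FFT;
if (imgH + 2 * padH > 256 || imgW + 2 * padW > 256) {
    // Input too large for ALGO_FFT; fall back to an algorithm without
    // this restriction (illustrative fallback).
    algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
}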

CUBLAS works unpredictably

Wrote my first program using CUDA+CUBLAS. It just uses the cublasDgemm function and computes the product of two N*N matrices.
However, every time I launched the program, it kept producing the same wrong answer (e.g., when multiplying a 1*1 matrix containing 5 as its single element by a 1*1 matrix containing the element 6, it always said the result was 36, not 30).
I checked the program several times with no success. But when I came back to it the next day (i.e., after a reboot), it worked just fine. I don't remember whether I recompiled it or not, but the truth is that it is the same VS project, same code, same computer with its GPU.
So, can anyone explain why that could have happened? And should I expect the same strange behaviour in the future?
Here is the code I was launching:
#include <iostream>
#include <string>
#include <iomanip>
#include <cuda_runtime.h>
#include <cublas_v2.h>

const int N = 5;
#define IDX2F(i,j) ((i) * N + j)

void fail(const cudaError_t& cudaStatus, const std::string& errorMessage) {
    if (cudaStatus != cudaSuccess) {
        std::cerr << errorMessage << std::endl;
        exit(EXIT_FAILURE);
    }
}

void fail(const cublasStatus_t& status, const std::string& errorMessage) {
    if (status != CUBLAS_STATUS_SUCCESS) {
        std::cerr << errorMessage << std::endl;
        exit(EXIT_FAILURE);
    }
}

void printMatrix(const double *C) {
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            std::cout << std::fixed << std::setprecision(2) << C[IDX2F(i,j)] << ' ';
        }
        std::cout << std::endl;
    }
    std::cout << std::endl;
}

int main(int argc, char **argv) {
    cudaError_t cudaStatus;
    cublasStatus_t status;
    cublasHandle_t handle;
    double *A = new double[N*N];
    double *devPtrA;
    double *B = new double[N*N];
    double *devPtrB;
    double *C = new double[N*N];
    double *devPtrC;
    for (int i = 0; i < N; i++)
        for (int j = 0; j < N; j++)
            A[IDX2F(i,j)] = i + j;
    for (int i = 0; i < N; i++)
        for (int j = 0; j < N; j++)
            B[IDX2F(i,j)] = i + j * 0.5;
    // do not have to set anything into matrix C, because beta = 0
    // allocate memory on GPU
    cudaStatus = cudaMalloc((void**)&devPtrC, N*N*sizeof(*C));
    fail(cudaStatus, "device memory allocation failed");
    cudaStatus = cudaMalloc((void**)&devPtrA, N*N*sizeof(*A));
    fail(cudaStatus, "device memory allocation failed");
    cudaStatus = cudaMalloc((void**)&devPtrB, N*N*sizeof(*B));
    fail(cudaStatus, "device memory allocation failed");
    // create GPU handle
    status = cublasCreate(&handle);
    fail(status, "CUBLAS initialization failed");
    // copy matrices from host to GPU
    status = cublasSetMatrix(N, N, sizeof(*B), B, N, devPtrB, N);
    fail(status, "failed to load data from host to GPU");
    status = cublasSetMatrix(N, N, sizeof(*A), A, N, devPtrA, N);
    fail(status, "failed to load data from host to GPU");
    const double ONE = 1;
    const double ZERO = 0;
    printMatrix(A);
    printMatrix(B);
    status = cublasDgemm(handle,
                         CUBLAS_OP_N, CUBLAS_OP_N,
                         N, N, N,
                         &ONE,
                         devPtrA, N,
                         devPtrB, N,
                         &ZERO,
                         devPtrC, N);
    fail(status, "error cublasDgemm");
    status = cublasGetMatrix(N, N, sizeof(*C), devPtrC, N, C, N);
    fail(status, "could not load result back from GPU to host");
    printMatrix(C);
    status = cublasDestroy(handle);
    fail(status, "could not destroy CUBLAS handle");
    cudaStatus = cudaFree(devPtrC);
    fail(cudaStatus, "device memory freeing failed");
    cudaStatus = cudaFree(devPtrB);
    fail(cudaStatus, "device memory freeing failed");
    cudaStatus = cudaFree(devPtrA);
    fail(cudaStatus, "device memory freeing failed");
    delete[] C;
    delete[] B;
    delete[] A;
    return EXIT_SUCCESS;
}
op(B) must be CUBLAS_OP_T:

status = cublasDgemm(handle,
                     CUBLAS_OP_N, CUBLAS_OP_T,
                     N, N, N,
                     &ONE,
                     devPtrA, N,
                     devPtrB, N,
                     &ZERO,
                     devPtrC, N);

The definition is: C = α op(A) op(B) + β C
http://docs.nvidia.com/cuda/cublas/index.html#topic_8_1
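For background on why a transpose flag enters the picture (a minimal illustration assuming the IDX2F layout above; this note is not from the linked documentation): cuBLAS assumes column-major storage, while IDX2F(i,j) = i*N + j stores matrices row-major, so cuBLAS sees the transpose of what the host code builds.

// Row-major M = [[1,2],[3,4]] occupies the buffer {1,2,3,4}.
double M[4] = {1, 2, 3, 4};
// Read column-major, as cuBLAS does, the same buffer is [[1,3],[2,4]],
// i.e. M^T; a CUBLAS_OP_T flag tells cuBLAS to transpose what it sees,
// which can recover the matrix the host code intended.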