CUDA Graph Problem: Results not computed for the first iteration

I am trying to utilize CUDA Graphs for the computation of a Fast Fourier Transform (FFT) using CUDA's cuFFT APIs.
I modified the sample FFT code on GitHub into the following FFT code using CUDA Graphs:
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "device_functions.h"
#include <iostream>
#include <cufft.h>
// Complex data type
typedef float2 Complex;
static __device__ inline Complex ComplexScale(Complex, float);
static __device__ inline Complex ComplexMul(Complex, Complex);
static __global__ void ComplexPointwiseMulAndScale(Complex*, const Complex*, int, float);
#define CUDA_CALL( call ) \
{ \
cudaError_t result = call; \
if ( cudaSuccess != result ) \
std::cerr << "CUDA error " << result << " in " << __FILE__ << ":" << __LINE__ << ": " << cudaGetErrorString( result ) << " (" << #call << ")" << std::endl; \
}
#define CUDA_FFT_CALL( call ) \
{ \
cufftResult result = call; \
if ( CUFFT_SUCCESS != result ) \
std::cerr << "FFT error " << result << " in " << __FILE__ << ":" << __LINE__ << ": " << result << std::endl; \
}
// The filter size is assumed to be a number smaller than the signal size
#define SIGNAL_SIZE 10
#define FILTER_KERNEL_SIZE 4
static __device__ inline Complex ComplexScale(Complex a, float s)
{
Complex c;
c.x = s * a.x;
c.y = s * a.y;
return c;
}
// Complex multiplication
static __device__ inline Complex ComplexMul(Complex a, Complex b)
{
Complex c;
c.x = a.x * b.x - a.y * b.y;
c.y = a.x * b.y + a.y * b.x;
return c;
}
// Complex pointwise multiplication
static __global__ void ComplexPointwiseMulAndScale(Complex* a, const Complex* b, int size, float scale)
{
const int numThreads = blockDim.x * gridDim.x;
const int threadID = blockIdx.x * blockDim.x + threadIdx.x;
for (int i = threadID; i < size; i += numThreads)
{
a[i] = ComplexScale(ComplexMul(a[i], b[i]), scale);
}
}
int main()
{
printf("[simpleCUFFT] is starting...\n");
int minRadius = FILTER_KERNEL_SIZE / 2;
int maxRadius = FILTER_KERNEL_SIZE - minRadius;
int padded_data_size = SIGNAL_SIZE + maxRadius;
// Allocate HOST Memories
Complex* h_signal = (Complex*)malloc(sizeof(Complex) * SIGNAL_SIZE); //host signal
Complex* h_filter_kernel = (Complex*)malloc(sizeof(Complex) * FILTER_KERNEL_SIZE); //host filter
Complex* h_padded_signal= (Complex*)malloc(sizeof(Complex) * padded_data_size); // host Padded signal
Complex* h_padded_filter_kernel = (Complex*)malloc(sizeof(Complex) * padded_data_size); // host Padded filter kernel
Complex* h_convolved_signal = (Complex*)malloc(sizeof(Complex) * padded_data_size); // to store convolution RESULTS
memset(h_convolved_signal, 0, padded_data_size * sizeof(Complex));
//Allocate DEVICE Memories
Complex* d_signal; //device signal
cudaMalloc((void**)&d_signal, sizeof(Complex) * padded_data_size);
Complex* d_filter_kernel;
cudaMalloc((void**)&d_filter_kernel, sizeof(Complex) * padded_data_size); //device kernel
//CUDA GRAPH
bool graphCreated = false;
cudaGraph_t graph;
cudaGraphExec_t instance;
cudaStream_t stream;
cudaStreamCreate(&stream);
// CUFFT plan
cufftHandle plan;
CUDA_FFT_CALL(cufftPlan1d(&plan, padded_data_size, CUFFT_C2C, 1));
cufftSetStream(plan, stream); // bind plan to the stream
// Initialize the memory for the signal
for (unsigned int i = 0; i < SIGNAL_SIZE; ++i)
{
h_signal[i].x = rand() / (float)RAND_MAX;
h_signal[i].y = 0;
}
// Initialize the memory for the filter
for (unsigned int i = 0; i < FILTER_KERNEL_SIZE; ++i)
{
h_filter_kernel[i].x = rand() / (float)RAND_MAX;
h_filter_kernel[i].y = 0;
}
//REPEAT 3 times
int nRepetitionsNeeded = 3;
for (int repetition = 0; repetition < nRepetitionsNeeded; repetition++)
{
std::cout << "\n\n" << "Repetition ------ " << repetition << std::endl;
if (!graphCreated)
{
//Start Graph Recording --------------!!!!!!!!
CUDA_CALL(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
//Pad Data
CUDA_CALL(cudaMemcpyAsync(h_padded_signal + 0, h_signal, SIGNAL_SIZE * sizeof(Complex), cudaMemcpyHostToHost, stream));
memset(h_padded_signal + SIGNAL_SIZE, 0, (padded_data_size - SIGNAL_SIZE) * sizeof(Complex));
//CUDA_CALL(cudaMemsetAsync(h_padded_signal + SIGNAL_SIZE, 0, (padded_data_size - SIGNAL_SIZE) * sizeof(Complex), stream));
CUDA_CALL(cudaMemcpyAsync(h_padded_filter_kernel + 0, h_filter_kernel + minRadius, maxRadius * sizeof(Complex), cudaMemcpyHostToHost, stream));
/*CUDA_CALL(cudaMemsetAsync(h_padded_filter_kernel + maxRadius, 0, (padded_data_size - FILTER_KERNEL_SIZE) * sizeof(Complex), stream));*/
memset(h_padded_filter_kernel + maxRadius, 0, (padded_data_size - FILTER_KERNEL_SIZE) * sizeof(Complex));
CUDA_CALL(cudaMemcpyAsync(h_padded_filter_kernel + padded_data_size - minRadius, h_filter_kernel, minRadius * sizeof(Complex), cudaMemcpyHostToHost, stream));
// MemCpy H to D
CUDA_CALL(cudaMemcpyAsync(d_signal, h_padded_signal, sizeof(Complex) * padded_data_size, cudaMemcpyHostToDevice, stream)); //Signal
CUDA_CALL(cudaMemcpyAsync(d_filter_kernel, h_padded_filter_kernel, sizeof(Complex) * padded_data_size, cudaMemcpyHostToDevice, stream)); //Kernel
//COMPUTE FFT
CUDA_FFT_CALL(cufftExecC2C(plan, (cufftComplex*)d_signal, (cufftComplex*)d_signal, CUFFT_FORWARD)); // Transform signal
CUDA_FFT_CALL(cufftExecC2C(plan, (cufftComplex*)d_filter_kernel, (cufftComplex*)d_filter_kernel, CUFFT_FORWARD)); // Transform kernel
ComplexPointwiseMulAndScale<<<64, 1, 0, stream>>>(d_signal, d_filter_kernel, padded_data_size, 1.0f / padded_data_size); // Multiply and normalize
CUDA_CALL(cudaGetLastError());
CUDA_FFT_CALL(cufftExecC2C(plan, (cufftComplex*)d_signal, (cufftComplex*)d_signal, CUFFT_INVERSE)); // Transform signal back
// Copy device memory to host
CUDA_CALL(cudaMemcpyAsync(h_convolved_signal, d_signal, sizeof(Complex) * padded_data_size, cudaMemcpyDeviceToHost, stream));
//END Graph Recording
CUDA_CALL(cudaStreamEndCapture(stream, &graph));
CUDA_CALL(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));
graphCreated = true;
}
else
{
CUDA_CALL(cudaGraphLaunch(instance, stream));
CUDA_CALL(cudaStreamSynchronize(stream));
}
//verify results
for (int i = 0; i < SIGNAL_SIZE; i++)
std::cout << "index: " << i << ", fft: " << h_convolved_signal[i].x << std::endl;
}
//Destroy CUFFT context
cufftDestroy(plan);
// cleanup memory
cudaStreamDestroy(stream);
free(h_signal);
free(h_filter_kernel);
free(h_padded_signal);
free(h_padded_filter_kernel);
cudaFree(d_signal);
cudaFree(d_filter_kernel);
return 0;
}
PROBLEM: In the output of the above program, the values of the result are all ZEROS for the first iteration. How can I resolve this?

The results are zero for the first iteration because, during the first iteration, all of the work is issued in capture mode.
In capture mode, no CUDA work actually gets done. From here:
When a stream is being captured, work launched into the stream is not enqueued for execution.
I pointed you to this same area of the documentation in a comment on your last question. You might wish to read the entire programming guide section on graphs; there are also blogs available.
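One way to restructure the loop (a minimal sketch, not the only possibility) is to drop the if/else and launch the instantiated graph on every iteration, including the one that performed the capture, so the first iteration's work actually executes:
if (!graphCreated)
{
    CUDA_CALL(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
    // ... issue all the async copies, the FFT calls, and the kernel, exactly as above ...
    CUDA_CALL(cudaStreamEndCapture(stream, &graph));
    CUDA_CALL(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));
    graphCreated = true;
}
// The launch below is what actually performs the computation;
// the capture above only records it.
CUDA_CALL(cudaGraphLaunch(instance, stream));
CUDA_CALL(cudaStreamSynchronize(stream)); // results are valid after this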

Related

Pass the contents of a jpeg image as a CUDA Texture

I would like to pass the contents of a jpeg file (3-byte RGB) as a texture to a CUDA kernel, but I am getting the compilation error
a pointer to a bound function may only be used to call the function
on value.x = tex2D<unsigned char>(_texture, u, v) * 1.0f / 255.0f; and the rest of the tex2D() calls.
What may be the reason(s) for the error?
Host side code where the texture is created:
cudaArray* cudaArray;
cudaTextureObject_t textureObject{};
{
const static uint32_t bytesPerPixel{ 3u };
uint8_t* pHostData;
int32_t textureWidth, textureHeight;
uint32_t bytesPerScanline;
cudaChannelFormatDesc channelFormatDesc;
cudaResourceDesc resourceDesc{};
cudaTextureDesc textureDesc{};
int32_t componentsPerPixel = bytesPerPixel;
pHostData = stbi_load(textureFilename.c_str(), &textureWidth, &textureHeight, &componentsPerPixel, componentsPerPixel);
if (nullptr == pHostData) {
std::cerr << "ERROR: Could not load texture image file '" << textureFilename << std::endl;
return;
}
bytesPerScanline = bytesPerPixel * textureWidth;
channelFormatDesc = cudaCreateChannelDesc<uint8_t>();
checkCudaErrors(cudaMallocArray(&cudaArray, &channelFormatDesc, bytesPerScanline, textureHeight));
checkCudaErrors(cudaMemcpyToArray(cudaArray, 0, 0, pHostData, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyHostToDevice));
resourceDesc.resType = cudaResourceTypeArray;
resourceDesc.res.array.array = cudaArray;
textureDesc.normalizedCoords = true;
textureDesc.filterMode = cudaFilterModePoint;
textureDesc.addressMode[0] = cudaAddressModeWrap;
textureDesc.addressMode[1] = cudaAddressModeWrap;
textureDesc.readMode = cudaReadModeElementType;
checkCudaErrors(cudaCreateTextureObject(&textureObject, &resourceDesc, &textureDesc, nullptr));
STBI_FREE(pHostData);
}
Device side code:
class imageTexture {
public:
__device__ imageTexture(cudaTextureObject_t tex) :_texture(tex) {}
__device__ virtual vec3 value(float u, float v, const vec3& p) const {
vec3 value;
u *= 3;
value.x = tex2D<unsigned char>(_texture, u, v) * 1.0f / 255.0f;
u++;
value.y = tex2D<unsigned char>(_texture, u, v) * 1.0f / 255.0f;
u++;
value.z = tex2D<unsigned char>(_texture, u, v) * 1.0f / 255.0f;
return value;
}
private:
cudaTextureObject_t _texture;
};
Changed device side function, but the error persists:
class imageTexture :public textureX {
public:
__device__ imageTexture(cudaTextureObject_t tex) :_text(tex) {}
__device__ virtual vec3 value(float u, float v, const vec3& p) const override {
vec3 val;
u *= 3;
val.x = tex2D<unsigned char>(_text, u, v) * 1.0f / 255.0f;
u++;
val.y = tex2D<unsigned char>(_text, u, v) * 1.0f / 255.0f;
u++;
val.z = tex2D<unsigned char>(_text, u, v) * 1.0f / 255.0f;
return val;
}
private:
cudaTextureObject_t _text;
};
I have written a new test program and plan to build up from there. The idea is to have each thread read 3 values from the texture and write them back to a buffer. Only the first triplet is correct. Is there anything inconsistent in my texture lookups here:
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <curand_kernel.h>
#include <iostream>
#include <string>
#include <chrono>
#include <cmath>
#include <ctime>
#include <cstdint>
#include <stdio.h>
#define checkCudaErrors(val) check_cuda( (val), #val, __FILE__, __LINE__ )
void check_cuda(cudaError_t result, char const* const func, const char* const file, int const line) {
if (result) {
std::cerr << "CUDA error = " << static_cast<unsigned int>(result) << " at " << file << ":" << line << " " << func << std::endl;
std::cerr << cudaGetErrorString(result) << std::endl;
// Make sure we call CUDA Device Reset before exiting
cudaDeviceReset();
exit(99);
}
}
__global__ void texCheck(uint32_t width, uint32_t height, uint8_t* pOutput, cudaTextureObject_t textureObject) {
uint32_t x = blockIdx.x * blockDim.x + threadIdx.x;
uint32_t y = blockIdx.y * blockDim.y + threadIdx.y;
if ((x < width) && (y < height)) {
float u = (float)x / (float)width;
float v = (float)y / (float)height;
pOutput[y * (3 * width) + (3 * x)] = tex2D<uint8_t>(textureObject, 3*u, v);
pOutput[y * (3 * width) + (3 * x) + 1] = tex2D<uint8_t>(textureObject, 3*u + 1, v);
pOutput[y * (3 * width) + (3 * x) + 2] = tex2D<uint8_t>(textureObject, 3*u + 2, v);
}
}
void cudaTex() {
const uint32_t bytesPerPixel{ 3u };
const uint32_t textureWidth = 1024u;
const uint32_t textureHeight = 512u;
uint32_t bytesPerScanline;
bytesPerScanline = bytesPerPixel * textureWidth;
cudaChannelFormatDesc channelFormatDesc = cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsigned);
cudaArray* cudaArray;
checkCudaErrors(cudaMallocArray(&cudaArray, &channelFormatDesc, bytesPerScanline, textureHeight));
uint8_t* pHostData = new uint8_t[bytesPerScanline * textureHeight];
std::srand(std::time(nullptr));
for (uint64_t idx = 0ull; idx < bytesPerScanline * textureHeight; idx++)
pHostData[idx] = std::rand();
checkCudaErrors(cudaMemcpyToArray(cudaArray, 0, 0, pHostData, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyHostToDevice));
cudaResourceDesc resourceDesc{};
resourceDesc.resType = cudaResourceTypeArray;
resourceDesc.res.array.array = cudaArray;
cudaTextureDesc textureDesc{};
textureDesc.normalizedCoords = false;
textureDesc.filterMode = cudaFilterModePoint;
textureDesc.addressMode[0] = cudaAddressModeWrap;
textureDesc.addressMode[1] = cudaAddressModeWrap;
textureDesc.readMode = cudaReadModeElementType;
cudaTextureObject_t textureObject{};
checkCudaErrors(cudaCreateTextureObject(&textureObject, &resourceDesc, &textureDesc, nullptr));
dim3 dimBlock(8u, 8u, 1u);
dim3 dimGrid(textureWidth / dimBlock.x, textureHeight / dimBlock.y, 1u);
uint8_t* dOutput{ nullptr };
checkCudaErrors(cudaMalloc((void**)&dOutput, bytesPerScanline * textureHeight * sizeof(uint8_t)));
texCheck<<<dimGrid, dimBlock>>>(textureWidth, textureHeight, dOutput, textureObject);
checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaDeviceSynchronize());
uint8_t* hOutput = new uint8_t[bytesPerScanline * textureHeight];
checkCudaErrors(cudaMemcpy(hOutput, dOutput, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyDeviceToHost));
for (uint64_t idx = 0ull; idx < textureHeight; idx++) {
for (uint64_t jdx = 0ull; jdx < bytesPerScanline; jdx++) {
if (hOutput[jdx] != pHostData[jdx])
std::cerr << "Mismatch # " << idx << " " << jdx << " Expected " << (uint32_t)pHostData[jdx] << " Received " << (uint32_t)hOutput[jdx] << std::endl;
}
hOutput += bytesPerScanline;
pHostData += bytesPerScanline;
}
checkCudaErrors(cudaDestroyTextureObject(textureObject));
checkCudaErrors(cudaFree(dOutput));
checkCudaErrors(cudaFreeArray(cudaArray));
delete[] hOutput;
delete[] pHostData;
}
int main() {
cudaTex();
return 0;
}
Switching to integer coordinates in the kernel solved the problem.
Resolution of the Original Problem
It turned out that the
a pointer to a bound function may only be used to call the function
error was caused by the vec3 class having a getter function x() rather than a member variable named x, so the code was trying to use the getter function as an l-value.
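For illustration, here is a hypothetical minimal reproduction (not the original vec3 class) of how a getter named x triggers exactly this diagnostic when used as an l-value:
// Hypothetical sketch: 'x' is a member function, not a data member.
struct vec3 {
    float x() const { return _x; } // getter named x
    float _x;
};
__device__ void demo(vec3 value) {
    // value.x = 1.0f; // error: a pointer to a bound function may only be used to call the function
    value._x = 1.0f;   // assigning the actual data member compiles fine
}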
There are several problems with the code you have now posted:
After the discussion in the comments, hopefully you can figure out what is wrong with this line of code:
cudaArray* cudaArray;
(the variable name shadows the cudaArray type itself).
Your kernel code appears to be trying to pass normalized float coordinates, but it is doing so incorrectly. There are several issues here: your x normalization divides by textureWidth, but it should divide by 3*textureWidth (i.e. bytesPerScanline); although you are calling the width of your texture textureWidth, really it is 3*textureWidth. Also, texturing in this fashion is typically offset by 0.5. Finally, you are doing this:
textureDesc.normalizedCoords = false;
but if you want to use float coordinates (which seems to be what you want) you should do:
textureDesc.normalizedCoords = true;
After you fix all that, you'll run into a non-CUDA issue. You're modifying these pointers:
hOutput += bytesPerScanline;
pHostData += bytesPerScanline;
then trying to delete them after modification:
delete[] hOutput;
delete[] pHostData;
That won't work correctly: delete[] must be given the exact pointer returned by new[], so keep copies of the original pointers (see the sketch below, and the my_hOutput/my_pHostData variables in the full code).
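A minimal sketch of the idea, iterating with hypothetical copy variables so the originals stay valid:
// Sketch: walk with copies so the original pointers remain valid for delete[].
uint8_t *h = hOutput, *p = pHostData;
for (uint64_t idx = 0ull; idx < textureHeight; idx++) {
    // ... compare h[jdx] against p[jdx] for each byte in the scanline ...
    h += bytesPerScanline;
    p += bytesPerScanline;
}
delete[] hOutput;   // still the pointers returned by new[]
delete[] pHostData;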
Here's a modified code with the above issues addressed; it seems to run correctly for me:
$ cat t7.cu
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <curand_kernel.h>
#include <iostream>
#include <string>
#include <chrono>
#include <cmath>
#include <ctime>
#include <cstdint>
#include <stdio.h>
#define checkCudaErrors(val) check_cuda( (val), #val, __FILE__, __LINE__ )
void check_cuda(cudaError_t result, char const* const func, const char* const file, int const line) {
if (result) {
std::cerr << "CUDA error = " << static_cast<unsigned int>(result) << " at " << file << ":" << line << " " << func << std::endl;
std::cerr << cudaGetErrorString(result) << std::endl;
// Make sure we call CUDA Device Reset before exiting
cudaDeviceReset();
exit(99);
}
}
__global__ void texCheck(uint32_t width, uint32_t height, uint8_t* pOutput, cudaTextureObject_t textureObject) {
uint32_t x = blockIdx.x * blockDim.x + threadIdx.x;
uint32_t y = blockIdx.y * blockDim.y + threadIdx.y;
const float pix_offset = 0.5;
if ((x < width) && (y < height)) {
float u = (float)(3*x+pix_offset) / (float)(3*width);
float v = (float)y / (float)height;
pOutput[y * (3 * width) + (3 * x)] = tex2D<uint8_t>(textureObject, u, v);
u = (float)(3*x+1+pix_offset) / (float)(3*width);
pOutput[y * (3 * width) + (3 * x) + 1] = tex2D<uint8_t>(textureObject, u, v);
u = (float)(3*x+2+pix_offset) / (float)(3*width);
pOutput[y * (3 * width) + (3 * x) + 2] = tex2D<uint8_t>(textureObject, u, v);
}
}
void cudaTex() {
const uint32_t bytesPerPixel{ 3u };
const uint32_t textureWidth = 1024u;
const uint32_t textureHeight = 512u;
uint32_t bytesPerScanline;
bytesPerScanline = bytesPerPixel * textureWidth;
cudaChannelFormatDesc channelFormatDesc = cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsigned);
cudaArray* cArray;
checkCudaErrors(cudaMallocArray(&cArray, &channelFormatDesc, bytesPerScanline, textureHeight));
uint8_t* pHostData = new uint8_t[bytesPerScanline * textureHeight];
std::srand(std::time(nullptr));
for (uint64_t idx = 0ull; idx < bytesPerScanline * textureHeight; idx++)
pHostData[idx] = std::rand();
checkCudaErrors(cudaMemcpyToArray(cArray, 0, 0, pHostData, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyHostToDevice));
cudaResourceDesc resourceDesc{};
resourceDesc.resType = cudaResourceTypeArray;
resourceDesc.res.array.array = cArray;
cudaTextureDesc textureDesc{};
textureDesc.normalizedCoords = true;
textureDesc.filterMode = cudaFilterModePoint;
textureDesc.addressMode[0] = cudaAddressModeWrap;
textureDesc.addressMode[1] = cudaAddressModeWrap;
textureDesc.readMode = cudaReadModeElementType;
cudaTextureObject_t textureObject{};
checkCudaErrors(cudaCreateTextureObject(&textureObject, &resourceDesc, &textureDesc, nullptr));
dim3 dimBlock(8u, 8u, 1u);
dim3 dimGrid(textureWidth / dimBlock.x, textureHeight / dimBlock.y, 1u);
uint8_t* dOutput{ nullptr };
checkCudaErrors(cudaMalloc((void**)&dOutput, bytesPerScanline * textureHeight * sizeof(uint8_t)));
texCheck<<<dimGrid, dimBlock>>>(textureWidth, textureHeight, dOutput, textureObject);
checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaDeviceSynchronize());
uint8_t* hOutput = new uint8_t[bytesPerScanline * textureHeight];
checkCudaErrors(cudaMemcpy(hOutput, dOutput, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyDeviceToHost));
uint8_t *my_hOutput = hOutput;
uint8_t *my_pHostData = pHostData;
for (uint64_t idx = 0ull; idx < textureHeight; idx++) {
for (uint64_t jdx = 0ull; jdx < bytesPerScanline; jdx++) {
if (hOutput[jdx] != pHostData[jdx]){
std::cerr << "Mismatch # " << idx << " " << jdx << " Expected " << (uint32_t)pHostData[jdx] << " Received " << (uint32_t)hOutput[jdx] << std::endl;
return;}
}
hOutput += bytesPerScanline;
pHostData += bytesPerScanline;
}
checkCudaErrors(cudaDestroyTextureObject(textureObject));
checkCudaErrors(cudaFree(dOutput));
checkCudaErrors(cudaFreeArray(cArray));
delete[] my_hOutput;
delete[] my_pHostData;
}
int main() {
cudaTex();
return 0;
}
$ nvcc -o t7 t7.cu -std=c++11
t7.cu: In function ‘void cudaTex()’:
t7.cu:56:12: warning: ‘cudaError_t cudaMemcpyToArray(cudaArray_t, size_t, size_t, const void*, size_t, cudaMemcpyKind)’ is deprecated (declared at /usr/local/cuda/bin/../targets/x86_64-linux/include/cuda_runtime_api.h:6782) [-Wdeprecated-declarations]
checkCudaErrors(cudaMemcpyToArray(cArray, 0, 0, pHostData, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyHostToDevice));
^
t7.cu:56:131: warning: ‘cudaError_t cudaMemcpyToArray(cudaArray_t, size_t, size_t, const void*, size_t, cudaMemcpyKind)’ is deprecated (declared at /usr/local/cuda/bin/../targets/x86_64-linux/include/cuda_runtime_api.h:6782) [-Wdeprecated-declarations]
checkCudaErrors(cudaMemcpyToArray(cArray, 0, 0, pHostData, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyHostToDevice));
^
$ cuda-memcheck ./t7
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$
I'm not suggesting the above code is defect-free; it's mostly your code, and I'm just pointing out some things I found. You can read about how to address the deprecation warning here.
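For reference, a sketch of that replacement using this example's variable names: the deprecated cudaMemcpyToArray call can be swapped for cudaMemcpy2DToArray, treating the transfer as a single 2D copy whose source pitch equals the tightly packed row width:
// Sketch: non-deprecated replacement for the cudaMemcpyToArray call above.
checkCudaErrors(cudaMemcpy2DToArray(cArray,
    0, 0,                      // wOffset, hOffset into the array
    pHostData,
    bytesPerScanline,          // source pitch in bytes (tightly packed)
    bytesPerScanline,          // width of the copy in bytes
    textureHeight,             // number of rows
    cudaMemcpyHostToDevice));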

cuda nppiResize() for RGB images

NVIDIA CUDA nppiResize_32f_C1R works OK on grayscale 1 x 32f images, but nppiResize_32f_C3R returns garbage. Clearly, a workaround would be to call the routine three times after de-interleaving the data into planar R, G, B, but I was expecting to be able to run it through in a single pass. NVIDIA has a lot of example code for single-plane image processing, but a scant amount for interleaved color planes, so I'm turning to Stack Overflow for help. I don't know exactly how the stride is computed, but I understand that the stride is the image width times the number of bytes per pixel, so in my case, with no padded lines, it should be width x sizeof(32f) x 3 for RGB.
I tried different strides/pitches in cudaMemcpy2D() but can't get a workable solution for the color RGB code. It compiles and runs OK with no errors. The first section is for grayscale (works OK). The second section is RGB (garbage).
// nppiResize using 2D aligned allocations
#include <Exceptions.h>
#include <cuda_runtime.h>
#include <npp.h>
#include <nppi.h>
#include <nppdefs.h>
#define CUDA_CALL(call) do { cudaError_t cuda_error = call; if(cuda_error != cudaSuccess) { std::cerr << "CUDA Error: " << cudaGetErrorString(cuda_error) << ", " << __FILE__ << ", line " << __LINE__ << std::endl; return(NULL);} } while(0)
float* decimate_cuda(float* readbuff, uint32_t nSrcH, uint32_t nSrcW, uint32_t nDstH, uint32_t nDstW, uint8_t byteperpixel)
{
if (byteperpixel == 1){ // source : Grayscale, 1 x 32f
size_t srcStep;
size_t dstStep;
NppiSize oSrcSize = {nSrcW, nSrcH};
NppiRect oSrcROI = {0, 0, nSrcW, nSrcH};
float *devSrc;
CUDA_CALL(cudaMallocPitch((void**)&devSrc, &srcStep, nSrcW * sizeof(float), nSrcH));
CUDA_CALL(cudaMemcpy2D((void**)devSrc, srcStep,(void**)readbuff, nSrcW * sizeof(Npp32f), nSrcW * sizeof(Npp32f), nSrcH, cudaMemcpyHostToDevice));
NppiSize oDstSize = {nDstW, nDstH};
NppiRect oDstROI = {0, 0, nDstW, nDstH};
float *devDst;
CUDA_CALL(cudaMallocPitch((void**)&devDst, &dstStep, nDstW * sizeof(float), nDstH));
NppStatus result = nppiResize_32f_C1R(devSrc,srcStep,oSrcSize,oSrcROI,devDst,dstStep,oDstSize,oDstROI,NPPI_INTER_SUPER);
if (result != NPP_SUCCESS) {
std::cerr << "Unable to run decimate_cuda, error " << result << std::endl;
}
Npp64s writesize;
Npp32f *hostDst;
writesize = (Npp64s) nDstW * nDstH; // Y
if(NULL == (hostDst = (Npp32f *)malloc(writesize * sizeof(Npp32f)))){
printf("Error : Unable to alloctae hostDst in decimate_cuda, exiting...\n");
exit(1);
}
CUDA_CALL(cudaMemcpy2D(hostDst, nDstW * sizeof(Npp32f),(void**)devDst, dstStep, nDstW * sizeof(Npp32f),nDstH, cudaMemcpyDeviceToHost));
CUDA_CALL(cudaFree(devSrc));
CUDA_CALL(cudaFree(devDst));
return(hostDst);
} // source : Grayscale 1 x 32f, YYYY...
else if (byteperpixel == 3){ // source : 3 x 32f interleaved RGBRGBRGB...
size_t srcStep;
size_t dstStep;
// rows = height; columns = width
NppiSize oSrcSize = {nSrcW, nSrcH};
NppiRect oSrcROI = {0, 0, nSrcW, nSrcH};
float *devSrc;
CUDA_CALL(cudaMallocPitch((void**)&devSrc, &srcStep, 3 * nSrcW * sizeof(float), nSrcH));
CUDA_CALL(cudaMemcpy2D((void**)devSrc, srcStep, (void**)readbuff, 3 * nSrcW * sizeof(Npp32f), nSrcW * sizeof(Npp32f), nSrcH, cudaMemcpyHostToDevice));
NppiSize oDstSize = {nDstW, nDstH};
NppiRect oDstROI = {0, 0, nDstW, nDstH};
float *devDst;
CUDA_CALL(cudaMallocPitch((void**)&devDst, &dstStep, 3 * nDstW * sizeof(float), nDstH));
NppStatus result = nppiResize_32f_C3R(devSrc,srcStep,oSrcSize,oSrcROI,devDst,dstStep,oDstSize,oDstROI,NPPI_INTER_SUPER);
if (result != NPP_SUCCESS) {
std::cerr << "Unable to run decimate_cuda, error " << result << std::endl;
}
Npp64s writesize;
Npp32f *hostDst;
writesize = (Npp64s) nDstW * nDstH * 3; // RGB
if(NULL == (hostDst = (Npp32f *)malloc(writesize * sizeof(Npp32f)))){
printf("Error : Unable to alloctae hostDst in decimate_cuda, exiting...\n");
exit(1);
}
CUDA_CALL(cudaMemcpy2D((void**)hostDst, nDstW * sizeof(Npp32f), (void**)devDst, dstStep, nDstW * sizeof(Npp32f),nDstH, cudaMemcpyDeviceToHost));
CUDA_CALL(cudaFree(devSrc));
CUDA_CALL(cudaFree(devDst));
return(hostDst);
} // source - 3 x 32f, interleaved RGBRGBRGB...
return(0);
}
You had various errors in your calls to cudaMemcpy2D (both of them, in the 3-channel code): the copy widths (and the host-side pitch of the device-to-host copy) must account for all three channels, i.e. 3 * width * sizeof(Npp32f). This code seems to work for me:
$ cat t1521.cu
#include <cuda_runtime.h>
#include <npp.h>
#include <nppi.h>
#include <nppdefs.h>
#include <iostream>
#include <stdint.h>
#include <stdio.h>
#define CUDA_CALL(call) do { cudaError_t cuda_error = call; if(cuda_error != cudaSuccess) { std::cerr << "CUDA Error: " << cudaGetErrorString(cuda_error) << ", " << __FILE__ << ", line " << __LINE__ << std::endl; return(NULL);} } while(0)
using namespace std;
float* decimate_cuda(float* readbuff, uint32_t nSrcH, uint32_t nSrcW, uint32_t nDstH, uint32_t nDstW, uint8_t byteperpixel)
{
if (byteperpixel == 1){ // source : Grayscale, 1 x 32f
size_t srcStep;
size_t dstStep;
NppiSize oSrcSize = {nSrcW, nSrcH};
NppiRect oSrcROI = {0, 0, nSrcW, nSrcH};
float *devSrc;
CUDA_CALL(cudaMallocPitch((void**)&devSrc, &srcStep, nSrcW * sizeof(float), nSrcH));
CUDA_CALL(cudaMemcpy2D(devSrc, srcStep,readbuff, nSrcW * sizeof(Npp32f), nSrcW * sizeof(Npp32f), nSrcH, cudaMemcpyHostToDevice));
NppiSize oDstSize = {nDstW, nDstH};
NppiRect oDstROI = {0, 0, nDstW, nDstH};
float *devDst;
CUDA_CALL(cudaMallocPitch((void**)&devDst, &dstStep, nDstW * sizeof(float), nDstH));
NppStatus result = nppiResize_32f_C1R(devSrc,srcStep,oSrcSize,oSrcROI,devDst,dstStep,oDstSize,oDstROI,NPPI_INTER_SUPER);
if (result != NPP_SUCCESS) {
std::cerr << "Unable to run decimate_cuda, error " << result << std::endl;
}
Npp64s writesize;
Npp32f *hostDst;
writesize = (Npp64s) nDstW * nDstH; // Y
if(NULL == (hostDst = (Npp32f *)malloc(writesize * sizeof(Npp32f)))){
printf("Error : Unable to alloctae hostDst in decimate_cuda, exiting...\n");
exit(1);
}
CUDA_CALL(cudaMemcpy2D(hostDst, nDstW * sizeof(Npp32f),devDst, dstStep, nDstW * sizeof(Npp32f),nDstH, cudaMemcpyDeviceToHost));
CUDA_CALL(cudaFree(devSrc));
CUDA_CALL(cudaFree(devDst));
return(hostDst);
} // source : Grayscale 1 x 32f, YYYY...
else if (byteperpixel == 3){ // source : 3 x 32f interleaved RGBRGBRGB...
size_t srcStep;
size_t dstStep;
// rows = height; columns = width
NppiSize oSrcSize = {nSrcW, nSrcH};
NppiRect oSrcROI = {0, 0, nSrcW, nSrcH};
float *devSrc;
CUDA_CALL(cudaMallocPitch((void**)&devSrc, &srcStep, 3 * nSrcW * sizeof(float), nSrcH));
CUDA_CALL(cudaMemcpy2D(devSrc, srcStep,readbuff, 3 * nSrcW * sizeof(Npp32f), 3*nSrcW * sizeof(Npp32f), nSrcH, cudaMemcpyHostToDevice));
NppiSize oDstSize = {nDstW, nDstH};
NppiRect oDstROI = {0, 0, nDstW, nDstH};
float *devDst;
CUDA_CALL(cudaMallocPitch((void**)&devDst, &dstStep, 3 * nDstW * sizeof(float), nDstH));
NppStatus result = nppiResize_32f_C3R(devSrc,srcStep,oSrcSize,oSrcROI,devDst,dstStep,oDstSize,oDstROI,NPPI_INTER_SUPER);
if (result != NPP_SUCCESS) {
std::cerr << "Unable to run decimate_cuda, error " << result << std::endl;
}
Npp64s writesize;
Npp32f *hostDst;
writesize = (Npp64s) nDstW * nDstH * 3; // RGB
if(NULL == (hostDst = (Npp32f *)malloc(writesize * sizeof(Npp32f)))){
printf("Error : Unable to alloctae hostDst in decimate_cuda, exiting...\n");
exit(1);
}
CUDA_CALL(cudaMemcpy2D(hostDst, nDstW*3 * sizeof(Npp32f), devDst, dstStep, nDstW*3 * sizeof(Npp32f),nDstH, cudaMemcpyDeviceToHost));
CUDA_CALL(cudaFree(devSrc));
CUDA_CALL(cudaFree(devDst));
return(hostDst);
} // source - 3 x 32f, interleaved RGBRGBRGB...
return(0);
}
int main(){
uint32_t nSrcH = 480;
uint32_t nSrcW = 640;
uint8_t byteperpixel = 3;
float *readbuff = (float *)malloc(nSrcW*nSrcH*byteperpixel*sizeof(float));
for (int i = 0; i < nSrcH*nSrcW; i++){
readbuff [i*3+0] = 1.0f;
readbuff [i*3+1] = 2.0f;
readbuff [i*3+2] = 3.0f;}
uint32_t nDstW = nSrcW/2;
uint32_t nDstH = nSrcH/2;
float *res = decimate_cuda(readbuff, nSrcH, nSrcW, nDstH, nDstW, byteperpixel);
for (int i = 0; i < nDstH*nDstW*byteperpixel; i++) if (res[i] != ((i%3)+1.0f)) {std::cout << "error at: " << i << std::endl; return 0;}
return 0;
}
$ nvcc -o t1521 t1521.cu -lnppig
$ cuda-memcheck ./t1521
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$
In the future, it's convenient if you provide a complete code, just as I have done in my answer. In fact SO requires this; see item 1 here.
By the way, the use of pitched allocations on the device here, which introduced complexity that you were not able to work your way through, should really be unnecessary, both for correctness and performance, on any modern GPU and CUDA version. Ordinary linear/flat allocations, where pitch == width, should be just fine.
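As a sketch of that simplification (reusing the variable names from the 3-channel branch above), the pitched source allocation and copy collapse to a plain cudaMalloc/cudaMemcpy with the pitch equal to the packed row width:
// Sketch: flat allocation instead of cudaMallocPitch for the 3-channel source.
size_t srcStep = 3 * nSrcW * sizeof(Npp32f); // pitch == tightly packed row width
float *devSrc;
CUDA_CALL(cudaMalloc((void**)&devSrc, srcStep * nSrcH));
CUDA_CALL(cudaMemcpy(devSrc, readbuff, srcStep * nSrcH, cudaMemcpyHostToDevice));
// nppiResize_32f_C3R accepts this srcStep like any other pitch.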

CUDA Unified Memory vs. cudaMalloc

I am trying to do some benchmarking to ensure that using CUDA's Unified Memory (UM) approach will not hurt us with respect to performance.
I am performing an FFT. One way I use UM, the other way I use cudaMalloc.
I compare the results afterwards and they all match up (which is good).
However, the timing I'm getting for the UM approach is ~0.5 ms vs. ~0.04 ms for the cudaMalloc way (after performing the run multiple times and averaging).
I am using event records to do the timing. I have one right before and one right after the cufftExecC2C call.
Furthermore, I added two more event records to measure the time before any memory transfer to the device, and after using the data once I get it back from the device.
When doing this, I see the UM approach take ~1.6 ms and the cudaMalloc approach ~0.7 ms.
Below is a snippet of code that does the UM approach:
cufftHandle plan;
cufftPlan1d(&plan, dataSize, CUFFT_C2C, 1);
cudaMallocManaged(&inData, dataSize * sizeof(cufftComplex));
cudaMallocManaged(&outData, dataSize * sizeof(cufftComplex));
cudaEvent_t start_before_memHtoD, start_kernel, stop_kernel,
stop_after_memDtoH;
cudaEventCreate(&start_kernel);
cudaEventCreate(&start_before_memHtoD);
cudaEventCreate(&stop_kernel);
cudaEventCreate(&stop_after_memDtoH);
setupWave(dataSize, inData);
cudaEventRecord(start_before_memHtoD);
cudaMemPrefetchAsync(inData, dataSize * sizeof(cufftComplex), 1);
cudaDeviceSynchronize();
cudaEventRecord(start_kernel);
cufftExecC2C(plan, inData, outData, CUFFT_FORWARD);
cudaEventRecord(stop_kernel);
cudaEventSynchronize(stop_kernel);
float sum = 0;
for (int i = 0; i < dataSize; i++) {
sum += outData[i].x + outData[i].y;
}
cudaEventRecord(stop_after_memDtoH);
cudaEventSynchronize(stop_after_memDtoH);
std::cout << "sum for UM is " << sum << std::endl;
float umTime = 0;
float overallUmTime = 0;
cudaEventElapsedTime(&umTime, start_kernel, stop_kernel);
cudaEventElapsedTime(&overallUmTime, start_before_memHtoD,
stop_after_memDtoH);
resultString_um += std::to_string(dataSize) + " samples took "
+ std::to_string(umTime) + "ms, Overall: "
+ std::to_string(overallUmTime) + "\n";
cudaFree(outData);
cudaFree(inData);
cudaEventDestroy(start_kernel);
cudaEventDestroy(stop_kernel);
cudaEventDestroy(start_before_memHtoD);
cudaEventDestroy(stop_after_memDtoH);
cufftDestroy(plan);
The following is for the cudaMalloc approach
cufftComplex *d_inData;
cufftComplex *d_outData;
inData = (cufftComplex*) (malloc(sizeof(cufftComplex) * dataSize));
outData = (cufftComplex*) (malloc(sizeof(cufftComplex) * dataSize));
cudaMalloc((void**) (&d_inData), dataSize * sizeof(cufftComplex));
cudaMalloc((void**) (&d_outData), dataSize * sizeof(cufftComplex));
cufftHandle plan;
cufftPlan1d(&plan, dataSize, CUFFT_C2C, 1);
cudaEvent_t start_before_memHtoD, start_kernel, stop_kernel,
stop_after_memDtoH;
cudaEventCreate(&start_kernel);
cudaEventCreate(&start_before_memHtoD);
cudaEventCreate(&stop_kernel);
cudaEventCreate(&stop_after_memDtoH);
setupWave(dataSize, inData);
cudaEventRecord(start_before_memHtoD);
cudaMemcpy(d_inData, inData, dataSize * sizeof(cufftComplex),
cudaMemcpyHostToDevice);
cudaEventRecord(start_kernel);
cufftExecC2C(plan, d_inData, d_outData, CUFFT_FORWARD);
cudaEventRecord(stop_kernel);
cudaEventSynchronize(stop_kernel);
cudaMemcpy(outData, d_outData, dataSize * sizeof(cufftComplex),
cudaMemcpyDefault);
cudaEventRecord(stop_after_memDtoH);
float sum = 0;
for (int i = 0; i < dataSize; i++) {
sum += outData[i].x + outData[i].y;
}
cudaEventRecord(stop_after_memDtoH);
cudaEventSynchronize(stop_after_memDtoH);
std::cout << "sum for UM is " << sum << std::endl;
float umTime = 0;
float overallUmTime = 0;
cudaEventElapsedTime(&umTime, start_kernel, stop_kernel);
cudaEventElapsedTime(&overallUmTime, start_before_memHtoD,
stop_after_memDtoH);
resultString_um += std::to_string(dataSize) + " samples took "
+ std::to_string(umTime) + "ms, Overall: "
+ std::to_string(overallUmTime) + "\n";
cudaFree(outData);
cudaFree(inData);
cudaFree(d_outData);
cudaFree(d_inData);
cudaEventDestroy(start_kernel);
cudaEventDestroy(stop_kernel);
cudaEventDestroy(start_before_memHtoD);
cudaEventDestroy(stop_after_memDtoH);
cufftDestroy(plan);
Is there something else I could be doing when using the unified memory approach to speed it up? I expected UM to be slower, but not by this much.
We are using the P100 on Red Hat 7.3 with CUDA 9.
One problem with your posted code is that you are not doing a cudaMemPrefetchAsync on the output data of the FFT, so the GPU takes page faults when the FFT writes to the not-yet-resident managed output buffer. According to my testing, prefetching it makes a significant difference. There were a few other problems with your code; for example, you should not call cudaFree on a pointer allocated with malloc.
Here's a complete code built around what you have shown. When I run this on CentOS 7.4, CUDA 9.1, and a Tesla P100, I get comparable times for the FFT performed in the managed-memory case (3.52 ms) vs. the FFT performed in the non-managed-memory case (3.45 ms):
$ cat t43.cu
#include <cufft.h>
#include <iostream>
#include <string>
//using namespace std;
const int dataSize = 1048576*32;
void setupWave(const int ds, cufftComplex *d){
for (int i = 0; i < ds; i++){
d[i].x = 1.0f;
d[i].y = 0.0f;}
}
int main(){
cufftComplex *inData, *outData;
cufftHandle plan;
cufftPlan1d(&plan, dataSize, CUFFT_C2C, 1);
cudaMallocManaged(&inData, dataSize * sizeof(cufftComplex));
cudaMallocManaged(&outData, dataSize * sizeof(cufftComplex));
cudaEvent_t start_before_memHtoD, start_kernel, stop_kernel,
stop_after_memDtoH;
cudaEventCreate(&start_kernel);
cudaEventCreate(&start_before_memHtoD);
cudaEventCreate(&stop_kernel);
cudaEventCreate(&stop_after_memDtoH);
setupWave(dataSize, inData);
cudaEventRecord(start_before_memHtoD);
cudaMemPrefetchAsync(inData, dataSize * sizeof(cufftComplex), 0);
cudaMemPrefetchAsync(outData, dataSize * sizeof(cufftComplex), 0);
cudaDeviceSynchronize();
cudaEventRecord(start_kernel);
cufftExecC2C(plan, inData, outData, CUFFT_FORWARD);
cudaEventRecord(stop_kernel);
cudaEventSynchronize(stop_kernel);
float sum = 0;
for (int i = 0; i < dataSize; i++) {
sum += outData[i].x + outData[i].y;
}
cudaEventRecord(stop_after_memDtoH);
cudaEventSynchronize(stop_after_memDtoH);
std::cout << "sum for UM is " << sum << std::endl;
float umTime = 0;
float overallUmTime = 0;
cudaEventElapsedTime(&umTime, start_kernel, stop_kernel);
cudaEventElapsedTime(&overallUmTime, start_before_memHtoD,
stop_after_memDtoH);
std::string resultString_um = std::to_string(dataSize) + " samples took " + std::to_string(umTime) + "ms, Overall: " + std::to_string(overallUmTime) + "\n";
std::cout << resultString_um;
cudaEventDestroy(start_kernel);
cudaEventDestroy(stop_kernel);
cudaFree(inData);
cudaFree(outData);
cudaEventDestroy(start_before_memHtoD);
cudaEventDestroy(stop_after_memDtoH);
cufftDestroy(plan);
cufftComplex *d_inData;
cufftComplex *d_outData;
inData = (cufftComplex*) (malloc(sizeof(cufftComplex) * dataSize));
outData = (cufftComplex*) (malloc(sizeof(cufftComplex) * dataSize));
cudaMalloc((void**) (&d_inData), dataSize * sizeof(cufftComplex));
cudaMalloc((void**) (&d_outData), dataSize * sizeof(cufftComplex));
//cufftHandle plan;
cufftPlan1d(&plan, dataSize, CUFFT_C2C, 1);
//cudaEvent_t start_before_memHtoD, start_kernel, stop_kernel,
// stop_after_memDtoH;
cudaEventCreate(&start_kernel);
cudaEventCreate(&start_before_memHtoD);
cudaEventCreate(&stop_kernel);
cudaEventCreate(&stop_after_memDtoH);
setupWave(dataSize, inData);
cudaEventRecord(start_before_memHtoD);
cudaMemcpy(d_inData, inData, dataSize * sizeof(cufftComplex),
cudaMemcpyHostToDevice);
cudaEventRecord(start_kernel);
cufftExecC2C(plan, d_inData, d_outData, CUFFT_FORWARD);
cudaEventRecord(stop_kernel);
cudaEventSynchronize(stop_kernel);
cudaMemcpy(outData, d_outData, dataSize * sizeof(cufftComplex),
cudaMemcpyDefault);
sum = 0;
for (int i = 0; i < dataSize; i++) {
sum += outData[i].x + outData[i].y;
}
cudaEventRecord(stop_after_memDtoH);
cudaEventSynchronize(stop_after_memDtoH);
std::cout << "sum for non-UM is " << sum << std::endl;
//float umTime = 0;
//float overallUmTime = 0;
cudaEventElapsedTime(&umTime, start_kernel, stop_kernel);
cudaEventElapsedTime(&overallUmTime, start_before_memHtoD,
stop_after_memDtoH);
resultString_um = std::to_string(dataSize) + " samples took "
+ std::to_string(umTime) + "ms, Overall: "
+ std::to_string(overallUmTime) + "\n";
std::cout << resultString_um;
free(outData);
free(inData);
cudaFree(d_outData);
cudaFree(d_inData);
cudaEventDestroy(start_kernel);
cudaEventDestroy(stop_kernel);
cudaEventDestroy(start_before_memHtoD);
cudaEventDestroy(stop_after_memDtoH);
cufftDestroy(plan);
}
$ nvcc -std=c++11 -arch=sm_60 -o t43 t43.cu -lcufft
$ ./t43
sum for UM is 3.35544e+07
33554432 samples took 3.520640ms, Overall: 221.909988
sum for non-UM is 3.35544e+07
33554432 samples took 3.456160ms, Overall: 278.099426
$

Dense-to-sparse and sparse-to-dense conversions using cuSPARSE

The following program tests dense-to-sparse conversion using cuSPARSE. It produces garbage in the first several lines of output. But if I move the lines marked with (2) to the place after the lines marked with (1), the program works fine. Can someone tell me what the reason could be?
EDIT:
To make the presentation clearer, I rewrote the program with thrust; the same issue persists.
EDIT:
As suggested by Robert, I changed it back to the version without thrust and added API-level error-checking code.
#include <iostream>
#include <cusparse_v2.h>
using std::cerr;
using std::cout;
using std::endl;
#define WRAP(x) do {x} while (0)
#define CHKcusparse(x) WRAP( \
cusparseStatus_t err = (x); \
if (err != CUSPARSE_STATUS_SUCCESS) { \
cerr << "Cusparse Error #" << int(err) << "\"TODO\" at Line " \
<< __LINE__ << " of " << __FILE__ << ": " << #x << endl; \
exit(1); \
} \
)
#define CHKcuda(x) WRAP( \
cudaError_t err = (x); \
if (err != cudaSuccess) { \
cerr << "Cuda Error #" << int(err) << ", \"" \
<< cudaGetErrorString(err) << "\" at Line " << __LINE__ \
<< " of " << __FILE__ << ": " << #x << endl; \
exit(1); \
} \
)
#define ALLOC(X, T, N) do { \
h##X = (T*) malloc(sizeof(T) * (N)); \
CHKcuda(cudaMalloc((void**)&d##X, sizeof(T) * (N))); \
} while(0)
int main() {
srand(100);
cusparseHandle_t g_cusparse_handle;
CHKcusparse(cusparseCreate(&g_cusparse_handle));
const int n = 100, in_degree = 10;
int nnz = n * in_degree, nn = n * n;
int *dnnz, *dridx, *dcols;
int *hnnz, *hridx, *hcols;
float *dvals, *dmat;
float *hvals, *hmat;
// (1) The number of non-zeros in each column.
ALLOC(nnz, int, n);
// The dense matrix.
ALLOC(mat, float, nn);
// The values in sparse matrix.
ALLOC(vals, float, nnz);
// (2) The row indices of the sparse matrix.
ALLOC(ridx, int, nnz);
// The column offsets of the sparse matrix.
ALLOC(cols, int, n+1);
// Fill and copy dense matrix and number of non-zeros.
for (int i = 0; i < nn; i++) {hmat[i] = rand();}
for (int i = 0; i < n; i++) {hnnz[i] = in_degree;}
CHKcuda(cudaMemcpyAsync(dnnz, hnnz, sizeof(int) * n, cudaMemcpyHostToDevice));
CHKcuda(cudaMemcpyAsync(dmat, hmat, sizeof(float) * nn, cudaMemcpyHostToDevice));
CHKcuda(cudaDeviceSynchronize());
// Perform dense to CSC format
cusparseMatDescr_t cspMatDesc;
CHKcusparse(cusparseCreateMatDescr(&cspMatDesc));
CHKcusparse(cusparseSdense2csc(
g_cusparse_handle, n, n, cspMatDesc, dmat, n,
dnnz, dvals, dridx, dcols
));
// Copy row indices back.
CHKcuda(cudaMemcpyAsync(hridx, dridx, sizeof(int) * nnz, cudaMemcpyDeviceToHost));
CHKcuda(cudaDeviceSynchronize());
CHKcusparse(cusparseDestroyMatDescr(cspMatDesc));
// Display row indices.
for (int i = 0; i < n; i++) {
for (int j = 0; j < in_degree; j++) {
std::cout << hridx[i * in_degree + j] << ", ";
}
std::cout << std::endl;
}
CHKcuda(cudaFree(dnnz));
CHKcuda(cudaFree(dvals));
CHKcuda(cudaFree(dridx));
CHKcuda(cudaFree(dcols));
CHKcuda(cudaFree(dmat));
free(hnnz);
free(hmat);
free(hvals);
free(hridx);
free(hcols);
return 0;
}
The basic problem is that you are passing internally inconsistent data to the dense-to-sparse routine. You are passing a dense matrix which has 100 non-zero elements per column, but you are telling cusparse that there are only 10 non-zero elements per column.
If you run your code with cuda-memcheck, you will see that there are errors coming out of cusparse.
For this code, you can fix the issue by changing your in_degree variable to 100.
For the general case, cusparse provides a convenient routine, cusparse<t>nnz(), to populate the number of non-zero elements per column correctly.
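A minimal sketch of that, using the names from the question's code (and assuming the matrix descriptor has already been created):
// Sketch: let cuSPARSE count the non-zeros per column instead of guessing.
int totalNnz = 0;
CHKcusparse(cusparseSnnz(
    g_cusparse_handle, CUSPARSE_DIRECTION_COLUMN, n, n,
    cspMatDesc, dmat, n,      // dense matrix and its leading dimension
    dnnz,                     // per-column non-zero counts (device)
    &totalNnz));              // total non-zero count (host)
// totalNnz, not n * in_degree, is the size to allocate for dvals and dridx.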
As already underlined by Robert Crovella, converting from dense to sparse can be performed effectively with cuSPARSE using the cusparse<t>nnz() and cusparse<t>dense2csr() routines. The reverse can be done with the cusparse<t>csr2dense() routine. Below is a fully worked example showing how to pass from dense to sparse and vice versa using cuSPARSE in CSR format.
cuSparseUtilities.cuh
#ifndef CUSPARSEUTILITIES_CUH
#define CUSPARSEUTILITIES_CUH
#include "cusparse_v2.h"
void setUpDescriptor(cusparseMatDescr_t &, cusparseMatrixType_t, cusparseIndexBase_t);
void dense2SparseD(const double * __restrict__ d_A_dense, int **d_nnzPerVector, double **d_A,
int **d_A_RowIndices, int **d_A_ColIndices, int &nnz, cusparseMatDescr_t descrA,
const cusparseHandle_t handle, const int Nrows, const int Ncols);
#endif
cuSparseUtilities.cu
#include "cuSparseUtilities.cuh"
#include "Utilities.cuh"
/*****************************/
/* SETUP DESCRIPTOR FUNCTION */
/*****************************/
void setUpDescriptor(cusparseMatDescr_t &descrA, cusparseMatrixType_t matrixType, cusparseIndexBase_t indexBase) {
cusparseSafeCall(cusparseCreateMatDescr(&descrA));
cusparseSafeCall(cusparseSetMatType(descrA, matrixType));
cusparseSafeCall(cusparseSetMatIndexBase(descrA, indexBase));
}
/********************************************************/
/* DENSE TO SPARSE CONVERSION FOR REAL DOUBLE PRECISION */
/********************************************************/
void dense2SparseD(const double * __restrict__ d_A_dense, int **d_nnzPerVector, double **d_A,
int **d_A_RowIndices, int **d_A_ColIndices, int &nnz, cusparseMatDescr_t descrA,
const cusparseHandle_t handle, const int Nrows, const int Ncols) {
const int lda = Nrows; // --- Leading dimension of dense matrix
gpuErrchk(cudaMalloc(&d_nnzPerVector[0], Nrows * sizeof(int)));
// --- Compute the number of nonzero elements per row and the total number of nonzero elements in the dense d_A_dense
cusparseSafeCall(cusparseDnnz(handle, CUSPARSE_DIRECTION_ROW, Nrows, Ncols, descrA, d_A_dense, lda, d_nnzPerVector[0], &nnz));
// --- Device side sparse matrix
gpuErrchk(cudaMalloc(&d_A[0], nnz * sizeof(double)));
gpuErrchk(cudaMalloc(&d_A_RowIndices[0], (Nrows + 1) * sizeof(int)));
gpuErrchk(cudaMalloc(&d_A_ColIndices[0], nnz * sizeof(int)));
cusparseSafeCall(cusparseDdense2csr(handle, Nrows, Ncols, descrA, d_A_dense, lda, d_nnzPerVector[0], d_A[0], d_A_RowIndices[0], d_A_ColIndices[0]));
}
kernel.cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <cusparse_v2.h>
#include "cuSparseUtilities.cuh"
#include "Utilities.cuh"
/********/
/* MAIN */
/********/
int main() {
cusparseHandle_t handle;
// --- Initialize cuSPARSE
cusparseSafeCall(cusparseCreate(&handle));
cusparseMatDescr_t descrA = 0;
/**************************/
/* SETTING UP THE PROBLEM */
/**************************/
const int Nrows = 5; // --- Number of rows
const int Ncols = 4; // --- Number of columns
const int N = Nrows;
// --- Host side dense matrix
double *h_A_dense = (double*)malloc(Nrows * Ncols * sizeof(*h_A_dense));
// --- Column-major storage
h_A_dense[ 0] = 0.4612f; h_A_dense[ 5] = -0.0006f; h_A_dense[10] = 1.3f; h_A_dense[15] = 0.0f;
h_A_dense[ 1] = 0.0f; h_A_dense[ 6] = 1.443f; h_A_dense[11] = 0.0f; h_A_dense[16] = 0.0f;
h_A_dense[ 2] = -0.0006f; h_A_dense[ 7] = 0.4640f; h_A_dense[12] = 0.0723f; h_A_dense[17] = 0.0f;
h_A_dense[ 3] = 0.3566f; h_A_dense[ 8] = 0.0723f; h_A_dense[13] = 0.7543f; h_A_dense[18] = 0.0f;
h_A_dense[ 4] = 0.f; h_A_dense[ 9] = 0.0f; h_A_dense[14] = 0.0f; h_A_dense[19] = 0.1f;
// --- Create device array and copy host array to it
double *d_A_dense; gpuErrchk(cudaMalloc(&d_A_dense, Nrows * Ncols * sizeof(double)));
gpuErrchk(cudaMemcpy(d_A_dense, h_A_dense, Nrows * Ncols * sizeof(*d_A_dense), cudaMemcpyHostToDevice));
/*******************************/
/* FROM DENSE TO SPARSE MATRIX */
/*******************************/
// --- Descriptor for sparse matrix A
setUpDescriptor(descrA, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_INDEX_BASE_ONE);
int nnz = 0; // --- Number of nonzero elements in dense matrix
int *d_nnzPerVector; // --- Device side number of nonzero elements per row
double *d_A; // --- Sparse matrix values - array of size nnz
int *d_A_RowIndices; // --- "Row indices"
int *d_A_ColIndices; // --- "Column indices"
dense2SparseD(d_A_dense, &d_nnzPerVector, &d_A, &d_A_RowIndices, &d_A_ColIndices, nnz, descrA, handle, Nrows, Ncols);
/*******************************************************/
/* CHECKING THE RESULTS FOR DENSE TO SPARSE CONVERSION */
/*******************************************************/
// --- Host side number of nonzero elements per row
int *h_nnzPerVector = (int *)malloc(Nrows * sizeof(int));
gpuErrchk(cudaMemcpy(h_nnzPerVector, d_nnzPerVector, Nrows * sizeof(int), cudaMemcpyDeviceToHost));
printf("Number of nonzero elements in dense matrix = %i\n\n", nnz);
for (int i = 0; i < Nrows; ++i) printf("Number of nonzero elements in row %i = %i \n", i, h_nnzPerVector[i]);
printf("\n");
// --- Host side sparse matrix
double *h_A = (double *)malloc(nnz * sizeof(double));
int *h_A_RowIndices = (int *)malloc((Nrows + 1) * sizeof(int));
int *h_A_ColIndices = (int *)malloc(nnz * sizeof(int));
gpuErrchk(cudaMemcpy(h_A, d_A, nnz * sizeof(double), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_A_RowIndices, d_A_RowIndices, (Nrows + 1) * sizeof(int), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_A_ColIndices, d_A_ColIndices, nnz * sizeof(int), cudaMemcpyDeviceToHost));
printf("\nOriginal matrix in CSR format\n\n");
for (int i = 0; i < nnz; ++i) printf("A[%i] = %f\n", i, h_A[i]); printf("\n");
printf("\n");
for (int i = 0; i < (Nrows + 1); ++i) printf("h_A_RowIndices[%i] = %i \n", i, h_A_RowIndices[i]); printf("\n");
for (int i = 0; i < nnz; ++i) printf("h_A_ColIndices[%i] = %i \n", i, h_A_ColIndices[i]);
/*******************************/
/* FROM SPARSE TO DENSE MATRIX */
/*******************************/
double *d_A_denseReconstructed; gpuErrchk(cudaMalloc(&d_A_denseReconstructed, Nrows * Ncols * sizeof(double)));
cusparseSafeCall(cusparseDcsr2dense(handle, Nrows, Ncols, descrA, d_A, d_A_RowIndices, d_A_ColIndices,
d_A_denseReconstructed, Nrows));
/*******************************************************/
/* CHECKING THE RESULTS FOR SPARSE TO DENSE CONVERSION */
/*******************************************************/
double *h_A_denseReconstructed = (double *)malloc(Nrows * Ncols * sizeof(double));
gpuErrchk(cudaMemcpy(h_A_denseReconstructed, d_A_denseReconstructed, Nrows * Ncols * sizeof(double), cudaMemcpyDeviceToHost));
printf("\nReconstructed dense matrix \n");
for (int m = 0; m < Nrows; m++) {
for (int n = 0; n < Ncols; n++)
printf("%f\t", h_A_denseReconstructed[n * Nrows + m]);
printf("\n");
}
return 0;
}

CUDA add-vectors memory access: intuitive explanation

I have the following code:
#include <iostream>
#include <cuda.h>
#include <cuda_runtime.h>
#include <ctime>
#include <vector>
#include <numeric>
float random_float(void)
{
return static_cast<float>(rand()) / RAND_MAX;
}
std::vector<float> add(float alpha, std::vector<float>& v1, std::vector<float>& v2 )
{ /*Do quick size check on vectors before proceeding*/
std::vector<float> result(v1.size());
for (unsigned int i = 0; i < result.size(); ++i)
{
result[i]=alpha*v1[i]+v2[i];
}
return result;
}
__global__ void Addloop( int N, float alpha, float* x, float* y ) {
int i;
int i0 = blockIdx.x*blockDim.x + threadIdx.x;
for( i = i0; i < N; i += blockDim.x*gridDim.x )
y[i] = alpha*x[i] + y[i];
/*
if ( i0 < N )
y[i0] = alpha*x[i0] + y[i0];
*/
}
int main( int argc, char** argv ) {
float alpha = 0.3;
// create array of 256k elements
int num_elements = 10;//1<<18;
// generate random input on the host
std::vector<float> h1_input(num_elements);
std::vector<float> h2_input(num_elements);
for(int i = 0; i < num_elements; ++i)
{
h1_input[i] = random_float();
h2_input[i] = random_float();
}
for (std::vector<float>::iterator it = h1_input.begin() ; it != h1_input.end(); ++it)
std::cout << ' ' << *it;
std::cout << '\n';
for (std::vector<float>::iterator it = h2_input.begin() ; it != h2_input.end(); ++it)
std::cout << ' ' << *it;
std::cout << '\n';
std::vector<float> host_result;//(std::vector<float> h1_input, std::vector<float> h2_input );
host_result = add( alpha, h1_input, h2_input );
for (std::vector<float>::iterator it = host_result.begin() ; it != host_result.end(); ++it)
std::cout << ' ' << *it;
std::cout << '\n';
// move input to device memory
float *d1_input = 0;
cudaMalloc((void**)&d1_input, sizeof(float) * num_elements);
cudaMemcpy(d1_input, &h1_input[0], sizeof(float) * num_elements, cudaMemcpyHostToDevice);
float *d2_input = 0;
cudaMalloc((void**)&d2_input, sizeof(float) * num_elements);
cudaMemcpy(d2_input, &h2_input[0], sizeof(float) * num_elements, cudaMemcpyHostToDevice);
Addloop<<<1,3>>>( num_elements, alpha, d1_input, d2_input );
// copy the result back to the host
std::vector<float> device_result(num_elements);
cudaMemcpy(&device_result[0], d2_input, sizeof(float) * num_elements, cudaMemcpyDeviceToHost);
for (std::vector<float>::iterator it = device_result.begin() ; it != device_result.end(); ++it)
std::cout << ' ' << *it;
std::cout << '\n';
cudaFree(d1_input);
cudaFree(d2_input);
h1_input.clear();
h2_input.clear();
device_result.clear();
std::cout << "DONE! \n";
getchar();
return 0;
}
I am trying to understand GPU memory access. The kernel, for reasons of simplicity, is launched as Addloop<<<1,3>>>. I am trying to understand how this code works by imagining the for loops running on the GPU as instances. More specifically, I imagine the following instances, but they do not help.
Instance 1:
for( i = 0; i < N; i += 3*1 ) // ( i += 0*1 --> i += 3*1 after Eric's comment)
y[i] = alpha*x[i] + y[i];
Instance 2:
for( i = 1; i < N; i += 3*1 )
y[i] = alpha*x[i] + y[i];
Instance 3:
for( i = 3; i < N; i += 3*1 )
y[i] = alpha*x[i] + y[i];
Looking inside each loop, I cannot make sense of the logic of adding two vectors. Can someone help?
The reason I am adopting this logic of instances is that it works well for the commented-out code inside the kernel.
If these thoughts are correct, what would the instances be in case we have multiple blocks inside the grid? In other words, what would the i values and the update rates (+= update rate) be in some examples?
PS: The kernel code is borrowed from here.
UPDATE:
After Eric's answer, I think the execution for N = 15 (i.e. the number of elements) goes like this (correct me if I am wrong):
For instance 1 above, i = 0, 3, 6, 9, 12, which computes the corresponding y[i] values.
For instance 2 above, i = 1, 4, 7, 10, 13, which computes the corresponding remaining y[i] values.
For instance 3 above, i = 2, 5, 8, 11, 14, which computes the rest of the y[i] values.
Your blockDim.x is 3 and gridDim.x is 1, according to your setup <<<1,3>>>. So in each thread (you call it an instance), the update should be i += 3*1.
update
With the for loop you can compute 15 elements using only 3 threads. In general, you can use a limited number of threads to do an "infinite" amount of work, and more work per thread can improve performance by reducing launch overhead and hiding instruction stalls.
Another advantage is that you can use a fixed number of threads/blocks to do work of various sizes, which requires less tuning.
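To make the multi-block case concrete (a worked sketch going beyond the original answer): with a launch of Addloop<<<2,3>>>, the stride becomes blockDim.x * gridDim.x = 6, and each of the six threads starts at its own global index:
// Sketch: grid-stride indices for Addloop<<<2,3>>>(15, ...), i.e. N = 15.
// blockDim.x = 3, gridDim.x = 2, so the stride is 3 * 2 = 6.
//
//   block 0, thread 0: i0 = 0 -> i = 0, 6, 12
//   block 0, thread 1: i0 = 1 -> i = 1, 7, 13
//   block 0, thread 2: i0 = 2 -> i = 2, 8, 14
//   block 1, thread 0: i0 = 3 -> i = 3, 9
//   block 1, thread 1: i0 = 4 -> i = 4, 10
//   block 1, thread 2: i0 = 5 -> i = 5, 11
//
// Every element of y is visited exactly once, regardless of how N
// compares to the total thread count.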