nVidia Cuda nppiResize_32f_C1R works OK on grayscale 1 x 32f, but nppiResize_32f_C3R returns garbage. Clearly, a work around would be to call this routine 3 times by first de-interleaving the data as planar R, G, B, but I was expecting to be able to run it through in a single pass. nVidia has a lot of example code for single plane image processing, but a scant amount for interleaved color planes, so I'm turning to stackoverflow for help. I don't know how the stride is computed, but I understand that the stride is the image width times the number of bytes per column index. So in my case - with no padded lines - it should be width 32f x 3 for RGB.
Tried different strides/pitches in cudaMemcpy2D(). Can't get a workable solution for the color RGB code. Compiles & runs OK, no errors. The first section is for Grayscale (works OK). The second section is RGB (garbage).
// nppiResize using 2D aligned allocations
#include <Exceptions.h>
#include <cuda_runtime.h>
#include <npp.h>
#include <nppi.h>
#include <nppdefs.h>
#define CUDA_CALL(call) do { cudaError_t cuda_error = call; if(cuda_error != cudaSuccess) { std::cerr << "CUDA Error: " << cudaGetErrorString(cuda_error) << ", " << __FILE__ << ", line " << __LINE__ << std::endl; return(NULL);} } while(0)
float* decimate_cuda(float* readbuff, uint32_t nSrcH, uint32_t nSrcW, uint32_t nDstH, uint32_t nDstW, uint8_t byteperpixel)
{
if (byteperpixel == 1){ // source : Grayscale, 1 x 32f
size_t srcStep;
size_t dstStep;
NppiSize oSrcSize = {nSrcW, nSrcH};
NppiRect oSrcROI = {0, 0, nSrcW, nSrcH};
float *devSrc;
CUDA_CALL(cudaMallocPitch((void**)&devSrc, &srcStep, nSrcW * sizeof(float), nSrcH));
CUDA_CALL(cudaMemcpy2D((void**)devSrc, srcStep,(void**)readbuff, nSrcW * sizeof(Npp32f), nSrcW * sizeof(Npp32f), nSrcH, cudaMemcpyHostToDevice));
NppiSize oDstSize = {nDstW, nDstH};
NppiRect oDstROI = {0, 0, nDstW, nDstH};
float *devDst;
CUDA_CALL(cudaMallocPitch((void**)&devDst, &dstStep, nDstW * sizeof(float), nDstH));
NppStatus result = nppiResize_32f_C1R(devSrc,srcStep,oSrcSize,oSrcROI,devDst,dstStep,oDstSize,oDstROI,NPPI_INTER_SUPER);
if (result != NPP_SUCCESS) {
std::cerr << "Unable to run decimate_cuda, error " << result << std::endl;
}
Npp64s writesize;
Npp32f *hostDst;
writesize = (Npp64s) nDstW * nDstH; // Y
if(NULL == (hostDst = (Npp32f *)malloc(writesize * sizeof(Npp32f)))){
printf("Error : Unable to alloctae hostDst in decimate_cuda, exiting...\n");
exit(1);
}
CUDA_CALL(cudaMemcpy2D(hostDst, nDstW * sizeof(Npp32f),(void**)devDst, dstStep, nDstW * sizeof(Npp32f),nDstH, cudaMemcpyDeviceToHost));
CUDA_CALL(cudaFree(devSrc));
CUDA_CALL(cudaFree(devDst));
return(hostDst);
} // source : Grayscale 1 x 32f, YYYY...
else if (byteperpixel == 3){ // source : 3 x 32f interleaved RGBRGBRGB...
size_t srcStep;
size_t dstStep;
// rows = height; columns = width
NppiSize oSrcSize = {nSrcW, nSrcH};
NppiRect oSrcROI = {0, 0, nSrcW, nSrcH};
float *devSrc;
CUDA_CALL(cudaMallocPitch((void**)&devSrc, &srcStep, 3 * nSrcW * sizeof(float), nSrcH));
CUDA_CALL(cudaMemcpy2D((void**)devSrc, srcStep, (void**)readbuff, 3 * nSrcW * sizeof(Npp32f), nSrcW * sizeof(Npp32f), nSrcH, cudaMemcpyHostToDevice));
NppiSize oDstSize = {nDstW, nDstH};
NppiRect oDstROI = {0, 0, nDstW, nDstH};
float *devDst;
CUDA_CALL(cudaMallocPitch((void**)&devDst, &dstStep, 3 * nDstW * sizeof(float), nDstH));
NppStatus result = nppiResize_32f_C3R((devSrc,srcStep,oSrcSize,oSrcROI,devDst,dstStep,oDstSize,oDstROI,NPPI_INTER_SUPER);
if (result != NPP_SUCCESS) {
std::cerr << "Unable to run decimate_cuda, error " << result << std::endl;
}
Npp64s writesize;
Npp32f *hostDst;
writesize = (Npp64s) nDstW * nDstH * 3; // RGB
if(NULL == (hostDst = (Npp32f *)malloc(writesize * sizeof(Npp32f)))){
printf("Error : Unable to alloctae hostDst in decimate_cuda, exiting...\n");
exit(1);
}
CUDA_CALL(cudaMemcpy2D((void**)hostDst, nDstW * sizeof(Npp32f), (void**)devDst, dstStep, nDstW * sizeof(Npp32f),nDstH, cudaMemcpyDeviceToHost));
CUDA_CALL(cudaFree(devSrc));
CUDA_CALL(cudaFree(devDst));
return(hostDst);
} // source - 3 x 32f, interleaved RGBRGBRGB...
return(0);
}
You had various errors in your calls to cudaMemcpy2D (both of them, in the 3 channel code). This code seems to work for me:
$ cat t1521.cu
#include <cuda_runtime.h>
#include <npp.h>
#include <nppi.h>
#include <nppdefs.h>
#include <iostream>
#include <stdint.h>
#include <stdio.h>
#define CUDA_CALL(call) do { cudaError_t cuda_error = call; if(cuda_error != cudaSuccess) { std::cerr << "CUDA Error: " << cudaGetErrorString(cuda_error) << ", " << __FILE__ << ", line " << __LINE__ << std::endl; return(NULL);} } while(0)
using namespace std;
float* decimate_cuda(float* readbuff, uint32_t nSrcH, uint32_t nSrcW, uint32_t nDstH, uint32_t nDstW, uint8_t byteperpixel)
{
if (byteperpixel == 1){ // source : Grayscale, 1 x 32f
size_t srcStep;
size_t dstStep;
NppiSize oSrcSize = {nSrcW, nSrcH};
NppiRect oSrcROI = {0, 0, nSrcW, nSrcH};
float *devSrc;
CUDA_CALL(cudaMallocPitch((void**)&devSrc, &srcStep, nSrcW * sizeof(float), nSrcH));
CUDA_CALL(cudaMemcpy2D(devSrc, srcStep,readbuff, nSrcW * sizeof(Npp32f), nSrcW * sizeof(Npp32f), nSrcH, cudaMemcpyHostToDevice));
NppiSize oDstSize = {nDstW, nDstH};
NppiRect oDstROI = {0, 0, nDstW, nDstH};
float *devDst;
CUDA_CALL(cudaMallocPitch((void**)&devDst, &dstStep, nDstW * sizeof(float), nDstH));
NppStatus result = nppiResize_32f_C1R(devSrc,srcStep,oSrcSize,oSrcROI,devDst,dstStep,oDstSize,oDstROI,NPPI_INTER_SUPER);
if (result != NPP_SUCCESS) {
std::cerr << "Unable to run decimate_cuda, error " << result << std::endl;
}
Npp64s writesize;
Npp32f *hostDst;
writesize = (Npp64s) nDstW * nDstH; // Y
if(NULL == (hostDst = (Npp32f *)malloc(writesize * sizeof(Npp32f)))){
printf("Error : Unable to alloctae hostDst in decimate_cuda, exiting...\n");
exit(1);
}
CUDA_CALL(cudaMemcpy2D(hostDst, nDstW * sizeof(Npp32f),devDst, dstStep, nDstW * sizeof(Npp32f),nDstH, cudaMemcpyDeviceToHost));
CUDA_CALL(cudaFree(devSrc));
CUDA_CALL(cudaFree(devDst));
return(hostDst);
} // source : Grayscale 1 x 32f, YYYY...
else if (byteperpixel == 3){ // source : 3 x 32f interleaved RGBRGBRGB...
size_t srcStep;
size_t dstStep;
// rows = height; columns = width
NppiSize oSrcSize = {nSrcW, nSrcH};
NppiRect oSrcROI = {0, 0, nSrcW, nSrcH};
float *devSrc;
CUDA_CALL(cudaMallocPitch((void**)&devSrc, &srcStep, 3 * nSrcW * sizeof(float), nSrcH));
CUDA_CALL(cudaMemcpy2D(devSrc, srcStep,readbuff, 3 * nSrcW * sizeof(Npp32f), 3*nSrcW * sizeof(Npp32f), nSrcH, cudaMemcpyHostToDevice));
NppiSize oDstSize = {nDstW, nDstH};
NppiRect oDstROI = {0, 0, nDstW, nDstH};
float *devDst;
CUDA_CALL(cudaMallocPitch((void**)&devDst, &dstStep, 3 * nDstW * sizeof(float), nDstH));
NppStatus result = nppiResize_32f_C3R(devSrc,srcStep,oSrcSize,oSrcROI,devDst,dstStep,oDstSize,oDstROI,NPPI_INTER_SUPER);
if (result != NPP_SUCCESS) {
std::cerr << "Unable to run decimate_cuda, error " << result << std::endl;
}
Npp64s writesize;
Npp32f *hostDst;
writesize = (Npp64s) nDstW * nDstH * 3; // RGB
if(NULL == (hostDst = (Npp32f *)malloc(writesize * sizeof(Npp32f)))){
printf("Error : Unable to alloctae hostDst in decimate_cuda, exiting...\n");
exit(1);
}
CUDA_CALL(cudaMemcpy2D(hostDst, nDstW*3 * sizeof(Npp32f), devDst, dstStep, nDstW*3 * sizeof(Npp32f),nDstH, cudaMemcpyDeviceToHost));
CUDA_CALL(cudaFree(devSrc));
CUDA_CALL(cudaFree(devDst));
return(hostDst);
} // source - 3 x 32f, interleaved RGBRGBRGB...
return(0);
}
int main(){
uint32_t nSrcH = 480;
uint32_t nSrcW = 640;
uint8_t byteperpixel = 3;
float *readbuff = (float *)malloc(nSrcW*nSrcH*byteperpixel*sizeof(float));
for (int i = 0; i < nSrcH*nSrcW; i++){
readbuff [i*3+0] = 1.0f;
readbuff [i*3+1] = 2.0f;
readbuff [i*3+2] = 3.0f;}
uint32_t nDstW = nSrcW/2;
uint32_t nDstH = nSrcH/2;
float *res = decimate_cuda(readbuff, nSrcH, nSrcW, nDstH, nDstW, byteperpixel);
for (int i = 0; i < nDstH*nDstW*byteperpixel; i++) if (res[i] != ((i%3)+1.0f)) {std::cout << "error at: " << i << std::endl; return 0;}
return 0;
}
$ nvcc -o t1521 t1521.cu -lnppig
$ cuda-memcheck ./t1521
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$
In the future, its convenient if you provide a complete code, just as I have done in my answer. In fact SO requires this, see item 1 here.
By the way, the use of pitched allocations on the device, here, which introduce complexity that you were not able to work your way through, should really be unnecessary both for correctness and performance, using any modern GPU and CUDA version. Ordinary linear/flat allocations, where pitch==width, should be just fine.
Related
I am trying to utilize CUDA Graphs for the computation of Fast Fourier Transform (FFT) using CUDA's cuFFT APIs.
I modified the sample FFT code present on Github into the following FFT code using CUDA Graphs:
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "device_functions.h"
#include <iostream>
#include <cufft.h>
// Complex data type
typedef float2 Complex;
static __device__ inline Complex ComplexScale(Complex, float);
static __device__ inline Complex ComplexMul(Complex, Complex);
static __global__ void ComplexPointwiseMulAndScale(Complex*, const Complex*, int, float);
#define CUDA_CALL( call ) \
{ \
cudaError_t result = call; \
if ( cudaSuccess != result ) \
std::cerr << "CUDA error " << result << " in " << __FILE__ << ":" << __LINE__ << ": " << cudaGetErrorString( result ) << " (" << #call << ")" << std::endl; \
}
#define CUDA_FFT_CALL( call ) \
{ \
cufftResult result = call; \
if ( CUFFT_SUCCESS != result ) \
std::cerr << "FFT error " << result << " in " << __FILE__ << ":" << __LINE__ << ": " << result << std::endl; \
}
// The filter size is assumed to be a number smaller than the signal size
#define SIGNAL_SIZE 10
#define FILTER_KERNEL_SIZE 4
static __device__ inline Complex ComplexScale(Complex a, float s)
{
Complex c;
c.x = s * a.x;
c.y = s * a.y;
return c;
}
// Complex multiplication
static __device__ inline Complex ComplexMul(Complex a, Complex b)
{
Complex c;
c.x = a.x * b.x - a.y * b.y;
c.y = a.x * b.y + a.y * b.x;
return c;
}
// Complex pointwise multiplication
static __global__ void ComplexPointwiseMulAndScale(Complex* a, const Complex* b, int size, float scale)
{
const int numThreads = blockDim.x * gridDim.x;
const int threadID = blockIdx.x * blockDim.x + threadIdx.x;
for (int i = threadID; i < size; i += numThreads)
{
a[i] = ComplexScale(ComplexMul(a[i], b[i]), scale);
}
}
int main()
{
printf("[simpleCUFFT] is starting...\n");
int minRadius = FILTER_KERNEL_SIZE / 2;
int maxRadius = FILTER_KERNEL_SIZE - minRadius;
int padded_data_size = SIGNAL_SIZE + maxRadius;
// Allocate HOST Memories
Complex* h_signal = (Complex*)malloc(sizeof(Complex) * SIGNAL_SIZE); //host signal
Complex* h_filter_kernel = (Complex*)malloc(sizeof(Complex) * FILTER_KERNEL_SIZE); //host filter
Complex* h_padded_signal= (Complex*)malloc(sizeof(Complex) * padded_data_size); // host Padded signal
Complex* h_padded_filter_kernel = (Complex*)malloc(sizeof(Complex) * padded_data_size); // host Padded filter kernel
Complex* h_convolved_signal = (Complex*)malloc(sizeof(Complex) * padded_data_size); // to store convolution RESULTS
memset(h_convolved_signal, 0, padded_data_size * sizeof(Complex));
//Allocate DEVICE Memories
Complex* d_signal; //device signal
cudaMalloc((void**)&d_signal, sizeof(Complex) * padded_data_size);
Complex* d_filter_kernel;
cudaMalloc((void**)&d_filter_kernel, sizeof(Complex) * padded_data_size); //device kernel
//CUDA GRAPH
bool graphCreated = false;
cudaGraph_t graph;
cudaGraphExec_t instance;
cudaStream_t stream;
cudaStreamCreate(&stream);
// CUFFT plan
cufftHandle plan;
CUDA_FFT_CALL(cufftPlan1d(&plan, padded_data_size, CUFFT_C2C, 1));
cufftSetStream(plan, stream); // bind plan to the stream
// Initalize the memory for the signal
for (unsigned int i = 0; i < SIGNAL_SIZE; ++i)
{
h_signal[i].x = rand() / (float)RAND_MAX;
h_signal[i].y = 0;
}
// Initalize the memory for the filter
for (unsigned int i = 0; i < FILTER_KERNEL_SIZE; ++i)
{
h_filter_kernel[i].x = rand() / (float)RAND_MAX;
h_filter_kernel[i].y = 0;
}
//REPEAT 3 times
int nRepeatationsNeeded = 3;
for (int repeatations = 0; repeatations < nRepeatationsNeeded; repeatations++)
{
std::cout << "\n\n" << "Repeatation ------ " << repeatations << std::endl;
if (!graphCreated)
{
//Start Graph Recording --------------!!!!!!!!
CUDA_CALL(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
//Pad Data
CUDA_CALL(cudaMemcpyAsync(h_padded_signal + 0, h_signal, SIGNAL_SIZE * sizeof(Complex), cudaMemcpyHostToHost, stream));
memset(h_padded_signal + SIGNAL_SIZE, 0, (padded_data_size - SIGNAL_SIZE) * sizeof(Complex));
//CUDA_CALL(cudaMemsetAsync(h_padded_signal + SIGNAL_SIZE, 0, (padded_data_size - SIGNAL_SIZE) * sizeof(Complex), stream));
CUDA_CALL(cudaMemcpyAsync(h_padded_filter_kernel + 0, h_filter_kernel + minRadius, maxRadius * sizeof(Complex), cudaMemcpyHostToHost, stream));
/*CUDA_CALL(cudaMemsetAsync(h_padded_filter_kernel + maxRadius, 0, (padded_data_size - FILTER_KERNEL_SIZE) * sizeof(Complex), stream));*/
memset(h_padded_filter_kernel + maxRadius, 0, (padded_data_size - FILTER_KERNEL_SIZE) * sizeof(Complex));
CUDA_CALL(cudaMemcpyAsync(h_padded_filter_kernel + padded_data_size - minRadius, h_filter_kernel, minRadius * sizeof(Complex), cudaMemcpyHostToHost, stream));
// MemCpy H to D
CUDA_CALL(cudaMemcpyAsync(d_signal, h_padded_signal, sizeof(Complex) * padded_data_size, cudaMemcpyHostToDevice, stream)); //Signal
CUDA_CALL(cudaMemcpyAsync(d_filter_kernel, h_padded_filter_kernel, sizeof(Complex) * padded_data_size, cudaMemcpyHostToDevice, stream)); //Kernel
//COMPUTE FFT
CUDA_FFT_CALL(cufftExecC2C(plan, (cufftComplex*)d_signal, (cufftComplex*)d_signal, CUFFT_FORWARD)); // Transform signal
CUDA_FFT_CALL(cufftExecC2C(plan, (cufftComplex*)d_filter_kernel, (cufftComplex*)d_filter_kernel, CUFFT_FORWARD)); // Transform kernel
ComplexPointwiseMulAndScale << <64, 1, 0, stream >> > (d_signal, d_filter_kernel, padded_data_size, 1.0f / padded_data_size); // Multiply and normalize
CUDA_CALL(cudaGetLastError());
CUDA_FFT_CALL(cufftExecC2C(plan, (cufftComplex*)d_signal, (cufftComplex*)d_signal, CUFFT_INVERSE)); // Transform signal back
// Copy device memory to host
CUDA_CALL(cudaMemcpyAsync(h_convolved_signal, d_signal, sizeof(Complex) * padded_data_size, cudaMemcpyDeviceToHost, stream));
//END Graph Recording
CUDA_CALL(cudaStreamEndCapture(stream, &graph));
CUDA_CALL(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));
graphCreated = true;
}
else
{
CUDA_CALL(cudaGraphLaunch(instance, stream));
CUDA_CALL(cudaStreamSynchronize(stream));
}
//verify results
for (int i = 0; i < SIGNAL_SIZE; i++)
std::cout << "index: " << i << ", fft: " << h_convolved_signal[i].x << std::endl;
}
//Destroy CUFFT context
cufftDestroy(plan);
// cleanup memory
cudaStreamDestroy(stream);
free(h_signal);
free(h_filter_kernel);
free(h_padded_signal);
free(h_padded_filter_kernel);
cudaFree(d_signal);
cudaFree(d_filter_kernel);
return 0;
}
PROBLEM: The Output of the above program is below, in which it can be seen that the values of the result are also ZEROS for the first iteration. How can I resolve this?
The results are zero for the first iteration, because for the first iteration, the work is all issued in capture mode.
In capture mode, no CUDA work actually gets done. From here:
When a stream is being captured, work launched into the stream is not enqueued for execution.
I pointed you to this same area of the documentation in a comment to your last question. You might wish to read the entire programming guide section on graphs, and there are also blogs available.
Would like to pass the content from jpeg file (3 byte RGB) as a texture to a CUDA kernel but getting compilation error
a pointer to a bound function may only be used to call the function
on value.x = tex2D<unsigned char>(_texture, u, v) * 1.0f / 255.0f; and the rest of the tex2D() calls.
What may be the reason(s) for the error?
Host side code where the texture is created:
cudaArray* cudaArray;
cudaTextureObject_t textureObject{};
{
const static uint32_t bytesPerPixel{ 3u };
uint8_t* pHostData;
int32_t textureWidth, textureHeight;
uint32_t bytesPerScanline;
cudaChannelFormatDesc channelFormatDesc;
cudaResourceDesc resourceDesc{};
cudaTextureDesc textureDesc{};
int32_t componentsPerPixel = bytesPerPixel;
pHostData = stbi_load(textureFilename.c_str(), &textureWidth, &textureHeight, &componentsPerPixel, componentsPerPixel);
if (nullptr == pHostData) {
std::cerr << "ERROR: Could not load texture image file '" << textureFilename << std::endl;
return;
}
bytesPerScanline = bytesPerPixel * textureWidth;
channelFormatDesc = cudaCreateChannelDesc<uint8_t>();
checkCudaErrors(cudaMallocArray(&cudaArray, &channelFormatDesc, bytesPerScanline, textureHeight));
checkCudaErrors(cudaMemcpyToArray(cudaArray, 0, 0, pHostData, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyHostToDevice));
resourceDesc.resType = cudaResourceTypeArray;
resourceDesc.res.array.array = cudaArray;
textureDesc.normalizedCoords = true;
textureDesc.filterMode = cudaFilterModePoint;
textureDesc.addressMode[0] = cudaAddressModeWrap;
textureDesc.addressMode[1] = cudaAddressModeWrap;
textureDesc.readMode = cudaReadModeElementType;
checkCudaErrors(cudaCreateTextureObject(&textureObject, &resourceDesc, &textureDesc, nullptr));
STBI_FREE(pHostData);
}
Device side code:
class imageTexture {
public:
__device__ imageTexture(cudaTextureObject_t tex) :_texture(tex) {}
__device__ virtual vec3 value(float u, float v, const vec3& p) const {
vec3 value;
u *= 3;
value.x = tex2D<unsigned char>(_texture, u, v) * 1.0f / 255.0f;
u++;
value.y = tex2D<unsigned char>(_texture, u, v) * 1.0f / 255.0f;
u++;
value.z = tex2D<unsigned char>(_texture, u, v) * 1.0f / 255.0f;
return value;
}
private:
cudaTextureObject_t _texture;
};
Changed device side function, but the error persists:
class imageTexture :public textureX {
public:
__device__ imageTexture(cudaTextureObject_t tex) :_text(tex) {}
__device__ virtual vec3 value(float u, float v, const vec3& p) const override {
vec3 val;
u *= 3;
val.x = tex2D<unsigned char>(_text, u, v) * 1.0f / 255.0f;
u++;
val.y = tex2D<unsigned char>(_text, u, v) * 1.0f / 255.0f;
u++;
val.z = tex2D<unsigned char>(_text, u, v) * 1.0f / 255.0f;
return val;
}
private:
cudaTextureObject_t _text;
};
I have written a new test program and planning to build up from there. The idea is have each thread read 3 values from texture and write it back to a buffer. Only the first triplet is correct. Is there anything inconsistent in my texture lookups with this:
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <curand_kernel.h>
#include <iostream>
#include <string>
#include <chrono>
#include <cmath>
#include <ctime>
#include <cstdint>
#include <stdio.h>
#define checkCudaErrors(val) check_cuda( (val), #val, __FILE__, __LINE__ )
void check_cuda(cudaError_t result, char const* const func, const char* const file, int const line) {
if (result) {
std::cerr << "CUDA error = " << static_cast<unsigned int>(result) << " at " << file << ":" << line << " " << func << std::endl;
std::cerr << cudaGetErrorString(result) << std::endl;
// Make sure we call CUDA Device Reset before exiting
cudaDeviceReset();
exit(99);
}
}
__global__ void texCheck(uint32_t width, uint32_t height, uint8_t* pOutput, cudaTextureObject_t textureObject) {
uint32_t x = blockIdx.x * blockDim.x + threadIdx.x;
uint32_t y = blockIdx.y * blockDim.y + threadIdx.y;
if ((x < width) && (y < height)) {
float u = (float)x / (float)width;
float v = (float)y / (float)height;
pOutput[y * (3 * width) + (3 * x)] = tex2D<uint8_t>(textureObject, 3*u, v);
pOutput[y * (3 * width) + (3 * x) + 1] = tex2D<uint8_t>(textureObject, 3*u + 1, v);
pOutput[y * (3 * width) + (3 * x) + 2] = tex2D<uint8_t>(textureObject, 3*u + 2, v);
}
}
void cudaTex() {
const uint32_t bytesPerPixel{ 3u };
const uint32_t textureWidth = 1024u;
const uint32_t textureHeight = 512u;
uint32_t bytesPerScanline;
bytesPerScanline = bytesPerPixel * textureWidth;
cudaChannelFormatDesc channelFormatDesc = cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsigned);
cudaArray* cudaArray;
checkCudaErrors(cudaMallocArray(&cudaArray, &channelFormatDesc, bytesPerScanline, textureHeight));
uint8_t* pHostData = new uint8_t[bytesPerScanline * textureHeight];
std::srand(std::time(nullptr));
for (uint64_t idx = 0ull; idx < bytesPerScanline * textureHeight; idx++)
pHostData[idx] = std::rand();
checkCudaErrors(cudaMemcpyToArray(cudaArray, 0, 0, pHostData, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyHostToDevice));
cudaResourceDesc resourceDesc{};
resourceDesc.resType = cudaResourceTypeArray;
resourceDesc.res.array.array = cudaArray;
cudaTextureDesc textureDesc{};
textureDesc.normalizedCoords = false;
textureDesc.filterMode = cudaFilterModePoint;
textureDesc.addressMode[0] = cudaAddressModeWrap;
textureDesc.addressMode[1] = cudaAddressModeWrap;
textureDesc.readMode = cudaReadModeElementType;
cudaTextureObject_t textureObject{};
checkCudaErrors(cudaCreateTextureObject(&textureObject, &resourceDesc, &textureDesc, nullptr));
dim3 dimBlock(8u, 8u, 1u);
dim3 dimGrid(textureWidth / dimBlock.x, textureHeight / dimBlock.y, 1u);
uint8_t* dOutput{ nullptr };
checkCudaErrors(cudaMalloc((void**)&dOutput, bytesPerScanline * textureHeight * sizeof(uint8_t)));
texCheck << < dimGrid, dimBlock >> > (textureWidth, textureHeight, dOutput, textureObject);
checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaDeviceSynchronize());
uint8_t* hOutput = new uint8_t[bytesPerScanline * textureHeight];
checkCudaErrors(cudaMemcpy(hOutput, dOutput, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyDeviceToHost));
for (uint64_t idx = 0ull; idx < textureHeight; idx++) {
for (uint64_t jdx = 0ull; jdx < bytesPerScanline; jdx++) {
if (hOutput[jdx] != pHostData[jdx])
std::cerr << "Mismatch # " << idx << " " << jdx << " Expected " << (uint32_t)pHostData[jdx] << " Received " << (uint32_t)hOutput[jdx] << std::endl;
}
hOutput += bytesPerScanline;
pHostData += bytesPerScanline;
}
checkCudaErrors(cudaDestroyTextureObject(textureObject));
checkCudaErrors(cudaFree(dOutput));
checkCudaErrors(cudaFreeArray(cudaArray));
delete[] hOutput;
delete[] pHostData;
}
int main() {
cudaTex();
return 0;
}
Switching to integer coordinated in the kernel solved the problem
Resolution of the Original Problem
It turned out that the
a pointer to a bound function may only be used to call the function
error was caused by vec3 class having a getter function x() and not a member variable named x. So the code was trying use the getter function as an l-value!!!
There are several problems with the code you have now posted:
after the discussion in the comments, hopefully you can figure out what is wrong with this line of code:
cudaArray* cudaArray;
Your kernel code appears to be trying pass normalized float coordinates but doing it incorrectly. There are several issues here: your x normalization is considering textureWidth but it should be done over 3*textureWidth (i.e. bytesPerScanline). Although you are calling the width of your texture textureWidth, really it is 3*textureWidth. Also, texturing in this fashion is typically offset by 0.5. Finally, you are doing this:
textureDesc.normalizedCoords = false;
but if you want to use float coordinates (seems to be what you want) you shoudl do:
textureDesc.normalizedCoords = true;
After you fix all that, you'll run into a non-CUDA issue. You're modifying these pointers:
hOutput += bytesPerScanline;
pHostData += bytesPerScanline;
then trying to delete them after modification:
delete[] hOutput;
delete[] pHostData;
that won't work correctly.
Here's a modified code that has the above issues addressed, it seems to run correctly for me:
$ cat t7.cu
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <curand_kernel.h>
#include <iostream>
#include <string>
#include <chrono>
#include <cmath>
#include <ctime>
#include <cstdint>
#include <stdio.h>
#define checkCudaErrors(val) check_cuda( (val), #val, __FILE__, __LINE__ )
void check_cuda(cudaError_t result, char const* const func, const char* const file, int const line) {
if (result) {
std::cerr << "CUDA error = " << static_cast<unsigned int>(result) << " at " << file << ":" << line << " " << func << std::endl;
std::cerr << cudaGetErrorString(result) << std::endl;
// Make sure we call CUDA Device Reset before exiting
cudaDeviceReset();
exit(99);
}
}
__global__ void texCheck(uint32_t width, uint32_t height, uint8_t* pOutput, cudaTextureObject_t textureObject) {
uint32_t x = blockIdx.x * blockDim.x + threadIdx.x;
uint32_t y = blockIdx.y * blockDim.y + threadIdx.y;
const float pix_offset = 0.5;
if ((x < width) && (y < height)) {
float u = (float)(3*x+pix_offset) / (float)(3*width);
float v = (float)y / (float)height;
pOutput[y * (3 * width) + (3 * x)] = tex2D<uint8_t>(textureObject, u, v);
u = (float)(3*x+1+pix_offset) / (float)(3*width);
pOutput[y * (3 * width) + (3 * x) + 1] = tex2D<uint8_t>(textureObject, u, v);
u = (float)(3*x+2+pix_offset) / (float)(3*width);
pOutput[y * (3 * width) + (3 * x) + 2] = tex2D<uint8_t>(textureObject, u, v);
}
}
void cudaTex() {
const uint32_t bytesPerPixel{ 3u };
const uint32_t textureWidth = 1024u;
const uint32_t textureHeight = 512u;
uint32_t bytesPerScanline;
bytesPerScanline = bytesPerPixel * textureWidth;
cudaChannelFormatDesc channelFormatDesc = cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsigned);
cudaArray* cArray;
checkCudaErrors(cudaMallocArray(&cArray, &channelFormatDesc, bytesPerScanline, textureHeight));
uint8_t* pHostData = new uint8_t[bytesPerScanline * textureHeight];
std::srand(std::time(nullptr));
for (uint64_t idx = 0ull; idx < bytesPerScanline * textureHeight; idx++)
pHostData[idx] = std::rand();
checkCudaErrors(cudaMemcpyToArray(cArray, 0, 0, pHostData, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyHostToDevice));
cudaResourceDesc resourceDesc{};
resourceDesc.resType = cudaResourceTypeArray;
resourceDesc.res.array.array = cArray;
cudaTextureDesc textureDesc{};
textureDesc.normalizedCoords = true;
textureDesc.filterMode = cudaFilterModePoint;
textureDesc.addressMode[0] = cudaAddressModeWrap;
textureDesc.addressMode[1] = cudaAddressModeWrap;
textureDesc.readMode = cudaReadModeElementType;
cudaTextureObject_t textureObject{};
checkCudaErrors(cudaCreateTextureObject(&textureObject, &resourceDesc, &textureDesc, nullptr));
dim3 dimBlock(8u, 8u, 1u);
dim3 dimGrid(textureWidth / dimBlock.x, textureHeight / dimBlock.y, 1u);
uint8_t* dOutput{ nullptr };
checkCudaErrors(cudaMalloc((void**)&dOutput, bytesPerScanline * textureHeight * sizeof(uint8_t)));
texCheck << < dimGrid, dimBlock >> > (textureWidth, textureHeight, dOutput, textureObject);
checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaDeviceSynchronize());
uint8_t* hOutput = new uint8_t[bytesPerScanline * textureHeight];
checkCudaErrors(cudaMemcpy(hOutput, dOutput, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyDeviceToHost));
uint8_t *my_hOutput = hOutput;
uint8_t *my_pHostData = pHostData;
for (uint64_t idx = 0ull; idx < textureHeight; idx++) {
for (uint64_t jdx = 0ull; jdx < bytesPerScanline; jdx++) {
if (hOutput[jdx] != pHostData[jdx]){
std::cerr << "Mismatch # " << idx << " " << jdx << " Expected " << (uint32_t)pHostData[jdx] << " Received " << (uint32_t)hOutput[jdx] << std::endl;
return;}
}
hOutput += bytesPerScanline;
pHostData += bytesPerScanline;
}
checkCudaErrors(cudaDestroyTextureObject(textureObject));
checkCudaErrors(cudaFree(dOutput));
checkCudaErrors(cudaFreeArray(cArray));
delete[] my_hOutput;
delete[] my_pHostData;
}
int main() {
cudaTex();
return 0;
}
$ nvcc -o t7 t7.cu -std=c++11
t7.cu: In function ‘void cudaTex()’:
t7.cu:56:12: warning: ‘cudaError_t cudaMemcpyToArray(cudaArray_t, size_t, size_t, const void*, size_t, cudaMemcpyKind)’ is deprecated (declared at /usr/local/cuda/bin/../targets/x86_64-linux/include/cuda_runtime_api.h:6782) [-Wdeprecated-declarations]
checkCudaErrors(cudaMemcpyToArray(cArray, 0, 0, pHostData, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyHostToDevice));
^
t7.cu:56:131: warning: ‘cudaError_t cudaMemcpyToArray(cudaArray_t, size_t, size_t, const void*, size_t, cudaMemcpyKind)’ is deprecated (declared at /usr/local/cuda/bin/../targets/x86_64-linux/include/cuda_runtime_api.h:6782) [-Wdeprecated-declarations]
checkCudaErrors(cudaMemcpyToArray(cArray, 0, 0, pHostData, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyHostToDevice));
^
$ cuda-memcheck ./t7
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$
I'm not suggesting the above code is defect-free. It's mostly your code. I'm just pointing out some things I found. You can read about how to address the deprecation warning here.
The following program tests the dense to sparse conversion using cuSPARSE. It produces garbage in the first several lines of output. But if I move the lines marked with (2) to the place after the lines marked with (1), the program works fine. Can someone tell me what could be the reason?
EDIT:
To make the presentation clearer, I rewrote the program with thrust, the same issue persists.
EDIT:
As suggested by Robert, I changed it back to the version without thrust and added api level error check code.
#include <iostream>
#include <cusparse_v2.h>
using std::cerr;
using std::cout;
using std::endl;
#define WRAP(x) do {x} while (0)
#define CHKcusparse(x) WRAP( \
cusparseStatus_t err = (x); \
if (err != CUSPARSE_STATUS_SUCCESS) { \
cerr << "Cusparse Error #" << int(err) << "\"TODO\" at Line " \
<< __LINE__ << " of " << __FILE__ << ": " << #x << endl; \
exit(1); \
} \
)
#define CHKcuda(x) WRAP( \
cudaError_t err = (x); \
if (err != cudaSuccess) { \
cerr << "Cuda Error #" << int(err) << ", \"" \
<< cudaGetErrorString(err) << "\" at Line " << __LINE__ \
<< " of " << __FILE__ << ": " << #x << endl; \
exit(1); \
} \
)
#define ALLOC(X, T, N) do { \
h##X = (T*) malloc(sizeof(T) * (N)); \
CHKcuda(cudaMalloc((void**)&d##X, sizeof(T) * (N))); \
} while(0)
int main() {
srand(100);
cusparseHandle_t g_cusparse_handle;
CHKcusparse(cusparseCreate(&g_cusparse_handle));
const int n = 100, in_degree = 10;
int nnz = n * in_degree, nn = n * n;
int *dnnz, *dridx, *dcols;
int *hnnz, *hridx, *hcols;
float *dvals, *dmat;
float *hvals, *hmat;
// (1) The number of non-zeros in each column.
ALLOC(nnz, int, n);
// The dense matrix.
ALLOC(mat, float, nn);
// The values in sparse matrix.
ALLOC(vals, float, nnz);
// (2) The row indices of the sparse matrix.
ALLOC(ridx, int, nnz);
// The column offsets of the sparse matrix.
ALLOC(cols, int, n+1);
// Fill and copy dense matrix and number of non-zeros.
for (int i = 0; i < nn; i++) {hmat[i] = rand();}
for (int i = 0; i < n; i++) {hnnz[i] = in_degree;}
CHKcuda(cudaMemcpyAsync(dnnz, hnnz, sizeof(int) * n, cudaMemcpyHostToDevice));
CHKcuda(cudaMemcpyAsync(dmat, hmat, sizeof(float) * nn, cudaMemcpyHostToDevice));
CHKcuda(cudaDeviceSynchronize());
// Perform dense to CSC format
cusparseMatDescr_t cspMatDesc;
CHKcusparse(cusparseCreateMatDescr(&cspMatDesc));
CHKcusparse(cusparseSdense2csc(
g_cusparse_handle, n, n, cspMatDesc, dmat, n,
dnnz, dvals, dridx, dcols
));
// Copy row indices back.
CHKcuda(cudaMemcpyAsync(hridx, dridx, sizeof(int) * nnz, cudaMemcpyDeviceToHost));
CHKcuda(cudaDeviceSynchronize());
CHKcusparse(cusparseDestroyMatDescr(cspMatDesc));
// Display row indices.
for (int i = 0; i < n; i++) {
for (int j = 0; j < in_degree; j++) {
std::cout << hridx[i * in_degree + j] << ", ";
}
std::cout << std::endl;
}
CHKcuda(cudaFree(dnnz));
CHKcuda(cudaFree(dvals));
CHKcuda(cudaFree(dridx));
CHKcuda(cudaFree(dcols));
CHKcuda(cudaFree(dmat));
free(hnnz);
free(hmat);
free(hvals);
free(hridx);
free(hcols);
return 0;
}
The basic problem is that you are passing internally inconsistent data to the dense-to-sparse routine. You are passing a dense matrix which has 100 non-zero elements per column, but you are telling cusparse that there are only 10 non-zero elements per column.
If you run your code with cuda-memcheck, you will see that there are errors coming out of cusparse.
For this code, you can fix the issue by changing your in_degree variable to 100.
For the general case, cusparse provides a convenient routine to populate the number of non-zero elements per column correctly.
As already underlined by Robert Crovella, passing from dense to sparse can be effectively performed using cuSPARSE by the cusparse<t>nnz() and cusparse<t>dense2csr() routines. The vice versa can be done by the cusparse<t>csr2dense() routine. Below, there is a fully worked out example showing how passing from dense to sparse and vice versa using cuSPARSE in CSR format.
cuSparseUtilities.cuh
#ifndef CUSPARSEUTILITIES_CUH
#define CUSPARSEUTILITIES_CUH
#include "cusparse_v2.h"
void setUpDescriptor(cusparseMatDescr_t &, cusparseMatrixType_t, cusparseIndexBase_t);
void dense2SparseD(const double * __restrict__ d_A_dense, int **d_nnzPerVector, double **d_A,
int **d_A_RowIndices, int **d_A_ColIndices, int &nnz, cusparseMatDescr_t descrA,
const cusparseHandle_t handle, const int Nrows, const int Ncols);
#endif
cuSparseUtilities.cu
#include "cuSparseUtilities.cuh"
#include "Utilities.cuh"
/*****************************/
/* SETUP DESCRIPTOR FUNCTION */
/*****************************/
void setUpDescriptor(cusparseMatDescr_t &descrA, cusparseMatrixType_t matrixType, cusparseIndexBase_t indexBase) {
cusparseSafeCall(cusparseCreateMatDescr(&descrA));
cusparseSafeCall(cusparseSetMatType(descrA, matrixType));
cusparseSafeCall(cusparseSetMatIndexBase(descrA, indexBase));
}
/********************************************************/
/* DENSE TO SPARSE CONVERSION FOR REAL DOUBLE PRECISION */
/********************************************************/
void dense2SparseD(const double * __restrict__ d_A_dense, int **d_nnzPerVector, double **d_A,
int **d_A_RowIndices, int **d_A_ColIndices, int &nnz, cusparseMatDescr_t descrA,
const cusparseHandle_t handle, const int Nrows, const int Ncols) {
const int lda = Nrows; // --- Leading dimension of dense matrix
gpuErrchk(cudaMalloc(&d_nnzPerVector[0], Nrows * sizeof(int)));
// --- Compute the number of nonzero elements per row and the total number of nonzero elements in the dense d_A_dense
cusparseSafeCall(cusparseDnnz(handle, CUSPARSE_DIRECTION_ROW, Nrows, Ncols, descrA, d_A_dense, lda, d_nnzPerVector[0], &nnz));
// --- Device side sparse matrix
gpuErrchk(cudaMalloc(&d_A[0], nnz * sizeof(double)));
gpuErrchk(cudaMalloc(&d_A_RowIndices[0], (Nrows + 1) * sizeof(int)));
gpuErrchk(cudaMalloc(&d_A_ColIndices[0], nnz * sizeof(int)));
cusparseSafeCall(cusparseDdense2csr(handle, Nrows, Ncols, descrA, d_A_dense, lda, d_nnzPerVector[0], d_A[0], d_A_RowIndices[0], d_A_ColIndices[0]));
}
kernel.cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <cusparse_v2.h>
#include "cuSparseUtilities.cuh"
#include "Utilities.cuh"
/********/
/* MAIN */
/********/
int main() {
cusparseHandle_t handle;
// --- Initialize cuSPARSE
cusparseSafeCall(cusparseCreate(&handle));
cusparseMatDescr_t descrA = 0;
/**************************/
/* SETTING UP THE PROBLEM */
/**************************/
const int Nrows = 5; // --- Number of rows
const int Ncols = 4; // --- Number of columns
const int N = Nrows;
// --- Host side dense matrix
double *h_A_dense = (double*)malloc(Nrows * Ncols * sizeof(*h_A_dense));
// --- Column-major storage
h_A_dense[ 0] = 0.4612f; h_A_dense[ 5] = -0.0006f; h_A_dense[10] = 1.3f; h_A_dense[15] = 0.0f;
h_A_dense[ 1] = 0.0f; h_A_dense[ 6] = 1.443f; h_A_dense[11] = 0.0f; h_A_dense[16] = 0.0f;
h_A_dense[ 2] = -0.0006f; h_A_dense[ 7] = 0.4640f; h_A_dense[12] = 0.0723f; h_A_dense[17] = 0.0f;
h_A_dense[ 3] = 0.3566f; h_A_dense[ 8] = 0.0723f; h_A_dense[13] = 0.7543f; h_A_dense[18] = 0.0f;
h_A_dense[ 4] = 0.f; h_A_dense[ 9] = 0.0f; h_A_dense[14] = 0.0f; h_A_dense[19] = 0.1f;
// --- Create device array and copy host array to it
double *d_A_dense; gpuErrchk(cudaMalloc(&d_A_dense, Nrows * Ncols * sizeof(double)));
gpuErrchk(cudaMemcpy(d_A_dense, h_A_dense, Nrows * Ncols * sizeof(*d_A_dense), cudaMemcpyHostToDevice));
/*******************************/
/* FROM DENSE TO SPARSE MATRIX */
/*******************************/
// --- Descriptor for sparse matrix A
setUpDescriptor(descrA, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_INDEX_BASE_ONE);
int nnz = 0; // --- Number of nonzero elements in dense matrix
int *d_nnzPerVector; // --- Device side number of nonzero elements per row
double *d_A; // --- Sparse matrix values - array of size nnz
int *d_A_RowIndices; // --- "Row indices"
int *d_A_ColIndices; // --- "Column indices"
dense2SparseD(d_A_dense, &d_nnzPerVector, &d_A, &d_A_RowIndices, &d_A_ColIndices, nnz, descrA, handle, Nrows, Ncols);
/*******************************************************/
/* CHECKING THE RESULTS FOR DENSE TO SPARSE CONVERSION */
/*******************************************************/
// --- Host side number of nonzero elements per row
int *h_nnzPerVector = (int *)malloc(Nrows * sizeof(int));
gpuErrchk(cudaMemcpy(h_nnzPerVector, d_nnzPerVector, Nrows * sizeof(int), cudaMemcpyDeviceToHost));
printf("Number of nonzero elements in dense matrix = %i\n\n", nnz);
for (int i = 0; i < Nrows; ++i) printf("Number of nonzero elements in row %i = %i \n", i, h_nnzPerVector[i]);
printf("\n");
// --- Host side sparse matrix
double *h_A = (double *)malloc(nnz * sizeof(double));
int *h_A_RowIndices = (int *)malloc((Nrows + 1) * sizeof(int));
int *h_A_ColIndices = (int *)malloc(nnz * sizeof(int));
gpuErrchk(cudaMemcpy(h_A, d_A, nnz * sizeof(double), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_A_RowIndices, d_A_RowIndices, (Nrows + 1) * sizeof(int), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_A_ColIndices, d_A_ColIndices, nnz * sizeof(int), cudaMemcpyDeviceToHost));
printf("\nOriginal matrix in CSR format\n\n");
for (int i = 0; i < nnz; ++i) printf("A[%i] = %f\n", i, h_A[i]); printf("\n");
printf("\n");
for (int i = 0; i < (Nrows + 1); ++i) printf("h_A_RowIndices[%i] = %i \n", i, h_A_RowIndices[i]); printf("\n");
for (int i = 0; i < nnz; ++i) printf("h_A_ColIndices[%i] = %i \n", i, h_A_ColIndices[i]);
/*******************************/
/* FROM SPARSE TO DENSE MATRIX */
/*******************************/
double *d_A_denseReconstructed; gpuErrchk(cudaMalloc(&d_A_denseReconstructed, Nrows * Ncols * sizeof(double)));
cusparseSafeCall(cusparseDcsr2dense(handle, Nrows, Ncols, descrA, d_A, d_A_RowIndices, d_A_ColIndices,
d_A_denseReconstructed, Nrows));
/*******************************************************/
/* CHECKING THE RESULTS FOR SPARSE TO DENSE CONVERSION */
/*******************************************************/
double *h_A_denseReconstructed = (double *)malloc(Nrows * Ncols * sizeof(double));
gpuErrchk(cudaMemcpy(h_A_denseReconstructed, d_A_denseReconstructed, Nrows * Ncols * sizeof(double), cudaMemcpyDeviceToHost));
printf("\nReconstructed dense matrix \n");
for (int m = 0; m < Nrows; m++) {
for (int n = 0; n < Ncols; n++)
printf("%f\t", h_A_denseReconstructed[n * Nrows + m]);
printf("\n");
}
return 0;
}
I want to calculate the average of the values over the whole image in Cuda. To test how reduction in 2D array work, I write this kernel below. The final output o should be the sum of all the image values. The input g is a 2D array with value 1 in every pixel. But the result of this program is 0 as the sum. A bit weird to me.
I imitate the reduction in 1D array in this tutorial http://developer.download.nvidia.com/compute/cuda/1.1-Beta/x86_website/projects/reduction/doc/reduction.pdf I write this 2D form. I am new to Cuda. And suggestions to potential bugs and improvement are welcomed!
Just add one comment. I know it makes sense just to calculate the average in 1D array. But I want to exploit more and test more complicated reduction behaviours. It might not be right. But just a test. Hope anyone can give me suggestions more about reduction common practices.
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
cudaEvent_t start, stop;
float elapsedTime;
__global__ void
reduce(float *g, float *o, const int dimx, const int dimy)
{
extern __shared__ float sdata[];
unsigned int tid_x = threadIdx.x;
unsigned int tid_y = threadIdx.y;
unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
unsigned int j = blockDim.y * blockIdx.y + threadIdx.y;
if (i >= dimx || j >= dimy)
return;
sdata[tid_x*blockDim.y + tid_y] = g[i*dimy + j];
__syncthreads();
for(unsigned int s_y = blockDim.y/2; s_y > 0; s_y >>= 1)
{
if (tid_y < s_y)
{
sdata[tid_x * dimy + tid_y] += sdata[tid_x * dimy + tid_y + s_y];
}
__syncthreads();
}
for(unsigned int s_x = blockDim.x/2; s_x > 0; s_x >>= 1 )
{
if(tid_x < s_x)
{
sdata[tid_x * dimy] += sdata[(tid_x + s_x) * dimy];
}
__syncthreads();
}
float sum;
if( tid_x == 0 && tid_y == 0)
{
sum = sdata[0];
atomicAdd (o, sum); // The result should be the sum of all pixel values. But the program produce 0
}
//if(tid_x==0 && tid__y == 0 )
//o[blockIdx.x] = sdata[0];
}
int
main()
{
int dimx = 320;
int dimy = 160;
int num_bytes = dimx*dimy*sizeof(float);
float *d_a, *h_a, // device and host pointers
*d_o=0, *h_o=0;
h_a = (float*)malloc(num_bytes);
h_o = (float*)malloc(sizeof(float));
srand(time(NULL));
for (int i=0; i < dimx; i++)
{
for (int j=0; j < dimy; j++)
{
h_a[i*dimy + j] = 1;
}
}
cudaMalloc( (void**)&d_a, num_bytes );
cudaMalloc( (void**)&d_o, sizeof(int) );
cudaMemcpy( d_a, h_a, num_bytes, cudaMemcpyHostToDevice);
cudaMemcpy( d_o, h_o, sizeof(int), cudaMemcpyHostToDevice);
dim3 grid, block;
block.x = 4;
block.y = 4;
grid.x = dimx / block.x;
grid.y = dimy / block.y;
cudaEventCreate(&start);
cudaEventRecord(start, 0);
int sizeofSharedMemory = dimx*dimy*sizeof(float);
reduce<<<grid, block, sizeofSharedMemory>>> (d_a, d_o, block.x, block.y);
cudaEventCreate(&stop);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
std::cout << "This kernel runs: " << elapsedTime << "ms" << std::endl;
std::cout << block.x << " " << block.y << std::endl;
std::cout << grid.x << " " << grid.y << std::endl;
std::cout << dimx << " " << dimy << " " << dimx*dimy << std::endl;
cudaMemcpy( h_a, d_a, num_bytes, cudaMemcpyDeviceToHost );
cudaMemcpy( h_o, d_o, sizeof(int), cudaMemcpyDeviceToHost );
std::cout << "The sum is:" << *h_o << std::endl;
free(h_a);
free(h_o);
cudaFree(d_a);
cudaFree(d_o);
}
If you do basic cuda error checking you will discover that your reduce kernel is not even running. The reason is as follows:
int dimx = 320;
int dimy = 160;
...
int sizeofSharedMemory = dimx*dimy*sizeof(float); // = 204800
reduce<<<grid, block, sizeofSharedMemory>>> (d_a, d_o, block.x, block.y);
^
|
204800 is illegal here
You cannot request 204800 bytes of shared memory dynamically (or any other way). The maximum is slightly less than 48K bytes.
If you had done proper cuda error checking, you would discover your kernel is not running and would have gotten an instructive error message which suggests the launch configuration (the numbers between the <<< ... >>> ) is invalid. Shared memory is requested on a per-block basis, and it's probably not sensible that you need to request enough shared memory to cover your entire 2D data set, when each block only consists of a 4x4 thread array. You probably just need enough data for what will be accessed by each 4x4 thread array.
After you have properly instrumented your code with cuda error checking, and detected and corrected all the errors, then run your code with cuda-memcheck. This will do an additional level of error checking to point out any kernel access errors. You may also use cuda-memcheck if you are getting an unspecified launch failure, and it may help pinpoint the issue.
After you have done these basic trouble shooting steps, then it might make sense to ask others for help. But use the power of the tools you have been given first.
I also want to point out one other error before you come back and post this code again, asking for help.
This will not be useful:
std::cout << "The sum is:" << *h_o << std::endl;
cudaMemcpy( h_a, d_a, num_bytes, cudaMemcpyDeviceToHost );
cudaMemcpy( h_o, d_o, sizeof(int), cudaMemcpyDeviceToHost );
You are printing out the sum before you have copied the sum from the device to the host.
Reverse the order of these steps:
cudaMemcpy( h_a, d_a, num_bytes, cudaMemcpyDeviceToHost );
cudaMemcpy( h_o, d_o, sizeof(int), cudaMemcpyDeviceToHost );
std::cout << "The sum is:" << *h_o << std::endl;
Wrote my first program using CUDA+CUBLAS. It just uses a 'cublasDgemm' function and computes a product of 2 N*N matrices.
However, all the time I was launching my program, it keeped producing the same wrong answer (e.g. when multiplying 1*1 matrix containing 5 as a single element by 1*1 matrix containing element 6, it always said the result is 36, not 30).
I checked the program several times with no success. But, when I came back to it the nexy day (i.e. after reboot), it worked just fine. I don't remember whether I recompiled it or not, but the truth is that it is the same VS project, same code, same computer with its GPU.
So, can anyone explain me why could that have happened? And do I have to expect same strange behaviour further?
Here is the code I was launching:
#include <iostream>
#include <string>
#include <iomanip>
#include <cuda_runtime.h>
#include <cublas_v2.h>
const int N = 5;
#define IDX2F(i,j) ((i) * N + j)
void fail(const cudaError_t& cudaStatus, const std::string& errorMessage) {
if (cudaStatus != cudaSuccess) {
std::cerr << errorMessage << std::endl;
exit(EXIT_FAILURE);
}
}
void fail(const cublasStatus_t& status, const std::string& errorMessage) {
if (status != CUBLAS_STATUS_SUCCESS) {
std::cerr << errorMessage << std::endl;
exit(EXIT_FAILURE);
}
}
void printMatrix(const double *C) {
for (int i=0; i<N; i++) {
for (int j=0; j<N; j++) {
std::cout << std::fixed << std::setprecision(2) << C[IDX2F(i,j)] << ' ';
}
std::cout << std::endl;
}
std::cout << std::endl;
}
int main(int argc, char **argv) {
cudaError_t cudaStatus;
cublasStatus_t status;
cublasHandle_t handle;
double *A = new double[N*N];
double *devPtrA;
double *B = new double[N*N];
double *devPtrB;
double *C = new double[N*N];
double *devPtrC;
for (int i=0; i<N; i++)
for (int j=0; j<N; j++)
A[IDX2F(i,j)] = i + j;
for (int i=0; i<N; i++)
for (int j=0; j<N; j++)
B[IDX2F(i,j)] = i + j * 0.5;
// do not have to set anything into matrix C, because beta = 0
// allocate mamory on GPU
cudaStatus = cudaMalloc((void**)&devPtrC, N*N*sizeof(*C));
fail(cudaStatus, "device memory allocation failed");
cudaStatus = cudaMalloc((void**)&devPtrA, N*N*sizeof(*A));
fail(cudaStatus, "device memory allocation failed");
cudaStatus = cudaMalloc((void**)&devPtrB, N*N*sizeof(*B));
fail(cudaStatus, "device memory allocation failed");
// create GPU handle
status = cublasCreate(&handle);
fail(status, "CUBLAS initialization failed");
// copying matrices from host to GPU
status = cublasSetMatrix(N, N, sizeof (*B), B, N, devPtrB, N);
fail(status, "failed to load data from host to GPU");
status = cublasSetMatrix(N, N, sizeof (*A), A, N, devPtrA, N);
fail(status, "failed to load data from host to GPU");
const double ONE = 1;
const double ZERO = 0;
printMatrix(A);
printMatrix(B);
status = cublasDgemm( handle,
CUBLAS_OP_N, CUBLAS_OP_N,
N, N, N,
&ONE,
devPtrA, N,
devPtrB, N,
&ZERO,
devPtrC, N);
fail(status, "error cublasDgemm");
status = cublasGetMatrix(N, N, sizeof (*C), devPtrC, N, C, N);
fail(status, "could not load result back from GPU to host");
printMatrix(C);
status = cublasDestroy(handle);
fail(status, "could not destroy CUBLAS handle");
cudaStatus = cudaFree(devPtrC);
fail(cudaStatus, "device memory freeing failed");
cudaStatus = cudaFree(devPtrB);
fail(cudaStatus, "device memory freeing failed");
cudaStatus = cudaFree(devPtrA);
fail(cudaStatus, "device memory freeing failed");
delete[] C;
delete[] B;
delete[] A;
return EXIT_SUCCESS;
}
op(B) must be CUBLAS_OP_T
.
.
status = cublasDgemm( handle,
CUBLAS_OP_N, CUBLAS_OP_T,
N, N, N,
&ONE,
devPtrA, N,
devPtrB, N,
&ZERO,
devPtrC, N);
.
.
.
.
definition is : C = α op ( A ) op ( B ) + β C
http://docs.nvidia.com/cuda/cublas/index.html#topic_8_1