Batched FFTs using cufftPlanMany - CUDA

I want to perform 441 2D, 32-by-32 FFTs using the batched method provided by the cuFFT library. The parameters of the transform are the following:
int n[2] = {32,32};
int inembed[] = {32,32};
int onembed[] = {32,32/2+1};
cufftPlanMany(&plan,2,n,inembed,1,32*32,onembed,1,32*(32/2+1),CUFFT_D2Z,441);
cufftPlanMany(&inverse_plan,2,n,onembed,1,32*32,inembed,1,32*32,CUFFT_Z2D,441);
After I did the forward and inverse FFTs using the above plans, I could not get the original data back.
Can anyone advise me how to set the parameters correctly for cufftPlanMany? Many thanks in advance.
By the way, is using cufftPlanMany the best approach for my situation?

Here is a full example of how to use cufftPlanMany to perform batched direct and inverse transformations in CUDA. The example performs float-to-cufftComplex transformations and back. The final result of the direct+inverse transformation is correct up to a multiplicative constant equal to the overall number of matrix elements nRows*nCols, since cuFFT does not normalize the inverse transform.
#include <stdio.h>
#include <stdlib.h>
#include <cufft.h>
#include <assert.h>
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) { getchar(); exit(code); }
}
}
/*********************/
/* CUFFT ERROR CHECK */
/*********************/
static const char *_cudaGetErrorEnum(cufftResult error)
{
switch (error)
{
case CUFFT_SUCCESS:
return "CUFFT_SUCCESS";
case CUFFT_INVALID_PLAN:
return "CUFFT_INVALID_PLAN";
case CUFFT_ALLOC_FAILED:
return "CUFFT_ALLOC_FAILED";
case CUFFT_INVALID_TYPE:
return "CUFFT_INVALID_TYPE";
case CUFFT_INVALID_VALUE:
return "CUFFT_INVALID_VALUE";
case CUFFT_INTERNAL_ERROR:
return "CUFFT_INTERNAL_ERROR";
case CUFFT_EXEC_FAILED:
return "CUFFT_EXEC_FAILED";
case CUFFT_SETUP_FAILED:
return "CUFFT_SETUP_FAILED";
case CUFFT_INVALID_SIZE:
return "CUFFT_INVALID_SIZE";
case CUFFT_UNALIGNED_DATA:
return "CUFFT_UNALIGNED_DATA";
}
return "<unknown>";
}
#define cufftSafeCall(err) __cufftSafeCall(err, __FILE__, __LINE__)
inline void __cufftSafeCall(cufftResult err, const char *file, const int line)
{
if( CUFFT_SUCCESS != err) {
fprintf(stderr, "CUFFT error in file '%s', line %d\n %s\nerror %d: %s\nterminating!\n",__FILE__, __LINE__,err, \
_cudaGetErrorEnum(err)); \
cudaDeviceReset(); assert(0); \
}
}
/********/
/* MAIN */
/********/
int main() {
cufftHandle forward_plan, inverse_plan;
int batch = 3;
int rank = 2;
int nRows = 5;
int nCols = 5;
int n[2] = {nRows, nCols};
int idist = nRows*nCols;
int odist = nRows*(nCols/2+1);
int inembed[] = {nRows, nCols};
int onembed[] = {nRows, nCols/2+1};
int istride = 1;
int ostride = 1;
cufftSafeCall(cufftPlanMany(&forward_plan, rank, n, inembed, istride, idist, onembed, ostride, odist, CUFFT_R2C, batch));
float *h_in = (float*)malloc(sizeof(float)*nRows*nCols*batch);
for(int i=0; i<nRows*nCols*batch; i++) h_in[i] = 1.f;
float2* h_freq = (float2*)malloc(sizeof(float2)*nRows*(nCols/2+1)*batch);
float* d_in; gpuErrchk(cudaMalloc(&d_in, sizeof(float)*nRows*nCols*batch));
float2* d_freq; gpuErrchk(cudaMalloc(&d_freq, sizeof(float2)*nRows*(nCols/2+1)*batch));
gpuErrchk(cudaMemcpy(d_in,h_in,sizeof(float)*nRows*nCols*batch,cudaMemcpyHostToDevice));
cufftSafeCall(cufftExecR2C(forward_plan, d_in, d_freq));
gpuErrchk(cudaMemcpy(h_freq,d_freq,sizeof(float2)*nRows*(nCols/2+1)*batch,cudaMemcpyDeviceToHost));
for(int i=0; i<nRows*(nCols/2+1)*batch; i++) printf("Direct transform: %i %f %f\n",i,h_freq[i].x,h_freq[i].y);
cufftSafeCall(cufftPlanMany(&inverse_plan, rank, n, onembed, ostride, odist, inembed, istride, idist, CUFFT_C2R, batch));
cufftSafeCall(cufftExecC2R(inverse_plan, d_freq, d_in));
gpuErrchk(cudaMemcpy(h_in,d_in,sizeof(float)*nRows*nCols*batch,cudaMemcpyDeviceToHost));
for(int i=0; i<nRows*nCols*batch; i++) printf("Inverse transform: %i %f \n",i,h_in[i]);
getchar();
}
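For completeness, the same pattern can be mapped onto the question's 441 batched 32x32 double-precision transforms. The sketch below is not part of the original code; the helper roundTrip and the normalize kernel are illustrative names, and cufftSafeCall is the macro defined above. The two points to notice are the 32*(32/2+1) distance on the complex side of both plans, and the final division by 32*32, since cuFFT leaves the inverse transform unnormalized.

// Sketch (assumptions as stated above): d_data holds 441 contiguous 32x32 real matrices,
// d_freq the corresponding complex spectra.
__global__ void normalize(double *data, int nElems, double scale)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < nElems) data[i] *= scale;
}

void roundTrip(double *d_data, cufftDoubleComplex *d_freq)
{
    cufftHandle plan, inverse_plan;
    int n[2]      = {32, 32};
    int inembed[] = {32, 32};
    int onembed[] = {32, 32/2 + 1};
    int idist     = 32 * 32;            // distance between consecutive real signals
    int odist     = 32 * (32/2 + 1);    // distance between consecutive complex signals
    // Note the complex-side distance odist on the output of the D2Z plan and on the input of the Z2D plan.
    cufftSafeCall(cufftPlanMany(&plan,         2, n, inembed, 1, idist, onembed, 1, odist, CUFFT_D2Z, 441));
    cufftSafeCall(cufftPlanMany(&inverse_plan, 2, n, onembed, 1, odist, inembed, 1, idist, CUFFT_Z2D, 441));
    cufftSafeCall(cufftExecD2Z(plan, d_data, d_freq));
    cufftSafeCall(cufftExecZ2D(inverse_plan, d_freq, d_data));
    // cuFFT does not normalize, so divide by the number of elements per transform.
    int nElems = 32 * 32 * 441;
    normalize<<<(nElems + 255) / 256, 256>>>(d_data, nElems, 1.0 / (32 * 32));
}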

Related

cudaMallocManaged (unified memory) with cuBLAS

I am trying to use Unified Memory with cudaMallocManaged() with the cuBLAS library. I am performing a simple matrix-vector multiplication as an example and storing the result in an array results. However, when printing the results array, I get back all 0's instead of the result of multiplying the matrix mat by the vector vec.
The flow I am using is:
Allocating memory with cudaMallocManaged()
Initializing the arrays with data
Creating the cuBLAS handle
Calling cublasDgemv to perform the multiplication, storing the results in results
When using new and then cublasSetMatrix() or cublasSetVector() this works fine.
How do I use Unified Memory with cuBLAS?
Here are minimum working examples:
Unified Memory Attempt (this gives back all 0's in results):
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <ctime>
#include "cublas_v2.h"
#define cudaErrChk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
static const char *cublasErrChk(cublasStatus_t error)
{
switch (error)
{
case CUBLAS_STATUS_SUCCESS:
return "CUBLAS_STATUS_SUCCESS";
case CUBLAS_STATUS_NOT_INITIALIZED:
return "CUBLAS_STATUS_NOT_INITIALIZED";
case CUBLAS_STATUS_ALLOC_FAILED:
return "CUBLAS_STATUS_ALLOC_FAILED";
case CUBLAS_STATUS_INVALID_VALUE:
return "CUBLAS_STATUS_INVALID_VALUE";
case CUBLAS_STATUS_ARCH_MISMATCH:
return "CUBLAS_STATUS_ARCH_MISMATCH";
case CUBLAS_STATUS_MAPPING_ERROR:
return "CUBLAS_STATUS_MAPPING_ERROR";
case CUBLAS_STATUS_EXECUTION_FAILED:
return "CUBLAS_STATUS_EXECUTION_FAILED";
case CUBLAS_STATUS_INTERNAL_ERROR:
return "CUBLAS_STATUS_INTERNAL_ERROR";
}
return "<unknown>";
}
int main() {
size_t dims = 4;
double *vec, *mat, *results;
cudaErrChk( cudaMallocManaged(&vec, dims * sizeof(double)) );
cudaErrChk( cudaMallocManaged(&mat, dims * dims * sizeof(double)) );
cudaErrChk( cudaMallocManaged(&results, dims * sizeof(double)) );
printf("Vector:\n");
for (int i = 1; i < dims + 1; i++) {
vec[i] = 0.5 * i;
printf("%.2lf ", vec[i]);
}
printf("\n\nMatrix:\n");
for (int i = 1; i < dims * dims + 1; i++) {
mat[i] = 1.0 * i;
printf("%.2lf ", mat[i]);
if (i % dims == 0)
printf("\n");
}
printf("\n");
cublasHandle_t handle;
cublasErrChk( cublasCreate(&handle) );
double alpha = 1.f, beta = 1.f;
// multiply mat by vec to get results
cublasErrChk(
cublasDgemv(
handle, CUBLAS_OP_N,
dims, dims,
&alpha,
mat, dims,
vec, 1,
&beta,
results, 1
)
);
for (int i = 0; i < dims; i++)
printf("%.2lf ", results[i]);
printf("\n");
cudaErrChk( cudaFree(vec) );
cudaErrChk( cudaFree(mat) );
cudaErrChk( cudaFree(results) );
return 0;
}
Regular malloc/setMatrix() Attempt:
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <ctime>
#include "cublas_v2.h"
#define cudaErrChk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
static const char *cublasErrChk(cublasStatus_t error)
{
switch (error)
{
case CUBLAS_STATUS_SUCCESS:
return "CUBLAS_STATUS_SUCCESS";
case CUBLAS_STATUS_NOT_INITIALIZED:
return "CUBLAS_STATUS_NOT_INITIALIZED";
case CUBLAS_STATUS_ALLOC_FAILED:
return "CUBLAS_STATUS_ALLOC_FAILED";
case CUBLAS_STATUS_INVALID_VALUE:
return "CUBLAS_STATUS_INVALID_VALUE";
case CUBLAS_STATUS_ARCH_MISMATCH:
return "CUBLAS_STATUS_ARCH_MISMATCH";
case CUBLAS_STATUS_MAPPING_ERROR:
return "CUBLAS_STATUS_MAPPING_ERROR";
case CUBLAS_STATUS_EXECUTION_FAILED:
return "CUBLAS_STATUS_EXECUTION_FAILED";
case CUBLAS_STATUS_INTERNAL_ERROR:
return "CUBLAS_STATUS_INTERNAL_ERROR";
}
return "<unknown>";
}
int main() {
size_t dims = 4;
double *h_vec, *h_mat, *h_results;
h_vec = new double[dims];
h_mat = new double[dims * dims];
h_results = new double[dims];
printf("Vector:\n");
for (int i = 1; i < dims + 1; i++) {
h_vec[i] = 0.5 * i;
printf("%.2lf ", h_vec[i]);
}
printf("\n\nMatrix:\n");
for (int i = 1; i < dims * dims + 1; i++) {
h_mat[i] = 1.0 * i;
printf("%.2lf ", h_mat[i]);
if (i % dims == 0)
printf("\n");
}
printf("\n");
double *d_vec, *d_mat, *d_results;
cudaErrChk( cudaMalloc(&d_vec, dims * sizeof(double)) );
cudaErrChk( cudaMalloc(&d_mat, dims * dims * sizeof(double)) );
cudaErrChk( cudaMalloc(&d_results, dims * sizeof(double)) );
cublasHandle_t handle;
cublasErrChk( cublasCreate(&handle) );
// copy the data manually to the GPUs
cublasErrChk( cublasSetVector(dims, sizeof(*d_vec), h_vec, 1, d_vec, 1) );
cublasErrChk( cublasSetMatrix(dims, dims, sizeof(double), h_mat, dims, d_mat, dims) );
double alpha = 1.f, beta = 1.f;
// // multiply mat by vec to get results
cublasErrChk(
cublasDgemv(
handle, CUBLAS_OP_N,
dims, dims,
&alpha,
d_mat, dims,
d_vec, 1,
&beta,
d_results, 1
)
);
cublasErrChk( cublasGetVector(dims, sizeof(*h_results), d_results, 1, h_results, 1) );
for (int i = 0; i < dims; i++)
printf("%.2lf ", h_results[i]);
printf("\n");
cudaErrChk( cudaFree(d_vec) );
cudaErrChk( cudaFree(d_mat) );
cudaErrChk( cudaFree(d_results) );
delete [] h_vec;
delete [] h_mat;
delete [] h_results;
return 0;
}
Compile with
nvcc -o main main.cu -lcublas
As @talonmies pointed out, the problem was that cublasDgemv() is an asynchronous call, so the host was reading the results before the GPU had finished computing them. This is fixed by adding cudaDeviceSynchronize() after the cublasDgemv() call:
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <ctime>
#include "cublas_v2.h"
#define cudaErrChk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
static const char *cublasErrChk(cublasStatus_t error)
{
switch (error)
{
case CUBLAS_STATUS_SUCCESS:
return "CUBLAS_STATUS_SUCCESS";
case CUBLAS_STATUS_NOT_INITIALIZED:
return "CUBLAS_STATUS_NOT_INITIALIZED";
case CUBLAS_STATUS_ALLOC_FAILED:
return "CUBLAS_STATUS_ALLOC_FAILED";
case CUBLAS_STATUS_INVALID_VALUE:
return "CUBLAS_STATUS_INVALID_VALUE";
case CUBLAS_STATUS_ARCH_MISMATCH:
return "CUBLAS_STATUS_ARCH_MISMATCH";
case CUBLAS_STATUS_MAPPING_ERROR:
return "CUBLAS_STATUS_MAPPING_ERROR";
case CUBLAS_STATUS_EXECUTION_FAILED:
return "CUBLAS_STATUS_EXECUTION_FAILED";
case CUBLAS_STATUS_INTERNAL_ERROR:
return "CUBLAS_STATUS_INTERNAL_ERROR";
}
return "<unknown>";
}
int main() {
size_t dims = 4;
double *vec, *mat, *results;
cudaErrChk( cudaMallocManaged(&vec, dims * sizeof(double)) );
cudaErrChk( cudaMallocManaged(&mat, dims * dims * sizeof(double)) );
cudaErrChk( cudaMallocManaged(&results, dims * sizeof(double)) );
printf("Vector:\n");
// --- 0-based loops so the last element is not written past the end of the managed buffers
for (int i = 0; i < dims; i++) {
vec[i] = 0.5 * (i + 1);
printf("%.2lf ", vec[i]);
}
printf("\n\nMatrix:\n");
for (int i = 0; i < dims * dims; i++) {
mat[i] = 1.0 * (i + 1);
printf("%.2lf ", mat[i]);
if ((i + 1) % dims == 0)
printf("\n");
}
printf("\n");
cublasHandle_t handle;
cublasErrChk( cublasCreate(&handle) );
double alpha = 1.0, beta = 0.0; // beta = 0 so the uninitialized results buffer does not contribute
// multiply mat by vec to get results
cublasErrChk(
cublasDgemv(
handle, CUBLAS_OP_N,
dims, dims,
&alpha,
mat, dims,
vec, 1,
&beta,
results, 1
)
);
cudaDeviceSynchronize();
for (int i = 0; i < dims; i++)
printf("%.2lf ", results[i]);
printf("\n");
cudaErrChk( cudaFree(vec) );
cudaErrChk( cudaFree(mat) );
cudaErrChk( cudaFree(results) );
return 0;
}
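A slightly narrower alternative to a full cudaDeviceSynchronize(), sketched below on the assumption that you may later attach a non-default stream to the handle, is to synchronize only on the stream the cuBLAS handle works on (cublasGetStream plus cudaStreamSynchronize) before reading the managed results buffer on the host:

// Sketch: synchronize only the cuBLAS handle's stream before the host touches "results".
cudaStream_t stream;
cublasGetStream(handle, &stream);              // the default stream unless one was set with cublasSetStream
cudaErrChk( cudaStreamSynchronize(stream) );
for (int i = 0; i < dims; i++)
    printf("%.2lf ", results[i]);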

CuFFT Unknown Error

I have an array of 300,000 points and I want the FFT of every 600 points. I'm attempting to use cufftPlanMany to execute the transforms, but I'm getting an unknown error here:
cufftSafeCall(cufftPlanMany(&plan, rank, n, NULL, istride, idist, NULL, 1,1, CUFFT_C2C, 500));
retrevialfft.cu(82) : cufftSafeCall() CUFFT error: <unknown>
Here's the code in context
cudaSetDevice(0);
// Allocate host memory for the signal
cufftComplex* h_signal=(cufftComplex*)malloc(sizeof(cufftComplex) * SIGNAL_SIZE);
// Initalize the memory for the signal
for (unsigned int i = 0; i < SIGNAL_SIZE; ++i) {
h_signal[i].x = rand() / (float)RAND_MAX;
h_signal[i].y = 0;
// printf("Orignal: %f %f \n", h_signal[i].x, h_signal[i].y);
}
int mem_size = sizeof(cufftComplex) * SIGNAL_SIZE;
// Allocate device memory for signal
cufftComplex* d_signal;
cudaMalloc((void**)&d_signal, mem_size);
int rank = 1; //1d plan
int numCols = 300000;
int n[] = {numCols};
int batch = 500;
int istride = 1;
int ostride = 1;
int idist = numCols;
// CUFFT plan
cufftHandle plan;
cufftSafeCall(cufftPlanMany(&plan, rank, n, NULL, istride, idist, NULL, 1,1, CUFFT_C2C, 500));
// Transform signal
printf("Transforming signal cufftExecC2C\n");
cufftSafeCall(cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_FORWARD));
// Copy device memory to host
cufftComplex* h_transformed = (cufftComplex*)malloc(sizeof(cufftComplex) * SIGNAL_SIZE);;
cudaMemcpy(h_transformed, d_signal, mem_size,
cudaMemcpyDeviceToHost);
//Destroy CUFFT context
cufftDestroy(plan);
// cleanup memory
free(h_signal);
free(h_transformed);
cudaFree(d_signal);
cudaDeviceReset();
Any idea of what the error actually is?
Since you decided not to show any more detail on your question, below I'm providing a full working code that uses cufftPlanMany() to execute batched 1D FFTs. I hope it helps.
#include <stdio.h>
#include <stdlib.h>
#include <cufft.h>
#include <assert.h>
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) { getchar(); exit(code); }
}
}
/*********************/
/* CUFFT ERROR CHECK */
/*********************/
static const char *_cudaGetErrorEnum(cufftResult error)
{
switch (error)
{
case CUFFT_SUCCESS:
return "CUFFT_SUCCESS";
case CUFFT_INVALID_PLAN:
return "CUFFT_INVALID_PLAN";
case CUFFT_ALLOC_FAILED:
return "CUFFT_ALLOC_FAILED";
case CUFFT_INVALID_TYPE:
return "CUFFT_INVALID_TYPE";
case CUFFT_INVALID_VALUE:
return "CUFFT_INVALID_VALUE";
case CUFFT_INTERNAL_ERROR:
return "CUFFT_INTERNAL_ERROR";
case CUFFT_EXEC_FAILED:
return "CUFFT_EXEC_FAILED";
case CUFFT_SETUP_FAILED:
return "CUFFT_SETUP_FAILED";
case CUFFT_INVALID_SIZE:
return "CUFFT_INVALID_SIZE";
case CUFFT_UNALIGNED_DATA:
return "CUFFT_UNALIGNED_DATA";
}
return "<unknown>";
}
#define cufftSafeCall(err) __cufftSafeCall(err, __FILE__, __LINE__)
inline void __cufftSafeCall(cufftResult err, const char *file, const int line)
{
if( CUFFT_SUCCESS != err) {
fprintf(stderr, "CUFFT error in file '%s', line %d\n %s\nerror %d: %s\nterminating!\n",__FILE__, __LINE__,err, \
_cudaGetErrorEnum(err)); \
cudaDeviceReset(); assert(0); \
}
}
/********/
/* MAIN */
/********/
int main() {
int batch = 3; // --- How many transforms to be performed
int numCols = 16; // --- Size of each transform
int SIGNAL_SIZE = batch * numCols; // --- Overall size for all the signals
// --- Allocate host memory for all the signals
cufftComplex* h_signal=(cufftComplex*)malloc(sizeof(cufftComplex) * SIGNAL_SIZE);
// --- Initialize host memory for all the signals
for (unsigned int i = 0; i < SIGNAL_SIZE; ++i) {
h_signal[i].x = 1.f;
h_signal[i].y = 0.f;
}
// --- Allocate device memory for all the signals
cufftComplex* d_signal; gpuErrchk(cudaMalloc((void**)&d_signal, sizeof(cufftComplex) * SIGNAL_SIZE));
// --- Host to Device memcopy
gpuErrchk(cudaMemcpy(d_signal, h_signal, sizeof(cufftComplex) * SIGNAL_SIZE, cudaMemcpyHostToDevice));
int rank = 1; // --- 1d plan
int n[] = {numCols};
int istride = 1;
int ostride = 1;
int idist = numCols;
int odist = numCols;
// --- CUFFT plan
cufftHandle plan;
cufftSafeCall(cufftPlanMany(&plan, rank, n, NULL, istride, idist, NULL, ostride, odist, CUFFT_C2C, batch)); // --- batch transforms, not a hard-coded 500
// --- Signals transformations
cufftSafeCall(cufftExecC2C(plan, (cufftComplex*)d_signal, (cufftComplex*)d_signal, CUFFT_FORWARD));
// --- Device to Host memcopy
gpuErrchk(cudaMemcpy(h_signal, d_signal, sizeof(cufftComplex) * SIGNAL_SIZE, cudaMemcpyDeviceToHost));
for (unsigned int i = 0; i < SIGNAL_SIZE; ++i) printf("Real part = %f; Imaginary part = %f\n", h_signal[i].x, h_signal[i].y);
// --- Destroy CUFFT context
cufftSafeCall(cufftDestroy(plan));
// --- Memory cleanup
free(h_signal);
gpuErrchk(cudaFree(d_signal));
cudaDeviceReset();
}
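Mapping this back to the sizes in the question (300,000 samples, an FFT every 600 points) would presumably mean 500 transforms of length 600, rather than transforms of length 300,000. A sketch of the corresponding plan, assuming the 500 signals are stored contiguously one after the other:

// Sketch (assumption based on the question's description): 500 batched 1D C2C FFTs
// of length 600 over a contiguous array of 300,000 cufftComplex samples.
int rank    = 1;
int n[]     = {600};              // length of each transform
int batch   = 300000 / 600;       // = 500 transforms
int istride = 1, ostride = 1;     // samples within one signal are contiguous
int idist   = 600, odist = 600;   // distance between the first samples of consecutive signals
cufftHandle plan;
cufftSafeCall(cufftPlanMany(&plan, rank, n, NULL, istride, idist,
                            NULL, ostride, odist, CUFFT_C2C, batch));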

Wrong results of a CUDA dynamic parallelism code

I recently bumped into the problem illustrated at Uncorrectable ECC error. In short, from time to time I receive an uncorrectable ECC error and my dynamic parallelism code generates incorrect results. The most probable hypothesis for the uncorrectable ECC error is a corrupted driver stack, which has also been indirectly confirmed by the experience of another user (see the above post). I would now like to face the second issue, i.e., the algorithmic one. To this end, I'm dealing with the reproducer reported below, which uses dynamic parallelism since the original code generating the incorrect results uses this CUDA feature too.
I do not see any evident issue with this code. I think that the synchronization around the child kernel launch should be fine: the first __syncthreads() should not be necessary, and the cudaDeviceSynchronize() should ensure that all the memory writes of the child kernel are completed before the printf.
My question is: is this code wrong, or are the wrong results due to a non-programming issue?
My configuration: CUDA 5.0, Windows 7, 4-GPU system equipped with Kepler K20c, driver 327.23.
#include <stdio.h>
#include <conio.h>
#define K 6
#define BLOCK_SIZE 256
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) { getch(); exit(code); }
}
}
int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }
__global__ void child_kernel(double* P1)
{
int m = threadIdx.x;
P1[m] = (double)m;
}
__global__ void parent_kernel(double* __restrict__ x, int M)
{
int i = threadIdx.x + blockDim.x * blockIdx.x;
if(i<M) {
double* P1 = new double[13];
dim3 dimBlock(2*K+1,1); dim3 dimGrid(1,1);
__syncthreads();
child_kernel<<<dimGrid,dimBlock>>>(P1);
cudaDeviceSynchronize();
for(int m=0; m<2*K+1; m++) printf("%f %f\n",P1[m],(double)m);
}
}
int main() {
const int M = 19000;
//gpuErrchk(cudaSetDevice(0));
double* x = (double*)malloc(M*sizeof(double));
for (int i=0; i<M; i++) x[i] = (double)i;
double* d_x; gpuErrchk(cudaMalloc((void**)&d_x,M*sizeof(double)));
gpuErrchk(cudaMemcpy(d_x,x,M*sizeof(double),cudaMemcpyHostToDevice));
dim3 dimBlock(BLOCK_SIZE,1); dim3 dimGrid(iDivUp(M,BLOCK_SIZE));
parent_kernel<<<dimGrid,dimBlock>>>(d_x,M);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
getch();
return 0;
}
I'm pretty sure you're exceeding the launch pending limit. It's nearly impossible to tell with your code as-is, but I've modified it and added error checking on the child kernel launch.
When I do that, I get launch errors, signified by a printout of !. Skipping the launch error cases, all of my in-kernel checking of P1[m] vs. m passes (I get no * printout at all.)
#include <stdio.h>
#define K 6
#define BLOCK_SIZE 256
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) { exit(code); }
}
}
int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }
__global__ void child_kernel(unsigned long long* P1)
{
int m = threadIdx.x;
P1[m] = (unsigned long long)m;
}
__global__ void parent_kernel(double* __restrict__ x, int M)
{
int i = threadIdx.x + blockDim.x * blockIdx.x;
if(i<M) {
unsigned long long* P1 = new unsigned long long[13];
dim3 dimBlock(2*K+1,1); dim3 dimGrid(1,1);
__syncthreads();
child_kernel<<<dimGrid,dimBlock>>>(P1);
cudaDeviceSynchronize();
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) printf("!");
else for(unsigned long long m=0; m<dimBlock.x; m++) if (P1[m] != m) printf("*");
}
}
int main() {
const int M = 19000;
//gpuErrchk(cudaSetDevice(0));
double* x = (double*)malloc(M*sizeof(double));
for (int i=0; i<M; i++) x[i] = (double)i;
double* d_x; gpuErrchk(cudaMalloc((void**)&d_x,M*sizeof(double)));
gpuErrchk(cudaMemcpy(d_x,x,M*sizeof(double),cudaMemcpyHostToDevice));
dim3 dimBlock(BLOCK_SIZE,1); dim3 dimGrid(iDivUp(M,BLOCK_SIZE));
parent_kernel<<<dimGrid,dimBlock>>>(d_x,M);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
return 0;
}
Feel free to add further decoding of the err variable in the parent kernel to convince yourself that you are exceeding the launch pending limit. As another test, you can set M to 2048 instead of 19000 in your host code, and all the ! printouts go away. (launch pending limit default == 2048)
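If the pending launch limit is indeed what is being exceeded, it can also be raised from the host before the parent launch. A minimal sketch, not part of the original answer, using cudaDeviceSetLimit with cudaLimitDevRuntimePendingLaunchCount (note that raising the limit reserves additional device memory):

// Sketch: raise the device-runtime pending launch limit (default 2048) so that up to
// M child kernels can be queued by the parent grid.
gpuErrchk(cudaDeviceSetLimit(cudaLimitDevRuntimePendingLaunchCount, M));
parent_kernel<<<dimGrid,dimBlock>>>(d_x,M);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());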
As I've stated in the comments, I think the uncorrectable ECC error is a separate issue, and I suggest trying the driver 321.01 that I linked in the comments.

I lost data after __syncthreads() in cuda

I am trying to find the maximum of an array. I took help from CUDA Maximum Reduction Algorithm Not Working and made some modifications of my own. I am running it for 16 data points, and I find that in the kernel code shared memory holds only the first 4 values; the rest are lost. I put in two cuPrintf calls: the first printf shows the data is there in shared memory, but the second cuPrintf, placed just after __syncthreads(), shows 0 from thread id 4 onwards. Please help.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include "cuPrintf.cu"
#include "cuPrintf.cuh"
__device__ float MaxOf2(float a, float b)
{
if(a > b) return a;
else return b;
}
__global__ void findMax(int size,float *array_device , float *outPut)
{
extern __shared__ float sdata[];
int tid = threadIdx.x;
int i = blockIdx.x*blockDim.x + threadIdx.x;
if(i< size)
{
sdata[tid] = array_device[i];
cuPrintf(" array_d[%d]===%f, sdata[%d]===%f\n ",i,array_device[i],tid,sdata[tid]);
__threadfence();
}
__syncthreads();
if(tid<size)
cuPrintf(" array_d[%d]===%f, sdata[%d]===%f\n ",i,array_device[i],tid,sdata[tid]);
for ( int s=blockDim.x/2; s>0; s=s>>1)//s=blockDim.x/2
{
if (tid < s)
{
sdata[tid]= MaxOf2(sdata[tid],sdata[tid+s]);
}
__syncthreads();
}
if (tid == 0) outPut[blockIdx.x] = sdata[0];
}
int main()
{
long double M = pow(2,20);
long double N = 2;
int noThreadsPerBlock = 512 ;
printf("\n Provide the array Size N.(array will be of size N * 2^20 ) :-");
scanf("%Lf",&N);
long int size = 16;
int numOfBlock = (int)size /noThreadsPerBlock + 1;
printf("\n num of blocks==%ld",numOfBlock);
float *array_device , *outPut;
float array_host[]={221,100,2,340,47,36,500,1,33,4460,5,6,7,8,9,11};
cudaMalloc((void **)&array_device, size*sizeof(float));
cudaMalloc((void **)&outPut, size*sizeof(float));
cudaError_t error0 = cudaGetLastError();
printf("\n 0CUDA error: %s\n", cudaGetErrorString(error0));
printf("size===%ld",size);
cudaMemcpy(array_device, array_host, size*sizeof(float), cudaMemcpyHostToDevice);
cudaError_t error1 = cudaGetLastError();
printf("\n1CUDA error: %s\n", cudaGetErrorString(error1));
while(size>1 )
{
cudaPrintfInit();
findMax<<< numOfBlock,noThreadsPerBlock>>>(size,array_device, outPut);cudaPrintfDisplay(stdout, true);
cudaPrintfEnd();
cudaError_t error2 = cudaGetLastError();
printf(" 2CUDA error: %s\n", cudaGetErrorString(error2));
cudaMemcpy(array_device, outPut, size*sizeof(float), cudaMemcpyDeviceToDevice);
size = numOfBlock;
printf("\n ****size==%ld\n",size);
numOfBlock = (int)size /noThreadsPerBlock + 1;
}
cudaMemcpy(array_host, outPut, size*sizeof(float), cudaMemcpyDeviceToHost);
cudaError_t error3 = cudaGetLastError();
printf("\n3CUDA error: %s\n", cudaGetErrorString(error3));
for(int i=0;i<size;i++)
printf("\n index==%d ;data=%f ",i,array_host[i]);
return 0;
}
I'm posting my comment as an answer as requested.
Firstly, you haven't specified the dynamic shared memory size in the kernel launch. It should look something like:
findMax<<< numOfBlock,noThreadsPerBlock,sizeof(float)*noThreadsPerBlock>>>
Secondly, what was the idea behind the condition if(tid<size) on the second cuPrintf? Providing the output of the program could also help.
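For reference, here is a minimal sketch of the corrected launch using the names from the question; only the third launch-configuration argument changes, and it must match the per-block shared memory the kernel expects:

// The third launch-configuration argument sizes the dynamic shared memory that backs
// "extern __shared__ float sdata[]" in findMax: one float per thread of the block.
size_t smemBytes = noThreadsPerBlock * sizeof(float);
findMax<<<numOfBlock, noThreadsPerBlock, smemBytes>>>(size, array_device, outPut);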

Cuda Texture Memory does not inherit the right Values

I'm trying to bind a 2D array to a texture and to interpolate between the data. My problem is that when I bind my array to the texture, the values I access are total nonsense. Even when I try to access the first value (tex2D(tex, 0.0f, 0.0f)), the result doesn't make sense, so I guess I'm binding it wrong or my memcopy is wrong. Any ideas where my mistake is?
Here is the code:
#include <stdio.h>
#include <iostream>
#include "cuda.h"
#include <stdlib.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "HelloWorld.h"
#include "linearInterpolation_kernel.cu"
#include "linearInterpolation_kernel2.cu"
#include "linearInterpolation_kernel3.cu"
using namespace std;
using std::cout;
const int blocksize = 16;
__global__
void hello(char *a, int *b) {
a[threadIdx.x] += b[threadIdx.x];
}
////////////////////////////////////////////////////////////////////////////////
// These are CUDA Helper functions
// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)
inline void __checkCudaErrors( cudaError err, const char *file, const int line )
{
if( cudaSuccess != err) {
printf("%s(%i) : CUDA Runtime API error %d: %s.\n",file, line, (int)err, cudaGetErrorString( err ) );
}
}
// This will output the proper error string when calling cudaGetLastError
#define getLastCudaError(msg) __getLastCudaError (msg, __FILE__, __LINE__)
inline void __getLastCudaError( const char *errorMessage, const char *file, const int line )
{
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err) {
printf("%s(%i) : getLastCudaError() CUDA error : %s : (%d) %s.\n", file, line, errorMessage, (int)err, cudaGetErrorString( err ) );
}
}
int main()
{
int N = 200;
float *A;
A = (float *) malloc(N*sizeof(float));
float *B;
B = (float *) malloc(N*sizeof(float));
float *result;
result = (float *) malloc(N*sizeof(float));
float angle = 0.5f;
for(int i = 0; i < N; i++){
A[i] = (float)rand();
B[i] = (float)rand();
}
cout << A[3] << endl;
cout << B[3] << endl;
ipLinearTexture(A,B,result,angle,N);
float result2;
result2 = (angle)*A[3] + (1-angle)*B[3];
printf(" A %f B %f Result %f\n", A[3], B[3], result[3]);
cout << result2 << endl;
return 1;
}
void ipLinearTexture(float *A, float* B, float* result, float angle, int N)
{
float cuTime;
const int N2 = N;
float *dev_result;
float **AB;
AB = (float **) malloc( N * sizeof(float *));
if(AB)
{
for(int i = 0; i < N; i++)
{
AB[i] = (float *) calloc( 2 , sizeof(float *));
}
}
for (int i = 0; i < N; i++)
{
AB[i][0] = A[i];
AB[i][1] = B[i];
}
cudaMalloc(&dev_result, N * sizeof(float));
unsigned int size = N * 2 * sizeof(float);
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaArray* cu_array;
checkCudaErrors(cudaMallocArray( &cu_array, &channelDesc,N,2 ));
checkCudaErrors(cudaMemcpyToArray( cu_array, 0, 0, AB, size, cudaMemcpyHostToDevice));
tex.addressMode[0] = cudaAddressModeClamp;
tex.addressMode[1] = cudaAddressModeClamp;
tex.filterMode = cudaFilterModeLinear;
tex.normalized = false; // access with normalized texture coordinates
checkCudaErrors(cudaBindTextureToArray( tex, cu_array, channelDesc));
dim3 dimBlock(10, 1, 1);
dim3 dimGrid((int)ceil((double)N*2/dimBlock.x), 1, 1);
transformKernel3<<< dimGrid, dimBlock, 0 >>>( dev_result, N, 2, angle);
checkCudaErrors(cudaUnbindTexture(tex));
cudaMemcpy(result, dev_result, N * sizeof(float), cudaMemcpyKind::cudaMemcpyDeviceToHost);
result[0] = (float)cuTime;
cout << "==================================================" << endl;
for (int i = 0 ; i < N ;i++)
{
cout << result[i] << endl;
}
cout << "==================================================" << endl;
cudaFree(dev_result);
cudaFreeArray(cu_array);
}
Here is the code inside the kernel:
#ifndef _SIMPLETEXTURE_KERNEL3_H_
#define _SIMPLETEXTURE_KERNEL3_H_
// declare texture reference for 2D float texture
texture<float, 1> tex;
////////////////////////////////////////////////////////////////////////////////
//! Transform an image using texture lookups
//! #param g_odata output data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void
transformKernel3( float* g_odata, int width, int height, float theta)
{
unsigned int id = blockIdx.x*blockDim.x + threadIdx.x;
if (id < width*height)
{
g_odata[id] = tex1D(tex, xid * 2 + 0.5f);
}
}
#endif // #ifndef _SIMPLETEXTURE_KERNEL_H_
As in OpenGL, you can think of a 2D texture as a rectangular field; the center point of each small rectangle holds your array data. So, with normalized coordinates, tex2D(tex, 0.5f/width, 0.5f/height) will be exactly the first value of your array data (width and height being the width and height of the 2D array data); with unnormalized coordinates, as your code uses (tex.normalized = false), the first element sits at tex2D(tex, 0.5f, 0.5f).
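To make the addressing concrete, here is a minimal sketch of texel-centered lookups with the legacy texture-reference API used in the question; the texRef declaration and the readRow kernel are illustrative, not taken from the original code:

// Sketch: with an unnormalized 2D float texture (normalized = false), element (col,row)
// of the underlying cudaArray sits at the texel center (col + 0.5f, row + 0.5f), so a
// fetch at the center returns the stored value even with cudaFilterModeLinear.
texture<float, 2, cudaReadModeElementType> texRef;

__global__ void readRow(float *out, int width, int row)
{
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (col < width)
        out[col] = tex2D(texRef, col + 0.5f, row + 0.5f);
}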