cuFFT Double to Complex - CUDA

I want to compute an FFT from double to std::complex with the cuFFT library. My code looks like this:
#include <complex>
#include <iostream>
#include <cufft.h>
#include <cuda_runtime_api.h>
typedef std::complex<double> Complex;
using namespace std;
int main(){
    int n = 100;
    double* in;
    Complex* out;
    in = (double*) malloc(sizeof(double) * n);
    out = (Complex*) malloc(sizeof(Complex) * n/2+1);
    for(int i=0; i<n; i++){
        in[i] = 1;
    }
    cufftHandle plan;
    plan = cufftPlan1d(&plan, n, CUFFT_D2Z, 1);
    unsigned int mem_size = sizeof(double)*n;
    cufftDoubleReal *d_in;
    cufftDoubleComplex *d_out;
    cudaMalloc((void **)&d_in, mem_size);
    cudaMalloc((void **)&d_out, mem_size);
    cudaMemcpy(d_in, in, mem_size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_out, out, mem_size, cudaMemcpyHostToDevice);
    int succes = cufftExecD2Z(plan,(cufftDoubleReal *) d_in,(cufftDoubleComplex *) d_out);
    cout << succes << endl;
    cudaMemcpy(out, d_out, mem_size, cudaMemcpyDeviceToHost);
    for(int i=0; i<n/2; i++){
        cout << "out: " << i << " " << out[i].real() << " " << out[i].imag() << endl;
    }
    return 0;
}
This seems wrong to me: I think the transformed values should be 1 0 0 0 0 ... (or, without the normalization, 100 0 0 0 0 ...), but I just get 0 0 0 0 0 ...
Furthermore, I would prefer it if cufftExecD2Z worked in place, which should be possible, but I haven't figured out how to do that correctly. Can anybody help?

Your code has a variety of errors. You should probably review the cuFFT documentation as well as the sample codes.
You should do proper CUDA error checking and proper cuFFT error checking on all API return values.
The return value of the cufftPlan1d function does not go into the plan:
plan = cufftPlan1d(&plan, n, CUFFT_D2Z, 1);
The function itself sets the plan (that is why you pass &plan to it); its return value is a cufftResult status code. Assigning that return value to the plan variable clobbers the plan the function just set up.
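Instead, check the status and leave the handle alone, i.e. (exactly as in the fixed code further down):

    cufftHandle plan;
    cufftResult res = cufftPlan1d(&plan, n, CUFFT_D2Z, 1);
    if (res != CUFFT_SUCCESS) { /* report the error and bail out */ }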
You correctly identified that the output is of size ((N/2)+1), but then you didn't allocate space for it properly, either on the host side:
out = (Complex*) malloc(sizeof(Complex) * n/2+1);
(because of operator precedence this allocates (sizeof(Complex)*n)/2 + 1 bytes, not sizeof(Complex)*(n/2+1) bytes)
or on the device side:
unsigned int mem_size = sizeof(double)*n;
...
cudaMalloc((void **)&d_out, mem_size);
The following code has some of the above problems fixed, enough to get your desired result (100, 0, 0, ...):
#include <complex>
#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <cufft.h>
#include <cuda_runtime_api.h>
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)
typedef std::complex<double> Complex;
using namespace std;
int main(){
    int n = 100;
    double* in;
    Complex* out;
#ifdef IN_PLACE
    in = (double*) malloc(sizeof(Complex) * (n/2+1));
    out = (Complex*)in;
#else
    in = (double*) malloc(sizeof(double) * n);
    out = (Complex*) malloc(sizeof(Complex) * (n/2+1));
#endif
    for(int i=0; i<n; i++){
        in[i] = 1;
    }
    cufftHandle plan;
    cufftResult res = cufftPlan1d(&plan, n, CUFFT_D2Z, 1);
    if (res != CUFFT_SUCCESS) {cout << "cufft plan error: " << res << endl; return 1;}
    cufftDoubleReal *d_in;
    cufftDoubleComplex *d_out;
    unsigned int out_mem_size = (n/2 + 1)*sizeof(cufftDoubleComplex);
#ifdef IN_PLACE
    unsigned int in_mem_size = out_mem_size;
    cudaMalloc((void **)&d_in, in_mem_size);
    d_out = (cufftDoubleComplex *)d_in;
#else
    unsigned int in_mem_size = sizeof(cufftDoubleReal)*n;
    cudaMalloc((void **)&d_in, in_mem_size);
    cudaMalloc((void **)&d_out, out_mem_size);
#endif
    cudaCheckErrors("cuda malloc fail");
    cudaMemcpy(d_in, in, in_mem_size, cudaMemcpyHostToDevice);
    cudaCheckErrors("cuda memcpy H2D fail");
    res = cufftExecD2Z(plan, d_in, d_out);
    if (res != CUFFT_SUCCESS) {cout << "cufft exec error: " << res << endl; return 1;}
    cudaMemcpy(out, d_out, out_mem_size, cudaMemcpyDeviceToHost);
    cudaCheckErrors("cuda memcpy D2H fail");
    for(int i=0; i<n/2; i++){
        cout << "out: " << i << " " << out[i].real() << " " << out[i].imag() << endl;
    }
    return 0;
}
Review the documentation on what is necessary to do an in-place transform in the real-to-complex case. The above code can be recompiled with -DIN_PLACE to see the behavior for an in-place transform, and the necessary code changes.
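For reference, the key documentation point is the data layout. A minimal sketch of the in-place buffer arithmetic (1D, batch 1, even n, same names as above):

    // One buffer sized for the complex output: (n/2 + 1) cufftDoubleComplex
    // elements, i.e. 2*(n/2 + 1) doubles. The n real input samples occupy
    // the first n double slots; the remaining slots are padding cuFFT may use.
    double *buf = (double*) malloc(sizeof(cufftDoubleComplex) * (n/2 + 1));
    for (int i = 0; i < n; i++) buf[i] = 1;  // real input at the front
    // ... copy to an identically-sized device buffer d_buf, then transform in place:
    // cufftExecD2Z(plan, (cufftDoubleReal *)d_buf, (cufftDoubleComplex *)d_buf);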


Is the CUDA atomicAdd operation faster than launching another kernel when doing a reduction sum?

I need to sum up a vector which is longer than the number of threads in a CUDA block, so I use multiple blocks to handle the task. I sum up a part of the vector within each block, after which I have two options: one is to use atomicAdd to combine the sums of the blocks, and the other is to write the results to global memory and launch another kernel to sum them up. Which method do you recommend?
For the following test case, using code lifted from slides 16 and 17 in the training here (video), the atomicAdd version seems to be a bit faster. The difference is about the cost of a kernel launch overhead, which makes sense:
$ cat t1834.cu
#include <time.h>
#include <sys/time.h>
#include <iostream>
const int BLOCK_SIZE = 1024;
template <typename T>
__global__ void reduce(const T * __restrict__ gdata, T * __restrict__ out, const int N){
    __shared__ T sdata[BLOCK_SIZE];
    int tid = threadIdx.x;
    sdata[tid] = 0.0;
    size_t idx = threadIdx.x+blockDim.x*blockIdx.x;
    while (idx < N) { // grid stride loop to load data
        sdata[tid] += gdata[idx];
        idx += gridDim.x*blockDim.x;
    }
    for (unsigned int s=blockDim.x/2; s>0; s>>=1) {
        __syncthreads();
        if (tid < s) // parallel sweep reduction
            sdata[tid] += sdata[tid + s];
    }
    if (tid == 0)
#ifndef USE_ATOMIC
        out[blockIdx.x] = sdata[0];
#else
        atomicAdd(out, sdata[0]);
#endif
}
#define USECPSEC 1000000ULL
unsigned long long dtime_usec(unsigned long long start){
    timeval tv;
    gettimeofday(&tv, 0);
    return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
typedef float mt;
const int ds = 1048576*8;
int main(){
    mt *h_gdata, *d_gdata, *h_out, *d_out;
    h_gdata = new mt[ds];
    cudaMalloc(&d_gdata, ds*sizeof(mt));
    const int nblocks = 160;
    h_out = new mt[1];
    cudaMalloc(&d_out, nblocks*sizeof(mt));
    for (int i = 0; i < ds; i++) h_gdata[i] = 1;
    cudaMemcpy(d_gdata, h_gdata, ds*sizeof(mt), cudaMemcpyHostToDevice);
    reduce<<<nblocks, BLOCK_SIZE>>>(d_gdata, d_out, ds); // warm-up
    cudaDeviceSynchronize();
    cudaMemset(d_out, 0, sizeof(mt));
    unsigned long long dt = dtime_usec(0);
    reduce<<<nblocks, BLOCK_SIZE>>>(d_gdata, d_out, ds);
#ifndef USE_ATOMIC
    reduce<<<1, BLOCK_SIZE>>>(d_out, d_out, nblocks);
#endif
    cudaDeviceSynchronize();
    dt = dtime_usec(dt);
    cudaMemcpy(h_out, d_out, sizeof(mt), cudaMemcpyDeviceToHost);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {std::cout << "CUDA error: " << cudaGetErrorString(err) << std::endl; return 0;}
    if (h_out[0] != ds) {std::cout << "Reduce Error: " << h_out[0] << std::endl; return 0;}
    std::cout << "Timing: " << dt << "us" << std::endl;
    return 0;
}
$ nvcc -lineinfo -arch=sm_70 -O3 -o t1834 t1834.cu -std=c++14 -Wno-deprecated-gpu-targets
$ ./t1834
Timing: 69us
$ nvcc -lineinfo -arch=sm_70 -O3 -o t1834 t1834.cu -std=c++14 -Wno-deprecated-gpu-targets -DUSE_ATOMIC
$ ./t1834
Timing: 66us
$
(CUDA 11.2, Centos 7, V100 GPU)

Why does computing L2 norm with cuBLAS result in an error?

Edit 2: included the more complete program
Edit 1: included the full program
I'm trying to compute the L2 norm of a vector using cuBLAS. My code is as follows:
void GPU_Print_Matrix(real_t *A, int nrows, int ncols) {
    real_t *hostA = (real_t*)malloc(nrows*ncols * sizeof(real_t));
    CUDA_SAFE_CALL(cudaMemcpy(hostA, A, nrows*ncols * sizeof(real_t), cudaMemcpyDeviceToHost));
    cout << "GPU Matrix of Size: " << nrows << "x" << ncols << endl;
    for (int i = 0; i < nrows; ++i) {
        for (int j = 0; j < ncols; ++j) {
            cout << fixed << setprecision(PRINT_PRECISION) << hostA[j*nrows + i] << " ";
        }
        cout << endl;
    }
    free(hostA);
    cout << endl;
}
void GPU_Random_Vector(thrust::device_vector <real_t> &vec) {
    thrust::counting_iterator<unsigned int> index_sequence_begin(rand());
    thrust::transform(index_sequence_begin, index_sequence_begin + vec.size(), vec.begin(), RANDOM(-initRange, initRange));
}
int main(int argc, char *argv[]) {
    srand(clock());
    cout << "# Running NMT" << endl;
    //ParseOpts(argc, argv);
    cublasHandle_t handle;
    CUBLAS_SAFE_CALL(cublasCreate(&handle));
    thrust::device_vector <real_t> x(10);
    GPU_Random_Vector(x);
    GPU_Print_Matrix(thrust::raw_pointer_cast(&x[0]), 10, 1);
    real_t nrm = 0;
    CUBLAS_SAFE_CALL(cublasXnrm2(handle, 10, thrust::raw_pointer_cast(&x[0]), 1, &nrm));
    cout << "nrm2 = " << nrm << endl;
}
Here, CUBLAS_SAFE_CALL is defined as follows:
#define CUBLAS_SAFE_CALL(call) \
{ \
    const cublasStatus_t stat = call; \
    if (stat != CUBLAS_STATUS_SUCCESS) { \
        cout << "cuBlas Error: " << __FILE__ << ":" << __LINE__ << endl; \
        cout << " Code: " << stat << endl; \
        exit(1); \
    } \
}
GPU_Random_Vector and GPU_Print_Matrix have been confirmed to work before. Also, cublasHandle[singleGPU] has been initialized before being used. When I ran the program, I got the following output:
// GPU_Print_Matrix
GPU Matrix of Size: 10x1
0.0652332678
0.0747700930
0.0274266358
-0.0885794610
-0.0192640368
-0.0942506194
0.0283640027
-0.0411146656
-0.0460337885
-0.0970785618
cuBlas Error: nmt.cu:2252
Code: 14
What is going on? And is there any reference for how I can interpret cuBLAS error numbers? Thanks a ton.
CUBLAS error 14 is CUBLAS_STATUS_INTERNAL_ERROR and would usually mean that the internal device-to-host copy at the end of the L2 norm call failed. But why that happened is impossible to say without some context about what else your code was doing.
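As for interpreting the numbers: they are the cublasStatus_t enum values from the cuBLAS header. A small helper along these lines (a sketch; the helper name is ours, the names and values below are the ones defined in the header, and newer toolkits add a few more codes) can make the error macro's output more readable:

    const char* cublasStatusName(cublasStatus_t stat)
    {
        switch (stat) {
            case CUBLAS_STATUS_SUCCESS:          return "CUBLAS_STATUS_SUCCESS";          // 0
            case CUBLAS_STATUS_NOT_INITIALIZED:  return "CUBLAS_STATUS_NOT_INITIALIZED";  // 1
            case CUBLAS_STATUS_ALLOC_FAILED:     return "CUBLAS_STATUS_ALLOC_FAILED";     // 3
            case CUBLAS_STATUS_INVALID_VALUE:    return "CUBLAS_STATUS_INVALID_VALUE";    // 7
            case CUBLAS_STATUS_ARCH_MISMATCH:    return "CUBLAS_STATUS_ARCH_MISMATCH";    // 8
            case CUBLAS_STATUS_MAPPING_ERROR:    return "CUBLAS_STATUS_MAPPING_ERROR";    // 11
            case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED"; // 13
            case CUBLAS_STATUS_INTERNAL_ERROR:   return "CUBLAS_STATUS_INTERNAL_ERROR";   // 14
            default:                             return "unknown cuBLAS status";
        }
    }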
If the code you posted is assembled and fleshed out into a complete demo case (with the trivial random-number-seeding mistake corrected) like this:
#include <iostream>
#include <iomanip>
#include <cstdlib>
#include <ctime>
#include <cublas_v2.h>
#include <thrust/transform.h>
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/random.h>
typedef float real_t;
#define CUBLAS_SAFE_CALL(call) \
{ \
    const cublasStatus_t stat = call; \
    if (stat != CUBLAS_STATUS_SUCCESS) { \
        std::cout << "cuBlas Error: " << __FILE__ << ":" << __LINE__ << std::endl; \
        std::cout << " Code: " << stat << std::endl; \
        exit(1); \
    } \
}
#define PRINT_PRECISION (6)
struct RANDOM
{
    real_t a, b;
    __host__ __device__
    RANDOM(real_t _a=0, real_t _b=1) : a(_a), b(_b) {};
    __host__ __device__
    real_t operator()(const unsigned int n) const
    {
        thrust::default_random_engine rng;
        thrust::uniform_real_distribution<float> dist(a, b);
        rng.discard(n);
        return dist(rng);
    }
};
void GPU_Print_Matrix(real_t *A, int nrows, int ncols) {
    real_t *hostA = (real_t*)malloc(nrows*ncols * sizeof(real_t));
    cudaMemcpy(hostA, A, nrows*ncols * sizeof(real_t), cudaMemcpyDeviceToHost);
    std::cout << "GPU Matrix of Size: " << nrows << "x" << ncols << std::endl;
    for (int i = 0; i < nrows; ++i) {
        for (int j = 0; j < ncols; ++j) {
            std::cout << std::fixed << std::setprecision(PRINT_PRECISION) << hostA[j*nrows + i] << " ";
        }
        std::cout << std::endl;
    }
    free(hostA);
    std::cout << std::endl;
}
void GPU_Random_Vector(thrust::device_vector <real_t> &vec) {
    const real_t initRange = 10;
    thrust::counting_iterator<unsigned int> index_sequence_begin(std::rand());
    thrust::transform(index_sequence_begin, index_sequence_begin + vec.size(), vec.begin(), RANDOM(-initRange, initRange));
}
int main(int argc, char *argv[]) {
    std::srand(std::time(0));
    std::cout << "# Running NMT" << std::endl;
    cublasHandle_t handle;
    CUBLAS_SAFE_CALL(cublasCreate(&handle));
    thrust::device_vector <real_t> x(10);
    GPU_Random_Vector(x);
    GPU_Print_Matrix(thrust::raw_pointer_cast(&x[0]), 10, 1);
    real_t nrm = 0;
    CUBLAS_SAFE_CALL(cublasSnrm2(handle, 10, thrust::raw_pointer_cast(&x[0]), 1, &nrm));
    std::cout << "nrm2 = " << nrm << std::endl;
}
and compiled and run like this (CUDA 6.5 if that matters):
>nvcc -arch=sm_21 -run runkkari.cu -lcublas
runkkari.cu
Creating library a.lib and object a.exp
# Running NMT
GPU Matrix of Size: 10x1
-5.712992
8.181723
-0.086308
-6.177320
-5.442665
-2.889552
-1.555665
6.506872
-6.800190
8.024273
nrm2 = 18.196394
It works as expected. You should be able to compile and run it to confirm this yourself. So from this we can only conclude that you have another problem which you have failed to describe. But perhaps this helps narrow down the list of possibilities.

Pthreads and cudaMemcpyAsync

I wrote a test program to test the following idea: (1) one CUDA stream copies data to the GPU, with the copy done in a pthread; (2) a second CUDA stream reads and processes the data; (3) the next piece of data is copied by the first stream only after the previous piece has been processed by the second stream.
However, it does not work: only the first piece of data is copied, and then it waits there forever.
#include "cuda.h"
#include <iostream>
#include <pthread.h>
const int UNPROCESSED = 1;
const int PROCESSED = 2;
const int DONE = 3;
const int RUNNING= 0;
const int NUM_OF_DATA = 100;
const int NUM_OF_BLOCKS = 1;
const int THREADS_PER_BLOCK = 1;
//int data_states[NUM_OF_DATA];
cudaStream_t cuda_stream[2];
volatile int* process_state;
volatile int* d_process_state;
volatile int* d_copier_state;
int* d_data_state;
int* h_data_states;
cudaError_t cuda_status;
using namespace std;
void* copy_data(void* arg){
int i=0;
//cout << "in copy_data" << endl;
while(i < NUM_OF_DATA){
if (*process_state != UNPROCESSED){
cout << "Now copy data " << i << " with state = " << h_data_states[i] << endl;
*process_state = UNPROCESSED;
cuda_status = cudaMemcpyAsync(d_data_state, &h_data_states[i], sizeof(int), cudaMemcpyHostToDevice, cuda_stream[0]);
if (cuda_status != cudaSuccess){
cout << "Error when allocating pinned host memory (full_instance_states)" << endl;
}
i++;
}
}
int copier_state = DONE;
cudaMemcpyAsync((void*) d_copier_state, &copier_state, sizeof(int), cudaMemcpyHostToDevice, cuda_stream[0]);
}
__global__ void process_data(volatile int* data_state, volatile int* process_state, volatile int* copier_state){
int i = 0;
printf(" i = %d\n", i);
while(*copier_state != DONE){
printf(" i = %d, copier_state = %d, data_state = %d\n", i, *copier_state, *data_state);
if(*data_state == UNPROCESSED){
printf("now processing data %d\n", i);
i++;
// process data here, skipped
*process_state = PROCESSED;
*data_state = PROCESSED;
//__threadfence_system();
}
}
printf("process_data is done\n");
}
int main(int argc, char **argv){
int i;
cudaSetDeviceFlags(cudaDeviceMapHost);
cuda_status = cudaMallocHost((void**) &process_state, NUM_OF_BLOCKS*sizeof(int), cudaHostAllocMapped);
if (cuda_status != cudaSuccess){
cout << "Error when allocating pinned host memory (full_instance_states)" << endl;
}
cudaHostGetDevicePointer((int**) &d_process_state, (int*) process_state, 0);
cuda_status = cudaMalloc((void**) &d_copier_state, NUM_OF_BLOCKS*sizeof(int));
if (cuda_status != cudaSuccess){
cout << "Error when allocating pinned host memory (full_instance_states)" << endl;
}
cudaMemset((void*)d_copier_state, RUNNING, sizeof(int));
cuda_status = cudaMallocHost((void**) &h_data_states, NUM_OF_DATA*sizeof(int), 0);
if (cuda_status != cudaSuccess){
cout << "Error when allocating pinned host memory (full_instance_states)" << endl;
}
for(i = 0; i < NUM_OF_DATA; i++){
h_data_states[i] = UNPROCESSED;
}
cudaStreamCreate(&cuda_stream[0]);
cudaStreamCreate(&cuda_stream[1]);
pthread_t thread;
int thread_state = pthread_create(&thread, NULL, &copy_data, h_data_states);
if(thread_state){
cout << "Error: unable to create thread (produce_instances), "<< thread_state << endl;
exit(-1);
}
//cout << "Starting kernel" << endl;
process_data<<<NUM_OF_BLOCKS, THREADS_PER_BLOCK, 0, cuda_stream[1]>>>(d_data_state, d_process_state, d_copier_state);
cudaDeviceSynchronize();
cudaFree(d_data_state);
cudaFree((void*) d_copier_state);
cudaFreeHost((void*) process_state);
return 0;
}
You never allocate d_data_state in any way. It is a NULL pointer throughout your program.
Therefore the usage here is invalid:
cuda_status = cudaMemcpyAsync(d_data_state, &h_data_states[i], sizeof(int), cudaMemcpyHostToDevice, cuda_stream[0]);
And when I run your program, I get the error printout from the next line of code.
Since your kernel also uses d_data_state (which is an invalid pointer) I get various invalid global read errors if I run your code with cuda-memcheck.
Since you have not allocated anything for d_data_state, your code cannot possibly work.
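The fix is simply to allocate it (as the complete code below does):

    cudaMalloc((void **)&d_data_state, sizeof(int));
    cudaCheckErrors("cudaMalloc 2 fail");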
You had several other issues in your code as well. As just one example:
int copier_state = DONE;
cudaMemcpyAsync((void*) d_copier_state, &copier_state, sizeof(int), cudaMemcpyHostToDevice, cuda_stream[0]);
In order for cudaMemcpyAsync to work as expected (i.e. be asynchronous and overlap with other stream activity), the host memory must be a pinned memory area. int copier_state = DONE; does not create a pinned allocation, so copying from it breaks the asynchronous overlap of the cudaMemcpyAsync operation.
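A staging variable allocated with cudaMallocHost fixes that (again, exactly as in the complete code below):

    int *h_copier_state;
    cudaMallocHost((void **)&h_copier_state, sizeof(int), 0); // pinned host memory
    *h_copier_state = DONE;
    cudaMemcpyAsync((void *)d_copier_state, h_copier_state, sizeof(int), cudaMemcpyHostToDevice, cuda_stream[0]);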
Here is a version of your code that works correctly for me (now updated with some additional guards against race conditions):
#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <pthread.h>
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)
const int UNPROCESSED = 1;
const int PROCESSED = 2;
const int DONE = 3;
const int RUNNING= 0;
const int NUM_OF_DATA = 100;
const int NUM_OF_BLOCKS = 1;
const int THREADS_PER_BLOCK = 1;
//int data_states[NUM_OF_DATA];
cudaStream_t cuda_stream[2];
volatile int* process_state;
volatile int* d_process_state;
volatile int* d_copier_state;
int* d_data_state;
int* h_data_states;
int* h_copier_state;
cudaError_t cuda_status;
using namespace std;
void* copy_data(void* arg){
    int i=0;
    cudaSetDevice(0);
    //cout << "in copy_data" << endl;
    while(i < NUM_OF_DATA){
        if (*process_state != UNPROCESSED){
            // cout << "Now copy data " << i << " with state = " << h_data_states[i] << endl;
            *process_state = UNPROCESSED;
            cudaMemcpyAsync(d_data_state, &(h_data_states[i]), sizeof(int), cudaMemcpyHostToDevice, cuda_stream[0]);
            cudaStreamSynchronize(cuda_stream[0]);
            cudaCheckErrors("thread cudaMemcpyAsync fail");
            //*process_state = UNPROCESSED;
            i++;
        }
    }
    *h_copier_state = DONE;
    cudaMemcpyAsync((void *)d_copier_state, h_copier_state, sizeof(int), cudaMemcpyHostToDevice, cuda_stream[0]);
    cudaCheckErrors("thread cudaMemcpyAsync 2 fail");
    // cout << "Thread finished" << endl;
    return NULL;
}
__global__ void process_data(volatile int* data_state, volatile int* process_state, volatile int* copier_state){
    int i = 0;
    //printf(" i = %d\n", i);
    while(*copier_state != DONE){
        //printf(" i = %d, copier_state = %d, data_state = %d\n", i, *copier_state, *data_state);
        if(*data_state == UNPROCESSED){
            //printf("now processing data %d\n", i);
            i++;
            // process data here, skipped
            *data_state = PROCESSED;
            __threadfence_system();
            *process_state = PROCESSED;
            __threadfence_system();
        }
    }
    // printf("process_data is done\n");
}
int main(int argc, char **argv){
    int i;
    cudaSetDevice(0);
    cudaSetDeviceFlags(cudaDeviceMapHost);
    cudaMallocHost((void**) &process_state, NUM_OF_BLOCKS*sizeof(int), cudaHostAllocMapped);
    cudaCheckErrors("cudaMallocHost 1 fail");
    cudaHostGetDevicePointer((int**) &d_process_state, (int*) process_state, 0);
    cudaMalloc((void**) &d_copier_state, sizeof(int));
    cudaCheckErrors("cudaMalloc 1 fail");
    cudaMemset((void*)d_copier_state, RUNNING, sizeof(int));
    cudaMallocHost((void**) &h_copier_state, sizeof(int), 0);
    cudaCheckErrors("cudaMallocHost 3 fail");
    *h_copier_state = RUNNING;
    cudaMallocHost((void**) &h_data_states, NUM_OF_DATA*sizeof(int), 0);
    cudaCheckErrors("cudaMallocHost 2 fail");
    for(i = 0; i < NUM_OF_DATA; i++){
        h_data_states[i] = UNPROCESSED;
    }
    cudaMalloc((void**) &d_data_state, sizeof(int));
    cudaCheckErrors("cudaMalloc 2 fail");
    cudaMemcpy((void*)d_data_state, &(h_data_states[0]), sizeof(int), cudaMemcpyHostToDevice);
    cudaStreamCreate(&cuda_stream[0]);
    cudaStreamCreate(&cuda_stream[1]);
    pthread_t thread;
    int thread_state = pthread_create(&thread, NULL, &copy_data, NULL);
    if(thread_state){
        cout << "Error: unable to create thread (produce_instances), "<< thread_state << endl;
        exit(-1);
    }
    //cout << "Starting kernel" << endl;
    process_data<<<NUM_OF_BLOCKS, THREADS_PER_BLOCK, 0, cuda_stream[1]>>>(d_data_state, d_process_state, d_copier_state);
    cudaDeviceSynchronize();
    return 0;
}
As an aside, you don't need all the complexity of pthreads to run one extra thread. After the CUDA kernel launch, all of your pthread code could have been placed in the main host thread, and your program would still work correctly: after a kernel launch, the host thread runs asynchronously to, and in parallel with, the device kernel. A sketch of that restructuring follows.
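A minimal sketch of the end of main() in that style (same names and logic as the code above, with the copy loop inlined after the launch; not a complete program):

    process_data<<<NUM_OF_BLOCKS, THREADS_PER_BLOCK, 0, cuda_stream[1]>>>(d_data_state, d_process_state, d_copier_state);
    // The launch returns immediately; the host thread now feeds the kernel itself:
    for (int i = 0; i < NUM_OF_DATA; i++){
        while (*process_state == UNPROCESSED); // spin until the previous item is processed
        *process_state = UNPROCESSED;
        cudaMemcpyAsync(d_data_state, &(h_data_states[i]), sizeof(int), cudaMemcpyHostToDevice, cuda_stream[0]);
        cudaStreamSynchronize(cuda_stream[0]);
    }
    *h_copier_state = DONE;
    cudaMemcpyAsync((void *)d_copier_state, h_copier_state, sizeof(int), cudaMemcpyHostToDevice, cuda_stream[0]);
    cudaDeviceSynchronize();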

Trying to use the cudaHostAllocWriteCombined flag, but I'm getting invalid argument when I try cudaMemcpy

First off, I'd like to say I really do like the CUDA documentation; it's really great and resourceful, although I'm finding it hard to find out what is supported in which version. I'm using CUDA driver version 5.0 with compute capability 2.0 and was wondering if cudaHostAllocWriteCombined is supported.
In my code:
float *d_data, h_data;
h_data = new float[A];
assert(cudaHostAlloc((void **)&d_data, A * sizeof(float), cudaHostAllocWriteCombined) == cudaSuccess);
cudaError_t err = cudaMemcpy(d_data, h_data, A * sizeof(float), cudaMemcpyHostToDevice);
if (err != cudaSuccess)
{
    std::cout << cudaGetErrorString(err) << std::endl;
    return false;
}
The error returned is invalid argument; however, if I use cudaHostAllocDefault it seems to work fine. I understand how it works (fast writing, slow reading), and that's why I would like to use it.
Your usage of h_data is incorrect. new returns a pointer, which should be assigned to a variable of the correct (pointer) type. Replace h_data with *h_data in your declaration, and your code will be more or less correct, and cudaMemcpy should not return an invalid argument error.
The following complete code shows the correction and compiles and runs without error for me on CUDA 6:
#include <iostream>
#include <assert.h>
#define A 1024
int main(){
    float *d_data, *h_data;
    h_data = new float[A];
    cudaError_t err = cudaHostAlloc((void **)&d_data, A * sizeof(float), cudaHostAllocWriteCombined);
    if (err != cudaSuccess)
    {
        std::cout << "cudaHostAlloc fail " << std::endl;
        std::cout << cudaGetErrorString(err) << std::endl;
        return 1;
    }
    err = cudaMemcpy(d_data, h_data, A * sizeof(float), cudaMemcpyHostToDevice);
    if (err != cudaSuccess)
    {
        std::cout << "cudaMemcpy fail" << std::endl;
        std::cout << cudaGetErrorString(err) << std::endl;
        return 1;
    }
    return 0;
}

CUDA texture memory does not hold the right values

I'm trying to bind a 2D array to a texture and to interpolate between the data points. My problem: when I bind my array to the texture, the values I access are total nonsense. Even when I try to access the first value (tex2D(tex, 0.0f, 0.0f)) it doesn't make sense. So I guess I'm binding it wrong, or my memcpy is wrong. Any ideas where my mistake is?
Here is the code:
#include <stdio.h>
#include <iostream>
#include "cuda.h"
#include <stdlib.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "HelloWorld.h"
#include "linearInterpolation_kernel.cu"
#include "linearInterpolation_kernel2.cu"
#include "linearInterpolation_kernel3.cu"
using namespace std;
using std::cout;
const int blocksize = 16;
void ipLinearTexture(float *A, float* B, float* result, float angle, int N); // forward declaration
__global__
void hello(char *a, int *b) {
    a[threadIdx.x] += b[threadIdx.x];
}
////////////////////////////////////////////////////////////////////////////////
// These are CUDA Helper functions
// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)
inline void __checkCudaErrors( cudaError err, const char *file, const int line )
{
    if( cudaSuccess != err) {
        printf("%s(%i) : CUDA Runtime API error %d: %s.\n",file, line, (int)err, cudaGetErrorString( err ) );
    }
}
// This will output the proper error string when calling cudaGetLastError
#define getLastCudaError(msg) __getLastCudaError (msg, __FILE__, __LINE__)
inline void __getLastCudaError( const char *errorMessage, const char *file, const int line )
{
    cudaError_t err = cudaGetLastError();
    if( cudaSuccess != err) {
        printf("%s(%i) : getLastCudaError() CUDA error : %s : (%d) %s.\n", file, line, errorMessage, (int)err, cudaGetErrorString( err ) );
    }
}
int main()
{
    int N = 200;
    float *A;
    A = (float *) malloc(N*sizeof(float));
    float *B;
    B = (float *) malloc(N*sizeof(float));
    float *result;
    result = (float *) malloc(N*sizeof(float));
    float angle = 0.5f;
    for(int i = 0; i < N; i++){
        A[i] = (float)rand();
        B[i] = (float)rand();
    }
    cout << A[3] << endl;
    cout << B[3] << endl;
    ipLinearTexture(A,B,result,angle,N);
    float result2;
    result2 = (angle)*A[3] + (1-angle)*B[3];
    printf(" A %f B %f Result %f\n", A[3], B[3], result[3]);
    cout << result2 << endl;
    return 1;
}
void ipLinearTexture(float *A, float* B, float* result, float angle, int N)
{
    float cuTime;
    const int N2 = N;
    float *dev_result;
    float **AB;
    AB = (float **) malloc( N * sizeof(float *));
    if(AB)
    {
        for(int i = 0; i < N; i++)
        {
            AB[i] = (float *) calloc( 2 , sizeof(float *));
        }
    }
    for (int i = 0; i < N; i++)
    {
        AB[i][0] = A[i];
        AB[i][1] = B[i];
    }
    cudaMalloc(&dev_result, N * sizeof(float));
    unsigned int size = N * 2 * sizeof(float);
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaArray* cu_array;
    checkCudaErrors(cudaMallocArray( &cu_array, &channelDesc, N, 2 ));
    checkCudaErrors(cudaMemcpyToArray( cu_array, 0, 0, AB, size, cudaMemcpyHostToDevice));
    tex.addressMode[0] = cudaAddressModeClamp;
    tex.addressMode[1] = cudaAddressModeClamp;
    tex.filterMode = cudaFilterModeLinear;
    tex.normalized = false; // access with normalized texture coordinates
    checkCudaErrors(cudaBindTextureToArray( tex, cu_array, channelDesc));
    dim3 dimBlock(10, 1, 1);
    dim3 dimGrid((int)ceil((double)N*2/dimBlock.x), 1, 1);
    transformKernel3<<< dimGrid, dimBlock, 0 >>>( dev_result, N, 2, angle);
    checkCudaErrors(cudaUnbindTexture(tex));
    cudaMemcpy(result, dev_result, N * sizeof(float), cudaMemcpyKind::cudaMemcpyDeviceToHost);
    result[0] = (float)cuTime;
    cout << "==================================================" << endl;
    for (int i = 0 ; i < N ;i++)
    {
        cout << result[i] << endl;
    }
    cout << "==================================================" << endl;
    cudaFree(dev_result);
    cudaFreeArray(cu_array);
}
Here is the kernel code:
#ifndef _SIMPLETEXTURE_KERNEL3_H_
#define _SIMPLETEXTURE_KERNEL3_H_
// declare texture reference for 2D float texture
texture<float, 1> tex;
////////////////////////////////////////////////////////////////////////////////
//! Transform an image using texture lookups
//! #param g_odata output data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void
transformKernel3( float* g_odata, int width, int height, float theta)
{
    unsigned int id = blockIdx.x*blockDim.x + threadIdx.x;
    if (id < width*height)
    {
        g_odata[id] = tex1D(tex, id * 2 + 0.5f);
    }
}
#endif // #ifndef _SIMPLETEXTURE_KERNEL_H_
Like the concept in OpenGL, you can think of a 2D texture as a rectangular field. The center point of each small rectangle is your array data. So, with normalized coordinates, tex2D(tex, 0.5f/width, 0.5f/height) will be exactly the first value of your array data (width and height being the width and height of the 2D array data).
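To make the addressing arithmetic concrete, here is a minimal hypothetical kernel (a sketch, not the asker's kernel; it assumes a 2D texture reference bound with normalized = true):

    // Hypothetical 2D texture reference, bound elsewhere with normalized = true.
    texture<float, 2, cudaReadModeElementType> tex2;
    // Texel (i, j) of a width x height texture has its center at
    // u = (i + 0.5f) / width, v = (j + 0.5f) / height in normalized coordinates.
    __global__ void readTexels(float *out, int width, int height)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        int j = blockIdx.y * blockDim.y + threadIdx.y;
        if (i < width && j < height)
            out[j * width + i] = tex2D(tex2, (i + 0.5f) / width, (j + 0.5f) / height);
    }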