Texture objects for doubles - CUDA

I want to use texture objects (not references) with doubles. The code below works when using floats, but double is not a supported data type.
Can I get around this using 2D textures, and if so, how do I set up such a texture?
There is a similar question for texture references (Support for double type in texture memory in CUDA), but none for texture objects.
__global__ void my_print(cudaTextureObject_t texObject)
{
    printf("%f\n", tex1Dfetch<double>(texObject, 0));
    return;
}

int main()
{
    double i = 0.35;
    int numel = 50;

    double* d_data;
    cudaMalloc(&d_data, numel*sizeof(double));
    cudaMemcpy((void*)d_data, &i, 1*sizeof(double), cudaMemcpyHostToDevice);

    cudaTextureDesc td;
    memset(&td, 0, sizeof(td));
    td.normalizedCoords = 0;
    td.addressMode[0] = cudaAddressModeClamp;
    td.readMode = cudaReadModeElementType;

    struct cudaResourceDesc resDesc;
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType = cudaResourceTypeLinear;
    resDesc.res.linear.devPtr = d_data;
    resDesc.res.linear.sizeInBytes = numel*sizeof(double);
    resDesc.res.linear.desc.f = cudaChannelFormatKindFloat;
    resDesc.res.linear.desc.x = 32;

    cudaTextureObject_t texObject = 0;
    gpuErrchk(cudaCreateTextureObject(&texObject, &resDesc, &td, NULL));

    my_print<<<1,1>>>(texObject);
    gpuErrchk(cudaDeviceSynchronize());
    return 0;
}

The idea is exactly the same as for texture references: you can access double precision data by binding it to a supported 64-bit type and reassembling the fetched value into a double. If you modify your code like this:
#include <vector>
#include <cstdio>
#include <cstring>

// Reassemble a double from the two 32-bit halves fetched from the texture.
static __inline__ __device__ double fetch_double(uint2 p){
    return __hiloint2double(p.y, p.x);
}

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

__global__ void my_print(cudaTextureObject_t texObject)
{
    uint2 rval = tex1Dfetch<uint2>(texObject, 0);
    double dval = fetch_double(rval);
    printf("%f\n", dval);
}

int main()
{
    double i = 0.35;
    int numel = 50;

    std::vector<double> h_data(numel, i);
    double* d_data;
    cudaMalloc(&d_data, numel*sizeof(double));
    cudaMemcpy((void*)d_data, &h_data[0], numel*sizeof(double), cudaMemcpyHostToDevice);

    cudaTextureDesc td;
    memset(&td, 0, sizeof(td));
    td.normalizedCoords = 0;
    td.addressMode[0] = cudaAddressModeClamp;
    td.readMode = cudaReadModeElementType;

    struct cudaResourceDesc resDesc;
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType = cudaResourceTypeLinear;
    resDesc.res.linear.devPtr = d_data;
    resDesc.res.linear.sizeInBytes = numel*sizeof(double);
    // Describe each element as two 32-bit unsigned channels (64 bits total),
    // a texture-supported layout that can carry the raw bits of a double.
    resDesc.res.linear.desc.f = cudaChannelFormatKindUnsigned;
    resDesc.res.linear.desc.x = 32;
    resDesc.res.linear.desc.y = 32;

    cudaTextureObject_t texObject;
    gpuErrchk(cudaCreateTextureObject(&texObject, &resDesc, &td, NULL));

    my_print<<<1,1>>>(texObject);
    gpuErrchk(cudaDeviceSynchronize());
    return 0;
}
i.e. widen the channel description to 64 bits (two 32-bit unsigned channels), read a uint2 from the texture object, and then reassemble it into a double with __hiloint2double, it should work as you want.
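For comparison, a minimal sketch of the same trick using a (deprecated) texture reference, reusing the fetch_double helper above; the reference name tex_ref and the binding call are illustrative, not taken from the question:

texture<uint2, 1, cudaReadModeElementType> tex_ref;  // must live at module scope

__global__ void my_print_ref()
{
    uint2 rval = tex1Dfetch(tex_ref, 0);
    printf("%f\n", fetch_double(rval));
}

// Host side, before the kernel launch:
//   gpuErrchk(cudaBindTexture(NULL, tex_ref, d_data, numel*sizeof(double)));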


cudaMemcpy returns success but does not copy anything

Below are the things I have checked with cuda-gdb:
the contents of src are correct
cudaMalloc, malloc, and file I/O are successful
cudaMemcpy returns cudaSuccess
the problematic cudaMemcpy is called and throws no errors or exceptions
the destination is allocated (cudaMalloc) successfully
Below are the relevant parts of the code: wavenet_server.cc mallocs the source, copies data from a file into it, and calls make_wavenet. wavenet_infer.cu calls the constructor of MyWaveNet and then calls setEmbeddings.
wavenet_server.cc:
#include "wavenet_infer.h"
void readArrayFromBinary(void* array, size_t len, size_t num_bytes_per_elem, const char* file_name) {
    FILE* file = fopen(file_name, "rb");
    fread(array, num_bytes_per_elem, len, file);
    fclose(file);
}

void setEmbeddingCurr(const char* fileName, size_t len) {
    this->embedding_curr = (float*)malloc(sizeof(float) * len);
    readArrayFromBinary((void*)this->embedding_curr, len, sizeof(float), fileName);
}

void setWavenet(void) {
    this->wavenet = make_wavenet(this->num_samples,
                                 this->batch_size,
                                 this->embedding_prev,
                                 this->embedding_curr,
                                 this->num_layers,
                                 this->max_dilation,
                                 this->dilate_weights_prev,
                                 this->dilate_weights_curr,
                                 this->dilate_biases,
                                 this->res_weights,
                                 this->res_biases,
                                 this->skip_weights,
                                 this->skip_biases,
                                 this->conv_out,
                                 this->conv_end,
                                 this->is_using_embed_tanh,
                                 this->implementation);
}
wavenet_infer.cu:
#include "nv_wavenet.cuh"
typedef nvWavenetInfer<float,float, R, S, A> MyWaveNet;
void* make_wavenet(int sample_count,
                   int batch_size,
                   float* embedding_prev,
                   float* embedding_curr,
                   int num_layers,
                   int max_dilation,
                   float** in_layer_weights_prev,
                   float** in_layer_weights_curr,
                   float** in_layer_biases,
                   float** res_layer_weights,
                   float** res_layer_biases,
                   float** skip_layer_weights,
                   float** skip_layer_biases,
                   float* conv_out_weight,
                   float* conv_end_weight,
                   bool use_embed_tanh,
                   int implementation
                   ) {
    MyWaveNet* wavenet = new MyWaveNet(num_layers, max_dilation, batch_size, sample_count,
                                       implementation, use_embed_tanh);
    wavenet->setEmbeddings(embedding_prev, embedding_curr);
    // We didn't use biases on our outputs
    std::vector<float> dummy_bias_first(S, 0);
    std::vector<float> dummy_bias_second(A, 0);
    wavenet->setOutWeights(conv_out_weight,
                           dummy_bias_first.data(),
                           conv_end_weight,
                           dummy_bias_second.data());
    for (int l = 0; l < num_layers; l++) {
        wavenet->setLayerWeights(l, in_layer_weights_prev[l],
                                 in_layer_weights_curr[l],
                                 in_layer_biases[l],
                                 res_layer_weights[l],
                                 res_layer_biases[l],
                                 skip_layer_weights[l],
                                 skip_layer_biases[l]);
    }
    return (void*)wavenet;
}
nv_wavenet.cuh:
nvWavenetInfer (int numLayers, int maxDilation, int batchSize, int numSamples, int impl=0, bool tanhEmbed=true) :
        m_numLayers(numLayers), m_maxBatch(batchSize), m_maxSamples(numSamples),
        m_implementation((nvWavenetInfer::Implementation)impl), m_tanhEmbed(tanhEmbed) {
    m_maxDilation = maxDilation;
    /*
    gpuErrChk(cudaMalloc(&m_yOut, numSamples*batchSize*sizeof(int))); // one-hot vector represented as single value indicating which value is set
    gpuErrChk(cudaMemset(m_yOut, 0, numSamples*batchSize*sizeof(int)));
    */
    gpuErrChk(cudaMalloc(&m_outputSelectors, numSamples*batchSize*sizeof(float)));
    gpuErrChk(cudaMalloc(&m_embedPrev, A*R*sizeof(T_data)));
    gpuErrChk(cudaMalloc(&m_embedCur, A*R*sizeof(T_data)));
    gpuErrChk(cudaMalloc(&m_Wprev, numLayers*2*R*R*sizeof(T_weight)));
    gpuErrChk(cudaMalloc(&m_Wcur, numLayers*2*R*R*sizeof(T_data)));
    gpuErrChk(cudaMalloc(&m_Bh, numLayers*2*R*sizeof(T_data)));
    gpuErrChk(cudaMalloc(&m_Lh, numSamples*numLayers*batchSize*2*R*sizeof(T_data)));
    gpuErrChk(cudaMalloc(&m_Wres, numLayers*R*R*sizeof(T_data)));
    gpuErrChk(cudaMalloc(&m_Bres, numLayers*R*sizeof(T_data)));
    gpuErrChk(cudaMalloc(&m_Wskip, numLayers*S*R*sizeof(T_data)));
    gpuErrChk(cudaMalloc(&m_Bskip, numLayers*S*sizeof(T_data)));
    gpuErrChk(cudaMalloc(&m_XtOut, numLayers*R*batchSize*sizeof(T_data)));
    gpuErrChk(cudaMalloc(&m_skipOut, numLayers*S*batchSize*sizeof(T_data)));
    // For now, just burn memory as though all layers had the maximum dilation value
    gpuErrChk(cudaMalloc(&m_XtIn, (m_maxDilation+1)*(numLayers+1)*R*batchSize*sizeof(T_data)));
    gpuErrChk(cudaMalloc(&m_hOut, numLayers*batchSize*R*sizeof(T_data)));
    gpuErrChk(cudaMalloc(&m_aPrev, numLayers*batchSize*2*R*sizeof(T_data)));
    gpuErrChk(cudaMalloc(&m_skipIn, numLayers*S*batchSize*sizeof(T_data)));
    gpuErrChk(cudaMalloc(&m_skipOutFinalAccumulate, A*batchSize*S/R*sizeof(T_data)));
    gpuErrChk(cudaMalloc(&m_outAccumulate, A*batchSize*A/R*sizeof(T_data)));
    gpuErrChk(cudaMalloc(&m_yInPrev, batchSize*sizeof(int))); // one-hot vector represented as single value indicating which value is set
    gpuErrChk(cudaMalloc(&m_yInCur, batchSize*sizeof(int))); // one-hot vector represented as single value indicating which value is set
    gpuErrChk(cudaMalloc(&m_WskipOut, A*S*sizeof(T_data)));
    gpuErrChk(cudaMalloc(&m_BskipOut, A*sizeof(T_data)));
    gpuErrChk(cudaMalloc(&m_Wout, A*A*sizeof(T_data)));
    gpuErrChk(cudaMalloc(&m_Bout, A*sizeof(T_data)));
    gpuErrChk(cudaMalloc(&m_skipOutFinal, A*batchSize*S/R*sizeof(T_data)));
    gpuErrChk(cudaMalloc(&m_out, A*batchSize*A/R*sizeof(T_data)));
    gpuErrChk(cudaMalloc(&m_p, A*batchSize*sizeof(T_data)));
    gpuErrChk(cudaMalloc(&m_h, numLayers*batchSize*R*sizeof(T_data)));
    gpuErrChk(cudaMalloc(&m_hSample, numLayers*batchSize*sizeof(int)));
    gpuErrChk(cudaMalloc(&m_ySample, batchSize*sizeof(int)));
    if (impl == PERSISTENT) {
        gpuErrChk(cudaMalloc(&m_skipOutFinalAccumulate, A*batchSize*S/R*sizeof(T_data)));
        gpuErrChk(cudaMalloc(&m_outAccumulate, A*batchSize*A/R*sizeof(T_data)));
    }
}

virtual void setEmbeddings (float* embedPrev, float* embedCur) {
    setActivation(m_embedPrev, embedPrev, A*R);
    setActivation(m_embedCur, embedCur, A*R);
}

void setActivation(float* dst, float* src, size_t size) {
    gpuErrChk(cudaMemcpy(dst, src, size*sizeof(float), cudaMemcpyHostToDevice));
}
Turns out that cudaMemcpy was not the issue. When examining device global memory using cuda-gdb, you cannot do x/10fw float_array; it will show incorrect values. To view the data, use the @global storage qualifier instead: p ((@global float*) float_array)[0]@10
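In other words, a sketch of the two commands (assuming float_array holds a device pointer returned by cudaMalloc):

(cuda-gdb) x/10fw float_array
# misleading: examines the address without a device address-space qualifier
(cuda-gdb) p ((@global float*) float_array)[0]@10
# prints the first 10 floats from device global memory correctly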

Error memcpy in device code

I wrote some code that gets the first _var entries of a vector of possibilities (i.e., a matrix of _size*_var elements, with _var=3 and _size=27) and calls this function in my kernel (32 threads, i.e., one object per thread), but I get neither a return value from the function nor the NULL pointer.
The program exits without error, but the printf lines in the kernel are never executed or displayed (even when compiled with sm_20 or higher), as if the program had stopped before reaching them.
dataIntern.h:
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>

#define _MIN -1
#define _MAX 1

#ifdef __CUDACC__
#define CUDA_CALLABLE_MEMBER __host__ __device__
#else
#define CUDA_CALLABLE_MEMBER
#endif

template <class a_type>
class dataIntern{
private:
    a_type *possibilities;
    int _assign;
    int _size;
    int _var;
    int _maxsize;

public:
    CUDA_CALLABLE_MEMBER dataIntern(){
    }

    CUDA_CALLABLE_MEMBER dataIntern(int var){
        _var = var;
        _size = (int)pow(3.0, (double)_var);
        _maxsize = _size * _var;
        _assign = 1;
        possibilities = (a_type*)malloc(_maxsize*sizeof(a_type));
        if(!possibilities){
            exit(1);
        }
        createTable();
    }

    CUDA_CALLABLE_MEMBER void createTable(){
        int i, j, k, limit, pos;
        a_type value;
        if(_assign == 1){
            for(i=0; i<_var; i++){
#ifdef __CUDA_ARCH__
                limit = (int)pow(3.0, _var-i-1);
#else
                limit = (int)pow(3.0, (double)_var-i-1);
#endif
                value = (a_type)_MIN;
                k = 0;
                for(j=0; j<_size; j++){
                    pos = _var*j+i;
                    if(k >= limit){
                        value++;
                        if(value > _MAX){
                            value = (a_type)_MIN;
                        }
                        k = 0;
                    }
                    possibilities[pos] = value;
                    k++;
                }
            }
        }
    }

    CUDA_CALLABLE_MEMBER void print(){
        int i;
        printf("Printing.\n");
        if(_assign == 1){
            for(i=0; i<_size*_var; i++){
                printf("%d ", possibilities[i]);
                if(i%_var == _var-1){
                    printf("\n");
                }
            }
        }
        else{
            printf("Not assigned.\n");
        }
    }

    CUDA_CALLABLE_MEMBER void retify(int posChanged, a_type valueRetified){
        int i, pos, count, initpos, attrib;
        a_type *newnode;
        a_type *newlist = NULL, *morelist = NULL;
        pos = posChanged;
        initpos = 0;
        count = 0;
        if(_assign == 1){
            attrib = 0;
            newnode = (a_type*)malloc(_var*sizeof(a_type));
            for(i=0; i<_size; i++){
                if(possibilities[pos] == valueRetified){
                    memcpy(newnode, &possibilities[i*_var], _var*sizeof(a_type));
                    count++;
                    if(newlist!=NULL){
                        morelist = (a_type*)malloc(count*_var*sizeof(a_type));
                        memcpy(morelist, newlist, (count-1)*_var*sizeof(a_type));
                    }
                    newlist = (a_type*)malloc(count*_var*sizeof(a_type));
                    memcpy(newlist, morelist, (count-1)*_var*sizeof(a_type));
                    memcpy(&newlist[initpos], newnode, _var*sizeof(a_type));
                    initpos+=_var;
                    attrib = 1;
                }
                pos+=_var;
            }
            if(attrib == 1){
                _size = count;
                possibilities = (a_type*)malloc(_size*_var*sizeof(a_type));
                if(possibilities == NULL){
                    printf("Allocation fail in newlist retify.\n");
                    exit(1);
                }
                memcpy(possibilities, newlist, _size*_var*sizeof(a_type));
            }
            else{
                _assign = 0;
            }
        }
    }

    CUDA_CALLABLE_MEMBER a_type* unstack(){
        a_type* solution = NULL, *backup = NULL;
        if(_assign == 1){
            if(_size>0){
                backup = (a_type*)malloc(_var*_size*sizeof(a_type));
                if(backup == NULL){
                    printf("Error allocating backup pointer in unstack function in dataIntern\n");
                    return NULL;
                }
                solution = (a_type*)malloc(_var*sizeof(a_type));
                if(solution == NULL){
                    printf("Error allocating solution pointer in unstack function in dataIntern\n");
                    return NULL;
                }
                memcpy(backup, possibilities, _size*_var*sizeof(a_type));
                memcpy(solution, possibilities, _var*sizeof(a_type));
                free(possibilities);
                _size--;
                possibilities = (a_type*)malloc(_size*_var*sizeof(a_type));
                if(possibilities == NULL){
                    printf("Error reallocating possibilities pointer in dataIntern\n");
                    return NULL;
                }
                memcpy(possibilities, &backup[_var], _size*_var*sizeof(a_type));
                free(backup);
                return solution;
            }
        }
        return NULL;
    }

    CUDA_CALLABLE_MEMBER int get_size(){
        return _size;
    }

    CUDA_CALLABLE_MEMBER ~dataIntern(){
        _assign = 0;
        if(possibilities)
            free(possibilities);
    }
};
deviceCode.h:
#ifndef DEVICECODE_H
#define DEVICECODE_H
void CallingInMain();
__global__ void kernel();
#endif
deviceCode.cu:
#include "deviceCode.h"
#include "dataIntern.h"
#include <iostream>
#include <stdio.h>
//I declared my kernel like this:
__global__ void kernel(){
    __shared__ dataIntern<int> data[32];
    int *vetor;
    vetor = NULL;
    data[threadIdx.x] = dataIntern<int>(3);
    //_var == 3 in the class above
    vetor = (int*)malloc(sizeof(int)*3);
    vetor = data[threadIdx.x].unstack();
    while(vetor!=NULL){
        //never gets past here
        printf("%d %d %d %d\n", threadIdx.x, vetor[0], vetor[1], vetor[2]);
        vetor = data[threadIdx.x].unstack();
    }
    //never reaches here either, in the if or the else
    if(vetor)
        printf("Not null\n");
    else
        printf("Null final\n");
    free(vetor);
}

void CallingInMain(){
    kernel<<<1, 32>>>();
    cudaDeviceSynchronize();
}
main.cu:
#include <iostream>
#include <stdio.h>
#ifndef deviceCode_H
#include "deviceCode.h"
#endif
int main(int argc, char* argv[]){
    CallingInMain();
    return 0;
}
As some colleagues pointed out to me, your code has an error in it.
Consider this line in your kernel:
data[threadIdx.x] = dataIntern<int>(3);
This line instantiates a temporary dataIntern<int> object, runs the constructor with a value of 3 on it, and then copies that temporary into the storage at data[threadIdx.x]. Note that the constructor performs a malloc operation:
CUDA_CALLABLE_MEMBER dataIntern(int var){
    ...
    possibilities = (a_type*)malloc(_maxsize*sizeof(a_type));
But since the source object is a temporary, the C++ standard requires it to be destroyed at the end of the full expression, i.e. at the semicolon here:
data[threadIdx.x] = dataIntern<int>(3);
                                      ^
after the copy is complete. Because the class defines no copy assignment operator, that copy is a compiler-generated shallow copy, so data[threadIdx.x].possibilities points at the very same allocation as the temporary's pointer. The destruction of the temporary then triggers the destructor, which performs a free operation on possibilities:
CUDA_CALLABLE_MEMBER ~dataIntern(){
    _assign = 0;
    if(possibilities)
        free(possibilities);
}
Therefore, any use of the pointer allocated there after this line of code:
data[threadIdx.x] = dataIntern<int>(3);
such as in the unstack call here:
vetor = data[threadIdx.x].unstack();
dereferences freed memory and is invalid.
This is a violation of C++ programming rules (a class that owns a raw pointer needs proper copy semantics, cf. the rule of three), and the error is not specific to CUDA.
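One possible fix, as a sketch rather than a definitive implementation: give dataIntern deep-copy semantics so the compiler-generated shallow copy is never used (a matching copy constructor would be needed as well):

CUDA_CALLABLE_MEMBER dataIntern& operator=(const dataIntern& other){
    _assign = other._assign;
    _size = other._size;
    _var = other._var;
    _maxsize = other._maxsize;
    // Deep-copy the table instead of sharing other.possibilities, so the
    // temporary's destructor cannot free memory this object still uses.
    possibilities = (a_type*)malloc(_maxsize*sizeof(a_type));
    memcpy(possibilities, other.possibilities, _maxsize*sizeof(a_type));
    return *this;
}

Alternatively, avoid the temporary altogether, e.g. by moving the constructor body into a hypothetical init(int var) member and calling data[threadIdx.x].init(3); in the kernel.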

cublasSgemm for row-major matrices

I have been trying to implement a function in C that multiplies two row-major matrices using cuBLAS, and I don't know where I am going wrong.
In the function below, A, B and C are pointers to correctly allocated row-major matrices.
I'd like to keep the option of transposing a matrix before performing the product.
The function below is not working.
void matrixMul(cublasHandle_t handle, float *A, float *B, float *C, int m, int n, int k, int transA, int transB){
    cublasStatus_t stat; // CUBLAS functions status
    float alfa = 1;
    float beta = 0;
    int ma = transA ? n : m,
        na = transA ? m : n,
        nb = transB ? k : n,
        mb = transB ? n : k;
    if(na != mb){
        puts("Something wrong");
    }
    //(mb,nb)(ma,na) = (mb,na)
    stat = cublasSgemm_v2(handle, (cublasOperation_t)transB, (cublasOperation_t)transA,
                          nb, ma, mb, &alfa,
                          B, k,
                          A, n, &beta,
                          C, m);
    switch (stat) {
    case CUBLAS_STATUS_SUCCESS:
        puts("Success");
        break;
    default:
        printf(">>>>ERROR %d<<<<\n", stat);
        break;
    }
}
The entire source code:
// Utilities and system includes
#include <assert.h>
#include <helper_string.h> // helper for shared functions common to CUDA Samples
// CUDA runtime
#include <cuda_runtime.h>
#include <cublas_v2.h>
// CUDA and CUBLAS functions
#include <helper_functions.h>

void getFromDevice(float *h_A, float *d_A, int size){
    //printf("Copy output data from the CUDA device to the host memory\n");
    cudaError_t err = cudaMemcpy(h_A, d_A, size, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy vector A from device to host (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

//A = (m,n)
//B = (n,k)
//C = (m,k)
void matrixMul(cublasHandle_t handle, float *A, float *B, float *C, int m, int n, int k, int transA, int transB){
    cublasStatus_t stat; // CUBLAS functions status
    float alfa = 1;
    float beta = 0;
    int ma = transA ? n : m,
        na = transA ? m : n,
        nb = transB ? k : n,
        mb = transB ? n : k;
    if(na != mb){
        puts("Something wrong");
    }
    //(mb,nb)(ma,na) = (mb,na)
    stat = cublasSgemm_v2(handle, (cublasOperation_t)transB, (cublasOperation_t)transA,
                          nb, ma, mb, &alfa,
                          B, k,
                          A, n, &beta,
                          C, m);
    switch (stat) {
    case CUBLAS_STATUS_SUCCESS:
        puts("Success");
        break;
    default:
        printf(">>>>ERROR %d<<<<\n", stat);
        break;
    }
}

float *mallocfDevice(int size){
    float *d_C = NULL;
    cudaError_t err = cudaMalloc((void **)&d_C, size * sizeof(float));
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }else{
        size_t freeM, total;
        cudaMemGetInfo(&freeM, &total);
        printf("MEM: %zu free of %zu total, %.3f%% used\n", freeM, total, 100.0 - ((double)freeM/total)*100);
    }
    return d_C;
}

void printHostMatrix(int nl, int nc, float *h_s){
    for(int j = 0; j < nl; j++) {
        for(int i = 0; i < nc; i++){
            int idx = j*nc + i;
            printf("%.2f ", h_s[idx]);
        }
        printf("\n");
    }
}

void printfDeviceMatrix(float *d_s, int m, int p){
    float *h_s = (float*) malloc(sizeof(float)*m*p);
    getFromDevice(h_s, d_s, sizeof(float)*m*p);
    printHostMatrix(m, p, h_s);
    free(h_s);
}

void sendTofDevice(float *h_A, float *d_A, int size){
    //printf("Copy input data from the host memory to the CUDA device\n");
    cudaError_t err = cudaMemcpy(d_A, h_A, size*sizeof(float), cudaMemcpyHostToDevice);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

int main(int argc, char **argv){
    int ma = 2,
        na = 3,
        mb = 3,
        nb = 2;
    float A[] = { 1, 2, 3,
                  4, 5, 6};
    float B[] = { 7,  8,
                  9, 10,
                 11, 12};
    float *C = new float[ma*nb];
    float *d_a = mallocfDevice(ma*na),
          *d_b = mallocfDevice(mb*nb),
          *d_c = mallocfDevice(ma*nb);
    sendTofDevice(A, d_a, ma*na);
    sendTofDevice(B, d_b, mb*nb);
    cublasHandle_t handle; // CUBLAS context
    cublasCreate(&handle);
    puts("A");
    printfDeviceMatrix(d_a, ma, na);
    puts("B");
    printfDeviceMatrix(d_b, mb, nb);
    matrixMul(handle, d_a, d_b, d_c,
              ma, na, nb, 0, 0);
    puts("AB=C");
    printfDeviceMatrix(d_c, ma, nb);
}
cuBLAS assumes that matrices on the device are stored in column-major order. From the cuBLAS documentation (http://docs.nvidia.com/cuda/cublas/index.html):
"where α and β are scalars, and A, B and C are matrices stored in column-major format with dimensions op(A) m × k, op(B) k × n and C m × n, respectively."
That means the matrices need to be treated differently on the device than on the host.
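A common workaround, sketched here for the no-transpose case (the function name matrixMulRowMajor is illustrative, not from the question): a row-major product C = A·B occupies exactly the same memory as the column-major product Cᵀ = Bᵀ·Aᵀ, so you can swap the operand order and pass the row-major row lengths as the leading dimensions, with no explicit transpose:

// C (m x k, row-major) = A (m x n, row-major) * B (n x k, row-major),
// computed as the column-major product C^T = B^T * A^T by swapping operands.
void matrixMulRowMajor(cublasHandle_t handle,
                       const float *A, const float *B, float *C,
                       int m, int n, int k)
{
    const float alpha = 1.0f, beta = 0.0f;
    // In cuBLAS terms the problem is (k x m) = (k x n)(n x m); the leading
    // dimensions are the row-major row lengths: ldb = k, lda = n, ldc = k.
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                k, m, n,
                &alpha,
                B, k,
                A, n,
                &beta,
                C, k);
}

Applied to the question's code, this amounts to passing (k, m, n) as the problem dimensions and using k (not m) as the leading dimension of C.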

Batched FFTs using cufftPlanMany

I want to perform 441 2D, 32-by-32 FFTs using the batched method provided by the cuFFT library. The parameters of the transform are the following:
int n[2] = {32,32};
int inembed[] = {32,32};
int onembed[] = {32,32/2+1};
cufftPlanMany(&plan,2,n,inembed,1,32*32,onembed,1,32*(32/2+1),CUFFT_D2Z,441);
cufftPlanMany(&inverse_plan,2,n,onembed,1,32*32,inembed,1,32*32,CUFFT_Z2D,441);
After performing the forward and inverse FFTs using the above plans, I could not get the original data back.
Can anyone advise me on how to set the parameters correctly for cufftPlanMany? Many thanks in advance.
By the way, is cufftPlanMany the best approach for my situation?
Here is a full example of how to use cufftPlanMany to perform batched direct and inverse transformations in CUDA. The example refers to float-to-cufftComplex transformations and back. The final result of the direct+inverse transformation is correct except for a multiplicative constant equal to the overall number of matrix elements, nRows*nCols, since cuFFT does not normalize the inverse transform.
#include <stdio.h>
#include <stdlib.h>
#include <cufft.h>
#include <assert.h>

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) { getchar(); exit(code); }
    }
}

/*********************/
/* CUFFT ERROR CHECK */
/*********************/
static const char *_cudaGetErrorEnum(cufftResult error)
{
    switch (error)
    {
        case CUFFT_SUCCESS:
            return "CUFFT_SUCCESS";
        case CUFFT_INVALID_PLAN:
            return "CUFFT_INVALID_PLAN";
        case CUFFT_ALLOC_FAILED:
            return "CUFFT_ALLOC_FAILED";
        case CUFFT_INVALID_TYPE:
            return "CUFFT_INVALID_TYPE";
        case CUFFT_INVALID_VALUE:
            return "CUFFT_INVALID_VALUE";
        case CUFFT_INTERNAL_ERROR:
            return "CUFFT_INTERNAL_ERROR";
        case CUFFT_EXEC_FAILED:
            return "CUFFT_EXEC_FAILED";
        case CUFFT_SETUP_FAILED:
            return "CUFFT_SETUP_FAILED";
        case CUFFT_INVALID_SIZE:
            return "CUFFT_INVALID_SIZE";
        case CUFFT_UNALIGNED_DATA:
            return "CUFFT_UNALIGNED_DATA";
    }
    return "<unknown>";
}

#define cufftSafeCall(err) __cufftSafeCall(err, __FILE__, __LINE__)
inline void __cufftSafeCall(cufftResult err, const char *file, const int line)
{
    if (CUFFT_SUCCESS != err) {
        fprintf(stderr, "CUFFT error in file '%s', line %d\nerror %d: %s\nterminating!\n",
                file, line, err, _cudaGetErrorEnum(err));
        cudaDeviceReset();
        assert(0);
    }
}

/********/
/* MAIN */
/********/
int main() {
    cufftHandle forward_plan, inverse_plan;

    int batch = 3;
    int rank = 2;

    int nRows = 5;
    int nCols = 5;
    int n[2] = {nRows, nCols};

    int idist = nRows*nCols;
    int odist = nRows*(nCols/2+1);

    int inembed[] = {nRows, nCols};
    int onembed[] = {nRows, nCols/2+1};

    int istride = 1;
    int ostride = 1;

    cufftSafeCall(cufftPlanMany(&forward_plan, rank, n, inembed, istride, idist, onembed, ostride, odist, CUFFT_R2C, batch));

    float *h_in = (float*)malloc(sizeof(float)*nRows*nCols*batch);
    for(int i=0; i<nRows*nCols*batch; i++) h_in[i] = 1.f;

    float2* h_freq = (float2*)malloc(sizeof(float2)*nRows*(nCols/2+1)*batch);

    float* d_in;    gpuErrchk(cudaMalloc(&d_in, sizeof(float)*nRows*nCols*batch));
    float2* d_freq; gpuErrchk(cudaMalloc(&d_freq, sizeof(float2)*nRows*(nCols/2+1)*batch));

    gpuErrchk(cudaMemcpy(d_in, h_in, sizeof(float)*nRows*nCols*batch, cudaMemcpyHostToDevice));

    cufftSafeCall(cufftExecR2C(forward_plan, d_in, d_freq));

    gpuErrchk(cudaMemcpy(h_freq, d_freq, sizeof(float2)*nRows*(nCols/2+1)*batch, cudaMemcpyDeviceToHost));

    for(int i=0; i<nRows*(nCols/2+1)*batch; i++) printf("Direct transform: %i %f %f\n", i, h_freq[i].x, h_freq[i].y);

    cufftSafeCall(cufftPlanMany(&inverse_plan, rank, n, onembed, ostride, odist, inembed, istride, idist, CUFFT_C2R, batch));

    cufftSafeCall(cufftExecC2R(inverse_plan, d_freq, d_in));

    gpuErrchk(cudaMemcpy(h_in, d_in, sizeof(float)*nRows*nCols*batch, cudaMemcpyDeviceToHost));

    for(int i=0; i<nRows*nCols*batch; i++) printf("Inverse transform: %i %f \n", i, h_in[i]);

    getchar();
    return 0;
}
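Since cuFFT leaves the inverse transform unnormalized, recovering the original data requires dividing by nRows*nCols. A minimal sketch of such a scaling step, reusing the d_in, batch, nRows and nCols names from the example above (the kernel name and launch geometry are illustrative):

__global__ void scale(float *data, int n, float factor)
{
    // One thread per element; multiply each value by the normalization factor.
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) data[i] *= factor;
}

// After cufftExecC2R:
//   int n = nRows * nCols * batch;
//   scale<<<(n + 255) / 256, 256>>>(d_in, n, 1.0f / (nRows * nCols));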

CUDA texture memory does not inherit the right values

I'm trying to bind a 2D array to a texture and to interpolate between the data points. My problem is that when I bind my array to the texture, the values I access are total nonsense. Even when I try to access the first value (tex2D(tex, 0.0f, 0.0f)), it doesn't make sense. So I guess I'm binding it wrong, or my memcpy is wrong. Any ideas where my mistake is?
Here is the code:
#include <stdio.h>
#include <iostream>
#include "cuda.h"
#include <stdlib.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "HelloWorld.h"
#include "linearInterpolation_kernel.cu"
#include "linearInterpolation_kernel2.cu"
#include "linearInterpolation_kernel3.cu"

using namespace std;
using std::cout;

const int blocksize = 16;

__global__
void hello(char *a, int *b) {
    a[threadIdx.x] += b[threadIdx.x];
}

////////////////////////////////////////////////////////////////////////////////
// These are CUDA Helper functions

// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)
inline void __checkCudaErrors( cudaError err, const char *file, const int line )
{
    if( cudaSuccess != err) {
        printf("%s(%i) : CUDA Runtime API error %d: %s.\n", file, line, (int)err, cudaGetErrorString( err ) );
    }
}

// This will output the proper error string when calling cudaGetLastError
#define getLastCudaError(msg) __getLastCudaError (msg, __FILE__, __LINE__)
inline void __getLastCudaError( const char *errorMessage, const char *file, const int line )
{
    cudaError_t err = cudaGetLastError();
    if( cudaSuccess != err) {
        printf("%s(%i) : getLastCudaError() CUDA error : %s : (%d) %s.\n", file, line, errorMessage, (int)err, cudaGetErrorString( err ) );
    }
}

int main()
{
    int N = 200;
    float *A;
    A = (float *) malloc(N*sizeof(float));
    float *B;
    B = (float *) malloc(N*sizeof(float));
    float *result;
    result = (float *) malloc(N*sizeof(float));
    float angle = 0.5f;
    for(int i = 0; i < N; i++){
        A[i] = (float)rand();
        B[i] = (float)rand();
    }
    cout << A[3] << endl;
    cout << B[3] << endl;

    ipLinearTexture(A, B, result, angle, N);

    float result2;
    result2 = (angle)*A[3] + (1-angle)*B[3];
    printf(" A %f B %f Result %f\n", A[3], B[3], result[3]);
    cout << result2 << endl;
    return 1;
}

void ipLinearTexture(float *A, float* B, float* result, float angle, int N)
{
    float cuTime;
    const int N2 = N;

    float *dev_result;
    float **AB;

    AB = (float **) malloc(N * sizeof(float *));
    if(AB)
    {
        for(int i = 0; i < N; i++)
        {
            AB[i] = (float *) calloc( 2 , sizeof(float *));
        }
    }

    for (int i = 0; i < N; i++)
    {
        AB[i][0] = A[i];
        AB[i][1] = B[i];
    }

    cudaMalloc(&dev_result, N * sizeof(float));

    unsigned int size = N * 2 * sizeof(float);
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaArray* cu_array;
    checkCudaErrors(cudaMallocArray( &cu_array, &channelDesc, N, 2 ));
    checkCudaErrors(cudaMemcpyToArray( cu_array, 0, 0, AB, size, cudaMemcpyHostToDevice));

    tex.addressMode[0] = cudaAddressModeClamp;
    tex.addressMode[1] = cudaAddressModeClamp;
    tex.filterMode = cudaFilterModeLinear;
    tex.normalized = false;    // do not use normalized texture coordinates
    checkCudaErrors(cudaBindTextureToArray( tex, cu_array, channelDesc));

    dim3 dimBlock(10, 1, 1);
    dim3 dimGrid((int)ceil((double)N*2/dimBlock.x), 1, 1);
    transformKernel3<<< dimGrid, dimBlock, 0 >>>( dev_result, N, 2, angle);

    checkCudaErrors(cudaUnbindTexture(tex));
    cudaMemcpy(result, dev_result, N * sizeof(float), cudaMemcpyKind::cudaMemcpyDeviceToHost);
    result[0] = (float)cuTime;

    cout << "==================================================" << endl;
    for (int i = 0; i < N; i++)
    {
        cout << result[i] << endl;
    }
    cout << "==================================================" << endl;
    cudaFree(dev_result);
    cudaFreeArray(cu_array);
}
Here is the kernel code:
#ifndef _SIMPLETEXTURE_KERNEL3_H_
#define _SIMPLETEXTURE_KERNEL3_H_

// declare texture reference (1D float texture)
texture<float, 1> tex;

////////////////////////////////////////////////////////////////////////////////
//! Transform an image using texture lookups
//! @param g_odata  output data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void
transformKernel3( float* g_odata, int width, int height, float theta)
{
    unsigned int id = blockIdx.x*blockDim.x + threadIdx.x;
    if (id < width*height)
    {
        g_odata[id] = tex1D(tex, id * 2 + 0.5f);
    }
}

#endif // #ifndef _SIMPLETEXTURE_KERNEL3_H_
As with textures in OpenGL, you can think of a 2D texture as a rectangular field, with your array data located at the center of each small rectangle (texel). So tex2D(tex, 0.5f/width, 0.5f/height) with normalized coordinates, or tex2D(tex, 0.5f, 0.5f) with unnormalized coordinates, will fetch exactly the first value of your array data (width and height being the dimensions of the 2D array).
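A minimal sketch of that addressing rule for a 2D float texture reference (the name tex2dRef and the kernel are illustrative, not from the question; the 0.5f offsets target texel centers, which matters once cudaFilterModeLinear is enabled):

texture<float, 2, cudaReadModeElementType> tex2dRef;  // module scope

__global__ void read_texel(float *out, int width, int height)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < width && y < height)
        // unnormalized coordinates: element (x, y) sits at texel center (x + 0.5f, y + 0.5f)
        out[y * width + x] = tex2D(tex2dRef, x + 0.5f, y + 0.5f);
}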