CUDA 1D texture fetch always return 0 - cuda

I am trying to test the CUDA 1D texture with a piece of simple code. It is quite straightforward: first allocates a global memory, then bind it to a texture reference; access the texture from within a kernel by tex1D(); print out the value returned by the texture fetch.
The code is as follows:
#include "cuda.h"
#include "cuda_runtime.h"
#include <iostream>
#include <vector>
#include <cstdio>
using namespace std;
texture<float, cudaTextureType1D, cudaReadModeElementType> texX;
__global__ void kernel(float *X)
{
int i = threadIdx.x ;
if ( i >= 128 ) return;
printf("%.3f\t%.3f\n", tex1D( texX, i*1.0 ), X[i] );
}
int main()
{
float *devX;
vector<float> X(128, 3.1415926 );
cudaMalloc( &devX, 128 * sizeof(float) );
cudaMemcpy( devX, &X[0], 128*sizeof(float), cudaMemcpyDefault );
cudaDeviceSynchronize();
cudaBindTexture( (size_t)0, texX, devX, 128 * sizeof(float) );
cudaDeviceSynchronize();
kernel<<<1,128>>>( devX );
cudaDeviceSynchronize();
cout<<endl;
cout<< cudaGetErrorString( cudaGetLastError() ) <<endl;
}
But all I got was like this:
0.000 3.142
0.000 3.142
...
0.000 3.142
no error
Could anyone explain why?

You need to use tex1Dfetch() since your texture is bound to linear memory:
printf("%.3f\t%.3f\n", tex1Dfetch( texX, i ), X[i] );

From the " CUDA C Programming Guide"
tex1D is used to fetch from the CUDA array specified by the one-dimensional texture object ...
tex1Dfetch is used to fetch from the region of linear memory specified by the one-dimension texture ...
So, if you use cudaMalloc to malloc linear memory but not cuda array, u should select tex1Dfetch

Related

CUDA Seg Fault for Int Device to Host Copy

Why is the following simple program (24 lines) lead to segmentation fault at shrinked_size_host int variable:
#include <stdio.h>
#include <cuda_runtime.h>
#include <curand_kernel.h>
__global__ void cuda_set(int* device_var){
*device_var = 12;
printf("Set device variable to: %d\n", *device_var);
}
int main() {
printf("Hello world CPU\n");
int* shrinked_size_device;
cudaMalloc((void**)&shrinked_size_device, sizeof(int));
cudaDeviceSynchronize();
cudaMemset(shrinked_size_device, 0, sizeof(int));
cudaDeviceSynchronize();
cuda_set<<<1,1>>>(shrinked_size_device);
cudaDeviceSynchronize();
int* shrinked_size_host = 0;
cudaMemcpy(shrinked_size_host, shrinked_size_device, sizeof(int), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
printf("shrinked_size_host=%d\n", *shrinked_size_host);
return 0;
}
That's the output produced from the program:
Hello world CPU
Set device variable to: 12
Segmentation fault (core dumped)
Not sure why there is a segmentation fault.
I figured out the answer to this question.
Memory for the shrinked_size_host should be allocated. So, either do:
Heap allocation: malloc or new int to allocate an integer of size 1. Remember to delete the allocated memory at the end.
Stack allocation: Use int shrinked_size_host[1];

cudaEventElapsedTime not expected behaviour

I'm trying to compute the total time taken in GPU to compute something. I'm using the cudaEventRecord and cudaEventElapsedTime to determine this, but I'm having a unexpected behavior, or at least, unexpected for me :) I wrote this example to understand what's happening and I'm still confused.
In the example below I was expecting to report the same time for the three iterations but the result is:
2.80342
1003
2005.6
Which means that the total time in considering the CPU sleep time.
Am I doing something wrong? If not, is it possible do what I want?
#include <iostream>
#include <thread>
#include <chrono>
#include <cuda.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
__global__ void kernel_test(int *a, int N) {
for(int i=threadIdx.x;i<N;i+=N) {
if(i<N)
a[i] = 1;
}
}
int main(int argc, char ** argv) {
cudaEvent_t start[3], stop[3];
for(int i=0;i<3;i++) {
cudaEventCreate(&start[i]);
cudaEventCreate(&stop[i]);
}
cudaStream_t stream;
cudaStreamCreate(&stream);
const int N = 1024 * 1024;
int *h_a = (int*)malloc(N * sizeof(int));
int *a = 0;
cudaMalloc((void**)&a, N * sizeof(int));
for(int i=0;i<3;i++) {
cudaEventRecord(start[i], stream);
cudaMemcpyAsync(a, h_a, N * sizeof(int), cudaMemcpyHostToDevice, stream);
kernel_test<<<1, 1024, 0, stream>>>(a, N);
cudaMemcpyAsync(h_a, a, N*sizeof(int), cudaMemcpyDeviceToHost, stream);
cudaEventRecord(stop[i], stream);
std::this_thread::sleep_for (std::chrono::seconds(i));
cudaEventSynchronize(stop[i]);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start[i], stop[i]);
std::cout<<milliseconds<<std::endl;
}
return 0;
}
I attach the nsight result to verify the behaviour of my example.
Windows 8.1
Geforce GTX 780 Ti
Nvidia drivers: 358.50
EDIT:
Added code to be complete
Attached nsight result
Added SO and drivers info
, ,
If you're running the program on Windows using the WDDM (in contrast to TCC with Tesla cards or Linux) this may be the issue:
With the WDDM kernels are not executed immediately after invocation but instead enqueued to a command buffer. Once the buffer is full it gets flushed and the enqueued commands are actually executed. Another option to force the command buffer to be explicitly flushed is to synchronize.
Now what happens is that you wait before the command buffer is acutally flushed...
Edit
Also see https://devtalk.nvidia.com/default/topic/548639/is-wddm-causing-this-/ for the problem and how cudaEventQuery(0) may help

Accurate method to calculate double FMA and Shared memory latency

I am trying to come up with an accurate way to measure the latency of two operations:
1) Latency of a double precision FMA operation.
2) Latency of a double precision load from shared memory.
I am using a K20x and was wondering if this code would give accurate measurements.
#include <cuda.h>
#include <stdlib.h>
#include <stdio.h>
#include <iostream>
using namespace std;
//Clock rate
#define MHZ 732e6
//number of streaming multiprocessors
#define SMS 14
// number of double precision units
#define DP_UNITS 16*4
//number of shared banks
#define SHARED_BANKS 32
#define ITER 100000
#define NEARONE 1.0000000000000004
__global__ void fma_latency_kernal(double *in, double *out){
int tid = blockIdx.x*blockDim.x+threadIdx.x;
double val = in[tid];
#pragma unroll 100
for(int i=0; i<ITER; i++){
val+=val*NEARONE;
}
out[tid]=val;
}
__global__ void shared_latency_kernel(double *in, double *out){
volatile extern __shared__ double smem[];
int tid = blockIdx.x*blockDim.x+threadIdx.x;
smem[threadIdx.x]=in[tid];
#pragma unroll 32
for(int i=0; i<ITER; i++){
smem[threadIdx.x]=smem[(threadIdx.x+i)%32]*NEARONE;
}
out[tid]=smem[threadIdx.x];
}
int main (int argc , char **argv){
float time;
cudaEvent_t start, stop, start2, stop2;
double *d_A, *d_B;
cudaMalloc(&d_A, DP_UNITS*SMS*sizeof(float));
cudaMalloc(&d_B, DP_UNITS*SMS*sizeof(float));
cudaError_t err;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
fma_latency_kernal<<<SMS, DP_UNITS>>>(d_A, d_B);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
time/=1000;
err = cudaGetLastError();
if(err!=cudaSuccess)
printf("Error FMA: %s\n", cudaGetErrorString(err));
printf("Latency of FMA = %3.1f clock cycles\n", (time/(double)ITER)*(double)MHZ);
cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte);
cudaEventCreate(&start2);
cudaEventCreate(&stop2);
cudaEventRecord(start2, 0);
shared_latency_kernel<<<1, SHARED_BANKS, sizeof(double)>>>(d_A, d_B );
cudaEventRecord(stop2, 0);
cudaEventSynchronize(stop2);
cudaEventElapsedTime(&time, start2, stop2);
time/=1000;
err = cudaGetLastError();
if(err!=cudaSuccess)
printf("Error Shared Memory: %s\n", cudaGetErrorString(err));
printf("Latency of Shared Memory = %3.1f clock cycles\n", time/(double)ITER*(double)MHZ);
}
My results on the K20x are the following:
Latency of FMA = 16.4 clock cycles
Latency of Shared Memory = 60.7 clock cycles
This seems reasonable to me, but I am not sure how accurate it is.
Your latency values look very high to me - nearly double what I'd expect. To measure how many cycles something takes on the GPU, you can insert clock() functions before and after the relevant part of the kernel function. The clock function returns the current cycle as an int, so by subtracting the first value from the second you get the the number of cycles that passed between dispatching the first clock instruction and dispatching the second clock instruction.
Note that the numbers you get from this method will include extra time from the clock instructions themselves; I believe that by default a thread will block for several cycles immediately before and after every clock instruction, so you may want to experiment with that to see how many cycles it's adding so you can subtract them back out.

Matrix not copied back from device to host successfully in CUDA

I am new to cuda. I wrote a kernel to create an identity matrix(GPUsetIdentity) of dimension sizeXsize. Further inside a function GPUfunctioncall, I called my kernel. The identity matrix should be stored in dDataInv. But when I copy it back to dataOut sizexsize , all the values are zero. I know, I am doing something very stupid somewhere, but couldnt get it, I am new to cuda, if anyone can point my mistake. Thanks.
#include <stdio.h>
#include <malloc.h>
#include <memory.h>
#include <math.h>
#include <stdlib.h>
#include <iostream>
#include <stdlib.h>
#include <string>
#include <fstream>
#include <iterator>
#include <sstream>
#include <vector>
#include <cstring>
#include <cstdlib>
#include <ctime>
#include <stdlib.h>
#include <cuda_runtime.h>
#include "cuda.h"
#define BLOCKSIZE 16
using namespace std;
__global__ void GPUsetIdentity (float* matrix, int width)
{
int tx = threadIdx.x;
int bx = blockIdx.x;
int offset = bx * BLOCKSIZE + tx;
matrix[offset + width * offset] = 1;
}
void print_matrix_host(float* A , int nr_rows_A, int nr_cols_A) {
for(int i = 0; i < nr_rows_A; ++i){
for(int j = 0; j < nr_cols_A; ++j){
std::cout << A[i * nr_rows_A + j ] << " ";
}
std::cout << std::endl;
}
std::cout << std::endl;
}
int GPUfunctioncall (float* hDataOut, int size){
float *dDataInv;
cudaMalloc ((void **) &dDataInv, size);
cudaMemset ((void *) dDataInv, 0, size);
dim3 idyThreads (BLOCKSIZE);
dim3 idyBlocks (size / BLOCKSIZE);
GPUsetIdentity <<< idyBlocks, idyThreads >>> (dDataInv, size);
cudaThreadSynchronize ();
cudaMemcpy ((void *) hDataOut, (void *) dDataInv, size, cudaMemcpyDeviceToHost);
cudaFree (dDataInv);
return 0;
}
int main()
{
int size = 4;
float* dataOut;
dataOut = new float[size*size];
GPUfunctioncall(dataOut, size);
print_matrix_host(dataOut, size, size);
}
Any time you are having trouble with a CUDA code, it's good practice to use proper cuda error checking. You can also run your code with cuda-memcheck to get a quick read on whether there are any errors.
Using either of these methods, you would have discovered an "invalid configuration error" on your kernel launch. This usually means that the parameters in the <<< >>> syntax are incorrect. When you run into this type of error, simply printing out those values may indicate the problem.
In your case, this line of code:
dim3 idyBlocks (size / BLOCKSIZE);
results in a value of 0 for idyBlocks when size is 4 and BLOCKSIZE is 16. So you are requesting a kernel launch of 0 blocks which is illegal. Therefore your kernel is not running and your results are not what you expect.
There are a variety of ways to solve this, many of them involving detecting this condition and adding an "extra block" when size is not evenly divisible by BLOCKSIZE. Using this approach, we may be launching "extra threads", so we must include a "thread check" in the kernel to prevent those extra threads from doing anything (such as accessing arrays out of bounds). For this, we often need to know the intended size in the kernel, and we can pass this value as an extra kernel parameter.
You've also made some errors in your handling of device variables. The following code:
dataOut = new float[size*size];
allocates enough space for a square matrix of dimension size. But the following code:
cudaMalloc ((void **) &dDataInv, size);
only allocates enough space for size bytes. You want size*size*sizeof(float) instead of size here, and likewise you want it in the following cudaMemset and cudaMemcpy operations. cudaMalloc, cudaMemset and cudaMemcpy require a size parameter in bytes, just like malloc, memset, and memcpy. This error is found in your usage of cudaMemset and cudaMemcpy as well.
The following code has those modifications, and seems to work correctly for me:
$ cat t580.cu
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#define BLOCKSIZE 16
using namespace std;
__global__ void GPUsetIdentity (float* matrix, int width, int size)
{
int tx = threadIdx.x;
int bx = blockIdx.x;
int offset = bx * BLOCKSIZE + tx;
if (tx < size)
matrix[offset + width * offset] = 1;
}
void print_matrix_host(float* A , int nr_rows_A, int nr_cols_A) {
for(int i = 0; i < nr_rows_A; ++i){
for(int j = 0; j < nr_cols_A; ++j){
std::cout << A[i * nr_rows_A + j ] << " ";
}
std::cout << std::endl;
}
std::cout << std::endl;
}
int GPUfunctioncall (float* hDataOut, int size){
float *dDataInv;
cudaMalloc ((void **) &dDataInv, size*size*sizeof(float));
cudaMemset ((void *) dDataInv, 0, size*size*sizeof(float));
dim3 idyThreads (BLOCKSIZE);
int num_blocks = size/BLOCKSIZE + (size%BLOCKSIZE)?1:0;
dim3 idyBlocks (num_blocks);
GPUsetIdentity <<< idyBlocks, idyThreads >>> (dDataInv, size, size);
cudaThreadSynchronize ();
cudaMemcpy ((void *) hDataOut, (void *) dDataInv, size*size*sizeof(float), cudaMemcpyDeviceToHost);
cudaFree (dDataInv);
return 0;
}
int main()
{
int size = 4;
float* dataOut;
dataOut = new float[size*size];
GPUfunctioncall(dataOut, size);
print_matrix_host(dataOut, size, size);
}
$ nvcc -arch=sm_20 -o t580 t580.cu
$ cuda-memcheck ./t580
========= CUDA-MEMCHECK
1 0 0 0
0 1 0 0
0 0 1 0
0 0 0 1
========= ERROR SUMMARY: 0 errors
$
Note that it may be redundant to pass size twice to the kernel. For this particular example, we could have easily used the width parameter to do our kernel "thread check". But for educational purposes, I chose to call it out as a separate parameter, because in the general case you will often pass it as a separate parameter to other kernels that you write.
Finally, note that cudaThreadSynchronize() is deprecated and should be replaced with cudaDeviceSynchronize() instead. In this particular example, niether are actually necessary, as the next cudaMemcpy operation will force the same kind of synchronization, but you may use it if you decide to add cuda error checking to your code (recommended).

Modulus computation of an array of cufftComplex data type in CUDA

I made a Dll file in visual C++ to compute modulus of an array of complex numbers in CUDA. The array is type of cufftComplex. I then called the Dll in LabVIEW to check the accuracy of the result. I'm receiving an incorrect result. Could anyone tell me what is wrong with the following code, please? I think there should be something wrong with my kernel function(the way I am retrieving the cufftComplex data should be incorrect).
#include <math.h>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cufft.h>
extern "C" __declspec(dllexport) void Modulus(cufftComplex *digits,float *result);
__global__ void ModulusComputation(cufftComplex *a, int N, float *temp)
{
int idx = blockIdx.x*blockDim.x + threadIdx.x;
if (idx<N)
{
temp[idx] = sqrt((a[idx].x * a[idx].x) + (a[idx].y * a[idx].y));
}
}
void Modulus(cufftComplex *digits,float *result)
{
#define N 1024
cufftComplex *d_data;
float *temp;
size_t size = sizeof(cufftComplex)*N;
cudaMalloc((void**)&d_data, size);
cudaMalloc((void**)&temp, sizeof(float)*N);
cudaMemcpy(d_data, digits, size, cudaMemcpyHostToDevice);
int blockSize = 16;
int nBlocks = N/blockSize;
if( N % blockSize != 0 )
nBlocks++;
ModulusComputation <<< nBlocks, blockSize >>> (d_data, N,temp);
cudaMemcpy(result, temp, size, cudaMemcpyDeviceToHost);
cudaFree(d_data);
cudaFree(temp);
}
In the final cudaMemcpy in your code, you have:
cudaMemcpy(result, temp, size, cudaMemcpyDeviceToHost);
It should be:
cudaMemcpy(result, temp, sizeof(float)*N, cudaMemcpyDeviceToHost);
If you had included error checking for your cuda calls, you would have seen this cuda call (as originally written) throw an error.
There's other comments that could be made. For example your block size (16) should be an integral multiple of 32. But this does not prevent proper operation.
After the kernel call, when copying back the result, you are using size as the memory size. The third argument of cudaMemcpy should be N * sizeof(float).