Not able to use printf in a CUDA kernel function

It seems that printf doesn't work inside the kernel of a CUDA program:
#include "Common.h"
#include<cuda.h>
#include <stdio.h>
__device__ __global__ void Kernel(float *a_d , float *b_d ,int size)
{
int idx = threadIdx.x ;
int idy = threadIdx.y ;
//Allocating memory in the share memory of the device
__shared__ float temp[16][16];
//Copying the data to the shared memory
temp[idy][idx] = a_d[(idy * (size+1)) + idx] ;
printf("idx=%d, idy=%d, size=%d\n", idx, idy, size);
for(int i =1 ; i<size ;i++) {
if((idy + i) < size) { // NO Thread divergence here
float var1 =(-1)*( temp[i-1][i-1]/temp[i+idy][i-1]);
temp[i+idy][idx] = temp[i-1][idx] +((var1) * (temp[i+idy ][idx]));
}
__syncthreads(); //Synchronizing all threads before Next iterat ion
}
b_d[idy*(size+1) + idx] = temp[idy][idx];
}
When compiling, it fails with:
error: calling a host function("printf") from a __device__/__global__ function("Kernel") is not allowed
The CUDA version is 4.

Quoting the CUDA Programming Guide: "Formatted output is only supported by devices of compute capability 2.x and higher." See the programming guide for additional information.
Devices of compute capability < 2.x can use cuPrintf.
If you are on a device of compute capability 2.x or above and you are trying to use printf, make sure you have specified arch=sm_20 (or higher). The default is sm_10, which does not have sufficient features to support printf.
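For example, a build invocation might look like this (a sketch; the source file name is hypothetical):

$ nvcc -arch=sm_20 kernel.cu -o kernel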
NVIDIA offers three source-level debuggers for CUDA; you may find these more useful than printf for inspecting variables (a cuda-gdb example follows the list):
- Nsight Visual Studio Edition CUDA Debugger
- Nsight Eclipse Edition CUDA Debugger
- cuda-gdb
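As an example of the last of these, a typical cuda-gdb session might start like this (a sketch; -g and -G generate host- and device-side debug information, and the file names are hypothetical):

$ nvcc -g -G kernel.cu -o app
$ cuda-gdb ./app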

You need to use cuPrintf, as in this example. Note that printf is a pretty limited way of debugging; the Nsight or Nsight Eclipse Edition IDEs are much nicer.
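For reference, a minimal sketch of the cuPrintf pattern (assuming cuPrintf.cu from the CUDA SDK's simplePrintf sample is available alongside the source):

#include <stdio.h>
#include "cuPrintf.cu" // shipped with the CUDA SDK samples; assumed present

__global__ void Kernel(void)
{
    cuPrintf("thread (%d, %d)\n", threadIdx.x, threadIdx.y);
}

int main(void)
{
    cudaPrintfInit();                // allocate the device-side print buffer
    Kernel<<<1, dim3(16, 16)>>>();
    cudaPrintfDisplay(stdout, true); // copy the buffer back and print it
    cudaPrintfEnd();                 // release the buffer
    return 0;
}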


grid_group not found in CUDA 9

I tried using Cooperative Groups in CUDA 9, but I get an error when compiling.
Does anyone know the solution?
The development environment is as follows:
- CUDA 9
- Kepler K80
- Compute capability: 3.7
#include <cstdint>
#include <iostream>
#include <vector>
#include <cooperative_groups.h>

__global__ void kernel(uint32_t values[])
{
    using namespace cooperative_groups;
    grid_group g = this_grid();
}

int main(void)
{
    constexpr uint32_t kNum = 1 << 24;
    std::vector<uint32_t> h_values(kNum);
    uint32_t *d_values;
    cudaMalloc(&d_values, sizeof(uint32_t) * kNum);
    cudaMemcpy(d_values, h_values.data(), sizeof(uint32_t) * kNum, cudaMemcpyHostToDevice);

    const uint32_t thread_num = 256;
    const dim3 block(thread_num);
    const dim3 grid((kNum + block.x - 1) / block.x);
    void *params[] = {&d_values};
    cudaLaunchCooperativeKernel((void *)kernel, grid, block, params);

    cudaMemcpy(h_values.data(), d_values, sizeof(uint32_t) * kNum, cudaMemcpyDeviceToHost);
    cudaFree(d_values);
    return 0;
}
$ nvcc -arch=sm_37 test.cu --std=c++11 -o test
test.cu(12): error: identifier "grid_group" is undefined
test.cu(12): error: identifier "this_grid" is undefined
The grid_group features are only supported on the Pascal architecture and later.
You can try compiling for, e.g., sm_60 (though of course the executable won't run on your GPU). Additionally, you need to enable relocatable device code (-rdc=true).
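For example, the build line from the question would become (a sketch):

$ nvcc -arch=sm_60 -rdc=true test.cu --std=c++11 -o test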
Unfortunately, the Programming Guide is not very clear about this; I couldn't find the information there. However, it is mentioned in some posts on devblogs.nvidia.com:
From https://devblogs.nvidia.com/cuda-9-features-revealed/
While Cooperative Groups works on all GPU architectures, certain functionality is inevitably architecture-dependent as GPU capabilities have evolved. Basic functionality, such as synchronizing groups smaller than a thread block down to warp granularity, is supported on all architectures, while Pascal and Volta GPUs enable new grid-wide and multi-GPU synchronizing groups.
Or at the very end of https://devblogs.nvidia.com/cooperative-groups/
New features in Pascal and Volta GPUs help Cooperative Groups go farther, by enabling creation and synchronization of thread groups that span an entire kernel launch running on one or even multiple GPUs.
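For reference, a minimal sketch of what grid-wide synchronization enables on supported hardware (a hypothetical two-phase kernel; it requires Pascal or later, -rdc=true, and a cooperative launch as in the question):

#include <cstdint>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;

__global__ void two_phase(uint32_t values[])
{
    cg::grid_group g = cg::this_grid();
    // phase 1: each thread writes its portion of values ...
    g.sync(); // grid-wide barrier: every block reaches this point before any proceeds
    // phase 2: threads may now safely read values written by other blocks ...
}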

How do I direct all accesses to global memory in CUDA?

I want all memory accesses from my program to go to global memory (even if the data is found in the L1/L2 cache). To this end, I found that the L1 cache can be skipped by passing these options to the nvcc compiler:
-Xptxas -dlcm=cg
The CUDA documentation states this:
.cv Cache as volatile (consider cached system memory lines stale, fetch again).
So I am assuming that when I compile with either -dlcm=cg or -dlcm=cv, the generated PTX file should differ from the one generated normally (the loads should be suffixed with either .cg or .cv).
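For illustration, the expected difference would look roughly like this in the PTX (a sketch; the actual instructions vary with data type and architecture):

ld.global.u32 %r1, [%rd1];    // default load (equivalent to the .ca cache operator)
ld.global.cg.u32 %r1, [%rd1]; // with -dlcm=cg: cache globally, bypassing L1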
My sample program:
__global__ void rh_kernel(int *datainRowX, int *datainRowY)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid != 0)
        return;

    int x = datainRowX[1];
    int y = datainRowY[2];
    datainRowX[0] = x + y;
}

int main(int argc, char **argv)
{
    int *d_datainRowX;
    cudaMalloc((void **)&d_datainRowX, sizeof(int) * 268435456);
    int *d_datainRowY;
    cudaMalloc((void **)&d_datainRowY, sizeof(int) * 268435456);

    rh_kernel<<<1024, 1>>>(d_datainRowX, d_datainRowY);

    cudaFree(d_datainRowX);
    cudaFree(d_datainRowY);
    return 0;
}
I notice that whatever options I pass to the nvcc compiler ("-Xptxas -dlcm=cg", "-Xptxas -dlcm=cv", or nothing at all), the generated PTX is the same in all three cases. I am using the -ptx option to generate the PTX file.
What am I missing? Is there any other way to achieve this?
Thanks in advance for your time.
According to the CUDA Toolkit Documentation:
"L1 caching in Kepler GPUs is reserved only for local memory accesses, such as register spills and stack data. Global loads are cached in L2 only (or in the Read-Only Data Cache). GK110B-based products such as the Tesla K40 GPU Accelerator, GK20A, and GK210 retain this behavior by default."
The L1 cache is not used for global memory reads on Kepler by default, so there is no difference in the PTX when you add -Xptxas -dlcm=cg.
Disabling the L2 cache is not possible.
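As a follow-up sketch: -Xptxas options act at the PTX-to-SASS stage, so any effect of -dlcm shows up in the SASS rather than the PTX. One way to inspect it (the file names are hypothetical):

$ nvcc -arch=sm_35 -Xptxas -dlcm=cg -cubin test.cu -o test.cubin
$ cuobjdump --dump-sass test.cubin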


CUFFT_INVALID_VALUE in cufftGetSize1d

What is the proper way to use cufftGetSize1d (or any of the cufftGetSize*) functions?
I tried with:
cufftHandle plan;
size_t workSize;
cufftResult result;
cufftCreate(&plan);
result = cufftGetSize1d(plan, 1000, CUFFT_C2C, 1, &workSize);
However, the result of the last call is always CUFFT_INVALID_VALUE, regardless of the size, type, or batch I use. The same is true of the 2D and 3D variants. cufftEstimate1d works correctly.
This appears to be a bug which was introduced during the CUDA 6 release cycle and subsequently fixed in CUDA 7. The following code:
#include <iostream>
#include <cufft.h>

int main()
{
    cufftHandle plan;
    size_t workSize;
    cufftResult result;

    cufftCreate(&plan);
    result = cufftGetSize1d(plan, 1000, CUFFT_C2C, 1, &workSize);
    std::cout << "result = " << result << std::endl;
    return 0;
}
fails with CUFFT_INVALID_VALUE when compiled and run against the CUFFT shipped in CUDA 6.5, but succeeds when built and run against the CUFFT version in CUDA 7.0. As noted in the comments, cufftGetSize appears to work correctly in CUDA 6.5, so the workaround is to use cufftGetSize or to upgrade to a version of CUFFT newer than CUDA 6.5.
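For reference, a minimal sketch of the cufftGetSize workaround on CUDA 6.5 (assuming the work area is queried after the plan has actually been made):

cufftHandle plan;
size_t workSize;
cufftCreate(&plan);
cufftMakePlan1d(plan, 1000, CUFFT_C2C, 1, &workSize); // also reports the work size
cufftGetSize(plan, &workSize);                        // query the finished plan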
[This community wiki entry was added mostly from comments to get this question off the unanswered question list]

cudaMemcpy is too slow on Tesla C2075

I'm currently working on a server with two CUDA-capable GPUs: a Quadro 400 and a Tesla C2075. I wrote a simple vector-addition test program. My problem is that while the Tesla C2075 is supposed to be more powerful than the Quadro 400, it takes more time to do the job. I found that cudaMemcpy takes up most of the execution time, and it runs more slowly on the more powerful GPU.
Here's the source:
void get_matrix(float *arr1, float *arr2, int N1, int N2)
{
    int Nx, Ny;
    int n_blocks, n_threads;
    int dev = 0; // 1
    float time;
    size_t size;
    clock_t start, end;
    float *d_A, *d_B; // device buffers (assumed; not declared in the snippet as posted)

    cudaSetDevice(dev);
    cudaDeviceProp deviceProp;
    start = clock();
    cudaGetDeviceProperties(&deviceProp, dev);

    Nx = N1;
    Ny = N2;
    n_threads = 256;
    n_blocks = (Nx * Ny + n_threads - 1) / n_threads;
    size = Nx * Ny * sizeof(float);

    cudaMalloc((void **)&d_A, size);
    cudaMalloc((void **)&d_B, size);
    cudaMemcpy(d_A, arr1, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, arr2, size, cudaMemcpyHostToDevice);
    vector_add<<<n_blocks, n_threads>>>(d_A, d_B, size);
    cudaMemcpy(arr1, d_A, size, cudaMemcpyDeviceToHost);

    printf("Running device %s \n", deviceProp.name);
    end = clock();
    time = float(end - start) / float(CLOCKS_PER_SEC);
    printf("time = %e\n", time);
}

int main()
{
    int const nx = 20000, ny = nx;
    static float a[nx*ny], b[nx*ny];

    for (int i = 0; i < nx; i++) {
        for (int j = 0; j < ny; j++) {
            a[j + ny*i] = j + 10*i;
            b[j + ny*i] = -(j + 10*i);
        }
    }

    get_matrix(a, b, nx, ny);
    return 0;
}
The output is:
Running device Quadro 400
time = 1.100000e-01
Running device Tesla C2075
time = 1.050000e+00
And my questions are:
Should I modify the code depending on what GPU I am going to use?
Is there any connection between the number of blocks, threads per block specified in the code and the number of multiprocessors, cores per multiprocessor available on a GPU?
I'm running openSUSE 11.2. The source code is compiled with the nvcc compiler (version 4.2).
Thanks for your help!
Try invoking get_matrix(a,b,nx,ny) twice and take the second timing result. The first call into the CUDA API creates the CUDA context, which often takes a long time.
Please refer to this section of the CUDA C Best Practices Guide for how to determine the block and grid sizes.
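For example, a common warm-up idiom looks like this (a sketch; cudaFree(0) simply forces lazy context creation before the timed region):

cudaFree(0);              // forces CUDA context initialization up front
get_matrix(a, b, nx, ny); // timing inside now excludes the context-creation cost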