How do we use cuPrintf()? - CUDA

What do we have to do to use cuPrintf()? (device compute capability 1.2, Ubuntu 12) I couldn't find "cuPrintf.cu" and "cuPrintf.cuh", so I downloaded their code and included them:
#include "cuPrintf.cuh"
#include "cuPrintf.cu"
By the way this is the rest of the code:
__global__ void hello_kernel (float f) {
    printf ("Thread number %d. f = %d\n", threadIdx.x, f);
}

int main () {
    dim3 gridSize = dim3 (1);
    dim3 blockSize = dim3 (16);
    cudaPrintfInit ();
    hello_kernel <<< gridSize, blockSize >>> (1.2345f);
    cudaPrintfDisplay (stdout, true);
    cudaPrintfEnd ();
    return (0);
}
But nvcc still gives an error:
max@max-Lenovo-G560:~/CUDA/matrixMult$ nvcc printfTest.cu -o printfTest
printfTest.cu(5): error: calling a __host__ function("printf") from a __global__
function("hello_kernel") is not allowed
Thanks!

In your kernel, instead of this:
printf ("Thread number %d. f = %d\n", threadIdx.x, f);
you should do this:
cuPrintf ("Thread number %d. f = %d\n", threadIdx.x, f);
Other than that, I believe your code is correct (it works for me).
This SO question/answer gives more tips about using cuPrintf properly.
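Putting the fix together, here is a minimal sketch of the corrected program (my assembly of the asker's code with the answer's fix applied; it assumes cuPrintf.cu and cuPrintf.cuh sit next to the source file, and uses %f rather than %d since f is a float):
#include "cuPrintf.cuh"
#include "cuPrintf.cu"

__global__ void hello_kernel (float f) {
    // cuPrintf instead of printf; %f (not %d) to print a float
    cuPrintf ("Thread number %d. f = %f\n", threadIdx.x, f);
}

int main () {
    cudaPrintfInit ();                 // allocate the device-side print buffer
    hello_kernel <<< 1, 16 >>> (1.2345f);
    cudaPrintfDisplay (stdout, true);  // copy the buffer back and print it
    cudaPrintfEnd ();                  // free the buffer
    return 0;
}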

Include <stdio.h> and compile with -arch=sm_20. (In-kernel printf requires a device of compute capability 2.0 or higher, so on the asker's 1.2 device cuPrintf remains the only option.)
Details:
code:
#include <stdio.h>

__global__ void hello_kernel (float f) {
    printf ("Thread number %d. f = %f\n", threadIdx.x, f);  // %f, not %d, for a float
}

int main(){
    return 0;
}
compilation:
nvcc -arch=sm_20 -o printfTest printfTest.cu
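Note that main() above never launches the kernel, so compiling is all it proves. A small usage sketch (my addition, not the original answer's): in-kernel printf output is buffered on the device and flushed at synchronization points, so launch and then synchronize:
hello_kernel <<< 1, 16 >>> (1.2345f);
cudaDeviceSynchronize();  // flushes the device-side printf buffer to stdout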

Related

OpenACC: calling a CUDA __device__ kernel from an OpenACC parallel loop

If I have a simple test CUDA kernel in the file hello.cu:
extern "C" __device__ float radians( float f ){
return f*3.14159265;
}
And the test OpenACC code in mainacc.c:
#include <stdio.h>
#include <stdlib.h>
#define N 10

#pragma acc routine seq
extern float radians( float );

int main() {
    int i;
    float *hptr, *dptr;
    hptr = (float *) calloc(N, sizeof(float));
    #pragma acc parallel loop copy(hptr[0:N])
    for(i=0; i<N; i++) {
        hptr[i] = radians(i*0.1f);
    }
    for( i=0; i< N; i++)
        printf("\n %dth value : %f", i, hptr[i]);
    return 0;
}
If I try to compile this code as below, I get link-time errors:
nvcc hello.cu -c
cc -hacc -hlist=a mainacc.c hello.o
nvlink error : Undefined reference to 'radians' in '/tmp/pe_20271//app_cubin_20271.omainacc_1.o__sec.cubin'
cuda_link: nvlink fatal error
I tried nvcc with the "--relocatable-device-code true" option etc., but with no success. The loaded modules are:
craype-accel-nvidia35
cudatoolkit/6.5
PrgEnv-cray/5.2.40
Could you tell me the correct way to use a CUDA device kernel within OpenACC?
I've been able to make this sort of mixing work with PGI, but I've not yet been able to produce a sample that works with the Cray compiler. Here's a simple example that works for PGI.
This is the file containing the CUDA.
// saxpy_cuda_device.cu
extern "C" __device__
float saxpy_dev(float a, float x, float y)
{
    return a * x + y;
}
This is the file containing OpenACC.
// openacc_cuda_device.cpp
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#pragma acc routine seq
extern "C" float saxpy_dev(float, float, float);

int main(int argc, char **argv)
{
    float *x, *y, tmp;
    int n = 1<<20, i;
    x = (float*)malloc(n*sizeof(float));
    y = (float*)malloc(n*sizeof(float));
    #pragma acc data create(x[0:n]) copyout(y[0:n])
    {
        #pragma acc kernels
        {
            for( i = 0; i < n; i++)
            {
                x[i] = 1.0f;
                y[i] = 0.0f;
            }
        }
        #pragma acc parallel loop
        for( i = 0; i < n; i++ )
        {
            y[i] = saxpy_dev(2.0, x[i], y[i]);
        }
    }
    fprintf(stdout, "y[0] = %f\n", y[0]);
    return 0;
}
Below are the compilation commands.
$ make
nvcc -rdc true -c saxpy_cuda_device.cu
pgc++ -fast -acc -ta=nvidia:rdc,cuda7.0 -c openacc_cuda_device.cpp
pgc++ -o openacc_cuda_device -fast -acc -ta=nvidia:rdc,cuda7.0 saxpy_cuda_device.o openacc_cuda_device.o -Mcuda
You can use the -Wc command-line option to add the generated PTX file to the CUDA link line. I've opened a bug to make sure we document how to do this.
nvcc hello.cu -ptx -arch=sm_35
cc -hacc -hlist=a mainacc.c -Wc,hello.ptx
One suggestion is to provide both a host and device version of the subroutine and then use the "bind" clause to indicate which version to call from a compute region. This will allow you to maintain portability with the host code.
For example:
% cat radians.cu
extern "C" __device__ float cuda_radians( float f ){
    return f*3.14159265;
}

extern "C" float radians( float f ){
    return f*3.14159265;
}

% cat test.c
#include <stdio.h>
#include <stdlib.h>
#define N 10

#pragma acc routine (radians) bind(cuda_radians) seq
extern float radians( float f);

int main() {
    int i;
    float *hptr, *dptr;
    hptr = (float *) calloc(N, sizeof(float));
    #pragma acc parallel loop copy(hptr[0:N])
    for(i=0; i<N; i++) {
        hptr[i] = radians(i*0.1f);
    }
    for( i=0; i< N; i++)
        printf("\n %dth value : %f", i, hptr[i]);
    return 0;
}
% nvcc -c radians.cu --relocatable-device-code true
% pgcc -acc -ta=tesla:cuda7.0 -Minfo=accel test.c radians.o -V15.7 -Mcuda
test.c:
main:
15, Generating copy(hptr[:10])
Accelerator kernel generated
Generating Tesla code
16, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
% a.out
0th value : 0.000000
1th value : 0.314159
2th value : 0.628319
3th value : 0.942478
4th value : 1.256637
5th value : 1.570796
6th value : 1.884956
7th value : 2.199115
8th value : 2.513274
9th value : 2.827434

CUDA 6.5: can't extern the value of a texture

I wrote a program following JackOLantern's answer in CUDA extern texture declaration, but my result doesn't print the value fetched through the extern texture declaration.
P.S.: how do I add -rdc=true to enable external linkage?
(Screenshot of the program's output: http://i.stack.imgur.com/aGh3U.png)
Thanks for your help!
kernel.cu compilation unit
#include <stdio.h>

texture<int, 1, cudaReadModeElementType> texture_test;

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

/*************************/
/* LOCAL KERNEL FUNCTION */
/*************************/
__global__ void kernel1() {
    printf("ThreadID = %i; Texture value = %i\n", threadIdx.x, tex1Dfetch(texture_test, threadIdx.x));
}

__global__ void kernel2();

/********/
/* MAIN */
/********/
int main() {
    const int N = 16;

    // --- Host data allocation and initialization
    int *h_data = (int*)malloc(N * sizeof(int));
    for (int i=0; i<N; i++) h_data[i] = i;

    // --- Device data allocation and host->device memory transfer
    int *d_data; gpuErrchk(cudaMalloc((void**)&d_data, N * sizeof(int)));
    gpuErrchk(cudaMemcpy(d_data, h_data, N * sizeof(int), cudaMemcpyHostToDevice));

    gpuErrchk(cudaBindTexture(NULL, texture_test, d_data, N * sizeof(int)));

    kernel1<<<1, 16>>>();
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    kernel2<<<1, 16>>>();
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    gpuErrchk(cudaUnbindTexture(texture_test));
}
kernel2.cu compilation unit
#include <stdio.h>

extern texture<int, 1, cudaReadModeElementType> texture_test;

/**********************************************/
/* DIFFERENT COMPILATION UNIT KERNEL FUNCTION */
/**********************************************/
__global__ void kernel2() {
    printf("Texture value = %i\n", tex1Dfetch(texture_test, threadIdx.x));
}
P.S.: how do I add -rdc=true to enable external linkage?
In Nsight VSE, try setting "Properties | CUDA C/C++ | Common | Generate Relocatable Device Code" to "Yes".
There is an Nsight VSE documentation page that describes this setting as well.
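For a plain command-line build (my addition, not part of the original answer), the same setting is nvcc's -rdc flag; compile each unit with it and let nvcc handle the device link step, e.g.:
nvcc -rdc=true -c kernel.cu
nvcc -rdc=true -c kernel2.cu
nvcc -o textureTest kernel.o kernel2.o
or, in one step: nvcc -rdc=true kernel.cu kernel2.cu -o textureTest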

memset in CUBLAS gemm is always launched in default stream

I noticed that when calling the cublasSgemm function, each gemm call from the host produces 3 kernel invocations: memset, scal_kernel, and the gemm kernel itself (e.g. sgemm_large). This happens even if I use constant alpha/beta allocated in device memory. While the overhead of memset and scal_kernel is relatively small, the problem is that memset is always launched in the default stream, which causes unnecessary synchronization.
The code:
__constant__ __device__ float alpha = 1;
__constant__ __device__ float beta = 1;

int main()
{
    // ... memory allocation skipped ...
    float* px   = thrust::raw_pointer_cast(x.data());
    float* py   = thrust::raw_pointer_cast(y.data());
    float* pmat = thrust::raw_pointer_cast(mat.data());
    for (int iter = 0; iter < 3; ++iter)
    {
        cbstatus = cublasSgemm(cbh, CUBLAS_OP_N, CUBLAS_OP_N, crow, ccol, cshared, &alpha, px, crow, py, cshared, &beta, pmat, crow);
        assert(0 == cbstatus);
    }
}
This is what I see in the profiler (screenshot not reproduced here).
The question: is there a way to avoid the memset, or to make it run in the stream assigned to the CUBLAS handle?
One idea is to use dynamic parallelism and run the device version of the gemm function, but this will work only on CC 3.5 and higher.
There was a bug in CUBLAS 5.5 where a cudaMemset was used instead of cudaMemsetAsync in the specialized path where k >> m,n.
It is fixed in CUBLAS 6.0 RC, and you can have access to it if you are a registered developer.
By the way, I wonder why you use __constant__ __device__ for alpha and beta.
Are you using pointerMode = DEVICE?
If not, you could simply use alpha and beta on the host.
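For reference, a minimal sketch of the pointer-mode distinction the answer is asking about (my illustration, not the answerer's code): with CUBLAS_POINTER_MODE_DEVICE the alpha/beta arguments must point to device memory, while with the default CUBLAS_POINTER_MODE_HOST plain host variables suffice:
cublasHandle_t handle;
cublasCreate(&handle);

// Host pointer mode (the default): alpha is read from host memory.
float h_alpha = 1.0f;
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
// cublasSgemm(handle, ..., &h_alpha, ...);

// Device pointer mode: alpha must live in device memory.
float *d_alpha;
cudaMalloc((void**)&d_alpha, sizeof(float));
cudaMemcpy(d_alpha, &h_alpha, sizeof(float), cudaMemcpyHostToDevice);
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
// cublasSgemm(handle, ..., d_alpha, ...);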
Try the code below. It is conceived to have only the cublasSgemm calls, apart from unavoidable memory allocations and copies. You will see that:
only one kernel is launched (gemm_kernel1x1_core);
the two calls to cublasSgemm run perfectly in two different streams.
The Visual Profiler timeline confirms this (screenshot not reproduced here). My system: GeForce 540M, Windows 7, CUDA 5.5.
#include <conio.h>
#include <stdio.h>
#include <assert.h>

#include <cublas_v2.h>

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) { getchar(); exit(code); }
    }
}

/**********************/
/* cuBLAS ERROR CHECK */
/**********************/
#ifndef cublasSafeCall
#define cublasSafeCall(err) __cublasSafeCall(err, __FILE__, __LINE__)
#endif
inline void __cublasSafeCall(cublasStatus_t err, const char *file, const int line)
{
    if( CUBLAS_STATUS_SUCCESS != err) {
        // report the call site passed in by the macro, not this function's own location
        fprintf(stderr, "CUBLAS error in file '%s', line %d\n \nerror %d \nterminating!\n", file, line, err);
        getch(); cudaDeviceReset(); assert(0);
    }
}
/********/
/* MAIN */
/********/
int main()
{
    int N = 5;

    float *A1, *A2, *B1, *B2, *C1, *C2;
    float *d_A1, *d_A2, *d_B1, *d_B2, *d_C1, *d_C2;

    A1 = (float*)malloc(N*N*sizeof(float));
    B1 = (float*)malloc(N*N*sizeof(float));
    C1 = (float*)malloc(N*N*sizeof(float));
    A2 = (float*)malloc(N*N*sizeof(float));
    B2 = (float*)malloc(N*N*sizeof(float));
    C2 = (float*)malloc(N*N*sizeof(float));

    gpuErrchk(cudaMalloc((void**)&d_A1,N*N*sizeof(float)));
    gpuErrchk(cudaMalloc((void**)&d_B1,N*N*sizeof(float)));
    gpuErrchk(cudaMalloc((void**)&d_C1,N*N*sizeof(float)));
    gpuErrchk(cudaMalloc((void**)&d_A2,N*N*sizeof(float)));
    gpuErrchk(cudaMalloc((void**)&d_B2,N*N*sizeof(float)));
    gpuErrchk(cudaMalloc((void**)&d_C2,N*N*sizeof(float)));

    for (int i=0; i<N*N; i++) {
        A1[i] = ((float)rand()/(float)RAND_MAX);
        A2[i] = ((float)rand()/(float)RAND_MAX);
        B1[i] = ((float)rand()/(float)RAND_MAX);
        B2[i] = ((float)rand()/(float)RAND_MAX);
    }

    gpuErrchk(cudaMemcpy(d_A1, A1, N*N*sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_B1, B1, N*N*sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_A2, A2, N*N*sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_B2, B2, N*N*sizeof(float), cudaMemcpyHostToDevice));

    cublasHandle_t handle;
    cublasSafeCall(cublasCreate(&handle));

    cudaStream_t stream1, stream2;
    gpuErrchk(cudaStreamCreate(&stream1));
    gpuErrchk(cudaStreamCreate(&stream2));

    float alpha = 1.f;
    float beta  = 1.f;

    // one gemm per stream, switching the stream bound to the handle in between
    cublasSafeCall(cublasSetStream(handle,stream1));
    cublasSafeCall(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A1, N, d_B1, N, &beta, d_C1, N));
    cublasSafeCall(cublasSetStream(handle,stream2));
    cublasSafeCall(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A2, N, d_B2, N, &beta, d_C2, N));

    gpuErrchk(cudaDeviceReset());

    return 0;
}

Wrong results of a CUDA dynamic parallelism code

I recently bumped into the problem illustrated in Uncorrectable ECC error. Briefly, from time to time I receive an Uncorrectable ECC error and my dynamic parallelism code generates incorrect results. The most probable hypothesis for the uncorrectable ECC error is a corrupted driver stack, which has also been indirectly confirmed by the experience of another user (see the above post). I would now like to face the second issue, i.e., the algorithmic one. To this end, I'm dealing with the reproducer reported below which, since the original code generating incorrect results uses dynamic parallelism, uses this CUDA feature too.
I do not see any evident issue with this code. I think that the synchronization surrounding the child kernel launch should be fine: the first __syncthreads() should not be necessary, and the cudaDeviceSynchronize() should ensure that all the memory writes of the child kernel are completed before the printf.
My question is: is this code wrong, or are the wrong results due to a non-programming issue?
My configuration: CUDA 5.0, Windows 7, 4-GPU system equipped with Kepler K20c, driver 327.23.
#include <stdio.h>
#include <conio.h>

#define K          6
#define BLOCK_SIZE 256

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) { getch(); exit(code); }
    }
}

int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }

__global__ void child_kernel(double* P1)
{
    int m = threadIdx.x;
    P1[m] = (double)m;
}

__global__ void parent_kernel(double* __restrict__ x, int M)
{
    int i = threadIdx.x + blockDim.x * blockIdx.x;
    if(i<M) {
        double* P1 = new double[13];
        dim3 dimBlock(2*K+1,1); dim3 dimGrid(1,1);
        __syncthreads();
        child_kernel<<<dimGrid,dimBlock>>>(P1);
        cudaDeviceSynchronize();
        for(int m=0; m<2*K+1; m++) printf("%f %f\n",P1[m],(double)m);
    }
}

int main() {
    const int M = 19000;
    //gpuErrchk(cudaSetDevice(0));

    double* x = (double*)malloc(M*sizeof(double));
    for (int i=0; i<M; i++) x[i] = (double)i;

    double* d_x; gpuErrchk(cudaMalloc((void**)&d_x,M*sizeof(double)));
    gpuErrchk(cudaMemcpy(d_x,x,M*sizeof(double),cudaMemcpyHostToDevice));

    dim3 dimBlock(BLOCK_SIZE,1); dim3 dimGrid(iDivUp(M,BLOCK_SIZE));
    parent_kernel<<<dimGrid,dimBlock>>>(d_x,M);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    getch();
    return 0;
}
I'm pretty sure you're exceeding the launch pending limit. It's nearly impossible to tell with your code as-is, but I've modified it and added error checking on the child kernel launch.
When I do that, I get launch errors, signified by a printout of !. Skipping the launch error cases, all of my in-kernel checking of P1[m] vs. m passes (I get no * printout at all.)
#include <stdio.h>

#define K          6
#define BLOCK_SIZE 256

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) { exit(code); }
    }
}

int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }

__global__ void child_kernel(unsigned long long* P1)
{
    int m = threadIdx.x;
    P1[m] = (unsigned long long)m;
}

__global__ void parent_kernel(double* __restrict__ x, int M)
{
    int i = threadIdx.x + blockDim.x * blockIdx.x;
    if(i<M) {
        unsigned long long* P1 = new unsigned long long[13];
        dim3 dimBlock(2*K+1,1); dim3 dimGrid(1,1);
        __syncthreads();
        child_kernel<<<dimGrid,dimBlock>>>(P1);
        cudaDeviceSynchronize();
        // check whether the child launch itself failed before trusting P1
        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess) printf("!");
        else for(unsigned long long m=0; m<dimBlock.x; m++) if (P1[m] != m) printf("*");
    }
}

int main() {
    const int M = 19000;
    //gpuErrchk(cudaSetDevice(0));

    double* x = (double*)malloc(M*sizeof(double));
    for (int i=0; i<M; i++) x[i] = (double)i;

    double* d_x; gpuErrchk(cudaMalloc((void**)&d_x,M*sizeof(double)));
    gpuErrchk(cudaMemcpy(d_x,x,M*sizeof(double),cudaMemcpyHostToDevice));

    dim3 dimBlock(BLOCK_SIZE,1); dim3 dimGrid(iDivUp(M,BLOCK_SIZE));
    parent_kernel<<<dimGrid,dimBlock>>>(d_x,M);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    return 0;
}
Feel free to add further decoding of the err variable in the parent kernel to convince yourself that you are exceeding the launch pending limit. As another test, you can set M to 2048 instead of 19000 in your host code, and all the ! printouts go away. (The launch pending limit defaults to 2048.)
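If raising the limit is an option, the device runtime exposes it through cudaDeviceSetLimit, called from the host before the parent kernel launches; a minimal sketch (my addition; the value 19000 matches M above and reserves extra device memory):
// Reserve buffer space for up to 19000 pending child-kernel launches.
gpuErrchk(cudaDeviceSetLimit(cudaLimitDevRuntimePendingLaunchCount, 19000));
parent_kernel<<<dimGrid,dimBlock>>>(d_x,M);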
As I've stated in the comments, I think the uncorrectable ECC error is a separate issue, and I suggest trying the driver 321.01 that I linked in the comments.