CUDA hello_world.cu compiles but wrongly prints "Hello Hello"

I have found the following hello world program for CUDA:
#include <stdio.h>

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    msg, cudaGetErrorString(__err), \
                    __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

const int N = 16;
const int blocksize = 16;

__global__
void hello(char *a, int *b)
{
    a[threadIdx.x] += b[threadIdx.x];
}

int main()
{
    char a[N] = "Hello \0\0\0\0\0\0";
    int b[N] = {15, 10, 6, 0, -11, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
    char *ad;
    int *bd;
    const int csize = N*sizeof(char);
    const int isize = N*sizeof(int);

    printf("%s", a);

    cudaMalloc( (void**)&ad, csize );
    cudaMalloc( (void**)&bd, isize );
    cudaCheckErrors("cudaMalloc fail");

    cudaMemcpy( ad, a, csize, cudaMemcpyHostToDevice );
    cudaMemcpy( bd, b, isize, cudaMemcpyHostToDevice );
    cudaCheckErrors("cudaMemcpy H2D fail");

    dim3 dimBlock( blocksize, 1 );
    dim3 dimGrid( 1, 1 );
    hello<<<dimGrid, dimBlock>>>(ad, bd);
    cudaCheckErrors("Kernel fail");

    cudaMemcpy( a, ad, csize, cudaMemcpyDeviceToHost );
    cudaCheckErrors("cudaMemcpy D2H/Kernel fail");

    cudaFree( ad );
    cudaFree( bd );

    printf("%s\n", a);
    return EXIT_SUCCESS;
}
I compile it successfully with nvcc hello_world.cu -o hello, but when I run cuda-memcheck ./hello, I get:
========= CUDA-MEMCHECK
Fatal error: cudaMalloc fail (unknown error at hello_world.cu:39)
*** FAILED - ABORTING
Hello ========= ERROR SUMMARY: 0 errors
I'm a CUDA newbie; my questions are:
1) What's going on under the hood?
2) How can I fix it?
I'm running Ubuntu 13.04, x86_64, CUDA 5.5, without root access.
The upper part of the nvidia-smi output is:
+------------------------------------------------------+
| NVIDIA-SMI 337.19 Driver Version: 337.19 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|===============================+======================+======================|
| 0 GeForce GTX TIT... Off | 0000:05:00.0 N/A | N/A |
| 26% 37C N/A N/A / N/A | 53MiB / 6143MiB | N/A Default |
+-------------------------------+----------------------+----------------------+
When I run deviceQuery, I get:
../../bin/x86_64/linux/release/deviceQuery Starting...
CUDA Device Query (Runtime API) version (CUDART static linking)
cudaGetDeviceCount returned 30
-> unknown error
Result = FAIL
And when I run deviceQueryDrv, I get:
../../bin/x86_64/linux/release/deviceQueryDrv Starting...
CUDA Device Query (Driver API) statically linked version
cuInit(0) returned 999
-> CUDA_ERROR_UNKNOWN
Result = FAIL
When I run:
#include <cublas_v2.h>
#include <cstdio>

int main()
{
    int res;
    cublasHandle_t handle;
    res = cublasCreate(&handle);
    switch(res) {
        case CUBLAS_STATUS_SUCCESS:
            printf("the initialization succeeded\n");
            break;
        case CUBLAS_STATUS_NOT_INITIALIZED:
            printf("the CUDA Runtime initialization failed\n");
            break;
        case CUBLAS_STATUS_ALLOC_FAILED:
            printf("the resources could not be allocated\n");
            break;
    }
    return 0;
}
I get the CUDA Runtime initialization failed.

I have had a problem exactly like yours just now. Running a sample was returning an "Unknown Error" and printing "Hello Hello ", and cublasCreate was returning CUBLAS_STATUS_NOT_INITIALIZED.
I found the answer here: http://docs.nvidia.com/cuda/cuda-getting-started-guide-for-linux/index.html#post-installation-actions
If a CUDA-capable device and the CUDA Driver are installed but deviceQuery reports that no CUDA-capable devices are present, this likely means that the /dev/nvidia* files are missing or have the wrong permissions.
Indeed, my /dev/nvidia* files were owned by root.
I just ran
chown user.user /dev/nvidia*
(where user is my local user) and all the errors disappeared.
EDIT: Later I had a similar issue on another machine, and this solution did not work because one of the devices in /dev/nvidia* was missing. What did work was simply executing a sample once with sudo. After that the extra device appeared in /dev/nvidia*, and the sample code started working without sudo.
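If it helps, the same condition can be checked from code before any CUDA call is made. Below is a minimal sketch (not from the thread above): it only probes the usual device node names, /dev/nvidiactl and /dev/nvidia0; the exact set of nodes on a given system is an assumption and varies with the driver and number of GPUs.
// Sketch: pre-flight check of the NVIDIA device nodes before CUDA is used.
// Node names are the usual ones; adjust the list for your system.
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    const char *nodes[] = { "/dev/nvidiactl", "/dev/nvidia0" };
    for (int i = 0; i < 2; i++) {
        if (access(nodes[i], R_OK | W_OK) != 0)
            printf("%s is missing or not accessible - CUDA initialization will likely fail\n", nodes[i]);
        else
            printf("%s looks fine\n", nodes[i]);
    }
    return 0;
}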

Related

Profiling code with nsight compute on Pascal fails when cuda memory pools are used

I would like to use Nsight Compute for Pascal GPUs to profile a program which uses CUDA memory pools. I am using Linux, CUDA 11.5, driver 495.46. Nsight Compute is version 2019.5.0, which is the last version that supports Pascal.
Consider the following example program
// nvcc -std=c++17 -arch=sm_61 -O3 main.cu -o main
#include <vector>
#include <memory>
#include <cassert>
#include <iostream>

__global__
void kernel(int* data){ data[0] = 1; }

int main(){
    cudaMemPool_t pool{};
    cudaMemPoolProps pool_props{};
    pool_props.allocType = cudaMemAllocationTypePinned;
    pool_props.handleTypes = cudaMemHandleTypePosixFileDescriptor;
    pool_props.location.type = cudaMemLocationTypeDevice;
    pool_props.location.id = 0;
    auto status = cudaMemPoolCreate(&pool, &pool_props);
    printf("%s\n", cudaGetErrorName(status));

    auto stream = cudaStreamPerThread;

    int data = 0;
    int* d_data;
    status = cudaMallocFromPoolAsync(&d_data, sizeof(int), pool, stream);
    printf("%s\n", cudaGetErrorName(status));

    kernel<<<1,1,0,stream>>>(d_data);
    status = cudaGetLastError();
    printf("%s\n", cudaGetErrorName(status));

    status = cudaMemcpyAsync(&data, d_data, sizeof(int), cudaMemcpyDeviceToHost, stream);
    printf("%s\n", cudaGetErrorName(status));

    status = cudaFreeAsync(d_data, stream);
    printf("%s\n", cudaGetErrorName(status));

    status = cudaDeviceSynchronize();
    printf("%s\n", cudaGetErrorName(status));
}
It runs fine without the profiler.
compute-sanitizer ./main
========= COMPUTE-SANITIZER
cudaSuccess
cudaSuccess
cudaSuccess
cudaSuccess
cudaSuccess
cudaSuccess
========= ERROR SUMMARY: 0 errors
When run under the profiler, the calls that use the pool API return cudaErrorCallRequiresNewerDriver:
/opt/nvidia/nsight-compute-2019.5/nv-nsight-cu-cli ./main
==PROF== Connected to process 155966 (main)
cudaErrorCallRequiresNewerDriver
cudaErrorCallRequiresNewerDriver
==PROF== Profiling "kernel" - 1: 0%....50%....100% - 1 pass
==ERROR== Error 0: UnknownError
cudaErrorCallRequiresNewerDriver
cudaErrorIllegalAddress
cudaErrorCallRequiresNewerDriver
cudaErrorIllegalAddress
==PROF== Disconnected from process 155966
==ERROR== An error occurred while trying to profile
Is it possible to profile this program on Pascal using Nsight Compute?
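One defensive pattern (a sketch only, not an answer to the profiling question) is to check whether the device reports memory-pool support and whether pool creation actually succeeds, and to fall back to a plain cudaMalloc otherwise, so the rest of the program can still run in an environment that rejects the pool API. The fallback policy below is purely an assumption; only the API calls themselves are standard CUDA 11.x runtime API.
// Sketch: use the pool only if the device and the current environment accept it.
#include <cuda_runtime.h>

int main(){
    int pools_supported = 0;
    cudaDeviceGetAttribute(&pools_supported, cudaDevAttrMemoryPoolsSupported, 0);

    cudaStream_t stream = cudaStreamPerThread;
    int* d_data = nullptr;
    bool used_pool = false;

    cudaMemPool_t pool{};
    cudaMemPoolProps props{};
    props.allocType = cudaMemAllocationTypePinned;
    props.handleTypes = cudaMemHandleTypePosixFileDescriptor;
    props.location.type = cudaMemLocationTypeDevice;
    props.location.id = 0;

    if (pools_supported && cudaMemPoolCreate(&pool, &props) == cudaSuccess &&
        cudaMallocFromPoolAsync(&d_data, sizeof(int), pool, stream) == cudaSuccess){
        used_pool = true;
    } else {
        cudaMalloc(&d_data, sizeof(int));  // fallback: ordinary device allocation
    }

    // ... launch kernels on `stream` using d_data, as in the program above ...

    if (used_pool) cudaFreeAsync(d_data, stream);  // stream-ordered free for the pool path
    else           cudaFree(d_data);
    cudaDeviceSynchronize();
    return 0;
}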

Cuda - nvcc - No kernel image is available for execution on the device. What is the problem?

I'm trying to use nvcc with the most simple example, but it doesn't work correctly. I'm compiling and executing the example from https://devblogs.nvidia.com/easy-introduction-cuda-c-and-c/, but my server can't execute the global function. I rewrote the code to get some error message, and I receive the following:
"no kernel image is available for execution on the device"
My GPU is a Quadro 6000 and the CUDA version is 9.0.
#include <stdio.h>
#include <cuda_runtime.h>

__global__ void saxpy(int n, float a, float *x, float *y)
{
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    y[i] = 10.0; //a*x[i] + y[i];
}

int main(int argc, char *argv[])
{
    int N = 120;
    int nDevices;
    float *x, *y, *d_x, *d_y;

    cudaError_t err = cudaGetDeviceCount(&nDevices);
    if (err != cudaSuccess)
        printf("%s\n", cudaGetErrorString(err));
    else
        printf("Number of devices %d\n", nDevices);

    x = (float*)malloc(N*sizeof(float));
    y = (float*)malloc(N*sizeof(float));
    cudaMalloc(&d_x, N*sizeof(float));
    cudaMalloc(&d_y, N*sizeof(float));

    for (int i = 0; i < N; i++) {
        x[i] = 1.0f;
        y[i] = 2.0f;
    }

    cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, y, N*sizeof(float), cudaMemcpyHostToDevice);

    // Perform SAXPY on 1M elements
    saxpy<<<1, 1>>>(N, 2.0f, d_x, d_y);
    cudaDeviceSynchronize();

    err = cudaMemcpy(y, d_y, N*sizeof(float), cudaMemcpyDeviceToHost);
    printf("%s\n", cudaGetErrorString(err));

    cudaError_t errSync = cudaGetLastError();
    cudaError_t errAsync = cudaDeviceSynchronize();
    if (errSync != cudaSuccess)
        printf("Sync kernel error: %s\n", cudaGetErrorString(errSync));
    if (errAsync != cudaSuccess)
        printf("Async kernel error: %s\n", cudaGetErrorString(errAsync));

    cudaFree(d_x);
    cudaFree(d_y);
    free(x);
    free(y);
}
Execution command
bash-4.1$ nvcc -o sapx simples_cuda.cu
bash-4.1$ ./sapx
Number of devices 1
no error
Sync kernel error: no kernel image is available for execution on the device
GPUs of compute capability less than 2.0 are only supported by CUDA toolkits of version 6.5 and older.
GPUs of compute capability less than 3.0 (but greater than or equal to 2.0) are only supported by CUDA toolkits of version 8.0 and older.
Your Quadro 6000 is a compute capability 2.0 GPU. This can be determined programmatically with the deviceQuery CUDA sample code, or via a Google search. It is not supported by CUDA 9.0.
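For reference, the compute capability can also be read directly with a few lines of runtime API code instead of the full deviceQuery sample; a minimal sketch:
#include <stdio.h>
#include <cuda_runtime.h>

int main(void)
{
    cudaDeviceProp prop;
    cudaError_t err = cudaGetDeviceProperties(&prop, 0);   // device 0
    if (err != cudaSuccess) {
        printf("cudaGetDeviceProperties failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    // For a Quadro 6000 this prints compute capability 2.0
    printf("%s: compute capability %d.%d\n", prop.name, prop.major, prop.minor);
    return 0;
}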
You should add the compute capability of your video card as a parameter to the nvcc compiler. In my case (Windows / Visual Studio 2017) I set this in the Code Generation field. As #einpoklum answered before, add the gencode parameter like this: -gencode arch=compute_${COMPUTE_CAPABILITY},code=sm_${SM_CAPABILITY}, where ${COMPUTE_CAPABILITY} and ${SM_CAPABILITY} belong to the following pairs (you can add them all, as VS2017 does):
{COMPUTE_CAPABILITY},{SM_CAPABILITY}
compute_35,sm_35
compute_37,sm_37
compute_50,sm_50
compute_52,sm_52
compute_60,sm_60
compute_61,sm_61
compute_70,sm_70
compute_75,sm_75
compute_80,sm_80
D:\Program Files\nVidia\CUDA Samples\MySamples\IntroToCUDA_1\IntroToCUDA_1>"D:\Program Files\nVidia\GPU Computing Toolkit\CUDA\v11.0\bin\nvcc.exe" -gencode=arch=compute_35,code=\"sm_35,compute_35\" -gencode=arch=compute_37,code=\"sm_37,compute_37\" -gencode=arch=compute_50,code=\"sm_50,compute_50\" -gencode=arch=compute_52,code=\"sm_52,compute_52\" -gencode=arch=compute_60,code=\"sm_60,compute_60\" -gencode=arch=compute_61,code=\"sm_61,compute_61\" -gencode=arch=compute_70,code=\"sm_70,compute_70\" -gencode=arch=compute_75,code=\"sm_75,compute_75\" -gencode=arch=compute_80,code=\"sm_80,compute_80\" --use-local-env -ccbin "D:\Program Files (x86)\Microsoft Visual Studio\2017\Enterprise\VC\Tools\MSVC\14.16.27023\bin\HostX86\x64" -x cu -I"D:\Program Files\nVidia\GPU Computing Toolkit\CUDA\v11.0\include" -I"D:\Program Files\nVidia\GPU Computing Toolkit\CUDA\v11.0\include" -G --keep-dir x64\Debug -maxrregcount=0 --machine 64 --compile -cudart static -g -D_DEBUG -D_CONSOLE -D_UNICODE -DUNICODE -Xcompiler "/EHsc /W3 /nologo /Od /Fdx64\Debug\vc141.pdb /FS /Zi /RTC1 /MDd " -o x64\Debug\IntroToCUDA_1.cu.obj "D:\Program Files\nVidia\CUDA Samples\MySamples\IntroToCUDA_1\IntroToCUDA_1\IntroToCUDA_1.cu"
You can check the compute capability of your video card with the deviceQuery example found in the CUDA Samples SDK.
Adding to #RobertCrovella's answer:
When compiling with nvcc, you should always set appropriate flags to generate binary kernel images for the microarchitecture / compute capability you intend to run on. For example: -gencode arch=compute_${COMPUTE_CAPABILITY},code=sm_${COMPUTE_CAPABILITY},
with, say COMPUTE_CAPABILITY=61.
Read nvcc --help for more information on these flags (although, to be honest, it's a bit of a murky subject).

Unified Memory and Streams in C

I am trying to use streams with CUDA 6 and unified memory in C. My previous stream implementation looked like this:
for(x=0; x<DSIZE; x+=N*2){
    gpuErrchk(cudaMemcpyAsync(array_d0, array_h+x, N*sizeof(char), cudaMemcpyHostToDevice, stream0));
    gpuErrchk(cudaMemcpyAsync(array_d1, array_h+x+N, N*sizeof(char), cudaMemcpyHostToDevice, stream1));
    gpuErrchk(cudaMemcpyAsync(data_d0, data_h, wrap->size*sizeof(int), cudaMemcpyHostToDevice, stream0));
    gpuErrchk(cudaMemcpyAsync(data_d1, data_h, wrap->size*sizeof(int), cudaMemcpyHostToDevice, stream1));

    searchGPUModified<<<N/128,128,0,stream0>>>(data_d0, array_d0, out_d0);
    searchGPUModified<<<N/128,128,0,stream1>>>(data_d1, array_d1, out_d1);

    gpuErrchk(cudaMemcpyAsync(out_h+x, out_d0, N * sizeof(int), cudaMemcpyDeviceToHost, stream0));
    gpuErrchk(cudaMemcpyAsync(out_h+x+N, out_d1, N * sizeof(int), cudaMemcpyDeviceToHost, stream1));
}
but I cannot find an example of streams and unified memory, using the same technique, where chunks of data are sent to the GPU. I am thus wondering if there is a way to do this?
You should read section J.2.2 of the programming guide (and preferably all of appendix J).
With Unified Memory, memory allocated using cudaMallocManaged is by default attached to all streams ("global"), and we must modify this in order to make effective use of streams, e.g. for compute/copy overlap. We can do this with the cudaStreamAttachMemAsync function, as described in section J.2.2.3. By associating each memory "chunk" with a stream in this fashion, the UM subsystem can make intelligent decisions about when to transfer each data item.
The following example demonstrates this:
#include <stdio.h>
#include <string.h>
#include <time.h>
#define DSIZE 1048576
#define DWAIT 100000ULL
#define nTPB 256

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    msg, cudaGetErrorString(__err), \
                    __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

typedef int mytype;

__global__ void mykernel(mytype *data){
    int idx = threadIdx.x+blockDim.x*blockIdx.x;
    if (idx < DSIZE) data[idx] = 1;
    unsigned long long int tstart = clock64();
    while (clock64() < tstart + DWAIT);
}

int main(){
    mytype *data1, *data2, *data3;
    cudaStream_t stream1, stream2, stream3;

    cudaMallocManaged(&data1, DSIZE*sizeof(mytype));
    cudaMallocManaged(&data2, DSIZE*sizeof(mytype));
    cudaMallocManaged(&data3, DSIZE*sizeof(mytype));
    cudaCheckErrors("cudaMallocManaged fail");

    cudaStreamCreate(&stream1);
    cudaStreamCreate(&stream2);
    cudaStreamCreate(&stream3);
    cudaCheckErrors("cudaStreamCreate fail");

    cudaStreamAttachMemAsync(stream1, data1);
    cudaStreamAttachMemAsync(stream2, data2);
    cudaStreamAttachMemAsync(stream3, data3);
    cudaDeviceSynchronize();
    cudaCheckErrors("cudaStreamAttach fail");

    memset(data1, 0, DSIZE*sizeof(mytype));
    memset(data2, 0, DSIZE*sizeof(mytype));
    memset(data3, 0, DSIZE*sizeof(mytype));

    mykernel<<<(DSIZE+nTPB-1)/nTPB, nTPB, 0, stream1>>>(data1);
    mykernel<<<(DSIZE+nTPB-1)/nTPB, nTPB, 0, stream2>>>(data2);
    mykernel<<<(DSIZE+nTPB-1)/nTPB, nTPB, 0, stream3>>>(data3);
    cudaDeviceSynchronize();
    cudaCheckErrors("kernel fail");

    for (int i = 0; i < DSIZE; i++){
        if (data1[i] != 1) {printf("data1 mismatch at %d, should be: %d, was: %d\n", i, 1, data1[i]); return 1;}
        if (data2[i] != 1) {printf("data2 mismatch at %d, should be: %d, was: %d\n", i, 1, data2[i]); return 1;}
        if (data3[i] != 1) {printf("data3 mismatch at %d, should be: %d, was: %d\n", i, 1, data3[i]); return 1;}
    }
    printf("Success!\n");
    return 0;
}
The above program creates a kernel that runs artificially long using clock64(), so as to give us a simulated opportunity for compute/copy overlap (simulating a compute-intensive kernel). We are launching 3 instances of this kernel, each instance operating on a separate "chunk" of data.
When we profile the above program, the following is seen:
First, note that the 3rd kernel launch is highlighted in yellow, and it begins immediately after the second kernel launch highlighted in purple. The actual cudaLaunch runtime API event that launches this 3rd kernel is indicated in the runtime API line by the mouse pointer, also highlighted in yellow (and is preceded by the cudaLaunch events for the first 2 kernels). Since this launch happens during execution of the first kernel, and there is no intervening "empty space" from that point until the start of the 3rd kernel, we can observe that the transfer of the data for the 3rd kernel launch (i.e. data3) occurred while kernels 1 and 2 were executing. Therefore we have effective overlap of copy and compute. (We could make a similar observation about kernel 2).
Although I haven't shown it here, if we omit the cudaStreamAttachMemAsync lines, the program still compiles and runs correctly, but if we profile it, we observe a different relationship between the cudaLaunch events and the kernels. The overall profile looks similar, and the kernels are executing back to back, but the entire cudaLaunch process now begins and ends before the first kernel begins executing, and there are no cudaLaunch events during the kernel execution. This indicates that (since all the cudaMallocManaged memory is global) all of the data transfers are taking place prior to the first kernel launch. The program has no way to associate a "global" allocation with any particular kernel, so all such allocated memory must be transferred before the first kernel launch (even though that kernel is only using data1).

Performance drop in nppiCopyConstBorder_8u_C1R

I see a performance drop using the nppiCopyConstBorder_8u_C1R function on two different architectures (GTX480 and GTX TITAN), which also involve different CUDA versions (v5.0 and v5.5 respectively).
In the first case (GTX480 and CUDA 5.0) the execution time of the function is
T = 0.00005 seconds
In the second case (GTX TITAN and CUDA 5.5) the execution time is
T = 0.969831 seconds
I have reproduced this behaviour with the following code:
// GTX480 nvcc -lnpp -m64 -O3 --ptxas-options=-v -gencode arch=compute_20,code=sm_20 --compiler-options -use_fast_math
// GTXTITAN nvcc -lnppi -m64 -O3 --ptxas-options=-v -gencode arch=compute_35,code=sm_35 --compiler-options -use_fast_math
#include <stdlib.h>
#include <stdio.h>
// CUDA
#include <cuda.h>
#include <cuda_runtime_api.h>
// CUDA Nvidia Performance Primitives
#include <npp.h>
#include <assert.h>

#define w 256 // width
#define h 256 // height
#define b 16  // extra border
#define BORDER_TYPE 0

int main(int argc, char *argv[])
{
    // input data
    Npp8u* h_idata[w*h];
    // output data
    Npp8u* h_odata[(w+b)*(h+b)];

    /* MEMORY ALLOCATION AND INITIAL COPY OF DATA FROM CPU TO GPU */
    Npp8u *i_devPtr, *i_devPtr_Border;

    // size of the input data
    int d_Size = w * h * sizeof(Npp8u);
    // allocate input data
    CUDA_CHECK_RETURN( cudaMalloc( (void**) &i_devPtr, d_Size ) );
    // copy initial data to GPU
    CUDA_CHECK_RETURN( cudaMemcpy(i_devPtr, h_idata, d_Size, cudaMemcpyHostToDevice) );

    // size of the output data
    int d_Size_o = (w+b) * (h+b) * sizeof(Npp8u);
    // allocation for input data with extended border
    CUDA_CHECK_RETURN( cudaMalloc( (void**) &i_devPtr_Border, d_Size_o ) );

    // create struct with ROI size given the current mask
    NppiSize SizeROI = {w, h};
    NppiSize SizeROI_Border = { SizeROI.width + b, SizeROI.height + b };

    // create events
    cudaEvent_t start, stop;
    cudaEventCreate( &start );
    cudaEventCreate( &stop );

    // NPP Library Copy Constant Border
    cudaEventRecord( start, 0 );
    NppStatus eStatusNPP = nppiCopyConstBorder_8u_C1R(i_devPtr, SizeROI.width, SizeROI,
                                                      i_devPtr_Border, SizeROI_Border.width, SizeROI_Border,
                                                      b, b, BORDER_TYPE);
    cudaDeviceSynchronize();
    assert( NPP_NO_ERROR == eStatusNPP );
    cudaEventRecord( stop, 0 );
    cudaEventSynchronize( stop );

    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    printf("T= %1.5f sg\n", milliseconds / 1000.0f);

    // copy output data from GPU
    CUDA_CHECK_RETURN( cudaMemcpy(h_odata, i_devPtr_Border, d_Size_o, cudaMemcpyDeviceToHost) );

    /* free resources */
    cudaFree(i_devPtr);
    cudaFree(i_devPtr_Border);
    CUDA_CHECK_RETURN(cudaDeviceReset());
    return 0;
}
Q: Is anyone aware of this issue?
This makes me ask the following question:
Q: How is nppiCopyConstBorder_8u_C1R implemented? Does the function involve copy data from device to host, extend the border in the host and copy the result to the device?
PS: The machine with the TITAN has the GPU outside the box, on a separate motherboard specially designed for multiple PCIe connections, connected via a PCIe cable. I have not seen any drawback in this configuration with the other kernels I have tested.
I think you will find that the only difference is when/where API latencies are being accounted for during program execution, and that the underlying NPP function itself doesn't have a vastly different performance between the two CUDA versions and GPU architectures.
My evidence for this hypothesis is this version of the code you posted:
#include <stdlib.h>
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <npp.h>
#include <assert.h>

#define w 256 // width
#define h 256 // height
#define b 16  // extra border
#define BORDER_TYPE 0

#define CUDA_CHECK_RETURN(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

int main(int argc, char *argv[])
{
    Npp8u* h_idata[w*h];
    Npp8u* h_odata[(w+b)*(h+b)];
    Npp8u *i_devPtr, *i_devPtr_Border;

    int d_Size = w * h * sizeof(Npp8u);
    CUDA_CHECK_RETURN( cudaMalloc( (void**) &i_devPtr, d_Size ) );
    CUDA_CHECK_RETURN( cudaMemcpy(i_devPtr, h_idata, d_Size, cudaMemcpyHostToDevice) );

    int d_Size_o = (w+b) * (h+b) * sizeof(Npp8u);
    CUDA_CHECK_RETURN( cudaMalloc( (void**) &i_devPtr_Border, d_Size_o ) );

    NppiSize SizeROI = {w, h};
    NppiSize SizeROI_Border = { SizeROI.width + b, SizeROI.height + b };
    NppStatus eStatusNPP;

#ifdef __WARMUP_CALL__
    // Warm up call to nppi function
    eStatusNPP = nppiCopyConstBorder_8u_C1R(i_devPtr, SizeROI.width, SizeROI,
                                            i_devPtr_Border, SizeROI_Border.width, SizeROI_Border,
                                            b, b, BORDER_TYPE);
    assert( NPP_NO_ERROR == eStatusNPP );
    CUDA_CHECK_RETURN( cudaDeviceSynchronize() );
#endif

    // Call for timing
    cudaEvent_t start, stop;
    CUDA_CHECK_RETURN( cudaEventCreate( &start ) );
    CUDA_CHECK_RETURN( cudaEventCreate( &stop ) );

    CUDA_CHECK_RETURN( cudaEventRecord( start, 0 ) );
    eStatusNPP = nppiCopyConstBorder_8u_C1R(i_devPtr, SizeROI.width, SizeROI,
                                            i_devPtr_Border, SizeROI_Border.width, SizeROI_Border,
                                            b, b, BORDER_TYPE);
    assert( NPP_NO_ERROR == eStatusNPP );
    CUDA_CHECK_RETURN( cudaEventRecord( stop, 0 ) );
    CUDA_CHECK_RETURN( cudaEventSynchronize( stop ) );

    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    printf("T= %1.5f sg\n", milliseconds / 1000.0f);

    CUDA_CHECK_RETURN( cudaMemcpy(h_odata, i_devPtr_Border, d_Size_o, cudaMemcpyDeviceToHost) );

    cudaFree(i_devPtr);
    cudaFree(i_devPtr_Border);
    CUDA_CHECK_RETURN(cudaDeviceReset());
    return 0;
}
Note the warm-up call to nppiCopyConstBorder_8u_C1R before the timed call. When I run it (CUDA 5.5 with Linux on an sm_30 device), I see this:
~$ nvcc -lnppi -m64 -O3 --ptxas-options=-v -gencode arch=compute_30,code=sm_30 --compiler-options -use_fast_math pqb.cc
~$ ./a.out
T= 0.39670 sg
~$ nvcc -lnppi -m64 -O3 --ptxas-options=-v -gencode arch=compute_30,code=sm_30 --compiler-options -use_fast_math -D__WARMUP_CALL__ pqb.cc
~$ ./a.out
T= 0.00002 sg
i.e. adding a warm-up call totally changes the timed performance of the function. When I look at the API trace from nvprof, I see that both npp function calls take about 6 microseconds. However, the CUDA launch for the first call takes hundreds of milliseconds, whereas the second call takes about 12 microseconds.
So, as I mentioned in an earlier comment, there is some lazy process which is getting included in the timing of the CUDA 5.5 on Titan case that probably isn't present in the CUDA 5.0 on Fermi case. That isn't a feature of npp though, as I would guess that the performance of the actual function is as fast or faster on the Titan than on the Fermi card.
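A related idiom (a sketch, not part of the answer's code above) is to force the lazy context initialization explicitly before starting any timing, rather than relying on a warm-up call to the library function; cudaFree(0) after cudaSetDevice is a conventional way to trigger it:
// Sketch: force CUDA context creation up front so one-time lazy
// initialization is not attributed to the first timed call.
#include <stdio.h>
#include <cuda_runtime.h>

int main(void)
{
    cudaSetDevice(0);          // select the device explicitly
    cudaFree(0);               // harmless call that forces context creation
    cudaDeviceSynchronize();   // make sure initialization has completed

    // ... create events, call nppiCopyConstBorder_8u_C1R, and time it here ...
    return 0;
}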

Simple adding of two ints in CUDA, result always the same

I'm starting my journey to learn CUDA. I am playing with some hello-world-type CUDA code, but it's not working, and I'm not sure why.
The code is very simple: take two ints, add them on the GPU, and return the result, but no matter what I change the numbers to I get the same result. (If math worked that way I would have done a lot better in the subject than I actually did.)
Here's the sample code:
// CUDA-C includes
#include <cuda.h>
#include <stdio.h>

__global__ void add( int a, int b, int *c ) {
    *c = a + b;
}

extern "C"
void runCudaPart();

// Main cuda function
void runCudaPart() {
    int c;
    int *dev_c;
    cudaMalloc( (void**)&dev_c, sizeof(int) );
    add<<<1,1>>>( 1, 4, dev_c );
    cudaMemcpy( &c, dev_c, sizeof(int), cudaMemcpyDeviceToHost );
    printf( "1 + 4 = %d\n", c );
    cudaFree( dev_c );
}
The output seems a bit off: 1 + 4 = -1065287167
I'm working on setting up my environment and just wanted to know if there is a problem with the code; otherwise it's probably my environment.
Update: I tried to add some code to show the error, but I don't get any error output, and the number still changes (is it outputting error codes instead of answers? Even if I don't do any work in the kernel other than assign a variable, I still get similar results).
// CUDA-C includes
#include <cuda.h>
#include <stdio.h>

__global__ void add( int a, int b, int *c ) {
    //*c = a + b;
    *c = 5;
}

extern "C"
void runCudaPart();

// Main cuda function
void runCudaPart() {
    int c;
    int *dev_c;
    cudaError_t err = cudaMalloc( (void**)&dev_c, sizeof(int) );
    if(err != cudaSuccess){
        printf("The error is %s", cudaGetErrorString(err));
    }
    add<<<1,1>>>( 1, 4, dev_c );
    cudaError_t err2 = cudaMemcpy( &c, dev_c, sizeof(int), cudaMemcpyDeviceToHost );
    if(err2 != cudaSuccess){
        printf("The error is %s", cudaGetErrorString(err));
    }
    printf( "1 + 4 = %d\n", c );
    cudaFree( dev_c );
}
The code appears to be fine, so maybe it's related to my setup. It's been a nightmare to get CUDA installed on OS X Lion, but I thought it worked, as the examples in the SDK seemed to be fine. The steps I took so far: go to the NVIDIA website and download the latest Mac releases of the driver, toolkit and SDK. I then added export DYLD_LIBRARY_PATH=/usr/local/cuda/lib:$DYLD_LIBRARY_PATH and PATH=/usr/local/cuda/bin:$PATH. I ran deviceQuery and it passed with the following info about my system:
[deviceQuery] starting...
/Developer/GPU Computing/C/bin/darwin/release/deviceQuery Starting...
CUDA Device Query (Runtime API) version (CUDART static linking)
Found 1 CUDA Capable device(s)
Device 0: "GeForce 320M"
CUDA Driver Version / Runtime Version 4.2 / 4.2
CUDA Capability Major/Minor version number: 1.2
Total amount of global memory: 253 MBytes (265027584 bytes)
( 6) Multiprocessors x ( 8) CUDA Cores/MP: 48 CUDA Cores
GPU Clock rate: 950 MHz (0.95 GHz)
Memory Clock rate: 1064 Mhz
Memory Bus Width: 128-bit
Max Texture Dimension Size (x,y,z) 1D=(8192), 2D=(65536,32768), 3D=(2048,2048,2048)
Max Layered Texture Size (dim) x layers 1D=(8192) x 512, 2D=(8192,8192) x 512
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 16384 bytes
Total number of registers available per block: 16384
Warp size: 32
Maximum number of threads per multiprocessor: 1024
Maximum number of threads per block: 512
Maximum sizes of each dimension of a block: 512 x 512 x 64
Maximum sizes of each dimension of a grid: 65535 x 65535 x 1
Maximum memory pitch: 2147483647 bytes
Texture alignment: 256 bytes
Concurrent copy and execution: Yes with 1 copy engine(s)
Run time limit on kernels: Yes
Integrated GPU sharing Host Memory: Yes
Support host page-locked memory mapping: Yes
Concurrent kernel execution: No
Alignment requirement for Surfaces: Yes
Device has ECC support enabled: No
Device is using TCC driver mode: No
Device supports Unified Addressing (UVA): No
Device PCI Bus ID / PCI location ID: 4 / 0
Compute Mode:
< Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 4.2, CUDA Runtime Version = 4.2, NumDevs = 1, Device = GeForce 320M
[deviceQuery] test results...
PASSED
UPDATE: What's really weird is that even if I remove all the work in the kernel I still get a result for c. I have reinstalled CUDA and used make on the examples, and all of them pass.
Basically there are two problems here:
You are not compiling the kernel for the correct architecture (gleaned from comments)
Your code contains imperfect error checking, which misses the point at which the runtime error occurs, leading to mysterious and unexplained symptoms.
In the runtime API, most context related actions are performed "lazily". When you launch a kernel for the first time, the runtime API will invoke code to intelligently find a suitable CUBIN image from inside the fat binary image emitted by the toolchain for the target hardware and load it into the context. This can also include JIT recompilation of PTX for a backwards compatible architecture, but not the other way around. So if you had a kernel compiled for a compute capability 1.2 device and you run it on a compute capability 2.0 device, the driver can JIT compile the PTX 1.x code it contains for the newer architecture. But the reverse doesn't work. So in your example, the runtime API will generate an error because it cannot find a usable binary image in the CUDA fatbinary image embedded in the executable. The error message is pretty cryptic, but you will get an error (see this question for a bit more information).
If your code contained error checking like this:
cudaError_t err = cudaMalloc( (void**)&dev_c, sizeof(int) );
if(err != cudaSuccess){
printf("The error is %s", cudaGetErrorString(err));
}
add<<<1,1>>>( 1, 4, dev_c );
if (cudaPeekAtLastError() != cudaSuccess) {
printf("The error is %s", cudaGetErrorString(cudaGetLastError()));
}
cudaError_t err2 = cudaMemcpy( &c, dev_c, sizeof(int), cudaMemcpyDeviceToHost );
if(err2 != cudaSuccess){
printf("The error is %s", cudaGetErrorString(err));
}
the extra error checking after the kernel launch should catch the runtime API error generated by the kernel load/launch failure.
#include <stdio.h>
#include <conio.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

__global__ void Addition(int *a, int *b, int *c)
{
    *c = *a + *b;
}

int main()
{
    int a, b, c;
    int *dev_a, *dev_b, *dev_c;
    int size = sizeof(int);

    cudaMalloc((void**)&dev_a, size);
    cudaMalloc((void**)&dev_b, size);
    cudaMalloc((void**)&dev_c, size);

    a = 5, b = 6;
    cudaMemcpy(dev_a, &a, sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, &b, sizeof(int), cudaMemcpyHostToDevice);

    Addition<<< 1,1 >>>(dev_a, dev_b, dev_c);
    cudaMemcpy(&c, dev_c, size, cudaMemcpyDeviceToHost);

    // free the device pointers themselves, not their addresses
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);

    printf("%d\n", c);
    getch();
    return 0;
}