Performance drop in nppiCopyConstBorder_8u_C1R - cuda

I see a performance drop using the nppiCopyConstBorder_8u_C1R function on two different architectures (GTX 480 and GTX TITAN), which also involve different CUDA versions (v5.0 and v5.5 respectively).
In the first case (GTX480 and CUDA 5.0) the execution time of the function is
T = 0.00005 seconds
In the second case (GTX TITAN and CUDA 5.5) the execution time is
T = 0.969831 seconds
I have reproduced this behaviour with the following code:
// GTX480 nvcc -lnpp -m64 -O3 --ptxas-options=-v -gencode arch=compute_20,code=sm_20 --compiler-options -use_fast_math
// GTXTITAN nvcc -lnppi -m64 -O3 --ptxas-options=-v -gencode arch=compute_35,code=sm_35 --compiler-options -use_fast_math
#include <stdlib.h>
#include <stdio.h>
// CUDA
#include <cuda.h>
#include <cuda_runtime_api.h>
// CUDA Nvidia Performance Primitives
#include <npp.h>
#include <assert.h>
// CUDA error checking macro (same definition as in the answer below)
#define CUDA_CHECK_RETURN(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
#define w 256 // width
#define h 256 // height
#define b 16 // extra border
#define BORDER_TYPE 0
int main(int argc, char *argv[])
{
// input data
Npp8u h_idata[w*h];
// output data
Npp8u h_odata[(w+b)*(h+b)];
/* MEMORY ALLOCATION AND INITIAL COPY OF DATA FROM CPU TO GPU */
Npp8u *i_devPtr, *i_devPtr_Border;
// size of the input data
int d_Size = w * h * sizeof(Npp8u);
// allocate input data
CUDA_CHECK_RETURN( cudaMalloc( (void**) &i_devPtr, d_Size ) );
// copy initial data to GPU
CUDA_CHECK_RETURN( cudaMemcpy(i_devPtr, h_idata, d_Size, cudaMemcpyHostToDevice) );
// size of the output data
int d_Size_o = (w+b) * (h+b) * sizeof(Npp8u);
// allocation for input data with extended border
CUDA_CHECK_RETURN( cudaMalloc( (void**) &i_devPtr_Border, d_Size_o ) );
// create struct with ROI size given the current mask
NppiSize SizeROI = {w, h};
NppiSize SizeROI_Border = { SizeROI.width + b, SizeROI.height + b };
// create events
cudaEvent_t start, stop;
cudaEventCreate( &start );
cudaEventCreate( &stop );
// NPP Library Copy Constant Border
cudaEventRecord( start, 0 );
NppStatus eStatusNPP = nppiCopyConstBorder_8u_C1R(i_devPtr,SizeROI.width, SizeROI,
i_devPtr_Border, SizeROI_Border.width, SizeROI_Border,
b, b, BORDER_TYPE);
cudaDeviceSynchronize();
assert( NPP_NO_ERROR == eStatusNPP );
cudaEventRecord( stop, 0 );
cudaEventSynchronize( stop );
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
printf("T= %1.5f sg\n", milliseconds / 1000.0f);
// copy output data from GPU
CUDA_CHECK_RETURN( cudaMemcpy(h_odata, i_devPtr_Border, d_Size_o, cudaMemcpyDeviceToHost) );
/* free resources */
cudaFree(i_devPtr);
cudaFree(i_devPtr_Border);
CUDA_CHECK_RETURN(cudaDeviceReset());
return 0;
}
Q: Is anyone aware of this issue?
This makes me ask the following question:
Q: How is nppiCopyConstBorder_8u_C1R implemented? Does the function involve copying data from device to host, extending the border on the host, and copying the result back to the device?
PS: The machine with the TITAN has the GPU outside the box on a separate board specially designed for multiple PCIe connections, and it's connected via a PCIe cable. I have not seen any drawback with this configuration for the other kernels I have tested.

I think you will find that the only difference is when/where API latencies are being accounted for during program execution, and that the underlying NPP function itself doesn't have a vast difference in performance between the two CUDA versions and GPU architectures.
My evidence for this hypothesis is this version of the code you posted:
#include <stdlib.h>
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <npp.h>
#include <assert.h>
#define w 256 // width
#define h 256 // height
#define b 16 // extra border
#define BORDER_TYPE 0
#define CUDA_CHECK_RETURN(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
int main(int argc, char *argv[])
{
Npp8u h_idata[w*h];
Npp8u h_odata[(w+b)*(h+b)];
Npp8u *i_devPtr, *i_devPtr_Border;
int d_Size = w * h * sizeof(Npp8u);
CUDA_CHECK_RETURN( cudaMalloc( (void**) &i_devPtr, d_Size ) );
CUDA_CHECK_RETURN( cudaMemcpy(i_devPtr, h_idata, d_Size, cudaMemcpyHostToDevice) );
int d_Size_o = (w+b) * (h+b) * sizeof(Npp8u);
CUDA_CHECK_RETURN( cudaMalloc( (void**) &i_devPtr_Border, d_Size_o ) );
NppiSize SizeROI = {w, h};
NppiSize SizeROI_Border = { SizeROI.width + b, SizeROI.height + b };
NppStatus eStatusNPP;
#ifdef __WARMUP_CALL__
// Warm up call to nppi function
eStatusNPP = nppiCopyConstBorder_8u_C1R(i_devPtr,SizeROI.width, SizeROI,
i_devPtr_Border, SizeROI_Border.width, SizeROI_Border,
b, b, BORDER_TYPE);
assert( NPP_NO_ERROR == eStatusNPP );
CUDA_CHECK_RETURN( cudaDeviceSynchronize() );
#endif
// Call for timing
cudaEvent_t start, stop;
CUDA_CHECK_RETURN( cudaEventCreate( &start ) );
CUDA_CHECK_RETURN( cudaEventCreate( &stop ) );
CUDA_CHECK_RETURN( cudaEventRecord( start, 0 ) );
eStatusNPP = nppiCopyConstBorder_8u_C1R(i_devPtr,SizeROI.width, SizeROI,
i_devPtr_Border, SizeROI_Border.width, SizeROI_Border,
b, b, BORDER_TYPE);
assert( NPP_NO_ERROR == eStatusNPP );
CUDA_CHECK_RETURN( cudaEventRecord( stop, 0 ) );
CUDA_CHECK_RETURN( cudaEventSynchronize( stop ) );
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
printf("T= %1.5f sg\n", milliseconds / 1000.0f);
CUDA_CHECK_RETURN( cudaMemcpy(h_odata, i_devPtr_Border, d_Size_o, cudaMemcpyDeviceToHost) );
cudaFree(i_devPtr);
cudaFree(i_devPtr_Border);
CUDA_CHECK_RETURN(cudaDeviceReset());
return 0;
}
Note the warm-up call to nppiCopyConstBorder_8u_C1R before the timed call. When I run it (CUDA 5.5 with Linux on an sm_30 device), I see this:
~$ nvcc -lnppi -m64 -O3 --ptxas-options=-v -gencode arch=compute_30,code=sm_30 --compiler-options -use_fast_math pqb.cc
~$ ./a.out
T= 0.39670 sg
~$ nvcc -lnppi -m64 -O3 --ptxas-options=-v -gencode arch=compute_30,code=sm_30 --compiler-options -use_fast_math -D__WARMUP_CALL__ pqb.cc
~$ ./a.out
T= 0.00002 sg
i.e. adding a warm-up call totally changes the timed performance of the function. When I look at the API trace from nvprof, I see that both NPP function calls take about 6 microseconds. However, the CUDA launch for the first call takes hundreds of milliseconds, whereas for the second call it takes about 12 microseconds.
So, as I mentioned in an earlier comment, there is some lazy process which is getting included in the timing of the CUDA 5.5 / Titan case that probably isn't present in the CUDA 5.0 / Fermi case. That isn't a property of NPP though; my guess is that the performance of the actual function is at least as fast on the Titan as on the Fermi card.
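For reference, the API trace mentioned above can be captured with something like:
~$ nvprof --print-api-trace ./a.out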

Related

Cuda - nvcc - No kernel image is available for execution on the device. What is the problem?

I'm trying to use nvcc with the most simple example, but it doesn't work correctly. I'm compiling and executing the example from https://devblogs.nvidia.com/easy-introduction-cuda-c-and-c/, but my server can't execute the global function. I rewrote the code to get an error message, and I receive the following:
"no kernel image is available for execution on the device"
My GPU is a Quadro 6000 and the CUDA version is 9.0.
#include <stdio.h>
#include <cuda_runtime.h>
__global__ void saxpy(int n, float a, float *x, float *y)
{
int i = blockIdx.x*blockDim.x + threadIdx.x;
y[i] = 10.0; //a*x[i] + y[i];
}
int main(int argc, char *argv[])
{
int N = 120;
int nDevices;
float *x, *y, *d_x, *d_y;
cudaError_t err = cudaGetDeviceCount(&nDevices);
if (err != cudaSuccess)
printf("%s\n", cudaGetErrorString(err));
else
printf("Number of devices %d\n", nDevices);
x = (float*)malloc(N*sizeof(float));
y = (float*)malloc(N*sizeof(float));
cudaMalloc(&d_x, N*sizeof(float));
cudaMalloc(&d_y, N*sizeof(float));
for (int i = 0; i < N; i++) {
x[i] = 1.0f;
y[i] = 2.0f;
}
cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, y, N*sizeof(float), cudaMemcpyHostToDevice);
// Perform SAXPY on 1M elements
saxpy<<<1, 1>>>(N, 2.0f, d_x, d_y);
cudaDeviceSynchronize();
err = cudaMemcpy(y, d_y, N*sizeof(float), cudaMemcpyDeviceToHost);
printf("%s\n",cudaGetErrorString(err));
cudaError_t errSync = cudaGetLastError();
cudaError_t errAsync = cudaDeviceSynchronize();
if (errSync != cudaSuccess)
printf("Sync kernel error: %s\n", cudaGetErrorString(errSync));
if (errAsync != cudaSuccess)
printf("Async kernel error: %s\n", cudaGetErrorString(errAsync));
cudaFree(d_x);
cudaFree(d_y);
free(x);
free(y);
}"
Execution command
bash-4.1$ nvcc -o sapx simples_cuda.cu
bash-4.1$ ./sapx
Number of devices 1
no error
Sync kernel error: no kernel image is available for execution on the device
GPUs of compute capability less than 2.0 are only supported by CUDA toolkits of version 6.5 and older.
GPUs of compute capability less than 3.0 (but greater than or equal to 2.0) are only supported by CUDA toolkits of version 8.0 and older.
Your Quadro 6000 is a compute capability 2.0 GPU. This can be determined programmatically with the deviceQuery CUDA sample code, or via a Google search. It is not supported by CUDA 9.0.
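For reference, a minimal sketch of querying the compute capability programmatically (essentially what deviceQuery reports):
#include <stdio.h>
#include <cuda_runtime.h>
int main()
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);   // properties of device 0
    printf("%s: compute capability %d.%d\n", prop.name, prop.major, prop.minor);
    return 0;
}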
You should add the compute capability of your video card as a parameter to the nvcc compiler. In my case (Windows / Visual Studio 2017) I set this in the Code Generation field. So, as #einpoklum answered before, add the gencode parameter like this: -gencode arch=${COMPUTE_CAPABILITY},code=${SM_CAPABILITY}, where ${COMPUTE_CAPABILITY} and ${SM_CAPABILITY} belong to the following pairs (you can add them all, as VS2017 does):
{COMPUTE_CAPABILITY},{SM_CAPABILITY}
compute_35,sm_35
compute_37,sm_37
compute_50,sm_50
compute_52,sm_52
compute_60,sm_60
compute_61,sm_61
compute_70,sm_70
compute_75,sm_75
compute_80,sm_80
D:\Program Files\nVidia\CUDA Samples\MySamples\IntroToCUDA_1\IntroToCUDA_1>"D:\Program Files\nVidia\GPU Computing Toolkit\CUDA\v11.0\bin\nvcc.exe" -gencode=arch=compute_35,code=\"sm_35,compute_35\" -gencode=arch=compute_37,code=\"sm_37,compute_37\" -gencode=arch=compute_50,code=\"sm_50,compute_50\" -gencode=arch=compute_52,code=\"sm_52,compute_52\" -gencode=arch=compute_60,code=\"sm_60,compute_60\" -gencode=arch=compute_61,code=\"sm_61,compute_61\" -gencode=arch=compute_70,code=\"sm_70,compute_70\" -gencode=arch=compute_75,code=\"sm_75,compute_75\" -gencode=arch=compute_80,code=\"sm_80,compute_80\" --use-local-env -ccbin "D:\Program Files (x86)\Microsoft Visual Studio\2017\Enterprise\VC\Tools\MSVC\14.16.27023\bin\HostX86\x64" -x cu -I"D:\Program Files\nVidia\GPU Computing Toolkit\CUDA\v11.0\include" -I"D:\Program Files\nVidia\GPU Computing Toolkit\CUDA\v11.0\include" -G --keep-dir x64\Debug -maxrregcount=0 --machine 64 --compile -cudart static -g -D_DEBUG -D_CONSOLE -D_UNICODE -DUNICODE -Xcompiler "/EHsc /W3 /nologo /Od /Fdx64\Debug\vc141.pdb /FS /Zi /RTC1 /MDd " -o x64\Debug\IntroToCUDA_1.cu.obj "D:\Program Files\nVidia\CUDA Samples\MySamples\IntroToCUDA_1\IntroToCUDA_1\IntroToCUDA_1.cu"
You can check the compute capability of your video card with the deviceQuery example found in the CUDA Samples SDK.
Adding to #RobertCrovella's answer:
When compiling with nvcc, you should always set appropriate flags to generate code for the microarchitecture / compute capability you intend to run on. For example: -gencode arch=compute_${COMPUTE_CAPABILITY},code=compute_${COMPUTE_CAPABILITY}, with, say, COMPUTE_CAPABILITY=61.
Read nvcc --help for more information on these flags (although, to be honest, it's a bit of a murky subject).
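Putting the answers above together: this particular card (compute capability 2.0) needs a CUDA 8.0 or older toolkit, and a possible compile line for it (assuming such a toolkit is installed) would be:
nvcc -gencode arch=compute_20,code=sm_20 -o sapx simples_cuda.cu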

Unified Memory and Streams in C

I am trying to use streams with CUDA 6 and unified memory in C. My previous stream implementation looked like this:
for(x=0; x<DSIZE; x+=N*2){
gpuErrchk(cudaMemcpyAsync(array_d0, array_h+x, N*sizeof(char), cudaMemcpyHostToDevice, stream0));
gpuErrchk(cudaMemcpyAsync(array_d1, array_h+x+N, N*sizeof(char), cudaMemcpyHostToDevice, stream1));
gpuErrchk(cudaMemcpyAsync(data_d0, data_h, wrap->size*sizeof(int), cudaMemcpyHostToDevice, stream0));
gpuErrchk(cudaMemcpyAsync(data_d1, data_h, wrap->size*sizeof(int), cudaMemcpyHostToDevice, stream1));
searchGPUModified<<<N/128,128,0,stream0>>>(data_d0, array_d0, out_d0 );
searchGPUModified<<<N/128,128,0,stream1>>>(data_d1, array_d1, out_d1);
gpuErrchk(cudaMemcpyAsync(out_h+x, out_d0 , N * sizeof(int), cudaMemcpyDeviceToHost, stream0));
gpuErrchk(cudaMemcpyAsync(out_h+x+N, out_d1 ,N * sizeof(int), cudaMemcpyDeviceToHost, stream1));
}
but I cannot find an example of streams and unified memory, using the same technique, where chunks of data are sent to the GPU. I am thus wondering if there is a way to do this?
You should read section J.2.2 of the programming guide (and preferably all of appendix J).
With Unified Memory, memory allocated using cudaMallocManaged is by default attached to all streams ("global") and we must modify this in order to make effective use of streams, e.g. for compute/copy overlap. We can do this with the cudaStreamAttachMemAsync function as described in section J.2.2.3. By associating each memory "chunk" with a stream in this fashion, the UM subsystem can make intelligent decisions about when to transfer each data item.
The following example demonstrates this:
#include <stdio.h>
#include <time.h>
#define DSIZE 1048576
#define DWAIT 100000ULL
#define nTPB 256
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
typedef int mytype;
__global__ void mykernel(mytype *data){
int idx = threadIdx.x+blockDim.x*blockIdx.x;
if (idx < DSIZE) data[idx] = 1;
unsigned long long int tstart = clock64();
while (clock64() < tstart + DWAIT);
}
int main(){
mytype *data1, *data2, *data3;
cudaStream_t stream1, stream2, stream3;
cudaMallocManaged(&data1, DSIZE*sizeof(mytype));
cudaMallocManaged(&data2, DSIZE*sizeof(mytype));
cudaMallocManaged(&data3, DSIZE*sizeof(mytype));
cudaCheckErrors("cudaMallocManaged fail");
cudaStreamCreate(&stream1);
cudaStreamCreate(&stream2);
cudaStreamCreate(&stream3);
cudaCheckErrors("cudaStreamCreate fail");
cudaStreamAttachMemAsync(stream1, data1);
cudaStreamAttachMemAsync(stream2, data2);
cudaStreamAttachMemAsync(stream3, data3);
cudaDeviceSynchronize();
cudaCheckErrors("cudaStreamAttach fail");
memset(data1, 0, DSIZE*sizeof(mytype));
memset(data2, 0, DSIZE*sizeof(mytype));
memset(data3, 0, DSIZE*sizeof(mytype));
mykernel<<<(DSIZE+nTPB-1)/nTPB, nTPB, 0, stream1>>>(data1);
mykernel<<<(DSIZE+nTPB-1)/nTPB, nTPB, 0, stream2>>>(data2);
mykernel<<<(DSIZE+nTPB-1)/nTPB, nTPB, 0, stream3>>>(data3);
cudaDeviceSynchronize();
cudaCheckErrors("kernel fail");
for (int i = 0; i < DSIZE; i++){
if (data1[i] != 1) {printf("data1 mismatch at %d, should be: %d, was: %d\n", i, 1, data1[i]); return 1;}
if (data2[i] != 1) {printf("data2 mismatch at %d, should be: %d, was: %d\n", i, 1, data2[i]); return 1;}
if (data3[i] != 1) {printf("data3 mismatch at %d, should be: %d, was: %d\n", i, 1, data3[i]); return 1;}
}
printf("Success!\n");
return 0;
}
The above program creates a kernel that runs artificially long using clock64(), so as to give us a simulated opportunity for compute/copy overlap (simulating a compute-intensive kernel). We are launching 3 instances of this kernel, each instance operating on a separate "chunk" of data.
When we profile the above program, the following is seen:
First, note that the 3rd kernel launch is highlighted in yellow, and it begins immediately after the second kernel launch highlighted in purple. The actual cudaLaunch runtime API event that launches this 3rd kernel is indicated in the runtime API line by the mouse pointer, also highlighted in yellow (and is preceded by the cudaLaunch events for the first 2 kernels). Since this launch happens during execution of the first kernel, and there is no intervening "empty space" from that point until the start of the 3rd kernel, we can observe that the transfer of the data for the 3rd kernel launch (i.e. data3) occurred while kernels 1 and 2 were executing. Therefore we have effective overlap of copy and compute. (We could make a similar observation about kernel 2).
Although I haven't shown it here, if we omit the cudaStreamAttachMemAsync lines, the program still compiles and runs correctly, but if we profile it, we observe a different relationship between the cudaLaunch events and the kernels. The overall profile looks similar, and the kernels are executing back to back, but the entire cudaLaunch process now begins and ends before the first kernel begins executing, and there are no cudaLaunch events during the kernel execution. This indicates that (since all the cudaMallocManaged memory is global) all of the data transfers are taking place prior to the first kernel launch. The program has no way to associate a "global" allocation with any particular kernel, so all such allocated memory must be transferred before the first kernel launch (even though that kernel is only using data1).

Multi-gpu CUDA Thrust

I have CUDA C++ code that uses Thrust and currently works properly on a single GPU. I'd now like to modify it for multi-GPU. I have a host function that includes a number of Thrust calls that sort, copy, calculate differences, etc. on device arrays. I want each GPU to run this sequence of Thrust calls on its own (independent) set of arrays at the same time. I've read that Thrust functions that return values are synchronous, but can I use OpenMP to have each host thread call a function (with Thrust calls) that runs on a separate GPU?
For example (coded in browser):
#pragma omp parallel for
for (int dev=0; dev<Ndev; dev++){
cudaSetDevice(dev);
runthrustfunctions(dev);
}
void runthrustfunctions(int dev){
/*lots of Thrust functions running on device arrays stored on corresponding GPU*/
//for example this is just a few of the lines"
thrust::device_ptr<double> pos_ptr = thrust::device_pointer_cast(particle[dev].pos);
thrust::device_ptr<int> list_ptr = thrust::device_pointer_cast(particle[dev].list);
thrust::sequence(list_ptr,list_ptr+length);
thrust::sort_by_key(pos_ptr, pos_ptr+length,list_ptr);
thrust::device_vector<double> temp(length);
thrust::gather(list_ptr,list_ptr+length,pos_ptr,temp.begin());
thrust::copy(temp.begin(), temp.end(), pos_ptr);
}
I think I also need the structure "particle[0]" to be stored on GPU 0, particle[1] on GPU 1, etc., and my guess is this is not possible. An option might be to use "switch" with separate code for each GPU case.
I'd like to know if this is a correct approach or if there is a better way?
Thanks
Yes, you can combine thrust and OpenMP.
Here's a complete worked example with results:
$ cat t340.cu
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/copy.h>
#include <time.h>
#include <sys/time.h>
#define DSIZE 200000000
using namespace std;
int main(int argc, char *argv[])
{
timeval t1, t2;
int num_gpus = 0; // number of CUDA GPUs
printf("%s Starting...\n\n", argv[0]);
// determine the number of CUDA capable GPUs
cudaGetDeviceCount(&num_gpus);
if (num_gpus < 1)
{
printf("no CUDA capable devices were detected\n");
return 1;
}
// display CPU and GPU configuration
printf("number of host CPUs:\t%d\n", omp_get_num_procs());
printf("number of CUDA devices:\t%d\n", num_gpus);
for (int i = 0; i < num_gpus; i++)
{
cudaDeviceProp dprop;
cudaGetDeviceProperties(&dprop, i);
printf(" %d: %s\n", i, dprop.name);
}
printf("initialize data\n");
// initialize data
typedef thrust::device_vector<int> dvec;
typedef dvec *p_dvec;
std::vector<p_dvec> dvecs;
for(unsigned int i = 0; i < num_gpus; i++) {
cudaSetDevice(i);
p_dvec temp = new dvec(DSIZE);
dvecs.push_back(temp);
}
thrust::host_vector<int> data(DSIZE);
thrust::generate(data.begin(), data.end(), rand);
// copy data
for (unsigned int i = 0; i < num_gpus; i++) {
cudaSetDevice(i);
thrust::copy(data.begin(), data.end(), (*(dvecs[i])).begin());
}
printf("start sort\n");
gettimeofday(&t1,NULL);
// run as many CPU threads as there are CUDA devices
omp_set_num_threads(num_gpus); // create as many CPU threads as there are CUDA devices
#pragma omp parallel
{
unsigned int cpu_thread_id = omp_get_thread_num();
cudaSetDevice(cpu_thread_id);
thrust::sort((*(dvecs[cpu_thread_id])).begin(), (*(dvecs[cpu_thread_id])).end());
cudaDeviceSynchronize();
}
gettimeofday(&t2,NULL);
printf("finished\n");
unsigned long et = ((t2.tv_sec * 1000000)+t2.tv_usec) - ((t1.tv_sec * 1000000) + t1.tv_usec);
if (cudaSuccess != cudaGetLastError())
printf("%s\n", cudaGetErrorString(cudaGetLastError()));
printf("sort time = %fs\n", (float)et/(float)(1000000));
// check results
thrust::host_vector<int> result(DSIZE);
thrust::sort(data.begin(), data.end());
for (int i = 0; i < num_gpus; i++)
{
cudaSetDevice(i);
thrust::copy((*(dvecs[i])).begin(), (*(dvecs[i])).end(), result.begin());
for (int j = 0; j < DSIZE; j++)
if (data[j] != result[j]) { printf("mismatch on device %d at index %d, host: %d, device: %d\n", i, j, data[j], result[j]); return 1;}
}
printf("Success\n");
return 0;
}
$ nvcc -Xcompiler -fopenmp -O3 -arch=sm_20 -o t340 t340.cu -lgomp
$ CUDA_VISIBLE_DEVICES="0" ./t340
./t340 Starting...
number of host CPUs: 12
number of CUDA devices: 1
0: Tesla M2050
initialize data
start sort
finished
sort time = 0.398922s
Success
$ ./t340
./t340 Starting...
number of host CPUs: 12
number of CUDA devices: 4
0: Tesla M2050
1: Tesla M2070
2: Tesla M2050
3: Tesla M2070
initialize data
start sort
finished
sort time = 0.460058s
Success
$
We can see that when I restrict the program to using a single device, the sort operation takes about 0.4 seconds. Then, when I allow it to use all 4 devices (repeating the same sort on all 4 devices), the overall operation only takes 0.46 seconds, even though we're doing 4 times as much work.
For this particular case I happened to be using CUDA 5.0 with Thrust v1.7, and gcc 4.4.6 (RHEL 6.2).
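Regarding the particle[dev] part of the question: a minimal sketch (my addition, using a hypothetical Particle struct of raw device pointers, and the Ndev and length names from the question) of allocating each device's arrays after cudaSetDevice, following the same pattern as the example above:
struct Particle { double *pos; int *list; };              // hypothetical host-side struct holding device pointers
std::vector<Particle> particle(Ndev);
for (int dev = 0; dev < Ndev; dev++){
    cudaSetDevice(dev);                                   // subsequent cudaMalloc calls allocate on this GPU
    cudaMalloc(&particle[dev].pos,  length*sizeof(double));
    cudaMalloc(&particle[dev].list, length*sizeof(int));
}
Each host thread then calls cudaSetDevice(dev) before runthrustfunctions(dev), exactly as in the OpenMP loop in the question.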

CUDA Makefile Include Error

I'm attempting to write a basic matrix multiplication program using CUDA and C. The code itself doesn't really do anything right now, but should at least compile. After some research on the issue, I've determined that the issue is failure to include CUDA header files, indicating an issue with my Makefile. I'm extremely inexperienced with CUDA (and C for that matter), so any help would be greatly appreciated.
Output on command: make matrixMult1
c99 -I. -I/usr/local/cuda/include -c matrixMult1.c -o matrixMult1.o
matrixMult1.c: In function 'main':
matrixMult1.c:77: warning: implicit declaration of function 'cudaMalloc'
matrixMult1.c:82: warning: implicit declaration of function 'cudaMemcpy'
matrixMult1.c:83: error: 'cudaMemcpyHostToDevice' undeclared (first use in this
function)
matrixMult1.c:83: error: (Each undeclared identifier is reported only once
matrixMult1.c:83: error: for each function it appears in.)
matrixMult1.c:106: warning: implicit declaration of function 'cudaFree'
make: *** [matrixMult1.o] Error 1
Makefile:
GCC = c99
CUDA_INSTALL_PATH := /usr/local/cuda
INCLUDES := -I. -I$(CUDA_INSTALL_PATH)/include
CUDA_LIBS := -L$(CUDA_INSTALL_PATH)/lib -lcudart
matrixMult1.o: matrixMult1.c
$(GCC) $(INCLUDES) -c matrixMult1.c -o $@
matrixMult1: matrixMult1.o
$(GCC) -o $@ matrixMult1.o $(CUDA_LIBS)
C Program:
//********************************************************************
// matrixMult1.c
//
// A basic matrix multiplication program.
//********************************************************************
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include "cuda.h"
#define WA 3
#define HA 3
#define WB 3
#define HB WA
#define WC WB
#define HC HA
void initMatrix(float * matrix, int numIndices);
//*************
// Main Program
//*************
int main(int argc, char** argv) {
/* Set random seed */
srand(2013);
/* Compute memory sizes for matrices A, B, and C */
unsigned int sizeA = WA * HA;
unsigned int sizeB = WB * HB;
unsigned int sizeC = WC * HC;
unsigned int memoryA = sizeof(float) * sizeA;
unsigned int memoryB = sizeof(float) * sizeB;
unsigned int memoryC = sizeof(float) * sizeC;
/* Allocate memory for matrices A, B, and C */
float * matrixA = (float *) malloc(memoryA);
float * matrixB = (float *) malloc(memoryB);
float * matrixC = (float *) malloc(memoryC);
/* Initialize matrices A and B */
initMatrix(matrixA, sizeA);
initMatrix(matrixB, sizeB);
/* Print matrix A */
printf("\nMatrix A:\n");
for (int i = 0; i < sizeA; i++) {
printf("%f ", matrixA[i]);
if (((i + 1) % WA) == 0) {
printf("\n");
} else {
printf(" | ");
}
}
/* Print matrix B */
printf("\nMatrix B:\n");
for (int i = 0; i < sizeB; i++) {
printf("%f ", matrixB[i]);
if (((i + 1) % WA) == 0) {
printf("\n");
} else {
printf(" | ");
}
}
/* Allocate device memory */
float* deviceMemA;
float* deviceMemB;
float* deviceMemC;
cudaMalloc((void**) &deviceMemA, memoryA);
cudaMalloc((void**) &deviceMemB, memoryB);
cudaMalloc((void**) &deviceMemC, memoryC);
/* Copy host memory to device */
cudaMemcpy(deviceMemA, matrixA, memoryA,
cudaMemcpyHostToDevice);
cudaMemcpy(deviceMemB, matrixB, memoryB,
cudaMemcpyHostToDevice);
cudaMemcpy(deviceMemC, matrixC, memoryC,
cudaMemcpyHostToDevice);
/* Print matrix C */
printf("\nMatrix C:\n");
for (int i = 0; i < sizeC; i++) {
printf("%f ", matrixC[i]);
if (((i + 1) % WC) == 0) {
printf("\n");
} else {
printf(" | ");
}
}
printf("\n");
/* Free up memory */
free(matrixA);
free(matrixB);
free(matrixC);
cudaFree(deviceMemA);
cudaFree(deviceMemB);
cudaFree(deviceMemC);
}
//--------------------------------------------------------------------
// initMatrix - Assigns a random float value to each indice of the
// matrix.
//
// PRE: matrix is a pointer to a block of bytes in memory; numIndices
// is the number of indicies in the matrix being instantiated.
// POST: Each index of the matrix has been instantiated with a random
// float value.
//--------------------------------------------------------------------
void initMatrix(float * matrix, int numIndices) {
/*
Loop through the block of bytes, assigning a random float
for each index of the matrix
*/
for (int i = 0; i < numIndices; ++i) {
/* Assign a random float between 0 and 1 at this byte */
matrix[i] = rand() / (float)RAND_MAX;
}
}
CUDA programs need to be compiled by nvcc. While your program does not contain any CUDA kernel yet, I believe that is what you want to achieve.
Rename your file from matrixMult1.c to matrixMult1.cu, remove the #include "cuda.h" line (programs compiled with nvcc don't need any CUDA-specific includes) and compile with nvcc instead of gcc (e.g. by setting GCC = nvcc at the beginning of the Makefile).
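A minimal sketch of that change applied to the Makefile from the question (untested, assuming the file has been renamed to matrixMult1.cu):
GCC = nvcc
CUDA_INSTALL_PATH := /usr/local/cuda
INCLUDES := -I. -I$(CUDA_INSTALL_PATH)/include
CUDA_LIBS := -L$(CUDA_INSTALL_PATH)/lib -lcudart
matrixMult1.o: matrixMult1.cu
	$(GCC) $(INCLUDES) -c matrixMult1.cu -o $@
matrixMult1: matrixMult1.o
	$(GCC) -o $@ matrixMult1.o $(CUDA_LIBS)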
Two problems here:
You were not including the appropriate header into your code (which you fixed)
Your Makefile is, in fact, broken. It should look something like:
GCC = c99
CUDA_INSTALL_PATH := /usr/local/cuda
INCLUDES := -I. -I$(CUDA_INSTALL_PATH)/include
CUDA_LIBS := -L$(CUDA_INSTALL_PATH)/lib -lcudart
matrixMult1.o: matrixMult1.c
$(GCC) $(INCLUDES) -c matrixMult1.c -o $@
matrixMult1: matrixMult1.o
$(GCC) -o $@ matrixMult1.o $(CUDA_LIBS)
[Disclaimer: not tested, use at own risk]
The current problem is that the include path was only specified at the linkage phase of the build.
Note that these changes also preempt the missing symbols error you will get during linkage from not linking with the CUDA runtime library. Note that depending on whether you are using a 32 or 64 bit host OS, you may need to change the library path to $(CUDA_INSTALL_PATH)/lib64 for the linkage to work correctly.

CUDA 1D texture fetch always returns 0

I am trying to test the CUDA 1D texture with a piece of simple code. It is quite straightforward: first allocate global memory, then bind it to a texture reference; access the texture from within a kernel via tex1D(); and print out the value returned by the texture fetch.
The code is as follows:
#include "cuda.h"
#include "cuda_runtime.h"
#include <iostream>
#include <vector>
#include <cstdio>
using namespace std;
texture<float, cudaTextureType1D, cudaReadModeElementType> texX;
__global__ void kernel(float *X)
{
int i = threadIdx.x ;
if ( i >= 128 ) return;
printf("%.3f\t%.3f\n", tex1D( texX, i*1.0 ), X[i] );
}
int main()
{
float *devX;
vector<float> X(128, 3.1415926 );
cudaMalloc( &devX, 128 * sizeof(float) );
cudaMemcpy( devX, &X[0], 128*sizeof(float), cudaMemcpyDefault );
cudaDeviceSynchronize();
cudaBindTexture( (size_t)0, texX, devX, 128 * sizeof(float) );
cudaDeviceSynchronize();
kernel<<<1,128>>>( devX );
cudaDeviceSynchronize();
cout<<endl;
cout<< cudaGetErrorString( cudaGetLastError() ) <<endl;
}
But all I got was output like this:
0.000 3.142
0.000 3.142
...
0.000 3.142
no error
Could anyone explain why?
You need to use tex1Dfetch() since your texture is bound to linear memory:
printf("%.3f\t%.3f\n", tex1Dfetch( texX, i ), X[i] );
From the " CUDA C Programming Guide"
tex1D is used to fetch from the CUDA array specified by the one-dimensional texture object ...
tex1Dfetch is used to fetch from the region of linear memory specified by the one-dimension texture ...
So, if you use cudaMalloc to malloc linear memory but not cuda array, u should select tex1Dfetch
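Alternatively, if you really want tex1D(), a sketch (untested, using the same legacy texture reference texX and the host vector X from the question) of backing the texture with a CUDA array instead of linear memory:
cudaArray *arrX;
cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
cudaMallocArray( &arrX, &desc, 128 );                                    // 1D CUDA array of 128 floats
cudaMemcpyToArray( arrX, 0, 0, &X[0], 128*sizeof(float), cudaMemcpyHostToDevice );
cudaBindTextureToArray( texX, arrX, desc );                              // bind the texture reference to the array
kernel<<<1,128>>>( devX );                                               // tex1D( texX, i ) now returns the expected values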