Multiple processes sharing / reading from one large block of CUDA device memory - cuda

I have a multi-process application that shares a single GPU via the CUDA Multi-Process Service (MPS). Each process creates several device arrays; one of them is large (~5 GB) and constant, so I thought I could allocate the memory once in one process and have the other processes read from that memory block using inter-process communication (similar to the CUDA API example shown here).
Following the linked CUDA example, I tried to implement a simple test program, but I keep hitting an API error: when I call cudaIpcOpenMemHandle, I get an invalid argument error. I'm posting the code below in the hope that someone can easily spot the reason for the error, or perhaps suggest a better use of the CUDA API to accomplish what I'm trying to do.
#include <stdio.h>
#include <mpi.h>
#include <assert.h>
#include <sys/mman.h>
#define blockSize 128
#define N 1000
#define gpuErr(ans) { gpuAssert((ans), __FILE__, __LINE__); }
__global__ void kernel(double* out, double* in, double val){
    unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int thread_stride = blockDim.x * gridDim.x;
    for (int i=tid; i < N; i+=thread_stride){
        out[i] = in[i]*val;
    }
}
static void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
static void error_msg(cudaError_t err, int rank){
    if (err != cudaSuccess){
        printf("RANK %d recvd CUDA error message: %s\n", rank, cudaGetErrorString(err));
        exit(err);
    }
}
void check_access(){
    cudaDeviceProp prop;
    gpuErr(cudaGetDeviceProperties(&prop, 0));
    if (prop.unifiedAddressing)
        printf("> GPU%d = is capable of UVA\n", 0);

    // NOTE: only interested in enabling intra-device peer2peer, so I think this test doesn't matter?
    //int can_access=-1;
    //int num_dev=2;
    //// note, here I was confused, I want the ability to have a process on device 0 access
    //for (int peer_dev=0; peer_dev < num_dev; peer_dev++){
    //    // note if peer_dev is 1
    //    gpuErr(cudaDeviceCanAccessPeer(&can_access, 0, peer_dev));
    //    if (can_access)
    //        printf("device 0 has peerdev=%d access\n", peer_dev);
    //    else
    //        printf("device 0 has no peerdev=%d access\n", peer_dev);
    //}
}
int main(){
    MPI_Init(NULL,NULL);
    int size,rank;
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    if (rank==0)
        check_access();
    gpuErr(cudaSetDevice(0));

    double* out;
    double * in;
    gpuErr(cudaMallocManaged((void **)&out, N*sizeof(double)));

    cudaIpcMemHandle_t * memHand = (cudaIpcMemHandle_t *)
        mmap(NULL, sizeof(cudaIpcMemHandle_t),
             PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, 0, 0);
    assert(MAP_FAILED != memHand);
    memset((void *) memHand, 0, sizeof(cudaIpcMemHandle_t));

    MPI_Barrier(MPI_COMM_WORLD);
    if (rank==0){
        gpuErr(cudaMalloc((void **)&in, N*sizeof(double)));
        gpuErr(cudaIpcGetMemHandle((cudaIpcMemHandle_t *) &memHand[0], (void *)in));
        double * temp = new double[N];
        for (int i=0; i < N; i++)
            temp[i] = 1;
        gpuErr(cudaMemcpy(in, temp, N*sizeof(double), cudaMemcpyHostToDevice));
        delete [] temp;
    }
    MPI_Barrier(MPI_COMM_WORLD);

    // the following is throwing a CUDA error, invalid argument
    if (rank > 0)
        gpuErr(cudaIpcOpenMemHandle((void **) &in, memHand[0], cudaIpcMemLazyEnablePeerAccess));
    MPI_Barrier(MPI_COMM_WORLD);

    int numBlocks = (N + blockSize - 1) / blockSize;
    double rank_val = (double) rank;
    kernel<<<numBlocks, blockSize>>>(out, in, rank_val);
    error_msg(cudaGetLastError(), rank);
    gpuErr(cudaDeviceSynchronize());
    MPI_Barrier(MPI_COMM_WORLD);

    // test the kernel results
    double sum = 0;
    for (int i=0; i < N; i++)
        sum += out[i];
    printf("mpirank=%d, comm.size=%d, result=%f\n", rank, size, sum);
    assert(sum==N*rank);

    // cleanup
    if (rank>0)
        cudaIpcCloseMemHandle(in);
    cudaFree(out);
    if (rank==0)
        cudaFree(in);
    MPI_Finalize();
    return 0;
}
I compile with
nvcc -I/usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/include --compiler-options=-march=skylake-avx512 -L/usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib -lmpi ipc_tester.cu
Single-process job output (looks correct):
$ srun -n1 -c2 ./a.out
> GPU0 = is capable of UVA
mpirank=0, comm.size=1, result=0.000000
Multi-process job output (hits the error in the call to cudaIpcOpenMemHandle):
$ srun -n2 -c2 ./a.out
GPUassert: invalid argument ipc_tester.cu 92
Compute sanitizer output:
$ srun -n2 -c2 compute-sanitizer ./a.out
========= COMPUTE-SANITIZER
========= COMPUTE-SANITIZER
========= Program hit invalid device context (error 201) on CUDA API call to cuCtxGetDevice.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:cuCtxGetDevice [0x155550d083eb]
=========                in /usr/common/software/sles15_cgpu/cuda/11.1.1/lib64/compat/libcuda.so.1
=========     Host Frame:uct_cuda_base_query_devices [0x15553e03f170]
=========                in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/ucx/libuct_cuda.so.0
=========     Host Frame:uct_md_query_tl_resources [0x15553e6c44c6]
=========                in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/libuct.so.0
=========     Host Frame: [0x15553e9095a9]
=========                in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/libucp.so.0
=========     Host Frame: [0x15553e90a7f9]
=========                in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/libucp.so.0
=========     Host Frame: [0x15553e90abfd]
=========                in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/libucp.so.0
=========     Host Frame:ucp_init_version [0x15553e90b7f3]
=========                in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/libucp.so.0
=========     Host Frame:mca_pml_ucx_open [0x15553edc7e70]
=========                in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/openmpi/mca_pml_ucx.so
=========     Host Frame:mca_base_framework_components_open [0x15555299ef2d]
=========                in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/libopen-pal.so.40
=========     Host Frame: [0x155554472ec7]
=========                in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/libmpi.so.40
=========     Host Frame:mca_base_framework_open [0x1555529a8b31]
=========                in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/libopen-pal.so.40
=========     Host Frame:ompi_mpi_init [0x15555447fb5b]
=========                in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/libmpi.so.40
=========     Host Frame:MPI_Init [0x15555442dc01]
=========                in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/libmpi.so.40
=========     Host Frame: [0x403f04]
=========                in /global/cfs/cdirs/lcls/dermen/dulios/./a.out
=========     Host Frame:__libc_start_main [0x1555531173ea]
=========                in /lib64/libc.so.6
=========     Host Frame: [0x403d1a]
=========                in /global/cfs/cdirs/lcls/dermen/dulios/./a.out
=========
========= ... (the same invalid-device-context backtrace through UCX/Open MPI initialization is reported several more times, interleaved between the two ranks) ...
========= Program hit invalid argument (error 1) on CUDA API call to cudaIpcOpenMemHandle.
========= Saved host backtrace up to driver entry point at error
========= Host Frame: [0x155550dde1b3]
========= in /usr/common/software/sles15_cgpu/cuda/11.1.1/lib64/compat/libcuda.so.1
========= Host Frame: [0x433fac]
========= in /global/cfs/cdirs/lcls/dermen/dulios/./a.out
========= Host Frame: [0x40412e]
========= in /global/cfs/cdirs/lcls/dermen/dulios/./a.out
========= Host Frame:__libc_start_main [0x1555531173ea]
========= in /lib64/libc.so.6
========= Host Frame: [0x403d1a]
========= in /global/cfs/cdirs/lcls/dermen/dulios/./a.out
=========
GPUassert: invalid argument ipc_tester.cu 92
========= Error: process didn't terminate successfully
========= Target application returned an error
========= ERROR SUMMARY: 4 errors
System info:
$ lsb_release -a
LSB Version: n/a
Distributor ID: SUSE
Description: SUSE Linux Enterprise Server 15 SP2
Release: 15.2
Codename: n/a
$ nvidia-smi
Tue Sep 27 10:05:48 2022
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.102.04 Driver Version: 450.102.04 CUDA Version: 11.1 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 Tesla V100-SXM2... On | 00000000:89:00.0 Off | 0 |
| N/A 34C P0 38W / 300W | 0MiB / 16160MiB | 0% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+

As pointed out in the comments, the memory handle was never actually set on ranks > 0: each MPI rank is a separate process launched by srun, so each rank's anonymous MAP_SHARED mapping is its own private, zero-filled region rather than memory shared with rank 0. After learning how to broadcast the memory handle with MPI, I arrived at a solution. The patch below leads to working code.
@@ -66,12 +66,7 @@ int main(){
     double * in;
     gpuErr(cudaMallocManaged((void **)&out, N*sizeof(double)));
-    cudaIpcMemHandle_t * memHand = (cudaIpcMemHandle_t *)
-        mmap(NULL, sizeof(cudaIpcMemHandle_t),
-             PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, 0, 0);
-
-    assert(MAP_FAILED != memHand);
-    memset((void *) memHand, 0, sizeof(cudaIpcMemHandle_t));
+    cudaIpcMemHandle_t memHand[1];
     MPI_Barrier(MPI_COMM_WORLD);
@@ -87,6 +82,21 @@ int main(){
     }
     MPI_Barrier(MPI_COMM_WORLD);
+    // Broadcast the IPC memory handle:
+    // get the size of the handle container needed for the broadcast
+    int hand_size[1];
+    if (rank==0)
+        hand_size[0] = sizeof(memHand[0]);
+    MPI_Bcast(&hand_size[0], 1, MPI_INT, 0, MPI_COMM_WORLD);
+
+    // create the char container for the memory-handle broadcast
+    char memHand_C[hand_size[0]];
+    if (rank==0)
+        memcpy(&memHand_C, &memHand[0], hand_size[0]);
+    MPI_Bcast(&memHand_C, hand_size[0], MPI_BYTE, 0, MPI_COMM_WORLD);
+    if (rank > 0)
+        memcpy(&memHand[0], &memHand_C, hand_size[0]);
+
     // the following is throwing a CUDA error, invalid argument
     if (rank > 0)
         gpuErr(cudaIpcOpenMemHandle((void **) &in, memHand[0], cudaIpcMemLazyEnablePeerAccess));
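For what it's worth, since cudaIpcMemHandle_t is a plain, fixed-size struct, the size exchange and char buffer above aren't strictly necessary; the handle can be broadcast directly as raw bytes. A minimal sketch of that variant (using the same rank, in, N, and gpuErr as the program above; the filling of in on rank 0 is omitted):

// Sketch only: broadcast the IPC handle itself.
cudaIpcMemHandle_t memHand;
if (rank == 0) {
    gpuErr(cudaMalloc((void **)&in, N * sizeof(double)));
    gpuErr(cudaIpcGetMemHandle(&memHand, (void *)in));   // rank 0 exports its allocation
}
MPI_Bcast(&memHand, sizeof(memHand), MPI_BYTE, 0, MPI_COMM_WORLD);
if (rank > 0)                                            // other ranks map rank 0's buffer
    gpuErr(cudaIpcOpenMemHandle((void **)&in, memHand, cudaIpcMemLazyEnablePeerAccess));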

Related

CUDA constant memory usage across multiple source files showing different behaviors on cuda-11.2 and cuda-11.4

Minimum repro:
kernel.cu:
#include <stdio.h>

__constant__ int N_GPU;

void wrapper_fn(int *ptr)
{
    cudaMemcpyToSymbol(N_GPU, ptr, sizeof(int), cudaMemcpyDeviceToDevice);
}

__global__ void printKernel() {
    printf("N = %d; \n", N_GPU);
}
driver.cu:
#include "cuda_runtime.h"
#include <stdio.h>
void wrapper_fn(int*);
__global__ void printKernel();
int main()
{
int N = 10;
int* d_N_ptr;
cudaMalloc(&d_N_ptr, sizeof(int));
cudaMemcpy(d_N_ptr, &N, sizeof(int), cudaMemcpyDefault);
wrapper_fn(d_N_ptr);
printKernel <<<1, 1 >>>();
cudaPeekAtLastError();
cudaDeviceSynchronize();
return 0;
}
On both cuda-11.4 and cuda-11.2, running nvcc kernel.cu driver.cu compiles. The expected output (i.e., N = 10;) is only seen with 11.2, not 11.4.
Upon running cuda-gdb on 11.4, I get the following:
...
[New Thread 0x7fffee240700 (LWP 54339)]
warning: Cuda API error detected: cudaMalloc returned (0xde)
warning: Cuda API error detected: cudaMemcpy returned (0xde)
warning: Cuda API error detected: cudaMemcpyToSymbol returned (0xde)
warning: Cuda API error detected: cudaLaunchKernel returned (0xde)
warning: Cuda API error detected: cudaPeekAtLastError returned (0xde)
warning: Cuda API error detected: cudaDeviceSynchronize returned (0xde)
[Thread 0x7fffee240700 (LWP 54339) exited]
...
Are there any particular nvcc flags I'm missing that matter on 11.4, or particular API changes I've overlooked? Thanks in advance!
The answer has to do with my driver version. The error code seen in the cuda-gdb output (0xde = 222, cudaErrorUnsupportedPtxVersion) means the compiled PTX is too new for the installed driver (mine was 460.35): "CUDA Enhanced Compatibility" was being used to run on the older driver, but that driver could not JIT the newer PTX.
TL;DR: compiling to exact, architecture-specific SASS solved it for cuda 11.4.
I did this by adding the -arch=sm_70 flag to my nvcc compilation command.
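In other words, a compile command along these lines embeds architecture-specific SASS directly (sm_70 assumes a compute-capability-7.0 device; substitute your GPU's architecture), so the older driver never has to JIT 11.4-era PTX:

nvcc -arch=sm_70 kernel.cu driver.cu -o repro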

PTX kernel name mangling

I cannot link my CUDA program when a kernel is compiled from a PTX file.
main.cu:
extern
__global__ void kernel(int, float*);
int main()
{
...
kernel<<<...>>>(...);
...
}
kernel.cu:
__global__
void kernel(int n, float* p)
{
    ...
}
If I compile like below, I have no problems and I get an executable:
nvcc -dc main.cu kernel.cu --gpu-architecture=sm_70
nvcc -dlink main.o kernel.o --gpu-architecture=sm_70 -o dlink.o
g++ dlink.o main.o kernel.o -lcudart
If I compile like below (by generating ptx), I get errors:
nvcc -ptx kernel.cu --gpu-architecture=sm_70
nvcc -dc main.cu kernel.ptx --gpu-architecture=sm_70
nvcc -dlink main.o kernel.o --gpu-architecture=sm_70 -o dlink.o
g++ dlink.o main.o kernel.o -lcudart
Error:
main.o: In function `main':
tmpxft_0000b5ce_00000000-5_main.cudafe1.cpp:(.text+0x4789): undefined reference to `kernel(int, float*)'
tmpxft_0000b5ce_00000000-5_main.cudafe1.cpp:(.text+0x497e): undefined reference to `kernel(int, float*)'
collect2: error: ld returned 1 exit status
I am following an example from CUDA_Compiler_Driver_NVCC.pdf.
What do I need to do to fix the error?
(This is CUDA 10.2).
If you want to write your own PTX (or modify PTX), the proper CUDA methodology is the CUDA driver API and its associated compilation flow.
The CUDA vectorAddDrv sample code has all the plumbing and workflow that you need.
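As a rough illustration of that flow (a sketch only, not the vectorAddDrv sample itself; error checking is omitted, and the mangled name shown is what Itanium mangling would produce for kernel(int, float*), so verify it against the actual .entry name in your PTX, or declare the kernel extern "C" to avoid mangling entirely):

#include <cuda.h>

int main()
{
    CUdevice dev;
    CUcontext ctx;
    CUmodule mod;
    CUfunction fn;

    cuInit(0);
    cuDeviceGet(&dev, 0);
    cuCtxCreate(&ctx, 0, dev);

    // Load the PTX produced by: nvcc -ptx kernel.cu --gpu-architecture=sm_70
    cuModuleLoad(&mod, "kernel.ptx");

    // Look the kernel up by the exact symbol name that appears in the PTX.
    cuModuleGetFunction(&fn, mod, "_Z6kerneliPf");

    int n = 1024;
    CUdeviceptr p;
    cuMemAlloc(&p, n * sizeof(float));

    void *args[] = { &n, &p };
    cuLaunchKernel(fn, (n + 127) / 128, 1, 1,   // grid
                       128, 1, 1,               // block
                       0, 0, args, 0);          // shared mem, stream, params, extra
    cuCtxSynchronize();

    cuMemFree(p);
    cuCtxDestroy(ctx);
    return 0;
}

The host side links only against the driver library (e.g. g++ main_drv.cpp -lcuda), and the PTX is loaded and JIT-compiled at run time, so there is no host-side symbol for the kernel to go undefined.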

cuDevicePrimaryCtxRetain returns CUDA_ERROR_INVALID_DEVICE after acc_init

I was trying the new PGI community release (17.4) with a toy example (see below), and I'm getting an error inside the CUDA driver API when calling acc_init.
The code to reproduce the error is:
#include <openacc.h>
#include <cuda_runtime_api.h>
#include <stdio.h>

int main()
{
    acc_init( acc_device_nvidia );
    int ndev = acc_get_num_devices( acc_device_nvidia );
    printf("Num OpenACC devices: %d\n", ndev);
    cudaGetDeviceCount(&ndev);
    printf("Num CUDA devices: %d\n", ndev);
    return 0;
}
Compiled with:
/usr/local/pgi/linux86-64/17.4/bin/pgcc -acc -ta=tesla -Mcuda ./test.c -o oacc_test.pgi
cuda-memcheck output:
$ cuda-memcheck ./oacc_test.pgi
========= CUDA-MEMCHECK
========= Program hit CUDA_ERROR_INVALID_DEVICE (error 101) due to "invalid device ordinal" on CUDA API call to cuDevicePrimaryCtxRetain.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so (cuDevicePrimaryCtxRetain + 0x15c) [0x1e8d1c]
========= Host Frame:/usr/local/pgi/linux86-64/17.4/lib/libaccnc.so (__pgi_uacc_cuda_initdev + 0x80b) [0x6f0b]
========= Host Frame:/usr/local/pgi/linux86-64/17.4/lib/libaccg.so (__pgi_uacc_enumerate + 0x148) [0x11388]
========= Host Frame:/usr/local/pgi/linux86-64/17.4/lib/libaccg.so (__pgi_uacc_initialize + 0x5b) [0x117ab]
========= Host Frame:/usr/local/pgi/linux86-64/17.4/lib/libaccapi.so (acc_init + 0x22) [0xe4f2]
========= Host Frame:./oacc_test.pgi [0xbc4]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf1) [0x202b1]
========= Host Frame:./oacc_test.pgi [0xaca]
=========
Num OpenACC devices: 1
Num CUDA devices: 1
========= ERROR SUMMARY: 1 error
Apparently __pgi_uacc_cuda_initdev is passing a '-1' as the second parameter (CUdevice dev) to cuDevicePrimaryCtxRetain (bug?):
Breakpoint 1, 0x00007ffff4ab0bc0 in cuDevicePrimaryCtxRetain () from /usr/lib/x86_64-linux-gnu/libcuda.so
(cuda-gdb) p /x $rsi
$7 = 0xffffffff
I suppose this isn't normal. Is this a bug in 17.4, or is my installation broken?
It's normal and a benign error. Basically, the PGI runtime is querying whether a CUDA context has already been created. Since there isn't a CUDA runtime call to simply query the existence of a context, we call "cuDevicePrimaryCtxRetain"; if it errors, we know we need to create a new context.
Note that in PGI release 17.7 we did change this call a bit so you will no longer see the error when running cuda-memcheck.

Build Error in developing C and MYSQL application

I am writing some C code that connects to a MySQL server. I am using NetBeans and am new to this. I configured it as directed and installed MySQL Connector/C, and also installed the Cygwin GCC, G++, GDB, and make packages from the Cygwin site. I created a C project and, in Properties -> Build -> C Compiler -> Include Directories, set the path to the MySQL connector (C:\Program Files\MySQL\Connector C 6.0.2\include). When I build the code below, which talks to the MySQL server, errors occur.
#include <stdio.h>
#include <stdlib.h>
#include <mysql.h>

int main(int argc, char** argv) {
    MYSQL *conn;
    MYSQL_RES *res;
    MYSQL_ROW row;
    char *server = "localhost";
    char *user = "root";
    char *password = "aaaa"; /* set me first */
    char *database = "mysql";

    conn = mysql_init(NULL);

    /* Connect to database */
    if (!mysql_real_connect(conn, server,
            user, password, database, 0, NULL, 0)) {
        fprintf(stderr, "%s\n", mysql_error(conn));
        exit(1);
    }

    /* send SQL query */
    if (mysql_query(conn, "show tables")) {
        fprintf(stderr, "%s\n", mysql_error(conn));
        exit(1);
    }

    res = mysql_use_result(conn);

    /* output table name */
    printf("MySQL Tables in mysql database:\n");
    while ((row = mysql_fetch_row(res)) != NULL)
        printf("%s \n", row[0]);

    /* close connection */
    mysql_free_result(res);
    mysql_close(conn);
    return (EXIT_SUCCESS);
}
The build fails with the following errors:
build/Debug/Cygwin-Windows/main.o: In function `main':
/cygdrive/c/Documents and Settings/AEM/My Documents/NetBeansProjects/CppApplication_2/main.c:26: undefined reference to `_mysql_init'
/cygdrive/c/Documents and Settings/AEM/My Documents/NetBeansProjects/CppApplication_2/main.c:28: undefined reference to `_mysql_real_connect'
/cygdrive/c/Documents and Settings/AEM/My Documents/NetBeansProjects/CppApplication_2/main.c:30: undefined reference to `_mysql_error'
/cygdrive/c/Documents and Settings/AEM/My Documents/NetBeansProjects/CppApplication_2/main.c:34: undefined reference to `_mysql_query'
/cygdrive/c/Documents and Settings/AEM/My Documents/NetBeansProjects/CppApplication_2/main.c:35: undefined reference to `_mysql_error'
/cygdrive/c/Documents and Settings/AEM/My Documents/NetBeansProjects/CppApplication_2/main.c:38: undefined reference to `_mysql_use_result'
/cygdrive/c/Documents and Settings/AEM/My Documents/NetBeansProjects/CppApplication_2/main.c:41: undefined reference to `_mysql_fetch_row'
/cygdrive/c/Documents and Settings/AEM/My Documents/NetBeansProjects/CppApplication_2/main.c:44: undefined reference to `_mysql_free_result'
/cygdrive/c/Documents and Settings/AEM/My Documents/NetBeansProjects/CppApplication_2/main.c:45: undefined reference to `_mysql_close'
make[2]: Leaving directory `/cygdrive/c/Documents and Settings/AEM/My Documents/NetBeansProjects/CppApplication_2'
make[1]: Leaving directory `/cygdrive/c/Documents and Settings/AEM/My Documents/NetBeansProjects/CppApplication_2'
collect2: ld returned 1 exit status
make[2]: *** [dist/Debug/Cygwin-Windows/cppapplication_2.exe] Error 1
make[1]: *** [.build-conf] Error 2
make: *** [.build-impl] Error 2
BUILD FAILED (exit value 2, total time: 1s)
I have googled a lot but could not find anything that solves this issue. I need help.
Add -lmysql so the linker pulls in the MySQL client library (this is for gcc; with another compiler, use its equivalent option). You will also likely need to tell the linker where the library lives by adding its directory to the library search path. Linking the library into the executable is a separate step from adding the include path.
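For example, a Cygwin gcc link line along these lines (the lib directory and library name are illustrative; point -L at wherever your Connector/C installation actually keeps its import library, and note that mixing the Cygwin toolchain with a native Windows Connector/C build can cause its own problems):

gcc main.c -o mysqltest \
    -I"/cygdrive/c/Program Files/MySQL/Connector C 6.0.2/include" \
    -L"/cygdrive/c/Program Files/MySQL/Connector C 6.0.2/lib" \
    -lmysql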

Usage of printf() in Cuda 4.0 Compilation Error

I have a GTX 570 (Fermi architecture), which is of compute capability 2.0. I have CUDA 4.0 on my computer and I am using Ubuntu 10.10.
With CUDA 4.0 it is possible to use printf() inside kernels. Here is example code from page 125 of the CUDA 4.0 programming guide:
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 200)
    #define printf(f, ...) ((void)(f, __VA_ARGS__),0)
#endif

__global__ void helloCUDA(float f)
{
    printf(“Hello thread %d, f=%f\n”, threadIdx.x, f);
}

void main()
{
    helloCUDA<<<1, 5>>>(1.2345f);
    cudaDeviceReset();
}
I am getting the following compilation error.
gaurish108 MyPractice: nvcc printf_inkernel.cu -o printf_inkernel
printf_inkernel.cu(10): error: unrecognized token
printf_inkernel.cu(10): error: expected an expression
printf_inkernel.cu(10): error: unrecognized token
printf_inkernel.cu(10): error: unrecognized token
printf_inkernel.cu(10): error: unrecognized token
printf_inkernel.cu(10): error: unrecognized token
printf_inkernel.cu(10): error: unrecognized token
printf_inkernel.cu(10): error: unrecognized token
printf_inkernel.cu(15): warning: return type of function "main" must be "int"
8 errors detected in the compilation of "/tmp/tmpxft_000014cd_00000000-4_printf_inkernel.cpp1.ii".
Why is it not recognizing printf? I tried adding the flag -arch=sm_20, but I get the same error.
It looks like you've got a weird quote character at either end of printf's format string.
If you copy and paste this program, it ought to compile and run without error:
#include <stdio.h>

__global__ void helloCUDA(float f)
{
    printf("Hello thread %d, f=%f\n", threadIdx.x, f);
}

int main()
{
    helloCUDA<<<1, 5>>>(1.2345f);
    cudaDeviceReset();
    return 0;
}
And the output:
$ nvcc -arch=sm_20 test.cu -run
Hello thread 0, f=1.234500
Hello thread 1, f=1.234500
Hello thread 2, f=1.234500
Hello thread 3, f=1.234500
Hello thread 4, f=1.234500
I don't understand the need for the weird macro which begins the program. I'd get rid of it.