invalid device ordinal on cudaMemPrefetchAsync - cuda

I'm running a toy CUDA sample on my GeForce 1080 Ti (Pascal) on windows 10 and CUDA 9.2.
Goal is to test cudaMemPrefetchAsync to the CPU, as it's supposed to work.
However, I get a CUDA error (invalid device ordinal) on this particular line.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cstdio>
#include <cstdlib>
// Host helper: stores `val` into each of the first N elements of `a`.
// Writes nothing when N is zero or negative.
void fill(int* a, int val, int N) {
    int idx = 0;
    while (idx < N) {
        a[idx] = val;
        ++idx;
    }
}
// Kernel: a[i] += b[i] for every i < N.
// Each thread starts at its flat global index and then advances by the total
// number of launched threads (blockDim.x * gridDim.x), so any 1D launch
// configuration covers the whole range.
__global__ void add(int* a, int* b, int N)
{
    const unsigned int stride = blockDim.x * gridDim.x;
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    while (i < N) {
        a[i] = a[i] + b[i];
        i += stride;
    }
}
// Reports a failed CUDA runtime call and aborts the process.
//   err  - status returned by the call
//   file - source file of the call site (for the message)
//   line - source line of the call site (for the message)
// Returns normally only when err == cudaSuccess.
inline void check(cudaError_t err, const char* file, int line) {
    if (err == cudaSuccess) {
        return;
    }
    ::fprintf(stderr, "ERROR at %s[%d] : %s\n", file, line, cudaGetErrorString(err));
    abort();
}
// Wraps a CUDA runtime call so that failures are reported with the call site's
// file and line.
#define CUDA_CHECK(err) \
    do { \
        check(err, __FILE__, __LINE__); \
    } while(0)
// Managed-memory sample: fills two managed arrays on the host, adds them on the
// GPU, and prefetches the data to where it is used next when the device
// supports it. Returns 0 on success; any CUDA failure aborts via CUDA_CHECK.
int main()
{
    int deviceId;
    CUDA_CHECK(cudaGetDevice(&deviceId));

    // cudaMemPrefetchAsync is only usable when the device reports concurrent
    // managed access. Where it does not (e.g. the Windows setup described in
    // this thread) the call fails with "invalid device ordinal", so the
    // prefetches are skipped and the data migrates at kernel launch instead.
    int concurrentManagedAccess = 0;
    CUDA_CHECK(cudaDeviceGetAttribute(&concurrentManagedAccess,
                                      cudaDevAttrConcurrentManagedAccess,
                                      deviceId));
    const bool canPrefetch = (concurrentManagedAccess != 0);

    const int N = 1024*1024*32;
    const size_t bytes = N * sizeof(int);
    int *a, *b;
    CUDA_CHECK(cudaMallocManaged(&a, bytes));
    CUDA_CHECK(cudaMallocManaged(&b, bytes));

    if (canPrefetch) {
        CUDA_CHECK(cudaMemPrefetchAsync(a, bytes, cudaCpuDeviceId));
        CUDA_CHECK(cudaMemPrefetchAsync(b, bytes, cudaCpuDeviceId));
    }

    fill(a, 1, N);
    fill(b, 2, N);  // was fill(a, 2, N): b was never initialized

    if (canPrefetch) {
        CUDA_CHECK(cudaMemPrefetchAsync(a, bytes, deviceId));
        CUDA_CHECK(cudaMemPrefetchAsync(b, bytes, deviceId));
    }

    add<<<32, 256>>>(a, b, N);
    CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());  // errors raised while the kernel ran

    CUDA_CHECK(cudaFree(a));
    CUDA_CHECK(cudaFree(b));
    return 0;
}
Is that a hardware/driver/OS limitation? Can I simply ignore the error?

Is that a hardware/driver/OS limitation?
Yes, the latter. Quoting from the documentation
GPUs with SM architecture 6.x or higher (Pascal class or newer)
provide additional Unified Memory features such as on-demand page
migration and GPU memory oversubscription that are outlined throughout
this document. Note that currently these features are only supported
on Linux operating systems.
So asynchronous page migration is not supported on Windows at the moment, and that is why you get an error when you try to enable it.

Related

CUDA-why it cannot printf the information in cuda code? [duplicate]

This question already has answers here:
Trouble compiling helloworld.cu
(2 answers)
Closed 3 years ago.
I am a beginner for cuda. I wrote a test code for testing GPU device. my gpu model is k80.
There are 8 gpu cards in one node.
#include <iostream>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#define N 10000
// Kernel: c[k] = a[k] + b[k], one element per block.
// The element index is taken from blockIdx.x only, so the launch is expected
// to use one thread per block; blocks with index >= N do nothing.
__global__ void add(int *a, int *b, int *c)
{
    const int idx = blockIdx.x;
    if (idx >= N) {
        return;
    }
    c[idx] = a[idx] + b[idx];
}
// Host driver: builds a[i] = -i and b[i] = i*i, adds them on the GPU with one
// block per element, and prints every "a + b = c" line.
// Returns 0 on success, 1 if the kernel launch or the result copy failed.
int main()
{
    int a[N], b[N], c[N];
    int *dev_a = 0, *dev_b = 0, *dev_c = 0;
    const size_t bytes = N * sizeof(int);

    cudaMalloc((void**)&dev_a, bytes);
    cudaMalloc((void**)&dev_b, bytes);
    cudaMalloc((void**)&dev_c, bytes);

    for (int i = 0; i < N; i++)
    {
        a[i] = -i;
        b[i] = i*i;
    }

    cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice);

    add<<<N, 1>>>(dev_a, dev_b, dev_c);

    // A bad launch is reported by cudaGetLastError(); the blocking copy below
    // fails if an earlier allocation failed or the kernel faulted.
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess)
        status = cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost);

    if (status != cudaSuccess)
    {
        printf("CUDA error: %s\n", cudaGetErrorString(status));
    }
    else
    {
        for (int i = 0; i < N; i++)
        {
            // "\n" (not "\\n"): end each result with a real newline.
            printf("%d + %d = %d\n", a[i], b[i], c[i]);
        }
    }

    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
    return status == cudaSuccess ? 0 : 1;
}
When i compile the code:
nvcc gputest.cu -o gputest
I got errors :
gputest.cu(38): error: identifier "printf" is undefined
1 error detected in the compilation of "/tmp/tmpxft_000059a6_00000000-4_gputest.cpp4.ii".
I think printf is a function in iostream file, but i have already included the iostream. I don't know why?
Add:
#include <stdio.h>
and it will compile OK.
printf is declared in the C standard library header stdio.h (cstdio in C++), so including stdio.h makes sense here. Different compilers may behave differently, but in the case of nvcc this is generally the right way to do it.
(It's not valid to assume in all cases that inclusion of iostream will satisfy the reference here.)

CUDA device runtime api cudaMemsetAsync doesn't work

I am trying to call cudaMemsetAsync from a kernel (so-called "dynamic parallelism"). But no matter what value I use, it always sets the memory to 0.
Here is my test code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda_device_runtime_api.h"
#include <stdio.h>
const int size = 5;
// Question's test kernel: calls the device-runtime cudaMemsetAsync from device
// code on the buffer `c`.
__global__ void kernel(int *c)
{
// Requests a fill of size * 4 bytes starting at c, with value 0x7FFFFFFF, on
// the NULL stream. The return status is not inspected.
// NOTE(review): per the answer in this thread, cudaMemset-style calls set byte
// values using only the least significant byte of the value argument, so a
// 32-bit pattern such as 0x7FFFFFFF cannot be produced this way.
cudaMemsetAsync(c, 0x7FFFFFFF, size * 4, NULL);
}
// Host driver for the device-side cudaMemsetAsync experiment: uploads five
// 12s, runs `kernel` with a single thread, downloads the buffer, then prints
// the status of cudaDeviceReset followed by the five values.
// Intermediate statuses are stored in cudaStatus but never inspected.
int main()
{
    const size_t bytes = size * sizeof(int);
    int c[size] = { 12, 12, 12, 12, 12 };
    int *dev_c = 0;

    cudaError_t cudaStatus = cudaSetDevice(0);
    cudaStatus = cudaMalloc((void**)&dev_c, bytes);
    cudaStatus = cudaMemcpy(dev_c, c, bytes, cudaMemcpyHostToDevice);

    kernel<<<1, 1>>>(dev_c);

    cudaStatus = cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost);
    cudaFree(dev_c);

    cudaStatus = cudaDeviceReset();
    printf("%d\n", cudaStatus);
    printf("{%d,%d,%d,%d,%d}\n", c[0], c[1], c[2], c[3], c[4]);
    return 0;
}
And if I run it, I got output like this:
>nvcc -run kernel.cu -gencode=arch=compute_35,code=\"sm_35,compute_35\" -rdc=true -lcudadevrt
kernel.cu
Creating library a.lib and object a.exp
0
{0,0,0,0,0}
When I call memory set, I use value 0x7FFFFFFF. I'm expecting non-zero numbers, but it always shows zero.
Is this a bug? or I did something wrong? I'm using CUDA 8.0
I can confirm this appears not to work in CUDA 8 on the systems I tested it with.
If you want a single thread to perform the operation, you can use memset directly in device code (it, like memcpy, has been supported forever). The compiler will emit a byte-sized loop inline within your kernel and the operation will be performed by each running thread.
If you want a dynamic parallelism style memset operation, then the easiest thing is to make your own. A trivial (and very, very lightly tested) implementation in the code you posted might look like this:
#include <cstring>
#include <cstdio>
const int size = 5;
// Kernel: sets each of the `sz` bytes starting at `p` to `val`.
// Every thread starts at its flat global index and strides by the total number
// of launched threads, so any 1D launch configuration covers the whole range.
__global__ void myMemset_kernel(void* p, unsigned char val, size_t sz)
{
    unsigned char* bytes = (unsigned char*)p;
    const unsigned int stride = blockDim.x * gridDim.x;
    size_t idx = threadIdx.x + blockDim.x * blockIdx.x;
    while (idx < sz) {
        bytes[idx] = val;
        idx += stride;
    }
}
// Device-side memset replacement: launches myMemset_kernel on stream `s` to
// set `sz` bytes at `p` to the low byte of `val` (same byte-wise semantics as
// memset).
//   p   - destination buffer
//   val - fill value; only the least significant byte is used
//   sz  - number of bytes to set; 0 is a no-op
//   s   - stream for the child launch (defaults to the NULL stream)
__device__ void myMemset(void* p, unsigned int val, size_t sz, cudaStream_t s=NULL)
{
    // A zero-block grid is an invalid launch configuration, so skip it.
    if (sz == 0) {
        return;
    }

    const dim3 blocksz(256,1,1);

    // Keep blocks * threads within 32 bits: the kernel computes its start index
    // and stride in 32-bit arithmetic, and its stride loop covers whatever the
    // clamped grid does not reach in the first pass.
    const size_t maxBlocks = 0xffffffffu / blocksz.x;
    size_t nblocks = (sz + blocksz.x - 1) / blocksz.x;
    if (nblocks > maxBlocks) {
        nblocks = maxBlocks;
    }

    const unsigned char byteVal = (unsigned char)(val & 0xff);
    myMemset_kernel<<< dim3((unsigned int)nblocks,1,1), blocksz, 0, s >>>(p, byteVal, sz);
}
// Parent kernel: fills the `size` ints at `c` byte-wise via myMemset on a
// private non-blocking device stream, then waits for the child work.
__global__ void kernel(int *c)
{
    cudaStream_t s;
    cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking);
    myMemset(c, 0x7FFFFFFF, size * sizeof(int), s);
    cudaDeviceSynchronize();
    // Release the stream; the original version leaked it.
    cudaStreamDestroy(s);
}
// Host driver: byte-fills a five-int host array, prints it, round-trips it
// through the device while `kernel` overwrites it, and prints the result.
// No CUDA status is inspected.
int main()
{
    const size_t bytes = size * sizeof(int);
    int c[size];
    int *dev_c;

    // memset uses only the low byte (0x0c) of its value argument.
    memset(c, 0xffffff0c, bytes);
    printf("{%08x,%08x,%08x,%08x,%08x}\n", c[0], c[1], c[2], c[3], c[4]);

    cudaMalloc((void**)&dev_c, bytes);
    cudaMemcpy(dev_c, c, bytes, cudaMemcpyHostToDevice);
    kernel<<<1, 1>>>(dev_c);
    cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost);
    cudaFree(dev_c);

    printf("{%08x,%08x,%08x,%08x,%08x}\n", c[0], c[1], c[2], c[3], c[4]);
    return 0;
}
which compiles and does this:
$ nvcc -rdc=true -arch=sm_52 -o memset memset.cu -lcudadevrt
$ ./memset
{0c0c0c0c,0c0c0c0c,0c0c0c0c,0c0c0c0c,0c0c0c0c}
{ffffffff,ffffffff,ffffffff,ffffffff,ffffffff}
A final point -- note the values above and read this question and answer. In your code, it is not possible to use cudaMemset to apply a value of 0x7FFFFFFF. Although the value argument is an unsigned integer, cudaMemset and its relatives work like regular memset and set byte values. Only the least significant byte of the 32 bit argument is used to set values. If your objective is to set 32 bit values, then you will need to make your own version of memset for that purpose anyway.

Using of shared memory not showing desired result

I am trying to learn the usage of shared memory with a view to increasing performance. Here I am trying to copy global memory to shared memory. With a single block (256 threads) it gives the correct result, but with more than one block it gives random results.
#include <cuda.h>
#include <stdio.h>
// Question's kernel using a statically sized shared array: squares d[t] in
// place, stages the value in shared memory, and writes it back.
// NOTE(review): this is the version the question reports as giving random
// results with more than one block. The answer in this thread lists the causes:
// the shared array is sized by the input (400) rather than the block size, it
// is indexed with the global index t instead of threadIdx.x, and there is no
// bounds check against n.
__global__ void staticReverse(int *d, int n)
{
__shared__ int s[400];
// Flat global index across all blocks.
int t = blockIdx.x * blockDim.x + threadIdx.x;
d[t] = d[t]*d[t];
s[t] =d[t];
__syncthreads();
d[t] = s[t];
}
// Question's kernel using dynamically sized shared memory (its size comes from
// the third launch argument): stages d[t]*d[t] in shared memory and writes it
// back to d[t].
// NOTE(review): per the answer in this thread, global memory is indexed here
// with the block-local threadIdx.x, so every block touches the same elements,
// and there is no bounds check against n.
__global__ void dynamicReverse(int *d, int n)
{
extern __shared__ int s[];
// Block-local index only; blockIdx.x is not taken into account.
int t = threadIdx.x;
s[t] = d[t]*d[t];
__syncthreads();
d[t] = s[t];
}
// Host driver for the question's experiment: uploads 0..n-1, runs
// staticReverse over enough 256-thread blocks to cover n, downloads the
// result, and prints one value per line. No CUDA status is inspected.
int main(void)
{
    const int n = 400;
    int a[n], d[n];
    int idx = 0;
    while (idx < n)
    {
        a[idx] = idx;
        ++idx;
    }

    int *d_d;
    cudaMalloc(&d_d, n * sizeof(int));

    // run version with static shared memory
    int block_size = 256;
    int n_blocks = (n % block_size == 0) ? (n / block_size) : (n / block_size + 1);

    cudaMemcpy(d_d, a, n*sizeof(int), cudaMemcpyHostToDevice);
    staticReverse<<<n_blocks,block_size>>>(d_d, n);
    cudaMemcpy(d, d_d, n*sizeof(int), cudaMemcpyDeviceToHost);

    for (idx = 0; idx < n; ++idx)
    {
        printf("%d\n",d[idx]);
    }
}
1) What does the third argument in the dynamicReverse<<<n_blocks,block_size,n*sizeof(int)>>>(d_d, n);
kernel call do? Does it allocate shared memory for the entire block or per thread?
2) if I required more than 64kb of shared memory per multiprocessor in compute capability 5.0 what I need to do?
In your static shared memory allocation code you had three issues:
The size of the statically allocated shared memory should comply with the block size, not with the size of the input array,
You should use local thread index for indexing shared memory, instead of the global one;
You had no array out of bounds checking.
The dynamic shared memory allocation code had the same issues #2 and #3 as above, plus the fact that you were indexing global memory with local thread index, instead of global. You can use the third argument to specify the size of the shared memory to be allocated. In particular, you should allocate an amount of 256 ints, i.e., related to the block size, similarly to the static shared memory allocation case.
Here is the complete working code:
/********************/
/* CUDA ERROR CHECK */
/********************/
// Wraps a CUDA runtime call; on failure reports the error with the call
// site's file and line, and exits.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Prints a diagnostic for a failed CUDA call.
//   code  - status returned by the call
//   file  - source file of the call site (const: __FILE__ is a string literal)
//   line  - source line of the call site
//   abort - when true (default), exit the process with `code` as exit status
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
/***********************************/
/* SHARED MEMORY STATIC ALLOCATION */
/***********************************/
#include <cuda.h>
#include <stdio.h>
// Squares d[t] in place for every t < n, staging each value in a statically
// sized shared array.
// Launch layout: 1D grid of 1D blocks with blockDim.x <= 256 (the size of the
// shared array), and enough blocks to cover n.
__global__ void staticReverse(int *d, int n)
{
    __shared__ int s[256];
    int t = blockIdx.x * blockDim.x + threadIdx.x;
    const bool inRange = (t < n);

    if (inRange) {
        d[t] = d[t]*d[t];
        s[threadIdx.x] = d[t];
    }

    // The barrier must be reached by every thread of the block, including the
    // out-of-range threads of the last block, so it sits outside the branch.
    __syncthreads();

    if (inRange) {
        d[t] = s[threadIdx.x];
    }
}
/************************************/
/* SHARED MEMORY DYNAMIC ALLOCATION */
/************************************/
// Squares d[t] in place for every t < n, staging each value in dynamically
// sized shared memory.
// Launch layout: 1D grid of 1D blocks covering n, with at least
// blockDim.x * sizeof(int) bytes passed as the third launch argument.
__global__ void dynamicReverse(int *d, int n)
{
    extern __shared__ int s[];
    int t = blockIdx.x * blockDim.x + threadIdx.x;
    const bool inRange = (t < n);

    if (inRange) {
        s[threadIdx.x] = d[t]*d[t];
    }

    // The barrier must be reached by every thread of the block, including the
    // out-of-range threads of the last block, so it sits outside the branch.
    __syncthreads();

    if (inRange) {
        d[t] = s[threadIdx.x];
    }
}
// Host driver: uploads 0..n-1, squares the values on the GPU with
// dynamicReverse, downloads the result, and prints one value per line.
// Returns 0 on success, 1 if host allocation fails; CUDA failures exit via
// gpuErrchk.
int main(void)
{
    const int n = 400;
    int* a = (int*) malloc(n*sizeof(int));
    int* d = (int*) malloc(n*sizeof(int));
    if (a == NULL || d == NULL) {
        fprintf(stderr, "host allocation failed\n");
        free(a);
        free(d);
        return 1;
    }
    for (int i = 0; i < n; i++) { a[i] = i; }

    int *d_d; gpuErrchk(cudaMalloc(&d_d, n * sizeof(int)));

    // run version with static shared memory
    int block_size = 256;
    int n_blocks = n/block_size + (n%block_size == 0 ? 0:1);
    gpuErrchk(cudaMemcpy(d_d, a, n*sizeof(int), cudaMemcpyHostToDevice));
    //staticReverse<<<n_blocks,block_size>>>(d_d, n);
    // Dynamic shared memory is sized per block: one int per thread.
    dynamicReverse<<<n_blocks,block_size,block_size*sizeof(int)>>>(d_d, n);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    gpuErrchk(cudaMemcpy(d, d_d, n*sizeof(int), cudaMemcpyDeviceToHost));
    for (int i = 0; i < n; i++) { printf("%d\n",d[i]); }

    // Release device and host buffers (previously leaked).
    gpuErrchk(cudaFree(d_d));
    free(a);
    free(d);
    return 0;
}

Wrong results of a CUDA dynamic parallelism code

I recently ran into the problem illustrated at Uncorrectable ECC error. In short, from time to time I receive an Uncorrectable ECC error and my dynamic parallelism code generates incorrect results. The most probable cause of the uncorrectable ECC error is a corrupted driver stack, which has also been indirectly confirmed by the experience of another user (see the above post). I would now like to address the second issue, i.e., the algorithmic one. To this end, I'm dealing with the reproducer reported below which, since the original code generating incorrect results uses dynamic parallelism, uses this CUDA feature too.
I do not see any evident issue with this code. I think that the synchronization regarding the child kernel launch should be ok: the first __syncthreads() should not be necessary and the cudaDeviceSynchronize() should ensure that all the memory writes of the child kernel are accomplished before the printf.
My question is: is this code wrong or the wrong results are due to a non-programming issue?
My configuration: CUDA 5.0, Windows 7, 4-GPU system equipped with Kepler K20c, driver 327.23.
#include <stdio.h>
#include <conio.h>
#define K 6
#define BLOCK_SIZE 256
// Wraps a CUDA runtime call; on failure reports the error with the call
// site's file and line, waits for a key press, and exits.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Prints a diagnostic for a failed CUDA call.
//   code  - status returned by the call
//   file  - source file of the call site (const: __FILE__ is a string literal)
//   line  - source line of the call site
//   abort - when true (default), wait for a key press (getch) so the message
//           stays visible, then exit with `code` as exit status
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) { getch(); exit(code); }
    }
}
// Integer division of a by b that adds one to the truncated quotient whenever
// there is a non-zero remainder (ceiling division for positive operands).
int iDivUp(int a, int b)
{
    const int quotient = a / b;
    if ((a % b) != 0) {
        return quotient + 1;
    }
    return quotient;
}
// Child kernel: each thread stores its own threadIdx.x, converted to double,
// into the slot of P1 with the same index.
__global__ void child_kernel(double* P1)
{
    const int lane = threadIdx.x;
    P1[lane] = static_cast<double>(lane);
}
// Question's reproducer: every thread with global index i < M allocates a
// 13-element buffer on the device heap, launches child_kernel on it with
// 2*K+1 threads, waits, and prints each element next to its expected value.
// The parameter x is not used in the body.
// NOTE(review): per the answer in this thread, launching one child grid per
// parent thread with M = 19000 exceeds the pending launch limit (default
// 2048); the child launch status is not checked here, so failed launches go
// unnoticed and P1 is printed without having been written.
__global__ void parent_kernel(double* __restrict__ x, int M)
{
int i = threadIdx.x + blockDim.x * blockIdx.x;
if(i<M) {
// Per-thread device allocation; there is no matching delete[] in this kernel.
double* P1 = new double[13];
dim3 dimBlock(2*K+1,1); dim3 dimGrid(1,1);
// NOTE(review): this barrier sits inside the i<M branch, so threads of the
// last block with i >= M do not reach it.
__syncthreads();
child_kernel<<<dimGrid,dimBlock>>>(P1);
// Waits for the child grid before P1 is read.
cudaDeviceSynchronize();
for(int m=0; m<2*K+1; m++) printf("%f %f\n",P1[m],(double)m);
}
}
// Host driver for the reproducer: uploads x[i] = i for M doubles, launches
// parent_kernel over enough BLOCK_SIZE-thread blocks to cover M, checks for
// launch and execution errors, then waits for a key press.
int main() {
    const int M = 19000;
    //gpuErrchk(cudaSetDevice(0));
    const size_t bytes = M*sizeof(double);

    double* x = (double*)malloc(bytes);
    for (int idx=0; idx<M; ++idx) {
        x[idx] = (double)idx;
    }

    double* d_x;
    gpuErrchk(cudaMalloc((void**)&d_x,bytes));
    gpuErrchk(cudaMemcpy(d_x,x,bytes,cudaMemcpyHostToDevice));

    dim3 dimBlock(BLOCK_SIZE,1);
    dim3 dimGrid(iDivUp(M,BLOCK_SIZE));
    parent_kernel<<<dimGrid,dimBlock>>>(d_x,M);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    getch();
    return 0;
}
I'm pretty sure you're exceeding the launch pending limit. It's nearly impossible to tell with your code as-is, but I've modified it and added error checking on the child kernel launch.
When I do that, I get launch errors, signified by a printout of !. Skipping the launch error cases, all of my in-kernel checking of P1[m] vs. m passes (I get no * printout at all.)
#include <stdio.h>
#define K 6
#define BLOCK_SIZE 256
// Wraps a CUDA runtime call; on failure reports the error with the call
// site's file and line, and exits.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Prints a diagnostic for a failed CUDA call.
//   code  - status returned by the call
//   file  - source file of the call site (const: __FILE__ is a string literal)
//   line  - source line of the call site
//   abort - when true (default), exit the process with `code` as exit status
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) { exit(code); }
    }
}
// Integer division of a by b, plus one whenever the remainder is non-zero
// (ceiling division for positive operands).
int iDivUp(int a, int b)
{
    const int extra = ((a % b) != 0) ? 1 : 0;
    return a / b + extra;
}
// Child kernel: each thread stores its own threadIdx.x, converted to
// unsigned long long, into the slot of P1 with the same index.
__global__ void child_kernel(unsigned long long* P1)
{
    const int lane = threadIdx.x;
    P1[lane] = static_cast<unsigned long long>(lane);
}
// Parent kernel: every thread with global index i < M allocates a (2*K+1)-
// element buffer, launches child_kernel on it, waits for the child, and then
// verifies the contents inside the kernel.
// Output: "!" when the child launch reported an error (e.g. the pending
// launch limit was exceeded), "*" for each element that does not match its
// index. The parameter x is not used in the body.
__global__ void parent_kernel(double* __restrict__ x, int M)
{
    int i = threadIdx.x + blockDim.x * blockIdx.x;

    // Barrier kept from the original, but moved out of the i<M branch so that
    // every thread of the block reaches it (the last block has threads with
    // i >= M).
    __syncthreads();

    if(i<M) {
        dim3 dimBlock(2*K+1,1); dim3 dimGrid(1,1);
        // Sized from the child block size instead of a hard-coded 13.
        unsigned long long* P1 = new unsigned long long[2*K+1];

        child_kernel<<<dimGrid,dimBlock>>>(P1);
        cudaDeviceSynchronize();
        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess) printf("!");
        else for(unsigned long long m=0; m<dimBlock.x; m++) if (P1[m] != m) printf("*");

        // Return the per-thread buffer to the device heap (previously leaked).
        delete[] P1;
    }
}
// Host driver: uploads x[i] = i for M doubles, launches parent_kernel over
// enough BLOCK_SIZE-thread blocks to cover M, and checks for launch and
// execution errors.
// Returns 0 on success, 1 if host allocation fails; CUDA failures exit via
// gpuErrchk.
int main() {
    const int M = 19000;
    //gpuErrchk(cudaSetDevice(0));
    double* x = (double*)malloc(M*sizeof(double));
    if (x == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }
    for (int i=0; i<M; i++) x[i] = (double)i;

    double* d_x; gpuErrchk(cudaMalloc((void**)&d_x,M*sizeof(double)));
    gpuErrchk(cudaMemcpy(d_x,x,M*sizeof(double),cudaMemcpyHostToDevice));

    dim3 dimBlock(BLOCK_SIZE,1); dim3 dimGrid(iDivUp(M,BLOCK_SIZE));
    parent_kernel<<<dimGrid,dimBlock>>>(d_x,M);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    // Release device and host buffers (previously leaked).
    gpuErrchk(cudaFree(d_x));
    free(x);
    return 0;
}
Feel free to add further decoding of the err variable in the parent kernel to convince yourself that you are exceeding the launch pending limit. As another test, you can set M to 2048 instead of 19000 in your host code, and all the ! printouts go away. (launch pending limit default == 2048)
As I've stated in the comments, I think the uncorrectable ECC error is a separate issue, and I suggest trying the driver 321.01 that I linked in the comments.

why am I seeing a black screen when I try this code in cuda?

I'm using Win8 and Nsight in "Visual Studio 2010", and I installed "310.90-notebook-win8-win7-winvista-32bit-international-whql" for my graphics card (9300M GS). But when I try the code below, I see a black screen and an error: "Display driver stopped responding and has recovered"!
I know that the problem is with "cudaMemcpy", but I don't know why.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#define N 8
// Question's kernel: each thread starts at its flat global index x and writes
// threadIdx.x into a[step] while step < N, advancing step by x each time.
// NOTE(review): as the answer in this thread explains, the thread with
// threadIdx.x == 0 and blockIdx.x == 0 has x == 0, so step never advances and
// the loop never terminates.
__global__ void kernel(int *a)
{
int x = threadIdx.x + blockIdx.x * blockDim.x;
int step = x;
while(step<N){
a[step] = threadIdx.x;
// Stride equals the thread's own global index (zero for the first thread).
step += x;
}
}
// Question's host driver: fills a[] with N, N-1, ..., 1, uploads it, launches
// kernel<<<2,2>>>, copies data back, prints the array, and waits for input.
int main()
{
int a[N],i=N,j=0;
for(;j<N;j++)
a[j]=i--;
int *dev_a;
cudaMalloc( (void**)&dev_a, N * sizeof(int) );
cudaMemcpy( dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice);
kernel<<<2,2>>>(dev_a);
// NOTE(review): as the answer in this thread points out, operator precedence
// makes the byte count N - (1 * sizeof(int)), i.e. N-4 when sizeof(int) is 4,
// rather than (N-1) * sizeof(int).
cudaError_t cudaStatus = cudaMemcpy(a, dev_a,N-1 * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
//goto Error;
}
for(j=0;j<N;j++)printf("\n%d",a[j]);
// Blocks on input so the console output stays visible.
int t;
scanf("%d",&t);
}
In the kernel, the thread with threadIdx.x = 0 and blockIdx.x = 0 i.e. the first thread of the first block will run indefinitely, causing the kernel to crash.
When threadIdx.x = 0 and blockIdx.x = 0 the kernel code will become:
int x = 0;
int step = 0;
while(step<N)
{
a[step] = 0;
step += 0; //This will create infinite loop
}
Also (maybe it's a typo), there is a logical error in the following line of your code:
cudaError_t cudaStatus = cudaMemcpy(a, dev_a,N-1 * sizeof(int), cudaMemcpyDeviceToHost);
Considering the operator precedence in C, the expression N-1 * sizeof(int) will evaluate to N-4 (if sizeof(int) is 4).