Should cudaMemset work on the device pointer mapped from cudaHostRegister - cuda

I came across the sample code from one of my colleagues where the cudaMemset doesn't seem to work properly, when run on V100.
#include <iostream>
#include <stdio.h>
#define CUDACHECK(cmd) \
{\
cudaError_t error = cmd;\
if (error != cudaSuccess) { \
fprintf(stderr, "info: '%s'(%d) at %s:%d\n", cudaGetErrorString(error), error,__FILE__, __LINE__);\
}\
}
__global__ void setValue(int value, int* A_d) {
int tx = threadIdx.x + blockIdx.x * blockDim.x;
if(tx == 0){
A_d[tx] = A_d[tx] + value;
}
}
__global__ void printValue(int* A_d) {
int tx = threadIdx.x + blockIdx.x * blockDim.x;
if(tx == 0){
printf("A_d: %d\n", A_d[tx]);
}
}
int main(int argc, char* argv[ ]){
int *A_h, *A_d;
int size = sizeof(int);
A_h = (int*)malloc(size);
A_h[0] = 1;
CUDACHECK(cudaSetDevice(0));
CUDACHECK(cudaHostRegister(A_h, size, 0));
CUDACHECK(cudaHostGetDevicePointer((void**)&A_d, A_h, 0));
setValue<<<64,1,0,0>>>(5, A_d);
cudaDeviceSynchronize();
printf("A_h: %d\n", A_h[0]);
A_h[0] = 100;
printf("A_h: %d\n",A_h[0]);
printValue<<<64,1,0,0>>>(A_d);
cudaDeviceSynchronize();
CUDACHECK (cudaMemset(A_d, 1, size) );
printf("A_h: %d\n",A_h[0]);
printValue<<<64,1,0,0>>>(A_d);
cudaDeviceSynchronize();
cudaHostUnregister(A_h);
free(A_h);
}
When this sample is compiled and run, the output is seen as below.
/usr/local/cuda-11.0/bin/nvcc memsettest.cu -o test
./test
A_h: 6
A_h: 100
A_d: 100
A_h: 16843009
A_d: 16843009
We expect A_h and A_d to be set to 1 with cudaMemset. But it is set to some huge value as seen.
So, is cudaMemset expected to work on the device pointer A_d returned by cudaHostGetDevicePointer.
Is this A_d expected to be used only in kernels.
We also see that cudaMemcpy DtoH or HtoD seem to be working on the same device pointer A_d.
Can someone help us with the correct behavior.

We expect A_h and A_d to be set to 1 with cudaMemset.
You're confused about how cudaMemset works. Conceptually, it is very similar to memset from the C standard library. You should try your same test case with memset and see what it does.
Anyway, cudaMemset takes a pointer, a byte value, and a size in bytes to set, just like memset.
So your cudaMemset command:
CUDACHECK (cudaMemset(A_d, 1, size) );
is setting each byte to 1. Since size is 4, that means that you are setting A_d[0] to 0x01010101 (in hexadecimal). If you plug that value into your windows programmer calculator, the value is 16843009 in decimal. So everything is working as expected, here, from what I can see.
Again, I'm pretty sure you would see the same behavior with memset for the same test case/usage.

Related

Why doesn't the console show the entire output? [duplicate]

For the purpose of testing printf() call on device, I wrote a simple program which copies an array of moderate size to device and print the value of device array to screen. Although the array is correctly copied to device, the printf() function does not work correctly, which lost the first several hundred numbers. The array size in the code is 4096. Is this a bug or I'm not using this function properly? Thanks in adavnce.
EDIT: My gpu is GeForce GTX 550i, with compute capability 2.1
My code:
#include<stdio.h>
#include<stdlib.h>
#define N 4096
__global__ void Printcell(float *d_Array , int n){
int k = 0;
printf("\n=========== data of d_Array on device==============\n");
for( k = 0; k < n; k++ ){
printf("%f ", d_Array[k]);
if((k+1)%6 == 0) printf("\n");
}
printf("\n\nTotally %d elements has been printed", k);
}
int main(){
int i =0;
float Array[N] = {0}, rArray[N] = {0};
float *d_Array;
for(i=0;i<N;i++)
Array[i] = i;
cudaMalloc((void**)&d_Array, N*sizeof(float));
cudaMemcpy(d_Array, Array, N*sizeof(float), cudaMemcpyHostToDevice);
cudaDeviceSynchronize();
Printcell<<<1,1>>>(d_Array, N); //Print the device array by a kernel
cudaDeviceSynchronize();
/* Copy the device array back to host to see if it was correctly copied */
cudaMemcpy(rArray, d_Array, N*sizeof(float), cudaMemcpyDeviceToHost);
printf("\n\n");
for(i=0;i<N;i++){
printf("%f ", rArray[i]);
if((i+1)%6 == 0) printf("\n");
}
}
printf from the device has a limited queue. It's intended for small scale debug-style output, not large scale output.
referring to the programmer's guide:
The output buffer for printf() is set to a fixed size before kernel launch (see Associated Host-Side API). It is circular and if more output is produced during kernel execution than can fit in the buffer, older output is overwritten.
Your in-kernel printf output overran the buffer, and so the first printed elements were lost (overwritten) before the buffer was dumped into the standard I/O queue.
The linked documentation indicates that the buffer size can be increased, also.

Strange Cudamemcpy execution time

I'm currently working on a Cuda code which computes a simple difference pixel by pixel of two images (size: 2560x1706 px) in order to compare execution time of CPU and GPU.
I realize a "for" loop of 1000 iterations of my kernel to have a more significant execution time, and I perform the cudaMemcpy (from device to host) straight after the loop to retrieve the data computed.
Nevertheless, the execution time of this cudaMemcpy took 2800 ms which is higher than expected. I just was asking myself why I obtain such a result.
Here is my Kernel Code :
__global__ void diff (unsigned char *data1 ,unsigned char *data2, int *data_res)
{
int v = threadIdx.x + blockIdx.x*blockDim.x;
if (v < N)
{
data_res[v] = (int) data2[v] - (int) data1[v];
}
}
Here is the kernel calls :
cudaProfilerStart();
// Cuda allocation
cudaMalloc((void**)&dev_data1, N*sizeof(unsigned char));
cudaMalloc((void**)&dev_data2, N*sizeof(unsigned char));
cudaMalloc((void**)&dev_data_res, N*sizeof(int));
// Cuda memory copy
cudaMemcpy(dev_data1, img1->data, N*sizeof(unsigned char), cudaMemcpyHostToDevice);
cudaMemcpy(dev_data2, img2->data, N*sizeof(unsigned char), cudaMemcpyHostToDevice);
cudaMemcpy(dev_data_res, imgresult->data, N*sizeof(int), cudaMemcpyHostToDevice);
//Simulate nb_loops images
for(int m = 0; m < nb_loops ; m++)
{
diff<<<blck_nb, thrd_nb>>>(dev_data1, dev_data2, dev_data_res);
//printf("%4d", m);
}
printf("WAITING FOR MEMCPY...\n");
clock_t begin = clock(), diff;
cudaMemcpy(imgresult_data, dev_data_res, N*sizeof(int), cudaMemcpyDeviceToHost);
diff = clock() - begin;
float msec = diff*1000/CLOCKS_PER_SEC;
printf("\t \nTime of the MEMCPY : %2.3f ms\n", msec);
printf("MEMCPY DEVICE TO HOST OK!\n");
cudaProfilerStop();
And here is the screenshot of the execution time results :
CUDA kernel launches are asynchronous, and cudaMemcpy is a blocking call. So what you are calling memcpy time is really kernel execution + memcpy tiime. Change your code like this:
...
for(int m = 0; m < nb_loops ; m++)
{
diff<<<blck_nb, thrd_nb>>>(dev_data1, dev_data2, dev_data_res);
//printf("%4d", m);
}
cudaDeviceSynchronize();
printf("WAITING FOR MEMCPY...\n");
....
and this timing should be correct.

printf() in my CUDA kernel doesn't result produce any output

I have added some printf() statements in my CUDA program
__device__ __global__ void Kernel(float *, float * ,int );
void DeviceFunc(float *temp_h , int numvar , float *temp1_h)
{ .....
//Kernel call
printf("calling kernel\n");
Kernel<<<dimGrid , dimBlock>>>(a_d , b_d , numvar);
printf("kernel called\n");
....
}
int main(int argc , char **argv)
{ ....
printf("beforeDeviceFunc\n\n");
DeviceFunc(a_h , numvar , b_h); //Showing the data
printf("after DeviceFunc\n\n");
....
}
Also in the Kernel.cu, I wrote:
#include<cuda.h>
#include <stdio.h>
__device__ __global__ void Kernel(float *a_d , float *b_d ,int size)
{
int idx = threadIdx.x ;
int idy = threadIdx.y ;
//Allocating memory in the share memory of the device
__shared__ float temp[16][16];
//Copying the data to the shared memory
temp[idy][idx] = a_d[(idy * (size+1)) + idx] ;
printf("idx=%d, idy=%d, size=%d", idx, idy, size);
....
}
Then I compile using -arch=sm_20 like this:
nvcc -c -arch sm_20 main.cu
nvcc -c -arch sm_20 Kernel.cu
nvcc -arch sm_20 main.o Kernel.o -o main
Now when I run the program, I see:
beforeDeviceFunc
calling kernel
kernel called
after DeviceFunc
So the printf() inside the kernel is not printed. How can I fix that?
printf() output is only displayed if the kernel finishes successfully, so check the return codes of all CUDA function calls and make sure no errors are reported.
Furthermore printf() output is only displayed at certain points in the program. Appendix B.32.2 of the Programming Guide lists these as
Kernel launch via <<<>>> or cuLaunchKernel() (at the start of the launch, and if the CUDA_LAUNCH_BLOCKING environment variable is set to 1, at the end of the launch as well),
Synchronization via cudaDeviceSynchronize(), cuCtxSynchronize(), cudaStreamSynchronize(), cuStreamSynchronize(), cudaEventSynchronize(), or cuEventSynchronize(),
Memory copies via any blocking version of cudaMemcpy*() or cuMemcpy*(),
Module loading/unloading via cuModuleLoad() or cuModuleUnload(),
Context destruction via cudaDeviceReset() or cuCtxDestroy().
Prior to executing a stream callback added by cudaStreamAddCallback() or cuStreamAddCallback().
To check this is your problem, put the following code after your kernel invocation:
{
cudaError_t cudaerr = cudaDeviceSynchronize();
if (cudaerr != cudaSuccess)
printf("kernel launch failed with error \"%s\".\n",
cudaGetErrorString(cudaerr));
}
You should then see either the output of your kernel or an error message.
More conveniently, cuda-memcheck will automatically check all return codes for you if you run your executable under it. While you should always check for errors anyway, this comes handy when resolving concrete issues.
I had the same error just now and decreasing the block size to 512 helped. According to documentation maximum block size can be either 512 or 1024.
I have written a simple test that showed that my GTX 1070 has a maximum block size of 1024. UPD: you can check if your kernel has ever executed by using cudaError_t cudaPeekAtLastError() that returns cudaSuccess if the kernel has started successfully, and only after it is worse calling cudaError_t cudaDeviceSynchronize().
Testing block size of 1023
Testing block size of 1024
Testing block size of 1025
CUDA error: invalid configuration argument
Block maximum size is 1024
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
__global__
void set1(int* t)
{
t[threadIdx.x] = 1;
}
inline bool failed(cudaError_t error)
{
if (cudaSuccess == error)
return false;
fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(error));
return true;
}
int main()
{
int blockSize;
for (blockSize = 1; blockSize < 1 << 12; blockSize++)
{
printf("Testing block size of %d\n", blockSize);
int* t;
if(failed(cudaMallocManaged(&t, blockSize * sizeof(int))))
{
failed(cudaFree(t));
break;
}
for (int i = 0; i < blockSize; i++)
t[0] = 0;
set1 <<<1, blockSize>>> (t);
if (failed(cudaPeekAtLastError()))
{
failed(cudaFree(t));
break;
}
if (failed(cudaDeviceSynchronize()))
{
failed(cudaFree(t));
break;
}
bool hasError = false;
for (int i = 0; i < blockSize; i++)
if (1 != t[i])
{
printf("CUDA error: t[%d] = %d but not 1\n", i, t[i]);
hasError = true;
break;
}
if (hasError)
{
failed(cudaFree(t));
break;
}
failed(cudaFree(t));
}
blockSize--;
if(blockSize <= 0)
{
printf("CUDA error: block size cannot be 0\n");
return 1;
}
printf("Block maximum size is %d", blockSize);
return 0;
}
P.S. Please note, that the only thing in block sizing is warp granularity which is 32 nowadays, so if 0 == yourBlockSize % 32 the warps are used pretty efficiently. The only reason to make blocks bigger then 32 is when the code needs synchronization as synchronization is available only among threads in a single block which makes a developer to use a single large block instead of many small ones. So running with higher number of smaller blocks can be even more efficient than running with lower number of larger blocks.

Modulus computation of an array of cufftComplex data type in CUDA

I made a Dll file in visual C++ to compute modulus of an array of complex numbers in CUDA. The array is type of cufftComplex. I then called the Dll in LabVIEW to check the accuracy of the result. I'm receiving an incorrect result. Could anyone tell me what is wrong with the following code, please? I think there should be something wrong with my kernel function(the way I am retrieving the cufftComplex data should be incorrect).
#include <math.h>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cufft.h>
extern "C" __declspec(dllexport) void Modulus(cufftComplex *digits,float *result);
__global__ void ModulusComputation(cufftComplex *a, int N, float *temp)
{
int idx = blockIdx.x*blockDim.x + threadIdx.x;
if (idx<N)
{
temp[idx] = sqrt((a[idx].x * a[idx].x) + (a[idx].y * a[idx].y));
}
}
void Modulus(cufftComplex *digits,float *result)
{
#define N 1024
cufftComplex *d_data;
float *temp;
size_t size = sizeof(cufftComplex)*N;
cudaMalloc((void**)&d_data, size);
cudaMalloc((void**)&temp, sizeof(float)*N);
cudaMemcpy(d_data, digits, size, cudaMemcpyHostToDevice);
int blockSize = 16;
int nBlocks = N/blockSize;
if( N % blockSize != 0 )
nBlocks++;
ModulusComputation <<< nBlocks, blockSize >>> (d_data, N,temp);
cudaMemcpy(result, temp, size, cudaMemcpyDeviceToHost);
cudaFree(d_data);
cudaFree(temp);
}
In the final cudaMemcpy in your code, you have:
cudaMemcpy(result, temp, size, cudaMemcpyDeviceToHost);
It should be:
cudaMemcpy(result, temp, sizeof(float)*N, cudaMemcpyDeviceToHost);
If you had included error checking for your cuda calls, you would have seen this cuda call (as originally written) throw an error.
There's other comments that could be made. For example your block size (16) should be an integral multiple of 32. But this does not prevent proper operation.
After the kernel call, when copying back the result, you are using size as the memory size. The third argument of cudaMemcpy should be N * sizeof(float).

CUDA-GDB crashes in Kernel

I've been trying to debug my code, as I know something is going wrong in the Kernel, and I've been trying to figure out what specifically. If I try to step into the kernel it seems to completely step over the kernel functions, and will eventually cause an error on quitting:
Single stepping until exit from function dyld_stub_cudaSetupArgument,
which has no line number information.
[Launch of CUDA Kernel 0 (incrementArrayOnDevice<<<(3,1,1),(4,1,1)>>>) on
Device 0]
[Termination of CUDA Kernel 0 (incrementArrayOnDevice<<<(3,1,1),
(4,1,1)>>>) on Device 0]
[Launch of CUDA Kernel 1 (fillinBoth<<<(40,1,1),(1,1,1)>>>) on Device 0]
[Termination of CUDA Kernel 1 (fillinBoth<<<(40,1,1),(1,1,1)>>>) on Device 0]
add (below=0x124400, newtip=0x124430, newfork=0x125ac0) at test.cu:1223
And if I try to break in the Kernel my entire computer crashes and I have to restart it.
I figure there must be something wrong with the way I'm calling the kernel, but I can't figure out what.
The code is rather long, so I'm only including an excerpt of it:
__global__ void fillinOne(seqptr qset, long max) {
int i, j;
aas aa;
int idx = blockIdx.x;
__shared__ long qs[3];
if(idx < max)
{
memcpy(qs, qset[idx], sizeof(long[3]));
for (i = 0; i <= 1; i++)
{
for (aa = ala; (long)aa <= (long)stop; aa = (aas)((long)aa + 1))
{
if (((1L << ((long)aa)) & qs[i]) != 0)
{
for (j = i + 1; j <= 2; j++)
qs[j] |= cudaTranslate[(long)aa - (long)ala][j - i];
}
}
}
}
}
//Kernel for left!= NULL and rt != NULL
void fillin(node *p, node *left, node *rt)
{
cudaError_t err = cudaGetLastError();
size_t stepsize = chars * sizeof(long);
size_t sitesize = chars * sizeof(sitearray);
//int i, j;
if (left == NULL)
{
//copy rt->numsteps into p->numsteps--doesn't actually require CUDA, because no computation to do
memcpy(p->numsteps, rt->numsteps, stepsize);
checkCUDAError("memcpy");
//allocate siteset (array of sitearrays) on device
seqptr qsites; //as in array of qs's
cudaMalloc((void **) &qsites, sitesize);
checkCUDAError("malloc");
//copy rt->siteset into device array (equivalent to memcpy(qs, rs) but for whole array)
cudaMemcpy(qsites, rt->siteset, sitesize, cudaMemcpyHostToDevice);
checkCUDAError("memcpy");
//do loop in device
int block_size = 1; //each site operated on independently
int n_blocks = chars;
fillinOne <<< n_blocks, block_size>>> (qsites, chars);
cudaThreadSynchronize();
//put qset in p->siteset--equivalent to memcpy(p->siteset[m], qs)
cudaMemcpy(p->siteset, qsites, sitesize, cudaMemcpyDeviceToHost);
checkCUDAError("memcpy");
//Cleanup
cudaFree(qsites);
}
If anyone has any ideas at all, please resond! Thanks in advance!
I suppose you have a single card configuration. When you are debugging a cuda kernel and you break inside it you effectively put the display driver in pause. That causes what you think is a crash. If you want to use the cuda-gdb with only one graphics card you must use it in command line mode (don't start X or press ctrl-alt-fn from X).
If you have two cards you must run the code in the card not running the display. Use cudaSelectDevice(n).