It is strange that when I run ./main without cuda-memcheck, the program finishes without any warning or error message; however, when I run it under cuda-memcheck, I get error messages like the following.
========= Invalid __global__ write of size 8
========= at 0x00000120 in initCurand(curandStateXORWOW*, unsigned long)
========= by thread (9,0,0) in block (3,0,0)
========= Address 0x5005413b0 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x2c5) [0x204115]
========= Host Frame:./main [0x18e11]
========= Host Frame:./main [0x369b3]
========= Host Frame:./main [0x3403]
========= Host Frame:./main [0x308c]
========= Host Frame:./main [0x30b7]
========= Host Frame:./main [0x2ebb]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf0) [0x20830]
Here are my functions, with a brief introduction to the code: I try to generate random numbers and save them to a device variable weights, then use this vector of weights to sample from a discrete distribution.
#include<iostream>
#include<curand.h>
#include<curand_kernel.h>
#include<time.h>
using namespace std;
#define num 100
__device__ float weights[num];
// function to define seed
__global__ void initCurand(curandState *state, unsigned long seed){
int idx = threadIdx.x + blockIdx.x * blockDim.x;
curand_init(seed, idx, 0, &state[idx]);
}
__device__ void sampling(float *weight, float max_weight, int *index, curandState *state){
int j;
float u;
do{
j = (int)(curand_uniform(state) * (num + 0.999999));
u = curand_uniform(state); //sample from uniform distribution;
}while( u > weight[j]/max_weight);
*index = j;
}
__global__ void test(int *dev_sample, curandState *state){
int idx = threadIdx.x + blockIdx.x * blockDim.x;
// generate random numbers from uniform distribution and save them to weights
weights[idx] = curand_uniform(&state[idx]);
// run sampling function, in which, weights is an input for the function on each thread
sampling(weights, 1, dev_sample+idx, &state[idx]);
}
int main(){
// define the seed of random generator
curandState *devState;
cudaMalloc((void**)&devState, num*sizeof(curandState));
int *h_sample;
h_sample = (int*) malloc(num*sizeof(int));
int *d_sample;
cudaMalloc((void**)&d_sample, num*sizeof(float));
initCurand<<<(int)num/32 + 1, 32>>>(devState, 1);
test<<<(int)num/32 + 1, 32>>>(d_sample, devState);
cudaMemcpy(h_sample, d_sample, num*sizeof(float), cudaMemcpyDeviceToHost);
for (int i = 0; i < num; ++i)
{
cout << *(h_sample + i) << endl;
}
//free memory
cudaFree(devState);
free(h_sample);
cudaFree(d_sample);
return 0;
}
I have just started to learn CUDA, so if my method of accessing global memory is incorrect, please help me with that. Thanks.
This is launching "extra" threads:
initCurand<<<(int)num/32 + 1, 32>>>(devState, 1);
num is 100, so the above config will launch 4 blocks of 32 threads each, i.e. 128 threads. But you are only allocating space for 100 curandState here:
cudaMalloc((void**)&devState, num*sizeof(curandState));
So your initCurand kernel will have some threads (idx = 100-127) that attempt to initialize curandState objects that you haven't allocated. As a result, when you run cuda-memcheck, which does fairly rigorous out-of-bounds checking, an error is reported.
One possible solution would be to modify your initCurand kernel as follows:
__global__ void initCurand(curandState *state, unsigned long seed, int num){
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < num)
curand_init(seed, idx, 0, &state[idx]);
}
This will prevent any out-of-bounds threads from doing anything. Note that you will need to modify the kernel call to pass num to it. It also appears to me that you have a similar problem in your test kernel, and you may want to apply the same fix there; a sketch is shown below. This is a common construct in CUDA kernels; I call it a "thread check". You can find other questions on the SO tag discussing this same concept.
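For illustration, here is a minimal sketch of the same thread check applied to your test kernel (the count parameter is named n here only to avoid colliding with your num macro):

__global__ void test(int *dev_sample, curandState *state, int n){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < n){ // thread check: out-of-range threads do nothing
        weights[idx] = curand_uniform(&state[idx]);
        sampling(weights, 1, dev_sample + idx, &state[idx]);
    }
}

The corresponding launch would then become test<<<num/32 + 1, 32>>>(d_sample, devState, num);.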
Given a 64-bit variable in a register in a kernel, say std::uint64_t var, I want to do some calculations and then set each bit of this variable using 64 different threads, separately but in parallel. Is it possible to write to a shared variable in parallel?
__shared__ std::uint64_t var = 0
in each thread (tid = 0 to 63):
do some calculations
if we should set the bit with index = tid then:
var |= ((std::uint64_t) 1 << tid)
I also realized that using atomicOr does not benefit us as it only works on integers.
You can do this using a 64-bit shared atomicOr():
$ cat t2082.cu
#include <cstdio>
#include <cstdint>
__global__ void k(){
__shared__ unsigned long long var;
if (!threadIdx.x) var = 0;
__syncthreads();
atomicOr(&var, 1ULL<<threadIdx.x);
__syncthreads();
if (!threadIdx.x) printf("0x%llx\n", var);
}
int main(){
k<<<1,64>>>();
cudaDeviceSynchronize();
}
$ nvcc -o t2082 t2082.cu
$ compute-sanitizer ./t2082
========= COMPUTE-SANITIZER
0xffffffffffffffff
========= ERROR SUMMARY: 0 errors
$
The 64-bit shared atomicOr is supported on devices of cc3.5 or greater, which is the same device footprint supported by CUDA 11.
If you were, for example, on CUDA 10.x and using a cc3.0 device, you could do this with two 32-bit variables:
$ cat t2082.cu
#include <cstdio>
#include <cstdint>
__global__ void k(){
__shared__ unsigned var[2];
if (!threadIdx.x) {var[0] = 0; var[1] = 0;}
__syncthreads();
if (threadIdx.x < 32)
atomicOr(&(var[0]), 1u<<threadIdx.x);
else
atomicOr(&(var[1]), 1u<<(threadIdx.x-32));
__syncthreads();
unsigned long long var64 = (((unsigned long long)var[1])<<32) + var[0];
if (!threadIdx.x) printf("0x%llx\n", var64);
}
int main(){
k<<<1,64>>>();
cudaDeviceSynchronize();
}
$ nvcc -o t2082 t2082.cu
$ compute-sanitizer ./t2082
========= COMPUTE-SANITIZER
0xffffffffffffffff
========= ERROR SUMMARY: 0 errors
$
I came across this sample code from one of my colleagues, in which cudaMemset doesn't seem to work properly when run on a V100.
#include <iostream>
#include <stdio.h>
#define CUDACHECK(cmd) \
{\
cudaError_t error = cmd;\
if (error != cudaSuccess) { \
fprintf(stderr, "info: '%s'(%d) at %s:%d\n", cudaGetErrorString(error), error,__FILE__, __LINE__);\
}\
}
__global__ void setValue(int value, int* A_d) {
int tx = threadIdx.x + blockIdx.x * blockDim.x;
if(tx == 0){
A_d[tx] = A_d[tx] + value;
}
}
__global__ void printValue(int* A_d) {
int tx = threadIdx.x + blockIdx.x * blockDim.x;
if(tx == 0){
printf("A_d: %d\n", A_d[tx]);
}
}
int main(int argc, char* argv[ ]){
int *A_h, *A_d;
int size = sizeof(int);
A_h = (int*)malloc(size);
A_h[0] = 1;
CUDACHECK(cudaSetDevice(0));
CUDACHECK(cudaHostRegister(A_h, size, 0));
CUDACHECK(cudaHostGetDevicePointer((void**)&A_d, A_h, 0));
setValue<<<64,1,0,0>>>(5, A_d);
cudaDeviceSynchronize();
printf("A_h: %d\n", A_h[0]);
A_h[0] = 100;
printf("A_h: %d\n",A_h[0]);
printValue<<<64,1,0,0>>>(A_d);
cudaDeviceSynchronize();
CUDACHECK (cudaMemset(A_d, 1, size) );
printf("A_h: %d\n",A_h[0]);
printValue<<<64,1,0,0>>>(A_d);
cudaDeviceSynchronize();
cudaHostUnregister(A_h);
free(A_h);
}
When this sample is compiled and run, the output is seen as below.
/usr/local/cuda-11.0/bin/nvcc memsettest.cu -o test
./test
A_h: 6
A_h: 100
A_d: 100
A_h: 16843009
A_d: 16843009
We expect A_h and A_d to be set to 1 by cudaMemset, but as seen above they are set to some huge value.
So, is cudaMemset expected to work on the device pointer A_d returned by cudaHostGetDevicePointer?
Is this A_d expected to be used only in kernels?
We also see that cudaMemcpy DtoH and HtoD seem to work on the same device pointer A_d.
Can someone explain the correct behavior?
We expect A_h and A_d to be set to 1 with cudaMemset.
You're confused about how cudaMemset works. Conceptually, it is very similar to memset from the C standard library. You should try your same test case with memset and see what it does.
Anyway, cudaMemset takes a pointer, a byte value, and a size in bytes to set, just like memset.
So your cudaMemset command:
CUDACHECK (cudaMemset(A_d, 1, size) );
is setting each byte to 1. Since size is 4, that means you are setting A_d[0] to 0x01010101 (in hexadecimal). If you plug that value into the Windows programmer calculator, it is 16843009 in decimal. So everything is working as expected here, from what I can see.
Again, I'm pretty sure you would see the same behavior with memset for the same test case/usage; a small host-only sketch follows.
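For reference, here is a minimal host-only sketch (plain C, no CUDA involved) showing the identical byte-wise behavior of memset:

#include <stdio.h>
#include <string.h>
int main(){
    int a = 0;
    memset(&a, 1, sizeof(a)); // set each of the 4 bytes of a to 0x01
    printf("%d (0x%x)\n", a, a); // prints 16843009 (0x1010101)
    return 0;
}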
I am trying to apply a kernel function to a __device__ variable which, according to the specs, resides "in global memory":
#include <stdio.h>
#include "sys_data.h"
#include "my_helper.cuh"
#include "helper_cuda.h"
#include <cuda_runtime.h>
double X[10] = {1,-2,3,-4,5,-6,7,-8,9,-10};
double Y[10] = {0};
__device__ double DEV_X[10];
int main(void) {
checkCudaErrors(cudaMemcpyToSymbol(DEV_X, X,10*sizeof(double)));
vector_projection<double><<<1,10>>>(DEV_X, 10);
getLastCudaError("oops");
checkCudaErrors(cudaMemcpyFromSymbol(Y, DEV_X, 10*sizeof(double)));
return 0;
}
The kernel function vector_projection is defined in my_helper.cuh as follows:
template<typename T> __global__ void vector_projection(T *dx, int n) {
int tid;
tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < n) {
if (dx[tid] < 0)
dx[tid] = (T) 0;
}
}
As you can see, I use cudaMemcpyToSymbol and cudaMemcpyFromSymbol to transfer data to and from the device. However, I'm getting the following error:
CUDA error at ../src/vectorAdd.cu:19 code=4(cudaErrorLaunchFailure)
"cudaMemcpyFromSymbol(Y, DEV_X, 10*sizeof(double))"
Footnote: I can of course avoid using __device__ variables and go for something like this, which works fine; I just want to see how to do the same thing (if possible) with __device__ variables.
Update: The output of cuda-memcheck can be found at http://pastebin.com/AW9vmjFs. The error messages I get are as follows:
========= Invalid __global__ read of size 8
========= at 0x000000c8 in /home/ubuntu/Test0001/Debug/../src/my_helper.cuh:75:void vector_projection<double>(double*, int)
========= by thread (9,0,0) in block (0,0,0)
========= Address 0x000370e8 is out of bounds
The root of the problem is that you are not allowed to take the address of a device variable in ordinary host code:
vector_projection<double><<<1,10>>>(DEV_X, 10);
Although this seems to compile correctly, the actual address passed (that of DEV_X) is garbage.
To take the address of a device variable in host code, we can use cudaGetSymbolAddress.
Here is a worked example that compiles and runs correctly for me:
$ cat t577.cu
#include <stdio.h>
double X[10] = {1,-2,3,-4,5,-6,7,-8,9,-10};
double Y[10] = {0};
__device__ double DEV_X[10];
template<typename T> __global__ void vector_projection(T *dx, int n) {
int tid;
tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < n) {
if (dx[tid] < 0)
dx[tid] = (T) 0;
}
}
int main(void) {
cudaMemcpyToSymbol(DEV_X, X,10*sizeof(double));
double *my_dx;
cudaGetSymbolAddress((void **)&my_dx, DEV_X);
vector_projection<double><<<1,10>>>(my_dx, 10);
cudaMemcpyFromSymbol(Y, DEV_X, 10*sizeof(double));
for (int i = 0; i < 10; i++)
printf("%d: %f\n", i, Y[i]);
return 0;
}
$ nvcc -arch=sm_35 -o t577 t577.cu
$ cuda-memcheck ./t577
========= CUDA-MEMCHECK
0: 1.000000
1: 0.000000
2: 3.000000
3: 0.000000
4: 5.000000
5: 0.000000
6: 7.000000
7: 0.000000
8: 9.000000
9: 0.000000
========= ERROR SUMMARY: 0 errors
$
This is not the only way to address this. It is legal to take the address of a device variable in device code, so you could modify your kernel with a line something like this:
T *dx = DEV_X;
and forgo passing the device variable as a kernel parameter. As suggested in the comments, you could also modify your code to use Unified Memory; a sketch of that approach follows.
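For example, a minimal sketch of the Unified Memory approach (assuming a device and CUDA version that support managed memory, and the same vector_projection kernel as above) might look like this:

#include <stdio.h>
__device__ __managed__ double DEV_X[10];
int main(void) {
    double init[10] = {1,-2,3,-4,5,-6,7,-8,9,-10};
    for (int i = 0; i < 10; i++) DEV_X[i] = init[i]; // host code may touch __managed__ data directly
    vector_projection<double><<<1,10>>>(DEV_X, 10); // taking this address is legal for a __managed__ variable
    cudaDeviceSynchronize(); // required before the host reads the results back
    for (int i = 0; i < 10; i++) printf("%d: %f\n", i, DEV_X[i]);
    return 0;
}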
Regarding error checking: if you deviate from proper cuda error checking and are not careful in your deviations, the results may be confusing. Most cuda API calls can, in addition to errors arising from their own behavior, return an error that resulted from some previous asynchronous CUDA activity (usually kernel calls). A minimal checking pattern is sketched below.
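For example, one minimal pattern that distinguishes launch errors from asynchronous execution errors (a sketch; the bare printf reporting is just for brevity):

vector_projection<double><<<1,10>>>(my_dx, 10);
cudaError_t err = cudaGetLastError(); // catches invalid launch configurations
if (err != cudaSuccess) printf("launch error: %s\n", cudaGetErrorString(err));
err = cudaDeviceSynchronize(); // catches errors from the kernel's actual execution
if (err != cudaSuccess) printf("execution error: %s\n", cudaGetErrorString(err));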
First off, I'm fairly new to CUDA programming so I apologize for such a simple question. I have researched the best way to determine dimGrid and dimBlock in my GPU kernel call and for some reason I'm not quite getting it to work.
On my home PC, I have a GeForce GTX 580 (Compute Capability 2.0), with 1024 threads per block, etc. I can get my code to run properly on this PC. My GPU populates a distance array of size 988*988. Here is part of the code:
#define SIZE 988
__global__ void createDistanceTable(double *d_distances, double *d_coordinates)
{
int col = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;
if(row < SIZE && col < SIZE)
d_distances[row * SIZE + col] =
acos(__sinf(d_coordinates[row * 2 + 0])*
__sinf(d_coordinates[col * 2 + 0])+__cosf(d_coordinates[row * 2 + 0])*
__cosf(d_coordinates[col * 2 + 0])*__cosf(d_coordinates[col * 2 + 1]-
d_coordinates[row * 2 + 1]))*6371;
}
Kernel call in main:
dim3 dimBlock(32,32,1);
dim3 dimGrid(32,32,1);
createDistanceTable<<<dimGrid, dimBlock>>>(d_distances, d_coordinates);
My issue is that I simply have not found a way to get this code to run properly on my laptop. My laptop's GPU is a GeForce 9600M GT (Compute Capability 1.1), with 512 threads per block, etc. I would greatly appreciate any guidance in helping me understand how I should approach dimBlock and dimGrid for my kernel call on my laptop. Thanks for any advice!
Several things were wrong in your code.
Using double-precision on CC < 1.3.
The size of your thread blocks: as you said, CC <= 1.3 means 512 threads max per block, and you used 1024 threads per block. I guess you could use __CUDA_ARCH__ if you do need some multi-architecture code (see the sketch after this list).
No error checking or memory checking (cuda-memcheck). You may allocate more memory than you have, or use more threads/blocks than your GPU can handle, and you will not detect it.
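As a rough sketch of the __CUDA_ARCH__ idea (a hypothetical kernel, just to show the mechanism):

__global__ void k(double *out)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 130)
    out[0] = (double)(1.0f / 3.0f); // cc < 1.3: no native double support, compute in float
#else
    out[0] = 1.0 / 3.0; // cc >= 1.3: true double-precision arithmetic
#endif
}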
Consider the following example based on your code (I am using float instead of double):
#include <cuda.h>
#include <stdio.h> // printf
#define SIZE 988
#define GRID_SIZE 32
#define BLOCK_SIZE 16 // set to 16 instead of 32 for instance
#define CUDA_CHECK_ERROR() __cuda_check_errors(__FILE__, __LINE__)
#define CUDA_SAFE_CALL(err) __cuda_safe_call(err, __FILE__, __LINE__)
// See: http://codeyarns.com/2011/03/02/how-to-do-error-checking-in-cuda/
inline void
__cuda_check_errors (const char *filename, const int line_number)
{
cudaError err = cudaDeviceSynchronize ();
if (err != cudaSuccess)
{
printf ("CUDA error %i at %s:%i: %s\n",
err, filename, line_number, cudaGetErrorString (err));
exit (-1);
}
}
inline void
__cuda_safe_call (cudaError err, const char *filename, const int line_number)
{
if (err != cudaSuccess)
{
printf ("CUDA error %i at %s:%i: %s\n",
err, filename, line_number, cudaGetErrorString (err));
exit (-1);
}
}
__global__ void
createDistanceTable (float *d_distances, float *d_coordinates)
{
int col = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;
if (row < SIZE && col < SIZE)
d_distances[row * SIZE + col] =
acos (__sinf (d_coordinates[row * 2 + 0]) *
__sinf (d_coordinates[col * 2 + 0]) +
__cosf (d_coordinates[row * 2 + 0]) *
__cosf (d_coordinates[col * 2 + 0]) *
__cosf (d_coordinates[col * 2 + 1] -
d_coordinates[row * 2 + 1])) * 6371;
}
int
main ()
{
float *d_distances;
float *d_coordinates;
CUDA_SAFE_CALL (cudaMalloc (&d_distances, SIZE * SIZE * sizeof (float)));
CUDA_SAFE_CALL (cudaMalloc (&d_coordinates, SIZE * SIZE * sizeof (float)));
dim3 dimGrid (GRID_SIZE, GRID_SIZE);
dim3 dimBlock (BLOCK_SIZE, BLOCK_SIZE);
createDistanceTable <<< dimGrid, dimBlock >>> (d_distances, d_coordinates);
CUDA_CHECK_ERROR ();
CUDA_SAFE_CALL (cudaFree (d_distances));
CUDA_SAFE_CALL (cudaFree (d_coordinates));
}
Compilation command (change architecture accordingly):
nvcc prog.cu -g -G -lineinfo -gencode arch=compute_11,code=sm_11 -o prog
With 32x32 block on CC 2.0 or 16x16 on CC 1.1:
cuda-memcheck ./prog
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
With 33x33 block on CC 2.0 or 32x32 block on CC 1.1:
cuda-memcheck ./prog
========= CUDA-MEMCHECK
========= Program hit error 9 on CUDA API call to cudaLaunch
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/nvidia-current-updates/libcuda.so [0x26a230]
========= Host Frame:/opt/cuda/lib64/libcudart.so.5.0 (cudaLaunch + 0x242) [0x2f592]
========= Host Frame:./prog [0xc76]
========= Host Frame:./prog [0xa99]
========= Host Frame:./prog [0xac4]
========= Host Frame:./prog [0x9d1]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xed) [0x2176d]
========= Host Frame:./prog [0x859]
=========
========= ERROR SUMMARY: 1 error
Error 9:
/**
* This indicates that a kernel launch is requesting resources that can
* never be satisfied by the current device. Requesting more shared memory
* per block than the device supports will trigger this error, as will
* requesting too many threads or blocks. See ::cudaDeviceProp for more
* device limitations.
*/ cudaErrorInvalidConfiguration = 9,
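To avoid hard-coding such limits, you can also query them at runtime; a sketch, reusing the CUDA_SAFE_CALL macro from the example above:

cudaDeviceProp prop;
CUDA_SAFE_CALL (cudaGetDeviceProperties (&prop, 0));
printf ("cc %d.%d, max threads per block: %d\n",
        prop.major, prop.minor, prop.maxThreadsPerBlock);
// e.g. 1024 on your CC 2.0 GPU, 512 on the CC 1.1 laptop GPU;
// pick BLOCK_SIZE so that BLOCK_SIZE*BLOCK_SIZE <= prop.maxThreadsPerBlock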
In cuBLAS, cublasIsamin() gives the argmin (by absolute value, with 1-based indexing) for a single-precision array.
Here's the full function declaration:
cublasStatus_t cublasIsamin(cublasHandle_t handle, int n,
                            const float *x, int incx, int *result)
The cuBLAS programmer guide's parameter table for cublasIsamin() states that result is an output that can reside in either host or device memory.
If I use host (CPU) memory for result, then cublasIsamin works properly. Here's an example:
void argmin_experiment_hostOutput(){
float h_A[4] = {1, 2, 3, 4}; int N = 4;
float* d_A = 0;
CHECK_CUDART(cudaMalloc((void**)&d_A, N * sizeof(d_A[0])));
CHECK_CUBLAS(cublasSetVector(N, sizeof(h_A[0]), h_A, 1, d_A, 1));
cublasHandle_t handle; CHECK_CUBLAS(cublasCreate(&handle));
int result; //host memory
CHECK_CUBLAS(cublasIsamin(handle, N, d_A, 1, &result));
printf("argmin = %d, min = %f \n", result, h_A[result]);
CHECK_CUBLAS(cublasDestroy(handle));
}
However, if I use device (GPU) memory for result, then cublasIsamin segfaults. Here's an example that segfaults:
void argmin_experiment_deviceOutput(){
float h_A[4] = {1, 2, 3, 4}; int N = 4;
float* d_A = 0;
CHECK_CUDART(cudaMalloc((void**)&d_A, N * sizeof(d_A[0])));
CHECK_CUBLAS(cublasSetVector(N, sizeof(h_A[0]), h_A, 1, d_A, 1));
cublasHandle_t handle; CHECK_CUBLAS(cublasCreate(&handle));
int* d_result = 0;
CHECK_CUDART(cudaMalloc((void**)&d_result, 1 * sizeof(d_result[0]))); //just enough device memory for 1 result
CHECK_CUDART(cudaMemset(d_result, 0, 1 * sizeof(d_result[0])));
CHECK_CUBLAS(cublasIsamin(handle, N, d_A, 1, d_result)); //SEGFAULT!
CHECK_CUBLAS(cublasDestroy(handle));
}
The Nvidia guide says that `cublasIsamin()` can output to device memory. What am I doing wrong?
Motivation: I want to compute the argmin() of several vectors concurrently in multiple streams. Outputting to host memory requires CPU-GPU synchronization and seems to kill the multi-kernel concurrency. So, I want to output the argmin to device memory instead.
The CUBLAS V2 API does support writing scalar results to device memory. But it doesn't support this by default. As per Section 2.4 "Scalar parameters" of the documentation, you need to use cublasSetPointerMode() to make the API aware that scalar argument pointers will reside in device memory. Note this also makes these level 1 BLAS functions asynchronous, so you must ensure that the GPU has completed the kernel(s) before trying to access the result pointer.
See this answer for a complete working example.
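As a minimal sketch of the change to the device-output function above (only the lines around the cuBLAS call change; the allocations stay the same):

CHECK_CUBLAS(cublasCreate(&handle));
CHECK_CUBLAS(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE)); // scalar result pointers now live in device memory
CHECK_CUBLAS(cublasIsamin(handle, N, d_A, 1, d_result)); // asynchronous; writes the 1-based index to d_result
CHECK_CUDART(cudaDeviceSynchronize()); // ensure the result is ready before reading it back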