CUDA memory troubles - cuda

I have a CUDA kernel which I'm compiling to a cubin file without any special flags:
nvcc text.cu -cubin
It compiles, though with this message:
Advisory: Cannot tell what pointer points to, assuming global memory space
and a reference to a line in some temporary cpp file. I can get this to work by commenting out some seemingly arbitrary code which makes no sense to me.
The kernel is as follows:
/*
 * Counts the occurrences of `symbol` in one text per thread block.
 *
 * Block `blockId` scans texts[blockId] (lengths[blockId] characters) and
 * writes the number of positions at which `symbol` (symbolLength characters)
 * matches into matches[blockId].
 *
 * Candidate start positions are split between the threads of the block:
 * worker t examines positions t, t + workers, t + 2 * workers, ...
 * At most 32 threads take part because the per-block scratch array has 32
 * slots; any further threads in the block only take part in the barrier.
 */
__global__ void string_search(char** texts, int* lengths, char* symbol, int* matches, int symbolLength)
{
    const int kMaxWorkers = 32;
    __shared__ int localMatchCounts[kMaxWorkers];

    const int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    const int threadId = threadIdx.x + threadIdx.y * blockDim.x;
    const int blockThreads = blockDim.x * blockDim.y;

    // Never use more workers than there are scratch slots, and never sum
    // slots that no thread has written.
    const int workers = (blockThreads < kMaxWorkers) ? blockThreads : kMaxWorkers;

    const char* text = texts[blockId];
    // Number of start positions at which the whole symbol still fits.
    // An empty symbol is treated as matching nowhere.
    const int startCount = (symbolLength > 0) ? lengths[blockId] - (symbolLength - 1) : 0;

    if (threadId < workers)
    {
        int localMatches = 0;
        // Start at this thread's own offset so that the workers cover
        // disjoint positions instead of all re-scanning the same ones.
        for (int i = threadId; i < startCount; i += workers)
        {
            // The match flag is per position; it must not leak into the
            // next iteration.
            bool matched = true;
            for (int j = 0; j < symbolLength; j++)
            {
                if (text[i + j] != symbol[j])
                {
                    matched = false;
                    break;
                }
            }
            if (matched)
                localMatches++;
        }
        localMatchCounts[threadId] = localMatches;
    }

    // Reached by every thread of the block, including non-workers.
    __syncthreads();

    if (threadId == 0)
    {
        int sum = 0;
        for (int w = 0; w < workers; w++)
        {
            sum += localMatchCounts[w];
        }
        matches[blockId] = sum;
    }
}
If I replace the line
localMatchCounts[threadId] = localMatches;
after the first for loop with this line
localMatchCounts[threadId] = 5;
it compiles with no notices. This can also be achieved by commenting out seemingly random parts of the loop above the line. I have also tried replacing the local memory array with a normal array to no effect. Can anyone tell me what the problem is?
The system is Vista 64-bit, for what it's worth.
Edit: I fixed the code so it actually works, though it still produces the compiler notice. It does not seem as though the warning is a problem, at least with regards to correctness (it might affect performance).

Arrays of pointers like char** are problematic in kernels, since the kernels have no access to the host's memory.
It is better to allocate a single continuous buffer and to divide it in a manner that enables parallel access.
In this case I'd define a 1D array which contains all the strings positioned one after another, and another 1D array, sized 2*numberOfStrings, which contains the offset of each string within the first array and its length:
For example - preparation for kernel:
char* buffer = st[0] + st[1] + st[2] + ....;
int* metadata = new int[numberOfStrings * 2];
int lastpos = 0;
for (int cnt = 0; cnt < 2* numberOfStrings; cnt+=2)
{
metadata[cnt] = lastpos;
lastpos += length(st[cnt]);
metadata[cnt] = length(st[cnt]);
}
In kernel:
// Flat global thread index: the block's flat id times the number of threads
// per block, plus the thread's flat id inside its block.
currentIndex = threadId + blockId * (blockDim.x * blockDim.y);
// Each string is described by an (offset, length) pair in metadata.
char* currentString = buffer + metadata[2 * currentIndex];
int currentStringLength = metadata[2 * currentIndex + 1];

The problem seems to be associated with the char** parameter. Turning this into a char* solved the warning, so I suspect that cuda might have problems with this form of data. Perhaps cuda prefers that one uses the specific cuda 2D arrays in this case.

Related

Cuda printf() overlapping when using multiple devices

I have a printf in my __global__ code. It works as intended most of the time. However, when using a multi-GPU system (it typically happens when run on a 4-8 GPU system), once in a while the prints will merge. By "once in a while" I mean about 100-500 lines out of 167000 lines.
I was wondering how this situation can be remedied without adding too much overhead from transferring the data back to the host (if possible). I was thinking of trying a mutex lock for printing, but I don't think that sort of thing exists for use in the kernel. Are there any other solutions I could try?
Note: The actual kernel is a long running kernel usually around 20-50 minutes to complete depending on the GPU.
Note2: I barely know what I'm doing with C/C++.
Example of merged Output
JmHp8rwXAw,031aa97714c800de47971829beded204000cfcf5e0f3775552ccf3e9b387869fxLuZJu3ZkX
qVOuKlQ0ZcMrhGXAnZ75,08bf3e90a57c31b7f355214cdf442748d9ff6ae1d49a96f7a8b9e3c86bd8e68a,5231a9e969d53c64f75bb1f07b1c95bb81f685744ed46f56348c733389c56ca5
,623f62b3198c8b62cd7a3b3cf8bf8ede5f9bfdccb7c1dc48a55530c7d5f59ce8
What it should look like
JmHp8rwXAw,031aa97714c800de47971829beded204000cfcf5e0f3775552ccf3e9b387869f
MrhGXAnZ75,08bf3e90a57c31b7f355214cdf442748d9ff6ae1d49a96f7a8b9e3c86bd8e68a
qVOuKlQ0Zc,5231a9e969d53c64f75bb1f07b1c95bb81f685744ed46f56348c733389c56ca5
xLuZJu3ZkX,623f62b3198c8b62cd7a3b3cf8bf8ede5f9bfdccb7c1dc48a55530c7d5f59ce8
My Example Code:
/* Number of blocks and threads per block used for the kernel launch. */
#define BLOCKS 384
#define THREADS 64
/* Argument handed to each handler thread; `device` is passed to cudaSetDevice(). */
typedef struct HandlerInput {
unsigned char device;
} HandlerInput;
/* Mutex that main() locks before joining the handler threads. */
pthread_mutex_t solutionLock;
/*
 * Each thread derives a seed from its global index plus baseSeed, generates a
 * random string from it, hashes that string, and prints "random,hash" as one
 * line with device-side printf.
 */
__global__ void kernel(unsigned long baseSeed) {
    BYTE randomStr[RANDOM_LEN];
    BYTE digest[DIGEST_LEN];
    SHA256_CTX shaCtx;

    const int globalThread = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned long threadSeed = (unsigned long)globalThread + baseSeed;

    /* Fill randomStr from the per-thread seed. */
    d_getRandomString(threadSeed, randomStr);

    /* Hash RANDOM_LEN bytes of randomStr into digest. */
    sha256_hash(&shaCtx, randomStr, digest, RANDOM_LEN);

    /* Emit one "randomStr,hash" line. */
    printf("%s,%s\n", randomStr, digest);
}
/*
 * pthread entry point that drives one GPU.
 *
 * vargp points to a heap-allocated HandlerInput naming the device to use;
 * this thread takes ownership of it and frees it. The thread then launches
 * `kernel` repeatedly with a fresh seed until a CUDA call reports an error,
 * resets the device and returns NULL.
 */
void *launchGPUHandlerThread(void *vargp) {
    HandlerInput *hi = (HandlerInput *)vargp;
    const int device = hi->device;
    /* main() allocates one HandlerInput per thread and never touches it again. */
    free(hi);

    if (cudaSetDevice(device) != cudaSuccess)
        return NULL;

    unsigned long rngSeed = timeus();
    while (1) {
        hostRandomGen(&rngSeed);
        kernel<<<BLOCKS, THREADS>>>(rngSeed);
        /* A bad launch configuration is only visible through cudaGetLastError(). */
        if (cudaGetLastError() != cudaSuccess)
            break;
        /* Faults raised while the kernel runs surface at the next synchronisation. */
        if (cudaDeviceSynchronize() != cudaSuccess)
            break;
    }
    cudaDeviceReset();
    return NULL;
}
/*
 * Starts one handler thread per CUDA device and waits for all of them.
 *
 * Returns 0 after the started threads have been joined, or 1 when no device
 * is available or the thread-id array cannot be allocated.
 */
int main() {
    int GPUS = 0;
    /* On failure GPUS would otherwise be used uninitialised below. */
    if (cudaGetDeviceCount(&GPUS) != cudaSuccess || GPUS <= 0)
        return 1;

    pthread_t *tids = (pthread_t *)malloc(sizeof(pthread_t) * GPUS);
    if (tids == NULL)
        return 1;

    /* The mutex must be initialised before it is locked. */
    pthread_mutex_init(&solutionLock, NULL);

    int started = 0;
    for (int i = 0; i < GPUS; i++) {
        HandlerInput *hi = (HandlerInput *)malloc(sizeof(HandlerInput));
        if (hi == NULL)
            break;
        hi->device = i;
        if (pthread_create(tids + started, NULL, launchGPUHandlerThread, hi) != 0) {
            free(hi);
            break;
        }
        started++;
        usleep(23);
    }

    pthread_mutex_lock(&solutionLock);
    /* Only join threads that were actually created. */
    for (int i = 0; i < started; i++)
        pthread_join(tids[i], NULL);
    pthread_mutex_unlock(&solutionLock);

    pthread_mutex_destroy(&solutionLock);
    free(tids);
    return 0;
}
I spent 4 days trying different things to no avail. I really don't understand memory management enough in C/C++ to get past the endless segmentation fault errors.
What I ended up doing was using Unified Memory as it seemed the easiest way to handle the memory for both device and host and it doesn't seem to add too much overhead to the whole process. Then each cpu thread (gpu) can write to its own file. I ran a couple of nvprof and it seemed that after the initial setup for the memory cudaMallocManaged the rest of the overhead seemed to be measured in the microseconds. Since each loop takes 20 minutes these are really barely noticeable.
I created two __device__ functions to copy the data over to the host accessible arrays, because I wanted to utilize the #pragma unroll feature. Not really sure if that helps or what it even does, but I decided to do things this way.
If anyone has further suggestions on ways to improve I am open to trying more things out.
Here is my new example code:
/* Number of blocks and threads per block used for the kernel launch. */
#define BLOCKS 384
#define THREADS 64
/* Argument handed to each handler thread; `device` is passed to cudaSetDevice(). */
typedef struct HandlerInput {
unsigned char device;
} HandlerInput;
/*
 * Copies the first 64 bytes of src to dst and writes a terminating '\0' at
 * dst[64], so dst must have room for at least 65 bytes.
 *
 * __restrict__ qualifies the pointers themselves (it is written after the
 * '*'): dst and src must not overlap.
 */
__device__ void mycpydigest(BYTE *__restrict__ dst, const BYTE *__restrict__ src) {
#pragma unroll 64
    for (int i = 0; i < 64; i++) {
        dst[i] = src[i];
    }
    dst[64] = '\0';
}
/*
 * Copies the first 10 bytes of src to dst and writes a terminating '\0' at
 * dst[10], so dst must have room for at least 11 bytes.
 *
 * __restrict__ qualifies the pointers themselves (it is written after the
 * '*'): dst and src must not overlap.
 */
__device__ void mycpyrandom(BYTE *__restrict__ dst, const BYTE *__restrict__ src) {
#pragma unroll 10
    for (int i = 0; i < 10; i++) {
        dst[i] = src[i];
    }
    dst[10] = '\0';
}
/*
 * Each thread generates a random string from (global index + baseSeed),
 * hashes it, and copies both results into the per-thread output buffers
 * d_random[idx] and d_hashes[idx].
 */
__global__ void kernel(BYTE **d_random, BYTE **d_hashes, unsigned long baseSeed) {
    BYTE randomStr[RANDOM_LEN];
    BYTE digest[DIGEST_LEN];
    SHA256_CTX shaCtx;

    const int slot = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned long threadSeed = (unsigned long)slot + baseSeed;

    /* Fill randomStr from the per-thread seed. */
    d_getRandomString(threadSeed, randomStr);

    /* Hash RANDOM_LEN bytes of randomStr into digest. */
    sha256_hash(&shaCtx, randomStr, digest, RANDOM_LEN);

    /* Publish the hash first, then the random string, to this thread's slot. */
    mycpydigest(d_hashes[slot], digest);
    mycpyrandom(d_random[slot], randomStr);
}
/*
 * pthread entry point that drives one GPU and writes its results out.
 *
 * vargp points to a heap-allocated HandlerInput naming the device to use;
 * this thread takes ownership of it and frees it. One managed buffer per
 * GPU thread is allocated for the random string and for the hash, then the
 * kernel is launched repeatedly and each batch is handed to print2File().
 * The loop ends when a CUDA call fails; all managed memory is then released.
 */
void *launchGPUHandlerThread(void *vargp) {
    HandlerInput *hi = (HandlerInput *)vargp;
    const int device = hi->device;
    /* main() allocates one HandlerInput per thread and never touches it again. */
    free(hi);

    if (cudaSetDevice(device) != cudaSuccess)
        return NULL;

    unsigned long rngSeed = timeus();
    /* BLOCKS and THREADS are macros, not members of HandlerInput. */
    const int threadBlocks = BLOCKS * THREADS;

    BYTE **randoms = NULL;
    BYTE **hashes = NULL;
    if (cudaMallocManaged(&randoms, sizeof(BYTE *) * (threadBlocks), cudaMemAttachGlobal) != cudaSuccess ||
        cudaMallocManaged(&hashes, sizeof(BYTE *) * (threadBlocks), cudaMemAttachGlobal) != cudaSuccess) {
        /* The pointer tables are dereferenced on the host below, so stop here. */
        cudaFree(randoms);
        cudaFree(hashes);
        cudaDeviceReset();
        return NULL;
    }

    /* `allocated` counts the slots whose two buffers both exist. */
    int allocated = 0;
    bool ok = true;
    for (; allocated < threadBlocks; allocated++) {
        randoms[allocated] = NULL;
        hashes[allocated] = NULL;
        if (cudaMallocManaged(&randoms[allocated], sizeof(BYTE) * (RANDOM_LEN), cudaMemAttachGlobal) != cudaSuccess ||
            cudaMallocManaged(&hashes[allocated], sizeof(BYTE) * (DIGEST_LEN), cudaMemAttachGlobal) != cudaSuccess) {
            cudaFree(randoms[allocated]);
            cudaFree(hashes[allocated]);
            ok = false;
            break;
        }
    }

    while (ok) {
        hostRandomGen(&rngSeed);
        kernel<<<BLOCKS, THREADS>>>(randoms, hashes, rngSeed);
        /* Launch errors and in-kernel faults both end the loop. */
        if (cudaGetLastError() != cudaSuccess || cudaDeviceSynchronize() != cudaSuccess)
            break;
        print2File(randoms, hashes, threadBlocks, device);
    }

    /* Free the per-slot buffers before the tables that point to them. */
    for (int i = 0; i < allocated; i++) {
        cudaFree(randoms[i]);
        cudaFree(hashes[i]);
    }
    cudaFree(hashes);
    cudaFree(randoms);
    cudaDeviceReset();
    return NULL;
}
/*
 * Starts one handler thread per CUDA device and waits for all of them.
 *
 * Returns 0 after the started threads have been joined, or 1 when no device
 * is available or the thread-id array cannot be allocated.
 */
int main() {
    int GPUS = 0;
    /* On failure GPUS would otherwise be used uninitialised below. */
    if (cudaGetDeviceCount(&GPUS) != cudaSuccess || GPUS <= 0)
        return 1;

    pthread_t *tids = (pthread_t *)malloc(sizeof(pthread_t) * GPUS);
    if (tids == NULL)
        return 1;

    int started = 0;
    for (int i = 0; i < GPUS; i++) {
        HandlerInput *hi = (HandlerInput *)malloc(sizeof(HandlerInput));
        if (hi == NULL)
            break;
        hi->device = i;
        if (pthread_create(tids + started, NULL, launchGPUHandlerThread, hi) != 0) {
            free(hi);
            break;
        }
        started++;
        usleep(23);
    }

    /* Only join threads that were actually created. */
    for (int i = 0; i < started; i++)
        pthread_join(tids[i], NULL);

    free(tids);
    return 0;
}
I want to thank #paleonix for the help in the comments. I was working on this issue for a week before I posted and your comments helped guide me down a different path.

Segmentation Fault with 3D array

I am trying to work with 3D arrays in CUDA (200x200x100).
The moment I change my z dimension (model_num) from 4 to 5, I get a segmentation fault. Why, and how can I fix it?
// Extents of the data set; main() sizes its buffers as
// nphicells * nrcells * model_num floats.
const int nrcells = 200;
const int nphicells = 200;
const int model_num = 5; //So far, 4 is the maximum model_num that works. At 5 and after, there is a segmentation fault
/*
 * Per-element kernel over the flattened grid.
 *
 * Expects the launch used by main(): blockDim.x = nphicells threads and a
 * grid of (nrcells, model_num) blocks, so that
 *   tx = threadIdx.x indexes the phi cell,
 *   ty = blockIdx.x  indexes the r cell,
 *   tz = blockIdx.y  indexes the model.
 * NOTE(review): tx/ty/tz were not declared in the original snippet; this
 * mapping is derived from the launch configuration - confirm it is intended.
 */
__global__ void kernel(float* mgridb)
{
    const int tx = threadIdx.x;
    const int ty = blockIdx.x;
    const int tz = blockIdx.y;
    // Widen before multiplying so the flat index cannot wrap in 32 bits.
    const unsigned long long int i = ((unsigned long long int)blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
    // The indices come from unsigned built-ins, so only upper bounds need checking.
    if(tx < nphicells && ty < nrcells && tz < model_num){
        //Do stuff with mgridb[i]
    }
}
/*
 * Allocates the grid on the host heap, zero-fills it, copies it to the
 * device, runs the kernel and copies the result back.
 *
 * The data lives only in the malloc'ed buffer h_mgridb: a
 * float[nphicells][nrcells][model_num] local array would take about 800 KB of
 * stack at model_num = 5 and overflow it.
 *
 * Returns 0 on success and 1 if an allocation or a CUDA call fails.
 */
int main (void)
{
    unsigned long long int size_matrices = (unsigned long long int)nphicells * nrcells * model_num;
    unsigned long long int mem_size_matrices = sizeof(float) * size_matrices;

    float *h_mgridb = (float *)malloc(mem_size_matrices);
    if (h_mgridb == NULL)
        return 1;

    // Initialise the buffer that is actually copied to the device.
    for (unsigned long long int idx = 0; idx < size_matrices; idx++) {
        h_mgridb[idx] = 0.0f;
    }

    int status = 0;
    float *d_mgridb = NULL;
    if (cudaMalloc((void**)&d_mgridb, mem_size_matrices) != cudaSuccess) {
        free(h_mgridb);
        return 1;
    }

    if (cudaMemcpy(d_mgridb, h_mgridb, mem_size_matrices, cudaMemcpyHostToDevice) != cudaSuccess)
        status = 1;

    if (status == 0) {
        int threads = nphicells;
        uint3 blocks = make_uint3(nrcells,model_num,1);
        kernel<<<blocks,threads>>>(d_mgridb);
        // Launch-configuration errors are only reported by cudaGetLastError().
        if (cudaGetLastError() != cudaSuccess)
            status = 1;
    }

    // This blocking copy also surfaces errors raised while the kernel ran.
    if (status == 0 &&
        cudaMemcpy(h_mgridb, d_mgridb, mem_size_matrices, cudaMemcpyDeviceToHost) != cudaSuccess)
        status = 1;

    cudaFree(d_mgridb);
    free(h_mgridb);
    return status;
}
This is getting stored on the stack:
float mgridb[nphicells][nrcells][model_num];
Your stack space is limited. When you exceed the amount you can store on the stack, you are getting a seg fault, either at the point of allocation, or as soon as you try and access it.
Use malloc instead. That allocates heap storage, which has much higher limits.
None of the above has anything to do with CUDA. Furthermore, it's not unique or specific to "3D" arrays. Any large stack-based allocation (e.g. a 1D array) is going to have the same trouble.
You may also have to adjust how you access the array, but it's not difficult to handle a flattened array using pointer indexing.
Your code is actually strange looking, because you are creating an appropriately sized array h_mgridb using malloc and then copying that array to the device (into d_mgridb). It's not clear what purpose mgridb serves in your code. h_mgridb and mgridb are not the same.

Cuda kernel producing the resultant vector as zero

Here is the kernel that I am launching for calculating some array in parallel.
/*
 * Returns true if the index lists Aj[0..rowsize) and Bi[0..colsize) share at
 * least one value, false otherwise (including when either list is empty).
 *
 * `val` is not used; it defaults to a null pointer so that the four-argument
 * call in the kernel compiles while five-argument callers keep working.
 */
__device__ bool mult(int colsize,int rowsize,int *Aj,int *Bi,int *val = 0)
{
    (void)val;  // intentionally unused
    for(int j = 0; j < rowsize; j++)
    {
        const int a = Aj[j];
        for(int k = 0; k < colsize; k++)
        {
            if(a == Bi[k])
            {
                return true;
            }
        }
    }
    return false;
}
/*
 * One thread per column (tid < cols). For its column the thread walks all
 * `rows` rows and increments Cjc[column + 1] once for every row for which
 * mult() reports a shared index between the row's slice of Aj and the
 * column's slice of Bi. Slice bounds come from Aptr and Bptr.
 */
__global__ void kernel(int *Aptr,int *Aj,int *Bptr,int *Bi,int rows,int cols,int *Cjc)
{
    const int column = blockIdx.x * blockDim.x + threadIdx.x;
    if(column >= cols)
        return;

    // Slice of Bi that belongs to this column.
    const int bFirst = Bptr[column];
    const int bCount = Bptr[column + 1] - bFirst;

    for(int row = 0; row < rows; ++row)
    {
        // Slice of Aj that belongs to this row.
        const int aFirst = Aptr[row];
        const int aCount = Aptr[row + 1] - aFirst;

        if(mult(bCount, aCount, Aj + aFirst, Bi + bFirst))
        {
            // Each thread owns its own Cjc slot, so a plain increment is used.
            Cjc[column + 1] += 1;
        }
    }
}
And here is how I decide the configuration of grid and blocks
// 32 threads per block; round the block count up so that all q columns are
// covered. For q that is a multiple of 32 this equals q / 32, so no separate
// branch is needed.
int numBlocks,numThreads;
numThreads = 32;
numBlocks = (q + numThreads - 1) / numThreads;
findkernel<<<numBlocks,numThreads>>>(devAptr,devAcol,devBjc,devBir,m,q,d_Cjc);
I am using GTX 480 with CC 2.0.
Now the problem that I am facing is that whenever q increases beyond 4096 the values in Cjc array are all produced as 0.
I know maximum number of blocks that I can use in X direction is 65535 and each block can have at most (1024,1024,64) threads. Then why does this kernel calculate the wrong output for Cjc array?
It seems like there are a couple of things wrong with the code you posted:
I guess findkernel is kernel in the CUDA code above?
kernel has 8 parameters, but you only use 7 parameters to call findkernel. This doesn't look right!
In kernel, you test if(tid < cols) - I guess this should be if(tid < count)??
Why does kernel expect count to be a pointer? I think you don't pass in an int pointer but a regular integer value to findkernel.
Why does __device__ bool mult get count/int *val if it is not used?
I guess #3 or #4 could be the source of your problem, but you should look at the other things as well.
OK so I finally figured out using cudaError_t that when I tried to cudaMemcpy the d_Cjc array from device to host, it throws following error.
CUDA error: the launch timed out and was terminated
It turns out that some of the calculations in findkernel are taking reasonably large amount of time which causes the display driver to terminate the program because of OS 'watchdog' time limit.
I believe I will have to shut down the X server, or ssh into my GPU machine from another machine with its display removed. This will buy me some time to do the calculations without exceeding the OS 'watchdog' limit.

Shared memory mutex with CUDA - adding to a list of items

My problem is the following: I have an image in which I detect some points of interest using the GPU. The detection is a heavyweight test in terms of processing, however only about 1 in 25 points pass the test on average. The final stage of the algorithm is to build up a list of the points. On the CPU this would be implemented as:
forall pixels x,y
{
if(test_this_pixel(x,y))
vector_of_coordinates.push_back(Vec2(x,y));
}
On the GPU I have each CUDA block processing 16x16 pixels. The problem is that I need to do something special to eventually have a single consolidated list of points in global memory. At the moment I am trying to generate a local list of points in shared memory per block which eventually will be written to global memory. I am trying to avoid sending anything back to the CPU because there are more CUDA stages after this.
I was expecting that I could use atomic operations to implement the push_back function on shared memory. However I am unable to get this working. There are two issues. The first annoying issue is that I am constantly running into the following compiler crash: "nvcc error : 'ptxas' died with status 0xC0000005 (ACCESS_VIOLATION)" when using atomic operations. It is hit or miss whether I can compile something. Does anyone know what causes this?
The following kernel will reproduce the error:
// Minimal reproducer for the ptxas crash: a single atomic operation on a
// shared-memory counter. The parameters are unused.
__global__ void gpu_kernel(int w, int h, RtmPoint *pPoints, int *pCounts)
{
// NOTE(review): `test` is never initialised before the atomic below.
__shared__ unsigned int test;
atomicInc(&test, 1000);
}
Secondly, my code, which includes a mutex lock on shared memory, hangs the GPU and I don't understand why:
// Spins until this thread changes *pmutex from 0 to 1 with atomicCAS.
__device__ void lock(unsigned int *pmutex)
{
    for (;;)
    {
        // atomicCAS returns the previous value; 0 means the swap to 1 succeeded.
        if (atomicCAS(pmutex, 0, 1) == 0)
            return;
    }
}
// Atomically stores 0 into *pmutex.
__device__ void unlock(unsigned int *pmutex)
{
    const unsigned int released = 0;
    atomicExch(pmutex, released);
}
/*
 * Collects the pixels of each block that pass some_test_on_pixel() into a
 * per-block list of at most 64 points.
 *
 * Output per block `blockid`:
 *   pCounts[blockid]                 number of points stored (0..64)
 *   pPoints[blockid * 64 + 0..count) the stored points, in no particular order
 *
 * List slots are reserved with atomicAdd on a shared counter instead of a
 * spin lock. A spin lock contended by threads of the same warp can hang,
 * because the lock holder may never be scheduled while the others spin.
 * Requires a device that supports atomics on shared memory.
 */
__global__ void gpu_kernel_non_max_suppress(int w, int h, RtmPoint *pPoints, int *pCounts)
{
    __shared__ RtmPoint localPoints[64];
    __shared__ int localCount;

    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    const int threadid = threadIdx.y * blockDim.x + threadIdx.x;
    const int blockid = blockIdx.y * gridDim.x + blockIdx.x;

    if(threadid==0)
    {
        localCount = 0;
    }
    __syncthreads();

    if(x<w && y<h)
    {
        if(some_test_on_pixel(x,y))
        {
            // Local push_back: atomicAdd hands every thread a distinct slot.
            const int slot = atomicAdd(&localCount, 1);
            if(slot<64) // points beyond the 64-entry capacity are dropped
            {
                RtmPoint point;
                point.x = x;
                point.y = y;
                localPoints[slot] = point;
            }
        }
    }
    __syncthreads();

    // localCount counts every attempt, so clamp it to the list capacity.
    const int stored = (localCount < 64) ? localCount : 64;
    if(threadid==0)
        pCounts[blockid] = stored;
    if(threadid<stored)
        pPoints[blockid * 64 + threadid] = localPoints[threadid];
}
In the example code at this site, the author manages to successfully use atomic operations on shared memory, so I am confused as to why my case does not function. If I comment out the lock and unlock lines, the code runs ok, but obviously incorrectly adding to the list.
I would appreciate some advice about why this problem is happening and also perhaps if there is a better solution to achieving the goal, since I am concerned anyway about the performance issues with using atomic operations or mutex locks.
I suggest using a prefix sum to implement that part to increase parallelism. To do that you need to use a shared array. Basically, a prefix sum will turn an array (1,1,0,1) into (0,1,2,2), with a total of 3, i.e., it will calculate an in-place running exclusive sum so that you'll get per-thread write indices.
__shared__ uint8_t vector[NUMTHREADS];
....
// A thread emits a point only if its pixel is inside the image and passes the test.
bool emit = (x<w && y<h);
emit = emit && some_test_on_pixel(x,y);
__syncthreads();
// Turn the per-thread flags into per-thread write positions in `vector`.
scan(emit, vector);
if (emit) {
// NOTE(review): `point` and `TID` are not defined in this fragment. If scan()
// produces an inclusive sum, the write index needs to be vector[TID] - 1 - confirm.
pPoints[blockid * 64 + vector[TID]] = point;
}
prefix-sum example:
// Block-wide running sum of `mark` over threads 0..numWorkers-1.
// Each thread passes its own `mark`; on return output[tid] holds the sum for
// thread tid, and the function returns output[numWorkers-1].
// Two buffers are used in turn: the caller's `output` and the shared `values`.
// NOTE(review): the loop adds element tid-offset to element tid without an
// initial shift, which looks like an inclusive scan - confirm against callers.
template <typename T>
__device__ uint32 scan(T mark, T *output) {
// pout / pin select which buffer is written / read in the current pass
// (1 = output, 0 = values).
#define GET_OUT (pout?output:values)
#define GET_INP (pin?output:values)
__shared__ T values[numWorkers];
int pout=0, pin=1;
int tid = threadIdx.x;
// The input starts out in `values`.
values[tid] = mark;
// NOTE(review): spelled syncthreads() here but __syncthreads() near the end -
// confirm that this name resolves to the block barrier.
syncthreads();
for( int offset=1; offset < numWorkers; offset *= 2) {
// Swap the roles of the two buffers for this pass.
pout = 1 - pout; pin = 1 - pout;
syncthreads();
if ( tid >= offset) {
GET_OUT[tid] = (GET_INP[tid-offset]) +( GET_INP[tid]);
}
else {
// Elements below `offset` are carried over unchanged.
GET_OUT[tid] = GET_INP[tid];
}
syncthreads();
}
// If the last pass wrote into `values`, copy the result into `output`.
if(!pout)
output[tid] =values[tid];
__syncthreads();
return output[numWorkers-1];
#undef GET_OUT
#undef GET_INP
}
Based on recommendations here, I include the code that I used in the end. It uses 16x16 pixel blocks. Note that I am now writing the data out in one global array without breaking it up. I used the global atomicAdd function to compute a base address for each set of results. Since this only gets called once per block, I did not find too much of a slow down, while I gained a lot more convenience by doing this. I'm also avoiding shared buffers for the input and output of prefix_sum. GlobalCount is set to zero prior to the kernel call.
#define BLOCK_THREADS 256
/*
 * Block-wide exclusive prefix sum over BLOCK_THREADS threads.
 *
 * Every thread passes its flat thread id and its own `data`; the return value
 * is the sum of `data` over all threads with a smaller id. The input is
 * stored shifted by one slot (slot 0 is set to 0 and the last thread's value
 * is not stored), then summed in log2(BLOCK_THREADS) passes that alternate
 * between the two halves of the shared buffer.
 */
__device__ int prefixsum(int threadid, int data)
{
    __shared__ int temp[BLOCK_THREADS*2];

    // Seed the first half with the inputs shifted right by one position.
    if(threadid == BLOCK_THREADS-1)
        temp[0] = 0;
    else
        temp[threadid+1] = data;
    __syncthreads();

    // `src` is the half read in the current pass, `dst` the half written.
    int src = 0;
    int dst = 1;
    for(int stride = 1; stride < BLOCK_THREADS; stride *= 2)
    {
        const int readBase = src * BLOCK_THREADS;
        const int writeBase = dst * BLOCK_THREADS;

        int value = temp[readBase + threadid];
        if(threadid >= stride)
            value += temp[readBase + threadid - stride];
        temp[writeBase + threadid] = value;
        __syncthreads();

        // The half just written becomes the source of the next pass.
        const int swapped = src;
        src = dst;
        dst = swapped;
    }
    // After the final swap `src` names the half written last.
    return temp[src * BLOCK_THREADS + threadid];
}
/*
 * Appends the pixels of this block that pass test_pixel() to the global list
 * pPoints. A block-wide prefix sum gives every passing thread its position
 * within the block; the last thread reserves room for the block's points (at
 * most 64) with one atomicAdd on *pGlobalCount and shares the base index
 * through shared memory.
 */
__global__ void gpu_kernel(int w, int h, RtmPoint *pPoints, int *pGlobalCount)
{
    __shared__ int write_base;

    const int px = blockIdx.x * blockDim.x + threadIdx.x;
    const int py = blockIdx.y * blockDim.y + threadIdx.y;
    const int flatThread = threadIdx.y * blockDim.x + threadIdx.x;

    // 1 when the pixel is inside the image and passes the test, else 0.
    const int valid = (px < w && py < h && test_pixel(px, py)) ? 1 : 0;

    // Number of valid threads with a smaller flat id.
    const int index = prefixsum(flatThread, valid);

    if(flatThread == BLOCK_THREADS-1)
    {
        // The last thread's prefix plus its own flag is the block total.
        int total = index + valid;
        if(total > 64)
            total = 64; // global output buffer is limited to 64 points per block
        write_base = atomicAdd(pGlobalCount, total); // get a location to write them out
    }
    __syncthreads(); // ensure write_base is valid for all threads

    if(valid && index < 64)
    {
        RtmPoint point;
        point.x = px;
        point.y = py;
        pPoints[write_base + index] = point;
    }
}

copy global memory by CUDA threads

I need to copy one array in global memory to another array in global memory by CUDA threads (not from the host).
My code is as follows:
/*
 * Each thread writes its own source element g_data1[idx] into every element
 * of g_data2 in the range [some_func(idx), another_func(idx)).
 * The parameter n is not used.
 */
__global__ void copy_kernel(int *g_data1, int *g_data2, int n)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int start = some_func(idx);
    const int end = another_func(idx);

    // The position counter is unsigned, as in the comparison against `end`.
    unsigned int pos = start;
    while (pos < end) {
        g_data2[pos] = g_data1[idx];
        pos++;
    }
}
It is very inefficient because for some idx, the [start, end] region is very large, which makes that thread issue too many copy commands. Is there any way to implement it efficiently?
Thank you,
Zheng
The way you wrote it, I am guessing each thread is trying to write the whole 'start' to 'end' chunk. Which is really really inefficient.
you need to do something like this.
// Cooperative version: every thread publishes its own [start, end) range in
// shared memory, then the whole block fills the ranges one after another, so
// a long range is split across all threads of the block.
// Assumes a 1D block with blockDim.x <= BLOCK_SIZE.
__shared__ unsigned sm_start[BLOCK_SIZE];
__shared__ unsigned sm_end[BLOCK_SIZE];
sm_start[threadIdx.x] = start;
sm_end[threadIdx.x] = end;
__syncthreads();
// Global index of thread 0 of this block; thread n's source element is
// g_data1[block_base + n].
const int block_base = blockIdx.x * blockDim.x;
for (int n = 0; n < blockDim.x; n++) {
    // Empty or reversed ranges copy nothing.
    if (sm_end[n] <= sm_start[n])
        continue;
    // Compute the destination from the base pointer for each range; do not
    // advance g_data2 itself, or the offsets would accumulate.
    int *dst = g_data2 + sm_start[n];
    const unsigned lim = sm_end[n] - sm_start[n];
    const int value = g_data1[block_base + n];
    for (unsigned i = threadIdx.x; i < lim; i += blockDim.x) {
        dst[i] = value;
    }
}
try using this:
CUresult cuMemcpyDtoD(
CUdeviceptr dst,
CUdeviceptr src,
unsigned int bytes
)
UPDATE:
You're right: http://forums.nvidia.com/index.php?showtopic=88745
There is no efficient way to do this properly because the design of CUDA wants you to use only small amount of data in the kernel.