Cuda kernel - possible optimizations - cuda

Here is the kernel that I am launching for calculating some array in parallel.
__device__ bool mult(int colsize,int rowsize,int *Aj,int *Bi)
{
for(int j = 0; j < rowsize;j++)
{
for(int k = 0;k < colsize;k++)
{
if(Aj[j] == Bi[k])
{
return true;
}
}
}
return false;
}
__global__ void kernel(int *Aptr,int *Aj,int *Bptr,int *Bi,int rows,int cols,int *Cjc)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int i;
if(tid < cols)
{
int beg = Bptr[tid];
int end = Bptr[tid+1];
for(i = 0;i < rows;i++)
{
int cbeg = Aptr[i];
int cend = Aptr[i+1];
if(mult(end - beg,cend - cbeg,Aj+cbeg,Bi+beg))
{
Cjc[tid+1] += 1;
//atomicAdd(Cjc+tid+1,1);
}
}
}
}
My launch configurations and kernel call are as follows.
int numBlocks,numThreads;
if(q % 32 == 0)
{
numBlocks = q/32;
numThreads = 32;
}
else
{
numBlocks = (q+31)/32;
numThreads = 32;
}
findkernel<<<numBlocks,numThreads>>>(devAptr,devAcol,devBjc,devBir,m,q,d_Cjc);
I have to admit, this kernel is running pretty slow.Once I get the array back to host side, I use thrust::inclusive_scan to find my resultant array.
My question is, is there any room for improvement / optimization for my kernel? I tried using shared memory but its producing either wrong answers or throwing runtime exceptions.
Also, how does the dynamically allocated shared memory ( which is allocated by third parameter in kernel launch ) is distributed among the blocks?
Any help/hints/insinuations will be appreciated.
Thanks in advance.

As for the shared memory allocated using kernel<<<blocks,threads,mem>>> mem is the amount of memory allocated each block. So each block gets mem amount of memory.
For your code, I don't understand why are there 2 for loops in the mult function. Just want to point out that each thread will be executing these 2 for loops. Moreover, as you also have a for loop in the kernel function, it means that each thread will be executing the 2 for loops in the mult function several times. THis is slow. Moreover, doing
int beg = Bptr[tid];
int end = Bptr[tid+1];
is not exactly coalesced access. Non coalesced access is slow.

Related

Does calling a CUDA kernel multiple times affect execution speed?

I am trying to measure the performance difference of a GPU between allocating memory using 'malloc' in a kernel function vs. using pre-allocated storage from 'cudaMalloc' on the host. To do this, I have two kernel functions, one that uses malloc, one that uses a pre-allocated array, and I time the execution of each function repeatedly.
The problem is that the first execution of each kernel function takes between 400 - 2500 microseconds, but all subsequent runs take about 15 - 30 microseconds.
Is this behavior expected, or am I witnessing some sort of carryover effect from previous runs? If this is carryover, what can I do to prevent it?
I have tried putting in a kernel function that zeros out all memory on the GPU between each timed test run to eliminate that carryover, but nothing changed. I have also tried reversing the order in which I run the tests, and that has no effect on relative or absolute execution times.
const int TEST_SIZE = 1000;
struct node {
node* next;
int data;
};
int main() {
int numTests = 5;
for (int i = 0; i < numTests; ++i) {
memClear();
staticTest();
memClear();
dynamicTest();
}
return 0;
}
__global__ void staticMalloc(int* sum) {
// start a linked list
node head[TEST_SIZE];
// initialize nodes
for (int j = 0; j < TEST_SIZE; j++) {
// allocate the node & assign values
head[j].next = NULL;
head[j].data = j;
}
// verify creation by adding up values
int total = 0;
for (int j = 0; j < TEST_SIZE; j++) {
total += head[j].data;
}
sum[0] = total;
}
/**
* This is a test that will time execution of static allocation
*/
int staticTest() {
int expectedValue = 0;
for (int i = 0; i < TEST_SIZE; ++i) {
expectedValue += i;
}
// host output vector
int* h_sum = new int[1];
h_sum[0] = -1;
// device output vector
int* d_sum;
// vector size
size_t bytes = sizeof(int);
// allocate memory on device
cudaMalloc(&d_sum, bytes);
// only use 1 CUDA thread
dim3 blocksize(1, 1, 1), gridsize(1, 1, 1);
Timer runTimer;
int runTime = 0;
// check dynamic allocation time
runTime = 0;
runTimer.start();
staticMalloc<<<gridsize, blocksize>>>(d_sum);
runTime += runTimer.lap();
h_sum[0] = 0;
cudaMemcpy(h_sum, d_sum, bytes, cudaMemcpyDeviceToHost);
cudaFree(d_sum);
delete (h_sum);
return 0;
}
__global__ void dynamicMalloc(int* sum) {
// start a linked list
node* headPtr = (node*) malloc(sizeof(node));
headPtr->data = 0;
headPtr->next = NULL;
node* curPtr = headPtr;
// add nodes to test cudaMalloc in device
for (int j = 1; j < TEST_SIZE; j++) {
// allocate the node & assign values
node* nodePtr = (node*) malloc(sizeof(node));
nodePtr->data = j;
nodePtr->next = NULL;
// add it to the linked list
curPtr->next = nodePtr;
curPtr = nodePtr;
}
// verify creation by adding up values
curPtr = headPtr;
int total = 0;
while (curPtr != NULL) {
// add and increment current value
total += curPtr->data;
curPtr = curPtr->next;
// clean up memory
free(headPtr);
headPtr = curPtr;
}
sum[0] = total;
}
/**
* Host function that prepares data array and passes it to the CUDA kernel.
*/
int dynamicTest() {
// host output vector
int* h_sum = new int[1];
h_sum[0] = -1;
// device output vector
int* d_sum;
// vector size
size_t bytes = sizeof(int);
// allocate memory on device
cudaMalloc(&d_sum, bytes);
// only use 1 CUDA thread
dim3 blocksize(1, 1, 1), gridsize(1, 1, 1);
Timer runTimer;
int runTime = 0;
// check dynamic allocation time
runTime = 0;
runTimer.start();
dynamicMalloc<<<gridsize, blocksize>>>(d_sum);
runTime += runTimer.lap();
h_sum[0] = 0;
cudaMemcpy(h_sum, d_sum, bytes, cudaMemcpyDeviceToHost);
cudaFree(d_sum);
delete (h_sum);
return 0;
}
__global__ void clearMemory(char *zeros) {
int i = threadIdx.x + blockDim.x * blockIdx.x;
zeros[i] = 0;
}
void memClear() {
char *zeros[1024]; // device pointers
for (int i = 0; i < 1024; ++i) {
cudaMalloc((void**) &(zeros[i]), 4 * 1024 * 1024);
clearMemory<<<1024, 4 * 1024>>>(zeros[i]);
}
for (int i = 0; i < 1024; ++i) {
cudaFree(zeros[i]);
}
}
The first execution of a kernel takes more time because you have to load a lots of stuff on GPU (kernel, lib etc...). To prove it, you can just measure how long it takes to launch an empty kernel and you will see that it's take some times. Try like:
time -> start
launch emptykernel
time -> end
firstTiming = end - start
time -> start
launch empty kernel
time -> end
secondTiming = end - start
You will see that the secondTiming is significantly smaller thant the firstTiming.
The first CUDA (kernel) call initializes the CUDA system transparently. You can avoid this by calling an empty kernel first. Note that this is required in e.g. OpenCL, but there you have to do all that init-stuff manually. CUDA does it for you in the background.
Then some problems with your timing: CUDA kernel calls are asynchronous. So (assuming your Timer class is a host timer like time()) currently you measure the kernel launch time (and for the first call the init-time of CUDA) not the kernel execution time.
At the very least you HAVE to do a cudaDeviceSynchronize() before starting AND stopping the timer.
You are better of using CUDA events which can exactly measure the kernel execution time and only that. Using host-timers you still include the launch-overhead. See https://devblogs.nvidia.com/parallelforall/how-implement-performance-metrics-cuda-cc/

CUDA kernels not launching before CudaDeviceSynchronize

I am having some trouble with concurrent CUDA. Take a look at the attached image. The kernel is launched at the marked point, at 0.395 seconds. Then there is some green CpuWork. Finally, there is a call to cudaDeviceSynchronize. The kernels that is launched before CpuWork doesnt start before the synchronize call. Ideally, it should run in parallel with the CPU work.
void KdTreeGpu::traceRaysOnGpuAsync(int firstRayIndex, int numRays, int rank, int buffer)
{
int per_block = 128;
int num_blocks = numRays/per_block + (numRays%per_block==0?0:1);
Ray* rays = &this->deviceRayPtr[firstRayIndex];
int* outputHitPanelIds = &this->deviceHitPanelIdPtr[firstRayIndex];
kdTreeTraversal<<<num_blocks, per_block, 0>>>(sceneBoundingBox, rays, deviceNodesPtr, deviceTrianglesListPtr,
firstRayIndex, numRays, rank, rootNodeIndex,
deviceTHitPtr, outputHitPanelIds, deviceReflectionPtr);
CUDA_VALIDATE(cudaMemcpyAsync(resultHitDistances[buffer], deviceTHitPtr, numRays*sizeof(double), cudaMemcpyDeviceToHost));
CUDA_VALIDATE(cudaMemcpyAsync(resultHitPanelIds[buffer], outputHitPanelIds, numRays*sizeof(int), cudaMemcpyDeviceToHost));
CUDA_VALIDATE(cudaMemcpyAsync(resultReflections[buffer], deviceReflectionPtr, numRays*sizeof(Vector3), cudaMemcpyDeviceToHost));
}
The memcopies are async. The result buffers are allocated like this
unsigned int flag = cudaHostAllocPortable;
CUDA_VALIDATE(cudaHostAlloc(&resultHitPanelIds[0], MAX_RAYS_PER_ITERATION*sizeof(int), flag));
CUDA_VALIDATE(cudaHostAlloc(&resultHitPanelIds[1], MAX_RAYS_PER_ITERATION*sizeof(int), flag));
Hoping for a solution for this. Have tried many things, including not running in the default stream. When i added cudaHostAlloc i recognized that the async method returned back to the CPU. But that doesnt help when the kernel does not launch before the deviceSynchronize call later.
resultHitDistances[2] contains two allocated memory areas so that when 0 is read by the CPU, the GPU should put the result in 1.
Thanks!
Edit: This is the code that calls traceRaysAsync.
int numIterations = ceil(float(this->numPrimaryRays) / MAX_RAYS_PER_ITERATION);
int numRaysPrevious = min(MAX_RAYS_PER_ITERATION, this->numPrimaryRays);
nvtxRangePushA("traceRaysOnGpuAsync First");
traceRaysOnGpuAsync(0, numRaysPrevious, rank, 0);
nvtxRangePop();
for(int iteration = 0; iteration < numIterations; iteration++)
{
int rayFrom = (iteration+1)*MAX_RAYS_PER_ITERATION;
int rayTo = min((iteration+2)*MAX_RAYS_PER_ITERATION, this->numPrimaryRays) - 1;
int numRaysIteration = rayTo-rayFrom+1;
// Wait for results to finish and get them
waitForGpu();
// Trace the next iteration asynchronously. This will have data prepared for next iteration
if(numRaysIteration > 0)
{
int nextBuffer = (iteration+1) % 2;
nvtxRangePushA("traceRaysOnGpuAsync Interior");
traceRaysOnGpuAsync(rayFrom, numRaysIteration, rank, nextBuffer);
nvtxRangePop();
}
nvtxRangePushA("CpuWork");
// Store results for current iteration
int rayOffset = iteration*MAX_RAYS_PER_ITERATION;
int buffer = iteration % 2;
for(int i = 0; i < numRaysPrevious; i++)
{
if(this->activeRays[rayOffset+i] && resultHitPanelIds[buffer][i] >= 0)
{
this->activeRays[rayOffset+i] = false;
const TrianglePanelPair & t = this->getTriangle(resultHitPanelIds[buffer][i]);
double hitT = resultHitDistances[buffer][i];
Vector3 reflectedDirection = resultReflections[buffer][i];
Result res = Result(rays[rayOffset+i], hitT, t.panel);
results[rank].push_back(res);
t.panel->incrementIntensity(1.0);
if (t.panel->getParent().absorbtion < 1)
{
numberOfRaysGenerated++;
Ray reflected (res.endPoint() + 0.00001*reflectedDirection, reflectedDirection);
this->newRays[rayOffset+i] = reflected;
this->activeRays[rayOffset+i] = true;
numNewRays++;
}
}
}
numRaysPrevious = numRaysIteration;
nvtxRangePop();
}
This is the expected behavior on Windows with the WDDM driver model, where the driver tries to mitigate the kernel launch overhead by trying to batch kernel launches. Try inserting cudaStreamQuery(0) straight after the kernel invocation to trigger early launching of the kernel before the batch is full.

Cuda kernel producing the resultant vector as zero

Here is the kernel that I am launching for calculating some array in parallel.
__device__ bool mult(int colsize,int rowsize,int *Aj,int *Bi,int *val)
{
for(int j = 0; j < rowsize;j++)
{
for(int k = 0;k < colsize;k++)
{
if(Aj[j] == Bi[k])
{
return true;
}
}
}
return false;
}
__global__ void kernel(int *Aptr,int *Aj,int *Bptr,int *Bi,int rows,int cols,int *Cjc)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int i;
if(tid < cols)
{
int beg = Bptr[tid];
int end = Bptr[tid+1];
for(i = 0;i < rows;i++)
{
int cbeg = Aptr[i];
int cend = Aptr[i+1];
if(mult(end - beg,cend - cbeg,Aj+cbeg,Bi+beg))
{
Cjc[tid+1] += 1;
//atomicAdd(Cjc+tid+1,1);
}
}
}
}
And here is how I decide the configuration of grid and blocks
int numBlocks,numThreads;
if(q % 32 == 0)
{
numBlocks = q/32;
numThreads = 32;
}
else
{
numBlocks = (q+31)/32;
numThreads = 32;
}
findkernel<<<numBlocks,numThreads>>>(devAptr,devAcol,devBjc,devBir,m,q,d_Cjc);
I am using GTX 480 with CC 2.0.
Now the problem that I am facing is that whenever q increases beyond 4096 the values in Cjc array are all produced as 0.
I know maximum number of blocks that I can use in X direction is 65535 and each block can have at most (1024,1024,64) threads. Then why does this kernel calculate the wrong output for Cjc array?
I seems like there are a couple of things wrong with the code you posted:
I guess findkernel is kernel in the CUDA code above?
kernel has 8 parameters, but you only use 7 parameters to call findkernel. This doesn't look right!
In kernel, you test if(tid < cols) - I guess this should be if(tid < count)??
Why does kernel expect count to be a pointer? I think you don't pass in an int pointer but a regular integer value to findkernel.
Why does __device__ bool mult get count/int *val if it is not used?
I guess #3 or #4 could be the source of your problem, but you should look at the other things as well.
OK so I finally figured out using cudaError_t that when I tried to cudaMemcpy the d_Cjc array from device to host, it throws following error.
CUDA error: the launch timed out and was terminated
It turns out that some of the calculations in findkernel are taking reasonably large amount of time which causes the display driver to terminate the program because of OS 'watchdog' time limit.
I believe I will have to shut down X server or ssh my gpu machine (from another machine) by removing its display.This will buy me some time to do the calculations that will not exceed the 'watchdog' limit of OS.

global vs shared memory in CUDA

I have two CUDA kernels that compute similar stuff. One is using global memory (myfun is a device function that reads a lot from global memory and do the computation). The second kernel transfers that chunk of data from global memory to shared memory so the data can be shared among different threads of a block. My kernel that uses global memory is much faster than the one with shared memory. What are the possible reasons?
loadArray just copies a small part of d_x to m.
__global__ void mykernel(float *d_x, float *d_y, int *d_z, float *d_u, int N, int K, int D)
{
int tid = blockIdx.x*blockDim.x + threadIdx.x;
int index = 0;
float max_s = 1e+37F;
if (tid < N)
{
for (int i = 0; i < K; i++)
{
float s = myfun(&d_x[i*D], d_y, tid);
if (s > max_s)
{
max_s = s;
index = i;
}
}
d_z[tid] = index;
d_u[tid] = max_s;
}
}
Using shared memory:
__global__ void mykernel(float *d_x, float *d_y, int *d_z, float *d_u, int N, int K)
{
int tid = blockIdx.x*blockDim.x + threadIdx.x;
int index = 0;
float max_s = 1e+37F;
extern __shared__ float m[];
if( threadIdx.x == 0 )
loadArray( m, d_x );
__syncthreads();
if (tid < N)
{
for (int i = 0; i < K; i++)
{
float s = myfun(m, d_y, tid);
if (s > max_s)
{
max_s = s;
index = i;
}
}
d_z[tid] = index;
d_u[tid] = max_s;
}
}
The problem is that only the first thread in each block is reading from global memory into shared memory, this is much slower than letting all threads read from global memory simultaneously.
Using shared memory is an advantage when a single thread needs to access neighbouring elements from global memory - but this doesn't appear to be the case here.
IMO, if you have parallel nsight installed on say a Windows machine and conduct a trace on the executions, you might have more insights. Alternatively, run the cudaprof through your app to try to figure out where are the possible latencies.

CUDA memory troubles

I have a CUDA kernel which I'm compiling to a cubin file without any special flags:
nvcc text.cu -cubin
It compiles, though with this message:
Advisory: Cannot tell what pointer points to, assuming global memory space
and a reference to a line in some temporary cpp file. I can get this to work by commenting out some seemingly arbitrary code which makes no sense to me.
The kernel is as follows:
__global__ void string_search(char** texts, int* lengths, char* symbol, int* matches, int symbolLength)
{
int localMatches = 0;
int blockId = blockIdx.x + blockIdx.y * gridDim.x;
int threadId = threadIdx.x + threadIdx.y * blockDim.x;
int blockThreads = blockDim.x * blockDim.y;
__shared__ int localMatchCounts[32];
bool breaking = false;
for(int i = 0; i < (lengths[blockId] - (symbolLength - 1)); i += blockThreads)
{
if(texts[blockId][i] == symbol[0])
{
for(int j = 1; j < symbolLength; j++)
{
if(texts[blockId][i + j] != symbol[j])
{
breaking = true;
break;
}
}
if (breaking) continue;
localMatches++;
}
}
localMatchCounts[threadId] = localMatches;
__syncthreads();
if(threadId == 0)
{
int sum = 0;
for(int i = 0; i < 32; i++)
{
sum += localMatchCounts[i];
}
matches[blockId] = sum;
}
}
If I replace the line
localMatchCounts[threadId] = localMatches;
after the first for loop with this line
localMatchCounts[threadId] = 5;
it compiles with no notices. This can also be achieved by commenting out seemingly random parts of the loop above the line. I have also tried replacing the local memory array with a normal array to no effect. Can anyone tell me what the problem is?
The system is Vista 64bit, for what its worth.
Edit: I fixed the code so it actually works, though it still produces the compiler notice. It does not seem as though the warning is a problem, at least with regards to correctness (it might affect performance).
Arrays of pointers like char** are problematic in kernels, since the kernels have no access to the host's memory.
It is better to allocate a single continuous buffer and to divide it in a manner that enables parallel access.
In this case I'd define a 1D array which contains all the strings positioned one after another and another 1D array, sized 2*numberOfStrings which contains the offset of each string within the first array and it's length:
For example - preparation for kernel:
char* buffer = st[0] + st[1] + st[2] + ....;
int* metadata = new int[numberOfStrings * 2];
int lastpos = 0;
for (int cnt = 0; cnt < 2* numberOfStrings; cnt+=2)
{
metadata[cnt] = lastpos;
lastpos += length(st[cnt]);
metadata[cnt] = length(st[cnt]);
}
In kernel:
currentIndex = threadId + blockId * numberOfBlocks;
char* currentString = buffer + metadata[2 * currentIndex];
int currentStringLength = metadata[2 * currentIndex + 1];
The problem seems to be associated with the char** parameter. Turning this into a char* solved the warning, so I suspect that cuda might have problems with this form of data. Perhaps cuda prefers that one uses the specific cuda 2D arrays in this case.