cuda reach device function from global

I am trying to call a device function from a global function. The device function only declares an array that is meant to be used by all threads. But my problem is that when I print the array, its elements are not in the order I declared them. Is this because all threads are creating the array again? I am confused about threads. If so, can I find out which thread runs first in the global function, and allow only that one to declare the array for the others? Thanks.
Here is my function that creates the array:
__device__ float myArray[20][20];

__device__ void calculation(int no)
{
    // filterWidth, filterHeight, sum and filterFactor are not declared in this
    // snippet; they are presumably __device__ variables defined elsewhere.
    filterWidth  = 3 + (2 * no);
    filterHeight = 3 + (2 * no);
    int arraySize  = filterWidth;
    int middle     = (arraySize - 1) / 2;
    int startIndex = middle;
    int stopIndex  = middle;

    // at first, all values of the array are 0
    for (int i = 0; i < arraySize; i++)
        for (int j = 0; j < arraySize; j++)
        {
            myArray[i][j] = 0;
        }

    // until the middle line of the array, the required indexes are 1
    for (int i = 0; i < middle; i++)
    {
        for (int j = startIndex; j <= stopIndex; j++)
        { myArray[i][j] = 1; sum += 1; }
        startIndex -= 1;
        stopIndex  += 1;
    }

    // for the middle line
    for (int i = 0; i < arraySize; i++)
    { myArray[middle][i] = 1; sum += 1; }

    // after the middle line of the array, the required indexes are 1
    startIndex += 1;
    stopIndex  -= 1;
    for (int i = (middle + 1); i < arraySize; i++)
    {
        for (int j = startIndex; j <= stopIndex; j++)
        { myArray[i][j] = 1; sum += 1; }
        startIndex += 1;
        stopIndex  -= 1;
    }

    filterFactor = 1.0f / sum;
}
And the global function:
__global__ void FilterKernel(Format24bppRgb* imageData)
{
    int tidX = threadIdx.x + blockIdx.x * blockDim.x;
    int tidY = threadIdx.y + blockIdx.y * blockDim.y;

    Colour Cpixel = Colour(imageData[tidX + tidY * imageWidth]);
    float depthPixel = Colour(depthData[tidX + tidY * imageWidth]).Red;
    float absoluteDistanceFromFocus = fabs(depthPixel - focusDepth);

    if (depthPixel == 0)
        return;

    Colour Cresult = Cpixel;

    for (int i = 0; i < 8; i++)
    {
        calculation(i);
        ...
        ...
    }
}

If you really want to select one thread to call the function and force the rest to wait for it, put the array filled by the device function in __shared__ memory so that all threads in a block see the same copy, and call it with:
for (int i = 0; i < 8; i++)
{
    if (threadIdx.x == 0 && threadIdx.y == 0)
        calculation(i);
    __syncthreads();
    ...
}
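For reference, here is a sketch (untested, reusing the question's types and names) of where the __shared__ array would live; calculation() would have to fill an array passed to it instead of the global __device__ one:
__device__ void calculation(float myArray[20][20], int no); // fills the passed array

__global__ void FilterKernel(Format24bppRgb* imageData)
{
    __shared__ float myArray[20][20]; // one copy per block, visible to all of its threads
    // ... per-pixel setup as in the question ...
    for (int i = 0; i < 8; i++)
    {
        if (threadIdx.x == 0 && threadIdx.y == 0)
            calculation(myArray, i); // only one thread fills the filter
        __syncthreads();             // the rest wait until it is ready
        // ... use myArray ...
    }
}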
Of course, this won't work between blocks - in a __global__ function you have no control over the order in which blocks are executed.
Instead, if you can, you should do the initialization calculation (which only one thread needs to do) on the CPU and memcpy the result to the GPU before launching your kernel. It looks like you'll use 8x the memory for your myArrays, but it will dramatically speed up your computation.
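As a rough sketch of that idea (illustrative names, not code from the question: d_filters, d_filterFactors and uploadFilters are assumptions, and the filling loop reproduces the diamond pattern of calculation() on the host):
#include <cuda_runtime.h>

// All 8 filters are built once on the CPU and copied to constant memory,
// so no thread ever has to rebuild myArray inside the kernel.
__constant__ float d_filters[8][20][20];
__constant__ float d_filterFactors[8];

void uploadFilters()
{
    float h_filters[8][20][20] = {}; // zero-initialised host copy
    float h_factors[8];
    for (int no = 0; no < 8; no++) {
        int arraySize = 3 + 2 * no;  // same sizing rule as calculation()
        int middle = (arraySize - 1) / 2;
        int sum = 0;
        for (int i = 0; i < arraySize; i++) {
            int span = (i <= middle) ? i : (arraySize - 1 - i); // diamond shape
            for (int j = middle - span; j <= middle + span; j++) {
                h_filters[no][i][j] = 1.0f;
                sum++;
            }
        }
        h_factors[no] = 1.0f / sum;
    }
    // one copy before the kernel launch; every thread then just reads it
    cudaMemcpyToSymbol(d_filters, h_filters, sizeof(h_filters));
    cudaMemcpyToSymbol(d_filterFactors, h_factors, sizeof(h_factors));
}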

Related

System get stuck on running matrix multiplication using CUDA

When I run this code on my system, after a few seconds the system gets stuck and I have to restart it. So my question is: what am I doing wrong here? Any suggestions will be appreciated.
__global__ void matMul(float* d_M, float* d_N, float* d_P, int width)
{
    int row = blockIdx.y * width + threadIdx.y;
    int col = blockIdx.x * width + threadIdx.x;
    if (row < width && col < width) {
        float product_val = 0;
        for (int k = 0; k < width; k++) {
            product_val += d_M[row * width + k] * d_N[k * width + col];
        }
        d_P[row * width + col] = product_val;
    }
}

int main()
{
    const int n = 9;
    float* d_M;
    float* d_N;
    float* d_P;
    // SIZE is not defined in this snippet; presumably a macro defined elsewhere
    cudaMallocManaged(&d_M, SIZE * sizeof(float));
    cudaMallocManaged(&d_N, SIZE * sizeof(float));
    cudaMallocManaged(&d_P, SIZE * sizeof(float));
    for (int i = 0; i < n; ++i) {
        d_P[i] = 0;
    }
    int count = 0;
    for (int i = 0; i < n; ++i) {
        d_N[i] = ++count;
    }
    count = 0;
    for (int i = 0; i < n; ++i) {
        d_M[i] = ++count;
    }
    matMul<<<1, n>>>(d_M, d_N, d_P, 3);
    cudaDeviceSynchronize();
    for (int i = 0; i < n; ++i) {
        printf("%f\n", d_P[i]);
    }
    cudaFree(d_N);
    cudaFree(d_M);
    cudaFree(d_P);
    return 0;
}
Assuming that by your system getting stuck you mean you get some kind of error in your program, it's likely that you're accessing memory that is invalid.
This could happen at the higher indexes of your d_M and d_N iterations, when row*width + k indexes beyond the size of the memory you allocated with cudaMallocManaged.
It's always good practice in situations like these to add some error handling using calls such as cudaPeekAtLastError().
This link might be helpful for implementing some debugging.
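For illustration (not part of the original answer), a minimal error-checking pattern looks like this; gpuErrchk is just an example name:
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Wraps a CUDA runtime call and aborts with the error string on failure.
#define gpuErrchk(call)                                               \
    do {                                                              \
        cudaError_t err = (call);                                     \
        if (err != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error %s at %s:%d\n",               \
                    cudaGetErrorString(err), __FILE__, __LINE__);     \
            exit(1);                                                  \
        }                                                             \
    } while (0)

// usage:
//   gpuErrchk(cudaMallocManaged(&d_M, SIZE * sizeof(float)));
//   matMul<<<1, n>>>(d_M, d_N, d_P, 3);
//   gpuErrchk(cudaPeekAtLastError());   // catches launch/configuration errors
//   gpuErrchk(cudaDeviceSynchronize()); // catches errors during kernel execution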

Finding triplets in CUDA kernel

I have p.ntp test particles, and every i-th particle has Cartesian coordinates tp.rh[i].x, tp.rh[i].y, tp.rh[i].z. Within this set I need to find CLUSTERS. That means I am looking for particles closer to the i-th particle than hill2 (tp.D_rel < hill2). The number of such members is stored in N_conv.
I use the cycle for (int i = 0; i < p.ntp; i++), which goes through the data set. For each i-th particle I calculate the squared distances tp.D_rel[idx] relative to the other members of the set. Then I use the first thread (idx == 0) to count the cases that satisfy my condition. At the end, if there is more than one positive case (N_conv > 1), I need to write out all the particles that form a possible cluster together (triplets, ...).
My code works well only in cases where i < blockDim.x. Why? Is there a general way to find clusters in a set of data, but write out only triplets and more?
Note: I know that some cases will be found twice.
__global__ void check_conv_system(double t, struct s_tp tp, struct s_mp mp, struct s_param p, double *time_step)
{
    const uint bid = blockIdx.y * gridDim.x + blockIdx.x;
    const uint tid = threadIdx.x;
    const uint idx = bid * blockDim.x + tid;

    double hill2 = 1.0e+6;

    __shared__ double D[200];
    __shared__ int ID1[200];
    __shared__ int ID2[200];

    if (idx >= p.ntp) return;

    int N_conv;

    for (int i = 0; i < p.ntp; i++)
    {
        tp.D_rel[idx] = (double)((tp.rh[i].x - tp.rh[idx].x)*(tp.rh[i].x - tp.rh[idx].x) +
                                 (tp.rh[i].y - tp.rh[idx].y)*(tp.rh[i].y - tp.rh[idx].y) +
                                 (tp.rh[i].z - tp.rh[idx].z)*(tp.rh[i].z - tp.rh[idx].z));
        __syncthreads();

        N_conv = 0;

        if (idx == 0)
        {
            for (int n = 0; n < p.ntp; n++) {
                if ((tp.D_rel[n] < hill2) && (i != n)) {
                    N_conv = N_conv + 1;
                    D[N_conv] = tp.D_rel[n];
                    ID1[N_conv] = i;
                    ID2[N_conv] = n;
                }
            }
            if (N_conv > 0) {
                for (int k = 1; k < N_conv; k++) {
                    printf("%lf %lf %d %d \n", t/365.2422, D[k], ID1[k], ID2[k]);
                }
            }
        } // end idx == 0
    } // end for cycle for i
}
As RobertCrovella mentioned, without an MCVE it is hard to tell.
However, the tp.D_rel array is written to with the idx index and read back after a __syncthreads() with the full-range index n. Note that __syncthreads() only synchronizes within a block, not across the whole device. As a result, some threads/blocks will access data that has not been calculated yet, hence the failure.
You want to review your code so that values computed by one block do not depend on values computed by another.
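A common way to get the missing device-wide ordering (a sketch with illustrative names, not code from the question) is to split the work into two kernels; the second launch on the same stream does not start until every block of the first has finished writing:
// Kernel 1: every thread writes its own squared distance to the i-th particle.
__global__ void compute_distances(const double3* rh, double* D_rel, int ntp, int i)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= ntp) return;
    double dx = rh[i].x - rh[idx].x;
    double dy = rh[i].y - rh[idx].y;
    double dz = rh[i].z - rh[idx].z;
    D_rel[idx] = dx * dx + dy * dy + dz * dz;
}

// Kernel 2: a single thread scans the full array; all of D_rel is valid here
// because the previous kernel has completely finished.
__global__ void count_neighbours(const double* D_rel, int ntp, int i,
                                 double hill2, int* N_conv)
{
    if (blockIdx.x == 0 && threadIdx.x == 0) {
        int count = 0;
        for (int n = 0; n < ntp; n++)
            if (n != i && D_rel[n] < hill2) count++;
        *N_conv = count;
    }
}

// host side, for each i:
//   compute_distances<<<grid, block>>>(d_rh, d_D_rel, ntp, i);
//   count_neighbours<<<1, 1>>>(d_D_rel, ntp, i, hill2, d_N_conv);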

CUDA Reduction minimum value and index

I implemented a minimum reduction using CUDA 8 by following this great explanation and modifying it:
__inline__ __device__ int warpReduceMin(int val)
{
    for (int offset = warpSize / 2; offset > 0; offset /= 2)
    {
        int tmpVal = __shfl_down(val, offset);
        if (tmpVal < val)
        {
            val = tmpVal;
        }
    }
    return val;
}

__inline__ __device__ int blockReduceMin(int val)
{
    static __shared__ int shared[32]; // Shared mem for 32 partial mins
    int lane = threadIdx.x % warpSize;
    int wid = threadIdx.x / warpSize;

    val = warpReduceMin(val); // Each warp performs partial reduction

    if (lane == 0)
    {
        shared[wid] = val; // Write reduced value to shared memory
    }
    __syncthreads(); // Wait for all partial reductions

    // read from shared memory only if that warp existed
    val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : INT_MAX;

    if (wid == 0)
    {
        val = warpReduceMin(val); // Final reduce within first warp
    }
    return val;
}

__global__ void deviceReduceBlockAtomicKernel(int *in, int* out, int N)
{
    int minVal = INT_MAX;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x;
         i < N;
         i += blockDim.x * gridDim.x)
    {
        minVal = min(minVal, in[i]);
    }
    minVal = blockReduceMin(minVal);
    if (threadIdx.x == 0)
    {
        atomicMin(out, minVal);
    }
}
and it works great and I'm getting the minimum value. However, I don't care about the minimum value, only about its index in the original input array.
I tried modifying my code a bit
__inline__ __device__ int warpReduceMin(int val, int* idx) // Adding output idx
{
    for (int offset = warpSize / 2; offset > 0; offset /= 2)
    {
        int tmpVal = __shfl_down(val, offset);
        if (tmpVal < val)
        {
            *idx = blockIdx.x * blockDim.x + threadIdx.x + offset; // I guess I'm missing something here
            val = tmpVal;
        }
    }
    return val;
}

...
blockReduceMin stayed the same, only adding idx to the function calls
...

__global__ void deviceReduceBlockAtomicKernel(int *in, int* out, int N)
{
    int minVal = INT_MAX;
    int minIdx = 0; // Added this
    for (int i = blockIdx.x * blockDim.x + threadIdx.x;
         i < N;
         i += blockDim.x * gridDim.x)
    {
        if (in[i] < minVal)
        {
            minVal = in[i];
            minIdx = i; // Added this
        }
    }
    minVal = blockReduceMin(minVal, &minIdx);
    if (threadIdx.x == 0)
    {
        int old = atomicMin(out, minVal);
        if (old != minVal) // value was updated
        {
            atomicExch(out + 1, minIdx);
        }
    }
}
But it doesn't work. I feel that I'm missing something important and that this is not the way to go about it, but my search turned up no results.
There are several problems here. You need to modify both the warp and block minimum functions to propagate both the minimum value and its index every time a new local minimum is found. Perhaps something like this:
__inline__ __device__ void warpReduceMin(int& val, int& idx)
{
    for (int offset = warpSize / 2; offset > 0; offset /= 2) {
        int tmpVal = __shfl_down(val, offset);
        int tmpIdx = __shfl_down(idx, offset);
        if (tmpVal < val) {
            val = tmpVal;
            idx = tmpIdx;
        }
    }
}

__inline__ __device__ void blockReduceMin(int& val, int& idx)
{
    static __shared__ int values[32], indices[32]; // Shared mem for 32 partial mins
    int lane = threadIdx.x % warpSize;
    int wid = threadIdx.x / warpSize;

    warpReduceMin(val, idx); // Each warp performs partial reduction

    if (lane == 0) {
        values[wid] = val;  // Write reduced value to shared memory
        indices[wid] = idx; // Write reduced index to shared memory
    }
    __syncthreads(); // Wait for all partial reductions

    // read from shared memory only if that warp existed
    if (threadIdx.x < blockDim.x / warpSize) {
        val = values[lane];
        idx = indices[lane];
    } else {
        val = INT_MAX;
        idx = 0;
    }

    if (wid == 0) {
        warpReduceMin(val, idx); // Final reduce within first warp
    }
}
[note: written in browser, never compiled or tested, use at own risk]
That should leave every block holding its correct local minimum and index. Then you have a second problem. This:
int old = atomicMin(out, minVal);
if (old != minVal) // value was updated
{
    atomicExch(out + 1, minIdx);
}
is broken. There is no guarantee that the minimum value and its index will be set consistently by this code: nothing synchronises the two atomic operations with respect to each other, so there is a race in which one block may correctly overwrite another block's minimum value but then have its index overwritten by the block it replaced. The only solutions here are some sort of mutex, or running a second reduction kernel on the per-block results.
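A rough sketch of the second-reduction-kernel option (illustrative names, reusing the blockReduceMin above and, like the rest of this answer, untested):
// Pass 1: each block writes its own (value, index) pair; no atomics needed.
__global__ void deviceReduceBlockKernel(const int* in, int* blockVals,
                                        int* blockIdxs, int N)
{
    int minVal = INT_MAX, minIdx = 0; // INT_MAX from <climits>
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
         i += blockDim.x * gridDim.x) {
        if (in[i] < minVal) { minVal = in[i]; minIdx = i; }
    }
    blockReduceMin(minVal, minIdx);     // per-block minimum and its index
    if (threadIdx.x == 0) {
        blockVals[blockIdx.x] = minVal; // one pair per block
        blockIdxs[blockIdx.x] = minIdx;
    }
}

// Pass 2: a single block reduces the per-block pairs to the final answer.
__global__ void finalReduceKernel(const int* blockVals, const int* blockIdxs,
                                  int* out, int numBlocks)
{
    int minVal = INT_MAX, minIdx = 0;
    for (int i = threadIdx.x; i < numBlocks; i += blockDim.x) {
        if (blockVals[i] < minVal) { minVal = blockVals[i]; minIdx = blockIdxs[i]; }
    }
    blockReduceMin(minVal, minIdx);     // launched with one block, so this is final
    if (threadIdx.x == 0) { out[0] = minVal; out[1] = minIdx; }
}

// host side (illustrative):
//   deviceReduceBlockKernel<<<numBlocks, 256>>>(d_in, d_blockVals, d_blockIdxs, N);
//   finalReduceKernel<<<1, 256>>>(d_blockVals, d_blockIdxs, d_out, numBlocks);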

Find max of matrix with window size in CUDA [duplicate]

I just started with CUDA. Now I have a question.
I have an N*N matrix and a window of scale 8x8. I want to subdivide this matrix into multiple sub-matrices and find the max value of each.
For example, if I have a 64*64 matrix, I will have 8 small matrices of 8*8 scale and find 8 max values. Finally I save all the max values into a new array, but their order always changes. I want to find a solution that keeps them in the right order.
__global__ void calculate_emax_kernel(float emap[], float emax[], int img_height, int img_width, int windows_size)
{
    int x_index = blockIdx.x*blockDim.x + threadIdx.x;
    int y_index = blockIdx.y*blockDim.y + threadIdx.y;

    int num_row_block = img_height/windows_size;
    int num_col_block = img_width/windows_size;

    __shared__ float window_elements[256];
    __shared__ int counter;
    __shared__ int emax_count;

    if (threadIdx.x == 0) emax_count = 0;
    __syncthreads();

    int index;
    int emax_idx = 0;

    if (y_index >= img_height || x_index >= img_width) return;

    for (int i = 0; i < num_row_block; i++)
    {
        for (int j = 0; j < num_col_block; j++)
        {
            counter = 0;
            if (y_index >= i*windows_size && y_index < (i+1)*windows_size
                && x_index >= j*windows_size && x_index < (j+1)*windows_size)
            {
                int idx = y_index*img_height + x_index;
                index = atomicAdd(&counter, 1);

                window_elements[index] = emap[idx];
                __syncthreads();

                // reduction
                unsigned int k = (windows_size*windows_size)/2;
                while (k != 0)
                {
                    if (index < k)
                    {
                        window_elements[index] = fmaxf(window_elements[index], window_elements[index+k]);
                    }
                    k /= 2;
                }
                if (index == 0)
                {
                    emax[i*num_row_block+j] = window_elements[index];
                }
            }
            __syncthreads();
        }
        __syncthreads();
    }
    __syncthreads();
}
This is my configuration
void construct_emax(float *input, float *output, int img_height, int img_width)
{
    int windows_size = 4;
    float *d_input, *d_output;
    cudaMalloc(&d_input, img_width*img_height*sizeof(float));
    cudaMalloc(&d_output, img_width*img_height*sizeof(float));
    cudaMemcpy(d_input, input, img_width*img_height*sizeof(float), cudaMemcpyHostToDevice);

    dim3 blocksize(16, 16);
    dim3 gridsize;
    gridsize.x = (img_width + blocksize.x - 1)/blocksize.x;
    gridsize.y = (img_height + blocksize.y - 1)/blocksize.y;

    calculate_emax_kernel<<<gridsize, blocksize>>>(d_input, d_output, img_height, img_width, windows_size);
}
With CUDA, parallel reduction is tricky; segmented parallel reduction is trickier. Now you are doing it in 2-D, and your segment/window is smaller than the thread block.
For large window size, I don't think it is a problem. You could use one thread block to reduce one window. For example if you have a 16x16 window, you could simply use 16x16 thread block. If you have even larger window size, for example 64x64, you could still use 16x16 thread block. First reduce the 64x64 window to 16x16 elements during data loading, then reduce to 1 scalar within the thread block.
For window sizes smaller than the block size, you will have to reduce multiple windows per thread block for higher performance. You could use your current block/grid configuration, where each 256-thread block (16x16) is responsible for 16 4x4 windows. But this will not be optimal, because each 32-thread warp is organized in two parts (2x16). This is not good for coalesced global memory access, and it is hard to map a 2x16 warp to one or more 4x4 windows for efficient parallel reduction.
Alternatively, I would suggest you use a 1-D thread block with 256 threads. Every m threads reduce one mxm window. Then you can use a 2-D grid to cover the whole image.
const int m = window_size;
dim3 blocksize(256);
dim3 gridsize((img_width+255)/256, (img_height+m-1)/m);
In the kernel function, you could:
reduce each mxm window to a 1xm vector during global data loading;
use the tree reduction method to reduce the 1xm vector to a scalar.
The following code is a conceptual demo that works when m is a power of 2 and m <= 32. You could further modify it for arbitrary m and better boundary checking.
#include <assert.h>
#include <cuda.h>
#include <thrust/device_vector.h>

__global__ void calculate_emax_kernel(const float* input, float* output,
                                      int height, int width, int win_size,
                                      int out_width)
{
    const int tid = threadIdx.x;
    const int i = blockIdx.y * win_size;
    const int j = blockIdx.x * 256 + tid;
    const int win_id = j % win_size;

    __shared__ float smax[256];

    float tmax = -1e20;
    if (j < width) {
        for (int tile = 0; tile < win_size; tile++) {
            if (i + tile < height) {
                tmax = max(tmax, input[(i + tile) * width + j]);
            }
        }
    }
    smax[tid] = tmax;
    for (int shift = win_size / 2; shift > 0; shift /= 2) {
        if (win_id < shift) {
            smax[tid] = max(smax[tid], smax[tid + shift]);
        }
    }
    if (win_id == 0 && j < width) {
        output[blockIdx.y * out_width + (j / win_size)] = smax[tid];
    }
}

int main()
{
    const int height = 1024;
    const int width = 1024;
    const int m = 4;

    thrust::device_vector<float> in(height * width);
    thrust::device_vector<float> out(
        ((height + m - 1) / m) * ((width + m - 1) / m));

    dim3 blocksize(256);
    dim3 gridsize((width + 255) / 256, (height + m - 1) / m);

    assert(m == 2 || m == 4 || m == 8 || m == 16 || m == 32);
    calculate_emax_kernel<<<gridsize, blocksize>>>(
        thrust::raw_pointer_cast(in.data()),
        thrust::raw_pointer_cast(out.data()),
        height, width, m, (width + m - 1) / m);

    return 0;
}
In case you're willing to use a library, a few pointers:
NPP, a set of image-processing primitives (from NVIDIA):
https://docs.nvidia.com/cuda/npp/group__image__filter__max.html
CUB, a lower-level library for other reduce operations and more granularity in the way you use the hardware (from NVIDIA / NVlabs):
http://nvlabs.github.io/cub/

Complicated for loop to be ported to a CUDA kernel

I have the following nested for loop and I would like to port it to CUDA to run on a GPU:
int current = 0;
int ptr = 0;
for (int i = 0; i < Nbeans; i++) {
    for (int j = 0; j < NbeamletsPerbeam[i]; j++) {
        current = j + ptr;
        for (int k = 0; k < Nmax; k++) {
            ......
        }
        ptr += NbeamletsPerbeam[i];
    }
}
I would be very happy if anybody has an idea of how this can be done.
We are talking about Nbeams=5, with NbeamletsPerBeam around 200 each.
This is what I currently have, but I am not sure it is right...
for (int i = blockIdx.x; i < d_params->Nbeams; i += gridDim.x) {
    for (int j = threadIdx.y; j < d_beamletsPerBeam[i]; j += blockDim.y) {
        currentBeamlet = j + k;
        for (int ivoxel = threadIdx.x; ivoxel < totalVoxels; ivoxel += blockDim.x) {
I would suggest this idea, but you might need to make some minor modifications based on your code.
dim3 blocks(NoOfThreads, 1);
dim3 grid(Nbeans, 1);
kernel<<<grid, blocks>>>();

__global__ void kernel()
{
    // NbeamletsPerbeam and Nmax are assumed to be accessible from device code
    int current = 0;
    int ptr = 0;
    int noOfBlocks = (NbeamletsPerbeam[blockIdx.x] + blockDim.x - 1) / blockDim.x;
    for (int j = 0; j < noOfBlocks; j++) {
        // use threads and compute....
        if ((j * blockDim.x + threadIdx.x) < NbeamletsPerbeam[blockIdx.x]) {
            current = (j * blockDim.x + threadIdx.x) + ptr; // beamlet handled by this thread in tile j
            for (int k = 0; k < Nmax; k++) {
                ......
            }
            ptr += NbeamletsPerbeam[blockIdx.x];
        }
    }
}
This should do the trick and give you better parallelization.