find nearest non-zero element in another vector in CUDA - cuda

There are two M x N matrices, A and B (the actual size is 512 x 4096).
In each row of A, the points to be processed are set to 1.
Each row of B contains values obtained through a specific operation.
For each row, I want to find, for every point set to 1 in A, the value of B at the nearest non-zero position in the corresponding row of B.
An example is shown in the figure below, and the MATLAB code I wrote is included as well.
Here's how I thought of it:
Pick the indices of the non-zero elements of A with Thrust, and for each such element fetch the closest value from the corresponding row of B with a for-loop.
(If there are several non-zero elements in A, this is expected to be slow.)
I want to make good use of the power of the GPU for this operation. Do you have any more efficient ideas?
[idxY,idxX] = find(A == 1);
for Point = 1:length(idxY)
pointBuf = find(B(:,idxY(Point)) == 1); % find the non-zero elements in the corresponding row of B
if ~isempty(pointBuf) % there are non-zero elements in that row of B
[MinValue, MinIndex] = min(abs(pointBuf - idxY(Point)));
C(idxY(Point),idxX(Point)) = B(pointBuf(MinIndex(1)),idxX(Point)); % get the closest point in B
else
C(idxY(Point),idxX(Point)) = 0; % if there are no non-zero elements in that row of B, just set to 0
end
end

Just for reference, here is a solution which shifts indices left and right 4095 times. It has similarities with bubble sort variants that bubble up and down at the same time.
The advantage is that it does not depend on the position of the non-null elements in B and can be easily parallelized between threads.
But the inner loop, which translates to 2 SASS instructions, is still just too slow (called too often): the program takes 26 ms on my notebook.
It would take that long in both the best and the absolute worst case of the input matrices.
Parts and methods of it can probably be reused, as it demonstrates some CUDA programming techniques.
So, more or less for reference, and in the end not a final (fast enough) solution:
__global__ void calcmatrix(bool* A, double* B, double* C)
{
// calculate row number
int row = blockIdx.x * blockDim.y + threadIdx.y;
if (row >= 512)
return;
// store index of valid double from B, this is moved up and down
// those indices are for the current thread. Each thread is responsible for 128 consecutive columns.
int indices[128];
// prefill the indices with their own number (as if every double from B is valid)
#pragma unroll
for (int i = 0; i < 128; i++)
indices[i] = threadIdx.x * 128 + i;
// Store zero flags (4 * 32 bits) for our 128 elements
unsigned int myzeroflags[4];
// For efficiently loading data from memory, we distribute the data in another way: thread 0 gets columns 0, 32, 64, 96, ...; thread 1 gets columns 1, 33, 65, 97, ...; thread 2 gets columns 2, 34, 66, 98, ...; and so on
#pragma unroll
for (int i = 0; i < 128; i++) {
// load value from B
double in = B[row * 4096 + i * 32 + threadIdx.x];
// compare to zero (!in) and combine all bool results from the 32 threads (__ballot_sync))
unsigned int zeroflag = __ballot_sync(0xFFFFFFFF, !in);
// store the ones, which belong to us
if (threadIdx.x == i / 4)
myzeroflags[i & 3] = zeroflag;
}
// go through our zero flags and set those indices to -1 (there is already a valid index "0", so we use a negative number to signify invalid)
#pragma unroll
for (int i = 0; i < 4; i++)
#pragma unroll
for (int j = 0; j < 32; j++)
if (myzeroflags[i] & (1 << j))
indices[i * 32 + j] = -1;
// main loop, do 4095 times
#pragma unroll 1
for (int i = 0; i < 4095; i++) {
// move all elements to the left (if the index there is invalid)
// send index over thread boundaries
int fromright = __shfl_down_sync(0xFFFFFFFF, indices[0], 1, 32);
#pragma unroll
// if left index is -1, set it to one index to the right
for (int j = 0; j < 127; j++)
if (indices[j] == -1)
indices[j] = indices[j + 1];
// move over thread boundaries (except for the rightmost thread)
if (threadIdx.x != 31 && indices[127] == -1)
indices[127] = fromright;
// move to the right in the same way as to the left
int fromleft = __shfl_up_sync(0xFFFFFFFF, indices[127], 1, 32);
#pragma unroll
for (int j = 127; j > 0; j--)
if (indices[j] == -1)
indices[j] = indices[j - 1];
if (threadIdx.x != 0 && indices[0] == -1)
indices[0] = fromleft;
}
// for the other distribution of elements for memory accesses, we have to redistribute the indices to the correct threads
// To not have bank conflicts, we define the shared memory array with 33 instead of 32 elements in the last dimension, but use only 32. With this method we can put threadIdx.x into the last and previous to last dimension without bank conflicts
__shared__ short2 distribidx[8][32][33];
int indices2[128];
// Redistribute first half; the index can go from 0..4095 (and also theoretically -1, if there was no non-null element in this row). This fits into a short, so convert for faster transfer
#pragma unroll
for (int i = 0; i < 32; i++)
distribidx[threadIdx.y][threadIdx.x][i] = { static_cast<short>(indices[i]), static_cast<short>(indices[i + 32]) };
__syncwarp();
#pragma unroll
for (int i = 0; i < 32; i++) {
short2 idxback = distribidx[threadIdx.y][i][threadIdx.x];
indices2[4 * i + 0] = idxback.x;
indices2[4 * i + 1] = idxback.y;
}
__syncwarp();
// Redistribute second half
#pragma unroll
for (int i = 0; i < 32; i++)
distribidx[threadIdx.y][threadIdx.x][i] = { static_cast<short>(indices[i + 64]), static_cast<short>(indices[i + 96]) };
__syncwarp();
#pragma unroll
for (int i = 0; i < 32; i++) {
short2 idxback = distribidx[threadIdx.y][i][threadIdx.x];
indices2[4 * i + 2] = idxback.x;
indices2[4 * i + 3] = idxback.y;
}
// Do final calculation
#pragma unroll
for (int i = 0; i < 128; i++) {
// Default value is zero
double result = 0;
// Read only, if A is true and indices2 is valid
if (A[row * 4096 + i * 32 + threadIdx.x] && indices2[i] != -1)
// Read B with calculated index (this read is not optimized/coalesced, because the indices can be wild, but hopefully was or can be cached)
result = B[row * 4096 + indices2[i]];
// Store result in C
C[row * 4096 + i * 32 + threadIdx.x] = result;
}
}
int main()
{
bool* A;
double* B;
double* C;
cudaMalloc(&A, sizeof(bool) * 512 * 4096);
cudaMalloc(&B, sizeof(double) * 512 * 4096);
cudaMalloc(&C, sizeof(double) * 512 * 4096);
// called in this fashion
calcmatrix<<<(512 + 7) / 8, dim3(32, 8)>>>(A, B, C);
return 0;
}

This problem is really far from being simple to implement efficiently on a GPU. The main reason is that GPUs are designed to efficiently execute SIMD-friendly algorithms, while this problem can hardly be solved in a SIMD-friendly way.
The naive solution you propose will be very inefficient due to the many small kernels to execute (starting a kernel is expensive, and Thrust tends to run them synchronously by default AFAIK), not to mention that the amount of parallelism in each kernel would be far too small for any modern GPU. I expect this solution to be slower than a naive CPU implementation.
First things first, one needs to find an efficient algorithm. The proposed solution runs in O(n m²), where n is the number of rows and m the number of columns. That being said, the solution should be fast (i.e. close to O(n m)) if most values are non-zero, which is not the case in the example.
A more efficient solution is to first iterate over the B matrix and find the locations of all the non-zero items so as to put them in an array L. Then you can iterate over A, track the non-zero values and search for the index of L closest to the location of the current item of A. If the number of items in L is big for the target row (e.g. >50), you can use a binary search to find the location faster (since the items of L are sorted). This solution runs in O(n m log m) time.
An even better solution is to iterate simultaneously over A and L like in a merge algorithm. Indeed, the indices of the non-zero items of A and the items of L are both sorted, so the binary search is not even needed. When the index of the current non-zero item of A is bigger than the current item of L, you can move on to the next value of L (and memorize the last discarded value of L, which is needed to compute the closest value). This algorithm runs in O(n m) (optimal). An efficient CPU implementation consists in computing chunks of rows in many threads; a sketch of the per-row pass follows.
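A minimal CPU sketch of that merge-like pass for one row (names are illustrative; it assumes the column indices of the non-zero entries of B in this row were already collected, in increasing order, in the vector L):
#include <cstdlib>
#include <vector>

// a: row of A (0/1 flags), b: row of B, c: output row, m: number of columns
void closestPerRow(const bool* a, const double* b, double* c,
                   int m, const std::vector<int>& L)
{
    std::size_t k = 0; // current candidate in L
    for (int col = 0; col < m; ++col) {
        if (!a[col] || L.empty()) { c[col] = 0.0; continue; }
        // advance while the next non-zero of B is strictly closer to this column
        while (k + 1 < L.size() && std::abs(L[k + 1] - col) < std::abs(L[k] - col))
            ++k;
        c[col] = b[L[k]];
    }
}
Running this for each of the n rows (e.g. one chunk of rows per CPU thread) gives the O(n m) behaviour described above.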
On a GPU, things are more complex, since all the previously described algorithms are not SIMD-friendly. Computing a row in a SIMD-friendly way turns out to be complex and generally inefficient (the overhead can be higher than that of the serial algorithm on a CPU). One possible solution would be to compute rows in parallel (1 thread per row) and transpose the matrix block by block in shared memory so as to perform SIMD-friendly memory accesses after that (assuming there is enough space). The non-zero values of A and B certainly need to be extracted first so as to avoid thread divergence as much as possible. This solution works only if the number of non-zeros is relatively uniform between the rows (otherwise I doubt a GPU can actually be helpful). Note that the overhead of the transposition can be significant compared to the computation. Thus, I am not sure it will be faster than a CPU-based solution. In fact, if the data lies in CPU memory, then just transferring it to the GPU will certainly be more expensive than computing the result on a CPU in parallel.

Related

Cuda Kernel with reduction - logic errors for dot product of 2 matrices

I am just starting off with CUDA and am trying to wrap my brain around the CUDA reduction algorithm. In my case, I have been trying to get the dot product of two matrices. But I am getting the right answer only for matrices of size 2. For matrices of any other size, I get the wrong answer.
This is only a test, so I am keeping the matrix size very small. Only about 100 elements, so a single block would fit it all.
Any help would be greatly appreciated. Thanks!
Here is the regular code
float* ha = new float[n]; // matrix a
float* hb = new float[n]; // matrix b
float* hc = new float[1]; // sum of a.b
float dx = hc[0];
float hx = 0;
// dot product
for (int i = 0; i < n; i++)
hx += ha[i] * hb[i];
Here is my cuda kernel
__global__ void sum_reduce(float* da, float* db, float* dc, int n)
{
int tid = threadIdx.x;
dc[tid] = 0;
for (int stride = 1; stride < n; stride *= 2) {
if (tid % (2 * stride) == 0)
dc[tid] += (da[tid] * db[tid]) + (da[tid+stride] * db[tid+stride]);
__syncthreads();
}
}
My complete code : http://pastebin.com/zS85URX5
Hopefully you can figure out why it works for the n=2 case, so let's skip that, and take a look at why it fails for some other case, let's choose n=4. When n = 4, you have 4 threads, numbered 0 to 3.
In the first iteration of your for-loop, stride = 1, so the threads that pass the if test are threads 0 and 2.
thread 0: dc[0] += da[0]*db[0] + da[1]*db[1];
thread 2: dc[2] += da[2]*db[2] + da[3]*db[3];
So far so good. In the second iteration of your for loop, stride is 2, so the thread that passes the if test is thread 0 (only).
thread 0: dc[0] += da[0]*db[0] + da[2]*db[2];
But this doesn't make sense and is not what we want at all. What we want is something like:
dc[0] += dc[2];
So it's broken. I spent a little while trying to think about how to fix this in just a few steps, but it just doesn't make sense to me as a reduction. If you replace your kernel code with this code, I think you'll have good results. It's not a lot like your code, but it was the closest I could come to something that would work for all the cases you've envisioned (i.e. n < max thread block size, using a single block):
// CUDA kernel code
__global__ void sum_reduce(float* da, float* db, float* dc, int n)
{
int tid = threadIdx.x;
// do multiplication in parallel for full width of threads
dc[tid] = da[tid] * db[tid];
// wait for all threads to complete multiply step
__syncthreads();
int stride = blockDim.x;
while (stride > 1){
// handle odd step
if ((stride & 1) && (tid == 0)) dc[0] += dc[stride - 1];
// successively divide problem by 2
stride >>= 1;
// add each upper half element to each lower half element
if (tid < stride) dc[tid] += dc[tid + stride];
// wait for all threads to complete add step
__syncthreads();
}
}
Note that I'm not really using the n parameter. Since you are launching the kernel with n threads, the blockDim.x built-in variable is equal to n in this case.
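For completeness, a minimal sketch (not part of the original question) of how this kernel could be invoked; it assumes n is at most the maximum block size and that dc was allocated with room for n floats, since the kernel uses it as scratch space before leaving the result in dc[0]:
sum_reduce<<<1, n>>>(da, db, dc, n);                        // one block of n threads
cudaMemcpy(hc, dc, sizeof(float), cudaMemcpyDeviceToHost);  // hc[0] now holds the dot product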

CUDA reduction using registers

I need to calculate the mean values of N signals using reduction. The input is a 1D array of size M*N, where M is the length of each signal.
Originally I had additional shared memory to first copy the data and do the reduction on each signal there. However, the original data is corrupted.
My program tries to minimize shared memory, so I was wondering how I can use registers to do a reduction sum on the N signals. I have N threads and a shared-memory array (float) s_m[N*M], where elements 0...M-1 are the first signal, and so on.
Do I need N registers (or just one) to store the mean values of the N different signals? (I know how to do it with sequential addition using multi-threaded programming and one register.) The next step is to subtract each signal's mean from every value of that signal.
Your problem is very small (N = 32 and M < 128). However, some guidelines:
Assuming you are reducing across M values for each of the N signals:
If N is very large (> 10s of thousands), just do the reduction over M sequentially in each thread.
If N is < 10s of thousands, consider using one warp or one thread block to perform each of the N reductions (a warp-level sketch follows this list).
If N is very small but M is very large, consider using multiple thread blocks for each of the N reductions.
If N is very small and M is very small (as your numbers are), only consider using the GPU for the reductions if the computations that generate and/or consume the input/output of the reductions are also running on the GPU.
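For the "one warp per reduction" case, a minimal sketch using warp shuffles could look like this (the layout follows the question: N signals of M samples stored contiguously; the kernel and parameter names are illustrative):
__global__ void meanPerSignalWarp(float* signals, int N, int M)
{
    int warpId = (blockIdx.x * blockDim.x + threadIdx.x) / 32; // one warp per signal
    int lane   = threadIdx.x % 32;
    if (warpId >= N) return;

    // Each lane accumulates a strided partial sum over its signal
    float sum = 0.0f;
    for (int i = lane; i < M; i += 32)
        sum += signals[warpId * M + i];

    // Reduce the 32 partial sums within the warp
    for (int offset = 16; offset > 0; offset >>= 1)
        sum += __shfl_down_sync(0xFFFFFFFF, sum, offset);

    // Lane 0 now holds the total; broadcast the mean to all lanes
    float mean = __shfl_sync(0xFFFFFFFF, sum, 0) / M;

    // Subtract the mean from every sample of this signal
    for (int i = lane; i < M; i += 32)
        signals[warpId * M + i] -= mean;
}
It could be launched, for example, as meanPerSignalWarp<<<N, 32>>>(signals, N, M), i.e. one 32-thread block (one warp) per signal.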
Based on my understanding of the question, I'd say that you don't need N registers to store the mean values of N different signals.
If you already have N threads [given that each thread does the reduction of only one signal], then you don't need N registers to store the reduction of one signal. All you need is one register to store the mean value.
dim3 threads (N,1);
reduction<<<1, threads>>>(signals); // launch one block of N threads; signals is the [N*M] array
__global__ void reduction (float *signals)
{
int id = threadIdx.x;
float meanValue = 0.0;
for(int i = 0; i < M; i++)
meanValue += signals[id*M +i];
meanValue = meanValue/M;
// Then do the subtraction
for(int i = 0; i < M; i++)
signals[id*M +i] -= meanValue;
}
If you need to do a kind of global reduction of all the mean values of the N different signals, then you need to use two registers [one to store the local mean and another to store the global mean] and shared memory:
dim3 threads (N,1);
reduction<<<1, threads>>>(signals); // launch one block of N threads; signals is the [N*M] array
__global__ void reduction (float *signals)
{
__shared__ float means[N]; // shared value
int id = threadIdx.x;
float meanValue = 0.0;
float globalMean = 0.0;
for(int i = 0; i < M; i++)
meanValue += signals[id*M +i];
means[id] = meanValue/M;
__syncthreads();
// do the global reduction
for(int i = 0; i < N; i++)
globalMean += means[i];
globalMean = globalMean/N;
// Then do the subtraction
for(int i = 0; i < M; i++)
signals[id*M +i] -= globalMean;
}
I hope this helps you. Any doubts, let me know.

cuda kernel for conway's game of life

I'm trying to calculate the number of transitions that would be made in a run of Conway's GOL for a pxq matrix over n iterations. For instance, given 1 iteration with the initial state being 1 blinker (as below), there would be 5 transitions (2 births, 1 survival, 2 deaths from underpopulation). I've already got this working, but I'd like to convert this logic to run using CUDA. Below is what I want to port to CUDA.
code:
static void gol() // call this iterations x's
{
int[] tempGrid = new int[rows * cols]; // grid holds init conditions
for (int i = 0; i < rows; i++)
{
for (int j = 0; j < cols; j++)
{
tempGrid[i * cols + j] = grid[i * cols + j];
}
}
for (int i = 0; i < rows; i++)
{
for (int j = 0; j < cols; j++)
{
int numNeighbors = neighbors(i, j); // finds # of neighbors
if (grid[i * cols + j] == 1 && numNeighbors > 3)
{
tempGrid[i * cols + j] = 0;
overcrowding++;
}
else if (grid[i * cols + j] == 1 && numNeighbors < 2)
{
tempGrid[i * cols + j] = 0;
underpopulation++;
}
else if (grid[i * cols + j] == 1 && numNeighbors > 1)
{
tempGrid[i * cols + j] = 1;
survival++;
}
else if (grid[i * cols + j] == 0 && numNeighbors == 3)
{
tempGrid[i * cols + j] = 1;
birth++;
}
}
}
grid = tempGrid;
}
Your main slowdown is going to be main memory access. So I'd suggest that you pick a largish thread block size based on the hardware you have available. 256 (16x16) is a good choice for cross-hardware compatibility. Each of those thread blocks is going to calculate the results for a slightly smaller section of the board -- if you used 16x16, they'll calculate the results for a 14x14 section of the board, since there is a one element border. (The reason to use a 16x16 block to calculate a 14x14 chunk rather than a 16x16 chunk is for memory read coalescing.)
Divide the board up into (say) 14x14 chunks; that is your grid (organized however you see fit, but most likely something like board_width / 14 by board_height / 14).
Within the kernels, have each thread load its element into shared memory. Then syncthreads. Then have the middle 14x14 elements calculate the new value (using the values stored in shared memory) and write it back into global memory. The use of shared memory helps minimize global reads and writes. This is also the reason to have your thread block size as big as possible -- the edges and corners are "wasted" global memory accesses, since the values fetched there only get used 1 or 3 times, not 9 times.
Here's one way you could proceed (a kernel sketch follows this list):
Each thread makes the computation for 1 element of the grid
Each thread first loads up one element from the main grid into shared memory
Threads on the edge of the thread block need also to load up boundary elements
Each thread can then make their survival computation based on the contents of shared memory
Each thread then writes their result back to main memory
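A minimal kernel sketch of this scheme (not a drop-in port of the code above): it assumes the board is stored row-major as 0/1 ints in d_in, writes the next generation to d_out, treats cells outside the board as dead, and omits the transition counters (those could be accumulated with atomicAdd):
#define TILE  16 // thread block is TILE x TILE
#define INNER 14 // each block produces an INNER x INNER chunk of output

__global__ void golStep(const int* d_in, int* d_out, int width, int height)
{
    __shared__ int tile[TILE][TILE];

    // Global coordinates of the cell this thread loads (tiles overlap by one cell on each side)
    int gx = blockIdx.x * INNER + threadIdx.x - 1;
    int gy = blockIdx.y * INNER + threadIdx.y - 1;

    // Load one cell into shared memory; cells outside the board count as dead
    int value = 0;
    if (gx >= 0 && gx < width && gy >= 0 && gy < height)
        value = d_in[gy * width + gx];
    tile[threadIdx.y][threadIdx.x] = value;
    __syncthreads();

    // Only the inner 14x14 threads compute and write a result
    if (threadIdx.x >= 1 && threadIdx.x <= INNER && threadIdx.y >= 1 && threadIdx.y <= INNER &&
        gx >= 0 && gx < width && gy >= 0 && gy < height)
    {
        int neighbors = 0;
        for (int dy = -1; dy <= 1; dy++)
            for (int dx = -1; dx <= 1; dx++)
                if (dx != 0 || dy != 0)
                    neighbors += tile[threadIdx.y + dy][threadIdx.x + dx];

        int alive = tile[threadIdx.y][threadIdx.x];
        d_out[gy * width + gx] = (alive && (neighbors == 2 || neighbors == 3)) ||
                                 (!alive && neighbors == 3);
    }
}
It would be launched with dim3((width + INNER - 1) / INNER, (height + INNER - 1) / INNER) blocks of dim3(TILE, TILE) threads, ping-ponging between two device buffers for successive iterations.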

Parallel reduction and find index on CUDA

I have an array of 20K values and I am reducing it over 50 blocks with 400 threads each. num_blocks = 50 and block_size = 400.
My code looks like this:
getmax <<< num_blocks,block_size >>> (d_in, d_out1, d_indices);
__global__ void getmax(float *in1, float *out1, int *index)
{
// Declare arrays to be in shared memory.
__shared__ float max[threads];
int nTotalThreads = blockDim.x; // Total number of active threads
float temp;
float max_val;
int max_index;
int arrayIndex;
// Calculate which element this thread reads from memory
arrayIndex = gridDim.x*blockDim.x*blockIdx.y + blockDim.x*blockIdx.x + threadIdx.x;
max[threadIdx.x] = in1[arrayIndex];
max_val = max[threadIdx.x];
max_index = blockDim.x*blockIdx.x + threadIdx.x;
__syncthreads();
while(nTotalThreads > 1)
{
int halfPoint = (nTotalThreads >> 1);
if (threadIdx.x < halfPoint)
{
temp = max[threadIdx.x + halfPoint];
if (temp > max[threadIdx.x])
{
max[threadIdx.x] = temp;
max_val = max[threadIdx.x];
}
}
__syncthreads();
nTotalThreads = (nTotalThreads >> 1); // divide by two.
}
if (threadIdx.x == 0)
{
out1[num_blocks*blockIdx.y + blockIdx.x] = max[threadIdx.x];
}
if(max[blockIdx.x] == max_val )
{
index[blockIdx.x] = max_index;
}
}
The problem/issue here is that at some point nTotalThreads is not exactly a power of 2, resulting in a garbage value for the index. The array out1 gives me the maximum value in each block, which is correct and validated. But the value of the index is wrong. For example: the max value in the first block occurs at index = 40, but the kernel gives the value of the index as 15. Similarly, the max value in the second block is at 440, but the kernel gives 416.
Any suggestions??
It should be easy to ensure that nTotalThreads is always a power of 2.
Make the first reduction a special case that gets nTotalThreads down to a power of 2. E.g., since you start with 400 threads in a block, do the first reduction with 256 threads: threads 0-143 each reduce two values, and threads 144-255 just won't have to do a reduction in this initial step. From then on out you'd be fine; a sketch of this first step follows.
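A minimal sketch of that special-case first step, reusing the names from the question's kernel (max, max_val, max_index) and assuming blockDim.x is 400:
int tid = threadIdx.x;
int nTotalThreads = 256; // largest power of 2 <= 400
// threads 0..143 fold elements 256..399 into elements 0..143
if (tid < blockDim.x - nTotalThreads)
{
    float temp = max[tid + nTotalThreads];
    if (temp > max[tid])
    {
        max[tid] = temp;
        max_val = temp;
        max_index = blockDim.x * blockIdx.x + tid + nTotalThreads;
    }
}
__syncthreads();
// ...then continue with the existing power-of-2 loop, starting from nTotalThreads = 256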
Are you sure you really need to handle the 'issue' that nTotalThreads is not exactly a power of 2?
It makes the code less readable and I think it can interfere with the performance too.
Anyway if you substitute
nTotalThreads = (nTotalThreads >> 1);
with
nTotalThreads = (nTotalThreads +1 ) >> 1;
it should solve one bug concerning this 'issue'.
Francesco
I second Jeff's suggestion.
Take a look at the CUDA Thrust Library's reduce function. This is demonstrated to have 95+% efficiency compared with heavily hand-tuned kernels and is pretty flexible and easy to use.
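As an aside (not from the original answer), Thrust also provides max_element, which gives both the maximum value and its position in one call; here d_in is assumed to be the device pointer holding the 20K floats:
#include <thrust/device_ptr.h>
#include <thrust/extrema.h>

thrust::device_ptr<float> dptr(d_in);
thrust::device_ptr<float> maxIt = thrust::max_element(dptr, dptr + 20000);
int   maxIndex = maxIt - dptr; // index of the maximum element
float maxValue = *maxIt;       // dereferencing copies the value back to the host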
Check my kernel. You can put your per-block results into an array (which can be in global memory) and get the final result from global memory.
And see how I call it in host code:
sumSeries<<<dim3(blockCount),dim3(threadsPerBlock)>>>(deviceSum,threadsPerBlock*blockCount);

Cummulative array summation using OpenCL

I'm calculating the Euclidean distance between n-dimensional points using OpenCL. I get two lists of n-dimensional points and I should return an array that contains just the distances from every point in the first table to every point in the second table.
My approach is to do the regular double loop (for every point in Table1 { for every point in Table2 { ... } }) and then do the calculation for every pair of points in parallel.
The Euclidean distance is then split into these steps:
1. take the difference between each dimension in the points
2. square that difference (still for every dimension)
3. sum all the values obtained in 2.
4. Take the square root of the value obtained in 3. (this step has been omitted in this example.)
Everything works like a charm until I try to accumulate the sum of all differences (namely, executing step 3. of the procedure described above, line 49 of the code below).
As test data I'm using DescriptorLists with 2 points each:
DescriptorList1: 001,002,003,...,127,128; (p1)
129,130,131,...,255,256; (p2)
DescriptorList2: 000,001,002,...,126,127; (p1)
128,129,130,...,254,255; (p2)
So the resulting vector should have the values: 128, 2064512, 2130048, 128
Right now I'm getting random numbers that vary with every run.
I appreciate any help or leads on what I'm doing wrong. Hopefully everything is clear about the scenario I'm working in.
#define BLOCK_SIZE 128
typedef struct
{
//How large each point is
int length;
//How many points in every list
int num_elements;
//Pointer to the elements of the descriptor (stored as a raw array)
__global float *elements;
} DescriptorList;
__kernel void CompareDescriptors_deb(__global float *C, DescriptorList A, DescriptorList B, int elements, __local float As[BLOCK_SIZE])
{
int gpidA = get_global_id(0);
int featA = get_local_id(0);
//temporary array to store the difference between each dimension of 2 points
float dif_acum[BLOCK_SIZE];
//counter to track the iterations of the inner loop
int loop = 0;
//loop over all descriptors in A
for (int i = 0; i < A.num_elements/BLOCK_SIZE; i++){
//take the i-th descriptor. Returns a DescriptorList with just the i-th
//descriptor in DescriptorList A
DescriptorList tmpA = GetDescriptor(A, i);
//copy the current descriptor to local memory.
//returns one element of the only descriptor in DescriptorList tmpA
//and index featA
As[featA] = GetElement(tmpA, 0, featA);
//wait for all the threads to finish copying before continuing
barrier(CLK_LOCAL_MEM_FENCE);
//loop over all the descriptors in B
for (int k = 0; k < B.num_elements/BLOCK_SIZE; k++){
//take the difference of both current points
dif_acum[featA] = As[featA]-B.elements[k*BLOCK_SIZE + featA];
//wait again
barrier(CLK_LOCAL_MEM_FENCE);
//square value of the difference in dif_acum and store in C
//which is where the results should be stored at the end.
C[loop] = 0;
C[loop] += dif_acum[featA]*dif_acum[featA];
loop += 1;
barrier(CLK_LOCAL_MEM_FENCE);
}
}
}
Your problem lies in these lines of code:
C[loop] = 0;
C[loop] += dif_acum[featA]*dif_acum[featA];
All threads in your workgroup (well, actually all your threads, but let's come to that later) are trying to modify this array position concurrently without any synchronization whatsoever. Several factors make this really problematic:
The workgroup is not guaranteed to work completely in parallel, meaning that for some threads C[loop] = 0 can be called after other threads have already executed the next line.
Those that execute in parallel all read the same value from C[loop], modify it with their increment and try to write back to the same address. I'm not completely sure what the result of that writeback is (I think one of the threads succeeds in writing back, while the others fail, but I'm not completely sure), but it's wrong either way.
Now let's fix this:
While we might be able to get this to work on global memory using atomics, it won't be fast, so let's accumulate in local memory:
local float* accum;
...
accum[featA] = dif_acum[featA]*dif_acum[featA];
barrier(CLK_LOCAL_MEM_FENCE);
for(unsigned int i = 1; i < BLOCK_SIZE; i *= 2)
{
if ((featA % (2*i)) == 0)
accum[featA] += accum[featA + i];
barrier(CLK_LOCAL_MEM_FENCE);
}
if(featA == 0)
C[loop] = accum[0];
Of course you can reuse other local buffers for this, but I think the point is clear (btw: are you sure that dif_acum will be created in local memory? I think I read somewhere that private arrays like this are not placed in local memory, which would make preloading A into local memory kind of pointless).
Some other points about this code:
Your code seems to be geared towards using only one workgroup (you aren't using either the group id nor the global id to see which items to work on); for optimal performance you might want to use more than that.
It might be personal preference, but to me it seems better to use get_local_size(0) for the workgroup size than to use a #define (since you might change it in the host code without realizing you should have changed your OpenCL code too).
The barriers in your code are all unnecessary, since no thread accesses an element in local memory which is written by another thread. Therefore you don't need to use local memory for this.
Considering the last bullet you could simply do:
float As = GetElement(tmpA, 0, featA);
...
float dif_acum = As-B.elements[k*BLOCK_SIZE + featA];
This would make the code (not considering the first two bullets):
__kernel void CompareDescriptors_deb(__global float *C, DescriptorList A, DescriptorList B, int elements, __local float accum[BLOCK_SIZE])
{
int gpidA = get_global_id(0);
int featA = get_local_id(0);
int loop = 0;
for (int i = 0; i < A.num_elements/BLOCK_SIZE; i++){
DescriptorList tmpA = GetDescriptor(A, i);
float As = GetElement(tmpA, 0, featA);
for (int k = 0; k < B.num_elements/BLOCK_SIZE; k++){
float dif_acum = As-B.elements[k*BLOCK_SIZE + featA];
accum[featA] = dif_acum*dif_acum;
barrier(CLK_LOCAL_MEM_FENCE);
for(unsigned int i = 1; i < BLOCK_SIZE; i *= 2)
{
if ((featA % (2*i)) == 0)
accum[featA] += accum[featA + i];
barrier(CLK_LOCAL_MEM_FENCE);
}
if(featA == 0)
C[loop] = accum[0];
barrier(CLK_LOCAL_MEM_FENCE);
loop += 1;
}
}
}
Thanks to Grizzly, I now have a working kernel. Some things I needed to modify based on Grizzly's answer:
I added an IF statement at the beginning of the routine to discard all threads that won't reference any valid position in the arrays I'm using.
if(featA >= BLOCK_SIZE){return;}
When copying the first descriptor to local (shared) memory (e.g. to Bs), the index has to be specified, since the function GetElement returns just one element per call (I skipped that in my question).
Bs[featA] = GetElement(tmpA, 0, featA);
Then, the SCAN loop needed a little tweaking because the buffer is being overwritten after each iteration and one cannot control which thread accesses the data first. That is why I'm 'recycling' the dif_acum buffer to store partial results and, that way, prevent inconsistencies throughout that loop.
dif_acum[featA] = accum[featA];
There is also some boundary control in the SCAN loop to reliably determine the terms to be added together.
if (featA >= j && next_addend >= 0 && next_addend < BLOCK_SIZE){
Lastly, I thought it made sense to include the loop variable increment within the last IF statement so that only one thread modifies it.
if(featA == 0){
C[loop] = accum[BLOCK_SIZE-1];
loop += 1;
}
That's it. I still wonder how I can make use of the group size to eliminate that BLOCK_SIZE definition and whether there are better policies I can adopt regarding thread usage.
So the code looks finally like this:
__kernel void CompareDescriptors(__global float *C, DescriptorList A, DescriptorList B, int elements, __local float accum[BLOCK_SIZE], __local float Bs[BLOCK_SIZE])
{
int gpidA = get_global_id(0);
int featA = get_local_id(0);
//global counter to store final differences
int loop = 0;
//auxiliary buffer to store temporary data
local float dif_acum[BLOCK_SIZE];
//discard the threads that are not going to be used.
if(featA >= BLOCK_SIZE){
return;
}
//loop over all descriptors in A
for (int i = 0; i < A.num_elements/BLOCK_SIZE; i++){
//take the gpidA-th descriptor
DescriptorList tmpA = GetDescriptor(A, i);
//copy the current descriptor to local memory
Bs[featA] = GetElement(tmpA, 0, featA);
//loop over all the descriptors in B
for (int k = 0; k < B.num_elements/BLOCK_SIZE; k++){
//take the difference of both current descriptors
dif_acum[featA] = Bs[featA]-B.elements[k*BLOCK_SIZE + featA];
//square the values in dif_acum
accum[featA] = dif_acum[featA]*dif_acum[featA];
barrier(CLK_LOCAL_MEM_FENCE);
//copy the values of accum to keep consistency once the scan procedure starts. Mostly important for the first element. Two buffers are necessary because the scan procedure would overwrite values that are later read if only one buffer were used.
dif_acum[featA] = accum[featA];
//Compute the accumulated sum (a.k.a. scan)
for(int j = 1; j < BLOCK_SIZE; j *= 2){
int next_addend = featA-(j/2);
if (featA >= j && next_addend >= 0 && next_addend < BLOCK_SIZE){
dif_acum[featA] = accum[featA] + accum[next_addend];
}
barrier(CLK_LOCAL_MEM_FENCE);
//copy As to accum
accum[featA] = GetElementArray(dif_acum, BLOCK_SIZE, featA);
barrier(CLK_LOCAL_MEM_FENCE);
}
//tell one of the threads to write the result of the scan in the array containing the results.
if(featA == 0){
C[loop] = accum[BLOCK_SIZE-1];
loop += 1;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
}
}