Parallel prefix sum with multiple elements per thread without using thrust

Parallel prefix sum with multiple elements per thread without using thrust - cuda

I'm trying to perform an inclusive scan to find the cumulative sum of an array. Following the advice given by harrism here, I'm using the procedure given here, but following the advice of those authors, I'm trying to write code that has each thread calculate 4 elements instead of one to mask memory latency.
I am staying away from thrust as performance is essential, and I need multi-stream capability. I have only just discovered CUB, and that will be my next effort, but I would like a multi-block solution and would also like to know where I've gone wrong on my existing code, just as an exercise to better understand CUDA.
The code below allocates 4 data elements to each block, where each block must have a multiple of 32 threads. My data will have a multiple of 128 threads so this restriction is acceptable to me. Enough shared memory is allocated to each block for the 4*blockDim.x elements plus an additional 32 elements to sum between warps. scanBlockAnyLength then adds the necessary offset to correct mismatch between warps, saving the final value of each warp to dev_blockSum in device global memory. sumWarp4_32 then scans this array to find the final to correct the mismatch between blocks, which is then added on in kernel_sumBlock
#include<cuda.h>
#include<iostream>
using std::cout;
using std::endl;
#define MAX_THREADS 1024
#define MAX_BLOCKS 65536
#define N 512
__device__ float sumWarp4_128(float* ptr, const int tidx = threadIdx.x) {
const unsigned int lane = tidx & 31;
const unsigned int warpid = tidx >> 5; //32 threads per warp
unsigned int i = warpid*128+lane; //first element of block data set this thread looks at
if( lane >= 1 ) ptr[i] += ptr[i-1];
if( lane >= 2 ) ptr[i] += ptr[i-2];
if( lane >= 4 ) ptr[i] += ptr[i-4];
if( lane >= 8 ) ptr[i] += ptr[i-8];
if( lane >= 16 ) ptr[i] += ptr[i-16];
if( lane==0 ) ptr[i+32] += ptr[i+31];
if( lane >= 1 ) ptr[i+32] += ptr[i+32-1];
if( lane >= 2 ) ptr[i+32] += ptr[i+32-2];
if( lane >= 4 ) ptr[i+32] += ptr[i+32-4];
if( lane >= 8 ) ptr[i+32] += ptr[i+32-8];
if( lane >= 16 ) ptr[i+32] += ptr[i+32-16];
if( lane==0 ) ptr[i+64] += ptr[i+63];
if( lane >= 1 ) ptr[i+64] += ptr[i+64-1];
if( lane >= 2 ) ptr[i+64] += ptr[i+64-2];
if( lane >= 4 ) ptr[i+64] += ptr[i+64-4];
if( lane >= 8 ) ptr[i+64] += ptr[i+64-8];
if( lane >= 16 ) ptr[i+64] += ptr[i+64-16];
if( lane==0 ) ptr[i+96] += ptr[i+95];
if( lane >= 1 ) ptr[i+96] += ptr[i+96-1];
if( lane >= 2 ) ptr[i+96] += ptr[i+96-2];
if( lane >= 4 ) ptr[i+96] += ptr[i+96-4];
if( lane >= 8 ) ptr[i+96] += ptr[i+96-8];
if( lane >= 16 ) ptr[i+96] += ptr[i+96-16];
return ptr[i+96];
}
__host__ __device__ float sumWarp4_32(float* ptr, const int tidx = threadIdx.x) {
const unsigned int lane = tidx & 31;
const unsigned int warpid = tidx >> 5; //32 elements per warp
unsigned int i = warpid*32+lane; //first element of block data set this thread looks at
if( lane >= 1 ) ptr[i] += ptr[i-1];
if( lane >= 2 ) ptr[i] += ptr[i-2];
if( lane >= 4 ) ptr[i] += ptr[i-4];
if( lane >= 8 ) ptr[i] += ptr[i-8];
if( lane >= 16 ) ptr[i] += ptr[i-16];
return ptr[i];
}
__device__ float sumBlock4(float* ptr, const int tidx = threadIdx.x, const int bdimx = blockDim.x ) {
const unsigned int lane = tidx & 31;
const unsigned int warpid = tidx >> 5; //32 threads per warp
float val = sumWarp4_128(ptr);
__syncthreads();//should be included
if( tidx==bdimx-1 ) ptr[4*bdimx+warpid] = val;
__syncthreads();
if( warpid==0 ) sumWarp4_32((float*)&ptr[4*bdimx]);
__syncthreads();
if( warpid>0 ) {
ptr[warpid*128+lane] += ptr[4*bdimx+warpid-1];
ptr[warpid*128+lane+32] += ptr[4*bdimx+warpid-1];
ptr[warpid*128+lane+64] += ptr[4*bdimx+warpid-1];
ptr[warpid*128+lane+96] += ptr[4*bdimx+warpid-1];
}
__syncthreads();
return ptr[warpid*128+lane+96];
}
__device__ void scanBlockAnyLength4(float *ptr, float* dev_blockSum, const float* dev_input, float* dev_output, const int idx = threadIdx.x, const int bdimx = blockDim.x, const int bidx = blockIdx.x) {
const unsigned int lane = idx & 31;
const unsigned int warpid = idx >> 5;
ptr[lane+warpid*128] = dev_input[lane+warpid*128+bdimx*bidx*4];
ptr[lane+warpid*128+32] = dev_input[lane+warpid*128+bdimx*bidx*4+32];
ptr[lane+warpid*128+64] = dev_input[lane+warpid*128+bdimx*bidx*4+64];
ptr[lane+warpid*128+96] = dev_input[lane+warpid*128+bdimx*bidx*4+96];
__syncthreads();
float val = sumBlock4(ptr);
__syncthreads();
dev_blockSum[0] = 0.0f;
if( idx==0 ) dev_blockSum[bidx+1] = ptr[bdimx*4-1];
dev_output[lane+warpid*128+bdimx*bidx*4] = ptr[lane+warpid*128];
dev_output[lane+warpid*128+bdimx*bidx*4+32] = ptr[lane+warpid*128+32];
dev_output[lane+warpid*128+bdimx*bidx*4+64] = ptr[lane+warpid*128+64];
dev_output[lane+warpid*128+bdimx*bidx*4+96] = ptr[lane+warpid*128+96];
__syncthreads();
}
__global__ void kernel_sumBlock(float* dev_blockSum, const float* dev_input, float* dev_output ) {
extern __shared__ float ptr[];
scanBlockAnyLength4(ptr,dev_blockSum,dev_input,dev_output);
}
__global__ void kernel_offsetBlocks(float* dev_blockSum, float* dev_arr) {
const int tidx = threadIdx.x;
const int bidx = blockIdx.x;
const int bdimx = blockDim.x;
const int lane = tidx & 31;
const int warpid = tidx >> 5;
if( warpid==0 ) sumWarp4_32(dev_blockSum);
float val = dev_blockSum[warpid];
dev_arr[warpid*128+lane] += val;
dev_arr[warpid*128+lane+32] += val;
dev_arr[warpid*128+lane+64] += val;
dev_arr[warpid*128+lane+96] += val;
}
void scan4( const float input[], float output[]) {
int blocks = 2;
int threadsPerBlock = 64; //multiple of 32
int smemsize = (threadsPerBlock*4+32)*sizeof(float);
float* dev_input, *dev_output;
cudaMalloc((void**)&dev_input,blocks*threadsPerBlock*4*sizeof(float));
cudaMalloc((void**)&dev_output,blocks*threadsPerBlock*4*sizeof(float));
float *dev_blockSum;
cudaMalloc((void**)&dev_blockSum,blocks*sizeof(float));
int offset = 0;
int Nrem = N;
int chunksize;
while( Nrem ) {
chunksize = max(Nrem,blocks*threadsPerBlock*4);
cudaMemcpy(dev_input,(void**)&input[offset],chunksize*sizeof(float),cudaMemcpyHostToDevice);
kernel_sumBlock<<<blocks,threadsPerBlock,smemsize>>>(dev_blockSum,dev_input,dev_output);
kernel_offsetBlocks<<<blocks,threadsPerBlock>>>(dev_blockSum,dev_output);
cudaMemcpy((void**)&output[offset],dev_output,chunksize*sizeof(float),cudaMemcpyDeviceToHost);
offset += chunksize;
Nrem -= chunksize;
}
cudaFree(dev_input);
cudaFree(dev_output);
}
int main() {
float h_vec[N], sol[N];
for( int i = 0; i < N; i++ ) h_vec[i] = (float)i+1.0f;
scan4(h_vec,sol);
cout << "solution:" << endl;
for( int i = 0; i < N; i++ ) cout << i << " " << (i+2)*(i+1)/2 << " " << sol[i] << endl;
return 0;
}
To my eye, the code is throwing errors because the lines in sumWarp4_128 are not executed in order within a warp. I.e, the if( lane==0 ) lines are executing before the other logical blocks that precede it. I thought this was not possible within a warp.
If I __syncthreads() before and after the lane==0 calls, I get some new exotic error that I just can't figure out.
Any help to point out where I've gone wrong would be appreciated

The code you are writing has race conditions due to not synchronizing between threads that are sharing data. While it is true that this can be done on current hardware for communication within a warp (so-called warp-synchronous programming), it is highly discouraged because the race conditions in the code could cause it to fail on possible future hardware.
While it is true that you will get higher performance by processing multiple items per thread, 4 is not a magic number -- you should make this a tunable parameter if possible. CUDPP uses 8 per thread, for example.
I would highly recommend that you use CUB for this. You should use cub::BlockLoad() to load multiple items per thread and cub::BlockScan() to scan them. Then you would just need some code to combine multiple blocks. The most bandwidth-efficient way to do this is to use the "Reduce-Scan-Scan" approach that Thrust uses. First reduce each block (cub::BlockReduce) and store the sum from each block to a blockSums array. Then scan that array to get the per-block offset. Then perform a cub::BlockScan on the blocks and add the previously computed per-block offset to each element.

Related

CUDA loop unrolling

I have some problem in loop unroll in CUDA.
In normal serial code:
//serial basic:
for(int i = 0; i < n; i++){
c[i] = a[i] + b[i];}
//serial loop unroll:
for(int i = 0; i < n/4; i++){
c[i] = a[i] + b[i];
c[i+1] = a[i+1] + b[i+1];
c[i+2] = a[i+2] + b[i+2];
c[i+3] = a[i+3] + b[i+3];}
So I think the CUDA loop unrolling looks like this:
int i = 2*(threadIdx.x + blockIdx.x * gridDim.x);
a[i+0] = b[i+0] + c[i+0];
a[i+1] = b[i+1] + c[i+1];
But in the CUDA hand-book the unrolling example I can't understand
This is a normal GlobalWrite kernel:
__global__ void GlobalWrites( T *out, T value, size_t N )
{
for(size_t i = blockIdx.x*blockDim.x+threadIdx.x;
i < N;
i += blockDim.x*gridDim.x ) {
out[i] = value;
}
}
unrolling kernel:
template<class T, const int n> __global__ void Global_write(T* out, T value, size_t N){
size_t i;
for(i = n*blockDim.x*blockIdx.x + threadIdx.x;
i < N - n*blockDim.x*blockIdx.x;
i += n*gridDim.x*blockDim.x;)
for(int j = 0; j < n; i++){
size_t index = i + j * blockDim.x;
outp[index] = value;
}
for ( int j = 0; j < n; j++ ) {
size_t index = i+j*blockDim.x;
if ( index<N ) out[index] = value;
}}
I know this kernel uses less blocks but may someone explain why it works better (n=4,10% speed up).

If it wasn't obvious, because n is a template parameter, it is constant at compile time. This means that the compiler is free to optimize the constant trip count loop away by unrolling. It is, therefore, instructive to remove the template magic and unroll the loop by hand for the n=4 case you mentioned:
template<class T>
__global__ void Global_write(T* out, T value, size_t N)
{
size_t i;
for(i = 4*blockDim.x*blockIdx.x + threadIdx.x;
i < N - 4*blockDim.x*blockIdx.x;
i += 4*gridDim.x*blockDim.x;) {
out[i + 0 * blockDim.x] = value;
out[i + 1 * blockDim.x] = value;
out[i + 2 * blockDim.x] = value;
out[i + 3 * blockDim.x] = value;
}
if ( i+0*blockDim.x < N ) out[i+0*blockDim.x] = value;
if ( i+1*blockDim.x < N ) out[i+1*blockDim.x] = value;
if ( i+2*blockDim.x < N ) out[i+2*blockDim.x] = value;
if ( i+3*blockDim.x < N ) out[i+3*blockDim.x] = value;
}
The unrolled inner loop yields four completely independent writes which are coalesced. It is this instruction level parallelism which give the code higher instruction throughput and improved performance. I highly recommend Vasily Volkov's Unrolling Parallel Loops from the GTC conference of a few years ago, if you haven't already seen it. His presentation lays out the theoretical background for why this type of loop unrolling is an optimisation in CUDA.

In the templated kernel, const int n is known at compile time, allowing the compiler to actually unroll the for(int j = 0; j < n; i++) loop removing the conditional checks on that loop. If the loop size is not known at compile time, the compiler cannot unroll the loop. Simple as that.

Correct Use of CUDA shared memory for 2D Non-contiguous data access

I have a basic question related to Two-dimensional thread access.
I want to copy the non-contiguous data into contiguous buffer and the use of cuda memcopy can be illustrated as:
void pack_cuda(float *dstbuf, IOV *srciov, int num_iov)
{
int i;
float *ptr;
ptr = buf;
for (i = 0; i < num_iov; i++) {
cudaMemcpy(ptr, srciov[i].bufaddr, srciov[i].len, cudaMemcpyDefault);
ptr = (char *)ptr + srciov[i].len;
}
}
*srciov stores the start memory address and length of each non-contiguous data in an array of structure.
*dstbuf will store the packed contiguous data after the completion of the function.
Now, I want to implement it using CUDA kernels.
__global__ void pack_cuda(float *dstbuf, IOV *srciov, int num_iov)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
int k;
extern __shared__ size_t tmpdbuflen[16*3]; //suppose num_iov is 16
if ( j == 0 ){
if ( i < 16 ){
tmpdbuflen[i] = (srciov[i].len); //store length to calculate presum
tmpdbuflen[i+16] = tmpdbuflen[i]; //store length
tmpdbuflen[i+32] = ((srciov+i)->bufaddr) - (srciov->bufaddr); //store addr difference
}
__syncthreads();
for ( k = 0; k < i; k++)
tmpdbuflen[i] += srciov[k].len;
}
__syncthreads();
if ( i < 16 && j < srciov[i].len ){ //wondering whether this is correct use
dst[tmpdbuflen[i] + j] = *(src + tmpdbuflen[i+32] + j);
}
__syncthreads();
}
Kernel invocation part:
dim3 dimblock(16, 16); //the length of each non-contiguous data is less than 16
dim3 dimgrid(1,1);
const unsigned int shm_size = sizeof(size_t) * 16 * 3;
pack_cuda<<<dimgrid, dimblock, shm_size, 0>>>(dstbuf, srciov, num_iov);
cudaDeviceSynchronize();
However, it seems that I cannot pack all needed datas into dst buffer.
Sometimes only j = 0 and 1 (with corresponding various i) get packed.
I think the major problem is the usage of shared memory. I only use column 0 threads (threadIdx.y == 0) to copy information onto the shared memory. Then all threads (no restriction on threadIdx.y) will access and read information in shared memory.
How to modify the code enable such design?
I'd appreciate it if anyone can figure out my problems.
Thanks.

Some hints on your code:
__global__ void pack_cuda(float *dstbuf, IOV *srciov, int num_iov)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
int k;
extern __shared__ size_t tmpdbuflen[16*3]; //suppose num_iov is 16
This Block here will only be executed by one thread, due to the guard j==0 which only allows thread bid*bdim+tid = 0*0+0, ergo thread 0 in block 0, which is undesirable for you. I would guess you want put j < 16 there
if ( j == 0 ){
if ( i < 16 ){
tmpdbuflen[i] = (srciov[i].len); //store length to calculate presum
tmpdbuflen[i+16] = tmpdbuflen[i]; //store length
tmpdbuflen[i+32] = ((srciov+i)->bufaddr) - (srciov->bufaddr); //store addr difference
}
__syncthreads();
for ( k = 0; k < i; k++)
tmpdbuflen[i] += srciov[k].len;
}
.

What did I miss while converting from CUDA to OpenCL? Or why is my kernel returning different output than serial code

This is my code for multiplication of a sparse matrix in compressed column format
__kernel void mykernel(__global int* colvector,
__global int* val,
__global int* result,
__global int* index,
__global int* rowptr,
__global int* sync )
{
__local int vals[1000];
for(int i=0;i<4;i++)
{
result[i]=0;
}
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_GLOBAL_MEM_FENCE);
const int items_per_row=32;//total threads working in a row
const int thread_id=get_global_id(0)+get_local_id(0);//total threads in the program
const int warpid = thread_id/items_per_row;//warp id is actual row
int lane=thread_id&(items_per_row-1);//thread id within the warp
int row = warpid;
if(row<4)
{
int sum = 0;
int row_start = rowptr[row];
int row_end = rowptr[row+1];
vals[get_global_id(0)]=0;
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_GLOBAL_MEM_FENCE);
for (int i = row_start+lane; i<row_end; i+=items_per_row)
{
vals[get_local_id(0)]+=val[i]*colvector[index[i]];
}
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_GLOBAL_MEM_FENCE);
if (lane < 16 ) vals[get_local_id(0)] += vals[get_local_id(0) + 16];
if (lane < 8 ) vals[get_local_id(0)] += vals[get_local_id(0) + 8];
if (lane < 4 ) vals[get_local_id(0)] += vals[get_local_id(0) +4];
if (lane < 2 ) vals[get_local_id(0)] += vals[get_local_id(0) + 2];
if (lane < 1 ) vals[get_local_id(0)] += vals[get_local_id(0) + 1];
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_GLOBAL_MEM_FENCE);
if(lane==0)
{
result[row] += vals[get_local_id(0)];
}
}
}
the above OpenCL code was converted from the CUDA code given below:
spmv_csr_vector_kernel(const int num_rows,
const int * ptr,
const int * indices,
const float * data,
const float * x,
float * y )
{
__shared__ float vals[];
int thread_id = blockDim.x * blockIdx.x + threadIdx.x; // global thread index
int warp_id = thread_id / 32; // global warp index
int lane = thread_id & (32 - 1); // thread index within the warp
// one warp per row
int row = warp_id;
if (row < num_rows)
{
int row_start = ptr[row];
int row_end = ptr[row+1];
// compute running sum per thread
vals[threadIdx.x] = 0;
for(int jj = row_start + lane; jj < row_end; jj += 32)
{
vals[threadIdx.x] += data[jj] * x[indices[jj]];
}
// parallel reduction in shared memory
if (lane < 16) vals[threadIdx.x] += vals[threadIdx.x + 16];
if (lane < 8) vals[threadIdx.x] += vals[threadIdx.x + 8];
if (lane < 4) vals[threadIdx.x] += vals[threadIdx.x + 4];
if (lane < 2) vals[threadIdx.x] += vals[threadIdx.x + 2];
if (lane < 1) vals[threadIdx.x] += vals[threadIdx.x + 1];
// first thread writes the result
if (lane == 0)
{
y[row] += vals[threadIdx.x];
}
}
}
The CUDA code is correct but my OpenCL kernel is not returning correct output. I have been trying for a week now but no solution. Does anybody know what mistake I am making?

I can at least see one mistake. thread_id is not the same in each code. blockDim.x * blockIdx.x + threadIdx.x in CUDA == get_global_id(0) in OpenCL, not get_global_id(0)+get_local_id(0). Also get_local_id(0) == threadIdx.x

Try using swan, this might help you understand your problem.
you can find an article here about it.

Get statistics for a list of numbers using GPU

I have several lists of numbers on a file . For example,
.333, .324, .123 , .543, .00054
.2243, .333, .53343 , .4434
Now, I want to get the number of times each number occurs using the GPU. I believe this will be faster to do on the GPU than the CPU because each thread can process one list. What data structure should I use on the GPU to easily get the above counts. For example , for the above, the answer will look as follows:
.333 = 2 times in entire file
.324 = 1 time
etc..
I looking for a general solution. Not one that works only on devices with specific compute capability
Just writing kernel suggested by Pavan to see if I have implemented it efficiently:
int uniqueEle = newend.valiter – d_A;
int* count;
cudaMalloc((void**)&count, uniqueEle * sizeof(int)); // stores the count of each unique element
int TPB = 256;
int blocks = uniqueEle + TPB -1 / TPB;
//Cast d_I to raw pointer called d_rawI
launch<<<blocks,TPB>>>(d_rawI,count,uniqueEle);
__global__ void launch(int *i, int* count, int n){
int id = blockDim.x * blockIdx.x + threadIdx.x;
__shared__ int indexes[256];
if(id < n ){
indexes[threadIdx.x] = i[id];
//as occurs between two blocks
if(id % 255 == 0){
count[indexes] = i[id+1] - i[id];
}
}
__syncthreads();
if(id < ele - 1){
if(threadIdx.x < 255)
count[id] = indexes[threadIdx.x+1] – indexes[threadIdx.x];
}
}
Question: how to modify this kernel so that it handles arrays of arbitrary size. I.e , handle the condition when the total number of threads < number of elements

Here is how I would do the code in matlab
A = [333, .324, .123 , .543, .00054 .2243, .333, .53343 , .4434];
[values, locations] = unique(A); % Find unique values and their locations
counts = diff([0, locations]); % Find the count based on their locations
There is no easy way to do this in plain cuda, but you can use existing libraries to do this.
1) Thrust
It is also being shipped with CUDA toolkit from CUDA 4.0.
The matlab code can be roughly translated into thrust by using the following functions. I am not too proficient with thrust, but I am just trying to give you an idea on what routines to look at.
float _A[] = {.333, .324, .123 , .543, .00054 .2243, .333, .53343 , .4434};
int _I[] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
float *A, *I;
// Allocate memory on device and cudaMempCpy values from _A to A and _I to I
int num = 9;
// Values vector
thrust::device_vector<float>d_A(A, A+num);
// Need to sort to get same values together
thrust::stable_sort(d_A, d_A+num);
// Vector containing 0 to num-1
thrust::device_vector<int>d_I(I, I+num);
// Find unique values and elements
thrust::device_vector<float>d_Values(num), d_Locations(num), d_counts(num);
// Find unique elements
thrust::device_vector<float>::iterator valiter;
thrust::device_vector<int>::iterator idxiter;
thrust::pair<valiter, idxiter> new_end;
new_end = thrust::unique_by_key(d_A, d_A+num, d_I, d_Values, d_Locations);
You now have the locations of the first instance of each unique value. You can now launch a kernel to find the differences between adjacent elements from 0 to new_end in d_Locations. Subtract the final value from num to get the count for final location.
EDIT (Adding code that was provided over chat)
Here is how the difference code needs to be done
#define MAX_BLOCKS 65535
#define roundup(A, B) = (((A) + (B) - 1) / (B))
int uniqueEle = newend.valiter – d_A;
int* count;
cudaMalloc((void**)&count, uniqueEle * sizeof(int));
int TPB = 256;
int num_blocks = roundup(uniqueEle, TPB);
int blocks_y = roundup(num_blocks, MAX_BLOCKS);
int blocks_x = roundup(num_blocks, blocks_y);
dim3 blocks(blocks_x, blocks_y);
kernel<<<blocks,TPB>>>(d_rawI, count, uniqueEle);
__global__ void kernel(float *i, int* count, int n)
{
int tx = threadIdx.x;
int bid = blockIdx.y * gridDim.x + blockIdx.x;
int id = blockDim.x * bid + tx;
__shared__ int indexes[256];
if (id < n) indexes[tx] = i[id];
__syncthreads();
if (id < n - 1) {
if (tx < 255) count[id] = indexes[tx + 1] - indexes[tx];
else count[id] = i[id + 1] - indexes[tx];
}
if (id == n - 1) count[id] = n - indexes[tx];
return;
}
2) ArrayFire
This is an easy to use, free array based library.
You can do the following in ArrayFire.
using namespace af;
float h_A[] = {.333, .324, .123 , .543, .00054 .2243, .333, .53343 , .4434};
int num = 9;
// Transfer data to device
array A(9, 1, h_A);
array values, locations, original;
// Find the unique values and locations
setunique(values, locations, original, A);
// Locations are 0 based, add 1.
// Add *num* at the end to find count of last value.
array counts = diff1(join(locations + 1, num));
Disclosure: I work for AccelerEyes, that develops this software.

To answer the latest addenum to this question - the diff kernel which would complete the thrust method proposed by Pavan could look something like this:
template<int blcksz>
__global__ void diffkernel(const int *i, int* count, const int n) {
int id = blockDim.x * blockIdx.x + threadIdx.x;
int strd = blockDim.x * gridDim.x;
int nmax = blcksz * ((n/blcksz) + ((n%blcksz>0) ? 1 : 0));
__shared__ int indices[blcksz+1];
for(; id<nmax; id+=strd) {
// Data load
indices[threadIdx.x] = (id < n) ? i[id] : n;
if (threadIdx.x == (blcksz-1))
indices[blcksz] = ((id+1) < n) ? i[id+1] : n;
__syncthreads();
// Differencing calculation
int diff = indices[threadIdx.x+1] - indices[threadIdx.x];
// Store
if (id < n) count[id] = diff;
__syncthreads();
}
}

here is a solution:
__global__ void counter(float* a, int* b, int N)
{
int idx = blockIdx.x*blockDim.x+threadIdx.x;
if(idx < N)
{
float my = a[idx];
int count = 0;
for(int i=0; i < N; i++)
{
if(my == a[i])
count++;
}
b[idx]=count;
}
}
int main()
{
int threads = 9;
int blocks = 1;
int N = blocks*threads;
float* h_a;
int* h_b;
float* d_a;
int* d_b;
h_a = (float*)malloc(N*sizeof(float));
h_b = (int*)malloc(N*sizeof(int));
cudaMalloc((void**)&d_a,N*sizeof(float));
cudaMalloc((void**)&d_b,N*sizeof(int));
h_a[0]= .333f;
h_a[1]= .324f;
h_a[2]= .123f;
h_a[3]= .543f;
h_a[4]= .00054f;
h_a[5]= .2243f;
h_a[6]= .333f;
h_a[7]= .53343f;
h_a[8]= .4434f;
cudaMemcpy(d_a,h_a,N*sizeof(float),cudaMemcpyHostToDevice);
counter<<<blocks,threads>>>(d_a,d_b,N);
cudaMemcpy(h_b,d_b,N*sizeof(int),cudaMemcpyDeviceToHost);
for(int i=0; i < N; i++)
{
printf("%f = %d times\n",h_a[i],h_b[i]);
}
cudaFree(d_a);
cudaFree(d_b);
free(h_a);
free(h_b);
getchar();
return 0;
}

CUDA kernel - nested for loop

Hello
I'm trying to write a CUDA kernel to perform the following piece of code.
for (n = 0; n < (total-1); n++)
{
a = values[n];
for ( i = n+1; i < total ; i++)
{
b = values[i] - a;
c = b*b;
if( c < 10)
newvalues[i] = c;
}
}
This is what I have currently, but it does not seem to be giving the correct results? does anyone know what I'm doing wrong. Cheers
__global__ void calc(int total, float *values, float *newvalues){
float a,b,c;
int idx = blockIdx.x * blockDim.x + threadIdx.x;
for (int n = idx; n < (total-1); n += blockDim.x*gridDim.x){
a = values[n];
for(int i = n+1; i < total; i++){
b = values[i] - a;
c = b*b;
if( c < 10)
newvalues[i] = c;
}
}

Realize this problem in 2D and launch your kernel with 2D thread blocks. The total number of threads in x and y dimension will be equal to total . The kernel code should look like this:
__global__ void calc(float *values, float *newvalues, int total){
float a,b,c;
int n= blockIdx.y * blockDim.y + threadIdx.y;
int i= blockIdx.x * blockDim.x + threadIdx.x;
if (n>=total || i>=total)
return;
a = values[n];
b = values[i] - a;
c = b*b;
if( c < 10)
newvalues[i] = c;
// I don't know your problem statement but i think it should be like: newvalues[n*total+i] = c;
}
Update:
This is how you should call the kernel
dim3 block(16,16);
dim3 grid ( (total+15)/16, (total+15)/16 );
calc<<<grid,block>>>(float *val, float *newval, int T);
Also make sure you add this line in kernel (see updated kernel)
if (n>=total || i>=total)
return;
Update 2:
fixed blockIdy.y, correct is blockIdx.y

I'll probably be way wrong but the n < (total-1) check in
for (int n = idx; n < (total-1); n += blockDim.x*gridDim.x)
seems different than the original version.

Why don't you just remove the outter loop and start the kernel with as many threads as you need for this loop? It's a bit weird to have a loop that depends on your blockId. Normally you try to avoid these loops.
Secondly it seems to me that newvalues[i] can be overriden by different threads.

We Keep Coding

html mysql json google-apps-script actionscript-3 ms-access google-chrome google-maps reporting-services sql-server-2008

Parallel prefix sum with multiple elements per thread without using thrust - cuda

Related

CUDA loop unrolling

Correct Use of CUDA shared memory for 2D Non-contiguous data access

What did I miss while converting from CUDA to OpenCL? Or why is my kernel returning different output than serial code

Get statistics for a list of numbers using GPU

CUDA kernel - nested for loop

Categories

Resources