Rearranging an array in CUDA

I have the following problem that I want to implement on CUDA:
I want to read an array (say "flag[20]") and, based on a certain condition, write the indices of this array to another array (say "pindex[]").
A simple implementation in C would be:
int N = 20;
int flag[N];
int pindex[N];

for(int i = 0; i < N; i++)
    flag[i] = -1;
for(int i = 0; i < N; i += 2)
    flag[i] = 0;
for(int i = 0; i < N; i++)
    pindex[i] = 0;

// operation: count # of times flag != -1 and write those indices in a different array
int pcount1 = 0;
for(int i = 0; i < N; i++)
{
    if(flag[i] != -1)
    {
        pindex[pcount1] = i;
        ++pcount1;
    }
}
How would I implement this in CUDA?
I can use atomicAdd() to calculate the total number of times my condition is satisfied, but how do I write the indices into a different array? For example, I tried the following:
__global__ void kernel_tryatomic(int N, int* pcount, int* flag, int* pindex)
{
    int tId = threadIdx.x;
    int n = (blockIdx.x*2 + blockIdx.y)*BlockSize + tId;
    if(n > N-1) return;

    if(flag[n] != -1)
    {
        atomicAdd(pcount, 1);
        // *pcount can be incremented again by other threads before the
        // next line reads it, so the index written below is unreliable
        atomicExch(&pindex[*pcount], n);
        //pindex[*pcount] = n;
    }
}
This code calculates "pcount" correctly, but it does not update the "pindex" array.
I need help to do this operation on GPUs.
Thanks

Since your condition (flag) is conceptually binary, you can use a binary prefix sum (thoroughly explained here) to determine the position at which each thread with a positive flag should write.
For example, if N is 20, then with the help of the __device__ functions below:
__device__ int lanemask_lt(int lane) {
    return (1 << lane) - 1;    // bits set for all lanes below this one
}

__device__ int warp_prefix_sum(int lane, int p) {
    const int mask = lanemask_lt(lane);
    int b = __ballot(p);       // one bit per lane whose p is nonzero
    return __popc(b & mask);   // number of earlier lanes with p set
}
your __global__ function can simply be written as below:
__global__ void kernel_scan(int N, int* pcount, int* flag, int* pindex)
{
    int tId = threadIdx.x;
    if(tId >= N)
        return;

    int threadFlag = (flag[tId] == -1) ? 0 : 1;
    int position_to_write = warp_prefix_sum(tId & (warpSize-1), threadFlag);

    if(threadFlag)
        pindex[position_to_write] = tId;
}
If N is bigger than the warp size (32), you can use the intra-block binary prefix sum that is explained in the provided link.
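As a side note, the smallest fix to the atomic version in the question is to use the value returned by atomicAdd() as the write position, since it returns the counter's old value: int idx = atomicAdd(pcount, 1); pindex[idx] = n; (the output order is then unspecified). For arbitrary N across many blocks, a common pattern combines the per-warp prefix sum with a single atomicAdd() per warp. Below is a minimal sketch of that idea (kernel_compact and warpBase are names I am introducing here; the output is only ordered within a warp):

__global__ void kernel_compact(int N, int* pcount, int* flag, int* pindex)
{
    __shared__ int warpBase[32];                  // one base output index per warp
    int n    = blockIdx.x*blockDim.x + threadIdx.x;
    int lane = threadIdx.x & (warpSize-1);
    int warp = threadIdx.x / warpSize;

    int p = (n < N && flag[n] != -1) ? 1 : 0;     // out-of-range threads still vote 0
    int b = __ballot(p);                          // bitmask of set flags in this warp
    int prefix = __popc(b & ((1 << lane) - 1));   // set flags in lanes before mine

    if(lane == 0)                                 // warp leader reserves an output range
        warpBase[warp] = atomicAdd(pcount, __popc(b));
    __syncthreads();

    if(p)
        pindex[warpBase[warp] + prefix] = n;
}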


error: invalid operands to binary % (have 'int' and 'int *')

I get this error at line 50. I don't know what to use instead of (*p).
I am learning how to use pointers and trying to use them in a function, passing arguments by reference.
I've been staring at it for some time now.
#include "stdio.h"

int odd(int (*), int);

int main(){
    int i, n;
    int size;
    int v[i];
    int *p;
    p = &v[0];
    printf("Write the quantity of integers you want to ingress");
    scanf("%d", &size);
    for(i = 0; i < size; i++){
        printf("write a number");
        scanf("%d", &n);
        v[i] = n;
        p = &v[i];
        odd(&v[i], size);
        printf("The value number %d is: %d \n", i, *p);
    }
    return 0;
}

int odd(int *p, int siz){
    int i;
    int counter = 0;
    for(i = 0; i < siz; i++){
/*50*/  if(*p % 2 = 0){ }
        else counter++;
        return counter;
    }
}
You are confusing assignment (=) with testing for equality (==). Change:
if(*p % 2 = 0)
to:
if(*p % 2 == 0)
Also your prototype for odd is wrong - change:
int odd (int (*), int );
to:
int odd (int *, int );
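For completeness, a possible cleaned-up version of odd (my sketch, not part of the answer above) that walks the array through the pointer and only returns after the loop finishes:

int odd(int *p, int siz){
    int counter = 0;
    for(int i = 0; i < siz; i++){
        if(p[i] % 2 != 0)   /* == and != compare; a single = assigns */
            counter++;
    }
    return counter;
}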

CUDA __syncthreads() not working: inverted breakpoint hit order

I have a problem, I think with __syncthreads(), in the following code:
__device__ void prefixSumJoin(const bool *g_idata, int *g_odata, int n)
{
    __shared__ int temp[Config::bfr*Config::bfr]; // allocated on invocation
    int thid = threadIdx.y*blockDim.x + threadIdx.x;
    if(thid < (n>>1))
    {
        int offset = 1;
        temp[2*thid] = (g_idata[2*thid] ? 1 : 0); // load input into shared memory
        temp[2*thid+1] = (g_idata[2*thid+1] ? 1 : 0);
        for(int d = n>>1; d > 0; d >>= 1) // build sum in place up the tree
        {
            __syncthreads();
            if(thid < d)
            {
                int ai = offset*(2*thid+1)-1; // <-- breakpoint B
                int bi = offset*(2*thid+2)-1;
                temp[bi] += temp[ai];
            }
            offset *= 2;
        }
        if(thid == 0) { temp[n - 1] = 0; } // clear the last element
        for(int d = 1; d < n; d *= 2) // traverse down tree & build scan
        {
            offset >>= 1;
            __syncthreads();
            if(thid < d)
            {
                int ai = offset*(2*thid+1)-1;
                int bi = offset*(2*thid+2)-1;
                int t = temp[ai];
                temp[ai] = temp[bi];
                temp[bi] += t;
            }
        }
        __syncthreads();
        g_odata[2*thid] = temp[2*thid]; // write results to device memory
        g_odata[2*thid+1] = temp[2*thid+1];
    }
}
__global__ void selectKernel3(...)
{
    int tidx = threadIdx.x;
    int tidy = threadIdx.y;
    int bidx = blockIdx.x;
    int bidy = blockIdx.y;
    int tid = tidy*blockDim.x + tidx;
    int bid = bidy*gridDim.x + bidx;
    int noOfRows1 = ...;
    int noOfRows2 = ...;
    __shared__ bool isRecordSelected[Config::bfr*Config::bfr];
    __shared__ int selectedRecordsOffset[Config::bfr*Config::bfr];

    isRecordSelected[tid] = false;
    selectedRecordsOffset[tid] = 0;
    __syncthreads();

    if(tidx < noOfRows1 && tidy < noOfRows2)
        if(... == ...)
            isRecordSelected[tid] = true;
    __syncthreads();

    prefixSumJoin(isRecordSelected, selectedRecordsOffset, Config::bfr*Config::bfr); // <-- breakpoint A
    __syncthreads();

    if(isRecordSelected[tid] == true)
    {
        some_instruction; // <-- breakpoint C
        ...
    }
}
...
f(){
    dim3 dimGrid(13, 5);
    dim3 dimBlock(Config::bfr, Config::bfr);
    selectKernel3<<<dimGrid, dimBlock>>>(...);
}

//other file
class Config
{
public:
    static const int bfr = 16; // blocking factor = number of rows per block
public:
    Config(void);
    ~Config(void);
};
The prefixSum is from GPU Gems 3: Parallel Prefix Sum (Scan) with CUDA, with small changes.
OK, now I set 3 breakpoints: A, B, C. They should be hit in the order A, B, C. The problem is that they are hit in the order A, B*x, C, B: after A, breakpoint B is hit a few times (but not for all threads), then C is hit and execution continues, and then B is hit again for the rest of the loop. So at point C, selectedRecordsOffset is not ready, and that causes errors. The x varies with the input (for some inputs there is no inversion in the breakpoint order and C is the last one hit).
Moreover, if I look at the thread numbers that cause a hit, it is threadIdx.y = 0 for A and C but threadIdx.y = 10 for B. How is this possible when it is the same block; why do some threads skip the sync? There is no conditional sync.
Does anyone have any ideas on where to look for the bug?
If you need more clarification, just ask.
Thanks in advance for any advice on how to work this out.
Adam
Thou shalt not use __syncthreads() in conditional code if the condition does not evaluate uniformly across all threads of each block.
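In this code the offending condition is the if(thid < (n>>1)) that wraps the whole body of prefixSumJoin: threads with thid >= n/2 never reach the __syncthreads() calls inside it, so the barrier no longer covers the whole block and the scan results are read before they are ready. A minimal sketch of the usual fix (my restructuring, not tested against the original code) hoists the guard so that every thread of the block executes every barrier:

__device__ void prefixSumJoin(const bool *g_idata, int *g_odata, int n)
{
    __shared__ int temp[Config::bfr*Config::bfr];
    int thid = threadIdx.y*blockDim.x + threadIdx.x;
    bool active = (thid < (n>>1));  // remember the predicate instead of skipping the body

    if(active)
    {
        temp[2*thid] = (g_idata[2*thid] ? 1 : 0);
        temp[2*thid+1] = (g_idata[2*thid+1] ? 1 : 0);
    }
    int offset = 1;
    for(int d = n>>1; d > 0; d >>= 1)
    {
        __syncthreads();            // now reached by every thread of the block
        if(thid < d)
        {
            int ai = offset*(2*thid+1)-1;
            int bi = offset*(2*thid+2)-1;
            temp[bi] += temp[ai];
        }
        offset *= 2;
    }
    if(thid == 0) { temp[n - 1] = 0; }
    for(int d = 1; d < n; d *= 2)
    {
        offset >>= 1;
        __syncthreads();
        if(thid < d)
        {
            int ai = offset*(2*thid+1)-1;
            int bi = offset*(2*thid+2)-1;
            int t = temp[ai];
            temp[ai] = temp[bi];
            temp[bi] += t;
        }
    }
    __syncthreads();
    if(active)
    {
        g_odata[2*thid] = temp[2*thid];
        g_odata[2*thid+1] = temp[2*thid+1];
    }
}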

CUDA C/C++: Calculate the average of inverse of distance per point (interaction energy, perhaps?)

I've been trying to write a kernel that calculates the sum of the inverse of the distance between N given points, divided by N. A serial version in C would be:
average = 0;
for(int i = 0; i < Np; i++){
    for(int j = i + 1; j < Np; j++){
        average += 1.0e0f/sqrtf((rx[i]-rx[j])*(rx[i]-rx[j]) + (ry[i]-ry[j])*(ry[i]-ry[j]));
    }
}
average = average/(float)Np;
Where rx and ry are the x and y coordinates, respectively.
I generate the points via a kernel that uses a random number generator. For the kernel, I used 128 (256) threads per block for 4k (8k) points. In it, every thread performs the inner loop above; the results are then passed to a reduce-sum function, as follows:
Generate points:
__global__ void InitRNG(curandState *state, const int seed){
    int tIdx = blockIdx.x*blockDim.x + threadIdx.x;
    curand_init(seed, tIdx, 0, &state[tIdx]);
}

__global__
void SortPoints(float* X, float* Y, const int N, curandState *state){
    float rdmn1, rdmn2;
    unsigned int tIdx = blockIdx.x*blockDim.x + threadIdx.x;
    float range;
    if(tIdx < N){
        rdmn1 = curand_uniform(&state[tIdx]);
        rdmn2 = curand_uniform(&state[tIdx]);
        range = sqrtf(0.25e0f*N*rdmn1);
        X[tIdx] = range*cosf(2.0e0f*pi*rdmn2);
        Y[tIdx] = range*sinf(2.0e0f*pi*rdmn2);
    }
}
Reduction:
__device__
float ReduceSum2(float In){
    __shared__ float data[BlockSize];
    unsigned int tIdx = threadIdx.x;
    data[tIdx] = In;
    __syncthreads();
    for(unsigned int i = blockDim.x/2; i > 0; i >>= 1){
        if(tIdx < i){
            data[tIdx] += data[tIdx + i];
        }
        __syncthreads();
    }
    return data[0];
}
Kernel:
__global__
void AvgDistance(float *X, float *Y, float *Avg, const int N){
    int tIdx = blockIdx.x*blockDim.x + threadIdx.x;
    int bIdx = blockIdx.x;
    float x, y;
    float d = 0.0f;
    if(tIdx < N){
        for(int i = tIdx + 1; i < N; i++){
            x = X[tIdx] - X[i];
            y = Y[tIdx] - Y[i];
            d += 1.0e0f/(sqrtf(x*x + y*y));
        }
        __syncthreads();
        Avg[bIdx] = ReduceSum2(d);
    }
}
The kernel is configured and launched as follows:
dim3 threads(BlockSize,BlockSize);
dim3 blocks(ceil(Np/threads.x),ceil(Np/threads.y));
InitRNG<<<blocks.x,threads.x>>>(d_state,seed);
SortPoints<<<blocks.x,threads.x>>>(d_rx,d_ry,Np,d_state);
AvgDistance<<<blocks.x,threads.x,threads.x*sizeof(float)>>>(d_rx,d_ry,d_Avg,Np);
Finally, I copy the data back to host and then perform the remaining sum:
Avg = new float[blocks.x];
CHECK(cudaMemcpy(Avg,d_Avg,blocks.x*sizeof(float),cudaMemcpyDeviceToHost),ERROR_CPY_DEVTOH);
float average = 0;
for(int i = 0; i < blocks.x; i++){
    average += Avg[i];
}
average = average/(float)Np;
For 4k points the results are OK:
Average distance between points (via Kernel) = 108.615
Average distance between points (via CPU) = 110.191
In this case the sum may simply be performed in a different order, causing the two results to diverge slightly from each other; I don't know...
But when it comes to 8k, the results are quite different:
Average distance between points (via Kernel) = 153.63
Average distance between points (via CPU) = 131.471
To me it seems that the kernel and the serial code are written the same way, which leads me to distrust the precision of CUDA's floating-point calculations. Does this make sense? Or are the accesses to global memory causing conflicts when several threads load the same data from X and Y at the same time? Or is the way I wrote the kernel somehow 'wrong' (I mean, am I doing something that causes the two results to diverge from each other)?
Actually, from what I can tell, the problem seems to be on the CPU side. I created a sample code based on yours and was able to reproduce your results.
First I switched all instances of sinf, cosf, and sqrtf to their corresponding double versions. This made no difference in the results.
Next I included a typedef so I could easily switch the precision from float to double and back, replacing every relevant instance of float in the code with mytype which is my typedef.
When I run the code with typedef of float and a data size of 4096 I get these results:
GPU average = 108.294922
CPU average = 109.925285
When I run the code with typedef of double and a data size of 4096 I get these results:
GPU average = 108.294903
CPU average = 108.294903
When I run the code with typedef of float and a data size of 8192 I get these results:
GPU average = 153.447327
CPU average = 131.473526
When I run the code with typedef of double and a data size of 8192 I get these results:
GPU average = 153.447380
CPU average = 153.447380
There are at least two observations:
1. The GPU results don't vary between float and double, except in the 5th decimal place.
2. The CPU results vary by 1-20% or so between float and double, but when double is selected, they line up exactly (to the 6th decimal place, anyway) with the GPU results.
Based on this, I believe the CPU is providing the variable, questionable behavior.
Here's my code for reference:
#include <stdio.h>
#include <curand.h>
#include <curand_kernel.h>

#define DSIZE 8192
#define BlockSize 32
#define pi 3.14159f

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

typedef double mytype;

__global__ void InitRNG(curandState *state, const int seed){
    int tIdx = blockIdx.x*blockDim.x + threadIdx.x;
    curand_init(seed, tIdx, 0, &state[tIdx]);
}

__global__
void SortPoints(mytype* X, mytype* Y, const int N, curandState *state){
    mytype rdmn1, rdmn2;
    unsigned int tIdx = blockIdx.x*blockDim.x + threadIdx.x;
    mytype range;
    if(tIdx < N){
        rdmn1 = curand_uniform(&state[tIdx]);
        rdmn2 = curand_uniform(&state[tIdx]);
        range = sqrt(0.25e0f*N*rdmn1);
        X[tIdx] = range*cos(2.0e0f*pi*rdmn2);
        Y[tIdx] = range*sin(2.0e0f*pi*rdmn2);
    }
}

__device__
mytype ReduceSum2(mytype In){
    __shared__ mytype data[BlockSize];
    unsigned int tIdx = threadIdx.x;
    data[tIdx] = In;
    __syncthreads();
    for(unsigned int i = blockDim.x/2; i > 0; i >>= 1){
        if(tIdx < i){
            data[tIdx] += data[tIdx + i];
        }
        __syncthreads();
    }
    return data[0];
}

__global__
void AvgDistance(mytype *X, mytype *Y, mytype *Avg, const int N){
    int tIdx = blockIdx.x*blockDim.x + threadIdx.x;
    int bIdx = blockIdx.x;
    mytype x, y;
    mytype d = 0.0f;
    if(tIdx < N){
        for(int i = tIdx + 1; i < N; i++){
            x = X[tIdx] - X[i];
            y = Y[tIdx] - Y[i];
            d += 1.0e0f/(sqrt(x*x + y*y));
        }
        __syncthreads();
        Avg[bIdx] = ReduceSum2(d);
    }
}

mytype cpu_avg(const mytype *rx, const mytype *ry, const int size){
    mytype average = 0.0f;
    for(int i = 0; i < size; i++){
        for(int j = i + 1; j < size; j++){
            average += 1.0e0f/sqrt((rx[i]-rx[j])*(rx[i]-rx[j]) + (ry[i]-ry[j])*(ry[i]-ry[j]));
        }
    }
    average = average/(mytype)size;
    return average;
}

int main() {
    int Np = DSIZE;
    mytype *rx, *ry, *d_rx, *d_ry, *d_Avg, *Avg;
    curandState *d_state;
    int seed = 1;

    dim3 threads(BlockSize, BlockSize);
    dim3 blocks((int)ceilf(Np/(float)threads.x), (int)ceilf(Np/(float)threads.y));
    printf("number of blocks = %d\n", blocks.x);
    printf("number of threads= %d\n", threads.x);

    rx = (mytype *)malloc(DSIZE*sizeof(mytype));
    if (rx == 0) {printf("malloc fail\n"); return 1;}
    ry = (mytype *)malloc(DSIZE*sizeof(mytype));
    if (ry == 0) {printf("malloc fail\n"); return 1;}
    cudaMalloc((void**)&d_rx, DSIZE * sizeof(mytype));
    cudaMalloc((void**)&d_ry, DSIZE * sizeof(mytype));
    cudaMalloc((void**)&d_Avg, blocks.x * sizeof(mytype));
    cudaMalloc((void**)&d_state, DSIZE * sizeof(curandState));
    cudaCheckErrors("cudamalloc");

    InitRNG<<<blocks.x, threads.x>>>(d_state, seed);
    SortPoints<<<blocks.x, threads.x>>>(d_rx, d_ry, Np, d_state);
    AvgDistance<<<blocks.x, threads.x, threads.x*sizeof(mytype)>>>(d_rx, d_ry, d_Avg, Np);
    cudaCheckErrors("kernels");

    Avg = new mytype[blocks.x];
    cudaMemcpy(Avg, d_Avg, blocks.x*sizeof(mytype), cudaMemcpyDeviceToHost);
    cudaMemcpy(rx, d_rx, DSIZE*sizeof(mytype), cudaMemcpyDeviceToHost);
    cudaMemcpy(ry, d_ry, DSIZE*sizeof(mytype), cudaMemcpyDeviceToHost);
    cudaCheckErrors("cudamemcpy");

    mytype average = 0;
    for(int i = 0; i < blocks.x; i++){
        average += Avg[i];
    }
    average = average/(mytype)Np;
    printf("GPU average = %f\n", average);
    average = cpu_avg(rx, ry, DSIZE);
    printf("CPU average = %f\n", average);
    return 0;
}
I am running on RHEL 5.5, CUDA 5.0, and an Intel Xeon X5560, compiled with:
nvcc -O3 -arch=sm_20 -lcurand -lm -o t93 t93.cu
EDIT:
After observing that the variability was on the CPU side, I found that I could eliminate most of it by modifying your CPU averaging code like this:
mytype cpu_avg(const mytype *rx, const mytype *ry, const int size){
    mytype average = 0.0f;
    mytype temp = 0.0f;
    for(int i = 0; i < size; i++){
        for(int j = i + 1; j < size; j++){
            temp += 1.0e0f/sqrt((rx[i]-rx[j])*(rx[i]-rx[j]) + (ry[i]-ry[j])*(ry[i]-ry[j]));
        }
        average += temp/(mytype)size;
        temp = 0.0f;
    }
    return average;
}
So I would say there is a problem with intermediate results on the CPU side. It's interesting that it doesn't show up in the GPU result. I suspect the reason is that the final summation of GPU averages is done on the CPU (therefore each individual GPU block result is scaled down by the size, e.g. 8192), and these may have enough intermediate precision to survive until the final division. If you inlined the CPU average calculation, you might observe something different again.
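If you want to keep the CPU reference in single precision, a standard way to shrink this kind of accumulation error further is compensated (Kahan) summation. A minimal sketch of the inner accumulation (my addition, not part of the code above):

/* Kahan-compensated accumulation of the 1/r terms in float precision;
   'c' carries the running rounding error of 'sum'. */
float sum = 0.0f, c = 0.0f;
for(int i = 0; i < size; i++){
    for(int j = i + 1; j < size; j++){
        float dx = rx[i]-rx[j], dy = ry[i]-ry[j];
        float term = 1.0f/sqrtf(dx*dx + dy*dy);
        float y = term - c;      // subtract the error carried from the last step
        float t = sum + y;       // low-order bits of y may be lost here...
        c = (t - sum) - y;       // ...but are recovered into c
        sum = t;
    }
}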

CUDA: how to read 4 (or 16) chars in one transaction per thread using textures and char4 (or int4)?

I have a large character array in device global memory that is accessed in a coalesced manner by threads. I've read somewhere that I could speed up memory access by reading 4 or 16 chars in one memory transaction per thread. I believe I would have to use textures and the char4 or int4 structs. However, I can't find any documentation or examples on this. Could anyone here please provide a simple example or pointers to where I can learn more about this?
In my code I define the char array as
char *database = NULL;
cudaMalloc( (void**) &database, SIZE * sizeof(char) );
What would the definition be if I want to use textures and char4 (or int4)?
Thanks very much.
I finally figured out the answer to my own question. The definition with char4 would be:
char4 *database = NULL;
cudaMalloc( (void**) &database, SIZE * sizeof(char4)/4 );
Textures are not needed for this. The kernel speeds up by a factor of three with char4, though the gain drops to two if I do loop unrolling. For the sake of completeness, my kernel is:
__global__ void kernel(unsigned int jobs_todo, char* database, float* results) {
    unsigned int id = threadIdx.x + blockIdx.x * blockDim.x;
    float A = 0; int i; char ch;
    if(id < jobs_todo) {
        for(i = 0; i < 1000; i += 1){
            ch = database[jobs_todo*i + id];
            if(ch == 'A') A++;
        }
        results[id] = A;
    }
}
And with char4 it is
__global__ void kernel4(unsigned int jobs_todo, char4* database, float* results) {
    unsigned int id = threadIdx.x + blockIdx.x * blockDim.x;
    float A = 0; int i; char4 ch4;
    if(id < jobs_todo) {
        for(i = 0; i < 1000/4; i += 1){
            ch4 = database[jobs_todo*i + id];
            if(ch4.x == 'A') A++;
            if(ch4.y == 'A') A++;
            if(ch4.z == 'A') A++;
            if(ch4.w == 'A') A++;
        }
        results[id] = A;
    }
}
I also tried int4 but it's just .0002 seconds faster than the char4 time.
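For reference, a hypothetical int4 variant of the same kernel (my sketch; kernel16 is a name I am introducing), reading 16 chars per 16-byte transaction:

__global__ void kernel16(unsigned int jobs_todo, int4* database, float* results) {
    unsigned int id = threadIdx.x + blockIdx.x * blockDim.x;
    float A = 0;
    if(id < jobs_todo) {
        for(int i = 0; i < 1000/16; i += 1){
            int4 v = database[jobs_todo*i + id];   // one 16-byte load
            const char *c = (const char *)&v;      // view the 16 bytes as chars
            for(int k = 0; k < 16; k++)
                if(c[k] == 'A') A++;
        }
        results[id] = A;
    }
}

Note that vectorized loads require the address to be aligned to the vector size; memory returned by cudaMalloc is sufficiently aligned, but any offset into the buffer must preserve that alignment.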

Get statistics for a list of numbers using GPU

I have several lists of numbers in a file. For example:
.333, .324, .123 , .543, .00054
.2243, .333, .53343 , .4434
Now, I want to get the number of times each number occurs, using the GPU. I believe this will be faster on the GPU than on the CPU because each thread can process one list. What data structure should I use on the GPU to easily get these counts? For example, for the above, the answer will look as follows:
.333 = 2 times in entire file
.324 = 1 time
etc..
I am looking for a general solution, not one that works only on devices with a specific compute capability.
Just writing the kernel suggested by Pavan to see if I have implemented it efficiently:
int uniqueEle = new_end.first - d_Values.begin();
int* count;
cudaMalloc((void**)&count, uniqueEle * sizeof(int)); // stores the count of each unique element
int TPB = 256;
int blocks = (uniqueEle + TPB - 1) / TPB;
// Cast d_Locations to a raw pointer called d_rawI
launch<<<blocks,TPB>>>(d_rawI, count, uniqueEle);
__global__ void launch(int *i, int* count, int n){
    int id = blockDim.x * blockIdx.x + threadIdx.x;
    __shared__ int indexes[256];
    if(id < n){
        indexes[threadIdx.x] = i[id];
        //as occurs between two blocks
        if(id % 255 == 0){
            count[indexes] = i[id+1] - i[id];
        }
    }
    __syncthreads();
    if(id < ele - 1){
        if(threadIdx.x < 255)
            count[id] = indexes[threadIdx.x+1] - indexes[threadIdx.x];
    }
}
Question: how do I modify this kernel so that it handles arrays of arbitrary size, i.e., the case where the total number of threads is smaller than the number of elements?
Here is how I would do the code in MATLAB:
A = [.333, .324, .123, .543, .00054, .2243, .333, .53343, .4434];
[values, locations] = unique(A); % Find unique values and their locations
counts = diff([0, locations]); % Find the count based on their locations
There is no easy way to do this in plain CUDA, but you can use existing libraries to do this.
1) Thrust
It ships with the CUDA toolkit as of CUDA 4.0.
The MATLAB code can be roughly translated into Thrust using the following functions. I am not too proficient with Thrust; I am just trying to give you an idea of what routines to look at.
float _A[] = {.333, .324, .123, .543, .00054, .2243, .333, .53343, .4434};
int _I[] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
float *A; int *I;
// Allocate memory on device and cudaMemcpy values from _A to A and _I to I
int num = 9;

// Values vector
thrust::device_vector<float> d_A(A, A + num);
// Need to sort to get same values together
thrust::stable_sort(d_A.begin(), d_A.end());
// Vector containing 0 to num-1
thrust::device_vector<int> d_I(I, I + num);
// Outputs: unique values, the locations of their first instances, and counts
thrust::device_vector<float> d_Values(num);
thrust::device_vector<int> d_Locations(num), d_counts(num);
// Find unique elements
typedef thrust::device_vector<float>::iterator valiter;
typedef thrust::device_vector<int>::iterator idxiter;
thrust::pair<valiter, idxiter> new_end;
new_end = thrust::unique_by_key_copy(d_A.begin(), d_A.end(), d_I.begin(),
                                     d_Values.begin(), d_Locations.begin());
You now have the locations of the first instance of each unique value. You can now launch a kernel to find the differences between adjacent elements from 0 to new_end in d_Locations. Subtract the final location from num to get the count for the last value.
EDIT (Adding code that was provided over chat)
Here is how the difference code needs to be done
#define MAX_BLOCKS 65535
#define roundup(A, B) (((A) + (B) - 1) / (B))

int uniqueEle = new_end.first - d_Values.begin();
int* count;
cudaMalloc((void**)&count, uniqueEle * sizeof(int));

int TPB = 256;
int num_blocks = roundup(uniqueEle, TPB);
int blocks_y = roundup(num_blocks, MAX_BLOCKS);
int blocks_x = roundup(num_blocks, blocks_y);
dim3 blocks(blocks_x, blocks_y);
kernel<<<blocks,TPB>>>(d_rawI, count, uniqueEle);
__global__ void kernel(int *i, int* count, int n)
{
    int tx = threadIdx.x;
    int bid = blockIdx.y * gridDim.x + blockIdx.x;
    int id = blockDim.x * bid + tx;
    __shared__ int indexes[256];

    if (id < n) indexes[tx] = i[id];
    __syncthreads();

    if (id < n - 1) {
        if (tx < 255) count[id] = indexes[tx + 1] - indexes[tx];
        else count[id] = i[id + 1] - indexes[tx];
    }
    if (id == n - 1) count[id] = n - indexes[tx];
    return;
}
2) ArrayFire
This is an easy-to-use, free, array-based library.
You can do the following in ArrayFire.
using namespace af;
float h_A[] = {.333, .324, .123, .543, .00054, .2243, .333, .53343, .4434};
int num = 9;
// Transfer data to device
array A(9, 1, h_A);
array values, locations, original;
// Find the unique values and locations
setunique(values, locations, original, A);
// Locations are 0 based, add 1.
// Add *num* at the end to find count of last value.
array counts = diff1(join(locations + 1, num));
Disclosure: I work for AccelerEyes, which develops this software.
To answer the latest addendum to this question: the diff kernel that would complete the Thrust method proposed by Pavan could look something like this:
template<int blcksz>
__global__ void diffkernel(const int *i, int* count, const int n) {
    int id = blockDim.x * blockIdx.x + threadIdx.x;
    int strd = blockDim.x * gridDim.x;
    int nmax = blcksz * ((n/blcksz) + ((n%blcksz > 0) ? 1 : 0));
    __shared__ int indices[blcksz+1];

    for(; id < nmax; id += strd) {
        // Data load
        indices[threadIdx.x] = (id < n) ? i[id] : n;
        if (threadIdx.x == (blcksz-1))
            indices[blcksz] = ((id+1) < n) ? i[id+1] : n;
        __syncthreads();

        // Differencing calculation
        int diff = indices[threadIdx.x+1] - indices[threadIdx.x];

        // Store
        if (id < n) count[id] = diff;
        __syncthreads();
    }
}
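A possible launch of this kernel, reusing the names from the snippets above (my sketch; thanks to the grid-stride loop, the grid only needs to be capped, not exact):

int TPB = 256;
int nblocks = (uniqueEle + TPB - 1) / TPB;
if (nblocks > 65535) nblocks = 65535;   // the grid-stride loop covers the remainder
diffkernel<256><<<nblocks, TPB>>>(d_rawI, count, uniqueEle);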
Here is a simple brute-force solution:
__global__ void counter(float* a, int* b, int N)
{
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    if(idx < N)
    {
        float my = a[idx];
        int count = 0;
        for(int i = 0; i < N; i++)
        {
            if(my == a[i])
                count++;
        }
        b[idx] = count;
    }
}
int main()
{
    int threads = 9;
    int blocks = 1;
    int N = blocks*threads;

    float* h_a;
    int* h_b;
    float* d_a;
    int* d_b;

    h_a = (float*)malloc(N*sizeof(float));
    h_b = (int*)malloc(N*sizeof(int));
    cudaMalloc((void**)&d_a, N*sizeof(float));
    cudaMalloc((void**)&d_b, N*sizeof(int));

    h_a[0] = .333f;
    h_a[1] = .324f;
    h_a[2] = .123f;
    h_a[3] = .543f;
    h_a[4] = .00054f;
    h_a[5] = .2243f;
    h_a[6] = .333f;
    h_a[7] = .53343f;
    h_a[8] = .4434f;

    cudaMemcpy(d_a, h_a, N*sizeof(float), cudaMemcpyHostToDevice);
    counter<<<blocks,threads>>>(d_a, d_b, N);
    cudaMemcpy(h_b, d_b, N*sizeof(int), cudaMemcpyDeviceToHost);

    for(int i = 0; i < N; i++)
    {
        printf("%f = %d times\n", h_a[i], h_b[i]);
    }

    cudaFree(d_a);
    cudaFree(d_b);
    free(h_a);
    free(h_b);
    getchar();
    return 0;
}
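For larger inputs, the same per-value counts can be computed without the O(N^2) scan by sorting and reducing by key with Thrust (a sketch, assuming exact float equality is the intended grouping, as in the kernel above):

#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/reduce.h>
#include <thrust/iterator/constant_iterator.h>

// After sorting, equal values sit next to each other, so reduce_by_key
// with a stream of 1s emits one (value, count) pair per run of equal values.
thrust::device_vector<float> d_sorted(h_a, h_a + N);
thrust::sort(d_sorted.begin(), d_sorted.end());

thrust::device_vector<float> d_vals(N);
thrust::device_vector<int> d_counts(N);
int num_unique = thrust::reduce_by_key(d_sorted.begin(), d_sorted.end(),
                                       thrust::constant_iterator<int>(1),
                                       d_vals.begin(), d_counts.begin()).first
                 - d_vals.begin();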