CUDA __syncthreads() not working: inverted breakpoint hit order - cuda

I have a problem, I think with __syncthreads(), in the following code:
__device__ void prefixSumJoin(const bool *g_idata, int *g_odata, int n)
{
    __shared__ int temp[Config::bfr*Config::bfr]; // allocated on invocation
    int thid = threadIdx.y*blockDim.x + threadIdx.x;
    if(thid<(n>>1))
    {
        int offset = 1;
        temp[2*thid] = (g_idata[2*thid]?1:0); // load input into shared memory
        temp[2*thid+1] = (g_idata[2*thid+1]?1:0);
        for (int d = n>>1; d > 0; d >>= 1) // build sum in place up the tree
        {
            __syncthreads();
            if (thid < d)
            {
                int ai = offset*(2*thid+1)-1; // <-- breakpoint B
                int bi = offset*(2*thid+2)-1;
                temp[bi] += temp[ai];
            }
            offset *= 2;
        }
        if (thid == 0) { temp[n - 1] = 0; } // clear the last element
        for (int d = 1; d < n; d *= 2) // traverse down tree & build scan
        {
            offset >>= 1;
            __syncthreads();
            if (thid < d)
            {
                int ai = offset*(2*thid+1)-1;
                int bi = offset*(2*thid+2)-1;
                int t = temp[ai];
                temp[ai] = temp[bi];
                temp[bi] += t;
            }
        }
        __syncthreads();
        g_odata[2*thid] = temp[2*thid]; // write results to device memory
        g_odata[2*thid+1] = temp[2*thid+1];
    }
}
__global__ void selectKernel3(...)
{
    int tidx = threadIdx.x;
    int tidy = threadIdx.y;
    int bidx = blockIdx.x;
    int bidy = blockIdx.y;
    int tid = tidy*blockDim.x + tidx;
    int bid = bidy*gridDim.x + bidx;
    int noOfRows1 = ...;
    int noOfRows2 = ...;
    __shared__ bool isRecordSelected[Config::bfr*Config::bfr];
    __shared__ int selectedRecordsOffset[Config::bfr*Config::bfr];
    isRecordSelected[tid] = false;
    selectedRecordsOffset[tid] = 0;
    __syncthreads();
    if(tidx<noOfRows1 && tidy<noOfRows2)
        if(... == ...)
            isRecordSelected[tid] = true;
    __syncthreads();
    prefixSumJoin(isRecordSelected,selectedRecordsOffset,Config::bfr*Config::bfr); // <-- breakpoint A
    __syncthreads();
    if(isRecordSelected[tid]==true)
    {
        some_instruction; // <-- breakpoint C
        ...
    }
}
...
f(){
    dim3 dimGrid(13, 5);
    dim3 dimBlock(Config::bfr, Config::bfr);
    selectKernel3<<<dimGrid, dimBlock>>>(...);
}
//other file
class Config
{
public:
    static const int bfr = 16; // blocking factor = number of rows per block
public:
    Config(void);
    ~Config(void);
};
The prefixSum is from GPU Gems 3: Parallel Prefix Sum (Scan) with CUDA, with minor changes.
OK, now I set three breakpoints: A, B, and C. They should be hit in the order A, B, C. The problem is that they are hit in the order A, B (x times), C, B. So at point C, selectedRecordsOffset is not ready yet, which causes errors. After A, breakpoint B is hit a few times, but not for all iterations; then C is hit, execution goes further in the code, and then B is hit again for the rest of the loop. The value of x depends on the input (for some inputs there is no inversion in the breakpoint order and C is the last one hit).
Moreover, if I look at the thread indices that cause the hits, threadIdx.y = 0 for A and C, but threadIdx.y = 10 for B. How is this possible within the same block? Why do some threads skip the synchronization? There is no conditional sync.
Does anyone have any ideas on where to look for bugs?
If you need some more clarification, just ask.
Thanks in advance for any advice on how to work this out.
Adam

Thou shalt not use __syncthreads() in conditional code if the condition does not evaluate uniformly across all threads of each block. In prefixSumJoin every __syncthreads() sits inside if(thid<(n>>1)), so the threads with thid >= n/2 never reach the barriers, and the behaviour of the block (including the odd breakpoint ordering you see) is undefined.
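As a minimal sketch of one way to fix it (my restructuring, not tested against your data): keep the same GPU Gems scan, but hoist every __syncthreads() out of the thread-index condition so that all threads of the block reach every barrier, and guard only the loads, the tree work and the stores. This assumes, as in your code, that n is a power of two and the block has at least n/2 threads:
__device__ void prefixSumJoin(const bool *g_idata, int *g_odata, int n)
{
    __shared__ int temp[Config::bfr*Config::bfr];
    int thid = threadIdx.y*blockDim.x + threadIdx.x;
    int offset = 1;
    if (thid < (n>>1))
    {
        temp[2*thid]   = (g_idata[2*thid]   ? 1 : 0);   // load input into shared memory
        temp[2*thid+1] = (g_idata[2*thid+1] ? 1 : 0);
    }
    for (int d = n>>1; d > 0; d >>= 1)                  // build sum in place up the tree
    {
        __syncthreads();                                // reached by every thread of the block
        if (thid < d)
        {
            int ai = offset*(2*thid+1)-1;
            int bi = offset*(2*thid+2)-1;
            temp[bi] += temp[ai];
        }
        offset *= 2;
    }
    if (thid == 0) { temp[n - 1] = 0; }                 // clear the last element
    for (int d = 1; d < n; d *= 2)                      // traverse down tree & build scan
    {
        offset >>= 1;
        __syncthreads();                                // reached by every thread of the block
        if (thid < d)
        {
            int ai = offset*(2*thid+1)-1;
            int bi = offset*(2*thid+2)-1;
            int t = temp[ai];
            temp[ai] = temp[bi];
            temp[bi] += t;
        }
    }
    __syncthreads();
    if (thid < (n>>1))
    {
        g_odata[2*thid]   = temp[2*thid];               // write results to device memory
        g_odata[2*thid+1] = temp[2*thid+1];
    }
}
The same rule applies to the caller: prefixSumJoin itself must be reached by every thread of the block, which is already the case in selectKernel3.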

Related

Matrix vector product CUDA improve performance with tiling and shared memory

Hello, I'm working on a CUDA kernel for a matrix-vector product. I want to improve the performance with tiling and shared memory.
The problem is that with this code the M matrix and the N vector aren't being loaded correctly.
Do you have any idea how to load a tile from M and N into the shared memory arrays?
M is the matrix, N is the vector, and P is the result of the matrix-vector product.
__global__ void matrixMul( float* P, float* M, float* N, int Mw, int Nw)
{
    int bx = blockIdx.x; int by = blockIdx.y;
    int tx = threadIdx.x; int ty = threadIdx.y;
    __shared__ float Ms[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float Ns[BLOCK_SIZE];
    // ===================================================================
    // Code segment 1
    // Determine the update values for the tile indices in the loop
    // ===================================================================
    int mBegin = Mw * BLOCK_SIZE * by;
    int mEnd = mBegin + Mw - 1;
    int mStep = BLOCK_SIZE;
    int nBegin = BLOCK_SIZE * bx;
    //int nStep = BLOCK_SIZE*Nw;
    int nStep = 1;
    float Psub = 0.0f;
    // ===================================================================
    // Code segment 2
    // Do matrix-matrix multiplication inside a tile
    // ===================================================================
    for (int m = mBegin, n = nBegin; m <= mEnd; m += mStep, n += nStep) {
        // Load a tile from M and N into the shared memory arrays
        Ms[ty][tx] = M[bx*mStep*Mw+m];
        Ns[ty] = N[by*nStep*Nw+n];
        // Synchronize the threads
        __syncthreads();
        // Multiply the two tiles together, each thread accumulating
        // the partial sum of a single dot product.
        for (int i = 0; i < BLOCK_SIZE; i++) {
            Psub += Ms[i][tx] * Ns[i];
        }
        // Synchronize again.
        __syncthreads();
    }
    // ===================================================================
    // Code segment 3
    // Store the data back to global memory
    // ===================================================================
    int p = Nw * BLOCK_SIZE * by + BLOCK_SIZE * bx;
    P[p + nStep] = Psub;
}
I found a similar example (dealing with square matrices of identical sizes, mind you) that also loads parts of the matrix into shared memory. It seems your declarations are right, and it probably just comes down to the algebra you are using to determine which elements go where.
__global__ void MatrixMulKernel(float* Md, float* Nd, float* Pd, int Width){
    __shared__ float Mds[TILE_WIDTH][TILE_WIDTH]; // Shared memory
    __shared__ float Nds[TILE_WIDTH][TILE_WIDTH]; // declarations
    int bx = blockIdx.x; int by = blockIdx.y; // ID thread
    int tx = threadIdx.x; int ty = threadIdx.y;
    // Identify the row and column of the Pd element to work on
    int Row = by * TILE_WIDTH + ty;
    int Col = bx * TILE_WIDTH + tx;
    float Pvalue = 0; // REGISTER!
    // Loop over the Md and Nd tiles required to compute the Pd element
    for (int m = 0; m < Width/TILE_WIDTH; ++m) {
        // Collaborative loading of Md and Nd tiles into shared memory
        Mds[ty][tx] = Md[Row*Width + (m*TILE_WIDTH + tx)];
        Nds[ty][tx] = Nd[Col + (m*TILE_WIDTH + ty)*Width];
        __syncthreads();
        for (int k = 0; k < TILE_WIDTH; ++k)
            Pvalue += Mds[ty][k] * Nds[k][tx];
        __syncthreads();
    }
    Pd[Row*Width+Col] = Pvalue;
}
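For the matrix-vector case specifically, here is a minimal sketch of my own (not from the example above) in which each thread computes one row of P and the block cooperatively stages a BLOCK_SIZE-wide slice of the vector N in shared memory; rows and cols are the dimensions of M, stored row-major:
__global__ void matVecTiled(float* P, const float* M, const float* N, int rows, int cols)
{
    __shared__ float Ns[BLOCK_SIZE];                 // one tile of the vector
    int row = blockIdx.x * blockDim.x + threadIdx.x; // one thread per row of M
    float sum = 0.0f;
    for (int tile = 0; tile < cols; tile += BLOCK_SIZE) {
        // Each thread loads one element of the current vector tile
        int col = tile + threadIdx.x;
        Ns[threadIdx.x] = (col < cols) ? N[col] : 0.0f;
        __syncthreads();
        if (row < rows) {
            for (int k = 0; k < BLOCK_SIZE && tile + k < cols; ++k)
                sum += M[row * cols + tile + k] * Ns[k];
        }
        __syncthreads();
    }
    if (row < rows)
        P[row] = sum;
}
It would be launched with a 1-D grid, e.g. matVecTiled<<<(rows + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(P, M, N, rows, cols);.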

Rearranging an array in CUDA

I have the following problem that I want to implement on CUDA:
I want to read an array (say "flag[20]") and, based on a certain condition, write the indices of this array to another array (say "pindex[]").
Simple code implementation in C can be:
int N = 20;
int flag[N];
int pindex[N];
for(int i=0;i<N;i++)
    flag[i] = -1;
for(int i=0;i<N;i+=2)
    flag[i] = 0;
for(int i=0;i<N;i++)
    pindex[i] = 0;
//operation: count # of times flag != -1 and write those indices in a different array
int pcount1 = 0;
for(int i=0;i<N;i++)
{
    if(flag[i] != -1)
    {
        pindex[pcount1] = i;
        ++pcount1;
    }
}
How will I implement this in CUDA?
I can use atomicAdd() to calculate the total number of times my condition is satisfied. But how do I write the indices into a different array? For example, I tried the following:
__global__ void kernel_tryatomic(int N,int* pcount,int* flag, int* pindex)
{
    int tId=threadIdx.x;
    int n=(blockIdx.x*2+blockIdx.y)*BlockSize+tId;
    if(n > N-1) return;
    if(flag[n] != -1)
    {
        atomicAdd(pcount,1);
        atomicExch(&pindex[*pcount],n);
        //pindex[*pcount] = n;
    }
}
This code calculates "pcount" correctly, but does not update the "pindex" array.
I need help to do this operation on GPUs.
Thanks
Since your condition (flag) is conceptually binary, you can use a binary prefix sum (thoroughly explained here) to determine the position where each thread with a positive flag should write.
For example, if N is 20, with the help of the __device__ functions below:
__device__ int lanemask_lt(int lane) {
    return (1 << (lane)) - 1;
}
__device__ int warp_prefix_sums(int lane, int p) {
    const int mask = lanemask_lt( lane );
    int b = __ballot( p );
    return __popc( b & mask );
}
your __global__ function can simply be written like below:
__global__ void kernel_scan(int N,int* pcount,int* flag, int* pindex)
{
    int tId=threadIdx.x;
    if(tId >= N)
        return;
    int threadFlag = ( flag[tId] == -1 ) ? 0 : 1;
    int position_to_write = warp_prefix_sums( tId & (warpSize-1), threadFlag );
    if( threadFlag )
        pindex[ position_to_write ] = tId;
}
If N is bigger than the warp size (32), you can use intra-block binary prefix sum that is explained in the provided link.
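If you would rather not write the scan yourself, stream compaction in Thrust does the same job. A rough sketch of my own (not part of the original answer), using thrust::copy_if with a counting iterator to gather the indices whose flag is not -1, with flag and N as in the host code above:
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <thrust/iterator/counting_iterator.h>

struct flag_set
{
    __host__ __device__ bool operator()(int f) const { return f != -1; }
};

// Copy the flags to the device, then compact the indices 0..N-1 whose flag passes the predicate
thrust::device_vector<int> d_flag(flag, flag + N);
thrust::device_vector<int> d_pindex(N);
thrust::counting_iterator<int> first(0);
thrust::device_vector<int>::iterator out_end =
    thrust::copy_if(first, first + N, d_flag.begin(), d_pindex.begin(), flag_set());
int pcount = out_end - d_pindex.begin();   // number of indices written, in order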

CUDA threads for inner loop

I've got this kernel
__global__ void kernel1(int keep, int include, int width, int* d_Xco,
                        int* d_Xnum, bool* d_Xvalid, float* d_Xblas)
{
    int i, k;
    i = threadIdx.x + blockIdx.x * blockDim.x;
    if(i < keep){
        for(k = 0; k < include ; k++){
            int val = (d_Xblas[i*include + k] >= 1e5);
            int aux = d_Xnum[i];
            d_Xblas[i*include + k] *= (!val);
            d_Xco[i*width + aux] = k;
            d_Xnum[i] += val;
            d_Xvalid[i*include + k] = (!val);
        }
    }
}
launched with
int keep = 9000;
int include = 23000;
int width = 0.2*include;
int threads = 192;
int blocks = (keep + threads - 1) / threads;
kernel1 <<< blocks,threads >>>( keep, include, width,
d_Xco, d_Xnum, d_Xvalid, d_Xblas );
This kernel1 works fine, but it is obviously not fully optimized. I thought it would be straightforward to eliminate the inner loop over k, but for some reason it doesn't work correctly.
My first idea was:
__global__ void kernel2(int keep, int include, int width,
                        int* d_Xco, int* d_Xnum, bool* d_Xvalid,
                        float* d_Xblas)
{
    int i, k;
    i = threadIdx.x + blockIdx.x * blockDim.x;
    k = threadIdx.y + blockIdx.y * blockDim.y;
    if((i < keep) && (k < include) ) {
        int val = (d_Xblas[i*include + k] >= 1e5);
        int aux = d_Xnum[i];
        d_Xblas[i*include + k] *= (float)(!val);
        d_Xco[i*width + aux] = k;
        atomicAdd(&d_Xnum[i], val);
        d_Xvalid[i*include + k] = (!val);
    }
}
launched with a 2D grid:
int keep = 9000;
int include = 23000;
int width = 0.2*include;
int th = 32;
dim3 threads(th,th);
dim3 blocks ((keep+threads.x-1)/threads.x, (include+threads.y-1)/threads.y);
kernel2 <<< blocks,threads >>>( keep, include, width, d_Xco, d_Xnum,
d_Xvalid, d_Xblas );
Although I believe the idea is fine, it does not work and I am running out of ideas here. Could you please help me out? I also think the problem could be in d_Xco, which stores the positions k in a smaller array and pushes them to the beginning of the array, so the order matters.
d_Xco
-------------------------------
| 2|3 |15 |4 |5 |5 | | | | | | .......
-------------------------------
In the original code, you have
for(k = 0; k < include ; k++){
...
int aux = d_Xnum[i];
...
d_Xco[i*width + aux] = k;
...
}
The index to the d_Xco array is not dependent on k and therefore writing to it each iteration is redundant. The final value will always be include-1. So, replace these two lines inside the k loop with one line outside the k loop:
d_Xco[i*width + d_Xnum[i]] = include - 1;
Once you do that, when you parallelize the k loop you will no longer have the race condition you currently have when many k threads assign different values to the same location in d_Xco concurrently (no guarantee of ordering).

CUDA: how to read 4 (or 16) chars in one transaction per thread using textures and char4 (or int4)?

I have a large character array in device global memory that is accessed in a coalesced manner by threads. I've read somewhere that I could speed up memory access by reading 4 or 16 chars in one memory transaction per thread. I believe I would have to use textures and the char4 or int4 structs. However, I can't find any documentation or examples on this. Could anyone here please provide a simple example or pointers to where I can learn more about this?
In my code I define the char array as
char *database = NULL;
cudaMalloc( (void**) &database, SIZE * sizeof(char) );
What would the definition be if I want to use textures and char4 (or int4)?
Thanks very much.
I finally figured out the answer to my own question. The definition with char4
would be
char4 *database = NULL;
cudaMalloc( (void**) &database, SIZE * sizeof(char4)/4 );
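// sizeof(char4) is 4 bytes, so this allocates the same SIZE bytes as the plain char version,
// now viewed as SIZE/4 char4 elements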
You don't need textures for this. The kernel speeds up by a factor of three with char4, but the gain drops to a factor of two if I do loop unrolling. For the sake of completeness, my kernel is
__global__ void kernel(unsigned int jobs_todo, char* database, float* results ) {
    unsigned int id = threadIdx.x + blockIdx.x * blockDim.x;
    float A = 0; int i; char ch;
    if(id < jobs_todo) {
        for(i = 0; i < 1000; i += 1){
            ch = database[jobs_todo*i + id];
            if(ch == 'A') A++;
        }
        results[id] = A;
    }
}
And with char4 it is
__global__ void kernel4(unsigned int jobs_todo, char4* database, float* results ) {
    unsigned int id = threadIdx.x + blockIdx.x * blockDim.x;
    float A = 0; int i; char4 ch4;
    if(id < jobs_todo) {
        for(i = 0; i < 1000/4; i += 1){
            ch4 = database[jobs_todo*i + id];
            if(ch4.x == 'A') A++;
            if(ch4.y == 'A') A++;
            if(ch4.z == 'A') A++;
            if(ch4.w == 'A') A++;
        }
        results[id] = A;
    }
}
I also tried int4 but it's just .0002 seconds faster than the char4 time.
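If the buffer has already been allocated as plain char, as in the original definition, here is a small sketch of reusing it as char4 without reallocating (my addition; kernel4 as above, while the d_results pointer and the launch configuration are placeholders). This assumes SIZE is a multiple of 4; cudaMalloc returns pointers aligned to at least 256 bytes, so the reinterpret is safe:
char *database = NULL;
cudaMalloc((void**)&database, SIZE * sizeof(char));
// View the same bytes as packed 4-char vectors
char4 *database4 = reinterpret_cast<char4*>(database);
kernel4<<<(jobs_todo + 255) / 256, 256>>>(jobs_todo, database4, d_results);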

Get statistics for a list of numbers using GPU

I have several lists of numbers in a file. For example,
.333, .324, .123 , .543, .00054
.2243, .333, .53343 , .4434
Now, I want to get the number of times each number occurs, using the GPU. I believe this will be faster on the GPU than on the CPU because each thread can process one list. What data structure should I use on the GPU to easily get the above counts? For example, for the above, the answer would look as follows:
.333 = 2 times in entire file
.324 = 1 time
etc..
I'm looking for a general solution, not one that works only on devices with a specific compute capability.
I am just writing the kernel suggested by Pavan to see whether I have implemented it efficiently:
int uniqueEle = newend.valiter - d_A;
int* count;
cudaMalloc((void**)&count, uniqueEle * sizeof(int)); // stores the count of each unique element
int TPB = 256;
int blocks = (uniqueEle + TPB - 1) / TPB;
//Cast d_I to raw pointer called d_rawI
launch<<<blocks,TPB>>>(d_rawI,count,uniqueEle);
__global__ void launch(int *i, int* count, int n){
    int id = blockDim.x * blockIdx.x + threadIdx.x;
    __shared__ int indexes[256];
    if(id < n ){
        indexes[threadIdx.x] = i[id];
        //as occurs between two blocks
        if(id % 255 == 0){
            count[indexes] = i[id+1] - i[id];
        }
    }
    __syncthreads();
    if(id < ele - 1){
        if(threadIdx.x < 255)
            count[id] = indexes[threadIdx.x+1] - indexes[threadIdx.x];
    }
}
Question: how do I modify this kernel so that it handles arrays of arbitrary size, i.e., the case when the total number of threads is less than the number of elements?
Here is how I would do it in MATLAB:
A = [.333, .324, .123, .543, .00054, .2243, .333, .53343, .4434];
[values, locations] = unique(A); % Find unique values and their locations
counts = diff([0, locations]); % Find the count based on their locations
There is no easy way to do this in plain CUDA, but you can use existing libraries.
1) Thrust
It ships with the CUDA toolkit as of CUDA 4.0.
The MATLAB code can be roughly translated into Thrust using the following functions. I am not too proficient with Thrust, but I am just trying to give you an idea of what routines to look at.
float _A[] = {.333, .324, .123, .543, .00054, .2243, .333, .53343, .4434};
int _I[] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
float *A;
int *I;
// Allocate memory on device and cudaMemcpy values from _A to A and _I to I
int num = 9;
// Values vector
thrust::device_vector<float> d_A(A, A+num);
// Need to sort to get same values together
thrust::stable_sort(d_A.begin(), d_A.end());
// Vector containing 0 to num-1
thrust::device_vector<int> d_I(I, I+num);
// Output vectors for the unique values and their first locations
thrust::device_vector<float> d_Values(num);
thrust::device_vector<int> d_Locations(num);
thrust::device_vector<int> d_counts(num);
// Find unique elements and the index of the first occurrence of each
typedef thrust::device_vector<float>::iterator valiter;
typedef thrust::device_vector<int>::iterator idxiter;
thrust::pair<valiter, idxiter> new_end;
new_end = thrust::unique_by_key_copy(d_A.begin(), d_A.end(), d_I.begin(),
                                     d_Values.begin(), d_Locations.begin());
You now have the locations of the first instance of each unique value. You can now launch a kernel to find the differences between adjacent elements from 0 to new_end in d_Locations. Subtract the final value from num to get the count for the final location.
EDIT (Adding code that was provided over chat)
Here is how the difference code needs to be done
#define MAX_BLOCKS 65535
#define roundup(A, B) (((A) + (B) - 1) / (B))
int uniqueEle = new_end.first - d_Values.begin();
int* count;
cudaMalloc((void**)&count, uniqueEle * sizeof(int));
int TPB = 256;
int num_blocks = roundup(uniqueEle, TPB);
int blocks_y = roundup(num_blocks, MAX_BLOCKS);
int blocks_x = roundup(num_blocks, blocks_y);
dim3 blocks(blocks_x, blocks_y);
kernel<<<blocks,TPB>>>(d_rawI, count, uniqueEle);
__global__ void kernel(int *i, int* count, int n)
{
    int tx = threadIdx.x;
    int bid = blockIdx.y * gridDim.x + blockIdx.x;
    int id = blockDim.x * bid + tx;
    __shared__ int indexes[256];
    if (id < n) indexes[tx] = i[id];
    __syncthreads();
    if (id < n - 1) {
        if (tx < 255) count[id] = indexes[tx + 1] - indexes[tx];
        else count[id] = i[id + 1] - indexes[tx];
    }
    if (id == n - 1) count[id] = n - indexes[tx];
    return;
}
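As an aside (my own sketch, not part of the original answer), the diff kernel can be avoided altogether by letting Thrust count each run of equal values directly with thrust::reduce_by_key on the sorted data:
#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <thrust/iterator/constant_iterator.h>

// d_A holds the num values and has already been sorted with thrust::stable_sort (see above)
thrust::device_vector<float> d_values(num);
thrust::device_vector<int> d_counts(num);
// For every run of equal keys in d_A, sum a stream of 1s -> the count of that value
thrust::pair<thrust::device_vector<float>::iterator,
             thrust::device_vector<int>::iterator> ends =
    thrust::reduce_by_key(d_A.begin(), d_A.end(),
                          thrust::constant_iterator<int>(1),
                          d_values.begin(), d_counts.begin());
int uniqueEle = ends.first - d_values.begin();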
2) ArrayFire
This is an easy-to-use, free, array-based library.
You can do the following in ArrayFire.
using namespace af;
float h_A[] = {.333, .324, .123, .543, .00054, .2243, .333, .53343, .4434};
int num = 9;
// Transfer data to device
array A(9, 1, h_A);
array values, locations, original;
// Find the unique values and locations
setunique(values, locations, original, A);
// Locations are 0 based, add 1.
// Add *num* at the end to find count of last value.
array counts = diff1(join(locations + 1, num));
Disclosure: I work for AccelerEyes, that develops this software.
To answer the latest addendum to this question - the diff kernel which would complete the Thrust method proposed by Pavan could look something like this:
template<int blcksz>
__global__ void diffkernel(const int *i, int* count, const int n) {
    int id = blockDim.x * blockIdx.x + threadIdx.x;
    int strd = blockDim.x * gridDim.x;
    int nmax = blcksz * ((n/blcksz) + ((n%blcksz>0) ? 1 : 0));
    __shared__ int indices[blcksz+1];
    for(; id<nmax; id+=strd) {
        // Data load
        indices[threadIdx.x] = (id < n) ? i[id] : n;
        if (threadIdx.x == (blcksz-1))
            indices[blcksz] = ((id+1) < n) ? i[id+1] : n;
        __syncthreads();
        // Differencing calculation
        int diff = indices[threadIdx.x+1] - indices[threadIdx.x];
        // Store
        if (id < n) count[id] = diff;
        __syncthreads();
    }
}
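A possible launch for it (my reading of the kernel, not from the original answer): thanks to the grid-stride loop, any grid size works, so the block count can simply be capped at the 1-D grid limit:
const int TPB = 256;
int nblocks = (uniqueEle + TPB - 1) / TPB;
if (nblocks > 65535) nblocks = 65535;      // the grid-stride loop covers the remainder
diffkernel<TPB><<<nblocks, TPB>>>(d_rawI, count, uniqueEle);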
Here is a solution:
__global__ void counter(float* a, int* b, int N)
{
    int idx = blockIdx.x*blockDim.x+threadIdx.x;
    if(idx < N)
    {
        float my = a[idx];
        int count = 0;
        for(int i=0; i < N; i++)
        {
            if(my == a[i])
                count++;
        }
        b[idx]=count;
    }
}
int main()
{
    int threads = 9;
    int blocks = 1;
    int N = blocks*threads;
    float* h_a;
    int* h_b;
    float* d_a;
    int* d_b;
    h_a = (float*)malloc(N*sizeof(float));
    h_b = (int*)malloc(N*sizeof(int));
    cudaMalloc((void**)&d_a,N*sizeof(float));
    cudaMalloc((void**)&d_b,N*sizeof(int));
    h_a[0]= .333f;
    h_a[1]= .324f;
    h_a[2]= .123f;
    h_a[3]= .543f;
    h_a[4]= .00054f;
    h_a[5]= .2243f;
    h_a[6]= .333f;
    h_a[7]= .53343f;
    h_a[8]= .4434f;
    cudaMemcpy(d_a,h_a,N*sizeof(float),cudaMemcpyHostToDevice);
    counter<<<blocks,threads>>>(d_a,d_b,N);
    cudaMemcpy(h_b,d_b,N*sizeof(int),cudaMemcpyDeviceToHost);
    for(int i=0; i < N; i++)
    {
        printf("%f = %d times\n",h_a[i],h_b[i]);
    }
    cudaFree(d_a);
    cudaFree(d_b);
    free(h_a);
    free(h_b);
    getchar();
    return 0;
}