I have some problem in loop unroll in CUDA.
In normal serial code:
//serial basic:
for(int i = 0; i < n; i++){
c[i] = a[i] + b[i];}
//serial loop unroll:
for(int i = 0; i < n/4; i++){
c[i] = a[i] + b[i];
c[i+1] = a[i+1] + b[i+1];
c[i+2] = a[i+2] + b[i+2];
c[i+3] = a[i+3] + b[i+3];}
So I think the CUDA loop unrolling looks like this:
int i = 2*(threadIdx.x + blockIdx.x * gridDim.x);
a[i+0] = b[i+0] + c[i+0];
a[i+1] = b[i+1] + c[i+1];
But in the CUDA hand-book the unrolling example I can't understand
This is a normal GlobalWrite kernel:
__global__ void GlobalWrites( T *out, T value, size_t N )
{
for(size_t i = blockIdx.x*blockDim.x+threadIdx.x;
i < N;
i += blockDim.x*gridDim.x ) {
out[i] = value;
}
}
unrolling kernel:
template<class T, const int n> __global__ void Global_write(T* out, T value, size_t N){
size_t i;
for(i = n*blockDim.x*blockIdx.x + threadIdx.x;
i < N - n*blockDim.x*blockIdx.x;
i += n*gridDim.x*blockDim.x;)
for(int j = 0; j < n; i++){
size_t index = i + j * blockDim.x;
outp[index] = value;
}
for ( int j = 0; j < n; j++ ) {
size_t index = i+j*blockDim.x;
if ( index<N ) out[index] = value;
}}
I know this kernel uses less blocks but may someone explain why it works better (n=4,10% speed up).
If it wasn't obvious, because n is a template parameter, it is constant at compile time. This means that the compiler is free to optimize the constant trip count loop away by unrolling. It is, therefore, instructive to remove the template magic and unroll the loop by hand for the n=4 case you mentioned:
template<class T>
__global__ void Global_write(T* out, T value, size_t N)
{
size_t i;
for(i = 4*blockDim.x*blockIdx.x + threadIdx.x;
i < N - 4*blockDim.x*blockIdx.x;
i += 4*gridDim.x*blockDim.x;) {
out[i + 0 * blockDim.x] = value;
out[i + 1 * blockDim.x] = value;
out[i + 2 * blockDim.x] = value;
out[i + 3 * blockDim.x] = value;
}
if ( i+0*blockDim.x < N ) out[i+0*blockDim.x] = value;
if ( i+1*blockDim.x < N ) out[i+1*blockDim.x] = value;
if ( i+2*blockDim.x < N ) out[i+2*blockDim.x] = value;
if ( i+3*blockDim.x < N ) out[i+3*blockDim.x] = value;
}
The unrolled inner loop yields four completely independent writes which are coalesced. It is this instruction level parallelism which give the code higher instruction throughput and improved performance. I highly recommend Vasily Volkov's Unrolling Parallel Loops from the GTC conference of a few years ago, if you haven't already seen it. His presentation lays out the theoretical background for why this type of loop unrolling is an optimisation in CUDA.
In the templated kernel, const int n is known at compile time, allowing the compiler to actually unroll the for(int j = 0; j < n; i++) loop removing the conditional checks on that loop. If the loop size is not known at compile time, the compiler cannot unroll the loop. Simple as that.
I have a basic question related to Two-dimensional thread access.
I want to copy the non-contiguous data into contiguous buffer and the use of cuda memcopy can be illustrated as:
void pack_cuda(float *dstbuf, IOV *srciov, int num_iov)
{
int i;
float *ptr;
ptr = buf;
for (i = 0; i < num_iov; i++) {
cudaMemcpy(ptr, srciov[i].bufaddr, srciov[i].len, cudaMemcpyDefault);
ptr = (char *)ptr + srciov[i].len;
}
}
*srciov stores the start memory address and length of each non-contiguous data in an array of structure.
*dstbuf will store the packed contiguous data after the completion of the function.
Now, I want to implement it using CUDA kernels.
__global__ void pack_cuda(float *dstbuf, IOV *srciov, int num_iov)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
int k;
extern __shared__ size_t tmpdbuflen[16*3]; //suppose num_iov is 16
if ( j == 0 ){
if ( i < 16 ){
tmpdbuflen[i] = (srciov[i].len); //store length to calculate presum
tmpdbuflen[i+16] = tmpdbuflen[i]; //store length
tmpdbuflen[i+32] = ((srciov+i)->bufaddr) - (srciov->bufaddr); //store addr difference
}
__syncthreads();
for ( k = 0; k < i; k++)
tmpdbuflen[i] += srciov[k].len;
}
__syncthreads();
if ( i < 16 && j < srciov[i].len ){ //wondering whether this is correct use
dst[tmpdbuflen[i] + j] = *(src + tmpdbuflen[i+32] + j);
}
__syncthreads();
}
Kernel invocation part:
dim3 dimblock(16, 16); //the length of each non-contiguous data is less than 16
dim3 dimgrid(1,1);
const unsigned int shm_size = sizeof(size_t) * 16 * 3;
pack_cuda<<<dimgrid, dimblock, shm_size, 0>>>(dstbuf, srciov, num_iov);
cudaDeviceSynchronize();
However, it seems that I cannot pack all needed datas into dst buffer.
Sometimes only j = 0 and 1 (with corresponding various i) get packed.
I think the major problem is the usage of shared memory. I only use column 0 threads (threadIdx.y == 0) to copy information onto the shared memory. Then all threads (no restriction on threadIdx.y) will access and read information in shared memory.
How to modify the code enable such design?
I'd appreciate it if anyone can figure out my problems.
Thanks.
Some hints on your code:
__global__ void pack_cuda(float *dstbuf, IOV *srciov, int num_iov)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
int k;
extern __shared__ size_t tmpdbuflen[16*3]; //suppose num_iov is 16
This Block here will only be executed by one thread, due to the guard j==0 which only allows thread bid*bdim+tid = 0*0+0, ergo thread 0 in block 0, which is undesirable for you. I would guess you want put j < 16 there
if ( j == 0 ){
if ( i < 16 ){
tmpdbuflen[i] = (srciov[i].len); //store length to calculate presum
tmpdbuflen[i+16] = tmpdbuflen[i]; //store length
tmpdbuflen[i+32] = ((srciov+i)->bufaddr) - (srciov->bufaddr); //store addr difference
}
__syncthreads();
for ( k = 0; k < i; k++)
tmpdbuflen[i] += srciov[k].len;
}
.
This is my code for multiplication of a sparse matrix in compressed column format
__kernel void mykernel(__global int* colvector,
__global int* val,
__global int* result,
__global int* index,
__global int* rowptr,
__global int* sync )
{
__local int vals[1000];
for(int i=0;i<4;i++)
{
result[i]=0;
}
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_GLOBAL_MEM_FENCE);
const int items_per_row=32;//total threads working in a row
const int thread_id=get_global_id(0)+get_local_id(0);//total threads in the program
const int warpid = thread_id/items_per_row;//warp id is actual row
int lane=thread_id&(items_per_row-1);//thread id within the warp
int row = warpid;
if(row<4)
{
int sum = 0;
int row_start = rowptr[row];
int row_end = rowptr[row+1];
vals[get_global_id(0)]=0;
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_GLOBAL_MEM_FENCE);
for (int i = row_start+lane; i<row_end; i+=items_per_row)
{
vals[get_local_id(0)]+=val[i]*colvector[index[i]];
}
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_GLOBAL_MEM_FENCE);
if (lane < 16 ) vals[get_local_id(0)] += vals[get_local_id(0) + 16];
if (lane < 8 ) vals[get_local_id(0)] += vals[get_local_id(0) + 8];
if (lane < 4 ) vals[get_local_id(0)] += vals[get_local_id(0) +4];
if (lane < 2 ) vals[get_local_id(0)] += vals[get_local_id(0) + 2];
if (lane < 1 ) vals[get_local_id(0)] += vals[get_local_id(0) + 1];
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_GLOBAL_MEM_FENCE);
if(lane==0)
{
result[row] += vals[get_local_id(0)];
}
}
}
the above OpenCL code was converted from the CUDA code given below:
spmv_csr_vector_kernel(const int num_rows,
const int * ptr,
const int * indices,
const float * data,
const float * x,
float * y )
{
__shared__ float vals[];
int thread_id = blockDim.x * blockIdx.x + threadIdx.x; // global thread index
int warp_id = thread_id / 32; // global warp index
int lane = thread_id & (32 - 1); // thread index within the warp
// one warp per row
int row = warp_id;
if (row < num_rows)
{
int row_start = ptr[row];
int row_end = ptr[row+1];
// compute running sum per thread
vals[threadIdx.x] = 0;
for(int jj = row_start + lane; jj < row_end; jj += 32)
{
vals[threadIdx.x] += data[jj] * x[indices[jj]];
}
// parallel reduction in shared memory
if (lane < 16) vals[threadIdx.x] += vals[threadIdx.x + 16];
if (lane < 8) vals[threadIdx.x] += vals[threadIdx.x + 8];
if (lane < 4) vals[threadIdx.x] += vals[threadIdx.x + 4];
if (lane < 2) vals[threadIdx.x] += vals[threadIdx.x + 2];
if (lane < 1) vals[threadIdx.x] += vals[threadIdx.x + 1];
// first thread writes the result
if (lane == 0)
{
y[row] += vals[threadIdx.x];
}
}
}
The CUDA code is correct but my OpenCL kernel is not returning correct output. I have been trying for a week now but no solution. Does anybody know what mistake I am making?
I can at least see one mistake. thread_id is not the same in each code. blockDim.x * blockIdx.x + threadIdx.x in CUDA == get_global_id(0) in OpenCL, not get_global_id(0)+get_local_id(0). Also get_local_id(0) == threadIdx.x
Try using swan, this might help you understand your problem.
you can find an article here about it.
I have several lists of numbers on a file . For example,
.333, .324, .123 , .543, .00054
.2243, .333, .53343 , .4434
Now, I want to get the number of times each number occurs using the GPU. I believe this will be faster to do on the GPU than the CPU because each thread can process one list. What data structure should I use on the GPU to easily get the above counts. For example , for the above, the answer will look as follows:
.333 = 2 times in entire file
.324 = 1 time
etc..
I looking for a general solution. Not one that works only on devices with specific compute capability
Just writing kernel suggested by Pavan to see if I have implemented it efficiently:
int uniqueEle = newend.valiter – d_A;
int* count;
cudaMalloc((void**)&count, uniqueEle * sizeof(int)); // stores the count of each unique element
int TPB = 256;
int blocks = uniqueEle + TPB -1 / TPB;
//Cast d_I to raw pointer called d_rawI
launch<<<blocks,TPB>>>(d_rawI,count,uniqueEle);
__global__ void launch(int *i, int* count, int n){
int id = blockDim.x * blockIdx.x + threadIdx.x;
__shared__ int indexes[256];
if(id < n ){
indexes[threadIdx.x] = i[id];
//as occurs between two blocks
if(id % 255 == 0){
count[indexes] = i[id+1] - i[id];
}
}
__syncthreads();
if(id < ele - 1){
if(threadIdx.x < 255)
count[id] = indexes[threadIdx.x+1] – indexes[threadIdx.x];
}
}
Question: how to modify this kernel so that it handles arrays of arbitrary size. I.e , handle the condition when the total number of threads < number of elements
Here is how I would do the code in matlab
A = [333, .324, .123 , .543, .00054 .2243, .333, .53343 , .4434];
[values, locations] = unique(A); % Find unique values and their locations
counts = diff([0, locations]); % Find the count based on their locations
There is no easy way to do this in plain cuda, but you can use existing libraries to do this.
1) Thrust
It is also being shipped with CUDA toolkit from CUDA 4.0.
The matlab code can be roughly translated into thrust by using the following functions. I am not too proficient with thrust, but I am just trying to give you an idea on what routines to look at.
float _A[] = {.333, .324, .123 , .543, .00054 .2243, .333, .53343 , .4434};
int _I[] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
float *A, *I;
// Allocate memory on device and cudaMempCpy values from _A to A and _I to I
int num = 9;
// Values vector
thrust::device_vector<float>d_A(A, A+num);
// Need to sort to get same values together
thrust::stable_sort(d_A, d_A+num);
// Vector containing 0 to num-1
thrust::device_vector<int>d_I(I, I+num);
// Find unique values and elements
thrust::device_vector<float>d_Values(num), d_Locations(num), d_counts(num);
// Find unique elements
thrust::device_vector<float>::iterator valiter;
thrust::device_vector<int>::iterator idxiter;
thrust::pair<valiter, idxiter> new_end;
new_end = thrust::unique_by_key(d_A, d_A+num, d_I, d_Values, d_Locations);
You now have the locations of the first instance of each unique value. You can now launch a kernel to find the differences between adjacent elements from 0 to new_end in d_Locations. Subtract the final value from num to get the count for final location.
EDIT (Adding code that was provided over chat)
Here is how the difference code needs to be done
#define MAX_BLOCKS 65535
#define roundup(A, B) = (((A) + (B) - 1) / (B))
int uniqueEle = newend.valiter – d_A;
int* count;
cudaMalloc((void**)&count, uniqueEle * sizeof(int));
int TPB = 256;
int num_blocks = roundup(uniqueEle, TPB);
int blocks_y = roundup(num_blocks, MAX_BLOCKS);
int blocks_x = roundup(num_blocks, blocks_y);
dim3 blocks(blocks_x, blocks_y);
kernel<<<blocks,TPB>>>(d_rawI, count, uniqueEle);
__global__ void kernel(float *i, int* count, int n)
{
int tx = threadIdx.x;
int bid = blockIdx.y * gridDim.x + blockIdx.x;
int id = blockDim.x * bid + tx;
__shared__ int indexes[256];
if (id < n) indexes[tx] = i[id];
__syncthreads();
if (id < n - 1) {
if (tx < 255) count[id] = indexes[tx + 1] - indexes[tx];
else count[id] = i[id + 1] - indexes[tx];
}
if (id == n - 1) count[id] = n - indexes[tx];
return;
}
2) ArrayFire
This is an easy to use, free array based library.
You can do the following in ArrayFire.
using namespace af;
float h_A[] = {.333, .324, .123 , .543, .00054 .2243, .333, .53343 , .4434};
int num = 9;
// Transfer data to device
array A(9, 1, h_A);
array values, locations, original;
// Find the unique values and locations
setunique(values, locations, original, A);
// Locations are 0 based, add 1.
// Add *num* at the end to find count of last value.
array counts = diff1(join(locations + 1, num));
Disclosure: I work for AccelerEyes, that develops this software.
To answer the latest addenum to this question - the diff kernel which would complete the thrust method proposed by Pavan could look something like this:
template<int blcksz>
__global__ void diffkernel(const int *i, int* count, const int n) {
int id = blockDim.x * blockIdx.x + threadIdx.x;
int strd = blockDim.x * gridDim.x;
int nmax = blcksz * ((n/blcksz) + ((n%blcksz>0) ? 1 : 0));
__shared__ int indices[blcksz+1];
for(; id<nmax; id+=strd) {
// Data load
indices[threadIdx.x] = (id < n) ? i[id] : n;
if (threadIdx.x == (blcksz-1))
indices[blcksz] = ((id+1) < n) ? i[id+1] : n;
__syncthreads();
// Differencing calculation
int diff = indices[threadIdx.x+1] - indices[threadIdx.x];
// Store
if (id < n) count[id] = diff;
__syncthreads();
}
}
here is a solution:
__global__ void counter(float* a, int* b, int N)
{
int idx = blockIdx.x*blockDim.x+threadIdx.x;
if(idx < N)
{
float my = a[idx];
int count = 0;
for(int i=0; i < N; i++)
{
if(my == a[i])
count++;
}
b[idx]=count;
}
}
int main()
{
int threads = 9;
int blocks = 1;
int N = blocks*threads;
float* h_a;
int* h_b;
float* d_a;
int* d_b;
h_a = (float*)malloc(N*sizeof(float));
h_b = (int*)malloc(N*sizeof(int));
cudaMalloc((void**)&d_a,N*sizeof(float));
cudaMalloc((void**)&d_b,N*sizeof(int));
h_a[0]= .333f;
h_a[1]= .324f;
h_a[2]= .123f;
h_a[3]= .543f;
h_a[4]= .00054f;
h_a[5]= .2243f;
h_a[6]= .333f;
h_a[7]= .53343f;
h_a[8]= .4434f;
cudaMemcpy(d_a,h_a,N*sizeof(float),cudaMemcpyHostToDevice);
counter<<<blocks,threads>>>(d_a,d_b,N);
cudaMemcpy(h_b,d_b,N*sizeof(int),cudaMemcpyDeviceToHost);
for(int i=0; i < N; i++)
{
printf("%f = %d times\n",h_a[i],h_b[i]);
}
cudaFree(d_a);
cudaFree(d_b);
free(h_a);
free(h_b);
getchar();
return 0;
}
Hello
I'm trying to write a CUDA kernel to perform the following piece of code.
for (n = 0; n < (total-1); n++)
{
a = values[n];
for ( i = n+1; i < total ; i++)
{
b = values[i] - a;
c = b*b;
if( c < 10)
newvalues[i] = c;
}
}
This is what I have currently, but it does not seem to be giving the correct results? does anyone know what I'm doing wrong. Cheers
__global__ void calc(int total, float *values, float *newvalues){
float a,b,c;
int idx = blockIdx.x * blockDim.x + threadIdx.x;
for (int n = idx; n < (total-1); n += blockDim.x*gridDim.x){
a = values[n];
for(int i = n+1; i < total; i++){
b = values[i] - a;
c = b*b;
if( c < 10)
newvalues[i] = c;
}
}
Realize this problem in 2D and launch your kernel with 2D thread blocks. The total number of threads in x and y dimension will be equal to total . The kernel code should look like this:
__global__ void calc(float *values, float *newvalues, int total){
float a,b,c;
int n= blockIdx.y * blockDim.y + threadIdx.y;
int i= blockIdx.x * blockDim.x + threadIdx.x;
if (n>=total || i>=total)
return;
a = values[n];
b = values[i] - a;
c = b*b;
if( c < 10)
newvalues[i] = c;
// I don't know your problem statement but i think it should be like: newvalues[n*total+i] = c;
}
Update:
This is how you should call the kernel
dim3 block(16,16);
dim3 grid ( (total+15)/16, (total+15)/16 );
calc<<<grid,block>>>(float *val, float *newval, int T);
Also make sure you add this line in kernel (see updated kernel)
if (n>=total || i>=total)
return;
Update 2:
fixed blockIdy.y, correct is blockIdx.y
I'll probably be way wrong but the n < (total-1) check in
for (int n = idx; n < (total-1); n += blockDim.x*gridDim.x)
seems different than the original version.
Why don't you just remove the outter loop and start the kernel with as many threads as you need for this loop? It's a bit weird to have a loop that depends on your blockId. Normally you try to avoid these loops.
Secondly it seems to me that newvalues[i] can be overriden by different threads.