What is an efficient way to broadcast a value to all the threads after using blockReduce from Cuda Unbound? - cuda

let column_mean_partial = blockReducer.Reduce(temp_storage, acc, fun a b -> a + b) / (float32 num_rows)
if threadIdx.x = 0 then
    means.[col] <- column_mean_partial
    column_mean_shared := column_mean_partial
__syncthreads()
let column_mean = !column_mean_shared
The above snippet calculates the mean of every column of a 2D matrix. Since only thread 0 of a block holds the full reduced value, I store it in shared memory (column_mean_shared), call __syncthreads(), and then read it back on every thread in the block, because all of them need the mean in order to calculate the variance.
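For reference, the same pattern written directly against CUB in CUDA C++ looks roughly like this (a sketch; the kernel name is mine and the variance pass is elided):

#include <cub/cub.cuh>

template <int BLOCK_THREADS>
__global__ void column_mean_kernel(const float *x, float *means, int num_rows, int num_cols)
{
    typedef cub::BlockReduce<float, BLOCK_THREADS> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    __shared__ float column_mean_shared;            // staging slot for the broadcast

    for (int col = blockIdx.x; col < num_cols; col += gridDim.x) {
        float acc = 0.0f;
        for (int row = threadIdx.x; row < num_rows; row += blockDim.x)
            acc += x[row + col * num_rows];         // column-major layout

        // Only thread 0 receives the block-wide sum from BlockReduce.
        float sum = BlockReduce(temp_storage).Sum(acc);
        if (threadIdx.x == 0) {
            float mean = sum / num_rows;
            means[col] = mean;
            column_mean_shared = mean;              // stage it for the other threads
        }
        __syncthreads();                            // make the staged value visible to the block
        float column_mean = column_mean_shared;     // every thread now has the mean
        (void)column_mean;                          // the variance pass would go here
        __syncthreads();                            // before the shared slots are reused
    }
}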
Would there be a better way to broadcast the value or is the above efficient enough already?

I hadn't expected much when I posted this question, but it turns out that for large matrices such as 128x10000 there is a much better way. I wrote a warpReduce kernel with a block size of 32, which allows it to do the whole reduction using shuffle xor.
For a 128x100000 matrix over 100 iterations, the first version that used 64 blocks per grid (and 32 threads per block) took 0.5s. For the CUB row reduce it took 0.25s.
When I increased the blocks per grid to 256, I got a nearly 4x speedup to about 1.5s. At 384 blocks per grid it takes 1.1s, and increasing the number of blocks further does not seem to improve performance.
For the problem sizes that I am interested in, the improvements are not nearly as dramatic.
For the 128x1024 and 128x512 cases, 10000 iterations:
For 1024: 1s vs 0.82s in favor of warpReduce.
For 512: 0.914s vs 0.873s in favor of warpReduce.
For small matrices, it seems that any speedups from parallelism are eaten away by kernel launch times.
For 256: 0.94s vs 0.78s in favor of warpReduce.
For 160: 0.88s vs 0.77s in favor of warpReduce.
It was tested using a GTX 970.
It is likely that the figures would be different on Kepler and earlier NVIDIA cards, since Maxwell doubled the number of resident blocks allowed per SM (from 16 to 32), which improves multiprocessor occupancy.
I am satisfied with this as the performance improvement is nice and I actually hadn't been able to write a kernel that uses shared memory correctly before reaching for the Cub block reduce. I forgot how painful Cuda can be sometimes. It is amazing that the version that uses no shared memory is so competitive.
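The heart of that warp kernel is a butterfly (xor) shuffle reduction. As a rough CUDA C++ equivalent of the helper used below (written with the __shfl_xor_sync form that CUDA 9 and later require), it would look something like this:

// After the five xor steps every lane of the warp holds the sum of all
// 32 lanes, so the result is already "broadcast" -- no shared memory needed.
__device__ float butterflyWarpReduce(float value)
{
    for (int mask = 16; mask > 0; mask >>= 1)
        value += __shfl_xor_sync(0xffffffff, value, mask, 32);
    return value;
}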
Here are the two modules that I tested with:
type rowModule(target) =
    inherit GPUModule(target)

    let grid_size = 64
    let block_size = 128
    let blockReducer = BlockReduce.RakingCommutativeOnly<float32>(dim3(block_size,1,1), worker.Device.Arch)

    [<Kernel;ReflectedDefinition>]
    member this.Kernel (num_rows:int) (num_cols:int) (x:deviceptr<float32>) (means:deviceptr<float32>) (stds:deviceptr<float32>) =
        // Each block starts on the column given by its block index.
        let mutable col = blockIdx.x
        let temp_storage = blockReducer.TempStorage.AllocateShared()
        let column_mean_shared = __shared__.Variable()
        while col < num_cols do
            // row is the row index
            let mutable row = threadIdx.x
            let mutable acc = 0.0f
            while row < num_rows do
                // idx is the absolute index in the array
                let idx = row + col * num_rows
                acc <- acc + x.[idx]
                // Increment the row index
                row <- row + blockDim.x
            let column_mean_partial = blockReducer.Reduce(temp_storage, acc, fun a b -> a + b) / (float32 num_rows)
            if threadIdx.x = 0 then
                means.[col] <- column_mean_partial
                column_mean_shared := column_mean_partial
            __syncthreads()
            let column_mean = !column_mean_shared
            row <- threadIdx.x
            acc <- 0.0f
            while row < num_rows do
                // idx is the absolute index in the array
                let idx = row + col * num_rows
                // Accumulate the variances.
                acc <- acc + (x.[idx]-column_mean)*(x.[idx]-column_mean)
                // Increment the row index
                row <- row + blockDim.x
            let variance_sum = blockReducer.Reduce(temp_storage, acc, fun a b -> a + b) / (float32 num_rows)
            if threadIdx.x = 0 then stds.[col] <- sqrt(variance_sum)
            col <- col + gridDim.x

    member this.Apply((dmat: dM), (means: dM), (stds: dM)) =
        let lp = LaunchParam(grid_size, block_size)
        this.GPULaunch <@ this.Kernel @> lp dmat.num_rows dmat.num_cols dmat.dArray.Ptr means.dArray.Ptr stds.dArray.Ptr
type rowWarpModule(target) =
    inherit GPUModule(target)

    let grid_size = 384
    let block_size = 32

    [<Kernel;ReflectedDefinition>]
    member this.Kernel (num_rows:int) (num_cols:int) (x:deviceptr<float32>) (means:deviceptr<float32>) (stds:deviceptr<float32>) =
        // Each block starts on the column given by its block index.
        let mutable col = blockIdx.x
        while col < num_cols do
            // row is the row index
            let mutable row = threadIdx.x
            let mutable acc = 0.0f
            while row < num_rows do
                // idx is the absolute index in the array
                let idx = row + col * num_rows
                acc <- acc + x.[idx]
                // Increment the row index
                row <- row + blockDim.x
            // Butterfly reduction over the warp using shuffle xor; afterwards
            // every thread in the warp holds the full sum.
            let inline butterflyWarpReduce (value:float32) =
                let v1 = value + __shfl_xor value 16 32
                let v2 = v1 + __shfl_xor v1 8 32
                let v3 = v2 + __shfl_xor v2 4 32
                let v4 = v3 + __shfl_xor v3 2 32
                v4 + __shfl_xor v4 1 32
            let column_mean = (butterflyWarpReduce acc) / (float32 num_rows)
            row <- threadIdx.x
            acc <- 0.0f
            while row < num_rows do
                // idx is the absolute index in the array
                let idx = row + col * num_rows
                // Accumulate the variances.
                acc <- acc + (x.[idx]-column_mean)*(x.[idx]-column_mean)
                // Increment the row index
                row <- row + blockDim.x
            let variance_sum = (butterflyWarpReduce acc) / (float32 num_rows)
            if threadIdx.x = 0 then
                stds.[col] <- sqrt(variance_sum)
                means.[col] <- column_mean
            col <- col + gridDim.x

    member this.Apply((dmat: dM), (means: dM), (stds: dM)) =
        let lp = LaunchParam(grid_size, block_size)
        this.GPULaunch <@ this.Kernel @> lp dmat.num_rows dmat.num_cols dmat.dArray.Ptr means.dArray.Ptr stds.dArray.Ptr

Related

Julia CUDA - Reduce matrix columns

Consider the following kernel, which reduces along the rows of a 2-D matrix
function row_sum!(x, ncol, out)
    """out = sum(x, dims=2)"""
    row_idx = (blockIdx().x-1) * blockDim().x + threadIdx().x
    for i = 1:ncol
        @inbounds out[row_idx] += x[row_idx, i]
    end
    return
end

N = 1024
x = CUDA.rand(Float64, N, 2*N)
out = CUDA.zeros(Float64, N)
@cuda threads=256 blocks=4 row_sum!(x, size(x)[2], out)
isapprox(out, sum(x, dims=2)) # true
How do I write a similar kernel except for reducing along the columns (of a 2-D matrix)? In particular, how do I get the index of each column, similar to how we got the index of each row with row_idx?
Here is the code:
function col_sum!(x, nrow, out)
    """out = sum(x, dims=1)"""
    col_idx = (blockIdx().x-1) * blockDim().x + threadIdx().x
    for i = 1:nrow
        @inbounds out[col_idx] += x[i, col_idx]
    end
    return
end

N = 1024
x = CUDA.rand(Float64, N, 2N)
out = CUDA.zeros(Float64, 2N)
@cuda threads=256 blocks=8 col_sum!(x, size(x, 1), out)
And here is the test:
julia> isapprox(out, vec(sum(x, dims=1)))
true
As you can see, the size of the result vector is now 2N instead of N, so we had to adapt the number of blocks accordingly (that is, multiply by 2, giving 8 blocks instead of 4).
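For comparison, here is a minimal CUDA C++ sketch of the same column reduction (my own names; one thread per column, with an explicit bounds check instead of relying on an exact launch size):

__global__ void col_sum(const double *x, double *out, int nrow, int ncol)
{
    // Same indexing idea as col_idx in the Julia kernel: one thread per column.
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (col >= ncol) return;

    double acc = 0.0;
    for (int i = 0; i < nrow; ++i)
        acc += x[i + (size_t)col * nrow];   // column-major storage
    out[col] = acc;
}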
More materials can be found here: https://juliagpu.gitlab.io/CUDA.jl/tutorials/introduction/

how to avoid thread divergence in this CUDA kernel?

The CUDA kernel code below exhibits the branch divergence shown. How can it be optimized?
int gx = threadIdx.x + blockDim.x * blockIdx.x;
val = g_data[gx];
if (gx % 4 == 0)
    val = op1(val);
else if (gx % 4 == 1)
    val = op2(val);
else if (gx % 4 == 2)
    val = op3(val);
else if (gx % 4 == 3)
    val = op4(val);
g_data[gx] = val;
If I were programming in CUDA, I certainly wouldn't do any of this. However to answer your question:
how to avoid thread divergence in this CUDA kernel?
You could do something like this:
int gx = threadIdx.x + blockDim.x * blockIdx.x;
val = g_data[gx];
int gx_bit_0 = gx & 1;
int gx_bit_1 = (gx & 2) >> 1;
val = (1-gx_bit_1)*(1-gx_bit_0)*op1(val) + (1-gx_bit_1)*(gx_bit_0)*op2(val) + (gx_bit_1)*(1-gx_bit_0)*op3(val) + (gx_bit_1)*(gx_bit_0)*op4(val);
g_data[gx] = val;
Here is a full test case:
$ cat t1914.cu
#include <iostream>
__device__ float op1(float val) { return val + 1.0f;}
__device__ float op2(float val) { return val + 2.0f;}
__device__ float op3(float val) { return val + 3.0f;}
__device__ float op4(float val) { return val + 4.0f;}
__global__ void k(float *g_data){
    int gx = threadIdx.x + blockDim.x * blockIdx.x;
    float val = g_data[gx];
    int gx_bit_0 = gx & 1;
    int gx_bit_1 = (gx & 2) >> 1;
    val = (1-gx_bit_1)*(1-gx_bit_0)*op1(val) + (1-gx_bit_1)*(gx_bit_0)*op2(val) + (gx_bit_1)*(1-gx_bit_0)*op3(val) + (gx_bit_1)*(gx_bit_0)*op4(val);
    g_data[gx] = val;
}
const int N = 32;
int main(){
    float *data;
    cudaMallocManaged(&data, N*sizeof(float));
    for (int i = 0; i < N; i++) data[i] = 1.0f;
    k<<<1,N>>>(data);
    cudaDeviceSynchronize();
    for (int i = 0; i < N; i++) std::cout << data[i] << std::endl;
}
$ nvcc -o t1914 t1914.cu
$ compute-sanitizer ./t1914
========= COMPUTE-SANITIZER
2
3
4
5
2
3
4
5
2
3
4
5
2
3
4
5
2
3
4
5
2
3
4
5
2
3
4
5
2
3
4
5
========= ERROR SUMMARY: 0 errors
$
Solution by changing the work per thread
The best solution with the existing data layout is to let every thread compute 4 consecutive values. It's better to have fewer threads that can work properly than have more that can't.
float* g_data;
int gx = threadIdx.x + blockDim.x * blockIdx.x;
g_data[4 * gx] = op1(g_data[4 * gx]);
g_data[4 * gx + 1] = op2(g_data[4 * gx + 1]);
g_data[4 * gx + 2] = op3(g_data[4 * gx + 2]);
g_data[4 * gx + 3] = op4(g_data[4 * gx + 3]);
If the size of g_data is not a multiple of 4, put an if around the index operations. If it is always a multiple of 4 and properly aligned, load and store 4 values as a float4 for better performance.
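A minimal sketch of that float4 variant (assuming g_data is 16-byte aligned, its length n is a multiple of 4, and op1..op4 are the device functions from the test case above):

__global__ void k_vec4(float *g_data, int n)
{
    int gx = threadIdx.x + blockDim.x * blockIdx.x;
    if (4 * gx + 3 >= n) return;                       // guard against over-launch

    // One 128-bit load brings in four consecutive values.
    float4 v = reinterpret_cast<float4 *>(g_data)[gx];
    v.x = op1(v.x);
    v.y = op2(v.y);
    v.z = op3(v.z);
    v.w = op4(v.w);
    // One 128-bit store writes them back.
    reinterpret_cast<float4 *>(g_data)[gx] = v;
}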
Solution by reordering the work
As all my talk about float4 may have suggested, your input data appears to be some 2D structure in which every fourth element shares the same function. Maybe it is an array of structs or an array of vectors -- in other words, a matrix.
For the purpose of explaining what I mean, I consider it an Nx4 matrix. If you transpose this into a 4xN matrix and apply a kernel to that, most of your problems disappear, because entries that need the same operation are then placed next to each other in memory, which makes writing an efficient kernel easier. Something like this:
float* g_data;
int rows_in_g;
int gx = threadIdx.x + blockDim.x * blockIdx.x;
int gy = threadIdx.y;
float& own_g = g_data[gx + rows_in_g * gy];
switch(gy) {
    case 0: own_g = op1(own_g); break;
    case 1: own_g = op2(own_g); break;
    case 2: own_g = op3(own_g); break;
    case 3: own_g = op4(own_g); break;
    default: break;
}
Start this as a 2D kernel with blocksize x=32, y=4 and gridsize x=N/32, y=1.
Now your kernel is still divergent, but all threads within a warp will execute the same case and access consecutive floats in memory. That's the best you can achieve. Of course this all depends on whether you can change the data layout.
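As a sketch, that launch might look like this (assuming the snippet above is wrapped in a hypothetical kernel k_transposed(float*, int) and that N, the number of entries per operation, is a multiple of 32):

dim3 block(32, 4);             // x: 32 consecutive entries, y: one of the four operations
dim3 grid(N / 32, 1);          // N entries per operation
k_transposed<<<grid, block>>>(g_data, N);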

kmer counts with cython implementation

I have this function implemented in Cython:
def count_kmers_cython(str string, list alphabet, int kmin, int kmax):
    """
    Count occurrence of kmers in a given string.
    """
    counter = {}
    cdef int i
    cdef int j
    cdef int N = len(string)
    limits = range(kmin, kmax + 1)
    for i in range(0, N - kmax + 1):
        for j in limits:
            kmer = string[i:i+j]
            counter[kmer] = counter.get(kmer, 0) + 1
    return counter
Can I do better with Cython? Is there any way to improve it?
I am new to Cython; this is my first attempt.
I will use this to count kmers in DNA with the alphabet restricted to 'ACGT'. The typical input string length is that of a bacterial genome (130 kb to over 14 Mb, where 1 kb = 1000 bp).
The size of the kmers will be 3 < kmer < 16.
I would also like to know whether I could go further and maybe use Cython in this function as well:
def compute_kmer_stats(kmer_list, counts, len_genome, max_e):
    """
    This function computes the z_score to find under/over represented kmers
    according to a cut off e-value.
    Inputs:
        kmer_list - a list of kmers
        counts - a dictionary-type with k-mers as keys and counts as values.
        len_genome - the total length of the sequence(s).
        max_e - cut off e-values to report under/over represented kmers.
    Outputs:
        results - a list of lists as [k-mer, observed count, expected count, z-score, e-value]
    """
    print(colored('Starting to compute the kmer statistics...\n',
                  'red',
                  attrs=['bold']))
    results = []
    # number of tests, used to convert p-value to e-value.
    n = len(list(kmer_list))
    for kmer in kmer_list:
        k = len(kmer)
        prefix, sufix, center = counts[kmer[:-1]], counts[kmer[1:]], counts[kmer[1:-1]]
        # avoid zero division error
        if center == 0:
            expected = 0
        else:
            expected = (prefix * sufix) // center
        observed = counts[kmer]
        sigma = math.sqrt(expected * (1 - expected / (len_genome - k + 1)))
        # avoid zero division error
        if sigma == 0.0:
            z_score = 0.0
        else:
            z_score = ((observed - expected) / sigma)
        # pvalue for all kmers/palindromes under represented
        p_value_under = (math.erfc(-z_score / math.sqrt(2)) / 2)
        # pvalue for all kmers/palindromes over represented
        p_value_over = (math.erfc(z_score / math.sqrt(2)) / 2)
        # evalue for all kmers/palindromes under represented
        e_value_under = (n * p_value_under)
        # evalue for all kmers/palindromes over represented
        e_value_over = (n * p_value_over)
        if e_value_under <= max_e:
            results.append([kmer, observed, expected, z_score, p_value_under, e_value_under])
        elif e_value_over <= max_e:
            results.append([kmer, observed, expected, z_score, p_value_over, e_value_over])
    return results
OBS - Thank you CodeSurgeon for the help. I know there are other tools that count kmers efficiently, but I am learning Python, so I am trying to write my own functions and code.

Is this a CUDA thread synchronization issue or something else?

I am very new to parallel programming and Stack Overflow. I am working on a matrix multiplication implementation using CUDA. I am using column-order float arrays as matrix representations.
The algorithm I developed is a bit unusual and goes as follows. Given an n x m matrix A and an m x k matrix B, I launch n x k blocks with m threads in each block. Essentially, I launch a block for every entry in the resulting matrix, with each thread computing one multiplication for that entry. For example,
1 0 0     0 1 2
0 1 0  *  3 4 5
0 0 1     6 7 8
For the first entry in the resulting matrix I would launch each thread with
thread 0 computing 1 * 3
thread 1 computing 0 * 0
thread 2 computing 0 * 1
With each thread adding to a 0-initialized matrix.
Right now, I am not getting a correct answer. I am getting this over and over again
0 0 2
0 0 5
0 0 8
My kernel function is below. Could this be a thread synchronization problem or am I screwing up array indexing or something?
/* @param d_A: Column order matrix
 * @param d_B: Column order matrix
 * @param d_result: 0-initialized matrix that kernels write to
 * @param dim_A: dimensionality of A (number of rows)
 * @param dim_B: dimensionality of B (number of rows)
 */
__global__ void dot(float *d_A, float *d_B, float *d_result, int dim_A, int dim_B) {
    int n = blockIdx.x;
    int k = blockIdx.y;
    int m = threadIdx.x;

    float a = d_A[(m * dim_A) + n];
    float b = d_B[(k * dim_B) + m];
    //d_result[(k * dim_A) + n] += (a * b);

    __syncthreads();
    float temp = d_result[(k*dim_A) + n];
    __syncthreads();
    temp = temp + (a * b);
    __syncthreads();
    d_result[(k*dim_A) + n] = temp;
    __syncthreads();
}
The whole idea of using __syncthreads() is wrong in this case. This API call has block scope.
__syncthreads();
float temp = d_result[(k*dim_A) + n];
__syncthreads();
temp = temp + (a * b);
__syncthreads();
d_result[(k*dim_A) + n] = temp;
__syncthreads();
The local variable float temp has thread scope, so putting a synchronization barrier around it is pointless.
The pointer d_result points to global memory, and the barrier does not help there either; note that there is no barrier that synchronizes threads across the whole grid.
Typically __syncthreads() is required when shared memory is used for the computation, and in this case you may well want to use shared memory. Here you can see an example of how to use shared memory and __syncthreads() properly, and here an example of matrix multiplication with shared memory.
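A minimal sketch of that shared-memory approach for this particular kernel (my own naming; it assumes the inner dimension m equals blockDim.x, that it is a power of two, and that blockDim.x * sizeof(float) bytes of dynamic shared memory are passed at launch):

__global__ void dot_shared(const float *d_A, const float *d_B, float *d_result,
                           int dim_A, int dim_B)
{
    extern __shared__ float partial[];       // blockDim.x floats, sized at launch
    int n = blockIdx.x;                      // output row
    int k = blockIdx.y;                      // output column
    int m = threadIdx.x;                     // position along the inner dimension

    // Each thread contributes one product of the dot product.
    partial[m] = d_A[(m * dim_A) + n] * d_B[(k * dim_B) + m];
    __syncthreads();

    // Tree reduction in shared memory (requires blockDim.x to be a power of two).
    for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
        if (m < stride)
            partial[m] += partial[m + stride];
        __syncthreads();
    }

    // A single thread writes the reduced value, so there is no race on global memory.
    if (m == 0)
        d_result[(k * dim_A) + n] = partial[0];
}
// Hypothetical launch: dot_shared<<<dim3(n_rows, k_cols), m_inner, m_inner * sizeof(float)>>>(...);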

CUDA Warp Synchronization Problem

In generalizing a kernel that shifts the values of a 2D array one space to the right (wrapping around the row boundaries), I have come across a warp synchronization problem. The full code is attached and included below.
The code is meant to work for arbitrary array width, array height, number of thread blocks, and number of threads per block. When choosing a thread count of 33 (i.e. one more thread than a full warp), the 33rd thread doesn't synchronize when __syncthreads() is called. This causes problems with the output data. The problem is only present when there is more than one warp and the width of the array is greater than the number of threads (e.g. with width=35 and 34 threads).
The following is a downsized example of what happens (in reality the array would need to have more elements for the kernel to produce the error).
Initial array:
0 1 2 3 4
5 6 7 8 9
Expected Result:
4 0 1 2 3
9 5 6 7 8
Kernel Produces:
4 0 1 2 3
8 5 6 7 8
The first line is done correctly (for each block, if there is more than one), with all subsequent lines having the second-to-last value repeated. I have tested this on two different cards (8600GT and GTX280) and get the same results. I would like to know whether this is just a bug in my kernel, or a problem that can't be fixed by adjusting my code.
The full source file is included below.
Thank you.
#include <cstdio>
#include <cstdlib>

// A method to ensure all reads use the same logical layout.
inline __device__ __host__ int loc(int x, int y, int width)
{
    return y*width + x;
}

//kernel to shift all items in a 2D array one position to the right (wrapping around rows)
__global__ void shiftRight ( int* globalArray, int width, int height)
{
    int temp1=0; //temporary swap variables
    int temp2=0;
    int blockRange=0; //the number of rows that a single block will shift

    if (height%gridDim.x==0) //logic to account for awkward array sizes
        blockRange = height/gridDim.x;
    else
        blockRange = (1+height/gridDim.x);

    int yStart = blockIdx.x*blockRange;
    int yEnd = yStart+blockRange; //the end condition for the y-loop
    yEnd = min(height,yEnd); //make sure that the array doesn't go out of bounds

    for (int y = yStart; y < yEnd ; ++y)
    {
        //do the first read so the swap variables are loaded for the x-loop
        temp1 = globalArray[loc(threadIdx.x,y,width)];
        //Each block shifts an entire row by itself, even if there are more columns than threads
        for (int threadXOffset = threadIdx.x ; threadXOffset < width ; threadXOffset+=blockDim.x)
        {
            //blockDim.x is added so that we store the next round of values
            //this has to be done now, because the next operation will
            //overwrite one of these values
            temp2 = globalArray[loc((threadXOffset + blockDim.x)%width,y,width)];
            __syncthreads(); //sync before the write to ensure all the values have been read
            globalArray[loc((threadXOffset +1)%width,y,width)] = temp1;
            __syncthreads(); //sync after the write to ensure all the values have been written
            temp1 = temp2; //swap the storage variables.
        }
        if (threadIdx.x == 0 && y == 0)
            globalArray[loc(12,2,width)]=globalArray[67];
    }
}
int main (int argc, char* argv[])
{
    //set the parameters to be used
    int width = 34;
    int height = 3;
    int threadsPerBlock=33;
    int numBlocks = 1;

    int memSizeInBytes = width*height*sizeof(int);

    //create the host data and assign each element of the array to equal its index
    int* hostData = (int*) malloc (memSizeInBytes);
    for (int y = 0 ; y < height ; ++y)
        for (int x = 0 ; x < width ; ++x)
            hostData [loc(x,y,width)] = loc(x,y,width);

    //create and allocate the device pointers
    int* deviceData;
    cudaMalloc ( &deviceData ,memSizeInBytes);
    cudaMemset ( deviceData,0,memSizeInBytes);
    cudaMemcpy ( deviceData, hostData, memSizeInBytes, cudaMemcpyHostToDevice);
    cudaThreadSynchronize();

    //launch the kernel
    shiftRight<<<numBlocks,threadsPerBlock>>> (deviceData, width, height);
    cudaThreadSynchronize();

    //copy the device data to a host array
    int* hostDeviceOutput = (int*) malloc (memSizeInBytes);
    cudaMemcpy (hostDeviceOutput, deviceData, memSizeInBytes, cudaMemcpyDeviceToHost);
    cudaFree (deviceData);

    //Print out the expected/desired device output
    printf("---- Expected Device Output ----\n");
    printf(" | ");
    for (int x = 0 ; x < width ; ++x)
        printf("%4d ",x);
    printf("\n---|-");
    for (int x = 0 ; x < width ; ++x)
        printf("-----");
    for (int y = 0 ; y < height ; ++y)
    {
        printf("\n%2d | ",y);
        for (int x = 0 ; x < width ; ++x)
            printf("%4d ",hostData[loc((x-1+width)%width,y,width)]);
    }
    printf("\n\n");

    printf("---- Actual Device Output ----\n");
    printf(" | ");
    for (int x = 0 ; x < width ; ++x)
        printf("%4d ",x);
    printf("\n---|-");
    for (int x = 0 ; x < width ; ++x)
        printf("-----");
    for (int y = 0 ; y < height ; ++y)
    {
        printf("\n%2d | ",y);
        for (int x = 0 ; x < width ; ++x)
            printf("%4d ",hostDeviceOutput[loc(x,y,width)]);
    }
    printf("\n\n");
}
Because not all threads execute the same number of loop iterations, synchronization is a problem! All threads should hit the same __syncthreads() calls, every time.
I would suggest transforming your innermost for loop into something like this:
for(int blockXOffset=0; blockXOffset < width; blockXOffset+=blockDim.x) {
    int threadXOffset=blockXOffset+threadIdx.x;
    bool isActive=(threadXOffset < width);
    if (isActive) temp2 = globalArray[loc((threadXOffset + blockDim.x)%width,y,width)];
    __syncthreads();
    if (isActive) globalArray[loc((threadXOffset +1)%width,y,width)] = temp1;
    __syncthreads();
    temp1 = temp2;
}
From the Programming Guide:
__syncthreads() is allowed in conditional code but only if the conditional evaluates identically across the entire thread block, otherwise the code execution is likely to hang or produce unintended side effects.
In the original code, not all threads executed the same number of loop iterations, so synchronization broke; with the loop rewritten as above, every thread performs the same number of iterations and reaches the same __syncthreads() calls.