Cuda Implementation of Partitioned Subgroup - cuda

is there a more efficient way to implement the "Partitioned Subgroup" functions of Vulkan/OpenGL, which do not have to loop over all elements in the subgroup? My current implementation just uses a loop from 0 to WARP_SIZE.
References:
(slide 37+38) https://developer.download.nvidia.com/video/gputechconf/gtc/2019/presentation/s9909-nvidia-vulkan-features-update.pdf
https://github.com/KhronosGroup/GLSL/blob/master/extensions/nv/GL_NV_shader_subgroup_partitioned.txt
Simple Implementation:
// Emulates Vulkan's subgroupPartitionNV for a two-component integer key:
// returns, for the calling lane, a 32-bit ballot whose set bits mark the
// lanes of the warp holding the same (x, y) value as this lane.
// Assumes a full, converged 32-lane warp (participation mask 0xFFFFFFFF).
__device__ uint32_t subgroupPartitionNV(ivec2 p)
{
    uint32_t result = 0;
    for (int i = 0; i < 32; ++i)
    {
        // Broadcast lane i's key to the whole warp.
        int x = __shfl_sync(0xFFFFFFFF, p(0), i);
        int y = __shfl_sync(0xFFFFFFFF, p(1), i);
        // Ballot of all lanes whose key equals lane i's key.
        uint32_t b = __ballot_sync(0xFFFFFFFF, p(0) == x && p(1) == y);
        // BUG FIX: '==' binds tighter than '&', so the original condition
        // 'i == threadIdx.x & 31' parsed as '(i == threadIdx.x) & 31' and
        // broke for blocks wider than one warp. Parenthesize the lane index.
        if (i == (threadIdx.x & 31)) result = b;
    }
    return result;
}
// Emulates Vulkan's subgroupPartitionedAddNV: sums 'value' over the lanes
// whose bits are set in 'ballot' (this lane's partition mask).
// BUG FIX: the original declared a uint32_t return type while accumulating
// a float sum, silently truncating the result through an implicit
// conversion; a partitioned add of floats must return the float sum.
__device__ float subgroupPartitionedAddNV(float value, uint32_t ballot)
{
    float result = 0.0f;
    for (unsigned int i = 0; i < 32; ++i)
    {
        // Every lane must participate in the shuffle; only partition
        // members contribute to the sum.
        float other_value = __shfl_sync(0xFFFFFFFF, value, i);
        if ((1U << i) & ballot) result += other_value;
    }
    return result;
}

Thanks to the hint of Abator I came up with a more efficient solution. It's a little ugly because labeled_partition is only implemented for int but works quite well.
// Partition the warp tile into groups of threads sharing the same ivec2 key,
// via cooperative_groups::labeled_partition applied to each component; the
// intersection of the two resulting group masks is the set of lanes whose
// whole (x, y) key matches.
// NOTE(review): relies on the undocumented internal helper
// details::_coalesced_group_data_access to extract and recombine group
// masks; this may break across CUDA toolkit versions -- confirm on upgrade.
template <int GROUP_SIZE = 32>
__device__ cooperative_groups::coalesced_group subgroupPartitionNV(ivec2 p)
{
using namespace cooperative_groups;
thread_block block = this_thread_block();
thread_block_tile<GROUP_SIZE> tile32 = tiled_partition<GROUP_SIZE>(block);
// Group lanes by the first key component, then by the second.
coalesced_group g1 = labeled_partition(tile32, p(0));
coalesced_group g2 = labeled_partition(tile32, p(1));
// A lane is in the final partition iff both components match, i.e. it is
// in the same g1 AND the same g2 as its peers.
details::_coalesced_group_data_access acc;
return acc.construct_from_mask<coalesced_group>(acc.get_mask(g1) & acc.get_mask(g2));
}
// Partitioned add over a coalesced group: shuffle distances start at half
// the tile width and halve each step; a lane folds in the incoming value
// only when the source rank is a real member of the (possibly smaller)
// group, so rank 0 of the group ends up holding the group's sum.
template <typename T, int GROUP_SIZE = 32>
__device__ T subgroupPartitionedAddNV(T value, cooperative_groups::coalesced_group group)
{
    const int member_count = group.size();
    const int my_rank = group.thread_rank();
    int distance = GROUP_SIZE / 2;
    while (distance > 0)
    {
        auto incoming = group.template shfl_down(value, distance);
        // Ignore shuffles that would read past the last group member.
        if (my_rank + distance < member_count) value += incoming;
        distance /= 2;
    }
    return value;
}

Related

My CUDA kernel code is not working

I try to make a small code to generate numbers and return the result in array but once I run this code it's not working, I have tried to use Nsight debugger to understand where is my problem but it freezes and closes immediately.
Could you help me please to understand where is the problem in this code?
// Question code, reproduced as posted. Builds two 5-digit sequences p and q
// from the block/thread coordinates plus the m[] pair, then copies p into
// 'out' for the first 10 global threads. The answer below explains why the
// host side of this program is broken (stack arrays passed as device
// pointers); the kernel itself is kept byte-identical here.
// NOTE(review): p, q, m and out decay to pointers; the host passes
// cudaMalloc'd or host-stack addresses for them -- see the answer text.
__global__ void mykernel( int* PF_tmp, int* PL_tmp, int* QF_tmp, int* QL_tmp,
int m[2], int p[5], int q[5], int i, int* n,
int out[10][5], int N)
{
int id = blockDim.x * blockIdx.x + threadIdx.x;
int idx = blockIdx.x;
int idy = blockIdx.y;
// Decompose blockIdx.x into a "hundreds" part w and remainder x.
int w = idx/100;
int x = idx%100;
int y = idy;
int z = threadIdx.x;
int len = ((i * 2) + 5);
// Fill PF_tmp & QF_tmp
// NOTE(review): when i > 0 this writes p[0..2i-1]; with p declared as a
// 5-element parameter the later writes at (i*2)+4 run past 5 for i > 0 --
// confirm intended sizes with the caller.
if( i > 0){
for(int k = 0; k < (i * 2); k++)
{
p[k] = PF_tmp[k];
q[k] = QF_tmp[k];
}
}
// Fill X: split x into tens and units digits.
if( x > 10)
{
p[(i*2)] = (x - (x % 10)) / 10;
p[(i*2)+1] = x % 10;
}else{
p[(i*2)] = 0;
p[(i*2)+1] = x;
}
// Fill Y: same two-digit split for y.
if( y > 10)
{
q[(i*2)] = (y - (y % 10)) / 10;
q[(i*2)+1] = y % 10;
}else{
q[(i*2)] = 0;
q[(i*2)+1] = y;
}
// Fill m
p[(i * 2)+2] = m[0];
q[(i * 2)+2] = m[1];
// Fill W: two-digit split for w.
if( w > 10)
{
p[(i*2)+3] = (w - (w % 10)) / 10;
p[(i*2)+4] = w % 10;
}else{
p[(i*2)+3] = 0;
p[(i*2)+4] = w;
}
// Fill Z: two-digit split for z.
if( z > 10)
{
q[(i*2)+3] = (z - (z % 10)) / 10;
q[(i*2)+4] = z % 10;
}else{
q[(i*2)+3] = 0;
q[(i*2)+4] = z;
}
// Fill PL_tmp & QL_tmp
if( i > 0)
{
for(int k = 0; k < (i * 2); k++)
{
p[(len-(i * 2))+k] = PL_tmp[k];
q[(len-(i * 2))+k] = QL_tmp[k];
}
}
// First 10 global threads publish their p sequence.
// NOTE(review): all threads share the same p/q buffers, so these writes
// race across threads -- confirm whether per-thread buffers were intended.
if(id<10)
{
for(int k =0; k<5; k++)
out[id][k] = p[k];
}
}
// Question code, reproduced as posted. As the answer below explains, this
// host code is broken:
//  - p, q, hst_out are stack arrays, yet their addresses are passed to
//    cudaMalloc as if they were pointers to be filled in;
//  - the sizes (5 and 50) are bytes, not 5*sizeof(int) / 50*sizeof(int);
//  - host arrays (m, hst_out) are passed directly to the kernel.
int main()
{
cudaError err;
dim3 blocks(10000, 100);
dim3 threads(100);
int m[2] = {4,5};
int hst_out[10][5];
int p[5];
int q[5];
// BUG (see answer): &p is the address of a stack array, and 5 is bytes.
err = cudaMalloc((void **)&p, 5);
err = cudaMalloc((void **)&q, 5);
err = cudaMalloc((void **)&hst_out, 50);
mykernel<<<blocks, threads>>>(NULL, NULL, NULL, NULL, m, p, q, 0, NULL, hst_out, 100000000);
return 0;
}
The error is very obvious; it is all basic C programming.
when you declare
int m[2] = {4,5};
int hst_out[10][5];
int p[5];
int q[5];
now hst_out, p, q are not pointers, but later they are used as pointers:
err = cudaMalloc((void **)&p, 5);
err = cudaMalloc((void **)&q, 5);
err = cudaMalloc((void **)&hst_out, 50);
so you should have declared it initially as a pointer instead, e.g.,
int *p;
and used it as this way:
err = cudaMalloc((void **)&p, 5*sizeof(int));
And notice too that the size you have declared is just 5 bytes, whereas I declared it as 5*sizeof(int).
For more example see:
http://cuda-programming.blogspot.sg/2013/03/how-to-avoid-uses-of-cudamalloc-in.html

The Floyd-Warshall algorithm in CUDA

This is the sequential piece of code I am trying to parallelize in CUDA
/*
Sequential (Single Thread) APSP on CPU.
*/
/*
 * Sequential (single-thread) all-pairs shortest path (Floyd-Warshall)
 * on the CPU. 'mat' is an N x N adjacency matrix stored row-major, with
 * -1 encoding "no edge / unreachable". Relaxed in place.
 */
void floyd_sequential(int *mat, const size_t N)
{
    for (int k = 0; k < N; k++) {
        for (int i = 0; i < N; i++) {
            for (int j = 0; j < N; j++) {
                const int ij = i * N + j;
                const int ik = i * N + k;
                const int kj = k * N + j;
                /* Relax i -> j through k only when both legs exist. */
                if (mat[ik] == -1 || mat[kj] == -1)
                    continue;
                const int detour = mat[ik] + mat[kj];
                if (mat[ij] == -1 || mat[ij] >= detour)
                    mat[ij] = detour;
            }
        }
    }
}
This is my CUDA implementation
// ParallelComputing.cpp : Defines the entry point for the console application.
//
#include <stdio.h>
#include <cuda.h>
#include <stdlib.h>
#define DIMENSION 10;
// Question code, reproduced as posted: one thread per matrix row, each
// thread running the full k/j loops. The answer below points out its
// problems: the launch is 1-D in x so Row (derived from y) is 0 for every
// thread, and reading/writing the same matrix in parallel races across
// threads. __syncthreads() here only synchronizes within a block, not the
// whole grid, so it cannot order the k-iterations across blocks.
__global__ void gpu_Floyd(int *result, int N)
{
int j,k;
int Row = blockIdx.y * blockDim.y + threadIdx.y;
for(k = 0; k < N; k++)
{
for(j = 0; j < N; j++)
{
int i0 = Row * N + j;
int i1 = Row * N + k;
int i2 = k * N + j;
if(result[i0] != -1 && result[i2] != -1)
result[i0] = (result[i0] != -1 && result[i0] < result[i1] + result[i2]) ?
result[i0] : (result[i1] + result[i2]);
__syncthreads();
}
}
}
/* Fill an N x N matrix with pseudo-random weights in [-1, 30];
 * -1 doubles as the "no edge" marker used by the Floyd-Warshall code. */
void GenMatrix(int *mat, const size_t N)
{
    const size_t total = N * N;
    for (size_t idx = 0; idx < total; ++idx)
        mat[idx] = rand() % 32 - 1;
}
/* Element-wise comparison of two int arrays of length eleNum.
 * Prints the first mismatch and returns false; returns true when equal. */
bool CmpArray(const int *l, const int *r, const size_t eleNum)
{
    for (size_t idx = 0; idx < eleNum; ++idx) {
        if (l[idx] == r[idx])
            continue;
        printf("ERROR: l[%d] = %d, r[%d] = %d\n", (int)idx, l[idx], (int)idx, r[idx]);
        return false;
    }
    return true;
}
// Question code, reproduced as posted. The answer below lists its defects:
//  - Floyd_sequential is called but the definition is floyd_sequential
//    (case mismatch), so this does not compile as posted;
//  - cudaMalloc/cudaMemcpy sizes omit sizeof(int);
//  - the first cudaMemcpy has only 3 arguments (needs dst, src, size, kind)
//    and never copies the input to the device;
//  - the kernel is launched with a 1-D grid although it indexes in y.
int main(int argc, char **argv)
{
// generate a random matrix.
size_t N = 10;
int *mat = (int*)malloc(sizeof(int)*N*N);
GenMatrix(mat, N);
// compute the reference result.
int *ref = (int*)malloc(sizeof(int)*N*N);
memcpy(ref, mat, sizeof(int)*N*N);
// BUG (see answer): name/case mismatch with floyd_sequential above.
Floyd_sequential(ref, N);
//CUDA Portion
int Grid_Dim_x = 1, Grid_Dim_y = 1;
int noThreads_x, noThreads_y;
int *result = (int*)malloc(sizeof(int)*N*N);
memcpy(result, mat, sizeof(int)*N*N);
int *d_result;
// compute your results
// BUG (see answer): sizes are bytes -- should be N*N*sizeof(int).
cudaMalloc((void **)&d_result, N*N);
// BUG (see answer): cudaMemcpy needs 4 arguments (dst, src, count, kind).
cudaMemcpy(result, N * N, cudaMemcpyHostToDevice);
gpu_Floyd<<<1024, 256>>>(d_result, N);
cudaMemcpy(result, d_result, cudaMemcpyDeviceToHost);
// compare your result with reference result
if(CmpArray(result, ref, N*N))
printf("The matrix matches.\n");
else
printf("The matrix do not match.\n");
free(ref);
free(result);
cudaFree(d_result);
}
However, my output always shows the matrices do not match.
I understand that in CUDA we try to map each element in the matrix to each row. However, I am trying to explore possibilities by mapping each row of the matrix to a thread instead.
As has already been mentioned, your provided GPU code does not compile, so I'm curious how you got to the observation that your output matrices do not match.
Here are some of the problems with your code:
cudaMalloc, just like malloc allocates bytes, so this is not correct:
cudaMalloc((void **)&d_result, N*N);
instead you want this:
cudaMalloc((void **)&d_result, N*N*sizeof(int));
likewise cudaMemcpy, just like memcpy, operates on bytes, and furthermore cudaMemcpy requires 4 parameters so this is not correct:
cudaMemcpy(result, N * N, cudaMemcpyHostToDevice);
instead you probably want this:
cudaMemcpy(d_result, result, N * N*sizeof(int), cudaMemcpyHostToDevice);
and your other cudaMemcpy line needs to be fixed similarly.
I'd also advise doing proper cuda error checking
Your kernel is written as if it's expecting a 2 dimensional thread array, or at least one dimensional in y, whereas you are launching a one dimensional grid in x:
gpu_Floyd<<<1024, 256>>>(d_result, N);
therefore all your kernel built-in variables in y will be 1 or 0 always, and this line of code:
int Row = blockIdx.y * blockDim.y + threadIdx.y;
will evaluate to zero for all threads in your 1-D grid in x.
Your gpu kernel is putting the results in the same matrix as the input data. For sequential code this may or may not matter, but for code that is intended to run in parallel, it can often lead to race conditions, because the order of operations (i.e. order of thread execution) is largely undefined.
Below you will find a canonical, simple implementation of the Floyd-Warshall algorithm in CUDA.
The CUDA code is accompanied with a sequential implementation and both are based on the simplifying assumption that the edges are non-negative. The full, minimum distance paths are also reconstructed in both the cases. Despite the simplifying assumption, it should be possible to grasp the relevant parallelization idea, namely that a two-dimensional thread grid is exploited and that each thread along x is assigned to a matrix column, while each block along y is assigned to a matrix row. In this way, all the columns are loaded by the threadIdx.x == 0 threads of each block in shared memory.
// --- Assumption: graph with positive edges
#include <stdio.h>
#include <string>
#include <map>
#include <iostream>
#include <fstream>
#include "Utilities.cuh"
#define BLOCKSIZE 256
using namespace std;
map<string, int> nameToNum; // --- names of vertices
map<string, map<string, int>> weightMap; // --- weights of edges
/************************/
/* READ GRAPH FROM FILE */
/************************/
// Parse a whitespace-separated edge list "vertex1 vertex2 weight ... --END--"
// into a dense, row-major N x N weight matrix (INT_MAX / 2 = "no edge").
// Side effects: fills the global nameToNum (vertex name -> index) and
// weightMap (name -> name -> weight) maps; N receives the vertex count.
// The returned matrix is malloc'd and owned by the caller.
int *readGraphFromFile(int &N, char *fileName) {
string vertex1, vertex2;
ifstream graphFile;
int currentWeight;
N = 0; // --- Init the number of found vertices
graphFile.open(fileName); // --- Open the graph file
graphFile >> vertex1; // --- Read first vertex
while(vertex1 != "--END--") { // --- Loop until the "--END--" marker is read
graphFile >> vertex2; // --- Read second vertex
graphFile >> currentWeight; // --- Read weight between first and second vertex
if (nameToNum.count(vertex1) == 0) { // --- If vertex has not yet been added ...
nameToNum[vertex1] = N; // assign a progressive number to the vertex
weightMap[vertex1][vertex1] = 0; // assign a zero weight to the "self-edge"
N++; // --- Update the found number of vertices
}
if (nameToNum.count(vertex2) == 0) {
nameToNum[vertex2] = N;
weightMap[vertex2][vertex2] = 0;
N++;
}
weightMap[vertex1][vertex2] = currentWeight; // --- Update weight between vertices 1 and 2
graphFile >> vertex1;
}
graphFile.close(); // --- Close the graph file
// --- Construct the array
int *weightMatrix = (int*) malloc(N * N * sizeof(int));
// --- Loop over all the vertex couples in the weights matrix
for (int ii = 0; ii < N; ii++)
for (int jj = 0; jj < N; jj++)
weightMatrix[ii * N + jj] = INT_MAX / 2; // --- Init the weights matrix elements to infinity
map<string, int>::iterator i, j;
// --- Loop over all the vertex couples in the map
// (*i).first and (*j).first are the vertex names, while (*i).second and (*j).second are their matrix indices
for (i = nameToNum.begin(); i != nameToNum.end(); ++i)
for (j = nameToNum.begin(); j != nameToNum.end(); ++j) {
// --- If there is a connection between vertices (*i).first and (*j).first, then update the weight matrix
if (weightMap[(*i).first].count((*j).first) != 0)
weightMatrix[N * (*i).second + (*j).second] = weightMap[(*i).first][(*j).first];
}
return weightMatrix;
}
/************************************/
/* PRINT MINIMUM DISTANCES FUNCTION */
/************************************/
// Print the N x N distance matrix 'a' as a labeled table: vertex names
// (from the global nameToNum map) head each row and column, and entries
// equal to INT_MAX / 2 (the "infinity" sentinel) print as "--".
// NOTE(review): assumes nameToNum's iteration order matches the matrix
// index order -- true here because indices were assigned during insertion
// and std::map iterates sorted by name; confirm names were inserted so
// that both orders coincide.
void printMinimumDistances(int N, int *a) {
map<string, int>::iterator i;
// --- Prints all the node labels at the first row
for (i = nameToNum.begin(); i != nameToNum.end(); ++i) printf("\t%s", i->first.c_str());
printf("\n");
i = nameToNum.begin();
// --- Loop over the rows
for (int p = 0; p < N; p++) {
printf("%s\t", i -> first.c_str());
// --- Loop over the columns
for (int q = 0; q < N; q++) {
int dd = a[p * N + q];
if (dd != INT_MAX / 2) printf("%d\t", dd);
else printf("--\t");
}
printf("\n");
i++;
}
}
// Recursively print the vertex labels along the minimum-distance path from
// 'row' to 'col', walking the predecessor matrix 'path' backwards; emits a
// diagnostic line when no path exists (predecessor == INT_MAX / 2).
void printPathRecursive(int row, int col, int *minimumDistances, int *path, int N) {
map<string, int>::iterator i = nameToNum.begin();
map<string, int>::iterator j = nameToNum.begin();
if (row == col) {advance(i, row); printf("%s\t", i -> first.c_str()); }
else {
// BUG FIX: the original format string used "%row", which is not a valid
// printf conversion ("%r" is undefined behavior); the three int
// arguments need "%d" specifiers.
if (path[row * N + col] == INT_MAX / 2) printf("%d %d %d No path exists\t\n", minimumDistances[row * N + col], row, col);
else {
printPathRecursive(row, path[row * N + col], minimumDistances, path, N);
advance(j, col);
printf("%s\t", j -> first.c_str());
}
}
}
void printPath(int N, int *minimumDistances, int *path) {
map<string, int>::iterator i;
map<string, int>::iterator j;
// --- Loop over the rows
i = nameToNum.begin();
for (int p = 0; p < N; p++) {
// --- Loop over the columns
j = nameToNum.begin();
for (int q = 0; q < N; q++) {
printf("From %s to %s\t", i -> first.c_str(), j -> first.c_str());
printPathRecursive(p, q, minimumDistances, path, N);
printf("\n");
j++;
}
i++;
}
}
/**********************/
/* FLOYD-WARSHALL CPU */
/**********************/
// CPU Floyd-Warshall over an N x N distance matrix using INT_MAX / 2 as
// "infinity" (so that infinity + infinity does not overflow an int).
// Also maintains h_graphPath, the predecessor matrix used for path
// reconstruction: when routing row->col through k is shorter, col's
// predecessor on the row->col path becomes its predecessor on k->col.
void h_FloydWarshall(int *h_graphMinimumDistances, int *h_graphPath, const int N) {
for (int k = 0; k < N; k++)
for (int row = 0; row < N; row++)
for (int col = 0; col < N; col++) {
if (h_graphMinimumDistances[row * N + col] > (h_graphMinimumDistances[row * N + k] + h_graphMinimumDistances[k * N + col])) {
h_graphMinimumDistances[row * N + col] = (h_graphMinimumDistances[row * N + k] + h_graphMinimumDistances[k * N + col]);
h_graphPath[row * N + col] = h_graphPath[k * N + col];
}
}
}
/*************************/
/* FLOYD-WARSHALL KERNEL */
/*************************/
// One Floyd-Warshall relaxation step for a fixed k, launched once per k.
// Grid layout: threads along x cover matrix columns, one block row in y per
// matrix row (see the launch in main: <<<dim3(iDivUp(N, BLOCKSIZE), N), BLOCKSIZE>>>).
__global__ void d_FloydWarshall(int k, int *d_graphMinimumDistances, int *d_graphPath, int N) {
int col = blockIdx.x * blockDim.x + threadIdx.x; // --- Each thread along x is assigned to a matrix column
int row = blockIdx.y; // --- Each block along y is assigned to a matrix row
// NOTE(review): threads with col >= N exit before the __syncthreads()
// below; formally all threads of a block must reach a barrier -- confirm
// this is acceptable for the targeted architectures.
if (col >= N) return;
int arrayIndex = N * row + col;
// --- Thread 0 of each block loads the single element (row, k) into shared
//     memory; the barrier then makes it visible to the whole block. (Each
//     block handles one row, so (row, k) is shared by all its threads.)
__shared__ int d_graphMinimumDistances_row_k;
if(threadIdx.x == 0) d_graphMinimumDistances_row_k = d_graphMinimumDistances[N * row + k];
__syncthreads();
if (d_graphMinimumDistances_row_k == INT_MAX / 2) // --- If element (row, k) = infinity, no update is needed
return;
int d_graphMinimumDistances_k_col = d_graphMinimumDistances[k * N + col];
if(d_graphMinimumDistances_k_col == INT_MAX / 2) // --- If element (k, col) = infinity, no update is needed
return;
int candidateBetterDistance = d_graphMinimumDistances_row_k + d_graphMinimumDistances_k_col;
if (candidateBetterDistance < d_graphMinimumDistances[arrayIndex]) {
d_graphMinimumDistances[arrayIndex] = candidateBetterDistance;
d_graphPath[arrayIndex] = d_graphPath[k * N + col];
}
}
/********/
/* MAIN */
/********/
// Driver: read a graph from file, run Floyd-Warshall on CPU and GPU, and
// print distances and reconstructed paths for both. Uses gpuErrchk/iDivUp
// from Utilities.cuh.
int main() {
int N = 0; // --- Number of vertices
// --- Read graph array from file
int *h_graphArray = readGraphFromFile(N, "graph2.txt");
printf("\n******************\n");
printf("* Original graph *\n");
printf("******************\n");
printMinimumDistances(N, h_graphArray);
// --- Floyd-Warshall on CPU
int *h_graphMinimumDistances = (int *) malloc(N * N * sizeof(int));
int *h_graphPath = (int *) malloc(N * N * sizeof(int));
memcpy(h_graphMinimumDistances, h_graphArray, N * N * sizeof(int));
// --- Initial predecessor matrix: k for existing edges, infinity otherwise.
for (int k = 0; k < N; k++)
for (int l = 0; l < N; l++)
if (h_graphArray[k * N + l] == INT_MAX / 2) h_graphPath[k * N + l] = INT_MAX / 2;
else h_graphPath[k * N + l] = k;
h_FloydWarshall(h_graphMinimumDistances, h_graphPath, N);
printf("\n*************************\n");
printf("* CPU result: distances *\n");
printf("*************************\n");
printMinimumDistances(N, h_graphMinimumDistances);
printf("\n********************\n");
printf("* CPU result: path *\n");
printf("********************\n");
printPath(N, h_graphMinimumDistances, h_graphPath);
// --- Graph array device allocation and host-device memory transfer
int *d_graphMinimumDistances; gpuErrchk(cudaMalloc(&d_graphMinimumDistances, N * N * sizeof(int)));
gpuErrchk(cudaMemcpy(d_graphMinimumDistances, h_graphArray, N * N * sizeof(int), cudaMemcpyHostToDevice));
int *d_graphPath; gpuErrchk(cudaMalloc(&d_graphPath, N * N * sizeof(int)));
// --- Re-derive the initial predecessor matrix (h_graphPath was modified
//     by the CPU run above) before uploading it for the GPU run.
for (int k = 0; k < N; k++)
for (int l = 0; l < N; l++)
if (h_graphArray[k * N + l] == INT_MAX / 2) h_graphPath[k * N + l] = INT_MAX / 2;
else h_graphPath[k * N + l] = k;
gpuErrchk(cudaMemcpy(d_graphPath, h_graphPath, N * N * sizeof(int), cudaMemcpyHostToDevice));
// --- Iterations: one kernel launch per k keeps the k-steps ordered.
for (int k = 0; k < N; k++) {
d_FloydWarshall <<<dim3(iDivUp(N, BLOCKSIZE), N), BLOCKSIZE>>>(k, d_graphMinimumDistances, d_graphPath, N);
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
}
// --- Copy results back to the host
gpuErrchk(cudaMemcpy(h_graphMinimumDistances, d_graphMinimumDistances, N * N * sizeof(int), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_graphPath, d_graphPath, N * N * sizeof(int), cudaMemcpyDeviceToHost));
printf("\n**************\n");
printf("* GPU result *\n");
printf("**************\n");
printMinimumDistances(N, h_graphMinimumDistances);
printf("\n********************\n");
printf("* GPU result: path *\n");
printf("********************\n");
printPath(N, h_graphMinimumDistances, h_graphPath);
}

Line detection on CUDA

I am trying to do real time line detection using CUDA. I have calculated the hough transform along with the min, max line coordinates of each bin. For getting the line segments I am tracing (using Bresenham's line algorithm) through the min to max point and get the line segments on each bin. When the hough threshold is low and when lot of lines are there in the image trace_lines takes lot of time to complete.
hough transform (hough_line_transform) computation takes around 5-10ms per frame(1280x720) on a GTX 660 (observed to be 10 times faster than CPU implementation). But tracing the line segments from the min, max points takes 1ms-15ms.
I have two questions on line detection
Does there exist a better algorithm to get the line segments from the min, max points of the hough bins?
Is it possible to optimize hough_line_transform (see the code below) further? I am using atomic operations. Is it possible to avoid atomics?
I am attaching the code below.
class Header
#ifndef _HOUGH_LINES_H_
#define _HOUGH_LINES_H_
#include <cuda_gl_interop.h>
#include <thrust/device_vector.h>
union Pos;
struct Line;
// Image / accumulator geometry shared (via constant copy in device memory)
// by all Hough kernels.
struct Hough_params
{
int w; // frame width in pixels
int h; // frame height in pixels
int r; // number of rho bins: ceil(0.5 * sqrt(w^2 + h^2)) (see constructor)
};
// GPU Hough-transform line detector operating on an OpenGL edge texture;
// detected segment coordinates are written into an OpenGL buffer object.
class Hough_lines
{
public:
enum Type {INT, SHORT_INT, FLOAT};
Hough_lines(int _w, int _h);
~Hough_lines();
public:
// Upload the sin/cos lookup tables on first use.
bool init();
// Full pipeline: map tex_edge, run the transform, trace segments into
// 'line'; 'count' receives the number of segments.
bool detect_lines(GLuint tex_edge, int threshold, int min_length, int min_gap, GLuint line, Type type, int& count);
protected:
// Compact the coordinates of all edge pixels into d_coords.
void get_edges(thrust::device_vector<Pos>& d_coords, int& size);
// Accumulate the Hough transform and extract candidate lines per bin.
void get_hough_lines(int threshold, thrust::device_vector<Line>& d_lines, int& size);
// Candidate extraction + tracing, writing coordinates into 'line'.
void get_lines(int threshold, int min_length, int min_gap, GLuint line, Hough_lines::Type type, int& count);
// Bresenham-trace each candidate into final segments.
void trace_all_lines(int min_len, int min_gap, thrust::device_vector<Line>& d_lines, int size, int* d_line_coord, int& count);
static void compute_trig_funcs();
protected:
Hough_params params; // host copy of the geometry
thrust::device_vector<Hough_params> d_param; // device copy (1 element)
static bool trig_init; // trig tables uploaded once per process
};
#endif
Body
#include <hough_lines.h>
#include <math.h>
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cuda_gl_interop.h>
#include <thrust/host_vector.h>
#include <thrust/copy.h>
#include <thrust/scan.h>
#define ANGLE_SIZE 360
#define MAX_LINE_PER_THREAD 10
// Packed 2-D pixel coordinate: the (x, y) pair aliases a single uint32_t so
// a whole coordinate can be compared/updated with one 32-bit atomic
// (see atomicMin/atomicMax on .value in hough_line_transform).
// NOTE(review): with this field order the packed comparison is ordered by y
// in the high half on little-endian GPUs -- confirm that is the intended
// "min/max endpoint" ordering.
union Pos
{
struct
{
uint16_t x;
uint16_t y;
};
uint32_t value;
};
// Per-(angle, rho) accumulator bin: extreme edge pixels seen plus hit count.
struct Hough_info
{
Pos end; // max packed coordinate (init 0x0)
Pos start; // min packed coordinate (init 0xFFFFFFFF)
int count; // number of edge pixels voting for this bin
};
// Candidate line: extreme endpoints of a bin that passed the threshold.
struct Line
{
Pos start;
Pos end;
};
// Per-thread output of trace_lines: up to MAX_LINE_PER_THREAD segments.
struct Line_info
{
int line_count;
Line line[MAX_LINE_PER_THREAD];
};
// Sine / cosine lookup tables, one entry per degree (0..359), filled once
// by Hough_lines::compute_trig_funcs via cudaMemcpyToSymbol.
__constant__ float dev_sint[ANGLE_SIZE];
__constant__ float dev_cost[ANGLE_SIZE];
// Edge image (255 marks an edge pixel); bound in Hough_lines::detect_lines.
texture<uint8_t, 2, cudaReadModeElementType> luma_tex;
bool Hough_lines::trig_init = false; // set after the first table upload
// Write 1 into edge[pos] for every edge pixel (luma == 255), else 0.
// One thread per pixel; 'edge' is later exclusive-scanned to compact the
// edge coordinates (see get_edges / get_coords).
__global__ void mark_edges(const Hough_params* param, int* edge)
{
    int x = (blockIdx.x*blockDim.x+threadIdx.x);
    int y = (blockIdx.y*blockDim.y+threadIdx.y);
    // Bounds guard: the launch in get_edges sizes the grid as w/16 x h/16,
    // which never overshoots, but an explicit check keeps the kernel safe
    // for any launch configuration (e.g. ceil-divided grids).
    if (x >= param->w || y >= param->h) return;
    int pos = x+(param->w*y);
    edge[pos] = (255 == tex2D(luma_tex, x, y))?1:0;
}
// Compact edge-pixel coordinates: 'edge' now holds the exclusive prefix sum
// of the edge flags, so for each edge pixel it gives that pixel's slot in
// the dense 'coord' output array.
__global__ void get_coords(const Hough_params* param, int* edge, Pos* coord)
{
int index;
int x = (blockIdx.x*blockDim.x+threadIdx.x);
int y = (blockIdx.y*blockDim.y+threadIdx.y);
int pos = x+(param->w*y);
if (255 == tex2D(luma_tex, x, y))
{
// Scatter this pixel into its scanned position.
index = edge[pos];
coord[index].y = y;
coord[index].x = x;
}
}
// Hough accumulation: one block per angle (gridDim.x == 360), threads
// cooperatively sweep all 'size' edge pixels. For each pixel the rho bin is
// computed against an origin at the image center; per-bin shared atomics
// track the vote count and the packed min/max endpoints.
// NOTE(review): sh_rho_data is a fixed 1001 entries -- assumes param->r
// (ceil(0.5*sqrt(w^2+h^2))) never exceeds 1001; confirm for large frames.
__global__ void hough_line_transform(const Hough_params* param, int size, const Pos* coord, int threshold, int *mark, Hough_info* out)
{
int i;
int angle;
int rdata;
__shared__ Hough_info sh_rho_data[1001];
// Cooperatively reset all rho bins for this angle.
i = threadIdx.x;
while (i < param->r)
{
sh_rho_data[i].end.value = 0x0;
sh_rho_data[i].start.value = 0xFFFFFFFF;
sh_rho_data[i].count = 0;
i += blockDim.x;
}
__syncthreads();
i = threadIdx.x;
angle = blockIdx.x;
const float cos_angle = dev_cost[angle];
const float sin_angle = dev_sint[angle];
// Each thread strides over the edge pixels and votes.
while (i < size)
{
rdata = (int)ceil(((float)(coord[i].x-(param->w>>1))*cos_angle)+((float)((param->h>>1)-coord[i].y)*sin_angle));
if (rdata >= 0)
{
// Packed-coordinate atomics keep the extreme endpoints of the bin.
atomicMax(&sh_rho_data[rdata].end.value, coord[i].value);
atomicMin(&sh_rho_data[rdata].start.value, coord[i].value);
atomicAdd(&sh_rho_data[rdata].count, 1);
}
i += blockDim.x;
}
__syncthreads();
// Publish the bins to global memory and flag those over threshold
// ('mark' is later exclusive-scanned to compact the candidate lines).
i = threadIdx.x;
rdata = (angle*param->r);
while (i < param->r)
{
memcpy(&out[rdata+i], &sh_rho_data[i], sizeof(Hough_info));
mark[rdata+i] = (sh_rho_data[i].count >= threshold)?1:0;
i += blockDim.x;
}
}
// Compact the over-threshold bins into a dense 'lines' array: 'mark' holds
// the exclusive prefix sum of the threshold flags, giving each surviving
// bin its output slot. One block per angle, threads stride over rho bins.
__global__ void get_lines(const Hough_params* param, int threshold, Hough_info* hdata, int* mark, Line* lines)
{
int pos;
int i = threadIdx.x;
int offset = (blockIdx.x*param->r);
while (i < param->r)
{
if (hdata[offset+i].count >= threshold)
{
pos = mark[offset+i];
// The bin's extreme endpoints become the candidate segment.
lines[pos].start.value = hdata[offset+i].start.value;
lines[pos].end.value = hdata[offset+i].end.value;
}
i += blockDim.x;
}
}
// Append the segment (xs,ys)-(xe,ye) to this thread's line list when its
// Manhattan length reaches min_len and the list is not yet full.
__device__ void add_line(int xs, int ys, int xe, int ye, int min_len, Line_info* line)
{
    const int manhattan = abs(xe-xs) + abs(ye-ys);
    if (manhattan < min_len || line->line_count >= MAX_LINE_PER_THREAD)
        return;
    Line* slot = &line->line[line->line_count];
    slot->start.x = xs;
    slot->start.y = ys;
    slot->end.x = xe;
    slot->end.y = ye;
    ++line->line_count;
    //printf("\n(%d %d) (%d %d) %d", xs, ys, xe, ye, manhattan);
}
// One thread per candidate line: walk from start to end with Bresenham's
// algorithm, sampling the edge texture, and split the walk into segments
// wherever a run of >= min_gap non-edge pixels occurs. Segments of
// Manhattan length >= min_len are appended via add_line; mark[i] receives
// the segment count for a later prefix sum (see trace_all_lines).
__global__ void trace_lines(const Line* input, int inp_size, int min_len, int min_gap, Line_info* line_info, int* mark)
{
int d;
int dsub;
int dstep;
int xstep;
int ystep;
int xs, ys, xe, ye;
int i = (blockIdx.x*blockDim.x+threadIdx.x);
if (i >= inp_size)
{
return;
}
xs = input[i].start.x;
ys = input[i].start.y;
xe = input[i].end.x;
ye = input[i].end.y;
line_info[i].line_count = 0;
int dx = abs(xe-xs);
int dy = abs(ye-ys);
int xinc = (xe > xs)?1:-1;
int yinc = (ye > ys)?1:-1;
int gap = 0;
bool sflag;
int s_x, s_y, e_x, e_y;
// Configure the Bresenham stepper for the dominant axis: 'step' moves
// along the major axis every iteration, 'inc' applies the minor-axis
// correction when the error term d overflows.
if (dx > dy)
{
dsub = (dx<<1);
dstep = (dy<<1);
d = dstep-dx;
xstep = xinc;
ystep = 0;
xinc = 0;
}
else
{
dsub = (dy<<1);
dstep = (dx<<1);
d = dstep-dy;
xstep = 0;
ystep = yinc;
yinc = 0;
}
// sflag == true while we are inside a segment; (s_x,s_y) is its start,
// (e_x,e_y) the last edge pixel seen.
sflag = true;
s_x = xs;
s_y = ys;
e_x = xs;
e_y = ys;
int x = xs;
int y = ys;
while ((abs(x-xs) <= dx) && (abs(y-ys) <= dy))
{
x += xstep;
y += ystep;
if (d > 0)
{
x += xinc;
y += yinc;
d -= dsub;
}
d += dstep;
if (255 == tex2D(luma_tex, x, y))
{
// Edge pixel: extend the current segment (or start a new one).
e_x = x;
e_y = y;
gap = 0;
if (!sflag)
{
s_x = x;
s_y = y;
sflag = true;
}
}
else if (sflag)
{
// Non-edge pixel inside a segment: close it once the gap is long enough.
++gap;
if (gap >= min_gap)
{
sflag = false;
add_line(s_x, s_y, e_x, e_y, min_len, &line_info[i]);
}
}
}
// Flush the segment still open at the end of the walk.
if (sflag)
{
add_line(s_x, s_y, xe, ye, min_len, &line_info[i]);
}
mark[i] = line_info[i].line_count;
}
// Flatten the per-thread segment lists into one coordinate array: 'mark'
// holds the exclusive prefix sum of segment counts, so each thread writes
// its 4-int (x1,y1,x2,y2) tuples at offset 4*mark[index]. Coordinates are
// re-centered on the image midpoint with y pointing up (OpenGL style).
// NOTE(review): the final '*count = mark[index]' stores the EXCLUSIVE scan
// value of the last thread, omitting that thread's own line_count -- the
// reported total looks undercounted by the last entry; confirm intended.
__global__ void copy_line_coords(const Hough_params* param, Line_info* line, int size, int* mark, int* coords, int* count)
{
int index = (blockIdx.x*blockDim.x+threadIdx.x);
if (index >= size)
{
return;
}
int pos;
int start = 4*mark[index];
Line* line_data = &line[index].line[0];
for (int i = 0; i < line[index].line_count; i++)
{
pos = start+(4*i);
coords[pos] = line_data[i].start.x-(param->w>>1);
coords[pos+1] = (param->h>>1)-line_data[i].start.y;
coords[pos+2] = line_data[i].end.x-(param->w>>1);
coords[pos+3] = (param->h>>1)-line_data[i].end.y;
}
if ((index+1) == size)
{
*count = mark[index];
}
}
// Store the frame geometry, derive the rho-bin count from the half
// diagonal, and mirror the parameters into device memory (d_param).
Hough_lines::Hough_lines(int _w, int _h)
:d_param(1)
{
params.w = _w;
params.h = _h;
// Half the image diagonal, rounded up = number of non-negative rho bins.
params.r = (int)ceil(0.5*sqrt((_w*_w)+(_h*_h)));
thrust::copy_n(&params, 1, d_param.begin());
}
// Intentionally empty: members (thrust::device_vector) release their own
// storage in their destructors.
Hough_lines::~Hough_lines()
{
}
// One-time initialization: upload the sin/cos lookup tables to constant
// memory on the first call; subsequent calls are no-ops. Always succeeds.
bool Hough_lines::init()
{
    if (!trig_init)
    {
        trig_init = true;
        compute_trig_funcs();
    }
    return true;
}
// Build 360-entry sine/cosine tables (one entry per degree) and upload
// them to the constant-memory arrays dev_sint / dev_cost.
void Hough_lines::compute_trig_funcs()
{
    float theta;
    cudaError_t err = cudaSuccess;
    static float sint[ANGLE_SIZE];
    static float cost[ANGLE_SIZE];
    for (int i = 0; i < ANGLE_SIZE; i++)
    {
        theta = (M_PI*(float)i)/180.0;
        sint[i] = sin(theta);
        cost[i] = cos(theta);
    }
    err = cudaMemcpyToSymbol(dev_sint, sint, ANGLE_SIZE*sizeof(float));
    err = (cudaSuccess == err) ? cudaMemcpyToSymbol(dev_cost, cost, ANGLE_SIZE*sizeof(float)):err;
    if (cudaSuccess != err)
    {
        // BUG FIX: report the captured status 'err'; the original queried
        // cudaGetLastError(), which may have been cleared or overwritten
        // by the time the message is printed.
        printf("\n%s", cudaGetErrorString(err));
    }
}
// Compact the coordinates of all edge pixels into d_coords via the classic
// mark / exclusive-scan / scatter pattern; 'size' receives the edge count.
// NOTE(review): 'size' is read from the LAST element of the exclusive scan,
// which excludes the final pixel's own flag -- if the bottom-right pixel is
// an edge, the count looks short by one; confirm intended.
void Hough_lines::get_edges(thrust::device_vector<Pos>& d_coords, int& size)
{
dim3 bsize(16, 16);
// Grid truncates to w/16 x h/16; assumes w and h are multiples of 16.
dim3 gsize(params.w/bsize.x, params.h/bsize.y);
thrust::device_vector<int> d_mark(params.w*params.h);
size = 0;
// 1) Flag edge pixels.
mark_edges<<<gsize, bsize>>>(thrust::raw_pointer_cast(d_param.data()),
thrust::raw_pointer_cast(d_mark.data()));
// 2) Exclusive scan turns flags into output slots.
thrust::exclusive_scan(d_mark.begin(), d_mark.end(), d_mark.begin());
// 3) Scatter each edge pixel's (x, y) into its slot.
get_coords<<<gsize, bsize>>>(thrust::raw_pointer_cast(d_param.data()),
thrust::raw_pointer_cast(d_mark.data()),
thrust::raw_pointer_cast(d_coords.data()));
thrust::copy_n(d_mark.begin()+d_mark.size()-1, 1, &size);
}
// Run the accumulation stage: collect edge pixels, vote into the
// (angle, rho) accumulator, then compact the over-threshold bins into
// d_lines (one candidate segment per bin); 'size' receives the candidate
// count via the same mark/scan pattern as get_edges.
void Hough_lines::get_hough_lines(int threshold, thrust::device_vector<Line>& d_lines, int& size)
{
int edge_count = 0;
thrust::device_vector<Pos> d_coords(params.w*params.h);
get_edges(d_coords, edge_count);
thrust::device_vector<int> d_mark(params.r*360);
thrust::device_vector<Hough_info> d_hough_data(params.r*360);
// One block per angle (360), 256 threads striding over edge pixels.
hough_line_transform<<<360, 256>>>(thrust::raw_pointer_cast(d_param.data()),
edge_count,
thrust::raw_pointer_cast(d_coords.data()), threshold,
thrust::raw_pointer_cast(d_mark.data()),
thrust::raw_pointer_cast(d_hough_data.data()));
// Scan the threshold flags into output slots, then compact.
thrust::exclusive_scan(d_mark.begin(), d_mark.end(), d_mark.begin());
::get_lines<<<360, 256>>>(thrust::raw_pointer_cast(d_param.data()),
threshold,
thrust::raw_pointer_cast(d_hough_data.data()),
thrust::raw_pointer_cast(d_mark.data()),
thrust::raw_pointer_cast(d_lines.data()));
thrust::copy_n(d_mark.begin()+d_mark.size()-1, 1, &size);
}
// Trace every candidate line into final segments (trace_lines), prefix-sum
// the per-thread segment counts, and flatten the segments' coordinates into
// the mapped GL buffer d_line_coord; 'count' receives the segment total
// computed on the device by copy_line_coords.
void Hough_lines::trace_all_lines(int min_len, int min_gap, thrust::device_vector<Line>& d_lines, int size, int* d_line_coord, int& count)
{
thrust::device_vector<int> d_mark_line(size);
thrust::device_vector<Line_info> d_nlines(size);
// One thread per candidate line.
trace_lines<<<(1+(size/512)), 512>>>(thrust::raw_pointer_cast(d_lines.data()),
size, min_len, min_gap, thrust::raw_pointer_cast(d_nlines.data()),
thrust::raw_pointer_cast(d_mark_line.data()));
// Segment counts -> output offsets.
thrust::exclusive_scan(d_mark_line.begin(), d_mark_line.end(), d_mark_line.begin());
thrust::device_vector<int> d_count(1);
copy_line_coords<<<(1+(size/512)), 512>>>(thrust::raw_pointer_cast(d_param.data()),
thrust::raw_pointer_cast(d_nlines.data()), size,
thrust::raw_pointer_cast(d_mark_line.data()), d_line_coord,
thrust::raw_pointer_cast(d_count.data()));
thrust::copy(d_count.begin(), d_count.end(), &count);
//printf("\nLine count: %d", count);
}
// Host-side pipeline for one frame: map the GL line buffer for CUDA access,
// run accumulation (get_hough_lines) and tracing (trace_all_lines) writing
// segment coordinates straight into the buffer, then unmap/unregister it.
// NOTE(review): 'type' is accepted but never used here -- confirm whether
// it was meant to select the output coordinate format.
void Hough_lines::get_lines(int threshold, int min_len, int min_gap, GLuint line, Hough_lines::Type type, int& count)
{
int* d_line_coord = 0;
cudaGLRegisterBufferObject(line);
cudaGLMapBufferObject((void **)&d_line_coord, line);
int size = 0;
thrust::device_vector<Line> d_lines(params.r*360);
get_hough_lines(threshold, d_lines, size);
//printf("\nget_hough_lines: %d", size);
trace_all_lines(min_len, min_gap, d_lines, size, d_line_coord, count);
cudaGLUnmapBufferObject(line);
cudaGLUnregisterBufferObject(line);
}
// Entry point for one frame: map the GL edge texture into a CUDA array,
// bind it to luma_tex, run the full detection pipeline timed with CUDA
// events, and release all graphics resources. Returns true on completion
// (registration/mapping failures call exit(0) after printing the error).
bool Hough_lines::detect_lines(GLuint tex_edge, int threshold, int min_length, int min_gap, GLuint line, Hough_lines::Type type, int& count)
{
cudaError_t err;
cudaArray* array_edge;
cudaGraphicsResource* res_edge;
err = cudaGraphicsGLRegisterImage(&res_edge, tex_edge, GL_TEXTURE_2D, cudaGraphicsRegisterFlagsReadOnly);
if (err != cudaSuccess)
{
printf("cudaGraphicsGLRegisterImage Failed: %s", cudaGetErrorString(cudaGetLastError()));
exit(0);
}
cudaGraphicsMapResources(1, &res_edge);
cudaChannelFormatDesc chan_desc = cudaCreateChannelDesc<uint8_t>();
err = cudaGraphicsSubResourceGetMappedArray(&array_edge, res_edge, 0, 0);
if (err != cudaSuccess)
{
printf("cudaGraphicsSubResourceGetMappedArray Failed: %s", cudaGetErrorString(cudaGetLastError()));
exit(0);
}
if (cudaBindTextureToArray(&luma_tex, array_edge, &chan_desc) != cudaSuccess)
{
printf("Failed to bind texture - %s\n", cudaGetErrorString(cudaGetLastError()));
exit(0);
}
// Time the whole pipeline with CUDA events.
float time = 0.0;
//static float max = 0.0;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
count = 0;
get_lines(threshold, min_length, min_gap, line, type, count);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
//static int frame = 0;
//frame++;
//if (time > max)
{
//max = time;
printf("\nElpased time: %f ms", time);
}
cudaEventDestroy(start);
cudaEventDestroy(stop);
// Release the texture binding and graphics interop resources.
cudaUnbindTexture(luma_tex);
cudaGraphicsUnmapResources(1, &res_edge);
cudaGraphicsUnregisterResource(res_edge);
return true;
}
In "Prefix sums and their applications", Guy Blelloch described a parallel line-drawing algorithm using parallel prefix sum. See page 55 of that paper, it might give you ideas.
Regarding how to optimize hough_line_transform, I think the key is to eliminate shared memory atomics in your loop. Where you use them you are effectively doing keyed reductions. Thrust provides a reduce_by_key function, but that is only callable from the host. The device-library counterpart to Thrust is CUB, but it does not have reduce_by_key. I've asked the CUB authors for ideas here and if we come up with anything I'll update this answer.
You could write your own keyed reduction but it would be more productive and robust to rely on a library if possible.

CUDA __syncthreads(); not working; inverse in breakpoint hit order

I have a problem, I think with __syncthreads();, in the following code:
// Question code, reproduced as posted: Blelloch work-efficient exclusive
// scan (up-sweep + down-sweep) over n booleans, adapted from GPU Gems 3.
// THE BUG the question is about (see the answer at the end of this
// section): every __syncthreads() below sits inside 'if(thid<(n>>1))', so
// when n>>1 is less than the block size, some threads never reach the
// barrier -- __syncthreads() in non-uniform conditional code is undefined
// behavior and explains the out-of-order breakpoint hits described.
__device__ void prefixSumJoin(const bool *g_idata, int *g_odata, int n)
{
__shared__ int temp[Config::bfr*Config::bfr]; // allocated on invocation
int thid = threadIdx.y*blockDim.x + threadIdx.x;
if(thid<(n>>1))
{
int offset = 1;
temp[2*thid] = (g_idata[2*thid]?1:0); // load input into shared memory
temp[2*thid+1] = (g_idata[2*thid+1]?1:0);
for (int d = n>>1; d > 0; d >>= 1) // build sum in place up the tree
{
__syncthreads();
if (thid < d)
{
int ai = offset*(2*thid+1)-1; // <-- breakpoint B
int bi = offset*(2*thid+2)-1;
temp[bi] += temp[ai];
}
offset *= 2;
}
if (thid == 0) { temp[n - 1] = 0; } // clear the last element
for (int d = 1; d < n; d *= 2) // traverse down tree & build scan
{
offset >>= 1;
__syncthreads();
if (thid < d)
{
int ai = offset*(2*thid+1)-1;
int bi = offset*(2*thid+2)-1;
int t = temp[ai];
temp[ai] = temp[bi];
temp[bi] += t;
}
}
__syncthreads();
g_odata[2*thid] = temp[2*thid]; // write results to device memory
g_odata[2*thid+1] = temp[2*thid+1];
}
}
// Question code, reproduced as posted (with the asker's '...' placeholders;
// not compilable as shown). Marks matching records, then calls the scan
// above to compute each selected record's output offset.
// NOTE(review): the call at breakpoint A enters prefixSumJoin, whose
// conditional __syncthreads() bug (explained above) leaves
// selectedRecordsOffset not yet ready at breakpoint C.
__global__ void selectKernel3(...)
{
int tidx = threadIdx.x;
int tidy = threadIdx.y;
int bidx = blockIdx.x;
int bidy = blockIdx.y;
int tid = tidy*blockDim.x + tidx;
int bid = bidy*gridDim.x+bidx;
int noOfRows1 = ...;
int noOfRows2 = ...;
__shared__ bool isRecordSelected[Config::bfr*Config::bfr];
__shared__ int selectedRecordsOffset[Config::bfr*Config::bfr];
isRecordSelected[tid] = false;
selectedRecordsOffset[tid] = 0;
__syncthreads();
if(tidx<noOfRows1 && tidy<noOfRows2)
if(... == ...)
isRecordSelected[tid] = true;
__syncthreads();
prefixSumJoin(isRecordSelected,selectedRecordsOffset,Config::bfr*Config::bfr); // <-- breakpoint A
__syncthreads();
if(isRecordSelected[tid]==true){
{
some_instruction;// <-- breakpoint C
...
}
}
}
...
// Question code, reproduced as posted (missing return type and launch
// argument list as shown). Launches selectKernel3 with 16x16-thread blocks
// on a 13x5 grid.
f(){
dim3 dimGrid(13, 5);
dim3 dimBlock(Config::bfr, Config::bfr);
selectKernel3<<<dimGrid, dimBlock>>>(...)
}
//other file
// Compile-time configuration shared by the kernels above.
class Config
{
public:
static const int bfr = 16; // blocking factor = number of rows per block
public:
Config(void);
~Config(void);
};
The prefixSum is from GPU Gems 3: Parallel Prefix Sum (Scan) with CUDA, with little change.
Ok, now I set 3 breakpoints: A, B, C. They should be hit in the order A, B, C. The problem is that they are hit in the order: A, B*x, C, B. So at point C, selectedRecordsOffset is not ready and it causes errors. After A the B is hit a few times, but not all and then C is hit and it goes further in the code and then B again for the rest of the loop. x is different depending on the input (for some inputs there isn't any inversion in the breakpoints so C is the last that was hit).
Moreover if I look on thread numbers that cause a hit it is for A and C threadIdx.y = 0 and for B threadIdx.y = 10. How is this possible while it is the same block so why some threads omit sync? There is no conditional sync.
Does anyone have any ideas on where to look for bugs?
If you need some more clarification, just ask.
Thanks in advance for any advice on how to work this out.
Adam
Thou shalt not use __syncthreads() in conditional code if the condition does not evaluate uniformly across all threads of each block.

Creating identity matrix with CUDA

Hi, I try to create an identity matrix with CUDA, but the output is just zeros.
// Question code, reproduced as posted. Why it prints only zeros:
// 'offset = x * y' is the same single cell for a whole block (and 0
// whenever either block index is 0), every inner-loop iteration overwrites
// that one cell, and the last write of the j-loop wins -- so most of the
// matrix is never touched and the touched cells usually end as 0.
// threadIdx is never used, so all threads of a block do identical work.
__global__ void initIdentityGPU(int *devMatrix, int numR, int numC) {
int x = blockIdx.x;
int y = blockIdx.y;
int offset = x * y;
for (int i = 0; i < x ; i++) {
for (int j = 0; j < numR; j++) {
if (i == j)
devMatrix[offset] = 1;
else
devMatrix[offset] = 0;
}
}
}
Why only it puts 0s ?
The simplest way how to do it is:
// Answer's version: one thread per matrix cell; writes 1 on the diagonal
// and 0 elsewhere. Note the signature takes int** -- a device array of
// row pointers, not a flat matrix; launch configuration shown below.
__global__ void initIdentityGPU(int **devMatrix, int numR, int numC) {
int x = blockDim.x*blockIdx.x + threadIdx.x;
int y = blockDim.y*blockIdx.y + threadIdx.y;
// Bounds guard for partial blocks at the matrix edges.
if(y < numR && x < numC) {
if(x == y)
devMatrix[y][x] = 1;
else
devMatrix[y][x] = 0;
}
}
and you launch it as:
dim3 blockDim(BLOCK_DIM_X, BLOCK_DIM_Y);
dim3 gridDim((numC + BLOCK_DIM_X - 1) / BLOCK_DIM_X, (numR + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y);
initIdentityGPU<<<gridDim, blockDim>>>(matrix, numR, numC);
It simply runs as many threads as matrix cells, each thread obtains the coordinates of its cell and in a case the cell is in the diagonal of matrix it assigns 1 or 0 otherwise. Note the code is untested.