I want to ask how can I convert this texture:
texture<int, 2, cudaReadModeElementType> text1;
to
texture<int4, 2, cudaReadModeElementType> text2;
So my problem is with the indexes. For example, imagine that I acceesed to the col 10 row 5 in text1 like this
int col 10;
int row 5;
int tex2d(text1, col, row);
But if I did the same, obviously I'm not accessing to the same data:
int col 10;
int row 5;
int4 tex2d(text2, col, row);
So, I tried like dividing the col / 4 and row /4, but I don't succeed either. I think that I should use DIV and MOD, but I don't know how.
Does anyone know how to access correctly using 4 channels? Thank you very much!
The only indexing you need to modify between the two cases is the horizontal (x) indexing. For the int4 case, the horizontal indexing can be divided by 4 (as compared to the int case) but it will retrieve 4 values. Here is a full example:
$ cat t1918.cu
#include <helper_cuda.h>
#include <cstdio>
#define HEIGHT 7680
#ifndef USE_INT4
#define WIDTH 7245
typedef int it;
#else
#define WIDTH 1812
typedef int4 it;
#endif
cudaArray * Array_Device;
texture<it, 2,cudaReadModeElementType> Image;
__global__ void k(int x, int y)
{
int w;
#ifdef USE_INT4
w = WIDTH*4;
#else
w = WIDTH;
#endif
for (y = 0; y < HEIGHT; y++)
for (x = 0; x < w; x++){
int nx=x, no=0;
#ifdef USE_INT4
no = x&3; //modulo by 4
nx >>= 2; //division by 4
#endif
it val = tex2D(Image,nx,y);
int rval = reinterpret_cast<int *>(&val)[no];
if (rval != y*10000+x) {
printf("mismatch at %d, %d, was: %d, should be: %d\n", x,y, rval, y*10000+x);
return;
}
}
}
void p()
{
it *h = new it[WIDTH*HEIGHT];
// this dataset and test-case only works for textures up to width of 9999 for int or 2499 for int4
for (int i = 0; i < HEIGHT; i++)
for (int j = 0; j < WIDTH; j++){
#ifndef USE_INT4
h[i*WIDTH+j] = i*10000+j;
#else
h[i*WIDTH+j] = {i*10000+j*4+0, i*10000+j*4+1, i*10000+j*4+2, i*10000+j*4+3};
#endif
}
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<it>();
checkCudaErrors(cudaMallocArray(&Array_Device, &channelDesc,WIDTH,HEIGHT ));
checkCudaErrors(cudaBindTextureToArray(Image,Array_Device));
checkCudaErrors(cudaMemcpy2DToArray( Array_Device,
0,
0,
h,
WIDTH*sizeof(it),
WIDTH*sizeof(it),
HEIGHT,
cudaMemcpyHostToDevice));
k<<<1,1>>>(0,0);
checkCudaErrors(cudaDeviceSynchronize());
}
int main(){
#ifdef USE_INT4
printf("int4\n");
#endif
p();
return 0;
}
$ nvcc -I/usr/local/cuda/samples/common/inc -o t1918 t1918.cu
t1918.cu(40): warning: function "tex2D(texture<T, 2, cudaReadModeElementType>, float, float) [with T=it]"
/usr/local/cuda/bin/../targets/x86_64-linux/include/texture_fetch_functions.h(198): here was declared deprecated
t1918.cu: In function ‘void p()’:
t1918.cu:63:49: warning: ‘cudaError_t cudaBindTextureToArray(const texture<T, dim, readMode>&, cudaArray_const_t) [with T = int; int dim = 2; cudaTextureReadMode readMode = (cudaTextureReadMode)0; cudaError_t = cudaError; cudaArray_const_t = const cudaArray*]’ is deprecated [-Wdeprecated-declarations]
checkCudaErrors(cudaBindTextureToArray(Image,Array_Device));
^
/usr/local/cuda/bin/../targets/x86_64-linux/include/cuda_runtime.h:1642:53: note: declared here
static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaBindTextureToArray(
^~~~~~~~~~~~~~~~~~~~~~
/usr/local/cuda/bin/../targets/x86_64-linux/include/cuda_runtime.h: In instantiation of ‘cudaError_t cudaBindTextureToArray(const texture<T, dim, readMode>&, cudaArray_const_t) [with T = int; int dim = 2; cudaTextureReadMode readMode = (cudaTextureReadMode)0; cudaError_t = cudaError; cudaArray_const_t = const cudaArray*]’:
t1918.cu:63:49: required from here
/usr/local/cuda/bin/../targets/x86_64-linux/include/cuda_runtime.h:1650:55: warning: ‘cudaError_t cudaBindTextureToArray(const texture<T, dim, readMode>&, cudaArray_const_t, const cudaChannelFormatDesc&) [with T = int; int dim = 2; cudaTextureReadMode readMode = (cudaTextureReadMode)0; cudaError_t = cudaError; cudaArray_const_t = const cudaArray*]’ is deprecated [-Wdeprecated-declarations]
return err == cudaSuccess ? cudaBindTextureToArray(tex, array, desc) : err;
~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~
/usr/local/cuda/bin/../targets/x86_64-linux/include/cuda_runtime.h:1603:53: note: declared here
static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaBindTextureToArray(
^~~~~~~~~~~~~~~~~~~~~~
/usr/local/cuda/bin/../targets/x86_64-linux/include/cuda_runtime.h: In instantiation of ‘cudaError_t cudaBindTextureToArray(const texture<T, dim, readMode>&, cudaArray_const_t, const cudaChannelFormatDesc&) [with T = int; int dim = 2; cudaTextureReadMode readMode = (cudaTextureReadMode)0; cudaError_t = cudaError; cudaArray_const_t = const cudaArray*]’:
/usr/local/cuda/bin/../targets/x86_64-linux/include/cuda_runtime.h:1650:55: required from ‘cudaError_t cudaBindTextureToArray(const texture<T, dim, readMode>&, cudaArray_const_t) [with T = int; int dim = 2; cudaTextureReadMode readMode = (cudaTextureReadMode)0; cudaError_t = cudaError; cudaArray_const_t = const cudaArray*]’
t1918.cu:63:49: required from here
/usr/local/cuda/bin/../targets/x86_64-linux/include/cuda_runtime.h:1609:32: warning: ‘cudaError_t cudaBindTextureToArray(const textureReference*, cudaArray_const_t, const cudaChannelFormatDesc*)’ is deprecated [-Wdeprecated-declarations]
return ::cudaBindTextureToArray(&tex, array, &desc);
~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~
/usr/local/cuda/bin/../targets/x86_64-linux/include/cuda_runtime_api.h:8662:46: note: declared here
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaBindTextureToArray(const struct textureReference *texref, cudaArray_const_t array, const struct cudaChannelFormatDesc *desc);
^~~~~~~~~~~~~~~~~~~~~~
$ cuda-memcheck ./t1918
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$ nvcc -I/usr/local/cuda/samples/common/inc -o t1918 t1918.cu -DUSE_INT4
t1918.cu(40): warning: function "tex2D(texture<T, 2, cudaReadModeElementType>, float, float) [with T=it]"
/usr/local/cuda/bin/../targets/x86_64-linux/include/texture_fetch_functions.h(198): here was declared deprecated
t1918.cu: In function ‘void p()’:
t1918.cu:63:49: warning: ‘cudaError_t cudaBindTextureToArray(const texture<T, dim, readMode>&, cudaArray_const_t) [with T = int4; int dim = 2; cudaTextureReadMode readMode = (cudaTextureReadMode)0; cudaError_t = cudaError; cudaArray_const_t = const cudaArray*]’ is deprecated [-Wdeprecated-declarations]
checkCudaErrors(cudaBindTextureToArray(Image,Array_Device));
^
/usr/local/cuda/bin/../targets/x86_64-linux/include/cuda_runtime.h:1642:53: note: declared here
static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaBindTextureToArray(
^~~~~~~~~~~~~~~~~~~~~~
/usr/local/cuda/bin/../targets/x86_64-linux/include/cuda_runtime.h: In instantiation of ‘cudaError_t cudaBindTextureToArray(const texture<T, dim, readMode>&, cudaArray_const_t) [with T = int4; int dim = 2; cudaTextureReadMode readMode = (cudaTextureReadMode)0; cudaError_t = cudaError; cudaArray_const_t = const cudaArray*]’:
t1918.cu:63:49: required from here
/usr/local/cuda/bin/../targets/x86_64-linux/include/cuda_runtime.h:1650:55: warning: ‘cudaError_t cudaBindTextureToArray(const texture<T, dim, readMode>&, cudaArray_const_t, const cudaChannelFormatDesc&) [with T = int4; int dim = 2; cudaTextureReadMode readMode = (cudaTextureReadMode)0; cudaError_t = cudaError; cudaArray_const_t = const cudaArray*]’ is deprecated [-Wdeprecated-declarations]
return err == cudaSuccess ? cudaBindTextureToArray(tex, array, desc) : err;
~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~
/usr/local/cuda/bin/../targets/x86_64-linux/include/cuda_runtime.h:1603:53: note: declared here
static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaBindTextureToArray(
^~~~~~~~~~~~~~~~~~~~~~
/usr/local/cuda/bin/../targets/x86_64-linux/include/cuda_runtime.h: In instantiation of ‘cudaError_t cudaBindTextureToArray(const texture<T, dim, readMode>&, cudaArray_const_t, const cudaChannelFormatDesc&) [with T = int4; int dim = 2; cudaTextureReadMode readMode = (cudaTextureReadMode)0; cudaError_t = cudaError; cudaArray_const_t = const cudaArray*]’:
/usr/local/cuda/bin/../targets/x86_64-linux/include/cuda_runtime.h:1650:55: required from ‘cudaError_t cudaBindTextureToArray(const texture<T, dim, readMode>&, cudaArray_const_t) [with T = int4; int dim = 2; cudaTextureReadMode readMode = (cudaTextureReadMode)0; cudaError_t = cudaError; cudaArray_const_t = const cudaArray*]’
t1918.cu:63:49: required from here
/usr/local/cuda/bin/../targets/x86_64-linux/include/cuda_runtime.h:1609:32: warning: ‘cudaError_t cudaBindTextureToArray(const textureReference*, cudaArray_const_t, const cudaChannelFormatDesc*)’ is deprecated [-Wdeprecated-declarations]
return ::cudaBindTextureToArray(&tex, array, &desc);
~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~
/usr/local/cuda/bin/../targets/x86_64-linux/include/cuda_runtime_api.h:8662:46: note: declared here
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaBindTextureToArray(const struct textureReference *texref, cudaArray_const_t array, const struct cudaChannelFormatDesc *desc);
^~~~~~~~~~~~~~~~~~~~~~
$ cuda-memcheck ./t1918
========= CUDA-MEMCHECK
int4
========= ERROR SUMMARY: 0 errors
$
The data stored in the texture in the int case looks like this:
0 1 2 3 4 5 6 7 ...
1000 1001 1002 1003 1004 1005 1006 1007...
2000 2001 2002 2003 2004 2005 2006 2007...
3000 3001 3002 3003 3004 3005 3006 3007...
...
In the int4 case it looks like this:
{ 0, 1, 2, 3} { 4, 5, 6, 7} ...
{1000,1001,1002,1003} {1004,1005,1006,1007} ...
{2000,2001,2002,2003} {2004,2005,2006,2007} ...
{3000,3001,3002,3003} {3004,3005,3006,3007} ...
...
The kernel demonstrates how to retrieve the same value for a given (x,y) coordinate given to the kernel, in either case.
Note that textures are deprecated and for new work you should switch to texture objects.
Related
To test out dynamic parallelism, I wrote a simple code and compiled it on GTX1080 with the following commands.
nvcc -arch=sm_35 -dc dynamic_test.cu -o dynamic_test.o
nvcc -arch=sm_35 dynamic_test.o -lcudadevrt -o dynamic_test
However, the output is not as expected. It seems like the pointers passed to the child kernel are de-referenced.
#include <stdlib.h>
#include <stdio.h>
#include <cublas_v2.h>
#include <cuda_runtime_api.h>
__global__ void child(int *a, int *b, int *c){
int tid = threadIdx.x;
c[tid] = a[tid] + b[tid];
}
__global__ void Parent(int *a, int *b, int *c){
int tid = threadIdx.x;
const int n = 10;
a[tid] = tid;
b[tid] = 2*tid;
c[tid] = -10;
__syncthreads();
cudaDeviceSynchronize();
if (tid == 1){
child<<<1,n>>>(a,b,c);
cudaDeviceSynchronize();
}
}
int main(){
int *d_a, *d_b, *d_c;
const int n = 10;
int a[n],b[n],c[n],i;
cudaMalloc((void**)&d_a,n*sizeof(int));
cudaMalloc((void**)&d_b,n*sizeof(int));
cudaMalloc((void**)&d_c,n*sizeof(int));
Parent << < 1, n >>> (d_a,d_b,d_c);
cudaDeviceSynchronize();
cudaMemcpy(a,d_a,n*sizeof(int),cudaMemcpyDeviceToHost);
cudaMemcpy(b,d_b,n*sizeof(int),cudaMemcpyDeviceToHost);
cudaMemcpy(c,d_c,n*sizeof(int),cudaMemcpyDeviceToHost);
for(i=0; i<n; i++){
printf("a[%d] = %d\n",i,a[i]);
}
for(i=0; i<n; i++){
printf("b[%d] = %d\n",i,b[i]);
}
for(i=0; i<n; i++){
printf("c[%d] = %d\n",i,c[i]);
}
cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
return 0;
}
Here's the output:
a[0] = 1
a[1] = 0
a[2] = 4208446
a[3] = 0
a[4] = 0
a[5] = 0
a[6] = 0
a[7] = 0
a[8] = 0
a[9] = 0
b[0] = 3
b[1] = 0
b[2] = 4204323
b[3] = 0
b[4] = 4205312
b[5] = 0
b[6] = 4732449
b[7] = 0
b[8] = 4205680
b[9] = 0
c[0] = 194906208
c[1] = 32767
c[2] = 4204143
c[3] = 0
c[4] = 4205616
c[5] = 0
c[6] = 4732608
c[7] = 0
c[8] = 4231155
c[9] = 0
Reading from the programming guide, I should be able to pass global variables to child kernels without causing any deferencing. I am not sure why the output is incorrect. My ultimate goal is using the cublas library from within kernels. Any suggestion in that direction will also be helpful.
The problem was solved by switching from cuda 7.5 to cuda 8.0.
I want to write a prefix scan for large arrays using the instruction in GPUgem, It's a homework for my parallel class. I did follow all the steps in the book but still my code's not working. I got it to work for array size 4096 but it's not working for larger arrays. Here is my code :
#include <stdio.h>
#include <sys/time.h>
#define THREADS 1024
typedef int mytype;
__global__ void phaseI(mytype *g_odata, mytype *g_idata, int n, mytype *aux)
{
__shared__ mytype temp[THREADS];
const int tid1 = threadIdx.x;
int offset = 1;
temp[2*tid1] = g_idata[2*tid1]; // load input into shared memory
temp[2*tid1+1] = g_idata[2*tid1+1];
for (int d = THREADS>>1; d > 0; d >>= 1) // build sum in place up the tree
{
__syncthreads();
if (tid1 < d)
{
int ai = offset*(2*tid1+1)-1;
int bi = offset*(2*tid1+2)-1;
temp[bi] += temp[ai];
}
offset *= 2;
}
__syncthreads();
if (tid1 == 0) {
aux[blockIdx.x] = temp[THREADS - 1];
temp[THREADS - 1] = 0;
}
for (int d = 1; d < THREADS; d *= 2) // traverse down tree & build scan
{
offset >>= 1;
__syncthreads();
if (tid1 < d)
{
int ai = offset*(2*tid1+1)-1;
int bi = offset*(2*tid1+2)-1;
mytype t = temp[ai];
temp[ai] = temp[bi];
temp[bi] += t;
}
}
__syncthreads();
g_odata[2*thid] = temp[2*thid]; // write results to device memory
g_odata[2*thid+1] = temp[2*thid+1];
}
__global__ void phaseII(mytype *g_odata, mytype *aux, int n)
{
const int tid1 = threadIdx.x;
const int B = (n / THREADS);
int offset = 1;
for (int d = B>>1; d > 0; d >>= 1) // build sum in place up the tree
{
__syncthreads();
if (tid1 < d)
{
int ai = offset*(2*tid1+1)-1;
int bi = offset*(2*tid1+2)-1;
temp[bi] += temp[ai];
}
offset *= 2;
}
__syncthreads();
if (tid1 == 0 && blockIdx.x == 0) {
aux[B - 1] = 0;
}
for (int d = 1; d < B; d *= 2) // traverse down tree & build scan
{
offset >>= 1;
__syncthreads();
if (tid1 < d)
{
int ai = offset*(2*tid1+1)-1;
int bi = offset*(2*tid1+2)-1;
mytype t = temp[ai];
temp[ai] = temp[bi];
temp[bi] += t;
}
}
__syncthreads();
g_odata[2*thid] += aux[blockIdx.x];
g_odata[2*thid+1] += aux[blockIdx.x];
}
int main(int argc, char *argv[])
{
if (argc != 2) {
printf("usage: %s n\n", argv[0]);
return -1;
}
const int n = atoi(argv[1]);
mytype *h_i, *d_i, *h_o, *d_o, *d_temp;
const int size = n * sizeof(mytype);
h_i = (mytype *)malloc(size);
h_o = (mytype *)malloc(size);
if ((h_i == NULL) || (h_o == NULL)) {
printf("malloc failed\n");
return -1;
}
for (int i = 0; i < n; i++) {
h_i[i] = i;
h_o[i] = 0;
}
cudaMalloc(&d_i, size);
cudaMalloc(&d_temp, (n / THREADS) );
cudaMalloc(&d_o, size);
cudaMemset(d_o, 0, size);
cudaMemset(d_temp, 0, (n / THREADS));
cudaMemcpy(d_i, h_i, size, cudaMemcpyHostToDevice);
int blocks = n / THREADS;
phaseI<<<blocks, THREADS / 2 >>>(d_o, d_i, n, d_temp);
phaseII<<<blocks, THREADS / 2>>>(d_o, d_temp, n);
cudaThreadSynchronize();
cudaMemcpy(h_o, d_o, size, cudaMemcpyDeviceToHost);
printf("\n");
for (int i = 0; i < n ; i++) {
printf(" %d", h_o[i]);
}
printf("\n\n");
return 0;
}
Does anyone have any idea what I'm doing wrong?
One possible error I see in your code is here:
aux[thid] = temp[THREADS];
If your temp array is temp[1024], as you say, and each block has 1024 threads, as you say, then if THREADS is 1024, temp[THREADS] will access your shared memory array out-of-bounds (one past the end.) An array of 1024 elements only has valid indices from 0 to 1023.
Beyond that, it seems like you're asking how to take the last element out of a shared memory array (temp) and place it in a position in a (presumably global) aux array, which has one element for each block.
Here's a fully worked example:
$ cat t831.cu
#include <stdio.h>
#define THREADS 1024
#define BLOCKS 20
__global__ void kernel(int *aux){
__shared__ int temp[THREADS];
temp[threadIdx.x] = threadIdx.x + blockIdx.x;
__syncthreads();
if (threadIdx.x == 0)
aux[blockIdx.x] = temp[THREADS-1];
}
int main(){
int *h_data, *d_data;
const int dsize = BLOCKS*sizeof(int);
h_data=(int *)malloc(dsize);
cudaMalloc(&d_data, dsize);
memset(h_data, 0, dsize);
cudaMemset(d_data, 0, dsize);
kernel<<<BLOCKS, THREADS>>>(d_data);
cudaMemcpy(h_data, d_data, dsize, cudaMemcpyDeviceToHost);
for (int i = 0; i < BLOCKS; i++) printf("%d, ", h_data[i]);
printf("\n");
return 0;
}
$ nvcc -o t831 t831.cu
$ cuda-memcheck ./t831
========= CUDA-MEMCHECK
1023, 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039, 1040, 1041, 1042,
========= ERROR SUMMARY: 0 errors
$
Here I want to calculate the distance of each two points, and decide if they are neighbours. here is my simple code in cuda.
__global__ void calcNeighbors(const DataPoint* points,
const float doubleRadius, bool* neighbors) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
float dis = 0.0f;
while (tid < N) {
DataPoint p1 = points[tid];
for (int i=0; i<N; i++) {
DataPoint p2 = points[i];
dis = 0;
dis += (p1.pfDimens[0]-p2.pfDimens[0]) * (p1.pfDimens[0]-p2.pfDimens[0]) +
(p1.pfDimens[1]-p2.pfDimens[1]) * (p1.pfDimens[1]-p2.pfDimens[1]) +
(p1.pfDimens[2]-p2.pfDimens[2]) * (p1.pfDimens[2]-p2.pfDimens[2]);
if (dis <= doubleRadius) {
neighbors[tid*N+i] = true;
} else {
neighbors[tid*N+i] = false;
}
}
tid += blockDim.x * gridDim.x;
}
}
The DataPoint is a struct is
typedef struct DataPoint {
float pfDimens[3];
} DataPoint;
so here i want to reduce the time, How can i do? I have tried to use memory coalesing and share memory, but i didn't get a good speed up?
===============use share memory==============
__global__ void calcNeighbors2(const DataPoint* points,
const float doubleRadius, bool* neighbors) {
__shared__ DataPoint sharedpoints[threadsPerBlock];
int start = blockIdx.x * blockDim.x;
int len = start+threadIdx.x;
if (len < N) {
sharedpoints[threadIdx.x] = points[len];
}
len = imin(N, blockDim.x + start);
__syncthreads();
int tid = threadIdx.x;
float dis;
while (tid < N) {
DataPoint p1 = points[tid];
for (int i=start; i<len; i++) {
dis = 0;
dis += (p1.pfDimens[0]-sharedpoints[i-start].pfDimens[0]) * (p1.pfDimens[0]-sharedpoints[i-start].pfDimens[0]) +
(p1.pfDimens[1]-sharedpoints[i-start].pfDimens[1]) * (p1.pfDimens[1]-sharedpoints[i-start].pfDimens[1]) +
(p1.pfDimens[2]-sharedpoints[i-start].pfDimens[2]) * (p1.pfDimens[2]-sharedpoints[i-start].pfDimens[2]);
if (dis <= doubleRadius) {
neighbors[i*N+tid] = true;
} else {
neighbors[i*N+tid] = false;
}
}
tid += blockDim.x;
}
}
Here i changed the neighbors[tid*N+i] to neighbors[i*N+tid], it give me amlost 8x speed up on Tesla K10.G2.8GB. But when i use share memory to store some points, it is no use?
There are at least 4 ideas, some of which have already been stated in the comments:
Transform your point distance storage from AoS format:
struct DataPoint {
float pfDimens[3];
};
to SoA format:
struct DataPoint {
float pfDimens_x[NPTS];
float pfDimens_y[NPTS];
float pfDimens_z[NPTS];
};
this will enable full coalescing on loading of the data. In fact, to help with point 4 below, I would just switch to using 3 bare arrays, rather than a structure.
reduce the computation to (slightly less than) half:
for (int i=N-1; i>tid; i--) {
then, either in the thread code itself, or in the host, you can populate the other "half" of the output matrix by copying data.
Transpose the storage in your output matrix, so that you can write a storage operation like this:
neighbors[i*N+tid] = true;
which will nicely coalesce, as opposed to this:
neighbors[tid*N+i] = true;
which will not.
Since your input point data is read only, mark the kernel parameter appropriately:
const float * __restrict__ points_x, const float * __restrict__ points_y, const float * __restrict__ points_z
in some cases, and on some GPUs, this will often lead to a speed-up due to use of the read-only cache. If you really want to get aggressive with caching, and your data array is small enough (4K or less float points), you could put a copy of the point data in global memory as well as a copy in __constant__ memory, and load the "uniform" load you are doing here through constant memory:
DataPoint p2 = c_points[i];
thus you could perform the coalesced load through the read-only cache, the uniform load through the constant cache, and the coalesced store going to ordinary global memory.
On a K40c, on linux/CUDA 7, for N = 4096, the net effect of these changes appears to be about a 3.5x speedup, at the kernel level:
$ cat t749.cu
#include <stdio.h>
#define N 4096
// if N is 16K/3 or less, we can use constant
#define USE_CONSTANT
#define THRESH 0.2f
#define nTPB 256
#define nBLK (N/nTPB+1)
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
unsigned long long dtime_usec(unsigned long long start){
timeval tv;
gettimeofday(&tv, 0);
return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
struct DataPoint {
float pfDimens[3];
};
__global__ void calcNeighbors(const DataPoint* points,
const float doubleRadius, bool* neighbors) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
float dis = 0.0f;
while (tid < N) {
DataPoint p1 = points[tid];
for (int i=0; i<N; i++) {
DataPoint p2 = points[i];
dis = 0;
dis += (p1.pfDimens[0]-p2.pfDimens[0]) * (p1.pfDimens[0]-p2.pfDimens[0]) +
(p1.pfDimens[1]-p2.pfDimens[1]) * (p1.pfDimens[1]-p2.pfDimens[1]) +
(p1.pfDimens[2]-p2.pfDimens[2]) * (p1.pfDimens[2]-p2.pfDimens[2]);
if (dis <= doubleRadius) {
neighbors[tid*N+i] = true;
} else {
neighbors[tid*N+i] = false;
}
}
tid += blockDim.x * gridDim.x;
}
}
#ifdef USE_CONSTANT
__constant__ float cpx[N];
__constant__ float cpy[N];
__constant__ float cpz[N];
#endif
__global__ void calcNeighbors2(const float * __restrict__ pts_x, const float * __restrict__ pts_y, const float * __restrict__ pts_z, const float doubleRadius, bool * __restrict__ neighbors) {
int tid = threadIdx.x+blockDim.x*blockIdx.x;
while (tid < N) {
float p1x = pts_x[tid];
float p1y = pts_y[tid];
float p1z = pts_z[tid];
for (int i = N-1; i > tid; i--){
float p2x, p2y, p2z;
#ifdef USE_CONSTANT
p2x = cpx[i];
p2y = cpy[i];
p2z = cpz[i];
#else
p2x = pts_x[i];
p2y = pts_y[i];
p2z = pts_z[i];
#endif
float dis = ((p1x-p2x)*(p1x-p2x)) + ((p1y-p2y)*(p1y-p2y)) + ((p1z-p2z)*(p1z-p2z));
neighbors[i*N+tid] = (dis <= doubleRadius);
}
tid += blockDim.x * gridDim.x;
}
}
int main(){
float *dx, *dy, *dz, *hx, *hy, *hz;
DataPoint *dp, *hp;
bool *dn, *hn1, *hn2;
hx =(float *)malloc(N*sizeof(float));
hy =(float *)malloc(N*sizeof(float));
hz =(float *)malloc(N*sizeof(float));
hp =(DataPoint *)malloc(N*sizeof(DataPoint));
hn1=(bool *)malloc(N*N*sizeof(bool));
hn2=(bool *)malloc(N*N*sizeof(bool));
cudaMalloc(&dx, N*sizeof(float));
cudaMalloc(&dy, N*sizeof(float));
cudaMalloc(&dz, N*sizeof(float));
cudaMalloc(&dp, N*sizeof(DataPoint));
cudaMalloc(&dn, N*N*sizeof(bool));
for (int i =0; i < N; i++){
hx[i] = rand()/(float)RAND_MAX;
hy[i] = rand()/(float)RAND_MAX;
hz[i] = rand()/(float)RAND_MAX;
hp[i].pfDimens[0] = hx[i];
hp[i].pfDimens[1] = hy[i];
hp[i].pfDimens[2] = hz[i];}
cudaMemcpy(dx, hx, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dy, hy, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dz, hz, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dp, hp, N*sizeof(DataPoint), cudaMemcpyHostToDevice);
// warm-up
calcNeighbors<<<nBLK, nTPB>>>(dp, THRESH, dn);
cudaDeviceSynchronize();
cudaMemset(dn, 0, N*N*sizeof(bool));
unsigned long long t1 = dtime_usec(0);
calcNeighbors<<<nBLK, nTPB>>>(dp, THRESH, dn);
cudaDeviceSynchronize();
cudaCheckErrors("kernel 1 error");
t1 = dtime_usec(t1);
cudaMemcpy(hn1, dn, N*N*sizeof(bool), cudaMemcpyDeviceToHost);
// warm-up
calcNeighbors2<<<nBLK, nTPB>>>(dx, dy, dz, THRESH, dn);
cudaDeviceSynchronize();
cudaMemset(dn, 0, N*N*sizeof(bool));
unsigned long long t2 = dtime_usec(0);
calcNeighbors2<<<nBLK, nTPB>>>(dx, dy, dz, THRESH, dn);
cudaDeviceSynchronize();
cudaCheckErrors("kernel 2 error");
t2 = dtime_usec(t2);
cudaMemcpy(hn2, dn, N*N*sizeof(bool), cudaMemcpyDeviceToHost);
cudaCheckErrors("some error");
printf("t1: %fs, t2: %fs\n", t1/(float)USECPSEC, t2/(float)USECPSEC);
// results validation
for (int i = 0; i < N; i++)
for (int j = i+1; j < N; j++)
if (hn1[i*N+j] != hn2[j*N+i]) {printf("mismatch at %d, %d, was: %d, should be: %d\n", i, j, hn2[j*N+i], hn1[i*N+j]); return 1;}
return 0;
}
$ nvcc -arch=sm_35 -o t749 t749.cu
$ ./t749
t1: 0.004903s, t2: 0.001395s
$
In the case of K40c, the limited number of blocks being launched above (16) is a significant impediment to performance, due to latency. If we comment out the USE_CONSTANT define, and change N to 16384, we observe an even higher speedup with the improved kernel:
$ ./t749
t1: 0.267107s, t2: 0.008209s
$
the resultant ~48 blocks being enough to approximately "fill" the K40c which has 15 SMs.
EDIT: now that you've posted a shared memory kernel, I added it to my test case as calcNeighbors3 and compared it's timing performance (as t3). It is almost as fast as my kernel, and it seems to provide the correct result (matches your original kernel) so I'm not sure what your concerns are.
Here's the updated code and test case:
$ cat t749.cu
#include <stdio.h>
#include <math.h>
#define imin(X,Y) ((X)<(Y))?(X):(Y)
#define N 32768
// if N is 16K/3 or less, we can use constant
// #define USE_CONSTANT
#define THRESH 0.2f
#define nTPB 256
#define nBLK (N/nTPB+1)
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
unsigned long long dtime_usec(unsigned long long start){
timeval tv;
gettimeofday(&tv, 0);
return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
struct DataPoint {
float pfDimens[3];
};
__global__ void calcNeighbors(const DataPoint* points,
const float doubleRadius, bool* neighbors) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
float dis = 0.0f;
while (tid < N) {
DataPoint p1 = points[tid];
for (int i=0; i<N; i++) {
DataPoint p2 = points[i];
dis = 0;
dis += (p1.pfDimens[0]-p2.pfDimens[0]) * (p1.pfDimens[0]-p2.pfDimens[0]) +
(p1.pfDimens[1]-p2.pfDimens[1]) * (p1.pfDimens[1]-p2.pfDimens[1]) +
(p1.pfDimens[2]-p2.pfDimens[2]) * (p1.pfDimens[2]-p2.pfDimens[2]);
if (dis <= doubleRadius) {
neighbors[tid*N+i] = true;
} else {
neighbors[tid*N+i] = false;
}
}
tid += blockDim.x * gridDim.x;
}
}
#ifdef USE_CONSTANT
__constant__ float cpx[N];
__constant__ float cpy[N];
__constant__ float cpz[N];
#endif
__global__ void calcNeighbors2(const float * __restrict__ pts_x, const float * __restrict__ pts_y, const float * __restrict__ pts_z, const float doubleRadius, bool * __restrict__ neighbors) {
int tid = threadIdx.x+blockDim.x*blockIdx.x;
while (tid < N) {
float p1x = pts_x[tid];
float p1y = pts_y[tid];
float p1z = pts_z[tid];
for (int i = N-1; i > tid; i--){
float p2x, p2y, p2z;
#ifdef USE_CONSTANT
p2x = cpx[i];
p2y = cpy[i];
p2z = cpz[i];
#else
p2x = pts_x[i];
p2y = pts_y[i];
p2z = pts_z[i];
#endif
float dis = ((p1x-p2x)*(p1x-p2x)) + ((p1y-p2y)*(p1y-p2y)) + ((p1z-p2z)*(p1z-p2z));
neighbors[i*N+tid] = (dis <= doubleRadius);
}
tid += blockDim.x * gridDim.x;
}
}
__global__ void calcNeighbors3(const DataPoint* points,
const float doubleRadius, bool* neighbors) {
__shared__ DataPoint sharedpoints[nTPB];
int start = blockIdx.x * blockDim.x;
int len = start+threadIdx.x;
if (len < N) {
sharedpoints[threadIdx.x] = points[len];
}
len = imin(N, blockDim.x + start);
__syncthreads();
int tid = threadIdx.x;
float dis;
while (tid < N) {
DataPoint p1 = points[tid];
for (int i=start; i<len; i++) {
dis = 0;
dis += (p1.pfDimens[0]-sharedpoints[i-start].pfDimens[0]) * (p1.pfDimens[0]-sharedpoints[i-start].pfDimens[0]) +
(p1.pfDimens[1]-sharedpoints[i-start].pfDimens[1]) * (p1.pfDimens[1]-sharedpoints[i-start].pfDimens[1]) +
(p1.pfDimens[2]-sharedpoints[i-start].pfDimens[2]) * (p1.pfDimens[2]-sharedpoints[i-start].pfDimens[2]);
if (dis <= doubleRadius) {
neighbors[i*N+tid] = true;
} else {
neighbors[i*N+tid] = false;
}
}
tid += blockDim.x;
}
}
int main(){
float *dx, *dy, *dz, *hx, *hy, *hz;
DataPoint *dp, *hp;
bool *dn, *hn1, *hn2, *hn3;
hx =(float *)malloc(N*sizeof(float));
hy =(float *)malloc(N*sizeof(float));
hz =(float *)malloc(N*sizeof(float));
hp =(DataPoint *)malloc(N*sizeof(DataPoint));
hn1=(bool *)malloc(N*N*sizeof(bool));
hn2=(bool *)malloc(N*N*sizeof(bool));
hn3=(bool *)malloc(N*N*sizeof(bool));
cudaMalloc(&dx, N*sizeof(float));
cudaMalloc(&dy, N*sizeof(float));
cudaMalloc(&dz, N*sizeof(float));
cudaMalloc(&dp, N*sizeof(DataPoint));
cudaMalloc(&dn, N*N*sizeof(bool));
for (int i =0; i < N; i++){
hx[i] = rand()/(float)RAND_MAX;
hy[i] = rand()/(float)RAND_MAX;
hz[i] = rand()/(float)RAND_MAX;
hp[i].pfDimens[0] = hx[i];
hp[i].pfDimens[1] = hy[i];
hp[i].pfDimens[2] = hz[i];}
cudaMemcpy(dx, hx, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dy, hy, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dz, hz, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dp, hp, N*sizeof(DataPoint), cudaMemcpyHostToDevice);
#ifdef USE_CONSTANT
cudaMemcpyToSymbol(cpx, hx, N*sizeof(float));
cudaMemcpyToSymbol(cpy, hy, N*sizeof(float));
cudaMemcpyToSymbol(cpz, hz, N*sizeof(float));
#endif
// warm-up
calcNeighbors<<<nBLK, nTPB>>>(dp, THRESH, dn);
cudaDeviceSynchronize();
cudaMemset(dn, 0, N*N*sizeof(bool));
unsigned long long t1 = dtime_usec(0);
calcNeighbors<<<nBLK, nTPB>>>(dp, THRESH, dn);
cudaDeviceSynchronize();
cudaCheckErrors("kernel 1 error");
t1 = dtime_usec(t1);
cudaMemcpy(hn1, dn, N*N*sizeof(bool), cudaMemcpyDeviceToHost);
// warm-up
calcNeighbors2<<<nBLK, nTPB>>>(dx, dy, dz, THRESH, dn);
cudaDeviceSynchronize();
cudaMemset(dn, 0, N*N*sizeof(bool));
unsigned long long t2 = dtime_usec(0);
calcNeighbors2<<<nBLK, nTPB>>>(dx, dy, dz, THRESH, dn);
cudaDeviceSynchronize();
cudaCheckErrors("kernel 2 error");
t2 = dtime_usec(t2);
cudaMemcpy(hn2, dn, N*N*sizeof(bool), cudaMemcpyDeviceToHost);
// warm-up
calcNeighbors3<<<nBLK, nTPB>>>(dp, THRESH, dn);
cudaDeviceSynchronize();
cudaMemset(dn, 0, N*N*sizeof(bool));
unsigned long long t3 = dtime_usec(0);
calcNeighbors3<<<nBLK, nTPB>>>(dp, THRESH, dn);
cudaDeviceSynchronize();
cudaCheckErrors("kernel 3 error");
t3 = dtime_usec(t3);
cudaMemcpy(hn3, dn, N*N*sizeof(bool), cudaMemcpyDeviceToHost);
cudaCheckErrors("some error");
printf("t1: %fs, t2: %fs, t3: %fs\n", t1/(float)USECPSEC, t2/(float)USECPSEC, t3/(float)USECPSEC);
// results validation
for (int i = 0; i < N; i++)
for (int j = i+1; j < N; j++)
if (hn1[i*N+j] != hn2[j*N+i]) {printf("1:2 mismatch at %d, %d, was: %d, should be: %d\n", i, j, hn2[j*N+i], hn1[i*N+j]); return 1;}
for (int i = 0; i < N*N; i++)
if (hn1[i] != hn3[i]) {printf("1:3 mismatch at %d, was: %d, should be: %d\n", i, hn1[i], hn3[i]); return 1;}
return 0;
}
$ nvcc -arch=sm_35 -o t749 t749.cu
$ ./t749
t1: 1.260010s, t2: 0.022661s, t3: 0.029632s
$
For this test, I have changed the data set size to 32768 since that is closer to the range you care about. Your shared memory kernel shows about a 42x speedup over your original kernel, and my kernel shows about a 55x speedup, on my K40c.
I am trying to do real time line detection using CUDA. I have calculated the hough transform along with the min, max line coordinates of each bin. For getting the line segments I am tracing (using Bresenham's line algorithm) through the min to max point and get the line segments on each bin. When the hough threshold is low and when lot of lines are there in the image trace_lines takes lot of time to complete.
hough transform (hough_line_transform) computation takes around 5-10ms per frame(1280x720) on a GTX 660 (observed to be 10 times faster than CPU implementation). But tracing the line segments from the min, max points takes 1ms-15ms.
I have two questions on line detection
Does there exist a better algorithm to get the line segments from the min, max points of the hough bins?
Is it possible to optimize hought_line_transform (see the code below) further? I am using atomic operations. Is it possible to avoid atomics.
I am attaching the code below.
class Header
#ifndef _HOUGH_LINES_H_
#define _HOUGH_LINES_H_
#include <cuda_gl_interop.h>
#include <thrust/device_vector.h>
union Pos;
struct Line;
struct Hough_params
{
int w;
int h;
int r;
};
class Hough_lines
{
public:
enum Type {INT, SHORT_INT, FLOAT};
Hough_lines(int _w, int _h);
~Hough_lines();
public:
bool init();
bool detect_lines(GLuint tex_edge, int threshold, int min_length, int min_gap, GLuint line, Type type, int& count);
protected:
void get_edges(thrust::device_vector<Pos>& d_coords, int& size);
void get_hough_lines(int threshold, thrust::device_vector<Line>& d_lines, int& size);
void get_lines(int threshold, int min_length, int min_gap, GLuint line, Hough_lines::Type type, int& count);
void trace_all_lines(int min_len, int min_gap, thrust::device_vector<Line>& d_lines, int size, int* d_line_coord, int& count);
static void compute_trig_funcs();
protected:
Hough_params params;
thrust::device_vector<Hough_params> d_param;
static bool trig_init;
};
#endif
Body
#include <hough_lines.h>
#include <math.h>
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cuda_gl_interop.h>
#include <thrust/host_vector.h>
#include <thrust/copy.h>
#include <thrust/scan.h>
#define ANGLE_SIZE 360
#define MAX_LINE_PER_THREAD 10
union Pos
{
struct
{
uint16_t x;
uint16_t y;
};
uint32_t value;
};
struct Hough_info
{
Pos end;
Pos start;
int count;
};
struct Line
{
Pos start;
Pos end;
};
struct Line_info
{
int line_count;
Line line[MAX_LINE_PER_THREAD];
};
__constant__ float dev_sint[ANGLE_SIZE];
__constant__ float dev_cost[ANGLE_SIZE];
texture<uint8_t, 2, cudaReadModeElementType> luma_tex;
bool Hough_lines::trig_init = false;
__global__ void mark_edges(const Hough_params* param, int* edge)
{
int x = (blockIdx.x*blockDim.x+threadIdx.x);
int y = (blockIdx.y*blockDim.y+threadIdx.y);
int pos = x+(param->w*y);
edge[pos] = (255 == tex2D(luma_tex, x, y))?1:0;
}
__global__ void get_coords(const Hough_params* param, int* edge, Pos* coord)
{
int index;
int x = (blockIdx.x*blockDim.x+threadIdx.x);
int y = (blockIdx.y*blockDim.y+threadIdx.y);
int pos = x+(param->w*y);
if (255 == tex2D(luma_tex, x, y))
{
index = edge[pos];
coord[index].y = y;
coord[index].x = x;
}
}
__global__ void hough_line_transform(const Hough_params* param, int size, const Pos* coord, int threshold, int *mark, Hough_info* out)
{
int i;
int angle;
int rdata;
__shared__ Hough_info sh_rho_data[1001];
i = threadIdx.x;
while (i < param->r)
{
sh_rho_data[i].end.value = 0x0;
sh_rho_data[i].start.value = 0xFFFFFFFF;
sh_rho_data[i].count = 0;
i += blockDim.x;
}
__syncthreads();
i = threadIdx.x;
angle = blockIdx.x;
const float cos_angle = dev_cost[angle];
const float sin_angle = dev_sint[angle];
while (i < size)
{
rdata = (int)ceil(((float)(coord[i].x-(param->w>>1))*cos_angle)+((float)((param->h>>1)-coord[i].y)*sin_angle));
if (rdata >= 0)
{
atomicMax(&sh_rho_data[rdata].end.value, coord[i].value);
atomicMin(&sh_rho_data[rdata].start.value, coord[i].value);
atomicAdd(&sh_rho_data[rdata].count, 1);
}
i += blockDim.x;
}
__syncthreads();
i = threadIdx.x;
rdata = (angle*param->r);
while (i < param->r)
{
memcpy(&out[rdata+i], &sh_rho_data[i], sizeof(Hough_info));
mark[rdata+i] = (sh_rho_data[i].count >= threshold)?1:0;
i += blockDim.x;
}
}
__global__ void get_lines(const Hough_params* param, int threshold, Hough_info* hdata, int* mark, Line* lines)
{
int pos;
int i = threadIdx.x;
int offset = (blockIdx.x*param->r);
while (i < param->r)
{
if (hdata[offset+i].count >= threshold)
{
pos = mark[offset+i];
lines[pos].start.value = hdata[offset+i].start.value;
lines[pos].end.value = hdata[offset+i].end.value;
}
i += blockDim.x;
}
}
__device__ void add_line(int xs, int ys, int xe, int ye, int min_len, Line_info* line)
{
int d = abs(xe-xs)+abs(ye-ys);
if ((d >= min_len) && (line->line_count < MAX_LINE_PER_THREAD))
{
line->line[line->line_count].start.x = xs;
line->line[line->line_count].start.y = ys;
line->line[line->line_count].end.x = xe;
line->line[line->line_count].end.y = ye;
++line->line_count;
//printf("\n(%d %d) (%d %d) %d", xs, ys, xe, ye, d);
}
}
__global__ void trace_lines(const Line* input, int inp_size, int min_len, int min_gap, Line_info* line_info, int* mark)
{
int d;
int dsub;
int dstep;
int xstep;
int ystep;
int xs, ys, xe, ye;
int i = (blockIdx.x*blockDim.x+threadIdx.x);
if (i >= inp_size)
{
return;
}
xs = input[i].start.x;
ys = input[i].start.y;
xe = input[i].end.x;
ye = input[i].end.y;
line_info[i].line_count = 0;
int dx = abs(xe-xs);
int dy = abs(ye-ys);
int xinc = (xe > xs)?1:-1;
int yinc = (ye > ys)?1:-1;
int gap = 0;
bool sflag;
int s_x, s_y, e_x, e_y;
if (dx > dy)
{
dsub = (dx<<1);
dstep = (dy<<1);
d = dstep-dx;
xstep = xinc;
ystep = 0;
xinc = 0;
}
else
{
dsub = (dy<<1);
dstep = (dx<<1);
d = dstep-dy;
xstep = 0;
ystep = yinc;
yinc = 0;
}
sflag = true;
s_x = xs;
s_y = ys;
e_x = xs;
e_y = ys;
int x = xs;
int y = ys;
while ((abs(x-xs) <= dx) && (abs(y-ys) <= dy))
{
x += xstep;
y += ystep;
if (d > 0)
{
x += xinc;
y += yinc;
d -= dsub;
}
d += dstep;
if (255 == tex2D(luma_tex, x, y))
{
e_x = x;
e_y = y;
gap = 0;
if (!sflag)
{
s_x = x;
s_y = y;
sflag = true;
}
}
else if (sflag)
{
++gap;
if (gap >= min_gap)
{
sflag = false;
add_line(s_x, s_y, e_x, e_y, min_len, &line_info[i]);
}
}
}
if (sflag)
{
add_line(s_x, s_y, xe, ye, min_len, &line_info[i]);
}
mark[i] = line_info[i].line_count;
}
__global__ void copy_line_coords(const Hough_params* param, Line_info* line, int size, int* mark, int* coords, int* count)
{
int index = (blockIdx.x*blockDim.x+threadIdx.x);
if (index >= size)
{
return;
}
int pos;
int start = 4*mark[index];
Line* line_data = &line[index].line[0];
for (int i = 0; i < line[index].line_count; i++)
{
pos = start+(4*i);
coords[pos] = line_data[i].start.x-(param->w>>1);
coords[pos+1] = (param->h>>1)-line_data[i].start.y;
coords[pos+2] = line_data[i].end.x-(param->w>>1);
coords[pos+3] = (param->h>>1)-line_data[i].end.y;
}
if ((index+1) == size)
{
*count = mark[index];
}
}
Hough_lines::Hough_lines(int _w, int _h)
:d_param(1)
{
params.w = _w;
params.h = _h;
params.r = (int)ceil(0.5*sqrt((_w*_w)+(_h*_h)));
thrust::copy_n(¶ms, 1, d_param.begin());
}
Hough_lines::~Hough_lines()
{
}
bool Hough_lines::init()
{
if (false == trig_init)
{
trig_init = true;
compute_trig_funcs();
}
return true;
}
void Hough_lines::compute_trig_funcs()
{
float theta;
cudaError_t err = cudaSuccess;
static float sint[ANGLE_SIZE];
static float cost[ANGLE_SIZE];
for (int i = 0; i < ANGLE_SIZE; i++)
{
theta = (M_PI*(float)i)/180.0;
sint[i] = sin(theta);
cost[i] = cos(theta);
}
err = cudaMemcpyToSymbol(dev_sint, sint, ANGLE_SIZE*sizeof(float));
err = (cudaSuccess == err) ? cudaMemcpyToSymbol(dev_cost, cost, ANGLE_SIZE*sizeof(float)):err;
if (cudaSuccess != err)
{
printf("\n%s", cudaGetErrorString(cudaGetLastError()));
}
}
void Hough_lines::get_edges(thrust::device_vector<Pos>& d_coords, int& size)
{
dim3 bsize(16, 16);
dim3 gsize(params.w/bsize.x, params.h/bsize.y);
thrust::device_vector<int> d_mark(params.w*params.h);
size = 0;
mark_edges<<<gsize, bsize>>>(thrust::raw_pointer_cast(d_param.data()),
thrust::raw_pointer_cast(d_mark.data()));
thrust::exclusive_scan(d_mark.begin(), d_mark.end(), d_mark.begin());
get_coords<<<gsize, bsize>>>(thrust::raw_pointer_cast(d_param.data()),
thrust::raw_pointer_cast(d_mark.data()),
thrust::raw_pointer_cast(d_coords.data()));
thrust::copy_n(d_mark.begin()+d_mark.size()-1, 1, &size);
}
void Hough_lines::get_hough_lines(int threshold, thrust::device_vector<Line>& d_lines, int& size)
{
int edge_count = 0;
thrust::device_vector<Pos> d_coords(params.w*params.h);
get_edges(d_coords, edge_count);
thrust::device_vector<int> d_mark(params.r*360);
thrust::device_vector<Hough_info> d_hough_data(params.r*360);
hough_line_transform<<<360, 256>>>(thrust::raw_pointer_cast(d_param.data()),
edge_count,
thrust::raw_pointer_cast(d_coords.data()), threshold,
thrust::raw_pointer_cast(d_mark.data()),
thrust::raw_pointer_cast(d_hough_data.data()));
thrust::exclusive_scan(d_mark.begin(), d_mark.end(), d_mark.begin());
::get_lines<<<360, 256>>>(thrust::raw_pointer_cast(d_param.data()),
threshold,
thrust::raw_pointer_cast(d_hough_data.data()),
thrust::raw_pointer_cast(d_mark.data()),
thrust::raw_pointer_cast(d_lines.data()));
thrust::copy_n(d_mark.begin()+d_mark.size()-1, 1, &size);
}
void Hough_lines::trace_all_lines(int min_len, int min_gap, thrust::device_vector<Line>& d_lines, int size, int* d_line_coord, int& count)
{
thrust::device_vector<int> d_mark_line(size);
thrust::device_vector<Line_info> d_nlines(size);
trace_lines<<<(1+(size/512)), 512>>>(thrust::raw_pointer_cast(d_lines.data()),
size, min_len, min_gap, thrust::raw_pointer_cast(d_nlines.data()),
thrust::raw_pointer_cast(d_mark_line.data()));
thrust::exclusive_scan(d_mark_line.begin(), d_mark_line.end(), d_mark_line.begin());
thrust::device_vector<int> d_count(1);
copy_line_coords<<<(1+(size/512)), 512>>>(thrust::raw_pointer_cast(d_param.data()),
thrust::raw_pointer_cast(d_nlines.data()), size,
thrust::raw_pointer_cast(d_mark_line.data()), d_line_coord,
thrust::raw_pointer_cast(d_count.data()));
thrust::copy(d_count.begin(), d_count.end(), &count);
//printf("\nLine count: %d", count);
}
void Hough_lines::get_lines(int threshold, int min_len, int min_gap, GLuint line, Hough_lines::Type type, int& count)
{
int* d_line_coord = 0;
cudaGLRegisterBufferObject(line);
cudaGLMapBufferObject((void **)&d_line_coord, line);
int size = 0;
thrust::device_vector<Line> d_lines(params.r*360);
get_hough_lines(threshold, d_lines, size);
//printf("\nget_hough_lines: %d", size);
trace_all_lines(min_len, min_gap, d_lines, size, d_line_coord, count);
cudaGLUnmapBufferObject(line);
cudaGLUnregisterBufferObject(line);
}
bool Hough_lines::detect_lines(GLuint tex_edge, int threshold, int min_length, int min_gap, GLuint line, Hough_lines::Type type, int& count)
{
cudaError_t err;
cudaArray* array_edge;
cudaGraphicsResource* res_edge;
err = cudaGraphicsGLRegisterImage(&res_edge, tex_edge, GL_TEXTURE_2D, cudaGraphicsRegisterFlagsReadOnly);
if (err != cudaSuccess)
{
printf("cudaGraphicsGLRegisterImage Failed: %s", cudaGetErrorString(cudaGetLastError()));
exit(0);
}
cudaGraphicsMapResources(1, &res_edge);
cudaChannelFormatDesc chan_desc = cudaCreateChannelDesc<uint8_t>();
err = cudaGraphicsSubResourceGetMappedArray(&array_edge, res_edge, 0, 0);
if (err != cudaSuccess)
{
printf("cudaGraphicsSubResourceGetMappedArray Failed: %s", cudaGetErrorString(cudaGetLastError()));
exit(0);
}
if (cudaBindTextureToArray(&luma_tex, array_edge, &chan_desc) != cudaSuccess)
{
printf("Failed to bind texture - %s\n", cudaGetErrorString(cudaGetLastError()));
exit(0);
}
float time = 0.0;
//static float max = 0.0;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
count = 0;
get_lines(threshold, min_length, min_gap, line, type, count);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
//static int frame = 0;
//frame++;
//if (time > max)
{
//max = time;
printf("\nElpased time: %f ms", time);
}
cudaEventDestroy(start);
cudaEventDestroy(stop);
cudaUnbindTexture(luma_tex);
cudaGraphicsUnmapResources(1, &res_edge);
cudaGraphicsUnregisterResource(res_edge);
return true;
}
In "Prefix sums and their applications", Guy Blelloch described a parallel line-drawing algorithm using parallel prefix sum. See page 55 of that paper, it might give you ideas.
Regarding how to optimize hough_line_transfer, I think the key is to eliminate shared memory atomics in your loop. Where you use them you are effectively doing keyed reductions. Thrust provides a reduce_by_key function, but that is only callable from the host. The device-library counterpart to Thrust is CUB, but it does not have reduce_by_key. I've asked the CUB authors for ideas here and if we come up with anything I'll update this answer.
You could write your own keyed reduction but it would be more productive and robust to rely on a library if possible.
Hi I'm writing a simple Program for practicing to work with texture memory. I Just want to write my data into Texture Memory and write it back into Global Memory. But i cannont read out the Values. Here is the code.
#include <stdio.h>
#include <iostream>
#include "cuda.h"
#include <stdlib.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "HelloWorld.h"
#include "linearInterpolation_kernel4.cu"
using namespace std;
using std::cout;
const int blocksize = 16;
__global__
void hello(char *a, int *b) {
a[threadIdx.x] += b[threadIdx.x];
}
////////////////////////////////////////////////////////////////////////////////
// These are CUDA Helper functions
// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)
inline void __checkCudaErrors( cudaError err, const char *file, const int line )
{
if( cudaSuccess != err) {
printf("%s(%i) : CUDA Runtime API error %d: %s.\n",file, line, (int)err, cudaGetErrorString( err ) );
}
}
// This will output the proper error string when calling cudaGetLastError
#define getLastCudaError(msg) __getLastCudaError (msg, __FILE__, __LINE__)
inline void __getLastCudaError( const char *errorMessage, const char *file, const int line )
{
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err) {
printf("%s(%i) : getLastCudaError() CUDA error : %s : (%d) %s.\n", file, line, errorMessage, (int)err, cudaGetErrorString( err ) );
}
}
int main()
{
int N = 40;
float *A;
A = (float *) malloc(N*sizeof(float));
float *B;
B = (float *) malloc(N*sizeof(float));
float *result;
result = (float *) malloc(N*sizeof(float));
float angle = 0.8f;
for(int i = 0; i < N; i++){
A[i] = i; //(float)rand();
B[i] = i+1; //(float)rand();
}
ipLinearTexture2(A,B,result,angle,N);
float result2;
result2 = (angle)*A[4] + (1-angle)*B[4];
printf(" A %f B %f Result %f\n", A[4], B[4], result[4]);
cout << result2 << endl;
return 1;
}
void ipLinearTexture2(float *A, float* B, float* result, float angle, int N)
{
float cuTime;
int N2 = N * 2;
float *dev_result;
float **AB;
AB = (float **) malloc( N * sizeof(float *));
if(AB)
{
for(int i = 0; i < N; i++)
{
AB[i] = (float *) malloc( 2 * sizeof(float *));
}
}
for (int i = 0; i < N; i = i++)
{
AB[i][0] = A[i];
AB[i][1] = B[i];
}
cudaMalloc(&dev_result, N * sizeof(float));
unsigned int size = N2 * sizeof(float);
//cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
cudaArray* cu_array;
checkCudaErrors(cudaMallocArray( &cu_array, &channelDesc,N,2));
cudaMemcpy2DToArray(cu_array,0,0,AB,N * sizeof(float), N * sizeof(float), 2, cudaMemcpyHostToDevice);
// set texture parameters
tex2.normalized = true;
tex2.filterMode = cudaFilterModeLinear;
tex2.addressMode[0] = cudaAddressModeWrap; //cudaAddressModeWrap;
tex2.addressMode[1] = cudaAddressModeWrap; //cudaAddressModeClamp;
checkCudaErrors(cudaBindTextureToArray( tex2, cu_array, channelDesc));
dim3 dimBlock(10, 1, 1);
dim3 dimGrid((int)ceil((double)N*2/dimBlock.x), 1, 1);
transformKernel4<<< 256, 256, 0 >>>( dev_result, N, 2, angle);
checkCudaErrors(cudaMemcpy(result, dev_result, N * sizeof(float), cudaMemcpyDeviceToHost));
cout << "==================================================" << endl;
for (int i = 0 ; i < N ;i++)
{
cout << result[i] << " on " << i << endl;
}
cout << "==================================================" << endl;
checkCudaErrors(cudaUnbindTexture(tex));
checkCudaErrors(cudaFree(dev_result));
checkCudaErrors(cudaFreeArray(cu_array));
}
and here is the kernel code
#ifndef _SIMPLETEXTURE_KERNEL5_H_
#define _SIMPLETEXTURE_KERNEL5_H_
// Texture references
texture<float, 2, cudaReadModeElementType> tex2;
__global__ void
transformKernel4(float* g_odata, int width, int height, float theta)
{
unsigned int xid = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int yid = blockIdx.y * blockDim.y + threadIdx.y;
if (xid >= width || yid >= height) return;
float dx = 1.0f / (float)width;
float dy = 1.0f / (float)height;
float x = ((float)xid + 0.5f) * dx;
float y = ((float)yid + 0.5f) * dy;
float value = tex2D(tex2, x , y);
printf("wert %f xid %i yid %i \n",value, xid, yid);
g_odata[yid * width + xid] = value;
}
#endif // #ifndef _SIMPLETEXTURE_KERNEL_H_
Can somebody tell what i am doing wrong?
I have edited it to remove the first 2 logical mistake. Put why am I need able to print out my data?
It was the wrong binding of the Arrays. You can not use multidimensional Arrays in C that can be copied. You have to use a onedimensional array that respresents a multidimensional.
I can see 2 logical errors here.
The first one is the one pointed out by #asm.
The output should be stored by calculating linear index from 2D x and y indices.
outputIndex = yid * width + xid;
The second one is that the memory allocation for the cudaArray structure is internally aligned.
You should consider using cudaMemcpy2DToArray function to avoid erroneous data copying.
cudaMemcpy2DToArray(cu_array,0,0,AB,N * sizeof(float), N * sizeof(float), 2, cudaMemcpyHostToDevice);