2D kernel calling and launch parameters for non-square matrix - cuda

I am attempting to port the following (simplified) nested loop as a CUDA 2D kernel. The sizes of NgS and NgO will increase with larger data sets; for now I just want to get this kernel to output the correct results for all values:
// macro that translates 2D [i][j] array indices to 1D flattened array indices
#define idx(i,j,lda) ( (j) + ((i)*(lda)) )
int NgS = 1859;
int NgO = 900;
// 1D flattened matrices have been initialized as:
Radio_cpu = new double [NgS*NgO];
Result_cpu = new double [NgS*NgO];
// ignoring the part where they are filled w/ data
for (m=0; m<NgO; m++) {
for (n=0; n<NgS; n++) {
Result_cpu[idx(n,m,NgO)]] = k0*Radio_cpu[idx(n,m,NgO)]];
}
}
The examples I have come across usually deal with square loops, and I have been unable to get the correct output for all the GPU array indices compared to the CPU version. Here is the host code calling the kernel:
dim3 dimBlock(16, 16);
dim3 dimGrid;
dimGrid.x = (NgO + dimBlock.x - 1) / dimBlock.x;
dimGrid.y = (NgS + dimBlock.y - 1) / dimBlock.y;
// Result_gpu and Radio_gpu are allocated versions of the CPU variables on GPU
trans<<<dimGrid,dimBlock>>>(NgO, NgS, k0, Radio_gpu, Result_gpu);
Here is the kernel:
__global__ void trans(int NgO, int NgS,
double k0, double * Radio, double * Result) {
int n = blockIdx.x * blockDim.x + threadIdx.x;
int m = blockIdx.y * blockDim.y + threadIdx.y;
if(n > NgS || m > NgO) return;
// map the two 2D indices to a single linear, 1D index
int grid_width = gridDim.x * blockDim.x;
int idxxx = m + (n * grid_width);
Result[idxxx] = k0 * Radio[idxxx];
}
With the current code, I proceeded to compare the Result_cpu variable with Result_gpu variable once copied back. When I cycle through the values I get:
// matches from NgS = 0...913
Result_gpu[NgS = 913][NgO = 0]: -56887.2
Result_cpu[Ngs = 913][NgO = 0]: -56887.2
// mismatches from NgS = 914...1858
Result_gpu[NgS = 914][NgO = 0]: -12.2352
Result_cpu[NgS = 914][NgO = 0]: 79448.6
This pattern is the same, irregardless of the value of NgO. I have been trying to figure out where I have made a mistake by looking at various examples for a few hours and trying out changes, but so far this scheme has worked minus the obvious issue at hand whereas the others have caused kernel invocation errors/left the GPU array uninitialized for all values. Since I clearly cannot see the mistake, I'd really appreciate if someone could point me in the right direction towards a fix. I'm pretty sure it's right under my nose and I can't see it.
In case it matters, I'm testing this code on a Kepler card, compiling using MSVC 2010, CUDA 4.2 and 304.79 driver and have compiled the code with both arch=compute_20,code=sm_20 and arch=compute_30,code=compute_30 flags with no difference.

#vaca_loca: I tested the following kernel (it works for me also with non-square block dimensions):
__global__ void trans(int NgO, int NgS,
double k0, double * Radio, double * Result) {
int n = blockIdx.x * blockDim.x + threadIdx.x;
int m = blockIdx.y * blockDim.y + threadIdx.y;
if(n > NgO || m > NgS) return;
int ofs = m * NgO + n;
Result[ofs] = k0 * Radio[ofs];
}
void test() {
int NgS = 1859, NgO = 900;
int data_sz = NgS * NgO, bytes = data_sz * sizeof(double);
cudaSetDevice(0);
double *Radio_cpu = new double [data_sz*3],
*Result_cpu = Radio_cpu + data_sz,
*Result_gpu = Result_cpu + data_sz;
double k0 = -1.7961233;
srand48(time(NULL));
int i, j, n, m;
for(m=0; m<NgO; m++) {
for (n=0; n<NgS; n++) {
Radio_cpu[m + n*NgO] = lrand48() % 234234;
Result_cpu[m + n*NgO] = k0*Radio_cpu[m + n*NgO];
}
}
double *g_Radio, *g_Result;
cudaMalloc((void **)&g_Radio, bytes * 2);
g_Result = g_Radio + data_sz;
cudaMemcpy(g_Radio, Radio_cpu, bytes, cudaMemcpyHostToDevice);
dim3 dimBlock(16, 16);
dim3 dimGrid;
dimGrid.x = (NgO + dimBlock.x - 1) / dimBlock.x;
dimGrid.y = (NgS + dimBlock.y - 1) / dimBlock.y;
trans<<<dimGrid,dimBlock>>>(NgO, NgS, k0, g_Radio, g_Result);
cudaMemcpy(Result_gpu, g_Result, bytes, cudaMemcpyDeviceToHost);
for(m=0; m<NgO; m++) {
for (n=0; n<NgS; n++) {
double c1 = Result_cpu[m + n*NgO],
c2 = Result_gpu[m + n*NgO];
if(std::abs(c1-c2) > 1e-4)
printf("(%d;%d): %.7f %.7f\n", n, m, c1, c2);
}
}
cudaFree(g_Radio);
delete []Radio_cpu;
}
though, in my opinion, accessing data from global memory using quads might not be very cache-friendly since access stride is pretty large. You might consider using 2D textures instead if it's critical for your algorithm to access data in 2D locality

Related

Solving tridiagonal linear systems in CUDA

I am trying to implement a tridiagonal system solver based on the Cyclic Reduction method on my GTS450.
Cyclic Reduction is illustrated in this paper
Y. Zhang, J. Cohen, J.D. Owens, "Fast Tridiagonal Solvers on GPU"
However, whatever I do, my CUDA code is far slower than the sequential counterpart. My result for a total of 512 x 512 points is 7ms, however on my i7 3.4GHz it is 5ms. The GPU is not accelerating!
Which could be the problem?
#include "cutrid.cuh"
__global__ void cutrid_RC_1b(double *a,double *b,double *c,double *d,double *x)
{
int idx_global=blockIdx.x*blockDim.x+threadIdx.x;
int idx=threadIdx.x;
__shared__ double asub[512];
__shared__ double bsub[512];
__shared__ double csub[512];
__shared__ double dsub[512];
double at=0;
double bt=0;
double ct=0;
double dt=0;
asub[idx]=a[idx_global];
bsub[idx]=b[idx_global];
csub[idx]=c[idx_global];
dsub[idx]=d[idx_global];
for(int stride=1;stride<N;stride*=2)
{
int margin_left,margin_right;
margin_left=idx-stride;
margin_right=idx+stride;
at=(margin_left>=0)?(-csub[idx-stride]*asub[idx]/bsub[idx-stride]):0.f;
bt=bsub[idx]+((margin_left>=0)?(-csub[idx-stride]*asub[idx]/bsub[idx-stride]):0.f)
-((margin_right<512)?asub[idx+stride]*csub[idx]/bsub[idx+stride]:0.f);
ct=(margin_right<512)?(-csub[idx+stride]*asub[idx]/bsub[idx+stride]):0.f;
dt=dsub[idx]+((margin_left>=0)?(-dsub[idx-stride]*asub[idx]/bsub[idx-stride]):0.f)
-((margin_right<512)?dsub[idx+stride]*csub[idx]/bsub[idx+stride]:0.f);
__syncthreads();
asub[idx]=at;
bsub[idx]=bt;
csub[idx]=ct;
dsub[idx]=dt;
__syncthreads();
}
x[idx_global]=dsub[idx]/bsub[idx];
}/*}}}*/
I launched this kernel by cutrid_RC_1b<<<512,512>>>(d_a,d_b,d_c,d_d,d_x), and reached 100% device occupancy. This result has puzzled me for days.
There is an improved version of my code:
#include "cutrid.cuh"
__global__ void cutrid_RC_1b(float *a,float *b,float *c,float *d,float *x)
{/*{{{*/
int idx_global=blockIdx.x*blockDim.x+threadIdx.x;
int idx=threadIdx.x;
__shared__ float asub[512];
__shared__ float bsub[512];
__shared__ float csub[512];
__shared__ float dsub[512];
asub[idx]=a[idx_global];
bsub[idx]=b[idx_global];
csub[idx]=c[idx_global];
dsub[idx]=d[idx_global];
__syncthreads();
//Reduction
for(int stride=1;stride<512;stride*=2)
{
int margin_left=(idx-stride);
int margin_right=(idx+stride);
if(margin_left<0) margin_left=0;
if(margin_right>=512) margin_right=511;
float tmp1 = asub[idx] / bsub[margin_left];
float tmp2 = csub[idx] / bsub[margin_right];
float tmp3 = dsub[margin_right];
float tmp4 = dsub[margin_left];
__syncthreads();
dsub[idx] = dsub[idx] - tmp4*tmp1-tmp3*tmp2;
bsub[idx] = bsub[idx]-csub[margin_left]*tmp1-asub[margin_right]*tmp2;
tmp3 = -csub[margin_right];
tmp4 = -asub[margin_left];
__syncthreads();
asub[idx] = tmp3*tmp1;
csub[idx] = tmp4*tmp2;
__syncthreads();
}
x[idx_global]=dsub[idx]/bsub[idx];
}/*}}}*/
The speed is improved to 0.73ms on a Quadro k4000 for 512 x 512 system, however the code in the mentioned paper runs in 0.5ms on a GTX280.
Solving a tridiagonal system of equations is a challenging parallel problem since the classical solution scheme, i.e., Gaussian elimination, is inherently sequential.
Cyclic Reduction consists of two phases:
Forward Reduction. The original system is split in two independent tridiagonal systems for two sets of unknowns, the ones with odd index and the ones with even index. Such systems can be solved independently and this step can be seen as the first of a divide et impera scheme. The two smaller systems are split again in the same way in two subsystems and the process is repeated until a system of only 2 equations is reached.
Backward Substitution. The system of 2 equations is solved first. Then, the divide et impera structure is climbed up by solving the sub-systems independently on different cores.
I'm not sure (but correct me if I'm wrong) that your code will return consistent results. N does not appear to be defined. Also, you are accessing csub[idx-stride], but I'm not sure what does it mean when idx==0 and stride>1. Furthermore, you are using several conditional statements, essentially for boundary checkings. Finally, your code lacks a proper thread structure capable to deal with the mentioned divide et impera scheme, conceptually pretty much like the one used in the CUDA SDK reduction samples.
As mentioned in one of my comments above, I remembered that at tridiagonalsolvers you can find an implementation of the Cyclic Reduction scheme for solving tridiagonal equation systems. Browsing the related google pages, it seems to me that the code is mantained, among others, by the first Author of the above paper (Yao Zhang). The code is copied and pasted below. Note that the boundary check is done only once (if (iRight >= systemSize) iRight = systemSize - 1;), thus limiting the number of conditional statements involved. Note also the thread structure capable to deal with a divide et impera scheme.
The code by Zhang, Cohen and Owens
__global__ void crKernel(T *d_a, T *d_b, T *d_c, T *d_d, T *d_x)
{
int thid = threadIdx.x;
int blid = blockIdx.x;
int stride = 1;
int numThreads = blockDim.x;
const unsigned int systemSize = blockDim.x * 2;
int iteration = (int)log2(T(systemSize/2));
#ifdef GPU_PRINTF
if (thid == 0 && blid == 0) printf("iteration = %d\n", iteration);
#endif
__syncthreads();
extern __shared__ char shared[];
T* a = (T*)shared;
T* b = (T*)&a[systemSize];
T* c = (T*)&b[systemSize];
T* d = (T*)&c[systemSize];
T* x = (T*)&d[systemSize];
a[thid] = d_a[thid + blid * systemSize];
a[thid + blockDim.x] = d_a[thid + blockDim.x + blid * systemSize];
b[thid] = d_b[thid + blid * systemSize];
b[thid + blockDim.x] = d_b[thid + blockDim.x + blid * systemSize];
c[thid] = d_c[thid + blid * systemSize];
c[thid + blockDim.x] = d_c[thid + blockDim.x + blid * systemSize];
d[thid] = d_d[thid + blid * systemSize];
d[thid + blockDim.x] = d_d[thid + blockDim.x + blid * systemSize];
__syncthreads();
//forward elimination
for (int j = 0; j <iteration; j++)
{
__syncthreads();
stride *= 2;
int delta = stride/2;
if (threadIdx.x < numThreads)
{
int i = stride * threadIdx.x + stride - 1;
int iLeft = i - delta;
int iRight = i + delta;
if (iRight >= systemSize) iRight = systemSize - 1;
T tmp1 = a[i] / b[iLeft];
T tmp2 = c[i] / b[iRight];
b[i] = b[i] - c[iLeft] * tmp1 - a[iRight] * tmp2;
d[i] = d[i] - d[iLeft] * tmp1 - d[iRight] * tmp2;
a[i] = -a[iLeft] * tmp1;
c[i] = -c[iRight] * tmp2;
}
numThreads /= 2;
}
if (thid < 2)
{
int addr1 = stride - 1;
int addr2 = 2 * stride - 1;
T tmp3 = b[addr2]*b[addr1]-c[addr1]*a[addr2];
x[addr1] = (b[addr2]*d[addr1]-c[addr1]*d[addr2])/tmp3;
x[addr2] = (d[addr2]*b[addr1]-d[addr1]*a[addr2])/tmp3;
}
// backward substitution
numThreads = 2;
for (int j = 0; j <iteration; j++)
{
int delta = stride/2;
__syncthreads();
if (thid < numThreads)
{
int i = stride * thid + stride/2 - 1;
if(i == delta - 1)
x[i] = (d[i] - c[i]*x[i+delta])/b[i];
else
x[i] = (d[i] - a[i]*x[i-delta] - c[i]*x[i+delta])/b[i];
}
stride /= 2;
numThreads *= 2;
}
__syncthreads();
d_x[thid + blid * systemSize] = x[thid];
d_x[thid + blockDim.x + blid * systemSize] = x[thid + blockDim.x];
}
I want to add a further answer to mention that tridiagonal systems can be easily solved in the framework of the cuSPARSE library by aid of the function
cusparse<t>gtsv()
cuSPARSE also provides
cusparse<t>gtsv_nopivot()
which, at variance with the first mentioned routine, does not perform pivoting. Both the above functions solve the same linear system with multiple right hand sides. A batched routine
cusparse<t>gtsvStridedBatch()
also exists which solves multiple linear systems.
For all the above routines, the system matrix is fixed by simply specifying the lower diagonal, the main diagonal and the upper diagonal.
Below, I'm reporting a fully worked out example using cusparse<t>gtsv() to solve a tridiagonal linear system.
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <assert.h>
#include <cuda_runtime.h>
#include <cusparse_v2.h>
/********************/
/* CUDA ERROR CHECK */
/********************/
// --- Credit to http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) { exit(code); }
}
}
extern "C" void gpuErrchk(cudaError_t ans) { gpuAssert((ans), __FILE__, __LINE__); }
/***************************/
/* CUSPARSE ERROR CHECKING */
/***************************/
static const char *_cusparseGetErrorEnum(cusparseStatus_t error)
{
switch (error)
{
case CUSPARSE_STATUS_SUCCESS:
return "CUSPARSE_STATUS_SUCCESS";
case CUSPARSE_STATUS_NOT_INITIALIZED:
return "CUSPARSE_STATUS_NOT_INITIALIZED";
case CUSPARSE_STATUS_ALLOC_FAILED:
return "CUSPARSE_STATUS_ALLOC_FAILED";
case CUSPARSE_STATUS_INVALID_VALUE:
return "CUSPARSE_STATUS_INVALID_VALUE";
case CUSPARSE_STATUS_ARCH_MISMATCH:
return "CUSPARSE_STATUS_ARCH_MISMATCH";
case CUSPARSE_STATUS_MAPPING_ERROR:
return "CUSPARSE_STATUS_MAPPING_ERROR";
case CUSPARSE_STATUS_EXECUTION_FAILED:
return "CUSPARSE_STATUS_EXECUTION_FAILED";
case CUSPARSE_STATUS_INTERNAL_ERROR:
return "CUSPARSE_STATUS_INTERNAL_ERROR";
case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
case CUSPARSE_STATUS_ZERO_PIVOT:
return "CUSPARSE_STATUS_ZERO_PIVOT";
}
return "<unknown>";
}
inline void __cusparseSafeCall(cusparseStatus_t err, const char *file, const int line)
{
if(CUSPARSE_STATUS_SUCCESS != err) {
fprintf(stderr, "CUSPARSE error in file '%s', line %Ndims\Nobjs %s\nerror %Ndims: %s\nterminating!\Nobjs",__FILE__, __LINE__,err, \
_cusparseGetErrorEnum(err)); \
cudaDeviceReset(); assert(0); \
}
}
extern "C" void cusparseSafeCall(cusparseStatus_t err) { __cusparseSafeCall(err, __FILE__, __LINE__); }
/********/
/* MAIN */
/********/
int main()
{
// --- Initialize cuSPARSE
cusparseHandle_t handle; cusparseSafeCall(cusparseCreate(&handle));
const int N = 5; // --- Size of the linear system
// --- Lower diagonal, diagonal and upper diagonal of the system matrix
double *h_ld = (double*)malloc(N * sizeof(double));
double *h_d = (double*)malloc(N * sizeof(double));
double *h_ud = (double*)malloc(N * sizeof(double));
h_ld[0] = 0.;
h_ud[N-1] = 0.;
for (int k = 0; k < N - 1; k++) {
h_ld[k + 1] = -1.;
h_ud[k] = -1.;
}
for (int k = 0; k < N; k++) h_d[k] = 2.;
double *d_ld; gpuErrchk(cudaMalloc(&d_ld, N * sizeof(double)));
double *d_d; gpuErrchk(cudaMalloc(&d_d, N * sizeof(double)));
double *d_ud; gpuErrchk(cudaMalloc(&d_ud, N * sizeof(double)));
gpuErrchk(cudaMemcpy(d_ld, h_ld, N * sizeof(double), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_d, h_d, N * sizeof(double), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_ud, h_ud, N * sizeof(double), cudaMemcpyHostToDevice));
// --- Allocating and defining dense host and device data vectors
double *h_x = (double *)malloc(N * sizeof(double));
h_x[0] = 100.0; h_x[1] = 200.0; h_x[2] = 400.0; h_x[3] = 500.0; h_x[4] = 300.0;
double *d_x; gpuErrchk(cudaMalloc(&d_x, N * sizeof(double)));
gpuErrchk(cudaMemcpy(d_x, h_x, N * sizeof(double), cudaMemcpyHostToDevice));
// --- Allocating the host and device side result vector
double *h_y = (double *)malloc(N * sizeof(double));
double *d_y; gpuErrchk(cudaMalloc(&d_y, N * sizeof(double)));
cusparseSafeCall(cusparseDgtsv(handle, N, 1, d_ld, d_d, d_ud, d_x, N));
cudaMemcpy(h_x, d_x, N * sizeof(double), cudaMemcpyDeviceToHost);
for (int k=0; k<N; k++) printf("%f\n", h_x[k]);
}
At this gitHub repository, a comparison of different CUDA routines available in the cuSOLVER library for the solution of tridiagonal linear systems is reported.
Things I see:
1st __syncthreads() seems redundant.
There are repetitive sets of operations such as (-csub[idx-stride]*asub[idx]/bsub[idx-stride]) in your code. Use intermediate variables to hold the result and reuse them instead of making GPU calculate those sets each time.
Use NVIDIA profiler to see where issues are.

Using CUDA Shared Memory to Improve Global Access Patterns

I have the following kernel to get the magnitude of a bunch of vectors:
__global__ void norm_v1(double *in, double *out, int n)
{
const uint i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n)
{
double x = in[3*i], y = in[3*i+1], z = in[3*i+2];
out[i] = sqrt(x*x + y*y + z*z);
}
}
However due to the packing of in as [x0,y0,z0,...,xn,yn,zn] it performs poorly with the profiler indicating a 32% global load efficiency. Repacking the data as [x0, x1, ..., xn, y0, y1, ..., yn, z0, z1, ..., zn] improves things greatly (with the offsets for x, y, and z changing accordingly). Runtime is down and efficiency is up to 100%.
However, this packing is simply not practical for my application. I therefore wish to investigate the use of shared memory. My idea is for each thread in a block to copy three values (blockDim.x apart) from global memory -- yielding coalesced access. Under the assumption of a maximum blockDim.x = 256 I came up with:
#define BLOCKDIM 256
__global__ void norm_v2(double *in, double *out, int n)
{
__shared__ double invec[3*BLOCKDIM];
const uint i = blockIdx.x * blockDim.x + threadIdx.x;
invec[0*BLOCKDIM + threadIdx.x] = in[0*BLOCKDIM+i];
invec[1*BLOCKDIM + threadIdx.x] = in[1*BLOCKDIM+i];
invec[2*BLOCKDIM + threadIdx.x] = in[2*BLOCKDIM+i];
__syncthreads();
if (i < n)
{
double x = invec[3*threadIdx.x];
double y = invec[3*threadIdx.x+1];
double z = invec[3*threadIdx.x+2];
out[i] = sqrt(x*x + y*y + z*z);
}
}
However this is clearly deficient when n % blockDim.x != 0, requires knowing the maximum blockDim in advance and generates incorrect results for out[i > 255] when tested with an n = 1024. How should I best remedy this?
I think this can solve the out[i > 255] problem:
__shared__ double shIn[3*BLOCKDIM];
const uint blockStart = blockIdx.x * blockDim.x;
invec[0*blockDim.x+threadIdx.x] = in[ blockStart*3 + 0*blockDim.x + threadIdx.x];
invec[1*blockDim.x+threadIdx.x] = in[ blockStart*3 + 1*blockDim.x + threadIdx.x];
invec[2*blockDim.x+threadIdx.x] = in[ blockStart*3 + 2*blockDim.x + threadIdx.x];
__syncthreads();
double x = shIn[3*threadIdx.x];
double y = shIn[3*threadIdx.x+1];
double z = shIn[3*threadIdx.x+2];
out[blockStart+threadIdx.x] = sqrt(x*x + y*y + z*z);
As for n % blockDim.x != 0 I would suggest padding the input/output arrays with 0 to match the requirement.
If you dislike the BLOCKDIM macro - explore using extern __shared__ shArr[] and then passing 3rd parameter to kernel configuration:
norm_v2<<<gridSize,blockSize,dynShMem>>>(...)
the dynShMem is the dynamic shared memory usage (in bytes). This is extra shared memory pool with its size specified at run-time, where all extern __shared__ variables will be initially assigned to.
What GPU are you using? Fermi or Kepler might help your original code with their L1 caching.
If you don't want to pad your in array, or you end up doing similar trick somewhere else, you may want to consider implementing a device-side memcopy, something like this:
template <typename T>
void memCopy(T* destination, T* source, size_t numElements) {
//assuming sizeof(T) is a multiple of sizeof(int)
//assuming one-dimentional kernel (only threadIdx.x and blockDim.x matters)
size_t totalSize = numElements*sizeof(T)/sizeof(int);
int* intDest = (int*)destination;
int* intSrc = (int*)source;
for (size_t i = threadIdx.x; i < totalSize; i += blockDim.x) {
intDest[i] = intSrc[i];
}
__syncthreads();
}
It basically treats any array as an array of int-s and copy the data from one location to another. You may want to replace the underlying int type with double-s or long long int if you work with 64-bit types only.
Then you can replace the copying lines with:
memCopy(invec, in+blockStart*3, min(blockDim.x, n-blockStart));

Performing several 1D moving averages in parallel using CUDA Thrust

I'm not a programmer with any abilities. Just someone curious about CUDA and so I'm doing a little reading. I ran across an example of using Thrust to do a moving average:
Simple Moving Average Thrust Example
The example, such as it is, runs and mostly works correctly. However it's trivial in the sense that it only does one moving average operation.
How I would do say 352 of these moving average operations in parallel, all operating on the same data stream? In my mind the program flow might be:
Generate the data & send it to one CUDA core. (Same as existing code
but think lengths of 1000 or 10000 instead of 30)
Copy it from the CUDA core it's in to all of the the other 351 CUDA
cores in my GTX 465
Tell each CUDA core what number of data items to average over.
(4, 5, 6,..., 352, 353, 354)
Tell the device to run the average in each core in parallel
Read back the results from each core
I get that this code
// compute SMA using standard summation
simple_moving_average(data, w, averages);
makes it all happen, but how to I get Thrust to do many of these in parallel?
My interest here is about something like stock data. If I'm looking at GOOG prices I'd put that in the GPU using all cores and leave it there. I'd then be free to do lots of processing without loading the data anymore and just reading back results from each core. NOTE: I might not want to use GOOG in all cores. Some cores might be GOOG, others with some other symbol, but I'll get there later. I'm just thinking I don't want the stock data in global memory if there's enough room in each core.
I assume this is pretty straightforward for CUDA & Thrust?
Here is the possible way how to do this with arrayfire:
Note that I am NOT affiliated with this library whatsoever.
I am pretty sure this can also be done with thrust
but I found this one a lot simpler with arrayfire.
And if the library is free why can't I use it instead of thrust ?
In arrayfire you can use matrix to run several SMA operations in parallel:
unsigned n_SMAs = 1000; // # of SMA indicators to evaluate
unsigned len = 2000; // # of stock prices per indicator
unsigned w = 6; // window size
// generate stock prices: [0..10]
af::array data = af::randu(n_SMAs, len) * 10;
// compute inclusive prefix sums along colums of the matrix
af::array s = af::accum(data, 1);
// compute the average
af::array avg = (s.cols(w, af::end) - s.cols(0, af::end - w)) / w;
af::eval(avg);
std::cout << avg.dims() << "\n" << avg << "\n";
let me know if that's what you are looking for. This is how I understood your question: compute several SMA indicators in parallel
My understanding is that you are interested into the following two situations:
You have a long sequence of items and you want to calculate a certain number of averages, by averaging on different numbers of items, i.e., using different lengths for the moving average window. This is what I understand from your original question.
You have a series of sequences, stored consecutively in memory, and you want to average them in parallel with a fixed averaging window of size 2 * RADIUS + 1. This is what the ArrayFire code proposed by #asm does - you have accepted it.
Instead of using CUDA Thrust, I think it would be easier to write your own CUDA kernel to do the above operations. Below, a fully worked example that operates in the same way as the ArrayFire code proposed by #asm, thus covering case #2. Modifying it to cover case #1 would be straightforward.
#include <thrust/device_vector.h>
#define RADIUS 3
#define BLOCK_SIZE_X 8
#define BLOCK_SIZE_Y 8
/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/**********/
/* KERNEL */
/**********/
__global__ void moving_average(unsigned int *in, unsigned int *out, unsigned int M, unsigned int N) {
__shared__ unsigned int temp[BLOCK_SIZE_Y][BLOCK_SIZE_X + 2 * RADIUS];
unsigned int gindexx = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int gindexy = threadIdx.y + blockIdx.y * blockDim.y;
unsigned int gindex = gindexy * N + gindexx;
unsigned int lindexx = threadIdx.x + RADIUS;
unsigned int lindexy = threadIdx.y;
// --- Read input elements into shared memory
temp[lindexy][lindexx] = ((gindexx < N)&&(gindexy < M))? in[gindex] : 0;
if (threadIdx.x < RADIUS) {
temp[lindexy][threadIdx.x] = ((gindexx >= RADIUS)&&(gindexx < (N + RADIUS))&&(gindexy < M)) ? in[gindex - RADIUS] : 0;
temp[lindexy][threadIdx.x + (RADIUS + min(BLOCK_SIZE_X, N - blockIdx.x * BLOCK_SIZE_X))] = (((gindexx + min(BLOCK_SIZE_X, N - blockIdx.x * BLOCK_SIZE_X)) < N)&&(gindexy < M))? in[gindexy * N + gindexx + min(BLOCK_SIZE_X, N - blockIdx.x * BLOCK_SIZE_X)] : 0;
if ((threadIdx.y == 0)&&(gindexy < M)&&((gindexx + BLOCK_SIZE_X) < N)&&(gindexy < M)) printf("Inside 2 - tidx = %i; bidx = %i; tidy = %i; bidy = %i; lindexx = %i; temp = %i\n", threadIdx.x, blockIdx.x, threadIdx.y, blockIdx.y, threadIdx.x + (RADIUS + BLOCK_SIZE_X), temp[lindexy][threadIdx.x + (RADIUS + BLOCK_SIZE_X)]);
}
__syncthreads();
// --- Apply the stencil
unsigned int result = 0;
for (int offset = -RADIUS ; offset <= RADIUS ; offset++) {
result += temp[lindexy][lindexx + offset];
}
// --- Store the result
out[gindexy * N + gindexx] = result;
}
/********/
/* MAIN */
/********/
int main() {
const unsigned int M = 2;
const unsigned int N = 4 + 2 * RADIUS;
const unsigned int constant = 3;
thrust::device_vector<unsigned int> d_in(M * N, constant);
thrust::device_vector<unsigned int> d_out(M * N);
dim3 GridSize(iDivUp(N, BLOCK_SIZE_X), iDivUp(M, BLOCK_SIZE_Y));
dim3 BlockSize(BLOCK_SIZE_X, BLOCK_SIZE_Y);
moving_average<<<GridSize, BlockSize>>>(thrust::raw_pointer_cast(d_in.data()), thrust::raw_pointer_cast(d_out.data()), M, N);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
thrust::host_vector<unsigned int> h_out = d_out;
for (int j=0; j<M; j++) {
for (int i=0; i<N; i++)
printf("Element j = %i; i = %i; h_out = %i\n", j, i, h_out[N*j+i]);
}
return 0;
}

Shared memory mutex with CUDA - adding to a list of items

My problem is the following: I have an image in which I detect some points of interest using the GPU. The detection is a heavyweight test in terms of processing, however only about 1 in 25 points pass the test on average. The final stage of the algorithm is to build up a list of the points. On the CPU this would be implemented as:
forall pixels x,y
{
if(test_this_pixel(x,y))
vector_of_coordinates.push_back(Vec2(x,y));
}
On the GPU I have each CUDA block processing 16x16 pixels. The problem is that I need to do something special to eventually have a single consolidated list of points in global memory. At the moment I am trying to generate a local list of points in shared memory per block which eventually will be written to global memory. I am trying to avoid sending anything back to the CPU because there are more CUDA stages after this.
I was expecting that I could use atomic operations to implement the push_back function on shared memory. However I am unable to get this working. There are two issues. The first annoying issue is that I am constantly running into the following compiler crash: "nvcc error : 'ptxas' died with status 0xC0000005 (ACCESS_VIOLATION)" when using atomic operations. It is hit or miss whether I can compile something. Does anyone know what causes this?
The following kernel will reproduce the error:
__global__ void gpu_kernel(int w, int h, RtmPoint *pPoints, int *pCounts)
{
__shared__ unsigned int test;
atomicInc(&test, 1000);
}
Secondly, my code which includes a mutex lock on shared memory hangs the GPU and I dont understand why:
__device__ void lock(unsigned int *pmutex)
{
while(atomicCAS(pmutex, 0, 1) != 0);
}
__device__ void unlock(unsigned int *pmutex)
{
atomicExch(pmutex, 0);
}
__global__ void gpu_kernel_non_max_suppress(int w, int h, RtmPoint *pPoints, int *pCounts)
{
__shared__ RtmPoint localPoints[64];
__shared__ int localCount;
__shared__ unsigned int mutex;
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
int threadid = threadIdx.y * blockDim.x + threadIdx.x;
int blockid = blockIdx.y * gridDim.x + blockIdx.x;
if(threadid==0)
{
localCount = 0;
mutex = 0;
}
__syncthreads();
if(x<w && y<h)
{
if(some_test_on_pixel(x,y))
{
RtmPoint point;
point.x = x;
point.y = y;
// this is a local push_back operation
lock(&mutex);
if(localCount<64) // we should never get >64 points per block
localPoints[localCount++] = point;
unlock(&mutex);
}
}
__syncthreads();
if(threadid==0)
pCounts[blockid] = localCount;
if(threadid<localCount)
pPoints[blockid * 64 + threadid] = localPoints[threadid];
}
In the example code at this site, the author manages to successfully use atomic operations on shared memory, so I am confused as to why my case does not function. If I comment out the lock and unlock lines, the code runs ok, but obviously incorrectly adding to the list.
I would appreciate some advice about why this problem is happening and also perhaps if there is a better solution to achieving the goal, since I am concerned anyway about the performance issues with using atomic operations or mutex locks.
I suggest using prefix-sum to implement that part to increase parallelism. To do that you need to use a shared array. Basically prefix-sum will turn an array (1,1,0,1) into (0,1,2,2,3), i.e., will calculate an in-place running exclusive sum so that you'll get per-thread write indices.
__shared__ uint8_t vector[NUMTHREADS];
....
bool emit = (x<w && y<h);
emit = emit && some_test_on_pixel(x,y);
__syncthreads();
scan(emit, vector);
if (emit) {
pPoints[blockid * 64 + vector[TID]] = point;
}
prefix-sum example:
template <typename T>
__device__ uint32 scan(T mark, T *output) {
#define GET_OUT (pout?output:values)
#define GET_INP (pin?output:values)
__shared__ T values[numWorkers];
int pout=0, pin=1;
int tid = threadIdx.x;
values[tid] = mark;
syncthreads();
for( int offset=1; offset < numWorkers; offset *= 2) {
pout = 1 - pout; pin = 1 - pout;
syncthreads();
if ( tid >= offset) {
GET_OUT[tid] = (GET_INP[tid-offset]) +( GET_INP[tid]);
}
else {
GET_OUT[tid] = GET_INP[tid];
}
syncthreads();
}
if(!pout)
output[tid] =values[tid];
__syncthreads();
return output[numWorkers-1];
#undef GET_OUT
#undef GET_INP
}
Based on recommendations here, I include the code that I used in the end. It uses 16x16 pixel blocks. Note that I am now writing the data out in one global array without breaking it up. I used the global atomicAdd function to compute a base address for each set of results. Since this only gets called once per block, I did not find too much of a slow down, while I gained a lot more convenience by doing this. I'm also avoiding shared buffers for the input and output of prefix_sum. GlobalCount is set to zero prior to the kernel call.
#define BLOCK_THREADS 256
__device__ int prefixsum(int threadid, int data)
{
__shared__ int temp[BLOCK_THREADS*2];
int pout = 0;
int pin = 1;
if(threadid==BLOCK_THREADS-1)
temp[0] = 0;
else
temp[threadid+1] = data;
__syncthreads();
for(int offset = 1; offset<BLOCK_THREADS; offset<<=1)
{
pout = 1 - pout;
pin = 1 - pin;
if(threadid >= offset)
temp[pout * BLOCK_THREADS + threadid] = temp[pin * BLOCK_THREADS + threadid] + temp[pin * BLOCK_THREADS + threadid - offset];
else
temp[pout * BLOCK_THREADS + threadid] = temp[pin * BLOCK_THREADS + threadid];
__syncthreads();
}
return temp[pout * BLOCK_THREADS + threadid];
}
__global__ void gpu_kernel(int w, int h, RtmPoint *pPoints, int *pGlobalCount)
{
__shared__ int write_base;
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
int threadid = threadIdx.y * blockDim.x + threadIdx.x;
int valid = 0;
if(x<w && y<h)
{
if(test_pixel(x,y))
{
valid = 1;
}
}
int index = prefixsum(threadid, valid);
if(threadid==BLOCK_THREADS-1)
{
int total = index + valid;
if(total>64)
total = 64; // global output buffer is limited to 64 points per block
write_base = atomicAdd(pGlobalCount, total); // get a location to write them out
}
__syncthreads(); // ensure write_base is valid for all threads
if(valid)
{
RtmPoint point;
point.x = x;
point.y = y;
if(index<64)
pPoints[write_base + index] = point;
}
}

cuda multiplication

Serial code snippet looks like this:
int i, j;
for(j=0; j<ny; j++)
{
for(i=0; i<nx; i++)
{
x[i + j*nx] *= y[i];
}
}
I converted this to CUDA using this kernel:
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int i,j;
for(tid = 0; tid <nx*ny; tid++)
{
j = tid/nx;
i = tid - j*nx;
x[tid] *= y[i];
}
However the GPU kernel does not give any speedup improvement? Any suggestions on a better solution?? Thanks in advance
If this is the serial code:
int i, j;
for(j=0; j<ny; j++)
{
for(i=0; i<nx; i++)
{
x[i + j*nx] *= y[i];
}
}
then you should be doing this:
__global__ void fn(float *x, int nx)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int j = tid/nx, i = tid - j * nx;
x[tid] *= y[i];
}
fn<<<nx*ny/B, B>>>(x, nx); // with B = 256, 512, etc.
What you're doing is fairly bizarre: you're instructing each thread of the CUDA kernel to iterate over all values of tid between 0 and nx*ny, and compute the same function as your CPU version! Moreover, instead of just iterating over the indices, you're actually doing the loop less efficiently than you did for the CPU version; in other words, you do the same thing in each thread, just less efficiently, than you are doing in 1 thread on the CPU. It's no wonder that this is slower; it should be much, much slower. Your CUDA kernel is:
int **tid** = blockIdx.x * blockDim.x + threadIdx.x;
int i,j;
for(**tid** = 0; **tid** <nx*ny; **tid**++)
{
j = tid/nx;
i = tid - j*nx;
x[tid] *= y[i];
}
This does nx*ny iterations, same as your host code, for each thread; you lose all benefit of the parallelism, since each thread is doing the same thing; you would get the same performance using one thread on the GPU, and the same result!
If this is the verbatim code from your CUDA source file, you need to change it and redo the comparison; if this is code you have written to help explain what your code is doing for a lay non-CUDA audience, then you need to present your actual CUDA code so that we can see what's going on... as it is, the performance analysis I have done - the trivial one - is all you can expect.
Given your comment to this answer:
the nx * ny = 2205; so I used no. of blocks =
(nx*ny+(threads-1))/threads and threads = 64.
is implying you are intending to launch one thread per computation, the correct CUDA implementation would just be:
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int j = tid/nx;
int i = tid - j*nx;
if (tid < (nx*ny))
x[tid] *= y[i];
If you were intending for each thread to compute more than one computation per kernel launch, then you would size the grid to "fill" each of the SM on the target GPU, not use the same number of threads as the input size, and then do something like:
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int gsize = blockDim.x * gridDim.x;
int i,j;
for(; tid <nx*ny; tid+=gsize)
{
j = tid/nx;
i = tid - j*nx;
x[tid] *= y[i];
}
That would get you at least coalesced reads and writes to x, and remove the enormous number of redundant calculations in your posted version. There are a number of further optimizations that could be made, but it would require more information about the problem than has been supplied in the question and subsequent comments. Your indexing scheme contains an integer division and then an integer multiply-add per calculation. That is a lot of overhead for a single FLOP per input value. However, having said all of that, if the problem size I quoted is that actual problem size you are interested in, the GPU will never be faster than even a modest host CPU. You would require many orders of magnitude larger problems to realize useful speed up using the GPU for this sort low arithmetic intensity operation.
How big is the block? it may be that the time needed to copy a small amount of data to the GPU and setup the envirnoment is much longer than the calculation time.
Remember also that CUDA does a jit compile on the first run so to get accurate benchmarking you need to run it many times.
Try this using shared memory. One of the best implementations around:
// Matrices are stored in row-major order:
// M(row, col) = *(M.elements + row * M.stride + col)
typedef struct {
int width;
int height;
int stride; // In number of elements
float *elements;
} Matrix;
// Thread block size
#define BLOCK_SIZE 16
// Get a matrix element
__device__ float GetElement(const Matrix A, int row, int col)
{
return A.elements[row * A.stride + col];
}
// Set a matrix element
__device__ void SetElement(Matrix A, int row, int col, float value)
{
A.elements[row * A.stride + col] = value;
}
// Get the BLOCK_SIZExBLOCK_SIZE sub-matrix Asub of A that is
// located col sub-matrices to the right and row sub-matrices down
// from the upper-left corner of A
__device__ Matrix GetSubMatrix(Matrix A, int row, int col)
{
Matrix Asub;
Asub.width = BLOCK_SIZE; Asub.height = BLOCK_SIZE;
Asub.stride = A.stride;
Asub.elements = &A.elements[A.stride * BLOCK_SIZE * row +
BLOCK_SIZE * col];
return Asub;
}
// Forward declaration of the matrix multiplication kernel
__global__ void MatMulKernel(const Matrix, const Matrix, Matrix);
// Matrix multiplication - Host code
// Matrix dimensions are assumed to be multiples of BLOCK_SIZE
void MatMul(const Matrix A, const Matrix B, Matrix C)
{
// Same as in previous example, except the followings:
// d_A.width = d_A.stride = A.width;
// d_B.width = d_B.stride = B.width;
// d_C.width = d_C.stride = C.width;
}
// Matrix multiplication kernel called by MatMul()
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
// Block row and column
int blockRow = blockIdx.y;
int blockCol = blockIdx.x;
// Each thread block computes one sub-matrix Csub of C
Matrix Csub = GetSubMatrix(C, blockRow, blockCol);
// Each thread computes one element of Csub
// by accumulating results into Cvalue
float Cvalue = 0;
// Thread row and column within Csub
int row = threadIdx.y;
int col = threadIdx.x;
// Loop over all the sub-matrices of A and B that are
// required to compute Csub
// Multiply each pair of sub-matrices together
// and accumulate the results
for (int m = 0; m < (A.width / BLOCK_SIZE); ++m)
{
// Get sub-matrix Asub of A and Bsub of B
Matrix Asub = GetSubMatrix(A, blockRow, m);
Matrix Bsub = GetSubMatrix(B, m, blockCol);
// Shared memory used to store Asub and Bsub respectively
__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
// Load Asub and Bsub from device memory to shared memory
// Each thread loads one element of each sub-matrix
As[row][col] = GetElement(Asub, row, col);
Bs[row][col] = GetElement(Bsub, row, col);
// Synchronize to make sure the sub-matrices are loaded
// before starting the computation
__syncthreads();
// Multiply Asub and Bsub together
for (int e = 0; e < BLOCK_SIZE; ++e)
Cvalue += As[row][e] * Bs[e][col];
// Synchronize to make sure that the preceding
// computation is done before loading two new
// sub-matrices of A and B in the next iteration
__syncthreads();
}
// Write Csub to device memory
// Each thread writes one element
SetElement(Csub, row, col, Cvalue);
}