Performing several 1D moving averages in parallel using CUDA Thrust - cuda

I'm not a programmer with any abilities, just someone curious about CUDA who is doing a little reading. I ran across an example of using Thrust to do a moving average:
Simple Moving Average Thrust Example
The example, such as it is, runs and mostly works correctly. However, it's trivial in the sense that it only does one moving average operation.
How would I do, say, 352 of these moving average operations in parallel, all operating on the same data stream? In my mind, the program flow might be:
1. Generate the data and send it to one CUDA core (same as the existing code, but think lengths of 1000 or 10000 instead of 30).
2. Copy it from the CUDA core it's in to all of the other 351 CUDA cores in my GTX 465.
3. Tell each CUDA core what number of data items to average over (4, 5, 6, ..., 352, 353, 354).
4. Tell the device to run the averages in each core in parallel.
5. Read back the results from each core.
I get that this code
// compute SMA using standard summation
simple_moving_average(data, w, averages);
makes it all happen, but how do I get Thrust to do many of these in parallel?
My interest here is in something like stock data. If I'm looking at GOOG prices I'd put that on the GPU, using all cores, and leave it there. I'd then be free to do lots of processing without loading the data again, just reading back results from each core. NOTE: I might not want to use GOOG on all cores. Some cores might hold GOOG, others some other symbol, but I'll get there later. I'm just thinking I don't want the stock data in global memory if there's enough room in each core.
I assume this is pretty straightforward for CUDA & Thrust?

Here is a possible way to do this with ArrayFire. Note that I am NOT affiliated with this library whatsoever. I am pretty sure this can also be done with Thrust, but I found it a lot simpler with ArrayFire. And if the library is free, why can't I use it instead of Thrust?
In ArrayFire you can use a matrix to run several SMA operations in parallel:
unsigned n_SMAs = 1000; // # of SMA indicators to evaluate
unsigned len = 2000; // # of stock prices per indicator
unsigned w = 6; // window size
// generate stock prices: [0..10]
af::array data = af::randu(n_SMAs, len) * 10;
// compute inclusive prefix sums along columns of the matrix
af::array s = af::accum(data, 1);
// compute the average
af::array avg = (s.cols(w, af::end) - s.cols(0, af::end - w)) / w;
af::eval(avg);
std::cout << avg.dims() << "\n" << avg << "\n";
Let me know if that's what you are looking for. This is how I understood your question: compute several SMA indicators in parallel.
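For reference (this is not part of the original answer), the same prefix-sum idea for a single SMA can be sketched with Thrust; the function and variable names below are mine, and the window convention matches the ArrayFire line above:
#include <thrust/device_vector.h>
#include <thrust/scan.h>
#include <thrust/transform.h>

struct window_average {
    float w;
    window_average(float w_) : w(w_) {}
    __host__ __device__ float operator()(float upper, float lower) const { return (upper - lower) / w; }
};

// d_data holds one price series; d_avg receives the SMA of window w (one output per full window)
void sma_thrust(const thrust::device_vector<float> &d_data, int w, thrust::device_vector<float> &d_avg)
{
    thrust::device_vector<float> d_sums(d_data.size());
    thrust::inclusive_scan(d_data.begin(), d_data.end(), d_sums.begin());   // like af::accum
    d_avg.resize(d_data.size() - w);
    thrust::transform(d_sums.begin() + w, d_sums.end(),                     // s[w], ..., s[n-1]
                      d_sums.begin(),                                       // s[0], ..., s[n-w-1]
                      d_avg.begin(), window_average((float)w));
}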

My understanding is that you are interested in the following two situations:
1. You have a long sequence of items and you want to calculate a certain number of averages, each obtained by averaging over a different number of items, i.e., using different lengths for the moving average window. This is what I understand from your original question.
2. You have a series of sequences, stored consecutively in memory, and you want to average them in parallel with a fixed averaging window of size 2 * RADIUS + 1. This is what the ArrayFire code proposed by #asm does, and which you have accepted.
Instead of using CUDA Thrust, I think it would be easier to write your own CUDA kernel to do the above operations. Below is a fully worked example that operates in the same way as the ArrayFire code proposed by #asm, thus covering case #2. Modifying it to cover case #1 would be straightforward; a sketch of that is shown after the code.
#include <thrust/device_vector.h>
#define RADIUS 3
#define BLOCK_SIZE_X 8
#define BLOCK_SIZE_Y 8
/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/**********/
/* KERNEL */
/**********/
__global__ void moving_average(unsigned int *in, unsigned int *out, unsigned int M, unsigned int N) {
__shared__ unsigned int temp[BLOCK_SIZE_Y][BLOCK_SIZE_X + 2 * RADIUS];
unsigned int gindexx = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int gindexy = threadIdx.y + blockIdx.y * blockDim.y;
unsigned int gindex = gindexy * N + gindexx;
unsigned int lindexx = threadIdx.x + RADIUS;
unsigned int lindexy = threadIdx.y;
// --- Read input elements into shared memory
temp[lindexy][lindexx] = ((gindexx < N)&&(gindexy < M))? in[gindex] : 0;
if (threadIdx.x < RADIUS) {
temp[lindexy][threadIdx.x] = ((gindexx >= RADIUS)&&(gindexx < (N + RADIUS))&&(gindexy < M)) ? in[gindex - RADIUS] : 0;
temp[lindexy][threadIdx.x + (RADIUS + min(BLOCK_SIZE_X, N - blockIdx.x * BLOCK_SIZE_X))] = (((gindexx + min(BLOCK_SIZE_X, N - blockIdx.x * BLOCK_SIZE_X)) < N)&&(gindexy < M))? in[gindexy * N + gindexx + min(BLOCK_SIZE_X, N - blockIdx.x * BLOCK_SIZE_X)] : 0;
if ((threadIdx.y == 0)&&(gindexy < M)&&((gindexx + BLOCK_SIZE_X) < N)&&(gindexy < M)) printf("Inside 2 - tidx = %i; bidx = %i; tidy = %i; bidy = %i; lindexx = %i; temp = %i\n", threadIdx.x, blockIdx.x, threadIdx.y, blockIdx.y, threadIdx.x + (RADIUS + BLOCK_SIZE_X), temp[lindexy][threadIdx.x + (RADIUS + BLOCK_SIZE_X)]);
}
__syncthreads();
// --- Apply the stencil
unsigned int result = 0;
for (int offset = -RADIUS ; offset <= RADIUS ; offset++) {
result += temp[lindexy][lindexx + offset];
}
// --- Store the result
out[gindexy * N + gindexx] = result;
}
/********/
/* MAIN */
/********/
int main() {
const unsigned int M = 2;
const unsigned int N = 4 + 2 * RADIUS;
const unsigned int constant = 3;
thrust::device_vector<unsigned int> d_in(M * N, constant);
thrust::device_vector<unsigned int> d_out(M * N);
dim3 GridSize(iDivUp(N, BLOCK_SIZE_X), iDivUp(M, BLOCK_SIZE_Y));
dim3 BlockSize(BLOCK_SIZE_X, BLOCK_SIZE_Y);
moving_average<<<GridSize, BlockSize>>>(thrust::raw_pointer_cast(d_in.data()), thrust::raw_pointer_cast(d_out.data()), M, N);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
thrust::host_vector<unsigned int> h_out = d_out;
for (int j=0; j<M; j++) {
for (int i=0; i<N; i++)
printf("Element j = %i; i = %i; h_out = %i\n", j, i, h_out[N*j+i]);
}
return 0;
}
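Here is a possible sketch for case #1 (untested against the original code; the names, the output layout, and the zero-padding of the first w-1 samples are my own assumptions). The idea is to compute one inclusive prefix sum of the data, then let one thread compute one output sample of one indicator, with blockIdx.y selecting the window length:
__global__ void multi_window_sma(const float *prefix,   // inclusive prefix sums of the data, length N
                                 float *out,            // out[k * N + i] = SMA with window win[k] ending at sample i
                                 const int *win,        // window lengths, one per indicator (e.g. 4, 5, ..., 354)
                                 int num_windows, int N)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;       // position in the data stream
    int k = blockIdx.y;                                  // which indicator / window length
    if (i >= N || k >= num_windows) return;
    int w = win[k];
    if (i < w - 1) { out[k * N + i] = 0.0f; return; }    // not enough history yet
    float upper = prefix[i];
    float lower = (i == w - 1) ? 0.0f : prefix[i - w];
    out[k * N + i] = (upper - lower) / w;                // average of samples i-w+1, ..., i
}

// Host side (sketch): build the prefix sums once, then launch a 2D grid
//   thrust::inclusive_scan(d_data.begin(), d_data.end(), d_prefix.begin());
//   dim3 block(256, 1), grid(iDivUp(N, 256), num_windows);
//   multi_window_sma<<<grid, block>>>(thrust::raw_pointer_cast(d_prefix.data()),
//                                     d_out_ptr, d_win_ptr, num_windows, N);
All indicators read the same prefix-sum array, which is loaded onto the GPU once, matching the question's wish to keep the data resident and run many indicators against it.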

Related

Wrong sizes for block reduction in CUDA?

I was checking out this sum_reduction.cu example and tutorial and noticed that for certain problem sizes it doesn't work, e.g. it works with problem size n=2000 but not with n=3000. Apparently it always works with problem sizes that are a multiple of the block size, but neither the tutorial nor the example code states so. The question is: does this reduction algorithm only work for certain problem sizes? In the example they chose N=256k, which is even, a power of two and also a multiple of the block size 512.
For self containment I paste the most important bits of (a template version of) the code here:
template<typename T>
__global__ void kernelSum(const T* __restrict__ input, T* __restrict__ per_block_results, const size_t n) {
extern __shared__ T sdata[];
size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
// load input into __shared__ memory
T x = 0.0;
if (tid < n) {
x = input[tid];
}
sdata[threadIdx.x] = x;
__syncthreads();
// contiguous range pattern
for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) {
if(threadIdx.x < offset) {
// add a partial sum upstream to our own
sdata[threadIdx.x] += sdata[threadIdx.x + offset];
}
// wait until all threads in the block have
// updated their partial sums
__syncthreads();
}
// thread 0 writes the final result
if(threadIdx.x == 0) {
per_block_results[blockIdx.x] = sdata[0];
}
}
and to invoke the kernel:
// launch one kernel to compute, per-block, a partial sum
block_sum<double> <<<num_blocks,block_size,block_size * sizeof(double)>>>(d_input, d_partial_sums_and_total, num_elements);
// launch a single block to compute the sum of the partial sums
block_sum<double> <<<1,num_blocks,num_blocks * sizeof(double)>>>(d_partial_sums_and_total, d_partial_sums_and_total + num_blocks, num_blocks);
To my understanding, if the problem size is smaller than the number of threads launched, the statement T x = 0.0; ensures that the extra elements are zeroed out, and thus the reduction should work, but it doesn't?
UPDATE: I am sorry the float/double thing was a typo while preparing the question and not the real problem.
1. The code you have posted is not consistent: your templated kernel is called kernelSum but you are invoking something called block_sum.
2. I don't believe your usage of the templated kernel function could possibly be correct as written:
block_sum<double> <<<num_blocks,block_size,block_size * sizeof(float)>>>(d_input, d_partial_sums_and_total, num_elements);
The type the template is instantiated with (double) and the type used to size the dynamic shared memory allocation (float here) are required to match. The kernel template is being instantiated with type double, therefore it is expecting enough shared memory to store block_size double quantities, based on this line:
extern __shared__ T sdata[];
But you are only passing half of the required storage:
block_size * sizeof(float)
I believe that's going to give you unexpected results.
3. The reduction as written expects that the block dimension is a power of 2, due to this loop:
// contiguous range pattern
for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) {
This is not likely to be an issue on the first kernel call, because you are probably choosing a power of two for the number of threads per block (block_size):
block_sum<double> <<<num_blocks,block_size,...
However, for the second kernel call, this will depend on whether num_blocks is a power of two, which depends on your grid calculations, which you haven't shown:
block_sum<double> <<<1,num_blocks,...
Finally, the first kernel launch will fail if num_blocks exceeds the limit for your device. This may happen for very large data sets, but probably not for a size of 3000, and again it depends on your grid calculations, which you haven't shown.
Item 3 above is a difficult requirement to satisfy on the fly for arbitrary vector sizes. Therefore I would suggest an alternate reduction strategy to handle arbitrary sized vectors. For this I would suggest that you study the CUDA reduction sample code and presentation.
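As an illustration only (this is not the code from the CUDA sample, and the names are mine; float is used because atomicAdd on double requires a newer GPU), one such strategy is a grid-stride loop feeding a fixed-size grid, with each block's partial sum combined through an atomic add. The block size is chosen by us, so the power-of-two assumption of the in-block loop is always satisfied and any vector length n is handled:
#define THREADS 256   // power of two, chosen independently of n

__global__ void reduce_arbitrary(const float * __restrict__ in, float *out, size_t n)
{
    __shared__ float sdata[THREADS];
    // grid-stride loop: each thread accumulates as many elements as needed
    float sum = 0.0f;
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += (size_t)blockDim.x * gridDim.x)
        sum += in[i];
    sdata[threadIdx.x] = sum;
    __syncthreads();
    // in-block tree reduction; safe because THREADS is a power of two
    for (int offset = THREADS / 2; offset > 0; offset >>= 1) {
        if (threadIdx.x < offset) sdata[threadIdx.x] += sdata[threadIdx.x + offset];
        __syncthreads();
    }
    // one atomic per block combines the partial sums; *out must be zeroed beforehand
    if (threadIdx.x == 0) atomicAdd(out, sdata[0]);
}

// e.g.: reduce_arbitrary<<<64, THREADS>>>(d_in, d_out, n);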
Here's a complete program, mostly based on the code you have shown, that has the above issues addressed, and seems to work for me for a size of 3000:
#include <stdio.h>
#include <stdlib.h>
#define DSIZE 3000
#define nTPB 256
template<typename T>
__global__ void block_sum(const T* __restrict__ input, T* __restrict__ per_block_results, const size_t n) {
extern __shared__ T sdata[];
size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
// load input into __shared__ memory
T x = 0.0;
if (tid < n) {
x = input[tid];
}
sdata[threadIdx.x] = x;
__syncthreads();
// contiguous range pattern
for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) {
if(threadIdx.x < offset) {
// add a partial sum upstream to our own
sdata[threadIdx.x] += sdata[threadIdx.x + offset];
}
// wait until all threads in the block have
// updated their partial sums
__syncthreads();
}
// thread 0 writes the final result
if(threadIdx.x == 0) {
per_block_results[blockIdx.x] = sdata[0];
}
}
int main(){
double *d_input, *d_partial_sums_and_total, *h_input, *h_partial_sums_and_total;
int num_elements=DSIZE;
int block_size = nTPB;
int num_blocks = (num_elements + block_size -1)/block_size;
// bump num_blocks up to the next power of 2
int done = 0;
int test_val = 1;
while (!done){
if (test_val >= num_blocks){
num_blocks = test_val;
done = 1;}
else test_val *= 2;
if (test_val > 65535) {printf("blocks failure\n"); exit(1);}
}
h_input = (double *)malloc(num_elements * sizeof(double));
h_partial_sums_and_total = (double *)malloc((num_blocks+1)*sizeof(double));
cudaMalloc((void **)&d_input, num_elements * sizeof(double));
cudaMalloc((void **)&d_partial_sums_and_total, (num_blocks+1)*sizeof(double));
double h_result = 0.0;
for (int i = 0; i < num_elements; i++) {
h_input[i] = rand()/(double)RAND_MAX;
h_result += h_input[i];}
cudaMemcpy(d_input, h_input, num_elements*sizeof(double), cudaMemcpyHostToDevice);
cudaMemset(d_partial_sums_and_total, 0, (num_blocks+1)*sizeof(double));
// launch one kernel to compute, per-block, a partial sum
block_sum<double> <<<num_blocks,block_size,block_size * sizeof(double)>>>(d_input, d_partial_sums_and_total, num_elements);
// launch a single block to compute the sum of the partial sums
block_sum<double> <<<1,num_blocks,num_blocks * sizeof(double)>>>(d_partial_sums_and_total, d_partial_sums_and_total + num_blocks, num_blocks);
cudaMemcpy(h_partial_sums_and_total, d_partial_sums_and_total, (num_blocks+1)*sizeof(double), cudaMemcpyDeviceToHost);
printf("host result = %lf\n", h_result);
printf("device result = %lf\n", h_partial_sums_and_total[num_blocks]);
}
For brevity/readability, I have dispensed with error checking in the above code. When having difficulty with a cuda code, you should always do proper cuda error checking.
Also, in the future, you will make it easier for others to help you if you post a complete code to demonstrate what you are doing, as I have done above.

Solving tridiagonal linear systems in CUDA

I am trying to implement a tridiagonal system solver based on the Cyclic Reduction method on my GTS450.
Cyclic Reduction is illustrated in this paper
Y. Zhang, J. Cohen, J.D. Owens, "Fast Tridiagonal Solvers on GPU"
However, whatever I do, my CUDA code is far slower than the sequential counterpart. My result for a total of 512 x 512 points is 7 ms, while on my i7 3.4GHz it is 5 ms. The GPU is not giving any acceleration!
What could be the problem?
#include "cutrid.cuh"
__global__ void cutrid_RC_1b(double *a,double *b,double *c,double *d,double *x)
{
int idx_global=blockIdx.x*blockDim.x+threadIdx.x;
int idx=threadIdx.x;
__shared__ double asub[512];
__shared__ double bsub[512];
__shared__ double csub[512];
__shared__ double dsub[512];
double at=0;
double bt=0;
double ct=0;
double dt=0;
asub[idx]=a[idx_global];
bsub[idx]=b[idx_global];
csub[idx]=c[idx_global];
dsub[idx]=d[idx_global];
for(int stride=1;stride<N;stride*=2)
{
int margin_left,margin_right;
margin_left=idx-stride;
margin_right=idx+stride;
at=(margin_left>=0)?(-csub[idx-stride]*asub[idx]/bsub[idx-stride]):0.f;
bt=bsub[idx]+((margin_left>=0)?(-csub[idx-stride]*asub[idx]/bsub[idx-stride]):0.f)
-((margin_right<512)?asub[idx+stride]*csub[idx]/bsub[idx+stride]:0.f);
ct=(margin_right<512)?(-csub[idx+stride]*asub[idx]/bsub[idx+stride]):0.f;
dt=dsub[idx]+((margin_left>=0)?(-dsub[idx-stride]*asub[idx]/bsub[idx-stride]):0.f)
-((margin_right<512)?dsub[idx+stride]*csub[idx]/bsub[idx+stride]:0.f);
__syncthreads();
asub[idx]=at;
bsub[idx]=bt;
csub[idx]=ct;
dsub[idx]=dt;
__syncthreads();
}
x[idx_global]=dsub[idx]/bsub[idx];
}/*}}}*/
I launched this kernel by cutrid_RC_1b<<<512,512>>>(d_a,d_b,d_c,d_d,d_x), and reached 100% device occupancy. This result has puzzled me for days.
There is an improved version of my code:
#include "cutrid.cuh"
__global__ void cutrid_RC_1b(float *a,float *b,float *c,float *d,float *x)
{/*{{{*/
int idx_global=blockIdx.x*blockDim.x+threadIdx.x;
int idx=threadIdx.x;
__shared__ float asub[512];
__shared__ float bsub[512];
__shared__ float csub[512];
__shared__ float dsub[512];
asub[idx]=a[idx_global];
bsub[idx]=b[idx_global];
csub[idx]=c[idx_global];
dsub[idx]=d[idx_global];
__syncthreads();
//Reduction
for(int stride=1;stride<512;stride*=2)
{
int margin_left=(idx-stride);
int margin_right=(idx+stride);
if(margin_left<0) margin_left=0;
if(margin_right>=512) margin_right=511;
float tmp1 = asub[idx] / bsub[margin_left];
float tmp2 = csub[idx] / bsub[margin_right];
float tmp3 = dsub[margin_right];
float tmp4 = dsub[margin_left];
__syncthreads();
dsub[idx] = dsub[idx] - tmp4*tmp1-tmp3*tmp2;
bsub[idx] = bsub[idx]-csub[margin_left]*tmp1-asub[margin_right]*tmp2;
tmp3 = -csub[margin_right];
tmp4 = -asub[margin_left];
__syncthreads();
asub[idx] = tmp3*tmp1;
csub[idx] = tmp4*tmp2;
__syncthreads();
}
x[idx_global]=dsub[idx]/bsub[idx];
}/*}}}*/
The speed is improved to 0.73 ms on a Quadro K4000 for a 512 x 512 system; however, the code in the mentioned paper runs in 0.5 ms on a GTX 280.
Solving a tridiagonal system of equations is a challenging parallel problem since the classical solution scheme, i.e., Gaussian elimination, is inherently sequential.
Cyclic Reduction consists of two phases:
Forward Reduction. The original system is split in two independent tridiagonal systems for two sets of unknowns, the ones with odd index and the ones with even index. Such systems can be solved independently and this step can be seen as the first of a divide et impera scheme. The two smaller systems are split again in the same way in two subsystems and the process is repeated until a system of only 2 equations is reached.
Backward Substitution. The system of 2 equations is solved first. Then, the divide et impera structure is climbed up by solving the sub-systems independently on different cores.
I'm not sure (but correct me if I'm wrong) that your code will return consistent results. N does not appear to be defined. Also, you are accessing csub[idx-stride], but I'm not sure what that means when idx==0 and stride>1. Furthermore, you are using several conditional statements, essentially for boundary checking. Finally, your code lacks a proper thread structure capable of dealing with the mentioned divide et impera scheme, conceptually much like the one used in the CUDA SDK reduction samples.
As mentioned in one of my comments above, I remembered that at tridiagonalsolvers you can find an implementation of the Cyclic Reduction scheme for solving tridiagonal equation systems. Browsing the related Google pages, it seems to me that the code is maintained, among others, by the first author of the above paper (Yao Zhang). The code is copied and pasted below. Note that the boundary check is done only once (if (iRight >= systemSize) iRight = systemSize - 1;), thus limiting the number of conditional statements involved. Note also the thread structure, which is capable of dealing with a divide et impera scheme.
The code by Zhang, Cohen and Owens
__global__ void crKernel(T *d_a, T *d_b, T *d_c, T *d_d, T *d_x)
{
int thid = threadIdx.x;
int blid = blockIdx.x;
int stride = 1;
int numThreads = blockDim.x;
const unsigned int systemSize = blockDim.x * 2;
int iteration = (int)log2(T(systemSize/2));
#ifdef GPU_PRINTF
if (thid == 0 && blid == 0) printf("iteration = %d\n", iteration);
#endif
__syncthreads();
extern __shared__ char shared[];
T* a = (T*)shared;
T* b = (T*)&a[systemSize];
T* c = (T*)&b[systemSize];
T* d = (T*)&c[systemSize];
T* x = (T*)&d[systemSize];
a[thid] = d_a[thid + blid * systemSize];
a[thid + blockDim.x] = d_a[thid + blockDim.x + blid * systemSize];
b[thid] = d_b[thid + blid * systemSize];
b[thid + blockDim.x] = d_b[thid + blockDim.x + blid * systemSize];
c[thid] = d_c[thid + blid * systemSize];
c[thid + blockDim.x] = d_c[thid + blockDim.x + blid * systemSize];
d[thid] = d_d[thid + blid * systemSize];
d[thid + blockDim.x] = d_d[thid + blockDim.x + blid * systemSize];
__syncthreads();
//forward elimination
for (int j = 0; j <iteration; j++)
{
__syncthreads();
stride *= 2;
int delta = stride/2;
if (threadIdx.x < numThreads)
{
int i = stride * threadIdx.x + stride - 1;
int iLeft = i - delta;
int iRight = i + delta;
if (iRight >= systemSize) iRight = systemSize - 1;
T tmp1 = a[i] / b[iLeft];
T tmp2 = c[i] / b[iRight];
b[i] = b[i] - c[iLeft] * tmp1 - a[iRight] * tmp2;
d[i] = d[i] - d[iLeft] * tmp1 - d[iRight] * tmp2;
a[i] = -a[iLeft] * tmp1;
c[i] = -c[iRight] * tmp2;
}
numThreads /= 2;
}
if (thid < 2)
{
int addr1 = stride - 1;
int addr2 = 2 * stride - 1;
T tmp3 = b[addr2]*b[addr1]-c[addr1]*a[addr2];
x[addr1] = (b[addr2]*d[addr1]-c[addr1]*d[addr2])/tmp3;
x[addr2] = (d[addr2]*b[addr1]-d[addr1]*a[addr2])/tmp3;
}
// backward substitution
numThreads = 2;
for (int j = 0; j <iteration; j++)
{
int delta = stride/2;
__syncthreads();
if (thid < numThreads)
{
int i = stride * thid + stride/2 - 1;
if(i == delta - 1)
x[i] = (d[i] - c[i]*x[i+delta])/b[i];
else
x[i] = (d[i] - a[i]*x[i-delta] - c[i]*x[i+delta])/b[i];
}
stride /= 2;
numThreads *= 2;
}
__syncthreads();
d_x[thid + blid * systemSize] = x[thid];
d_x[thid + blockDim.x + blid * systemSize] = x[thid + blockDim.x];
}
I want to add a further answer to mention that tridiagonal systems can be easily solved in the framework of the cuSPARSE library with the aid of the function
cusparse<t>gtsv()
cuSPARSE also provides
cusparse<t>gtsv_nopivot()
which, at variance with the first mentioned routine, does not perform pivoting. Both the above functions solve the same linear system with multiple right hand sides. A batched routine
cusparse<t>gtsvStridedBatch()
also exists which solves multiple linear systems.
For all the above routines, the system matrix is fixed by simply specifying the lower diagonal, the main diagonal and the upper diagonal.
Below, I'm reporting a fully worked out example using cusparse<t>gtsv() to solve a tridiagonal linear system.
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <assert.h>
#include <cuda_runtime.h>
#include <cusparse_v2.h>
/********************/
/* CUDA ERROR CHECK */
/********************/
// --- Credit to http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) { exit(code); }
}
}
extern "C" void gpuErrchk(cudaError_t ans) { gpuAssert((ans), __FILE__, __LINE__); }
/***************************/
/* CUSPARSE ERROR CHECKING */
/***************************/
static const char *_cusparseGetErrorEnum(cusparseStatus_t error)
{
switch (error)
{
case CUSPARSE_STATUS_SUCCESS:
return "CUSPARSE_STATUS_SUCCESS";
case CUSPARSE_STATUS_NOT_INITIALIZED:
return "CUSPARSE_STATUS_NOT_INITIALIZED";
case CUSPARSE_STATUS_ALLOC_FAILED:
return "CUSPARSE_STATUS_ALLOC_FAILED";
case CUSPARSE_STATUS_INVALID_VALUE:
return "CUSPARSE_STATUS_INVALID_VALUE";
case CUSPARSE_STATUS_ARCH_MISMATCH:
return "CUSPARSE_STATUS_ARCH_MISMATCH";
case CUSPARSE_STATUS_MAPPING_ERROR:
return "CUSPARSE_STATUS_MAPPING_ERROR";
case CUSPARSE_STATUS_EXECUTION_FAILED:
return "CUSPARSE_STATUS_EXECUTION_FAILED";
case CUSPARSE_STATUS_INTERNAL_ERROR:
return "CUSPARSE_STATUS_INTERNAL_ERROR";
case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
case CUSPARSE_STATUS_ZERO_PIVOT:
return "CUSPARSE_STATUS_ZERO_PIVOT";
}
return "<unknown>";
}
inline void __cusparseSafeCall(cusparseStatus_t err, const char *file, const int line)
{
if(CUSPARSE_STATUS_SUCCESS != err) {
fprintf(stderr, "CUSPARSE error in file '%s', line %Ndims\Nobjs %s\nerror %Ndims: %s\nterminating!\Nobjs",__FILE__, __LINE__,err, \
_cusparseGetErrorEnum(err)); \
cudaDeviceReset(); assert(0); \
}
}
extern "C" void cusparseSafeCall(cusparseStatus_t err) { __cusparseSafeCall(err, __FILE__, __LINE__); }
/********/
/* MAIN */
/********/
int main()
{
// --- Initialize cuSPARSE
cusparseHandle_t handle; cusparseSafeCall(cusparseCreate(&handle));
const int N = 5; // --- Size of the linear system
// --- Lower diagonal, diagonal and upper diagonal of the system matrix
double *h_ld = (double*)malloc(N * sizeof(double));
double *h_d = (double*)malloc(N * sizeof(double));
double *h_ud = (double*)malloc(N * sizeof(double));
h_ld[0] = 0.;
h_ud[N-1] = 0.;
for (int k = 0; k < N - 1; k++) {
h_ld[k + 1] = -1.;
h_ud[k] = -1.;
}
for (int k = 0; k < N; k++) h_d[k] = 2.;
double *d_ld; gpuErrchk(cudaMalloc(&d_ld, N * sizeof(double)));
double *d_d; gpuErrchk(cudaMalloc(&d_d, N * sizeof(double)));
double *d_ud; gpuErrchk(cudaMalloc(&d_ud, N * sizeof(double)));
gpuErrchk(cudaMemcpy(d_ld, h_ld, N * sizeof(double), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_d, h_d, N * sizeof(double), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_ud, h_ud, N * sizeof(double), cudaMemcpyHostToDevice));
// --- Allocating and defining dense host and device data vectors
double *h_x = (double *)malloc(N * sizeof(double));
h_x[0] = 100.0; h_x[1] = 200.0; h_x[2] = 400.0; h_x[3] = 500.0; h_x[4] = 300.0;
double *d_x; gpuErrchk(cudaMalloc(&d_x, N * sizeof(double)));
gpuErrchk(cudaMemcpy(d_x, h_x, N * sizeof(double), cudaMemcpyHostToDevice));
// --- Allocating the host and device side result vector
double *h_y = (double *)malloc(N * sizeof(double));
double *d_y; gpuErrchk(cudaMalloc(&d_y, N * sizeof(double)));
cusparseSafeCall(cusparseDgtsv(handle, N, 1, d_ld, d_d, d_ud, d_x, N));
cudaMemcpy(h_x, d_x, N * sizeof(double), cudaMemcpyDeviceToHost);
for (int k=0; k<N; k++) printf("%f\n", h_x[k]);
}
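Not part of the original example: if many independent tridiagonal systems of the same size have to be solved at once, the batched routine mentioned above could be invoked roughly as follows. This is a sketch, assuming the legacy cusparse<t>gtsvStridedBatch() signature and that the diagonals and right hand sides of all systems are stored back to back on the device; the variable names are mine:
const int m           = 5;      // size of each tridiagonal system
const int batchCount  = 100;    // number of systems to solve
const int batchStride = m;      // distance (in elements) between consecutive systems

// d_dl, d_d, d_du, d_x: device arrays of length batchCount * batchStride, holding the
// lower diagonals, main diagonals, upper diagonals and right hand sides of all systems;
// on exit, d_x holds the solutions.
cusparseSafeCall(cusparseDgtsvStridedBatch(handle, m, d_dl, d_d, d_du, d_x,
                                           batchCount, batchStride));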
At this gitHub repository, a comparison of different CUDA routines available in the cuSOLVER library for the solution of tridiagonal linear systems is reported.
Things I see:
1. The first __syncthreads() seems redundant.
2. There are repetitive sets of operations, such as (-csub[idx-stride]*asub[idx]/bsub[idx-stride]), in your code. Use intermediate variables to hold the result and reuse them instead of making the GPU calculate those expressions each time (a sketch follows this list).
3. Use the NVIDIA profiler to see where the issues are.
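A minimal sketch of the second point (not the poster's actual refactoring; the expressions are copied from the first kernel above): compute each repeated factor once per iteration, keep it in a register, and reuse it:
// inside the reduction loop: hoist the repeated subexpressions into registers
float left_term  = (margin_left  >= 0)  ? -csub[idx - stride] * asub[idx] / bsub[idx - stride] : 0.f;
float right_term = (margin_right < 512) ?  asub[idx + stride] * csub[idx] / bsub[idx + stride] : 0.f;

at = left_term;
bt = bsub[idx] + left_term - right_term;
// ... ct and dt can reuse their own hoisted factors in the same way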

GPU gives no performance improvement in Julia set computation

I am trying to compare the performance of the CPU and the GPU. I have
CPU : Intel® Core™ i5 CPU M 480 @ 2.67GHz × 4
GPU : NVidia GeForce GT 420M
I can confirm that the GPU is configured and works correctly with CUDA.
I am implementing Julia set computation. http://en.wikipedia.org/wiki/Julia_set
Basically, for every pixel, if the coordinate is in the set it is painted red; otherwise it is painted white.
Although I get identical answers with both the CPU and the GPU, instead of a performance improvement I get a performance penalty by using the GPU.
Running times
CPU : 0.052s
GPU : 0.784s
I am aware that transferring data from device to host can take up some time.
But still, how do I know whether using the GPU is actually beneficial?
Here is the relevant GPU code
#include <stdio.h>
#include <cuda.h>
__device__ bool isJulia( float x, float y, float maxX_2, float maxY_2 )
{
float z_r = 0.8 * (float) (maxX_2 - x) / maxX_2;
float z_i = 0.8 * (float) (maxY_2 - y) / maxY_2;
float c_r = -0.8;
float c_i = 0.156;
for( int i=1 ; i<100 ; i++ )
{
float tmp_r = z_r*z_r - z_i*z_i + c_r;
float tmp_i = 2*z_r*z_i + c_i;
z_r = tmp_r;
z_i = tmp_i;
if( sqrt( z_r*z_r + z_i*z_i ) > 1000 )
return false;
}
return true;
}
__global__ void kernel( unsigned char * im, int dimx, int dimy )
{
//int tid = blockIdx.y*gridDim.x + blockIdx.x;
int tid = blockIdx.x*blockDim.x + threadIdx.x;
tid *= 3;
if( isJulia((float)blockIdx.x, (float)threadIdx.x, (float)dimx/2, (float)dimy/2)==true )
{
im[tid] = 255;
im[tid+1] = 0;
im[tid+2] = 0;
}
else
{
im[tid] = 255;
im[tid+1] = 255;
im[tid+2] = 255;
}
}
int main()
{
int dimx=768, dimy=768;
//on cpu
unsigned char * im = (unsigned char*) malloc( 3*dimx*dimy );
//on GPU
unsigned char * im_dev;
//allocate mem on GPU
cudaMalloc( (void**)&im_dev, 3*dimx*dimy );
//launch kernel.
for( int z=0 ; z<10000 ; z++ ) // loop for multiple times computation
{
kernel<<<dimx,dimy>>>(im_dev, dimx, dimy);
}
cudaMemcpy( im, im_dev, 3*dimx*dimy, cudaMemcpyDeviceToHost );
writePPMImage( im, dimx, dimy, 3, "out_gpu.ppm" ); //assume this writes a ppm file
free( im );
cudaFree( im_dev );
}
Here is the CPU code
bool isJulia( float x, float y, float maxX_2, float maxY_2 )
{
float z_r = 0.8 * (float) (maxX_2 - x) / maxX_2;
float z_i = 0.8 * (float) (maxY_2 - y) / maxY_2;
float c_r = -0.8;
float c_i = 0.156;
for( int i=1 ; i<100 ; i++ )
{
float tmp_r = z_r*z_r - z_i*z_i + c_r;
float tmp_i = 2*z_r*z_i + c_i;
z_r = tmp_r;
z_i = tmp_i;
if( sqrt( z_r*z_r + z_i*z_i ) > 1000 )
return false;
}
return true;
}
#include <stdlib.h>
#include <stdio.h>
int main(void)
{
const int dimx = 768, dimy = 768;
int i, j;
unsigned char * data = new unsigned char[dimx*dimy*3];
for( int z=0 ; z<10000 ; z++ ) // loop for multiple times computation
{
for (j = 0; j < dimy; ++j)
{
for (i = 0; i < dimx; ++i)
{
if( isJulia(i,j,dimx/2,dimy/2) == true )
{
data[3*j*dimx + 3*i + 0] = (unsigned char)255; /* red */
data[3*j*dimx + 3*i + 1] = (unsigned char)0; /* green */
data[3*j*dimx + 3*i + 2] = (unsigned char)0; /* blue */
}
else
{
data[3*j*dimx + 3*i + 0] = (unsigned char)255; /* red */
data[3*j*dimx + 3*i + 1] = (unsigned char)255; /* green */
data[3*j*dimx + 3*i + 2] = (unsigned char)255; /* blue */
}
}
}
}
writePPMImage( data, dimx, dimy, 3, "out_cpu.ppm" ); //assume this writes a ppm file
delete [] data;
return 0;
}
Further, following suggestions from #hyde, I have looped the computation-only part to generate 10,000 images. I am not bothering to write all those images, though. Computation only is what I am doing.
Here are the running times
CPU : more than 10min and code still running
GPU : 1m 14.765s
Turning comments to answer:
To get relevant figures, you need to calculate more than one image, so that the execution time is at least seconds or tens of seconds. Also, including file saving time in the results is going to add noise and hide the actual CPU vs GPU difference.
Another way to get real results is to select a Julia set which has a lot of points belonging to the set, then up the iteration count so high that it takes many seconds to calculate just one image. Then there is only a single calculation setup, so this is likely to be the most advantageous scenario for GPU/CUDA.
To measure how much overhead there is, change the image size to 1x1 and the iteration limit to 1, and then calculate enough images that it takes at least a few seconds. In this scenario, the GPU is likely significantly slower.
To get the most relevant timings for your use case, select the image size and iteration count you are really going to use, and then find the image count at which both versions are equally fast. That will give you a rough rule of thumb for deciding which to use when.
An alternative approach for practical results, if you are going to compute just one image: find the iteration limit for a single worst-case image at which the CPU and GPU are equally fast. If that many or more iterations would be advantageous, choose the GPU; otherwise choose the CPU.
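As an aside (not part of the answer above), the GPU compute time alone can be measured with CUDA events, which keeps file saving and other host work out of the measurement; the kernel name and arguments below are the ones from the question:
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);
for (int z = 0; z < 10000; z++)              // computation-only loop, as in the question
    kernel<<<dimx, dimy>>>(im_dev, dimx, dimy);
cudaEventRecord(stop);
cudaEventSynchronize(stop);                  // wait until all launched kernels have finished

float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);      // elapsed GPU time in milliseconds
printf("GPU compute time: %f ms\n", ms);

cudaEventDestroy(start);
cudaEventDestroy(stop);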

2D kernel calling and launch parameters for non-square matrix

I am attempting to port the following (simplified) nested loop as a CUDA 2D kernel. The sizes of NgS and NgO will increase with larger data sets; for now I just want to get this kernel to output the correct results for all values:
// macro that translates 2D [i][j] array indices to 1D flattened array indices
#define idx(i,j,lda) ( (j) + ((i)*(lda)) )
int NgS = 1859;
int NgO = 900;
// 1D flattened matrices have been initialized as:
Radio_cpu = new double [NgS*NgO];
Result_cpu = new double [NgS*NgO];
// ignoring the part where they are filled w/ data
for (m=0; m<NgO; m++) {
for (n=0; n<NgS; n++) {
Result_cpu[idx(n,m,NgO)] = k0*Radio_cpu[idx(n,m,NgO)];
}
}
The examples I have come across usually deal with square loops, and I have been unable to get the correct output for all the GPU array indices compared to the CPU version. Here is the host code calling the kernel:
dim3 dimBlock(16, 16);
dim3 dimGrid;
dimGrid.x = (NgO + dimBlock.x - 1) / dimBlock.x;
dimGrid.y = (NgS + dimBlock.y - 1) / dimBlock.y;
// Result_gpu and Radio_gpu are allocated versions of the CPU variables on GPU
trans<<<dimGrid,dimBlock>>>(NgO, NgS, k0, Radio_gpu, Result_gpu);
Here is the kernel:
__global__ void trans(int NgO, int NgS,
double k0, double * Radio, double * Result) {
int n = blockIdx.x * blockDim.x + threadIdx.x;
int m = blockIdx.y * blockDim.y + threadIdx.y;
if(n > NgS || m > NgO) return;
// map the two 2D indices to a single linear, 1D index
int grid_width = gridDim.x * blockDim.x;
int idxxx = m + (n * grid_width);
Result[idxxx] = k0 * Radio[idxxx];
}
With the current code, I proceeded to compare the Result_cpu variable with Result_gpu variable once copied back. When I cycle through the values I get:
// matches from NgS = 0...913
Result_gpu[NgS = 913][NgO = 0]: -56887.2
Result_cpu[Ngs = 913][NgO = 0]: -56887.2
// mismatches from NgS = 914...1858
Result_gpu[NgS = 914][NgO = 0]: -12.2352
Result_cpu[NgS = 914][NgO = 0]: 79448.6
This pattern is the same regardless of the value of NgO. I have spent a few hours trying to figure out where I made a mistake by looking at various examples and trying out changes, but so far this scheme has worked apart from the obvious issue at hand, whereas the others have caused kernel invocation errors or left the GPU array uninitialized for all values. Since I clearly cannot see the mistake, I'd really appreciate it if someone could point me in the right direction towards a fix. I'm pretty sure it's right under my nose and I just can't see it.
In case it matters, I'm testing this code on a Kepler card, compiling using MSVC 2010, CUDA 4.2 and 304.79 driver and have compiled the code with both arch=compute_20,code=sm_20 and arch=compute_30,code=compute_30 flags with no difference.
#vaca_loca: I tested the following kernel (it works for me also with non-square block dimensions):
__global__ void trans(int NgO, int NgS,
double k0, double * Radio, double * Result) {
int n = blockIdx.x * blockDim.x + threadIdx.x;
int m = blockIdx.y * blockDim.y + threadIdx.y;
if(n >= NgO || m >= NgS) return;
int ofs = m * NgO + n;
Result[ofs] = k0 * Radio[ofs];
}
void test() {
int NgS = 1859, NgO = 900;
int data_sz = NgS * NgO, bytes = data_sz * sizeof(double);
cudaSetDevice(0);
double *Radio_cpu = new double [data_sz*3],
*Result_cpu = Radio_cpu + data_sz,
*Result_gpu = Result_cpu + data_sz;
double k0 = -1.7961233;
srand48(time(NULL));
int i, j, n, m;
for(m=0; m<NgO; m++) {
for (n=0; n<NgS; n++) {
Radio_cpu[m + n*NgO] = lrand48() % 234234;
Result_cpu[m + n*NgO] = k0*Radio_cpu[m + n*NgO];
}
}
double *g_Radio, *g_Result;
cudaMalloc((void **)&g_Radio, bytes * 2);
g_Result = g_Radio + data_sz;
cudaMemcpy(g_Radio, Radio_cpu, bytes, cudaMemcpyHostToDevice);
dim3 dimBlock(16, 16);
dim3 dimGrid;
dimGrid.x = (NgO + dimBlock.x - 1) / dimBlock.x;
dimGrid.y = (NgS + dimBlock.y - 1) / dimBlock.y;
trans<<<dimGrid,dimBlock>>>(NgO, NgS, k0, g_Radio, g_Result);
cudaMemcpy(Result_gpu, g_Result, bytes, cudaMemcpyDeviceToHost);
for(m=0; m<NgO; m++) {
for (n=0; n<NgS; n++) {
double c1 = Result_cpu[m + n*NgO],
c2 = Result_gpu[m + n*NgO];
if(std::abs(c1-c2) > 1e-4)
printf("(%d;%d): %.7f %.7f\n", n, m, c1, c2);
}
}
cudaFree(g_Radio);
delete []Radio_cpu;
}
Though, in my opinion, accessing data from global memory in quads might not be very cache-friendly, since the access stride is pretty large. You might consider using 2D textures instead if it's critical for your algorithm to access data with 2D locality.
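For what it's worth, here is a rough sketch of what the 2D-texture suggestion could look like with the texture object API. This is an illustration only: it uses float (2D textures do not natively support double), and all names are mine, not from the answer:
// Kernel reading the matrix through a 2D texture object (float elements)
__global__ void scale_tex(cudaTextureObject_t tex, float k0, float *out, int width, int height)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height) return;
    float v = tex2D<float>(tex, x + 0.5f, y + 0.5f);   // +0.5f samples the texel center
    out[y * width + x] = k0 * v;
}

// Host-side setup (sketch): copy the matrix into a CUDA array and wrap it in a texture object
void make_texture(const float *h_src, int width, int height, cudaArray_t *arr, cudaTextureObject_t *tex)
{
    cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
    cudaMallocArray(arr, &desc, width, height);
    cudaMemcpy2DToArray(*arr, 0, 0, h_src, width * sizeof(float),
                        width * sizeof(float), height, cudaMemcpyHostToDevice);

    cudaResourceDesc resDesc = {};
    resDesc.resType = cudaResourceTypeArray;
    resDesc.res.array.array = *arr;

    cudaTextureDesc texDesc = {};
    texDesc.addressMode[0] = cudaAddressModeClamp;
    texDesc.addressMode[1] = cudaAddressModeClamp;
    texDesc.filterMode     = cudaFilterModePoint;
    texDesc.readMode       = cudaReadModeElementType;

    cudaCreateTextureObject(tex, &resDesc, &texDesc, NULL);
}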

cuda multiplication

Serial code snippet looks like this:
int i, j;
for(j=0; j<ny; j++)
{
for(i=0; i<nx; i++)
{
x[i + j*nx] *= y[i];
}
}
I converted this to CUDA using this kernel:
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int i,j;
for(tid = 0; tid <nx*ny; tid++)
{
j = tid/nx;
i = tid - j*nx;
x[tid] *= y[i];
}
However, the GPU kernel does not give any speedup. Any suggestions for a better solution? Thanks in advance.
If this is the serial code:
int i, j;
for(j=0; j<ny; j++)
{
for(i=0; i<nx; i++)
{
x[i + j*nx] *= y[i];
}
}
then you should be doing this:
__global__ void fn(float *x, const float *y, int nx, int ny)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int j = tid/nx, i = tid - j * nx;
if (tid < nx*ny) // guard against the last partial block
x[tid] *= y[i];
}
fn<<<(nx*ny + B - 1)/B, B>>>(x, y, nx, ny); // with B = 256, 512, etc.
What you're doing is fairly bizarre: you're instructing each thread of the CUDA kernel to iterate over all values of tid between 0 and nx*ny, and compute the same function as your CPU version! Moreover, instead of just iterating over the indices, you're actually doing the loop less efficiently than you did for the CPU version; in other words, you do the same thing in each thread, just less efficiently than you do in one thread on the CPU. It's no wonder that this is slower; it should be much, much slower. Your CUDA kernel is:
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int i,j;
for(tid = 0; tid <nx*ny; tid++)
{
j = tid/nx;
i = tid - j*nx;
x[tid] *= y[i];
}
This does nx*ny iterations, same as your host code, for each thread; you lose all benefit of the parallelism, since each thread is doing the same thing; you would get the same performance using one thread on the GPU, and the same result!
If this is the verbatim code from your CUDA source file, you need to change it and redo the comparison; if this is code you have written to help explain what your code is doing for a lay non-CUDA audience, then you need to present your actual CUDA code so that we can see what's going on... as it is, the performance analysis I have done - the trivial one - is all you can expect.
Given your comment to this answer:
the nx * ny = 2205; so I used no. of blocks =
(nx*ny+(threads-1))/threads and threads = 64.
implies that you are intending to launch one thread per computation, and the correct CUDA implementation would then just be:
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int j = tid/nx;
int i = tid - j*nx;
if (tid < (nx*ny))
x[tid] *= y[i];
If you were intending for each thread to compute more than one computation per kernel launch, then you would size the grid to "fill" each of the SM on the target GPU, not use the same number of threads as the input size, and then do something like:
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int gsize = blockDim.x * gridDim.x;
int i,j;
for(; tid <nx*ny; tid+=gsize)
{
j = tid/nx;
i = tid - j*nx;
x[tid] *= y[i];
}
That would get you at least coalesced reads and writes to x, and remove the enormous number of redundant calculations in your posted version. There are a number of further optimizations that could be made, but it would require more information about the problem than has been supplied in the question and subsequent comments. Your indexing scheme contains an integer division and then an integer multiply-add per calculation. That is a lot of overhead for a single FLOP per input value. However, having said all of that, if the problem size I quoted is the actual problem size you are interested in, the GPU will never be faster than even a modest host CPU. You would require problems many orders of magnitude larger to realize a useful speed-up using the GPU for this sort of low arithmetic intensity operation.
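One way to remove the per-thread integer division mentioned above is a 2D launch in which the y dimension indexes rows directly; this is my sketch, not code from the answer:
__global__ void scale_rows_2d(float *x, const float *y, int nx, int ny)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;   // column index
    int j = blockIdx.y * blockDim.y + threadIdx.y;   // row index
    if (i < nx && j < ny)
        x[i + j * nx] *= y[i];                       // no division or modulo needed
}

// launch (sketch):
//   dim3 block(32, 8);
//   dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);
//   scale_rows_2d<<<grid, block>>>(d_x, d_y, nx, ny);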
How big is the block? It may be that the time needed to copy a small amount of data to the GPU and set up the environment is much longer than the calculation time.
Remember also that CUDA does a JIT compile on the first run, so to get accurate benchmarking you need to run it many times.
Try this using shared memory. One of the best implementations around:
// Matrices are stored in row-major order:
// M(row, col) = *(M.elements + row * M.stride + col)
typedef struct {
int width;
int height;
int stride; // In number of elements
float *elements;
} Matrix;
// Thread block size
#define BLOCK_SIZE 16
// Get a matrix element
__device__ float GetElement(const Matrix A, int row, int col)
{
return A.elements[row * A.stride + col];
}
// Set a matrix element
__device__ void SetElement(Matrix A, int row, int col, float value)
{
A.elements[row * A.stride + col] = value;
}
// Get the BLOCK_SIZExBLOCK_SIZE sub-matrix Asub of A that is
// located col sub-matrices to the right and row sub-matrices down
// from the upper-left corner of A
__device__ Matrix GetSubMatrix(Matrix A, int row, int col)
{
Matrix Asub;
Asub.width = BLOCK_SIZE; Asub.height = BLOCK_SIZE;
Asub.stride = A.stride;
Asub.elements = &A.elements[A.stride * BLOCK_SIZE * row +
BLOCK_SIZE * col];
return Asub;
}
// Forward declaration of the matrix multiplication kernel
__global__ void MatMulKernel(const Matrix, const Matrix, Matrix);
// Matrix multiplication - Host code
// Matrix dimensions are assumed to be multiples of BLOCK_SIZE
void MatMul(const Matrix A, const Matrix B, Matrix C)
{
// Same as in previous example, except the followings:
// d_A.width = d_A.stride = A.width;
// d_B.width = d_B.stride = B.width;
// d_C.width = d_C.stride = C.width;
}
// Matrix multiplication kernel called by MatMul()
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
// Block row and column
int blockRow = blockIdx.y;
int blockCol = blockIdx.x;
// Each thread block computes one sub-matrix Csub of C
Matrix Csub = GetSubMatrix(C, blockRow, blockCol);
// Each thread computes one element of Csub
// by accumulating results into Cvalue
float Cvalue = 0;
// Thread row and column within Csub
int row = threadIdx.y;
int col = threadIdx.x;
// Loop over all the sub-matrices of A and B that are
// required to compute Csub
// Multiply each pair of sub-matrices together
// and accumulate the results
for (int m = 0; m < (A.width / BLOCK_SIZE); ++m)
{
// Get sub-matrix Asub of A and Bsub of B
Matrix Asub = GetSubMatrix(A, blockRow, m);
Matrix Bsub = GetSubMatrix(B, m, blockCol);
// Shared memory used to store Asub and Bsub respectively
__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
// Load Asub and Bsub from device memory to shared memory
// Each thread loads one element of each sub-matrix
As[row][col] = GetElement(Asub, row, col);
Bs[row][col] = GetElement(Bsub, row, col);
// Synchronize to make sure the sub-matrices are loaded
// before starting the computation
__syncthreads();
// Multiply Asub and Bsub together
for (int e = 0; e < BLOCK_SIZE; ++e)
Cvalue += As[row][e] * Bs[e][col];
// Synchronize to make sure that the preceding
// computation is done before loading two new
// sub-matrices of A and B in the next iteration
__syncthreads();
}
// Write Csub to device memory
// Each thread writes one element
SetElement(Csub, row, col, Cvalue);
}