I'm trying to implement the pseudocode of the prefix-sum (scan) operation given in the CUDA documentation. The results I'm getting are completely wrong. I have revised my code a hundred times but still have problems with it.
Here is the pseudocode given by CUDA:
1: for d = 1 to log2 n do
2: for all k in parallel do
3: if k >= power(2, d) then
4: x[k] = x[k – power(2, d-1)] + x[k]
And the CUDA kernel I've coded so far is:
// CUDA Kernel
// Attempted in-place inclusive prefix sum over a[0..n), one thread per element,
// indexed by threadIdx.x only (so a single block is assumed).
// NOTE(review): `offset` doubles every iteration (1, 2, 4, ...) but is then used
// as an exponent in pow(2, offset), so the thresholds tested are 2^1, 2^2, 2^4, ...
// instead of one threshold per scan step.
// NOTE(review): there is no barrier between iterations and no tId < n guard, so
// threads can read a[temp] while another thread is updating it.
__global__ void
prefixSumCUDA(int *a, size_t n)
{
int tId = threadIdx.x;
for (int offset = 1; offset < n; offset *= 2) {
// Only threads at or beyond 2^offset take part in this step.
if (tId >= pow((float)2, offset)) {
// Partner index: 2^(offset - 1) positions to the left (float result truncated to int).
int temp = tId - pow((float)2, offset - 1);
a[tId] += a[temp];
}
}
}
Please let me know if I am making any mistakes here. I know this implementation is massively dependent on the size of the blocks and grids. Thus, I will provide my kernel call here:
// Kernel launch
// One block of 32 threads; the kernel indexes the array with threadIdx.x.
// NOTE(review): the post states the array has 8 elements, so 24 of the 32
// threads have no element of their own unless the kernel guards on n.
prefixSumCUDA << <1, 32 >> > (d_A, n);
The input array is a 8 element integer type:
[-] array: 1, 2, 3, 4, 5, 6, 7, 8
And the result of the CUDA kernel is as following:
[-] array: 1, 2, 5, 7, 14, 18, 22, 26
Thanks for any help in advance!
I solved the problem by implementing this another way. It is better to start the offset from 0 rather than 1 and to use it as the exponent of the stride (1 << offset). This results in the following code.
// In-place inclusive prefix sum (Hillis-Steele scan) over a[0..n).
//
// Launch layout: a single block; elements are indexed by threadIdx.x, so only
// the first blockDim.x elements are processed (n <= blockDim.x is required for
// a complete scan). Launching more threads than n is safe.
//
// a : device pointer to at least n ints; overwritten with the running sums.
// n : number of elements to scan; n == 0 is a no-op.
__global__ void
prefixSumCUDA(int *a, size_t n)
{
    const size_t tId = threadIdx.x;
    // Threads without an element still run the loop so that every thread of
    // the block reaches the barriers below.
    const bool inRange = tId < n;

    // stride takes the values 1, 2, 4, ... below n. The bound depends only on
    // n, so all threads execute the same number of iterations.
    for (size_t stride = 1; stride < n; stride <<= 1) {
        const bool participates = inRange && tId >= stride;

        // Read the partner value first ...
        int addend = 0;
        if (participates) {
            addend = a[tId - stride];
        }
        // ... and wait until every thread has read before anyone writes,
        // otherwise a thread could pick up a value already updated in this step.
        __syncthreads();

        if (participates) {
            a[tId] += addend;
        }
        // Make this step's writes visible before the next step reads them.
        __syncthreads();
    }
}
Related
This question already has an answer here:
Atomic Operation failed in CUDA
(1 answer)
Closed 3 years ago.
I have a situation where atomicAdd is not performing as I would expect. I am very new to CUDA so I am likely missing something; however, I've been stuck on this for the better part of a day and have rewritten most other areas of my program thinking it was a memory allocation issue. This doesn't seem to be the case though.
Essentially what is happening is that it calls the 'analyze' kernel which should be producing min/max and sum values of the data. The same data is used for min/max as the sum. The result from the atomicadd operation however, read like a memory address. very very large numbers. Is there something I am missing - I have gone over this a hundred times and stripped out almost everything out of the kernel except for the min/max and sum.
// Folds one value per thread into per-block min / max / sum accumulators.
// Each thread reads data[threadIdx.x * AXIS_COUNT + blockIdx.x] and combines it
// atomically into min[blockIdx.x], max[blockIdx.x] and mean[blockIdx.x].
// NOTE(review): the kernel only accumulates into the existing contents of
// min/max/mean and never writes an initial value, so the results depend on
// what those buffers hold before the launch.
// NOTE(review): despite its name, `mean` receives the sum (atomicAdd); no
// division is performed here.
__global__ void analyze(int *data, int *min, int *max, int *mean)
{
// Element index: stride of AXIS_COUNT per thread, offset by the block index.
int t_id = (threadIdx.x * AXIS_COUNT) + blockIdx.x;
// Accumulator slot: one per block.
int b_id = blockIdx.x;
int localVal = data[t_id];
atomicMin(&min[b_id], localVal);
atomicMax(&max[b_id], localVal);
atomicAdd(&mean[b_id], localVal);
}
...........
// Host side: one analyze launch per output row, then copy the results back.
int r;
// Number of input values consumed per row.
int step = WINDOW_LENGTH * AXIS_COUNT;
for (r = 0; r < out_rows; r++){
// AXIS_COUNT blocks of WINDOW_LENGTH threads; each launch gets pointers to
// this row's slice of the input and of the three result arrays.
analyze<<<AXIS_COUNT, WINDOW_LENGTH>>>(
&d_data[r * step],
&d_min[r * AXIS_COUNT],
&d_max[r * AXIS_COUNT],
&d_mean[r * AXIS_COUNT]);
}
// NOTE(review): the return codes of the CUDA calls below are not checked.
cudaDeviceSynchronize();
cudaMemcpy(h_min, d_min, int_size, cudaMemcpyDeviceToHost);
cudaMemcpy(h_max, d_max, int_size, cudaMemcpyDeviceToHost);
cudaMemcpy(h_mean, d_mean, int_size, cudaMemcpyDeviceToHost);
// Print the row index followed by the first three accumulated values of the row.
for(r=0; r < out_rows; r++) {
fprintf(stderr, "mean %d, x: %d, y: %d z: %d\n", r, h_mean[r*AXIS_COUNT], h_mean[r*AXIS_COUNT + 1], h_mean[r*AXIS_COUNT+2]);
}
The result are in the form:
mean 5025, x: 2078310793, y: 1999653847 z: -1453684997
mean 5026, x: 2078308025, y: 1999646363 z: -1453660854
mean 5027, x: 2078305391, y: 1999639383 z: -1453636904
mean 5028, x: 2078304342, y: 1999630356 z: -1453613212
I have validated and checked the min/max values with the relevant documents to confirm.
The answer was to initialize the shared memory in the kernel.
// Per-block accumulator; shared memory is uninitialized, so clear it first.
__shared__ double sum[AXIS_COUNT];
// A single thread zeroes the whole array ...
if (threadIdx.x == 0) {
int i;
for (i=0; i < AXIS_COUNT; i++)
sum[i] = 0;
}
// ... and the block-wide barrier (outside the branch, so every thread reaches
// it) makes the zeroes visible before any thread accumulates into sum.
__syncthreads();
int t_id = (threadIdx.x * AXIS_COUNT) + blockIdx.x;
int b_id = blockIdx.x;
I've been trying to work out an algorithm to get the dot product of two vectors within a CUDA program via reduction and seem to be stuck :/
In essence, I'm trying to write this code in CUDA:
// Serial reference: accumulate the element-wise products of h_a and h_b into h_h.
for (int i = 0; i < n; i++)
h_h += h_a[i] * h_b[i];
Where h_a and h_b are arrays of floats and h_h sums up the dot product.
I'm trying to use reduction here - so far I've got this...
// Attempted dot product of d_a and d_b via an in-place tree reduction on d_a,
// indexed by threadIdx.x only (single block).
// NOTE(review): the multiplication by d_b happens at every reduction stage, so
// partial sums stored in d_a[i + stride] are multiplied again, and d_a[0] is
// never multiplied by d_b[0].
// NOTE(review): the loop condition depends on i, so threads leave the loop at
// different iterations while others still execute the __syncthreads() inside it.
__global__ void dot_product(int n, float * d_a, float * d_b){
int i = threadIdx.x;
for (int stride = 1; i + stride < n; stride <<= 1) {
// Only every (2 * stride)-th thread accumulates at this stage.
if (i % (2 * stride) == 0){
d_a[i] += d_a[i + stride] * d_b[i + stride];
}
__syncthreads();
}
}
If I change the main line to d_a[i] += d_a[i + stride];, it sums up the array just fine. I seem to be running into a parallel issue here from what I gather. Can someone point out my issue?
My kernel call is:
dot_product<<<1, n>>>(n, d_a, d_b);, where n is the size of each array.
There are two problems here:
As pointed out in comments, you never calculate the product of the first elements (this is a minor issue)
Your dot product calculation is incorrect. The parallel reduction should be performing a sum of the individual products of corresponding elements. Your code performs the product at every stage of the parallel reduction, so that products are getting multiplied again as they are summed. That is incorrect.
You want to do something like this:
// Dot product of d_a and d_b (n elements each) using a single-block tree
// reduction performed in place on d_a.
//
// Launch layout: one block with at least n threads, e.g. dot_product<<<1, n>>>;
// elements are indexed by threadIdx.x only.
//
// n   : number of elements in each vector.
// d_a : device vector; OVERWRITTEN - on return d_a[0] holds the dot product and
//       the remaining entries hold intermediate partial sums.
// d_b : device vector; read only.
__global__ void dot_product(int n, float * d_a, float * d_b){
    int i = threadIdx.x;
    // Surplus threads own no element but must still reach every barrier.
    const bool active = (i < n);

    if (active) {
        d_a[i] = d_a[i] * d_b[i];  // d_a now contains products
    }
    __syncthreads();

    // The loop bound depends only on n, so every thread in the block runs the
    // same number of iterations and __syncthreads() is never divergent.
    for (int stride = 1; stride < n; stride <<= 1) {
        if (active && (i % (2 * stride) == 0) && (i + stride < n)) {
            d_a[i] += d_a[i + stride];  // which are summed by reduction
        }
        __syncthreads();
    }
}
[disclaimer: written in browser, never compiled or tested, use at own risk]
I tried to implement a summation in CUDA, and I can't find what I did wrong here.
The sum is always returned as 0.
The __shared__ qualifier makes the variable common to all threads in a block.
So I tried to sum one block at a time and finally add up the per-block results to get the overall sum.
But the sum doesn't work even for a single block, and I am stuck.
Can anyone help?
#include <stdio.h>
#include <iostream>
#include <cuda.h>
#include <stdlib.h>
//#define BLOCK_SIZE 32 // size of vectors
// Attempted per-block sum: accumulates elements of i_data into one shared float
// per block and writes it to sum[blockIdx.x].
// NOTE(review): tid is the grid-wide index, so the tests `tid <= i` (with
// i < blockDim.x) and `tid == 0` can only be true in block 0.
// NOTE(review): in iteration i every thread with tid <= i updates s_data in the
// same step without atomics, so the updates race with each other.
__global__ void add( float * i_data, float * sum){
int tid = blockIdx.x * blockDim.x + threadIdx.x;
__shared__ float s_data;
// Every thread of the block stores 0 into the shared accumulator.
s_data = 0;
// must be synchronized
__syncthreads();
// reduce and sum
// typical in GPU computings
for (int i = 0; i<blockDim.x; i++)
{
__syncthreads();
if (tid <= i)
{
//s_data[blockIdx.x]+ = s_data[tid] + s_data[i+tid];
s_data+= i_data[tid];
}
}
if (tid == 0)
sum[blockIdx.x]=s_data;
}
// Host driver: fills a T*B float array (block i holds the value i), launches
// add<<<B, T>>> to produce one sum per block, copies both arrays back and
// prints the input values followed by each block's sum.
int main() {
    int T = 10, B = 5; // threads per block and blocks per grid
    float *a,*b; // host pointers
    float *dev_a, *dev_b; // device copies of a and b
    // The buffers hold floats, so size them with sizeof(float).
    int sizeIN = T*B*sizeof(float);
    int sizeOUT = B*sizeof(float);
    a= new float[T*B];
    b= new float[B];
    // Every element handled by block i is set to i.
    for(int i = 0;i<B;i++)
    {
        for (int j=0;j<T;j++)
        {
            a[i*T+j]=i;
        }
    }
    for(int i = 0;i<B;i++)
    {
        b[i]=0;
    }
    cudaMalloc((void **) &dev_a, sizeIN);
    cudaMalloc((void **) &dev_b, sizeOUT);
    cudaMemcpy(dev_a, a, sizeIN, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, sizeOUT, cudaMemcpyHostToDevice);
    add<<< B, T >>> (dev_a, dev_b);
    // The blocking copies below also wait for the kernel to finish.
    cudaMemcpy(a,dev_a, sizeIN, cudaMemcpyDeviceToHost);
    cudaMemcpy(b,dev_b, sizeOUT, cudaMemcpyDeviceToHost);
    for(int i = 0;i<B;i++)
    {
        for (int j=0;j<T;j++)
        {
            std::cout<< a[i*T+j]<<"\t";
            std::cout<<std::endl;
        }
        std::cout<<std::endl<<std::endl<<"sum is: "<<b[i]<<std::endl;
    }
    std::cout<<std::endl<<std::endl;
    cudaFree(dev_a);
    cudaFree(dev_b);
    // a and b were allocated with new[], so they must be released with delete[].
    delete[] a;
    delete[] b;
    return 0;
}
This is wrong in 2 ways:
if (tid = 0)
First, you should be doing a comparison == not an assignment =. I don't know why your compiler didn't warn you about this.
Second, tid is only zero for one thread in the entire grid:
int tid = blockIdx.x * blockDim.x + threadIdx.x;
You want one thread in each block to write the block result out to global memory:
if (threadIdx.x == 0)
This is also a problem, similarly:
if (tid <= i)
This is only satisfied for threads in the first block. Beyond that, I have to start to guess at what you want. I guess you're trying to sum the values in each block. Your construction is not a parallel reduction, but to make the minimum changes to get it "functional" I would rewrite the end of your kernel like this:
// reduce and sum
// typical in GPU computings
// In iteration i only the thread with threadIdx.x == i adds its element, and
// the barrier after the branch orders the updates, so the block's elements are
// accumulated into s_data one at a time.
for (int i = 0; i<blockDim.x; i++)
{
if (threadIdx.x == i)
{
//s_data[blockIdx.x]+ = s_data[tid] + s_data[i+tid];
s_data+= i_data[tid];
}
// Outside the branch, so every thread of the block reaches it.
__syncthreads();
}
// One thread per block writes that block's result.
if (threadIdx.x == 0)
sum[blockIdx.x]=s_data;
}
Although you didn't have any CUDA API errors, it's good practice to use proper cuda error checking and also run your code with cuda-memcheck any time you are having trouble with a cuda code.
I mentioned that your code above is not a classical reduction. It's just a suboptimal for-loop.
To learn about a CUDA parallel reduction, study the cuda sample code and the accompanying presentation, and there are many examples here on the CUDA tag on SO as well that you can search on.
I wrote the following simple CUDA kernel:
// For every x < N computes
//     O[x] = 0.15 / N + 0.85 * sum over i != x of W[x*N + i] * I[x]
//
// Launch layout: elements are indexed by threadIdx.x only, so a single block
// with at least N threads is expected; surplus threads do nothing.
//
// O : output, at least N floats.
// I : input vector, at least N floats.
// W : weights read as an N x N row-major matrix (indices up to N*N - 1).
// N : problem size, passed by value.
__global__ void pr_kernel(float* O, const float* I, const float* W, int N)
{
    int x = threadIdx.x;
    if (x < N) {
        // The accumulator must start at zero; a local variable is not
        // zero-initialized automatically.
        float sum = 0.0f;
        for (int i = 0; i < N; i++) {
            if (i == x) continue;  // skip the diagonal entry
            sum += W[x*N+i] * I[x];
        }
        O[x] = (0.15 / N) + 0.85 * sum;
    }
}
The variables are allocated in Python as follows:
# Problem size, as a 32-bit integer.
N = np.int32(4)
# Weights as a flat float32 array.
# NOTE(review): 15 values are listed here, whereas pr_kernel indexes
# W[x*N + i] with x, i < N, i.e. 16 entries for N = 4 -- confirm the data.
W = np.float32(np.asarray(
[0, 1, 0, 1, 1, 0, 1, 1,
0, 1, 0, 1,1, 1, 0]))
# Input vector: four equal entries of 0.25.
I = np.float32(np.asarray(
[0.25, 0.25, 0.25, 0.25]))
# Output buffer, N zeros.
O = np.float32(np.zeros(N))
I'm transferring the variables using gpuarray.to_gpu, and I'm calling the kernel on a Tesla C2070 with the following line:
pr_kernel(O_d, I_d, W_d, N_d, block=blocksize, grid=gridsize)
Where:
blocksize = (128, 1, 1)
gridsize = (1, 1)
I get the error message:
pycuda.driver.LaunchError: cuLaunchKernel failed: launch out of resources.
This happens even if I reduce blocksize to something like (8, 1, 1). I can run other CUDA programs on the GPU with a blocksize of (512, 1, 1) so I'm confident this is not due to a GPU configuration issue.
What am I doing wrong? Thanks for any help.
The problem was that I was transferring the integer N to the GPU using gpuarray.to_gpu, where I should have been directly passing N to the pr_kernel function.
I got a similar problem when I used a different type in definition and as an argument to the kernel. Probably the fact that the latter required more resources generates an error.
I'm not a programmer with any abilities. Just someone curious about CUDA and so I'm doing a little reading. I ran across an example of using Thrust to do a moving average:
Simple Moving Average Thrust Example
The example, such as it is, runs and mostly works correctly. However it's trivial in the sense that it only does one moving average operation.
How would I do, say, 352 of these moving average operations in parallel, all operating on the same data stream? In my mind the program flow might be:
Generate the data & send it to one CUDA core. (Same as existing code
but think lengths of 1000 or 10000 instead of 30)
Copy it from the CUDA core it's in to all of the other 351 CUDA
cores in my GTX 465
Tell each CUDA core what number of data items to average over.
(4, 5, 6,..., 352, 353, 354)
Tell the device to run the average in each core in parallel
Read back the results from each core
I get that this code
// compute SMA using standard summation
simple_moving_average(data, w, averages);
makes it all happen, but how do I get Thrust to do many of these in parallel?
My interest here is about something like stock data. If I'm looking at GOOG prices I'd put that in the GPU using all cores and leave it there. I'd then be free to do lots of processing without loading the data anymore and just reading back results from each core. NOTE: I might not want to use GOOG in all cores. Some cores might be GOOG, others with some other symbol, but I'll get there later. I'm just thinking I don't want the stock data in global memory if there's enough room in each core.
I assume this is pretty straightforward for CUDA & Thrust?
Here is the possible way how to do this with arrayfire:
Note that I am NOT affiliated with this library whatsoever.
I am pretty sure this can also be done with thrust
but I found this one a lot simpler with arrayfire.
And if the library is free why can't I use it instead of thrust ?
In arrayfire you can use matrix to run several SMA operations in parallel:
unsigned n_SMAs = 1000; // # of SMA indicators to evaluate
unsigned len = 2000; // # of stock prices per indicator
unsigned w = 6; // window size
// generate stock prices: [0..10]
// (an n_SMAs x len matrix: one price series per row)
af::array data = af::randu(n_SMAs, len) * 10;
// compute inclusive prefix sums along columns of the matrix
// (second argument 1 selects dimension 1 -- presumably so each row is scanned
// independently; confirm against the ArrayFire documentation)
af::array s = af::accum(data, 1);
// compute the average
// Subtracting prefix sums that lie w columns apart leaves the sum of w
// consecutive prices; dividing by w turns it into their mean.
af::array avg = (s.cols(w, af::end) - s.cols(0, af::end - w)) / w;
af::eval(avg);
std::cout << avg.dims() << "\n" << avg << "\n";
let me know if that's what you are looking for. This is how I understood your question: compute several SMA indicators in parallel
My understanding is that you are interested in the following two situations:
You have a long sequence of items and you want to calculate a certain number of averages, by averaging on different numbers of items, i.e., using different lengths for the moving average window. This is what I understand from your original question.
You have a series of sequences, stored consecutively in memory, and you want to average them in parallel with a fixed averaging window of size 2 * RADIUS + 1. This is what the ArrayFire code proposed by @asm does - you have accepted it.
Instead of using CUDA Thrust, I think it would be easier to write your own CUDA kernel to do the above operations. Below is a fully worked example that operates in the same way as the ArrayFire code proposed by @asm, thus covering case #2. Modifying it to cover case #1 would be straightforward.
#include <thrust/device_vector.h>
#define RADIUS 3
#define BLOCK_SIZE_X 8
#define BLOCK_SIZE_Y 8
/*******************/
/* iDivUp FUNCTION */
/*******************/
// Integer division of a by b, bumped up by one whenever the division leaves a
// remainder. Used to compute how many blocks of size b are needed to cover a items.
int iDivUp(int a, int b)
{
    const int quotient = a / b;
    const int remainder = a % b;
    if (remainder == 0) {
        return quotient;
    }
    return quotient + 1;
}
/********************/
/* CUDA ERROR CHECK */
/********************/
// Wraps a CUDA runtime call: forwards its status together with the call site's
// file and line to gpuAssert.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

// Reports a failed CUDA status on stderr and, unless abort is false, terminates
// the process using the status as the exit code. A cudaSuccess status is a no-op.
//
// code  : status returned by a CUDA runtime call.
// file  : source file of the call site.
// line  : source line of the call site.
// abort : when true (default) exit after reporting the error.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code == cudaSuccess) {
        return;
    }

    fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort) {
        exit(code);
    }
}
/**********/
/* KERNEL */
/**********/
// Row-wise sliding-window sum over an M x N row-major matrix.
//
// For every element (row, col) with row < M and col < N, out receives the sum
// of in[row][col - RADIUS .. col + RADIUS]; positions outside the row count as 0.
// Note that the window sum is stored as is - no division by the window length
// (2 * RADIUS + 1) is performed.
//
// Launch layout: 2D blocks of exactly BLOCK_SIZE_X x BLOCK_SIZE_Y threads (the
// shared tile is sized from these macros); the grid must cover N columns in x
// and M rows in y. Threads that fall outside the matrix only help to fill the
// tile and store nothing.
//
// in  : device pointer to M * N input values.
// out : device pointer to M * N output values.
// M   : number of rows.
// N   : number of columns.
__global__ void moving_average(unsigned int *in, unsigned int *out, unsigned int M, unsigned int N) {
    // One tile row per thread row, with RADIUS halo cells on both sides.
    __shared__ unsigned int temp[BLOCK_SIZE_Y][BLOCK_SIZE_X + 2 * RADIUS];

    const unsigned int gindexx = threadIdx.x + blockIdx.x * blockDim.x;
    const unsigned int gindexy = threadIdx.y + blockIdx.y * blockDim.y;
    const unsigned int gindex  = gindexy * N + gindexx;
    const unsigned int lindexx = threadIdx.x + RADIUS;
    const unsigned int lindexy = threadIdx.y;
    const bool rowValid = (gindexy < M);

    // --- Read input elements into shared memory
    temp[lindexy][lindexx] = (rowValid && (gindexx < N)) ? in[gindex] : 0;

    // The first RADIUS threads of each tile row also load the two halos.
    if (threadIdx.x < RADIUS) {
        // Number of columns of this block that lie inside the matrix
        // (smaller than BLOCK_SIZE_X only for the last block in x).
        const unsigned int valid = min((unsigned int)BLOCK_SIZE_X, N - blockIdx.x * BLOCK_SIZE_X);

        // Left halo: the RADIUS elements preceding the block, or 0 at the row start.
        temp[lindexy][threadIdx.x] =
            (rowValid && (gindexx >= RADIUS) && (gindexx < (N + RADIUS))) ? in[gindex - RADIUS] : 0;

        // Right halo: the RADIUS elements following the block's valid columns,
        // or 0 past the row end.
        temp[lindexy][threadIdx.x + (RADIUS + valid)] =
            (rowValid && ((gindexx + valid) < N)) ? in[gindex + valid] : 0;
    }

    // Every thread reaches this barrier; the tile is complete afterwards.
    __syncthreads();

    // --- Apply the stencil
    unsigned int result = 0;
    for (int offset = -RADIUS ; offset <= RADIUS ; offset++) {
        result += temp[lindexy][lindexx + offset];
    }

    // --- Store the result (only for threads that map to a matrix element, so
    // blocks that overhang the matrix never write outside of out)
    if (rowValid && (gindexx < N)) {
        out[gindex] = result;
    }
}
/********/
/* MAIN */
/********/
// Host driver: fills an M x N device matrix with a constant, runs
// moving_average over it and prints every output element.
int main() {
    const unsigned int M = 2;
    const unsigned int N = 4 + 2 * RADIUS;
    const unsigned int constant = 3;

    // Input filled with `constant`; output buffer of the same size.
    thrust::device_vector<unsigned int> d_in(M * N, constant);
    thrust::device_vector<unsigned int> d_out(M * N);
    unsigned int *inPtr  = thrust::raw_pointer_cast(d_in.data());
    unsigned int *outPtr = thrust::raw_pointer_cast(d_out.data());

    // Enough BLOCK_SIZE_X x BLOCK_SIZE_Y blocks to cover N columns and M rows.
    dim3 BlockSize(BLOCK_SIZE_X, BLOCK_SIZE_Y);
    dim3 GridSize(iDivUp(N, BLOCK_SIZE_X), iDivUp(M, BLOCK_SIZE_Y));

    moving_average<<<GridSize, BlockSize>>>(inPtr, outPtr, M, N);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    // Copy the result to the host and print it row by row.
    thrust::host_vector<unsigned int> h_out = d_out;
    for (int row = 0; row < M; row++) {
        for (int col = 0; col < N; col++) {
            printf("Element j = %i; i = %i; h_out = %i\n", row, col, h_out[N*row+col]);
        }
    }

    return 0;
}