Atomic Operation failed in CUDA

Since the compute capability is 2.1, the built-in atomicAdd and atomicMax operations do not support double precision, so I defined both functions myself based on some answers on Stack Overflow.
Strangely, the atomicAdd function works well but the atomicMax function does not. Here is my code.
The test generates a random number on each thread, then sums the random numbers within each block to produce a block sum; I want to test atomicAdd and atomicMax on the block sums.
#include <iostream>
#include <curand.h>
#include <curand_kernel.h>
#include <stdio.h>
#include <stdlib.h>

#define num_of_blocks 2
#define threads_per_block 2
#define tot_threads 4

__device__ double gsum[num_of_blocks];
__device__ double dev_sum;
__device__ double dev_max;

// set seed for random number generator
__global__ void initcuRand(curandState* globalState, unsigned long seed){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    curand_init(seed, idx, 0, &globalState[idx]);
}

// atomicMax for double
__device__ double atomicMax_d(double* address, double val)
{
    unsigned long long int* address_as_i = (unsigned long long int*)address;
    unsigned long long int old = *address_as_i, assumed;
    do {
        assumed = old;
        old = ::atomicCAS(address_as_i, assumed, __double_as_longlong(::fmax(val, __longlong_as_double(assumed))));
    } while (assumed != old);
    return __longlong_as_double(old);
}

// atomicAdd for double
__device__ double atomicAdd_d(double* address, double val)
{
    unsigned long long int* address_as_ull = (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull, assumed;
    do{
        assumed = old;
        old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
    }while(assumed != old);
    return __longlong_as_double(old);
}

__global__ void kernel(curandState *globalState){
    // global id
    int gidx = threadIdx.x + blockIdx.x * blockDim.x;
    // local id
    int lidx = threadIdx.x;
    // create shared memory to store seeds
    __shared__ curandState localState[tot_threads];
    __shared__ double srandnum[threads_per_block];
    // copy global seed to local
    localState[lidx] = globalState[gidx];
    // synchronize the local threads writing to the local memory cache
    __syncthreads();
    // generate random number from normal distribution in shared memory
    srandnum[lidx] = curand_normal(&localState[lidx]);
    __syncthreads();
    if(lidx == 0){srandnum[lidx] += srandnum[lidx + 1];}    // sum of each block
    if(lidx == 0){gsum[blockIdx.x] = srandnum[lidx];}       // copy the sums back to global memory
    __threadfence();
    if( gidx < num_of_blocks){
        atomicAdd_d(&dev_sum, gsum[gidx]);
    }
    if( gidx < num_of_blocks){
        atomicMax_d(&dev_max, gsum[gidx]);
    }
    if( gidx == 0){
        printf("Sum is: %lf\n", dev_sum);
    }
    if( gidx == 1){
        printf("Max is: %lf\n", dev_max);
    }
}

int main(){
    // set seed on device
    curandState *globalState;
    cudaMalloc((void**)&globalState, tot_threads*sizeof(curandState));
    initcuRand<<<num_of_blocks, threads_per_block>>>(globalState, 1);

    // launch kernel
    kernel<<<num_of_blocks, threads_per_block>>>(globalState);

    double randnum[num_of_blocks];
    cudaMemcpyFromSymbol(randnum, gsum, num_of_blocks*sizeof(double), 0, cudaMemcpyDeviceToHost);
    std::cout << "Sum of each block:\n";
    for (int i = 0; i < num_of_blocks; ++i){
        std::cout << randnum[i] << std::endl;
    }
    cudaFree(globalState);
    return 0;
}
The result I get is:
Sum is: -0.898329
Max is: 0.000000
Sum of each block:
-0.0152994
-0.88303
From the result I can see that the atomicAdd function works but the atomicMax function does not, and I have no idea why. Thanks in advance.

You don't ever initialize dev_max or dev_sum. You can't sensibly do these types of atomic operations on them if they don't start with a known value.
Try something like this instead:
__device__ double dev_sum = 0.0;
__device__ double dev_max = -1e99;
and I think you'll be happier with the results.
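If you need to reset them between launches, an equivalent host-side initialization is also possible. This is a minimal sketch using cudaMemcpyToSymbol; the -1e99 sentinel just mirrors the static initializer above:

double h_sum = 0.0;
double h_max = -1e99;
cudaMemcpyToSymbol(dev_sum, &h_sum, sizeof(double));  // initialize the __device__ symbols
cudaMemcpyToSymbol(dev_max, &h_max, sizeof(double));  // before launching the kernel
kernel<<<num_of_blocks, threads_per_block>>>(globalState);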

Related

CUDA, how to find the first item in an array that makes a function maximal

In CUDA C++, I have a big array Arr of integers and a function F: int -> int. I want to find the first index of an item in Arr that makes F maximal.
How can I write a kernel that always keeps the maximal value of F seen so far, comparing against it with atomic operations to avoid race conditions?
By the way, I wonder if I can use functions from the Thrust library for this purpose instead.
How can I write a kernel that always keeps the maximal value of F seen so far, comparing against it with atomic operations to avoid race conditions?
Based on your description, including the use of int and a desire to use atomics, I would suggest using a custom atomic. This should work for arrays of up to 4 billion elements:
$ cat t2154.cu
#include <iostream>

__device__ __host__ int get_int(unsigned long long val){return reinterpret_cast<int *>(&val)[0];}
__device__ __host__ unsigned get_uns(unsigned long long val){return reinterpret_cast<unsigned *>(&val)[1];}

__device__ bool done(int fval, int fval1, unsigned idx, unsigned idx1){
    if (fval > fval1) return true;
    if ((fval == fval1) && (idx <= idx1)) return true;
    return false;
}

__device__ unsigned long long my_custom_atomic(unsigned long long *addr, int fval, unsigned idx){
    unsigned long long old = *addr;
    while (!done(get_int(old), fval, get_uns(old), idx))
        // cast fval to unsigned so sign-extension of a negative value cannot clobber the index bits
        old = atomicCAS(addr, old, ((((unsigned long long)idx)<<32)|(unsigned)fval));
    return old;
}

const int minidx = 256;
__device__ int f(int t){ return minidx - (t-minidx)*(t-minidx);}

__global__ void k(int *arr, unsigned long long *min, unsigned N){
    unsigned my_idx = blockIdx.x*blockDim.x+threadIdx.x;
    if (my_idx < N){
        int my_val = arr[my_idx];
        my_val = f(my_val);
        my_custom_atomic(min, my_val, my_idx);
    }
}

const unsigned my_N = 32768;
int main(){
    unsigned long long *min;
    cudaMallocManaged(&min, sizeof(min[0]));
    int *arr;
    cudaMallocManaged(&arr, sizeof(arr[0])*my_N);
    for (unsigned i = 0; i < my_N; i++) arr[i] = i;
    *min = 0xFFFFFFFF80000000ULL; // maximum unsigned index combined with minimum int value
    k<<<my_N/256, 256>>>(arr, min, my_N);
    cudaDeviceSynchronize();
    std::cout << " maximum val: " << get_int(*min) << " at index: " << get_uns(*min) << std::endl;
}
$ nvcc -o t2154 t2154.cu
$ compute-sanitizer ./t2154
========= COMPUTE-SANITIZER
maximum val: 256 at index: 256
========= ERROR SUMMARY: 0 errors
$
We are assembling and disassembling the 64-bit quantity as needed, and using the general method outlined in the programming guide for arbitrary atomics.
I wonder if I can use functions from the Thrust library for this purpose instead.
Yes, you can do this with a transform and a reduce operation in Thrust. In fact, Thrust can combine these into a single algorithm call, thrust::transform_reduce. Here is an example:
$ cat t2155.cu
#include <iostream>
#include <thrust/transform.h>
#include <thrust/reduce.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/sequence.h>
#include <limits>

const int minidx = 256;
const size_t my_N = 32768;

struct f
{
    template <typename T>
    __host__ __device__ T operator()(T t) {
        T result = t;
        thrust::get<0>(result) = minidx - (thrust::get<0>(t) - minidx)*(thrust::get<0>(t) - minidx);
        return result;
    }
};

struct r
{
    template <typename T1, typename T2>
    __host__ __device__ T1 operator()(T1 &t1, T2 &t2){
        if (thrust::get<0>(t1) > thrust::get<0>(t2)) return t1;
        if (thrust::get<0>(t1) < thrust::get<0>(t2)) return t2;
        if (thrust::get<1>(t1) < thrust::get<1>(t2)) return t1;
        return t2;
    }
};

int main(){
    thrust::device_vector<int> arr(my_N);
    thrust::sequence(arr.begin(), arr.end());
    auto my_zip = thrust::make_zip_iterator(thrust::make_tuple(arr.begin(), thrust::counting_iterator<size_t>(0)));
    auto init = thrust::make_tuple(std::numeric_limits<int>::min(), std::numeric_limits<size_t>::max());
    auto result = thrust::transform_reduce(my_zip, my_zip+my_N, f(), init, r());
    std::cout << " maximum val: " << thrust::get<0>(result) << " at index: " << thrust::get<1>(result) << std::endl;
}
$ nvcc -o t2155 t2155.cu
$ compute-sanitizer ./t2155
========= COMPUTE-SANITIZER
maximum val: 256 at index: 256
========= ERROR SUMMARY: 0 errors
$
Notes:
Comparing those two implementations/examples, I know which one I would choose. The atomic method is brittle, type-limited, and size-limited. The Thrust example could be adapted to handle types more flexibly (e.g. a function that returns a 64-bit type) and could be extended to handle more than 4 billion elements (a sketch follows these notes).
The code here is just intended to be a possible roadmap. It's not thoroughly tested; bugs are always possible.
There is a strong correspondence between the two methods. The main() routines have almost a 1:1 correspondence, which hopefully you can identify. Furthermore the r() functor corresponds to the done() function, and the f() functor corresponds to the f() function.
Don't assume that you can readily/trivially increase my atomic example to 4 billion elements. The f() function I wrote would overflow/underflow an int variable. But with an appropriate data array and f() function, it should be possible to use up to 4 billion elements.
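As a sketch of the adaptability point in the first note, here is one way the Thrust version might be adapted to a 64-bit result type and size_t indices, fusing the function into a transform_iterator so a plain thrust::reduce suffices. The f_ll and r_ll functors are hypothetical variants of f and r above, not part of the original answer:

#include <iostream>
#include <limits>
#include <thrust/reduce.h>
#include <thrust/device_vector.h>
#include <thrust/sequence.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/zip_iterator.h>

const int minidx = 256;
const size_t my_N = 32768;

// hypothetical variant of f() that computes in and returns a 64-bit type
struct f_ll {
    __host__ __device__ long long operator()(int t) const {
        long long tt = t;
        return (long long)minidx - (tt - minidx) * (tt - minidx);
    }
};

// same max-value-with-lowest-index combiner as r(), over the wider tuple
struct r_ll {
    template <typename T1, typename T2>
    __host__ __device__ T1 operator()(T1 t1, T2 t2) const {
        if (thrust::get<0>(t1) > thrust::get<0>(t2)) return t1;
        if (thrust::get<0>(t1) < thrust::get<0>(t2)) return t2;
        if (thrust::get<1>(t1) < thrust::get<1>(t2)) return t1;
        return t2;
    }
};

int main(){
    thrust::device_vector<int> arr(my_N);
    thrust::sequence(arr.begin(), arr.end());
    // apply f on the fly while reading, instead of a separate transform step
    auto vals = thrust::make_transform_iterator(arr.begin(), f_ll());
    auto my_zip = thrust::make_zip_iterator(thrust::make_tuple(vals, thrust::counting_iterator<size_t>(0)));
    auto init = thrust::make_tuple(std::numeric_limits<long long>::min(), std::numeric_limits<size_t>::max());
    auto result = thrust::reduce(my_zip, my_zip+my_N, init, r_ll());
    std::cout << " maximum val: " << thrust::get<0>(result) << " at index: " << thrust::get<1>(result) << std::endl;
}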
EDIT: As suggested in the comments below, we may be able to do a better job in the atomic case by doing a threadblock-level shared memory sweep reduction, followed by a single atomic per threadblock. Here is an example of that:
#include <iostream>

const int nTPB = 512;
const unsigned long long initval = 0xFFFFFFFF80000000ULL; // maximum index and minimum int

__device__ __host__ int get_int(unsigned long long val){return reinterpret_cast<int *>(&val)[0];}
__device__ __host__ unsigned get_uns(unsigned long long val){return reinterpret_cast<unsigned *>(&val)[1];}

__device__ bool done(int fval, int fval1, unsigned idx, unsigned idx1){
    if (fval > fval1) return true;
    if ((fval == fval1) && (idx <= idx1)) return true;
    return false;
}

__device__ unsigned long long my_custom_atomic(unsigned long long *addr, int fval, unsigned idx){
    unsigned long long old = *addr;
    while (!done(get_int(old), fval, get_uns(old), idx))
        // cast fval to unsigned so sign-extension of a negative value cannot clobber the index bits
        old = atomicCAS(addr, old, ((((unsigned long long)idx)<<32)|(unsigned)fval));
    return old;
}

const int minidx = 256;
__device__ int f(int t){ return minidx - (t-minidx)*(t-minidx);}

__device__ unsigned long long my_reduce(unsigned long long t1, unsigned long long t2){
    if (done(get_int(t1), get_int(t2), get_uns(t1), get_uns(t2))) return t1;
    return t2;
}

__global__ void k(int *arr, unsigned long long *min, unsigned N){
    __shared__ unsigned long long smem[nTPB];
    smem[threadIdx.x] = initval;
    for (unsigned int idx = blockIdx.x*blockDim.x+threadIdx.x; idx < N; idx += gridDim.x*blockDim.x)
        smem[threadIdx.x] = my_reduce(smem[threadIdx.x], (((unsigned long long)idx)<<32)|(unsigned)f(arr[idx]));
    for (int t = nTPB>>1; t > 0; t >>= 1){
        __syncthreads();
        if (threadIdx.x < t) smem[threadIdx.x] = my_reduce(smem[threadIdx.x], smem[threadIdx.x+t]);
    }
    if (!threadIdx.x) my_custom_atomic(min, get_int(smem[0]), get_uns(smem[0]));
}

const unsigned my_N = 32768;
int main(){
    unsigned long long *min;
    cudaMallocManaged(&min, sizeof(min[0]));
    int *arr;
    cudaMallocManaged(&arr, sizeof(arr[0])*my_N);
    for (unsigned i = 0; i < my_N; i++) arr[i] = i;
    arr[1024] = minidx;
    *min = initval;
    k<<<(my_N+nTPB-1)/nTPB, nTPB>>>(arr, min, my_N);
    cudaDeviceSynchronize();
    std::cout << " maximum val: " << get_int(*min) << " at index: " << get_uns(*min) << std::endl;
}

Does bool variable in kernel need to be synchronized

I have a kernel consisting of a for loop that searches through an array for a specific int value. I'm using a grid of 256-thread blocks to do this. However, when one thread finds the value, I want to let the other threads know so they can exit. Currently I'm using a boolean flag, but I'm not sure if it's working properly. My concern is synchronization.
__device__ bool found;

__global__
void search()
{
    // arr, x and stride are assumed to be defined elsewhere (this is pseudocode)
    for(int i = threadIdx.x; i < 1000000; i += stride)
    {
        if(found == true)
        {
            break;
        }
        else if(arr[i] == x)
        {
            found = true;
            break;
        }
    }
}

int main()
{
    bool flag = false;
    cudaMemcpyToSymbol(found, &flag, sizeof(bool), 0, cudaMemcpyHostToDevice);
}
As pointed out in comments, you can probably achieve what you want by declaring the global device flag to be volatile, which will inhibit caching, and by using a memory fence function. There really isn't a global synchronization primitive which would do what you want, other than the new grid synchronization mechanism introduced in CUDA 9 and new hardware, but that probably isn't necessary in this case. Turning your pseudocode into a toy example:
#include <iostream>
#include <thrust/device_vector.h>

__device__ volatile bool found;
__device__ volatile size_t idx;

template<bool docheck>
__global__
void search(const int* arr, int x, size_t N)
{
    size_t i = threadIdx.x + blockIdx.x * blockDim.x;
    size_t stride = blockDim.x * gridDim.x;
    for(; (i<N) && (!found); i += stride)
    {
        if(arr[i] == x)
        {
            if (docheck) found = true;
            idx = i;
            __threadfence();
            break;
        }
    }
}

int main()
{
    const size_t N = 1 << 24;
    const size_t findidx = 280270;
    const int findval = 0xdeadbeef;

    thrust::device_vector<int> data(N,1);
    data[findidx] = findval;

    bool flag = false;
    size_t zero = 0;

    {
        cudaMemcpyToSymbol(found, &flag, sizeof(bool));
        cudaMemcpyToSymbol(idx, &zero, sizeof(size_t));
        int blocks, threads;
        cudaOccupancyMaxPotentialBlockSize(&blocks, &threads, search<false>);
        search<false><<<blocks, threads>>>(thrust::raw_pointer_cast(data.data()), findval, N);
        cudaDeviceSynchronize();
        size_t result = 0;
        cudaMemcpyFromSymbol(&result, idx, sizeof(size_t));
        std::cout << "result = " << result << std::endl;
    }
    {
        cudaMemcpyToSymbol(found, &flag, sizeof(bool));
        cudaMemcpyToSymbol(idx, &zero, sizeof(size_t));
        int blocks, threads;
        cudaOccupancyMaxPotentialBlockSize(&blocks, &threads, search<true>);
        search<true><<<blocks, threads>>>(thrust::raw_pointer_cast(data.data()), findval, N);
        cudaDeviceSynchronize();
        size_t result = 0;
        cudaMemcpyFromSymbol(&result, idx, sizeof(size_t));
        std::cout << "result = " << result << std::endl;
    }
    return 0;
}
and profiling it gives the following:
$ nvcc -arch=sm_52 -o notify notify.cu
$ nvprof ./notify
==3916== NVPROF is profiling process 3916, command: ./notify
result = 280270
result = 280270
==3916== Profiling application: ./notify
==3916== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 78.00% 1.6773ms 1 1.6773ms 1.6773ms 1.6773ms void search<bool=0>(int const *, int, unsigned long)
19.93% 428.63us 1 428.63us 428.63us 428.63us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__uninitialized_fill::functor<thrust::device_ptr<int>, int>, unsigned long>, thrust::cuda_cub::__uninitialized_fill::functor<thrust::device_ptr<int>, int>, unsigned long>(thrust::device_ptr<int>, int)
1.82% 39.199us 1 39.199us 39.199us 39.199us void search<bool=1>(int const *, int, unsigned long)
As you can see, the version which sets the found flag completes the search in 40 microseconds, whereas the version which does not set the flag takes 1.7 milliseconds. Given that the kernel is run with the maximum number of resident blocks in both cases, we can conclude that the early exit mechanism worked correctly and running blocks detected that the required value had been found.

Can't get matrix*vector multiplication to go faster in CUDA than in CPU

#include <iostream>
#include <assert.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>

#define BLOCK_SIZE 32 // CUDA block size

__device__ inline int getValFromMatrix(int* matrix, int row, int col, int matSize) {
    if (row<matSize && col<matSize) {return matrix[row*matSize + col];}
    return 0;
}

__device__ inline int getValFromVector(int* vector, int row, int matSize) {
    if (row<matSize) {return vector[row];}
    return 0;
}

__global__ void matVecMultCUDAKernel(int* aOnGPU, int* bOnGPU, int* cOnGPU, int matSize) {
    __shared__ int aRowShared[BLOCK_SIZE];
    __shared__ int bShared[BLOCK_SIZE];
    __shared__ int myRow;
    __shared__ double rowSum;

    int myIndexInBlock = threadIdx.x;
    myRow = blockIdx.x;
    rowSum = 0;

    for (int m = 0; m < (matSize / BLOCK_SIZE + 1); m++) {
        aRowShared[myIndexInBlock] = getValFromMatrix(aOnGPU, myRow, m*BLOCK_SIZE+myIndexInBlock, matSize);
        bShared[myIndexInBlock] = getValFromVector(bOnGPU, m*BLOCK_SIZE+myIndexInBlock, matSize);
        __syncthreads(); // sync threads to make sure all fields have been written by all threads in the block to aRowShared and bShared
        if (myIndexInBlock==0) {
            for (int k=0; k<BLOCK_SIZE; k++) {
                rowSum += aRowShared[k] * bShared[k];
            }
        }
    }

    if (myIndexInBlock==0) {cOnGPU[myRow] = rowSum;}
}

static inline void cudaCheckReturn(cudaError_t result) {
    if (result != cudaSuccess) {
        std::cerr << "CUDA Runtime Error: " << cudaGetErrorString(result) << std::endl;
        assert(result == cudaSuccess);
    }
}

static void matVecMultCUDA(int* aOnGPU, int* bOnGPU, int* cOnGPU, int* c, int sizeOfc, int matSize) {
    matVecMultCUDAKernel<<<matSize,BLOCK_SIZE>>>(aOnGPU,bOnGPU,cOnGPU,matSize); // launch 1 block per row
    cudaCheckReturn(cudaMemcpy(c,cOnGPU,sizeOfc,cudaMemcpyDeviceToHost));
}

static void matVecMult(int** A, int* b, int* c, int matSize) {
    // sequential implementation:
    for (int i=0; i<matSize; i++) {
        c[i] = 0;
        for (int j=0; j<matSize; j++) {
            c[i] += (A[i][j] * b[j]);
        }
    }
}

int main() {
    int matSize = 1000;

    int** A, * b, * c;
    int* aOnGPU, * bOnGPU, * cOnGPU;

    A = new int*[matSize];
    for (int i = 0; i < matSize; i++) {A[i] = new int[matSize]();}
    b = new int[matSize]();
    c = new int[matSize]();

    int aSizeOnGPU = matSize * matSize * sizeof(int), bcSizeOnGPU = matSize * sizeof(int);

    cudaCheckReturn(cudaMalloc(&aOnGPU, aSizeOnGPU)); // cudaMallocPitch?
    cudaCheckReturn(cudaMalloc(&bOnGPU, bcSizeOnGPU));
    cudaCheckReturn(cudaMalloc(&cOnGPU, bcSizeOnGPU));

    srand(time(NULL));

    for (int i=0; i<matSize; i++) {
        b[i] = rand()%100;
        for (int j=0; j<matSize; j++) {
            A[i][j] = rand()%100;
        }
    }

    for (int i=0; i<matSize; i++) {cudaCheckReturn(cudaMemcpy((aOnGPU+i*matSize), A[i], bcSizeOnGPU, cudaMemcpyHostToDevice));}
    cudaCheckReturn(cudaMemcpy(bOnGPU, b, bcSizeOnGPU, cudaMemcpyHostToDevice));

    int iters = 1;
    timeval start, end;

    // sequential run:
    gettimeofday(&start, NULL);
    for (int i=0; i<iters; i++) {matVecMult(A, b, c, matSize);}
    gettimeofday(&end, NULL);
    std::cout << (end.tv_sec*1000000 + end.tv_usec) - (start.tv_sec*1000000 + start.tv_usec) << std::endl;

    // CUDA run:
    gettimeofday(&start, NULL);
    for (int i=0; i<iters; i++) {matVecMultCUDA(aOnGPU, bOnGPU, cOnGPU, c, bcSizeOnGPU, matSize);}
    gettimeofday(&end, NULL);
    std::cout << (end.tv_sec*1000000 + end.tv_usec) - (start.tv_sec*1000000 + start.tv_usec) << std::endl;

    cudaCheckReturn(cudaFree(aOnGPU));
    cudaCheckReturn(cudaFree(bOnGPU));
    cudaCheckReturn(cudaFree(cOnGPU));

    for (int i = 0; i < matSize; ++i) {
        delete[] A[i];
    }
    delete[] A;
    delete[] b;
    delete[] c;
}
Gives:
267171
580253
I've followed the guide at http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory on how to do a matrix multiplication. I used shared memory for both the matrix (A) and the vector (B), but no matter what matrix size (100*100 to 20000*20000) or block size (32 to 1024) I choose, the sequential implementation always outperforms the CUDA implementation; it is about twice as fast.
Since this is matrix*vector multiplication, the shared arrays and blocks are handled a bit differently; I'm using one block per row of the matrix instead of a 2D block over a part of the matrix.
Is my implementation wrong, or is CUDA simply not faster than the CPU?
First: you perform boundary checks in the CUDA implementation that you don't perform on the CPU. Branching is really expensive on a GPU.
Second: you count the cudaMemcpy in the CUDA timing. It's very uncommon to perform only one multiplication before having to get the result back to the CPU.
Usually (in CG, conjugate gradient, for example), you perform several hundred multiplications on the GPU before having to copy back.
Third: don't try to implement that yourself (except for educational purposes); use vendor libraries (like cuBLAS, which ships with every CUDA release), which are extremely hard to outperform.
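To illustrate the third point, here is a minimal sketch of the cuBLAS route. It uses float data (cuBLAS gemv has no integer version) and all-ones inputs so the expected result is easy to check; compile with -lcublas:

#include <iostream>
#include <vector>
#include <cuda_runtime.h>
#include <cublas_v2.h>

int main() {
    const int n = 1000;                          // n x n matrix, as in the question
    std::vector<float> A(n * n, 1.0f), x(n, 1.0f), y(n);

    float *dA, *dx, *dy;
    cudaMalloc(&dA, n * n * sizeof(float));
    cudaMalloc(&dx, n * sizeof(float));
    cudaMalloc(&dy, n * sizeof(float));
    cudaMemcpy(dA, A.data(), n * n * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dx, x.data(), n * sizeof(float), cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasCreate(&handle);
    const float alpha = 1.0f, beta = 0.0f;
    // y = alpha * A * x + beta * y; note that cuBLAS assumes column-major storage
    cublasSgemv(handle, CUBLAS_OP_N, n, n, &alpha, dA, n, dx, 1, &beta, dy, 1);

    cudaMemcpy(y.data(), dy, n * sizeof(float), cudaMemcpyDeviceToHost);
    std::cout << "y[0] = " << y[0] << std::endl; // expect 1000 for all-ones inputs

    cublasDestroy(handle);
    cudaFree(dA); cudaFree(dx); cudaFree(dy);
    return 0;
}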

CUDA: How can I run the parallel reduction code for summation that is described in the NVIDIA paper by Mark Harris?

Although I understand the logic behind the parallel reduction described in this paper, I can't seem to be able to run it for a simple example where the input array is filled with 1s.
Here is what I have achieved so far. Keep in mind that I'm using the Thrust library to manage input and output data.
#include <iostream>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <ctime>
#include <sys/time.h>
#include <sstream>
#include <string>
#include <fstream>

using namespace std;

__global__ void reduce0(int *g_idata, int *g_odata){
    extern __shared__ int sdata[];

    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
    sdata[tid] = g_idata[i];
    __syncthreads();

    for(unsigned int s=1; s < blockDim.x; s *= 2) {
        if (tid % (2*s) == 0) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }

    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

int main(void){
    int size = 10;
    thrust::host_vector<int> data_h_i(size, 1);
    // initialize the data, all values will be 1
    // so the final sum will be equal to 10

    int threadsPerBlock = 256;
    int totalBlocks = size/threadsPerBlock + 1;

    dim3 dimGrid(totalBlocks, 1, 1);
    dim3 dimBlock(threadsPerBlock, 1, 1);

    thrust::device_vector<int> data_v_i = data_h_i;
    thrust::device_vector<int> data_v_o(size);

    int* output = thrust::raw_pointer_cast(data_v_o.data());
    int* input = thrust::raw_pointer_cast(data_v_i.data());

    reduce0<<<dimGrid, dimBlock>>>(input, output);

    data_v_i.clear();
    data_v_i.shrink_to_fit();

    thrust::host_vector<int> data_h_o = data_v_o;

    data_v_o.clear();
    data_v_o.shrink_to_fit();

    cout << data_h_o[0] << endl;

    return 0;
}
The code is simple: I create a host_vector of size size and initialize all the values to 1.
Then I specify that we need 256 threads per block and dynamically find the number of blocks needed for my example.
To keep things simple, I create an array of only 10 values, which means that we are going to require only one block. So one kernel invocation will be enough to produce the final result.
My questions are the following:
Question 1
After compiling the above example (nvcc -O3 reduction.cu -arch=sm_21) and entering ./a.out I get the following message:
terminate called after throwing an instance of 'thrust::system::system_error'
what(): unspecified launch failure
I'm not sure what's going on here, but it seems to me that the error comes from the line
sdata[tid] = g_idata[i]
The kernel is an exact copy of the kernel described in the paper, so I'm not sure what changes are required in order to fix this problem.
Question 2
If we fix the first problem, how could we make the above code work for an arbitrary input array size? If, for example, our size is more than 256, then we would need at least two blocks, so each block will produce an output that will then have to be combined with the outputs of the other blocks. The paper says that we would need multiple invocations of the kernel; however, I'm not sure how this can be done dynamically.
Thank you in advance
EDIT1: For Question 1, it seems that I wasn't allocating the shared memory correctly. Calling the kernel like this: reduce0<<<dimGrid, dimBlock, size*sizeof(int)>>>(input, output); and also checking that tid is not out of range makes the code work properly.
The new kernel is the following:
__global__ void reduce0(int *g_idata, int *g_odata, int size){
    extern __shared__ int sdata[];

    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;

    if(tid < size){
        sdata[tid] = g_idata[i];
        __syncthreads();

        for(unsigned int s=1; s < size; s *= 2) {
            if (tid % (2*s) == 0) {
                sdata[tid] += sdata[tid + s];
            }
            __syncthreads();
        }

        if (tid == 0) g_odata[blockIdx.x] = sdata[0];
    }
}
I'm still not sure about Question 2 though.
Question 1
Your kernel is using dynamically allocated shared memory:
extern __shared__ int sdata[];
...
sdata[tid] = g_idata[i];
But you are not allocating any dynamic shared memory in your kernel call:
reduce0<<<dimGrid, dimBlock>>>(input, output);
                           ^
                           |
                           missing shared memory parameter
So when you attempt to access the shared memory, you get a kernel fault.
By the way, you can still do CUDA error checking on your kernel calls (even though you are using Thrust elsewhere).
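A minimal sketch of such a check, placed right after the launch (the error-message wording is just illustrative):

reduce0<<<dimGrid, dimBlock, size*sizeof(int)>>>(input, output);
cudaError_t err = cudaGetLastError();        // catches invalid launch configurations
if (err != cudaSuccess)
    std::cerr << "launch error: " << cudaGetErrorString(err) << std::endl;
err = cudaDeviceSynchronize();               // catches errors that occur during kernel execution
if (err != cudaSuccess)
    std::cerr << "execution error: " << cudaGetErrorString(err) << std::endl;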
Question 2
Question 2 is pretty well answered in Mark's paper here. You can see at the bottom of slide 9 that each block writes its partial result to an array in global memory (g_odata[]), which stores one result per block. We then simply launch another kernel of essentially the same type that operates on g_odata[] instead of the original input data. We repeat this process until our partial results (i.e. g_odata[]) contain no more results than the number of threads we launch in a threadblock, e.g. 256. We can then sum those final results with a single threadblock and produce a single answer value.
Examples are given in the CUDA sample code here.
Here's an edited version of your code that shows how to call the two kernels in sequence to handle a larger size. I don't consider this a paragon of reduction programming, just a simple extension of what you already wrote to illustrate the concept. Note that there are a variety of changes throughout the kernel and main code to facilitate the use of the kernel to handle larger data sizes. This method still won't scale beyond a data size of threadsPerBlock^2, but again it's just to illustrate the concept of calling multiple kernels in sequence to sum partial results, with the fewest modifications to your code.
#include <iostream>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <ctime>
#include <sys/time.h>
#include <sstream>
#include <string>
#include <fstream>

using namespace std;

__global__ void reduce0(int *g_idata, int *g_odata, int size){
    extern __shared__ int sdata[];

    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;

    sdata[tid] = 0;
    if(i < size)
        sdata[tid] = g_idata[i];
    __syncthreads();

    for(unsigned int s=1; s < blockDim.x; s *= 2) {
        if (tid % (2*s) == 0) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }

    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

int main(void){
    int size = 40000;
    thrust::host_vector<int> data_h_i(size, 1);
    // initialize the data, all values will be 1
    // so the final sum will be equal to size

    int threadsPerBlock = 256;
    int totalBlocks = (size+(threadsPerBlock-1))/threadsPerBlock;

    thrust::device_vector<int> data_v_i = data_h_i;
    thrust::device_vector<int> data_v_o(totalBlocks);

    int* output = thrust::raw_pointer_cast(data_v_o.data());
    int* input = thrust::raw_pointer_cast(data_v_i.data());

    reduce0<<<totalBlocks, threadsPerBlock, threadsPerBlock*sizeof(int)>>>(input, output, size);
    reduce0<<<1, threadsPerBlock, threadsPerBlock*sizeof(int)>>>(output, input, totalBlocks);

    data_v_o[0] = data_v_i[0];

    data_v_i.clear();
    data_v_i.shrink_to_fit();

    thrust::host_vector<int> data_h_o = data_v_o;

    data_v_o.clear();
    data_v_o.shrink_to_fit();

    cout << data_h_o[0] << endl;

    return 0;
}
After modifying the code provided by Robert Crovella in his answer, here is the final version, which supports an arbitrary number of input values.
#include <iostream>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <cmath>
#include <ctime>
#include <sys/time.h>
#include <sstream>
#include <string>
#include <fstream>

using namespace std;

__global__ void reduce0(int *g_idata, int *g_odata, int size){
    extern __shared__ int sdata[];

    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;

    sdata[tid] = 0;
    if(i < size)
        sdata[tid] = g_idata[i];
    __syncthreads();

    for(unsigned int s=1; s < blockDim.x; s *= 2) {
        if (tid % (2*s) == 0) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }

    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

int main(void){
    int size = 939289;
    thrust::host_vector<int> data_h_i(size, 1);
    // initialize the data, all values will be 1
    // so the final sum will be equal to size

    int threadsPerBlock = 256;
    int totalBlocks = (size+(threadsPerBlock-1))/threadsPerBlock;

    thrust::device_vector<int> data_v_i = data_h_i;
    thrust::device_vector<int> data_v_o(totalBlocks);

    int* output = thrust::raw_pointer_cast(data_v_o.data());
    int* input = thrust::raw_pointer_cast(data_v_i.data());

    bool turn = true;

    while(true){
        if(turn){
            reduce0<<<totalBlocks, threadsPerBlock, threadsPerBlock*sizeof(int)>>>(input, output, size);
            turn = false;
        }
        else{
            reduce0<<<totalBlocks, threadsPerBlock, threadsPerBlock*sizeof(int)>>>(output, input, size);
            turn = true;
        }
        if(totalBlocks == 1) break;
        size = totalBlocks;
        totalBlocks = ceil((double)totalBlocks/threadsPerBlock);
    }

    thrust::host_vector<int> data_h_o;
    if(turn)
        data_h_o = data_v_i;
    else
        data_h_o = data_v_o;

    data_v_i.clear();
    data_v_i.shrink_to_fit();
    data_v_o.clear();
    data_v_o.shrink_to_fit();

    cout << data_h_o[0] << endl;

    return 0;
}

CUDA random number generating

I would like to generate random numbers in my __device__ function and keep them in my int Board[500]. I found some examples, but they used a type named curandState. I only need a function like rand() in C++.
Here is my code. I have an N3[40000] array in device memory, I generate some random numbers in my kernel running on one thread (I mean kernel<<<1,1>>>), then I copy them to my N2[40000] array on the CPU and print them. So here is the code:
#include <iostream>
#include <cuda.h>
#include <ctime>
#include <curand.h>
#include <curand_kernel.h>

int n = 200;
using namespace std;

__device__ float generate( curandState* globalState, int ind )
{
    //int ind = threadIdx.x;
    curandState localState = globalState[ind];
    float RANDOM = curand_uniform( &localState );
    globalState[ind] = localState;
    return RANDOM;
}

__global__ void setup_kernel ( curandState * state, unsigned long seed )
{
    int id = threadIdx.x;
    curand_init ( seed, id, 0, &state[id] );
}

__global__ void kernel(float* N, curandState* globalState, int n)
{
    // generate random numbers
    for(int i=0; i<40000; i++)
    {
        int k = generate(globalState, i) * 100000;
        while(k > n*n-1)
        {
            k -= (n*n-1);
        }
        N[i] = k;
    }
}

int main()
{
    int N = 40000;

    curandState* devStates;
    cudaMalloc ( &devStates, N*sizeof( curandState ) );

    // setup seeds (note: 40000 threads in a single block exceeds the 1024-threads-per-block limit, so this launch will fail)
    setup_kernel <<< 1, N >>> ( devStates, unsigned(time(NULL)) );

    float N2[40000];
    float* N3;
    cudaMalloc((void**) &N3, sizeof(float)*N);

    kernel<<<1,1>>> (N3, devStates, n);

    cudaMemcpy(N2, N3, sizeof(float)*N, cudaMemcpyDeviceToHost);

    for(int i=0; i<N; i++)
    {
        cout << N2[i] << endl;
    }
    return 0;
}
You may use the cuRAND host API to generate random numbers directly in device memory and then run your kernel without even having to copy those values to the host.
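A minimal sketch of that approach (no curandState or setup kernel required; compile with -lcurand):

#include <iostream>
#include <cuda_runtime.h>
#include <curand.h>

int main()
{
    const int N = 40000;
    float* devData;
    cudaMalloc(&devData, N * sizeof(float));

    curandGenerator_t gen;
    curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
    curandSetPseudoRandomGeneratorSeed(gen, 1234ULL);
    curandGenerateUniform(gen, devData, N);   // fills device memory with floats in (0,1]

    // devData can now be passed directly to a kernel; copy back only to inspect
    float hostData[4];
    cudaMemcpy(hostData, devData, 4 * sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 0; i < 4; i++) std::cout << hostData[i] << std::endl;

    curandDestroyGenerator(gen);
    cudaFree(devData);
    return 0;
}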