Does bool variable in kernel need to be synchronized - cuda

I have a kernel consisting of a for loop that searches through an array for a specific int value. I'm using a grid block of 256 threads to do this. However, when one thread finds the value, I want to let the other threads know to exit. Currently I'm using a boolean flag, but I'm not sure if it's working properly. My concern is synchronization.
// Global flag a thread sets once it has located the value.
__device__ bool found;
// Scans the array for x; each thread starts at threadIdx.x and advances by
// `stride`, leaving the loop when it sees `found` set or finds a match itself.
// `arr`, `x` and `stride` are not declared in this snippet; they are assumed
// to be defined elsewhere -- TODO confirm.
__global__
void search()
{
    for (int i = threadIdx.x; i < 1000000; i += stride)
    {
        // Some thread already reported a match: stop scanning.
        if (found)
        {
            break;
        }
        // Compare with `==`. The previous `arr[i] = x` assigned x into the
        // array (destroying the data) and tested the assigned value instead.
        if (arr[i] == x)
        {
            found = true;
            break;
        }
    }
}
// Host entry point: writes `false` into the device-side `found` symbol.
int main()
{
    bool initial = false;
    // Copy sizeof(bool) bytes to offset 0 of the device symbol.
    cudaMemcpyToSymbol(found, &initial, sizeof(bool), 0, cudaMemcpyHostToDevice);
}

As pointed out in comments, you can probably achieve what you want by declaring the global device flag to be volatile, which will inhibit caching, and by using a memory fence function. There really isn't a global synchronization primitive which would do what you want other than the new grid synchronization mechanism introduced in CUDA 9 and new hardware, but that probably isn't necessary in this case. Turning your pseudocode into a toy example:
#include <iostream>
#include <thrust/device_vector.h>
// Early-exit flag: written by a thread that finds a match (only when docheck
// is true) and polled by every thread in its loop condition. Declared
// volatile so reads of it are not cached.
__device__ volatile bool found;
// Index of the matching element, written by the thread that finds it.
__device__ volatile size_t idx;
// Searches arr[0..N) for x.
//   docheck == true : a hit also sets `found`, letting other threads leave
//                     their loops early.
//   docheck == false: the kernel never writes `found`, so threads only stop
//                     at the end of the array or on their own hit.
// Each thread starts at its global index and advances by the total number of
// threads in the grid.
template<bool docheck>
__global__
void search(const int* arr, int x, size_t N)
{
size_t i = threadIdx.x + blockIdx.x * blockDim.x;
size_t stride = blockDim.x * gridDim.x;
// `found` is re-evaluated on every iteration.
for(; (i<N) && (!found); i += stride)
{
if(arr[i] == x)
{
if (docheck) found = true;
idx = i;
// Memory fence issued after both stores, before this thread exits.
__threadfence();
break;
}
}
}
// Runs one search<docheck> pass: clears the device-side `found` and `idx`
// symbols, launches the kernel with the configuration reported by
// cudaOccupancyMaxPotentialBlockSize, waits for it to finish and returns the
// value the kernel left in `idx`.
template<bool docheck>
static size_t run_search(const int* d_arr, int needle, size_t count)
{
    bool cleared = false;
    size_t start = 0;
    cudaMemcpyToSymbol(found, &cleared, sizeof(bool));
    cudaMemcpyToSymbol(idx, &start, sizeof(size_t));

    int blocks, threads;
    cudaOccupancyMaxPotentialBlockSize(&blocks, &threads, search<docheck>);
    search<docheck><<<blocks, threads>>>(d_arr, needle, count);
    cudaDeviceSynchronize();

    size_t hit = 0;
    cudaMemcpyFromSymbol(&hit, idx, sizeof(size_t));
    return hit;
}

// Fills a device vector with 1s, plants a single sentinel value, and searches
// for it twice: first without the early-exit flag, then with it.
int main()
{
    const size_t N = 1 << 24;
    const size_t findidx = 280270;
    const int findval = 0xdeadbeef;
    thrust::device_vector<int> data(N, 1);
    data[findidx] = findval;
    const int* d_data = thrust::raw_pointer_cast(data.data());

    const size_t without_flag = run_search<false>(d_data, findval, N);
    std::cout << "result = " << without_flag << std::endl;

    const size_t with_flag = run_search<true>(d_data, findval, N);
    std::cout << "result = " << with_flag << std::endl;
    return 0;
}
and profiling it gives the following:
$ nvcc -arch=sm_52 -o notify notify.cu
$ nvprof ./notify
==3916== NVPROF is profiling process 3916, command: ./notify
result = 280270
result = 280270
==3916== Profiling application: ./notify
==3916== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 78.00% 1.6773ms 1 1.6773ms 1.6773ms 1.6773ms void search<bool=0>(int const *, int, unsigned long)
19.93% 428.63us 1 428.63us 428.63us 428.63us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__uninitialized_fill::functor<thrust::device_ptr<int>, int>, unsigned long>, thrust::cuda_cub::__uninitialized_fill::functor<thrust::device_ptr<int>, int>, unsigned long>(thrust::device_ptr<int>, int)
1.82% 39.199us 1 39.199us 39.199us 39.199us void search<bool=1>(int const *, int, unsigned long)
As you can see, the version which sets the found flag completes the search in 40 microseconds, whereas the version which does not set the flag takes 1.7 milliseconds. Given that the kernel is run with the maximum number of resident blocks in both cases, we can conclude that the early exit mechanism worked correctly and running blocks detected that the required value had been found.

Related

CUDA, how to find the first item in an array that makes a function maximal

In Cuda C++, I have a big array Arr of integers, and a function F: int -> int. I want to find the first index of some items in Arr that makes F maximal.
How can I write a kernel that always keeps the maximal value (in F) to compare with others using atomic stuff to avoid facing the race condition problems?
BTW, I wonder if I can use the functions in Thrust library for this purpose instead.
How can I write a kernel that always keeps the maximal value (in F) to compare with others using atomic stuff to avoid facing the race condition problems?
Based on your description, including usage of int, and a desire to use atomics, I would suggest using a custom atomic. This should work for arrays up to 4 billion elements:
$ cat t2154.cu
#include <iostream>
// Returns the function value stored in the low 32 bits of a packed
// (index << 32 | value) word. Uses a mask instead of pointer punning so the
// result neither depends on byte order nor violates aliasing rules.
__device__ __host__ int get_int(unsigned long long val){
    return static_cast<int>(static_cast<unsigned>(val & 0xFFFFFFFFULL));
}
// Returns the index stored in the high 32 bits of a packed word.
__device__ __host__ unsigned get_uns(unsigned long long val){
    return static_cast<unsigned>(val >> 32);
}
// True when the candidate (fval, idx) should be kept over (fval1, idx1):
// either its value is strictly larger, or the values tie and its index is
// not larger.
__device__ bool done(int fval, int fval1, unsigned idx, unsigned idx1){
    const bool larger_value = fval > fval1;
    const bool tie_not_later = (fval == fval1) && (idx <= idx1);
    return larger_value || tie_not_later;
}
// Atomically replaces *addr with the packed pair (idx << 32 | fval) unless the
// pair already stored there wins according to done() (larger value, or equal
// value with an index that is not larger).
// Returns the last value observed at *addr.
__device__ unsigned long long my_custom_atomic(unsigned long long *addr, int fval, unsigned idx){
    // Convert fval through `unsigned` before widening: OR-ing a negative int
    // directly would sign-extend it and overwrite the index in the upper
    // 32 bits with 0xFFFFFFFF.
    const unsigned long long desired =
        (((unsigned long long)idx) << 32) | (unsigned long long)(unsigned)fval;
    unsigned long long old = *addr;
    // Retry until the stored pair is at least as good as ours; atomicCAS
    // returns the value it found, which refreshes `old` for the next check.
    while (!done(get_int(old), fval, get_uns(old), idx))
        old = atomicCAS(addr, old, desired);
    return old;
}
const int minidx = 256;
// Example objective: minidx minus the squared distance of t from minidx.
__device__ int f(int t){
    const int delta = t - minidx;
    return minidx - delta*delta;
}
// One thread per element: evaluates f on arr[gid] and offers the pair
// (f value, gid) to the packed result word through my_custom_atomic.
// Threads whose global index is N or beyond do nothing.
__global__ void k(int *arr, unsigned long long *min, unsigned N){
    const unsigned gid = blockIdx.x*blockDim.x+threadIdx.x;
    if (gid >= N) return;
    const int fval = f(arr[gid]);
    my_custom_atomic(min, fval, gid);
}
const unsigned my_N = 32768;
// Host driver: fills a managed array with 0..my_N-1, seeds the packed result
// word, launches k with 256 threads per block and prints the unpacked result.
int main(){
    int *arr;
    unsigned long long *min;
    cudaMallocManaged(&min, sizeof(min[0]));
    cudaMallocManaged(&arr, sizeof(arr[0])*my_N);
    for (unsigned n = 0; n != my_N; ++n) arr[n] = n;
    *min = 0xFFFFFFFF80000000ULL; //maximum unsigned index combined with minimum int value
    const unsigned threads = 256;
    k<<<my_N/threads, threads>>>(arr, min, my_N);
    cudaDeviceSynchronize();
    const unsigned long long packed = *min;
    std::cout << " maximum val: " << get_int(packed) << " at index: " << get_uns(packed) << std::endl;
}
$ nvcc -o t2154 t2154.cu
$ compute-sanitizer ./t2154
========= COMPUTE-SANITIZER
maximum val: 256 at index: 256
========= ERROR SUMMARY: 0 errors
$
We are assembling and disassembling the 64-bit quantity as needed, and using the general method outlined in the programming guide for arbitrary atomics.
I wonder if I can use the functions in Thrust library for this purpose instead.
Yes, you can do this with a transform and a reduce operation in thrust. In fact thrust can combine these into a single algorithm call. Here is an example:
$ cat t2155.cu
#include <iostream>
#include <thrust/transform.h>
#include <thrust/reduce.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/sequence.h>
#include <limits>
const int minidx = 256;
const size_t my_N = 32768;
// Transform functor: replaces element 0 of the tuple with
// minidx - (value - minidx)^2 and leaves the remaining elements untouched.
struct f
{
    template <typename T>
    __host__ __device__ T operator()(T t) {
        T out = t;
        const auto delta = thrust::get<0>(t) - minidx;
        thrust::get<0>(out) = minidx - delta*delta;
        return out;
    }
};
// Reduction functor over tuples: element 0 is compared first and the larger
// one wins; when element 0 ties, the tuple with the smaller element 1 wins.
struct r
{
template <typename T1, typename T2>
__host__ __device__ T1 operator()(T1 &t1, T2 &t2){
// Strictly larger element 0 decides immediately.
if (thrust::get<0>(t1) > thrust::get<0>(t2)) return t1;
if (thrust::get<0>(t1) < thrust::get<0>(t2)) return t2;
// Element 0 is equal: prefer the smaller element 1.
if (thrust::get<1>(t1) < thrust::get<1>(t2)) return t1;
return t2;
}
};
// Host driver for the Thrust variant: zips the data with a counting iterator,
// applies f() to each tuple and reduces with r() in one transform_reduce call.
int main(){
thrust::device_vector<int> arr(my_N);
// Fill arr with a sequence (default arguments of thrust::sequence).
thrust::sequence(arr.begin(), arr.end());
// Pair every element with a counter that starts at 0.
auto my_zip = thrust::make_zip_iterator(thrust::make_tuple(arr.begin(), thrust::counting_iterator<size_t>(0)));
// Initial value: smallest int paired with the largest size_t.
auto init = thrust::make_tuple(std::numeric_limits<int>::min(), std::numeric_limits<size_t>::max());
auto result = thrust::transform_reduce(my_zip, my_zip+my_N, f(), init, r());
std::cout << " maximum val: " << thrust::get<0>(result) << " at index: " << thrust::get<1>(result) << std::endl;
}
$ nvcc -o t2155 t2155.cu
$ compute-sanitizer ./t2155
========= COMPUTE-SANITIZER
maximum val: 256 at index: 256
========= ERROR SUMMARY: 0 errors
$
Notes:
Comparing those 2 implementations/examples, I know which one I would choose. The atomic method is brittle, type-limited, and size-limited. The thrust example could be adapted to handle types more flexibly (e.g. a function that returns a 64-bit type) and could be extended to handle beyond 4 billion elements.
The code here is just intended to be a possible roadmap. It's not thoroughly tested; bugs are always possible.
There is a strong correspondence between the two methods. The main() routines have almost a 1:1 correspondence, which hopefully you can identify. Furthermore the r() functor corresponds to the done() function, and the f() functor corresponds to the f() function.
Don't assume that you can readily/trivially increase my atomic example to 4 billion elements. The f() function I wrote would overflow/underflow an int variable. But with an appropriate data array and f() function, it should be possible to use up to 4 billion elements.
EDIT: As suggested in the comments below, we may be able to do a better job in the atomic case, by doing a threadblock level shared sweep reduction, followed by a single atomic per threadblock. Here is an example of that:
#include <iostream>
const int nTPB=512;
const unsigned long long initval = 0xFFFFFFFF80000000ULL; // maximum index and minimum int
// Returns the function value stored in the low 32 bits of a packed
// (index << 32 | value) word. Uses a mask instead of pointer punning so the
// result neither depends on byte order nor violates aliasing rules.
__device__ __host__ int get_int(unsigned long long val){
    return static_cast<int>(static_cast<unsigned>(val & 0xFFFFFFFFULL));
}
// Returns the index stored in the high 32 bits of a packed word.
__device__ __host__ unsigned get_uns(unsigned long long val){
    return static_cast<unsigned>(val >> 32);
}
// Decides whether (fval, idx) should be kept over (fval1, idx1): differing
// values are settled by the larger value, equal values by idx <= idx1.
__device__ bool done(int fval, int fval1, unsigned idx, unsigned idx1){
    if (fval != fval1) return fval > fval1;
    return idx <= idx1;
}
// Atomically replaces *addr with the packed pair (idx << 32 | fval) unless the
// pair already stored there wins according to done() (larger value, or equal
// value with an index that is not larger).
// Returns the last value observed at *addr.
__device__ unsigned long long my_custom_atomic(unsigned long long *addr, int fval, unsigned idx){
    // Convert fval through `unsigned` before widening: OR-ing a negative int
    // directly would sign-extend it and overwrite the index in the upper
    // 32 bits with 0xFFFFFFFF.
    const unsigned long long desired =
        (((unsigned long long)idx) << 32) | (unsigned long long)(unsigned)fval;
    unsigned long long old = *addr;
    // Retry until the stored pair is at least as good as ours; atomicCAS
    // returns the value it found, which refreshes `old` for the next check.
    while (!done(get_int(old), fval, get_uns(old), idx))
        old = atomicCAS(addr, old, desired);
    return old;
}
const int minidx = 256;
// Example objective: minidx minus the squared distance of t from minidx.
__device__ int f(int t){
    const int offset = t - minidx;
    return minidx - offset*offset;
}
// Picks one of two packed (index, value) words: t1 when done() says t1 should
// be kept over t2, otherwise t2.
__device__ unsigned long long my_reduce(unsigned long long t1, unsigned long long t2){
    const bool keep_first = done(get_int(t1), get_int(t2), get_uns(t1), get_uns(t2));
    return keep_first ? t1 : t2;
}
// Block-level variant: every thread folds its strided share of the array into
// one packed (index << 32 | f value) candidate, the block combines the
// candidates with a shared-memory sweep, and thread 0 issues a single
// my_custom_atomic per block.
// The shared array and the sweep are sized by nTPB, so the kernel is written
// for a launch with blockDim.x == nTPB.
__global__ void k(int *arr, unsigned long long *min, unsigned N){
    __shared__ unsigned long long smem[nTPB];
    unsigned long long best = initval;
    for (unsigned int idx = blockIdx.x*blockDim.x+threadIdx.x; idx < N; idx += gridDim.x*blockDim.x) {
        // Convert the int result through `unsigned` before widening; OR-ing a
        // negative value directly would sign-extend it and replace the index
        // in the upper 32 bits with 0xFFFFFFFF.
        const unsigned long long candidate =
            (((unsigned long long)idx) << 32) | (unsigned long long)(unsigned)f(arr[idx]);
        best = my_reduce(best, candidate);
    }
    smem[threadIdx.x] = best;
    // Sweep: the active half shrinks each step; the barrier at the top of the
    // loop body makes the previous step's writes visible before they are read.
    for (int t = nTPB>>1; t > 0; t >>= 1){
        __syncthreads();
        if (threadIdx.x < t) smem[threadIdx.x] = my_reduce(smem[threadIdx.x], smem[threadIdx.x+t]);
    }
    if (!threadIdx.x) my_custom_atomic(min, get_int(smem[0]), get_uns(smem[0]));
}
const unsigned my_N = 32768;
// Host driver for the block-reduction variant: fills a managed array with
// 0..my_N-1, plants a second element equal to minidx at index 1024, runs the
// kernel and prints the unpacked result.
int main(){
    unsigned long long *min;
    cudaMallocManaged(&min, sizeof(min[0]));
    int *arr;
    cudaMallocManaged(&arr, sizeof(arr[0])*my_N);
    for (unsigned i = 0; i < my_N; i++) arr[i] = i;
    arr[1024] = minidx; // duplicate of the best input, to exercise the index tie-break
    *min = initval;
    k<<<(my_N+nTPB-1)/nTPB, nTPB>>>(arr, min, my_N);
    cudaDeviceSynchronize();
    // The kernel keeps the pair with the largest f value, so the label says
    // "maximum" (it previously read "minimum").
    std::cout << " maximum val: " << get_int(*min) << " at index: " << get_uns(*min) << std::endl;
    // Release the managed allocations.
    cudaFree(arr);
    cudaFree(min);
}

Why does this CUDA reduction fail if I use 31 blocks?

The following CUDA code takes a list of labels (0, 1, 2, 3, ...) and finds the sums of the weights of these labels.
To accelerate the calculation, I use shared memory so that each thread maintains its own running sum. At the end of the calculation, I perform a CUB block-wide reduction and then an atomic add to the global memory.
The CPU and GPU agree on the results if I use fewer than 30 blocks, but disagree if I use more than this. Why is this and how can I fix it?
Checking error codes in the code doesn't yield anything and cuda-gdb and cuda-memcheck do not show any uncaught errors or memory issues.
I'm using NVCC v10.1.243 and running on a Nvidia Quadro P2000.
MWE
//Compile with, e.g., nvcc -I /z/downloads/cub-1.8.0/ cuda_reduction.cu -arch=sm_61
#include <algorithm>
#include <cub/cub.cuh>
#include <thrust/device_vector.h>
#include <random>
// Sums weights and counts occurrences per label.
// Every thread keeps a private run of num_classes sums and counts in dynamic
// shared memory, accumulates its strided share of the input there, and the
// block then reduces each class with cub::BlockReduce; thread 0 adds the
// block totals to the global `sums` / `counts` arrays with atomicAdd.
// Launch requirements visible in the code: blockDim.x must equal 128
// (asserted below) and the dynamic shared memory must hold
// num_threads*num_classes doubles followed by as many uint32_t values.
__global__ void group_summer(
const int32_t *const labels,
const float *const weights,
const int num_elements,
const int num_classes,
double *const sums,
uint32_t *const counts
){
constexpr int num_threads = 128;
assert(num_threads==blockDim.x);
//Get shared memory
extern __shared__ int s[];
// Layout: all per-thread sums first, then all per-thread counts.
double *const sums_shmem = (double*)s;
uint32_t *const counts_shmem = (uint32_t*)&sums_shmem[num_threads*num_classes];
// This thread's private slice of num_classes entries.
double *const my_sums = &sums_shmem [num_classes*threadIdx.x];
uint32_t *const my_counts = &counts_shmem[num_classes*threadIdx.x];
// NOTE(review): every thread starts this loop at i=0 with a step of
// num_threads, so only indices 0, num_threads, 2*num_threads, ... are
// zeroed and the rest of shared memory stays uninitialized. The answer that
// follows this listing identifies this as the bug (start at threadIdx.x).
for(int i=0;i<num_threads*num_classes;i+=num_threads){
sums_shmem[i] = 0;
counts_shmem[i] = 0;
}
__syncthreads();
// Each thread walks the input starting at its global index, stepping by the
// total number of threads in the grid.
for(int i=blockIdx.x * blockDim.x + threadIdx.x;i<num_elements;i+=gridDim.x*blockDim.x){
// printf("Thread %d at %d looking at %d with %f at %ld and %ld\n", threadIdx.x, i, labels[i], weights[i], (long int)&my_counts[i], (long int)&my_sums[i]);
const auto l = labels[i];
// printf("Before thread %d at %d now has %d counts and %lf sums\n", threadIdx.x, i, my_counts[l], my_sums[l]);
my_sums[l] += weights[i];
my_counts[l]++;
// printf("After thread %d at %d now has %d counts and %lf sums\n", threadIdx.x, i, my_counts[l], my_sums[l]);
}
__syncthreads();
__shared__ cub::BlockReduce<double, num_threads>::TempStorage double_temp_storage;
__shared__ cub::BlockReduce<uint32_t, num_threads>::TempStorage uint32_t_temp_storage;
// One block-wide reduction per class.
// NOTE(review): the same TempStorage objects are reused on every iteration
// with no barrier in between -- confirm against the CUB reuse requirements.
for(int l=0;l<num_classes;l++){
// printf("Thread %d has %d counts with total weight %f for label %d\n", threadIdx.x, my_counts[l], my_sums[l], l);
const auto sums_total = cub::BlockReduce<double,num_threads>(double_temp_storage).Reduce(my_sums[l], cub::Sum());
const auto counts_total = cub::BlockReduce<uint32_t,num_threads>(uint32_t_temp_storage).Reduce(my_counts[l], cub::Sum());
// Only thread 0 publishes the block total.
if(threadIdx.x==0){
atomicAdd(&sums[l], sums_total);
atomicAdd(&counts[l], counts_total);
}
}
}
// CPU reference: for every (label, weight) pair, adds the weight to
// sums[label] and increments counts[label]. `sums` and `counts` are
// accumulated into, not cleared.
void group_summer_cpu(
    const std::vector<int32_t> &labels,
    const std::vector<float> &weights,
    std::vector<double> &sums,
    std::vector<uint32_t> &counts
){
    auto weight_it = weights.begin();
    for(auto label_it = labels.begin(); label_it != labels.end(); ++label_it, ++weight_it){
        const auto label = *label_it;
        sums[label] += *weight_it;
        counts[label]++;
    }
}
// Returns true when both vectors have the same length and every pair of
// corresponding elements differs by at most 1e-4 in absolute value.
template<class T>
bool vec_nearly_equal(const std::vector<T> &a, const std::vector<T> &b){
    const size_t count = a.size();
    if(count==b.size()){
        size_t i = 0;
        while(i<count){
            if(std::abs(a[i]-b[i])>1e-4)
                return false;
            ++i;
        }
        return true;
    }
    return false;
}
// Generates N random (label, weight) pairs, runs group_summer on the GPU with
// num_blocks blocks of 128 threads, runs group_summer_cpu as the reference,
// and prints whether the sums and counts agree along with both result sets.
//   gen        - random engine, advanced by this call
//   N          - number of elements to generate
//   label_max  - largest label value the distribution can produce
//   num_blocks - grid size used for the kernel launch
void TestGroupSummer(std::mt19937 &gen, const int N, const int label_max, const int num_blocks){
std::vector<int32_t> labels(N);
std::vector<float> weights(N);
std::uniform_int_distribution<int> label_dist(0, label_max);
std::uniform_real_distribution<float> weight_dist(0, 5000);
for(int i=0;i<N;i++){
labels[i] = label_dist(gen);
weights[i] = weight_dist(gen);
}
// for(const auto &x: labels) std::cout<<x<<" "; std::cout<<std::endl;
// for(const auto &x: weights) std::cout<<x<<" "; std::cout<<std::endl;
// Number of classes is derived from the largest label actually generated.
const int num_classes = 1 + *std::max_element(labels.begin(), labels.end());
thrust::device_vector<int32_t> d_labels(labels.size());
thrust::device_vector<float> d_weights(labels.size());
thrust::device_vector<double> d_sums(num_classes);
thrust::device_vector<uint32_t> d_counts(num_classes);
thrust::copy(labels.begin(), labels.end(), d_labels.begin());
thrust::copy(weights.begin(), weights.end(), d_weights.begin());
constexpr int num_threads = 128;
// One double and one uint32_t per (thread, class) pair.
const int shmem = num_threads * num_classes * (sizeof(double)+sizeof(uint32_t));
std::cout<<"Num blocks: "<<num_blocks<<std::endl;
std::cout<<"Shared memory: "<<shmem<<std::endl;
group_summer<<<num_blocks,num_threads,shmem>>>(
thrust::raw_pointer_cast(d_labels.data()),
thrust::raw_pointer_cast(d_weights.data()),
labels.size(),
num_classes,
thrust::raw_pointer_cast(d_sums.data()),
thrust::raw_pointer_cast(d_counts.data())
);
// NOTE(review): the answer following this listing points out that
// CUDA_SUCCESS is not the runtime API token; cudaSuccess should be used in
// both checks below.
if(cudaGetLastError()!=CUDA_SUCCESS){
std::cout<<"Kernel failed to launch!"<<std::endl;
}
// The return value of cudaDeviceSynchronize() itself is discarded here; the
// check that follows relies on cudaGetLastError() instead.
cudaDeviceSynchronize();
if(cudaGetLastError()!=CUDA_SUCCESS){
std::cout<<"Error in kernel!"<<std::endl;
}
// Copy GPU results back to the host.
std::vector<double> h_sums(num_classes);
std::vector<uint32_t> h_counts(num_classes);
thrust::copy(d_sums.begin(), d_sums.end(), h_sums.begin());
thrust::copy(d_counts.begin(), d_counts.end(), h_counts.begin());
// CPU reference results.
std::vector<double> correct_sums(num_classes);
std::vector<uint32_t> correct_counts(num_classes);
group_summer_cpu(labels, weights, correct_sums, correct_counts);
// Sums are compared with an absolute tolerance, counts exactly.
std::cout<<"Sums good? " <<vec_nearly_equal(h_sums,correct_sums)<<std::endl;
std::cout<<"Counts good? "<<(h_counts==correct_counts)<<std::endl;
std::cout<<"GPU Sums: "; for(const auto &x: h_sums) std::cout<<x<<" "; std::cout<<std::endl;
std::cout<<"CPU Sums: "; for(const auto &x: correct_sums) std::cout<<x<<" "; std::cout<<std::endl;
std::cout<<"GPU Counts: "; for(const auto &x: h_counts) std::cout<<x<<" "; std::cout<<std::endl;
std::cout<<"CPU Counts: "; for(const auto &x: correct_counts) std::cout<<x<<" "; std::cout<<std::endl;
}
// Runs the comparison four times with 30 blocks and once with 31 blocks,
// sharing one default-seeded random engine across all runs.
int main(){
    std::mt19937 gen;
    //These all work
    for(int run=0; run<4; ++run){
        TestGroupSummer(gen, 1000000, 10, 30);
    }
    //This fails
    TestGroupSummer(gen, 1000000, 10, 31);
}
When I run your code on a Tesla V100, all the results are failures except the first test.
You have a problem here:
for(int i=0;i<num_threads*num_classes;i+=num_threads){
sums_shmem[i] = 0;
counts_shmem[i] = 0;
}
That is not properly zero-ing out shared memory. You need to change the i=0 to i=threadIdx.x.
When I make that change, everything passes for me.
As an aside, this is not correct:
if(cudaGetLastError()!=CUDA_SUCCESS)
CUDA_SUCCESS is not the correct enum token to use with the runtime API. You should use cudaSuccess instead (there are 2 instances of this).
I also think your error comparison is apt to cause trouble:
if(std::abs(a[i]-b[i])>1e-4)
but it doesn't seem to be an issue here. I would normally expect to see some scaling before the test.

Atomic Operation failed in CUDA

As the compute capability is 2.1, the atomicAdd and atomicMax operations do not support double precision, so I defined both functions based on some answers on Stack Overflow.
It is strange that the atomicAdd function works well but the atomicMax doesn't work, here is my code.
The test of my code is to generate random number on each block, and then sum the random numbers on each block, we have block sum, I want to test the atomicAdd and atomicMax on the block sum.
#include <iostream>
#include <curand.h>
#include <curand_kernel.h>
#include <stdio.h>
#include <stdlib.h>
#define num_of_blocks 2
#define threads_per_block 2
#define tot_threads 4
__device__ double gsum[num_of_blocks];
__device__ double dev_sum;
__device__ double dev_max;
// Seeds one cuRAND state per thread: every thread uses the same seed, its
// global thread index as the sequence number, and offset 0.
__global__ void initcuRand(curandState* globalState, unsigned long seed){
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    curand_init(seed, tid, 0, globalState + tid);
}
// atomicMax for double, implemented as a compare-and-swap loop on the 64-bit
// representation. Returns the value observed at `address` before the final,
// successful swap.
__device__ double atomicMax_d(double* address, double val)
{
    unsigned long long int* const slot = (unsigned long long int*)address;
    unsigned long long int observed = *slot;
    while (true) {
        const unsigned long long int expected = observed;
        // Candidate is the larger of the incoming value and what we last saw.
        const double larger = ::fmax(val, __longlong_as_double(expected));
        observed = ::atomicCAS(slot, expected, __double_as_longlong(larger));
        // The swap took effect only if nobody changed the slot in between.
        if (observed == expected) break;
    }
    return __longlong_as_double(observed);
}
// atomicAdd for double, implemented as a compare-and-swap loop on the 64-bit
// representation. Returns the value stored at `address` before the addition.
__device__ double atomicAdd_d(double* address, double val)
{
    unsigned long long int* const slot = (unsigned long long int*)address;
    unsigned long long int seen = *slot;
    for (;;) {
        const unsigned long long int expected = seen;
        const double updated = val + __longlong_as_double(expected);
        seen = atomicCAS(slot, expected, __double_as_longlong(updated));
        // Retry when another thread modified the slot since we read it.
        if (seen == expected) break;
    }
    return __longlong_as_double(seen);
}
// Test kernel: every thread draws one normally distributed number, thread 0
// of each block adds the first two numbers of the block and stores the block
// sum in gsum[blockIdx.x]; the threads with global index < num_of_blocks then
// feed gsum into the custom atomicAdd_d / atomicMax_d and two threads print
// dev_sum and dev_max.
__global__ void kernel(curandState *globalState){
// global id
int gidx = threadIdx.x + blockIdx.x * blockDim.x;
// local id
int lidx = threadIdx.x;
// creat shared memory to store seeds
// Sized by tot_threads but indexed with the block-local id below.
__shared__ curandState localState[tot_threads];
__shared__ double srandnum[threads_per_block];
// copy global seed to local
localState[lidx] = globalState[gidx];
//synchronize the local threads writing to the local memory cache
__syncthreads();
// generate random number from normal distribution in shared memory
srandnum[lidx] = curand_normal(&localState[lidx]);
__syncthreads();
// Only elements 0 and 1 are added, which matches threads_per_block == 2.
if(lidx == 0){srandnum[lidx] += srandnum[lidx + 1];} // sum of each block
if(lidx == 0){gsum[blockIdx.x] = srandnum[lidx];} // copy the sums back to global memory
__threadfence();
// NOTE(review): gsum[gidx] for gidx > 0 is written by a different block and
// nothing here waits for that block to finish; __threadfence() orders this
// thread's own writes but is not a grid-wide barrier -- confirm whether the
// read can see a stale value.
if( gidx < num_of_blocks){
atomicAdd_d(&dev_sum, gsum[gidx]);
}
if( gidx < num_of_blocks){
atomicMax_d(&dev_max, gsum[gidx]);
}
// NOTE(review): the prints read dev_sum / dev_max without waiting for the
// other threads' atomic updates to complete.
if( gidx == 0){
printf("Sum is: %lf\n", dev_sum);
}
if( gidx == 1){
printf("Max is: %lf\n", dev_max);
}
}
// Host driver: allocates and seeds the per-thread RNG states, runs the test
// kernel, then copies the per-block sums back and prints them.
int main(){
    // device-side RNG states, one per thread
    curandState *globalState;
    cudaMalloc((void**)&globalState, tot_threads*sizeof(curandState));
    initcuRand<<<num_of_blocks, threads_per_block>>>(globalState, 1);

    // run the test kernel
    kernel<<<num_of_blocks, threads_per_block>>>(globalState);

    // fetch the per-block sums from the gsum device symbol
    double block_sums[num_of_blocks];
    cudaMemcpyFromSymbol(block_sums, gsum, sizeof(block_sums), 0, cudaMemcpyDeviceToHost);
    std::cout << "Sum of each block:\n";
    for (const double *p = block_sums; p != block_sums + num_of_blocks; ++p){
        std::cout << *p << std::endl;
    }

    cudaFree(globalState);
    return 0;
}
The result I get is
Sum is: -0.898329
Max is: 0.000000
Sum of each block:
-0.0152994
-0.88303
From the result, I know that the atomicAdd function works but the atomicMax function doesn't work, I have no idea of this. Thanks beforehand.
You don't ever initialize dev_max or dev_sum. You can't sensibly do these types of atomic operations on them if they don't start with a known value.
Try something like this instead:
__device__ double dev_sum = 0.0;
__device__ double dev_max = -1e99;
and I think you'll be happier with the results.

Can't get matrix*vector multiplication to go faster in CUDA than in CPU

#include <iostream>
#include <assert.h>
#include <sys/time.h>
#define BLOCK_SIZE 32 // CUDA block size
// Bounds-checked read from a square row-major matrix of side matSize;
// returns 0 when row or col is matSize or larger.
__device__ inline int getValFromMatrix(int* matrix, int row, int col,int matSize) {
    const bool inside = (row < matSize) && (col < matSize);
    return inside ? matrix[row*matSize + col] : 0;
}
// Bounds-checked read from a vector of length matSize; returns 0 when row is
// matSize or larger.
__device__ inline int getValFromVector(int* vector, int row, int matSize) {
    if (row >= matSize) {
        return 0;
    }
    return vector[row];
}
// Computes one element of c = A * b per block: block `blockIdx.x` handles row
// `blockIdx.x` of the row-major matSize x matSize matrix.
// The row and the vector are staged through shared memory in tiles of
// BLOCK_SIZE elements; thread 0 accumulates each tile's dot product.
// The shared tiles hold BLOCK_SIZE entries and are indexed by threadIdx.x, so
// the kernel is written for a launch with blockDim.x == BLOCK_SIZE.
__global__ void matVecMultCUDAKernel(int* aOnGPU, int* bOnGPU, int* cOnGPU, int matSize) {
    __shared__ int aRowShared[BLOCK_SIZE];
    __shared__ int bShared[BLOCK_SIZE];
    // Per-thread copies; the previous __shared__ myRow/rowSum were written by
    // every thread of the block at once for no benefit.
    const int myIndexInBlock = threadIdx.x;
    const int myRow = blockIdx.x;
    double rowSum = 0; // only thread 0 accumulates into it
    for (int m = 0; m < (matSize / BLOCK_SIZE + 1); m++) {
        // Out-of-range columns read as 0 through the bounds-checked helpers.
        aRowShared[myIndexInBlock] = getValFromMatrix(aOnGPU, myRow, m*BLOCK_SIZE + myIndexInBlock, matSize);
        bShared[myIndexInBlock] = getValFromVector(bOnGPU, m*BLOCK_SIZE + myIndexInBlock, matSize);
        __syncthreads(); // tile fully written before thread 0 reads it
        if (myIndexInBlock == 0) {
            for (int k = 0; k < BLOCK_SIZE; k++) {
                rowSum += aRowShared[k] * bShared[k];
            }
        }
        // Second barrier: without it the other threads could start loading
        // the next tile while thread 0 is still reading the current one.
        __syncthreads();
    }
    if (myIndexInBlock == 0) {cOnGPU[myRow] = rowSum;}
}
// Reports a failed CUDA runtime call on stderr and then asserts; does nothing
// when `result` is cudaSuccess.
static inline void cudaCheckReturn(cudaError_t result) {
    if (result == cudaSuccess) {
        return;
    }
    std::cerr <<"CUDA Runtime Error: " << cudaGetErrorString(result) << std::endl;
    assert(result == cudaSuccess);
}
// Launches the matrix*vector kernel with one block per matrix row and copies
// the result vector back to the host.
//   aOnGPU, bOnGPU, cOnGPU - device buffers for the matrix, input vector and result
//   c                      - host buffer receiving the result
//   sizeOfc                - number of bytes to copy into c
//   matSize                - side length of the square matrix
static void matVecMultCUDA(int* aOnGPU,int* bOnGPU, int* cOnGPU, int* c, int sizeOfc, int matSize) {
    matVecMultCUDAKernel<<<matSize,BLOCK_SIZE>>>(aOnGPU,bOnGPU,cOnGPU,matSize); // Launch 1 block per row
    // A kernel launch returns no status; query it explicitly so a bad launch
    // configuration is reported instead of silently yielding stale results.
    cudaCheckReturn(cudaGetLastError());
    // Blocking copy: also surfaces errors raised while the kernel was running.
    cudaCheckReturn(cudaMemcpy(c,cOnGPU,sizeOfc,cudaMemcpyDeviceToHost));
}
// Sequential reference: c = A * b for a square matrix given as an array of
// row pointers. Each c[row] is cleared and then accumulated in place.
static void matVecMult(int** A,int* b, int* c, int matSize) {
    for (int row = 0; row < matSize; ++row) {
        const int* rowData = A[row];
        int& out = c[row];
        out = 0;
        for (int col = 0; col < matSize; ++col) {
            out += (rowData[col] * b[col]);
        }
    }
}
// Benchmark driver: builds a random matSize x matSize int matrix and vector,
// uploads them, then times the sequential multiply against the CUDA version
// and prints both elapsed times in microseconds.
int main() {
int matSize = 1000;
int** A,* b,* c;
int* aOnGPU,* bOnGPU,* cOnGPU;
// Host matrix is an array of separately allocated, zero-initialized rows.
A = new int*[matSize];
for (int i = 0; i < matSize;i++) {A[i] = new int[matSize]();}
b = new int[matSize]();
c = new int[matSize]();
// Byte sizes of the device matrix and of the b / c vectors.
int aSizeOnGPU = matSize * matSize * sizeof(int), bcSizeOnGPU = matSize * sizeof(int);
cudaCheckReturn(cudaMalloc(&aOnGPU,aSizeOnGPU)); // cudaMallocPitch?
cudaCheckReturn(cudaMalloc(&bOnGPU,bcSizeOnGPU));
cudaCheckReturn(cudaMalloc(&cOnGPU,bcSizeOnGPU));
srand(time(NULL));
// Random entries in [0, 99].
for (int i=0;i<matSize;i++) {
b[i] = rand()%100;
for (int j=0;j<matSize;j++) {
A[i][j] = rand()%100;
}
}
// Rows are copied one by one into the contiguous device matrix.
for (int i=0;i<matSize;i++) {cudaCheckReturn(cudaMemcpy((aOnGPU+i*matSize),A[i],bcSizeOnGPU,cudaMemcpyHostToDevice));}
cudaCheckReturn(cudaMemcpy(bOnGPU,b,bcSizeOnGPU,cudaMemcpyHostToDevice));
int iters=1;
timeval start,end;
// Sequential run:
gettimeofday(&start,NULL);
for (int i=0;i<iters;i++) {matVecMult(A,b,c,matSize);}
gettimeofday(&end,NULL);
std::cout << (end.tv_sec*1000000 + end.tv_usec) - (start.tv_sec*1000000 + start.tv_usec) << std::endl;
// CUDA run:
// The timed region covers the kernel launch and the device-to-host copy
// performed inside matVecMultCUDA; the host-to-device uploads above are not
// included. The result overwrites the same host buffer `c` as the CPU run.
gettimeofday(&start,NULL);
for (int i=0;i<iters;i++) {matVecMultCUDA(aOnGPU,bOnGPU,cOnGPU,c,bcSizeOnGPU,matSize);}
gettimeofday(&end,NULL);
std::cout << (end.tv_sec*1000000 + end.tv_usec) - (start.tv_sec*1000000 + start.tv_usec) << std::endl;
// Release device and host memory.
cudaCheckReturn(cudaFree(aOnGPU));
cudaCheckReturn(cudaFree(bOnGPU));
cudaCheckReturn(cudaFree(cOnGPU));
for (int i = 0; i < matSize; ++i) {
delete[] A[i];
}
delete[] A;
delete[] b;
delete[] c;
}
Gives:
267171
580253
I've followed the guide on http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory, on how to do a matrix multiplication. I used shared memory for both the matrix (A) and the vector (B), but no matter what matrix size (100*100-20000*20000) or block size (32-1024) i choose, the sequential implementation always outperforms the CUDA implementation in terms of speed, it is about twice as fast.
Since I'm using matrix*vector multiplication, the shared arrays and blocks are handled a bit different; I'm using one block per row of the matrix instead of a 2D block over a part of the matrix.
Is my implementation wrong, or is simply CUDA not faster than the CPU?
First item: You perform boundary checks in the CUDA implementation that you don't perform on the CPU. Branches are really expensive on a GPU.
Second : You count the cudamemcpy in the cuda performance. It's very uncommon to perform only one multiplication before having to get the result back to cpu.
Usually (on CG for example), you perform several hundreds of multiplication on GPU before having to copy back.
Third: Don't try to implement that yourself (except for educational purposes); use vendor libraries (like CUBLAS, which ships with every CUDA release), which are extremely hard to outperform.

I lost data after __syncthreads() in cuda

I am trying to find the maximum of an array. I took help from "CUDA Maximum Reduction Algorithm Not Working" and made some modifications of my own. I am running it on 16 data elements, and I find that in the kernel code the shared memory holds only the first 4 elements; the rest are lost. I put in two cuPrintf calls. The first cuPrintf shows the data is there in shared memory, but the second cuPrintf, which is just after __syncthreads(), shows 0 from thread id 4 onwards. Please help.
#include
#include
#include
#include
#include
#include "cuPrintf.cu"
#include "cuPrintf.cuh"
// Returns a when a > b, otherwise b.
__device__ float MaxOf2(float a, float b)
{
    return (a > b) ? a : b;
}
// Per-block maximum: each thread loads one element into dynamic shared
// memory, the block halves the active range until sdata[0] holds the result,
// and thread 0 writes it to outPut[blockIdx.x].
// The shared array is `extern`, so its size comes from the third launch
// parameter; the kernel indexes it up to blockDim.x - 1.
__global__ void findMax(int size,float *array_device , float *outPut)
{
extern __shared__ float sdata[];
int tid = threadIdx.x;
int i = blockIdx.x*blockDim.x + threadIdx.x;
// Only threads with a valid global index load data.
// NOTE(review): threads with i >= size never write sdata[tid], yet the
// reduction below reads sdata[tid+s] across the whole block -- confirm those
// slots are initialized some other way.
if(i< size)
{
sdata[tid] = array_device[i];
cuPrintf(" array_d[%d]===%f, sdata[%d]===%f\n ",i,array_device[i],tid,sdata[tid]);
__threadfence();
}
__syncthreads();
// Debug print after the barrier; guarded by the block-local id but reads
// array_device at the global index i.
if(tid<size)
cuPrintf(" array_d[%d]===%f, sdata[%d]===%f\n ",i,array_device[i],tid,sdata[tid]);
// Halve the active range each step, with a barrier after every step.
for ( int s=blockDim.x/2; s>0; s=s>>1)//s=blockDim.x/2
{
if (tid < s)
{
sdata[tid]= MaxOf2(sdata[tid],sdata[tid+s]);
}
__syncthreads();
}
if (tid == 0) outPut[blockIdx.x] = sdata[0];
}
// Host driver: uploads a fixed 16-element array and repeatedly launches
// findMax, feeding each pass's per-block results back in as the next input
// until a single value remains, then prints it.
int main()
{
// M and N are set (N also read from stdin) but the array size used below is
// the hard-coded `size`.
long double M = pow(2,20);
long double N = 2;
int noThreadsPerBlock = 512 ;
printf("\n Provide the array Size N.(array will be of size N * 2^20 ) :-");
scanf("%Lf",&N);
long int size = 16;
int numOfBlock = (int)size /noThreadsPerBlock + 1;
// NOTE(review): numOfBlock is an int printed with %ld.
printf("\n num of blocks==%ld",numOfBlock);
float *array_device , *outPut;
float array_host[]={221,100,2,340,47,36,500,1,33,4460,5,6,7,8,9,11};
cudaMalloc((void **)&array_device, size*sizeof(float));
cudaMalloc((void **)&outPut, size*sizeof(float));
cudaError_t error0 = cudaGetLastError();
printf("\n 0CUDA error: %s\n", cudaGetErrorString(error0));
printf("size===%ld",size);
cudaMemcpy(array_device, array_host, size*sizeof(float), cudaMemcpyHostToDevice);
cudaError_t error1 = cudaGetLastError();
printf("\n1CUDA error: %s\n", cudaGetErrorString(error1));
// Each pass reduces `size` elements to numOfBlock partial maxima.
while(size>1 )
{
cudaPrintfInit();
// NOTE(review): the launch passes no dynamic shared memory size although
// findMax declares an extern __shared__ array; the answer following this
// listing identifies this as the problem.
findMax<<< numOfBlock,noThreadsPerBlock>>>(size,array_device, outPut);cudaPrintfDisplay(stdout, true);
cudaPrintfEnd();
cudaError_t error2 = cudaGetLastError();
printf(" 2CUDA error: %s\n", cudaGetErrorString(error2));
// Partial results become the input of the next pass.
cudaMemcpy(array_device, outPut, size*sizeof(float), cudaMemcpyDeviceToDevice);
size = numOfBlock;
printf("\n ****size==%ld\n",size);
numOfBlock = (int)size /noThreadsPerBlock + 1;
}
cudaMemcpy(array_host, outPut, size*sizeof(float), cudaMemcpyDeviceToHost);
cudaError_t error3 = cudaGetLastError();
printf("\n3CUDA error: %s\n", cudaGetErrorString(error3));
for(int i=0;i<size;i++)
printf("\n index==%d ;data=%f ",i,array_host[i]);
return 0;
}
I'm posting my comment as an answer as requested.
Firstly, you haven't specified the dynamic shared memory size in the kernel launch. It should look something like:
findMax<<< numOfBlock,noThreadsPerBlock,sizeof(float)*noThreadsPerBlock>>>
Secondly, what was the concept behind condition if(tid<size) on second cuPrintf? Providing output of the program could also help.