Is this a data race in a nested Thrust functor? (CUDA)

I have tested the snippet below and tried to explain the cause of its behavior, as well as to find a way to fix it, but I have failed to do so.
#include <thrust/inner_product.h>
#include <thrust/functional.h>
#include <thrust/device_vector.h>
#include <thrust/random.h>
#include <thrust/execution_policy.h>
#include <iostream>
#include <cmath>
#include <boost/concept_check.hpp>
// Unary functor that applies a fixed integer 2x2 map to an (x, y) tuple:
//   (x, y) -> (a*x - b*y, -b*x + a*y)
// The coefficients a and b are copied by value into the functor object when
// it is constructed and stay the same for every element the functor is
// applied to.
struct alter_tuple {
    // __host__ __device__ because this functor is also constructed inside
    // device code (see the nested transform in bFuntor::operator()).
    __host__ __device__
    alter_tuple(const int& a_, const int& b_) : a(a_), b(b_) {}

    // Transforms one (x, y) tuple and prints the input and the result.
    // Declared const: the call reads a and b but never modifies the functor.
    __host__ __device__
    thrust::tuple<int,int> operator()(thrust::tuple<int,int> X) const
    {
        const int Xx = thrust::get<0>(X);
        const int Xy = thrust::get<1>(X);
        const int Xpx = a*Xx-b*Xy;
        const int Xpy = -b*Xx+a*Xy;
        printf("in (%d,%d) -> (%d,%d)\n",Xx,Xy,Xpx,Xpy);
        return thrust::make_tuple(Xpx,Xpy);
    }

    // Plain by-value members: each copy of the functor owns its own a and b.
    int a;
    int b;
};
// Index-based functor: for element idx it reads a[idx] and b[idx] and uses
// them both as the input point and as the coefficients of the map
//   (x, y) -> (a[idx]*x - b[idx]*y, -b[idx]*x + a[idx]*y),  x = a[idx], y = b[idx].
// The pointers c and d are stored but not used by operator().
struct alter_tuple_arr {
    // __host__ __device__ because this functor is also constructed inside
    // device code (see the nested transform in bFuntor::operator()).
    __host__ __device__
    alter_tuple_arr(int* a_, int* b_, int* c_, int* d_) : a(a_), b(b_), c(c_), d(d_) {}

    // Computes and prints the transformed pair for element idx.
    // Declared const: the call never modifies the functor's members.
    __host__ __device__
    thrust::tuple<int,int> operator()(const int& idx) const
    {
        // Each array element is loaded once; it serves as both operand and
        // coefficient, exactly as in the original expressions.
        const int Xx = a[idx];
        const int Xy = b[idx];
        const int Xpx = Xx*Xx-Xy*Xy;
        const int Xpy = -Xy*Xx+Xx*Xy;
        printf("in (%d,%d) -> (%d,%d)\n",Xx,Xy,Xpx,Xpy);
        return thrust::make_tuple(Xpx,Xpy);
    }

    int* a; // first input array, indexed by idx
    int* b; // second input array, indexed by idx
    int* c; // stored only; not read or written by this functor
    int* d; // stored only; not read or written by this functor
};
// Functor applied once per index by the outer thrust::transform in main().
// Each call wraps the four raw arrays in Thrust iterators, runs a nested
// thrust::transform over all N_ elements that writes into cv_/dv_, prints
// all N_ entries of the four arrays, and returns cv_[idx].
// NOTE(review): every call writes the whole [0, N_) range of cv_ and dv_.
// If the outer transform runs calls for different idx values in parallel,
// those writes overlap and the final contents depend on execution order.
struct bFuntor
{
// Stores the four raw array pointers and the element count by value.
// af is not initialized here.
bFuntor(int* av__, int* bv__, int* cv__, int* dv__, const int& N__) : av_(av__), bv_(bv__), cv_(cv__), dv_(dv__), N_(N__) {};
__host__ __device__
int operator()(const int& idx)
{
// Begin (x_dpt) and one-past-the-end (x_dpt1) device_ptr for each array.
thrust::device_ptr<int> av_dpt = thrust::device_pointer_cast(av_);
thrust::device_ptr<int> av_dpt1 = thrust::device_pointer_cast(av_+N_);
thrust::device_ptr<int> bv_dpt = thrust::device_pointer_cast(bv_);
thrust::device_ptr<int> bv_dpt1 = thrust::device_pointer_cast(bv_+N_);
thrust::device_ptr<int> cv_dpt = thrust::device_pointer_cast(cv_);
thrust::device_ptr<int> cv_dpt1 = thrust::device_pointer_cast(cv_+N_);
thrust::device_ptr<int> dv_dpt = thrust::device_pointer_cast(dv_);
thrust::device_ptr<int> dv_dpt1 = thrust::device_pointer_cast(dv_+N_);
// The same begin/end positions wrapped as normal_iterator so they can be zipped.
// NOTE(review): thrust::detail looks like an internal namespace; confirm it is
// safe to depend on it.
thrust::detail::normal_iterator<thrust::device_ptr<int>> a0 = thrust::detail::make_normal_iterator<thrust::device_ptr<int>>(av_dpt);
thrust::detail::normal_iterator<thrust::device_ptr<int>> a1 = thrust::detail::make_normal_iterator<thrust::device_ptr<int>>(av_dpt1);
thrust::detail::normal_iterator<thrust::device_ptr<int>> b0 = thrust::detail::make_normal_iterator<thrust::device_ptr<int>>(bv_dpt);
thrust::detail::normal_iterator<thrust::device_ptr<int>> b1 = thrust::detail::make_normal_iterator<thrust::device_ptr<int>>(bv_dpt1);
thrust::detail::normal_iterator<thrust::device_ptr<int>> c0 = thrust::detail::make_normal_iterator<thrust::device_ptr<int>>(cv_dpt);
thrust::detail::normal_iterator<thrust::device_ptr<int>> c1 = thrust::detail::make_normal_iterator<thrust::device_ptr<int>>(cv_dpt1);
thrust::detail::normal_iterator<thrust::device_ptr<int>> d0 = thrust::detail::make_normal_iterator<thrust::device_ptr<int>>(dv_dpt);
thrust::detail::normal_iterator<thrust::device_ptr<int>> d1 = thrust::detail::make_normal_iterator<thrust::device_ptr<int>>(dv_dpt1);
// ** alter_tuple is WRONG
// The macro below selects which of the two nested transforms is compiled.
#define WRONG
#ifdef WRONG
// Variant 1: transform every (a[i], b[i]) pair with ONE pair of coefficients,
// (cv_[idx], dv_[idx]), fixed for the whole nested call; results go to (c[i], d[i]).
thrust::transform(thrust::device,
thrust::make_zip_iterator(thrust::make_tuple(a0,b0)),
thrust::make_zip_iterator(thrust::make_tuple(a1,b1)),
// thrust::make_zip_iterator(thrust::make_tuple(cv_dpt,dv_dpt)), // cv_dpt
thrust::make_zip_iterator(thrust::make_tuple(c0,d0)), // cv_dpt
alter_tuple(cv_[idx],dv_[idx]));
#endif
#ifdef RIGHT
// ** alter_tuple_arr is CORRECT way to do it
// Variant 2: iterate over indices 0..N_-1; the functor reads a[i] and b[i]
// itself, so the values it uses vary per element. Results go to (c[i], d[i]).
thrust::transform(thrust::device,
thrust::counting_iterator<int>(0),
thrust::counting_iterator<int>(N_),
// thrust::make_zip_iterator(thrust::make_tuple(cv_dpt,dv_dpt)), // cv_dpt
thrust::make_zip_iterator(thrust::make_tuple(c0,d0)), // cv_dpt
alter_tuple_arr(av_,bv_,cv_,dv_));
#endif
// Dump all four arrays as seen by this call after the nested transform.
for (int i=0; i<N_; i++)
printf("out: (%d,%d) -> (%d,%d)\n",av_[i],bv_[i],cv_[i],dv_[i]);
// The returned value is what the outer transform stores in its output range.
return cv_dpt[idx];
}
int* av_; // raw pointer to the first input array
int* bv_; // raw pointer to the second input array
int* cv_; // raw pointer to the first output array (also read in the WRONG variant)
int* dv_; // raw pointer to the second output array (also read in the WRONG variant)
int N_; // number of elements in each array
float af; // never initialized by the constructor and not read in this struct
};
// Integer mixing function: scrambles the bits of `a` through six fixed
// add/xor/shift rounds and returns the result. All arithmetic is done in
// unsigned int, so overflow wraps around.
__host__ __device__
unsigned int hash(unsigned int a)
{
    unsigned int mixed = a;

    mixed = (mixed + 0x7ed55d16) + (mixed << 12);
    mixed = (mixed ^ 0xc761c23c) ^ (mixed >> 19);
    mixed = (mixed + 0x165667b1) + (mixed << 5);
    mixed = (mixed + 0xd3a2646c) ^ (mixed << 9);
    mixed = (mixed + 0xfd7046c5) + (mixed << 3);
    mixed = (mixed ^ 0xb55a4f09) ^ (mixed >> 16);

    return mixed;
}
// Test driver: fills four host vectors with pseudo-random integers, copies
// them to the device, applies bFuntor once per index via thrust::transform
// (results stored into cv_d), then copies bv_d back to a host vector.
int main(void)
{
    const int N = 10;

    // Fixed seed, so every run draws the same sequence.
    thrust::default_random_engine rng(hash(10));
    thrust::uniform_real_distribution<float> u01(0,10);

    std::vector<int> av, bv, cv, dv;
    for (int i = 0; i < N; ++i) {
        // The draw order a, b, c, d must be kept to reproduce the same values.
        av.push_back(static_cast<int>(u01(rng)));
        bv.push_back(static_cast<int>(u01(rng)));
        cv.push_back(static_cast<int>(u01(rng)));
        dv.push_back(static_cast<int>(u01(rng)));
    }

    // Device copies of the four host vectors.
    thrust::device_vector<int> av_d(av.begin(), av.end());
    thrust::device_vector<int> bv_d(bv.begin(), bv.end());
    thrust::device_vector<int> cv_d(cv.begin(), cv.end());
    thrust::device_vector<int> dv_d(dv.begin(), dv.end());

    // bFuntor works on raw pointers.
    int* const a_raw = thrust::raw_pointer_cast(av_d.data());
    int* const b_raw = thrust::raw_pointer_cast(bv_d.data());
    int* const c_raw = thrust::raw_pointer_cast(cv_d.data());
    int* const d_raw = thrust::raw_pointer_cast(dv_d.data());

    // One bFuntor call per index in [0, N); each return value lands in cv_d.
    thrust::transform(thrust::counting_iterator<int>(0),
                      thrust::counting_iterator<int>(N),
                      cv_d.begin(),
                      bFuntor(a_raw, b_raw, c_raw, d_raw, N));

    // Copy bv_d back to the host.
    thrust::host_vector<int> bv_h(N);
    thrust::copy(bv_d.begin(), bv_d.end(), bv_h.begin());
    return 0;
}
In these nested Thrust calls, two nested functors were tested; one of them worked (the one enabled with "#define RIGHT"). In the case of the WRONG functor, i.e. alter_tuple:
Where do the two variables int a and int b reside: on the host, on the device, or in thread-local registers? Or are they shared between the threads that execute this functor's operator()?
Inside, the alter_tuple functor, I tried to print out the result (int printf("in...")) and this is correct calculation. However, when this result is returned to caller functor and is printed out (in printf("out....")), they are incorrect and are different with its previous calculation
How can these results be different? I can't explain it, and I could not find any documentation or example to refer to.
this difference is shown in output here
Edit 1:
A minimal test (SO_example_no_tuple_arr_wo_c.cu) shows that the functors (literally a*x = y) receive and initialize their values correctly in both cases.
print out is:
out: 9*8 -> 72
out: 9*8 -> 72
out: 9*8 -> 72
out: 6*4 -> 24
out: 6*4 -> 24
out: 6*4 -> 24
out: 1*8 -> 8
out: 1*8 -> 8
out: 1*6 -> 6
out: 9*1 -> 9
out: 9*1 -> 9
which shows the correct received values
A minimal test that does not use pointers/arrays to pass the input values (SO_example_no_tuple.cu) shows that, even though the input values are initialized correctly, the returned results are wrong.
its output in case N=2:
in 9*8 -> 72
in 6*4 -> 24
in 9*8 -> 72
in 6*4 -> 24
out: 9*8 -> 24
out: 9*8 -> 24
out: 6*4 -> 24
out: 6*4 -> 24

The difference in values is not strictly due to a data race problem.
Your two approaches do not do the same thing, and it has to do with the values of a and b that will be selected for each invocation of the nested thrust::transform call. This is evident if you set N = 1, which should remove any concerns about data racing. The results are still different.
In the "failing" case, you are invoking the alter_tuple() operator like so:
thrust::transform(thrust::device,
...
alter_tuple(cv_[idx],dv_[idx]));
These values (cv_[idx], dv_[idx]) then become your initializing parameters ending up in a and b variables inside the functor. But your "passing" case is effectively initializing these variables differently, using a[idx] and b[idx], which correspond to av_[idx] and bv_[idx]. If we change the alter_tuple invocation to use a and b:
alter_tuple(av_[idx],bv_[idx]));
then the N = 1 case results now match. This was easier to understand because we had in fact only one entry in the a, b, c, d vectors.
When we expand to the N = 10 case, however, we no longer get matching results. To explain why, we need to understand the use of a and b inside the functor in this case. In the "failing" case, we are passing a single initializing value for each of a and b as used in the functor:
alter_tuple(av_[idx],bv_[idx]));
so, for a given thread, which means for a given invocation of the nested thrust::transform call, a single value will be used for a and b:
alter_tuple(const int& a_, const int& b_) : a(a_), b(b_){};
...
int a; // these values are constant across variation of "idx"
int b; // passed to the functor
on the other hand, in the "passing" case, the a and b values will vary for each element passed to the functor, within the nested transform call:
thrust::tuple<int,int> operator()(const int& idx)
{
int Xx = a[idx]; // these values of a and b *vary* for each idx
int Xy = b[idx]; // passed to the functor
Once that is understood, if the "passing" case is the desired case, then I have no idea how to transform the first case to produce passing results, as there is no way you can cause a single initializing value to take on the behavior of the varying values for a and b in the "passing" case.
None of the above involves data racing, but since each of your operations (i.e. each thread) is writing to every value of c and d, I don't think this overall approach makes any sense, and I'm not sure what you are trying to accomplish. I think if you expanded this to more elements/threads, then you could certainly experience unpredictable/variable results.
To answer some of your other questions, the variables a and b end up as thread-local variables, on the device. So each data member in either functor is a thread-local variable on the device.
Inside, the alter_tuple functor, I tried to print out the result (int printf("in...")) and this is correct calculation. However, when this result is returned to caller functor and is printed out (in printf("out....")), they are incorrect and are different with its previous calculation
Each thread is writing to the same locations in the c and d vectors. Therefore, since each thread writes to the entire vector, but (in the failing case) each thread uses a different initializing value for a and b inside the functor, it stands to reason that each thread will compute a different result for the values of c and d, and the results you get after completion of the thrust call will depend on which thread "wins" the output write operation. This is unpredictable, and certainly not every thread's printout will match the final result, because each thread will compute different values for c and d.

Related

How to reuse functors with member data over many kernel executions in CUDA to improve memory usage and decrease copy time?

I am translating a c++11 program which calculates contact forces between particle pairs into a cuda program. All the particle pairs are independent from each other. I use a functor to calculate the contact force. This functor does many computations and contains a lot of member variables. Therefore I am trying to reuse the functors, instead of making one new functor per particle pair.
Because the functor contains virtual functions, the functor cloning is done on the device instead of on the host.
I am thinking of a scheme which goes like this:
1) Clone M functors
2) Start computing M particle pairs
3) Particle pair M+1 waits until one particle pair has completed and then reuses its functor
However, other ideas are also very welcome.
I've made a very simplified version of the program. In this play program, the F variable does not have to be a member variable, but in the real program it needs to be. There is also a lot more member data and particle pairs (N) in the real program. N is often a few million.
#include <stdio.h>
// Compile-time configuration for the toy example.
#define TPB 4 // threads per block used in the kernel launches; realistic value = 128
#define N 10 // number of particle pairs; realistic value = 5000000
#define M 5 // number of functor clones for option 3: trade-off between copy time and parallel gain.
// Realistic value is probably somewhere around 1000
#define OPTION 1
// OPTION selects how many functor clones are created and how they are used:
// option 1: Make one functor per particle pair => works, but creates too many functor clones
// option 2: Only make one functor clone => member variables are no longer independent per thread
// option 3: Make M clones which get reused => my suggestion, but I don't know how to program it
// Abstract device-side interface shared by all functors. Concrete functors
// define what execute() computes for index i and what show() prints.
class FtorBase
{
public:
    // Performs the functor's computation for index i.
    __device__ virtual void execute(long i) = 0;

    // Prints the functor's current result.
    __device__ virtual void show() = 0;
};
// Concrete functor: execute(i) stores a * i in the member F, and show()
// prints F. Both a and F are per-object state.
struct FtorA : public FtorBase
{
    // Computes a * i and keeps it in F until show() is called.
    __device__ void execute(long i) final
    {
        const double scaled = a * i;
        F = scaled;
    }

    // Prints the value most recently stored in F.
    __device__ void show() final
    {
        printf("F = %f\n", F);
    }

    double a; // scale factor, set by the caller before cloning
    double F; // result written by execute()
};
// Kernel that creates functor clones on the device: each thread whose global
// index is below n_ftorClones copy-constructs one T from `ftor` with
// device-side `new` and stores the pointer in d_ftorBase[index].
// Threads with an index at or beyond n_ftorClones do nothing.
template <class T>
__global__ void cloneFtor(FtorBase** d_ftorBase, T ftor, long n_ftorClones)
{
    const long slot = threadIdx.x + blockIdx.x * blockDim.x;
    if (slot < n_ftorClones) {
        d_ftorBase[slot] = new T(ftor);
    }
}
// Owns a host-side prototype functor (ftor_) and builds the device-side
// table of functor clones derived from it.
struct ClassA
{
    typedef FtorA ftor_t;

    // Allocates a device array of N FtorBase* slots and fills it with clones
    // of ftor_ according to OPTION (N clones, one clone, or M clones).
    // Slots that receive no clone are null.
    // Returns the device pointer to the table, or nullptr if allocation,
    // the clone kernel launch, or its execution failed (the error is
    // reported on stderr).
    FtorBase** getFtor()
    {
        const size_t tableBytes = N * sizeof(FtorBase*);

        // Start from nullptr so a failed cudaMalloc cannot leak a garbage
        // pointer to the caller.
        FtorBase** d_cmFtorBase = nullptr;
        cudaError_t err = cudaMalloc(&d_cmFtorBase, tableBytes);
        if (err != cudaSuccess) {
            fprintf(stderr, "getFtor: cudaMalloc failed: %s\n", cudaGetErrorString(err));
            return nullptr;
        }

        // Options 2 and 3 fill only the first 1 or M slots; zero the table so
        // the remaining slots are null instead of uninitialized.
        err = cudaMemset(d_cmFtorBase, 0, tableBytes);
        if (err != cudaSuccess) {
            fprintf(stderr, "getFtor: cudaMemset failed: %s\n", cudaGetErrorString(err));
            cudaFree(d_cmFtorBase);
            return nullptr;
        }

#if OPTION == 1
        // option 1: Create one copy of the functor per particle pair
        printf("using option 1\n");
        cloneFtor<<<(N + TPB - 1) / TPB, TPB>>>(d_cmFtorBase, ftor_, N);
#elif OPTION == 2
        // option 2: Create just one copy of the functor
        printf("using option 2\n");
        cloneFtor<<<1, 1>>>(d_cmFtorBase, ftor_, 1);
#elif OPTION == 3
        // option 3: Create M functor clones
        printf("using option 3\n");
        printf("This option is not implemented. I don't know how to do this.\n");
        cloneFtor<<<(M + TPB - 1) / TPB, TPB>>>(d_cmFtorBase, ftor_, M);
#endif

        // A kernel launch reports no error by itself: check the launch first,
        // then wait for the kernel and check its execution status.
        err = cudaGetLastError();
        if (err == cudaSuccess) {
            err = cudaDeviceSynchronize();
        }
        if (err != cudaSuccess) {
            fprintf(stderr, "getFtor: cloneFtor failed: %s\n", cudaGetErrorString(err));
            cudaFree(d_cmFtorBase);
            return nullptr;
        }
        return d_cmFtorBase;
    }

    ftor_t ftor_; // prototype copied into every device-side clone
};
// Kernel that runs one functor call per particle pair. Each thread with a
// global index below N calls execute() and then show() on a functor chosen
// according to OPTION; threads at or beyond N do nothing.
__global__ void cudaExecuteFtor(FtorBase** ftorBase)
{
    const long pairIdx = threadIdx.x + blockIdx.x * blockDim.x;
    if (pairIdx < N) {
#if OPTION == 1
        // option 1: One functor per particle was created
        FtorBase* const ownFtor = ftorBase[pairIdx];
        ownFtor->execute(pairIdx);
        ownFtor->show();
#elif OPTION == 2
        // option 2: Only one single functor was created; every thread uses it
        FtorBase* const onlyFtor = ftorBase[0];
        onlyFtor->execute(pairIdx);
        onlyFtor->show();
#elif OPTION == 3
        // option 3: Reuse the functors
        // I don't know how to do this
#endif
    }
}
// Builds the functor table, runs cudaExecuteFtor over all N particle pairs
// and reports any CUDA error. Returns 0 on success and 1 on failure.
int main()
{
    ClassA* classA = new ClassA();
    classA->ftor_.a = .1;

    int status = 0;
    FtorBase** ftorBase = classA->getFtor();
    if (ftorBase == nullptr) {
        fprintf(stderr, "main: functor table could not be created\n");
        status = 1;
    } else {
        cudaExecuteFtor<<<(N + TPB - 1) / TPB, TPB>>>(ftorBase);

        // Launch errors show up in cudaGetLastError(); errors raised while
        // the kernel runs show up at the synchronizing call.
        cudaError_t err = cudaGetLastError();
        if (err == cudaSuccess) {
            err = cudaDeviceSynchronize();
        }
        if (err != cudaSuccess) {
            fprintf(stderr, "main: cudaExecuteFtor failed: %s\n", cudaGetErrorString(err));
            status = 1;
        }

        // Releases the pointer table only. NOTE: the clones created with
        // device-side `new` are not deleted individually here.
        cudaFree(ftorBase);
    }

    delete classA;
    return status;
}
I am checking the output of F to see whether the member variable is independent in each call. As expected, when using a different functor for each particle pair (option 1), all the F values are different and when using only one functor for the whole program (option 2), all the F values are the same.
using option 1
F = 0.800000
F = 0.900000
F = 0.000000
F = 0.100000
F = 0.200000
F = 0.300000
F = 0.400000
F = 0.500000
F = 0.600000
F = 0.700000
using option 2
F = 0.700000
F = 0.700000
F = 0.700000
F = 0.700000
F = 0.700000
F = 0.700000
F = 0.700000
F = 0.700000
F = 0.700000
F = 0.700000
I wonder if there is a way to get all different F values in this play example without taking N copies (option 3).
PS: I am using Ubuntu 18.04, nvcc 9.1 and an NVIDIA GeForce GTX 1060 Mobile graphics card (CUDA compute capability 6.1).
UPDATE:
In the code I presented previously, the problem only showed up in debug mode (compilation with the -G flag) but not in the release version. I'm guessing that the compiler optimised printf("F = %f\n", F); to printf("F = %f\n", a*i); so that the problem of thread-dependent member variables, which is what this question is about, disappeared.
I updated the code, so the compiler cannot do the substitution in the printf anymore.

CUDA cudaMemcpyFromSymbol "invalid device symbol" error?

So I see a parent question about how to copy from host to the constant memory on GPU using cudaMemcpyToSymbol.
My question is how to do the reverse, copying from device constant memory to the host using cudaMemcpyFromSymbol.
In the following minimal reproducible example, I either got
1) invalid device symbol error using cudaMemcpyFromSymbol(const_d_a, b, size);, or
2) got segmentation fault if I use cudaMemcpyFromSymbol(&b, const_d_a, size, cudaMemcpyDeviceToHost).
I have consulted with the manual which suggests I code as in 1), and this SO question that suggests I code as in 2). Neither of them work here.
Could anyone kindly help suggesting a workaround with this? I must be understanding something improperly... Thanks!
Here is the code:
// a basic CUDA function to test working with device constant memory
#include <stdio.h>
#include <cuda.h>
const unsigned int N = 10; // size of vectors
// Constant-memory buffer for N floats. The element count is N; multiplying by
// sizeof(float) here would reserve four times more constant memory than the
// N * sizeof(float) bytes that are copied in and out.
__constant__ float const_d_a[N];
// Round-trip test for constant memory: fills host vector a, copies it to the
// __constant__ symbol const_d_a, copies the symbol back into host vector b,
// and prints a checksum of each vector (they should match).
int main()
{
    float * a, * b; // a: source vector; b: receives the copy read back from constant memory
    a = (float *)calloc(N, sizeof(float));
    b = (float *)calloc(N, sizeof(float));
    if (a == NULL || b == NULL){
        printf("host allocation failed in %s at line %d\n", __FILE__, __LINE__);
        free(a);
        free(b);
        exit(EXIT_FAILURE);
    }
    /**************************** Exp 1: sequential ***************************/
    unsigned int i; // unsigned to match N
    const size_t size = N * sizeof(float); // number of bytes transferred in each direction
    for (i = 0; i < N; i++){
        a[i] = (float)i / 0.23 + 1;
    }
    // 1. copy a to constant memory (destination symbol first, host source second)
    cudaError_t err = cudaMemcpyToSymbol(const_d_a, a, size);
    if (err != cudaSuccess){
        printf("%s in %s at line %d\n", cudaGetErrorString(err), __FILE__, __LINE__);
        exit(EXIT_FAILURE);
    }
    // 2. copy constant memory back to b. As with memcpy, the destination (the
    //    host pointer) comes first and the source (the symbol) second.
    cudaError_t err2 = cudaMemcpyFromSymbol(b, const_d_a, size);
    if (err2 != cudaSuccess){
        printf("%s in %s at line %d\n", cudaGetErrorString(err2), __FILE__, __LINE__);
        exit(EXIT_FAILURE);
    }
    // The accumulators must start at zero; otherwise the sums are garbage.
    double checksum0 = 0.0, checksum1 = 0.0;
    for (i = 0; i < N; i++){
        checksum0 += a[i];
        checksum1 += b[i];
    }
    printf("Checksum for elements in host memory is %f\n.", checksum0);
    printf("Checksum for elements in constant memory is %f\n.", checksum1);
    free(a);
    free(b);
    return 0;
}
In CUDA, the various cudaMemcpy* operations are modeled after the C standard library memcpy routine. In that function, the first pointer is always the destination pointer and the second pointer is always the source pointer. That is true for all cudaMemcpy* functions as well.
Therefore, if you want to do cudaMemcpyToSymbol, the symbol had better be the first (destination) argument passed to the function (the second argument would be a host pointer). If you want to do cudaMemcpyFromSymbol, the symbol needs to be the second argument (the source position), and the host pointer is the first argument. That's not what you have here:
cudaError_t err2 = cudaMemcpyFromSymbol(const_d_a, b, size);
^ ^
| This should be the symbol.
|
This is supposed to be the host destination pointer.
You can discover this with a review of the API documentation.
If we reverse the order of those two arguments in that line of code:
cudaError_t err2 = cudaMemcpyFromSymbol(b, const_d_a, size);
Your code will run with no errors and the final results printed will match.
There is no need to use an ampersand with either of the a or b pointers in these functions. a and b are already pointers. In the example you linked, pi_gpu_h is not a pointer. It is an ordinary variable. To copy something to it using cudaMemcpyFromSymbol, it is necessary to take the address of that ordinary variable, because the function expects a (destination) pointer.
As an aside, this doesn't look right:
__constant__ float const_d_a[N * sizeof(float)];
This is effectively a static array declaration, and apart from the __constant__ decorator it should be done equivalently to how you would do it in C or C++. It's not necessary to multiply N by sizeof(float) here, if you want storage for N float quantities. Just N by itself will do that:
__constant__ float const_d_a[N];
however leaving that as-is does not create problems for the code you have posted.

Efficient bitstream convolution

I have two floating point time series A, B of length N each. I have to calculate the circular convolution and find maximum value. The classic and fastest way of doing this is
C = iFFT(FFT(A) * FFT(B))
Now, let's suppose that both A and B is a series which contains only 1s and 0s, so in principle we can represent them as bitstreams.
Question: Is there any faster way of doing the convolution (and find its maximum value) if I am somehow able to make use of the fact above ?
(I have already thought a lot about Walsh-Hadamard transforms, SSE instructions and popcounts, but found no faster way for N > 2**20, which is my case.)
Thanks,
gd
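The 1D (circular) convolution c of two arrays a and b of size n is an array such that:
$$c[i] = \sum_{k=0}^{n-1} a[(k-i) \bmod n]\; b[k], \qquad 0 \le i < n$$
This formula can be rewritten in an iterative way:
$$c[i] = c[i-1] + \sum_{k=0}^{n-1} \bigl(b[k] - b[(k-1) \bmod n]\bigr)\; a[(k-i) \bmod n]$$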
The non-null terms of the sum are limited to the number of changes nb of b : if b is a simple pattern, this sum can be limited to a few terms. An algorithm may now be designed to compute c :
1 : compute c[0] (about n operations)
2 : for 0<i<n compute c[i] using the formula (about nb*n operations)
If nb is small, this method may be faster than fft. Note that it will provide exact results for bitstream signals, while the fft needs oversampling and floating point precision to deliver accurate results.
Here is a piece of code implementing this trick with input type unsigned char.
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <time.h>
#include <fftw3.h>
// Maximum number of value changes a pattern can record.
enum { PATTERN_CAPACITY = 1000 };

// Sparse description of an array b viewed circularly: the positions where
// b[i] differs from its predecessor (b[n-1] is the predecessor of b[0]) and
// the signed size of each change.
typedef struct{
    unsigned int nbchange;                 // number of valid entries in index[] / change[]
    unsigned int index[PATTERN_CAPACITY];  // position i of each change
    int change[PATTERN_CAPACITY];          // b[i] minus the previous element at that position
}pattern;

// Fills *bp with the list of changes found in b[0..n-1].
//   n  : number of elements in b; n == 0 yields an empty pattern
//   b  : input array
//   bp : output pattern, fully overwritten
// At most PATTERN_CAPACITY changes are recorded. Once the table is full the
// scan stops and later changes are dropped, so a result with
// nbchange == PATTERN_CAPACITY may be incomplete.
void topattern(unsigned int n, unsigned char* b,pattern* bp){
    unsigned int i;
    unsigned char former;

    //initialisation
    bp->nbchange=0;
    if(n==0){
        return; // nothing to scan; also avoids reading b[n-1]
    }

    // The array is circular, so the element before b[0] is b[n-1].
    former=b[n-1];
    for(i=0;i<n;i++){
        if(b[i]!=former){
            if(bp->nbchange>=PATTERN_CAPACITY){
                break; // table full: stop instead of writing past the arrays
            }
            bp->index[bp->nbchange]=i;
            bp->change[bp->nbchange]=((int)b[i])-former;
            bp->nbchange++;
        }
        former=b[i];
    }
}
// Prints every recorded change of *bp, one "index ... change ..." line each,
// after a "pattern :" header line.
void printpattern(pattern* bp){
    unsigned int i; // unsigned to match bp->nbchange
    printf("pattern :\n");
    for(i=0;i<bp->nbchange;i++){
        // index[] is unsigned, so it is printed with %u; change[] is a signed int.
        printf("index %u change %d\n",bp->index[i],bp->change[i]);
    }
}
//https://stackoverflow.com/questions/109023/how-to-count-the-number-of-set-bits-in-a-32-bit-integer
// Returns the number of 1 bits in i using successive partial sums on
// 2-bit, 4-bit and 8-bit fields (the constants cover 32 bits).
unsigned int NumberOfSetBits(unsigned int i)
{
    // Each 2-bit field now holds the count of its own two bits.
    unsigned int pairs = i - ((i >> 1) & 0x55555555);
    // Each 4-bit field holds the count of its four bits.
    unsigned int nibbles = (pairs & 0x33333333) + ((pairs >> 2) & 0x33333333);
    // Each byte holds the count of its eight bits.
    unsigned int bytes = (nibbles + (nibbles >> 4)) & 0x0F0F0F0F;
    // The multiplication adds the four byte counts into the top byte.
    return (bytes * 0x01010101) >> 24;
}
//https://stackoverflow.com/questions/2525310/how-to-define-and-work-with-an-array-of-bits-in-c
// Returns the number of bit positions that are set in both a and b.
unsigned int convol_longint(unsigned int a, unsigned int b){
    const unsigned int common = a & b;
    return NumberOfSetBits(common);
}
// Benchmark driver. Builds a random byte array a and a rectangular pattern b,
// computes the circular convolution c with the incremental "changes of b"
// method, reports its maximum and the time taken, then times an FFTW
// real-to-complex transform of the same length for comparison.
int main(int argc, char* argv[]) {
    (void)argc; // command-line arguments are not used
    (void)argv;

    unsigned int n=10000000;
    unsigned int i,j;
    clock_t begin, end;
    double time_spent;

    //the array a
    unsigned char* a=malloc(n*sizeof(unsigned char));
    if(a==NULL){printf("malloc failed\n");exit(1);}
    for(i=0;i<n;i++){
        a[i]=rand();
    }
    memset(&a[2],5,2);
    memset(&a[10002],255,20);

    //pattern b
    unsigned char* b=malloc(n*sizeof(unsigned char));
    if(b==NULL){printf("malloc failed\n");exit(1);}
    memset(b,0,n*sizeof(unsigned char));
    memset(&b[2],1,20);
    //memset(&b[120],1,10);
    //memset(&b[200],1,10);

    //the result c
    int* c=malloc(n*sizeof(int));
    if(c==NULL){printf("malloc failed\n");exit(1);}
    memset(c,0,n*sizeof(int));

    begin = clock();
    //computing c[0] directly (about n operations)
    for(i=0;i<n;i++){
        c[0]+= ((int)a[i])*((int)b[i]);
    }
    printf("c[0] %d\n",c[0]);

    //need to store b as a pattern.
    pattern bpat;
    topattern( n,b,&bpat);
    printpattern(&bpat);
    if(bpat.nbchange>=sizeof(bpat.index)/sizeof(bpat.index[0])){
        //the change table is full: b has at least as many changes as it can hold
        printf("warning: pattern table is full, results may be wrong\n");
    }

    //computing c[i] from c[i-1]: only the positions where b changes contribute
    for(i=1;i<n;i++){
        c[i]=c[i-1];
        for(j=0;j<bpat.nbchange;j++){
            //unsigned arithmetic: index-i may wrap, adding n brings it back into range
            c[i]+=bpat.change[j]*((int)a[(bpat.index[j]-i+n)%n]);
        }
    }

    //finding max
    int currmax=c[0];
    unsigned int currindex=0;
    for(i=1;i<n;i++){
        if(c[i]>currmax){
            currmax=c[i];
            currindex=i;
        }
    }
    printf("c[max] is %d at index %u\n",currmax,currindex);
    end = clock();
    time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
    printf("computation took %lf seconds\n",time_spent);

    //reference timing: one real-to-complex FFT of length n
    double* dp = malloc(sizeof (double) * n);
    fftw_complex * cp = fftw_malloc(sizeof (fftw_complex) * (n/2+1));
    if(dp==NULL || cp==NULL){printf("malloc failed\n");exit(1);}
    //give the transform defined input instead of uninitialized memory
    for(i=0;i<n;i++){
        dp[i]=a[i];
    }

    //the timed region covers both planning and execution of the transform
    begin = clock();
    fftw_plan plan = fftw_plan_dft_r2c_1d(n, dp, cp, FFTW_ESTIMATE);
    if(plan==NULL){printf("fftw plan failed\n");exit(1);}
    fftw_execute ( plan );
    end = clock();
    time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
    printf("fftw took %lf seconds\n",time_spent);

    fftw_destroy_plan(plan);
    free(dp);
    fftw_free(cp); //memory obtained from fftw_malloc is released with fftw_free
    free(a);
    free(b);
    free(c);
    return 0;
}
To compile : gcc main.c -o main -lfftw3 -lm
For n=10 000 000 and nb=2 (b is just a "rectangular 1D window") this algorithm run in 0.65 seconds on my computer. A double-precision fft using fftw took approximately the same time. This comparison, like most of comparisons, may be unfair since :
nb=2 is the best case for the algorithm presented in this answer.
The fft-based algorithm would have needed oversampling.
Double precision may not be required for the fft-based algorithm.
The implementation exposed here is not optimized. It is just basic code.
This implementation can handle n=100 000 000. At this point, using long int for c could be advised to avoid any risk of overflow.
If the signals are bitstreams, this program may be optimized in various ways. For bitwise operations, look at this question and this one.

CUDA Dynamic Parallelism; stream synchronization from device

I am basically looking for a way to synchronize a stream from within the device. I want to avoid using cudaDeviceSynchronize(), as it would serialize execution of my kernel that I want to execute concurrently using streams;
More detailed description: I have written a kernel that is a stabilized bi-conjugate gradient solver. I want to launch this kernel concurrently on different data using streams.
This kernel uses cublas functions. They are called from within the kernel.
One of the operations required by the solver is the calculation of a dot product of two vectors. This can be done with cublasdot(). But as this call is synchronous, the execution of kernels in different streams gets serialized. Instead of calling a dot product function, I calculate the dot product using a matrix-vector product (cublasSgemv(), as in the code below), which is called asynchronously. The problem is that this function returns before the result is calculated. I therefore want to synchronize the stream from the device - I am looking for an equivalent of cudaStreamSynchronize() that is callable from the device.
// Computes the dot product of x and y (n elements each) by issuing a
// 1 x n matrix-vector product through cuBLAS and returns the value read from
// the result buffer right after the call.
// NOTE: the result buffer is allocated with `new` and is not freed here.
// NOTE: the value is read immediately after cublasSgemv_v2 returns, with no
// synchronization in between.
__device__ float _cDdot(cublasHandle_t & cublasHandle, const int n, real_t * x, real_t * y) {
    float *norm = new float;
    float alpha = 1.0f;
    float beta = 0.0f;
    cublasSgemv_v2(cublasHandle, CUBLAS_OP_N, 1, n,
                   &alpha, x, 1,
                   y, 1,
                   &beta, norm, 1);
    const float result = *norm;
    return result;
}
What can I do to make sure, that the result is calculated before the function returns? Of course insertion of cudaDeviceSynchronize() works, but as I mentioned, it serializes the execution of my kernel across streams.
Probably if you read the programming guide dynamic parallelism section carefully (especially streams, events, and synchronization), you may get some ideas. Here's what I came up with:
There is an implicit NULL stream (on the device) associated with the execution sequence that calls your _cDdot function (oddly named, IMHO, since you're working with float quantities in that case, i.e. using Sgemv). Therefore, any cuda kernel or API call issued after the call to cublasSgemv_v2 in your function should wait until any cuda activity associated with the cublasSgemv_v2 function is complete. If you insert an innocuous cuda API call, or else a dummy kernel call, after the call to cublasSgemv_v2, it should wait for that to be complete. This should give you the thread-level synchronization you are after. You might also be able to use a cudaEventRecord call followed by a cudaStreamWaitEvent call.
Here's an example to show the implicit stream synchronization approach:
#include <stdio.h>
#include <cublas_v2.h>
#define SZ 16
// Minimal kernel that copies one float from *in to *out. It is launched
// after the cuBLAS call only so that later work is ordered behind it.
__global__ void dummy_kernel(float *in, float *out){
    out[0] = in[0];
}
// Computes the dot product of x and y (n elements each) by issuing a
// 1 x n matrix-vector product through cuBLAS.
//   wait : if non-zero, a dummy child kernel is launched after the cuBLAS
//          call and before the result is read; if zero, the result buffer is
//          read straight away.
// Returns the value found in the result buffer at that moment. The buffer is
// zeroed before the call.
// NOTE: the result buffer is allocated with `new` and is not freed here.
__device__ float _cDdot(cublasHandle_t & cublasHandle, const int n, float * x, float * y, const int wait) {
    float *norm = new float;
    *norm = 0.0f;

    float alpha = 1.0f;
    float beta = 0.0f;
    cublasSgemv_v2(cublasHandle, CUBLAS_OP_N, 1, n,
                   &alpha, x, 1,
                   y, 1,
                   &beta, norm, 1);

    if (wait != 0) {
        dummy_kernel<<<1,1>>>(norm, norm);
    }

    const float result = *norm;
    return result;
}
// Test kernel: creates a cuBLAS handle, builds two SZ-element vectors of
// ones, and prints the dot product returned by _cDdot first without and then
// with the "wait" step.
// NOTE: x, y and the handle are not released here.
__global__ void compute(){
    cublasHandle_t my_h;
    const cublasStatus_t status = cublasCreate(&my_h);
    if (status != CUBLAS_STATUS_SUCCESS) printf("cublasCreate fail\n");

    float *x = new float[SZ];
    float *y = new float[SZ];
    for (int k = 0; k < SZ; ++k) {
        x[k] = 1.0f;
        y[k] = 1.0f;
    }

    const float noWait = _cDdot(my_h, SZ, x, y, 0);
    printf("result with no wait = %f\n", noWait);

    const float withWait = _cDdot(my_h, SZ, x, y, 1);
    printf("result with wait = %f\n", withWait);
}
// Launches the compute kernel with a single thread and waits for it.
// Returns 0 on success and 1 if the launch or the kernel execution failed
// (the CUDA error is printed on stderr).
int main(){
    compute<<<1,1>>>();

    // A launch reports no error by itself: check the launch first, then
    // synchronize to collect errors raised while the kernel was running.
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess) {
        err = cudaDeviceSynchronize();
    }
    if (err != cudaSuccess) {
        fprintf(stderr, "compute kernel failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    return 0;
}
compile with:
nvcc -arch=sm_35 -rdc=true -o t302 t302.cu -lcudadevrt -lcublas -lcublas_device
results:
$ ./t302
result with no wait = 0.000000
result with wait = 16.000000
$
Unfortunately, I also tried a completely empty dummy_kernel; that did not work unless I compiled with -G. So the compiler may be smart enough to optimize out a completely empty child kernel call.

CUDA kernel call in a simple sample

This is the first parallel code example in "CUDA by Example".
Can anyone explain the kernel call <<< N , 1 >>> to me?
This is the code with important points :
#define N 10
// Element-wise vector addition: c[k] = a[k] + b[k].
// The element index is taken from blockIdx.x only, so each block handles one
// element (the kernel is launched as <<<N,1>>>). Blocks whose index is N or
// larger do nothing.
__global__ void add( int *a, int *b, int *c ) {
    const int tid = blockIdx.x;   // element handled by this block
    if (tid >= N) {
        return;
    }
    c[tid] = a[tid] + b[tid];
}
// Host-side outline of the vector-add example. Only the declarations and the
// kernel launch are shown; each comment below stands for a step that is
// elided in this excerpt.
int main( void ) {
// Host arrays: inputs a and b, result c.
int a[N], b[N], c[N];
// Device pointers for the three arrays.
// NOTE(review): they are declared here but no allocation is visible in this
// excerpt; they must be allocated before the launch below.
int *dev_a, *dev_b, *dev_c;
// allocate the memory on the GPU
// fill the arrays 'a' and 'b' on the CPU
// copy the arrays 'a' and 'b' to the GPU
// Launch N blocks of 1 thread each; the kernel indexes by blockIdx.x.
add<<<N,1>>>( dev_a, dev_b, dev_c );
// copy the array 'c' back from the GPU to the CPU
// display the results
// free the memory allocated on the GPU
return 0;
}
Why does it use <<< N , 1 >>>, which means N blocks with 1 thread in each block? Couldn't we write <<< 1 , N >>> instead and use 1 block with N threads in it, for better optimization?
For this little example, there is no particular reason (as Bart already told you in the comments). But for a larger, more realistic example you should always keep in mind that the number of threads per block is limited. That is, if you use N = 10000, you could not use <<<1,N>>> anymore, but <<<N,1>>> would still work.