CUDA: how to find the first item in an array that makes a function maximal

In CUDA C++, I have a big array Arr of integers and a function F: int -> int. I want to find the smallest index i such that F(Arr[i]) is maximal.
How can I write a kernel that always keeps the maximal value (in F) to compare with others, using atomics to avoid race conditions?
BTW, I wonder if I can use the functions in the Thrust library for this purpose instead.

How can I write a kernel that always keeps the maximal value (in F) to compare with others, using atomics to avoid race conditions?
Based on your description, including usage of int, and a desire to use atomics, I would suggest using a custom atomic. This should work for arrays up to 4 billion elements:
$ cat t2154.cu
#include <iostream>
__device__ __host__ int get_int(unsigned long long val){return reinterpret_cast<int *>(&val)[0];}
__device__ __host__ unsigned get_uns(unsigned long long val){return reinterpret_cast<unsigned *>(&val)[1];}
__device__ bool done(int fval, int fval1, unsigned idx, unsigned idx1){
  if (fval > fval1) return true;
  if ((fval == fval1) && (idx <= idx1)) return true;
  return false;
}
__device__ unsigned long long my_custom_atomic(unsigned long long *addr, int fval, unsigned idx){
  unsigned long long old = *addr;
  while (!done(get_int(old), fval, get_uns(old), idx))
    // cast fval through unsigned so a negative value does not sign-extend
    // into (and clobber) the index in the upper 32 bits
    old = atomicCAS(addr, old, ((((unsigned long long)idx)<<32)|(unsigned)fval));
  return old;
}
const int minidx = 256;
__device__ int f(int t){ return minidx - (t-minidx)*(t-minidx);}
__global__ void k(int *arr, unsigned long long *min, unsigned N){
  unsigned my_idx = blockIdx.x*blockDim.x+threadIdx.x;
  if (my_idx < N){
    int my_val = arr[my_idx];
    my_val = f(my_val);
    my_custom_atomic(min, my_val, my_idx);
  }
}
const unsigned my_N = 32768;
int main(){
  unsigned long long *min;
  cudaMallocManaged(&min, sizeof(min[0]));
  int *arr;
  cudaMallocManaged(&arr, sizeof(arr[0])*my_N);
  for (unsigned i = 0; i < my_N; i++) arr[i] = i;
  *min = 0xFFFFFFFF80000000ULL; // maximum unsigned index combined with minimum int value
  k<<<my_N/256, 256>>>(arr, min, my_N);
  cudaDeviceSynchronize();
  std::cout << " maximum val: " << get_int(*min) << " at index: " << get_uns(*min) << std::endl;
}
$ nvcc -o t2154 t2154.cu
$ compute-sanitizer ./t2154
========= COMPUTE-SANITIZER
maximum val: 256 at index: 256
========= ERROR SUMMARY: 0 errors
$
We are assembling and disassembling the 64-bit quantity as needed, and using the general method outlined in the programming guide for arbitrary atomics.
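The same idea can be written as a generic recipe: read the current value, and keep proposing a replacement with atomicCAS until either the proposal is published or the stored value is already preferable. A minimal sketch of that pattern (the name atomic_update64 and the comparator parameter are illustrative, not part of the code above):

// Generic 64-bit "arbitrary atomic": Better is any callable that defines the
// desired ordering; we retry only while our proposal would improve on what
// is currently stored.
template <typename Better>
__device__ unsigned long long atomic_update64(unsigned long long *addr,
                                              unsigned long long proposed,
                                              Better better)
{
  unsigned long long old = *addr;
  while (better(proposed, old)) {
    unsigned long long assumed = old;
    old = atomicCAS(addr, assumed, proposed);
    if (old == assumed) break; // CAS succeeded; our proposal is stored
  }
  return old;
}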
I wonder if I can use the functions in the Thrust library for this purpose instead.
Yes, you can do this with a transform and a reduce operation in Thrust. In fact, Thrust can combine these into a single algorithm call, thrust::transform_reduce. Here is an example:
$ cat t2155.cu
#include <iostream>
#include <thrust/transform.h>
#include <thrust/reduce.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/sequence.h>
#include <limits>
const int minidx = 256;
const size_t my_N = 32768;
struct f
{
  template <typename T>
  __host__ __device__ T operator()(T t) {
    T result = t;
    thrust::get<0>(result) = minidx - (thrust::get<0>(t) - minidx)*(thrust::get<0>(t) - minidx);
    return result;
  }
};
struct r
{
  template <typename T1, typename T2>
  __host__ __device__ T1 operator()(T1 &t1, T2 &t2){
    if (thrust::get<0>(t1) > thrust::get<0>(t2)) return t1;
    if (thrust::get<0>(t1) < thrust::get<0>(t2)) return t2;
    if (thrust::get<1>(t1) < thrust::get<1>(t2)) return t1;
    return t2;
  }
};
int main(){
  thrust::device_vector<int> arr(my_N);
  thrust::sequence(arr.begin(), arr.end());
  auto my_zip = thrust::make_zip_iterator(thrust::make_tuple(arr.begin(), thrust::counting_iterator<size_t>(0)));
  auto init = thrust::make_tuple(std::numeric_limits<int>::min(), std::numeric_limits<size_t>::max());
  auto result = thrust::transform_reduce(my_zip, my_zip+my_N, f(), init, r());
  std::cout << " maximum val: " << thrust::get<0>(result) << " at index: " << thrust::get<1>(result) << std::endl;
}
$ nvcc -o t2155 t2155.cu
$ compute-sanitizer ./t2155
========= COMPUTE-SANITIZER
maximum val: 256 at index: 256
========= ERROR SUMMARY: 0 errors
$
Notes:
Comparing those two implementations/examples, I know which one I would choose. The atomic method is brittle, type-limited, and size-limited. The Thrust example could be adapted to handle types more flexibly (e.g. a function that returns a 64-bit type; a sketch follows these notes) and could be extended to handle more than 4 billion elements.
The code here is just intended to be a possible roadmap. It's not thoroughly tested; bugs are always possible.
There is a strong correspondence between the two methods. The main() routines have almost a 1:1 correspondence, which hopefully you can identify. Furthermore the r() functor corresponds to the done() function, and the f() functor corresponds to the f() function.
Don't assume that you can readily/trivially scale my atomic example to 4 billion elements. The f() function I wrote would overflow/underflow an int variable. But with an appropriate data array and f() function, it should be possible to use up to 4 billion elements.
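As one illustration of the type flexibility mentioned above, here is a sketch (untested) of how the transform functor might look if f returned a 64-bit value; only the produced tuple's value type and the init value change, and the templated r() functor works unmodified. The name f64 is mine, not from the example above.

struct f64
{
  // produce a (long long value, size_t index) tuple instead of (int, size_t)
  template <typename T>
  __host__ __device__ thrust::tuple<long long, size_t> operator()(T t) {
    long long v = thrust::get<0>(t);
    return thrust::make_tuple(minidx - (v - minidx)*(v - minidx), thrust::get<1>(t));
  }
};
// and the corresponding init value:
// auto init = thrust::make_tuple(std::numeric_limits<long long>::min(),
//                                std::numeric_limits<size_t>::max());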
EDIT: As suggested in the comments below, we may be able to do a better job in the atomic case by doing a threadblock-level shared memory sweep reduction, followed by a single atomic per threadblock. Here is an example of that:
#include <iostream>
const int nTPB=512;
const unsigned long long initval = 0xFFFFFFFF80000000ULL; // maximum index and minimum int
__device__ __host__ int get_int(unsigned long long val){return reinterpret_cast<int *>(&val)[0];}
__device__ __host__ unsigned get_uns(unsigned long long val){return reinterpret_cast<unsigned *>(&val)[1];}
__device__ bool done(int fval, int fval1, unsigned idx, unsigned idx1){
  if (fval > fval1) return true;
  if ((fval == fval1) && (idx <= idx1)) return true;
  return false;
}
__device__ unsigned long long my_custom_atomic(unsigned long long *addr, int fval, unsigned idx){
  unsigned long long old = *addr;
  while (!done(get_int(old), fval, get_uns(old), idx))
    // cast fval through unsigned so a negative value does not sign-extend
    // into the index bits
    old = atomicCAS(addr, old, ((((unsigned long long)idx)<<32)|(unsigned)fval));
  return old;
}
const int minidx = 256;
__device__ int f(int t){ return minidx - (t-minidx)*(t-minidx);}
__device__ unsigned long long my_reduce(unsigned long long t1, unsigned long long t2){
  if (done(get_int(t1), get_int(t2), get_uns(t1), get_uns(t2))) return t1;
  return t2;
}
__global__ void k(int *arr, unsigned long long *min, unsigned N){
  __shared__ unsigned long long smem[nTPB];
  smem[threadIdx.x] = initval;
  for (unsigned int idx = blockIdx.x*blockDim.x+threadIdx.x; idx < N; idx+=gridDim.x*blockDim.x)
    // cast f()'s result through unsigned, again to avoid sign extension into the index bits
    smem[threadIdx.x] = my_reduce(smem[threadIdx.x], (((unsigned long long)idx)<<32)|(unsigned)f(arr[idx]));
  for (int t = nTPB>>1; t > 0; t>>=1){
    __syncthreads();
    if (threadIdx.x < t) smem[threadIdx.x] = my_reduce(smem[threadIdx.x], smem[threadIdx.x+t]);}
  if (!threadIdx.x) my_custom_atomic(min, get_int(smem[0]), get_uns(smem[0]));
}
const unsigned my_N = 32768;
int main(){
  unsigned long long *min;
  cudaMallocManaged(&min, sizeof(min[0]));
  int *arr;
  cudaMallocManaged(&arr, sizeof(arr[0])*my_N);
  for (unsigned i = 0; i < my_N; i++) arr[i] = i;
  arr[1024] = minidx;
  *min = initval;
  k<<<(my_N+nTPB-1)/nTPB, nTPB>>>(arr, min, my_N);
  cudaDeviceSynchronize();
  std::cout << " maximum val: " << get_int(*min) << " at index: " << get_uns(*min) << std::endl;
}

Related

Pack two types into one CUDA

I'd like to pack two data types, specifically an int and a float, into an unsigned long long int in CUDA.
I wrote something like this, but I get compilation errors when unpacking:
__global__ void kernel() {
  // pack
  float positive_num = 5.1034786f;
  int index = 1024;
  unsigned long long int u_int_val = (unsigned long long int) positive_num << 32;
  u_int_val += index & 0xffff;
  // unpack
  positive_num = (float) u_int_val >> 32 ;
  index = u_int_val & 0xffff ;
  // check
  printf("positive_num: %f - index %i\n", positive_num, index);
}
The error I get:
error: expression must have integral or unscoped enum type // error line - positive_num = (float) ...
I have tried multiple approaches, even casting via the memory address, but either I get compilation errors or the round-trip conversion is simply not exact.
If it helps, the only assumption I can make is that both the numbers are positive, hence float positive_num > 0.f; int index > 0;
The reason I need to pack two numbers into one is to embed everything (a float and an int) in a single atomic operation, to find the minimum of both, for instance.
If the need for heterogeneous packing and unpacking exists only in device code, one can use CUDA's device function intrinsics __float_as_int() and __int_as_float() to re-interpret a 32-bit float into a 32-bit int and vice versa. The packing of integers is unproblematic, simply shift the desired high-order part and OR the parts together.
For the same functionality in code that needs to work on both host and device, the canonical C++ way of re-interpreting floating-point data as integer data and vice versa is to use memcpy(); CUDA is a C++ derivative. This may or may not be as efficient as using the device intrinsics, which have no cost, as the 32-bit registers of the GPU can be used for both integer and floating-point data. It may be worthwhile to inspect the generated machine code (SASS) with cuobjdump --dump-sass.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define DEVICE_CODE_ONLY (0)
#if DEVICE_CODE_ONLY
__device__ unsigned long long int pack_float_int (float a, int b)
{
    return (((unsigned long long int)(unsigned int)__float_as_int (a)) << 32) |
           (unsigned long long int)(unsigned int)b;
}
__device__ void unpack_float_int (unsigned long long int a, float *f, int *i)
{
    *f = __int_as_float ((int)(unsigned int)(a >> 32));
    *i = (int)(unsigned int)a;
}
#else // DEVICE_CODE_ONLY
__host__ __device__ unsigned long long int pack_float_int (float a, int b)
{
    unsigned int t;
    memcpy (&t, &a, sizeof t);
    return ((unsigned long long int)t << 32) |
           (unsigned long long int)(unsigned int)b;
}
__host__ __device__ void unpack_float_int (unsigned long long int a, float *f, int *i)
{
    unsigned int t = (unsigned int)(a >> 32);
    memcpy (f, &t, sizeof (*f));
    *i = (int)(unsigned int)a;
}
#endif // DEVICE_CODE_ONLY
__global__ void kernel (float f, int i)
{
    unsigned long long int p;
    float uf;
    int ui;
    p = pack_float_int (f, i);
    printf ("GPU: packed (%15.8e, %d) into %016llx\n", f, i, p);
    unpack_float_int (p, &uf, &ui);
    printf ("GPU: unpacked %016llx into (%15.8e %d)\n", p, uf, ui);
}
int main (void)
{
    float f = 5.1034786f;
    int i = 1024;
    printf ("testing packing/unpacking: %s\n",
            DEVICE_CODE_ONLY ? "on device" : "on device and host");
    kernel<<<1,1>>> (f, i);
    cudaDeviceSynchronize();
#if !DEVICE_CODE_ONLY
    unsigned long long int p;
    float uf;
    int ui;
    p = pack_float_int (f, i);
    printf ("CPU: packed (%15.8e, %d) into %016llx\n", f, i, p);
    unpack_float_int (p, &uf, &ui);
    printf ("CPU: unpacked %016llx into (%15.8e %d)\n", p, uf, ui);
#endif // DEVICE_CODE_ONLY
    return EXIT_SUCCESS;
}
Something like this should work:
__device__ unsigned long long pack(int a, float b){
  return (((unsigned long long)(*(reinterpret_cast<unsigned *>(&a))))<<32) + *(reinterpret_cast<unsigned *>(&b));
}
__device__ void unpack(unsigned long long val, int &a, float &b){
  unsigned mb = (unsigned)val;
  b = *(reinterpret_cast<float *>(&mb));
  unsigned ma = (unsigned)(val >> 32);
  a = *(reinterpret_cast<int *>(&ma));
}
(This is not really unique to CUDA. This is just C++ code, apart from the __device__ decorators.)
Example:
$ cat t2169.cu
#include <cstdio>
__device__ unsigned long long pack(int a, float b){
  return (((unsigned long long)(*(reinterpret_cast<unsigned *>(&a))))<<32) + *(reinterpret_cast<unsigned *>(&b));
}
__device__ void unpack(unsigned long long val, int &a, float &b){
  unsigned mb = (unsigned)val;
  b = *(reinterpret_cast<float *>(&mb));
  unsigned ma = (unsigned)(val >> 32);
  a = *(reinterpret_cast<int *>(&ma));
}
__global__ void k(int a, float b){
  unsigned long long val = pack(a,b);
  int ma;
  float mb;
  unpack(val, ma, mb);
  printf("a = %d, b = %f\n", ma, mb);
}
int main(){
  k<<<1,1>>>(-2, -1.3f);
  cudaDeviceSynchronize();
}
$ nvcc -o t2169 t2169.cu
$ compute-sanitizer ./t2169
========= COMPUTE-SANITIZER
a = -2, b = -1.300000
========= ERROR SUMMARY: 0 errors
$
That packs the int quantity in the upper 32-bits and the float quantity in the lower 32-bits.
If you want to reverse the storage order, you can just reverse the usage of a and b like this:
__device__ unsigned long long pack(int a, float b){
  return (((unsigned long long)(*(reinterpret_cast<unsigned *>(&b))))<<32) + *(reinterpret_cast<unsigned *>(&a));
}
__device__ void unpack(unsigned long long val, int &a, float &b){
  unsigned ma = (unsigned)val;
  a = *(reinterpret_cast<int *>(&ma));
  unsigned mb = (unsigned)(val >> 32);
  b = *(reinterpret_cast<float *>(&mb));
}

creating 16-bit input to cufftXtMakePlanMany and workSize for 1 GPU

I need to compute an FFT on unsigned 8-bit integer data. Previously, I was using cufftPlanMany; my input was cufftReal and the output was cufftComplex, and I used casts before and after the FFT to convert from unsigned 8-bit to cufftReal and then from cufftComplex to signed 8-bit.
It came to my attention that cuFFT has a nice option to run an FFT on half-precision data, which I hope improves the running time. According to the documentation, it currently doesn't support all of cudaDataType (it would be wonderful if it could in the future), but at least I can run it with 16-bit float (half precision) with the following signature:
cufftResult
cufftXtMakePlanMany(cufftHandle plan, int rank, long long int *n, long long int *inembed,
long long int istride, long long int idist, cudaDataType inputtype,
long long int *onembed, long long int ostride, long long int odist,
cudaDataType outputtype, long long int batch, size_t *workSize,
cudaDataType executiontype);
with data types for input, output and execution respectively as CUDA_R_16F, CUDA_C_16F and CUDA_C_16F. It would be ideal if I could feed this cuFFT with my U8 data directly; is there any way of doing so? Otherwise, if the first cast from U8 to cufftReal is necessary, how can I convert my data from cufftReal to CUDA_R_16F, and then back from CUDA_C_16F? Is CUDA smart enough to cast the input from float to half precision, given that cufftExecR2C would ultimately be the same and there is no other function to be called for half precision?
The other question is about workSize, which is designed for multi-GPU cases. Any idea how this size has to be calculated? (I have just one GPU.) Am I responsible for managing that buffer?
TL;DR: I can see two possible approaches here, one using a half-precision transform and one using a single-precision transform (perhaps with CUFFT callbacks). The reasons to choose one or the other may depend on a number of factors such as size of your transform, control of scope of input data, the GPU you are running on, and other factors.
I'm not going to try to address the processing of the output data that you indicate here:
then from cufftComplex to signed 8-bit.
since I don't know how to do that without more information. However the processing of the input data in each case should be illustrative for how you could process the output data.
Using half-precision transforms
A few things to note here are that you cannot (currently) use callbacks with half-precision transforms, and half-precision transforms can be more sensitive to input data characteristics (e.g. DC offset, transform size, etc.) than single- or double-precision transforms. Also, half-precision transforms for the most part require a Pascal or newer GPU (ignoring the Jetson family).
Because half-precision transforms don't support callbacks, we'll use "ordinary" host code to process the input data; you could also do this processing on the device prior to the transform, and the provided code outlines both possibilities. My "preprocessing" here is mostly just designed to prevent the 16-bit transform from overflowing. If you play around with this code you'll quickly see what an overflow looks like (inf and/or nan in the output).
$ cat t1961.cu
#include <cufft.h>
#include <stdio.h>
#include <stdlib.h>
#include <cufftXt.h>
#include <cuda_fp16.h>
#include <assert.h>
#include <iostream>
typedef half2 ctype;
typedef half rtype;
typedef unsigned char dtype;
long long sig_size = 1<<18;
const int amplitude = 127;
const float ramplitude = 1/(float)(4*amplitude);
__host__ __device__ half convert(int val){
  return __float2half_rn((val - amplitude)*ramplitude);
}
__global__ void dev_convert(rtype *out, dtype *in, int sz){
  int idx = threadIdx.x+blockDim.x*blockIdx.x;
  if (idx < sz)
    out[idx] = convert(in[idx]);
}
int main(){
  //put 4x sine waves into a U8 array
  dtype *my_data = (dtype *)malloc(sig_size*sizeof(dtype));
  for (int i = 0; i < sig_size; i++) my_data[i] = amplitude*(sin((i*8*3.141592654f)/sig_size)+1.0);
  rtype *d_idata;
  ctype *d_odata;
  cudaMalloc(&d_idata, sizeof(rtype)*sig_size);
#ifdef USE_HOST
  rtype *h_idata = (rtype *)malloc(sig_size*sizeof(rtype));
  // convert to 16 bit float non-offset suitable for cufft
  for (int i = 0; i < sig_size; i++) h_idata[i] = convert(my_data[i]);
  cudaMemcpy(d_idata, h_idata, sig_size*sizeof(rtype), cudaMemcpyHostToDevice);
#else
  const int bs = 256;
  dtype *d_mydata;
  cudaMalloc(&d_mydata, sig_size*sizeof(dtype));
  cudaMemcpy(d_mydata, my_data, sig_size*sizeof(dtype), cudaMemcpyHostToDevice);
  dev_convert<<<(sig_size+bs-1)/bs, bs>>>(d_idata, d_mydata, sig_size);
#endif
  cudaMalloc(&d_odata, sizeof(ctype)*(sig_size/2+1));
  cufftHandle plan;
  cufftResult r;
  r = cufftCreate(&plan);
  assert(r == CUFFT_SUCCESS);
  size_t ws = 0;
  r = cufftXtMakePlanMany(plan, 1, &sig_size, NULL, 1, 1, CUDA_R_16F, NULL, 1, 1, CUDA_C_16F, 1, &ws, CUDA_C_16F);
  assert(r == CUFFT_SUCCESS);
  r = cufftXtExec(plan, d_idata, d_odata, CUFFT_FORWARD); // warm-up
  assert(r == CUFFT_SUCCESS);
  cudaEvent_t start, stop;
  cudaEventCreate(&start); cudaEventCreate(&stop);
  cudaEventRecord(start);
  r = cufftXtExec(plan, d_idata, d_odata, CUFFT_FORWARD);
  assert(r == CUFFT_SUCCESS);
  cudaEventRecord(stop);
  cudaEventSynchronize(stop);
  float et;
  cudaEventElapsedTime(&et, start, stop);
  printf("forward FFT time for %lld samples: %fms\n", sig_size, et); // %lld: sig_size is long long
  ctype *h_odata = (ctype *)malloc((sig_size/2+1)*sizeof(ctype));
  cudaMemcpy(h_odata, d_odata, (sig_size/2+1)*sizeof(ctype), cudaMemcpyDeviceToHost);
  for (int i = 0; i < 8; i++)
    std::cout << __half2float(h_odata[i].x) << " + " << __half2float(h_odata[i].y) << "i" << std::endl;
  return 0;
}
$ nvcc -o t1961 t1961.cu -lcufft
$ ./t1961
forward FFT time for 262144 samples: 0.027520ms
-258 + 0i
0.00349998 + 0.00127506i
-0.000146866 + -0.000833511i
0.00140095 + -0.00501251i
-1.57031 + -32752i
-0.00198174 + 0.00856018i
0.00474548 + 0.00359917i
-0.00226784 + 0.00987244i
$
Using a single-precision transform with a load callback
This in my view has a few benefits. It is not as subject to the overflow phenomenon as the half-precision transforms are, and the (load) callback routine allows us to still operate on U8 input data.
$ cat t1962.cu
#include <cufft.h>
#include <stdio.h>
#include <stdlib.h>
#include <cufftXt.h>
#include <cuda_fp16.h>
#include <assert.h>
#include <iostream>
typedef cufftComplex ctype;
typedef cufftReal rtype;
typedef unsigned char dtype;
long long sig_size = 1<<18;
const int amplitude = 127;
const cufftReal ramplitude = 1/(float)(4*amplitude);
__device__ rtype convert(int val){
  return (val - amplitude)*ramplitude;
}
__device__ rtype myOwnCallback(void *dataIn,
                               size_t offset,
                               void *callerInfo,
                               void *sharedPtr) {
  rtype ret;
  ret = convert(((dtype *)dataIn)[offset]);
  return ret;
}
__device__ cufftCallbackLoadR myOwnCallbackPtr = myOwnCallback;
int main(){
  cufftCallbackLoadR hostCopyOfCallbackPtr;
  cudaMemcpyFromSymbol(&hostCopyOfCallbackPtr,
                       myOwnCallbackPtr,
                       sizeof(hostCopyOfCallbackPtr));
  //put 4x sine waves into a U8 array
  dtype *my_data = (dtype *)malloc(sig_size*sizeof(dtype));
  for (int i = 0; i < sig_size; i++) my_data[i] = amplitude*(sin((i*8*3.141592654f)/sig_size)+1.0);
  ctype *d_odata;
  dtype *d_mydata;
  cudaMalloc(&d_mydata, sig_size*sizeof(dtype));
  cudaMemcpy(d_mydata, my_data, sig_size*sizeof(dtype), cudaMemcpyHostToDevice);
  cudaMalloc(&d_odata, sizeof(ctype)*(sig_size/2+1));
  cufftHandle plan;
  cufftResult r;
  r = cufftCreate(&plan);
  assert(r == CUFFT_SUCCESS);
  size_t ws = 0;
  r = cufftXtMakePlanMany(plan, 1, &sig_size, NULL, 1, 1, CUDA_R_32F, NULL, 1, 1, CUDA_C_32F, 1, &ws, CUDA_C_32F);
  assert(r == CUFFT_SUCCESS);
  void *rps[] = {(void *)hostCopyOfCallbackPtr};
  r = cufftXtSetCallback(plan, rps, CUFFT_CB_LD_REAL, NULL);
  assert(r == CUFFT_SUCCESS);
  r = cufftXtExec(plan, (cufftReal *)d_mydata, d_odata, CUFFT_FORWARD); // warm-up
  assert(r == CUFFT_SUCCESS);
  cudaEvent_t start, stop;
  cudaEventCreate(&start); cudaEventCreate(&stop);
  cudaEventRecord(start);
  r = cufftXtExec(plan, (cufftReal *)d_mydata, d_odata, CUFFT_FORWARD);
  assert(r == CUFFT_SUCCESS);
  cudaEventRecord(stop);
  cudaEventSynchronize(stop);
  float et;
  cudaEventElapsedTime(&et, start, stop);
  printf("forward FFT time for %lld samples: %fms\n", sig_size, et); // %lld: sig_size is long long
  ctype *h_odata = (ctype *)malloc((sig_size/2+1)*sizeof(ctype));
  cudaMemcpy(h_odata, d_odata, (sig_size/2+1)*sizeof(ctype), cudaMemcpyDeviceToHost);
  for (int i = 0; i < 8; i++)
    std::cout << h_odata[i].x << " + " << h_odata[i].y << "i" << std::endl;
  return 0;
}
$ nvcc -o t1962 t1962.cu -rdc=true -lcufft_static -lculibos
$ ./t1962
forward FFT time for 262144 samples: 0.031488ms
-257.969 + 0i
0.00344251 + 0.00137726i
-3.96543e-05 + -0.00106905i
0.0013994 + -0.00490045i
0.0331312 + -32759.4i
-0.00190887 + 0.00865401i
0.00454092 + 0.00368094i
-0.00219025 + 0.00983646i
$
Yes, the results are not numerically identical between the two transform types. It's not reasonable to expect that 16-bit floating-point calculations and 32-bit floating-point calculations will be identical. In all probability the 32-bit calculations are "more accurate". For this sine-wave case, the terms I consider most important are the DC term as well as the magnitude spike at the fundamental. Those are numerically close to each other. The other terms are "in the noise". The timing results are not exactly comparable either, as the 16-bit calculation case omits the cost of the kernel call that converts the data from U8 to F16. You can use a profiler or just refactor the code to get more comparable timing; one possible refactor is sketched below.
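For instance, a comparable-timing sketch (untested) using the names from t1961.cu above, which simply moves the conversion kernel inside the timed region:

cudaEventRecord(start);
dev_convert<<<(sig_size+bs-1)/bs, bs>>>(d_idata, d_mydata, sig_size); // include the U8 -> F16 conversion cost
r = cufftXtExec(plan, d_idata, d_odata, CUFFT_FORWARD);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&et, start, stop);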
workSize can be ignored for the single-GPU case when using cufftXtMakePlanMany; otherwise, use the provided routines to determine workSize.
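If you want to inspect the number anyway, the work-area size can also be queried from an existing plan; a small sketch, assuming the plan variable from the examples above:

size_t ws_check = 0;
cufftResult rq = cufftGetSize(plan, &ws_check); // bytes of work area the plan will use
assert(rq == CUFFT_SUCCESS);
printf("work area bytes: %zu\n", ws_check);

For a single-GPU plan the library allocates and manages this buffer internally unless you opt out with cufftSetAutoAllocation.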

Does bool variable in kernel need to be synchronized

I have a kernel consisting of a for loop that searches through an array for a specific int value. I'm using a grid of 256 threads to do this. However, when one thread finds the value, I want to let the other threads know so they can exit. Currently I'm using a boolean flag, but I'm not sure if it's working properly. My concern is synchronization.
__device__ bool found;
__global__
void search()
{
    for(int i = threadIdx.x; i < 1000000; i += stride)
    {
        if(found == true)
        {
            break;
        }
        else if(arr[i] == x)
        {
            found = true;
            break;
        }
    }
}

int main()
{
    bool flag = false;
    cudaMemcpyToSymbol(found, &flag, sizeof(bool), 0, cudaMemcpyHostToDevice);
}
As pointed out in comments, you can probably achieve what you want by declaring the global device flag to be volatile, which will inhibit caching, and by using a memory fence function. There really isn't a global synchronization primitive which would do what you want, other than the new grid synchronization mechanism introduced in CUDA 9 and new hardware, but that probably isn't necessary in this case. Turning your pseudocode into a toy example:
#include <iostream>
#include <thrust/device_vector.h>

__device__ volatile bool found;
__device__ volatile size_t idx;

template<bool docheck>
__global__
void search(const int* arr, int x, size_t N)
{
    size_t i = threadIdx.x + blockIdx.x * blockDim.x;
    size_t stride = blockDim.x * gridDim.x;
    for(; (i<N) && (!found); i += stride)
    {
        if(arr[i] == x)
        {
            if (docheck) found = true;
            idx = i;
            __threadfence();
            break;
        }
    }
}

int main()
{
    const size_t N = 1 << 24;
    const size_t findidx = 280270;
    const int findval = 0xdeadbeef;
    thrust::device_vector<int> data(N,1);
    data[findidx] = findval;
    bool flag = false;
    size_t zero = 0;
    {
        cudaMemcpyToSymbol(found, &flag, sizeof(bool));
        cudaMemcpyToSymbol(idx, &zero, sizeof(size_t));
        int blocks, threads;
        cudaOccupancyMaxPotentialBlockSize(&blocks, &threads, search<false>);
        search<false><<<blocks, threads>>>(thrust::raw_pointer_cast(data.data()), findval, N);
        cudaDeviceSynchronize();
        size_t result = 0;
        cudaMemcpyFromSymbol(&result, idx, sizeof(size_t));
        std::cout << "result = " << result << std::endl;
    }
    {
        cudaMemcpyToSymbol(found, &flag, sizeof(bool));
        cudaMemcpyToSymbol(idx, &zero, sizeof(size_t));
        int blocks, threads;
        cudaOccupancyMaxPotentialBlockSize(&blocks, &threads, search<true>);
        search<true><<<blocks, threads>>>(thrust::raw_pointer_cast(data.data()), findval, N);
        cudaDeviceSynchronize();
        size_t result = 0;
        cudaMemcpyFromSymbol(&result, idx, sizeof(size_t));
        std::cout << "result = " << result << std::endl;
    }
    return 0;
}
and profiling it gives the following:
$ nvcc -arch=sm_52 -o notify notify.cu
$ nvprof ./notify
==3916== NVPROF is profiling process 3916, command: ./notify
result = 280270
result = 280270
==3916== Profiling application: ./notify
==3916== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 78.00% 1.6773ms 1 1.6773ms 1.6773ms 1.6773ms void search<bool=0>(int const *, int, unsigned long)
19.93% 428.63us 1 428.63us 428.63us 428.63us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__uninitialized_fill::functor<thrust::device_ptr<int>, int>, unsigned long>, thrust::cuda_cub::__uninitialized_fill::functor<thrust::device_ptr<int>, int>, unsigned long>(thrust::device_ptr<int>, int)
1.82% 39.199us 1 39.199us 39.199us 39.199us void search<bool=1>(int const *, int, unsigned long)
As you can see, the version which sets the found flag completes the search in 40 microseconds, whereas the version which does not set the flag takes 1.7 milliseconds. Given that the kernel is run with the maximum number of resident blocks in both cases, we can conclude that the early exit mechanism worked correctly and running blocks detected that the required value had been found.

Atomic Operation failed in CUDA

As the compute capability is 2.1, the atomicAdd and atomicMax operations do not support double precision, so I defined both functions based on some answers on Stack Overflow.
It is strange that the atomicAdd function works well but the atomicMax doesn't; here is my code.
The test generates a random number on each thread, sums the random numbers within each block to get a block sum, and then applies atomicAdd and atomicMax to the block sums.
#include <iostream>
#include <curand.h>
#include <curand_kernel.h>
#include <stdio.h>
#include <stdlib.h>
#define num_of_blocks 2
#define threads_per_block 2
#define tot_threads 4
__device__ double gsum[num_of_blocks];
__device__ double dev_sum;
__device__ double dev_max;
// set seed for random number generator
__global__ void initcuRand(curandState* globalState, unsigned long seed){
  int idx = threadIdx.x + blockIdx.x * blockDim.x;
  curand_init(seed, idx, 0, &globalState[idx]);
}
// atomicMax for double
__device__ double atomicMax_d(double* address, double val)
{
  unsigned long long int* address_as_i = (unsigned long long int*)address;
  unsigned long long int old = *address_as_i, assumed;
  do {
    assumed = old;
    old = ::atomicCAS(address_as_i, assumed, __double_as_longlong(::fmax(val, __longlong_as_double(assumed))));
  } while (assumed != old);
  return __longlong_as_double(old);
}
// atomicAdd for double
__device__ double atomicAdd_d(double* address, double val)
{
  unsigned long long int* address_as_ull = (unsigned long long int*)address;
  unsigned long long int old = *address_as_ull, assumed;
  do{
    assumed = old;
    old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
  }while(assumed != old);
  return __longlong_as_double(old);
}
__global__ void kernel(curandState *globalState){
  // global id
  int gidx = threadIdx.x + blockIdx.x * blockDim.x;
  // local id
  int lidx = threadIdx.x;
  // create shared memory to store seeds
  __shared__ curandState localState[tot_threads];
  __shared__ double srandnum[threads_per_block];
  // copy global seed to local
  localState[lidx] = globalState[gidx];
  // synchronize the local threads writing to the local memory cache
  __syncthreads();
  // generate random number from normal distribution in shared memory
  srandnum[lidx] = curand_normal(&localState[lidx]);
  __syncthreads();
  if(lidx == 0){srandnum[lidx] += srandnum[lidx + 1];} // sum of each block
  if(lidx == 0){gsum[blockIdx.x] = srandnum[lidx];}    // copy the sums back to global memory
  __threadfence();
  if( gidx < num_of_blocks){
    atomicAdd_d(&dev_sum, gsum[gidx]);
  }
  if( gidx < num_of_blocks){
    atomicMax_d(&dev_max, gsum[gidx]);
  }
  if( gidx == 0){
    printf("Sum is: %lf\n", dev_sum);
  }
  if( gidx == 1){
    printf("Max is: %lf\n", dev_max);
  }
}
int main(){
  // set seed on device
  curandState *globalState;
  cudaMalloc((void**)&globalState, tot_threads*sizeof(curandState));
  initcuRand<<<num_of_blocks, threads_per_block>>>(globalState, 1);
  // launch kernel
  kernel<<<num_of_blocks, threads_per_block>>>(globalState);
  double randnum[num_of_blocks];
  cudaMemcpyFromSymbol(randnum, gsum, num_of_blocks*sizeof(double), 0, cudaMemcpyDeviceToHost);
  std::cout << "Sum of each block:\n";
  for (int i = 0; i < num_of_blocks; ++i){
    std::cout << randnum[i] << std::endl;
  }
  cudaFree(globalState);
  return 0;
}
The result I get is
Sum is: -0.898329
Max is: 0.000000
Sum of each block:
-0.0152994
-0.88303
From the result, I know that the atomicAdd function works but the atomicMax function doesn't, and I have no idea why. Thanks in advance.
You don't ever initialize dev_max or dev_sum. You can't sensibly do these types of atomic operations on them if they don't start with a known value.
Try something like this instead:
__device__ double dev_sum = 0.0;
__device__ double dev_max = -1e99;
and I think you'll be happier with the results.
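If you prefer to reset them from the host before each launch (for example, when running the kernel repeatedly), the same initialization can be done with cudaMemcpyToSymbol; a minimal sketch:

// host-side reset of the device symbols before a launch
double h_sum = 0.0;
double h_max = -1e99;
cudaMemcpyToSymbol(dev_sum, &h_sum, sizeof(double));
cudaMemcpyToSymbol(dev_max, &h_max, sizeof(double));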

Simpson's method to integrate real valued functions with CUDA

I'm trying to code integration by Simpson's method in CUDA.
This is the formula for Simpson's rule (with n even):

integral from a to b of f(x) dx ≈ (h/3) * [f(x_0) + 4 f(x_1) + 2 f(x_2) + 4 f(x_3) + ... + 2 f(x_{n-2}) + 4 f(x_{n-1}) + f(x_n)]

where h = (b - a)/n and x_k = a + k*h.
Here's my code
__device__ void initThreadBounds(int *n_start, int *n_end, int n,
                                 int totalBlocks, int blockWidth)
{
    int threadId = blockWidth * blockIdx.x + threadIdx.x;
    int nextThreadId = threadId + 1;
    int threads = blockWidth * totalBlocks;
    *n_start = (threadId * n) / threads;
    *n_end = (nextThreadId * n) / threads;
}
__device__ float reg_func (float x)
{
    return x;
}

typedef float (*p_func) (float);

__device__ p_func integrale_f = reg_func;
__device__ void integralSimpsonMethod(int totalBlocks, int totalThreads,
                                      double a, double b, int n, float p_function(float), float* result)
{
    *result = 0;
    float h = (b - a)/n;
    //*result = p_function(a)+p_function(a + h * n);
    //parallel
    int idx_start;
    int idx_end;
    initThreadBounds(&idx_start, &idx_end, n-1, totalBlocks, totalThreads);
    //parallel_ends
    for (int i = idx_start; i < idx_end; i += 2) {
        *result += ( p_function(a + h*(i-1)) +
                     4 * p_function(a + h*(i)) +
                     p_function(a + h*(i+1)) ) * h/3;
    }
}
__global__ void integralSimpson(int totalBlocks, int totalThreads, float* result)
{
    float res = 0;
    integralSimpsonMethod(totalBlocks, totalThreads, 0, 10, 1000, integrale_f, &res);
    result[(blockIdx.x*totalThreads + threadIdx.x)] = res;
    //printf ("Simpson method\n");
}
__host__ void inttest()
{
    const int blocksNum = 32;
    const int threadNum = 32;
    float *device_resultf;
    float host_resultf[threadNum*blocksNum] = {0};
    cudaMalloc((void**) &device_resultf, sizeof(float)*threadNum*blocksNum);
    integralSimpson<<<blocksNum, threadNum>>>(blocksNum, threadNum, device_resultf);
    cudaThreadSynchronize();
    cudaMemcpy(host_resultf, device_resultf, sizeof(float)*threadNum*blocksNum,
               cudaMemcpyDeviceToHost);
    float sum = 0;
    for (int i = 0; i != blocksNum*threadNum; ++i) {
        sum += host_resultf[i];
        // printf ("result in %i cell = %f \n", i, host_resultf[i]);
    }
    printf ("sum = %f \n", sum);
    cudaFree(device_resultf);
}

int main(int argc, char* argv[])
{
    inttest();
    int i;
    scanf ("%d", &i);
}
The problem is that the result is wrong when n is lower than 100000: for the integral of f(x) = x from 0 to 10, the result is ~99, but when n is 100000 or larger it works fine and the result is ~50.
What's wrong, guys?
The basic problem here is that you don't understand your own algorithm.
Your integralSimpsonMethod() function is designed such that each thread samples at least 3 quadrature points per sub-interval in the integration domain. Therefore, if you choose n so that it is less than four times the number of threads in the kernel call, it is inevitable that adjacent threads' sub-intervals will overlap and the resulting integral will be incorrect. You need to make sure that the code checks and scales the thread count or n so that they don't produce overlap when the integral is computed.
If you are doing this for anything other than self-edification, then I recommend you look up the composite version of Simpson's rule. This is much better suited to parallel implementation and will be considerably more performant if implemented correctly; a sketch of a non-overlapping partitioning follows.
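To make the non-overlapping partitioning concrete, here is a minimal sketch (untested, with an illustrative integrand): with n even, the composite rule is a sum over n/2 disjoint panels, and a grid-stride loop over panel indices guarantees no thread touches another thread's panel.

__device__ float func(float x) { return x; } // illustrative integrand

// each iteration handles one panel [x_{2k}, x_{2k+2}]; panels never overlap
__global__ void composite_simpson(float a, float h, int n, float *partial)
{
    float acc = 0.0f;
    for (int k = blockIdx.x*blockDim.x + threadIdx.x; k < n/2; k += gridDim.x*blockDim.x) {
        float x0 = a + 2*k*h;
        acc += (func(x0) + 4.0f*func(x0 + h) + func(x0 + 2*h)) * h / 3.0f;
    }
    // per-thread partial sums; reduce on the host or with a further reduction
    partial[blockIdx.x*blockDim.x + threadIdx.x] = acc;
}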
I would propose an approach to Simpson's integration by using CUDA Thrust. You basically need five steps:
Generate the Simpson's quadrature weights;
Generate the function sampling points;
Generate the function values;
Calculate the elementwise product between the quadrature weights and the function values;
Sum the above products.
Step #1 requires creating an array with elements repeated many times, namely, 1 4 2 4 2 4 ... 1 for the Simpson's case. This can be accomplished by borrowing Robert Crovella's approach in cuda thrust library repeat vector multiple times.
Step #2 can be accomplished by using counting_iterators and borrowing talonmies' approach in Purpose and usage of counting_iterators in CUDA Thrust library.
Step #3 is an application of thrust::transform.
Steps #4 and #5 can be accomplished together by thrust::inner_product.
This approach can also be used when other quadrature rules are of interest.
Here is the code
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/inner_product.h>
#include <thrust/functional.h>
#include <thrust/fill.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
// for printing
#include <thrust/copy.h>
#include <iostream>   // std::cout
#include <iterator>   // std::ostream_iterator
#include <cstdio>     // printf, getchar
#define STRIDE 2
#define N 100
#define pi_f 3.14159265358979f // Greek pi in single precision
struct sin_functor
{
    __host__ __device__
    float operator()(float x) const
    {
        return sin(2.f*pi_f*x);
    }
};
template <typename Iterator>
class strided_range
{
public:
    typedef typename thrust::iterator_difference<Iterator>::type difference_type;

    struct stride_functor : public thrust::unary_function<difference_type,difference_type>
    {
        difference_type stride;

        stride_functor(difference_type stride)
            : stride(stride) {}

        __host__ __device__
        difference_type operator()(const difference_type& i) const
        {
            return stride * i;
        }
    };

    typedef typename thrust::counting_iterator<difference_type> CountingIterator;
    typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
    typedef typename thrust::permutation_iterator<Iterator,TransformIterator> PermutationIterator;

    // type of the strided_range iterator
    typedef PermutationIterator iterator;

    // construct strided_range for the range [first,last)
    strided_range(Iterator first, Iterator last, difference_type stride)
        : first(first), last(last), stride(stride) {}

    iterator begin(void) const
    {
        return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride)));
    }

    iterator end(void) const
    {
        return begin() + ((last - first) + (stride - 1)) / stride;
    }

protected:
    Iterator first;
    Iterator last;
    difference_type stride;
};
int main(void)
{
    // --- Generate the integration coefficients
    thrust::host_vector<float> h_coefficients(STRIDE);
    h_coefficients[0] = 4.f;
    h_coefficients[1] = 2.f;

    thrust::device_vector<float> d_coefficients(N);

    typedef thrust::device_vector<float>::iterator Iterator;
    strided_range<Iterator> pos1(d_coefficients.begin()+1, d_coefficients.end()-2, STRIDE);
    strided_range<Iterator> pos2(d_coefficients.begin()+2, d_coefficients.end()-1, STRIDE);

    thrust::fill(pos1.begin(), pos1.end(), h_coefficients[0]);
    thrust::fill(pos2.begin(), pos2.end(), h_coefficients[1]);

    d_coefficients[0]   = 1.f;
    d_coefficients[N-1] = 1.f;

    // print the generated d_coefficients
    std::cout << "d_coefficients: ";
    thrust::copy(d_coefficients.begin(), d_coefficients.end(), std::ostream_iterator<float>(std::cout, " ")); std::cout << std::endl;

    // --- Generate sampling points
    float a = 0.f;
    float b = .5f;
    float Dx = (b-a)/(float)(N-1);

    thrust::device_vector<float> d_x(N);
    // upper bound is b/Dx + 1, i.e. exactly N sampling points, so that exactly
    // N values are written into d_x
    thrust::transform(thrust::make_counting_iterator(a/Dx),
                      thrust::make_counting_iterator(b/Dx+1.f),
                      thrust::make_constant_iterator(Dx),
                      d_x.begin(),
                      thrust::multiplies<float>());

    // --- Calculate function values
    thrust::device_vector<float> d_y(N);
    thrust::transform(d_x.begin(), d_x.end(), d_y.begin(), sin_functor());

    // --- Calculate integral
    float integral = (Dx/3.f) * thrust::inner_product(d_y.begin(), d_y.begin() + N, d_coefficients.begin(), 0.0f);
    printf("The integral is = %f\n", integral);
    getchar();
    return 0;
}