I have used atomicMax() to find the maximum value in the CUDA kernel:
__global__ void global_max(float* values, float* gl_max)
{
int i=threadIdx.x + blockDim.x * blockIdx.x;
float val=values[i];
atomicMax(gl_max, val);
}
It is throwing the following error:
error: no instance of overloaded function "atomicMax" matches the argument list
The argument types are: (float *, float).
atomicMax is not available for float types. But you can implement it via atomicCAS:
__device__ static float atomicMax(float* address, float val)
{
int* address_as_i = (int*) address;
int old = *address_as_i, assumed;
do {
assumed = old;
old = ::atomicCAS(address_as_i, assumed,
__float_as_int(::fmaxf(val, __int_as_float(assumed))));
} while (assumed != old);
return __int_as_float(old);
}
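With this overload defined above your kernel, the original code compiles unchanged. Here is a minimal usage sketch (added for illustration, not part of the original answer), assuming gl_max has been initialized to -FLT_MAX (or to the first data value) before the launch:
__global__ void global_max(float* values, float* gl_max)
{
    int i = threadIdx.x + blockDim.x * blockIdx.x;
    atomicMax(gl_max, values[i]);   // resolves to the custom float overload above
}
// Host side (sketch):
//   float init = -FLT_MAX;                                            // from <cfloat>
//   cudaMemcpy(gl_max, &init, sizeof(float), cudaMemcpyHostToDevice);
//   global_max<<<blocks, threads>>>(values, gl_max);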
Based on the CUDA Toolkit Documentation v9.2.148, there is no atomicMax or atomicMin for float (atomicAdd and atomicExch do have float overloads). But we can implement them by mixing atomicMax and atomicMin with signed and unsigned integer casts:
This is a float atomic min:
__device__ __forceinline__ float atomicMinFloat (float * addr, float value) {
float old;
old = (value >= 0) ? __int_as_float(atomicMin((int *)addr, __float_as_int(value))) :
__uint_as_float(atomicMax((unsigned int *)addr, __float_as_uint(value)));
return old;
}
This is a float atomic max:
__device__ __forceinline__ float atomicMaxFloat (float * addr, float value) {
float old;
old = (value >= 0) ? __int_as_float(atomicMax((int *)addr, __float_as_int(value))) :
__uint_as_float(atomicMin((unsigned int *)addr, __float_as_uint(value)));
return old;
}
Another option is to map each float to an ordered integer representation, apply the plain integer atomicMax, and map the result back:
__device__ __forceinline__ int floatToOrderedInt( float floatVal ) {
int intVal = __float_as_int( floatVal );
return (intVal >= 0 ) ? intVal : intVal ^ 0x7FFFFFFF;
}
__device__ __forceinline__ float orderedIntToFloat( int intVal ) {
return __int_as_float( (intVal >= 0) ? intVal : intVal ^ 0x7FFFFFFF);
}
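Here is a usage sketch for completeness (added for illustration; kernel and variable names are made up): the running maximum lives in global memory as an ordered int, which the host initializes with the ordered-int encoding of -FLT_MAX (the same bit trick applied on the host side), and each thread converts its float before the plain integer atomicMax:
__global__ void global_max_ordered(const float* values, int* gl_max_ordered, int n)
{
    int i = threadIdx.x + blockDim.x * blockIdx.x;
    if (i < n)
        atomicMax(gl_max_ordered, floatToOrderedInt(values[i]));   // built-in int atomicMax
}
// Afterwards, read the int back and decode it with orderedIntToFloat()
// (or the equivalent host-side bit manipulation) to obtain the float maximum.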
The short answer is that you can't do this directly. As you can see from the atomic function documentation, only integer arguments are supported for atomicMax, and 64-bit integer arguments are only supported on devices of compute capability 3.5 and higher.
I believe the answer given by Xiaojing An is a good solution, but there is a minor issue with negative zero, which Robert Crovella mentioned in a comment. For example, if *addr = -1.0f and val = -0.0f, then after running atomicMaxFloat the value at addr will be -1.0f, but it should be -0.0f; atomicMinFloat is wrong in the same case. This happens because the >= 0 check returns true for negative zero, whereas here it needs to return false. The case can be fixed by using the signbit function instead:
__device__ __forceinline__ float atomicMinFloat(float* addr, float value) {
float old;
old = !signbit(value) ? __int_as_float(atomicMin((int*)addr, __float_as_int(value))) :
__uint_as_float(atomicMax((unsigned int*)addr, __float_as_uint(value)));
return old;
}
__device__ __forceinline__ float atomicMaxFloat(float* addr, float value) {
float old;
old = !signbit(value) ? __int_as_float(atomicMax((int*)addr, __float_as_int(value))) :
__uint_as_float(atomicMin((unsigned int*)addr, __float_as_uint(value)));
return old;
}
Note: I would have posted this as a comment to the answer from Xiaojing An but don't have enough reputation.
Of course, it's unclear what will happen with NaNs or infinities in this function, but I think it can be used without worrying about that, assuming you don't need to handle those cases; negative zero is probably the only really worrying case. It also depends on your willingness to accept this kind of hackery, where we make assumptions about how floating-point values are represented in binary, and many people may prefer never to go down this kind of route.
Here's a small test program:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <math.h>
/*
//these versions fail some of the tests involving negative 0
__device__ __forceinline__ float atomicMinFloat(float* addr, float value) {
float old;
old = value >= 0 ? __int_as_float(atomicMin((int*)addr, __float_as_int(value))) :
__uint_as_float(atomicMax((unsigned int*)addr, __float_as_uint(value)));
return old;
}
__device__ __forceinline__ float atomicMaxFloat(float* addr, float value) {
float old;
old = value >= 0 ? __int_as_float(atomicMax((int*)addr, __float_as_int(value))) :
__uint_as_float(atomicMin((unsigned int*)addr, __float_as_uint(value)));
return old;
}
*/
__device__ __forceinline__ float atomicMinFloat(float* addr, float value) {
float old;
old = !signbit(value) ? __int_as_float(atomicMin((int*)addr, __float_as_int(value))) :
__uint_as_float(atomicMax((unsigned int*)addr, __float_as_uint(value)));
return old;
}
__device__ __forceinline__ float atomicMaxFloat(float* addr, float value) {
float old;
old = !signbit(value) ? __int_as_float(atomicMax((int*)addr, __float_as_int(value))) :
__uint_as_float(atomicMin((unsigned int*)addr, __float_as_uint(value)));
return old;
}
__global__ void testKernel(float* testMaxData,
float* testMinData,
const float* testValues,
int numTests)
{
int index = blockDim.x * blockIdx.x + threadIdx.x;
if (index >= numTests)
{
return;
}
float val = testValues[index];
atomicMaxFloat(testMaxData + index, val);
atomicMinFloat(testMinData + index, val);
}
void checkCudaErr(cudaError_t cudaStatus)
{
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "CUDA Runtime error: %s\n", cudaGetErrorString(cudaStatus));
}
}
int main()
{
const int numValues = 6;
const int numTests = numValues * numValues;
float testData[numValues] = { 0.0f, -0.0f, 1.0f, -1.0f, 200.0f, -200.0f };
float testValuesMinMaxHost[numTests];
float testValuesHost[numTests];
for (int i = 0; i < numValues; ++i)
{
for (int j = 0; j < numValues; ++j)
{
/*
We will test the values of min(a,b) and max(a,b) for
all values of a and b in the testData array.
*/
testValuesMinMaxHost[numValues * i + j] = testData[i];
testValuesHost[numValues * i + j] = testData[j];
}
}
float* devTestMax = 0;
float* devTestMin = 0;
float* devTestValues = 0;
checkCudaErr(cudaSetDevice(0));
checkCudaErr(cudaMalloc((void**)&devTestMax, numTests * sizeof(float)));
checkCudaErr(cudaMalloc((void**)&devTestMin, numTests * sizeof(float)));
checkCudaErr(cudaMalloc((void**)&devTestValues, numTests * sizeof(float)));
checkCudaErr(cudaMemcpy(devTestMax, testValuesMinMaxHost, numTests * sizeof(float), cudaMemcpyHostToDevice));
checkCudaErr(cudaMemcpy(devTestMin, testValuesMinMaxHost, numTests * sizeof(float), cudaMemcpyHostToDevice));
checkCudaErr(cudaMemcpy(devTestValues, testValuesHost, numTests * sizeof(float), cudaMemcpyHostToDevice));
int blockSize = 128;
testKernel<<<(numTests + (blockSize - 1)) / blockSize, blockSize>>>(devTestMax, devTestMin, devTestValues, numTests);
checkCudaErr(cudaGetLastError());
float resultsMin[numTests];
float resultsMax[numTests];
checkCudaErr(cudaMemcpy(resultsMin, devTestMin, numTests * sizeof(float), cudaMemcpyDeviceToHost));
checkCudaErr(cudaMemcpy(resultsMax, devTestMax, numTests * sizeof(float), cudaMemcpyDeviceToHost));
checkCudaErr(cudaFree(devTestMax));
checkCudaErr(cudaFree(devTestMin));
checkCudaErr(cudaFree(devTestValues));
int fail = 0;
for (int i = 0; i < numTests; ++i)
{
float expectedMax = fmax(testValuesMinMaxHost[i], testValuesHost[i]);
if (resultsMax[i] != expectedMax)
{
printf("fail, expected %f, got %f from max(%f, %f)\n",
expectedMax,
resultsMax[i],
testValuesMinMaxHost[i],
testValuesHost[i]);
fail = 1;
}
float expectedMin = fmin(testValuesMinMaxHost[i], testValuesHost[i]);
if (resultsMin[i] != expectedMin)
{
printf("fail, expected %f, got %f from min(%f, %f)\n",
expectedMin,
resultsMin[i],
testValuesMinMaxHost[i],
testValuesHost[i]);
fail = 1;
}
}
if (fail == 0)
{
printf("all tests passed\n");
}
return 0;
}
This is the syntax for atomicMax:
int atomicMax(int* address, int val);
There is no float overload, but there are exceptions such as atomicAdd, which does support floats.
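For reference, a small sketch (added here, not part of the original answer) of the float atomic that does exist, atomicAdd, accumulating a sum in global memory; single-precision atomicAdd is available on compute capability 2.0 and higher:
__global__ void global_sum(const float* values, float* gl_sum, int n)
{
    int i = threadIdx.x + blockDim.x * blockIdx.x;
    if (i < n)
        atomicAdd(gl_sum, values[i]);   // float overload of atomicAdd
}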
What is the difference between the following two functions?
__device__ inline void comparator_volatile(volatile float &A, volatile float &B, uint dir) {
float t;
if ((A > B) == dir) {
t = A;
A = B;
B = t;
}
}
__device__ inline void comparator(float &A, float &B, uint dir) {
float t;
if ((A > B) == dir) {
t = A;
A = B;
B = t;
}
__threadfence();
}
Could anyone help me?
I implemented BitonicSort in several different versions based on the CUDA SDK version.
For the ATOMIC version (bitonicSortAtomic), I tried to use __threadfence() in __syncblocks_atomic to maintain memory consistency, but it doesn't work (the output is incorrect). I have to call comparator_volatile instead of comparator to get the correct result. Any idea?
The BitonicSort benchmark:
// (C) Copyright 2013, University of Illinois. All Rights Reserved
#include <stdlib.h>
#include <stdio.h>
#include "parboil.h"
#define THREADS 256
#define BLOCKS 32
#define NUM_VALS 2*THREADS*BLOCKS
__device__ volatile int mutex = 0;
__device__ inline void __syncblocks_atomic(int goal) {
__syncthreads();
// __threadfence();
int tx = threadIdx.x;
if (tx == 0) {
atomicAdd((int *)&mutex, 1);
while(mutex != goal) {}  // spin until every block has incremented the counter
}
__syncthreads();
}
__device__ inline void comparator(float &A, float &B, uint dir) {
float t;
if ((A > B) == dir) {
t = A;
A = B;
B = t;
}
}
__device__ inline void comparator_volatile(volatile float &A, volatile float &B, uint dir) {
float t;
if ((A > B) == dir) {
t = A;
A = B;
B = t;
}
}
#ifdef NAIVE
__global__ void bitonicSortNaive(float *src, int stride, int size) {
unsigned int tid = threadIdx.x + blockDim.x * blockIdx.x;
uint dir = (tid & (size / 2)) == 0;
unsigned int pos = 2*tid - (tid & (stride - 1));
comparator(src[pos], src[pos+stride], dir);
}
#endif
#ifdef ATOMIC
__global__ void bitonicSortAtomic(float *src, int length) {
uint numBlocks = gridDim.x * gridDim.y * gridDim.z;
uint goalVal = 0;
uint tid = threadIdx.x + blockDim.x * blockIdx.x;
for(uint size=2; size<=length; size<<=1) {
for(uint stride=size>>1; stride>0; stride=stride>>1) {
uint dir = (tid & (size / 2)) == 0;
uint pos = 2*tid - (tid & (stride - 1));
comparator_volatile(src[pos], src[pos+stride], dir);
if(stride>THREADS || (stride==1 && size>=THREADS)) {
goalVal += numBlocks;
__syncblocks_atomic(goalVal);
}
else
__syncthreads();
} // end for stride
} // end for size
}
#endif
int main() {
printf("[BENCH] Bitonic Sort %d elements\n", NUM_VALS);
printf("[BENCH] Xuhao Chen <cxh#illinois.edu>\n");
#ifdef NAIVE
printf("[BENCH] Naive version\n");
#endif
#ifdef ATOMIC
printf("[BENCH] Atomic Barrier\n");
#endif
float *values = (float*) malloc( NUM_VALS * sizeof(float));
array_init(values, NUM_VALS);
float *dev_values;
size_t size = NUM_VALS * sizeof(float);
cudaMalloc((void**) &dev_values, size);
cudaMemcpy(dev_values, values, size, cudaMemcpyHostToDevice);
dim3 blocks(BLOCKS,1);
dim3 threads(THREADS,1);
cudaDeviceSynchronize();
#ifdef NAIVE
int j, k;
for (k = 2; k <= NUM_VALS; k <<= 1) {
for (j=k>>1; j>0; j=j>>1) {
bitonicSortNaive<<<blocks, threads>>>(dev_values, j, k);
}
}
#endif
#ifdef ATOMIC
bitonicSortAtomic<<<blocks, threads>>>(dev_values, NUM_VALS);
#endif
cudaDeviceSynchronize();
cudaMemcpy(values, dev_values, size, cudaMemcpyDeviceToHost);
cudaFree(dev_values);
free(values);
}
__syncblocks_atomic is a function that implements a global barrier. Since there is inter-block communication, I have to maintain data consistency.
The CUDA programming guide states:
If a variable located in global or shared memory is declared as volatile, the compiler assumes that its value can be changed or used at any time by another thread and therefore any reference to this variable compiles to an actual memory read or write instruction.
This basically means that the memory will be flushed immediately as you assign a value to the variable, and will be fetched directly from the memory (with no cache) when you try to read its value.
In your first code sample, since both A and B are volatile, 6 actual memory instructions are generated: one read or write each time you use either A or B. The upside is that other threads are able to see those modifications earlier, while they are being made. The downside is that the execution will be slower, because the caches are disabled.
In your second code sample, on the other hand, the GPU is allowed to use its caches to accelerate execution until the end of the function, when it is forced to issue a memory write. If both A and B are already cached, only 2 memory writes are issued. The downside is that other threads might only be able to see the changed values after the fence.
Another thing you should consider is that operations are not atomic.
If other threads try to access A and B while your function is executing, they might see a partial execution of the function, in both cases. In the second code sample, this is a bit less likely to happen, because the thread will probably use its cached value, and flush the final values at once (anyway, you should not rely on this).
Also, volatile works as a faster version of __threadfence() among threads in the same warp (because threads in a warp act synchronously).
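To make the practical difference concrete, here is a small sketch (added for illustration, not from the original answer) of the typical situation where volatile matters: one thread spins on a flag that another thread sets. Without volatile, the compiler may keep the flag in a register and the loop would never observe the store. The names and the two-block launch are assumptions of the example:
__device__ volatile int flag = 0;   // volatile: every read of flag is a real memory read
__device__ float payload;
__global__ void handshake(float* result)
{
    if (blockIdx.x == 0 && threadIdx.x == 0) {
        payload = 42.0f;
        __threadfence();            // make payload visible before the flag is raised
        flag = 1;
    } else if (blockIdx.x == 1 && threadIdx.x == 0) {
        while (flag == 0) { }       // without volatile this read could be hoisted out of the loop
        result[0] = payload;        // safe to read once the flag has been observed
    }
}
(This assumes both blocks are resident at the same time; it only illustrates the visibility issue and is not a robust synchronization pattern.)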
I'd like to implement this atomic function in CUDA:
__device__ float lowest; // global var
__device__ int lowIdx; // global var
float realNum; // thread reg var
int index; // thread reg var
if(realNum < lowest) {
lowest= realNum; // the new lowest
lowIdx= index; // update the 'low' index
}
I don't believe I can do this with any of the atomic functions. I need to lock down a couple of global memory locations for a couple of instructions.
Might I be able to implement this with PTXAS (assembly) code?
As I stated in my second comment above, it's possible to combine your two 32-bit quantities into a single 64-bit atomically managed quantity, and deal with the problem that way. We then manage the 64-bit quantity atomically using the arbitrary atomic example as a rough guide. Obviously you can't extend this idea beyond two 32-bit quantities. Here's an example:
#include <stdio.h>
#define DSIZE 5000
#define nTPB 256
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
typedef union {
float floats[2]; // floats[0] = lowest
int ints[2]; // ints[1] = lowIdx
unsigned long long int ulong; // for atomic update
} my_atomics;
__device__ my_atomics test;
__device__ unsigned long long int my_atomicMin(unsigned long long int* address, float val1, int val2)
{
my_atomics loc, loctest;
loc.floats[0] = val1;
loc.ints[1] = val2;
loctest.ulong = *address;
while (loctest.floats[0] > val1)
loctest.ulong = atomicCAS(address, loctest.ulong, loc.ulong);
return loctest.ulong;
}
__global__ void min_test(const float* data)
{
int idx = (blockDim.x * blockIdx.x) + threadIdx.x;
if (idx < DSIZE)
my_atomicMin(&(test.ulong), data[idx],idx);
}
int main() {
float *d_data, *h_data;
my_atomics my_init;
my_init.floats[0] = 10.0f;
my_init.ints[1] = DSIZE;
h_data = (float *)malloc(DSIZE * sizeof(float));
if (h_data == 0) {printf("malloc fail\n"); return 1;}
cudaMalloc((void **)&d_data, DSIZE * sizeof(float));
cudaCheckErrors("cm1 fail");
// create random floats between 0 and 1
for (int i = 0; i < DSIZE; i++) h_data[i] = rand()/(float)RAND_MAX;
cudaMemcpy(d_data, h_data, DSIZE*sizeof(float), cudaMemcpyHostToDevice);
cudaCheckErrors("cmcp1 fail");
cudaMemcpyToSymbol(test, &(my_init.ulong), sizeof(unsigned long long int));
cudaCheckErrors("cmcp2 fail");
min_test<<<(DSIZE+nTPB-1)/nTPB, nTPB>>>(d_data);
cudaDeviceSynchronize();
cudaCheckErrors("kernel fail");
cudaMemcpyFromSymbol(&(my_init.ulong), test, sizeof(unsigned long long int));
cudaCheckErrors("cmcp3 fail");
printf("device min result = %f\n", my_init.floats[0]);
printf("device idx result = %d\n", my_init.ints[1]);
float host_val = 10.0f;
int host_idx = DSIZE;
for (int i=0; i<DSIZE; i++)
if (h_data[i] < host_val){
host_val = h_data[i];
host_idx = i;
}
printf("host min result = %f\n", host_val);
printf("host idx result = %d\n", host_idx);
return 0;
}
Here is a similar example that does atomic update of 2 float quantities.
Here is another custom atomic example.
@Robert Crovella: Excellent idea, but I think the function should be modified a little bit, as follows:
__device__ unsigned long long int my_atomicMin(unsigned long long int* address, float val1, int val2)
{
my_atomics loc, loctest, old;
loc.floats[0] = val1;
loc.ints[1] = val2;
loctest.ulong = *address;
old.ulong = loctest.ulong;
while (loctest.floats[0] > val1){
old.ulong = loctest.ulong;
loctest.ulong = atomicCAS(address, loctest.ulong, loc.ulong);
}
return old.ulong;
}
I want to write a program using Thrust that calculates the local minima of a given function, e.g. sin(x). I have done this by approximating the function's derivative by finite differences and then searching for those abscissas where the derivative changes sign. I now want to collect the local minima. I have marked local minima with "1" and the other points with "0", and I have done an inclusive_scan (to calculate the destination positions in the output array).
My problem is now gathering the local minima with gather_if (condition: stencil, map: minima), but the code does not compile and I do not know why.
Could someone explain why?
/**
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*/
#include <stdio.h>
#include <thrust/device_vector.h>
#include <thrust/gather.h>
#include <thrust/host_vector.h>
#include <thrust/reduce.h>
#include <thrust/copy.h>
#include <thrust/remove.h>
#include <thrust/functional.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/scan.h>
#include <sys/time.h>
__host__ __device__ unsigned int bitreverse(unsigned int number) {
number = ((0xf0f0f0f0 & number) >> 4) | ((0x0f0f0f0f & number) << 4);
number = ((0xcccccccc & number) >> 2) | ((0x33333333 & number) << 2);
number = ((0xaaaaaaaa & number) >> 1) | ((0x55555555 & number) << 1);
return number;
}
struct is_even
{
__host__ __device__
bool operator()(const int x) {
return (x % 2) == 0;
}
};
struct select_mine
{
__host__ __device__
float operator()(const float x) {
return (x < 0) ? 1.0f : 0.0f;
}
};
struct bitreverse_functor
{
__host__ __device__ unsigned int operator()(const unsigned int &x) {
return bitreverse(x);
}
};
struct sign
{
__host__ __device__ float operator()(const float x) {
if (x > 0.0f)
return 1.0f;
if (x < 0.0f)
return -1.0f;
return 0.0f;
}
};
struct sine: public thrust::unary_function<float, float>
{
__host__ __device__
float operator()(float x) {
return sinf(x);
}
};
struct absolute: public thrust::unary_function<float, float>
{
__host__ __device__
float operator()(float x) {
if (x < 0.0f)
x = -x;
return x;
}
};
struct lokalne_minimum : public thrust::binary_function<float,float,float>
{
__host__ __device__
float operator()(float x, float y)
{
if (x > 0 && y < 0)
return 1.0f;
return 0.0f;
}
};
struct conv : public thrust::unary_function<float,int>
{
__host__ __device__
int operator()(float x)
{
return (int)(x);
}
};
using namespace thrust;
void help(char *arg) {
fprintf(stderr,
"Nieprawidlowe uzycie: %s [x1] [x2] [n]\nx1 - zakres od\nx2 - zakres do\nn - liczba podzialow zakresu\n",
arg);
}
int main(int argc, char **argv) {
if (argc != 4) {
help(argv[0]);
return 1;
}
int n = atoi(argv[3]);
float x1 = (float) atof(argv[1]);
float x2 = (float) atof(argv[2]);
if (n < 0 || x2 < x1) {
help(argv[0]);
return 1;
}
float step = (x2 - x1) / n;
fprintf(stderr, "Step: %f\n", step);
thrust::device_vector<float> oxdata(n);
thrust::device_vector<float> oydata(n);
thrust::device_vector<float> diff(n);
thrust::host_vector<float> ixdata(n);
// FIXME change it
for (int i = 0; i < n; i++)
ixdata[i] = x1 + i * step;
thrust::copy(ixdata.begin(), ixdata.end(), oxdata.begin());
thrust::transform(oxdata.begin(),oxdata.end(),oydata.begin(),sine());
thrust::transform(oydata.begin() + 1, oydata.end(), oydata.begin(),
diff.begin()+1, thrust::minus<float>());
thrust::copy(diff.begin(), diff.end(), ixdata.begin());
for (int i = 0; i < n; i++)
printf ("%f, ", ixdata[i]);
printf ("\n");
thrust::transform(diff.begin()+1,diff.end(), diff.begin(),diff.begin(),lokalne_minimum());
for (int i = 0; i < n; i++)
printf ("%f, ", ixdata[i]);
printf ("\n");
thrust::copy(oydata.begin(), oydata.end(), ixdata.begin());
for (int i = 0; i < n; i++)
printf ("%f, ", ixdata[i]);
printf ("\n");
thrust::copy(diff.begin(), diff.end(), ixdata.begin());
for (int i = 0; i < n; i++)
printf ("%f, ", ixdata[i]);
printf ("\n");
//thrust::inclusive_scan(diff.begin(),diff.end(),diff.begin());
thrust::copy(diff.begin(), diff.end(), ixdata.begin());
for (int i = 0; i < n; i++)
printf ("%f, ", ixdata[i]);
printf ("\n");
thrust::device_vector<int> minima(n);
thrust::device_vector<int> stencil(n);
thrust::host_vector<int> hminima(n);
thrust::transform(diff.begin(),diff.end(),minima.begin(),conv());
thrust::copy(minima.begin(),minima.end(),hminima.begin());
thrust::copy(minima.begin(),minima.end(),stencil.begin());
for (int i = 0; i < n; i++)
printf ("%d, ", hminima[i]);
printf ("\n");
thrust::inclusive_scan(minima.begin(), minima.end(),minima.begin());
thrust::copy(minima.begin(),minima.end(),hminima.begin());
for (int i = 0; i < n; i++)
printf ("%d, ", hminima[i]);
printf ("\n");
//thrust::gather_if(minima.begin(),minima.end(),stencil.begin(),ixdata.begin(),ixdata.begin());
return 0;
}
This is a very late answer, provided to remove this question from the unanswered list. I'm taking advantage of Robert Crovella's comment and showing below a fully worked code to find the local minima of a sampled function with CUDA Thrust. The rationale of the code is as follows:
The function derivative is approximated by central differences as an application of thrust::transform;
The sampling points corresponding to local minima are flagged with "1" as an application of thrust::transform, by seeking the sign changes of the derivative via the predicate local_minimum_check();
The number of local minima is counted as an application of thrust::count;
The local minima are isolated as an application of thrust::copy_if.
#include <stdio.h>
#include <thrust/count.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/copy.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/counting_iterator.h>
/**************/
/* COS STRUCT */
/**************/
struct cosine: public thrust::unary_function<float, float>
{
__host__ __device__ float operator()(float h_x) { return cosf(h_x); }
};
/******************************************/
/* SECOND ORDER CENTRAL DIFFERENCE STRUCT */
/******************************************/
struct second_order_central_difference
{
__host__ __device__ float operator()(thrust::tuple<float,float,float> t)
{
float f_1, f0, f1;
thrust::tie(f_1,f0,f1) = t;
return f_1 - 2.0f * f0 + f1;
}
};
/******************************/
/* LOCAL MINIMUM CHECK STRUCT */
/******************************/
struct local_minimum_check:public thrust::binary_function<float,float,float>
{
__host__ __device__ float operator()(float x, float y)
{
if (x < 0 && y > 0) return 1.0f;
return 0.0f;
}
};
/****************************************/
/* LOCAL MINIMUM PREDICATE CHECK STRUCT */
/****************************************/
struct pred
{
__host__ __device__ bool operator()(const int d_x) { return (d_x == 1.f); }
};
int main() {
// --- Input parameters
int n = 100; // Number of sampling points
float x1 = 3.14f / 2.f; // (x1,x2) is the sampling interval
float x2 = 1.5f * 3.14f;
// --- Calculating the sampling points x
thrust::host_vector<float> h_x(n);
float step = (x2 - x1) / n;
for (int i = 0; i < n; i++) h_x[i] = x1 + (float)i * step;
thrust::device_vector<float> d_x = h_x;
// --- Evaluating the function values y = f(x)
thrust::device_vector<float> d_y(n);
thrust::transform(d_x.begin(),d_x.end(),d_y.begin(),cosine());
// --- Computing first order central finite differences
// In Matlab's notation, it calculates d_diff1(1:n-2) = d_y(3:n,:) - d_y(1:n-2,:);
thrust::device_vector<float> d_diff1(n-2);
thrust::transform(d_y.begin() + 2, d_y.end(), d_y.begin(), d_diff1.begin(), thrust::minus<float>());
// --- Computing second order central finite differences
// In Matlab's notation, it calculates d_diff2(1:n-2) = d_y(3:n) - 2. * d_y(2:n-1) + d_y(1:n-2);
thrust::device_vector<float> d_diff2(n-2);
thrust::transform(thrust::make_zip_iterator(
thrust::make_tuple(d_y.begin(), d_y.begin() + 1, d_y.begin() + 2)),
thrust::make_zip_iterator(
thrust::make_tuple(d_y.end() - 2, d_y.end() - 1, d_y.end())),
d_diff2.begin(),second_order_central_difference());
// --- Setting a flag for all those points for which the derivative changes sign from negative to positive
thrust::device_vector<float> d_fo_derivative(n-3);
thrust::transform(d_diff1.begin(), d_diff1.end() - 1, d_diff1.begin() + 1, d_fo_derivative.begin(), local_minimum_check());
// --- Counting the number of local minima and selecting the local minima coordinates
int min_number = thrust::count(d_fo_derivative.begin(), d_fo_derivative.end(), 1.f);
thrust::device_vector<float> d_x_minima(min_number);
thrust::copy_if(d_x.begin() + 1, d_x.end() - 1, d_fo_derivative.begin(), d_x_minima.begin(), pred());
for (int i = 0; i < d_x_minima.size(); i++) {
printf ("Local minimum # %i = %f\n ", i+1, (float)d_x_minima[i]);
}
getchar();
}
I'm trying to code integration by Simpson's method in CUDA.
This is the formula for Simpson's rule:
integral from a to b of f(x) dx ≈ (h/3) * [ f(x_0) + 4 f(x_1) + 2 f(x_2) + 4 f(x_3) + ... + 2 f(x_{n-2}) + 4 f(x_{n-1}) + f(x_n) ],
where x_k = a + k*h and h = (b - a)/n, with n even.
Here's my code
__device__ void initThreadBounds(int *n_start, int *n_end, int n,
int totalBlocks, int blockWidth)
{
int threadId = blockWidth * blockIdx.x + threadIdx.x;
int nextThreadId = threadId + 1;
int threads = blockWidth * totalBlocks;
*n_start = (threadId * n)/ threads;
*n_end = (nextThreadId * n)/ threads;
}
__device__ float reg_func (float x)
{
return x;
}
typedef float (*p_func) (float);
__device__ p_func integrale_f = reg_func;
__device__ void integralSimpsonMethod(int totalBlocks, int totalThreads,
double a, double b, int n, float p_function(float), float* result)
{
*result = 0;
float h = (b - a)/n;
//*result = p_function(a)+p_function(a + h * n);
//parallel
int idx_start;
int idx_end;
initThreadBounds(&idx_start, &idx_end, n-1, totalBlocks, totalThreads);
//parallel_ends
for (int i = idx_start; i < idx_end; i+=2) {
*result += ( p_function(a + h*(i-1)) +
4 * p_function(a + h*(i)) +
p_function(a + h*(i+1)) ) * h/3;
}
}
__global__ void integralSimpson(int totalBlocks, int totalThreads, float* result)
{
float res = 0;
integralSimpsonMethod(totalBlocks, totalThreads, 0, 10, 1000, integrale_f, &res);
result[(blockIdx.x*totalThreads + threadIdx.x)] = res;
//printf ("Simpson method\n");
}
__host__ void inttest()
{
const int blocksNum = 32;
const int threadNum = 32;
float *device_resultf;
float host_resultf[threadNum*blocksNum]={0};
cudaMalloc((void**) &device_resultf, sizeof(float)*threadNum*blocksNum);
integralSimpson<<<blocksNum, threadNum>>>(blocksNum, threadNum, device_resultf);
cudaThreadSynchronize();
cudaMemcpy(host_resultf, device_resultf, sizeof(float) *threadNum*blocksNum,
cudaMemcpyDeviceToHost);
float sum = 0;
for (int i = 0; i != blocksNum*threadNum; ++i) {
sum += host_resultf[i];
// printf ("result in %i cell = %f \n", i, host_resultf[i]);
}
printf ("sum = %f \n", sum);
cudaFree(device_resultf);
}
int main(int argc, char* argv[])
{
inttest();
int i;
scanf ("%d",&i);
}
The problem is: it gives the wrong result when n is lower than 100000. For an integral from 0 to 10, the result is ~99, but when n = 100000 or larger it works fine and the result is ~50.
What's wrong, guys?
The basic problem here is that you don't understand your own algorithm.
Your integralSimpsonMethod() function is designed so that each thread samples at least 3 quadrature points per sub-interval in the integration domain. Therefore, if you choose n so that it is less than four times the number of threads in the kernel call, it is inevitable that the sub-intervals will overlap and the resulting integral will be incorrect. You need to make sure that the code checks and scales the thread count or n so that they don't produce overlap when the integral is computed.
If you are doing this for anything other than self-edification, then I recommend you look up the composite version of Simpson's rule. This is much better suited to parallel implementation and will be considerably more performant if implemented correctly.
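For illustration only (a sketch added here, not talonmies' code), a composite formulation assigns each thread a disjoint, grid-strided set of interior sample points, so no sub-interval is counted twice; the per-thread partial sums are then added on the host or with a reduction. The integrand f and all names below are made up for the example, and n must be even:
__device__ float f(float x) { return x; }                // example integrand
__global__ void compositeSimpson(float a, float h, int n, float* partial)
{
    int tid    = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = gridDim.x * blockDim.x;
    float sum = 0.0f;
    for (int k = tid + 1; k < n; k += stride)            // interior points x_1 .. x_{n-1}
        sum += ((k & 1) ? 4.0f : 2.0f) * f(a + k * h);   // odd index: weight 4, even index: weight 2
    if (tid == 0)
        sum += f(a) + f(a + n * h);                      // the two endpoints, weight 1
    partial[tid] = sum * h / 3.0f;                       // host (or a reduction) sums the partials
}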
I would propose an approach to Simpson's integration by using CUDA Thrust. You basically need five steps:
Generate the Simpson's quadrature weights;
Generate the function sampling points;
Generate the function values;
Calculate the elementwise product between the quadrature weights and the function values;
Sum the above products.
Step #1 requires creating an array with elements repeated many times, namely, 1 4 2 4 2 4 ... 1 for the Simpson's case. This can be accomplished by borrowing Robert Crovella's approach in cuda thrust library repeat vector multiple times.
Step #2 can be accomplished by using counting_iterators and borrowing talonmies' approach in Purpose and usage of counting_iterators in CUDA Thrust library.
Step #3 is an application of thrust::transform.
Steps #4 and #5 can be accomplished together by thrust::inner_product.
This approach can be exploited also for use when other quadrature integration rules are of interest.
Here is the code
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/inner_product.h>
#include <thrust/functional.h>
#include <thrust/fill.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
// for printing
#include <thrust/copy.h>
#include <ostream>
#define STRIDE 2
#define N 100
#define pi_f 3.14159265358979f // Greek pi in single precision
struct sin_functor
{
__host__ __device__
float operator()(float x) const
{
return sin(2.f*pi_f*x);
}
};
template <typename Iterator>
class strided_range
{
public:
typedef typename thrust::iterator_difference<Iterator>::type difference_type;
struct stride_functor : public thrust::unary_function<difference_type,difference_type>
{
difference_type stride;
stride_functor(difference_type stride)
: stride(stride) {}
__host__ __device__
difference_type operator()(const difference_type& i) const
{
return stride * i;
}
};
typedef typename thrust::counting_iterator<difference_type> CountingIterator;
typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
typedef typename thrust::permutation_iterator<Iterator,TransformIterator> PermutationIterator;
// type of the strided_range iterator
typedef PermutationIterator iterator;
// construct strided_range for the range [first,last)
strided_range(Iterator first, Iterator last, difference_type stride)
: first(first), last(last), stride(stride) {}
iterator begin(void) const
{
return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride)));
}
iterator end(void) const
{
return begin() + ((last - first) + (stride - 1)) / stride;
}
protected:
Iterator first;
Iterator last;
difference_type stride;
};
int main(void)
{
// --- Generate the integration coefficients
thrust::host_vector<float> h_coefficients(STRIDE);
h_coefficients[0] = 4.f;
h_coefficients[1] = 2.f;
thrust::device_vector<float> d_coefficients(N);
typedef thrust::device_vector<float>::iterator Iterator;
strided_range<Iterator> pos1(d_coefficients.begin()+1, d_coefficients.end()-2, STRIDE);
strided_range<Iterator> pos2(d_coefficients.begin()+2, d_coefficients.end()-1, STRIDE);
thrust::fill(pos1.begin(), pos1.end(), h_coefficients[0]);
thrust::fill(pos2.begin(), pos2.end(), h_coefficients[1]);
d_coefficients[0] = 1.f;
d_coefficients[N-1] = 1.f;
// print the generated d_coefficients
std::cout << "d_coefficients: ";
thrust::copy(d_coefficients.begin(), d_coefficients.end(), std::ostream_iterator<float>(std::cout, " ")); std::cout << std::endl;
// --- Generate sampling points
float a = 0.f;
float b = .5f;
float Dx = (b-a)/(float)(N-1);
thrust::device_vector<float> d_x(N);
thrust::transform(thrust::make_counting_iterator(a/Dx),
thrust::make_counting_iterator((b+1.f)/Dx),
thrust::make_constant_iterator(Dx),
d_x.begin(),
thrust::multiplies<float>());
// --- Calculate function values
thrust::device_vector<float> d_y(N);
thrust::transform(d_x.begin(), d_x.end(), d_y.begin(), sin_functor());
// --- Calculate integral
float integral = (Dx/3.f) * thrust::inner_product(d_y.begin(), d_y.begin() + N, d_coefficients.begin(), 0.0f);
printf("The integral is = %f\n", integral);
getchar();
return 0;
}
I would like to generate random numbers in my __device__ function and keep them in my int Board[500]. I found some examples, but they used a type named curandState. I only need a function like rand() in C++.
Here is my code. I have an N3[40000] array in device memory; I generate some random numbers in my kernel running on one thread (I mean this: "kernel<<<1,1>>> ..."), then I copy it to my N2[40000] on the CPU and print it. So here is the code:
#include <iostream>
#include <cuda.h>
#include<curand.h>
#include<curand_kernel.h>
int n = 200;
using namespace std;
__device__ float generate( curandState* globalState, int ind )
{
//int ind = threadIdx.x;
curandState localState = globalState[ind];
float RANDOM = curand_uniform( &localState );
globalState[ind] = localState;
return RANDOM;
}
__global__ void setup_kernel ( curandState * state, unsigned long seed )
{
int id = threadIdx.x;
curand_init ( seed, id, 0, &state[id] );
}
__global__ void kernel(float* N, curandState* globalState, int n)
{
// generate random numbers
for(int i=0;i<40000;i++)
{
int k = generate(globalState, i) * 100000;
while(k > n*n-1)
{
k-=(n*n-1);
}
N[i] = k;
}
}
int main()
{
int N=40000;
curandState* devStates;
cudaMalloc ( &devStates, N*sizeof( curandState ) );
// setup seeds
setup_kernel <<< 1, N >>> ( devStates,unsigned(time(NULL)) );
float N2[40000];
float* N3;
cudaMalloc((void**) &N3, sizeof(float)*N);
kernel<<<1,1>>> (N3, devStates, n);
cudaMemcpy(N2, N3, sizeof(float)*N, cudaMemcpyDeviceToHost);
for(int i=0;i<N;i++)
{
cout<<N2[i]<<endl;
}
return 0;
}
You may use the cuRAND host API to generate random numbers directly in device memory and then run your kernel, without even having to copy those values to the host.
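A minimal host-API sketch (added here as an illustration of that approach; link with -lcurand): the generator writes uniform floats straight into device memory, which can then be passed to any kernel:
#include <cuda_runtime.h>
#include <curand.h>
#include <cstdio>
int main()
{
    const int N = 40000;
    float* d_vals;
    cudaMalloc(&d_vals, N * sizeof(float));
    curandGenerator_t gen;                                    // host-API generator, no per-thread curandState needed
    curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
    curandSetPseudoRandomGeneratorSeed(gen, 1234ULL);
    curandGenerateUniform(gen, d_vals, N);                    // uniform floats in (0, 1]
    // d_vals can now be used directly by a kernel; copy a few back just to inspect them.
    float h_vals[4];
    cudaMemcpy(h_vals, d_vals, sizeof(h_vals), cudaMemcpyDeviceToHost);
    printf("%f %f %f %f\n", h_vals[0], h_vals[1], h_vals[2], h_vals[3]);
    curandDestroyGenerator(gen);
    cudaFree(d_vals);
    return 0;
}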