Pack two types into one in CUDA

I'd like to pack two data types, specifically an int and a float, into an unsigned long long int in CUDA.
I wrote something like this but I get compilation errors when unpacking:
__global__ void kernel() {
    // pack
    float positive_num = 5.1034786f;
    int index = 1024;
    unsigned long long int u_int_val = (unsigned long long int) positive_num << 32;
    u_int_val += index & 0xffff;
    // unpack
    positive_num = (float) u_int_val >> 32 ;
    index = u_int_val & 0xffff ;
    // check
    printf("positive_num: %f - index %i\n", positive_num, index);
}
The error I get:
error: expression must have integral or unscoped enum type // error line - positive_num = (float) ...
I have tried multiple approaches, even casting the memory address, but I either get compilation errors or I don't manage to get an exact conversion there and back.
If it helps, the only assumption I can make is that both numbers are positive, i.e. float positive_num > 0.f; int index > 0;
The reason I need to pack two numbers into one is to update everything (a float and an int) in a single atomic operation, for instance to find the minimum of both.

If the need for heterogeneous packing and unpacking exists only in device code, one can use CUDA's device intrinsics __float_as_int() and __int_as_float() to re-interpret a 32-bit float as a 32-bit int and vice versa. Packing the two integers is then unproblematic: simply shift the desired high-order part into place and OR the parts together.
For the same functionality in code that needs to work on both host and device, the canonical C++ way of re-interpreting floating-point data as integer data and vice versa is memcpy(); CUDA is a C++ derivative. This may or may not be as efficient as the device intrinsics, which have no cost because the GPU's 32-bit registers can hold both integer and floating-point data. It may be worthwhile to inspect the generated machine code (SASS) with cuobjdump --dump-sass.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define DEVICE_CODE_ONLY (0)

#if DEVICE_CODE_ONLY
__device__ unsigned long long int pack_float_int (float a, int b)
{
    return (((unsigned long long int)(unsigned int)__float_as_int (a)) << 32) |
           (unsigned long long int)(unsigned int)b;
}

__device__ void unpack_float_int (unsigned long long int a, float *f, int *i)
{
    *f = __int_as_float ((int)(unsigned int)(a >> 32));
    *i = (int)(unsigned int)a;
}
#else // DEVICE_CODE_ONLY
__host__ __device__ unsigned long long int pack_float_int (float a, int b)
{
    unsigned int t;
    memcpy (&t, &a, sizeof t);
    return ((unsigned long long int)t << 32) |
           (unsigned long long int)(unsigned int)b;
}

__host__ __device__ void unpack_float_int (unsigned long long int a, float *f, int *i)
{
    unsigned int t = (unsigned int)(a >> 32);
    memcpy (f, &t, sizeof (*f));
    *i = (int)(unsigned int)a;
}
#endif // DEVICE_CODE_ONLY

__global__ void kernel (float f, int i)
{
    unsigned long long int p;
    float uf;
    int ui;

    p = pack_float_int (f, i);
    printf ("GPU: packed (%15.8e, %d) into %016llx\n", f, i, p);
    unpack_float_int (p, &uf, &ui);
    printf ("GPU: unpacked %016llx into (%15.8e %d)\n", p, uf, ui);
}

int main (void)
{
    float f = 5.1034786f;
    int i = 1024;

    printf ("testing packing/unpacking: %s\n",
            DEVICE_CODE_ONLY ? "on device" : "on device and host");
    kernel<<<1,1>>> (f, i);
    cudaDeviceSynchronize();

#if !DEVICE_CODE_ONLY
    unsigned long long int p;
    float uf;
    int ui;

    p = pack_float_int (f, i);
    printf ("CPU: packed (%15.8e, %d) into %016llx\n", f, i, p);
    unpack_float_int (p, &uf, &ui);
    printf ("CPU: unpacked %016llx into (%15.8e %d)\n", p, uf, ui);
#endif // DEVICE_CODE_ONLY
    return EXIT_SUCCESS;
}

Something like this should work:
__device__ unsigned long long pack(int a, float b){
    return (((unsigned long long)(*(reinterpret_cast<unsigned *>(&a))))<<32) + *(reinterpret_cast<unsigned *>(&b));
}
__device__ void unpack(unsigned long long val, int &a, float &b){
    unsigned mb = (unsigned)val;
    b = *(reinterpret_cast<float *>(&mb));
    unsigned ma = (unsigned)(val >> 32);
    a = *(reinterpret_cast<int *>(&ma));
}
(This is not really unique to CUDA. This is just C++ code, apart from the __device__ decorators.)
Example:
$ cat t2169.cu
#include <cstdio>
__device__ unsigned long long pack(int a, float b){
    return (((unsigned long long)(*(reinterpret_cast<unsigned *>(&a))))<<32) + *(reinterpret_cast<unsigned *>(&b));
}
__device__ void unpack(unsigned long long val, int &a, float &b){
    unsigned mb = (unsigned)val;
    b = *(reinterpret_cast<float *>(&mb));
    unsigned ma = (unsigned)(val >> 32);
    a = *(reinterpret_cast<int *>(&ma));
}
__global__ void k(int a, float b){
    unsigned long long val = pack(a,b);
    int ma;
    float mb;
    unpack(val, ma, mb);
    printf("a = %d, b = %f\n", ma, mb);
}
int main(){
    k<<<1,1>>>(-2, -1.3f);
    cudaDeviceSynchronize();
}
$ nvcc -o t2169 t2169.cu
$ compute-sanitizer ./t2169
========= COMPUTE-SANITIZER
a = -2, b = -1.300000
========= ERROR SUMMARY: 0 errors
$
That packs the int quantity in the upper 32-bits and the float quantity in the lower 32-bits.
If you want to reverse the storage order, you can just reverse the usage of a and b like this:
__device__ unsigned long long pack(int a, float b){
    return (((unsigned long long)(*(reinterpret_cast<unsigned *>(&b))))<<32) + *(reinterpret_cast<unsigned *>(&a));
}
__device__ void unpack(unsigned long long val, int &a, float &b){
    unsigned ma = (unsigned)val;
    a = *(reinterpret_cast<int *>(&ma));
    unsigned mb = (unsigned)(val >> 32);
    b = *(reinterpret_cast<float *>(&mb));
}

Related

CUDA, how to find the first item in an array that makes a function maximal

In CUDA C++, I have a big array Arr of integers and a function F: int -> int. I want to find the first index of an item in Arr that makes F maximal.
How can I write a kernel that always keeps the maximal value (of F) to compare against, using atomics to avoid race conditions?
BTW, I wonder if I can use functions from the Thrust library for this purpose instead.
How can I write a kernel that always keeps the maximal value (of F) to compare against, using atomics to avoid race conditions?
Based on your description, including usage of int, and a desire to use atomics, I would suggest using a custom atomic. This should work for arrays up to 4 billion elements:
$ cat t2154.cu
#include <iostream>
__device__ __host__ int get_int(unsigned long long val){return reinterpret_cast<int *>(&val)[0];}
__device__ __host__ unsigned get_uns(unsigned long long val){return reinterpret_cast<unsigned *>(&val)[1];}
__device__ bool done(int fval, int fval1, unsigned idx, unsigned idx1){
    if (fval > fval1) return true;
    if ((fval == fval1) && (idx <= idx1)) return true;
    return false;
}
__device__ unsigned long long my_custom_atomic(unsigned long long *addr, int fval, unsigned idx){
    unsigned long long old = *addr;
    while (!done(get_int(old), fval, get_uns(old), idx))
        old = atomicCAS(addr, old, ((((unsigned long long)idx)<<32)|fval));
    return old;
}
const int minidx = 256;
__device__ int f(int t){ return minidx - (t-minidx)*(t-minidx);}
__global__ void k(int *arr, unsigned long long *min, unsigned N){
    unsigned my_idx = blockIdx.x*blockDim.x+threadIdx.x;
    if (my_idx < N){
        int my_val = arr[my_idx];
        my_val = f(my_val);
        my_custom_atomic(min, my_val, my_idx);
    }
}
const unsigned my_N = 32768;
int main(){
    unsigned long long *min;
    cudaMallocManaged(&min, sizeof(min[0]));
    int *arr;
    cudaMallocManaged(&arr, sizeof(arr[0])*my_N);
    for (unsigned i = 0; i < my_N; i++) arr[i] = i;
    *min = 0xFFFFFFFF80000000ULL; //maximum unsigned index combined with minimum int value
    k<<<my_N/256, 256>>>(arr, min, my_N);
    cudaDeviceSynchronize();
    std::cout << " maximum val: " << get_int(*min) << " at index: " << get_uns(*min) << std::endl;
}
$ nvcc -o t2154 t2154.cu
$ compute-sanitizer ./t2154
========= COMPUTE-SANITIZER
 maximum val: 256 at index: 256
========= ERROR SUMMARY: 0 errors
$
We are assembling and disassembling the 64-bit quantity as needed, and using the general method outlined in the programming guide for arbitrary atomics.
I wonder if I can use functions from the Thrust library for this purpose instead.
Yes, you can do this with a transform and a reduce operation in thrust. In fact thrust can combine these into a single algorithm call. Here is an example:
$ cat t2155.cu
#include <iostream>
#include <thrust/transform.h>
#include <thrust/reduce.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/sequence.h>
#include <limits>
const int minidx = 256;
const size_t my_N = 32768;
struct f
{
    template <typename T>
    __host__ __device__ T operator()(T t) {
        T result = t;
        thrust::get<0>(result) = minidx - (thrust::get<0>(t) - minidx)*(thrust::get<0>(t) - minidx);
        return result;
    }
};
struct r
{
    template <typename T1, typename T2>
    __host__ __device__ T1 operator()(T1 &t1, T2 &t2){
        if (thrust::get<0>(t1) > thrust::get<0>(t2)) return t1;
        if (thrust::get<0>(t1) < thrust::get<0>(t2)) return t2;
        if (thrust::get<1>(t1) < thrust::get<1>(t2)) return t1;
        return t2;
    }
};
int main(){
    thrust::device_vector<int> arr(my_N);
    thrust::sequence(arr.begin(), arr.end());
    auto my_zip = thrust::make_zip_iterator(thrust::make_tuple(arr.begin(), thrust::counting_iterator<size_t>(0)));
    auto init = thrust::make_tuple(std::numeric_limits<int>::min(), std::numeric_limits<size_t>::max());
    auto result = thrust::transform_reduce(my_zip, my_zip+my_N, f(), init, r());
    std::cout << " maximum val: " << thrust::get<0>(result) << " at index: " << thrust::get<1>(result) << std::endl;
}
$ nvcc -o t2155 t2155.cu
$ compute-sanitizer ./t2155
========= COMPUTE-SANITIZER
 maximum val: 256 at index: 256
========= ERROR SUMMARY: 0 errors
$
Notes:

- Comparing the two implementations/examples, I know which one I would choose. The atomic method is brittle, type-limited, and size-limited. The thrust example could be adapted to handle types more flexibly (e.g. a function that returns a 64-bit type) and could be extended beyond 4 billion elements.
- The code here is just intended to be a possible roadmap. It's not thoroughly tested; bugs are always possible.
- There is a strong correspondence between the two methods. The main() routines have almost a 1:1 correspondence, which hopefully you can identify. Furthermore, the r() functor corresponds to the done() function, and the f() functor corresponds to the f() function.
- Don't assume that you can readily/trivially scale my atomic example up to 4 billion elements. The f() function I wrote would overflow/underflow an int variable. But with an appropriate data array and f() function, it should be possible to use up to 4 billion elements.
EDIT: As suggested in the comments below, we may be able to do a better job in the atomic case by doing a threadblock-level sweep reduction in shared memory, followed by a single atomic per threadblock. Here is an example of that:
#include <iostream>
const int nTPB=512;
const unsigned long long initval = 0xFFFFFFFF80000000ULL; // maximum index and minimum int
__device__ __host__ int get_int(unsigned long long val){return reinterpret_cast<int *>(&val)[0];}
__device__ __host__ unsigned get_uns(unsigned long long val){return reinterpret_cast<unsigned *>(&val)[1];}
__device__ bool done(int fval, int fval1, unsigned idx, unsigned idx1){
    if (fval > fval1) return true;
    if ((fval == fval1) && (idx <= idx1)) return true;
    return false;
}
__device__ unsigned long long my_custom_atomic(unsigned long long *addr, int fval, unsigned idx){
    unsigned long long old = *addr;
    while (!done(get_int(old), fval, get_uns(old), idx))
        old = atomicCAS(addr, old, ((((unsigned long long)idx)<<32)|fval));
    return old;
}
const int minidx = 256;
__device__ int f(int t){ return minidx - (t-minidx)*(t-minidx);}
__device__ unsigned long long my_reduce(unsigned long long t1, unsigned long long t2){
    if (done(get_int(t1), get_int(t2), get_uns(t1), get_uns(t2))) return t1;
    return t2;
}
__global__ void k(int *arr, unsigned long long *min, unsigned N){
    __shared__ unsigned long long smem[nTPB];
    smem[threadIdx.x] = initval;
    for (unsigned int idx = blockIdx.x*blockDim.x+threadIdx.x; idx < N; idx+=gridDim.x*blockDim.x)
        smem[threadIdx.x] = my_reduce(smem[threadIdx.x], (((unsigned long long)idx)<<32)|f(arr[idx]));
    for (int t = nTPB>>1; t > 0; t>>=1){
        __syncthreads();
        if (threadIdx.x < t) smem[threadIdx.x] = my_reduce(smem[threadIdx.x], smem[threadIdx.x+t]);
    }
    if (!threadIdx.x) my_custom_atomic(min, get_int(smem[0]), get_uns(smem[0]));
}
const unsigned my_N = 32768;
int main(){
    unsigned long long *min;
    cudaMallocManaged(&min, sizeof(min[0]));
    int *arr;
    cudaMallocManaged(&arr, sizeof(arr[0])*my_N);
    for (unsigned i = 0; i < my_N; i++) arr[i] = i;
    arr[1024] = minidx;
    *min = initval;
    k<<<(my_N+nTPB-1)/nTPB, nTPB>>>(arr, min, my_N);
    cudaDeviceSynchronize();
    std::cout << " minimum val: " << get_int(*min) << " at index: " << get_uns(*min) << std::endl;
}

How to resolve undefined reference errors to threadIdx.x, blockDim.x, and blockIdx.x in CUDA?

I'm a beginner working on a parallel list ranking algorithm in CUDA. I have no idea why I'm getting undefined reference errors to threadIdx.x, blockDim.x, and blockIdx.x, with the message "ld returned 1 exit status", when compiling with 'nvcc ParallelListRanking.cu -o ParallelListRanking'. Adding #include "device_launch_parameters.h" and #include "cuda.h" did not help. Any ideas how to resolve this?
#include <stdio.h>
#include <device_launch_parameters.h>
#include <cuda.h>
__global__ void set_up(unsigned long int n, unsigned long int *dS, unsigned long long int *dQR){
    unsigned long int i = threadIdx.x + blockDim.x * blockIdx.x;
    if (i < n) {
        dS[i] = i + 1;
        dS[n-1] = 0;
        if (dS[i] != 0) {
            dQR[i] = (dQR[i]%(1ull<<32)) + (1ull<<32) * 1ull;
        }
        else {
            dQR[i] = (dQR[i]%(1ull<<32)) + (1ull<<32) * 0ull;
        }
        dQR[i] = ((unsigned long long int) dS[i])%(1ull<<32) + (1ull<<32)*(dQR[i]/(1ull<<32));
    }
}
__global__ void update(unsigned long int n, unsigned long long int *dQR) {
    unsigned long int i = threadIdx.x + blockDim.x * blockIdx.x;
    if (i < n) {
        unsigned long int tempdQ = (unsigned long int) (dQR[i]%(1ull<<32));
        unsigned long int tempdQQ = (unsigned long int) (dQR[tempdQ]%(1ull<<32));
        if (tempdQ != 0 && tempdQQ != 0) {
            unsigned long long int tmpdQRQ = dQR[tempdQ];
            unsigned long int tempdR = (unsigned long int) dQR[i]/(1ull<<32);
            unsigned long long int tempdRQ = tmpdQRQ/(1ull<<32);
            dQR[i] = ((unsigned long long int) tempdQQ)%(1ull<<32) + (((unsigned long long int) tempdR) + tempdRQ)*(1ull<<32);
            tempdQ = tempdQQ;
            tempdQQ = (unsigned long int) dQR[tempdQ]%(1ull<<32);
        }
    }
}
int main() {
    unsigned long int n = 1000;
    unsigned long int *dS =(unsigned long int *) cudaMalloc((int **)n, sizeof(unsigned long int));
    unsigned long long int *dQR = (unsigned long long int *) cudaMalloc((int **)n, sizeof(unsigned long long int));
    unsigned long int i = threadIdx.x + blockDim.x * blockIdx.x;
    set_up<<< 100, 100 >>>(n, dS, dQR);
    for (int t = 1; t < n; t++) {
        update<<< 100, 100>>>(n, dQR);
    }
    printf("%lu", (unsigned long int) (dQR[i]%(1ull<<32)));
    printf("\n");
    printf("%lu", (unsigned long int) (dQR[i]/(1ull<<32)));
    cudaFree(dQR);
    cudaFree(dS);
    return 0;
}
You can't do this:
unsigned long int i = threadIdx.x + blockDim.x * blockIdx.x;
in host code (i.e. in main()).
Those variables are automatically defined in device code only (the routines decorated with __global__, for example).
The remainder of your host code from that point should not depend on variables like threadIdx.x because they are not defined in host code and make no sense to use there.
So just delete that line entirely (from main()) and come up with another method to use an index like that in your host code.
The only place you seem to use it is in your printf statements, so I guess if you are trying to print out all the variables you will need a loop in host code:
for (int i = 0; i < n; i++){
    printf("%lu", (unsigned long int) (dQR[i]%(1ull<<32)));
    printf("\n");
    printf("%lu", (unsigned long int) (dQR[i]/(1ull<<32)));
}
That is the proximal cause of the error you are reporting; however, your code has other issues.
This is not correct usage of cudaMalloc:
unsigned long int *dS =(unsigned long int *) cudaMalloc((int **)n, sizeof(unsigned long int));
you want to do something like this:
cudaMallocManaged(&dS, sizeof(unsigned long int)*n);
and likewise for dQR. (I'm switching from cudaMalloc to cudaMallocManaged here for a reason related to your attempt to print those variables from host code.)
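Putting those pieces together, a minimal sketch of what the corrected setup might look like (variable names follow the question; error checking omitted):
unsigned long int *dS;
unsigned long long int *dQR;
cudaMallocManaged(&dS,  n * sizeof(unsigned long int));
cudaMallocManaged(&dQR, n * sizeof(unsigned long long int));
// ... launch kernels ...
cudaDeviceSynchronize();   // needed before the host reads managed memory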
There are likely other problems with your code. Before attempting anything like this, I would suggest you get a firm grasp of how to write a proper, simple vector-add application in CUDA: study a simple sample code like vectorAdd and perhaps one of the introductory blogs.
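For orientation only, here is a hedged minimal vector-add sketch (not the official sample itself; names are illustrative) showing where the built-in index variables may and may not be used:
#include <cstdio>

__global__ void vecAdd(const float *a, const float *b, float *c, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;  // built-ins valid only in device code
    if (i < n) c[i] = a[i] + b[i];
}

int main()
{
    const int n = 1000;
    float *a, *b, *c;
    // managed memory is accessible from both host and device
    cudaMallocManaged(&a, n * sizeof(float));
    cudaMallocManaged(&b, n * sizeof(float));
    cudaMallocManaged(&c, n * sizeof(float));
    for (int i = 0; i < n; i++) { a[i] = i; b[i] = 2 * i; }
    vecAdd<<<(n + 255) / 256, 256>>>(a, b, c, n);
    cudaDeviceSynchronize();   // wait before reading results on the host
    printf("c[10] = %f\n", c[10]);
    cudaFree(a); cudaFree(b); cudaFree(c);
    return 0;
}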

Atomic Operation failed in CUDA

As the compute capability is 2.1, the atomicAdd and atomicMax operations do not support double precision, so I defined both functions myself, based on some answers on Stack Overflow.
It is strange that the atomicAdd function works well but the atomicMax doesn't; here is my code.
The test generates random numbers on each block, sums them per block to get block sums, and then applies atomicAdd and atomicMax to the block sums.
#include <iostream>
#include <curand.h>
#include <curand_kernel.h>
#include <stdio.h>
#include <stdlib.h>

#define num_of_blocks 2
#define threads_per_block 2
#define tot_threads 4

__device__ double gsum[num_of_blocks];
__device__ double dev_sum;
__device__ double dev_max;

// set seed for random number generator
__global__ void initcuRand(curandState* globalState, unsigned long seed){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    curand_init(seed, idx, 0, &globalState[idx]);
}

// atomicMax for double
__device__ double atomicMax_d(double* address, double val)
{
    unsigned long long int* address_as_i = (unsigned long long int*)address;
    unsigned long long int old = *address_as_i, assumed;
    do {
        assumed = old;
        old = ::atomicCAS(address_as_i, assumed, __double_as_longlong(::fmax(val, __longlong_as_double(assumed))));
    } while (assumed != old);
    return __longlong_as_double(old);
}

// atomicAdd for double
__device__ double atomicAdd_d(double* address, double val)
{
    unsigned long long int* address_as_ull = (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull, assumed;
    do{
        assumed = old;
        old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
    }while(assumed != old);
    return __longlong_as_double(old);
}

__global__ void kernel(curandState *globalState){
    // global id
    int gidx = threadIdx.x + blockIdx.x * blockDim.x;
    // local id
    int lidx = threadIdx.x;
    // create shared memory to store seeds
    __shared__ curandState localState[tot_threads];
    __shared__ double srandnum[threads_per_block];
    // copy global seed to local
    localState[lidx] = globalState[gidx];
    // synchronize the local threads writing to the local memory cache
    __syncthreads();
    // generate random number from normal distribution in shared memory
    srandnum[lidx] = curand_normal(&localState[lidx]);
    __syncthreads();
    if(lidx == 0){srandnum[lidx] += srandnum[lidx + 1];}   // sum of each block
    if(lidx == 0){gsum[blockIdx.x] = srandnum[lidx];}      // copy the sums back to global memory
    __threadfence();
    if( gidx < num_of_blocks){
        atomicAdd_d(&dev_sum, gsum[gidx]);
    }
    if( gidx < num_of_blocks){
        atomicMax_d(&dev_max, gsum[gidx]);
    }
    if( gidx == 0){
        printf("Sum is: %lf\n", dev_sum);
    }
    if( gidx == 1){
        printf("Max is: %lf\n", dev_max);
    }
}

int main(){
    // set seed on device
    curandState *globalState;
    cudaMalloc((void**)&globalState, tot_threads*sizeof(curandState));
    initcuRand<<<num_of_blocks, threads_per_block>>>(globalState, 1);
    // launch kernel
    kernel<<<num_of_blocks, threads_per_block>>>(globalState);
    double randnum[num_of_blocks];
    cudaMemcpyFromSymbol(randnum, gsum, num_of_blocks*sizeof(double), 0, cudaMemcpyDeviceToHost);
    std::cout << "Sum of each block:\n";
    for (int i = 0; i < num_of_blocks; ++i){
        std::cout << randnum[i] << std::endl;
    }
    cudaFree(globalState);
    return 0;
}
The result I get is
Sum is: -0.898329
Max is: 0.000000
Sum of each block:
-0.0152994
-0.88303
From the result, I can see that the atomicAdd function works but the atomicMax function doesn't, and I have no idea why. Thanks in advance.
You don't ever initialize dev_max or dev_sum. You can't sensibly do these types of atomic operations on them if they don't start with a known value.
Try something like this instead:
__device__ double dev_sum = 0.0;
__device__ double dev_max = -1e99;
and I think you'll be happier with the results.
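If the initial values need to be set at run time rather than statically (for example, before each of several launches), one alternative sketch uses cudaMemcpyToSymbol on the existing __device__ variables (the host variable names here are illustrative):
double h_sum = 0.0;
double h_max = -1e99;   // a very small starting value for the max
// initialize the __device__ globals from the host before launching the kernel
cudaMemcpyToSymbol(dev_sum, &h_sum, sizeof(double));
cudaMemcpyToSymbol(dev_max, &h_max, sizeof(double));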

is there a way to do "saypx" in cuBLAS?

cublasSaxpy computes y' = a * x + y, where x and y are vectors and a is a scalar.
It turns out I need to compute y' = a * y + x instead. I'm not seeing how to twist the cuBLAS library into doing that.
(Of course, I could compute y' = a * y, then y' = y' + x, but y is read and written twice that way. And I could write my own CUDA code to do it, but then it's likely not anywhere near as fast as the cuBLAS code. I'm just surprised there's no apparent way to do "saypx" directly.)
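For concreteness, a minimal sketch of that two-call fallback (the wrapper name saypx is hypothetical; it assumes a valid cublasHandle_t and device pointers, and accepts the extra pass over y):
#include <cublas_v2.h>

// y' = a*y + x via two cuBLAS calls: scale y in place, then add x
void saypx(cublasHandle_t handle, int n, float a, const float *x_d, float *y_d)
{
    cublasSscal(handle, n, &a, y_d, 1);           // y = a * y
    const float one = 1.0f;
    cublasSaxpy(handle, n, &one, x_d, 1, y_d, 1); // y = 1*x + y
}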
[Added] There are functions similar to "saxpby" in Intel's version of cblas, which would do what I need. But oddly enough, that's not in cuBLAS.
[Added #2] It looks like I can use the cudnnAddTensor function, with some aliasing of descriptors (I have a FilterDescriptor that points to the tensor, which AddTensor won't accept, but I should be able to alias a TensorDescriptor to the same memory and shape.)
There isn't a way I am aware of to do what you are asking in CUBLAS, nor in standard BLAS. What you have found in MKL is an extension added by Intel, but I don't recall seeing something similar in other host and accelerator BLAS implementations.
The good news is that your assertion that "I could write my own CUDA code to do it, but then it's likely not anywhere near as fast as the cuBLAS code" is untrue, at least for an operation as trivial as saxpy. Even a naïve implementation of saxpy will get very close to CUBLAS, because there really aren't that many ways to read two arrays, perform an FMAD, and write back the result. As long as you get memory coalescing correct, it is pretty simple to write performant code. For example:
#include <vector>
#include <algorithm>
#include <cassert>
#include <iostream>
#include <cmath>
#include "cublas_v2.h"

typedef enum
{
    AXPY = 0,
    AXPBY = 1
} saxpy_op_t;

__device__ __host__ __inline__
float axpby_op(float y, float x, float a)
{
    return a * y + x;
}

__device__ __host__ __inline__
float axpy_op(float y, float x, float a)
{
    return y + a * x;
}

template<typename T>
class pitched_accessor
{
    T * p;
    size_t pitch;

public:
    __host__ __device__
    pitched_accessor(T *p_, size_t pitch_) : p(p_), pitch(pitch_) {};

    __host__ __device__
    T& operator[](size_t idx) { return p[pitch*idx]; };

    __host__ __device__
    const T& operator[](size_t idx) const { return p[pitch*idx]; };
};

template<saxpy_op_t op>
__global__
void saxpy_kernel(pitched_accessor<float> y, pitched_accessor<float> x,
                  const float a, const unsigned int N1)
{
    unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int stride = gridDim.x * blockDim.x;

    #pragma unroll 8
    for(; idx < N1; idx += stride) {
        switch (op) {
            case AXPY:
                y[idx] = axpy_op(y[idx], x[idx], a);
                break;
            case AXPBY:
                y[idx] = axpby_op(y[idx], x[idx], a);
                break;
        }
    }
}

__host__ void saxby(const unsigned int N, const float a,
                    float *x, int xinc, float *y, int yinc)
{
    int gridsize, blocksize;
    cudaOccupancyMaxPotentialBlockSize(&gridsize, &blocksize, saxpy_kernel<AXPBY>);
    saxpy_kernel<AXPBY><<<gridsize, blocksize>>>(pitched_accessor<float>(y, yinc),
                                                 pitched_accessor<float>(x, xinc), a, N);
}

__host__ void saxpy(const unsigned int N, const float a,
                    float *x, int xinc, float *y, int yinc)
{
    int gridsize, blocksize;
    cudaOccupancyMaxPotentialBlockSize(&gridsize, &blocksize, saxpy_kernel<AXPY>);
    saxpy_kernel<AXPY><<<gridsize, blocksize>>>(pitched_accessor<float>(y, yinc),
                                                pitched_accessor<float>(x, xinc), a, N);
}

void check_result(std::vector<float> &yhat, float result, float tolerance=1e-5f)
{
    auto it = yhat.begin();
    for(; it != yhat.end(); ++it) {
        float err = std::fabs(*it - result);
        assert( err < tolerance );
    }
}

int main()
{
    const int N = 1<<22;

    std::vector<float> x_h(N);
    std::vector<float> y_h(N);

    const float a = 2.f, y0 = 1234.f, x0 = 532.f;
    std::fill(y_h.begin(), y_h.end(), y0);
    std::fill(x_h.begin(), x_h.end(), x0);

    float *x_d, *y_d;
    size_t sz = sizeof(float) * size_t(N);
    cudaMalloc((void **)&x_d, sz);
    cudaMalloc((void **)&y_d, sz);
    cudaMemcpy(x_d, &x_h[0], sz, cudaMemcpyHostToDevice);

    {
        cudaMemcpy(y_d, &y_h[0], sz, cudaMemcpyHostToDevice);
        saxby(N, a, x_d, 1, y_d, 1);
        std::vector<float> yhat(N);
        cudaMemcpy(&yhat[0], y_d, sz, cudaMemcpyDeviceToHost);
        check_result(yhat, axpby_op(y0, x0, a));
    }

    {
        cudaMemcpy(y_d, &y_h[0], sz, cudaMemcpyHostToDevice);
        saxpy(N, a, x_d, 1, y_d, 1);
        std::vector<float> yhat(N);
        cudaMemcpy(&yhat[0], y_d, sz, cudaMemcpyDeviceToHost);
        check_result(yhat, axpy_op(y0, x0, a));
    }

    {
        cublasHandle_t handle;
        cublasCreate(&handle);
        cudaMemcpy(y_d, &y_h[0], sz, cudaMemcpyHostToDevice);
        cublasSaxpy(handle, N, &a, x_d, 1, y_d, 1);
        std::vector<float> yhat(N);
        cudaMemcpy(&yhat[0], y_d, sz, cudaMemcpyDeviceToHost);
        check_result(yhat, axpy_op(y0, x0, a));
        cublasDestroy(handle);
    }

    return int(cudaDeviceReset());
}
This demonstrates that a very simple axpy kernel can be easily adapted to perform both the standard operation and the version you want, and run within 10% of the runtime of CUBLAS on the compute 5.2 device I tested it on:
$ nvcc -std=c++11 -arch=sm_52 -Xptxas="-v" -o saxby saxby.cu -lcublas
ptxas info : 0 bytes gmem
ptxas info : Compiling entry function '_Z12saxpy_kernelIL10saxpy_op_t0EEv16pitched_accessorIfES2_fj' for 'sm_52'
ptxas info : Function properties for _Z12saxpy_kernelIL10saxpy_op_t0EEv16pitched_accessorIfES2_fj
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 17 registers, 360 bytes cmem[0]
ptxas info : Compiling entry function '_Z12saxpy_kernelIL10saxpy_op_t1EEv16pitched_accessorIfES2_fj' for 'sm_52'
ptxas info : Function properties for _Z12saxpy_kernelIL10saxpy_op_t1EEv16pitched_accessorIfES2_fj
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 17 registers, 360 bytes cmem[0]
$ nvprof ./saxby
==26806== NVPROF is profiling process 26806, command: ./saxby
==26806== Profiling application: ./saxby
==26806== Profiling result:
Time(%)      Time     Calls       Avg       Min       Max  Name
 54.06%  11.190ms         5  2.2381ms     960ns  2.9094ms  [CUDA memcpy HtoD]
 40.89%  8.4641ms         3  2.8214ms  2.8039ms  2.8310ms  [CUDA memcpy DtoH]
  1.73%  357.59us         1  357.59us  357.59us  357.59us  void saxpy_kernel<saxpy_op_t=1>(pitched_accessor<float>, pitched_accessor<float>, float, unsigned int)
  1.72%  355.15us         1  355.15us  355.15us  355.15us  void saxpy_kernel<saxpy_op_t=0>(pitched_accessor<float>, pitched_accessor<float>, float, unsigned int)
  1.60%  332.21us         1  332.21us  332.21us  332.21us  void axpy_kernel_val<float, int=0>(cublasAxpyParamsVal<float>)

CUDA random number generation

I would like to generate random numbers in my __device__ function and keep them in my int Board[500]. I found some examples, but they use a type named curandState. I only need a function like rand() in C++.
Here is my code: I have an N3[40000] array in device memory, I generate some random numbers in my kernel running on one thread (I mean kernel<<<1,1>>>), then I copy them to my N2[40000] on the CPU and print them. So here is the code:
#include <iostream>
#include <Cuda.h>
#include <curand.h>
#include <curand_kernel.h>

int n = 200;
using namespace std;

__device__ float generate( curandState* globalState, int ind )
{
    //int ind = threadIdx.x;
    curandState localState = globalState[ind];
    float RANDOM = curand_uniform( &localState );
    globalState[ind] = localState;
    return RANDOM;
}

__global__ void setup_kernel ( curandState * state, unsigned long seed )
{
    int id = threadIdx.x;
    curand_init ( seed, id, 0, &state[id] );
}

__global__ void kernel(float* N, curandState* globalState, int n)
{
    // generate random numbers
    for(int i=0;i<40000;i++)
    {
        int k = generate(globalState, i) * 100000;
        while(k > n*n-1)
        {
            k-=(n*n-1);
        }
        N[i] = k;
    }
}

int main()
{
    int N=40000;
    curandState* devStates;
    cudaMalloc ( &devStates, N*sizeof( curandState ) );

    // setup seeds
    setup_kernel <<< 1, N >>> ( devStates,unsigned(time(NULL)) );

    float N2[40000];
    float* N3;
    cudaMalloc((void**) &N3, sizeof(float)*N);

    kernel<<<1,1>>> (N3, devStates, n);

    cudaMemcpy(N2, N3, sizeof(float)*N, cudaMemcpyDeviceToHost);

    for(int i=0;i<N;i++)
    {
        cout<<N2[i]<<endl;
    }
    return 0;
}
You may use the cuRAND host API to generate random numbers directly into device memory and then run your kernel, without ever having to copy those values to the host.
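A minimal sketch of that host-API approach (compile with -lcurand; the variable names here are illustrative):
#include <cstdio>
#include <curand.h>

int main()
{
    const int N = 40000;
    float *d_data;
    cudaMalloc(&d_data, N * sizeof(float));

    // create a host-API generator; the numbers are written straight into device memory
    curandGenerator_t gen;
    curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
    curandSetPseudoRandomGeneratorSeed(gen, 1234ULL);
    curandGenerateUniform(gen, d_data, N);   // fills d_data with values in (0, 1]

    // a kernel could now consume d_data directly, no host round trip needed ...

    curandDestroyGenerator(gen);
    cudaFree(d_data);
    return 0;
}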