I'd like to pack two data types, specifically an int and a float, into an unsigned long long int in CUDA.
I wrote something like this, but I get compilation errors when unpacking:
__global__ void kernel() {
// pack
float positive_num = 5.1034786f;
int index = 1024;
unsigned long long int u_int_val = (unsigned long long int) positive_num << 32;
u_int_val += index & 0xffff;
// unpack
positive_num = (float) u_int_val >> 32 ;
index = u_int_val & 0xffff ;
// check
printf("positive_num: %f - index %i\n", positive_num, index);
}
The error I get:
error: expression must have integral or unscoped enum type // error line - positive_num = (float) ...
I have tried multiple approaches, even casting via the memory address; either I get compilation errors or I simply don't get an exact round-trip conversion.
If it helps, the only assumption I can make is that both numbers are positive, hence float positive_num > 0.f and int index > 0.
The reason I need to pack two numbers into one is to update both (a float and an int) in a single atomic operation, for instance to find the minimum of both.
If the need for heterogeneous packing and unpacking exists only in device code, one can use CUDA's device function intrinsics __float_as_int() and __int_as_float() to re-interpret a 32-bit float as a 32-bit int and vice versa. The packing of integers is unproblematic: simply shift the desired high-order part and OR the parts together.
For the same functionality in code that needs to work on both host and device, the canonical C++ way of re-interpreting floating-point data as integer data and vice versa is to use memcpy(); CUDA is a C++ derivative. This may or may not be as efficient as using the device intrinsics, which have no cost, as the 32-bit registers of the GPU can be used for both integer and floating-point data. It may be worthwhile to inspect the generated machine code (SASS) with cuobjdump --dump-sass.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define DEVICE_CODE_ONLY (0)
#if DEVICE_CODE_ONLY
__device__ unsigned long long int pack_float_int (float a, int b)
{
return (((unsigned long long int)(unsigned int)__float_as_int (a)) << 32) |
(unsigned long long int)(unsigned int)b;
}
__device__ void unpack_float_int (unsigned long long int a, float *f, int *i)
{
*f = __int_as_float ((int)(unsigned int)(a >> 32));
*i = (int)(unsigned int)a;
}
#else // DEVICE_CODE_ONLY
__host__ __device__ unsigned long long int pack_float_int (float a, int b)
{
unsigned int t;
memcpy (&t, &a, sizeof t);
return ((unsigned long long int)t << 32) |
(unsigned long long int)(unsigned int)b;
}
__host__ __device__ void unpack_float_int (unsigned long long int a, float *f, int *i)
{
unsigned int t = (unsigned int)(a >> 32);
memcpy (f, &t, sizeof (*f));
*i = (int)(unsigned int)a;
}
#endif // DEVICE_CODE_ONLY
__global__ void kernel (float f, int i)
{
unsigned long long int p;
float uf;
int ui;
p = pack_float_int (f, i);
printf ("GPU: packed (%15.8e, %d) into %016llx\n", f, i, p);
unpack_float_int (p, &uf, &ui);
printf ("GPU: unpacked %016llx into (%15.8e %d)\n", p, uf, ui);
}
int main (void)
{
float f = 5.1034786f;
int i = 1024;
printf ("testing packing/unpacking: %s\n",
DEVICE_CODE_ONLY ? "on device" : "on device and host");
kernel<<<1,1>>> (f, i);
cudaDeviceSynchronize();
#if !DEVICE_CODE_ONLY
unsigned long long int p;
float uf;
int ui;
p = pack_float_int (f, i);
printf ("CPU: packed (%15.8e, %d) into %016llx\n", f, i, p);
unpack_float_int (p, &uf, &ui);
printf ("CPU: unpacked %016llx into (%15.8e %d)\n", p, uf, ui);
#endif // DEVICE_CODE_ONLY
return EXIT_SUCCESS;
}
Something like this should work:
__device__ unsigned long long pack(int a, float b){
return (((unsigned long long)(*(reinterpret_cast<unsigned *>(&a))))<<32) + *(reinterpret_cast<unsigned *>(&b));
}
__device__ void unpack(unsigned long long val, int &a, float &b){
unsigned mb = (unsigned)val;
b = *(reinterpret_cast<float *>(&mb));
unsigned ma = (unsigned)(val >> 32);
a = *(reinterpret_cast<int *>(&ma));
}
(This is not really unique to CUDA. This is just C++ code, apart from the __device__ decorators.)
Example:
$ cat t2169.cu
#include <cstdio>
__device__ unsigned long long pack(int a, float b){
return (((unsigned long long)(*(reinterpret_cast<unsigned *>(&a))))<<32) + *(reinterpret_cast<unsigned *>(&b));
}
__device__ void unpack(unsigned long long val, int &a, float &b){
unsigned mb = (unsigned)val;
b = *(reinterpret_cast<float *>(&mb));
unsigned ma = (unsigned)(val >> 32);
a = *(reinterpret_cast<int *>(&ma));
}
__global__ void k(int a, float b){
unsigned long long val = pack(a,b);
int ma;
float mb;
unpack(val, ma, mb);
printf("a = %d, b = %f\n", ma, mb);
}
int main(){
k<<<1,1>>>(-2, -1.3f);
cudaDeviceSynchronize();
}
$ nvcc -o t2169 t2169.cu
$ compute-sanitizer ./t2169
========= COMPUTE-SANITIZER
a = -2, b = -1.300000
========= ERROR SUMMARY: 0 errors
$
That packs the int quantity into the upper 32 bits and the float quantity into the lower 32 bits.
If you want to reverse the storage order, you can just reverse the usage of a and b like this:
__device__ unsigned long long pack(int a, float b){
return (((unsigned long long)(*(reinterpret_cast<unsigned *>(&b))))<<32) + *(reinterpret_cast<unsigned *>(&a));
}
__device__ void unpack(unsigned long long val, int &a, float &b){
unsigned ma = (unsigned)val;
a = *(reinterpret_cast<int *>(&ma));
unsigned mb = (unsigned)(val >> 32);
b = *(reinterpret_cast<float *>(&mb));
}
I want to calculate the sum of all elements of an array in CUDA. I came up with this code. It compiles without any errors, but the result is always zero. I get an "invalid device symbol" error from cudaMemcpyFromSymbol. I cannot use any libraries like Thrust or cuBLAS.
#define TRIALS_PER_THREAD 4096
#define NUM_BLOCKS 256
#define NUM_THREADS 256
double *dev;
__device__ volatile double pi_gpu = 0;
__global__ void ArraySum(double *array)
{
unsigned int tid = threadIdx.x + blockDim.x * blockIdx.x;
pi_gpu = pi_gpu + array[tid];
__syncthreads();
}
int main (int argc, char *argv[]) {
cudaMalloc((void **) &dev, NUM_BLOCKS * NUM_THREADS * sizeof(double));
double pi_gpu_h;
ArraySum<<<NUM_BLOCKS, NUM_THREADS>>>(dev);
cudaDeviceSynchronize();
cudaError err = cudaMemcpyFromSymbol(&pi_gpu_h, &pi_gpu, sizeof(double), cudaMemcpyDeviceToHost);
if( cudaSuccess != err )
{
fprintf( stderr, "cudaMemcpyFromSymbolfailed : %s\n", cudaGetErrorString( err ) );
exit( -1 );
}
return pi_gpu_h; // this is always zero!!!
}
The symbol argument in the copy from symbol call is incorrect. It should look like this:
cudaMemcpyFromSymbol(&pi_gpu_h, pi_gpu, sizeof(double), 0, cudaMemcpyDeviceToHost)
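In context, a minimal sketch of the corrected copy (the symbol itself is passed rather than its address; the fourth argument is the byte offset into the symbol, here 0):
__device__ volatile double pi_gpu = 0;   // device symbol at file scope, as in the question
// ... in host code, after the kernel has run:
double pi_gpu_h;
cudaError_t err = cudaMemcpyFromSymbol(&pi_gpu_h, pi_gpu, sizeof(double), 0,
                                       cudaMemcpyDeviceToHost);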
cublasSaxpy computes y' = a * x + y, where x and y are vectors and a is scalar.
It turns out I need to compute y' = a * y + x instead. I'm not seeing how to twist the cuBLAS library into doing that.
(Of course, I could compute y' = a * y, then y' = y' + x, but y' is read too often in that case. And I could write my own CUDA code to do it, but then it's likely not anywhere near as fast as the cuBLAS code. I'm just surprised there's no apparent way to do "saypx" directly.)
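For reference, that two-step route maps onto existing cuBLAS calls roughly like this (a sketch with handle, n, a, x and y named as in a plain cublasSaxpy call; error checking omitted, and y is indeed read and written twice):
// y = a * y       (scale y in place)
cublasSscal(handle, n, &a, y, 1);
// y = 1 * x + y   (axpy with alpha = 1)
const float one = 1.0f;
cublasSaxpy(handle, n, &one, x, 1, y, 1);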
[Added] There are functions similar to "saxpby" in Intel's version of cblas, which would do what I need. But oddly enough, that's not in cuBLAS.
[Added #2] It looks like I can use the cudnnAddTensor function, with some aliasing of descriptors (I have a FilterDescriptor that points to the tensor, which AddTensor won't accept, but I should be able to alias a TensorDescriptor to the same memory and shape.)
There isn't a way I am aware of to do what you are asking in CUBLAS, nor in standard BLAS. What you have found in MKL is an extension added by Intel, but I don't recall seeing something similar in other host and accelerator BLAS implementations.
The good news is that your assertion that "I could write my own CUDA code to do it, but then it's likely not anywhere near as fast as the cuBLAS code" is untrue, at least for an operation as trivial as saxpy. Even a naïve implementation of saxpy will get very close to CUBLAS, because there really aren't that many ways to read two arrays, perform an FMAD, and write back the result. As long as you get memory coalescing correct, it is pretty simple to write performant code. For example:
#include <vector>
#include <algorithm>
#include <cassert>
#include <iostream>
#include <cmath>
#include "cublas_v2.h"
typedef enum
{
AXPY = 0,
AXPBY = 1
} saxpy_op_t;
__device__ __host__ __inline__
float axpby_op(float y, float x, float a)
{
return a * y + x;
}
__device__ __host__ __inline__
float axpy_op(float y, float x, float a)
{
return y + a * x;
}
template<typename T>
class pitched_accessor
{
T * p;
size_t pitch;
public:
__host__ __device__
pitched_accessor(T *p_, size_t pitch_) : p(p_), pitch(pitch_) {};
__host__ __device__
T& operator[](size_t idx) { return p[pitch*idx]; };
__host__ __device__
const T& operator[](size_t idx) const { return p[pitch*idx]; };
};
template<saxpy_op_t op>
__global__
void saxpy_kernel(pitched_accessor<float> y, pitched_accessor<float> x,
const float a, const unsigned int N1)
{
unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int stride = gridDim.x * blockDim.x;
#pragma unroll 8
for(; idx < N1; idx += stride) {
switch (op) {
case AXPY:
y[idx] = axpy_op(y[idx], x[idx], a);
break;
case AXPBY:
y[idx] = axpby_op(y[idx], x[idx], a);
break;
}
}
}
__host__ void saxby(const unsigned int N, const float a,
float *x, int xinc, float *y, int yinc)
{
int gridsize, blocksize;
cudaOccupancyMaxPotentialBlockSize(&gridsize, &blocksize, saxpy_kernel<AXPBY>);
saxpy_kernel<AXPBY><<<gridsize, blocksize>>>(pitched_accessor<float>(y, yinc),
pitched_accessor<float>(x, xinc), a, N);
}
__host__ void saxpy(const unsigned int N, const float a,
float *x, int xinc, float *y, int yinc)
{
int gridsize, blocksize;
cudaOccupancyMaxPotentialBlockSize(&gridsize, &blocksize, saxpy_kernel<AXPY>);
saxpy_kernel<AXPY><<<gridsize, blocksize>>>(pitched_accessor<float>(y, yinc),
pitched_accessor<float>(x, xinc), a, N);
}
void check_result(std::vector<float> &yhat, float result, float tolerance=1e-5f)
{
auto it = yhat.begin();
for(; it != yhat.end(); ++it) {
float err = std::fabs(*it - result);
assert( err < tolerance );
}
}
int main()
{
const int N = 1<<22;
std::vector<float> x_h(N);
std::vector<float> y_h(N);
const float a = 2.f, y0 = 1234.f, x0 = 532.f;
std::fill(y_h.begin(), y_h.end(), y0);
std::fill(x_h.begin(), x_h.end(), x0);
float *x_d, *y_d;
size_t sz = sizeof(float) * size_t(N);
cudaMalloc((void **)&x_d, sz);
cudaMalloc((void **)&y_d, sz);
cudaMemcpy(x_d, &x_h[0], sz, cudaMemcpyHostToDevice);
{
cudaMemcpy(y_d, &y_h[0], sz, cudaMemcpyHostToDevice);
saxby(N, a, x_d, 1, y_d, 1);
std::vector<float> yhat(N);
cudaMemcpy(&yhat[0], y_d, sz, cudaMemcpyDeviceToHost);
check_result(yhat, axpby_op(y0, x0, a));
}
{
cudaMemcpy(y_d, &y_h[0], sz, cudaMemcpyHostToDevice);
saxpy(N, a, x_d, 1, y_d, 1);
std::vector<float> yhat(N);
cudaMemcpy(&yhat[0], y_d, sz, cudaMemcpyDeviceToHost);
check_result(yhat, axpy_op(y0, x0, a));
}
{
cublasHandle_t handle;
cublasCreate(&handle);
cudaMemcpy(y_d, &y_h[0], sz, cudaMemcpyHostToDevice);
cublasSaxpy(handle, N, &a, x_d, 1, y_d, 1);
std::vector<float> yhat(N);
cudaMemcpy(&yhat[0], y_d, sz, cudaMemcpyDeviceToHost);
check_result(yhat, axpy_op(y0, x0, a));
cublasDestroy(handle);
}
return int(cudaDeviceReset());
}
This demonstrates that a very simple axpy kernel can be easily adapted to perform both the standard operation and the version you want, and run within 10% of the runtime of CUBLAS on the compute 5.2 device I tested it on:
$ nvcc -std=c++11 -arch=sm_52 -Xptxas="-v" -o saxby saxby.cu -lcublas
ptxas info : 0 bytes gmem
ptxas info : Compiling entry function '_Z12saxpy_kernelIL10saxpy_op_t0EEv16pitched_accessorIfES2_fj' for 'sm_52'
ptxas info : Function properties for _Z12saxpy_kernelIL10saxpy_op_t0EEv16pitched_accessorIfES2_fj
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 17 registers, 360 bytes cmem[0]
ptxas info : Compiling entry function '_Z12saxpy_kernelIL10saxpy_op_t1EEv16pitched_accessorIfES2_fj' for 'sm_52'
ptxas info : Function properties for _Z12saxpy_kernelIL10saxpy_op_t1EEv16pitched_accessorIfES2_fj
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 17 registers, 360 bytes cmem[0]
$ nvprof ./saxby
==26806== NVPROF is profiling process 26806, command: ./saxby
==26806== Profiling application: ./saxby
==26806== Profiling result:
Time(%) Time Calls Avg Min Max Name
54.06% 11.190ms 5 2.2381ms 960ns 2.9094ms [CUDA memcpy HtoD]
40.89% 8.4641ms 3 2.8214ms 2.8039ms 2.8310ms [CUDA memcpy DtoH]
1.73% 357.59us 1 357.59us 357.59us 357.59us void saxpy_kernel<saxpy_op_t=1>(pitched_accessor<float>, pitched_accessor<float>, float, unsigned int)
1.72% 355.15us 1 355.15us 355.15us 355.15us void saxpy_kernel<saxpy_op_t=0>(pitched_accessor<float>, pitched_accessor<float>, float, unsigned int)
1.60% 332.21us 1 332.21us 332.21us 332.21us void axpy_kernel_val<float, int=0>(cublasAxpyParamsVal<float>)
I have been trying to implement a function in C to multiply two row-major matrices with cuBLAS. I don't know where I am making a mistake.
In the function below, A, B and C are pointers to correctly allocated row-major matrices.
I'd like to keep the option of transposing a matrix before performing the product.
The function below is not working.
void matrixMul(cublasHandle_t handle,float *A,float *B,float *C, int m,int n,int k,int transA,int transB){
cublasStatus_t stat ; // CUBLAS functions status
float alfa = 1;
float beta = 0;
int
ma = transA ? n:m,
na = transA ? m:n,
nb = transB ? k:n,
mb = transB ? n:k;
if(na!=mb){
puts("Something wrong");
}
//(mb,nb)(ma,na) = (mb,na)
stat= cublasSgemm_v2(handle, (cublasOperation_t) transB, (cublasOperation_t)transA,
nb,ma,mb,&alfa,
B,k,
A,n,&beta,
C,m);
switch (stat) {
case CUBLAS_STATUS_SUCCESS:
puts("Sucess");
break;
default:
printf(">>>>ERRO %d<<<<\n",stat);
break;
}
}
The entire source code:
// Utilities and system includes
#include <assert.h>
#include <helper_string.h> // helper for shared functions common to CUDA Samples
// CUDA runtime
#include <cuda_runtime.h>
#include <cublas_v2.h>
// CUDA and CUBLAS functions
#include <helper_functions.h>
void getFromDevice(float *h_A,float *d_A,int size){
//printf("Copy input data from the host memory to the CUDA device\n");
cudaError_t err = cudaMemcpy(h_A, d_A, size, cudaMemcpyDeviceToHost);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
}
//A = (m,n)
//B = (n,k)
//C = (m,k)
void matrixMul(cublasHandle_t handle,float *A,float *B,float *C, int m,int n,int k,int transA,int transB){
cublasStatus_t stat ; // CUBLAS functions status
float alfa = 1;
float beta = 0;
int
ma = transA ? n:m,
na = transA ? m:n,
nb = transB ? k:n,
mb = transB ? n:k;
if(na!=mb){
puts("Something wrong");
}
//(mb,nb)(ma,na) = (mb,na)
stat= cublasSgemm_v2(handle, (cublasOperation_t) transB, (cublasOperation_t)transA,
nb,ma,mb,&alfa,
B,k,
A,n,&beta,
C,m);
switch (stat) {
case CUBLAS_STATUS_SUCCESS:
puts("Sucess");
break;
default:
printf(">>>>ERRO %d<<<<\n",stat);
break;
}
}
float *mallocfDevice(int size){
float *d_C = NULL;
cudaError_t err = cudaMalloc((void **)&d_C, size * sizeof(float));
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}else{
size_t freeM, total;
cudaMemGetInfo ( &freeM, &total);
printf("MEM:%.3f\n",freeM,total,100 - ((double)freeM/total)*100 );
}
return d_C;
}
void printHostMatrix(int nl, int nc, float *h_s){
for(int j = 0; j < nl ; j++) {
for(int i = 0; i < (nc) ; i++){
int idx = j*nc + i;
printf("%.2f ", h_s[idx]);
}
printf("\n");
}
}
void printfDeviceMatrix(float *d_s,int m, int p){
float *h_s =(float*) malloc(sizeof(float)*m*p);
getFromDevice(h_s,d_s,sizeof(float)*m*p);
printHostMatrix(m,p,h_s);
free(h_s);
}
void sendTofDevice(float *h_A,float *d_A,int size){
//printf("Copy input data from the host memory to the CUDA device\n");
cudaError_t err = cudaMemcpy(d_A, h_A, size*sizeof(float), cudaMemcpyHostToDevice);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
}
int main(int argc,char **argv){
int ma = 2,
na = 3,
mb = 3,
nb = 2;
float A[] = { 1,2,3,
4,5,6};
float B[] = {7, 8,
9,10,
11,12};
float *C = new float[ma*nb];
float *d_a = mallocfDevice(ma*mb),
*d_b = mallocfDevice(mb*nb),
*d_c = mallocfDevice(ma*nb);
sendTofDevice(A,d_a,ma*na);
sendTofDevice(B,d_b,mb*nb);
cublasHandle_t handle ; // CUBLAS context
cublasCreate (&handle );
puts("A");
printfDeviceMatrix(d_a,ma,na);
puts("B");
printfDeviceMatrix(d_b,mb,nb);
matrixMul(handle, d_a,d_b,d_c,
ma,na,nb,0,0);
puts("AB=C");
printfDeviceMatrix(d_c,ma,nb);
}
cuBLAS assumes that matrices on the device are stored in column-major order:
"where α and β are scalars, and A, B and C are matrices stored in column-major format with dimensions op(A) m × k, op(B) k × n and C m × n, respectively."
(Read more at: http://docs.nvidia.com/cuda/cublas/index.html)
That means a matrix needs to be treated differently on the device than on the host.
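One common way to handle this without any extra transpose passes is to use the identity C^T = B^T * A^T: a row-major matrix is exactly its own transpose when viewed as column-major, so passing B first and A second, with the leading dimensions set to the row-major row widths, makes cuBLAS write C back in row-major layout. A minimal sketch using the question's conventions (A = m x n, B = n x k, C = m x k, all row-major and non-transposed, alfa = 1, beta = 0):
// C (m x k, row-major) = A (m x n, row-major) * B (n x k, row-major)
stat = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                   k,       // rows of C^T    (= columns of C)
                   m,       // columns of C^T (= rows of C)
                   n,       // shared inner dimension
                   &alfa,
                   B, k,    // B viewed column-major is B^T (k x n), ldb = k
                   A, n,    // A viewed column-major is A^T (n x m), lda = n
                   &beta,
                   C, k);   // C is written as C^T (k x m), ldc = k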
Take a look at my self-written CUDA kernel. I had a big kernel, but it returned an error message. I simplified it and found that it fails in one loop. I simplified that loop further and found that if I use an int value or a constant value to fill data[threadIdx.x] in the loop, it works fine, but if I use a double value it returns an error.
Advice: if you are not copying your data from host to device correctly, you can get a "warning: Cuda API error detected: cudaLaunch returned (0x7)" message when you use Nsight, or a segmentation fault when you run your app from a terminal.
__global__ void sumSeries(double* dSum,int* totalThreadNumber){
volatile __shared__ double data[768];
double var=0;
data[threadIdx.x]=0;
for ( int i = 10 ; i < 20 ;++i){
var=i;
data[threadIdx.x] += (var)/(var*var+1);
__syncthreads();
}
}
Why does it not work?
int main() {
int threadsPerBlock=768;
int blockCount=8;
int *hostThreadNumber=new int ;
*hostThreadNumber=threadsPerBlock*blockCount;
int* deviceThreadNumber=NULL;
double* deviceSum=NULL;
double* hostSum=(double*)malloc(blockCount);
cudaError_t cuerr=cudaDeviceSetCacheConfig(cudaFuncCachePreferShared);
if (cuerr != cudaSuccess){
std::cout<<"Cant SetCacheConfig: "<<cudaGetErrorString(cuerr)<<std::endl;
return -1;
}
cuerr=cudaMalloc(&deviceSum,blockCount*sizeof(double)); // sizeof(double) * number of blocks
if (cuerr != cudaSuccess){
std::cout<<"Cant allocate memory for deviceSum: "<<cudaGetErrorString(cuerr)<<std::endl;
return -1;
}
cuerr=cudaMalloc(&deviceThreadNumber,sizeof(int));
if (cuerr != cudaSuccess){
std::cout<<"Cant allocate memory for deviceThreadNumber: "<<cudaGetErrorString(cuerr)<<std::endl;
return -1;
}
cuerr = cudaMemcpy(deviceSum,hostSum,blockCount*sizeof(double),cudaMemcpyHostToDevice);
if (cuerr != cudaSuccess){
std::cout<<"Can not copy hostSum to device: "<<cudaGetErrorString(cuerr)<<std::endl;
return -1;
}
cuerr = cudaMemcpy(deviceThreadNumber,hostThreadNumber,sizeof(int),cudaMemcpyHostToDevice);
if (cuerr != cudaSuccess){
std::cout<<"Can not copy hostThreadNumber to device: "<<cudaGetErrorString(cuerr)<<std::endl;
return -1;
}
sumSeries<<<dim3(blockCount),dim3(threadsPerBlock)>>>(deviceSum,deviceThreadNumber);
cuerr=cudaGetLastError();
if (cuerr != cudaSuccess){
std::cout<<"Cuda kernel error: "<<cudaGetErrorString(cuerr)<<std::endl;
return -1;
}
cuerr= cudaDeviceSynchronize();
if (cuerr != cudaSuccess){
std::cout<<"Can not synchronize cuda kernel : "<<cudaGetErrorString(cuerr)<<std::endl;
return -1;
}
cuerr= cudaMemcpy(hostSum,deviceSum,blockCount*sizeof(double),cudaMemcpyDeviceToHost);
if (cuerr != cudaSuccess){
std::cout<<"Can not copy data to host: "<<cudaGetErrorString(cuerr)<<std::endl;
return -1;
}
cudaFree(deviceSum);
cudaFree(deviceThreadNumber);
return 0;
}
You allocated only 8 bytes of memory for hostSum:
double* hostSum=(double*)malloc(blockCount)
That's wrong, assuming you want to allocate blockCount * sizeof(double) bytes for it, because that is the amount of memory you allocate for deviceSum and use for the memory copies between host and device:
cuerr = cudaMalloc(&deviceSum,blockCount*sizeof(double));
cuerr = cudaMemcpy(deviceSum,hostSum,blockCount*sizeof(double),cudaMemcpyHostToDevice);
cuerr= cudaMemcpy(hostSum,deviceSum,blockCount*sizeof(double),cudaMemcpyDeviceToHost);
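A minimal fix is to match the size used on the device side:
double* hostSum = (double*)malloc(blockCount * sizeof(double));   // one double per block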