How do I pass a shared pointer to a cublas function? - cuda

I'm trying to run a cublas function from within a kernel in the following way:
__device__ void doLinear(const float *W,const float *input, unsigned i, float *out, unsigned o) {
unsigned idx = blockIdx.x*blockDim.x+threadIdx.x;
const float alpha = 1.0f;
const float beta = 0.0f;
if(idx == 0) {
cublasHandle_t cnpHandle;
cublasStatus_t status = cublasCreate(&cnpHandle);
cublasSgemv(cnpHandle, CUBLAS_OP_N, o, i, &alpha, W, 1, input, 1, &beta, out, 1);
}
__syncthreads();
}
This function works perfectly well if the input pointer is allocated using cudaMalloc.
My issue is, if the input pointer actually points to some shared memory, that contains data generated from within the kernel, I get the error:
CUDA_EXCEPTION_14 - Warp Illegal address.
Is it not possible to pass pointers to shared memory to a cublas function being called from a kernel?
What is the correct way to allocate my memory here? (At the moment I'm just doing another cudaMalloc and using that as my 'shared' memory, but it's making me feel a bit dirty)

You can't pass shared memory to a CUBLAS device API routine because it violates the CUDA dynamic parallelism memory model on which device side CUBLAS is based. The best you can do is use malloc() or new to allocate thread local memory on the runtime heap for the CUBLAS routine to use, or a portion of an a priori allocated buffer allocated with one of the host side APIs (as you are presently doing).

Related

How to copy dynamically allocated memory from device to host? [duplicate]

CUDA programming guide states that "Memory allocated via malloc() can be copied using the runtime (i.e., by calling any of the copy memory functions from Device Memory)", but somehow I'm having trouble to reproduce this functionality. Code:
#include <cstdio>
__device__ int* p;
__global__ void allocate_p() {
p = (int*) malloc(10);
printf("p = %p (seen by GPU)\n", p);
}
int main() {
cudaError_t err;
int* localp = (int*) malloc(10);
allocate_p<<<1,1>>>();
cudaDeviceSynchronize();
//Getting pointer to device-allocated memory
int* tmpp = NULL;
cudaMemcpyFromSymbol(&tmpp, p, 4);
printf("p = %p (seen by CPU)\n", tmpp);
//cudaMalloc((void**)&tmpp, 40);
err = cudaMemcpy(tmpp, localp, 40, cudaMemcpyHostToDevice);
cudaDeviceSynchronize();
printf(" err:%i %s", (int)err, cudaGetErrorString(err));
delete localp;
return 0;
}
crashes with output:
p = 0x601f920 (seen by GPU)
p = 0x601f920 (seen by CPU)
err:11 invalid argument
I gather, that the host sees the appropriate address on device, but somehow does not like it coming from malloc().
If I allocate earlier by cudaMalloc((void**)&np, 40); and then pass the pointer np as argument to kernel allocate_p, where it will be assigned to p (instead of malloc()), then the code runs fine.
What am I doing wrong / how do we use malloc() allocated device-memory in host-side functions?
As far as I am aware, it isn't possible to copy runtime heap memory using the host API functions. It certainly was not possible in CUDA 4.x and the CUDA 5.0 release candidate has not changed this. The only workaround I can offer is to use a kernel to "gather" final results and stuff them into a device transfer buffer or zero copy memory which can be accessed via the API or directly from the host. You can see an example of this approach in this answer and another question where Mark Harris from NVIDIA confirmed that this is a limitation of the (then) current implementation in the CUDA runtime.

Making CUB blockradixsort on-chip entirely?

I am reading the CUB documentations and examples:
#include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
__global__ void ExampleKernel(...)
{
// Specialize BlockRadixSort for 128 threads owning 4 integer items each
typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;
// Allocate shared memory for BlockRadixSort
__shared__ typename BlockRadixSort::TempStorage temp_storage;
// Obtain a segment of consecutive items that are blocked across threads
int thread_keys[4];
...
// Collectively sort the keys
BlockRadixSort(temp_storage).Sort(thread_keys);
...
}
In the example, each thread has 4 keys. It looks like 'thread_keys' will be allocated in global local memory. If I only has 1 key per thread, could I declare"int thread_key;" and make this variable in register only?
BlockRadixSort(temp_storage).Sort() is taking a pointer to the key as parameter. Does it mean that the keys have to be in global memory?
I would like to use this code but I want each thread to hold one key in register and keep it on-chip in register/shared memory after they are sorted.
Thanks in advance!
You can do this using shared memory (which will keep it "on-chip"). I'm not sure I know how to do it using strictly registers without de-constructing the BlockRadixSort object.
Here's an example code that uses shared memory to hold the initial data to be sorted, and the final sorted results. This sample is mostly set up for one data element per thread, since that seems to be what you are asking for. It's not difficult to extend it to multiple elements per thread, and I have put most of the plumbing in place to do that, with the exception of the data synthesis and debug printouts:
#include <cub/cub.cuh>
#include <stdio.h>
#define nTPB 32
#define ELEMS_PER_THREAD 1
// Block-sorting CUDA kernel (nTPB threads each owning ELEMS_PER THREAD integers)
__global__ void BlockSortKernel()
{
__shared__ int my_val[nTPB*ELEMS_PER_THREAD];
using namespace cub;
// Specialize BlockRadixSort collective types
typedef BlockRadixSort<int, nTPB, ELEMS_PER_THREAD> my_block_sort;
// Allocate shared memory for collectives
__shared__ typename my_block_sort::TempStorage sort_temp_stg;
// need to extend synthetic data for ELEMS_PER_THREAD > 1
my_val[threadIdx.x*ELEMS_PER_THREAD] = (threadIdx.x + 5)%nTPB; // synth data
__syncthreads();
printf("thread %d data = %d\n", threadIdx.x, my_val[threadIdx.x*ELEMS_PER_THREAD]);
// Collectively sort the keys
my_block_sort(sort_temp_stg).Sort(*static_cast<int(*)[ELEMS_PER_THREAD]>(static_cast<void*>(my_val+(threadIdx.x*ELEMS_PER_THREAD))));
__syncthreads();
printf("thread %d sorted data = %d\n", threadIdx.x, my_val[threadIdx.x*ELEMS_PER_THREAD]);
}
int main(){
BlockSortKernel<<<1,nTPB>>>();
cudaDeviceSynchronize();
}
This seems to work correctly for me, in this case I happened to be using RHEL 5.5/gcc 4.1.2, CUDA 6.0 RC, and CUB v1.2.0 (which is quite recent).
The strange/ugly static casting is needed as far as I can tell, because the CUB Sort is expecting a reference to an array of length equal to the customization parameter ITEMS_PER_THREAD(i.e. ELEMS_PER_THREAD):
__device__ __forceinline__ void Sort(
Key (&keys)[ITEMS_PER_THREAD],
int begin_bit = 0,
int end_bit = sizeof(Key) * 8)
{ ...

Dynamic programming in CUDA: global memory allocations to exchange data with child kernels

I have a the following code:
__global__ void interpolation(const double2* __restrict__ data, double2* __restrict__ result, const double* __restrict__ x, const double* __restrict__ y, const int N1, const int N2, int M)
{
int i = threadIdx.x + blockDim.x * blockIdx.x;
[...]
double phi_cap1, phi_cap2;
if(i<M) {
for(int m=0; m<(2*K+1); m++) {
[calculate phi_cap1];
for(int n=0; n<(2*K+1); n++) {
[calculate phi_cap2];
[calculate phi_cap=phi_cap1*phi_cap2];
[use phi_cap];
}
}
}
}
I would like to use Dynamic Programming on a Kepler K20 card to dispatch the processing of phi_cap1 and phi_cap2 in parallel to a bunch of threads to reduce the computation time. K=6 in my code, so I'm launching a single block of 13x13 threads.
Following the CUDA Dynamic Parallelism Programming Guide, I'm allocating a matrix phi_cap of 169 elements (formed by the products of phi_cap1 and phi_cap2), needed to exchange the data with the child kernel, in global memory. Indeed, quoting the guide,
As a general rule, all storage passed to a child kernel should be allocated explicitly from the global-memory heap.
I then ended-up with the following code
__global__ void interpolation(const double2* __restrict__ data, double2* __restrict__ result, const double* __restrict__ x, const double* __restrict__ y, const int N1, const int N2, int M)
{
int i = threadIdx.x + blockDim.x * blockIdx.x;
[...]
dim3 dimBlock(2*K+1,2*K+1); dim3 dimGrid(1,1);
if(i<M) {
double* phi_cap; cudaMalloc((void**)&phi_cap,sizeof(double)*(2*K+1)*(2*K+1));
child_kernel<<<dimGrid,dimBlock>>>(cc_diff1,cc_diff2,phi_cap);
for(int m=0; m<(2*K+1); m++) {
for(int n=0; n<(2*K+1); n++) {
[use phi_cap];
}
}
}
}
The problem is that the first routine takes 5ms to run, while the second routine, even by commenting the child_kernel launch, takes 23ms, with practically all the time spent in the cudaMalloc API.
Since in dynamic programming one would often need allocating memory space to exchange data with the child kernels, and the only solution seems to be global memory taking so much time, it seems to me that one serious bottleneck of the usefulness of dynamic programming is the data exchange, unless there is a way to circumvent the global memory allocation issue.
The question then is: is there any workaround to the mentioned issue, namely, taking so much time when allocating global memory from within a kernel?. Thanks
SOLUTION PROPOSED IN THE COMMENTS
Allocate the required global memory from outside the parent kernel. I have verified that allocating the required global memory from outside the parent kernel is much faster.
You are calling cudaMalloc from each thread where i < M which means that you are making M cudaMalloc calls.
The bigger M is the worse it is going to get.
Instead you could make a single cudaMalloc call from the first thread of the block allocating M times the size that you used before (actually in your case you should allocate more, so each block is properly aligned). After that sync the threads and you can start your child kernels with correctly computed phi_cap address for each child kernel.
Alternatively (if your specific situation allows you to allocate enough memory that you can hold on to between the kernel calls) you could allocate the memory once outside of the kernel and reuse it. That would be a lot quicker. If M varies between kernel calls you could allocate as much as you would need for the biggest M.

Retaining dot product on GPGPU using CUBLAS routine

I am writing a code to compute dot product of two vectors using CUBLAS routine of dot product but it returns the value in host memory. I want to use the dot product for further computation on GPGPU only. How can I make the value reside on GPGPU only and use it for further computations without making an explicit copy from CPU to GPGPU?
You can do this in CUBLAS as long as you use the "V2" API. The newer API includes a function cublasSetPointerMode which you can use to set the API to assume that all routines which return a scalar value will be passed a device pointer rather than a host pointer. This is discussed in Section 2.4 of the latest CUBLAS documentation. For example:
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <stdio.h>
int main(void)
{
const int nvals = 10;
const size_t sz = sizeof(double) * (size_t)nvals;
double x[nvals], y[nvals];
double *x_, *y_, *result_;
double result=0., resulth=0.;
for(int i=0; i<nvals; i++) {
x[i] = y[i] = (double)(i)/(double)(nvals);
resulth += x[i] * y[i];
}
cublasHandle_t h;
cublasCreate(&h);
cublasSetPointerMode(h, CUBLAS_POINTER_MODE_DEVICE);
cudaMalloc( (void **)(&x_), sz);
cudaMalloc( (void **)(&y_), sz);
cudaMalloc( (void **)(&result_), sizeof(double) );
cudaMemcpy(x_, x, sz, cudaMemcpyHostToDevice);
cudaMemcpy(y_, y, sz, cudaMemcpyHostToDevice);
cublasDdot(h, nvals, x_, 1, y_, 1, result_);
cudaMemcpy(&result, result_, sizeof(double), cudaMemcpyDeviceToHost);
printf("%f %f\n", resulth, result);
cublasDestroy(h);
return 0;
}
Using CUBLAS_POINTER_MODE_DEVICE makes cublasDdot assume that result_ is a device pointer, and there is no attempt made to copy the result back to the host. Note that this makes routines like dot asynchronous, so you might need to keep on eye on synchronization between device and host.
You can't, exactly, using CUBLAS. As per talonmies' answer, starting with the CUBLAS V2 api (CUDA 4.0) the return value can be a device pointer. Refer to his answer. But if you are using the V1 API it's a single value, so it's pretty trivial to pass it as an argument to a kernel that uses it—you don't need an explicit cudaMemcpy (but there is one implied in order to return a host value).
Starting with the Tesla K20 GPU and CUDA 5, you will be able to call CUBLAS routines from device kernels using CUDA Dynamic Parallelism. This means you would be able to call cublasSdot (for example) from inside a __global__ kernel function, and your result would therefore be returned on the GPU.
Set pointer mode to device using cublasSetPointerMode().
From cuBLAS docs:
cublasSetPointerMode()
This function sets the pointer mode used by the cuBLAS library. The default is for the values to be passed by reference on the host.
Example:
cublasHandle_t handle;
cublasCreate(&handle);
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); // Make the values be passed by reference on the device.
Warning: cublasSetPointerMode also affects pointers used as input parameters (e.g., alpha for cublasSgemm). You will need to store the parameters on the device or set the pointer mode back to host mode.

CUDA: Allocating 2D array on GPU

I have already read the following thread , but I couldn't get my code to work.
I am trying to allocate a 2D array on GPU, fill it with values, and copy it back to the CPU. My code is as follows:
__global__ void Kernel(char **result,int N)
{
//do something like result[0][0]='a';
}
int N=20;
int Count=5;
char **result_h=(char**)malloc(sizeof(char*)*Count);
char **result_d;
cudaMalloc(&result_d, sizeof(char*)*Count);
for(int i=0;i<Count;i++)
{
result_h[i] = (char*)malloc(sizeof(char)*N);
cudaMalloc(&result_d[i], sizeof(char)*N); //get exception here
}
//call kernel
//copy values from result_d to result_h
printf("%c",result_h[0][0])//should print a
How can i achieve this?
You can't manipulate device pointers in host code, which is why the cudaMalloc call inside the loop fails. You should probably just allocate a single contiguous block of memory and then treat that as a flattened 2D array.
For doing the simplest 2D operations on a GPU, I'd recommend you just treat it as a 1D array. cudaMalloc a block of size w*h*sizeof(char). You can access the element (i,j) through index j*w+i.
Alternatively, you could use cudaMallocArray to get a 2D array. This has a better sense of locality than linear mapped 2D memory. You can easily bind this to a texture, for example.
Now in terms of your example, the reason why it doesn't work is that cudaMalloc manipulates a host pointer to point at a block of device memory. Your example allocated the pointer structure for results_d on the device. If you just change the cudaMalloc call for results_d to a regular malloc, it should work as you originally intended.
That said, perhaps one of the two options I outlined above might work better from an ease of code maintenance perspective.
When allocating in that way you are allocating addresses that are valid on the CPU memory.
The value of the addresses is transferred as a number without problems, but once on the device memory the char* address will not have meaning.
Create an array of N * max text length, and another array of length N that tells how long each word is.
This is a bit more advanced but if you are processing a set of defined text (passwords for example)
I would suggest you to group it by text length and create specialized kernel for each length
template<int text_width>
__global__ void Kernel(char *result,int N)
{
//pseudocode
for i in text_width:
result[idx][i] = 'a'
}
and in the kernel invocation code you specify:
switch text_length
case 16:
Kernel<16> <<<>>> ()
The following code sample allocates a width×height 2D array of floating-point values and shows how to loop over the array elements in device code[1]
// host code
float* devPtr;
int pitch;
cudaMallocPitch((void**)&devPtr, &pitch, width * sizeof(float), height);
myKernel<<<100, 192>>>(devPtr, pitch);
// device code
__global__ void myKernel(float* devPtr, int pitch)
{
for (int r = 0; r < height; ++r) {
float* row = (float*)((char*)devPtr + r * pitch);
for (int c = 0; c < width; ++c) {
float element = row[c]; }
}
}
The following code sample allocates a width×height CUDA array of one 32-bit
floating-point component[1]
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
cudaArray* cuArray;
cudaMallocArray(&cuArray, &channelDesc, width, height);
The following code sample copies the 2D array to the CUDA array allocated in the
previous code samples[1]:
cudaMemcpy2DToArray(cuArray, 0, 0, devPtr, pitch, width * sizeof(float), height,
cudaMemcpyDeviceToDevice);
The following code sample copies somehost memory array to device memory[1]:
float data[256];
int size = sizeof(data);
float* devPtr;
cudaMalloc((void**)&devPtr, size);
cudaMemcpy(devPtr, data, size, cudaMemcpyHostToDevice);
you can understand theses examples and apply them in your purpose.
[1] NVIDIA CUDA Compute Unified Device Architecture