CUDA host and device using the same __constant__ memory

I have a device/host function that uses constant memory. It runs OK on the device, but on the host this memory seems to remain uninitialized.
#include <iostream>
#include <stdio.h>

const __constant__ double vals[2] = { 0.0, 1000.0 };

__device__ __host__ double f(size_t i)
{
    return vals[i];
}

__global__ void kern()
{
    printf("vals[%d] = %lf\n", threadIdx.x, vals[threadIdx.x]);
}

int main() {
    std::cerr << f(0) << " " << f(1) << std::endl;
    kern<<<1, 2>>>();
    cudaThreadSynchronize();
}
This prints (requires CC 2.0 or above)
0 0
vals[0] = 0.000000
vals[1] = 1000.000000
What is the problem and how can I get both device and host memory constants initialized simultaneously?

Since CygnusX1 misunderstood what I meant in my comment on MurphEngineer's answer, maybe I should post my own answer. What I meant was this:
__constant__ double dc_vals[2] = { 0.0, 1000.0 };
const double hc_vals[2] = { 0.0, 1000.0 };

__device__ __host__ double f(size_t i)
{
#ifdef __CUDA_ARCH__
    return dc_vals[i];
#else
    return hc_vals[i];
#endif
}
This has the same result as Cygnus', but it is more flexible in the face of real code: it lets you have runtime-defined values in your constant arrays, for example, and allows you to use CUDA API functions like cudaMemcpyToSymbol/cudaMemcpyFromSymbol on the __constant__ array.
A more realistic complete example:
#include <iostream>
#include <stdio.h>

__constant__ double dc_vals[2];
const double hc_vals[2];

__device__ __host__ double f(size_t i)
{
#ifdef __CUDA_ARCH__
    return dc_vals[i];
#else
    return hc_vals[i];
#endif
}

__global__ void kern()
{
    printf("vals[%d] = %lf\n", threadIdx.x, vals[threadIdx.x]);
}

int main() {
    hc_vals[0] = 0.0;
    hc_vals[1] = 1000.0;
    cudaMemcpyToSymbol(dc_vals, hc_vals, 2 * sizeof(double), 0, cudaMemcpyHostToDevice);
    std::cerr << f(0) << " " << f(1) << std::endl;
    kern<<<1, 2>>>();
    cudaThreadSynchronize();
}

I think MurphEngineer explained well why it does not work.
To quickly fix this problem, you can follow harrism's idea, something like this:
#ifdef __CUDA_ARCH__
#define CONSTANT __constant__
#else
#define CONSTANT
#endif
const CONSTANT double vals[2] = { 0.0, 1000.0 };
This way the host compilation will create a normal host const array, while the device compilation will create a __constant__ array in device memory.
Do note that with this trick it might be harder to use the CUDA API to access that device array with functions like cudaMemcpyToSymbol() if you ever decide to do so.

Using the __constant__ qualifier explicitly allocates that memory on the device. There is no way to access that memory from the host -- not even with the new CUDA Unified Addressing stuff (that only works for memory allocated with cudaMalloc() and its friends). Qualifying the variable with const just says "this is a constant pointer to (...)".
The correct way to do this is, indeed, to have two arrays: one on the host, and one on the device. Initialize your host array, then use cudaMemcpyToSymbol() to copy data to the device array at runtime. For more information on how to do this, see this thread: http://forums.nvidia.com/index.php?showtopic=69724
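A minimal sketch of that two-array pattern (the identifiers here are illustrative, not from the question):

__constant__ double d_vals[2];  // device-side copy, a __constant__ symbol
static double h_vals[2];        // host-side copy, filled at runtime

void init_vals()
{
    h_vals[0] = 0.0;
    h_vals[1] = 1000.0;
    // copy the host array into the device __constant__ symbol;
    // offset and kind default to 0 and cudaMemcpyHostToDevice
    cudaMemcpyToSymbol(d_vals, h_vals, sizeof(h_vals));
}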

Absolutely great. I was struggling with the same issue and this provides a solution. However, the code suggested by harrism gives errors on compilation. Here is the fixed code which compiles correctly with nvcc:
#include <iostream>
#include <stdio.h>

__constant__ double dc_vals[2];           // device copy, filled at runtime
const double hc_vals[2] = {0.0, 1000.0};  // host copy, initialized statically

__device__ __host__ double f(size_t i)
{
#ifdef __CUDA_ARCH__
    return dc_vals[i];
#else
    return hc_vals[i];
#endif
}

__global__ void kern()
{
    printf("Device: vals[%d] = %lf\n", threadIdx.x, f(threadIdx.x));
}

int main() {
    // copy the host constants into the device __constant__ symbol
    cudaMemcpyToSymbol(dc_vals, hc_vals, 2 * sizeof(double), 0, cudaMemcpyHostToDevice);
    std::cerr << "Host: " << f(0) << " " << f(1) << std::endl;
    kern<<<1, 2>>>();
    cudaThreadSynchronize();
}
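For reference, compiling and running this (the file name constmem.cu is just illustrative) should print matching values from both sides, along these lines:

$ nvcc -o constmem constmem.cu
$ ./constmem
Host: 0 1000
Device: vals[0] = 0.000000
Device: vals[1] = 1000.000000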

Related

Thrust error with CUDA separate compilation

I'm running into an error when I try to compile CUDA with relocatable device code enabled (-rdc=true). I'm using Visual Studio 2013 as the compiler with CUDA 7.5. Below is a small example that shows the error. To clarify, the code below runs fine when -rdc=false, but when it is set to true, the error shows up.
The error simply says: CUDA error 11 [\cuda\detail\cub\device\dispatch/device_radix_sort_dispatch.cuh, 687]: invalid argument
Then I found this, which says:
When invoked with primitive data types, thrust::sort, thrust::sort_by_key, thrust::stable_sort, thrust::stable_sort_by_key may fail to link in some cases with nvcc -rdc=true.
Is there some workaround to allow separate compilation?
main.cpp:
#include <stdio.h>
#include <vector>
#include "cuda_runtime.h"
#include "RadixSort.h"

typedef unsigned int uint;
typedef unsigned __int64 uint64;

int main()
{
    RadixSort sorter;
    uint n = 10;
    std::vector<uint64> test(n);
    for (uint i = 0; i < n; i++)
        test[i] = i + 1;
    uint64 * d_array;
    uint64 size = n * sizeof(uint64);
    cudaMalloc(&d_array, size);
    cudaMemcpy(d_array, test.data(), size, cudaMemcpyHostToDevice);
    try
    {
        sorter.Sort(d_array, n);
    }
    catch (const std::exception & ex)
    {
        printf("%s\n", ex.what());
    }
}
RadixSort.h:
#pragma once

typedef unsigned int uint;
typedef unsigned __int64 uint64;

class RadixSort
{
public:
    RadixSort() {}
    ~RadixSort() {}
    void Sort(uint64 * input, const uint n);
};
RadixSort.cu:
#include "RadixSort.h"
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/sort.h>
void RadixSort::Sort(uint64 * input, const uint n)
{
thrust::device_ptr<uint64> d_input = thrust::device_pointer_cast(input);
thrust::stable_sort(d_input, d_input + n);
cudaDeviceSynchronize();
}
As mentioned in the comments by Robert Crovella:
Changing the CUDA architecture to a higher value will solve this problem. In my case I changed it to compute_30 and sm_30 under CUDA C++ -> Device -> Code Generation.
Edit:
The general recommendation is to select the best-fit architecture for your specific GPU. See the link in the comments for additional information.
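For reference, a command-line equivalent of that project setting might look like this (a sketch; the file names are from the question, but pick the architecture that matches your GPU):

nvcc -rdc=true -gencode arch=compute_30,code=sm_30 main.cpp RadixSort.cu -o radixsort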

Function application elementwise in CUDA

After multiplying a matrix A by a vector x to obtain the result y, I want to apply a function h elementwise to y.
I want to obtain z = h(Ax), where h is applied elementwise to the vector Ax.
I know how to make the matrix/vector multiplication on the GPU (with cublas). Now I want h (which is my own function, coded in C++) to be applied to the resultant vector also in GPU, how can I do that?
Two possible approaches are:
Write your own CUDA kernel to perform the operation
Use thrust (e.g. thrust::for_each()).
Here is a worked example of both approaches:
$ cat t934.cu
#include <iostream>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <thrust/for_each.h>

#define DSIZE 4
#define nTPB 256

template <typename T>
__host__ __device__ T myfunc(T &d){
    return d + 5; // define your own function here
}

struct mytfunc
{
    template <typename T>
    __host__ __device__
    void operator()(T &d){
        d = myfunc(d);
    }
};

template <typename T>
__global__ void mykernel(T *dvec, size_t dsize){
    int idx = threadIdx.x+blockDim.x*blockIdx.x;
    if (idx < dsize) dvec[idx] = myfunc(dvec[idx]);
}

int main(){
    // first using kernel
    float *h_data, *d_data;
    h_data = new float[DSIZE];
    cudaMalloc(&d_data, DSIZE*sizeof(float));
    for (int i = 0; i < DSIZE; i++) h_data[i] = i;
    cudaMemcpy(d_data, h_data, DSIZE*sizeof(float), cudaMemcpyHostToDevice);
    mykernel<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(d_data, DSIZE);
    cudaMemcpy(h_data, d_data, DSIZE*sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 0; i < DSIZE; i++) std::cout << h_data[i] << ",";
    std::cout << std::endl;

    // then using thrust
    thrust::host_vector<float> hvec(h_data, h_data+DSIZE);
    thrust::device_vector<float> dvec = hvec;
    thrust::for_each(dvec.begin(), dvec.end(), mytfunc());
    thrust::copy_n(dvec.begin(), DSIZE, std::ostream_iterator<float>(std::cout, ","));
    std::cout << std::endl;
}
$ nvcc -o t934 t934.cu
$ ./t934
5,6,7,8,
10,11,12,13,
$
Note that in order to provide a complete example, I'm starting with a vector definition in host memory. If you already have the vector in device memory (perhaps as a result of computing y=Ax) then you can work directly on that, by passing that vector to the CUDA kernel, or using it directly in the thrust function, using a thrust::device_ptr wrapper (this method is covered in the thrust quick start guide).
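For example, a minimal sketch of that wrapping step, assuming d_data already holds DSIZE floats on the device as in the kernel path above:

thrust::device_ptr<float> d_ptr = thrust::device_pointer_cast(d_data);
thrust::for_each(d_ptr, d_ptr + DSIZE, mytfunc());  // operates in place on device data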
The assumption I've made here is you want to use an arbitrary function of one variable. This should handle pretty much arbitrary functions defined in myfunc. However, for some categories of functions that you may be interested in, you may be able to realize it with one or more CUBLAS calls as well.

Thrust not calling device function

I have the following simple CUDA/Thrust code which adds 10 to a device vector, but the function is getting called on the host side instead of the device.
#include <algorithm>
#include <iostream>
#include <numeric>
#include <vector>
#include <stdio.h>
#include <thrust/device_vector.h>

__host__ __device__ int add(int x){
#if defined(__CUDA_ARCH__)
    printf("In device\n");
#else
    printf("In host\n");
#endif
    return x+10;
}

int main(void)
{
    thrust::host_vector<int> H(4);
    H[0] = H[1] = H[2] = H[3] = 10;
    thrust::device_vector<int> data = H;
    std::transform(data.begin(), data.end(), data.begin(), add);
    return 0;
}
What am I doing wrong here?
The thrust quick start guide has good examples to follow.
It looks like you have several issues, some already pointed out.
If you want to use thrust, you should use thrust::transform, not std::transform. std::transform has no knowledge of the GPU or CUDA or thrust, and will dispatch the host version of your add function. I'm not sure what that would do exactly when you pass a thrust::device_vector to it.
Thrust algorithms need to use function objects (functors) rather than bare CUDA __device__ functions, for the reason indicated by Jared (the thrust algorithm in your source code is actually host code. That host code cannot discover the address of a bare __device__ function). With this fix, you can be pretty certain that thrust will dispatch the device code path when working on device vectors.
Here's a modification of your code:
$ cat t856.cu
#include <stdio.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>

struct my_func {
    __host__ __device__
    int operator()(int x){
#if defined(__CUDA_ARCH__)
        printf("In device, x is %d\n", x);
#else
        printf("In host, x is %d\n", x);
#endif
        return x+10;
    }
};

int main(void)
{
    thrust::host_vector<int> H(4);
    H[0] = H[1] = H[2] = H[3] = 10;
    thrust::device_vector<int> data = H;
    thrust::transform(data.begin(), data.end(), data.begin(), my_func());
    return 0;
}
$ nvcc -o t856 t856.cu
$ ./t856
In device, x is 10
In device, x is 10
In device, x is 10
In device, x is 10
$
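As a side note, thrust dispatches on the iterator type, so running the same functor over the host_vector should take the host branch instead, for example:

thrust::transform(H.begin(), H.end(), H.begin(), my_func());  // prints "In host, x is 10" four times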

CUDA pinned memory flushing from the device

CUDA 5, device capability 3.5, VS 2012, 64-bit Win 2012 Server.
There is no shared memory access between threads; every thread is standalone.
I am using pinned memory with zero-copy. From the host, I can only read the pinned memory the device has written after I issue a cudaDeviceSynchronize on the host.
I want to be able to:
Flush into the pinned memory as soon as the device has updated it.
Not block the device thread (maybe by copying asynchronously)
I tried calling __threadfence_system and __threadfence after each device write, but that didn't flush.
Below is a full sample CUDA code that demonstrates my question:
#include <conio.h>
#include <cstdio>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

__global__ void Kernel(volatile float* hResult)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    printf("Kernel %u: Before Writing in Kernel\n", tid);
    hResult[tid] = tid + 1;
    __threadfence_system();
    // expecting that the data is getting flushed to host here!
    printf("Kernel %u: After Writing in Kernel\n", tid);
    // time-waster for-loop (sleep)
    for (int timeWater = 0; timeWater < 100000000; timeWater++);
}

void main()
{
    size_t blocks = 2;
    volatile float* hResult;
    cudaHostAlloc((void**)&hResult, blocks * sizeof(float), cudaHostAllocMapped);
    Kernel<<<1,blocks>>>(hResult);
    int filledElementsCounter = 0;
    // naive polling implementation that could be implemented using
    // another host thread
    while (filledElementsCounter < blocks)
    {
        // blocks until the value changes; this moves sequentially
        // while threads have no order (fine for this sample).
        while(hResult[filledElementsCounter] == 0);
        printf("%f\n", hResult[filledElementsCounter]);
        filledElementsCounter++;
    }
    cudaFreeHost((void *)hResult);
    system("pause");
}
Currently this sample will wait indefinitely as nothing is being read from the device unless I issue cudaDeviceSynchronize. The sample below works, but it is NOT what I want as it defeats the purpose of async copying:
void main()
{
    size_t blocks = 2;
    volatile float* hResult;
    cudaHostAlloc((void**)&hResult, blocks * sizeof(float), cudaHostAllocMapped);
    Kernel<<<1,blocks>>>(hResult);
    cudaError_t error = cudaDeviceSynchronize();
    if (error != cudaSuccess) { throw; }
    for(int i = 0; i < blocks; i++)
    {
        printf("%f\n", hResult[i]);
    }
    cudaFreeHost((void *)hResult);
    system("pause");
}
I played with your code on CentOS 6.2 with CUDA 5.5 and a Tesla M2090 and can conclude this:
If it does not work on your system, it must be a driver issue, and I suggest that you get the TCC drivers.
I attached my code that runs fine and does what you want. The values appear on the host side before the kernel ends. As you can see, I added some compute code to prevent the for loop from being removed by compiler optimizations. I added a stream and a callback that gets executed after all work in the stream is finished. The program outputs 1 2 and for a long time does nothing, until stream finished... is printed to the console.
#include <iostream>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#define SEC_CUDA_CALL(val) checkCall ( (val), #val, __FILE__, __LINE__ )

bool checkCall(cudaError_t result, char const* const func, const char *const file, int const line)
{
    if (result != cudaSuccess)
    {
        std::cout << "CUDA (runtime api) error: " << func << " failed! " << cudaGetErrorString(result) << " (" << result << ") " << file << ":" << line << std::endl;
    }
    return result != cudaSuccess;
}

class Callback
{
public:
    static void CUDART_CB dispatch(cudaStream_t stream, cudaError_t status, void *userData);
private:
    void call();
};

void CUDART_CB Callback::dispatch(cudaStream_t stream, cudaError_t status, void *userData)
{
    Callback* cb = (Callback*) userData;
    cb->call();
}

void Callback::call()
{
    std::cout << "stream finished..." << std::endl;
}

__global__ void Kernel(volatile float* hResult)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    hResult[tid] = tid + 1;
    __threadfence_system();
    float A = 0;
    for (int timeWater = 0; timeWater < 100000000; timeWater++)
    {
        A = sin(cos(log(hResult[0] * hResult[1]))) + A;
        A = sqrt(A);
    }
}

int main(int argc, char* argv[])
{
    size_t blocks = 2;
    volatile float* hResult;
    SEC_CUDA_CALL(cudaHostAlloc((void**)&hResult, blocks*sizeof(float), cudaHostAllocMapped));
    cudaStream_t stream;
    SEC_CUDA_CALL(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    Callback obj;
    Kernel<<<1, blocks, 0, stream>>>(hResult);
    SEC_CUDA_CALL(cudaStreamAddCallback(stream, Callback::dispatch, &obj, 0));
    int filledElementsCounter = 0;
    while (filledElementsCounter < blocks)
    {
        while(hResult[filledElementsCounter] == 0);
        std::cout << hResult[filledElementsCounter] << std::endl;
        filledElementsCounter++;
    }
    SEC_CUDA_CALL(cudaStreamDestroy(stream));
    SEC_CUDA_CALL(cudaFreeHost((void *)hResult));
}
No call returned an error and cuda-memcheck didn't find any problems. This works as intended. You should really try the TCC driver.
You cannot pass the host pointer directly to the kernel. If you allocate host memory using cudaHostAlloc with cudaHostAllocMapped flag, then first you have to retrieve the device pointer of the mapped host memory before you can use it in the kernel. Use cudaHostGetDevicePointer to get the device pointer of mapped host memory.
float *hResult, *dResult;
cudaHostAlloc((void**)&hResult, blocks * sizeof(float), cudaHostAllocMapped);
cudaHostGetDevicePointer((void**)&dResult, hResult, 0);  // the flags argument must be 0
Kernel<<<1, blocks>>>(dResult);
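One hedged caveat: on older CUDA setups the runtime also required mapped-memory support to be enabled before the first allocation, along these lines:

cudaSetDeviceFlags(cudaDeviceMapHost);  // must precede cudaHostAlloc(..., cudaHostAllocMapped)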
Calling __threadfence_system() will ensure that the write is visible to the system before proceeding, but your CPU will be caching the hResult variable and hence you will just be spinning on the old value in an infinite loop. Try marking hResult as volatile.

CUDA shared object between threads

I am totally new to CUDA. I want to create one object on the device and access its members from different threads. I use nvcc -arch=sm_20 (on a Tesla M2090), and if I run my code I get an 'unspecified launch failure'. Here is my code:
#include <stdio.h>
#include <string>

using namespace std;

#ifdef __CUDACC__
#define CUDA_CALLABLE __host__ __device__
#else
#define CUDA_CALLABLE
#endif

class SimpleClass {
public:
    int i;
    CUDA_CALLABLE SimpleClass() { i = 1; }
    CUDA_CALLABLE ~SimpleClass() {}
};

__global__ void initkernel(SimpleClass *a){
    a = new SimpleClass();
}

__global__ void delkernel(SimpleClass *a){
    delete a;
}

__global__ void kernel(SimpleClass *a){
    printf("%d\n", a->i);
}

int main() {
    SimpleClass *a;
    initkernel<<<1,1>>>(a);
    cudaThreadSynchronize();
    kernel<<<1,10>>>(a);
    cudaThreadSynchronize();
    delkernel<<<1,1>>>(a);
    cudaThreadSynchronize();
    cudaError_t error = cudaGetLastError();
    string lastError = cudaGetErrorString(error);
    printf("%s\n", lastError.c_str());
    return 0;
}
You get the 'unspecified launch failure' during your first kernel because 'a' is a pointer stored on the host, but you want to give it a value from a device function. If you want to allocate the object on the device, then you first have to allocate a pointer on the device, and then you can read and write it from device (kernel) code, but be careful because it will require double indirection.
Your code should look something like this (the rest of the functions should be modified similarly):
__global__ void initkernel(SimpleClass** a){
    *a = new SimpleClass();
}

int main() {
    SimpleClass** a;
    cudaMalloc((void**)&a, sizeof(SimpleClass*));
    initkernel<<<1,1>>>(a);
    cudaThreadSynchronize();
}
P.S.: pQB is absolutely right about that; you should do an error check after each kernel call to detect errors as soon as possible (and, currently, to find the exact location of the error in your code).
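A minimal sketch of such a check (the macro is illustrative, not pQB's exact code):

#define CHECK_LAST_CUDA_ERROR() do { \
    cudaError_t err = cudaGetLastError(); \
    if (err != cudaSuccess) \
        printf("CUDA error %s at %s:%d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
} while (0)

// usage: check right after each launch instead of only once at the end
initkernel<<<1,1>>>(a);
cudaThreadSynchronize();
CHECK_LAST_CUDA_ERROR();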