Thrust error with CUDA separate compilation

I'm running into an error when I compile CUDA code with relocatable device code enabled (-rdc=true). I'm using Visual Studio 2013 with CUDA 7.5. Below is a small example that reproduces the error. To clarify, the code runs fine with -rdc=false, but the error appears as soon as the option is set to true.
The error simply says: CUDA error 11 [\cuda\detail\cub\device\dispatch/device_radix_sort_dispatch.cuh, 687]: invalid argument
Then I found this, which says:
When invoked with primitive data types, thrust::sort, thrust::sort_by_key, thrust::stable_sort, thrust::stable_sort_by_key may fail to link in some cases with nvcc -rdc=true.
Is there some workaround to allow separate compilation?
main.cpp:
#include <stdio.h>
#include <vector>
#include "cuda_runtime.h"
#include "RadixSort.h"

typedef unsigned int uint;
typedef unsigned __int64 uint64;

int main()
{
    RadixSort sorter;

    uint n = 10;
    std::vector<uint64> test(n);
    for (uint i = 0; i < n; i++)
        test[i] = i + 1;

    uint64 * d_array;
    uint64 size = n * sizeof(uint64);
    cudaMalloc(&d_array, size);
    cudaMemcpy(d_array, test.data(), size, cudaMemcpyHostToDevice);

    try
    {
        sorter.Sort(d_array, n);
    }
    catch (const std::exception & ex)
    {
        printf("%s\n", ex.what());
    }
}
RadixSort.h:
#pragma once

typedef unsigned int uint;
typedef unsigned __int64 uint64;

class RadixSort
{
public:
    RadixSort() {}
    ~RadixSort() {}

    void Sort(uint64 * input, const uint n);
};
RadixSort.cu:
#include "RadixSort.h"
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/sort.h>
void RadixSort::Sort(uint64 * input, const uint n)
{
thrust::device_ptr<uint64> d_input = thrust::device_pointer_cast(input);
thrust::stable_sort(d_input, d_input + n);
cudaDeviceSynchronize();
}

As mentioned in the comments by Robert Crovella:
Changing the CUDA architecture to a higher value solves this problem. In my case I changed it to compute_30 and sm_30 under CUDA C++ -> Device -> Code Generation.
Edit:
The general recommendation is to select the architecture that best fits your specific GPU. See the link in the comments for additional information.
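For reference, a minimal command-line equivalent of that project setting (assuming the two source files above; sm_30 is only an example and should match your GPU):

$ nvcc -rdc=true -gencode arch=compute_30,code=sm_30 main.cpp RadixSort.cu -o radixsort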

Related

Unified memory and struct with arrays

I have a big struct of arrays of structs on CUDA that is constant and read-only for my application. A much simplified example would be:
struct Node{
    int* pos;
    int nPos;
};

struct Graph{
    Node * nodes;
    int nNode;
};
My kernels would need to navigate this graph and query it. As you know, copying this struct to GPU memory with cudaMalloc and cudaMemcpy is just a lot of code that unified memory is supposed to make unnecessary.
In my code, I generated the graph in CPU and then, for testing, I designed the following kernel
__global__ void testKernel(const Graph graph, int * d_res){
    d_res[0] = graph.nNode;
}
being called as:
// using malloc for testing to make sure I know what I am doing
int * d_res, * h_res;
cudaMalloc((void **)&d_res, sizeof(int));
h_res = (int*)malloc(sizeof(int));
testKernel<<<1,1>>>(graph, d_res);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaMemcpy(h_res, d_res, sizeof(int), cudaMemcpyDeviceToHost));
with the error checks from here.
When I use the testKernel as is shown, it works fine, but if I change the kernel to:
__global__ void testKernel(const Graph graph, int * d_res){
    d_res[0] = graph.nodes[0].nPos;
}
I get illegal memory access errors.
Is this because the unified memory does not handle this type of data correctly?
Is there a way to make sure I can avoid writing all the explicit copies to GPU memory?
Full MCVE:
#include <algorithm>
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime_api.h>
#include <cuda.h>

typedef struct node{
    int* pos;
    int nPos;
}Node;

typedef struct Graph{
    Node * nodes;
    int nNode;
}Graph;

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

__global__ void testKernel(const Graph graph, int * d_res){
    d_res[0] = graph.nNode;
    // d_res[0] = graph.nodes[0].nPos; // Not working
}

int main(void){
    // fake data, this comes from another process
    Graph graph;
    graph.nodes = (Node*)malloc(2 * sizeof(Node));
    graph.nNode = 2;
    for (int i = 0; i < 2; i++){
        // They can have different sizes in the original code
        graph.nodes[i].pos = (int*)malloc(3 * sizeof(int));
        graph.nodes[i].pos[0] = 0;
        graph.nodes[i].pos[1] = 1;
        graph.nodes[i].pos[2] = 2;
        graph.nodes[i].nPos = 3;
    }
    printf("%d\n", graph.nNode); // Change to the kernel variable for comparison

    int * d_res, *h_res;
    cudaMalloc((void **)&d_res, sizeof(int));
    h_res = (int*)malloc(sizeof(int));
    testKernel<<<1, 1>>>(graph, d_res);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaMemcpy(h_res, d_res, sizeof(int), cudaMemcpyDeviceToHost));
    printf("%d", h_res[0]);
    return 0;
}
Your code isn't using CUDA unified memory. UM is not "automatic" in any way. It requires specific programming steps to take advantage of it and it has specific system requirements.
All of this is covered in the UM section of the programming guide.
Is there a way to make sure I can avoid writing all the explicit copies to GPU memory?
Proper use of UM should allow this. Here is a fully worked example. The only thing I have done is mechanically convert your malloc operations in host code to equivalent cudaMallocManaged operations.
$ cat t1389.cu
#include <algorithm>
#include <stdio.h>

typedef struct node{
    int* pos;
    int nPos;
}Node;

typedef struct Graph{
    Node * nodes;
    int nNode;
}Graph;

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

__global__ void testKernel(const Graph graph, int * d_res){
    d_res[0] = graph.nNode;
    d_res[0] = graph.nodes[0].nPos; // works now that the allocations are managed
}

int main(void){
    // fake data, this comes from another process
    Graph graph;
    cudaMallocManaged(&(graph.nodes), 2 * sizeof(Node));
    graph.nNode = 2;
    for (int i = 0; i < 2; i++){
        // They can have different sizes in the original code
        cudaMallocManaged(&(graph.nodes[i].pos), 3 * sizeof(int));
        graph.nodes[i].pos[0] = 0;
        graph.nodes[i].pos[1] = 1;
        graph.nodes[i].pos[2] = 2;
        graph.nodes[i].nPos = 3;
    }
    printf("%d\n", graph.nNode); // Change to the kernel variable for comparison

    int * d_res, *h_res;
    cudaMalloc((void **)&d_res, sizeof(int));
    h_res = (int*)malloc(sizeof(int));
    testKernel<<<1, 1>>>(graph, d_res);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaMemcpy(h_res, d_res, sizeof(int), cudaMemcpyDeviceToHost));
    printf("%d", h_res[0]);
    return 0;
}
$ nvcc t1389.cu -o t1389
$ cuda-memcheck ./t1389
========= CUDA-MEMCHECK
2
3========= ERROR SUMMARY: 0 errors
$
UM has a number of system requirements that are documented. I'm not going to try to recite them all here. Primarily you need a cc3.0 or higher GPU. Your MCVE did not include any standard error checking, and I didn't try to add it. But if you still have problems with this code, be sure to use proper CUDA error checking and run it with cuda-memcheck.
If your entire data structure, including embedded pointers, is allocated using ordinary host allocators, and you have no control over that, then you won't be able to use it directly in a UM regime, without doing some sort of involved copying. The exception here would be on an IBM Power9 system as mentioned in section K.1.6 of the above linked programming guide section.
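For illustration, a minimal sketch of what that involved copying would look like for the Graph structure above (a hypothetical helper named copyGraphToDevice, relying on the Node/Graph definitions from the code above; one allocation and copy per embedded pointer):

// Hypothetical deep-copy helper: builds a device-usable Graph from a
// host-allocated one. The returned struct can be passed to a kernel by value.
Graph copyGraphToDevice(const Graph &h_graph){
    Graph d_graph = h_graph;                  // copies nNode by value
    // stage the Node array on the host, with device pointers patched in
    Node *staging = (Node*)malloc(h_graph.nNode * sizeof(Node));
    for (int i = 0; i < h_graph.nNode; i++){
        staging[i].nPos = h_graph.nodes[i].nPos;
        cudaMalloc(&(staging[i].pos), staging[i].nPos * sizeof(int));
        cudaMemcpy(staging[i].pos, h_graph.nodes[i].pos,
                   staging[i].nPos * sizeof(int), cudaMemcpyHostToDevice);
    }
    cudaMalloc(&(d_graph.nodes), h_graph.nNode * sizeof(Node));
    cudaMemcpy(d_graph.nodes, staging,
               h_graph.nNode * sizeof(Node), cudaMemcpyHostToDevice);
    free(staging);
    return d_graph;
}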
Before attempting to use a host allocator (e.g. malloc) with UM, you should first test the pageableMemoryAccessUsesHostPageTables property, as mentioned in that section.
That property currently won't be set on any system except a properly configured IBM Power9 system. No x86 system currently has this property set/available.
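A minimal sketch of that check, using the standard runtime API (device 0 assumed):

#include <stdio.h>
#include <cuda_runtime_api.h>

int main(void)
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    if (prop.pageableMemoryAccessUsesHostPageTables)
        printf("malloc'ed host memory is directly usable from device code\n");
    else
        printf("fall back to cudaMallocManaged or explicit copies\n");
    return 0;
}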

__CUDA_ARCH__ flag with Thrust execution policy

I have a __host__ __device__ function which is a wrapper that calls into the sort function of the thrust library. Inside this wrapper, I am using the __CUDA_ARCH__ macro to set the execution policy to thrust::device when called from the host and thrust::seq when called from the device. The following piece of code generates a runtime error:
#ifndef __CUDA_ARCH__
    thrust::stable_sort(thrust::device, data, data + num, customGreater<T>());
#else
    thrust::stable_sort(thrust::seq, data, data + num, customGreater<T>());
#endif
The error is:
Unexpected Standard exception:
What() is:merge_sort: failed on 2nd step: invalid device function
As per my understanding, __CUDA_ARCH__ can be used for conditional compilation. I would appreciate help understanding why this error is thrown.
It seems you are stepping on this issue. In a nutshell, thrust uses CUB functionality under the hood for certain algorithms (including sort). Your use of the __CUDA_ARCH__ macro in code that wraps thrust algorithm calls which use CUB interferes with CUB code that expects to be able to use this macro for all paths.
A possible workaround is to do "your own dispatch":
$ cat t142.cu
#include <iostream>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>

template <typename T>
struct customGreater {
    __host__ __device__ bool operator()(T &t1, T &t2){
        return (t1 > t2);}
};

template <typename T>
__host__ __device__
void my_sort_wrapper(T *data, size_t num){
    int hostdev = 0;   // 0=device code
#ifndef __CUDA_ARCH__
    hostdev = 1;       // 1=host code
#endif
    if (hostdev == 0) thrust::stable_sort(thrust::seq, data, data + num, customGreater<T>());
    else thrust::stable_sort(thrust::device, data, data + num, customGreater<T>());
}

template <typename T>
__global__ void my_dev_sort(T *data, size_t num){
    my_sort_wrapper(data, num);
}

typedef int mytype;
const size_t sz = 10;

int main(){
    mytype *d_data;
    cudaMalloc(&d_data, sz*sizeof(mytype));
    cudaMemset(d_data, 0, sz*sizeof(mytype));
    my_sort_wrapper(d_data, sz);
    my_dev_sort<<<1,1>>>(d_data, sz);
    cudaDeviceSynchronize();
}
$ nvcc t142.cu -o t142
$ cuda-memcheck ./t142
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$
With this arrangement, the use of the __CUDA_ARCH__ macro does not perturb the compilation of the thrust algorithms.
Another possible workaround is simply to use thrust::device policy for both cases (no dispatch - just the thrust algorithm call). Except in the case of CUDA Dynamic Parallelism, thrust::device will "decay" to thrust::seq when used in device code.
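For illustration, a minimal sketch of that second workaround (the wrapper name my_sort_nodispatch is made up; customGreater is the functor from the code above):

template <typename T>
__host__ __device__
void my_sort_nodispatch(T *data, size_t num){
    // a single thrust::device call: parallel sort when called from host code,
    // decaying to a sequential sort when called from device code (no CDP)
    thrust::stable_sort(thrust::device, data, data + num, customGreater<T>());
}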
I would expect that these suggestions would only be necessary/relevant when the thrust algorithm uses CUB functionality in the underlying implementation.
If you don't like this behavior, you could file a thrust issue.
Unfortunately, we can't fix this in Thrust. The trouble here is that the NVCC compiler needs to see all __global__ function template instantiations during host compilation (e.g. when __CUDA_ARCH__ is not defined), otherwise the kernels will be treated as unused and discarded. See this CUB GitHub issue for more details.
As Robert suggested, a workaround such as this should be fine:
#include <iostream>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>

template <typename T>
struct customGreater {
    __host__ __device__ bool operator()(T &t1, T &t2){
        return (t1 > t2);}
};

#if defined(__CUDA_ARCH__)
#define DEVICE_COMPILATION 1
#else
#define DEVICE_COMPILATION 0
#endif

template <typename T>
__host__ __device__
void my_sort(T *data, size_t num){
    if (DEVICE_COMPILATION)
        thrust::stable_sort(thrust::device, data, data + num, customGreater<T>());
    else
        thrust::stable_sort(thrust::seq, data, data + num, customGreater<T>());
}

template <typename T>
__global__ void my_dev_sort(T *data, size_t num){
    my_sort(data, num);
}

typedef int mytype;
const size_t sz = 10;

int main(){
    mytype *d_data;
    cudaMallocManaged(&d_data, sz*sizeof(mytype));
    cudaMemset(d_data, 0, sz*sizeof(mytype));
    my_sort(d_data, sz);
    my_dev_sort<<<1,1>>>(d_data, sz);
    cudaDeviceSynchronize();
    cudaFree(d_data);
}
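Note the design choice: because the dispatch is an ordinary runtime if rather than a preprocessor #if, both thrust::stable_sort calls stay visible to the compiler during both the host and device compilation passes, so the underlying kernel template instantiations are kept rather than discarded.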

Thrust not calling device function

I have the following simple CUDA-Thrust code which adds 10 to a device vector, but the function gets called on the host side instead of the device.
#include <algorithm>
#include <iostream>
#include <numeric>
#include <vector>
#include <stdio.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>

__host__ __device__ int add(int x){
#if defined(__CUDA_ARCH__)
    printf("In device\n");
#else
    printf("In host\n");
#endif
    return x+10;
}

int main(void)
{
    thrust::host_vector<int> H(4);
    H[0] = H[1] = H[2] = H[3] = 10;
    thrust::device_vector<int> data = H;
    std::transform(data.begin(), data.end(), data.begin(), add);
    return 0;
}
What am I doing wrong here?
The thrust quick start guide has good examples to follow.
It looks like you have several issues, some already pointed out.
If you want to use thrust, you should use thrust::transform, not std::transform. std::transform has no knowledge of the GPU or CUDA or thrust, and will dispatch the host version of your add function. I'm not sure what that would do exactly when you pass a thrust::device_vector to it.
Thrust algorithms need to use function objects (functors) rather than bare CUDA __device__ functions, for the reason indicated by Jared: the thrust algorithm call in your source code is actually host code, and that host code cannot discover the address of a bare __device__ function. With this fix, you can be pretty certain that thrust will dispatch the device code path when working on device vectors.
Here's a modification of your code:
$ cat t856.cu
#include <stdio.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>

struct my_func {
    __host__ __device__
    int operator()(int x){
#if defined(__CUDA_ARCH__)
        printf("In device, x is %d\n", x);
#else
        printf("In host, x is %d\n", x);
#endif
        return x+10;
    }
};

int main(void)
{
    thrust::host_vector<int> H(4);
    H[0] = H[1] = H[2] = H[3] = 10;
    thrust::device_vector<int> data = H;
    thrust::transform(data.begin(), data.end(), data.begin(), my_func());
    return 0;
}
$ nvcc -o t856 t856.cu
$ ./t856
In device, x is 10
In device, x is 10
In device, x is 10
In device, x is 10
$
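As a related sketch (assuming a CUDA 8 or newer toolkit, which is newer than what the question used): nvcc's --expt-extended-lambda flag allows a device lambda in place of the named functor:

// build: nvcc --expt-extended-lambda t856_lambda.cu -o t856_lambda
#include <thrust/device_vector.h>
#include <thrust/transform.h>

int main(void)
{
    thrust::device_vector<int> data(4, 10);
    // device lambda in place of the my_func functor above
    thrust::transform(data.begin(), data.end(), data.begin(),
                      [] __device__ (int x) { return x + 10; });
    return 0;
}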

determining the size limit of the array when writing a CUDA kernel for multi-GPU using the Thrust library

I am trying to write a CUDA kernel which will use multi-GPU and thrust library features. I used some tips from previous posts. I tried to write a simple addition kernel; my obvious intention is to use more complicated kernels later.
My code is as follows:
#include "test.h"
int main(int argc, char *argv[])
{
int num_gpus = 0; // number of CUDA GPUs
// determine the number of CUDA capable GPUs
cudaGetDeviceCount(&num_gpus);
printf("number of CUDA devices:\t%d\n", num_gpus);
typedef thrust::device_vector<int> dvec;
typedef dvec *p_dvec;
// Declaring Vectors
std::vector<p_dvec> dvecs1;
std::vector<p_dvec> dvecs2;
std::vector<p_dvec> dvecs3;
std::vector<double>p(num_gpus);
dim3 DimGrid((DSIZE-1)/16.0 +1,1,1);
dim3 DimBlock(16.0,1,1);
// Initialize Vectors
for(unsigned int i = 0; i < num_gpus; i++) {
cudaSetDevice(i);
p_dvec temp1 = new dvec(DSIZE);
dvecs1.push_back(temp1);
thrust::fill((*(dvecs1[i])).begin(),(*(dvecs1[i])).end(),1.0);
p_dvec temp2 = new dvec(DSIZE);
dvecs2.push_back(temp2);
thrust::fill((*(dvecs2[i])).begin(),(*(dvecs2[i])).end(),2.0);
}
// Launching The Kernel
for(unsigned int i = 0; i < num_gpus; i++) {
cudaSetDevice(i);
p_dvec temp = new dvec(DSIZE);
dvecs3.push_back(temp);
fooKernel<<<DimGrid,DimBlock>>>(convertToKernel(*dvecs1[i])),convertToKernel(*(dvecs2[i])),convertToKernel(*(dvecs3[i])));
// Reduction Operation
p[i]= thrust::reduce((*(dvecs3[i])).begin(),(*(dvecs3[i])).end(), (double) 0, thrust::plus<double>());
std::cout<<*((*(dvecs3[i])).begin())<<std::endl;
std::cout<<p[i]<<std::endl;
}
printf("Success\n");
return 0;
}
and the header file is as follows:
#include <stdio.h>
#include <cstdio>
#include <stdlib.h>
#include <cstdlib>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <thrust/reduce.h>
#include <thrust/functional.h>

#define DSIZE 1048560

template < typename T >
struct KernelArray
{
    T* _array;
    int _size;
};

// Function to convert device_vector to structure
template < typename T >
KernelArray< T > convertToKernel( thrust::device_vector< T >& dVec )
{
    KernelArray< T > kArray;
    kArray._array = thrust::raw_pointer_cast( &dVec[0] );
    kArray._size = ( int ) dVec.size();
    return kArray;
}

template< typename scalartype >
__global__ void fooKernel( KernelArray< scalartype > Array1, KernelArray< scalartype > Array2, KernelArray< scalartype > Array3 )
{
    size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < DSIZE)
        Array3._array[i] = Array2._array[i] + Array1._array[i];
}
Now if DSIZE > 1048560, then the result is 0.
I have a few questions:
1) How do I determine the size limit of the vector? I have 8 devices.
2) Is there any way to increase the size of the data that I can use, or to improve the code?
3) When and where do I need cudaDeviceSynchronize()?
I would be happy if someone could help me out.
If you had used proper CUDA error checking to find out if and which CUDA errors occurred, you would have gotten the following output after launching fooKernel with DSIZE > 1048560:
invalid argument
The reason for this error is that you can have at most 65535 blocks in one dimension (when compiling for compute capability 2.x or lower), and
1048560/16 = 65535
So you did not run into a size limit of the vector but into the maximum block limit.
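A minimal sketch of how to query the actual limit at runtime instead of hard-coding it (standard runtime API; compiling for compute capability 3.0 or higher raises the x-dimension limit to 2^31 - 1):

#include <stdio.h>
#include <cuda_runtime.h>

int main(void)
{
    int num_gpus = 0;
    cudaGetDeviceCount(&num_gpus);
    for (int i = 0; i < num_gpus; i++){
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, i);
        // maxGridSize holds the per-dimension grid limits for this device
        printf("device %d (%s): max grid x-dimension = %d\n",
               i, prop.name, prop.maxGridSize[0]);
    }
    return 0;
}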

How to advance iterator in thrust function

I'm doing some study on thrust, but I don't understand how to get the value that an iterator points to. An example is the following code:
#include <thrust/for_each.h>
#include <thrust/device_vector.h>
#include <iostream>
#include <vector>
using namespace std;

class ADD
{
private:
    typedef typename thrust::device_vector<int>::iterator PTR;
public:
    ADD(){}
    ~ADD(){}

    void setPtr(PTR &ptr)
    {this->ptr = ptr;}

    __host__ __device__
    void operator()(int &x)
    {
        // note that using printf in a __device__ function requires
        // code compiled for a GPU with compute capability 2.0 or
        // higher (nvcc --arch=sm_20)
        x += add();
    }

    __host__ __device__
    int add()
    {return *ptr++;}

private:
    PTR ptr;
};

int main()
{
    thrust::device_vector<int> d_vec(3);
    d_vec[0] = 0; d_vec[1] = 1; d_vec[2] = 2;
    thrust::device_vector<int>::iterator itr = d_vec.begin();

    ADD *addtest = new ADD();
    addtest->setPtr(itr);

    thrust::for_each(d_vec.begin(), d_vec.end(), *addtest);

    for(int i = 0; i < 3; i++)
        cout << d_vec[i] << endl;
    return 0;
}
When I compile this using nvcc -arch=sm_20 test.cu, I get the following warnings:
test.cu(28): warning: calling a host function("thrust::experimental::iterator_facade<thrust::detail::normal_iterator<thrust::device_ptr<int> > , thrust::device_ptr<int> , int, thrust::detail::cuda_device_space_tag, thrust::random_access_traversal_tag, thrust::device_reference<int> , long> ::operator *") from a __device__/__global__ function("printf_functor::add") is not allowed
test.cu(28): warning: calling a host function("thrust::experimental::iterator_facade<thrust::detail::normal_iterator<thrust::device_ptr<int> > , thrust::device_ptr<int> , int, thrust::detail::cuda_device_space_tag, thrust::random_access_traversal_tag, thrust::device_reference<int> , long> ::operator *") from a __device__/__global__ function("printf_functor::add") is not allowed
I cannot get this to compile. How can I solve this problem?
@Gang.Wang: I think you are just mixing up two different things: all the STL-like functionality, including for_each, device_vector iterators, etc., is just a "facade" which exists on the host only.
The operator(), on the other hand, contains the actual GPU code, which is compiled to a CUDA kernel and applied to each element of your vector in parallel. Hence, device_vector iterators are not accessible from your functor.
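For illustration, a minimal sketch of one way to restructure the example (a hypothetical rewrite: the per-call *ptr++ would also be a data race under a parallel for_each, so the values to add are supplied as a second input range instead):

#include <thrust/transform.h>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <iostream>

int main()
{
    thrust::device_vector<int> d_vec(3);
    d_vec[0] = 0; d_vec[1] = 1; d_vec[2] = 2;
    thrust::device_vector<int> d_add(3);
    d_add[0] = 0; d_add[1] = 1; d_add[2] = 2;

    // element-wise d_vec[i] += d_add[i]; thrust::plus runs on the device,
    // so no host-only iterator is dereferenced inside device code
    thrust::transform(d_vec.begin(), d_vec.end(), d_add.begin(),
                      d_vec.begin(), thrust::plus<int>());

    for (int i = 0; i < 3; i++)
        std::cout << d_vec[i] << std::endl;   // prints 0 2 4
    return 0;
}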