Using CUDA Thrust algorithms sequentially on the host

I wish to compare a Thrust algorithm's runtime when executed sequentially on a single CPU core versus a parallel execution on a GPU.
Thrust provides the thrust::seq execution policy, but how can I explicitly target the host backend system? I wish to avoid executing the algorithm sequentially on the GPU.

CUDA Thrust is architecture agnostic. Accordingly, consider the code I provided as an answer to
Cumulative summation in CUDA
In that code, MatingProbability and CumulativeProbability were thrust::device_vectors, so thrust::transform and thrust::inclusive_scan automatically recognized this and operated accordingly on the GPU.
Below, I'm providing the same code with thrust::device_vector changed to thrust::host_vector. Again, thrust::transform and thrust::inclusive_scan automatically recognize that the vectors reside on the CPU and operate accordingly, dispatching the host backend (which is serial by default).
#include <thrust/host_vector.h>
#include <thrust/transform.h>
#include <thrust/scan.h>
#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/constant_iterator.h>
#include <cstdio>

template <class T>
struct scaling {
    const T _a;
    scaling(T a) : _a(a) { }
    __host__ __device__ T operator()(const T &x) const { return _a * x; }
};

int main()
{
    const int N = 20;

    double a  = -(double)N;
    double b  = 0.;
    double Dx = -1./(0.5*N*(N+1));

    thrust::host_vector<double> MatingProbability(N);
    thrust::host_vector<double> CumulativeProbability(N+1, 0.);

    thrust::transform(thrust::make_counting_iterator(a), thrust::make_counting_iterator(b), MatingProbability.begin(), scaling<double>(Dx));

    thrust::inclusive_scan(MatingProbability.begin(), MatingProbability.end(), CumulativeProbability.begin() + 1);

    for (int i = 0; i < N+1; i++)
    {
        double val = CumulativeProbability[i];
        printf("%d %3.15f\n", i, val);
    }

    return 0;
}
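
If you would rather name the backend explicitly instead of relying on iterator-based dispatch, Thrust also provides the thrust::host execution policy in <thrust/execution_policy.h>. Here is a minimal sketch on plain host arrays (the names in and out are just for illustration); the host backend is serial by default unless THRUST_HOST_SYSTEM selects OpenMP or TBB:

#include <thrust/execution_policy.h>
#include <thrust/scan.h>
#include <cstdio>

int main()
{
    const int N = 20;
    double in[N], out[N];
    for (int i = 0; i < N; i++) in[i] = 1.;

    // thrust::host explicitly dispatches the host backend,
    // regardless of what the iterator types would otherwise imply
    thrust::inclusive_scan(thrust::host, in, in + N, out);

    for (int i = 0; i < N; i++) printf("%d %3.15f\n", i, out[i]);
    return 0;
}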


Thrust error with CUDA separate compilation

I'm running into an error when I try to compile CUDA code with relocatable device code enabled (-rdc=true). I'm using Visual Studio 2013 with CUDA 7.5. Below is a small example that reproduces the error. To clarify, the code below runs fine with -rdc=false, but when it is set to true, the error shows up.
The error simply says: CUDA error 11 [\cuda\detail\cub\device\dispatch/device_radix_sort_dispatch.cuh, 687]: invalid argument
Then I found this, which says:
When invoked with primitive data types, thrust::sort, thrust::sort_by_key, thrust::stable_sort, thrust::stable_sort_by_key may fail to link in some cases with nvcc -rdc=true.
Is there some workaround to allow separate compilation?
main.cpp:
#include <stdio.h>
#include <vector>
#include "cuda_runtime.h"
#include "RadixSort.h"

typedef unsigned int uint;
typedef unsigned __int64 uint64;

int main()
{
    RadixSort sorter;

    uint n = 10;
    std::vector<uint64> test(n);
    for (uint i = 0; i < n; i++)
        test[i] = i + 1;

    uint64 * d_array;
    uint64 size = n * sizeof(uint64);
    cudaMalloc(&d_array, size);
    cudaMemcpy(d_array, test.data(), size, cudaMemcpyHostToDevice);

    try
    {
        sorter.Sort(d_array, n);
    }
    catch (const std::exception & ex)
    {
        printf("%s\n", ex.what());
    }
}
RadixSort.h:
#pragma once

typedef unsigned int uint;
typedef unsigned __int64 uint64;

class RadixSort
{
public:
    RadixSort() {}
    ~RadixSort() {}

    void Sort(uint64 * input, const uint n);
};
RadixSort.cu:
#include "RadixSort.h"
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/sort.h>
void RadixSort::Sort(uint64 * input, const uint n)
{
thrust::device_ptr<uint64> d_input = thrust::device_pointer_cast(input);
thrust::stable_sort(d_input, d_input + n);
cudaDeviceSynchronize();
}
As mentioned in the comments by Robert Crovella:
Changing the CUDA architecture to a higher value will solve this problem. In my case I changed it to compute_30 and sm_30 under CUDA C++ -> Device -> Code Generation.
Edit:
The general recommendation is to select the best-fit architecture for your specific GPU; see the link in the comments for additional information.
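
For reference, the equivalent build on the nvcc command line would look roughly like this (a sketch assuming sm_30; substitute your GPU's architecture):

nvcc -rdc=true -gencode arch=compute_30,code=sm_30 -o radixsort main.cpp RadixSort.cu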

Pass CUDA array to thrust::inclusive_scan

I can use inclusive_scan for an array on the CPU, but is it possible to do it with an array on the GPU? (The commented line is the variant that I know works but that I don't need.) Alternatively, are there any other easy methods to perform an inclusive scan on an array in device memory?
Code:
#include <stdio.h>
#include <stdlib.h>    /* for drand48() */
#include <unistd.h>    /* for getpid() */
#include <time.h>      /* for time() */
#include <math.h>
#include <assert.h>
#include <iostream>
#include <ctime>
#include <thrust/scan.h>
#include <cuda.h>

#ifdef DOUBLE
#define REAL double
#define MAXT 256
#else
#define REAL float
#define MAXT 512
#endif

#ifndef MIN
#define MIN(x,y) ((x < y) ? x : y)
#endif

using namespace std;

bool errorAsk(const char *s="n/a")
{
    cudaError_t err=cudaGetLastError();
    if(err==cudaSuccess)
        return false;
    printf("CUDA error [%s]: %s\n",s,cudaGetErrorString(err));
    return true;
};

double *fillArray(double *c_idata,int N,double constant) {
    int n;
    for (n = 0; n < N; n++) {
        c_idata[n] = constant*floor(drand48()*10);
    }
    return c_idata;
}

int main(int argc,char *argv[])
{
    int N,blocks,threads;
    N = 100;
    threads=MAXT;
    blocks=N/threads+(N%threads==0?0:1);

    double *c_data,*g_data;
    c_data = new double[N];
    c_data = fillArray(c_data,N,1);

    cudaMalloc(&g_data,N*sizeof(double));
    cudaMemcpy(g_data,c_data,N*sizeof(double),cudaMemcpyHostToDevice);

    thrust::inclusive_scan(g_data, g_data + N, g_data); // in-place scan on the GPU array (the call in question)

    cudaMemcpy(c_data,g_data,N*sizeof(double),cudaMemcpyDeviceToHost);
    // thrust::inclusive_scan(c_data, c_data + N, c_data); // in-place scan on the CPU (the variant that works)

    for(int i = 0; i < N; i++) {
        cout<<c_data[i]<<endl;
    }
}
If you read the thrust quick start guide you'll find one suggestion for handling "raw" device data: use a thrust::device_ptr:
You may wonder what happens when a "raw" pointer is used as an argument to a Thrust function. Like the STL, Thrust permits this usage and it will dispatch the host path of the algorithm. If the pointer in question is in fact a pointer to device memory then you'll need to wrap it with thrust::device_ptr before calling the function.
To fix your code, you would want to
#include <thrust/device_ptr.h>
and replace your existing call to thrust::inclusive_scan with the following 2 lines:
thrust::device_ptr<double> g_ptr = thrust::device_pointer_cast(g_data);
thrust::inclusive_scan(g_ptr, g_ptr + N, g_ptr); // in-place scan
Another approach would be to use a Thrust execution policy (this requires #include <thrust/execution_policy.h>) and modify your call like this:
thrust::inclusive_scan(thrust::device, g_data, g_data + N, g_data);
And there are various other possibilities as well.
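
Putting both variants together as a minimal sketch (the wrapper function scan_device_array is hypothetical; g_data and N are as in the question's code):

#include <thrust/scan.h>
#include <thrust/device_ptr.h>
#include <thrust/execution_policy.h>

// hypothetical helper: in-place inclusive scan of a raw device array
void scan_device_array(double *g_data, int N)
{
    // variant 1: wrap the raw device pointer so Thrust dispatches the device path
    thrust::device_ptr<double> g_ptr = thrust::device_pointer_cast(g_data);
    thrust::inclusive_scan(g_ptr, g_ptr + N, g_ptr);

    // variant 2: keep the raw pointer and name the backend explicitly
    // thrust::inclusive_scan(thrust::device, g_data, g_data + N, g_data);
}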

Function application elementwise in CUDA

After multiplying a matrix A by a vector x to obtain the result y, I want to apply a function h elementwise to y.
I want to obtain z = h(Ax), where h is applied elementwise to the vector Ax.
I know how to perform the matrix/vector multiplication on the GPU (with CUBLAS). Now I want h (my own function, coded in C++) to be applied to the resulting vector, also on the GPU. How can I do that?
Two possible approaches are:
Write your own CUDA kernel to perform the operation
Use thrust (e.g. thrust::for_each()).
Here is a worked example of both approaches:
$ cat t934.cu
#include <iostream>
#include <iterator>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <thrust/for_each.h>

#define DSIZE 4
#define nTPB 256

template <typename T>
__host__ __device__ T myfunc(T &d){
  return d + 5; // define your own function here
}

struct mytfunc
{
  template <typename T>
  __host__ __device__
  void operator()(T &d){
    d = myfunc(d);
  }
};

template <typename T>
__global__ void mykernel(T *dvec, size_t dsize){
  int idx = threadIdx.x+blockDim.x*blockIdx.x;
  if (idx < dsize) dvec[idx] = myfunc(dvec[idx]);
}

int main(){

  // first using kernel
  float *h_data, *d_data;
  h_data = new float[DSIZE];
  cudaMalloc(&d_data, DSIZE*sizeof(float));
  for (int i = 0; i < DSIZE; i++) h_data[i] = i;
  cudaMemcpy(d_data, h_data, DSIZE*sizeof(float), cudaMemcpyHostToDevice);
  mykernel<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(d_data, DSIZE);
  cudaMemcpy(h_data, d_data, DSIZE*sizeof(float), cudaMemcpyDeviceToHost);
  for (int i = 0; i < DSIZE; i++) std::cout << h_data[i] << ",";
  std::cout << std::endl;

  // then using thrust
  thrust::host_vector<float>   hvec(h_data, h_data+DSIZE);
  thrust::device_vector<float> dvec = hvec;
  thrust::for_each(dvec.begin(), dvec.end(), mytfunc());
  thrust::copy_n(dvec.begin(), DSIZE, std::ostream_iterator<float>(std::cout, ","));
  std::cout << std::endl;
}
$ nvcc -o t934 t934.cu
$ ./t934
5,6,7,8,
10,11,12,13,
$
Note that in order to provide a complete example, I'm starting with a vector definition in host memory. If you already have the vector in device memory (perhaps as a result of computing y=Ax), then you can work directly on that vector, either by passing it to the CUDA kernel or by using it directly in the thrust function via a thrust::device_ptr wrapper (this method is covered in the thrust quick start guide previously linked).
The assumption I've made here is that you want to use an arbitrary function of one variable. This should handle pretty much arbitrary functions defined in myfunc. However, for some categories of functions that you may be interested in, you may be able to realize them with one or more CUBLAS calls as well.
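
For completeness, a sketch of that device_ptr variant, reusing mytfunc from the example above (the helper name apply_h is just for illustration):

#include <thrust/device_ptr.h>
#include <thrust/for_each.h>

// apply the functor directly to a raw device allocation, e.g. the
// output of y = A*x computed with CUBLAS; no host round-trip needed
void apply_h(float *d_data, size_t n)
{
    thrust::device_ptr<float> d_begin = thrust::device_pointer_cast(d_data);
    thrust::for_each(d_begin, d_begin + n, mytfunc());
}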

Thrust not calling device function

I have the following simple CUDA-Thrust code which adds 10 to a device vector, but the function is getting called on the host side instead of the device.
#include <algorithm>
#include <iostream>
#include <numeric>
#include <vector>
#include <stdio.h>
#include <thrust/device_vector.h>

__host__ __device__ int add(int x){
#if defined(__CUDA_ARCH__)
    printf("In device\n");
#else
    printf("In host\n");
#endif
    return x+10;
}

int main(void)
{
    thrust::host_vector<int> H(4);
    H[0] = H[1] = H[2] = H[3] = 10;
    thrust::device_vector<int> data=H;
    std::transform(data.begin(), data.end(), data.begin(), add);
    return 0;
}
What am I doing wrong here?
The thrust quick start guide has good examples to follow.
It looks like you have several issues, some already pointed out.
If you want to use thrust, you should use thrust::transform, not std::transform. std::transform has no knowledge of the GPU, CUDA, or thrust, and will dispatch the host version of your add function. I'm not sure exactly what that would do when you pass it iterators from a thrust::device_vector.
Thrust algorithms need to use function objects (functors) rather than bare CUDA __device__ functions, for the reason indicated by Jared: the thrust algorithm call in your source code is actually host code, and that host code cannot discover the address of a bare __device__ function. With this fix, you can be pretty certain that thrust will dispatch the device code path when working on device vectors.
Here's a modification of your code:
$ cat t856.cu
#include <stdio.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>

struct my_func {
  __host__ __device__
  int operator()(int x){
#if defined(__CUDA_ARCH__)
    printf("In device, x is %d\n", x);
#else
    printf("In host, x is %d\n", x);
#endif
    return x+10;
  }
};

int main(void)
{
    thrust::host_vector<int> H(4);
    H[0] = H[1] = H[2] = H[3] = 10;
    thrust::device_vector<int> data=H;
    thrust::transform(data.begin(), data.end(), data.begin(), my_func());
    return 0;
}
$ nvcc -o t856 t856.cu
$ ./t856
In device, x is 10
In device, x is 10
In device, x is 10
In device, x is 10
$
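
As an aside, for an operation as simple as x+10 you can skip the functor entirely and use Thrust's placeholder expressions (a sketch; the printf diagnostics are dropped, since placeholders only express the arithmetic):

#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/functional.h>

int main(void)
{
    using namespace thrust::placeholders;

    thrust::device_vector<int> data(4, 10);
    // _1 + 10 builds the "add 10" functor inline
    thrust::transform(data.begin(), data.end(), data.begin(), _1 + 10);
    return 0;
}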

Determining the size limit of the array when writing a CUDA kernel for multi-GPU using the Thrust library

I am trying to write a CUDA kernel which will use multi-GPU and Thrust library features. I used some tips from previous posts and tried to write a simple addition kernel; my eventual intention is to use more complicated kernels.
My code is as follows:
#include "test.h"
int main(int argc, char *argv[])
{
int num_gpus = 0; // number of CUDA GPUs
// determine the number of CUDA capable GPUs
cudaGetDeviceCount(&num_gpus);
printf("number of CUDA devices:\t%d\n", num_gpus);
typedef thrust::device_vector<int> dvec;
typedef dvec *p_dvec;
// Declaring Vectors
std::vector<p_dvec> dvecs1;
std::vector<p_dvec> dvecs2;
std::vector<p_dvec> dvecs3;
std::vector<double>p(num_gpus);
dim3 DimGrid((DSIZE-1)/16.0 +1,1,1);
dim3 DimBlock(16.0,1,1);
// Initialize Vectors
for(unsigned int i = 0; i < num_gpus; i++) {
cudaSetDevice(i);
p_dvec temp1 = new dvec(DSIZE);
dvecs1.push_back(temp1);
thrust::fill((*(dvecs1[i])).begin(),(*(dvecs1[i])).end(),1.0);
p_dvec temp2 = new dvec(DSIZE);
dvecs2.push_back(temp2);
thrust::fill((*(dvecs2[i])).begin(),(*(dvecs2[i])).end(),2.0);
}
// Launching The Kernel
for(unsigned int i = 0; i < num_gpus; i++) {
cudaSetDevice(i);
p_dvec temp = new dvec(DSIZE);
dvecs3.push_back(temp);
fooKernel<<<DimGrid,DimBlock>>>(convertToKernel(*dvecs1[i])),convertToKernel(*(dvecs2[i])),convertToKernel(*(dvecs3[i])));
// Reduction Operation
p[i]= thrust::reduce((*(dvecs3[i])).begin(),(*(dvecs3[i])).end(), (double) 0, thrust::plus<double>());
std::cout<<*((*(dvecs3[i])).begin())<<std::endl;
std::cout<<p[i]<<std::endl;
}
printf("Success\n");
return 0;
}
and the header file is as follows:
#include <stdio.h>
#include <cstdio>
#include <stdlib.h>
#include <cstdlib>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <thrust/reduce.h>
#include <thrust/functional.h>

#define DSIZE 1048560

template < typename T >
struct KernelArray
{
    T* _array;
    int _size;
};

// Function to convert device_vector to structure
template < typename T >
KernelArray< T > convertToKernel( thrust::device_vector< T >& dVec )
{
    KernelArray< T > kArray;
    kArray._array = thrust::raw_pointer_cast( &dVec[0] );
    kArray._size  = ( int ) dVec.size();
    return kArray;
}

template < typename scalartype >
__global__ void fooKernel( KernelArray< scalartype > Array1, KernelArray< scalartype > Array2, KernelArray< scalartype > Array3 )
{
    size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < DSIZE)
        Array3._array[i] = Array2._array[i] + Array1._array[i];
}
Now if DSIZE > 1048560, then the result is 0.
I have a few questions:
1) How do I determine the size limit of the vector? I have 8 devices.
2) Is there any way to increase the size of the data that I can use, or to improve the code?
3) When and where do I need cudaDeviceSynchronize()?
I would be happy if someone could help me out.
If you had used proper CUDA error checking to find out if and which CUDA errors occurred, you would have gotten the following output after launching fooKernel with DSIZE > 1048560:
invalid argument
The reason for this error is that you can have at most 65535 blocks in one dimension of the grid, and
1048560/16 = 65535
So you did not run into a size limit of the vector, but into the maximum block limit.
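
For reference, a minimal sketch of the usual error-checking pattern mentioned above (the macro name cudaCheck is hypothetical; wrap every runtime call and check kernel launches explicitly):

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// hypothetical helper: abort with file/line on any CUDA runtime error
#define cudaCheck(call) do {                                    \
    cudaError_t err = (call);                                   \
    if (err != cudaSuccess) {                                   \
        fprintf(stderr, "CUDA error %s at %s:%d\n",             \
                cudaGetErrorString(err), __FILE__, __LINE__);   \
        exit(1);                                                \
    }                                                           \
} while (0)

// usage after a kernel launch:
//   fooKernel<<<DimGrid,DimBlock>>>(...);
//   cudaCheck(cudaGetLastError());       // catches launch errors such as "invalid argument"
//   cudaCheck(cudaDeviceSynchronize());  // catches errors during kernel execution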