Function application elementwise in CUDA

After multiplying a matrix A and a vector x obtaining the result y, I want to apply a function h elementwise to y.
I want to obtain z = h(Ax), where h is applied elementwise to the vector Ax.
I know how to do the matrix/vector multiplication on the GPU (with cublas). Now I want h (which is my own function, coded in C++) to be applied to the resulting vector, also on the GPU. How can I do that?

Two possible approaches are:
1) Write your own CUDA kernel to perform the operation.
2) Use thrust (e.g. thrust::for_each()).
Here is a worked example of both approaches:
$ cat t934.cu
#include <iostream>
#include <iterator>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <thrust/for_each.h>

#define DSIZE 4
#define nTPB 256

template <typename T>
__host__ __device__ T myfunc(T &d){
  return d + 5; // define your own function here
}

struct mytfunc
{
  template <typename T>
  __host__ __device__
  void operator()(T &d){
    d = myfunc(d);
  }
};

template <typename T>
__global__ void mykernel(T *dvec, size_t dsize){
  int idx = threadIdx.x+blockDim.x*blockIdx.x;
  if (idx < dsize) dvec[idx] = myfunc(dvec[idx]);
}

int main(){

  // first using kernel
  float *h_data, *d_data;
  h_data = new float[DSIZE];
  cudaMalloc(&d_data, DSIZE*sizeof(float));
  for (int i = 0; i < DSIZE; i++) h_data[i] = i;
  cudaMemcpy(d_data, h_data, DSIZE*sizeof(float), cudaMemcpyHostToDevice);
  mykernel<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(d_data, DSIZE);
  cudaMemcpy(h_data, d_data, DSIZE*sizeof(float), cudaMemcpyDeviceToHost);
  for (int i = 0; i < DSIZE; i++) std::cout << h_data[i] << ",";
  std::cout << std::endl;

  // then using thrust
  thrust::host_vector<float>   hvec(h_data, h_data+DSIZE);
  thrust::device_vector<float> dvec = hvec;
  thrust::for_each(dvec.begin(), dvec.end(), mytfunc());
  thrust::copy_n(dvec.begin(), DSIZE, std::ostream_iterator<float>(std::cout, ","));
  std::cout << std::endl;
}
$ nvcc -o t934 t934.cu
$ ./t934
5,6,7,8,
10,11,12,13,
$
Note that in order to provide a complete example, I'm starting with a vector definition in host memory. If you already have the vector in device memory (perhaps as a result of computing y=Ax), then you can work directly on that, either by passing that vector to the CUDA kernel or by using it directly in the thrust call via a thrust::device_ptr wrapper (this method is covered in the thrust quick start guide).
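For example, here is a minimal sketch of the thrust route when the result of y=Ax is already in device memory. The names d_y and n are hypothetical (your raw device pointer from the cublas call and its length); mytfunc is the functor from the example above:
#include <thrust/device_ptr.h>
#include <thrust/for_each.h>

void apply_h(float *d_y, size_t n)   // d_y: raw device pointer holding the n-element result of y=Ax
{
    thrust::device_ptr<float> y_begin(d_y);            // wrap the raw device pointer
    thrust::for_each(y_begin, y_begin + n, mytfunc()); // apply h elementwise, in place, on the GPU
}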
The assumption I've made here is that you want to use an arbitrary function of one variable. This should handle pretty much any function defined in myfunc. However, for some categories of functions that you may be interested in, you may be able to realize it with one or more CUBLAS calls as well.

Related

Cublas - Column/Row wise operations

I am looking for a way to perform operations over columns.
I have an MxN matrix and I want to apply a cublas function (for example nrm2) to each column.
The result I expect to get is M x 1.
How can I do that?
CUBLAS has no batched Level 1 routines, so there is no direct way to compute the column or row norms in a single call. You can do it by calling nrm2 many times in a loop over all the rows or columns of the matrix, for example:
#include <cublas_v2.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/transform.h>
#include <thrust/random.h>
#include <thrust/device_vector.h>
#include <iostream>

struct prg
{
    float a, b;

    __host__ __device__
    prg(float _a=0.f, float _b=1.f) : a(_a), b(_b) {};

    __host__ __device__
    float operator()(const unsigned int n) const
    {
        thrust::default_random_engine rng;
        thrust::uniform_real_distribution<float> dist(a, b);
        rng.discard(n);
        return dist(rng);
    }
};

int main(void)
{
    const int M = 1024, N = M;
    const int num = N * M;

    // fill the matrix with uniform random numbers in (1,2)
    thrust::device_vector<float> matrix(num);
    thrust::device_vector<float> vector(N, -1.0f);
    thrust::counting_iterator<unsigned int> index_sequence_begin(0);
    thrust::transform(index_sequence_begin,
                      index_sequence_begin + num,
                      matrix.begin(),
                      prg(1.f,2.f));

    float* m_d = thrust::raw_pointer_cast(matrix.data());
    float* v_d = thrust::raw_pointer_cast(vector.data());

    cudaStream_t stream;
    cudaStreamCreate(&stream);
    cublasHandle_t handle;
    cublasCreate(&handle);
    cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
    cublasSetStream(handle, stream);

    // one nrm2 call per column; results are written directly into device memory
    for(int col=0; col < N; col++) {
        cublasSnrm2(handle, M, m_d + col*M, 1, v_d + col);
    }
    cudaDeviceSynchronize();

    for(auto x : vector) {
        float normval = x;
        std::cout << normval << std::endl;
    }
    return 0;
}
Unless you have very large rows or columns, there is little scope to exploit streams to run kernels concurrently and reduce the overall runtime, because each nrm2 call will be too short. The launch latency of running lots of individual kernels will negatively affect performance.
A much better alternative would be to write your own kernel to do this.
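For instance, here is a minimal (untested) sketch of a one-thread-per-column kernel for the column-major matrix used above. For better memory coalescing you would typically assign one block per column and do a shared-memory reduction, but this shows the idea:
template <typename T>
__global__ void col_norms(const T * __restrict__ A, T * __restrict__ norms, int M, int N)
{
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (col < N) {
        T sum = 0;
        const T *c = A + (size_t)col * M;   // start of column 'col' (column-major storage)
        for (int row = 0; row < M; row++)
            sum += c[row] * c[row];
        norms[col] = sqrt(sum);             // one norm per column, all columns in a single launch
    }
}

// launch, replacing the cublasSnrm2 loop above:
// col_norms<<<(N + 255) / 256, 256>>>(m_d, v_d, M, N);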

Determining the size limit of the array when writing a CUDA kernel for multi-GPU using the Thrust library

I am trying to write a CUDA kernel which will use multi-GPU and Thrust library features, using some tips from previous posts. I tried to write a simple addition kernel; my eventual intention is to use more complicated kernels.
My code is as follows:
#include "test.h"
int main(int argc, char *argv[])
{
int num_gpus = 0; // number of CUDA GPUs
// determine the number of CUDA capable GPUs
cudaGetDeviceCount(&num_gpus);
printf("number of CUDA devices:\t%d\n", num_gpus);
typedef thrust::device_vector<int> dvec;
typedef dvec *p_dvec;
// Declaring Vectors
std::vector<p_dvec> dvecs1;
std::vector<p_dvec> dvecs2;
std::vector<p_dvec> dvecs3;
std::vector<double>p(num_gpus);
dim3 DimGrid((DSIZE-1)/16.0 +1,1,1);
dim3 DimBlock(16.0,1,1);
// Initialize Vectors
for(unsigned int i = 0; i < num_gpus; i++) {
cudaSetDevice(i);
p_dvec temp1 = new dvec(DSIZE);
dvecs1.push_back(temp1);
thrust::fill((*(dvecs1[i])).begin(),(*(dvecs1[i])).end(),1.0);
p_dvec temp2 = new dvec(DSIZE);
dvecs2.push_back(temp2);
thrust::fill((*(dvecs2[i])).begin(),(*(dvecs2[i])).end(),2.0);
}
// Launching The Kernel
for(unsigned int i = 0; i < num_gpus; i++) {
cudaSetDevice(i);
p_dvec temp = new dvec(DSIZE);
dvecs3.push_back(temp);
fooKernel<<<DimGrid,DimBlock>>>(convertToKernel(*dvecs1[i])),convertToKernel(*(dvecs2[i])),convertToKernel(*(dvecs3[i])));
// Reduction Operation
p[i]= thrust::reduce((*(dvecs3[i])).begin(),(*(dvecs3[i])).end(), (double) 0, thrust::plus<double>());
std::cout<<*((*(dvecs3[i])).begin())<<std::endl;
std::cout<<p[i]<<std::endl;
}
printf("Success\n");
return 0;
}
and the header file is as follows:
#include <stdio.h>
#include <cstdio>
#include <stdlib.h>
#include <cstdlib>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <thrust/reduce.h>
#include <thrust/functional.h>

#define DSIZE 1048560

template < typename T >
struct KernelArray
{
    T*  _array;
    int _size;
};

// Function to convert device_vector to structure
template < typename T >
KernelArray< T > convertToKernel( thrust::device_vector< T >& dVec )
{
    KernelArray< T > kArray;
    kArray._array = thrust::raw_pointer_cast( &dVec[0] );
    kArray._size  = ( int ) dVec.size();
    return kArray;
}

template< typename scalartype >
__global__ void fooKernel( KernelArray< scalartype > Array1, KernelArray<scalartype> Array2, KernelArray<scalartype> Array3)
{
    size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i < DSIZE)
        Array3._array[i] = Array2._array[i] + Array1._array[i];
}
Now if DSIZE > 1048560, then the result is 0.
I have a few questions:
1) How do I determine the size limit of the vector? I have 8 devices.
2) Is there any way to increase the size of the data that I can use, or to improve the code?
3) When and where do I need cudaDeviceSynchronize()?
I would be happy if someone could help me out.
If you had used proper CUDA error checking to find out if and which CUDA errors occurred, you would have gotten the following output after launching fooKernel with DSIZE > 1048560:
invalid argument
The reason for this error is that you can have at most 65535 blocks in one dimension, and
1048560/16 = 65535
So you did not run into a size limit of the vector, but into the maximum block limit.
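If you want DSIZE to grow beyond that limit (besides checking the return status of every API call and kernel launch), one common pattern is a grid-stride loop, so the grid no longer has to cover every element. Below is a hedged sketch against the KernelArray/convertToKernel helpers from your header; fooKernelStride is a hypothetical variant, not your original kernel:
template <typename scalartype>
__global__ void fooKernelStride( KernelArray<scalartype> Array1, KernelArray<scalartype> Array2, KernelArray<scalartype> Array3)
{
    // each thread processes several elements, so a fixed, legal grid size works for any DSIZE
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x;
         i < (size_t)Array3._size;
         i += (size_t)gridDim.x * blockDim.x)
        Array3._array[i] = Array1._array[i] + Array2._array[i];
}

// launched with a grid well under the 65535 limit, e.g.:
// fooKernelStride<<<256, 256>>>(convertToKernel(*(dvecs1[i])), convertToKernel(*(dvecs2[i])), convertToKernel(*(dvecs3[i])));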

Using CUDA Thrust algorithms sequentially on the host

I wish to compare a Thrust algorithm's runtime when executed sequentially on a single CPU core versus a parallel execution on a GPU.
Thrust specifies the thrust::seq execution policy, but how can I explicitly target the host backend system? I wish to avoid executing the algorithm sequentially on the GPU.
CUDA Thrust is architecture agnostic. Accordingly, consider the code I provided as an answer to
Cumulative summation in CUDA
In that code, MatingProbability and CumulativeProbability were thrust::device_vectors. thrust::transform and thrust::inclusive_scan were automatically able to recognize that and operate accordingly on the GPU.
Below, I'm providing the same code by changing thrust::device_vector to thrust::host_vector. Again, thrust::transform and thrust::inclusive_scan are able to automatically recognize that the vectors to operate on reside on the CPU and to operate accordingly.
#include <thrust/host_vector.h>
#include <thrust/transform.h>
#include <thrust/scan.h>
#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/constant_iterator.h>
#include <cstdio>

template <class T>
struct scaling {
    const T _a;
    scaling(T a) : _a(a) { }
    __host__ __device__ T operator()(const T &x) const { return _a * x; }
};

int main()
{
    const int N = 20;

    double a  = -(double)N;
    double b  = 0.;
    double Dx = -1./(0.5*N*(N+1));

    thrust::host_vector<double> MatingProbability(N);
    thrust::host_vector<double> CumulativeProbability(N+1, 0.);

    thrust::transform(thrust::make_counting_iterator(a), thrust::make_counting_iterator(b), MatingProbability.begin(), scaling<double>(Dx));

    thrust::inclusive_scan(MatingProbability.begin(), MatingProbability.end(), CumulativeProbability.begin() + 1);

    for(int i=0; i<N+1; i++)
    {
        double val = CumulativeProbability[i];
        printf("%d %3.15f\n", i, val);
    }

    return 0;
}
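As a side note, Thrust also lets you name the backend explicitly by passing an execution policy as the first argument. A minimal sketch (thrust::host dispatches to the host backend, thrust::seq forces strictly sequential execution; the twice functor is just an illustration):
#include <thrust/execution_policy.h>
#include <thrust/transform.h>

struct twice {
    double operator()(double x) const { return 2.0 * x; }
};

int main()
{
    double x[4] = {1., 2., 3., 4.}, y[4];
    thrust::transform(thrust::host, x, x + 4, y, twice());  // host backend (CPU)
    thrust::transform(thrust::seq,  x, x + 4, y, twice());  // sequential, single-threaded
    return 0;
}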

How to get a self-written class, including dynamic memory, to the kernel and back?

I want to do light scattering simulations in 3D objects with CUDA; it is similar to ray tracing, so I need a vector class.
I tried to bring a class representing my photons to the device. This class includes a dynamically allocated vector class (self-written). It is not really necessary that it is dynamic, but I will have the same problem in another case.
The problem is that I get an unspecified launch failure if I try to modify the vector inside the kernel.
I think it could also be a problem with copy constructors, or something similar. I haven't programmed C++/CUDA for a while.
I use a GTX 480 with compute capability 2.0 and CUDA 5.0.
Here is my main:
#include "photon.cuh"
#include "Container/vector3f.cu"
// Device code (Kernel, GPU)
__global__ void Sim(photon * l_x){
l_x->vec->m_x = l_x->vec->m_x +1;
l_x->vec->m_y = l_x->vec->m_y +1;
l_x->vec->m_z = l_x->vec->m_z +1;
}
// Host Code (CPU)
int main(int argc, char** argv)
{
photon *h_x,*d_x,*h_x2;
h_x = new photon();
//h_x->vec = new vector3f();
h_x->vec->m_x = 1;
h_x->vec->m_y = 2;
h_x->vec->m_z = 3;
std::cout << "Malloc" << std::endl;
h_x2 = (photon*)malloc(sizeof(photon));
cudaMalloc((void**)&d_x,sizeof(photon));
std::cout << "Cpy h-d" << std::endl;
cudaMemcpy(d_x,h_x,sizeof(photon),cudaMemcpyHostToDevice);
cudaError_t Err = cudaGetLastError();
if ( cudaSuccess != Err )
std::cout << cudaGetErrorString (Err) << std::endl;
std::cout << "Sim" << std::endl;
Sim<<<1, 1>>>(d_x);
cudaThreadSynchronize();
Err = cudaGetLastError();
if ( cudaSuccess != Err )
std::cout << cudaGetErrorString (Err) << std::endl;
std::cout << "CPY back" << std::endl;
cudaMemcpy(h_x2, d_x, sizeof(photon), cudaMemcpyDeviceToHost);
std::cout << h_x2->vec->m_x << std::endl;
std::cout << h_x2->vec->m_y << std::endl;
std::cout << h_x2->vec->m_z << std::endl;
cudaFree(d_x);
return 0;
}
The photon class (.cuh):
class photon {
public:
    vector3f *vec;
    __host__ __device__ photon();
    __host__ __device__ virtual ~photon();
    __host__ __device__ photon(const photon &other);
};
And the implementation (.cu):
#include "photon.cuh"
#include "Container/vector3f.cu"
__host__ __device__ photon::photon(){
this->vec = new vector3f();}
__host__ __device__ photon::~photon(){
delete this->vec;}
__host__ __device__ photon::photon(const photon &rhs){
this->vec = new vector3f(*rhs.vec);}
And finally, the vector class:
class vector3f {
public:
    float m_x;
    float m_y;
    float m_z;

    __host__ __device__ vector3f(float l_x, float l_y, float l_z){
        this->m_x = l_x;
        this->m_y = l_y;
        this->m_z = l_z;
    }
    __host__ __device__ vector3f(const vector3f& l_vector){
        this->m_x = l_vector.m_x;
        this->m_y = l_vector.m_y;
        this->m_z = l_vector.m_z;
    }
    __host__ __device__ vector3f(){
        this->m_x = 0;
        this->m_y = 0;
        this->m_z = 0;
    }
};
The underlying problem is that the only time you instantiate your photon class anywhere is on the host, and you are copying that host instance directly to the device. That means that the device code is attempting to de-reference a host pointer on the GPU, which is illegal and produces the runtime error you are seeing. The CUDA APIs don't do any sort of magic deep copying, so you have to manage this yourself somehow.
The obvious solution is to redesign the photon class so that vec is stored by value rather than reference. Then the whole problem goes away (and the performance will be a lot better on the GPU because you remove a level of pointer indirection during memory access).
If you are fixated on having a pointer to vec, redesign the constructor so that it takes a pointer from a memory pool, and allocate a device pool for construction. If you pass a device pointer to the constructor, the resulting instance will have a pointer to valid device memory.
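For illustration, here is a minimal sketch of the by-value redesign, keeping the names from the question (the vector3f definition is unchanged):
class photon {
public:
    vector3f vec;   // stored by value: the whole object is one contiguous block of memory
    __host__ __device__ photon() : vec() {}
};

__global__ void Sim(photon *l_x){
    l_x->vec.m_x = l_x->vec.m_x + 1;
    l_x->vec.m_y = l_x->vec.m_y + 1;
    l_x->vec.m_z = l_x->vec.m_z + 1;
}
With this layout the cudaMemcpy of sizeof(photon) in main() copies the object and its vector in one go, so no deep copy is needed.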

CUDA host and device using same __constant__ memory

I have a device/host function that uses constant memory. It runs OK on the device, but on the host it seems like this memory remains uninitialized.
#include <iostream>
#include <stdio.h>

const __constant__ double vals[2] = { 0.0, 1000.0 };

__device__ __host__ double f(size_t i)
{
    return vals[i];
}

__global__ void kern()
{
    printf("vals[%d] = %lf\n", threadIdx.x, vals[threadIdx.x]);
}

int main() {
    std::cerr << f(0) << " " << f(1) << std::endl;
    kern<<<1, 2>>>();
    cudaThreadSynchronize();
}
This prints (requires CC 2.0 or above)
0 0
vals[0] = 0.000000
vals[1] = 1000.000000
What is the problem and how can I get both device and host memory constants initialized simultaneously?
Since CygnusX1 misunderstood what I meant in my comment on MurphEngineer's answer, maybe I should post my own answer. What I meant was this:
__constant__ double dc_vals[2] = { 0.0, 1000.0 };
const double hc_vals[2] = { 0.0, 1000.0 };
__device__ __host__ double f(size_t i)
{
#ifdef __CUDA_ARCH__
    return dc_vals[i];
#else
    return hc_vals[i];
#endif
}
This has the same result as Cygnus', but it is more flexible in the face of real code: it lets you have runtime-defined values in your constant arrays, for example, and allows you to use CUDA API functions like cudaMemcpyToSymbol/cudaMemcpyFromSymbol on the __constant__ array.
A more realistic complete example:
#include <iostream>
#include <stdio.h>

__constant__ double dc_vals[2];
const double hc_vals[2];

__device__ __host__ double f(size_t i)
{
#ifdef __CUDA_ARCH__
    return dc_vals[i];
#else
    return hc_vals[i];
#endif
}

__global__ void kern()
{
    printf("vals[%d] = %lf\n", threadIdx.x, vals[threadIdx.x]);
}

int main() {
    hc_vals[0] = 0.0;
    hc_vals[1] = 1000.0;
    cudaMemcpyToSymbol(dc_vals, hc_vals, 2 * sizeof(double), 0, cudaMemcpyHostToDevice);

    std::cerr << f(0) << " " << f(1) << std::endl;
    kern<<<1, 2>>>();
    cudaThreadSynchronize();
}
I think MurphEngineer explained well why it does not work.
To quickly fix this problem, you can follow harrism's idea, something like this:
#ifdef __CUDA_ARCH__
#define CONSTANT __constant__
#else
#define CONSTANT
#endif
const CONSTANT double vals[2] = { 0.0, 1000.0 };
This way the host compilation will create a normal host const array, while device compilation will create a device __constant__ array.
Do note that with this trick it might be harder to use the CUDA API to access that device array with functions like cudaMemcpyToSymbol() if you ever decide to do so.
Using the __constant__ qualifier explicitly allocates that memory on the device. There is no way to access that memory from the host -- not even with the new CUDA Unified Addressing stuff (that only works for memory allocated with cudaMalloc() and its friends). Qualifying the variable with const just says "this is a constant pointer to (...)".
The correct way to do this is, indeed, to have two arrays: one on the host, and one on the device. Initialize your host array, then use cudaMemcpyToSymbol() to copy data to the device array at runtime. For more information on how to do this, see this thread: http://forums.nvidia.com/index.php?showtopic=69724
Absolutely great. I was struggling with the same issue and this provides a solution. However, the code suggested by harrism gives errors on compilation. Here is the fixed code which compiles correctly with nvcc:
#include <iostream>
#include <stdio.h>

__constant__ double dc_vals[2];
const double hc_vals[2] = {0.0, 1000.0};

__device__ __host__ double f(size_t i)
{
#ifdef __CUDA_ARCH__
    return dc_vals[i];
#else
    return hc_vals[i];
#endif
}

__global__ void kern()
{
    printf("Device: vals[%d] = %lf\n", threadIdx.x, f(threadIdx.x));
}

int main() {
    cudaMemcpyToSymbol(dc_vals, hc_vals, 2 * sizeof(double), 0, cudaMemcpyHostToDevice);

    std::cerr << "Host: " << f(0) << " " << f(1) << std::endl;
    kern<<<1, 2>>>();
    cudaThreadSynchronize();
}