Hello I have this loop in C+, and I was trying to convert it to thrust but without getting the same results...
Any ideas?
thank you
C++ Code
for (i=0;i<n;i++)
for (j=0;j<n;j++)
values[i]=values[i]+(binv[i*n+j]*d[j]);
Thrust Code
thrust::fill(values.begin(), values.end(), 0);
thrust::transform(make_zip_iterator(make_tuple(
thrust::make_permutation_iterator(values.begin(), thrust::make_transform_iterator(thrust::make_counting_iterator(0), IndexDivFunctor(n))),
binv.begin(),
thrust::make_permutation_iterator(d.begin(), thrust::make_transform_iterator(thrust::make_counting_iterator(0), IndexModFunctor(n))))),
make_zip_iterator(make_tuple(
thrust::make_permutation_iterator(values.begin(), thrust::make_transform_iterator(thrust::make_counting_iterator(0), IndexDivFunctor(n))) + n,
binv.end(),
thrust::make_permutation_iterator(d.begin(), thrust::make_transform_iterator(thrust::make_counting_iterator(0), IndexModFunctor(n))) + n)),
thrust::make_permutation_iterator(values.begin(), thrust::make_transform_iterator(thrust::make_counting_iterator(0), IndexDivFunctor(n))),
function1()
);
Thrust Functions
struct IndexDivFunctor: thrust::unary_function<int, int>
{
int n;
IndexDivFunctor(int n_) : n(n_) {}
__host__ __device__
int operator()(int idx)
{
return idx / n;
}
};
struct IndexModFunctor: thrust::unary_function<int, int>
{
int n;
IndexModFunctor(int n_) : n(n_) {}
__host__ __device__
int operator()(int idx)
{
return idx % n;
}
};
struct function1
{
template <typename Tuple>
__host__ __device__
double operator()(Tuple v)
{
return thrust::get<0>(v) + thrust::get<1>(v) * thrust::get<2>(v);
}
};
To begin with, some general comments. Your loop
for (i=0;i<n;i++)
for (j=0;j<n;j++)
v[i]=v[i]+(B[i*n+j]*d[j]);
is the equivalent of the standard BLAS gemv operation
where the matrix is stored in row major order. The optimal way to do this on the device would be using CUBLAS, not something constructed out of thrust primitives.
Having said that, there is absolutely no way the thrust code you posted is ever going to do what your serial code does. The errors you are seeing are not as a result of floating point associativity. Fundamentally thrust::transform applies the functor supplied to every element of the input iterator and stores the result on the output iterator. To yield the same result as the loop you posted, the thrust::transform call would need to perform (n*n) operations of the fmad functor you posted. Clearly it does not. Further, there is no guarantee that thrust::transform would perform the summation/reduction operation in a fashion that would be safe from memory races.
The correct solution is probably going to be something like:
Use thrust::transform to compute the (n*n) products of the elements of B and d
Use thrust::reduce_by_key to reduce the products into partial sums, yielding Bd
Use thrust::transform to add the resulting matrix-vector product to v to yield the final result.
In code, firstly define a functor like this:
struct functor
{
template <typename Tuple>
__host__ __device__
double operator()(Tuple v)
{
return thrust::get<0>(v) * thrust::get<1>(v);
}
};
Then do the following to compute the matrix-vector multiplication
typedef thrust::device_vector<int> iVec;
typedef thrust::device_vector<double> dVec;
typedef thrust::counting_iterator<int> countIt;
typedef thrust::transform_iterator<IndexDivFunctor, countIt> columnIt;
typedef thrust::transform_iterator<IndexModFunctor, countIt> rowIt;
// Assuming the following allocations on the device
dVec B(n*n), v(n), d(n);
// transformation iterators mapping to vector rows and columns
columnIt cv_begin = thrust::make_transform_iterator(thrust::make_counting_iterator(0), IndexDivFunctor(n));
columnIt cv_end = cv_begin + (n*n);
rowIt rv_begin = thrust::make_transform_iterator(thrust::make_counting_iterator(0), IndexModFunctor(n));
rowIt rv_end = rv_begin + (n*n);
dVec temp(n*n);
thrust::transform(make_zip_iterator(
make_tuple(
B.begin(),
thrust::make_permutation_iterator(d.begin(),rv_begin) ) ),
make_zip_iterator(
make_tuple(
B.end(),
thrust::make_permutation_iterator(d.end(),rv_end) ) ),
temp.begin(),
functor());
iVec outkey(n);
dVec Bd(n);
thrust::reduce_by_key(cv_begin, cv_end, temp.begin(), outkey.begin(), Bd.begin());
thrust::transform(v.begin(), v.end(), Bd.begin(), v.begin(), thrust::plus<double>());
Of course, this is a terribly inefficient way to do the computation compared to using a purpose designed matrix-vector multiplication code like dgemv from CUBLAS.
How much your results differ? Is it a completely different answer, or differs only on the last digits? Is the loop executed only once, or is it some kind of iterative process?
Floating point operations, especially those that repetedly add up or multiply certain values, are not associative, because of precision issues. Moreover, if you use fast-math optimisations, the operations may not be IEEE compilant.
For starters, check out this wikipedia section on floating-point numbers: http://en.wikipedia.org/wiki/Floating_point#Accuracy_problems
Related
I am trying to add 2 char arrays in cuda, but nothing is working.
I tried to use:
char temp[32];
strcpy(temp, my_array);
strcat(temp, my_array_2);
When I used this in kernel - I am getting error : calling a __host__ function("strcpy") from a __global__ function("Process") is not allowed
After this, I tried to use these functions in host, not in kernel - no error,but after addition I am getting strange symbols like ĶĶĶĶĶĶĶĶĶĶĶĶĶĶĶĶĶĶĶĶĶĶĶĶĶĶĶĶĶĶĶĶ.
So, how I can add two ( or more ) char arrays in CUDA ?
So, how I can add two ( or more ) char arrays in CUDA ?
write your own functions:
__device__ char * my_strcpy(char *dest, const char *src){
int i = 0;
do {
dest[i] = src[i];}
while (src[i++] != 0);
return dest;
}
__device__ char * my_strcat(char *dest, const char *src){
int i = 0;
while (dest[i] != 0) i++;
my_strcpy(dest+i, src);
return dest;
}
And while we're at it, here is strcmp
As the error message explains, you are trying to call host functions ("CPU functions") from a global kernel ("GPU function"). Within a global kernel you only have access to functions provided by the CUDA runtime API, which doesn't include the C standard library (where strcpy and strcat are defined).
You have to create your own str* functions according to what you want to do. Do you want to concatenate an array of chars in parallel, or do it serially in each thread?
While testing if I know how to allocate surface objects, I was designing a dummy kernel to read a single value.
This kernel was failing at compile time because
"no instance of overloaded function "surf3Dread" matches the argument list"
__global__ void test_surface(cudaSurfaceObject_t surfImg,int x, int y, int z){
float test = surf3Dread(surfImg , (int)(x*sizeof(float)) , y , z ,cudaBoundaryModeZero);
printf("%f \n",test);
}
it works when I do this instead:
__global__ void test_surface(cudaSurfaceObject_t surfImg,int x, int y, int z){
float test;
surf3Dread(&test,surfImg , (int)(x*sizeof(float)) , y , z ,cudaBoundaryModeZero);
printf("%f \n",test);
}
This is nor a problem really, but I was doing the first because the documentation of surf3Dread states that you this function is defined as:
template<class T>
T surf3Dread(cudaSurfaceObject_t surfObj,
int x, int y, int z,
boundaryMode = cudaBoundaryModeTrap);
template<class T>
void surf3Dread(T* data,
cudaSurfaceObject_t surfObj,
int x, int y, int z,
boundaryMode = cudaBoundaryModeTrap);
Maybe I am not understanding the documentation correctly, but I'd say that the first kernel here corresponds to the first documented way of calling the function and the second kernel to the second. Why does only one work? If I misunderstood the first function in the documentation, how do you call that version?
I am using CUDA 10.2
In the first instance, the compiler cannot deduce the template instantiation from the supplied function arguments. You need to specify the type explicitly to the compiler. This:
#include <cstdio>
__global__ void test_surface(cudaSurfaceObject_t surfImg,int x, int y, int z){
float test = surf3Dread<float>(surfImg, (int)(x*sizeof(float)), y, z, cudaBoundaryModeZero);
printf("%f \n",test);
}
will work where your version will not.
Basically what I want is an function works like hiloint2uint64(), just join two 32 bit integer and reinterpret the outcome as an uint64.
I cannot find any function in CUDA that can do this, anyhow, is there any ptx code that can do that kind of type casting?
You can define your own function like this:
__host__ __device__ unsigned long long int hiloint2uint64(int h, int l)
{
int combined[] = { h, l };
return *reinterpret_cast<unsigned long long int*>(combined);
}
Maybe a bit late by now, but probably the safest way to do this is to do it "manually" with bit-shifts and or:
uint32_t ui_h = h;
uint32_t ui_l = l;
return (uint64_t(h)<<32)|(uint64_t(l));
Note the other solution presented in the other answer isn't safe, because the array of ints might not be 8-byte aligned (and shifting some bits is faster than memory read/write, anyway)
Use uint2 (but define the temporary variable as 64-bit value: unsigned long long int) instead of arrays to be sure of alignment.
Be careful about the order of l and h.
__host__ __device__ __forceinline__ unsigned long long int hiloint2uint64(unsigned int h, unsigned int l)
{
unsigned long long int result;
uint2& src = *reinterpret_cast<uint2*>(&result);
src.x = l;
src.y = h;
return result;
}
The CUDA registers have a size of 32 bits anyway. In the best case the compiler won't need any extra code. In the worst case it has to reorder the registers by moving a 32-bit value.
Godbolt example https://godbolt.org/z/3r9WYK9e7 of how optimized it gets.
Let us assume that we have the following strings that we need to store in a CUDA array.
"hi there"
"this is"
"who is"
How do we declare a array on the GPU to do this. I tried using C++ strings but it does not work.
Probably the best way to do this is to use structure that is similar to common compressed sparse matrix formats. Store the character data packed into a single piece of linear memory, then use a separate integer array to store the starting indices, and perhaps a third array to store the string lengths. The storage overhead of the latter might be more efficient that storing a string termination byte for every entry in the data and trying to parse for the terminator inside the GPU code.
So you might have something like this:
struct gpuStringArray {
unsigned int * pos;
unsigned int * length; // could be a smaller type if strings are short
char4 * data; // 32 bit data type will improve memory throughput, could be 8 bit
}
Note I used a char4 type for the string data; the vector type will give better memory throughput, but it will mean strings need to be aligned/suitably padded to 4 byte boundaries. That may or may not be a problem depending on what a typical real string looks like in your application. Also, the type of the (optional) length parameter should probably be chosen to reflect the maximum admissible string length. If you have a lot of very short strings, it might be worth using an 8 or 16 bit unsigned type for the lengths to save memory.
A really simplistic code to compare strings stored this way in the style of strcmp might look something like this:
__device__ __host__
int cmp4(const char4 & c1, const char4 & c2)
{
int result;
result = c1.x - c2.x; if (result !=0) return result;
result = c1.y - c2.y; if (result !=0) return result;
result = c1.z - c2.z; if (result !=0) return result;
result = c1.w - c2.w; if (result !=0) return result;
return 0;
}
__device__ __host__
int strncmp4(const char4 * s1, const char4 * s2, const unsigned int nwords)
{
for(unsigned int i=0; i<nwords; i++) {
int result = cmp4(s1[i], s2[i]);
if (result != 0) return result;
}
return 0;
}
__global__
void tkernel(const struct gpuStringArray a, const gpuStringArray b, int * result)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
char4 * s1 = a.data + a.pos[idx];
char4 * s2 = b.data + b.pos[idx];
unsigned int slen = min(a.length[idx], b.length[idx]);
result[idx] = strncmp4(s1, s2, slen);
}
[disclaimer: never compiled, never tested, no warranty real or implied, use at your own risk]
There are some corner cases and assumptions in this which might catch you out depending on exactly what the real strings in your code look like, but I will leave those as an exercise to the reader to resolve. You should be able to adapt and expand this into whatever it is you are trying to do.
You have to use C-style character strings char *str. Searching for "CUDA string" on google would have given you this CUDA "Hello World" example as first hit: http://computer-graphics.se/hello-world-for-cuda.html
There you can see how to use char*-strings in CUDA. Be aware that standard C-functions like strcpy or strcmp are not available in CUDA!
If you want an array of strings, you just have to use char** (as in C/C++). As for strcmp and similar functions, it highly depends on what you want to do. CUDA is not really well suited for string operations, maybe it would help if you would provide a little more detail about what you want to do.
I'm evaluating CUDA and currently using Thrust library to sort numbers.
I'd like to create my own comparer for thrust::sort, but it slows down drammatically!
I created my own less implemetation by just copying code from functional.h.
However it seems to be compiled in some other way and works very slowly.
default comparer: thrust::less() - 94ms
my own comparer: less() - 906ms
I'm using Visual Studio 2010. What should I do to get the same performance as at option 1?
Complete code:
#include <stdio.h>
#include <cuda.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/sort.h>
int myRand()
{
static int counter = 0;
if ( counter++ % 10000 == 0 )
srand(time(NULL)+counter);
return (rand()<<16) | rand();
}
template<typename T>
struct less : public thrust::binary_function<T,T,bool>
{
__host__ __device__ bool operator()(const T &lhs, const T &rhs) const {
return lhs < rhs;
}
};
int main()
{
thrust::host_vector<int> h_vec(10 * 1000 * 1000);
thrust::generate(h_vec.begin(), h_vec.end(), myRand);
thrust::device_vector<int> d_vec = h_vec;
int clc = clock();
thrust::sort(d_vec.begin(), d_vec.end(), less<int>());
printf("%dms\n", (clock()-clc) * 1000 / CLOCKS_PER_SEC);
return 0;
}
The reason you are observing a difference in performance is because Thrust is implementing the sort with different algorithms depending on the arguments provided to thrust::sort.
In case 1., Thrust can prove that the sort can be implemented in linear time with a radix sort. This is because the type of the data to sort is a built-in numeric type (int), and the comparison function is the built-in less than operation -- Thrust recognizes that thrust::less<int> will produce the equivalent result as x < y.
In case 2., Thrust knows nothing about your user-provided less<int>, and has to use a more conservative algorithm based on a comparison sort which has different asymptotic complexity, even though in truth your less<int> is equivalent to thrust::less<int>.
In general, user-defined comparison operators can't be used with more restrictive, faster sorts which manipulate the binary representation of data such as radix sort. In these cases, Thrust falls back on a more general, but slower sort.