Element-by-element vector multiplication with CUDA

Element-by-element vector multiplication with CUDA - cuda

I have build a rudimentary kernel in CUDA to do an elementwise vector-vector multiplication of two complex vectors. The kernel code is inserted below (multiplyElementwise). It works fine, but since I noticed that other seemingly straightforward operations (like scaling a vector) are optimized in libraries like CUBLAS or CULA, I was wondering if it is possible to replace my code by a library call? To my surprise, neither CUBLAS nor CULA have this option, I tried to fake it by making one of the vectors the diagonal of a diagonal matrix-vector product, but the result was really slow.
As a matter of last resort I tried to optimize this code myself (see multiplyElementwiseFast below) by loading the two vectors in shared memory and then work from there, but that was slower than my original code.
So my questions:
Is there library that does elementwise vector-vector multiplications?
If not, can I accelerate my code (multiplyElementwise)?
Any help would be greatly appreciated!
__global__ void multiplyElementwise(cufftComplex* f0, cufftComplex* f1, int size)
{
const int i = blockIdx.x*blockDim.x + threadIdx.x;
if (i < size)
{
float a, b, c, d;
a = f0[i].x;
b = f0[i].y;
c = f1[i].x;
d = f1[i].y;
float k;
k = a * (c + d);
d = d * (a + b);
c = c * (b - a);
f0[i].x = k - d;
f0[i].y = k + c;
}
}
__global__ void multiplyElementwiseFast(cufftComplex* f0, cufftComplex* f1, int size)
{
const int i = blockIdx.x*blockDim.x + threadIdx.x;
if (i < 4*size)
{
const int N = 256;
const int thId = threadIdx.x / 4;
const int rem4 = threadIdx.x % 4;
const int i4 = i / 4;
__shared__ float a[N];
__shared__ float b[N];
__shared__ float c[N];
__shared__ float d[N];
__shared__ float Re[N];
__shared__ float Im[N];
if (rem4 == 0)
{
a[thId] = f0[i4].x;
Re[thId] = 0.f;
}
if (rem4 == 1)
{
b[thId] = f0[i4].y;
Im[thId] = 0.f;
}
if (rem4 == 2)
c[thId] = f1[i4].x;
if (rem4 == 0)
d[thId] = f1[i4].y;
__syncthreads();
if (rem4 == 0)
atomicAdd(&(Re[thId]), a[thId]*c[thId]);
if (rem4 == 1)
atomicAdd(&(Re[thId]), -b[thId]*d[thId]);
if (rem4 == 2)
atomicAdd(&(Im[thId]), b[thId]*c[thId]);
if (rem4 == 3)
atomicAdd(&(Im[thId]), a[thId]*d[thId]);
__syncthreads();
if (rem4 == 0)
f0[i4].x = Re[thId];
if (rem4 == 1)
f0[i4].y = Im[thId];
}
}

If what you are trying to achieve is a simple element-wise product with complex numbers, you do seem to be doing some extra steps in your multiplyElementwise kernel that increase register usage. What you try to compute is:
f0[i].x = a*c - b*d;
f0[i].y = a*d + b*c;
since (a + ib)*(c + id) = (a*c - b*d) + i(a*d + b*c). By using your improved complex multiplication, you're trading 1 multiplication for 3 additions and some extra registers. Whether this can be justified or not might depend on the hardware you're using. For instance, if your hardware supports FMA (Fused Multiply-Add), that kind of optimization may not be efficient. You should consider reading this document: "Precision & Performance:
Floating Point and IEEE 754 Compliance for NVIDIA GPUs" which also tackles the issue of floating-point precision.
Still, you should consider using Thrust. This library offers many high-level tools to operate on both host and device vectors. You can see a long list of examples here: https://github.com/thrust/thrust/tree/master/examples. This would make your life a lot easier.
UPDATED CODE
In your case, you could use this example and adapt it to something like this:
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <time.h>
struct ElementWiseProductBasic : public thrust::binary_function<float2,float2,float2>
{
__host__ __device__
float2 operator()(const float2& v1, const float2& v2) const
{
float2 res;
res.x = v1.x * v2.x - v1.y * v2.y;
res.y = v1.x * v2.y + v1.y * v2.x;
return res;
}
};
/**
* See: http://www.embedded.com/design/embedded/4007256/Digital-Signal-Processing-Tricks--Fast-multiplication-of-complex-numbers%5D
*/
struct ElementWiseProductModified : public thrust::binary_function<float2,float2,float2>
{
__host__ __device__
float2 operator()(const float2& v1, const float2& v2) const
{
float2 res;
float a, b, c, d, k;
a = v1.x;
b = v1.y;
c = v2.x;
d = v2.y;
k = a * (c + d);
d = d * (a + b);
c = c * (b - a);
res.x = k -d;
res.y = k + c;
return res;
}
};
int get_random_int(int min, int max)
{
return min + (rand() % (int)(max - min + 1));
}
thrust::host_vector<float2> init_vector(const size_t N)
{
thrust::host_vector<float2> temp(N);
for(size_t i = 0; i < N; i++)
{
temp[i].x = get_random_int(0, 10);
temp[i].y = get_random_int(0, 10);
}
return temp;
}
int main(void)
{
const size_t N = 100000;
const bool compute_basic_product = true;
const bool compute_modified_product = true;
srand(time(NULL));
thrust::host_vector<float2> h_A = init_vector(N);
thrust::host_vector<float2> h_B = init_vector(N);
thrust::device_vector<float2> d_A = h_A;
thrust::device_vector<float2> d_B = h_B;
thrust::host_vector<float2> h_result(N);
thrust::host_vector<float2> h_result_modified(N);
if (compute_basic_product)
{
thrust::device_vector<float2> d_result(N);
thrust::transform(d_A.begin(), d_A.end(),
d_B.begin(), d_result.begin(),
ElementWiseProductBasic());
h_result = d_result;
}
if (compute_modified_product)
{
thrust::device_vector<float2> d_result_modified(N);
thrust::transform(d_A.begin(), d_A.end(),
d_B.begin(), d_result_modified.begin(),
ElementWiseProductModified());
h_result_modified = d_result_modified;
}
std::cout << std::fixed;
for (size_t i = 0; i < 4; i++)
{
float2 a = h_A[i];
float2 b = h_B[i];
std::cout << "(" << a.x << "," << a.y << ")";
std::cout << " * ";
std::cout << "(" << b.x << "," << b.y << ")";
if (compute_basic_product)
{
float2 prod = h_result[i];
std::cout << " = ";
std::cout << "(" << prod.x << "," << prod.y << ")";
}
if (compute_modified_product)
{
float2 prod_modified = h_result_modified[i];
std::cout << " = ";
std::cout << "(" << prod_modified.x << "," << prod_modified.y << ")";
}
std::cout << std::endl;
}
return 0;
}
This returns:
(6.000000,5.000000) * (0.000000,1.000000) = (-5.000000,6.000000)
(3.000000,2.000000) * (0.000000,4.000000) = (-8.000000,12.000000)
(2.000000,10.000000) * (10.000000,4.000000) = (-20.000000,108.000000)
(4.000000,8.000000) * (10.000000,9.000000) = (-32.000000,116.000000)
You can then compare the timings of the two different multiplication strategies and choose what's best with your hardware.

You can use cublasZdgmm.
cublasStatus_t cublasZdgmm(cublasHandle_t handle, cublasSideMode_t mode,
int m, int n,
const cuDoubleComplex *A, int lda,
const cuDoubleComplex *x, int incx,
cuDoubleComplex *C, int ldc)

Related

CUDA 2nd order recursion with thrust inclusive_scan

I'm trying to understand how to parallelise a recursive calculation. Serially, the calculation takes the form:
for (int i = 2; i<size; i++)
{
result[i] = oldArray[i] + k * result[i-2];
}
For the i-1 index there's a solution here to a previous question of mine: CUDA force instruction execution order
I want to modify this to use the i-2 and I can't understand how to apply the same process to a 2nd order calculation. It should be possible using the thrust::inclusive_scan function, but I can't work out how. Does anyone know the solution?

Picking up where we left off in the previous question/answer, we shift our attention to equation 1.11 in the referenced paper by Blelloch. We observe that your problem formulation:
for (int i = 2; i<size; i++)
{
result[i] = oldArray[i] + k * result[i-2];
}
seems to match that in equation 1.11 if we set m=2 and in that case we can also observe that for your formulation, all ai,1 are zero (and, as previously, all ai,2 are k).
As per equation 1.12 in that paper, our state variable si now becomes a two-tuple:
si = |xi xi-1|
Taking note of these things, we observe the "correctness" of equation 1.13:
si = |xi-1 xi-2| . |0 1, k 0| + |bi 0|
rewriting:
si,1 = xi = k*xi-2 + bi
si,2 = xi-1 = xi-1
(In my view, the other answer leaves you at this point. That realization, i.e. result.data[0] = right + k * left.data[1]; is sufficient for a serial scan but not for a parallel scan. It's also evident that the functor/scan op there is not associative.)
We now need to come up with a binary operator bop that is an extension of the definition in (1.7) to this case. Referring to the previous definition in equation 1.7, we extend that based on the treatment in 1.13 as follows:
Ci = |Ai , Bi|
where:
Ai = |0 1, k 0|
and:
Bi = |bi 0|
We then have:
Ci bop Cj = | Ai . Aj , Bi . Aj + Bj |
This then becomes the formula for our functor/scan operator. We will need to carry 6 scalar "state" quantities throughout: 2 for the B vector and 4 for the A matrix.
What follows then is a realization of the above:
$ cat t1930.cu
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/scan.h>
#include <thrust/copy.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/constant_iterator.h>
#include <cstdlib>
#include <cstdio>
template <typename T>
void cpufunction(T *result, T *oldArray, size_t size, T k){
for (int i = 2; i<size; i++)
{
result[i] = oldArray[i] + k * result[i-2];
}
}
struct scan_op // as per blelloch (1.7)
{
template <typename T1, typename T2>
__host__ __device__
T1 operator()(const T1 &t1, const T2 &t2){
T1 ret;
thrust::get<0>(ret) = thrust::get<0>(t1)*thrust::get<2>(t2) + thrust::get<1>(t1)*thrust::get<4>(t2)+thrust::get<0>(t2);
thrust::get<1>(ret) = thrust::get<0>(t1)*thrust::get<3>(t2) + thrust::get<1>(t1)*thrust::get<5>(t2)+thrust::get<1>(t2);
thrust::get<2>(ret) = thrust::get<2>(t1)*thrust::get<2>(t2) + thrust::get<3>(t1)*thrust::get<4>(t2);
thrust::get<3>(ret) = thrust::get<2>(t1)*thrust::get<3>(t2) + thrust::get<3>(t1)*thrust::get<5>(t2);
thrust::get<4>(ret) = thrust::get<4>(t1)*thrust::get<2>(t2) + thrust::get<5>(t1)*thrust::get<4>(t2);
thrust::get<5>(ret) = thrust::get<4>(t1)*thrust::get<3>(t2) + thrust::get<5>(t1)*thrust::get<5>(t2);
return ret;
}
};
typedef float mt;
const size_t ds = 512;
const mt k = 1.01;
const int snip = 10;
int main(){
mt *b1 = new mt[ds]; // b as in blelloch (1.5)
mt *cr = new mt[ds]; // cpu result
for (int i = 0; i < ds; i++) { b1[i] = rand()/(float)RAND_MAX;}
cr[0] = b1[0];
cr[1] = b1[1];
cpufunction(cr, b1, ds, k);
for (int i = 0; i < snip; i++) std::cout << cr[i] << ",";
for (int i = ds-snip; i < ds; i++) std::cout << cr[i] << ",";
std::cout << std::endl;
thrust::device_vector<mt> db(b1, b1+ds);
auto b0 = thrust::constant_iterator<mt>(0);
auto a0 = thrust::constant_iterator<mt>(0);
auto a1 = thrust::constant_iterator<mt>(1);
auto a2 = thrust::constant_iterator<mt>(k);
auto a3 = thrust::constant_iterator<mt>(0);
thrust::device_vector<mt> dx1(ds);
thrust::device_vector<mt> dx0(ds);
thrust::device_vector<mt> dy0(ds);
thrust::device_vector<mt> dy1(ds);
thrust::device_vector<mt> dy2(ds);
thrust::device_vector<mt> dy3(ds);
auto my_i_zip = thrust::make_zip_iterator(thrust::make_tuple(db.begin(), b0, a0, a1, a2, a3));
auto my_o_zip = thrust::make_zip_iterator(thrust::make_tuple(dx1.begin(), dx0.begin(), dy0.begin(), dy1.begin(), dy2.begin(), dy3.begin()));
thrust::inclusive_scan(my_i_zip, my_i_zip+ds, my_o_zip, scan_op());
thrust::host_vector<mt> hx1 = dx1;
thrust::copy_n(hx1.begin(), snip, std::ostream_iterator<mt>(std::cout, ","));
thrust::copy_n(hx1.begin()+ds-snip, snip, std::ostream_iterator<mt>(std::cout, ","));
std::cout << std::endl;
}
$ nvcc -std=c++14 t1930.cu -o t1930
$ cuda-memcheck ./t1930
========= CUDA-MEMCHECK
0.840188,0.394383,1.63169,1.19677,2.55965,1.40629,2.92047,2.18858,3.22745,2.76443,570.218,601.275,576.315,607.993,582.947,614.621,589.516,621.699,595.644,628.843,
0.840188,0.394383,1.63169,1.19677,2.55965,1.40629,2.92047,2.18858,3.22745,2.76443,570.219,601.275,576.316,607.994,582.948,614.621,589.516,621.7,595.644,628.843,
========= ERROR SUMMARY: 0 errors
$
Yes, there are some results above that differ in the 6th digit. I attribute this to the limitations of float resolution when taking into account the very different order of operations between the serial and parallel method. If you change the typedef to double, the results will appear to match exactly.
Since you've asked about it here's an equivalent realization where it is demonstrated using device data previously allocated using cudaMalloc:
$ cat t1930.cu
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/scan.h>
#include <thrust/copy.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/constant_iterator.h>
#include <cstdlib>
#include <cstdio>
template <typename T>
void cpufunction(T *result, T *oldArray, size_t size, T k){
for (int i = 2; i<size; i++)
{
result[i] = oldArray[i] + k * result[i-2];
}
}
struct scan_op // as per blelloch (1.7)
{
template <typename T1, typename T2>
__host__ __device__
T1 operator()(const T1 &t1, const T2 &t2){
T1 ret;
thrust::get<0>(ret) = thrust::get<0>(t1)*thrust::get<2>(t2) + thrust::get<1>(t1)*thrust::get<4>(t2)+thrust::get<0>(t2);
thrust::get<1>(ret) = thrust::get<0>(t1)*thrust::get<3>(t2) + thrust::get<1>(t1)*thrust::get<5>(t2)+thrust::get<1>(t2);
thrust::get<2>(ret) = thrust::get<2>(t1)*thrust::get<2>(t2) + thrust::get<3>(t1)*thrust::get<4>(t2);
thrust::get<3>(ret) = thrust::get<2>(t1)*thrust::get<3>(t2) + thrust::get<3>(t1)*thrust::get<5>(t2);
thrust::get<4>(ret) = thrust::get<4>(t1)*thrust::get<2>(t2) + thrust::get<5>(t1)*thrust::get<4>(t2);
thrust::get<5>(ret) = thrust::get<4>(t1)*thrust::get<3>(t2) + thrust::get<5>(t1)*thrust::get<5>(t2);
return ret;
}
};
typedef double mt;
const size_t ds = 512;
const mt k = 1.01;
const int snip = 10;
int main(){
mt *b1 = new mt[ds]; // b as in blelloch (1.5)
mt *cr = new mt[ds]; // cpu result
for (int i = 0; i < ds; i++) { b1[i] = rand()/(float)RAND_MAX;}
cr[0] = b1[0];
cr[1] = b1[1];
cpufunction(cr, b1, ds, k);
for (int i = 0; i < snip; i++) std::cout << cr[i] << ",";
for (int i = ds-snip; i < ds; i++) std::cout << cr[i] << ",";
std::cout << std::endl;
mt *db;
cudaMalloc(&db, ds*sizeof(db[0]));
cudaMemcpy(db, b1, ds*sizeof(db[0]), cudaMemcpyHostToDevice);
thrust::device_ptr<mt> dp_db = thrust::device_pointer_cast(db);
auto b0 = thrust::constant_iterator<mt>(0);
auto a0 = thrust::constant_iterator<mt>(0);
auto a1 = thrust::constant_iterator<mt>(1);
auto a2 = thrust::constant_iterator<mt>(k);
auto a3 = thrust::constant_iterator<mt>(0);
thrust::device_vector<mt> dx1(ds);
thrust::device_vector<mt> dx0(ds);
thrust::device_vector<mt> dy0(ds);
thrust::device_vector<mt> dy1(ds);
thrust::device_vector<mt> dy2(ds);
thrust::device_vector<mt> dy3(ds);
auto my_i_zip = thrust::make_zip_iterator(thrust::make_tuple(dp_db, b0, a0, a1, a2, a3));
auto my_o_zip = thrust::make_zip_iterator(thrust::make_tuple(dx1.begin(), dx0.begin(), dy0.begin(), dy1.begin(), dy2.begin(), dy3.begin()));
thrust::inclusive_scan(my_i_zip, my_i_zip+ds, my_o_zip, scan_op());
cudaMemcpy(cr, thrust::raw_pointer_cast(dx1.data()), ds*sizeof(cr[0]), cudaMemcpyDeviceToHost);
for (int i = 0; i < snip; i++) std::cout << cr[i] << ",";
for (int i = ds-snip; i < ds; i++) std::cout << cr[i] << ",";
std::cout << std::endl;
}
$ nvcc -std=c++14 t1930.cu -o t1930
$ cuda-memcheck ./t1930
========= CUDA-MEMCHECK
0.840188,0.394383,1.63169,1.19677,2.55965,1.40629,2.92047,2.18858,3.22745,2.76443,570.219,601.275,576.316,607.994,582.948,614.622,589.516,621.7,595.645,628.844,
0.840188,0.394383,1.63169,1.19677,2.55965,1.40629,2.92047,2.18858,3.22745,2.76443,570.219,601.275,576.316,607.994,582.948,614.622,589.516,621.7,595.645,628.844,
========= ERROR SUMMARY: 0 errors
There should be no significant performance difference between these two approaches. (However I happened to switch the typedef to double for this example, so that makes a difference.) Using cudaMalloc as an alternative to the device_vector for the various state vectors (dx0, dx1, dy0, dy1 ...) may be slightly faster, because device_vector first does a cudaMalloc style allocation, then launches a kernel to zero out the allocation. This zero-ing step is unnecessary for the state vectors. The pattern given here should demonstrate how you could do that, if you are interested.
Here's a version that eliminates use of thrust::device_vector and thrust::host_vector altogether:
#include <iostream>
#include <thrust/device_ptr.h>
#include <thrust/scan.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/constant_iterator.h>
#include <cstdlib>
template <typename T>
void cpufunction(T *result, T *oldArray, size_t size, T k){
for (int i = 2; i<size; i++)
{
result[i] = oldArray[i] + k * result[i-2];
}
}
struct scan_op // as per blelloch (1.7)
{
template <typename T1, typename T2>
__host__ __device__
T1 operator()(const T1 &t1, const T2 &t2){
T1 ret;
thrust::get<0>(ret) = thrust::get<0>(t1)*thrust::get<2>(t2) + thrust::get<1>(t1)*thrust::get<4>(t2)+thrust::get<0>(t2);
thrust::get<1>(ret) = thrust::get<0>(t1)*thrust::get<3>(t2) + thrust::get<1>(t1)*thrust::get<5>(t2)+thrust::get<1>(t2);
thrust::get<2>(ret) = thrust::get<2>(t1)*thrust::get<2>(t2) + thrust::get<3>(t1)*thrust::get<4>(t2);
thrust::get<3>(ret) = thrust::get<2>(t1)*thrust::get<3>(t2) + thrust::get<3>(t1)*thrust::get<5>(t2);
thrust::get<4>(ret) = thrust::get<4>(t1)*thrust::get<2>(t2) + thrust::get<5>(t1)*thrust::get<4>(t2);
thrust::get<5>(ret) = thrust::get<4>(t1)*thrust::get<3>(t2) + thrust::get<5>(t1)*thrust::get<5>(t2);
return ret;
}
};
typedef float mt;
const size_t ds = 32768*4;
const mt k = 1.001;
const int snip = 10;
int main(){
mt *b1 = new mt[ds]; // b as in blelloch (1.5)
mt *cr = new mt[ds]; // result
for (int i = 0; i < ds; i++) { b1[i] = (rand()/(float)RAND_MAX)-0.5;}
cr[0] = b1[0];
cr[1] = b1[1];
cpufunction(cr, b1, ds, k);
for (int i = 0; i < snip; i++) std::cout << cr[i] << ",";
for (int i = ds-snip; i < ds; i++) std::cout << cr[i] << ",";
std::cout << std::endl;
mt *db, *dstate;
cudaMalloc(&db, ds*sizeof(db[0]));
cudaMalloc(&dstate, 6*ds*sizeof(dstate[0]));
cudaMemcpy(db, b1, ds*sizeof(db[0]), cudaMemcpyHostToDevice);
thrust::device_ptr<mt> dp_db = thrust::device_pointer_cast(db);
auto b0 = thrust::constant_iterator<mt>(0);
auto a0 = thrust::constant_iterator<mt>(0);
auto a1 = thrust::constant_iterator<mt>(1);
auto a2 = thrust::constant_iterator<mt>(k);
auto a3 = thrust::constant_iterator<mt>(0);
thrust::device_ptr<mt> dx1 = thrust::device_pointer_cast(dstate);
thrust::device_ptr<mt> dx0 = thrust::device_pointer_cast(dstate+ds);
thrust::device_ptr<mt> dy0 = thrust::device_pointer_cast(dstate+2*ds);
thrust::device_ptr<mt> dy1 = thrust::device_pointer_cast(dstate+3*ds);
thrust::device_ptr<mt> dy2 = thrust::device_pointer_cast(dstate+4*ds);
thrust::device_ptr<mt> dy3 = thrust::device_pointer_cast(dstate+5*ds);
auto my_i_zip = thrust::make_zip_iterator(thrust::make_tuple(dp_db, b0, a0, a1, a2, a3));
auto my_o_zip = thrust::make_zip_iterator(thrust::make_tuple(dx1, dx0, dy0, dy1, dy2, dy3));
thrust::inclusive_scan(my_i_zip, my_i_zip+ds, my_o_zip, scan_op());
cudaMemcpy(cr, dstate, ds*sizeof(cr[0]), cudaMemcpyDeviceToHost);
for (int i = 0; i < snip; i++) std::cout << cr[i] << ",";
for (int i = ds-snip; i < ds; i++) std::cout << cr[i] << ",";
std::cout << std::endl;
}

Here is some cpu code which shows a possible implementation of the formular derived from https://www.cs.cmu.edu/~guyb/papers/Ble93.pdf to express higher-order recurrences as a scan operation.
The key idea is that each element of the scan result is not a scalar, but a vector which contains the n previous scalar results. This way, all the required previous results are available in the scan operator to compute the next result.
#include <iostream>
#include <algorithm>
#include <numeric>
#include <array>
void calculate1(std::vector<int> vec, int k){
std::vector<int> result(vec.size(), 0);
for(int i = 2; i < vec.size(); i++){
result[i] = vec[i] + k * result[i-2];
}
std::cerr << "calculate1 result: ";
for(auto x : result){
std::cerr << x << ", ";
}
std::cerr << "\n";
}
struct S{
//data[0] stores result of last iteration
//data[1] stores result of second last iteration
std::array<int, 2> data;
};
std::ostream& operator<<(std::ostream& os, S s){
os << "(" << s.data[0] << "," << s.data[1] << ")";
}
void calculate2(std::vector<int> vec, int k){
S initvalue{{0,0}};
std::vector<S> result(vec.size(), initvalue);
std::exclusive_scan(
vec.begin() + 2,
vec.end(),
result.begin(),
initvalue,
[k](S left, int right){
S result;
/*A = (
0 1
k 0
)
Compute result = left * A + (right 0)
*/
result.data[0] = right + k * left.data[1];
result.data[1] = left.data[0];
return result;
}
);
std::cerr << "calculate2 result: ";
for(auto x : result){
std::cerr << x << ", ";
}
std::cerr << "\n";
}
int main(){
const int k = 5;
const std::vector<int> vec1{1,3,5,7,9,11,3,6,7,1,2,4};
calculate1(vec1, k);
calculate2(vec1, k);
}
https://godbolt.org/z/cszzn8Ec8
Output:
calculate1 result: 0, 0, 5, 7, 34, 46, 173, 236, 872, 1181, 4362, 5909,
calculate2 result: (0,0), (5,0), (7,5), (34,7), (46,34), (173,46), (236,173), (872,236), (1181,872), (4362,1181), (0,0), (0,0),
There is still an of-by-one error somewhere, but one can get the idea behind it.
I previously said that this approach can be used for a parallel scan in CUDA. This is not correct. For a parallel scan, the scan operator must have an additional property, which is associativity, i.e. (a OP b) OP c == a OP (b OP c). This is not the case in this approach.
Robert Crovella's answer shows how to derive an associative scan operator which can be used for a parallel scan.

Cuda Implementation of Partitioned Subgroup

is there a more efficient way to implement the "Partitioned Subgroup" functions of Vulkan/OpenGL, which do not have to loop over all elements in the subgroup? My current implementation just uses a loop from 0 to WARP_SIZE.
References:
(slide 37+38) https://developer.download.nvidia.com/video/gputechconf/gtc/2019/presentation/s9909-nvidia-vulkan-features-update.pdf
https://github.com/KhronosGroup/GLSL/blob/master/extensions/nv/GL_NV_shader_subgroup_partitioned.txt
Simple Implementation:
__device__ uint32_t subgroupPartitionNV(ivec2 p)
{
uint32_t result = 0;
for (int i = 0; i < 32; ++i)
{
int x = __shfl_sync(0xFFFFFFFF, p(0), i);
int y = __shfl_sync(0xFFFFFFFF, p(1), i);
uint32_t b = __ballot_sync(0xFFFFFFFF, p(0) == x && p(1) == y);
if (i == threadIdx.x & 31) result = b;
}
return result;
}
__device__ uint32_t subgroupPartitionedAddNV(float value, uint32_t ballot)
{
float result = 0;
for ( unsigned int i = 0; i < 32; ++i)
{
float other_value = __shfl_sync(0xFFFFFFFF, value, i);
if ((1U << i) & ballot) result += other_value;
}
return result;
}

Thanks to the hint of Abator I came up with a more efficient solution. It's a little ugly because labeled_partition is only implemented for int but works quite well.
template <int GROUP_SIZE = 32>
__device__ cooperative_groups::coalesced_group subgroupPartitionNV(ivec2 p)
{
using namespace cooperative_groups;
thread_block block = this_thread_block();
thread_block_tile<GROUP_SIZE> tile32 = tiled_partition<GROUP_SIZE>(block);
coalesced_group g1 = labeled_partition(tile32, p(0));
coalesced_group g2 = labeled_partition(tile32, p(1));
details::_coalesced_group_data_access acc;
return acc.construct_from_mask<coalesced_group>(acc.get_mask(g1) & acc.get_mask(g2));
}
template <typename T, int GROUP_SIZE = 32>
__device__ T subgroupPartitionedAddNV(T value, cooperative_groups::coalesced_group group)
{
int s = group.size();
int r = group.thread_rank();
for (int offset = GROUP_SIZE / 2; offset > 0; offset /= 2)
{
auto v = group.template shfl_down(value, offset);
if (r + offset < s) value += v;
}
return value;
}

The Floyd-Warshall algorithm in CUDA

This is the sequential piece of code I am trying to parallelize in CUDA
/*
Sequential (Single Thread) APSP on CPU.
*/
void floyd_sequential(int *mat, const size_t N)
{
for(int k = 0; k < N; k ++)
for(int i = 0; i < N; i ++)
for(int j = 0; j < N; j ++)
{
int i0 = i*N + j;
int i1 = i*N + k;
int i2 = k*N + j;
if(mat[i1] != -1 && mat[i2] != -1)
mat[i0] = (mat[i0] != -1 && mat[i0] < mat[i1] + mat[i2]) ?
mat[i0] : (mat[i1] + mat[i2]);
}
}
This is my CUDA implementation
// ParallelComputing.cpp : Defines the entry point for the console application.
//
#include <stdio.h>
#include <cuda.h>
#include <stdlib.h>
#define DIMENSION 10;
__global__ void gpu_Floyd(int *result, int N)
{
int j,k;
int Row = blockIdx.y * blockDim.y + threadIdx.y;
for(k = 0; k < N; k++)
{
for(j = 0; j < N; j++)
{
int i0 = Row * N + j;
int i1 = Row * N + k;
int i2 = k * N + j;
if(result[i0] != -1 && result[i2] != -1)
result[i0] = (result[i0] != -1 && result[i0] < result[i1] + result[i2]) ?
result[i0] : (result[i1] + result[i2]);
__syncthreads();
}
}
}
void GenMatrix(int *mat, const size_t N)
{
for(int i = 0; i < N*N; i ++)
mat[i] = rand()%32 - 1;
}
bool CmpArray(const int *l, const int *r, const size_t eleNum)
{
for(int i = 0; i < eleNum; i ++)
if(l[i] != r[i])
{
printf("ERROR: l[%d] = %d, r[%d] = %d\n", i, l[i], i, r[i]);
return false;
}
return true;
}
int main(int argc, char **argv)
{
// generate a random matrix.
size_t N = 10;
int *mat = (int*)malloc(sizeof(int)*N*N);
GenMatrix(mat, N);
// compute the reference result.
int *ref = (int*)malloc(sizeof(int)*N*N);
memcpy(ref, mat, sizeof(int)*N*N);
Floyd_sequential(ref, N);
//CUDA Portion
int Grid_Dim_x = 1, Grid_Dim_y = 1;
int noThreads_x, noThreads_y;
int *result = (int*)malloc(sizeof(int)*N*N);
memcpy(result, mat, sizeof(int)*N*N);
int *d_result;
// compute your results
cudaMalloc((void **)&d_result, N*N);
cudaMemcpy(result, N * N, cudaMemcpyHostToDevice);
gpu_Floyd<<<1024, 256>>>(d_result, N);
cudaMemcpy(result, d_result, cudaMemcpyDeviceToHost);
// compare your result with reference result
if(CmpArray(result, ref, N*N))
printf("The matrix matches.\n");
else
printf("The matrix do not match.\n");
free(ref);
free(result);
cudaFree(d_result);
}
However, my output always shows the matrices do not match.
I understand that in CUDA we try to map each element in the matrix to each row. However, I am trying to explore possibilities by mapping each row of the matrix to a thread instead.

As has already been mentioned, your provided GPU code does not compile, so I'm curious how you got to the observation that your output matrices do not match.
Here are some of the problems with your code:
cudaMalloc, just like malloc allocates bytes, so this is not correct:
cudaMalloc((void **)&d_result, N*N);
instead you want this:
cudaMalloc((void **)&d_result, N*N*sizeof(int));
likewise cudaMemcpy, just like memcpy, operates on bytes, and furthermore cudaMemcpy requires 4 parameters so this is not correct:
cudaMemcpy(result, N * N, cudaMemcpyHostToDevice);
instead you probably want this:
cudaMemcpy(d_result, result, N * N*sizeof(int), cudaMemcpyHostToDevice);
and your other cudaMemcpy line needs to be fixed similarly.
I'd also advise doing proper cuda error checking
Your kernel is written as if it's expecting a 2 dimensional thread array, or at least one dimensional in y, whereas you are launching a one dimensional grid in x:
gpu_Floyd<<<1024, 256>>>(d_result, N);
therefore all your kernel built-in variables in y will be 1 or 0 always, and this line of code:
int Row = blockIdx.y * blockDim.y + threadIdx.y;
will evaluate to zero for all threads in your 1-D grid in x.
Your gpu kernel is putting the results in the same matrix as the input data. For sequential code this may or may not matter, but for code that is intended to run in parallel, it can often lead to race conditions, because the order of operations (i.e. order of thread execution) is largely undefined.

Below you will find a canonical, simple implementation of the Floyd-Warshall algorithm in CUDA.
The CUDA code is accompanied with a sequential implementation and both are based on the simplifying assumption that the edges are non-negative. The full, minimum distance paths are also reconstructed in both the cases. Despite the simplifying assumption, it should be possible to grasp the relevant parallelization idea, namely that a two-dimensional thread grid is exploited and that each thread along x is assigned to a matrix column, while each block along y is assigned to a matrix row. In this way, all the columns are loaded by the threadIdx.x == 0 threads of each block in shared memory.
// --- Assumption: graph with positive edges
#include <stdio.h>
#include <string>
#include <map>
#include <iostream>
#include <fstream>
#include "Utilities.cuh"
#define BLOCKSIZE 256
using namespace std;
map<string, int> nameToNum; // --- names of vertices
map<string, map<string, int>> weightMap; // --- weights of edges
/************************/
/* READ GRAPH FROM FILE */
/************************/
int *readGraphFromFile(int &N, char *fileName) {
string vertex1, vertex2;
ifstream graphFile;
int currentWeight;
N = 0; // --- Init the number of found vertices
graphFile.open(fileName); // --- Open the graph file
graphFile >> vertex1; // --- Read first vertex
while(vertex1 != "--END--") { // --- Loop untile end of file has not been found
graphFile >> vertex2; // --- Read second vertex
graphFile >> currentWeight; // --- Read weight between first and second vertex
if (nameToNum.count(vertex1) == 0) { // --- If vertex has not yet been added ...
nameToNum[vertex1] = N; // assign a progressive number to the vertex
weightMap[vertex1][vertex1] = 0; // assign a zero weight to the "self-edge"
N++; // --- Update the found number of vertices
}
if (nameToNum.count(vertex2) == 0) {
nameToNum[vertex2] = N;
weightMap[vertex2][vertex2] = 0;
N++;
}
weightMap[vertex1][vertex2] = currentWeight; // --- Update weight between vertices 1 and 2
graphFile >> vertex1;
}
graphFile.close(); // --- Close the graph file
// --- Construct the array
int *weightMatrix = (int*) malloc(N * N * sizeof(int));
// --- Loop over all the vertex couples in the wights matrix
for (int ii = 0; ii < N; ii++)
for (int jj = 0; jj < N; jj++)
weightMatrix[ii * N + jj] = INT_MAX / 2; // --- Init the weights matrix elements to infinity
map<string, int>::iterator i, j;
// --- Loop over all the vertex couples in the map
// (*i).first and (*j).first are the weight entries of the map, while (*i).second and (*j).second are their corresponding indices
for (i = nameToNum.begin(); i != nameToNum.end(); ++i)
for (j = nameToNum.begin(); j != nameToNum.end(); ++j) {
// --- If there is connection between vertices (*i).first and (*j).first, the update the weight matrix
if (weightMap[(*i).first].count((*j).first) != 0)
weightMatrix[N * (*i).second + (*j).second] = weightMap[(*i).first][(*j).first];
}
return weightMatrix;
}
/************************************/
/* PRINT MINIMUM DISTANCES FUNCTION */
/************************************/
void printMinimumDistances(int N, int *a) {
map<string, int>::iterator i;
// --- Prints all the node labels at the first row
for (i = nameToNum.begin(); i != nameToNum.end(); ++i) printf("\t%s", i->first.c_str());
printf("\n");
i = nameToNum.begin();
// --- Loop over the rows
for (int p = 0; p < N; p++) {
printf("%s\t", i -> first.c_str());
// --- Loop over the columns
for (int q = 0; q < N; q++) {
int dd = a[p * N + q];
if (dd != INT_MAX / 2) printf("%d\t", dd);
else printf("--\t");
}
printf("\n");
i++;
}
}
void printPathRecursive(int row, int col, int *minimumDistances, int *path, int N) {
map<string, int>::iterator i = nameToNum.begin();
map<string, int>::iterator j = nameToNum.begin();
if (row == col) {advance(i, row); printf("%s\t", i -> first.c_str()); }
else {
if (path[row * N + col] == INT_MAX / 2) printf("%row %row %row No path exists\t\n", minimumDistances[row * N + col], row, col);
else {
printPathRecursive(row, path[row * N + col], minimumDistances, path, N);
advance(j, col);
printf("%s\t", j -> first.c_str());
}
}
}
void printPath(int N, int *minimumDistances, int *path) {
map<string, int>::iterator i;
map<string, int>::iterator j;
// --- Loop over the rows
i = nameToNum.begin();
for (int p = 0; p < N; p++) {
// --- Loop over the columns
j = nameToNum.begin();
for (int q = 0; q < N; q++) {
printf("From %s to %s\t", i -> first.c_str(), j -> first.c_str());
printPathRecursive(p, q, minimumDistances, path, N);
printf("\n");
j++;
}
i++;
}
}
/**********************/
/* FLOYD-WARSHALL CPU */
/**********************/
void h_FloydWarshall(int *h_graphMinimumDistances, int *h_graphPath, const int N) {
for (int k = 0; k < N; k++)
for (int row = 0; row < N; row++)
for (int col = 0; col < N; col++) {
if (h_graphMinimumDistances[row * N + col] > (h_graphMinimumDistances[row * N + k] + h_graphMinimumDistances[k * N + col])) {
h_graphMinimumDistances[row * N + col] = (h_graphMinimumDistances[row * N + k] + h_graphMinimumDistances[k * N + col]);
h_graphPath[row * N + col] = h_graphPath[k * N + col];
}
}
}
/*************************/
/* FLOYD-WARSHALL KERNEL */
/*************************/
__global__ void d_FloydWarshall(int k, int *d_graphMinimumDistances, int *d_graphPath, int N) {
int col = blockIdx.x * blockDim.x + threadIdx.x; // --- Each thread along x is assigned to a matrix column
int row = blockIdx.y; // --- Each block along y is assigned to a matrix row
if (col >= N) return;
int arrayIndex = N * row + col;
// --- All the blocks load the entire k-th column into shared memory
__shared__ int d_graphMinimumDistances_row_k;
if(threadIdx.x == 0) d_graphMinimumDistances_row_k = d_graphMinimumDistances[N * row + k];
__syncthreads();
if (d_graphMinimumDistances_row_k == INT_MAX / 2) // --- If element (row, k) = infinity, no update is needed
return;
int d_graphMinimumDistances_k_col = d_graphMinimumDistances[k * N + col];
if(d_graphMinimumDistances_k_col == INT_MAX / 2) // --- If element (k, col) = infinity, no update is needed
return;
int candidateBetterDistance = d_graphMinimumDistances_row_k + d_graphMinimumDistances_k_col;
if (candidateBetterDistance < d_graphMinimumDistances[arrayIndex]) {
d_graphMinimumDistances[arrayIndex] = candidateBetterDistance;
d_graphPath[arrayIndex] = d_graphPath[k * N + col];
}
}
/********/
/* MAIN */
/********/
int main() {
int N = 0; // --- Number of vertices
// --- Read graph array from file
int *h_graphArray = readGraphFromFile(N, "graph2.txt");
printf("\n******************\n");
printf("* Original graph *\n");
printf("******************\n");
printMinimumDistances(N, h_graphArray);
// --- Floyd-Warshall on CPU
int *h_graphMinimumDistances = (int *) malloc(N * N * sizeof(int));
int *h_graphPath = (int *) malloc(N * N * sizeof(int));
memcpy(h_graphMinimumDistances, h_graphArray, N * N * sizeof(int));
for (int k = 0; k < N; k++)
for (int l = 0; l < N; l++)
if (h_graphArray[k * N + l] == INT_MAX / 2) h_graphPath[k * N + l] = INT_MAX / 2;
else h_graphPath[k * N + l] = k;
h_FloydWarshall(h_graphMinimumDistances, h_graphPath, N);
printf("\n*************************\n");
printf("* CPU result: distances *\n");
printf("*************************\n");
printMinimumDistances(N, h_graphMinimumDistances);
printf("\n********************\n");
printf("* CPU result: path *\n");
printf("********************\n");
printPath(N, h_graphMinimumDistances, h_graphPath);
// --- Graph array device allocation and host-device memory transfer
int *d_graphMinimumDistances; gpuErrchk(cudaMalloc(&d_graphMinimumDistances, N * N * sizeof(int)));
gpuErrchk(cudaMemcpy(d_graphMinimumDistances, h_graphArray, N * N * sizeof(int), cudaMemcpyHostToDevice));
int *d_graphPath; gpuErrchk(cudaMalloc(&d_graphPath, N * N * sizeof(int)));
for (int k = 0; k < N; k++)
for (int l = 0; l < N; l++)
if (h_graphArray[k * N + l] == INT_MAX / 2) h_graphPath[k * N + l] = INT_MAX / 2;
else h_graphPath[k * N + l] = k;
gpuErrchk(cudaMemcpy(d_graphPath, h_graphPath, N * N * sizeof(int), cudaMemcpyHostToDevice));
// --- Iterations
for (int k = 0; k < N; k++) {
d_FloydWarshall <<<dim3(iDivUp(N, BLOCKSIZE), N), BLOCKSIZE>>>(k, d_graphMinimumDistances, d_graphPath, N);
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
}
// --- Copy results back to the host
gpuErrchk(cudaMemcpy(h_graphMinimumDistances, d_graphMinimumDistances, N * N * sizeof(int), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_graphPath, d_graphPath, N * N * sizeof(int), cudaMemcpyDeviceToHost));
printf("\n**************\n");
printf("* GPU result *\n");
printf("**************\n");
printMinimumDistances(N, h_graphMinimumDistances);
printf("\n********************\n");
printf("* GPU result: path *\n");
printf("********************\n");
printPath(N, h_graphMinimumDistances, h_graphPath);
}

CUDA C/C++: Calculate the average of inverse of distance per point (interaction energy, perhaps?)

I've been trying to write a kernel in that calculates the sum of the inverse of the distance between N given points over N. A serial coda in C would be like
average = 0;
for(int i = 0; i < Np; i++){
for(int j = i + 1; j < Np; j++){
average += 1.0e0f/sqrtf((rx[i]-rx[j])*(rx[i]-rx[j]) + (ry[i]-ry[j])*(ry[i]-ry[j]));
}
}
average = average/(float)N;
Where rx and ry are the x and y coordinates, respectively.
I generate the points via a kernel that uses random number generator. For the kernel, I used 128(256) threads per block for 4k(8k) points. On it every thread performs the inner above inner loop, then the results are passed to a reduce sum function, as follows
Generate points:
__global__ void InitRNG ( curandState * state, const int seed ){
int tIdx = blockIdx.x*blockDim.x + threadIdx.x;
curand_init (seed, tIdx, 0, &state[tIdx]);
}
__global__
void SortPoints(float* X, float* Y,const int N, curandState *state){
float rdmn1, rdmn2;
unsigned int tIdx = blockIdx.x*blockDim.x + threadIdx.x;
float range;
if(tIdx < N){
rdmn1 = curand_uniform(&state[tIdx]);
rdmn2 = curand_uniform(&state[tIdx]);
range = sqrtf(0.25e0f*N*rdmn1);
X[tIdx] = range*cosf(2.0e0f*pi*rdmn2);
Y[tIdx] = range*sinf(2.0e0f*pi*rdmn2);
}
}
Reduction:
__device__
float ReduceSum2(float In){
__shared__ float data[BlockSize];
unsigned int tIdx = threadIdx.x;
data[tIdx] = In;
__syncthreads();
for(unsigned int i = blockDim.x/2; i > 0; i >>= 1){
if(tIdx < i){
data[tIdx] += data[tIdx + i];
}
__syncthreads();
}
return data[0];
}
Kernel:
__global__
void AvgDistance(float *X, float *Y, float *Avg, const int N){
int tIdx = blockIdx.x*blockDim.x + threadIdx.x;
int bIdx = blockIdx.x;
float x , y;
float d = 0.0f;
if(tIdx < N){
for(int i = tIdx + 1; i < N ; i++){
x = X[tIdx] - X[i];
y = Y[tIdx] - Y[i];
d += 1.0e0f/(sqrtf(x*x + y*y));
}
__syncthreads();
Avg[bIdx] = ReduceSum2(d);
}
}
The kernel is configured and launched as follows:
dim3 threads(BlockSize,BlockSize);
dim3 blocks(ceil(Np/threads.x),ceil(Np/threads.y));
InitRNG<<<blocks.x,threads.x>>>(d_state,seed);
SortPoints<<<blocks.x,threads.x>>>(d_rx,d_ry,Np,d_state);
AvgDistance<<<blocks.x,threads.x,threads.x*sizeof(float)>>>(d_rx,d_ry,d_Avg,Np);
Finally, I copy the data back to host and then perform the remaining sum:
Avg = new float[blocks.x];
CHECK(cudaMemcpy(Avg,d_Avg,blocks.x*sizeof(float),cudaMemcpyDeviceToHost),ERROR_CPY_DEVTOH);
float average = 0;
for(int i = 0; i < blocks.x; i++){
average += Avg[i];
}
average = average/(float)Np;
For 4k points, ok! the results are:
Average distance between points (via Kernel) = 108.615
Average distance between points (via CPU) = 110.191
In this case the sum may be performed in different order, causing both results to diverge from each other, I don't know...
But when it comes to 8k, the results are quiet different:
Average distance between points (via Kernel) = 153.63
Average distance between points (via CPU) = 131.471
To me it seems that both the kernel and the serial code are written the same way. What leads me to distrust the precision on CUDA calculation of floating point numbers. Does this make sense? Or are the access to global memory causing some conflicts when some threads load the same data from X and Y at the same time? Or the way I wrote the kernel is in some way 'wrong'(I mean, am I doing something that is causing both results to diverge from each other?).

Actually, from what I can tell, the problem seems to be on the CPU side. I created a sample code based on your code.
I was able to reproduce your results.
First I switched all instances of sinf, cosf, and sqrtf to their corresponding double versions. This made no difference in the results.
Next I included a typedef so I could easily switch the precision from float to double and back, replacing every relevant instance of float in the code with mytype which is my typedef.
When I run the code with typedef of float and a data size of 4096 I get these results:
GPU average = 108.294922
CPU average = 109.925285
When I run the code with typedef of double and a data size of 4096 I get these results:
GPU average = 108.294903
CPU average = 108.294903
When I run the code with typedef of float and a data size of 8192 I get these results:
GPU average = 153.447327
CPU average = 131.473526
When I run the code with typedef of double and a data size of 8192 I get these results:
GPU average = 153.447380
CPU average = 153.447380
There are at least 2 observations:
The GPU results don't vary between float and double, except in the 5th decimal place
The CPU results vary by 1-20% or so between float and double, but when double is selected, they line up exactly (to the 6th decimal place, anyway) with the GPU results.
Based on this, I believe the CPU is providing the variable, questionable behavior.
Here's my code for reference:
#include <stdio.h>
#include <curand.h>
#include <curand_kernel.h>
#define DSIZE 8192
#define BlockSize 32
#define pi 3.14159f
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
typedef double mytype;
__global__ void InitRNG ( curandState * state, const int seed ){
int tIdx = blockIdx.x*blockDim.x + threadIdx.x;
curand_init (seed, tIdx, 0, &state[tIdx]);
}
__global__
void SortPoints(mytype* X, mytype* Y,const int N, curandState *state){
mytype rdmn1, rdmn2;
unsigned int tIdx = blockIdx.x*blockDim.x + threadIdx.x;
mytype range;
if(tIdx < N){
rdmn1 = curand_uniform(&state[tIdx]);
rdmn2 = curand_uniform(&state[tIdx]);
range = sqrt(0.25e0f*N*rdmn1);
X[tIdx] = range*cos(2.0e0f*pi*rdmn2);
Y[tIdx] = range*sin(2.0e0f*pi*rdmn2);
}
}
__device__
mytype ReduceSum2(mytype In){
__shared__ mytype data[BlockSize];
unsigned int tIdx = threadIdx.x;
data[tIdx] = In;
__syncthreads();
for(unsigned int i = blockDim.x/2; i > 0; i >>= 1){
if(tIdx < i){
data[tIdx] += data[tIdx + i];
}
__syncthreads();
}
return data[0];
}
__global__
void AvgDistance(mytype *X, mytype *Y, mytype *Avg, const int N){
int tIdx = blockIdx.x*blockDim.x + threadIdx.x;
int bIdx = blockIdx.x;
mytype x , y;
mytype d = 0.0f;
if(tIdx < N){
for(int i = tIdx + 1; i < N ; i++){
x = X[tIdx] - X[i];
y = Y[tIdx] - Y[i];
d += 1.0e0f/(sqrt(x*x + y*y));
}
__syncthreads();
Avg[bIdx] = ReduceSum2(d);
}
}
mytype cpu_avg(const mytype *rx, const mytype *ry, const int size){
mytype average = 0.0f;
for(int i = 0; i < size; i++){
for(int j = i + 1; j < size; j++){
average += 1.0e0f/sqrt((rx[i]-rx[j])*(rx[i]-rx[j]) + (ry[i]-ry[j])*(ry[i]-ry[j]));
}
}
average = average/(mytype)size;
return average;
}
int main() {
int Np = DSIZE;
mytype *rx, *ry, *d_rx, *d_ry, *d_Avg, *Avg;
curandState *d_state;
int seed = 1;
dim3 threads(BlockSize,BlockSize);
dim3 blocks((int)ceilf(Np/(float)threads.x),(int)ceilf(Np/(float)threads.y));
printf("number of blocks = %d\n", blocks.x);
printf("number of threads= %d\n", threads.x);
rx = (mytype *)malloc(DSIZE*sizeof(mytype));
if (rx == 0) {printf("malloc fail\n"); return 1;}
ry = (mytype *)malloc(DSIZE*sizeof(mytype));
if (ry == 0) {printf("malloc fail\n"); return 1;}
cudaMalloc((void**)&d_rx, DSIZE * sizeof(mytype));
cudaMalloc((void**)&d_ry, DSIZE * sizeof(mytype));
cudaMalloc((void**)&d_Avg, blocks.x * sizeof(mytype));
cudaMalloc((void**)&d_state, DSIZE * sizeof(curandState));
cudaCheckErrors("cudamalloc");
InitRNG<<<blocks.x,threads.x>>>(d_state,seed);
SortPoints<<<blocks.x,threads.x>>>(d_rx,d_ry,Np,d_state);
AvgDistance<<<blocks.x,threads.x,threads.x*sizeof(mytype)>>>(d_rx,d_ry,d_Avg,Np);
cudaCheckErrors("kernels");
Avg = new mytype[blocks.x];
cudaMemcpy(Avg,d_Avg,blocks.x*sizeof(mytype),cudaMemcpyDeviceToHost);
cudaMemcpy(rx, d_rx, DSIZE*sizeof(mytype),cudaMemcpyDeviceToHost);
cudaMemcpy(ry, d_ry, DSIZE*sizeof(mytype),cudaMemcpyDeviceToHost);
cudaCheckErrors("cudamemcpy");
mytype average = 0;
for(int i = 0; i < blocks.x; i++){
average += Avg[i];
}
average = average/(mytype)Np;
printf("GPU average = %f\n", average);
average = cpu_avg(rx, ry, DSIZE);
printf("CPU average = %f\n", average);
return 0;
}
I am running on RHEL 5.5, CUDA 5.0, Intel Xeon X5560
compiled with:
nvcc -O3 -arch=sm_20 -lcurand -lm -o t93 t93.cu
EDIT:
After observing that the variability was on the CPU side, I found that I could eliminate most of the CPU variability by modifying your CPU averaging code like this:
mytype cpu_avg(const mytype *rx, const mytype *ry, const int size){
mytype average = 0.0f;
mytype temp = 0.0f;
for(int i = 0; i < size; i++){
for(int j = i + 1; j < size; j++){
temp += 1.0e0f/sqrt((rx[i]-rx[j])*(rx[i]-rx[j]) + (ry[i]-ry[j])*(ry[i]-ry[j]));
}
average += temp/(mytype)size;
temp = 0.0f;
}
return average;
}
So I would say there's a problem with intermediate results on the CPU side. It's interesting that it doesn't show up on the GPU result. I suspect the reason for this is that the final summation of GPU averages is done on the CPU (therefore each individual GPU block result is scaled down by the size, e.g. 8192), and these may have an intermediate precision that is sufficient to survive until the final division. If you inlined the CPU average calculation, you may observe something different again.

Cannot read out Values from Texture Memory

Hi I'm writing a simple Program for practicing to work with texture memory. I Just want to write my data into Texture Memory and write it back into Global Memory. But i cannont read out the Values. Here is the code.
#include <stdio.h>
#include <iostream>
#include "cuda.h"
#include <stdlib.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "HelloWorld.h"
#include "linearInterpolation_kernel4.cu"
using namespace std;
using std::cout;
const int blocksize = 16;
__global__
void hello(char *a, int *b) {
a[threadIdx.x] += b[threadIdx.x];
}
////////////////////////////////////////////////////////////////////////////////
// These are CUDA Helper functions
// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)
inline void __checkCudaErrors( cudaError err, const char *file, const int line )
{
if( cudaSuccess != err) {
printf("%s(%i) : CUDA Runtime API error %d: %s.\n",file, line, (int)err, cudaGetErrorString( err ) );
}
}
// This will output the proper error string when calling cudaGetLastError
#define getLastCudaError(msg) __getLastCudaError (msg, __FILE__, __LINE__)
inline void __getLastCudaError( const char *errorMessage, const char *file, const int line )
{
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err) {
printf("%s(%i) : getLastCudaError() CUDA error : %s : (%d) %s.\n", file, line, errorMessage, (int)err, cudaGetErrorString( err ) );
}
}
int main()
{
int N = 40;
float *A;
A = (float *) malloc(N*sizeof(float));
float *B;
B = (float *) malloc(N*sizeof(float));
float *result;
result = (float *) malloc(N*sizeof(float));
float angle = 0.8f;
for(int i = 0; i < N; i++){
A[i] = i; //(float)rand();
B[i] = i+1; //(float)rand();
}
ipLinearTexture2(A,B,result,angle,N);
float result2;
result2 = (angle)*A[4] + (1-angle)*B[4];
printf(" A %f B %f Result %f\n", A[4], B[4], result[4]);
cout << result2 << endl;
return 1;
}
void ipLinearTexture2(float *A, float* B, float* result, float angle, int N)
{
float cuTime;
int N2 = N * 2;
float *dev_result;
float **AB;
AB = (float **) malloc( N * sizeof(float *));
if(AB)
{
for(int i = 0; i < N; i++)
{
AB[i] = (float *) malloc( 2 * sizeof(float *));
}
}
for (int i = 0; i < N; i = i++)
{
AB[i][0] = A[i];
AB[i][1] = B[i];
}
cudaMalloc(&dev_result, N * sizeof(float));
unsigned int size = N2 * sizeof(float);
//cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
cudaArray* cu_array;
checkCudaErrors(cudaMallocArray( &cu_array, &channelDesc,N,2));
cudaMemcpy2DToArray(cu_array,0,0,AB,N * sizeof(float), N * sizeof(float), 2, cudaMemcpyHostToDevice);
// set texture parameters
tex2.normalized = true;
tex2.filterMode = cudaFilterModeLinear;
tex2.addressMode[0] = cudaAddressModeWrap; //cudaAddressModeWrap;
tex2.addressMode[1] = cudaAddressModeWrap; //cudaAddressModeClamp;
checkCudaErrors(cudaBindTextureToArray( tex2, cu_array, channelDesc));
dim3 dimBlock(10, 1, 1);
dim3 dimGrid((int)ceil((double)N*2/dimBlock.x), 1, 1);
transformKernel4<<< 256, 256, 0 >>>( dev_result, N, 2, angle);
checkCudaErrors(cudaMemcpy(result, dev_result, N * sizeof(float), cudaMemcpyDeviceToHost));
cout << "==================================================" << endl;
for (int i = 0 ; i < N ;i++)
{
cout << result[i] << " on " << i << endl;
}
cout << "==================================================" << endl;
checkCudaErrors(cudaUnbindTexture(tex));
checkCudaErrors(cudaFree(dev_result));
checkCudaErrors(cudaFreeArray(cu_array));
}
and here is the kernel code
#ifndef _SIMPLETEXTURE_KERNEL5_H_
#define _SIMPLETEXTURE_KERNEL5_H_
// Texture references
texture<float, 2, cudaReadModeElementType> tex2;
__global__ void
transformKernel4(float* g_odata, int width, int height, float theta)
{
unsigned int xid = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int yid = blockIdx.y * blockDim.y + threadIdx.y;
if (xid >= width || yid >= height) return;
float dx = 1.0f / (float)width;
float dy = 1.0f / (float)height;
float x = ((float)xid + 0.5f) * dx;
float y = ((float)yid + 0.5f) * dy;
float value = tex2D(tex2, x , y);
printf("wert %f xid %i yid %i \n",value, xid, yid);
g_odata[yid * width + xid] = value;
}
#endif // #ifndef _SIMPLETEXTURE_KERNEL_H_
Can somebody tell what i am doing wrong?
I have edited it to remove the first 2 logical mistake. Put why am I need able to print out my data?

It was the wrong binding of the Arrays. You can not use multidimensional Arrays in C that can be copied. You have to use a onedimensional array that respresents a multidimensional.

I can see 2 logical errors here.
The first one is the one pointed out by #asm.
The output should be stored by calculating linear index from 2D x and y indices.
outputIndex = yid * width + xid;
The second one is that the memory allocation for the cudaArray structure is internally aligned.
You should consider using cudaMemcpy2DToArray function to avoid erroneous data copying.
cudaMemcpy2DToArray(cu_array,0,0,AB,N * sizeof(float), N * sizeof(float), 2, cudaMemcpyHostToDevice);

We Keep Coding

html mysql json google-apps-script actionscript-3 ms-access google-chrome google-maps reporting-services sql-server-2008

Element-by-element vector multiplication with CUDA - cuda

You can use cublasZdgmm. cublasStatus_t cublasZdgmm(cublasHandle_t handle, cublasSideMode_t mode, int m, int n, const cuDoubleComplex A, int lda, const cuDoubleComplex x, int incx, cuDoubleComplex *C, int ldc)

Related

CUDA 2nd order recursion with thrust inclusive_scan

Cuda Implementation of Partitioned Subgroup

The Floyd-Warshall algorithm in CUDA

CUDA C/C++: Calculate the average of inverse of distance per point (interaction energy, perhaps?)

Cannot read out Values from Texture Memory

Categories

Resources

We Keep Coding

html mysql json google-apps-script actionscript-3 ms-access google-chrome google-maps reporting-services sql-server-2008

Element-by-element vector multiplication with CUDA - cuda

You can use cublasZdgmm. cublasStatus_t cublasZdgmm(cublasHandle_t handle, cublasSideMode_t mode, int m, int n, const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx, cuDoubleComplex *C, int ldc)

Related

CUDA 2nd order recursion with thrust inclusive_scan

Cuda Implementation of Partitioned Subgroup

The Floyd-Warshall algorithm in CUDA

CUDA C/C++: Calculate the average of inverse of distance per point (interaction energy, perhaps?)

Cannot read out Values from Texture Memory

Categories

Resources

You can use cublasZdgmm. cublasStatus_t cublasZdgmm(cublasHandle_t handle, cublasSideMode_t mode, int m, int n, const cuDoubleComplex A, int lda, const cuDoubleComplex x, int incx, cuDoubleComplex *C, int ldc)