Thrust copy - OutputIterator column-major order - cuda

I have a vector of matrices (stored as column-major arrays) that I want to concatenate vertically. Therefore, I want to use the copy function from the Thrust framework, as in the following example snippet:
int offset = 0;
for(int i = 0; i < matrices.size(); ++i) {
thrust::copy(
thrust::device_ptr<float>(matrices[i]),
thrust::device_ptr<float>(matrices[i]) + rows[i] * cols[i],
thrust::device_ptr<float>(result) + offset
);
offset += rows[i] * cols[i];
}
EDIT: extended example:
The problem is, that if I have a matrix A = [[1, 2, 3], [4, 5, 6]] (2 rows, 3 cols; in memory [1, 4, 2, 5, 3, 6]) and another B = [[7, 8, 9]] (1 row, 3 cols; in memory [7, 8, 9]), the resulting matrix C is not [[1, 2, 3], [4, 5, 6], [7, 8, 9]] (3 row, 3 cols; in memory [1, 4, 7, 2, 5, 8, 3, 6, 9]), but [[1, 5, 7], [4, 3, 8], [2, 6, 9]] (3 row, 3 cols; in memory [1, 4, 2, 5, 3, 6, 7, 8, 9]).
Is there a way to create a special OutputIterator for this problem (I have searched for one, but found nothing), or a fast alternative approach?
EDIT: SSCCE
#include <thrust/host_vector.h>
#include <thrust/generate.h>
#include <thrust/device_vector.h>
#include <vector>
#include <iostream>
void printMat2d(thrust::device_vector<float>& mat, int rows, int cols) {
for(int row = 0; row < rows; ++row) {
for(int col = 0; col < cols; ++col) {
std::cout << mat[row + col * rows] << " ";
}
std::cout << std::endl;
}
}
void printMat1d(thrust::device_vector<float>& mat, int rows, int cols) {
for(int idx = 0; idx < cols*rows; ++idx) {
std::cout << mat[idx] << " ";
}
std::cout << std::endl;
}
void generateMat(thrust::device_vector<float>& mat, int rows, int cols, int add) {
thrust::host_vector<float> matHost(rows * cols);
int val = 0;
for(int row = 0; row < rows; ++row) {
for(int col = 0; col < cols; ++col) {
matHost[row + col * rows] = val + add;
val++;
}
}
mat = matHost;
}
int main() {
std::vector<int> rows(2);
rows[0] = 2;
rows[1] = 3;
std::vector<int> cols(2);
cols[0] = 3;
cols[1] = 3;
//generate matrices
std::vector<thrust::device_vector<float> > matrices(2);
for(size_t i = 0; i < matrices.size(); ++i) {
generateMat(matrices[i], rows[i], cols[i], i*10);
std::cout << "mat_ " << i << " = " << std::endl;
printMat2d(matrices[i], rows[i], cols[i]);
printMat1d(matrices[i], rows[i], cols[i]);
}
//copy
int resultRows = 5;
int resultCols = 3;
thrust::device_vector<float> result(resultRows * resultCols);
int offset = 0;
for(int i = 0; i < matrices.size(); ++i) {
thrust::copy(
matrices[i].begin(),
matrices[i].end(),
result.begin() + offset
);
offset += rows[i] * cols[i];
}
std::cout << "result = " << std::endl;
printMat2d(result, resultRows, resultCols);
printMat1d(result, resultRows, resultCols);
return 0;
}

EDIT: I've replaced my previous answer, which used the strided-range-per-row method, with a slightly different approach that gets the copy operation down to a single thrust call (per matrix to be copied).
The key idea here was to use a functor that converts row-major memory indexing to column-major memory indexing. This functor can then be used with a counting_iterator to create arbitrary row-major to column major memory indices (via make_transform_iterator). These indices can then be used in a permutation_iterator for the source matrix to select the element to be copied and a permutation_iterator for the destination matrix to select the memory position to copy to. For a general review of transform_iterator, counting_iterator, and permutation_iterator, refer to the thrust quick start guide. I happened to be using CUDA 5.0 and thrust 1.5.3 for this exercise.
#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/functional.h>
#include <thrust/copy.h>
#include <vector>
#include <iostream>
struct rm2cm_idx_functor : public thrust::unary_function<int, int>
{
int r;
int c;
rm2cm_idx_functor(int _r, int _c) : r(_r), c(_c) {};
__host__ __device__
int operator() (int idx) {
unsigned my_r = idx/c;
unsigned my_c = idx%c;
return (my_c * r) + my_r;
}
};
typedef float my_type;
void printMat2d(thrust::device_vector<my_type>& mat, int rows, int cols) {
for(int row = 0; row < rows; ++row) {
for(int col = 0; col < cols; ++col) {
std::cout << mat[row + col * rows] << " ";
}
std::cout << std::endl;
}
}
void printMat1d(thrust::device_vector<my_type>& mat, int rows, int cols) {
for(int idx = 0; idx < cols*rows; ++idx) {
std::cout << mat[idx] << " ";
}
std::cout << std::endl;
}
void generateMat(thrust::device_vector<my_type>& mat, int rows, int cols, int add) {
thrust::host_vector<my_type> matHost(rows * cols);
int val = 0;
for(int row = 0; row < rows; ++row) {
for(int col = 0; col < cols; ++col) {
matHost[row + col * rows] = val + add;
val++;
}
}
mat = matHost;
}
void copyMat(thrust::device_vector<my_type>& src, thrust::device_vector<my_type>& dst, unsigned src_rows, unsigned src_cols, unsigned dst_rows, unsigned offset){
thrust::copy_n(
thrust::make_permutation_iterator(src.begin(), thrust::make_transform_iterator(thrust::counting_iterator<int>(0), rm2cm_idx_functor(src_rows, src_cols))),
src_rows*src_cols,
thrust::make_permutation_iterator(dst.begin(), thrust::make_transform_iterator(thrust::counting_iterator<int>(offset), rm2cm_idx_functor(dst_rows, src_cols))));
}
int main() {
std::vector<int> rows(2);
rows[0] = 2;
rows[1] = 3;
std::vector<int> cols(2);
cols[0] = 3;
cols[1] = 3;
//generate matrices
std::vector<thrust::device_vector<my_type> > matrices(2);
for(size_t i = 0; i < matrices.size(); ++i) {
generateMat(matrices[i], rows[i], cols[i], i*10);
std::cout << "mat_ " << i << " = " << std::endl;
printMat2d(matrices[i], rows[i], cols[i]);
printMat1d(matrices[i], rows[i], cols[i]);
}
//copy
int resultRows = 5;
int resultCols = 3;
thrust::device_vector<my_type> result(resultRows * resultCols);
int offset = 0;
for(int i = 0; i < matrices.size(); ++i) {
copyMat(matrices[i], result, rows[i], cols[i], resultRows, offset);
offset += rows[i]*cols[i];
}
std::cout << "result = " << std::endl;
printMat2d(result, resultRows, resultCols);
printMat1d(result, resultRows, resultCols);
return 0;
}
This also assumes that source columns == destination columns, which seems to be implicit in your problem statement. Standard caveat: not saying this is bug free, but it seems to work for the test case built into the original problem statement.
This approach can probably still be improved further. Right now, both the read and the write associated with the thrust::copy_n call are uncoalesced. We could improve this by making one of the two operations coalesced, which would require combining the effect of the index-conversion functor for both read and write into a single mapping functor that takes both source and destination dimensions into account. With a single mapping functor, the first argument of the copy_n call could be just the source vector. I think it should also be possible to use thrust::gather or thrust::scatter instead, but I haven't fully worked that out.
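As one possible illustration of that idea (a sketch I haven't benchmarked, not part of the original answer): if a single functor maps each source storage index directly to its destination storage index, the copy can be expressed with thrust::scatter, so the source is read in a plain sequential (coalesced) pass and only the write side is permuted. Here row_offset counts the rows of the matrices already copied, i.e. the loop would accumulate offset += rows[i] instead of rows[i]*cols[i]:
#include <thrust/device_vector.h>
#include <thrust/scatter.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>

// Maps a column-major storage index of the source matrix to the column-major
// storage index of the same element inside the (taller) destination matrix.
struct src2dst_idx_functor
{
    int src_rows, dst_rows, row_offset;
    src2dst_idx_functor(int sr, int dr, int ro) : src_rows(sr), dst_rows(dr), row_offset(ro) {}
    __host__ __device__
    int operator()(int idx) const {
        int my_r = idx % src_rows;              // row within the source
        int my_c = idx / src_rows;              // column within the source
        return (row_offset + my_r) + my_c * dst_rows;
    }
};

// Copies src (src_rows rows, column-major) into a destination with dst_rows rows
// and the same number of columns, starting at destination row row_offset.
// The source is read sequentially; only the scatter to dst is uncoalesced.
void copyMatScatter(thrust::device_vector<float>& src, thrust::device_vector<float>& dst,
                    int src_rows, int dst_rows, int row_offset)
{
    thrust::scatter(src.begin(), src.end(),
                    thrust::make_transform_iterator(thrust::counting_iterator<int>(0),
                                                    src2dst_idx_functor(src_rows, dst_rows, row_offset)),
                    dst.begin());
}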

Related

CUDA 2nd order recursion with thrust inclusive_scan

I'm trying to understand how to parallelise a recursive calculation. Serially, the calculation takes the form:
for (int i = 2; i<size; i++)
{
result[i] = oldArray[i] + k * result[i-2];
}
For the i-1 case there's a solution in a previous question of mine: CUDA force instruction execution order
I want to modify this to use i-2, but I can't understand how to apply the same process to a 2nd-order calculation. It should be possible using the thrust::inclusive_scan function, but I can't work out how. Does anyone know the solution?
Picking up where we left off in the previous question/answer, we shift our attention to equation 1.11 in the referenced paper by Blelloch. We observe that your problem formulation:
for (int i = 2; i<size; i++)
{
result[i] = oldArray[i] + k * result[i-2];
}
seems to match that in equation 1.11 if we set $m = 2$, and in that case we can also observe that, for your formulation, all $a_{i,1}$ are zero (and, as previously, all $a_{i,2}$ are $k$).
As per equation 1.12 in that paper, our state variable $s_i$ now becomes a two-tuple:
$$s_i = \begin{pmatrix} x_i & x_{i-1} \end{pmatrix}$$
Taking note of these things, we observe the "correctness" of equation 1.13:
$$s_i = \begin{pmatrix} x_{i-1} & x_{i-2} \end{pmatrix} \cdot \begin{pmatrix} 0 & 1 \\ k & 0 \end{pmatrix} + \begin{pmatrix} b_i & 0 \end{pmatrix}$$
Rewriting:
$$s_{i,1} = x_i = k\,x_{i-2} + b_i$$
$$s_{i,2} = x_{i-1} = x_{i-1}$$
(In my view, the other answer leaves you at this point. That realization, i.e. result.data[0] = right + k * left.data[1];, is sufficient for a serial scan but not for a parallel scan. It is also evident that the functor/scan op there is not associative.)
We now need to come up with a binary operator bop that extends the definition in (1.7) to this case. Referring to the previous definition in equation 1.7, and based on the treatment in 1.13, we extend it as follows:
$$C_i = \begin{pmatrix} A_i, & B_i \end{pmatrix}$$
where:
$$A_i = \begin{pmatrix} 0 & 1 \\ k & 0 \end{pmatrix}$$
and:
$$B_i = \begin{pmatrix} b_i & 0 \end{pmatrix}$$
We then have:
$$C_i \;\mathrm{bop}\; C_j = \begin{pmatrix} A_i \cdot A_j, & B_i \cdot A_j + B_j \end{pmatrix}$$
This then becomes the formula for our functor/scan operator. We will need to carry six scalar "state" quantities throughout: two for the $B$ vector and four for the $A$ matrix.
What follows then is a realization of the above:
$ cat t1930.cu
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/scan.h>
#include <thrust/copy.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/constant_iterator.h>
#include <cstdlib>
#include <cstdio>
template <typename T>
void cpufunction(T *result, T *oldArray, size_t size, T k){
for (int i = 2; i<size; i++)
{
result[i] = oldArray[i] + k * result[i-2];
}
}
struct scan_op // as per blelloch (1.7); tuple slots 0-1 hold the B vector, slots 2-5 hold the A matrix in row-major order
{
template <typename T1, typename T2>
__host__ __device__
T1 operator()(const T1 &t1, const T2 &t2){
T1 ret;
thrust::get<0>(ret) = thrust::get<0>(t1)*thrust::get<2>(t2) + thrust::get<1>(t1)*thrust::get<4>(t2)+thrust::get<0>(t2);
thrust::get<1>(ret) = thrust::get<0>(t1)*thrust::get<3>(t2) + thrust::get<1>(t1)*thrust::get<5>(t2)+thrust::get<1>(t2);
thrust::get<2>(ret) = thrust::get<2>(t1)*thrust::get<2>(t2) + thrust::get<3>(t1)*thrust::get<4>(t2);
thrust::get<3>(ret) = thrust::get<2>(t1)*thrust::get<3>(t2) + thrust::get<3>(t1)*thrust::get<5>(t2);
thrust::get<4>(ret) = thrust::get<4>(t1)*thrust::get<2>(t2) + thrust::get<5>(t1)*thrust::get<4>(t2);
thrust::get<5>(ret) = thrust::get<4>(t1)*thrust::get<3>(t2) + thrust::get<5>(t1)*thrust::get<5>(t2);
return ret;
}
};
typedef float mt;
const size_t ds = 512;
const mt k = 1.01;
const int snip = 10;
int main(){
mt *b1 = new mt[ds]; // b as in blelloch (1.5)
mt *cr = new mt[ds]; // cpu result
for (int i = 0; i < ds; i++) { b1[i] = rand()/(float)RAND_MAX;}
cr[0] = b1[0];
cr[1] = b1[1];
cpufunction(cr, b1, ds, k);
for (int i = 0; i < snip; i++) std::cout << cr[i] << ",";
for (int i = ds-snip; i < ds; i++) std::cout << cr[i] << ",";
std::cout << std::endl;
thrust::device_vector<mt> db(b1, b1+ds);
auto b0 = thrust::constant_iterator<mt>(0);
auto a0 = thrust::constant_iterator<mt>(0);
auto a1 = thrust::constant_iterator<mt>(1);
auto a2 = thrust::constant_iterator<mt>(k);
auto a3 = thrust::constant_iterator<mt>(0);
thrust::device_vector<mt> dx1(ds);
thrust::device_vector<mt> dx0(ds);
thrust::device_vector<mt> dy0(ds);
thrust::device_vector<mt> dy1(ds);
thrust::device_vector<mt> dy2(ds);
thrust::device_vector<mt> dy3(ds);
auto my_i_zip = thrust::make_zip_iterator(thrust::make_tuple(db.begin(), b0, a0, a1, a2, a3));
auto my_o_zip = thrust::make_zip_iterator(thrust::make_tuple(dx1.begin(), dx0.begin(), dy0.begin(), dy1.begin(), dy2.begin(), dy3.begin()));
thrust::inclusive_scan(my_i_zip, my_i_zip+ds, my_o_zip, scan_op());
thrust::host_vector<mt> hx1 = dx1;
thrust::copy_n(hx1.begin(), snip, std::ostream_iterator<mt>(std::cout, ","));
thrust::copy_n(hx1.begin()+ds-snip, snip, std::ostream_iterator<mt>(std::cout, ","));
std::cout << std::endl;
}
$ nvcc -std=c++14 t1930.cu -o t1930
$ cuda-memcheck ./t1930
========= CUDA-MEMCHECK
0.840188,0.394383,1.63169,1.19677,2.55965,1.40629,2.92047,2.18858,3.22745,2.76443,570.218,601.275,576.315,607.993,582.947,614.621,589.516,621.699,595.644,628.843,
0.840188,0.394383,1.63169,1.19677,2.55965,1.40629,2.92047,2.18858,3.22745,2.76443,570.219,601.275,576.316,607.994,582.948,614.621,589.516,621.7,595.644,628.843,
========= ERROR SUMMARY: 0 errors
$
Yes, there are some results above that differ in the 6th digit. I attribute this to the limitations of float resolution, taking into account the very different order of operations between the serial and parallel methods. If you change the typedef to double, the results will appear to match exactly.
Since you've asked about it, here's an equivalent realization demonstrated using device data previously allocated with cudaMalloc:
$ cat t1930.cu
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/scan.h>
#include <thrust/copy.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/constant_iterator.h>
#include <cstdlib>
#include <cstdio>
template <typename T>
void cpufunction(T *result, T *oldArray, size_t size, T k){
for (int i = 2; i<size; i++)
{
result[i] = oldArray[i] + k * result[i-2];
}
}
struct scan_op // as per blelloch (1.7)
{
template <typename T1, typename T2>
__host__ __device__
T1 operator()(const T1 &t1, const T2 &t2){
T1 ret;
thrust::get<0>(ret) = thrust::get<0>(t1)*thrust::get<2>(t2) + thrust::get<1>(t1)*thrust::get<4>(t2)+thrust::get<0>(t2);
thrust::get<1>(ret) = thrust::get<0>(t1)*thrust::get<3>(t2) + thrust::get<1>(t1)*thrust::get<5>(t2)+thrust::get<1>(t2);
thrust::get<2>(ret) = thrust::get<2>(t1)*thrust::get<2>(t2) + thrust::get<3>(t1)*thrust::get<4>(t2);
thrust::get<3>(ret) = thrust::get<2>(t1)*thrust::get<3>(t2) + thrust::get<3>(t1)*thrust::get<5>(t2);
thrust::get<4>(ret) = thrust::get<4>(t1)*thrust::get<2>(t2) + thrust::get<5>(t1)*thrust::get<4>(t2);
thrust::get<5>(ret) = thrust::get<4>(t1)*thrust::get<3>(t2) + thrust::get<5>(t1)*thrust::get<5>(t2);
return ret;
}
};
typedef double mt;
const size_t ds = 512;
const mt k = 1.01;
const int snip = 10;
int main(){
mt *b1 = new mt[ds]; // b as in blelloch (1.5)
mt *cr = new mt[ds]; // cpu result
for (int i = 0; i < ds; i++) { b1[i] = rand()/(float)RAND_MAX;}
cr[0] = b1[0];
cr[1] = b1[1];
cpufunction(cr, b1, ds, k);
for (int i = 0; i < snip; i++) std::cout << cr[i] << ",";
for (int i = ds-snip; i < ds; i++) std::cout << cr[i] << ",";
std::cout << std::endl;
mt *db;
cudaMalloc(&db, ds*sizeof(db[0]));
cudaMemcpy(db, b1, ds*sizeof(db[0]), cudaMemcpyHostToDevice);
thrust::device_ptr<mt> dp_db = thrust::device_pointer_cast(db);
auto b0 = thrust::constant_iterator<mt>(0);
auto a0 = thrust::constant_iterator<mt>(0);
auto a1 = thrust::constant_iterator<mt>(1);
auto a2 = thrust::constant_iterator<mt>(k);
auto a3 = thrust::constant_iterator<mt>(0);
thrust::device_vector<mt> dx1(ds);
thrust::device_vector<mt> dx0(ds);
thrust::device_vector<mt> dy0(ds);
thrust::device_vector<mt> dy1(ds);
thrust::device_vector<mt> dy2(ds);
thrust::device_vector<mt> dy3(ds);
auto my_i_zip = thrust::make_zip_iterator(thrust::make_tuple(dp_db, b0, a0, a1, a2, a3));
auto my_o_zip = thrust::make_zip_iterator(thrust::make_tuple(dx1.begin(), dx0.begin(), dy0.begin(), dy1.begin(), dy2.begin(), dy3.begin()));
thrust::inclusive_scan(my_i_zip, my_i_zip+ds, my_o_zip, scan_op());
cudaMemcpy(cr, thrust::raw_pointer_cast(dx1.data()), ds*sizeof(cr[0]), cudaMemcpyDeviceToHost);
for (int i = 0; i < snip; i++) std::cout << cr[i] << ",";
for (int i = ds-snip; i < ds; i++) std::cout << cr[i] << ",";
std::cout << std::endl;
}
$ nvcc -std=c++14 t1930.cu -o t1930
$ cuda-memcheck ./t1930
========= CUDA-MEMCHECK
0.840188,0.394383,1.63169,1.19677,2.55965,1.40629,2.92047,2.18858,3.22745,2.76443,570.219,601.275,576.316,607.994,582.948,614.622,589.516,621.7,595.645,628.844,
0.840188,0.394383,1.63169,1.19677,2.55965,1.40629,2.92047,2.18858,3.22745,2.76443,570.219,601.275,576.316,607.994,582.948,614.622,589.516,621.7,595.645,628.844,
========= ERROR SUMMARY: 0 errors
There should be no significant performance difference between these two approaches. (However, I happened to switch the typedef to double for this example, so that does make a difference.) Using cudaMalloc as an alternative to device_vector for the various state vectors (dx0, dx1, dy0, dy1, ...) may be slightly faster, because device_vector first does a cudaMalloc-style allocation and then launches a kernel to zero out the allocation. This zeroing step is unnecessary for the state vectors. The pattern given here should demonstrate how you could do that, if you are interested.
Here's a version that eliminates use of thrust::device_vector and thrust::host_vector altogether:
#include <iostream>
#include <thrust/device_ptr.h>
#include <thrust/scan.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/constant_iterator.h>
#include <cstdlib>
template <typename T>
void cpufunction(T *result, T *oldArray, size_t size, T k){
for (int i = 2; i<size; i++)
{
result[i] = oldArray[i] + k * result[i-2];
}
}
struct scan_op // as per blelloch (1.7)
{
template <typename T1, typename T2>
__host__ __device__
T1 operator()(const T1 &t1, const T2 &t2){
T1 ret;
thrust::get<0>(ret) = thrust::get<0>(t1)*thrust::get<2>(t2) + thrust::get<1>(t1)*thrust::get<4>(t2)+thrust::get<0>(t2);
thrust::get<1>(ret) = thrust::get<0>(t1)*thrust::get<3>(t2) + thrust::get<1>(t1)*thrust::get<5>(t2)+thrust::get<1>(t2);
thrust::get<2>(ret) = thrust::get<2>(t1)*thrust::get<2>(t2) + thrust::get<3>(t1)*thrust::get<4>(t2);
thrust::get<3>(ret) = thrust::get<2>(t1)*thrust::get<3>(t2) + thrust::get<3>(t1)*thrust::get<5>(t2);
thrust::get<4>(ret) = thrust::get<4>(t1)*thrust::get<2>(t2) + thrust::get<5>(t1)*thrust::get<4>(t2);
thrust::get<5>(ret) = thrust::get<4>(t1)*thrust::get<3>(t2) + thrust::get<5>(t1)*thrust::get<5>(t2);
return ret;
}
};
typedef float mt;
const size_t ds = 32768*4;
const mt k = 1.001;
const int snip = 10;
int main(){
mt *b1 = new mt[ds]; // b as in blelloch (1.5)
mt *cr = new mt[ds]; // result
for (int i = 0; i < ds; i++) { b1[i] = (rand()/(float)RAND_MAX)-0.5;}
cr[0] = b1[0];
cr[1] = b1[1];
cpufunction(cr, b1, ds, k);
for (int i = 0; i < snip; i++) std::cout << cr[i] << ",";
for (int i = ds-snip; i < ds; i++) std::cout << cr[i] << ",";
std::cout << std::endl;
mt *db, *dstate;
cudaMalloc(&db, ds*sizeof(db[0]));
cudaMalloc(&dstate, 6*ds*sizeof(dstate[0]));
cudaMemcpy(db, b1, ds*sizeof(db[0]), cudaMemcpyHostToDevice);
thrust::device_ptr<mt> dp_db = thrust::device_pointer_cast(db);
auto b0 = thrust::constant_iterator<mt>(0);
auto a0 = thrust::constant_iterator<mt>(0);
auto a1 = thrust::constant_iterator<mt>(1);
auto a2 = thrust::constant_iterator<mt>(k);
auto a3 = thrust::constant_iterator<mt>(0);
thrust::device_ptr<mt> dx1 = thrust::device_pointer_cast(dstate);
thrust::device_ptr<mt> dx0 = thrust::device_pointer_cast(dstate+ds);
thrust::device_ptr<mt> dy0 = thrust::device_pointer_cast(dstate+2*ds);
thrust::device_ptr<mt> dy1 = thrust::device_pointer_cast(dstate+3*ds);
thrust::device_ptr<mt> dy2 = thrust::device_pointer_cast(dstate+4*ds);
thrust::device_ptr<mt> dy3 = thrust::device_pointer_cast(dstate+5*ds);
auto my_i_zip = thrust::make_zip_iterator(thrust::make_tuple(dp_db, b0, a0, a1, a2, a3));
auto my_o_zip = thrust::make_zip_iterator(thrust::make_tuple(dx1, dx0, dy0, dy1, dy2, dy3));
thrust::inclusive_scan(my_i_zip, my_i_zip+ds, my_o_zip, scan_op());
cudaMemcpy(cr, dstate, ds*sizeof(cr[0]), cudaMemcpyDeviceToHost);
for (int i = 0; i < snip; i++) std::cout << cr[i] << ",";
for (int i = ds-snip; i < ds; i++) std::cout << cr[i] << ",";
std::cout << std::endl;
}
Here is some CPU code which shows a possible implementation of the formula derived from https://www.cs.cmu.edu/~guyb/papers/Ble93.pdf to express higher-order recurrences as a scan operation.
The key idea is that each element of the scan result is not a scalar, but a vector which contains the n previous scalar results. This way, all the required previous results are available in the scan operator to compute the next result.
#include <iostream>
#include <algorithm>
#include <numeric>
#include <array>
#include <vector>
void calculate1(std::vector<int> vec, int k){
std::vector<int> result(vec.size(), 0);
for(int i = 2; i < vec.size(); i++){
result[i] = vec[i] + k * result[i-2];
}
std::cerr << "calculate1 result: ";
for(auto x : result){
std::cerr << x << ", ";
}
std::cerr << "\n";
}
struct S{
//data[0] stores result of last iteration
//data[1] stores result of second last iteration
std::array<int, 2> data;
};
std::ostream& operator<<(std::ostream& os, S s){
os << "(" << s.data[0] << "," << s.data[1] << ")";
return os;
}
void calculate2(std::vector<int> vec, int k){
S initvalue{{0,0}};
std::vector<S> result(vec.size(), initvalue);
std::exclusive_scan(
vec.begin() + 2,
vec.end(),
result.begin(),
initvalue,
[k](S left, int right){
S result;
/*A = (
0 1
k 0
)
Compute result = left * A + (right 0)
*/
result.data[0] = right + k * left.data[1];
result.data[1] = left.data[0];
return result;
}
);
std::cerr << "calculate2 result: ";
for(auto x : result){
std::cerr << x << ", ";
}
std::cerr << "\n";
}
int main(){
const int k = 5;
const std::vector<int> vec1{1,3,5,7,9,11,3,6,7,1,2,4};
calculate1(vec1, k);
calculate2(vec1, k);
}
https://godbolt.org/z/cszzn8Ec8
Output:
calculate1 result: 0, 0, 5, 7, 34, 46, 173, 236, 872, 1181, 4362, 5909,
calculate2 result: (0,0), (5,0), (7,5), (34,7), (46,34), (173,46), (236,173), (872,236), (1181,872), (4362,1181), (0,0), (0,0),
There is still an off-by-one error somewhere, but one can get the idea behind it.
I previously said that this approach can be used for a parallel scan in CUDA. This is not correct. For a parallel scan, the scan operator must have an additional property, which is associativity, i.e. (a OP b) OP c == a OP (b OP c). This is not the case in this approach.
Robert Crovella's answer shows how to derive an associative scan operator which can be used for a parallel scan.
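For completeness, that associative operator could be written in the same plain C++ style as above (a sketch of the (A, B)-pair combine described in the other answer, not tested code): each scan element carries a 2x2 matrix A and a 2-element row vector B, and combining corresponds to composing the affine maps x -> x*A + B, which is associative.
#include <array>

// Associative combine: (Ci bop Cj) = (Ai*Aj, Bi*Aj + Bj)
struct AB {
    std::array<std::array<int, 2>, 2> A; // 2x2 matrix
    std::array<int, 2> B;                // row vector
};

AB bop(const AB& ci, const AB& cj) {
    AB r;
    for (int p = 0; p < 2; ++p)
        for (int q = 0; q < 2; ++q)
            r.A[p][q] = ci.A[p][0] * cj.A[0][q] + ci.A[p][1] * cj.A[1][q];
    for (int q = 0; q < 2; ++q)
        r.B[q] = ci.B[0] * cj.A[0][q] + ci.B[1] * cj.A[1][q] + cj.B[q];
    return r;
}

// Per input element b_i one would use A_i = {{0, 1}, {k, 0}} and B_i = {b_i, 0};
// with a zero initial state, the recurrence value x_i is B[0] of the i-th scan result.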

Should I declare a double array with the GPU block number on the inner or outer dimension?

E.g., should I do
int payload[LEN][BLOCKS];
or
int payload[BLOCKS][LEN];
where LEN is a very large number.
I plan to have each block traverse the double array, holding the block dimension constant and iterating over the LEN dimension.
Assuming you're going to access the data in a block-oriented manner, you want the latter (payload[BLOCKS][LEN]). This is presumably because, when you load the first element along the LEN dimension, you have already paid the cache-miss cost for the subsequent ~7 elements in the same cache line. With the first option, there is probably some sharing of cache lines between GPU blocks, but that sharing is relatively limited and less effective.
Indeed, the code below reports that the second option requires 0.481 seconds to execute, while the first option requires 0.979 seconds: arranging the data with the block on the outer dimension is about twice as fast.
#include <cuda_runtime_api.h>
#include <cuda.h>
#include <string>
#include <chrono>
#include <iostream>
#define BLOCKS 80
#define LEN (1 << 20)
void CheckCudaErrorAux (const char *file, unsigned line, const char *statement, cudaError_t err) {
if (err == cudaSuccess)
return;
std::cerr << statement<<" returned " << cudaGetErrorString(err) << "("<<err<< ") at "<<file<<":"<<line << std::endl;
exit (1);
}
#define CUDA_CHECK_RETURN(value) CheckCudaErrorAux(__FILE__,__LINE__, #value, value)
struct Data1 {
int payload[LEN][BLOCKS];
};
struct Data2 {
int payload[BLOCKS][LEN];
};
__global__ void f1(Data1 * data1) {
int sum = 0;
for (int i = 0; i < LEN; ++i) {
sum += data1->payload[i][blockIdx.x];
}
printf("block %i has f1 sum %i\n", blockIdx.x, sum);
}
__global__ void f2(Data2 * data2) {
int sum = 0;
for (int i = 0; i < LEN; ++i) {
sum += data2->payload[blockIdx.x][i];
}
printf("block %i has f2 sum %i\n", blockIdx.x, sum);
}
int main() {
Data1 * data1 = (Data1 *) malloc(sizeof(Data1));
Data2 * data2 = (Data2 *) malloc(sizeof(Data2));
for (int i = 0; i < LEN; ++i) {
for (int b = 0; b < BLOCKS; ++b) {
data1->payload[i][b] = i * b;
data2->payload[b][i] = i * b;
}
}
Data1 * data1_on_gpu;
CUDA_CHECK_RETURN(cudaMalloc(&data1_on_gpu, sizeof(Data1)));
Data2 * data2_on_gpu;
cudaMalloc(&data2_on_gpu, sizeof(Data2));
CUDA_CHECK_RETURN(cudaDeviceSynchronize());
cudaMemcpy(data1_on_gpu, data1, sizeof(Data1), cudaMemcpyHostToDevice);
cudaMemcpy(data2_on_gpu, data2, sizeof(Data2), cudaMemcpyHostToDevice);
CUDA_CHECK_RETURN(cudaDeviceSynchronize());
std::chrono::time_point<std::chrono::system_clock> t1 = std::chrono::system_clock::now();
f1<<<80,1>>>(data1_on_gpu);
CUDA_CHECK_RETURN(cudaDeviceSynchronize());
std::chrono::time_point<std::chrono::system_clock> t2 = std::chrono::system_clock::now();
f2<<<80,1>>>(data2_on_gpu);
CUDA_CHECK_RETURN(cudaDeviceSynchronize());
std::chrono::time_point<std::chrono::system_clock> t3 = std::chrono::system_clock::now();
std::chrono::duration<double> duration_1_to_2 = t2 - t1;
std::chrono::duration<double> duration_2_to_3 = t3 - t2;
duration_1_to_2.count();
printf("timer for 1st took %.3lf\n", duration_1_to_2.count());
printf("timer for 2nd took %.3lf\n", duration_2_to_3.count());
}

The Floyd-Warshall algorithm in CUDA

This is the sequential piece of code I am trying to parallelize in CUDA
/*
Sequential (Single Thread) APSP on CPU.
*/
void floyd_sequential(int *mat, const size_t N)
{
for(int k = 0; k < N; k ++)
for(int i = 0; i < N; i ++)
for(int j = 0; j < N; j ++)
{
int i0 = i*N + j;
int i1 = i*N + k;
int i2 = k*N + j;
if(mat[i1] != -1 && mat[i2] != -1)
mat[i0] = (mat[i0] != -1 && mat[i0] < mat[i1] + mat[i2]) ?
mat[i0] : (mat[i1] + mat[i2]);
}
}
This is my CUDA implementation
// ParallelComputing.cpp : Defines the entry point for the console application.
//
#include <stdio.h>
#include <cuda.h>
#include <stdlib.h>
#define DIMENSION 10;
__global__ void gpu_Floyd(int *result, int N)
{
int j,k;
int Row = blockIdx.y * blockDim.y + threadIdx.y;
for(k = 0; k < N; k++)
{
for(j = 0; j < N; j++)
{
int i0 = Row * N + j;
int i1 = Row * N + k;
int i2 = k * N + j;
if(result[i0] != -1 && result[i2] != -1)
result[i0] = (result[i0] != -1 && result[i0] < result[i1] + result[i2]) ?
result[i0] : (result[i1] + result[i2]);
__syncthreads();
}
}
}
void GenMatrix(int *mat, const size_t N)
{
for(int i = 0; i < N*N; i ++)
mat[i] = rand()%32 - 1;
}
bool CmpArray(const int *l, const int *r, const size_t eleNum)
{
for(int i = 0; i < eleNum; i ++)
if(l[i] != r[i])
{
printf("ERROR: l[%d] = %d, r[%d] = %d\n", i, l[i], i, r[i]);
return false;
}
return true;
}
int main(int argc, char **argv)
{
// generate a random matrix.
size_t N = 10;
int *mat = (int*)malloc(sizeof(int)*N*N);
GenMatrix(mat, N);
// compute the reference result.
int *ref = (int*)malloc(sizeof(int)*N*N);
memcpy(ref, mat, sizeof(int)*N*N);
Floyd_sequential(ref, N);
//CUDA Portion
int Grid_Dim_x = 1, Grid_Dim_y = 1;
int noThreads_x, noThreads_y;
int *result = (int*)malloc(sizeof(int)*N*N);
memcpy(result, mat, sizeof(int)*N*N);
int *d_result;
// compute your results
cudaMalloc((void **)&d_result, N*N);
cudaMemcpy(result, N * N, cudaMemcpyHostToDevice);
gpu_Floyd<<<1024, 256>>>(d_result, N);
cudaMemcpy(result, d_result, cudaMemcpyDeviceToHost);
// compare your result with reference result
if(CmpArray(result, ref, N*N))
printf("The matrix matches.\n");
else
printf("The matrix do not match.\n");
free(ref);
free(result);
cudaFree(d_result);
}
However, my output always shows that the matrices do not match.
I understand that in CUDA we usually try to map each element of the matrix to a thread. However, I am trying to explore other possibilities by mapping each row of the matrix to a thread instead.
As has already been mentioned, your provided GPU code does not compile, so I'm curious how you got to the observation that your output matrices do not match.
Here are some of the problems with your code:
cudaMalloc, just like malloc allocates bytes, so this is not correct:
cudaMalloc((void **)&d_result, N*N);
instead you want this:
cudaMalloc((void **)&d_result, N*N*sizeof(int));
Likewise, cudaMemcpy, just like memcpy, operates on bytes; furthermore, cudaMemcpy requires 4 parameters, so this is not correct:
cudaMemcpy(result, N * N, cudaMemcpyHostToDevice);
instead you probably want this:
cudaMemcpy(d_result, result, N * N*sizeof(int), cudaMemcpyHostToDevice);
and your other cudaMemcpy line needs to be fixed similarly.
I'd also advise doing proper cuda error checking.
Your kernel is written as if it's expecting a 2-dimensional thread array (or at least one that is one-dimensional in y), whereas you are launching a one-dimensional grid in x:
gpu_Floyd<<<1024, 256>>>(d_result, N);
therefore all your kernel built-in variables in y will be 1 or 0 always, and this line of code:
int Row = blockIdx.y * blockDim.y + threadIdx.y;
will evaluate to zero for all threads in your 1-D grid in x.
Your gpu kernel is putting the results in the same matrix as the input data. For sequential code this may or may not matter, but for code that is intended to run in parallel, it can often lead to race conditions, because the order of operations (i.e. order of thread execution) is largely undefined.
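Putting the sizing, copy, and launch-configuration points above together, the host-side portion might look roughly like this (a sketch, not a tested program; the kernel itself would additionally need a bounds check such as if (Row >= N) return;, and ideally separate input and output buffers to avoid the in-place issue just mentioned):
int *d_result;
cudaMalloc((void **)&d_result, N*N*sizeof(int));
cudaMemcpy(d_result, result, N*N*sizeof(int), cudaMemcpyHostToDevice);

// one thread per matrix row along y, to match the kernel's Row computation
dim3 block(1, 256);
dim3 grid(1, (N + block.y - 1)/block.y);
gpu_Floyd<<<grid, block>>>(d_result, N);

cudaMemcpy(result, d_result, N*N*sizeof(int), cudaMemcpyDeviceToHost);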
Below you will find a canonical, simple implementation of the Floyd-Warshall algorithm in CUDA.
The CUDA code is accompanied by a sequential implementation, and both are based on the simplifying assumption that the edges are non-negative. The full minimum-distance paths are also reconstructed in both cases. Despite the simplifying assumption, it should be possible to grasp the relevant parallelization idea, namely that a two-dimensional thread grid is exploited: each thread along x is assigned to a matrix column, while each block along y is assigned to a matrix row. In this way, the needed element of the k-th column is loaded into shared memory by the threadIdx.x == 0 thread of each block.
// --- Assumption: graph with positive edges
#include <stdio.h>
#include <string>
#include <map>
#include <iostream>
#include <fstream>
#include "Utilities.cuh"
#define BLOCKSIZE 256
using namespace std;
map<string, int> nameToNum; // --- names of vertices
map<string, map<string, int>> weightMap; // --- weights of edges
/************************/
/* READ GRAPH FROM FILE */
/************************/
int *readGraphFromFile(int &N, char *fileName) {
string vertex1, vertex2;
ifstream graphFile;
int currentWeight;
N = 0; // --- Init the number of found vertices
graphFile.open(fileName); // --- Open the graph file
graphFile >> vertex1; // --- Read first vertex
while(vertex1 != "--END--") { // --- Loop until the end-of-file marker is found
graphFile >> vertex2; // --- Read second vertex
graphFile >> currentWeight; // --- Read weight between first and second vertex
if (nameToNum.count(vertex1) == 0) { // --- If vertex has not yet been added ...
nameToNum[vertex1] = N; // assign a progressive number to the vertex
weightMap[vertex1][vertex1] = 0; // assign a zero weight to the "self-edge"
N++; // --- Update the found number of vertices
}
if (nameToNum.count(vertex2) == 0) {
nameToNum[vertex2] = N;
weightMap[vertex2][vertex2] = 0;
N++;
}
weightMap[vertex1][vertex2] = currentWeight; // --- Update weight between vertices 1 and 2
graphFile >> vertex1;
}
graphFile.close(); // --- Close the graph file
// --- Construct the array
int *weightMatrix = (int*) malloc(N * N * sizeof(int));
// --- Loop over all the vertex couples in the weights matrix
for (int ii = 0; ii < N; ii++)
for (int jj = 0; jj < N; jj++)
weightMatrix[ii * N + jj] = INT_MAX / 2; // --- Init the weights matrix elements to infinity
map<string, int>::iterator i, j;
// --- Loop over all the vertex couples in the map
// (*i).first and (*j).first are the vertex-name keys of the map, while (*i).second and (*j).second are their corresponding indices
for (i = nameToNum.begin(); i != nameToNum.end(); ++i)
for (j = nameToNum.begin(); j != nameToNum.end(); ++j) {
// --- If there is a connection between vertices (*i).first and (*j).first, then update the weight matrix
if (weightMap[(*i).first].count((*j).first) != 0)
weightMatrix[N * (*i).second + (*j).second] = weightMap[(*i).first][(*j).first];
}
return weightMatrix;
}
/************************************/
/* PRINT MINIMUM DISTANCES FUNCTION */
/************************************/
void printMinimumDistances(int N, int *a) {
map<string, int>::iterator i;
// --- Prints all the node labels at the first row
for (i = nameToNum.begin(); i != nameToNum.end(); ++i) printf("\t%s", i->first.c_str());
printf("\n");
i = nameToNum.begin();
// --- Loop over the rows
for (int p = 0; p < N; p++) {
printf("%s\t", i -> first.c_str());
// --- Loop over the columns
for (int q = 0; q < N; q++) {
int dd = a[p * N + q];
if (dd != INT_MAX / 2) printf("%d\t", dd);
else printf("--\t");
}
printf("\n");
i++;
}
}
void printPathRecursive(int row, int col, int *minimumDistances, int *path, int N) {
map<string, int>::iterator i = nameToNum.begin();
map<string, int>::iterator j = nameToNum.begin();
if (row == col) {advance(i, row); printf("%s\t", i -> first.c_str()); }
else {
if (path[row * N + col] == INT_MAX / 2) printf("%d %d %d No path exists\t\n", minimumDistances[row * N + col], row, col);
else {
printPathRecursive(row, path[row * N + col], minimumDistances, path, N);
advance(j, col);
printf("%s\t", j -> first.c_str());
}
}
}
void printPath(int N, int *minimumDistances, int *path) {
map<string, int>::iterator i;
map<string, int>::iterator j;
// --- Loop over the rows
i = nameToNum.begin();
for (int p = 0; p < N; p++) {
// --- Loop over the columns
j = nameToNum.begin();
for (int q = 0; q < N; q++) {
printf("From %s to %s\t", i -> first.c_str(), j -> first.c_str());
printPathRecursive(p, q, minimumDistances, path, N);
printf("\n");
j++;
}
i++;
}
}
/**********************/
/* FLOYD-WARSHALL CPU */
/**********************/
void h_FloydWarshall(int *h_graphMinimumDistances, int *h_graphPath, const int N) {
for (int k = 0; k < N; k++)
for (int row = 0; row < N; row++)
for (int col = 0; col < N; col++) {
if (h_graphMinimumDistances[row * N + col] > (h_graphMinimumDistances[row * N + k] + h_graphMinimumDistances[k * N + col])) {
h_graphMinimumDistances[row * N + col] = (h_graphMinimumDistances[row * N + k] + h_graphMinimumDistances[k * N + col]);
h_graphPath[row * N + col] = h_graphPath[k * N + col];
}
}
}
/*************************/
/* FLOYD-WARSHALL KERNEL */
/*************************/
__global__ void d_FloydWarshall(int k, int *d_graphMinimumDistances, int *d_graphPath, int N) {
int col = blockIdx.x * blockDim.x + threadIdx.x; // --- Each thread along x is assigned to a matrix column
int row = blockIdx.y; // --- Each block along y is assigned to a matrix row
if (col >= N) return;
int arrayIndex = N * row + col;
// --- Thread 0 of each block loads the (row, k) element of the distance matrix into shared memory, making it visible to the whole block
__shared__ int d_graphMinimumDistances_row_k;
if(threadIdx.x == 0) d_graphMinimumDistances_row_k = d_graphMinimumDistances[N * row + k];
__syncthreads();
if (d_graphMinimumDistances_row_k == INT_MAX / 2) // --- If element (row, k) = infinity, no update is needed
return;
int d_graphMinimumDistances_k_col = d_graphMinimumDistances[k * N + col];
if(d_graphMinimumDistances_k_col == INT_MAX / 2) // --- If element (k, col) = infinity, no update is needed
return;
int candidateBetterDistance = d_graphMinimumDistances_row_k + d_graphMinimumDistances_k_col;
if (candidateBetterDistance < d_graphMinimumDistances[arrayIndex]) {
d_graphMinimumDistances[arrayIndex] = candidateBetterDistance;
d_graphPath[arrayIndex] = d_graphPath[k * N + col];
}
}
/********/
/* MAIN */
/********/
int main() {
int N = 0; // --- Number of vertices
// --- Read graph array from file
int *h_graphArray = readGraphFromFile(N, "graph2.txt");
printf("\n******************\n");
printf("* Original graph *\n");
printf("******************\n");
printMinimumDistances(N, h_graphArray);
// --- Floyd-Warshall on CPU
int *h_graphMinimumDistances = (int *) malloc(N * N * sizeof(int));
int *h_graphPath = (int *) malloc(N * N * sizeof(int));
memcpy(h_graphMinimumDistances, h_graphArray, N * N * sizeof(int));
for (int k = 0; k < N; k++)
for (int l = 0; l < N; l++)
if (h_graphArray[k * N + l] == INT_MAX / 2) h_graphPath[k * N + l] = INT_MAX / 2;
else h_graphPath[k * N + l] = k;
h_FloydWarshall(h_graphMinimumDistances, h_graphPath, N);
printf("\n*************************\n");
printf("* CPU result: distances *\n");
printf("*************************\n");
printMinimumDistances(N, h_graphMinimumDistances);
printf("\n********************\n");
printf("* CPU result: path *\n");
printf("********************\n");
printPath(N, h_graphMinimumDistances, h_graphPath);
// --- Graph array device allocation and host-device memory transfer
int *d_graphMinimumDistances; gpuErrchk(cudaMalloc(&d_graphMinimumDistances, N * N * sizeof(int)));
gpuErrchk(cudaMemcpy(d_graphMinimumDistances, h_graphArray, N * N * sizeof(int), cudaMemcpyHostToDevice));
int *d_graphPath; gpuErrchk(cudaMalloc(&d_graphPath, N * N * sizeof(int)));
for (int k = 0; k < N; k++)
for (int l = 0; l < N; l++)
if (h_graphArray[k * N + l] == INT_MAX / 2) h_graphPath[k * N + l] = INT_MAX / 2;
else h_graphPath[k * N + l] = k;
gpuErrchk(cudaMemcpy(d_graphPath, h_graphPath, N * N * sizeof(int), cudaMemcpyHostToDevice));
// --- Iterations
for (int k = 0; k < N; k++) {
d_FloydWarshall <<<dim3(iDivUp(N, BLOCKSIZE), N), BLOCKSIZE>>>(k, d_graphMinimumDistances, d_graphPath, N);
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
}
// --- Copy results back to the host
gpuErrchk(cudaMemcpy(h_graphMinimumDistances, d_graphMinimumDistances, N * N * sizeof(int), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_graphPath, d_graphPath, N * N * sizeof(int), cudaMemcpyDeviceToHost));
printf("\n**************\n");
printf("* GPU result *\n");
printf("**************\n");
printMinimumDistances(N, h_graphMinimumDistances);
printf("\n********************\n");
printf("* GPU result: path *\n");
printf("********************\n");
printPath(N, h_graphMinimumDistances, h_graphPath);
}

Summing the rows of a matrix (stored in either row-major or column-major order) in CUDA

I'm working on the problem of summing the rows of a matrix in CUDA, and I give the following example.
Suppose we have the following 20 * 4 array:
1 2 3 4
4 1 2 3
3 4 1 2
1 2 3 4
.
.
.
2 1 3 4
After flattening the 2D array to a 1D array (either in row-major or column-major order), I need to assign each thread to a different row and calculate the cost for that row.
For example
- thread 1 should calculate the cost for 1 2 3 4
- thread 2 should calculate the cost for 4 1 2 3
How can I do that in CUDA?
Thank you all for the reply
#include <stdio.h>
#include <stdlib.h>
#define MROWS 20
#define NCOLS 4
#define nTPB 256
__global__ void mykernel(int *costdata, int rows, int cols, int *results){
int tidx = threadIdx.x + blockDim.x*blockIdx.x;
if (tidx < rows){
int mycost = 0;
for (int i = 0; i < cols; i++)
mycost += costdata[(tidx*cols)+i];
results[tidx] = mycost;
}
}
int main(){
//define and initialize host and device storage for cost and results
int *d_costdata, *h_costdata, *d_results, *h_results;
h_results = (int *)malloc(MROWS*sizeof(int));
h_costdata = (int *)malloc(MROWS*NCOLS*sizeof(int));
for (int i=0; i<(MROWS*NCOLS); i++)
h_costdata[i] = rand()%4;
cudaMalloc((void **)&d_results, MROWS*sizeof(int));
cudaMalloc((void **)&d_costdata, MROWS*NCOLS*sizeof(int));
//copy cost data from host to device
cudaMemcpy(d_costdata, h_costdata, MROWS*NCOLS*sizeof(int), cudaMemcpyHostToDevice);
mykernel<<<(MROWS + nTPB - 1)/nTPB, nTPB>>>(d_costdata, MROWS, NCOLS, d_results);
// copy results back from device to host
cudaMemcpy(h_results, d_results, MROWS*sizeof(int), cudaMemcpyDeviceToHost);
for (int i=0; i<MROWS; i++){
int loc_cost = 0;
for (int j=0; j<NCOLS; j++) loc_cost += h_costdata[(i*NCOLS)+j];
printf("cost[%d]: host= %d, device = %d\n", i, loc_cost, h_results[i]);
}
}
This assumes the "cost" of each row is just the sum of the elements in that row. If you have a different "cost" function, you can modify the activity in the kernel for-loop accordingly. This also assumes C-style row-major data storage (1 2 3 4 4 1 2 3 3 4 1 2 etc.).
If you instead use column-major storage (1 4 3 etc.), you can slightly improve the performance, since the data reads can be fully coalesced. Then your kernel code could look like this:
for (int i = 0; i < cols; i++)
mycost += costdata[(i*rows)+tidx];
You should also use proper cuda error checking on all CUDA API calls and kernel calls.
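For reference, a minimal error-checking helper along those lines might look like this (an illustrative sketch, not part of the original answer; the macro name is made up here):
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Wrap every CUDA API call; after a kernel launch, check cudaGetLastError()
// and cudaDeviceSynchronize() the same way.
#define CUDA_CHECK(call) \
do { \
    cudaError_t err = (call); \
    if (err != cudaSuccess) { \
        fprintf(stderr, "CUDA error %s at %s:%d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
        exit(1); \
    } \
} while (0)

// usage:
// CUDA_CHECK(cudaMemcpy(d_costdata, h_costdata, MROWS*NCOLS*sizeof(int), cudaMemcpyHostToDevice));
// mykernel<<<(MROWS + nTPB - 1)/nTPB, nTPB>>>(d_costdata, MROWS, NCOLS, d_results);
// CUDA_CHECK(cudaGetLastError());
// CUDA_CHECK(cudaDeviceSynchronize());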
EDIT: As discussed in the comments below, for the row-major storage case, in some situations it may be possible to increase memory efficiency by electing to load 16-byte quantities rather than the base type. The following is a modified version that implements this idea for arbitrary dimensions and (more or less) arbitrary base types:
#include <iostream>
#include <typeinfo>
#include <cstdlib>
#include <vector_types.h>
#define MROWS 1742
#define NCOLS 801
#define nTPB 256
typedef double mytype;
__host__ int sizetype(){
int size = 0;
if ((typeid(mytype) == typeid(float)) || (typeid(mytype) == typeid(int)) || (typeid(mytype) == typeid(unsigned int)))
size = 4;
else if (typeid(mytype) == typeid(double))
size = 8;
else if ((typeid(mytype) == typeid(unsigned char)) || (typeid(mytype) == typeid(char)))
size = 1;
return size;
}
template<typename T>
__global__ void mykernel(const T *costdata, int rows, int cols, T *results, int size, size_t pitch){
int chunk = 16/size; // assumes size is a factor of 16
int tidx = threadIdx.x + blockDim.x*blockIdx.x;
if (tidx < rows){
T *myrowptr = (T *)(((unsigned char *)costdata) + tidx*pitch);
T mycost = (T)0;
int count = 0;
while (count < cols){
if ((cols-count)>=chunk){
// read 16 bytes
int4 temp = *((int4 *)(myrowptr + count));
int bcount = 16;
int j = 0;
while (bcount > 0){
mycost += *(((T *)(&temp)) + j++);
bcount -= size;
count++;}
}
else {
// read one quantity at a time
for (; count < cols; count++)
mycost += myrowptr[count];
}
results[tidx] = mycost;
}
}
}
int main(){
int typesize = sizetype();
if (typesize == 0) {std::cout << "invalid type selected" << std::endl; return 1;}
//define and initialize host and device storage for cost and results
mytype *d_costdata, *h_costdata, *d_results, *h_results;
h_results = (mytype *)malloc(MROWS*sizeof(mytype));
h_costdata = (mytype *)malloc(MROWS*NCOLS*sizeof(mytype));
for (int i=0; i<(MROWS*NCOLS); i++)
h_costdata[i] = (mytype)(rand()%4);
size_t pitch = 0;
cudaMalloc((void **)&d_results, MROWS*sizeof(mytype));
cudaMallocPitch((void **)&d_costdata, &pitch, NCOLS*sizeof(mytype), MROWS);
//copy cost data from host to device
cudaMemcpy2D(d_costdata, pitch, h_costdata, NCOLS*sizeof(mytype), NCOLS*sizeof(mytype), MROWS, cudaMemcpyHostToDevice);
mykernel<<<(MROWS + nTPB - 1)/nTPB, nTPB>>>(d_costdata, MROWS, NCOLS, d_results, typesize, pitch);
// copy results back from device to host
cudaMemcpy(h_results, d_results, MROWS*sizeof(mytype), cudaMemcpyDeviceToHost);
for (int i=0; i<MROWS; i++){
mytype loc_cost = (mytype)0;
for (int j=0; j<NCOLS; j++) loc_cost += h_costdata[(i*NCOLS)+j];
if ((i < 10) && (typesize > 1))
std::cout <<"cost[" << i << "]: host= " << loc_cost << ", device = " << h_results[i] << std::endl;
if (loc_cost != h_results[i]){ std::cout << "mismatch at index" << i << "should be:" << loc_cost << "was:" << h_results[i] << std::endl; return 1; }
}
std::cout << "Results are correct!" << std::endl;
}

Element-by-element vector multiplication with CUDA

I have built a rudimentary kernel in CUDA to do an elementwise vector-vector multiplication of two complex vectors. The kernel code is inserted below (multiplyElementwise). It works fine, but since I noticed that other seemingly straightforward operations (like scaling a vector) are optimized in libraries like CUBLAS or CULA, I was wondering if it is possible to replace my code with a library call. To my surprise, neither CUBLAS nor CULA has this option. I tried to fake it by making one of the vectors the diagonal of a diagonal matrix-vector product, but the result was really slow.
As a last resort, I tried to optimize this code myself (see multiplyElementwiseFast below) by loading the two vectors into shared memory and working from there, but that was slower than my original code.
So my questions:
Is there a library that does elementwise vector-vector multiplication?
If not, can I accelerate my code (multiplyElementwise)?
Any help would be greatly appreciated!
__global__ void multiplyElementwise(cufftComplex* f0, cufftComplex* f1, int size)
{
const int i = blockIdx.x*blockDim.x + threadIdx.x;
if (i < size)
{
float a, b, c, d;
a = f0[i].x;
b = f0[i].y;
c = f1[i].x;
d = f1[i].y;
float k;
k = a * (c + d);
d = d * (a + b);
c = c * (b - a);
f0[i].x = k - d;
f0[i].y = k + c;
}
}
__global__ void multiplyElementwiseFast(cufftComplex* f0, cufftComplex* f1, int size)
{
const int i = blockIdx.x*blockDim.x + threadIdx.x;
if (i < 4*size)
{
const int N = 256;
const int thId = threadIdx.x / 4;
const int rem4 = threadIdx.x % 4;
const int i4 = i / 4;
__shared__ float a[N];
__shared__ float b[N];
__shared__ float c[N];
__shared__ float d[N];
__shared__ float Re[N];
__shared__ float Im[N];
if (rem4 == 0)
{
a[thId] = f0[i4].x;
Re[thId] = 0.f;
}
if (rem4 == 1)
{
b[thId] = f0[i4].y;
Im[thId] = 0.f;
}
if (rem4 == 2)
c[thId] = f1[i4].x;
if (rem4 == 0)
d[thId] = f1[i4].y;
__syncthreads();
if (rem4 == 0)
atomicAdd(&(Re[thId]), a[thId]*c[thId]);
if (rem4 == 1)
atomicAdd(&(Re[thId]), -b[thId]*d[thId]);
if (rem4 == 2)
atomicAdd(&(Im[thId]), b[thId]*c[thId]);
if (rem4 == 3)
atomicAdd(&(Im[thId]), a[thId]*d[thId]);
__syncthreads();
if (rem4 == 0)
f0[i4].x = Re[thId];
if (rem4 == 1)
f0[i4].y = Im[thId];
}
}
If what you are trying to achieve is a simple element-wise product of complex numbers, you do seem to be doing some extra steps in your multiplyElementwise kernel that increase register usage. What you are trying to compute is:
f0[i].x = a*c - b*d;
f0[i].y = a*d + b*c;
since (a + ib)*(c + id) = (a*c - b*d) + i(a*d + b*c). By using your improved complex multiplication, you're trading 1 multiplication for 3 additions and some extra registers. Whether this can be justified or not might depend on the hardware you're using. For instance, if your hardware supports FMA (Fused Multiply-Add), that kind of optimization may not be efficient. You should consider reading this document: "Precision & Performance: Floating Point and IEEE 754 Compliance for NVIDIA GPUs", which also tackles the issue of floating-point precision.
Still, you should consider using Thrust. This library offers many high-level tools to operate on both host and device vectors. You can see a long list of examples here: https://github.com/thrust/thrust/tree/master/examples. This would make your life a lot easier.
UPDATED CODE
In your case, you could use this example and adapt it to something like this:
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <iostream>
#include <cstdlib>
#include <time.h>
struct ElementWiseProductBasic : public thrust::binary_function<float2,float2,float2>
{
__host__ __device__
float2 operator()(const float2& v1, const float2& v2) const
{
float2 res;
res.x = v1.x * v2.x - v1.y * v2.y;
res.y = v1.x * v2.y + v1.y * v2.x;
return res;
}
};
/**
* See: http://www.embedded.com/design/embedded/4007256/Digital-Signal-Processing-Tricks--Fast-multiplication-of-complex-numbers%5D
*/
struct ElementWiseProductModified : public thrust::binary_function<float2,float2,float2>
{
__host__ __device__
float2 operator()(const float2& v1, const float2& v2) const
{
float2 res;
float a, b, c, d, k;
a = v1.x;
b = v1.y;
c = v2.x;
d = v2.y;
k = a * (c + d);
d = d * (a + b);
c = c * (b - a);
res.x = k - d;
res.y = k + c;
return res;
}
};
int get_random_int(int min, int max)
{
return min + (rand() % (int)(max - min + 1));
}
thrust::host_vector<float2> init_vector(const size_t N)
{
thrust::host_vector<float2> temp(N);
for(size_t i = 0; i < N; i++)
{
temp[i].x = get_random_int(0, 10);
temp[i].y = get_random_int(0, 10);
}
return temp;
}
int main(void)
{
const size_t N = 100000;
const bool compute_basic_product = true;
const bool compute_modified_product = true;
srand(time(NULL));
thrust::host_vector<float2> h_A = init_vector(N);
thrust::host_vector<float2> h_B = init_vector(N);
thrust::device_vector<float2> d_A = h_A;
thrust::device_vector<float2> d_B = h_B;
thrust::host_vector<float2> h_result(N);
thrust::host_vector<float2> h_result_modified(N);
if (compute_basic_product)
{
thrust::device_vector<float2> d_result(N);
thrust::transform(d_A.begin(), d_A.end(),
d_B.begin(), d_result.begin(),
ElementWiseProductBasic());
h_result = d_result;
}
if (compute_modified_product)
{
thrust::device_vector<float2> d_result_modified(N);
thrust::transform(d_A.begin(), d_A.end(),
d_B.begin(), d_result_modified.begin(),
ElementWiseProductModified());
h_result_modified = d_result_modified;
}
std::cout << std::fixed;
for (size_t i = 0; i < 4; i++)
{
float2 a = h_A[i];
float2 b = h_B[i];
std::cout << "(" << a.x << "," << a.y << ")";
std::cout << " * ";
std::cout << "(" << b.x << "," << b.y << ")";
if (compute_basic_product)
{
float2 prod = h_result[i];
std::cout << " = ";
std::cout << "(" << prod.x << "," << prod.y << ")";
}
if (compute_modified_product)
{
float2 prod_modified = h_result_modified[i];
std::cout << " = ";
std::cout << "(" << prod_modified.x << "," << prod_modified.y << ")";
}
std::cout << std::endl;
}
return 0;
}
This returns:
(6.000000,5.000000) * (0.000000,1.000000) = (-5.000000,6.000000)
(3.000000,2.000000) * (0.000000,4.000000) = (-8.000000,12.000000)
(2.000000,10.000000) * (10.000000,4.000000) = (-20.000000,108.000000)
(4.000000,8.000000) * (10.000000,9.000000) = (-32.000000,116.000000)
You can then compare the timings of the two different multiplication strategies and choose what's best with your hardware.
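If it helps, that comparison can be done with CUDA events around the two thrust::transform calls, along these lines (a fragment, assuming the d_A, d_B and d_result vectors from the example above are in scope):
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);
thrust::transform(d_A.begin(), d_A.end(), d_B.begin(), d_result.begin(), ElementWiseProductBasic());
cudaEventRecord(stop);
cudaEventSynchronize(stop);

float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);
std::cout << "basic element-wise product: " << ms << " ms" << std::endl;

// repeat the same pattern with ElementWiseProductModified() and compare

cudaEventDestroy(start);
cudaEventDestroy(stop);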
You can use cublasZdgmm.
cublasStatus_t cublasZdgmm(cublasHandle_t handle, cublasSideMode_t mode,
int m, int n,
const cuDoubleComplex *A, int lda,
const cuDoubleComplex *x, int incx,
cuDoubleComplex *C, int ldc)
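For example, the element-wise product c[i] = a[i] * b[i] of two length-n complex vectors can be expressed by treating one vector as an n x 1 matrix and scaling its rows with the diagonal of the other (a usage sketch, not from the original answer; error checking omitted):
#include <cublas_v2.h>

// c = diag(b) * a, i.e. c[i] = b[i] * a[i]; d_a, d_b, d_c are device pointers
// to n cuDoubleComplex values.
void elementwiseZ(const cuDoubleComplex *d_a, const cuDoubleComplex *d_b,
                  cuDoubleComplex *d_c, int n)
{
    cublasHandle_t handle;
    cublasCreate(&handle);
    cublasZdgmm(handle, CUBLAS_SIDE_LEFT,
                n, 1,      // A is treated as an n x 1 matrix
                d_a, n,    // A = a, lda = n
                d_b, 1,    // x = b, incx = 1
                d_c, n);   // C = diag(b) * A, ldc = n
    cublasDestroy(handle);
}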