How to quickly compact a sparse array with CUDA C? - cuda

Summary
Array [A - B - - - C] in device memory but want [A B C] - what's the quickest way with CUDA C?
Context
I have an array A of integers in device (GPU) memory. At each iteration, I randomly choose a few elements that are larger than 0 and subtract 1 from them. I maintain a sorted lookup array L holding the indices of those elements that are equal to 0:
Array A:
# iteration i: [0 1 0 3 3 2 0 1 2 3]
# iteration i + 1: [0 0 0 3 2 2 0 1 2 3]
Lookup for 0-elements L:
# iteration i: [0 - 2 - - - 6 - - -] -> want compacted form: [0 2 6]
# iteration i + 1: [0 1 2 - - - 6 - - -] -> want compacted form: [0 1 2 6]
(Here, I randomly chose elements 1 and 4 to subtract 1 from. In my implementation in CUDA C, each thread maps onto an element in A, and so the lookup array is sparse to prevent data races and to maintain a sorted ordering (e.g. [0 1 2 6] rather than [0 2 6 1]).)
Later, I will do some operation only for those elements that are equal to 0. Hence I need to compact my sparse lookup array L, so that I can map threads to 0-elements.
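For illustration, here is a minimal sketch of how each thread writes its own lookup slot (the kernel name and the EMPTY sentinel are placeholders, not my actual code):

// Sketch of the per-thread write pattern described above: each thread owns one
// slot of L, so there are no data races and the surviving indices stay sorted.
#define EMPTY -1

__global__ void build_lookup(const int *A, int *L, int n)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n)
        L[tid] = (A[tid] == 0) ? tid : EMPTY;  // index of a 0-element, or sentinel
}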
As such, what is the most efficient way to compact a sparse array on device memory with CUDA C?
Many thanks.

Suppose I have:
int V[] = {1, 2, 0, 0, 5};
And my desired result is:
int R[] = {1, 2, 5};
In effect we are removing elements that are zero, or copying elements only if non-zero.
#include <thrust/device_ptr.h>
#include <thrust/copy.h>
#include <stdio.h>

#define SIZE 5

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

struct is_not_zero
{
    __host__ __device__
    bool operator()(const int x)
    {
        return (x != 0);
    }
};

int main(){

    int V[] = {1, 2, 0, 0, 5};
    int R[] = {0, 0, 0, 0, 0};
    int *d_V, *d_R;

    cudaMalloc((void **)&d_V, SIZE*sizeof(int));
    cudaCheckErrors("cudaMalloc1 fail");
    cudaMalloc((void **)&d_R, SIZE*sizeof(int));
    cudaCheckErrors("cudaMalloc2 fail");

    cudaMemcpy(d_V, V, SIZE*sizeof(int), cudaMemcpyHostToDevice);
    cudaCheckErrors("cudaMemcpy1 fail");

    thrust::device_ptr<int> dp_V(d_V);
    thrust::device_ptr<int> dp_R(d_R);
    thrust::copy_if(dp_V, dp_V + SIZE, dp_R, is_not_zero());

    cudaMemcpy(R, d_R, SIZE*sizeof(int), cudaMemcpyDeviceToHost);
    cudaCheckErrors("cudaMemcpy2 fail");

    for (int i = 0; i < 3; i++)
        printf("R[%d]: %d\n", i, R[i]);

    return 0;
}
The struct definition provides us with a functor that tests whether an element is non-zero. Note that in thrust, there are no kernels and we are not writing device code directly. All that happens behind the scenes. And I'd definitely suggest familiarizing yourself with the quick start guide, so as not to turn this question into a tutorial on thrust.
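As a side note (not shown in the code above): thrust::copy_if returns an iterator pointing one past the last element it wrote, so you can compute the compacted length rather than hard-coding it. A minimal sketch:

// Sketch: the return value of thrust::copy_if marks the end of the compacted
// output, so the result length need not be known in advance.
thrust::device_ptr<int> dp_end = thrust::copy_if(dp_V, dp_V + SIZE, dp_R, is_not_zero());
int compacted_size = dp_end - dp_R;   // 3 for the example input {1, 2, 0, 0, 5}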
After reviewing the comments, I think this modified version of the code will work around the cuda 4.0 issues:
#include <thrust/device_ptr.h>
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <stdio.h>

#define SIZE 5

struct is_not_zero
{
    __host__ __device__
    bool operator()(const int x)
    {
        return (x != 0);
    }
};

int main(){

    int V[] = {1, 2, 0, 0, 5};
    int R[] = {0, 0, 0, 0, 0};

    thrust::host_vector<int> h_V(V, V+SIZE);
    thrust::device_vector<int> d_V = h_V;
    thrust::device_vector<int> d_R(SIZE, 0);

    thrust::copy_if(d_V.begin(), d_V.end(), d_R.begin(), is_not_zero());

    thrust::host_vector<int> h_R = d_R;
    thrust::copy(h_R.begin(), h_R.end(), R);

    for (int i = 0; i < 3; i++)
        printf("R[%d]: %d\n", i, R[i]);

    return 0;
}

Related

Finding the first index of every distinct value in CUDA array

Assume we have an array like this:
0, 0, 0, 1, 2, 2, 2, 3, 3, 4, ...
I would like to have the index of every first occurrence of every value, so in this example [0, 3, 4, 7, 9]. The array is sorted and all possible values are known and consecutive.
A possible solution I have is to use a kernel thread for every element in this array and an atomicMin to save the lowest index. But I assume a better approach is possible.
You can do this with a single call to thrust::unique_by_key() if you provide a vector of indices e.g. via thrust::sequence(). Here's a worked example:
$ cat t3.cu
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <thrust/unique.h>
#include <thrust/sequence.h>
#include <iostream>

int main(){

    int keys[] = {0, 0, 0, 1, 2, 2, 2, 3, 3, 4};
    int ks = sizeof(keys)/sizeof(keys[0]);

    thrust::device_vector<int> d_keys(keys, keys+ks);
    thrust::device_vector<int> d_result(ks);
    thrust::sequence(d_result.begin(), d_result.end());

    int rs = (thrust::unique_by_key(d_keys.begin(), d_keys.end(), d_result.begin())).first - d_keys.begin();
    thrust::copy_n(d_result.begin(), rs, std::ostream_iterator<int>(std::cout, ","));
    std::cout << std::endl;
}
$ nvcc -arch=sm_35 -o t3 t3.cu
$ ./t3
0,3,4,7,9,
$
The important activity occurring here is stream compaction and thrust provides a nice set of routines for various use-cases. For example this operation could also be done with thrust::unique_copy() and in that case, with some additional code complexity, you could eliminate the need for the thrust::sequence() call (it would be replaced by a thrust::counting_iterator zipped together with your data, and an appropriate selection functor), but it still requires an output vector of the same length.
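For completeness, here is a possible sketch of that alternative (my own illustration, untested against the original problem), zipping a counting_iterator with the data and comparing only the keys in the predicate:

#include <thrust/device_vector.h>
#include <thrust/unique.h>
#include <thrust/copy.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/tuple.h>
#include <iterator>
#include <iostream>

// binary predicate: two (key, index) tuples compare equal when their keys match
struct keys_equal
{
    template <typename Tuple1, typename Tuple2>
    __host__ __device__
    bool operator()(const Tuple1 &a, const Tuple2 &b) const
    {
        return thrust::get<0>(a) == thrust::get<0>(b);
    }
};

int main(){

    int keys[] = {0, 0, 0, 1, 2, 2, 2, 3, 3, 4};
    int ks = sizeof(keys)/sizeof(keys[0]);

    thrust::device_vector<int> d_keys(keys, keys+ks);
    thrust::device_vector<int> d_out_keys(ks);   // output still needs full-length storage
    thrust::device_vector<int> d_out_idx(ks);

    thrust::counting_iterator<int> idx(0);       // replaces the thrust::sequence() step
    auto zin  = thrust::make_zip_iterator(thrust::make_tuple(d_keys.begin(), idx));
    auto zout = thrust::make_zip_iterator(thrust::make_tuple(d_out_keys.begin(), d_out_idx.begin()));

    // keep only the first (key, index) pair of each run of equal keys
    int rs = thrust::unique_copy(zin, zin + ks, zout, keys_equal()) - zout;

    thrust::copy_n(d_out_idx.begin(), rs, std::ostream_iterator<int>(std::cout, ","));
    std::cout << std::endl;
}

(Compile with -std=c++11 or later for the auto declarations.)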
As #tera pointed out, you can compare each number with the previous one to determine whether it is the first occurrence of its value. You can write a kernel to generate a mask based on this criterion, such that the mask array contains the index of a number that is a first occurrence, and a negative number (like -1, as it cannot be an index) otherwise. After that, use thrust to count the values that are not -1 using a predicate. Then copy those values out of the mask using the same predicate. Finally, copy the results back to the host.
Here is a sample implementation of the above mentioned approach.
#include <iostream>
#include <vector>
#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/count.h>
#include <thrust/copy.h>

using namespace std;

// Mark the index of each first occurrence; write -1 for repeated values
__global__ void is_first_occurence(int* input, int* is, int count)
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < count)
    {
        if (tid == 0)
        {
            is[0] = 0;
        }
        else if (input[tid] != input[tid-1])
        {
            is[tid] = tid;
        }
        else
        {
            is[tid] = -1;
        }
    }
}

struct isFirst
{
    __host__ __device__ bool operator()(const int x)
    {
        return (x != -1);
    }
};

int main(int argc, char** argv)
{
    const int count = 13;
    std::vector<int> arr = { 0, 0, 0, 1, 1, 2, 2, 2, 3, 3, 4, 4, 4 };

    thrust::device_vector<int> arr_d = arr;
    thrust::device_vector<int> mask_d(arr_d.size());

    int* pArr = thrust::raw_pointer_cast(arr_d.data());
    int* pMask = thrust::raw_pointer_cast(mask_d.data());

    dim3 block(16);
    dim3 grid((count + block.x - 1) / block.x);

    is_first_occurence<<<grid, block>>>(pArr, pMask, count);
    cudaDeviceSynchronize();

    int num_unique = thrust::count_if(mask_d.begin(), mask_d.end(), isFirst());
    thrust::copy_if(mask_d.begin(), mask_d.end(), arr_d.begin(), isFirst());

    std::vector<int> unique_indices(num_unique);
    thrust::copy(arr_d.begin(), arr_d.begin() + num_unique, unique_indices.begin());

    for (auto i : unique_indices)
    {
        cout << i << endl;
    }

    return 0;
}
Compiled and tested using the following command:
nvcc -o get_unique get_unique.cu -std=c++11 -arch=sm_61

CUDA outputs always 0

The printed output is always 0 after executing the kernel function.
After some testing, cudaMemcpy is still correct, but the kernel does not seem to work: it cannot get correct data from d_inputs.
Could somebody help explain? Thanks!
#include <cuda_runtime.h>
#include <cuda.h>
#include <stdio.h>
#include <sys/time.h>
#include <math.h>

#define N 32

__global__ void Kernel_double(int niters, int* d_inputs, double* d_outputs)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid<N) {
        double val =(double) d_inputs[tid];
        /*for (int iter=0; iter < niters; iter++){
            val = (sqrt(pow(val,2.0)) + 5.0) - 101.0;
            val = (val / 3.0) + 102.0;
            val = (val + 1.07) - 103.0;
            val = (val / 1.037) + 104.0;
            val = (val + 3.00) - 105.0;
            val = (val / 0.22) + 106.0;
        }*/
        val = val + 1.0;
        //printf("This is %f\n",val);
        d_outputs[tid] = val;
    }
}

int main(int argc, char **argv)
{
    int niters = 10;
    printf("Iterate %d times with GPU 0 or CPU 1: %d\n", niters, cpu);

    int inputs[N];
    for (int i = 0; i<N; i++){
        inputs[i] = i+1;
    }

    int d_inputs[N];
    double d_outputs[N];
    double outputs[N];

    cudaMalloc( (void**)&d_inputs, N*sizeof(int));
    cudaMalloc( (void**)&d_outputs, N*sizeof(double));

    printf("test %d \n", inputs[3]);
    cudaMemcpy(d_inputs, inputs, N*sizeof(int), cudaMemcpyHostToDevice);
    printf("test %d \n", d_inputs[1]);

    Kernel_double<<<16,2>>>(niters, d_inputs, d_outputs);
    //cudaDeviceSynchronize();

    cudaMemcpy(outputs, d_outputs, N*sizeof(double), cudaMemcpyDeviceToHost);

    for(int j = 0; j<10; j++){
        printf("Outputs[%d] is: %f and %f\n", j, d_outputs[j], outputs[j]);
    }

    cudaFree(d_inputs);
    cudaFree(d_outputs);

    return EXIT_SUCCESS;
}
1. Any time you are having trouble with a CUDA code, you should use proper cuda error checking and run your code with cuda-memcheck, before asking others for help. Even if you don't understand the error output, it will be useful for others trying to help you. If you had used proper cuda error checking here, you would be informed that your cudaMemcpy operations are reporting an invalid argument, due to item 3 below.

2. Your code will not compile: cpu is not defined anywhere.

3. We don't allocate for, or create, device pointers like this:

       int d_inputs[N];
       double d_outputs[N];

   Those are creating stack variables (arrays) that the compiler is allowed to treat as if they were constant pointers. Instead you should do it like this:

       int *d_inputs;
       double *d_outputs;

   The compiler understands that these are modifiable pointers (which you will modify later with cudaMalloc).

4. Once you fix the issue in item 3, this will not be legal:

       printf("test %d \n", d_inputs[1]);

   as this requires dereferencing a device pointer (d_inputs) in host code, which is illegal in CUDA, at least as you have done so here. You have a similar problem in the printf statement later in your code as well (with d_outputs).
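If you do want to inspect a device value at that point, a legal pattern (my illustration, not part of the original code) is to copy it back to the host first:

// Sketch only: copy a single element back to the host before printing it,
// instead of dereferencing the device pointer d_inputs in host code.
int probe = 0;
cudaMemcpy(&probe, d_inputs + 1, sizeof(int), cudaMemcpyDeviceToHost);
printf("test %d \n", probe);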
The following code has the above items addressed to some degree, and seems to run correctly for me:
$ cat t44.cu
#include <cuda_runtime.h>
#include <cuda.h>
#include <stdio.h>
#include <sys/time.h>
#include <math.h>

#define N 32

__global__ void Kernel_double(int niters, int* d_inputs, double* d_outputs)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid<N) {
        double val =(double) d_inputs[tid];
        /*for (int iter=0; iter < niters; iter++){
            val = (sqrt(pow(val,2.0)) + 5.0) - 101.0;
            val = (val / 3.0) + 102.0;
            val = (val + 1.07) - 103.0;
            val = (val / 1.037) + 104.0;
            val = (val + 3.00) - 105.0;
            val = (val / 0.22) + 106.0;
        }*/
        val = val + 1.0;
        //printf("This is %f\n",val);
        d_outputs[tid] = val;
    }
}

int main(int argc, char **argv)
{
    int niters = 10;
    int cpu = 0;
    printf("Iterate %d times with GPU 0 or CPU 1: %d\n", niters, cpu);

    int inputs[N];
    for (int i = 0; i<N; i++){
        inputs[i] = i+1;
    }

    int *d_inputs;
    double *d_outputs;
    double outputs[N];

    cudaMalloc( (void**)&d_inputs, N*sizeof(int));
    cudaMalloc( (void**)&d_outputs, N*sizeof(double));

    printf("test %d \n", inputs[3]);
    cudaMemcpy(d_inputs, inputs, N*sizeof(int), cudaMemcpyHostToDevice);
    // printf("test %d \n", d_inputs[1]);

    Kernel_double<<<16,2>>>(niters, d_inputs, d_outputs);
    //cudaDeviceSynchronize();

    cudaMemcpy(outputs, d_outputs, N*sizeof(double), cudaMemcpyDeviceToHost);

    for(int j = 0; j<10; j++){
        printf("Outputs[%d] is: %f\n", j, outputs[j]);
    }

    cudaFree(d_inputs);
    cudaFree(d_outputs);

    return EXIT_SUCCESS;
}
$ nvcc -lineinfo -arch=sm_61 -o t44 t44.cu
$ cuda-memcheck ./t44
========= CUDA-MEMCHECK
Iterate 10 times with GPU 0 or CPU 1: 0
test 4
Outputs[0] is: 2.000000
Outputs[1] is: 3.000000
Outputs[2] is: 4.000000
Outputs[3] is: 5.000000
Outputs[4] is: 6.000000
Outputs[5] is: 7.000000
Outputs[6] is: 8.000000
Outputs[7] is: 9.000000
Outputs[8] is: 10.000000
Outputs[9] is: 11.000000
========= ERROR SUMMARY: 0 errors
$

Thrust: why always host code is executed in spite of __CUDA_ARCH__

I am trying to define two branches in code: one for CUDA execution and the other without it (with future OMP in mind). But when I use the macro __CUDA_ARCH__, it looks as if the host code is always executed. However, I supposed that Thrust uses CUDA by default (and takes the device-code branch). What's wrong with my code?
Here it is:
#include <thrust/transform.h>
#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>
#include <stdio.h>

struct my_op
{
    my_op(int init_const) : constanta(init_const) {}
    __host__ __device__ int operator()(const int &x) const
    {
#if defined(__CUDA_ARCH__)
        return 2 * x * constanta; // never executed - why?
#else
        return x * constanta; // always executed
#endif
    }
private:
    int constanta;
};

int main()
{
    int data[7] = { 0, 0, 0, 0, 0, 0, 0 };

    thrust::counting_iterator<int> first(10);
    thrust::counting_iterator<int> last = first + 7;

    int init_value = 1;
    my_op op(init_value);

    thrust::transform(first, last, data, op);

    for each (int el in data)
        std::cout << el << " ";
    std::cout << std::endl;
}
I expect that "transform" will define vector as multiplied by 2*constanta but I see that host code is used - the output is "10 11 12 13 14 15 16", not "20 22 24 26 28 30 32" (as expected).
Why?
Thrust is choosing the host path because one of your data items supplied to the thrust transform operation is in host memory:
thrust::transform(first, last, data, op);
                               ^^^^
If you want a thrust algorithm to operate on the device, generally speaking all the container data you pass to/from must also reside in device memory.
Here's a modification to your code that demonstrates that thrust will follow the device path if we replace data with a device-resident container:
$ cat t13.cu
#include <thrust/transform.h>
#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/device_vector.h>
#include <stdio.h>

struct my_op
{
    my_op(int init_const) : constanta(init_const) {}
    __host__ __device__ int operator()(const int &x) const
    {
#if defined(__CUDA_ARCH__)
        return 2 * x * constanta; // never executed - why?
#else
        return x * constanta; // always executed
#endif
    }
private:
    int constanta;
};

int main()
{
    // int data[7] = { 0, 0, 0, 0, 0, 0, 0 };

    thrust::counting_iterator<int> first(10);
    thrust::counting_iterator<int> last = first + 7;
    thrust::device_vector<int> d_data(7);

    int init_value = 1;
    my_op op(init_value);

    thrust::transform(first, last, d_data.begin(), op);

    for (int el = 0; el < 7; el++) {
        int dat = d_data[el];
        std::cout << dat << " ";
    }
    std::cout << std::endl;
}
$ nvcc -arch=sm_61 -o t13 t13.cu
$ ./t13
20 22 24 26 28 30 32
$
You may want to read the thrust quick start guide to learn about thrust algorithm dispatch.
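As a further illustration (my own addition, not something the question requires): thrust also lets you name the dispatch target explicitly with an execution policy, which makes the intended path visible at the call site. A minimal sketch:

#include <thrust/transform.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <iostream>

struct times_two
{
    __host__ __device__ int operator()(const int &x) const { return 2 * x; }
};

int main()
{
    thrust::counting_iterator<int> first(10);
    thrust::counting_iterator<int> last = first + 7;

    // Explicit host dispatch: runs on the CPU, output lives in host memory.
    thrust::host_vector<int> h_out(7);
    thrust::transform(thrust::host, first, last, h_out.begin(), times_two());

    // Explicit device dispatch: output must live in device memory.
    thrust::device_vector<int> d_out(7);
    thrust::transform(thrust::device, first, last, d_out.begin(), times_two());

    for (int i = 0; i < 7; i++) { int h = h_out[i]; std::cout << h << " "; }
    std::cout << std::endl;
    for (int i = 0; i < 7; i++) { int d = d_out[i]; std::cout << d << " "; }
    std::cout << std::endl;
}

With thrust::device, all the container data you pass must still reside in device memory, just as described above.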

CUDA cublas getrf and getri, for a matrix inversion, cause nvprof errors with one dimensional memory [duplicate]

Since CUDA 5.5, the CUBLAS library contains routines for batched matrix factorization and inversion (cublas<t>getrfBatched and cublas<t>getriBatched respectively).
Taking guidance from the documentation, I wrote test code for inversion of an N x N matrix using these routines. The code gives correct output only if the matrix has all non-zero pivots. Setting any pivot to zero results in incorrect output. I have verified the results using MATLAB.
I realize that I am providing row-major matrices as input while CUBLAS expects column-major matrices, but it shouldn't matter, as it would only transpose the result. To be sure, I also tested on column-major input, but I am getting the same behavior.
I am confused because cublas<t>getriBatched expects the pivot exchange information array P as input, which is the output from cublas<t>getrfBatched. So, if any zero pivots are eliminated by row exchanges, then the inversion routine should handle them automatically.
How can I perform inversion of matrices that contain a zero pivot using CUBLAS?
Following is a self contained compile-able example with different test cases:
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cublas_v2.h>

#define cudacall(call) \
    do \
    { \
        cudaError_t err = (call); \
        if(cudaSuccess != err) \
        { \
            fprintf(stderr,"CUDA Error:\nFile = %s\nLine = %d\nReason = %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
            cudaDeviceReset(); \
            exit(EXIT_FAILURE); \
        } \
    } \
    while (0)

#define cublascall(call) \
    do \
    { \
        cublasStatus_t status = (call); \
        if(CUBLAS_STATUS_SUCCESS != status) \
        { \
            fprintf(stderr,"CUBLAS Error:\nFile = %s\nLine = %d\nCode = %d\n", __FILE__, __LINE__, status); \
            cudaDeviceReset(); \
            exit(EXIT_FAILURE); \
        } \
    } \
    while(0)

void invert_device(float* src_d, float* dst_d, int n)
{
    cublasHandle_t handle;
    cublascall(cublasCreate_v2(&handle));

    int batchSize = 1;

    int *P, *INFO;
    cudacall(cudaMalloc<int>(&P, n * batchSize * sizeof(int)));
    cudacall(cudaMalloc<int>(&INFO, batchSize * sizeof(int)));

    int lda = n;

    float *A[] = { src_d };
    float** A_d;
    cudacall(cudaMalloc<float*>(&A_d, sizeof(A)));
    cudacall(cudaMemcpy(A_d, A, sizeof(A), cudaMemcpyHostToDevice));

    cublascall(cublasSgetrfBatched(handle, n, A_d, lda, P, INFO, batchSize));

    int INFOh = 0;
    cudacall(cudaMemcpy(&INFOh, INFO, sizeof(int), cudaMemcpyDeviceToHost));

    if(INFOh == n)
    {
        fprintf(stderr, "Factorization Failed: Matrix is singular\n");
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }

    float* C[] = { dst_d };
    float** C_d;
    cudacall(cudaMalloc<float*>(&C_d, sizeof(C)));
    cudacall(cudaMemcpy(C_d, C, sizeof(C), cudaMemcpyHostToDevice));

    cublascall(cublasSgetriBatched(handle, n, A_d, lda, P, C_d, lda, INFO, batchSize));

    cudacall(cudaMemcpy(&INFOh, INFO, sizeof(int), cudaMemcpyDeviceToHost));

    if(INFOh != 0)
    {
        fprintf(stderr, "Inversion Failed: Matrix is singular\n");
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }

    cudaFree(P), cudaFree(INFO), cublasDestroy_v2(handle);
}

void invert(float* src, float* dst, int n)
{
    float* src_d, *dst_d;

    cudacall(cudaMalloc<float>(&src_d, n * n * sizeof(float)));
    cudacall(cudaMemcpy(src_d, src, n * n * sizeof(float), cudaMemcpyHostToDevice));
    cudacall(cudaMalloc<float>(&dst_d, n * n * sizeof(float)));

    invert_device(src_d, dst_d, n);

    cudacall(cudaMemcpy(dst, dst_d, n * n * sizeof(float), cudaMemcpyDeviceToHost));

    cudaFree(src_d), cudaFree(dst_d);
}

void test_invert()
{
    const int n = 3;

    //Random matrix with full pivots
    float full_pivots[n*n] = { 0.5, 3, 4,
                               1, 3, 10,
                               4, 9, 16 };

    //Almost same as above matrix with first pivot zero
    float zero_pivot[n*n] = { 0, 3, 4,
                              1, 3, 10,
                              4, 9, 16 };

    float zero_pivot_col_major[n*n] = { 0, 1, 4,
                                        3, 3, 9,
                                        4, 10, 16 };

    float another_zero_pivot[n*n] = { 0, 3, 4,
                                      1, 5, 6,
                                      9, 8, 2 };

    float another_full_pivot[n*n] = { 22, 3, 4,
                                      1, 5, 6,
                                      9, 8, 2 };

    float singular[n*n] = { 1, 2, 3,
                            4, 5, 6,
                            7, 8, 9 };

    //Select matrix by setting "a"
    float* a = zero_pivot;

    fprintf(stdout, "Input:\n\n");
    for(int i=0; i<n; i++)
    {
        for(int j=0; j<n; j++)
            fprintf(stdout, "%f\t", a[i*n+j]);
        fprintf(stdout, "\n");
    }
    fprintf(stdout, "\n\n");

    invert(a, a, n);

    fprintf(stdout, "Inverse:\n\n");
    for(int i=0; i<n; i++)
    {
        for(int j=0; j<n; j++)
            fprintf(stdout, "%f\t", a[i*n+j]);
        fprintf(stdout, "\n");
    }
}

int main()
{
    test_invert();

    int n; scanf("%d", &n);

    return 0;
}
There seems to be a bug in the current CUBLAS library implementation of cublas<t>getrfBatched for matrices of dimension (n) such that 3<=n<=16, when there is a "zero pivot" as you say.
A possible workaround is to "identity-extend" your A matrix to be inverted, when n<17, to a size of 17x17 (using matlab nomenclature):
LU = getrf( [A 0 ; 0 I]);
Continuing, you can then use cublas<t>getriBatched in an "ordinary" fashion:
invA = getri( LU(1:3,1:3) )
(You can also leave everything at n=17, call getri that way, and then extract the result as the first 3x3 rows and columns of invA.)
Here is a fully worked example, borrowing from the code you supplied, showing the inversion of your supplied 3x3 zero_pivot matrix, using the zero_pivot_war matrix as an "identity-extended" workaround:
$ cat t340.cu
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cublas_v2.h>

#define cudacall(call) \
    do \
    { \
        cudaError_t err = (call); \
        if(cudaSuccess != err) \
        { \
            fprintf(stderr,"CUDA Error:\nFile = %s\nLine = %d\nReason = %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
            cudaDeviceReset(); \
            exit(EXIT_FAILURE); \
        } \
    } \
    while (0)

#define cublascall(call) \
    do \
    { \
        cublasStatus_t status = (call); \
        if(CUBLAS_STATUS_SUCCESS != status) \
        { \
            fprintf(stderr,"CUBLAS Error:\nFile = %s\nLine = %d\nCode = %d\n", __FILE__, __LINE__, status); \
            cudaDeviceReset(); \
            exit(EXIT_FAILURE); \
        } \
    } \
    while(0)

void invert_device(float* src_d, float* dst_d, int n)
{
    cublasHandle_t handle;
    cublascall(cublasCreate_v2(&handle));

    int batchSize = 1;

    int *P, *INFO;
    cudacall(cudaMalloc<int>(&P, 17 * batchSize * sizeof(int)));
    cudacall(cudaMalloc<int>(&INFO, batchSize * sizeof(int)));

    int lda = 17;

    float *A[] = { src_d };
    float** A_d;
    cudacall(cudaMalloc<float*>(&A_d, sizeof(A)));
    cudacall(cudaMemcpy(A_d, A, sizeof(A), cudaMemcpyHostToDevice));

    cublascall(cublasSgetrfBatched(handle, 17, A_d, lda, P, INFO, batchSize));

    int INFOh = 0;
    cudacall(cudaMemcpy(&INFOh, INFO, sizeof(int), cudaMemcpyDeviceToHost));

    if(INFOh == 17)
    {
        fprintf(stderr, "Factorization Failed: Matrix is singular\n");
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }

    float* C[] = { dst_d };
    float** C_d;
    cudacall(cudaMalloc<float*>(&C_d, sizeof(C)));
    cudacall(cudaMemcpy(C_d, C, sizeof(C), cudaMemcpyHostToDevice));

    cublascall(cublasSgetriBatched(handle, n, A_d, lda, P, C_d, n, INFO, batchSize));

    cudacall(cudaMemcpy(&INFOh, INFO, sizeof(int), cudaMemcpyDeviceToHost));

    if(INFOh != 0)
    {
        fprintf(stderr, "Inversion Failed: Matrix is singular\n");
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }

    cudaFree(P), cudaFree(INFO), cublasDestroy_v2(handle);
}

void invert(float* src, float* dst, int n)
{
    float* src_d, *dst_d;

    cudacall(cudaMalloc<float>(&src_d, 17 * 17 * sizeof(float)));
    cudacall(cudaMemcpy(src_d, src, 17 * 17 * sizeof(float), cudaMemcpyHostToDevice));
    cudacall(cudaMalloc<float>(&dst_d, n * n * sizeof(float)));

    invert_device(src_d, dst_d, n);

    cudacall(cudaMemcpy(dst, dst_d, n * n * sizeof(float), cudaMemcpyDeviceToHost));

    cudaFree(src_d), cudaFree(dst_d);
}

void test_invert()
{
    const int n = 3;

    //Random matrix with full pivots
    /* float full_pivots[n*n] = { 0.5, 3, 4,
                                  1, 3, 10,
                                  4, 9, 16 };

    //Almost same as above matrix with first pivot zero
    float zero_pivot[n*n] = { 0, 3, 4,
                              1, 3, 10,
                              4, 9, 16 };

    float zero_pivot_col_major[n*n] = { 0, 1, 4,
                                        3, 3, 9,
                                        4, 10, 16 };
    */
    float zero_pivot_war[17*17] = {
        0,3,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        1,3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        4,9,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 };
    /*
    float another_zero_pivot[n*n] = { 0, 3, 4,
                                      1, 5, 6,
                                      9, 8, 2 };

    float another_full_pivot[n*n] = { 22, 3, 4,
                                      1, 5, 6,
                                      9, 8, 2 };

    float singular[n*n] = { 1, 2, 3,
                            4, 5, 6,
                            7, 8, 9 };
    */
    float result[n*n];

    //Select matrix by setting "a"
    float* a = zero_pivot_war;

    fprintf(stdout, "Input:\n\n");
    for(int i=0; i<n; i++)
    {
        for(int j=0; j<n; j++)
            fprintf(stdout, "%f\t", a[i*17+j]);
        fprintf(stdout, "\n");
    }
    fprintf(stdout, "\n\n");

    invert(a, result, n);

    fprintf(stdout, "Inverse:\n\n");
    for(int i=0; i<n; i++)
    {
        for(int j=0; j<n; j++)
            fprintf(stdout, "%f\t", result[i*n+j]);
        fprintf(stdout, "\n");
    }
}

int main()
{
    test_invert();

    // int n; scanf("%d",&n);

    return 0;
}
$ nvcc -arch=sm_20 -o t340 t340.cu -lcublas
$ cuda-memcheck ./t340
========= CUDA-MEMCHECK
Input:
0.000000 3.000000 4.000000
1.000000 3.000000 10.000000
4.000000 9.000000 16.000000
Inverse:
-0.700000 -0.200000 0.300000
0.400000 -0.266667 0.066667
-0.050000 0.200000 -0.050000
========= ERROR SUMMARY: 0 errors
$
The above result appears to me to be correct based on a simple test elsewhere.
I don't have any further technical details about the nature of the possible bug in CUBLAS. From what I can tell, it is present in both CUDA 5.5 and CUDA 6.0 RC. Detailed bug discussions for NVIDIA-supplied assets (e.g. CUBLAS library) should be taken up on the NVIDIA developer forums or directly at the bug filing portal on developer.nvidia.com (you must be a registered developer to file a bug).

Cuda program does not give the correct output when using a CUDA compatible GPU

I found the following program from http://llpanorama.wordpress.com/2008/05/21/my-first-cuda-program/
Unfortunately I can't copy paste it here because the code becomes messy
It takes as input a vector of numbers and then outputs the vector multiplied by itself. I ran it on the emulator that I have installed on my computer, and it gives the following output:
0 0.000000
1 1.000000
2 4.000000
3 9.000000
4 16.000000
5 25.000000
6 36.000000
7 49.000000
8 64.000000
9 81.000000
However, if I run it on a remote computer which runs Debian and has a CUDA-compatible GPU by entering
nvcc test.cu -lcudart -o test
./test
it gives me the following output
0 0.000000
1 1.000000
2 2.000000
3 3.000000
4 4.000000
5 5.000000
6 6.000000
7 7.000000
8 8.000000
9 9.000000
why does this happen? Thank you in advance!
The problem is that the code has no error checking, and there is something wrong with the remote computer. Add error checking to that code (it's not hard to do), re-run it, and then see what happens. If you still have trouble, report back.
Here is the code suitably modified with error checking:
// example1.cpp : Defines the entry point for the console application.
//
#include <stdio.h>
#include <cuda.h>

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

// Kernel that executes on the CUDA device
__global__ void square_array(float *a, int N)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx<N) a[idx] = a[idx] * a[idx];
}

// main routine that executes on the host
int main(void)
{
    float *a_h, *a_d;  // Pointer to host & device arrays
    const int N = 10;  // Number of elements in arrays
    size_t size = N * sizeof(float);

    a_h = (float *)malloc(size);       // Allocate array on host
    cudaMalloc((void **) &a_d, size);  // Allocate array on device
    cudaCheckErrors("cudaMalloc fail");

    // Initialize host array and copy it to CUDA device
    for (int i=0; i<N; i++) a_h[i] = (float)i;
    cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);
    cudaCheckErrors("cudaMemcpy 1 fail");

    // Do calculation on device:
    int block_size = 4;
    int n_blocks = N/block_size + (N%block_size == 0 ? 0 : 1);
    square_array <<< n_blocks, block_size >>> (a_d, N);
    cudaDeviceSynchronize();
    cudaCheckErrors("kernel fail");

    // Retrieve result from device and store it in host array
    cudaMemcpy(a_h, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
    cudaCheckErrors("cudaMemcpy 2 fail");

    // Print results
    for (int i=0; i<N; i++) printf("%d %f\n", i, a_h[i]);

    // Cleanup
    free(a_h); cudaFree(a_d);
}