Cuda GPUassert: an illegal memory access was encountered - cuda

I was trying to make a game program using __device__ variables instead of allocating the arrays dynamically with cudaMalloc, but it keeps telling me "GPUassert: an illegal memory access was encountered" at the third-to-last line, where cudaDeviceSynchronize() is called. I have tried the version using cudaMalloc and it worked fine.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <cmath>
#include <stdio.h>
#include <stdlib.h>
// Wraps a CUDA runtime call and reports a failure together with the file and
// line of the call site. The do/while(0) wrapper makes the macro expand to a
// single statement, so it stays correct inside un-braced if/else bodies.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Prints the CUDA error string and source location to stderr when `code` is
// not cudaSuccess. Unless `abort` is false, the process then exits with `code`
// as its exit status. Does nothing when `code` is cudaSuccess.
inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
{
    if (code == cudaSuccess) return;
    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort) exit(code);
}
// Matrix dimensions used to size the arrays below: M*K, K*N and M*N elements.
#define M 3
#define N 3
#define K 3
using namespace std;
// Statically sized device-side storage for the two inputs and the product.
__device__ double A_dev[M * K];
__device__ double B_dev[K * N];
__device__ double C_dev[M * N];
// Computes C = A * B for row-major matrices, one output element per thread.
//   A : m x k input, element (r, j) at A[r * k + j]
//   B : k x n input, element (j, c) at B[j * n + c]
//   C : m x n output, element (r, c) at C[r * n + c]
// The x thread index selects the row and the y thread index the column, so the
// launch must cover at least m threads in x and n threads in y.
__global__ void gemm(double* A, double* B, double* C, int m, int n, int k)
{
    int x = blockDim.x * blockIdx.x + threadIdx.x;
    int y = blockDim.y * blockIdx.y + threadIdx.y;
    // Surplus threads of an oversized launch would index past the matrices.
    if (x >= m || y >= n) return;
    int i = x * n + y;
    double sum = 0.0;
    for (int j = 0; j < k; j++)
    {
        sum += A[x * k + j] * B[n * j + y];
    }
    C[i] = sum;
    // Device-side printf: debugging aid only.
    printf("The value is %f", C[i]);
}
// Host driver from the question: fills the host matrices, copies them into the
// __device__ arrays, launches gemm and copies the product back to the host.
int main(void)
{
double A_h[M * K];
double B_h[K * N];
double C_h[M * N];
// A single bound of M*K indexes all three arrays; this relies on M*K, K*N and
// M*N being equal, which holds here because M, N and K are all 3.
for (int i = 0; i < M*K; i++)
{
A_h[i] = (double)i;
B_h[i] = (double)i;
C_h[i] = 0.0;
}
gpuErrchk(cudaMemcpyToSymbol(A_dev, A_h, M * K * sizeof(double), 0, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpyToSymbol(B_dev, B_h, K * N * sizeof(double), 0, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpyToSymbol(C_dev, C_h, M * N * sizeof(double), 0, cudaMemcpyHostToDevice));
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
dim3 dimGrid(1, 1, 1);
dim3 dimBlock(3, 3, 1);
// NOTE: the __device__ arrays are passed here as kernel arguments from host
// code. The accompanying answer identifies this as the mistake: __device__
// variables are used directly inside the kernel and are not passed as arguments.
gemm <<<dimGrid, dimBlock >>> (A_dev, B_dev, C_dev, 3, 3, 3);
gpuErrchk(cudaPeekAtLastError());
// The question reports the illegal-memory-access error at this synchronize call.
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaMemcpyFromSymbol(C_h, C_dev, M * N * sizeof(double), 0, cudaMemcpyDeviceToHost));
return 0;
}

When using __device__ variables, they are inherently at global scope, and we do not pass those as kernel arguments. You use those variables directly in kernel code without having to have a kernel argument for them.
If you make the following changes to your code, it will run without error:
#include <iostream>
#include <cmath>
#include <stdio.h>
#include <stdlib.h>
// Wraps a CUDA runtime call and reports a failure together with the file and
// line of the call site. The do/while(0) wrapper makes the macro expand to a
// single statement, so it stays correct inside un-braced if/else bodies.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Prints the CUDA error string and source location to stderr when `code` is
// not cudaSuccess. Unless `abort` is false, the process then exits with `code`
// as its exit status. Does nothing when `code` is cudaSuccess.
inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
{
    if (code == cudaSuccess) return;
    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort) exit(code);
}
// Matrix dimensions used to size the arrays below: M*K, K*N and M*N elements.
#define M 3
#define N 3
#define K 3
using namespace std;
// Statically sized device-side storage, referenced directly by the gemm kernel.
__device__ double A_dev[M * K];
__device__ double B_dev[K * N];
__device__ double C_dev[M * N];
// Computes C_dev = A_dev * B_dev on the file-scope __device__ arrays, which
// are accessed directly rather than through kernel arguments.
//   m, n, k : A_dev is read as m x k, B_dev as k x n, C_dev is written as
//             m x n, all row-major.
// The x thread index selects the row and the y thread index the column, so the
// launch must cover at least m threads in x and n threads in y.
__global__ void gemm(int m, int n, int k)
{
    int x = blockDim.x * blockIdx.x + threadIdx.x;
    int y = blockDim.y * blockIdx.y + threadIdx.y;
    // Surplus threads of an oversized launch would index past the arrays.
    if (x >= m || y >= n) return;
    int i = x * n + y;
    double sum = 0.0;
    for (int j = 0; j < k; j++)
    {
        sum += A_dev[x * k + j] * B_dev[n * j + y];
    }
    C_dev[i] = sum;
    // Device-side printf: debugging aid only.
    printf("The value is %f", C_dev[i]);
}
// Host driver: fills the host matrices, copies them into the __device__
// arrays with cudaMemcpyToSymbol, runs gemm on a single M x N thread block and
// copies the product back into C_h.
int main(void)
{
    double A_h[M * K];
    double B_h[K * N];
    double C_h[M * N];
    // Each array is filled over its own extent so the initialisation stays
    // in bounds if M, N and K are ever given different values.
    for (int i = 0; i < M * K; i++) A_h[i] = (double)i;
    for (int i = 0; i < K * N; i++) B_h[i] = (double)i;
    for (int i = 0; i < M * N; i++) C_h[i] = 0.0;
    gpuErrchk(cudaMemcpyToSymbol(A_dev, A_h, M * K * sizeof(double), 0, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpyToSymbol(B_dev, B_h, K * N * sizeof(double), 0, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpyToSymbol(C_dev, C_h, M * N * sizeof(double), 0, cudaMemcpyHostToDevice));
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    // One thread per output element: x spans the M rows, y the N columns.
    dim3 dimGrid(1, 1, 1);
    dim3 dimBlock(M, N, 1);
    gemm<<<dimGrid, dimBlock>>>(M, N, K);
    gpuErrchk(cudaPeekAtLastError());   // launch-configuration errors
    gpuErrchk(cudaDeviceSynchronize()); // errors raised while the kernel ran
    gpuErrchk(cudaMemcpyFromSymbol(C_h, C_dev, M * N * sizeof(double), 0, cudaMemcpyDeviceToHost));
    return 0;
}

Related

Dot product in Cuda by example does not work for me

I'm starting to read the book "CUDA by Example" and I've run into a problem with the dot-product example that uses shared memory. I copied and pasted the example from the book and set N = x * 1024, threadsPerBlock = 32 and blocksPerGrid = 8, and tested the "x" values 2, 3, 4 and 5.
If I set x = 3, the result is bad, but when I used x = 2,4,5 all is ok. I don't understand where is the problem. The code is:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
// Function-like macros: every argument is parenthesised so that expression
// arguments (e.g. `a & b`, `n + 1`) are evaluated as a unit.
// imin        : smaller of a and b (arguments may be evaluated twice).
// sum_squares : closed form of 0^2 + 1^2 + ... + x^2.
#define imin(a, b) ((a) < (b) ? (a) : (b))
#define sum_squares(x) ((x) * ((x) + 1) * (2 * (x) + 1) / 6)
// Problem size multiplier; N is derived from it as described in the question.
const int x = 3;
const int N = x * 1024;
// Launch configuration used by main and by the shared cache in the kernel.
const int threadsPerBlock = 32;
const int blocksPerGrid = 8;
// Partial dot product of a and b over N elements.
// Each thread accumulates products for the indices it owns (its global index,
// then every blockDim.x * gridDim.x after that), the block combines its
// per-thread sums in shared memory, and thread 0 writes the block's total to
// c[blockIdx.x]. The host is expected to add up the per-block entries of c.
// The halving reduction pairs element t with element t + half, so it only
// covers every element when blockDim.x is a power of two; the shared cache
// holds threadsPerBlock entries.
__global__ void dot(float *a, float *b, float *c)
{
    __shared__ float cache[threadsPerBlock];
    const int lane = threadIdx.x;
    float partial = 0;
    for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < N; idx += blockDim.x * gridDim.x)
    {
        partial += a[idx] * b[idx];
    }
    cache[lane] = partial;
    // All per-thread sums must be visible before the reduction reads them.
    __syncthreads();
    for (int half = blockDim.x / 2; half != 0; half /= 2)
    {
        if (lane < half)
            cache[lane] += cache[lane + half];
        // Reached by every thread of the block, outside the divergent branch.
        __syncthreads();
    }
    if (lane == 0)
        c[blockIdx.x] = cache[0];
}
// Host driver from the question: builds a[i] = i and b[i] = 2*i, runs the dot
// kernel, sums the per-block partial results on the host and compares the
// total with the closed-form value 2 * sum_squares(N - 1).
int main()
{
float *a, *b, *partial_c, result;
float *d_a, *d_b, *d_partial_c;
a = (float *)malloc(N * sizeof(float));
b = (float *)malloc(N * sizeof(float));
partial_c = (float *)malloc(blocksPerGrid * sizeof(float));
// NOTE(review): none of the CUDA calls below have their return codes checked.
cudaMalloc((void **)&d_a, N * sizeof(float));
cudaMalloc((void **)&d_b, N * sizeof(float));
cudaMalloc((void **)&d_partial_c, blocksPerGrid * sizeof(float));
for (int i = 0; i < N; i++)
{
a[i] = i;
b[i] = 2 * i;
}
cudaMemcpy(d_a, a, N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b, N * sizeof(float), cudaMemcpyHostToDevice);
dot << <blocksPerGrid, threadsPerBlock >> >(d_a, d_b, d_partial_c);
cudaMemcpy(partial_c, d_partial_c, blocksPerGrid * sizeof(float), cudaMemcpyDeviceToHost);
// Final reduction of the per-block partial sums, accumulated in float.
result = 0;
for (int i = 0; i < blocksPerGrid; i++)
result += partial_c[i];
// Exact float equality against the closed form. The accompanying answer
// attributes the mismatch to float holding only ~7 decimal digits while the
// expected value for x = 3 has 11 digits.
if (2 * sum_squares((float)(N - 1)) == result)
printf(":)\n");
else
printf(":(\n");
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_partial_c);
free(a);
free(b);
free(partial_c);
// Waits for a key press before exiting.
getchar();
return 0;
}
This happens because float does not have enough precision: it holds only about 7 significant decimal digits. For x = 3, your expected result is
19317916672
containing 11 digits.
for x=4,5, the results are bad on my machine too.

Call multiple CUDA SVD (in cuSolver) [duplicate]

I'm new to parallel programming using GPU so I apologize if the question is broad or vague. I'm aware there is some parallel SVD function in the CULA library, but what should be the strategy if I have a large number of relatively small matrices to factorize? For example I have n matrices with dimension d, n is large and d is small. How to parallelize this process? Could anyone give me a hint?
My previous answer is now out-of-date. As of February 2015, CUDA 7 (currently in release candidate version) offers full SVD capabilities in its cuSOLVER library. Below, I'm providing an example of generating the singular value decomposition using CUDA cuSOLVER.
Concerning the specific issue you are raising (calculating the SVD of several small matrices), you should adapt the example I'm providing below by using streams. To associate a stream with each task you can use
cudaStreamCreate()
and
cusolverDnSetStream()
kernel.cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include<iostream>
#include<iomanip>
#include<stdlib.h>
#include<stdio.h>
#include<assert.h>
#include<math.h>
#include <cusolverDn.h>
#include <cuda_runtime_api.h>
#include "Utilities.cuh"
/********/
/* MAIN */
/********/
// Computes the full SVD of one Nrows x Ncols double-precision matrix with
// cusolverDnDgesvd and prints the singular values and both sets of singular
// vectors. Every cuSOLVER status is checked and all host and device buffers
// are released before returning.
int main(){
    // --- gesvd only supports Nrows >= Ncols
    // --- column major memory ordering
    const int Nrows = 7;
    const int Ncols = 5;
    // --- cuSOLVE input/output parameters/arrays
    int work_size = 0;
    int *devInfo; gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
    // --- CUDA solver initialization
    cusolverDnHandle_t solver_handle;
    cusolveSafeCall(cusolverDnCreate(&solver_handle));
    // --- Setting the host, Nrows x Ncols matrix
    double *h_A = (double *)malloc(Nrows * Ncols * sizeof(double));
    for(int j = 0; j < Nrows; j++)
        for(int i = 0; i < Ncols; i++)
            h_A[j + i*Nrows] = (i + j*j) * sqrt((double)(i + j));
    // --- Setting the device matrix and moving the host matrix to the device
    double *d_A; gpuErrchk(cudaMalloc(&d_A, Nrows * Ncols * sizeof(double)));
    gpuErrchk(cudaMemcpy(d_A, h_A, Nrows * Ncols * sizeof(double), cudaMemcpyHostToDevice));
    // --- host side SVD results space
    double *h_U = (double *)malloc(Nrows * Nrows * sizeof(double));
    double *h_V = (double *)malloc(Ncols * Ncols * sizeof(double));
    double *h_S = (double *)malloc(min(Nrows, Ncols) * sizeof(double));
    // --- device side SVD workspace and matrices
    double *d_U; gpuErrchk(cudaMalloc(&d_U, Nrows * Nrows * sizeof(double)));
    double *d_V; gpuErrchk(cudaMalloc(&d_V, Ncols * Ncols * sizeof(double)));
    double *d_S; gpuErrchk(cudaMalloc(&d_S, min(Nrows, Ncols) * sizeof(double)));
    // --- CUDA SVD initialization
    cusolveSafeCall(cusolverDnDgesvd_bufferSize(solver_handle, Nrows, Ncols, &work_size));
    double *work; gpuErrchk(cudaMalloc(&work, work_size * sizeof(double)));
    // --- CUDA SVD execution
    cusolveSafeCall(cusolverDnDgesvd(solver_handle, 'A', 'A', Nrows, Ncols, d_A, Nrows, d_S, d_U, Nrows, d_V, Ncols, work, work_size, NULL, devInfo));
    int devInfo_h = 0; gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
    if (devInfo_h != 0) std::cout << "Unsuccessful SVD execution\n\n";
    // --- Moving the results from device to host
    gpuErrchk(cudaMemcpy(h_S, d_S, min(Nrows, Ncols) * sizeof(double), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_U, d_U, Nrows * Nrows * sizeof(double), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_V, d_V, Ncols * Ncols * sizeof(double), cudaMemcpyDeviceToHost));
    std::cout << "Singular values\n";
    for(int i = 0; i < min(Nrows, Ncols); i++)
        std::cout << "d_S["<<i<<"] = " << std::setprecision(15) << h_S[i] << std::endl;
    std::cout << "\nLeft singular vectors - For y = A * x, the columns of U span the space of y\n";
    for(int j = 0; j < Nrows; j++) {
        printf("\n");
        for(int i = 0; i < Nrows; i++)
            printf("U[%i,%i]=%f\n",i,j,h_U[j*Nrows + i]);
    }
    std::cout << "\nRight singular vectors - For y = A * x, the columns of V span the space of x\n";
    for(int i = 0; i < Ncols; i++) {
        printf("\n");
        for(int j = 0; j < Ncols; j++)
            printf("V[%i,%i]=%f\n",i,j,h_V[j*Ncols + i]);
    }
    // --- Releasing the solver handle and every device/host buffer
    cusolveSafeCall(cusolverDnDestroy(solver_handle));
    gpuErrchk(cudaFree(work));
    gpuErrchk(cudaFree(d_S));
    gpuErrchk(cudaFree(d_V));
    gpuErrchk(cudaFree(d_U));
    gpuErrchk(cudaFree(d_A));
    gpuErrchk(cudaFree(devInfo));
    free(h_S);
    free(h_V);
    free(h_U);
    free(h_A);
    return 0;
}
Utilities.cuh
#ifndef UTILITIES_CUH
#define UTILITIES_CUH
// The declarations below use cudaError_t and cusolverStatus_t, so the headers
// that define them are included here; the header then compiles regardless of
// the include order chosen by the file that uses it.
#include <cuda_runtime.h>
#include <cusolverDn.h>
// Integer division of the first argument by the second, rounded up when the
// division leaves a remainder.
extern "C" int iDivUp(int, int);
// Reports a failed CUDA runtime call on stderr and terminates the program.
extern "C" void gpuErrchk(cudaError_t);
// Reports a failed cuSOLVER call on stderr and terminates the program.
extern "C" void cusolveSafeCall(cusolverStatus_t);
#endif
Utilities.cu
#include <stdio.h>
#include <assert.h>
#include "cuda_runtime.h"
#include <cuda.h>
#include <cusolverDn.h>
/*******************/
/* iDivUp FUNCTION */
/*******************/
// Returns a / b, plus one when the division leaves a remainder. For
// non-negative a and positive b this is the ceiling of a / b, i.e. the number
// of blocks of size b needed to cover a elements.
extern "C" int iDivUp(int a, int b)
{
    const int quotient = a / b;
    if (a % b == 0)
        return quotient;
    return quotient + 1;
}
/********************/
/* CUDA ERROR CHECK */
/********************/
// --- Credit to http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
// Prints the CUDA error string and source location to stderr when `code` is
// not cudaSuccess, then exits with `code` as the exit status unless `abort`
// is false. `file` is const-qualified because it receives the string literal
// produced by __FILE__.
void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) { exit(code); }
    }
}
// C-linkage wrapper declared in Utilities.cuh.
// NOTE: __FILE__ and __LINE__ expand inside this wrapper, so the location
// printed on failure is this line of the utility source, not the call site.
extern "C" void gpuErrchk(cudaError_t ans) { gpuAssert((ans), __FILE__, __LINE__); }
/**************************/
/* CUSOLVE ERROR CHECKING */
/**************************/
// Maps a cuSOLVER status code to the name of its enumerator. Codes not listed
// in the switch map to "<unknown>".
static const char *_cudaGetErrorEnum(cusolverStatus_t error)
{
    switch (error)
    {
        case CUSOLVER_STATUS_SUCCESS:
            // Label now matches the enumerator name, like every other case.
            return "CUSOLVER_STATUS_SUCCESS";
        case CUSOLVER_STATUS_NOT_INITIALIZED:
            return "CUSOLVER_STATUS_NOT_INITIALIZED";
        case CUSOLVER_STATUS_ALLOC_FAILED:
            return "CUSOLVER_STATUS_ALLOC_FAILED";
        case CUSOLVER_STATUS_INVALID_VALUE:
            return "CUSOLVER_STATUS_INVALID_VALUE";
        case CUSOLVER_STATUS_ARCH_MISMATCH:
            return "CUSOLVER_STATUS_ARCH_MISMATCH";
        case CUSOLVER_STATUS_EXECUTION_FAILED:
            return "CUSOLVER_STATUS_EXECUTION_FAILED";
        case CUSOLVER_STATUS_INTERNAL_ERROR:
            return "CUSOLVER_STATUS_INTERNAL_ERROR";
        case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
            return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
    }
    return "<unknown>";
}
// Reports a failed cuSOLVER call and stops the program: prints the location
// passed in `file`/`line`, the numeric status and its name, resets the device
// and trips assert(0). Does nothing when `err` is CUSOLVER_STATUS_SUCCESS.
// The format string carries exactly one conversion per argument; the status
// is cast to int for %d.
inline void __cusolveSafeCall(cusolverStatus_t err, const char *file, const int line)
{
    if (CUSOLVER_STATUS_SUCCESS != err) {
        fprintf(stderr, "CUSOLVE error in file '%s', line %d\nerror %d: %s\nterminating!\n",
                file, line, (int)err, _cudaGetErrorEnum(err));
        cudaDeviceReset(); assert(0);
    }
}
// C-linkage wrapper declared in Utilities.cuh.
// NOTE: __FILE__ and __LINE__ expand inside this wrapper, so the location
// printed on failure is this line of the utility source, not the call site.
extern "C" void cusolveSafeCall(cusolverStatus_t err) { __cusolveSafeCall(err, __FILE__, __LINE__); }
You can take a look at the Batched Operations post of the CULA blog for a discussion of your problem.
EDIT
From what I understand from your comment below, you would like each thread to calculate a separate SVD. So, basically each thread should execute a standard, sequential SVD scheme. For that some possibly useful references:
Numerical Recipes
Golub, Van Loan, Matrix Computations
If you use this approach, though, I'm afraid you will no longer be able to use cuBLAS, as those are host functions not callable from the device (unless you have a compute capability of at least 3.5; see the simpleDevLibCUBLAS example). But basically, in this way, I think you are somehow implementing the batch concept by yourself.
If you decide to go to a more standard parallel GPU implementation, the reference below could be of interest:
Singular Value Decomposition on GPU using CUDA
The above answers are now out of date. As of CUDA 9.0, the cuSOLVER library has been equipped with a batched SVD calculation based on the Jacobi method. Below, a fully worked example:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cusolverDn.h>
#include "Utilities.cuh"
#include "TimingGPU.cuh"
//#define FULLSVD
//#define PRINTRESULTS
/********/
/* MAIN */
/********/
// Batched Jacobi SVD example: factorises numMatrices column-major M x N
// matrices with cusolverDnDgesvdjBatched and reports convergence per matrix.
// Define FULLSVD to also compute the singular vectors and PRINTRESULTS to dump
// the results.
// The info output of the batched routine holds one int per matrix, so the
// device buffer and its host copy are sized by numMatrices.
int main() {
    const int M = 3;
    const int N = 3;
    const int lda = M; // --- Leading dimension of A, lda >= max(1, M)
    const int ldu = M; // --- Leading dimension of U, ldu >= max(1, M)
    const int ldv = N; // --- Leading dimension of V, ldv >= max(1, N)
    //const int numMatrices = 3;
    const int numMatrices = 16384;
    TimingGPU timerGPU;
    // --- Setting the host matrix
    double *h_A = (double *)malloc(lda * N * numMatrices * sizeof(double));
    for (int k = 0; k < numMatrices; k++)
        for (int i = 0; i < M; i++){
            for (int j = 0; j < N; j++){
                h_A[k * M * N + j * M + i] = (1. / (k + 1)) * (i + j * j) * (i + j);
                //printf("%d %d %f\n", i, j, h_A[j*M + i]);
            }
        }
    // --- Setting the device matrix and moving the host matrix to the device
    double *d_A; gpuErrchk(cudaMalloc(&d_A, M * N * numMatrices * sizeof(double)));
    gpuErrchk(cudaMemcpy(d_A, h_A, M * N * numMatrices * sizeof(double), cudaMemcpyHostToDevice));
    // --- host side SVD results space
    double *h_S = (double *)malloc(N * numMatrices * sizeof(double));
    double *h_U = NULL;
    double *h_V = NULL;
#ifdef FULLSVD
    h_U = (double *)malloc(ldu * M * numMatrices * sizeof(double));
    h_V = (double *)malloc(ldv * N * numMatrices * sizeof(double));
#endif
    // --- device side SVD workspace and matrices
    int work_size = 0;
    // --- One status entry per matrix of the batch
    int *devInfo; gpuErrchk(cudaMalloc(&devInfo, numMatrices * sizeof(int)));
    double *d_S; gpuErrchk(cudaMalloc(&d_S, N * numMatrices * sizeof(double)));
    double *d_U = NULL;
    double *d_V = NULL;
#ifdef FULLSVD
    gpuErrchk(cudaMalloc(&d_U, ldu * M * numMatrices * sizeof(double)));
    gpuErrchk(cudaMalloc(&d_V, ldv * N * numMatrices * sizeof(double)));
#endif
    double *d_work = NULL; /* device workspace for gesvdj */
    int *devInfo_h = (int *)malloc(numMatrices * sizeof(int)); /* host copy of the per-matrix status */
    // --- Parameters configuration of Jacobi-based SVD
    const double tol = 1.e-7;
    const int maxSweeps = 15;
    cusolverEigMode_t jobz; // --- CUSOLVER_EIG_MODE_VECTOR - Compute eigenvectors; CUSOLVER_EIG_MODE_NOVECTOR - Compute singular values only
#ifdef FULLSVD
    jobz = CUSOLVER_EIG_MODE_VECTOR;
#else
    jobz = CUSOLVER_EIG_MODE_NOVECTOR;
#endif
    const int econ = 0; // --- econ = 1 for economy size (only used when printing the singular vectors)
    // --- CUDA solver initialization
    cusolverDnHandle_t solver_handle = NULL;
    cusolveSafeCall(cusolverDnCreate(&solver_handle));
    // --- Configuration of gesvdj
    gesvdjInfo_t gesvdj_params = NULL;
    cusolveSafeCall(cusolverDnCreateGesvdjInfo(&gesvdj_params));
    // --- Set the computation tolerance, since the default tolerance is machine precision
    cusolveSafeCall(cusolverDnXgesvdjSetTolerance(gesvdj_params, tol));
    // --- Set the maximum number of sweeps, since the default value of max. sweeps is 100
    cusolveSafeCall(cusolverDnXgesvdjSetMaxSweeps(gesvdj_params, maxSweeps));
    // --- Query the SVD workspace
    cusolveSafeCall(cusolverDnDgesvdjBatched_bufferSize(
        solver_handle,
        jobz,   // --- Compute the singular vectors or not
        M,      // --- Number of rows of A, 0 <= M
        N,      // --- Number of columns of A, 0 <= N
        d_A,    // --- M x N per matrix
        lda,    // --- Leading dimension of A
        d_S,    // --- Singular values, N stored per matrix
        d_U,    // --- Left singular vectors, ldu x M per matrix
        ldu,    // --- Leading dimension of U, ldu >= max(1, M)
        d_V,    // --- Right singular vectors, ldv x N per matrix
        ldv,    // --- Leading dimension of V, ldv >= max(1, N)
        &work_size,
        gesvdj_params,
        numMatrices));
    gpuErrchk(cudaMalloc(&d_work, sizeof(double) * work_size));
    // --- Compute SVD
    timerGPU.StartCounter();
    cusolveSafeCall(cusolverDnDgesvdjBatched(
        solver_handle,
        jobz,   // --- Compute the singular vectors or not
        M,      // --- Number of rows of A, 0 <= M
        N,      // --- Number of columns of A, 0 <= N
        d_A,    // --- M x N per matrix
        lda,    // --- Leading dimension of A
        d_S,    // --- Singular values, N stored per matrix
        d_U,    // --- Left singular vectors, ldu x M per matrix
        ldu,    // --- Leading dimension of U, ldu >= max(1, M)
        d_V,    // --- Right singular vectors, ldv x N per matrix
        ldv,    // --- Leading dimension of V, ldv >= max(1, N)
        d_work,
        work_size,
        devInfo,
        gesvdj_params,
        numMatrices));
    printf("Calculation of the singular values only: %f ms\n\n", timerGPU.GetCounter());
    gpuErrchk(cudaMemcpy(devInfo_h, devInfo, sizeof(int) * numMatrices, cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_S, d_S, sizeof(double) * N * numMatrices, cudaMemcpyDeviceToHost));
#ifdef FULLSVD
    gpuErrchk(cudaMemcpy(h_U, d_U, sizeof(double) * ldu * M * numMatrices, cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_V, d_V, sizeof(double) * ldv * N * numMatrices, cudaMemcpyDeviceToHost));
#endif
#ifdef PRINTRESULTS
    printf("SINGULAR VALUES \n");
    printf("_______________ \n");
    for (int k = 0; k < numMatrices; k++) {
        for (int p = 0; p < N; p++)
            printf("Matrix nr. %d; SV nr. %d; Value = %f\n", k, p, h_S[k * N + p]);
        printf("\n");
    }
#ifdef FULLSVD
    printf("SINGULAR VECTORS U \n");
    printf("__________________ \n");
    for (int k = 0; k < numMatrices; k++) {
        for (int q = 0; q < (1 - econ) * M + econ * min(M, N); q++)
            for (int p = 0; p < M; p++)
                printf("Matrix nr. %d; U nr. %d; Value = %f\n", k, p, h_U[((1 - econ) * M + econ * min(M, N)) * M * k + q * M + p]);
        printf("\n");
    }
    printf("SINGULAR VECTORS V \n");
    printf("__________________ \n");
    for (int k = 0; k < numMatrices; k++) {
        for (int q = 0; q < (1 - econ) * N + econ * min(M, N); q++)
            for (int p = 0; p < N; p++)
                printf("Matrix nr. %d; V nr. %d; Value = %f\n", k, p, h_V[((1 - econ) * N + econ * min(M, N)) * N * k + q * N + p]);
        printf("\n");
    }
#endif
#endif
    // --- Inspect the status of every matrix in the batch
    int numNotConverged = 0;
    for (int k = 0; k < numMatrices; k++) {
        if (0 == devInfo_h[k]) continue;
        if (0 > devInfo_h[k]){
            printf("%d-th parameter is wrong \n", -devInfo_h[k]);
            exit(1);
        }
        printf("WARNING: matrix %d, devInfo_h = %d : gesvdj does not converge \n", k, devInfo_h[k]);
        numNotConverged++;
    }
    if (0 == numNotConverged){
        printf("gesvdj converges \n");
    }
    // --- Free resources
    if (d_A) gpuErrchk(cudaFree(d_A));
    if (d_S) gpuErrchk(cudaFree(d_S));
#ifdef FULLSVD
    if (d_U) gpuErrchk(cudaFree(d_U));
    if (d_V) gpuErrchk(cudaFree(d_V));
#endif
    if (devInfo) gpuErrchk(cudaFree(devInfo));
    if (d_work) gpuErrchk(cudaFree(d_work));
    if (solver_handle) cusolveSafeCall(cusolverDnDestroy(solver_handle));
    if (gesvdj_params) cusolveSafeCall(cusolverDnDestroyGesvdjInfo(gesvdj_params));
    free(devInfo_h);
    free(h_S);
    free(h_U); // free(NULL) is a no-op when FULLSVD is not defined
    free(h_V);
    free(h_A);
    gpuErrchk(cudaDeviceReset());
    return 0;
}

Singular values calculation only with CUDA

I'm trying to use the new cusolverDnSgesvd routine of CUDA 7.0 for the calculation of the singular values. The full code is reported below:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include<iostream>
#include<stdlib.h>
#include<stdio.h>
#include <cusolverDn.h>
#include <cuda_runtime_api.h>
/***********************/
/* CUDA ERROR CHECKING */
/***********************/
// Prints the CUDA error string and source location to stderr when `code` is
// not cudaSuccess, then exits with `code` as the exit status unless `abort`
// is false. `file` is const-qualified because it receives the string literal
// produced by __FILE__.
void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) { exit(code); }
    }
}
// Function wrapper around gpuAssert.
// NOTE: __FILE__ and __LINE__ expand inside this wrapper, so the location
// printed on failure is this line, not the line of the failing call.
void gpuErrchk(cudaError_t ans) { gpuAssert((ans), __FILE__, __LINE__); }
/********/
/* MAIN */
/********/
// Requests the singular values only (jobu = jobvt = 'N') of an M x N
// single-precision matrix from cusolverDnSgesvd and prints the returned
// status, devInfo and the singular values.
int main(){
    int M = 10;
    int N = 10;
    // --- Setting the host matrix
    float *h_A = (float *)malloc(M * N * sizeof(float));
    for(int i = 0; i < M; i++){
        for(int j = 0; j < N; j++){
            h_A[j*M + i] = (i + j) * (i + j);
        }
    }
    // --- Setting the device matrix and moving the host matrix to the device
    float *d_A; gpuErrchk(cudaMalloc(&d_A, M * N * sizeof(float)));
    gpuErrchk(cudaMemcpy(d_A, h_A, M * N * sizeof(float), cudaMemcpyHostToDevice));
    // --- host side SVD results space
    float *h_U = (float *)malloc(M * M * sizeof(float));
    float *h_V = (float *)malloc(N * N * sizeof(float));
    float *h_S = (float *)malloc(N * sizeof(float));
    // --- device side SVD workspace and matrices
    int work_size = 0;
    int *devInfo; gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
    float *d_U; gpuErrchk(cudaMalloc(&d_U, M * M * sizeof(float)));
    float *d_V; gpuErrchk(cudaMalloc(&d_V, N * N * sizeof(float)));
    float *d_S; gpuErrchk(cudaMalloc(&d_S, N * sizeof(float)));
    cusolverStatus_t stat;
    // --- CUDA solver initialization (handle creation is checked as well)
    cusolverDnHandle_t solver_handle;
    stat = cusolverDnCreate(&solver_handle);
    if (stat == CUSOLVER_STATUS_SUCCESS) stat = cusolverDnSgesvd_bufferSize(solver_handle, M, N, &work_size);
    if(stat != CUSOLVER_STATUS_SUCCESS ) std::cout << "Initialization of cuSolver failed. \n";
    float *work; gpuErrchk(cudaMalloc(&work, work_size * sizeof(float)));
    //float *rwork; gpuErrchk(cudaMalloc(&rwork, work_size * sizeof(float)));
    // --- CUDA SVD execution
    //stat = cusolverDnSgesvd(solver_handle, 'A', 'A', M, N, d_A, M, d_S, d_U, M, d_V, N, work, work_size, NULL, devInfo);
    stat = cusolverDnSgesvd(solver_handle, 'N', 'N', M, N, d_A, M, d_S, d_U, M, d_V, N, work, work_size, NULL, devInfo);
    gpuErrchk(cudaDeviceSynchronize());
    int devInfo_h = 0;
    gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
    std::cout << "devInfo = " << devInfo_h << "\n";
    switch(stat){
        case CUSOLVER_STATUS_SUCCESS: std::cout << "SVD computation success\n"; break;
        case CUSOLVER_STATUS_NOT_INITIALIZED: std::cout << "Library cuSolver not initialized correctly\n"; break;
        case CUSOLVER_STATUS_INVALID_VALUE: std::cout << "Invalid parameters passed\n"; break;
        case CUSOLVER_STATUS_INTERNAL_ERROR: std::cout << "Internal operation failed\n"; break;
        default: break; // other status codes are not reported individually
    }
    if (devInfo_h == 0 && stat == CUSOLVER_STATUS_SUCCESS) std::cout << "SVD successful\n\n";
    // --- Moving the results from device to host
    gpuErrchk(cudaMemcpy(h_S, d_S, N * sizeof(float), cudaMemcpyDeviceToHost));
    for(int i = 0; i < N; i++) std::cout << "d_S["<<i<<"] = " << h_S[i] << std::endl;
    // --- Releasing the solver handle and every device/host buffer
    cusolverDnDestroy(solver_handle);
    gpuErrchk(cudaFree(work));
    gpuErrchk(cudaFree(d_S));
    gpuErrchk(cudaFree(d_V));
    gpuErrchk(cudaFree(d_U));
    gpuErrchk(cudaFree(devInfo));
    gpuErrchk(cudaFree(d_A));
    free(h_S);
    free(h_V);
    free(h_U);
    free(h_A);
    return 0;
}
If I ask for the computation of the full SVD (commented line with jobu = 'A' and jobvt = 'A') everything works fine. If I ask for the computation of the singular values only (line with jobu = 'N' and jobvt = 'N'), cusolverDnSgesvd returns
CUSOLVER_STATUS_INVALID_VALUE
Please note that, in this case devInfo = 0, so I cannot spot the invalid parameter.
Please also note that the documentation PDF lacks information about the rwork parameter so that I have dealt with it as a dummy parameter.
At this time the cuSolver gesvd function only supports jobu = 'A' and jobvt = 'A'
So the error when you specify other combinations is expected. From the documentation:
Remark 2: gesvd only supports jobu='A' and jobvt='A' and returns matrix U and VH
USE OF cusolver<T>nSgesvd
As remarked by lebedov, as of CUDA 8.0, it is now possible to calculate the singular values only by cusolverDnSgesvd. I report below a slightly modified version of your code with two calls to cusolverDnSgesvd, one performing the singular values calculation only
cusolverDnSgesvd(solver_handle, 'N', 'N', M, N, d_A, M, d_S, NULL, M, NULL, N, work, work_size, NULL, devInfo)
and one performing the full SVD calculation
cusolverDnSgesvd(solver_handle, 'A', 'A', M, N, d_A, M, d_S, d_U, M, d_V, N, work, work_size, NULL, devInfo)
As you already remarked, the two 'A' fields for the full SVD case are changed to 'N' in the singular values only case. Please, note that, in the singular values only case, there is no need to store space for the singular vector matrices U and V. Indeed, a NULL pointer is passed.
The singular values calculation only is faster than the full SVD calculation. On a GTX 960, for a 1000x1000 matrix, the timing has been the following:
Singular values only: 559 ms
Full SVD: 2239 ms
Here is the full code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include<iostream>
#include<stdlib.h>
#include<stdio.h>
#include <cusolverDn.h>
#include <cuda_runtime_api.h>
#include "Utilities.cuh"
#include "TimingGPU.cuh"
/********/
/* MAIN */
/********/
// Times cusolverDnSgesvd on the same M x N single-precision matrix twice:
// once for the singular values only (jobu = jobvt = 'N') and once for the
// full SVD (jobu = jobvt = 'A').
// gesvd overwrites its input matrix, so the original matrix is uploaded again
// before the second call; otherwise the full SVD would factorise whatever the
// first call left in d_A.
int main(){
    int M = 1000;
    int N = 1000;
    TimingGPU timerGPU;
    float elapsedTime;
    // --- Setting the host matrix
    float *h_A = (float *)malloc(M * N * sizeof(float));
    for (int i = 0; i < M; i++){
        for (int j = 0; j < N; j++){
            h_A[j*M + i] = (i + j) * (i + j);
        }
    }
    // --- Setting the device matrix and moving the host matrix to the device
    float *d_A; gpuErrchk(cudaMalloc(&d_A, M * N * sizeof(float)));
    gpuErrchk(cudaMemcpy(d_A, h_A, M * N * sizeof(float), cudaMemcpyHostToDevice));
    // --- host side SVD results space
    float *h_U = (float *)malloc(M * M * sizeof(float));
    float *h_V = (float *)malloc(N * N * sizeof(float));
    float *h_S = (float *)malloc(N * sizeof(float));
    // --- device side SVD workspace and matrices
    int work_size = 0;
    int *devInfo; gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
    float *d_U; gpuErrchk(cudaMalloc(&d_U, M * M * sizeof(float)));
    float *d_V; gpuErrchk(cudaMalloc(&d_V, N * N * sizeof(float)));
    float *d_S; gpuErrchk(cudaMalloc(&d_S, N * sizeof(float)));
    // --- CUDA solver initialization
    cusolverDnHandle_t solver_handle;
    cusolveSafeCall(cusolverDnCreate(&solver_handle));
    cusolveSafeCall(cusolverDnSgesvd_bufferSize(solver_handle, M, N, &work_size));
    float *work; gpuErrchk(cudaMalloc(&work, work_size * sizeof(float)));
    // --- CUDA SVD execution - Singular values only
    timerGPU.StartCounter();
    cusolveSafeCall(cusolverDnSgesvd(solver_handle, 'N', 'N', M, N, d_A, M, d_S, NULL, M, NULL, N, work, work_size, NULL, devInfo));
    elapsedTime = timerGPU.GetCounter();
    int devInfo_h = 0;
    gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
    if (devInfo_h == 0)
        printf("SVD successfull for the singular values calculation only\n\n");
    else if (devInfo_h < 0)
        printf("SVD unsuccessfull for the singular values calculation only. Parameter %i is wrong\n", -devInfo_h);
    else
        printf("SVD unsuccessfull for the singular values calculation only. A number of %i superdiagonals of an intermediate bidiagonal form did not converge to zero\n", devInfo_h);
    printf("Calculation of the singular values only: %f ms\n\n", elapsedTime);
    // --- Moving the results from device to host
    //gpuErrchk(cudaMemcpy(h_S, d_S, N * sizeof(float), cudaMemcpyDeviceToHost));
    //for (int i = 0; i < N; i++) std::cout << "d_S[" << i << "] = " << h_S[i] << std::endl;
    // --- Restoring the input matrix, outside the timed region
    gpuErrchk(cudaMemcpy(d_A, h_A, M * N * sizeof(float), cudaMemcpyHostToDevice));
    // --- CUDA SVD execution - Full SVD
    timerGPU.StartCounter();
    cusolveSafeCall(cusolverDnSgesvd(solver_handle, 'A', 'A', M, N, d_A, M, d_S, d_U, M, d_V, N, work, work_size, NULL, devInfo));
    elapsedTime = timerGPU.GetCounter();
    devInfo_h = 0;
    gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
    if (devInfo_h == 0)
        printf("SVD successfull for the full SVD calculation\n\n");
    else if (devInfo_h < 0)
        printf("SVD unsuccessfull for the full SVD calculation. Parameter %i is wrong\n", -devInfo_h);
    else
        printf("SVD unsuccessfull for the full SVD calculation. A number of %i superdiagonals of an intermediate bidiagonal form did not converge to zero\n", devInfo_h);
    printf("Calculation of the full SVD calculation: %f ms\n\n", elapsedTime);
    // --- Releasing the solver handle and every device/host buffer
    cusolveSafeCall(cusolverDnDestroy(solver_handle));
    gpuErrchk(cudaFree(work));
    gpuErrchk(cudaFree(d_S));
    gpuErrchk(cudaFree(d_V));
    gpuErrchk(cudaFree(d_U));
    gpuErrchk(cudaFree(devInfo));
    gpuErrchk(cudaFree(d_A));
    free(h_S);
    free(h_V);
    free(h_U);
    free(h_A);
    return 0;
}
EDIT - PERFORMANCE ACROSS DIFFERENT VERSIONS OF CUDA
I have compared the performance of the singular-values-only calculation and the full SVD computation for CUDA 8.0, CUDA 9.1 and CUDA 10.0, for a 5000x5000 matrix. Here are the results on a GTX 960.
Computation type CUDA 8.0 CUDA 9.1 CUDA 10.0
__________________________________________________________________
Singular values only 17s 15s 15s
Full SVD 161s 159s 457s
__________________________________________________________________

Parallel implementation for multiple SVDs using CUDA

I'm new to parallel programming using GPU so I apologize if the question is broad or vague. I'm aware there is some parallel SVD function in the CULA library, but what should be the strategy if I have a large number of relatively small matrices to factorize? For example I have n matrices with dimension d, n is large and d is small. How to parallelize this process? Could anyone give me a hint?
My previous answer is now out-of-date. As of February 2015, CUDA 7 (currently in release candidate version) offers full SVD capabilities in its cuSOLVER library. Below, I'm providing an example of generating the singular value decomposition using CUDA cuSOLVER.
Concerning the specific issue you are raising (calculating the SVD of several small matrices), you should adapt the example I'm providing below by using streams. To associate a stream with each task you can use
cudaStreamCreate()
and
cusolverDnSetStream()
kernel.cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include<iostream>
#include<iomanip>
#include<stdlib.h>
#include<stdio.h>
#include<assert.h>
#include<math.h>
#include <cusolverDn.h>
#include <cuda_runtime_api.h>
#include "Utilities.cuh"
/********/
/* MAIN */
/********/
// Computes the full SVD of one Nrows x Ncols double-precision matrix with
// cusolverDnDgesvd and prints the singular values and both sets of singular
// vectors. Every cuSOLVER status is checked and all host and device buffers
// are released before returning.
int main(){
    // --- gesvd only supports Nrows >= Ncols
    // --- column major memory ordering
    const int Nrows = 7;
    const int Ncols = 5;
    // --- cuSOLVE input/output parameters/arrays
    int work_size = 0;
    int *devInfo; gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
    // --- CUDA solver initialization
    cusolverDnHandle_t solver_handle;
    cusolveSafeCall(cusolverDnCreate(&solver_handle));
    // --- Setting the host, Nrows x Ncols matrix
    double *h_A = (double *)malloc(Nrows * Ncols * sizeof(double));
    for(int j = 0; j < Nrows; j++)
        for(int i = 0; i < Ncols; i++)
            h_A[j + i*Nrows] = (i + j*j) * sqrt((double)(i + j));
    // --- Setting the device matrix and moving the host matrix to the device
    double *d_A; gpuErrchk(cudaMalloc(&d_A, Nrows * Ncols * sizeof(double)));
    gpuErrchk(cudaMemcpy(d_A, h_A, Nrows * Ncols * sizeof(double), cudaMemcpyHostToDevice));
    // --- host side SVD results space
    double *h_U = (double *)malloc(Nrows * Nrows * sizeof(double));
    double *h_V = (double *)malloc(Ncols * Ncols * sizeof(double));
    double *h_S = (double *)malloc(min(Nrows, Ncols) * sizeof(double));
    // --- device side SVD workspace and matrices
    double *d_U; gpuErrchk(cudaMalloc(&d_U, Nrows * Nrows * sizeof(double)));
    double *d_V; gpuErrchk(cudaMalloc(&d_V, Ncols * Ncols * sizeof(double)));
    double *d_S; gpuErrchk(cudaMalloc(&d_S, min(Nrows, Ncols) * sizeof(double)));
    // --- CUDA SVD initialization
    cusolveSafeCall(cusolverDnDgesvd_bufferSize(solver_handle, Nrows, Ncols, &work_size));
    double *work; gpuErrchk(cudaMalloc(&work, work_size * sizeof(double)));
    // --- CUDA SVD execution
    cusolveSafeCall(cusolverDnDgesvd(solver_handle, 'A', 'A', Nrows, Ncols, d_A, Nrows, d_S, d_U, Nrows, d_V, Ncols, work, work_size, NULL, devInfo));
    int devInfo_h = 0; gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
    if (devInfo_h != 0) std::cout << "Unsuccessful SVD execution\n\n";
    // --- Moving the results from device to host
    gpuErrchk(cudaMemcpy(h_S, d_S, min(Nrows, Ncols) * sizeof(double), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_U, d_U, Nrows * Nrows * sizeof(double), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_V, d_V, Ncols * Ncols * sizeof(double), cudaMemcpyDeviceToHost));
    std::cout << "Singular values\n";
    for(int i = 0; i < min(Nrows, Ncols); i++)
        std::cout << "d_S["<<i<<"] = " << std::setprecision(15) << h_S[i] << std::endl;
    std::cout << "\nLeft singular vectors - For y = A * x, the columns of U span the space of y\n";
    for(int j = 0; j < Nrows; j++) {
        printf("\n");
        for(int i = 0; i < Nrows; i++)
            printf("U[%i,%i]=%f\n",i,j,h_U[j*Nrows + i]);
    }
    std::cout << "\nRight singular vectors - For y = A * x, the columns of V span the space of x\n";
    for(int i = 0; i < Ncols; i++) {
        printf("\n");
        for(int j = 0; j < Ncols; j++)
            printf("V[%i,%i]=%f\n",i,j,h_V[j*Ncols + i]);
    }
    // --- Releasing the solver handle and every device/host buffer
    cusolveSafeCall(cusolverDnDestroy(solver_handle));
    gpuErrchk(cudaFree(work));
    gpuErrchk(cudaFree(d_S));
    gpuErrchk(cudaFree(d_V));
    gpuErrchk(cudaFree(d_U));
    gpuErrchk(cudaFree(d_A));
    gpuErrchk(cudaFree(devInfo));
    free(h_S);
    free(h_V);
    free(h_U);
    free(h_A);
    return 0;
}
Utilities.cuh
// Declarations of the helper functions implemented in Utilities.cu.
// NOTE: this header includes nothing itself, so the including file must
// already have made cudaError_t and cusolverStatus_t visible (for example by
// including the CUDA runtime and cusolverDn.h headers first).
#ifndef UTILITIES_CUH
#define UTILITIES_CUH
// Integer division of the first argument by the second, rounded up when there
// is a remainder (see the definition in Utilities.cu).
extern "C" int iDivUp(int, int);
// Checks a CUDA runtime status code; on failure the implementation prints the
// error string and exits.
extern "C" void gpuErrchk(cudaError_t);
// Checks a cuSOLVER status code; on failure the implementation reports the
// error and terminates.
extern "C" void cusolveSafeCall(cusolverStatus_t);
#endif
Utilities.cu
#include <stdio.h>
#include <assert.h>
#include "cuda_runtime.h"
#include <cuda.h>
#include <cusolverDn.h>
/*******************/
/* iDivUp FUNCTION */
/*******************/
// Divides a by b and bumps the quotient by one whenever the division leaves a
// remainder. The caller must ensure b != 0.
extern "C" int iDivUp(int a, int b)
{
    const int quotient = a / b;
    if ((a % b) == 0) {
        return quotient;
    }
    return quotient + 1;
}
/********************/
/* CUDA ERROR CHECK */
/********************/
// --- Credit to http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
// Reports a failed CUDA runtime call.
//   code  - status returned by the CUDA runtime call being checked
//   file  - source file name to report (callers pass __FILE__, a string
//           literal, hence the pointer-to-const)
//   line  - source line number to report
//   abort - when true (the default) the process exits with `code` as status
// Does nothing when code == cudaSuccess.
void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) { exit(code); }
    }
}
// C-linkage entry point that forwards a CUDA status code to gpuAssert.
// __FILE__ and __LINE__ are expanded here, so the location reported on
// failure is this wrapper, not the call site.
extern "C" void gpuErrchk(cudaError_t ans)
{
    gpuAssert(ans, __FILE__, __LINE__);
}
/**************************/
/* CUSOLVE ERROR CHECKING */
/**************************/
// Maps a cuSOLVER status code to the name of its enumerator.
// Returns "<unknown>" for any value not listed below.
static const char *_cudaGetErrorEnum(cusolverStatus_t error)
{
    switch (error)
    {
        case CUSOLVER_STATUS_SUCCESS:
            // Previously reported as "CUSOLVER_SUCCESS", unlike every other label.
            return "CUSOLVER_STATUS_SUCCESS";
        case CUSOLVER_STATUS_NOT_INITIALIZED:
            return "CUSOLVER_STATUS_NOT_INITIALIZED";
        case CUSOLVER_STATUS_ALLOC_FAILED:
            return "CUSOLVER_STATUS_ALLOC_FAILED";
        case CUSOLVER_STATUS_INVALID_VALUE:
            return "CUSOLVER_STATUS_INVALID_VALUE";
        case CUSOLVER_STATUS_ARCH_MISMATCH:
            return "CUSOLVER_STATUS_ARCH_MISMATCH";
        case CUSOLVER_STATUS_MAPPING_ERROR:
            return "CUSOLVER_STATUS_MAPPING_ERROR";
        case CUSOLVER_STATUS_EXECUTION_FAILED:
            return "CUSOLVER_STATUS_EXECUTION_FAILED";
        case CUSOLVER_STATUS_INTERNAL_ERROR:
            return "CUSOLVER_STATUS_INTERNAL_ERROR";
        case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
            return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
        case CUSOLVER_STATUS_NOT_SUPPORTED:
            return "CUSOLVER_STATUS_NOT_SUPPORTED";
        case CUSOLVER_STATUS_ZERO_PIVOT:
            return "CUSOLVER_STATUS_ZERO_PIVOT";
        case CUSOLVER_STATUS_INVALID_LICENSE:
            return "CUSOLVER_STATUS_INVALID_LICENSE";
        default:
            break;
    }
    return "<unknown>";
}
// Reports a failed cuSOLVER call and terminates the program.
//   err  - status returned by the cuSOLVER call being checked
//   file - source file name to report
//   line - source line number to report
// Does nothing when err == CUSOLVER_STATUS_SUCCESS.
inline void __cusolveSafeCall(cusolverStatus_t err, const char *file, const int line)
{
    if (CUSOLVER_STATUS_SUCCESS != err) {
        // The format string must have exactly one conversion per argument; the
        // previous version had a spare "%s" that consumed the integer status
        // code as a string pointer. The location comes from the parameters
        // rather than from __FILE__/__LINE__ of this helper.
        fprintf(stderr, "CUSOLVE error in file '%s', line %d\nerror %d: %s\nterminating!\n",
                file, line, (int)err, _cudaGetErrorEnum(err));
        cudaDeviceReset();
        assert(0);
        // assert() compiles to nothing under NDEBUG; make sure we still stop.
        exit(1);
    }
}
// C-linkage entry point that forwards a cuSOLVER status code to
// __cusolveSafeCall, passing this wrapper's own __FILE__/__LINE__.
extern "C" void cusolveSafeCall(cusolverStatus_t err)
{
    __cusolveSafeCall(err, __FILE__, __LINE__);
}
You can take a look at the Batched Operations post of the CULA blog for a discussion of your problem.
EDIT
From what I understand from your comment below, you would like each thread to calculate a separate SVD. So, basically each thread should execute a standard, sequential SVD scheme. For that some possibly useful references:
Numerical Recipes
Golub, Van Loan, Matrix Computations
If you use this approach, though, I'm afraid you will no longer be able to use cuBLAS, since its routines are host functions that are not callable from the device (unless you have a compute capability >= 3.5; see the simpleDevLibCUBLAS example). But basically, in this way I think you are implementing the batch concept by yourself.
If you decide to go to a more standard parallel GPU implementation, the reference below could be of interest:
Singular Value Decomposition on GPU using CUDA
The above answers are now out of date. As of CUDA 9.0, the cuSOLVER library has been equipped with a batched SVD calculation based on the Jacobi method. Below, a fully worked example:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cusolverDn.h>
#include "Utilities.cuh"
#include "TimingGPU.cuh"
//#define FULLSVD
//#define PRINTRESULTS
/********/
/* MAIN */
/********/
/*
 * Batched SVD of numMatrices small M x N matrices with cuSOLVER's
 * Jacobi-based gesvdjBatched routine.
 *
 * Compile-time switches:
 *   FULLSVD      - also compute and copy back the singular vectors U and V
 *   PRINTRESULTS - print the singular values (and vectors, with FULLSVD)
 *
 * All matrices are column-major; matrix k of a batch starts at offset
 * k * (leading dimension * number of columns).
 */
int main() {

    const int M = 3;
    const int N = 3;
    const int lda = M;                      // leading dimension of each A (>= M)
    const int ldu = M;                      // leading dimension of each U (>= M)
    const int ldv = N;                      // leading dimension of each V (>= N)
    const int minMN = (M < N) ? M : N;      // singular values per matrix
    //const int numMatrices = 3;
    const int numMatrices = 16384;

    TimingGPU timerGPU;

    // --- Setting the host matrix
    double *h_A = (double *)malloc(lda * N * numMatrices * sizeof(double));
    if (h_A == NULL) { fprintf(stderr, "Host allocation of h_A failed\n"); return EXIT_FAILURE; }
    for (int k = 0; k < numMatrices; k++)
        for (int i = 0; i < M; i++){
            for (int j = 0; j < N; j++){
                h_A[k * M * N + j * M + i] = (1. / (k + 1)) * (i + j * j) * (i + j);
                //printf("%d %d %f\n", i, j, h_A[j*M + i]);
            }
        }

    // --- Setting the device matrix and moving the host matrix to the device
    double *d_A; gpuErrchk(cudaMalloc(&d_A, M * N * numMatrices * sizeof(double)));
    gpuErrchk(cudaMemcpy(d_A, h_A, M * N * numMatrices * sizeof(double), cudaMemcpyHostToDevice));

    // --- host side SVD results space
    double *h_S = (double *)malloc(minMN * numMatrices * sizeof(double));
    if (h_S == NULL) { fprintf(stderr, "Host allocation of h_S failed\n"); return EXIT_FAILURE; }
    double *h_U = NULL;
    double *h_V = NULL;
#ifdef FULLSVD
    h_U = (double *)malloc(ldu * M * numMatrices * sizeof(double));
    h_V = (double *)malloc(ldv * N * numMatrices * sizeof(double));
    if (h_U == NULL || h_V == NULL) { fprintf(stderr, "Host allocation of h_U/h_V failed\n"); return EXIT_FAILURE; }
#endif

    // --- device side SVD workspace and matrices
    int work_size = 0;

    // gesvdjBatched writes ONE status integer PER MATRIX, so the info array
    // needs numMatrices entries; a single int is overrun by the solver.
    int *devInfo; gpuErrchk(cudaMalloc(&devInfo, numMatrices * sizeof(int)));
    int *h_info = (int *)malloc(numMatrices * sizeof(int));    /* host copy of devInfo */
    if (h_info == NULL) { fprintf(stderr, "Host allocation of h_info failed\n"); return EXIT_FAILURE; }

    double *d_S; gpuErrchk(cudaMalloc(&d_S, minMN * numMatrices * sizeof(double)));
    double *d_U = NULL;
    double *d_V = NULL;
#ifdef FULLSVD
    gpuErrchk(cudaMalloc(&d_U, ldu * M * numMatrices * sizeof(double)));
    gpuErrchk(cudaMalloc(&d_V, ldv * N * numMatrices * sizeof(double)));
#endif

    double *d_work = NULL; /* device workspace for gesvdj */

    // --- Parameters configuration of Jacobi-based SVD
    const double tol = 1.e-7;
    const int maxSweeps = 15;
    cusolverEigMode_t jobz; // --- CUSOLVER_EIG_MODE_VECTOR - Compute eigenvectors; CUSOLVER_EIG_MODE_NOVECTOR - Compute singular values only
#ifdef FULLSVD
    jobz = CUSOLVER_EIG_MODE_VECTOR;
#else
    jobz = CUSOLVER_EIG_MODE_NOVECTOR;
#endif

    const int econ = 0; // --- econ = 1 for economy size (only used when printing vectors)
    (void)econ;

    // --- CUDA solver initialization
    cusolverDnHandle_t solver_handle = NULL;
    cusolveSafeCall(cusolverDnCreate(&solver_handle));

    // --- Configuration of gesvdj
    gesvdjInfo_t gesvdj_params = NULL;
    cusolveSafeCall(cusolverDnCreateGesvdjInfo(&gesvdj_params));

    // --- Set the computation tolerance, since the default tolerance is machine precision
    cusolveSafeCall(cusolverDnXgesvdjSetTolerance(gesvdj_params, tol));

    // --- Set the maximum number of sweeps, since the default value of max. sweeps is 100
    cusolveSafeCall(cusolverDnXgesvdjSetMaxSweeps(gesvdj_params, maxSweeps));

    // --- Query the SVD workspace
    cusolveSafeCall(cusolverDnDgesvdjBatched_bufferSize(
        solver_handle,
        jobz,                                       // --- Compute the singular vectors or not
        M,                                          // --- Number of rows of A, 0 <= M
        N,                                          // --- Number of columns of A, 0 <= N
        d_A,                                        // --- M x N
        lda,                                        // --- Leading dimension of A
        d_S,                                        // --- Vector of min(M, N) singular values per matrix
        d_U,                                        // --- M x M
        ldu,                                        // --- Leading dimension of U, ldu >= max(1, M)
        d_V,                                        // --- N x N
        ldv,                                        // --- Leading dimension of V, ldv >= max(1, N)
        &work_size,
        gesvdj_params,
        numMatrices));

    gpuErrchk(cudaMalloc(&d_work, sizeof(double) * work_size));

    // --- Compute SVD
    timerGPU.StartCounter();
    cusolveSafeCall(cusolverDnDgesvdjBatched(
        solver_handle,
        jobz,                                       // --- Compute the singular vectors or not
        M,                                          // --- Number of rows of A, 0 <= M
        N,                                          // --- Number of columns of A, 0 <= N
        d_A,                                        // --- M x N
        lda,                                        // --- Leading dimension of A
        d_S,                                        // --- Vector of min(M, N) singular values per matrix
        d_U,                                        // --- M x M
        ldu,                                        // --- Leading dimension of U, ldu >= max(1, M)
        d_V,                                        // --- N x N
        ldv,                                        // --- Leading dimension of V, ldv >= max(1, N)
        d_work,
        work_size,
        devInfo,                                    // --- numMatrices status integers
        gesvdj_params,
        numMatrices));

    printf("Calculation of the singular values only: %f ms\n\n", timerGPU.GetCounter());

    gpuErrchk(cudaMemcpy(h_info, devInfo, sizeof(int) * numMatrices, cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_S, d_S, sizeof(double) * minMN * numMatrices, cudaMemcpyDeviceToHost));
#ifdef FULLSVD
    gpuErrchk(cudaMemcpy(h_U, d_U, sizeof(double) * ldu * M * numMatrices, cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_V, d_V, sizeof(double) * ldv * N * numMatrices, cudaMemcpyDeviceToHost));
#endif

#ifdef PRINTRESULTS
    printf("SINGULAR VALUES \n");
    printf("_______________ \n");
    for (int k = 0; k < numMatrices; k++) {
        for (int p = 0; p < minMN; p++)
            printf("Matrix nr. %d; SV nr. %d; Value = %f\n", k, p, h_S[k * minMN + p]);
        printf("\n");
    }
#ifdef FULLSVD
    printf("SINGULAR VECTORS U \n");
    printf("__________________ \n");
    for (int k = 0; k < numMatrices; k++) {
        for (int q = 0; q < (1 - econ) * M + econ * min(M, N); q++)
            for (int p = 0; p < M; p++)
                printf("Matrix nr. %d; U nr. %d; Value = %f\n", k, p, h_U[((1 - econ) * M + econ * min(M, N)) * M * k + q * M + p]);
        printf("\n");
    }

    printf("SINGULAR VECTORS V \n");
    printf("__________________ \n");
    for (int k = 0; k < numMatrices; k++) {
        for (int q = 0; q < (1 - econ) * N + econ * min(M, N); q++)
            for (int p = 0; p < N; p++)
                printf("Matrix nr. %d; V nr. %d; Value = %f\n", k, p, h_V[((1 - econ) * N + econ * min(M, N)) * N * k + q * N + p]);
        printf("\n");
    }
#endif
#endif

    // --- Inspect the per-matrix status codes: 0 = converged, < 0 = bad
    // --- parameter, > 0 = no convergence within tol / maxSweeps.
    int numNotConverged = 0;
    for (int k = 0; k < numMatrices; k++) {
        const int devInfo_h = h_info[k];
        if (0 > devInfo_h){
            printf("%d-th parameter is wrong \n", -devInfo_h);
            exit(1);
        }
        else if (0 < devInfo_h){
            printf("WARNING: matrix %d: devInfo_h = %d : gesvdj does not converge \n", k, devInfo_h);
            numNotConverged++;
        }
    }
    if (0 == numNotConverged){
        printf("gesvdj converges \n");
    }

    // --- Free resources
    if (d_A) gpuErrchk(cudaFree(d_A));
    if (d_S) gpuErrchk(cudaFree(d_S));
#ifdef FULLSVD
    if (d_U) gpuErrchk(cudaFree(d_U));
    if (d_V) gpuErrchk(cudaFree(d_V));
#endif
    if (devInfo) gpuErrchk(cudaFree(devInfo));
    if (d_work) gpuErrchk(cudaFree(d_work));
    if (solver_handle) cusolveSafeCall(cusolverDnDestroy(solver_handle));
    if (gesvdj_params) cusolveSafeCall(cusolverDnDestroyGesvdjInfo(gesvdj_params));

    // free(NULL) is a no-op, so h_U / h_V are safe to free without FULLSVD.
    free(h_info);
    free(h_V);
    free(h_U);
    free(h_S);
    free(h_A);

    gpuErrchk(cudaDeviceReset());

    return 0;
}

Cannot read out Values from Texture Memory

Hi, I'm writing a simple program to practice working with texture memory. I just want to write my data into texture memory and read it back into global memory, but I cannot read out the values. Here is the code.
#include <stdio.h>
#include <iostream>
#include "cuda.h"
#include <stdlib.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "HelloWorld.h"
#include "linearInterpolation_kernel4.cu"
using namespace std;
using std::cout;
const int blocksize = 16;
// Element-wise update: each thread adds b[t] to a[t], where t is the thread's
// x index within its block. There is no bounds check and the block index is
// not used, so a and b must hold at least blockDim.x elements.
__global__
void hello(char *a, int *b)
{
    const unsigned int t = threadIdx.x;
    a[t] += b[t];
}
////////////////////////////////////////////////////////////////////////////////
// These are CUDA Helper functions
// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)
// Prints a diagnostic (to stdout) when a CUDA runtime call did not succeed.
// The error is only reported: execution continues afterwards.
//   err  - status returned by the runtime call
//   file - source file of the call site (supplied by the macro)
//   line - source line of the call site (supplied by the macro)
inline void __checkCudaErrors( cudaError err, const char *file, const int line )
{
    if (err == cudaSuccess) {
        return;
    }
    printf("%s(%i) : CUDA Runtime API error %d: %s.\n",file, line, (int)err, cudaGetErrorString( err ) );
}
// This will output the proper error string when calling cudaGetLastError
#define getLastCudaError(msg) __getLastCudaError (msg, __FILE__, __LINE__)
// Fetches the runtime's last error via cudaGetLastError() and, if it is not
// cudaSuccess, prints it (to stdout) together with the caller's message.
// The error is only reported: execution continues afterwards.
//   errorMessage - caller-supplied context string included in the output
//   file, line   - call-site location (supplied by the macro)
inline void __getLastCudaError( const char *errorMessage, const char *file, const int line )
{
    const cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) {
        return;
    }
    printf("%s(%i) : getLastCudaError() CUDA error : %s : (%d) %s.\n", file, line, errorMessage, (int)status, cudaGetErrorString( status ) );
}
// Builds two small ramps A[i] = i and B[i] = i + 1, runs them through the
// texture round trip in ipLinearTexture2, then prints element 4 of the GPU
// result next to a CPU-side linear interpolation of A[4] and B[4].
// Returns 0 on success, 1 if a host allocation fails.
int main()
{
    const int N = 40;
    float *A      = (float *) malloc(N*sizeof(float));
    float *B      = (float *) malloc(N*sizeof(float));
    float *result = (float *) malloc(N*sizeof(float));
    if (A == NULL || B == NULL || result == NULL) {
        printf("Host memory allocation failed\n");
        free(A);        // free(NULL) is a no-op
        free(B);
        free(result);
        return 1;
    }

    float angle = 0.8f;
    for(int i = 0; i < N; i++){
        A[i] = i; //(float)rand();
        B[i] = i+1; //(float)rand();
    }

    ipLinearTexture2(A,B,result,angle,N);

    // CPU reference: linear interpolation between A[4] and B[4].
    float result2;
    result2 = (angle)*A[4] + (1-angle)*B[4];
    printf(" A %f B %f Result %f\n", A[4], B[4], result[4]);
    cout << result2 << endl;

    free(A);
    free(B);
    free(result);

    // 0 signals success to the shell; the previous "return 1" reported failure.
    return 0;
}
// Uploads A and B into a 2-row, N-column CUDA array, binds it to the texture
// reference tex2, samples it with transformKernel4 and copies N floats back.
//   A, B   - host input arrays of N floats each
//   result - host output array of N floats
//   angle  - forwarded to the kernel as its theta argument
//   N      - number of elements per input array; nothing is done if N <= 0
// The kernel is launched on a 1D grid, so only texture row 0 (the A row) is
// sampled and written to result.
void ipLinearTexture2(float *A, float* B, float* result, float angle, int N)
{
    if (N <= 0) return;

    const size_t rowBytes = (size_t)N * sizeof(float);

    // cudaMemcpy2DToArray needs ONE contiguous host buffer; an array of row
    // pointers (float**) would upload the pointer values instead of the data.
    // Layout: row 0 = A[0..N-1], row 1 = B[0..N-1].
    float *AB = (float *) malloc(2 * rowBytes);
    if (AB == NULL) {
        printf("ipLinearTexture2: host allocation failed\n");
        return;
    }
    for (int i = 0; i < N; i++)
    {
        AB[i]     = A[i];
        AB[N + i] = B[i];
    }

    float *dev_result = NULL;
    checkCudaErrors(cudaMalloc(&dev_result, rowBytes));

    //cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
    cudaArray* cu_array = NULL;
    checkCudaErrors(cudaMallocArray( &cu_array, &channelDesc, N, 2));

    // Source pitch and copy width are both one row of N floats; height is 2 rows.
    checkCudaErrors(cudaMemcpy2DToArray(cu_array, 0, 0, AB, rowBytes, rowBytes, 2, cudaMemcpyHostToDevice));
    free(AB);   // the copy above is blocking, so the staging buffer can go

    // set texture parameters
    tex2.normalized = true;
    tex2.filterMode = cudaFilterModeLinear;
    tex2.addressMode[0] = cudaAddressModeWrap; //cudaAddressModeWrap;
    tex2.addressMode[1] = cudaAddressModeWrap; //cudaAddressModeClamp;
    checkCudaErrors(cudaBindTextureToArray( tex2, cu_array, channelDesc));

    // 1D launch with enough threads to cover N columns; the kernel bounds-checks.
    const int threadsPerBlock = 256;
    const int numBlocks = (N + threadsPerBlock - 1) / threadsPerBlock;
    transformKernel4<<< numBlocks, threadsPerBlock, 0 >>>( dev_result, N, 2, angle);
    getLastCudaError("transformKernel4 launch failed");

    // Blocking copy: also waits for the kernel to finish.
    checkCudaErrors(cudaMemcpy(result, dev_result, rowBytes, cudaMemcpyDeviceToHost));

    cout << "==================================================" << endl;
    for (int i = 0 ; i < N ;i++)
    {
        cout << result[i] << " on " << i << endl;
    }
    cout << "==================================================" << endl;

    // Unbind the texture that was actually bound (tex2).
    checkCudaErrors(cudaUnbindTexture(tex2));
    checkCudaErrors(cudaFree(dev_result));
    checkCudaErrors(cudaFreeArray(cu_array));
}
and here is the kernel code
#ifndef _SIMPLETEXTURE_KERNEL5_H_
#define _SIMPLETEXTURE_KERNEL5_H_
// Texture references
texture<float, 2, cudaReadModeElementType> tex2;
// Samples the 2D texture tex2 at the centre of every texel of a
// width x height grid and stores the value at g_odata[yid * width + xid].
//   g_odata - device output buffer; must hold an element for every (xid, yid)
//             pair that the launch grid can reach inside width x height
//   width   - number of texels in x
//   height  - number of texels in y
//   theta   - unused
// Thread (xid, yid) comes from the 2D block/thread indices. Coordinates passed
// to tex2D are normalized ((index + 0.5) / extent).
__global__ void
transformKernel4(float* g_odata, int width, int height, float theta)
{
    // Reject non-positive extents first: the unsigned comparison below would
    // otherwise treat a negative extent as a huge positive bound.
    if (width <= 0 || height <= 0) return;

    unsigned int xid = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int yid = blockIdx.y * blockDim.y + threadIdx.y;
    if (xid >= (unsigned int)width || yid >= (unsigned int)height) return;

    float dx = 1.0f / (float)width;
    float dy = 1.0f / (float)height;
    float x = ((float)xid + 0.5f) * dx;
    float y = ((float)yid + 0.5f) * dy;
    float value = tex2D(tex2, x , y);

    // Debug output; %u matches the unsigned indices.
    printf("wert %f xid %u yid %u \n",value, xid, yid);
    g_odata[yid * width + xid] = value;
}
#endif // #ifndef _SIMPLETEXTURE_KERNEL_H_
Can somebody tell what i am doing wrong?
I have edited it to remove the first two logical mistakes. But why am I not able to print out my data?
The problem was the way the arrays were bound. In C, a multidimensional array built as an array of pointers is not contiguous in memory, so it cannot be copied in one call. You have to use a one-dimensional array that represents the multidimensional one.
I can see 2 logical errors here.
The first one is the one pointed out by #asm.
The output should be stored by calculating linear index from 2D x and y indices.
outputIndex = yid * width + xid;
The second one is that the memory allocation for the cudaArray structure is internally aligned.
You should consider using cudaMemcpy2DToArray function to avoid erroneous data copying.
cudaMemcpy2DToArray(cu_array,0,0,AB,N * sizeof(float), N * sizeof(float), 2, cudaMemcpyHostToDevice);