Cublas Thrust Segmentation Fault - cuda

I am new to CUDA programming. I was working on a sample code which multiplies a matrix with a vector and prints the results. I am using Cublas Dgemv API for doing the multiplication. On running the program using cuda-memcheck I get the following error,
Error: process didn't terminate successfully
========= The application may have hit an error when dereferencing Unified Memory from the host. Please rerun the application under cuda-gdb or Nsight Eclipse Edition to catch host side errors.
========= Internal error (20)
========= No CUDA-MEMCHECK results found
The minimal complete code is here,
#include <thrust/device_vector.h>
#include <cublas_v2.h>
#include <iostream>
// Demo driver: fills a 3x6 matrix and a 6-element vector held in thrust device
// vectors, calls cublasDgemv on them, and prints the 3-element result F.
int main(void)
{
int rowDimension = 3; // number of rows
int columnDimension = 6; // number of columns
// initialize data
thrust::device_vector<double> weightMatrix;
weightMatrix.resize(rowDimension * columnDimension);
thrust::device_vector<double> inputVector;
inputVector.resize(columnDimension);
thrust::device_vector<double> F;
F.resize(rowDimension);
// Element (i, j) is stored at j * rowDimension + i, i.e. column after column,
// which agrees with the leading dimension (rowDimension) passed to cublasDgemv below.
for (size_t i = 0; i < rowDimension; i++)
for (size_t j = 0; j < columnDimension; j++)
weightMatrix[j * rowDimension + i]=i;
for (size_t j = 0; j < columnDimension; j++)
inputVector[j] = j;
for (size_t i = 0; i < rowDimension; i++)
F[i]=0;
cublasHandle_t handle;
/* Initialize CUBLAS */
cublasStatus_t status = cublasCreate(&handle);
// A failed initialization is only reported; execution continues regardless.
if (status != CUBLAS_STATUS_SUCCESS)
std::cerr << "!!!! CUBLAS initialization error\n";
// Float literal, converted to double by the initialization.
double alpha = 1.0f;
// cudaDeviceSynchronize();
// NOTE: the tenth argument (beta) is the literal 0, i.e. a null pointer.
// cublasDgemv declares beta as `const double *`, so it expects the address
// of a double holding the scaling factor, just like &alpha.
status = cublasDgemv(handle, CUBLAS_OP_N, rowDimension, columnDimension, &alpha, thrust::raw_pointer_cast(weightMatrix.data()), rowDimension,
thrust::raw_pointer_cast(inputVector.data()), 1, 0, thrust::raw_pointer_cast(F.data()), 1) ;;
// cudaDeviceSynchronize();
if (status != CUBLAS_STATUS_SUCCESS)
std::cerr << "!!!! kernel execution error.\n";
// Each F[j] read goes through thrust's device_vector element access.
for (size_t j = 0; j < rowDimension; j++)
std::cout << F[j] << " ";
status = cublasDestroy(handle);
if (status != CUBLAS_STATUS_SUCCESS)
std::cerr << "!!!! shutdown error (A)\n";
return 0;
}
The above program produces a segmentation fault at the cublasDgemv call. On running cuda-memcheck I get the message reported above. Googling did not turn up much help.
Can someone please help me resolve this issue.

Have a look at the documentation of cublasDgemv.
The signature is:
cublasDgemv(cublasHandle_t handle,
cublasOperation_t trans,
int m,
int n,
const double *alpha,
const double *A,
int lda,
const double *x,
int incx,
const double *beta,
double *y,
int incy)
beta has to be supplied as a pointer. But you pass a NULL pointer to it instead of a pointer pointing to the value 0.
So the following will fix your problem:
// cublasDgemv takes both scalars as `const double *`, so alpha and beta are
// kept in variables and passed by address.
double alpha = 1.0;
double beta = 0;
status = cublasDgemv(handle,
CUBLAS_OP_N,
rowDimension,
columnDimension,
&alpha,
thrust::raw_pointer_cast(weightMatrix.data()),
rowDimension,
thrust::raw_pointer_cast(inputVector.data()),
1,
&beta, // note the change here!
thrust::raw_pointer_cast(F.data()),
1);

Related

sum vectors values with cuda C++

I am trying to sum the values of many vectors using CUDA C++. I found a solution for two vectors. As you can see, it can only add two vectors, but I want to generate the vectors dynamically, all with the same length.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
// Element-wise sum kernel: c[k] = a[k] + b[k] for every k in [0, n).
// One thread handles one element; threads whose index lands at or past n exit
// without touching memory.
__global__ void vecAdd(double *a, double *b, double *c, int n)
{
    // Flat index of this thread across the whole 1D grid
    const int k = threadIdx.x + blockDim.x * blockIdx.x;
    // Surplus threads in the last block have nothing to do
    if (k >= n)
        return;
    c[k] = a[k] + b[k];
}
// Host driver: fills two n-element vectors with sin^2(i) and cos^2(i), adds
// them on the GPU with vecAdd, and prints the mean of the sums, which should
// equal 1 within floating-point error.
// Returns 0 on success and 1 if a host allocation or any CUDA call fails.
int main( int argc, char* argv[] )
{
    // Size of vectors
    int n = 100000;
    // Size, in bytes, of each vector
    size_t bytes = n*sizeof(double);

    // Host input vectors and host output vector
    double *h_a = (double*)malloc(bytes);
    double *h_b = (double*)malloc(bytes);
    double *h_c = (double*)malloc(bytes);
    // Device input vectors and device output vector; they start as NULL so
    // the shared cleanup path below can free them unconditionally
    double *d_a = NULL;
    double *d_b = NULL;
    double *d_c = NULL;

    int status = 1;
    if (h_a == NULL || h_b == NULL || h_c == NULL) {
        fprintf(stderr, "host allocation failed\n");
    } else {
        // Initialize vectors on host
        for (int i = 0; i < n; i++) {
            h_a[i] = sin(i)*sin(i);
            h_b[i] = cos(i)*cos(i);
        }

        // Allocate memory for each vector on GPU; stop at the first failure
        cudaError_t err = cudaMalloc(&d_a, bytes);
        if (err == cudaSuccess) err = cudaMalloc(&d_b, bytes);
        if (err == cudaSuccess) err = cudaMalloc(&d_c, bytes);

        // Copy host vectors to device
        if (err == cudaSuccess) err = cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice);
        if (err == cudaSuccess) err = cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice);

        if (err == cudaSuccess) {
            // Number of threads in each thread block
            int blockSize = 1024;
            // Number of thread blocks in grid: integer ceiling of n / blockSize
            int gridSize = (n + blockSize - 1) / blockSize;
            // Execute the kernel
            vecAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);
            // A launch does not return a status; query it explicitly
            err = cudaGetLastError();
        }

        // Copy array back to host (blocks until the kernel has finished)
        if (err == cudaSuccess) err = cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost);

        if (err == cudaSuccess) {
            // Sum up vector c and print the result divided by n; this should
            // equal 1 within error
            double sum = 0;
            for (int i = 0; i < n; i++)
                sum += h_c[i];
            printf("final result: %f\n", sum/n);
            status = 0;
        } else {
            fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        }
    }

    // Release device memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    // Release host memory
    free(h_a);
    free(h_b);
    free(h_c);
    return status;
}
Is there a way to do this for many vectors? My vectors size are:
#vector length
N = 1000
#number of vectors
i = 300000
v[i] = [1,2,..., N]
As result i need to get:
out[i]= [sum(v[1]), sum(v[2]),..., sum(v[i])]
Thanks for any advice.
Summing multiple vectors together in a fashion similar to the code you have shown (i.e. generating elementwise sums) is equivalent to summing the columns of a matrix. And this idea represents a sensible way to realize the solution.
We will treat your vectors as a matrix, where each vector is a row in the matrix. The CUDA kernel will assign one thread to each column, and will sum the elements of that column, producing a single number result. That single number result will become one element of the vector result of the entire problem.
Here is a fully worked example demonstrating one possible approach:
$ cat t2.cu
#include <iostream>
typedef double mt;
const int nTPB = 64;
// Column-sum kernel. `matrix` holds n_vectors rows of vector_length elements
// each, stored row after row; sums[c] receives the sum of column c over all rows.
// Launch layout: 1D grid with one thread per column; threads whose index is at
// or past vector_length do nothing. No shared memory is used.
template <typename T>
__global__ void column_sum(T *matrix, T *sums, unsigned n_vectors, unsigned vector_length){
    unsigned idx = threadIdx.x+blockDim.x*blockIdx.x;
    if (idx < vector_length){
        T temp = 0;
        for (unsigned i = 0; i < n_vectors; i++)
            // Widen before multiplying: in 32-bit unsigned arithmetic
            // i*vector_length wraps once the matrix has 2^32 or more elements.
            temp += matrix[(size_t)i*vector_length+idx];
        sums[idx] = temp;
    }
}
// Host driver: builds nvec vectors of length vlen as one row-after-row matrix
// (every row is 0, 1, ..., vlen-1), sums the columns on the GPU with
// column_sum, and checks that column i sums to nvec*i.
// Prints the CUDA status string ("no error" on success) or the first mismatch.
// Returns 0 on success and -1 on a CUDA failure or a mismatch.
int main(){
    const unsigned vlen = 1000;
    const unsigned nvec = 300000;
    // Element count in size_t so vlen*nvec cannot wrap in 32-bit arithmetic
    const size_t total = (size_t)vlen * nvec;
    // create the desired number of vectors as a single matrix
    mt *h_sums = new mt[vlen];
    mt *h_matrix = new mt[total];
    // NULL so the cleanup below is valid even when an allocation failed
    mt *d_matrix = NULL, *d_sums = NULL;
    cudaError_t err = cudaMalloc(&d_matrix, total*sizeof(mt));
    if (err == cudaSuccess) err = cudaMalloc(&d_sums, vlen*sizeof(mt));
    size_t count = 0;
    for (unsigned i = 0; i < nvec; i++)
        for (unsigned j = 0; j < vlen; j++)
            h_matrix[count++] = j;
    if (err == cudaSuccess) err = cudaMemcpy(d_matrix, h_matrix, total*sizeof(mt), cudaMemcpyHostToDevice);
    if (err == cudaSuccess) {
        column_sum<<<(vlen+nTPB-1)/nTPB,nTPB>>>(d_matrix, d_sums, nvec, vlen);
        // A launch does not return a status; query it explicitly
        err = cudaGetLastError();
    }
    if (err == cudaSuccess) err = cudaMemcpy(h_sums, d_sums, vlen*sizeof(mt), cudaMemcpyDeviceToHost);

    int rc = 0;
    if (err != cudaSuccess) {
        // h_sums was never filled, so comparing it would report a misleading mismatch
        rc = -1;
    } else {
        for (unsigned i = 0; i < vlen; i++)
            if (h_sums[i] != ((mt)nvec)*i) {
                std::cout << " mismatch at " << i << " was: " << h_sums[i] << " should be: " << ((mt)nvec)*i << std::endl;
                rc = -1;
                break;
            }
    }
    // Report the CUDA status on success ("no error") and on any CUDA failure
    if (err != cudaSuccess || rc == 0)
        std::cout << cudaGetErrorString(err) << std::endl;

    cudaFree(d_matrix);
    cudaFree(d_sums);
    delete[] h_matrix;
    delete[] h_sums;
    return rc;
}
$ nvcc -o t2 t2.cu
$ cuda-memcheck ./t2
========= CUDA-MEMCHECK
no error
========= ERROR SUMMARY: 0 errors
$
Note that this methodology only creates as many threads on the GPU as there are vector elements (1000 in the above example). 1000 threads would be enough to keep only the smallest GPUs busy. However this algorithm will be efficient on most GPUs if your vector length is 10,000 or longer. If you'd like to explore creating more efficient algorithms for small problem sizes, you can study the idea of a classical parallel reduction.

CUDA_SAFE_CALL: an illegal memory access was encountered

I am trying to do simple matrix multiplication on CUDA. I know arrays can be flattened for passing to the device. However, I am using cudaMallocPitch and cudaMemcpy2D to do the multiplication. While executing the code below I get the error "an illegal memory access was encountered" when I try to copy the result onto the host. I would highly appreciate any advice on where I am going wrong. Thanks!
weights-first matrix,dim:30x784
input- second matrix,dim:784x100
results_d - result on the device(GPU)
result - result copied on the host
#include <stdio.h>
#include <math.h>
#include <cstdio>
#include <cstdlib>
#define CUDA_SAFE_CALL(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Reports a failed CUDA runtime call on stderr and optionally terminates.
//   code  - status returned by the CUDA call being checked
//   file  - source file of the call site (CUDA_SAFE_CALL passes __FILE__)
//   line  - source line of the call site (CUDA_SAFE_CALL passes __LINE__)
//   abort - when true (the default) the process exits with `code` on failure
// Does nothing when code == cudaSuccess.
// `file` is const char* because __FILE__ is a string literal, which must not
// bind to a non-const char* in C++.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"CUDA_SAFE_CALL: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
// Kernel exactly as posted in the question: each thread takes threadIdx.x as
// `row` and threadIdx.y as `col` and writes one value into the pitched result
// buffer. The NOTE comments mark the constructs the answer below calls out.
__global__ void MatrixMulKernel(double *input,double *weights,double *results_d,size_t in_pitch,size_t w1_pitch,size_t result_pitch)
{
int row = threadIdx.x;
int col= threadIdx.y;
// NOTE: not initialized before the loop below.
double value;
double *result_matrix;
// NOTE: `col` is added while the pointer is still char*, so it acts as a byte
// offset here; element1/element2 below add their index after the double* cast.
result_matrix = ((double*)((char*)results_d + row*result_pitch + col));
// NOTE: threadIdx is a three-component built-in, not an int, yet is printed with %d.
printf("%d",threadIdx);
// NOTE: the trip count is in_pitch, the pitch value the host obtained from cudaMallocPitch.
for(int i =0 ; i < in_pitch ; i++)
{
double *element1 = ((double*)((char*)input + row*in_pitch) + i) ;
double *element2 = ((double*)((char*)weights + i*w1_pitch) + col);
// NOTE: `=+` is an assignment of a unary-plus expression, so every iteration
// overwrites value instead of accumulating into it.
value =+ (*element1) * (*element2);
}
*result_matrix = value;
}
// Host driver exactly as posted in the question: fills arr1 (30x784) with 5
// and arr2 (784x100) with 3, copies them into pitched device buffers, launches
// MatrixMulKernel once, and copies/prints the result.
int main()
{
static double arr1[30][784];
static double arr2[784][100];
static double result[30][100];
for (int i = 0 ; i < 30; i++)
{
for(int j =0;j <784 ; j ++)
arr1[i][j] = 5;
}
for (int i =0 ; i < 784; i ++)
{
for(int j=0;j < 100 ; j++)
arr2[i][j] = 3;
}
double *input;
double *weights;
double *results_d;
size_t in_pitch,w1_pitch,result_pitch;
//allocating memory in GPU for 2 inputs and result
// input:     784 rows of 100 doubles (receives arr2)
// weights:   30 rows of 784 doubles  (receives arr1)
// results_d: 30 rows of 100 doubles
CUDA_SAFE_CALL(cudaMallocPitch((void**)&input,&in_pitch,100*sizeof(double),784));
CUDA_SAFE_CALL(cudaMallocPitch((void**)&weights,&w1_pitch,784*sizeof(double),30));
CUDA_SAFE_CALL(cudaMallocPitch((void**)&results_d,&result_pitch,100*sizeof(double),30));
//Copy matrix from host to device
CUDA_SAFE_CALL(cudaMemcpy2D(input,in_pitch,arr2,100*sizeof(double),100*sizeof(double),784,cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy2D(weights,w1_pitch,arr1,784*sizeof(double),784*sizeof(double),30,cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy2D(results_d,result_pitch,result,100*sizeof(double),100*sizeof(double),30,cudaMemcpyHostToDevice));
//using GPU
// A single block of 32x32 threads is launched; no matrix dimensions are passed to the kernel.
dim3 dimGrid(1,1,1);
dim3 dimBlock(32,32,1);
printf("before kernel fucntion");
MatrixMulKernel<<<dimGrid, dimBlock>>>(input, weights,results_d,in_pitch,w1_pitch,result_pitch);
printf("after kernel fucntion");
// The status returned by this synchronization is discarded.
cudaThreadSynchronize();
//copying back to host
// NOTE: the destination is the host array `result`, yet the destination pitch
// passed is result_pitch (the device pitch), while the source pitch passed for
// the device buffer is the packed host row size 100*sizeof(double).
CUDA_SAFE_CALL(cudaMemcpy2D(result,result_pitch,results_d,100*sizeof(double),100*sizeof(double),30,cudaMemcpyDeviceToHost));
//printing and seeing whether the result matrix has been updated
// NOTE: the printf below is handed the array `result` itself rather than an
// element, and the loops run 100 x 30 although result is declared [30][100].
for (int i =0 ; i < 100; i ++)
{
for(int j=0;j < 30 ; j++)
{
printf("%f",result);
}
printf("\n");
}
CUDA_SAFE_CALL(cudaFree(input));
CUDA_SAFE_CALL(cudaFree(weights));
CUDA_SAFE_CALL(cudaFree(results_d));
return 0;
}
There are a number of errors in this code. First of all, it's not clear that doing pitched allocations is going to give any benefit here. Second, if you're serious about wanting fast matrix multiply performance, you should use CUBLAS.
Issues:
You don't seem to understand pitched allocations. The pitch value returned is a value in bytes. You cannot sensibly use that for a loop index for matrix multiply. Also, the pitch value is the overall width of the pitch allocation. It does not correspond to the valid data area. For that, you should use the appropriate matrix dimension.
Your code will not do a matrix multiplication over the entire matrix area. You are only creating a single block of 32x32 threads, but you need enough blocks/threads to cover the entire matrix area. This requires changes to your grid dimensions, passing matrix dimensions to your kernel, as well as a "thread check" in your kernel to prevent out-of-bounds access.
This construct for pitched access is not correct:
result_matrix = ((double*)((char*)results_d + row*result_pitch + col));
it does not match the other constructions you have for the 2 input matrices, it has a misplaced close parenthesis.
You have the sense of your two input matrices reversed. You are indexing into the input matrix as if it were the weight matrix, and vice-versa. We need to swap the sense of row, column and i to make these match the actual matrix dimensions.
Your final cudaMemcpy2D operation has the pitch values reversed:
cudaMemcpy2D(result,result_pitch,results_d,100*sizeof(double),100*sizeof(double),30,cudaMemcpyDeviceToHost)
^^^^^ ^^^^^
You forgot to initialize to zero your loop sum variable:
double value;
I don't know what you intended here, it should be += not =+:
value =+ ...
The following code has these issues addressed, and seems to run without error for me:
$ cat t104.cu
#include <stdio.h>
#include <math.h>
#include <cstdio>
#include <cstdlib>
const int d1 = 30;
const int d2 = 784;
const int d3 = 100;
double arr1[d1][d2];
double arr2[d2][d3];
double result[d1][d3];
#define CUDA_SAFE_CALL(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Checks the status of a CUDA runtime call.
// On any status other than cudaSuccess it writes the error string together
// with the call site (file, line) to stderr and, unless `abort` is false,
// exits the process using the status as exit code.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    // Fast path: nothing to report
    if (code == cudaSuccess)
        return;

    fprintf(stderr,"CUDA_SAFE_CALL: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort)
        exit(code);
}
// Pitched matrix product: results_d = weights x input.
//   weights   - rrow rows of `dim` doubles, row stride w1_pitch bytes
//   input     - `dim` rows of rcol doubles, row stride in_pitch bytes
//   results_d - rrow rows of rcol doubles, row stride result_pitch bytes
// Launch layout: 2D grid, x covers result columns and y covers result rows;
// threads that fall outside the rrow x rcol result do nothing.
__global__ void MatrixMulKernel(double *input,double *weights,double *results_d,size_t in_pitch,size_t w1_pitch,size_t result_pitch, int dim, int rrow, int rcol)
{
    const int c = threadIdx.x + blockDim.x*blockIdx.x;
    const int r = threadIdx.y + blockDim.y*blockIdx.y;

    if ((r < rrow) && (c < rcol))
    {
        // Pitches are byte strides, so rows are located through char* arithmetic
        const double *w_row = (const double*)((const char*)weights + r*w1_pitch);
        double acc = 0;
        for (int k = 0; k < dim; ++k)
        {
            const double *in_row = (const double*)((const char*)input + k*in_pitch);
            acc += in_row[c] * w_row[k];
        }
        double *out_row = (double*)((char*)results_d + r*result_pitch);
        out_row[c] = acc;
    }
}
// Host driver: fills arr1 (d1 x d2) with 5 and arr2 (d2 x d3) with 3, copies
// them into pitched device buffers, computes arr1 x arr2 with MatrixMulKernel,
// copies the d1 x d3 result back and prints it.
// Every CUDA call, including the kernel launch, goes through CUDA_SAFE_CALL,
// which exits the process on the first failure.
int main()
{
    for (int i = 0 ; i < d1; i++)
    {
        for(int j =0;j <d2 ; j ++)
            arr1[i][j] = 5;
    }
    for (int i =0 ; i < d2; i ++)
    {
        for(int j=0;j < d3 ; j++)
            arr2[i][j] = 3;
    }
    double *input;
    double *weights;
    double *results_d;
    size_t in_pitch,w1_pitch,result_pitch;
    //allocating memory in GPU for 2 inputs and result
    CUDA_SAFE_CALL(cudaMallocPitch((void**)&input,&in_pitch,d3*sizeof(double),d2));
    CUDA_SAFE_CALL(cudaMallocPitch((void**)&weights,&w1_pitch,d2*sizeof(double),d1));
    CUDA_SAFE_CALL(cudaMallocPitch((void**)&results_d,&result_pitch,d3*sizeof(double),d1));
    //Copy matrix from host to device
    CUDA_SAFE_CALL(cudaMemcpy2D(input,in_pitch,arr2,d3*sizeof(double),d3*sizeof(double),d2,cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy2D(weights,w1_pitch,arr1,d2*sizeof(double),d2*sizeof(double),d1,cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy2D(results_d,result_pitch,result,d3*sizeof(double),d3*sizeof(double),d1,cudaMemcpyHostToDevice));
    //using GPU
    // Enough 32x32 blocks to cover d3 columns (x) by d1 rows (y)
    dim3 dimBlock(32,32,1);
    dim3 dimGrid(((d3+dimBlock.x-1)/dimBlock.x),((d1+dimBlock.y-1)/dimBlock.y),1);
    MatrixMulKernel<<<dimGrid, dimBlock>>>(input, weights,results_d,in_pitch,w1_pitch,result_pitch, d2, d1, d3);
    // A launch returns no status: pick up configuration errors first, then
    // wait for the kernel so execution errors are attributed to it
    CUDA_SAFE_CALL(cudaGetLastError());
    CUDA_SAFE_CALL(cudaDeviceSynchronize());
    //copying back to host
    CUDA_SAFE_CALL(cudaMemcpy2D(result,d3*sizeof(double),results_d,result_pitch,d3*sizeof(double),d1,cudaMemcpyDeviceToHost));
    //printing and seeing whether the result matrix has been updated
    // One output line per result column i, listing result[j][i] for every row j
    for (int i =0 ; i < d3; i ++)
    {
        for(int j=0;j < d1 ; j++)
        {
            printf("%f", result[j][i]);
        }
        printf("\n");
    }
    CUDA_SAFE_CALL(cudaFree(input));
    CUDA_SAFE_CALL(cudaFree(weights));
    CUDA_SAFE_CALL(cudaFree(results_d));
    return 0;
}
$ nvcc -arch=sm_61 -o t104 t104.cu
$

Is prefix scan CUDA sample code in gpugems3 correct?

I've written a piece of code to call the kernel in the book GPU Gems 3, Chapter 39: Parallel Prefix Sum (Scan) with CUDA.
However the results that I get are a bunch of negative numbers instead of prefix scan.
Is my kernel call wrong or is there something wrong with the code from the GPU Gems 3 book?
Here is my code:
#include <stdio.h>
#include <sys/time.h>
#include <cuda.h>
// Scan kernel exactly as posted in the question. Each thread loads two input
// elements into dynamic shared memory, the block then runs an up-the-tree and
// a down-the-tree phase over `temp`, and each thread writes two results.
// The parameter `dim` is not referenced anywhere in the body.
__global__ void kernel(int *g_odata, int *g_idata, int n, int dim)
{
extern __shared__ int temp[];// allocated on invocation
int thid = threadIdx.x;
int offset = 1;
temp[2*thid] = g_idata[2*thid]; // load input into shared memory
temp[2*thid+1] = g_idata[2*thid+1];
for (int d = n>>1; d > 0; d >>= 1) // build sum in place up the tree
{
__syncthreads();
if (thid < d)
{
int ai = offset*(2*thid+1)-1;
int bi = offset*(2*thid+2)-1;
// NOTE: this adds the original input g_idata[ai], not the partial sum
// temp[ai]; the answer below identifies this line as the transcription error.
temp[bi] += g_idata[ai];
}
offset *= 2;
}
if (thid == 0) { temp[n - 1] = 0; } // clear the last element
for (int d = 1; d < n; d *= 2) // traverse down tree & build scan
{
offset >>= 1;
__syncthreads();
if (thid < d)
{
int ai = offset*(2*thid+1)-1;
int bi = offset*(2*thid+2)-1;
int t = temp[ai];
temp[ai] = temp[bi];
temp[bi] += t;
}
}
__syncthreads();
g_odata[2*thid] = temp[2*thid]; // write results to device memory
g_odata[2*thid+1] = temp[2*thid+1];
}
// Fills h_in[0 .. num_items-1] with the sequence 0, 1, 2, ... and then prints
// the " input: " banner followed by two newlines (the values themselves are
// not printed).
void Initialize(int *h_in,int num_items)
{
    int *cursor = h_in;
    int value = 0;
    while (value < num_items)
    {
        *cursor++ = value;
        ++value;
    }
    printf(" input: " "\n\n");
}
// Host driver as posted in the question: builds a 512-element input
// (0, 1, 2, ...), runs `kernel` once on it and prints the first num_items outputs.
int main(int argc, char** argv)
{
int num_items = 512;
int* h_in = new int[num_items];
// Initialize problem
Initialize(h_in, num_items);
int *d_in = NULL;
// The status of this cudaMalloc is not checked; only the copy below is.
cudaMalloc((void**)&d_in, sizeof(int) * num_items);
if(cudaSuccess != cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice)) fprintf(stderr,"could not copy to gpu");
// Allocate device output array
// Holds num_items+1 ints, although only num_items of them are printed below.
int *d_out = NULL;
cudaMalloc((void**)&d_out, sizeof(int) * (num_items+1));
// One block of 256 threads (the kernel handles two elements per thread) with
// num_items ints of dynamic shared memory. The launch itself is not checked.
kernel<<<1,256,num_items*sizeof(int)>>>(d_out, d_in,num_items, 2);
int* h_out= new int[num_items+1];
if(cudaSuccess != cudaMemcpy(h_out,d_out,sizeof(int)*(num_items+1),cudaMemcpyDeviceToHost))fprintf(stderr,"could not copy back");
int i;
printf(" \n");
for(i=0;i<num_items;i++)
printf(" ,%d ",h_out[i]);
// Cleanup
if (h_in) delete[] h_in;
if (h_out) delete[] h_out;
if (d_in) cudaFree(d_in);
if (d_out) cudaFree(d_out);
printf("\n\n");
return 0;
}
It seems that you've made at least 1 error in transcribing the code from the GPU Gems 3 chapter into your kernel. This line is incorrect:
temp[bi] += g_idata[ai];
it should be:
temp[bi] += temp[ai];
When I make that one change to the code you have now posted, it seems to print out the correct (exclusive-scan) prefix sum for me. There's a few other things I would mention:
Even without that change, I get some results that are close to correct. So if you're getting widely different stuff (e.g. negative numbers) you may have a problem with your machine setup or CUDA install. I would suggest using more rigorous cuda error checking than what you have now (although a machine setup problem should have been indicated in one of your checks.)
The routine as crafted will have some limitations. It can only be used in a single threadblock, it will have bank conflicts on shared memory access, and it will be limited in data set size to what can be handled by a single threadblock (this routine produces two output elements per thread, so the data set size is expected to be equal to twice the number of threads). As has been already covered, the dynamic shared memory allocation needs to be as large as the data set size (ie. twice the thread size, in number of elements).
This may be useful for learning, but if you want a robust, fast prefix scan, you are advised to use a routine from thrust or cub instead of your own code, even if derived from this (old) article.
The following code is similar to yours, but it has the above issues fixed, and I have templated the kernel for use with various datatypes:
#include <stdio.h>
#define DSIZE 512
// Error-check macro: queries cudaGetLastError() and, when it is not
// cudaSuccess, prints `msg`, the CUDA error string and the file/line of the
// macro use to stderr, then exits with status 1.
// Wrapped in do { ... } while (0) so it behaves as a single statement.
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
typedef int mytype;
// Single-block exclusive prefix sum (scan) of n elements.
//   g_odata - output, n elements; g_odata[i] = g_idata[0] + ... + g_idata[i-1]
//   g_idata - input, n elements
//   n       - element count
// Launch requirements: exactly one block of n/2 threads (each thread loads and
// stores two elements) and n*sizeof(T) bytes of dynamic shared memory.
// NOTE(review): the halving/doubling tree loops look like they assume n is a
// power of two — confirm before using other sizes.
template <typename T>
__global__ void prescan(T *g_odata, T *g_idata, int n)
{
    // The dynamic shared array is declared as raw bytes and viewed as T.
    // Declaring it `extern __shared__ T temp[]` directly gives the same
    // extern symbol a different type in every instantiation, which fails to
    // compile as soon as the kernel is used with more than one T.
    extern __shared__ __align__(16) unsigned char prescan_smem[]; // allocated on invocation
    T *temp = reinterpret_cast<T *>(prescan_smem);
    int thid = threadIdx.x;
    int offset = 1;
    temp[2*thid] = g_idata[2*thid]; // load input into shared memory
    temp[2*thid+1] = g_idata[2*thid+1];
    for (int d = n>>1; d > 0; d >>= 1) // build sum in place up the tree
    {
        __syncthreads();
        if (thid < d)
        {
            int ai = offset*(2*thid+1)-1;
            int bi = offset*(2*thid+2)-1;
            temp[bi] += temp[ai];
        }
        offset *= 2;
    }
    if (thid == 0) { temp[n - 1] = 0; } // clear the last element
    for (int d = 1; d < n; d *= 2) // traverse down tree & build scan
    {
        offset >>= 1;
        __syncthreads();
        if (thid < d)
        {
            int ai = offset*(2*thid+1)-1;
            int bi = offset*(2*thid+2)-1;
            T t = temp[ai];
            temp[ai] = temp[bi];
            temp[bi] += t;
        }
    }
    __syncthreads();
    g_odata[2*thid] = temp[2*thid]; // write results to device memory
    g_odata[2*thid+1] = temp[2*thid+1];
}
// Host driver: scans DSIZE elements (0, 1, 2, ...) with prescan and compares
// every output element against a running sum computed on the host.
// Returns 0 when all elements match, 1 on allocation failure or mismatch;
// cudaCheckErrors exits the process on any CUDA error.
int main(){
    mytype *h_i, *d_i = NULL, *h_o, *d_o = NULL;
    int dszp = (DSIZE)*sizeof(mytype);
    h_i = (mytype *)malloc(dszp);
    h_o = (mytype *)malloc(dszp);
    if ((h_i == NULL) || (h_o == NULL)) {printf("malloc fail\n"); free(h_i); free(h_o); return 1;}
    cudaMalloc(&d_i, dszp);
    cudaMalloc(&d_o, dszp);
    cudaCheckErrors("cudaMalloc fail");
    for (int i = 0 ; i < DSIZE; i++){
        h_i[i] = i;
        h_o[i] = 0;}
    cudaMemset(d_o, 0, dszp);
    cudaCheckErrors("cudaMemset fail");
    cudaMemcpy(d_i, h_i, dszp, cudaMemcpyHostToDevice);
    cudaCheckErrors("cudaMemcpy 1 fail");
    // One block of DSIZE/2 threads (two elements per thread) with DSIZE
    // elements of dynamic shared memory
    prescan<<<1,DSIZE/2, dszp>>>(d_o, d_i, DSIZE);
    cudaDeviceSynchronize();
    cudaCheckErrors("kernel fail");
    cudaMemcpy(h_o, d_o, dszp, cudaMemcpyDeviceToHost);
    cudaCheckErrors("cudaMemcpy 2 fail");
    // Exclusive scan: element i must equal the sum of inputs 0..i-1, so the
    // comparison starts at element 0 (expected value 0) and psum is advanced
    // only after each comparison.
    int rc = 0;
    mytype psum = 0;
    for (int i = 0; i < DSIZE; i++){
        if (psum != h_o[i]) {printf("mismatch at %d, was: %d, should be: %d\n", i, h_o[i], psum); rc = 1; break;}
        psum += h_i[i];
    }
    // Release device and host buffers on both the success and mismatch paths
    cudaFree(d_i);
    cudaFree(d_o);
    free(h_i);
    free(h_o);
    return rc;
}

CUDA unknown error when copying from device to host

I wrote some CUDA code, and everything seems great until I try to get the results from the code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cstdlib>
#include <ctime>
#include <iostream>
#define maskSize 3
__constant__ float masks[32*maskSize*maskSize];
// Kernel as posted in the question. Per block it (1) copies one mSize x mSize
// matrix from `mats` (selected by blockIdx.y) into the middle of the dynamic
// shared array `curr`, (2) writes zeros around it as padding, and (3) for each
// of the (mSize+maskSize-1)^2 output positions accumulates
// curr[...] * masks[...] over a maskSize x maskSize window (mask selected by
// blockIdx.x) and stores the sum in `res`.
// `curr` is indexed with a row stride of mSize+2*(maskSize-1) floats, so the
// launch has to supply dynamic shared memory for that many floats squared.
__global__ void myConv(float *res, const float* mats, int mSize)
{
extern __shared__ float curr[];
// Side length of the output produced per block
int rSize=maskSize+mSize-1;
int idxmod=(threadIdx.x+maskSize-1) % (mSize+2*maskSize-2); //these two map any value not within (mSize-1,mSize-1) to the borders for padding.
int idymod=(threadIdx.y+maskSize-1) % (mSize+2*maskSize-2);
if (threadIdx.x < mSize && threadIdx.y < mSize) //put the value of mats in the middle of the curr matrix
curr[(threadIdx.x+ maskSize-1)*(mSize+2*(maskSize-1)) + threadIdx.y + maskSize-1]=mats[mSize*(blockIdx.y*mSize + threadIdx.x) + threadIdx.y];
else //zero padding
if (threadIdx.x < mSize)
curr[threadIdx.x*(mSize+2*(maskSize-1)) +idymod] =0;
else
curr[idxmod*(mSize+2*(maskSize-1)) +threadIdx.y] =0;
// All shared-memory writes must be complete before any thread reads its window
__syncthreads();
float tmp=0;
if (threadIdx.x < mSize+maskSize-1 && threadIdx.y < mSize+maskSize-1)
{
#pragma unroll
for (int i=0;i<maskSize;i++)
#pragma unroll
for (int j=0;j<maskSize;j++)
tmp+=curr[(threadIdx.x+i)*(mSize+2*(maskSize-1)) + threadIdx.y+j]*masks[blockIdx.x*maskSize*maskSize +maskSize*i +j];
// Output slot is selected by blockIdx.y only; blockIdx.x does not enter the index
res[blockIdx.y*rSize*rSize + threadIdx.x*rSize + threadIdx.y]=tmp;
}
}
// Host driver as posted in the question: creates maskNum random masks and
// bSize random MatSize x MatSize inputs, uploads them, launches myConv once
// and copies the result buffer back to the host.
// None of the CUDA calls below has its return status checked.
int main()
{
int MatSize=5;
int bSize=2000;
int maskNum=10;
int resSize=MatSize+maskSize-1;
float* ms;
ms=(float *)malloc(maskSize*maskSize*maskNum*sizeof(float));
float* resPtr=(float *)malloc((MatSize+maskSize-1)*(MatSize+maskSize-1)*bSize*maskNum*sizeof(float));
for (int i=0; i<maskSize;i++)
for (int j=0; j<maskSize; j++)
for (int k=0; k<maskNum; k++)
ms[k*maskSize*maskSize + j*maskSize + i]=(float)(rand() % 1000)/100;
float* inp=(float *)malloc(MatSize*MatSize*bSize*sizeof(float));
for (int i=0; i<MatSize; i++)
for (int j=0; j<MatSize; j++)
for (int k=0;k<bSize;k++)
inp[k*MatSize*MatSize + j*MatSize + i]=(float)(rand() % 500)/100;
float *cudams, *cudaresPtr,*cudainp;
cudaMalloc((void **) &cudams,maskSize*maskSize*maskNum*sizeof(float));
cudaMalloc((void **) &cudaresPtr,(MatSize+maskSize-1)*(MatSize+maskSize-1)*bSize*maskNum*sizeof(float));
cudaMalloc((void **) &cudainp,MatSize*MatSize*bSize*sizeof(float));
cudaMemcpy((void *)cudams,(void *)ms,maskSize*maskSize*maskNum*sizeof(float),cudaMemcpyHostToDevice);
cudaMemcpy((void *)cudainp,(void *)inp,MatSize*MatSize*bSize*sizeof(float),cudaMemcpyHostToDevice);
// The masks are first staged in the device buffer cudams and then copied
// device-to-device into the __constant__ array `masks`.
cudaMemcpyToSymbol(masks,(void *)cudams,maskSize*maskSize*maskNum*sizeof(float),0,cudaMemcpyDeviceToDevice);
dim3 threadSize(MatSize+2*(maskSize-1),MatSize+2*(maskSize-1));
dim3 blockSize(1, 1); //for testing purposes. should be dim3 blockSize(maskNum,bSize);
// NOTE: the third launch argument (dynamic shared memory size) is given here as
// an element count with no sizeof(float) factor, while the kernel stores
// floats in `curr`; the answer below points out that the size is in bytes.
myConv<<<blockSize, threadSize, (MatSize+2*(maskSize-1))*(MatSize+2*(maskSize-1))>>>(cudaresPtr,cudainp,MatSize);
cudaMemcpy((void *)resPtr,(const void *)cudaresPtr,(MatSize+maskSize-1)*(MatSize+maskSize-1)*bSize*maskNum*sizeof(float),cudaMemcpyDeviceToHost);
//The problem is here - The copying won't work!
// Only the host buffers are released; the three device buffers are not freed.
free(inp);
free(ms);
free(resPtr);
return 0;
}
I put printf in various places, used error checking as recommended here, printed error string... Can't find anything that would cause an error copying the contents of the pointer back to the host.
Edit: memcheck result: no errors if I understand correctly:
O:\CudaTst>cuda-memcheck CUDA_TST
========= CUDA-MEMCHECK
Time spent: 0.144000 secondsError: Failed to read the strings for
error record
========= ERROR SUMMARY: 0 errors
Re-ran with -l (leak) - 0 leaks.
It would appear that you are (at least) launching your kernel with insufficient dynamically allocated shared memory for it to run without a buffer overflow inside the kernel.
The amount of shared memory per block is specified in bytes, so I suspect you want something like:
// Dynamic shared memory size in bytes: one float per element of the padded
// (MatSize+2*(maskSize-1)) x (MatSize+2*(maskSize-1)) tile used by the kernel.
size_t shmsz = sizeof(float)*size_t((MatSize+2*(maskSize-1))*
                                    (MatSize+2*(maskSize-1)));
myConv<<<blockSize, threadSize, shmsz>>>(cudaresPtr,cudainp,MatSize);
Beyond that, I leave the debugging to you.

CUBLAS works unpredictably

Wrote my first program using CUDA+CUBLAS. It just uses a 'cublasDgemm' function and computes a product of 2 N*N matrices.
However, every time I launched my program, it kept producing the same wrong answer (e.g. when multiplying a 1*1 matrix containing 5 as its single element by a 1*1 matrix containing 6, it always said the result was 36, not 30).
I checked the program several times with no success. But when I came back to it the next day (i.e. after a reboot), it worked just fine. I don't remember whether I recompiled it or not, but it is the same VS project, same code, same computer with its GPU.
So, can anyone explain me why could that have happened? And do I have to expect same strange behaviour further?
Here is the code I was launching:
#include <iostream>
#include <string>
#include <iomanip>
#include <cuda_runtime.h>
#include <cublas_v2.h>
const int N = 5;
#define IDX2F(i,j) ((i) * N + j)
// Aborts the program when a CUDA runtime call did not succeed: writes
// errorMessage to std::cerr and exits with EXIT_FAILURE.
// Returns normally when cudaStatus is cudaSuccess.
void fail(const cudaError_t& cudaStatus, const std::string& errorMessage) {
    if (cudaStatus == cudaSuccess)
        return;

    std::cerr << errorMessage << std::endl;
    exit(EXIT_FAILURE);
}
// Aborts the program when a cuBLAS call did not succeed: writes errorMessage
// to std::cerr and exits with EXIT_FAILURE.
// Returns normally when status is CUBLAS_STATUS_SUCCESS.
void fail(const cublasStatus_t& status, const std::string& errorMessage) {
    if (status == CUBLAS_STATUS_SUCCESS)
        return;

    std::cerr << errorMessage << std::endl;
    exit(EXIT_FAILURE);
}
// Prints the N x N matrix C to std::cout, one line per index i, each value in
// fixed notation with two decimals followed by a space, and ends with an
// empty line. Element (i, j) is read from C[IDX2F(i,j)].
void printMatrix(const double *C) {
    int i = 0;
    while (i < N) {
        // Start of the run holding elements (i, 0) .. (i, N-1)
        const double *line = C + IDX2F(i, 0);
        int j = 0;
        while (j < N) {
            std::cout << std::fixed << std::setprecision(2) << line[j] << ' ';
            ++j;
        }
        std::cout << std::endl;
        ++i;
    }
    std::cout << std::endl;
}
// Host driver as posted in the question: fills two N x N host matrices,
// uploads them with cublasSetMatrix, multiplies them with cublasDgemm
// (alpha = 1, beta = 0) and prints A, B and the downloaded product C.
// Every CUDA/cuBLAS status goes through fail(), which exits on error.
int main(int argc, char **argv) {
cudaError_t cudaStatus;
cublasStatus_t status;
cublasHandle_t handle;
double *A = new double[N*N];
double *devPtrA;
double *B = new double[N*N];
double *devPtrB;
double *C = new double[N*N];
double *devPtrC;
// IDX2F(i,j) expands to i*N + j, so element (i, j) sits at offset i*N + j.
for (int i=0; i<N; i++)
for (int j=0; j<N; j++)
A[IDX2F(i,j)] = i + j;
for (int i=0; i<N; i++)
for (int j=0; j<N; j++)
B[IDX2F(i,j)] = i + j * 0.5;
// do not have to set anything into matrix C, because beta = 0
// allocate memory on GPU
cudaStatus = cudaMalloc((void**)&devPtrC, N*N*sizeof(*C));
fail(cudaStatus, "device memory allocation failed");
cudaStatus = cudaMalloc((void**)&devPtrA, N*N*sizeof(*A));
fail(cudaStatus, "device memory allocation failed");
cudaStatus = cudaMalloc((void**)&devPtrB, N*N*sizeof(*B));
fail(cudaStatus, "device memory allocation failed");
// create GPU handle
status = cublasCreate(&handle);
fail(status, "CUBLAS initialization failed");
// copying matrices from host to GPU
status = cublasSetMatrix(N, N, sizeof (*B), B, N, devPtrB, N);
fail(status, "failed to load data from host to GPU");
status = cublasSetMatrix(N, N, sizeof (*A), A, N, devPtrA, N);
fail(status, "failed to load data from host to GPU");
// Scalars are passed to cublasDgemm by address.
const double ONE = 1;
const double ZERO = 0;
printMatrix(A);
printMatrix(B);
// NOTE(review): cuBLAS works on column-major matrices, whereas the host
// arrays here are filled and printed with i*N + j indexing — confirm the
// intended operand order / transposition flags.
status = cublasDgemm( handle,
CUBLAS_OP_N, CUBLAS_OP_N,
N, N, N,
&ONE,
devPtrA, N,
devPtrB, N,
&ZERO,
devPtrC, N);
fail(status, "error cublasDgemm");
status = cublasGetMatrix(N, N, sizeof (*C), devPtrC, N, C, N);
fail(status, "could not load result back from GPU to host");
printMatrix(C);
status = cublasDestroy(handle);
fail(status, "could not destroy CUBLAS handle");
cudaStatus = cudaFree(devPtrC);
fail(cudaStatus, "device memory freeing failed");
cudaStatus = cudaFree(devPtrB);
fail(cudaStatus, "device memory freeing failed");
cudaStatus = cudaFree(devPtrA);
fail(cudaStatus, "device memory freeing failed");
delete[] C;
delete[] B;
delete[] A;
return EXIT_SUCCESS;
}
op(B) must be CUBLAS_OP_T
.
.
status = cublasDgemm( handle,
CUBLAS_OP_N, CUBLAS_OP_T,
N, N, N,
&ONE,
devPtrA, N,
devPtrB, N,
&ZERO,
devPtrC, N);
.
.
.
.
The definition is: $C = \alpha\,\mathrm{op}(A)\,\mathrm{op}(B) + \beta C$
http://docs.nvidia.com/cuda/cublas/index.html#topic_8_1