cublas address out of bounds for particular matrix size - cuda

When I run the following code to compute the matrix multiplication y = X * B:
#include <iostream>
#include <Eigen/Dense>
#include <cuda_runtime.h>
#include "cublas_v2.h"

using namespace Eigen;

int main(){
    int N = 240000;
    int K = 3;
    int p = 9700;
    MatrixXf X_host = MatrixXf::Zero(N, p);
    MatrixXf B_host = MatrixXf::Zero(p, K);
    MatrixXf y_host(N, K);

    float *X_dev;
    float *B_dev;
    float *y_dev;
    cudaMalloc((void**)&X_dev, sizeof(float) * p * N);
    cudaMalloc((void**)&B_dev, sizeof(float) * p * K);
    cudaMalloc((void**)&y_dev, sizeof(float) * N * K);

    cudaMemcpy(X_dev, X_host.data(), sizeof(float)*p*N, cudaMemcpyHostToDevice);
    cudaMemcpy(B_dev, B_host.data(), sizeof(float)*p*K, cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasCreate(&handle);

    cudaError_t error = cudaGetLastError();
    if(error != cudaSuccess)
    {
        std::cout << "CUDA error: " << cudaGetErrorString(error) << std::endl;
    } else {
        std::cout << "No problem before cublas call\n";
    }

    float alpha = 1.0;
    float beta = 0.0;
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                N, K, p, &alpha,
                X_dev, N, B_dev, p, &beta, y_dev, N);
    cudaDeviceSynchronize();

    error = cudaGetLastError();
    if(error != cudaSuccess)
    {
        std::cout << "CUDA error: " << cudaGetErrorString(error) << std::endl;
    }

    cublasDestroy(handle);
    cudaFree(X_dev);
    cudaFree(B_dev);
    cudaFree(y_dev);
    return 0;
}
I got this error from cuda-memcheck:
========= Invalid __global__ read of size 4
========= at 0x00000a88 in void gemmSN_NN_kernel<float, int=256, int=4, int=2, int=8, int=4, int=4, cublasGemvTensorStridedBatched<float const >, cublasGemvTensorStridedBatched<float>>(cublasGemmSmallNParams<float const , cublasGemvTensorStridedBatched<float const >, float>)
========= by thread (223,0,0) in block (190,0,0)
========= Address 0x2b660269807c is out of bounds
There are about 100 such address-out-of-bounds errors, and the number of them varies between runs. The problem disappears when I set K to a larger number (for example, 10). Does anyone have an idea what might be going on? I'm using CUDA 10.1 on a P100 under CentOS 7. Thanks!
Update on September 21, 2020:
This issue is gone after I updated to CUDA 11.

As mentioned in the comments, this would appear to be an internal issue in the cuBLAS library. I would editorialize and guess that they don't have test coverage for this unusual problem shape, with such a small inner-product dimension, and this bug passed through pre-release testing undetected.
As is usual with likely bugs, your best bet is to submit the code in your question as a repro case in a ticket on the NVIDIA developer portal.
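Until a fixed library version is available, one possible workaround (a sketch of mine, not a confirmed fix, and unverified against the affected library version) is to avoid the small-N gemm path entirely and compute y one column at a time with cublasSgemv, reusing the handle and device buffers from the question:

// Hypothetical workaround: y(:,j) = X * B(:,j), one gemv per output column.
// This sidesteps the gemmSN_NN small-N kernel implicated by cuda-memcheck.
for (int j = 0; j < K; ++j) {
    cublasSgemv(handle, CUBLAS_OP_N, N, p, &alpha,
                X_dev, N,                  // X is N x p, column-major, lda = N
                B_dev + (size_t)j * p, 1,  // j-th column of B
                &beta,
                y_dev + (size_t)j * N, 1); // j-th column of y
}

For K = 3 this costs three kernel launches instead of one, which is likely negligible at these matrix sizes.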

Related

CUDA-why it cannot printf the information in cuda code? [duplicate]

This question was closed as a duplicate of Trouble compiling helloworld.cu.
I am a beginner with CUDA. I wrote a test code to exercise the GPU device. My GPU model is K80; there are 8 GPU cards in one node.
#include <iostream>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

#define N 10000

__global__ void add(int *a, int *b, int *c)
{
    int tid = blockIdx.x;
    if (tid < N)
        c[tid] = a[tid] + b[tid];
}

int main()
{
    int a[N], b[N], c[N];
    int *dev_a, *dev_b, *dev_c;
    cudaMalloc((void**)&dev_a, N * sizeof(int));
    cudaMalloc((void**)&dev_b, N * sizeof(int));
    cudaMalloc((void**)&dev_c, N * sizeof(int));
    for (int i = 0; i < N; i++)
    {
        a[i] = -i;
        b[i] = i*i;
    }
    cudaMemcpy(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice);
    add<<<N, 1>>>(dev_a, dev_b, dev_c);
    cudaMemcpy(c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost);
    for (int i = 0; i < N; i++)
    {
        printf("%d + %d = %d\n", a[i], b[i], c[i]);
    }
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
    return 0;
}
When I compile the code:
nvcc gputest.cu -o gputest
I got errors :
gputest.cu(38): error: identifier "printf" is undefined
1 error detected in the compilation of "/tmp/tmpxft_000059a6_00000000-4_gputest.cpp4.ii".
I thought printf was a function declared in iostream, and I have already included iostream, so I don't understand why it fails to compile.
Add:
#include <stdio.h>
and it will compile OK.
printf is a function declared in the C standard I/O header (stdio.h, or cstdio in C++), so including stdio.h makes sense here. Different compilers may behave differently, but in the case of nvcc this is generally the right way to do it.
(It's not valid to assume in all cases that including iostream will also declare printf.)
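For completeness, a minimal sketch of the fix; the same header is what makes device-side printf compile under nvcc:

#include <stdio.h>        // declares printf for both host and device code
#include <cuda_runtime.h>

__global__ void hello()
{
    printf("hello from thread %d\n", threadIdx.x);
}

int main()
{
    hello<<<1, 4>>>();
    cudaDeviceSynchronize(); // flush device-side printf output
    return 0;
}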

invalid device ordinal on cudaMemPrefetchAsync

I'm running a toy CUDA sample on my GeForce 1080 Ti (Pascal) on Windows 10 with CUDA 9.2.
The goal is to test cudaMemPrefetchAsync to the CPU, which is supposed to work.
However, I get a CUDA error (invalid device ordinal) on one particular line.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cstdio>
#include <cstdlib>
void fill(int* a, int val, int N) {
for (int k = 0; k < N; ++k) {
a[k] = val;
}
}
__global__ void add(int* a, int* b, int N)
{
for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < N; i += blockDim.x * gridDim.x) {
a[i] += b[i];
}
}
inline void check(cudaError_t err, const char* file, int line) {
if (err != cudaSuccess) {
::fprintf(stderr, "ERROR at %s[%d] : %s\n", file, line, cudaGetErrorString(err));
abort();
}
}
#define CUDA_CHECK(err) do { check(err, __FILE__, __LINE__); } while(0)
int main()
{
int deviceId;
CUDA_CHECK(cudaGetDevice(&deviceId));
const int N = 1024*1024*32;
int *a, *b;
CUDA_CHECK(cudaMallocManaged(&a, N * sizeof(int)));
CUDA_CHECK(cudaMallocManaged(&b, N * sizeof(int)));
CUDA_CHECK(cudaMemPrefetchAsync(a, N * sizeof(int), cudaCpuDeviceId)); // program breaks here
CUDA_CHECK(cudaMemPrefetchAsync(b, N * sizeof(int), cudaCpuDeviceId));
fill(a, 1, N);
fill(a, 2, N);
CUDA_CHECK(cudaMemPrefetchAsync(a, N * sizeof(int), deviceId));
CUDA_CHECK(cudaMemPrefetchAsync(b, N * sizeof(int), deviceId));
add<<<32, 256>>>(a, b, N);
CUDA_CHECK(cudaGetLastError());
CUDA_CHECK(cudaDeviceSynchronize());
return 0;
}
Is that a hardware/driver/OS limitation? Can I simply ignore the error?
Is that a hardware/driver/OS limitation?
Yes, the latter. Quoting from the documentation:
GPUs with SM architecture 6.x or higher (Pascal class or newer)
provide additional Unified Memory features such as on-demand page
migration and GPU memory oversubscription that are outlined throughout
this document. Note that currently these features are only supported
on Linux operating systems.
So asynchronous page migration is not supported on Windows at the moment, and that is why you get an error when you try to use it.
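A portable way to handle this (a sketch of mine, using the documented concurrentManagedAccess device attribute and the CUDA_CHECK macro from the question) is to query support at runtime and skip the prefetch when it is unavailable:

int concurrentManaged = 0;
CUDA_CHECK(cudaDeviceGetAttribute(&concurrentManaged,
                                  cudaDevAttrConcurrentManagedAccess,
                                  deviceId));
if (concurrentManaged) {
    // On-demand page migration is supported; prefetching to the CPU is legal.
    CUDA_CHECK(cudaMemPrefetchAsync(a, N * sizeof(int), cudaCpuDeviceId));
    CUDA_CHECK(cudaMemPrefetchAsync(b, N * sizeof(int), cudaCpuDeviceId));
}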

Trying to use: cudaHostAllocWriteCombined flag but I'm getting invalid argument when I try cudaMemcpy

First off, I'd like to say I really do like the CUDA documentation; it's great and resourceful, although I'm finding it hard to tell what is supported in which version. I'm using CUDA driver version 5.0 with compute capability 2.0 and was wondering whether cudaHostAllocWriteCombined is supported.
In my code:
float *d_data, h_data;
h_data = new float[A];
assert(cudaHostAlloc((void **)&d_data, A * sizeof(float), cudaHostAllocWriteCombined) == cudaSuccess);
cudaError_t err = cudaMemcpy(d_data, h_data, A * sizeof(float), cudaMemcpyHostToDevice);
if (err != cudaSuccess)
{
    std::cout << cudaGetErrorString(err) << std::endl;
    return false;
}
The error returned is invalid argument; however, if I use cudaHostAllocDefault it seems to work fine. I understand that write-combined memory is fast to write and slow to read, and that's why I would like to use it.
Your usage of h_data is incorrect. new returns a pointer, which should be assigned to a variable of the correct type. Replace h_data with *h_data in your declaration, and your code will be more or less correct; cudaMemcpy should no longer return an invalid argument error.
The following complete code shows the correction and compiles and runs without error for me on CUDA 6:
#include <iostream>
#include <assert.h>
#define A 1024

int main(){
    float *d_data, *h_data;
    h_data = new float[A];
    cudaError_t err = cudaHostAlloc((void **)&d_data, A * sizeof(float), cudaHostAllocWriteCombined);
    if (err != cudaSuccess)
    {
        std::cout << "cudaHostAlloc fail " << std::endl;
        std::cout << cudaGetErrorString(err) << std::endl;
        return 1;
    }
    err = cudaMemcpy(d_data, h_data, A * sizeof(float), cudaMemcpyHostToDevice);
    if (err != cudaSuccess)
    {
        std::cout << "cudaMemcpy fail" << std::endl;
        std::cout << cudaGetErrorString(err) << std::endl;
        return 1;
    }
    return 0;
}
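One detail worth adding (not in the original answer): despite its name, d_data here is host-pinned memory allocated by cudaHostAlloc, so it should eventually be released with cudaFreeHost, and the new[] buffer with delete[]:

cudaFreeHost(d_data); // releases the pinned, write-combined host allocation
delete[] h_data;      // releases the ordinary host heap allocation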

memset in CUBLAS gemm is always launched in default stream

I noticed that for each cublasSgemm call from the host there are 3 kernel invocations: a memset, a scal_kernel, and the gemm kernel itself (e.g. sgemm_large). This happens even if I use constant alpha/beta allocated in device memory. While the overhead of the memset and scal_kernel is relatively small, the problem is that the memset is always launched in the default stream, which causes unnecessary synchronization.
The code:
__constant__ __device__ float alpha = 1;
__constant__ __device__ float beta = 1;

int main()
{
    // ... memory allocation skipped ...
    float* px = thrust::raw_pointer_cast(x.data());
    float* py = thrust::raw_pointer_cast(y.data());
    float* pmat = thrust::raw_pointer_cast(mat.data());
    for (int iter = 0; iter < 3; ++iter)
    {
        cbstatus = cublasSgemm(cbh, CUBLAS_OP_N, CUBLAS_OP_N, crow, ccol, cshared, &alpha, px, crow, py, cshared, &beta, pmat, crow);
        assert(0 == cbstatus);
    }
}
This is what I see in the profiler. (The timeline screenshot is not reproduced here.)
The question: is there a way to avoid the memset, or to make it run in the stream assigned to the cuBLAS handle?
One idea is to use dynamic parallelism and run the device version of the gemm function, but this will work only on CC 3.5 and higher.
There was a bug in cuBLAS 5.5 where a cudaMemset was used instead of cudaMemsetAsync in the specialized path where k >> m,n.
It is fixed in the cuBLAS 6.0 RC, which you can access if you are a registered developer.
By the way, I wonder why you use __constant__ __device__ for alpha and beta.
Are you using pointerMode = DEVICE?
If not, you could simply use alpha and beta on the host.
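For reference, a minimal sketch (my own, under the assumption that device-resident scalars are actually wanted) of how device-side alpha/beta are used with CUBLAS_POINTER_MODE_DEVICE, reusing the names from the question:

float *d_alpha, *d_beta;
float one = 1.0f;
cudaMalloc((void**)&d_alpha, sizeof(float));
cudaMalloc((void**)&d_beta,  sizeof(float));
cudaMemcpy(d_alpha, &one, sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_beta,  &one, sizeof(float), cudaMemcpyHostToDevice);

// Tell cuBLAS that the alpha/beta pointers refer to device memory.
cublasSetPointerMode(cbh, CUBLAS_POINTER_MODE_DEVICE);
cbstatus = cublasSgemm(cbh, CUBLAS_OP_N, CUBLAS_OP_N, crow, ccol, cshared,
                       d_alpha, px, crow, py, cshared, d_beta, pmat, crow);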
Try the code below. The code is conceived to contain only the cublasSgemm calls, apart from unavoidable memory allocations and copies. You will see that:
only one kernel is launched (gemm_kernel1x1_core);
the two calls to cublasSgemm run perfectly in two different streams.
(The Visual Profiler timeline showing this is not reproduced here.)
My system: GeForce 540M, Windows 7, CUDA 5.5.
#include <conio.h>
#include <stdio.h>
#include <assert.h>
#include <cublas_v2.h>

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) { getch(); exit(code); }
    }
}

/**********************/
/* cuBLAS ERROR CHECK */
/**********************/
#ifndef cublasSafeCall
#define cublasSafeCall(err) __cublasSafeCall(err, __FILE__, __LINE__)
#endif
inline void __cublasSafeCall(cublasStatus_t err, const char *file, const int line)
{
    if( CUBLAS_STATUS_SUCCESS != err) {
        // Report the caller's location (file, line), not this function's.
        fprintf(stderr, "CUBLAS error in file '%s', line %d\n \nerror %d \nterminating!\n", file, line, err);
        getch(); cudaDeviceReset(); assert(0);
    }
}

/********/
/* MAIN */
/********/
int main()
{
    int N = 5;

    float *A1, *A2, *B1, *B2, *C1, *C2;
    float *d_A1, *d_A2, *d_B1, *d_B2, *d_C1, *d_C2;

    A1 = (float*)malloc(N*N*sizeof(float));
    B1 = (float*)malloc(N*N*sizeof(float));
    C1 = (float*)malloc(N*N*sizeof(float));
    A2 = (float*)malloc(N*N*sizeof(float));
    B2 = (float*)malloc(N*N*sizeof(float));
    C2 = (float*)malloc(N*N*sizeof(float));

    gpuErrchk(cudaMalloc((void**)&d_A1,N*N*sizeof(float)));
    gpuErrchk(cudaMalloc((void**)&d_B1,N*N*sizeof(float)));
    gpuErrchk(cudaMalloc((void**)&d_C1,N*N*sizeof(float)));
    gpuErrchk(cudaMalloc((void**)&d_A2,N*N*sizeof(float)));
    gpuErrchk(cudaMalloc((void**)&d_B2,N*N*sizeof(float)));
    gpuErrchk(cudaMalloc((void**)&d_C2,N*N*sizeof(float)));

    for (int i=0; i<N*N; i++) {
        A1[i] = ((float)rand()/(float)RAND_MAX);
        A2[i] = ((float)rand()/(float)RAND_MAX);
        B1[i] = ((float)rand()/(float)RAND_MAX);
        B2[i] = ((float)rand()/(float)RAND_MAX);
    }

    gpuErrchk(cudaMemcpy(d_A1, A1, N*N*sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_B1, B1, N*N*sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_A2, A2, N*N*sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_B2, B2, N*N*sizeof(float), cudaMemcpyHostToDevice));

    cublasHandle_t handle;
    cublasSafeCall(cublasCreate(&handle));

    cudaStream_t stream1, stream2;
    gpuErrchk(cudaStreamCreate(&stream1));
    gpuErrchk(cudaStreamCreate(&stream2));

    float alpha = 1.f;
    float beta  = 1.f;

    cublasSafeCall(cublasSetStream(handle,stream1));
    cublasSafeCall(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A1, N, d_B1, N, &beta, d_C1, N));
    cublasSafeCall(cublasSetStream(handle,stream2));
    cublasSafeCall(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A2, N, d_B2, N, &beta, d_C2, N));

    gpuErrchk(cudaDeviceReset());

    return 0;
}
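If you want to verify the stream assignment yourself, the per-kernel stream IDs are visible in the Visual Profiler timeline, or in a command-line GPU trace, e.g. (the executable name here is hypothetical):
nvprof --print-gpu-trace ./gemm_streams_test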

Wrong results of a CUDA dynamic parallelism code

I recently bumped into the problem illustrated at Uncorrectable ECC error. In short, from time to time I receive an uncorrectable ECC error, and my dynamic parallelism code generates incorrect results. The most probable hypothesis for the uncorrectable ECC error is a corrupted driver stack, which has also been indirectly confirmed by the experience of another user (see the above post). I would now like to address the second issue, i.e., the algorithmic one. To this end, I'm dealing with the reproducer reported below which, since the original code generating incorrect results uses dynamic parallelism, uses this CUDA feature too.
I do not see any evident issue with this code. I think the synchronization around the child kernel launch should be fine: the first __syncthreads() should not be necessary, and the cudaDeviceSynchronize() should ensure that all the memory writes of the child kernel complete before the printf.
My question is: is this code wrong, or are the wrong results due to a non-programming issue?
My configuration: CUDA 5.0, Windows 7, 4-GPU system equipped with Kepler K20c, driver 327.23.
#include <stdio.h>
#include <conio.h>

#define K 6
#define BLOCK_SIZE 256

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) { getch(); exit(code); }
    }
}

int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }

__global__ void child_kernel(double* P1)
{
    int m = threadIdx.x;
    P1[m] = (double)m;
}

__global__ void parent_kernel(double* __restrict__ x, int M)
{
    int i = threadIdx.x + blockDim.x * blockIdx.x;
    if(i<M) {
        double* P1 = new double[13];
        dim3 dimBlock(2*K+1,1); dim3 dimGrid(1,1);
        __syncthreads();
        child_kernel<<<dimGrid,dimBlock>>>(P1);
        cudaDeviceSynchronize();
        for(int m=0; m<2*K+1; m++) printf("%f %f\n",P1[m],(double)m);
    }
}

int main() {
    const int M = 19000;
    //gpuErrchk(cudaSetDevice(0));
    double* x = (double*)malloc(M*sizeof(double));
    for (int i=0; i<M; i++) x[i] = (double)i;
    double* d_x; gpuErrchk(cudaMalloc((void**)&d_x,M*sizeof(double)));
    gpuErrchk(cudaMemcpy(d_x,x,M*sizeof(double),cudaMemcpyHostToDevice));

    dim3 dimBlock(BLOCK_SIZE,1); dim3 dimGrid(iDivUp(M,BLOCK_SIZE));
    parent_kernel<<<dimGrid,dimBlock>>>(d_x,M);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    getch();
    return 0;
}
I'm pretty sure you're exceeding the launch pending limit. It's nearly impossible to tell with your code as-is, but I've modified it and added error checking on the child kernel launch.
When I do that, I get launch errors, signified by a printout of !. Skipping the launch error cases, all of my in-kernel checking of P1[m] vs. m passes (I get no * printout at all.)
#include <stdio.h>

#define K 6
#define BLOCK_SIZE 256

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) { exit(code); }
    }
}

int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }

__global__ void child_kernel(unsigned long long* P1)
{
    int m = threadIdx.x;
    P1[m] = (unsigned long long)m;
}

__global__ void parent_kernel(double* __restrict__ x, int M)
{
    int i = threadIdx.x + blockDim.x * blockIdx.x;
    if(i<M) {
        unsigned long long* P1 = new unsigned long long[13];
        dim3 dimBlock(2*K+1,1); dim3 dimGrid(1,1);
        __syncthreads();
        child_kernel<<<dimGrid,dimBlock>>>(P1);
        cudaDeviceSynchronize();
        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess) printf("!");
        else for(unsigned long long m=0; m<dimBlock.x; m++) if (P1[m] != m) printf("*");
    }
}

int main() {
    const int M = 19000;
    //gpuErrchk(cudaSetDevice(0));
    double* x = (double*)malloc(M*sizeof(double));
    for (int i=0; i<M; i++) x[i] = (double)i;
    double* d_x; gpuErrchk(cudaMalloc((void**)&d_x,M*sizeof(double)));
    gpuErrchk(cudaMemcpy(d_x,x,M*sizeof(double),cudaMemcpyHostToDevice));

    dim3 dimBlock(BLOCK_SIZE,1); dim3 dimGrid(iDivUp(M,BLOCK_SIZE));
    parent_kernel<<<dimGrid,dimBlock>>>(d_x,M);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    return 0;
}
Feel free to add further decoding of the err variable in the parent kernel to convince yourself that you are exceeding the pending launch limit. As another test, you can set M to 2048 instead of 19000 in your host code, and all the ! printouts go away (the default pending launch limit is 2048).
As I've stated in the comments, I think the uncorrectable ECC error is a separate issue, and I suggest trying the 321.01 driver that I linked in the comments.
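If the pending launch limit is indeed the cause, one possible mitigation (a sketch of mine, not part of the original answer) is to raise the device runtime's pending launch count from the host before launching the parent kernel:

// Each entry in the pending-launch buffer consumes device memory,
// so raise the limit only as far as the workload actually needs.
gpuErrchk(cudaDeviceSetLimit(cudaLimitDevRuntimePendingLaunchCount, 32768));
parent_kernel<<<dimGrid,dimBlock>>>(d_x,M);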