Application crashes in nvcuda.dll after running for a long time

I have a strange issue where my application crashes in nvcuda.dll after running for about 2 hours. After spending a lot of time trying to debug it, I think I have an idea of what's going on, but I'd like to know if anybody else has seen this problem.
My application launches most of its kernels in non-default streams, and this can go on for hours before there is a need to use the default stream. Everything was working fine until I upgraded the drivers from a 320-series version to the most recent 332.50 version (for K40m). What happens now is that if the app runs for about 2 hours and then makes any call which uses the default stream, it crashes during that call somewhere inside nvcuda.dll. At first I thought something was wrong with my kernels, but it happens even with basic calls like cudaMemcpy (which uses the default stream). The crash does not happen when the app has been running for, say, 1 hour or 1.5 hours. It took me a while to realize it might be an issue with the driver, so I uninstalled the new driver and installed the old one (320.92), and the problem was gone! I repeated the same process (changing the driver, rebooting, then running the app again) multiple times and had a 100% repro.
Unfortunately, I don't have a small, self-contained repro, but before I try to create one, has anybody seen something like this recently? The entry from Event Viewer at the time of the crash does not say much:
Faulting application name: <app>.exe, version: <version>, time stamp: 0x5316a970
Faulting module name: nvcuda.dll, version: 8.17.13.3250, time stamp: 0x52e1fa40
Exception code: 0xc00000fd
Fault offset: 0x00000000002226e7
Faulting process id: 0x1558
Faulting application start time: 0x01cf3831a2f3b71b
Faulting application path: <app>.exe
Faulting module path: C:\windows\SYSTEM32\nvcuda.dll
Report Id: aceb9a51-a433-11e3-9403-90b11c4725be
Faulting package full name:
Faulting package-relative application ID:
Update 1:
I now have a simple application which reproduces the crash both on K20m and K40m cards.
Update 2:
Updated the sample app and was able to repro the crash. From the call stack it looks like there is a stack overflow somewhere in nvcuda.dll.
Steps:
1. Install the latest version (332.50) of the drivers on the machine.
2. In Visual Studio 2012 create a new CUDA 5.5 project.
3. Replace the contents of kernel.cu with the code below.
4. Compile and run the code on a machine with a K20m or K40m.
5. After approximately 2 hours of execution the app will crash and the entry below will be written into the event log.
6. Uninstall the driver and install a previous (e.g. 321.10) version of the driver.
7. Run the app; it should still be running after 2, 3 and more hours.
Log:
Faulting application name: CudaTests60.exe, version: 0.0.0.0, time stamp: 0x5317974f
Faulting module name: nvcuda.dll, version: 8.17.13.3250, time stamp: 0x52e1fa40
Exception code: 0xc00000fd
Fault offset: 0x000000000004f5cb
Faulting process id: 0x23d0
Faulting application start time: 0x01cf38ba16961e74
Faulting application path: d:\bin\test\CudaTests60.exe
Faulting module path: C:\windows\system32\nvcuda.dll
Report Id: 192506c4-a4be-11e3-9401-90b11c4b02c0
Faulting package full name:
Faulting package-relative application ID:
Code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <vector>
#include <stdio.h>
#include <assert.h>
#include <cublas_v2.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <Windows.h>
int main()
{
    cudaError_t cudaStatus;
    {
        int crow = 10000;
        int ccol = 10000;
        int cshared = 10000;
        int xLength = crow * cshared;
        int yLength = cshared * ccol;
        int matLength = crow * ccol;

        thrust::device_vector<float> x(xLength);
        thrust::device_vector<float> y(yLength);
        thrust::device_vector<float> mat(matLength);
        thrust::fill(x.begin(), x.end(), 1.0f);
        thrust::fill(y.begin(), y.end(), 1.0f);
        thrust::fill(mat.begin(), mat.end(), .0f);

        cudaStream_t ops;
        cudaStatus = cudaStreamCreate(&ops);
        assert(0 == cudaStatus);

        cublasHandle_t cbh;
        cublasStatus_t cbstatus;
        cbstatus = cublasCreate(&cbh);
        assert(0 == cbstatus);
        cbstatus = cublasSetStream(cbh, ops);
        assert(0 == cbstatus);

        float alpha = 1;
        float beta = 0;
        float* px = thrust::raw_pointer_cast(x.data());
        float* py = thrust::raw_pointer_cast(y.data());
        float* pmat = thrust::raw_pointer_cast(mat.data());

        ULONGLONG start = GetTickCount64();
        ULONGLONG iter = 0;
        while (true)
        {
            cbstatus = cublasSgemm(cbh, CUBLAS_OP_N, CUBLAS_OP_N, crow, ccol, cshared, &alpha, px, crow, py, cshared, &beta, pmat, crow);
            assert(0 == cbstatus);
            if (0 != cbstatus)
            {
                printf("cublasSgemm failed: %d.\n", cbstatus);
                break;
            }

            cudaStatus = cudaStreamSynchronize(ops);
            assert(0 == cudaStatus);
            if (0 != cudaStatus)
            {
                printf("cudaStreamSynchronize failed: %d.\n", cudaStatus);
                break;
            }

            ULONGLONG cur = GetTickCount64();
            // Exit after 2 hours.
            if (cur - start > 2 * 3600 * 1000)
                break;
            iter++;
        }

        // Crash will happen here.
        printf("Before cudaMemcpy.\n");
        float res = 0;
        cudaStatus = cudaMemcpy(&res, px, sizeof(float), cudaMemcpyDeviceToHost);
        assert(0 == cudaStatus);
        if (0 == cudaStatus)
            printf("After cudaMemcpy: %f\n", res);
        else
            printf("cudaMemcpy failed: %d\n", cudaStatus);
    }
    return 0;
}

I'm not surprised the program crashes right where you've indicated.
This line of code is illegal:
cudaStatus = cudaMemcpy(pmat, px, x.size() * sizeof(float), cudaMemcpyDeviceToHost);
Both pmat and px are pointers to device memory. However, you've requested cudaMemcpyDeviceToHost, which means the pmat pointer is interpreted as a host pointer and gets dereferenced during the copy operation. Dereferencing a device pointer in host code is illegal and will cause a seg fault.
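For reference, here is a minimal sketch (not taken from the original post) of the two legal ways to spell such a copy: when both pointers refer to device memory the kind must be cudaMemcpyDeviceToDevice, and a device-to-host copy needs a genuine host buffer as the destination.
#include <cuda_runtime.h>
#include <vector>
#include <cstdio>

int main()
{
    const size_t n = 1024;
    float *d_src = nullptr, *d_dst = nullptr;
    cudaMalloc(&d_src, n * sizeof(float));
    cudaMalloc(&d_dst, n * sizeof(float));

    // Device -> device: both pointers live on the GPU, so the kind says so.
    cudaMemcpy(d_dst, d_src, n * sizeof(float), cudaMemcpyDeviceToDevice);

    // Device -> host: the destination must be a real host buffer.
    std::vector<float> h_dst(n);
    cudaMemcpy(h_dst.data(), d_src, n * sizeof(float), cudaMemcpyDeviceToHost);

    cudaFree(d_src);
    cudaFree(d_dst);
    printf("copies done\n");
    return 0;
}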
With suitable modifications I ran your code on linux and it indicates a seg fault at that line.
Note that I'm not disputing there may be a problem in the driver you indicated (bugs are possible!), but I don't think this code is reproducing anything related to a driver bug.
Bugs can be filed at: https://developer.nvidia.com/nvbugs/cuda/add. You will need to log in with developer credentials.
As an aside, your code appears to take a designed exit after 2 hours. I don't see how it could be running longer than that, as you indicate in step 7:
7. Run the app; it should still be running after 2, 3 and more hours.
Unless there is something wrong with your tick-count timing, which I haven't validated.

The bug has been fixed in the Tesla drivers starting with version 333.11. If you have the same problem, make sure you've updated the drivers.
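If you want to confirm from code which display driver the machine is actually running (so you can check it is 333.11 or later), one option is to query NVML, the same API used by the profiler code further down this page. This snippet is not part of the original answer, just a small sketch:
#include <nvml.h>
#include <stdio.h>

int main()
{
    char version[80];
    // Ask NVML for the installed display driver version string.
    if (nvmlInit() != NVML_SUCCESS) {
        printf("nvmlInit failed\n");
        return 1;
    }
    if (nvmlSystemGetDriverVersion(version, 80) == NVML_SUCCESS)
        printf("Driver version: %s\n", version);
    nvmlShutdown();
    return 0;
}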

Related

What does nvprof output: "No kernels were profiled" mean, and how to fix it

I have recently installed CUDA on my Arch Linux machine through the system's package manager, and I have been trying to test whether or not it is working by running a simple vector addition program.
I simply copy-paste the code from this tutorial (both the version using one kernel and the one using more) into a file titled cuda_test.cu and run
> nvcc cuda_test.cu -o cuda_test
In either case the program runs, and I get no errors (both in the sense that the program doesn't crash and that its output reports no errors). But when I try to run the CUDA profiler on the program:
> sudo nvprof ./cuda_test
I get result:
==3201== NVPROF is profiling process 3201, command: ./cuda_test
Max error: 0
==3201== Profiling application: ./cuda_test
==3201== Profiling result:
No kernels were profiled.
No API activities were profiled.
==3201== Warning: Some profiling data are not recorded. Make sure cudaProfilerStop() or cuProfilerStop() is called before application exit to flush profile data.
The latter warning is not my main problem or the topic of my question; my problem is the messages saying that no kernels were profiled and no API activities were profiled.
Does this mean that the program ran entirely on my CPU? Or is it an error in nvprof?
I have found a discussion about the same error here, but there the answer was that the wrong version of CUDA was installed, whereas in my case the installed version is the latest one available through the system's package manager (version 10.1.243-1).
Is there any way I can get nvprof to display the expected output?
Edit
Trying to adhere to the warning at the end does not solve the problem:
Adding a call to cudaProfilerStop() (or cuProfilerStop()), also adding cudaDeviceReset(); at the end as suggested, including the appropriate header (cuda_profiler_api.h or cudaProfiler.h), and compiling with
> nvcc cuda_test.cu -o cuda_test -lcuda
yields a program which still runs, but which, when nvprof is run on it, returns:
==12558== NVPROF is profiling process 12558, command: ./cuda_test
Max error: 0
==12558== Profiling application: ./cuda_test
==12558== Profiling result:
No kernels were profiled.
No API activities were profiled.
==12558== Warning: Some profiling data are not recorded. Make sure cudaProfilerStop() or cuProfilerStop() is called before application exit to flush profile data.
======== Error: Application received signal 139
This has not solved the original problem, and has in fact created a new error; the same happens when cudaProfilerStop() is used on its own or alongside cuProfilerStop() and cudaDeviceReset().
The code
The code is, as mentioned, copied from a tutorial to test whether CUDA is working, though I have also included calls to cudaProfilerStop() and cudaDeviceReset(). For clarity, it is included here:
#include <iostream>
#include <math.h>
#include <cuda_profiler_api.h>

// Kernel function to add the elements of two arrays
__global__
void add(int n, float *x, float *y)
{
    int index = threadIdx.x;
    int stride = blockDim.x;
    for (int i = index; i < n; i += stride)
        y[i] = x[i] + y[i];
}

int main(void)
{
    int N = 1<<20;
    float *x, *y;

    cudaProfilerStart();

    // Allocate Unified Memory – accessible from CPU or GPU
    cudaMallocManaged(&x, N*sizeof(float));
    cudaMallocManaged(&y, N*sizeof(float));

    // initialize x and y arrays on the host
    for (int i = 0; i < N; i++) {
        x[i] = 1.0f;
        y[i] = 2.0f;
    }

    // Run kernel on 1M elements on the GPU
    add<<<1, 1>>>(N, x, y);

    // Wait for GPU to finish before accessing on host
    cudaDeviceSynchronize();

    // Check for errors (all values should be 3.0f)
    float maxError = 0.0f;
    for (int i = 0; i < N; i++)
        maxError = fmax(maxError, fabs(y[i]-3.0f));
    std::cout << "Max error: " << maxError << std::endl;

    // Free memory
    cudaFree(x);
    cudaFree(y);

    cudaDeviceReset();
    cudaProfilerStop();
    return 0;
}
This problem was apparently somewhat well known. After some searching I found this thread about the error code in the edited version; the solution, as discussed there, is to call nvprof with the flag --unified-memory-profiling off:
> sudo nvprof --unified-memory-profiling off ./cuda_test
This makes nvprof work as expected, even without the call to cudaProfilerStop().
You can solve the problem by using
sudo nvprof --unified-memory-profiling per-process-device <your program>
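As a side note, the tutorial code above contains no error checking, and a kernel that fails to launch (for example because of an architecture mismatch) would also show up as "no kernels were profiled". A small sketch of how the launch in the code above could be checked (this is an addition, not part of the tutorial): replace the plain add<<<1, 1>>>(N, x, y); line with something like the following.
// Addition, not part of the tutorial: verify the kernel actually launched and ran.
add<<<1, 1>>>(N, x, y);
cudaError_t launchErr = cudaGetLastError();       // errors detected at launch time
cudaError_t syncErr   = cudaDeviceSynchronize();  // errors detected during execution
if (launchErr != cudaSuccess)
    std::cout << "Launch failed: " << cudaGetErrorString(launchErr) << std::endl;
if (syncErr != cudaSuccess)
    std::cout << "Kernel failed: " << cudaGetErrorString(syncErr) << std::endl;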

CUDA: Forgetting kernel launch configuration does not result in NVCC compiler warning or error

When I try to call a CUDA kernel (a __global__ function) using a function pointer, everything appears to work just fine. However, if I forget to provide the launch configuration when calling the kernel, NVCC does not emit an error or warning; the program compiles, and then crashes when I attempt to run it.
__global__ void bar(float x) { printf("foo: %f\n", x); }

typedef void(*FuncPtr)(float);

void invoker(FuncPtr func)
{
    func<<<1, 1>>>(1.0);
}

invoker(bar);
cudaDeviceSynchronize();
Compile and run the above. Everything will work just fine. Then, remove the kernel's launch configuration (i.e., <<<1, 1>>>). The code will compile just fine but it will crash when you try to run it.
Any idea what is going on? Is this a bug, or I am not supposed to pass around pointers of __global__ functions?
CUDA version: 8.0
OS version: Debian (Testing repo)
GPU: NVIDIA GeForce 750M
If we take a slightly more complex version of your repro, and look at the code emitted by the CUDA toolchain front-end, it becomes possible to see what is happening:
#include <cstdio>

__global__ void bar_func(float x) { printf("foo: %f\n", x); }
typedef void(*FuncPtr)(float);

void invoker(FuncPtr passed_func)
{
#ifdef NVCC_FAILS_HERE
    bar_func(1.0);
#endif
    bar_func<<<1,1>>>(1.0);
    passed_func(1.0);
    passed_func<<<1,1>>>(2.0);
}
So let's compile it a couple of ways:
$ nvcc -arch=sm_52 -c -DNVCC_FAILS_HERE invoker.cu
invoker.cu(10): error: a __global__ function call must be configured
i.e. the front-end can detect that bar_func is a global function and requires launch parameters. Another attempt:
$ nvcc -arch=sm_52 -c -keep invoker.cu
As you note, this produces no compile error. Let's look at what happened:
void bar_func(float x) ;
# 5 "invoker.cu"
typedef void (*FuncPtr)(float);
# 7 "invoker.cu"
void invoker(FuncPtr passed_func)
# 8 "invoker.cu"
{
# 12 "invoker.cu"
(cudaConfigureCall(1, 1)) ? (void)0 : (bar_func)((1.0));
# 13 "invoker.cu"
passed_func((2.0));
# 14 "invoker.cu"
(cudaConfigureCall(1, 1)) ? (void)0 : passed_func((3.0));
# 15 "invoker.cu"
}
The standard kernel invocation syntax <<<>>> gets expanded into an inline call to cudaConfigureCall, and then a host wrapper function is called. The host wrapper has the API internals required to launch the kernel:
void bar_func( float __cuda_0)
# 3 "invoker.cu"
{__device_stub__Z8bar_funcf( __cuda_0); }
void __device_stub__Z8bar_funcf(float __par0)
{
if (cudaSetupArgument((void *)(char *)&__par0, sizeof(__par0), (size_t)0UL) != cudaSuccess) return;
{ volatile static char *__f __attribute__((unused)); __f = ((char *)((void ( *)(float))bar_func));
(void)cudaLaunch(((char *)((void ( *)(float))bar_func)));
};
}
So the stub only handles arguments and launches the kernel via cudaLaunch. It doesn't handle the launch configuration.
The underlying reason for the crash (actually an undetected runtime API error) is that the kernel launch happens without a prior configuration. Obviously this happens because the CUDA front end (and C++ for that matter) can't do pointer introspection at compile time and detect that your function pointer is a stub function for calling a kernel.
I think the only way to describe this is a "limitation" of the runtime API and compiler. I wouldn't say what you are doing is wrong, but I would probably be using the driver API and explicitly managing the kernel launch myself in such a situation.
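For completeness, here is a rough sketch of what "using the driver API and explicitly managing the kernel launch" could look like. The module file name and kernel name below are hypothetical and error checking is omitted; the point is that with cuLaunchKernel the launch configuration is an explicit function argument, so it cannot be forgotten.
#include <cuda.h>
#include <cstdio>

int main()
{
    cuInit(0);
    CUdevice dev;
    CUcontext ctx;
    cuDeviceGet(&dev, 0);
    cuCtxCreate(&ctx, 0, dev);

    // Hypothetical module file and kernel name; a __global__ function compiled
    // without extern "C" will have a mangled name in the module.
    CUmodule mod;
    CUfunction kernel;
    cuModuleLoad(&mod, "kernels.cubin");
    cuModuleGetFunction(&kernel, mod, "bar_func");

    float arg = 1.0f;
    void *params[] = { &arg };

    // Grid and block dimensions are explicit parameters of cuLaunchKernel.
    cuLaunchKernel(kernel,
                   1, 1, 1,      // grid dimensions
                   1, 1, 1,      // block dimensions
                   0, 0,         // shared memory bytes, stream
                   params, nullptr);
    cuCtxSynchronize();

    cuModuleUnload(mod);
    cuCtxDestroy(ctx);
    printf("done\n");
    return 0;
}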

Is there an API to get a more precise time for profiling including operating system?

In the Flash API there is the getTimer() method that returns the time in milliseconds since the SWF started. I was writing PHP recently, and there is a much more precise function called microtime() that returns the current Unix timestamp in microseconds. Does Flash have anything more precise than getTimer()?
Update:
If Flash hasn't yet added a more precise profile API is it possible to use ExternalInterface or make a process call to maybe get an operating system API? I have both desktop and browser applications and in desktop I can access native processes.
Note: This answer is in response to the comment thread
This is a command-line tool to access clock_gettime() on Linux and clock_get_time() on OS X. I use it on OS X with a couple of bash scripts, but you can call it using native processes with AIR.
>> ./nsTime ; ./nsTime
s: 1446019237
ns: 99241000
s: 1446019237
ns: 104217000
Compile:
OS-X :
gcc -o nsTime nsTime.c
Linux:
gcc -o nsTime nsTime.c -lrt
C Code:
#include <time.h>
#include <sys/time.h>
#include <stdio.h>
#ifdef __MACH__
#include <mach/clock.h>
#include <mach/mach.h>
#endif

void current_utc_time(struct timespec *ts) {
#ifdef __MACH__ // OS X does not have clock_gettime, use clock_get_time
    clock_serv_t cclock;
    mach_timespec_t mts;
    host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock);
    clock_get_time(cclock, &mts);
    mach_port_deallocate(mach_task_self(), cclock);
    ts->tv_sec = mts.tv_sec;
    ts->tv_nsec = mts.tv_nsec;
#else
    clock_gettime(CLOCK_REALTIME, ts);
#endif
}

int main(int argc, char **argv) {
    struct timespec ts;
    current_utc_time(&ts);
    printf("s: %lu\n", ts.tv_sec);
    printf("ns: %lu\n", ts.tv_nsec);
    return 0;
}
Disclaimer: This code came from the Intertubes, not sure of its original author
Take a look at Jackson Dunstan's sub-millisecond Timer that he wrote about in 2013. I haven't tried it myself, so I can't say if it's any good, but it tries to address your issues with the standard AS3 Timer.

Dynamic parallelism cudaDeviceSynchronize() crashes

I have a kernel which calls another, empty kernel. However, when the calling kernel calls cudaDeviceSynchronize(), the kernel crashes and execution goes straight to the host. The memory checker does not report any memory access issues.
Does anyone know what could be the reason for such uncivilized behavior?
The crash seems to happen only if I run the code from the debugger (Visual Studio -> Nsight -> Start CUDA Debugging).
The crash does not happen every time I run the code - sometimes it crashes, and sometimes it finishes ok.
Here is the complete code to reproduce the problem:
#include <cuda_runtime.h>
#include <curand_kernel.h>
#include "device_launch_parameters.h"
#include <stdio.h>

#define CUDA_RUN(x_, err_) {cudaStatus = x_; if (cudaStatus != cudaSuccess) {fprintf(stderr, err_ " %d - %s\n", cudaStatus, cudaGetErrorString(cudaStatus)); int k; scanf("%d", &k); goto Error;}}

struct computationalStorage {
    float rotMat;
};

__global__ void drawThetaFromDistribution() {}

__global__ void chainKernel() {
    computationalStorage* c = (computationalStorage*)malloc(sizeof(computationalStorage));
    if (!c) printf("malloc error\n");

    c->rotMat = 1.0f;

    int n = 1;
    while (n < 1000) {
        cudaError_t err;
        drawThetaFromDistribution<<<1, 1>>>();
        if ((err = cudaGetLastError()) != cudaSuccess)
            printf("drawThetaFromDistribution Sync kernel error: %s\n", cudaGetErrorString(err));
        printf("0");
        if ((err = cudaDeviceSynchronize()) != cudaSuccess)
            printf("drawThetaFromDistribution Async kernel error: %s\n", cudaGetErrorString(err));
        printf("1\n");
        ++n;
    }

    free(c);
}

int main() {
    cudaError_t cudaStatus;

    // Choose which GPU to run on, change this on a multi-GPU system.
    CUDA_RUN(cudaSetDevice(0), "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
    // Set to use on chip memory 16KB for shared, 48KB for L1
    CUDA_RUN(cudaDeviceSetCacheConfig(cudaFuncCachePreferL1), "Can't set CUDA to use on chip memory for L1");
    // Set a large heap
    CUDA_RUN(cudaDeviceSetLimit(cudaLimitMallocHeapSize, 1024 * 10 * 192), "Can't set the Heap size");

    chainKernel<<<10, 192>>>();

    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        printf("Something was wrong! Error code: %d", cudaStatus);
    }

    CUDA_RUN(cudaDeviceReset(), "cudaDeviceReset failed!");

Error:
    int k;
    scanf("%d", &k);
    return 0;
}
If all goes well I expect to see:
00000000000000000000000....0000000000000001
1
1
1
1
....
This is what I get when everything works ok. When it crashes however:
000000000000....0000000000000Something was wrong! Error code: 30
As you can see, the statement err = cudaDeviceSynchronize(); does not finish, and execution goes straight to the host, where its own cudaDeviceSynchronize() fails with an unknown error code (30 = cudaErrorUnknown).
System: CUDA 5.5, NVidia-Titan(Headless), Windows 7x64, Win32 application.
UPDATE: additional Nvidia card driving the display, Nsight 3.2.0.13289.
That last fact may have been the critical one. You don't mention which version of nsight VSE you are using nor your exact machine config (e.g. are there other GPUs in the machine, if so, which is driving the display?), but at least up till recently it was not possible to debug a dynamic parallelism application in single-GPU mode with nsight VSE.
The current feature matrix also suggests that single-GPU CDP debugging is not yet supported.
One possible workaround in your case would probably be to add another GPU to drive the display and make the Titan card headless (i.e. don't attach any monitors to it and don't extend the Windows desktop onto that GPU).
I ran your application with and without cuda-memcheck and it does not appear to me that there are any problems with it.
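If you do end up with a second GPU driving the display, one heuristic for selecting the headless card from code (this is not from the original answer, just a sketch) is the kernelExecTimeoutEnabled device property, which is normally set on GPUs that are driving a display:
#include <cuda_runtime.h>
#include <stdio.h>

int main()
{
    int count = 0;
    cudaGetDeviceCount(&count);
    for (int i = 0; i < count; ++i) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, i);
        printf("Device %d: %s, kernel timeout %s\n",
               i, prop.name, prop.kernelExecTimeoutEnabled ? "enabled" : "disabled");
        // Prefer a device without the display watchdog for the CDP work.
        if (!prop.kernelExecTimeoutEnabled) {
            cudaSetDevice(i);
            break;
        }
    }
    return 0;
}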

Execution time issue in CUDA benchmarks

I am trying to profile some CUDA Rodinia benchmarks in terms of their SM and memory utilization, power consumption, etc. For that, I simultaneously execute the benchmark and the profiler, which essentially spawns a pthread to profile the GPU execution using the NVML library.
The issue is that the execution time of a benchmark is much higher (about 3 times) when I do not invoke the profiler along with it than when the benchmark executes with the profiler running. The frequency scaling governor for the CPU is userspace, so I do not think the CPU frequency is changing. Is it due to fluctuations in the GPU frequency?
Below is the code for the profiler.
#include <pthread.h>
#include <stdio.h>
#include "nvml.h"
#include "unistd.h"

#define NUM_THREADS 1

void *PrintHello(void *threadid)
{
    long tid;
    tid = (long)threadid;
    // printf("Hello World! It's me, thread #%ld!\n", tid);

    nvmlReturn_t result;
    nvmlDevice_t device;
    nvmlUtilization_t utilization;
    nvmlClockType_t jok;
    unsigned int device_count, i, powergpu, clo;
    char version[80];

    result = nvmlInit();
    result = nvmlSystemGetDriverVersion(version, 80);
    printf("\n Driver version: %s \n\n", version);

    result = nvmlDeviceGetCount(&device_count);
    printf("Found %d device%s\n\n", device_count,
           device_count != 1 ? "s" : "");
    printf("Listing devices:\n");

    result = nvmlDeviceGetHandleByIndex(0, &device);

    while (1)
    {
        result = nvmlDeviceGetPowerUsage(device, &powergpu);
        result = nvmlDeviceGetUtilizationRates(device, &utilization);
        printf("\n%d\n", powergpu);
        if (result == NVML_SUCCESS)
        {
            printf("%d\n", utilization.gpu);
            printf("%d\n", utilization.memory);
        }
        result = nvmlDeviceGetClockInfo(device, NVML_CLOCK_SM, &clo);
        if (result == NVML_SUCCESS)
        {
            printf("%d\n", clo);
        }
        usleep(500000);
    }

    pthread_exit(NULL);
}

int main(int argc, char *argv[])
{
    pthread_t threads[NUM_THREADS];
    int rc;
    long t;
    for (t = 0; t < NUM_THREADS; t++) {
        printf("In main: creating thread %ld\n", t);
        rc = pthread_create(&threads[t], NULL, PrintHello, (void *)t);
        if (rc) {
            printf("ERROR; return code from pthread_create() is %d\n", rc);
            exit(-1);
        }
    }

    /* Last thing that main() should do */
    pthread_exit(NULL);
}
With your profiler running, the GPU(s) are being pulled out of their sleep state (due to the access to the nvml API, which is querying data from the GPUs). This makes them respond much more quickly to a CUDA application, and so the application appears to run "faster" if you time the entire application execution (e.g. using the linux time command).
One solution is to place the GPUs in "persistence mode" with the nvidia-smi command (use nvidia-smi --help to get command line help).
Another solution would be to do the timing from within the application, and exclude the CUDA start-up time from the timing measurement, perhaps by executing a cuda command such as cudaFree(0); prior to the start of timing.
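A minimal sketch of that second suggestion (not taken from the answer itself): pay the start-up cost with cudaFree(0) before timing begins, then time only the region of interest with CUDA events. Persistence mode, the first suggestion, can typically be enabled with a command along the lines of nvidia-smi -pm 1.
#include <cuda_runtime.h>
#include <stdio.h>

// Placeholder kernel standing in for the benchmark's real work.
__global__ void work() {}

int main()
{
    // Force context creation so the CUDA start-up cost is excluded from the timing.
    cudaFree(0);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    work<<<1, 1>>>();            // the region you actually want to measure
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    printf("Measured time: %.3f ms (start-up excluded)\n", ms);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return 0;
}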