Asynchronous texture object allocation in multi-GPU code - cuda

I have some code for texture object allocation and host-to-device copy. It is just a modification of the answer here. I do not explicitly use streams, just cudaSetDevice().
This code works fine; however, when I run the Visual Profiler, I can see that the memory copies from host to array are not asynchronous. They are each assigned to their own device stream, but the second one does not start until the first one finishes (running on 2 GPUs). I have tried it with large images, so I am certain it's not CPU overhead.
My guess is that something in the code is synchronous and therefore stalls the CPU, but I don't know what. What can I do to make this loop asynchronous?
An MCVE:
void CreateTexture(int num_devices, float* imagedata, int nVoxelX, int nVoxelY, int nVoxelZ, cudaArray** d_cuArrTex, cudaTextureObject_t *texImage);

int main(void)
{
    int deviceCount = 0;
    cudaGetDeviceCount(&deviceCount);

    int nVoxelX = 512;
    int nVoxelY = 512;
    int nVoxelZ = 512;
    float* image = (float*)malloc(nVoxelX*nVoxelY*nVoxelZ*sizeof(float));

    cudaTextureObject_t *texImg = new cudaTextureObject_t[deviceCount];
    cudaArray **d_cuArrTex = new cudaArray*[deviceCount];

    CreateTexture(deviceCount, image, nVoxelX, nVoxelY, nVoxelZ, d_cuArrTex, texImg);
}
Actual function:
void CreateTexture(int num_devices, float* imagedata, int nVoxelX, int nVoxelY, int nVoxelZ, cudaArray** d_cuArrTex, cudaTextureObject_t *texImage)
{
    //size_t size_image=nVoxelX*nVoxelY*nVoxelZ;
    for (unsigned int i = 0; i < num_devices; i++){
        cudaSetDevice(i);

        //cudaArray Descriptor
        const cudaExtent extent = make_cudaExtent(nVoxelX, nVoxelY, nVoxelZ);
        cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
        //cuda Array
        cudaMalloc3DArray(&d_cuArrTex[i], &channelDesc, extent);
        //cudaCheckErrors("Texture memory allocation fail");

        cudaMemcpy3DParms copyParams = {0};
        //Array creation
        copyParams.srcPtr = make_cudaPitchedPtr((void *)imagedata, extent.width*sizeof(float), extent.width, extent.height);
        copyParams.dstArray = d_cuArrTex[i];
        copyParams.extent = extent;
        copyParams.kind = cudaMemcpyHostToDevice;
        cudaMemcpy3DAsync(&copyParams);
        //cudaCheckErrors("Texture memory data copy fail");
        //Array creation End

        cudaResourceDesc texRes;
        memset(&texRes, 0, sizeof(cudaResourceDesc));
        texRes.resType = cudaResourceTypeArray;
        texRes.res.array.array = d_cuArrTex[i];

        cudaTextureDesc texDescr;
        memset(&texDescr, 0, sizeof(cudaTextureDesc));
        texDescr.normalizedCoords = false;
        texDescr.filterMode = cudaFilterModePoint;
        texDescr.addressMode[0] = cudaAddressModeBorder;
        texDescr.addressMode[1] = cudaAddressModeBorder;
        texDescr.addressMode[2] = cudaAddressModeBorder;
        texDescr.readMode = cudaReadModeElementType;

        cudaCreateTextureObject(&texImage[i], &texRes, &texDescr, NULL);
        //cudaCheckErrors("Texture object creation fail");
    }
}

The two main problems I can see with the code are:
Your host allocation is a pageable allocation. Asynchrony of copy operations in CUDA where one of the targets is host memory requires a pinned allocation for the host memory.
You have other synchronizing operations in your create textures loop. Device allocation operations (cudaMalloc3DArray in this case) are synchronizing, in my experience. I haven't run tests to determine if cudaCreateTextureObject is synchronizing, but I wouldn't be surprised if it was. Therefore my recommendation for asynchrony in general is to get synchronizing operations out of the loop.
In your case, we can refactor your code as follows, which seems to allow overlap of operations from the perspective of nvprof:
$ cat t399.cu
void CreateTexture(int num_devices, float* imagedata, int nVoxelX, int nVoxelY, int nVoxelZ, cudaArray** d_cuArrTex, cudaTextureObject_t *texImage)
{
    //size_t size_image=nVoxelX*nVoxelY*nVoxelZ;
    const cudaExtent extent = make_cudaExtent(nVoxelX, nVoxelY, nVoxelZ);
    for (unsigned int i = 0; i < num_devices; i++){
        cudaSetDevice(i);

        //cudaArray Descriptor
        cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
        //cuda Array
        cudaMalloc3DArray(&d_cuArrTex[i], &channelDesc, extent);
        //cudaCheckErrors("Texture memory allocation fail");
    }
    for (unsigned int i = 0; i < num_devices; i++){
        cudaSetDevice(i);

        cudaMemcpy3DParms copyParams = {0};
        //Array creation
        copyParams.srcPtr = make_cudaPitchedPtr((void *)imagedata, extent.width*sizeof(float), extent.width, extent.height);
        copyParams.dstArray = d_cuArrTex[i];
        copyParams.extent = extent;
        copyParams.kind = cudaMemcpyHostToDevice;
        cudaMemcpy3DAsync(&copyParams);
        //cudaCheckErrors("Texture memory data copy fail");
    }
    for (unsigned int i = 0; i < num_devices; i++){
        cudaSetDevice(i);

        //Array creation End
        cudaResourceDesc texRes;
        memset(&texRes, 0, sizeof(cudaResourceDesc));
        texRes.resType = cudaResourceTypeArray;
        texRes.res.array.array = d_cuArrTex[i];

        cudaTextureDesc texDescr;
        memset(&texDescr, 0, sizeof(cudaTextureDesc));
        texDescr.normalizedCoords = false;
        texDescr.filterMode = cudaFilterModePoint;
        texDescr.addressMode[0] = cudaAddressModeBorder;
        texDescr.addressMode[1] = cudaAddressModeBorder;
        texDescr.addressMode[2] = cudaAddressModeBorder;
        texDescr.readMode = cudaReadModeElementType;

        cudaCreateTextureObject(&texImage[i], &texRes, &texDescr, NULL);
        //cudaCheckErrors("Texture object creation fail");
    }
    for (unsigned int i = 0; i < num_devices; i++){
        cudaSetDevice(i);
        cudaDeviceSynchronize();
    }
}

int main(void)
{
    int deviceCount = 0;
    cudaGetDeviceCount(&deviceCount);

    int nVoxelX = 512;
    int nVoxelY = 512;
    int nVoxelZ = 512;

    float* image;
    cudaHostAlloc(&image, nVoxelX*nVoxelY*nVoxelZ*sizeof(float), cudaHostAllocDefault);

    cudaTextureObject_t *texImg = new cudaTextureObject_t[deviceCount];
    cudaArray **d_cuArrTex = new cudaArray*[deviceCount];

    CreateTexture(deviceCount, image, nVoxelX, nVoxelY, nVoxelZ, d_cuArrTex, texImg);
}
$ nvcc -o t399 t399.cu
$ cuda-memcheck ./t399
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$ nvprof --print-gpu-trace ./t399
==19953== NVPROF is profiling process 19953, command: ./t399
==19953== Profiling application: ./t399
==19953== Profiling result:
Start Duration Grid Size Block Size Regs* SSMem* DSMem* Size Throughput SrcMemType DstMemType Device Context Stream Name
1.55311s 90.735ms - - - - - 512.00MB 5.5106GB/s Pinned Array Tesla P100-PCIE 1 7 [CUDA memcpy HtoA]
1.55316s 90.640ms - - - - - 512.00MB 5.5163GB/s Pinned Array Tesla K40m (1) 2 18 [CUDA memcpy HtoA]
1.55318s 85.962ms - - - - - 512.00MB 5.8165GB/s Pinned Array Tesla K20Xm (2) 3 29 [CUDA memcpy HtoA]
1.55320s 89.908ms - - - - - 512.00MB 5.5612GB/s Pinned Array Tesla K20Xm (3) 4 40 [CUDA memcpy HtoA]
Regs: Number of registers used per CUDA thread. This number includes registers used internally by the CUDA driver and/or tools and can be more than what the compiler shows.
SSMem: Static shared memory allocated per CUDA block.
DSMem: Dynamic shared memory allocated per CUDA block.
SrcMemType: The type of source memory accessed by memory operation/copy
DstMemType: The type of destination memory accessed by memory operation/copy
$
My system here is a 4-GPU system with two GPUs hanging off each of two root ports. Therefore the host-to-device pinned transfer bandwidth of about 10 GB/s per PCIe Gen3 root port is, from the profiler's perspective, split between the two GPUs on each port, but careful study of the profiler start and duration times for the transfers indicates that all four copies are overlapped.
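As a follow-on illustration (not part of the original answer): once CreateTexture returns, each device holds its own copy of the volume in a cudaArray, so a per-device kernel can sample it through its texture object with tex3D. The kernel name, output buffers (d_out[i]), and launch configuration below are my own assumptions, shown only as a minimal sketch:

// Hedged sketch: reading the per-device texture objects created above.
// d_out[i] is assumed to be a float buffer allocated on device i.
__global__ void sampleVolume(cudaTextureObject_t tex, float *out,
                             int nx, int ny, int nz)
{
    int x = blockIdx.x*blockDim.x + threadIdx.x;
    int y = blockIdx.y*blockDim.y + threadIdx.y;
    int z = blockIdx.z*blockDim.z + threadIdx.z;
    if (x < nx && y < ny && z < nz)
        // +0.5f addresses the texel center (unnormalized coordinates, point filtering)
        out[(z*ny + y)*nx + x] = tex3D<float>(tex, x + 0.5f, y + 0.5f, z + 0.5f);
}

// launched once per device, e.g.:
// dim3 block(8, 8, 8);
// dim3 grid((nVoxelX+7)/8, (nVoxelY+7)/8, (nVoxelZ+7)/8);
// for (int i = 0; i < deviceCount; i++) {
//     cudaSetDevice(i);
//     sampleVolume<<<grid, block>>>(texImg[i], d_out[i], nVoxelX, nVoxelY, nVoxelZ);
// }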

Related

Large iteration loops in CUDA kernels causes memory error [duplicate]

I have a kernel to calculate different elements of a matrix, based on their position (diagonal or off-diagonal). The kernel works as expected when calculating matrices of sizes:
14 x 14 (I understand this is small and does not make proper use of the GPU resources but this was purely for testing purposes to ensure results were correct)
118 x 118, and
300 x 300
However, when I am trying to calculate a matrix of size 2383 x 2383, the kernel crashes. Specifically, the error "Unspecified launch failure" is thrown on the cudaMemcpy() line to return results from device to host. From research, I understand that this error usually arises in the case of an out of bounds memory access (e.g. in an array), however, what I don't get is that it works for the three previous cases but not for the 2383 x 2383 case. The kernel code is shown below:
__global__ void createYBus(float *R, float *X, float *B, int numberOfBuses, int numberOfBranches, int *fromBus, int *toBus, cuComplex *y)
{
    int rowIdx = blockIdx.y*blockDim.y + threadIdx.y;
    int colIdx = blockIdx.x*blockDim.x + threadIdx.x;
    int index = rowIdx*numberOfBuses + colIdx;

    if (rowIdx<numberOfBuses && colIdx<numberOfBuses)
    {
        for (int i=0; i<numberOfBranches; ++i)
        {
            if (rowIdx==fromBus[i] && colIdx==fromBus[i]) { //diagonal element
                y[index] = cuCaddf(y[index], make_cuComplex((R[i]/((R[i]*R[i])+(X[i]*X[i]))), (-(X[i]/((R[i]*R[i])+(X[i]*X[i])))+ (B[i]/2))));
            }
            if (rowIdx==toBus[i] && colIdx==toBus[i]) { //diagonal element
                y[index] = cuCaddf(y[index], make_cuComplex((R[i]/((R[i]*R[i])+(X[i]*X[i]))), (-(X[i]/((R[i]*R[i])+(X[i]*X[i])))+ (B[i]/2))));
            }
            if (rowIdx==fromBus[i] && colIdx==toBus[i]) { //off-diagonal element
                y[index] = make_cuComplex(-(R[i]/((R[i]*R[i])+(X[i]*X[i]))), X[i]/((R[i]*R[i])+(X[i]*X[i])));
            }
            if (rowIdx==toBus[i] && colIdx==fromBus[i]) { //off-diagonal element
                y[index] = make_cuComplex(-(R[i]/((R[i]*R[i])+(X[i]*X[i]))), X[i]/((R[i]*R[i])+(X[i]*X[i])));
            }
        }
    }
}
Global memory allocations are done via calls to cudaMalloc(). The allocations made in the code are as follows:
cudaStat1 = cudaMalloc((void**)&dev_fromBus, numLines*sizeof(int));
cudaStat2 = cudaMalloc((void**)&dev_toBus, numLines*sizeof(int));
cudaStat3 = cudaMalloc((void**)&dev_R, numLines*sizeof(float));
cudaStat4 = cudaMalloc((void**)&dev_X, numLines*sizeof(float));
cudaStat5 = cudaMalloc((void**)&dev_B, numLines*sizeof(float));
cudaStat6 = cudaMalloc((void**)&dev_y, numberOfBuses*numberOfBuses*sizeof(cuComplex));
cudaStat7 = cudaMalloc((void**)&dev_Pd, numberOfBuses*sizeof(float));
cudaStat8 = cudaMalloc((void**)&dev_Qd, numberOfBuses*sizeof(float));
cudaStat9 = cudaMalloc((void**)&dev_Vmag, numberOfBuses*sizeof(float));
cudaStat10 = cudaMalloc((void**)&dev_theta, numberOfBuses*sizeof(float));
cudaStat11 = cudaMalloc((void**)&dev_Peq, numberOfBuses*sizeof(float));
cudaStat12 = cudaMalloc((void**)&dev_Qeq, numberOfBuses*sizeof(float));
cudaStat13 = cudaMalloc((void**)&dev_Peq1, numberOfBuses*sizeof(float));
cudaStat14 = cudaMalloc((void**)&dev_Qeq1, numberOfBuses*sizeof(float));
...
...
cudaStat15 = cudaMalloc((void**)&dev_powerMismatch, jacSize*sizeof(float));
cudaStat16 = cudaMalloc((void**)&dev_jacobian, jacSize*jacSize*sizeof(float));
cudaStat17 = cudaMalloc((void**)&dev_stateVector, jacSize*sizeof(float));
cudaStat18 = cudaMalloc((void**)&dev_PQindex, jacSize*sizeof(int));
where cudaStatN are of type cudaError_t to catch errors. The last four allocations were done later on in the code and are for another kernel. However these allocations were done before the kernel in question was called.
The launch parameters are as follows:
dim3 dimBlock(16, 16); //number of threads
dim3 dimGrid((numberOfBuses+15)/16, (numberOfBuses+15)/16); //number of blocks
//launch kernel once data has been copied to GPU
createYBus<<<dimGrid, dimBlock>>>(dev_R, dev_X, dev_B, numberOfBuses, numLines, dev_fromBus, dev_toBus, dev_y);
//copy results back to CPU
cudaStat6 = cudaMemcpy(y_bus, dev_y, numberOfBuses*numberOfBuses*sizeof(cuComplex), cudaMemcpyDeviceToHost);
if (cudaStat6 != cudaSuccess) {
    cout<<"Device memcpy failed"<<endl;
    cout<<cudaGetErrorString(cudaStat6)<<endl;
    return 1;
}
I removed the timing code just to show the block and grid dimensions and error checking technique used.
I also have a host (C++ code) version of this function and I'm passing the data to both functions and then comparing results, firstly, to ensure the kernel produces correct results, and secondly in terms of execution time to compare performance. I have double checked the data for the 2383 x 2383 case (it's being read in from a text file and copied to global memory) and I'm not finding any anomalies in array accesses/indexing.
I'm using Visual Studio 2010, so I tried using Nsight to find the error (I'm not too well-versed with Nsight). The summary report overview states: "There was 1 runtime API call error reported. (Please see the CUDA Runtime API Calls report for further information.)" In the list of runtime API calls, cudaMemcpy returns error 4. I am not sure if the Thread ID (5012) is of any significance in the table; this number varies with every run. The CUDA memcheck tool (on the command line) returns the following:
Thank you for using this program
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaMemcpy.
========= Saved host backtrace up to driver entry point at error
=========
========= ERROR SUMMARY: 1 error
I know my kernel isn't the most efficient as there are many global memory accesses. Why is the kernel crashing for this larger matrix? Is there an out of bounds array access that I'm missing? Any assistance would be greatly appreciated.
Solved the problem. It turns out the WDDM TDR (timeout detection and recovery) was enabled and the delay was set to 2 seconds. This means that if the kernel execution time exceeds 2 s, the driver will crash and recover. This mechanism is intended for graphics and rendering; for general-purpose uses of the GPU, the TDR must either be disabled or the delay increased. By increasing the delay to 10 s, the crash error "unspecified launch failure" ceased to appear and kernel execution continued as before.
The TDR delay (as well as enabling/disabling it) can be set through the Nsight options in the Nsight Monitor, or through the registry (HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Control\GraphicsDrivers) via the DWORD values TdrDelay and TdrLevel.
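As an aside not covered in the answer above, and purely as a hedged sketch: the CUDA runtime can report whether a given device is subject to a run-time limit at all, which is a convenient sanity check before launching long-running kernels.

#include <cstdio>
#include <cuda_runtime.h>

// Minimal sketch: query whether device 0 has a kernel run time limit
// (typically true for display-attached WDDM devices subject to TDR).
int main()
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    if (prop.kernelExecTimeoutEnabled)
        printf("Device 0 has a kernel run time limit (watchdog/TDR) enabled.\n");
    else
        printf("Device 0 has no kernel run time limit (e.g. TCC or compute-only GPU).\n");
    return 0;
}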
I tried to reproduce your code with the following complete example. The code compiles and runs with no error.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include "cuComplex.h"
__global__ void createYBus(float *R, float *X, float *B, int numberOfBuses, int numberOfBranches, int *fromBus, int *toBus, cuComplex *y)
{
    int rowIdx = blockIdx.y*blockDim.y + threadIdx.y;
    int colIdx = blockIdx.x*blockDim.x + threadIdx.x;
    int index = rowIdx*numberOfBuses + colIdx;

    if (rowIdx<numberOfBuses && colIdx<numberOfBuses)
    {
        for (int i=0; i<numberOfBranches; ++i)
        {
            if (rowIdx==fromBus[i] && colIdx==fromBus[i]) { //diagonal element
                y[index] = cuCaddf(y[index], make_cuComplex((R[i]/((R[i]*R[i])+(X[i]*X[i]))), (-(X[i]/((R[i]*R[i])+(X[i]*X[i])))+ (B[i]/2))));
            }
            if (rowIdx==toBus[i] && colIdx==toBus[i]) { //diagonal element
                y[index] = cuCaddf(y[index], make_cuComplex((R[i]/((R[i]*R[i])+(X[i]*X[i]))), (-(X[i]/((R[i]*R[i])+(X[i]*X[i])))+ (B[i]/2))));
            }
            if (rowIdx==fromBus[i] && colIdx==toBus[i]) { //off-diagonal element
                y[index] = make_cuComplex(-(R[i]/((R[i]*R[i])+(X[i]*X[i]))), X[i]/((R[i]*R[i])+(X[i]*X[i])));
            }
            if (rowIdx==toBus[i] && colIdx==fromBus[i]) { //off-diagonal element
                y[index] = make_cuComplex(-(R[i]/((R[i]*R[i])+(X[i]*X[i]))), X[i]/((R[i]*R[i])+(X[i]*X[i])));
            }
        }
    }
}

int main()
{
    int numLines = 32;
    int numberOfBuses = 2383;

    int *dev_fromBus, *dev_toBus;
    float *dev_R, *dev_X, *dev_B;
    cuComplex *dev_y;

    cudaMalloc((void**)&dev_fromBus, numLines*sizeof(int));
    cudaMalloc((void**)&dev_toBus, numLines*sizeof(int));
    cudaMalloc((void**)&dev_R, numLines*sizeof(float));
    cudaMalloc((void**)&dev_X, numLines*sizeof(float));
    cudaMalloc((void**)&dev_B, numLines*sizeof(float));
    cudaMalloc((void**)&dev_y, numberOfBuses*numberOfBuses*sizeof(cuComplex));

    dim3 dimBlock(16, 16); //number of threads
    dim3 dimGrid((numberOfBuses+15)/16, (numberOfBuses+15)/16); //number of blocks

    //launch kernel once data has been copied to GPU
    createYBus<<<dimGrid, dimBlock>>>(dev_R, dev_X, dev_B, numberOfBuses, numLines, dev_fromBus, dev_toBus, dev_y);

    cuComplex* y_bus = new cuComplex[numberOfBuses*numberOfBuses];

    //copy results back to CPU
    cudaError_t cudaStat6 = cudaMemcpy(y_bus, dev_y, numberOfBuses*numberOfBuses*sizeof(cuComplex), cudaMemcpyDeviceToHost);
    if (cudaStat6 != cudaSuccess) {
        printf("failure : (%d) - %s\n", cudaStat6, ::cudaGetErrorString(cudaStat6));
        return 1;
    }
    return 0;
}
Your error seems to be somewhere else.
You should run your code in Nsight debug mode with CUDA memcheck activated. If the code is compiled with debug information, the tool should point out the location of your error.
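In the same spirit, and only as a hedged sketch of an error-checking pattern (the macro name and exact form here are my own, not from the question or this answer): wrapping the launch with an immediate cudaGetLastError plus a cudaDeviceSynchronize reports the failure at the kernel itself rather than at the later cudaMemcpy.

#include <cstdio>
#include <cuda_runtime.h>

// Illustrative error-checking macro; prints the error and where it was detected.
#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t err = (call);                                     \
        if (err != cudaSuccess)                                       \
            printf("CUDA error '%s' at %s:%d\n",                      \
                   cudaGetErrorString(err), __FILE__, __LINE__);      \
    } while (0)

// usage around the launch:
// createYBus<<<dimGrid, dimBlock>>>(dev_R, dev_X, dev_B, numberOfBuses,
//                                   numLines, dev_fromBus, dev_toBus, dev_y);
// CUDA_CHECK(cudaGetLastError());       // catches launch-configuration errors
// CUDA_CHECK(cudaDeviceSynchronize());  // catches errors during kernel execution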
EDIT: The problem appears to be caused by the WDDM TDR, as discussed in the comments.

CUDA : 2D grid launch error [duplicate]

My Monte Carlo pi calculation CUDA program is causing my NVIDIA driver to crash when I exceed around 500 trials and 256 full blocks. It seems to be happening in the monteCarlo kernel function. Any help is appreciated.
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <curand.h>
#include <curand_kernel.h>
#define NUM_THREAD 256
#define NUM_BLOCK 256
///////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////
// Function to sum an array
__global__ void reduce0(float *g_odata) {
    extern __shared__ int sdata[];

    // each thread loads one element from global to shared mem
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
    sdata[tid] = g_odata[i];
    __syncthreads();

    // do reduction in shared mem
    for (unsigned int s=1; s < blockDim.x; s *= 2) { // step = s x 2
        if (tid % (2*s) == 0) { // only threadIDs divisible by the step participate
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }

    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

///////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////
__global__ void monteCarlo(float *g_odata, int trials, curandState *states){
    // unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
    unsigned int incircle, k;
    float x, y, z;
    incircle = 0;

    curand_init(1234, i, 0, &states[i]);
    for(k = 0; k < trials; k++){
        x = curand_uniform(&states[i]);
        y = curand_uniform(&states[i]);
        z = (x*x + y*y);
        if (z <= 1.0f) incircle++;
    }
    __syncthreads();
    g_odata[i] = incircle;
}

///////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////
int main() {
    float* solution = (float*)calloc(100, sizeof(float));
    float *sumDev, *sumHost, total;
    const char *error;
    int trials;
    curandState *devStates;

    trials = 500;
    total = trials*NUM_THREAD*NUM_BLOCK;

    dim3 dimGrid(NUM_BLOCK,1,1); // Grid dimensions
    dim3 dimBlock(NUM_THREAD,1,1); // Block dimensions
    size_t size = NUM_BLOCK*NUM_THREAD*sizeof(float); //Array memory size

    sumHost = (float*)calloc(NUM_BLOCK*NUM_THREAD, sizeof(float));
    cudaMalloc((void **) &sumDev, size); // Allocate array on device
    error = cudaGetErrorString(cudaGetLastError());
    printf("%s\n", error);

    cudaMalloc((void **) &devStates, (NUM_THREAD*NUM_BLOCK)*sizeof(curandState));
    error = cudaGetErrorString(cudaGetLastError());
    printf("%s\n", error);

    // Do calculation on device by calling CUDA kernel
    monteCarlo <<<dimGrid, dimBlock>>> (sumDev, trials, devStates);
    error = cudaGetErrorString(cudaGetLastError());
    printf("%s\n", error);

    // call reduction function to sum
    reduce0 <<<dimGrid, dimBlock, (NUM_THREAD*sizeof(float))>>> (sumDev);
    error = cudaGetErrorString(cudaGetLastError());
    printf("%s\n", error);

    dim3 dimGrid1(1,1,1);
    dim3 dimBlock1(256,1,1);
    reduce0 <<<dimGrid1, dimBlock1, (NUM_THREAD*sizeof(float))>>> (sumDev);
    error = cudaGetErrorString(cudaGetLastError());
    printf("%s\n", error);

    // Retrieve result from device and store it in host array
    cudaMemcpy(sumHost, sumDev, sizeof(float), cudaMemcpyDeviceToHost);
    error = cudaGetErrorString(cudaGetLastError());
    printf("%s\n", error);

    *solution = 4*(sumHost[0]/total);
    printf("%.*f\n", 1000, *solution);

    free(solution);
    free(sumHost);
    cudaFree(sumDev);
    cudaFree(devStates);
    //*solution = NULL;
    return 0;
}
If smaller numbers of trials work correctly, and if you are running on MS Windows without the NVIDIA Tesla Compute Cluster (TCC) driver and/or the GPU you are using is attached to a display, then you are probably exceeding the operating system's "watchdog" timeout. If the kernel occupies the display device (or any GPU on Windows without TCC) for too long, the OS will kill the kernel so that the system does not become non-interactive.
The solution is to run on a non-display-attached GPU and if you are on Windows, use the TCC driver. Otherwise, you will need to reduce the number of trials in your kernel and run the kernel multiple times to compute the number of trials you need.
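As a hedged sketch of that last suggestion (the kernel and host loop below are my own illustrative rework, not code from the question): split the trials across several short launches, accumulate the per-thread counts, and keep the curand state alive between launches so each chunk draws fresh numbers. This assumes the states are initialized once up front, for example with a separate setup kernel like the one sketched at the end of this answer.

// Illustrative chunked version: each launch does a fraction of the trials,
// so no single kernel runs long enough to trip the watchdog.
__global__ void monteCarloChunk(float *g_odata, int trials, curandState *states){
    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
    curandState state = states[i];   // local copy (per the curand performance notes)
    unsigned int incircle = 0;
    for (int k = 0; k < trials; k++){
        float x = curand_uniform(&state);
        float y = curand_uniform(&state);
        if (x*x + y*y <= 1.0f) incircle++;
    }
    states[i] = state;               // persist the state for the next chunk
    g_odata[i] += incircle;          // accumulate across launches
}

// host side: e.g. 10 launches of 50 trials instead of 1 launch of 500;
// g_odata (sumDev) must be zeroed first, e.g. with cudaMemset(sumDev, 0, size)
// for (int chunk = 0; chunk < 10; chunk++) {
//     monteCarloChunk<<<dimGrid, dimBlock>>>(sumDev, 50, devStates);
//     cudaDeviceSynchronize();  // each chunk finishes well inside the time limit
// }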
EDIT: According to the CUDA 4.0 curand docs (page 15, "Performance Notes"), you can improve performance by copying the state for a generator to local storage inside your kernel, then storing the state back (if you need it again) when you are finished:
curandState state = states[i];
for(k = 0; k < trials; k++){
    x = curand_uniform(&state);
    y = curand_uniform(&state);
    z = (x*x + y*y);
    if (z <= 1.0f) incircle++;
}
Next, it mentions that setup is expensive, and suggests that you move curand_init into a separate kernel. This may help keep the cost of your MC kernel down so you don't run up against the watchdog.
I recommend reading that section of the docs; there are several useful guidelines.
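A minimal sketch of that suggestion (the kernel name below is my own assumption): initialize the states once, in their own kernel, so the Monte Carlo kernel itself stays short.

// One-time curand state setup, separated from the Monte Carlo kernel
// (uses the same curand_kernel.h include as the code above).
__global__ void setupStates(curandState *states, unsigned long long seed){
    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
    curand_init(seed, i, 0, &states[i]);
}

// called once from the host, before any Monte Carlo launches:
// setupStates<<<dimGrid, dimBlock>>>(devStates, 1234);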
For those of you with a GeForce GPU, which does not support the TCC driver, there is another solution based on:
http://msdn.microsoft.com/en-us/library/windows/hardware/ff569918(v=vs.85).aspx
start regedit,
navigate to HKEY_LOCAL_MACHINE\System\CurrentControlSet\Control\GraphicsDrivers
create new DWORD key called TdrLevel, set value to 0,
restart PC.
Now your long-running kernels should not be terminated. This answer is based on:
Modifying registry to increase GPU timeout, windows 7
I just thought it might be useful to provide the solution here as well.

loop unrolling with dynamic parallelism decrease the time performance

I have a simple program to calculate square roots. Loop unrolling was done as follows:
loop unrolling
#include <stdio.h>
#include <cuda.h>
__global__ void square(float *a, int N,int idx);
// Kernel that executes on the CUDA device
__global__ void first(float *arr, int N)
{
    int idx = 2*(blockIdx.x * blockDim.x + threadIdx.x);
    int n = N;
    //printf("%d\n",n);
    for(int q=0; q<2; q++)
    {
        if(N<2000)
        {
            arr[idx+q] = arr[idx+q] * arr[idx+q];
        }
    }
}

// main routine that executes on the host
int main(void)
{
    clock_t start = clock(), diff;
    float *a_h, *a_d; // Pointer to host & device arrays
    const int N = 1000; // Number of elements in arrays
    size_t size = N * sizeof(float);

    a_h = (float *)malloc(size); // Allocate array on host
    cudaMalloc((void **) &a_d, size); // Allocate array on device

    // Initialize host array and copy it to CUDA device
    for (int i=0; i<N; i++) a_h[i] = (float)i;
    cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);

    // Do calculation on device:
    int block_size = 4;
    //int n_blocks = N/block_size + (N%block_size == 0 ? 0:1);
    first <<< 4, 128 >>> (a_d, N);
    //cudaThreadSynchronize();

    // Retrieve result from device and store it in host array
    cudaMemcpy(a_h, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost);

    // Print results
    for (int i=0; i<N; i++) printf("%d %f\n", i, a_h[i]);

    // Cleanup
    free(a_h); cudaFree(a_d);

    diff = clock() - start;
    int msec = diff * 1000 / CLOCKS_PER_SEC;
    printf("Time taken %d seconds %d milliseconds\n", msec/1000, msec%1000);
}
I then realized that the loop calculation could be minimized with dynamic parallelism.
Unrolling with dynamic parallelism was implemented as follows:
unrolling with dynamic parallelism
#include <stdio.h>
#include <cuda.h>
__global__ void square(float *a, int N,int idx);
// Kernel that executes on the CUDA device
__global__ void first(float *arr, int N)
{
    int idx = 2*(blockIdx.x * blockDim.x + threadIdx.x);
    int n = N;
    square <<< 1,2 >>> (arr, n, idx);
}

__global__ void square(float *a, int N, int idx)
{
    int tdx = blockIdx.x * blockDim.x + threadIdx.x;
    printf("%d\n", N);
    if(N<2000)
    {
        a[tdx+idx] = a[tdx+idx] * a[tdx+idx];
    }
}

// main routine that executes on the host
int main(void)
{
    clock_t start = clock(), diff;
    float *a_h, *a_d; // Pointer to host & device arrays
    const int N = 1000; // Number of elements in arrays
    size_t size = N * sizeof(float);

    a_h = (float *)malloc(size); // Allocate array on host
    cudaMalloc((void **) &a_d, size); // Allocate array on device

    // Initialize host array and copy it to CUDA device
    for (int i=0; i<N; i++) a_h[i] = (float)i;
    cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);

    // Do calculation on device:
    int block_size = 4;
    //int n_blocks = N/block_size + (N%block_size == 0 ? 0:1);
    first <<< 4, 128 >>> (a_d, N);
    //cudaThreadSynchronize();

    // Retrieve result from device and store it in host array
    cudaMemcpy(a_h, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost);

    // Print results
    for (int i=0; i<N; i++) printf("%d %f\n", i, a_h[i]);

    // Cleanup
    free(a_h); cudaFree(a_d);

    diff = clock() - start;
    int msec = diff * 1000 / CLOCKS_PER_SEC;
    printf("Time taken %d seconds %d milliseconds\n", msec/1000, msec%1000);
}
The implementation of unrolling with dynamic parallelism takes more time to execute than unrolling alone. Aren't we supposed to improve execution time with dynamic parallelism in such a case?
Dynamic parallelism is mainly useful in cases where you have parallelism that is dynamic. That is: cases where you don't know how much parallelism you're going to need until you've done some calculation. Rather than transfer data back to the host which is then instantly fed into parameterising another launch, you launch from within the kernel. In this pattern, with memcpys between kernel launches avoided, you'll see speedup.
In your example above this is not the case. You could have just launched twice as many threads from the host. There's nothing dynamic required as there's no parallelism available there that you didn't know about at the time of the first kernel launch.
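As a hedged sketch of that point (the kernel below is my own illustrative rewrite of the example, not code from the question): the same work can be expressed as a single host-side launch that simply covers every element, with no child launches at all.

// One thread per element, launched directly from the host.
__global__ void squareAll(float *arr, int N)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < N)
        arr[idx] = arr[idx] * arr[idx];
}

// host side: cover all N elements in one launch
// int block_size = 128;
// int n_blocks = (N + block_size - 1) / block_size;
// squareAll<<<n_blocks, block_size>>>(a_d, N);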
Furthermore, performance requirements for kernels launched using dynamic parallelism are similar to those of kernels launched from the host. You have to launch a reasonable amount of work or the launch latency will dominate your computation time.