cudaDeviceReset for multiple gpu's - cuda

I am currently working on a gpu server which has 4 Tesla T10 gpu's. While I keep testing the kernels and have to frequently kill the processes using ctrl-C, I added a few lines to the end of a simple device query code. The code is given below :
#include <stdio.h>
// Print device properties
/**
 * Print a human-readable summary of one CUDA device's properties to stdout.
 *
 * @param devProp  Property record describing the device (the caller fills it
 *                 with cudaGetDeviceProperties).
 *
 * The memory-size fields of cudaDeviceProp are size_t, so they are printed
 * with %zu; using %u truncates or misreads them on 64-bit targets.
 */
void printDevProp(cudaDeviceProp devProp)
{
    printf("Major revision number: %d\n", devProp.major);
    printf("Minor revision number: %d\n", devProp.minor);
    printf("Name: %s\n", devProp.name);
    printf("Total global memory: %zu\n", devProp.totalGlobalMem);
    printf("Total shared memory per block: %zu\n", devProp.sharedMemPerBlock);
    printf("Total registers per block: %d\n", devProp.regsPerBlock);
    printf("Warp size: %d\n", devProp.warpSize);
    printf("Maximum memory pitch: %zu\n", devProp.memPitch);
    printf("Maximum threads per block: %d\n", devProp.maxThreadsPerBlock);
    for (int i = 0; i < 3; ++i)
        printf("Maximum dimension %d of block: %d\n", i, devProp.maxThreadsDim[i]);
    for (int i = 0; i < 3; ++i)
        printf("Maximum dimension %d of grid: %d\n", i, devProp.maxGridSize[i]);
    printf("Clock rate: %d\n", devProp.clockRate);
    printf("Total constant memory: %zu\n", devProp.totalConstMem);
    printf("Texture alignment: %zu\n", devProp.textureAlignment);
    printf("Concurrent copy and execution: %s\n", (devProp.deviceOverlap ? "Yes" : "No"));
    printf("Number of multiprocessors: %d\n", devProp.multiProcessorCount);
    printf("Kernel execution timeout: %s\n", (devProp.kernelExecTimeoutEnabled ? "Yes" : "No"));
}
/**
 * Device query: list every CUDA device with its properties, wait for a key
 * press, then reset each device in turn.
 *
 * Every CUDA return code is checked. In particular, if cudaSetDevice(i)
 * fails, cudaDeviceReset() is skipped for that index; otherwise it would act
 * on whichever device is still current and silently reset it again.
 *
 * @return 0 on success, 1 if device enumeration or any reset failed.
 */
int main()
{
    // Number of CUDA devices
    int devCount = 0;
    cudaError_t status = cudaGetDeviceCount(&devCount);
    if (status != cudaSuccess)
    {
        fprintf(stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString(status));
        return 1;
    }
    printf("CUDA Device Query...\n");
    printf("There are %d CUDA devices.\n", devCount);

    // Iterate through devices
    for (int i = 0; i < devCount; ++i)
    {
        // Get device properties
        printf("\nCUDA Device #%d\n", i);
        cudaDeviceProp devProp;
        status = cudaGetDeviceProperties(&devProp, i);
        if (status != cudaSuccess)
        {
            fprintf(stderr, "cudaGetDeviceProperties(%d) failed: %s\n", i, cudaGetErrorString(status));
            continue;
        }
        printDevProp(devProp);
    }

    printf("\nPress any key to exit...");
    char c;
    if (scanf("%c", &c) != 1)
    {
        // EOF or read error on stdin: nothing to wait for, proceed to cleanup.
    }

    int exitCode = 0;
    for (int i = 0; i < devCount; i++)
    {
        status = cudaSetDevice(i);
        if (status == cudaSuccess)
            status = cudaDeviceReset();
        if (status != cudaSuccess)
        {
            fprintf(stderr, "Resetting device %d failed: %s\n", i, cudaGetErrorString(status));
            exitCode = 1;
        }
    }
    return exitCode;
}
My query is related to the for loop just before main() ends, in which I set each device one by one and then call cudaDeviceReset(). I get a strange feeling that, although this code doesn't produce any error, it is not resetting all the devices. Instead, the program seems to reset only the default device, i.e. device 0, each time. Can anyone tell me what I should do to reset each of the 4 devices?
Thanks

It looks like you can add a function to your GPU programs to catch the ctrl+c signal (SIGINT) and call the cudaDeviceReset() function for each device that was used by the program.
The example code to call a function when SIGINT is caught can be found here:
https://stackoverflow.com/a/482725
It seems like a good practice to include code like this for every GPU program you write, and I will do the same :-)
I don't have time to write up a full detailed answer, so read the other answer and its comments also.

This is probably too late but if you write a signal-handler function you can get rid of the memory leaks and reset the device in a sure way:
// State variables for SIGINT (Ctrl+C) handling.
// no_sigint starts at 1 and is set to 0 by sigint_handler on an interrupt;
// the simulation loop tests it as part of its loop condition.
extern int no_sigint;
int no_sigint = 1;
// interrupts counts how many SIGINTs sigint_handler has seen so far.
extern int interrupts;
int interrupts = 0;
/* SIGINT (Ctrl+C) handler with two escalation levels.
   On every SIGINT the interrupt counter is bumped, a notice is printed and
   no_sigint is cleared so the main loop can stop after the current frame.
   From the second SIGINT onwards the handler additionally frees memory via
   free_mem(), resets every CUDA device and terminates with exit code 9.
   Signals other than SIGINT are ignored.
   NOTE(review): stream output, free_mem(), CUDA runtime calls and exit()
   are presumably not async-signal-safe -- confirm this is acceptable. */
void
sigint_handler (int sig)
{
  if (sig != SIGINT)
    return;

  ++interrupts;
  std::cout << std::endl;
  std::cout << "Aborting loop.. finishing frame.";
  std::cout << std::endl;
  no_sigint = 0;

  if (interrupts < 2)
    return;

  std::cerr << std::endl;
  std::cerr << "Multiple Interrupts issued: "
            << "Clearing memory and Forcing immediate shutdown!";
  std::cerr << std::endl;

  // Release dynamically allocated memory before tearing the devices down.
  free_mem ();

  int deviceTotal;
  cudaGetDeviceCount (&deviceTotal);
  int dev = 0;
  while (dev < deviceTotal)
    {
      cudaSetDevice (dev);
      cudaDeviceReset ();
      ++dev;
    }
  exit (9);
}
....
int main(){
.....
for (int simulation_step=1 ; simulation_step < SIM_STEPS && no_sigint; ++simulation_step)
{
.... simulation code
}
free_mem();
... cuda device resets
return 0;
}
If you use this code (you can even put the first snippet in an external header), it works. You get two levels of control over ctrl+c: the first press stops your simulation and exits normally, but the application finishes rendering the current step, which is great for stopping gracefully with correct results; if you press ctrl+c again, it closes the application immediately, freeing all memory.

cudaDeviceReset is intended for destroying resources associated with a given GPU context within the process in which it is run. One CUDA process can't reset or otherwise affect the context of another process. So when your modified device query calls cudaDeviceReset, it only releases resources that it allocated, not those in use by any other process.

Related

Threads of a CUDA kernel execute sequentially

I have two kernels that process some data sequentially (launched with only one thread). I want to combine the two so that I can have one kernel to launch with two threads. After doing so, I was expecting to get an exec time of max(kernel1, kernel2) but what I got was the sum of the two exec times. I narrowed down the problem to something like the code below.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include<iostream>
#include<string>
#include<vector>
#include<random>
#include<functional>
#include<algorithm>
#include<iterator>
// Busy-work kernel used to observe how two threads of one block execute.
// Thread 0 fills d_out_Re from d_data_Re and thread 1 fills d_out_Im from
// d_data_Im; each runs 1,000,000 iterations over indices taken modulo
// dataLen and prints a start/finish marker. Any other thread does nothing.
__global__ void dummyKernel(const float *d_data_Re, const float *d_data_Im,
                            float *d_out_Re, float *d_out_Im, const int dataLen) {
    const int lane = threadIdx.x;
    switch (lane) {
    case 0:
        printf("Thread zero started \n");
        for (int step = 0; step < 1000000; ++step) {
            const int slot = step % dataLen;
            d_out_Re[slot] = sqrtf(2) + d_data_Re[slot] * (step % 4 == 1);
        }
        printf("Thread zero finished \n");
        break;
    case 1:
        printf("Thread one started \n");
        for (int step = 0; step < 1000000; ++step) {
            const int slot = step % dataLen;
            d_out_Im[slot] = sqrtf(2) + d_data_Im[slot] * (step % 4 == 1);
        }
        printf("Thread one finished \n");
        break;
    default:
        break;
    }
}
// Variant of dummyKernel without the per-thread branch: every launched thread
// runs the same 1,000,000-iteration loop, reading d_data_Re and writing
// d_out_Re (indices modulo dataLen), and prints the "Thread zero" markers.
// d_data_Im and d_out_Im are accepted but not touched.
__global__ void dummyKernel2(const float *d_data_Re, const float *d_data_Im,
                             float *d_out_Re, float *d_out_Im, const int dataLen) {
    printf("Thread zero started \n");
    for (int step = 0; step < 1000000; ++step) {
        const int slot = step % dataLen;
        d_out_Re[slot] = sqrtf(2) + d_data_Re[slot] * (step % 4 == 1);
    }
    printf("Thread zero finished \n");
}
/**
 * Driver for dummyKernel: fills two host frames with random floats, copies
 * them to the device, runs dummyKernel with one block of two threads and
 * waits for it to finish.
 *
 * Every CUDA call is checked. On the first failure the remaining work is
 * skipped, the device buffers are released and a non-zero code is returned.
 *
 * @return 0 on success, 1 on any CUDA failure.
 */
int main()
{
    // Reports a failed CUDA call on stderr; returns true when `status` is an error.
    auto failed = [](cudaError_t status, const char *what) {
        if (status == cudaSuccess)
            return false;
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        return true;
    };

    cudaError_t cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        return 1;
    }

    const int sizeOfFrame = 2 * 1024 * 1024;
    const size_t frameBytes = sizeOfFrame * sizeof(float);

    // Random input: values between 0 and 2 from a default-seeded MT19937.
    std::vector<float> data_re(sizeOfFrame);
    std::uniform_real_distribution<float> distribution(0.0f, 2.0f);
    std::mt19937 engine;
    auto generator = std::bind(distribution, engine);
    std::generate_n(data_re.begin(), sizeOfFrame, generator);
    std::vector<float> data_im(data_re);  // imaginary frame is a copy of the real one

    float *d_data_re = nullptr, *d_data_im = nullptr;
    float *d_pll_out_re = nullptr, *d_pll_out_im = nullptr;

    // && short-circuits, so nothing after the first failing call is attempted.
    bool ok =
        !failed(cudaMalloc(&d_data_re, frameBytes), "cudaMalloc(d_data_re)") &&
        !failed(cudaMalloc(&d_data_im, frameBytes), "cudaMalloc(d_data_im)") &&
        !failed(cudaMemcpy(d_data_re, data_re.data(), frameBytes, cudaMemcpyHostToDevice),
                "cudaMemcpy(d_data_re)") &&
        !failed(cudaMemcpy(d_data_im, data_im.data(), frameBytes, cudaMemcpyHostToDevice),
                "cudaMemcpy(d_data_im)") &&
        !failed(cudaMalloc(&d_pll_out_re, frameBytes), "cudaMalloc(d_pll_out_re)") &&
        !failed(cudaMalloc(&d_pll_out_im, frameBytes), "cudaMalloc(d_pll_out_im)");

    if (ok) {
        dummyKernel<<<1, 2>>>(d_data_re, d_data_im,
                              d_pll_out_re, d_pll_out_im, sizeOfFrame);
        // Launch-configuration errors show up in cudaGetLastError(); faults
        // during execution show up at the synchronizing call.
        ok = !failed(cudaGetLastError(), "dummyKernel launch") &&
             !failed(cudaDeviceSynchronize(), "dummyKernel execution");
    }

    // cudaFree accepts null pointers, so this is safe after a partial setup.
    cudaFree(d_pll_out_im);
    cudaFree(d_pll_out_re);
    cudaFree(d_data_im);
    cudaFree(d_data_re);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }
    return ok ? 0 : 1;
}
btw I got the code for random number generator from an answer to this question. So, the dummyKernel doesn't do anything useful, I just wanted to have a kernel that took relatively long to finish. If you launch dummyKernel, the order of the output will be "Thread zero started", "Thread zero finished", "Thread one started", "Thread one finished". Sequential. But if you launch dummyKernel2, the order of the output will be "Thread zero started", "Thread zero started", "Thread zero finished", "Thread zero finished" and the exec time is almost half as dummyKernel. I don't understand this behavior and the effect of the if-else I used.
OS: Windows 10, GTX 1050 Ti, CUDA Driver/Runtime version: 11.1/10.1.
Each Cuda multiprocessor has execution units (several each for int, float, special functions, ...). Those work as pipelines, which take several cycles to complete a calculation, but in each cycle a new calculation can be inserted (=scheduled) and several calculations are processed at the same time at different stages of the pipeline.
Groups of 32 threads (warps) within a block are scheduled the same instruction at the same time (same cycle or often two cycles depending on how many execution and datapath resources are available on the architecture and needed for this instruction), together with a bitfield, stating, for which threads this instruction should be actively executed. If some threads of a warp evaluated an if clause as false, they are temporarily deactivated. Or some threads may have already exited the kernel.
The effect is that if the 32 threads of a warp diverge (branch differently), each execution path has to be run through for all 32 threads (with some threads deactivated for each path). That should be avoided for performance reasons, as the computation resources are reserved nevertheless. Threads from different warps don't have this interdependency. The algorithm should be structured in a way to consider this.
With Volta, Independent Thread Scheduling was introduced. Each thread has its own instruction counter (and manages a separate function callstack). But the scheduler still will schedule groups of 32 threads (warps) with bitfields for active threads. What changed is that the scheduler can interleave the diverging paths. Instead of executing CCCIIIEEECCC pre-Volta (instructions: C=common, I=if branch, e=else branch), it could execute CCCIEEIIECCC, if the available execution units or the memory latency better fits. As programmer, one has to be careful, as it can be no longer assumed that the threads have not diverged, even when executing the same instruction. That is why __syncwarp was introduced and all kind of cooperation functions (e.g. the shuffle instructions) got a sync variant. Nevertheless (although we cannot know for sure, if the threads diverged) one still has to program in a way that all 32 threads can work together, if executed synchronously, especially for coalesced memory accesses. Putting __syncwarp after each possibly diverging instruction can help to ensure convergence. (But do performance profiling).
The Independent Thread Scheduling is also the reason, why __syncthreads must definitely be called correctly on the RTX 3080 - with each thread participating. A typical correcting solution for the deadlock case you mentioned in the comment is to close the if clause, sync all the threads and open a new if clause with the same condition as the previous one.

cudaGetLastError. Which kernel execution raised it?

I have implemented a pipeline where many kernels are launched in a specific stream. The kernels are enqueued into the stream and executed when the scheduler decides it’s best.
In my code, after every kernel enqueue, I check if there’s any error by calling cudaGetLastError which, according to the documentation, "it returns the last error from a runtime call. This call, may also return error codes from previous asynchronous launches". Thus, if the kernel has only been enqueued, not executed, I understand that the error returned refers only if the kernel was enqueued correctly (parameters checking, grid and block size, shared memory, etc...).
My problem is: I enqueue many different kernels without waiting for finalization of the execution of each kernel. Imagine now, I have a bug in one of my kernels (let's call it Kernel1) which causes a illegal memory access (for instance). If I check the cudaGetLastError right after enqueuing it, the return value is success because it was correctly enqueued. So my CPU thread moves on and keep enqueuing kernels to the stream. At some point Kernel1 is executed and raised the illegal memory access. Thus, next time I check for cudaGetLastError I will get the cuda error but, by that time, the CPU thread is another point forward in the code. Consequently, I know there's been an error, but I have no idea which kernel raised it.
An option is to synchronize (block the CPU thread) until the execution of every kernel have finished and then check the error code, but this is not an option for performance reasons.
The question is, is there any way we can query which kernel raised a given error code returned by cudaGetLastError? If not, which is in your opinion the best way to handle this?
There is an environment variable CUDA_LAUNCH_BLOCKING which you can use to serialize kernel execution of an otherwise asynchronous sequence of kernel launches. This should allow you to isolate the kernel instance which is causing an error, either via internal error checking in your host code, or via an external tool like cuda-memcheck.
I have tested 3 different options:
Set the CUDA_LAUNCH_BLOCKING environment variable to 1. This forces the CPU thread to block until the kernel execution has finished. We can then check for an error after each execution, catching the exact point of failure. This has an obvious performance impact, but it may help to bound the bug in a production environment without having to perform any change at the client side.
Distribute the production code compiled with the flag -lineinfo and run the code again with cuda-memcheck. This has no performance impact and we do not need to perform any change in the client either. However, we have to execute the binary in a slightly different environment, which in some cases, like a service running GPU tasks, can be difficult to achieve.
Insert a callback after each kernel call. In the userData parameter, include a unique id for the kernel-call, and possibly some information on the parameters used. This can be directly distributed in a production environment and always gives us the exact point of failure and we don't need to perform any change at the client side. Although, the performance impact of this approach is huge. Apparently, the callback functions, are processed by a driver thread and cause for the performance impact. I wrote a code to test it
#include <cuda_runtime.h>
#include <vector>
#include <chrono>
#include <iostream>
// Threads per block; KernelCopy also uses it as the per-block index stride.
#define BLOC_SIZE 1024
// Total number of elements copied. Parenthesized so the macro expands as a
// single value inside larger expressions (e.g. x / NUM_ELEMENTS).
#define NUM_ELEMENTS (BLOC_SIZE * 32)
// Number of kernel launches per timed loop.
#define NUM_ITERATIONS 500
// Element-wise copy: each thread copies one unsigned int from `input` to
// `result` at index blockIdx.x * BLOC_SIZE + threadIdx.x. There is no bounds
// check, so the launch must cover exactly the allocated element count.
__global__ void KernelCopy(const unsigned int *input, unsigned int *result) {
    const unsigned int blockBase = blockIdx.x * BLOC_SIZE;
    const unsigned int element = blockBase + threadIdx.x;
    result[element] = input[element];
}
// Stream callback: prints the error string to stdout when the stream reports
// a non-success status; does nothing otherwise. `stream` and `data` are unused.
void CUDART_CB myStreamCallback(cudaStream_t stream, cudaError_t status, void *data) {
    if (status == cudaSuccess) {
        return;
    }
    std::cout << "Error: " << cudaGetErrorString(status) << "-->";
}
#define CUDA_CHECK_LAST_ERROR cudaStreamAddCallback(stream, myStreamCallback, nullptr, 0)
/**
 * Benchmark: time NUM_ITERATIONS launches of KernelCopy on one stream, first
 * with a stream callback registered after every launch, then with a plain
 * cudaGetLastError() check after every launch.
 *
 * Device buffers and the stream are released on every exit path, and the
 * result of each stream synchronization is checked so that asynchronous
 * kernel failures are reported rather than silently timed.
 *
 * @return 0 on success, -1 if any CUDA call failed.
 */
int main() {
    const size_t bufferBytes = NUM_ELEMENTS * sizeof(unsigned int);
    unsigned int *input = nullptr;
    unsigned int *result = nullptr;
    cudaStream_t stream = nullptr;
    bool haveStream = false;

    // Runs setup plus both timed loops; returns false on the first hard failure.
    // Cleanup is done by the caller so that no path leaks resources.
    auto benchmark = [&]() -> bool {
        cudaError_t c_ret = cudaSetDevice(0);
        if (c_ret != cudaSuccess) {
            return false;
        }
        c_ret = cudaMalloc((void **)&input, bufferBytes);
        if (c_ret != cudaSuccess) {
            return false;
        }
        std::vector<unsigned int> h_input(NUM_ELEMENTS);
        for (unsigned int i = 0; i < NUM_ELEMENTS; i++) {
            h_input[i] = i;
        }
        c_ret = cudaMemcpy(input, h_input.data(), bufferBytes, cudaMemcpyKind::cudaMemcpyHostToDevice);
        if (c_ret != cudaSuccess) {
            return false;
        }
        c_ret = cudaMalloc((void **)&result, bufferBytes);
        if (c_ret != cudaSuccess) {
            return false;
        }
        c_ret = cudaStreamCreate(&stream);
        if (c_ret != cudaSuccess) {
            return false;
        }
        haveStream = true;

        const dim3 grid(NUM_ELEMENTS / BLOC_SIZE);
        std::chrono::steady_clock::time_point start;
        std::chrono::steady_clock::time_point end;

        // Variant 1: a stream callback after every launch.
        start = std::chrono::steady_clock::now();
        for (unsigned int i = 0; i < NUM_ITERATIONS; i++) {
            KernelCopy <<< grid, BLOC_SIZE, 0, stream >>> (input, result);
            c_ret = CUDA_CHECK_LAST_ERROR;
            if (c_ret != cudaSuccess) {
                std::cout << "Error: " << cudaGetErrorString(c_ret) << "-->";
            }
        }
        c_ret = cudaStreamSynchronize(stream);
        end = std::chrono::steady_clock::now();
        if (c_ret != cudaSuccess) {
            std::cout << "Error: " << cudaGetErrorString(c_ret) << '\n';
            return false;
        }
        std::cout << "With callback took (ms): " << std::chrono::duration<float, std::milli>(end - start).count() << '\n';

        // Variant 2: cudaGetLastError() after every launch.
        start = std::chrono::steady_clock::now();
        for (unsigned int i = 0; i < NUM_ITERATIONS; i++) {
            KernelCopy <<< grid, BLOC_SIZE, 0, stream >>> (input, result);
            c_ret = cudaGetLastError();
            if (c_ret != cudaSuccess) {
                std::cout << "Error: " << cudaGetErrorString(c_ret) << "-->";
            }
        }
        c_ret = cudaStreamSynchronize(stream);
        end = std::chrono::steady_clock::now();
        if (c_ret != cudaSuccess) {
            std::cout << "Error: " << cudaGetErrorString(c_ret) << '\n';
            return false;
        }
        std::cout << "Without callback took (ms): " << std::chrono::duration<float, std::milli>(end - start).count() << '\n';
        return true;
    };

    const bool ok = benchmark();

    // Release whatever was acquired; cudaFree accepts null pointers.
    bool cleanupOk = true;
    if (haveStream && cudaStreamDestroy(stream) != cudaSuccess) {
        cleanupOk = false;
    }
    if (cudaFree(result) != cudaSuccess) {
        cleanupOk = false;
    }
    if (cudaFree(input) != cudaSuccess) {
        cleanupOk = false;
    }
    return (ok && cleanupOk) ? 0 : -1;
}
Output:
With callback took (ms): 47.8729
Without callback took (ms): 1.9317
(CUDA 9.2, Windows 10, Visual Studio 2015, Nvidia Tesla P4)
To me, in a production environment, the only valid approach is number 2.

Shared memory, branching performance and register count

I came across some peculiar performance behaviour when trying out the CUDA shuffle instruction. The test kernel below is based on an image processing algorithm which adds input-dependent values to all neighbouring pixels within a square of side rad. The output for each block is added in shared memory. If only one thread per warp adds its result to shared memory, the performance is poor (Option 1), whereas on the other hand, if all threads add to shared memory (one thread adds the desired value, the rest just add 0), the execution time drops by 2-3 times (Option 2).
#include <iostream>
#include "cuda_runtime.h"
#define warpSz 32
#define tileY 32
#define rad 32
// Benchmark kernel: each block accumulates values into a shared-memory tile of
// (warpSz + 2*rad) x (tileY + 2*rad) floats and then atomically adds the tile
// into `out`.
//   out   - destination buffer; the block's tile is added at offset
//           (blockIdx.y*tileY, blockIdx.x*warpSz), rows `pitch` elements apart.
//   pitch - row stride of `out`, in elements.
// NOTE(review): the shuffle lane arithmetic uses threadIdx.x modulo warpSz, so
// this presumably expects blockDim.x == warpSz -- confirm against the launch.
__global__ void test(float *out, int pitch)
{
// Set shared mem to 0
__shared__ float tile[(warpSz + 2*rad) * (tileY + 2*rad)];
// All threads of the block cooperate, striding by the block's thread count.
for (int i = threadIdx.y*blockDim.x+threadIdx.x; i<(tileY+2*rad)*(warpSz+2*rad); i+=blockDim.x*blockDim.y) {
tile[i] = 0.0f;
}
// Make the zeroed tile visible to every thread before accumulating.
__syncthreads();
for (int row=threadIdx.y; row<tileY; row += blockDim.y) {
// Loop over pixels in neighbourhood
for (int i=0; i<2*rad+1; ++i) {
float res = 0.0f;
// First tile element of row (row+i).
int rowStartIdx = (row+i)*(warpSz+2*rad);
for (int j=0; j<2*rad+1; ++j) {
res += float(threadIdx.x+row); // Substitute for real calculation
// Option 1: one thread writes to shared mem
if (threadIdx.x == 0) {
tile[rowStartIdx + j] += res;
res = 0.0f;
}
//// Option 2: all threads write to shared mem
//float tmp = 0.0f;
//if (threadIdx.x == 0) {
// tmp = res;
// res = 0.0f;
//}
//tile[rowStartIdx + threadIdx.x+j] += tmp;
// Each thread takes `res` from the next-higher lane (wrapping at warpSz).
// NOTE(review): this is the legacy mask-less __shfl; newer architectures
// require the __shfl_sync form with an explicit mask.
res = __shfl(res, (threadIdx.x+1) % warpSz);
}
res += float(threadIdx.x+row);
tile[rowStartIdx + threadIdx.x+2*rad] += res;
// Barrier once per neighbourhood step i, inside the row loop.
// NOTE(review): this is only safe if every thread of the block runs the same
// number of row iterations (tileY a multiple of blockDim.y) -- confirm.
__syncthreads();
}
}
// Add result back to global mem
for (int row=threadIdx.y; row<tileY+2*rad; row+=blockDim.y) {
for (int col=threadIdx.x; col<warpSz+2*rad; col+=warpSz) {
int idx = (blockIdx.y*tileY + row)*pitch + blockIdx.x*warpSz + col;
// atomicAdd is used for the accumulation into out; tiles of neighbouring
// blocks appear to overlap by the 2*rad border.
atomicAdd(out+idx, tile[row*(warpSz+2*rad) + col]);
}
}
}
/**
 * Timing harness for the `test` kernel on a 512x512 image padded by 2*rad.
 *
 * The output buffer is zeroed before the launch because the kernel
 * accumulates into it with atomicAdd. Every CUDA call is checked, so a failed
 * launch is reported instead of printing a meaningless elapsed time.
 *
 * @return 0 on success, 1 if any CUDA call failed.
 */
int main(void)
{
    // Reports a failed CUDA call on stderr; returns true when the call succeeded.
    auto succeeded = [](cudaError_t status, const char *what) {
        if (status == cudaSuccess)
            return true;
        std::cerr << what << " failed: " << cudaGetErrorString(status) << "\n";
        return false;
    };

    int2 dim = make_int2(512, 512);
    // Row pitch of the padded output, rounded up to a multiple of warpSz.
    int pitchOut = (((dim.x+2*rad)+warpSz-1) / warpSz) * warpSz;
    int sizeOut = pitchOut*(dim.y+2*rad);
    const size_t bytesOut = static_cast<size_t>(sizeOut) * sizeof(float);
    dim3 grid((dim.x+warpSz-1)/warpSz, (dim.y+tileY-1)/tileY, 1);

    float *devOut = nullptr;
    cudaEvent_t start = nullptr, stop = nullptr;
    float elapsedTime = 0.0f;

    // && short-circuits: nothing after the first failing call is attempted.
    bool ok = succeeded(cudaMalloc((void**)&devOut, bytesOut), "cudaMalloc")
           && succeeded(cudaMemset(devOut, 0, bytesOut), "cudaMemset")
           && succeeded(cudaEventCreate(&start), "cudaEventCreate(start)")
           && succeeded(cudaEventCreate(&stop), "cudaEventCreate(stop)")
           && succeeded(cudaEventRecord(start, 0), "cudaEventRecord(start)");

    if (ok) {
        test<<<grid, dim3(warpSz, 8)>>>(devOut, pitchOut);
        ok = succeeded(cudaGetLastError(), "test kernel launch")
          && succeeded(cudaEventRecord(stop, 0), "cudaEventRecord(stop)")
          && succeeded(cudaEventSynchronize(stop), "cudaEventSynchronize")
          && succeeded(cudaEventElapsedTime(&elapsedTime, start, stop), "cudaEventElapsedTime");
    }

    if (stop)
        cudaEventDestroy(stop);
    if (start)
        cudaEventDestroy(start);
    cudaFree(devOut);
    cudaDeviceReset();

    if (ok)
        std::cout << "Elapsed time: " << elapsedTime << " ms.\n";
    std::cin.ignore();
    return ok ? 0 : 1;
}
Is this expected behaviour/can anyone explain why this happens?
One thing I have noted is that Option 1 uses only 15 registers, whereas Option 2 uses 37, which seems a big difference to me.
Another is that the if-statement in the innermost loop is converted to explicit bra instructions in the PTX code for Option 1, whereas for Option 2 it is converted to two selp instructions. Could it be that the explicit branching is behind the 2-3 times slow down similar to what's suspected in this question?
There are two reasons why I am reluctant to go for Option 2. First, when profiling the original application it seems to be limited by shared memory bandwidth, which indicates that there is potential to increase the performance by having fewer threads accessing it. Second, unless we use the volatile keyword, writes to shared memory can be optimised to registers. Since we are only interested in the contribution from the last thread to access each memory location (threadIdx.x == 0), and all others add 0, this is not a problem as long as all changes temporarily located in registers are guaranteed to be written back to shared memory in the same order they were issued. Is this the case though? (This far, both options have produced the exact same result.)
Any thoughts or ideas are much appreciated!
PS. I compile for compute capability 3.0. (However, the shuffle instruction is not necessary to demonstrate the behaviour and can be commented out.)

printf() in my CUDA kernel doesn't produce any output

I have added some printf() statements in my CUDA program
__device__ __global__ void Kernel(float *, float * ,int );
void DeviceFunc(float *temp_h , int numvar , float *temp1_h)
{ .....
//Kernel call
printf("calling kernel\n");
Kernel<<<dimGrid , dimBlock>>>(a_d , b_d , numvar);
printf("kernel called\n");
....
}
int main(int argc , char **argv)
{ ....
printf("beforeDeviceFunc\n\n");
DeviceFunc(a_h , numvar , b_h); //Showing the data
printf("after DeviceFunc\n\n");
....
}
Also in the Kernel.cu, I wrote:
#include<cuda.h>
#include <stdio.h>
__device__ __global__ void Kernel(float *a_d , float *b_d ,int size)
{
int idx = threadIdx.x ;
int idy = threadIdx.y ;
//Allocating memory in the share memory of the device
__shared__ float temp[16][16];
//Copying the data to the shared memory
temp[idy][idx] = a_d[(idy * (size+1)) + idx] ;
printf("idx=%d, idy=%d, size=%d", idx, idy, size);
....
}
Then I compile using -arch=sm_20 like this:
nvcc -c -arch sm_20 main.cu
nvcc -c -arch sm_20 Kernel.cu
nvcc -arch sm_20 main.o Kernel.o -o main
Now when I run the program, I see:
beforeDeviceFunc
calling kernel
kernel called
after DeviceFunc
So the printf() inside the kernel is not printed. How can I fix that?
printf() output is only displayed if the kernel finishes successfully, so check the return codes of all CUDA function calls and make sure no errors are reported.
Furthermore printf() output is only displayed at certain points in the program. Appendix B.32.2 of the Programming Guide lists these as
Kernel launch via <<<>>> or cuLaunchKernel() (at the start of the launch, and if the CUDA_LAUNCH_BLOCKING environment variable is set to 1, at the end of the launch as well),
Synchronization via cudaDeviceSynchronize(), cuCtxSynchronize(), cudaStreamSynchronize(), cuStreamSynchronize(), cudaEventSynchronize(), or cuEventSynchronize(),
Memory copies via any blocking version of cudaMemcpy*() or cuMemcpy*(),
Module loading/unloading via cuModuleLoad() or cuModuleUnload(),
Context destruction via cudaDeviceReset() or cuCtxDestroy().
Prior to executing a stream callback added by cudaStreamAddCallback() or cuStreamAddCallback().
To check this is your problem, put the following code after your kernel invocation:
{
    // Wait for the kernel to finish; a launch or execution failure is
    // reported by this synchronizing call.
    const cudaError_t syncStatus = cudaDeviceSynchronize();
    if (syncStatus != cudaSuccess) {
        const char *reason = cudaGetErrorString(syncStatus);
        printf("kernel launch failed with error \"%s\".\n", reason);
    }
}
You should then see either the output of your kernel or an error message.
More conveniently, cuda-memcheck will automatically check all return codes for you if you run your executable under it. While you should always check for errors anyway, this comes handy when resolving concrete issues.
I had the same error just now and decreasing the block size to 512 helped. According to documentation maximum block size can be either 512 or 1024.
I have written a simple test that showed that my GTX 1070 has a maximum block size of 1024. UPD: you can check whether your kernel was launched at all by using cudaError_t cudaPeekAtLastError(), which returns cudaSuccess if the kernel has started successfully; only after that is it worth calling cudaError_t cudaDeviceSynchronize().
Testing block size of 1023
Testing block size of 1024
Testing block size of 1025
CUDA error: invalid configuration argument
Block maximum size is 1024
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
// Each thread writes 1 into the element of `t` selected by its threadIdx.x.
__global__
void set1(int* t)
{
    int* slot = t + threadIdx.x;
    *slot = 1;
}
// Returns true (after printing the error string to stderr) when `error` is
// anything other than cudaSuccess; returns false silently on success.
inline bool failed(cudaError_t error)
{
    const bool isError = (error != cudaSuccess);
    if (isError)
    {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(error));
    }
    return isError;
}
/**
 * Probe the maximum usable block size by launching set1 with 1, 2, 3, ...
 * threads until a launch, synchronization or result check fails.
 *
 * For each candidate size a managed buffer is allocated and zeroed in every
 * element (so stale contents cannot pass the check), set1 is launched with
 * one block of that size, and every element is verified to be 1.
 *
 * @return 0 after printing the largest block size that worked, 1 if even a
 *         block size of 1 failed.
 */
int main()
{
    int blockSize;
    for (blockSize = 1; blockSize < 1 << 12; blockSize++)
    {
        printf("Testing block size of %d\n", blockSize);

        // Null-initialized: on allocation failure there is nothing to free.
        int* t = nullptr;
        if (failed(cudaMallocManaged(&t, blockSize * sizeof(int))))
            break;

        for (int i = 0; i < blockSize; i++)
            t[i] = 0;

        set1 <<<1, blockSize>>> (t);

        // Launch-configuration errors surface in cudaPeekAtLastError();
        // execution errors surface at the synchronizing call. The ||
        // short-circuits, so we do not synchronize after a failed launch.
        bool hasError = failed(cudaPeekAtLastError()) || failed(cudaDeviceSynchronize());

        if (!hasError)
        {
            for (int i = 0; i < blockSize; i++)
                if (1 != t[i])
                {
                    printf("CUDA error: t[%d] = %d but not 1\n", i, t[i]);
                    hasError = true;
                    break;
                }
        }

        failed(cudaFree(t));
        if (hasError)
            break;
    }

    // The loop stops at the first failing size; the previous one is the maximum.
    blockSize--;
    if(blockSize <= 0)
    {
        printf("CUDA error: block size cannot be 0\n");
        return 1;
    }
    printf("Block maximum size is %d", blockSize);
    return 0;
}
P.S. Please note that the only constraint on block sizing is warp granularity, which is 32 nowadays, so if 0 == yourBlockSize % 32 the warps are used pretty efficiently. The only reason to make blocks bigger than 32 is when the code needs synchronization, as synchronization is available only among threads in a single block, which makes a developer use a single large block instead of many small ones. So running with a higher number of smaller blocks can be even more efficient than running with a lower number of larger blocks.

CUDA-GDB crashes in Kernel

I've been trying to debug my code, as I know something is going wrong in the Kernel, and I've been trying to figure out what specifically. If I try to step into the kernel it seems to completely step over the kernel functions, and will eventually cause an error on quitting:
Single stepping until exit from function dyld_stub_cudaSetupArgument,
which has no line number information.
[Launch of CUDA Kernel 0 (incrementArrayOnDevice<<<(3,1,1),(4,1,1)>>>) on
Device 0]
[Termination of CUDA Kernel 0 (incrementArrayOnDevice<<<(3,1,1),
(4,1,1)>>>) on Device 0]
[Launch of CUDA Kernel 1 (fillinBoth<<<(40,1,1),(1,1,1)>>>) on Device 0]
[Termination of CUDA Kernel 1 (fillinBoth<<<(40,1,1),(1,1,1)>>>) on Device 0]
add (below=0x124400, newtip=0x124430, newfork=0x125ac0) at test.cu:1223
And if I try to break in the Kernel my entire computer crashes and I have to restart it.
I figure there must be something wrong with the way I'm calling the kernel, but I can't figure out what.
The code is rather long, so I'm only including an excerpt of it:
// Per-site kernel: block `blockIdx.x` processes site `idx` of `qset`.
//   qset - device array of sites; each site is treated as 3 longs.
//   max  - number of sites; blocks with idx >= max do nothing.
// The site is copied into the scratch array `qs`; for i = 0..1, every state
// bit set in qs[i] ORs the matching cudaTranslate entry into qs[j] for
// j = i+1..2; the result is then stored back to qset[idx].
// Without that final store the kernel has no observable effect, because the
// host copies qset back after the launch.
// NOTE(review): `qs` is __shared__ and indexed only by blockIdx.x, so this
// presumably expects one thread per block -- confirm against the launch.
__global__ void fillinOne(seqptr qset, long max) {
    int i, j;
    aas aa;
    int idx = blockIdx.x;
    __shared__ long qs[3];
    if(idx < max)
    {
        memcpy(qs, qset[idx], sizeof(long[3]));
        for (i = 0; i <= 1; i++)
        {
            for (aa = ala; (long)aa <= (long)stop; aa = (aas)((long)aa + 1))
            {
                if (((1L << ((long)aa)) & qs[i]) != 0)
                {
                    for (j = i + 1; j <= 2; j++)
                        qs[j] |= cudaTranslate[(long)aa - (long)ala][j - i];
                }
            }
        }
        // Publish the updated site so the host-side copy-back sees it.
        memcpy(qset[idx], qs, sizeof(long[3]));
    }
}
//Kernel for left!= NULL and rt != NULL
void fillin(node *p, node *left, node *rt)
{
cudaError_t err = cudaGetLastError();
size_t stepsize = chars * sizeof(long);
size_t sitesize = chars * sizeof(sitearray);
//int i, j;
if (left == NULL)
{
//copy rt->numsteps into p->numsteps--doesn't actually require CUDA, because no computation to do
memcpy(p->numsteps, rt->numsteps, stepsize);
checkCUDAError("memcpy");
//allocate siteset (array of sitearrays) on device
seqptr qsites; //as in array of qs's
cudaMalloc((void **) &qsites, sitesize);
checkCUDAError("malloc");
//copy rt->siteset into device array (equivalent to memcpy(qs, rs) but for whole array)
cudaMemcpy(qsites, rt->siteset, sitesize, cudaMemcpyHostToDevice);
checkCUDAError("memcpy");
//do loop in device
int block_size = 1; //each site operated on independently
int n_blocks = chars;
fillinOne <<< n_blocks, block_size>>> (qsites, chars);
cudaThreadSynchronize();
//put qset in p->siteset--equivalent to memcpy(p->siteset[m], qs)
cudaMemcpy(p->siteset, qsites, sitesize, cudaMemcpyDeviceToHost);
checkCUDAError("memcpy");
//Cleanup
cudaFree(qsites);
}
If anyone has any ideas at all, please respond! Thanks in advance!
I suppose you have a single card configuration. When you are debugging a cuda kernel and you break inside it you effectively put the display driver in pause. That causes what you think is a crash. If you want to use the cuda-gdb with only one graphics card you must use it in command line mode (don't start X or press ctrl-alt-fn from X).
If you have two cards you must run the code on the card that is not driving the display. Use cudaSetDevice(n).