Unified Memory and Streams in C (CUDA)

I am trying to use streams with CUDA 6 and unified memory in C. My previous stream implementation looked like this:
for (x = 0; x < DSIZE; x += N*2) {
    gpuErrchk(cudaMemcpyAsync(array_d0, array_h+x, N*sizeof(char), cudaMemcpyHostToDevice, stream0));
    gpuErrchk(cudaMemcpyAsync(array_d1, array_h+x+N, N*sizeof(char), cudaMemcpyHostToDevice, stream1));
    gpuErrchk(cudaMemcpyAsync(data_d0, data_h, wrap->size*sizeof(int), cudaMemcpyHostToDevice, stream0));
    gpuErrchk(cudaMemcpyAsync(data_d1, data_h, wrap->size*sizeof(int), cudaMemcpyHostToDevice, stream1));
    searchGPUModified<<<N/128, 128, 0, stream0>>>(data_d0, array_d0, out_d0);
    searchGPUModified<<<N/128, 128, 0, stream1>>>(data_d1, array_d1, out_d1);
    gpuErrchk(cudaMemcpyAsync(out_h+x, out_d0, N*sizeof(int), cudaMemcpyDeviceToHost, stream0));
    gpuErrchk(cudaMemcpyAsync(out_h+x+N, out_d1, N*sizeof(int), cudaMemcpyDeviceToHost, stream1));
}
but I cannot find an example of streams and unified memory using the same technique, where chunks of data are sent to the GPU. I am thus wondering if there is a way to do this?

You should read section J.2.2 of the programming guide (and preferably all of appendix J).
With Unified Memory, memory allocated using cudaMallocManaged is by default attached to all streams ("global"), and we must modify this in order to make effective use of streams, e.g. for compute/copy overlap. We can do this with the cudaStreamAttachMemAsync function as described in section J.2.2.3. By associating each memory "chunk" with a stream in this fashion, the UM subsystem can make intelligent decisions about when to transfer each data item.
The following example demonstrates this:
#include <stdio.h>
#include <string.h>
#include <time.h>

#define DSIZE 1048576
#define DWAIT 100000ULL
#define nTPB 256

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

typedef int mytype;

__global__ void mykernel(mytype *data){
    int idx = threadIdx.x+blockDim.x*blockIdx.x;
    if (idx < DSIZE) data[idx] = 1;
    unsigned long long int tstart = clock64();
    while (clock64() < tstart + DWAIT);
}

int main(){
    mytype *data1, *data2, *data3;
    cudaStream_t stream1, stream2, stream3;
    cudaMallocManaged(&data1, DSIZE*sizeof(mytype));
    cudaMallocManaged(&data2, DSIZE*sizeof(mytype));
    cudaMallocManaged(&data3, DSIZE*sizeof(mytype));
    cudaCheckErrors("cudaMallocManaged fail");
    cudaStreamCreate(&stream1);
    cudaStreamCreate(&stream2);
    cudaStreamCreate(&stream3);
    cudaCheckErrors("cudaStreamCreate fail");
    cudaStreamAttachMemAsync(stream1, data1);
    cudaStreamAttachMemAsync(stream2, data2);
    cudaStreamAttachMemAsync(stream3, data3);
    cudaDeviceSynchronize();
    cudaCheckErrors("cudaStreamAttach fail");
    memset(data1, 0, DSIZE*sizeof(mytype));
    memset(data2, 0, DSIZE*sizeof(mytype));
    memset(data3, 0, DSIZE*sizeof(mytype));
    mykernel<<<(DSIZE+nTPB-1)/nTPB, nTPB, 0, stream1>>>(data1);
    mykernel<<<(DSIZE+nTPB-1)/nTPB, nTPB, 0, stream2>>>(data2);
    mykernel<<<(DSIZE+nTPB-1)/nTPB, nTPB, 0, stream3>>>(data3);
    cudaDeviceSynchronize();
    cudaCheckErrors("kernel fail");
    for (int i = 0; i < DSIZE; i++){
        if (data1[i] != 1) {printf("data1 mismatch at %d, should be: %d, was: %d\n", i, 1, data1[i]); return 1;}
        if (data2[i] != 1) {printf("data2 mismatch at %d, should be: %d, was: %d\n", i, 1, data2[i]); return 1;}
        if (data3[i] != 1) {printf("data3 mismatch at %d, should be: %d, was: %d\n", i, 1, data3[i]); return 1;}
    }
    printf("Success!\n");
    return 0;
}
The above program creates a kernel that runs artificially long using clock64(), so as to give us a simulated opportunity for compute/copy overlap (simulating a compute-intensive kernel). We are launching 3 instances of this kernel, each instance operating on a separate "chunk" of data.
When we profile the above program, the following is seen:
First, note that the 3rd kernel launch is highlighted in yellow, and it begins immediately after the second kernel launch highlighted in purple. The actual cudaLaunch runtime API event that launches this 3rd kernel is indicated in the runtime API line by the mouse pointer, also highlighted in yellow (and is preceded by the cudaLaunch events for the first 2 kernels). Since this launch happens during execution of the first kernel, and there is no intervening "empty space" from that point until the start of the 3rd kernel, we can observe that the transfer of the data for the 3rd kernel launch (i.e. data3) occurred while kernels 1 and 2 were executing. Therefore we have effective overlap of copy and compute. (We could make a similar observation about kernel 2).
Although I haven't shown it here, if we omit the cudaStreamAttachMemAsync lines, the program still compiles and runs correctly, but if we profile it, we observe a different relationship between the cudaLaunch events and the kernels. The overall profile looks similar, and the kernels are executing back to back, but the entire cudaLaunch process now begins and ends before the first kernel begins executing, and there are no cudaLaunch events during the kernel execution. This indicates that (since all the cudaMallocManaged memory is global) all of the data transfers are taking place prior to the first kernel launch. The program has no way to associate a "global" allocation with any particular kernel, so all such allocated memory must be transferred before the first kernel launch (even though that kernel is only using data1).

Related

Where is the boundary between the start and end of the CPU launch and the GPU launch in Nvidia profiling (NVPROF)?

What is the definition of the start and end of a kernel launch on the CPU and on the GPU (the yellow blocks)? Where is the boundary between them?
Please notice that the start, end, and duration of those yellow blocks on the CPU and GPU are different. Why does the CPU invocation of vecAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, n); take that long?
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

// CUDA kernel. Each thread takes care of one element of c
__global__ void vecAdd(double *a, double *b, double *c, int n)
{
    // Get our global thread ID
    int id = blockIdx.x*blockDim.x+threadIdx.x;
    //printf("id = %d \n", id);

    // Make sure we do not go out of bounds
    if (id < n)
        c[id] = a[id] + b[id];
}

int main( int argc, char* argv[] )
{
    // Size of vectors
    int n = 1000000;

    // Host input vectors
    double *h_a;
    double *h_b;
    // Host output vector
    double *h_c;

    // Device input vectors
    double *d_a;
    double *d_b;
    // Device output vector
    double *d_c;

    // Size, in bytes, of each vector
    size_t bytes = n*sizeof(double);

    // Allocate memory for each vector on host
    h_a = (double*)malloc(bytes);
    h_b = (double*)malloc(bytes);
    h_c = (double*)malloc(bytes);

    // Allocate memory for each vector on GPU
    cudaMalloc(&d_a, bytes);
    cudaMalloc(&d_b, bytes);
    cudaMalloc(&d_c, bytes);

    int i;
    // Initialize vectors on host
    for( i = 0; i < n; i++ ) {
        h_a[i] = sin(i)*sin(i);
        h_b[i] = cos(i)*cos(i);
    }

    // Copy host vectors to device
    cudaMemcpy( d_a, h_a, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy( d_b, h_b, bytes, cudaMemcpyHostToDevice);

    int blockSize, gridSize;

    // Number of threads in each thread block
    blockSize = 1024;

    // Number of thread blocks in grid
    gridSize = (int)ceil((float)n/blockSize);

    // Execute the kernel
    vecAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);

    // Copy array back to host
    cudaMemcpy( h_c, d_c, bytes, cudaMemcpyDeviceToHost );

    // Sum up vector c and print result divided by n, this should equal 1 within error
    double sum = 0;
    for(i=0; i<n; i++)
        sum += h_c[i];
    printf("final result: %f\n", sum/n);

    // Release device memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    // Release host memory
    free(h_a);
    free(h_b);
    free(h_c);

    return 0;
}
CPU yellow block:
GPU yellow block:
Note that you mention NVPROF but the pictures you are showing are from nvvp, the visual profiler. nvprof is the command-line profiler.
GPU Kernel launches are asynchronous. That means that the CPU thread launches the kernel but does not wait for the kernel to complete. In fact, the CPU activity is actually placing the kernel in a launch queue - the actual execution of the kernel may be delayed if anything else is happening on the GPU.
So there is no defined relationship between the CPU (API) activity, and the GPU activity with respect to time, except that the CPU kernel launch must obviously precede (at least slightly) the GPU kernel execution.
The CPU (API) yellow block represents the duration of time that the CPU thread spends in a library call into the CUDA Runtime library, to launch the kernel (i.e. place it in the launch queue). This library call activity usually has some time overhead associated with it, in the range of 5-50 microseconds. The start of this period is marked by the start of the call into the library. The end of this period is marked by the time at which the library returns control to your code (i.e. your next line of code after the kernel launch).
The GPU yellow block represents the actual time period during which the kernel was executing on the GPU. The start and end of this yellow block are marked by the start and end of kernel activity on the GPU. The duration here is a function of what the code in your kernel is doing, and how long it takes.
I don't think the exact reason why a GPU kernel launch takes ~5-50 microseconds of CPU time is documented or explained anywhere in an authoritative fashion, and the CUDA runtime is a closed-source library, so you will need to accept that overhead as something over which you have little control. If you design kernels that run for a long time and do a lot of work, this overhead can become insignificant.
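To make the distinction concrete, here is a minimal sketch (my own illustration, not part of the original question or answer; the spin kernel and the cycle count are arbitrary) that times the <<<>>> call itself and then the time until cudaDeviceSynchronize() returns. The first measurement corresponds roughly to the CPU (API) yellow block; the second additionally includes the GPU kernel execution:
#include <stdio.h>
#include <chrono>

__global__ void spin(unsigned long long cycles)
{
    unsigned long long start = clock64();
    while (clock64() - start < cycles);    // busy-wait so the kernel has a measurable duration
}

int main()
{
    // Warm-up launch so one-time initialization cost is not measured
    spin<<<1, 1>>>(0);
    cudaDeviceSynchronize();

    auto t0 = std::chrono::steady_clock::now();
    spin<<<1, 1>>>(100000000ULL);          // returns as soon as the kernel is queued
    auto t1 = std::chrono::steady_clock::now();
    cudaDeviceSynchronize();               // blocks until the kernel has actually finished
    auto t2 = std::chrono::steady_clock::now();

    printf("launch call (CPU) took   %f ms\n",
           std::chrono::duration<double, std::milli>(t1 - t0).count());
    printf("launch + execution took  %f ms\n",
           std::chrono::duration<double, std::milli>(t2 - t0).count());
    return 0;
}
On typical hardware the first number should be in the tens of microseconds while the second reflects the kernel's actual run time.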

Small program that burdens the GPU?

What is the most efficient way to burden a GPU and increase its energy consumption, for testing purposes?
I do want the program to be as small as possible. Is there a specific kernel function that does the job?
Any suggestion for Metal or CUDA would be perfect.
I am sketching a possible solution here. You will need some experiments to maximize the thermal load of your GPU. Generally speaking, data movement is energetically expensive, much more so than computation in modern processors. So shuffling a lot of data will drive up power consumption. At the same time, we want an additive contribution to power consumption from computational units. Multipliers tend to be the largest power hogs; in modern processors we might want to target FMA (fused multiply-add) units.
Various GPUs have low throughput of double-precision math ops, and others have low throughput of half-precision math ops. Therefore we would want to focus on single-precision math for the computational portion of our load. We want to be able to change the ratio of computation to memory activity easily. One approach would be to use the unrolled evaluation of a polynomial with the Horner scheme as the basic building block, using POLY_DEPTH steps. This we repeat REPS times in a loop. Prior to the loop we retrieve source data from global memory, and after the loop terminates we store the result to global memory. By varying REPS we can experiment with various compute/memory balances.
One could further experiment with instruction-level parallelism, data patterns (as the power consumption of multipliers often differs based on bit patterns), and adding PCIe activity by using CUDA streams to achieve overlap of kernel execution and PCIe data transfers. Below I just used some random constants as multiplier data.
Obviously we would want to fill the GPU with lots of threads. For this we may use a fairly small THREADS_PER_BLK value, giving us fine granularity for filling each SM. We might want to choose the number of blocks to be a multiple of the number of SMs to spread the load as evenly as possible, or use a MAX_BLOCKS value that is evenly divisible by common SM counts. How much source and destination memory we should touch is up to experimentation: we can define arrays of LEN elements, where LEN is a multiple of the number of blocks. Lastly, we want to execute the kernel thus defined and configured ITER times to create a continuous load for some time.
Note that as we apply load, the GPU will heat up and this will in turn further increase its power draw. To achieve maximum thermal load it will be necessary to run the load-generating app for 5 minutes or more. Note further that the GPU power management may dynamically reduce clock frequencies and voltages to reduce power consumption, and the power cap may kick in before you reach the thermal limit. Depending on the GPU you may be able to set a power cap higher than the one used by default with the nvidia-smi utility.
The program below keeps my Quadro P2000 pegged at the power cap, with a GPU load of 98% and a memory controller load of 83%-86%, as reported by TechPowerUp's GPU-Z utility. It will certainly require adjustments for other GPUs.
#include <stdlib.h>
#include <stdio.h>

#define THREADS_PER_BLK (128)
#define MAX_BLOCKS      (65520)
#define LEN             (MAX_BLOCKS * 1024)
#define POLY_DEPTH      (30)
#define REPS            (2)
#define ITER            (100000)

// Macro to catch CUDA errors in CUDA runtime calls
#define CUDA_SAFE_CALL(call)                                          \
    do {                                                              \
        cudaError_t err = call;                                       \
        if (cudaSuccess != err) {                                     \
            fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n", \
                     __FILE__, __LINE__, cudaGetErrorString(err) );   \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

// Macro to catch CUDA errors in kernel launches
#define CHECK_LAUNCH_ERROR()                                          \
    do {                                                              \
        /* Check synchronous errors, i.e. pre-launch */               \
        cudaError_t err = cudaGetLastError();                         \
        if (cudaSuccess != err) {                                     \
            fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n", \
                     __FILE__, __LINE__, cudaGetErrorString(err) );   \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
        /* Check asynchronous errors, i.e. kernel failed (ULF) */     \
        err = cudaDeviceSynchronize();                                \
        if (cudaSuccess != err) {                                     \
            fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n", \
                     __FILE__, __LINE__, cudaGetErrorString( err) );  \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

__global__ void burn (const float * __restrict__ src,
                      float * __restrict__ dst, int len)
{
    int stride = gridDim.x * blockDim.x;
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    for (int i = tid; i < len; i += stride) {
        float p = src[i] + 1.0;
        float q = src[i] + 3.0f;
        for (int k = 0; k < REPS; k++) {
#pragma unroll POLY_DEPTH
            for (int j = 0; j < POLY_DEPTH; j++) {
                p = fmaf (p, 0.68073987f, 0.8947237f);
                q = fmaf (q, 0.54639739f, 0.9587058f);
            }
        }
        dst[i] = p + q;
    }
}

int main (int argc, char *argv[])
{
    float *d_a, *d_b;

    /* Allocate memory on device */
    CUDA_SAFE_CALL (cudaMalloc((void**)&d_a, sizeof(d_a[0]) * LEN));
    CUDA_SAFE_CALL (cudaMalloc((void**)&d_b, sizeof(d_b[0]) * LEN));

    /* Initialize device memory */
    CUDA_SAFE_CALL (cudaMemset(d_a, 0x00, sizeof(d_a[0]) * LEN)); // zero
    CUDA_SAFE_CALL (cudaMemset(d_b, 0xff, sizeof(d_b[0]) * LEN)); // NaN

    /* Compute execution configuration */
    dim3 dimBlock(THREADS_PER_BLK);
    int threadBlocks = (LEN + (dimBlock.x - 1)) / dimBlock.x;
    if (threadBlocks > MAX_BLOCKS) threadBlocks = MAX_BLOCKS;
    dim3 dimGrid(threadBlocks);

    printf ("burn: using %d threads per block, %d blocks, %f GB\n",
            dimBlock.x, dimGrid.x, 2e-9*LEN*sizeof(d_a[0]));

    for (int k = 0; k < ITER; k++) {
        burn<<<dimGrid,dimBlock>>>(d_a, d_b, LEN);
        CHECK_LAUNCH_ERROR();
    }

    CUDA_SAFE_CALL (cudaFree(d_a));
    CUDA_SAFE_CALL (cudaFree(d_b));
    return EXIT_SUCCESS;
}

cudaGetLastError. Which kernel execution raised it?

I have implemented a pipeline where many kernels are launched in a specific stream. The kernels are enqueued into the stream and executed when the scheduler decides it’s best.
In my code, after every kernel enqueue, I check whether there is any error by calling cudaGetLastError which, according to the documentation, "returns the last error from a runtime call. This call may also return error codes from previous asynchronous launches". Thus, if the kernel has only been enqueued, not executed, I understand that the error returned refers only to whether the kernel was enqueued correctly (parameter checking, grid and block size, shared memory, etc.).
My problem is: I enqueue many different kernels without waiting for each kernel's execution to finish. Imagine now that I have a bug in one of my kernels (let's call it Kernel1) which causes an illegal memory access (for instance). If I check cudaGetLastError right after enqueuing it, the return value is success because it was correctly enqueued. So my CPU thread moves on and keeps enqueuing kernels to the stream. At some point Kernel1 is executed and raises the illegal memory access. Thus, the next time I check cudaGetLastError I will get the CUDA error but, by that time, the CPU thread is at another point further along in the code. Consequently, I know there has been an error, but I have no idea which kernel raised it.
An option is to synchronize (block the CPU thread) until the execution of every kernel has finished and then check the error code, but this is not an option for performance reasons.
The question is, is there any way we can query which kernel raised a given error code returned by cudaGetLastError? If not, which is in your opinion the best way to handle this?
There is an environment variable CUDA_LAUNCH_BLOCKING which you can use to serialize kernel execution of an otherwise asynchronous sequence of kernel launches. This should allow you to isolate the kernel instance which is causing an error, either via internal error checking in your host code, or via an external tool like cuda-memcheck.
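As a rough illustration (this is my own minimal sketch, not code from the question; goodKernel and badKernel are made-up kernels, with badKernel deliberately writing through a null pointer): in a normal run, the per-launch cudaGetLastError() checks below only catch enqueue errors, but if the process is started with CUDA_LAUNCH_BLOCKING=1 each launch completes before returning, so the check immediately after the faulty kernel should be the one that reports the error.
#include <cstdio>

__global__ void goodKernel(int *out) { out[threadIdx.x] = threadIdx.x; }
__global__ void badKernel(int *out)  { int *p = nullptr; p[threadIdx.x] = 0; }  // deliberate illegal access

#define CHECK(label)                                                  \
    do {                                                              \
        cudaError_t e = cudaGetLastError();                           \
        if (e != cudaSuccess)                                         \
            printf("%s: %s\n", label, cudaGetErrorString(e));         \
    } while (0)

int main()
{
    int *d;
    cudaMalloc(&d, 256 * sizeof(int));

    goodKernel<<<1, 256>>>(d);
    CHECK("goodKernel");    // with CUDA_LAUNCH_BLOCKING=1 this also covers goodKernel's execution
    badKernel<<<1, 256>>>(d);
    CHECK("badKernel");     // with CUDA_LAUNCH_BLOCKING=1 the illegal access should be reported here
    cudaDeviceSynchronize();
    CHECK("after sync");    // in a normal (asynchronous) run the error only shows up around here

    cudaFree(d);
    return 0;
}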
I have tested 3 different options:
Set the CUDA_LAUNCH_BLOCKING environment variable to 1. This forces the CPU thread to block until kernel execution has finished, so we can check after each execution whether there has been an error, catching the exact point of failure. This has an obvious performance impact, but it may help to bound the bug in a production environment without having to make any change on the client side.
Distribute the production code compiled with the flag -lineinfo and run the code again with cuda-memcheck. This has no performance impact and we do not need to make any change on the client side either. However, we have to execute the binary in a slightly different environment, which in some cases, like a service running GPU tasks, can be difficult to achieve.
Insert a callback after each kernel call. In the userData parameter, include a unique id for the kernel call, and possibly some information on the parameters used. This can be distributed directly in a production environment, it always gives us the exact point of failure, and we don't need to make any change on the client side. However, the performance impact of this approach is huge. Apparently, the callback functions are processed by a driver thread, which causes the performance impact. I wrote some code to test it:
#include <cuda_runtime.h>
#include <vector>
#include <chrono>
#include <iostream>

#define BLOC_SIZE       1024
#define NUM_ELEMENTS    BLOC_SIZE * 32
#define NUM_ITERATIONS  500

__global__ void KernelCopy(const unsigned int *input, unsigned int *result) {
    unsigned int pos = blockIdx.x * BLOC_SIZE + threadIdx.x;
    result[pos] = input[pos];
}

void CUDART_CB myStreamCallback(cudaStream_t stream, cudaError_t status, void *data) {
    if (status) {
        std::cout << "Error: " << cudaGetErrorString(status) << "-->";
    }
}

#define CUDA_CHECK_LAST_ERROR cudaStreamAddCallback(stream, myStreamCallback, nullptr, 0)

int main() {
    cudaError_t c_ret;
    c_ret = cudaSetDevice(0);
    if (c_ret != cudaSuccess) {
        return -1;
    }

    unsigned int *input;
    c_ret = cudaMalloc((void **)&input, NUM_ELEMENTS * sizeof(unsigned int));
    if (c_ret != cudaSuccess) {
        return -1;
    }

    std::vector<unsigned int> h_input(NUM_ELEMENTS);
    for (unsigned int i = 0; i < NUM_ELEMENTS; i++) {
        h_input[i] = i;
    }

    c_ret = cudaMemcpy(input, h_input.data(), NUM_ELEMENTS * sizeof(unsigned int), cudaMemcpyKind::cudaMemcpyHostToDevice);
    if (c_ret != cudaSuccess) {
        return -1;
    }

    unsigned int *result;
    c_ret = cudaMalloc((void **)&result, NUM_ELEMENTS * sizeof(unsigned int));
    if (c_ret != cudaSuccess) {
        return -1;
    }

    cudaStream_t stream;
    c_ret = cudaStreamCreate(&stream);
    if (c_ret != cudaSuccess) {
        return -1;
    }

    std::chrono::steady_clock::time_point start;
    std::chrono::steady_clock::time_point end;

    start = std::chrono::steady_clock::now();
    for (unsigned int i = 0; i < 500; i++) {
        dim3 grid(NUM_ELEMENTS / BLOC_SIZE);
        KernelCopy <<< grid, BLOC_SIZE, 0, stream >>> (input, result);
        CUDA_CHECK_LAST_ERROR;
    }
    cudaStreamSynchronize(stream);
    end = std::chrono::steady_clock::now();
    std::cout << "With callback took (ms): " << std::chrono::duration<float, std::milli>(end - start).count() << '\n';

    start = std::chrono::steady_clock::now();
    for (unsigned int i = 0; i < 500; i++) {
        dim3 grid(NUM_ELEMENTS / BLOC_SIZE);
        KernelCopy <<< grid, BLOC_SIZE, 0, stream >>> (input, result);
        c_ret = cudaGetLastError();
        if (c_ret) {
            std::cout << "Error: " << cudaGetErrorString(c_ret) << "-->";
        }
    }
    cudaStreamSynchronize(stream);
    end = std::chrono::steady_clock::now();
    std::cout << "Without callback took (ms): " << std::chrono::duration<float, std::milli>(end - start).count() << '\n';

    c_ret = cudaStreamDestroy(stream);
    if (c_ret != cudaSuccess) {
        return -1;
    }

    c_ret = cudaFree(result);
    if (c_ret != cudaSuccess) {
        return -1;
    }

    c_ret = cudaFree(input);
    if (c_ret != cudaSuccess) {
        return -1;
    }

    return 0;
}
Output:
With callback took (ms): 47.8729
Without callback took (ms): 1.9317
(CUDA 9.2, Windows 10, Visual Studio 2015, Nvidia Tesla P4)
To me, in a production environment, the only valid approach is number 2.

CUDA mapped memory: device -> host writes are not visible on host

What I am trying to do is modify, from the device, a variable that resides in mapped memory, which should cause the main program to exit.
But instead of this, the main program keeps spinning on the while (var == 0); line. I don't know how the new value could be flushed out so that it becomes visible on the host side too.
By the way, the variable is declared volatile everywhere, and I have tried using the __threadfence_system() function with no success.
The host -> device direction works well.
System: Windows 7 x64, driver 358.50, GTX 560
Here is the piece of code that I can't get working:
static void handleCUDAError(cudaError_t err, const char *file, int line)
{
    if (err != cudaSuccess) {
        printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
        exit(EXIT_FAILURE);
    }
}

#define CUDA_ERROR_CHECK(err) (handleCUDAError(err, __FILE__, __LINE__ ))

__global__ void echoKernel(volatile int* semaphore)
{
    *semaphore = 1;
    __threadfence_system();
}

int main()
{
    CUDA_ERROR_CHECK(cudaSetDevice(0));
    CUDA_ERROR_CHECK(cudaSetDeviceFlags(cudaDeviceMapHost));

    volatile int var = 0;
    volatile int *devptr;
    CUDA_ERROR_CHECK(cudaHostRegister((int*)&var, sizeof(int), cudaHostRegisterMapped));
    CUDA_ERROR_CHECK(cudaHostGetDevicePointer(&devptr, (int*)&var, 0));

    echoKernel<<<1, 1>>>(devptr);
    while (var == 0);

    CUDA_ERROR_CHECK(cudaDeviceSynchronize());
    CUDA_ERROR_CHECK(cudaHostUnregister((int*)&var));
    CUDA_ERROR_CHECK(cudaDeviceReset());
    return 0;
}
When I run your code on Linux, it runs as-is without issue.
However, on Windows there is a problem around WDDM command batching. In effect, your kernel launch is not actually being delivered to the GPU before you enter the while-loop, so the loop hangs.
The WDDM command queue is a queue of commands that will eventually go to the GPU device. Various events will cause this queue to be "flushed" and the contents to be delivered as a "batch" of commands to the GPU.
Various cuda runtime API calls may effectively force the "flushing" of the command queue, such as cudaDeviceSynchronize() or cudaMemcpy(). However after the kernel launch, you are not issuing any runtime API calls before entering your while-loop. As a result, in this scenario it seems that the kernel call is getting "stuck" in the queue and never "flushed".
You can work around this in a variety of ways, for example by recording an event after the launch of the kernel and then querying the status of that event. This will have the effect of flushing the queue, which will launch the kernel.
Here's an example modification of your code that works for me:
#include <stdio.h>

static void handleCUDAError(cudaError_t err, const char *file, int line)
{
    if (err != cudaSuccess) {
        printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
        exit(EXIT_FAILURE);
    }
}

#define CUDA_ERROR_CHECK(err) (handleCUDAError(err, __FILE__, __LINE__ ))

__global__ void echoKernel(volatile int* semaphore)
{
    *semaphore = 1;
    __threadfence_system();
}

int main()
{
    CUDA_ERROR_CHECK(cudaSetDevice(0));
    CUDA_ERROR_CHECK(cudaSetDeviceFlags(cudaDeviceMapHost));

    volatile int var = 0;
    volatile int *devptr;
    CUDA_ERROR_CHECK(cudaHostRegister((int*)&var, sizeof(int), cudaHostRegisterMapped));
    CUDA_ERROR_CHECK(cudaHostGetDevicePointer(&devptr, (int*)&var, 0));

    cudaEvent_t my_event;
    CUDA_ERROR_CHECK(cudaEventCreate(&my_event));

    echoKernel<<<1, 1>>>(devptr);
    CUDA_ERROR_CHECK(cudaEventRecord(my_event));
    cudaEventQuery(my_event);

    while (var == 0);

    CUDA_ERROR_CHECK(cudaDeviceSynchronize());
    CUDA_ERROR_CHECK(cudaHostUnregister((int*)&var));
    CUDA_ERROR_CHECK(cudaDeviceReset());
    return 0;
}
Tested on CUDA 7.5, Driver 358.50, Win7 x64 release project, GTX460M.
Note that we don't wrap the cudaEventQuery call in a standard error checker, because the expected behavior for it is to return a non-zero status when the event has not been completed yet.

printf() in my CUDA kernel doesn't produce any output

I have added some printf() statements to my CUDA program:
__device__ __global__ void Kernel(float *, float *, int);

void DeviceFunc(float *temp_h, int numvar, float *temp1_h)
{
    .....
    //Kernel call
    printf("calling kernel\n");
    Kernel<<<dimGrid, dimBlock>>>(a_d, b_d, numvar);
    printf("kernel called\n");
    ....
}

int main(int argc, char **argv)
{
    ....
    printf("beforeDeviceFunc\n\n");
    DeviceFunc(a_h, numvar, b_h); //Showing the data
    printf("after DeviceFunc\n\n");
    ....
}
Also in the Kernel.cu, I wrote:
#include <cuda.h>
#include <stdio.h>

__device__ __global__ void Kernel(float *a_d, float *b_d, int size)
{
    int idx = threadIdx.x;
    int idy = threadIdx.y;

    //Allocating memory in the shared memory of the device
    __shared__ float temp[16][16];

    //Copying the data to the shared memory
    temp[idy][idx] = a_d[(idy * (size+1)) + idx];
    printf("idx=%d, idy=%d, size=%d", idx, idy, size);
    ....
}
Then I compile using -arch=sm_20 like this:
nvcc -c -arch sm_20 main.cu
nvcc -c -arch sm_20 Kernel.cu
nvcc -arch sm_20 main.o Kernel.o -o main
Now when I run the program, I see:
beforeDeviceFunc
calling kernel
kernel called
after DeviceFunc
So the printf() inside the kernel is not printed. How can I fix that?
printf() output is only displayed if the kernel finishes successfully, so check the return codes of all CUDA function calls and make sure no errors are reported.
Furthermore printf() output is only displayed at certain points in the program. Appendix B.32.2 of the Programming Guide lists these as
Kernel launch via <<<>>> or cuLaunchKernel() (at the start of the launch, and if the CUDA_LAUNCH_BLOCKING environment variable is set to 1, at the end of the launch as well),
Synchronization via cudaDeviceSynchronize(), cuCtxSynchronize(), cudaStreamSynchronize(), cuStreamSynchronize(), cudaEventSynchronize(), or cuEventSynchronize(),
Memory copies via any blocking version of cudaMemcpy*() or cuMemcpy*(),
Module loading/unloading via cuModuleLoad() or cuModuleUnload(),
Context destruction via cudaDeviceReset() or cuCtxDestroy().
Prior to executing a stream callback added by cudaStreamAddCallback() or cuStreamAddCallback().
To check whether this is your problem, put the following code after your kernel invocation:
{
    cudaError_t cudaerr = cudaDeviceSynchronize();
    if (cudaerr != cudaSuccess)
        printf("kernel launch failed with error \"%s\".\n",
               cudaGetErrorString(cudaerr));
}
You should then see either the output of your kernel or an error message.
More conveniently, cuda-memcheck will automatically check all return codes for you if you run your executable under it. While you should always check for errors anyway, this comes in handy when resolving concrete issues.
I had the same error just now, and decreasing the block size to 512 helped. According to the documentation, the maximum block size can be either 512 or 1024, depending on the GPU.
I have written a simple test that showed that my GTX 1070 has a maximum block size of 1024. UPD: you can check whether your kernel has ever executed by using cudaError_t cudaPeekAtLastError(), which returns cudaSuccess if the kernel has started successfully, and only after that is it worth calling cudaError_t cudaDeviceSynchronize().
Testing block size of 1023
Testing block size of 1024
Testing block size of 1025
CUDA error: invalid configuration argument
Block maximum size is 1024
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
__global__
void set1(int* t)
{
t[threadIdx.x] = 1;
}
inline bool failed(cudaError_t error)
{
if (cudaSuccess == error)
return false;
fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(error));
return true;
}
int main()
{
int blockSize;
for (blockSize = 1; blockSize < 1 << 12; blockSize++)
{
printf("Testing block size of %d\n", blockSize);
int* t;
if(failed(cudaMallocManaged(&t, blockSize * sizeof(int))))
{
failed(cudaFree(t));
break;
}
for (int i = 0; i < blockSize; i++)
t[0] = 0;
set1 <<<1, blockSize>>> (t);
if (failed(cudaPeekAtLastError()))
{
failed(cudaFree(t));
break;
}
if (failed(cudaDeviceSynchronize()))
{
failed(cudaFree(t));
break;
}
bool hasError = false;
for (int i = 0; i < blockSize; i++)
if (1 != t[i])
{
printf("CUDA error: t[%d] = %d but not 1\n", i, t[i]);
hasError = true;
break;
}
if (hasError)
{
failed(cudaFree(t));
break;
}
failed(cudaFree(t));
}
blockSize--;
if(blockSize <= 0)
{
printf("CUDA error: block size cannot be 0\n");
return 1;
}
printf("Block maximum size is %d", blockSize);
return 0;
}
P.S. Please note that the main constraint in block sizing is warp granularity, which is 32 nowadays, so if yourBlockSize % 32 == 0 the warps are used fairly efficiently. The main reason to make blocks bigger than 32 is when the code needs synchronization, as synchronization is only available among threads within a single block, which forces a developer to use a single large block instead of many small ones. So running a higher number of smaller blocks can be even more efficient than running a lower number of larger blocks.
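To make that last point concrete, here is a tiny sketch (my own illustration, not code from the answer above) that runs the same amount of work either as a few 1024-thread blocks or as many 128-thread blocks; both configurations keep the block size a multiple of the warp size of 32, and the second gives the scheduler finer-grained pieces to distribute across SMs:
#include <cstdio>

__global__ void fill(int *t, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) t[i] = 1;    // each thread handles one element
}

int main()
{
    const int n = 1 << 20;
    int *t;
    cudaMallocManaged(&t, n * sizeof(int));

    fill<<<(n + 1023) / 1024, 1024>>>(t, n);   // few large blocks
    cudaDeviceSynchronize();

    fill<<<(n + 127) / 128, 128>>>(t, n);      // many small blocks, same total work
    cudaDeviceSynchronize();

    printf("t[%d] = %d\n", n - 1, t[n - 1]);
    cudaFree(t);
    return 0;
}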