CUDA C thread synchronization with printf or other functions

I have a problem with thread IDs while a block executes.
I would like the output to be the sentence "My temporary string is printed via GPU!", but as you can see (in the attached photo earlier) the sentence is displayed in the wrong order and I don't know how to fix it.
Code:
__global__ void Print(const char* const __string, const size_t* const loop_repeat)
{
    int id_x = threadIdx.x + blockIdx.x * blockDim.x;
    while (id_x < static_cast<int>(*loop_repeat))
    {
        printf("%c", __string[id_x]);
        __syncthreads();
        id_x += blockDim.x * gridDim.x;
    }
}
int main()
{
    const char* my_string = "My temporary string is printed via GPU!";
    size_t temp{};
    temp = Get_String_Length(my_string); // get the string length
    // GPU MEMORY ALLOCATION
    size_t* my_string_length{};
    cudaMalloc((void**)&my_string_length, sizeof(size_t));
    // COPY VALUE FROM CPU(RAM) TO GPU
    cudaMemcpy(my_string_length, &temp, sizeof(size_t), HostToDevice);
    char* string_GPU{};
    cudaMalloc((void**)&string_GPU, temp * sizeof(char));
    // COPY VALUE FROM CPU(RAM) TO GPU
    cudaMemcpy(string_GPU, my_string, temp * sizeof(char), HostToDevice);
    dim3 grid_size(1);
    dim3 block_size(temp);
    Print <<< grid_size, temp >>> (string_GPU, my_string_length);
    cudaError_t final_error = cudaDeviceSynchronize(); // for synchronization, e.g. Hello_World then printf
    if (final_error == cudaSuccess)
    {
        printf("%cKernel executed successfully with code: %d !%\n", NEW_LINE, final_error);
    }
    else
    {
        printf("%cKernel executed with code error: %d !\n", NEW_LINE, final_error);
    }
    cudaFree(my_string_length);
    cudaFree(string_GPU);
    return 0;
}
I will be grateful for any help given.

The main issue here is that you are expecting the threads or warps to execute in some predictable order. In fact, they do not, and your usage of __syncthreads() doesn't fix or address this issue.
If you want the warps to execute in a predictable order (not recommended), you would need to impose that order yourself. Here is an example that demonstrates this for this very simple code. It is not extensible to larger strings without modification, and this method will completely break down if you introduce more than one threadblock.
$ cat t1543.cu
#include <stdio.h>
#include <stdlib.h>
__global__ void Print(const char* const __string, const size_t* const loop_repeat)
{
    int id_x = threadIdx.x + blockIdx.x * blockDim.x;
    int warp_ID = threadIdx.x >> 5;
    while (id_x < static_cast<int>(*loop_repeat))
    {
        if (warp_ID == 0)
            printf("%c", __string[id_x]);
        __syncthreads();
        if (warp_ID == 1)
            printf("%c", __string[id_x]);
        __syncthreads();
        id_x += blockDim.x * gridDim.x;
    }
}
int main()
{
    const char* my_string = "My temporary string is printed via GPU!";
    size_t temp;
    temp = 40; // get the string length
    // GPU MEMORY ALLOCATION
    size_t* my_string_length;
    cudaMalloc((void**)&my_string_length, sizeof(size_t));
    // COPY VALUE FROM CPU(RAM) TO GPU
    cudaMemcpy(my_string_length, &temp, sizeof(size_t), cudaMemcpyHostToDevice);
    char* string_GPU;
    cudaMalloc((void**)&string_GPU, temp * sizeof(char));
    // COPY VALUE FROM CPU(RAM) TO GPU
    cudaMemcpy(string_GPU, my_string, temp * sizeof(char), cudaMemcpyHostToDevice);
    dim3 grid_size(1);
    dim3 block_size(temp);
    Print <<< grid_size, temp >>> (string_GPU, my_string_length);
    cudaError_t final_error = cudaDeviceSynchronize(); // for synchronization, e.g. Hello_World then printf
    if (final_error == cudaSuccess)
    {
        printf("\nKernel executed successfully with code: %d !%\n", final_error);
    }
    else
    {
        printf("\nKernel executed with code error: %d !\n", final_error);
    }
    cudaFree(my_string_length);
    cudaFree(string_GPU);
    return 0;
}
$ nvcc -o t1543 t1543.cu
$ cuda-memcheck ./t1543
========= CUDA-MEMCHECK
My temporary string is printed via GPU!
Kernel executed successfully with code: 0 !%
========= ERROR SUMMARY: 0 errors
$
Note that I'm not suggesting the above is good coding style. It's provided for understanding of the issue. Even this code is relying on the idea that the threads within a warp will call the printf function in a predictable order, which is not guaranteed by the CUDA programming model. So the code is really still broken.
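If the goal is simply to print the string in correct order, an approach that doesn't depend on scheduling at all is to let a single thread do the printing. A minimal sketch of that idea (my addition, reusing the kernel signature from the question):
__global__ void PrintOrdered(const char* const __string, const size_t* const loop_repeat)
{
    // One thread prints the entire string, so character order is
    // guaranteed by the sequential loop rather than by thread scheduling.
    if (threadIdx.x == 0 && blockIdx.x == 0)
    {
        for (size_t i = 0; i < *loop_repeat; ++i)
            printf("%c", __string[i]);
    }
}
This throws away parallelism, of course, but ordered output is inherently a serial requirement.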

This happens because the multiprocessor creates, manages, schedules, and executes threads in groups of 32 parallel threads called warps, as described in the CUDA Programming Guide. So the first 32 threads cover "My temporary string is printed v" and the remaining threads cover "ia GPU!". It seems that the scheduler put the latter warp before the first one in the execution order.
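If you want to see the warp decomposition for yourself, a small sketch like the following (a hypothetical kernel of my own, not from the question) prints each thread's warp index:
__global__ void Show_Warps()
{
    // warpSize is a built-in device constant (32 on current hardware)
    printf("thread %d is in warp %d\n", threadIdx.x, threadIdx.x / warpSize);
}
Launched as Show_Warps<<<1, 40>>>() followed by cudaDeviceSynchronize(), the first 32 threads report warp 0 and the remaining 8 report warp 1, matching the split seen above.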

Related

Can we get cuda kernel function name in cudaLaunchKernel?

I was trying to insert some code into cudaLaunchKernel and need to store the kernel's function name, but I cannot find a direct API that would help me get it. I have considered CUPTI, but it uses a callback function to get the information, so I cannot change the behavior of the kernel launch (or would need heavy inter-process communication, which is ugly).
Is there any way I can get the function name in cudaLaunchKernel (maybe via the function pointer)?
An example is as follows.
cudaKernelLaunch(...) {
    kernel_id = getKernelNameBySomeMethods(); // it's what I want..
    send_to_other_processes(kernel_name);
    return ::cudaKernelLaunch(...);
}
// for other process
receive_kernel_name_from_other_process;
store_it;
Edit: An identifier is also OK. I may send the ID to another process to store, so I need to be able to distinguish different CUDA kernels.
There are no APIs to do this, either public or private, AFAIK. The compiler emits a lot of static host-side boilerplate to perform the runtime API magic we take for granted; it isn't done by the runtime library itself.
However, the nature of that boilerplate means you can build your own lookup table pretty easily -- some hacking over a lunch break got me this partial proof of concept which does what I think it is you want:
#include <cstdio>
#include <map>
#include <string>
#include <iostream>

__global__ void kernel_1(float *in, float *out, int N)
{
    int tidx = threadIdx.x + blockDim.x * blockIdx.x;
    if (tidx == 0) printf("Running kernel_1\n");
    if (tidx < N) out[tidx] = in[tidx];
}

__global__ void kernel_2(float *in, float *out, int N)
{
    int tidx = threadIdx.x + blockDim.x * blockIdx.x;
    if (tidx == 0) printf("Running kernel_2\n");
    if (tidx < N) out[tidx] = 2.f * in[tidx];
}

__global__ void kernel_3(float *in, float *out, int N)
{
    int tidx = threadIdx.x + blockDim.x * blockIdx.x;
    if (tidx == 0) printf("Running kernel_3\n");
    if (tidx < N) out[tidx] = 3.f * in[tidx];
}

void notakernel(float *in, float *out, int N)
{
    printf("Something bad happened\n");
}

std::map<void*, std::string> ktable = {
    { (void*)kernel_1, "kernel_1" },
    { (void*)kernel_2, "kernel_2" },
    { (void*)kernel_3, "kernel_3" } };

cudaError_t MyLaunchKernel(void* func, dim3 gridDim, dim3 blockDim, void** args, size_t sharedMem, cudaStream_t stream)
{
    auto it = ktable.find(func);
    if (it != ktable.end()) {
        std::cout << "Received request to call " << it->second << std::endl;
    } else {
        std::cout << "Received request to call unknown function!" << std::endl;
    }
    return cudaLaunchKernel(func, gridDim, blockDim, args, sharedMem, stream);
}

int main()
{
    int N = 100;
    float* a; cudaMalloc<float>(&a, N * sizeof(float));
    float* b; cudaMalloc<float>(&b, N * sizeof(float));
    void* args[] = { (void*)&a, (void*)&b, (void*)&N };
    MyLaunchKernel((void*)kernel_1, dim3(1), dim3(1), args, 0, NULL);
    cudaDeviceSynchronize();
    MyLaunchKernel((void*)kernel_2, dim3(1), dim3(1), args, 0, NULL);
    cudaDeviceSynchronize();
    MyLaunchKernel((void*)kernel_3, dim3(1), dim3(1), args, 0, NULL);
    cudaDeviceSynchronize();
    MyLaunchKernel((void*)notakernel, dim3(1), dim3(1), args, 0, NULL);
    cudaDeviceSynchronize();
    return 0;
}
which appears to work:
$ nvcc -std=c++11 -arch=sm_52 -o lookup lookup.cu
$ cuda-memcheck ./lookup
========= CUDA-MEMCHECK
Received request to call kernel_1
Running kernel_1
Received request to call kernel_2
Running kernel_2
Received request to call kernel_3
Running kernel_3
Received request to call unknown function!
========= Program hit cudaErrorInvalidDeviceFunction (error 98) due to "invalid device function" on CUDA API call to cudaLaunchKernel.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x3b9803]
========= Host Frame:./lookup [0x4ca95]
========= Host Frame:./lookup [0x746c]
========= Host Frame:./lookup [0x769f]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xe7) [0x21b97]
========= Host Frame:./lookup [0x722a]
=========
========= ERROR SUMMARY: 1 error
Obviously things need to be a bit more complex in a complete implementation of your use case: you would need a reverse lookup to go from name/ID back to pointer for another caller, and if you have multiple source files compiled separately, you would need to concatenate per-file lists to construct the working table at runtime. But it is important to remember that the function pointers you are passing around are actually host pointers, not device pointers (thanks to the runtime API magic), so the cost and complexity of the runtime setup is trivial; pre-baked C++ standard library containers, algorithms, and function adapters do most of the heavy lifting.
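As a sketch of that reverse direction (my addition, assuming the ktable defined above), a second map keyed by name is enough:
// Hypothetical reverse map: kernel name -> host-side function pointer
std::map<std::string, void*> rtable;

void build_reverse_table()
{
    for (const auto& kv : ktable)
        rtable[kv.second] = kv.first;
}

cudaError_t MyLaunchKernelByName(const std::string& name, dim3 gridDim, dim3 blockDim,
                                 void** args, size_t sharedMem, cudaStream_t stream)
{
    auto it = rtable.find(name);
    if (it == rtable.end())
        return cudaErrorInvalidDeviceFunction; // name was never registered
    return cudaLaunchKernel(it->second, gridDim, blockDim, args, sharedMem, stream);
}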

Can an array of shorts be passed into a CUDA kernel

I have written a CUDA kernel, and when I copy an array of shorts to device memory and then pass it to the kernel, it doesn't work. The simplified code below expresses my issue.
void KernelCaller()
{
    const int size = 1;
    short hostArray[size]{41};
    short* devPointer;
    cudaMalloc((void**)&devPointer, size * sizeof(short));
    cudaMemcpy(devPointer, hostArray, size * sizeof(short), cudaMemcpyHostToDevice);
    cudaKernel<<<1,1>>>(devPointer);
}

__global__
void cudaKernel(short* arr)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    short val = arr[idx];
}
At this point the value of val is 1063714857, and what I want it to be is 41.
I assume the issue is that 41 in hex is 0x29, and the value I have is 0x3F670029, so it looks like it read too many bytes, because the 0x29 is at the beginning. When I switch to an array of floats it works perfectly, but I was trying to save memory. Does CUDA not allow an array of shorts?
I have implemented your code and I get the expected output.
Here's the code
#include <stdio.h>

__global__ void cudaKernel(short* arr)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    short val = arr[idx];
#if __CUDA_ARCH__ >= 200
    printf("Inside kernel %d\n", val);
#endif
    arr[idx] = val;
}

int main()
{
    const int size = 1;
    short hostArray[size]{41};
    printf("Before kernel call %d\n", hostArray[0]);
    short *devPointer;
    cudaMalloc((void**)&devPointer, size * sizeof(short));
    cudaMemcpy(devPointer, hostArray, size * sizeof(short), cudaMemcpyHostToDevice);
    cudaKernel<<<1,1>>>(devPointer);
    cudaMemcpy(hostArray, devPointer, size * sizeof(short), cudaMemcpyDeviceToHost);
    printf("After kernel call %d\n", hostArray[0]);
    cudaFree(devPointer);
    return 0;
}
And the output is
Before kernel call 41
Inside kernel 41
After kernel call 41
So yes, we can pass an array of shorts into a CUDA kernel.

Does bool variable in kernel need to be synchronized

I have a kernel consisting of a for loop that searches through an array for a specific int value. I'm using a grid of 256-thread blocks to do this. However, when one thread finds the value, I want to let the other threads know so they can exit. Currently I'm using a boolean flag, but I'm not sure it's working properly. My concern is synchronization.
__device__ bool found;

__global__
void search()
{
    for (int i = threadIdx.x; i < 1000000; i += stride)
    {
        if (found == true)
        {
            break;
        }
        else if (arr[i] == x)
        {
            found = true;
            break;
        }
    }
}

int main()
{
    bool flag = false;
    cudaMemcpyToSymbol(found, &flag, sizeof(bool), 0, cudaMemcpyHostToDevice);
}
As pointed out in the comments, you can probably achieve what you want by declaring the global device flag to be volatile, which will inhibit caching, and by using a memory fence function. There really isn't a global synchronization primitive that would do what you want, other than the new grid synchronization mechanism introduced in CUDA 9 and new hardware, but that probably isn't necessary in this case. Turning your pseudocode into a toy example:
#include <iostream>
#include <thrust/device_vector.h>

__device__ volatile bool found;
__device__ volatile size_t idx;

template<bool docheck>
__global__
void search(const int* arr, int x, size_t N)
{
    size_t i = threadIdx.x + blockIdx.x * blockDim.x;
    size_t stride = blockDim.x * gridDim.x;
    for (; (i < N) && (!found); i += stride)
    {
        if (arr[i] == x)
        {
            if (docheck) found = true;
            idx = i;
            __threadfence();
            break;
        }
    }
}

int main()
{
    const size_t N = 1 << 24;
    const size_t findidx = 280270;
    const int findval = 0xdeadbeef;
    thrust::device_vector<int> data(N, 1);
    data[findidx] = findval;
    bool flag = false;
    size_t zero = 0;
    {
        cudaMemcpyToSymbol(found, &flag, sizeof(bool));
        cudaMemcpyToSymbol(idx, &zero, sizeof(size_t));
        int blocks, threads;
        cudaOccupancyMaxPotentialBlockSize(&blocks, &threads, search<false>);
        search<false><<<blocks, threads>>>(thrust::raw_pointer_cast(data.data()), findval, N);
        cudaDeviceSynchronize();
        size_t result = 0;
        cudaMemcpyFromSymbol(&result, idx, sizeof(size_t));
        std::cout << "result = " << result << std::endl;
    }
    {
        cudaMemcpyToSymbol(found, &flag, sizeof(bool));
        cudaMemcpyToSymbol(idx, &zero, sizeof(size_t));
        int blocks, threads;
        cudaOccupancyMaxPotentialBlockSize(&blocks, &threads, search<true>);
        search<true><<<blocks, threads>>>(thrust::raw_pointer_cast(data.data()), findval, N);
        cudaDeviceSynchronize();
        size_t result = 0;
        cudaMemcpyFromSymbol(&result, idx, sizeof(size_t));
        std::cout << "result = " << result << std::endl;
    }
    return 0;
}
and profiling it gives the following:
$ nvcc -arch=sm_52 -o notify notify.cu
$ nvprof ./notify
==3916== NVPROF is profiling process 3916, command: ./notify
result = 280270
result = 280270
==3916== Profiling application: ./notify
==3916== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 78.00% 1.6773ms 1 1.6773ms 1.6773ms 1.6773ms void search<bool=0>(int const *, int, unsigned long)
19.93% 428.63us 1 428.63us 428.63us 428.63us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__uninitialized_fill::functor<thrust::device_ptr<int>, int>, unsigned long>, thrust::cuda_cub::__uninitialized_fill::functor<thrust::device_ptr<int>, int>, unsigned long>(thrust::device_ptr<int>, int)
1.82% 39.199us 1 39.199us 39.199us 39.199us void search<bool=1>(int const *, int, unsigned long)
As you can see, the version which sets the found flag completes the search in about 40 microseconds, whereas the version which does not set the flag takes 1.7 milliseconds. Given that the kernel is run with the maximum number of resident blocks in both cases, we can conclude that the early-exit mechanism worked correctly and that running blocks detected that the required value had been found.
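One caveat about the toy example: if several threads find a match before observing the flag, they can all write idx, and the reported index is whichever write lands last. If exactly one winner is required, the flag can be claimed with an atomic compare-and-swap instead. A sketch of that variant (my modification, not part of the original answer):
__device__ int found_once;          // 0 = not found, 1 = claimed
__device__ volatile size_t idx_once;

__global__ void search_once(const int* arr, int x, size_t N)
{
    size_t i = threadIdx.x + blockIdx.x * blockDim.x;
    size_t stride = blockDim.x * gridDim.x;
    // The early-exit read of found_once is best-effort; correctness of
    // the recorded index comes from the atomicCAS below.
    for (; (i < N) && (found_once == 0); i += stride)
    {
        if (arr[i] == x)
        {
            // atomicCAS returns the old value, so only the first thread
            // to flip 0 -> 1 gets to record its index.
            if (atomicCAS(&found_once, 0, 1) == 0)
            {
                idx_once = i;
                __threadfence();
            }
            break;
        }
    }
}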

Passing variables from parent kernel to child kernel in dynamic parallelism in cuda

I am trying to use dynamic parallelism in CUDA. I am in a situation where the parent kernel has a variable that needs to be passed to the child kernel for further computation. I have gone through the resources on the web
here
and they mention that local variables cannot be passed to the child kernel, and they describe ways to pass variables. I have tried to pass the variable as follows:
#include <stdio.h>
#include <cuda.h>

__global__ void square(float *a, int N)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (N == 10)
    {
        a[idx] = a[idx] * a[idx];
    }
}

// Kernel that executes on the CUDA device
__global__ void first(float *arr, int N)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int n = N; // this value of n can be changed locally and need to be passed
    printf("%d\n", n);
    cudaMalloc((void **) &n, sizeof(int));
    square <<< 1, N >>> (arr, n);
}

// main routine that executes on the host
int main(void)
{
    float *a_h, *a_d; // Pointer to host & device arrays
    const int N = 10; // Number of elements in arrays
    size_t size = N * sizeof(float);
    a_h = (float *)malloc(size); // Allocate array on host
    cudaMalloc((void **) &a_d, size); // Allocate array on device
    // Initialize host array and copy it to CUDA device
    for (int i = 0; i < N; i++) a_h[i] = (float)i;
    cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);
    // Do calculation on device:
    first <<< 1, 1 >>> (a_d, N);
    //cudaThreadSynchronize();
    // Retrieve result from device and store it in host array
    cudaMemcpy(a_h, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
    // Print results
    for (int i = 0; i < N; i++) printf("%d %f\n", i, a_h[i]);
    // Cleanup
    free(a_h); cudaFree(a_d);
}
but the value is not passed from the parent to the child kernel. How can I pass the value of the local variable? Is there any way to do so?
This operation is not appropriate:
int n=N; // this value of n can be changed locally and need to be passed
cudaMalloc((void **) &n, sizeof(int)); // illegal
It is not appropriate in host code, nor in device code. n is an int variable. You are not supposed to assign a pointer to it. When you attempt to do so in a 64-bit environment, you are attempting to write a 64-bit pointer on top of a 32-bit int quantity. It will not work.
It's not clear why you would need it anyway. n is an integer parameter presumably specifying the size of your arr array of float. You don't need to allocate anything on top of it.
If you had run this code with cuda-memcheck, you could easily have discovered that error. You can also do proper CUDA error checking in device code, in almost exactly the same fashion as you do it in host code.
When I comment out that cudaMalloc line in the first kernel, your code runs correctly for me.
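For reference, here is what the fixed parent kernel might look like (a sketch of the change described above, with the cudaMalloc removed). Kernel arguments are passed by value, so the current value of n is simply copied to the child grid at launch time:
__global__ void first(float *arr, int N)
{
    int n = N;   // local copy; may be modified before the child launch
    printf("%d\n", n);
    square <<< 1, N >>> (arr, n);   // n is passed by value to the child
    // Device-side error checking works much like on the host:
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) printf("child launch failed: %d\n", (int)err);
}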

Counting occurrences of specific events in CUDA kernels

Problem
I am trying to find the best way to count how many times my program ends up in specific branches of my CUDA kernels. The idea is that some events should almost never happen, but since the data processed by the GPU is produced by a numerical optimization solver, there may be situations where ill-defined cases become more common. I therefore want to be able to track/monitor these phenomena over multiple simulations to compute some global statistics later.
Possible idea
The most straightforward way to do this may be to use a structure dedicated to monitoring such occurrences. Then, when entering a monitored branch, we increment the associated counter using atomicAdd. At the end of the simulation, we copy the counters back to the host and store them for some future statistics processing.
In my case, the cost of using atomicAdd should not matter much, since I should not be entering those branches very often. Still, I may want to monitor some of the more common branches later on, so what would be a better approach then? Since this is just for monitoring, I do not want the overhead to be too significant.
I guess I could also have one monitoring structure per block and do a sum at the end, since it should not use much global memory anyway (1 unsigned int per monitored branch).
Code example
#include <iostream>
#include <time.h>
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h> // rand, srand, exit
#include <string.h> // memset

#define CUDA_CHECK_ERROR() __cuda_check_errors(__FILE__, __LINE__)
#define CUDA_SAFE_CALL(err) __cuda_safe_call(err, __FILE__, __LINE__)

inline void __cuda_check_errors(const char *filename, const int line_number)
{
    cudaError err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
    {
        printf("CUDA error %i at %s:%i: %s\n",
               err, filename, line_number, cudaGetErrorString(err));
        exit(-1);
    }
}

inline void __cuda_safe_call(cudaError err, const char *filename, const int line_number)
{
    if (err != cudaSuccess)
    {
        printf("CUDA error %i at %s:%i: %s\n",
               err, filename, line_number, cudaGetErrorString(err));
        exit(-1);
    }
}

struct Stats
{
    unsigned int even;
};

__global__ void test_kernel(int* A, int* B, Stats* stats)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int res = A[tid] + (int)tid;
    if (res % 2 == 0)
        atomicAdd(&(stats->even), 1);
    B[tid] = res;
}

int get_random_int(int min, int max)
{
    return min + (rand() % (int)(max - min + 1));
}

void print_array(int* ar, unsigned int n)
{
    for (unsigned int i = 0; i < n; ++i)
        std::cout << ar[i] << " ";
    std::cout << std::endl;
}

void print_stats(Stats* s)
{
    std::cout << "even: " << s->even << std::endl;
}

int main()
{
    // vector size
    const unsigned int N = 10;
    // device vectors
    int *d_A, *d_B;
    Stats *d_stats;
    // host vectors
    int *h_A, *h_B;
    Stats *h_stats;
    // allocate device memory
    CUDA_SAFE_CALL(cudaMalloc(&d_A, N * sizeof(int)));
    CUDA_SAFE_CALL(cudaMalloc(&d_B, N * sizeof(int)));
    CUDA_SAFE_CALL(cudaMalloc(&d_stats, sizeof(Stats)));
    // allocate host memory
    h_A = new int[N];
    h_B = new int[N];
    h_stats = new Stats;
    // initialize host data
    srand(time(NULL));
    for (unsigned int i = 0; i < N; ++i)
    {
        h_A[i] = get_random_int(0, 10);
        h_B[i] = 0;
    }
    memset(h_stats, 0, sizeof(Stats));
    // copy data to the device
    CUDA_SAFE_CALL(cudaMemcpy(d_A, h_A, N * sizeof(int), cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy(d_stats, h_stats, sizeof(Stats), cudaMemcpyHostToDevice));
    // launch kernel (block_size is default-constructed, i.e. (1,1,1):
    // N blocks of one thread each)
    dim3 grid_size, block_size;
    grid_size.x = N;
    test_kernel<<<grid_size, block_size>>>(d_A, d_B, d_stats);
    // copy result back to host
    CUDA_SAFE_CALL(cudaMemcpy(h_B, d_B, N * sizeof(int), cudaMemcpyDeviceToHost));
    CUDA_SAFE_CALL(cudaMemcpy(h_stats, d_stats, sizeof(Stats), cudaMemcpyDeviceToHost));
    print_array(h_B, N);
    print_stats(h_stats);
    // free device memory
    CUDA_SAFE_CALL(cudaFree(d_A));
    CUDA_SAFE_CALL(cudaFree(d_B));
    CUDA_SAFE_CALL(cudaFree(d_stats));
    // free host memory
    delete [] h_A;
    delete [] h_B;
    delete h_stats;
}
Hardware/software information
The solution I am looking for should work for CC >= 2.0 devices and CUDA >= 5.0.
The atomicAdd is one possibility and I would probably go that route. If you do not use the result of the atomicAdd function call, the compiler will emit a reduction operation such as RED.E.ADD. Reduction is very fast as long as there are not many conflicts happening (I actually use it sometimes even when I do not need the operation to be atomic, because it can be quicker than loading a value from global memory, doing an arithmetic operation, and saving it back to global memory).
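To illustrate the distinction (my example, not from the original answer):
__global__ void count_evens(const int* A, unsigned int* ctr, int N)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N && (A[tid] % 2 == 0))
    {
        // Return value discarded: the compiler is free to emit a
        // fire-and-forget reduction instruction (e.g. RED.E.ADD).
        atomicAdd(ctr, 1u);
        // By contrast, using the returned old value forces a full
        // atomic that round-trips the previous contents:
        //     unsigned int old = atomicAdd(ctr, 1u);
    }
}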
The second option you have is to use a profiler counter and use the profiler to analyze the result. Please see the Profiler Counter Function section of the CUDA Programming Guide for more details.
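As a sketch of that second option, the device runtime provides __prof_trigger(), which increments a per-multiprocessor hardware counter (indices 0 to 7, reported by the profiler as prof_trigger_00 through prof_trigger_07). Applied to the kernel from the question, the monitored branch would become something like:
__global__ void test_kernel(int* A, int* B, Stats* stats)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int res = A[tid] + (int)tid;
    if (res % 2 == 0)
        __prof_trigger(0); // bumps prof_trigger_00, once per warp in which
                           // the call executes (not once per thread)
    B[tid] = res;
}
Note that because __prof_trigger() counts per warp rather than per thread, the reported numbers need interpreting accordingly.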