CUDA:how to call a host function from a global function [duplicate] - cuda

This question already has answers here:
I get an error when I try to use printf() in a kernel
(3 answers)
Closed 5 years ago.
1 #include<stdlib.h>
2 #include<stdio.h>
3 #include"cuda.h"
4 __global__ void malloctest()
5 {
6 char * ptr=(char *)malloc(123);
7 printf("thread %d got a pointer:%p\n",threadIdx.x,ptr);
8 free(ptr);
9 }
10 int main()
11 {
12 cudaDeviceSetLimit(cudaLimitMallocHeapSize,128*1024*1024);
13 malloctest<<<1,5>>>();
14 cudaDeviceSynchronize();
15 return 0;
16 }
nvcc warning : The 'compute_10' and 'sm_10' architectures are deprecated, and may be removed in a future release.
malloctest.cu(6) (col. 9): error: calling a __host__ function("malloc") from a __global__ function("malloctest") is not allowed
malloctest.cu(7): error: calling a __host__ function("printf") from a __global__ function("malloctest") is not allowed
malloctest.cu(8): error: calling a __host__ function("free") from a __global__ function("malloctest") is not allowed
How can I make malloc(), printf() and free() available inside the kernel? Thanks.

Update: I found the answer. I need to build for a newer GPU architecture (compute capability 3.0 here) instead of the default compute_10/sm_10 target:
nvcc malloctest.cu -o 1 -gencode=arch=compute_30,code=\"sm_30,compute_30\"

Related

cuda-gdb set breakpoint at another line of __global__ function

Problem
I have a __global__ function in CUDA that I want to debug with cuda-gdb, but when I try to set a breakpoint inside the kernel, it is placed on a different line. Here is my code:
// include stuff
// ...
#define blockNUM 1
#define threadNUM 1
// ...
int main() {
// ... (define d_R_0, d_R_1, d_R_2, and d_H)
cudaSetDevice(0);
dim3 threadsPerBlock(threadNUM);
dim3 numBlocks(blockNUM);
decode<<<numBlocks,threadsPerBlock>>>(d_R_0, d_R_1, d_R_2, d_H);
// ... (other codes go here)
}
__global__ void decode(uint *d_R_0, uint *d_R_1, uint *d_R_2, uint *d_H) {
uint idx = (blockIdx.x * blockDim.x + threadIdx.x); // --> I want to set breakpoint here! (line 197) <--
// ... (implementation of the function)
} // --> But the cuda-gdb set the breakpoint here! (line 288) <--
And here is the cuda-gdb output:
(cuda-gdb) break 197
Breakpoint 1 at 0xa7f6: file /home/matin/main.cu, line 288.
Extra Info
I compile main.cu using this command:
$ nvcc -g -G main.cu
I also have the same problem with the A First CUDA C Program snippet on Nvidia's website
Specs:
GNU gdb (GDB) 10.1
NVIDIA (R) CUDA Debugger: 11.5 release
CUDA Version: 12.0
Ubuntu Version: 22.04
After updating my Nvidia drivers, I've encountered the same issue. I hope that this solution works for you too.
You have to set the breakpoint using the kernel function name. For example for the First CUDA C Program you should follow these steps:
Set a breakpoint using the kernel function name
(cuda-gdb) b saxpy
Breakpoint 1 at 0x338: file /home/nahid/temp/saxpy.cu, line 5.
Run to reach the breakpoint.
(cuda-gdb) r
Finally, set the breakpoint to the line that you want!
(cuda-gdb) b 7
Breakpoint 2 at 0xfffe3258e10: file saxpy.cu, line 7

Cuda same kernel, but different results with __constant__ [duplicate]

This question already has answers here:
CUDA 5.0 namespaces for constant memory variable usage
(1 answer)
why do we need cudaDeviceSynchronize(); in kernels with device-printf?
(1 answer)
Issue regarding data of constant memory in CUDA
(2 answers)
CUDA: cudaMemcpyToSymbol is not copying data
(1 answer)
Closed 12 months ago.
How can cudaMemcpyToSymbol produce the following result?
// head.h
#include <stdio.h>
__constant__ float const_mem[1];
__global__ void k0(); // defined in separate.cu
__global__ void k1(); // defined in main.cu
//separate.cu
#include "head.h"
__global__ void k0() {
printf("%f\n", const_mem[0]);
}
//main.cu
#include "head.h"
__global__ void k1() {
printf("%f\n", const_mem[0]);
}
int main() {
float arr[] = {5};
cudaMemcpyToSymbol(const_mem, arr, sizeof(float));
k0<<<1,1>>>();
k1<<<1,1>>>();
}
Compilation : nvcc main.cu separate.cu
Output of sudo nvprof ./a.out (running ./a.out on its own prints literally nothing):
0.000000
5.000000
That means the kernel written in the other translation unit is not seeing the data copied into const_mem... but how is that possible?

CUDA: Forgetting kernel launch configuration does not result in NVCC compiler warning or error

When I try to call a CUDA kernel (a __global__ function) using a function pointer, everything appears to work just fine. However, if I forget to provide the launch configuration when calling the kernel, NVCC emits no error or warning: the program compiles, but then crashes when I run it.
__global__ void bar(float x) { printf("foo: %f\n", x); }
typedef void(*FuncPtr)(float);
void invoker(FuncPtr func)
{
func<<<1, 1>>>(1.0);
}
invoker(bar);
cudaDeviceSynchronize();
Compile and run the above. Everything will work just fine. Then, remove the kernel's launch configuration (i.e., <<<1, 1>>>). The code will compile just fine but it will crash when you try to run it.
Any idea what is going on? Is this a bug, or am I not supposed to pass around pointers to __global__ functions?
CUDA version: 8.0
OS version: Debian (Testing repo)
GPU: NVIDIA GeForce 750M
If we take a slightly more complex version of your repro, and look at the code emitted by the CUDA toolchain front-end, it becomes possible to see what is happening:
#include <cstdio>
__global__ void bar_func(float x) { printf("foo: %f\n", x); }
typedef void(*FuncPtr)(float);
// Host-side helper that invokes a kernel in several ways so that the code
// emitted by the front-end for each form can be compared.
// passed_func: host function pointer through which the kernel is called.
void invoker(FuncPtr passed_func)
{
#ifdef NVCC_FAILS_HERE
bar_func(1.0); // direct call by name without <<<>>>: rejected at compile time
#endif
bar_func<<<1,1>>>(1.0); // configured launch by name
passed_func(2.0); // unconfigured call through the pointer: compiles, fails at run time
passed_func<<<1,1>>>(3.0); // configured launch through the pointer
}
So let's compile it a couple of ways:
$ nvcc -arch=sm_52 -c -DNVCC_FAILS_HERE invoker.cu
invoker.cu(10): error: a __global__ function call must be configured
i.e. the front-end can detect that bar_func is a global function and requires launch parameters. Another attempt:
$ nvcc -arch=sm_52 -c -keep invoker.cu
As you note, this produces no compile error. Let's look at what happened:
void bar_func(float x) ;
# 5 "invoker.cu"
typedef void (*FuncPtr)(float);
# 7 "invoker.cu"
void invoker(FuncPtr passed_func)
# 8 "invoker.cu"
{
# 12 "invoker.cu"
(cudaConfigureCall(1, 1)) ? (void)0 : (bar_func)((1.0));
# 13 "invoker.cu"
passed_func((2.0));
# 14 "invoker.cu"
(cudaConfigureCall(1, 1)) ? (void)0 : passed_func((3.0));
# 15 "invoker.cu"
}
The standard kernel invocation syntax <<<>>> gets expanded into an inline call to cudaConfigureCall, and then a host wrapper function is called. The host wrapper has the API internals required to launch the kernel:
void bar_func( float __cuda_0)
# 3 "invoker.cu"
{__device_stub__Z8bar_funcf( __cuda_0); }
void __device_stub__Z8bar_funcf(float __par0)
{
if (cudaSetupArgument((void *)(char *)&__par0, sizeof(__par0), (size_t)0UL) != cudaSuccess) return;
{ volatile static char *__f __attribute__((unused)); __f = ((char *)((void ( *)(float))bar_func));
(void)cudaLaunch(((char *)((void ( *)(float))bar_func)));
};
}
So the stub only handles arguments and launches the kernel via cudaLaunch. It doesn't handle the launch configuration.
The underlying reason for the crash (actually an undetected runtime API error) is that the kernel launch happens without a prior configuration. Obviously this happens because the CUDA front end (and C++ for that matter) can't do pointer introspection at compile time and detect that your function pointer is a stub function for calling a kernel.
I think the only way to describe this is a "limitation" of the runtime API and compiler. I wouldn't say what you are doing is wrong, but I would probably be using the driver API and explicitly managing the kernel launch myself in such a situation.

error: calling a __host__ function from a __global__ function is not allowed

I have written a CUDA function for dense sampling of feature points, but I am getting compile errors. My CUDA code is given below. I am using the CUDA 7.5 toolkit.
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include "opencv2/imgproc/imgproc.hpp"
#include <opencv2/gpu/gpu.hpp>
#include <opencv2/opencv.hpp>
using namespace cv::gpu;
using namespace cv;
using namespace std;
// Kernel: each thread takes the point at its global index, floors its
// coordinates, divides them by min_distance and increments the counter of
// the resulting cell (y*width + x).
// NOTE(review): this is the code the compile errors quoted below refer to.
// std::vector::operator[], the cv::Point_ constructor and cvFloor are host
// functions and cannot be called from a __global__ function.
// NOTE(review): d_x_max and d_y_max are unused because the range check is
// commented out, and there is no guard on i.
__global__ void densefun(std::vector<int>* d_counters,std::vector<Point2f>* d_points,int d_x_max,int d_y_max,int width, int min_distance)
{
// Global thread index; used directly as the point index.
int i = blockDim.x * blockIdx.x + threadIdx.x;
Point2f point = (*d_points)[i];
int x = cvFloor(point.x);
int y = cvFloor(point.y);
//if(x >= d_x_max || y >= d_y_max)
//continue;
// Convert pixel coordinates to grid-cell coordinates.
x /= min_distance;
y /= min_distance;
// NOTE(review): plain (non-atomic) increment; presumably races if two
// points land in the same cell -- confirm.
(*d_counters)[y*width+x]++;
}
// Host wrapper: allocates two device buffers, copies `points` to the device,
// launches densefun with one block of points.size() threads, copies the
// result back into `counters` and frees the device buffers.
// NOTE(review): no CUDA return code is checked anywhere in this function.
void dense(std::vector<int>& counters,std::vector<Point2f>& points,int x_max,int y_max,int width)
{
std::vector<int>* d_counters;
std::vector<Point2f>* d_points;
int min_distance=5;
// NOTE(review): size() is an element count, while cudaMalloc/cudaMemcpy
// take a size in bytes.
cudaMalloc(&d_counters,counters.size());
cudaMalloc(&d_points,points.size());
// NOTE(review): &points is the address of the std::vector object itself,
// not of its element storage.
cudaMemcpy(d_points, &points, points.size(), cudaMemcpyHostToDevice);
// NOTE(review): d_counters is not written on the device before the kernel
// increments it.
densefun<<<1,points.size()>>>(d_counters,d_points,x_max,y_max,width,min_distance);
// NOTE(review): &counters is likewise the address of the vector object.
cudaMemcpy(&counters, d_counters, counters.size(), cudaMemcpyDeviceToHost);
cudaFree(d_counters);
cudaFree(d_points);
}
Output:
/home/supriya/Desktop/5Dec/CalculateFV_merged_gpu_old/build/denseCuda.cu(28):
error: calling a host function("cv::Point_ ::Point_") from
a global function("densefun") is not allowed
/home/supriya/Desktop/5Dec/CalculateFV_merged_gpu_old/build/denseCuda.cu(28):
error: calling a host function("std::vector ,
std::allocator > > ::operator []") from a global
function("densefun") is not allowed
/home/supriya/Desktop/5Dec/CalculateFV_merged_gpu_old/build/denseCuda.cu(29)
(col. 7): error: calling a host function("cvFloor") from a
global function("densefun") is not allowed
/home/supriya/Desktop/5Dec/CalculateFV_merged_gpu_old/build/denseCuda.cu(30)
(col. 7): error: calling a host function("cvFloor") from a
global function("densefun") is not allowed
/home/supriya/Desktop/5Dec/CalculateFV_merged_gpu_old/build/denseCuda.cu(35):
error: calling a host function("std::vector > ::operator []") from a global
function("densefun") is not allowed
5 errors detected in the compilation of
"/tmp/tmpxft_00000c85_00000000-7_denseCuda.cpp1.ii". CMake Error at
testVideo_generated_denseCuda.cu.o.cmake:260 (message): Error
generating file
/home/supriya/Desktop/5Dec/CalculateFV_merged_gpu_old/build/CMakeFiles/testVideo.dir//./testVideo_generated_denseCuda.cu.o
CMakeFiles/testVideo.dir/build.make:392: recipe for target
'CMakeFiles/testVideo.dir/./testVideo_generated_denseCuda.cu.o' failed
make[2]: *
[CMakeFiles/testVideo.dir/./testVideo_generated_denseCuda.cu.o] Error
1 CMakeFiles/Makefile2:130: recipe for target
'CMakeFiles/testVideo.dir/all' failed make[1]: *
[CMakeFiles/testVideo.dir/all] Error 2 Makefile:76: recipe for target
'all' failed make: *** [all] Error 2
You cannot use C++ standard library, OpenCV or any other non-CUDA specific library inside a CUDA kernel.
Instead of std::vector you need to use raw pointers to arrays allocated on the device; instead of Point2f, the CUDA vector type float2; instead of cvFloor, the __device__ function floorf(); and so on.

Update project from older CUDA version

In my older CUDA project I had the globals:
__device__ uint8_t dev_intersect
__constant__ uint8_t dev_flags
... and used them this way:
cudaGetSymbolAddress((void**)&ptr_dev_intersect,"dev_intersect")
cudaMemcpyToSymbol("dev_flags",&flags,sizeof(flags))
Now, since CUDA 5.0 (and newer) the symbols must be passed directly (without string), so I define the globals this way:
__device__ uint8_t *dev_intersect
__constant__ uint8_t *dev_flags
...and call the functions this way:
cudaGetSymbolAddress((void**)&ptr_dev_intersect,dev_intersect)
cudaMemcpyToSymbol(dev_flags,&flags,sizeof(flags))
Am I doing it right so far? I'm asking you, because when I update the code, I start getting other errors, which makes me kinda suspicious. Thanks for any help.
Switching from a POD variable to a pointer is probably not what you want.
If you didn't make changes elsewhere in your code to account for that difference, I would expect things to break.
To update your cuda function calls, leave your variables as-is:
__device__ uint8_t dev_intersect;
__constant__ uint8_t dev_flags;
And just drop the quotes from your cuda API functions that use those variables:
cudaGetSymbolAddress((void**)&ptr_dev_intersect,dev_intersect);
cudaMemcpyToSymbol(dev_flags,&flags,sizeof(flags));
Here is a complete worked example:
$ cat t524.cu
#include <stdio.h>
typedef unsigned char uint8_t;
__device__ uint8_t dev_intersect;
__constant__ uint8_t dev_flags;
// Prints the byte that d1_ptr points to and the value of the __constant__
// variable dev_flags, each formatted with %c.
__global__ void mykernel(uint8_t *d1_ptr){
printf("data 1 = %c\n", *d1_ptr);
printf("dev_flags = %c\n", dev_flags);
}
// Writes 'X' to dev_flags and 'Y' to dev_intersect from the host, then
// launches mykernel with the device address of dev_intersect.
// NOTE(review): the CUDA return codes are not checked.
int main(){
uint8_t *ptr_dev_intersect;
uint8_t flags = 'X';
uint8_t dev_intersect_data = 'Y';
// The symbols are passed directly, not as quoted names.
cudaGetSymbolAddress((void**)&ptr_dev_intersect,dev_intersect);
cudaMemcpyToSymbol(dev_flags,&flags,sizeof(flags));
cudaMemcpyToSymbol(dev_intersect,&dev_intersect_data,sizeof(dev_intersect_data));
mykernel<<<1,1>>>(ptr_dev_intersect);
// Block until the kernel has finished before returning.
cudaDeviceSynchronize();
return 0;
}
$ nvcc -arch=sm_20 -o t524 t524.cu
$ cuda-memcheck ./t524
========= CUDA-MEMCHECK
data 1 = Y
dev_flags = X
========= ERROR SUMMARY: 0 errors
$