CUDA_ERROR_INVALID_IMAGE during cuModuleLoad - cuda

I've created a very simple kernel (can be found here) which I successfully compile using
"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\bin\nvcc.exe" --cl-version 2012 -ccbin "C:\Program Files (x86)\Microsoft Visual Studio 11.0\VC\bin" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\include" -cudart static -cubin temp.cu
and subsequently use the following code to load the kernel in
// Initialise the driver API. `err` is reassigned by every call below and is
// never inspected anywhere in this snippet.
CUresult err = cuInit(0);
CUdevice device;
// Fetch the handle for device ordinal 0.
err = cuDeviceGet(&device, 0);
CUcontext ctx;
// Create a context on that device with flags = 0.
err = cuCtxCreate(&ctx, 0, device);
CUmodule module;
// Path of the compiled module: <dir>\temp.cubin
string path = string(dir) + "\\temp.cubin";
// This is the call reported to return CUDA_ERROR_INVALID_IMAGE.
err = cuModuleLoad(&module, path.c_str());
// Release the context; the return value is discarded.
cuCtxDetach(ctx);
Unfortunately, during cuModuleLoad I get a result of CUDA_ERROR_INVALID_IMAGE. Can someone tell me why this could be happening? The kernel's valid and compiles without issues.

The CUDA_ERROR_INVALID_IMAGE error should only be returned by cuModuleLoad when the module file is invalid. If it is missing or contains an architecture mismatch you should probably see a CUDA_ERROR_FILE_NOT_FOUND or CUDA_ERROR_INVALID_SOURCE error. You haven't given us enough details or code to say for certain what is happening, but in principle at least, the API code you have should work.
To show how this should work, consider the following working example on Linux with CUDA 5.5:
First your kernel:
#include <cmath>
using namespace std;
// Clamp `value` to the byte range [0, 255] and return it as a float.
//
// The parameter is a float (it used to be unsigned char): callers pass a float
// expression, and narrowing it to unsigned char *before* clamping made the
// clamp a no-op and left out-of-range inputs with an undefined conversion.
// Existing calls that pass an unsigned char still compile via promotion.
__device__ __inline__ float trim(float value)
{
    return fminf(255.0f, fmaxf(value, 0.0f));
}
__constant__ char z = 1;
// Writes one 4-byte pixel into `img` per thread.
//   img - output buffer; each thread stores bytes img[tid*4+0 .. tid*4+3]
//   a   - nine coefficients a[0..8]; each colour channel is a quadratic in the
//         __constant__ value `z` built from three consecutive coefficients
// There is no bounds check: the launch configuration must match the size of `img`.
__global__ void kernel(unsigned char* img, const float* a)
{
// The block index is used as the x coordinate, the thread index as the y coordinate.
int ix = blockIdx.x;
int iy = threadIdx.x;
// NOTE(review): the row pitch here is blockDim.x although ix ranges over
// blocks - confirm this matches the intended image layout.
int tid = iy*blockDim.x + ix;
// x and y are computed but not used by the code below.
float x = (float)ix / blockDim.x;
float y = (float)iy / gridDim.x;
//placeholder
// Each channel: evaluate the quadratic, scale by 255, pass through trim().
img[tid*4+0] = trim((a[0]*z*z+a[1]*z+a[2]) * 255.0f);
img[tid*4+1] = trim((a[3]*z*z+a[4]*z+a[5]) * 255.0f);
img[tid*4+2] = trim((a[6]*z*z+a[7]*z+a[8]) * 255.0f);
// Fourth byte is always 255.
img[tid*4+3] = 255;
}
Then a simple program to load the cubin into a context at runtime:
#include <cuda.h>
#include <string>
#include <iostream>
#define Errchk(ans) { DrvAssert((ans), __FILE__, __LINE__); }
// Reports the outcome of a driver API call on std::cout.
//   code - result returned by the driver call
//   file - source file of the call site
//   line - source line of the call site
// Success prints "Success: <file>#<line>" and returns; any other result prints
// "Error: <code> <file>#<line>" and terminates the process with `code` as the
// exit status.
inline void DrvAssert( CUresult code, const char *file, int line)
{
    const bool succeeded = (code == CUDA_SUCCESS);
    if (succeeded) {
        std::cout << "Success: " << file << "#" << line << std::endl;
        return;
    }
    std::cout << "Error: " << code << " " << file << "#" << line << std::endl;
    exit(code);
}
// Loads "qkernel.cubin" into a freshly created context on device 0.
// Every set-up step goes through Errchk, which prints its outcome and exits on
// failure. Returns 0 on success, 1 if the context could not be destroyed.
int main(void)
{
    Errchk( cuInit(0) );
    CUdevice device;
    Errchk( cuDeviceGet(&device, 0) );
    CUcontext ctx;
    Errchk( cuCtxCreate(&ctx, 0, device) );
    CUmodule module;
    std::string path = "qkernel.cubin";
    Errchk( cuModuleLoad(&module, path.c_str()) );

    // cuCtxDetach is deprecated; cuCtxDestroy is its replacement. The result is
    // checked by hand (silent on success) so the teardown adds no extra
    // "Success" line to the program's output.
    const CUresult teardown = cuCtxDestroy(ctx);
    if (teardown != CUDA_SUCCESS) {
        std::cout << "Error: " << teardown << " while destroying the context" << std::endl;
        return 1;
    }
    return 0;
}
Build the cubin for the architecture of the device present in the host (a GTX670 in this case):
$ nvcc -arch=sm_30 -Xptxas="-v" --cubin qkernel.cu
ptxas info : 11 bytes gmem, 1 bytes cmem[3]
ptxas info : Compiling entry function '_Z6kernelPhPKf' for 'sm_30'
ptxas info : Function properties for _Z6kernelPhPKf
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 10 registers, 336 bytes cmem[0]
and the host program:
$ nvcc -o qexe qmain.cc -lcuda
then run:
$ ./qexe
Success: qmain.cc#18
Success: qmain.cc#20
Success: qmain.cc#22
Success: qmain.cc#26
The module code loads. If I delete the cubin and run again, I see this:
$ rm qkernel.cubin
$ ./qexe
Success: qmain.cc#18
Success: qmain.cc#20
Success: qmain.cc#22
Error: 301 qmain.cc#26
If I compile for an incompatible architecture, I see this:
$ nvcc -arch=sm_10 -Xptxas="-v" --cubin qkernel.cu
ptxas info : 0 bytes gmem, 1 bytes cmem[0]
ptxas info : Compiling entry function '_Z6kernelPhPKf' for 'sm_10'
ptxas info : Used 5 registers, 32 bytes smem, 4 bytes cmem[1]
$ ./qexe
Success: qmain.cc#18
Success: qmain.cc#20
Success: qmain.cc#22
Error: 300 qmain.cc#26
If I compile to an object file, not a cubin, I see this:
$ nvcc -arch=sm_30 -Xptxas="-v" -c -o qkernel.cubin qkernel.cu
ptxas info : 11 bytes gmem, 1 bytes cmem[3]
ptxas info : Compiling entry function '_Z6kernelPhPKf' for 'sm_30'
ptxas info : Function properties for _Z6kernelPhPKf
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 10 registers, 336 bytes cmem[0]
$ ./qexe
Success: qmain.cc#18
Success: qmain.cc#20
Success: qmain.cc#22
Error: 200 qmain.cc#26
This is the only way I can get the code to emit a CUDA_ERROR_INVALID_IMAGE error. All I can suggest is to try my code and recipe and see if you can get it to work.

This also happens when the cubin and the host application are compiled for different machine types - for example 32-bit vs 64-bit.
If you have a 32-bit application, add --machine 32 to the nvcc parameters and it will be fine.

Related

Is it possible to get assertion info from within a CUDA kernel?

Is there any way to get a kernel assert message/line number back from a kernel failure?
That is, if I have:
__global__ void my_kernel(int x){
assert(x!=0);
}
int main(){
CUDA_CHECK(my_kernel<<<1,1>>>(0));
CHECK_WITH_ASSERTION_FETCH(cudaDeviceSynchronize());
}
My understanding is that CUDA_CHECK() passes here and cudaDeviceSynchronize() would return a failure code (specifically, CUDA error: device-side assert triggered CUDA kernel errors).
Is there a function CHECK_WITH_ASSERTION_FETCH that can somehow get info about which assertion failed when it observes that cudaDeviceSynchronize() is returning an error? The file and line number in which the assertion failed would be sufficient.
Is there a function CHECK_WITH_ASSERTION_FETCH that can somehow get info about which assertion failed when it observes that cudaDeviceSynchronize() is returning an error?
No there isn't.
As per the documentation, one way that you can see which line of code triggered the assertion and in which block and thread the assertion was raised is by attaching the debugger to the running kernel.
Robert Crovella's the authoritative voice here and says it isn't possible for a kernel assert to get information about itself back to the host. So we need some workarounds.
A major complicating factor is that if assert is called on the device then we are no longer able to communicate with it from the host, so any data we write to device memory is lost forever. (Reference).
Below I offer three:
Using Unified Memory to pass info from the GPU to the CPU even "after" an assert is called. This is the best answer.
Improving the GPU's assertion error messages by passing stacks to the GPU.
Passing info from the GPU to the CPU by dropping asserts and writing to memory. You'd only use this if UVM wasn't an option for some reason.
Using Unified Memory
Unified Memory allows the CUDA device and the host to transparently shuffle bits between each other without the need for cudaMemcpy. The result is that even though throwing an assert blocks our access to the device via regular API calls, we can still transfer signals back to the CPU via the Unified Memory.
Note that if we want kernels to be able to run asynchronously we need a way to associate kernel launches with assertion failures. The circular buffer here provides this functionality.
The code for this is:
//Compile with nvcc -g main.cu -lboost_stacktrace_basic -ldl
#define BOOST_STACKTRACE_USE_ADDR2LINE
#include <boost/assert/source_location.hpp>
#include <boost/stacktrace.hpp>
#include <array>
#include <cassert>
#include <iostream>
// Number of assertion failure messages we can store. If this is too small
// threads will fail silently.
#define DEVICE_SIDE_ASSERTION_COUNT 10
// Used by preprocessor to convert things to strings
#define STRINGIFY(x) #x
#define TOSTRING(x) STRINGIFY(x)
#define LINE_STRING TOSTRING(__LINE__)
// Standard CUDA success check.
// Evaluates `error` exactly once. If the result is not cudaSuccess, prints the
// call site (__FILE__:__LINE__), the cudaGetErrorString text and a host stack
// trace to std::cout. It only reports: execution continues after a failure.
// Expands to a single statement, so terminate the invocation with `;`.
#define CUDA_CHECK_API_CALL(error) \
do { \
const auto error_code = error; \
if(error_code!=cudaSuccess){ \
std::cout<<"CUDA API call failure detected at ("<<__FILE__<<":"<<__LINE__<<"): "<<cudaGetErrorString(error_code)<<std::endl; \
std::cout<< boost::stacktrace::stacktrace() << std::endl; \
}} while(false)
// Device-side string copy: copies the characters of `src` up to and including
// its terminating NUL into `dst`.
// No length is checked; `dst` must be large enough to hold all of `src`.
__device__ void dstrcpy(char *dst, const char *src){
    while(*src!='\0'){
        *dst = *src;
        ++dst;
        ++src;
    }
    *dst = '\0';
}
// Used to hold assertion data generated by the device.
// One record describes a single failed assertion; the fields are filled by
// AssertionsData::insert.
struct AssertionData {
// Text of the failed condition (fixed 1000-byte buffer, NUL-terminated)
char assertion_msg[1000];
// Source file that contains the assertion
char filename[1000];
// Name of the function that contains the assertion
char function_name[1000];
// Source line of the assertion
int line_number;
// Id handed to the kernel by the host; the host later uses it to look up the
// stack trace recorded for the launch
uint32_t caller;
// Block and thread indices of the failing thread
dim3 block_id;
dim3 thread_id;
};
// Used to hold assertions generated by the device.
// `assertion_count` counts every failure that tried to log itself, including
// those dropped because the buffer was full, so it may exceed
// DEVICE_SIDE_ASSERTION_COUNT. Readers must clamp it before indexing
// `assertions`.
struct AssertionsData {
    int assertion_count;
    AssertionData assertions[DEVICE_SIDE_ASSERTION_COUNT];

    // Reserve the next slot index for an assertion failure message.
    __device__ int next_id(){
        // Atomically increment so other threads can fail at the same time
        return atomicAdd(&assertion_count, 1);
    }

    // Record one failed assertion in the next free slot.
    //   assertion_msg0 - text of the failed condition
    //   filename0      - source file of the assertion
    //   function_name0 - enclosing function name
    //   line_number0   - source line of the assertion
    //   caller0        - host-provided id of the launching call
    //   block_id0      - block index of the failing thread
    //   thread_id0     - thread index of the failing thread
    // If every slot is taken, a note is printed and the failure is dropped.
    __device__ void insert(
        const char *assertion_msg0,
        const char *filename0,
        const char *function_name0,
        const int line_number0,
        const uint32_t caller0,
        const dim3 block_id0,
        const dim3 thread_id0
    ){
        const auto nid = next_id();
        // Valid slots are 0 .. DEVICE_SIDE_ASSERTION_COUNT-1. The previous
        // `>` comparison let index DEVICE_SIDE_ASSERTION_COUNT through and
        // wrote one element past the end of `assertions`.
        if(nid<0 || nid>=DEVICE_SIDE_ASSERTION_COUNT){
            printf("RAN OUT OF ASSERTION BUFFER SPACE!\n");
            return;
        }
        auto& self = assertions[nid];
        dstrcpy(self.assertion_msg, assertion_msg0);
        dstrcpy(self.filename, filename0);
        dstrcpy(self.function_name, function_name0);
        self.line_number = line_number0;
        self.caller = caller0;
        self.block_id = block_id0;
        self.thread_id = thread_id0;
    }
};
// Pointer to device memory allocated to hold assertion failure messages
AssertionsData *uvm_assertions = nullptr;
// Used to hold stack traces generated by the host so that we can run kernels
// asynchronously and still associate stacks to assertion failures
struct StackTraceInfo {
// Host call stack captured when the record was constructed
boost::stacktrace::stacktrace stacktrace;
// Device reported by cudaGetDevice at construction time
int device;
// Stream supplied by the caller
cudaStream_t stream;
// Sequence number supplied by the caller; compared against the id stored in
// an assertion record to detect a slot that has since been reused
uint32_t generation_number;
// The defaulted constructor does not assign device, stream or generation_number.
StackTraceInfo() = default;
// Captures the current host stack and the current device, and stores the
// given generation number and stream.
StackTraceInfo(int generation_number0, cudaStream_t stream0) {
// Technically we'd want to lop the top few layers off of this
generation_number = generation_number0;
stacktrace = boost::stacktrace::stacktrace();
// A cudaGetDevice failure is reported by the macro but does not abort construction
CUDA_CHECK_API_CALL(cudaGetDevice(&device));
stream = stream0;
}
};
// Fixed-size ring of host stack traces, one slot per recorded launch.
struct CircularTraceBuffer {
    // Number of slots; assumed to be at least the number of launches that can
    // ever be outstanding across all streams.
    static constexpr int max_size = 1024;

    // Total number of traces inserted so far. It only ever grows: the value
    // modulo max_size selects the slot to write, and the full value lets a
    // reader tell whether a slot has been reused since.
    uint32_t generation_number = 0;

    // The slots themselves.
    std::array<StackTraceInfo, max_size> traces;

    // Record the current host stack for a launch on `stream_id` and return the
    // generation number assigned to it.
    uint32_t insert(cudaStream_t stream_id) {
        const uint32_t ticket = generation_number;
        traces[ticket % max_size] = StackTraceInfo(ticket, stream_id);
        generation_number = ticket + 1;
        return ticket;
    }
};
// Circular buffer of host stacktraces for associating with kernel launches
CircularTraceBuffer circular_trace_buffer;
// Kernel assertion that first records the failure where the host can read it
// and then fires the regular device-side assert (which does stop the kernel).
//   condition       - expression expected to be true
//   assertions_data - AssertionsData* handed to the kernel by the host
//   caller          - host-provided id of the launch, stored with the record
// Expands to a single statement: terminate the invocation with `;`.
#define CUDA_COMMUNICATING_KERNEL_ASSERTION(condition, assertions_data, caller) \
do { \
if (! (condition)) { \
/* Store the failure details before asserting */ \
(assertions_data)->insert( \
TOSTRING(condition), \
__FILE__, \
__FUNCTION__, \
__LINE__, \
(caller), \
blockIdx, \
threadIdx \
); \
/* Make the record's stores visible system-wide before the assert traps */ \
__threadfence_system(); \
assert(condition); \
} \
} while (false)
// NOTE: Our kernels now need a pointer to the assertions data and an id for the caller
// NOTE: We can simplify our code by assuming these variables always have the same names
// so that they do not need to be passed to the preprocessor macro
// Demo kernel: the assertion fails exactly when x == 5.
//   x               - value under test
//   assertions_data - buffer that receives the failure record
//   caller          - id of the host launch, stored in the failure record
__global__ void my_failing_kernel(int x, AssertionsData *const assertions_data, const uint32_t caller){
CUDA_COMMUNICATING_KERNEL_ASSERTION(x!=5, assertions_data, caller);
}
// Check that kernels ran correctly. BLOCKING: waits on cudaDeviceSynchronize.
// If the synchronize succeeds nothing is printed. Otherwise prints the call
// site, the current host stack and every assertion record stored in
// `uvm_assertions`, each paired with the host stack recorded for its launch
// when that slot has not been overwritten since.
//   location - call site to report; defaults to the caller's location
void CUDA_CHECK_KERNEL_SUCCESS(const boost::source_location& location = BOOST_CURRENT_LOCATION){
    if(cudaDeviceSynchronize()==cudaSuccess){
        return;
    }
    std::cout<<"CUDA API call failure detected at ("<<location.file_name()<<":"<<location.line()<<":"<<location.column()<<"): "<<std::endl;
    std::cout<< boost::stacktrace::stacktrace()<<std::endl;

    if(uvm_assertions==nullptr){
        // No assertion buffer was ever allocated, so there is nothing to list.
        return;
    }

    // The device-side counter keeps growing after the buffer is full, so clamp
    // it to the number of records that actually exist.
    const int reported = uvm_assertions->assertion_count;
    const int stored = reported<DEVICE_SIDE_ASSERTION_COUNT ? reported : DEVICE_SIDE_ASSERTION_COUNT;

    for(int i=0;i<stored;i++){
        std::cout<<"Assertion failure "<<i<<std::endl;
        const auto &self = uvm_assertions->assertions[i];
        // `caller` is a generation number; reduce it to a slot index the same
        // way insertion does instead of indexing with the raw value.
        const auto slot = self.caller % circular_trace_buffer.traces.size();
        const auto &stack = circular_trace_buffer.traces[slot];
        std::cout<<"GPU "<<self.filename<<":"
                 <<self.line_number<<"("
                 <<self.function_name<<"): "
                 <<self.assertion_msg<<std::endl;
        if(stack.generation_number == self.caller){
            std::cout<<stack.stacktrace
                     <<"Device = "<<stack.device<<", "
                     <<"Stream = "<<stack.stream
                     <<std::endl;
        } else {
            std::cout<<"CPU stack has been overwritten!"<<std::endl;
        }
    }
}
// Demo driver: allocates the managed assertion buffer, launches one passing
// and one failing kernel, reports the failure and frees the buffer.
// Returns 1 if the buffer could not be allocated, 0 otherwise.
int main(){
    CUDA_CHECK_API_CALL(cudaMallocManaged(&uvm_assertions, sizeof(AssertionsData)));
    if(uvm_assertions==nullptr){
        // The failure has already been reported by the macro.
        return 1;
    }
    // The allocation is not cleared for us; the device increments this
    // counter, so it has to start from zero.
    uvm_assertions->assertion_count = 0;

    CUDA_CHECK_API_CALL(cudaMemAdvise(
        uvm_assertions, sizeof(AssertionsData), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId
    ));
    // GPU will establish direct mapping of data in CPU memory, no page faults will be generated
    CUDA_CHECK_API_CALL(cudaMemAdvise(
        uvm_assertions, sizeof(AssertionsData), cudaMemAdviseSetAccessedBy, 0
    ));

    // cudaGetLastError after each launch reports launch-configuration errors;
    // failures during execution surface in CUDA_CHECK_KERNEL_SUCCESS.
    my_failing_kernel<<<1, 1, 0>>>(4, uvm_assertions, circular_trace_buffer.insert(0));
    CUDA_CHECK_API_CALL(cudaGetLastError());
    my_failing_kernel<<<1, 1, 0>>>(5, uvm_assertions, circular_trace_buffer.insert(0));
    CUDA_CHECK_API_CALL(cudaGetLastError());

    CUDA_CHECK_KERNEL_SUCCESS();
    CUDA_CHECK_API_CALL(cudaFree(uvm_assertions));
    return 0;
}
The output for the above is:
main_assert_um_from_device.cu:162: void my_failing_kernel(int, AssertionsData *, unsigned int): block: [0,0,0], thread: [0,0,0] Assertion `x!=5` failed.
CUDA API call failure detected at (main_assert_um_from_device.cu:167:0):
0# 0x000055D3D8CEAFF2 in ./a.out
1# 0x000055D3D8CEB700 in ./a.out
2# __libc_start_main in /lib/x86_64-linux-gnu/libc.so.6
3# 0x000055D3D8CEADAE in ./a.out
Assertion failure 0
GPU main_assert_um_from_device.cu:162(my_failing_kernel): x!=5
0# 0x000055D3D8CECEF9 in ./a.out
1# 0x000055D3D8CED135 in ./a.out
2# 0x000055D3D8CEB6B9 in ./a.out
3# __libc_start_main in /lib/x86_64-linux-gnu/libc.so.6
4# 0x000055D3D8CEADAE in ./a.out
Device = 0, Stream = 0
Better Assert Messages
This workaround makes the device assert message more informative. To do so, we collect stacktrace strings on the host and transfer them to the GPU. Then, when we call a kernel, we pass it a pointer to the stacktrace string. If the kernel fails an assertion condition, we print out the stacktrace before triggering the assertion.
The code for that is:
//Compile with nvcc -g main.cu -lboost_stacktrace_basic -ldl
#define BOOST_STACKTRACE_USE_ADDR2LINE
#include <boost/stacktrace.hpp>
#include <iostream>
#include <sstream>
#include <string>
#include <unordered_map>
// Used by preprocessor to convert things to strings
#define STRINGIFY(x) #x
#define TOSTRING(x) STRINGIFY(x)
// Print a beefy kernel assertion message followed by inducing failure using
// the actual assertion.
//   condition - expression expected to be true; it is evaluated by the `if`
//               and again by assert()
//   message   - C string printed after the file/line header (the demo passes
//               a device pointer to a host stack trace)
// The printf runs only when the condition is false; assert() is reached on
// every invocation. Expands to a single statement: terminate it with `;`.
#define CUDA_DEVICE_ASSERT_WITH_STACKTRACE(condition, message) \
do { \
if (! (condition)) { \
printf("Assertion '%s' failed at %s:%d as part of stacktrace:\n%s", \
TOSTRING(condition), \
__FILE__, \
__LINE__, \
message); \
} \
/* Perform actual assertion to stop kernel progress */ \
assert(condition); \
} while (false)
// Demo kernel: the assertion fails exactly when x == 5.
//   x            - value under test
//   d_stacktrace - string handed to the assertion macro as its message
__global__ void my_failing_kernel(int x, const char *d_stacktrace){
CUDA_DEVICE_ASSERT_WITH_STACKTRACE(x!=5, d_stacktrace);
}
// Increases performance by cacheing stack traces so we don't repeatedly
// transfer the same data to the GPU
std::unordered_map<std::string, char*> cached_stacks;
// Send the current host stacktrace to the GPU, cache the pointer it's stored
// at, and return said pointer. A stack that was uploaded before is served from
// `cached_stacks` without another transfer.
// Returns a device pointer to a NUL-terminated copy of the trace, or nullptr
// if the allocation or the copy failed (the failure is reported on std::cerr).
char* setup_device_stacktrace(){
    std::stringstream ss;
    ss << boost::stacktrace::stacktrace();
    const std::string trace = ss.str();

    const auto cached_stack = cached_stacks.find(trace);
    if(cached_stack!=cached_stacks.end()){
        std::cerr<<"Using cached stacktrace!"<<std::endl;
        return cached_stack->second;
    }

    // Size the allocation from the string and include the terminating NUL:
    // the kernel prints this with %s, and cudaMalloc does not clear memory, so
    // copying only size() bytes left the device string unterminated. A fixed
    // 10000-byte allocation could also be overrun by a long trace.
    const auto bytes = trace.size() + 1;
    char *d_stacktrace = nullptr;
    cudaError_t status = cudaMalloc(&d_stacktrace, bytes);
    if(status==cudaSuccess){
        status = cudaMemcpy(d_stacktrace, trace.c_str(), bytes, cudaMemcpyHostToDevice);
    }
    if(status!=cudaSuccess){
        std::cerr<<"Failed to upload stacktrace to the device: "<<cudaGetErrorString(status)<<std::endl;
        cudaFree(d_stacktrace);
        return nullptr;
    }

    cached_stacks[trace] = d_stacktrace;
    return d_stacktrace;
}
// Recurses until `depth` reaches 5 so the captured stack has several frames,
// then uploads the stacktrace, launches the demo kernel with `val` and waits
// for it to finish.
void nested_n(int depth, int val){
    const bool keep_descending = depth<5;
    if(keep_descending){
        nested_n(depth+1, val);
        return;
    }
    const char* trace_on_device = setup_device_stacktrace();
    my_failing_kernel<<<1, 1>>>(val, trace_on_device);
    cudaDeviceSynchronize();
}
// Make an interesting stack: nested1 -> nested2 -> nested3 -> nested_n(0, val).
// Each wrapper only forwards `val`, adding one frame to the call stack.
void nested3(int val){ nested_n(0, val); }
void nested2(int val){ nested3(val); }
void nested1(int val){ nested2(val); }
// Runs the nested call chain with the values 4 and 5, then frees every cached
// device stacktrace.
int main(){
    int value = 4;
    while(value<6){
        std::cerr<<"Running with value = "<<value<<std::endl;
        nested1(value);
        ++value;
    }
    // Clean-up
    for(auto entry=cached_stacks.begin(); entry!=cached_stacks.end(); ++entry){
        cudaFree(entry->second);
    }
    return 0;
}
This gives the output:
Running with value = 4
Running with value = 5
Using cached stacktrace!
Assertion 'x!=5' failed at main.cu:31 as part of stacktrace:
0# 0x000055BBF4A3CF76 in ./a.out
1# 0x000055BBF4A3D262 in ./a.out
2# 0x000055BBF4A3D258 in ./a.out
3# 0x000055BBF4A3D258 in ./a.out
4# 0x000055BBF4A3D258 in ./a.out
5# 0x000055BBF4A3D258 in ./a.out
6# 0x000055BBF4A3D258 in ./a.out
7# 0x000055BBF4A3D313 in ./a.out
8# 0x000055BBF4A3D32F in ./a.out
9# 0x000055BBF4A3D34B in ./a.out
10# 0x000055BBF4A3D3CF in ./a.out
11# __libc_start_main in /lib/x86_64-linux-gnu/libc.so.6
12# 0x000055BBF4A3CE0E in ./a.out
main.cu:31: void my_failing_kernel(int, const char *): block: [0,0,0], thread: [0,0,0] Assertion `x!=5` failed.
Replace The Device Assertion With Magic
Here the idea is to replace the device-side assert with our Own Special Assert. Our OSA will write information about itself to device-side and the host will read this to see what went wrong. Note that we'd only want to do this if the Unified Memory solution wasn't possible for some reason.
Here, rather than have the kernel fail with an assert, we have any failing threads exit the kernel early while the rest of the threads continue working. The result is garbage, but at least we can get information about why!
The code for this is:
//Compile with nvcc -g main.cu -lboost_stacktrace_basic -ldl
#define BOOST_STACKTRACE_USE_ADDR2LINE
#include <boost/assert/source_location.hpp>
#include <boost/stacktrace.hpp>
#include <array>
#include <cassert>
#include <iostream>
// Pointer to device memory allocated to hold assertion failure messages
char *d_assert_buffer = nullptr;
// Number of assertion failure messages we can store. If this is too small
// threads will fail silently.
#define DEVICE_SIDE_ASSERTION_COUNT 10
// Length of each assertion failure message - if this is too small we get
// garbage as threads overwrite each other
#define DEVICE_SIDE_ASSERTION_LENGTH 500
// Total size of the assertion failure message buffer. First 4 bytes stores the
// number of logged messages
#define DEVICE_SIDE_ASSERTION_BUFFER_LEN (4 + DEVICE_SIDE_ASSERTION_COUNT * DEVICE_SIDE_ASSERTION_LENGTH)
// Used by preprocessor to convert things to strings
#define STRINGIFY(x) #x
#define TOSTRING(x) STRINGIFY(x)
#define LINE_STRING TOSTRING(__LINE__)
// Emulates a kernel assertion without trapping: when `condition` is false the
// failing thread appends "<file>:<line>:<condition>" to the message buffer and
// returns from the kernel. The other threads keep running, so assume
// everything the kernel produces is garbage once a failure has been logged.
//   condition - expression expected to be true
//   buffer    - char* to the device message buffer: a 4-byte message counter
//               followed by DEVICE_SIDE_ASSERTION_COUNT slots of
//               DEVICE_SIDE_ASSERTION_LENGTH bytes each
// Contains `return;`, so it can only be used directly inside a function that
// returns void. Expands to a single statement: terminate it with `;`.
#define CUDA_COMMUNICATING_KERNEL_ASSERTION(condition, buffer) \
do { \
if (! (condition)) { \
/* Use the macro argument, not whatever happens to be named d_assert_buffer */ \
char *const assert_buf = (buffer); \
/* First four bytes of the buffer count the messages logged so far */ \
uint32_t *const msgnum_ptr = reinterpret_cast<uint32_t*>(assert_buf); \
/* Atomically increment so other threads can fail at the same time */ \
const uint32_t msg_num = atomicAdd(msgnum_ptr, 1u); \
if(msg_num>=DEVICE_SIDE_ASSERTION_COUNT){ \
printf("RAN OUT OF ASSERTION BUFFER SPACE!\n"); \
return; \
} \
\
/* Find the start of the slot we'll be writing to */ \
char *const msg_ptr = assert_buf + 4 + msg_num * DEVICE_SIDE_ASSERTION_LENGTH; \
\
/* Stringify the condition that was actually passed in */ \
constexpr char const assertion_string[] = TOSTRING(condition); \
constexpr char const line_string[] = LINE_STRING; \
/* sizeof counts the terminating NUL, which is not part of each piece */ \
constexpr int assertion_size = sizeof(assertion_string)-1; \
constexpr int filename_size = sizeof(__FILE__)-1; \
constexpr int line_capacity = sizeof(line_string)-1; \
/* Refuse to compile a message that would spill into the next slot */ \
static_assert(filename_size+1+line_capacity+1+assertion_size+1 <= DEVICE_SIDE_ASSERTION_LENGTH, \
"assertion message does not fit in DEVICE_SIDE_ASSERTION_LENGTH"); \
\
/* Length of the line-number text, never scanning past the array */ \
int line_size = 0; \
while(line_size<line_capacity && line_string[line_size]!='\0'){ \
line_size++; \
} \
\
memcpy(msg_ptr, __FILE__, filename_size); \
msg_ptr[filename_size] = ':'; \
memcpy(msg_ptr+filename_size+1, line_string, line_size); \
msg_ptr[filename_size+1+line_size] = ':'; \
memcpy(msg_ptr+filename_size+1+line_size+1, assertion_string, assertion_size); \
msg_ptr[filename_size+1+line_size+1+assertion_size] = '\0'; \
/* If we actually assert then we can't ever get the message to the host, so we */ \
/* return and let the kernel generate garbage */ \
return; \
} \
} while (false)
// Standard CUDA success check.
// Evaluates `error` exactly once. If the result is not cudaSuccess, prints the
// call site (__FILE__:__LINE__), the cudaGetErrorString text and a host stack
// trace to std::cout. It only reports: execution continues after a failure.
// Expands to a single statement, so terminate the invocation with `;`.
#define CUDA_CHECK_API_CALL(error) \
do { \
const auto error_code = error; \
if(error_code!=cudaSuccess){ \
std::cout<<"CUDA API call failure detected at ("<<__FILE__<<":"<<__LINE__<<"): "<<cudaGetErrorString(error_code)<<std::endl; \
std::cout<< boost::stacktrace::stacktrace() << std::endl; \
}} while(false)
// Demo kernel: the assertion fails exactly when x == 5.
//   x               - value under test
//   d_assert_buffer - message buffer passed to the assertion macro; note that
//                     this parameter shadows the file-scope host pointer of
//                     the same name inside the kernel
__global__ void my_failing_kernel(int x, char *d_assert_buffer){
CUDA_COMMUNICATING_KERNEL_ASSERTION(x!=5, d_assert_buffer);
}
// Check that kernels ran correctly by acquiring the message buffer. BLOCKING.
// Synchronizes with the device, copies the whole assertion buffer to the host
// and, if its message counter is non-zero, prints the call site, the current
// host stack and every stored message.
//   location - call site to report; defaults to the caller's location
void CUDA_CHECK_KERNEL_SUCCESS(const boost::source_location& location = BOOST_CURRENT_LOCATION){
    std::array<char, DEVICE_SIDE_ASSERTION_BUFFER_LEN> cuda_assert_buffer = {0};
    CUDA_CHECK_API_CALL(cudaDeviceSynchronize());
    assert(d_assert_buffer!=nullptr);
    // NOTE: We could maybe save time by only moving the message count initially and copying the messages
    // conditionally.
    CUDA_CHECK_API_CALL(cudaMemcpy(cuda_assert_buffer.data(), d_assert_buffer, DEVICE_SIDE_ASSERTION_BUFFER_LEN, cudaMemcpyDeviceToHost));
    CUDA_CHECK_API_CALL(cudaDeviceSynchronize()); // NOTE: Needed for buffers of <64kB

    // First four bytes of the buffer hold the number of failures that tried to log.
    const uint32_t msg_num = *reinterpret_cast<uint32_t*>(cuda_assert_buffer.data());
    if(msg_num==0){
        return;
    }
    // The device keeps incrementing the counter after the slots run out, so
    // only the first DEVICE_SIDE_ASSERTION_COUNT entries exist in the buffer.
    const uint32_t slot_count = static_cast<uint32_t>(DEVICE_SIDE_ASSERTION_COUNT);
    const uint32_t stored = msg_num<slot_count ? msg_num : slot_count;

    std::cout<<"CUDA API call failure detected at ("<<location.file_name()<<":"<<location.line()<<":"<<location.column()<<"): "<<std::endl;
    std::cout<< boost::stacktrace::stacktrace();
    std::cout<<"Assertion messages ("<<msg_num<<" messages):"<<std::endl;
    for(uint32_t i=0;i<stored;i++){
        std::cout<<" "<<i<<" "<<cuda_assert_buffer.data()+(4+i*DEVICE_SIDE_ASSERTION_LENGTH)<<std::endl;
    }
    if(stored<msg_num){
        std::cout<<" ("<<(msg_num-stored)<<" further messages were dropped: buffer full)"<<std::endl;
    }
}
// Demo driver: allocates and clears the device message buffer, runs one
// passing and one failing kernel (checking after each) and frees the buffer.
int main(){
    CUDA_CHECK_API_CALL(cudaMalloc(&d_assert_buffer, DEVICE_SIDE_ASSERTION_BUFFER_LEN));
    // cudaMalloc does not clear memory. The first four bytes are the message
    // counter the kernels increment, so the buffer must start out zeroed.
    CUDA_CHECK_API_CALL(cudaMemset(d_assert_buffer, 0, DEVICE_SIDE_ASSERTION_BUFFER_LEN));

    my_failing_kernel<<<1, 1>>>(4, d_assert_buffer);
    CUDA_CHECK_API_CALL(cudaGetLastError()); // launch-configuration errors
    CUDA_CHECK_KERNEL_SUCCESS();

    my_failing_kernel<<<1, 1>>>(5, d_assert_buffer);
    CUDA_CHECK_API_CALL(cudaGetLastError()); // launch-configuration errors
    CUDA_CHECK_KERNEL_SUCCESS();

    // Clean-up
    CUDA_CHECK_API_CALL(cudaFree(d_assert_buffer));
    return 0;
}
And the output looks like:
CUDA API call failure detected at (main_assert_from_device.cu:91:0):
0# 0x00005573A1F633A5 in ./a.out
1# 0x00005573A1F637C2 in ./a.out
2# __libc_start_main in /lib/x86_64-linux-gnu/libc.so.6
3# 0x00005573A1F62D9E in ./a.out
Assertion messages (1 messages):
0 main_assert_from_device.cu:86:x==5

cuda & rdc & thrust in multiple shared objects results in SIGSEGV in registerEntryFunction

I'm trying to run relocatable-device-code in two shared libraries, both using cuda-thrust. Everything runs fine if I stop using thrust in kernel.cu, which is not an option.
edit: The program works too if rdc is disabled. Not an option for me either.
It compiles fine but stops with a segfault when run. gdb tells me this:
Program received signal SIGSEGV, Segmentation fault.
0x0000000000422cc8 in cudart::globalState::registerEntryFunction(void**, char const*, char*, char const*, int, uint3*, uint3*, dim3*, dim3*, int*) ()
(cuda-gdb) bt
#0 0x0000000000422cc8 in cudart::globalState::registerEntryFunction(void**, char const*, char*, char const*, int, uint3*, uint3*, dim3*, dim3*, int*) ()
#1 0x000000000040876c in __cudaRegisterFunction ()
#2 0x0000000000402b58 in __nv_cudaEntityRegisterCallback(void**) ()
#3 0x00007ffff75051a3 in __cudaRegisterLinkedBinary(__fatBinC_Wrapper_t const*, void (*)(void**), void*) ()
from /home/mindoms/rdctestmcsimple/libkernel.so
#4 0x00007ffff75050b1 in __cudaRegisterLinkedBinary_66_tmpxft_00007a5f_00000000_16_cuda_device_runtime_ compute_52_cpp1_ii_8b1a5d37 () from /home/user/rdctestmcsimple/libkernel.so
#5 0x000000000045285d in __libc_csu_init ()
#6 0x00007ffff65ea50f in __libc_start_main () from /lib64/libc.so.6
Here is my stripped down example (using cmake) that shows the error.
main.cpp:
#include "kernel.cuh"
#include "kernel2.cuh"
// Reproducer entry point: runs the kernel wrapper from each of the two
// libraries, one after the other.
int main(){
Kernel k;
k.callKernel();
Kernel2 k2;
k2.callKernel2();
}
kernel.cuh:
#ifndef __KERNEL_CUH__
#define __KERNEL_CUH__
// Host-side wrapper declared here and implemented in kernel.cu.
class Kernel{
public:
// Launches `thekernel` and prints the resulting device vector.
void callKernel();
};
#endif
kernel.cu:
#include "kernel.cuh"
#include <stdio.h>
#include <iostream>
#include <thrust/device_vector.h>
// Stores 2*threadIdx.x into data[threadIdx.x]; thread 0 also prints a greeting.
// Only threadIdx.x is used for indexing and there is no bounds check, so
// `data` must hold at least blockDim.x elements.
__global__
void thekernel(int *data){
if (threadIdx.x == 0)
printf("the kernel says hello\n");
data[threadIdx.x] = threadIdx.x * 2;
}
// Allocates an 11-element device vector, runs `thekernel` on its first 10
// elements with a single block, reports any CUDA error and prints every
// element of the vector.
void Kernel::callKernel(){
    thrust::device_vector<int> D2;
    D2.resize(11);
    int * raw_ptr = thrust::raw_pointer_cast(&D2[0]);
    printf("Kernel::callKernel called\n");
    thekernel <<< 1, 10 >>> (raw_ptr);
    // Launch errors show up in cudaGetLastError, execution errors in the
    // synchronize. cudaDeviceSynchronize replaces the deprecated
    // cudaThreadSynchronize.
    cudaError_t code = cudaGetLastError();
    if (code == cudaSuccess) {
        code = cudaDeviceSynchronize();
    }
    if (code != cudaSuccess) {
        std::cout << "Cuda error: " << cudaGetErrorString(code) << " after callKernel!" << std::endl;
    }
    // size() is unsigned; use a matching index type.
    for (size_t i = 0; i < D2.size(); i++)
        std::cout << "Kernel D[" << i << "]=" << D2[i] << std::endl;
}
kernel2.cuh:
#ifndef __KERNEL2_CUH__
#define __KERNEL2_CUH__
// Host-side wrapper declared here and implemented in kernel2.cu.
class Kernel2{
public:
// Launches `thekernel2` and prints the resulting device vector.
void callKernel2();
};
#endif
kernel2.cu
#include "kernel2.cuh"
#include <stdio.h>
#include <iostream>
#include <thrust/device_vector.h>
// Stores 2*threadIdx.x into data2[threadIdx.x]; thread 0 also prints a greeting.
// Only threadIdx.x is used for indexing and there is no bounds check, so
// `data2` must hold at least blockDim.x elements.
__global__
void thekernel2(int *data2){
if (threadIdx.x == 0)
printf("the kernel2 says hello\n");
data2[threadIdx.x] = threadIdx.x * 2;
}
// Allocates an 11-element device vector, runs `thekernel2` on its first 10
// elements with a single block, reports any CUDA error and prints every
// element of the vector.
void Kernel2::callKernel2(){
    thrust::device_vector<int> D;
    D.resize(11);
    int * raw_ptr = thrust::raw_pointer_cast(&D[0]);
    printf("Kernel2::callKernel2 called\n");
    thekernel2 <<< 1, 10 >>> (raw_ptr);
    // Launch errors show up in cudaGetLastError, execution errors in the
    // synchronize. cudaDeviceSynchronize replaces the deprecated
    // cudaThreadSynchronize.
    cudaError_t code = cudaGetLastError();
    if (code == cudaSuccess) {
        code = cudaDeviceSynchronize();
    }
    if (code != cudaSuccess) {
        std::cout << "Cuda error: " << cudaGetErrorString(code) << " after callKernel2!" << std::endl;
    }
    // size() is unsigned; use a matching index type.
    for (size_t i = 0; i < D.size(); i++)
        std::cout << "Kernel2 D[" << i << "]=" << D[i] << std::endl;
}
The cmake file below was used originally, but I get the same problem when I compile "by hand":
nvcc -arch=sm_35 -Xcompiler -fPIC -dc kernel2.cu
nvcc -arch=sm_35 -shared -Xcompiler -fPIC kernel2.o -o libkernel2.so
nvcc -arch=sm_35 -Xcompiler -fPIC -dc kernel.cu
nvcc -arch=sm_35 -shared -Xcompiler -fPIC kernel.o -o libkernel.so
g++ -o main main.cpp libkernel.so libkernel2.so -L/opt/cuda/current/lib64
Adding -cudart shared to every nvcc call as suggested somewhere results in a different error:
warning: Cuda API error detected: cudaFuncGetAttributes returned (0x8)
terminate called after throwing an instance of 'thrust::system::system_error'
what(): function_attributes(): after cudaFuncGetAttributes: invalid device function
Program received signal SIGABRT, Aborted.
0x000000313c432625 in raise () from /lib64/libc.so.6
(cuda-gdb) bt
#0 0x000000313c432625 in raise () from /lib64/libc.so.6
#1 0x000000313c433e05 in abort () from /lib64/libc.so.6
#2 0x00000031430bea7d in __gnu_cxx::__verbose_terminate_handler() () from /usr/lib64/libstdc++.so.6
#3 0x00000031430bcbd6 in std::set_unexpected(void (*)()) () from /usr/lib64/libstdc++.so.6
#4 0x00000031430bcc03 in std::terminate() () from /usr/lib64/libstdc++.so.6
#5 0x00000031430bcc86 in __cxa_rethrow () from /usr/lib64/libstdc++.so.6
#6 0x00007ffff7d600eb in thrust::detail::vector_base<int, thrust::device_malloc_allocator<int> >::append(unsigned long) () from ./libkernel.so
#7 0x00007ffff7d5f740 in thrust::detail::vector_base<int, thrust::device_malloc_allocator<int> >::resize(unsigned long) () from ./libkernel.so
#8 0x00007ffff7d5b19a in Kernel::callKernel() () from ./libkernel.so
#9 0x00000000004006f8 in main ()
CMakeLists.txt: Please adjust to your environment
cmake_minimum_required(VERSION 2.6.2)
project(Cuda-project)
set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/CMake/cuda" ${CMAKE_MODULE_PATH})
SET(CUDA_TOOLKIT_ROOT_DIR "/opt/cuda/current")
SET(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -gencode arch=compute_52,code=sm_52)
find_package(CUDA REQUIRED)
link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64)
set(CUDA_SEPARABLE_COMPILATION ON)
set(BUILD_SHARED_LIBS ON)
list(APPEND CUDA_NVCC_FLAGS -Xcompiler -fPIC)
CUDA_ADD_LIBRARY(kernel
kernel.cu
)
CUDA_ADD_LIBRARY(kernel2
kernel2.cu
)
cuda_add_executable(rdctest main.cpp)
TARGET_LINK_LIBRARIES(rdctest kernel kernel2 cudadevrt)
About my system:
Fedora 23
kernel: 4.4.2-301.fc23.x86_64
Nvidia Driver: 361.28
Nvidia Toolkit: 7.5.18
g++: g++ (GCC) 5.3.1 20151207 (Red Hat 5.3.1-2)
Reproduced on:
CentOS release 6.7 (Final)
Kernel: 2.6.32-573.8.1.el6.x86_64
Nvidia Driver: 352.55
Nvidia Toolkit: 7.5.18
g++ (GCC) 4.4.7 20120313 (Red Hat 4.4.7-16)
glibc 2.12
cmake to 3.5
Apparently, this has something to do with what cuda runtime is used: shared or static.
I slightly modified your example: Instead of building two shared libraries and linking them to the executable individually, I create two static libraries that are linked together to one shared library, and that one is linked to the executable.
Also, here is an updated CMake file that uses the new (>= 3.8) native CUDA language support.
cmake_minimum_required(VERSION 3.8)
project (CudaSharedThrust CXX CUDA)
string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_61,code=compute_61")
if(BUILD_SHARED_LIBS)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
endif()
add_library(kernel STATIC kernel.cu)
set_target_properties(kernel PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
add_library(kernel2 STATIC kernel2.cu)
set_target_properties(kernel2 PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
add_library(allkernels empty.cu) # empty.cu is an empty file
set_target_properties(allkernels PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
target_link_libraries(allkernels kernel kernel2)
add_executable(rdctest main.cpp)
set_target_properties(rdctest PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
target_link_libraries(rdctest allkernels)
Building this without any CMake flags (static build), the build succeeds and the program works.
Building with -DBUILD_SHARED_LIBS=ON, the program compiles, but it crashes with the same error as yours.
Building with
cmake .. -DBUILD_SHARED_LIBS=ON -DCMAKE_CUDA_FLAGS:STRING="--cudart shared"
compiles, and actually makes it run! So for some reason, the shared CUDA runtime is required for this sort of thing.
Also note that the step from 2 SO's -> 2 Static Libs in 1 SO was necessary, because otherwise the program would crash with a thrust::system::system_error.
This, however is expected because NVCC actually ignores shared object files during device linking: http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#libraries

Loading multiple modules in JCuda is not working

In jCuda one can load cuda files as PTX or CUBIN format and call(launch) __global__ functions (kernels) from Java.
With keeping that in mind, I want to develop a framework with JCuda that gets user's __device__ function in a .cu file at run-time, loads and runs it.
And I have already implemented a __global__ function, in which each thread finds out the start point of its related data, perform some computation, initialization and then call user's __device__ function.
Here is my kernel pseudo code:
extern "C" __device__ void userFunc(args);
extern "C" __global__ void kernel(){
// initialize
userFunc(args);
// rest of the kernel
}
And user's __device__ function:
// User-supplied device function (pseudo code). extern "C" keeps the symbol name unmangled.
extern "C" __device__ void userFunc(args){
// do something
}
And in Java side, here is the part that I load the modules(modules are made from ptx files which are successfully created from cuda files with this command: nvcc -m64 -ptx path/to/cudaFile -o cudaFile.ptx)
// Each PTX file is loaded as its own separate module; nothing here links the two together.
// The trailing numbers are line labels referred to in the surrounding text.
CUmodule kernelModule = new CUmodule(); // 1
CUmodule userFuncModule = new CUmodule(); // 2
cuModuleLoad(kernelModule, ptxKernelFileName); // 3
cuModuleLoad(userFuncModule, ptxUserFuncFileName); // 4
When I try to run it, I get an error at line 3: CUDA_ERROR_NO_BINARY_FOR_GPU. After some searching I gathered that my ptx file has some syntax error. After running this suggested command:
ptxas -arch=sm_30 kernel.ptx
I got:
ptxas fatal : Unresolved extern function 'userFunc'
Even when I replace line 3 with 4 to load userFunc before kernel I get this error. I got stuck at this phase. Is this the correct way to load multiple modules that need to be linked together in JCuda? Or is it even possible?
Edit:
Second part of the question is here
The really short answer is: No, you can't load multiple modules into a context in the runtime API.
You can do what you want, but it requires explicit setup and execution of a JIT linking call. I have no idea how (or even whether) that has been implemented in JCUDA, but I can show you how to do it with the standard driver API. Hold on...
If you have a device function in one file, and a kernel in another, for example:
// test_function.cu
#include <math.h>
/**
 * Device-side helper: returns sin(x) + cos(y) + sqrt(z) in single precision.
 *
 * The three arguments are taken by non-const reference but are only read.
 */
__device__ float mathop(float &x, float &y, float &z)
{
    const float sine   = sinf(x);
    const float cosine = cosf(y);
    const float root   = sqrtf(z);
    return sine + cosine + root;
}
and
// test_kernel.cu
// Declared here, defined in another translation unit; the call below stays an
// unresolved external until the device code is linked.
extern __device__ float mathop(float & x, float & y, float & z);

/**
 * One thread per element: res[i] = mathop(xvals[i], yvals[i], zvals[i]),
 * where i is the thread's global 1-D index.
 *
 * No element count is passed in and no bounds check is made, so the launch
 * configuration must not produce more threads than there are elements.
 */
__global__ void kernel(float *xvals, float * yvals, float * zvals, float *res)
{
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    const float value = mathop(xvals[gid], yvals[gid], zvals[gid]);
    res[gid] = value;
}
You can compile them to PTX as usual:
$ nvcc -arch=sm_30 -ptx test_function.cu
$ nvcc -arch=sm_30 -ptx test_kernel.cu
$ head -14 test_kernel.ptx
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-19324607
// Cuda compilation tools, release 7.0, V7.0.27
// Based on LLVM 3.4svn
//
.version 4.2
.target sm_30
.address_size 64
// .globl _Z6kernelPfS_S_S_
.extern .func (.param .b32 func_retval0) _Z6mathopRfS_S_
At runtime, your code must create a JIT link session, add each PTX to the linker session, then finalise the linker session. This will give you a handle to a compiled cubin image which can be loaded as a module as usual. The simplest possible driver API code to put this together looks like this:
#include <cstdio>
#include <cuda.h>
#define drvErrChk(ans) { drvAssert(ans, __FILE__, __LINE__); }
/**
 * Reports a failed CUDA driver API call on stderr and optionally terminates.
 *
 * code  - result returned by the driver API call being checked.
 * file  - source file of the call site (normally __FILE__).
 * line  - source line of the call site (normally __LINE__).
 * abort - when true (the default) the process exits with status -1 after the
 *         message is printed; when false the function returns to the caller.
 *
 * Does nothing when code is CUDA_SUCCESS.
 */
inline void drvAssert(CUresult code, const char *file, int line, bool abort=true)
{
    if (code == CUDA_SUCCESS)
        return;

    fprintf(stderr, "Driver API Error %04d at %s %d\n", int(code), file, line);

    // Terminate only when the caller asked for it.
    if (abort)
        exit(-1);
}
/**
 * Minimal driver-API example of JIT-linking two PTX files into one module.
 *
 * Steps: initialise the driver, create a context on device 0, add
 * test_function.ptx and test_kernel.ptx to a link session, complete the link,
 * load the resulting image as a module and look up the kernel by its mangled
 * name. Every driver call goes through drvErrChk, so the first failure is
 * reported with its file and line.
 *
 * Returns 0 when every step succeeds.
 */
int main()
{
    // cuInit is checked like every other call, so a missing or broken driver
    // is reported here rather than as a confusing failure further down.
    drvErrChk( cuInit(0) );

    CUdevice device;
    drvErrChk( cuDeviceGet(&device, 0) );

    CUcontext context;
    drvErrChk( cuCtxCreate(&context, 0, device) );

    // JIT link session with no extra options.
    CUlinkState state;
    drvErrChk( cuLinkCreate(0, 0, 0, &state) );
    drvErrChk( cuLinkAddFile(state, CU_JIT_INPUT_PTX, "test_function.ptx", 0, 0, 0) );
    drvErrChk( cuLinkAddFile(state, CU_JIT_INPUT_PTX, "test_kernel.ptx" , 0, 0, 0) );

    size_t sz;
    char * image;
    drvErrChk( cuLinkComplete(state, (void **)&image, &sz) );

    // The linked image belongs to the link state, so it must be loaded into a
    // module before the state is destroyed.
    CUmodule module;
    drvErrChk( cuModuleLoadData(&module, image) );
    drvErrChk( cuLinkDestroy(state) );

    CUfunction function;
    drvErrChk( cuModuleGetFunction(&function, module, "_Z6kernelPfS_S_S_") );

    // Release the module and the context before leaving.
    drvErrChk( cuModuleUnload(module) );
    drvErrChk( cuCtxDestroy(context) );
    return 0;
}
You should be able to compile and run this as posted and verify it works OK. It should serve as a template for a JCUDA implementation, if they have JIT linking support implemented.

CUDA thread block size 1024 doesn't work (cc=20, sm=21)

My running config:
- CUDA Toolkit 5.5
- NVidia Nsight Eclipse edition
- Ubuntu 12.04 x64
- CUDA device is NVidia GeForce GTX 560: cc=20, sm=21 (as you can see I can use blocks up to 1024 threads)
I render my display on iGPU (Intel HD Graphics), so I can use Nsight debugger.
However I encountered some weird behaviour, when I set threads > 960.
Code:
#include <stdio.h>
#include <cuda_runtime.h>
// Minimal reproduction kernel: takes no arguments and writes no output.
// It only performs one single-precision division on local variables.
__global__ void mytest() {
float a, b;
b = 1.0F;
// 'a' is assigned but never read afterwards.
a = b / 1.0F;
}
/**
 * Launches mytest with one block of 961 threads, reports a launch error if
 * there is one, then resets the device.
 *
 * Exits with EXIT_FAILURE if the launch or the reset reports an error;
 * otherwise prints "Done" and returns 0.
 */
int main(void) {
    // One block, 961 threads.
    mytest<<<1, 961>>>();

    // Pick up any error recorded by the launch.
    const cudaError_t launchStatus = cudaGetLastError();
    if (cudaSuccess != launchStatus) {
        fprintf(stderr, "error=%s\n", cudaGetErrorString(launchStatus));
        exit(EXIT_FAILURE);
    }

    // Tear the device down before exiting.
    const cudaError_t resetStatus = cudaDeviceReset();
    if (cudaSuccess != resetStatus) {
        fprintf(stderr, "Failed to deinitialize the device! error=%s\n",
                cudaGetErrorString(resetStatus));
        exit(EXIT_FAILURE);
    }

    printf("Done\n");
    return 0;
}
And... it doesn't work. The problem is in the last line of code with float division. Every time I try to divide by float, my code compiles, but doesn't work. The output error at runtime is:
error=too many resources requested for launch
Here's what I get in debug, when I step it over:
warning: Cuda API error detected: cudaLaunch returned (0x7)
Build output using -Xptxas -v:
12:57:39 **** Incremental Build of configuration Debug for project block_size_test ****
make all
Building file: ../src/vectorAdd.cu
Invoking: NVCC Compiler
/usr/local/cuda-5.5/bin/nvcc -I"/usr/local/cuda-5.5/samples/0_Simple" -I"/usr/local/cuda-5.5/samples/common/inc" -G -g -O0 -m64 -keep -keep-dir /home/vitrums/cuda-workspace-trashcan -optf /home/vitrums/cuda-workspace/block_size_test/options.txt -gencode arch=compute_20,code=sm_20 -gencode arch=compute_20,code=sm_21 -odir "src" -M -o "src/vectorAdd.d" "../src/vectorAdd.cu"
/usr/local/cuda-5.5/bin/nvcc --compile -G -I"/usr/local/cuda-5.5/samples/0_Simple" -I"/usr/local/cuda-5.5/samples/common/inc" -O0 -g -gencode arch=compute_20,code=compute_20 -gencode arch=compute_20,code=sm_21 -keep -keep-dir /home/vitrums/cuda-workspace-trashcan -m64 -optf /home/vitrums/cuda-workspace/block_size_test/options.txt -x cu -o "src/vectorAdd.o" "../src/vectorAdd.cu"
../src/vectorAdd.cu(7): warning: variable "a" was set but never used
../src/vectorAdd.cu(7): warning: variable "a" was set but never used
ptxas info : 4 bytes gmem, 8 bytes cmem[14]
ptxas info : Function properties for _ZN4dim3C1Ejjj
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Compiling entry function '_Z6mytestv' for 'sm_21'
ptxas info : Function properties for _Z6mytestv
8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 34 registers, 8 bytes cumulative stack size, 32 bytes cmem[0]
ptxas info : Function properties for _ZN4dim3C2Ejjj
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
Finished building: ../src/vectorAdd.cu
Building target: block_size_test
Invoking: NVCC Linker
/usr/local/cuda-5.5/bin/nvcc --cudart static -m64 -link -o "block_size_test" ./src/vectorAdd.o
Finished building target: block_size_test
12:57:41 Build Finished (took 1s.659ms)
When I add -keep key, the compiler generates .cubin file, but I can't read it to find out the values of smem and reg, following this topic too-many-resources-requested-for-launch-how-to-find-out-what-resources-/. At least nowadays this file must have some different format.
Therefore I'm forced to use 256 threads per block, which is probably not a bad idea, considering this .xls: CUDA_Occupancy_calculator.
Anyway. Any help will be appreciated.
I filled in the CUDA Occupancy Calculator spreadsheet with the current information:
Compute capability : 2.1
Threads per block : 961
Registers per thread : 34
Shared memory : 0
I got 0% occupancy, limited by registers count.
If you set the number of threads to 960, you get 63% occupancy, which explains why it works.
Try to limit the register count to 32 and set the number of threads to 1024 to get 67% occupancy.
To limit the count of registers, use the following option :
nvcc [...] --maxrregcount=32

printf() in my CUDA kernel doesn't produce any output

I have added some printf() statements in my CUDA program
// Forward declaration of the kernel launched below.
__device__ __global__ void Kernel(float *, float * ,int );
// Host-side wrapper that launches Kernel. Parts of the body are elided ("....") in this excerpt.
void DeviceFunc(float *temp_h , int numvar , float *temp1_h)
{ .....
//Kernel call
printf("calling kernel\n");
Kernel<<<dimGrid , dimBlock>>>(a_d , b_d , numvar);
// No error check or synchronization is visible between the launch and this host printf.
printf("kernel called\n");
....
}
// Program entry point (body partly elided). Prints a trace message before and after calling DeviceFunc.
int main(int argc , char **argv)
{ ....
printf("beforeDeviceFunc\n\n");
DeviceFunc(a_h , numvar , b_h); //Showing the data
printf("after DeviceFunc\n\n");
....
}
Also in the Kernel.cu, I wrote:
#include<cuda.h>
#include <stdio.h>
// Kernel: each thread copies one element of a_d into a 16x16 shared-memory
// tile and prints its indices. The rest of the body is elided ("....") in this
// excerpt; b_d is not used in the visible part.
__device__ __global__ void Kernel(float *a_d , float *b_d ,int size)
{
int idx = threadIdx.x ;
int idy = threadIdx.y ;
//Allocating a tile in the shared memory of the device
__shared__ float temp[16][16];
//Copying the data to the shared memory
// Rows of a_d are read with a stride of (size+1) elements.
temp[idy][idx] = a_d[(idy * (size+1)) + idx] ;
// Device-side printf; the format string has no trailing newline.
printf("idx=%d, idy=%d, size=%d", idx, idy, size);
....
}
Then I compile using -arch=sm_20 like this:
nvcc -c -arch sm_20 main.cu
nvcc -c -arch sm_20 Kernel.cu
nvcc -arch sm_20 main.o Kernel.o -o main
Now when I run the program, I see:
beforeDeviceFunc
calling kernel
kernel called
after DeviceFunc
So the printf() inside the kernel is not printed. How can I fix that?
printf() output is only displayed if the kernel finishes successfully, so check the return codes of all CUDA function calls and make sure no errors are reported.
Furthermore printf() output is only displayed at certain points in the program. Appendix B.32.2 of the Programming Guide lists these as
Kernel launch via <<<>>> or cuLaunchKernel() (at the start of the launch, and if the CUDA_LAUNCH_BLOCKING environment variable is set to 1, at the end of the launch as well),
Synchronization via cudaDeviceSynchronize(), cuCtxSynchronize(), cudaStreamSynchronize(), cuStreamSynchronize(), cudaEventSynchronize(), or cuEventSynchronize(),
Memory copies via any blocking version of cudaMemcpy*() or cuMemcpy*(),
Module loading/unloading via cuModuleLoad() or cuModuleUnload(),
Context destruction via cudaDeviceReset() or cuCtxDestroy().
Prior to executing a stream callback added by cudaStreamAddCallback() or cuStreamAddCallback().
To check this is your problem, put the following code after your kernel invocation:
{
    // Wait for the device to finish and capture the status of that wait.
    const cudaError_t syncStatus = cudaDeviceSynchronize();
    if (cudaSuccess != syncStatus) {
        printf("kernel launch failed with error \"%s\".\n", cudaGetErrorString(syncStatus));
    }
}
You should then see either the output of your kernel or an error message.
More conveniently, cuda-memcheck will automatically check all return codes for you if you run your executable under it. While you should always check for errors anyway, this comes in handy when resolving concrete issues.
I had the same error just now and decreasing the block size to 512 helped. According to documentation maximum block size can be either 512 or 1024.
I have written a simple test that showed that my GTX 1070 has a maximum block size of 1024. UPD: you can check whether your kernel was launched at all by calling cudaError_t cudaPeekAtLastError(), which returns cudaSuccess if the kernel has started successfully; only after that is it worth calling cudaError_t cudaDeviceSynchronize().
Testing block size of 1023
Testing block size of 1024
Testing block size of 1025
CUDA error: invalid configuration argument
Block maximum size is 1024
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
/**
 * Writes 1 into t[threadIdx.x].
 *
 * Only threadIdx.x is used as the index.
 */
__global__
void set1(int* t)
{
    const unsigned int slot = threadIdx.x;
    t[slot] = 1;
}
/**
 * Returns true when `error` is anything other than cudaSuccess.
 *
 * On failure the error's description is written to stderr first.
 * On success nothing is printed.
 */
inline bool failed(cudaError_t error)
{
    const bool isFailure = (error != cudaSuccess);
    if (isFailure)
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(error));
    return isFailure;
}
/**
 * Probes the largest usable 1-D block size by launching set1 with 1, 2, 3, ...
 * threads in a single block until a step fails.
 *
 * For each candidate size the buffer is allocated as managed memory, cleared,
 * filled by the kernel and then verified on the host. The loop stops at the
 * first allocation, launch, synchronization or verification failure; the size
 * before the failing one is reported.
 *
 * Returns 0 after printing the largest working block size, or 1 when not even
 * a block of one thread worked.
 */
int main()
{
    int blockSize;
    for (blockSize = 1; blockSize < 1 << 12; blockSize++)
    {
        printf("Testing block size of %d\n", blockSize);

        int* t = 0;
        // If the allocation fails there is nothing to free.
        if (failed(cudaMallocManaged(&t, blockSize * sizeof(int))))
            break;

        // Clear every element so that an element the kernel never wrote is
        // caught by the verification below.
        for (int i = 0; i < blockSize; i++)
            t[i] = 0;

        set1 <<<1, blockSize>>> (t);

        // Check the launch first; synchronize only if the launch was accepted.
        bool hasError = failed(cudaPeekAtLastError()) || failed(cudaDeviceSynchronize());

        if (!hasError)
        {
            for (int i = 0; i < blockSize; i++)
            {
                if (1 != t[i])
                {
                    printf("CUDA error: t[%d] = %d but not 1\n", i, t[i]);
                    hasError = true;
                    break;
                }
            }
        }

        // Single cleanup point for both the success and the failure path.
        failed(cudaFree(t));
        if (hasError)
            break;
    }

    // blockSize is the first size that failed (or the loop limit); step back
    // to the last size that worked.
    blockSize--;
    if (blockSize <= 0)
    {
        printf("CUDA error: block size cannot be 0\n");
        return 1;
    }
    printf("Block maximum size is %d", blockSize);
    return 0;
}
P.S. Please note that the only thing that matters in block sizing is the warp granularity, which is 32 nowadays, so if 0 == yourBlockSize % 32 the warps are used pretty efficiently. The only reason to make blocks bigger than 32 is when the code needs synchronization, as synchronization is available only among threads in a single block, which forces a developer to use a single large block instead of many small ones. So running with a higher number of smaller blocks can be even more efficient than running with a lower number of larger blocks.