Copy class data allocated on device back to host - cuda

In my code I want to allocate memory for a pointer data member of a class during kernel execution and write to it afterwards. Then I want to get this data on the host later. In my approach, however, I don't get the right data on the host (see below). Is my approach completely off or can you spot the erroneous part?
#include <cuda_runtime.h>
#include <stdio.h>

class OutputData {
public:
    int *data;
};

__global__ void init(OutputData *buffer)
{
    // allocate memory for data
    buffer->data = (int*) malloc(sizeof(int)*2);
    // write data
    buffer->data[0] = 1;
    buffer->data[1] = 2;
}

int main(int argc, char **argv)
{
    // malloc device memory
    OutputData *d_buffer;
    cudaMalloc(&d_buffer, sizeof(OutputData));

    // run kernel
    init<<<1,1>>>(d_buffer);
    cudaDeviceSynchronize();

    // malloc host memory
    OutputData *h_buffer = (OutputData*) malloc(sizeof(OutputData));

    // transfer data from device to host
    cudaMemcpy(h_buffer, d_buffer, sizeof(OutputData), cudaMemcpyDeviceToHost);
    int* h_data = (int*) malloc(sizeof(int)*2);
    cudaMemcpy(h_data, h_buffer->data, sizeof(int)*2, cudaMemcpyDeviceToHost);

    // print the data
    printf("h_data[0] = %d, h_data[1] = %d\n", h_data[0], h_data[1]);

    // free memory
    cudaFree(h_buffer->data);
    free(h_buffer);
    cudaFree(d_buffer);
    free(h_data);

    return 0;
}
The output is
h_data[0] = 0, h_data[1] = 0
and not
h_data[0] = 1, h_data[1] = 2
as expected.

As per the documentation:
In addition, device malloc() memory cannot be used in any runtime or driver API calls (i.e. cudaMemcpy, cudaMemset, etc).
To confirm this, let's run your code with cuda-memcheck:
$ nvcc -std=c++11 -arch=sm_52 -o heapcopy heapcopy.cu
$ cuda-memcheck ./heapcopy
========= CUDA-MEMCHECK
h_data[0] = 36791296, h_data[1] = 0
========= Program hit cudaErrorInvalidValue (error 11) due to "invalid argument" on CUDA API call to cudaMemcpy.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x3451c3]
========= Host Frame:./heapcopy [0x3cb0a]
========= Host Frame:./heapcopy [0x31ac]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf5) [0x21f45]
========= Host Frame:./heapcopy [0x2fd9]
=========
========= Program hit cudaErrorInvalidDevicePointer (error 17) due to "invalid device pointer" on CUDA API call to cudaFree.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x3451c3]
========= Host Frame:./heapcopy [0x44f00]
========= Host Frame:./heapcopy [0x31dc]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf5) [0x21f45]
========= Host Frame:./heapcopy [0x2fd9]
=========
========= ERROR SUMMARY: 2 errors
This is why your code fails -- the address stored in h_buffer->data is not accessible to host-side API calls. Note also that it cannot be freed from the host either.
You could instead do something like this, which uses a managed memory allocation as the host memory (so it is directly accessible within the kernel) and a device-side cudaMemcpyAsync call:
#include <cuda_runtime.h>
#include <stdio.h>

class OutputData {
public:
    int *data;
};

__global__ void init(OutputData *buffer)
{
    // allocate memory for data
    buffer->data = (int*) malloc(sizeof(int)*2);
    // write data
    buffer->data[0] = 1;
    buffer->data[1] = 2;
}

__global__ void deepcopy(OutputData* dest, OutputData* source, size_t datasz)
{
    cudaMemcpyAsync(dest->data, source->data, datasz, cudaMemcpyDeviceToDevice);
}

int main(int argc, char **argv)
{
    // malloc device memory
    OutputData *d_buffer;
    cudaMalloc(&d_buffer, sizeof(OutputData));

    // run kernel
    init<<<1,1>>>(d_buffer);
    cudaDeviceSynchronize();

    // malloc host memory as managed memory
    //OutputData *h_buffer = (OutputData*) malloc(sizeof(OutputData));
    //int* h_data = (int*) malloc(sizeof(int)*2);
    size_t dsize = sizeof(int)*2;
    OutputData* h_buffer; cudaMallocManaged(&h_buffer, sizeof(OutputData));
    int* h_data; cudaMallocManaged(&h_data, dsize);
    h_buffer->data = h_data;

    // run kernel
    deepcopy<<<1,1>>>(h_buffer, d_buffer, dsize);
    cudaDeviceSynchronize();

    // Print the data
    printf("h_data[0] = %d, h_data[1] = %d\n", h_data[0], h_data[1]);

    // free memory
    cudaFree(h_data);
    cudaFree(h_buffer);
    cudaFree(d_buffer);

    return 0;
}
This runs as expected (note that there is technically a device-heap memory leak here, because a device-side free call is never made):
$ nvcc -std=c++11 -arch=sm_52 -dc -o heapcopy.o heapcopy.cu
$ nvcc -std=c++11 -arch=sm_52 -o heapcopy heapcopy.o
$ cuda-memcheck ./heapcopy
========= CUDA-MEMCHECK
h_data[0] = 1, h_data[1] = 2
========= ERROR SUMMARY: 0 errors
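If the device-heap leak matters in your application, note that memory allocated with in-kernel malloc() can only be released by in-kernel free(). A minimal cleanup-kernel sketch (my addition, not part of the original code):
__global__ void cleanup(OutputData *buffer)
{
    // in-kernel malloc() memory must be freed by in-kernel free()
    free(buffer->data);
}
It would be launched as cleanup<<<1,1>>>(d_buffer); followed by cudaDeviceSynchronize(); before freeing d_buffer itself.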
There are other variations (such as building a complete mirror of the device-heap structure in global memory from the host and then running the copy kernel), but they make even less sense than this does.

Related

Writing on a register in parallel in CUDA C++

Given a 64-bit variable in a register in a kernel, say std::uint64_t var, I want to do some calculations and set each bit of this variable using 64 different threads, separately but in parallel. Is it possible to write to a shared variable in parallel?
__shared__ std::uint64_t var = 0

in each thread (tid = 0 to 63):
    do some calculations
    if we should set the bit with index = tid then:
        var |= ((std::uint64_t) 1 << tid)

I also realized that using atomicOr does not benefit us, as it only works on integers.
You can do this using a 64-bit shared atomicOr():
$ cat t2082.cu
#include <cstdio>
#include <cstdint>

__global__ void k(){
    __shared__ unsigned long long var;
    if (!threadIdx.x) var = 0;
    __syncthreads();
    atomicOr(&var, 1ULL<<threadIdx.x); // 1ULL so the shift is done in 64 bits
    __syncthreads();
    if (!threadIdx.x) printf("0x%llx\n", var);
}

int main(){
    k<<<1,64>>>();
    cudaDeviceSynchronize();
}
$ nvcc -o t2082 t2082.cu
$ compute-sanitizer ./t2082
========= COMPUTE-SANITIZER
0xffffffffffffffff
========= ERROR SUMMARY: 0 errors
$
The 64-bit shared atomicOr is supported on devices of cc3.5 or greater, which is the same device footprint supported by CUDA 11.
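If you want to guard this at runtime rather than at compile time, here is a sketch (my addition, not from the original answer) that queries the device's compute capability through cudaDeviceGetAttribute:
#include <cstdio>
#include <cuda_runtime.h>

int main(){
    int major = 0, minor = 0;
    cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, 0);
    cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, 0);
    // 64-bit shared atomicOr() requires compute capability 3.5 or greater
    bool ok = (major > 3) || (major == 3 && minor >= 5);
    printf("cc %d.%d -> 64-bit shared atomicOr %ssupported\n", major, minor, ok ? "" : "not ");
}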
If you were, for example, on CUDA 10.x and using a cc3.0 device, you could do this with two 32-bit variables:
$ cat t2082.cu
#include <cstdio>
#include <cstdint>

__global__ void k(){
    __shared__ unsigned var[2];
    if (!threadIdx.x) {var[0] = 0; var[1] = 0;}
    __syncthreads();
    if (threadIdx.x < 32)
        atomicOr(&(var[0]), 1u<<threadIdx.x);
    else
        atomicOr(&(var[1]), 1u<<(threadIdx.x-32));
    __syncthreads();
    unsigned long long var64 = (((unsigned long long)var[1])<<32) + var[0];
    if (!threadIdx.x) printf("0x%llx\n", var64);
}

int main(){
    k<<<1,64>>>();
    cudaDeviceSynchronize();
}
$ nvcc -o t2082 t2082.cu
$ compute-sanitizer ./t2082
========= COMPUTE-SANITIZER
0xffffffffffffffff
========= ERROR SUMMARY: 0 errors
$
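As an aside, when each thread contributes exactly one bit computed from a predicate, the mask can also be assembled with the warp-vote intrinsic __ballot_sync() instead of shared-memory atomics. A sketch of that variation (my addition; pred stands in for whatever per-thread condition you compute):
#include <cstdio>

__global__ void k(){
    bool pred = true;                               // stand-in for "should set bit tid"
    unsigned m = __ballot_sync(0xFFFFFFFFu, pred);  // 32-bit mask for this thread's warp
    __shared__ unsigned parts[2];
    if ((threadIdx.x % 32) == 0) parts[threadIdx.x / 32] = m;
    __syncthreads();
    if (!threadIdx.x){
        unsigned long long var = (((unsigned long long)parts[1])<<32) | parts[0];
        printf("0x%llx\n", var);
    }
}

int main(){
    k<<<1,64>>>();
    cudaDeviceSynchronize();
}
This requires CUDA 9 or newer for the _sync warp primitives.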

Profiling code with nsight compute on Pascal fails when cuda memory pools are used

I would like to use Nsight Compute for Pascal GPUs to profile a program which uses CUDA memory pools. I am using Linux, CUDA 11.5, driver 495.46. Nsight Compute is version 2019.5.0, which is the last version that supports Pascal.
Consider the following example program
// nvcc -std=c++17 -arch=sm_61 -O3 main.cu -o main
#include <vector>
#include <memory>
#include <cassert>
#include <iostream>
#include <cstdio> // for printf

__global__
void kernel(int* data){ data[0] = 1; }

int main(){
    cudaMemPool_t pool{};
    cudaMemPoolProps pool_props{};
    pool_props.allocType = cudaMemAllocationTypePinned;
    pool_props.handleTypes = cudaMemHandleTypePosixFileDescriptor;
    pool_props.location.type = cudaMemLocationTypeDevice;
    pool_props.location.id = 0;
    auto status = cudaMemPoolCreate(&pool, &pool_props);
    printf("%s\n", cudaGetErrorName(status));

    auto stream = cudaStreamPerThread;
    int data = 0;
    int* d_data;
    status = cudaMallocFromPoolAsync(&d_data, sizeof(int), pool, stream);
    printf("%s\n", cudaGetErrorName(status));
    kernel<<<1,1,0,stream>>>(d_data);
    status = cudaGetLastError();
    printf("%s\n", cudaGetErrorName(status));
    status = cudaMemcpyAsync(&data, d_data, sizeof(int), cudaMemcpyDeviceToHost, stream);
    printf("%s\n", cudaGetErrorName(status));
    status = cudaFreeAsync(d_data, stream);
    printf("%s\n", cudaGetErrorName(status));
    status = cudaDeviceSynchronize();
    printf("%s\n", cudaGetErrorName(status));
}
It runs fine without the profiler:
compute-sanitizer ./main
========= COMPUTE-SANITIZER
cudaSuccess
cudaSuccess
cudaSuccess
cudaSuccess
cudaSuccess
cudaSuccess
========= ERROR SUMMARY: 0 errors
When run under the profiler, using the pool API returns cudaErrorCallRequiresNewerDriver:
/opt/nvidia/nsight-compute-2019.5/nv-nsight-cu-cli ./main
==PROF== Connected to process 155966 (main)
cudaErrorCallRequiresNewerDriver
cudaErrorCallRequiresNewerDriver
==PROF== Profiling "kernel" - 1: 0%....50%....100% - 1 pass
==ERROR== Error 0: UnknownError
cudaErrorCallRequiresNewerDriver
cudaErrorIllegalAddress
cudaErrorCallRequiresNewerDriver
cudaErrorIllegalAddress
==PROF== Disconnected from process 155966
==ERROR== An error occurred while trying to profile
Is it possible to profile this program on Pascal using nsight compute?
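(Not an answer, but a diagnostic that may help narrow this down: the runtime exposes an attribute reporting whether memory pools are supported in the current device/driver context, so a sketch like the following, my addition, requiring CUDA 11.2 or newer for cudaDevAttrMemoryPoolsSupported, could show whether the profiler's environment is what disables the pool API.)
#include <cstdio>
#include <cuda_runtime.h>

int main(){
    int supported = 0;
    // 1 if the device/driver combination supports the cudaMemPool APIs, 0 otherwise
    cudaDeviceGetAttribute(&supported, cudaDevAttrMemoryPoolsSupported, 0);
    printf("memory pools supported: %d\n", supported);
}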

Check failed: error == cudaSuccess (77 vs. 0) an illegal memory access was encountered [duplicate]

This question already has an answer here:
CUDA - invalid device function, how to know [architecture, code]?
(1 answer)
Closed 2 years ago.
I'm debugging some lengthy code which involves some CUDA operations.
I'm currently getting the above-mentioned error during a call to cudaMemcpy(...,...,cudaMemcpyHostToDevice), but I'm not sure it is specifically related to that.
Here is a code snippet:
int num_elements = 8294400; // --> I also tried it with "1" here which didn't work either!
float *checkArray = new float[num_elements];
float *checkArray_GPU;
CUDA_CHECK(cudaMalloc(&checkArray_GPU, num_elements * sizeof(float)));
CUDA_CHECK(cudaMemcpy(checkArray_GPU, checkArray, num_elements * sizeof(float), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(checkArray, checkArray_GPU, num_elements * sizeof(float), cudaMemcpyDeviceToHost));
where CUDA_CHECK is simply a macro for printing any CUDA error (this was part of the existing code and works fine for all other cudaMemcpy or cudaMalloc calls, so it is not part of the problem). Strangely, this code snippet executed separately in a toy *.cu example works fine.
So my assumption is that due to previous cuda operations in the program, there have been some errors which have not been reported that cause the bug in the code snippet above. Could that be?
Is there a way to check if there is some unreported error involving cuda?
My other guess is that it might come from the specific graphics card I'm using. I have an NVIDIA Titan X (Pascal), CUDA 8.0 and cuDNN v5.1. I also tried to compile my code using some special compiler flags, like
-arch=sm_30 \
-gencode=arch=compute_20,code=sm_20 \
-gencode=arch=compute_30,code=sm_30 \
-gencode=arch=compute_50,code=sm_50 \
-gencode=arch=compute_52,code=sm_52 \
-gencode=arch=compute_52,code=compute_52 \
-gencode=arch=compute_60,code=sm_60 \
-gencode=arch=compute_61,code=sm_61 \
-gencode=arch=compute_62,code=sm_62 \
but it didn't help so far. Here is my current simplified Makefile for completeness:
NVCC = nvcc
CUDA_INC = -I/usr/local/cuda/include
CUDA_LIB = -L/usr/local/cuda/lib64
TARGET = myProgramm
OPTS = -std=c++11

$(TARGET).so: $(TARGET).o
	$(NVCC) $(OPTS) -shared $(TARGET).o $(CUDA_LIB) -o $(TARGET).so

$(TARGET).o: $(TARGET).cu headers/some_header.hpp
	$(NVCC) $(OPTS) $(CUDA_INC) -Xcompiler -fPIC -c $(TARGET).cu
Has anyone an idea how I could get to the bottom of this?
Edit:
cuda-memcheck was a good idea; the error apparently happens earlier, during a call to Kernel_set_value:
========= Invalid __global__ write of size 4
========= at 0x00000298 in void Kernel_set_value<float>(unsigned long, unsigned long, float*, float)
========= by thread (480,0,0) in block (30,0,0)
========= Address 0x0005cd00 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x2c5) [0x209035]
[...]
========= Host Frame:/media/.../myProgramm.so (_ZN5boost6python6detail6invokeIiPFvRKSsENS0_15arg_from_pythonIS4_EEEEP7_objectNS1_11invoke_tag_ILb1ELb0EEERKT_RT0_RT1_ + 0x2d) [0x3e5eb]
[...]
=========
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaMemcpy.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x2f4e33]
========= Host Frame:/media/.../myProgramm.so [0x7489f]
F0703 16:23:54.840698 26207 myProgramm.cu:411] Check failed: error == cudaSuccess (4 vs. 0) unspecified launch failure
[...]
========= Host Frame:python (Py_Main + 0xb5e) [0x66d92]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf5) [0x21f45]
========= Host Frame:python [0x177c2e]
=========
*** Check failure stack trace: ***
========= Error: process didn't terminate successfully
========= Internal error (20)
========= No CUDA-MEMCHECK results found
but the function Kernel_set_value also works fine in a toy example. Is there anything special to consider when using Kernel_set_value? Here are its source code and its helper functions:
#define CUDA_NUM_THREADS 512
#define MAX_NUM_BLOCKS 2880

inline int CUDA_GET_BLOCKS(const size_t N) {
    return min(MAX_NUM_BLOCKS, int((N + size_t(CUDA_NUM_THREADS) - 1) / CUDA_NUM_THREADS));
}

inline size_t CUDA_GET_LOOPS(const size_t N) {
    size_t total_threads = CUDA_GET_BLOCKS(N) * CUDA_NUM_THREADS;
    return (N + total_threads - 1) / total_threads;
}

template <typename Dtype>
__global__ void Kernel_set_value(size_t CUDA_NUM_LOOPS, size_t N, Dtype* GPUdst, Dtype value){
    const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x));
    if (idxBase >= N) return;
    for (size_t idx = idxBase; idx < min(N, idxBase + CUDA_NUM_LOOPS); ++idx){
        GPUdst[idx] = value;
    }
}
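For reference, these helpers imply a launch pattern along the following lines (a sketch of my own, since the actual call site isn't shown; it reuses checkArray_GPU and the element count from the snippet above):
// hypothetical call site: fill the device array with 0.0f
const size_t N = 8294400;
Kernel_set_value<float><<<CUDA_GET_BLOCKS(N), CUDA_NUM_THREADS>>>(CUDA_GET_LOOPS(N), N, checkArray_GPU, 0.0f);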
So the final solution was to compile the code without any -gencode=arch=compute_XX,code=sm_XX-style flags. It took me forever to find this out. The actual error codes were very misleading: error == cudaSuccess (77 vs. 0) an illegal memory access, (4 vs. 0) unspecified launch failure, or (8 vs. 0) invalid device function.
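As to the earlier question about unreported errors: kernel launches are asynchronous, so a launch failure often only surfaces at the next runtime API call. A common checking pattern (a sketch, not the asker's actual CUDA_CHECK macro) tests both the launch and the subsequent execution:
#include <cstdio>
#include <cuda_runtime.h>

// print file/line and the error string for any failing CUDA runtime call
#define CUDA_CHECK(call) do { \
    cudaError_t err = (call); \
    if (err != cudaSuccess) { \
        fprintf(stderr, "CUDA error %d (%s) at %s:%d\n", \
                (int)err, cudaGetErrorString(err), __FILE__, __LINE__); \
    } \
} while (0)

// usage after a kernel launch:
//   my_kernel<<<grid, block>>>(...);
//   CUDA_CHECK(cudaPeekAtLastError());    // catches launch errors, e.g. invalid device function
//   CUDA_CHECK(cudaDeviceSynchronize());  // catches errors raised during kernel execution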

float2 matrix (as 1D array) and CUDA

I have to work with a float2 matrix as a 1D array. I wanted to check some things and I have written this code:
#include <stdio.h>
#include <stdlib.h>
#define index(x,y) x+y*N
__global__ void test(float2* matrix_CUDA, int N)
{
    int i, j;
    i = blockIdx.x*blockDim.x + threadIdx.x;
    j = blockIdx.y*blockDim.y + threadIdx.y;
    matrix_CUDA[index(i,j)].x = i;
    matrix_CUDA[index(i,j)].y = j;
}

int main()
{
    int N = 256;
    int i, j;
    //////////////////////////////////////////
    float2* matrix;
    matrix = (float2*)malloc(N*N*sizeof(float2));
    //////////////////////////////////////////
    float2* matrix_CUDA;
    cudaMalloc((void**)&matrix_CUDA, N*N*sizeof(float2));
    //////////////////////////////////////////
    dim3 block_dim(32,2,0);
    dim3 grid_dim(2,2,0);
    test <<< grid_dim, block_dim >>> (matrix_CUDA, N);
    //////////////////////////////////////////
    cudaMemcpy(matrix, matrix_CUDA, N*N*sizeof(float2), cudaMemcpyDeviceToHost);
    for(i=0; i<N; i++)
    {
        for(j=0; j<N; j++)
        {
            printf("%d %d, %f %f\n", i, j, matrix[index(i,j)].x, matrix[index(i,j)].y);
        }
    }
    return 0;
}
I was expecting output like:
0 0, 0 0
0 1, 0 1
0 2, 0 2
0 3, 0 3
...
But what I actually get is:
0 0, -nan 7.265723657
0 1, -nan 152345
0 2, 25.2135235 -nan
0 3, 52354.324534 24.52354234523
...
That suggests I have some problem with the memory allocation (I suppose), but I can't find what is wrong with my code. Could someone help me?
Any time you are having trouble with a CUDA code, you should always use proper CUDA error checking and run your code with cuda-memcheck, before asking for help.
Even if you don't understand the output, it will be useful to others trying to help you.
If you had run this code with cuda-memcheck, you would have gotten (amongst all your other output!) some output like this:
$ cuda-memcheck ./t1273
========= CUDA-MEMCHECK
========= Program hit cudaErrorInvalidConfiguration (error 9) due to "invalid configuration argument" on CUDA API call to cudaLaunch.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/lib64/libcuda.so.1 [0x2eea03]
========= Host Frame:./t1273 [0x3616e]
========= Host Frame:./t1273 [0x2bfd]
========= Host Frame:./t1273 [0x299a]
========= Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf5) [0x21b15]
========= Host Frame:./t1273 [0x2a5d]
=========
========= ERROR SUMMARY: 1 error
$
This means something is wrong with the way you configured your kernel launch:
dim3 block_dim(32,2,0);
dim3 grid_dim(2,2,0);
test <<< grid_dim,block_dim >>> (matrix_CUDA,N);
^^^^^^^^^^^^^^^^^^
kernel config arguments
Specifically, you must never select a dimension of zero when creating a dim3 variable for a kernel launch. The minimum for any component is 1, not zero.
So use arguments like this:
dim3 block_dim(32,2,1);
dim3 grid_dim(2,2,1);
In addition, once you fix that, you will find that many of your outputs are still not touched by your code. To fix that, you'll need to increase the size of your thread array to match the size of your data array. Since you have a 1-D array, it's not really clear why you are launching 2D threadblocks and 2D grids. Your data array should be completely "coverable" with a total of 65536 threads in a linear dimension, something like this:
dim3 block_dim(32,1,1);
dim3 grid_dim(2048,1,1);
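Putting the pieces together, a minimal corrected sketch (my restatement of the fix, using a flat 1-D index with a bounds check instead of the original 2-D indexing):
__global__ void test(float2* matrix_CUDA, int N)
{
    int idx = blockIdx.x*blockDim.x + threadIdx.x; // flat 1-D index into the N*N array
    if (idx >= N*N) return;                        // guard against overrun
    matrix_CUDA[idx].x = idx % N;                  // i in the index(x,y) convention
    matrix_CUDA[idx].y = idx / N;                  // j
}

// launched with enough threads to cover all N*N = 65536 elements:
// dim3 block_dim(32,1,1);
// dim3 grid_dim(2048,1,1);
// test<<<grid_dim, block_dim>>>(matrix_CUDA, N);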

CUDA - Same algorithm works on CPU but not on GPU

I am currently working on my first project in CUDA and I ran into something odd that must be inherent to CUDA and that I don't understand or have overlooked. The same algorithm - the exact same one really, it involves no parallel work - works on the CPU but not on the GPU.
Let me explain in more detail. I am doing thresholding using Otsu's method; computing the threshold on the GPU duplicates computation but reduces transfer time. Long story short, this function:
__device__ double computeThreshold(unsigned int* histogram, int* nbPixels){
    double sum = 0;
    for (int i = 0; i < 256; i++){
        sum += i*histogram[i];
    }
    int sumB = 0, wB = 0, wF = 0;
    double mB, mF, max = 1, between = 0, threshold1 = 0, threshold2 = 0;
    for (int j = 0; j < 256 && !(wF == 0 && j != 0 && wB != 0); j++){
        wB += histogram[j];
        if (wB != 0) {
            wF = *nbPixels - wB;
            if (wF != 0){
                sumB += j*histogram[i];
                mB = sumB / wB;
                mF = (sum - sumB) / wF;
                between = wB * wF * (mB - mF) * (mB - mF);
                if (max < 2.0){
                    threshold1 = j;
                    if (between > max){
                        threshold2 = j;
                    }
                    max = between;
                }
            }
        }
    }
    return (threshold1 + threshold2) / 2.0;
}
This works as expected for an image size (i.e., number of pixels) that is not too big, but fails otherwise; interestingly, even if I don't use histogram and nbPixels in the function and replace all their occurrences by a constant, it still fails - even if I remove the arguments from the function. (What I mean by "fail" is that the first operation after the call to the kernel returns an unspecified launch failure.)
EDIT 3: OK, there was a small mistake due to copy/paste errors in what I provided before for testing. This version compiles and reproduces the error:
__device__ double computeThreshold(unsigned int* histogram, long int* nbPixels){
    double sum = 0;
    for (int i = 0; i < 256; i++){
        sum += i*histogram[i];
    }
    int sumB = 0, wB = 0, wF = 0;
    double mB, mF, max = 1, between = 0, threshold1 = 0, threshold2 = 0;
    for (int j = 0; j < 256 && !(wF == 0 && j != 0 && wB != 0); j++){
        wB += histogram[j];
        if (wB != 0) {
            wF = *nbPixels - wB;
            if (wF != 0){
                sumB += j*histogram[j];
                mB = sumB / wB;
                mF = (sum - sumB) / wF;
                between = wB * wF * (mB - mF) * (mB - mF);
                if (max < 2.0){
                    threshold1 = j;
                    if (between > max){
                        threshold2 = j;
                    }
                    max = between;
                }
            }
        }
    }
    return (threshold1 + threshold2) / 2.0;
}

__global__ void imageKernel(unsigned int* image, unsigned int* histogram, long int* nbPixels, double* t_threshold){
    unsigned int i = (blockIdx.x * blockDim.x) + threadIdx.x;
    if (i >= *nbPixels) return;
    double threshold = computeThreshold(histogram, nbPixels);
    unsigned int pixel = image[i];
    if (pixel >= threshold){
        pixel = 255;
    } else {
        pixel = 0;
    }
    image[i] = pixel;
    *t_threshold = threshold;
}

int main(){
    unsigned int histogram[256] = { 0 };
    const int width = 2048 * 4096;
    const int height = 1;
    unsigned int* myimage;
    myimage = new unsigned int[width*height];
    for (int i = 0; i < width*height; i++){
        myimage[i] = i % 256;
        histogram[i % 256]++;
    }
    const int threadPerBlock = 256;
    const int nbBlock = ceil((double)(width*height) / threadPerBlock);
    unsigned int* partial_histograms = new unsigned int[256 * nbBlock];

    dim3 dimBlock(threadPerBlock, 1);
    dim3 dimGrid(nbBlock, 1);

    unsigned int* dev_image;
    unsigned int* dev_histogram;
    unsigned int* dev_partial_histograms;
    double* dev_threshold;
    double x = 0;
    double* threshold = &x;
    long int* nbPixels;
    long int nb = width*height;
    nbPixels = &(nb);
    long int* dev_nbPixels;

    cudaSetDevice(0);
    cudaMalloc((void**)&dev_image, sizeof(unsigned int)*width*height);
    cudaMalloc((void**)&dev_histogram, sizeof(unsigned int)*256);
    cudaMalloc((void**)&dev_partial_histograms, sizeof(unsigned int)*256*nbBlock);
    cudaMalloc((void**)&dev_threshold, sizeof(double));
    cudaMalloc((void**)&dev_nbPixels, sizeof(long int));
    cudaMemcpy(dev_image, myimage, sizeof(unsigned int)*width*height, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_histogram, histogram, sizeof(unsigned int)*256, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_nbPixels, nbPixels, sizeof(long int), cudaMemcpyHostToDevice);

    imageKernel<<<dimGrid, dimBlock>>>(dev_image, dev_histogram, dev_nbPixels, dev_threshold);

    cudaMemcpy(histogram, dev_histogram, sizeof(unsigned int)*256, cudaMemcpyDeviceToHost);
    cudaMemcpy(partial_histograms, dev_partial_histograms, sizeof(unsigned int)*256*nbBlock, cudaMemcpyDeviceToHost);
    cudaMemcpy(threshold, dev_threshold, sizeof(double), cudaMemcpyDeviceToHost);

    cudaDeviceReset();
    return 0;
}
EDIT 4: the characteristics of my GPU
CUDA Device Query (Runtime API) version (CUDART static linking)
Detected 1 CUDA Capable device(s)
Device 0: "GeForce GT 750M"
CUDA Driver Version / Runtime Version 7.5 / 7.5
CUDA Capability Major/Minor version number: 3.0
Total amount of global memory: 2048 MBytes (2147483648 bytes)
( 2) Multiprocessors, (192) CUDA Cores/MP: 384 CUDA Cores
GPU Max Clock rate: 1085 MHz (1.09 GHz)
Memory Clock rate: 900 Mhz
Memory Bus Width: 128-bit
L2 Cache Size: 262144 bytes
Maximum Texture Dimension Size (x,y,z) 1D=(65536), 2D=(65536, 65536), 3D=(4096, 4096, 4096)
Maximum Layered 1D Texture Size, (num) layers 1D=(16384), 2048 layers
Maximum Layered 2D Texture Size, (num) layers 2D=(16384, 16384), 2048 layers
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 49152 bytes
Total number of registers available per block: 65536
Warp size: 32
Maximum number of threads per multiprocessor: 2048
Maximum number of threads per block: 1024
Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535)
Maximum memory pitch: 2147483647 bytes
Texture alignment: 512 bytes
Concurrent copy and kernel execution: Yes with 1 copy engine(s)
Run time limit on kernels: Yes
Integrated GPU sharing Host Memory: No
Support host page-locked memory mapping: Yes
Alignment requirement for Surfaces: Yes
Device has ECC support: Disabled
CUDA Device Driver Mode (TCC or WDDM): WDDM (Windows Display Driver Model)
Device supports Unified Addressing (UVA): Yes
Device PCI Domain ID / Bus ID / location ID: 0 / 1 / 0
Compute Mode:
< Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 7.5, CUDA Runtime Version = 7.5, NumDevs = 1, Device0 = GeForce GT 750M
Result = PASS
EDIT 5: I ran cuda-memcheck again and this time it did output an error message. I don't know why it didn't the first time; I must have done something wrong again. I hope you will pardon these hesitations and the wasted time. Here is the output:
========= CUDA-MEMCHECK
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaMemcpy.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:C:\WINDOWS\system32\nvcuda.dll (cuProfilerStop + 0xb7802) [0xdb1e2]
========= Host Frame:C:\Users\Nicolas\Cours\3PC\test.exe [0x160f]
========= Host Frame:C:\Users\Nicolas\Cours\3PC\test.exe [0xc764]
========= Host Frame:C:\Users\Nicolas\Cours\3PC\test.exe [0xfe24]
========= Host Frame:C:\WINDOWS\system32\KERNEL32.DLL (BaseThreadInitThunk + 0x22) [0x13d2]
========= Host Frame:C:\WINDOWS\SYSTEM32\ntdll.dll (RtlUserThreadStart + 0x34) [0x15454]
=========
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaMemcpy.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:C:\WINDOWS\system32\nvcuda.dll (cuProfilerStop + 0xb7802) [0xdb1e2]
========= Host Frame:C:\Users\Nicolas\Cours\3PC\test.exe [0x160f]
========= Host Frame:C:\Users\Nicolas\Cours\3PC\test.exe [0xc788]
========= Host Frame:C:\Users\Nicolas\Cours\3PC\test.exe [0xfe24]
========= Host Frame:C:\WINDOWS\system32\KERNEL32.DLL (BaseThreadInitThunk + 0x22) [0x13d2]
========= Host Frame:C:\WINDOWS\SYSTEM32\ntdll.dll (RtlUserThreadStart + 0x34) [0x15454]
=========
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaMemcpy.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:C:\WINDOWS\system32\nvcuda.dll (cuProfilerStop + 0xb7802) [0xdb1e2]
========= Host Frame:C:\Users\Nicolas\Cours\3PC\test.exe [0x160f]
========= Host Frame:C:\Users\Nicolas\Cours\3PC\test.exe [0xc7a6]
========= Host Frame:C:\Users\Nicolas\Cours\3PC\test.exe [0xfe24]
========= Host Frame:C:\WINDOWS\system32\KERNEL32.DLL (BaseThreadInitThunk + 0x22) [0x13d2]
========= Host Frame:C:\WINDOWS\SYSTEM32\ntdll.dll (RtlUserThreadStart + 0x34) [0x15454]
=========
========= ERROR SUMMARY: 3 errors
Not very telling though, is it?
OK, it turns out it wasn't an error on my side, but Windows deciding that 2 seconds was enough and that it needed to reset the GPU - stopping my computation there. Thanks a lot to @RobertCrovella, without whom I would never have found this out. And thanks to everyone who tried to answer as well.
So after a compilable example was finally provided (was it really so hard?), I can't reproduce any errors with this code (64-bit Linux, compute capability 3.0 device, CUDA 7.0 release version):
$ nvcc -arch=sm_30 -Xptxas="-v" histogram.cu
ptxas info : 0 bytes gmem
ptxas info : Compiling entry function '_Z11imageKernelPjS_PlPd' for 'sm_30'
ptxas info : Function properties for _Z11imageKernelPjS_PlPd
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 34 registers, 352 bytes cmem[0], 16 bytes cmem[2]
$ for i in `seq 1 20`;
> do
> cuda-memcheck ./a.out
> done
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
So if you can reproduce a runtime error by doing exactly as I have done, then your environment/hardware/toolkit version is subtly different in some way from mine. But in any case the code itself works, and you have a platform-specific issue I can't reproduce.