CUDA - Same algorithm works on CPU but not on GPU

I am currently working on my first project in CUDA and I ran into something odd that must be inherent to CUDA and that I don't understand or have overlooked. The same algorithm - the exact same one really, it involves no parallel work - works on the CPU but not on the GPU.
Let me explain in more detail. I am thresholding an image using Otsu's method; computing the threshold on the GPU duplicates some computation but reduces transfer time. Long story short, this function:
__device__ double computeThreshold(unsigned int* histogram, int* nbPixels){
    double sum = 0;
    for (int i = 0; i < 256; i++){
        sum += i*histogram[i];
    }
    int sumB = 0, wB = 0, wF = 0;
    double mB, mF, max = 1, between = 0, threshold1 = 0, threshold2 = 0;
    for (int j = 0; j < 256 && !(wF == 0 && j != 0 && wB != 0); j++){
        wB += histogram[j];
        if (wB != 0) {
            wF = *nbPixels - wB;
            if (wF != 0){
                sumB += j*histogram[i];
                mB = sumB / wB;
                mF = (sum - sumB) / wF;
                between = wB * wF *(mB - mF) *(mB - mF);
                if (max < 2.0){
                    threshold1 = j;
                    if (between > max){
                        threshold2 = j;
                    }
                    max = between;
                }
            }
        }
    }
    return (threshold1 + threshold2) / 2.0;
}
This works as expected for an image size (i.e. number of pixels) that is not too big, but fails otherwise; interestingly, even if I don't use histogram and nbPixels in the function and replace all their occurrences by a constant, it still fails - even if I remove the arguments from the function. (What I mean by "fail" is that the first CUDA API call after the kernel launch returns an unspecified launch failure.)
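(For reference, this is roughly how I detect the failure on the host side; a minimal sketch rather than my exact code, with the macro name purely illustrative:)
#include <cstdio>
#include <cuda_runtime.h>

// Illustrative helper: print the last CUDA error, if any.
#define CHECK_CUDA(msg)                                              \
    do {                                                             \
        cudaError_t e = cudaGetLastError();                          \
        if (e != cudaSuccess)                                        \
            printf("%s: %s\n", (msg), cudaGetErrorString(e));        \
    } while (0)

// usage around the launch:
//   imageKernel<<<dimGrid, dimBlock>>>(dev_image, dev_histogram, dev_nbPixels, dev_threshold);
//   CHECK_CUDA("launch");      // configuration errors show up here immediately
//   cudaDeviceSynchronize();
//   CHECK_CUDA("execution");   // the "unspecified launch failure" shows up here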
EDIT 3: OK, there was a small mistake due to copy/paste errors in what I provided before for testing. Now this compiles and lets you reproduce the error:
__device__ double computeThreshold(unsigned int* histogram, long int* nbPixels){
    double sum = 0;
    for (int i = 0; i < 256; i++){
        sum += i*histogram[i];
    }
    int sumB = 0, wB = 0, wF = 0;
    double mB, mF, max = 1, between = 0, threshold1 = 0, threshold2 = 0;
    for (int j = 0; j < 256 && !(wF == 0 && j != 0 && wB != 0); j++){
        wB += histogram[j];
        if (wB != 0) {
            wF = *nbPixels - wB;
            if (wF != 0){
                sumB += j*histogram[j];
                mB = sumB / wB;
                mF = (sum - sumB) / wF;
                between = wB * wF *(mB - mF) *(mB - mF);
                if (max < 2.0){
                    threshold1 = j;
                    if (between > max){
                        threshold2 = j;
                    }
                    max = between;
                }
            }
        }
    }
    return (threshold1 + threshold2) / 2.0;
}
__global__ void imageKernel(unsigned int* image, unsigned int* histogram, long int* nbPixels, double* t_threshold){
    unsigned int i = (blockIdx.x * blockDim.x) + threadIdx.x;
    if (i >= *nbPixels) return;
    double threshold = computeThreshold(histogram, nbPixels);
    unsigned int pixel = image[i];
    if (pixel >= threshold){
        pixel = 255;
    } else {
        pixel = 0;
    }
    image[i] = pixel;
    *t_threshold = threshold;
}
int main(){
    unsigned int histogram[256] = { 0 };
    const int width = 2048 * 4096;
    const int height = 1;
    unsigned int* myimage;
    myimage = new unsigned int[width*height];
    for (int i = 0; i < width*height; i++){
        myimage[i] = i % 256;
        histogram[i % 256]++;
    }
    const int threadPerBlock = 256;
    const int nbBlock = ceil((double)(width*height) / threadPerBlock);
    unsigned int* partial_histograms = new unsigned int[256 * nbBlock];
    dim3 dimBlock(threadPerBlock, 1);
    dim3 dimGrid(nbBlock, 1);
    unsigned int* dev_image;
    unsigned int* dev_histogram;
    unsigned int* dev_partial_histograms;
    double* dev_threshold;
    double x = 0;
    double* threshold = &x;
    long int* nbPixels;
    long int nb = width*height;
    nbPixels = &(nb);
    long int* dev_nbPixels;
    cudaSetDevice(0);
    cudaMalloc((void**)&dev_image, sizeof(unsigned int)*width*height);
    cudaMalloc((void**)&dev_histogram, sizeof(unsigned int)* 256);
    cudaMalloc((void**)&dev_partial_histograms, sizeof(unsigned int)* 256 * nbBlock);
    cudaMalloc((void**)&dev_threshold, sizeof(double));
    cudaMalloc((void**)&dev_nbPixels, sizeof(long int));
    cudaMemcpy(dev_image, myimage, sizeof(unsigned int)*width*height, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_histogram, histogram, sizeof(unsigned int)* 256, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_nbPixels, nbPixels, sizeof(long int), cudaMemcpyHostToDevice);
    imageKernel<<<dimGrid, dimBlock>>>(dev_image, dev_histogram, dev_nbPixels, dev_threshold);
    cudaMemcpy(histogram, dev_histogram, sizeof(unsigned int)* 256, cudaMemcpyDeviceToHost);
    cudaMemcpy(partial_histograms, dev_partial_histograms, sizeof(unsigned int)* 256 * nbBlock, cudaMemcpyDeviceToHost);
    cudaMemcpy(threshold, dev_threshold, sizeof(double), cudaMemcpyDeviceToHost);
    cudaDeviceReset();
    return 0;
}
EDIT 4: the characteristics of my GPU
CUDA Device Query (Runtime API) version (CUDART static linking)
Detected 1 CUDA Capable device(s)
Device 0: "GeForce GT 750M"
CUDA Driver Version / Runtime Version 7.5 / 7.5
CUDA Capability Major/Minor version number: 3.0
Total amount of global memory: 2048 MBytes (2147483648 bytes)
( 2) Multiprocessors, (192) CUDA Cores/MP: 384 CUDA Cores
GPU Max Clock rate: 1085 MHz (1.09 GHz)
Memory Clock rate: 900 Mhz
Memory Bus Width: 128-bit
L2 Cache Size: 262144 bytes
Maximum Texture Dimension Size (x,y,z) 1D=(65536), 2D=(65536, 65536), 3D=(4096, 4096, 4096)
Maximum Layered 1D Texture Size, (num) layers 1D=(16384), 2048 layers
Maximum Layered 2D Texture Size, (num) layers 2D=(16384, 16384), 2048 layers
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 49152 bytes
Total number of registers available per block: 65536
Warp size: 32
Maximum number of threads per multiprocessor: 2048
Maximum number of threads per block: 1024
Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535)
Maximum memory pitch: 2147483647 bytes
Texture alignment: 512 bytes
Concurrent copy and kernel execution: Yes with 1 copy engine(s)
Run time limit on kernels: Yes
Integrated GPU sharing Host Memory: No
Support host page-locked memory mapping: Yes
Alignment requirement for Surfaces: Yes
Device has ECC support: Disabled
CUDA Device Driver Mode (TCC or WDDM): WDDM (Windows Display Driver Model)
Device supports Unified Addressing (UVA): Yes
Device PCI Domain ID / Bus ID / location ID: 0 / 1 / 0
Compute Mode:
< Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 7.5, CUDA Runtime Version = 7.5, NumDevs = 1, Device0 = GeForce GT 750M
Result = PASS
EDIT 5: I ran cuda-memcheck again and this time it did output an error message. I don't know why it didn't the first time; I must have done something wrong again. I hope you will pardon these hesitations and the wasted time. Here is the output message:
========= CUDA-MEMCHECK
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaMemcpy.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:C:\WINDOWS\system32\nvcuda.dll (cuProfilerStop + 0xb7802) [0xdb1e2]
========= Host Frame:C:\Users\Nicolas\Cours\3PC\test.exe [0x160f]
========= Host Frame:C:\Users\Nicolas\Cours\3PC\test.exe [0xc764]
========= Host Frame:C:\Users\Nicolas\Cours\3PC\test.exe [0xfe24]
========= Host Frame:C:\WINDOWS\system32\KERNEL32.DLL (BaseThreadInitThunk + 0x22) [0x13d2]
========= Host Frame:C:\WINDOWS\SYSTEM32\ntdll.dll (RtlUserThreadStart + 0x34) [0x15454]
=========
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaMemcpy.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:C:\WINDOWS\system32\nvcuda.dll (cuProfilerStop + 0xb7802) [0xdb1e2]
========= Host Frame:C:\Users\Nicolas\Cours\3PC\test.exe [0x160f]
========= Host Frame:C:\Users\Nicolas\Cours\3PC\test.exe [0xc788]
========= Host Frame:C:\Users\Nicolas\Cours\3PC\test.exe [0xfe24]
========= Host Frame:C:\WINDOWS\system32\KERNEL32.DLL (BaseThreadInitThunk + 0x22) [0x13d2]
========= Host Frame:C:\WINDOWS\SYSTEM32\ntdll.dll (RtlUserThreadStart + 0x34) [0x15454]
=========
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaMemcpy.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:C:\WINDOWS\system32\nvcuda.dll (cuProfilerStop + 0xb7802) [0xdb1e2]
========= Host Frame:C:\Users\Nicolas\Cours\3PC\test.exe [0x160f]
========= Host Frame:C:\Users\Nicolas\Cours\3PC\test.exe [0xc7a6]
========= Host Frame:C:\Users\Nicolas\Cours\3PC\test.exe [0xfe24]
========= Host Frame:C:\WINDOWS\system32\KERNEL32.DLL (BaseThreadInitThunk + 0x22) [0x13d2]
========= Host Frame:C:\WINDOWS\SYSTEM32\ntdll.dll (RtlUserThreadStart + 0x34) [0x15454]
=========
========= ERROR SUMMARY: 3 errors
Not very telling though, is it?

OK, it turns out it wasn't an error on my side, but Windows deciding that 2 seconds was enough and resetting the GPU (the WDDM watchdog timeout), which killed my computation right there. Thanks a lot to @RobertCrovella, without whom I would never have found this out. And thanks to everyone who tried to answer as well.
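(For anyone hitting the same watchdog timeout: one way to keep the kernel well under the limit is to stop recomputing the threshold in every thread. A minimal sketch of the idea, reusing the buffers from the code above; the kernel names are just illustrative:)
// Hypothetical split: compute the Otsu threshold once, then apply it per pixel.
__global__ void thresholdKernel(unsigned int* histogram, long int* nbPixels, double* t_threshold){
    // a single thread does the 256-bin scan once, instead of once per pixel
    if (blockIdx.x == 0 && threadIdx.x == 0){
        *t_threshold = computeThreshold(histogram, nbPixels);
    }
}
__global__ void applyKernel(unsigned int* image, long int* nbPixels, const double* t_threshold){
    unsigned int i = (blockIdx.x * blockDim.x) + threadIdx.x;
    if (i >= *nbPixels) return;
    image[i] = (image[i] >= *t_threshold) ? 255 : 0;
}
// host side (illustrative):
//   thresholdKernel<<<1, 1>>>(dev_histogram, dev_nbPixels, dev_threshold);
//   applyKernel<<<dimGrid, dimBlock>>>(dev_image, dev_nbPixels, dev_threshold);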

So after providing a compilable example (was it really so hard?), I can't reproduce any errors with this code (64-bit Linux, compute 3.0 device, CUDA 7.0 release version):
$ nvcc -arch=sm_30 -Xptxas="-v" histogram.cu
ptxas info : 0 bytes gmem
ptxas info : Compiling entry function '_Z11imageKernelPjS_PlPd' for 'sm_30'
ptxas info : Function properties for _Z11imageKernelPjS_PlPd
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 34 registers, 352 bytes cmem[0], 16 bytes cmem[2]
$ for i in `seq 1 20`;
> do
> cuda-memcheck ./a.out
> done
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
So if you can reproduce a runtime error doing exactly as I have done, your environment/hardware/toolkit versions are subtly different in some way from mine. But in any case the code itself works, and you have a platform-specific issue I can't reproduce.

Related

Copy class data allocated on device back to host

In my code I want to allocate memory for a pointer data member of a class during kernel execution and write to it afterwards. Then I want to get this data on the host later. In my approach, however, I don't get the right data on the host (see below). Is my approach completely off or can you spot the erroneous part?
#include <cuda_runtime.h>
#include <stdio.h>

class OutputData {
public:
    int *data;
};

__global__ void init(OutputData *buffer)
{
    // allocate memory for data
    buffer->data = (int*) malloc(sizeof(int)*2);
    // write data
    buffer->data[0] = 1;
    buffer->data[1] = 2;
}

int main(int argc, char **argv)
{
    // malloc device memory
    OutputData *d_buffer;
    cudaMalloc(&d_buffer, sizeof(OutputData));
    // run kernel
    init<<<1,1>>>(d_buffer);
    cudaDeviceSynchronize();
    // malloc host memory
    OutputData *h_buffer = (OutputData*) malloc(sizeof(OutputData));
    // transfer data from device to host
    cudaMemcpy(h_buffer, d_buffer, sizeof(OutputData), cudaMemcpyDeviceToHost);
    int* h_data = (int*) malloc(sizeof(int)*2);
    cudaMemcpy(h_data, h_buffer->data, sizeof(int)*2, cudaMemcpyDeviceToHost);
    // Print the data
    printf("h_data[0] = %d, h_data[1] = %d\n", h_data[0], h_data[1]);
    // free memory
    cudaFree(h_buffer->data);
    free(h_buffer);
    cudaFree(d_buffer);
    free(h_data);
    return (0);
}
The output is
h_data[0] = 0, h_data[1] = 0
and not
h_data[0] = 1, h_data[1] = 2
as expected.
As per the documentation:
In addition, device malloc() memory cannot be used in any runtime or driver API calls (i.e. cudaMemcpy, cudaMemset, etc).
To confirm this, let's run your code with cuda-memcheck:
$ nvcc -std=c++11 -arch=sm_52 -o heapcopy heapcopy.cu
$ cuda-memcheck ./heapcopy
========= CUDA-MEMCHECK
h_data[0] = 36791296, h_data[1] = 0
========= Program hit cudaErrorInvalidValue (error 11) due to "invalid argument" on CUDA API call to cudaMemcpy.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x3451c3]
========= Host Frame:./heapcopy [0x3cb0a]
========= Host Frame:./heapcopy [0x31ac]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf5) [0x21f45]
========= Host Frame:./heapcopy [0x2fd9]
=========
========= Program hit cudaErrorInvalidDevicePointer (error 17) due to "invalid device pointer" on CUDA API call to cudaFree.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x3451c3]
========= Host Frame:./heapcopy [0x44f00]
========= Host Frame:./heapcopy [0x31dc]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf5) [0x21f45]
========= Host Frame:./heapcopy [0x2fd9]
=========
========= ERROR SUMMARY: 2 errors
This is why your code fails -- the address at h_buffer->data is not accessible to host APIs. Note also that it can't be freed from the host either.
You could do something like this, which uses a managed memory allocation as the host memory (so it is directly accessible within the kernel), and a device side cudaMemcpyAsync call:
#include <cuda_runtime.h>
#include <stdio.h>

class OutputData {
public:
    int *data;
};

__global__ void init(OutputData *buffer)
{
    // allocate memory for data
    buffer->data = (int*) malloc(sizeof(int)*2);
    // write data
    buffer->data[0] = 1;
    buffer->data[1] = 2;
}

__global__ void deepcopy(OutputData* dest, OutputData* source, size_t datasz)
{
    cudaMemcpyAsync(dest->data, source->data, datasz, cudaMemcpyDeviceToDevice);
}

int main(int argc, char **argv)
{
    // malloc device memory
    OutputData *d_buffer;
    cudaMalloc(&d_buffer, sizeof(OutputData));
    // run kernel
    init<<<1,1>>>(d_buffer);
    cudaDeviceSynchronize();
    // malloc host memory as managed memory
    //OutputData *h_buffer = (OutputData*) malloc(sizeof(OutputData));
    //int* h_data = (int*) malloc(sizeof(int)*2);
    size_t dsize = sizeof(int)*2;
    OutputData* h_buffer; cudaMallocManaged(&h_buffer, sizeof(OutputData));
    int* h_data; cudaMallocManaged(&h_data, dsize);
    h_buffer->data = h_data;
    // run kernel
    deepcopy<<<1,1>>>(h_buffer, d_buffer, dsize);
    cudaDeviceSynchronize();
    // Print the data
    printf("h_data[0] = %d, h_data[1] = %d\n", h_data[0], h_data[1]);
    // free memory
    cudaFree(h_data);
    cudaFree(h_buffer);
    cudaFree(d_buffer);
    return (0);
}
Which runs as expected (note there is technically a device heap memory leak here because a device-side free call is never made):
$ nvcc -std=c++11 -arch=sm_52 -dc -o heapcopy.o heapcopy.cu
$ nvcc -std=c++11 -arch=sm_52 -o heapcopy heapcopy.o
$ cuda-memcheck ./heapcopy
========= CUDA-MEMCHECK
h_data[0] = 1, h_data[1] = 2
========= ERROR SUMMARY: 0 errors
There are other variations (like building a complete mirror structure of the heap structure in global memory from the host and then running the copy kernel), but those make even less sense than this does.
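(If the device heap leak matters in a longer-running program, memory obtained with in-kernel malloc can only be released by an in-kernel free; a minimal sketch of what that could look like, with the kernel name purely illustrative:)
// Hypothetical cleanup kernel: in-kernel malloc() must be paired with in-kernel free().
__global__ void cleanup(OutputData *buffer)
{
    free(buffer->data);      // releases the allocation made in init()
    buffer->data = nullptr;
}
// called once the data has been copied out:
//   cleanup<<<1,1>>>(d_buffer);
//   cudaDeviceSynchronize();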

Check failed: error == cudaSuccess (77 vs. 0) an illegal memory access was encountered [duplicate]

This question already has an answer here: CUDA - invalid device function, how to know [architecture, code]?
I'm debugging some lengthy code which involves some CUDA operations.
I'm currently getting the above-mentioned error during a call to cudaMemcpy(...,...,cudaMemcpyHostToDevice), but I'm not sure it is specifically related to that.
Here is a code snippet:
int num_elements = 8294400; // --> I also tried it with "1" here which didn't work either!
float *checkArray = new float[num_elements];
float *checkArray_GPU;
CUDA_CHECK(cudaMalloc(&checkArray_GPU, num_elements * sizeof(float)));
CUDA_CHECK(cudaMemcpy(checkArray_GPU, checkArray, num_elements * sizeof(float), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(checkArray, checkArray_GPU, num_elements * sizeof(float), cudaMemcpyDeviceToHost));
where CUDA_CHECK is simply a macro for printing any CUDA error (this was part of the existing code and works fine for all other cudaMemcpy or cudaMalloc calls, so it is not part of the problem). Strangely, this code snippet executed separately in a toy *.cu example works fine.
So my assumption is that due to previous CUDA operations in the program, there have been some errors which have not been reported and which cause the bug in the code snippet above. Could that be?
Is there a way to check if there is some unreported error involving cuda?
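(One way to surface such unreported "sticky" errors would be to synchronize and poll the runtime's error state just before the suspicious call; a minimal sketch, assuming nothing about the surrounding code:)
#include <cstdio>
#include <cuda_runtime.h>

// Illustrative probe: report any error left over from earlier, asynchronous work.
void check_pending_cuda_errors(const char* where) {
    cudaError_t sync_err = cudaDeviceSynchronize();   // forces earlier kernels to finish
    cudaError_t last_err = cudaGetLastError();        // reads the sticky error state
    if (sync_err != cudaSuccess || last_err != cudaSuccess)
        printf("%s: sync=%s, last=%s\n", where,
               cudaGetErrorString(sync_err), cudaGetErrorString(last_err));
}
// e.g. call check_pending_cuda_errors("before checkArray copies"); right above the failing cudaMemcpy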
My other guess is that it might come from the specific graphics card I'm using. I have an Nvidia Titan X Pascal, CUDA 8.0 and cuDNN v5.1. I also tried to compile my code using some special compiler flags like
-arch=sm_30 \
-gencode=arch=compute_20,code=sm_20 \
-gencode=arch=compute_30,code=sm_30 \
-gencode=arch=compute_50,code=sm_50 \
-gencode=arch=compute_52,code=sm_52 \
-gencode=arch=compute_52,code=compute_52 \
-gencode=arch=compute_60,code=sm_60 \
-gencode=arch=compute_61,code=sm_61 \
-gencode=arch=compute_62,code=sm_62 \
but it didn't help so far. Here is my current simplified Makefile for completeness:
NVCC = nvcc
CUDA_INC = -I/usr/local/cuda/include
CUDA_LIB = -L/usr/local/cuda/lib64
TARGET = myProgramm
OPTS = -std=c++11

$(TARGET).so: $(TARGET).o
	$(NVCC) $(OPTS) -shared $(TARGET).o $(CUDA_LIB) -o $(TARGET).so

$(TARGET).o: $(TARGET).cu headers/some_header.hpp
	$(NVCC) $(OPTS) $(CUDA_INC) -Xcompiler -fPIC -c $(TARGET).cu
Does anyone have an idea how I could get to the bottom of this?
Edit:
cuda-memcheck was a good idea; the error apparently happens earlier, during a call to Kernel_set_value:
========= Invalid __global__ write of size 4
========= at 0x00000298 in void Kernel_set_value<float>(unsigned long, unsigned long, float*, float)
========= by thread (480,0,0) in block (30,0,0)
========= Address 0x0005cd00 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x2c5) [0x209035]
[...]
========= Host Frame:/media/.../myProgramm.so (_ZN5boost6python6detail6invokeIiPFvRKSsENS0_15arg_from_pythonIS4_EEEEP7_objectNS1_11invoke_tag_ILb1ELb0EEERKT_RT0_RT1_ + 0x2d) [0x3e5eb]
[...]
=========
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaMemcpy.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x2f4e33]
========= Host Frame:/media/.../myProgramm.so [0x7489f]
F0703 16:23:54.840698 26207 myProgramm.cu:411] Check failed: error == cudaSuccess (4 vs. 0) unspecified launch failure
[...]
========= Host Frame:python (Py_Main + 0xb5e) [0x66d92]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf5) [0x21f45]
========= Host Frame:python [0x177c2e]
=========
*** Check failure stack trace: ***
========= Error: process didn't terminate successfully
========= Internal error (20)
========= No CUDA-MEMCHECK results found
but the function Kernel_set_value also works fine in a toy example. Is there anything special to consider when using Kernel_set_value? Here are its source code and its respective helper functions.
#define CUDA_NUM_THREADS 512
#define MAX_NUM_BLOCKS 2880

inline int CUDA_GET_BLOCKS(const size_t N) {
    return min(MAX_NUM_BLOCKS, int((N + size_t(CUDA_NUM_THREADS) - 1) / CUDA_NUM_THREADS));
}

inline size_t CUDA_GET_LOOPS(const size_t N) {
    size_t total_threads = CUDA_GET_BLOCKS(N)*CUDA_NUM_THREADS;
    return (N + total_threads - 1) / total_threads;
}

template <typename Dtype>
__global__ void Kernel_set_value(size_t CUDA_NUM_LOOPS, size_t N, Dtype* GPUdst, Dtype value){
    const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x));
    if (idxBase >= N) return;
    for (size_t idx = idxBase; idx < min(N, idxBase + CUDA_NUM_LOOPS); ++idx){
        GPUdst[idx] = value;
    }
}
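(For reference, the helpers suggest the kernel is launched along these lines; this is an illustrative reconstruction, not the exact call site in myProgramm.cu, and it assumes <cstdio> and <cuda_runtime.h> are included:)
// Illustrative wrapper (not the original call site): fill a device buffer of N floats
// using the helpers above, then surface any launch/execution error immediately.
void set_value_on_gpu(float* GPUdst, size_t N, float value) {
    Kernel_set_value<float><<<CUDA_GET_BLOCKS(N), CUDA_NUM_THREADS>>>(
        CUDA_GET_LOOPS(N), N, GPUdst, value);
    cudaError_t err = cudaDeviceSynchronize();   // reports "invalid device function", out-of-bounds writes, etc.
    if (err != cudaSuccess)
        printf("Kernel_set_value failed: %s\n", cudaGetErrorString(err));
}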
So the final solution was to compile the code without any -gencode=arch=compute_XX,code=sm_XX-style flags. It took me forever to find this out. The actual error codes were very misleading: error == cudaSuccess (77 vs. 0) an illegal memory access, (4 vs. 0) unspecified launch failure, or (8 vs. 0) invalid device function.

Proper use of CUDA Fortran cuSolver functions

I am currently working on migrating some Fortran code over to CUDA Fortran. Specifically, the task involves the spectral analysis of massive matrices in order to diagonalize them. Here's the code I've fabricobbled so far:
program main
  ! Trials for usage of cusolverDn<t>syevd for spectral analysis of a symmetric matrix,
  ! see http://docs.nvidia.com/cuda/cusolver/index.html#syevd-example1 for the example used as a base
  ! Compilation example: 'pgf90 Main.cuf -lcusolver -Mcuda=cuda8.0'
  use cudafor    ! has to go first
  use cusolverDn
  implicit none
  integer :: info
  integer, parameter :: q2 = SELECTED_REAL_KIND(15,305)
  real(q2), device, dimension(3,3) :: A_d
  real(q2), dimension(3,3) :: A
  real(q2), device, dimension(3) :: W_d
  real(q2), dimension(3) :: W
  integer :: stat, lwork, m, lda
  real(q2), device, allocatable :: work_d(:)
  integer, device :: devInfo
  type(cusolverDnHandle) :: h

  stat = cusolverDnCreate(h)
  W_d = (/0,0,0/)
  print *, stat
  m = 3
  lda = m
  A_d(1,1:3) = (/4,1,2/)
  A_d(2,1:3) = (/1,-1,1/)
  A_d(3,1:3) = (/2,1,3/)   ! eigenvalues are 5.84947, 1.44865, -1.29812
  ! A_d(1,1:3) = (/1,0,0/)
  ! A_d(2,1:3) = (/0,1,0/)
  ! A_d(3,1:3) = (/0,0,1/)
  stat = cusolverDnDsyevd_bufferSize(h, CUSOLVER_EIG_MODE_NOVECTOR, CUBLAS_FILL_MODE_UPPER, m, A_d, lda, W_d, lwork)
  print *, stat
  allocate(work_d(lwork))
  stat = cusolverDnDsyevd(h, CUSOLVER_EIG_MODE_NOVECTOR, CUBLAS_FILL_MODE_UPPER, m, A_d, lda, W_d, work_d, lwork, devInfo)
  print *, stat   ! returns 6 as if there was an error
  info = devInfo
  print *, info   ! devInfo returns 0, as if the operation was successful
  stat = cudaDeviceSynchronize()
  print *, stat
  W = W_d
  print *, W
  A = A_d
  print *, A
  deallocate(work_d)
  stat = cusolverDnDestroy(h)
  print *, stat
end program main
Compilation and mem-check output are as follows:
olafur#olafur-X556UQK:~/Skyrmions2017/Project$ pgf90 Main.cuf -lcusolver -Mcuda=cuda8.0
olafur#olafur-X556UQK:~/Skyrmions2017/Project$ cuda-memcheck ./a.out
========= CUDA-MEMCHECK
0
0
========= Program hit cudaErrorInvalidDeviceFunction (error 8) due to "invalid device function" on CUDA API call to cudaLaunch.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x2ef503]
========= Host Frame:/opt/pgi/linux86-64/2017/cuda/8.0/lib64/libcusolver.so.8.0 [0x5b906e]
========= Host Frame:/opt/pgi/linux86-64/2017/cuda/8.0/lib64/libcusolver.so.8.0 [0x2e0857]
========= Host Frame:/opt/pgi/linux86-64/2017/cuda/8.0/lib64/libcusolver.so.8.0 [0x2e0270]
========= Host Frame:/opt/pgi/linux86-64/2017/cuda/8.0/lib64/libcusolver.so.8.0 [0x2e3df3]
========= Host Frame:/opt/pgi/linux86-64/2017/cuda/8.0/lib64/libcusolver.so.8.0 [0x2e1720]
========= Host Frame:/opt/pgi/linux86-64/2017/cuda/8.0/lib64/libcusolver.so.8.0 [0x2e0157]
========= Host Frame:/opt/pgi/linux86-64/2017/cuda/8.0/lib64/libcusolver.so.8.0 (cusolverDnDsytrd + 0x37) [0x2e3f17]
========= Host Frame:/opt/pgi/linux86-64/2017/cuda/8.0/lib64/libcusolver.so.8.0 [0x2ea607]
========= Host Frame:/opt/pgi/linux86-64/2017/cuda/8.0/lib64/libcusolver.so.8.0 [0x2eb744]
========= Host Frame:/opt/pgi/linux86-64/2017/cuda/8.0/lib64/libcusolver.so.8.0 (cusolverDnDsyevd + 0x27) [0x2ea157]
========= Host Frame:./a.out [0x1b2d]
========= Host Frame:./a.out [0x1514]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf0) [0x20830]
========= Host Frame:./a.out [0x13f9]
=========
6
========= Program hit cudaErrorInvalidDeviceFunction (error 8) due to "invalid device function" on CUDA API call to cudaGetLastError.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x2ef503]
========= Host Frame:/opt/pgi/linux86-64/2017/cuda/8.0/lib64/libcusolver.so.8.0 [0x5b6793]
========= Host Frame:/opt/pgi/linux86-64/2017/cuda/8.0/lib64/libcusolver.so.8.0 [0x2e1727]
========= Host Frame:/opt/pgi/linux86-64/2017/cuda/8.0/lib64/libcusolver.so.8.0 [0x2e0157]
========= Host Frame:/opt/pgi/linux86-64/2017/cuda/8.0/lib64/libcusolver.so.8.0 (cusolverDnDsytrd + 0x37) [0x2e3f17]
0
========= Host Frame:/opt/pgi/linux86-64/2017/cuda/8.0/lib64/libcusolver.so.8.0 [0x2ea607]
========= Host Frame:/opt/pgi/linux86-64/2017/cuda/8.0/lib64/libcusolver.so.8.0 [0x2eb744]
========= Host Frame:/opt/pgi/linux86-64/2017/cuda/8.0/lib64/libcusolver.so.8.0 (cusolverDnDsyevd + 0x27) [0x2ea157]
========= Host Frame:./a.out [0x1b2d]
0
========= Host Frame:./a.out [0x1514]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf0) [0x20830]
========= Host Frame:./a.out [0x13f9]
=========
0.000000000000000 0.000000000000000 0.000000000000000
4.000000000000000 1.000000000000000 2.000000000000000
1.000000000000000 -1.000000000000000 1.000000000000000
2.000000000000000 1.000000000000000 3.000000000000000
0
========= ERROR SUMMARY: 2 errors
It looks like I'm not actually invoking the cusolverDnDsyevd functions properly; most likely I am not using the right types of variables. But since I am semi-illiterate in programming, and the only example I have to follow is written in C (using those fancy void** things), I don't know what is proper.
EDIT: Full output of deviceQuery
olafur#olafur-X556UQK:~/NVIDIA_CUDA-8.0_Samples/1_Utilities/deviceQuery$ ./deviceQuery
./deviceQuery Starting...
CUDA Device Query (Runtime API) version (CUDART static linking)
Detected 1 CUDA Capable device(s)
Device 0: "GeForce 940MX"
CUDA Driver Version / Runtime Version 8.0 / 8.0
CUDA Capability Major/Minor version number: 5.0
Total amount of global memory: 2002 MBytes (2099642368 bytes)
( 3) Multiprocessors, (128) CUDA Cores/MP: 384 CUDA Cores
GPU Max Clock rate: 1242 MHz (1.24 GHz)
Memory Clock rate: 900 Mhz
Memory Bus Width: 64-bit
L2 Cache Size: 1048576 bytes
Maximum Texture Dimension Size (x,y,z) 1D=(65536), 2D=(65536, 65536), 3D=(4096, 4096, 4096)
Maximum Layered 1D Texture Size, (num) layers 1D=(16384), 2048 layers
Maximum Layered 2D Texture Size, (num) layers 2D=(16384, 16384), 2048 layers
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 49152 bytes
Total number of registers available per block: 65536
Warp size: 32
Maximum number of threads per multiprocessor: 2048
Maximum number of threads per block: 1024
Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535)
Maximum memory pitch: 2147483647 bytes
Texture alignment: 512 bytes
Concurrent copy and kernel execution: Yes with 1 copy engine(s)
Run time limit on kernels: Yes
Integrated GPU sharing Host Memory: No
Support host page-locked memory mapping: Yes
Alignment requirement for Surfaces: Yes
Device has ECC support: Disabled
Device supports Unified Addressing (UVA): Yes
Device PCI Domain ID / Bus ID / location ID: 0 / 1 / 0
Compute Mode:
< Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 8.0, CUDA Runtime Version = 8.0, NumDevs = 1, Device0 = GeForce 940MX
Result = PASS
Since the code works fine on another system at my disposal, the problem was indeed a runtime environment issue, as suggested by Robert Crovella.
Moral of the story: Always try at least 2 systems.

Error with 'cuda-memcheck' in CUDA 8.0

It is strange that when I do not add cuda-memcheck before ./main, the program runs without any warning or error message; however, when I add it, I get error messages like the following.
========= Invalid __global__ write of size 8
========= at 0x00000120 in initCurand(curandStateXORWOW*, unsigned long)
========= by thread (9,0,0) in block (3,0,0)
========= Address 0x5005413b0 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x2c5) [0x204115]
========= Host Frame:./main [0x18e11]
========= Host Frame:./main [0x369b3]
========= Host Frame:./main [0x3403]
========= Host Frame:./main [0x308c]
========= Host Frame:./main [0x30b7]
========= Host Frame:./main [0x2ebb]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf0) [0x20830]
Here are my functions. As a brief introduction to the code: I try to generate random numbers and save them to a device variable weights, then use this vector to sample from a discrete distribution.
#include <iostream>
#include <curand.h>
#include <curand_kernel.h>
#include <time.h>
using namespace std;

#define num 100

__device__ float weights[num];

// function to define seed
__global__ void initCurand(curandState *state, unsigned long seed){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    curand_init(seed, idx, 0, &state[idx]);
}

__device__ void sampling(float *weight, float max_weight, int *index, curandState *state){
    int j;
    float u;
    do{
        j = (int)(curand_uniform(state) * (num + 0.999999));
        u = curand_uniform(state); // sample from uniform distribution
    }while( u > weight[j]/max_weight);
    *index = j;
}

__global__ void test(int *dev_sample, curandState *state){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    // generate random numbers from uniform distribution and save them to weights
    weights[idx] = curand_uniform(&state[idx]);
    // run sampling function, in which weights is an input for the function on each thread
    sampling(weights, 1, dev_sample+idx, &state[idx]);
}

int main(){
    // define the seed of random generator
    curandState *devState;
    cudaMalloc((void**)&devState, num*sizeof(curandState));
    int *h_sample;
    h_sample = (int*) malloc(num*sizeof(int));
    int *d_sample;
    cudaMalloc((void**)&d_sample, num*sizeof(float));
    initCurand<<<(int)num/32 + 1, 32>>>(devState, 1);
    test<<<(int)num/32 + 1, 32>>>(d_sample, devState);
    cudaMemcpy(h_sample, d_sample, num*sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 0; i < num; ++i)
    {
        cout << *(h_sample + i) << endl;
    }
    // free memory
    cudaFree(devState);
    free(h_sample);
    cudaFree(d_sample);
    return 0;
}
I have just started to learn CUDA, so if my way of accessing global memory is incorrect, please help me with that. Thanks.
This is launching "extra" threads:
initCurand<<<(int)num/32 + 1, 32>>>(devState, 1);
num is 100, so the above config will launch 4 blocks of 32 threads each, i.e. 128 threads. But you are only allocating space for 100 curandState here:
cudaMalloc((void**)&devState, num*sizeof(curandState));
So your initCurand kernel will have some threads (idx = 100-127) that are attempting to initialize some curandState that you haven't allocated. As a result, when you run cuda-memcheck, which does fairly rigorous out-of-bounds checking, an error is reported.
One possible solution would be to modify your initCurand kernel as follows:
__global__ void initCurand(curandState *state, unsigned long seed, int num){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < num)
        curand_init(seed, idx, 0, &state[idx]);
}
This will prevent any out-of-bounds threads from doing anything. Note that you will need to modify the kernel call to pass num to it. Also, it appears to me you have a similar problem in your test kernel; you may want to do something similar to fix it there (see the sketch below). This is a common construct in CUDA kernels; I call it a "thread check". You can find other questions here on the SO cuda tag discussing this same concept.
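(A sketch of what the same thread check could look like in the test kernel, assuming num is passed in the same way; other fixes are possible:)
// Hypothetical fix for the test kernel: the same thread check, so threads 100-127
// never touch weights[], state[] or dev_sample[] out of bounds.
__global__ void test(int *dev_sample, curandState *state, int num){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= num) return;   // thread check
    weights[idx] = curand_uniform(&state[idx]);
    sampling(weights, 1, dev_sample + idx, &state[idx]);
}
// launched as: test<<<num/32 + 1, 32>>>(d_sample, devState, num);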

Determining dimGrid and dimBlock sizes in CUDA

First off, I'm fairly new to CUDA programming so I apologize for such a simple question. I have researched the best way to determine dimGrid and dimBlock in my GPU kernel call and for some reason I'm not quite getting it to work.
On my home PC, I have a GeForce GTX 580 (Compute Capability 2.0), 1024 threads per block, etc. I can get my code to run properly on this PC. My GPU is populating a distance array of size 988*988. Here is part of the code:
#define SIZE 988

__global__ void createDistanceTable(double *d_distances, double *d_coordinates)
{
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    if(row < SIZE && col < SIZE)
        d_distances[row * SIZE + col] =
            acos(__sinf(d_coordinates[row * 2 + 0])*
            __sinf(d_coordinates[col * 2 + 0])+__cosf(d_coordinates[row * 2 + 0])*
            __cosf(d_coordinates[col * 2 + 0])*__cosf(d_coordinates[col * 2 + 1]-
            d_coordinates[row * 2 + 1]))*6371;
}
Kernel call in main:
dim3 dimBlock(32,32,1);
dim3 dimGrid(32,32,1);
createDistanceTable<<<dimGrid, dimBlock>>>(d_distances, d_coordinates);
My issue is I simply have not found a way to get this code to run properly on my laptop. My laptop's GPU is a GeForce 9600M GT (Compute Capability 1.1), 512 threads per block, etc. I would greatly appreciate any guidance in helping me understand how I should approach dimBlock and dimGrid for my kernel call on my laptop. Thanks for any advice!
Several things were wrong in your code.
Using double-precision on CC < 1.3.
The size of your thread blocks (as you said, CC <= 1.3 means 512 threads max per block, you used 1024 threads per block). I guess you could use __CUDA_ARCH__ if you do need some multi-architecture code.
No error checking or memory checking (cuda-memcheck). You may allocate more memory than you have, or use more threads/blocks than your GPU can handle, and you will not detect it.
Consider the following example based on your code (I am using float instead of double):
#include <cuda.h>
#include <stdio.h> // printf

#define SIZE 988
#define GRID_SIZE 32
#define BLOCK_SIZE 16 // set to 16 instead of 32 for instance

#define CUDA_CHECK_ERROR() __cuda_check_errors(__FILE__, __LINE__)
#define CUDA_SAFE_CALL(err) __cuda_safe_call(err, __FILE__, __LINE__)

// See: http://codeyarns.com/2011/03/02/how-to-do-error-checking-in-cuda/
inline void
__cuda_check_errors (const char *filename, const int line_number)
{
  cudaError err = cudaDeviceSynchronize ();
  if (err != cudaSuccess)
    {
      printf ("CUDA error %i at %s:%i: %s\n",
              err, filename, line_number, cudaGetErrorString (err));
      exit (-1);
    }
}

inline void
__cuda_safe_call (cudaError err, const char *filename, const int line_number)
{
  if (err != cudaSuccess)
    {
      printf ("CUDA error %i at %s:%i: %s\n",
              err, filename, line_number, cudaGetErrorString (err));
      exit (-1);
    }
}

__global__ void
createDistanceTable (float *d_distances, float *d_coordinates)
{
  int col = blockIdx.x * blockDim.x + threadIdx.x;
  int row = blockIdx.y * blockDim.y + threadIdx.y;
  if (row < SIZE && col < SIZE)
    d_distances[row * SIZE + col] =
      acos (__sinf (d_coordinates[row * 2 + 0]) *
            __sinf (d_coordinates[col * 2 + 0]) +
            __cosf (d_coordinates[row * 2 + 0]) *
            __cosf (d_coordinates[col * 2 + 0]) *
            __cosf (d_coordinates[col * 2 + 1] -
                    d_coordinates[row * 2 + 1])) * 6371;
}

int
main ()
{
  float *d_distances;
  float *d_coordinates;
  CUDA_SAFE_CALL (cudaMalloc (&d_distances, SIZE * SIZE * sizeof (float)));
  CUDA_SAFE_CALL (cudaMalloc (&d_coordinates, SIZE * SIZE * sizeof (float)));
  dim3 dimGrid (GRID_SIZE, GRID_SIZE);
  dim3 dimBlock (BLOCK_SIZE, BLOCK_SIZE);
  createDistanceTable <<< dimGrid, dimBlock >>> (d_distances, d_coordinates);
  CUDA_CHECK_ERROR ();
  CUDA_SAFE_CALL (cudaFree (d_distances));
  CUDA_SAFE_CALL (cudaFree (d_coordinates));
}
Compilation command (change architecture accordingly):
nvcc prog.cu -g -G -lineinfo -gencode arch=compute_11,code=sm_11 -o prog
With 32x32 block on CC 2.0 or 16x16 on CC 1.1:
cuda-memcheck ./prog
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
With 33x33 block on CC 2.0 or 32x32 block on CC 1.1:
cuda-memcheck ./prog
========= CUDA-MEMCHECK
========= Program hit error 9 on CUDA API call to cudaLaunch
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/nvidia-current-updates/libcuda.so [0x26a230]
========= Host Frame:/opt/cuda/lib64/libcudart.so.5.0 (cudaLaunch + 0x242) [0x2f592]
========= Host Frame:./prog [0xc76]
========= Host Frame:./prog [0xa99]
========= Host Frame:./prog [0xac4]
========= Host Frame:./prog [0x9d1]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xed) [0x2176d]
========= Host Frame:./prog [0x859]
=========
========= ERROR SUMMARY: 1 error
Error 9:
/**
 * This indicates that a kernel launch is requesting resources that can
 * never be satisfied by the current device. Requesting more shared memory
 * per block than the device supports will trigger this error, as will
 * requesting too many threads or blocks. See ::cudaDeviceProp for more
 * device limitations.
 */
cudaErrorInvalidConfiguration = 9,
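(A portable way to avoid error 9 in the first place is to derive the block size from the device properties at runtime instead of hard-coding 32x32. A minimal sketch reusing the macros and names from the example above; the sizing policy itself is just illustrative:)
  // Illustrative runtime sizing: query the device limit, pick a square 2D block
  // that fits, then derive the grid from the problem size (SIZE x SIZE).
  cudaDeviceProp prop;
  CUDA_SAFE_CALL (cudaGetDeviceProperties (&prop, 0));
  int side = (prop.maxThreadsPerBlock >= 1024) ? 32 : 16;   // 32x32 on CC >= 2.0, 16x16 on CC 1.1
  dim3 dimBlock (side, side);
  dim3 dimGrid ((SIZE + side - 1) / side, (SIZE + side - 1) / side);
  createDistanceTable <<< dimGrid, dimBlock >>> (d_distances, d_coordinates);
  CUDA_CHECK_ERROR ();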