Printing inside a cuda __global__ function kills the thread - cuda

I'm having a weird problem with my code. If I try to print the value of a certain variable inside a thread nothing gets written to the screen and all the threads stop at that point. Here is the code:
// Half-width of the square pixel window; each side spans 2*WINSIZE+1 pixels.
#define WINSIZE 1
// Number of pixels in one window.
const int nebsize=(WINSIZE*2+1)*(WINSIZE*2+1);
// One thread per flat pixel index `ind`. For every pixel that lies inside the w*h image, is at
// least WINSIZE away from each border and has consts[ind]==0, the kernel gathers the surrounding
// window from `img` (3 doubles per pixel), computes the per-channel mean, a regularised 3x3
// second-moment matrix and its inverse, and finally writes the marker 666 into consts[ind].
//   img     - image data read as img[3*(row*w+col)+channel] (presumably interleaved RGB; confirm)
//   consts  - per-pixel flags; read as a mask, then overwritten with 666 for processed pixels
//   w, h    - image width and height in pixels
//   epsilon - added (scaled by 1/nebsize) to the diagonal of the 3x3 matrix before inversion
__global__ void loop(double *img, int *consts, int w, int h, double epsilon){
int ind=blockIdx.x*blockDim.x+threadIdx.x;
// ind%w is the column and ind/w the row of this pixel.
if(ind<w*h && !consts[ind] && ind%w>=WINSIZE && ind%w<w-WINSIZE && ind/w>=WINSIZE && ind/w<h-WINSIZE){
// NOTE(review): win_inds is written below but never read; winI_re_aux and tvals are never
// used at all. All of these are per-thread arrays.
int win_inds[nebsize];
double winI[3*(2*WINSIZE+1)*(2*WINSIZE+1)];
double winI_re_aux[3*nebsize];
double pre_win_var[9];
double win_var[9];
double win_mu[3];
double tvals[nebsize*nebsize];
double detwin;
// Inclusive column (i) and row (j) bounds of the window centred on this pixel.
int min_i=ind%w-WINSIZE;
int max_i=ind%w+WINSIZE;
int min_j=ind/w-WINSIZE;
int max_j=ind/w+WINSIZE;
int k;
int l;
k=0;
for(int i=min_i; i<=max_i; i++){
for(int j=min_j; j<=max_j; j++){
win_inds[k]=h*i+j;
k++;
}
}
// Copy the window into winI: k counts rows (j), l counts columns (i); the three channel
// values of a pixel are stored contiguously.
k=0;
for(int j=min_j; j<=max_j; j++){
l=0;
for(int i=min_i; i<=max_i; i++){
winI[3*(l*(2*WINSIZE+1)+k)]=img[3*(j*w+i)];
winI[3*(l*(2*WINSIZE+1)+k)+1]=img[3*(j*w+i)+1];
winI[3*(l*(2*WINSIZE+1)+k)+2]=img[3*(j*w+i)+2];
l++;
}
k++;
}
// Per-channel mean over the window.
win_mu[0]=0;
win_mu[1]=0;
win_mu[2]=0;
for(int i=0; i<nebsize; i++){
win_mu[0]+=winI[3*i];
win_mu[1]+=winI[3*i+1];
win_mu[2]+=winI[3*i+2];
}
win_mu[0]=win_mu[0]/(double)nebsize;
win_mu[1]=win_mu[1]/(double)nebsize;
win_mu[2]=win_mu[2]/(double)nebsize;
//all ok here
//this works here
// NOTE(review): win_var[8] is not assigned until the end of the kernel, so this prints an
// uninitialised value.
if(ind==200){
printf("%f\n", win_var[8]);
}
// pre_win_var[3*i+j] = mean(c_i*c_j) - mu_i*mu_j, plus epsilon/nebsize on the diagonal.
for(int i=0; i<3; i++){
for(int j=0; j<3; j++){
pre_win_var[3*i+j]=0;
for(int n=0; n<nebsize; n++){
pre_win_var[3*i+j]+=winI[3*n+i]*winI[3*n+j];
}
pre_win_var[3*i+j]=pre_win_var[3*i+j]/(double)nebsize;
pre_win_var[3*i+j]+=(i==j)*epsilon/(double)nebsize-win_mu[j]*win_mu[i];
}
}
//this kills all threads
// NOTE(review): still reads win_var[8] before it has been assigned.
if(ind==200){
printf("%f\n", win_var[8]);
}
// Determinant of the 3x3 matrix pre_win_var (rule of Sarrus).
detwin=pre_win_var[0]*pre_win_var[4]*pre_win_var[8]+pre_win_var[2]*pre_win_var[3]*pre_win_var[7]+pre_win_var[1]*pre_win_var[5]*pre_win_var[6];
detwin-=pre_win_var[6]*pre_win_var[4]*pre_win_var[2]+pre_win_var[3]*pre_win_var[1]*pre_win_var[8]+pre_win_var[7]*pre_win_var[5]*pre_win_var[0];
// win_var = inverse of pre_win_var: signed 2x2 minors divided by the determinant.
// NOTE(review): there is no guard against detwin being zero.
win_var[0]=(pre_win_var[4]*pre_win_var[8]-pre_win_var[5]*pre_win_var[7])/detwin;
win_var[3]=-(pre_win_var[3]*pre_win_var[8]-pre_win_var[5]*pre_win_var[6])/detwin;
win_var[6]=(pre_win_var[3]*pre_win_var[7]-pre_win_var[4]*pre_win_var[6])/detwin;
win_var[1]=-(pre_win_var[1]*pre_win_var[8]-pre_win_var[2]*pre_win_var[7])/detwin;
win_var[4]=(pre_win_var[0]*pre_win_var[8]-pre_win_var[2]*pre_win_var[6])/detwin;
win_var[7]=-(pre_win_var[0]*pre_win_var[7]-pre_win_var[1]*pre_win_var[6])/detwin;
win_var[2]=(pre_win_var[1]*pre_win_var[5]-pre_win_var[2]*pre_win_var[4])/detwin;
win_var[5]=-(pre_win_var[0]*pre_win_var[5]-pre_win_var[2]*pre_win_var[3])/detwin;
win_var[8]=(pre_win_var[0]*pre_win_var[4]-pre_win_var[1]*pre_win_var[3])/detwin;
//this line gets executed in all threads if I printf nothing
// Marker showing that the thread reached the end of the kernel.
consts[ind]=666;
}
}
Printing the values of win_var or pre_win_var is possible only before the values are calculated; if I try to print them after that, it seems to kill all the threads. If I print nothing, the line consts[ind]=666 gets executed in all threads. I know this because I can copy consts back to host memory and print it. Does anyone have any idea what's wrong?

The problem appears to be one of resource exhaustion. You are getting cudaErrorLaunchOutOfResources at launch with printf enabled because of the larger register footprint of the kernel with the ABI call included.
You didn't provide any details about your launch parameters, but reducing the total threads per block to a smaller multiple of 32 should cure the problem.

Related

Cuda printf() overlapping when using multiple devices

I have a printf in my __global__ code. It works as intended most of the time. However, when using a multi-GPU system (it typically happens when run on a 4-8 GPU system), once in a while the prints will merge. By "once in a while" I mean about 100-500 lines out of 167000 lines.
I was wondering how this situation can be remedied without adding too much overhead of transferring the data back to host (if possible). I was thinking to try a mutex lock for printing but I dont think that sort of thing exists for use in the kernel. Any other solutions I could try?
Note: The actual kernel is a long running kernel usually around 20-50 minutes to complete depending on the GPU.
Note2: I barely know what I'm doing with C/C++.
Example of merged Output
JmHp8rwXAw,031aa97714c800de47971829beded204000cfcf5e0f3775552ccf3e9b387869fxLuZJu3ZkX
qVOuKlQ0ZcMrhGXAnZ75,08bf3e90a57c31b7f355214cdf442748d9ff6ae1d49a96f7a8b9e3c86bd8e68a,5231a9e969d53c64f75bb1f07b1c95bb81f685744ed46f56348c733389c56ca5
,623f62b3198c8b62cd7a3b3cf8bf8ede5f9bfdccb7c1dc48a55530c7d5f59ce8
What it should look like
JmHp8rwXAw,031aa97714c800de47971829beded204000cfcf5e0f3775552ccf3e9b387869f
MrhGXAnZ75,08bf3e90a57c31b7f355214cdf442748d9ff6ae1d49a96f7a8b9e3c86bd8e68a
qVOuKlQ0Zc,5231a9e969d53c64f75bb1f07b1c95bb81f685744ed46f56348c733389c56ca5
xLuZJu3ZkX,623f62b3198c8b62cd7a3b3cf8bf8ede5f9bfdccb7c1dc48a55530c7d5f59ce8
My Example Code:
#define BLOCKS 384
#define THREADS 64
/* Argument handed to each host worker thread: `device` is the CUDA device
   ordinal the thread passes to cudaSetDevice. */
typedef struct HandlerInput {
unsigned char device;
} HandlerInput;
pthread_mutex_t solutionLock;
/*
 * Each thread derives a seed from its global index plus baseSeed, generates a
 * random string from it, hashes that string, and prints "randomStr,Hash" with
 * the device-side printf.
 */
__global__ void kernel(unsigned long baseSeed) {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned long seed = (unsigned long)gid + baseSeed;

    BYTE candidate[RANDOM_LEN];
    BYTE digest[DIGEST_LEN];
    SHA256_CTX hashState;

    /* Fill `candidate` from the per-thread seed. */
    d_getRandomString(seed, candidate);

    /* Hash RANDOM_LEN bytes of `candidate` into `digest`. */
    sha256_hash(&hashState, candidate, digest, RANDOM_LEN);

    /* Both buffers are printed with %s. */
    printf("%s,%s\n", candidate, digest);
}
/*
 * pthread entry point. `vargp` points to a HandlerInput naming the device to
 * use. The thread selects that device and then launches `kernel` forever,
 * refreshing the seed and synchronising after every launch.
 */
void *launchGPUHandlerThread(void *vargp) {
    HandlerInput *args = (HandlerInput *)vargp;
    cudaSetDevice(args->device);

    unsigned long seed = timeus();
    for (;;) {
        hostRandomGen(&seed);
        kernel<<<BLOCKS, THREADS>>>(seed);
        cudaDeviceSynchronize();
    }

    /* Never reached: the loop above has no exit. */
    cudaDeviceReset();
    return NULL;
}
/*
 * Starts one worker thread per CUDA device, takes solutionLock, and then
 * waits for every worker to finish.
 */
int main() {
    int deviceCount;
    cudaGetDeviceCount(&deviceCount);

    pthread_t *workers = (pthread_t *)malloc(sizeof(pthread_t) * deviceCount);

    int dev = 0;
    while (dev < deviceCount) {
        /* Each worker gets its own heap-allocated argument block. */
        HandlerInput *args = (HandlerInput *)malloc(sizeof(HandlerInput));
        args->device = dev;
        pthread_create(&workers[dev], NULL, launchGPUHandlerThread, args);
        usleep(23);
        dev++;
    }

    pthread_mutex_lock(&solutionLock);

    for (int w = 0; w < deviceCount; w++) {
        pthread_join(*(workers + w), NULL);
    }
    return 0;
}
I spent 4 days trying different things to no avail. I really don't understand memory management enough in C/C++ to get past the endless segmentation fault errors.
What I ended up doing was using Unified Memory as it seemed the easiest way to handle the memory for both device and host and it doesn't seem to add too much overhead to the whole process. Then each cpu thread (gpu) can write to its own file. I ran a couple of nvprof and it seemed that after the initial setup for the memory cudaMallocManaged the rest of the overhead seemed to be measured in the microseconds. Since each loop takes 20 minutes these are really barely noticeable.
I created two __device__ functions to copy the data over to the host accessible arrays, because I wanted to utilize the #pragma unroll feature. Not really sure if that helps or what it even does, but I decided to do things this way.
If anyone has further suggestions on ways to improve I am open to trying more things out.
Here is my new example code:
#define BLOCKS 384
#define THREADS 64
/* Argument handed to each host worker thread: `device` is the CUDA device
   ordinal the thread passes to cudaSetDevice (and later to print2File). */
typedef struct HandlerInput {
unsigned char device;
} HandlerInput;
/*
 * Copies 64 bytes from src to dst and NUL-terminates the result at dst[64].
 *
 * dst must have room for at least 65 bytes and src must provide 64 readable
 * bytes. The two ranges must not overlap (both pointers are restrict).
 *
 * The restrict qualifier belongs on the pointer, i.e. after the `*`; written
 * before the pointee type it would qualify BYTE itself, which is not a
 * pointer type.
 */
__device__ void mycpydigest(BYTE *__restrict__ dst, const BYTE *__restrict__ src) {
#pragma unroll 64
    for (int i = 0; i < 64; i++) {
        dst[i] = src[i];
    }
    dst[64] = '\0';
}
/*
 * Copies 10 bytes from src to dst and NUL-terminates the result at dst[10].
 *
 * dst must have room for at least 11 bytes and src must provide 10 readable
 * bytes. The two ranges must not overlap (both pointers are restrict).
 *
 * The restrict qualifier belongs on the pointer, i.e. after the `*`; written
 * before the pointee type it would qualify BYTE itself, which is not a
 * pointer type.
 */
__device__ void mycpyrandom(BYTE *__restrict__ dst, const BYTE *__restrict__ src) {
#pragma unroll 10
    for (int i = 0; i < 10; i++) {
        dst[i] = src[i];
    }
    dst[10] = '\0';
}
/*
 * Each thread generates a random string from (global index + baseSeed),
 * hashes it, and copies both results into the per-thread output buffers
 * d_hashes[gid] and d_random[gid].
 *
 * There is no bounds check: the launch must supply exactly one thread per
 * entry of d_random / d_hashes.
 */
__global__ void kernel(BYTE **d_random, BYTE **d_hashes, unsigned long baseSeed) {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned long seed = (unsigned long)gid + baseSeed;

    BYTE candidate[RANDOM_LEN];
    BYTE digest[DIGEST_LEN];
    SHA256_CTX hashState;

    /* Fill `candidate` from the per-thread seed. */
    d_getRandomString(seed, candidate);

    /* Hash RANDOM_LEN bytes of `candidate` into `digest`. */
    sha256_hash(&hashState, candidate, digest, RANDOM_LEN);

    /* Publish hash first, then the random string, into this thread's slots. */
    BYTE *hashOut = d_hashes[gid];
    mycpydigest(hashOut, digest);
    BYTE *randomOut = d_random[gid];
    mycpyrandom(randomOut, candidate);
}
/*
 * pthread entry point. `vargp` points to a HandlerInput naming the device to
 * use. The thread selects that device, allocates one managed output buffer
 * pair per GPU thread, and then loops forever: launch `kernel`, wait for it,
 * and pass the results to print2File.
 *
 * Returns NULL (only reachable if the loop is ever given an exit).
 */
void *launchGPUHandlerThread(void *vargp) {
    HandlerInput *hi = (HandlerInput *)vargp;
    cudaSetDevice(hi->device);
    unsigned long rngSeed = timeus();

    /* BLOCKS and THREADS are file-level macros, not members of HandlerInput. */
    int threadBlocks = BLOCKS * THREADS;

    /* Tables of per-thread output pointers, visible to host and device. */
    BYTE **randoms;
    BYTE **hashes;
    cudaMallocManaged(&randoms, sizeof(BYTE *) * (threadBlocks), cudaMemAttachGlobal);
    cudaMallocManaged(&hashes, sizeof(BYTE *) * (threadBlocks), cudaMemAttachGlobal);
    for (int i = 0; i < threadBlocks; i++) {
        /* NOTE(review): mycpyrandom writes dst[0..10] and mycpydigest writes
           dst[0..64], so these buffers need at least 11 and 65 bytes.
           RANDOM_LEN / DIGEST_LEN are defined elsewhere; confirm they are
           large enough. */
        cudaMallocManaged(&randoms[i], sizeof(BYTE) * (RANDOM_LEN), cudaMemAttachGlobal);
        cudaMallocManaged(&hashes[i], sizeof(BYTE) * (DIGEST_LEN), cudaMemAttachGlobal);
    }

    while (1) {
        hostRandomGen(&rngSeed);
        kernel<<<BLOCKS, THREADS>>>(randoms, hashes, rngSeed);
        /* The managed buffers may only be read on the host after the kernel finished. */
        cudaDeviceSynchronize();
        print2File(randoms, hashes, threadBlocks, hi->device);
    }

    /* Cleanup (unreachable while the loop above has no exit): release every
       per-thread buffer before the pointer tables that reference them. */
    for (int i = 0; i < threadBlocks; i++) {
        cudaFree(hashes[i]);
        cudaFree(randoms[i]);
    }
    cudaFree(hashes);
    cudaFree(randoms);
    cudaDeviceReset();
    return NULL;
}
/*
 * Starts one worker thread per CUDA device and waits for all of them.
 */
int main() {
    int deviceCount;
    cudaGetDeviceCount(&deviceCount);

    pthread_t *workers = (pthread_t *)malloc(sizeof(pthread_t) * deviceCount);

    int dev = 0;
    while (dev < deviceCount) {
        /* Each worker gets its own heap-allocated argument block. */
        HandlerInput *args = (HandlerInput *)malloc(sizeof(HandlerInput));
        args->device = dev;
        pthread_create(&workers[dev], NULL, launchGPUHandlerThread, args);
        usleep(23);
        dev++;
    }

    for (int w = 0; w < deviceCount; w++) {
        pthread_join(*(workers + w), NULL);
    }
    return 0;
}
I want to thank #paleonix for the help in the comments. I was working on this issue for a week before I posted and your comments helped guide me down a different path.

Passing a row of pointers to __global__ function

I am trying to pass a row of pointers from a two-dimensional array of pointers to a CUDA kernel. See my code below. Here the array of pointers is noLocal. Because I am doing an atomicAdd, I expect a number different from zero in the line printf("Holaa %d\n", local[0][0]);, but the value I get is 0. Could you help me pass a row by reference in CUDA, please?
// Kernel from the question. `data` is declared as an array of 8 int pointers,
// which as a parameter is simply an int**. Every thread tries to atomically
// add 10 to eight consecutive ints reached through the FIRST pointer only.
__global__ void myadd(int *data[8])
{
// x, y, z and tid are computed but never used.
unsigned int x = blockIdx.x;
unsigned int y = threadIdx.x;
unsigned int z = threadIdx.y;
int tid = blockDim.x * blockIdx.x + threadIdx.x;
//printf("Ola sou a td %d\n", tid);
// NOTE(review): i has no initialiser, so its starting value is indeterminate.
for (int i; i<8; i++)
// (*data)[i] dereferences data[0] and offsets it by i; data[1]..data[7] are never used.
atomicAdd(&(*data)[i],10);
}
// Host driver from the question: one single-int device allocation per element
// of a 20x8 table, one kernel launch per row, then copy everything back.
int main(void)
{
int local[20][8] = { 0 };
// 160 separate device pointers; the table holding them lives in host memory.
int *noLocal[20][8];
for (int d = 0; d< 20;d++) {
for (int dd = 0; dd< 8; dd++) {
// Each element gets its own device allocation of exactly one int.
cudaMalloc(&(noLocal[d][dd]), sizeof(int));
cudaMemcpy(noLocal[d][dd], &(local[d][dd]), sizeof(int), cudaMemcpyHostToDevice);
}
// NOTE(review): noLocal[d] is the address of a row of the host-side table,
// so the kernel receives a host pointer. 20 blocks of 10x20 threads are launched.
myadd<<<20, dim3(10, 20)>>>(noLocal[d]);
}
// No return code is checked anywhere, so a failed launch goes unnoticed.
for (int d = 0; d< 20;d++)
for (int dd = 0; dd < 8; dd++)
cudaMemcpy(&(local[d][dd]), noLocal[d][dd], sizeof(int), cudaMemcpyDeviceToHost);
printf("Holaa %d\n", local[0][0]);
for (int d = 0; d < 20; d++)
for (int dd = 0; dd < 8; dd++)
cudaFree(noLocal[d][dd]);
}
I believe you received good advice in the other answer. I don't recommend this coding pattern. For general reference material on creating 2D arrays in CUDA, see this answer.
When I compile the code you have shown, I get warnings of the form "i is used before its value is set". This kind of warning should not be ignored. It arises from this statement which doesn't make sense to me:
for (int i; i<8; i++)
that should be:
for (int i = 0; i<8; i++)
It's not clear you understand the C++ concepts of pointers and arrays. This:
int local[20][8] = { 0 };
represents an array of 20x8 = 160 integers. If you want to imagine it as an array of pointers, you could pretend that it includes 20 pointers of the form local[0], local[1]..local[19]. Each of those "pointers" points to an array of 8 integers. But there is no sensible comparison to suggest that it has 160 pointers in it. Furthermore the usage pattern you indicate in your kernel does not suggest that you expect 160 pointers to individual integers. But that is exactly what you are creating here:
int *noLocal[20][8]; //this is declaring a 2D array of 160 *pointers*
for (int d = 0; d< 20;d++) { // the combination of these loops means
for (int dd = 0; dd< 8; dd++) { // you will create 160 *pointers*
cudaMalloc(&(noLocal[d][dd]), sizeof(int));
To mimic your host array (local) you want to create 20 pointers each of which is pointing to an allocation of 8 int quantities. The usage in your kernel code here:
&(*data)[i]
means that you intend to take a single pointer, and offset it by i values ranging from 0 to 7. It does not mean that you expect to receive 8 individual pointers. Again, this is C++ behavior, not unique or specific to CUDA.
In order to make your code "sensible" there were a variety of changes I had to make. Here's a "fixed" version:
$ cat t1858.cu
#include <cstdio>
// Every launched thread atomically adds 10 to each of the 8 ints starting at
// `data`. The thread and block indices are not used, so all threads update
// the same 8 locations.
__global__ void myadd(int data[8])
{
    int *slot = data;
    int *const end = data + 8;
    while (slot != end) {
        atomicAdd(slot, 10);
        ++slot;
    }
}
// Reports a failed CUDA call on stderr; returns true when `err` is cudaSuccess.
static bool cudaOk(cudaError_t err, const char *what)
{
    if (err == cudaSuccess) return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Allocates one 8-int device row per host row, launches myadd on each row
// (20 blocks of 10x20 threads), copies the rows back and prints local[0][0].
// Returns 0 on success and 1 as soon as any CUDA call or launch fails.
int main(void)
{
    int local[20][8] = { 0 };
    int *noLocal[20];   // one device pointer per row of `local`

    for (int d = 0; d < 20; d++) {
        if (!cudaOk(cudaMalloc(&(noLocal[d]), 8*sizeof(int)), "cudaMalloc")) return 1;
        if (!cudaOk(cudaMemcpy(noLocal[d], local[d], 8*sizeof(int), cudaMemcpyHostToDevice),
                    "cudaMemcpy (host to device)")) return 1;
        myadd<<<20, dim3(10, 20)>>>(noLocal[d]);
        // A launch does not return an error itself; query it explicitly.
        if (!cudaOk(cudaGetLastError(), "myadd launch")) return 1;
    }

    // The blocking device-to-host copy also surfaces errors raised while the kernels ran.
    for (int d = 0; d < 20; d++) {
        if (!cudaOk(cudaMemcpy(local[d], noLocal[d], 8*sizeof(int), cudaMemcpyDeviceToHost),
                    "cudaMemcpy (device to host)")) return 1;
    }

    printf("Holaa %d\n", local[0][0]);

    for (int d = 0; d < 20; d++)
        cudaFree(noLocal[d]);
    return 0;
}
$ nvcc -o t1858 t1858.cu
$ cuda-memcheck ./t1858
========= CUDA-MEMCHECK
Holaa 40000
========= ERROR SUMMARY: 0 errors
$
The number 40000 is correct. It comes about because every thread is doing an atomic add of 10, and you have 20x200 threads that are doing that. 10x20x200 = 40000.
You should simply not be doing anything like that. You are wasting time and memory with these excessive allocations. And - your kernel would be pretty slow as well. I am 100% certain this is not what you were asked, nor what you wanted, to do.
Instead, you should:
Allocate a single large buffer on the device to fit the data you need.
Avoid using pointers on the device side, except to that buffer, unless absolutely necessary.
If you somehow have to use a 2D pointer array - add relevant offsets to your buffer's base pointer to get different pointers into it.

how to create a matrix in gpu and print it on cpu?

This is a code to create a matrix on gpu and print it out on cpu. Can anyone tell me where am I going wrong. Thank you.
# include <stdio.h>
// Each thread writes 1 into element [threadIdx.x][threadIdx.y] of the
// pointer-to-rows matrix d_a. There is no bounds check; the launch shape must
// match the matrix dimensions.
__global__ void create(int **d_a){
    int *rowPtr = d_a[threadIdx.x];
    rowPtr[threadIdx.y] = 1;
}
// Fetches (and clears) the last CUDA runtime error. If there is one, prints
// its description and terminates the process with exit code -1.
void errorCheck(){
    const cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) {
        return;
    }
    printf("CUDA error: %s\n", cudaGetErrorString(status));
    exit(-1);
}
# define N 5
// Host driver from the question: tries to build an N x N int matrix on the
// device as an array of row pointers, fill it with `create`, and print it.
int main(){
int **d_a, **a;
// Host matrix: N row pointers, each pointing to N ints.
a = (int**)malloc(N * sizeof(int*));
for (int i =0; i < N; i++){
a[i] = (int*)malloc(N*sizeof(int));
}
// Device array intended to hold the N row pointers.
cudaMalloc((void***)&d_a, N*sizeof(int*));
for (int i =0; i < N; i++){
// NOTE(review): every iteration overwrites d_a itself, so the row-pointer
// array is lost and no row pointer is ever stored on the device.
cudaMalloc((void**)&d_a,N*sizeof(int));
}
errorCheck();
// NOTE(review): launches a 1-D block of N threads although the kernel also
// indexes with threadIdx.y.
create <<<1, N>>>(d_a);
errorCheck();
// NOTE(review): copies N*N ints into `a`, which was allocated as N row
// pointers, and thereby overwrites those pointers.
cudaMemcpy(a, d_a, (N*N)*sizeof(int),cudaMemcpyDeviceToHost);
for (int i =0; i < N; i++ ){
for (int j = 0; j < N; j++ ){
printf("%d", a[i][j]);
}
printf("\n");
}
cudaFree(d_a);
// Only the pointer array is freed; the rows a[i] are not.
free(a);
return 0;
}
Is there something wrong with memory allocation or memcpy ?
Is there something wrong with memory allocation or memcpy ?
Yes on both counts.
This:
cudaMalloc((void***)&d_a, N*sizeof(int*));
for (int i =0; i < N; i++){
cudaMalloc((void**)&d_a,N*sizeof(int));
}
would have to be done like this:
cudaMalloc((void***)&d_a, N*sizeof(int*));
for (int i=0; i < N; i++){
int *row;
cudaMalloc((void**)&row, N*sizeof(int));
cudaMemcpy(d_a+i, &row, sizeof(int*), cudaMemcpyHostToDevice);
}
And then this:
create <<<1, N>>>(d_a);
errorCheck();
cudaMemcpy(a, d_a, (N*N)*sizeof(int),cudaMemcpyDeviceToHost);
would have to be done like this:
create <<<1, dim3(N,N)>>>(d_a);
errorCheck();
for(int i=0; i<N; i++) {
int* row;
cudaMemcpy(&row, d_a+i, sizeof(int*), cudaMemcpyDeviceToHost);
cudaMemcpy(a[i], row, sizeof(int) * N, cudaMemcpyDeviceToHost);
}
[All code written in browser and not tested, use at own risk]
In short, you have decided to work with an array of pointers. This requires additional CUDA API operations because the row pointers in the GPU copy are not accessible on the host by standard assignment. You must use cudaMemcpy in every case.

CUDA_SAFE_CALL: an illegal memory access was encountered

I am trying to do a simple matrix multiplication in CUDA. I know arrays can be flattened before passing them to the device. However, I am using cudaMallocPitch and cudaMemcpy2D to do the multiplication. While executing the code below, I get the error "an illegal memory access was encountered" when I try to copy the result to the host. I would highly appreciate any advice on where I am going wrong. Thanks!
weights-first matrix,dim:30x784
input- second matrix,dim:784x100
results_d - result on the device(GPU)
result - result copied on the host
#include <stdio.h>
#include <math.h>
#include <cstdio>
#include <cstdlib>
#define CUDA_SAFE_CALL(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Backs the CUDA_SAFE_CALL macro. If `code` is not cudaSuccess, prints the
// error description together with the call site (file, line) to stderr and,
// unless `abort` is false, terminates the process using the error code as the
// exit status.
//   file - source file of the call site; const because the macro passes
//          __FILE__, which is a string literal
//   line - source line of the call site
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"CUDA_SAFE_CALL: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
// Kernel from the question: each thread is meant to compute one element of
// the product of two pitched matrices. The *_pitch arguments are the row
// pitches of the three allocations.
__global__ void MatrixMulKernel(double *input,double *weights,double *results_d,size_t in_pitch,size_t w1_pitch,size_t result_pitch)
{
// Indices come from threadIdx only (no block offset) and are not range-checked.
int row = threadIdx.x;
int col= threadIdx.y;
// NOTE(review): value has no initialiser.
double value;
double *result_matrix;
// NOTE(review): col is added inside the char* arithmetic, i.e. as a byte
// offset, unlike the two element accesses in the loop below.
result_matrix = ((double*)((char*)results_d + row*result_pitch + col));
// NOTE(review): threadIdx is a struct, not an int, yet it is printed with %d.
printf("%d",threadIdx);
// NOTE(review): the loop bound is the pitch of `input`, not a matrix dimension.
for(int i =0 ; i < in_pitch ; i++)
{
// Pitched access: advance `row` (or i) rows in bytes, then index as double.
double *element1 = ((double*)((char*)input + row*in_pitch) + i) ;
double *element2 = ((double*)((char*)weights + i*w1_pitch) + col);
// NOTE(review): "=+" assigns the (unary plus) product; it does not accumulate.
value =+ (*element1) * (*element2);
}
*result_matrix = value;
}
// Host driver from the question: fills a 30x784 and a 784x100 matrix, copies
// them into pitched device allocations, launches MatrixMulKernel once, and
// copies the 30x100 result back.
int main()
{
static double arr1[30][784];
static double arr2[784][100];
static double result[30][100];
for (int i = 0 ; i < 30; i++)
{
for(int j =0;j <784 ; j ++)
arr1[i][j] = 5;
}
for (int i =0 ; i < 784; i ++)
{
for(int j=0;j < 100 ; j++)
arr2[i][j] = 3;
}
double *input;
double *weights;
double *results_d;
size_t in_pitch,w1_pitch,result_pitch;
//allocating memory in GPU for 2 inputs and result
// cudaMallocPitch takes the row width in bytes and the number of rows:
// input is 784 rows x 100 doubles, weights 30 x 784, results_d 30 x 100.
CUDA_SAFE_CALL(cudaMallocPitch((void**)&input,&in_pitch,100*sizeof(double),784));
CUDA_SAFE_CALL(cudaMallocPitch((void**)&weights,&w1_pitch,784*sizeof(double),30));
CUDA_SAFE_CALL(cudaMallocPitch((void**)&results_d,&result_pitch,100*sizeof(double),30));
//Copy matrix from host to device
// cudaMemcpy2D arguments: dst, dst pitch, src, src pitch, width in bytes, rows, direction.
CUDA_SAFE_CALL(cudaMemcpy2D(input,in_pitch,arr2,100*sizeof(double),100*sizeof(double),784,cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy2D(weights,w1_pitch,arr1,784*sizeof(double),784*sizeof(double),30,cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy2D(results_d,result_pitch,result,100*sizeof(double),100*sizeof(double),30,cudaMemcpyHostToDevice));
//using GPU
// A single 32x32 block; the result matrix is 30x100.
dim3 dimGrid(1,1,1);
dim3 dimBlock(32,32,1);
printf("before kernel fucntion");
MatrixMulKernel<<<dimGrid, dimBlock>>>(input, weights,results_d,in_pitch,w1_pitch,result_pitch);
printf("after kernel fucntion");
// The return value of the synchronize is ignored, so a kernel fault is first
// reported by the checked copy below.
cudaThreadSynchronize();
//copying back to host
// NOTE(review): here the destination is the host array and the source the
// pitched device buffer, yet result_pitch is passed as the destination pitch
// and 100*sizeof(double) as the source pitch.
CUDA_SAFE_CALL(cudaMemcpy2D(result,result_pitch,results_d,100*sizeof(double),100*sizeof(double),30,cudaMemcpyDeviceToHost));
//printing and seeing whether the result matrix has been updated
for (int i =0 ; i < 100; i ++)
{
for(int j=0;j < 30 ; j++)
{
// NOTE(review): passes the array itself to %f instead of one element.
printf("%f",result);
}
printf("\n");
}
CUDA_SAFE_CALL(cudaFree(input));
CUDA_SAFE_CALL(cudaFree(weights));
CUDA_SAFE_CALL(cudaFree(results_d));
return 0;
}
There are a number of errors in this code. First of all, it's not clear that doing pitched allocations is going to give any benefit here. Second, if you're serious about wanting fast matrix multiply performance, you should use CUBLAS.
Issues:
You don't seem to understand pitched allocations. The pitch value returned is a value in bytes. You cannot sensibly use that for a loop index for matrix multiply. Also, the pitch value is the overall width of the pitch allocation. It does not correspond to the valid data area. For that, you should use the appropriate matrix dimension.
Your code will not do a matrix multiplication over the entire matrix area. You are only creating a single block of 32x32 threads, but you need enough blocks/threads to cover the entire matrix area. This requires changes to your grid dimensions, passing matrix dimensions to your kernel, as well as a "thread check" in your kernel to prevent out-of-bounds access.
This construct for pitched access is not correct:
result_matrix = ((double*)((char*)results_d + row*result_pitch + col));
it does not match the other constructions you have for the 2 input matrices, it has a misplaced close parenthesis.
You have the sense of your two input matrices reversed. You are indexing into the input matrix as if it were the weight matrix, and vice-versa. We need to swap the sense of row, column and i to make these match the actual matrix dimensions.
Your final cudaMemcpy2D operation has the pitch values reversed:
cudaMemcpy2D(result,result_pitch,results_d,100*sizeof(double),100*sizeof(double),30,cudaMemcpyDeviceToHost)
^^^^^ ^^^^^
You forgot to initialize to zero your loop sum variable:
double value;
I don't know what you intended here, it should be += not =+:
value =+ ...
The following code has these issues addressed, and seems to run without error for me:
$ cat t104.cu
#include <stdio.h>
#include <math.h>
#include <cstdio>
#include <cstdlib>
const int d1 = 30;
const int d2 = 784;
const int d3 = 100;
double arr1[d1][d2];
double arr2[d2][d3];
double result[d1][d3];
#define CUDA_SAFE_CALL(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Backs the CUDA_SAFE_CALL macro. Does nothing when `code` is cudaSuccess.
// Otherwise prints the error description and the call site (file, line) to
// stderr and, unless `abort` is false, exits with the error code as status.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code == cudaSuccess) {
        return;
    }
    fprintf(stderr,"CUDA_SAFE_CALL: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort) {
        exit(code);
    }
}
// Computes results_d = weights * input for pitched row-major matrices, one
// output element per thread.
//   weights   - rrow x dim matrix, row pitch w1_pitch (bytes)
//   input     - dim x rcol matrix, row pitch in_pitch (bytes)
//   results_d - rrow x rcol matrix, row pitch result_pitch (bytes)
// Launch with a 2-D grid whose x extent covers rcol and whose y extent covers
// rrow; threads outside the result matrix do nothing.
__global__ void MatrixMulKernel(double *input,double *weights,double *results_d,size_t in_pitch,size_t w1_pitch,size_t result_pitch, int dim, int rrow, int rcol)
{
    const int col = threadIdx.x + blockDim.x*blockIdx.x;
    const int row = threadIdx.y + blockDim.y*blockIdx.y;

    if (row < rrow && col < rcol) {
        // Pitches are in bytes, so step rows through a char* before indexing doubles.
        const double *weightRow = (const double*)((const char*)weights + row*w1_pitch);

        double acc = 0;
        for (int k = 0; k < dim; ++k) {
            const double *inputRow = (const double*)((const char*)input + k*in_pitch);
            acc += inputRow[col] * weightRow[k];
        }

        double *outRow = (double*)((char*)results_d + row*result_pitch);
        outRow[col] = acc;
    }
}
// Fills arr1 (d1 x d2) with 5 and arr2 (d2 x d3) with 3, copies both into
// pitched device allocations, runs MatrixMulKernel to compute the d1 x d3
// product, copies it back into `result`, and prints it.
// Every CUDA call, including the kernel launch, is checked through
// CUDA_SAFE_CALL, which terminates the program on the first error.
int main()
{
    for (int i = 0 ; i < d1; i++)
    {
        for(int j =0;j <d2 ; j ++)
            arr1[i][j] = 5;
    }
    for (int i =0 ; i < d2; i ++)
    {
        for(int j=0;j < d3 ; j++)
            arr2[i][j] = 3;
    }
    double *input;
    double *weights;
    double *results_d;
    size_t in_pitch,w1_pitch,result_pitch;
    //allocating memory in GPU for 2 inputs and result
    // cudaMallocPitch takes the row width in bytes and the number of rows.
    CUDA_SAFE_CALL(cudaMallocPitch((void**)&input,&in_pitch,d3*sizeof(double),d2));
    CUDA_SAFE_CALL(cudaMallocPitch((void**)&weights,&w1_pitch,d2*sizeof(double),d1));
    CUDA_SAFE_CALL(cudaMallocPitch((void**)&results_d,&result_pitch,d3*sizeof(double),d1));
    //Copy matrix from host to device
    // Host rows are tightly packed, so the source pitch equals the row width.
    CUDA_SAFE_CALL(cudaMemcpy2D(input,in_pitch,arr2,d3*sizeof(double),d3*sizeof(double),d2,cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy2D(weights,w1_pitch,arr1,d2*sizeof(double),d2*sizeof(double),d1,cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy2D(results_d,result_pitch,result,d3*sizeof(double),d3*sizeof(double),d1,cudaMemcpyHostToDevice));
    //using GPU
    // Ceil-divide so the grid covers all d3 columns (x) and d1 rows (y).
    dim3 dimBlock(32,32,1);
    dim3 dimGrid(((d3+dimBlock.x-1)/dimBlock.x),((d1+dimBlock.y-1)/dimBlock.y),1);
    MatrixMulKernel<<<dimGrid, dimBlock>>>(input, weights,results_d,in_pitch,w1_pitch,result_pitch, d2, d1, d3);
    // A launch returns no status: fetch launch-configuration errors explicitly,
    // then wait for the kernel so execution errors are reported here rather
    // than being attributed to the copy below.
    CUDA_SAFE_CALL(cudaGetLastError());
    CUDA_SAFE_CALL(cudaDeviceSynchronize());
    //copying back to host
    // Destination (host) pitch is the packed row width; source pitch is the device pitch.
    CUDA_SAFE_CALL(cudaMemcpy2D(result,d3*sizeof(double),results_d,result_pitch,d3*sizeof(double),d1,cudaMemcpyDeviceToHost));
    //printing and seeing whether the result matrix has been updated
    // Printed column by column: each output line holds the d1 values of one result column.
    for (int i =0 ; i < d3; i ++)
    {
        for(int j=0;j < d1 ; j++)
        {
            printf("%f", result[j][i]);
        }
        printf("\n");
    }
    CUDA_SAFE_CALL(cudaFree(input));
    CUDA_SAFE_CALL(cudaFree(weights));
    CUDA_SAFE_CALL(cudaFree(results_d));
    return 0;
}
$ nvcc -arch=sm_61 -o t104 t104.cu
$

cuda programming with pthread

#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#define ARR_SIZE 10
#define NUM_DEVICE 1
typedef struct {
int *arr;
int *dev_arr;
int *dev_result;
int *result;
int num;
} cuda_st;
// Each thread prints the element of dev_arr selected by its threadIdx.x and
// atomically adds that element to the single int at dev_result.
// No bounds check: launch with at most as many threads as dev_arr has elements.
__global__ void kernel_fc(int *dev_arr, int *dev_result)
{
    const int lane = threadIdx.x;
    const int element = dev_arr[lane];
    printf("dev_arr[%d] = %d\n", lane, element);
    atomicAdd(dev_result, element);
}
// pthread entry point. `struc` points to a cuda_st describing one device's
// work: the thread prints the host array, selects device `num`, uploads the
// array, runs kernel_fc with one thread per element, and downloads the sum
// into *result.
// The first failing CUDA call stops the remaining CUDA work and is reported
// on stderr. Always returns NULL.
void *thread_func(void* struc)
{
    cuda_st * data = (cuda_st*)struc;
    printf("thread %d func start\n", data->num);
    printf("arr %d = ", data->num);
    // Iterate over ARR_SIZE rather than a literal so the dump matches the array length.
    for(int i=0; i<ARR_SIZE; i++) {
        printf("%d ", data->arr[i]);
    }
    printf("\n");

    cudaError_t err = cudaSetDevice(data->num);
    if (err == cudaSuccess) {
        err = cudaMemcpy(data->dev_arr, data->arr, sizeof(int)*ARR_SIZE, cudaMemcpyHostToDevice);
    }
    if (err == cudaSuccess) {
        kernel_fc<<<1,ARR_SIZE>>>(data->dev_arr, data->dev_result);
        // A launch returns no status; query it explicitly.
        err = cudaGetLastError();
    }
    if (err == cudaSuccess) {
        // Blocking copy: also waits for the kernel and reports its execution errors.
        err = cudaMemcpy(data->result, data->dev_result, sizeof(int), cudaMemcpyDeviceToHost);
    }
    if (err != cudaSuccess) {
        fprintf(stderr, "thread %d: CUDA error: %s\n", data->num, cudaGetErrorString(err));
    }

    printf("thread %d func exit\n", data->num);
    return NULL;
}
// Host driver from the question: prepares one host array, one device array
// and one result slot per device, bundles them into cuda_st objects, runs one
// pthread per device, and prints each device's result.
int main(void)
{
// Make object
cuda_st cuda[NUM_DEVICE];
// Make thread
pthread_t pthread[NUM_DEVICE];
// Host array memory allocation
int *arr[NUM_DEVICE];
for(int i=0; i<NUM_DEVICE; i++) {
arr[i] = (int*)malloc(sizeof(int)*ARR_SIZE);
}
// Fill this host array up with specified data
// Device i receives the values i*ARR_SIZE .. i*ARR_SIZE+ARR_SIZE-1.
for(int i=0; i<NUM_DEVICE; i++) {
for(int j=0; j<ARR_SIZE; j++) {
arr[i][j] = i*ARR_SIZE+j;
}
}
// To confirm host array data
for(int i=0; i<NUM_DEVICE; i++) {
printf("arr[%d] = ", i);
for(int j=0; j<ARR_SIZE; j++) {
printf("%d ", arr[i][j]);
}
printf("\n");
}
// Result memory allocation
int *result[NUM_DEVICE];
for(int i=0; i<NUM_DEVICE; i++) {
result[i] = (int*)malloc(sizeof(int));
// NOTE(review): memset is declared in <string.h>, which this program does not include.
memset(result[i], 0, sizeof(int));
}
// Device array memory allocation
// NOTE(review): cudaSetDevice is never called in main, so all of these
// allocations are made on whichever device is current for this thread,
// while thread_func later selects device `num`.
int *dev_arr[NUM_DEVICE];
for(int i=0; i<NUM_DEVICE; i++) {
cudaMalloc(&dev_arr[i], sizeof(int)*ARR_SIZE);
}
// Device result memory allocation
int *dev_result[NUM_DEVICE];
for(int i=0; i<NUM_DEVICE; i++) {
cudaMalloc(&dev_result[i], sizeof(int));
cudaMemset(dev_result[i], 0, sizeof(int));
}
// Connect these pointers with object
for(int i=0; i<NUM_DEVICE; i++) {
cuda[i].arr = arr[i];
cuda[i].dev_arr = dev_arr[i];
cuda[i].result = result[i];
cuda[i].dev_result = dev_result[i];
cuda[i].num = i;
}
// Create and excute pthread
for(int i=0; i<NUM_DEVICE; i++) {
pthread_create(&pthread[i], NULL, thread_func, (void*)&cuda[i]);
}
// Join pthread
for(int i=0; i<NUM_DEVICE; i++) {
pthread_join(pthread[i], NULL);
}
for(int i=0; i<NUM_DEVICE; i++) {
printf("result[%d] = %d\n", i, (*cuda[i].result));
}
// Host and device allocations are not released before exit.
return 0;
}
I make my simple-test-program like this to test pthread with multi device cuda code.
When NUM_DEVICE is set to 1 it works well, but when it is set to 2 the program stops.
I guess this is because multiple threads call cudaSetDevice, but I don't know how to handle this.
I tried to make my program with single host thread and multi device(with Async function) before, but in my case(not above simple code), there are many host code between kernel functions so it doesn't work well asynchronously.
So I test to use multi thread on host before apply this manner to my real code but I have trouble like this.
Do I have to use the asynchronous versions of the CUDA functions and kernel launches?
Please give me some advice.
The problem is that you allocate memory on one device. You need to call cudaSetDevice before cudaMalloc calls:
// Device array memory allocation
int *dev_arr[NUM_DEVICE];
for(int i=0; i<NUM_DEVICE; i++) {
cudaSetDevice(i);
cudaMalloc(&dev_arr[i], sizeof(int)*ARR_SIZE);
}
// Device result memory allocation
int *dev_result[NUM_DEVICE];
for(int i=0; i<NUM_DEVICE; i++) {
cudaSetDevice(i);
cudaMalloc(&dev_result[i], sizeof(int));
cudaMemset(dev_result[i], 0, sizeof(int));
}