Pointers in structs changed in kernel function - cuda

I am trying to access data from an array allocated in CUDA. The first step was to allocate a struct that I defined. Then I pass the allocated struct to a kernel function that changes the values in the struct. Finally, I copy the struct and the array back into host variables to read them. However, I am having a problem reading the allocated array.
#include <stdio.h>
#include <stdlib.h>
// Pairs a pointer to a float array with a single float value.
// initTeste writes v and the first two elements reachable through y.
typedef struct x {
    float *y;   // float array; the kernel stores into y[0] and y[1]
    float  v;   // scalar value
} x_t;
// Kernel: stores 10 into param->v and into the first two floats reachable
// through param->y. No thread index is used, so every launched thread
// performs the same three stores.
__global__ void initTeste(x_t *param)
{
    param->v = 10;

    float *values = param->y;
    values[0] = 10;
    values[1] = 10;
}
// Question's repro: allocates an x_t and a 10-float buffer with cudaMalloc,
// launches initTeste on the struct, then copies the struct and the buffer
// back to the host and prints one value from each.
int main(void) {
x_t *hvar; // filled in by cudaMalloc below, so it holds a device address
x_t hvarBackup; // host-side struct; its y field receives a device address below
float *temp = (float*)malloc(10*sizeof(float)); // host destination for the float buffer
cudaError_t result; // declared but never assigned or read in this listing
cudaMalloc(&hvar , sizeof(x_t) );
// The 10-float device buffer's address is stored in the host struct's y field.
cudaMalloc(&hvarBackup.y, 10*sizeof(float) );
// Copy the host struct (including the pointer held in y) into the device struct.
cudaMemcpy(hvar, &hvarBackup, sizeof(x_t), cudaMemcpyHostToDevice);
initTeste<<<1,1>>>(hvar);
// Copy the struct back so v can be read on the host.
cudaMemcpy(&hvarBackup, hvar, sizeof(x_t), cudaMemcpyDeviceToHost);
// NOTE(review): the source argument is &hvar->y, i.e. the address of the y field
// inside the device struct, not the 10-float buffer. The buffer's address is the
// value held in hvarBackup.y.
cudaMemcpy(temp, &hvar->y, 10*sizeof(float), cudaMemcpyDeviceToHost);
printf("%f",(hvarBackup.v)); //here ok
printf("%f",(temp[0])); //here's the problem
return 0;
}

You cannot do it like that, because you haven't allocated y for the device, hence you will only get a segmentation fault when copying the contents of y to the host. Aside from that, you have to allocate y on the device with a size of 10*sizeof(float), and this is a truly painful job, especially when your struct becomes a huge container of arrays (and you should always keep in mind that arrays inside structs are best avoided in CUDA).
Here's what you can do with the current code
// Allocates an x_t and a 10-float array on the device, links them through a
// host staging struct, runs initTeste, and prints the values copied back.
// Returns 0 on success, 1 if a host allocation or any CUDA call fails.
// All host and device allocations are released before returning.
int main(void) {
    x_t *h_hvar = (x_t*)malloc(sizeof(x_t));
    float *h_y = (float*)malloc(10*sizeof(float));
    x_t *d_hvar = NULL;
    float *d_y = NULL;
    int status = 1;

    if (h_hvar == NULL || h_y == NULL) {
        fprintf(stderr, "host allocation failed\n");
    } else {
        // Each step runs only if every previous CUDA call succeeded.
        cudaError_t err = cudaMalloc(&d_hvar, sizeof(x_t));
        if (err == cudaSuccess)
            err = cudaMalloc(&d_y, 10*sizeof(float));

        // Insert the float pointer you allocated in CUDA
        // to the host pointer first, and then copy the whole thing
        // to the device area
        h_hvar->v = 0.0f;   // give v a defined value before the struct is copied
        h_hvar->y = d_y;
        if (err == cudaSuccess)
            err = cudaMemcpy(d_hvar, h_hvar, sizeof(x_t), cudaMemcpyHostToDevice);

        if (err == cudaSuccess) {
            initTeste<<<1,1>>>(d_hvar);
            err = cudaGetLastError();   // launch errors are not returned by <<<>>>
        }

        // The blocking copies below also surface errors raised while the kernel ran.
        if (err == cudaSuccess)
            err = cudaMemcpy(h_hvar, d_hvar, sizeof(x_t), cudaMemcpyDeviceToHost);
        if (err == cudaSuccess)
            err = cudaMemcpy(h_y, d_y, 10*sizeof(float), cudaMemcpyDeviceToHost);

        if (err == cudaSuccess) {
            printf("%f",h_hvar->v);
            printf("%f",h_y[0]);
            status = 0;
        } else {
            fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        }
    }

    // cudaFree and free both accept null pointers, so partial setups are fine here.
    cudaFree(d_y);
    cudaFree(d_hvar);
    free(h_y);
    free(h_hvar);
    return status;
}
And that should give you the right value..

cudaMemcpy(temp, &hvar->y, 10*sizeof(float), cudaMemcpyDeviceToHost);
should be
cudaMemcpy(temp, hvar->y, 10*sizeof(float), cudaMemcpyDeviceToHost);
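because hvar->y is already a pointer and you don't want to get the pointer to that pointer. Note, however, that hvar itself points to device memory, so hvar->y cannot be evaluated on the host; use the device pointer held in the host copy of the struct instead, i.e. hvarBackup.y.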

Related

Can we copy a "normal" GPU memory to a "unified" memory?

We have two GPU memories, one is allocated with cuMalloc as normal device memory, the other is allocated with cuMallocManaged as unified memory. Is it possible to copy between them? and if we use driver API, what direction should I use?
// Sketch from the question: one plain device allocation, one managed allocation,
// and a copy between them.
// NOTE(review): only normalMem is declared as float* here; unifiedMem is a plain float.
float* normalMem, unifiedMem;
// NOTE(review): cuMalloc / cuMallocManaged / cuMemcpyD2D look like shorthand; the
// driver API entry points are presumably cuMemAlloc, cuMemAllocManaged and
// cuMemcpyDtoD (or cuMemcpy) — confirm against the driver API reference.
cuMalloc(&normalMem, 100);
cuMallocManaged(&unifiedMem, 100);
cuMemcpyD2D(unifiedMem, normalMem, 100); // ? D2D? or D2H? or else?
Yes you can. Look at the following code for instance.
It declares a normal device pointer, a managed pointer and a host pointer, each for 100 floats.
It then initializes the values through the host pointer and copies them to the normal device pointer using cudaMemcpy.
The values are then copied from the normal device pointer to the managed pointer.
The managed pointer is used in a kernel to show that the values have been copied between the two buffers.
I think the code is pretty self-explanatory.
// Kernel: prints the first 100 floats of d_ptr, one per line, followed by an
// empty line. No thread index is used, so every launched thread prints the
// full list.
__global__
void test(float* d_ptr)
{
    int idx = 0;
    while (idx < 100) {
        printf("%f \n", d_ptr[idx]);
        ++idx;
    }
    printf("\n");
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
// Copies 100 floats host -> device (cudaMalloc) -> managed (cudaMallocManaged)
// and has the `test` kernel print them from the managed buffer.
// Returns 0 on success, 1 if the host allocation fails or CUDA reports an error.
int main(int argc, char **argv)
{
    size_t size = sizeof(float)*100;
    float* h_p =(float*) malloc(size);
    if (h_p == NULL)
        return 1;

    // Each declarator needs its own '*': `float* d_p, dm_p;` makes dm_p a plain
    // float, which cannot be passed to cudaMallocManaged or to the kernel.
    float *d_p = NULL, *dm_p = NULL;
    cudaMalloc(&d_p,size);
    cudaMallocManaged(&dm_p,size);

    for(int i=0;i<100;i++)
        h_p[i]=2*(float)i;

    cudaMemcpy(d_p,h_p,size,cudaMemcpyHostToDevice);
    cudaDeviceSynchronize();
    // A device-to-device copy moves the data from the plain allocation into the managed one.
    cudaMemcpy(dm_p,d_p,size,cudaMemcpyDeviceToDevice);
    cudaDeviceSynchronize();

    test<<<1,1>>>(dm_p);
    // Synchronizing flushes the kernel's printf output and reports execution errors.
    cudaError_t err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));

    cudaFree(dm_p);
    cudaFree(d_p);
    free(h_p);
    return err == cudaSuccess ? 0 : 1;
}
Remember to read the Unified Memory access rules.

argument of type "int *" is incompatible with parameter of type "int" in cuda kernel call

I've been trying for a while and have come across seemingly similar issues already posted; however, for some reason I'm still failing to clear the error. I effectively want to pass a 2D matrix to the kernel as a 1D array, as I have seen suggested. I'm not sure where I've gone wrong in my syntax, but there is a clash between the variables I supply to the kernel and the parameters that the kernel expects.
// Kernel excerpt from the question (the listing stops before the function's closing brace).
// source, target and distance are taken by value as int; cost is an int pointer.
__global__ void calculatePath(int source, int target, int *cost, int distance){
int t_id = blockIdx.x * blockDim.x + threadIdx.x; // global 1-D thread index
// The arrays below are ordinary locals, so every thread has its own copy.
int dist[50];
int prev[50];
int selected[50]={0};
int num_path[50];
int d, m, min, start, j;
// N and IN are defined outside this excerpt.
// NOTE(review): dist/prev hold 50 entries but are indexed by t_id up to N-1 — confirm N <= 50.
// Thread 0 is excluded by the t_id > 0 test.
if ((t_id > 0) && (t_id < N)){
dist[t_id] = IN;
prev[t_id] = -1;
}
This is my kernel function whose parameters are all integers except "cost" which is a pointer to an integer array.
// Question's host driver: builds a random symmetric N x N cost matrix, reads a
// source and target from stdin, copies the inputs to the device, launches
// calculatePath and copies h_num_path / h_distance back.
int main(int argc, char **argv){
int h_num_path[N]; // never initialised on the host before being copied to the device below
int h_distance = 0;
int h_cost[N][N],i,j,co; // NOTE(review): co is printed at the end but never assigned
int h_source;
int h_target;
printf("\tShortest Path Algorithm(DIJKSRTRA's ALGORITHM\n\n");
// Start with every entry set to IN (defined outside this listing).
for(i=0;i< N;i++)
for(j=0;j< N;j++)
h_cost[i][j] = IN;
//*********************
srand ( time(NULL));
// Random symmetric costs in 1..100 for the pairs x < y; index 0 is skipped by these loops.
for(int x=1;x< N;x++) {
for (int y = x + 1; y < N; y++) {
h_cost[x][y] = h_cost[y][x] = (rand() % 100) + 1;
}
}
printf("\nEnter The Source: ");
scanf("%d", &h_source);
printf("\nEnter The target: ");
scanf("%d", &h_target);
int *d_num_path;
int *d_cost;
int *d_source;
int *d_target;
int *d_dist;
int *d_prev; // NOTE(review): never passed to cudaMalloc in this listing, yet freed below
int *d_distance;
cudaMalloc(&d_num_path, sizeof(int)*N);
cudaMalloc(&d_cost, sizeof(int)*N*N);
cudaMalloc((void**) &d_source, sizeof(int));
cudaMalloc((void**) &d_target, sizeof(int));
cudaMalloc((void**) &d_dist, sizeof(int)*N);
cudaMalloc((void**) &d_distance, sizeof(int));
cudaMemcpy(d_source, &h_source, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_target, &h_target, sizeof(int), cudaMemcpyHostToDevice);
// h_cost is a contiguous N x N array, so it is copied as one flat block of N*N ints.
cudaMemcpy(d_cost, h_cost, sizeof(int)*N*N, cudaMemcpyHostToDevice);
cudaMemcpy(d_distance, &h_distance, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_num_path, &h_num_path, sizeof(int)*N, cudaMemcpyHostToDevice);
clock_t before;
before = clock();
// This is the call the compiler rejects: d_source, d_target and d_distance are int*,
// while the kernel shown in the question declares source, target and distance as int.
// d_num_path and d_dist are allocated above but are not passed to the kernel.
calculatePath<<<N/512 + 1, 512>>>(d_source, d_target, d_cost, d_distance);
// NOTE(review): no synchronization sits between the launch and this clock() call, so
// the interval presumably does not include the kernel's run time — confirm.
clock_t time_taken = clock() - before;
cudaMemcpy(&h_num_path, d_num_path, sizeof(int)*N, cudaMemcpyDeviceToHost);
cudaMemcpy(&h_distance, d_distance, sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(d_num_path);
cudaFree(d_cost);
cudaFree(d_source);
cudaFree(d_target);
cudaFree(d_dist);
cudaFree(d_prev);
cudaFree(d_distance);
printf("\nShortest Path: %d \n",co);
// NOTE(review): dividing by 1000.0 presumably assumes CLOCKS_PER_SEC == 1000 — confirm.
printf("%s %.4f %s", "Time taken:", time_taken/1000.0, "seconds");
return 0;
}
On the kernel call, I however receive the error that "argument of type 'int *' is incompatible with parameter of type 'int'" yet I believe my d_cost already is a pointer. I'd appreciate being set straight as I'm sure I'm overlooking something small.
It is not d_cost you are having trouble with. The other three arguments are int* but the corresponding parameters are declared as int.
The C Programming Language by K&R at page 25 says:
We will generally use parameter for a variable named in the parenthesized list in a function definition, and argument for the value used in a call of the function.
Since your source and target are just a single integer values, you don't really need to define device side variables for them. Just pass the integer value itself as an argument. By doing so, you'll get performance improvements as talonmies commented:
(With pass by value) there is constant memory cache broadcast within the kernel if it is done that way. Passing pointers for simple constants just increases latency by forcing every thread to dereference the pointer to retrieve the value from global memory, plus all the additional host side memory APIs to allocate them in the first place.
Also, you seem to expect parameter distance to have output value of your kernel, then it must be declared as a pointer, so you can do cudaMemcpyDeviceToHost after kernel.
__global__ void calculatePath(int source, int target, int *cost, int *distance) // kernel definition
// Kernel launch: source and target go by value; cost and distance stay device pointers.
// (N + 511) / 512 rounds the block count up so all N elements are covered.
calculatePath<<< (N + 511) / 512, 512 >>>(h_source, h_target, d_cost, d_distance); // kernel launch
Three of your arguments need to be integers, but you are passing pointers to integers. You need to change your method signature:
__global__ void calculatePath(int *source, int *target, int *cost, int *distance)

External call of a class method in a kernel

I have a class FPlan that has a number of methods such as permute and packing.
// Applies one perturbation step to a single plan: perturb() followed by packing().
// Compiled for both host and device.
__host__ __device__ void Perturb_action(FPlan *dfp)
{
    FPlan &plan = *dfp;
    plan.perturb();
    plan.packing();
}
// Kernel: thread i of a block calls Perturb_action on dfp[i] when i < n.
// Only threadIdx.x is used for indexing, so with more than one block each
// block would process the same entries.
__global__ void Vector_Perturb(FPlan **dfp, int n){
    int i = threadIdx.x;   // the built-in is threadIdx; `threadIx` does not exist
    if (i < n)
        Perturb_action(dfp[i]);
}
in main:
// Question's host fragment: builds an array of FPlan pointers, copies it to the
// device, launches Vector_Perturb and reads costs back through the same array.
FPlan **fp_vec;
fp_vec=(FPlan**)malloc(VEC_SIZE*sizeof(FPlan*));
//initialize the vec
// Every slot receives the address of the same host object fp.
for(int i=0; i<VEC_SIZE;i++)
fp_vec[i]=&fp;
//fp of type FPlan that is initialized
// NOTE(review): fp_vec is a pointer, so sizeof(fp_vec) is the size of one pointer,
// not VEC_SIZE*sizeof(FPlan*).
int v_sz=sizeof(fp_vec);
double test=fp_vec[0]->getCost();
// NOTE(review): as written there is no comma between the format string and `test`
// in the printf calls of this fragment.
printf("the cost before perturb %f\n"test);
FPlan **value;
cudaMalloc(&value,v_sz);
// NOTE(review): the source is &fp_vec (the address of the pointer variable itself),
// not fp_vec (the malloc'ed array of pointers).
cudaMemcpy(value,&fp_vec,v_sz,cudaMemcpyHostToDevice);
//call kernel
// One block of VEC_SIZE threads.
dim3 threadsPerBlock(VEC_SIZE);
dim3 numBlocks(1);
// NOTE(review): the pointers stored in fp_vec are host addresses (&fp); confirm the
// FPlan objects themselves are reachable from device code.
Vector_Perturb<<<numBlocks,threadsPerBlock>>> (value,VEC_SIZE);
// Copies v_sz bytes from the device back over the start of the host pointer array.
cudaMemcpy(fp_vec,value,v_sz,cudaMemcpyDeviceToHost);
test=fp_vec[0]->getCost();
printf("the cost after perturb %f\n"test);
test=fp_vec[1]->getCost();
printf("the cost after perturb %f\n"test);
I am getting before permute for fp_vec[0] printf the cost 0.8.
After permute for fp_vec[0] the value inf and for fp_vec[1] the value 0.8.
The expected output after the permutation should be something like fp_vec[0] = 0.7 and fp_vec[1] = 0.9. I want to apply these permutations to an array of type FPlan.
What am I missing? Is calling an external function supported in CUDA?
This seems to be a common problem these days:
Consider the following code:
#include <stdio.h>
#include <stdlib.h>
// Demonstrates that sizeof applied to a pointer yields the size of the pointer
// itself, not the number of bytes obtained from malloc.
int main() {
    int* arr = (int*) malloc(100);
    // sizeof yields a size_t, which is printed with %zu; passing it to %i is
    // undefined behaviour.
    printf("sizeof(arr) = %zu", sizeof(arr));
    free(arr);
    return 0;
}
What is the expected output? 100? No, it's 4 (at least on a 32-bit machine). sizeof() returns the size of the type of a variable, not the allocated size of an array.
// Lines quoted from the question.
// fp_vec is declared there as FPlan**, so sizeof(fp_vec) is the size of one pointer.
int v_sz=sizeof(fp_vec);
double test=fp_vec[0]->getCost();
printf("the cost before perturb %f\n"test);
FPlan **value;
// Both the allocation and the copy below therefore cover only v_sz (one pointer's worth of) bytes.
cudaMalloc(&value,v_sz);
cudaMemcpy(value,&fp_vec,v_sz,cudaMemcpyHostToDevice);
You are allocating 4 (or 8) bytes on the device and copying 4 (or 8) bytes. The result is undefined (and probably garbage every time).
Besides that, you should do proper error checking of your CUDA calls.
Have a look: What is the canonical way to check for errors using the CUDA runtime API?

Error in the result of matrix multiplication example of CUDA C programming guide

I'm working through the matrix multiplication example from the CUDA C Programming Guide (page 35) for practice. I copied the code and filled in the missing parts. I understand the logic of the program and how it should work, but I do not get the expected result.
Here is the complete code i made, I do not know if the error is mine or from the example?
The code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdio.h>
#include <stdio.h>
using namespace std;
#define BLOCK_SIZE 16
// Dense float matrix descriptor. The kernel in this listing addresses it
// row-major: element (row, col) is elements[row * width + col].
typedef struct
{
    int width;         // number of columns
    int height;        // number of rows
    float *elements;   // width * height values; host or device storage depending on use
} Matrix;
// Forward declaration of the kernel defined after MatMul.
__global__ void MatMulKernel(const Matrix,const Matrix, Matrix C);
// Host wrapper from the question: copies A and B to the device, launches
// MatMulKernel, and copies the product into C.elements.
// A.elements, B.elements and C.elements are host buffers here.
void MatMul(const Matrix A,const Matrix B, Matrix C)
{
size_t size; // byte count, reused for A, then B, then C
//Matrix A creation and storage in device memory
Matrix d_A;
d_A.width=A.width;
d_A.height=A.height;
size=A.height*A.width*sizeof(float);
cudaMalloc(&d_A.elements,size);
cudaMemcpy(d_A.elements,A.elements,size,cudaMemcpyHostToDevice);
//Matrix B creation and storage in device memory
Matrix d_B;
d_B.width=B.width;
d_B.height=B.height;
size=B.height*B.width*sizeof(float);
cudaMalloc(&d_B.elements,size);
cudaMemcpy(d_B.elements,B.elements,size,cudaMemcpyHostToDevice);
//Matrix C creation and storage in device memory (allocated only; nothing is copied in)
Matrix d_C;
d_C.width=C.width;
d_C.height=C.height;
size=C.height*C.width*sizeof(float);
cudaMalloc(&d_C.elements,size);
// Launch configuration: BLOCK_SIZE x BLOCK_SIZE threads per block.
dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);
// NOTE(review): integer division — a dimension smaller than BLOCK_SIZE gives a grid
// extent of 0 (main in this listing passes 15 with BLOCK_SIZE 16); a remainder is dropped.
dim3 dimGrid(B.width/dimBlock.x,A.height/dimBlock.y);
// NOTE(review): neither the launch nor any API call in this function is checked for errors.
MatMulKernel<<<dimGrid,dimBlock>>>(d_A,d_B,d_C);
//Copy the result in the matrix C from the device to the host.
// size still holds C's byte count from above.
cudaMemcpy(C.elements,d_C.elements,size,cudaMemcpyDeviceToHost);
//edit the missing code.
// for(int i=0;i<BLOCK_SIZE*BLOCK_SIZE;i++){cout<<C.elements[i]<<endl;}
// result in random numbers
cudaFree(d_A.elements);
cudaFree(d_B.elements);
cudaFree(d_C.elements);
}
// Kernel from the question: each thread computes one element of C as the dot
// product of a row of A and a column of B, with all three matrices indexed
// row-major (row * width + col).
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
float Cvalue=0; // running sum for this thread's element
// 2-D global thread coordinates select the output element.
int row=blockIdx.y*blockDim.y+threadIdx.y;
int col=blockIdx.x*blockDim.x+threadIdx.x;
// NOTE(review): row and col are not compared against the matrix dimensions, so
// threads beyond the matrix edge would read and write outside the buffers.
for(int e=0;e<A.width;++e)
{
Cvalue+=A.elements[row*A.width+e]*B.elements[e*B.width+col];
}
C.elements[row*C.width+col]=Cvalue;
}
// Question's driver: fills two 15x15 matrices, flattens them into 1-D arrays,
// wraps them in Matrix descriptors and calls MatMul.
int main()
{
cout<<"Matrices"<<endl;
//Declaration of the A, B, C matrices
float a[15][15];
float b[15][15];
float c[15][15]; // never used; the loop variable `c` below shadows it
//Fill the matrices with some numbers.
int cont0=0;
for(int c=0;c<15;c++)
{
for(int v=0;v<15;v++)
{
a[v][c]=cont0;
b[v][c]=cont0;
cont0++;
}
}
//Flatten the matrices for passing to the kernel
int offset=0;
// 256 slots are reserved, but the loops below write only 15*15 = 225 of them.
float a_t[256];
float b_t[256];
for(int y=0;y<15;y++)
{
for(int x=0;x<15;x++)
{
a_t[x+offset]=a[x][y];
// NOTE(review): b_t is filled from a, not b (the two hold identical values here).
b_t[x+offset]=a[x][y];
}
offset=offset+15;
}
float t_C[256]; // receives the result copied back by MatMul
//Completing the matrix format for the kernel.
// NOTE(review): all dimensions are 15 while BLOCK_SIZE is 16.
Matrix m_A;
m_A.height=15;
m_A.width=15;
m_A.elements=a_t;
Matrix m_B;
m_B.height=15;
m_B.width=15;
m_B.elements=b_t;
Matrix m_C;
m_C.height=15;
m_C.width=15;
m_C.elements=t_C;
//Passing the formatted matrices to the kernel.
// The result in t_C is not printed or otherwise used after this call.
MatMul(m_A,m_B,m_C);
cout<<"Final"<<endl;
return 0;
}
The program compiles and runs but the result matrix C.elements from: cudaMemcpy(C.elements,d_C.elements,size,cudaMemcpyDeviceToHost);
is a random number. I've tried to use it like a pointer to a array but i don't get anything from it and treating it like array does not work either.
I will be glad if anyone can help me to finish this.
Your code has a minor mismatch between the array indexing in the kernel and the initialization on the CPU. Here is the corrected code, with the debugging output suggested by #harrism:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdio.h>
#include <stdio.h>
using namespace std;
#define BLOCK_SIZE 16
// Row-major float matrix descriptor: element (row, col) lives at
// elements[row * width + col], as used by MatMulKernel and print_matrix.
typedef struct
{
    int width;         // columns
    int height;        // rows
    float *elements;   // width * height floats
} Matrix;
// Forward declaration of the kernel defined after MatMul.
__global__ void MatMulKernel(const Matrix,const Matrix, Matrix C);

// Computes C = A * B on the GPU.
// A, B and C are host-side descriptors whose `elements` point to host memory;
// C.elements must have room for C.width * C.height floats and is overwritten.
// The matrix sizes need not be multiples of BLOCK_SIZE: the grid is rounded up
// and the kernel discards threads that fall outside C.
// Prints the launch status ("error code: ...") to stdout.
void MatMul(const Matrix A,const Matrix B, Matrix C)
{
    size_t size;

    // Matrix A creation and storage in device memory
    Matrix d_A;
    d_A.width=A.width;
    d_A.height=A.height;
    size=A.height*A.width*sizeof(float);
    cudaMalloc(&d_A.elements,size);
    cudaMemcpy(d_A.elements,A.elements,size,cudaMemcpyHostToDevice);

    // Matrix B creation and storage in device memory
    Matrix d_B;
    d_B.width=B.width;
    d_B.height=B.height;
    size=B.height*B.width*sizeof(float);
    cudaMalloc(&d_B.elements,size);
    cudaMemcpy(d_B.elements,B.elements,size,cudaMemcpyHostToDevice);

    // Matrix C creation in device memory (output only, nothing to copy in)
    Matrix d_C;
    d_C.width=C.width;
    d_C.height=C.height;
    size=C.height*C.width*sizeof(float);
    cudaMalloc(&d_C.elements,size);

    // Ceil-divide so partial tiles on the right and bottom edges get a block;
    // plain division would drop them (or yield an empty grid for small matrices).
    dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);
    dim3 dimGrid((B.width+dimBlock.x-1)/dimBlock.x,
                 (A.height+dimBlock.y-1)/dimBlock.y);
    MatMulKernel<<<dimGrid,dimBlock>>>(d_A,d_B,d_C);
    // A kernel launch returns nothing; launch failures are read back explicitly.
    printf("error code: %s\n",cudaGetErrorString(cudaGetLastError()));

    // Copy the result from the device into C on the host (size is still C's byte count).
    cudaMemcpy(C.elements,d_C.elements,size,cudaMemcpyDeviceToHost);

    cudaFree(d_A.elements);
    cudaFree(d_B.elements);
    cudaFree(d_C.elements);
}

// Kernel: one thread per element of C. Thread (row, col) accumulates the dot
// product of row `row` of A and column `col` of B; all matrices are row-major.
// Expects a 2-D launch whose grid covers at least C.height x C.width threads.
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
    int row=blockIdx.y*blockDim.y+threadIdx.y;
    int col=blockIdx.x*blockDim.x+threadIdx.x;

    // Threads in a partial edge tile have no element to compute; letting them
    // continue would read and write past the ends of the buffers.
    if(row>=C.height || col>=C.width)
        return;

    float Cvalue=0;
    for(int e=0;e<A.width;++e)
    {
        Cvalue+=A.elements[row*A.width+e]*B.elements[e*B.width+col];
    }
    C.elements[row*C.width+col]=Cvalue;
}
// Prints A.elements as A.height rows of A.width values, each formatted with
// "%6.4f" and followed by a tab, under a "Matrix:" heading.
// A.elements is read on the host. Always returns 0.
int print_matrix(Matrix A){
    printf("Matrix:\n");
    const int count = A.width*A.height;
    for(int i=0; i<count; i++){
        if(i%A.width==0) printf("\n");   // start a new output row
        printf("%6.4f\t",A.elements[i]);
    }
    printf("\n");
    // The function is declared int; flowing off the end without a return value
    // is undefined behaviour in C++.
    return 0;
}
// Builds two BLOCK_SIZE x BLOCK_SIZE test matrices, multiplies them on the GPU
// through MatMul, and prints both inputs and the product.
int main()
{
    cout<<"Matrices"<<endl;
    // Declaration of the A and B matrices
    float a[BLOCK_SIZE][BLOCK_SIZE];
    float b[BLOCK_SIZE][BLOCK_SIZE];
    // Fill both matrices with the same running counter.
    int cont0=0;
    for(int c=0;c<BLOCK_SIZE;c++)
    {
        for(int v=0;v<BLOCK_SIZE;v++)
        {
            a[v][c]=cont0;
            b[v][c]=cont0;
            cont0++;
        }
    }
    // Flatten the matrices into 1-D arrays for passing to the kernel.
    int offset=0;
    float a_t[BLOCK_SIZE*BLOCK_SIZE];
    float b_t[BLOCK_SIZE*BLOCK_SIZE];
    for(int y=0;y<BLOCK_SIZE;y++)
    {
        for(int x=0;x<BLOCK_SIZE;x++)
        {
            a_t[x+offset]=a[x][y];
            // Take B's values from b; the original read a here, which only worked
            // because a and b are filled identically.
            b_t[x+offset]=b[x][y];
        }
        offset=offset+BLOCK_SIZE;
    }
    float t_C[BLOCK_SIZE*BLOCK_SIZE];   // receives the product from MatMul
    // Completing the matrix format for the kernel.
    Matrix m_A;
    m_A.height=BLOCK_SIZE;
    m_A.width=BLOCK_SIZE;
    m_A.elements=a_t;
    Matrix m_B;
    m_B.height=BLOCK_SIZE;
    m_B.width=BLOCK_SIZE;
    m_B.elements=b_t;
    Matrix m_C;
    m_C.height=BLOCK_SIZE;
    m_C.width=BLOCK_SIZE;
    m_C.elements=t_C;
    // Print the inputs, run the multiplication, then print the result.
    print_matrix(m_A);
    print_matrix(m_B);
    MatMul(m_A,m_B,m_C);
    print_matrix(m_C);
    cout<<"Final"<<endl;
    return 0;
}
Check the output. If you see the results are wrong, check the kernel error on your system which is reported in output.
Firstly, see here for how to get useful answers to your questions. In particular, you should always check the return value of your CUDA API calls and kernel launches. Also, running cuda-memcheck can often be very helpful to detect out-of-bounds accesses like this.
#harrism asked how you know the result is wrong since you don't appear to do anything with it.
But more importantly you have 15x15 matrices being computed with a 16x16 threadblock, but you're not taking care to disable the out-of-bounds threads. Since you're trying to create a simple example, just increase the matrix size to 16x16 - if you want to handle odd sizes then you'll need to implement the control logic (or use cuBLAS!).

Copy array of structures from host to device in CUDA

I am trying to copy an array of structures from host to device in CUDA. For example:
// Object-like macros must not end in ';': the semicolon is pasted into every
// use, so `float L[N];` would expand to `float L[1000;];`.
#define N 1000     // floats per structure
#define M 100000   // number of structures
// One record of the array being copied: an int field followed by N floats.
typedef struct
{
    int   i;
    float L[N];
} t;
// Placeholder kernel: takes no arguments and has an empty body.
__global__ void kernel()
{
    // do something
}
// Allocates M zero-initialised structures on the host and a matching array on
// the device, copies the host array across and launches the placeholder kernel.
// Returns 0 on success, 1 if the host allocation or any CUDA call fails.
int main () {
    t *B = NULL;     // host array of structures
    t *B_d = NULL;   // device array of structures
    // size_t, not int: this is a byte count and is passed to APIs that take size_t.
    size_t size = (size_t)M * sizeof(t);

    B=(t*)calloc(M,sizeof(t));
    if (B == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }

    cudaError_t err = cudaMalloc((void **) &B_d, size); // Allocate array of structure on device
    // reading B from file ...
    if (err == cudaSuccess)
        err = cudaMemcpy(B_d, B, size, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) {
        kernel<<<1, 1 >>>();
        err = cudaGetLastError();        // launch errors are not returned by <<<>>>
    }
    if (err == cudaSuccess)
        err = cudaDeviceSynchronize();   // wait for the kernel and pick up execution errors
    if (err != cudaSuccess)
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));

    cudaFree(B_d);   // no-op when B_d is still NULL
    free(B);
    return err == cudaSuccess ? 0 : 1;
}
Is that right or not? And how can I use Kernel function?
Now you can declare your kernel as accepting a parameter of type (t *) and pass B_d (the device pointer, not the host pointer B) to the kernel call.
Some comments:
1. Using only 1 thread in the kernel call is very ineffective. For optimal results, you need to have multiples of 32 threads in the block.
2. Having array of structures will not allow your code effectively use memory bandwidth. For optimal results, you need to make coalesced reads.