deep copy of structs to and from device memory - cuda

I am having trouble with the deep copy of an array of structs with dynamically allocated member variables in this cuda code. I think it is occurring because &deviceHistogram points to an address on the host instead of an address on the device. I tried making an intermediate pointer variable as in here, but that did not work; how do I properly copy this entire array of structs so I can modify it from the makeHistogram function?
#include <stdlib.h>
#include <stdio.h>
#include "cuda.h"
typedef struct histogramBin {
int* items;
int count;
} histogramBin;
__host__ __device__ void outputHistogram(histogramBin* histogram, int size) {
for (int i = 0; i < size; i++) {
printf("%d: ", i);
if (!histogram[i].count) {
printf("EMPTY");
} else {
for (int j = 0; j < histogram[i].count; j++) {
printf("%d ", histogram[i].items[j]);
}
}
printf("\n");
}
}
// This function embeds PTX code of CUDA to extract bit field from x.
__device__ uint bfe(uint x, uint start, uint nbits) {
uint bits;
asm("bfe.u32 %0, %1, %2, %3;"
: "=r"(bits)
: "r"(x), "r"(start), "r"(nbits));
return bits;
}
__global__ void makeHistogram(histogramBin** histogram, int* rH, int rSize, int bit) {
for (int r = 0; r < rSize; r++) {
int thisBin = bfe(rH[r], bit, 1);
int position = (*histogram)[thisBin].count; // **** out of memory access here****
(*histogram)[thisBin].items[position] = rH[r];
(*histogram)[thisBin].count++;
}
}
void histogramDriver(histogramBin* histogram, int* rH, int rSize, int bit) {
int n = 8;
int* deviceRH;
histogramBin* deviceHistogram;
cudaMalloc((void**)&deviceRH, rSize * sizeof(int));
cudaMemcpy(deviceRH, rH, rSize * sizeof(int), cudaMemcpyHostToDevice);
cudaMalloc((void**)&deviceHistogram, n * sizeof(histogramBin));
cudaMemcpy(deviceHistogram, histogram, n * sizeof(histogramBin), cudaMemcpyHostToDevice);
int* tempData[n];
for (int i = 0; i < n; i++) {
cudaMalloc(&(tempData[i]), rSize * sizeof(int));
}
for (int i = 0; i < n; i++) {
cudaMemcpy(&(deviceHistogram[i].items), &(tempData[i]), sizeof(int*), cudaMemcpyHostToDevice);
}
for (int i = 0; i < n; i++) {
cudaMemcpy(tempData[i], histogram[i].items, rSize * sizeof(int), cudaMemcpyHostToDevice);
}
makeHistogram<<<1, 1>>>(&deviceHistogram, deviceRH, rSize, bit);
cudaDeviceSynchronize();
}
int main(){
int rSize = 5;
int rH[rSize] = {1, 2, 3, 4, 5};
histogramBin * histogram = (histogramBin*)malloc(sizeof(histogramBin) * 8);
for(int i = 0; i < 8; i++){
histogram[i].items = (int*)calloc(sizeof(int), rSize);
histogram[i].count = 0;
}
histogramDriver(histogram, rH, rSize, 0);
return 0;
}
Once it has been copied properly to the device, how do I get it back on the host? For example, if I call outputHistogram(histogram, 5); from inside makeHistogram, I see the following:
0: 2 4
1: 1 3 5
2: EMPTY
3: EMPTY
4: EMPTY
5: EMPTY
6: EMPTY
7: EMPTY
Which is the output I am expecting.
When I call outputHistogram(histogram, 8) from histogramDriver (after the cudaDeviceSynchronize()) I see the following:
0: EMPTY
1: EMPTY
2: EMPTY
3: EMPTY
4: EMPTY
5: EMPTY
6: EMPTY
7: EMPTY
Clearly I am not properly copying the values back from the device to the host.
I have tried copying by doing the reverse procedure from the one in histogramDriver:
for(int i = 0; i < n; i++){
cudaMemcpy(&(tempData[i]), &(deviceHistogram[i].items), sizeof(int*), cudaMemcpyDeviceToHost);
}
for (int i = 0; i < n; i++) {
cudaMemcpy(histogram[i].items, tempData[i], rSize * sizeof(int), cudaMemcpyDeviceToHost);
}
But the output from the outputHistogram call in histogramDriver remains unchanged.

As #talonmies indicated, the biggest problem here is the design of your kernel. There is no reason/need to use a double-pointer for histogram (and indeed, the first iteration of the code you posted did not have that in the kernel prototype, although it was incomplete).
By removing the double-pointer aspect, your code runs without any runtime errors.
#include <stdlib.h>
#include <stdio.h>
#include "cuda.h"
typedef struct histogramBin {
int* items;
int count;
} histogramBin;
// This function embeds PTX code of CUDA to extract bit field from x.
__device__ uint bfe(uint x, uint start, uint nbits) {
uint bits;
asm("bfe.u32 %0, %1, %2, %3;"
: "=r"(bits)
: "r"(x), "r"(start), "r"(nbits));
return bits;
}
__global__ void makeHistogram(histogramBin* histogram, int* rH, int rSize, int bit) {
for (int r = 0; r < rSize; r++) {
int thisBin = bfe(rH[r], bit, 1);
int position = histogram[thisBin].count;
histogram[thisBin].items[position] = rH[r];
histogram[thisBin].count++;
}
}
void histogramDriver(histogramBin* histogram, int* rH, int rSize, int bit) {
int n = 8;
int* deviceRH;
histogramBin* deviceHistogram;
cudaMalloc((void**)&deviceRH, rSize * sizeof(int));
cudaMemcpy(deviceRH, rH, rSize * sizeof(int), cudaMemcpyHostToDevice);
cudaMalloc((void**)&deviceHistogram, n * sizeof(histogramBin));
cudaMemcpy(deviceHistogram, histogram, n * sizeof(histogramBin), cudaMemcpyHostToDevice);
int* tempData[n];
for (int i = 0; i < n; i++) {
cudaMalloc(&(tempData[i]), rSize * sizeof(int));
}
for (int i = 0; i < n; i++) {
cudaMemcpy(&(deviceHistogram[i].items), &(tempData[i]), sizeof(int*), cudaMemcpyHostToDevice);
}
for (int i = 0; i < n; i++) {
cudaMemcpy(tempData[i], histogram[i].items, rSize * sizeof(int), cudaMemcpyHostToDevice);
}
makeHistogram<<<1, 1>>>(deviceHistogram, deviceRH, rSize, bit);
cudaDeviceSynchronize();
}
int main(){
const int rSize = 5;
int rH[rSize] = {1, 2, 3, 4, 5};
histogramBin * histogram = (histogramBin*)malloc(sizeof(histogramBin) * 8);
for(int i = 0; i < 8; i++){
histogram[i].items = (int*)calloc(sizeof(int), rSize);
histogram[i].count = 0;
}
histogramDriver(histogram, rH, rSize, 0);
return 0;
}
$ nvcc t1452.cu -o t1452
$ cuda-memcheck ./t1452
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$
Note that the only changes here are to the kernel code itself, plus removal of the ampersand on kernel call, plus I added const to the definition of rSize to get things to compile.
I have no idea if it produces correct output, because you've included no way to inspect the output, nor indicated what you expect the output to be. If you are interested in that, those would be good things to include in your MVE.

Related

CUDA in C: How to fix Error 11 with cudaMemcpyAsync

I am currently trying to get a simple multi-GPU program running with CUDA.
What it basically does is it copies a large array with some dummy data in chunks to the GPUs, which do some math, and then copy the resulting array back.
I dont get any errors in the output of VS2017, but some error messages I have set up show me that while trying to copy either H2D or D2H.
It tells me that a cudaErrorInvalidValue is occuring.
Also, when using the cudaFree(); function, i get a cudaErrorInvalidDevicePointer error.
The output of the program, the result, is completely wrong. The kernel is, for testing purposes, only setting every value of the output array to a value of 50. The result is a relatively large negative number, always the same no matter what the kernel does.
I have already tried to use a pointer that is not part of a struct, but is defined right before the cudaMalloc, where it is used first. That did not change anything.
This is the function that runs the Kernel:
void runKernel(int device, int Repetition, float* h_data, float* h_out, int MemoryPerComputation, int BLOCK_N, int THREAD_N, GPUplan gpuplan, KernelPlan kernelPlan)
{
cudaSetDevice(device);
cudaStreamCreate(&gpuplan.stream);
cudaMemcpyAsync(gpuplan.d_data_ptr, h_data, kernelPlan.Computations * MemoryPerComputation, cudaMemcpyHostToDevice, gpuplan.stream); //asynchronous memory copy of the data array h2d
cudaError_t x = cudaGetLastError();
if (x != cudaSuccess) {
printf("Memcpy H2D on GPU %i: Error %i\n", device, x);
}
dummyKernel << <BLOCK_N, THREAD_N, 0, gpuplan.stream >> > (gpuplan.d_data_ptr, gpuplan.d_out_ptr, kernelPlan.ComputationsPerThread, kernelPlan.AdditionalComputationThreadCount); //run kernel
x = cudaGetLastError();
if (x != cudaSuccess) {
printf("no successfull kernel launch\n Kernel Launch Error %i \n", x);
}
else {
printf("kernel ran.\n");
}
cudaMemcpyAsync(h_out, gpuplan.d_out_ptr, kernelPlan.Computations * MemoryPerComputation, cudaMemcpyDeviceToHost, gpuplan.stream); //asynchronous memory copy of the output array d2h
x = cudaGetLastError();
if (x != cudaSuccess) {
printf("Memcpy D2H on GPU %i: Error %i\n", device, x);
}
cudaStreamDestroy(gpuplan.stream);
}
Then here, how the struct is defined in the "kernel.h":
#ifndef KERNEL_H
#define KERNEL_H
#include "cuda_runtime.h"
//GPU plan
typedef struct
{
unsigned int Computations; //computations on this GPU
unsigned int Repetitions; // amount of kernel repetitions
unsigned int ComputationsPerRepetition; // amount of computations in every kernel execution
unsigned int AdditionalComputationRepetitionsCount; // amount of repetitions that need to do one additional computation
unsigned int DataStartingPoint; // tells the kernel launch at which point in the DATA array this GPU has to start working
float* d_data_ptr;
float* d_out_ptr;
cudaStream_t stream;
} GPUplan;
typedef struct
{
unsigned int Computations;
unsigned int ComputationsPerThread; // number of computations every thread of this repetition on this GPU has to do
unsigned int AdditionalComputationThreadCount; // number of threads in this repetition on this GPU that have to
unsigned int DataStartingPoint; // tells the kernel launch at which point in the DATA array this repetition has to start working
} KernelPlan;
GPUplan planGPUComputation(int DATA_N, int GPU_N, int device, long long MemoryPerComputation, int dataCounter);
KernelPlan planKernelComputation(int GPUDataStartingPoint, int GPUComputationsPerRepetition, int GPUAdditionalComputationRepetitionsCount, int Repetition, int dataCounter, int THREAD_N, int BLOCK_N);
void memAllocation(int device, int MemoryPerComputation, GPUplan gpuPlan, KernelPlan kernelPlan);
void runKernel(int device, int Repetition, float* h_data, float* h_out, int MemoryPerComputation, int BLOCK_N, int THREAD_N, GPUplan gpuplan, KernelPlan kernelPlan);
void memFree(int device, GPUplan gpuPlan);
__global__ void dummyKernel(float *d_data, float *d_out, int d_ComputationsPerThread, int d_AdditionalComputationThreadCount);
#endif
here the part of code that calls runKernel:
int GPU_N;
cudaGetDeviceCount(&GPU_N);
const int BLOCK_N = 32;
const int THREAD_N = 1024;
const int DATA_N = 144000;
const int MemoryPerComputation = sizeof(float);
float *h_data;
float *h_out;
h_data = (float *)malloc(MemoryPerComputation * DATA_N);
h_out = (float *)malloc(MemoryPerComputation * DATA_N);
float* sourcePointer;
float* destPointer;
for (int i = 0; i < maxRepetitionCount; i++) // repeat this enough times so that the GPU with the most repetitions will get through all of them
{
//malloc
for (int j = 0; j < GPU_N; j++)
{
if (plan[j].Repetitions >= i) // when this GPU has to do at least i repetitions
{
memAllocation(j, MemoryPerComputation, plan[j], kernelPlan[j*MAX_REP_COUNT + i]);
}
}
//kernel launch/memcpy
for (int j = 0; j < GPU_N; j++)
{
if (plan[j].Repetitions >= i) // when this GPU has to do at least i repetitions
{
sourcePointer = h_data + kernelPlan[j*MAX_REP_COUNT + i].DataStartingPoint;
destPointer = h_out + kernelPlan[j*MAX_REP_COUNT + i].DataStartingPoint;
runKernel(j, i, sourcePointer, destPointer, MemoryPerComputation, BLOCK_N, THREAD_N, plan[j], kernelPlan[j*MAX_REP_COUNT + i]);
}
}
for (int j = 0; j < GPU_N; j++)
{
if (plan[j].Repetitions >= i) // when this GPU has to do at least i repetitions
{
memFree(j, plan[j]);
}
}
}
I dont think that the kernel itself would be of any importance here since the memcpy error already appears before it is even executed.
The expected output is, that every element of the output array is 50. Instead, every element is -431602080.0
The array is a float array.
EDIT: here is the full code used to reproduce the problem (in addition to kernel.h from above):
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include "kernel.h"
#define MAX_GPU_COUNT 32
#define MAX_REP_COUNT 64
__global__ void dummyKernel(float *d_data, float *d_out, int d_ComputationsPerThread, int d_AdditionalComputationThreadCount) {
int computations = d_ComputationsPerThread; //computations to be performed in this repetition on this GPU
const int threadID = blockDim.x * blockIdx.x + threadIdx.x; //thread id within GPU Repetition
if (threadID > d_AdditionalComputationThreadCount) {
computations++; //check if thread has to do an additional computation
}
for (int i = 0; i < computations; i++) {
d_out[i * blockDim.x * gridDim.x + threadID] = 50;
}
}
GPUplan planGPUComputation(int DATA_N, int GPU_N, int device, long long MemoryPerComputation, int dataCounter)
{
GPUplan plan;
size_t free, total;
//computations on GPU #device
plan.Computations = DATA_N / GPU_N;
//take into account odd data size for this GPU
if (DATA_N % GPU_N > device) {
plan.Computations++;
}
plan.DataStartingPoint = dataCounter;
//get memory information
cudaSetDevice(device);
cudaMemGetInfo(&free, &total);
//calculate Repetitions on this GPU #device
plan.Repetitions = ((plan.Computations * MemoryPerComputation / free) + 1);
printf("Repetitions: %i\n", plan.Repetitions);
if (plan.Repetitions > MAX_REP_COUNT) {
printf("Repetition count larger than MAX_REP_COUNT %i\n\n", MAX_REP_COUNT);
}
//calculate Computations per Repetition
plan.ComputationsPerRepetition = plan.Computations / plan.Repetitions;
//calculate how many Repetitions have to do an additional Computation
plan.AdditionalComputationRepetitionsCount = plan.Computations % plan.Repetitions;
return plan;
}
KernelPlan planKernelComputation(int GPUDataStartingPoint, int GPUComputationsPerRepetition, int GPUAdditionalComputationRepetitionsCount, int Repetition, int dataCounter, int THREAD_N, int BLOCK_N)
{
KernelPlan plan;
//calculate total Calculations in this Repetition
plan.Computations = GPUComputationsPerRepetition;
if (GPUAdditionalComputationRepetitionsCount > Repetition) {
plan.Computations++;
}
plan.ComputationsPerThread = plan.Computations / (THREAD_N * BLOCK_N); // Computations every thread has to do (+- 1)
plan.AdditionalComputationThreadCount = plan.Computations % (THREAD_N * BLOCK_N); // how many threads have to do +1 calculation
plan.DataStartingPoint = GPUDataStartingPoint + dataCounter;
return plan;
}
void memAllocation(int device, int MemoryPerComputation, GPUplan gpuPlan, KernelPlan kernelPlan)
{
cudaSetDevice(device); //select device to allocate memory on
cudaError_t x = cudaGetLastError();
if (x != cudaSuccess) {
printf("Error Selecting device %i: Error %i\n", device, x);
}
cudaMalloc((void**)&(gpuPlan.d_data_ptr), MemoryPerComputation * kernelPlan.Computations); // device data array memory allocation
x = cudaGetLastError();
if (x != cudaSuccess) {
printf("Malloc 1 on GPU %i: Error %i\n", device, x);
}
cudaMalloc((void**)&(gpuPlan.d_out_ptr), MemoryPerComputation * kernelPlan.Computations); // device output array memory allocation
x = cudaGetLastError();
if (x != cudaSuccess) {
printf("Malloc 2 on GPU %i: Error %i\n", device, x);
}
}
void runKernel(int device, int Repetition, float* h_data, float* h_out, int MemoryPerComputation, int BLOCK_N, int THREAD_N, GPUplan gpuplan, KernelPlan kernelPlan)
{
cudaSetDevice(device);
cudaStreamCreate(&gpuplan.stream);
cudaMemcpyAsync(gpuplan.d_data_ptr, h_data, kernelPlan.Computations * MemoryPerComputation, cudaMemcpyHostToDevice, gpuplan.stream); //asynchronous memory copy of the data array h2d
cudaError_t x = cudaGetLastError();
if (x != cudaSuccess) {
printf("Memcpy H2D on GPU %i: Error %i\n", device, x);
}
dummyKernel << <BLOCK_N, THREAD_N, 0, gpuplan.stream >> > (gpuplan.d_data_ptr, gpuplan.d_out_ptr, kernelPlan.ComputationsPerThread, kernelPlan.AdditionalComputationThreadCount); //run kernel
x = cudaGetLastError();
if (x != cudaSuccess) {
printf("no successfull kernel launch\n Kernel Launch Error %i \n", x);
}
else {
printf("kernel ran.\n");
}
cudaMemcpyAsync(h_out, gpuplan.d_out_ptr, kernelPlan.Computations * MemoryPerComputation, cudaMemcpyDeviceToHost, gpuplan.stream); //asynchronous memory copy of the output array d2h
x = cudaGetLastError();
if (x != cudaSuccess) {
printf("Memcpy D2H on GPU %i: Error %i\n", device, x);
}
cudaStreamDestroy(gpuplan.stream);
}
void memFree(int device, GPUplan gpuPlan)
{
cudaSetDevice(device); //select device to allocate memory on
cudaFree(gpuPlan.d_data_ptr);
cudaFree(gpuPlan.d_out_ptr);
cudaError_t x = cudaGetLastError();
if (x != cudaSuccess) {
printf("Memfree on GPU %i: Error %i\n", device, x);
}
else {
printf("memory freed.\n");
}
//17 = cudaErrorInvalidDevicePointer
}
int main()
{
//get device count
int GPU_N;
cudaGetDeviceCount(&GPU_N);
//adjust for device count larger than MAX_GPU_COUNT
if (GPU_N > MAX_GPU_COUNT)
{
GPU_N = MAX_GPU_COUNT;
}
printf("GPU count: %i\n", GPU_N);
//definitions for running the program
const int BLOCK_N = 32;
const int THREAD_N = 1024;
const int DATA_N = 144000;
const int MemoryPerComputation = sizeof(float);
///////////////////////////////////////////////////////////
//Subdividing input data across GPUs
//////////////////////////////////////////////
//GPUplan
GPUplan plan[MAX_GPU_COUNT];
int dataCounter = 0;
for (int i = 0; i < GPU_N; i++)
{
plan[i] = planGPUComputation(DATA_N, GPU_N, i, MemoryPerComputation, dataCounter);
dataCounter += plan[i].Computations;
}
//KernelPlan
KernelPlan kernelPlan[MAX_GPU_COUNT*MAX_REP_COUNT];
for (int i = 0; i < GPU_N; i++)
{
int GPURepetitions = plan[i].Repetitions;
dataCounter = plan[i].DataStartingPoint;
for (int j = 0; j < GPURepetitions; j++)
{
kernelPlan[i*MAX_REP_COUNT + j] = planKernelComputation(plan[i].DataStartingPoint, plan[i].ComputationsPerRepetition, plan[i].AdditionalComputationRepetitionsCount, j, dataCounter, THREAD_N, BLOCK_N);
dataCounter += kernelPlan[i*MAX_REP_COUNT + j].Computations;
}
}
float *h_data;
float *h_out;
h_data = (float *)malloc(MemoryPerComputation * DATA_N);
h_out = (float *)malloc(MemoryPerComputation * DATA_N);
//generate some input data
for (int i = 0; i < DATA_N; i++) {
h_data[i] = 2 * i;
}
//get highest repetition count
int maxRepetitionCount = 0;
for (int i = 0; i < GPU_N; i++) {
if (plan[i].Repetitions > maxRepetitionCount) {
maxRepetitionCount = plan[i].Repetitions;
}
}
printf("maxRepetitionCount: %i\n\n", maxRepetitionCount);
float* sourcePointer;
float* destPointer;
for (int i = 0; i < maxRepetitionCount; i++) // repeat this enough times so that the GPU with the most repetitions will get through all of them
{
//malloc
for (int j = 0; j < GPU_N; j++)
{
if (plan[j].Repetitions >= i) // when this GPU has to do at least i repetitions
{
memAllocation(j, MemoryPerComputation, plan[j], kernelPlan[j*MAX_REP_COUNT + i]);
}
}
//kernel launch/memcpy
for (int j = 0; j < GPU_N; j++)
{
if (plan[j].Repetitions >= i) // when this GPU has to do at least i repetitions
{
sourcePointer = h_data + kernelPlan[j*MAX_REP_COUNT + i].DataStartingPoint;
destPointer = h_out + kernelPlan[j*MAX_REP_COUNT + i].DataStartingPoint;
runKernel(j, i, sourcePointer, destPointer, MemoryPerComputation, BLOCK_N, THREAD_N, plan[j], kernelPlan[j*MAX_REP_COUNT + i]);
}
}
for (int j = 0; j < GPU_N; j++)
{
if (plan[j].Repetitions >= i) // when this GPU has to do at least i repetitions
{
memFree(j, plan[j]);
}
}
}
//printing expected results and results
for (int i = 0; i < 50; i++)
{
printf("%f\t", h_data[i]);
printf("%f\n", h_out[i]);
}
free(h_data);
free(h_out);
getchar();
return 0;
}
The first problem has nothing to do with CUDA, actually. When you pass a struct by-value to a function in C or C++, a copy of that struct is made for use by the function. Modifications to that struct in the function have no effect on the original struct in the calling environment. This is affecting you in your memAllocation function:
void memAllocation(int device, int MemoryPerComputation, GPUplan gpuPlan, KernelPlan kernelPlan)
^^^^^^^
passed by value
{
cudaSetDevice(device); //select device to allocate memory on
cudaError_t x = cudaGetLastError();
if (x != cudaSuccess) {
printf("Error Selecting device %i: Error %i\n", device, x);
}
cudaMalloc((void**)&(gpuPlan.d_data_ptr), MemoryPerComputation * kernelPlan.Computations); // device data array memory allocation
^^^^^^^^^^^^^^^^^^
modifying the copy, not the original
This is fairly easily fixable by passing the gpuPlan struct by reference rather than by value. Modify both the prototype in the kernel.h header file, as well as the definition:
void memAllocation(int device, int MemoryPerComputation, GPUplan &gpuPlan, KernelPlan kernelPlan)
^
with that change, the struct is passed by reference, and modifications (such as the setting of the allocated pointers) will show up in the calling environment. This is the proximal reason for the invalid argument report on the cudaMemcpy operations. The pointers you were passing were unallocated, because your allocations were done on the pointer copies, not the originals.
After that change your code may appear to be running correctly. At least when I run it no errors are displayed and the outputs appear to be all set to 50.
However there are still problems with this code. If you run your code with cuda-memcheck (or turn on the memory checker functionality in nsight VSE) you should see errors associated with this line of code, which is indexing out of bounds:
__global__ void dummyKernel(float *d_data, float *d_out, int d_ComputationsPerThread, int d_AdditionalComputationThreadCount) {
...
d_out[i * blockDim.x * gridDim.x + threadID] = 50; //indexing out of bounds
I'm not going to try to sort that out for you. It seems evident to me that your for-loop, coupled with the way you are calculating the index, is going beyond the end of the array. You can follow the methodology discussed here if needed.

Dynamic 2D array using double pointer in CUDA [duplicate]

I'm new to cuda. I want to add up two 2d array into a third array.
I use following code:
cudaMallocPitch((void**)&device_a, &pitch, 2*sizeof(int),2);
cudaMallocPitch((void**)&device_b, &pitch, 2*sizeof(int),2);
cudaMallocPitch((void**)&device_c, &pitch, 2*sizeof(int),2);
now my problem is that i dont want to use these array as flattened 2-d array
all in my kernel code i want to di is use two for loop & put the result in the third array like
__global__ void add(int *dev_a ,int *dev_b,int* dec_c)
{
for i=0;i<2;i++)
{
for j=0;j<2;j++)
{
dev_c[i][j]=dev_a[i][j]+dev_b[i][j];
}
}
}
How i can do this in CUDA?
please tell me how to use 2-d array in this way ?
What should be the kernel call for using 2d-array ?
If possible, please explain using code samples.
The short answer is, you can't. The cudaMallocPitch()function does exactly what its name implies, it allocates pitched linear memory, where the pitch is chosen to be optimal for the GPU memory controller and texture hardware.
If you wanted to use arrays of pointers in the kernel, the kernel code would have to look like this:
__global___ void add(int *dev_a[] ,int *dev_b[], int* dec_c[])
{
for i=0;i<2;i++) {
for j=0;j<2;j++) {
dev_c[i][j]=dev_a[i][j]+dev_b[i][j];
}
}
}
and then you would need nested cudaMalloc calls on the host side to construct the array of pointers and copy it to device memory. For your rather trivial 2x2 example, the code to allocate a single array would look like this:
int ** h_a = (int **)malloc(2 * sizeof(int *));
cudaMalloc((void**)&h_a[0], 2*sizeof(int));
cudaMalloc((void**)&h_a[1], 2*sizeof(int));
int **d_a;
cudaMalloc((void ***)&d_a, 2 * sizeof(int *));
cudaMemcpy(d_a, h_a, 2*sizeof(int *), cudaMemcpyHostToDevice);
Which would leave the allocated device array of pointers in d_a, and you would pass that to your kernel.
For code complexity and performance reasons, you really don't want to do that, using arrays of pointers in CUDA code is both harder and slower than the alternative using linear memory.
To show what folly using arrays of pointers is in CUDA, here is a complete working example of your sample problem which combines the two ideas above:
#include <cstdio>
__global__ void add(int * dev_a[], int * dev_b[], int * dev_c[])
{
for(int i=0;i<2;i++)
{
for(int j=0;j<2;j++)
{
dev_c[i][j]=dev_a[i][j]+dev_b[i][j];
}
}
}
inline void GPUassert(cudaError_t code, char * file, int line, bool Abort=true)
{
if (code != 0) {
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code),file,line);
if (Abort) exit(code);
}
}
#define GPUerrchk(ans) { GPUassert((ans), __FILE__, __LINE__); }
int main(void)
{
const int aa[2][2]={{1,2},{3,4}};
const int bb[2][2]={{5,6},{7,8}};
int cc[2][2];
int ** h_a = (int **)malloc(2 * sizeof(int *));
for(int i=0; i<2;i++){
GPUerrchk(cudaMalloc((void**)&h_a[i], 2*sizeof(int)));
GPUerrchk(cudaMemcpy(h_a[i], &aa[i][0], 2*sizeof(int), cudaMemcpyHostToDevice));
}
int **d_a;
GPUerrchk(cudaMalloc((void ***)&d_a, 2 * sizeof(int *)));
GPUerrchk(cudaMemcpy(d_a, h_a, 2*sizeof(int *), cudaMemcpyHostToDevice));
int ** h_b = (int **)malloc(2 * sizeof(int *));
for(int i=0; i<2;i++){
GPUerrchk(cudaMalloc((void**)&h_b[i], 2*sizeof(int)));
GPUerrchk(cudaMemcpy(h_b[i], &bb[i][0], 2*sizeof(int), cudaMemcpyHostToDevice));
}
int ** d_b;
GPUerrchk(cudaMalloc((void ***)&d_b, 2 * sizeof(int *)));
GPUerrchk(cudaMemcpy(d_b, h_b, 2*sizeof(int *), cudaMemcpyHostToDevice));
int ** h_c = (int **)malloc(2 * sizeof(int *));
for(int i=0; i<2;i++){
GPUerrchk(cudaMalloc((void**)&h_c[i], 2*sizeof(int)));
}
int ** d_c;
GPUerrchk(cudaMalloc((void ***)&d_c, 2 * sizeof(int *)));
GPUerrchk(cudaMemcpy(d_c, h_c, 2*sizeof(int *), cudaMemcpyHostToDevice));
add<<<1,1>>>(d_a,d_b,d_c);
GPUerrchk(cudaPeekAtLastError());
for(int i=0; i<2;i++){
GPUerrchk(cudaMemcpy(&cc[i][0], h_c[i], 2*sizeof(int), cudaMemcpyDeviceToHost));
}
for(int i=0;i<2;i++) {
for(int j=0;j<2;j++) {
printf("(%d,%d):%d\n",i,j,cc[i][j]);
}
}
return cudaThreadExit();
}
I recommend you study it until you understand what it does, and why it is such a poor idea compared to using linear memory.
You don't need to use for loops inside the device. Try this code.
#include <stdio.h>
#include <cuda.h>
#include <stdlib.h>
#include <time.h>
#define N 800
__global__ void matrixAdd(float* A, float* B, float* C){
int i = threadIdx.x;
int j = blockIdx.x;
C[N*j+i] = A[N*j+i] + B[N*j+i];
}
int main (void) {
clock_t start = clock();
float a[N][N], b[N][N], c[N][N];
float *dev_a, *dev_b, *dev_c;
cudaMalloc((void **)&dev_a, N * N * sizeof(float));
cudaMalloc((void **)&dev_b, N * N * sizeof(float));
cudaMalloc((void **)&dev_c, N * N * sizeof(float));
for (int i = 0; i < N; i++){
for (int j = 0; j < N; j++){
a[i][j] = rand() % 10;
b[i][j] = rand() % 10;
}
}
cudaMemcpy(dev_a, a, N * N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, N * N * sizeof(float), cudaMemcpyHostToDevice);
matrixAdd <<<N,N>>> (dev_a, dev_b, dev_c);
cudaMemcpy(c, dev_c, N * N * sizeof(float), cudaMemcpyDeviceToHost);
for (int i = 0; i < N; i++){
for (int j = 0; j < N; j++){
printf("[%d, %d ]= %f + %f = %f\n",i,j, a[i][j], b[i][j], c[i][j]);
}
}
printf("Time elapsed: %f\n", ((double)clock() - start) / CLOCKS_PER_SEC);
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
return 0;
}

Can't get matrix*vector multiplication to go faster in CUDA than in CPU

#include <iostream>
#include <assert.h>
#include <sys/time.h>
#define BLOCK_SIZE 32 // CUDA block size
__device__ inline int getValFromMatrix(int* matrix, int row, int col,int matSize) {
if (row<matSize && col<matSize) {return matrix[row*matSize + col];}
return 0;
}
__device__ inline int getValFromVector(int* vector, int row, int matSize) {
if (row<matSize) {return vector[row];}
return 0;
}
__global__ void matVecMultCUDAKernel(int* aOnGPU, int* bOnGPU, int* cOnGPU, int matSize) {
__shared__ int aRowShared[BLOCK_SIZE];
__shared__ int bShared[BLOCK_SIZE];
__shared__ int myRow;
__shared__ double rowSum;
int myIndexInBlock = threadIdx.x;
myRow = blockIdx.x;
rowSum = 0;
for (int m = 0; m < (matSize / BLOCK_SIZE + 1);m++) {
aRowShared[myIndexInBlock] = getValFromMatrix(aOnGPU,myRow,m*BLOCK_SIZE+myIndexInBlock,matSize);
bShared[myIndexInBlock] = getValFromVector(bOnGPU,m*BLOCK_SIZE+myIndexInBlock,matSize);
__syncthreads(); // Sync threads to make sure all fields have been written by all threads in the block to cShared and xShared
if (myIndexInBlock==0) {
for (int k=0;k<BLOCK_SIZE;k++) {
rowSum += aRowShared[k] * bShared[k];
}
}
}
if (myIndexInBlock==0) {cOnGPU[myRow] = rowSum;}
}
static inline void cudaCheckReturn(cudaError_t result) {
if (result != cudaSuccess) {
std::cerr <<"CUDA Runtime Error: " << cudaGetErrorString(result) << std::endl;
assert(result == cudaSuccess);
}
}
static void matVecMultCUDA(int* aOnGPU,int* bOnGPU, int* cOnGPU, int* c, int sizeOfc, int matSize) {
matVecMultCUDAKernel<<<matSize,BLOCK_SIZE>>>(aOnGPU,bOnGPU,cOnGPU,matSize); // Launch 1 block per row
cudaCheckReturn(cudaMemcpy(c,cOnGPU,sizeOfc,cudaMemcpyDeviceToHost));
}
static void matVecMult(int** A,int* b, int* c, int matSize) {
// Sequential implementation:
for (int i=0;i<matSize;i++) {
c[i]=0;
for (int j=0;j<matSize;j++) {
c[i]+=(A[i][j] * b[j]);
}
}
}
int main() {
int matSize = 1000;
int** A,* b,* c;
int* aOnGPU,* bOnGPU,* cOnGPU;
A = new int*[matSize];
for (int i = 0; i < matSize;i++) {A[i] = new int[matSize]();}
b = new int[matSize]();
c = new int[matSize]();
int aSizeOnGPU = matSize * matSize * sizeof(int), bcSizeOnGPU = matSize * sizeof(int);
cudaCheckReturn(cudaMalloc(&aOnGPU,aSizeOnGPU)); // cudaMallocPitch?
cudaCheckReturn(cudaMalloc(&bOnGPU,bcSizeOnGPU));
cudaCheckReturn(cudaMalloc(&cOnGPU,bcSizeOnGPU));
srand(time(NULL));
for (int i=0;i<matSize;i++) {
b[i] = rand()%100;
for (int j=0;j<matSize;j++) {
A[i][j] = rand()%100;
}
}
for (int i=0;i<matSize;i++) {cudaCheckReturn(cudaMemcpy((aOnGPU+i*matSize),A[i],bcSizeOnGPU,cudaMemcpyHostToDevice));}
cudaCheckReturn(cudaMemcpy(bOnGPU,b,bcSizeOnGPU,cudaMemcpyHostToDevice));
int iters=1;
timeval start,end;
// Sequential run:
gettimeofday(&start,NULL);
for (int i=0;i<iters;i++) {matVecMult(A,b,c,matSize);}
gettimeofday(&end,NULL);
std::cout << (end.tv_sec*1000000 + end.tv_usec) - (start.tv_sec*1000000 + start.tv_usec) << std::endl;
// CUDA run:
gettimeofday(&start,NULL);
for (int i=0;i<iters;i++) {matVecMultCUDA(aOnGPU,bOnGPU,cOnGPU,c,bcSizeOnGPU,matSize);}
gettimeofday(&end,NULL);
std::cout << (end.tv_sec*1000000 + end.tv_usec) - (start.tv_sec*1000000 + start.tv_usec) << std::endl;
cudaCheckReturn(cudaFree(aOnGPU));
cudaCheckReturn(cudaFree(bOnGPU));
cudaCheckReturn(cudaFree(cOnGPU));
for (int i = 0; i < matSize; ++i) {
delete[] A[i];
}
delete[] A;
delete[] b;
delete[] c;
}
Gives:
267171
580253
I've followed the guide on http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory, on how to do a matrix multiplication. I used shared memory for both the matrix (A) and the vector (B), but no matter what matrix size (100*100-20000*20000) or block size (32-1024) i choose, the sequential implementation always outperforms the CUDA implementation in terms of speed, it is about twice as fast.
Since I'm using matrix*vector multiplication, the shared arrays and blocks are handled a bit different; I'm using one block per row of the matrix instead of a 2D block over a part of the matrix.
Is my implementation wrong, or is simply CUDA not faster than the CPU?
First item: You perform checks on boundaries in the cuda implementation where you don't on CPU. Branching are really expensive on a GPU.
Second : You count the cudamemcpy in the cuda performance. It's very uncommon to perform only one multiplication before having to get the result back to cpu.
Usually (on CG for example), you perform several hundreds of multiplication on GPU before having to copy back.
Third: Dont try to implement that (except for educational purposes) and use vendor libraries (like CUBLAS, which ships with every CUDA release), which are extremely hard to outperform.

CUDA - unexpected result with float array

I faced an issue I do not understand. I am trying to set values of an array in the device. With int array I am doing this this way:
#define VECTOR_SIZE 8
int main()
{
printf("Start\n");
int *input_d;
int *output_d;
int output_h[VECTOR_SIZE];
int input_h[VECTOR_SIZE] = { 1, 2, 3, 4, 5, 6, 7, 8 };
int size = VECTOR_SIZE*sizeof(int);
cudaMalloc(&input_d,size);
cudaMalloc(&output_d,size);
cudaMemcpy(input_d,input_h,size,cudaMemcpyHostToDevice);
kernel<<<1,VECTOR_SIZE>>>(input_d,output_d);
cudaMemcpy(output_h,output_d,size, cudaMemcpyDeviceToHost);
cudaFree(input_d);
cudaFree(output_d);
return 0;
}
The kernel looks like:
__global__ void kernel(int* input, int* output)
{
int dx = threadIdx.x + (blockDim.x * blockIdx.x);
output[dx] = dx;
}
The output (output_h) is just like I expected {0, 1, 2, 3, 4, 5, 6, 7}. Now when I am trying do the same on float array:
#define VECTOR_SIZE 8
int main()
{
printf("Start\n");
float *input_d;
float *output_d;
float output_h[VECTOR_SIZE];
float input_h[VECTOR_SIZE] = { 1, 2, 3, 4, 5, 6, 7, 8 };
int size = VECTOR_SIZE*sizeof(float);
cudaMalloc(&input_d,size);
cudaMalloc(&output_d,size);
cudaMemcpy(input_d,input_h,size,cudaMemcpyHostToDevice);
kernel<<<1,VECTOR_SIZE>>>(input_d,output_d);
cudaMemcpy(output_h,output_d,size, cudaMemcpyDeviceToHost);
cudaFree(input_d);
cudaFree(output_d);
return 0;
}
with kernel:
__global__ void kernel(float* input, float* output)
{
int dx = threadIdx.x + (blockDim.x * blockIdx.x);
output[dx] = dx;
}
I am receiving zero array on the device in output_h variable.
The full code for handling float arrays:
#include "cuda_runtime.h"
#include <stdio.h>
#define VECTOR_SIZE 8
__global__ void kernel(float* input, float* output)//, int halfSize)
{
int dx = threadIdx.x + (blockDim.x * blockIdx.x);
output[dx] = dx;
}
int main()
{
printf("Start\n");
float *input_d;
float *output_d;
float output_h[VECTOR_SIZE];
float input_h[VECTOR_SIZE] = { 1, 2, 3, 4, 5, 6, 7, 8 };
int size = VECTOR_SIZE*sizeof(float);
cudaMalloc(&input_d,size);
cudaMalloc(&output_d,size);
cudaMemcpy(input_d,input_h,size,cudaMemcpyHostToDevice);
kernel<<<1,VECTOR_SIZE>>>(input_d,output_d);
cudaMemcpy(output_h,output_d,size, cudaMemcpyDeviceToHost);
cudaFree(input_d);
cudaFree(output_d);
int i;
for (i=1; i<=VECTOR_SIZE; i++)
{
printf("%d, ", output_h[i-1]);
}
getchar();
return 0;
}
Both the integer and floating point versions of CUDA code you have posted work perfectly. The only mistake is how you are printing out the values returned by the kernel in the case of the floating point code:
int i;
for (i=1; i<=VECTOR_SIZE; i++)
{
printf("%d, ", output_h[i-1]);
}
should be changed to
int i;
for (i=0; i<VECTOR_SIZE; i++)
{
printf("%f, ", output_h[i]);
}
(note that the %f format is required for printing floating point numbers).
Given that CUDA uses a C++ compiler for host code by default, you should probably prefer iostream to printf - it will work irrespective of the type of the output and not cause the error you are seeing. If I were to write a "universal" version of your example it would look like this:
#include <iostream>
template<typename T>
__global__ void kernel(T* output)
{
int dx = threadIdx.x + (blockDim.x * blockIdx.x);
output[dx] = dx;
}
template<typename T, int VECTOR_SIZE>
void do_run(void)
{
T *output_d;
T output_h[VECTOR_SIZE] = { 999 };
size_t size = sizeof(output_h);
cudaMalloc(&output_d,size);
kernel<T><<<1,VECTOR_SIZE>>>(output_d);
cudaMemcpy(output_h, output_d, size, cudaMemcpyDeviceToHost);
for(int i=0; i<VECTOR_SIZE; i++)
std::cout << output_h[i] << std::endl;
cudaFree(output_d);
}
int main()
{
std::cout << "Integer version" << std::endl;
do_run<int, 8>();
std::cout << "floating point version" << std::endl;
do_run<float, 8>();
return 0;
}
Note that the output code can be used unchanged for both int and float versions, eliminating the possibility of the mistake you made here.

How to use function pointers in polymorphism?

I want to be able to sort an array out using function pointers in polymorphism. Not to mention, am only doing this to see how things work and so forth.
Here's a simple generic sorting interface, an insertion sort implemented through that interface, and some test code that demonstrates its use:
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
struct sort_interface {
// number of elements
size_t nmemb;
// passed through to 'arg' of compare() and swap()
void *arg;
// compares elements at 'i' and 'j'
int (*compare)(void *arg, size_t i, size_t j);
// swaps elements at 'i' and 'j'
void (*swap)(void *arg, size_t i, size_t j);
};
static void insertion_sort (struct sort_interface iface)
{
for (size_t i = 0; i < iface.nmemb; i++) {
size_t j = i;
while (j > 0) {
if (iface.compare(iface.arg, j - 1, j) <= 0) {
break;
}
iface.swap(iface.arg, j - 1, j);
j--;
}
}
}
static int func_comparator (void *arg, size_t i, size_t j)
{
int *arr = arg;
if (arr[i] < arr[j]) {
return -1;
}
if (arr[i] > arr[j]) {
return 1;
}
return 0;
}
static void func_swap (void *arg, size_t i, size_t j)
{
int *arr = arg;
int temp = arr[i];
arr[i] = arr[j];
arr[j] = temp;
}
int main (int argc, char *argv[])
{
int arr[] = {7, 6, 8, 2, 9, 1, 156, 1, 62, 1671, 15};
size_t count = sizeof(arr) / sizeof(arr[0]);
struct sort_interface iface;
iface.nmemb = count;
iface.arg = arr;
iface.compare = func_comparator;
iface.swap = func_swap;
insertion_sort(iface);
for (size_t i = 0; i < count; i++) {
printf("%d ", arr[i]);
}
printf("\n");
return 0;
}
You might also want to take a look at the qsort() function of the C standard library, which too uses a function pointer comparator, but is somewhat limited to compared to the above. In particular, it assumes you're sorting a continuous array, and if you have pointers to elements or their members, those will be broken (but the above interface allows you to fix pointers in swap()).
Here's an example for how to use the qsort() interface, and also an insertion sort implementation that uses the same interface as qsort():
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
static void insertion_sort (void *base, size_t nmemb, size_t size, int(*compar)(const void *, const void *))
{
char temp[size];
for (size_t i = 0; i < nmemb; i++) {
size_t j = i;
while (j > 0) {
char *x = (char *)base + (j - 1) * size;
char *y = (char *)base + j * size;
if (compar(x, y) <= 0) {
break;
}
memcpy(temp, x, size);
memcpy(x, y, size);
memcpy(y, temp, size);
j--;
}
}
}
static int int_comparator (const void *ve1, const void *ve2)
{
const int *e1 = ve1;
const int *e2 = ve2;
if (*e1 < *e2) {
return -1;
}
if (*e1 > *e2) {
return 1;
}
return 0;
}
int main (int argc, char *argv[])
{
int arr[] = {7, 6, 8, 2, 9, 1, 156, 1, 62, 1671, 15};
size_t count = sizeof(arr) / sizeof(arr[0]);
qsort(arr, count, sizeof(arr[0]), int_comparator); // or insertion_sort()
for (size_t i = 0; i < count; i++) {
printf("%d ", arr[i]);
}
printf("\n");
return 0;
}