How to create a matrix on the GPU and print it on the CPU? - CUDA

This is code to create a matrix on the GPU and print it out on the CPU. Can anyone tell me where I am going wrong? Thank you.
#include <stdio.h>

__global__ void create(int **d_a){
    int i = threadIdx.x;
    int j = threadIdx.y;
    d_a[i][j] = 1;
}

void errorCheck(){
    cudaError_t error = cudaGetLastError();
    if(error != cudaSuccess){
        // print the CUDA error message and exit
        printf("CUDA error: %s\n", cudaGetErrorString(error));
        exit(-1);
    }
}
#define N 5

int main(){
    int **d_a, **a;
    a = (int**)malloc(N * sizeof(int*));
    for (int i = 0; i < N; i++){
        a[i] = (int*)malloc(N * sizeof(int));
    }
    cudaMalloc((void***)&d_a, N * sizeof(int*));
    for (int i = 0; i < N; i++){
        cudaMalloc((void**)&d_a, N * sizeof(int));
    }
    errorCheck();
    create<<<1, N>>>(d_a);
    errorCheck();
    cudaMemcpy(a, d_a, (N * N) * sizeof(int), cudaMemcpyDeviceToHost);
    for (int i = 0; i < N; i++){
        for (int j = 0; j < N; j++){
            printf("%d", a[i][j]);
        }
        printf("\n");
    }
    cudaFree(d_a);
    free(a);
    return 0;
}
Is there something wrong with the memory allocation or the memcpy?

Is there something wrong with the memory allocation or the memcpy?
Yes, on both counts.
This:
cudaMalloc((void***)&d_a, N * sizeof(int*));
for (int i = 0; i < N; i++){
    cudaMalloc((void**)&d_a, N * sizeof(int));
}
would have to be done like this:
cudaMalloc((void***)&d_a, N * sizeof(int*));
for (int i = 0; i < N; i++){
    int *row;
    cudaMalloc((void**)&row, N * sizeof(int));
    cudaMemcpy(d_a + i, &row, sizeof(int*), cudaMemcpyHostToDevice);
}
And then this:
create<<<1, N>>>(d_a);
errorCheck();
cudaMemcpy(a, d_a, (N * N) * sizeof(int), cudaMemcpyDeviceToHost);
would have to be done like this:
create<<<1, dim3(N, N)>>>(d_a);
errorCheck();
for (int i = 0; i < N; i++){
    int *row;
    cudaMemcpy(&row, d_a + i, sizeof(int*), cudaMemcpyDeviceToHost);
    cudaMemcpy(a[i], row, N * sizeof(int), cudaMemcpyDeviceToHost);
}
[All code written in browser and not tested, use at own risk]
In short, you have decided to work with an array of pointers. This requires additional CUDA API operations, because the row pointers stored in the GPU copy cannot be read or written from the host by ordinary assignment; every access to them has to go through cudaMemcpy.
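As an aside (not part of the fix above): if you don't specifically need the array-of-pointers layout, a common simplification is to flatten the matrix into a single N*N allocation and index it manually. A minimal sketch of that alternative, likewise written in the browser and untested:
#include <stdio.h>

#define N 5

__global__ void create_flat(int *d_a, int n){
    int i = threadIdx.x;
    int j = threadIdx.y;
    d_a[i * n + j] = 1;   // row-major index into one flat buffer
}

int main(){
    int *d_a, *a;
    a = (int*)malloc(N * N * sizeof(int));
    cudaMalloc((void**)&d_a, N * N * sizeof(int));   // one allocation, no pointer table
    create_flat<<<1, dim3(N, N)>>>(d_a, N);
    cudaMemcpy(a, d_a, N * N * sizeof(int), cudaMemcpyDeviceToHost);
    for (int i = 0; i < N; i++){
        for (int j = 0; j < N; j++) printf("%d", a[i * N + j]);
        printf("\n");
    }
    cudaFree(d_a);
    free(a);
    return 0;
}
With a single allocation, one cudaMemcpy moves the whole matrix in either direction and the kernel needs no pointer-dereference chain.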

Related

Computing the mean of 2000 2D-arrays with CUDA C

I have 2000 2D-arrays (each array is 1000x1000). I need to compute the mean of each one and put the results in a single vector of 2000 elements.
I tried doing this by calling the kernel once per 2D-array, but that is naive; I want to do the whole computation at once.
What I have is a kernel that handles one 2D-array. I want my kernel to handle all 2000 2D-arrays in a single launch.
#include <stdio.h>
#include <cuda.h>
#include <time.h>

void init_mat(float *a, const int N, const int M);
void print_mat(float *a, const int N, const int M, char *d);
void print_array(float *a, const int N, char *d);

const int threadsPerBlock = 256;

__global__
void kernel(float *mat, float *out, const int N, const int M){
    __shared__ float cache[threadsPerBlock];
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int cacheIndex = threadIdx.x;
    float sum = 0;
    if (tid < M) {
        for (int i = 0; i < N; i++)
            sum += mat[(i * M) + tid];
        cache[cacheIndex] = sum;
        out[tid] = cache[cacheIndex];
    }
    __syncthreads();
    int i = blockDim.x / 2;
    while (i != 0) {
        if (cacheIndex < i)
            cache[cacheIndex] += cache[cacheIndex + i];
        __syncthreads();
        i /= 2;
    }
    if (cacheIndex == 0)
        out[blockIdx.x] = cache[0];
}
int main (void) {
    srand( time(NULL) );
    float *a, *b, *c;
    float *dev_a, *dev_b, *dev_c;
    int N = 1000;
    int M = 1000;
    b = (float*)malloc(sizeof(float) * N * M);
    c = (float*)malloc(sizeof(float) * M);
    init_mat(b, N, M);
    printf("<<<<<<<<<< initial data:\n");
    print_mat(b, N, M, "matrix");
    cudaMalloc((void**)&dev_b, sizeof(float) * N * M);
    cudaMalloc((void**)&dev_c, sizeof(float) * M);
    cudaMemcpy(dev_b, b, sizeof(float) * N * M, cudaMemcpyHostToDevice);
    printf("\n\nRunning Kernel...\n\n");
    kernel<<<M / 256 + 1, 256>>>(dev_b, dev_c, N, M);
    cudaMemcpy(c, dev_c, sizeof(float) * M, cudaMemcpyDeviceToHost);
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
    printf(">>>>>>>>>> final data:\n");
    print_array(c, M, "out-vector");
}
void init_mat(float *a, const int N, const int M) {
    int i, j;
    for (i = 0; i < N; i++)
        for (j = 0; j < M; j++)
            a[i * M + j] = rand() % 100 + 1;
}

void print_mat(float *a, const int N, const int M, char *d) {
    int i, j;
    for (i = 0; i < N; i++){
        printf("\n%s[%d]:", d, i);
        for (j = 0; j < M; j++)
            printf("\t%6.4f", a[i * M + j]);
    }
    printf("\n");
}

void print_array(float *a, const int N, char *d) {
    int i;
    for (i = 0; i < N; i++)
        printf("\n%s[%d]: %f", d, i, a[i]);
    printf("\n");
}
For a reasonably large number of arrays (e.g. 2000) and reasonably large arrays (e.g. 2000 elements), the GPU can be fairly efficient if we assign one block to perform the sum reduction (and mean calculation) for each array. This means that if you have 2000 arrays, we will launch 2000 blocks.
In order to handle arbitrarily sized arrays with a fixed number of threads per block, we will use an idea like the grid-striding loop, except that each block will use a block-striding loop to load all the data associated with its particular array. That is, the threads of each block "stride" through the assigned array to load all of its elements.
Apart from this, the main reduction operation is similar to what you have written, and calculating the mean is trivial this way: once we have the sum computed via the reduction, we divide by the array size before writing the result to global memory.
Here is a worked example. If you compile with -DMEAN the code will output the mean of each array. If you omit that compile switch, the code will output the sum of each array. Let N be the number of arrays, and let K be the size of each array.
$ cat t1285.cu
#include <stdio.h>
#include <string.h>

const size_t N = 1000;  // number of arrays
const size_t K = 1000;  // size of each array
const int nTPB = 256;   // number of threads per block, must be a power-of-2
typedef float mytype;   // type of data to be summed

// produce the sum or mean of each array
template <typename T>
__global__ void breduce(const T * __restrict__ idata, T * __restrict__ odata, const int bsize){
    __shared__ T sdata[nTPB];
    T sum = 0;
    // block-striding loop
    size_t offset = blockIdx.x * bsize + threadIdx.x;
    while (offset < (blockIdx.x + 1) * bsize){
        sum += idata[offset];
        offset += blockDim.x;}
    sdata[threadIdx.x] = sum;
    __syncthreads();
    // shared memory reduction sweep
    for (int i = nTPB >> 1; i > 0; i >>= 1){
        if (threadIdx.x < i) sdata[threadIdx.x] += sdata[threadIdx.x + i];
        __syncthreads();}
    // write output sum (or mean) for this block/array
#ifndef MEAN
    if (!threadIdx.x) odata[blockIdx.x] = sdata[0];
#else
    if (!threadIdx.x) odata[blockIdx.x] = sdata[0] / bsize;
#endif
}

int main(){
    mytype *h_idata, *h_odata, *d_idata, *d_odata;
    h_idata = (mytype *)malloc(N * K * sizeof(mytype));
    h_odata = (mytype *)malloc(N * sizeof(mytype));
    cudaMalloc(&d_idata, N * K * sizeof(mytype));
    cudaMalloc(&d_odata, N * sizeof(mytype));
    for (size_t i = 0; i < N; i++)
        for (size_t j = 0; j < K; j++)
            h_idata[i * K + j] = 1 + (i & 1);   // fill alternating arrays with 1 and 2
    memset(h_odata, 0, N * sizeof(mytype));     // zero out
    cudaMemset(d_odata, 0, N * sizeof(mytype)); // zero out
    cudaMemcpy(d_idata, h_idata, N * K * sizeof(mytype), cudaMemcpyHostToDevice);
    breduce<<<N, nTPB>>>(d_idata, d_odata, K);
    cudaMemcpy(h_odata, d_odata, N * sizeof(mytype), cudaMemcpyDeviceToHost);
    // validate
    for (size_t i = 0; i < N; i++)
#ifndef MEAN
        if (h_odata[i] != (K * (1 + (i & 1)))) {printf("mismatch at %d, was: %f, should be: %f\n", (int)i, (float)h_odata[i], (float)(K * (1 + (i & 1)))); return 1;}
#else
        if (h_odata[i] != ((1 + (i & 1)))) {printf("mismatch at %d, was: %f, should be: %f\n", (int)i, (float)h_odata[i], (float)((1 + (i & 1)))); return 1;}
#endif
    return 0;
}
$ nvcc -arch=sm_35 -o t1285 t1285.cu -DMEAN
$ cuda-memcheck ./t1285
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$ nvcc -arch=sm_35 -o t1285 t1285.cu
$ cuda-memcheck ./t1285
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$

Error in simple reduce on CUDA

I'm a newbie in CUDA C... I want to sum the elements of an array (with a reduction) using 1 block and 267 threads and shared memory. I read the book "CUDA by Example: An Introduction to General-Purpose GPU Programming". Following some recommendations from it, I wrote my version of the program:
#include <cstdio>
#include <iostream>

__global__ void
conva(int* a, int* out)
{
    __shared__ int cache[534];
    int cacheIndex = threadIdx.x;
    for (int n = 0; n < 2; ++n) {
        cache[cacheIndex + n] = a[cacheIndex + n];
        int i = blockDim.x / 2;
        while (i != 0) {
            if (cacheIndex < i)
                cache[cacheIndex + n] += cache[cacheIndex + n + i];
            __syncthreads();
            i /= 2;
        }
    }
    // whether this __syncthreads() is needed or not, I don't know
    __syncthreads();
    if (cacheIndex == 0)
        out = &cache[0];
}
int main(int argc, char** argv)
{
    // input array for the sum
    int convolution[534];
    for (int i = 0; i < 534; ++i)
        convolution[i] = 1;
    // variable in which we receive the sum from the device
    int summa = 0;
    // the buffers we copy from host to device
    int* tash;
    int* convolution_gpu;
    cudaMalloc((void**)(&convolution_gpu), 534 * sizeof(int));
    cudaMalloc((void**)(&tash), sizeof(int));
    cudaMemcpy(convolution_gpu, convolution, 534 * sizeof(int), cudaMemcpyHostToDevice);
    // call the kernel with 1 block and 267 threads
    conva<<<1, 267>>>(convolution_gpu, tash);
    cudaMemcpy(&summa, tash, sizeof(int), cudaMemcpyDeviceToHost);
    // and here I want 534, but I get garbage (maybe)
    std::cout << summa << std::endl;
    cudaFree(convolution_gpu);
    cudaFree(tash);
    getchar();
}
Please tell me where the error is here and help me resolve it...
(sorry for my English)
In your kernel, this:
if (cacheIndex == 0)
    out = &cache[0];
is almost certainly wrong. Surely you want something like:
if (cacheIndex == 0)
    *out = cache[0];
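Assigning out = &cache[0] only overwrites the kernel's local copy of the parameter with a pointer to shared memory (which no longer exists once the kernel finishes); the result has to be stored through the pointer instead. Beyond that one-line fix, the loop structure in the question also has problems (267 is not a power of two, and the n loop only touches elements 0..267 of the 534). A minimal corrected sketch of the whole kernel, untested and assuming the question's fixed sizes:
__global__ void conva(const int* a, int* out)
{
    __shared__ int cache[512];          // next power of two above 267
    int t = threadIdx.x;                // 0..266
    cache[t] = a[t] + a[t + 267];       // each of the 267 threads sums two inputs
    if (t + 267 < 512)
        cache[t + 267] = 0;             // zero the padding slots
    __syncthreads();
    for (int i = 256; i > 0; i >>= 1) { // power-of-two halving sweep
        if (t < i)
            cache[t] += cache[t + i];
        __syncthreads();
    }
    if (t == 0)
        *out = cache[0];                // write the value, not a pointer
}
With the 534 ones from the question, *out should then come back as 534.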

Separating even and odd numbers in CUDA

I have an array of numbers such as {1,2,3,4,5,6,7,8,9,10} and I want to separate the even and odd numbers, as:
even = {2,4,6,8}
and:
odd = {1,3,5,7}
I am aware of atomic operations in CUDA, and also aware that the output is not expected to suffer from race conditions. I don't want to use atomic operations. How can I achieve this without using atomic keywords?
CODE:
#include <stdio.h>
#include <cuda.h>

// Kernel that executes on the CUDA device
__global__ void square_array(float *total, float *even, float *odd, int N)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int a = total[idx];
    if ((a % 2) == 0)
    {
        for (int i = 0; i <= idx; i++)
        {
            int b = even[i];
            if (b == 0)
            {
                even[i] = total[idx];
                break;
            }
        }
    }
    else
    {
        for (int i = 0; i < idx; i++)
        {
            int c = odd[i];
            odd[i] = total[idx];
            break;
        }
    }
}
// main routine that executes on the host
int main(void)
{
    float *total_h, *even_h, *odd_h, *total_d, *even_d, *odd_d; // Pointers to host & device arrays
    const int N = 10;                   // Number of elements in arrays
    size_t size = N * sizeof(float);
    total_h = (float *)malloc(size);    // Allocate array on host
    even_h = (float *)malloc(size);     // Allocate array on host
    odd_h = (float *)malloc(size);      // Allocate array on host
    cudaMalloc((void **) &total_d, size);
    cudaMalloc((void **) &even_d, size);
    cudaMemset(even_d, 0, size);
    cudaMalloc((void **) &odd_d, size); // Allocate array on device
    cudaMemset(odd_d, 0, size);
    // Initialize host array and copy it to CUDA device
    for (int i = 0; i < N; i++) total_h[i] = (float)i + 1;
    cudaMemcpy(total_d, total_h, size, cudaMemcpyHostToDevice);
    // Do calculation on device:
    square_array<<<1, 10>>>(total_d, even_d, odd_d, N);
    // Retrieve result from device and store it in host array
    cudaMemcpy(even_h, even_d, sizeof(float) * N, cudaMemcpyDeviceToHost);
    cudaMemcpy(odd_h, odd_d, sizeof(float) * N, cudaMemcpyDeviceToHost);
    // Print results
    printf("total Numbers\n");
    for (int i = 0; i < N; i++) printf("%f\n", total_h[i]);
    printf("EVEN Numbers\n");
    for (int i = 0; i < N; i++) printf("%f\n", even_h[i]);
    printf("ODD Numbers\n");
    for (int i = 0; i < N; i++) printf("%f\n", odd_h[i]);
    // Cleanup
    free(total_h);
    free(even_h);
    free(odd_h);
    cudaFree(total_d);
    cudaFree(even_d);
    cudaFree(odd_d);
}
As suggested by Jared Hoberock, it would be much easier to use the efficient partitioning algorithm available in CUDA Thrust, instead of developing a partitioning routine of your own. Below, please find a complete worked example.
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/partition.h>
#include <thrust/execution_policy.h>
#include <cstdio>

struct is_even { __host__ __device__ bool operator()(const int &x) { return (x % 2) == 0; } };

int main() {
    const int N = 10;
    thrust::host_vector<int> h_data(N);
    for (int i = 0; i < N; i++) h_data[i] = i;
    thrust::device_vector<int> d_data(h_data);
    thrust::device_vector<int> d_evens(N / 2);
    thrust::device_vector<int> d_odds(N / 2);
    thrust::partition_copy(d_data.begin(), d_data.end(), d_evens.begin(), d_odds.begin(), is_even());
    printf("Even numbers\n");
    for (int i = 0; i < N / 2; i++) {
        int val = d_evens[i];
        printf("evens[%i] = %i\n", i, val);
    }
    printf("Odd numbers\n");
    for (int i = 0; i < N / 2; i++) {
        int val = d_odds[i];
        printf("odds[%i] = %i\n", i, val);
    }
    return 0;
}
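A usage note (my addition, not from the original answer): the example above relies on knowing in advance that exactly half the elements are even. When the split is not known beforehand, the output vectors can be sized first with thrust::count_if and the same predicate; a small sketch:
#include <thrust/count.h>

// size the outputs from the actual number of even elements,
// replacing the fixed N/2 sizing used above
int numEvens = thrust::count_if(d_data.begin(), d_data.end(), is_even());
thrust::device_vector<int> d_evens(numEvens);
thrust::device_vector<int> d_odds(N - numEvens);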

cuda programming with pthread

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>

#define ARR_SIZE 10
#define NUM_DEVICE 1

typedef struct {
    int *arr;
    int *dev_arr;
    int *dev_result;
    int *result;
    int num;
} cuda_st;

__global__ void kernel_fc(int *dev_arr, int *dev_result)
{
    int idx = threadIdx.x;
    printf("dev_arr[%d] = %d\n", idx, dev_arr[idx]);
    atomicAdd(dev_result, dev_arr[idx]);
}

void *thread_func(void* struc)
{
    cuda_st * data = (cuda_st*)struc;
    printf("thread %d func start\n", data->num);
    printf("arr %d = ", data->num);
    for (int i = 0; i < 10; i++) {
        printf("%d ", data->arr[i]);
    }
    printf("\n");
    cudaSetDevice(data->num);
    cudaMemcpy(data->dev_arr, data->arr, sizeof(int) * ARR_SIZE, cudaMemcpyHostToDevice);
    kernel_fc<<<1, ARR_SIZE>>>(data->dev_arr, data->dev_result);
    cudaMemcpy(data->result, data->dev_result, sizeof(int), cudaMemcpyDeviceToHost);
    printf("thread %d func exit\n", data->num);
    return NULL;
}
int main(void)
{
    // Make objects
    cuda_st cuda[NUM_DEVICE];
    // Make threads
    pthread_t pthread[NUM_DEVICE];
    // Host array memory allocation
    int *arr[NUM_DEVICE];
    for (int i = 0; i < NUM_DEVICE; i++) {
        arr[i] = (int*)malloc(sizeof(int) * ARR_SIZE);
    }
    // Fill the host arrays with the specified data
    for (int i = 0; i < NUM_DEVICE; i++) {
        for (int j = 0; j < ARR_SIZE; j++) {
            arr[i][j] = i * ARR_SIZE + j;
        }
    }
    // To confirm the host array data
    for (int i = 0; i < NUM_DEVICE; i++) {
        printf("arr[%d] = ", i);
        for (int j = 0; j < ARR_SIZE; j++) {
            printf("%d ", arr[i][j]);
        }
        printf("\n");
    }
    // Result memory allocation
    int *result[NUM_DEVICE];
    for (int i = 0; i < NUM_DEVICE; i++) {
        result[i] = (int*)malloc(sizeof(int));
        memset(result[i], 0, sizeof(int));
    }
    // Device array memory allocation
    int *dev_arr[NUM_DEVICE];
    for (int i = 0; i < NUM_DEVICE; i++) {
        cudaMalloc(&dev_arr[i], sizeof(int) * ARR_SIZE);
    }
    // Device result memory allocation
    int *dev_result[NUM_DEVICE];
    for (int i = 0; i < NUM_DEVICE; i++) {
        cudaMalloc(&dev_result[i], sizeof(int));
        cudaMemset(dev_result[i], 0, sizeof(int));
    }
    // Connect these pointers with the objects
    for (int i = 0; i < NUM_DEVICE; i++) {
        cuda[i].arr = arr[i];
        cuda[i].dev_arr = dev_arr[i];
        cuda[i].result = result[i];
        cuda[i].dev_result = dev_result[i];
        cuda[i].num = i;
    }
    // Create and execute the pthreads
    for (int i = 0; i < NUM_DEVICE; i++) {
        pthread_create(&pthread[i], NULL, thread_func, (void*)&cuda[i]);
    }
    // Join the pthreads
    for (int i = 0; i < NUM_DEVICE; i++) {
        pthread_join(pthread[i], NULL);
    }
    for (int i = 0; i < NUM_DEVICE; i++) {
        printf("result[%d] = %d\n", i, (*cuda[i].result));
    }
    return 0;
}
I wrote this simple test program to try out pthreads with multi-device CUDA code.
When NUM_DEVICE is set to 1 it works well, but when it is set to 2 the program stops.
I guess it is because multiple threads access cudaSetDevice, but I don't know how to handle this.
I previously tried writing my program with a single host thread and multiple devices (using the Async functions), but in my real case (not the simple code above) there is a lot of host code between the kernel calls, so it does not work well asynchronously.
So I am testing multiple host threads before applying this approach to my real code, but I ran into this problem.
Do I have to use asynchronous functions for the CUDA calls and kernels?
Please give me some advice.
The problem is that you allocate all the memory on one device. You need to call cudaSetDevice before the cudaMalloc calls:
// Device array memory allocation
int *dev_arr[NUM_DEVICE];
for (int i = 0; i < NUM_DEVICE; i++) {
    cudaSetDevice(i);
    cudaMalloc(&dev_arr[i], sizeof(int) * ARR_SIZE);
}
// Device result memory allocation
int *dev_result[NUM_DEVICE];
for (int i = 0; i < NUM_DEVICE; i++) {
    cudaSetDevice(i);
    cudaMalloc(&dev_result[i], sizeof(int));
    cudaMemset(dev_result[i], 0, sizeof(int));
}
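cudaSetDevice selects the device per host thread, which is why the cudaSetDevice(data->num) call already inside thread_func takes care of the cudaMemcpy and kernel-launch side. As a small defensive addition (my sketch, not part of the original code), you can also verify that the machine actually exposes NUM_DEVICE GPUs before spawning the threads:
int deviceCount = 0;
cudaGetDeviceCount(&deviceCount);   // number of CUDA-capable devices present
if (deviceCount < NUM_DEVICE) {
    printf("need %d devices but only %d available\n", NUM_DEVICE, deviceCount);
    return 1;
}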

Complicated for loop to be ported to a CUDA kernel

I have the following nested loop and I would like to port it to CUDA to run on a GPU:
int current = 0;
int ptr = 0;
for (int i = 0; i < Nbeans; i++){
    for (int j = 0; j < NbeamletsPerbeam[i]; j++){
        current = j + ptr;
        for (int k = 0; k < Nmax; k++){
            ......
        }
        ptr += NbeamletsPerbeam[i];
    }
}
I would be very happy if anybody has an idea of how to do it, or how it can be done.
We are talking about Nbeams=5 and NbeamletsPerBeam around 200 each.
This is what I currently have, but I am not sure it is right...
for (int i = blockIdx.x; i < d_params->Nbeams; i += gridDim.x){
    for (int j = threadIdx.y; j < d_beamletsPerBeam[i]; j += blockDim.y){
        currentBeamlet = j + k;
        for (int ivoxel = threadIdx.x; ivoxel < totalVoxels; ivoxel += blockDim.x){
I would suggest this approach, though you might need to make some minor modifications based on your code.
dim3 blocks(NoOfThreads, 1);
dim3 grid(Nbeans, 1);
kernel<<<grid, blocks>>>();

__global__ void kernel()
{
    // one block per beam; the block's threads stride over that beam's beamlets
    int noOfBlocks = (NbeamletsPerbeam[blockIdx.x] + blockDim.x - 1) / blockDim.x;
    for (int j = 0; j < noOfBlocks; j++){
        // use threads and compute....
        int idx = j * blockDim.x + threadIdx.x;   // this thread's beamlet within the beam
        if (idx < NbeamletsPerbeam[blockIdx.x]) {
            current = idx + ptr;                  // ptr: this beam's fixed starting offset
            for (int k = 0; k < Nmax; k++){
                ......
            }
        }
    }
}
This should do the trick and give you better parallelization.
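One note on the design, plus a small hedged sketch that is not part of the answer above: since each block handles one beam, ptr should be that beam's fixed starting offset rather than a running total. It can be precomputed on the host as an exclusive prefix sum of NbeamletsPerbeam and copied to the device, so each block reads its own entry (names assumed for illustration):
// Hypothetical host-side helper: precompute each beam's starting offset
// as an exclusive prefix sum of NbeamletsPerbeam.
int offsets[Nbeams];
int running = 0;
for (int i = 0; i < Nbeams; i++){
    offsets[i] = running;
    running += NbeamletsPerbeam[i];
}
// copy offsets[] to the device; inside the kernel, ptr = offsets[blockIdx.x]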