Changing from for loop to multithreading in kernel - CUDA

I'm currently working on interpolation of a grid and am having some problems regarding multithreading. The code is supposed to read a map represented by a 2D (41x41) matrix and then interpolate it to increase the number of points by a factor of 100. When using for loops in the kernel, it works great.
Before interpolation: http://bildr.no/view/OWV1UDRO
After interpolation: http://bildr.no/view/eTlmNmpo
When I tried to replace the for loops with threads, it produced some weird results: instead of numbers, it filled the resulting matrix with -1.#QNAN.
Here's my working code with for loops in the kernel:
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <fstream>
#include "cuda.h"
using namespace std;
float Z[41][41];
// Macro to catch CUDA errors in CUDA runtime calls
#define CUDA_SAFE_CALL(call) \
do { \
cudaError_t err = call; \
if (cudaSuccess != err) { \
fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
__FILE__, __LINE__, cudaGetErrorString(err) ); \
exit(EXIT_FAILURE); \
} \
} while (0)
// Macro to catch CUDA errors in kernel launches
#define CHECK_LAUNCH_ERROR() \
do { \
/* Check synchronous errors, i.e. pre-launch */ \
cudaError_t err = cudaGetLastError(); \
if (cudaSuccess != err) { \
fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
__FILE__, __LINE__, cudaGetErrorString(err) ); \
exit(EXIT_FAILURE); \
} \
/* Check asynchronous errors, i.e. kernel failed (ULF) */ \
err = cudaDeviceSynchronize(); \
if (cudaSuccess != err) { \
fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
__FILE__, __LINE__, cudaGetErrorString( err) ); \
exit(EXIT_FAILURE); \
} \
} while (0)
texture<float, 2, cudaReadModeElementType> tex;
__global__ void kernel (int m, int n, float *f, float numberOfInterpolationsPerSquare)
{
    int k = sqrt(numberOfInterpolationsPerSquare);
    for (float i=0; i<n*k; i++)
    {
        for (float j=0; j<m*k; j++)
        {
            f[(int)(j+(m*k*i))] = tex2D (tex, j/k+0.5f, i/k+0.5f);
        }
    }
}
int main (void)
{
    // Start timer
    clock_t tStart = clock();
    // Size of map
    int n=41;
    int m=41;
    int g = 0;
    float numberOfInterpolationsPerSquare = 100;
    float numberOfElements = pow(sqrt(numberOfInterpolationsPerSquare)*n,2);
    size_t pitch, tex_ofs;
    float *f;
    float *r;
    float *map_d = 0;
    // Build read-streams
    ifstream map;
    // Create and open a txt file for MATLAB
    ofstream file;
    // Open data
    map.open("Map.txt", ios_base::in);
    file.open("Bilinear.txt");
    // Store the map in a 2D array
    for (int i=0; i<n; i++)
    {
        for (int j=0; j<m; j++)
        {
            map >> Z[i][j];
        }
    }
    // Allocate memory on host and device
    CUDA_SAFE_CALL(cudaMallocPitch((void**)&map_d,&pitch,n*sizeof(*map_d),m));
    CUDA_SAFE_CALL(cudaMalloc((void**)&f, numberOfElements*sizeof(float)));
    r = (float*)malloc(numberOfElements*sizeof(float));
    // Copy map from host to device
    CUDA_SAFE_CALL(cudaMemcpy2D(map_d, pitch, Z, n*sizeof(Z[0][0]), n*sizeof(Z[0][0]), m, cudaMemcpyHostToDevice));
    // Set texture mode to bilinear interpolation
    tex.normalized = false;
    tex.filterMode = cudaFilterModeLinear;
    // Bind the map to the texture
    CUDA_SAFE_CALL (cudaBindTexture2D (&tex_ofs, &tex, map_d, &tex.channelDesc, n, m, pitch));
    // Checking for offset
    if (tex_ofs != 0) {
        printf ("tex_ofs = %zu\n", tex_ofs);
        return EXIT_FAILURE;
    }
    // Launch kernel
    kernel <<< 1,1 >>> (m, n, f, numberOfInterpolationsPerSquare);
    CHECK_LAUNCH_ERROR();
    CUDA_SAFE_CALL (cudaDeviceSynchronize());
    // Copy result from device to host
    cudaMemcpy(r, f, numberOfElements*sizeof(float), cudaMemcpyDeviceToHost);
    // Write results to file
    for (int h=0; h<numberOfElements; h++)
    {
        if (g==sqrt(numberOfElements))
        {
            file << endl;
            g=0;
        }
        file << r[h] << " ";
        g++;
    }
    // Free memory
    CUDA_SAFE_CALL (cudaUnbindTexture (tex));
    CUDA_SAFE_CALL (cudaFree (map_d));
    CUDA_SAFE_CALL (cudaFree (f));
    free( r );
    // Print out execution time
    printf("Time taken: %.3fs\n", (double)(clock() - tStart)/CLOCKS_PER_SEC);
    return EXIT_SUCCESS;
}
Here's the kernel with multithreading, which doesn't work:
__global__ void kernel (int m, int n, float *f, float numberOfInterpolationsPerSquare)
{
    int k = sqrt(numberOfInterpolationsPerSquare);
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    if (i>=n*k || j>=m*k)
        return;
    f[(int)(j+(m*k*i))] = tex2D (tex, j/k+0.5f, i/k+0.5f);
}
Does anyone know why the multithreaded version doesn't work?
Regards
Sondre

In the second kernel, i and j are int instead of float, and k is an int as well, so j/k and i/k in the tex2D call result in integer division. Declare k as float to avoid the integer division.
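For reference, here is how the corrected kernel might look (a sketch of the fix just described, otherwise unchanged):
__global__ void kernel (int m, int n, float *f, float numberOfInterpolationsPerSquare)
{
    float k = sqrtf(numberOfInterpolationsPerSquare);  // float k keeps j/k and i/k floating-point
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    if (i >= n*k || j >= m*k)
        return;
    f[(int)(j+(m*k*i))] = tex2D (tex, j/k+0.5f, i/k+0.5f);
}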
Initially, the kernel was launched with the following configuration:
//Find number of blocks
int nthreads = 1024;
int blocksize = 512;
int nblocks = ceil( (n*m*numberOfInterpolationsPerSquare) / nthreads);
// Launch Kernel
kernel <<< nblocks,blocksize >>> (m, n, f, numberOfInterpolationsPerSquare);
The problem with the above code is that it launches a 1D grid of 1D blocks, while the kernel uses 2D indexing. A 2D grid/block configuration is required for the kernel to work correctly. From the looks of the kernel code, the following grid/block configuration, in which each thread handles exactly one output element, should work:
float k = sqrt(numberOfInterpolationsPerSquare);
const int threads_x = (int)ceil(n * k);
const int threads_y = (int)ceil(m * k);
const dim3 dimBlock(16,16);
dim3 dimGrid;
dimGrid.x = (threads_x + dimBlock.x - 1)/dimBlock.x;
dimGrid.y = (threads_y + dimBlock.y - 1)/dimBlock.y;
kernel<<<dimGrid,dimBlock>>>(m, n, f, numberOfInterpolationsPerSquare);

Related

Is there a function in the cublas that can apply the sigmoid function with a vector?

As the title says, I want to do an element-wise operation on a vector with a function. I wonder, is there any function in the cublas library to do that?
I am not aware of a suitable CUBLAS function that can assist in the task. However, you can easily write your own code that applies the sigmoid function, or any other single-argument function for that matter, element-wise to a vector. Note that such code would be memory-bound rather than compute-bound in most circumstances. See the CUDA program below for a worked example, in particular sigmoid_kernel(). The output of the program should look something like this:
source[0]= 0.0000000000000000e+000 source[99999]= 9.9999000000000005e-001
result[0]= 5.0000000000000000e-001 result[99999]= 7.3105661250612963e-001
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#define DEFAULT_LEN 100000
// Macro to catch CUDA errors in CUDA runtime calls
#define CUDA_SAFE_CALL(call) \
do { \
cudaError_t err = call; \
if (cudaSuccess != err) { \
fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
__FILE__, __LINE__, cudaGetErrorString(err) ); \
exit(EXIT_FAILURE); \
} \
} while (0)
// Macro to catch CUDA errors in kernel launches
#define CHECK_LAUNCH_ERROR() \
do { \
/* Check synchronous errors, i.e. pre-launch */ \
cudaError_t err = cudaGetLastError(); \
if (cudaSuccess != err) { \
fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
__FILE__, __LINE__, cudaGetErrorString(err) ); \
exit(EXIT_FAILURE); \
} \
/* Check asynchronous errors, i.e. kernel failed (ULF) */ \
err = cudaDeviceSynchronize(); \
if (cudaSuccess != err) { \
fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
__FILE__, __LINE__, cudaGetErrorString( err) ); \
exit(EXIT_FAILURE); \
} \
} while (0)
__device__ __forceinline__ double sigmoid (double a)
{
    return 1.0 / (1.0 + exp (-a));
}
__global__ void sigmoid_kernel (const double * __restrict__ src,
                                double * __restrict__ dst, int len)
{
    int stride = gridDim.x * blockDim.x;
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    for (int i = tid; i < len; i += stride) {
        dst[i] = sigmoid (src[i]);
    }
}
int main (void)
{
    double *source, *result;
    double *d_a = 0, *d_b = 0;
    int len = DEFAULT_LEN;
    /* Allocate memory on host */
    source = (double *)malloc (len * sizeof (source[0]));
    if (!source) return EXIT_FAILURE;
    result = (double *)malloc (len * sizeof (result[0]));
    if (!result) return EXIT_FAILURE;
    /* create source data */
    for (int i = 0; i < len; i++) source [i] = i * 1e-5;
    /* spot check of source data */
    printf ("source[0]=% 23.16e source[%d]=% 23.16e\n",
            source[0], len-1, source[len-1]);
    /* Allocate memory on device */
    CUDA_SAFE_CALL (cudaMalloc((void**)&d_a, sizeof(d_a[0]) * len));
    CUDA_SAFE_CALL (cudaMalloc((void**)&d_b, sizeof(d_b[0]) * len));
    /* Push source data to device */
    CUDA_SAFE_CALL (cudaMemcpy (d_a, source, sizeof(d_a[0]) * len,
                                cudaMemcpyHostToDevice));
    /* Compute execution configuration */
    dim3 dimBlock(256);
    int threadBlocks = (len + (dimBlock.x - 1)) / dimBlock.x;
    if (threadBlocks > 65520) threadBlocks = 65520;
    dim3 dimGrid(threadBlocks);
    sigmoid_kernel<<<dimGrid,dimBlock>>>(d_a, d_b, len);
    CHECK_LAUNCH_ERROR();
    /* retrieve results from device */
    CUDA_SAFE_CALL (cudaMemcpy (result, d_b, sizeof (result[0]) * len,
                                cudaMemcpyDeviceToHost));
    /* spot check of results */
    printf ("result[0]=% 23.16e result[%d]=% 23.16e\n",
            result[0], len-1, result[len-1]);
    /* free memory on host and device */
    CUDA_SAFE_CALL (cudaFree(d_a));
    CUDA_SAFE_CALL (cudaFree(d_b));
    free (result);
    free (source);
    return EXIT_SUCCESS;
}
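Since the same approach works for any single-argument function, the kernel above generalizes naturally to a device-side functor. Here is a sketch of that generalization (not part of the original program; Sigmoid and map_kernel are illustrative names):
// Generic element-wise map: applies any device functor to each element,
// using the same grid-stride loop as sigmoid_kernel above.
struct Sigmoid {
    __device__ double operator()(double a) const { return 1.0 / (1.0 + exp (-a)); }
};
template <typename F>
__global__ void map_kernel (const double * __restrict__ src,
                            double * __restrict__ dst, int len, F op)
{
    int stride = gridDim.x * blockDim.x;
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    for (int i = tid; i < len; i += stride) {
        dst[i] = op (src[i]);
    }
}
// usage, with the same execution configuration as above:
// map_kernel<<<dimGrid,dimBlock>>>(d_a, d_b, len, Sigmoid());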

How can I check the progress of matrix multiplication?

I now only need to show the intermediate progress of a matrix multiplication:
for (unsigned int col=0; col<mtxSize; col++) {
    unsigned tmp = 0;
    for (unsigned int row=0; row<mtxSize; row++) {
        for (unsigned int idx=0; idx<mtxSize; idx++) {
            tmp += h_A[col*mtxSize+idx] * h_B[idx*mtxSize+row];
        }
        h_Rs[col*mtxSize+row] = tmp;
        tmp = 0;
        int rate_tmp = (col*mtxSize + (row+1))*100;
        // Maybe like this...
        fprintf(stdout, "Progress : %d.%d %%\r", rate_tmp/actMtxSize, rate_tmp%actMtxSize);
        fflush(stdout);
    }
}
In the case of the host code (using the CPU), it is very easy because it processes sequentially, so we can check progress very easily.
But in the case of the GPU, which processes in parallel, what should I do?
Once the kernel is running, it does not return until the kernel execution finishes.
So I can't check mid-computation data during the kernel execution.
I think I need to use an asynchronous kernel call, but I do not know that technique well.
And even if an asynchronous kernel call is used, to see all of the data across the blocks on the processors, do I have to use atomicAdd() (in other words, global memory access), which carries some overhead?
Please give me some advice or a hint, specifically for the CUDA case.
Here is a code example which demonstrates how to check progress from a matrix multiply kernel:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define TIME_INC 100000000
#define INCS 10
#define USE_PROGRESS 1
#define MAT_DIMX 4000
#define MAT_DIMY MAT_DIMX
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
__global__ void mykernel(volatile int *data){
    unsigned long time;
    for (int i = 0; i < INCS; i++){
        atomicAdd((int *)data,1);
        __threadfence_system();
        time = clock64();
        while((clock64() - time)<TIME_INC) {};
    }
    printf("progress check finished\n");
}
__global__ void matmult(float *a, float *b, float *c, unsigned int rowA, unsigned int colA, unsigned int colB, volatile int *progress){
    unsigned int row = threadIdx.x+blockDim.x*blockIdx.x;
    unsigned int col = threadIdx.y+blockDim.y*blockIdx.y;
    if ((row < rowA) && (col < colB)){
        float temp = 0.0f;
        for (unsigned int k = 0; k < colA; k++)
            temp += a[(row*colA)+k] * b[(k*colB) + col];
        c[(row*colB)+col] = temp;
#if USE_PROGRESS
        if (!(threadIdx.x || threadIdx.y)){
            atomicAdd((int *)progress, 1);
            __threadfence_system();
        }
#endif
    }
}
int main(){
    // simple test to demonstrate reading progress data from kernel
    volatile int *d_data, *h_data;
    cudaSetDeviceFlags(cudaDeviceMapHost);
    cudaCheckErrors("cudaSetDeviceFlags error");
    cudaHostAlloc((void **)&h_data, sizeof(int), cudaHostAllocMapped);
    cudaCheckErrors("cudaHostAlloc error");
    cudaHostGetDevicePointer((int **)&d_data, (int *)h_data, 0);
    cudaCheckErrors("cudaHostGetDevicePointer error");
    *h_data = 0;
    printf("kernel starting\n");
    mykernel<<<1,1>>>(d_data);
    cudaCheckErrors("kernel fail");
    int value = 0;
    do {
        int value1 = *h_data;
        if (value1 > value){
            printf("h_data = %d\n", value1);
            value = value1;
        }
    } while (value < (INCS-1));
    cudaDeviceSynchronize();
    cudaCheckErrors("kernel fail 2");
    // now try matrix multiply with progress
    float *h_c, *d_a, *d_b, *d_c;
    h_c = (float *)malloc(MAT_DIMX*MAT_DIMY*sizeof(float));
    if (h_c == NULL) {printf("malloc fail\n"); return 1;}
    cudaMalloc((void **)&d_a, MAT_DIMX*MAT_DIMY*sizeof(float));
    cudaCheckErrors("cudaMalloc a fail");
    cudaMalloc((void **)&d_b, MAT_DIMX*MAT_DIMY*sizeof(float));
    cudaCheckErrors("cudaMalloc b fail");
    cudaMalloc((void **)&d_c, MAT_DIMX*MAT_DIMY*sizeof(float));
    cudaCheckErrors("cudaMalloc c fail");
    for (int i = 0; i < MAT_DIMX*MAT_DIMY; i++) h_c[i] = rand()/(float)RAND_MAX;
    cudaMemcpy(d_a, h_c, MAT_DIMX*MAT_DIMY*sizeof(float), cudaMemcpyHostToDevice);
    cudaCheckErrors("cudaMemcpy a fail");
    cudaMemcpy(d_b, h_c, MAT_DIMX*MAT_DIMY*sizeof(float), cudaMemcpyHostToDevice);
    cudaCheckErrors("cudaMemcpy b fail");
    cudaEvent_t start, stop;
    cudaEventCreate(&start); cudaEventCreate(&stop);
    *h_data = 0;
    dim3 block(16,16);
    dim3 grid(((MAT_DIMX+block.x-1)/block.x), ((MAT_DIMY+block.y-1)/block.y));
    printf("matrix multiply kernel starting\n");
    cudaEventRecord(start);
    matmult<<<grid,block>>>(d_a, d_b, d_c, MAT_DIMY, MAT_DIMX, MAT_DIMX, d_data);
    cudaEventRecord(stop);
#if USE_PROGRESS
    unsigned int num_blocks = grid.x*grid.y;
    float my_progress = 0.0f;
    value = 0;
    printf("Progress:\n");
    do {
        cudaEventQuery(stop); // may help WDDM scenario
        int value1 = *h_data;
        float kern_progress = (float)value1/(float)num_blocks;
        if ((kern_progress - my_progress) > 0.1f) {
            printf("percent complete = %2.1f\n", (kern_progress*100));
            my_progress = kern_progress;
        }
    } while (my_progress < 0.9f);
    printf("\n");
#endif
    cudaEventSynchronize(stop);
    cudaCheckErrors("event sync fail");
    float et;
    cudaEventElapsedTime(&et, start, stop);
    cudaCheckErrors("event elapsed time fail");
    cudaDeviceSynchronize();
    cudaCheckErrors("mat mult kernel fail");
    printf("matrix multiply finished. elapsed time = %f milliseconds\n", et);
    return 0;
}
The code associated with the first kernel call is just to demonstrate the basic idea of having a kernel report its progress back.
The second part of the code shows a sample, naive matrix multiply on the GPU, with the GPU reporting its progress back. I have included the ability to remove the progress check code via a preprocessor macro, as well as the ability to time the matrix multiply kernel. For the case I have here, there was no discernible difference in timing with or without the progress code. So while the progress reporting code probably does add some overhead, compared to the scope of a reasonably sized matrix multiply kernel it adds no significant time that I can see.
Some other uses of signalling are discussed here
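Boiled down, the progress-reporting pattern in the code above amounts to the following minimal sketch (worker is a placeholder name, and a production version would also poll for errors rather than spin unconditionally):
#include <stdio.h>
#include <cuda_runtime.h>
// Each block ticks a zero-copy counter once; the host polls the mapped
// host pointer while the kernel is still running.
__global__ void worker(volatile int *progress)
{
    // ... real per-block work would go here ...
    if (threadIdx.x == 0){
        atomicAdd((int *)progress, 1);  // one tick per finished block
        __threadfence_system();         // make the tick visible to the host
    }
}
int main(){
    volatile int *h_flag, *d_flag;
    cudaSetDeviceFlags(cudaDeviceMapHost);
    cudaHostAlloc((void **)&h_flag, sizeof(int), cudaHostAllocMapped);
    cudaHostGetDevicePointer((int **)&d_flag, (int *)h_flag, 0);
    *h_flag = 0;
    int nblocks = 1024;
    worker<<<nblocks,256>>>(d_flag);
    int last = 0;
    while (last < nblocks){    // poll until every block has reported
        int now = *h_flag;
        if (now > last){ printf("%d/%d blocks done\n", now, nblocks); last = now; }
    }
    cudaDeviceSynchronize();
    return 0;
}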

Is it possible to use thrust::device_ptr on a mapped array?

I am trying to use the thrust::copy_if function on mapped memory. However, I get a runtime error and have not been able to find its cause, so before spending a lot of time debugging, I would like confirmation that it is actually allowed to pass a pointer to a mapped memory location to the thrust::device_ptr wrapper.
Here is an example of what I mean:
int size=1024;
int* v_locked;
int* v_device;
int* stencil_device;
device_ptr<int> v_wrapper;
device_ptr<int> v_wrapper_end;
device_ptr<int> stencil_wrapper;
cudaHostAlloc((void**)&v_locked, size*sizeof(int), cudaHostAllocMapped);
cudaHostGetDevicePointer(&v_device, &v_locked, 0);
cudaMalloc((void**)&stencil_device, size*sizeof(int));
/*
kernel assigning stencil_device elements ...
*/
v_wrapper = device_pointer_cast(v_device);
stencil_wrapper = device_pointer_cast(stencil_device);
v_wrapper_end = copy_if(make_counting_iterator<int>(0), make_counting_iterator<int>(size), stencil_wrapper, v_wrapper, _1 == 1);
Is this a correct usage of mapped memory with the thrust library?
Thank you.
Yes, it is possible.
I believe there were several problems with your code.
You don't appear to be doing any proper cuda error checking. If you were, you would have detected that although your calls to cudaHostGetDevicePointer seem to compile correctly, they were not set up correctly.
As mentioned above, your calls to cudaHostGetDevicePointer() were not set up correctly. The second pointer argument is passed as a single pointer (*), not a double pointer (**). Refer to the documentation. This call as written would throw a cuda runtime error which you can trap.
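For reference, the corrected call (it appears again in the fixed code below) is:
cudaHostGetDevicePointer(&v_device, v_locked, 0);  // pass v_locked itself, not &v_locked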
Prior to your cudaHostAlloc calls, you should use the cudaSetDeviceFlags(cudaDeviceMapHost); call to enable this feature.
Here is sample code which seems to work correctly for me and has the above problems fixed:
$ cat t281.cu
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/copy.h>
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
template <typename T>
struct is_one : thrust::unary_function<T, bool>
{
    __host__ __device__
    bool operator()(const T &x)
    {
        return (x==1);
    }
};
int main(){
    int size=1024;
    int* v_locked;
    int* v_device;
    int* stencil_locked;
    int* stencil_device;
    cudaSetDeviceFlags(cudaDeviceMapHost);
    cudaCheckErrors("cudaSetDeviceFlags");
    cudaHostAlloc((void**)&v_locked, size*sizeof(int), cudaHostAllocMapped);
    cudaCheckErrors("cudaHostAlloc 1");
    cudaHostGetDevicePointer(&v_device, v_locked, 0);
    cudaCheckErrors("cudaHostGetDevicePointer 1");
    cudaHostAlloc((void**)&stencil_locked, size*sizeof(int), cudaHostAllocMapped);
    cudaCheckErrors("cudaHostAlloc 2");
    cudaHostGetDevicePointer(&stencil_device, stencil_locked, 0);
    cudaCheckErrors("cudaHostGetDevicePointer 2");
    for (int i = 0; i < size; i++){
        v_locked[i] = i;
        stencil_locked[i] = i%2;
    }
    thrust::device_ptr<int> v_wrapper = thrust::device_pointer_cast(v_device);
    thrust::device_ptr<int> stencil_wrapper = thrust::device_pointer_cast(stencil_device);
    thrust::device_ptr<int> v_wrapper_end = v_wrapper + size;
    thrust::device_vector<int> result(size);
    thrust::device_vector<int>::iterator result_end = copy_if(v_wrapper, v_wrapper_end, stencil_wrapper, result.begin(), is_one<int>());
    int result_size = result_end - result.begin();
    thrust::host_vector<int> h_result(result_size);
    thrust::copy_n(result.begin(), result_size, h_result.begin());
    thrust::copy_n(h_result.begin(), 10, std::ostream_iterator<int>(std::cout, " "));
    std::cout << std::endl;
    return 0;
}
$ nvcc -arch=sm_20 -o t281 t281.cu
$ ./t281
1 3 5 7 9 11 13 15 17 19
$

How to configure cublas{t}symm() function arguments

This function performs symmetric matrix-matrix multiplication using CUDA. Although I succeeded in using the nonsymmetric version cublas{t}gemm(), I couldn't use the cublas{t}symm() function properly.
I know that the CUBLAS library uses column-major matrix storage. I am using row-major C/C++ matrices, and I know how to solve this issue for cublas{t}gemm() by swapping the input matrices, etc. However, I couldn't solve it for the symmetric case. The problem is that even if I use column-major matrix storage, I get unexpected results. The matrices contain complex floats (cuComplex). I assume I have row-major matrices. Here is the code and the output:
// Matrix multiplication: C = A * B.
// Host code.
//
// Utilities and system includes
#include <assert.h>
#include <helper_string.h> // helper for shared functions common to CUDA SDK samples
// CUDA runtime
#include <cuda_runtime.h>
#include <cublas_v2.h>
#ifndef min
#define min(a,b) ((a < b) ? a : b)
#endif
#ifndef max
#define max(a,b) ((a > b) ? a : b)
#endif
////////////////////////////////////////////////////////////////////////////////
// These are CUDA Helper functions (in addition to helper_cuda.h)
void inline checkError(cublasStatus_t status, const char *msg)
{
if (status != CUBLAS_STATUS_SUCCESS)
{
printf("%s", msg);
exit(EXIT_FAILURE);
}
}
// end of CUDA Helper Functions
// Allocates a matrix with random float entries.
void randomCmplxInit(cuComplex *data, int size)
{
for (int i = 0; i < size; ++i)
data[i] = make_cuComplex( rand() / (float)RAND_MAX, rand() / (float)RAND_MAX);
}
//void initializeCUDA(int argc, char **argv, int &devID, int &iSizeMultiple, sMatrixSize &matrix_size)
void initializeCUDA(int argc, char **argv, int &devID)
{
// By default, we use device 0, otherwise we override the device ID based on what is provided at the command line
cudaError_t error;
devID = 0;
int m,n,k;
if (checkCmdLineFlag(argc, (const char **)argv, "device"))
{
devID = getCmdLineArgumentInt(argc, (const char **)argv, "device");
error = cudaSetDevice(devID);
if (error != cudaSuccess)
{
printf("cudaSetDevice returned error code %d, line(%d)\n", error, __LINE__);
exit(EXIT_FAILURE);
}
}
// get number of SMs on this GPU
error = cudaGetDevice(&devID);
cudaDeviceProp deviceProp;
error = cudaGetDeviceProperties(&deviceProp, devID);
printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);
// use a larger block size for Fermi and above
int block_size = (deviceProp.major < 2) ? 16 : 32;
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test matrix multiply using CUBLAS
////////////////////////////////////////////////////////////////////////////////
int matrixMultiply(int argc, char **argv, int devID)
{
int i,j;
unsigned int m,n,k;
cudaDeviceProp deviceProp;
cudaError_t error;
error = cudaGetDeviceProperties(&deviceProp, devID);
if (error != cudaSuccess)
{
printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
exit(EXIT_FAILURE);
}
// use a larger block size for Fermi and above
int block_size = (deviceProp.major < 2) ? 16 : 32;
m=3; //number of rows of matrix op(A) and C. A--> (m x k)
n=2; //number of columns of matrix op(B) and C. B--> (k x n)
k=m; //number of columns of op(A) and rows of op(B). C--> (m x n)
// I want to compute C = A*B in row-major format,
//so I must find C(T)=B(T)A(T) = C(T)A in column-major format
// allocate host memory for matrices A and B
unsigned int size_A = m*(m+1)/2; //size of a symmetric matrix
unsigned int mem_size_A = sizeof(cuComplex) * size_A;
cuComplex *h_A = (cuComplex *)malloc(mem_size_A);
unsigned int size_B = m*n;
unsigned int mem_size_B = sizeof(cuComplex) * size_B;
cuComplex *h_B = (cuComplex *)malloc(mem_size_B);
// initialize host memory
for (i = 0; i < size_A; ++i)
h_A[i] = make_cuComplex( (float)(i+1),(float)0);
for (i = 0; i < size_B; ++i)
h_B[i] = make_cuComplex((float)(i+2), (float)0);
// allocate device memory
cuComplex *d_A, *d_B, *d_C;
unsigned int size_C = m*n;
unsigned int mem_size_C = sizeof(cuComplex) * size_C;
// allocate host memory for the result
cuComplex *h_C = (cuComplex *) malloc(mem_size_C);
cuComplex *h_CUBLAS = (cuComplex *) malloc(mem_size_C);
error = cudaMalloc((void **) &d_A, mem_size_A);
error = cudaMalloc((void **) &d_B, mem_size_B);
// copy host memory to device
error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);
error = cudaMalloc((void **) &d_C, mem_size_C);
// setup execution parameters
dim3 threads(block_size, block_size);
dim3 grid(n / threads.x, m / threads.y);
// create and start timer
printf("Computing result using CUBLAS...");
// CUBLAS version 2.0
{
cublasHandle_t handle;
cublasStatus_t ret;
ret = cublasCreate(&handle);
if (ret != CUBLAS_STATUS_SUCCESS)
{
printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__);
exit(EXIT_FAILURE);
}
const cuComplex alpha = make_cuComplex(1.0f,0.0f);
const cuComplex beta = make_cuComplex(0.0f,0.0f);
//Perform operation with cublas
ret = cublasCsymm(handle, CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER, n,m,&alpha,d_A,m,d_B,m,&beta,d_C,m);
// copy result from device to host
error = cudaMemcpy(h_CUBLAS, d_C, mem_size_C, cudaMemcpyDeviceToHost);
checkError(cublasDestroy(handle), "cublasDestroy() error!\n");
}
printf ("\nComputations completed.\n\n");
printf (" symm matrix A: \n");
int s=0;
for (i=0; i<min(m,4); i++) {
for (j=0; j<=i; j++) {
//printf ("%7.5G + j(%7.5G)", h_A[j+i*k].x,h_A[j+i*k].y);
printf ("%7.5G", h_A[s].x);
s++;
}
printf ("\n");
}
printf ("\n matrix B: \n");
for (i=0; i<min(k,4); i++) {
for (j=0; j<min(n,4); j++) {
//printf ("%7.5G + j(%7.5G)", h_B[j+i*n].x,h_B[j+i*n].y);
printf ("%7.5G", h_B[j+i*n].x);
}
printf ("\n");
}
printf ("\n matrix C=A*B: \n");
for (i=0; i<min(m,4); i++) {
for (j=0; j<min(n,4); j++) {
//printf ("%7.5G + j(%7.5G)", h_CUBLAS[j+i*n].x,h_CUBLAS[j+i*n].y);
printf ("%7.5G", h_CUBLAS[j+i*n].x);
}
printf ("\n");
}
// clean up memory
free(h_A);
free(h_B);
free(h_C);
//free(reference);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
cudaDeviceReset();
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
printf("[Matrix Multiply CUBLAS] - Starting...\n");
int devID = 0, sizeMult = 5;
initializeCUDA(argc, argv, devID);
int matrix_result = matrixMultiply(argc, argv, devID);
}
I suppose that I have the following matrices for the multiplication:
A =
1 2 4
2 3 5
4 5 6
B =
2 3
4 5
6 7
and expect to obtain
A*B =
34 41
46 56
64 79
But the obtained OUTPUT is as follows:
symm matrix A:
1
2 3
4 5 6
matrix B:
2 3
4 5
6 7
matrix C=A*B:
78 90
74 97
114 146
What am I missing in this code? Probably the arguments of the "cublasCsymm" function are wrong.
Thanks,
Kagan
EDIT:
Based on questions posed below, I elected to re-work my answer and example code.
You can handle row-major storage without transposing, at least for these operations. This observation is further supported by the fact that the symm function does not use packed storage.
So to answer the additional questions:
the cublasCsymm function does not use a packed storage format (like some other functions such as cublasCspmv for example), because the cublasCsymm function is intended to duplicate the functionality of the corresponding netlib function, which also does not use a packed storage format. Based on my review of the cublas API, I don't see a symmetric-packed-storage matrix-matrix multiply function available.
You can use row-major storage (e.g. C-style) with cublas, without transposing, at least for these operations (matrix-matrix multiply, without packed storage) by following the advice given here. The key observation: cublas sees a row-major array as the transpose of the intended matrix, so the row-major product C = A*B can be obtained by computing C(T) = B(T)*A(T) in column-major terms; for symmetric A this just means flipping the side and fill-mode arguments.
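To make the "packed storage" point in item 1 concrete: packed ("AP") storage keeps only the filled triangle contiguously, which is the layout the question's m*(m+1)/2 allocation was implicitly assuming. A sketch of the indexing for a lower triangle packed in column-major order (ap_index_lower is an illustrative helper, not a cublas function):
// Column j contributes (n - j) elements, so element (i, j) with i >= j
// lives at AP[i + j*n - j*(j+1)/2]. cublasCsymm does NOT use this layout;
// it expects a full n x n array and ignores the unfilled triangle.
__host__ __device__ int ap_index_lower(int i, int j, int n)
{
    return i + j*n - j*(j+1)/2;   // valid for i >= j
}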
What follows is a re-worked version of my previous example, incorporating the information in item 2 above.
// Matrix multiplication: C = A * B.
// Host code.
//
// Utilities and system includes
#include <assert.h>
#include <helper_string.h> // helper for shared functions common to CUDA SDK samples
// CUDA runtime
#include <cuda_runtime.h>
#include <cublas_v2.h>
// error check macros
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
// for CUBLAS V2 API
#define cublasCheckErrors(fn) \
do { \
cublasStatus_t __err = fn; \
if (__err != CUBLAS_STATUS_SUCCESS) { \
fprintf(stderr, "Fatal cublas error: %d (at %s:%d)\n", \
(int)(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
#ifndef min
#define min(a,b) ((a < b) ? a : b)
#endif
#ifndef max
#define max(a,b) ((a > b) ? a : b)
#endif
////////////////////////////////////////////////////////////////////////////////
// These are CUDA Helper functions (in addition to helper_cuda.h)
void inline checkError(cublasStatus_t status, const char *msg)
{
if (status != CUBLAS_STATUS_SUCCESS)
{
printf("%s", msg);
exit(EXIT_FAILURE);
}
}
// end of CUDA Helper Functions
// Allocates a matrix with random float entries.
void randomCmplxInit(cuComplex *data, int size)
{
for (int i = 0; i < size; ++i)
data[i] = make_cuComplex( rand() / (float)RAND_MAX, rand() / (float)RAND_MAX);
}
//void initializeCUDA(int argc, char **argv, int &devID, int &iSizeMultiple, sMatrixSize &matrix_size)
void initializeCUDA(int argc, char **argv, int &devID)
{
// By default, we use device 0, otherwise we override the device ID based on what is provided at the command line
cudaError_t error;
devID = 0;
if (checkCmdLineFlag(argc, (const char **)argv, "device"))
{
devID = getCmdLineArgumentInt(argc, (const char **)argv, "device");
error = cudaSetDevice(devID);
if (error != cudaSuccess)
{
printf("cudaSetDevice returned error code %d, line(%d)\n", error, __
LINE__);
exit(EXIT_FAILURE);
}
}
// get number of SMs on this GPU
error = cudaGetDevice(&devID);
cudaDeviceProp deviceProp;
error = cudaGetDeviceProperties(&deviceProp, devID);
printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, dev
iceProp.name, deviceProp.major, deviceProp.minor);
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test matrix multiply using CUBLAS
////////////////////////////////////////////////////////////////////////////////
int matrixMultiply(int argc, char **argv, int devID)
{
int i,j;
unsigned int m,n,k;
cudaDeviceProp deviceProp;
cudaError_t error;
error = cudaGetDeviceProperties(&deviceProp, devID);
if (error != cudaSuccess)
{
printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
exit(EXIT_FAILURE);
}
// use a larger block size for Fermi and above
m=3; //number of rows of matrix op(A) and C. A--> (m x k)
n=2; //number of columns of matrix op(B) and C. B--> (k x n)
k=m; //number of columns of op(A) and rows of op(B). C--> (m x n)
// I want to compute C = A*B in row-major format,
//so I must find C(T)=B(T)A(T) = C(T)A in column-major format
// allocate host memory for matrices A and B
unsigned int size_A = m*m; //size of a symmetric matrix
printf("size_A = %d\n", size_A);
unsigned int mem_size_A = sizeof(cuComplex) * size_A;
cuComplex *h_A = (cuComplex *)malloc(mem_size_A);
unsigned int size_B = m*n;
unsigned int mem_size_B = sizeof(cuComplex) * size_B;
cuComplex *h_B = (cuComplex *)malloc(mem_size_B);
// initialize host memory
// for (i = 0; i < size_A; ++i)
// h_A[i] = make_cuComplex( (float)(i+1),(float)0);
h_A[0] = make_cuComplex((float)1, (float)0);
h_A[1] = make_cuComplex((float)2, (float)0);
h_A[2] = make_cuComplex((float)4, (float)0);
h_A[3] = make_cuComplex((float)0, (float)0);
h_A[4] = make_cuComplex((float)3, (float)0);
h_A[5] = make_cuComplex((float)5, (float)0);
h_A[6] = make_cuComplex((float)0, (float)0);
h_A[7] = make_cuComplex((float)0, (float)0);
h_A[8] = make_cuComplex((float)6, (float)0);
// for (i = 0; i < size_B; ++i)
// h_B[i] = make_cuComplex((float)(i+2), (float)0);
h_B[0] = make_cuComplex((float)2, (float)0);
h_B[1] = make_cuComplex((float)3, (float)0);
h_B[2] = make_cuComplex((float)4, (float)0);
h_B[3] = make_cuComplex((float)5, (float)0);
h_B[4] = make_cuComplex((float)6, (float)0);
h_B[5] = make_cuComplex((float)7, (float)0);
// allocate device memory
cuComplex *d_A, *d_B, *d_C;
unsigned int size_C = m*n;
unsigned int mem_size_C = sizeof(cuComplex) * size_C;
// allocate host memory for the result
cuComplex *h_C = (cuComplex *) malloc(mem_size_C);
cuComplex *h_CUBLAS = (cuComplex *) malloc(mem_size_C);
error = cudaMalloc((void **) &d_A, mem_size_A);
error = cudaMalloc((void **) &d_B, mem_size_B);
// copy host memory to device
error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);
error = cudaMalloc((void **) &d_C, mem_size_C);
// create and start timer
printf("Computing result using CUBLAS...");
// CUBLAS version 2.0
{
cublasHandle_t handle;
cublasStatus_t ret;
ret = cublasCreate(&handle);
if (ret != CUBLAS_STATUS_SUCCESS)
{
printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__);
exit(EXIT_FAILURE);
}
const cuComplex alpha = make_cuComplex(1.0f,0.0f);
const cuComplex beta = make_cuComplex(0.0f,0.0f);
//Perform operation with cublas
ret = cublasCsymm(handle, CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER, n,m,&alpha,d_A,m,d_B,n,&beta,d_C,n);
if (ret != CUBLAS_STATUS_SUCCESS)
{
printf("cublasCsymm returned error code %d, line(%d)\n", ret, __LINE__);
exit(EXIT_FAILURE);
}
// copy result from device to host
error = cudaMemcpy(h_CUBLAS, d_C, mem_size_C, cudaMemcpyDeviceToHost);
checkError(cublasDestroy(handle), "cublasDestroy() error!\n");
}
printf ("\nComputations completed.\n\n");
printf (" symm matrix A: \n");
// int s=0;
for (i=0; i<min(m,4); i++) {
for (j=0; j<min(m,4); j++) {
//printf ("%7.5G + j(%7.5G)", h_A[j+i*k].x,h_A[j+i*k].y);
// printf ("%7.5G", h_A[s].x);
printf ("%7.5G", h_A[j+(i*m)].x);
// s++;
}
printf ("\n");
}
printf ("\n matrix B: \n");
for (i=0; i<min(k,4); i++) {
for (j=0; j<min(n,4); j++) {
//printf ("%7.5G + j(%7.5G)", h_B[j+i*n].x,h_B[j+i*n].y);
printf ("%7.5G", h_B[j+(i*n)].x);
}
printf ("\n");
}
printf ("\n matrix C=A*B: \n");
for (i=0; i<min(m,4); i++) {
for (j=0; j<min(n,4); j++) {
//printf ("%7.5G + j(%7.5G)", h_CUBLAS[j+i*n].x,h_CUBLAS[j+i*n].y);
printf ("%7.5G", h_CUBLAS[j+(i*n)].x);
}
printf ("\n");
}
// clean up memory
free(h_A);
free(h_B);
free(h_C);
//free(reference);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
cudaDeviceReset();
return 0;
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
printf("[Matrix Multiply CUBLAS] - Starting...\n");
int devID = 0;
initializeCUDA(argc, argv, devID);
int matrix_result = matrixMultiply(argc, argv, devID);
cudaCheckErrors("some error");
return 0;
}
$ ./t213
[Matrix Multiply CUBLAS] - Starting...
GPU Device 0: "Tesla M2070" with compute capability 2.0
size_A = 9
Computing result using CUBLAS...
Computations completed.
symm matrix A:
1 2 4
0 3 5
0 0 6
matrix B:
2 3
4 5
6 7
matrix C=A*B:
34 41
46 56
64 79
$
ORIGINAL RESPONSE:
Several problems:
When I run your code as you have it posted right now, I don't get the results that you show. Here's what I get:
[Matrix Multiply CUBLAS] - Starting...
GPU Device 0: "Tesla M2070" with compute capability 2.0
Computing result using CUBLAS...
Computations completed.
symm matrix A:
1
2 3
4 5 6
matrix B:
2 3
4 5
6 7
matrix C=A*B:
-131 -128
260 -122
-115 266
The code compiles with a number of warnings, and you're also not doing proper error checking (for example, you're not checking the return value from cublasCsymm).
You want to multiply C = A*B. This means A is on the LEFT,
but you are passing CUBLAS_SIDE_RIGHT to cublasCsymm. Several other cublasCsymm parameters were wrong as well. I think maybe you thought you could do A*B as (B(T)*A(T)), but that only works for square matrices. Not sure what you were thinking, exactly.
You have row-major storage on your matrices and are passing them to cublas, which interprets them in column-major order. For the following matrix:
1 2
3 4
row-major storage looks like this:
1 2 3 4
column-major storage looks like this:
1 3 2 4
You can transpose these matrices if you wish, using cublasCgeam or you can manually modify your storage.
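For example, an out-of-place transpose with cublasCgeam might look like this sketch (handle, d_A, and d_At are assumed to exist already, with d_A an m x n column-major matrix):
// C = alpha*op(A) + beta*op(B); with alpha = 1, beta = 0, op(A) = A(T),
// this writes the n x m transpose of d_A into d_At.
const cuComplex one  = make_cuComplex(1.0f, 0.0f);
const cuComplex zero = make_cuComplex(0.0f, 0.0f);
cublasCgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N,
            n, m,             // dimensions of the result d_At
            &one,  d_A,  m,   // A is m x n with leading dimension m
            &zero, d_At, n,   // B is ignored because beta is zero
            d_At, n);         // C = d_At with leading dimension n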
You're making an assumption about some kind of compressed storage format for the symmetric matrix A, which is not correct. Read carefully the definition of the storage type. It doesn't say the portion of the matrix that is "supplied" or "present"; it says the portion of the matrix that is filled.
Here is a complete code example that has the above problems fixed:
// Matrix multiplication: C = A * B.
// Host code.
//
// Utilities and system includes
#include <assert.h>
#include <helper_string.h> // helper for shared functions common to CUDA SDK samples
// CUDA runtime
#include <cuda_runtime.h>
#include <cublas_v2.h>
// error check macros
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
// for CUBLAS V2 API
#define cublasCheckErrors(fn) \
do { \
cublasStatus_t __err = fn; \
if (__err != CUBLAS_STATUS_SUCCESS) { \
fprintf(stderr, "Fatal cublas error: %d (at %s:%d)\n", \
(int)(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
#ifndef min
#define min(a,b) ((a < b) ? a : b)
#endif
#ifndef max
#define max(a,b) ((a > b) ? a : b)
#endif
////////////////////////////////////////////////////////////////////////////////
// These are CUDA Helper functions (in addition to helper_cuda.h)
void inline checkError(cublasStatus_t status, const char *msg)
{
if (status != CUBLAS_STATUS_SUCCESS)
{
printf("%s", msg);
exit(EXIT_FAILURE);
}
}
// end of CUDA Helper Functions
// Allocates a matrix with random float entries.
void randomCmplxInit(cuComplex *data, int size)
{
for (int i = 0; i < size; ++i)
data[i] = make_cuComplex( rand() / (float)RAND_MAX, rand() / (float)RAND_MAX);
}
//void initializeCUDA(int argc, char **argv, int &devID, int &iSizeMultiple, sMatrixSize &matrix_size)
void initializeCUDA(int argc, char **argv, int &devID)
{
// By default, we use device 0, otherwise we override the device ID based on what is provided at the command line
cudaError_t error;
devID = 0;
if (checkCmdLineFlag(argc, (const char **)argv, "device"))
{
devID = getCmdLineArgumentInt(argc, (const char **)argv, "device");
error = cudaSetDevice(devID);
if (error != cudaSuccess)
{
printf("cudaSetDevice returned error code %d, line(%d)\n", error, __LINE__);
exit(EXIT_FAILURE);
}
}
// get number of SMs on this GPU
error = cudaGetDevice(&devID);
cudaDeviceProp deviceProp;
error = cudaGetDeviceProperties(&deviceProp, devID);
printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test matrix multiply using CUBLAS
////////////////////////////////////////////////////////////////////////////////
int matrixMultiply(int argc, char **argv, int devID)
{
int i,j;
unsigned int m,n,k;
cudaDeviceProp deviceProp;
cudaError_t error;
error = cudaGetDeviceProperties(&deviceProp, devID);
if (error != cudaSuccess)
{
printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
exit(EXIT_FAILURE);
}
// use a larger block size for Fermi and above
m=3; //number of rows of matrix op(A) and C. A--> (m x k)
n=2; //number of columns of matrix op(B) and C. B--> (k x n)
k=m; //number of columns of op(A) and rows of op(B). C--> (m x n)
// I want to compute C = A*B in row-major format,
//so I must find C(T)=B(T)A(T) = C(T)A in column-major format
// allocate host memory for matrices A and B
unsigned int size_A = m*m; //size of a symmetric matrix
printf("size_A = %d\n", size_A);
unsigned int mem_size_A = sizeof(cuComplex) * size_A;
cuComplex *h_A = (cuComplex *)malloc(mem_size_A);
unsigned int size_B = m*n;
unsigned int mem_size_B = sizeof(cuComplex) * size_B;
cuComplex *h_B = (cuComplex *)malloc(mem_size_B);
// initialize host memory
// for (i = 0; i < size_A; ++i)
// h_A[i] = make_cuComplex( (float)(i+1),(float)0);
h_A[0] = make_cuComplex((float)1, (float)0);
h_A[1] = make_cuComplex((float)2, (float)0);
h_A[2] = make_cuComplex((float)4, (float)0);
h_A[3] = make_cuComplex((float)0, (float)0);
h_A[4] = make_cuComplex((float)3, (float)0);
h_A[5] = make_cuComplex((float)5, (float)0);
h_A[6] = make_cuComplex((float)0, (float)0);
h_A[7] = make_cuComplex((float)0, (float)0);
h_A[8] = make_cuComplex((float)6, (float)0);
// for (i = 0; i < size_B; ++i)
// h_B[i] = make_cuComplex((float)(i+2), (float)0);
h_B[0] = make_cuComplex((float)2, (float)0);
h_B[1] = make_cuComplex((float)4, (float)0);
h_B[2] = make_cuComplex((float)6, (float)0);
h_B[3] = make_cuComplex((float)3, (float)0);
h_B[4] = make_cuComplex((float)5, (float)0);
h_B[5] = make_cuComplex((float)7, (float)0);
// allocate device memory
cuComplex *d_A, *d_B, *d_C;
unsigned int size_C = m*n;
unsigned int mem_size_C = sizeof(cuComplex) * size_C;
// allocate host memory for the result
cuComplex *h_C = (cuComplex *) malloc(mem_size_C);
cuComplex *h_CUBLAS = (cuComplex *) malloc(mem_size_C);
error = cudaMalloc((void **) &d_A, mem_size_A);
error = cudaMalloc((void **) &d_B, mem_size_B);
// copy host memory to device
error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);
error = cudaMalloc((void **) &d_C, mem_size_C);
// create and start timer
printf("Computing result using CUBLAS...");
// CUBLAS version 2.0
{
cublasHandle_t handle;
cublasStatus_t ret;
ret = cublasCreate(&handle);
if (ret != CUBLAS_STATUS_SUCCESS)
{
printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__);
exit(EXIT_FAILURE);
}
const cuComplex alpha = make_cuComplex(1.0f,0.0f);
const cuComplex beta = make_cuComplex(0.0f,0.0f);
//Perform operation with cublas
ret = cublasCsymm(handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, m,n,&alpha,d_A,m,d_B,m,&beta,d_C,m);
if (ret != CUBLAS_STATUS_SUCCESS)
{
printf("cublasCsymm returned error code %d, line(%d)\n", ret, __LINE__);
exit(EXIT_FAILURE);
}
Here is the output:
[Matrix Multiply CUBLAS] - Starting...
GPU Device 0: "Tesla M2070" with compute capability 2.0
size_A = 9
Computing result using CUBLAS...
Computations completed.
symm matrix A:
1 0 0
2 3 0
4 5 6
matrix B:
2 3
4 5
6 7
matrix C=A*B:
34 41
46 56
64 79

CUDA - code doesn't enter the kernel

I am trying to learn CUDA. I am trying to run this simple code:
#include <stdlib.h>
#include <stdio.h>
__global__ void kernel(int *array)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    array[index] = 7;
}
int main(void)
{
    int num_elements = 256;
    int num_bytes = num_elements * sizeof(int);
    // pointers to host & device arrays
    int *device_array = 0;
    int *host_array = 0;
    // malloc a host array
    host_array = (int*)malloc(num_bytes);
    // cudaMalloc a device array
    cudaMalloc((void**)&device_array, num_bytes);
    int block_size = 128;
    int grid_size = num_elements / block_size;
    kernel<<<grid_size,block_size>>>(device_array);
    // download and inspect the result on the host:
    cudaMemcpy(host_array, device_array, num_bytes, cudaMemcpyDeviceToHost);
    // print out the result element by element
    for (int i=0; i < num_elements; ++i)
    {
        printf("%d ", host_array[i]);
    }
    // deallocate memory
    free(host_array);
    cudaFree(device_array);
}
It is supposed to print 7's, but it prints 0's.
This statement doesn't seem to get executed:
kernel<<<grid_size,block_size>>>(device_array);
It doesn't give any compilation error either.
Any help?
The code runs fine on my machine, but make sure you add cudaDeviceSynchronize and error checking after the kernel call.
Change the code as follows to check for errors:
kernel<<<grid_size,block_size>>>(device_array);
// wait until tasks are completed
cudaDeviceSynchronize();
// check for errors
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) {
fprintf(stderr, "ERROR: %s \n", cudaGetErrorString(error));
}
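Since kernel errors can also surface asynchronously in later runtime calls, the return value of the subsequent cudaMemcpy is worth checking as well, for example:
// the copy-back also returns a status; an earlier asynchronous failure can show up here
cudaError_t copy_error = cudaMemcpy(host_array, device_array, num_bytes,
                                    cudaMemcpyDeviceToHost);
if (copy_error != cudaSuccess) {
    fprintf(stderr, "ERROR: %s \n", cudaGetErrorString(copy_error));
}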