Summing the rows of a matrix (stored in either row-major or column-major order) in CUDA - cuda

I'm working on the problem of summing the rows of a matrix in CUDA. Here is an example.
Suppose we have the following 20 * 4 array:
1 2 3 4
4 1 2 3
3 4 1 2
.
1 2 3 4
.
.
.
.
.
.
.
.
2 1 3 4
After flattening the 2D array into a 1D array (in either row-major or column-major order), I need to assign each thread to a different row and calculate the cost for that row.
For example
- thread 1 should calculate the cost for 1 2 3 4
- thread 2 should calculate the cost for 4 1 2 3
How can I do that in CUDA?
Thank you all for the reply

#include <stdio.h>
#include <stdlib.h>
#define MROWS 20
#define NCOLS 4
#define nTPB 256
// Row-sum kernel for a row-major matrix: each thread sums one row.
//   costdata - device pointer to rows*cols ints stored row-major
//   rows     - number of rows (one thread is used per row)
//   cols     - number of columns (elements summed per row)
//   results  - device pointer to rows ints; results[r] receives the sum of row r
// Expects a 1-D launch with at least `rows` threads in total; threads whose
// global index is >= rows do nothing.
__global__ void mykernel(int *costdata, int rows, int cols, int *results){
  int tidx = threadIdx.x + blockDim.x*blockIdx.x;
  if (tidx < rows){
    // Compute the row offset in 64 bits: tidx*cols evaluated as int can
    // overflow once the matrix holds more than INT_MAX elements.
    const int *myrow = costdata + (size_t)tidx*(size_t)cols;
    int mycost = 0;
    for (int i = 0; i < cols; i++)
      mycost += myrow[i];
    results[tidx] = mycost;
  }
}
// Abort with a readable message when a CUDA runtime call reports an error.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                  \
  do {                                                                    \
    cudaError_t err_ = (call);                                            \
    if (err_ != cudaSuccess) {                                            \
      fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,       \
              cudaGetErrorString(err_));                                  \
      exit(EXIT_FAILURE);                                                 \
    }                                                                     \
  } while (0)
#endif

// Fills an MROWS x NCOLS row-major matrix with random values in 0..3, sums
// each row on the device with mykernel, and prints the host-computed and
// device-computed sum of every row side by side.
// Returns 0 on success, 1 if host memory could not be allocated; any CUDA
// failure terminates the program through CUDA_CHECK.
int main(){
  //define and initialize host and device storage for cost and results
  int *d_costdata, *h_costdata, *d_results, *h_results;
  h_results = (int *)malloc(MROWS*sizeof(int));
  h_costdata = (int *)malloc(MROWS*NCOLS*sizeof(int));
  if (h_results == NULL || h_costdata == NULL){
    fprintf(stderr, "host allocation failed\n");
    free(h_results);
    free(h_costdata);
    return 1;
  }
  for (int i=0; i<(MROWS*NCOLS); i++)
    h_costdata[i] = rand()%4;
  CUDA_CHECK(cudaMalloc((void **)&d_results, MROWS*sizeof(int)));
  CUDA_CHECK(cudaMalloc((void **)&d_costdata, MROWS*NCOLS*sizeof(int)));
  //copy cost data from host to device
  CUDA_CHECK(cudaMemcpy(d_costdata, h_costdata, MROWS*NCOLS*sizeof(int), cudaMemcpyHostToDevice));
  // one thread per row, rounded up to a whole number of nTPB-thread blocks
  mykernel<<<(MROWS + nTPB - 1)/nTPB, nTPB>>>(d_costdata, MROWS, NCOLS, d_results);
  // a bad launch configuration is only reported through cudaGetLastError()
  CUDA_CHECK(cudaGetLastError());
  // copy results back from device to host; errors raised while the kernel
  // was executing are reported by this call
  CUDA_CHECK(cudaMemcpy(h_results, d_results, MROWS*sizeof(int), cudaMemcpyDeviceToHost));
  for (int i=0; i<MROWS; i++){
    // host reference: sum of row i
    int loc_cost = 0;
    for (int j=0; j<NCOLS; j++) loc_cost += h_costdata[(i*NCOLS)+j];
    printf("cost[%d]: host= %d, device = %d\n", i, loc_cost, h_results[i]);
  }
  // release device and host storage
  CUDA_CHECK(cudaFree(d_costdata));
  CUDA_CHECK(cudaFree(d_results));
  free(h_costdata);
  free(h_results);
  return 0;
}
This assumes "cost" of each row is just the sum of the elements in each row. If you have a different "cost" function, you can modify the activity in the kernel for-loop accordingly. This also assumes C-style row-major data storage (1 2 3 4 4 1 2 3 3 4 1 2 etc.)
If you instead use column-major storage (1 4 3 etc.), you can slightly improve the performance, since the data reads can be fully coalesced. Then your kernel code could look like this:
for (int i = 0; i < cols; i++)
mycost += costdata[(i*rows)+tidx];
You should also use proper cuda error checking on all CUDA API calls and kernel calls.
EDIT: As discussed in the comments below, for the row-major storage case, in some situations it might give an increase in memory efficiency by electing to load 16-byte quantities rather than the base type. Following is a modified version that implements this idea for arbitrary dimensions and (more or less) arbitrary base types:
#include <iostream>
#include <typeinfo>
#include <cstdlib>
#include <vector_types.h>
#define MROWS 1742
#define NCOLS 801
#define nTPB 256
typedef double mytype;
// Reports the size in bytes of the element type `mytype` for the types this
// program supports: 8 for double, 4 for float / int / unsigned int, and 1 for
// char / unsigned char. Any other type yields 0.
__host__ int sizetype(){
  const std::type_info &selected = typeid(mytype);
  if (selected == typeid(double))
    return 8;
  if (selected == typeid(float) || selected == typeid(int) || selected == typeid(unsigned int))
    return 4;
  if (selected == typeid(char) || selected == typeid(unsigned char))
    return 1;
  // not one of the recognised element types
  return 0;
}
// Row-sum kernel for a pitched, row-major matrix: each thread sums one row.
// A row is read in 16-byte (int4) pieces while at least 16 bytes' worth of
// elements remain; the leftover elements are read one at a time.
//   costdata - base address of the pitched matrix
//   rows     - number of rows (one thread is used per row)
//   cols     - number of elements per row
//   results  - rows elements; results[r] receives the sum of row r
//              (written for every row, including when cols is 0)
//   size     - size of T in bytes; must be a factor of 16
//   pitch    - distance in bytes between the starts of consecutive rows; every
//              row must start on a 16-byte boundary for the int4 loads
// Expects a 1-D launch with at least `rows` threads in total.
template<typename T>
__global__ void mykernel(const T *costdata, int rows, int cols, T *results, int size, size_t pitch){
  int chunk = 16/size; // elements per 16-byte load; assumes size is a factor of 16
  int tidx = threadIdx.x + blockDim.x*blockIdx.x;
  if (tidx < rows){
    // rows are `pitch` bytes apart, so step in bytes and then reinterpret as T
    const T *myrowptr = (const T *)(((const unsigned char *)costdata) + tidx*pitch);
    T mycost = (T)0;
    int count = 0;
    // wide path: read 16 bytes at once, then accumulate the elements they hold
    // (the chunk > 0 test keeps the loop finite if size is larger than 16)
    for (; chunk > 0 && (cols-count) >= chunk; count += chunk){
      int4 temp = *((const int4 *)(myrowptr + count));
      const T *vals = (const T *)(&temp);
      for (int j = 0; j < chunk; j++)
        mycost += vals[j];
    }
    // tail: fewer than `chunk` elements left, read one quantity at a time
    for (; count < cols; count++)
      mycost += myrowptr[count];
    // store once, after the whole row has been accumulated
    results[tidx] = mycost;
  }
}
// Abort with a readable message when a CUDA runtime call reports an error.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                  \
  do {                                                                    \
    cudaError_t err_ = (call);                                            \
    if (err_ != cudaSuccess) {                                            \
      std::cerr << "CUDA error " << __FILE__ << ":" << __LINE__ << ": "   \
                << cudaGetErrorString(err_) << std::endl;                 \
      std::exit(EXIT_FAILURE);                                            \
    }                                                                     \
  } while (0)
#endif

// Fills an MROWS x NCOLS matrix of `mytype` with random values in 0..3, copies
// it into a pitched device allocation, sums each row with mykernel and checks
// every device result against a host-computed sum.
// Returns 0 when all rows match; 1 for an unsupported element type, a failed
// host allocation, or the first mismatching row. Any CUDA failure terminates
// the program through CUDA_CHECK.
int main(){
  int typesize = sizetype();
  if (typesize == 0) {std::cout << "invalid type selected" << std::endl; return 1;}
  //define and initialize host and device storage for cost and results
  mytype *d_costdata, *h_costdata, *d_results, *h_results;
  h_results = (mytype *)malloc(MROWS*sizeof(mytype));
  h_costdata = (mytype *)malloc(MROWS*NCOLS*sizeof(mytype));
  if (h_results == NULL || h_costdata == NULL){
    std::cout << "host allocation failed" << std::endl;
    free(h_results);
    free(h_costdata);
    return 1;
  }
  for (int i=0; i<(MROWS*NCOLS); i++)
    h_costdata[i] = (mytype)(rand()%4);
  size_t pitch = 0;
  CUDA_CHECK(cudaMalloc((void **)&d_results, MROWS*sizeof(mytype)));
  CUDA_CHECK(cudaMallocPitch((void **)&d_costdata, &pitch, NCOLS*sizeof(mytype), MROWS));
  //copy cost data from host (densely packed rows) to device (rows `pitch` bytes apart)
  CUDA_CHECK(cudaMemcpy2D(d_costdata, pitch, h_costdata, NCOLS*sizeof(mytype), NCOLS*sizeof(mytype), MROWS, cudaMemcpyHostToDevice));
  mykernel<<<(MROWS + nTPB - 1)/nTPB, nTPB>>>(d_costdata, MROWS, NCOLS, d_results, typesize, pitch);
  // a bad launch configuration is only reported through cudaGetLastError()
  CUDA_CHECK(cudaGetLastError());
  // copy results back from device to host
  CUDA_CHECK(cudaMemcpy(h_results, d_results, MROWS*sizeof(mytype), cudaMemcpyDeviceToHost));
  bool ok = true;
  for (int i=0; ok && i<MROWS; i++){
    // host reference: sum of row i
    mytype loc_cost = (mytype)0;
    for (int j=0; j<NCOLS; j++) loc_cost += h_costdata[(i*NCOLS)+j];
    // show the first few rows, except for 1-byte element types
    if ((i < 10) && (typesize > 1))
      std::cout <<"cost[" << i << "]: host= " << loc_cost << ", device = " << h_results[i] << std::endl;
    if (loc_cost != h_results[i]){
      std::cout << "mismatch at index" << i << "should be:" << loc_cost << "was:" << h_results[i] << std::endl;
      ok = false; // stop at the first mismatch, but still release storage below
    }
  }
  if (ok)
    std::cout << "Results are correct!" << std::endl;
  // release device and host storage on every path
  CUDA_CHECK(cudaFree(d_costdata));
  CUDA_CHECK(cudaFree(d_results));
  free(h_costdata);
  free(h_results);
  return ok ? 0 : 1;
}

Related

How to fix cudaError 77 when copying back from device to host

I am writing a simple example program to test memCpy and kernel run concurrency for a larger program. While writing this example, I stumbled upon error 77, aka cudaErrorIllegalAddress.
I read somewhere that that comes from the kernel accessing an invalid address, and not the memcpy itself. So I tried to index the lowest element of my input array (0). The error remained.
As it only is a small sample program, I will provide the whole code;
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#define BLOCKS 32
#define THREADS 16
// Element-wise kernel: each thread writes d_out[index] = d_in[index] + 5 for
// its own global index.
// There is no bounds check, so both buffers must hold at least
// gridDim.x * blockDim.x doubles.
__global__ void kernel(double *d_in, double *d_out) {
// global 1-D thread index
int index = threadIdx.x + blockDim.x * blockIdx.x;
d_out[index] = d_in[index] + 5;
}
// Test driver: for each of `cycles` rounds, loops over GPU_N devices to
// allocate buffers, copy input, launch `kernel`, copy output back and free the
// buffers, printing a numbered progress marker after each step.
int main() {
const int GPU_N = 2;
const int data_size = 2048;
const int cycles = 2;
double *h_in, *h_out, *d_in, *d_out;
h_in = (double*)malloc(sizeof(double) * data_size);
h_out = (double*)malloc(sizeof(double) * data_size);
for (int i = 0; i < data_size; i++) {
h_in[i] = 21;
}
cudaError_t error;
printf("1\n");
for (int i = 0; i < cycles; i++) {
//cuMalloc
// NOTE(review): d_in and d_out are single pointers, so every pass of this
// loop overwrites the addresses stored for the previous device; after the
// loop they hold only the allocations made on the last device.
for (int j = 0; j < GPU_N; j++) {
cudaSetDevice(j);
cudaMalloc((void**)&d_in, sizeof(double) * data_size / 4);
cudaMalloc((void**)&d_out, sizeof(double) * data_size / 4);
printf("2\n");
}
// host -> device copy of the first data_size/4 elements, issued once per device
for (int j = 0; j < GPU_N; j++) {
cudaSetDevice(j);
cudaMemcpyAsync(d_in, h_in, sizeof(double) * data_size / 4, cudaMemcpyHostToDevice);
printf("3\n");
}
// BLOCKS * THREADS = 512 threads, matching the data_size / 4 = 512 elements
// allocated per buffer; the same d_in/d_out values are passed on every device
for (int j = 0; j < GPU_N; j++) {
cudaSetDevice(j);
kernel<<< BLOCKS, THREADS, 0, 0 >>>(d_in, d_out);
// the launch status is stored but overwritten below without being inspected
error = cudaGetLastError();
printf("4\n");
}
// device -> host copy; every device writes to the start of the same h_out buffer
for (int j = 0; j < GPU_N; j++) {
cudaSetDevice(j);
error = cudaMemcpyAsync(h_out, d_out, sizeof(double) * data_size / 4, cudaMemcpyDeviceToHost);
printf("D2H %i\n", error);
printf("5\n");
}
// the same two pointer values are passed to cudaFree once per device
for (int j = 0; j < GPU_N; j++) {
cudaSetDevice(j);
cudaFree(d_in);
cudaFree(d_out);
printf("6\n");
}
}
// NOTE(review): h_out[i] is a double but is printed with %i, and all
// data_size elements are printed although only data_size / 4 were copied back.
for (int i = 0; i < data_size; i++) {
printf("%i\n", h_out[i]);
}
getchar();
}
So the output should be something like:
1
1
2
2
3
3
4
4
5
5
6
6
1
1
2
2
3
3
4
4
5
5
6
6
26
26
26
26
26
.....
and then a spam of the result. It does so until the time it has to print 5, then it outputs the error 77. Also, the output of the result is not 26 as expected, but -842150451
There are several problems with this code.
As already pointed out in the comments, the printf format specifier here (%i) is wrong:
printf("%i\n", h_out[i]);
the quantity being printed is a double quantity, an appropriate format specifier would be %f.
This code will not work (for GPU_N greater than 1):
for (int j = 0; j < GPU_N; j++) {
cudaSetDevice(j);
cudaMalloc((void**)&d_in, sizeof(double) * data_size / 4);
cudaMalloc((void**)&d_out, sizeof(double) * data_size / 4);
printf("2\n");
}
d_in and d_out are individual variables. You don't get to somehow reuse them in this way. When this loop goes through its 2nd (or later) iteration, it will overwrite the pointer values that were previously assigned. Later on this will result in code trouble, because for at least one of your kernel launches, you will be passing pointers to data that is not resident on that particular GPU (and this particular aspect of the problem is the proximal reason for the error 77 report.)
One solution would be to provide arrays of pointers to make this work.
Some of the CUDA activity you are issuing in your loops may be asynchronous. Therefore, to be sure that your final printout of h_out shows expected results, you should wait for all work on the GPU to be finished. One way to accomplish this is with another set of calls to cudaDeviceSynchronize(). (I don't wish to argue about whether cudaFree is asynchronous or not. I think this item is a sensible suggestion and noteworthy. If you feel you can skip this item, do what you wish. For learning purposes, I think it is important to point this out.) For the reasons indicated in comments below, this item is not necessary/mandatory to get expected results for this particular code. This answer isn't intended to be a complete treatise on asynchronous work issuance; for that I suggest further study of any of the relevant questions here on the cuda tag, and/or study of relevant CUDA sample codes.
Here's a modified code that has the above issues addressed (I have shortened the final print-out loop):
$ cat t1477.cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#define BLOCKS 32
#define THREADS 16
// Element-wise kernel: thread g of the 1-D grid stores d_in[g] + 5 in d_out[g].
// No bounds check is performed, so both buffers must hold at least
// gridDim.x * blockDim.x doubles.
__global__ void kernel(double *d_in, double *d_out) {
  const int gid = blockIdx.x * blockDim.x + threadIdx.x;
  const double shifted = d_in[gid] + 5;
  d_out[gid] = shifted;
}
// Test driver: for each of `cycles` rounds, loops over GPU_N devices to
// allocate per-device buffers, copy input, launch `kernel`, copy output back
// and free the buffers, printing a numbered progress marker after each step.
// Finally waits for every device and prints the first 10 results.
// Returns 0 on completion, 1 if host memory could not be allocated.
int main() {
  const int GPU_N = 2;
  const int data_size = 2048;
  const int cycles = 2;
  // one device pointer pair per GPU, so no device's allocation is overwritten
  double *h_in, *h_out, *d_in[GPU_N], *d_out[GPU_N];
  h_in = (double*)malloc(sizeof(double) * data_size);
  h_out = (double*)malloc(sizeof(double) * data_size);
  if (h_in == NULL || h_out == NULL) {
    fprintf(stderr, "host allocation failed\n");
    free(h_in);
    free(h_out);
    return 1;
  }
  for (int i = 0; i < data_size; i++) {
    h_in[i] = 21;
  }
  cudaError_t error;
  printf("1\n");
  for (int i = 0; i < cycles; i++) {
    //cuMalloc
    for (int j = 0; j < GPU_N; j++) {
      cudaSetDevice(j);
      cudaMalloc((void**)(&(d_in[j])), sizeof(double) * data_size / 4);
      cudaMalloc((void**)(&(d_out[j])), sizeof(double) * data_size / 4);
      printf("2\n");
    }
    // host -> device copy of the first data_size/4 elements for each device
    for (int j = 0; j < GPU_N; j++) {
      cudaSetDevice(j);
      cudaMemcpyAsync(d_in[j], h_in, sizeof(double) * data_size / 4, cudaMemcpyHostToDevice);
      printf("3\n");
    }
    // BLOCKS * THREADS = 512 threads, matching the data_size / 4 = 512
    // elements allocated per buffer (the kernel has no bounds check)
    for (int j = 0; j < GPU_N; j++) {
      cudaSetDevice(j);
      kernel<<< BLOCKS, THREADS, 0, 0 >>>(d_in[j], d_out[j]);
      error = cudaGetLastError();
      // report a failed launch instead of silently discarding the status
      if (error != cudaSuccess)
        fprintf(stderr, "kernel launch on device %d failed: %s\n", j, cudaGetErrorString(error));
      printf("4\n");
    }
    // device -> host copy; every device writes to the start of the same h_out buffer
    for (int j = 0; j < GPU_N; j++) {
      cudaSetDevice(j);
      error = cudaMemcpyAsync(h_out, d_out[j], sizeof(double) * data_size / 4, cudaMemcpyDeviceToHost);
      printf("D2H %i\n", error);
      printf("5\n");
    }
    for (int j = 0; j < GPU_N; j++) {
      cudaSetDevice(j);
      cudaFree(d_in[j]);
      cudaFree(d_out[j]);
      printf("6\n");
    }
  }
  // wait for all outstanding work on every device before reading h_out
  for (int i = 0; i < GPU_N; i++){
    cudaSetDevice(i);
    cudaDeviceSynchronize();}
  for (int i = 0; i < 10; i++) {
    printf("%f\n", h_out[i]);
  }
  // release host storage
  free(h_in);
  free(h_out);
  return 0;
}
$ nvcc -o t1477 t1477.cu
$ cuda-memcheck ./t1477
========= CUDA-MEMCHECK
1
2
2
3
3
4
4
D2H 0
5
D2H 0
5
6
6
2
2
3
3
4
4
D2H 0
5
D2H 0
5
6
6
26.000000
26.000000
26.000000
26.000000
26.000000
26.000000
26.000000
26.000000
26.000000
26.000000
========= ERROR SUMMARY: 0 errors
$

GPU reduction code only runs one time

I have been using the code sample supplied by Robert Crovella:
thrust::max_element slow in comparison cublasIsamax - More efficient implementation?
Which is a very fast reduction code. I modified it to also return the index of the max in the input array of floats. When I use it in my code, it will only execute one time. If I try calling the routine again it does not find a new max value, it just returns the previous max. Is there something about the volatile global memory that the routine uses that needs to be reset before it can be called again?
#include <cuda.h>
#include <cublas_v2.h>
#include <thrust/extrema.h>
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <stdio.h>
#include <stdlib.h>
#define DSIZE 4096*4 // nTPB should be a power-of-2
#define nTPB 512
#define MAX_KERNEL_BLOCKS 30
#define MAX_BLOCKS ((DSIZE/nTPB)+1)
#define MIN(a,b) ((a>b)?b:a)
#define FLOAT_MIN -1.0f
#include <helper_functions.h>
#include <helper_cuda.h>
// this code has been modified to return the index of the max instead of the actual max value - for my application
__device__ volatile float blk_vals[MAX_BLOCKS];
__device__ volatile int blk_idxs[MAX_BLOCKS];
__device__ int blk_num = 0;
//template <typename T>
// Finds the index of the largest element of data[0..dsize) and stores it in
// *result, using one kernel launch: each block reduces its share to a single
// (value, index) pair, and the block that finishes last reduces those pairs.
//   data   - input values
//   dsize  - number of input values
//   result - receives the index of the maximum; -1 if no element is greater
//            than FLOAT_MIN
// The shared arrays hold nTPB entries and the sweeps start at nTPB/2, so the
// kernel must be launched with nTPB threads per block; the per-block results
// are stored in blk_vals/blk_idxs, which hold MAX_BLOCKS entries.
// NOTE(review): blk_num is only ever incremented here and is never set back
// to zero inside the kernel.
__global__ void max_idx_kernel(const float *data, const int dsize, int *result){
__shared__ volatile float vals[nTPB];
__shared__ volatile int idxs[nTPB];
__shared__ volatile int last_block;
int idx = threadIdx.x+blockDim.x*blockIdx.x;
last_block = 0;
float my_val = FLOAT_MIN;
int my_idx = -1;
// sweep from global memory
// (each thread strides through the input by the total number of threads in the grid)
while (idx < dsize){
if (data[idx] > my_val) {my_val = data[idx]; my_idx = idx;}
idx += blockDim.x*gridDim.x;}
// populate shared memory
vals[threadIdx.x] = my_val;
idxs[threadIdx.x] = my_idx;
__syncthreads();
// sweep in shared memory
// (halve the active range each step; the block's maximum ends up in entry 0)
for (int i = (nTPB>>1); i > 0; i>>=1){
if (threadIdx.x < i)
if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }
__syncthreads();}
// perform block-level reduction
// (thread 0 publishes the block's result, then counts the block as finished)
if (!threadIdx.x){
blk_vals[blockIdx.x] = vals[0];
blk_idxs[blockIdx.x] = idxs[0];
if (atomicAdd(&blk_num, 1) == gridDim.x - 1) // then I am the last block
last_block = 1;}
__syncthreads();
// only the block that observed the final count reduces the per-block results
if (last_block){
idx = threadIdx.x;
my_val = FLOAT_MIN;
my_idx = -1;
while (idx < gridDim.x){
if (blk_vals[idx] > my_val) {my_val = blk_vals[idx]; my_idx = blk_idxs[idx]; }
idx += blockDim.x;}
// populate shared memory
vals[threadIdx.x] = my_val;
idxs[threadIdx.x] = my_idx;
__syncthreads();
// sweep in shared memory
for (int i = (nTPB>>1); i > 0; i>>=1){
if (threadIdx.x < i)
if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }
__syncthreads();}
// thread 0 writes the index of the overall maximum
if (!threadIdx.x)
*result = idxs[0];
}
}
// Benchmark driver: for 5 iterations, fills a DSIZE-element vector with random
// values plus one definite maximum at index 10+k, then finds the index of the
// maximum with thrust::max_element, cublasIsamax and max_idx_kernel, printing
// the elapsed timer value and the index found by each.
int main(){
int nrElements = DSIZE;
float *d_vector, *h_vector;
StopWatchInterface *hTimer = NULL;
sdkCreateTimer(&hTimer);
double gpuTime;
int k;
int max_index;
int *d_max_index;
cudaMalloc(&d_max_index, sizeof(int));
h_vector = new float[DSIZE];
for(k=0; k < 5; k++){
for (int i = 0; i < DSIZE; i++) h_vector[i] = rand()/(float)RAND_MAX;
h_vector[10+k] = 10; // create definite max element that changes with each loop iteration
// NOTE(review): a cuBLAS handle and a device vector are created on every
// iteration; only the last d_vector is freed after the loop and the handles
// are never destroyed.
cublasHandle_t my_handle;
cublasStatus_t my_status = cublasCreate(&my_handle);
cudaMalloc(&d_vector, DSIZE*sizeof(float));
cudaMemcpy(d_vector, h_vector, DSIZE*sizeof(float), cudaMemcpyHostToDevice);
// --- method 1: thrust::max_element
max_index = 0;
sdkResetTimer(&hTimer);
sdkStartTimer(&hTimer);
//d_vector is a pointer on the device pointing to the beginning of the vector, containing nrElements floats.
thrust::device_ptr<float> d_ptr = thrust::device_pointer_cast(d_vector);
thrust::device_vector<float>::iterator d_it = thrust::max_element(d_ptr, d_ptr + nrElements);
max_index = d_it - (thrust::device_vector<float>::iterator)d_ptr;
cudaDeviceSynchronize();
gpuTime = sdkGetTimerValue(&hTimer);
std::cout << "loop: " << k << " thrust time: " << gpuTime << " max index: " << max_index << std::endl;
// --- method 2: cublasIsamax (1 is subtracted from its result when printing)
max_index = 0;
sdkResetTimer(&hTimer);
sdkStartTimer(&hTimer);
my_status = cublasIsamax(my_handle, DSIZE, d_vector, 1, &max_index);
cudaDeviceSynchronize();
gpuTime = sdkGetTimerValue(&hTimer);
std::cout << "loop: " << k << " cublas time: " << gpuTime << " max index: " << max_index-1 << std::endl;
// --- method 3: max_idx_kernel
// NOTE(review): nothing on the host resets the device counter blk_num
// between launches.
max_index = 0;
sdkResetTimer(&hTimer);
sdkStartTimer(&hTimer);
max_idx_kernel<<<MIN(MAX_KERNEL_BLOCKS, ((DSIZE+nTPB-1)/nTPB)), nTPB>>>(d_vector, DSIZE, d_max_index);
cudaMemcpy(&max_index, d_max_index, sizeof(int), cudaMemcpyDeviceToHost);
gpuTime = sdkGetTimerValue(&hTimer);
std::cout << "loop: " << k << " idx kern time: " << gpuTime << " max index: " << max_index << std::endl;
std::cout << std::endl;
} // end for loop on k
cudaFree(d_max_index);
cudaFree(d_vector);
return 0;
}
The primary issue in re-using this code for multiple loops as-is is in this static initialization of a device (global) variable:
__device__ int blk_num = 0;
That's OK if you're only going to run the routine once. But if you intend to re-use it, you will need to re-initialize this variable to zero before each call to the kernel.
We could fix this by putting an explicit initialization of this variable to zero before each call to the reduction kernel:
cudaMemcpyToSymbol(blk_num, &max_index, sizeof(int));
(I'm using max_index here simply because it is a convenient host int variable that has just been set to zero.)
That's the only change needed to get the code "working".
However the introduction of the loop has created some other "issues" that I would point out. These 3 lines of code:
cublasHandle_t my_handle;
cublasStatus_t my_status = cublasCreate(&my_handle);
cudaMalloc(&d_vector, DSIZE*sizeof(float));
don't belong inside the for-loop on k. That is effectively creating a memory leak and unnecessarily re-initializing the cublas library.
The following code has those changes and seems to work for me:
$ cat t1183.cu
#include <cuda.h>
#include <cublas_v2.h>
#include <thrust/extrema.h>
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <stdio.h>
#include <stdlib.h>
#define DSIZE 4096*4 // nTPB should be a power-of-2
#define nTPB 512
#define MAX_KERNEL_BLOCKS 30
#define MAX_BLOCKS ((DSIZE/nTPB)+1)
#define MIN(a,b) ((a>b)?b:a)
#define FLOAT_MIN -1.0f
#include <helper_functions.h>
#include <helper_cuda.h>
// this code has been modified to return the index of the max instead of the actual max value - for my application
__device__ volatile float blk_vals[MAX_BLOCKS];
__device__ volatile int blk_idxs[MAX_BLOCKS];
__device__ int blk_num;
//template <typename T>
// Finds the index of the largest element of data[0..dsize) and stores it in
// *result, using one kernel launch: each block reduces its share to a single
// (value, index) pair, and the block that finishes last reduces those pairs.
//   data   - input values
//   dsize  - number of input values
//   result - receives the index of the maximum; -1 if no element is greater
//            than FLOAT_MIN
// Launch requirements:
//   - exactly nTPB (a power of two) threads per block: the shared arrays hold
//     nTPB entries and the sweeps start at nTPB/2
//   - at most MAX_BLOCKS blocks, the capacity of blk_vals / blk_idxs
//   - blk_num must be zero when the kernel starts; the kernel only increments
//     it, so the host has to reset it before every launch
__global__ void max_idx_kernel(const float *data, const int dsize, int *result){
  __shared__ volatile float vals[nTPB];
  __shared__ volatile int idxs[nTPB];
  __shared__ volatile int last_block;
  int idx = threadIdx.x+blockDim.x*blockIdx.x;
  last_block = 0;
  float my_val = FLOAT_MIN;
  int my_idx = -1;
  // sweep from global memory: each thread strides through the input by the
  // total number of threads in the grid
  while (idx < dsize){
    if (data[idx] > my_val) {my_val = data[idx]; my_idx = idx;}
    idx += blockDim.x*gridDim.x;}
  // populate shared memory
  vals[threadIdx.x] = my_val;
  idxs[threadIdx.x] = my_idx;
  __syncthreads();
  // sweep in shared memory: halve the active range each step; the block's
  // maximum ends up in entry 0
  for (int i = (nTPB>>1); i > 0; i>>=1){
    if (threadIdx.x < i)
      if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }
    __syncthreads();}
  // perform block-level reduction
  if (!threadIdx.x){
    blk_vals[blockIdx.x] = vals[0];
    blk_idxs[blockIdx.x] = idxs[0];
    // make this block's partial result visible to the other blocks before
    // the counter increment announces that it is ready
    __threadfence();
    if (atomicAdd(&blk_num, 1) == gridDim.x - 1) // then I am the last block
      last_block = 1;}
  __syncthreads();
  // only the block that observed the final count reduces the per-block results
  if (last_block){
    idx = threadIdx.x;
    my_val = FLOAT_MIN;
    my_idx = -1;
    while (idx < gridDim.x){
      if (blk_vals[idx] > my_val) {my_val = blk_vals[idx]; my_idx = blk_idxs[idx]; }
      idx += blockDim.x;}
    // populate shared memory
    vals[threadIdx.x] = my_val;
    idxs[threadIdx.x] = my_idx;
    __syncthreads();
    // sweep in shared memory
    for (int i = (nTPB>>1); i > 0; i>>=1){
      if (threadIdx.x < i)
        if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }
      __syncthreads();}
    // thread 0 writes the index of the overall maximum
    if (!threadIdx.x)
      *result = idxs[0];
  }
}
// Benchmark driver: for 5 iterations, fills a DSIZE-element vector with random
// values plus one definite maximum at index 10+k, then finds the index of the
// maximum with thrust::max_element, cublasIsamax and max_idx_kernel, printing
// the elapsed timer value and the index found by each.
// Returns 0 on completion, 1 if the cuBLAS handle could not be created.
int main(){
  int nrElements = DSIZE;
  float *d_vector, *h_vector;
  StopWatchInterface *hTimer = NULL;
  sdkCreateTimer(&hTimer);
  double gpuTime;
  int k;
  int max_index;
  int *d_max_index;
  cudaMalloc(&d_max_index, sizeof(int));
  h_vector = new float[DSIZE];
  // the cuBLAS handle and the device vector are created once and reused by
  // every iteration
  cublasHandle_t my_handle;
  cublasStatus_t my_status = cublasCreate(&my_handle);
  if (my_status != CUBLAS_STATUS_SUCCESS){
    std::cout << "cublasCreate failed" << std::endl;
    cudaFree(d_max_index);
    delete[] h_vector;
    sdkDeleteTimer(&hTimer);
    return 1;
  }
  cudaMalloc(&d_vector, DSIZE*sizeof(float));
  for(k=0; k < 5; k++){
    for (int i = 0; i < DSIZE; i++) h_vector[i] = rand()/(float)RAND_MAX;
    h_vector[10+k] = 10; // create definite max element that changes with each loop iteration
    cudaMemcpy(d_vector, h_vector, DSIZE*sizeof(float), cudaMemcpyHostToDevice);
    // --- method 1: thrust::max_element
    max_index = 0;
    sdkResetTimer(&hTimer);
    sdkStartTimer(&hTimer);
    //d_vector is a pointer on the device pointing to the beginning of the vector, containing nrElements floats.
    thrust::device_ptr<float> d_ptr = thrust::device_pointer_cast(d_vector);
    thrust::device_vector<float>::iterator d_it = thrust::max_element(d_ptr, d_ptr + nrElements);
    max_index = d_it - (thrust::device_vector<float>::iterator)d_ptr;
    cudaDeviceSynchronize();
    gpuTime = sdkGetTimerValue(&hTimer);
    std::cout << "loop: " << k << " thrust time: " << gpuTime << " max index: " << max_index << std::endl;
    // --- method 2: cublasIsamax (1 is subtracted from its result when printing)
    max_index = 0;
    sdkResetTimer(&hTimer);
    sdkStartTimer(&hTimer);
    my_status = cublasIsamax(my_handle, DSIZE, d_vector, 1, &max_index);
    cudaDeviceSynchronize();
    gpuTime = sdkGetTimerValue(&hTimer);
    std::cout << "loop: " << k << " cublas time: " << gpuTime << " max index: " << max_index-1 << std::endl;
    // --- method 3: max_idx_kernel
    max_index = 0;
    sdkResetTimer(&hTimer);
    sdkStartTimer(&hTimer);
    // the kernel only increments blk_num, so zero it before every launch
    // (max_index has just been set to 0 and serves as the source value)
    cudaMemcpyToSymbol(blk_num, &max_index, sizeof(int));
    max_idx_kernel<<<MIN(MAX_KERNEL_BLOCKS, ((DSIZE+nTPB-1)/nTPB)), nTPB>>>(d_vector, DSIZE, d_max_index);
    cudaMemcpy(&max_index, d_max_index, sizeof(int), cudaMemcpyDeviceToHost);
    gpuTime = sdkGetTimerValue(&hTimer);
    std::cout << "loop: " << k << " idx kern time: " << gpuTime << " max index: " << max_index << std::endl;
    std::cout << std::endl;
  } // end for loop on k
  // release everything acquired above
  cublasDestroy(my_handle);
  cudaFree(d_max_index);
  cudaFree(d_vector);
  delete[] h_vector;
  sdkDeleteTimer(&hTimer);
  return 0;
}
$ nvcc -I/usr/local/cuda/samples/common/inc t1183.cu -o t1183 -lcublas
$ cuda-memcheck ./t1183
========= CUDA-MEMCHECK
loop: 0 thrust time: 2.806 max index: 10
loop: 0 cublas time: 0.441 max index: 10
loop: 0 idx kern time: 0.395 max index: 10
loop: 1 thrust time: 1.298 max index: 11
loop: 1 cublas time: 0.419 max index: 11
loop: 1 idx kern time: 0.424 max index: 11
loop: 2 thrust time: 1.303 max index: 12
loop: 2 cublas time: 0.43 max index: 12
loop: 2 idx kern time: 0.419 max index: 12
loop: 3 thrust time: 1.291 max index: 13
loop: 3 cublas time: 0.423 max index: 13
loop: 3 idx kern time: 0.415 max index: 13
loop: 4 thrust time: 1.299 max index: 14
loop: 4 cublas time: 0.423 max index: 14
loop: 4 idx kern time: 0.417 max index: 14
========= ERROR SUMMARY: 0 errors
$

The Floyd-Warshall algorithm in CUDA

This is the sequential piece of code I am trying to parallelize in CUDA
/*
Single-threaded all-pairs shortest paths (Floyd-Warshall) on the CPU.
Works in place on an N x N row-major matrix in which -1 marks a missing edge.
*/
void floyd_sequential(int *mat, const size_t N)
{
    for (int via = 0; via < N; ++via) {
        for (int src = 0; src < N; ++src) {
            for (int dst = 0; dst < N; ++dst) {
                const int first_leg = mat[src*N + via];
                const int second_leg = mat[via*N + dst];
                // no route through `via` unless both legs exist
                if (first_leg == -1 || second_leg == -1)
                    continue;
                const int detour = first_leg + second_leg;
                int *direct = &mat[src*N + dst];
                // keep the current entry only when it exists and is strictly shorter
                if (*direct == -1 || *direct >= detour)
                    *direct = detour;
            }
        }
    }
}
This is my CUDA implementation
// ParallelComputing.cpp : Defines the entry point for the console application.
//
#include <stdio.h>
#include <cuda.h>
#include <stdlib.h>
#define DIMENSION 10;
// Kernel in which each thread processes one matrix row: for every k and every
// column j it relaxes result[Row][j] through vertex k, updating `result` in
// place. -1 marks a missing edge.
//   result - N x N row-major matrix, read and written by every thread
//   N      - matrix dimension
__global__ void gpu_Floyd(int *result, int N)
{
int j,k;
// the row index is taken from the y components of the launch configuration
int Row = blockIdx.y * blockDim.y + threadIdx.y;
for(k = 0; k < N; k++)
{
for(j = 0; j < N; j++)
{
int i0 = Row * N + j;
int i1 = Row * N + k;
int i2 = k * N + j;
// NOTE(review): this tests result[i0] and result[i2], whereas
// floyd_sequential tests mat[i1] and mat[i2]; result[i1] is added below
// without being checked against -1.
if(result[i0] != -1 && result[i2] != -1)
result[i0] = (result[i0] != -1 && result[i0] < result[i1] + result[i2]) ?
result[i0] : (result[i1] + result[i2]);
// barrier across the threads of this block only
__syncthreads();
}
}
}
// Fills the N x N matrix `mat` with pseudo-random values in -1..30, drawing
// one rand() value per element in storage order.
void GenMatrix(int *mat, const size_t N)
{
    int *const stop = mat + N*N;
    for (int *cell = mat; cell != stop; ++cell)
        *cell = rand()%32 - 1;
}
// Compares the first eleNum elements of l and r.
// Returns true when all of them are equal; otherwise prints the first
// differing position with both values and returns false.
bool CmpArray(const int *l, const int *r, const size_t eleNum)
{
    int pos = 0;
    while (pos < eleNum)
    {
        if (l[pos] == r[pos])
        {
            ++pos;
            continue;
        }
        printf("ERROR: l[%d] = %d, r[%d] = %d\n", pos, l[pos], pos, r[pos]);
        return false;
    }
    return true;
}
// Test driver: generates a random N x N matrix, computes a reference result on
// the CPU, runs gpu_Floyd on a copy of the matrix and compares both results.
int main(int argc, char **argv)
{
// generate a random matrix.
size_t N = 10;
int *mat = (int*)malloc(sizeof(int)*N*N);
GenMatrix(mat, N);
// compute the reference result.
int *ref = (int*)malloc(sizeof(int)*N*N);
memcpy(ref, mat, sizeof(int)*N*N);
// NOTE(review): the function defined in this listing is spelled
// floyd_sequential (lower-case f).
Floyd_sequential(ref, N);
//CUDA Portion
// NOTE(review): the four variables declared on the next two lines are not
// used anywhere below.
int Grid_Dim_x = 1, Grid_Dim_y = 1;
int noThreads_x, noThreads_y;
int *result = (int*)malloc(sizeof(int)*N*N);
memcpy(result, mat, sizeof(int)*N*N);
int *d_result;
// compute your results
// NOTE(review): the size passed here is N*N, without sizeof(int), unlike the
// host allocations above.
cudaMalloc((void **)&d_result, N*N);
// NOTE(review): this call passes three arguments and the one after the launch
// passes three as well; neither names both a destination and a source
// together with a byte count.
cudaMemcpy(result, N * N, cudaMemcpyHostToDevice);
// 1-D launch: 1024 blocks of 256 threads
gpu_Floyd<<<1024, 256>>>(d_result, N);
cudaMemcpy(result, d_result, cudaMemcpyDeviceToHost);
// compare your result with reference result
if(CmpArray(result, ref, N*N))
printf("The matrix matches.\n");
else
printf("The matrix do not match.\n");
// NOTE(review): mat is not freed.
free(ref);
free(result);
cudaFree(d_result);
}
However, my output always shows the matrices do not match.
I understand that in CUDA we usually map each element of the matrix to a thread. However, I am trying to explore other possibilities by mapping each row of the matrix to a thread instead.
As has already been mentioned, your provided GPU code does not compile, so I'm curious how you got to the observation that your output matrices do not match.
Here are some of the problems with your code:
cudaMalloc, just like malloc allocates bytes, so this is not correct:
cudaMalloc((void **)&d_result, N*N);
instead you want this:
cudaMalloc((void **)&d_result, N*N*sizeof(int));
likewise cudaMemcpy, just like memcpy, operates on bytes, and furthermore cudaMemcpy requires 4 parameters so this is not correct:
cudaMemcpy(result, N * N, cudaMemcpyHostToDevice);
instead you probably want this:
cudaMemcpy(d_result, result, N * N*sizeof(int), cudaMemcpyHostToDevice);
and your other cudaMemcpy line needs to be fixed similarly.
I'd also advise doing proper cuda error checking
Your kernel is written as if it's expecting a 2 dimensional thread array, or at least one dimensional in y, whereas you are launching a one dimensional grid in x:
gpu_Floyd<<<1024, 256>>>(d_result, N);
therefore the built-in variables in y are constant for every thread (blockDim.y is always 1, while blockIdx.y and threadIdx.y are always 0), and this line of code:
int Row = blockIdx.y * blockDim.y + threadIdx.y;
will evaluate to zero for all threads in your 1-D grid in x.
Your gpu kernel is putting the results in the same matrix as the input data. For sequential code this may or may not matter, but for code that is intended to run in parallel, it can often lead to race conditions, because the order of operations (i.e. order of thread execution) is largely undefined.
Below you will find a canonical, simple implementation of the Floyd-Warshall algorithm in CUDA.
The CUDA code is accompanied with a sequential implementation and both are based on the simplifying assumption that the edges are non-negative. The full, minimum distance paths are also reconstructed in both the cases. Despite the simplifying assumption, it should be possible to grasp the relevant parallelization idea, namely that a two-dimensional thread grid is exploited and that each thread along x is assigned to a matrix column, while each block along y is assigned to a matrix row. In this way, all the columns are loaded by the threadIdx.x == 0 threads of each block in shared memory.
// --- Assumption: graph with positive edges
#include <stdio.h>
#include <string>
#include <map>
#include <iostream>
#include <fstream>
#include "Utilities.cuh"
#define BLOCKSIZE 256
using namespace std;
map<string, int> nameToNum; // --- names of vertices
map<string, map<string, int>> weightMap; // --- weights of edges
/************************/
/* READ GRAPH FROM FILE */
/************************/
// Reads a weighted directed graph from a text file and returns it as a dense
// N x N row-major weight matrix allocated with malloc.
//
// The file is a sequence of "vertex1 vertex2 weight" records terminated by the
// token "--END--". Reading also stops at end of file, at a malformed record,
// or immediately if the file cannot be opened.
//
//   N        - output: number of distinct vertices found
//   fileName - path of the graph file
// Returns the weight matrix, in which entry (a, b) is the weight of edge a->b,
// 0 on the diagonal, and INT_MAX / 2 ("infinity") where no edge was read.
// Returns NULL if the matrix could not be allocated (malloc may also return
// NULL when N is 0). The caller owns the returned memory.
// Side effects: fills the globals nameToNum (vertex name -> index) and
// weightMap (edge weights by vertex names).
int *readGraphFromFile(int &N, char *fileName) {
    string vertex1, vertex2;
    int currentWeight;
    N = 0;                                                       // --- Init the number of found vertices
    ifstream graphFile(fileName);                                // --- Open the graph file
    // --- Loop until the end marker is found or the stream can no longer be read
    //     (a stream that failed to open fails the first extraction)
    while ((graphFile >> vertex1) && vertex1 != "--END--") {
        // --- Read second vertex and the weight between first and second vertex
        if (!(graphFile >> vertex2 >> currentWeight)) break;     // --- Truncated or malformed record
        if (nameToNum.count(vertex1) == 0) {                     // --- If vertex has not yet been added ...
            nameToNum[vertex1] = N;                              //     assign a progressive number to the vertex
            weightMap[vertex1][vertex1] = 0;                     //     assign a zero weight to the "self-edge"
            N++;                                                 // --- Update the found number of vertices
        }
        if (nameToNum.count(vertex2) == 0) {
            nameToNum[vertex2] = N;
            weightMap[vertex2][vertex2] = 0;
            N++;
        }
        weightMap[vertex1][vertex2] = currentWeight;             // --- Update weight between vertices 1 and 2
    }
    graphFile.close();                                           // --- Close the graph file
    // --- Construct the array (size computed in size_t so that N * N cannot overflow int)
    int *weightMatrix = (int*) malloc((size_t)N * N * sizeof(int));
    if (weightMatrix == NULL) return NULL;
    // --- Loop over all the vertex couples in the weights matrix
    for (int ii = 0; ii < N; ii++)
        for (int jj = 0; jj < N; jj++)
            weightMatrix[ii * N + jj] = INT_MAX / 2;             // --- Init the weights matrix elements to infinity
    map<string, int>::iterator i, j;
    // --- Loop over all the vertex couples in the map
    //     (*i).first and (*j).first are the vertex names, while (*i).second and (*j).second are their corresponding indices
    for (i = nameToNum.begin(); i != nameToNum.end(); ++i)
        for (j = nameToNum.begin(); j != nameToNum.end(); ++j) {
            // --- If there is a connection between vertices (*i).first and (*j).first, then update the weight matrix
            if (weightMap[(*i).first].count((*j).first) != 0)
                weightMatrix[N * (*i).second + (*j).second] = weightMap[(*i).first][(*j).first];
        }
    return weightMatrix;
}
/************************************/
/* PRINT MINIMUM DISTANCES FUNCTION */
/************************************/
void printMinimumDistances(int N, int *a) {
map<string, int>::iterator i;
// --- Prints all the node labels at the first row
for (i = nameToNum.begin(); i != nameToNum.end(); ++i) printf("\t%s", i->first.c_str());
printf("\n");
i = nameToNum.begin();
// --- Loop over the rows
for (int p = 0; p < N; p++) {
printf("%s\t", i -> first.c_str());
// --- Loop over the columns
for (int q = 0; q < N; q++) {
int dd = a[p * N + q];
if (dd != INT_MAX / 2) printf("%d\t", dd);
else printf("--\t");
}
printf("\n");
i++;
}
}
void printPathRecursive(int row, int col, int *minimumDistances, int *path, int N) {
map<string, int>::iterator i = nameToNum.begin();
map<string, int>::iterator j = nameToNum.begin();
if (row == col) {advance(i, row); printf("%s\t", i -> first.c_str()); }
else {
if (path[row * N + col] == INT_MAX / 2) printf("%row %row %row No path exists\t\n", minimumDistances[row * N + col], row, col);
else {
printPathRecursive(row, path[row * N + col], minimumDistances, path, N);
advance(j, col);
printf("%s\t", j -> first.c_str());
}
}
}
void printPath(int N, int *minimumDistances, int *path) {
map<string, int>::iterator i;
map<string, int>::iterator j;
// --- Loop over the rows
i = nameToNum.begin();
for (int p = 0; p < N; p++) {
// --- Loop over the columns
j = nameToNum.begin();
for (int q = 0; q < N; q++) {
printf("From %s to %s\t", i -> first.c_str(), j -> first.c_str());
printPathRecursive(p, q, minimumDistances, path, N);
printf("\n");
j++;
}
i++;
}
}
/**********************/
/* FLOYD-WARSHALL CPU */
/**********************/
// CPU Floyd-Warshall with path reconstruction, working in place on two N x N
// row-major matrices. Whenever going through vertex k gives a strictly shorter
// distance from `row` to `col`, the distance is replaced and the predecessor
// of `col` is taken from the path that starts at k.
//   h_graphMinimumDistances - distance matrix, updated in place
//   h_graphPath             - predecessor matrix, updated in place
//   N                       - number of vertices
void h_FloydWarshall(int *h_graphMinimumDistances, int *h_graphPath, const int N) {
    for (int k = 0; k < N; k++) {
        for (int row = 0; row < N; row++) {
            for (int col = 0; col < N; col++) {
                const int target = row * N + col;
                // length of the route row -> k -> col with the current estimates
                const int viaK = h_graphMinimumDistances[row * N + k] + h_graphMinimumDistances[k * N + col];
                if (viaK < h_graphMinimumDistances[target]) {
                    h_graphMinimumDistances[target] = viaK;
                    h_graphPath[target] = h_graphPath[k * N + col];
                }
            }
        }
    }
}
/*************************/
/* FLOYD-WARSHALL KERNEL */
/*************************/
// One Floyd-Warshall relaxation step for a fixed intermediate vertex k:
// every thread handles one matrix element (row, col) and replaces its distance
// by dist(row, k) + dist(k, col) when that is strictly smaller, recording the
// predecessor taken from row k of the path matrix.
//   k                       - intermediate vertex of this step
//   d_graphMinimumDistances - N x N row-major distance matrix, updated in place;
//                             INT_MAX / 2 stands for "infinity"
//   d_graphPath             - N x N row-major predecessor matrix, updated in place
//   N                       - number of vertices
// Launch layout: threads along x map to columns, blockIdx.y is the row, so the
// grid needs N blocks along y and enough threads along x to cover N columns;
// row and k must be smaller than N.
__global__ void d_FloydWarshall(int k, int *d_graphMinimumDistances, int *d_graphPath, int N) {
    int col = blockIdx.x * blockDim.x + threadIdx.x;    // --- Each thread along x is assigned to a matrix column
    int row = blockIdx.y;                               // --- Each block along y is assigned to a matrix row
    int arrayIndex = N * row + col;
    // --- Element (row, k) is the same for every thread of the block, so thread 0 loads it once into shared memory
    __shared__ int d_graphMinimumDistances_row_k;
    if(threadIdx.x == 0) d_graphMinimumDistances_row_k = d_graphMinimumDistances[N * row + k];
    __syncthreads();
    // --- Threads beyond the last column leave only after the barrier, so that every thread of the block reaches it
    if (col >= N) return;
    if (d_graphMinimumDistances_row_k == INT_MAX / 2)   // --- If element (row, k) = infinity, no update is needed
        return;
    int d_graphMinimumDistances_k_col = d_graphMinimumDistances[k * N + col];
    if(d_graphMinimumDistances_k_col == INT_MAX / 2)    // --- If element (k, col) = infinity, no update is needed
        return;
    int candidateBetterDistance = d_graphMinimumDistances_row_k + d_graphMinimumDistances_k_col;
    if (candidateBetterDistance < d_graphMinimumDistances[arrayIndex]) {
        d_graphMinimumDistances[arrayIndex] = candidateBetterDistance;
        d_graphPath[arrayIndex] = d_graphPath[k * N + col];
    }
}
/********/
/* MAIN */
/********/
/**
 * Initializes the predecessor matrix from the adjacency matrix: entry
 * (k, l) becomes k when an edge k -> l exists and INT_MAX / 2 otherwise.
 */
static void initializeGraphPath(const int *h_graphArray, int *h_graphPath, const int N) {
    for (int k = 0; k < N; k++)
        for (int l = 0; l < N; l++)
            h_graphPath[k * N + l] = (h_graphArray[k * N + l] == INT_MAX / 2) ? INT_MAX / 2 : k;
}

/**
 * Reads a graph from "graph2.txt", runs Floyd-Warshall on the CPU and on the
 * GPU, and prints distances and paths for both.
 */
int main() {

    int N = 0; // --- Number of vertices

    // --- Read graph array from file
    int *h_graphArray = readGraphFromFile(N, "graph2.txt");
    if (h_graphArray == NULL || N <= 0) {
        // An empty graph would lead to zero-sized allocations and an invalid (0 x 0) launch grid.
        fprintf(stderr, "No graph data could be read from graph2.txt\n");
        return EXIT_FAILURE;
    }
    const size_t matrixBytes = (size_t)N * N * sizeof(int);

    printf("\n******************\n");
    printf("* Original graph *\n");
    printf("******************\n");
    printMinimumDistances(N, h_graphArray);

    // --- Floyd-Warshall on CPU
    int *h_graphMinimumDistances = (int *) malloc(matrixBytes);
    int *h_graphPath = (int *) malloc(matrixBytes);
    if (h_graphMinimumDistances == NULL || h_graphPath == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        free(h_graphMinimumDistances);
        free(h_graphPath);
        return EXIT_FAILURE;
    }
    memcpy(h_graphMinimumDistances, h_graphArray, matrixBytes);
    initializeGraphPath(h_graphArray, h_graphPath, N);

    h_FloydWarshall(h_graphMinimumDistances, h_graphPath, N);

    printf("\n*************************\n");
    printf("* CPU result: distances *\n");
    printf("*************************\n");
    printMinimumDistances(N, h_graphMinimumDistances);

    printf("\n********************\n");
    printf("* CPU result: path *\n");
    printf("********************\n");
    printPath(N, h_graphMinimumDistances, h_graphPath);

    // --- Graph array device allocation and host-device memory transfer
    int *d_graphMinimumDistances; gpuErrchk(cudaMalloc(&d_graphMinimumDistances, matrixBytes));
    gpuErrchk(cudaMemcpy(d_graphMinimumDistances, h_graphArray, matrixBytes, cudaMemcpyHostToDevice));

    // The CPU run overwrote h_graphPath, so rebuild the initial predecessors before uploading.
    int *d_graphPath; gpuErrchk(cudaMalloc(&d_graphPath, matrixBytes));
    initializeGraphPath(h_graphArray, h_graphPath, N);
    gpuErrchk(cudaMemcpy(d_graphPath, h_graphPath, matrixBytes, cudaMemcpyHostToDevice));

    // --- Iterations (one kernel launch per intermediate vertex k; launches on the same stream run in order)
    for (int k = 0; k < N; k++) {
        d_FloydWarshall <<<dim3(iDivUp(N, BLOCKSIZE), N), BLOCKSIZE>>>(k, d_graphMinimumDistances, d_graphPath, N);
        // Launch-configuration errors are cheap to detect, so check them in every build.
        gpuErrchk(cudaPeekAtLastError());
#ifdef DEBUG
        gpuErrchk(cudaDeviceSynchronize());
#endif
    }

    // --- Copy results back to the host
    gpuErrchk(cudaMemcpy(h_graphMinimumDistances, d_graphMinimumDistances, matrixBytes, cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_graphPath, d_graphPath, matrixBytes, cudaMemcpyDeviceToHost));

    printf("\n**************\n");
    printf("* GPU result *\n");
    printf("**************\n");
    printMinimumDistances(N, h_graphMinimumDistances);

    printf("\n********************\n");
    printf("* GPU result: path *\n");
    printf("********************\n");
    printPath(N, h_graphMinimumDistances, h_graphPath);

    // --- Release everything this function allocated
    gpuErrchk(cudaFree(d_graphMinimumDistances));
    gpuErrchk(cudaFree(d_graphPath));
    free(h_graphMinimumDistances);
    free(h_graphPath);
    // NOTE(review): h_graphArray is owned by readGraphFromFile; release it with whatever
    // allocator that function uses (malloc vs. new[]) -- not visible from here.

    return 0;
}

Matrix Multiplication giving wrong output [duplicate]

This question already has an answer here:
Unable to execute device kernel in CUDA
(1 answer)
Closed 7 years ago.
What I am attempting to do is multiply matrix A by matrix B and then, from the product matrix, get the index of the maximum value per column. Unfortunately, only the first 128*128 values of the matrix multiplication are correct, while the others are just garbage. I do not quite understand why this happens. Could you kindly guide me with this?
#include<stdio.h>
#include "cuda.h"
#include<stdlib.h>
#define blockD 32
const int wA = 128;
const int hA = 4096;
const int wB = 4096;
const int hB = wA;
/**
 * Host driver: fills M (hA x wA) and N (hB x wB) with constants, multiplies
 * them on the GPU via MatrixMultiplication(), and writes the last two rows of
 * the product to "C.txt" (tab-separated, truncated to int).
 */
int main(void) {   // FIX: C++ has no implicit-int rule, so main needs an explicit return type
    void MatrixMultiplication(float *, float *, float *, float *);

    const size_t size_A = (size_t)wA * hA * sizeof(float);
    const size_t size_B = (size_t)wB * hB * sizeof(float);
    const size_t size_C = (size_t)wB * hA * sizeof(float);
    const size_t size_max = (size_t)2 * wB * sizeof(float);

    // allocate memory on the CPU
    float *M = (float*)malloc(size_A);
    float *N = (float*)malloc(size_B);
    float *P = (float*)malloc(size_max);
    float *C = (float*)malloc(size_C);
    if (M == NULL || N == NULL || P == NULL || C == NULL) {
        fprintf(stderr, "host allocation failed\n");
        free( M );
        free( N );
        free( P );
        free( C );
        return 1;
    }

    // initialize the matrices
    for (int y = 0; y < hA; y++) {
        for (int x = 0; x < wA; x++) {
            M[y*wA + x] = 32; //x + y*wA;
        }
    }
    for (int y = 0; y < hB; y++) {
        for (int x = 0; x < wB; x++) {
            N[y*wB + x] = 21; //x + y*wB;
        }
    }

    MatrixMultiplication(M, N, P, C);

    // Write the last two rows of the product matrix
    int status = 0;
    FILE *f1 = fopen("C.txt", "w");
    if (f1 == NULL) {
        perror("C.txt");
        status = 1;
    } else {
        for (int i = hA - 2; i < hA; i++) {
            for (int j = 0; j < wB; j++) {
                fprintf(f1, "%d\t", int(C[i*wB + j]));
            }
            fprintf(f1, "\n");
        }
        fclose(f1);
    }

    // free the memory allocated on the CPU
    free( M );
    free( N );
    free( P );
    free( C );
    cudaDeviceReset();
    return status;
}
/**
 * Finds, for one column of the hA x wB row-major product matrix Pd, the
 * largest value and the flat index (row * wB + col) at which it occurs.
 *
 * Output layout: max[2 * col] = maximum value, max[2 * col + 1] = flat index
 * (stored as float), i.e. 2 * wB floats in total.
 *
 * Changes relative to the previous version:
 *  - the running maximum is a float seeded from the first element (it was an
 *    int seeded with 0, which truncated values and failed on negative data);
 *  - the scan runs down a column and the result is stored under that same
 *    column (previously row `x` was scanned but the result was stored under `y`);
 *  - only the threads with global y == 0 do the work, one per column, instead
 *    of every thread repeating the scan and overwriting the same slots.
 *
 * NOTE(review): this reads elements of Pd written by other thread blocks.
 * __syncthreads() in the caller only orders threads of one block, so the
 * result is only reliable if all of Pd is complete before this runs, e.g. by
 * calling it from a separate kernel launched after the multiplication.
 */
__device__ void MaxFunction(float* Pd, float* max)
{
    const int col = threadIdx.x + blockIdx.x * blockDim.x;
    const int row = threadIdx.y + blockIdx.y * blockDim.y;

    // One worker thread per column; everything else has nothing to do.
    if (row != 0 || col >= wB) return;

    float best = Pd[col];   // element (0, col)
    int bestIdx = col;
    for (int r = 1; r < hA; ++r) {
        const int idx = r * wB + col;
        const float value = Pd[idx];
        if (value > best) {
            best = value;
            bestIdx = idx;
        }
    }

    max[col * 2 + 0] = best;
    max[col * 2 + 1] = (float)bestIdx;
}
/**
 * Tiled matrix multiply Pd = Md * Nd using shared-memory tiles, followed by a
 * call to MaxFunction on the result.
 *
 * Layout (all row-major): Md is hA x wA, Nd is hB x wB (hB == wA), Pd is hA x wB.
 * Launch requirements: blockDim must be (blockD, blockD); block (bx, by)
 * produces the blockD x blockD output tile at rows by*blockD.., columns
 * bx*blockD.., so a full product needs a (wB/blockD) x (hA/blockD) grid.
 * There are no bounds checks: wA, wB and hA are assumed to be multiples of blockD.
 *
 * FIX: Nd and Pd both have wB elements per row. The previous code stepped
 * through Nd with strides of hB / wA and wrote Pd with a stride of hB, which
 * only went unnoticed because the test input for Nd was a constant matrix.
 */
__global__ void MatrixMulKernel(float* Md, float* Nd, float* Pd, float* max)
{
    // declare cache in the shared memory
    __shared__ float Mds[blockD][blockD];
    __shared__ float Nds[blockD][blockD];

    float Pvalue = 0;

    // First element of the Md row band handled by this block, and the loop bound for that band.
    const int mBegin = wA * blockD * blockIdx.y;
    const int mEnd = mBegin + wA - 1;

    // Loop over the Md and Nd tiles required to compute the Pd tile:
    // m walks right along the Md row band, n walks down the Nd column band.
    for (int m = mBegin, n = blockD * blockIdx.x;
         m < mEnd;
         m += blockD, n += blockD * wB) {
        // collaboratively load one Md tile and one Nd tile into shared memory
        Mds[threadIdx.y][threadIdx.x] = Md[m + wA * threadIdx.y + threadIdx.x];
        Nds[threadIdx.y][threadIdx.x] = Nd[n + wB * threadIdx.y + threadIdx.x];
        __syncthreads();

        // keep track of the running sum
        for (int k = 0; k < blockD; k++)
            Pvalue += Mds[threadIdx.y][k] * Nds[k][threadIdx.x];

        // make sure nobody is still reading the tiles before they are overwritten
        __syncthreads();
    }

    // write back to the global memory (Pd has wB columns)
    const int p = wB * blockD * blockIdx.y + blockD * blockIdx.x;
    Pd[p + wB * threadIdx.y + threadIdx.x] = Pvalue;
    __syncthreads();

    // NOTE(review): this barrier only covers the current block, while MaxFunction
    // reads Pd elements produced by other blocks; consider running the max search
    // in a separate kernel launch.
    MaxFunction(Pd, max);
}
/** Aborts with a message naming the failed step if `status` is not cudaSuccess. */
static void checkCudaOrDie(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/**
 * Host wrapper: copies M (hA x wA) and N (hB x wB) to the device, launches
 * MatrixMulKernel, and copies the results back.
 *
 * @param M  host input matrix, hA x wA, row-major
 * @param N  host input matrix, hB x wB, row-major
 * @param P  host output buffer of 2 * wB floats receiving the kernel's `max` array
 * @param C  host output buffer of hA * wB floats receiving the product matrix
 */
void MatrixMultiplication(float *M, float *N, float *P, float *C) {
    const size_t size_A = (size_t)wA * hA * sizeof(float);
    const size_t size_B = (size_t)wB * hB * sizeof(float);
    const size_t size_C = (size_t)wB * hA * sizeof(float);
    const size_t size_max = (size_t)2 * wB * sizeof(float);
    float *Md, *Nd, *Pd, *max;

    // allocate memory on the GPU
    checkCudaOrDie(cudaMalloc((void**)&Md, size_A), "cudaMalloc(Md)");
    checkCudaOrDie(cudaMalloc((void**)&Nd, size_B), "cudaMalloc(Nd)");
    checkCudaOrDie(cudaMalloc((void**)&Pd, size_C), "cudaMalloc(Pd)");
    checkCudaOrDie(cudaMalloc((void**)&max, size_max), "cudaMalloc(max)");

    // transfer M and N to device memory
    checkCudaOrDie(cudaMemcpy(Md, M, size_A, cudaMemcpyHostToDevice), "copy of M to device");
    checkCudaOrDie(cudaMemcpy(Nd, N, size_B, cudaMemcpyHostToDevice), "copy of N to device");

    // kernel invocation code
    dim3 dimBlock(blockD, blockD);
    // FIX: one thread per element of the hA x wB product, i.e. wB/blockD blocks
    // along x and hA/blockD along y. The grid used to be sized from wA and hB
    // (the inner dimension), which covered only a 128 x 128 corner of the output.
    dim3 dimGrid(wB/blockD, hA/blockD);

    //Execute Kernel
    MatrixMulKernel<<<dimGrid, dimBlock>>>( Md, Nd, Pd, max);
    checkCudaOrDie(cudaGetLastError(), "MatrixMulKernel launch");

    // transfer results from device (these blocking copies also surface kernel execution errors)
    checkCudaOrDie(cudaMemcpy(P, max, size_max, cudaMemcpyDeviceToHost), "copy of max to host");
    checkCudaOrDie(cudaMemcpy(C, Pd, size_C, cudaMemcpyDeviceToHost), "copy of Pd to host");

    // free the memory allocated on the GPU
    cudaFree(Md);
    cudaFree(Nd);
    cudaFree(Pd);
    cudaFree(max);
}
In your code you seem to have more than one problem. One of the problems is, in place of this:
dim3 dimGrid(wA/blockD, hB/blockD);
You should have this:
dim3 dimGrid(wB/blockD, hA/blockD);
Ultimately you need one thread in your grid for each output point. Your formulation was giving you a grid of 4 blocks by 4 blocks, whereas you need a grid of 128 blocks by 128 blocks.
The other problem I found with your code was in these lines in the kernel:
int p = hB * blockD * blockIdx.y + blockD * blockIdx.x;
Pd[p + hB * threadIdx.y + threadIdx.x] = Pvalue;
They are not indexing properly through the output array. Rather than try to sort it out using your scheme, I used this instead:
Pd[(threadIdx.x + (blockIdx.x * blockDim.x)) + ((threadIdx.y + (blockIdx.y * blockDim.y))*(gridDim.x*blockDim.x))] = Pvalue;
When I made the above two changes to your code, I got what I believe are correct results throughout the array. And it took about 32 seconds on my machine to run it. (Note that I haven't tried fixing your original max-finding code -- see below for a better approach.)
Based on your previous question, you seemed to be concerned about speed. If you want to do fast matrix multiply, you should use cublas. The following code shows how to use cublas to multiply two ordinary C-style matrices (they don't have to be square). I've also included a column-max finding kernel that will be fast when the number of columns is large (say, over 500 or so. You have 4096 columns in your example). For small numbers of columns, there may be quicker ways to perform this function, but small numbers of columns also suggests that the overall problem size may be small and so speed (of this piece of code) will not really be an issue.
Here's the code:
#include <stdio.h>
#include <cublas_v2.h>
#define VERBOSE 1
#define nTPB 64
#define ROW_A 4
#define COL_A 4
#define ROW_B COL_A
#define COL_B 4
#define ROW_C ROW_A
#define COL_C COL_B
#define SIZ_A (ROW_A*COL_A)
#define SIZ_B (ROW_B*COL_B)
#define SIZ_C (ROW_C*COL_C)
// Error check macro for the CUDA runtime API.
// Usage: place cudaCheckErrors("description") after one or more CUDA calls or a
// kernel launch. It queries cudaGetLastError(); if an error is pending it prints
// `msg`, the CUDA error string and the file/line of the check to stderr, then
// terminates the program with exit(1).
// The body is wrapped in do { ... } while (0) so the macro behaves as a single statement.
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
// Error check macro for the CUBLAS V2 API.
// Usage: wrap the call itself, e.g. cublasCheckErrors(cublasCreate(&handle)).
// `fn` is evaluated exactly once; if its status is not CUBLAS_STATUS_SUCCESS the
// numeric status code and the file/line are printed to stderr and the program
// terminates with exit(1).
#define cublasCheckErrors(fn) \
do { \
cublasStatus_t __err = fn; \
if (__err != CUBLAS_STATUS_SUCCESS) { \
fprintf(stderr, "Fatal cublas error: %d (at %s:%d)\n", \
(int)(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
/**
 * Column-wise maximum of a row-major `rows` x `cols` matrix.
 *
 * One thread per column: thread `c` walks down column c and records the
 * largest value in max[c] and the row where it first occurs in midx[c]
 * (ties keep the earliest row because the comparison is strict).
 * Launch with a 1D grid providing at least `cols` threads; surplus threads exit.
 * Adjacent threads read adjacent addresses at every step of the walk.
 */
__global__ void col_max(float *mat, float *max, unsigned int *midx, unsigned int rows, unsigned int cols){
    const unsigned int column = threadIdx.x + blockDim.x*blockIdx.x;
    if (column >= cols) return;

    // Start from the first row, then scan the remaining ones.
    float best = mat[column];
    unsigned int bestRow = 0;
    for (unsigned int r = 1; r < rows; ++r) {
        const float candidate = mat[column + (r*cols)];
        if (candidate > best) {
            best = candidate;
            bestRow = r;
        }
    }

    max[column] = best;
    midx[column] = bestRow;
}
/**
 * Demo driver: multiplies two small row-major matrices with cuBLAS, finds the
 * per-column maximum of the product with col_max, and prints everything when
 * VERBOSE is non-zero.
 *
 * Compared with the previous version, the cuBLAS handle and all device/host
 * buffers are now released before returning on the success path.
 */
int main(){
    float *h_A, *h_B, *h_C, *d_A, *d_B, *d_C, *h_max, *d_max;
    unsigned int *h_idx, *d_idx;

    h_A = (float *)malloc(SIZ_A*sizeof(float));
    if (h_A==0) {printf("malloc fail\n"); return -1;}
    h_B = (float *)malloc(SIZ_B*sizeof(float));
    if (h_B==0) {printf("malloc fail\n"); return -1;}
    h_C = (float *)malloc(SIZ_C*sizeof(float));
    if (h_C==0) {printf("malloc fail\n"); return -1;}
    h_max = (float *)malloc(COL_C*sizeof(float));
    if (h_max==0) {printf("malloc fail\n"); return -1;}
    h_idx = (unsigned int*)malloc(COL_C*sizeof(unsigned int));
    if (h_idx==0) {printf("malloc fail\n"); return -1;}

    cudaMalloc((void **)&d_A, SIZ_A*sizeof(float));
    cudaMalloc((void **)&d_B, SIZ_B*sizeof(float));
    cudaMalloc((void **)&d_C, SIZ_C*sizeof(float));
    cudaMalloc((void **)&d_max, COL_C*sizeof(float));
    cudaMalloc((void **)&d_idx, COL_C*sizeof(unsigned int));
    cudaCheckErrors("cuda malloc fail");

    // initialize data
    for (int i=0; i< SIZ_A; i++) h_A[i] = (float)(i+1);
    for (int i=0; i< SIZ_B; i++) h_B[i] = (float)(i+2);

    cudaMemcpy(d_A, h_A, SIZ_A*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, SIZ_B*sizeof(float), cudaMemcpyHostToDevice);
    cudaCheckErrors("cuda memcpy 1 fail");

    const float alpha = 1.0f;
    const float beta = 0.0f;
    cublasHandle_t handle;
    cublasCheckErrors(cublasCreate(&handle));

    // C = A*B
    // due to cublas expecting column-major storage, parameters
    // are scrambled: a row-major matrix read as column-major is its transpose,
    // so computing B^T * A^T in column-major terms leaves C = A*B in row-major order.
    cublasCheckErrors(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, COL_B, ROW_A, COL_A, &alpha, d_B, COL_B, d_A, COL_A, &beta, d_C, COL_C));
    cudaMemcpy(h_C, d_C, SIZ_C*sizeof(float), cudaMemcpyDeviceToHost);
    cudaCheckErrors("cuda memcpy 2 fail");

    col_max<<<(COL_C + nTPB - 1)/nTPB, nTPB>>>(d_C, d_max, d_idx, ROW_C, COL_C);
    cudaCheckErrors("kernel launch fail");
    cudaMemcpy(h_max, d_max, COL_C*sizeof(float), cudaMemcpyDeviceToHost);
    cudaMemcpy(h_idx, d_idx, COL_C*sizeof(unsigned int), cudaMemcpyDeviceToHost);
    cudaCheckErrors("cuda memcpy 3 fail/kernel fail");

    if (VERBOSE){
        printf("A: \n");
        for (int i=0; i< ROW_A; i++){
            for (int j=0; j< COL_A; j++)
                printf("%7.5G", h_A[j+(i*COL_A)]);
            printf("\n");}
        printf("B: \n");
        for (int i=0; i< ROW_B; i++){
            for (int j=0; j< COL_B; j++)
                printf("%7.5G", h_B[j+(i*COL_B)]);
            printf("\n");}
        printf("C = A*B: \n");
        for (int i=0; i< ROW_C; i++){
            for (int j=0; j< COL_C; j++)
                printf("%7.5G", h_C[j+(i*COL_C)]);
            printf("\n");}
        printf("COLUMN MAX:\n");
        for (int i=0; i< COL_C; i++)
            printf("%7.5G", h_max[i]);
        printf("\nCOLUMN MAX IDX:\n");
        for (int i=0; i< COL_C; i++)
            printf("%7d", h_idx[i]);
    }

    // release the cuBLAS handle and every buffer allocated above
    cublasCheckErrors(cublasDestroy(handle));
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    cudaFree(d_max);
    cudaFree(d_idx);
    cudaCheckErrors("cuda free fail");
    free(h_A);
    free(h_B);
    free(h_C);
    free(h_max);
    free(h_idx);

    printf("\n finished!\n");
    return 0;
}
Here's what I used to compile:
$ nvcc -arch=sm_20 -O3 -o t221 t221.cu -lcublas
And here's the sample output:
$ cuda-memcheck ./t221
========= CUDA-MEMCHECK
A:
1 2 3 4
5 6 7 8
9 10 11 12
13 14 15 16
B:
2 3 4 5
6 7 8 9
10 11 12 13
14 15 16 17
C = A*B:
100 110 120 130
228 254 280 306
356 398 440 482
484 542 600 658
COLUMN MAX:
484 542 600 658
COLUMN MAX IDX:
3 3 3 3
finished!
========= ERROR SUMMARY: 0 errors
$
When I extended my code to handle the same sizes you indicated, (A = 4096x128, B=128x4096) it took about 1 second on my machine. So it's much faster than your code. However, when I take your code and comment out your call to MaxFunction in the kernel, it also only takes about 1 second to compute the matrix multiply result. So if you wanted to keep your matrix multiply code (i.e. not use cublas) you could break the code into 2 kernels, and use your multiply routine in the first kernel with my max-finding routine (col_max) in the second kernel, and also probably get a pretty fast result.
As @talonmies indicated, if you are running on a Windows machine, be sure you are aware of the ramifications of Windows TDR (search for that in the upper-right search box if needed).

CUDA C/C++: Calculate the average of inverse of distance per point (interaction energy, perhaps?)

I've been trying to write a CUDA kernel that calculates the sum of the inverses of the distances between N given points, divided by N. A serial code in C would look like
average = 0;
for(int i = 0; i < Np; i++){
for(int j = i + 1; j < Np; j++){
average += 1.0e0f/sqrtf((rx[i]-rx[j])*(rx[i]-rx[j]) + (ry[i]-ry[j])*(ry[i]-ry[j]));
}
}
average = average/(float)N;
Where rx and ry are the x and y coordinates, respectively.
I generate the points via a kernel that uses a random number generator. For the kernel, I used 128 (256) threads per block for 4k (8k) points. In it, every thread performs the inner loop shown above, and the results are then passed to a sum-reduction function, as follows
Generate points:
/**
 * Seeds one cuRAND generator state per launched thread.
 *
 * Every thread uses the same `seed` and its own global thread index as the
 * sequence number, with offset 0. There is no bounds check, so `state` must
 * hold at least gridDim.x * blockDim.x entries.
 */
__global__ void InitRNG ( curandState * state, const int seed ){
    const int globalThread = threadIdx.x + blockDim.x*blockIdx.x;
    curandState *myState = state + globalThread;
    curand_init(seed, globalThread, 0, myState);
}
/**
 * Draws one random 2D point per thread and stores it in X[i], Y[i].
 *
 * Each thread takes two uniform samples from its own generator state: the
 * first sets the radius as sqrt(0.25 * N * u1), the second sets the polar
 * angle as 2 * pi * u2. Threads with a global index >= N do nothing.
 *
 * @param X, Y   output coordinate arrays with at least N entries each
 * @param N      number of points to generate
 * @param state  per-thread cuRAND states (advanced in place)
 */
__global__
void SortPoints(float* X, float* Y,const int N, curandState *state){
    const unsigned int gid = blockIdx.x*blockDim.x + threadIdx.x;
    if (!(gid < N)) return;

    curandState *rng = &state[gid];
    const float uRadius = curand_uniform(rng);
    const float uAngle = curand_uniform(rng);

    const float radius = sqrtf(0.25e0f*N*uRadius);
    const float theta = 2.0e0f*pi*uAngle;
    X[gid] = radius*cosf(theta);
    Y[gid] = radius*sinf(theta);
}
Reduction:
/**
 * Block-wide sum of one float per thread, done as a halving tree in shared memory.
 *
 * Must be called by every thread of the block, because it contains
 * __syncthreads(). The scratch array is statically sized, so blockDim.x must
 * not exceed BlockSize; because the active range is halved each step, the sum
 * is only complete when blockDim.x is a power of two.
 *
 * @param In  this thread's contribution
 * @return    the sum over the block (read from slot 0 by every thread)
 */
__device__
float ReduceSum2(float In){
    __shared__ float data[BlockSize];

    const unsigned int lane = threadIdx.x;
    data[lane] = In;
    __syncthreads();

    unsigned int half = blockDim.x/2;
    while (half > 0) {
        // The lower half accumulates its partner from the upper half.
        if (lane < half)
            data[lane] += data[lane + half];
        __syncthreads();
        half >>= 1;
    }
    return data[0];
}
Kernel:
/**
 * Computes, per block, the partial sum of 1/distance over the point pairs
 * (i, j) with i owned by a thread of this block and j > i.
 *
 * Launch with a 1D grid covering N threads; blockDim.x must satisfy the
 * requirements of ReduceSum2. Avg must hold gridDim.x floats; Avg[b] receives
 * the partial sum of block b. The host adds the partials and normalizes.
 *
 * FIX: the block-wide reduction (which contains __syncthreads()) used to sit
 * inside `if (tIdx < N)`, so in a partially filled block some threads skipped
 * the barriers and never initialized their shared-memory slot. Every thread
 * now takes part in the reduction, with out-of-range threads contributing 0,
 * and a single thread stores the block result.
 */
__global__
void AvgDistance(float *X, float *Y, float *Avg, const int N){
    const int tIdx = blockIdx.x*blockDim.x + threadIdx.x;

    float d = 0.0f;
    if (tIdx < N) {
        const float xi = X[tIdx];
        const float yi = Y[tIdx];
        for (int i = tIdx + 1; i < N; i++) {
            const float x = xi - X[i];
            const float y = yi - Y[i];
            d += 1.0e0f/(sqrtf(x*x + y*y));
        }
    }

    // Reached by all threads of the block, regardless of tIdx.
    const float blockSum = ReduceSum2(d);
    if (threadIdx.x == 0) Avg[blockIdx.x] = blockSum;
}
The kernel is configured and launched as follows:
dim3 threads(BlockSize,BlockSize);
// FIX: Np/threads.x is an integer division, so wrapping it in ceil() never
// rounded up. Use integer ceil-division so a trailing partial block is launched.
dim3 blocks((Np + threads.x - 1)/threads.x,(Np + threads.y - 1)/threads.y);
// All three kernels are launched 1D; only the .x components are used.
InitRNG<<<blocks.x,threads.x>>>(d_state,seed);
SortPoints<<<blocks.x,threads.x>>>(d_rx,d_ry,Np,d_state);
// The dynamic shared-memory argument is not consumed: ReduceSum2 uses a statically sized array.
AvgDistance<<<blocks.x,threads.x,threads.x*sizeof(float)>>>(d_rx,d_ry,d_Avg,Np);
Finally, I copy the data back to host and then perform the remaining sum:
// Fetch the per-block partial sums (one float per launched block).
const size_t partialBytes = blocks.x*sizeof(float);
Avg = new float[blocks.x];
CHECK(cudaMemcpy(Avg, d_Avg, partialBytes, cudaMemcpyDeviceToHost), ERROR_CPY_DEVTOH);

// Finish the reduction on the host, then normalize by the number of points.
float average = 0;
for (unsigned int b = 0; b < blocks.x; ++b)
    average += Avg[b];
average /= (float)Np;
For 4k points, ok! the results are:
Average distance between points (via Kernel) = 108.615
Average distance between points (via CPU) = 110.191
In this case the sum may be performed in different order, causing both results to diverge from each other, I don't know...
But when it comes to 8k, the results are quite different:
Average distance between points (via Kernel) = 153.63
Average distance between points (via CPU) = 131.471
To me it seems that the kernel and the serial code are written the same way, which leads me to distrust the precision of floating-point calculations in CUDA. Does this make sense? Or is the access to global memory causing conflicts when several threads load the same data from X and Y at the same time? Or is the way I wrote the kernel 'wrong' in some way (I mean, am I doing something that causes the two results to diverge from each other)?
Actually, from what I can tell, the problem seems to be on the CPU side. I created a sample code based on your code.
I was able to reproduce your results.
First I switched all instances of sinf, cosf, and sqrtf to their corresponding double versions. This made no difference in the results.
Next I included a typedef so I could easily switch the precision from float to double and back, replacing every relevant instance of float in the code with mytype which is my typedef.
When I run the code with typedef of float and a data size of 4096 I get these results:
GPU average = 108.294922
CPU average = 109.925285
When I run the code with typedef of double and a data size of 4096 I get these results:
GPU average = 108.294903
CPU average = 108.294903
When I run the code with typedef of float and a data size of 8192 I get these results:
GPU average = 153.447327
CPU average = 131.473526
When I run the code with typedef of double and a data size of 8192 I get these results:
GPU average = 153.447380
CPU average = 153.447380
There are at least 2 observations:
The GPU results don't vary between float and double, except in the 5th decimal place
The CPU results vary by 1-20% or so between float and double, but when double is selected, they line up exactly (to the 6th decimal place, anyway) with the GPU results.
Based on this, I believe the CPU is providing the variable, questionable behavior.
Here's my code for reference:
#include <stdio.h>
#include <curand.h>
#include <curand_kernel.h>
#define DSIZE 8192
#define BlockSize 32
#define pi 3.14159f
// Error check macro for the CUDA runtime API.
// Place cudaCheckErrors("description") after one or more CUDA calls or kernel
// launches: it queries cudaGetLastError() and, if an error is pending, prints
// `msg`, the CUDA error string and the file/line of the check to stderr, then
// terminates the program with exit(1).
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
typedef double mytype;
/**
 * Initializes one cuRAND state per launched thread: same `seed` for all,
 * the thread's global index as sequence number, offset 0.
 * No bounds check is performed, so `state` needs one entry per launched thread.
 */
__global__ void InitRNG ( curandState * state, const int seed ){
    const int sequence = blockDim.x*blockIdx.x + threadIdx.x;
    curandState *slot = &state[sequence];
    curand_init(seed, sequence, 0, slot);
}
/**
 * Generates one random 2D point per thread (precision selected by `mytype`).
 *
 * Two uniform samples are drawn from the thread's own generator state: the
 * first gives the radius sqrt(0.25 * N * u1), the second the polar angle
 * 2 * pi * u2. Threads whose global index is >= N return without writing.
 *
 * @param X, Y   output coordinate arrays with at least N entries each
 * @param N      number of points to generate
 * @param state  per-thread cuRAND states (advanced in place)
 */
__global__
void SortPoints(mytype* X, mytype* Y,const int N, curandState *state){
    const unsigned int gid = blockIdx.x*blockDim.x + threadIdx.x;
    if (!(gid < N)) return;

    curandState *rng = &state[gid];
    const mytype uRadius = curand_uniform(rng);
    const mytype uAngle = curand_uniform(rng);

    const mytype radius = sqrt(0.25e0f*N*uRadius);
    const mytype theta = 2.0e0f*pi*uAngle;
    X[gid] = radius*cos(theta);
    Y[gid] = radius*sin(theta);
}
/**
 * Block-wide sum of one value per thread using a halving tree in shared memory.
 *
 * Every thread of the block must call this (it contains __syncthreads()).
 * The scratch array has BlockSize entries, so blockDim.x must not exceed
 * BlockSize; the halving scheme only covers all inputs when blockDim.x is a
 * power of two.
 *
 * @param In  this thread's contribution
 * @return    the block total, as read from slot 0 by each thread
 */
__device__
mytype ReduceSum2(mytype In){
    __shared__ mytype data[BlockSize];

    const unsigned int lane = threadIdx.x;
    data[lane] = In;
    __syncthreads();

    unsigned int half = blockDim.x/2;
    while (half > 0) {
        // Fold the upper half of the active range onto the lower half.
        if (lane < half)
            data[lane] += data[lane + half];
        __syncthreads();
        half >>= 1;
    }
    return data[0];
}
/**
 * Computes, per block, the partial sum of 1/distance over the point pairs
 * (i, j) with i owned by a thread of this block and j > i.
 *
 * Launch with a 1D grid covering N threads; blockDim.x must satisfy the
 * requirements of ReduceSum2. Avg must hold gridDim.x values; Avg[b] receives
 * the partial sum of block b. The host adds the partials and normalizes.
 *
 * FIX: the reduction (which contains __syncthreads()) used to be called inside
 * `if (tIdx < N)`, so a partially filled block would have threads skipping the
 * barriers and leaving their shared-memory slot uninitialized. Now all threads
 * join the reduction, out-of-range ones contribute 0, and one thread stores the result.
 */
__global__
void AvgDistance(mytype *X, mytype *Y, mytype *Avg, const int N){
    const int tIdx = blockIdx.x*blockDim.x + threadIdx.x;

    mytype d = 0.0f;
    if (tIdx < N) {
        const mytype xi = X[tIdx];
        const mytype yi = Y[tIdx];
        for (int i = tIdx + 1; i < N; i++) {
            const mytype x = xi - X[i];
            const mytype y = yi - Y[i];
            d += 1.0e0f/(sqrt(x*x + y*y));
        }
    }

    // Reached by all threads of the block, regardless of tIdx.
    const mytype blockSum = ReduceSum2(d);
    if (threadIdx.x == 0) Avg[blockIdx.x] = blockSum;
}
/**
 * Host reference: sums 1/distance over all unordered point pairs into a single
 * running accumulator and divides the total by `size` at the very end.
 *
 * @param rx, ry  x and y coordinates of the points (`size` entries each)
 * @param size    number of points
 * @return        (sum over i < j of 1 / |p_i - p_j|) / size
 */
mytype cpu_avg(const mytype *rx, const mytype *ry, const int size){
    mytype total = 0.0f;
    for (int i = 0; i < size; ++i) {
        const mytype xi = rx[i];
        const mytype yi = ry[i];
        for (int j = i + 1; j < size; ++j)
            total += 1.0e0f/sqrt((xi-rx[j])*(xi-rx[j]) + (yi-ry[j])*(yi-ry[j]));
    }
    return total/(mytype)size;
}
/**
 * Generates DSIZE random points on the GPU, computes the average inverse
 * pair distance on the GPU (per-block partial sums finished on the host) and
 * on the CPU, and prints both results.
 *
 * Compared with the previous version: the block count uses integer
 * ceil-division, the RNG state buffer is sized by the number of launched
 * threads (InitRNG has no bounds check), and all buffers are released.
 */
int main() {
    int Np = DSIZE;
    mytype *rx, *ry, *d_rx, *d_ry, *d_Avg, *Avg;
    curandState *d_state;
    int seed = 1;

    dim3 threads(BlockSize,BlockSize);
    dim3 blocks((Np + threads.x - 1)/threads.x,(Np + threads.y - 1)/threads.y);
    printf("number of blocks = %d\n", blocks.x);
    printf("number of threads= %d\n", threads.x);

    rx = (mytype *)malloc(DSIZE*sizeof(mytype));
    if (rx == 0) {printf("malloc fail\n"); return 1;}
    ry = (mytype *)malloc(DSIZE*sizeof(mytype));
    if (ry == 0) {printf("malloc fail\n"); free(rx); return 1;}

    cudaMalloc((void**)&d_rx, DSIZE * sizeof(mytype));
    cudaMalloc((void**)&d_ry, DSIZE * sizeof(mytype));
    cudaMalloc((void**)&d_Avg, blocks.x * sizeof(mytype));
    // One state per launched thread, which may exceed DSIZE when DSIZE is not a multiple of the block size.
    cudaMalloc((void**)&d_state, (size_t)blocks.x * threads.x * sizeof(curandState));
    cudaCheckErrors("cudamalloc");

    // All kernels are launched 1D; only the .x components of blocks/threads are used.
    InitRNG<<<blocks.x,threads.x>>>(d_state,seed);
    SortPoints<<<blocks.x,threads.x>>>(d_rx,d_ry,Np,d_state);
    AvgDistance<<<blocks.x,threads.x,threads.x*sizeof(mytype)>>>(d_rx,d_ry,d_Avg,Np);
    cudaCheckErrors("kernels");

    Avg = new mytype[blocks.x];
    cudaMemcpy(Avg,d_Avg,blocks.x*sizeof(mytype),cudaMemcpyDeviceToHost);
    cudaMemcpy(rx, d_rx, DSIZE*sizeof(mytype),cudaMemcpyDeviceToHost);
    cudaMemcpy(ry, d_ry, DSIZE*sizeof(mytype),cudaMemcpyDeviceToHost);
    cudaCheckErrors("cudamemcpy");

    // Finish the reduction on the host: add the per-block partial sums, then normalize.
    mytype average = 0;
    for(unsigned int i = 0; i < blocks.x; i++){
        average += Avg[i];
    }
    average = average/(mytype)Np;
    printf("GPU average = %f\n", average);

    average = cpu_avg(rx, ry, DSIZE);
    printf("CPU average = %f\n", average);

    // Release device and host resources.
    cudaFree(d_rx);
    cudaFree(d_ry);
    cudaFree(d_Avg);
    cudaFree(d_state);
    cudaCheckErrors("cudafree");
    delete[] Avg;
    free(rx);
    free(ry);
    return 0;
}
I am running on RHEL 5.5, CUDA 5.0, Intel Xeon X5560
compiled with:
nvcc -O3 -arch=sm_20 -lcurand -lm -o t93 t93.cu
EDIT:
After observing that the variability was on the CPU side, I found that I could eliminate most of the CPU variability by modifying your CPU averaging code like this:
/**
 * Host reference with per-row accumulation: for each point i the inverse
 * distances to all later points are summed separately, scaled by 1/size, and
 * only then added to the running average, so no single accumulator has to
 * hold the full unscaled sum.
 *
 * @param rx, ry  x and y coordinates of the points (`size` entries each)
 * @param size    number of points
 * @return        (sum over i < j of 1 / |p_i - p_j|) / size
 */
mytype cpu_avg(const mytype *rx, const mytype *ry, const int size){
    mytype average = 0.0f;
    for (int i = 0; i < size; ++i) {
        // Partial sum for point i only; starts from zero for every row.
        mytype rowSum = 0.0f;
        for (int j = i + 1; j < size; ++j)
            rowSum += 1.0e0f/sqrt((rx[i]-rx[j])*(rx[i]-rx[j]) + (ry[i]-ry[j])*(ry[i]-ry[j]));
        average += rowSum/(mytype)size;
    }
    return average;
}
So I would say there's a problem with intermediate results on the CPU side. It's interesting that it doesn't show up on the GPU result. I suspect the reason for this is that the final summation of GPU averages is done on the CPU (therefore each individual GPU block result is scaled down by the size, e.g. 8192), and these may have an intermediate precision that is sufficient to survive until the final division. If you inlined the CPU average calculation, you may observe something different again.