How can I check the progress of matrix multiplication? - cuda

I'm now only need to show an intermediate progress of matrix multiplication.
for(unsigned int col=0; col<mtxSize; col++) {
unsigned tmp = 0;
for(unsigned int row=0; row<mtxSize; row++) {
for(unsigned int idx=0; idx<mtxSize; idx++) {
tmp += h_A[col*mtxSize+idx] * h_B[idx*mtxSize+row];
}
h_Rs[col*mtxSize+row] = tmp;
tmp = 0;
int rate_tmp = (col*mtxSize + (row+1))*100;
// Maybe like this...
fprintf(stdout, "Progress : %d.%d %%\r", rate_tmp/actMtxSize, rate_tmp%actMtxSize);
fflush(stdout);
}
}
In the case of the host code(use CPU), it is very easy beacause it process sequentially so we can check very easily.
But in the case of the GPU which process in parallel, what should I do?
Once the kernel is running, it does not return until finish the kernel execution.
So I can't check mid-data during the kernel execution time.
I think I need to use asynchronous kernel call, but I do not know well.
And even if the asynchronous kernel call is used, to see all of the data into several blocks over processors, do I have to write atomicAdd() (in other words, global memory access) function which is including some overhead?
Give me some advice or hint.
And I want to know in the case of CUDA.

Here is a code which demonstrates how to check progress from a matrix multiply kernel:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define TIME_INC 100000000
#define INCS 10
#define USE_PROGRESS 1
#define MAT_DIMX 4000
#define MAT_DIMY MAT_DIMX
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
__global__ void mykernel(volatile int *data){
unsigned long time;
for (int i = 0; i < INCS; i++){
atomicAdd((int *)data,1);
__threadfence_system();
time = clock64();
while((clock64() - time)<TIME_INC) {};
}
printf("progress check finished\n");
}
__global__ void matmult(float *a, float *b, float *c, unsigned int rowA, unsigned int colA, unsigned int colB, volatile int *progress){
unsigned int row = threadIdx.x+blockDim.x*blockIdx.x;
unsigned int col = threadIdx.y+blockDim.y*blockIdx.y;
if ((row < rowA) && (col < colB)){
float temp = 0.0f;
for (unsigned int k = 0; k < colA; k++)
temp += a[(row*colA)+k] * b[(k*colB) + col];
c[(row*colB)+col] = temp;
#if USE_PROGRESS
if (!(threadIdx.x || threadIdx.y)){
atomicAdd((int *)progress, 1);
__threadfence_system();
}
#endif
}
}
int main(){
// simple test to demonstrate reading progress data from kernel
volatile int *d_data, *h_data;
cudaSetDeviceFlags(cudaDeviceMapHost);
cudaCheckErrors("cudaSetDeviceFlags error");
cudaHostAlloc((void **)&h_data, sizeof(int), cudaHostAllocMapped);
cudaCheckErrors("cudaHostAlloc error");
cudaHostGetDevicePointer((int **)&d_data, (int *)h_data, 0);
cudaCheckErrors("cudaHostGetDevicePointer error");
*h_data = 0;
printf("kernel starting\n");
mykernel<<<1,1>>>(d_data);
cudaCheckErrors("kernel fail");
int value = 0;
do{
int value1 = *h_data;
if (value1 > value){
printf("h_data = %d\n", value1);
value = value1;}}
while (value < (INCS-1));
cudaDeviceSynchronize();
cudaCheckErrors("kernel fail 2");
// now try matrix multiply with progress
float *h_c, *d_a, *d_b, *d_c;
h_c = (float *)malloc(MAT_DIMX*MAT_DIMY*sizeof(float));
if (h_c == NULL) {printf("malloc fail\n"); return 1;}
cudaMalloc((void **)&d_a, MAT_DIMX*MAT_DIMY*sizeof(float));
cudaCheckErrors("cudaMalloc a fail");
cudaMalloc((void **)&d_b, MAT_DIMX*MAT_DIMY*sizeof(float));
cudaCheckErrors("cudaMalloc b fail");
cudaMalloc((void **)&d_c, MAT_DIMX*MAT_DIMY*sizeof(float));
cudaCheckErrors("cudaMalloc c fail");
for (int i = 0; i < MAT_DIMX*MAT_DIMY; i++) h_c[i] = rand()/(float)RAND_MAX;
cudaMemcpy(d_a, h_c, MAT_DIMX*MAT_DIMY*sizeof(float), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy a fail");
cudaMemcpy(d_b, h_c, MAT_DIMX*MAT_DIMY*sizeof(float), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy b fail");
cudaEvent_t start, stop;
cudaEventCreate(&start); cudaEventCreate(&stop);
*h_data=0;
dim3 block(16,16);
dim3 grid(((MAT_DIMX+block.x-1)/block.x), ((MAT_DIMY+block.y-1)/block.y));
printf("matrix multiply kernel starting\n");
cudaEventRecord(start);
matmult<<<grid,block>>>(d_a, d_b, d_c, MAT_DIMY, MAT_DIMX, MAT_DIMX, d_data);
cudaEventRecord(stop);
#if USE_PROGRESS
unsigned int num_blocks = grid.x*grid.y;
float my_progress = 0.0f;
value = 0;
printf("Progress:\n");
do{
cudaEventQuery(stop); // may help WDDM scenario
int value1 = *h_data;
float kern_progress = (float)value1/(float)num_blocks;
if ((kern_progress - my_progress)> 0.1f) {
printf("percent complete = %2.1f\n", (kern_progress*100));
my_progress = kern_progress;}}
while (my_progress < 0.9f);
printf("\n");
#endif
cudaEventSynchronize(stop);
cudaCheckErrors("event sync fail");
float et;
cudaEventElapsedTime(&et, start, stop);
cudaCheckErrors("event elapsed time fail");
cudaDeviceSynchronize();
cudaCheckErrors("mat mult kernel fail");
printf("matrix multiply finished. elapsed time = %f milliseconds\n", et);
return 0;
}
The code associated with the first kernel call is just to demonstrate the basic idea of having a kernel report it's progress back.
The second part of the code shows a sample, naive matrix multiply on the GPU, with the GPU reporting it's progress back. I have included the ability to remove the progress check code via a preprocessor macro, as well as the ability to time the matrix multiply kernel. For the case I have here, there was no discernible difference in timing with or without the progress code. So while the progress reporting code probably does add some overhead, when compared to the scope of a reasonable sized matrix multiply kernel, it adds no significant time that I can see.
Some other uses of signalling are discussed here

Related

CUDA matrixMulCUBLAS extra memcpy call

While playing with CUBLAS matrix multiplication sample I realised that nvprof profiler shows an extra call of cudaMemcpy Host to Device.
While 2 appear in source code, 3 actual calls are issued.
Why would that be? Is it an intrinsic effect of using CUBLAS?
Code from CUDA CUBLAS sample:
compiled with flags: -lcublas -I/usr/local/cuda-7.5/samples/common/inc
//////////////////////////////////////////////////////////////////////////
// Utilities and system includes
#include <assert.h>
#include <helper_string.h> // helper for shared functions common to CUDA Samples
// CUDA runtime
#include <cuda_runtime.h>
#include <cublas_v2.h>
// CUDA and CUBLAS functions
#include <helper_functions.h>
#include <helper_cuda.h>
#ifndef min
#define min(a,b) ((a < b) ? a : b)
#endif
#ifndef max
#define max(a,b) ((a > b) ? a : b)
#endif
typedef struct _matrixSize // Optional Command-line multiplier for matrix sizes
{
unsigned int uiWA, uiHA, uiWB, uiHB, uiWC, uiHC;
} sMatrixSize;
////////////////////////////////////////////////////////////////////////////////
//! Compute reference data set matrix multiply on CPU
//! C = A * B
//! #param C reference data, computed but preallocated
//! #param A matrix A as provided to device
//! #param B matrix B as provided to device
//! #param hA height of matrix A
//! #param wB width of matrix B
////////////////////////////////////////////////////////////////////////////////
void
matrixMulCPU(float *C, const float *A, const float *B, unsigned int hA, unsigned int wA, unsigned int wB)
{
for (unsigned int i = 0; i < hA; ++i)
for (unsigned int j = 0; j < wB; ++j)
{
double sum = 0;
for (unsigned int k = 0; k < wA; ++k)
{
double a = A[i * wA + k];
double b = B[k * wB + j];
sum += a * b;
}
C[i * wB + j] = (float)sum;
}
}
// Allocates a matrix with random float entries.
void randomInit(float *data, int size)
{
for (int i = 0; i < size; ++i)
data[i] = rand() / (float)RAND_MAX;
}
void printDiff(float *data1, float *data2, int width, int height, int iListLength, float fListTol)
{
printf("Listing first %d Differences > %.6f...\n", iListLength, fListTol);
int i,j,k;
int error_count=0;
for (j = 0; j < height; j++)
{
if (error_count < iListLength)
{
printf("\n Row %d:\n", j);
}
for (i = 0; i < width; i++)
{
k = j * width + i;
float fDiff = fabs(data1[k] - data2[k]);
if (fDiff > fListTol)
{
if (error_count < iListLength)
{
printf(" Loc(%d,%d)\tCPU=%.5f\tGPU=%.5f\tDiff=%.6f\n", i, j, data1[k], data2[k], fDiff);
}
error_count++;
}
}
}
printf(" \n Total Errors = %d\n", error_count);
}
void initializeCUDA(int argc, char **argv, int &devID, int &iSizeMultiple, sMatrixSize &matrix_size)
{
// By default, we use device 0, otherwise we override the device ID based on what is provided at the command line
cudaError_t error;
devID = 0;
if (checkCmdLineFlag(argc, (const char **)argv, "device"))
{
devID = getCmdLineArgumentInt(argc, (const char **)argv, "device");
error = cudaSetDevice(devID);
if (error != cudaSuccess)
{
printf("cudaSetDevice returned error code %d, line(%d)\n", error, __LINE__);
exit(EXIT_FAILURE);
}
}
// get number of SMs on this GPU
error = cudaGetDevice(&devID);
if (error != cudaSuccess)
{
printf("cudaGetDevice returned error code %d, line(%d)\n", error, __LINE__);
exit(EXIT_FAILURE);
}
if (checkCmdLineFlag(argc, (const char **)argv, "sizemult"))
{
iSizeMultiple = getCmdLineArgumentInt(argc, (const char **)argv, "sizemult");
}
iSizeMultiple = min(iSizeMultiple, 10);
iSizeMultiple = max(iSizeMultiple, 1);
cudaDeviceProp deviceProp;
error = cudaGetDeviceProperties(&deviceProp, devID);
if (error != cudaSuccess)
{
printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
exit(EXIT_FAILURE);
}
printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);
// use a larger block size for Fermi and above
int block_size = (deviceProp.major < 2) ? 16 : 32;
matrix_size.uiWA = 3 * block_size * iSizeMultiple;
matrix_size.uiHA = 4 * block_size * iSizeMultiple;
matrix_size.uiWB = 2 * block_size * iSizeMultiple;
matrix_size.uiHB = 3 * block_size * iSizeMultiple;
matrix_size.uiWC = 2 * block_size * iSizeMultiple;
matrix_size.uiHC = 4 * block_size * iSizeMultiple;
printf("MatrixA(%u,%u), MatrixB(%u,%u), MatrixC(%u,%u)\n",
matrix_size.uiHA, matrix_size.uiWA,
matrix_size.uiHB, matrix_size.uiWB,
matrix_size.uiHC, matrix_size.uiWC);
if( matrix_size.uiWA != matrix_size.uiHB ||
matrix_size.uiHA != matrix_size.uiHC ||
matrix_size.uiWB != matrix_size.uiWC)
{
printf("ERROR: Matrix sizes do not match!\n");
exit(-1);
}
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test matrix multiply using CUBLAS
////////////////////////////////////////////////////////////////////////////////
int matrixMultiply(int argc, char **argv, int devID, sMatrixSize &matrix_size)
{
cudaDeviceProp deviceProp;
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
// use a larger block size for Fermi and above
int block_size = (deviceProp.major < 2) ? 16 : 32;
// set seed for rand()
srand(2006);
// allocate host memory for matrices A and B
unsigned int size_A = matrix_size.uiWA * matrix_size.uiHA;
unsigned int mem_size_A = sizeof(float) * size_A;
float *h_A = (float *)malloc(mem_size_A);
unsigned int size_B = matrix_size.uiWB * matrix_size.uiHB;
unsigned int mem_size_B = sizeof(float) * size_B;
float *h_B = (float *)malloc(mem_size_B);
// set seed for rand()
srand(2006);
// initialize host memory
randomInit(h_A, size_A);
randomInit(h_B, size_B);
// allocate device memory
float *d_A, *d_B, *d_C;
unsigned int size_C = matrix_size.uiWC * matrix_size.uiHC;
unsigned int mem_size_C = sizeof(float) * size_C;
// allocate host memory for the result
float *h_C = (float *) malloc(mem_size_C);
float *h_CUBLAS = (float *) malloc(mem_size_C);
checkCudaErrors(cudaMalloc((void **) &d_A, mem_size_A));
checkCudaErrors(cudaMalloc((void **) &d_B, mem_size_B));
checkCudaErrors(cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice));
checkCudaErrors(cudaMalloc((void **) &d_C, mem_size_C));
// setup execution parameters
dim3 threads(block_size, block_size);
dim3 grid(matrix_size.uiWC / threads.x, matrix_size.uiHC / threads.y);
// create and start timer
printf("Computing result using CUBLAS...");
// execute the kernel
int nIter = 30;
// CUBLAS version 2.0
{
const float alpha = 1.0f;
const float beta = 0.0f;
cublasHandle_t handle;
cudaEvent_t start, stop;
checkCudaErrors(cublasCreate(&handle));
//Perform warmup operation with cublas
checkCudaErrors(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWB));
// Allocate CUDA events that we'll use for timing
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
// Record the start event
checkCudaErrors(cudaEventRecord(start, NULL));
for (int j = 0; j < nIter; j++)
{
//note cublas is column primary!
//need to transpose the order
checkCudaErrors(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWB));
}
printf("done.\n");
// Record the stop event
checkCudaErrors(cudaEventRecord(stop, NULL));
// Wait for the stop event to complete
checkCudaErrors(cudaEventSynchronize(stop));
float msecTotal = 0.0f;
checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop));
// Compute and print the performance
float msecPerMatrixMul = msecTotal / nIter;
double flopsPerMatrixMul = 2.0 * (double)matrix_size.uiHC * (double)matrix_size.uiWC * (double)matrix_size.uiHB;
double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
printf(
"Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops\n",
gigaFlops,
msecPerMatrixMul,
flopsPerMatrixMul);
// copy result from device to host
checkCudaErrors(cudaMemcpy(h_CUBLAS, d_C, mem_size_C, cudaMemcpyDeviceToHost));
// Destroy the handle
checkCudaErrors(cublasDestroy(handle));
}
// compute reference solution
printf("Computing result using host CPU...");
float *reference = (float *)malloc(mem_size_C);
matrixMulCPU(reference, h_A, h_B, matrix_size.uiHA, matrix_size.uiWA, matrix_size.uiWB);
printf("done.\n");
// check result (CUBLAS)
bool resCUBLAS = sdkCompareL2fe(reference, h_CUBLAS, size_C, 1.0e-6f);
if (resCUBLAS != true)
{
printDiff(reference, h_CUBLAS, matrix_size.uiWC, matrix_size.uiHC, 100, 1.0e-5f);
}
printf("Comparing CUBLAS Matrix Multiply with CPU results: %s\n", (true == resCUBLAS) ? "PASS" : "FAIL");
printf("\nNOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n");
// clean up memory
free(h_A);
free(h_B);
free(h_C);
free(reference);
checkCudaErrors(cudaFree(d_A));
checkCudaErrors(cudaFree(d_B));
checkCudaErrors(cudaFree(d_C));
// cudaDeviceReset causes the driver to clean up all state. While
// not mandatory in normal operation, it is good practice. It is also
// needed to ensure correct operation when the application is being
// profiled. Calling cudaDeviceReset causes all profile data to be
// flushed before the application exits
cudaDeviceReset();
if (resCUBLAS == true)
{
return EXIT_SUCCESS; // return value = 1
}
else
{
return EXIT_FAILURE; // return value = 0
}
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
printf("[Matrix Multiply CUBLAS] - Starting...\n");
int devID = 0, sizeMult = 5;
sMatrixSize matrix_size;
initializeCUDA(argc, argv, devID, sizeMult, matrix_size);
int matrix_result = matrixMultiply(argc, argv, devID, matrix_size);
return matrix_result;
}
The additional memory transfer seems to be caused by the CUBLAS library and is triggered by a call to cublasInit. You can confirm this by profiling the following code:
#include <cublas_v2.h>
int main()
{
cublasHandle_t handle;
cublasCreate(&handle);
cudaDeviceReset();
return 0;
}
which nvprof reports as calling cudaMemcpy:
$ nvprof ./a.out
==9536== NVPROF is profiling process 9536, command: ./a.out
==9536== Profiling application: ./a.out
==9536== Profiling result:
Time(%) Time Calls Avg Min Max Name
100.00% 1.1190us 1 1.1190us 1.1190us 1.1190us [CUDA memcpy HtoD]
==9536== API calls:
Time(%) Time Calls Avg Min Max Name
76.51% 348.53ms 1 348.53ms 348.53ms 348.53ms cudaFree
23.26% 105.97ms 1 105.97ms 105.97ms 105.97ms cudaDeviceReset
0.09% 420.25us 178 2.3600us 125ns 103.52us cuDeviceGetAttribute
0.08% 349.37us 2 174.69us 110.59us 238.78us cuDeviceTotalMem
0.04% 202.10us 3 67.366us 9.3750us 109.43us cudaMalloc
0.01% 55.217us 2 27.608us 24.529us 30.688us cuDeviceGetName
0.00% 14.365us 1 14.365us 14.365us 14.365us cudaMemcpy
0.00% 10.016us 16 626ns 434ns 2.0440us cudaEventCreateWithFlags
0.00% 4.5000us 11 409ns 271ns 1.2730us cudaDeviceGetAttribute
0.00% 3.4510us 4 862ns 251ns 2.3370us cuDeviceGetCount
0.00% 2.3200us 4 580ns 281ns 1.0350us cuDeviceGet
0.00% 1.3600us 1 1.3600us 1.3600us 1.3600us cudaGetDevice
0.00% 630ns 1 630ns 630ns 630ns cuInit
0.00% 339ns 1 339ns 339ns 339ns cuDriverGetVersion
I doubt that anyone without access to the current CUBLAS source will be able to explain why initialising the CUBLAS library triggers a host to device transfer, but that seems to be the cause of your observation.

Finding the prev of a node in SSSP implementation in CUDA? [duplicate]

I'd like to implement this atomic function in CUDA:
__device__ float lowest; // global var
__device__ int lowIdx; // global var
float realNum; // thread reg var
int index; // thread reg var
if(realNum < lowest) {
lowest= realNum; // the new lowest
lowIdx= index; // update the 'low' index
}
I don't believe I can do this with any of the atomic functions. I need to lock down a couple global memory loc's for a couple instructions.
Might I be able to implement this with PTXAS (assembly) code?
As I stated in my second comment above, it's possible to combine your two 32-bit quantities into a single 64-bit atomically managed quantity, and deal with the problem that way. We then manage the 64-bit quantity atomically using the arbitrary atomic example as a rough guide. Obviously you can't extend this idea beyond two 32-bit quantities. Here's an example:
#include <stdio.h>
#define DSIZE 5000
#define nTPB 256
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
typedef union {
float floats[2]; // floats[0] = lowest
int ints[2]; // ints[1] = lowIdx
unsigned long long int ulong; // for atomic update
} my_atomics;
__device__ my_atomics test;
__device__ unsigned long long int my_atomicMin(unsigned long long int* address, float val1, int val2)
{
my_atomics loc, loctest;
loc.floats[0] = val1;
loc.ints[1] = val2;
loctest.ulong = *address;
while (loctest.floats[0] > val1)
loctest.ulong = atomicCAS(address, loctest.ulong, loc.ulong);
return loctest.ulong;
}
__global__ void min_test(const float* data)
{
int idx = (blockDim.x * blockIdx.x) + threadIdx.x;
if (idx < DSIZE)
my_atomicMin(&(test.ulong), data[idx],idx);
}
int main() {
float *d_data, *h_data;
my_atomics my_init;
my_init.floats[0] = 10.0f;
my_init.ints[1] = DSIZE;
h_data = (float *)malloc(DSIZE * sizeof(float));
if (h_data == 0) {printf("malloc fail\n"); return 1;}
cudaMalloc((void **)&d_data, DSIZE * sizeof(float));
cudaCheckErrors("cm1 fail");
// create random floats between 0 and 1
for (int i = 0; i < DSIZE; i++) h_data[i] = rand()/(float)RAND_MAX;
cudaMemcpy(d_data, h_data, DSIZE*sizeof(float), cudaMemcpyHostToDevice);
cudaCheckErrors("cmcp1 fail");
cudaMemcpyToSymbol(test, &(my_init.ulong), sizeof(unsigned long long int));
cudaCheckErrors("cmcp2 fail");
min_test<<<(DSIZE+nTPB-1)/nTPB, nTPB>>>(d_data);
cudaDeviceSynchronize();
cudaCheckErrors("kernel fail");
cudaMemcpyFromSymbol(&(my_init.ulong), test, sizeof(unsigned long long int));
cudaCheckErrors("cmcp3 fail");
printf("device min result = %f\n", my_init.floats[0]);
printf("device idx result = %d\n", my_init.ints[1]);
float host_val = 10.0f;
int host_idx = DSIZE;
for (int i=0; i<DSIZE; i++)
if (h_data[i] < host_val){
host_val = h_data[i];
host_idx = i;
}
printf("host min result = %f\n", host_val);
printf("host idx result = %d\n", host_idx);
return 0;
}
Here is a similar example that does atomic update of 2 float quantities.
Here is another custom atomic example.
#Robert Crovella: Excellent idea, but I think the function should be modified a little bit as follows:
__device__ unsigned long long int my_atomicMin(unsigned long long int* address, float val1, int val2)
{
my_atomics loc, loctest, old;
loc.floats[0] = val1;
loc.ints[1] = val2;
loctest.ulong = *address;
old.ulong = loctest.ulong;
while (loctest.floats[0] > val1){
old.ulong = loctest.ulong;
loctest.ulong = atomicCAS(address, loctest.ulong, loc.ulong);
}
return old.ulong;
}

Finding minimum in GPU slower than CPU

I have implemented this code: http://www.cuvilib.com/Reduction.pdf in order to calculate the sum of the elements of a matrix.
However in GPU it runs much slower than in CPU.
I got i7 processor and NVIDIA GT 540M graphics card.
Is it supposed to be that way or something else?
EDIT: I use version 3 of the above code in Ubuntu 13.04 and I compile it using Eclipse Nsight. The size of the matrix is 2097152 elements. It executes in 3.6 ms whereas the CPU version in around 1.0 ms. Below is the whole code:
#include <stdio.h>
#include <stdlib.h>
#include <thrust/sort.h>
#include <sys/time.h>
#include <omp.h>
#include <iostream>
#include <algorithm>
#define MIN(a,b) (((a)<(b))?(a):(b))
static const int WORK_SIZE = 2097152;
int find_min(int *a,int length){
int min = a[0];
for (int i=1;i<length;i++)
if (a[i]<min)
min=a[i];
return min;
}
__global__ static void red_min(int *g_idata,int *g_odata) {
extern __shared__ int sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
sdata[tid]= g_idata[i];
__syncthreads();
for(unsigned int s=blockDim.x/2; s > 0; s >>= 1) {
if (tid<s) {
sdata[tid] = MIN(sdata[tid],sdata[tid + s]);
}
__syncthreads();
}
if (tid == 0)
g_odata[blockIdx.x] = sdata[0];
}
int main(void) {
int *d1,*d2;
int i,*result;
int *idata,*fdata;
srand ( time(NULL) );
result = (int *)malloc(sizeof(int));
idata = (int *)malloc(WORK_SIZE*sizeof(int));
fdata = (int *)malloc(WORK_SIZE*sizeof(int));
cudaMalloc((int**)&d1,WORK_SIZE*sizeof(int));
cudaMalloc((int**)&d2,WORK_SIZE*sizeof(int));
for (i = 0; i < WORK_SIZE; i++){
idata[i] = rand();
fdata[i] = i;
}
struct timeval begin, end;
gettimeofday(&begin, NULL);
*result = find_min(idata,WORK_SIZE);
printf( "Minimum Element CPU: %d \n", *result);
gettimeofday(&end, NULL);
int time = (end.tv_sec * (unsigned int)1e6 + end.tv_usec) - (begin.tv_sec * (unsigned int)1e6 + begin.tv_usec);
printf("Microseconds elapsed CPU: %d\n", time);
cudaMemcpy(d1,idata,WORK_SIZE*sizeof(int),cudaMemcpyHostToDevice);
cudaEvent_t start, stop;
cudaEventCreate( &start);
cudaEventCreate( &stop);
cudaEventRecord(start,0);
int num_blocks = 16384;
bool flag = true;
while (num_blocks>0){
if (flag) {
red_min<<<num_blocks,128,128*sizeof(int)>>>(d1,d2);
}
else {
red_min<<<num_blocks,128,128*sizeof(int)>>>(d2,d1);
}
num_blocks /= 128;
flag = !flag;
}
GT540M is a mobile GPU, so I assume you're running on a laptop, and furthermore you may be hosting the X display on the 540M GPU.
I built a complete version of your code:
#include <stdio.h>
#include <stdlib.h>
#include <thrust/sort.h>
#include <sys/time.h>
#include <omp.h>
#include <iostream>
#include <algorithm>
#define MIN(a,b) (((a)<(b))?(a):(b))
static const int WORK_SIZE = 2097152;
int find_min(int *a,int length){
int min = a[0];
for (int i=1;i<length;i++)
if (a[i]<min)
min=a[i];
return min;
}
__global__ static void red_min(int *g_idata,int *g_odata) {
extern __shared__ int sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
sdata[tid]= g_idata[i];
__syncthreads();
for(unsigned int s=blockDim.x/2; s > 0; s >>= 1) {
if (tid<s) {
sdata[tid] = MIN(sdata[tid],sdata[tid + s]);
}
__syncthreads();
}
if (tid == 0)
g_odata[blockIdx.x] = sdata[0];
}
int main(void) {
int *d1,*d2;
int i,*result;
int *idata,*fdata;
srand ( time(NULL) );
result = (int *)malloc(sizeof(int));
idata = (int *)malloc(WORK_SIZE*sizeof(int));
fdata = (int *)malloc(WORK_SIZE*sizeof(int));
cudaMalloc((int**)&d1,WORK_SIZE*sizeof(int));
cudaMalloc((int**)&d2,WORK_SIZE*sizeof(int));
for (i = 0; i < WORK_SIZE; i++){
idata[i] = rand();
fdata[i] = i;
}
struct timeval begin, end;
gettimeofday(&begin, NULL);
*result = find_min(idata,WORK_SIZE);
printf( "Minimum Element CPU: %d \n", *result);
gettimeofday(&end, NULL);
int time = (end.tv_sec * (unsigned int)1e6 + end.tv_usec) - (begin.tv_sec * (unsigned int)1e6 + begin.tv_usec);
printf("Microseconds elapsed CPU: %d\n", time);
cudaMemcpy(d1,idata,WORK_SIZE*sizeof(int),cudaMemcpyHostToDevice);
cudaEvent_t start, stop;
cudaEventCreate( &start);
cudaEventCreate( &stop);
cudaEventRecord(start,0);
int num_blocks = 16384;
bool flag = true;
int loops = 0;
while (num_blocks>0){
if (flag) {
red_min<<<num_blocks,128,128*sizeof(int)>>>(d1,d2);
}
else {
red_min<<<num_blocks,128,128*sizeof(int)>>>(d2,d1);
}
num_blocks /= 128;
flag = !flag;
loops++;
}
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float et = 0.0f;
cudaEventElapsedTime(&et, start, stop);
printf("GPU time: %fms, in %d loops\n", et, loops);
int gpuresult;
if (flag)
cudaMemcpy(&gpuresult, d1, sizeof(int), cudaMemcpyDeviceToHost);
else
cudaMemcpy(&gpuresult, d2, sizeof(int), cudaMemcpyDeviceToHost);
printf("GPU min: %d\n", gpuresult);
return 0;
}
compiled it:
$ nvcc -O3 -arch=sm_20 -o t264 t264.cu
and ran it on a M2050 GPU, RHEL 5.5, CUDA 5.5, Xeon X5650 CPU
$ ./t264
Minimum Element CPU: 288
Microseconds elapsed CPU: 1217
GPU time: 0.621408ms, in 3 loops
GPU min: 288
$
So my CPU results were pretty close to yours, but my GPU results were about 5-6x faster. If we compare M2050 to your GT540M, we see that the M2050 has 14 SMs whereas the GT540M has 2. More importantly, the M2050 has about 5x the memory bandwidth of your GT540M GPU (28.8GB/s peak theoretical for GT540M vs. ~150GB/s peak theoretical for M2050)
Since a well written parallel reduction is a memory bandwidth constrained code on GPUs, the speed difference between your GPU and my GPU makes sense.
So I would say your results are probably about what is expected, and to get better results you will probably need a faster GPU.
Also, if your GT540M is also hosting an X display, it's possible that the GPU timing is corrupted by display activity. If we are timing a single kernel, this is not normally an issue - the kernel execution interrupts the display processing briefly. But when we are timing a sequence of kernels in succession, it's possible for the display tasks to jump in and execute in-between kernel calls (the GPU is multi-tasking when it is asked to both support a display and also process CUDA code). Therefore, this may be a possible performance impact in your case as well.

I lost data after __syncthreads() in cuda

I am trying to find the maximum of an array.. I took the help from CUDA Maximum Reduction Algorithm Not Working. and do some own modification. However I am running it for 16 data. I am finding that in kernel code shared memory copies only 1st 4data. rest are lost. I put two cuPrintf..1st printf shows data is their in the shared memory. But the 2nd cuPrintf is just after __syncthreads.. and that shows 0 from thread ids 4 onwords.. pls help
#include
#include
#include
#include
#include
#include "cuPrintf.cu"
#include "cuPrintf.cuh"
__device__ float MaxOf2(float a, float b)
{
if(a > b) return a;
else return b;
}
__global__ void findMax(int size,float *array_device , float *outPut)
{
extern __shared__ float sdata[];
int tid = threadIdx.x;
int i = blockIdx.x*blockDim.x + threadIdx.x;
if(i< size)
{
sdata[tid] = array_device[i];
cuPrintf(" array_d[%d]===%f, sdata[%d]===%f\n ",i,array_device[i],tid,sdata[tid]);
__threadfence();
}
__syncthreads();
if(tid<size)
cuPrintf(" array_d[%d]===%f, sdata[%d]===%f\n ",i,array_device[i],tid,sdata[tid]);
for ( int s=blockDim.x/2; s>0; s=s>>1)//s=blockDim.x/2
{
if (tid < s)
{
sdata[tid]= MaxOf2(sdata[tid],sdata[tid+s]);
}
__syncthreads();
}
if (tid == 0) outPut[blockIdx.x] = sdata[0];
}
int main()
{
long double M = pow(2,20);
long double N = 2;
int noThreadsPerBlock = 512 ;
printf("\n Provide the array Size N.(array will be of size N * 2^20 ) :-");
scanf("%Lf",&N);
long int size = 16;
int numOfBlock = (int)size /noThreadsPerBlock + 1;
printf("\n num of blocks==%ld",numOfBlock);
float *array_device , *outPut;
float array_host[]={221,100,2,340,47,36,500,1,33,4460,5,6,7,8,9,11};
cudaMalloc((void **)&array_device, size*sizeof(float));
cudaMalloc((void **)&outPut, size*sizeof(float));
cudaError_t error0 = cudaGetLastError();
printf("\n 0CUDA error: %s\n", cudaGetErrorString(error0));
printf("size===%ld",size);
cudaMemcpy(array_device, array_host, size*sizeof(float), cudaMemcpyHostToDevice);
cudaError_t error1 = cudaGetLastError();
printf("\n1CUDA error: %s\n", cudaGetErrorString(error1));
while(size>1 )
{
cudaPrintfInit();
findMax<<< numOfBlock,noThreadsPerBlock>>>(size,array_device, outPut);cudaPrintfDisplay(stdout, true);
cudaPrintfEnd();
cudaError_t error2 = cudaGetLastError();
printf(" 2CUDA error: %s\n", cudaGetErrorString(error2));
cudaMemcpy(array_device, outPut, size*sizeof(float), cudaMemcpyDeviceToDevice);
size = numOfBlock;
printf("\n ****size==%ld\n",size);
numOfBlock = (int)size /noThreadsPerBlock + 1;
}
cudaMemcpy(array_host, outPut, size*sizeof(float), cudaMemcpyDeviceToHost);
cudaError_t error3 = cudaGetLastError();
printf("\n3CUDA error: %s\n", cudaGetErrorString(error3));
for(int i=0;i<size;i++)
printf("\n index==%d ;data=%f ",i,array_host[i]);
return 0;
}
I'm posting my comment as an answer as requested.
Firstly, you havent specified dynamic size of shared memory in kernel launch. It should look something like:
findMax<<< numOfBlock,noThreadsPerBlock,sizeof(float)*noThreadsPerBlock>>>
Secondly, what was the concept behind condition if(tid<size) on second cuPrintf? Providing output of the program could also help.

How can I implement a custom atomic function involving several variables?

I'd like to implement this atomic function in CUDA:
__device__ float lowest; // global var
__device__ int lowIdx; // global var
float realNum; // thread reg var
int index; // thread reg var
if(realNum < lowest) {
lowest= realNum; // the new lowest
lowIdx= index; // update the 'low' index
}
I don't believe I can do this with any of the atomic functions. I need to lock down a couple global memory loc's for a couple instructions.
Might I be able to implement this with PTXAS (assembly) code?
As I stated in my second comment above, it's possible to combine your two 32-bit quantities into a single 64-bit atomically managed quantity, and deal with the problem that way. We then manage the 64-bit quantity atomically using the arbitrary atomic example as a rough guide. Obviously you can't extend this idea beyond two 32-bit quantities. Here's an example:
#include <stdio.h>
#define DSIZE 5000
#define nTPB 256
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
typedef union {
float floats[2]; // floats[0] = lowest
int ints[2]; // ints[1] = lowIdx
unsigned long long int ulong; // for atomic update
} my_atomics;
__device__ my_atomics test;
__device__ unsigned long long int my_atomicMin(unsigned long long int* address, float val1, int val2)
{
my_atomics loc, loctest;
loc.floats[0] = val1;
loc.ints[1] = val2;
loctest.ulong = *address;
while (loctest.floats[0] > val1)
loctest.ulong = atomicCAS(address, loctest.ulong, loc.ulong);
return loctest.ulong;
}
__global__ void min_test(const float* data)
{
int idx = (blockDim.x * blockIdx.x) + threadIdx.x;
if (idx < DSIZE)
my_atomicMin(&(test.ulong), data[idx],idx);
}
int main() {
float *d_data, *h_data;
my_atomics my_init;
my_init.floats[0] = 10.0f;
my_init.ints[1] = DSIZE;
h_data = (float *)malloc(DSIZE * sizeof(float));
if (h_data == 0) {printf("malloc fail\n"); return 1;}
cudaMalloc((void **)&d_data, DSIZE * sizeof(float));
cudaCheckErrors("cm1 fail");
// create random floats between 0 and 1
for (int i = 0; i < DSIZE; i++) h_data[i] = rand()/(float)RAND_MAX;
cudaMemcpy(d_data, h_data, DSIZE*sizeof(float), cudaMemcpyHostToDevice);
cudaCheckErrors("cmcp1 fail");
cudaMemcpyToSymbol(test, &(my_init.ulong), sizeof(unsigned long long int));
cudaCheckErrors("cmcp2 fail");
min_test<<<(DSIZE+nTPB-1)/nTPB, nTPB>>>(d_data);
cudaDeviceSynchronize();
cudaCheckErrors("kernel fail");
cudaMemcpyFromSymbol(&(my_init.ulong), test, sizeof(unsigned long long int));
cudaCheckErrors("cmcp3 fail");
printf("device min result = %f\n", my_init.floats[0]);
printf("device idx result = %d\n", my_init.ints[1]);
float host_val = 10.0f;
int host_idx = DSIZE;
for (int i=0; i<DSIZE; i++)
if (h_data[i] < host_val){
host_val = h_data[i];
host_idx = i;
}
printf("host min result = %f\n", host_val);
printf("host idx result = %d\n", host_idx);
return 0;
}
Here is a similar example that does atomic update of 2 float quantities.
Here is another custom atomic example.
#Robert Crovella: Excellent idea, but I think the function should be modified a little bit as follows:
__device__ unsigned long long int my_atomicMin(unsigned long long int* address, float val1, int val2)
{
my_atomics loc, loctest, old;
loc.floats[0] = val1;
loc.ints[1] = val2;
loctest.ulong = *address;
old.ulong = loctest.ulong;
while (loctest.floats[0] > val1){
old.ulong = loctest.ulong;
loctest.ulong = atomicCAS(address, loctest.ulong, loc.ulong);
}
return old.ulong;
}