Related
I am exploring to move from OpenCL to CUDA, and did a few tests to benchmark the speed of CUDA in various implementations. To my surprise, in the examples below, the PyCUDA implementation is about 20% faster than the C CUDA example.
I read many posts talking about "release build" of C CUDA code. I did try having -Xptxas -O3 in the makefile and that really did not make a difference. I also tried to adjust the block size, with which the kernel was executed. Unfortunately, it did not help improve the speed, either.
My questions here are:
What could be the reasons leading to the speed difference between C CUDA and PYCUDA?
If the "advanced" (lack of a better word) compiling in PYCUDA is one of reasons, how can I optimize the compiling of my C CUDA code?
Are there any other ways to improve the speed of C CUDA in this case?
While I appreciate general comments, I am looking for actionable suggestions that I can validate on my machine. Thanks!
import pycuda.autoinit
import pycuda.driver as drv
import numpy as np
from pycuda.compiler import SourceModule
import time
mod = SourceModule(
"""
__global__ void saxpy(int n, const float a, float *x, float *y)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n){
y[i] = a * x[i] + y[i];
}
}
"""
)
saxpy = mod.get_function("saxpy")
N = 1 << 25
time_elapse = 0.0
for i in range(100):
# print(i)
# print(N)
x = np.ones(N).astype(np.float32)
y = 2 * np.ones(N).astype(np.float32)
start = time.time()
saxpy(
np.int32(N),
np.float32(2.0),
drv.In(x),
drv.InOut(y),
block=(512, 1, 1),
grid=(int(N / 512) + 1, 1),
)
time_elapse += (time.time() - start)
print(time_elapse )
print(y[-100:-1])
print(y.sum())
print(N * 4.0)
#include <stdio.h>
#include <time.h>
#define DIM 512
__global__ void saxpy(int n, float a, float *x, float *y)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n)
y[i] = a * x[i] + y[i];
}
int main(int num_iterations)
{
double start;
double cputime;
int N = 1 << 25;
float *x, *y, *d_x, *d_y;
int i, j;
for (j = 0; j < num_iterations; j++)
{
x = (float *)malloc(N * sizeof(float));
y = (float *)malloc(N * sizeof(float));
cudaMalloc(&d_x, N * sizeof(float));
cudaMalloc(&d_y, N * sizeof(float));
for (i = 0; i < N; i++)
{
x[i] = 1.0f;
y[i] = 2.0f;
}
cudaMemcpy(d_x, x, N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, y, N * sizeof(float), cudaMemcpyHostToDevice);
// Perform SAXPY on 1M elements
start = clock();
saxpy<<<(N + DIM) / DIM, DIM>>>(N, 2.0f, d_x, d_y);
cputime += ((double)(clock() - start) / CLOCKS_PER_SEC);
cudaMemcpy(y, d_y, N * sizeof(float), cudaMemcpyDeviceToHost);
// float maxError = 0.0f;
// for (int i = 0; i < N; i++){
// maxError = max(maxError, abs(y[i] - 4.0f));
// //printf("y[%d]: %f\n", i,y[i]);
// }
// printf("Max error: %f\n", maxError);
cudaFree(d_x);
cudaFree(d_y);
free(x);
free(y);
}
printf("cpu time is %f\n", cputime);
return 0;
}
I saved the above file as cuda_example.cu and compile it with the following commands in a makefile:
nvcc -arch=sm_61 -Xptxas -O3,-v -o main cuda_example.cu
If I execute your CUDA-C code as is, and set num_iterations to 300 like this:
int num_iterations =300;
then the execution of your program takes about 60s on a Geforce GTX 1650. Your code is extremely inefficient, as you copy data back and forth between GPU and device at every iteration.
So, lets restrict the loop to just the kernel execution:
#include <stdio.h>
#include <time.h>
#define DIM 512
__global__ void saxpy(int n, float a, float *x, float *y)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n)
y[i] = a * x[i] + y[i];
}
int main()
{
double start = clock();
int N = 1 << 25;
float *x, *y, *d_x, *d_y;
int i, j;
int num_iterations = 300;
x = (float *)malloc(N * sizeof(float));
y = (float *)malloc(N * sizeof(float));
cudaMalloc(&d_x, N * sizeof(float));
cudaMalloc(&d_y, N * sizeof(float));
for (i = 0; i < N; i++)
{
x[i] = 1.0f;
y[i] = 2.0f;
}
cudaMemcpy(d_x, x, N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, y, N * sizeof(float), cudaMemcpyHostToDevice);
for (j = 0; j < num_iterations; j++){
saxpy<<<(N + DIM) / DIM, DIM>>>(N, 2.0f, d_x, d_y);
cudaDeviceSynchronize();
}
cudaMemcpy(y, d_y, N * sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(d_x);
cudaFree(d_y);
free(x);
free(y);
double cputime = ((double)(clock() - start) / CLOCKS_PER_SEC);
printf("cpu time is %f\n", cputime);
return 0;
}
If I do that, then the execution time becomes 1.36 seconds. Doing sth similar to the PyCUDA code I got about 19s of execution time.
This question already has an answer here:
How to find the sum of array in CUDA by reduction
(1 answer)
Closed 3 years ago.
I use reduction logic in code by referring How to find the sum of array in CUDA by reduction.
But It is giving some errors. I am not getting my mistake, could you please help me out??
required specification:
1.Cuda toolkit v6.5
2. graphics: GTX 210 (compute capability 1.2)
3. visual studio 2013
#include<stdio.h>
#include<cuda.h>
#include<malloc.h>
#include<conio.h>
#include<time.h>
#include<windows.h>
#define SIZE 10
#define N 100
__global__ void vectoreAdd(int *d_a, int *d_b, int *d_c)
{
__shared__ int sdata[256];
int i = threadIdx.x + (blockIdx.x*blockDim.x);
sdata[threadIdx.x] = d_a[i];
__syncthreads();
if (i<SIZE)
for (i = 2; i<SIZE; i++)
{
int counter = 0;
for (int j = 2; j<d_a[i]; j++)
{
if (d_a[i] % j == 0)
{
counter = 1; break;
}
}
if (counter == 0)
{
d_b[i] = d_a[i];
}
}
// do reduction in shared mem
for (int s = 1; s < blockDim.x; s *= 2)
{
int index = 2 * s * threadIdx.x;;
if (index < blockDim.x)
{
sdata[index] += sdata[index + s];
}
__syncthreads();
}
// write result for this block to global mem
if (threadIdx.x == 0)
atomicAdd(d_c, sdata[0]);
}
}
int main()
{
clock_t tic = clock();
int *a, *b, *summation=0, sum = 0,count=-1; //declare summation as double/long if needed
int *d_a, *d_b, *d_c;
//int blocks, block_size = 512;
int size = N * sizeof(int);
a = (int *)malloc(SIZE*sizeof(int));
b = (int *)malloc(SIZE*sizeof(int));
summation = (int *)malloc(SIZE*sizeof(int));
cudaMalloc((void**)&d_a, SIZE * sizeof(int));
cudaMalloc((void**)&d_b, SIZE * sizeof(int));
cudaMalloc((void**)&d_c, SIZE * sizeof(int));
for (int i = 1; i<SIZE; i++)
{
a[i] = i;
b[i] = 0;
}
cudaMemcpy(d_a, a, SIZE*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b, SIZE*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_c, c, SIZE*sizeof(int), cudaMemcpyHostToDevice);
/*blocks = SIZE / block_size;
if (SIZE% block_size != 0)
blocks++; */
dim3 blocksize(256); // create 1D threadblock
dim3 gridsize(N / blocksize.x); //create 1D grid
vectoreAdd << < gridsize, blocksize >> >(d_a, d_b, d_c);
//cudaThreadSynchronize();
cudaMemcpy(b, d_b, SIZE*sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(summation, d_c, SIZE*sizeof(int), cudaMemcpyDeviceToHost);
for (int m = 0; m < SIZE; m++)
{
if (b[m] != 0)
{
printf("\n prime no is:%d", b[m]);
count = count + 1;
}
}
printf("\n\n Total prime no. are: %d", count);
/* for (int j = 1; j<SIZE; j++)
{
sum = sum + b[j];
}*/
printf("\n \nsum of all prime no upto %d is:%d", SIZE, summation);
clock_t toc = clock();
printf("\n\nElapsed: %f seconds\n", (double)(toc - tic) / CLOCKS_PER_SEC);
free(a); free(b); free(summation);
cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
getchar(); return 0;
}
There are lots of mistakes in your code :
cudaMalloc((void**)&d_a, SIZE * sizeof(int));
should be :
cudaMalloc((void**)&d_a, N * sizeof(int)); //OR
cudaMalloc((void**)&d_a, size);
as you already calculated but didnt passed it. same in case of malloc() //Host code
This question already has an answer here:
Unable to execute device kernel in CUDA
(1 answer)
Closed 7 years ago.
What I am attempting to do is Multiply Matrix A & Matrix B and then from the product matrix I get the index of the maximum value per column. But unfortunately, only the first 128*128 values of the matrix multiplication are correct while others are just garbage. I do not quite understand how this works. I request you to kindly guide me with this ..
#include<stdio.h>
#include "cuda.h"
#include<stdlib.h>
#define blockD 32
const int wA = 128;
const int hA = 4096;
const int wB = 4096;
const int hB = wA;
main(void){
void MatrixMultiplication(float *, float *, float *, float *);
int size_A = wA * hA * sizeof(float);
int size_B = wB * hB * sizeof(float);
int size_C = wB * hA * sizeof(float);
int size_max = 2 * wB * sizeof(float);
float *M, *N, *P, *C;
// allocate memory on the CPU
M = (float*)malloc(size_A);
N = (float*)malloc(size_B);
P = (float*)malloc(size_max);
C = (float*)malloc(size_C);
// initialize the matrices
for (int y=0; y < hA; y++) {
for (int x=0; x < wA; x++){
M[y*wA + x] = 32; //x + y*wA;
}
}
for (int y=0; y<hB; y++) {
for (int x=0; x<wB; x++){
N[y*wB + x] = 21; //x + y*wB;
}
}
MatrixMultiplication(M, N, P, C);
//Write
FILE *f1;
int i,j;
f1 = fopen("C.txt","w");
for(i = hA - 2 ; i < hA; i ++){
for(j = 0; j < wB; j++){
fprintf(f1,"%d\t",int(C[i*wB + j]));
}
fprintf(f1,"\n");
}
fclose(f1);
// free the memory allocated on the CPU
free( M );
free( N );
free( P );
free( C );
cudaDeviceReset();
return 0;
}
__device__ void MaxFunction(float* Pd, float* max)
{
int x = (threadIdx.x + blockIdx.x * blockDim.x);
int y = (threadIdx.y + blockIdx.y * blockDim.y);
int k = 0;
int temp = 0; int temp_idx = 0;
for (k = 0; k < wB; ++k) {
if(Pd[x*wB + k] > temp){
temp = Pd[x*wB + k];
temp_idx = x*wB + k;
}
}
max[y*2 + 0] = temp;
max[y*2 + 1] = temp_idx;
}
__global__ void MatrixMulKernel(float* Md, float* Nd, float* Pd, float* max)
{
// declare cache in the shared memory
__shared__ float Mds[blockD][blockD];
__shared__ float Nds[blockD][blockD];
float Pvalue = 0;
// Loop over the Md and Nd block dimension required to compute the Pd element
for (int m = (wA * blockD * blockIdx.y), n = (blockD * blockIdx.x);
m < ((wA * blockD * blockIdx.y)+wA-1);
m += blockD, n += (blockD*hB)){
// collaboratively loading of Md and Nd blocks into shared memory
Mds[threadIdx.y][threadIdx.x] = Md[m + wA * threadIdx.y + threadIdx.x];
Nds[threadIdx.y][threadIdx.x] = Nd[n + wA * threadIdx.y + threadIdx.x];
__syncthreads();
// keep track of the running sum
for (int k = 0; k < blockD; k++)
Pvalue += Mds[threadIdx.y][k] * Nds[k][threadIdx.x];
__syncthreads();
}
// write back to the global memory
int p = hB * blockD * blockIdx.y + blockD * blockIdx.x;
Pd[p + hB * threadIdx.y + threadIdx.x] = Pvalue;
__syncthreads();
MaxFunction(Pd, max);
}
void MatrixMultiplication(float *M, float *N, float *P, float *C) {
int size_A = wA * hA * sizeof(float);
int size_B = wB * hB * sizeof(float);
int size_C = wB * hA * sizeof(float);
int size_max = 2 * wB * sizeof(float);
float *Md, *Nd, *Pd, *max;
// allocate memory on the GPU
cudaMalloc((void**)&Md, size_A);
cudaMalloc((void**)&Nd, size_B);
cudaMalloc((void**)&Pd, size_C);
cudaMalloc((void**)&max, size_max);
// transfer M and N to device memory
cudaMemcpy(Md, M, size_A, cudaMemcpyHostToDevice);
cudaMemcpy(Nd, N, size_B, cudaMemcpyHostToDevice);
// kernel invocation code
dim3 dimBlock(blockD, blockD);
dim3 dimGrid(wA/blockD, hB/blockD);
//Execute Kernel
MatrixMulKernel<<<dimGrid, dimBlock>>>( Md, Nd, Pd, max);
// transfer P from device
cudaMemcpy(P, max, size_max, cudaMemcpyDeviceToHost);
cudaMemcpy(C, Pd, size_C, cudaMemcpyDeviceToHost);
// free the memory allocated on the GPU
cudaFree(Md);
cudaFree(Nd);
cudaFree(Pd);
cudaFree(max);
}
In your code you seem to have more than one problem. One of the problems is, in place of this:
dim3 dimGrid(wA/blockD, hB/blockD);
You should have this:
dim3 dimGrid(wB/blockD, hA/blockD);
Ultimately you need one thread in your grid for each output point. Your formulation was giving you a grid of 4 blocks by 4 blocks, whereas you need a grid of 128 blocks by 128 blocks.
The other problem I found with your code was in these lines in the kernel:
int p = hB * blockD * blockIdx.y + blockD * blockIdx.x;
Pd[p + hB * threadIdx.y + threadIdx.x] = Pvalue;
They are not indexing properly through the output array. Rather than try to sort it out using your scheme, I used this instead:
Pd[(threadIdx.x + (blockIdx.x * blockDim.x)) + ((threadIdx.y + (blockIdx.y * blockDim.y))*(gridDim.x*blockDim.x))] = Pvalue;
When I made the above two changes to your code, I got what I believe are correct results throughout the array. And it took about 32 seconds on my machine to run it. (Note that I haven't tried fixing your original max-finding code -- see below for a better approach.)
Based on your previous question, you seemed to be concerned about speed. If you want to do fast matrix multiply, you should use cublas. The following code shows how to use cublas to multiply two ordinary C-style matrices (they don't have to be square). I've also included a column-max finding kernel that will be fast when the number of columns is large (say, over 500 or so. You have 4096 columns in your example). For small numbers of columns, there may be quicker ways to perform this function, but small numbers of columns also suggests that the overall problem size may be small and so speed (of this piece of code) will not really be an issue.
Here's the code:
#include <stdio.h>
#include <cublas_v2.h>
#define VERBOSE 1
#define nTPB 64
#define ROW_A 4
#define COL_A 4
#define ROW_B COL_A
#define COL_B 4
#define ROW_C ROW_A
#define COL_C COL_B
#define SIZ_A (ROW_A*COL_A)
#define SIZ_B (ROW_B*COL_B)
#define SIZ_C (ROW_C*COL_C)
// error check macros
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
// for CUBLAS V2 API
#define cublasCheckErrors(fn) \
do { \
cublasStatus_t __err = fn; \
if (__err != CUBLAS_STATUS_SUCCESS) { \
fprintf(stderr, "Fatal cublas error: %d (at %s:%d)\n", \
(int)(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
__global__ void col_max(float *mat, float *max, unsigned int *midx, unsigned int rows, unsigned int cols){
int idx = threadIdx.x + blockDim.x*blockIdx.x;
if (idx < cols){
float tempmax = mat[idx];
unsigned int tempmidx = 0;
for (int i = 1; i< rows; i++)
if (mat[idx + (i*cols)] > tempmax){
tempmax = mat[idx + (i*cols)];
tempmidx = i;}
max[idx] = tempmax;
midx[idx] = tempmidx;
}
}
int main(){
float *h_A, *h_B, *h_C, *d_A, *d_B, *d_C, *h_max, *d_max;
unsigned int *h_idx, *d_idx;
h_A = (float *)malloc(SIZ_A*sizeof(float));
if (h_A==0) {printf("malloc fail\n"); return -1;}
h_B = (float *)malloc(SIZ_B*sizeof(float));
if (h_B==0) {printf("malloc fail\n"); return -1;}
h_C = (float *)malloc(SIZ_C*sizeof(float));
if (h_C==0) {printf("malloc fail\n"); return -1;}
h_max = (float *)malloc(COL_C*sizeof(float));
if (h_max==0) {printf("malloc fail\n"); return -1;}
h_idx = (unsigned int*)malloc(COL_C*sizeof(unsigned int));
if (h_idx==0) {printf("malloc fail\n"); return -1;}
cudaMalloc((void **)&d_A, SIZ_A*sizeof(float));
cudaMalloc((void **)&d_B, SIZ_B*sizeof(float));
cudaMalloc((void **)&d_C, SIZ_C*sizeof(float));
cudaMalloc((void **)&d_max, COL_C*sizeof(float));
cudaMalloc((void **)&d_idx, COL_C*sizeof(unsigned int));
cudaCheckErrors("cuda malloc fail");
// initialize data
for (int i=0; i< SIZ_A; i++) h_A[i] = (float)(i+1);
for (int i=0; i< SIZ_B; i++) h_B[i] = (float)(i+2);
cudaMemcpy(d_A, h_A, SIZ_A*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, SIZ_B*sizeof(float), cudaMemcpyHostToDevice);
cudaCheckErrors("cuda memcpy 1 fail");
const float alpha = 1.0f;
const float beta = 0.0f;
cublasHandle_t handle;
cublasCheckErrors(cublasCreate(&handle));
// C = A*B
// due to cublas expecting column-major storage, parameters
// are scrambled
cublasCheckErrors(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, COL_B, ROW_A, COL_A, &alpha, d_B, COL_B, d_A, COL_A, &beta, d_C, COL_C));
cudaMemcpy(h_C, d_C, SIZ_C*sizeof(float), cudaMemcpyDeviceToHost);
cudaCheckErrors("cuda memcpy 2 fail");
col_max<<<(COL_C + nTPB - 1)/nTPB, nTPB>>>(d_C, d_max, d_idx, ROW_C, COL_C);
cudaCheckErrors("kernel launch fail");
cudaMemcpy(h_max, d_max, COL_C*sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(h_idx, d_idx, COL_C*sizeof(unsigned int), cudaMemcpyDeviceToHost);
cudaCheckErrors("cuda memcpy 3 fail/kernel fail");
if (VERBOSE){
printf("A: \n");
for (int i=0; i< ROW_A; i++){
for (int j=0; j< COL_A; j++)
printf("%7.5G", h_A[j+(i*COL_A)]);
printf("\n");}
printf("B: \n");
for (int i=0; i< ROW_B; i++){
for (int j=0; j< COL_B; j++)
printf("%7.5G", h_B[j+(i*COL_B)]);
printf("\n");}
printf("C = A*B: \n");
for (int i=0; i< ROW_C; i++){
for (int j=0; j< COL_C; j++)
printf("%7.5G", h_C[j+(i*COL_C)]);
printf("\n");}
printf("COLUMN MAX:\n");
for (int i=0; i< COL_C; i++)
printf("%7.5G", h_max[i]);
printf("\nCOLUMN MAX IDX:\n");
for (int i=0; i< COL_C; i++)
printf("%7d", h_idx[i]);
}
printf("\n finished!\n");
return 0;
}
Here's what I used to compile:
$ nvcc -arch=sm_20 -O3 -o t221 t221.cu -lcublas
And here's the sample output:
$ cuda-memcheck ./t221
========= CUDA-MEMCHECK
A:
1 2 3 4
5 6 7 8
9 10 11 12
13 14 15 16
B:
2 3 4 5
6 7 8 9
10 11 12 13
14 15 16 17
C = A*B:
100 110 120 130
228 254 280 306
356 398 440 482
484 542 600 658
COLUMN MAX:
484 542 600 658
COLUMN MAX IDX:
3 3 3 3
finished!
========= ERROR SUMMARY: 0 errors
$
When I extended my code to handle the same sizes you indicated, (A = 4096x128, B=128x4096) it took about 1 second on my machine. So it's much faster than your code. However, when I take your code and comment out your call to MaxFunction in the kernel, it also only takes about 1 second to compute the matrix multiply result. So if you wanted to keep your matrix multiply code (i.e. not use cublas) you could break the code into 2 kernels, and use your multiply routine in the first kernel with my max-finding routine (col_max) in the second kernel, and also probably get a pretty fast result.
As #talonmies indicated, if you are running on a windows machine, be sure you are aware of the ramifications of windows TDR. (search that in the upper right corner search box if needed)
I've been trying to write a kernel in that calculates the sum of the inverse of the distance between N given points over N. A serial coda in C would be like
average = 0;
for(int i = 0; i < Np; i++){
for(int j = i + 1; j < Np; j++){
average += 1.0e0f/sqrtf((rx[i]-rx[j])*(rx[i]-rx[j]) + (ry[i]-ry[j])*(ry[i]-ry[j]));
}
}
average = average/(float)N;
Where rx and ry are the x and y coordinates, respectively.
I generate the points via a kernel that uses random number generator. For the kernel, I used 128(256) threads per block for 4k(8k) points. On it every thread performs the inner above inner loop, then the results are passed to a reduce sum function, as follows
Generate points:
__global__ void InitRNG ( curandState * state, const int seed ){
int tIdx = blockIdx.x*blockDim.x + threadIdx.x;
curand_init (seed, tIdx, 0, &state[tIdx]);
}
__global__
void SortPoints(float* X, float* Y,const int N, curandState *state){
float rdmn1, rdmn2;
unsigned int tIdx = blockIdx.x*blockDim.x + threadIdx.x;
float range;
if(tIdx < N){
rdmn1 = curand_uniform(&state[tIdx]);
rdmn2 = curand_uniform(&state[tIdx]);
range = sqrtf(0.25e0f*N*rdmn1);
X[tIdx] = range*cosf(2.0e0f*pi*rdmn2);
Y[tIdx] = range*sinf(2.0e0f*pi*rdmn2);
}
}
Reduction:
__device__
float ReduceSum2(float In){
__shared__ float data[BlockSize];
unsigned int tIdx = threadIdx.x;
data[tIdx] = In;
__syncthreads();
for(unsigned int i = blockDim.x/2; i > 0; i >>= 1){
if(tIdx < i){
data[tIdx] += data[tIdx + i];
}
__syncthreads();
}
return data[0];
}
Kernel:
__global__
void AvgDistance(float *X, float *Y, float *Avg, const int N){
int tIdx = blockIdx.x*blockDim.x + threadIdx.x;
int bIdx = blockIdx.x;
float x , y;
float d = 0.0f;
if(tIdx < N){
for(int i = tIdx + 1; i < N ; i++){
x = X[tIdx] - X[i];
y = Y[tIdx] - Y[i];
d += 1.0e0f/(sqrtf(x*x + y*y));
}
__syncthreads();
Avg[bIdx] = ReduceSum2(d);
}
}
The kernel is configured and launched as follows:
dim3 threads(BlockSize,BlockSize);
dim3 blocks(ceil(Np/threads.x),ceil(Np/threads.y));
InitRNG<<<blocks.x,threads.x>>>(d_state,seed);
SortPoints<<<blocks.x,threads.x>>>(d_rx,d_ry,Np,d_state);
AvgDistance<<<blocks.x,threads.x,threads.x*sizeof(float)>>>(d_rx,d_ry,d_Avg,Np);
Finally, I copy the data back to host and then perform the remaining sum:
Avg = new float[blocks.x];
CHECK(cudaMemcpy(Avg,d_Avg,blocks.x*sizeof(float),cudaMemcpyDeviceToHost),ERROR_CPY_DEVTOH);
float average = 0;
for(int i = 0; i < blocks.x; i++){
average += Avg[i];
}
average = average/(float)Np;
For 4k points, ok! the results are:
Average distance between points (via Kernel) = 108.615
Average distance between points (via CPU) = 110.191
In this case the sum may be performed in different order, causing both results to diverge from each other, I don't know...
But when it comes to 8k, the results are quiet different:
Average distance between points (via Kernel) = 153.63
Average distance between points (via CPU) = 131.471
To me it seems that both the kernel and the serial code are written the same way. What leads me to distrust the precision on CUDA calculation of floating point numbers. Does this make sense? Or are the access to global memory causing some conflicts when some threads load the same data from X and Y at the same time? Or the way I wrote the kernel is in some way 'wrong'(I mean, am I doing something that is causing both results to diverge from each other?).
Actually, from what I can tell, the problem seems to be on the CPU side. I created a sample code based on your code.
I was able to reproduce your results.
First I switched all instances of sinf, cosf, and sqrtf to their corresponding double versions. This made no difference in the results.
Next I included a typedef so I could easily switch the precision from float to double and back, replacing every relevant instance of float in the code with mytype which is my typedef.
When I run the code with typedef of float and a data size of 4096 I get these results:
GPU average = 108.294922
CPU average = 109.925285
When I run the code with typedef of double and a data size of 4096 I get these results:
GPU average = 108.294903
CPU average = 108.294903
When I run the code with typedef of float and a data size of 8192 I get these results:
GPU average = 153.447327
CPU average = 131.473526
When I run the code with typedef of double and a data size of 8192 I get these results:
GPU average = 153.447380
CPU average = 153.447380
There are at least 2 observations:
The GPU results don't vary between float and double, except in the 5th decimal place
The CPU results vary by 1-20% or so between float and double, but when double is selected, they line up exactly (to the 6th decimal place, anyway) with the GPU results.
Based on this, I believe the CPU is providing the variable, questionable behavior.
Here's my code for reference:
#include <stdio.h>
#include <curand.h>
#include <curand_kernel.h>
#define DSIZE 8192
#define BlockSize 32
#define pi 3.14159f
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
typedef double mytype;
__global__ void InitRNG ( curandState * state, const int seed ){
int tIdx = blockIdx.x*blockDim.x + threadIdx.x;
curand_init (seed, tIdx, 0, &state[tIdx]);
}
__global__
void SortPoints(mytype* X, mytype* Y,const int N, curandState *state){
mytype rdmn1, rdmn2;
unsigned int tIdx = blockIdx.x*blockDim.x + threadIdx.x;
mytype range;
if(tIdx < N){
rdmn1 = curand_uniform(&state[tIdx]);
rdmn2 = curand_uniform(&state[tIdx]);
range = sqrt(0.25e0f*N*rdmn1);
X[tIdx] = range*cos(2.0e0f*pi*rdmn2);
Y[tIdx] = range*sin(2.0e0f*pi*rdmn2);
}
}
__device__
mytype ReduceSum2(mytype In){
__shared__ mytype data[BlockSize];
unsigned int tIdx = threadIdx.x;
data[tIdx] = In;
__syncthreads();
for(unsigned int i = blockDim.x/2; i > 0; i >>= 1){
if(tIdx < i){
data[tIdx] += data[tIdx + i];
}
__syncthreads();
}
return data[0];
}
__global__
void AvgDistance(mytype *X, mytype *Y, mytype *Avg, const int N){
int tIdx = blockIdx.x*blockDim.x + threadIdx.x;
int bIdx = blockIdx.x;
mytype x , y;
mytype d = 0.0f;
if(tIdx < N){
for(int i = tIdx + 1; i < N ; i++){
x = X[tIdx] - X[i];
y = Y[tIdx] - Y[i];
d += 1.0e0f/(sqrt(x*x + y*y));
}
__syncthreads();
Avg[bIdx] = ReduceSum2(d);
}
}
mytype cpu_avg(const mytype *rx, const mytype *ry, const int size){
mytype average = 0.0f;
for(int i = 0; i < size; i++){
for(int j = i + 1; j < size; j++){
average += 1.0e0f/sqrt((rx[i]-rx[j])*(rx[i]-rx[j]) + (ry[i]-ry[j])*(ry[i]-ry[j]));
}
}
average = average/(mytype)size;
return average;
}
int main() {
int Np = DSIZE;
mytype *rx, *ry, *d_rx, *d_ry, *d_Avg, *Avg;
curandState *d_state;
int seed = 1;
dim3 threads(BlockSize,BlockSize);
dim3 blocks((int)ceilf(Np/(float)threads.x),(int)ceilf(Np/(float)threads.y));
printf("number of blocks = %d\n", blocks.x);
printf("number of threads= %d\n", threads.x);
rx = (mytype *)malloc(DSIZE*sizeof(mytype));
if (rx == 0) {printf("malloc fail\n"); return 1;}
ry = (mytype *)malloc(DSIZE*sizeof(mytype));
if (ry == 0) {printf("malloc fail\n"); return 1;}
cudaMalloc((void**)&d_rx, DSIZE * sizeof(mytype));
cudaMalloc((void**)&d_ry, DSIZE * sizeof(mytype));
cudaMalloc((void**)&d_Avg, blocks.x * sizeof(mytype));
cudaMalloc((void**)&d_state, DSIZE * sizeof(curandState));
cudaCheckErrors("cudamalloc");
InitRNG<<<blocks.x,threads.x>>>(d_state,seed);
SortPoints<<<blocks.x,threads.x>>>(d_rx,d_ry,Np,d_state);
AvgDistance<<<blocks.x,threads.x,threads.x*sizeof(mytype)>>>(d_rx,d_ry,d_Avg,Np);
cudaCheckErrors("kernels");
Avg = new mytype[blocks.x];
cudaMemcpy(Avg,d_Avg,blocks.x*sizeof(mytype),cudaMemcpyDeviceToHost);
cudaMemcpy(rx, d_rx, DSIZE*sizeof(mytype),cudaMemcpyDeviceToHost);
cudaMemcpy(ry, d_ry, DSIZE*sizeof(mytype),cudaMemcpyDeviceToHost);
cudaCheckErrors("cudamemcpy");
mytype average = 0;
for(int i = 0; i < blocks.x; i++){
average += Avg[i];
}
average = average/(mytype)Np;
printf("GPU average = %f\n", average);
average = cpu_avg(rx, ry, DSIZE);
printf("CPU average = %f\n", average);
return 0;
}
I am running on RHEL 5.5, CUDA 5.0, Intel Xeon X5560
compiled with:
nvcc -O3 -arch=sm_20 -lcurand -lm -o t93 t93.cu
EDIT:
After observing that the variability was on the CPU side, I found that I could eliminate most of the CPU variability by modifying your CPU averaging code like this:
mytype cpu_avg(const mytype *rx, const mytype *ry, const int size){
mytype average = 0.0f;
mytype temp = 0.0f;
for(int i = 0; i < size; i++){
for(int j = i + 1; j < size; j++){
temp += 1.0e0f/sqrt((rx[i]-rx[j])*(rx[i]-rx[j]) + (ry[i]-ry[j])*(ry[i]-ry[j]));
}
average += temp/(mytype)size;
temp = 0.0f;
}
return average;
}
So I would say there's a problem with intermediate results on the CPU side. It's interesting that it doesn't show up on the GPU result. I suspect the reason for this is that the final summation of GPU averages is done on the CPU (therefore each individual GPU block result is scaled down by the size, e.g. 8192), and these may have an intermediate precision that is sufficient to survive until the final division. If you inlined the CPU average calculation, you may observe something different again.
I have two programs. the only difference is that one uses constant memory to store input while the other uses global memory.I want to know why the global memory one is faster than the constant memory one? They both compute dot product btw 2 matrices
#include<cuda_runtime.h>
#include<cuda.h>
#include<stdio.h>
#include<stdlib.h>
#define intMin(a,b) ((a<b)?a:b)
//Threads per block
#define TPB 128
//blocks per grid
#define BPG intMin(128, ((n+TPB-1)/TPB))
const int n = 4;
__constant__ float deva[n],devb[n];
__global__ void addVal( float *c){
int tid = blockIdx.x * blockDim.x + threadIdx.x;
//Using shared memory to temporary store results
__shared__ float cache[TPB];
float temp = 0;
while(tid < n){
temp += deva[tid] * devb[tid];
tid += gridDim.x * blockDim.x;
}
cache[threadIdx.x] = temp;
__syncthreads();
int i = blockDim.x/2;
while( i !=0){
if(threadIdx.x < i){
cache[threadIdx.x] = cache[threadIdx.x] +cache[threadIdx.x + i] ;
}
__syncthreads();
i = i/2;
}
if(threadIdx.x == 1){
c[blockIdx.x ] = cache[0];
}
}
int main(){
float a[n] , b[n] , c[BPG];
//float *deva, *devb, *devc;
float *devc;
int i;
//Filling with random values to test
for( i =0; i< n; i++){
a[i] = i;
b[i] = i*2;
}
//cudaMalloc((void**)&deva, n * sizeof(float));
//cudaMalloc((void**)&devb, n * sizeof(float));
cudaMalloc((void**)&devc, BPG * sizeof(float));
//cudaMemcpy(deva, a, n *sizeof(float), cudaMemcpyHostToDevice);
//cudaMemcpy(devb, b, n *sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(deva, a, n * sizeof(float));
cudaMemcpyToSymbol(devb, b, n * sizeof(float));
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
//Call function to do dot product
addVal<<<BPG, TPB>>>( devc);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float time;
cudaEventElapsedTime(&time,start, stop);
printf("The elapsed time is: %f\n", time);
//copy result back
cudaMemcpy(c, devc, BPG * sizeof(float), cudaMemcpyDeviceToHost);
float sum =0 ;
for ( i = 0 ; i< BPG; i++){
sum+=c[i];
}
//display answer
printf("%f\n",sum);
getchar();
return 0;
}
Below is the global memory version.
#include<cuda_runtime.h>
#include<cuda.h>
#include<stdio.h>
#include<stdlib.h>
#define intMin(a,b) ((a<b)?a:b)
//Threads per block
#define TPB 128
//blocks per grid
#define BPG intMin(128, ((n+TPB-1)/TPB))
const int n = 4;
__global__ void addVal(float *a, float *b, float *c){
int tid = blockIdx.x * blockDim.x + threadIdx.x;
//Using shared memory to temporary store results
__shared__ float cache[TPB];
float temp = 0;
while(tid < n){
temp += a[tid] * b[tid];
tid += gridDim.x * blockDim.x;
}
cache[threadIdx.x] = temp;
__syncthreads();
int i = blockDim.x/2;
while( i !=0){
if(threadIdx.x < i){
cache[threadIdx.x] = cache[threadIdx.x] +cache[threadIdx.x + i] ;
}
__syncthreads();
i = i/2;
}
if(threadIdx.x == 1){
c[blockIdx.x ] = cache[0];
}
}
int main(){
float a[n] , b[n] , c[BPG];
float *deva, *devb, *devc;
int i;
//Filling with random values to test
for( i =0; i< n; i++){
a[i] = i;
b[i] = i*2;
}
printf("Not using constant memory\n");
cudaMalloc((void**)&deva, n * sizeof(float));
cudaMalloc((void**)&devb, n * sizeof(float));
cudaMalloc((void**)&devc, BPG * sizeof(float));
cudaMemcpy(deva, a, n *sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(devb, b, n *sizeof(float), cudaMemcpyHostToDevice);
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
//Call function to do dot product
addVal<<<BPG, TPB>>>(deva, devb, devc);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float time;
cudaEventElapsedTime(&time,start, stop);
printf("The elapsed time is: %f\n", time);
//copy result back
cudaMemcpy(c, devc, BPG * sizeof(float), cudaMemcpyDeviceToHost);
float sum =0 ;
for ( i = 0 ; i< BPG; i++){
sum+=c[i];
}
//display answer
printf("%f\n",sum);
getchar();
return 0;
}
You are not getting advantage of the constant memory.
A single read from constant memory can be broadcast to a half-warp (not your case as every thread load from its own tid).
Constant memory is cached (not used in your case as you only read once from each position in the constant memory array).
As each thread in a half-warp does a single read to different data, the 16 different reads get serialized, taking 16 times the amount of time to place the request.
If they are reading from global memory, the request are done at the same time, coalesced. That's why your global memory example is better than the constant memory.
Of course, this conclusion can vary with devices of compute capability 2.x with a L1 and L2 cache.
Regards!