Hi i have a problem with the time response I am getting a longer response time on GPU than CPU
the algorithm used is a matrix multiplication algorithm
using the next functions:
// Start timers
cudaEvent_t timer1, timer2;
cudaEventCreate(&timer1);
cudaEventCreate(&timer2);
cudaEventRecord(timer1, 0);
cudaEventSynchronize(timer1);
// Stop timers
cudaEventRecord(timer2, 0);
cudaEventSynchronize(timer1);
cudaEventSynchronize(timer2);
float elapsed;
cudaEventElapsedTime(&elapsed, timer1, timer2);
cudaDeviceReset();
return elapsed;
here is my code on GPU:
float Mult_gpu(float* hostPtr, float* hostPtr2, float* hostPtr3, int size, int Ncols, int Nrows, int n) {
size_t pitch;
check("Creating timers");
cudaEvent_t timer1, timer2;
cudaEventCreate(&timer1);
cudaEventCreate(&timer2);
cudaEventRecord(timer1, 0);
cudaEventSynchronize(timer1);
/******************************************/
/***Configuracion de las matrices en gpu***/
/******************************************/
float* devPtr;
cudaMallocPitch(&devPtr, &pitch, n * sizeof(float), Nrows);
cudaMemcpy2D(devPtr, pitch, hostPtr, n * sizeof(float), n * sizeof(float), Nrows, cudaMemcpyHostToDevice);
float* devPtr2;
cudaMallocPitch(&devPtr2, &pitch, Ncols * sizeof(float), n);
cudaMemcpy2D(devPtr2, pitch, hostPtr2, Ncols * sizeof(float), Ncols * sizeof(float), n, cudaMemcpyHostToDevice);
float* devPtr3;
cudaMallocPitch(&devPtr3, &pitch, Ncols * sizeof(float), Nrows);
//dim3 gridSize(iDivUp(Ncols3, BLOCKSIZE_x), iDivUp(Nrows3, BLOCKSIZE_y));
//dim3 blockSize(BLOCKSIZE_y, BLOCKSIZE_x);
dim3 block(32, 32); //hilos por bloque
dim3 grid((size / block.x) + 1, (size / block.y) + 1); //numero de bloques
/**************************/
/**Lanzamiento del kernel**/
/**************************/
Mult << <grid, block >> > (devPtr, devPtr2, devPtr3, pitch, Ncols, Nrows, n);
cudaDeviceSynchronize();
/*********************************/
/***Copiado de devPtr a hosPtr2***/
/*********************************/
cudaMemcpy2D(hostPtr3, Ncols * sizeof(float), devPtr3, pitch, Ncols * sizeof(float), Nrows, cudaMemcpyDeviceToHost);
//cudaMemcpy(hostPtr3, devPtr3, size * sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(devPtr);
cudaFree(devPtr2);
cudaFree(devPtr3);
// Stop timers
cudaEventRecord(timer2, 0);
cudaEventSynchronize(timer1);
cudaEventSynchronize(timer2);
float elapsed;
cudaEventElapsedTime(&elapsed, timer1, timer2);
cudaDeviceReset();
return elapsed;
}
and here is my code on CPU
float Mult_cpu(float* hostPtrA, float* HostPtrB, float* hostPtrC, int Ncols, int Nrows, int n)
{
cudaEvent_t timer1, timer2;
cudaEventCreate(&timer1);
cudaEventCreate(&timer2);
cudaEventRecord(timer1, 0);
cudaEventSynchronize(timer1);
for (int i = 0; i < Nrows; ++i) {
for (int j = 0; j < Ncols; ++j) {
float suma = 0;
for (int k = 0; k < n; ++k) {
suma += hostPtrA[i * n + k] * HostPtrB[k * Ncols + j];
}
hostPtrC[i * Ncols + j] = suma;
}
}
// Stop timers
cudaEventRecord(timer2, 0);
cudaEventSynchronize(timer1);
cudaEventSynchronize(timer2);
float elapsed;
cudaEventElapsedTime(&elapsed, timer1, timer2);
return elapsed;
}
when i use a matrix 500x500 or any matrix the CPU is faster than GPU and i don't understand why i don't know if the problem is my kernel program or the CUDA functions that im using
my kernel code
__global__ void Mult(float* devPtrA, float* devPtrB, float* devPtrC, size_t pitch, int Ncols, int Nrows, int n)
{
float temp;
int r = blockDim.y * blockIdx.y + threadIdx.y; //for (int f = 0; f <= fil - 1; f += 1) equivalencia en for
int c = blockDim.x * blockIdx.x + threadIdx.x; //for (int c = 0; c <= col - 1; c += 1)
if ((r < Ncols) && (c < Nrows)) {
for (int c2 = 0; c2 < n; c2++) {
float* vertical = (float*)((char*)devPtrA + r * pitch);
float element1 = vertical[c2];
float* horizontal = (float*)((char*)devPtrB + c2 * pitch);
float element2 = horizontal[c];
temp += element1 * element2;
}
//printf("\nla fila es: %d la columna es: %d el valor es: %8.4f\n\n", r, c, temp);
float* vertical2 = (float*)((char*)devPtrC + r * pitch);
vertical2[c] = temp;
}
}
You should read on the concept of SIMT architecture, CUDA execution model and branch divergence. Analyze your CUDA kernel performance with a profiler. I suspect that the condition if ((r < Ncols) && (c < Nrows)) in your kernel causes threads in each warp to diverge and hence the reduced performance. Also pitch affects the global memory access pattern in your code which is another factor in the performance of CUDA kernels. Some other excellent optimization tips can be found here.
CudaMalloc is really slow. If you know the size of your matrices beforehand, do the initialization at the beginning of your program.
Related
I am exploring to move from OpenCL to CUDA, and did a few tests to benchmark the speed of CUDA in various implementations. To my surprise, in the examples below, the PyCUDA implementation is about 20% faster than the C CUDA example.
I read many posts talking about "release build" of C CUDA code. I did try having -Xptxas -O3 in the makefile and that really did not make a difference. I also tried to adjust the block size, with which the kernel was executed. Unfortunately, it did not help improve the speed, either.
My questions here are:
What could be the reasons leading to the speed difference between C CUDA and PYCUDA?
If the "advanced" (lack of a better word) compiling in PYCUDA is one of reasons, how can I optimize the compiling of my C CUDA code?
Are there any other ways to improve the speed of C CUDA in this case?
While I appreciate general comments, I am looking for actionable suggestions that I can validate on my machine. Thanks!
import pycuda.autoinit
import pycuda.driver as drv
import numpy as np
from pycuda.compiler import SourceModule
import time
mod = SourceModule(
"""
__global__ void saxpy(int n, const float a, float *x, float *y)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n){
y[i] = a * x[i] + y[i];
}
}
"""
)
saxpy = mod.get_function("saxpy")
N = 1 << 25
time_elapse = 0.0
for i in range(100):
# print(i)
# print(N)
x = np.ones(N).astype(np.float32)
y = 2 * np.ones(N).astype(np.float32)
start = time.time()
saxpy(
np.int32(N),
np.float32(2.0),
drv.In(x),
drv.InOut(y),
block=(512, 1, 1),
grid=(int(N / 512) + 1, 1),
)
time_elapse += (time.time() - start)
print(time_elapse )
print(y[-100:-1])
print(y.sum())
print(N * 4.0)
#include <stdio.h>
#include <time.h>
#define DIM 512
__global__ void saxpy(int n, float a, float *x, float *y)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n)
y[i] = a * x[i] + y[i];
}
int main(int num_iterations)
{
double start;
double cputime;
int N = 1 << 25;
float *x, *y, *d_x, *d_y;
int i, j;
for (j = 0; j < num_iterations; j++)
{
x = (float *)malloc(N * sizeof(float));
y = (float *)malloc(N * sizeof(float));
cudaMalloc(&d_x, N * sizeof(float));
cudaMalloc(&d_y, N * sizeof(float));
for (i = 0; i < N; i++)
{
x[i] = 1.0f;
y[i] = 2.0f;
}
cudaMemcpy(d_x, x, N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, y, N * sizeof(float), cudaMemcpyHostToDevice);
// Perform SAXPY on 1M elements
start = clock();
saxpy<<<(N + DIM) / DIM, DIM>>>(N, 2.0f, d_x, d_y);
cputime += ((double)(clock() - start) / CLOCKS_PER_SEC);
cudaMemcpy(y, d_y, N * sizeof(float), cudaMemcpyDeviceToHost);
// float maxError = 0.0f;
// for (int i = 0; i < N; i++){
// maxError = max(maxError, abs(y[i] - 4.0f));
// //printf("y[%d]: %f\n", i,y[i]);
// }
// printf("Max error: %f\n", maxError);
cudaFree(d_x);
cudaFree(d_y);
free(x);
free(y);
}
printf("cpu time is %f\n", cputime);
return 0;
}
I saved the above file as cuda_example.cu and compile it with the following commands in a makefile:
nvcc -arch=sm_61 -Xptxas -O3,-v -o main cuda_example.cu
If I execute your CUDA-C code as is, and set num_iterations to 300 like this:
int num_iterations =300;
then the execution of your program takes about 60s on a Geforce GTX 1650. Your code is extremely inefficient, as you copy data back and forth between GPU and device at every iteration.
So, lets restrict the loop to just the kernel execution:
#include <stdio.h>
#include <time.h>
#define DIM 512
__global__ void saxpy(int n, float a, float *x, float *y)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n)
y[i] = a * x[i] + y[i];
}
int main()
{
double start = clock();
int N = 1 << 25;
float *x, *y, *d_x, *d_y;
int i, j;
int num_iterations = 300;
x = (float *)malloc(N * sizeof(float));
y = (float *)malloc(N * sizeof(float));
cudaMalloc(&d_x, N * sizeof(float));
cudaMalloc(&d_y, N * sizeof(float));
for (i = 0; i < N; i++)
{
x[i] = 1.0f;
y[i] = 2.0f;
}
cudaMemcpy(d_x, x, N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, y, N * sizeof(float), cudaMemcpyHostToDevice);
for (j = 0; j < num_iterations; j++){
saxpy<<<(N + DIM) / DIM, DIM>>>(N, 2.0f, d_x, d_y);
cudaDeviceSynchronize();
}
cudaMemcpy(y, d_y, N * sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(d_x);
cudaFree(d_y);
free(x);
free(y);
double cputime = ((double)(clock() - start) / CLOCKS_PER_SEC);
printf("cpu time is %f\n", cputime);
return 0;
}
If I do that, then the execution time becomes 1.36 seconds. Doing sth similar to the PyCUDA code I got about 19s of execution time.
This question already has an answer here:
How to find the sum of array in CUDA by reduction
(1 answer)
Closed 3 years ago.
I use reduction logic in code by referring How to find the sum of array in CUDA by reduction.
But It is giving some errors. I am not getting my mistake, could you please help me out??
required specification:
1.Cuda toolkit v6.5
2. graphics: GTX 210 (compute capability 1.2)
3. visual studio 2013
#include<stdio.h>
#include<cuda.h>
#include<malloc.h>
#include<conio.h>
#include<time.h>
#include<windows.h>
#define SIZE 10
#define N 100
__global__ void vectoreAdd(int *d_a, int *d_b, int *d_c)
{
__shared__ int sdata[256];
int i = threadIdx.x + (blockIdx.x*blockDim.x);
sdata[threadIdx.x] = d_a[i];
__syncthreads();
if (i<SIZE)
for (i = 2; i<SIZE; i++)
{
int counter = 0;
for (int j = 2; j<d_a[i]; j++)
{
if (d_a[i] % j == 0)
{
counter = 1; break;
}
}
if (counter == 0)
{
d_b[i] = d_a[i];
}
}
// do reduction in shared mem
for (int s = 1; s < blockDim.x; s *= 2)
{
int index = 2 * s * threadIdx.x;;
if (index < blockDim.x)
{
sdata[index] += sdata[index + s];
}
__syncthreads();
}
// write result for this block to global mem
if (threadIdx.x == 0)
atomicAdd(d_c, sdata[0]);
}
}
int main()
{
clock_t tic = clock();
int *a, *b, *summation=0, sum = 0,count=-1; //declare summation as double/long if needed
int *d_a, *d_b, *d_c;
//int blocks, block_size = 512;
int size = N * sizeof(int);
a = (int *)malloc(SIZE*sizeof(int));
b = (int *)malloc(SIZE*sizeof(int));
summation = (int *)malloc(SIZE*sizeof(int));
cudaMalloc((void**)&d_a, SIZE * sizeof(int));
cudaMalloc((void**)&d_b, SIZE * sizeof(int));
cudaMalloc((void**)&d_c, SIZE * sizeof(int));
for (int i = 1; i<SIZE; i++)
{
a[i] = i;
b[i] = 0;
}
cudaMemcpy(d_a, a, SIZE*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b, SIZE*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_c, c, SIZE*sizeof(int), cudaMemcpyHostToDevice);
/*blocks = SIZE / block_size;
if (SIZE% block_size != 0)
blocks++; */
dim3 blocksize(256); // create 1D threadblock
dim3 gridsize(N / blocksize.x); //create 1D grid
vectoreAdd << < gridsize, blocksize >> >(d_a, d_b, d_c);
//cudaThreadSynchronize();
cudaMemcpy(b, d_b, SIZE*sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(summation, d_c, SIZE*sizeof(int), cudaMemcpyDeviceToHost);
for (int m = 0; m < SIZE; m++)
{
if (b[m] != 0)
{
printf("\n prime no is:%d", b[m]);
count = count + 1;
}
}
printf("\n\n Total prime no. are: %d", count);
/* for (int j = 1; j<SIZE; j++)
{
sum = sum + b[j];
}*/
printf("\n \nsum of all prime no upto %d is:%d", SIZE, summation);
clock_t toc = clock();
printf("\n\nElapsed: %f seconds\n", (double)(toc - tic) / CLOCKS_PER_SEC);
free(a); free(b); free(summation);
cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
getchar(); return 0;
}
There are lots of mistakes in your code :
cudaMalloc((void**)&d_a, SIZE * sizeof(int));
should be :
cudaMalloc((void**)&d_a, N * sizeof(int)); //OR
cudaMalloc((void**)&d_a, size);
as you already calculated but didnt passed it. same in case of malloc() //Host code
Closed. This question needs debugging details. It is not currently accepting answers.
Edit the question to include desired behavior, a specific problem or error, and the shortest code necessary to reproduce the problem. This will help others answer the question.
Closed 8 years ago.
Improve this question
I am trying to simply increment a few matrix values in parallel in CUDA and trying to copy them back to main memory. However when I print them out once the thread function returns, the values are the same. I have even tried running the program with just 1 thread, but have had no luck. Any help would be greatly appreciated.
My code:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>
#include <cuda.h>
#define BLOCK_SIZE 1024
#define MAX_N 100000000
#define MAX_THREADS 1024
int num_threads;
int count; // Count of threads that have updated their partition
int size;
//int increment; // VS
int * inc2;
//int my_start;
//Host data
int * thread_ids;
//nvcc -arch=sm_20 -o nbody.exe nbody.cu (compilation)
__global__ void pcyc_red(float * a, float * b, float * c, float * D, float * X,
float * a2, float * b2, float * c2, float * D2,
int * inc2_dev, int * size_dev, int * num_threads_dev){
//__threadfence();
int thread_id = threadIdx.x + (blockIdx.x * blockDim.x);
float k1;
float k2;
int i;
int start = 0;
//int end = size_dev-1;
//int inc2_dev = inc2_dev1[0];
//int inc_dev = *inc_dev1;
//int size_dev = size_dev1[0];
int nthreads = num_threads_dev[0];
//Thread work assignment
int chunk_size = size_dev[0]/nthreads;
int my_start = thread_id*(chunk_size);
int my_end = start + ((thread_id + 1)*chunk_size - 1);
//__threadfence();
__syncthreads();
//Forward Reduction
for(i = my_start; i <= my_end; ++i){
a[i] = a[i]++;
b[i] = b[i]++;
c[i] = c[i]++;
D[i] = D[i]++;
X[i] = X[i]++;
}
__threadfence();
//__syncthreads();
}//Device Function
float* init_vector(int size){
float* output;
output = (float*) calloc(size, sizeof(float));
int i;
for(i = 0; i < size; ++i){
output[i] = 2.0;
}
return output;
}
float* init_vector_ac(int s){
//s will be used for size-1 not to be confused for size.
float* output;
output = (float*) calloc(s, sizeof(float));
int i;
for(i = 0; i < s; ++i){
output[i] = -1.0;
}
return output;
}
// Main program
int main(int argc, char *argv[]) {
//num_threads -> atoi(argv[argc-1]);
//struct timeval start, stop;
float total_time;
int i;
///Host structures
float* a;
float* b;
float* c;
float* D;
float* X;
//increment = 2; // VS
inc2 = (int*) malloc(sizeof(int));
inc2[0] = 1;
//size = (int*) malloc(sizeof(int));
//num_threads = (int*) malloc(sizeof(int));
//my_start = 0;
//wait_flag = false;
///Device Data
//SYSTEM * sys_dev;
float * a_dev;
float * b_dev;
float * c_dev;
float * D_dev;
float * X_dev;
float * a2_dev;
float * b2_dev;
float * c2_dev;
float * D2_dev;
//float * X2_dev;
//int * inc_dev;
int * inc2_dev;
//int * mstart_dev;
int * size_dev;
int * num_threads_dev;
int result_var;
//int final_inc2;
cudaEvent_t start, stop; // GPU timing variables
//struct timeval cpu_start, cpu_stop; // CPU timing variables
// float time_array[10];
// Timing initializations
cudaEventCreate(&start);
cudaEventCreate(&stop);
if (argc != 3)
{
printf("Use: <executable_name> <size> <num_threads>\n");
exit(0);
}
if ((size = atoi(argv[argc-2])) > MAX_N)
{
printf("Maximum number of nodes allowed: %d\n", MAX_N);
exit(0);
};
if ((num_threads = atoi(argv[argc-1])) > MAX_THREADS)
{
printf("Maximum number of threads allowed: %d.\n", MAX_THREADS);
exit(0);
};
int size_array = (size) * sizeof(float);
int size_array2 = (size - 1) * sizeof(float);
// Initialize host tridiagonal matrix
a = init_vector_ac(size-1); // a[i] = -1.0
b = init_vector(size); // b[i] = 2.0
c = init_vector_ac(size-1); // c[i] = -1.0
D = init_vector(size); // D[i] = 2.0
X = init_vector(size); // X[i] = 2.0
//xs = init_vector_err(size);
// Shift elements of a by 1
for(i = size-1; i > 0; i--) a[i] = a[i-1];
a[0] = 0.0;
thread_ids = (int*) calloc(num_threads, sizeof(int));
count = 0;
for(i = 0; i < num_threads; ++i){
thread_ids[i] = i;
}
//Cuda Operation
cudaEventRecord( start, 0);
cudaMalloc((void **) &a_dev, size);
cudaMalloc((void **) &b_dev, size);
cudaMalloc((void **) &c_dev, size);
cudaMalloc((void **) &D_dev, size);
cudaMalloc((void **) &X_dev, size);
cudaMalloc((void **) &a2_dev, size);
cudaMalloc((void **) &b2_dev, size);
cudaMalloc((void **) &c2_dev, size);
cudaMalloc((void **) &D2_dev, size);
//cudaMalloc((void**)&inc_dev, sizeof(int));
cudaMalloc((void**)&inc2_dev, sizeof(int));
//cudaMalloc((void**)&mstart_dev, sizeof(int));
cudaMalloc((void**)&size_dev, sizeof(int));
cudaMalloc((void**)&num_threads_dev, sizeof(int));
cudaMemcpy(a_dev, a, size_array2, cudaMemcpyHostToDevice);
cudaMemcpy(b_dev, b, size_array, cudaMemcpyHostToDevice);
cudaMemcpy(c_dev, c, size_array2, cudaMemcpyHostToDevice);
cudaMemcpy(D_dev, D, size_array, cudaMemcpyHostToDevice);
cudaMemcpy(X_dev, X, size_array, cudaMemcpyHostToDevice);
cudaMemcpy(a2_dev, a, size_array2, cudaMemcpyHostToDevice);
cudaMemcpy(b2_dev, b, size_array, cudaMemcpyHostToDevice);
cudaMemcpy(c2_dev, c, size_array2, cudaMemcpyHostToDevice);
cudaMemcpy(D2_dev, D, size_array, cudaMemcpyHostToDevice);
//cudaMemcpy(inc_dev, &increment, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(inc2_dev, inc2, sizeof(int), cudaMemcpyHostToDevice);
//cudaMemcpy(mstart_dev, &my_start, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(size_dev, &size, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(num_threads_dev, &num_threads, sizeof(int), cudaMemcpyHostToDevice);
cudaDeviceSynchronize();
pcyc_red<<<1, num_threads>>>(a_dev, b_dev, c_dev, D_dev, X_dev,
a2_dev, b2_dev, c2_dev, D2_dev,
inc2_dev, size_dev, num_threads_dev);
cudaDeviceSynchronize();
cudaMemcpy(X, X_dev, size_array, cudaMemcpyDeviceToHost);
cudaMemcpy(a, a_dev, size_array, cudaMemcpyDeviceToHost);
cudaMemcpy(b, b_dev, size_array, cudaMemcpyDeviceToHost);
cudaMemcpy(c, c_dev, size_array, cudaMemcpyDeviceToHost);
cudaMemcpy(D, D_dev, size_array, cudaMemcpyDeviceToHost);
cudaMemcpy(inc2, inc2_dev, sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(&result_var, num_threads_dev, sizeof(int), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&total_time, start, stop);
printf("Final Var: %d\n\n", inc2[0]);
printf("Num Threads Var: %d\n\n", result_var);
for(i = 0; i < size; ++i){
printf("a=%8.4f \n", a[i]);
printf("b=%8.4f \n", b[i]);
printf("c=%8.4f \n", c[i]);
printf("D=%8.4f \n", D[i]);
printf("X=%8.4f \n", X[i]);
}
printf("Threads = %d, matrix_size = %d, time = %f\n",
num_threads, size, total_time);
cudaFree(a_dev);
cudaFree(b_dev);
cudaFree(c_dev);
cudaFree(D_dev);
cudaFree(X_dev);
//cudaFree(inc_dev);
cudaFree(inc2_dev);
//cudaFree(mstart_dev);
//cudaFree(size_dev);
//cudaFree(num_threads_dev);
}//end of main
Add proper cuda error checking to your code.
One problem I can see is that your allocation sizes are not matched to your array sizes. To pick just one example:
int size_array = (size) * sizeof(float);
...
cudaMalloc((void **) &b_dev, size); // size should probably be size_array here
... ^^^^
cudaMemcpy(b_dev, b, size_array, cudaMemcpyHostToDevice); // this won't work, will throw error
^^^^^^^^^^
The above is certainly an error, and there are several of that type in your code. You may also have a machine configuration issue (CUDA not properly installed, etc.) which the error checking will also indicate.
Quoting the "Kepler Tuning Guide" provided by NVIDIA:
Also note that Kepler GPUs can utilize ILP in place of
thread/warp-level parallelism (TLP) more readily than Fermi GPUs can.
In my opinion, the following code snippet
a = .....;
a2 = f(a);
a3 = g(a2);
can be improved as follows
a = ...;
b = ....;
a2 = f(a);
b2 = f(b);
a3 = g(a2);
b3 = g(b2);
So in my projects, I have a section of code as follows (example 1)
if(x < src.cols && y < src.rows)
{
if(!mask(y,x))
{
src.ptr(y)[x] = make_short4(0,0,0,0);
}
}
and I rewrite it as follows (example2)
if(x < src.cols && y < src.rows)
{
if(!mask(y,x))
{
short4 t;
t.x = 0;
t.y = 0;
t.z = 0;
t.w = 0;
src.ptr(y)[x].x = t.x;
src.ptr(y)[x].y = t.y;
src.ptr(y)[x].z = t.z;
src.ptr(y)[x].w = t.w;
}
}
In the Kepler architecture, the example2 will be more efficient and exhibit better performance than example1, is that right?
A good explanation on Instruction Level Parallelism (ILP) can be found at CUDA Performance: Maximizing Instruction-Level Parallelism.
It has been pointed out by Robert Crovella and talonmies, and it has been recognized by yourself, that your example above does not reach ILP.
Concerning how implementing ILP, I'm showing below the classical example, translated from the PyCUDA code at numbapro-examples, which I have tested for a Fermi and for a Kepler GPU. Please, notice that for the latter case I have not observed relevant speedups.
THE CODE
#include <stdio.h>
#include <time.h>
#define BLOCKSIZE 64
/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b){
return ((a % b) != 0) ? (a / b + 1) : (a / b);
}
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/************************************/
/* NO INSTRUCTION LEVEL PARALLELISM */
/************************************/
__global__ void ILP0(float* d_a, float* d_b, float* d_c) {
int i = threadIdx.x + blockIdx.x * blockDim.x;
d_c[i] = d_a[i] + d_b[i];
}
/************************************/
/* INSTRUCTION LEVEL PARALLELISM X2 */
/************************************/
__global__ void ILP2(float* d_a, float* d_b, float* d_c) {
// --- Loading the data
int i = threadIdx.x + blockIdx.x * blockDim.x;
float ai = d_a[i];
float bi = d_b[i];
int stride = gridDim.x * blockDim.x;
int j = i + stride;
float aj = d_a[j];
float bj = d_b[j];
// --- Computing
float ci = ai + bi;
float cj = aj + bj;
// --- Writing the data
d_c[i] = ci;
d_c[j] = cj;
}
/************************************/
/* INSTRUCTION LEVEL PARALLELISM X4 */
/************************************/
__global__ void ILP4(float* d_a, float* d_b, float* d_c) {
// --- Loading the data
int i = threadIdx.x + blockIdx.x * blockDim.x;
float ai = d_a[i];
float bi = d_b[i];
int stride = gridDim.x * blockDim.x;
int j = i + stride;
float aj = d_a[j];
float bj = d_b[j];
int k = j + stride;
float ak = d_a[k];
float bk = d_b[k];
int l = k + stride;
float al = d_a[l];
float bl = d_b[l];
// --- Computing
float ci = ai + bi;
float cj = aj + bj;
float ck = ak + bk;
float cl = al + bl;
// --- Writing the data
d_c[i] = ci;
d_c[j] = cj;
d_c[k] = ck;
d_c[l] = cl;
}
/************************************/
/* INSTRUCTION LEVEL PARALLELISM X8 */
/************************************/
__global__ void ILP8(float* d_a, float* d_b, float* d_c) {
// --- Loading the data
int i = threadIdx.x + blockIdx.x * blockDim.x;
float ai = d_a[i];
float bi = d_b[i];
int stride = gridDim.x * blockDim.x;
int j = i + stride;
float aj = d_a[j];
float bj = d_b[j];
int k = j + stride;
float ak = d_a[k];
float bk = d_b[k];
int l = k + stride;
float al = d_a[l];
float bl = d_b[l];
int m = l + stride;
float am = d_a[m];
float bm = d_b[m];
int n = m + stride;
float an = d_a[n];
float bn = d_b[n];
int p = n + stride;
float ap = d_a[p];
float bp = d_b[p];
int q = p + stride;
float aq = d_a[q];
float bq = d_b[q];
// --- Computing
float ci = ai + bi;
float cj = aj + bj;
float ck = ak + bk;
float cl = al + bl;
float cm = am + bm;
float cn = an + bn;
float cp = ap + bp;
float cq = aq + bq;
// --- Writing the data
d_c[i] = ci;
d_c[j] = cj;
d_c[k] = ck;
d_c[l] = cl;
d_c[m] = cm;
d_c[n] = cn;
d_c[p] = cp;
d_c[q] = cq;
}
/********/
/* MAIN */
/********/
void main() {
float timing;
cudaEvent_t start, stop;
const int N = 65536*4; // --- ASSUMPTION: N can be divided by BLOCKSIZE
float* a = (float*)malloc(N*sizeof(float));
float* b = (float*)malloc(N*sizeof(float));
float* c = (float*)malloc(N*sizeof(float));
float* c_ref = (float*)malloc(N*sizeof(float));
srand(time(NULL));
for (int i=0; i<N; i++) {
a[i] = rand() / RAND_MAX;
b[i] = rand() / RAND_MAX;
c_ref[i] = a[i] + b[i];
}
float* d_a; gpuErrchk(cudaMalloc((void**)&d_a,N*sizeof(float)));
float* d_b; gpuErrchk(cudaMalloc((void**)&d_b,N*sizeof(float)));
float* d_c0; gpuErrchk(cudaMalloc((void**)&d_c0,N*sizeof(float)));
float* d_c2; gpuErrchk(cudaMalloc((void**)&d_c2,N*sizeof(float)));
float* d_c4; gpuErrchk(cudaMalloc((void**)&d_c4,N*sizeof(float)));
float* d_c8; gpuErrchk(cudaMalloc((void**)&d_c8,N*sizeof(float)));
gpuErrchk(cudaMemcpy(d_a, a, N*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_b, b, N*sizeof(float), cudaMemcpyHostToDevice));
/******************/
/* ILP0 TEST CASE */
/******************/
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
ILP0<<<iDivUp(N,BLOCKSIZE),BLOCKSIZE>>>(d_a, d_b, d_c0);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&timing, start, stop);
printf("Elapsed time - ILP0: %3.3f ms \n", timing);
gpuErrchk(cudaMemcpy(c, d_c0, N*sizeof(float), cudaMemcpyDeviceToHost));
// --- Checking the results
for (int i=0; i<N; i++)
if (c[i] != c_ref[i]) {
printf("Error!\n");
return;
}
printf("Test passed!\n");
/******************/
/* ILP2 TEST CASE */
/******************/
cudaEventRecord(start, 0);
ILP2<<<(N/2)/BLOCKSIZE,BLOCKSIZE>>>(d_a, d_b, d_c2);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&timing, start, stop);
printf("Elapsed time - ILP2: %3.3f ms \n", timing);
gpuErrchk(cudaMemcpy(c, d_c2, N*sizeof(float), cudaMemcpyDeviceToHost));
// --- Checking the results
for (int i=0; i<N; i++)
if (c[i] != c_ref[i]) {
printf("Error!\n");
return;
}
printf("Test passed!\n");
/******************/
/* ILP4 TEST CASE */
/******************/
cudaEventRecord(start, 0);
ILP4<<<(N/4)/BLOCKSIZE,BLOCKSIZE>>>(d_a, d_b, d_c4);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&timing, start, stop);
printf("Elapsed time - ILP4: %3.3f ms \n", timing);
gpuErrchk(cudaMemcpy(c, d_c4, N*sizeof(float), cudaMemcpyDeviceToHost));
// --- Checking the results
for (int i=0; i<N; i++)
if (c[i] != c_ref[i]) {
printf("Error!\n");
return;
}
printf("Test passed!\n");
/******************/
/* ILP8 TEST CASE */
/******************/
cudaEventRecord(start, 0);
ILP8<<<(N/8)/BLOCKSIZE,BLOCKSIZE>>>(d_a, d_b, d_c8);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&timing, start, stop);
printf("Elapsed time - ILP8: %3.3f ms \n", timing);
gpuErrchk(cudaMemcpy(c, d_c8, N*sizeof(float), cudaMemcpyDeviceToHost));
// --- Checking the results
for (int i=0; i<N; i++)
if (c[i] != c_ref[i]) {
printf("%f %f\n",c[i],c_ref[i]);
printf("Error!\n");
return;
}
printf("Test passed!\n");
}
PERFORMANCE
Card Kernel Time [ms] Speedup
GeForce GT540M ILP0 4.609 1
" ILP2 2.666 1.72
" ILP4 1.675 2.76
" ILP8 1.477 3.12
Kepler K20c ILP0 0.045
" ILP2 0.043
" ILP4 0.043
" ILP8 0.042
I have two programs. the only difference is that one uses constant memory to store input while the other uses global memory.I want to know why the global memory one is faster than the constant memory one? They both compute dot product btw 2 matrices
#include<cuda_runtime.h>
#include<cuda.h>
#include<stdio.h>
#include<stdlib.h>
#define intMin(a,b) ((a<b)?a:b)
//Threads per block
#define TPB 128
//blocks per grid
#define BPG intMin(128, ((n+TPB-1)/TPB))
const int n = 4;
__constant__ float deva[n],devb[n];
__global__ void addVal( float *c){
int tid = blockIdx.x * blockDim.x + threadIdx.x;
//Using shared memory to temporary store results
__shared__ float cache[TPB];
float temp = 0;
while(tid < n){
temp += deva[tid] * devb[tid];
tid += gridDim.x * blockDim.x;
}
cache[threadIdx.x] = temp;
__syncthreads();
int i = blockDim.x/2;
while( i !=0){
if(threadIdx.x < i){
cache[threadIdx.x] = cache[threadIdx.x] +cache[threadIdx.x + i] ;
}
__syncthreads();
i = i/2;
}
if(threadIdx.x == 1){
c[blockIdx.x ] = cache[0];
}
}
int main(){
float a[n] , b[n] , c[BPG];
//float *deva, *devb, *devc;
float *devc;
int i;
//Filling with random values to test
for( i =0; i< n; i++){
a[i] = i;
b[i] = i*2;
}
//cudaMalloc((void**)&deva, n * sizeof(float));
//cudaMalloc((void**)&devb, n * sizeof(float));
cudaMalloc((void**)&devc, BPG * sizeof(float));
//cudaMemcpy(deva, a, n *sizeof(float), cudaMemcpyHostToDevice);
//cudaMemcpy(devb, b, n *sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(deva, a, n * sizeof(float));
cudaMemcpyToSymbol(devb, b, n * sizeof(float));
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
//Call function to do dot product
addVal<<<BPG, TPB>>>( devc);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float time;
cudaEventElapsedTime(&time,start, stop);
printf("The elapsed time is: %f\n", time);
//copy result back
cudaMemcpy(c, devc, BPG * sizeof(float), cudaMemcpyDeviceToHost);
float sum =0 ;
for ( i = 0 ; i< BPG; i++){
sum+=c[i];
}
//display answer
printf("%f\n",sum);
getchar();
return 0;
}
Below is the global memory version.
#include<cuda_runtime.h>
#include<cuda.h>
#include<stdio.h>
#include<stdlib.h>
#define intMin(a,b) ((a<b)?a:b)
//Threads per block
#define TPB 128
//blocks per grid
#define BPG intMin(128, ((n+TPB-1)/TPB))
const int n = 4;
__global__ void addVal(float *a, float *b, float *c){
int tid = blockIdx.x * blockDim.x + threadIdx.x;
//Using shared memory to temporary store results
__shared__ float cache[TPB];
float temp = 0;
while(tid < n){
temp += a[tid] * b[tid];
tid += gridDim.x * blockDim.x;
}
cache[threadIdx.x] = temp;
__syncthreads();
int i = blockDim.x/2;
while( i !=0){
if(threadIdx.x < i){
cache[threadIdx.x] = cache[threadIdx.x] +cache[threadIdx.x + i] ;
}
__syncthreads();
i = i/2;
}
if(threadIdx.x == 1){
c[blockIdx.x ] = cache[0];
}
}
int main(){
float a[n] , b[n] , c[BPG];
float *deva, *devb, *devc;
int i;
//Filling with random values to test
for( i =0; i< n; i++){
a[i] = i;
b[i] = i*2;
}
printf("Not using constant memory\n");
cudaMalloc((void**)&deva, n * sizeof(float));
cudaMalloc((void**)&devb, n * sizeof(float));
cudaMalloc((void**)&devc, BPG * sizeof(float));
cudaMemcpy(deva, a, n *sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(devb, b, n *sizeof(float), cudaMemcpyHostToDevice);
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
//Call function to do dot product
addVal<<<BPG, TPB>>>(deva, devb, devc);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float time;
cudaEventElapsedTime(&time,start, stop);
printf("The elapsed time is: %f\n", time);
//copy result back
cudaMemcpy(c, devc, BPG * sizeof(float), cudaMemcpyDeviceToHost);
float sum =0 ;
for ( i = 0 ; i< BPG; i++){
sum+=c[i];
}
//display answer
printf("%f\n",sum);
getchar();
return 0;
}
You are not getting advantage of the constant memory.
A single read from constant memory can be broadcast to a half-warp (not your case as every thread load from its own tid).
Constant memory is cached (not used in your case as you only read once from each position in the constant memory array).
As each thread in a half-warp does a single read to different data, the 16 different reads get serialized, taking 16 times the amount of time to place the request.
If they are reading from global memory, the request are done at the same time, coalesced. That's why your global memory example is better than the constant memory.
Of course, this conclusion can vary with devices of compute capability 2.x with a L1 and L2 cache.
Regards!