This CUDA kernel computes the inner product of two vectors. It uses multiple threads to calculate the element-wise products concurrently, and then uses a single thread to sum them into the inner product.
My question is: why would the result be correct without __syncthreads()?
__global__ void dot( int *a, int *b, int *c, int *dot ){
    int tid = threadIdx.x;
    int i;
    c[tid] = a[tid] * b[tid];
    //__syncthreads();
    //need synchronize??
    if(tid==0){
        for(i=0; i<N; i++){
            *dot += c[i];
        }
    }
}
In your for loop, thread #0 reads every element of array c, i.e. results produced by other threads. So the result can be wrong unless you guarantee that every write to c has completed before the loop starts, by calling __syncthreads();.
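For illustration, here is a minimal corrected sketch (assuming, as the original does, a single-block launch and a compile-time N; the local accumulator and the single final write are my additions):

__global__ void dot( int *a, int *b, int *c, int *dot ){
    int tid = threadIdx.x;
    c[tid] = a[tid] * b[tid];
    __syncthreads();           // all products are now visible to thread #0
    if(tid==0){
        int sum = 0;
        for(int i=0; i<N; i++)
            sum += c[i];       // accumulate in a register
        *dot = sum;            // write the result to global memory once
    }
}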
Hello everyone. I'm trying to use the grid-stride method and atomic functions to do a multi-block reduction.
I know that the usual way to do this is to launch two kernels or to use the last-block method as described in this note (or this tutorial).
However, I thought this could also be done by combining a grid-stride loop with an atomic add.
As far as I tested, it worked very well...
until, for some values, it gives the wrong answer (which is very weird).
I have tested some values of n and found that I get the wrong answer for n = 1234565, 1234566, 1234567.
This is my whole code, which computes the sum of n ones, so the answer should be n.
Any help or comment is appreciated.
#include <iostream>
#include <algorithm>  // for std::fill_n
#include <cstdio>     // for printf
__global__ void stride_sum(const double* input,
                           const int size,
                           double* sumOut){
    extern __shared__ double sm[];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockDim.x * blockIdx.x + tid;
    //doing grid loop using stride method.
    for(unsigned int s=i;
        s<size;
        s+=blockDim.x*gridDim.x){
        sm[tid] = input[i];
        __syncthreads();
        //doing parallel reduction.
        for(unsigned int ss = blockDim.x/2;ss>0;ss>>=1){
            if(tid<ss && tid+ss<size) sm[tid] += sm[tid+ss];
            __syncthreads();
        }
        //atomically add results to sumOut.
        if(tid==0) atomicAdd(sumOut, sm[0]);
    }
}
int main(){
    unsigned int n = 1234567;
    int blockSize = 4;
    int nBlocks = (n + blockSize - 1) / blockSize;
    int sharedMemory = sizeof(double)*blockSize;
    double *data, *sum;
    cudaMallocManaged(&data, sizeof(double)*n);
    cudaMallocManaged(&sum, sizeof(double));
    std::fill_n(data,n,1.);
    std::fill_n(sum,1,0.);
    stride_sum<<<nBlocks, blockSize, sharedMemory>>>(data,n,sum);
    cudaDeviceSynchronize();
    printf("res: %10.f \n",sum[0]);
    cudaFree(data);
    cudaFree(sum);
    return 0;
}
You have gotten quite a lot wrong in your implementation. This will work:
__global__ void stride_sum(const double* input,
                           const int size,
                           double* sumOut)
{
    extern __shared__ volatile double sm[];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockDim.x * blockIdx.x + tid;
    //doing grid loop using stride method.
    double val = 0.;
    for(unsigned int s=i; s<size; s+=blockDim.x*gridDim.x){
        val += input[s];   // index with the loop variable s, not i
    }
    // Load partial sum to shared memory
    sm[tid] = val;
    __syncthreads();
    //doing parallel reduction.
    for(unsigned int ss = blockDim.x/2;ss>0;ss>>=1){
        if(tid<ss && tid+ss<size) sm[tid] += sm[tid+ss];
        __syncthreads();
    }
    //atomically add results to sumOut.
    if(tid==0) atomicAdd(sumOut, sm[0]);
}
[Never compiled or run, use at your own risk]
In short -- do the grid-strided summation first, then a single shared-memory reduction, then a single atomic update. Your implementation has undefined behaviour in a few places, especially the conditionally executed __syncthreads() calls and the use of uninitialized shared memory when some threads fall out of the summation loop.
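As an aside, a grid-stride kernel does not need one thread per element; the host can cap the grid instead. A hedged sketch of the launch (the cap of 1024 blocks is an illustrative choice, not a tuned value):

int blockSize = 256;
int nBlocks = (int)((n + blockSize - 1) / blockSize);
if (nBlocks > 1024) nBlocks = 1024;   // each thread then sums several elements via the stride loop
int sharedMemory = sizeof(double) * blockSize;
stride_sum<<<nBlocks, blockSize, sharedMemory>>>(data, n, sum);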
I tried a summation in CUDA, and I can't find what I did wrong here.
The sum always comes back as 0.
The __shared__ qualifier makes a variable common to all threads in a block.
So I tried to sum one block at a time, and finally sum up the per-block results for the overall sum.
But the sum doesn't work per block, and I am stuck.
Can anyone help?
#include <stdio.h>
#include <iostream>
#include <cuda.h>
#include <stdlib.h>
//#define BLOCK_SIZE 32 // size of vectors
__global__ void add( float * i_data, float * sum){
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    __shared__ float s_data;
    s_data = 0;
    // must be synchronized
    __syncthreads();
    // reduce and sum
    // typical in GPU computings
    for (int i = 0; i<blockDim.x; i++)
    {
        __syncthreads();
        if (tid <= i)
        {
            //s_data[blockIdx.x]+ = s_data[tid] + s_data[i+tid];
            s_data+= i_data[tid];
        }
    }
if (tid == 0)
sum[blockIdx.x]=s_data;
}
int main() {
    int T = 10, B = 5; // threads per block and blocks per grid
    float *a,*b; // host pointers
    float *dev_a, *dev_b; // device pointers to host memory
    int sizeIN = T*B*sizeof(float);
    int sizeOUT = B*sizeof(float);
    a= new float[T*B];
    b= new float[B];
    for(int i = 0;i<B;i++)
    {
        for (int j=0;j<T;j++)
        {
            a[i*T+j]=i;
        }
    }
    for(int i = 0;i<B;i++)
    {
        b[i]=0;
    }
    cudaMalloc((void **) &dev_a, sizeIN);
    cudaMalloc((void **) &dev_b, sizeOUT);
    cudaMemcpy(dev_a, a, sizeIN, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, sizeOUT, cudaMemcpyHostToDevice);
    add<<< B, T >>> (dev_a, dev_b);
    cudaMemcpy(a,dev_a, sizeIN, cudaMemcpyDeviceToHost);
    cudaMemcpy(b,dev_b, sizeOUT, cudaMemcpyDeviceToHost);
    for(int i = 0;i<B;i++)
    {
        for (int j=0;j<T;j++)
        {
            std::cout<< a[i*T+j]<<"\t";
        }
        std::cout<<std::endl;
        std::cout<<std::endl<<std::endl<<"sum is: "<<b[i]<<std::endl;
    }
    std::cout<<std::endl<<std::endl;
    cudaFree(dev_a);
    cudaFree(dev_b);
    delete[] a;  // allocated with new[], so use delete[] rather than free
    delete[] b;
    return 0;
}
This is wrong in 2 ways:
if (tid = 0)
First, you should be doing a comparison == not an assignment =. I don't know why your compiler didn't warn you about this.
Second, tid is only zero for one thread in the entire grid:
int tid = blockIdx.x * blockDim.x + threadIdx.x;
You want one thread in each block to write the block result out to global memory:
if (threadIdx.x == 0)
This is also a problem, for a similar reason:
if (tid <= i)
This condition is only satisfied for threads in the first block. Beyond that, I have to start to guess at what you want. I guess you're trying to sum the values in each block. Your construction is not a parallel reduction, but to make the minimum changes to get it "functional" I would rewrite the end of your kernel like this:
    // reduce and sum
    // typical in GPU computings
    for (int i = 0; i<blockDim.x; i++)
    {
        if (threadIdx.x == i)
        {
            //s_data[blockIdx.x]+ = s_data[tid] + s_data[i+tid];
            s_data+= i_data[tid];
        }
        __syncthreads();
    }
    if (threadIdx.x == 0)
        sum[blockIdx.x]=s_data;
}
Although you didn't have any CUDA API errors here, it's good practice to use proper CUDA error checking, and also to run your code with cuda-memcheck, any time you are having trouble with a CUDA code.
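For reference, a minimal error-checking pattern might look like this (a sketch; the macro name is my own):

#include <cstdio>
#include <cstdlib>
#define cudaCheck(call)                                              \
    do {                                                             \
        cudaError_t err = (call);                                    \
        if (err != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error '%s' at %s:%d\n",            \
                    cudaGetErrorString(err), __FILE__, __LINE__);    \
            exit(EXIT_FAILURE);                                      \
        }                                                            \
    } while (0)
// usage:
// cudaCheck(cudaMemcpy(dev_a, a, sizeIN, cudaMemcpyHostToDevice));
// add<<<B, T>>>(dev_a, dev_b);
// cudaCheck(cudaGetLastError());      // catches launch errors
// cudaCheck(cudaDeviceSynchronize()); // catches asynchronous execution errors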
I mentioned that your code above is not a classical reduction; it's just a suboptimal for-loop.
To learn about CUDA parallel reductions, study the reduction sample code and its accompanying presentation in the CUDA samples; there are also many examples under the CUDA tag here on SO that you can search for.
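To give a flavour of what those materials cover, a minimal sketch of the classic tree reduction within one block (assuming blockDim.x is a power of two and the input is fully covered by the grid):

__global__ void block_reduce(const float *in, float *out)
{
    // launch with blockDim.x*sizeof(float) dynamic shared memory
    extern __shared__ float s[];
    unsigned int tid = threadIdx.x;
    s[tid] = in[blockIdx.x * blockDim.x + tid];  // one element per thread
    __syncthreads();
    // halve the number of active threads each step
    for (unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
        if (tid < stride)
            s[tid] += s[tid + stride];
        __syncthreads();
    }
    if (tid == 0)
        out[blockIdx.x] = s[0];  // one partial sum per block
}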
The classical reduction algorithm on the GPU works perfectly if the size of the vector is a power of 2. What if that is not the case? At some point we will have to sum an odd number of elements. What is the best way to deal with that?
You can compute the sum of an array whose size is not a power of two. Look at this example:
#include <math.h>
#define N 1022 //total size
__global__ void sum(int *A, int *C)
{
    // dynamic shared memory: pass blockDim.x*sizeof(int) as the third launch parameter
    // (a fixed-size array indexed by blockDim.x would not compile)
    extern __shared__ int temp[];
    int idx = threadIdx.x+blockDim.x*blockIdx.x;
    int local_idx = threadIdx.x;
    temp[local_idx] = A[idx];
    int i=ceil(blockDim.x/2);
    __syncthreads();
    while(i!=0)
    {
        if(idx+i<N && local_idx<i)
            temp[local_idx] += temp[local_idx+i];
        i/=2;
        __syncthreads();
    }
    if(local_idx == 0)
        C[blockIdx.x] = temp[0];
}
Set
int i = 1024;
instead of
int i=ceil(blockDim.x/2);
i.e. the starting i should be a power of two (2^k) greater than N; otherwise, repeatedly halving i will skip elements.
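A small host-side helper for picking that starting value might look like this (a sketch; the function name is my own):

// Round x up to the next power of two, e.g. nextPow2(1022) == 1024.
unsigned int nextPow2(unsigned int x)
{
    unsigned int p = 1;
    while (p < x) p <<= 1;
    return p;
}
// The result (or half of it) can then be used as the starting i.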
I was wondering whether it is possible, and what the best way would be, to read cells from an array with threads in CUDA. To simplify what I mean, here is an example:
I have an array {1,2,3,4,5,6,...} and I would like each thread to read n cells of my array, depending mainly on its size.
I have been trying a few things, but it doesn't seem to work, so if anyone could point out a (right) way to do it, that would be great.
Thank you.
Generally you want contiguous threads to read contiguous array indices. Doing so results in "coalesced" memory transactions. The simple way to think of it is that if 32 threads are running physically in parallel, and they all do a load, and all 32 loads fall into the same cache line, then a single memory access can be performed to fill the cache line, rather than 32 separate ones.
So what you want to do is have each thread access n cells that are strided by the total number of threads, like this (assuming the input data is in the float array data):
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = idx; i < numElements; i += stride) {
    float element = data[i];
    process(element);
}
If your algorithm requires that each thread reads n contiguous data elements, then you are going to incur non-coalesced loads, which will be much more expensive. In this case, I would consider re-designing the algorithm so this type of access is not required.
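For contrast, a sketch of the contiguous-per-thread pattern that the last paragraph warns about (idx, n, data and process are the same placeholders as above):

// Each thread reads n contiguous cells; neighbouring threads touch
// addresses n elements apart, so the loads are not coalesced.
for (int k = 0; k < n; ++k) {
    float element = data[idx * n + k];
    process(element);
}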
You said you need:
"the threads have to look at the n next numbers"
So you can use something like this:
#define N 2
#define NTHREAD 1024
#define ARRAYSIZE N*NTHREAD
// develop the kernel as:
__global__ void accessArray(int *array){
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    int startId = tid*N;
    // access thread's stride
    for(int i=0; i<N; i++){
        array[startId+i]=tid;
    }
}
// call the kernel by:
accessArray<<<NTHREAD/256, 256>>>(d_array);
Dump out the array and check whether that is how you want your threads to work or not.
Full code:
#include <cuda.h>
#include <stdio.h>
#define N 2
#define NTHREAD 1024
#define ARRAYSIZE N*NTHREAD
// develop the kernel as:
__global__ void accessArray(int *array){
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    int startId = tid*N;
    // access thread's stride
    for(int i=0; i<N; i++){
        array[startId+i]=tid;
    }
}
int main()
{
    int h_array[ARRAYSIZE];
    int *d_array;
    size_t memsize= ARRAYSIZE * sizeof(int);
    for (int i=0; i< ARRAYSIZE; i++) {
        h_array[i] = 0;
    }
    cudaMalloc(&d_array, memsize);
    cudaMemcpy(d_array, h_array, memsize, cudaMemcpyHostToDevice);
    accessArray<<<NTHREAD/256, 256>>>(d_array);
    cudaMemcpy(h_array, d_array, memsize, cudaMemcpyDeviceToHost);
    for (int i=0; i<ARRAYSIZE; i++)
        printf("A[%d] => %d\n",i,h_array[i]);
    cudaFree(d_array);
    return 0;
}
The serial code snippet looks like this:
int i, j;
for(j=0; j<ny; j++)
{
    for(i=0; i<nx; i++)
    {
        x[i + j*nx] *= y[i];
    }
}
I converted this to CUDA using this kernel:
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int i,j;
for(tid = 0; tid <nx*ny; tid++)
{
    j = tid/nx;
    i = tid - j*nx;
    x[tid] *= y[i];
}
However, the GPU kernel does not give any speedup. Any suggestions for a better solution? Thanks in advance.
If this is the serial code:
int i, j;
for(j=0; j<ny; j++)
{
    for(i=0; i<nx; i++)
    {
        x[i + j*nx] *= y[i];
    }
}
then you should be doing this:
__global__ void fn(float *x, const float *y, int nx)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int j = tid/nx, i = tid - j * nx;
    x[tid] *= y[i];
}
fn<<<nx*ny/B, B>>>(x, y, nx); // with B = 256, 512, etc.; note y must be passed in, and nx*ny should be a multiple of B
What you're doing is fairly bizarre: you're instructing each thread of the CUDA kernel to iterate over all values of tid between 0 and nx*ny and compute the same function as your CPU version! Moreover, instead of just iterating over the indices, you're actually doing the loop less efficiently than the CPU version; in other words, each thread does the same thing, just less efficiently than one thread on the CPU. It's no wonder that this is slower; it should be much, much slower. Your CUDA kernel is:
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int i,j;
for(tid = 0; tid <nx*ny; tid++)
{
    j = tid/nx;
    i = tid - j*nx;
    x[tid] *= y[i];
}
This does nx*ny iterations, the same as your host code, for each thread; you lose all the benefit of the parallelism, since each thread is doing the same thing; you would get the same performance using one thread on the GPU, and the same result!
If this is the verbatim code from your CUDA source file, you need to change it and redo the comparison; if this is code you have written to help explain what your code is doing for a lay non-CUDA audience, then you need to present your actual CUDA code so that we can see what's going on... as it is, the performance analysis I have done - the trivial one - is all you can expect.
Given your comment to this answer:
the nx * ny = 2205; so I used no. of blocks =
(nx*ny+(threads-1))/threads and threads = 64.
you are evidently intending to launch one thread per computation, so the correct CUDA implementation would just be:
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int j = tid/nx;
int i = tid - j*nx;
if (tid < (nx*ny))
    x[tid] *= y[i];
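With the launch configuration from your comment, that would be invoked as something like this (a sketch; I assume the kernel is named fn and takes x, y, nx and ny so the guard can use nx*ny):

int threads = 64;
int blocks = (nx*ny + threads - 1) / threads;  // = 35 blocks for nx*ny = 2205
fn<<<blocks, threads>>>(x, y, nx, ny);         // the if-guard absorbs the excess threads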
If you were intending for each thread to compute more than one result per kernel launch, then you would size the grid to "fill" each of the SMs on the target GPU, not use the same number of threads as the input size, and then do something like:
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int gsize = blockDim.x * gridDim.x;
int i,j;
for(; tid <nx*ny; tid+=gsize)
{
    j = tid/nx;
    i = tid - j*nx;
    x[tid] *= y[i];
}
That would get you at least coalesced reads and writes to x, and remove the enormous number of redundant calculations in your posted version. There are a number of further optimizations that could be made, but they would require more information about the problem than has been supplied in the question and subsequent comments. Your indexing scheme contains an integer division and then an integer multiply-add per calculation. That is a lot of overhead for a single FLOP per input value. However, having said all of that, if the problem size I quoted is the actual problem size you are interested in, the GPU will never be faster than even a modest host CPU. You would require a problem many orders of magnitude larger to realize a useful speedup using the GPU for this sort of low arithmetic intensity operation.
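For instance, one way to remove the integer division entirely is a 2D launch; a sketch (not part of the original answer, kernel name is my own):

__global__ void fn2d(float *x, const float *y, int nx, int ny)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;  // column index
    int j = blockIdx.y * blockDim.y + threadIdx.y;  // row index
    if (i < nx && j < ny)
        x[i + j * nx] *= y[i];  // no division or modulo needed
}
// launched with, e.g.:
// dim3 block(32, 8);
// dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);
// fn2d<<<grid, block>>>(x, y, nx, ny);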
How big is the block? It may be that the time needed to copy a small amount of data to the GPU and set up the environment is much longer than the calculation time.
Remember also that CUDA does a JIT compile on the first run, so to get accurate benchmarking you need to run the kernel many times.
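A common way to time this is with CUDA events (a sketch; fn, grid and block stand for your kernel and launch configuration, and the warm-up run and iteration count are illustrative):

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
fn<<<grid, block>>>(x, y, nx);        // warm-up run, absorbs any JIT cost
cudaEventRecord(start);
for (int r = 0; r < 100; ++r)
    fn<<<grid, block>>>(x, y, nx);    // timed runs
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);
printf("average kernel time: %f ms\n", ms / 100);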
Try this, using shared memory. It is one of the best implementations around:
// Matrices are stored in row-major order:
// M(row, col) = *(M.elements + row * M.stride + col)
typedef struct {
    int width;
    int height;
    int stride; // In number of elements
    float *elements;
} Matrix;

// Thread block size
#define BLOCK_SIZE 16

// Get a matrix element
__device__ float GetElement(const Matrix A, int row, int col)
{
    return A.elements[row * A.stride + col];
}

// Set a matrix element
__device__ void SetElement(Matrix A, int row, int col, float value)
{
    A.elements[row * A.stride + col] = value;
}

// Get the BLOCK_SIZExBLOCK_SIZE sub-matrix Asub of A that is
// located col sub-matrices to the right and row sub-matrices down
// from the upper-left corner of A
__device__ Matrix GetSubMatrix(Matrix A, int row, int col)
{
    Matrix Asub;
    Asub.width = BLOCK_SIZE; Asub.height = BLOCK_SIZE;
    Asub.stride = A.stride;
    Asub.elements = &A.elements[A.stride * BLOCK_SIZE * row +
                                BLOCK_SIZE * col];
    return Asub;
}

// Forward declaration of the matrix multiplication kernel
__global__ void MatMulKernel(const Matrix, const Matrix, Matrix);

// Matrix multiplication - Host code
// Matrix dimensions are assumed to be multiples of BLOCK_SIZE
void MatMul(const Matrix A, const Matrix B, Matrix C)
{
    // Same as in the previous example, except for the following:
    // d_A.width = d_A.stride = A.width;
    // d_B.width = d_B.stride = B.width;
    // d_C.width = d_C.stride = C.width;
}

// Matrix multiplication kernel called by MatMul()
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
    // Block row and column
    int blockRow = blockIdx.y;
    int blockCol = blockIdx.x;
    // Each thread block computes one sub-matrix Csub of C
    Matrix Csub = GetSubMatrix(C, blockRow, blockCol);
    // Each thread computes one element of Csub
    // by accumulating results into Cvalue
    float Cvalue = 0;
    // Thread row and column within Csub
    int row = threadIdx.y;
    int col = threadIdx.x;
    // Loop over all the sub-matrices of A and B that are
    // required to compute Csub
    // Multiply each pair of sub-matrices together
    // and accumulate the results
    for (int m = 0; m < (A.width / BLOCK_SIZE); ++m)
    {
        // Get sub-matrix Asub of A and Bsub of B
        Matrix Asub = GetSubMatrix(A, blockRow, m);
        Matrix Bsub = GetSubMatrix(B, m, blockCol);
        // Shared memory used to store Asub and Bsub respectively
        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
        // Load Asub and Bsub from device memory to shared memory
        // Each thread loads one element of each sub-matrix
        As[row][col] = GetElement(Asub, row, col);
        Bs[row][col] = GetElement(Bsub, row, col);
        // Synchronize to make sure the sub-matrices are loaded
        // before starting the computation
        __syncthreads();
        // Multiply Asub and Bsub together
        for (int e = 0; e < BLOCK_SIZE; ++e)
            Cvalue += As[row][e] * Bs[e][col];
        // Synchronize to make sure that the preceding
        // computation is done before loading two new
        // sub-matrices of A and B in the next iteration
        __syncthreads();
    }
    // Write Csub to device memory
    // Each thread writes one element
    SetElement(Csub, row, col, Cvalue);
}
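The body of MatMul above is elided. Based on its comments (and the CUDA Programming Guide example it abbreviates), a hedged sketch of the host side might look like this; the details are my reconstruction, not the original text:

void MatMul(const Matrix A, const Matrix B, Matrix C)
{
    // Load A and B to device memory
    Matrix d_A;
    d_A.width = d_A.stride = A.width; d_A.height = A.height;
    size_t size = A.width * A.height * sizeof(float);
    cudaMalloc(&d_A.elements, size);
    cudaMemcpy(d_A.elements, A.elements, size, cudaMemcpyHostToDevice);
    Matrix d_B;
    d_B.width = d_B.stride = B.width; d_B.height = B.height;
    size = B.width * B.height * sizeof(float);
    cudaMalloc(&d_B.elements, size);
    cudaMemcpy(d_B.elements, B.elements, size, cudaMemcpyHostToDevice);
    // Allocate C in device memory
    Matrix d_C;
    d_C.width = d_C.stride = C.width; d_C.height = C.height;
    size = C.width * C.height * sizeof(float);
    cudaMalloc(&d_C.elements, size);
    // Invoke kernel (dimensions assumed to be multiples of BLOCK_SIZE)
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y);
    MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
    // Read C from device memory
    cudaMemcpy(C.elements, d_C.elements, size, cudaMemcpyDeviceToHost);
    // Free device memory
    cudaFree(d_A.elements);
    cudaFree(d_B.elements);
    cudaFree(d_C.elements);
}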