Hello I'm working in a CUDA kernel about matrix vector product. I want to improve the performance with tiling and shared memory.
The problem is that with this code the M Matrix or the N vector aren't loading right.
Do you have any idea about how to Load a tile from M and N into the shared memory arrays??
M is the matrix, N is the vector and P is the result of the matrix vector product
__global__ void matrixMul( float* P, float* M, float* N, int Mw, int Nw)
{
int bx = blockIdx.x; int by = blockIdx.y;
int tx = threadIdx.x; int ty = threadIdx.y;
__shared__ float Ms[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float Ns[BLOCK_SIZE];
// ===================================================================
// Code segment 1
// Determine the update values for the tile indices in the loop
// ===================================================================
int mBegin = Mw * BLOCK_SIZE * by;
int mEnd = mBegin + Mw - 1;
int mStep = BLOCK_SIZE;
int nBegin = BLOCK_SIZE * bx;
//int nStep = BLOCK_SIZE*Nw;
int nStep = 1;
float Psub = 0.0f;
// ===================================================================
// Code segment 2
// Do matrix-matrix multiplication inside a tile
// ===================================================================
for (int m = mBegin, n = nBegin; m <= mEnd; m += mStep, n += nStep) {
// Load a tile from M and N into the shared memory arrays
Ms[ty][tx] = M[bx*mStep*Mw+m];
Ns[ty] = N[by*nStep*Nw+n];
// Synchronize the threads
__syncthreads();
// Multiply the two tiles together, each thread accumulating
// the partial sum of a single dot product.
for (int i = 0; i < BLOCK_SIZE; i++) {
Psub += Ms[i][tx] * Ns[i];
}
// Synchronize again.
__syncthreads();
}
// ===================================================================
// Code segment 3
// Store the data back to global memory
// ===================================================================
int p = Nw * BLOCK_SIZE * by + BLOCK_SIZE * bx;
P[p + nStep] = Psub;
}
I found a similar example (dealing with square matrices of identical sizes, mind you) that also loads parts of the matrix into shared memory. It seems your declarations are right, and it probably just comes down to the algebra you are using to determine which elements go where.
__global__ void MatrixMulKernel(float* Md, float* Nd, float* Pd, int Width){
__shared__float Mds[TILE_WIDTH][TILE_WIDTH]; // Shared memory
__shared__float Nds[TILE_WIDTH][TILE_WIDTH]; // declarations
int bx = blockIdx.x; int by = blockIdx.y; // ID thread
int tx = threadIdx.x; int ty = threadIdx.y;
// Identify the row and column of the Pd element to work on
int Row = by * TILE_WIDTH + ty;
int Col = bx * TILE_WIDTH + tx;
float Pvalue = 0; // REGISTER!
// Loop over the Md and Nd tiles required to compute the Pd element
for (int m = 0; m < Width/TILE_WIDTH; ++m) {
// Collaborative loading of Md and Nd tiles into shared memory
Mds[ty][tx] = Md[Row*Width + (m*TILE_WIDTH + tx)];
Nds[ty][tx] = Nd[Col + (m*TILE_WIDTH + ty)*Width];
__syncthreads();
for (int k = 0; k < TILE_WIDTH; ++k)
Pvalue += Mds[ty][k] * Nds[k][tx];
__syncthreads();
}
Pd[Row*Width+Col] = Pvalue;
}
I have a basic question related to Two-dimensional thread access.
I want to copy the non-contiguous data into contiguous buffer and the use of cuda memcopy can be illustrated as:
void pack_cuda(float *dstbuf, IOV *srciov, int num_iov)
{
int i;
float *ptr;
ptr = buf;
for (i = 0; i < num_iov; i++) {
cudaMemcpy(ptr, srciov[i].bufaddr, srciov[i].len, cudaMemcpyDefault);
ptr = (char *)ptr + srciov[i].len;
}
}
*srciov stores the start memory address and length of each non-contiguous data in an array of structure.
*dstbuf will store the packed contiguous data after the completion of the function.
Now, I want to implement it using CUDA kernels.
__global__ void pack_cuda(float *dstbuf, IOV *srciov, int num_iov)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
int k;
extern __shared__ size_t tmpdbuflen[16*3]; //suppose num_iov is 16
if ( j == 0 ){
if ( i < 16 ){
tmpdbuflen[i] = (srciov[i].len); //store length to calculate presum
tmpdbuflen[i+16] = tmpdbuflen[i]; //store length
tmpdbuflen[i+32] = ((srciov+i)->bufaddr) - (srciov->bufaddr); //store addr difference
}
__syncthreads();
for ( k = 0; k < i; k++)
tmpdbuflen[i] += srciov[k].len;
}
__syncthreads();
if ( i < 16 && j < srciov[i].len ){ //wondering whether this is correct use
dst[tmpdbuflen[i] + j] = *(src + tmpdbuflen[i+32] + j);
}
__syncthreads();
}
Kernel invocation part:
dim3 dimblock(16, 16); //the length of each non-contiguous data is less than 16
dim3 dimgrid(1,1);
const unsigned int shm_size = sizeof(size_t) * 16 * 3;
pack_cuda<<<dimgrid, dimblock, shm_size, 0>>>(dstbuf, srciov, num_iov);
cudaDeviceSynchronize();
However, it seems that I cannot pack all needed datas into dst buffer.
Sometimes only j = 0 and 1 (with corresponding various i) get packed.
I think the major problem is the usage of shared memory. I only use column 0 threads (threadIdx.y == 0) to copy information onto the shared memory. Then all threads (no restriction on threadIdx.y) will access and read information in shared memory.
How to modify the code enable such design?
I'd appreciate it if anyone can figure out my problems.
Thanks.
Some hints on your code:
__global__ void pack_cuda(float *dstbuf, IOV *srciov, int num_iov)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
int k;
extern __shared__ size_t tmpdbuflen[16*3]; //suppose num_iov is 16
This Block here will only be executed by one thread, due to the guard j==0 which only allows thread bid*bdim+tid = 0*0+0, ergo thread 0 in block 0, which is undesirable for you. I would guess you want put j < 16 there
if ( j == 0 ){
if ( i < 16 ){
tmpdbuflen[i] = (srciov[i].len); //store length to calculate presum
tmpdbuflen[i+16] = tmpdbuflen[i]; //store length
tmpdbuflen[i+32] = ((srciov+i)->bufaddr) - (srciov->bufaddr); //store addr difference
}
__syncthreads();
for ( k = 0; k < i; k++)
tmpdbuflen[i] += srciov[k].len;
}
.
I've been trying to write a kernel in that calculates the sum of the inverse of the distance between N given points over N. A serial coda in C would be like
average = 0;
for(int i = 0; i < Np; i++){
for(int j = i + 1; j < Np; j++){
average += 1.0e0f/sqrtf((rx[i]-rx[j])*(rx[i]-rx[j]) + (ry[i]-ry[j])*(ry[i]-ry[j]));
}
}
average = average/(float)N;
Where rx and ry are the x and y coordinates, respectively.
I generate the points via a kernel that uses random number generator. For the kernel, I used 128(256) threads per block for 4k(8k) points. On it every thread performs the inner above inner loop, then the results are passed to a reduce sum function, as follows
Generate points:
__global__ void InitRNG ( curandState * state, const int seed ){
int tIdx = blockIdx.x*blockDim.x + threadIdx.x;
curand_init (seed, tIdx, 0, &state[tIdx]);
}
__global__
void SortPoints(float* X, float* Y,const int N, curandState *state){
float rdmn1, rdmn2;
unsigned int tIdx = blockIdx.x*blockDim.x + threadIdx.x;
float range;
if(tIdx < N){
rdmn1 = curand_uniform(&state[tIdx]);
rdmn2 = curand_uniform(&state[tIdx]);
range = sqrtf(0.25e0f*N*rdmn1);
X[tIdx] = range*cosf(2.0e0f*pi*rdmn2);
Y[tIdx] = range*sinf(2.0e0f*pi*rdmn2);
}
}
Reduction:
__device__
float ReduceSum2(float In){
__shared__ float data[BlockSize];
unsigned int tIdx = threadIdx.x;
data[tIdx] = In;
__syncthreads();
for(unsigned int i = blockDim.x/2; i > 0; i >>= 1){
if(tIdx < i){
data[tIdx] += data[tIdx + i];
}
__syncthreads();
}
return data[0];
}
Kernel:
__global__
void AvgDistance(float *X, float *Y, float *Avg, const int N){
int tIdx = blockIdx.x*blockDim.x + threadIdx.x;
int bIdx = blockIdx.x;
float x , y;
float d = 0.0f;
if(tIdx < N){
for(int i = tIdx + 1; i < N ; i++){
x = X[tIdx] - X[i];
y = Y[tIdx] - Y[i];
d += 1.0e0f/(sqrtf(x*x + y*y));
}
__syncthreads();
Avg[bIdx] = ReduceSum2(d);
}
}
The kernel is configured and launched as follows:
dim3 threads(BlockSize,BlockSize);
dim3 blocks(ceil(Np/threads.x),ceil(Np/threads.y));
InitRNG<<<blocks.x,threads.x>>>(d_state,seed);
SortPoints<<<blocks.x,threads.x>>>(d_rx,d_ry,Np,d_state);
AvgDistance<<<blocks.x,threads.x,threads.x*sizeof(float)>>>(d_rx,d_ry,d_Avg,Np);
Finally, I copy the data back to host and then perform the remaining sum:
Avg = new float[blocks.x];
CHECK(cudaMemcpy(Avg,d_Avg,blocks.x*sizeof(float),cudaMemcpyDeviceToHost),ERROR_CPY_DEVTOH);
float average = 0;
for(int i = 0; i < blocks.x; i++){
average += Avg[i];
}
average = average/(float)Np;
For 4k points, ok! the results are:
Average distance between points (via Kernel) = 108.615
Average distance between points (via CPU) = 110.191
In this case the sum may be performed in different order, causing both results to diverge from each other, I don't know...
But when it comes to 8k, the results are quiet different:
Average distance between points (via Kernel) = 153.63
Average distance between points (via CPU) = 131.471
To me it seems that both the kernel and the serial code are written the same way. What leads me to distrust the precision on CUDA calculation of floating point numbers. Does this make sense? Or are the access to global memory causing some conflicts when some threads load the same data from X and Y at the same time? Or the way I wrote the kernel is in some way 'wrong'(I mean, am I doing something that is causing both results to diverge from each other?).
Actually, from what I can tell, the problem seems to be on the CPU side. I created a sample code based on your code.
I was able to reproduce your results.
First I switched all instances of sinf, cosf, and sqrtf to their corresponding double versions. This made no difference in the results.
Next I included a typedef so I could easily switch the precision from float to double and back, replacing every relevant instance of float in the code with mytype which is my typedef.
When I run the code with typedef of float and a data size of 4096 I get these results:
GPU average = 108.294922
CPU average = 109.925285
When I run the code with typedef of double and a data size of 4096 I get these results:
GPU average = 108.294903
CPU average = 108.294903
When I run the code with typedef of float and a data size of 8192 I get these results:
GPU average = 153.447327
CPU average = 131.473526
When I run the code with typedef of double and a data size of 8192 I get these results:
GPU average = 153.447380
CPU average = 153.447380
There are at least 2 observations:
The GPU results don't vary between float and double, except in the 5th decimal place
The CPU results vary by 1-20% or so between float and double, but when double is selected, they line up exactly (to the 6th decimal place, anyway) with the GPU results.
Based on this, I believe the CPU is providing the variable, questionable behavior.
Here's my code for reference:
#include <stdio.h>
#include <curand.h>
#include <curand_kernel.h>
#define DSIZE 8192
#define BlockSize 32
#define pi 3.14159f
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
typedef double mytype;
__global__ void InitRNG ( curandState * state, const int seed ){
int tIdx = blockIdx.x*blockDim.x + threadIdx.x;
curand_init (seed, tIdx, 0, &state[tIdx]);
}
__global__
void SortPoints(mytype* X, mytype* Y,const int N, curandState *state){
mytype rdmn1, rdmn2;
unsigned int tIdx = blockIdx.x*blockDim.x + threadIdx.x;
mytype range;
if(tIdx < N){
rdmn1 = curand_uniform(&state[tIdx]);
rdmn2 = curand_uniform(&state[tIdx]);
range = sqrt(0.25e0f*N*rdmn1);
X[tIdx] = range*cos(2.0e0f*pi*rdmn2);
Y[tIdx] = range*sin(2.0e0f*pi*rdmn2);
}
}
__device__
mytype ReduceSum2(mytype In){
__shared__ mytype data[BlockSize];
unsigned int tIdx = threadIdx.x;
data[tIdx] = In;
__syncthreads();
for(unsigned int i = blockDim.x/2; i > 0; i >>= 1){
if(tIdx < i){
data[tIdx] += data[tIdx + i];
}
__syncthreads();
}
return data[0];
}
__global__
void AvgDistance(mytype *X, mytype *Y, mytype *Avg, const int N){
int tIdx = blockIdx.x*blockDim.x + threadIdx.x;
int bIdx = blockIdx.x;
mytype x , y;
mytype d = 0.0f;
if(tIdx < N){
for(int i = tIdx + 1; i < N ; i++){
x = X[tIdx] - X[i];
y = Y[tIdx] - Y[i];
d += 1.0e0f/(sqrt(x*x + y*y));
}
__syncthreads();
Avg[bIdx] = ReduceSum2(d);
}
}
mytype cpu_avg(const mytype *rx, const mytype *ry, const int size){
mytype average = 0.0f;
for(int i = 0; i < size; i++){
for(int j = i + 1; j < size; j++){
average += 1.0e0f/sqrt((rx[i]-rx[j])*(rx[i]-rx[j]) + (ry[i]-ry[j])*(ry[i]-ry[j]));
}
}
average = average/(mytype)size;
return average;
}
int main() {
int Np = DSIZE;
mytype *rx, *ry, *d_rx, *d_ry, *d_Avg, *Avg;
curandState *d_state;
int seed = 1;
dim3 threads(BlockSize,BlockSize);
dim3 blocks((int)ceilf(Np/(float)threads.x),(int)ceilf(Np/(float)threads.y));
printf("number of blocks = %d\n", blocks.x);
printf("number of threads= %d\n", threads.x);
rx = (mytype *)malloc(DSIZE*sizeof(mytype));
if (rx == 0) {printf("malloc fail\n"); return 1;}
ry = (mytype *)malloc(DSIZE*sizeof(mytype));
if (ry == 0) {printf("malloc fail\n"); return 1;}
cudaMalloc((void**)&d_rx, DSIZE * sizeof(mytype));
cudaMalloc((void**)&d_ry, DSIZE * sizeof(mytype));
cudaMalloc((void**)&d_Avg, blocks.x * sizeof(mytype));
cudaMalloc((void**)&d_state, DSIZE * sizeof(curandState));
cudaCheckErrors("cudamalloc");
InitRNG<<<blocks.x,threads.x>>>(d_state,seed);
SortPoints<<<blocks.x,threads.x>>>(d_rx,d_ry,Np,d_state);
AvgDistance<<<blocks.x,threads.x,threads.x*sizeof(mytype)>>>(d_rx,d_ry,d_Avg,Np);
cudaCheckErrors("kernels");
Avg = new mytype[blocks.x];
cudaMemcpy(Avg,d_Avg,blocks.x*sizeof(mytype),cudaMemcpyDeviceToHost);
cudaMemcpy(rx, d_rx, DSIZE*sizeof(mytype),cudaMemcpyDeviceToHost);
cudaMemcpy(ry, d_ry, DSIZE*sizeof(mytype),cudaMemcpyDeviceToHost);
cudaCheckErrors("cudamemcpy");
mytype average = 0;
for(int i = 0; i < blocks.x; i++){
average += Avg[i];
}
average = average/(mytype)Np;
printf("GPU average = %f\n", average);
average = cpu_avg(rx, ry, DSIZE);
printf("CPU average = %f\n", average);
return 0;
}
I am running on RHEL 5.5, CUDA 5.0, Intel Xeon X5560
compiled with:
nvcc -O3 -arch=sm_20 -lcurand -lm -o t93 t93.cu
EDIT:
After observing that the variability was on the CPU side, I found that I could eliminate most of the CPU variability by modifying your CPU averaging code like this:
mytype cpu_avg(const mytype *rx, const mytype *ry, const int size){
mytype average = 0.0f;
mytype temp = 0.0f;
for(int i = 0; i < size; i++){
for(int j = i + 1; j < size; j++){
temp += 1.0e0f/sqrt((rx[i]-rx[j])*(rx[i]-rx[j]) + (ry[i]-ry[j])*(ry[i]-ry[j]));
}
average += temp/(mytype)size;
temp = 0.0f;
}
return average;
}
So I would say there's a problem with intermediate results on the CPU side. It's interesting that it doesn't show up on the GPU result. I suspect the reason for this is that the final summation of GPU averages is done on the CPU (therefore each individual GPU block result is scaled down by the size, e.g. 8192), and these may have an intermediate precision that is sufficient to survive until the final division. If you inlined the CPU average calculation, you may observe something different again.
I have a large character array in the device global memory that is accessed
in a coalescent manner by threads. I've read somewhere that I could speed up
memory access by reading 4 or 16 chars in one memory transaction per thread.
I believe I would have to use textures and the char4 or int4 structs. However,
I can't find any documentation or examples on this. Could anyone here please
provide a simple example or pointers to where I can learn more about this?
In my code I define the char array as
char *database = NULL;
cudaMalloc( (void**) &database, SIZE * sizeof(char) );
What would the definition be if I want to use textures and char4 (or int4)?
Thanks very much.
I finally figured out the answer to my own question. The definition with char4
would be
char4 *database = NULL;
cudaMalloc( (void**) &database, SIZE * sizeof(char4)/4 );
Don't need textures for this. The kernel does speedup by a factor of three
with char4 but reduces to two if I do loop unrolling. For the sake of completeness
my kernel is
__global__ void kernel(unsigned int jobs_todo, char* database, float* results ) {
unsigned int id = threadIdx.x + blockIdx.x * blockDim.x;
float A = 0; int i; char ch;
if(id < jobs_todo) {
for(i = 0; i < 1000; i += 1){
ch = database[jobs_todo*i + id];
if(ch == 'A') A++;
}
results[id] = A;
}
}
And with char4 it is
__global__ void kernel4(unsigned int jobs_todo, char4* database, float* results ) {
unsigned int id = threadIdx.x + blockIdx.x * blockDim.x;
float A = 0; int i; char4 ch4;
if(id < jobs_todo) {
for(i = 0; i < 1000/4; i += 1){
ch4 = database[jobs_todo*i + id];
if(ch4.x == 'A') A++;
if(ch4.y == 'A') A++;
if(ch4.z == 'A') A++;
if(ch4.w == 'A') A++;
}
results[id] = A;
}
}
I also tried int4 but it's just .0002 seconds faster than the char4 time.
Given that I have the array
Let Sum be 16
dintptr = { 0 , 2, 8,11,13,15}
I want to compute the difference between consecutive indices using the GPU. So the final array should be as follows:
count = { 2, 6,3,2,2,1}
Below is my kernel:
//for this function n is 6
__global__ void kernel(int *dintptr, int * count, int n){
int id = blockDim.x * blockIdx.x + threadIdx.x;
__shared__ int indexes[256];
int need = (n % 256 ==0)?0:1;
int allow = 256 * ( n/256 + need);
while(id < allow){
if(id < n ){
indexes[threadIdx.x] = dintptr[id];
}
__syncthreads();
if(id < n - 1 ){
if(threadIdx.x % 255 == 0 ){
count[id] = indexes[threadIdx.x + 1] - indexes[threadIdx.x];
}else{
count[id] = dintptr[id+1] - dintptr[id];
}
}//end if id<n-1
__syncthreads();
id+=(gridDim.x * blockDim.x);
}//end while
}//end kernel
// For last element explicitly set count[n-1] = SUm - dintptr[n-1]
2 questions:
Is this kernel fast. Can you suggest a faster implementation?
Does this kernel handle arrays of arbitrary size ( I think it does)
I'll bite.
__global__ void kernel(int *dintptr, int * count, int n)
{
for (int id = blockDim.x * blockIdx.x + threadIdx.x;
id < n-1;
id += gridDim.x * blockDim.x)
count[id] = dintptr[id+1] - dintptr[i];
}
(Since you said you "explicitly" set the value of the last element, and you didn't in your kernel, I didn't bother to set it here either.)
I don't see a lot of advantage to using shared memory in this kernel as you do: the L1 cache on Fermi should give you nearly the same advantage since your locality is high and reuse is low.
Both your kernel and mine appear to handle arbitrary-sized arrays. Yours however appears to assume blockDim.x == 256.