Read an Array With Threads in CUDA

I was wondering if it is possible, and what the best way is, to read cells from an array with threads in CUDA. To simplify what I mean, here is an example:
I have an array {1,2,3,4,5,6,...} and I would like each thread to read n cells of my array, with n depending mainly on the array's size.
I have been trying a few things, but it doesn't seem to work, so if anyone could point out a (right) way to do it, that would be great.
Thank you.

Generally you want contiguous threads to read contiguous array indices. Doing so results in "coalesced" memory transactions. The simple way to think of it is that if 32 threads are running physically in parallel, and they all do a load, then if all 32 loads fall into the same cache line, then a single memory access can be performed to fill the cache line, rather than 32 separate ones.
So what you want to do is have each thread access n cells that are strided by the number of threads, like this (assuming input data is in the float array data).
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;

for (int i = idx; i < numElements; i += stride) {
    float element = data[i];
    process(element);
}
If your algorithm requires that each thread reads n contiguous data elements, then you are going to incur non-coalesced loads, which will be much more expensive. In this case, I would consider re-designing the algorithm so this type of access is not required.
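For contrast, here is a sketch of that contiguous-per-thread pattern (process() and numElements are assumed to be the same as in the snippet above; this is illustrative, not a recommendation):

// Each thread reads n consecutive cells starting at idx*n. Adjacent
// threads therefore touch addresses n elements apart, so one warp's
// 32 loads spread across many cache lines instead of a single one.
__global__ void contiguousPerThread(const float *data, int n, int numElements)
{
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    for (int i = 0; i < n; ++i) {
        int j = idx * n + i;
        if (j < numElements)
            process(data[j]); // uncoalesced: warp loads are strided by n
    }
}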

Your requirement is:
each thread has to look at the next n numbers
So you can use:
#define N 2
#define NTHREAD 1024
#define ARRAYSIZE (N * NTHREAD)

// develop the kernel as:
__global__ void accessArray(int *array)
{
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    int startId = tid * N;

    // each thread writes its id into its N consecutive cells
    for (int i = 0; i < N; i++) {
        array[startId + i] = tid;
    }
}
// call the kernel by:
accessArray<<<NTHREAD/256, 256>>>(d_array);
Dump out the array and check whether the threads worked the way you wanted.
Full code:
#include <cuda.h>
#include <stdio.h>

#define N 2
#define NTHREAD 1024
#define ARRAYSIZE (N * NTHREAD)

// develop the kernel as:
__global__ void accessArray(int *array)
{
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    int startId = tid * N;

    // each thread writes its id into its N consecutive cells
    for (int i = 0; i < N; i++) {
        array[startId + i] = tid;
    }
}

int main()
{
    int h_array[ARRAYSIZE];
    int *d_array;
    size_t memsize = ARRAYSIZE * sizeof(int); // the array holds ints, not floats

    for (int i = 0; i < ARRAYSIZE; i++) {
        h_array[i] = 0;
    }

    cudaMalloc(&d_array, memsize);
    cudaMemcpy(d_array, h_array, memsize, cudaMemcpyHostToDevice);

    accessArray<<<NTHREAD / 256, 256>>>(d_array);

    cudaMemcpy(h_array, d_array, memsize, cudaMemcpyDeviceToHost);

    for (int i = 0; i < ARRAYSIZE; i++)
        printf("A[%d] => %d\n", i, h_array[i]);

    cudaFree(d_array);
    return 0;
}
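With N = 2, a correct run should print each thread id twice in consecutive cells (A[0] => 0, A[1] => 0, A[2] => 1, A[3] => 1, and so on), confirming that each thread wrote its own pair of cells.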

Related

Unspecified launch failure after cudaDeviceSynchronize() call when the program starts, but no errors using step-through debugging (CUDA)

I've spent several hours struggling with an unspecified launch failure.
I've come up with a tiny task for myself in order to understand how shared memory works.
The task is to divide the array [1, 2, 3, ..., N] into K groups of (N / K) elements and find the sum of each group. (The difference between the current and previous element of the array equals 1.)
I was planning to use N threads in a grid divided between K blocks, so every threadblock contains (N / K) threads. Thus one threadblock could be used to compute the sum of one group. I also wanted to dynamically allocate shared memory.
When I start the program I get an unspecified launch failure after the cudaDeviceSynchronize() call. But when I try step-through debugging everything is OK and works fine.
What am I doing wrong? (Visual Studio 2012 Professional, Compute Capability 2.1.) I would very much appreciate any help.
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#define CUDA_CALL(x) do { if((x) != cudaSuccess) { \
    printf("Error at %s:%d\n",__FILE__,__LINE__); \
    printf("%s\n",cudaGetErrorString(x)); \
    system("pause"); \
    return EXIT_FAILURE;}} while(0)

extern __shared__ double shrd[];

__global__ void kernel(double * a){
    size_t threadID_block = blockDim.x * threadIdx.y + threadIdx.x;
    size_t blockID_global = (gridDim.x * blockIdx.y + blockIdx.x);
    size_t threadID_global = blockID_global * blockDim.x * blockDim.y + threadID_block;

    double * temp = &shrd[blockID_global * blockDim.x * blockDim.y];
    temp[threadID_block] = static_cast<double>(threadID_global);
    __syncthreads();

    if (threadID_block == 0){
        a[blockID_global] = 0.0;
        for (size_t index = 0; index < blockDim.x * blockDim.y; index++){
            a[blockID_global] += temp[index];
        }
    }
}

int main(){
    int devNum = 0;
    CUDA_CALL(cudaGetDevice(&devNum));
    CUDA_CALL(cudaSetDevice(devNum));

    dim3 gridSize(2,2,1);
    dim3 blockSize(4,4,1);

    double * dev_a = NULL;
    size_t length = gridSize.x * gridSize.y;
    size_t byteSize = length * sizeof(double);
    CUDA_CALL(cudaMalloc(&dev_a, byteSize));

    size_t shmem_perBlock = blockSize.x * blockSize.y * sizeof(double);
    kernel <<< gridSize, blockSize, shmem_perBlock >>> (dev_a);
    CUDA_CALL(cudaGetLastError());
    CUDA_CALL(cudaDeviceSynchronize());

    double * a = new double[length];
    CUDA_CALL(cudaMemcpy(a, dev_a, byteSize, cudaMemcpyDeviceToHost));
    for (size_t index = 0; index < length; index++){
        printf("%.3f\n", a[index]);
    }
    printf("\n");

    CUDA_CALL(cudaFree(dev_a));
    CUDA_CALL(cudaDeviceReset());
    delete[] a;
    system("pause");
    return 0;
}
If you are on Kepler or later, first read this:
http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/
Otherwise, if you are pre-Kepler, read this:
http://developer.download.nvidia.com/compute/cuda/1.1-Beta/x86_website/projects/reduction/doc/reduction.pdf
There are some fundamentals you are missing in terms of CUDA programming. I have given you a template of your code below to clarify some of them. Do not expect it to be optimized, as I expect you to program the parallel reduction yourself. This will get you started with an understanding of how to use shared memory.
Good Luck!
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#define N 10000
#define K 100

#define CUDA_CALL(x) do { if((x) != cudaSuccess) { \
    printf("Error at %s:%d\n",__FILE__,__LINE__); \
    printf("%s\n",cudaGetErrorString(x)); \
    system("pause"); \
    return EXIT_FAILURE;}} while(0)

__global__ void kernel(double* a, double* results){
    extern __shared__ double shared[];
    size_t tid, tid_local, stride;
    tid = blockDim.x*blockIdx.x + threadIdx.x; // thread id within the whole grid
    tid_local = threadIdx.x;                   // thread id within a block
    stride = blockDim.x*gridDim.x;             // total number of threads

    double *start = &a[K*blockIdx.x];     // each block gets its own K-element chunk of a
    shared[tid_local] = start[tid_local]; // copy K elements into shared memory
    __syncthreads();

    // Perform a parallel reduction; you will have to implement this.
    // After the parallel reduction, the result should be in shared[0].
    // For demonstration only, the code below is serial on thread 0 of each block.
    double sum = 0;
    if(tid_local == 0){
        for(int i = 0; i < K; i++){
            sum += shared[i];
        }
        results[blockIdx.x] = sum; // write the block's sum to the results array
    }
}

int main(){
    int devNum = 0;
    CUDA_CALL(cudaGetDevice(&devNum));
    CUDA_CALL(cudaSetDevice(devNum));

    double * dev_a = NULL;
    double * dev_results = NULL;

    CUDA_CALL(cudaMalloc(&dev_a, N*sizeof(double)));
    CUDA_CALL(cudaMalloc(&dev_results, (N/K)*sizeof(double)));
    // copy dev_a onto the GPU (this is the array you are summing).

    dim3 block_size(K, 1, 1);
    dim3 grid_size(N/K, 1, 1);
    size_t shmem_perBlock = K * sizeof(double);

    kernel <<< grid_size, block_size, shmem_perBlock >>> (dev_a, dev_results);
    CUDA_CALL(cudaGetLastError());
    CUDA_CALL(cudaDeviceSynchronize());

    // copy dev_results back to the CPU; this is your result.

    CUDA_CALL(cudaFree(dev_a));
    CUDA_CALL(cudaFree(dev_results));
    system("pause");
    return 0;
}
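If it helps, here is a minimal sketch of the tree reduction the template leaves as an exercise (kernel name reduceKernel is illustrative). It assumes the same launch configuration (blockDim.x == K, K doubles of shared memory per block) and handles a block size that is not a power of two, which matters here since K = 100:

__global__ void reduceKernel(const double *a, double *results)
{
    extern __shared__ double shared[];
    int tid_local = threadIdx.x;
    shared[tid_local] = a[blockIdx.x * blockDim.x + tid_local];
    __syncthreads();

    // Fold the upper half onto the lower half each step until one
    // element remains. Using half = (s + 1) / 2 keeps this correct
    // even when blockDim.x is not a power of two.
    for (int s = blockDim.x; s > 1; ) {
        int half = (s + 1) / 2;
        if (tid_local < s - half)
            shared[tid_local] += shared[tid_local + half];
        __syncthreads();
        s = half;
    }

    if (tid_local == 0)
        results[blockIdx.x] = shared[0]; // block's sum ends up in shared[0]
}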

CUDA summation per block: I get 0 returned for the sums. What is wrong?

I tried summation in CUDA, and I can't find what I did wrong here. The sum always comes back 0.
The __shared__ qualifier makes the variable common to each block, so I tried to sum one block at a time and then add the block results up for the overall sum.
But the per-block sum doesn't work, and I am stuck. Can anyone help?
#include <stdio.h>
#include <iostream>
#include <cuda.h>
#include <stdlib.h>

//#define BLOCK_SIZE 32 // size of vectors

__global__ void add(float * i_data, float * sum){
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    __shared__ float s_data;
    s_data = 0;
    // must be synchronized
    __syncthreads();
    // reduce and sum
    // typical in GPU computing
    for (int i = 0; i < blockDim.x; i++)
    {
        __syncthreads();
        if (tid <= i)
        {
            //s_data[blockIdx.x]+ = s_data[tid] + s_data[i+tid];
            s_data += i_data[tid];
        }
    }
    if (tid = 0)
        sum[blockIdx.x] = s_data;
}

int main() {
    int T = 10, B = 5;    // threads per block and blocks per grid
    float *a, *b;         // host pointers
    float *dev_a, *dev_b; // device pointers to host memory
    int sizeIN = T * B * sizeof(float);
    int sizeOUT = B * sizeof(float);

    a = new float[T*B];
    b = new float[B];
    for(int i = 0; i < B; i++)
    {
        for (int j = 0; j < T; j++)
        {
            a[i*T+j] = i;
        }
    }
    for(int i = 0; i < B; i++)
    {
        b[i] = 0;
    }

    cudaMalloc((void **) &dev_a, sizeIN);
    cudaMalloc((void **) &dev_b, sizeOUT);
    cudaMemcpy(dev_a, a, sizeIN, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, sizeOUT, cudaMemcpyHostToDevice);

    add<<< B, T >>>(dev_a, dev_b);

    cudaMemcpy(a, dev_a, sizeIN, cudaMemcpyDeviceToHost);
    cudaMemcpy(b, dev_b, sizeOUT, cudaMemcpyDeviceToHost);

    for(int i = 0; i < B; i++)
    {
        for (int j = 0; j < T; j++)
        {
            std::cout << a[i*T+j] << "\t";
            std::cout << std::endl;
        }
        std::cout << std::endl << std::endl << "sum is: " << b[i] << std::endl;
    }
    std::cout << std::endl << std::endl;

    cudaFree(dev_a);
    cudaFree(dev_b);
    delete[] a; // allocated with new[], so delete[] rather than free()
    delete[] b;
    return 0;
}
This is wrong in 2 ways:
if (tid = 0)
First, you should be doing a comparison == not an assignment =. I don't know why your compiler didn't warn you about this.
Second, tid is only zero for one thread in the entire grid:
int tid = blockIdx.x * blockDim.x + threadIdx.x;
You want one thread in each block to write the block result out to global memory:
if (threadIdx.x == 0)
This is also a problem, similarly:
if (tid <= i)
This is only satisfied for threads in the first block. Beyond that, I have to start to guess at what you want. I guess you're trying to sum the values in each block. Your construction is not a parallel reduction, but to make the minimum changes to get it "functional" I would rewrite the end of your kernel like this:
    // reduce and sum
    // typical in GPU computing
    for (int i = 0; i < blockDim.x; i++)
    {
        if (threadIdx.x == i)
        {
            //s_data[blockIdx.x]+ = s_data[tid] + s_data[i+tid];
            s_data += i_data[tid];
        }
        __syncthreads();
    }
    if (threadIdx.x == 0)
        sum[blockIdx.x] = s_data;
}
Although you didn't have any CUDA API errors, it's good practice to use proper CUDA error checking and to run your code with cuda-memcheck any time you are having trouble with a CUDA code.
I mentioned that your code above is not a classical reduction. It's just an unoptimized for-loop.
To learn about CUDA parallel reduction, study the cuda sample code and the accompanying presentation; there are also many examples on the CUDA tag on SO that you can search for.
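For this code, a minimal sketch of that error checking around the launch might look like the following (the CUDA_CALL macro from the earlier answer does the same job):

add<<< B, T >>>(dev_a, dev_b);

// Catch configuration errors raised at launch time ...
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
    printf("launch error: %s\n", cudaGetErrorString(err));

// ... and errors raised while the kernel actually executed.
err = cudaDeviceSynchronize();
if (err != cudaSuccess)
    printf("kernel error: %s\n", cudaGetErrorString(err));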

CUDA: large kernel gives strange behavior

I recently bought a gtx550ti boost card. Programs that used to work on my old gf440 card fail. Here is an example: the following program works fine with smaller kernels, but goes wrong with larger ones.
#include "stdio.h"
__global__ void kernel(float * d_in, float * d_out){
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int idx = x + y * blockDim.x * gridDim.x;
d_out[idx] = d_in[idx];
}
int main(){
const dim3 gridSize(10,10);
const dim3 blockSize(80,80);
const int size = 800*800;
float * h_in = new float[size];
float * h_out = new float[size];
float * d_in;
float * d_out;
cudaMalloc((void**)&d_in, sizeof(float)*size);
cudaMalloc((void**)&d_out, sizeof(float)*size);
for(int i = 0; i < size; i++)
h_in[i] = (float)i;
cudaMemcpy(d_in, h_in, sizeof(float)*size, cudaMemcpyHostToDevice);
kernel<<<gridSize,blockSize>>>(d_in, d_out);
cudaMemcpy(h_out, d_out, sizeof(float)*size, cudaMemcpyDeviceToHost);
for(int i = 0; i < size; i++)
printf("%f\n",h_out[i]);
cudaFree(d_in);
cudaFree(d_out);
return 0;
}
I expected it to output the indices as floats, but it outputs some random floats:
0.131061
2.520029
9.304665
0.000189
0.242134
0.525557
0.560013
size 100*100
Instead, when I switch to size 100*100:
const dim3 gridSize(10,10);
const dim3 blockSize(10,10);
const int size = 100*100;
It works fine (last 5 outputs):
9995.000000
9996.000000
9997.000000
9998.000000
9999.000000
size 500*500
But for larger size 500*500:
const dim3 gridSize(10,10);
const dim3 blockSize(50,50);
const int size = 500*500;
It outputs wrong indices (last 5 outputs):
512139.000000
512140.000000
512141.000000
512142.000000
512143.000000
I installed CUDA 5.5. Thanks!
Whenever you are having trouble with cuda code, you should be doing proper cuda error checking.
This is not valid:
const dim3 blockSize(80,80);
This is asking for a threadblock of 80*80 = 6400 threads. There are no GPUs that support 6400 threads per threadblock.
This is also not valid:
const dim3 blockSize(50,50);
2500 threads is also too many. These configs would not work on either of your cards.
This is acceptable:
const dim3 blockSize(10,10);
In the "not valid" cases, your kernel is not running. If you had done proper cuda error checking, you would have discovered this and even got a clue as to what might be wrong (invalid launch configuration).
You may also want to familiarize yourself with the deviceQuery cuda sample, and study the output for your GPUs.
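As a sketch, the relevant limits can also be queried directly in code (the same information deviceQuery prints):

cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0); // query device 0

// blockSize.x * blockSize.y * blockSize.z must not exceed this:
printf("max threads per block: %d\n", prop.maxThreadsPerBlock);
printf("max block dims: %d x %d x %d\n",
       prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);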

Allocate 2D array with cudaMallocPitch and copying with cudaMemcpy2D

I'm new to CUDA; I appreciate your help and hope you can help me.
I need to store multiple elements of a 2D array into a vector, and then work with the vector, but my code does not work well. When I debug, I find a mistake in allocating the 2D array on the device with cudaMallocPitch and in copying to that array with cudaMemcpy2D. This is my code:
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cmath>

#define maxThreads 96

__global__ void extract(int mSize, float* dev_vector, float* dev_matrix, int N)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    while(idx < N)
    {
        dev_vector[idx] = *(dev_matrix + (mSize*idx + N));
        idx += blockDim.x * gridDim.x;
    }
}

int main()
{
    // CPU variables
    int mSize = 5;
    float* matrix;
    int N = 4; // vector size
    int i, j;
    float* vector;

    int blocks, threads;
    float* dev_matrix;
    float* dev_vector;

    blocks  = 1 + ((N-1) / maxThreads);
    threads = 1 + ((N-1) / blocks);

    unsigned long int pitch;
    unsigned long int memsize_vector = N * sizeof(float);
    unsigned long int memsize_matrix = mSize * sizeof(float);

    matrix = new float[memsize_matrix * memsize_matrix];
    vector = new float[memsize_vector];

    // Create the 2D array
    for(i = 0; i < mSize; i++)
        for(j = 0; j < mSize; j++)
        {
            matrix[i + mSize*j] = ((i+1) + (j+1));
        }

    printf("\n");
    for (i = 0; i < mSize; i++){
        for(j = 0; j < mSize; j++){
            printf("% 1.5f ", matrix[i + mSize*j]);
        }
        printf("\n");
    }
    printf("\n");

    cudaMallocPitch((void **)&dev_matrix, &pitch, memsize_matrix, mSize);
    cudaMalloc((void **)&dev_vector, memsize_vector);
    cudaMemcpy2D(dev_matrix, pitch, matrix, memsize_matrix, memsize_matrix, mSize,
                 cudaMemcpyHostToDevice);

    extract<<<blocks,threads>>>(mSize, dev_vector, dev_matrix, N);
    cudaDeviceSynchronize();

    cudaMemcpy(vector, dev_vector, memsize_vector, cudaMemcpyDeviceToHost);

    printf("Vector values are:\n");
    for(i = 0; i < N; i++)
        printf(" % 1.5f ", vector[i]);
    printf("\n");

    cudaFree(dev_matrix);
    cudaFree(dev_vector);
}
There are lots of problems in this code, including but not limited to using array sizes in bytes and word sizes interchangeably in several places, using incorrect types (note that size_t exists for a very good reason), potential truncation and type-casting problems, and more.
But the core problem is the addressing of pitched memory inside the kernel, to which you are never even passing the pitch value. Reading the documentation for cudaMallocPitch will give you the correct method for addressing pitched memory inside a kernel. Your kernel might then look like this:
__global__ void extract(size_t mpitch, float* dev_vector, float* dev_matrix, int N)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    int stride = blockDim.x * gridDim.x;
    while(idx < N)
    {
        dev_vector[idx] = *(float *)( ((char*)dev_matrix + idx * mpitch) + N );
        idx += stride;
    }
}
[disclaimer: never compiled or tested, use at own risk].
You will then have to fix all the problems in the host code to reflect whatever kernel changes you make.
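For reference, a sketch of what the matching host-side calls could look like once the types and the byte/element mixups are fixed (never compiled or tested, same disclaimer as above):

size_t pitch;                             // set by cudaMallocPitch, in bytes
size_t rowBytes = mSize * sizeof(float);  // width of one row in bytes

cudaMallocPitch((void **)&dev_matrix, &pitch, rowBytes, mSize);
cudaMalloc((void **)&dev_vector, N * sizeof(float));
cudaMemcpy2D(dev_matrix, pitch,           // destination and its pitch
             matrix, rowBytes,            // source and its (unpadded) pitch
             rowBytes, mSize,             // width in bytes, height in rows
             cudaMemcpyHostToDevice);

extract<<<blocks, threads>>>(pitch, dev_vector, dev_matrix, N);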
Thanks to all. Alex, I had not seen that, and fixed it, thanks.
talonmies, thank you, my code works with your suggestions. Thanks a lot; this is my final kernel:
__global__ void sumreduct(size_t pitch, float* dev_vector, float* dev_matrix, int columns, int N)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    int stride = blockDim.x * gridDim.x;
    while(idx < N)
    {
        dev_vector[idx] = *(float *)( ((char*)dev_matrix + idx * pitch) + columns);
        idx += stride;
    }
}
About "size_t", I was using "Unsigned int" because Nsight show me the next warning:
Type 'size_t' could not be resolved
Thanks
Did you really mean to declare a source matrix of length [memsize_matrix * memsize_matrix]?
This will allocate 400 floats, or 1600 bytes. This means your source pitch is off, and the cudaMemcpy2D call is failing.
I'm assuming you meant to say
matrix = new float[mSize*mSize];

cuda multiplication

Serial code snippet looks like this:
int i, j;
for(j = 0; j < ny; j++)
{
    for(i = 0; i < nx; i++)
    {
        x[i + j*nx] *= y[i];
    }
}
I converted this to CUDA using this kernel:
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int i, j;
for(tid = 0; tid < nx*ny; tid++)
{
    j = tid/nx;
    i = tid - j*nx;
    x[tid] *= y[i];
}
However, the GPU kernel does not give any speedup. Any suggestions for a better solution? Thanks in advance.
If this is the serial code:
int i, j;
for(j = 0; j < ny; j++)
{
    for(i = 0; i < nx; i++)
    {
        x[i + j*nx] *= y[i];
    }
}
then you should be doing this:
__global__ void fn(float *x, float *y, int nx) // y must be passed in as well
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int j = tid/nx, i = tid - j * nx;
    x[tid] *= y[i];
}

fn<<<nx*ny/B, B>>>(x, y, nx); // with B = 256, 512, etc.
What you're doing is fairly bizarre: you're instructing each thread of the CUDA kernel to iterate over all values of tid between 0 and nx*ny, and to compute the same function as your CPU version! Moreover, instead of just iterating over the indices, each thread does the same work as the single CPU thread, only less efficiently. It's no wonder that this is slower; it should be much, much slower. Your CUDA kernel is:
int tid = blockIdx.x * blockDim.x + threadIdx.x; // tid is computed here ...
int i, j;
for(tid = 0; tid < nx*ny; tid++) // ... then immediately overwritten by the loop
{
    j = tid/nx;
    i = tid - j*nx;
    x[tid] *= y[i];
}
This does nx*ny iterations, same as your host code, for each thread; you lose all benefit of the parallelism, since each thread is doing the same thing; you would get the same performance using one thread on the GPU, and the same result!
If this is the verbatim code from your CUDA source file, you need to change it and redo the comparison; if this is code you have written to help explain what your code is doing for a lay non-CUDA audience, then you need to present your actual CUDA code so that we can see what's going on... as it is, the performance analysis I have done - the trivial one - is all you can expect.
Given your comment to this answer:
the nx * ny = 2205; so I used no. of blocks =
(nx*ny+(threads-1))/threads and threads = 64.
which implies you intend to launch one thread per computation, the correct CUDA implementation would just be:
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int j = tid/nx;
int i = tid - j*nx;
if (tid < (nx*ny))
x[tid] *= y[i];
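Wrapped in a kernel and launched with the figures from that comment, this might look like the following sketch (the kernel name fn2 is illustrative):

__global__ void fn2(float *x, const float *y, int nx, int ny)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int j = tid / nx;
    int i = tid - j * nx;
    if (tid < (nx * ny))
        x[tid] *= y[i];
}

// nx*ny = 2205, threads = 64  ->  (2205 + 63) / 64 = 35 blocks
int threads = 64;
int blocks  = (nx * ny + (threads - 1)) / threads;
fn2<<<blocks, threads>>>(x, y, nx, ny);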
If you were intending for each thread to perform more than one computation per kernel launch, then you would size the grid to "fill" each of the SMs on the target GPU rather than using the same number of threads as the input size, and then do something like:
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int gsize = blockDim.x * gridDim.x;
int i, j;
for(; tid < nx*ny; tid += gsize)
{
    j = tid/nx;
    i = tid - j*nx;
    x[tid] *= y[i];
}
That would at least get you coalesced reads and writes to x, and remove the enormous number of redundant calculations in your posted version. A number of further optimizations could be made, but they would require more information about the problem than has been supplied in the question and subsequent comments. Your indexing scheme contains an integer division and then an integer multiply-add per calculation. That is a lot of overhead for a single FLOP per input value. Having said all of that, if the problem size I quoted is the actual problem size you are interested in, the GPU will never be faster than even a modest host CPU. You would require problems many orders of magnitude larger to realize a useful speedup from the GPU for this sort of low-arithmetic-intensity operation.
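One hedged way to size such a grid to "fill" the device (the kernel name loopKernel stands in for a grid-stride kernel built from the loop above, and the factor of a few blocks per SM is a common heuristic, not a rule):

cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);

// A handful of resident blocks per SM is usually enough to hide latency.
int threads = 256;
int blocks  = prop.multiProcessorCount * 4;
loopKernel<<<blocks, threads>>>(x, y, nx, ny);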
How big is the block? It may be that the time needed to copy a small amount of data to the GPU and set up the environment is much longer than the calculation time.
Remember also that CUDA does a JIT compile on the first run, so to get accurate benchmarking you need to run it many times.
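A minimal sketch of event-based timing with a warm-up launch (fn, blocks, and threads as in the answer above):

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

fn<<<blocks, threads>>>(x, y, nx);  // warm-up run absorbs one-time JIT/setup cost

cudaEventRecord(start);
fn<<<blocks, threads>>>(x, y, nx);  // timed run
cudaEventRecord(stop);

cudaEventSynchronize(stop);         // wait until the timed run has finished
float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);
printf("kernel time: %.3f ms\n", ms);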
Try this using shared memory. One of the best implementations around:
// Matrices are stored in row-major order:
// M(row, col) = *(M.elements + row * M.stride + col)
typedef struct {
    int width;
    int height;
    int stride; // in number of elements
    float *elements;
} Matrix;

// Thread block size
#define BLOCK_SIZE 16

// Get a matrix element
__device__ float GetElement(const Matrix A, int row, int col)
{
    return A.elements[row * A.stride + col];
}

// Set a matrix element
__device__ void SetElement(Matrix A, int row, int col, float value)
{
    A.elements[row * A.stride + col] = value;
}

// Get the BLOCK_SIZE x BLOCK_SIZE sub-matrix Asub of A that is
// located col sub-matrices to the right and row sub-matrices down
// from the upper-left corner of A
__device__ Matrix GetSubMatrix(Matrix A, int row, int col)
{
    Matrix Asub;
    Asub.width  = BLOCK_SIZE;
    Asub.height = BLOCK_SIZE;
    Asub.stride = A.stride;
    Asub.elements = &A.elements[A.stride * BLOCK_SIZE * row
                                + BLOCK_SIZE * col];
    return Asub;
}

// Forward declaration of the matrix multiplication kernel
__global__ void MatMulKernel(const Matrix, const Matrix, Matrix);

// Matrix multiplication - host code
// Matrix dimensions are assumed to be multiples of BLOCK_SIZE
void MatMul(const Matrix A, const Matrix B, Matrix C)
{
    // Same as in the previous example, except for the following
    // (a filled-in sketch follows after this listing):
    // d_A.width = d_A.stride = A.width;
    // d_B.width = d_B.stride = B.width;
    // d_C.width = d_C.stride = C.width;
}

// Matrix multiplication kernel called by MatMul()
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
    // Block row and column
    int blockRow = blockIdx.y;
    int blockCol = blockIdx.x;

    // Each thread block computes one sub-matrix Csub of C
    Matrix Csub = GetSubMatrix(C, blockRow, blockCol);

    // Each thread computes one element of Csub
    // by accumulating results into Cvalue
    float Cvalue = 0;

    // Thread row and column within Csub
    int row = threadIdx.y;
    int col = threadIdx.x;

    // Loop over all the sub-matrices of A and B that are required
    // to compute Csub; multiply each pair of sub-matrices together
    // and accumulate the results
    for (int m = 0; m < (A.width / BLOCK_SIZE); ++m)
    {
        // Get sub-matrix Asub of A and Bsub of B
        Matrix Asub = GetSubMatrix(A, blockRow, m);
        Matrix Bsub = GetSubMatrix(B, m, blockCol);

        // Shared memory used to store Asub and Bsub respectively
        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

        // Load Asub and Bsub from device memory to shared memory;
        // each thread loads one element of each sub-matrix
        As[row][col] = GetElement(Asub, row, col);
        Bs[row][col] = GetElement(Bsub, row, col);

        // Synchronize to make sure the sub-matrices are loaded
        // before starting the computation
        __syncthreads();

        // Multiply Asub and Bsub together
        for (int e = 0; e < BLOCK_SIZE; ++e)
            Cvalue += As[row][e] * Bs[e][col];

        // Synchronize to make sure that the preceding computation
        // is done before loading two new sub-matrices of A and B
        // in the next iteration
        __syncthreads();
    }

    // Write Csub to device memory; each thread writes one element
    SetElement(Csub, row, col, Cvalue);
}
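The elided MatMul body follows the usual allocate/copy/launch/copy-back pattern; a sketch of what it would contain (mirroring the CUDA C Programming Guide example this code is drawn from):

void MatMul(const Matrix A, const Matrix B, Matrix C)
{
    // Load A and B to device memory
    Matrix d_A;
    d_A.width = d_A.stride = A.width; d_A.height = A.height;
    size_t size = A.width * A.height * sizeof(float);
    cudaMalloc(&d_A.elements, size);
    cudaMemcpy(d_A.elements, A.elements, size, cudaMemcpyHostToDevice);

    Matrix d_B;
    d_B.width = d_B.stride = B.width; d_B.height = B.height;
    size = B.width * B.height * sizeof(float);
    cudaMalloc(&d_B.elements, size);
    cudaMemcpy(d_B.elements, B.elements, size, cudaMemcpyHostToDevice);

    // Allocate C in device memory
    Matrix d_C;
    d_C.width = d_C.stride = C.width; d_C.height = C.height;
    size = C.width * C.height * sizeof(float);
    cudaMalloc(&d_C.elements, size);

    // Launch one BLOCK_SIZE x BLOCK_SIZE block per output tile
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y);
    MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);

    // Read C back from device memory and clean up
    cudaMemcpy(C.elements, d_C.elements, size, cudaMemcpyDeviceToHost);
    cudaFree(d_A.elements);
    cudaFree(d_B.elements);
    cudaFree(d_C.elements);
}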