Cuda matrix multiplication - cuda

I'm trying to write a matrix multiplication code in cuda, which is pretty similar to Nvidia's cuda programming guide, but it is not working. It is supposed to do C=alpha*A*B+beta*C , but for every A,B C remains unchanged.
__global__ void MatMulKernel(int m,int n,int k,double *A,double *B,double *C,double alpha,double beta)
{
double Ctemp = 0.0;
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
int ind;
for (ind = 0; ind < k; ++ind)
{
Ctemp += A[row+ind*m]*B[ind+col*k];
}
C[row+m*col] = alpha*Ctemp+beta*C[row+m*col];
//C[row+m*col] = Ctemp;
__syncthreads();
}
extern "C" void
local_mm_cuda (const int m, const int n, const int k, const double alpha,
const double *A, const int lda, const double *B, const int ldb,
const double beta, double *C, const int ldc)
{
int row, col;
/* Verify the sizes of lda, ldb, and ldc */
assert (lda >= m);
assert (ldb >= k);
assert (ldc >= m);
// allocating memory for device array
double *dA,*dB,*dC;
size_t sizeA = sizeof(double)*m*k;
size_t sizeB = sizeof(double)*n*k;
size_t sizeC = sizeof(double)*m*n;
cudaMalloc((void**)&dA,sizeA);
cudaMalloc((void**)&dB,sizeB);
cudaMalloc((void**)&dC,sizeC);
cudaMemcpy(dA, A, sizeA, cudaMemcpyHostToDevice);
cudaMemcpy(dB, B, sizeB, cudaMemcpyHostToDevice);
cudaMemcpy(dC, C, sizeC, cudaMemcpyHostToDevice);
// calling matrix multiplication kernal
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid( n/dimBlock.x, m/dimBlock.y);
MatMulKernel<<<dimGrid, dimBlock>>>(m,n,k,dA,dB,dC,alpha,beta);
cudaThreadSynchronize();
// saving C calculated back in C
cudaMemcpy(dC,C, sizeC,cudaMemcpyDeviceToHost);
cudaFree(dA);
cudaFree(dB);
cudaFree(dC);
}

Try to modify
"dim3 dimGrid( n/dimBlock.x, m/dimBlock.y);"
to
"dim3 dimGrid( (n+dimBlock.x-1)/dimBlock.x, (m+dimBlock.y-1)/dimBlock.y); "

Related

CUDA syncthreads() block other threads

#define TS 32
int num_devices = 0;
__global__ void shared_kernel(float* A, float* B, float* C, int M, int N, int K) {
int global_col = blockDim.x * blockIdx.x + threadIdx.x;
int global_row = blockDim.y * blockIdx.y + threadIdx.y;
int local_col = threadIdx.x;
int local_row = threadIdx.y;
if (global_row >= M || global_col >= N) return;
__shared__ float Asub[TS][TS];
__shared__ float Bsub[TS][TS];
const int num_tiles = K / TS;
float acc = 0;
for(int t = 0; t < num_tiles; t++){
const int t_row = TS * t + local_row;
const int t_col = TS * t + local_col;
Asub[local_row][local_col] = A[global_row * K + t_col];
Bsub[local_row][local_col] = B[t_row * N + global_col];
__syncthreads();
printf("[DEBUG] first sync threads, global_row: %d, global_col: %d\n", global_row, global_col);
for (int k = 0; k < K; ++k) {
acc += Asub[local_row][k] * Bsub[k][local_col];
}
__syncthreads();
printf("[DEBUG] second sync threads, global_row: %d, global_col: %d\n", global_row, global_col);
}
C[global_row * N + global_col] = acc;
}
static float *a_d, *b_d, *c_d;
void mat_mul(float *A, float *B, float *C, int M, int N, int K) {
cudaMemcpy(a_d, A, M * K * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(b_d, B, K * N * sizeof(float), cudaMemcpyHostToDevice);
dim3 blockDim(TS, TS);
dim3 gridDim(M/TS, N/TS);
shared_kernel<<<gridDim, blockDim>>>(a_d, b_d, c_d, M, N, K);
cudaMemcpy(C, c_d, M * N * sizeof(float), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
}
void mat_mul_init(float *A, float *B, float *C, int M, int N, int K) {
cudaGetDeviceCount(&num_devices);
cudaSetDevice(0);
cudaMalloc(&a_d, M * K * sizeof(float));
cudaMalloc(&b_d, K * N * sizeof(float));
cudaMalloc(&c_d, M * N * sizeof(float));
}
Above example is a matrix multiplication with shared memory.
I ran above kernel with dim3 blockDim(TS, TS) and dim3 gridDim(M/TS, N/TS) and M, N, K = 128.
I checked that float * C has zero value after launching kernel. Also, I found that only few of global_row are printed(from 37 to 81) after first __syncthreads(), and there is no printf DEBUG message after the second __syncthreads().
I suspect that __syncthreads() is causing the problem, but I don't know how to fix it. My code is almost the same as other matrix multiplication code in other site.
Would you give me some hint how to solve this?
Any time you are having trouble with a CUDA code, I recommend using proper CUDA error checking and run your code with compute-sanitizer or cuda-memcheck. For this type of analysis, it will be easier if you don't use in-kernel printf.
If you did that, you would see output like this:
========= Invalid __shared__ read of size 4
========= at 0x000002f0 in shared_kernel(float*, float*, float*, int, int, int)
========= by thread (0,2,0) in block (0,1,0)
========= Address 0x00002000 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
... (and more output)
So from that, we can see that your kernel is making invalid __shared__ read operations. Where is that happening in your kernel? You could use the methodology here to identify a specific line of code. However this is a fairly simple kernel, and there is only one line that is reading from shared memory, it is here:
for (int k = 0; k < K; ++k) {
acc += Asub[local_row][k] * Bsub[k][local_col]; // shared reads here
A quick inspection will show that if you let this loop iterate over a range of K=128, then you will index out of bounds here:
for (int k = 0; k < K; ++k) {
acc += Asub[local_row][k] * Bsub[k][local_col];
^ ^
when k is greater than 31, because this would exceed your shared array dimensions:
#define TS 32
__shared__ float Asub[TS][TS];
__shared__ float Bsub[TS][TS];
I'm not going to bother writing a fixed kernel/code for you, because as you've already pointed out, this topic is covered in many other places, and a canonical example is already provided in the programming guide.
FWIW, if i change your for-loop to this:
for (int k = 0; k < TS; ++k) {
then the run-time errors go away for me. cuda-memcheck reports no errors.

Why PyCUDA is faster than C CUDA in this example

I am exploring to move from OpenCL to CUDA, and did a few tests to benchmark the speed of CUDA in various implementations. To my surprise, in the examples below, the PyCUDA implementation is about 20% faster than the C CUDA example.
I read many posts talking about "release build" of C CUDA code. I did try having -Xptxas -O3 in the makefile and that really did not make a difference. I also tried to adjust the block size, with which the kernel was executed. Unfortunately, it did not help improve the speed, either.
My questions here are:
What could be the reasons leading to the speed difference between C CUDA and PYCUDA?
If the "advanced" (lack of a better word) compiling in PYCUDA is one of reasons, how can I optimize the compiling of my C CUDA code?
Are there any other ways to improve the speed of C CUDA in this case?
While I appreciate general comments, I am looking for actionable suggestions that I can validate on my machine. Thanks!
import pycuda.autoinit
import pycuda.driver as drv
import numpy as np
from pycuda.compiler import SourceModule
import time
mod = SourceModule(
"""
__global__ void saxpy(int n, const float a, float *x, float *y)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n){
y[i] = a * x[i] + y[i];
}
}
"""
)
saxpy = mod.get_function("saxpy")
N = 1 << 25
time_elapse = 0.0
for i in range(100):
# print(i)
# print(N)
x = np.ones(N).astype(np.float32)
y = 2 * np.ones(N).astype(np.float32)
start = time.time()
saxpy(
np.int32(N),
np.float32(2.0),
drv.In(x),
drv.InOut(y),
block=(512, 1, 1),
grid=(int(N / 512) + 1, 1),
)
time_elapse += (time.time() - start)
print(time_elapse )
print(y[-100:-1])
print(y.sum())
print(N * 4.0)
#include <stdio.h>
#include <time.h>
#define DIM 512
__global__ void saxpy(int n, float a, float *x, float *y)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n)
y[i] = a * x[i] + y[i];
}
int main(int num_iterations)
{
double start;
double cputime;
int N = 1 << 25;
float *x, *y, *d_x, *d_y;
int i, j;
for (j = 0; j < num_iterations; j++)
{
x = (float *)malloc(N * sizeof(float));
y = (float *)malloc(N * sizeof(float));
cudaMalloc(&d_x, N * sizeof(float));
cudaMalloc(&d_y, N * sizeof(float));
for (i = 0; i < N; i++)
{
x[i] = 1.0f;
y[i] = 2.0f;
}
cudaMemcpy(d_x, x, N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, y, N * sizeof(float), cudaMemcpyHostToDevice);
// Perform SAXPY on 1M elements
start = clock();
saxpy<<<(N + DIM) / DIM, DIM>>>(N, 2.0f, d_x, d_y);
cputime += ((double)(clock() - start) / CLOCKS_PER_SEC);
cudaMemcpy(y, d_y, N * sizeof(float), cudaMemcpyDeviceToHost);
// float maxError = 0.0f;
// for (int i = 0; i < N; i++){
// maxError = max(maxError, abs(y[i] - 4.0f));
// //printf("y[%d]: %f\n", i,y[i]);
// }
// printf("Max error: %f\n", maxError);
cudaFree(d_x);
cudaFree(d_y);
free(x);
free(y);
}
printf("cpu time is %f\n", cputime);
return 0;
}
I saved the above file as cuda_example.cu and compile it with the following commands in a makefile:
nvcc -arch=sm_61 -Xptxas -O3,-v -o main cuda_example.cu
If I execute your CUDA-C code as is, and set num_iterations to 300 like this:
int num_iterations =300;
then the execution of your program takes about 60s on a Geforce GTX 1650. Your code is extremely inefficient, as you copy data back and forth between GPU and device at every iteration.
So, lets restrict the loop to just the kernel execution:
#include <stdio.h>
#include <time.h>
#define DIM 512
__global__ void saxpy(int n, float a, float *x, float *y)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n)
y[i] = a * x[i] + y[i];
}
int main()
{
double start = clock();
int N = 1 << 25;
float *x, *y, *d_x, *d_y;
int i, j;
int num_iterations = 300;
x = (float *)malloc(N * sizeof(float));
y = (float *)malloc(N * sizeof(float));
cudaMalloc(&d_x, N * sizeof(float));
cudaMalloc(&d_y, N * sizeof(float));
for (i = 0; i < N; i++)
{
x[i] = 1.0f;
y[i] = 2.0f;
}
cudaMemcpy(d_x, x, N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, y, N * sizeof(float), cudaMemcpyHostToDevice);
for (j = 0; j < num_iterations; j++){
saxpy<<<(N + DIM) / DIM, DIM>>>(N, 2.0f, d_x, d_y);
cudaDeviceSynchronize();
}
cudaMemcpy(y, d_y, N * sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(d_x);
cudaFree(d_y);
free(x);
free(y);
double cputime = ((double)(clock() - start) / CLOCKS_PER_SEC);
printf("cpu time is %f\n", cputime);
return 0;
}
If I do that, then the execution time becomes 1.36 seconds. Doing sth similar to the PyCUDA code I got about 19s of execution time.

Optimizing CalculateConvolutionOutputTensor__im2col

Request
I am writing to request guidance in optimizing my solution / method "CalculateConvolutionOutputTensor__im2col". I would like help determining the best strategy for moving beyond my naive approach; offerings of intuition about any relevant GPU processes and how they apply (e.g., bank conflicts); and help interpreting the above profile in terms of what I can tweak.
A first run of the method takes 0.774 seconds using a GeForce 2080 Ti. I have included a screenshot of the Nsight Compute profile of the only CUDA C++ kernel I have written: im2col.
Things I Could Do
I could have each GPU thread access shared memory instead of global memory. I could transfer GPU "heap" variables to kernel "stack" instead of dereferencing for every thread and in-kernel for-loop iteration. I could put small parameters into arrays in GPU memory and pass single pointers to those arrays. I could use a more sophisticated version of im2col.
Things I Have Tried
I would prefer not to use cuDNN 7.6.5; when I use cuDNN 7.6.5 and write the statement "cudnnCreate(&cudnnHandle);", Nsight Compute suggests that method cuModuleGetFunction returns CUDA_ERROR_NOT_FOUND.
Recreating Solution
The procedure I used to create this project was to create a new CUDA 10.2 Runtime project using Visual Studio Community 2019, rename the default source file to "main.cu", replace all contents with the first code block below, add "CalculateConvolutionOutputTensor__im2col.h" to my project, add the second code block below, add "CalculateConvolutionOutputTensor__im2col.cu" to my project, add the third code block below, and add "cublas.lib;" to Project Properties -> Linker -> Input -> Additional Dependencies.
main.cu
// Allow use of cudaMalloc.
#include <cuda_runtime.h>
// Allow use of time(NULL) as a seed.
#include <ctime>
// Allow construction of a default_random_engine.
#include <random>
// Allow use of CalculateConvolutionOutputTensor__im2col.
#include "CalculateConvolutionOutputTensor__im2col.h"
int main()
{
// --------------------------------------------------------------------------
// Declare and define parameters of CalculateConvolutionOutputTensor__im2col.
// --------------------------------------------------------------------------
float* convolutionOutputTensor;
cudaMalloc(&convolutionOutputTensor, 6 * 3 * 19 * 19 * 4 * sizeof(float));
int elementsInFilter = 3 * 590 * 590;
int elementsInChannelOfOutputTensor = 19 * 19;
int imagesInSubdivision = 4;
int channelsInFilter_host = 3;
int* channelsInFilter_GPU;
cudaMalloc(&channelsInFilter_GPU, sizeof(int));
cudaMemcpy(channelsInFilter_GPU, &channelsInFilter_host, sizeof(int), cudaMemcpyHostToDevice);
int widthOfFilter_host = 590;
int* widthOfFilter_GPU;
cudaMalloc(&widthOfFilter_GPU, sizeof(int));
cudaMemcpy(widthOfFilter_GPU, &widthOfFilter_host, sizeof(int), cudaMemcpyHostToDevice);
int heightOfOutputTensor_host = 19;
int* heightOfOutputTensor_GPU;
cudaMalloc(&heightOfOutputTensor_GPU, sizeof(int));
cudaMemcpy(heightOfOutputTensor_GPU, &heightOfOutputTensor_host, sizeof(int), cudaMemcpyHostToDevice);
int widthOfOutputTensor_host = 19;
int* widthOfOutputTensor_GPU;
cudaMalloc(&widthOfOutputTensor_GPU, sizeof(int));
cudaMemcpy(widthOfOutputTensor_GPU, &widthOfOutputTensor_host, sizeof(int), cudaMemcpyHostToDevice);
int elementsInChannelOfOutputTensor_host = 19 * 19;
int* elementsInChannelOfOutputTensor_GPU;
cudaMalloc(&elementsInChannelOfOutputTensor_GPU, sizeof(int));
cudaMemcpy(
elementsInChannelOfOutputTensor_GPU,
&elementsInChannelOfOutputTensor_host,
sizeof(int),
cudaMemcpyHostToDevice);
int channelsInFilter_times_elementsInChannelOfOutputTensor_host = 3 * 19 * 19;
int* channelsInFilter_times_elementsInChannelOfOutputTensor_GPU;
cudaMalloc(&channelsInFilter_times_elementsInChannelOfOutputTensor_GPU, sizeof(int));
cudaMemcpy(
channelsInFilter_times_elementsInChannelOfOutputTensor_GPU,
&channelsInFilter_times_elementsInChannelOfOutputTensor_host,
sizeof(int),
cudaMemcpyHostToDevice);
int elementsInCrossSectionOfFilter_times_elementsInChannelOfOutputTensor_host = 3 * 590 * 19 * 19;
int* elementsInCrossSectionOfFilter_times_elementsInChannelOfOutputTensor_GPU;
cudaMalloc(&elementsInCrossSectionOfFilter_times_elementsInChannelOfOutputTensor_GPU, sizeof(int));
cudaMemcpy(
elementsInCrossSectionOfFilter_times_elementsInChannelOfOutputTensor_GPU,
&elementsInCrossSectionOfFilter_times_elementsInChannelOfOutputTensor_host,
sizeof(int),
cudaMemcpyHostToDevice);
int elementsInInputTensor = 3 * 608 * 608 * 4;
float* inputTensor_host = new float[elementsInInputTensor];
for (int i = 0; i < elementsInInputTensor; ++i) {
inputTensor_host[i] = ((float)(i % 255)) / 255.0;
}
float* inputTensor_GPU;
cudaMalloc(&inputTensor_GPU, elementsInInputTensor * sizeof(float));
cudaMemcpy(
inputTensor_GPU,
inputTensor_host,
elementsInInputTensor * sizeof(float),
cudaMemcpyHostToDevice);
delete[] inputTensor_host;
int horizontalFilterStride_host = 1;
int* horizontalFilterStride_GPU;
cudaMalloc(&horizontalFilterStride_GPU, sizeof(int));
cudaMemcpy(
horizontalFilterStride_GPU,
&horizontalFilterStride_host,
sizeof(int),
cudaMemcpyHostToDevice);
int channelsInImage_host = 3;
int* channelsInImage_GPU;
cudaMalloc(&channelsInImage_GPU, sizeof(int));
cudaMemcpy(channelsInImage_GPU, &channelsInImage_host, sizeof(int), cudaMemcpyHostToDevice);
int verticalFilterStride_host = 1;
int* verticalFilterStride_GPU;
cudaMalloc(&verticalFilterStride_GPU, sizeof(int));
cudaMemcpy(
verticalFilterStride_GPU,
&verticalFilterStride_host,
sizeof(int),
cudaMemcpyHostToDevice);
int elementsInCrossSectionOfImage_host = 3 * 608;
int* elementsInCrossSectionOfImage_GPU;
cudaMalloc(&elementsInCrossSectionOfImage_GPU, sizeof(int));
cudaMemcpy(
elementsInCrossSectionOfImage_GPU,
&elementsInCrossSectionOfImage_host,
sizeof(int),
cudaMemcpyHostToDevice);
int elementsInImage_host = 3 * 608 * 608;
int* elementsInImage_GPU;
cudaMalloc(&elementsInImage_GPU, sizeof(int));
cudaMemcpy(elementsInImage_GPU, &elementsInImage_host, sizeof(int), cudaMemcpyHostToDevice);
int filters = 6 * 3;
int elementsInFilterTensor = 6 * 3 * 3 * 590 * 590;
float* filterTensor_host = new float[elementsInFilterTensor];
std::default_random_engine randomNumberGenerator(time(NULL));
std::normal_distribution<float> normalDistribution(0.0, 1.0);
for (int i = 0; i < elementsInFilterTensor; ++i) {
filterTensor_host[i] = normalDistribution(randomNumberGenerator) / sqrt((float)elementsInFilterTensor);
}
float* filterTensor_GPU;
cudaMalloc(&filterTensor_GPU, elementsInFilterTensor * sizeof(float));
cudaMemcpy(
filterTensor_GPU,
filterTensor_host,
elementsInFilterTensor * sizeof(float),
cudaMemcpyHostToDevice);
delete[] filterTensor_host;
int elementsInOutputSubtensor = 6 * 3 * 19 * 19;
// -------------------------------------------------
// Execute CalculateConvolutionOutputTensor__im2col.
// -------------------------------------------------
CalculateConvolutionOutputTensor__im2col(
convolutionOutputTensor,
elementsInFilter,
elementsInChannelOfOutputTensor_host,
imagesInSubdivision,
channelsInFilter_GPU,
widthOfFilter_GPU,
heightOfOutputTensor_GPU,
widthOfOutputTensor_GPU,
elementsInChannelOfOutputTensor_GPU,
channelsInFilter_times_elementsInChannelOfOutputTensor_GPU,
elementsInCrossSectionOfFilter_times_elementsInChannelOfOutputTensor_GPU,
inputTensor_GPU,
horizontalFilterStride_GPU,
channelsInImage_GPU,
verticalFilterStride_GPU,
elementsInCrossSectionOfImage_GPU,
elementsInImage_GPU,
filters,
filterTensor_GPU,
elementsInOutputSubtensor);
cudaFree(channelsInFilter_GPU);
cudaFree(widthOfFilter_GPU);
cudaFree(heightOfOutputTensor_GPU);
cudaFree(widthOfOutputTensor_GPU);
cudaFree(elementsInChannelOfOutputTensor_GPU);
cudaFree(channelsInFilter_times_elementsInChannelOfOutputTensor_GPU);
cudaFree(elementsInCrossSectionOfFilter_times_elementsInChannelOfOutputTensor_GPU);
cudaFree(inputTensor_GPU);
cudaFree(horizontalFilterStride_GPU);
cudaFree(channelsInImage_GPU);
cudaFree(verticalFilterStride_GPU);
cudaFree(elementsInCrossSectionOfImage_GPU);
cudaFree(elementsInImage_GPU);
cudaFree(filterTensor_GPU);
// --------------------------------------------------
// Make sure that convolutionOutputTensor is correct.
// --------------------------------------------------
float* convolutionOutputTensor_test = new float[6 * 3 * 19 * 19 * 4];
cudaMemcpy(
convolutionOutputTensor_test,
convolutionOutputTensor,
6 * 3 * 19 * 19 * 4 * sizeof(float),
cudaMemcpyDeviceToHost);
printf("convolutionOutputTensor_test: {");
for (int i = 0; i < 18; ++i) {
printf("%f, ", convolutionOutputTensor_test[i]);
}
printf("...}\n");
delete[] convolutionOutputTensor_test;
cudaFree(convolutionOutputTensor);
return 0;
}
CalculateConvolutionOutputTensor__im2col.h
void CalculateConvolutionOutputTensor__im2col(
float* convolutionOutputTensor_child,
int elementsInFilter_child,
int elementsInChannelOfOutputTensor_host_child,
int imagesInSubdivision_child,
int* channelsInFilter,
int* widthOfFilter,
int* heightOfOutputTensor,
int* widthOfOutputTensor,
int* elementsInChannelOfOutputTensor_GPU_child,
int* channelsInFilter_times_elementsInChannelOfOutputTensor,
int* elementsInCrossSectionOfFilter_times_elementsInChannelOfOutputTensor,
float* inputTensor_child,
int* horizontalFilterStride,
int* channelsInImage,
int* verticalFilterStride,
int* elementsInCrossSectionOfImage,
int* elementsInImage,
int filters_child,
float* filterTensor,
int elementsInOutputSubtensor_child);
CalculateConvolutionOutputTensor__im2col.cu
// Allow use of __global__.
#include <cuda_runtime.h>
// Allow declaration of cublasHandle.
#include "cublas_v2.h"
// Allow use of blockIdx.x, blockDim.x, and threadIdx.x.
#include <device_launch_parameters.h>
__global__
void im2col(
float* col_child,
int* channelsInFilter_child,
int* widthOfFilter_child,
int* heightOfOutputTensor_child,
int* widthOfOutputTensor_child,
int* elementsInChannelOfOutputTensor_child,
int* channelsInFilter_times_elementsInChannelOfOutputTensor_child,
int* elementsInCrossSectionOfFilter_times_elementsInChannelOfOutputTensor_child,
float* inputTensor_child_child,
int* horizontalFilterStride_child,
int* channelsInImage_child,
int* verticalFilterStride_child,
int* elementsInCrossSectionOfImage_child,
int* image_child,
int* elementsInImage_child);
void CalculateConvolutionOutputTensor__im2col(
float* convolutionOutputTensor_child,
int elementsInFilter_child,
int elementsInChannelOfOutputTensor_host_child,
int imagesInSubdivision_child,
int* channelsInFilter,
int* widthOfFilter,
int* heightOfOutputTensor,
int* widthOfOutputTensor,
int* elementsInChannelOfOutputTensor_GPU_child,
int* channelsInFilter_times_elementsInChannelOfOutputTensor,
int* elementsInCrossSectionOfFilter_times_elementsInChannelOfOutputTensor,
float* inputTensor_child,
int* horizontalFilterStride,
int* channelsInImage,
int* verticalFilterStride,
int* elementsInCrossSectionOfImage,
int* elementsInImage,
int filters_child,
float* filterTensor,
int elementsInOutputSubtensor_child)
{
// -----------------------------------------
// Define and declare parameters for im2col.
// -----------------------------------------
// Define parameters for the execution configuration of im2col.
int threads_per_block_for_im2col = 885;
int blocks_for_im2col =
(elementsInFilter_child + threads_per_block_for_im2col - 1) / threads_per_block_for_im2col;
// Declare col.
float* col;
// The following statement is required to
// prevent automatic casting of a product to an eight-byte integer.
int elementsInFilter_times_elementsInChannelOfOutputTensor =
elementsInFilter_child * elementsInChannelOfOutputTensor_host_child;
cudaMalloc(&col, elementsInFilter_times_elementsInChannelOfOutputTensor * sizeof(float));
// -----------------------------------------------------------------------------
// Define parameters for calculating the matrix product of filterTensor and col.
// -----------------------------------------------------------------------------
// Define a cublasHandle_t object called cublasHandle.
// Declaring cublasHandle requires '#include "cublas_v2.h"'.
// Defining cublasHandle requires adding "cublas.lib" to
// Properties -> Linker -> Input -> Additional Dependencies.
cublasHandle_t cublasHandle;
cublasCreate(&cublasHandle);
// Define parameters for (not) including
// a portion of a third matrix in product_filterTensor_and_col.
float one = 1.0;
float zero = 0.0;
// ------------------------------------------------------------
// For each image in subdivision,
// sculpt image into matrix col.
// Calculate the matrix product of filterTensor and col and
// store the product as a subtensor of convolutionOutputTensor.
// ------------------------------------------------------------
// The following statement is required to
// prevent automatic casting of a product to an eight-byte integer.
int image_times_elementsInOutputSubtensor;
int* image_GPU;
cudaMalloc(&image_GPU, sizeof(int));
for (int image_host = 0; image_host < imagesInSubdivision_child; ++image_host) {
cudaMemcpy(image_GPU, &image_host, sizeof(int), cudaMemcpyHostToDevice);
im2col<<<blocks_for_im2col, threads_per_block_for_im2col>>>
(col,
channelsInFilter,
widthOfFilter,
heightOfOutputTensor,
widthOfOutputTensor,
elementsInChannelOfOutputTensor_GPU_child,
channelsInFilter_times_elementsInChannelOfOutputTensor,
elementsInCrossSectionOfFilter_times_elementsInChannelOfOutputTensor,
inputTensor_child,
horizontalFilterStride,
channelsInImage,
verticalFilterStride,
elementsInCrossSectionOfImage,
image_GPU,
elementsInImage);
cudaDeviceSynchronize();
// The following statement is required to
// prevent automatic casting of a product to an eight-byte integer.
image_times_elementsInOutputSubtensor = image_host * elementsInOutputSubtensor_child;
cublasSgemm(
cublasHandle,
CUBLAS_OP_N,
CUBLAS_OP_N,
elementsInChannelOfOutputTensor_host_child,
filters_child,
elementsInFilter_child,
&one,
col,
elementsInChannelOfOutputTensor_host_child,
filterTensor,
elementsInFilter_child,
&zero,
convolutionOutputTensor_child + image_times_elementsInOutputSubtensor,
elementsInChannelOfOutputTensor_host_child);
}
cudaFree(col);
cudaFree(image_GPU);
}
__global__
void im2col(
float* col_child,
int* channelsInFilter_child,
int* widthOfFilter_child,
int* heightOfOutputTensor_child,
int* widthOfOutputTensor_child,
int* elementsInChannelOfOutputTensor_child,
int* channelsInFilter_times_elementsInChannelOfOutputTensor_child,
int* elementsInCrossSectionOfFilter_times_elementsInChannelOfOutputTensor_child,
float* inputTensor_child_child,
int* horizontalFilterStride_child,
int* channelsInImage_child,
int* verticalFilterStride_child,
int* elementsInCrossSectionOfImage_child,
int* image,
int* elementsInImage_child)
{
int index = blockIdx.x * blockDim.x + threadIdx.x;
int c_prime = index % (*channelsInFilter_child);
int temp = (index - c_prime) / (*channelsInFilter_child);
int w_prime = temp % (*widthOfFilter_child);
int h_prime = temp / (*widthOfFilter_child);
for (int h = 0; h < (*heightOfOutputTensor_child); ++h) {
for (int w = 0; w < (*widthOfOutputTensor_child); ++w) {
col_child[
w +
h * (*widthOfOutputTensor_child) +
c_prime * (*elementsInChannelOfOutputTensor_child) +
w_prime * (*channelsInFilter_times_elementsInChannelOfOutputTensor_child) +
h_prime * (*elementsInCrossSectionOfFilter_times_elementsInChannelOfOutputTensor_child)] =
inputTensor_child_child[
c_prime +
(w * (*horizontalFilterStride_child) + w_prime) * (*channelsInImage_child) +
(h * (*verticalFilterStride_child) + h_prime) * (*elementsInCrossSectionOfImage_child) +
(*image) * (*elementsInImage_child)];
}
}
}
After reading through the NVIDIA articles that Robert Crovella provided me, I rewrote my solution "CalculateConvolutionOutputTensor__im2col" to have threads in each block load from contiguous global memory. I used less indexing arithmetic and fewer parameters. I saw a method speed-up of (1 method / 0.445 s) / (1 method / 0.774 s) = 1.7, and an im2col kernel speed-up of (1 kernel / 35.27 ms) / (1 kernel / 128.15 ms) = 3.6. Thanks for pointing me to useful specific reading.
im2col used to take 128.15 ms; now it takes only 32.12 ms. Sgemm takes 6.34 ms now; probably took about the same then. Their total is 38.46 ms. The pair is run four times, for a total of 153.84 ms. I wonder how to speed up im2col more, and to reduce the 274.16 ms in "overhead".
To sculpt an image into matrix col, I had the (3*590/2) threads in each of (2*590*19*19) blocks transfer half cross sections of a filter-shaped portion of an image sequentially to col. I believe that each thread loaded from global memory physically adjacent to the memory accessed by the previous thread, and that each thread stored to global memory physically adjacent to the memory stored to by the previous thread. I did notice that 11 threads in the last warp in each block went unused.
I think I might take th31 up on their suggestion and move this optimization thread to Code Review.
Nsight Compute profile of im2col with coalesced global memory loads and stores
main.cu
// Allow use of cudaMalloc.
#include <cuda_runtime.h>
// Allow use of structs in namespace chrono.
#include <ctime>
// Allow construction of a default_random_engine.
#include <random>
// Allow use of CalculateConvolutionOutputTensor__im2col.
#include "CalculateConvolutionOutputTensor__im2col.h"
int main()
{
// --------------------------------------------------------------------------
// Declare and define parameters of CalculateConvolutionOutputTensor__im2col.
// --------------------------------------------------------------------------
float* convolutionOutputTensor;
cudaMalloc(&convolutionOutputTensor, 6 * 3 * 19 * 19 * 4 * sizeof(float));
int elementsInFilter = 3 * 590 * 590;
int elementsInChannelOfOutputTensor = 19 * 19;
int imagesInSubdivision = 4;
int elementsInInputTensor = 3 * 608 * 608 * 4;
float* inputTensor_host = new float[elementsInInputTensor];
for (int i = 0; i < elementsInInputTensor; ++i) {
inputTensor_host[i] = ((float)(i % 255)) / 255.0;
}
float* inputTensor_GPU;
cudaMalloc(&inputTensor_GPU, elementsInInputTensor * sizeof(float));
cudaMemcpy(
inputTensor_GPU,
inputTensor_host,
elementsInInputTensor * sizeof(float),
cudaMemcpyHostToDevice);
delete[] inputTensor_host;
int heightOfFilter_host = 590;
int* heightOfFilter_GPU;
cudaMalloc(&heightOfFilter_GPU, sizeof(int));
cudaMemcpy(heightOfFilter_GPU, &heightOfFilter_host, sizeof(int), cudaMemcpyHostToDevice);
int channelsInImage_host = 3;
int* channelsInImage_GPU;
cudaMalloc(&channelsInImage_GPU, sizeof(int));
cudaMemcpy(channelsInImage_GPU, &channelsInImage_host, sizeof(int), cudaMemcpyHostToDevice);
int widthOfImage_host = 608;
int* widthOfImage_GPU;
cudaMalloc(&widthOfImage_GPU, sizeof(int));
cudaMemcpy(widthOfImage_GPU, &widthOfImage_host, sizeof(int), cudaMemcpyHostToDevice);
int widthOfOutputTensor_host = 19;
int* widthOfOutputTensor_GPU;
cudaMalloc(&widthOfOutputTensor_GPU, sizeof(int));
cudaMemcpy(widthOfOutputTensor_GPU, &widthOfOutputTensor_host, sizeof(int), cudaMemcpyHostToDevice);
int heightOfImage_host = 608;
int* heightOfImage_GPU;
cudaMalloc(&heightOfImage_GPU, sizeof(int));
cudaMemcpy(heightOfImage_GPU, &heightOfImage_host, sizeof(int), cudaMemcpyHostToDevice);
int filters = 6 * 3;
int elementsInFilterTensor = 6 * 3 * 3 * 590 * 590;
float* filterTensor_host = new float[elementsInFilterTensor];
std::default_random_engine randomNumberGenerator(time(NULL));
std::normal_distribution<float> normalDistribution(0.0, 1.0);
for (int i = 0; i < elementsInFilterTensor; ++i) {
filterTensor_host[i] = normalDistribution(randomNumberGenerator) / sqrt((float)elementsInFilterTensor);
}
float* filterTensor_GPU;
cudaMalloc(&filterTensor_GPU, elementsInFilterTensor * sizeof(float));
cudaMemcpy(
filterTensor_GPU,
filterTensor_host,
elementsInFilterTensor * sizeof(float),
cudaMemcpyHostToDevice);
delete[] filterTensor_host;
int elementsInOutputSubtensor = 6 * 3 * 19 * 19;
// -------------------------------------------------
// Execute CalculateConvolutionOutputTensor__im2col.
// -------------------------------------------------
CalculateConvolutionOutputTensor__im2col(
convolutionOutputTensor,
elementsInFilter,
elementsInChannelOfOutputTensor,
imagesInSubdivision,
inputTensor_GPU,
heightOfFilter_GPU,
channelsInImage_GPU,
widthOfImage_GPU,
widthOfOutputTensor_GPU,
heightOfImage_GPU,
filters,
filterTensor_GPU,
elementsInOutputSubtensor);
cudaFree(inputTensor_GPU);
cudaFree(heightOfFilter_GPU);
cudaFree(channelsInImage_GPU);
cudaFree(widthOfImage_GPU);
cudaFree(widthOfOutputTensor_GPU);
cudaFree(heightOfImage_GPU);
cudaFree(filterTensor_GPU);
// --------------------------------------------------
// Make sure that convolutionOutputTensor is correct.
// --------------------------------------------------
float* convolutionOutputTensor_test = new float[6 * 3 * 19 * 19 * 4];
cudaMemcpy(
convolutionOutputTensor_test,
convolutionOutputTensor,
6 * 3 * 19 * 19 * 4 * sizeof(float),
cudaMemcpyDeviceToHost);
printf("convolutionOutputTensor_test: {");
for (int i = 0; i < 18; ++i) {
printf("%f, ", convolutionOutputTensor_test[i]);
}
printf("...}\n");
delete[] convolutionOutputTensor_test;
cudaFree(convolutionOutputTensor);
return 0;
}
CalculateConvolutionOutputTensor__im2col.h
void CalculateConvolutionOutputTensor__im2col(
float* convolutionOutputTensor_child,
int elementsInFilter_child,
int elementsInChannelOfOutputTensor_host_child,
int imagesInSubdivision_child,
float* inputTensor_child,
int* heightOfFilter,
int* channelsInImage,
int* widthOfImage,
int* widthOfOutputTensor,
int* heightOfImage,
int filters_child,
float* filterTensor,
int elementsInOutputSubtensor_child);
CalculateConvolutionOutputTensor__im2col.cu
// Allow use of __global__.
#include <cuda_runtime.h>
// Allow declaration of cublasHandle.
#include "cublas_v2.h"
// Allow use of blockIdx.x, blockDim.x, and threadIdx.x.
#include <device_launch_parameters.h>
__global__
void im2col(
float* col_child,
float* inputTensor_child_child,
int* heightOfFilter_child,
int* channelsInImage_child,
int* widthOfImage_child,
int* widthOfOutputTensor_child,
int* image,
int* heightOfImage_child);
void CalculateConvolutionOutputTensor__im2col(
float* convolutionOutputTensor_child,
int elementsInFilter_child,
int elementsInChannelOfOutputTensor_host_child,
int imagesInSubdivision_child,
float* inputTensor_child,
int* heightOfFilter,
int* channelsInImage,
int* widthOfImage,
int* widthOfOutputTensor,
int* heightOfImage,
int filters_child,
float* filterTensor,
int elementsInOutputSubtensor_child)
{
// -----------------------------------------
// Define and declare parameters for im2col.
// -----------------------------------------
// Define parameters for the execution configuration of im2col.
int threads_per_block_for_im2col = 3 * 590 / 2;
int blocks_for_im2col = 2 * 590 * 19 * 19;
// Declare col.
float* col;
// The following statement is required to
// prevent automatic casting of a product to an eight-byte integer.
int elementsInFilter_times_elementsInChannelOfOutputTensor =
elementsInFilter_child * elementsInChannelOfOutputTensor_host_child;
cudaMalloc(&col, elementsInFilter_times_elementsInChannelOfOutputTensor * sizeof(float));
// -----------------------------------------------------------------------------
// Define parameters for calculating the matrix product of filterTensor and col.
// -----------------------------------------------------------------------------
// Define a cublasHandle_t object called cublasHandle.
// Declaring cublasHandle requires '#include "cublas_v2.h"'.
// Defining cublasHandle requires adding "cublas.lib" to
// Properties -> Linker -> Input -> Additional Dependencies.
cublasHandle_t cublasHandle;
cublasCreate(&cublasHandle);
// Define parameters for (not) including
// a portion of a third matrix in product_filterTensor_and_col.
float one = 1.0;
float zero = 0.0;
// ------------------------------------------------------------
// For each image in subdivision,
// sculpt image into matrix col.
// Calculate the matrix product of filterTensor and col and
// store the product as a subtensor of convolutionOutputTensor.
// ------------------------------------------------------------
// The following statement is required to
// prevent automatic casting of a product to an eight-byte integer.
int image_times_elementsInOutputSubtensor;
int* image_GPU;
cudaMalloc(&image_GPU, sizeof(int));
for (int image_host = 0; image_host < imagesInSubdivision_child; ++image_host) {
cudaMemcpy(image_GPU, &image_host, sizeof(int), cudaMemcpyHostToDevice);
im2col
<<<blocks_for_im2col,
threads_per_block_for_im2col>>>
(col,
inputTensor_child,
heightOfFilter,
channelsInImage,
widthOfImage,
widthOfOutputTensor,
image_GPU,
heightOfImage);
cudaDeviceSynchronize();
// The following statement is required to
// prevent automatic casting of a product to an eight-byte integer.
image_times_elementsInOutputSubtensor = image_host * elementsInOutputSubtensor_child;
cublasSgemm(
cublasHandle,
CUBLAS_OP_N,
CUBLAS_OP_N,
filters_child,
elementsInChannelOfOutputTensor_host_child,
elementsInFilter_child,
&one,
filterTensor,
filters_child,
col,
elementsInFilter_child,
&zero,
convolutionOutputTensor_child + image_times_elementsInOutputSubtensor,
filters_child);
float element = 0.0;
}
cudaFree(col);
cudaFree(image_GPU);
}
__global__
void im2col(
float* col_child,
float* inputTensor_child_child,
int* heightOfFilter_child,
int* channelsInImage_child,
int* widthOfImage_child,
int* widthOfOutputTensor_child,
int* image,
int* heightOfImage_child)
{
col_child[blockIdx.x * blockDim.x + threadIdx.x] =
inputTensor_child_child[
threadIdx.x +
(blockIdx.x % 2) * blockDim.x +
((blockIdx.x % (2 * (*heightOfFilter_child))) / 2) * (*channelsInImage_child) * (*widthOfImage_child) +
(blockIdx.x / (2 * (*heightOfFilter_child))) * (*channelsInImage_child) +
(blockIdx.x / (2 * (*heightOfFilter_child) * (*widthOfOutputTensor_child))) * (*channelsInImage_child) * (*widthOfImage_child) +
(*image) * (*channelsInImage_child) * (*widthOfImage_child) * (*heightOfImage_child)];
}

Thrust/CUDA replicate an array multiple times combined with the values of another array

Let's say I have two arrays
A = {1, 2, 3}
and
B = {10,20,30,40,50}
I want to generate a new array which would have a size of
sizeof(A) * sizeof(B)
I want to replicate B sizeof(A) times, and on each repetition i, the resultant array should have A[i] added to it. So the result would be something like
{11,21,31,41,51,12,22,32,42,52,13,23,33,43,53}
This task can be interpreted as a 2-dimensional problem where the output array can be treated as a matrix of dimensions sizeof(A) times sizeof(B). In this way, we can use 2D CUDA indexing to achieve the desired functionality. A sample CUDA C++ code of this 2D implementation is shown below:
#include <iostream>
#include <cuda_runtime.h>
#include <cassert>
using namespace std;
__global__ void kernel_replicate(int* a, int* b, int* c, int alen, int blen, int clen)
{
const int ai = blockIdx.x * blockDim.x + threadIdx.x;
const int bi = blockIdx.y * blockDim.y + threadIdx.y;
if(ai<alen && bi<blen)
{
const int ci = ai * blen + bi;
c[ci] = a[ai] + b[bi];
}
}
void replicate_device(int* a, int* b, int* c, int alen, int blen, int clen)
{
dim3 block(16,16);
dim3 grid;
grid.x = (alen + block.x - 1) / block.x;
grid.y = (blen + block.y - 1) / block.y;
kernel_replicate<<<grid, block>>>(a,b,c,alen,blen,clen);
assert(cudaSuccess == cudaDeviceSynchronize());
}
void replicate(int* a, int* b, int* c, int alen, int blen, int clen)
{
int *ad, *bd, *cd;
size_t abytes = alen * sizeof(int);
size_t bbytes = blen * sizeof(int);
size_t cbytes = clen * sizeof(int);
cudaMalloc(&ad, abytes);
cudaMalloc(&bd, bbytes);
cudaMalloc(&cd, cbytes);
cudaMemcpy(ad,a, abytes, cudaMemcpyHostToDevice);
cudaMemcpy(bd,b, bbytes, cudaMemcpyHostToDevice);
replicate_device(ad,bd,cd, alen,blen,clen);
cudaMemcpy(c,cd, cbytes, cudaMemcpyDeviceToHost);
cudaFree(ad);
cudaFree(bd);
cudaFree(cd);
}
int main()
{
const int alen = 3;
const int blen = 5;
const int clen = alen * blen;
int A[alen] = {1,2,3};
int B[blen] = {10,20,30,40,50};
int C[clen] = {0};
replicate(A,B,C,alen, blen, clen);
for(int i=0; i<alen; i++)
{
cout<<A[i]<<" ";
}
cout<<endl;
for(int i=0; i<blen; i++)
{
cout<<B[i]<<" ";
}
cout<<endl;
for(int i=0; i<clen; i++)
{
cout<<C[i]<<" ";
}
cout<<endl;
return 0;
}

Unable to execute device kernel in CUDA

I am trying to call a device kernel within a global kernel. My global kernel is a Matrix Multiplication and my device kernel is finding the maximum value and the index in each column of the product matrix. Following is the code :
__device__ void MaxFunction(float* Pd, float* max)
{
int x = (threadIdx.x + blockIdx.x * blockDim.x);
int y = (threadIdx.y + blockIdx.y * blockDim.y);
int k = 0;
int temp = 0; int temp_idx = 0;
for (k = 0; k < wB; ++k) {
if(Pd[x*wB + y] > temp){
temp = Pd[x*wB + y];
temp_idx = x*wB + y;
}
max[y*2 + 0] = temp;
max[y*2 + 1] = temp_idx;
}
}
__global__ void MatrixMulKernel(float* Md, float* Nd, float* Pd, float* max)
{
// declare cache in the shared memory
__shared__ float Mds[blockD][blockD];
__shared__ float Nds[blockD][blockD];
float Pvalue = 0;
// Loop over the Md and Nd block dimension required to compute the Pd element
for (int m = (wA * blockD * blockIdx.y), n = (blockD * blockIdx.x);
m < ((wA * blockD * blockIdx.y)+wA-1);
m += blockD, n += (blockD*hB)){
// collaboratively loading of Md and Nd blocks into shared memory
Mds[threadIdx.y][threadIdx.x] = Md[m + wA * threadIdx.y + threadIdx.x];
Nds[threadIdx.y][threadIdx.x] = Nd[n + wA * threadIdx.y + threadIdx.x];
__syncthreads();
// keep track of the running sum
for (int k = 0; k < blockD; k++)
Pvalue += Mds[threadIdx.y][k] * Nds[k][threadIdx.x];
__syncthreads();
}
// write back to the global memory
int p = hB * blockD * blockIdx.y + blockD * blockIdx.x;
Pd[p + hB * threadIdx.y + threadIdx.x] = Pvalue;
__syncthreads();
MaxFunction(Pd, max);
}
The Main code :
#include<stdio.h>
#include "cuda.h"
#include<stdlib.h>
#define blockD 32
const int wA = 128;
const int hA = 1024;
const int wB = 128;
const int hB = wA;
main(void){
void MatrixMultiplication(float *, float *, float *, float *);
int size_A = wA * hA * sizeof(float);
int size_B = wB * hB * sizeof(float);
int size_C = wB * hA * sizeof(float);
int size_max = 2 * wB * sizeof(float);
float *M, *N, *P, *C;
// allocate memory on the CPU
M = (float*)malloc(size_A);
N = (float*)malloc(size_B);
P = (float*)malloc(size_max);
C = (float*)malloc(size_C);
// initialize the matrices
for (int y=0; y < hA; y++) {
for (int x=0; x < wA; x++){
M[y*wA + x] = x;
}
}
for (int y=0; y<hB; y++) {
for (int x=0; x<wB; x++){
N[y*wB + x] = x;
}
}
MatrixMultiplication(M, N, P, C);
//Write
FILE *f1;
int i, j;
f1 = fopen("max_val.txt","w");
for(i=0; i < (wB * 2); i+=2){
fprintf(f1,"%d\t%d\n",int(P[i]),int(P[i+1]));
}
fclose(f1);
f1 = fopen("Prod_mat.txt","w");
for(i=0; i < 2; i++){
for(j=0; j < wB; j++){
fprintf(f1,"%d\t",int(C[i*wB + j]));
}
fprintf(f1,"\n");
}
fclose(f1);
free( M );
free( N );
free( P );
free( C );
cudaDeviceReset();
return 0;
}
void MatrixMultiplication(float *M, float *N, float *P, float *C) {
int size_A = wA * hA * sizeof(float);
int size_B = wB * hB * sizeof(float);
int size_C = wB * hA * sizeof(float);
int size_max = 2 * wB * sizeof(float);
float *Md, *Nd, *Pd, *max;
// allocate memory on the GPU
cudaMalloc((void**)&Md, size_A);
cudaMalloc((void**)&Nd, size_B);
cudaMalloc((void**)&Pd, size_C);
cudaMalloc((void**)&max, size_max);
// transfer M and N to device memory
cudaMemcpy(Md, M, size_A, cudaMemcpyHostToDevice);
cudaMemcpy(Nd, N, size_B, cudaMemcpyHostToDevice);
// kernel invocation code
dim3 dimBlock(blockD, blockD);
dim3 dimGrid(wA/blockD, hB/blockD);
//Execute Kernel
MatrixMulKernel<<<dimGrid, dimBlock>>>( Md, Nd, Pd, max);
// transfer P from device
cudaMemcpy(P, max, size_max, cudaMemcpyDeviceToHost);
cudaMemcpy(C, Pd, size_C, cudaMemcpyDeviceToHost);
cudaFree(Md);
cudaFree(Nd);
cudaFree(Pd);
cudaFree(max);
}
The Matrix Multiplication result is fine (Verified using Matlab), but I am not able to get the max values and their corresponding index. I would appreciate if anyone can kindly point out at what I am doing wrong. The max variable has only garbage when I run the above code.
Apparently you are attempting to find the maximum value in each column, as well as the offset to that value.
But all of your threads in y are hammering on the same location for max value (max[x*2 + 0]). This isn't recommended, as there is no way to sort out a race condition. You should use atomic operations, or other methods (e.g. reduction) to handle multiple threads updating a single max value this way.
Since you have a need to update two values atomically (the max value and it's location), it's not a simple matter of replacing your plain access with a standard atomic function. However, since you are dealing with two 32-bit adjacent quantities, you may be interested in my answer here.
By the way I think matlab's native matrix multiply on gpuArray should be faster than any matrix multiply code you write. But it would require the Parallel Compute Toolbox.