Using cublasGemmBatchedEx - cuda

I am trying to use cublasGemmBatchedEx to perform matrix multiplication.
Here is my code.
#include <iostream>
#include <cublas_v2.h>
#define M 4
#define N 4
#define K 4
//nvcc -lcublas -o matmul_gemmBatchedEx matmul_gemmBatchedEx.cu
void print_matrix(float **A, int rows, int cols, int batch_size) {
for (int i = 0; i < batch_size; i++){
for (int j = 0; j < rows; j++){
for(int k = 0; k < cols; k++){
std::cout << A[i][k * rows + j] << " ";
}
std::cout << std::endl;
}
std::cout << std::endl;
}
}
int main(int argc, char* argv[])
{
// Linear dimension of matrices
int batch_size = 2;
float *h_A[batch_size], *h_B[batch_size], *h_C[batch_size];
for (int i = 0; i < batch_size; i++){
h_A[i] = (float*)malloc(M * K * sizeof(float));
h_B[i] = (float*)malloc(K * N * sizeof(float));
h_C[i] = (float*)malloc(M * N * sizeof(float));
}
for (int i = 0; i < batch_size; i++){
for (int j = 0; j < M * K; j++)
h_A[i][j] = j%4;
for (int j = 0; j < K * N; j++)
h_B[i][j] = j%4 + 4;
for (int j = 0; j < M * N; j++)
h_C[i][j] = 0;
}
std::cout << "A =" << std::endl;
print_matrix(h_A, M, K, batch_size);
std::cout << "B =" << std::endl;
print_matrix(h_B, K, N, batch_size);
float *d_A[batch_size], *d_B[batch_size], *d_C[batch_size];
for (int i = 0; i < batch_size; i++){
cudaMalloc(&d_A[i], sizeof(float)* M * K);
cudaMalloc(&d_B[i], sizeof(float)* K * N);
cudaMalloc(&d_C[i], sizeof(float)* M * N);
}
cudaMemcpy(d_A, h_A, sizeof(float)* M * K * batch_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, sizeof(float)* K * N * batch_size, cudaMemcpyHostToDevice);
cublasHandle_t handle;
cublasCreate(&handle);
// Set up the matrix dimensions and batch size
int lda = M;
int ldb = K;
int ldc = M;
// Set the alpha and beta parameters for the gemm operation
float alpha = 1.0f;
float beta = 0.0f;
cublasStatus_t status = cublasGemmBatchedEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K,
&alpha,
(const void**)d_A, CUDA_R_32F, lda,
(const void**)d_B, CUDA_R_32F, ldb,
&beta,
(void**)d_C, CUDA_R_32F, ldc,
batch_size,
CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT);
cudaMemcpy(h_C,d_C,sizeof(float) * M * N * batch_size, cudaMemcpyDeviceToHost);
if (status == CUBLAS_STATUS_SUCCESS) {
std::cout << "C =" << std::endl;
print_matrix(h_C, M, N, batch_size);
} else {
std::cout << status << std::endl;
}
// Destroy the handle
cublasDestroy(handle);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
cudaFreeHost(h_A);
cudaFreeHost(h_B);
cudaFreeHost(h_C);
}
This is the result when I ran this code.
A =
0 0 0 0
1 1 1 1
2 2 2 2
3 3 3 3
0 0 0 0
1 1 1 1
2 2 2 2
3 3 3 3
B =
4 4 4 4
5 5 5 5
6 6 6 6
7 7 7 7
4 4 4 4
5 5 5 5
6 6 6 6
7 7 7 7
C =
0 0 0 0
0 0 0 0
0 0 0 0
0 0 0 0
0 0 0 0
0 0 0 0
0 0 0 0
0 0 0 0
The problem is that I don't get the expected result: C comes out full of zeros.
Is there a problem with my code?

The matrix arguments that you pass to this cublas function need to be an array of device pointers, where each device pointer is properly allocated, properly copied to the device, and where each allocation is properly populated with data.
There are several problems with your attempt to do this. The central problem is around these lines:
cudaMemcpy(d_A, h_A, sizeof(float)* M * K * batch_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, sizeof(float)* K * N * batch_size, cudaMemcpyHostToDevice);
You cannot copy an array of independent allocations that way, and furthermore you are copying pointer data using sizes and types as if it were matrix contents (float * vs. float).
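In outline, each per-matrix buffer has to be copied individually, and the array of device pointers itself then has to be placed in device memory before it can be passed to the function. A minimal sketch of that pattern for the A operand (the d_dA name is the same one used in the full example below):
for (int i = 0; i < batch_size; i++)
    cudaMemcpy(d_A[i], h_A[i], sizeof(float)*M*K, cudaMemcpyHostToDevice); // copy each matrix separately
float **d_dA;  // device-resident copy of the array of device pointers
cudaMalloc(&d_dA, sizeof(float*)*batch_size);
cudaMemcpy(d_dA, d_A, sizeof(float*)*batch_size, cudaMemcpyHostToDevice);  // copy the pointers themselves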
The following example has various issues fixed:
$ cat t2167.cu
#include <iostream>
#include <cublas_v2.h>
#define M 4
#define N 4
#define K 4
//nvcc -lcublas -o matmul_gemmBatchedEx matmul_gemmBatchedEx.cu
void print_matrix(float **A, int rows, int cols, int batch_size) {
for (int i = 0; i < batch_size; i++){
for (int j = 0; j < rows; j++){
for(int k = 0; k < cols; k++){
std::cout << A[i][k * rows + j] << " ";
}
std::cout << std::endl;
}
std::cout << std::endl;
}
}
int main(int argc, char* argv[])
{
// Linear dimension of matrices
int batch_size = 2;
float *h_A[batch_size], *h_B[batch_size], *h_C[batch_size];
for (int i = 0; i < batch_size; i++){
h_A[i] = (float*)malloc(M * K * sizeof(float));
h_B[i] = (float*)malloc(K * N * sizeof(float));
h_C[i] = (float*)malloc(M * N * sizeof(float));
}
for (int i = 0; i < batch_size; i++){
for (int j = 0; j < M * K; j++)
h_A[i][j] = j%4;
for (int j = 0; j < K * N; j++)
h_B[i][j] = j%4 + 4;
for (int j = 0; j < M * N; j++)
h_C[i][j] = 0;
}
std::cout << "A =" << std::endl;
print_matrix(h_A, M, K, batch_size);
std::cout << "B =" << std::endl;
print_matrix(h_B, K, N, batch_size);
float *d_A[batch_size], *d_B[batch_size], *d_C[batch_size];
for (int i = 0; i < batch_size; i++){
cudaMalloc(&d_A[i], sizeof(float)* M * K);
cudaMemcpy(d_A[i], h_A[i], sizeof(float)*M*K, cudaMemcpyHostToDevice);
cudaMalloc(&d_B[i], sizeof(float)* K * N);
cudaMemcpy(d_B[i], h_B[i], sizeof(float)*N*K, cudaMemcpyHostToDevice);
cudaMalloc(&d_C[i], sizeof(float)* M * N);
cudaMemcpy(d_C[i], h_C[i], sizeof(float)*N*M, cudaMemcpyHostToDevice);
}
float **d_dA, **d_dB, **d_dC;
cudaMalloc(&d_dA, sizeof(float *)*batch_size);
cudaMalloc(&d_dB, sizeof(float *)*batch_size);
cudaMalloc(&d_dC, sizeof(float *)*batch_size);
cudaMemcpy(d_dA, d_A, sizeof(float*)* batch_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_dB, d_B, sizeof(float*)* batch_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_dC, d_C, sizeof(float*)* batch_size, cudaMemcpyHostToDevice);
cublasHandle_t handle;
cublasStatus_t status = cublasCreate(&handle);
// Set up the matrix dimensions and batch size
int lda = M;
int ldb = K;
int ldc = M;
// Set the alpha and beta parameters for the gemm operation
float alpha = 1.0f;
float beta = 0.0f;
status = cublasGemmBatchedEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K,
&alpha,
(const void**)d_dA, CUDA_R_32F, lda,
(const void**)d_dB, CUDA_R_32F, ldb,
&beta,
(void**)d_dC, CUDA_R_32F, ldc,
batch_size,
CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT);
for (int i = 0; i < batch_size; i++)
cudaMemcpy(h_C[i], d_C[i], sizeof(float)*M*N, cudaMemcpyDeviceToHost);
if (status == CUBLAS_STATUS_SUCCESS) {
std::cout << "C =" << std::endl;
print_matrix(h_C, M, N, batch_size);
} else {
std::cout << status << std::endl;
}
// Destroy the handle
cublasDestroy(handle);
cudaFree(d_dA);
cudaFree(d_dB);
cudaFree(d_dC);
for (int i = 0; i < batch_size; i++){
free(h_A[i]);
free(h_B[i]);
free(h_C[i]);
cudaFree(d_A[i]);
cudaFree(d_B[i]);
cudaFree(d_C[i]);}
}
$ nvcc -o t2167 t2167.cu -lcublas
$ compute-sanitizer ./t2167
========= COMPUTE-SANITIZER
A =
0 0 0 0
1 1 1 1
2 2 2 2
3 3 3 3
0 0 0 0
1 1 1 1
2 2 2 2
3 3 3 3
B =
4 4 4 4
5 5 5 5
6 6 6 6
7 7 7 7
4 4 4 4
5 5 5 5
6 6 6 6
7 7 7 7
C =
0 0 0 0
22 22 22 22
44 44 44 44
66 66 66 66
0 0 0 0
22 22 22 22
44 44 44 44
66 66 66 66
========= ERROR SUMMARY: 0 errors
$
Given that cublasGemmBatchedEx() requires all the matrices across the batch to be of identical size, and given that we are starting with a clean slate for this exercise, we can use a somewhat simpler realization: concatenate the matrices so that a single allocation per operand covers the whole batch. This won't work for every use case, but it may be of interest:
#include <iostream>
#include <cublas_v2.h>
#define M 4
#define N 4
#define K 4
//nvcc -lcublas -o matmul_gemmBatchedEx matmul_gemmBatchedEx.cu
void print_matrix(float *A, int rows, int cols, int batch_size) {
for (int i = 0; i < batch_size; i++){
for (int j = 0; j < rows; j++){
for(int k = 0; k < cols; k++){
std::cout << A[i*rows*cols+k * rows + j] << " ";
}
std::cout << std::endl;
}
std::cout << std::endl;
}
}
int main(int argc, char* argv[])
{
// Linear dimension of matrices
int batch_size = 2;
float *h_A, *h_B, *h_C;
h_A = (float*)malloc(M * K * sizeof(float) * batch_size);
h_B = (float*)malloc(K * N * sizeof(float) * batch_size);
h_C = (float*)malloc(M * N * sizeof(float) * batch_size);
for (int i = 0; i < batch_size; i++){
for (int j = 0; j < M * K; j++)
h_A[i*M*K+j] = j%4;
for (int j = 0; j < K * N; j++)
h_B[i*K*N+j] = j%4 + 4;
for (int j = 0; j < M * N; j++)
h_C[i*M*N+j] = 0;
}
std::cout << "A =" << std::endl;
print_matrix(h_A, M, K, batch_size);
std::cout << "B =" << std::endl;
print_matrix(h_B, K, N, batch_size);
float *d_A, *d_B, *d_C;
cudaMalloc(&d_A, sizeof(float)* M * K*batch_size);
cudaMemcpy(d_A, h_A, sizeof(float)*M*K*batch_size, cudaMemcpyHostToDevice);
cudaMalloc(&d_B, sizeof(float)* K * N*batch_size);
cudaMemcpy(d_B, h_B, sizeof(float)*N*K*batch_size, cudaMemcpyHostToDevice);
cudaMalloc(&d_C, sizeof(float)* M * N*batch_size);
cudaMemcpy(d_C, h_C, sizeof(float)*N*M*batch_size, cudaMemcpyHostToDevice);
float *h_dA[batch_size], *h_dB[batch_size], *h_dC[batch_size];
for (int i = 0; i < batch_size; i++){
h_dA[i] = d_A+i*M*K;
h_dB[i] = d_B+i*K*N;
h_dC[i] = d_C+i*M*N;}
float **d_dA, **d_dB, **d_dC;
cudaMalloc(&d_dA, sizeof(float *)*batch_size);
cudaMalloc(&d_dB, sizeof(float *)*batch_size);
cudaMalloc(&d_dC, sizeof(float *)*batch_size);
cudaMemcpy(d_dA, h_dA, sizeof(float*)* batch_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_dB, h_dB, sizeof(float*)* batch_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_dC, h_dC, sizeof(float*)* batch_size, cudaMemcpyHostToDevice);
cublasHandle_t handle;
cublasStatus_t status = cublasCreate(&handle);
// Set up the matrix dimensions and batch size
int lda = M;
int ldb = K;
int ldc = M;
// Set the alpha and beta parameters for the gemm operation
float alpha = 1.0f;
float beta = 0.0f;
status = cublasGemmBatchedEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K,
&alpha,
(const void**)d_dA, CUDA_R_32F, lda,
(const void**)d_dB, CUDA_R_32F, ldb,
&beta,
(void**)d_dC, CUDA_R_32F, ldc,
batch_size,
CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT);
cudaMemcpy(h_C, d_C, sizeof(float)*M*N*batch_size, cudaMemcpyDeviceToHost);
if (status == CUBLAS_STATUS_SUCCESS) {
std::cout << "C =" << std::endl;
print_matrix(h_C, M, N, batch_size);
} else {
std::cout << status << std::endl;
}
// Destroy the handle
cublasDestroy(handle);
cudaFree(d_dA);
cudaFree(d_dB);
cudaFree(d_dC);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
free(h_A);
free(h_B);
free(h_C);
}
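As an aside (this is not from the original answer): because this second version stores each batch member contiguously with a uniform stride, the pointer-array setup could be avoided entirely by using cublasGemmStridedBatchedEx, which takes just the base pointers plus the element stride between consecutive matrices. A sketch of the equivalent call, assuming the same d_A, d_B, d_C allocations as above:
status = cublasGemmStridedBatchedEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K,
&alpha,
d_A, CUDA_R_32F, lda, (long long)M*K,   // stride in elements between consecutive A matrices
d_B, CUDA_R_32F, ldb, (long long)K*N,
&beta,
d_C, CUDA_R_32F, ldc, (long long)M*N,
batch_size,
CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT);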

Related

How to fix cudaError 77 when copying back from device to host

I am writing a simple example program to test memcpy and kernel-run concurrency for a larger program. While writing this example, I stumbled upon error 77, aka cudaErrorIllegalAddress.
I read somewhere that this comes from the kernel accessing an invalid address, and not from the memcpy itself. So I tried indexing only the lowest element of my input array (0). The error remained.
As it is only a small sample program, I will provide the whole code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#define BLOCKS 32
#define THREADS 16
__global__ void kernel(double *d_in, double *d_out) {
int index = threadIdx.x + blockDim.x * blockIdx.x;
d_out[index] = d_in[index] + 5;
}
int main() {
const int GPU_N = 2;
const int data_size = 2048;
const int cycles = 2;
double *h_in, *h_out, *d_in, *d_out;
h_in = (double*)malloc(sizeof(double) * data_size);
h_out = (double*)malloc(sizeof(double) * data_size);
for (int i = 0; i < data_size; i++) {
h_in[i] = 21;
}
cudaError_t error;
printf("1\n");
for (int i = 0; i < cycles; i++) {
//cuMalloc
for (int j = 0; j < GPU_N; j++) {
cudaSetDevice(j);
cudaMalloc((void**)&d_in, sizeof(double) * data_size / 4);
cudaMalloc((void**)&d_out, sizeof(double) * data_size / 4);
printf("2\n");
}
for (int j = 0; j < GPU_N; j++) {
cudaSetDevice(j);
cudaMemcpyAsync(d_in, h_in, sizeof(double) * data_size / 4, cudaMemcpyHostToDevice);
printf("3\n");
}
for (int j = 0; j < GPU_N; j++) {
cudaSetDevice(j);
kernel<<< BLOCKS, THREADS, 0, 0 >>>(d_in, d_out);
error = cudaGetLastError();
printf("4\n");
}
for (int j = 0; j < GPU_N; j++) {
cudaSetDevice(j);
error = cudaMemcpyAsync(h_out, d_out, sizeof(double) * data_size / 4, cudaMemcpyDeviceToHost);
printf("D2H %i\n", error);
printf("5\n");
}
for (int j = 0; j < GPU_N; j++) {
cudaSetDevice(j);
cudaFree(d_in);
cudaFree(d_out);
printf("6\n");
}
}
for (int i = 0; i < data_size; i++) {
printf("%i\n", h_out[i]);
}
getchar();
}
So the output should be something like:
1
1
2
2
3
3
4
4
5
5
6
6
1
1
2
2
3
3
4
4
5
5
6
6
26
26
26
26
26
.....
and then the result repeated many times. It does so until the point where it has to print 5, then it outputs error 77. Also, the printed result is not 26 as expected, but -842150451.
There are several problems with this code.
As already pointed out in the comments, the printf format specifier here (%i) is wrong:
printf("%i\n", h_out[i]);
The quantity being printed is a double; an appropriate format specifier would be %f.
This code will not work (for GPU_N greater than 1):
for (int j = 0; j < GPU_N; j++) {
cudaSetDevice(j);
cudaMalloc((void**)&d_in, sizeof(double) * data_size / 4);
cudaMalloc((void**)&d_out, sizeof(double) * data_size / 4);
printf("2\n");
}
d_in and d_out are individual variables; you don't get to reuse them in this way. When this loop goes through its 2nd (or later) iteration, it will overwrite the pointer values that were previously assigned. Later on this results in trouble, because for at least one of your kernel launches you will be passing pointers to data that is not resident on that particular GPU (and this particular aspect of the problem is the proximal reason for the error 77 report).
One solution would be to provide arrays of pointers to make this work.
Some of the CUDA activity you are issuing in your loops may be asynchronous. Therefore, to be sure that your final printout of h_out shows the expected results, you should wait for all work on the GPU to be finished. One way to accomplish this is with another set of calls to cudaDeviceSynchronize(). (Whether or not cudaFree already synchronizes here, I think the suggestion is sensible and noteworthy; for the reasons indicated in the comments below, it is not strictly necessary to get expected results for this particular code, but for learning purposes I think it is important to point out.) This answer isn't intended to be a complete treatise on asynchronous work issuance; for that I suggest further study of the relevant questions here on the cuda tag and/or of the relevant CUDA sample codes.
Here's a modified code that has the above issues addressed (I have shortened the final print-out loop):
$ cat t1477.cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#define BLOCKS 32
#define THREADS 16
__global__ void kernel(double *d_in, double *d_out) {
int index = threadIdx.x + blockDim.x * blockIdx.x;
d_out[index] = d_in[index] + 5;
}
int main() {
const int GPU_N = 2;
const int data_size = 2048;
const int cycles = 2;
double *h_in, *h_out, *d_in[GPU_N], *d_out[GPU_N];
h_in = (double*)malloc(sizeof(double) * data_size);
h_out = (double*)malloc(sizeof(double) * data_size);
for (int i = 0; i < data_size; i++) {
h_in[i] = 21;
}
cudaError_t error;
printf("1\n");
for (int i = 0; i < cycles; i++) {
//cuMalloc
for (int j = 0; j < GPU_N; j++) {
cudaSetDevice(j);
cudaMalloc((void**)(&(d_in[j])), sizeof(double) * data_size / 4);
cudaMalloc((void**)(&(d_out[j])), sizeof(double) * data_size / 4);
printf("2\n");
}
for (int j = 0; j < GPU_N; j++) {
cudaSetDevice(j);
cudaMemcpyAsync(d_in[j], h_in, sizeof(double) * data_size / 4, cudaMemcpyHostToDevice);
printf("3\n");
}
for (int j = 0; j < GPU_N; j++) {
cudaSetDevice(j);
kernel<<< BLOCKS, THREADS, 0, 0 >>>(d_in[j], d_out[j]);
error = cudaGetLastError();
printf("4\n");
}
for (int j = 0; j < GPU_N; j++) {
cudaSetDevice(j);
error = cudaMemcpyAsync(h_out, d_out[j], sizeof(double) * data_size / 4, cudaMemcpyDeviceToHost);
printf("D2H %i\n", error);
printf("5\n");
}
for (int j = 0; j < GPU_N; j++) {
cudaSetDevice(j);
cudaFree(d_in[j]);
cudaFree(d_out[j]);
printf("6\n");
}
}
for (int i = 0; i < GPU_N; i++){
cudaSetDevice(i);
cudaDeviceSynchronize();}
for (int i = 0; i < 10; i++) {
printf("%f\n", h_out[i]);
}
}
$ nvcc -o t1477 t1477.cu
$ cuda-memcheck ./t1477
========= CUDA-MEMCHECK
1
2
2
3
3
4
4
D2H 0
5
D2H 0
5
6
6
2
2
3
3
4
4
D2H 0
5
D2H 0
5
6
6
26.000000
26.000000
26.000000
26.000000
26.000000
26.000000
26.000000
26.000000
26.000000
26.000000
========= ERROR SUMMARY: 0 errors
$
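As an aside (not part of the answer above): the D2H lines in the output print the cudaError_t as a raw integer; it can be turned into a readable message with cudaGetErrorString, for example:
printf("D2H %s\n", cudaGetErrorString(error));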

Incorrect addition of Prime numbers in CUDA [duplicate]

This question already has an answer here:
How to find the sum of array in CUDA by reduction
(1 answer)
Closed 3 years ago.
I used the reduction logic in my code by referring to How to find the sum of array in CUDA by reduction.
But it is giving some errors, and I am not seeing my mistake; could you please help me out?
Required specification:
1. CUDA toolkit v6.5
2. Graphics: GTX 210 (compute capability 1.2)
3. Visual Studio 2013
#include<stdio.h>
#include<cuda.h>
#include<malloc.h>
#include<conio.h>
#include<time.h>
#include<windows.h>
#define SIZE 10
#define N 100
__global__ void vectoreAdd(int *d_a, int *d_b, int *d_c)
{
__shared__ int sdata[256];
int i = threadIdx.x + (blockIdx.x*blockDim.x);
sdata[threadIdx.x] = d_a[i];
__syncthreads();
if (i<SIZE)
for (i = 2; i<SIZE; i++)
{
int counter = 0;
for (int j = 2; j<d_a[i]; j++)
{
if (d_a[i] % j == 0)
{
counter = 1; break;
}
}
if (counter == 0)
{
d_b[i] = d_a[i];
}
}
// do reduction in shared mem
for (int s = 1; s < blockDim.x; s *= 2)
{
int index = 2 * s * threadIdx.x;;
if (index < blockDim.x)
{
sdata[index] += sdata[index + s];
}
__syncthreads();
}
// write result for this block to global mem
if (threadIdx.x == 0)
atomicAdd(d_c, sdata[0]);
}
}
int main()
{
clock_t tic = clock();
int *a, *b, *summation=0, sum = 0,count=-1; //declare summation as double/long if needed
int *d_a, *d_b, *d_c;
//int blocks, block_size = 512;
int size = N * sizeof(int);
a = (int *)malloc(SIZE*sizeof(int));
b = (int *)malloc(SIZE*sizeof(int));
summation = (int *)malloc(SIZE*sizeof(int));
cudaMalloc((void**)&d_a, SIZE * sizeof(int));
cudaMalloc((void**)&d_b, SIZE * sizeof(int));
cudaMalloc((void**)&d_c, SIZE * sizeof(int));
for (int i = 1; i<SIZE; i++)
{
a[i] = i;
b[i] = 0;
}
cudaMemcpy(d_a, a, SIZE*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b, SIZE*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_c, c, SIZE*sizeof(int), cudaMemcpyHostToDevice);
/*blocks = SIZE / block_size;
if (SIZE% block_size != 0)
blocks++; */
dim3 blocksize(256); // create 1D threadblock
dim3 gridsize(N / blocksize.x); //create 1D grid
vectoreAdd << < gridsize, blocksize >> >(d_a, d_b, d_c);
//cudaThreadSynchronize();
cudaMemcpy(b, d_b, SIZE*sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(summation, d_c, SIZE*sizeof(int), cudaMemcpyDeviceToHost);
for (int m = 0; m < SIZE; m++)
{
if (b[m] != 0)
{
printf("\n prime no is:%d", b[m]);
count = count + 1;
}
}
printf("\n\n Total prime no. are: %d", count);
/* for (int j = 1; j<SIZE; j++)
{
sum = sum + b[j];
}*/
printf("\n \nsum of all prime no upto %d is:%d", SIZE, summation);
clock_t toc = clock();
printf("\n\nElapsed: %f seconds\n", (double)(toc - tic) / CLOCKS_PER_SEC);
free(a); free(b); free(summation);
cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
getchar(); return 0;
}
There are lots of mistakes in your code:
cudaMalloc((void**)&d_a, SIZE * sizeof(int));
should be:
cudaMalloc((void**)&d_a, N * sizeof(int)); //OR
cudaMalloc((void**)&d_a, size);
Here size is the value you already calculated but did not pass. The same applies to the malloc() calls in the host code.
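For illustration only, a sketch of what consistent host and device allocations would look like, using the size variable (N * sizeof(int)) that the question already computes:
a = (int *)malloc(size);
b = (int *)malloc(size);
cudaMalloc((void**)&d_a, size);
cudaMalloc((void**)&d_b, size);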

GPU reduction code only runs one time

I have been using the code sample supplied by Robert Crovella:
thrust::max_element slow in comparison cublasIsamax - More efficient implementation?
This is a very fast reduction code. I modified it to also return the index of the maximum in the input array of floats. When I use it in my code, it only works correctly the first time. If I try calling the routine again, it does not find a new max value; it just returns the previous max. Is there something about the volatile global memory that the routine uses that needs to be reset before it can be called again?
#include <cuda.h>
#include <cublas_v2.h>
#include <thrust/extrema.h>
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <stdio.h>
#include <stdlib.h>
#define DSIZE 4096*4 // nTPB should be a power-of-2
#define nTPB 512
#define MAX_KERNEL_BLOCKS 30
#define MAX_BLOCKS ((DSIZE/nTPB)+1)
#define MIN(a,b) ((a>b)?b:a)
#define FLOAT_MIN -1.0f
#include <helper_functions.h>
#include <helper_cuda.h>
// this code has been modified to return the index of the max instead of the actual max value - for my application
__device__ volatile float blk_vals[MAX_BLOCKS];
__device__ volatile int blk_idxs[MAX_BLOCKS];
__device__ int blk_num = 0;
//template <typename T>
__global__ void max_idx_kernel(const float *data, const int dsize, int *result){
__shared__ volatile float vals[nTPB];
__shared__ volatile int idxs[nTPB];
__shared__ volatile int last_block;
int idx = threadIdx.x+blockDim.x*blockIdx.x;
last_block = 0;
float my_val = FLOAT_MIN;
int my_idx = -1;
// sweep from global memory
while (idx < dsize){
if (data[idx] > my_val) {my_val = data[idx]; my_idx = idx;}
idx += blockDim.x*gridDim.x;}
// populate shared memory
vals[threadIdx.x] = my_val;
idxs[threadIdx.x] = my_idx;
__syncthreads();
// sweep in shared memory
for (int i = (nTPB>>1); i > 0; i>>=1){
if (threadIdx.x < i)
if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }
__syncthreads();}
// perform block-level reduction
if (!threadIdx.x){
blk_vals[blockIdx.x] = vals[0];
blk_idxs[blockIdx.x] = idxs[0];
if (atomicAdd(&blk_num, 1) == gridDim.x - 1) // then I am the last block
last_block = 1;}
__syncthreads();
if (last_block){
idx = threadIdx.x;
my_val = FLOAT_MIN;
my_idx = -1;
while (idx < gridDim.x){
if (blk_vals[idx] > my_val) {my_val = blk_vals[idx]; my_idx = blk_idxs[idx]; }
idx += blockDim.x;}
// populate shared memory
vals[threadIdx.x] = my_val;
idxs[threadIdx.x] = my_idx;
__syncthreads();
// sweep in shared memory
for (int i = (nTPB>>1); i > 0; i>>=1){
if (threadIdx.x < i)
if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }
__syncthreads();}
if (!threadIdx.x)
*result = idxs[0];
}
}
int main(){
int nrElements = DSIZE;
float *d_vector, *h_vector;
StopWatchInterface *hTimer = NULL;
sdkCreateTimer(&hTimer);
double gpuTime;
int k;
int max_index;
int *d_max_index;
cudaMalloc(&d_max_index, sizeof(int));
h_vector = new float[DSIZE];
for(k=0; k < 5; k++){
for (int i = 0; i < DSIZE; i++) h_vector[i] = rand()/(float)RAND_MAX;
h_vector[10+k] = 10; // create definite max element that changes with each loop iteration
cublasHandle_t my_handle;
cublasStatus_t my_status = cublasCreate(&my_handle);
cudaMalloc(&d_vector, DSIZE*sizeof(float));
cudaMemcpy(d_vector, h_vector, DSIZE*sizeof(float), cudaMemcpyHostToDevice);
max_index = 0;
sdkResetTimer(&hTimer);
sdkStartTimer(&hTimer);
//d_vector is a pointer on the device pointing to the beginning of the vector, containing nrElements floats.
thrust::device_ptr<float> d_ptr = thrust::device_pointer_cast(d_vector);
thrust::device_vector<float>::iterator d_it = thrust::max_element(d_ptr, d_ptr + nrElements);
max_index = d_it - (thrust::device_vector<float>::iterator)d_ptr;
cudaDeviceSynchronize();
gpuTime = sdkGetTimerValue(&hTimer);
std::cout << "loop: " << k << " thrust time: " << gpuTime << " max index: " << max_index << std::endl;
max_index = 0;
sdkResetTimer(&hTimer);
sdkStartTimer(&hTimer);
my_status = cublasIsamax(my_handle, DSIZE, d_vector, 1, &max_index);
cudaDeviceSynchronize();
gpuTime = sdkGetTimerValue(&hTimer);
std::cout << "loop: " << k << " cublas time: " << gpuTime << " max index: " << max_index-1 << std::endl;
max_index = 0;
sdkResetTimer(&hTimer);
sdkStartTimer(&hTimer);
max_idx_kernel<<<MIN(MAX_KERNEL_BLOCKS, ((DSIZE+nTPB-1)/nTPB)), nTPB>>>(d_vector, DSIZE, d_max_index);
cudaMemcpy(&max_index, d_max_index, sizeof(int), cudaMemcpyDeviceToHost);
gpuTime = sdkGetTimerValue(&hTimer);
std::cout << "loop: " << k << " idx kern time: " << gpuTime << " max index: " << max_index << std::endl;
std::cout << std::endl;
} // end for loop on k
cudaFree(d_max_index);
cudaFree(d_vector);
return 0;
}
The primary issue in re-using this code across multiple loop iterations as-is lies in this static initialization of a device (global) variable:
__device__ int blk_num = 0;
That's OK if you're only going to run the routine once. But if you intend to re-use it, you will need to re-initialize this variable to zero before each call to the kernel.
We could fix this by putting an explicit initialization of this variable to zero before each call to the reduction kernel:
cudaMemcpyToSymbol(blk_num, &max_index, sizeof(int));
(I'm using max_index here simply because it is a convenient host int variable that has just been set to zero.)
That's the only change needed to get the code "working".
However, the introduction of the loop has created some other "issues" that I would point out. These three lines of code:
cublasHandle_t my_handle;
cublasStatus_t my_status = cublasCreate(&my_handle);
cudaMalloc(&d_vector, DSIZE*sizeof(float));
don't belong inside the for-loop on k. That is effectively creating a memory leak and unnecessarily re-initializing the cublas library.
The following code has those changes and seems to work for me:
$ cat t1183.cu
#include <cuda.h>
#include <cublas_v2.h>
#include <thrust/extrema.h>
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <stdio.h>
#include <stdlib.h>
#define DSIZE 4096*4 // nTPB should be a power-of-2
#define nTPB 512
#define MAX_KERNEL_BLOCKS 30
#define MAX_BLOCKS ((DSIZE/nTPB)+1)
#define MIN(a,b) ((a>b)?b:a)
#define FLOAT_MIN -1.0f
#include <helper_functions.h>
#include <helper_cuda.h>
// this code has been modified to return the index of the max instead of the actual max value - for my application
__device__ volatile float blk_vals[MAX_BLOCKS];
__device__ volatile int blk_idxs[MAX_BLOCKS];
__device__ int blk_num;
//template <typename T>
__global__ void max_idx_kernel(const float *data, const int dsize, int *result){
__shared__ volatile float vals[nTPB];
__shared__ volatile int idxs[nTPB];
__shared__ volatile int last_block;
int idx = threadIdx.x+blockDim.x*blockIdx.x;
last_block = 0;
float my_val = FLOAT_MIN;
int my_idx = -1;
// sweep from global memory
while (idx < dsize){
if (data[idx] > my_val) {my_val = data[idx]; my_idx = idx;}
idx += blockDim.x*gridDim.x;}
// populate shared memory
vals[threadIdx.x] = my_val;
idxs[threadIdx.x] = my_idx;
__syncthreads();
// sweep in shared memory
for (int i = (nTPB>>1); i > 0; i>>=1){
if (threadIdx.x < i)
if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }
__syncthreads();}
// perform block-level reduction
if (!threadIdx.x){
blk_vals[blockIdx.x] = vals[0];
blk_idxs[blockIdx.x] = idxs[0];
if (atomicAdd(&blk_num, 1) == gridDim.x - 1) // then I am the last block
last_block = 1;}
__syncthreads();
if (last_block){
idx = threadIdx.x;
my_val = FLOAT_MIN;
my_idx = -1;
while (idx < gridDim.x){
if (blk_vals[idx] > my_val) {my_val = blk_vals[idx]; my_idx = blk_idxs[idx]; }
idx += blockDim.x;}
// populate shared memory
vals[threadIdx.x] = my_val;
idxs[threadIdx.x] = my_idx;
__syncthreads();
// sweep in shared memory
for (int i = (nTPB>>1); i > 0; i>>=1){
if (threadIdx.x < i)
if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }
__syncthreads();}
if (!threadIdx.x)
*result = idxs[0];
}
}
int main(){
int nrElements = DSIZE;
float *d_vector, *h_vector;
StopWatchInterface *hTimer = NULL;
sdkCreateTimer(&hTimer);
double gpuTime;
int k;
int max_index;
int *d_max_index;
cudaMalloc(&d_max_index, sizeof(int));
h_vector = new float[DSIZE];
cublasHandle_t my_handle;
cublasStatus_t my_status = cublasCreate(&my_handle);
cudaMalloc(&d_vector, DSIZE*sizeof(float));
for(k=0; k < 5; k++){
for (int i = 0; i < DSIZE; i++) h_vector[i] = rand()/(float)RAND_MAX;
h_vector[10+k] = 10; // create definite max element that changes with each loop iteration
cudaMemcpy(d_vector, h_vector, DSIZE*sizeof(float), cudaMemcpyHostToDevice);
max_index = 0;
sdkResetTimer(&hTimer);
sdkStartTimer(&hTimer);
//d_vector is a pointer on the device pointing to the beginning of the vector, containing nrElements floats.
thrust::device_ptr<float> d_ptr = thrust::device_pointer_cast(d_vector);
thrust::device_vector<float>::iterator d_it = thrust::max_element(d_ptr, d_ptr + nrElements);
max_index = d_it - (thrust::device_vector<float>::iterator)d_ptr;
cudaDeviceSynchronize();
gpuTime = sdkGetTimerValue(&hTimer);
std::cout << "loop: " << k << " thrust time: " << gpuTime << " max index: " << max_index << std::endl;
max_index = 0;
sdkResetTimer(&hTimer);
sdkStartTimer(&hTimer);
my_status = cublasIsamax(my_handle, DSIZE, d_vector, 1, &max_index);
cudaDeviceSynchronize();
gpuTime = sdkGetTimerValue(&hTimer);
std::cout << "loop: " << k << " cublas time: " << gpuTime << " max index: " << max_index-1 << std::endl;
max_index = 0;
sdkResetTimer(&hTimer);
sdkStartTimer(&hTimer);
cudaMemcpyToSymbol(blk_num, &max_index, sizeof(int));
max_idx_kernel<<<MIN(MAX_KERNEL_BLOCKS, ((DSIZE+nTPB-1)/nTPB)), nTPB>>>(d_vector, DSIZE, d_max_index);
cudaMemcpy(&max_index, d_max_index, sizeof(int), cudaMemcpyDeviceToHost);
gpuTime = sdkGetTimerValue(&hTimer);
std::cout << "loop: " << k << " idx kern time: " << gpuTime << " max index: " << max_index << std::endl;
std::cout << std::endl;
} // end for loop on k
cudaFree(d_max_index);
cudaFree(d_vector);
return 0;
}
$ nvcc -I/usr/local/cuda/samples/common/inc t1183.cu -o t1183 -lcublas
$ cuda-memcheck ./t1183
========= CUDA-MEMCHECK
loop: 0 thrust time: 2.806 max index: 10
loop: 0 cublas time: 0.441 max index: 10
loop: 0 idx kern time: 0.395 max index: 10
loop: 1 thrust time: 1.298 max index: 11
loop: 1 cublas time: 0.419 max index: 11
loop: 1 idx kern time: 0.424 max index: 11
loop: 2 thrust time: 1.303 max index: 12
loop: 2 cublas time: 0.43 max index: 12
loop: 2 idx kern time: 0.419 max index: 12
loop: 3 thrust time: 1.291 max index: 13
loop: 3 cublas time: 0.423 max index: 13
loop: 3 idx kern time: 0.415 max index: 13
loop: 4 thrust time: 1.299 max index: 14
loop: 4 cublas time: 0.423 max index: 14
loop: 4 idx kern time: 0.417 max index: 14
========= ERROR SUMMARY: 0 errors
$
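As an aside (a possible variation, not part of the answer above): since the last block only runs after every other block has performed its atomicAdd on blk_num, thread 0 of the last block could also reset the counter itself at the end of the kernel, making the kernel re-launchable without the host-side cudaMemcpyToSymbol reset. The tail of max_idx_kernel would then look like:
if (!threadIdx.x){
*result = idxs[0];
blk_num = 0;   // reset the block counter so the next launch starts from zero
}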

Summing the rows of a matrix (stored in either row-major or column-major order) in CUDA

I'm working on the problem of summing the rows of a matrix in CUDA. I'm giving the following example.
Suppose we have the following 20 x 4 array:
1 2 3 4
4 1 2 3
3 4 1 2
...
1 2 3 4
...
2 1 3 4
After flattening the 2D array to a 1D array (in either row-major or column-major order), I need to assign each thread to a different row and calculate the cost for that row.
For example
- thread 1 should calculate the cost for 1 2 3 4
- thread 2 should calculate the cost for 4 1 2 3
How can I do that in CUDA?
Thank you all for the replies.
#include <stdio.h>
#include <stdlib.h>
#define MROWS 20
#define NCOLS 4
#define nTPB 256
__global__ void mykernel(int *costdata, int rows, int cols, int *results){
int tidx = threadIdx.x + blockDim.x*blockIdx.x;
if (tidx < rows){
int mycost = 0;
for (int i = 0; i < cols; i++)
mycost += costdata[(tidx*cols)+i];
results[tidx] = mycost;
}
}
int main(){
//define and initialize host and device storage for cost and results
int *d_costdata, *h_costdata, *d_results, *h_results;
h_results = (int *)malloc(MROWS*sizeof(int));
h_costdata = (int *)malloc(MROWS*NCOLS*sizeof(int));
for (int i=0; i<(MROWS*NCOLS); i++)
h_costdata[i] = rand()%4;
cudaMalloc((void **)&d_results, MROWS*sizeof(int));
cudaMalloc((void **)&d_costdata, MROWS*NCOLS*sizeof(int));
//copy cost data from host to device
cudaMemcpy(d_costdata, h_costdata, MROWS*NCOLS*sizeof(int), cudaMemcpyHostToDevice);
mykernel<<<(MROWS + nTPB - 1)/nTPB, nTPB>>>(d_costdata, MROWS, NCOLS, d_results);
// copy results back from device to host
cudaMemcpy(h_results, d_results, MROWS*sizeof(int), cudaMemcpyDeviceToHost);
for (int i=0; i<MROWS; i++){
int loc_cost = 0;
for (int j=0; j<NCOLS; j++) loc_cost += h_costdata[(i*NCOLS)+j];
printf("cost[%d]: host= %d, device = %d\n", i, loc_cost, h_results[i]);
}
}
This assumes "cost" of each row is just the sum of the elements in each row. If you have a different "cost" function, you can modify the activity in the kernel for-loop accordingly. This also assumes C-style row-major data storage (1 2 3 4 4 1 2 3 3 4 1 2 etc.)
If you instead use column-major storage (1 4 3 etc.), you can slightly improve the performance, since the data reads can be fully coalesced. Then your kernel code could look like this:
for (int i = 0; i < cols; i++)
mycost += costdata[(i*rows)+tidx];
You should also use proper cuda error checking on all CUDA API calls and kernel calls.
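A minimal form of that checking (just a sketch, not part of the original code) around the copy and the kernel launch might look like this:
cudaError_t err = cudaMemcpy(d_costdata, h_costdata, MROWS*NCOLS*sizeof(int), cudaMemcpyHostToDevice);
if (err != cudaSuccess) printf("memcpy error: %s\n", cudaGetErrorString(err));
mykernel<<<(MROWS + nTPB - 1)/nTPB, nTPB>>>(d_costdata, MROWS, NCOLS, d_results);
err = cudaGetLastError();        // catches kernel launch/configuration errors
if (err != cudaSuccess) printf("kernel launch error: %s\n", cudaGetErrorString(err));
err = cudaDeviceSynchronize();   // catches errors that occur during kernel execution
if (err != cudaSuccess) printf("kernel execution error: %s\n", cudaGetErrorString(err));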
EDIT: As discussed in the comments below, for the row-major storage case, in some situations it may improve memory efficiency to load 16-byte quantities rather than the base type. The following is a modified version that implements this idea for arbitrary dimensions and (more or less) arbitrary base types:
#include <iostream>
#include <typeinfo>
#include <cstdlib>
#include <vector_types.h>
#define MROWS 1742
#define NCOLS 801
#define nTPB 256
typedef double mytype;
__host__ int sizetype(){
int size = 0;
if ((typeid(mytype) == typeid(float)) || (typeid(mytype) == typeid(int)) || (typeid(mytype) == typeid(unsigned int)))
size = 4;
else if (typeid(mytype) == typeid(double))
size = 8;
else if ((typeid(mytype) == typeid(unsigned char)) || (typeid(mytype) == typeid(char)))
size = 1;
return size;
}
template<typename T>
__global__ void mykernel(const T *costdata, int rows, int cols, T *results, int size, size_t pitch){
int chunk = 16/size; // assumes size is a factor of 16
int tidx = threadIdx.x + blockDim.x*blockIdx.x;
if (tidx < rows){
T *myrowptr = (T *)(((unsigned char *)costdata) + tidx*pitch);
T mycost = (T)0;
int count = 0;
while (count < cols){
if ((cols-count)>=chunk){
// read 16 bytes
int4 temp = *((int4 *)(myrowptr + count));
int bcount = 16;
int j = 0;
while (bcount > 0){
mycost += *(((T *)(&temp)) + j++);
bcount -= size;
count++;}
}
else {
// read one quantity at a time
for (; count < cols; count++)
mycost += myrowptr[count];
}
results[tidx] = mycost;
}
}
}
int main(){
int typesize = sizetype();
if (typesize == 0) {std::cout << "invalid type selected" << std::endl; return 1;}
//define and initialize host and device storage for cost and results
mytype *d_costdata, *h_costdata, *d_results, *h_results;
h_results = (mytype *)malloc(MROWS*sizeof(mytype));
h_costdata = (mytype *)malloc(MROWS*NCOLS*sizeof(mytype));
for (int i=0; i<(MROWS*NCOLS); i++)
h_costdata[i] = (mytype)(rand()%4);
size_t pitch = 0;
cudaMalloc((void **)&d_results, MROWS*sizeof(mytype));
cudaMallocPitch((void **)&d_costdata, &pitch, NCOLS*sizeof(mytype), MROWS);
//copy cost data from host to device
cudaMemcpy2D(d_costdata, pitch, h_costdata, NCOLS*sizeof(mytype), NCOLS*sizeof(mytype), MROWS, cudaMemcpyHostToDevice);
mykernel<<<(MROWS + nTPB - 1)/nTPB, nTPB>>>(d_costdata, MROWS, NCOLS, d_results, typesize, pitch);
// copy results back from device to host
cudaMemcpy(h_results, d_results, MROWS*sizeof(mytype), cudaMemcpyDeviceToHost);
for (int i=0; i<MROWS; i++){
mytype loc_cost = (mytype)0;
for (int j=0; j<NCOLS; j++) loc_cost += h_costdata[(i*NCOLS)+j];
if ((i < 10) && (typesize > 1))
std::cout <<"cost[" << i << "]: host= " << loc_cost << ", device = " << h_results[i] << std::endl;
if (loc_cost != h_results[i]){ std::cout << "mismatch at index" << i << "should be:" << loc_cost << "was:" << h_results[i] << std::endl; return 1; }
}
std::cout << "Results are correct!" << std::endl;
}

Matrix Multiplication giving wrong output [duplicate]

This question already has an answer here:
Unable to execute device kernel in CUDA
(1 answer)
Closed 7 years ago.
What I am attempting to do is multiply matrix A and matrix B, and then, from the product matrix, get the index of the maximum value per column. Unfortunately, only the first 128*128 values of the matrix multiplication are correct, while the others are just garbage. I do not quite understand why this happens. I request you to kindly guide me with this.
#include<stdio.h>
#include "cuda.h"
#include<stdlib.h>
#define blockD 32
const int wA = 128;
const int hA = 4096;
const int wB = 4096;
const int hB = wA;
main(void){
void MatrixMultiplication(float *, float *, float *, float *);
int size_A = wA * hA * sizeof(float);
int size_B = wB * hB * sizeof(float);
int size_C = wB * hA * sizeof(float);
int size_max = 2 * wB * sizeof(float);
float *M, *N, *P, *C;
// allocate memory on the CPU
M = (float*)malloc(size_A);
N = (float*)malloc(size_B);
P = (float*)malloc(size_max);
C = (float*)malloc(size_C);
// initialize the matrices
for (int y=0; y < hA; y++) {
for (int x=0; x < wA; x++){
M[y*wA + x] = 32; //x + y*wA;
}
}
for (int y=0; y<hB; y++) {
for (int x=0; x<wB; x++){
N[y*wB + x] = 21; //x + y*wB;
}
}
MatrixMultiplication(M, N, P, C);
//Write
FILE *f1;
int i,j;
f1 = fopen("C.txt","w");
for(i = hA - 2 ; i < hA; i ++){
for(j = 0; j < wB; j++){
fprintf(f1,"%d\t",int(C[i*wB + j]));
}
fprintf(f1,"\n");
}
fclose(f1);
// free the memory allocated on the CPU
free( M );
free( N );
free( P );
free( C );
cudaDeviceReset();
return 0;
}
__device__ void MaxFunction(float* Pd, float* max)
{
int x = (threadIdx.x + blockIdx.x * blockDim.x);
int y = (threadIdx.y + blockIdx.y * blockDim.y);
int k = 0;
int temp = 0; int temp_idx = 0;
for (k = 0; k < wB; ++k) {
if(Pd[x*wB + k] > temp){
temp = Pd[x*wB + k];
temp_idx = x*wB + k;
}
}
max[y*2 + 0] = temp;
max[y*2 + 1] = temp_idx;
}
__global__ void MatrixMulKernel(float* Md, float* Nd, float* Pd, float* max)
{
// declare cache in the shared memory
__shared__ float Mds[blockD][blockD];
__shared__ float Nds[blockD][blockD];
float Pvalue = 0;
// Loop over the Md and Nd block dimension required to compute the Pd element
for (int m = (wA * blockD * blockIdx.y), n = (blockD * blockIdx.x);
m < ((wA * blockD * blockIdx.y)+wA-1);
m += blockD, n += (blockD*hB)){
// collaboratively loading of Md and Nd blocks into shared memory
Mds[threadIdx.y][threadIdx.x] = Md[m + wA * threadIdx.y + threadIdx.x];
Nds[threadIdx.y][threadIdx.x] = Nd[n + wA * threadIdx.y + threadIdx.x];
__syncthreads();
// keep track of the running sum
for (int k = 0; k < blockD; k++)
Pvalue += Mds[threadIdx.y][k] * Nds[k][threadIdx.x];
__syncthreads();
}
// write back to the global memory
int p = hB * blockD * blockIdx.y + blockD * blockIdx.x;
Pd[p + hB * threadIdx.y + threadIdx.x] = Pvalue;
__syncthreads();
MaxFunction(Pd, max);
}
void MatrixMultiplication(float *M, float *N, float *P, float *C) {
int size_A = wA * hA * sizeof(float);
int size_B = wB * hB * sizeof(float);
int size_C = wB * hA * sizeof(float);
int size_max = 2 * wB * sizeof(float);
float *Md, *Nd, *Pd, *max;
// allocate memory on the GPU
cudaMalloc((void**)&Md, size_A);
cudaMalloc((void**)&Nd, size_B);
cudaMalloc((void**)&Pd, size_C);
cudaMalloc((void**)&max, size_max);
// transfer M and N to device memory
cudaMemcpy(Md, M, size_A, cudaMemcpyHostToDevice);
cudaMemcpy(Nd, N, size_B, cudaMemcpyHostToDevice);
// kernel invocation code
dim3 dimBlock(blockD, blockD);
dim3 dimGrid(wA/blockD, hB/blockD);
//Execute Kernel
MatrixMulKernel<<<dimGrid, dimBlock>>>( Md, Nd, Pd, max);
// transfer P from device
cudaMemcpy(P, max, size_max, cudaMemcpyDeviceToHost);
cudaMemcpy(C, Pd, size_C, cudaMemcpyDeviceToHost);
// free the memory allocated on the GPU
cudaFree(Md);
cudaFree(Nd);
cudaFree(Pd);
cudaFree(max);
}
In your code you seem to have more than one problem. One of the problems is that, in place of this:
dim3 dimGrid(wA/blockD, hB/blockD);
You should have this:
dim3 dimGrid(wB/blockD, hA/blockD);
Ultimately you need one thread in your grid for each output point. Your formulation was giving you a grid of 4 blocks by 4 blocks, whereas you need a grid of 128 blocks by 128 blocks.
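With the constants from the question (wA = 128, hA = 4096, wB = 4096, hB = wA = 128, blockD = 32), the arithmetic works out as follows (shown only for illustration):
// original:  dim3 dimGrid(wA/blockD, hB/blockD)  ->  (128/32,  128/32)  = (4, 4)     blocks
// corrected: dim3 dimGrid(wB/blockD, hA/blockD)  ->  (4096/32, 4096/32) = (128, 128) blocks
dim3 dimGrid(wB/blockD, hA/blockD);  // one 32x32 block per 32x32 tile of the 4096x4096 output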
The other problem I found with your code was in these lines in the kernel:
int p = hB * blockD * blockIdx.y + blockD * blockIdx.x;
Pd[p + hB * threadIdx.y + threadIdx.x] = Pvalue;
They are not indexing properly through the output array. Rather than try to sort it out using your scheme, I used this instead:
Pd[(threadIdx.x + (blockIdx.x * blockDim.x)) + ((threadIdx.y + (blockIdx.y * blockDim.y))*(gridDim.x*blockDim.x))] = Pvalue;
When I made the above two changes to your code, I got what I believe are correct results throughout the array, and it took about 32 seconds on my machine to run. (Note that I haven't tried fixing your original max-finding code; see below for a better approach.)
Based on your previous question, you seemed to be concerned about speed. If you want to do a fast matrix multiply, you should use cublas. The following code shows how to use cublas to multiply two ordinary C-style matrices (they don't have to be square). I've also included a column-max finding kernel that will be fast when the number of columns is large (say, over 500 or so; you have 4096 columns in your example). For small numbers of columns there may be quicker ways to perform this function, but a small number of columns also suggests that the overall problem size may be small, so speed (of this piece of code) will not really be an issue.
Here's the code:
#include <stdio.h>
#include <cublas_v2.h>
#define VERBOSE 1
#define nTPB 64
#define ROW_A 4
#define COL_A 4
#define ROW_B COL_A
#define COL_B 4
#define ROW_C ROW_A
#define COL_C COL_B
#define SIZ_A (ROW_A*COL_A)
#define SIZ_B (ROW_B*COL_B)
#define SIZ_C (ROW_C*COL_C)
// error check macros
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
// for CUBLAS V2 API
#define cublasCheckErrors(fn) \
do { \
cublasStatus_t __err = fn; \
if (__err != CUBLAS_STATUS_SUCCESS) { \
fprintf(stderr, "Fatal cublas error: %d (at %s:%d)\n", \
(int)(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
__global__ void col_max(float *mat, float *max, unsigned int *midx, unsigned int rows, unsigned int cols){
int idx = threadIdx.x + blockDim.x*blockIdx.x;
if (idx < cols){
float tempmax = mat[idx];
unsigned int tempmidx = 0;
for (int i = 1; i< rows; i++)
if (mat[idx + (i*cols)] > tempmax){
tempmax = mat[idx + (i*cols)];
tempmidx = i;}
max[idx] = tempmax;
midx[idx] = tempmidx;
}
}
int main(){
float *h_A, *h_B, *h_C, *d_A, *d_B, *d_C, *h_max, *d_max;
unsigned int *h_idx, *d_idx;
h_A = (float *)malloc(SIZ_A*sizeof(float));
if (h_A==0) {printf("malloc fail\n"); return -1;}
h_B = (float *)malloc(SIZ_B*sizeof(float));
if (h_B==0) {printf("malloc fail\n"); return -1;}
h_C = (float *)malloc(SIZ_C*sizeof(float));
if (h_C==0) {printf("malloc fail\n"); return -1;}
h_max = (float *)malloc(COL_C*sizeof(float));
if (h_max==0) {printf("malloc fail\n"); return -1;}
h_idx = (unsigned int*)malloc(COL_C*sizeof(unsigned int));
if (h_idx==0) {printf("malloc fail\n"); return -1;}
cudaMalloc((void **)&d_A, SIZ_A*sizeof(float));
cudaMalloc((void **)&d_B, SIZ_B*sizeof(float));
cudaMalloc((void **)&d_C, SIZ_C*sizeof(float));
cudaMalloc((void **)&d_max, COL_C*sizeof(float));
cudaMalloc((void **)&d_idx, COL_C*sizeof(unsigned int));
cudaCheckErrors("cuda malloc fail");
// initialize data
for (int i=0; i< SIZ_A; i++) h_A[i] = (float)(i+1);
for (int i=0; i< SIZ_B; i++) h_B[i] = (float)(i+2);
cudaMemcpy(d_A, h_A, SIZ_A*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, SIZ_B*sizeof(float), cudaMemcpyHostToDevice);
cudaCheckErrors("cuda memcpy 1 fail");
const float alpha = 1.0f;
const float beta = 0.0f;
cublasHandle_t handle;
cublasCheckErrors(cublasCreate(&handle));
// C = A*B
// due to cublas expecting column-major storage, parameters
// are scrambled
cublasCheckErrors(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, COL_B, ROW_A, COL_A, &alpha, d_B, COL_B, d_A, COL_A, &beta, d_C, COL_C));
cudaMemcpy(h_C, d_C, SIZ_C*sizeof(float), cudaMemcpyDeviceToHost);
cudaCheckErrors("cuda memcpy 2 fail");
col_max<<<(COL_C + nTPB - 1)/nTPB, nTPB>>>(d_C, d_max, d_idx, ROW_C, COL_C);
cudaCheckErrors("kernel launch fail");
cudaMemcpy(h_max, d_max, COL_C*sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(h_idx, d_idx, COL_C*sizeof(unsigned int), cudaMemcpyDeviceToHost);
cudaCheckErrors("cuda memcpy 3 fail/kernel fail");
if (VERBOSE){
printf("A: \n");
for (int i=0; i< ROW_A; i++){
for (int j=0; j< COL_A; j++)
printf("%7.5G", h_A[j+(i*COL_A)]);
printf("\n");}
printf("B: \n");
for (int i=0; i< ROW_B; i++){
for (int j=0; j< COL_B; j++)
printf("%7.5G", h_B[j+(i*COL_B)]);
printf("\n");}
printf("C = A*B: \n");
for (int i=0; i< ROW_C; i++){
for (int j=0; j< COL_C; j++)
printf("%7.5G", h_C[j+(i*COL_C)]);
printf("\n");}
printf("COLUMN MAX:\n");
for (int i=0; i< COL_C; i++)
printf("%7.5G", h_max[i]);
printf("\nCOLUMN MAX IDX:\n");
for (int i=0; i< COL_C; i++)
printf("%7d", h_idx[i]);
}
printf("\n finished!\n");
return 0;
}
Here's what I used to compile:
$ nvcc -arch=sm_20 -O3 -o t221 t221.cu -lcublas
And here's the sample output:
$ cuda-memcheck ./t221
========= CUDA-MEMCHECK
A:
1 2 3 4
5 6 7 8
9 10 11 12
13 14 15 16
B:
2 3 4 5
6 7 8 9
10 11 12 13
14 15 16 17
C = A*B:
100 110 120 130
228 254 280 306
356 398 440 482
484 542 600 658
COLUMN MAX:
484 542 600 658
COLUMN MAX IDX:
3 3 3 3
finished!
========= ERROR SUMMARY: 0 errors
$
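A brief note on why the cublasSgemm parameters above are "scrambled": a row-major matrix occupies the same memory as its transpose viewed in column-major order, so cublas sees the row-major arrays d_A and d_B as A-transposed and B-transposed. Passing d_B first therefore makes cublas compute the column-major product B^T * A^T = (A*B)^T, and that column-major result is byte-for-byte the row-major A*B we want, with no data actually transposed or copied:
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, COL_B, ROW_A, COL_A, &alpha, d_B, COL_B, d_A, COL_A, &beta, d_C, COL_C);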
When I extended my code to handle the same sizes you indicated, (A = 4096x128, B=128x4096) it took about 1 second on my machine. So it's much faster than your code. However, when I take your code and comment out your call to MaxFunction in the kernel, it also only takes about 1 second to compute the matrix multiply result. So if you wanted to keep your matrix multiply code (i.e. not use cublas) you could break the code into 2 kernels, and use your multiply routine in the first kernel with my max-finding routine (col_max) in the second kernel, and also probably get a pretty fast result.
As @talonmies indicated, if you are running on a Windows machine, be sure you are aware of the ramifications of the Windows TDR mechanism.