Take average over blocks in CUDA

The CUDA documentation says that shared memory can only be shared by threads in the same block, but a block can have at most 1024 threads. What if I have a huge matrix and want to take the average of its elements using as many threads as possible?
Take this as an example (I didn't use the maximum number of threads per block; it's just a demo):
#include <iostream>
#include <stdio.h>
__global__ void
kernel(int *a, int dimx, int dimy)
{
    int ix = blockDim.x * blockIdx.x + threadIdx.x;
    int iy = blockDim.y * blockIdx.y + threadIdx.y;
    int idx = iy * dimx + ix;

    __shared__ int array[64];

    a[idx] = a[idx] + 1;
    array[idx] = a[idx];

    __syncthreads();

    int sum = 0;
    for (int i = 0; i < dimx*dimy; i++)
    {
        sum += array[i];
    }
    int average = sum / (dimx*dimy + 1.0f);
    a[idx] = average;
}
int
main()
{
    int dimx = 8;
    int dimy = 8;
    int num_bytes = dimx*dimy*sizeof(int);

    int *d_a = 0, *h_a = 0; // device and host pointers
    h_a = (int*)malloc(num_bytes);

    for (int i = 0; i < dimx*dimy; i++){
        *(h_a + i) = i;
    }

    cudaMalloc( (void**)&d_a, num_bytes );
    //cudaMemset( d_a, 0, num_bytes );
    cudaMemcpy( d_a, h_a, num_bytes, cudaMemcpyHostToDevice);

    dim3 grid, block;
    block.x = 4;
    block.y = 4;
    grid.x = dimx / block.x;
    grid.y = dimy / block.y;

    kernel<<<grid, block>>>(d_a, dimx, dimy);

    cudaMemcpy( h_a, d_a, num_bytes, cudaMemcpyDeviceToHost );

    std::cout << "the array a is:" << std::endl;
    for (int row = 0; row < dimy; row++)
    {
        for (int col = 0; col < dimx; col++)
        {
            std::cout << h_a[row * dimx + col] << " ";
        }
        std::cout << std::endl;
    }

    free(h_a);
    cudaFree(d_a);
}
I create four blocks, and want the results to be the average over all of them. Now the result is:
the array a is:
3 3 3 3 4 4 4 4
3 3 3 3 4 4 4 4
3 3 3 3 4 4 4 4
3 3 3 3 4 4 4 4
11 11 11 11 12 12 12 12
11 11 11 11 12 12 12 12
11 11 11 11 12 12 12 12
11 11 11 11 12 12 12 12
Each block has its own average, rather than the overall average. How can I take the average over all the blocks?
I'm new to CUDA. Any relevant answer is welcome.

The easiest way is to launch multiple kernels: compute your per-block average, write those partial results out to global memory, then launch another kernel to work on the per-block results from the previous kernel. Depending on your data dimensions you might have to repeat this multiple times.
e.g. (in pseudo-code)
template <typename T>
__global__ void reduce(T* data, T* block_avgs)
{
    // find the per-block average, write it out to block_avgs
    // ...
}

// in your caller:
loop while you have more than 1 block:
    call kernel using result from prev. iteration
    update grid_dim and block_dim
This is necessary as there's no inter-block synchronization in CUDA. Your problem is a pretty straightforward application of reduction. Take a look at the parallel reduction sample on the NVIDIA samples page to get a better feel for reductions.
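For concreteness, here is a minimal two-pass sketch of that idea (my own illustration, not code from the answer above; the kernel name blockSum, the 16-thread blocks and the 64-element input are arbitrary choices that mirror the question). The first launch produces one partial sum per block; the second launch, with a single block, reduces the partial sums, after which the host divides by the element count to get the average.

#include <cstdio>

// Both passes use the same kernel: each block reduces its slice of the
// input into a single partial sum via a shared-memory tree reduction.
__global__ void blockSum(const int *in, int *partial, int n)
{
    __shared__ int s[256];          // assumes blockDim.x is a power of two, <= 256
    int tid = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + tid;
    s[tid] = (idx < n) ? in[idx] : 0;
    __syncthreads();
    for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
        if (tid < stride) s[tid] += s[tid + stride];
        __syncthreads();
    }
    if (tid == 0) partial[blockIdx.x] = s[0];   // one value per block
}

int main()
{
    const int n = 64, threads = 16;
    const int blocks = (n + threads - 1) / threads;   // 4 blocks, as in the question
    int h[n];
    for (int i = 0; i < n; i++) h[i] = i;

    int *d_in, *d_partial;
    cudaMalloc((void**)&d_in, n * sizeof(int));
    cudaMalloc((void**)&d_partial, blocks * sizeof(int));
    cudaMemcpy(d_in, h, n * sizeof(int), cudaMemcpyHostToDevice);

    blockSum<<<blocks, threads>>>(d_in, d_partial, n);       // pass 1: per-block sums
    blockSum<<<1, blocks>>>(d_partial, d_partial, blocks);   // pass 2: sum of partial sums

    int total;
    cudaMemcpy(&total, d_partial, sizeof(int), cudaMemcpyDeviceToHost);
    printf("average = %f\n", total / (float)n);

    cudaFree(d_in);
    cudaFree(d_partial);
    return 0;
}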

About Cuda 1D convolution, How can I do this faster? [closed]

int threads = 32;
dim3 blocks(250000/31,129,50);
coefsize = 129;
__global__ void D_Conv(float *in, float* coef, float *out, int coefsize)
{
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    int j = blockIdx.y; //129
    int k = blockIdx.z; //50
    if (j < coefsize && i < 250000 && k < 50)
    {
        if (i - j >= 0 && i - j < 250000)
        {
            atomicAdd(&out[k*250000 + i], coef[j] * in[k*250000 + i - j]);
        }
    }
}
Many people recommend doing convolution with an FFT, but in this case the two arrays' sizes differ widely (129 vs. 250000), so FFT-based convolution is slower than this method.
I don't believe atomics should be necessary here. The only thread clashing you would have is in the y dimension, so we can simply reduce your overall grid (in y) and convert the operation to a loop computing a running sum. You have plenty of threads in your grid to saturate any GPU, even without the y dimension.
Here's an example:
$ cat t20.cu
#include <iostream>
#define TOL 0.1
__global__ void D_Conv(float *in, float* coef, float *out, int coefsize)
{
int i = blockIdx.x*blockDim.x + threadIdx.x;
int j = blockIdx.y; //129
int k = blockIdx.z; //50
if (j < coefsize && i < 250000 && k < 50)
{
if (i - j >= 0 && i - j < 250000)
{
atomicAdd(&out[k*250000 + i], coef[j] * in[k*250000 + i - j]);
}
}
}
__global__ void D_Conv_i(float *in, float* coef, float *out, int coefsize)
{
int i = blockIdx.x*blockDim.x + threadIdx.x;
//int j = blockIdx.y; //129
int k = blockIdx.z; //50
if (i < 250000 && k < 50)
{
float s = 0;
for (int j = 0; j < 129; j++)
if (i - j >= 0 && i - j < 250000) s += coef[j] * in[k*250000 + i - j];
out[k*250000 + i] += s;
}
}
int main(){
int num_c = 50;
int csz = 250000;
int coefsize = 129;
int isz = num_c*csz;
int osz = num_c*csz;
float *d_in, *h_in, *d_coef, *h_coef, *d_out, *h_out, *h_out_i;
cudaMalloc(&d_in, isz*sizeof(float));
cudaMalloc(&d_out, osz*sizeof(float));
cudaMalloc(&d_coef, coefsize*sizeof(float));
h_in = new float[isz];
h_out = new float[osz];
h_out_i = new float[osz];
h_coef = new float[coefsize];
cudaMemset(d_out, 0, osz*sizeof(float));
for (int i = 0; i < coefsize; i++) h_coef[i] = i%5;
for (int i = 0; i < isz; i++) h_in[i] = i%4;
cudaMemcpy(d_in, h_in, isz*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_coef, h_coef, coefsize*sizeof(float), cudaMemcpyHostToDevice);
int threads = 128;
dim3 blocks((csz+threads-1)/threads, coefsize, num_c);
D_Conv<<<blocks, threads>>>(d_in, d_coef, d_out, coefsize);
cudaMemcpy(h_out, d_out, osz*sizeof(float), cudaMemcpyDeviceToHost);
dim3 blocks2((csz+threads-1)/threads, 1, num_c);
cudaMemset(d_out, 0, osz*sizeof(float));
D_Conv_i<<<blocks2, threads>>>(d_in, d_coef, d_out, coefsize);
cudaMemcpy(h_out_i, d_out, osz*sizeof(float), cudaMemcpyDeviceToHost);
for (int i = 0; i < osz; i++) if (fabsf(h_out_i[i] - h_out[i]) > TOL) {std::cout << "mismatch at: " << i << " was: " << h_out_i[i] << " should be: " << h_out[i] << std::endl; return 0;}
}
$ nvcc -o t20 t20.cu
$ cuda-memcheck ./t20
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$ nvprof ./t20
==14221== NVPROF is profiling process 14221, command: ./t20
==14221== Profiling application: ./t20
==14221== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 53.54% 43.853ms 2 21.926ms 21.863ms 21.989ms [CUDA memcpy DtoH]
26.97% 22.087ms 1 22.087ms 22.087ms 22.087ms D_Conv(float*, float*, float*, int)
17.30% 14.172ms 2 7.0860ms 1.4400us 14.171ms [CUDA memcpy HtoD]
2.04% 1.6702ms 1 1.6702ms 1.6702ms 1.6702ms D_Conv_i(float*, float*, float*, int)
0.14% 118.24us 2 59.122us 56.386us 61.858us [CUDA memset]
API calls: 75.11% 270.97ms 3 90.322ms 189.31us 270.50ms cudaMalloc
23.11% 83.367ms 4 20.842ms 45.694us 44.579ms cudaMemcpy
1.07% 3.8698ms 4 967.45us 449.83us 2.5106ms cuDeviceTotalMem
0.59% 2.1262ms 404 5.2620us 332ns 230.46us cuDeviceGetAttribute
0.06% 223.31us 4 55.828us 47.710us 74.669us cuDeviceGetName
0.03% 98.648us 2 49.324us 31.800us 66.848us cudaMemset
0.02% 86.603us 2 43.301us 13.778us 72.825us cudaLaunchKernel
0.01% 21.169us 4 5.2920us 3.2030us 8.0240us cuDeviceGetPCIBusId
0.00% 11.459us 8 1.4320us 427ns 4.2700us cuDeviceGet
0.00% 3.6360us 3 1.2120us 563ns 1.6820us cuDeviceGetCount
0.00% 2.7220us 4 680ns 520ns 877ns cuDeviceGetUuid
$
(CUDA 11.1U1, Tesla V100)
We can see that the atomic kernel takes over 20ms, whereas the non-atomic kernel runs in less than 2ms. Also note that I am running with 128 threads per block rather than 32. I'm not sure why you chose 32; I would aim for 64 or higher.
Because the coef array is relatively small and the access pattern is uniform across the warp, we can take advantage of __constant__ memory for this data. This gives an additional speed-up:
$ cat t20.cu
#include <iostream>
#define TOL 0.1
__global__ void D_Conv(float *in, float* coef, float *out, int coefsize)
{
int i = blockIdx.x*blockDim.x + threadIdx.x;
int j = blockIdx.y; //129
int k = blockIdx.z; //50
if (j < coefsize && i < 250000 && k < 50)
{
if (i - j >= 0 && i - j < 250000)
{
atomicAdd(&out[k*250000 + i], coef[j] * in[k*250000 + i - j]);
}
}
}
__constant__ float Ccoef[129];
__global__ void D_Conv_i(float *in, float* coef, float *out, int coefsize)
{
int i = blockIdx.x*blockDim.x + threadIdx.x;
//int j = blockIdx.y; //129
int k = blockIdx.z; //50
if (i < 250000 && k < 50)
{
float s = 0;
for (int j = 0; j < 129; j++)
if (i - j >= 0 && i - j < 250000) s += Ccoef[j] * in[k*250000 + i - j];
out[k*250000 + i] += s;
}
}
int main(){
int num_c = 50;
int csz = 250000;
int coefsize = 129;
int isz = num_c*csz;
int osz = num_c*csz;
float *d_in, *h_in, *d_coef, *h_coef, *d_out, *h_out, *h_out_i;
cudaMalloc(&d_in, isz*sizeof(float));
cudaMalloc(&d_out, osz*sizeof(float));
cudaMalloc(&d_coef, coefsize*sizeof(float));
h_in = new float[isz];
h_out = new float[osz];
h_out_i = new float[osz];
h_coef = new float[coefsize];
cudaMemset(d_out, 0, osz*sizeof(float));
for (int i = 0; i < coefsize; i++) h_coef[i] = i%5;
for (int i = 0; i < isz; i++) h_in[i] = i%4;
cudaMemcpy(d_in, h_in, isz*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_coef, h_coef, coefsize*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(Ccoef, h_coef, coefsize*sizeof(float));
int threads = 128;
dim3 blocks((csz+threads-1)/threads, coefsize, num_c);
D_Conv<<<blocks, threads>>>(d_in, d_coef, d_out, coefsize);
cudaMemcpy(h_out, d_out, osz*sizeof(float), cudaMemcpyDeviceToHost);
dim3 blocks2((csz+threads-1)/threads, 1, num_c);
cudaMemset(d_out, 0, osz*sizeof(float));
D_Conv_i<<<blocks2, threads>>>(d_in, d_coef, d_out, coefsize);
cudaMemcpy(h_out_i, d_out, osz*sizeof(float), cudaMemcpyDeviceToHost);
for (int i = 0; i < osz; i++) if (fabsf(h_out_i[i] - h_out[i]) > TOL) {std::cout << "mismatch at: " << i << " was: " << h_out_i[i] << " should be: " << h_out[i] << std::endl; return 0;}
}
$ nvcc -o t20 t20.cu
$ cuda-memcheck ./t20
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$ nvprof ./t20
==2191== NVPROF is profiling process 2191, command: ./t20
==2191== Profiling application: ./t20
==2191== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 54.38% 44.047ms 2 22.024ms 21.997ms 22.051ms [CUDA memcpy DtoH]
27.25% 22.075ms 1 22.075ms 22.075ms 22.075ms D_Conv(float*, float*, float*, int)
17.15% 13.888ms 3 4.6294ms 1.4720us 13.885ms [CUDA memcpy HtoD]
1.07% 869.88us 1 869.88us 869.88us 869.88us D_Conv_i(float*, float*, float*, int)
0.15% 117.83us 2 58.913us 56.321us 61.505us [CUDA memset]
API calls: 77.28% 307.94ms 3 102.65ms 188.61us 307.49ms cudaMalloc
20.70% 82.467ms 4 20.617ms 48.300us 44.617ms cudaMemcpy
1.27% 5.0520ms 4 1.2630ms 593.63us 3.2465ms cuDeviceTotalMem
0.62% 2.4765ms 404 6.1290us 450ns 261.77us cuDeviceGetAttribute
0.07% 271.54us 4 67.884us 59.173us 88.716us cuDeviceGetName
0.02% 97.041us 2 48.520us 30.831us 66.210us cudaMemset
0.02% 86.276us 2 43.138us 14.800us 71.476us cudaLaunchKernel
0.01% 23.142us 1 23.142us 23.142us 23.142us cudaMemcpyToSymbol
0.01% 21.576us 4 5.3940us 3.0900us 8.4600us cuDeviceGetPCIBusId
0.00% 13.604us 8 1.7000us 667ns 4.4800us cuDeviceGet
0.00% 5.7060us 3 1.9020us 452ns 3.5840us cuDeviceGetCount
0.00% 3.2440us 4 811ns 660ns 1.0340us cuDeviceGetUuid
$
The improved kernel now runs in less than 1ms, roughly a 20x speed-up over the original atomic kernel.

Is one CUDA block dimension faster than the other?

I have a simple CUDA code that assigns the values of an NxN matrix A to matrix B. In one case, I declare block sizes block(1,32) and have each thread loop over the entries in the first matrix dimension. In the second case, I declare block sizes block(32,1) and have each thread loop over entries in the second matrix dimension.
Is there some really obvious reason why, in my code below, the threads that loop over the stride-1 memory are significantly slower than those that loop over the stride-N memory? I would have thought it was the other way around (if there is any difference at all).
Am I missing something really obvious (a bug, perhaps)?
The complete code is below.
#include <stdio.h>
#include <sys/time.h>

__global__ void addmat_x(int m, int n, int* A, int *B)
{
    int idx, ix;
    int iy = threadIdx.y + blockIdx.y*blockDim.y;
    if (iy < n)
        for(ix = 0; ix < m; ix++) {
            idx = iy*m + ix; /* iy*m is constant */
            B[idx] = A[idx];
        }
}

__global__ void addmat_y(int m, int n, int* A, int *B)
{
    int ix = threadIdx.x + blockIdx.x*blockDim.x;
    int idx, iy;
    if (ix < m)
        for(iy = 0; iy < n; iy++) {
            idx = iy*m + ix;
            B[idx] = A[idx];
        }
}

double cpuSecond()
{
    struct timeval tp;
    gettimeofday(&tp,NULL);
    return (double) tp.tv_sec + (double)tp.tv_usec*1e-6;
}

int main(int argc, char** argv)
{
    int *A, *B;
    int *dev_A, *dev_B;
    size_t m, n, nbytes;
    double etime, start;

    m = 1 << 14;
    n = 1 << 14;
    nbytes = m*n*sizeof(int);

    A = (int*) malloc(nbytes);
    B = (int*) malloc(nbytes);
    memset(A,0,nbytes);

    cudaMalloc((void**) &dev_A, nbytes);
    cudaMalloc((void**) &dev_B, nbytes);
    cudaMemcpy(dev_A, A, nbytes, cudaMemcpyHostToDevice);

#if 1
    /* One thread per row */
    dim3 block(1,32);
    dim3 grid(1,(n+block.y-1)/block.y);
    start = cpuSecond();
    addmat_x<<<grid,block>>>(m,n,dev_A, dev_B);
#else
    /* One thread per column */
    dim3 block(32,1);
    dim3 grid((m+block.x-1)/block.x,1);
    start = cpuSecond();
    addmat_y<<<grid,block>>>(m,n,dev_A, dev_B);
#endif

    cudaDeviceSynchronize();
    etime = cpuSecond() - start;
    printf("GPU Kernel %10.3g (s)\n",etime);

    cudaFree(dev_A);
    cudaFree(dev_B);
    free(A);
    free(B);
    cudaDeviceReset();
}
Let's compare the global memory indexing generated by each thread in each case.
addmat_x:
Your block dimension is (1,32). This means 1 thread wide in x, 32 threads "long" in y. The threadIdx.x value for each thread will be 0. The threadIdx.y value for the threads in the warp will range from 0 to 31, as you move from thread to thread in the warp. With that, let's inspect your creation of idx in that kernel:
m = 1 << 14;
...
int iy = threadIdx.y + blockIdx.y*blockDim.y;
idx = iy*m + ix;
Let's choose the first block, whose blockIdx.y is 0. Then:
idx = threadIdx.y*(1<<14) + ix;
For the first loop iteration, ix is 0. The idx values generated by each thread will be:
threadIdx.y:  |  idx:
     0        |  0
     1        |  (1<<14)
     2        |  2*(1<<14)
    ...       |  ...
    31        |  31*(1<<14)
For a given loop iteration, the distance from the load or store index of one thread to that of the next thread will be 1<<14 elements, i.e. not adjacent: the accesses are scattered.
addmat_y:
Your block dimension is (32,1). This means 32 threads wide in x, 1 thread "long" in y. The threadIdx.y value for each thread will be 0. The threadIdx.x value for the threads in the warp will range from 0 to 31, as you move from thread to thread. Now let's inspect your creation of idx in that kernel:
m = 1 << 14;
...
int ix = threadIdx.x + blockIdx.x*blockDim.x;
idx = iy*m + ix;
Let's choose the first block, whose blockIdx.x is 0. Then:
idx = iy*m + threadIdx.x;
For the first loop iteration, iy is 0, so we simply have:
idx = threadIdx.x;
This generates the following index pattern across the warp:
threadIdx.x:  |  idx:
     0        |  0
     1        |  1
     2        |  2
    ...       |  ...
    31        |  31
These indices are adjacent: the load or store is not scattered, the addresses will coalesce nicely, and this represents "efficient" use of global memory. It will perform faster than the first case.
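As a further illustration (my own sketch, not part of the original answer; the kernel name addmat_2d and the 32x8 block shape are illustrative), you can avoid the per-thread loop entirely by launching one thread per element, with the block's x dimension mapped to the fastest-varying (column) index so that every warp touches 32 adjacent words:

__global__ void addmat_2d(int m, int n, const int* A, int* B)
{
    int ix = threadIdx.x + blockIdx.x*blockDim.x;   // column index: stride-1 direction
    int iy = threadIdx.y + blockIdx.y*blockDim.y;   // row index
    if (ix < m && iy < n)
        B[iy*m + ix] = A[iy*m + ix];                // adjacent threads touch adjacent words
}

// hypothetical launch, using the same m and n as in the question:
//   dim3 block(32, 8);
//   dim3 grid((m + block.x - 1)/block.x, (n + block.y - 1)/block.y);
//   addmat_2d<<<grid, block>>>(m, n, dev_A, dev_B);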

2D threads in CUDA

I'm trying to use 2D threads in CUDA. threadIdx.x and blockIdx.x work fine, but threadIdx.y and blockIdx.y don't: the .y values are always 0.
Here is my code:
#include <stdio.h>

#define N 16

__global__ void add(int* a) {
    int i = threadIdx.x;
    int j = threadIdx.y;
    a[i] = j;
}

int main(int argc, char **argv)
{
    int a[N];
    const int size = N*sizeof(int);
    int *da;
    cudaMalloc((void**)&da, size);
    add<<<1, N>>>(da);
    cudaMemcpy(a, da, size, cudaMemcpyDeviceToHost);
    printf("Thread indices:\n");
    for(int i = 0; i < N; i++)
    {
        printf("%d ", a[i]);
    }
    cudaFree(da);
    return 0;
}
The result for a[i] = j; (or a[j] = j;) is:
Thread indices:
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
and for a[i] = i;
Thread indices:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
I tried using
#define M 4
#define N 4
...
int i = (blockDim.x * blockIdx.x) + threadIdx.x;
int j = (blockDim.y * blockIdx.y) + threadIdx.y;
...
add<<<M, N>>>(da);
...
and the result is the same: the .x values are fine but the .y values are all 0. Can anyone help me fix this? Thanks.
You are confusing blocks and threads with dimensions.
add<<<M,N>>> is interpreted as add<<<dim3(M,1,1),dim3(N,1,1)>>>, where M is the number of blocks and N is the number of threads per block.
If you want an MxN grid of blocks, each with MxN threads, call add<<<dim3(M,N),dim3(M,N)>>>.
I would recommend Udacity CUDA course for beginners, it is very beginner friendly.
I want M blocks with N threads per block.
Well then add<<<M,N>>> is correct, but it is 1-dimensional; there is no y to it. If you want to locate the thread, use this:
int index = threadIdx.x + blockDim.x * blockIdx.x;
There is no y in it. The entire thing is 1D. Each block can only have a limited number of threads (1024 on current GPUs), which is why threads and blocks are separated. There are a lot of nuances to it. I would recommend the Udacity course; it helped me a lot.
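For completeness, here is a minimal sketch (my own addition, not from the original answer) of a genuinely 2D launch that produces nonzero threadIdx.y values; the kernel name add2d and the 4x4 shape are illustrative:

#include <stdio.h>

#define M 4
#define N 4

__global__ void add2d(int *a)
{
    int i = threadIdx.x;                // 0..3 within the block
    int j = threadIdx.y;                // 0..3 within the block
    a[j * blockDim.x + i] = j;          // flatten the 2D thread index into 1D storage
}

int main()
{
    int a[M*N];
    int *da;
    cudaMalloc((void**)&da, sizeof(a));
    add2d<<<1, dim3(M, N)>>>(da);       // one block of 4x4 = 16 threads
    cudaMemcpy(a, da, sizeof(a), cudaMemcpyDeviceToHost);
    for (int i = 0; i < M*N; i++) printf("%d ", a[i]);   // prints 0 0 0 0 1 1 1 1 2 2 2 2 3 3 3 3
    printf("\n");
    cudaFree(da);
    return 0;
}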

Summing the rows of a matrix (stored in either row-major or column-major order) in CUDA

I'm working on the problem of summing the rows of a matrix in CUDA. I give the following example.
Suppose we have the following 20 x 4 array:
1 2 3 4
4 1 2 3
3 4 1 2
.
1 2 3 4
.
.
.
.
.
.
.
.
2 1 3 4
After flattening the 2D array to a 1D array (in either row-major or column-major order), I need to assign each thread to a different row and calculate the cost of that row.
For example:
- thread 1 should calculate the cost for 1 2 3 4
- thread 2 should calculate the cost for 4 1 2 3
How can I do that in CUDA?
Thank you all for any reply.
#include <stdio.h>
#include <stdlib.h>
#define MROWS 20
#define NCOLS 4
#define nTPB 256
__global__ void mykernel(int *costdata, int rows, int cols, int *results){
    int tidx = threadIdx.x + blockDim.x*blockIdx.x;
    if (tidx < rows){
        int mycost = 0;
        for (int i = 0; i < cols; i++)
            mycost += costdata[(tidx*cols)+i];
        results[tidx] = mycost;
    }
}

int main(){
    //define and initialize host and device storage for cost and results
    int *d_costdata, *h_costdata, *d_results, *h_results;
    h_results = (int *)malloc(MROWS*sizeof(int));
    h_costdata = (int *)malloc(MROWS*NCOLS*sizeof(int));
    for (int i = 0; i < (MROWS*NCOLS); i++)
        h_costdata[i] = rand()%4;
    cudaMalloc((void **)&d_results, MROWS*sizeof(int));
    cudaMalloc((void **)&d_costdata, MROWS*NCOLS*sizeof(int));
    //copy cost data from host to device
    cudaMemcpy(d_costdata, h_costdata, MROWS*NCOLS*sizeof(int), cudaMemcpyHostToDevice);
    mykernel<<<(MROWS + nTPB - 1)/nTPB, nTPB>>>(d_costdata, MROWS, NCOLS, d_results);
    // copy results back from device to host
    cudaMemcpy(h_results, d_results, MROWS*sizeof(int), cudaMemcpyDeviceToHost);
    for (int i = 0; i < MROWS; i++){
        int loc_cost = 0;
        for (int j = 0; j < NCOLS; j++) loc_cost += h_costdata[(i*NCOLS)+j];
        printf("cost[%d]: host= %d, device = %d\n", i, loc_cost, h_results[i]);
    }
}
This assumes the "cost" of each row is just the sum of the elements in that row. If you have a different "cost" function, you can modify the activity in the kernel for-loop accordingly. This also assumes C-style row-major data storage (1 2 3 4 4 1 2 3 3 4 1 2 etc.)
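For instance (an illustrative variation, not from the original answer), using the sum of squares as the per-row cost only changes the loop body:

for (int i = 0; i < cols; i++){
    int v = costdata[(tidx*cols)+i];   // hedged example: squared-element cost
    mycost += v*v;
}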
If you instead use column-major storage (1 4 3 etc.), you can slightly improve the performance, since the data reads can be fully coalesced. Then your kernel code could look like this:
for (int i = 0; i < cols; i++)
    mycost += costdata[(i*rows)+tidx];
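As a companion sketch (my own addition; h_costdata_cm is a hypothetical column-major buffer, allocated the same way as h_costdata), the host side would then fill the flattened array in column-major order, for example by transposing the row-major data:

// transpose the row-major host data into a column-major buffer
for (int j = 0; j < NCOLS; j++)
    for (int i = 0; i < MROWS; i++)
        h_costdata_cm[j*MROWS + i] = h_costdata[i*NCOLS + j];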
You should also use proper CUDA error checking on all CUDA API calls and kernel calls.
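As one minimal sketch of such error checking (the macro name cudaCheck is illustrative, not from the original answer):

#include <stdio.h>
#include <stdlib.h>

#define cudaCheck(call)                                              \
    do {                                                             \
        cudaError_t err = (call);                                    \
        if (err != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error: %s at %s:%d\n",             \
                    cudaGetErrorString(err), __FILE__, __LINE__);    \
            exit(1);                                                 \
        }                                                            \
    } while (0)

// usage:
//   cudaCheck(cudaMemcpy(d_costdata, h_costdata, bytes, cudaMemcpyHostToDevice));
//   mykernel<<<grid, block>>>(...);
//   cudaCheck(cudaGetLastError());      // catches launch configuration errors
//   cudaCheck(cudaDeviceSynchronize()); // catches errors during kernel execution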
EDIT: As discussed in the comments below, for the row-major storage case, in some situations you may get an increase in memory efficiency by electing to load 16-byte quantities rather than the base type. Following is a modified version that implements this idea for arbitrary dimensions and (more or less) arbitrary base types:
#include <iostream>
#include <typeinfo>
#include <cstdlib>
#include <vector_types.h>
#define MROWS 1742
#define NCOLS 801
#define nTPB 256
typedef double mytype;
__host__ int sizetype(){
int size = 0;
if ((typeid(mytype) == typeid(float)) || (typeid(mytype) == typeid(int)) || (typeid(mytype) == typeid(unsigned int)))
size = 4;
else if (typeid(mytype) == typeid(double))
size = 8;
else if ((typeid(mytype) == typeid(unsigned char)) || (typeid(mytype) == typeid(char)))
size = 1;
return size;
}
template<typename T>
__global__ void mykernel(const T *costdata, int rows, int cols, T *results, int size, size_t pitch){
int chunk = 16/size; // assumes size is a factor of 16
int tidx = threadIdx.x + blockDim.x*blockIdx.x;
if (tidx < rows){
T *myrowptr = (T *)(((unsigned char *)costdata) + tidx*pitch);
T mycost = (T)0;
int count = 0;
while (count < cols){
if ((cols-count)>=chunk){
// read 16 bytes
int4 temp = *((int4 *)(myrowptr + count));
int bcount = 16;
int j = 0;
while (bcount > 0){
mycost += *(((T *)(&temp)) + j++);
bcount -= size;
count++;}
}
else {
// read one quantity at a time
for (; count < cols; count++)
mycost += myrowptr[count];
}
results[tidx] = mycost;
}
}
}
int main(){
int typesize = sizetype();
if (typesize == 0) {std::cout << "invalid type selected" << std::endl; return 1;}
//define and initialize host and device storage for cost and results
mytype *d_costdata, *h_costdata, *d_results, *h_results;
h_results = (mytype *)malloc(MROWS*sizeof(mytype));
h_costdata = (mytype *)malloc(MROWS*NCOLS*sizeof(mytype));
for (int i=0; i<(MROWS*NCOLS); i++)
h_costdata[i] = (mytype)(rand()%4);
size_t pitch = 0;
cudaMalloc((void **)&d_results, MROWS*sizeof(mytype));
cudaMallocPitch((void **)&d_costdata, &pitch, NCOLS*sizeof(mytype), MROWS);
//copy cost data from host to device
cudaMemcpy2D(d_costdata, pitch, h_costdata, NCOLS*sizeof(mytype), NCOLS*sizeof(mytype), MROWS, cudaMemcpyHostToDevice);
mykernel<<<(MROWS + nTPB - 1)/nTPB, nTPB>>>(d_costdata, MROWS, NCOLS, d_results, typesize, pitch);
// copy results back from device to host
cudaMemcpy(h_results, d_results, MROWS*sizeof(mytype), cudaMemcpyDeviceToHost);
for (int i=0; i<MROWS; i++){
mytype loc_cost = (mytype)0;
for (int j=0; j<NCOLS; j++) loc_cost += h_costdata[(i*NCOLS)+j];
if ((i < 10) && (typesize > 1))
std::cout <<"cost[" << i << "]: host= " << loc_cost << ", device = " << h_results[i] << std::endl;
if (loc_cost != h_results[i]){ std::cout << "mismatch at index" << i << "should be:" << loc_cost << "was:" << h_results[i] << std::endl; return 1; }
}
std::cout << "Results are correct!" << std::endl;
}

Matrix Multiplication giving wrong output [duplicate]

This question already has an answer here: Unable to execute device kernel in CUDA (1 answer).
What I am attempting to do is multiply matrix A by matrix B and then, from the product matrix, get the index of the maximum value per column. Unfortunately, only the first 128*128 values of the matrix multiplication are correct, while the others are just garbage. I do not quite understand how this works. I request you to kindly guide me with this.
#include<stdio.h>
#include "cuda.h"
#include<stdlib.h>
#define blockD 32
const int wA = 128;
const int hA = 4096;
const int wB = 4096;
const int hB = wA;
main(void){
void MatrixMultiplication(float *, float *, float *, float *);
int size_A = wA * hA * sizeof(float);
int size_B = wB * hB * sizeof(float);
int size_C = wB * hA * sizeof(float);
int size_max = 2 * wB * sizeof(float);
float *M, *N, *P, *C;
// allocate memory on the CPU
M = (float*)malloc(size_A);
N = (float*)malloc(size_B);
P = (float*)malloc(size_max);
C = (float*)malloc(size_C);
// initialize the matrices
for (int y=0; y < hA; y++) {
for (int x=0; x < wA; x++){
M[y*wA + x] = 32; //x + y*wA;
}
}
for (int y=0; y<hB; y++) {
for (int x=0; x<wB; x++){
N[y*wB + x] = 21; //x + y*wB;
}
}
MatrixMultiplication(M, N, P, C);
//Write
FILE *f1;
int i,j;
f1 = fopen("C.txt","w");
for(i = hA - 2 ; i < hA; i ++){
for(j = 0; j < wB; j++){
fprintf(f1,"%d\t",int(C[i*wB + j]));
}
fprintf(f1,"\n");
}
fclose(f1);
// free the memory allocated on the CPU
free( M );
free( N );
free( P );
free( C );
cudaDeviceReset();
return 0;
}
__device__ void MaxFunction(float* Pd, float* max)
{
int x = (threadIdx.x + blockIdx.x * blockDim.x);
int y = (threadIdx.y + blockIdx.y * blockDim.y);
int k = 0;
int temp = 0; int temp_idx = 0;
for (k = 0; k < wB; ++k) {
if(Pd[x*wB + k] > temp){
temp = Pd[x*wB + k];
temp_idx = x*wB + k;
}
}
max[y*2 + 0] = temp;
max[y*2 + 1] = temp_idx;
}
__global__ void MatrixMulKernel(float* Md, float* Nd, float* Pd, float* max)
{
// declare cache in the shared memory
__shared__ float Mds[blockD][blockD];
__shared__ float Nds[blockD][blockD];
float Pvalue = 0;
// Loop over the Md and Nd block dimension required to compute the Pd element
for (int m = (wA * blockD * blockIdx.y), n = (blockD * blockIdx.x);
m < ((wA * blockD * blockIdx.y)+wA-1);
m += blockD, n += (blockD*hB)){
// collaboratively loading of Md and Nd blocks into shared memory
Mds[threadIdx.y][threadIdx.x] = Md[m + wA * threadIdx.y + threadIdx.x];
Nds[threadIdx.y][threadIdx.x] = Nd[n + wA * threadIdx.y + threadIdx.x];
__syncthreads();
// keep track of the running sum
for (int k = 0; k < blockD; k++)
Pvalue += Mds[threadIdx.y][k] * Nds[k][threadIdx.x];
__syncthreads();
}
// write back to the global memory
int p = hB * blockD * blockIdx.y + blockD * blockIdx.x;
Pd[p + hB * threadIdx.y + threadIdx.x] = Pvalue;
__syncthreads();
MaxFunction(Pd, max);
}
void MatrixMultiplication(float *M, float *N, float *P, float *C) {
int size_A = wA * hA * sizeof(float);
int size_B = wB * hB * sizeof(float);
int size_C = wB * hA * sizeof(float);
int size_max = 2 * wB * sizeof(float);
float *Md, *Nd, *Pd, *max;
// allocate memory on the GPU
cudaMalloc((void**)&Md, size_A);
cudaMalloc((void**)&Nd, size_B);
cudaMalloc((void**)&Pd, size_C);
cudaMalloc((void**)&max, size_max);
// transfer M and N to device memory
cudaMemcpy(Md, M, size_A, cudaMemcpyHostToDevice);
cudaMemcpy(Nd, N, size_B, cudaMemcpyHostToDevice);
// kernel invocation code
dim3 dimBlock(blockD, blockD);
dim3 dimGrid(wA/blockD, hB/blockD);
//Execute Kernel
MatrixMulKernel<<<dimGrid, dimBlock>>>( Md, Nd, Pd, max);
// transfer P from device
cudaMemcpy(P, max, size_max, cudaMemcpyDeviceToHost);
cudaMemcpy(C, Pd, size_C, cudaMemcpyDeviceToHost);
// free the memory allocated on the GPU
cudaFree(Md);
cudaFree(Nd);
cudaFree(Pd);
cudaFree(max);
}
In your code you seem to have more than one problem. One of the problems is, in place of this:
dim3 dimGrid(wA/blockD, hB/blockD);
You should have this:
dim3 dimGrid(wB/blockD, hA/blockD);
Ultimately you need one thread in your grid for each output point. Your formulation was giving you a grid of 4 blocks by 4 blocks, whereas you need a grid of 128 blocks by 128 blocks.
The other problem I found with your code was in these lines in the kernel:
int p = hB * blockD * blockIdx.y + blockD * blockIdx.x;
Pd[p + hB * threadIdx.y + threadIdx.x] = Pvalue;
They are not indexing properly through the output array. Rather than try to sort it out using your scheme, I used this instead:
Pd[(threadIdx.x + (blockIdx.x * blockDim.x)) + ((threadIdx.y + (blockIdx.y * blockDim.y))*(gridDim.x*blockDim.x))] = Pvalue;
When I made the above two changes to your code, I got what I believe are correct results throughout the array. And it took about 32 seconds on my machine to run it. (Note that I haven't tried fixing your original max-finding code -- see below for a better approach.)
Based on your previous question, you seemed to be concerned about speed. If you want to do fast matrix multiply, you should use cublas. The following code shows how to use cublas to multiply two ordinary C-style matrices (they don't have to be square). I've also included a column-max finding kernel that will be fast when the number of columns is large (say, over 500 or so; you have 4096 columns in your example). For small numbers of columns, there may be quicker ways to perform this function, but small numbers of columns also suggest that the overall problem size may be small, and so speed (of this piece of code) will not really be an issue.
Here's the code:
#include <stdio.h>
#include <cublas_v2.h>
#define VERBOSE 1
#define nTPB 64
#define ROW_A 4
#define COL_A 4
#define ROW_B COL_A
#define COL_B 4
#define ROW_C ROW_A
#define COL_C COL_B
#define SIZ_A (ROW_A*COL_A)
#define SIZ_B (ROW_B*COL_B)
#define SIZ_C (ROW_C*COL_C)
// error check macros
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
// for CUBLAS V2 API
#define cublasCheckErrors(fn) \
do { \
cublasStatus_t __err = fn; \
if (__err != CUBLAS_STATUS_SUCCESS) { \
fprintf(stderr, "Fatal cublas error: %d (at %s:%d)\n", \
(int)(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
__global__ void col_max(float *mat, float *max, unsigned int *midx, unsigned int rows, unsigned int cols){
int idx = threadIdx.x + blockDim.x*blockIdx.x;
if (idx < cols){
float tempmax = mat[idx];
unsigned int tempmidx = 0;
for (int i = 1; i< rows; i++)
if (mat[idx + (i*cols)] > tempmax){
tempmax = mat[idx + (i*cols)];
tempmidx = i;}
max[idx] = tempmax;
midx[idx] = tempmidx;
}
}
int main(){
float *h_A, *h_B, *h_C, *d_A, *d_B, *d_C, *h_max, *d_max;
unsigned int *h_idx, *d_idx;
h_A = (float *)malloc(SIZ_A*sizeof(float));
if (h_A==0) {printf("malloc fail\n"); return -1;}
h_B = (float *)malloc(SIZ_B*sizeof(float));
if (h_B==0) {printf("malloc fail\n"); return -1;}
h_C = (float *)malloc(SIZ_C*sizeof(float));
if (h_C==0) {printf("malloc fail\n"); return -1;}
h_max = (float *)malloc(COL_C*sizeof(float));
if (h_max==0) {printf("malloc fail\n"); return -1;}
h_idx = (unsigned int*)malloc(COL_C*sizeof(unsigned int));
if (h_idx==0) {printf("malloc fail\n"); return -1;}
cudaMalloc((void **)&d_A, SIZ_A*sizeof(float));
cudaMalloc((void **)&d_B, SIZ_B*sizeof(float));
cudaMalloc((void **)&d_C, SIZ_C*sizeof(float));
cudaMalloc((void **)&d_max, COL_C*sizeof(float));
cudaMalloc((void **)&d_idx, COL_C*sizeof(unsigned int));
cudaCheckErrors("cuda malloc fail");
// initialize data
for (int i=0; i< SIZ_A; i++) h_A[i] = (float)(i+1);
for (int i=0; i< SIZ_B; i++) h_B[i] = (float)(i+2);
cudaMemcpy(d_A, h_A, SIZ_A*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, SIZ_B*sizeof(float), cudaMemcpyHostToDevice);
cudaCheckErrors("cuda memcpy 1 fail");
const float alpha = 1.0f;
const float beta = 0.0f;
cublasHandle_t handle;
cublasCheckErrors(cublasCreate(&handle));
// C = A*B
// due to cublas expecting column-major storage, parameters
// are scrambled
cublasCheckErrors(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, COL_B, ROW_A, COL_A, &alpha, d_B, COL_B, d_A, COL_A, &beta, d_C, COL_C));
cudaMemcpy(h_C, d_C, SIZ_C*sizeof(float), cudaMemcpyDeviceToHost);
cudaCheckErrors("cuda memcpy 2 fail");
col_max<<<(COL_C + nTPB - 1)/nTPB, nTPB>>>(d_C, d_max, d_idx, ROW_C, COL_C);
cudaCheckErrors("kernel launch fail");
cudaMemcpy(h_max, d_max, COL_C*sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(h_idx, d_idx, COL_C*sizeof(unsigned int), cudaMemcpyDeviceToHost);
cudaCheckErrors("cuda memcpy 3 fail/kernel fail");
if (VERBOSE){
printf("A: \n");
for (int i=0; i< ROW_A; i++){
for (int j=0; j< COL_A; j++)
printf("%7.5G", h_A[j+(i*COL_A)]);
printf("\n");}
printf("B: \n");
for (int i=0; i< ROW_B; i++){
for (int j=0; j< COL_B; j++)
printf("%7.5G", h_B[j+(i*COL_B)]);
printf("\n");}
printf("C = A*B: \n");
for (int i=0; i< ROW_C; i++){
for (int j=0; j< COL_C; j++)
printf("%7.5G", h_C[j+(i*COL_C)]);
printf("\n");}
printf("COLUMN MAX:\n");
for (int i=0; i< COL_C; i++)
printf("%7.5G", h_max[i]);
printf("\nCOLUMN MAX IDX:\n");
for (int i=0; i< COL_C; i++)
printf("%7d", h_idx[i]);
}
printf("\n finished!\n");
return 0;
}
Here's what I used to compile:
$ nvcc -arch=sm_20 -O3 -o t221 t221.cu -lcublas
And here's the sample output:
$ cuda-memcheck ./t221
========= CUDA-MEMCHECK
A:
1 2 3 4
5 6 7 8
9 10 11 12
13 14 15 16
B:
2 3 4 5
6 7 8 9
10 11 12 13
14 15 16 17
C = A*B:
100 110 120 130
228 254 280 306
356 398 440 482
484 542 600 658
COLUMN MAX:
484 542 600 658
COLUMN MAX IDX:
3 3 3 3
finished!
========= ERROR SUMMARY: 0 errors
$
When I extended my code to handle the same sizes you indicated (A = 4096x128, B = 128x4096), it took about 1 second on my machine. So it's much faster than your code. However, when I take your code and comment out your call to MaxFunction in the kernel, it also only takes about 1 second to compute the matrix multiply result. So if you wanted to keep your matrix multiply code (i.e. not use cublas), you could break the code into 2 kernels, and use your multiply routine in the first kernel with my max-finding routine (col_max) in the second kernel, and probably still get a pretty fast result.
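For reference, a hedged sketch of that size change (my own note, not shown in the original answer): with the defines in the listing above, the 4096x128 by 128x4096 case corresponds to

#define VERBOSE 0      // printing 4096x4096 matrices is impractical
#define ROW_A 4096
#define COL_A 128
#define COL_B 4096
// ROW_B, ROW_C and COL_C are already derived from these in the listing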
As @talonmies indicated, if you are running on a Windows machine, be sure you are aware of the ramifications of the Windows TDR mechanism (search for it if you are not familiar with it).