Outside kernel:
dim3 block(32, 32, 1);
printf("rows = %u\n", rows);
dim3 grid(8, 8, rows);
forward_step1<<<block, grid>>>(weight_D, a_D, res1_D, columns);
Inside kernel:
unsigned int tid = blockDim.x*threadIdx.y + threadIdx.x;
unsigned int i = blockIdx.z;
unsigned int j = (gridDim.x*blockIdx.y+blockIdx.x)*blockDim.x*blockDim.y + tid;
if (j==0) printf("%u\n", i);
And the result:
rows = 3
0
0
0
The syntax for the kernel invocation is:
kernel<<<grid_size, block_size>>>(arguments)
You seem to have swapped the grid_size and block_size arguments. Your grid size is (32, 32, 1) and block size (8, 8, rows).
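With the arguments in the intended order (same variables as in the question), the launch would be:
dim3 block(32, 32, 1);   // 1024 threads per block
dim3 grid(8, 8, rows);   // rows = 3, so blockIdx.z runs over 0..2
forward_step1<<<grid, block>>>(weight_D, a_D, res1_D, columns);
With this ordering, the j == 0 threads print 0, 1 and 2 (one per blockIdx.z value) instead of three zeros.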
I am trying to flip a large array upside down (e.g. 4096x8192).
At first, I tried it with two arrays, one for input and one for output, and it works!
(I will call the input the original array and the output the flipped array.)
But I thought it would be easier and more efficient if each thread could hold an input element, so that only one array is needed.
Could you share your knowledge or point me to any documents that address this problem?
Thanks, and here is my code.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#define ThreadPB 32 // optimal size
dim3 threadsPerBlock(ThreadPB, ThreadPB);
__global__ void initKernel(int *input, int nx, int ny)
{
int idx_x = blockDim.x * blockIdx.x + threadIdx.x;
int idx_y = blockDim.y * blockIdx.y + threadIdx.y;
int idx = idx_y * nx + idx_x;
if (idx_x < nx && idx_y < ny) {
input[idx] = idx_y;
}
}
__global__ void flipKernel(int *output, int *input, int nx, int ny)
{
int idx_x = blockDim.x * blockIdx.x + threadIdx.x;
int idx_y = blockDim.y * blockIdx.y + threadIdx.y;
int idx = idx_y * nx + idx_x;
// is it possible to use only one array?
if (idx_x < nx && idx_y < ny) {
output[(ny - idx_y - 1) * nx + idx_x] = input[idx_y * nx + idx_x];
}
}
int main()
{
// time check
cudaEvent_t start, stop, start_temp, stop_temp;
cudaEvent_t start_temp2, stop_temp2;
float elapsedTime, elapsedTime_temp, elapsedTime_temp2;
cudaEventCreate(&start); cudaEventCreate(&stop);
cudaEventCreate(&start_temp); cudaEventCreate(&stop_temp);
cudaEventCreate(&start_temp2); cudaEventCreate(&stop_temp2);
const int num_x = 4096;
const int num_y = 8192;
const int arraySize = num_x * num_y;
int *orig, *flip;
orig = (int *)malloc(sizeof(int) * arraySize);
flip = (int *)malloc(sizeof(int) * arraySize);
int *dev_orig = 0;
int *dev_flip = 0;
cudaMalloc((void**)&dev_orig, arraySize * sizeof(int));
cudaMalloc((void**)&dev_flip, arraySize * sizeof(int));
cudaMemcpy(dev_orig, orig, arraySize * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(dev_flip, flip, arraySize * sizeof(int), cudaMemcpyHostToDevice);
dim3 blocksFlip((num_x + threadsPerBlock.x - 1) / threadsPerBlock.x, (num_y + threadsPerBlock.y - 1) / threadsPerBlock.y);
initKernel<<<blocksFlip, threadsPerBlock>>>(dev_orig, num_x, num_y);
cudaEventRecord(start, 0);
flipKernel<<<blocksFlip, threadsPerBlock>>>(dev_flip, dev_orig, num_x, num_y);
// time check end
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
printf("flip 4096x8192 processing time = %f ms.\n", elapsedTime);
cudaMemcpy(orig, dev_orig, arraySize * sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(flip, dev_flip, arraySize * sizeof(int), cudaMemcpyDeviceToHost);
// check flip works
printf("FLIP this array { 0, 1, 2, 3, 4 , 5, 6, 7, 8, 9...} \n= { %d, %d, %d, %d, %d, %d, %d, %d, %d, %d...}\n",
flip[num_x * 0], flip[num_x * 1], flip[num_x * 2], flip[num_x * 3], flip[num_x * 4],
flip[num_x * 5], flip[num_x * 6], flip[num_x * 7], flip[num_x * 8], flip[num_x * 9]);
return 0;
}
For an even number of rows in the array, you should be able to do something like this:
__global__ void flipKernel(int *input, int nx, int ny)
{
int idx_x = blockDim.x * blockIdx.x + threadIdx.x;
int idx_y = blockDim.y * blockIdx.y + threadIdx.y;
int idx = idx_y * nx + idx_x;
if (idx_x < nx && idx_y < ny/2) {
// swap row idx_y with its mirror row (ny - idx_y - 1)
int output_temp = input[(ny - idx_y - 1) * nx + idx_x];
input[(ny - idx_y - 1) * nx + idx_x] = input[idx_y * nx + idx_x];
input[idx_y * nx + idx_x] = output_temp;
}
}
You would only need to launch this kernel with half as many threads in y (half as many rows in y). Each thread is updating two values in the matrix.
Rather than thinking about things like "register" or imagining that CUDA is some kind of weird language, if you have C or C++ programming ability, I would encourage you to think about how you might solve the problem if it were framed as an ordinary C or C++ programming challenge. Your intuition from that will often work very well in CUDA.
The core of the routine above is just a swap. The thing you are referring to as a "register" is just an ordinary local variable in C or C++. There is a register keyword in C++, but it serves essentially no purpose in CUDA, and is not needed here anyway.
You can handle an odd number of rows by simply leaving the middle row as-is, and swapping the remaining rows. This would require just a slight change to the indexing calculations.
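For example, keeping the variable names from your code, the in-place launch could look like this (a sketch; the integer division in ny/2 means the middle row is automatically left alone when ny is odd):
// cover all columns, but only the top half of the rows
dim3 blocksFlip((num_x + threadsPerBlock.x - 1) / threadsPerBlock.x,
                (num_y / 2 + threadsPerBlock.y - 1) / threadsPerBlock.y);
flipKernel<<<blocksFlip, threadsPerBlock>>>(dev_orig, num_x, num_y);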
I have a simple CUDA code that assigns the values of an NxN matrix A to matrix B. In one case, I declare the block size as block(1,32) and have each thread loop over the entries in the first matrix dimension. In the second case, I declare the block size as block(32,1) and have each thread loop over the entries in the second matrix dimension.
Is there some really obvious reason why, in my code below, the threads that loop over the stride-1 memory are significantly slower than those that loop over the stride-N memory? I would have thought it was the other way around (if there is any difference at all).
Am I missing something really obvious (a bug, perhaps)?
The complete code is below.
#include <stdio.h>
#include <sys/time.h>
__global__ void addmat_x(int m, int n, int* A, int *B)
{
int idx, ix;
int iy = threadIdx.y + blockIdx.y*blockDim.y;
if (iy < n)
for(ix = 0; ix < m; ix++) {
idx = iy*m + ix; /* iy*m is constant */
B[idx] = A[idx];
}
}
__global__ void addmat_y(int m, int n, int* A, int *B)
{
int ix = threadIdx.x + blockIdx.x*blockDim.x;
int idx, iy;
if (ix < m)
for(iy = 0; iy < n; iy++) {
idx = iy*m + ix;
B[idx] = A[idx];
}
}
double cpuSecond()
{
struct timeval tp;
gettimeofday(&tp,NULL);
return (double) tp.tv_sec + (double)tp.tv_usec*1e-6;
}
int main(int argc, char** argv)
{
int *A, *B;
int *dev_A, *dev_B;
size_t m, n, nbytes;
double etime, start;
m = 1 << 14;
n = 1 << 14;
nbytes = m*n*sizeof(int);
A = (int*) malloc(nbytes);
B = (int*) malloc(nbytes);
memset(A,0,nbytes);
cudaMalloc((void**) &dev_A, nbytes);
cudaMalloc((void**) &dev_B, nbytes);
cudaMemcpy(dev_A, A, nbytes, cudaMemcpyHostToDevice);
#if 1
/* One thread per row */
dim3 block(1,32);
dim3 grid(1,(n+block.y-1)/block.y);
start = cpuSecond();
addmat_x<<<grid,block>>>(m,n,dev_A, dev_B);
#else
/* One thread per column */
dim3 block(32,1);
dim3 grid((m+block.x-1)/block.x,1);
start = cpuSecond();
addmat_y<<<grid,block>>>(m,n,dev_A, dev_B);
#endif
cudaDeviceSynchronize();
etime = cpuSecond() - start;
printf("GPU Kernel %10.3g (s)\n",etime);
cudaFree(dev_A);
cudaFree(dev_B);
free(A);
free(B);
cudaDeviceReset();
}
Let's compare the global memory indexing generated by each thread in each case.
addmat_x:
Your block dimension is (1,32). This means 1 thread wide in x, 32 threads "long" in y. The threadIdx.x value for each thread will be 0. The threadIdx.y value for the threads in the warp will range from 0 to 31, as you move from thread to thread in the warp. With that, let's inspect your creation of idx in that kernel:
m = 1 << 14;
...
int iy = threadIdx.y + blockIdx.y*blockDim.y;
idx = iy*m + ix;
Let's choose the first block, whose blockIdx.y is 0. Then:
idx = threadIdx.y*(1<<14) + ix;
For the first loop iteration, ix is 0. The idx values generated by each thread will be:
threadIdx.y: |  idx:
          0  |  0
          1  |  (1<<14)
          2  |  2*(1<<14)
         ... |  ...
         31  |  31*(1<<14)
For a given loop iteration, the distance from one thread's load or store index to the next thread's is 1<<14, i.e. the accesses are not adjacent but scattered.
addmat_y:
Your block dimension is (32,1). This means 32 threads wide in x, 1 thread "long" in y. The threadIdx.y value for each thread will be 0. The threadIdx.x value for the threads in the warp will range from 0 to 31, as you move from thread to thread. Now let's inspect your creation of idx in that kernel:
m = 1 << 14;
...
int ix = threadIdx.x + blockIdx.x*blockDim.x;
idx = iy*m + ix;
Let's choose the first block, whose blockIdx.x is 0. Then:
idx = iy*m + threadIdx.x;
For the first loop iteration, iy is 0, so we simply have:
idx = threadIdx.x;
This generates the following index pattern across the warp:
threadIdx.x: |  idx:
          0  |  0
          1  |  1
          2  |  2
         ... |  ...
         31  |  31
These indices are adjacent; this is not a scattered load or store. The addresses will coalesce nicely, and this represents "efficient" use of global memory. It will perform faster than the first case.
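As a point of comparison, here is a loop-free 2-D sketch of the same copy (my own variant, not code from the question) in which threadIdx.x maps to the fastest-varying index, so the 32 threads of each warp touch 32 adjacent ints:
__global__ void addmat_2d(int m, int n, int* A, int* B)
{
    int ix = threadIdx.x + blockIdx.x*blockDim.x;  // column index: consecutive across a warp
    int iy = threadIdx.y + blockIdx.y*blockDim.y;  // row index
    if (ix < m && iy < n)
        B[iy*m + ix] = A[iy*m + ix];               // fully coalesced load and store
}
Launched with, say, dim3 block(32, 8) and a grid covering m x n, every warp issues coalesced transactions without any per-thread loop.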
The following code does not work. My expectation is that all the y[i] will be 3 after the kernel function add() is called. But if N >= (1 << 24) - 255, all the y[i]'s are 2 (as if the kernel function add() never ran).
#include <iostream>
__global__ void add(int n, int *x, int *y) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = index; i < n; i += stride) y[i] = x[i] + y[i];
}
int main() {
int *x, *y, N = (1 << 24) - 255; // 255 wrong / 256 ok
cudaMallocManaged(&x, N * sizeof(int));
cudaMallocManaged(&y, N * sizeof(int));
for (int i = 0; i < N; ++i) {x[i] = 1; y[i] = 2;}
int sz = 256;
dim3 blockDim(sz,1,1);
dim3 gridDim((N+sz-1)/sz,1,1);
add<<<gridDim, blockDim>>>(N, x, y);
cudaDeviceSynchronize();
for (int i = 0; i < N; ++i) if (y[i]!=3) std::cout << "error" << std::endl;
cudaFree(x);
cudaFree(y);
return 0;
}
The GPU is a GTX1080Ti and has the following limits:
Maximum number of threads per block: 1024
Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535)
Machine is X86_64 Linux Ubuntu 16.04. Am I doing something wrong here? Please help.
I did not specify -arch= when compiling, so I ended up with -arch=sm_20, which was the default. I compiled with -arch=sm_60 instead and now it works, since the maximum x dimension of the grid is 2147483647 for compute capability 3.0 and above.
http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities
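As an aside, a runtime check right after the launch (standard CUDA runtime API calls) would have reported the failure instead of it silently doing nothing; for example, inserted after the launch in main():
add<<<gridDim, blockDim>>>(N, x, y);
cudaError_t err = cudaGetLastError();   // reports invalid launch configurations
if (err != cudaSuccess)
    std::cout << "launch failed: " << cudaGetErrorString(err) << std::endl;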
Are there limitations as to what I can set the grid size of a CUDA kernel to be? I ran into a problem where kernels were not launching with a grid size of 33 x 33 but were able to launch when the grid size was 32 x 32. Is there any reason for this to occur? Or is it likely that changing the number of blocks from 32 x 32 to 33 x 33 broke some other constraint?
dim3 blockSize(8, 8);
dim3 gridSize(32, 32);
cudaDeviceSynchronize();
set_start<<<gridSize, blockSize>>>(some_params);
The above works.
dim3 blockSize(8, 8);
dim3 gridSize(33, 33);
cudaDeviceSynchronize();
set_start<<<gridSize, blockSize>>>(some_params);
The above does not work.
kernel & main:
__global__
void set_start(double * const H , double * const HU , double * const HV ,
double * const E , const int Na)
{
int j = threadIdx.x + blockIdx.x*blockDim.x + 1;
int i = threadIdx.y + blockIdx.y*blockDim.y + 1;
if(i >= Na-1 || j >= Na-1)
return;
H[i*Na+j] = 1.0 + exp(-100.0*((E[j-1]-0.75)*(E[j-1]-0.75)+(E[i-1]-0.75)*(E[i-1]-0.75)))
                + 0.5*exp(-100.0*((E[j-1]-0.75)*(E[j-1]-0.75)+(E[i-1]-0.25)*(E[i-1]-0.25)));
HU[i*Na+j] = 0;
HV[i*Na+j] = 0;
}
int main(int argc, char** argv){
double* E_d;
cudaMalloc(&E_d, sizeof(double) * (Nh+1));
set_E<<<64, (Nh/64) + 1>>>(E_d, dx, Nh);
int Na = 259;
double *H_d, *HU_d, *HV_d, *Ht_d, *HUt_d, *HVt_d;
cudaMalloc(&H_d , sizeof(double) * Na * Na);
cudaMalloc(&HU_d, sizeof(double) * Na * Na);
cudaMalloc(&HV_d, sizeof(double) * Na * Na);
dim3 blockSize(8, 8);
//dim3 gridSize(((Na-1)/blockSize.x) + 1, ((Na-1)/blockSize.y) + 1);
//dim3 gridSize(33, 33);
dim3 gridSize(32, 32);
cudaDeviceSynchronize();
set_start<<<blockSize, gridSize>>>(H_d, HU_d, HV_d, E_d, Na);
}
This was on CUDA 7.0.
You have block size and grid size mixed up when calling the kernel.
set_start<<<blockSize, gridSize>>>(H_d, HU_d, HV_d, E_d, Na);
should read:
set_start<<<gridSize, blockSize>>>(H_d, HU_d, HV_d, E_d, Na);
Because of this bug you are actually trying to launch a grid of size blockSize, and blocks of size gridSize. The maximum size of a block on your GPU is 1024 threads, so launching blocks of 33 x 33 = 1089 threads fails, while 32 x 32 = 1024 threads still (just) fits.
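If in doubt, the limits can be queried at runtime from the standard device properties:
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);  // device 0
printf("max threads per block: %d\n", prop.maxThreadsPerBlock);  // 1024 here
printf("max block dims: (%d, %d, %d)\n",
       prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);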
This code is from the book CUDA by Example:
#include "../common/book.h"
#define N (33 * 1024)
__global__ void add( int *a, int *b, int *c ) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
while (tid < N) {
c[tid] = a[tid] + b[tid];
tid += blockDim.x * gridDim.x;
}
}
...
add<<<128,128>>>( dev_a, dev_b, dev_c );
33*1024 = 33792
128 * 128 = 16384
33792 > 16384
So, do I have to increase the number of threads per block in this case for the whole array to be processed?
Notice the second statement in the body of the while loop, i.e. tid += blockDim.x * gridDim.x;. It handles arrays bigger than 16384 elements as well.
The thread with ID 0 sums the array elements at positions 0, 16384, 32768, ...
The thread with ID 1 sums the array elements at positions 1, 16385, 32769, ...
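So no change to the launch is needed. Tracing the index arithmetic for this launch (just the numbers from above, written out):
// total threads = gridDim.x * blockDim.x = 128 * 128 = 16384
// thread 0: tid = 0      -> c[0]     = a[0]     + b[0]
//           tid = 16384  -> c[16384] = a[16384] + b[16384]
//           tid = 32768  -> c[32768] = a[32768] + b[32768]
//           tid = 49152  -> loop exits (49152 >= 33792)
Every element up to N-1 is covered, with each thread executing the loop body at most three times.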