Relation between number of blocks of threads - cuda

This code is from the book CUDA by Example:
#include "../common/book.h"
#define N (33 * 1024)
__global__ void add( int *a, int *b, int *c ) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
while (tid < N) {
c[tid] = a[tid] + b[tid];
tid += blockDim.x * gridDim.x;
}
}
.
.
.
add<<<128,128>>>( dev_a, dev_b, dev_c );
33*1024 = 33792
128 * 128 = 16384
33792 > 16384
So do I have to increase the number of threads per block in this case for all elements to be processed?

Notice the second statement in the body of the while loop, i.e. tid += blockDim.x * gridDim.x;. It handles arrays larger than the 16384 threads that are launched.
Thread with ID 0 sums the array items at positions 0, 16384, 32768, ...
Thread with ID 1 sums the array items at positions 1, 16385, 32769, ...
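To see this concretely, here is a minimal, self-contained sketch (it does not use book.h, so the allocation and verification code is my own, and managed memory is used purely for brevity) that launches the same grid-stride kernel with 128 blocks of 128 threads and checks that all 33*1024 elements were written:

#include <cstdio>
#include <cuda_runtime.h>

#define N (33 * 1024)

__global__ void add(int *a, int *b, int *c) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    while (tid < N) {
        c[tid] = a[tid] + b[tid];
        tid += blockDim.x * gridDim.x;   // each thread strides by 128*128 = 16384
    }
}

int main() {
    int *a, *b, *c;
    cudaMallocManaged(&a, N * sizeof(int));
    cudaMallocManaged(&b, N * sizeof(int));
    cudaMallocManaged(&c, N * sizeof(int));
    for (int i = 0; i < N; ++i) { a[i] = i; b[i] = 2 * i; }

    add<<<128, 128>>>(a, b, c);
    cudaDeviceSynchronize();

    int errors = 0;
    for (int i = 0; i < N; ++i) if (c[i] != 3 * i) ++errors;
    printf("errors: %d\n", errors);   // prints 0: every element was covered

    cudaFree(a); cudaFree(b); cudaFree(c);
    return 0;
}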


How to use register memory for each thread in CUDA?

I am trying to flip a large array upside down (e.g. 4096x8192).
At first, I tried with two arrays, one for input and one for output, and it works!
(I will say the input is the original and the output is the flipped array.)
But I thought it would be easier and more efficient if each thread could hold its input elements.
Then I could use only one array!
Could you guys share your knowledge or point me to any documents that would help with this problem?
Thanks, and here is my code.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#define ThreadPB 32 // optimal size
dim3 threadsPerBlock(ThreadPB, ThreadPB);
__global__ void initKernel(int *input, int nx, int ny)
{
int idx_x = blockDim.x * blockIdx.x + threadIdx.x;
int idx_y = blockDim.y * blockIdx.y + threadIdx.y;
int idx = idx_y * nx + idx_x;
if (idx_x < nx && idx_y < ny) {
input[idx] = idx_y;
}
}
__global__ void flipKernel(int *output, int *input, int nx, int ny)
{
int idx_x = blockDim.x * blockIdx.x + threadIdx.x;
int idx_y = blockDim.y * blockIdx.y + threadIdx.y;
int idx = idx_y * nx + idx_x;
// is it possible to use only one array?
if (idx_x < nx && idx_y < ny) {
output[(ny - idx_y - 1) * nx + idx_x] = input[idx_y * nx + idx_x];
}
}
int main()
{
// time check
cudaEvent_t start, stop, start_temp, stop_temp;
cudaEvent_t start_temp2, stop_temp2;
float elapsedTime, elapsedTime_temp, elapsedTime_temp2;
cudaEventCreate(&start); cudaEventCreate(&stop);
cudaEventCreate(&start_temp); cudaEventCreate(&stop_temp);
cudaEventCreate(&start_temp2); cudaEventCreate(&stop_temp2);
const int num_x = 4096;
const int num_y = 8192;
const int arraySize = num_x * num_y;
int *orig, *flip;
orig = (int *)malloc(sizeof(int) * arraySize);
flip = (int *)malloc(sizeof(int) * arraySize);
int *dev_orig = 0;
int *dev_flip = 0;
cudaMalloc((void**)&dev_orig, arraySize * sizeof(int));
cudaMalloc((void**)&dev_flip, arraySize * sizeof(int));
cudaMemcpy(dev_orig, orig, arraySize * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(dev_flip, flip, arraySize * sizeof(int), cudaMemcpyHostToDevice);
dim3 blocksFlip((num_x + threadsPerBlock.x - 1) / threadsPerBlock.x, (num_y + threadsPerBlock.y - 1) / threadsPerBlock.y);
initKernel << <blocksFlip, threadsPerBlock >> > (dev_orig, num_x, num_y);
cudaEventRecord(start, 0);
flipKernel << <blocksFlip, threadsPerBlock >> > (dev_flip, dev_orig, num_x, num_y);
// time check end
cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&elapsedTime, start, stop); printf("flip 1024x2048 처리 시간 = %f ms.\n", elapsedTime);
cudaMemcpy(orig, dev_orig, arraySize * sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(flip, dev_flip, arraySize * sizeof(int), cudaMemcpyDeviceToHost);
// check flip works
printf("FLIP this array { 0, 1, 2, 3, 4 , 5, 6, 7, 8, 9...} \n= { %d, %d, %d, %d, %d, %d, %d, %d, %d, %d...}\n",
flip[num_x * 0], flip[num_x * 1], flip[num_x * 2], flip[num_x * 3], flip[num_x * 4],
flip[num_x * 5], flip[num_x * 6], flip[num_x * 7], flip[num_x * 8], flip[num_x * 9]);
return 0;
}
For an even number of rows in the array, you should be able to do something like this:
__global__ void flipKernel(int *input, int nx, int ny)
{
    int idx_x = blockDim.x * blockIdx.x + threadIdx.x;
    int idx_y = blockDim.y * blockIdx.y + threadIdx.y;
    int idx = idx_y * nx + idx_x;

    if (idx_x < nx && idx_y < ny/2) {
        int output_temp = input[(ny - idx_y - 1) * nx + idx_x];
        input[(ny - idx_y - 1) * nx + idx_x] = input[idx_y * nx + idx_x];
        input[idx_y * nx + idx_x] = output_temp;
    }
}
You would only need to launch this kernel with half as many threads in y (half as many rows in y). Each thread is updating two values in the matrix.
Rather than thinking about things like "register" or imagining that CUDA is some kind of weird language, if you have C or C++ programming ability, I would encourage you to think about how you might solve the problem if it were framed as an ordinary C or C++ programming challenge. Your intuition from that will often work very well in CUDA.
The core of the routine above is just a swap. The thing you are referring to as a "register" is just an ordinary local variable in C or C++. There is a register keyword in C++, but it serves essentially no purpose in CUDA, and is not needed here anyway.
You can handle an odd number of rows by simply leaving the middle row as-is, and swapping the remaining rows. This would require just a slight change to the indexing calculations.
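For illustration, here is a minimal sketch of the host-side launch for the in-place version, reusing num_x, num_y and dev_orig from the question's main; the 32x32 block shape and the grid computation are my own choices, not part of the original answer. Only the y dimension of the grid changes, since each thread now swaps a pair of rows:

// Hypothetical launch for the in-place flipKernel above: only ny/2 rows of
// threads are needed, because each thread swaps one element from the top half
// with its mirror element in the bottom half.
dim3 threadsPerBlock(32, 32);
dim3 blocksFlipInPlace((num_x + threadsPerBlock.x - 1) / threadsPerBlock.x,
                       (num_y / 2 + threadsPerBlock.y - 1) / threadsPerBlock.y);
flipKernel<<<blocksFlipInPlace, threadsPerBlock>>>(dev_orig, num_x, num_y);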

Calculating indices for nested loops in CUDA

I'm trying to learn CUDA and I'm a bit confused about calculating thread indices. Let's say I have this loop I'm trying to parallelize:
...
for(int x = 0; x < DIM_x; x++){
    for(int y = 0; y < DIM_y; y++){
        for(int dx = 0; dx < psize; dx++){
            array[y*DIM_x + x + dx] += 1;
        }
    }
}
In PyCUDA, I set:
block = (8, 8, 8)
grid = (96, 96, 16)
Most of the examples I've seen for parallelizing loops calculate thread indices like this:
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
int dx = blockIdx.z * blockDim.z + threadIdx.z;
if (x >= DIM_x || y >= DIM_y || dx >= psize)
    return;

atomicAdd(&array[y*DIM_x + x + dx], 1);
DIM_x = 580, DIM_y = 550, psize = 50
However, if I print x, I see that multiple threads with the same thread Id are created, and the final result is wrong.
Instead, if I use this (3D grid of 3D blocks):
int blockId = blockIdx.x + blockIdx.y * gridDim.x
+ gridDim.x * gridDim.y * blockIdx.z;
int x = blockId * (blockDim.x * blockDim.y * blockDim.z)
+ (threadIdx.z * (blockDim.x * blockDim.y))
+ (threadIdx.y * blockDim.x) + threadIdx.x;
It fixes the multiple same thread Ids problem for x, but I'm not sure how I'd parallelize y and dx.
If anyone could help me understand where I'm going wrong, and show me the right way to parallelize the loops, I'd really appreciate it.
However, if I print x, I see that multiple threads with the same thread Id are created, and the final result is wrong.
It would be normal for you to see multiple threads with the same x thread ID in a multi-dimensional grid, as it would also be normal to observe many iterations of the loops in your host code with the same x value. If the result is wrong, it has nothing to do with any of the code you have shown, viz:
#include <vector>
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <assert.h>

void host(int* array, int DIM_x, int DIM_y, int psize)
{
    for(int x = 0; x < DIM_x; x++){
        for(int y = 0; y < DIM_y; y++){
            for(int dx = 0; dx < psize; dx++){
                array[y*DIM_x + x + dx] += 1;
            }
        }
    }
}

__global__
void kernel(int* array, int DIM_x, int DIM_y, int psize)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    int dx = blockIdx.z * blockDim.z + threadIdx.z;
    if (x >= DIM_x || y >= DIM_y || dx >= psize)
        return;

    atomicAdd(&array[y*DIM_x + x + dx], 1);
}

int main()
{
    dim3 block(8, 8, 8);
    dim3 grid(96, 96, 16);

    int DIM_x = 580, DIM_y = 550, psize = 50;

    std::vector<int> array_h(DIM_x * DIM_y * psize, 0);
    std::vector<int> array_hd(DIM_x * DIM_y * psize, 0);
    thrust::device_vector<int> array_d(DIM_x * DIM_y * psize, 0);

    kernel<<<grid, block>>>(thrust::raw_pointer_cast(array_d.data()), DIM_x, DIM_y, psize);
    host(&array_h[0], DIM_x, DIM_y, psize);

    thrust::copy(array_d.begin(), array_d.end(), array_hd.begin());
    cudaDeviceSynchronize();

    for(int i=0; i<DIM_x * DIM_y * psize; i++) {
        assert( array_h[i] == array_hd[i] );
    }

    return 0;
}
which when compiled and run
$ nvcc -arch=sm_52 -std=c++11 -o looploop loop_the_loop.cu
$ cuda-memcheck ./looploop
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
emits no errors and passes the check of all elements against the host code in your question.
If you are getting incorrect results, it is likely that you have a problem with initialization of the device memory before running the kernel. Otherwise I fail to see how incorrect results could be emitted by the code you have shown.
In general, performing a large number of atomic memory transactions, as your code does, is not the optimal way to perform computation on the GPU. Using non-atomic transactions would probably require relying on a priori information about the structure of the problem (such as a graph decomposition or a precise description of the problem's write patterns).
In a 3D grid with 3D blocks, the thread ID is:
unsigned long blockId = blockIdx.x
+ blockIdx.y * gridDim.x
+ gridDim.x * gridDim.y * blockIdx.z;
unsigned long threadId = blockId * (blockDim.x * blockDim.y * blockDim.z)
+ (threadIdx.z * (blockDim.x * blockDim.y))
+ (threadIdx.y * blockDim.x)
+ threadIdx.x;
That is not the x you computed; your x is only the x index within the 3D grid.
There is a nice cheatsheet in this blog
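If you do want one globally unique linear ID per thread, below is a minimal sketch of how it could be decomposed back into (x, y, dx) coordinates. The kernel name kernel_linear and the decomposition order (x fastest, then y, then dx) are my own choices for illustration; the atomicAdd is kept because, exactly as in the original loop nest, different (x, dx) pairs can still write to the same element.

__global__ void kernel_linear(int* array, int DIM_x, int DIM_y, int psize)
{
    // Globally unique linear thread ID for a 3D grid of 3D blocks,
    // using the formula from the answer above.
    unsigned long blockId = blockIdx.x
                          + blockIdx.y * gridDim.x
                          + gridDim.x * gridDim.y * blockIdx.z;
    unsigned long threadId = blockId * (blockDim.x * blockDim.y * blockDim.z)
                           + (threadIdx.z * (blockDim.x * blockDim.y))
                           + (threadIdx.y * blockDim.x)
                           + threadIdx.x;

    unsigned long total = (unsigned long)DIM_x * DIM_y * psize;
    if (threadId >= total)
        return;

    // Decompose the linear ID into (x, y, dx); x varies fastest here.
    int x  = threadId % DIM_x;
    int y  = (threadId / DIM_x) % DIM_y;
    int dx = threadId / ((unsigned long)DIM_x * DIM_y);

    // Same update as the original loop nest; atomics are still needed because
    // different (x, dx) pairs can map to the same array element.
    atomicAdd(&array[y*DIM_x + x + dx], 1);
}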

Is one CUDA block dimension faster than the other?

I have a simple CUDA code that assigns the values of an NxN matrix A to matrix B. In one case, I declare block sizes block(1,32) and have each thread loop over the entries in the first matrix dimension. In the second case,
I declare block sizes block(32,1) and have each thread loop over entries in the
second matrix dimension.
Is there some really obvious reason why, in my code below, threads that loop over the stride-1 memory are significantly slower than those that loop over the stride-N memory? I would have thought it was the other way around (if there is any difference at all).
Am I missing something really obvious (a bug, perhaps)?
The complete code is below.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>

__global__ void addmat_x(int m, int n, int* A, int *B)
{
    int idx, ix;
    int iy = threadIdx.y + blockIdx.y*blockDim.y;

    if (iy < n)
        for(ix = 0; ix < m; ix++) {
            idx = iy*m + ix; /* iy*m is constant */
            B[idx] = A[idx];
        }
}

__global__ void addmat_y(int m, int n, int* A, int *B)
{
    int ix = threadIdx.x + blockIdx.x*blockDim.x;
    int idx, iy;

    if (ix < m)
        for(iy = 0; iy < n; iy++) {
            idx = iy*m + ix;
            B[idx] = A[idx];
        }
}

double cpuSecond()
{
    struct timeval tp;
    gettimeofday(&tp,NULL);
    return (double) tp.tv_sec + (double)tp.tv_usec*1e-6;
}

int main(int argc, char** argv)
{
    int *A, *B;
    int *dev_A, *dev_B;
    size_t m, n, nbytes;
    double etime, start;

    m = 1 << 14;
    n = 1 << 14;
    nbytes = m*n*sizeof(int);

    A = (int*) malloc(nbytes);
    B = (int*) malloc(nbytes);
    memset(A,0,nbytes);

    cudaMalloc((void**) &dev_A, nbytes);
    cudaMalloc((void**) &dev_B, nbytes);
    cudaMemcpy(dev_A, A, nbytes, cudaMemcpyHostToDevice);

#if 1
    /* One thread per row */
    dim3 block(1,32);
    dim3 grid(1,(n+block.y-1)/block.y);
    start = cpuSecond();
    addmat_x<<<grid,block>>>(m,n,dev_A, dev_B);
#else
    /* One thread per column */
    dim3 block(32,1);
    dim3 grid((m+block.x-1)/block.x,1);
    start = cpuSecond();
    addmat_y<<<grid,block>>>(m,n,dev_A, dev_B);
#endif

    cudaDeviceSynchronize();
    etime = cpuSecond() - start;
    printf("GPU Kernel %10.3g (s)\n",etime);

    cudaFree(dev_A);
    cudaFree(dev_B);
    free(A);
    free(B);

    cudaDeviceReset();
}
Let's compare the global memory indexing generated by each thread in each case.
addmat_x:
Your block dimension is (1,32). This means 1 thread wide in x, 32 threads "long" in y. The threadIdx.x value for each thread will be 0. The threadIdx.y value for the threads in the warp will range from 0 to 31, as you move from thread to thread in the warp. With that, let's inspect your creation of idx in that kernel:
m = 1 << 14;
...
int iy = threadIdx.y + blockIdx.y*blockDim.y;
idx = iy*m + ix;
let's choose the first block, whose blockIdx.y is 0. Then:
idx = threadIdx.y*(1<<14) + ix;
For the first loop iteration, ix is 0. The idx values generated by each thread will be:
threadIdx.y: | idx:
          0  | 0
          1  | (1<<14)
          2  | 2*(1<<14)
        ...  | ...
         31  | 31*(1<<14)
For a given loop iteration, the distance between the load or store indices of adjacent threads will be 1<<14, i.e. not adjacent but scattered.
addmat_y:
Your block dimension is (32,1). This means 32 threads wide in x, 1 thread "long" in y. The threadIdx.y value for each thread will be 0. The threadIdx.x value for the threads in the warp will range from 0 to 31, as you move from thread to thread. Now let's inspect your creation of idx in that kernel:
m = 1 << 14;
...
int ix = threadIdx.x + blockIdx.x*blockDim.x;
idx = iy*m + ix;
Let's choose the first block, whose blockIdx.x is 0. Then:
idx = iy*m + threadIdx.x;
For the first loop iteration, iy is 0, so we simply have:
idx = threadIdx.x;
This generates the following index pattern across the warp:
threadIdx.x: | idx:
          0  | 0
          1  | 1
          2  | 2
        ...  | ...
         31  | 31
These indices are adjacent; it is not a scattered load or store, the addresses will coalesce nicely, and this represents "efficient" use of global memory. It will perform faster than the first case.
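As an additional illustration (not part of the original answer), the usual way to get coalesced access for a 2D copy is to let threadIdx.x map to the fastest-varying (column) index and cover both dimensions with a 2D grid, so no per-thread loop is needed at all. The kernel name addmat_2d and the launch shape below are my own sketch:

// Sketch only: a fully 2D copy kernel in which adjacent threads in a warp
// (consecutive threadIdx.x) touch adjacent memory locations, so every load
// and store coalesces.
__global__ void addmat_2d(int m, int n, const int* A, int* B)
{
    int ix = threadIdx.x + blockIdx.x * blockDim.x;   // column, fast-varying
    int iy = threadIdx.y + blockIdx.y * blockDim.y;   // row
    if (ix < m && iy < n)
        B[iy * m + ix] = A[iy * m + ix];
}

// Possible launch: 32x8 threads per block, grid covering the full matrix.
// dim3 block(32, 8);
// dim3 grid((m + block.x - 1) / block.x, (n + block.y - 1) / block.y);
// addmat_2d<<<grid, block>>>(m, n, dev_A, dev_B);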

CUDA Stride function is not working

The following code does not work. My expectation is that all the y[i] values are 3 after the kernel function add() is called. But if N >= (1 << 24) - 255, all the y[i] values are 2 (as if the kernel function add() did not run).
#include <iostream>

__global__ void add(int n, int *x, int *y) {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (int i = index; i < n; i += stride) y[i] = x[i] + y[i];
}

int main() {
    int *x, *y, N = (1 << 24) - 255; // 255 wrong / 256 ok
    cudaMallocManaged(&x, N * sizeof(int));
    cudaMallocManaged(&y, N * sizeof(int));
    for (int i = 0; i < N; ++i) { x[i] = 1; y[i] = 2; }

    int sz = 256;
    dim3 blockDim(sz,1,1);
    dim3 gridDim((N+sz-1)/sz,1,1);
    add<<<gridDim, blockDim>>>(N, x, y);
    cudaDeviceSynchronize();

    for (int i = 0; i < N; ++i) if (y[i] != 3) std::cout << "error" << std::endl;

    cudaFree(x);
    cudaFree(y);
    return 0;
}
The GPU is a GTX1080Ti and has the following limits:
Maximum number of threads per block: 1024
Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535)
Machine is X86_64 Linux Ubuntu 16.04. Am I doing something wrong here? Please help.
I did not specify -arch= when compiling this, so I ended up with -arch=sm_20, which is the default value. With -arch=sm_60 it now works, since the x dimension of the grid can be up to 2147483647 for compute capability 3.0 and above, whereas compute capability 2.x is limited to 65535 blocks in x, which is too small for the 65536 blocks this launch requires.
http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities
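As an alternative sketch (not from the original answer), since add() already uses a grid-stride loop, the launch in main could instead cap the number of blocks so it stays within even the compute-capability-2.x grid limit; N, x, y and add are the names from the question, and the error check is my own addition:

// Fragment replacing the launch in main: cap the grid size and let the
// grid-stride loop in add() cover the remaining elements.
int sz = 256;
int blocks = (N + sz - 1) / sz;
if (blocks > 65535) blocks = 65535;        // maximum x grid dimension on compute capability 2.x
add<<<blocks, sz>>>(N, x, y);
cudaError_t err = cudaGetLastError();      // would have reported the invalid launch configuration
if (err != cudaSuccess) std::cout << cudaGetErrorString(err) << std::endl;
cudaDeviceSynchronize();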

Calculating differences between consecutive indices fast

Given that I have the array below, and let Sum be 16:
dintptr = { 0, 2, 8, 11, 13, 15 }
I want to compute the difference between consecutive indices using the GPU. So the final array should be as follows:
count = { 2, 6, 3, 2, 2, 1 }
Below is my kernel:
//for this function n is 6
__global__ void kernel(int *dintptr, int * count, int n){
int id = blockDim.x * blockIdx.x + threadIdx.x;
__shared__ int indexes[256];
int need = (n % 256 ==0)?0:1;
int allow = 256 * ( n/256 + need);
while(id < allow){
if(id < n ){
indexes[threadIdx.x] = dintptr[id];
}
__syncthreads();
if(id < n - 1 ){
if(threadIdx.x % 255 == 0 ){
count[id] = indexes[threadIdx.x + 1] - indexes[threadIdx.x];
}else{
count[id] = dintptr[id+1] - dintptr[id];
}
}//end if id<n-1
__syncthreads();
id+=(gridDim.x * blockDim.x);
}//end while
}//end kernel
// For the last element, explicitly set count[n-1] = Sum - dintptr[n-1]
2 questions:
Is this kernel fast? Can you suggest a faster implementation?
Does this kernel handle arrays of arbitrary size? (I think it does.)
I'll bite.
__global__ void kernel(int *dintptr, int * count, int n)
{
    for (int id = blockDim.x * blockIdx.x + threadIdx.x;
         id < n-1;
         id += gridDim.x * blockDim.x)
        count[id] = dintptr[id+1] - dintptr[id];
}
(Since you said you "explicitly" set the value of the last element, and you didn't in your kernel, I didn't bother to set it here either.)
I don't see a lot of advantage to using shared memory in this kernel as you do: the L1 cache on Fermi should give you nearly the same advantage since your locality is high and reuse is low.
Both your kernel and mine appear to handle arbitrary-sized arrays. Yours however appears to assume blockDim.x == 256.
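For completeness, here is a minimal sketch of how the simplified kernel might be driven on the 6-element example from the question, with the last element set explicitly on the host as the question describes. It assumes the kernel above is compiled in the same file; the single-block, 256-thread launch is my own choice and any reasonable configuration would work thanks to the grid-stride loop.

#include <cstdio>
#include <cuda_runtime.h>

int main() {
    const int n = 6, Sum = 16;
    int h_dintptr[n] = { 0, 2, 8, 11, 13, 15 };
    int h_count[n];

    int *d_dintptr, *d_count;
    cudaMalloc(&d_dintptr, n * sizeof(int));
    cudaMalloc(&d_count,   n * sizeof(int));
    cudaMemcpy(d_dintptr, h_dintptr, n * sizeof(int), cudaMemcpyHostToDevice);

    kernel<<<1, 256>>>(d_dintptr, d_count, n);   // computes count[0..n-2]

    cudaMemcpy(h_count, d_count, n * sizeof(int), cudaMemcpyDeviceToHost);
    h_count[n-1] = Sum - h_dintptr[n-1];         // last element set explicitly on the host

    for (int i = 0; i < n; ++i) printf("%d ", h_count[i]);  // prints: 2 6 3 2 2 1
    printf("\n");

    cudaFree(d_dintptr);
    cudaFree(d_count);
    return 0;
}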