About the number of registers allocated per SM in CUDA

First Question.
The CUDA C Programming Guide says the following:
The same on-chip memory is used for both L1 and shared memory: It can
be configured as 48 KB of shared memory and 16 KB of L1 cache or as 16
KB of shared memory and 48 KB of L1 cache
But deviceQuery shows "Total number of registers available per block: 32768".
I use a GTX 580 (compute capability 2.0).
The guide says the default cache size is 16 KB, but 32768 registers means 32768 * 4 bytes = 131072 bytes = 128 KB. I don't know which is correct.
Second Question.
I set up the launch like this:
dim3 grid(32, 32); //blocks in a grid
dim3 block(16, 16); //threads in a block
kernel<<<grid,block>>>(...);
Then the number of threads per block is 256, so we need 256*N registers per block, where N is the number of registers each thread needs.
(256*N)*blocks is the number of registers per SM (a count of registers, not bytes).
So if the default size is 16 KB and threads per SM is at its maximum of 1536 ("Maximum number of threads per multiprocessor: 1536"), then N can't exceed 2, because
16 KB / 4 bytes = 4096 registers, and 4096 / 1536 = 2.666...
In the case of the larger 48 KB configuration, N can't exceed 8:
48 KB / 4 bytes = 12288 registers, and 12288 / 1536 = 8.
Is that true? I'm quite confused.
My nearly complete code is below.
I thought the kernel would be best optimized with a 16x16 block dimension, but with 8x8 it is faster than (or similar to) 16x16, and I don't know why.
The number of registers per thread is 16, and the shared memory usage is 80+16 bytes.
I asked the same question before, but couldn't get an exact answer:
The result of an experiment different from CUDA Occupancy Calculator
#define WIDTH 512
#define HEIGHT 512
#define TILE_WIDTH 8
#define TILE_HEIGHT 8
#define CHANNELS 3
#define DEVICENUM 1
#define HEIGHTs HEIGHT/DEVICENUM

__global__ void PRINT_POLYGON( unsigned char *IMAGEin, int *MEMin, char a, char b, char c){
    int Col = blockIdx.y*blockDim.y + threadIdx.y;            //Col is y coordinate
    int Row = blockIdx.x*blockDim.x + threadIdx.x;            //Row is x coordinate
    int tid_in_block = threadIdx.x + threadIdx.y*blockDim.x;
    int bid_in_grid = blockIdx.x + blockIdx.y*gridDim.x;
    int threads_per_block = blockDim.x * blockDim.y;
    int tid_in_grid = tid_in_block + threads_per_block * bid_in_grid;

    float result_a, result_b;
    __shared__ int M[15];
    for(int k = 0; k < 5; k++){
        M[k]    = MEMin[a*5+k];
        M[k+5]  = MEMin[b*5+k];
        M[k+10] = MEMin[c*5+k];
    }

    int result_a_up = (M[11]-M[1])*(Row-M[0]) - (M[10]-M[0])*(Col-M[1]);
    int result_b_up = (M[6] -M[1])*(M[0]-Row) - (M[5] -M[0])*(M[1]-Col);
    int result_down = (M[11]-M[1])*(M[5]-M[0]) - (M[6]-M[1])*(M[10]-M[0]);

    result_a = (float)result_a_up / (float)result_down;
    result_b = (float)result_b_up / (float)result_down;

    if((0 <= result_a && result_a <=1) && ((0 <= result_b && result_b <= 1)) && ((0 <= (result_a+result_b) && (result_a+result_b) <= 1))){
        IMAGEin[tid_in_grid*CHANNELS]   += M[2] + (M[7]-M[2])*result_a + (M[12]-M[2])*result_b;    //Red Channel
        IMAGEin[tid_in_grid*CHANNELS+1] += M[3] + (M[8]-M[3])*result_a + (M[13]-M[3])*result_b;    //Green Channel
        IMAGEin[tid_in_grid*CHANNELS+2] += M[4] + (M[9]-M[4])*result_a + (M[14]-M[4])*result_b;    //Blue Channel
    }
}

struct DataStruct {
    int deviceID;
    unsigned char IMAGE_SEG[WIDTH*HEIGHTs*CHANNELS];
};

void* routine( void *pvoidData ) {
    DataStruct *data = (DataStruct*)pvoidData;
    unsigned char *dev_IMAGE;
    int *dev_MEM;
    unsigned char *IMAGE_SEG = data->IMAGE_SEG;

    HANDLE_ERROR(cudaSetDevice(5));

    //initialize array
    memset(IMAGE_SEG, 0, WIDTH*HEIGHTs*CHANNELS);
    cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
    printf("Device %d Starting..\n", data->deviceID);

    //Evaluate Time
    cudaEvent_t start, stop;
    cudaEventCreate( &start );
    cudaEventCreate( &stop );
    cudaEventRecord(start, 0);

    HANDLE_ERROR( cudaMalloc( (void **)&dev_MEM, sizeof(int)*35) );
    HANDLE_ERROR( cudaMalloc( (void **)&dev_IMAGE, sizeof(unsigned char)*WIDTH*HEIGHTs*CHANNELS) );

    cudaMemcpy(dev_MEM, MEM, sizeof(int)*35, cudaMemcpyHostToDevice);
    cudaMemset(dev_IMAGE, 0, sizeof(unsigned char)*WIDTH*HEIGHTs*CHANNELS);

    dim3 grid(WIDTH/TILE_WIDTH, HEIGHTs/TILE_HEIGHT);    //blocks in a grid
    dim3 block(TILE_WIDTH, TILE_HEIGHT);                 //threads in a block

    PRINT_POLYGON<<<grid,block>>>( dev_IMAGE, dev_MEM, 0, 1, 2);
    PRINT_POLYGON<<<grid,block>>>( dev_IMAGE, dev_MEM, 0, 2, 3);
    PRINT_POLYGON<<<grid,block>>>( dev_IMAGE, dev_MEM, 0, 3, 4);
    PRINT_POLYGON<<<grid,block>>>( dev_IMAGE, dev_MEM, 0, 4, 5);
    PRINT_POLYGON<<<grid,block>>>( dev_IMAGE, dev_MEM, 3, 2, 4);
    PRINT_POLYGON<<<grid,block>>>( dev_IMAGE, dev_MEM, 2, 6, 4);

    HANDLE_ERROR( cudaMemcpy( IMAGE_SEG, dev_IMAGE, sizeof(unsigned char)*WIDTH*HEIGHTs*CHANNELS, cudaMemcpyDeviceToHost ) );
    HANDLE_ERROR( cudaFree( dev_MEM ) );
    HANDLE_ERROR( cudaFree( dev_IMAGE ) );

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime( &elapsed_time_ms[data->deviceID], start, stop );
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    elapsed_time_ms[DEVICENUM] += elapsed_time_ms[data->deviceID];

    printf("Device %d Complete!\n", data->deviceID);

    return 0;
}

The blockDim 8x8 is faster than 16x16 due to the increase in address divergence in your memory access when you increase the block size.
Metrics collected on GTX480 with 15 SMs.
metric              8x8       16x16
duration            161µs     114µs
issued_ipc          1.24      1.31
executed_ipc        0.88      0.59
serialization       54.61%    28.74%
The number of instruction replays clues us in that we likely have bad memory access patterns.
metric                      8x8       16x16
achieved occupancy          88.32%    30.76%
0 warp schedulers issues    8.81%     7.98%
1 warp schedulers issues    2.36%     29.54%
2 warp schedulers issues    88.83%    52.44%
16x16 appears to keep the warp scheduler busy. However, it is keeping the schedulers busy re-issuing instructions.
metric                           8x8        16x16
l1 global load trans             524,407    332,007
l1 global store trans            401,224    209,139
l1 global load trans/request     3.56       2.25
l1 global store trans/request    16.33      8.51
The first priority is to reduce transactions per request. The Nsight VSE source view can display memory statistics per instruction. The primary issue in your kernel is the interleaved U8 loads and stores for IMAGEin[] += value. At 16x16 this results in 16.3 transactions per request but only 8.3 for the 8x8 configuration.
Changing
IMAGEin[(i*HEIGHTs+j)*CHANNELS] += ...
to be consecutive increases performance of 16x16 by 3x. I imagine increasing channels to 4 and handling packing in the kernel will improve cache performance and memory throughput.
If you fix the number of memory transactions per request you will then likely have to look at execution dependencies and try to increase your ILP.
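As a rough illustration of that packing suggestion (a sketch, not the answerer's measured change), here is the same kernel rewritten against a hypothetical 4-channel uchar4 buffer IMAGEin4, so each thread performs one aligned 4-byte load and one 4-byte store instead of three interleaved 1-byte read-modify-writes:
__global__ void PRINT_POLYGON_packed( uchar4 *IMAGEin4, int *MEMin, char a, char b, char c){
    // same indexing as the original kernel
    int Col = blockIdx.y*blockDim.y + threadIdx.y;
    int Row = blockIdx.x*blockDim.x + threadIdx.x;
    int tid_in_block = threadIdx.x + threadIdx.y*blockDim.x;
    int bid_in_grid  = blockIdx.x + blockIdx.y*gridDim.x;
    int tid_in_grid  = tid_in_block + blockDim.x*blockDim.y*bid_in_grid;

    __shared__ int M[15];
    for(int k = 0; k < 5; k++){
        M[k]    = MEMin[a*5+k];
        M[k+5]  = MEMin[b*5+k];
        M[k+10] = MEMin[c*5+k];
    }

    // same arithmetic as the original kernel
    int result_a_up = (M[11]-M[1])*(Row-M[0]) - (M[10]-M[0])*(Col-M[1]);
    int result_b_up = (M[6] -M[1])*(M[0]-Row) - (M[5] -M[0])*(M[1]-Col);
    int result_down = (M[11]-M[1])*(M[5]-M[0]) - (M[6]-M[1])*(M[10]-M[0]);
    float result_a = (float)result_a_up / (float)result_down;
    float result_b = (float)result_b_up / (float)result_down;

    if(0 <= result_a && result_a <= 1 && 0 <= result_b && result_b <= 1
       && 0 <= result_a+result_b && result_a+result_b <= 1){
        uchar4 px = IMAGEin4[tid_in_grid];      // one coalesced 4-byte load per thread
        px.x += M[2] + (M[7]-M[2])*result_a + (M[12]-M[2])*result_b;    //Red
        px.y += M[3] + (M[8]-M[3])*result_a + (M[13]-M[3])*result_b;    //Green
        px.z += M[4] + (M[9]-M[4])*result_a + (M[14]-M[4])*result_b;    //Blue
        IMAGEin4[tid_in_grid] = px;             // one coalesced 4-byte store per thread
    }
}
In this sketch, IMAGEin4 would have to be allocated as WIDTH*HEIGHTs uchar4 elements in place of the 3-channel unsigned char buffer; the fourth channel is simply padding.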

It is faster with a block size of 8x8 because 8x8 blocks map to a smaller multiple of 32 threads. As visible in the picture below, 32 CUDA cores are bound together, with two different warp schedulers that actually schedule the same thing, so the same instruction is executed on these 32 cores in each execution cycle.
To clarify this: in the first case (8x8), each block is made of two warps (64 threads), so it is finished within only two execution cycles; however, when you use 16x16 as your block size, each block takes 8 warps (256 threads), therefore taking 4 times more execution cycles and resulting in a slower compound.
However, filling an SM with more warps is better in some cases: when memory access is frequent and each warp is likely to go into a memory stall (i.e., waiting for its operands to arrive from memory), it will be replaced by another warp until the memory operation completes, resulting in higher occupancy of the SM.
You should of course factor the number of blocks per SM and the total number of SMs into your calculations; for example, assigning more than 8 blocks to a single SM might reduce its occupancy. But you are probably not facing these issues in your case, because 256 is generally a better number than 64: it will balance your blocks among the SMs, whereas using 64 threads will result in more blocks being executed on the same SM.
EDIT: This answer is based on my speculations; for a more scientific approach, see Greg Smith's answer.
The register pool is different from shared memory/cache, down to the very bottom of the architecture!
Registers are made of flip-flops, and the L1 cache is probably SRAM.
Just to get an idea, look at the picture below, which represents the Fermi architecture, and then update your question to further specify the problem you are facing.
As a note, you can see how many registers and how much shared memory (smem) your functions use by passing the option --ptxas-options=-v to nvcc.
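For example, a compile line for the GTX 580 (compute capability 2.0) might look like this (the file name is just a placeholder); ptxas then prints one "ptxas info : Used ... registers, ... bytes smem" line per kernel:
nvcc -arch=sm_20 --ptxas-options=-v print_polygon.cu -o print_polygon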

Related

How to calculate the required heap size for dynamic memory allocation in a kernel?

I am facing the problem that if I set the CUDA heap size to the total amount of memory I need to allocate in a kernel, the heap is still not big enough to serve all allocations.
This is a minimal example which represents my use case:
#include <stdio.h>

#define NARR 8

__global__
void heaptest(int N){
    double* arr[NARR];
    __shared__ double* arrS[NARR];

    if(threadIdx.x == 0){
        for(int i = 0; i < NARR; i++){
            arrS[i] = (double*) malloc(sizeof(double) * N);
            if(arrS[i] == NULL)
                printf("block %d, array %d is NULL\n", blockIdx.x, i);
        }
    }
    __syncthreads();

    for(int i = 0; i < NARR; i++){
        arr[i] = arrS[i];
    }
}

size_t getHeapSizePerBlock(int N){
    return sizeof(double) * N * NARR;
}

int main(){
    int N = 4000 * 18;
    int nBlocks = 1;

    size_t myheapsize = getHeapSizePerBlock(N) * nBlocks;
    printf("set heap size to %lu\n", myheapsize);
    cudaDeviceSetLimit(cudaLimitMallocHeapSize, myheapsize);

    size_t a;
    cudaDeviceGetLimit(&a, cudaLimitMallocHeapSize);
    printf("heap size is now %lu\n", a);

    heaptest<<<nBlocks, 128>>>(N);

    cudaDeviceSynchronize();
    cudaDeviceReset();

    return 0;
}
I compile with nvcc V8.0.61.
nvcc -arch=sm_60 heaptest.cu -o heaptest
Program output is
set heap size to 4608000
heap size is now 4653056
block 0, array 7 is NULL
So even though the reported heap size is larger than the required size, it is not large enough.
How do I correctly calculate the required size in this case?
You may not be able to compute the exact size required by your application's heap, since you don't have control over CUDA's memory manager. Just as the OS's memory manager handles allocations of CPU memory, CUDA has its own memory manager for device-side malloc(). When you allocate multiple arrays on the heap, there is no guarantee that they will fit exactly within the heap size; there may be some overhead.
To illustrate, I made a small modification to your code to also print the memory address returned by malloc:
printf("block %d, array %d is %p\n", blockIdx.x, i, arrS[i]);
Here's what I get on my GTX 1070:
block 0, array 0 is 0x102059a8d20
block 0, array 1 is 0x10205600120
block 0, array 2 is 0x1020568f280
block 0, array 3 is 0x10205738520
block 0, array 4 is 0x102057c7680
block 0, array 5 is 0x10205870920
block 0, array 6 is 0x102058ffa80
block 0, array 7 is (nil)
The first thing to note is that the memory locations are not (always) contiguous/increasing (e.g., array 0 > array 6 > ... > array 1), but that's not too important here. Also, if you subtract consecutive memory addresses, you will not get the size you passed to malloc(), which in your case was always sizeof(double) * N, or 576000 bytes. For example:
0x1020568f280 - 0x10205600120 = 586080 bytes (array 1)
0x10205738520 - 0x1020568f280 = 692896 bytes (array 2)
Since these allocations are adjacent in memory, the differences show that each block occupies more space than the 576000 bytes passed to malloc(): there are an extra 10080 bytes between arrays 1 and 2, and an extra 116896 bytes between arrays 2 and 3 (that's more than 20% of the block size!).
What I would do is avoid allocating memory dynamically on the device heap and instead allocate it from host code. But if you really need to do it this way for some reason, I would suggest setting the heap size with some overhead margin, testing beforehand what seems to be enough. I would expect that, although there is some overhead per heap allocation, it shouldn't be too large, so maybe start with an extra 10% and go up from there if necessary.
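A minimal sketch of that margin idea, using a hypothetical helper and an assumed safety factor that you would tune experimentally on your device (drop it into the same .cu file):
// Hypothetical helper: pad the computed requirement by a safety factor,
// set the limit, and report what the runtime actually granted.
// Note: the heap limit must be set before launching any kernel that calls malloc().
void setHeapSizeWithMargin(size_t required, double safetyFactor){
    size_t padded = (size_t)(required * safetyFactor);
    cudaDeviceSetLimit(cudaLimitMallocHeapSize, padded);

    size_t granted = 0;
    cudaDeviceGetLimit(&granted, cudaLimitMallocHeapSize);
    printf("requested %lu bytes, granted %lu bytes\n",
           (unsigned long)padded, (unsigned long)granted);
}
In the example above you would call setHeapSizeWithMargin(getHeapSizePerBlock(N) * nBlocks, 1.25) in place of the plain cudaDeviceSetLimit() call, and lower the factor once you know how much slack your allocation pattern actually needs.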

Why does increasing the number of blocks in CUDA increase the time?

My understanding is that in CUDA, increasing the number of blocks will not increase the time, because they are executed in parallel. But in my code, if I double the number of blocks, the time doubles as well.
#include <cuda.h>
#include <curand.h>
#include <curand_kernel.h>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>

#define num_of_blocks 500
#define num_of_threads 512

__constant__ double y = 1.1;

// set seed for random number generator
__global__ void initcuRand(curandState* globalState, unsigned long seed){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    curand_init(seed, idx, 0, &globalState[idx]);
}

// kernel function for SIR
__global__ void test(curandState* globalState, double *dev_data){
    // global thread id
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    // local thread id
    int lidx = threadIdx.x;

    // create shared memory to store seeds
    __shared__ curandState localState[num_of_threads];

    // shared memory to store samples
    __shared__ double sample[num_of_threads];

    // copy global seed to local
    localState[lidx] = globalState[idx];
    __syncthreads();

    sample[lidx] = y + curand_normal_double(&localState[lidx]);

    if(lidx == 0){
        // save the first sample to dev_data;
        dev_data[blockIdx.x] = sample[0];
    }

    globalState[idx] = localState[lidx];
}

int main(){
    // create random number seeds;
    curandState *globalState;
    cudaMalloc((void**)&globalState, num_of_blocks*num_of_threads*sizeof(curandState));
    initcuRand<<<num_of_blocks, num_of_threads>>>(globalState, 1);

    double *dev_data;
    cudaMalloc((double**)&dev_data, num_of_blocks*sizeof(double));

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // Start record
    cudaEventRecord(start, 0);

    test<<<num_of_blocks, num_of_threads>>>(globalState, dev_data);

    // Stop event
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);

    float elapsedTime;
    cudaEventElapsedTime(&elapsedTime, start, stop); // that's our time!

    // Clean up:
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    std::cout << "Time ellapsed: " << elapsedTime << std::endl;

    cudaFree(dev_data);
    cudaFree(globalState);
    return 0;
}
The test result is:
number of blocks: 500, Time ellapsed: 0.39136.
number of blocks: 1000, Time ellapsed: 0.618656.
So what is the reason the time increases? Is it because I access constant memory, or because I copy data from shared memory to global memory? Are there ways to optimise it?
While the number of blocks being able to run in parallel can be large, it is still finite due to limited on-chip resources. If the number of blocks requested in a kernel launch exceeds that limit, any further blocks have to wait for earlier blocks to finish and free up their resources.
One limited resource is shared memory, of which your kernel uses 28 kilobytes. CUDA 8.0 compatible Nvidia GPUs offer between 48 and 112 kilobytes of shared memory per streaming multiprocessor (SM), so that the maximum number of blocks running at any one time is between 1× and 3× the number of SMs on your GPU.
Other limited resources are registers and various per-warp resources in the scheduler. The CUDA occupancy calculator is a convenient Excel spreadsheet (also works with OpenOffice/LibreOffice) that shows you how these resources limit the number of blocks per SM for a specific kernel. Compile the kernel adding the option --ptxas-options="-v" to the nvcc command line, locate the line saying "ptxas info : Used XX registers, YY bytes smem, zz bytes cmem[0], ww bytes cmem[2]", and enter XX, YY, the number of threads per block you are trying to launch, and the compute capability of your GPU into the spreadsheet. It will then show the maximum number of blocks that can run in parallel on one SM.
You don't mention the GPU you have been running the test on, so I'll use a GTX 980 as an example. It has 16 SMs with 96 KB of shared memory each, so at most 16×3=48 blocks can run in parallel. Had you not used shared memory, the maximum number of resident warps would have limited the number of blocks per SM to 4, allowing 64 blocks to run in parallel.
On any currently existing Nvidia GPU, your example requires at least about a dozen waves of blocks executing sequentially, explaining why doubling the number of blocks will also about double the runtime.
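If you would rather query this limit at runtime than use the spreadsheet, the occupancy API introduced in CUDA 6.5 reports the same number. A sketch (assuming it lives in the same .cu file as the test kernel above; the helper name is made up):
// Report how many blocks of `test` can be resident per SM, and in total,
// for the chosen block size. The runtime accounts for registers, static
// shared memory and the per-SM block/warp limits automatically.
void reportConcurrentBlocks(int threadsPerBlock){
    int device = 0;
    cudaGetDevice(&device);
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, device);

    int blocksPerSM = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSM, test,
                                                  threadsPerBlock,
                                                  0 /* dynamic shared memory */);
    printf("%d blocks per SM x %d SMs = %d blocks running concurrently\n",
           blocksPerSM, prop.multiProcessorCount,
           blocksPerSM * prop.multiProcessorCount);
}
Calling reportConcurrentBlocks(num_of_threads) before the timed launch shows how many of the 500 or 1000 requested blocks can actually execute at once; the rest are queued and run in subsequent waves.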

Cuda failed to run on blocks but ok on grids

Here is part of my CUDA code from cs344. The task is to convert an RGB picture to grayscale.
The code runs OK now, but when I use the code in the comments, it fails.
__global__
void rgba_to_greyscale(const uchar4* const rgbaImage,
                       unsigned char* const greyImage,
                       int numRows, int numCols)
{
    //int x = threadIdx.x ;
    //int y = threadIdx.y ;
    int x = blockIdx.x ;
    int y = blockIdx.y ;
    if (x < numCols && y < numRows)
    {
        uchar4 rgba = rgbaImage[y*numCols+x] ;
        float channelSum = .299f * rgba.x + .587f * rgba.y + .114f * rgba.z;
        greyImage[y*numCols+x] = channelSum;
    }
}

void your_rgba_to_greyscale(const uchar4 * const h_rgbaImage,
                            uchar4 * const d_rgbaImage, unsigned char* const d_greyImage,
                            size_t numRows, size_t numCols)
{
    // const dim3 blockSize(numCols, numRows, 1);  //TODO
    // const dim3 gridSize( 1, 1, 1);              //TODO
    const dim3 blockSize(1, 1, 1);                 //TODO
    const dim3 gridSize( numCols, numRows, 1);     //TODO

    std::cout << numCols << " " << numRows << std::endl ;    // numCols=557 numRows=313

    rgba_to_greyscale<<<gridSize, blockSize>>>(d_rgbaImage, d_greyImage, numRows, numCols);

    cudaDeviceSynchronize();
    checkCudaErrors(cudaGetLastError());
}
The code fails to run when I use the commented ones, with errors:
CUDA error at: /home/yc/cuda_prj/cs344_bak/Problem Sets/Problem Set 1/student_func.cu:90
invalid configuration argument cudaGetLastError()
Here is my deviceQuery report:
Detected 1 CUDA Capable device(s)
Device 0: "GeForce GTX 1080"
CUDA Driver Version / Runtime Version 8.0 / 8.0
CUDA Capability Major/Minor version number: 6.1
Total amount of global memory: 8110 MBytes (8504279040 bytes)
(20) Multiprocessors, (128) CUDA Cores/MP: 2560 CUDA Cores
GPU Max Clock rate: 1823 MHz (1.82 GHz)
Memory Clock rate: 5005 Mhz
Memory Bus Width: 256-bit
L2 Cache Size: 2097152 bytes
Maximum Texture Dimension Size (x,y,z) 1D=(131072), 2D=(131072, 65536), 3D=(16384, 16384, 16384)
Maximum Layered 1D Texture Size, (num) layers 1D=(32768), 2048 layers
Maximum Layered 2D Texture Size, (num) layers 2D=(32768, 32768), 2048 layers
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 49152 bytes
Total number of registers available per block: 65536
Warp size: 32
Maximum number of threads per multiprocessor: 2048
Maximum number of threads per block: 1024
Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535)
Maximum memory pitch: 2147483647 bytes
Texture alignment: 512 bytes
Concurrent copy and kernel execution: Yes with 2 copy engine(s)
Run time limit on kernels: Yes
Integrated GPU sharing Host Memory: No
Support host page-locked memory mapping: Yes
Alignment requirement for Surfaces: Yes
Device has ECC support: Disabled
Device supports Unified Addressing (UVA): Yes
Device PCI Domain ID / Bus ID / location ID: 0 / 1 / 0
Compute Mode:
< Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 8.0, CUDA Runtime Version = 8.0, NumDevs = 1, Device0 = GeForce GTX 1080
Can anyone tell me why?
Robert posted the answer that solves the problem:
The total number of threads per block is the product of the block dimensions. If your dimensions are numCols=557 and numRows=313, then their product is over 150,000. The limit on total threads per block on your device is 1024, as shown in the deviceQuery output: Maximum number of threads per block: 1024
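A sketch of the usual fix for this assignment (a common pattern, not necessarily Robert's exact code): keep the block well under the 1024-thread limit and cover the image with a 2D grid, indexing with both blockIdx and threadIdx:
// inside rgba_to_greyscale:
//   int x = blockIdx.x * blockDim.x + threadIdx.x;
//   int y = blockIdx.y * blockDim.y + threadIdx.y;

// inside your_rgba_to_greyscale:
const dim3 blockSize(16, 16, 1);                                 // 256 threads per block
const dim3 gridSize((numCols + blockSize.x - 1) / blockSize.x,   // round up so every column is covered
                    (numRows + blockSize.y - 1) / blockSize.y,   // round up so every row is covered
                    1);
rgba_to_greyscale<<<gridSize, blockSize>>>(d_rgbaImage, d_greyImage, numRows, numCols);
The existing if (x < numCols && y < numRows) guard in the kernel already discards the extra threads created by the rounding.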

Can CUDA branch divergence help me in this case?

This is little more than a thought experiment right now, but I want to check my understanding of the CUDA execution model. Consider the following case:
I am running on a GPU with poor double-precision performance (a non-Tesla card).
I have a kernel that needs to calculate a value using double precision. That value is a constant for the rest of the runtime of the kernel, and it is also constant across a warp.
Is something like the following pseudocode advantageous?
// value that we use later in the kernel; this is constant across all threads
// in a warp
int constant_value;

// check to see if this is the first thread in a warp
enum { warp_size = 32 };
if (!(threadIdx.x & (warp_size - 1)))
{
    // only do the double-precision math in one thread
    constant_value = (int) round(double_precision_calculation());
}

// broadcast constant_value to all threads in the warp
constant_value = __shfl(constant_value, 0);

// go on to use constant_value as needed later in the kernel
The reason why I considered doing this is my (possibly wrong) understanding of how double-precision resources are made available on each multiprocessor. From what I understand, there are simply 1/32 as many double-precision ALUs as single-precision ones on recent Geforce cards. Does this mean that if the other threads in a warp diverge, I can work around this lack of resources, and still get decent performance, as long as the double-precision values that I want can be broadcast to all threads in a warp?
Does this mean that if the other threads in a warp diverge, I can work around this lack of resources, and still get decent performance, as long as the double-precision values that I want can be broadcast to all threads in a warp?
No, you can't.
An instruction issue always occurs at the warp level, even in a warp-diverged scenario. Since it is issued at the warp level, it will require/use/schedule enough execution resources for the warp, even for inactive threads.
Therefore a computation done on only one thread will still use the same resources/scheduling slot as a computation done on all 32 threads in the warp.
For example, a floating point multiply will require 32 instances of usage of a floating point ALU. The exact scheduling of this will vary based on the specific GPU, but you cannot reduce the 32 instance usage to a lower number through warp divergence or any other mechanism.
Based on a question in the comments, here's a worked example on CUDA 7.5, Fedora 20, GT640 (GK208 - has 1/24 ratio of DP to SP units):
$ cat t1241.cu
#include <stdio.h>
#include <time.h>
#include <sys/time.h>

#define USECPSEC 1000000ULL

unsigned long long dtime_usec(unsigned long long start){
    timeval tv;
    gettimeofday(&tv, 0);
    return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}

const int nTPB = 32;
const int nBLK = 1;
const int rows = 1048576;
const int nSD = 128;

typedef double mytype;

template <bool use_warp>
__global__ void mpy_k(const mytype * in, mytype * out){
    __shared__ mytype sdata[nTPB*nSD];
    int idx = threadIdx.x + blockDim.x*blockIdx.x;
    mytype accum = in[idx];
#pragma unroll 128
    for (int i = 0; i < rows; i++)
        if (use_warp)
            accum += accum*sdata[threadIdx.x+(i&(nSD-1))*nTPB];
        else
            if (threadIdx.x == 0)
                accum += accum*sdata[threadIdx.x+(i&(nSD-1))*nTPB];
    out[idx] = accum;
}

int main(){
    mytype *din, *dout;
    cudaMalloc(&din, nTPB*nBLK*rows*sizeof(mytype));
    cudaMalloc(&dout, nTPB*nBLK*sizeof(mytype));
    cudaMemset(din, 0, nTPB*nBLK*rows*sizeof(mytype));
    cudaMemset(dout, 0, nTPB*nBLK*sizeof(mytype));

    mpy_k<true><<<nBLK, nTPB>>>(din, dout); // warm-up
    cudaDeviceSynchronize();
    unsigned long long dt = dtime_usec(0);
    mpy_k<true><<<nBLK, nTPB>>>(din, dout);
    cudaDeviceSynchronize();
    dt = dtime_usec(dt);
    printf("full warp elapsed time: %f\n", dt/(float)USECPSEC);

    mpy_k<false><<<nBLK, nTPB>>>(din, dout); // warm-up
    cudaDeviceSynchronize();
    dt = dtime_usec(0);
    mpy_k<false><<<nBLK, nTPB>>>(din, dout);
    cudaDeviceSynchronize();
    dt = dtime_usec(dt);
    printf("one thread elapsed time: %f\n", dt/(float)USECPSEC);

    cudaError_t res = cudaGetLastError();
    if (res != cudaSuccess) printf("CUDA runtime failure %s\n", cudaGetErrorString(res));
    return 0;
}
$ nvcc -arch=sm_35 -o t1241 t1241.cu
$ CUDA_VISIBLE_DEVICES="1" ./t1241
full warp elapsed time: 0.034346
one thread elapsed time: 0.049174
$
As the timings show, it is not faster to use just one thread in the warp for a floating-point multiply.

CUDA - Memory Limit - Vector Summation

I'm trying to learn CUDA, and the following code works OK for values of N <= 16384 but fails for greater values (the summation check at the end of the code fails; c values are always 0 for index values i >= 16384).
#include <iostream>
#include "cuda_runtime.h"
#include "../cuda_be/book.h"

#define N (16384)

__global__ void add(int *a, int *b, int *c)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if(tid < N)
    {
        c[tid] = a[tid] + b[tid];
        tid += blockDim.x * gridDim.x;
    }
}

int main()
{
    int a[N], b[N], c[N];
    int *dev_a, *dev_b, *dev_c;

    //allocate mem on gpu
    HANDLE_ERROR(cudaMalloc((void**)&dev_a, N*sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void**)&dev_b, N*sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void**)&dev_c, N*sizeof(int)));

    for(int i = 0; i < N; i++)
    {
        a[i] = -i;
        b[i] = i*i;
    }

    HANDLE_ERROR(cudaMemcpy(dev_a, a, N*sizeof(int), cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpy(dev_b, b, N*sizeof(int), cudaMemcpyHostToDevice));

    system("PAUSE");
    add<<<128,128>>>(dev_a, dev_b, dev_c);

    //copy the array 'c' back from the gpu to the cpu
    HANDLE_ERROR(cudaMemcpy(c, dev_c, N*sizeof(int), cudaMemcpyDeviceToHost));

    system("PAUSE");
    bool success = true;
    for(int i = 0; i < N; i++)
    {
        if((a[i] + b[i]) != c[i])
        {
            printf("Error in %d: %d + %d != %d\n", i, a[i], b[i], c[i]);
            system("PAUSE");
            success = false;
        }
    }
    if(success) printf("We did it!\n");

    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
    return 0;
}
I think it's a shared-memory-related problem, but I can't come up with a good explanation (possibly a lack of knowledge on my part). Could you give me an explanation and a workaround so it runs for values of N greater than 16384? Here are the specs of my GPU:
General Info for device 0
Name: GeForce 9600M GT
Compute capability: 1.1
Clock rate: 1250000
Device copy overlap : Enabled
Kernel Execution timeout : Enabled
Mem info for device 0
Total global mem: 536870912
Total const mem: 65536
Max mem pitch: 2147483647
Texture Alignment 256
MP info about device 0
Multiproccessor count: 4
Shared mem per mp: 16384
Registers per mp: 8192
Threads in warp: 32
Max threads per block: 512
Max thread dimensions: (512,512,64)
Max grid dimensions: (65535,65535,1)
You probably intended to write
while(tid<N)
not
if(tid<N)
You aren't running out of shared memory; your vector arrays are being copied into your device's global memory. As you can see, this has far more space available than the 196608 bytes (16384*4*3) you need.
The reason for your problem is that you are only performing one addition per thread, so with this structure the maximum size your vectors can be is the blocks*threads product of your kernel launch parameters, as tera has pointed out. By correcting
if(tid<N)
to
while(tid<N)
in your code, each thread will perform its addition on multiple indexes and the whole array will be covered.
For more information about the memory hierarchy and the various different places memory can sit, you should read sections 2.3 and 5.3 of the CUDA_C_Programming_Guide.pdf provided with the CUDA toolkit.
Hope that helps.
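For reference, this is what the kernel looks like with that one-line change applied (a grid-stride loop); nothing else in the program needs to change:
__global__ void add(int *a, int *b, int *c)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    while(tid < N)                      // loop instead of a single guarded addition
    {
        c[tid] = a[tid] + b[tid];
        tid += blockDim.x * gridDim.x;  // stride by the total number of launched threads
    }
}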
If N is:
#define N (33 * 1024) //value defined in CUDA by Example
then this is the same code I found in CUDA by Example, but with a different value of N. I think the value of N can't be 33 * 1024 with this launch configuration; I must change the number of blocks and the number of threads per block, because
add<<<128,128>>>(dev_a,dev_b,dev_c); //16384 threads
launches only 128 * 128 = 16384 threads, and (128 * 128) < (33 * 1024), so we have a problem: not all elements are covered.