CUDA - Malloc inside kernel (compute_50,sm_50)

I had a problem while running a program with the CUDA Memory Checker.
In other threads on Stack Overflow, the main problem with using malloc inside a kernel was that "compute_50,sm_50" was not set properly. Here the code compiles, so that is not the problem.
The problem is now solved, but I don't understand why the new code solved it.
My question is: why does it work now?
Old code:
__device__ unsigned int draw_active_levels(curandState * localState, const int num_levels_max){
    unsigned int return_value = 0;
    float draw;
    draw = curand_uniform(localState);
    int num_active_levels = floorf(draw * (num_levels_max - 1)) + 1;
    double * arrLevelWeights = (double*) malloc((num_levels_max+1) * sizeof(double));
    arrLevelWeights[num_levels_max] = 0.0; //<-------- Error on this line
    double level_weights = 1.0 / num_levels_max;
    for(int i=0; i<num_levels_max; i++){
        arrLevelWeights[i] = level_weights;
    }
    //...
    //do some operations using arrLevelWeights
    //..
    free(arrLevelWeights);
    return return_value;
}
Error with old code:
Memory Checker detected 2 access violations.
error = access violation on store (global memory)
gridid = 198
blockIdx = {1,0,0}
threadIdx = {29,0,0}
address = 0x00000020
accessSize = 8
New code:
I just added a few lines to check if malloc returned a null pointer.
__device__ unsigned int draw_active_levels(curandState * localState, const int num_levels_max){
    unsigned int return_value = 0;
    float draw;
    draw = curand_uniform(localState);
    int num_active_levels = floorf(draw * (num_levels_max - 1)) + 1;
    double * arrLevelWeights;
    arrLevelWeights = (double*) malloc((num_levels_max+1) * sizeof(double));
    if(arrLevelWeights == NULL){
        printf("Error while dynamically allocating memory on device.\n"); //<--- this line is never called (I put a breakpoint on it)
    }
    arrLevelWeights[num_levels_max] = 0.0; //<------- Error disappeared!
    double level_weights = 1.0 / num_levels_max;
    for(int i=0; i<num_levels_max; i++){
        arrLevelWeights[i] = level_weights;
    }
    //...
    //do some operations using arrLevelWeights
    //..
    free(arrLevelWeights);
    return return_value;
}

If malloc returns NULL, it simply means that you've run out of device heap space which has, by default, a size of 8 MB. I'm not sure how adding a line that is never executed fixes the problem, though.
As you said in a comment, you ran out of heap space because of a missing free somewhere else in your code, which is why I suggest you use RAII (with your own smart pointer class) for memory allocations to avoid this kind of problem in the future.
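If heap exhaustion is the culprit, two things tend to help in practice. Below is a minimal sketch, not the asker's code: DeviceBuffer is a hypothetical helper and 64 MB is an arbitrary figure. It raises the device heap limit with cudaDeviceSetLimit before any kernel that uses in-kernel malloc, and wraps the allocation in a small RAII class so a missed free cannot leak.
#include <cstdio>
#include <cuda_runtime.h>

// Hypothetical device-side RAII wrapper (not a library type): the destructor
// frees the buffer when the object goes out of scope, so an early return
// cannot leak device heap space.
struct DeviceBuffer {
    double* ptr;
    __device__ explicit DeviceBuffer(size_t n)
        : ptr(static_cast<double*>(malloc(n * sizeof(double)))) {}
    __device__ ~DeviceBuffer() { free(ptr); }
    __device__ double& operator[](size_t i) { return ptr[i]; }
    __device__ bool valid() const { return ptr != nullptr; }
};

int main() {
    // Raise the device heap limit (8 MB by default). This must happen before
    // the first kernel that uses in-kernel malloc is launched; 64 MB is only
    // an example figure.
    cudaDeviceSetLimit(cudaLimitMallocHeapSize, 64 * 1024 * 1024);
    // ... launch the kernels that call draw_active_levels() here ...
    return 0;
}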

The way to properly do multiple CUDA block synchronization

I would like to synchronize multiple CUDA blocks. This is not per-block synchronization, which __syncthreads() handles easily.
I saw there are existing discussions on this topic, for example cuda block synchronization, and I like the simple solution brought up by @johan, https://stackoverflow.com/a/67252761/3188690, which essentially uses a 64-bit counter to track the synchronized blocks.
However, I wrote the following code to accomplish a similar job and ran into a problem. Here I use the term environment, meaning that the wkBlocksPerEnv blocks within each of the wkNumberEnvs environments shall be synchronized. Each environment has a counter, and I use atomicAdd() to count how many blocks have already synchronized; once the number of synced blocks equals wkBlocksPerEnv, I know all blocks have finished syncing and are free to go. However, it has a strange outcome that I cannot explain.
The problem comes from the while loop. Since the first thread of each block does the atomicAdd, there is a while loop that spins until the condition is met. But I find that some blocks get stuck in an endless loop, and I am not sure why the condition is never satisfied. Strangely, if I printf some messages either at *** I can print here 1 or at *** I can print here 2, there is no endless loop and everything is perfect. I do not see anything obvious.
const int wkBlocksPerEnv = 2;
__device__ int env_sync_block_count[wkNumberEnvs];

__device__ void syncthreads_for_env(){
    // sync the threads of this block so all of them have finished the previous tasks
    __syncthreads();
    // sync the wkBlocksPerEnv blocks of each environment
    if(wkBlocksPerEnv > 1){
        const int kThisEnvId = get_env_scope_block_id(blockIdx.x);
        if (threadIdx.x == 0){
            // increment env_sync_block_count by 1
            atomicAdd(&env_sync_block_count[kThisEnvId], 1);
            // *** I can print here 1
            while(env_sync_block_count[kThisEnvId] != wkBlocksPerEnv){
                // *** I can print here 2
            }
            // Do the next job ...
        }
    }
}
There are two potential issues with your code: caching and block scheduling.
Caching can prevent you from observing an updated value during the while loop.
Block scheduling can cause a deadlock if you wait for an update from a block which has not yet been scheduled. Since CUDA does not guarantee a specific order of scheduled blocks, the only way to prevent this deadlock is to limit the number of blocks in the grid such that all blocks can run simultaneously.
The following code shows how you could synchronize multiple blocks while avoiding the above issues. I adapted the code from the multi-grid synchronization in the CUDA sample conjugateGradientMultiDeviceCG: https://github.com/NVIDIA/cuda-samples/blob/master/Samples/4_CUDA_Libraries/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG.cu#L186
On pre-Volta devices it uses volatile memory accesses; Volta and later use acquire/release semantics.
The grid size is limited by querying device properties.
#include <cassert>
#include <cstdio>
constexpr int wkBlocksPerEnv = 13;
__device__
int getEnv(int blockId){
return blockId / wkBlocksPerEnv;
}
__device__
int getRankInEnv(int blockId){
return blockId % wkBlocksPerEnv;
}
__device__
unsigned char load_arrived(unsigned char *arrived) {
#if __CUDA_ARCH__ < 700
return *(volatile unsigned char *)arrived;
#else
unsigned int result;
asm volatile("ld.acquire.gpu.global.u8 %0, [%1];"
: "=r"(result)
: "l"(arrived)
: "memory");
return result;
#endif
}
__device__
void store_arrived(unsigned char *arrived,
unsigned char val) {
#if __CUDA_ARCH__ < 700
*(volatile unsigned char *)arrived = val;
#else
unsigned int reg_val = val;
asm volatile(
"st.release.gpu.global.u8 [%1], %0;" ::"r"(reg_val) "l"(arrived)
: "memory");
// Avoids compiler warnings from unused variable val.
(void)(reg_val = reg_val);
#endif
}
#if 0
// Wrong implementation which does not synchronize; used to check that the kernel assert does trigger without proper synchronization
__device__
void syncthreads_for_env(unsigned char* temp){
}
#else
//temp must have at least size sizeof(unsigned char) * total_number_of_blocks in grid
__device__
void syncthreads_for_env(unsigned char* temp){
__syncthreads();
const int env = getEnv(blockIdx.x);
const int blockInEnv = getRankInEnv(blockIdx.x);
unsigned char* const mytemp = temp + env * wkBlocksPerEnv;
if(threadIdx.x == 0){
if(blockInEnv == 0){
// Leader block waits for others to join and then releases them.
// Other blocks in env can arrive in any order, so the leader has to wait for
// all others.
for (int i = 0; i < wkBlocksPerEnv - 1; i++) {
while (load_arrived(&mytemp[i]) == 0)
;
}
for (int i = 0; i < wkBlocksPerEnv - 1; i++) {
store_arrived(&mytemp[i], 0);
}
__threadfence();
}else{
// Other blocks in env note their arrival and wait to be released.
store_arrived(&mytemp[blockInEnv - 1], 1);
while (load_arrived(&mytemp[blockInEnv - 1]) == 1)
;
}
}
__syncthreads();
}
#endif
__global__
void kernel(unsigned char* synctemp, int* array){
const int env = getEnv(blockIdx.x);
const int blockInEnv = getRankInEnv(blockIdx.x);
if(threadIdx.x == 0){
array[blockIdx.x] = 1;
}
syncthreads_for_env(synctemp);
if(threadIdx.x == 0){
int sum = 0;
for(int i = 0; i < wkBlocksPerEnv; i++){
sum += array[env * wkBlocksPerEnv + i];
}
assert(sum == wkBlocksPerEnv);
}
}
int main(){
const int smem = 0;
const int blocksize = 128;
int deviceId = 0;
int numSMs = 0;
int maxBlocksPerSM = 0;
cudaGetDevice(&deviceId);
cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, deviceId);
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&maxBlocksPerSM,
kernel,
blocksize,
smem
);
int maxBlocks = maxBlocksPerSM * numSMs;
maxBlocks -= maxBlocks % wkBlocksPerEnv; //round down to nearest multiple of wkBlocksPerEnv
printf("wkBlocksPerEnv %d, maxBlocks: %d\n", wkBlocksPerEnv, maxBlocks);
int* d_array;
unsigned char* d_synctemp;
cudaMalloc(&d_array, sizeof(int) * maxBlocks);
cudaMalloc(&d_synctemp, sizeof(unsigned char) * maxBlocks);
cudaMemset(d_synctemp, 0, sizeof(unsigned char) * maxBlocks);
kernel<<<maxBlocks, blocksize>>>(d_synctemp, d_array);
cudaFree(d_synctemp);
cudaFree(d_array);
return 0;
}
The atomic value goes to global memory, but in the while loop you read it directly, so it is most likely served from a cache that is not automatically kept coherent between blocks (cache coherence is only enforced by explicit synchronization such as a threadfence). The thread that did the atomicAdd sees its own update, but other threads may not see it.
Even if you use a threadfence, the threads in the same warp could deadlock, waiting forever, if they were the first to check the value before any other thread updated it. It should work, though, on the newest GPUs that support independent thread scheduling.
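To make the spin loop itself observe the update, the read has to bypass the incoherent per-SM cache, for example with an atomic read (an atomicAdd of zero). A minimal sketch against the asker's variables; the block-scheduling deadlock discussed in the other answer still applies:
if (threadIdx.x == 0) {
    // publish this block's arrival
    atomicAdd(&env_sync_block_count[kThisEnvId], 1);
    __threadfence();   // make this block's earlier global writes visible
    // atomicAdd of 0 is an atomic read: it always goes to global memory
    // instead of being served from a stale cache line
    while (atomicAdd(&env_sync_block_count[kThisEnvId], 0) != wkBlocksPerEnv) {
        // spin
    }
}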
I like to do CUDA synchronization for multiple blocks.
You should learn to dislike it. Synchronization is always costly, even when implemented just right, and inter-core synchronization all the more so.
if (threadIdx.x == 0){
// incrementing env_sync_block_count by 1
atomicAdd(&env_sync_block_count[kThisEnvId], 1);
while(env_sync_block_count[kThisEnvId] != wkBlocksPerEnv)
// OH NO!!
{
}
}
This is bad. With this code, the first warp of each block will perform repeated reads of env_sync_block_count[kThisEnvId]. First, as @AbatorAbetor mentioned, you will face the problem of cache incoherence, causing your blocks to potentially read a stale value from a local cache well after the global value has long changed.
Also, your blocks will hog the multiprocessors. Blocks will stay resident and have at least one active warp, indefinitely. Who's to say they will be evicted from their multiprocessor so that additional blocks can be scheduled to execute? If I were the GPU, I wouldn't allow more and more active blocks to pile up. Even if you don't deadlock, you'll be wasting a lot of time.
Now, @AbatorAbetor's answer avoids the deadlock by limiting the grid size. And I guess that works. But unless you have a very good reason to write your kernels this way, the real solution is to just break up your algorithm into consecutive kernels (or better yet, figure out how to avoid the need to synchronize altogether).
A mid-way approach is to let only some blocks get past the point of synchronization. You could do that by waiting only on a condition which holds for a very limited number of blocks (say you had a single workgroup: then only the blocks which got the last K possible counter values wait).
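If grid-wide synchronization really is required, the officially supported route is cooperative groups. A minimal sketch, assuming a device that reports cudaDevAttrCooperativeLaunch, a grid small enough to be co-resident, and compilation with -rdc=true:
#include <cooperative_groups.h>
#include <cuda_runtime.h>
namespace cg = cooperative_groups;

__global__ void twoPhaseKernel(int* data) {
    cg::grid_group grid = cg::this_grid();
    if (threadIdx.x == 0) {
        data[blockIdx.x] = 1;   // phase 1: every block writes its slot
    }
    grid.sync();                // every block in the grid waits here
    // phase 2: any block may now read every block's phase-1 result
}

int main() {
    const int blocks = 8, threads = 128;
    int* d_data;
    cudaMalloc(&d_data, blocks * sizeof(int));
    void* args[] = { &d_data };
    // A cooperative launch is required for grid.sync(); the <<< >>> syntax is not enough.
    cudaLaunchCooperativeKernel((void*)twoPhaseKernel, dim3(blocks), dim3(threads),
                                args, 0, nullptr);
    cudaDeviceSynchronize();
    cudaFree(d_data);
    return 0;
}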

Cuda, two streams created by a NPP function

I'm working on an image processing project with CUDA 7.5 and a GeForce GTX 650 Ti. I decided to use 2 streams: one where I apply the algorithms responsible for enhancing the image, and another where I apply an algorithm that is independent of the rest of the processing.
I wrote an example to show my problem. In this example I created a stream and then used nppSetStream.
I invoked the function nppiThreshold_LTValGTVal_32f_C1R, but 2 streams are used when the function is executed.
Here is a code example:
#include <npp.h>
#include <cuda_runtime.h>
#include <cuda_profiler_api.h>
int main(void) {
int srcWidth = 1344;
int srcHeight = 1344;
int paddStride = 0;
float* srcArrayDevice;
float* srcArrayDevice2;
unsigned char* dstArrayDevice;
int status = cudaMalloc((void**)&srcArrayDevice, srcWidth * srcHeight * 4);
status = cudaMalloc((void**)&srcArrayDevice2, srcWidth * srcHeight * 4);
status = cudaMalloc((void**)&dstArrayDevice, srcWidth * srcHeight );
cudaStream_t testStream;
cudaStreamCreateWithFlags(&testStream, cudaStreamNonBlocking);
nppSetStream(testStream);
NppiSize roiSize = { srcWidth,srcHeight };
//status = cudaMemcpyAsync(srcArrayDevice, &srcArrayHost, srcWidth*srcHeight*4, cudaMemcpyHostToDevice, testStream);
int yRect = 100;
int xRect = 60;
float thrL = 50;
float thrH = 1500;
NppiSize sz = { 200, 400 };
for (int i = 0; i < 10; i++) {
int status3 = nppiThreshold_LTValGTVal_32f_C1R(srcArrayDevice + (srcWidth*yRect + xRect)
, srcWidth * 4
, srcArrayDevice2 + (srcWidth*yRect + xRect)
, srcWidth * 4
, sz
, thrL
, thrL
, thrH
, thrH);
}
int length = (srcWidth + paddStride)*srcHeight;
int status6 = nppiScale_32f8u_C1R(srcArrayDevice, srcWidth * 4, dstArrayDevice + paddStride, srcWidth + paddStride, roiSize, 0, 65535);
//int status7 = cudaMemcpyAsync(dstPinPtr, dstTest, length, cudaMemcpyDeviceToHost, testStream);
cudaFree(srcArrayDevice);
cudaFree(srcArrayDevice2);
cudaFree(dstArrayDevice);
cudaStreamDestroy(testStream);
cudaProfilerStop();
return 0;
}
This is what I got from the NVIDIA Visual Profiler: image_width1344
Why are there two streams if I set only one stream? This causes errors in my original project, so I'm thinking of switching to a single stream.
I noticed that this behaviour depends on the size of the image: if srcWidth and srcHeight are set to 1500, the result is this: image_width1500.
Why does changing the size of the image produce another stream?
Why are there two streams if I set only one stream?
It appears that nppiThreshold_LTValGTVal_32f_C1R creates its own internal stream for executing one of the kernels it uses. The other is launched either into the default stream, or the stream you specified with nppSetStream.
I think this is really a documentation oversight/user expectation problem. nppSetStream is doing what it says, but nowhere is it stated that the library is limited to using one stream. It probably should be more explicit in the documentation about how many streams the library uses internally, and how nppSetStream interacts with the library. If this is a problem for your application, I suggest you raise a bug report with NVIDIA.
Why does changing the size of the image produce another stream?
My guess would be that there are some performance heuristics at work, and whether the second stream is used depends on the image size. The library is closed source, however, so I can't say for sure.
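One practical check in this situation is to confirm which stream NPP believes it is bound to. A small sketch: nppGetStream is part of the NPP core API, while the *_Ctx entry points mentioned in the comments only exist in newer NPP releases, not necessarily in CUDA 7.5.
#include <npp.h>
#include <cuda_runtime.h>
#include <cstdio>

int main() {
    cudaStream_t testStream;
    cudaStreamCreateWithFlags(&testStream, cudaStreamNonBlocking);
    nppSetStream(testStream);
    // Verify the stream NPP reports it will use for subsequent calls.
    printf("npp stream: %p, my stream: %p\n",
           (void*)nppGetStream(), (void*)testStream);
    // Newer NPP releases also provide *_Ctx variants of the image-processing
    // functions that take an NppStreamContext, making the stream explicit per
    // call instead of a process-wide setting.
    cudaStreamDestroy(testStream);
    return 0;
}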

Cost of OpenCL get_local_id()

I have a simple scan kernel, which calculates scans of several blocks in a loop. I noticed that performance rises somewhat when the result of get_local_id() is stored in a local variable instead of the function being called inside the loop. So to summarize with code, this:
__kernel void LocalScan_v0(__global const int *p_array, int n_array_size, __global int *p_scan)
{
const int n_group_offset = get_group_id(0) * SCAN_BLOCK_SIZE;
p_array += n_group_offset;
p_scan += n_group_offset;
// calculate group offset
const int li = get_local_id(0); // *** local id cached ***
const int gn = get_num_groups(0);
__local int p_workspace[SCAN_BLOCK_SIZE];
for(int i = n_group_offset; i < n_array_size; i += SCAN_BLOCK_SIZE * gn) {
LocalScan_SingleBlock(p_array, p_scan, p_workspace, li);
p_array += SCAN_BLOCK_SIZE * gn;
p_scan += SCAN_BLOCK_SIZE * gn;
}
// process all the blocks in the array (each block size SCAN_BLOCK_SIZE)
}
has a throughput of 74 GB/s on a GTX 780, while this:
__kernel void LocalScan_v0(__global const int *p_array, int n_array_size, __global int *p_scan)
{
const int n_group_offset = get_group_id(0) * SCAN_BLOCK_SIZE;
p_array += n_group_offset;
p_scan += n_group_offset;
// calculate group offset
const int gn = get_num_groups(0);
__local int p_workspace[SCAN_BLOCK_SIZE];
for(int i = n_group_offset; i < n_array_size; i += SCAN_BLOCK_SIZE * gn) {
LocalScan_SingleBlock(p_array, p_scan, p_workspace, get_local_id(0));
// *** local id polled inside the loop ***
p_array += SCAN_BLOCK_SIZE * gn;
p_scan += SCAN_BLOCK_SIZE * gn;
}
// process all the blocks in the array (each block size SCAN_BLOCK_SIZE)
}
has only 70 GB/s on the same hardware. The only difference is whether the call to get_local_id() is inside or outside of the loop. The code in LocalScan_SingleBlock() is pretty much as described in this GPU Gems article.
Now this brings up some questions. I always imagined that the thread id is stored in some register and that access to it is as fast as access to any thread-local variable. That doesn't seem to be the case. I was always in the habit of caching the local id in a variable, with the reluctance of an old "C" programmer who wouldn't call a function in a loop if he expected it to return the same value every time, but I didn't seriously think it would make any difference.
Any ideas as to why this might be? I didn't do any checking on the compiled binary code. Does anyone have the same experience? Is it the same with threadIdx.x in CUDA? How about ATI platforms? Is this behavior described somewhere? I quickly scanned through CUDA Best Practices, but didn't find anything.
This is just a guess, but as per the Khronos page
http://www.khronos.org/registry/cl/sdk/1.0/docs/man/xhtml/get_local_id.html
get_local_id() isn't defined to return a constant value (merely a size_t). That may mean that the compiler is not allowed to perform certain optimisations that would be possible with a constant local_id, because as far as the compiler is concerned the function's return value may change between calls (even though it won't, per thread).
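Since the question mentions not having checked the compiled code, one practical follow-up is to dump the program binary and diff the two kernel versions. A sketch, assuming the cl_program was built for a single device (on NVIDIA the returned "binary" is PTX text):
#include <CL/cl.h>
#include <stdio.h>
#include <stdlib.h>

/* Dump the compiled binary of an already-built cl_program (single device
 * assumed), so it is easy to see whether get_local_id() is re-evaluated
 * inside the loop. */
void dump_program_binary(cl_program program, const char *path)
{
    size_t binary_size = 0;
    clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES,
                     sizeof(binary_size), &binary_size, NULL);
    unsigned char *binary = (unsigned char *)malloc(binary_size);
    unsigned char *binaries[1] = { binary };
    clGetProgramInfo(program, CL_PROGRAM_BINARIES,
                     sizeof(binaries), binaries, NULL);
    FILE *f = fopen(path, "wb");
    fwrite(binary, 1, binary_size, f);
    fclose(f);
    free(binary);
}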

How to synchronize global memory between multiple kernel launches?

I want to launch the following kernel multiple times in a FOR LOOP (pseudo):
__global__ void kernel(t_dev is input array in global mem) {
__shared__ PREC tt[BLOCK_DIM];
if (thid < m) {
tt[thid] = t_dev.data[ii]; // MEM READ!
}
... // MODIFY
__syncthreads();
if (thid < m) {
t_dev.data[thid] = tt[thid]; // MEM WRITE!
}
__threadfence(); // or __syncthreads(); //// NECESSARY!! but why?
}
What I do conceptually is: I read values from t_dev, modify them, and write them out to global memory again, and then I start the same kernel again!
Why do I apparently need the __threadfence() or __syncthreads()?
Otherwise the result comes out wrong, because memory writes are not finished when the same kernel starts again. That's what happens here; my GTX 580 has device overlap enabled.
But why are global memory writes not finished when the next kernel starts? Is this because of device overlap, or is it always like that? I thought that when we launch kernel after kernel, memory writes/reads are finished after each kernel... :-)
Thanks for your answers!
SOME CODE :
for(int kernelAIdx = 0; kernelAIdx < loops; kernelAIdx++){
proxGPU::sorProxContactOrdered_1threads_StepA_kernelWrap<PREC,SorProxSettings1>(
mu_dev,x_new_dev,T_dev,x_old_dev,d_dev,
t_dev,
kernelAIdx,
pConvergedFlag_dev,
m_absTOL,m_relTOL);
proxGPU::sorProx_StepB_kernelWrap<PREC,SorProxSettings1>(
t_dev,
T_dev,
x_new_dev,
kernelAIdx
);
}
These are the two kernels which are in the loop; the t_dev and x_new_dev data is moved from Step A to Step B.
Kernel A looks as follows:
template<typename PREC, int THREADS_PER_BLOCK, int BLOCK_DIM, int PROX_PACKAGES, typename TConvexSet>
__global__ void sorProxContactOrdered_1threads_StepA_kernel(
utilCuda::Matrix<PREC> mu_dev,
utilCuda::Matrix<PREC> y_dev,
utilCuda::Matrix<PREC> T_dev,
utilCuda::Matrix<PREC> x_old_dev,
utilCuda::Matrix<PREC> d_dev,
utilCuda::Matrix<PREC> t_dev,
int kernelAIdx,
int maxNContacts,
bool * convergedFlag_dev,
PREC _absTOL, PREC _relTOL){
//__threadfence() HERE OR AT THE END; THEN IT WORKS???? WHY
// Assumed 1 block, with THREADS_PER_BLOCK threads and column-major matrix T_dev
int thid = threadIdx.x;
int m = min(maxNContacts*PROX_PACKAGE_SIZE, BLOCK_DIM); // this is the actual size of the diagonal block!
int i = kernelAIdx * BLOCK_DIM;
int ii = i + thid;
//First copy x_old_dev in shared
__shared__ PREC xx[BLOCK_DIM]; // each thread writes one element, if its in the limit!!
__shared__ PREC tt[BLOCK_DIM];
if(thid < m){
xx[thid] = x_old_dev.data[ii];
tt[thid] = t_dev.data[ii];
}
__syncthreads();
PREC absTOL = _absTOL;
PREC relTOL = _relTOL;
int jj;
//PREC T_iijj;
//Offset the T_dev_ptr to the start of the Block
PREC * T_dev_ptr = PtrElem_ColM(T_dev,i,i);
PREC * mu_dev_ptr = &mu_dev.data[PROX_PACKAGES*kernelAIdx];
__syncthreads();
for(int j_t = 0; j_t < m ; j_t+=PROX_PACKAGE_SIZE){
//Select the number of threads we need!
// Here we process one [m x PROX_PACKAGE_SIZE] Block
// First Normal Direction ==========================================================
jj = i + j_t;
__syncthreads();
if( ii == jj ){ // select thread on the diagonal ...
PREC x_new_n = (d_dev.data[ii] + tt[thid]);
//Prox Normal!
if(x_new_n <= 0.0){
x_new_n = 0.0;
}
/* if( !checkConverged(x_new,xx[thid],absTOL,relTOL)){
*convergedFlag_dev = 0;
}*/
xx[thid] = x_new_n;
tt[thid] = 0.0;
}
// all threads not on the diagonal fall into this sync!
__syncthreads();
// Select only m threads!
if(thid < m){
tt[thid] += T_dev_ptr[thid] * xx[j_t];
}
// ====================================================================================
// we need to synchronize here because one thread finished lambda_T2 in shared mem tt, which is updated by another thread!
__syncthreads();
// Second Tangential Direction ==========================================================
jj++;
__syncthreads();
if( ii == jj ){ // select thread on diagonal, one thread finishes T1 and T2 directions.
// Prox tangential
PREC lambda_T1 = (d_dev.data[ii] + tt[thid]);
PREC lambda_T2 = (d_dev.data[ii+1] + tt[thid+1]);
PREC radius = (*mu_dev_ptr) * xx[thid-1];
PREC absvalue = sqrt(lambda_T1*lambda_T1 + lambda_T2*lambda_T2);
if(absvalue > radius){
lambda_T1 = (lambda_T1 * radius ) / absvalue;
lambda_T2 = (lambda_T2 * radius ) / absvalue;
}
/*if( !checkConverged(lambda_T1,xx[thid],absTOL,relTOL)){
*convergedFlag_dev = 0;
}
if( !checkConverged(lambda_T2,xx[thid+1],absTOL,relTOL)){
*convergedFlag_dev = 0;
}*/
//Write the two values back!
xx[thid] = lambda_T1;
tt[thid] = 0.0;
xx[thid+1] = lambda_T2;
tt[thid+1] = 0.0;
}
// all threads not on the diagonal fall into this sync!
__syncthreads();
T_dev_ptr = PtrColOffset_ColM(T_dev_ptr,1,T_dev.outerStrideBytes);
__syncthreads();
if(thid < m){
tt[thid] += T_dev_ptr[thid] * xx[j_t+1];
}
__syncthreads();
T_dev_ptr = PtrColOffset_ColM(T_dev_ptr,1,T_dev.outerStrideBytes);
__syncthreads();
if(thid < m){
tt[thid] += T_dev_ptr[thid] * xx[j_t+2];
}
// ====================================================================================
__syncthreads();
// move T_dev_ptr 1 column
T_dev_ptr = PtrColOffset_ColM(T_dev_ptr,1,T_dev.outerStrideBytes);
// move mu_ptr to nex contact
__syncthreads();
mu_dev_ptr = &mu_dev_ptr[1];
__syncthreads();
}
__syncthreads();
// Write back the results; we don't need to synchronize here, but
// do it anyway to be safe for testing first!
if(thid < m){
y_dev.data[ii] = xx[thid]; // THIS IS UPDATED IN KERNEL B
t_dev.data[ii] = tt[thid]; // THIS IS UPDATED IN KERNEL B
}
//__threadfence(); /// THIS STUPID THREADFENCE MAKES IT WORK!
}
I compare the solution at the end with the CPU, and HERE I put a __syncthreads() everywhere I can, just to be safe for a start! (This code does Gauss-Seidel stuff.)
But it does not work at all without the __threadfence() at the END, or at the BEGINNING where it does not make sense...
Sorry for so much code, but you can probably guess where the problem comes from, because I am a bit at my wit's end with explaining why this happens.
We checked the algorithm several times; there is no memory error (reported by Nsight) or anything else, everything works fine... Kernel A is launched with ONE block only!
If you launch the successive instances of the kernel into the same stream, each kernel launch is synchronous compared to the kernel instance before and after it. The programming model guarantees it. CUDA only permits simultaneous kernel execution on kernels launched into different streams of the same context, and even then overlapping kernel execution only happens if the scheduler determines that sufficient resources are available to do so.
Neither __threadfence nor __syncthreads will have the effect you seem to be thinking about - __threadfence works only at the scope of all active threads and __syncthreads is an intra-block barrier operation. If you really want kernel to kernel synchronization, you need to use one of the host side synchronization calls, like cudaThreadSynchronize (pre CUDA 4.0) or cudaDeviceSynchronize (cuda 4.0 and later), or the per-stream equivalent if you are using streams.
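To make the launch-ordering guarantee concrete, here is a minimal standalone sketch with placeholder kernels (not your Step A/B kernels): launches issued into the same stream execute back to back, and only the host needs an explicit barrier before reading results.
#include <cuda_runtime.h>

__global__ void kernelA(int* data) { data[threadIdx.x] += 1; }
__global__ void kernelB(int* data) { data[threadIdx.x] *= 2; }

int main() {
    int* d_data;
    cudaMalloc(&d_data, 128 * sizeof(int));
    cudaMemset(d_data, 0, 128 * sizeof(int));
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    for (int i = 0; i < 10; ++i) {
        // Same stream: kernelB of iteration i starts only after kernelA of
        // iteration i has completed, and iteration i+1 only after iteration i.
        kernelA<<<1, 128, 0, stream>>>(d_data);
        kernelB<<<1, 128, 0, stream>>>(d_data);
    }
    // The host, however, runs ahead of the GPU; block it explicitly before
    // reading results or timing on the CPU side.
    cudaStreamSynchronize(stream);   // or cudaDeviceSynchronize()
    cudaStreamDestroy(stream);
    cudaFree(d_data);
    return 0;
}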
While I am a bit surprised by what you are experiencing, I believe your explanation may be correct.
Writes to global memory, with an exception of atomic functions, are not guaranteed to be immediately visible by other threads (from the same, or from different blocks). By putting __threadfence() you halt the current thread until the writes are in fact visible. This might be important in particular when you are using global memory with a cache (the Fermi series).
One thing to note: kernel calls are asynchronous. While your first kernel call is being handled by the GPU, the host may issue another call. The next kernel will not run in parallel with your current one, but will launch as soon as the current one finishes, essentially hiding the latency caused by the CPU->GPU communication.
Using cudaThreadSynchronize halts the host thread until all the CUDA tasks are done. It may help you, but it will also prevent you from hiding the CPU->GPU communication latency. Do note that synchronous memory accesses (e.g. cudaMemcpy, without the "Async" suffix) essentially behave like cudaThreadSynchronize too.

Using CUDA Occupancy Calculator

I am using the occupancy calculator but I cannot understand how to get the registers per thread / shared memory per block. I read the documentation. I use Visual Studio, so in the project properties under CUDA Build Rule -> Command Line -> Additional Options I add --ptxas-options=-v. The program compiles fine, but I do not see any output. Can anybody help?
Thanks
With this switch on there should be a line on the compiler output window that tells you about the number of registers and amount of shared memory.
Do you see anything at all in the compiler output window? Can you copy and paste it into the question?
It should look something like
ptxas info : Used 3 registers, 2084+1060 bytes smem, 40 bytes cmem[0], 12 bytes cmem[1]
Try this simple rule:
All the scalar local variables like int a, float b etc. in your kernel are stored in registers, but only as long as the local variables in your code remain within the limit of available registers in a multiprocessor (see Limits). However, if you declare a thousand integers like int a[1000], then a will not be stored in registers; rather, it will be stored in local memory (DRAM).
The amount of shared memory used in your kernel code is the shared memory per block. For example, if you define __shared__ float shMem[256] then you are using 256*4 (size of float) = 1024 bytes of shared memory.
The following sample code (it won't work properly, it's just an example) uses nine 32-bit registers per thread, namely int xIndex, yIndex, Idx, shY, shX, aLocX, aLocY and float t, temp. The code uses 324 bytes of shared memory per block, as BLOCK_DIM = 16.
__global__ void averageFilter (unsigned char * outImage,
int imageWidth,
int imageHeight,
cuviPoint2 loc){
unsigned int xIndex = blockIdx.x * BLOCK_DIM + threadIdx.x;
unsigned int yIndex = blockIdx.y * BLOCK_DIM + threadIdx.y;
unsigned int Idx = yIndex*imageWidth + xIndex;
float t = INC;
if(xIndex>= imageWidth|| yIndex>=imageHeight)
return;
else if(xIndex==0 || xIndex== imageWidth-1 || yIndex==0 || yIndex==imageHeight-1){
for (int i=-1; i<=1; i++)
for (int j=-1; j<=1; j++)
t+= tex1Dfetch(texMem,Idx+i*imageWidth+j);
outImage[Idx] = t/6;
}
__shared__ unsigned char shMem[BLOCK_DIM+2][BLOCK_DIM+2];
unsigned int shY = threadIdx.y + 1;
unsigned int shX = threadIdx.x + 1;
if (threadIdx.x==0 || threadIdx.x==BLOCK_DIM-1 || threadIdx.y==0 || threadIdx.y==BLOCK_DIM-1){
for (int i=-1; i<=1; i++)
for (int j=-1; j<=1; j++)
shMem[shY+i][shX+j]= tex1Dfetch(texMem,Idx+i*imageWidth+j);
}
else
shMem[shY][shX] = tex1Dfetch(texMem,Idx);
__syncthreads();
if(xIndex==0 || xIndex== imageWidth-1 || yIndex==0 || yIndex==imageHeight-1)
return;
int aLocX = loc.x, aLocY = loc.y;
float temp=INC;
for (int i=aLocY; i<=aLocY+2; i++)
for (int j=aLocX; j<=aLocX+2; j++)
temp+= shMem[shY+i][shX+j];
outImage[Idx] = floor(temp/9);
}
shoosh's answer is probably the easiest way to find the register and shared memory usage. Make sure you're looking at the output pane first (select "Output" in the "View" pull-down menu) and then re-compile. The compiler should give you ptxas info for all kernels in the output pane, like you can see in the picture below...
Another way to find this information is to use the Visual Profiler or NVIDIA's Parallel Nsight.
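If you prefer to query these numbers at runtime instead of reading the ptxas output, cudaFuncGetAttributes reports them per kernel. A small sketch with a placeholder kernel (myKernel is not from the question):
#include <cstdio>
#include <cuda_runtime.h>

__global__ void myKernel() { /* placeholder kernel for illustration */ }

int main() {
    cudaFuncAttributes attr;
    cudaFuncGetAttributes(&attr, myKernel);
    printf("registers per thread:           %d\n", attr.numRegs);
    printf("static shared memory per block: %zu bytes\n", attr.sharedSizeBytes);
    printf("local memory per thread:        %zu bytes\n", attr.localSizeBytes);
    return 0;
}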