cudaEventElapsedTime not expected behaviour - cuda

I'm trying to compute the total time taken in GPU to compute something. I'm using the cudaEventRecord and cudaEventElapsedTime to determine this, but I'm having a unexpected behavior, or at least, unexpected for me :) I wrote this example to understand what's happening and I'm still confused.
In the example below I was expecting to report the same time for the three iterations but the result is:
2.80342
1003
2005.6
Which means that the total time in considering the CPU sleep time.
Am I doing something wrong? If not, is it possible do what I want?
#include <iostream>
#include <thread>
#include <chrono>
#include <cuda.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
__global__ void kernel_test(int *a, int N) {
for(int i=threadIdx.x;i<N;i+=N) {
if(i<N)
a[i] = 1;
}
}
int main(int argc, char ** argv) {
cudaEvent_t start[3], stop[3];
for(int i=0;i<3;i++) {
cudaEventCreate(&start[i]);
cudaEventCreate(&stop[i]);
}
cudaStream_t stream;
cudaStreamCreate(&stream);
const int N = 1024 * 1024;
int *h_a = (int*)malloc(N * sizeof(int));
int *a = 0;
cudaMalloc((void**)&a, N * sizeof(int));
for(int i=0;i<3;i++) {
cudaEventRecord(start[i], stream);
cudaMemcpyAsync(a, h_a, N * sizeof(int), cudaMemcpyHostToDevice, stream);
kernel_test<<<1, 1024, 0, stream>>>(a, N);
cudaMemcpyAsync(h_a, a, N*sizeof(int), cudaMemcpyDeviceToHost, stream);
cudaEventRecord(stop[i], stream);
std::this_thread::sleep_for (std::chrono::seconds(i));
cudaEventSynchronize(stop[i]);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start[i], stop[i]);
std::cout<<milliseconds<<std::endl;
}
return 0;
}
I attach the nsight result to verify the behaviour of my example.
Windows 8.1
Geforce GTX 780 Ti
Nvidia drivers: 358.50
EDIT:
Added code to be complete
Attached nsight result
Added SO and drivers info
, ,

If you're running the program on Windows using the WDDM (in contrast to TCC with Tesla cards or Linux) this may be the issue:
With the WDDM kernels are not executed immediately after invocation but instead enqueued to a command buffer. Once the buffer is full it gets flushed and the enqueued commands are actually executed. Another option to force the command buffer to be explicitly flushed is to synchronize.
Now what happens is that you wait before the command buffer is acutally flushed...
Edit
Also see https://devtalk.nvidia.com/default/topic/548639/is-wddm-causing-this-/ for the problem and how cudaEventQuery(0) may help

Related

CUDA Seg Fault for Int Device to Host Copy

Why is the following simple program (24 lines) lead to segmentation fault at shrinked_size_host int variable:
#include <stdio.h>
#include <cuda_runtime.h>
#include <curand_kernel.h>
__global__ void cuda_set(int* device_var){
*device_var = 12;
printf("Set device variable to: %d\n", *device_var);
}
int main() {
printf("Hello world CPU\n");
int* shrinked_size_device;
cudaMalloc((void**)&shrinked_size_device, sizeof(int));
cudaDeviceSynchronize();
cudaMemset(shrinked_size_device, 0, sizeof(int));
cudaDeviceSynchronize();
cuda_set<<<1,1>>>(shrinked_size_device);
cudaDeviceSynchronize();
int* shrinked_size_host = 0;
cudaMemcpy(shrinked_size_host, shrinked_size_device, sizeof(int), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
printf("shrinked_size_host=%d\n", *shrinked_size_host);
return 0;
}
That's the output produced from the program:
Hello world CPU
Set device variable to: 12
Segmentation fault (core dumped)
Not sure why there is a segmentation fault.
I figured out the answer to this question.
Memory for the shrinked_size_host should be allocated. So, either do:
Heap allocation: malloc or new int to allocate an integer of size 1. Remember to delete the allocated memory at the end.
Stack allocation: Use int shrinked_size_host[1];

Cuda program not working for more than 1024 threads

My program is of Odd-even merge sort and it's not working for more than 1024 threads.
I have already tried increasing the block size to 100 but it still not working for more than 1024 threads.
I'm using Visual Studio 2012 and I have Nvidia Geforce 610M. This is my program
#include<stdio.h>
#include<iostream>
#include<conio.h>
#include <random>
#include <stdint.h>
#include <driver_types.h >
__global__ void odd(int *arr,int n){
int i=threadIdx.x;
int temp;
if(i%2==1&&i<n-1){
if(arr[i]>arr[i+1])
{
temp=arr[i];
arr[i]=arr[i+1];
arr[i+1]=temp;
}
}
}
__global__ void even(int *arr,int n){
int i=threadIdx.x;
int temp;
if(i%2==0&&i<n-1){
if(arr[i]>arr[i+1])
{
temp=arr[i];
arr[i]=arr[i+1];
arr[i+1]=temp;
}
}
}
int main(){
int SIZE,k,*A,p,j;
int *d_A;
float time;
printf("Enter the size of the array\n");
scanf("%d",&SIZE);
A=(int *)malloc(SIZE*sizeof(int));
cudaMalloc(&d_A,SIZE*sizeof(int));
for(k=0;k<SIZE;k++)
A[k]=rand()%1000;
cudaMemcpy(d_A,A,SIZE*sizeof(int),cudaMemcpyHostToDevice);
if(SIZE%2==0)
p=SIZE/2;
else
p=SIZE/2+1;
for(j=0;j<p;j++){
even<<<3,SIZE>>>(d_A,SIZE);
if(j!=p-1)
odd<<<3,SIZE>>>(d_A,SIZE);
if(j==p-1&&SIZE%2==0)
odd<<<1,SIZE>>>(d_A,SIZE);
}
cudaMemcpy(A,d_A,SIZE*sizeof(int),cudaMemcpyDeviceToHost);
for(k=0;k<SIZE;k++)
printf("%d ",A[k]);
free(A);
cudaFree(d_A);
getch();
}
CUDA threadblocks are limited to 1024 threads (or 512 threads, for cc 1.x gpus). The size of the threadblock is indicated in the second kernel configuration parameter in the kernel launch:
even<<<3,SIZE>>>(d_A,SIZE);
^^^^
So when you enter a SIZE value greater than 1024, this kernel will not launch.
You're getting no indication of this because you're not doing proper cuda error checking which is always a good idea any time you're having trouble with a CUDA code. You can also, as a quick test, run your code with cuda-memcheck to look for API errors.

new operator in kernel .. strange behaviour

I was wondering if anybody can shed some light on this behaviour with the new operator within a kernel.. Following is the code
#include <stdio.h>
#include "cuda_runtime.h"
#include "cuComplex.h"
using namespace std;
__global__ void test()
{
cuComplex *store;
store= new cuComplex[30000];
if (store==NULL) printf("Unable to allocate %i\n",blockIdx.y);
delete store;
if (threadIdx.x==10000) store->x=0.0;
}
int main(int argc, char *argv[])
{
float timestamp;
cudaEvent_t event_start,event_stop;
// Initialise
cudaEventCreate(&event_start);
cudaEventCreate(&event_stop);
cudaEventRecord(event_start, 0);
dim3 threadsPerBlock;
dim3 blocks;
threadsPerBlock.x=1;
threadsPerBlock.y=1;
threadsPerBlock.z=1;
blocks.x=1;
blocks.y=500;
blocks.z=1;
cudaEventRecord(event_start);
test<<<blocks,threadsPerBlock,0>>>();
cudaEventRecord(event_stop, 0);
cudaEventSynchronize(event_stop);
cudaEventElapsedTime(&timestamp, event_start, event_stop);
printf("test took %fms \n", timestamp);
}
Running this on a GTX680 Cuda 5 and investigating the output one will notice that randomly memory is not allocated :( I was thinking that maybe it is because all global memory is finished but I have 2GB of memory and since the maximum amount of active blocks is 16 the amount of memory allocated with this method should at maximum be 16*30000*8=38.4x10e6.. ie around 38Mb. So what else should I consider?
The problem is related with the size of the heap used by the malloc() and free() device system calls. See section 3.2.9 Call Stack and appendix B.16.1 Heap Memory Allocation in the NVIDIA CUDA C Programming Guide for more details.
Your test will work if you set the heap size to fit your kernel requirement
cudaDeviceSetLimit(cudaLimitMallocHeapSize, 500*30000*sizeof(cuComplex));

Modulus computation of an array of cufftComplex data type in CUDA

I made a Dll file in visual C++ to compute modulus of an array of complex numbers in CUDA. The array is type of cufftComplex. I then called the Dll in LabVIEW to check the accuracy of the result. I'm receiving an incorrect result. Could anyone tell me what is wrong with the following code, please? I think there should be something wrong with my kernel function(the way I am retrieving the cufftComplex data should be incorrect).
#include <math.h>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cufft.h>
extern "C" __declspec(dllexport) void Modulus(cufftComplex *digits,float *result);
__global__ void ModulusComputation(cufftComplex *a, int N, float *temp)
{
int idx = blockIdx.x*blockDim.x + threadIdx.x;
if (idx<N)
{
temp[idx] = sqrt((a[idx].x * a[idx].x) + (a[idx].y * a[idx].y));
}
}
void Modulus(cufftComplex *digits,float *result)
{
#define N 1024
cufftComplex *d_data;
float *temp;
size_t size = sizeof(cufftComplex)*N;
cudaMalloc((void**)&d_data, size);
cudaMalloc((void**)&temp, sizeof(float)*N);
cudaMemcpy(d_data, digits, size, cudaMemcpyHostToDevice);
int blockSize = 16;
int nBlocks = N/blockSize;
if( N % blockSize != 0 )
nBlocks++;
ModulusComputation <<< nBlocks, blockSize >>> (d_data, N,temp);
cudaMemcpy(result, temp, size, cudaMemcpyDeviceToHost);
cudaFree(d_data);
cudaFree(temp);
}
In the final cudaMemcpy in your code, you have:
cudaMemcpy(result, temp, size, cudaMemcpyDeviceToHost);
It should be:
cudaMemcpy(result, temp, sizeof(float)*N, cudaMemcpyDeviceToHost);
If you had included error checking for your cuda calls, you would have seen this cuda call (as originally written) throw an error.
There's other comments that could be made. For example your block size (16) should be an integral multiple of 32. But this does not prevent proper operation.
After the kernel call, when copying back the result, you are using size as the memory size. The third argument of cudaMemcpy should be N * sizeof(float).

CUDA - Copy device data to host?

I have device variable and in this variable, I allocate and fill an array in the device, but I have a problem to get data to host. cudaMemcpy() return cudaErrorInvalidValue error. how can I do it?
PS: The Code is just example, I know, that In this particular case I can use cudaMalloc because I know the size of the array, but In my REAL code, It computes the size of the array in the device and it needs immediately allocate memory.
PS2: I found a similar problem, but I still don't know, how can I solve it? - copy data which is allocated in device from device to host
PS3: I have updated code, but still doesn't work:{
PS4: I am just trying to run this code on a notebook with Nvidia GT 520MX(latest game driver) and doesn't work too :(
thx
#include <cuda.h>
#include <stdio.h>
#define N 400
__device__ int* d_array;
__global__ void allocDeviceMemory()
{
d_array = new int[N];
for(int i=0; i < N; i++)
d_array[i] = 123;
}
int main()
{
allocDeviceMemory<<<1, 1>>>();
cudaDeviceSynchronize();
int* d_a = NULL;
cudaMemcpyFromSymbol((void**)&d_a, "d_array", sizeof(d_a), 0, cudaMemcpyDeviceToHost);
printf("gpu adress: %lld\n", d_a);
int* h_array = (int*)malloc(N*sizeof(int));
cudaError_t errr = cudaMemcpy(h_array, d_a, N*sizeof(int), cudaMemcpyDeviceToHost);
printf("h_array: %d, %d\n", h_array[0], errr);
getchar();
return 0;
}
You need to synchronize (cudaDeviceSynchronize()) after launching the kernel to allocate the memory.
Can you also check the return value of the sync and all other CUDA API calls?
i have tested your code and there is no error here. I am running CUDA 4.0.