CUDA memory transfer issue

I am trying to execute code which first transfers data from CPU to GPU memory and back again. Despite increasing the volume of data, the data transfer time remains the same,
as if no data transfer is actually taking place. I am posting the code.
#include <stdio.h> /* Core input/output operations */
#include <stdlib.h> /* Conversions, random numbers, memory allocation, etc. */
#include <math.h> /* Common mathematical functions */
#include <time.h> /* Converting between various date/time formats */
#include <cuda.h> /* CUDA related stuff */
#include <sys/time.h>
__global__ void device_volume(float *x_d,float *y_d)
{
int index = blockIdx.x * blockDim.x + threadIdx.x;
}
int main(void)
{
float *x_h,*y_h,*x_d,*y_d,*z_h,*z_d;
long long size=9999999;
long long nbytes=size*sizeof(float);
timeval t1,t2;
double et;
x_h=(float*)malloc(nbytes);
y_h=(float*)malloc(nbytes);
z_h=(float*)malloc(nbytes);
cudaMalloc((void **)&x_d,size*sizeof(float));
cudaMalloc((void **)&y_d,size*sizeof(float));
cudaMalloc((void **)&z_d,size*sizeof(float));
gettimeofday(&t1,NULL);
cudaMemcpy(x_d, x_h, nbytes, cudaMemcpyHostToDevice);
cudaMemcpy(y_d, y_h, nbytes, cudaMemcpyHostToDevice);
cudaMemcpy(z_d, z_h, nbytes, cudaMemcpyHostToDevice);
gettimeofday(&t2,NULL);
et = (t2.tv_sec - t1.tv_sec) * 1000.0; // sec to ms
et += (t2.tv_usec - t1.tv_usec) / 1000.0; // us to ms
printf("\n %ld\t\t%f\t\t",nbytes,et);
et=0.0;
//printf("%f %d\n",seconds,CLOCKS_PER_SEC);
// launch a kernel with a single thread to greet from the device
//device_volume<<<1,1>>>(x_d,y_d);
gettimeofday(&t1,NULL);
cudaMemcpy(x_h, x_d, nbytes, cudaMemcpyDeviceToHost);
cudaMemcpy(y_h, y_d, nbytes, cudaMemcpyDeviceToHost);
cudaMemcpy(z_h, z_d, nbytes, cudaMemcpyDeviceToHost);
gettimeofday(&t2,NULL);
et = (t2.tv_sec - t1.tv_sec) * 1000.0; // sec to ms
et += (t2.tv_usec - t1.tv_usec) / 1000.0; // us to ms
printf("%f\n",et);
cudaFree(x_d);
cudaFree(y_d);
cudaFree(z_d);
return 0;
}
Can anybody help me with this issue?
Thanks

Try cudaEvent for capturing the time for GPU code.
Try using the Visual Profiler to see how much time is spent on the memcpy. The profiler will show the execution time spent on the GPU for every CUDA-related operation.
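For example, a minimal sketch of timing the host-to-device copies with CUDA events (reusing x_d, y_d, z_d, x_h, y_h, z_h and nbytes from the question; this is an illustration, not the asker's code):
cudaEvent_t ev_start, ev_stop;
cudaEventCreate(&ev_start);
cudaEventCreate(&ev_stop);
cudaEventRecord(ev_start, 0); /* record on the default stream */
cudaMemcpy(x_d, x_h, nbytes, cudaMemcpyHostToDevice);
cudaMemcpy(y_d, y_h, nbytes, cudaMemcpyHostToDevice);
cudaMemcpy(z_d, z_h, nbytes, cudaMemcpyHostToDevice);
cudaEventRecord(ev_stop, 0);
cudaEventSynchronize(ev_stop); /* wait until the stop event has actually been reached */
float ms = 0.0f;
cudaEventElapsedTime(&ms, ev_start, ev_stop); /* elapsed GPU time in milliseconds */
printf("H2D copies: %f ms\n", ms);
cudaEventDestroy(ev_start);
cudaEventDestroy(ev_stop);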

It stays the same because it takes the same time. In your code you don't add up the transfer times.

Related

CUDA kernels are not overlapping

I have a simple vector multiplication kernel which I am executing in 2 streams. But when I profile in NVVP, the kernels do not seem to overlap. Is it because each kernel execution utilizes 100% of the GPU? If not, what can be the cause?
Source code :
#include "common.h"
#include <cstdlib>
#include <stdio.h>
#include <math.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda_profiler_api.h"
#include <string.h>
const int N = 1 << 20;
__global__ void kernel(int n, float *x, float *y)
{
int i = blockIdx.x*blockDim.x + threadIdx.x;
if (i < n) y[i] = x[i] * y[i];
}
int main()
{
float *x, *y, *d_x, *d_y, *d_1, *d_2;
x = (float*)malloc(N*sizeof(float));
y = (float*)malloc(N*sizeof(float));
cudaMalloc(&d_x, N*sizeof(float));
cudaMalloc(&d_y, N*sizeof(float));
cudaMalloc(&d_1, N*sizeof(float));
cudaMalloc(&d_2, N*sizeof(float));
for (int i = 0; i < N; i++) {
x[i] = 1.0f;
y[i] = 2.0f;
}
cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, y, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_1, x, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_2, y, N*sizeof(float), cudaMemcpyHostToDevice);
const int num_streams = 8;
cudaStream_t stream1;
cudaStream_t stream2;
cudaStreamCreateWithFlags(&stream1, cudaStreamNonBlocking);
cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking);
cudaEvent_t start, stop;
float elapsedTime;
cudaEventCreate(&start);
cudaEventRecord(start, 0);
for (int i = 0; i < 300; i++) {
kernel<<<512, 512, 0, stream1>>>(N, d_x, d_y);
kernel<<<512, 512, 0, stream2>>>(N, d_1, d_2);
}
cudaStreamSynchronize(stream1);
cudaStreamSynchronize(stream2);
// cudaDeviceSynchronize();
cudaEventCreate(&stop);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
printf("Elapsed time : %f ms\n", elapsedTime);
cudaDeviceReset();
cudaProfilerStop();
return 0;
}
EDIT: From the comments I understand each kernel launch is utilizing the GPU fully, so what is the best approach for achieving a 262144-element vector multiplication across multiple streams?
My device information :
CUDA Device Query...
There are 1 CUDA devices.
CUDA Device #0
Major revision number: 5
Minor revision number: 0
Name: GeForce GTX 850M
Total global memory: 0
Total shared memory per block: 49152
Total registers per block: 65536
Warp size: 32
Maximum memory pitch: 2147483647
Maximum threads per block: 1024
Maximum dimension 0 of block: 1024
Maximum dimension 1 of block: 1024
Maximum dimension 2 of block: 64
Maximum dimension 0 of grid: 2147483647
Maximum dimension 1 of grid: 65535
Maximum dimension 2 of grid: 65535
Clock rate: 901500
Total constant memory: 65536
Texture alignment: 512
Concurrent copy and execution: Yes
Number of multiprocessors: 5
Kernel execution timeout: Yes
The reason your kernels don't overlap is that your GPU is 'filled' with execution threads, as @Robert Crovella mentions. Checking the Compute Capabilities chapter of the CUDA Programming Guide, there is a limit of 2048 resident threads per SM for your compute capability (5.0). You have 5 SMs, which gives a maximum of 10240 threads that can run simultaneously on your device. You are launching 512x512 = 262144 threads with just a single kernel call, and that leaves practically no room at all for the other kernel call.
You need to launch kernels small enough that two of them can run concurrently on your device.
I'm not an expert on streams, but from what I've understood, if you want to run your program using streams you need to split the work into chunks and compute a proper offset so that each stream accesses its own portion of the data. In your current code, every stream you launch performs exactly the same calculation over exactly the same data. You have to split the data among the streams.
Other than that, if you want to get the maximum performance you need to overlap kernel execution with asynchronous data transfers. The easiest way to do this is to assign a scheme like the following to each of your streams, as presented here:
for (int i = 0; i < nStreams; ++i) {
int offset = i * streamSize;
cudaMemcpyAsync(&d_a[offset], &a[offset], streamBytes, cudaMemcpyHostToDevice, stream[i]);
kernel<<<streamSize/blockSize, blockSize, 0, stream[i]>>>(d_a, offset);
cudaMemcpyAsync(&a[offset], &d_a[offset], streamBytes, cudaMemcpyDeviceToHost, stream[i]);
}
This configuration simply tells each stream to do a memcpy, then execute the kernel on its chunk of the data, then copy the data back. After the async calls are issued, the streams work simultaneously to complete their tasks.
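One caveat worth adding (not stated in the original answer, but standard CUDA behavior): cudaMemcpyAsync can only overlap with kernel execution when the host buffer is page-locked (pinned), so the host array (a in the snippet above, or x and y in the question) would need to be allocated with cudaMallocHost instead of malloc, for example:
cudaMallocHost((void**)&a, N * sizeof(float)); /* pinned (page-locked) host memory, released later with cudaFreeHost */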
PS: I would also recommend revising your kernel. Using one thread to compute just a single multiplication is overkill; I would have each thread process more data.
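As an illustration of that last point (my own sketch, not part of the original answer), a grid-stride loop lets each thread handle several elements, so a much smaller launch covers the whole vector and leaves resources free for a second kernel to run concurrently:
__global__ void kernel(int n, float *x, float *y)
{
    /* each thread strides over the array instead of handling a single element */
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x)
        y[i] = x[i] * y[i];
}
A launch such as kernel<<<10, 256, 0, stream1>>>(N, d_x, d_y) then uses far fewer resident threads than <<<512, 512>>>, giving the second stream's kernel a chance to overlap.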

cudaEventElapsedTime not expected behaviour

I'm trying to compute the total time taken on the GPU to compute something. I'm using cudaEventRecord and cudaEventElapsedTime to determine this, but I'm getting unexpected behavior, or at least, unexpected for me :) I wrote this example to understand what's happening and I'm still confused.
In the example below I was expecting the same time to be reported for all three iterations, but the result is:
2.80342
1003
2005.6
This means that the total time is including the CPU sleep time.
Am I doing something wrong? If not, is it possible to do what I want?
#include <iostream>
#include <thread>
#include <chrono>
#include <cuda.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
__global__ void kernel_test(int *a, int N) {
for(int i=threadIdx.x;i<N;i+=N) {
if(i<N)
a[i] = 1;
}
}
int main(int argc, char ** argv) {
cudaEvent_t start[3], stop[3];
for(int i=0;i<3;i++) {
cudaEventCreate(&start[i]);
cudaEventCreate(&stop[i]);
}
cudaStream_t stream;
cudaStreamCreate(&stream);
const int N = 1024 * 1024;
int *h_a = (int*)malloc(N * sizeof(int));
int *a = 0;
cudaMalloc((void**)&a, N * sizeof(int));
for(int i=0;i<3;i++) {
cudaEventRecord(start[i], stream);
cudaMemcpyAsync(a, h_a, N * sizeof(int), cudaMemcpyHostToDevice, stream);
kernel_test<<<1, 1024, 0, stream>>>(a, N);
cudaMemcpyAsync(h_a, a, N*sizeof(int), cudaMemcpyDeviceToHost, stream);
cudaEventRecord(stop[i], stream);
std::this_thread::sleep_for (std::chrono::seconds(i));
cudaEventSynchronize(stop[i]);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start[i], stop[i]);
std::cout<<milliseconds<<std::endl;
}
return 0;
}
I attach the nsight result to verify the behaviour of my example.
Windows 8.1
Geforce GTX 780 Ti
Nvidia drivers: 358.50
EDIT:
Added code to be complete
Attached nsight result
Added OS and driver info
If you're running the program on Windows using the WDDM driver (in contrast to TCC with Tesla cards, or Linux), this may be the issue:
With WDDM, kernels are not executed immediately after invocation but are instead enqueued in a command buffer. Once the buffer is full it gets flushed and the enqueued commands are actually executed. Another way to force the command buffer to be flushed explicitly is to synchronize.
What happens here is that you sleep before the command buffer is actually flushed...
Edit
Also see https://devtalk.nvidia.com/default/topic/548639/is-wddm-causing-this-/ for a description of the problem and how cudaEventQuery(0) may help.
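A minimal sketch of how that could be applied to the loop in the question (my own illustration of the suggestion, not code from the linked thread): querying the stop event right after recording it nudges the queued work out of the WDDM command buffer before the CPU goes to sleep.
cudaEventRecord(stop[i], stream);
cudaEventQuery(stop[i]); /* harmless status query; prompts the WDDM command buffer to be submitted */
std::this_thread::sleep_for(std::chrono::seconds(i));
cudaEventSynchronize(stop[i]);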

Accurate method to calculate double FMA and Shared memory latency

I am trying to come up with an accurate way to measure the latency of two operations:
1) Latency of a double precision FMA operation.
2) Latency of a double precision load from shared memory.
I am using a K20x and was wondering if this code would give accurate measurements.
#include <cuda.h>
#include <stdlib.h>
#include <stdio.h>
#include <iostream>
using namespace std;
//Clock rate
#define MHZ 732e6
//number of streaming multiprocessors
#define SMS 14
// number of double precision units
#define DP_UNITS 16*4
//number of shared banks
#define SHARED_BANKS 32
#define ITER 100000
#define NEARONE 1.0000000000000004
__global__ void fma_latency_kernal(double *in, double *out){
int tid = blockIdx.x*blockDim.x+threadIdx.x;
double val = in[tid];
#pragma unroll 100
for(int i=0; i<ITER; i++){
val+=val*NEARONE;
}
out[tid]=val;
}
__global__ void shared_latency_kernel(double *in, double *out){
volatile extern __shared__ double smem[];
int tid = blockIdx.x*blockDim.x+threadIdx.x;
smem[threadIdx.x]=in[tid];
#pragma unroll 32
for(int i=0; i<ITER; i++){
smem[threadIdx.x]=smem[(threadIdx.x+i)%32]*NEARONE;
}
out[tid]=smem[threadIdx.x];
}
int main (int argc , char **argv){
float time;
cudaEvent_t start, stop, start2, stop2;
double *d_A, *d_B;
cudaMalloc(&d_A, DP_UNITS*SMS*sizeof(float));
cudaMalloc(&d_B, DP_UNITS*SMS*sizeof(float));
cudaError_t err;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
fma_latency_kernal<<<SMS, DP_UNITS>>>(d_A, d_B);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
time/=1000;
err = cudaGetLastError();
if(err!=cudaSuccess)
printf("Error FMA: %s\n", cudaGetErrorString(err));
printf("Latency of FMA = %3.1f clock cycles\n", (time/(double)ITER)*(double)MHZ);
cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte);
cudaEventCreate(&start2);
cudaEventCreate(&stop2);
cudaEventRecord(start2, 0);
shared_latency_kernel<<<1, SHARED_BANKS, sizeof(double)>>>(d_A, d_B );
cudaEventRecord(stop2, 0);
cudaEventSynchronize(stop2);
cudaEventElapsedTime(&time, start2, stop2);
time/=1000;
err = cudaGetLastError();
if(err!=cudaSuccess)
printf("Error Shared Memory: %s\n", cudaGetErrorString(err));
printf("Latency of Shared Memory = %3.1f clock cycles\n", time/(double)ITER*(double)MHZ);
}
My results on the K20x are the following:
Latency of FMA = 16.4 clock cycles
Latency of Shared Memory = 60.7 clock cycles
This seems reasonable to me, but I am not sure how accurate it is.
Your latency values look very high to me, nearly double what I'd expect. To measure how many cycles something takes on the GPU, you can insert clock() calls before and after the relevant part of the kernel function. The clock function returns the current cycle as an int, so by subtracting the first value from the second you get the number of cycles that passed between dispatching the first clock instruction and dispatching the second clock instruction.
Note that the numbers you get from this method will include extra time from the clock instructions themselves; I believe that by default a thread will block for several cycles immediately before and after every clock instruction, so you may want to experiment to see how many cycles that adds so you can subtract them back out.
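A minimal sketch of that clock()-based approach applied to the FMA kernel (my own illustration; the cycles output buffer is a hypothetical addition, ITER and NEARONE are the macros from the question's code, and clock64() is used to avoid 32-bit wrap-around):
__global__ void fma_latency_clock(double *in, double *out, long long *cycles)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    double val = in[tid];
    long long t0 = clock64(); /* cycle counter before the timed section */
    #pragma unroll 100
    for (int i = 0; i < ITER; i++)
        val += val * NEARONE;
    long long t1 = clock64(); /* cycle counter after the timed section */
    out[tid] = val;
    cycles[tid] = t1 - t0; /* per-thread cycles; divide by ITER on the host to estimate latency */
}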

Issue with using Multi GPU NVIDIA

I'm learning how to use multiple GPUs in my CUDA application. I tried out a simple program which ran successfully on a system with two Tesla C2070 cards. But when I tried to run the same program on a different system with a Tesla K40c and a Tesla C2070, it gives a segmentation fault. What might be the problem? I'm sure there is no problem with the code. Are there any settings to be configured in the environment? I have attached my code here for your reference.
#include <stdio.h>
#include "device_launch_parameters.h"
#include "cuda_runtime_api.h"
__global__ void testA(int *a)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
a[i] = a[i] * 2;
}
int main()
{
int *ai, *bi, *ao, *bo;
int iter;
cudaStream_t streamA, streamB;
cudaSetDevice(0);
cudaStreamCreate(&streamA);
cudaMalloc((void**)&ao, 10 * sizeof(int));
cudaHostAlloc((void**)&ai, 10 * sizeof(int), cudaHostAllocMapped);
for(iter=0; iter<10; iter++)
{
ai[iter] = iter+1;
}
cudaSetDevice(1);
cudaStreamCreate(&streamB);
cudaMalloc((void**)&bo, 10 * sizeof(int));
cudaHostAlloc((void**)&bi, 10 * sizeof(int), cudaHostAllocMapped);
for(iter=0; iter<10; iter++)
{
bi[iter] = iter+11;
}
cudaSetDevice(0);
cudaMemcpyAsync(ao, ai, 10 * sizeof(int), cudaMemcpyHostToDevice, streamA);
testA<<<1, 10, 0, streamA>>>(ao);
cudaMemcpyAsync(ai, ao, 10 * sizeof(int), cudaMemcpyDeviceToHost, streamA);
cudaSetDevice(1);
cudaMemcpyAsync(bo, bi, 10 * sizeof(int), cudaMemcpyHostToDevice, streamB);
testA<<<1, 10, 0, streamB>>>(bo);
cudaMemcpyAsync(bi, bo, 10 * sizeof(int), cudaMemcpyDeviceToHost, streamB);
cudaSetDevice(0);
cudaStreamSynchronize(streamA);
cudaSetDevice(1);
cudaStreamSynchronize(streamB);
printf("%d %d %d %d %d\n",ai[0],ai[1],ai[2],ai[3],ai[4]);
printf("%d %d %d %d %d\n",bi[0],bi[1],bi[2],bi[3],bi[4]);
return 0;
}
The segmentation fault occurs when the bi array is initialized inside the for loop, which suggests that the memory is not being allocated for bi.
Based on the new information you've provided from the error checking, the problem you were having was due to an ECC error.
When a GPU has a double-bit ECC error detected in the current session, it is no longer usable for compute activities until either:
the GPU is reset (e.g. via system reboot, or via driver unload/reload, or manually via nvidia-smi, etc.),
(or)
ECC is disabled (which usually also requires a system reboot or GPU reset)
You can review ECC status of your GPUs with the nvidia-smi command. You probably already know which GPU was reporting the ECC error, since you disabled ECC, but in case not, based on your initial report it would be the one that was associated with the cudaSetDevice(1); command, which probably should have been the Tesla C2070 (i.e. not the K40).
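For completeness, a minimal sketch of the kind of error checking that surfaces problems like this (my own illustration, not the asker's exact code), wrapping each runtime call so a failure is reported instead of showing up later as a crash:
#define CUDA_CHECK(call) do { \
    cudaError_t err = (call); \
    if (err != cudaSuccess) \
        printf("CUDA error: %s at %s:%d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
} while (0)
/* e.g. */
CUDA_CHECK(cudaHostAlloc((void**)&bi, 10 * sizeof(int), cudaHostAllocMapped));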

Modulus computation of an array of cufftComplex data type in CUDA

I made a DLL in Visual C++ to compute the modulus of an array of complex numbers in CUDA. The array is of type cufftComplex. I then called the DLL from LabVIEW to check the accuracy of the result, and I'm getting an incorrect result. Could anyone tell me what is wrong with the following code, please? I think there must be something wrong with my kernel function (the way I am retrieving the cufftComplex data is probably incorrect).
#include <math.h>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cufft.h>
extern "C" __declspec(dllexport) void Modulus(cufftComplex *digits,float *result);
__global__ void ModulusComputation(cufftComplex *a, int N, float *temp)
{
int idx = blockIdx.x*blockDim.x + threadIdx.x;
if (idx<N)
{
temp[idx] = sqrt((a[idx].x * a[idx].x) + (a[idx].y * a[idx].y));
}
}
void Modulus(cufftComplex *digits,float *result)
{
#define N 1024
cufftComplex *d_data;
float *temp;
size_t size = sizeof(cufftComplex)*N;
cudaMalloc((void**)&d_data, size);
cudaMalloc((void**)&temp, sizeof(float)*N);
cudaMemcpy(d_data, digits, size, cudaMemcpyHostToDevice);
int blockSize = 16;
int nBlocks = N/blockSize;
if( N % blockSize != 0 )
nBlocks++;
ModulusComputation <<< nBlocks, blockSize >>> (d_data, N,temp);
cudaMemcpy(result, temp, size, cudaMemcpyDeviceToHost);
cudaFree(d_data);
cudaFree(temp);
}
In the final cudaMemcpy in your code, you have:
cudaMemcpy(result, temp, size, cudaMemcpyDeviceToHost);
It should be:
cudaMemcpy(result, temp, sizeof(float)*N, cudaMemcpyDeviceToHost);
If you had included error checking on your CUDA calls, you would have seen this call (as originally written) return an error.
There are other comments that could be made. For example, your block size (16) should ideally be an integral multiple of 32 (the warp size), but this does not prevent proper operation.
After the kernel call, when copying back the result, you are using size as the memory size. The third argument of cudaMemcpy should be N * sizeof(float).
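Putting the two points together, a sketch of the corrected copy-back with a basic return-code check (reusing the variables from the question; printf would additionally need <cstdio>):
cudaError_t err = cudaMemcpy(result, temp, N * sizeof(float), cudaMemcpyDeviceToHost); /* copy N floats, not N cufftComplex */
if (err != cudaSuccess)
    printf("cudaMemcpy failed: %s\n", cudaGetErrorString(err));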