CUDA kernel is slower than CPU

I'm new to CUDA and I'm probably doing something wrong.
All I need is a logical operation on two binary vectors. The length of the vectors is 2048000. I compared the speed of a logical AND in Matlab's C MEX file and in a CUDA kernel. The C code on the CPU is ~5% faster than CUDA. Please note that I measured only the kernel execution (without memory transfer). I have an i7 930 and a 9800GT.
##MEX file testCPU.c:##
#include "mex.h"

void mexFunction( int nlhs, mxArray *plhs[],
                  int nrhs, const mxArray *prhs[] ) {
    int i;
    unsigned char *vars, *output;
    vars = (unsigned char *) mxGetData(prhs[0]);
    plhs[0] = mxCreateLogicalMatrix(2048000, 1);
    output = (unsigned char *) mxGetData(plhs[0]);
    for (i = 0; i < 2048000; i++) {
        output[i] = vars[i] & vars[2048000+i];
    }
}
Compile:
mex testCPU.c
Create vectors:
vars = ~~(randi(2,2048000,2)-1);
Measure speed:
tic;testCPU(vars);toc;
CUDA:
##CUDA file testGPU.cu:##
#include "mex.h"
#include "cuda.h"
__global__ void logical_and(unsigned char* in, unsigned char* out, int N) {
    int idx = blockIdx.x*blockDim.x+threadIdx.x;
    out[idx] = in[idx] && in[idx+N];
}
void mexFunction( int nlhs, mxArray *plhs[],
int nrhs, const mxArray *prhs[] ) {
int i;
unsigned char *vars, *output, *gpu, *gpures;
vars = (unsigned char*)mxGetData(prhs[0]);
plhs[0] = mxCreateLogicalMatrix(2048000, 1);
output = (unsigned char*)mxGetData(plhs[0]);
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
float dt_ms;
// input GPU malloc
cudaEventRecord(start, 0);
cudaMalloc( (void **) &gpu, sizeof(unsigned char)*4096000);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&dt_ms, start, stop);
printf("GPU input malloc: %f ms, %i\n", dt_ms, cudaGetLastError());
// output GPU malloc
cudaEventRecord(start, 0);
cudaMalloc( (void **) &gpures, sizeof(unsigned char)*2048000);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&dt_ms, start, stop);
printf("GPU output malloc: %f ms, %i\n", dt_ms, cudaGetLastError());
// copy from CPU to GPU
cudaEventRecord(start, 0);
cudaMemcpy( gpu, vars, sizeof(unsigned char)*4096000, cudaMemcpyHostToDevice);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&dt_ms, start, stop);
printf("copy input from CPU to GPU: %f ms, %i\n", dt_ms, cudaGetLastError());
dim3 dimBlock(32);
printf("thread count: %i\n", dimBlock.x);
dim3 dimGrid(2048000/dimBlock.x);
printf("block count: %i\n", dimGrid.x);
// --- KERNEL ---
cudaEventRecord(start, 0);
logical_and<<<dimGrid, dimBlock>>>(gpu, gpures, 2048000);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&dt_ms, start, stop);
printf("GPU kernel: %f ms, %i\n", dt_ms, cudaGetLastError());
// result from GPU to CPU
cudaEventRecord(start, 0);
cudaMemcpy( output, gpures, sizeof(unsigned char)*2048000, cudaMemcpyDeviceToHost );
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&dt_ms, start, stop);
printf("copy output from GPU to CPU: %f ms, %i\n", dt_ms, cudaGetLastError());
cudaFree(gpu);
cudaFree(gpures);
}
Compile:
nvmex -f nvmexopts_9.bat testGPU.cu
-I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v4.2\include"
-L"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v4.2\lib\x64" -lcudart -lcufft
Output:
GPU input malloc: 0.772160 ms, 0
GPU output malloc: 0.041728 ms, 0
copy input from CPU to GPU: 1.494784 ms, 0
thread count: 32
block count: 64000
*** GPU kernel: 3.761216 ms, 0 ***
copy output from GPU to CPU: 1.203488 ms, 0
Is that code OK? The CPU was ~0.1 ms faster than the CUDA kernel. I tried different thread counts (multiples of 32) up to 512; 32 was the fastest. The operator & instead of && was almost 1 ms slower.
Is the 9800GT really so weak? What speed-up can I expect with today's mainstream cards (i.e. GTX 460, 560)?
Thank you
EDIT: based on talonmies' comment, I made these modifications:
Kernel function:
__global__ void logical_and(uchar4* in, uchar4* out, int N) {
    int idx = blockIdx.x*blockDim.x+threadIdx.x;
    out[idx].x = in[idx].x & in[idx+N].x;
    out[idx].y = in[idx].y & in[idx+N].y;
    out[idx].z = in[idx].z & in[idx+N].z;
    out[idx].w = in[idx].w & in[idx+N].w;
}
Main function:
uchar4 *gpu, *gpures;
// 32 was worst, 64,128,256,512 were similar
dim3 dimBlock(128);
// block count is now 4 times smaller
dim3 dimGrid(512000/dimBlock.x);
Output:
GPU input malloc: 0.043360 ms, 0
GPU output malloc: 0.038592 ms, 0
copy input from CPU to GPU: 1.499584 ms, 0
thread count: 128
block count: 4000
*** GPU kernel: 0.131296 ms, 0 ***
copy output from GPU to CPU: 1.281120 ms, 0
Is that correct? Almost a 30x speed-up! It seems too good to be true, but the result is correct :)
How much faster will a GTX 560 be on this particular task? Thanks.
Edit 2:
Is this code
__global__ void logical_and(uchar4* in, uchar4* out, int N) {
    int idx = blockIdx.x*blockDim.x+threadIdx.x;
    out[idx].x = in[idx].x & in[idx+N].x;
    out[idx].y = in[idx].y & in[idx+N].y;
    out[idx].z = in[idx].z & in[idx+N].z;
    out[idx].w = in[idx].w & in[idx+N].w;
}
automatically transformed to:
__global__ void logical_and(uchar4* in, uchar4* out, int N) {
    int idx = blockIdx.x*blockDim.x+threadIdx.x;
    uchar4 buff;
    buff.x = in[idx].x;
    buff.y = in[idx].y;
    buff.z = in[idx].z;
    buff.w = in[idx].w;
    buff.x &= in[idx+N].x;
    buff.y &= in[idx+N].y;
    buff.z &= in[idx+N].z;
    buff.w &= in[idx+N].w;
    out[idx].x = buff.x;
    out[idx].y = buff.y;
    out[idx].z = buff.z;
    out[idx].w = buff.w;
}
by the compiler?
If it is correct, it explains my confusion about coalesced access. I thought that in[idx] & in[idx+N] leads to non-coalesced access, because of accessing non-contiguous memory. But in fact, in[idx] and in[idx+N] are loaded in two coalesced steps. N can be any multiple of 16, because uchar4 is 4 bytes long, and for coalesced access the address must be aligned to 64 bytes (on a compute capability 1.1 device). Am I right?

As talonmies pointed out, you're accessing and processing your data byte-wise, which is far from optimal. A collection of techniques you may want to consider, such as instruction-level parallelism and buffered reads/writes, is summarized in the NVIDIA webinar Better Performance at Lower Occupancy by Vasily Volkov.
In a nutshell, what you want to do is, in each thread, read several uint4 in a coalesced way, process them, and only then store them.
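For illustration, here is a minimal sketch of that pattern with a single uint4 per thread, just to show the coalesced vector load/store part (untested; it assumes the two input vectors are packed back to back as in your code, with N now counted in uint4 elements; reading several uint4 per thread is a straightforward extension):
__global__ void logical_and_vec4(const uint4* in, uint4* out, int N) {
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    // one 16-byte coalesced load per input vector into registers...
    uint4 a = in[idx];
    uint4 b = in[idx + N];
    // ...process in registers...
    a.x &= b.x;
    a.y &= b.y;
    a.z &= b.z;
    a.w &= b.w;
    // ...and a single 16-byte coalesced store
    out[idx] = a;
}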
Update
Does it make any difference if you re-write your code as follows?
__global__ void logical_and(unsigned int* in, unsigned int* out, int N) {
    int idx = blockIdx.x*blockDim.x*chunksize+threadIdx.x;
    unsigned int buff[chunksize];
    #pragma unroll
    for ( int k = 0 ; k < chunksize ; k++ )
        buff[k] = in[ blockDim.x*k + idx ];
    #pragma unroll
    for ( int k = 0 ; k < chunksize ; k++ )
        buff[k] &= in[ blockDim.x*k + idx + N ];
    #pragma unroll
    for ( int k = 0 ; k < chunksize ; k++ )
        out[ blockDim.x*k + idx ] = buff[k];
}
Note that I've assumed chunksize is a variable you've #defined somewhere, e.g.
#define chunksize 4
And that you have to divide the number of blocks you launch and N by that number. I've also used unsigned int, which is just four uchars packed together. In your calling function, you may have to cast your pointers accordingly.
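For example, the launch in your mexFunction might then look roughly like this (a sketch only; gpu and gpures are the unsigned char* device pointers you already allocate, and the constants follow from your 2048000-byte vectors):
#define chunksize 4

// 2048000 bytes per vector = 512000 unsigned ints per vector
const int Nwords = 2048000 / 4;

dim3 dimBlock(128);
// each thread now handles chunksize words, so divide the block count accordingly
dim3 dimGrid(Nwords / (dimBlock.x * chunksize));

// reinterpret the byte pointers as packed 32-bit words
logical_and<<<dimGrid, dimBlock>>>((unsigned int*)gpu, (unsigned int*)gpures, Nwords);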

What I think is happening is called false sharing. I think the problem is that the byte-sized regions you are trying to write from your threads are producing a massive race condition, because different threads are trying to write to the same word-aligned address. I'm not sure of the details on the GPU, but on the CPU, when different threads try to write to memory in the same 256-byte-aligned region (a cache line), they continuously block each other, and your overall performance plummets.


Strange cudaMemcpy execution time

I'm currently working on a CUDA code which computes a simple pixel-by-pixel difference of two images (size: 2560x1706 px) in order to compare the execution times of the CPU and the GPU.
I run a "for" loop of 1000 iterations of my kernel to get a more significant execution time, and I perform the cudaMemcpy (from device to host) straight after the loop to retrieve the computed data.
Nevertheless, this cudaMemcpy took 2800 ms, which is much higher than expected. I was just wondering why I obtain such a result.
Here is my kernel code:
__global__ void diff (unsigned char *data1, unsigned char *data2, int *data_res)
{
    int v = threadIdx.x + blockIdx.x*blockDim.x;
    if (v < N)
    {
        data_res[v] = (int) data2[v] - (int) data1[v];
    }
}
Here are the kernel calls:
cudaProfilerStart();
// Cuda allocation
cudaMalloc((void**)&dev_data1, N*sizeof(unsigned char));
cudaMalloc((void**)&dev_data2, N*sizeof(unsigned char));
cudaMalloc((void**)&dev_data_res, N*sizeof(int));
// Cuda memory copy
cudaMemcpy(dev_data1, img1->data, N*sizeof(unsigned char), cudaMemcpyHostToDevice);
cudaMemcpy(dev_data2, img2->data, N*sizeof(unsigned char), cudaMemcpyHostToDevice);
cudaMemcpy(dev_data_res, imgresult->data, N*sizeof(int), cudaMemcpyHostToDevice);
//Simulate nb_loops images
for(int m = 0; m < nb_loops ; m++)
{
    diff<<<blck_nb, thrd_nb>>>(dev_data1, dev_data2, dev_data_res);
    //printf("%4d", m);
}
printf("WAITING FOR MEMCPY...\n");
clock_t begin = clock(), diff;
cudaMemcpy(imgresult_data, dev_data_res, N*sizeof(int), cudaMemcpyDeviceToHost);
diff = clock() - begin;
float msec = diff*1000/CLOCKS_PER_SEC;
printf("\t \nTime of the MEMCPY : %2.3f ms\n", msec);
printf("MEMCPY DEVICE TO HOST OK!\n");
cudaProfilerStop();
And here is the screenshot of the execution time results:
CUDA kernel launches are asynchronous, and cudaMemcpy is a blocking call. So what you are calling memcpy time is really kernel execution time + memcpy time. Change your code like this:
...
for(int m = 0; m < nb_loops ; m++)
{
    diff<<<blck_nb, thrd_nb>>>(dev_data1, dev_data2, dev_data_res);
    //printf("%4d", m);
}
cudaDeviceSynchronize();
printf("WAITING FOR MEMCPY...\n");
....
and this timing should be correct.
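Alternatively, a minimal sketch that times only the copy with CUDA events (reusing the variables from your code; untested) would be:
// make sure all queued kernels have finished before timing the copy
cudaDeviceSynchronize();

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start, 0);
cudaMemcpy(imgresult_data, dev_data_res, N*sizeof(int), cudaMemcpyDeviceToHost);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);

float msec = 0.0f;
cudaEventElapsedTime(&msec, start, stop);   // elapsed time of the memcpy alone, in ms
printf("Time of the MEMCPY : %2.3f ms\n", msec);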

CUDA kernels are not overlapping

I have a simple vector multiplication kernel, which I am executing for 2 streams. But when I profile in NVVP, the kernels do not seem to overlap. Is it because each kernel execution utilizes 100% of the GPU? If not, what can be the cause?
Source code :
#include "common.h"
#include <cstdlib>
#include <stdio.h>
#include <math.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda_profiler_api.h"
#include <string.h>
const int N = 1 << 20;
__global__ void kernel(int n, float *x, float *y)
{
int i = blockIdx.x*blockDim.x + threadIdx.x;
if (i < n) y[i] = x[i] * y[i];
}
int main()
{
float *x, *y, *d_x, *d_y, *d_1, *d_2;
x = (float*)malloc(N*sizeof(float));
y = (float*)malloc(N*sizeof(float));
cudaMalloc(&d_x, N*sizeof(float));
cudaMalloc(&d_y, N*sizeof(float));
cudaMalloc(&d_1, N*sizeof(float));
cudaMalloc(&d_2, N*sizeof(float));
for (int i = 0; i < N; i++) {
x[i] = 1.0f;
y[i] = 2.0f;
}
cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, y, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_1, x, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_2, y, N*sizeof(float), cudaMemcpyHostToDevice);
const int num_streams = 8;
cudaStream_t stream1;
cudaStream_t stream2;
cudaStreamCreateWithFlags(&stream1, cudaStreamNonBlocking);
cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking);
cudaEvent_t start, stop;
float elapsedTime;
cudaEventCreate(&start);
cudaEventRecord(start, 0);
for (int i = 0; i < 300; i++) {
kernel << <512, 512, 0, stream1 >> >(N, d_x, d_y);
kernel << <512, 512, 0, stream2 >> >(N, d_1, d_2);
}
cudaStreamSynchronize(stream1);
cudaStreamSynchronize(stream2);
// cudaDeviceSynchronize();
cudaEventCreate(&stop);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
printf("Elapsed time : %f ms\n", elapsedTime);
cudaDeviceReset();
cudaProfilerStop();
return 0;
}
EDIT: From the comments I understand that each kernel utilizes the GPU fully, so what is the best approach for achieving this 262144-element vector multiplication (for multiple streams)?
My device information :
CUDA Device Query...
There are 1 CUDA devices.
CUDA Device #0
Major revision number: 5
Minor revision number: 0
Name: GeForce GTX 850M
Total global memory: 0
Total shared memory per block: 49152
Total registers per block: 65536
Warp size: 32
Maximum memory pitch: 2147483647
Maximum threads per block: 1024
Maximum dimension 0 of block: 1024
Maximum dimension 1 of block: 1024
Maximum dimension 2 of block: 64
Maximum dimension 0 of grid: 2147483647
Maximum dimension 1 of grid: 65535
Maximum dimension 2 of grid: 65535
Clock rate: 901500
Total constant memory: 65536
Texture alignment: 512
Concurrent copy and execution: Yes
Number of multiprocessors: 5
Kernel execution timeout: Yes
The reason why your kernels don't overlap is that your GPU is 'filled' with execution threads, as @Robert Crovella mentions. Checking the Compute Capabilities chapter of the CUDA Programming Guide, there is a limit of 2048 threads per SM for your compute capability (5.0). You have 5 SMs, so at most 10240 threads can run simultaneously on your device. You are launching 512x512 = 262144 threads with just a single kernel call, and that leaves pretty much no room at all for the other kernel call.
You need to launch kernels small enough that two can run concurrently on your device.
I'm not an expert on streams, but from what I've understood, if you want to run your program using streams, you need to split the data up into chunks and calculate a proper offset mechanism so that each stream can access its own data. In your current code, each stream you launch performs exactly the same calculation over exactly the same data. You have to split the data among the streams.
Other than that, if you want to get the maximum performance you need to overlap kernel execution with asynchronous data transfers. The easiest way to do this is to assign a scheme like the following to each of your streams, as presented here:
for (int i = 0; i < nStreams; ++i) {
    int offset = i * streamSize;
    cudaMemcpyAsync(&d_a[offset], &a[offset], streamBytes, cudaMemcpyHostToDevice, stream[i]);
    kernel<<<streamSize/blockSize, blockSize, 0, stream[i]>>>(d_a, offset);
    cudaMemcpyAsync(&a[offset], &d_a[offset], streamBytes, cudaMemcpyDeviceToHost, stream[i]);
}
This configuration simply tells each stream to do a memcpy, then execute the kernel on some data, then copy the data back. After the async calls, the streams will work simultaneously, completing their tasks.
PS: I would also recommend revising your kernel. Using one thread to compute just one multiplication is overkill. I would use each thread to process more data, as sketched below.
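For instance, a minimal sketch of such a kernel using a grid-stride loop, so that each thread handles several elements and the grid stays small enough for two kernels to fit on the device (the launch configuration below is only an illustration):
__global__ void kernel_strided(int n, float *x, float *y)
{
    // each thread walks over the array with a stride equal to the total thread count
    for (int i = blockIdx.x*blockDim.x + threadIdx.x; i < n; i += blockDim.x*gridDim.x)
        y[i] = x[i] * y[i];
}

// e.g. launch with far fewer threads than elements:
// kernel_strided<<<10, 512, 0, stream1>>>(N, d_x, d_y);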

Accurate method to calculate double FMA and Shared memory latency

I am trying to come up with an accurate way to measure the latency of two operations:
1) Latency of a double precision FMA operation.
2) Latency of a double precision load from shared memory.
I am using a K20x and was wondering if this code would give accurate measurements.
#include <cuda.h>
#include <stdlib.h>
#include <stdio.h>
#include <iostream>
using namespace std;
//Clock rate
#define MHZ 732e6
//number of streaming multiprocessors
#define SMS 14
// number of double precision units
#define DP_UNITS 16*4
//number of shared banks
#define SHARED_BANKS 32
#define ITER 100000
#define NEARONE 1.0000000000000004
__global__ void fma_latency_kernal(double *in, double *out){
int tid = blockIdx.x*blockDim.x+threadIdx.x;
double val = in[tid];
#pragma unroll 100
for(int i=0; i<ITER; i++){
val+=val*NEARONE;
}
out[tid]=val;
}
__global__ void shared_latency_kernel(double *in, double *out){
volatile extern __shared__ double smem[];
int tid = blockIdx.x*blockDim.x+threadIdx.x;
smem[threadIdx.x]=in[tid];
#pragma unroll 32
for(int i=0; i<ITER; i++){
smem[threadIdx.x]=smem[(threadIdx.x+i)%32]*NEARONE;
}
out[tid]=smem[threadIdx.x];
}
int main (int argc , char **argv){
float time;
cudaEvent_t start, stop, start2, stop2;
double *d_A, *d_B;
cudaMalloc(&d_A, DP_UNITS*SMS*sizeof(double));
cudaMalloc(&d_B, DP_UNITS*SMS*sizeof(double));
cudaError_t err;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
fma_latency_kernal<<<SMS, DP_UNITS>>>(d_A, d_B);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
time/=1000;
err = cudaGetLastError();
if(err!=cudaSuccess)
printf("Error FMA: %s\n", cudaGetErrorString(err));
printf("Latency of FMA = %3.1f clock cycles\n", (time/(double)ITER)*(double)MHZ);
cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte);
cudaEventCreate(&start2);
cudaEventCreate(&stop2);
cudaEventRecord(start2, 0);
shared_latency_kernel<<<1, SHARED_BANKS, SHARED_BANKS*sizeof(double)>>>(d_A, d_B);
cudaEventRecord(stop2, 0);
cudaEventSynchronize(stop2);
cudaEventElapsedTime(&time, start2, stop2);
time/=1000;
err = cudaGetLastError();
if(err!=cudaSuccess)
printf("Error Shared Memory: %s\n", cudaGetErrorString(err));
printf("Latency of Shared Memory = %3.1f clock cycles\n", time/(double)ITER*(double)MHZ);
}
My results on the K20x are the following:
Latency of FMA = 16.4 clock cycles
Latency of Shared Memory = 60.7 clock cycles
This seems reasonable to me, but I am not sure how accurate it is.
Your latency values look very high to me - nearly double what I'd expect. To measure how many cycles something takes on the GPU, you can insert clock() calls before and after the relevant part of the kernel function. The clock() function returns the current cycle count as an int, so by subtracting the first value from the second you get the number of cycles that passed between dispatching the first clock instruction and dispatching the second clock instruction.
Note that the numbers you get from this method will include extra time from the clock instructions themselves; I believe that by default a thread will block for several cycles immediately before and after every clock instruction, so you may want to experiment with that to see how many cycles it adds so you can subtract them back out.
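As an illustration, a minimal sketch (reusing ITER and NEARONE from your code, using clock64() to avoid counter wrap-around, and adding a hypothetical cycles output array) could look like this:
__global__ void fma_latency_clocked(double *in, double *out, long long *cycles){
    int tid = blockIdx.x*blockDim.x+threadIdx.x;
    double val = in[tid];
    long long c0 = clock64();          // cycle counter before the dependent FMA chain
    #pragma unroll 100
    for(int i=0; i<ITER; i++){
        val += val*NEARONE;            // each FMA depends on the previous one
    }
    long long c1 = clock64();          // cycle counter after the chain
    out[tid] = val;                    // keep val live so the loop isn't optimized away
    if (threadIdx.x == 0)
        cycles[blockIdx.x] = c1 - c0;  // divide by ITER on the host for the per-FMA latency
}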

How could we generate random numbers in CUDA C with a different seed on each run?

I am working on a stochastic process and I want to generate a different series of random numbers in the CUDA kernel each time I run the program.
This is similar to what we do in C++ by declaring
seed = time(NULL)
followed by srand(seed)
and rand()
I can pass seeds from the host to the device via the kernel, but the problem in doing this is that I would have to pass an entire array of seeds into the kernel for each thread to have a different random seed each time. Is there a way I could generate a random seed, process ID, machine time or something like that within the kernel and pass it as a seed?
You don't need to pass an array of random seeds; when you use the cuRAND library, you can properly set the sequence number parameter of curand_init. For example [disclaimer: this is an untested function]:
__global__ void generate_random_numbers(float* numbers, unsigned long seed, int Np) {
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < Np) {
        curandState state;
        curand_init(seed, i, 0, &state);
        numbers[i] = curand_uniform(&state);
    }
}
You can also avoid passing the seed from outside if you change the curand_init instruction to
curand_init(clock64(), i, 0, &state);
EDIT
Following Roger Dahl's comment, I have done a comparison (Kepler K20c) between four different possibilities for the generation of arrays of 131072 elements:
Single random number generation: separate kernels for initialization and random number generation;
Single random number generation: unique kernel for initialization and random number generation;
Multiple random number generation: separate kernels for initialization and random number generation;
Multiple random number generation: unique kernel for initialization and random number generation;
Below is the code. The timings for the four cases were the following:
861ms;
852ms;
866ms;
2556ms;
I hope I have correctly understood the performance issue raised by Roger Dahl.
#include <stdio.h>
#include <curand.h>
#include <curand_kernel.h>
#define DSIZE 8192*16
#define nTPB 256
/***********************/
/* CUDA ERROR CHECKING */
/***********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/*************************/
/* CURAND INITIALIZATION */
/*************************/
__global__ void initCurand(curandState *state, unsigned long seed){
int idx = threadIdx.x + blockIdx.x * blockDim.x;
curand_init(seed, idx, 0, &state[idx]);
}
__global__ void testrand1(curandState *state, float *a){
int idx = threadIdx.x + blockIdx.x * blockDim.x;
a[idx] = curand_uniform(&state[idx]);
}
__global__ void testrand2(unsigned long seed, float *a){
int idx = threadIdx.x + blockIdx.x * blockDim.x;
curandState state;
curand_init(seed, idx, 0, &state);
a[idx] = curand_uniform(&state);
}
/********/
/* MAIN */
/********/
int main() {
int n_iter = 20;
curandState *devState; gpuErrchk(cudaMalloc((void**)&devState, DSIZE*sizeof(curandState)));
float *d_a; gpuErrchk(cudaMalloc((void**)&d_a, DSIZE*sizeof(float)));
float time;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
for (int i=0; i<n_iter; i++) {
initCurand<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, 1);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
testrand1<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, d_a);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
}
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Elapsed time for separate kernels: %3.1f ms \n", time);
cudaEventRecord(start, 0);
for (int i=0; i<n_iter; i++) {
testrand2<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(1, d_a);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
}
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Elapsed time for single kernels: %3.1f ms \n", time);
cudaEventRecord(start, 0);
for (int i=0; i<n_iter; i++) {
initCurand<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, 1);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
testrand1<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, d_a);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
testrand1<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, d_a);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
testrand1<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, d_a);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
}
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Elapsed time for separate kernels with multiple random number generation: %3.1f ms \n", time);
cudaEventRecord(start, 0);
for (int i=0; i<n_iter; i++) {
testrand2<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(1, d_a);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
testrand2<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(1, d_a);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
testrand2<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(1, d_a);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
}
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Elapsed time for single kernels for multiple random number generation: %3.1f ms \n", time);
getchar();
}
Output on GTX660:
Elapsed time for separate kernels: 1960.3 ms
Elapsed time for single kernels: 1536.9 ms
Elapsed time for separate kernels with multiple random number generation: 1576.0 ms
Elapsed time for single kernels for multiple random number generation: 4612.2 ms
Output on GTX570:
Elapsed time for separate kernels: 957.2 ms
Elapsed time for single kernels: 947.7 ms
Elapsed time for separate kernels with multiple random number generation: 964.6 ms
Elapsed time for single kernels for multiple random number generation: 2839.0 ms
Approximately the same performance as on the K20c.
Using a different seed on each run should be straightforward. The exact method will depend on which generator you are using, but if you're using one of the cuRAND generators then you can cast your time_t from time(NULL) to a 64-bit integer and pass it in to the seed functions.
If you're calling the generator from your kernel you will need to pass this seed in either as a kernel argument or via a __device__ variable. You can then use an offset to curand_init() or use skip_ahead() to get distinct sub-sequences.
If you have a specific generator for which this will not work, please post more information.
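For example, with the cuRAND host API, a minimal sketch (untested; d_numbers and N are placeholders, and you need to link against -lcurand) would be:
#include <ctime>
#include <cuda_runtime.h>
#include <curand.h>

int main() {
    const int N = 1 << 20;
    float *d_numbers;
    cudaMalloc(&d_numbers, N * sizeof(float));

    curandGenerator_t gen;
    curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
    // cast time(NULL) to a 64-bit integer so each run gets a different seed
    curandSetPseudoRandomGeneratorSeed(gen, (unsigned long long) time(NULL));
    curandGenerateUniform(gen, d_numbers, N);
    cudaDeviceSynchronize();

    curandDestroyGenerator(gen);
    cudaFree(d_numbers);
    return 0;
}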
You can create more than one global function for random number initialization and generation, or create a loop over the global functions.
For example:
gpuErrchk(cudaMalloc((void**)&gpu_no, N * sizeof(double))); // allocate memory for the random numbers on the device/GPU
for (int rns = 0; rns < 5; rns++) { // to seed 'loop' times
    init<<<N, 10>>>(devState, time(0));   // re-seed the generator states
    rndn<<<N, 10>>>(devState, gpu_no);    // invoke kernel to generate the random numbers
    gpuErrchk(cudaMemcpy(cpu_no, gpu_no, N * sizeof(double), cudaMemcpyDeviceToHost));
}
cout << "the transition matrix " << ++generate << " seed generation is done" << endl;
This does not have any noticeable effect on the random numbers generated, but there is a risk that the sequences are correlated, and also of a lack of convergence in the long run. Why would you want to seed more than once in an iteration anyway?
You can use the library functions to generate different types of random number distributions, like curand_uniform, curand_normal, curand_poisson and so on.
I don't know if this answers your question.

Latency of shuffle instructions in CUDA

About the latency of the __shfl() instruction:
Does the following instruction
c = __shfl(c, indi);
/*
   where indi is any integer number (may be random, < 32),
   and is different for each lane ID.
*/
have the same latency compared to, let's say:
c=__shfl_down(c,1);
All warp-shuffle instructions have the same performance.
To provide a "quantitative" follow-up answer to Robert's answer, let us consider Mark Harris' reduction approach using CUDA shuffle operations detailed at Faster Parallel Reductions on Kepler.
In this approach, warp reduction is performed by using __shfl_down. An alternative approach to warp reduction is using __shfl_xor according to Lecture 4: warp shuffles, and reduction / scan operations. Below, I'm reporting the full code implementing both the approaches. If tested on a Kepler K20c, both take 0.044ms to reduce an array of N=200000 float elements. Relevantly, both the approaches outperform Thrust reduce by two orders of magnitude since the execution time for the Thrust case is 1.06ms for the same test.
Here is the full code:
#include <cstdio>
#include <thrust/device_vector.h>
#define warpSize 32
/***********************************************/
/* warpReduceSum PERFORMING REDUCTION PER WARP */
/***********************************************/
__forceinline__ __device__ float warpReduceSum(float val) {
for (int offset = warpSize/2; offset > 0; offset /= 2) val += __shfl_down(val, offset);
//for (int i=1; i<warpSize; i*=2) val += __shfl_xor(val, i);
return val;
}
/*************************************************/
/* blockReduceSum PERFORMING REDUCTION PER BLOCK */
/*************************************************/
__forceinline__ __device__ float blockReduceSum(float val) {
// --- The shared memory is appointed to contain the warp reduction results. It is understood that the maximum number of threads per block will be
// 1024, so that there will be at most 32 warps per each block.
static __shared__ float shared[32];
int lane = threadIdx.x % warpSize; // Thread index within the warp
int wid = threadIdx.x / warpSize; // Warp ID
// --- Performing warp reduction. Only the threads with 0 index within the warp have the "val" value set with the warp reduction result
val = warpReduceSum(val);
// --- Only the threads with 0 index within the warp write the warp result to shared memory
if (lane==0) shared[wid]=val; // Write reduced value to shared memory
// --- Wait for all warp reductions
__syncthreads();
// --- There will be at most 1024 threads within a block and at most 1024 blocks within a grid. The partial sum is read from shared memory only
// if the corresponding warp existed; otherwise the partial sum is set to zero.
val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
// --- The first warp performs the final partial warp summation.
if (wid==0) val = warpReduceSum(val);
return val;
}
/********************/
/* REDUCTION KERNEL */
/********************/
__global__ void deviceReduceKernel(float *in, float* out, int N) {
float sum = 0.f;
// --- Reduce multiple elements per thread.
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) sum += in[i];
sum = blockReduceSum(sum);
if (threadIdx.x==0) out[blockIdx.x]=sum;
}
/********/
/* MAIN */
/********/
int main() {
const int N = 200000;
thrust::host_vector<float> h_out(N,0.f);
thrust::device_vector<float> d_in(N,3.f);
thrust::device_vector<float> d_out(N);
int threads = 512;
int blocks = min((N + threads - 1) / threads, 1024);
float time;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
// --- Performs the block reduction. It returns an output vector containig the block reductions as elements
cudaEventRecord(start, 0);
deviceReduceKernel<<<blocks, threads>>>(thrust::raw_pointer_cast(d_in.data()), thrust::raw_pointer_cast(d_out.data()), N);
// --- Performs a second block reduction with only one block. The input is an array of all 0's, except the first elements which are the
// block reduction results of the previous step.
deviceReduceKernel<<<1, 1024>>>(thrust::raw_pointer_cast(d_out.data()), thrust::raw_pointer_cast(d_out.data()), blocks);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("CUDA Shuffle - elapsed time: %3.5f ms \n", time);
h_out = d_out;
cudaEventRecord(start, 0);
float sum = thrust::reduce(d_in.begin(),d_in.end(),0.f,thrust::plus<float>());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("CUDA Thrust - elapsed time: %3.5f ms \n", time);
printf("Shuffle result = %f\n",h_out[0]);
printf("Thrust result = %f\n",sum);
getchar();
}