I am trying to call cudaMemsetAsync from kernel (so called "dynamic parallelism"). But no matter what value I use, it always set memory to 0.
Here is my test code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda_device_runtime_api.h"
#include <stdio.h>
const int size = 5;
__global__ void kernel(int *c)
{
cudaMemsetAsync(c, 0x7FFFFFFF, size * 4, NULL);
}
int main()
{
cudaError_t cudaStatus;
int c[size] = { 12, 12, 12, 12, 12 };
int *dev_c = 0;
cudaStatus = cudaSetDevice(0);
cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
cudaStatus = cudaMemcpy(dev_c, c, size * sizeof(int), cudaMemcpyHostToDevice);
kernel <<< 1, 1 >>>(dev_c);
cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(dev_c);
cudaStatus = cudaDeviceReset();
printf("%d\n", cudaStatus);
printf("{%d,%d,%d,%d,%d}\n", c[0], c[1], c[2], c[3], c[4]);
return 0;
}
And if I run it, I got output like this:
>nvcc -run kernel.cu -gencode=arch=compute_35,code=\"sm_35,compute_35\" -rdc=true -lcudadevrt
kernel.cu
Creating library a.lib and object a.exp
0
{0,0,0,0,0}
When I call memory set, I use value 0x7FFFFFFF. I'm expecting non-zero numbers, but it always shows zero.
Is this a bug? or I did something wrong? I'm using CUDA 8.0
I can confirm this appears not to work in CUDA 8 on the systems I tested it with.
If you want a single thread to perform the operation, you can use memset directly in device code (it, like memcpy, has been supported forever). The kernel will emit a byte sized loop inline within your kernel and the operation will be handled by each running thread.
If you want a dynamic parallelism style memset operation, then the easiest thing is to make your own. A trivial (and very, very lightly tested) implementation in the code you posted might look like this:
#include <cstring>
#include <cstdio>
const int size = 5;
__global__ void myMemset_kernel(void* p, unsigned char val, size_t sz)
{
size_t tid = threadIdx.x + blockDim.x * blockIdx.x;
unsigned char* _p = (unsigned char*)p;
for(; tid < sz; tid += blockDim.x * gridDim.x) {
_p[tid] = val;
}
}
__device__ void myMemset(void* p, unsigned int val, size_t sz, cudaStream_t s=NULL)
{
const dim3 blocksz(256,1,1);
size_t nblocks = (sz + blocksz.x -1) / blocksz.x;
unsigned charval = val & 0xff;
myMemset_kernel<<< dim3(nblocks,1,1), blocksz, 0, s >>>(p, charval, sz);
}
__global__ void kernel(int *c)
{
cudaStream_t s;
cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking);
myMemset(c, 0x7FFFFFFF, size * 4, s);
cudaDeviceSynchronize();
}
int main()
{
int c[size];
int *dev_c;
memset(&c[0], 0xffffff0c, size * sizeof(int));
printf("{%08x,%08x,%08x,%08x,%08x}\n", c[0], c[1], c[2], c[3], c[4]);
cudaMalloc((void**)&dev_c, size * sizeof(int));
cudaMemcpy(dev_c, c, size * sizeof(int), cudaMemcpyHostToDevice);
kernel <<< 1, 1 >>>(dev_c);
cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(dev_c);
printf("{%08x,%08x,%08x,%08x,%08x}\n", c[0], c[1], c[2], c[3], c[4]);
return 0;
}
which compiles and does this:
$ nvcc -rdc=true -arch=sm_52 -o memset memset.cu -lcudadevrt
$ ./memset
{0c0c0c0c,0c0c0c0c,0c0c0c0c,0c0c0c0c,0c0c0c0c}
{ffffffff,ffffffff,ffffffff,ffffffff,ffffffff}
A final point -- note the values above and read this question and answer. In your code, it is not possible to use cudaMemset to apply a value of 0x7FFFFFFF. Although the value argument is an unsigned integer, cudaMemset and its relatives work like regular memset and set byte values. Only the least significant byte of the 32 bit argument is used to set values. If your objective is to set 32 bit values, then you will need to make your own version of memset for that purpose anyway.
Related
I receive Segmentation fault (core dumped) when i run this code.
I know the cudaMalloc is the problem, but I have no idea how to solve it. I just started learning some CUDA programming and im not familiar with it. I'm working on wsl if it matters.
#include<stdio.h>
#define SIZE 20
__global__ void VectorAdd(int *a,int *b,int *c,int n){
int i = threadIdx.x;
if(i<n)
c[i]=a[i]+b[i];
}
int main(){
int *a,*b,*c;
cudaMalloc((void**)&a,SIZE *sizeof(int));
cudaMalloc((void**)&b, SIZE *sizeof(int));
cudaMalloc((void**)&c,SIZE *sizeof(int));
for (int i=1;i<SIZE;i++){
a[i]=i;
b[i]=i;
c[i]=0;
}
VectorAdd<<<1, SIZE>>>(a,b,c,SIZE);
cudaDeviceSynchronize();
for(int i=1;i<SIZE;i++){
printf("%d \n",c[i]);
}
cudaFree(a);
cudaFree(b);
cudaFree(c);
return 0;
}
As the comments already suggested, you have to initialize values for arrays a and b on the host, copy them to device array, and once computation is completed you have to copy data from c back to the host.
#include<stdio.h>
#define SIZE 20
__global__ void VectorAdd(int *a,int *b,int *c,int n){
int i = threadIdx.x;
if(i<n)
c[i]=a[i]+b[i];
}
int main(){
int *a,*b,*c;
int *h_a, *h_b, *h_c; /*declare pointers to host arrays*/
cudaMalloc((void**)&a,SIZE *sizeof(int));
cudaMalloc((void**)&b, SIZE *sizeof(int));
cudaMalloc((void**)&c,SIZE *sizeof(int));
/* allocate memory for host arrays */
h_a = new int[SIZE];
h_b = new int[SIZE];
h_c = new int[SIZE];
/* initialize values on host arrays */
for (int i = 0; i < SIZE; i++){
h_a[i]=i;
h_b[i]=i;
}
/*copy data from host to device */
cudaMemcpy(a, h_a, SIZE*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(b, h_b, SIZE*sizeof(int), cudaMemcpyHostToDevice);
VectorAdd<<<1, SIZE>>>(a,b,c,SIZE);
// cudaDeviceSynchronize(); /* this is not needed because cudaMemcpy implies sync. */
/*copy results from device to host*/
cudaMemcpy(h_c, c, SIZE*sizeof(int), cudaMemcpyDeviceToHost);
for(int i = 0; i < SIZE; i++){
printf("%d \n",h_c[i]);
}
cudaFree(a);
cudaFree(b);
cudaFree(c);
/* free host memory */
delete [] h_a;
delete [] h_b;
delete [] h_c;
return 0;
}
Notes
For some reason you start iterating from position 1 instead 0 in you for loops! If this is wrong by accident I fixed it!
cudaMemcpy always performs synchronization between host and device. So, cudaDeviceSynchronize() is not necessary after kernel invocation.
To avoid explicit handling of separated host and device data, you could use cudaMallocManaged instead of cudaMalloc.
I'm running a toy CUDA sample on my GeForce 1080 Ti (Pascal) on windows 10 and CUDA 9.2.
Goal is to test cudaMemPrefetchAsync to the CPU, as it's supposed to work.
However, I get a CUDA error (invalid device ordinal) on this particular line.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cstdio>
#include <cstdlib>
void fill(int* a, int val, int N) {
for (int k = 0; k < N; ++k) {
a[k] = val;
}
}
__global__ void add(int* a, int* b, int N)
{
for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < N; i += blockDim.x * gridDim.x) {
a[i] += b[i];
}
}
inline void check(cudaError_t err, const char* file, int line) {
if (err != cudaSuccess) {
::fprintf(stderr, "ERROR at %s[%d] : %s\n", file, line, cudaGetErrorString(err));
abort();
}
}
#define CUDA_CHECK(err) do { check(err, __FILE__, __LINE__); } while(0)
int main()
{
int deviceId;
CUDA_CHECK(cudaGetDevice(&deviceId));
const int N = 1024*1024*32;
int *a, *b;
CUDA_CHECK(cudaMallocManaged(&a, N * sizeof(int)));
CUDA_CHECK(cudaMallocManaged(&b, N * sizeof(int)));
CUDA_CHECK(cudaMemPrefetchAsync(a, N * sizeof(int), cudaCpuDeviceId)); // program breaks here
CUDA_CHECK(cudaMemPrefetchAsync(b, N * sizeof(int), cudaCpuDeviceId));
fill(a, 1, N);
fill(a, 2, N);
CUDA_CHECK(cudaMemPrefetchAsync(a, N * sizeof(int), deviceId));
CUDA_CHECK(cudaMemPrefetchAsync(b, N * sizeof(int), deviceId));
add<<<32, 256>>>(a, b, N);
CUDA_CHECK(cudaGetLastError());
CUDA_CHECK(cudaDeviceSynchronize());
return 0;
}
Is that a hardware/driver/OS limitation? Can I simply ignore the error?
Is that a hardware/driver/OS limitation?
Yes, the latter. Quoting from the documentation
GPUs with SM architecture 6.x or higher (Pascal class or newer)
provide additional Unified Memory features such as on-demand page
migration and GPU memory oversubscription that are outlined throughout
this document. Note that currently these features are only supported
on Linux operating systems.
So asynchronous page migration is not supported in Windows at the moment and that it why you get an error when you try to enable it.
I want to add 128-bit vectors with carry. My 128-bit version (addKernel128 in the code below) is twice slower than the basic 32-bit version (addKernel32 below).
Do I have memory coalescing problems ? How can I get better performance ?
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#define UADDO(c, a, b) asm volatile("add.cc.u32 %0, %1, %2;" : "=r"(c) : "r"(a) , "r"(b));
#define UADDC(c, a, b) asm volatile("addc.cc.u32 %0, %1, %2;" : "=r"(c) : "r"(a) , "r"(b));
__global__ void addKernel32(unsigned int *c, const unsigned int *a, const unsigned int *b, const int size)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
while (tid < size)
{
c[tid] = a[tid] + b[tid];
tid += blockDim.x * gridDim.x;
}
}
__global__ void addKernel128(unsigned *c, const unsigned *a, const unsigned *b, const int size)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
while (tid < size / 4)
{
uint4 a4 = ((const uint4 *)a)[tid],
b4 = ((const uint4 *)b)[tid],
c4;
UADDO(c4.x, a4.x, b4.x)
UADDC(c4.y, a4.y, b4.y) // add with carry
UADDC(c4.z, a4.z, b4.z) // add with carry
UADDC(c4.w, a4.w, b4.w) // add with carry (no overflow checking for clarity)
((uint4 *)c)[tid] = c4;
tid += blockDim.x * gridDim.x;
}
}
int main()
{
const int size = 10000000; // 10 million
unsigned int *d_a, *d_b, *d_c;
cudaMalloc((void**)&d_a, size * sizeof(int));
cudaMalloc((void**)&d_b, size * sizeof(int));
cudaMalloc((void**)&d_c, size * sizeof(int));
cudaMemset(d_a, 1, size * sizeof(int)); // dummy init just for the example
cudaMemset(d_b, 2, size * sizeof(int)); // dummy init just for the example
cudaMemset(d_c, 0, size * sizeof(int));
int nbThreads = 512;
int nbBlocks = 1024; // for example
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
addKernel128<<<nbBlocks, nbThreads>>>(d_c, d_a, d_b, size);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float m = 0;
cudaEventElapsedTime(&m, start, stop);
cudaFree(d_c);
cudaFree(d_b);
cudaFree(d_a);
cudaDeviceReset();
printf("Elapsed = %g\n", m);
return 0;
}
Timing CUDA code on a WDDM GPU can be quite difficult for a variety of reasons. Most of these revolve around the fact that the GPU is being managed as a display device by Windows, and this can introduce a variety of artifacts into the timing. One example is that the windows driver and WDDM will batch work for the GPU, and may interleave display work in the middle of CUDA GPU work.
if possible, time your cuda code on linux, or else on a windows GPU
in TCC mode.
for performance, always build without the -G switch. In visual studio, this usually corresponds to building the release, not the debug version of the project.
To get a good performance comparison, it's usually advisable to do some "warm up runs" before actually measuring the timing results. These will eliminate "start-up" and other one-time measurement issues, are you are more likely to get sensible results. You may also wish to run your code a number of times and average the results.
It's also usually advisable to compile with an arch flag that corresponds to your GPU, so for example -arch=sm_20 for a cc2.0 GPU.
I'm trying to learn CUDA by myself, and I'm now into the issue of branch divergence. As far as I understand, this is the name given to the problem that arises when several threads in a block are said to take a branch (due to if or switch statements, for example), but others in that block don't have to take it.
In order to investigate a little bit further this phenomena and its consequences, I've written a little file with a couple of CUDA functions. One of them is supposed to take lots of time, since the threads are stopped for much more time (9999... iterations) than in the other one (in which they're only stopped for an assignation).
However, when I run the code, I'm getting very similar times. Furthermore, even measuring the time that running both of them takes I get a time similar to running only one. Did I code anything wrong, or is there a logical explanation for this?
Code:
#include <stdio.h>
#include <stdlib.h>
#include <cutil.h>
#define ITERATIONS 9999999999999999999
#define BLOCK_SIZE 16
unsigned int hTimer;
void checkCUDAError (const char *msg)
{
cudaError_t err = cudaGetLastError();
if (cudaSuccess != err)
{
fprintf(stderr, "Cuda error: %s: %s.\n", msg,cudaGetErrorString( err) );
getchar();
exit(EXIT_FAILURE);
}
}
__global__ void divergence(float *A, float *B){
float result = 0;
if(threadIdx.x % 2 == 0)
{
for(int i=0;i<ITERATIONS;i++){
result+=A[threadIdx.x]*A[threadIdx.x];
}
} else
for(int i=0;i<ITERATIONS;i++){
result+=A[threadIdx.x]*B[threadIdx.x];
}
}
__global__ void betterDivergence(float *A, float *B){
float result = 0;
float *aux;
//This structure should not affect performance that much
if(threadIdx.x % 2 == 0)
aux = A;
else
aux = B;
for(int i=0;i<ITERATIONS;i++){
result+=A[threadIdx.x]*aux[threadIdx.x];
}
}
// ------------------------
// MAIN function
// ------------------------
int main(int argc, char ** argv){
float* d_a;
float* d_b;
float* d_result;
float *elementsA;
float *elementsB;
elementsA = (float *)malloc(BLOCK_SIZE*sizeof(float));
elementsB = (float *)malloc(BLOCK_SIZE*sizeof(float));
//"Randomly" filling the arrays
for(int x=0;x<BLOCK_SIZE;x++){
elementsA[x] = (x%2==0)?2:1;
elementsB[x] = (x%2==0)?1:3;
}
cudaMalloc((void**) &d_a, BLOCK_SIZE*sizeof(float));
cudaMalloc((void**) &d_b, BLOCK_SIZE*sizeof(float));
cudaMalloc((void**) &d_result, sizeof(float));
cudaMemcpy(d_a, elementsA, BLOCK_SIZE*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_b, elementsB, BLOCK_SIZE*sizeof(float), cudaMemcpyHostToDevice);
CUT_SAFE_CALL(cutCreateTimer(&hTimer));
CUT_CHECK_ERROR("cudaCreateTimer\n");
CUT_SAFE_CALL( cutResetTimer(hTimer) );
CUT_CHECK_ERROR("reset timer\n");
CUT_SAFE_CALL( cutStartTimer(hTimer) );
CUT_CHECK_ERROR("start timer\n");
float timerValue;
dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);
dim3 dimGrid(32/dimBlock.x, 32/dimBlock.y);
divergence<<<dimBlock, dimGrid>>>(d_a, d_b);
betterDivergence<<<dimBlock, dimGrid>>>(d_a, d_b);
checkCUDAError("kernel invocation");
cudaThreadSynchronize();
CUT_SAFE_CALL(cutStopTimer(hTimer));
CUT_CHECK_ERROR("stop timer\n");
timerValue = cutGetTimerValue(hTimer);
printf("kernel execution time (secs): %f s\n", timerValue);
return 0;
}
1) You have no memory writes in your __global__ code except the local variable(result). I'm not sure that cuda compiler does that, but all your code can be safely removed with no side effect(and maybe the compiler had done that).
2) All your reads from device memory in __global__ functions are from one place on each iteration. Cuda will store the value in register memory and the longest operation(memory access) will be done very fast here.
3) May be the compiler had replaced your cycles with single multiplication like `result=ITERATIONS*A[threadIdx.x]*B[threadIdx.x]
4) If all the code in your functions will be executed as you wrote it, your betterDivergence is going to be approximately 2 times faster than your another function because you have the loops in if branches in slower one and no loops in branches in faster one. But there won't be any idle time in threads among the threads that execute same loop because all threads are going to execute the body of the loop each iteration.
I suggest you to write another example where you will store the result in some device memory and then copy that memory back to host and make some more unpredictable calculations to prevent possible optimizations.
Below is shown the final, tested, right example of a code that allows to compare the performance between CUDA code with and without branch divergence:
#include <stdio.h>
#include <stdlib.h>
#include <cutil.h>
//#define ITERATIONS 9999999999999999999
#define ITERATIONS 999999
#define BLOCK_SIZE 16
#define WARP_SIZE 32
unsigned int hTimer;
void checkCUDAError (const char *msg)
{
cudaError_t err = cudaGetLastError();
if (cudaSuccess != err)
{
fprintf(stderr, "Cuda error: %s: %s.\n", msg,cudaGetErrorString( err) );
getchar();
exit(EXIT_FAILURE);
}
}
__global__ void divergence(float *A, float *B){
int a = blockIdx.x*blockDim.x + threadIdx.x;
if (a >= ITERATIONS) return;
if(threadIdx.x > 2)
{
for(int i=0;i<ITERATIONS;i++){
B[a]=A[a]+1;
}
} else
for(int i=0;i<ITERATIONS;i++){
B[a]=A[a]-1;
}
}
__global__ void noDivergence(float *A, float *B){
int a = blockIdx.x*blockDim.x + threadIdx.x;
if (a >= ITERATIONS) return;
if(threadIdx.x > WARP_SIZE)
{
for(int i=0;i<ITERATIONS;i++){
B[a]=A[a]+1;
}
} else
for(int i=0;i<ITERATIONS;i++){
B[a]=A[a]-1;
}
}
// ------------------------
// MAIN function
// ------------------------
int main(int argc, char ** argv){
float* d_a;
float* d_b;
float* d_result;
float *elementsA;
float *elementsB;
elementsA = (float *)malloc(BLOCK_SIZE*sizeof(float));
elementsB = (float *)malloc(BLOCK_SIZE*sizeof(float));
//"Randomly" filling the arrays
for(int x=0;x<BLOCK_SIZE;x++){
elementsA[x] = (x%2==0)?2:1;
}
cudaMalloc((void**) &d_a, BLOCK_SIZE*sizeof(float));
cudaMalloc((void**) &d_b, BLOCK_SIZE*sizeof(float));
cudaMalloc((void**) &d_result, sizeof(float));
cudaMemcpy(d_a, elementsA, BLOCK_SIZE*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_b, elementsB, BLOCK_SIZE*sizeof(float), cudaMemcpyHostToDevice);
CUT_SAFE_CALL(cutCreateTimer(&hTimer));
CUT_CHECK_ERROR("cudaCreateTimer\n");
CUT_SAFE_CALL( cutResetTimer(hTimer) );
CUT_CHECK_ERROR("reset timer\n");
CUT_SAFE_CALL( cutStartTimer(hTimer) );
CUT_CHECK_ERROR("start timer\n");
float timerValue;
dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);
dim3 dimGrid(128/dimBlock.x, 128/dimBlock.y);
//divergence<<<dimGrid, dimBlock>>>(d_a, d_b);
noDivergence<<<dimGrid, dimBlock>>>(d_a, d_b);
checkCUDAError("kernel invocation");
cudaThreadSynchronize();
CUT_SAFE_CALL(cutStopTimer(hTimer));
CUT_CHECK_ERROR("stop timer\n");
timerValue = cutGetTimerValue(hTimer)/1000;
printf("kernel execution time (secs): %f s\n", timerValue);
cudaMemcpy(elementsB, d_b, BLOCK_SIZE*sizeof(float), cudaMemcpyDeviceToHost);
return 0;
}
I'm using Win8 and Nsight in "visual studio 2010" and I installed "310.90-notebook-win8-win7-winvista-32bit-international-whql" for my Graphic card(9300m Gs).but when I try the code below,I see a black screen!and an error :"Display driver stoped responding and has recoverd"!
I know that the problem is with "cudaMemcpy",but I don't why!?
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#define N 8
__global__ void kernel(int *a)
{
int x = threadIdx.x + blockIdx.x * blockDim.x;
int step = x;
while(step<N){
a[step] = threadIdx.x;
step += x;
}
}
int main()
{
int a[N],i=N,j=0;
for(;j<N;j++)
a[j]=i--;
int *dev_a;
cudaMalloc( (void**)&dev_a, N * sizeof(int) );
cudaMemcpy( dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice);
kernel<<<2,2>>>(dev_a);
cudaError_t cudaStatus = cudaMemcpy(a, dev_a,N-1 * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
//goto Error;
}
for(j=0;j<N;j++)printf("\n%d",a[j]);
int t;
scanf("%d",&t);
}
In the kernel, the thread with threadIdx.x = 0 and blockIdx.x = 0 i.e. the first thread of the first block will run indefinitely, causing the kernel to crash.
When threadIdx.x = 0 and blockIdx.x = 0 the kernel code will become:
int x = 0;
int step = 0;
while(step<N)
{
a[step] = 0;
step += 0; //This will create infinite loop
}
Also (May be its a typo), there is a logical error in the following line of your code:
cudaError_t cudaStatus = cudaMemcpy(a, dev_a,N-1 * sizeof(int), cudaMemcpyDeviceToHost);
Considering the operator precedence in C, the expression N-1 * sizeof(int) will evaluate to N-4 (if sizeof(int) is 4).