CUDA: same kernel, but different results with __constant__ [duplicate]

This question already has answers here:
CUDA 5.0 namespaces for constant memory variable usage
(1 answer)
why do we need cudaDeviceSynchronize(); in kernels with device-printf?
(1 answer)
Issue regarding data of constant memory in CUDA
(2 answers)
CUDA: cudaMemcpyToSymbol is not copying data
(1 answer)
Closed 12 months ago.
How can cudaMemcpyToSymbol produce this behavior?
// head.h
#include <stdio.h>
__constant__ float const_mem[1];
__global__ void k0(); // defined in separate.cu
__global__ void k1(); // defined in main.cu
//separate.cu
#include "head.h"
__global__ void k0() {
printf("%f\n", const_mem[0]);
}
//main.cu
#include "head.h"
__global__ void k1() {
printf("%f\n", const_mem[0]);
}
int main() {
float arr[] = {5};
cudaMemcpyToSymbol(const_mem, arr, sizeof(float));
k0<<<1,1>>>();
k1<<<1,1>>>();
}
Compilation: nvcc main.cu separate.cu
Output of sudo nvprof ./a.out (running ./a.out by itself prints literally nothing):
0.000000
5.000000
That means the kernel written in the other translation unit is not accessing const_mem ... but how is that possible?
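For what it's worth, the behavior matches the linked duplicates: without separate compilation, each translation unit that includes head.h gets its own private copy of const_mem, and the cudaMemcpyToSymbol call in main.cu only initializes main.cu's copy, so k0 (defined in separate.cu) reads an untouched zero. A hedged sketch of the usual fix, assuming CUDA 5.0+ and relocatable device code (nvcc -rdc=true): define the symbol once, declare it extern in the header, and add cudaDeviceSynchronize() so the device printf output is actually flushed:
// head.h -- declaration only
#include <stdio.h>
extern __constant__ float const_mem[1];
__global__ void k0(); // defined in separate.cu
__global__ void k1(); // defined in main.cu

// separate.cu -- unchanged apart from the header
#include "head.h"
__global__ void k0() { printf("%f\n", const_mem[0]); }

// main.cu -- holds the single definition of const_mem
#include "head.h"
__constant__ float const_mem[1];
__global__ void k1() { printf("%f\n", const_mem[0]); }
int main() {
    float arr[] = {5};
    cudaMemcpyToSymbol(const_mem, arr, sizeof(float));
    k0<<<1,1>>>();
    k1<<<1,1>>>();
    cudaDeviceSynchronize(); // flush device-side printf before the program exits
}
// build: nvcc -rdc=true main.cu separate.cu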

Related

Increment and access global counter CUDA

I need to make my kernel communicate with the host. I tried to use a global counter (suggestions for better approaches are welcome), but the following code always prints 0. What am I doing wrong? (I tried both the commented and uncommented variants.)
#include <stdio.h>
#include <cuda_runtime.h>
//__device__ int count[1] = {0};
__device__ int count = 0;
__global__ void inc() {
//count[0]++;
atomicAdd(&count, 1);
}
int main(void) {
inc<<<1,10>>>();
cudaDeviceSynchronize();
//int *c;
int c;
cudaMemcpyFromSymbol(&c, count, sizeof(int), cudaMemcpyDeviceToHost);
printf("%d\n", c);
return 0;
}
Anytime you are having trouble with a CUDA code, I strongly encourage you to use proper CUDA error checking and run your code with cuda-memcheck, before asking others for help. Even if you don't understand the error output, providing it in your question will be useful for those trying to help you.
If you had done so, you would have received a report that cudaMemcpyFromSymbol is throwing an invalid argument error.
If you study the documentation for that function call, you will see that the 4th parameter is not the direction parameter but the offset parameter. So providing cudaMemcpyDeviceToHost is incorrect for the offset parameter. Since cudaMemcpyFromSymbol is always a device->host transfer, the direction argument is redundant, and since it has a default value, it is unnecessary. Your code works correctly for me simply by eliminating it:
$ cat t1414.cu
#include <stdio.h>
#include <cuda_runtime.h>
//__device__ int count[1] = {0};
__device__ int count = 0;
__global__ void inc() {
//count[0]++;
atomicAdd(&count, 1);
}
int main(void) {
inc<<<1,10>>>();
cudaDeviceSynchronize();
//int *c;
int c;
cudaMemcpyFromSymbol(&c, count, sizeof(int));
printf("%d\n", c);
return 0;
}
$ nvcc -o t1414 t1414.cu
$ cuda-memcheck ./t1414
========= CUDA-MEMCHECK
10
========= ERROR SUMMARY: 0 errors
$
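As an aside, "proper CUDA error checking" generally means testing the return value of every runtime API call and checking again after each kernel launch. A minimal sketch of one common wrapper macro (the macro name is mine, not from the linked material; it assumes <cstdio> and <cstdlib> are included):
#define CHECK_CUDA(call) do { \
    cudaError_t err_ = (call); \
    if (err_ != cudaSuccess) { \
        fprintf(stderr, "CUDA error '%s' at %s:%d\n", cudaGetErrorString(err_), __FILE__, __LINE__); \
        exit(EXIT_FAILURE); \
    } \
} while (0)

// typical usage around the code above:
// inc<<<1,10>>>();
// CHECK_CUDA(cudaGetLastError());        // catches launch-configuration errors
// CHECK_CUDA(cudaDeviceSynchronize());   // catches asynchronous execution errors
// CHECK_CUDA(cudaMemcpyFromSymbol(&c, count, sizeof(int)));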

CUDA: how to call a host function from a global function [duplicate]

This question already has answers here:
I get an error when I try to use printf() in a kernel
(3 answers)
Closed 5 years ago.
1 #include<stdlib.h>
2 #include<stdio.h>
3 #include"cuda.h"
4 __global__ void malloctest()
5 {
6 char * ptr=(char *)malloc(123);
7 printf("thread %d got a pointer:%p\n",threadIdx.x,ptr);
8 free(ptr);
9 }
10 int main()
11 {
12 cudaDeviceSetLimit(cudaLimitMallocHeapSize,128*1024*1024);
13 malloctest<<<1,5>>>();
14 cudaDeviceSynchronize();
15 return 0;
16 }
nvcc warning : The 'compute_10' and 'sm_10' architectures are deprecated, and may be removed in a future release.
malloctest.cu(6) (col. 9): error: calling a __host__ function("malloc") from a __global__ function("malloctest") is not allowed
malloctest.cu(7): error: calling a __host__ function("printf") from a __global__ function("malloctest") is not allowed
malloctest.cu(8): error: calling a __host__ function("free") from a __global__ function("malloctest") is not allowed
How can I make this work? Thanks.
Edit: I found the answer. I should change the GPU architecture to 3.0:
nvcc malloctest.cu -o 1 -gencode=arch=compute_30,code=\"sm_30,compute_30\"

Cuda program not working for more than 1024 threads

My program implements odd-even merge sort, and it does not work for more than 1024 threads.
I have already tried increasing the block size to 100, but it still does not work for more than 1024 threads.
I'm using Visual Studio 2012 and I have an Nvidia GeForce 610M. This is my program:
#include<stdio.h>
#include<iostream>
#include<conio.h>
#include <random>
#include <stdint.h>
#include <driver_types.h>
__global__ void odd(int *arr,int n){
int i=threadIdx.x;
int temp;
if(i%2==1&&i<n-1){
if(arr[i]>arr[i+1])
{
temp=arr[i];
arr[i]=arr[i+1];
arr[i+1]=temp;
}
}
}
__global__ void even(int *arr,int n){
int i=threadIdx.x;
int temp;
if(i%2==0&&i<n-1){
if(arr[i]>arr[i+1])
{
temp=arr[i];
arr[i]=arr[i+1];
arr[i+1]=temp;
}
}
}
int main(){
int SIZE,k,*A,p,j;
int *d_A;
float time;
printf("Enter the size of the array\n");
scanf("%d",&SIZE);
A=(int *)malloc(SIZE*sizeof(int));
cudaMalloc(&d_A,SIZE*sizeof(int));
for(k=0;k<SIZE;k++)
A[k]=rand()%1000;
cudaMemcpy(d_A,A,SIZE*sizeof(int),cudaMemcpyHostToDevice);
if(SIZE%2==0)
p=SIZE/2;
else
p=SIZE/2+1;
for(j=0;j<p;j++){
even<<<3,SIZE>>>(d_A,SIZE);
if(j!=p-1)
odd<<<3,SIZE>>>(d_A,SIZE);
if(j==p-1&&SIZE%2==0)
odd<<<1,SIZE>>>(d_A,SIZE);
}
cudaMemcpy(A,d_A,SIZE*sizeof(int),cudaMemcpyDeviceToHost);
for(k=0;k<SIZE;k++)
printf("%d ",A[k]);
free(A);
cudaFree(d_A);
getch();
}
CUDA threadblocks are limited to 1024 threads (or 512 threads, for cc 1.x gpus). The size of the threadblock is indicated in the second kernel configuration parameter in the kernel launch:
even<<<3,SIZE>>>(d_A,SIZE);
^^^^
So when you enter a SIZE value greater than 1024, this kernel will not launch.
You're getting no indication of this because you're not doing proper CUDA error checking, which is always a good idea any time you're having trouble with a CUDA code. You can also, as a quick test, run your code with cuda-memcheck to look for API errors.
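For reference, a minimal sketch of one way to cover more than 1024 elements: keep the block size fixed (256 here is just an assumption) and compute the number of blocks, then index each thread with its global id instead of threadIdx.x alone (the odd kernel would be adjusted the same way):
__global__ void even(int *arr, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;  // global thread index
    if (i % 2 == 0 && i < n - 1 && arr[i] > arr[i + 1]) {
        int temp = arr[i];
        arr[i] = arr[i + 1];
        arr[i + 1] = temp;
    }
}
// host side: launch enough 256-thread blocks to cover SIZE elements
int threads = 256;
int blocks = (SIZE + threads - 1) / threads;
even<<<blocks, threads>>>(d_A, SIZE);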

cudaMemcpyFromSymbol on a __device__ variable

I am trying to apply a kernel function to a __device__ variable which, according to the specs, resides "in global memory":
#include <stdio.h>
#include "sys_data.h"
#include "my_helper.cuh"
#include "helper_cuda.h"
#include <cuda_runtime.h>
double X[10] = {1,-2,3,-4,5,-6,7,-8,9,-10};
double Y[10] = {0};
__device__ double DEV_X[10];
int main(void) {
checkCudaErrors(cudaMemcpyToSymbol(DEV_X, X,10*sizeof(double)));
vector_projection<double><<<1,10>>>(DEV_X, 10);
getLastCudaError("oops");
checkCudaErrors(cudaMemcpyFromSymbol(Y, DEV_X, 10*sizeof(double)));
return 0;
}
The kernel function vector_projection is defined in my_helper.cuh as follows:
template<typename T> __global__ void vector_projection(T *dx, int n) {
int tid;
tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < n) {
if (dx[tid] < 0)
dx[tid] = (T) 0;
}
}
As you can see, I use cudaMemcpyToSymbol and cudaMemcpyFromSymbol to transfer data to and from the device. However, I'm getting the following error:
CUDA error at ../src/vectorAdd.cu:19 code=4(cudaErrorLaunchFailure)
"cudaMemcpyFromSymbol(Y, DEV_X, 10*sizeof(double))"
Footnote: I can of course avoid using __device__ variables and go for something like this, which works fine; I just want to see how to do the same thing (if possible) with __device__ variables.
Update: The output of cuda-memcheck can be found at http://pastebin.com/AW9vmjFs. The error messages I get are as follows:
========= Invalid __global__ read of size 8
========= at 0x000000c8 in /home/ubuntu/Test0001/Debug/../src/my_helper.cuh:75:void vector_projection<double>(double*, int)
========= by thread (9,0,0) in block (0,0,0)
========= Address 0x000370e8 is out of bounds
The root of the problem is that you are not allowed to take the address of a device variable in ordinary host code:
vector_projection<double><<<1,10>>>(DEV_X, 10);
^
Although this seems to compile correctly, the actual address passed is garbage.
To take the address of a device variable in host code, we can use cudaGetSymbolAddress.
Here is a worked example that compiles and runs correctly for me:
$ cat t577.cu
#include <stdio.h>
double X[10] = {1,-2,3,-4,5,-6,7,-8,9,-10};
double Y[10] = {0};
__device__ double DEV_X[10];
template<typename T> __global__ void vector_projection(T *dx, int n) {
int tid;
tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < n) {
if (dx[tid] < 0)
dx[tid] = (T) 0;
}
}
int main(void) {
cudaMemcpyToSymbol(DEV_X, X,10*sizeof(double));
double *my_dx;
cudaGetSymbolAddress((void **)&my_dx, DEV_X);
vector_projection<double><<<1,10>>>(my_dx, 10);
cudaMemcpyFromSymbol(Y, DEV_X, 10*sizeof(double));
for (int i = 0; i < 10; i++)
printf("%d: %f\n", i, Y[i]);
return 0;
}
$ nvcc -arch=sm_35 -o t577 t577.cu
$ cuda-memcheck ./t577
========= CUDA-MEMCHECK
0: 1.000000
1: 0.000000
2: 3.000000
3: 0.000000
4: 5.000000
5: 0.000000
6: 7.000000
7: 0.000000
8: 9.000000
9: 0.000000
========= ERROR SUMMARY: 0 errors
$
This is not the only way to address this. It is legal to take the address of a device variable in device code, so you could modify your kernel with a line something like this:
T *dx = DEV_X;
and forgo passing of the device variable as a kernel parameter. As suggested in the comments, you could also modify your code to use Unified Memory.
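A minimal sketch of that first alternative, assuming the kernel is defined in the same translation unit as DEV_X (the name vector_projection_direct is hypothetical, and the template parameter is dropped since DEV_X is a double array):
__global__ void vector_projection_direct(int n) {
    double *dx = DEV_X;  // taking the address of a __device__ variable in device code is legal
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < n && dx[tid] < 0)
        dx[tid] = 0;
}
// launched without a pointer argument: vector_projection_direct<<<1,10>>>(10);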
Regarding error checking, if you deviate from proper CUDA error checking and are not careful in your deviations, the results may be confusing. Most CUDA API calls can, in addition to errors arising from their own behavior, return an error that resulted from some previous CUDA asynchronous activity (usually kernel calls).

Update project from older CUDA version

In my older CUDA project I had the globals:
__device__ uint8_t dev_intersect
__constant__ uint8_t dev_flags
... and used them this way:
cudaGetSymbolAddress((void**)&ptr_dev_intersect,"dev_intersect")
cudaMemcpyToSymbol("dev_flags",&flags,sizeof(flags))
Now, since CUDA 5.0 the symbols must be passed directly (not as strings), so I define the globals this way:
__device__ uint8_t *dev_intersect
__constant__ uint8_t *dev_flags
...and call the functions this way:
cudaGetSymbolAddress((void**)&ptr_dev_intersect,dev_intersect)
cudaMemcpyToSymbol(dev_flags,&flags,sizeof(flags))
Am I doing it right so far? I'm asking because when I updated the code I started getting other errors, which makes me kind of suspicious. Thanks for any help.
Switching from a POD variable to a pointer is probably not what you want.
If you didn't make changes elsewhere in your code to account for that difference, I would expect things to break.
To update your cuda function calls, leave your variables as-is:
__device__ uint8_t dev_intersect;
__constant__ uint8_t dev_flags;
And just drop the quotes from your cuda API functions that use those variables:
cudaGetSymbolAddress((void**)&ptr_dev_intersect,dev_intersect);
cudaMemcpyToSymbol(dev_flags,&flags,sizeof(flags));
Here is a complete worked example:
$ cat t524.cu
#include <stdio.h>
typedef unsigned char uint8_t;
__device__ uint8_t dev_intersect;
__constant__ uint8_t dev_flags;
__global__ void mykernel(uint8_t *d1_ptr){
printf("data 1 = %c\n", *d1_ptr);
printf("dev_flags = %c\n", dev_flags);
}
int main(){
uint8_t *ptr_dev_intersect;
uint8_t flags = 'X';
uint8_t dev_intersect_data = 'Y';
cudaGetSymbolAddress((void**)&ptr_dev_intersect,dev_intersect);
cudaMemcpyToSymbol(dev_flags,&flags,sizeof(flags));
cudaMemcpyToSymbol(dev_intersect,&dev_intersect_data,sizeof(dev_intersect_data));
mykernel<<<1,1>>>(ptr_dev_intersect);
cudaDeviceSynchronize();
return 0;
}
$ nvcc -arch=sm_20 -o t524 t524.cu
$ cuda-memcheck ./t524
========= CUDA-MEMCHECK
data 1 = Y
dev_flags = X
========= ERROR SUMMARY: 0 errors
$