Copy symbol address to symbol - cuda

I'd like to set a symbol pointer to a symbol address, from the host. Something like:
__device__ float * symbolPtr; // address of symbol[3]
__device__ float symbol[5];
cudaGetSymbolAddress( &symbolPtr, symbol[3] );
I've only used this to get a device address to the host. But there is no mention that it can't set a symbol variable.
Here is another approach:
cudaMemcpytoSymbol(&symbolPtr, &symbol[3], sizeof(void*), 0, cudaMemcpyDeviceToDevice);
I doubt either of these will work. Is there a simpler way to do this?

This method worked for me. There may be a better way.
#include <stdio.h>
#define N 5
#define Q 3
// Aborts the program if the CUDA runtime has an error recorded.
// The macro reads (and clears) the last error with cudaGetLastError(), so it
// must be placed directly after the runtime call or kernel launch it checks.
//   msg: context string printed in front of the CUDA error description.
// The local is not named with a leading double underscore because such
// identifiers are reserved for the implementation.
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t cuda_check_err_ = cudaGetLastError(); \
        if (cuda_check_err_ != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    (msg), cudaGetErrorString(cuda_check_err_), \
                    __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)
// Device-side pointer; the host stores the address of one element of `symbol` in it.
__device__ float *symbolPtr;
// Device-side array that mykernel fills.
__device__ float symbol[N];

// Writes symbol[i] = i for every i in [0, N), then prints the float that
// symbolPtr points at, labelled with index Q. Each launched thread executes
// the whole loop and the printf.
__global__ void mykernel()
{
    int idx = 0;
    while (idx < N) {
        symbol[idx] = static_cast<float>(idx);
        ++idx;
    }
    printf("symbol[%d] = %f\n", Q, *symbolPtr);
}
// Host driver: looks up the device address of `symbol`, advances it by Q
// elements, stores the resulting address in the device variable `symbolPtr`,
// and then runs mykernel with a single thread.
int main()
{
    float *elemAddr;

    cudaGetSymbolAddress(reinterpret_cast<void **>(&elemAddr), symbol);
    cudaCheckErrors("cudaGetSymbolAddress fail");

    // Pointer arithmetic only; the address is never dereferenced on the host.
    elemAddr = elemAddr + Q;

    // Copy the pointer VALUE (not the pointed-to data) into the device symbol.
    cudaMemcpyToSymbol(symbolPtr, &elemAddr, sizeof(elemAddr));
    cudaCheckErrors("cudaMemcpyToSymbol fail");

    mykernel<<<1, 1>>>();
    cudaDeviceSynchronize();
    cudaCheckErrors("kernel fail");

    return 0;
}

A static approach would be:
// Device array; its element at index 3 is the target of the pointer below.
__device__ float symbol[5];
// Device pointer initialized in its declaration with the address of symbol[3],
// so this variant contains no host-side copy call.
__device__ float * symbolPtr= &symbol[3];

Related

Is there a function in the cublas that can apply the sigmoid function with a vector?

As the title says, I want to apply a function element-wise to a vector. Is there any function in the cuBLAS library that does that?
I am not aware of a suitable CUBLAS function that can assist in the task. However, you can easily write your own code that applies the sigmoid function, or any other single-argument function for that matter, element-wise to a vector. Note that such code would be memory-bound rather than compute-bound in most circumstances. See the CUDA program below for a worked example, in particular sigmoid_kernel(). The output of the program should look something like this:
source[0]= 0.0000000000000000e+000 source[99999]= 9.9999000000000005e-001
result[0]= 5.0000000000000000e-001 result[99999]= 7.3105661250612963e-001
.
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#define DEFAULT_LEN 100000
// Wraps a CUDA runtime call: evaluates it once and, if the returned status is
// not cudaSuccess, prints file, line and the CUDA error string to stderr and
// exits with EXIT_FAILURE.
#define CUDA_SAFE_CALL(call) \
    do { \
        const cudaError_t status = call; \
        if (status != cudaSuccess) { \
            fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n", \
                     __FILE__, __LINE__, cudaGetErrorString(status) ); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)
// Macro to catch CUDA errors in kernel launches.
// Place it directly after a kernel launch. It first reads the error recorded
// at launch time, then blocks until the device is idle so that failures that
// happen while the kernel runs are reported as well. Any error terminates the
// program with EXIT_FAILURE.
// cudaDeviceSynchronize() is used because cudaThreadSynchronize() is deprecated.
#define CHECK_LAUNCH_ERROR() \
    do { \
        /* Check synchronous errors, i.e. pre-launch */ \
        cudaError_t err = cudaGetLastError(); \
        if (cudaSuccess != err) { \
            fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n", \
                     __FILE__, __LINE__, cudaGetErrorString(err) ); \
            exit(EXIT_FAILURE); \
        } \
        /* Check asynchronous errors, i.e. kernel failed (ULF) */ \
        err = cudaDeviceSynchronize(); \
        if (cudaSuccess != err) { \
            fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n", \
                     __FILE__, __LINE__, cudaGetErrorString( err) ); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)
// Logistic function 1 / (1 + e^(-a)), evaluated in double precision.
__device__ __forceinline__ double sigmoid (double a)
{
    const double denom = 1.0 + exp (-a);
    return 1.0 / denom;
}
// Applies sigmoid() element-wise: dst[i] = sigmoid(src[i]) for i in [0, len).
//   src: input values (read only)
//   dst: output values, same element count as src
//   len: number of elements to process
// Each thread starts at its global x index and advances by the total number
// of launched threads (gridDim.x * blockDim.x), so any 1D launch covers all
// elements. Index arithmetic is done in 64 bits so that `i += stride` cannot
// overflow when len is close to INT_MAX.
__global__ void sigmoid_kernel (const double * __restrict__ src,
                                double * __restrict__ dst, int len)
{
    const long long stride = (long long)gridDim.x * blockDim.x;
    const long long first  = (long long)blockDim.x * blockIdx.x + threadIdx.x;
    for (long long i = first; i < len; i += stride) {
        dst[i] = sigmoid (src[i]);
    }
}
// Host driver: builds DEFAULT_LEN doubles (i * 1e-5), runs sigmoid_kernel over
// them on the device and prints the first and last element of both the input
// and the result. Returns EXIT_FAILURE if a host allocation fails; CUDA
// failures terminate the program inside the checking macros.
int main (void)
{
    double *source, *result;
    double *d_a = 0, *d_b = 0;
    int len = DEFAULT_LEN;
    /* Allocate memory on host */
    source = (double *)malloc (len * sizeof (source[0]));
    if (!source) return EXIT_FAILURE;
    result = (double *)malloc (len * sizeof (result[0]));
    if (!result) {
        free (source); /* do not leak the first buffer on this failure path */
        return EXIT_FAILURE;
    }
    /* create source data */
    for (int i = 0; i < len; i++) source [i] = i * 1e-5;
    /* spot check of source data */
    printf ("source[0]=% 23.16e source[%d]=% 23.16e\n",
            source[0], len-1, source[len-1]);
    /* Allocate memory on device */
    CUDA_SAFE_CALL (cudaMalloc((void**)&d_a, sizeof(d_a[0]) * len));
    CUDA_SAFE_CALL (cudaMalloc((void**)&d_b, sizeof(d_b[0]) * len));
    /* Push source data to device */
    CUDA_SAFE_CALL (cudaMemcpy (d_a, source, sizeof(d_a[0]) * len,
                                cudaMemcpyHostToDevice));
    /* Compute execution configuration: one thread per element, with the block
       count capped at 65520; the kernel's strided loop covers the remainder */
    dim3 dimBlock(256);
    int threadBlocks = (len + (dimBlock.x - 1)) / dimBlock.x;
    if (threadBlocks > 65520) threadBlocks = 65520;
    dim3 dimGrid(threadBlocks);
    sigmoid_kernel<<<dimGrid,dimBlock>>>(d_a, d_b, len);
    CHECK_LAUNCH_ERROR();
    /* retrieve results from device */
    CUDA_SAFE_CALL (cudaMemcpy (result, d_b, sizeof (result[0]) * len,
                                cudaMemcpyDeviceToHost));
    /* spot check of results */
    printf ("result[0]=% 23.16e result[%d]=% 23.16e\n",
            result[0], len-1, result[len-1]);
    /* free memory on host and device */
    CUDA_SAFE_CALL (cudaFree(d_a));
    CUDA_SAFE_CALL (cudaFree(d_b));
    free (result);
    free (source);
    return EXIT_SUCCESS;
}

Is it possible to use thrust::device_ptr on a mapped array?

I am trying to use the thrust::copy_if function on mapped memory. However, I get a runtime error that I have not been able to track down, so before spending a lot of time debugging, I would like confirmation that it is actually allowed to pass a pointer to a mapped memory location to the thrust::device_ptr wrapper.
Here is an example of what I mean:
int size=1024;
int* v_locked;
int* v_device;
int* stencil_device;
device_ptr<int> v_wrapper;
device_ptr<int> v_wrapper_end;
device_ptr<int> stencil_wrapper;
cudaHostAlloc((void**)&v_locked, size*sizeof(int), cudaHostAllocMapped));
cudaHostGetDevicePointer(&v_device, &v_locked, 0);
cudaMalloc((void**)&stencil_device, size*sizeof(int));
/*
kernel assigning stencil_device elements ...
*/
v_wrapper = device_pointer_cast(v_device);
stencil_wrapper = device_pointer_cast(stencil_device);
v_wrapper_end = copy_if(make_counting_iterator<int>(0), make_counting_iterator<int>(size), stencil_wrapper, v_wrapper, _1 == 1);
Is this a correct usage of mapped memory with thrust library?
Thank you.
Yes, it is possible.
I believe there were several problems with your code.
You don't appear to be doing any proper CUDA error checking. If you were, you would have detected that although your calls to cudaHostGetDevicePointer seem to compile correctly, they were not set up correctly.
As mentioned above, your calls to cudaHostGetDevicePointer() were not set up correctly. The second pointer argument is passed as a single pointer (*), not a double pointer (**). Refer to the documentation. This call as written would throw a CUDA runtime error, which you can trap.
Prior to your cudaHostAlloc calls, you should use the cudaSetDeviceFlags(cudaDeviceMapHost); call to enable this feature.
Here is a sample code which seems to work correctly for me, and has the above problems fixed:
$ cat t281.cu
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/copy.h>
// Aborts the program if the CUDA runtime has an error recorded.
// The macro reads (and clears) the last error with cudaGetLastError(), so it
// must be placed directly after the runtime call or kernel launch it checks.
//   msg: context string printed in front of the CUDA error description.
// The local is not named with a leading double underscore because such
// identifiers are reserved for the implementation.
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t cuda_check_err_ = cudaGetLastError(); \
        if (cuda_check_err_ != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    (msg), cudaGetErrorString(cuda_check_err_), \
                    __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)
// Unary predicate usable on host and device: true when the argument equals 1.
// operator() is const so the functor can be invoked through a const object or
// const reference.
template<typename T>
struct is_one : thrust::unary_function<T, bool>
{
    __host__ __device__
    bool operator()(const T &x) const
    {
        return (x==1);
    }
};
// Demonstrates thrust::copy_if reading from mapped (zero-copy) host memory.
// Two pinned, mapped host arrays are allocated, filled on the host
// (v = 0..size-1, stencil = i % 2), wrapped as thrust::device_ptr through
// their device-side addresses, and the elements of v whose stencil is 1 are
// copied into a device_vector. Up to the first 10 results are printed.
int main(){
    const int size=1024;
    int* v_locked;
    int* v_device;
    int* stencil_locked;
    int* stencil_device;

    // Must precede the mapped allocations so that host memory can be mapped
    // into the device address space.
    cudaSetDeviceFlags(cudaDeviceMapHost);
    cudaCheckErrors("cudaSetDeviceFlags");

    cudaHostAlloc((void**)&v_locked, size*sizeof(int), cudaHostAllocMapped);
    cudaCheckErrors("cudaHostAlloc 1");
    // Second argument is the host pointer itself (int*), not its address.
    cudaHostGetDevicePointer(&v_device, v_locked, 0);
    cudaCheckErrors("cudaHostGetDevicePointer 1");

    cudaHostAlloc((void**)&stencil_locked, size*sizeof(int), cudaHostAllocMapped);
    cudaCheckErrors("cudaHostAlloc 2");
    cudaHostGetDevicePointer(&stencil_device, stencil_locked, 0);
    cudaCheckErrors("cudaHostGetDevicePointer 2");

    for (int i = 0; i < size; i++){
        v_locked[i] = i;
        stencil_locked[i] = i%2;
    }

    thrust::device_ptr<int> v_wrapper = thrust::device_pointer_cast(v_device);
    thrust::device_ptr<int> stencil_wrapper = thrust::device_pointer_cast(stencil_device);
    thrust::device_ptr<int> v_wrapper_end = v_wrapper + size;

    thrust::device_vector<int> result(size);
    // Qualified explicitly instead of relying on argument-dependent lookup.
    thrust::device_vector<int>::iterator result_end =
        thrust::copy_if(v_wrapper, v_wrapper_end, stencil_wrapper, result.begin(), is_one<int>());
    int result_size = result_end - result.begin();

    thrust::host_vector<int> h_result(result_size);
    thrust::copy_n(result.begin(), result_size, h_result.begin());

    // Never read past the number of elements that were actually selected.
    const int shown = (result_size < 10) ? result_size : 10;
    thrust::copy_n(h_result.begin(), shown, std::ostream_iterator<int>(std::cout, " "));
    std::cout << std::endl;

    // Release the pinned host allocations; the device_ptr wrappers do not own them.
    cudaFreeHost(stencil_locked);
    cudaCheckErrors("cudaFreeHost stencil");
    cudaFreeHost(v_locked);
    cudaCheckErrors("cudaFreeHost v");
    return 0;
}
$ nvcc -arch=sm_20 -o t281 t281.cu
$ ./t281
1 3 5 7 9 11 13 15 17 19
$

Changing from for loop to multithreading in kernel

I'm currently working on interpolation of a grid and am having some problems with multithreading. The code is supposed to read a map represented by a 2D matrix, and then interpolate it to increase the number of points by a factor of 100. When using for loops in the kernel, it works great.
Before interpolation: http://bildr.no/view/OWV1UDRO
After interpolation: http://bildr.no/view/eTlmNmpo
When I tried to replace the for loops with threads, it produced some weird results. Instead of numbers, it filled the resulting matrix with -1.#QNAN.
Here's my working code with for loops in the kernel
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <fstream>
#include "cuda.h"
using namespace std;
float Z[41][41];
// Wraps a CUDA runtime call: evaluates it once and, if the returned status is
// not cudaSuccess, prints file, line and the CUDA error string to stderr and
// exits with EXIT_FAILURE.
#define CUDA_SAFE_CALL(call) \
    do { \
        const cudaError_t status = call; \
        if (status != cudaSuccess) { \
            fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n", \
                     __FILE__, __LINE__, cudaGetErrorString(status) ); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)
// Macro to catch CUDA errors in kernel launches.
// Place it directly after a kernel launch. It first reads the error recorded
// at launch time, then blocks until the device is idle so that failures that
// happen while the kernel runs are reported as well. Any error terminates the
// program with EXIT_FAILURE.
// cudaDeviceSynchronize() is used because cudaThreadSynchronize() is deprecated.
#define CHECK_LAUNCH_ERROR() \
    do { \
        /* Check synchronous errors, i.e. pre-launch */ \
        cudaError_t err = cudaGetLastError(); \
        if (cudaSuccess != err) { \
            fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n", \
                     __FILE__, __LINE__, cudaGetErrorString(err) ); \
            exit(EXIT_FAILURE); \
        } \
        /* Check asynchronous errors, i.e. kernel failed (ULF) */ \
        err = cudaDeviceSynchronize(); \
        if (cudaSuccess != err) { \
            fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n", \
                     __FILE__, __LINE__, cudaGetErrorString( err) ); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)
// 2D float texture reference sampled by the kernel. The host sets its filter
// mode and binds it to the map data before launching.
texture<float, 2, cudaReadModeElementType> tex;

// Samples `tex` on a grid that is k = sqrt(numberOfInterpolationsPerSquare)
// times finer than the map along each axis and stores the samples in f.
//   m, n: map extent along the texture x and y coordinate, respectively
//   f:    output buffer holding (n*k) rows of (m*k) floats, row-major
//   numberOfInterpolationsPerSquare: samples per map cell (k*k)
// Every launched thread performs the complete double loop, so this version is
// intended for a <<<1,1>>> launch.
// Loop counters and the flat index are integers: float counters stop being
// exact above 2^24, which would corrupt indexing for large outputs. The
// texture coordinates are still computed in float, as before.
__global__ void kernel (int m, int n, float *f, float numberOfInterpolationsPerSquare)
{
    const int k = (int)sqrtf(numberOfInterpolationsPerSquare);
    const int rows = n * k;
    const int cols = m * k;
    for (int i = 0; i < rows; i++)
    {
        for (int j = 0; j < cols; j++)
        {
            // +0.5f addresses the texel centre in unnormalized coordinates.
            f[j + cols * i] = tex2D (tex, (float)j / k + 0.5f, (float)i / k + 0.5f);
        }
    }
}
// Host driver for the texture-based interpolation: reads an n x m map from
// "Map.txt", uploads it to pitched device memory, binds it to `tex` with linear
// filtering, runs the kernel and writes the interpolated grid to
// "Bilinear.txt", one output row per line. Returns EXIT_FAILURE if a file
// cannot be opened, a host allocation fails or the texture offset is non-zero;
// CUDA failures terminate the program inside the checking macros.
int main (void)
{
    // Start timer
    clock_t tStart = clock();
    // Size of map
    int n=41;
    int m=41;
    float numberOfInterpolationsPerSquare = 100;
    // k samples per map cell along each axis; the kernel writes n*k rows of
    // m*k values. The count is kept as an integer rather than a float.
    const int k = (int)sqrt(numberOfInterpolationsPerSquare);
    const int rowLength = m * k;
    const size_t numberOfElements = (size_t)(n * k) * rowLength;
    size_t pitch, tex_ofs;
    float *f;
    float *r;
    float *map_d = 0;
    // Build read-Streams
    ifstream map;
    //Create and open a txt file for MATLAB
    ofstream file;
    // Open data
    map.open("Map.txt", ios_base::in);
    if (!map.is_open()) {
        fprintf (stderr, "Could not open Map.txt for reading.\n");
        return EXIT_FAILURE;
    }
    file.open("Bilinear.txt");
    if (!file.is_open()) {
        fprintf (stderr, "Could not open Bilinear.txt for writing.\n");
        return EXIT_FAILURE;
    }
    // Store the map in a 2D array
    for (int i=0; i<n; i++)
    {
        for (int j=0; j<m; j++)
        {
            map >> Z[i][j];
        }
    }
    // Allocate memory on host and device
    CUDA_SAFE_CALL(cudaMallocPitch((void**)&map_d,&pitch,n*sizeof(*map_d),m));
    CUDA_SAFE_CALL(cudaMalloc((void**)&f, numberOfElements*sizeof(float)));
    r = (float*)malloc(numberOfElements*sizeof(float));
    if (!r) {
        fprintf (stderr, "Host allocation of the result buffer failed.\n");
        return EXIT_FAILURE;
    }
    // Copy map from host to device
    CUDA_SAFE_CALL(cudaMemcpy2D(map_d, pitch, Z, n*sizeof(Z[0][0]), n*sizeof(Z[0][0]),m,cudaMemcpyHostToDevice));
    // Set texture mode to bilinear interpolation
    tex.normalized = false;
    tex.filterMode = cudaFilterModeLinear;
    // Bind the map to texture
    CUDA_SAFE_CALL (cudaBindTexture2D (&tex_ofs, &tex, map_d, &tex.channelDesc, n, m, pitch));
    // Checking for offset
    if (tex_ofs !=0) {
        printf ("tex_ofs = %zu\n", tex_ofs);
        return EXIT_FAILURE;
    }
    // Launch Kernel
    kernel <<< 1,1 >>> (m, n, f, numberOfInterpolationsPerSquare);
    CHECK_LAUNCH_ERROR();
    CUDA_SAFE_CALL (cudaDeviceSynchronize());
    // Copy result from device to host (checked, so a failed copy cannot
    // silently produce garbage output)
    CUDA_SAFE_CALL (cudaMemcpy(r, f, numberOfElements*sizeof(float), cudaMemcpyDeviceToHost));
    // Write results to file: a line break before every row except the first
    for (size_t h = 0; h < numberOfElements; h++)
    {
        if (h != 0 && h % rowLength == 0)
        {
            file << endl;
        }
        file << r[h] << " ";
    }
    // Free memory
    CUDA_SAFE_CALL (cudaUnbindTexture (tex));
    CUDA_SAFE_CALL (cudaFree (map_d));
    CUDA_SAFE_CALL (cudaFree (f));
    free( r );
    // Print out execution time
    printf("Time taken: %.3fs\n", (double)(clock() - tStart)/CLOCKS_PER_SEC);
    return EXIT_SUCCESS;
}
Here's the kernel with multithreading, which doesn't work
// Thread-per-sample variant from the question; this is the version reported
// to fill f with -1.#QNAN. Each thread writes at most one element of f.
__global__ void kernel (int m, int n, float *f, float numberOfInterpolationsPerSquare)
{
int k = sqrt(numberOfInterpolationsPerSquare);
// i is derived from the x components and j from the y components of the
// block/thread indices, so the launch must span both dimensions for j to
// take values other than 0.
int i= blockIdx.x * blockDim.x + threadIdx.x;
int j= blockIdx.y * blockDim.y + threadIdx.y;
// Threads that fall outside the (n*k) x (m*k) output grid do nothing.
if(i>=n*k || j>=m*k)
return;
// NOTE: i, j and k are all int here, so j/k and i/k are integer divisions
// performed before 0.5f is added.
f[(int)(j+(m*k*i))] = tex2D (tex, j/k+0.5f, i/k+0.5f);
}
Does anyone know why the multithread version doesn't work?
Regards
Sondre
In the second kernel, i and j are int instead of float. So j/k and i/k in tex2D will result in integer division. Declare k as float to avoid integer division.
Initially, the kernel was launched with the following configuration:
//Find number of blocks
// NOTE: the block count is derived by dividing by nthreads (1024), while the
// launch below uses blocksize (512) threads per block.
int nthreads = 1024;
int blocksize = 512;
int nblocks = ceil( (n*m*numberOfInterpolationsPerSquare) / nthreads);
// Launch Kernel
// Both launch parameters are plain ints, i.e. only the x dimension of the
// grid and of the block is set.
kernel <<< nblocks,blocksize >>> (m, n, f, numberOfInterpolationsPerSquare);
The problem with the above code is that it would launch a 1D grid of 1D blocks, but inside the kernel, 2D indexing is used. A 2D grid/block configuration is required for the kernel to work correctly. From the looks of the kernel code, following grid/block configuration should work:
// Samples per map cell along one axis.
float k = sqrt(numberOfInterpolationsPerSquare);
// Number of threads needed along x (n*k) and along y (m*k).
const int threads_x = (int)ceil(n * k);
const int threads_y = (int)ceil(m * k);
// 16x16 threads per block.
const dim3 dimBlock(16,16);
dim3 dimGrid;
// Round up so that the grid covers every required thread in each dimension.
dimGrid.x = (threads_x + dimBlock.x - 1)/dimBlock.x;
dimGrid.y = (threads_y + dimBlock.y - 1)/dimBlock.y;
kernel<<<dimGrid,dimBlock>>>(m, n, f, numberOfInterpolationsPerSquare);

copy data which is allocated in device from device to host

I have a pointer to memory that is dynamically allocated on the device. How can I copy that data from the device to the host?
#include <stdio.h>
// Evaluates a CUDA runtime call once; if it does not return cudaSuccess,
// prints file, line and the CUDA error string to stderr and exits with
// EXIT_FAILURE. Wrapped in do { } while (0) so that `cudaSafeCall(x);`
// behaves as a single statement, including inside if/else without braces.
#define cudaSafeCall(call) \
    do { \
        cudaError_t err = call; \
        if(cudaSuccess != err){ \
            fprintf(stderr, "%s(%i) : %s.\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)
// Reads (and clears) the last recorded CUDA error; if there is one, prints
// file, line, the caller's message, the numeric error code and the CUDA error
// string to stderr and exits with EXIT_FAILURE.
//   errorMessage: context string to print (must be a valid C string)
// The stray characters that followed the line continuation after exit() have
// been removed; they broke the macro definition.
#define cudaCheckErr(errorMessage) \
    do { \
        cudaError_t err = cudaGetLastError(); \
        if(cudaSuccess != err){ \
            fprintf(stderr, "%s(%i) : %s : (code %d) %s.\n", __FILE__, __LINE__, errorMessage, (int)err, cudaGetErrorString(err)); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)
// Record used on both host and device in this example.
struct num{
// Pointer to an int buffer; the kernel assigns it from an in-kernel malloc().
int *a;
// The kernel stores the thread index here.
int b;
};
// Device-side pointer to an array of struct num; the host sets it with
// cudaMemcpyToSymbol.
__device__ struct num *gun;
// Host driver from the question. It allocates 10 struct num on the device,
// publishes the address through `gun`, runs the kernel, and then tries to
// copy the structs and their per-element buffers back to the host.
int main()
{
int i;
// NOTE(review): c is passed to cudaCheckErr below as the message but is never
// written in this function.
char c[100];
// dun receives a device address from cudaMalloc; cun is the host-side copy.
struct num *dun,*cun;
cudaSafeCall(cudaSetDevice(1));
cun=(struct num*)malloc(10*sizeof(struct num));
cudaSafeCall(cudaMalloc(&dun,10*sizeof(struct num)));
// Copies the pointer value held in dun into the device variable gun.
cudaSafeCall(cudaMemcpyToSymbol(gun,&dun,sizeof(struct num*)));
__global__ void kernel();
kernel<<<1,10>>>();
cudaSafeCall(cudaDeviceSynchronize());
cudaCheckErr(c);
cudaSafeCall(cudaMemcpyFromSymbol(&dun,gun,sizeof(struct num*)));
// After this copy, cun[i].a holds the pointer values the kernel stored.
cudaSafeCall(cudaMemcpy(cun,dun,10*sizeof(struct num),cudaMemcpyDeviceToHost));
// NOTE(review): `csu` is not declared anywhere in this function.
for(i=0;i<10;i++) cudaSafeCall(cudaMalloc(&csu[i].a,10*sizeof(int)));
cudaSafeCall(cudaGetSymbolAddress((void**)csu[0].a,(void**)gun[0].a));
// Overwrites the pointer values copied from the device with new host buffers.
for(i=0;i<10;i++) cun[i].a=(int*)malloc(10*sizeof(int));
// NOTE: dun[i].a reads through dun in host code, although dun was assigned by
// cudaMalloc; this is the statement the question reports as faulting.
for(i=0;i<10;i++) cudaSafeCall(cudaMemcpy(cun[i].a,dun[i].a,10*sizeof(int),cudaMemcpyDeviceToHost));
printf("%d ",cun[8].b);
printf("%d ",cun[8].a[8]);
cudaSafeCall(cudaFree(dun));
free(cun);
}
// For the calling thread's slot gun[threadIdx.x]: stores the thread index in
// .b, allocates a 10-int buffer with in-kernel malloc(), stores its address in
// .a and fills it with tid + i. Uses only threadIdx.x, so it expects a single
// block with no more threads than gun has elements.
// If the allocation fails, .a is left NULL and the buffer is not written,
// instead of writing through a null pointer.
__global__ void kernel()
{
    const int tid = threadIdx.x;
    gun[tid].b = tid;
    int *buf = (int*)malloc(10*sizeof(int)); /*this is dynamically allocated in device.*/
    gun[tid].a = buf;
    if (buf == NULL)
        return;
    for (int i = 0; i < 10; i++)
        buf[i] = tid + i;
}
In this program, it always comes to a "segmentation fault" in
cudaSafeCall(cudaMemcpy(cun[i].a,dun[i].a,10*sizeof(int),cudaMemcpyDeviceToHost))
Why? And what can I do to copy this data from device to host?
The problem you have is that you are trying to use device pointer indirection in host code, which is illegal. In your example
cudaMemcpy(cun[i].a,dun[i].a,10*sizeof(int),cudaMemcpyDeviceToHost)
dun contains a device pointer, so dun[i].a implies indirection of dun[i] to read the value of a. That is not a valid host memory address and so a seg fault results. You have actually already copied the pointers to the heap memory your kernel allocated when you do this:
cudaMemcpy(cun,dun,10*sizeof(struct num),cudaMemcpyDeviceToHost);
so following that code with
// cun[] already holds the pointer values the kernel stored in .a, because the
// structs were copied back before this point. Save each value, replace it with
// a host buffer, then copy using the saved value as the source address, so no
// device pointer is dereferenced on the host.
// NOTE(review): the CUDA C++ Programming Guide states that memory obtained from
// in-kernel malloc() cannot be used in runtime API calls such as cudaMemcpy;
// verify on your toolkit. If the copy fails, have a kernel copy the data into
// a cudaMalloc'd buffer first.
int ** a_h = (int **)malloc(10 * sizeof(int *)); // to hold heap pointers
for(i=0;i<10;i++) {
    a_h[i] = cun[i].a; // save heap pointer
    cun[i].a=(int*)malloc(10*sizeof(int));
    cudaMemcpy(cun[i].a,a_h[i],10*sizeof(int),cudaMemcpyDeviceToHost); // copy heap to host
}
should safely copy the heap memory you allocated back to the host.

how to change the __device__ variable in global and device functions in CUDA?

The procedure is described as follows:
#include <cuda.h>
#include <cutil_math>
#include <cuda_runtime.h>
#include <iostream>
// Small record with one float, one int and one char member.
struct testtype
{
float x;
int y;
char z;
};
// Device-side pointer to a testtype, dereferenced by the kernel below.
__device__ testtype* gpu_config;
// Writes 3.0 to the int member y of the object gpu_config points at.
__global__
void test()
{
gpu_config->y = 3.0;
};
// Host driver from the question: fills a testtype on the host, copies it to
// the device, launches test() and prints the y member read back.
int main(void)
{
testtype cpu_config;
cpu_config.x = 1;
cpu_config.y = 2.0f;
cpu_config.z = 'A';
testtype val ;
// NOTE(review): gpu_config is declared __device__, yet its address is taken
// here in host code and handed to cudaMalloc, and the name is then used as an
// ordinary host pointer in the cudaMemcpy calls below. Presumably the device
// copy of the variable that test() dereferences is never set; confirm.
if (cudaMalloc((void**) &gpu_config, sizeof(testtype)) != cudaSuccess)
{
return -1;
}
cudaMemcpy(gpu_config, &cpu_config, sizeof(testtype), cudaMemcpyHostToDevice);
// The return values of the copies and of the launch are not checked.
test<<<1,1,0>>>();
cudaMemcpy(&val, gpu_config, sizeof(testtype), cudaMemcpyDeviceToHost);
std::cout << val.y << std::endl;
}
When I delete test<<<1,1,0>>>();, val ends up equal to the data that was copied into gpu_config. But when test<<<1,1,0>>>(); is present, val.y is not equal to 3.0, which means that the global function test did not change the value. I want to know how to change the value of a __device__ variable from a __global__ function.
#include <stdio.h>
#include <cuda.h>
#include <cutil_math.h>
#include <cuda_runtime.h>
// check runtime call error
// Evaluates the call once; if it does not return cudaSuccess, prints file,
// line and the CUDA error string to stderr and exits with EXIT_FAILURE.
// Wrapped in do { } while (0) so that `cudaSafeCall(x);` behaves as a single
// statement, including inside if/else without braces.
#define cudaSafeCall(call) \
    do { \
        cudaError_t err = call; \
        if(cudaSuccess != err){ \
            fprintf(stderr, "%s(%i) : %s.\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)
// check kernel launch error
// Reads (and clears) the last recorded CUDA error; if there is one, prints
// file, line, the caller's message and the CUDA error string to stderr and
// exits with EXIT_FAILURE.
//   errorMessage: context string to print
// Wrapped in do { } while (0) so that `cudaCheckErr(msg);` behaves as a
// single statement.
#define cudaCheckErr(errorMessage) \
    do { \
        cudaError_t err = cudaGetLastError(); \
        if(cudaSuccess != err){ \
            fprintf(stderr, "%s(%i) : %s : %s.\n", __FILE__, __LINE__, errorMessage, cudaGetErrorString(err)); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)
// Record with a single int member.
struct g{
    int m;
};

// Device-side pointer to an array of struct g; the host sets it with
// cudaMemcpyToSymbol before launching the kernel.
__device__ struct g *d; // device (global)

// Each thread stores 10 in the m member of the element of d selected by its
// global x index.
__global__ void kernel()
{
    const int globalIdx = blockIdx.x * blockDim.x + threadIdx.x;
    struct g *slot = d + globalIdx;
    slot->m = 10;
}
// Host driver: allocates one struct g on the device, stores its address in
// the device variable d, runs the kernel on one thread, copies the struct
// back and prints its m member. The host and device buffers are released
// before returning.
int main()
{
    size_t size = 1 * sizeof(struct g);
    size_t sizep = 1 * sizeof(struct g*);
    struct g *ld; // device (local)
    cudaSafeCall(cudaMalloc(&ld, size));
    // Copy the pointer VALUE held in ld into the device variable d.
    cudaSafeCall(cudaMemcpyToSymbol(d,&ld,sizep));
    kernel<<<1,1>>>();
    cudaSafeCall(cudaDeviceSynchronize());
    cudaCheckErr("kernel error");
    struct g *h = (struct g*)malloc(size);
    if(h==NULL){
        fprintf(stderr, "%s(%i) : malloc error.\n", __FILE__, __LINE__);
        exit(EXIT_FAILURE);
    }
    //cudaSafeCall(cudaMemcpyFromSymbol(&ld,d,sizep)); // not necessary
    // ld still holds the device address, so it can be used directly as the source.
    cudaSafeCall(cudaMemcpy(h, ld, size, cudaMemcpyDeviceToHost));
    printf("Result: %d\n",h[0].m);
    // Release host and device memory.
    free(h);
    cudaSafeCall(cudaFree(ld));
    return 0;
}