I have seen a related question about how to copy from host memory to constant memory on the GPU using cudaMemcpyToSymbol.
My question is how to do the reverse: copy from device constant memory back to the host using cudaMemcpyFromSymbol.
In the following minimal reproducible example, I get either
1) an invalid device symbol error when using cudaMemcpyFromSymbol(const_d_a, b, size);, or
2) a segmentation fault when using cudaMemcpyFromSymbol(&b, const_d_a, size, cudaMemcpyDeviceToHost).
I have consulted the manual, which suggests I code as in 1), and this SO question, which suggests I code as in 2). Neither works here.
Could anyone kindly suggest a workaround? I must be misunderstanding something... Thanks!
Here is the code:
// a basic CUDA function to test working with device constant memory
#include <stdio.h>
#include <cuda.h>
const unsigned int N = 10; // size of vectors
__constant__ float const_d_a[N * sizeof(float)];
int main()
{
    float *a, *b; // a and b are vectors
    a = (float *)calloc(N, sizeof(float));
    b = (float *)calloc(N, sizeof(float));

    /**************************** Exp 1: sequential ***************************/
    int i;
    int size = N * sizeof(float);
    for (i = 0; i < N; i++){
        a[i] = (float)i / 0.23 + 1;
    }

    // 1. copy a to constant memory
    cudaError_t err = cudaMemcpyToSymbol(const_d_a, a, size);
    if (err != cudaSuccess){
        printf("%s in %s at line %d\n", cudaGetErrorString(err), __FILE__, __LINE__);
        exit(EXIT_FAILURE);
    }

    cudaError_t err2 = cudaMemcpyFromSymbol(const_d_a, b, size);
    if (err2 != cudaSuccess){
        printf("%s in %s at line %d\n", cudaGetErrorString(err2), __FILE__, __LINE__);
        exit(EXIT_FAILURE);
    }

    double checksum0 = 0.0, checksum1 = 0.0;
    for (i = 0; i < N; i++){
        checksum0 += a[i];
        checksum1 += b[i];
    }
    printf("Checksum for elements in host memory is %f.\n", checksum0);
    printf("Checksum for elements in constant memory is %f.\n", checksum1);
    return 0;
}
In CUDA, the various cudaMemcpy* operations are modeled after the C standard library memcpy routine. In that function, the first pointer is always the destination pointer and the second pointer is always the source pointer. That is true for all cudaMemcpy* functions as well.
Therefore, if you want to do cudaMemcpyToSymbol, the symbol had better be the first (destination) argument passed to the function (the second argument would be a host pointer). If you want to do cudaMemcpyFromSymbol, the symbol needs to be the second argument (the source position), and the host pointer is the first argument. That's not what you have here:
cudaError_t err2 = cudaMemcpyFromSymbol(const_d_a, b, size);
                                        ^          ^
                                        |          This should be the symbol.
                                        |
                                        This is supposed to be the host destination pointer.
You can discover this with a review of the API documentation.
If we reverse the order of those two arguments in that line of code:
cudaError_t err2 = cudaMemcpyFromSymbol(b, const_d_a, size);
Your code will run with no errors and the final results printed will match.
There is no need to use an ampersand with either of the a or b pointers in these functions. a and b are already pointers. In the example you linked, pi_gpu_h is not a pointer. It is an ordinary variable. To copy something to it using cudaMemcpyFromSymbol, it is necessary to take the address of that ordinary variable, because the function expects a (destination) pointer.
As an aside, this doesn't look right:
__constant__ float const_d_a[N * sizeof(float)];
This is effectively a static array declaration, and apart from the __constant__ decorator it should be done equivalently to how you would do it in C or C++. It's not necessary to multiply N by sizeof(float) here, if you want storage for N float quantities. Just N by itself will do that:
__constant__ float const_d_a[N];
However, leaving that as-is does not create problems for the code you have posted.
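For reference, here is a minimal self-contained sketch of the corrected round trip. It simply applies the argument-order fix discussed above to a reduced version of the posted code (error checking omitted for brevity):

// sketch: host -> constant memory -> host round trip with the correct argument order
#include <stdio.h>
#include <cuda_runtime.h>

const unsigned int N = 10;
__constant__ float const_d_a[N];

int main()
{
    float a[N], b[N];
    for (unsigned int i = 0; i < N; i++) a[i] = (float)i;

    size_t size = N * sizeof(float);
    // host -> constant memory: the symbol is the first (destination) argument
    cudaMemcpyToSymbol(const_d_a, a, size);
    // constant memory -> host: the symbol is the second (source) argument
    cudaMemcpyFromSymbol(b, const_d_a, size);

    for (unsigned int i = 0; i < N; i++)
        printf("%f %f\n", a[i], b[i]);
    return 0;
}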
I am using CUDA 5.5 compute 3.5 on GTX 1080Ti and want to compute this formula:
y = a * a * b / 64 + c * c
Suppose I have these parameters:
a = 5876
b = 0.4474222958088
c = 664
I am computing this both via GPU and on the CPU and they give me different inexact answers:
h_data[0] = 6.822759375000e+05,
h_ref[0] = 6.822760000000e+05,
difference = -6.250000000000e-02
h_data is the CUDA answer, h_ref is the CPU answer. When I plug these into my calculator, the GPU answer is closer to the exact answer, and I suspect this has to do with floating-point precision. My question now is: how can I get the CUDA solution to match the precision/rounding of the CPU version? If I offset the a parameter by +/-1 the solutions match, but if I offset, say, the c parameter I still get a difference of 1/16.
Here's the working code:
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
__global__ void test_func(float a, float b, int c, int nz, float * __restrict__ d_out)
{
    float *fdes_out = d_out + blockIdx.x * nz;
    float roffout2 = a * a / 64.f;
    //float tmp = fma(roffout2,vel,index*index);
    for (int tid = threadIdx.x; tid < nz; tid += blockDim.x) {
        fdes_out[tid] = roffout2 * b + c * c;
    }
}

int main (int argc, char **argv)
{
    // parameters
    float a = 5876.0f, b = 0.4474222958088f;
    int c = 664;
    int nz = 1;
    float *d_data, *h_data, *h_ref;
    h_data = (float*)malloc(nz*sizeof(float));
    h_ref = (float*)malloc(nz*sizeof(float));
    // CUDA
    cudaMalloc((void**)&d_data, sizeof(float)*nz);
    dim3 nb(1,1,1); dim3 nt(64,1,1);
    test_func <<<nb,nt>>> (a,b,c,nz,d_data);
    cudaMemcpy(h_data, d_data, sizeof(float)*nz, cudaMemcpyDeviceToHost);
    // Reference
    float roffout2 = a * a / 64.f;
    h_ref[0] = roffout2*b + c*c;
    // Compare
    printf("h_data[0] = %1.12e,\nh_ref[0] = %1.12e,\ndifference = %1.12e\n",
           h_data[0],h_ref[0],h_data[0]-h_ref[0]);
    // Free
    free(h_data); free(h_ref);
    cudaFree(d_data);
    return 0;
}
I'm compiling only with the -O3 flag.
This small numerical difference of one single-precision ulp occurs because the CUDA compiler applies FMA-merging by default, whereas the host compiler does not do that. FMA-merging can be turned off by adding the command line flag -fmad=false to the invocation of the CUDA compiler driver nvcc.
FMA-merging is a compiler optimization in which an FMUL and a dependent FADD are transformed into a single fused multiply-add, or FMA, instruction. An FMA instruction computes a*b+c such that the full unrounded product a*b enters into the addition with c before a final rounding is applied to produce the final result.
Usually, this has performance advantages, since a single FMA instruction is executed instead of the two instructions FMUL and FADD, and all of these instructions have similar latency. Usually, this also has accuracy advantages, as the use of FMA eliminates one rounding step and guards against subtractive cancellation when a*b and c have opposite signs.
In this case, as noted by OP, the GPU result computed with FMA is slightly more accurate than the host result computed without FMA. Using a higher precision reference, I find that the relative error in the GPU result is -4.21e-8, while the relative error in the host result is 4.95e-8.
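As an aside, the answer's -fmad=false suggestion makes the GPU match the host. Going the other direction (my own illustration, not part of the answer above), you can reproduce the GPU's contracted arithmetic on the host with fmaf from <math.h>, which performs the multiply-add with a single rounding, assuming the host's fmaf maps to a hardware FMA instruction:

// sketch: compare the two-rounding and single-rounding host results
#include <stdio.h>
#include <math.h>

int main(void)
{
    float a = 5876.0f, b = 0.4474222958088f;
    int   c = 664;
    float roffout2 = a * a / 64.f;
    // FMUL then FADD, two roundings (assuming the host compiler does not contract this itself)
    float separate = roffout2 * b + c * c;
    // single rounding of roffout2*b + c*c, mirroring the device FMA
    float fused = fmaf(roffout2, b, (float)(c * c));
    printf("separate = %1.12e\nfused    = %1.12e\n", separate, fused);
    return 0;
}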
I am refreshing my memory of CUDA, especially unified memory (my last real CUDA development was 3 years ago); I am a bit rusty.
The problem:
I am creating a task from a container that uses unified memory. However, I get a crash. After a few days of investigation I can say where the crash happens (the copy constructor), but not why, since all the pointers are allocated correctly. I do not think I am contradicting the NVIDIA post about C++ and unified memory (https://devblogs.nvidia.com/parallelforall/unified-memory-in-cuda-6/).
#include <cuda.h>
#include <cstdio>
template<class T>
struct container{
    container(int size = 1){ cudaMallocManaged(&p, size*sizeof(T)); }
    ~container(){ cudaFree(p); }
    __device__ __host__ T& operator[](int i){ return p[i]; }
    T * p;
};

struct task{
    int* a;
};

__global__ void kernel_gpu(task& t, container<task>& v){
    printf(" gpu value task %i, should be 2 \n", *(t.a)); // this works
    task tmp(v[0]); // BUG
    printf(" gpu value task from vector %i, should be 1 \n", *(tmp.a));
}

void kernel_cpu(task& t, container<task>& v){
    printf(" cpu value task %i, should be 2 \n", *(t.a)); // this works
    task tmp(v[0]);
    printf(" cpu value task from vector %i, should be 1 \n", *(tmp.a));
}

int main(int argc, const char * argv[]) {
    int* p1;
    int* p2;
    cudaMallocManaged(&p1, sizeof(int));
    cudaMallocManaged(&p2, sizeof(int));
    *p1 = 1;
    *p2 = 2;

    task t1, t2;
    t1.a = p1;
    t2.a = p2;

    container<task> c(2);
    c[0] = t1;
    c[1] = t2;

    // gpu does not work
    kernel_gpu<<<1,1>>>(c[1], c);
    cudaDeviceSynchronize();

    // cpu should work, no concurrent access
    kernel_cpu(c[1], c);

    printf("job done !\n");
    cudaFree(p1);
    cudaFree(p2);
    return 0;
}
Objectively, I can pass an object as an argument when its memory has been allocated properly. However, it looks like it is not possible to use a second level of indirection (here the container).
I am making a conceptual mistake, but I do not see where.
Best,
Timocafe
my machine: CUDA 7.5, gcc 4.8.2, Tesla K20m
Although the memory it points to was allocated as unified memory, the container itself is declared in host code and allocated in host memory: container<task> c(2);. You cannot pass it by reference to device code, and de-referencing it in a kernel will very likely result in an illegal memory access.
You may want to use cuda-memcheck to identify such issues.
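One possible workaround, sketched along the lines of the linked NVIDIA blog post (the names and structure here are illustrative, not taken from the question): allocate the container object itself in managed memory by overriding operator new/delete, so that the reference the kernel receives is also valid on the device.

// sketch: put the container object itself in managed memory
class Managed {
public:
    void *operator new(size_t len) {
        void *ptr;
        cudaMallocManaged(&ptr, len);
        cudaDeviceSynchronize();
        return ptr;
    }
    void operator delete(void *ptr) {
        cudaDeviceSynchronize();
        cudaFree(ptr);
    }
};

template<class T>
struct container : public Managed {
    container(int size = 1){ cudaMallocManaged(&p, size*sizeof(T)); }
    ~container(){ cudaFree(p); }
    __device__ __host__ T& operator[](int i){ return p[i]; }
    T *p;
};

// usage with the question's task/kernel_gpu: the object now lives in managed
// memory, so the references passed to the kernel resolve to device-visible memory
container<task> *c = new container<task>(2);
// ... fill (*c)[0] and (*c)[1] as before ...
kernel_gpu<<<1,1>>>((*c)[1], *c);
cudaDeviceSynchronize();
delete c;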
Recently I started working with CUDA and I read an introductory book on the computing language. To see if I understood it well, I considered the following problem: minimize a function f(x,y) on the grid [-1,1] X [-1,1]. This raised a few practical questions and I would like to have your view on things.
Do I explicitly calculate the grid? If I create the grid on the CPU, then I'll have to transfer the information to the GPU. I can then use a 2D block layout and access data efficiently using texture memory. Is it then best to use square blocks or perhaps blocks of different shapes?
Suppose I don't explicitly make a grid. I could discretise the X and Y directions with constant float arrays (which provide fast memory access) and then use a 1D list of blocks.
Thanks!
This was an interesting question for me because it represents a type of problem that I think is rare:
potentially high compute load
little to no data that needs to be communicated host->device
very low volume of results that need to be communicated device->host
In other words, pretty much all compute, with not much dependence on data transfer, or even global memory usage/bandwidth.
Having said that, the question seems to be looking for a brute-force search approach to functional optimization/minimization, which is not an efficient technique for functions that are amenable to other optimization methods. But as a learning exercise, it's interesting (to me, anyway). It may also be useful for functions that are otherwise difficult to handle such as functions with discontinuities or other irregularities.
To answer your questions:
Do I explicitly calculate the grid? If I create the grid on the CPU, then I'll have to transfer the information to the GPU. I can then use a 2D block layout and access data efficiently using texture memory. Is it then best to use square blocks or perhaps blocks of different shapes?
I wouldn't bother calculating the grid on the CPU. (I assume by "grid" you mean the functional value of f at each point on the grid.) First of all, this is a fairly computationally intensive task - which GPUs are good at, and secondly, it is potentially a large data set, so transferring it to the GPU (so the GPU can then do the search) will take time. I propose to let the GPU do this (compute the functional value at each grid point.) Since we won't be using global access to data for this, texture memory is not an issue.
Suppose I don't explicitly make a grid. I could discretise the X and Y directions with constant float arrays (which provide fast memory access) and then use a 1D list of blocks.
Yes, you could use a 1D array of blocks (list) or a 2D array. I don't think this significantly impacts the problem either way, and I think the 2D grid approach fits the problem better (and I think allows for slightly cleaner code) so I would suggest starting with a 2D array of blocks.
Here's a sample code that might be interesting to play with or crystallize ideas. Each thread has the responsibility to compute its respective value of x and y, and then the functional value f at that point. Then a reduction followed by a block-draining reduction is used to search over all computed values for the minimum value (in this case).
$ cat t811.cu
#include <stdio.h>
#include <math.h>
#include <assert.h>
// grid dimensions and divisions
#define XNR -1.0f
#define XPR 1.0f
#define YNR -1.0f
#define YPR 1.0f
#define DX 0.0001f
#define DY 0.0001f
// threadblock dimensions - product must be a power of 2
#define BLK_X 16
#define BLK_Y 16
// optimization functions - these are currently set for minimization
#define TST(X1,X2) ((X1)>(X2))
#define OPT(X1,X2) (X2)
// error check macro
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)
// for timing
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
long long dtime_usec(unsigned long long start){
    timeval tv;
    gettimeofday(&tv, 0);
    return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
// the function f that will be "optimized"
__host__ __device__ float f(float x, float y){
    return (x+0.5)*(x+0.5) + (y+0.5)*(y+0.5) +0.1f;
}
// variable for block-draining reduction block counter
__device__ int blkcnt = 0;
// GPU optimization kernel
__global__ void opt_kernel(float * __restrict__ bf, float * __restrict__ bx, float * __restrict__ by, const float scx, const float scy){

    __shared__ float sh_f[BLK_X*BLK_Y];
    __shared__ float sh_x[BLK_X*BLK_Y];
    __shared__ float sh_y[BLK_X*BLK_Y];
    __shared__ int lblock;

    // compute x,y coordinates for this thread
    float x = ((threadIdx.x+blockDim.x*blockIdx.x) * (XPR-XNR))*scx + XNR;
    float y = ((threadIdx.y+blockDim.y*blockIdx.y) * (YPR-YNR))*scy + YNR;

    int thid = (threadIdx.y*BLK_X)+threadIdx.x;
    lblock = 0;
    sh_x[thid] = x;
    sh_y[thid] = y;
    sh_f[thid] = f(x,y); // compute functional value of f(x,y)
    __syncthreads();

    // perform block-level shared memory reduction
    // assume block size is a power of 2
    for (int i = (blockDim.x*blockDim.y)>>1; i > 16; i>>=1){
        if (thid < i)
            if (TST(sh_f[thid],sh_f[thid+i])){
                sh_f[thid] = OPT(sh_f[thid],sh_f[thid+i]);
                sh_x[thid] = OPT(sh_x[thid],sh_x[thid+i]);
                sh_y[thid] = OPT(sh_y[thid],sh_y[thid+i]);}
        __syncthreads();}

    volatile float *vf = sh_f;
    volatile float *vx = sh_x;
    volatile float *vy = sh_y;
    for (int i = 16; i > 0; i>>=1)
        if (thid < i)
            if (TST(vf[thid],vf[thid+i])){
                vf[thid] = OPT(vf[thid],vf[thid+i]);
                vx[thid] = OPT(vx[thid],vx[thid+i]);
                vy[thid] = OPT(vy[thid],vy[thid+i]);}

    // save block reduction result, and check if last block
    if (!thid){
        bf[blockIdx.y*gridDim.x+blockIdx.x] = sh_f[0];
        bx[blockIdx.y*gridDim.x+blockIdx.x] = sh_x[0];
        by[blockIdx.y*gridDim.x+blockIdx.x] = sh_y[0];
        int myblock = atomicAdd(&blkcnt, 1);
        if (myblock == (gridDim.x*gridDim.y-1)) lblock = 1;}
    __syncthreads();

    if (lblock){
        // do last-block reduction
        float my_x, my_y, my_f;
        int myid = thid;
        if (myid < gridDim.x * gridDim.y){
            my_x = bx[myid];
            my_y = by[myid];
            my_f = bf[myid];}
        else { assert(0);} // does not work correctly if block dims are greater than grid dims
        myid += blockDim.x*blockDim.y;
        while (myid < gridDim.x*gridDim.y){
            if TST(my_f,bf[myid]){
                my_x = OPT(my_x,bx[myid]);
                my_y = OPT(my_y,by[myid]);
                my_f = OPT(my_f,bf[myid]);}
            myid += blockDim.x*blockDim.y;}
        sh_f[thid] = my_f;
        sh_x[thid] = my_x;
        sh_y[thid] = my_y;
        __syncthreads();
        for (int i = (blockDim.x*blockDim.y)>>1; i > 0; i>>=1){
            if (thid < i)
                if (TST(sh_f[thid],sh_f[thid+i])){
                    sh_f[thid] = OPT(sh_f[thid],sh_f[thid+i]);
                    sh_x[thid] = OPT(sh_x[thid],sh_x[thid+i]);
                    sh_y[thid] = OPT(sh_y[thid],sh_y[thid+i]);}
            __syncthreads();}
        if (!thid){
            bf[0] = sh_f[0];
            bx[0] = sh_x[0];
            by[0] = sh_y[0];
        }
    }
}
// cpu (naive,serial) function for comparison
float3 opt_cpu(){
    float optx = XNR;
    float opty = YNR;
    float optf = f(optx,opty);
    for (float x = XNR; x < XPR; x += DX)
        for (float y = YNR; y < YPR; y += DY){
            float test = f(x,y);
            if (TST(optf,test)){
                optf = OPT(optf,test);
                optx = OPT(optx,x);
                opty = OPT(opty,y);}}
    return make_float3(optf, optx, opty);
}
int main(){

    // compute threadblock and grid dimensions
    int nx = ceil(XPR-XNR)/DX;
    int ny = ceil(YPR-YNR)/DY;
    int bx = ceil(nx/(float)BLK_X);
    int by = ceil(ny/(float)BLK_Y);
    dim3 threads(BLK_X, BLK_Y);
    dim3 blocks(bx, by);

    float *d_bx, *d_by, *d_bf;
    cudaFree(0);

    // run GPU test case
    unsigned long gtime = dtime_usec(0);
    cudaMalloc(&d_bx, bx*by*sizeof(float));
    cudaMalloc(&d_by, bx*by*sizeof(float));
    cudaMalloc(&d_bf, bx*by*sizeof(float));
    opt_kernel<<<blocks, threads>>>(d_bf, d_bx, d_by, 1.0f/(blocks.x*threads.x), 1.0f/(blocks.y*threads.y));
    float rf, rx, ry;
    cudaMemcpy(&rf, d_bf, sizeof(float), cudaMemcpyDeviceToHost);
    cudaMemcpy(&rx, d_bx, sizeof(float), cudaMemcpyDeviceToHost);
    cudaMemcpy(&ry, d_by, sizeof(float), cudaMemcpyDeviceToHost);
    cudaCheckErrors("some error");
    gtime = dtime_usec(gtime);
    printf("gpu val: %f, x: %f, y: %f, time: %fs\n", rf, rx, ry, gtime/(float)USECPSEC);

    // run CPU test case
    unsigned long ctime = dtime_usec(0);
    float3 cpu_res = opt_cpu();
    ctime = dtime_usec(ctime);
    printf("cpu val: %f, x: %f, y: %f, time: %fs\n", cpu_res.x, cpu_res.y, cpu_res.z, ctime/(float)USECPSEC);
    return 0;
}
$ nvcc -O3 -o t811 t811.cu
$ ./t811
gpu val: 0.100000, x: -0.500000, y: -0.500000, time: 0.193248s
cpu val: 0.100000, x: -0.500017, y: -0.500017, time: 2.810862s
$
Notes:
This problem is set up to find the minimum value of f(x,y) = (x+0.5)^2 + (y+0.5)^2 + 0.1 over the domain: x(-1,1), y(-1,1)
The test was run on Fedora 20, CUDA 7, Quadro5000 GPU (cc2.0) and a Xeon X5560 2.8GHz CPU. Different CPU or GPU will obviously affect the comparison.
The observed speedup here is about 14x. The CPU code is a naive, single threaded code.
It should be possible, for example, via modification of the OPT and TST macros, to perform a different kind of optimization, such as finding a maximum instead of a minimum (see the sketch after these notes).
The domain (and grid) dimensions and granularity to search over can be modified by the compile time constants such as XNR, XPR, etc.
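For instance, a possible redefinition of the macros for maximization might look like this (a sketch only; not exercised in the timings above):

// replace the current value when the candidate is larger, instead of smaller
#define TST(X1,X2) ((X1)<(X2))
// the replacement itself stays the same
#define OPT(X1,X2) (X2)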
I have a problem with executing a CUDA kernel several times. Something is wrong with the environment in my code. The first time the code works properly; the second time, during clean-up of the environment before the third call, there are random crashes.
I think for some reason I have memory corruption. The crashes occur sometimes in the CUDA driver, sometimes in a simple printf, or in kernel32.dll. I suppose that I have a problem with memory management in my code.
What should be done before executing the kernel again?
This code works when I execute it one time.
I'm using CURAND to initialize the random generators.
Here is my code:
#define GRID_BLOCK 64
#define GRID_THREAD 8
#define CITIES 100
#define CIPOW2 101
int lenghtPaths = GRID_BLOCK*GRID_THREAD;
int cities = CITIES;
//prepare CURAND
curandState *devStates;
CUDA_CALL(cudaMalloc((void **)&devStates, GRID_BLOCK*GRID_THREAD*sizeof(curandState)));
/* Setup prng states */
setup_kernel<<<GRID_BLOCK ,GRID_THREAD>>>(devStates);
CUDA_CALL(cudaDeviceSynchronize());
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
    fprintf(stderr, "CURAND preparation failed: %s\n", cudaGetErrorString(cudaStatus));
//copy distance grid to constant memory
cudaMemcpyToSymbol(cdist, dist, sizeof(int) *CIPOW2*CIPOW2);
CUDA_CALL(cudaMalloc((void**)&dev_pathsForThreads, lenghtPaths * cities * sizeof(int)));
CUDA_CALL(cudaMalloc((void**)&d_results, GRID_BLOCK*GRID_THREAD * sizeof(int)));
for (int k = 0; k < 5; k++){
    int* pathsForThreads;
    pathsForThreads = (int*)malloc(lenghtPaths * cities * sizeof(int));
    pathsForThreads = PreaparePaths(Path, lenghtPaths, cities);
    CUDA_CALL(cudaMemcpy(dev_pathsForThreads, pathsForThreads, lenghtPaths *cities*sizeof(int), cudaMemcpyHostToDevice));

    GPUAnnealing<<<GRID_BLOCK ,GRID_THREAD >>>(dev_pathsForThreads, devStates, iterationLimit,temperature, coolingRate, absoluteTemperature, cities,d_results);

    CUDA_CALL(cudaDeviceSynchronize());
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess)
        fprintf(stderr, "GPUAnnealing launch failed: %s\n", cudaGetErrorString(cudaStatus));

    h_results = (int*) malloc(GRID_BLOCK*GRID_THREAD * sizeof(int));
    //Copy lenght of each path to CPU
    CUDA_CALL(cudaMemcpy(h_results, d_results, GRID_BLOCK*GRID_THREAD * sizeof(int),cudaMemcpyDeviceToHost));
    //Copy paths to CPU
    CUDA_CALL(cudaMemcpy(pathsForThreads, dev_pathsForThreads, lenghtPaths *cities*sizeof(int), cudaMemcpyDeviceToHost));

    //check the shortest path
    shortestPath = FindTheShortestPath(h_results);
    fprintf (stdout, "Shortest path on index = %d value = %d \n", shortestPath, h_results[shortestPath]);
    for (int i = 0; i < GRID_BLOCK*GRID_BLOCK ; i++)
        Path[i] = pathsForThreads[shortestPath*CITIES +i];

    free(pathsForThreads);
    free(h_results);
}
CUDA_CALL(cudaFree(dev_pathsForThreads));
CUDA_CALL(cudaFree(d_results));
CUDA_CALL(cudaFree(devStates));
CUDA_CALL(cudaDeviceReset());
This is a bad idea:
pathsForThreads = (int*)malloc(lenghtPaths * cities * sizeof(int));
pathsForThreads = PreaparePaths(Path, lenghtPaths, cities);
If the call to PreaparePaths assigns some other value to pathsForThreads than what was assigned to it by the malloc operation, then later when you do this:
free(pathsForThreads);
You're going to get unpredictable results.
You should not reassign a pointer that you're subsequently going to pass to free to some other value. The man page for free indicates:
free() frees the memory space pointed to by ptr, which must have been
returned by a previous call to malloc(), calloc() or realloc().
So reassigning the pointer to something else is not allowed if you intend to pass it to free().
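A sketch of two possible ways to fix it (the actual signature and behavior of PreaparePaths are not shown in the question, so both variants below are assumptions):

// Option 1: assumes PreaparePaths can fill a caller-provided buffer
int *pathsForThreads = (int*)malloc(lenghtPaths * cities * sizeof(int));
PreaparePaths(pathsForThreads, Path, lenghtPaths, cities); // hypothetical "fill" variant
// ... use pathsForThreads ...
free(pathsForThreads); // frees exactly what malloc returned

// Option 2: assumes PreaparePaths itself allocates with malloc and returns the buffer
int *pathsForThreads = PreaparePaths(Path, lenghtPaths, cities); // no separate malloc
// ... use pathsForThreads ...
free(pathsForThreads); // frees what PreaparePaths allocated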
I've been trying to debug my code, as I know something is going wrong in the kernel, but I can't figure out what specifically. If I try to step into the kernel, the debugger seems to step over the kernel functions entirely, and it eventually causes an error on quitting:
Single stepping until exit from function dyld_stub_cudaSetupArgument,
which has no line number information.
[Launch of CUDA Kernel 0 (incrementArrayOnDevice<<<(3,1,1),(4,1,1)>>>) on
Device 0]
[Termination of CUDA Kernel 0 (incrementArrayOnDevice<<<(3,1,1),
(4,1,1)>>>) on Device 0]
[Launch of CUDA Kernel 1 (fillinBoth<<<(40,1,1),(1,1,1)>>>) on Device 0]
[Termination of CUDA Kernel 1 (fillinBoth<<<(40,1,1),(1,1,1)>>>) on Device 0]
add (below=0x124400, newtip=0x124430, newfork=0x125ac0) at test.cu:1223
And if I try to break in the Kernel my entire computer crashes and I have to restart it.
I figure there must be something wrong with the way I'm calling the kernel, but I can't figure out what.
The code is rather long, so I'm only including an excerpt of it:
__global__ void fillinOne(seqptr qset, long max) {
    int i, j;
    aas aa;
    int idx = blockIdx.x;
    __shared__ long qs[3];
    if (idx < max)
    {
        memcpy(qs, qset[idx], sizeof(long[3]));
        for (i = 0; i <= 1; i++)
        {
            for (aa = ala; (long)aa <= (long)stop; aa = (aas)((long)aa + 1))
            {
                if (((1L << ((long)aa)) & qs[i]) != 0)
                {
                    for (j = i + 1; j <= 2; j++)
                        qs[j] |= cudaTranslate[(long)aa - (long)ala][j - i];
                }
            }
        }
    }
}

//Kernel for left != NULL and rt != NULL
void fillin(node *p, node *left, node *rt)
{
    cudaError_t err = cudaGetLastError();
    size_t stepsize = chars * sizeof(long);
    size_t sitesize = chars * sizeof(sitearray);
    //int i, j;
    if (left == NULL)
    {
        //copy rt->numsteps into p->numsteps--doesn't actually require CUDA, because no computation to do
        memcpy(p->numsteps, rt->numsteps, stepsize);
        checkCUDAError("memcpy");

        //allocate siteset (array of sitearrays) on device
        seqptr qsites; //as in array of qs's
        cudaMalloc((void **) &qsites, sitesize);
        checkCUDAError("malloc");

        //copy rt->siteset into device array (equivalent to memcpy(qs, rs) but for whole array)
        cudaMemcpy(qsites, rt->siteset, sitesize, cudaMemcpyHostToDevice);
        checkCUDAError("memcpy");

        //do loop in device
        int block_size = 1; //each site operated on independently
        int n_blocks = chars;
        fillinOne <<< n_blocks, block_size>>> (qsites, chars);
        cudaThreadSynchronize();

        //put qset in p->siteset--equivalent to memcpy(p->siteset[m], qs)
        cudaMemcpy(p->siteset, qsites, sitesize, cudaMemcpyDeviceToHost);
        checkCUDAError("memcpy");

        //Cleanup
        cudaFree(qsites);
    }
If anyone has any ideas at all, please respond! Thanks in advance!
I suppose you have a single-card configuration. When you are debugging a CUDA kernel and you break inside it, you effectively pause the display driver. That causes what you think is a crash. If you want to use cuda-gdb with only one graphics card, you must use it in console mode (don't start X, or press ctrl-alt-Fn from X).
If you have two cards, you must run the code on the card that is not driving the display. Use cudaSetDevice(n).
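A minimal sketch of selecting a non-display device (the index 1 here is an assumption; check which GPU actually drives your display):

// sketch: run the kernels on a GPU other than the one driving the display
int ndev = 0;
cudaGetDeviceCount(&ndev);
if (ndev > 1)
    cudaSetDevice(1); // call this before any allocations or kernel launches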