What's the alternative for __match_any_sync on compute capability 6? - cuda

In the CUDA examples, e.g. here, __match_any_sync is used.
Here is an example where a warp is split into multiple (one or more) groups that each keep track of their own atomic counter.
// Increment the value at ptr by 1 and return the old value.
// Lanes of the warp that are active on entry and hold the same ptr are
// aggregated: one leader lane issues a single atomicAdd for the whole group.
// Requires compute capability 7.0+ (__match_any_sync).
__device__ int atomicAggInc(int* ptr) {
    // Group the active lanes by address. __match_any_sync takes only
    // (mask, value) -- the predicate out-parameter belongs to
    // __match_all_sync -- and it has no pointer overload, hence the cast.
    const auto mask = __match_any_sync(__activemask(), (unsigned long long)ptr);
    const auto leader = __ffs(mask) - 1; // lowest lane of the group is the leader
    int res = 0; // defined value for the lanes that do not perform the atomic
    const auto lane_id = ThreadId() % warpSize;
    if (lane_id == leader) { // leader does the update for the whole group
        res = atomicAdd(ptr, __popc(mask));
    }
    res = __shfl_sync(mask, res, leader); // get leader's old value
    // Offset by the number of group members in lower lanes. The unsigned
    // literal keeps the shift well defined for lane 31.
    return res + __popc(mask & ((1u << lane_id) - 1)); // compute old value
}
The __match_any_sync here splits up the threads in the warp into groups that have the same ptr value, so that each group can update its own ptr atomically without getting in the way of other threads.
I know the nvcc compiler (since cuda 9) does this sort of optimization under the hood automatically, but this is just about the mechanics of __match_any_sync
Is there a way to do this pre compute capability 7?

EDIT: The blog article has now been modified to reflect __match_any_sync() rather than __match_all_sync(), so any commentary to that effect below should be disregarded. The answer below is edited to reflect this.
Based on your statement:
this is just about the mechanics of __match_any_sync
we will focus on a replacement for __match_any_sync itself, not any other form of rewriting the atomicAggInc function. Therefore, we must provide a mask that has the same value as would be returned by __match_any_sync() on cc7.0 or higher architectures.
I believe this will require a loop, which broadcasts the ptr value, in the worst case one iteration for each thread in the warp (since each thread could have a unique ptr value) and testing which threads have the same value. There are various ways we could "optimize" this loop for this function, so as to possibly reduce the trip count from 32 to some lesser value, based on the actual ptr values in each thread, but such optimization in my view introduces considerable complexity, which makes the worst-case processing time longer (as is typical of early-exit optimizations). So I will demonstrate a fairly simple method without this optimization.
The other consideration is what to do in the case of the warp not being converged? For that, we can employ __activemask() to identify that case.
Here is a worked example:
$ cat t1646.cu
#include <iostream>
#include <stdio.h>
// Increment the value at ptr by 1 and return the old value.
// Lanes that are active on entry and hold the same ptr form a group; the
// lowest lane of each group performs one atomicAdd for all of its members.
// On cc7.0+ the grouping uses __match_any_sync; on older devices it is
// emulated with a shuffle/ballot loop over the lanes of the warp.
// Lane ids are derived from threadIdx.x only (as in the original code).
__device__ int atomicAggInc(int* ptr) {
    unsigned mask = 0;
#if __CUDA_ARCH__ >= 700
    mask = __match_any_sync(__activemask(), (unsigned long long)ptr);
#else
    const unsigned tmask = __activemask();
    const unsigned long long my_ptr = (unsigned long long)ptr;
    const unsigned my_lane = threadIdx.x & (warpSize - 1);
    // Broadcast each lane's ptr in turn and ballot on equality; the ballot
    // taken on a lane's own turn is that lane's match mask.
    for (int i = 0; i < warpSize; i++) {
#ifdef USE_OPT
        // tmask is the same in every participating lane, so this skip is
        // taken uniformly and avoids iterations for inactive source lanes.
        if (!((1U << i) & tmask)) continue;
#endif
        unsigned long long tptr = __shfl_sync(tmask, my_ptr, i);
        unsigned my_mask = __ballot_sync(tmask, (tptr == my_ptr));
        if (i == my_lane) mask = my_mask;
    }
#endif
    const int leader = __ffs(mask) - 1; // select a leader
    int res = 0; // defined value for the lanes that do not perform the atomic
    const unsigned lane_id = threadIdx.x % warpSize;
    if ((int)lane_id == leader) { // leader does the update
        res = atomicAdd(ptr, __popc(mask));
    }
    res = __shfl_sync(mask, res, leader); // get leader's old value
    // Unsigned literal keeps the shift well defined for lane 31.
    return res + __popc(mask & ((1u << lane_id) - 1)); // compute old value
}
// Test kernel: threads 16..31 of the block each bump the counter at
// d[threadIdx.x/4] through atomicAggInc; all other threads do nothing.
__global__ void k(int *d){
    const unsigned tid = threadIdx.x;
    const bool participates = (tid >= 16) && (tid < 32);
    if (participates) {
        // four consecutive threads share one counter
        int *ptr = d + tid/4;
        atomicAggInc(ptr);
    }
}
// Number of counters and number of threads launched.
const int ds = 32;

// Report a failed CUDA runtime call on stderr.
// Returns true when err is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char *what){
    if (err == cudaSuccess) return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Host driver: zero ds counters on the device, run kernel k with one block
// of ds threads, copy the counters back and print them on one line.
// Returns 0 on success and 1 if any CUDA call failed.
int main(){
    int *d_d = NULL;
    int *h_d = new int[ds];
    const size_t bytes = ds*sizeof(d_d[0]);
    bool ok = cudaOk(cudaMalloc(&d_d, bytes), "cudaMalloc")
           && cudaOk(cudaMemset(d_d, 0, bytes), "cudaMemset");
    if (ok){
        k<<<1,ds>>>(d_d);
        // launch-configuration errors show up here; execution errors are
        // reported by the blocking cudaMemcpy that follows
        ok = cudaOk(cudaGetLastError(), "kernel launch")
          && cudaOk(cudaMemcpy(h_d, d_d, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy");
    }
    if (ok){
        for (int i = 0; i < ds; i++)
            std::cout << h_d[i] << " ";
        std::cout << std::endl;
    }
    // release both buffers on every path
    cudaFree(d_d);
    delete[] h_d;
    return ok ? 0 : 1;
}
$ nvcc -o t1646 t1646.cu -DUSE_OPT
$ cuda-memcheck ./t1646
========= CUDA-MEMCHECK
0 0 0 0 4 4 4 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
========= ERROR SUMMARY: 0 errors
$
(CentOS 7, CUDA 10.1.243, with device 0 being Tesla V100, device 1 being a cc3.5 device).
I've added an optional optimization for the case where the warp is diverged (i.e. tmask is not 0xFFFFFFFF). This can be selected by defining USE_OPT.

Related

What is wrong with my understanding of "__shared__" variables in cuda?

After reading the NVIDIA manual, I wrote a parallel reduction code as follows:
// Kernel: each block tries to accumulate its slice of devData into one
// block-shared integer, and thread 0 of the block prints the result.
// devData: array indexed by the global thread index. There is no bounds
// check, so the launch is assumed to cover exactly the array length
// -- TODO confirm against the caller.
__global__ void kernel(int *devData)
{
// one accumulator per block, visible to every thread of that block
__shared__ int sum;
// global element index handled by this thread
int i = blockDim.x * blockIdx.x + threadIdx.x;
// thread 0 initialises the accumulator; the barrier below makes the
// initial value visible before any thread adds to it
if (threadIdx.x == 0)
sum = 0;
__syncthreads();
// NOTE(review): every thread of the block performs this read-modify-write
// on the same shared variable without an atomic, so concurrent updates can
// be lost. atomicAdd(&sum, devData[i]) or a parallel reduction avoids that.
sum += devData[i];
// wait for all additions before thread 0 reads the total
__syncthreads();
if (threadIdx.x == 0)
printf("sum of block %d is %d\n", blockIdx.x, sum);
}
// Host driver: fills an array with ones, copies it to the device and launches
// `kernel` with one thread per element so that every block prints its sum.
// Every CUDA call is routed through checkCudaErrors.
int main(void)
{
    // init device
    int devIdx = 0;
    gpuDeviceInit(devIdx);

    // The element count is derived from the launch configuration, so the
    // kernel (which has no bounds check) never indexes past the array.
    const int blocksPerGrid = 10;
    const int threadsPerBlock = 10;
    const int count = blocksPerGrid * threadsPerBlock;

    int data[count];
    for (int i = 0; i < count; i++)
        data[i] = 1;

    int *devData = NULL;
    checkCudaErrors(cudaMalloc(&devData, count * sizeof(int)));
    // copy data to device
    checkCudaErrors(cudaMemcpy(devData, data, count * sizeof(int), cudaMemcpyHostToDevice));

    // call kernel function
    kernel <<<blocksPerGrid, threadsPerBlock>>> (devData);
    checkCudaErrors(cudaGetLastError());
    // Wait for the kernel: surfaces execution errors and flushes the device
    // printf output before the device is torn down.
    checkCudaErrors(cudaDeviceSynchronize());

    checkCudaErrors(cudaFree(devData));
    cudaDeviceReset();
    return 0;
}
I'm trying to sum integers for each block and then print this sum.
But I found the result was as follows:
sum of block 0 is 1
sum of block 6 is 1
sum of block 2 is 1
sum of block 8 is 1
sum of block 1 is 1
sum of block 7 is 1
sum of block 4 is 1
sum of block 3 is 1
sum of block 9 is 1
sum of block 5 is 1
The result I expected was 10. Is the __shared__ variable "sum" shared by every thread in a block? What's wrong with my understanding of "__shared__" variables in CUDA?
you have multiple threads trying to access (read-modify-write) sum at the same time, here:
sum += devData[i];
This doesn't work for either global or shared data in CUDA (i.e. CUDA won't sort that out for you, automatically). To sort this out, the usual approaches are either to use atomics or else to use a canonical parallel reduction
There are numerous questions on both of these topics here on the cuda SO tag, and you can get some focused training on parallel reduction methods in unit 5 of this online training series.
For example, in your code, a trivial change to "fix" would be to replace the above line of code with an atomic add:
atomicAdd(&sum,devData[i]);
atomics force serialization, so a preferred approach is a canonical parallel reduction.

How to use CUDA tex1DFetch with cudaTextureObject_t?

I was working with texture references when I noticed they were deprecated, I tried to update my test function to work with the 'new' bindless texture objects with tex1Dfetch but was not able to produce the same results.
I'm currently exploring the use of texture memory to speed up my aho-corasick implementation; I was able to get tex1D() working with texture references, however, I noticed they were deprecated and decided to use texture objects instead.
I'm getting some immensely weird behaviour with the kernels when I try to use the results in any way; I can do results[tidx] = tidx; without any issues, but results[tidx] = temp + 1; only ever returns the value of temp not temp * 3 or any other numerical test involving temp.
I can see no logical reason for this behaviour, and the documentation examples look similar enough that I can't see where I've gone wrong.
I've already read CUDA tex1Dfetch() wrong behaviour and New CUDA Texture Object — getting wrong data in 2D case but neither seem related to the issue I am having.
Just in case it makes a difference: I am using CUDA release 10.0, V10.0.130 with an Nvidia GTX 980 Ti.
#include <iostream>
// Kernel: each thread fetches one element from the linear texture object at
// index threadIdx.x, triples it and stores it at the thread's flattened
// (x, y) index within the block.
__global__ void test(cudaTextureObject_t tex ,int* results){
    // element fetch, no filtering: tex1Dfetch takes an integer index
    const unsigned temp = tex1Dfetch<unsigned>(tex, threadIdx.x);
    // flattened position of this thread inside the block
    const int tidx = threadIdx.x + threadIdx.y * blockDim.x;
    results[tidx] = 3 * temp;
}
// Host driver for the texture-object test: fills a pinned host array,
// describes it as a linear texture resource, launches `test` with one thread
// per element and prints whatever ends up in host_arr afterwards.
// NOTE(review): none of the CUDA return codes in this function are checked,
// so a failure in any call below goes unnoticed.
int main(){
int *host_arr;
const int host_arr_size = 8;
// Create and populate host array
std::cout << "Host:" << std::endl;
// pinned host allocation; this is host memory, not device memory
cudaMallocHost(&host_arr, host_arr_size*sizeof(int));
for (int i = 0; i < host_arr_size; ++i){
host_arr[i] = i * 2;
std::cout << host_arr[i] << std::endl;
}
// Create resource description
// NOTE(review): resDesc is not zero-initialised; only the fields assigned
// below have defined values.
struct cudaResourceDesc resDesc;
resDesc.resType = cudaResourceTypeLinear;
// NOTE(review): this stores the address of the host pointer variable itself,
// not a device pointer to the data; no device copy of the array is made
// anywhere in this function.
resDesc.res.linear.devPtr = &host_arr;
resDesc.res.linear.sizeInBytes = host_arr_size*sizeof(unsigned);
resDesc.res.linear.desc = cudaCreateChannelDesc<unsigned>();
// Create texture description
// NOTE(review): texDesc is not zero-initialised; every field other than
// readMode is indeterminate when it is passed to cudaCreateTextureObject.
struct cudaTextureDesc texDesc;
texDesc.readMode = cudaReadModeElementType;
// Create texture
cudaTextureObject_t tex;
cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
// Allocate results array
int * result_arr;
cudaMalloc(&result_arr, host_arr_size*sizeof(unsigned));
// launch test kernel
test<<<1, host_arr_size>>>(tex, result_arr);
// fetch results
std::cout << "Device:" << std::endl;
// host_arr is reused as the destination, so if this copy fails the buffer
// still holds the input values that were printed above
cudaMemcpy(host_arr, result_arr, host_arr_size*sizeof(unsigned), cudaMemcpyDeviceToHost);
// print results
for (int i = 0; i < host_arr_size; ++i){
std::cout << host_arr[i] << std::endl;
}
// Tidy Up
cudaDestroyTextureObject(tex);
cudaFreeHost(host_arr);
cudaFree(result_arr);
}
I expected the above to work similarly to the below (which does work):
texture<int, 1, cudaReadModeElementType> tex_ref;
cudaArray* cuda_array;
// Kernel: each thread reads the texel at its x index through the file-scope
// texture reference tex_ref and stores three times that value.
__global__ void test(int* results){
    const int tidx = threadIdx.x;
    const int texel = tex1D(tex_ref, tidx);
    results[tidx] = texel * 3;
}
// Host driver for the texture-reference variant: fills a pinned host array,
// copies it into a CUDA array bound to tex_ref, runs `test` with one thread
// per element and prints the values copied back from the device.
int main(){
    int host_arr_size = 8;
    const size_t arr_bytes = host_arr_size * sizeof(int);

    // Create and populate host array, echoing each input value
    int *host_arr;
    cudaMallocHost((void**)&host_arr, arr_bytes);
    int idx = 0;
    while (idx < host_arr_size){
        host_arr[idx] = idx * 2;
        std::cout << host_arr[idx] << std::endl;
        ++idx;
    }

    // Stage the data in a CUDA array and bind it to the texture reference
    cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc<int>();
    cudaMallocArray(&cuda_array, &channel_desc, host_arr_size);
    cudaMemcpyToArray(cuda_array, 0, 0, host_arr, arr_bytes, cudaMemcpyHostToDevice);
    cudaBindTextureToArray(tex_ref, cuda_array);

    // Device buffer for the kernel output
    int *result_arr;
    cudaMalloc((void**)&result_arr, arr_bytes);

    // One block, one thread per element
    test<<<1, host_arr_size>>>(result_arr);

    // Overwrite the host array with the device results and print them
    cudaMemcpy(host_arr, result_arr, arr_bytes, cudaMemcpyDeviceToHost);
    for (idx = 0; idx < host_arr_size; ++idx){
        std::cout << host_arr[idx] << std::endl;
    }

    // Tidy Up
    cudaUnbindTexture(tex_ref);
    cudaFreeHost(host_arr);
    cudaFreeArray(cuda_array);
    cudaFree(result_arr);
}
Expected results:
Host:
0
2
4
6
8
10
12
14
Device:
0
6
12
18
24
30
36
42
Actual results:
Host:
0
2
4
6
8
10
12
14
Device:
0
2
4
6
8
10
12
14
Does anyone know what on earth is going wrong?
CUDA API function calls return error codes. You want to check these error codes. Especially when something is clearly going wrong somewhere…
You use the same array to store the initial array data as well as to receive the result from the device. Your kernel launch fails with an illegal address error because you do not have a valid texture object. You do not have a valid texture object because the creation of your texture object failed. The first API call right after the kernel launch is the cudaMemcpy() to get the results back. Since there was an error during the kernel launch, cudaMemcpy() will fail, returning the most recent error instead of performing the copy. As a result, the contents of your host_arr buffer are unchanged and you just end up displaying the original input data again.
The reason why the creation of your texture object failed is explained in the documentation (emphasis mine):
If cudaResourceDesc::resType is set to cudaResourceTypeLinear, cudaResourceDesc::res::linear::devPtr must be set to a valid device pointer, that is aligned to cudaDeviceProp::textureAlignment. […]
A texture object cannot reference host memory. The problem in your code lies here:
resDesc.res.linear.devPtr = &host_arr;
You need to allocate a buffer in device memory, e.g., using cudaMalloc(), copy your data there, and create a texture object that refers to that device buffer.
Furthermore, your texDesc is not initialized properly. In your case, it should be sufficient to just zero-initialize it:
struct cudaTextureDesc texDesc = {};
4 steps:
declare
// 1D texture reference of unsigned char elements, read back unconverted
texture<unsigned char,1,cudaReadModeElementType> tex1;
bind
cudaBindTexture(0,tex1,dev_A);
fetch/read via index
tex1Dfetch(tex1,2);
unbind
cudaUnbindTexture(tex1);

cublasDgemm getting more slower

I have a problem when using cublasDgemm (a cuBLAS function; here it computes A*B, where A is 750x600 and B is 600x1000).
for (i=0; i < N; ++i) {
cublasDgemm();
}
N=10, total time is 0.000473s, average call is 0.0000473
N=100, total time is 0.00243s, average call is 0.0000243
N=1000, total time is 0.715072s, average call is 0.000715
N=10000, total time is 10.4998s, average call is 0.00104998
why the average time is increasing so much?
#include <cuda_runtime.h>
#include <string.h>
#include <cublas.h>
#include <cublas_v2.h>
#include <time.h>
#include <sys/time.h>
#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
using namespace std;
// Flat index j*leading + i, i.e. element (i, j) of a matrix stored with
// leading dimension `leading`.
#define IDX2C(i,j,leading) (((j)*(leading))+(i))
// Print file, line and both values, then exit(1), when a != b.
// The arguments are parenthesised in the message as well as in the
// comparison, so expressions such as `x & 1` stream correctly.
// Note: both arguments are evaluated a second time on failure.
#define CHECK_EQ(a,b) do { \
if ((a) != (b)) { \
cout <<__FILE__<<" : "<< __LINE__<<" : check failed because "<<(a)<<"!="<<(b)<<endl;\
exit(1);\
}\
} while(0)
// Evaluate a cuBLAS call once and abort unless it returned CUBLAS_STATUS_SUCCESS.
#define CUBLAS_CHECK(condition) \
do {\
cublasStatus_t status = (condition); \
CHECK_EQ(status, CUBLAS_STATUS_SUCCESS); \
} while(0)
// Evaluate a CUDA runtime call once and abort unless it returned cudaSuccess.
#define CUDA_CHECK(condition)\
do {\
cudaError_t error = (condition);\
CHECK_EQ(error, cudaSuccess);\
} while(0)
//check after kernel function
#define CUDA_POST_KERNEL_CHECK CUDA_CHECK(cudaPeekAtLastError())
// Seed rand() from the wall clock the first time it is called; later calls
// are no-ops.
static void seedRandOnce() {
    static bool seeded = false;
    if (!seeded) {
        srand((unsigned int)time(NULL));
        seeded = true;
    }
}

// Fill mat[0..n-1] with pseudo-random values in [-range, range].
//   mat   : destination buffer with room for at least n elements
//   n     : number of elements to write (nothing is written when n <= 0)
//   range : magnitude bound of the generated values
// The generator is seeded only once. Re-seeding with time(NULL) on every
// call made two calls within the same second produce the same sequence.
template <class T>
void randMtx(T *mat, int n, double range) {
    seedRandOnce();
    for (int i = 0; i < n; ++i) {
        // random sign, then a magnitude scaled into [0, range]
        const double sign = (rand() % 2 == 0) ? -1.0 : 1.0;
        mat[i] = sign * rand() / RAND_MAX * range;
    }
}
// Benchmark driver: times `count` back-to-back cublasDgemm calls.
// Usage: m1_row m1_col m2_row m2_col m1 m2 count range
//   m1 / m2 : a value starting with 't' selects the transposed operand
//   count   : number of Dgemm calls to issue
//   range   : magnitude bound for the random matrix entries
// Returns 0 on success and -1 on a usage or host-allocation error; CUDA and
// cuBLAS failures terminate the process through the CHECK macros.
int main(int argc, char *argv[]) {
    if (argc != 9) {
        cout << "m1_row m1_col m2_row m2_col m1 m2 count range\n";
        return -1;
    }
    const int row1 = atoi(argv[1]);
    const int col1 = atoi(argv[2]);
    const int row2 = atoi(argv[3]);
    const int col2 = atoi(argv[4]);
    const int count = atoi(argv[7]);
    const double range = atof(argv[8]);

    // Shape of the result and the shared inner dimension, adjusted for the
    // requested transpositions.
    cublasOperation_t opt1 = CUBLAS_OP_N;
    cublasOperation_t opt2 = CUBLAS_OP_N;
    int row3 = row1;
    int col3 = col2;
    int k = col1;
    if (argv[5][0] == 't') {
        opt1 = CUBLAS_OP_T;
        row3 = col1;
        k = row1;
    }
    if (argv[6][0] == 't') {
        opt2 = CUBLAS_OP_T;
        col3 = row2;
    }

    const size_t bytes1 = sizeof(double)*row1*col1;
    const size_t bytes2 = sizeof(double)*row2*col2;
    const size_t bytes3 = sizeof(double)*row3*col3;

    double *mat1_c = (double*)malloc(bytes1);
    double *mat2_c = (double*)malloc(bytes2);
    double *mat3_c = (double*)malloc(bytes3);
    if (mat1_c == NULL || mat2_c == NULL || mat3_c == NULL) {
        cout << "host allocation failed" << endl;
        free(mat1_c);
        free(mat2_c);
        free(mat3_c);
        return -1;
    }
    srand((unsigned int)time(NULL));
    randMtx(mat1_c, row1*col1, range);
    randMtx(mat2_c, row2*col2, range);

    double *mat1_g;
    double *mat2_g;
    double *mat3_g;
    double alpha = 1.0;
    double beta = 0.0;
    CUDA_CHECK(cudaMalloc((void **)&(mat1_g), bytes1));
    CUDA_CHECK(cudaMalloc((void **)&(mat2_g), bytes2));
    CUDA_CHECK(cudaMalloc((void **)&(mat3_g), bytes3));
    CUDA_CHECK(cudaMemcpy(mat1_g, mat1_c, bytes1, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(mat2_g, mat2_c, bytes2, cudaMemcpyHostToDevice));

    cublasHandle_t handle;
    CUBLAS_CHECK(cublasCreate(&handle));

    struct timeval beg, end;
    gettimeofday(&beg, NULL);
    for (int i = 0; i < count; ++i) {
        CUBLAS_CHECK(cublasDgemm(handle, opt1, opt2, row3, col3, k, &alpha, mat1_g, row1, mat2_g, row2, &beta, mat3_g, row3));
    }
    // Dgemm calls are queued asynchronously; wait for all of them so the
    // interval covers execution, and surface any execution error.
    CUDA_CHECK(cudaDeviceSynchronize());
    gettimeofday(&end, NULL);
    cout << "real time used: " << end.tv_sec-beg.tv_sec + (double)(end.tv_usec-beg.tv_usec)/1000000 <<endl;

    CUBLAS_CHECK(cublasDestroy(handle));
    free(mat1_c);
    free(mat2_c);
    free(mat3_c);
    CUDA_CHECK(cudaFree(mat1_g));
    CUDA_CHECK(cudaFree(mat2_g));
    CUDA_CHECK(cudaFree(mat3_g));
    return 0;
}
this is the code. I add cudaDeviceSynchronize after the loop block, and no matter the value of count, the average call time is about 0.001s
As pointed out by @talonmies, this behavior is probably exactly what would be expected.
When you call cublasDgemm, the call (usually) returns control to the host (CPU) thread, before the operation is complete. In fact there is a queue that calls like this will go into, each time you make the call. The operation will be placed into a queue, and your host code will continue.
Furthermore, CUDA and CUBLAS usually have some one-time overhead that is associated with using the API. For example, the call to create a CUBLAS handle usually incurs some measurable time, in order to initialize the library.
So your measurements can be broken into 3 groups:
"Small" iteration counts (e.g. 10). In this case, each call pays the cost to put a Dgemm request into the queue, plus the amortization of the startup costs over a relatively small number of iterations. This corresponds to your measurements like this: "average call is 0.0000473"
"Medium" iteration counts (e.g. 100-1000). In this case, the amortization of the start up costs becomes very small per call, and so most of the measurement is just the time to add a Dgemm request to the queue. This corresponds to your measurements like this: "average call is 0.0000243"
"Large" iteration counts (e.g. 10000). At some point, the internal request queue becomes full and can no longer accept new requests, until some requests have been completed and removed from the queue. What happens at this point is that the Dgemm call switches from non-blocking to blocking. It blocks (holds up the host/CPU thread) until a queue slot becomes available. What happens at this point then, is that suddenly new requests must wait effectively for a previous request to finish, so now the cost for a new Dgemm request approximately equals the time to execute and complete a (previous) Dgemm request. So the per-call cost jumps up dramatically from the cost to add an item to the queue to the cost to complete a request. This corresponds to your measurements like this: "average call is 0.00104998"

Shared memory, branching performance and register count

I came across some peculiar performance behaviour when trying out the CUDA shuffle instruction. The test kernel below is based on an image processing algorithm which adds input-dependent values to all neighbouring pixels within a square of side rad. The output for each block is added in shared memory. If only one thread per warp adds its result to shared memory, the performance is poor (Option 1), whereas on the other hand, if all threads add to shared memory (one thread adds the desired value, the rest just add 0), the execution time drops by 2-3 times (Option 2).
#include <iostream>
#include "cuda_runtime.h"
#define warpSz 32
#define tileY 32
#define rad 32
// Test kernel: every block accumulates values into a zero-initialised
// shared-memory tile of (warpSz + 2*rad) x (tileY + 2*rad) floats and then
// adds the tile into `out` with atomicAdd.
//   out   : output image in global memory, accumulated into (not overwritten)
//   pitch : row stride of `out`, in elements
// The shuffle index uses % warpSz, so blockDim.x is presumably expected to
// equal warpSz -- TODO confirm against the launch.
__global__ void test(float *out, int pitch)
{
// Set shared mem to 0
__shared__ float tile[(warpSz + 2*rad) * (tileY + 2*rad)];
// all threads of the block cooperate, striding by the block's thread count
for (int i = threadIdx.y*blockDim.x+threadIdx.x; i<(tileY+2*rad)*(warpSz+2*rad); i+=blockDim.x*blockDim.y) {
tile[i] = 0.0f;
}
__syncthreads();
// each threadIdx.y handles rows row, row + blockDim.y, ... of the tile
for (int row=threadIdx.y; row<tileY; row += blockDim.y) {
// Loop over pixels in neighbourhood
for (int i=0; i<2*rad+1; ++i) {
float res = 0.0f;
// first tile element of the row this iteration writes to
int rowStartIdx = (row+i)*(warpSz+2*rad);
for (int j=0; j<2*rad+1; ++j) {
res += float(threadIdx.x+row); // Substitute for real calculation
// Option 1: one thread writes to shared mem
if (threadIdx.x == 0) {
tile[rowStartIdx + j] += res;
res = 0.0f;
}
//// Option 2: all threads write to shared mem
//float tmp = 0.0f;
//if (threadIdx.x == 0) {
// tmp = res;
// res = 0.0f;
//}
//tile[rowStartIdx + threadIdx.x+j] += tmp;
// each lane takes the value held by the next lane (wrapping at warpSz);
// legacy mask-less shuffle, which is not available on Volta and newer
res = __shfl(res, (threadIdx.x+1) % warpSz);
}
res += float(threadIdx.x+row);
// flush what is left in each lane to the right-hand part of the row
tile[rowStartIdx + threadIdx.x+2*rad] += res;
// barrier inside the row loop: it relies on every thread of the block
// executing the same number of row iterations
__syncthreads();
}
}
// Add result back to global mem
for (int row=threadIdx.y; row<tileY+2*rad; row+=blockDim.y) {
for (int col=threadIdx.x; col<warpSz+2*rad; col+=warpSz) {
int idx = (blockIdx.y*tileY + row)*pitch + blockIdx.x*warpSz + col;
// atomic because the index ranges of neighbouring blocks overlap by 2*rad
atomicAdd(out+idx, tile[row*(warpSz+2*rad) + col]);
}
}
}
// Timing harness: launches `test` over a 512x512 image padded by 2*rad in
// each dimension and prints the kernel time measured with CUDA events.
// Returns 0 on success and 1 if the output buffer cannot be set up.
int main(void)
{
    int2 dim = make_int2(512, 512);
    // row pitch of the padded output, rounded up to a multiple of warpSz
    int pitchOut = (((dim.x+2*rad)+warpSz-1) / warpSz) * warpSz;
    int sizeOut = pitchOut*(dim.y+2*rad);
    dim3 gridDim((dim.x+warpSz-1)/warpSz, (dim.y+tileY-1)/tileY, 1);

    float *devOut = NULL;
    if (cudaMalloc((void**)&devOut, sizeOut*sizeof(float)) != cudaSuccess) {
        std::cerr << "cudaMalloc failed\n";
        return 1;
    }
    // The kernel accumulates into the buffer with atomicAdd, so it has to
    // start from zero (all-zero bytes are 0.0f).
    if (cudaMemset(devOut, 0, sizeOut*sizeof(float)) != cudaSuccess) {
        std::cerr << "cudaMemset failed\n";
        cudaFree(devOut);
        return 1;
    }

    cudaEvent_t start, stop;
    float elapsedTime = 0.0f;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaFree(0);

    cudaEventRecord(start, 0);
    test<<<gridDim, dim3(warpSz, 8)>>>(devOut, pitchOut);
    cudaEventRecord(stop, 0);
    // read the launch status after the stop record so the timed region is
    // not disturbed
    cudaError_t launchErr = cudaGetLastError();
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);
    if (launchErr != cudaSuccess) {
        std::cerr << "kernel launch failed: " << cudaGetErrorString(launchErr) << "\n";
    }

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(devOut);
    cudaDeviceReset();
    std::cout << "Elapsed time: " << elapsedTime << " ms.\n";
    std::cin.ignore();
    return 0;
}
Is this expected behaviour/can anyone explain why this happens?
One thing I have noted is that Option 1 uses only 15 registers, whereas Option 2 uses 37, which seems a big difference to me.
Another is that the if-statement in the innermost loop is converted to explicit bra instructions in the PTX code for Option 1, whereas for Option 2 it is converted to two selp instructions. Could it be that the explicit branching is behind the 2-3 times slow down similar to what's suspected in this question?
There are two reasons why I am reluctant to go for Option 2. First, when profiling the original application it seems to be limited by shared memory bandwidth, which indicates that there is potential to increase the performance by having fewer threads accessing it. Second, unless we use the volatile keyword, writes to shared memory can be optimised to registers. Since we are only interested in the contribution from the last thread to access each memory location (threadIdx.x == 0), and all others add 0, this is not a problem as long as all changes temporarily located in registers are guaranteed to be written back to shared memory in the same order they were issued. Is this the case though? (So far, both options have produced the exact same result.)
Any thoughts or ideas are much appreciated!
PS. I compile for compute capability 3.0. (However, the shuffle instruction is not necessary to demonstrate the behaviour and can be commented out.)

CUDA-GDB crashes in Kernel

I've been trying to debug my code, as I know something is going wrong in the Kernel, and I've been trying to figure out what specifically. If I try to step into the kernel it seems to completely step over the kernel functions, and will eventually cause an error on quitting:
Single stepping until exit from function dyld_stub_cudaSetupArgument,
which has no line number information.
[Launch of CUDA Kernel 0 (incrementArrayOnDevice<<<(3,1,1),(4,1,1)>>>) on
Device 0]
[Termination of CUDA Kernel 0 (incrementArrayOnDevice<<<(3,1,1),
(4,1,1)>>>) on Device 0]
[Launch of CUDA Kernel 1 (fillinBoth<<<(40,1,1),(1,1,1)>>>) on Device 0]
[Termination of CUDA Kernel 1 (fillinBoth<<<(40,1,1),(1,1,1)>>>) on Device 0]
add (below=0x124400, newtip=0x124430, newfork=0x125ac0) at test.cu:1223
And if I try to break in the Kernel my entire computer crashes and I have to restart it.
I figure there must be something wrong with the way I'm calling the kernel, but I can't figure out what.
The code is rather long, so I'm only including an excerpt of it:
// Kernel: one block per site (site index = blockIdx.x). Copies the three
// longs of qset[site] into a shared scratch array and, for each of the first
// two entries, ORs cudaTranslate values into the later entries for every code
// in [ala, stop] whose bit is set in the source entry.
//   qset : per-site data; each element is copied as long[3]
//   max  : number of sites; blocks with index >= max do nothing
// NOTE(review): qs is never copied back to qset[site] in this kernel, so the
// updates do not appear to reach global memory -- confirm whether a
// write-back is intended.
__global__ void fillinOne(seqptr qset, long max) {
    const int idx = blockIdx.x;
    __shared__ long qs[3];
    if (!(idx < max))
        return;
    memcpy(qs, qset[idx], sizeof(long[3]));
    for (int src = 0; src <= 1; src++)
    {
        aas aa = ala;
        while ((long)aa <= (long)stop)
        {
            const bool bitSet = ((1L << ((long)aa)) & qs[src]) != 0;
            if (bitSet)
            {
                // propagate into every entry after src
                for (int dst = src + 1; dst <= 2; dst++)
                    qs[dst] |= cudaTranslate[(long)aa - (long)ala][dst - src];
            }
            aa = (aas)((long)aa + 1);
        }
    }
}
//Kernel for left!= NULL and rt != NULL
void fillin(node *p, node *left, node *rt)
{
cudaError_t err = cudaGetLastError();
size_t stepsize = chars * sizeof(long);
size_t sitesize = chars * sizeof(sitearray);
//int i, j;
if (left == NULL)
{
//copy rt->numsteps into p->numsteps--doesn't actually require CUDA, because no computation to do
memcpy(p->numsteps, rt->numsteps, stepsize);
checkCUDAError("memcpy");
//allocate siteset (array of sitearrays) on device
seqptr qsites; //as in array of qs's
cudaMalloc((void **) &qsites, sitesize);
checkCUDAError("malloc");
//copy rt->siteset into device array (equivalent to memcpy(qs, rs) but for whole array)
cudaMemcpy(qsites, rt->siteset, sitesize, cudaMemcpyHostToDevice);
checkCUDAError("memcpy");
//do loop in device
int block_size = 1; //each site operated on independently
int n_blocks = chars;
fillinOne <<< n_blocks, block_size>>> (qsites, chars);
cudaThreadSynchronize();
//put qset in p->siteset--equivalent to memcpy(p->siteset[m], qs)
cudaMemcpy(p->siteset, qsites, sitesize, cudaMemcpyDeviceToHost);
checkCUDAError("memcpy");
//Cleanup
cudaFree(qsites);
}
If anyone has any ideas at all, please respond! Thanks in advance!
I suppose you have a single card configuration. When you are debugging a cuda kernel and you break inside it you effectively put the display driver in pause. That causes what you think is a crash. If you want to use the cuda-gdb with only one graphics card you must use it in command line mode (don't start X or press ctrl-alt-fn from X).
If you have two cards, you must run the code on the card that is not driving the display. Use cudaSetDevice(n) to select it.