How to use CUDA tex1Dfetch with cudaTextureObject_t?

I was working with texture references when I noticed they were deprecated. I tried to update my test function to work with the 'new' bindless texture objects using tex1Dfetch, but was not able to produce the same results.
I'm currently exploring the use of texture memory to speed up my aho-corasick implementation; I was able to get tex1D() working with texture references, however, I noticed they were deprecated and decided to use texture objects instead.
I'm getting some immensely weird behaviour from the kernel when I try to use the fetched value in any way; I can do results[tidx] = tidx; without any issues, but results[tidx] = temp + 1; only ever gives back the value of temp, as does temp * 3 or any other arithmetic involving temp.
I can see no logical reason for this behaviour, and the documentation examples look similar enough that I can't see where I've gone wrong.
I've already read CUDA tex1Dfetch() wrong behaviour and New CUDA Texture Object — getting wrong data in 2D case, but neither seems related to the issue I am having.
Just in case it makes a difference: I am using CUDA release 10.0, V10.0.130 with an Nvidia GTX 980 Ti.
#include <iostream>

__global__ void test(cudaTextureObject_t tex, int* results){
    int tidx = threadIdx.y * blockDim.x + threadIdx.x;
    unsigned temp = tex1Dfetch<unsigned>(tex, threadIdx.x);
    results[tidx] = temp * 3;
}

int main(){
    int *host_arr;
    const int host_arr_size = 8;

    // Create and populate host array
    std::cout << "Host:" << std::endl;
    cudaMallocHost(&host_arr, host_arr_size*sizeof(int));
    for (int i = 0; i < host_arr_size; ++i){
        host_arr[i] = i * 2;
        std::cout << host_arr[i] << std::endl;
    }

    // Create resource description
    struct cudaResourceDesc resDesc;
    resDesc.resType = cudaResourceTypeLinear;
    resDesc.res.linear.devPtr = &host_arr;
    resDesc.res.linear.sizeInBytes = host_arr_size*sizeof(unsigned);
    resDesc.res.linear.desc = cudaCreateChannelDesc<unsigned>();

    // Create texture description
    struct cudaTextureDesc texDesc;
    texDesc.readMode = cudaReadModeElementType;

    // Create texture
    cudaTextureObject_t tex;
    cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);

    // Allocate results array
    int * result_arr;
    cudaMalloc(&result_arr, host_arr_size*sizeof(unsigned));

    // launch test kernel
    test<<<1, host_arr_size>>>(tex, result_arr);

    // fetch results
    std::cout << "Device:" << std::endl;
    cudaMemcpy(host_arr, result_arr, host_arr_size*sizeof(unsigned), cudaMemcpyDeviceToHost);

    // print results
    for (int i = 0; i < host_arr_size; ++i){
        std::cout << host_arr[i] << std::endl;
    }

    // Tidy Up
    cudaDestroyTextureObject(tex);
    cudaFreeHost(host_arr);
    cudaFree(result_arr);
}
I expected the above to work similarly to the below (which does work):
texture<int, 1, cudaReadModeElementType> tex_ref;
cudaArray* cuda_array;

__global__ void test(int* results){
    const int tidx = threadIdx.x;
    results[tidx] = tex1D(tex_ref, tidx) * 3;
}

int main(){
    int *host_arr;
    int host_arr_size = 8;

    // Create and populate host array
    cudaMallocHost((void**)&host_arr, host_arr_size * sizeof(int));
    for (int i = 0; i < host_arr_size; ++i){
        host_arr[i] = i * 2;
        std::cout << host_arr[i] << std::endl;
    }

    // bind to texture
    cudaChannelFormatDesc cuDesc = cudaCreateChannelDesc<int>();
    cudaMallocArray(&cuda_array, &cuDesc, host_arr_size);
    cudaMemcpyToArray(cuda_array, 0, 0, host_arr, host_arr_size * sizeof(int), cudaMemcpyHostToDevice);
    cudaBindTextureToArray(tex_ref, cuda_array);

    // Allocate results array
    int * result_arr;
    cudaMalloc((void**)&result_arr, host_arr_size*sizeof(int));

    // launch kernel
    test<<<1, host_arr_size>>>(result_arr);

    // fetch results
    cudaMemcpy(host_arr, result_arr, host_arr_size * sizeof(int), cudaMemcpyDeviceToHost);

    // print results
    for (int i = 0; i < host_arr_size; ++i){
        std::cout << host_arr[i] << std::endl;
    }

    // Tidy Up
    cudaUnbindTexture(tex_ref);
    cudaFreeHost(host_arr);
    cudaFreeArray(cuda_array);
    cudaFree(result_arr);
}
Expected results:
Host:
0
2
4
6
8
10
12
14
Device:
0
6
12
18
24
30
36
42
Actual results:
Host:
0
2
4
6
8
10
12
14
Device:
0
2
4
6
8
10
12
14
Does anyone know what on earth is going wrong?

CUDA API function calls return error codes. You want to check these error codes. Especially when something is clearly going wrong somewhere…
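For example, a minimal checking pattern might look like this (the CUDA_CHECK name is my own, not something from the CUDA toolkit; any equivalent wrapper will do):

#include <cstdio>

#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess)                                           \
            fprintf(stderr, "CUDA error '%s' at %s:%d\n",                  \
                    cudaGetErrorString(err_), __FILE__, __LINE__);         \
    } while (0)

// Usage in the question's code:
//   CUDA_CHECK(cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL));
//   test<<<1, host_arr_size>>>(tex, result_arr);
//   CUDA_CHECK(cudaGetLastError());        // catches launch errors
//   CUDA_CHECK(cudaDeviceSynchronize());   // catches asynchronous execution errors

Checking cudaGetLastError() right after the launch (and cudaDeviceSynchronize() afterwards) is what would have surfaced the failed texture-object creation and the illegal-address error here.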
You use the same array to store the initial array data as well as to receive the result from the device. Your kernel launch fails with an illegal address error because you do not have a valid texture object. You do not have a valid texture object because the creation of your texture object failed. The first API call right after the kernel launch is the cudaMemcpy() to get the results back. Since there was an error during the kernel launch, cudaMemcpy() will fail, returning the most recent error instead of performing the copy. As a result, the contents of your host_arr buffer are unchanged and you just end up displaying the original input data again.
The reason why the creation of your texture object failed is explained in the documentation (emphasis mine):
If cudaResourceDesc::resType is set to cudaResourceTypeLinear, cudaResourceDesc::res::linear::devPtr must be set to a valid device pointer, that is aligned to cudaDeviceProp::textureAlignment. […]
A texture object cannot reference host memory. The problem in your code lies here:
resDesc.res.linear.devPtr = &host_arr;
You need to allocate a buffer in device memory, e.g. using cudaMalloc(), copy your data there, and create a texture object that refers to that device buffer.
Furthermore, your texDesc is not initialized properly. In your case, it should be sufficient to just zero-initialize it:
struct cudaTextureDesc texDesc = {};
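Putting both fixes together, the setup could look something like this (a sketch reusing the question's variable names, with error checking omitted for brevity; the kernel's tex1Dfetch<unsigned> would then become tex1Dfetch<int>, or everything can be kept unsigned, as long as the types match):

// Allocate a device buffer and copy the host data into it
int *dev_arr;
cudaMalloc(&dev_arr, host_arr_size * sizeof(int));
cudaMemcpy(dev_arr, host_arr, host_arr_size * sizeof(int), cudaMemcpyHostToDevice);

// Resource description must point at device memory
struct cudaResourceDesc resDesc = {};            // zero-initialize
resDesc.resType = cudaResourceTypeLinear;
resDesc.res.linear.devPtr = dev_arr;             // device pointer, not &host_arr
resDesc.res.linear.sizeInBytes = host_arr_size * sizeof(int);
resDesc.res.linear.desc = cudaCreateChannelDesc<int>();

// Texture description, zero-initialized before setting fields
struct cudaTextureDesc texDesc = {};
texDesc.readMode = cudaReadModeElementType;

cudaTextureObject_t tex;
cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);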

There are 4 steps:
1. Declare:
texture<unsigned char, 1, cudaReadModeElementType> tex1;
2. Bind:
cudaBindTexture(0, tex1, dev_A);
3. Fetch/read via index:
tex1Dfetch(tex1, 2);
4. Unbind:
cudaUnbindTexture(tex1);
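For completeness, a minimal sketch tying the four steps together (note this is the deprecated texture-reference API the original question is trying to move away from, removed in CUDA 12; dev_A, N and the kernel are placeholders of mine):

// Deprecated texture-reference flow; builds only with pre-CUDA-12 toolkits.
texture<unsigned char, 1, cudaReadModeElementType> tex1;   // 1. declare

__global__ void readOne(unsigned char *out) {
    out[0] = tex1Dfetch(tex1, 2);                          // 3. fetch by index
}

int main() {
    const int N = 16;
    unsigned char *dev_A, *dev_out;
    cudaMalloc(&dev_A, N);
    cudaMalloc(&dev_out, 1);
    cudaMemset(dev_A, 7, N);

    cudaBindTexture(0, tex1, dev_A, N);                    // 2. bind linear device memory
    readOne<<<1, 1>>>(dev_out);
    cudaDeviceSynchronize();
    cudaUnbindTexture(tex1);                               // 4. unbind

    cudaFree(dev_A);
    cudaFree(dev_out);
    return 0;
}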

Related

What's the alternative for __match_any_sync on compute capability 6?

In the CUDA examples, e.g. here, __match_any_sync is used (the example originally used __match_all_sync; see the edit note in the answer below).
Here is an example where a warp is split into multiple (one or more) groups that each keep track of their own atomic counter.
// increment the value at ptr by 1 and return the old value
__device__ int atomicAggInc(int* ptr) {
    int pred;   // only used by the commented-out __match_all_sync variant below
    //const auto mask = __match_all_sync(__activemask(), ptr, &pred); //error, should be any_sync, not all_sync
    const auto mask = __match_any_sync(__activemask(), (unsigned long long)ptr);
    const auto leader = __ffs(mask) - 1;               // select a leader
    int res;
    const auto lane_id = threadIdx.x % warpSize;
    if (lane_id == leader) {                           // leader does the update
        res = atomicAdd(ptr, __popc(mask));
    }
    res = __shfl_sync(mask, res, leader);              // get leader's old value
    return res + __popc(mask & ((1 << lane_id) - 1));  // compute old value
}
The __match_any_sync here splits up the threads in the warp into groups that have the same ptr value, so that each group can update its own ptr atomically without getting in the way of other threads.
I know the nvcc compiler (since CUDA 9) does this sort of optimization under the hood automatically, but this is just about the mechanics of __match_any_sync.
Is there a way to do this before compute capability 7.0?
EDIT: The blog article has now been modified to reflect __match_any_sync() rather than __match_all_sync(), so any commentary to that effect below should be disregarded. The answer below is edited to reflect this.
Based on your statement:
this is just about the mechanics of __match_any_sync
we will focus on a replacement for __match_any_sync itself, not any other form of rewriting the atomicAggInc function. Therefore, we must provide a mask that has the same value as would be returned by __match_any_sync() on cc7.0 or higher architectures.
I believe this will require a loop that broadcasts each thread's ptr value and tests which threads have the same value, in the worst case one iteration for each thread in the warp (since each thread could have a unique ptr value). There are various ways we could "optimize" this loop for this function, so as to possibly reduce the trip count from 32 to some lesser value, based on the actual ptr values in each thread, but such optimization in my view introduces considerable complexity, which makes the worst-case processing time longer (as is typical of early-exit optimizations). So I will demonstrate a fairly simple method without this optimization.
The other consideration is what to do if the warp is not converged. For that, we can employ __activemask() to identify that case.
Here is a worked example:
$ cat t1646.cu
#include <iostream>
#include <stdio.h>

// increment the value at ptr by 1 and return the old value
__device__ int atomicAggInc(int* ptr) {
    int mask;
#if __CUDA_ARCH__ >= 700
    mask = __match_any_sync(__activemask(), (unsigned long long)ptr);
#else
    unsigned tmask = __activemask();
    for (int i = 0; i < warpSize; i++){
#ifdef USE_OPT
        if ((1U<<i) & tmask){
#endif
            unsigned long long tptr = __shfl_sync(tmask, (unsigned long long)ptr, i);
            unsigned my_mask = __ballot_sync(tmask, (tptr == (unsigned long long)ptr));
            if (i == (threadIdx.x & (warpSize-1))) mask = my_mask;}
#ifdef USE_OPT
        }
#endif
#endif
    int leader = __ffs(mask) - 1;                      // select a leader
    int res;
    unsigned lane_id = threadIdx.x % warpSize;
    if (lane_id == leader) {                           // leader does the update
        res = atomicAdd(ptr, __popc(mask));
    }
    res = __shfl_sync(mask, res, leader);              // get leader's old value
    return res + __popc(mask & ((1 << lane_id) - 1));  // compute old value
}

__global__ void k(int *d){
    int *ptr = d + threadIdx.x/4;
    if ((threadIdx.x >= 16) && (threadIdx.x < 32))
        atomicAggInc(ptr);
}

const int ds = 32;

int main(){
    int *d_d, *h_d;
    h_d = new int[ds];
    cudaMalloc(&d_d, ds*sizeof(d_d[0]));
    cudaMemset(d_d, 0, ds*sizeof(d_d[0]));
    k<<<1,ds>>>(d_d);
    cudaMemcpy(h_d, d_d, ds*sizeof(d_d[0]), cudaMemcpyDeviceToHost);
    for (int i = 0; i < ds; i++)
        std::cout << h_d[i] << " ";
    std::cout << std::endl;
}
$ nvcc -o t1646 t1646.cu -DUSE_OPT
$ cuda-memcheck ./t1646
========= CUDA-MEMCHECK
0 0 0 0 4 4 4 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
========= ERROR SUMMARY: 0 errors
$
(CentOS 7, CUDA 10.1.243, with device 0 being Tesla V100, device 1 being a cc3.5 device).
I've added an optional optimization for the case where the warp is diverged (i.e. tmask is not 0xFFFFFFFF). This can be selected by defining USE_OPT.

Why doesn't the console show the entire output? [duplicate]

For the purpose of testing the printf() call on the device, I wrote a simple program which copies an array of moderate size to the device and prints the values of the device array to the screen. Although the array is correctly copied to the device, the printf() function does not work correctly and loses the first several hundred numbers. The array size in the code is 4096. Is this a bug, or am I not using this function properly? Thanks in advance.
EDIT: My GPU is a GeForce GTX 550i, with compute capability 2.1.
My code:
#include <stdio.h>
#include <stdlib.h>

#define N 4096

__global__ void Printcell(float *d_Array, int n){
    int k = 0;
    printf("\n=========== data of d_Array on device==============\n");
    for (k = 0; k < n; k++){
        printf("%f ", d_Array[k]);
        if ((k+1)%6 == 0) printf("\n");
    }
    printf("\n\nTotally %d elements has been printed", k);
}

int main(){
    int i = 0;
    float Array[N] = {0}, rArray[N] = {0};
    float *d_Array;

    for (i = 0; i < N; i++)
        Array[i] = i;

    cudaMalloc((void**)&d_Array, N*sizeof(float));
    cudaMemcpy(d_Array, Array, N*sizeof(float), cudaMemcpyHostToDevice);
    cudaDeviceSynchronize();

    Printcell<<<1,1>>>(d_Array, N);    //Print the device array by a kernel
    cudaDeviceSynchronize();

    /* Copy the device array back to host to see if it was correctly copied */
    cudaMemcpy(rArray, d_Array, N*sizeof(float), cudaMemcpyDeviceToHost);
    printf("\n\n");
    for (i = 0; i < N; i++){
        printf("%f ", rArray[i]);
        if ((i+1)%6 == 0) printf("\n");
    }
}
printf from the device has a limited queue. It's intended for small scale debug-style output, not large scale output.
referring to the programmer's guide:
The output buffer for printf() is set to a fixed size before kernel launch (see Associated Host-Side API). It is circular and if more output is produced during kernel execution than can fit in the buffer, older output is overwritten.
Your in-kernel printf output overran the buffer, and so the first printed elements were lost (overwritten) before the buffer was dumped into the standard I/O queue.
The linked documentation indicates that the buffer size can be increased, also.
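If more output is needed, the buffer can be enlarged before the kernel launch with cudaDeviceSetLimit(); a minimal sketch compiled as a .cu file (the 8 MB figure is an arbitrary example, not a recommendation):

#include <stdio.h>

int main(void){
    // Query the current in-kernel printf buffer size (typically 1 MB by default)
    size_t printfSize = 0;
    cudaDeviceGetLimit(&printfSize, cudaLimitPrintfFifoSize);
    printf("printf FIFO size: %zu bytes\n", printfSize);

    // Enlarge it before launching any kernel that prints a lot
    cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 8 * 1024 * 1024);
    return 0;
}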

Shared memory, branching performance and register count

I came across some peculiar performance behaviour when trying out the CUDA shuffle instruction. The test kernel below is based on an image processing algorithm which adds input-dependent values to all neighbouring pixels within a square of side rad. The output for each block is accumulated in shared memory. If only one thread per warp adds its result to shared memory, the performance is poor (Option 1); on the other hand, if all threads add to shared memory (one thread adds the desired value, the rest just add 0), the execution time drops by a factor of 2-3 (Option 2).
#include <iostream>
#include "cuda_runtime.h"

#define warpSz 32
#define tileY 32
#define rad 32

__global__ void test(float *out, int pitch)
{
    // Set shared mem to 0
    __shared__ float tile[(warpSz + 2*rad) * (tileY + 2*rad)];
    for (int i = threadIdx.y*blockDim.x+threadIdx.x; i < (tileY+2*rad)*(warpSz+2*rad); i += blockDim.x*blockDim.y) {
        tile[i] = 0.0f;
    }
    __syncthreads();

    for (int row = threadIdx.y; row < tileY; row += blockDim.y) {
        // Loop over pixels in neighbourhood
        for (int i = 0; i < 2*rad+1; ++i) {
            float res = 0.0f;
            int rowStartIdx = (row+i)*(warpSz+2*rad);
            for (int j = 0; j < 2*rad+1; ++j) {
                res += float(threadIdx.x+row); // Substitute for real calculation

                // Option 1: one thread writes to shared mem
                if (threadIdx.x == 0) {
                    tile[rowStartIdx + j] += res;
                    res = 0.0f;
                }

                //// Option 2: all threads write to shared mem
                //float tmp = 0.0f;
                //if (threadIdx.x == 0) {
                //    tmp = res;
                //    res = 0.0f;
                //}
                //tile[rowStartIdx + threadIdx.x+j] += tmp;

                res = __shfl(res, (threadIdx.x+1) % warpSz);
            }
            res += float(threadIdx.x+row);
            tile[rowStartIdx + threadIdx.x+2*rad] += res;
            __syncthreads();
        }
    }

    // Add result back to global mem
    for (int row = threadIdx.y; row < tileY+2*rad; row += blockDim.y) {
        for (int col = threadIdx.x; col < warpSz+2*rad; col += warpSz) {
            int idx = (blockIdx.y*tileY + row)*pitch + blockIdx.x*warpSz + col;
            atomicAdd(out+idx, tile[row*(warpSz+2*rad) + col]);
        }
    }
}

int main(void)
{
    int2 dim = make_int2(512, 512);
    int pitchOut = (((dim.x+2*rad)+warpSz-1) / warpSz) * warpSz;
    int sizeOut = pitchOut*(dim.y+2*rad);
    dim3 gridDim((dim.x+warpSz-1)/warpSz, (dim.y+tileY-1)/tileY, 1);

    float *devOut;
    cudaMalloc((void**)&devOut, sizeOut*sizeof(float));

    cudaEvent_t start, stop;
    float elapsedTime;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaFree(0);

    cudaEventRecord(start, 0);
    test<<<gridDim, dim3(warpSz, 8)>>>(devOut, pitchOut);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);

    cudaFree(devOut);
    cudaDeviceReset();
    std::cout << "Elapsed time: " << elapsedTime << " ms.\n";
    std::cin.ignore();
}
Is this expected behaviour/can anyone explain why this happens?
One thing I have noted is that Option 1 uses only 15 registers, whereas Option 2 uses 37, which seems a big difference to me.
Another is that the if-statement in the innermost loop is converted to explicit bra instructions in the PTX code for Option 1, whereas for Option 2 it is converted to two selp instructions. Could it be that the explicit branching is behind the 2-3x slowdown, similar to what is suspected in this question?
There are two reasons why I am reluctant to go for Option 2. First, when profiling the original application it seems to be limited by shared memory bandwidth, which indicates that there is potential to increase performance by having fewer threads access it. Second, unless we use the volatile keyword, writes to shared memory can be optimised into registers. Since we are only interested in the contribution from the last thread to access each memory location (threadIdx.x == 0), and all others add 0, this is not a problem as long as all changes temporarily held in registers are guaranteed to be written back to shared memory in the same order they were issued. Is this the case though? (So far, both options have produced exactly the same result.)
Any thoughts or ideas are much appreciated!
PS. I compile for compute capability 3.0. (However, the shuffle instruction is not necessary to demonstrate the behaviour and can be commented out.)

Executing a CUDA kernel several times

I have a problem with executing a CUDA kernel several times. Something is wrong with the environment in my code. The first time the code works properly; the second time it runs, during clean-up of the environment before the third call, there are random crashes.
I think that for some reason I have memory corruption. The crashes occur sometimes in the CUDA driver, sometimes in a simple printf, or in the heap, or in kernel32.dll. I suspect I have a problem with memory management in my code.
What should be done before executing the kernel again?
This code works when I execute it once.
I'm using CURAND to initialize the random generators.
Here is my code:
#define GRID_BLOCK 64
#define GRID_THREAD 8
#define CITIES 100
#define CIPOW2 101

int lenghtPaths = GRID_BLOCK*GRID_THREAD;
int cities = CITIES;

//prepare CURAND
curandState *devStates;
CUDA_CALL(cudaMalloc((void **)&devStates, GRID_BLOCK*GRID_THREAD*sizeof(curandState)));

/* Setup prng states */
setup_kernel<<<GRID_BLOCK, GRID_THREAD>>>(devStates);
CUDA_CALL(cudaDeviceSynchronize());
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
    fprintf(stderr, "CURAND preparation failed: %s\n", cudaGetErrorString(cudaStatus));

//copy distance grid to constant memory
cudaMemcpyToSymbol(cdist, dist, sizeof(int) * CIPOW2*CIPOW2);

CUDA_CALL(cudaMalloc((void**)&dev_pathsForThreads, lenghtPaths * cities * sizeof(int)));
CUDA_CALL(cudaMalloc((void**)&d_results, GRID_BLOCK*GRID_THREAD * sizeof(int)));

for (int k = 0; k < 5; k++){
    int* pathsForThreads;
    pathsForThreads = (int*)malloc(lenghtPaths * cities * sizeof(int));
    pathsForThreads = PreaparePaths(Path, lenghtPaths, cities);

    CUDA_CALL(cudaMemcpy(dev_pathsForThreads, pathsForThreads, lenghtPaths*cities*sizeof(int), cudaMemcpyHostToDevice));

    GPUAnnealing<<<GRID_BLOCK, GRID_THREAD>>>(dev_pathsForThreads, devStates, iterationLimit, temperature, coolingRate, absoluteTemperature, cities, d_results);

    CUDA_CALL(cudaDeviceSynchronize());
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess)
        fprintf(stderr, "GPUAnnealing launch failed: %s\n", cudaGetErrorString(cudaStatus));

    h_results = (int*)malloc(GRID_BLOCK*GRID_THREAD * sizeof(int));

    //Copy lenght of each path to CPU
    CUDA_CALL(cudaMemcpy(h_results, d_results, GRID_BLOCK*GRID_THREAD * sizeof(int), cudaMemcpyDeviceToHost));

    //Copy paths to CPU
    CUDA_CALL(cudaMemcpy(pathsForThreads, dev_pathsForThreads, lenghtPaths*cities*sizeof(int), cudaMemcpyDeviceToHost));

    //check the shortest path
    shortestPath = FindTheShortestPath(h_results);
    fprintf(stdout, "Shortest path on index = %d value = %d \n", shortestPath, h_results[shortestPath]);
    for (int i = 0; i < GRID_BLOCK*GRID_BLOCK; i++)
        Path[i] = pathsForThreads[shortestPath*CITIES + i];

    free(pathsForThreads);
    free(h_results);
}
CUDA_CALL(cudaFree(dev_pathsForThreads));
CUDA_CALL(cudaFree(d_results));
CUDA_CALL(cudaFree(devStates));
CUDA_CALL(cudaDeviceReset());
This is a bad idea:
pathsForThreads = (int*)malloc(lenghtPaths * cities * sizeof(int));
pathsForThreads = PreaparePaths(Path, lenghtPaths, cities);
If the call to PreaparePaths assigns some other value to pathsForThreads than what was assigned to it by the malloc operation, then later when you do this:
free(pathsForThreads);
You're going to get unpredictable results.
You should not reassign a pointer that you're subsequently going to pass to free to some other value. The man page for free indicates:
free() frees the memory space pointed to by ptr, which must have been returned by a previous call to malloc(), calloc() or realloc().
So reassigning the pointer to something else is not allowed if you intend to pass it to free.
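The usual fix is either to drop the malloc() entirely (if PreaparePaths allocates and returns its own buffer, free that buffer instead) or to keep the malloc'd pointer intact and copy the prepared data into it. A minimal sketch of the second option (preparePathsExample is a hypothetical stand-in for PreaparePaths):

#include <stdlib.h>
#include <string.h>

// Hypothetical stand-in for PreaparePaths(): returns a pointer to data it owns.
static int scratch[64];
int* preparePathsExample(int n){
    for (int i = 0; i < n; i++) scratch[i] = i;
    return scratch;
}

int main(void){
    const int n = 64;
    int* paths = (int*)malloc(n * sizeof(int));               // pointer that will be freed
    memcpy(paths, preparePathsExample(n), n * sizeof(int));   // copy, don't reassign
    /* ... use paths, cudaMemcpy it to the device, etc. ... */
    free(paths);   // frees exactly the block malloc() returned
    return 0;
}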

CUDA-GDB crashes in Kernel

I've been trying to debug my code, as I know something is going wrong in the Kernel, and I've been trying to figure out what specifically. If I try to step into the kernel it seems to completely step over the kernel functions, and will eventually cause an error on quitting:
Single stepping until exit from function dyld_stub_cudaSetupArgument,
which has no line number information.
[Launch of CUDA Kernel 0 (incrementArrayOnDevice<<<(3,1,1),(4,1,1)>>>) on
Device 0]
[Termination of CUDA Kernel 0 (incrementArrayOnDevice<<<(3,1,1),
(4,1,1)>>>) on Device 0]
[Launch of CUDA Kernel 1 (fillinBoth<<<(40,1,1),(1,1,1)>>>) on Device 0]
[Termination of CUDA Kernel 1 (fillinBoth<<<(40,1,1),(1,1,1)>>>) on Device 0]
add (below=0x124400, newtip=0x124430, newfork=0x125ac0) at test.cu:1223
And if I try to break in the Kernel my entire computer crashes and I have to restart it.
I figure there must be something wrong with the way I'm calling the kernel, but I can't figure out what.
The code is rather long, so I'm only including an excerpt of it:
__global__ void fillinOne(seqptr qset, long max) {
    int i, j;
    aas aa;
    int idx = blockIdx.x;
    __shared__ long qs[3];

    if (idx < max)
    {
        memcpy(qs, qset[idx], sizeof(long[3]));
        for (i = 0; i <= 1; i++)
        {
            for (aa = ala; (long)aa <= (long)stop; aa = (aas)((long)aa + 1))
            {
                if (((1L << ((long)aa)) & qs[i]) != 0)
                {
                    for (j = i + 1; j <= 2; j++)
                        qs[j] |= cudaTranslate[(long)aa - (long)ala][j - i];
                }
            }
        }
    }
}

//Kernel for left != NULL and rt != NULL
void fillin(node *p, node *left, node *rt)
{
    cudaError_t err = cudaGetLastError();
    size_t stepsize = chars * sizeof(long);
    size_t sitesize = chars * sizeof(sitearray);
    //int i, j;
    if (left == NULL)
    {
        //copy rt->numsteps into p->numsteps--doesn't actually require CUDA, because no computation to do
        memcpy(p->numsteps, rt->numsteps, stepsize);
        checkCUDAError("memcpy");

        //allocate siteset (array of sitearrays) on device
        seqptr qsites;  //as in array of qs's
        cudaMalloc((void **) &qsites, sitesize);
        checkCUDAError("malloc");

        //copy rt->siteset into device array (equivalent to memcpy(qs, rs) but for whole array)
        cudaMemcpy(qsites, rt->siteset, sitesize, cudaMemcpyHostToDevice);
        checkCUDAError("memcpy");

        //do loop in device
        int block_size = 1; //each site operated on independently
        int n_blocks = chars;
        fillinOne<<<n_blocks, block_size>>>(qsites, chars);
        cudaThreadSynchronize();

        //put qset in p->siteset--equivalent to memcpy(p->siteset[m], qs)
        cudaMemcpy(p->siteset, qsites, sitesize, cudaMemcpyDeviceToHost);
        checkCUDAError("memcpy");

        //Cleanup
        cudaFree(qsites);
    }
If anyone has any ideas at all, please respond! Thanks in advance!
I suppose you have a single-card configuration. When you are debugging a CUDA kernel and you break inside it, you effectively pause the display driver. That causes what you think is a crash. If you want to use cuda-gdb with only one graphics card, you must use it in command-line mode (don't start X, or switch away from X with ctrl-alt-Fn).
If you have two cards, you must run the code on the card that is not driving the display. Use cudaSetDevice(n).
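A minimal sketch of selecting a device explicitly (the index 1 is just an assumption; use cudaGetDeviceProperties() or nvidia-smi to work out which GPU is driving the display):

#include <stdio.h>

int main(void){
    int count = 0;
    cudaGetDeviceCount(&count);
    printf("Found %d CUDA device(s)\n", count);

    // Pick the card that is not connected to the display (index assumed here)
    cudaSetDevice(1);

    int dev = -1;
    cudaGetDevice(&dev);
    printf("Running on device %d\n", dev);
    return 0;
}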