I have some NVVM code that I am trying to compile to PTX using nvrtc (i.e. using nvvmCompileProgram, nvvmGetCompiledResult).
Here is the nvvm code:
; ModuleID = 'test_warp_reduce'
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64-unknown-cuda"
define ptx_kernel void #lambda_crit_4197([0 x float]* %_4200_4590, [0 x i64]* %_4201_4591, [0 x float]* %_4202_4592) {
acc_bidx:
%0 = tail call ptx_device i32 #llvm.nvvm.read.ptx.sreg.tid.x()
%1 = tail call ptx_device i32 #llvm.nvvm.read.ptx.sreg.ntid.x()
%2 = tail call ptx_device i32 #llvm.nvvm.read.ptx.sreg.ctaid.x()
%3 = mul nsw i32 %2, %1
%4 = add nsw i32 %3, %0
%5 = icmp slt i32 %4, 32
br i1 %5, label %if_then12, label %next
if_then12: ; preds = %acc_bidx
%6 = getelementptr inbounds [0 x float]* %_4202_4592, i64 0, i32 %4
%7 = load float* %6
%8 = tail call i64 #clock()
%9 = tail call float #reduce_step(float %7, i32 1, i32 31)
%10 = tail call float #reduce_step(float %9, i32 2, i32 31)
%11 = tail call float #reduce_step(float %10, i32 4, i32 31)
%12 = tail call float #reduce_step(float %11, i32 8, i32 31)
%13 = tail call float #reduce_step(float %12, i32 16, i32 31)
%14 = tail call i64 #clock()
%15 = getelementptr inbounds [0 x float]* %_4200_4590, i64 0, i32 %4
%16 = getelementptr inbounds [0 x i64]* %_4201_4591, i64 0, i32 %0
%17 = sub nsw i64 %14, %8
store i64 %17, i64* %16
store float %13, float* %15
br label %next
next: ; preds = %acc_bidx, %if_then12
ret void
}
declare i64 #llvm.nvvm.texsurf.handle.p1i64(metadata, i64 addrspace(1)*)
; Function Attrs: nounwind readnone
declare ptx_device i32 #llvm.nvvm.read.ptx.sreg.tid.x()
; Function Attrs: nounwind readnone
declare ptx_device i32 #llvm.nvvm.read.ptx.sreg.ntid.x()
; Function Attrs: nounwind readnone
declare ptx_device i32 #llvm.nvvm.read.ptx.sreg.ctaid.x()
define i64 #clock() {
%1 = call i64 asm "mov.u32 $0, %clock;", "=r" ()
ret i64 %1
}
define float #reduce_step(float %a, i32 %b, i32 %c) {
%1 = call float asm
"{ .reg .pred p;
.reg .f32 r0;
shfl.down.b32 r0|p, $1, $2, $3;
#p add.f32 r0, r0, $1;
mov.f32 $0, r0;
}", "=f, f, r, r" (float %a, i32 %b, i32 %c)
ret float %1
}
!nvvmir.version = !{!0}
!nvvm.annotations = !{!1}
!0 = metadata !{i32 1, i32 2}
!1 = metadata !{void ([0 x float]*, [0 x i64]*, [0 x float]*)* #lambda_crit_4197, metadata !"kernel", i64 1}
And here is the generated ptx code:
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-19324574
// Cuda compilation tools, release 7.0, V7.0.27
// Based on LLVM 3.4svn
//
.version 4.2
.target sm_52
.address_size 64
// .globl lambda_crit_4197
.visible .entry lambda_crit_4197(
.param .u64 lambda_crit_4197_param_0,
.param .u64 lambda_crit_4197_param_1,
.param .u64 lambda_crit_4197_param_2
)
{
.reg .pred %p<2>;
.reg .f32 %f<11>;
.reg .s32 %r<15>;
.reg .s64 %rd<13>;
ld.param.u64 %rd1, [lambda_crit_4197_param_0];
ld.param.u64 %rd2, [lambda_crit_4197_param_1];
ld.param.u64 %rd3, [lambda_crit_4197_param_2];
mov.u32 %r1, %tid.x;
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mad.lo.s32 %r2, %r3, %r4, %r1;
setp.gt.s32 %p1, %r2, 31;
#%p1 bra BB0_2;
cvta.to.global.u64 %rd4, %rd3;
mul.wide.s32 %rd5, %r2, 4;
add.s64 %rd6, %rd4, %rd5;
ld.global.f32 %f2, [%rd6];
mov.u32 %r5, 1;
mov.u32 %r14, 31;
// inline asm
{ .reg .pred p;
.reg .f32 r0;
shfl.down.b32 r0|p, %f2, %r5, %r14;
#p add.f32 r0, r0, %f2;
mov.f32 %f1, r0;
}
// inline asm
mov.u32 %r7, 2;
// inline asm
{ .reg .pred p;
.reg .f32 r0;
shfl.down.b32 r0|p, %f1, %r7, %r14;
#p add.f32 r0, r0, %f1;
mov.f32 %f3, r0;
}
// inline asm
mov.u32 %r9, 4;
// inline asm
{ .reg .pred p;
.reg .f32 r0;
shfl.down.b32 r0|p, %f3, %r9, %r14;
#p add.f32 r0, r0, %f3;
mov.f32 %f5, r0;
}
// inline asm
mov.u32 %r11, 8;
// inline asm
{ .reg .pred p;
.reg .f32 r0;
shfl.down.b32 r0|p, %f5, %r11, %r14;
#p add.f32 r0, r0, %f5;
mov.f32 %f7, r0;
}
// inline asm
mov.u32 %r13, 16;
// inline asm
{ .reg .pred p;
.reg .f32 r0;
shfl.down.b32 r0|p, %f7, %r13, %r14;
#p add.f32 r0, r0, %f7;
mov.f32 %f9, r0;
}
// inline asm
cvta.to.global.u64 %rd7, %rd1;
add.s64 %rd8, %rd7, %rd5;
cvta.to.global.u64 %rd9, %rd2;
mul.wide.s32 %rd10, %r1, 8;
add.s64 %rd11, %rd9, %rd10;
mov.u64 %rd12, 0;
st.global.u64 [%rd11], %rd12;
st.global.f32 [%rd8], %f9;
BB0_2:
ret;
}
// .globl clock
.visible .func (.param .b64 func_retval0) clock(
)
{
.reg .s32 %r<2>;
.reg .s64 %rd<2>;
// inline asm
mov.u32 %r1, %clock;
// inline asm
cvt.u64.u32 %rd1, %r1;
st.param.b64 [func_retval0+0], %rd1;
ret;
}
// .globl reduce_step
.visible .func (.param .b32 func_retval0) reduce_step(
.param .b32 reduce_step_param_0,
.param .b32 reduce_step_param_1,
.param .b32 reduce_step_param_2
)
{
.reg .f32 %f<3>;
.reg .s32 %r<3>;
ld.param.f32 %f2, [reduce_step_param_0];
ld.param.u32 %r1, [reduce_step_param_1];
ld.param.u32 %r2, [reduce_step_param_2];
// inline asm
{ .reg .pred p;
.reg .f32 r0;
shfl.down.b32 r0|p, %f2, %r1, %r2;
#p add.f32 r0, r0, %f2;
mov.f32 %f1, r0;
}
// inline asm
st.param.f32 [func_retval0+0], %f1;
ret;
}
It seems that the nvvm compiler just eliminates code for mysterious reasons. For example, the calls for the clock function weren't emitted at all.
Whether I used the compiler optimization or not doesn't make a difference in the provided code.
Somebody told me that Cuda 7.5 had some similar issues (assembly not being emitted) on Windows. So I downgraded to 7.0. However, the problem is still in there.
Any clue why this might be the case?
I can tell from experience that PTX code only calls builtin functions. User defined functions get inlined into the calling functions.
I can't seem to find the proper documentation for it right now, but I'll link it in when I find it.
In your code base there are many places code segments like this repeat:
// inline asm
mov.u32 %r7, 2;
// inline asm
{ .reg .pred p;
.reg .f32 r0;
shfl.down.b32 r0|p, %f1, %r7, %r14;
#p add.f32 r0, r0, %f1;
mov.f32 %f3, r0;
}
Does this look familiar ? The first line one is coming from clock, the second block is coming from reduce_step.
TL;DR: You don't see the calls because they got inlined.
Related
I'm trying to implement string matching program with CUDA in C and I have th following issue.
When I set 1 block and 1 thread per block the result for pattern dfh is 2. That's correct, but when I increase the blocks the result is 4.
The text file is:
ffskdfhksdjhfksdfksjdfhksdhfksjdhfkjer654yrkhjkfgjhdsrtrhkjchgkjthyoirthygfnbkjgkjdhykhkjchgkjfdhsfykhkbhkjfghkfgjy
This is my code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>
__global__ void string_matching(char *buffer, char *pattern, int match_size, int pattern_size, int *result){
int tid, i;
__shared__ int local_matches;
if(threadIdx.x == 0) local_matches = 0;
__syncthreads();
for(tid=blockIdx.x*blockDim.x+threadIdx.x; tid<match_size; tid+=blockDim.x){
for (i = 0; i < pattern_size && pattern[i] == buffer[i + tid]; ++i);
if(i >= pattern_size){
atomicAdd(&local_matches, 1);
}
}
__syncthreads();
if(threadIdx.x == 0)
atomicAdd(result, local_matches);
}
int main(int argc, char *argv[]){
FILE *pFile;
long file_size, match_size, pattern_size;
char * buffer;
char * filename, *pattern;
size_t result;
int *match, total_matches;
//CUDA variables
int blocks, threads_per_block;
int *result_dev;
char *buffer_dev, *pattern_dev;
float total_time, comp_time;
cudaEvent_t total_start, total_stop, comp_start, comp_stop;
cudaEventCreate(&total_start);
cudaEventCreate(&total_stop);
cudaEventCreate(&comp_start);
cudaEventCreate(&comp_stop);
if (argc != 5) {
printf ("Usage : %s <file_name> <string> <blocks> <threads_per_block>\n", argv[0]);
return 1;
}
filename = argv[1];
pattern = argv[2];
blocks = strtol(argv[3], NULL, 10);
threads_per_block = strtol(argv[4], NULL, 10);
pFile = fopen ( filename , "rb" );
if (pFile==NULL) {printf ("File error\n"); return 2;}
// obtain file size:
fseek (pFile , 0 , SEEK_END);
file_size = ftell (pFile);
rewind (pFile);
printf("file size is %ld\n", file_size);
// allocate memory to contain the file:
buffer = (char*) malloc (sizeof(char)*file_size);
if (buffer == NULL) {printf ("Memory error\n"); return 3;}
// copy the file into the buffer:
result = fread (buffer,1,file_size,pFile);
if (result != file_size) {printf ("Reading error\n"); return 4;}
pattern_size = strlen(pattern);
match_size = file_size - pattern_size + 1;
match = (int *) malloc (sizeof(int)*match_size);
if (match == NULL) {printf ("Malloc error\n"); return 5;}
cudaMalloc((void **)&result_dev, sizeof(int));
cudaMalloc((void **)&buffer_dev, file_size*sizeof(char));
cudaMalloc((void **)&pattern_dev, pattern_size*sizeof(char));
cudaEventRecord(total_start);
cudaEventRecord(comp_start);
cudaMemcpy(buffer_dev, buffer, file_size*sizeof(char), cudaMemcpyHostToDevice);
cudaMemcpy(pattern_dev, pattern, pattern_size*sizeof(char), cudaMemcpyHostToDevice);
string_matching<<<blocks, threads_per_block>>>(buffer_dev, pattern_dev, match_size, pattern_size, result_dev);
cudaThreadSynchronize();
cudaEventRecord(comp_stop);
cudaEventSynchronize(comp_stop);
cudaEventElapsedTime(&comp_time, comp_start, comp_stop);
cudaMemcpy(&total_matches, result_dev, sizeof(int), cudaMemcpyDeviceToHost);
cudaEventRecord(total_stop);
cudaEventSynchronize(total_stop);
cudaEventElapsedTime(&total_time, total_start, total_stop);
cudaFree(result_dev);
cudaFree(buffer_dev);
cudaFree(pattern_dev);
fclose (pFile);
free (buffer);
//Print result
printf("Total matches: %d\n", total_matches);
printf("\n\n\nN: %d, Blocks: %d, Threads: %d\n", file_size, blocks, blocks*threads_per_block);
printf("Total time (ms): %.3f\n", total_time);
printf("Kernel time (ms): %.3f\n", comp_time);
printf("Data transfer time(ms): %.3f\n\n\n", total_time-comp_time);
}
You need to synchronize threads of the same block using __syncthreads. For example, local_matches = 0 can theoretically be done concurrently to the atomicAdd of the main loop. Thus, a __syncthreads is needed between both. For the same reason, you also need a __syncthreads before the last if(threadIdx.x == 0). I am not sure this is the only error.
I advise you to use CUDA-GDB to track such bug since the kernel is quite short and relatively simple.
Note that a local_matches do not need to be shared. In fact, it is not efficient to do that. On can perform the reduction in local memory and then perform a final atomicAdd. Additionally, you do not need the innermost conditional. You can simply do: local_matches += i >= pattern_size; (the compiler might already do such optimization).
I finally found a solution for this.
I set a match table with 0 values for every buffer position, set 1 for every position that pattern found, and added the 1's in the CPU.
If you think something better please add an answer.
This is the code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>
__global__ void string_matching(char *buffer, char *pattern, int match_size, int pattern_size, int *match){
int tid, i;
for(tid=blockIdx.x*blockDim.x+threadIdx.x; tid<match_size; tid+=blockDim.x){
for (i = 0; i < pattern_size && pattern[i] == buffer[i + tid]; ++i);
if(i >= pattern_size){
match[tid] = 1;
}
else{
match[tid] = 0;
}
}
}
int main(int argc, char *argv[]){
FILE *pFile;
int i;
long file_size, match_size, pattern_size;
char * buffer;
char * filename, *pattern;
size_t result;
int *match, total_matches;
//CUDA variables
int blocks, threads_per_block;
int *match_dev;
char *buffer_dev, *pattern_dev;
float total_time, comp_time;
cudaEvent_t total_start, total_stop, comp_start, comp_stop;
cudaEventCreate(&total_start);
cudaEventCreate(&total_stop);
cudaEventCreate(&comp_start);
cudaEventCreate(&comp_stop);
if (argc != 5) {
printf ("Usage : %s <file_name> <string> <blocks> <threads_per_block>\n", argv[0]);
return 1;
}
filename = argv[1];
pattern = argv[2];
blocks = strtol(argv[3], NULL, 10);
threads_per_block = strtol(argv[4], NULL, 10);
pFile = fopen ( filename , "rb" );
if (pFile==NULL) {printf ("File error\n"); return 2;}
// obtain file size:
fseek (pFile , 0 , SEEK_END);
file_size = ftell (pFile);
rewind (pFile);
printf("file size is %ld\n", file_size);
// allocate memory to contain the file:
buffer = (char*) malloc (sizeof(char)*file_size);
if (buffer == NULL) {printf ("Memory error\n"); return 3;}
// copy the file into the buffer:
result = fread (buffer,1,file_size,pFile);
if (result != file_size) {printf ("Reading error\n"); return 4;}
pattern_size = strlen(pattern);
match_size = file_size - pattern_size + 1;
match = (int *) malloc (sizeof(int)*match_size);
if (match == NULL) {printf ("Malloc error\n"); return 5;}
cudaMalloc((void **)&match_dev, match_size*sizeof(int));
cudaMalloc((void **)&buffer_dev, file_size*sizeof(char));
cudaMalloc((void **)&pattern_dev, pattern_size*sizeof(char));
cudaEventRecord(total_start);
cudaEventRecord(comp_start);
cudaMemcpy(buffer_dev, buffer, file_size*sizeof(char), cudaMemcpyHostToDevice);
cudaMemcpy(pattern_dev, pattern, pattern_size*sizeof(char), cudaMemcpyHostToDevice);
string_matching<<<blocks, threads_per_block>>>(buffer_dev, pattern_dev, match_size, pattern_size, match_dev);
cudaThreadSynchronize();
cudaEventRecord(comp_stop);
cudaEventSynchronize(comp_stop);
cudaEventElapsedTime(&comp_time, comp_start, comp_stop);
cudaMemcpy(match, match_dev, match_size*sizeof(int), cudaMemcpyDeviceToHost);
cudaEventRecord(total_stop);
cudaEventSynchronize(total_stop);
cudaEventElapsedTime(&total_time, total_start, total_stop);
total_matches = 0;
for(i=0; i<match_size; i++){
total_matches += match[i];
}
cudaFree(match_dev);
cudaFree(buffer_dev);
cudaFree(pattern_dev);
fclose (pFile);
free (buffer);
//Print result
printf("Total matches: %d\n", total_matches);
printf("\n\n\nN: %d, Blocks: %d, Threads: %d\n", file_size, blocks, blocks*threads_per_block);
printf("Total time (ms): %.3f\n", total_time);
printf("Kernel time (ms): %.3f\n", comp_time);
printf("Data transfer time(ms): %.3f\n\n\n", total_time-comp_time);
}
I think this is a better solution.
It works only for power of 2 threads per block.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>
#define MAX_THREADS_PER_BLOCK 100
__global__ void string_matching(char *buffer, char *pattern, int match_size, int pattern_size, int blocks, int slice, int extra, int *gout){
int tid, i;
int thread_index = blockIdx.x*blockDim.x + threadIdx.x;
int start = thread_index*slice;
int stop = start + slice;
if(thread_index == blocks*blockDim.x - 1){
stop += extra;
}
if(stop > match_size){
stop = match_size;
}
__shared__ int r[MAX_THREADS_PER_BLOCK];
int sum = 0;
for(tid=start; tid<stop; tid++){
for (i = 0; i < pattern_size && pattern[i] == buffer[i + tid]; ++i);
if(i >= pattern_size){
sum++;
}
}
r[threadIdx.x] = sum;
__syncthreads();
//works only for power of 2 threads_per_block
for (int size = blockDim.x/2; size>0; size/=2) { //uniform
if (threadIdx.x<size)
r[threadIdx.x] += r[threadIdx.x+size];
__syncthreads();
}
printf("Block: %d, Thread: %d, Global Thread: %d, Start: %d, Stop: %d, Matches: %d, Block Matches: %d\n", blockIdx.x, threadIdx.x, thread_index, start, stop, r[threadIdx.x], r[0]);
if(threadIdx.x == 0){
gout[blockIdx.x] = r[0];
}
}
int main(int argc, char *argv[]){
int i;
FILE *pFile;
long file_size, match_size, pattern_size;
char * buffer;
char * filename, *pattern;
size_t result;
int *results;
int total_matches;
//CUDA variables
int blocks, threads_per_block, total_threads, slice, extra;
int *results_dev;
char *buffer_dev, *pattern_dev;
float total_time, comp_time;
cudaEvent_t total_start, total_stop, comp_start, comp_stop;
cudaEventCreate(&total_start);
cudaEventCreate(&total_stop);
cudaEventCreate(&comp_start);
cudaEventCreate(&comp_stop);
if (argc != 5) {
printf ("Usage : %s <file_name> <string> <blocks> <threads_per_block>\n", argv[0]);
return 1;
}
filename = argv[1];
pattern = argv[2];
blocks = strtol(argv[3], NULL, 10);
threads_per_block = strtol(argv[4], NULL, 10);
pFile = fopen ( filename , "rb" );
if (pFile==NULL) {printf ("File error\n"); return 2;}
// obtain file size:
fseek (pFile , 0 , SEEK_END);
file_size = ftell (pFile);
rewind (pFile);
printf("file size is %ld\n", file_size);
// allocate memory to contain the file:
buffer = (char*) malloc (sizeof(char)*file_size);
if (buffer == NULL) {printf ("Memory error\n"); return 3;}
// copy the file into the buffer:
result = fread (buffer,1,file_size,pFile);
if (result != file_size) {printf ("Reading error\n"); return 4;}
pattern_size = strlen(pattern);
match_size = file_size - pattern_size + 1;
results = (int *)malloc(blocks*sizeof(int));
cudaMalloc((void **)&results_dev, blocks*sizeof(int));
cudaMalloc((void **)&buffer_dev, file_size*sizeof(char));
cudaMalloc((void **)&pattern_dev, pattern_size*sizeof(char));
cudaEventRecord(total_start);
cudaEventRecord(comp_start);
cudaMemcpy(buffer_dev, buffer, file_size*sizeof(char), cudaMemcpyHostToDevice);
cudaMemcpy(pattern_dev, pattern, pattern_size*sizeof(char), cudaMemcpyHostToDevice);
total_threads = blocks*threads_per_block;
slice = match_size/total_threads;
extra = match_size%total_threads;
string_matching<<<blocks, threads_per_block>>>(buffer_dev, pattern_dev, match_size, pattern_size, blocks, slice, extra, results_dev);
cudaEventRecord(comp_stop);
cudaEventSynchronize(comp_stop);
cudaEventElapsedTime(&comp_time, comp_start, comp_stop);
cudaMemcpy(results, results_dev, blocks*sizeof(int), cudaMemcpyDeviceToHost);
total_matches = 0;
for(i=0; i<blocks; i++){
total_matches += results[i];
}
cudaEventRecord(total_stop);
cudaEventSynchronize(total_stop);
cudaEventElapsedTime(&total_time, total_start, total_stop);
cudaFree(results_dev);
cudaFree(buffer_dev);
cudaFree(pattern_dev);
fclose (pFile);
free (buffer);
//Print result
printf("Total matches: %d\n", total_matches);
printf("\n\n\nN: %d, Blocks: %d, Threads: %d\n", file_size, blocks, blocks*threads_per_block);
printf("Total time (ms): %.3f\n", total_time);
printf("Kernel time (ms): %.3f\n", comp_time);
printf("Data transfer time(ms): %.3f\n\n\n", total_time-comp_time);
}
The stride on your loop is not correct for using multiple blocks, i.e. both blocks were doing the full amount of work (and therefore finding two entries each). The correct grid-stride loop looks like the following
for (int tid = blockIdx.x * blockDim.x + threadIdx.x;
tid < match_size;
tid += blockDim.x * gridDim.x /* <-- fix */) {
// ...
}
I have a kernel that takes several arrays as input. To improve readability it would be nice to group them into a struct and (after proper memory allocation and copy for each input) pass the struct to the kernel instead of the long list of pointers.
Is it going to be the same in the 2 cases, memory-wise, when accessing the arrays inside the kernel?
Can anyone recommend me some documentation on this topic (Couldn't find it on the programming guide)
No, there should be no difference. You can read the PTX output to make sure. Here is a simple example:
struct Foo
{
int* a, *b, *c;
};
__global__ void bar(Foo f)
{ f.a[0] = f.b[0] + f.c[0]; }
__global__ void baz(int* a, int* b, int* c)
{ a[0] = b[0] + c[0]; }
struct Quz
{
int* a, *b, *c;
~Quz() {}
};
__global__ void quuz(Quz f)
{ f.a[0] = f.b[0] + f.c[0]; }
And here is the PTX assembly. Note how there is basically no difference between the functions.
.visible .entry _Z3bar3Foo(
.param .align 8 .b8 _Z3bar3Foo_param_0[24]
)
{
.reg .b32 %r<4>;
.reg .b64 %rd<7>;
ld.param.u64 %rd1, [_Z3bar3Foo_param_0+16];
ld.param.u64 %rd2, [_Z3bar3Foo_param_0+8];
ld.param.u64 %rd3, [_Z3bar3Foo_param_0];
cvta.to.global.u64 %rd4, %rd3;
cvta.to.global.u64 %rd5, %rd2;
cvta.to.global.u64 %rd6, %rd1;
ld.global.u32 %r1, [%rd5];
ld.global.u32 %r2, [%rd6];
add.s32 %r3, %r2, %r1;
st.global.u32 [%rd4], %r3;
ret;
}
.visible .entry _Z3bazPiS_S_(
.param .u64 _Z3bazPiS_S__param_0,
.param .u64 _Z3bazPiS_S__param_1,
.param .u64 _Z3bazPiS_S__param_2
)
{
.reg .b32 %r<4>;
.reg .b64 %rd<7>;
ld.param.u64 %rd1, [_Z3bazPiS_S__param_0];
ld.param.u64 %rd2, [_Z3bazPiS_S__param_1];
ld.param.u64 %rd3, [_Z3bazPiS_S__param_2];
cvta.to.global.u64 %rd4, %rd1;
cvta.to.global.u64 %rd5, %rd3;
cvta.to.global.u64 %rd6, %rd2;
ld.global.u32 %r1, [%rd6];
ld.global.u32 %r2, [%rd5];
add.s32 %r3, %r2, %r1;
st.global.u32 [%rd4], %r3;
ret;
}
.visible .entry _Z4quuz3Quz(
.param .align 8 .b8 _Z4quuz3Quz_param_0[24]
)
{
.reg .b32 %r<4>;
.reg .b64 %rd<7>;
ld.param.u64 %rd1, [_Z4quuz3Quz_param_0+16];
ld.param.u64 %rd2, [_Z4quuz3Quz_param_0+8];
ld.param.u64 %rd3, [_Z4quuz3Quz_param_0];
cvta.to.global.u64 %rd4, %rd3;
cvta.to.global.u64 %rd5, %rd2;
cvta.to.global.u64 %rd6, %rd1;
ld.global.u32 %r1, [%rd5];
ld.global.u32 %r2, [%rd6];
add.s32 %r3, %r2, %r1;
st.global.u32 [%rd4], %r3;
ret;
}
It all works the same because CUDA puts all arguments into "constant memory" and accesses them through specialized memory load functions that go through the "constant cache."
I have an array of 300,000 points and I want the fft of every 600 points. I'm attempting to use cufftPlanMany to execute, but I'm getting an unknown error here:
cufftSafeCall(cufftPlanMany(&plan, rank, n, NULL, istride, idist, NULL, 1,1, CUFFT_C2C, 500));
retrevialfft.cu(82) : cufftSafeCall() CUFFT error: <unknown>
Here's the code in context
cudaSetDevice(0);
// Allocate host memory for the signal
cufftComplex* h_signal=(cufftComplex*)malloc(sizeof(cufftComplex) * SIGNAL_SIZE);
// Initalize the memory for the signal
for (unsigned int i = 0; i < SIGNAL_SIZE; ++i) {
h_signal[i].x = rand() / (float)RAND_MAX;
h_signal[i].y = 0;
// printf("Orignal: %f %f \n", h_signal[i].x, h_signal[i].y);
}
int mem_size = sizeof(cufftComplex) * SIGNAL_SIZE;
// Allocate device memory for signal
cufftComplex* d_signal;
cudaMalloc((void**)&d_signal, mem_size);
int rank = 1; //1d plan
int numCols = 300000;
int n[] = {numCols};
int batch = 500;
int istride = 1;
int ostride = 1;
int idist = numCols;
// CUFFT plan
cufftHandle plan;
cufftSafeCall(cufftPlanMany(&plan, rank, n, NULL, istride, idist, NULL, 1,1, CUFFT_C2C, 500));
// Transform signal
printf("Transforming signal cufftExecC2C\n");
cufftSafeCall(cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_FORWARD));
// Copy device memory to host
cufftComplex* h_transformed = (cufftComplex*)malloc(sizeof(cufftComplex) * SIGNAL_SIZE);;
cudaMemcpy(h_transformed, d_signal, mem_size,
cudaMemcpyDeviceToHost);
//Destroy CUFFT context
cufftDestroy(plan);
// cleanup memory
free(h_signal);
free(h_transformed);
cudaFree(d_signal);
cudaDeviceReset();
Any idea of what the error actually is?
You decided not to show any more detail on your question. Below, I'm providing a full working code using cufftPlanMany() to execute batched 1D FFTs. I hope it helps.
#include <stdio.h>
#include <stdlib.h>
#include <cufft.h>
#include <assert.h>
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) { getchar(); exit(code); }
}
}
/*********************/
/* CUFFT ERROR CHECK */
/*********************/
static const char *_cudaGetErrorEnum(cufftResult error)
{
switch (error)
{
case CUFFT_SUCCESS:
return "CUFFT_SUCCESS";
case CUFFT_INVALID_PLAN:
return "CUFFT_INVALID_PLAN";
case CUFFT_ALLOC_FAILED:
return "CUFFT_ALLOC_FAILED";
case CUFFT_INVALID_TYPE:
return "CUFFT_INVALID_TYPE";
case CUFFT_INVALID_VALUE:
return "CUFFT_INVALID_VALUE";
case CUFFT_INTERNAL_ERROR:
return "CUFFT_INTERNAL_ERROR";
case CUFFT_EXEC_FAILED:
return "CUFFT_EXEC_FAILED";
case CUFFT_SETUP_FAILED:
return "CUFFT_SETUP_FAILED";
case CUFFT_INVALID_SIZE:
return "CUFFT_INVALID_SIZE";
case CUFFT_UNALIGNED_DATA:
return "CUFFT_UNALIGNED_DATA";
}
return "<unknown>";
}
#define cufftSafeCall(err) __cufftSafeCall(err, __FILE__, __LINE__)
inline void __cufftSafeCall(cufftResult err, const char *file, const int line)
{
if( CUFFT_SUCCESS != err) {
fprintf(stderr, "CUFFT error in file '%s', line %d\n %s\nerror %d: %s\nterminating!\n",__FILE__, __LINE__,err, \
_cudaGetErrorEnum(err)); \
cudaDeviceReset(); assert(0); \
}
}
/********/
/* MAIN */
/********/
void main() {
int batch = 3; // --- How many transforms to be performed
int numCols = 16; // --- Size of each transform
int SIGNAL_SIZE = batch * numCols; // --- Overall size for all the signals
// --- Allocate host memory for all the signals
cufftComplex* h_signal=(cufftComplex*)malloc(sizeof(cufftComplex) * SIGNAL_SIZE);
// --- Initalize host memory for all the signals
for (unsigned int i = 0; i < SIGNAL_SIZE; ++i) {
h_signal[i].x = 1.f;
h_signal[i].y = 0.f;
}
// --- Allocate device memory for all the signals
cufftComplex* d_signal; gpuErrchk(cudaMalloc((void**)&d_signal, sizeof(cufftComplex) * SIGNAL_SIZE));
// --- Host to Device memcopy
gpuErrchk(cudaMemcpy(d_signal, h_signal, sizeof(cufftComplex) * SIGNAL_SIZE, cudaMemcpyHostToDevice));
int rank = 1; // --- 1d plan
int n[] = {numCols};
int istride = 1;
int ostride = 1;
int idist = numCols;
int odist = numCols;
// --- CUFFT plan
cufftHandle plan;
cufftSafeCall(cufftPlanMany(&plan, rank, n, NULL, istride, idist, NULL, ostride, odist, CUFFT_C2C, 500));
// --- Signals transformations
cufftSafeCall(cufftExecC2C(plan, (cufftComplex*)d_signal, (cufftComplex*)d_signal, CUFFT_FORWARD));
// --- Device to Host memcopy
gpuErrchk(cudaMemcpy(h_signal, d_signal, sizeof(cufftComplex) * SIGNAL_SIZE, cudaMemcpyDeviceToHost));
for (unsigned int i = 0; i < SIGNAL_SIZE; ++i) printf("Real part = %f; Imaginar part = %f\n", h_signal[i].x, h_signal[i].y);
// --- Destroy CUFFT context
cufftSafeCall(cufftDestroy(plan));
// --- Memory cleanup
free(h_signal);
gpuErrchk(cudaFree(d_signal));
cudaDeviceReset();
}
I noticed that when calling cublasSgemm function for each call of gemm from a host, there are 3 kernel invocations: memset, scal_kernel and gemm kernel itself (e.g. sgemm_large). This happens even if I use constants alpha/beta allocated in device memory. While the overhead of memset and scal_kernel is relatively small, the problem is memset is always launched in default stream which causes unnecessary synchronization.
The code:
__constant__ __device__ float alpha = 1;
__constant__ __device__ float beta = 1;
int main()
{
// ... memory allocation skipped ...
float* px = thrust::raw_pointer_cast(x.data());
float* py = thrust::raw_pointer_cast(y.data());
float* pmat = thrust::raw_pointer_cast(mat.data());
for (int iter = 0; iter < 3; ++iter)
{
cbstatus = cublasSgemm(cbh, CUBLAS_OP_N, CUBLAS_OP_N, crow, ccol, cshared, &alpha, px, crow, py, cshared, &beta, pmat, crow);
assert(0 == cbstatus);
}
}
This is what I see in profiler:
The question: is there a way to avoid memset or make it run in the stream assigned to CUBLAS handle?
One idea is to use DP and run device version of the gemm function, but this will work only on CC 3.0 and higher.
There was a bug in CUBLAS5.5 where a cudaMemset was used instead of cudaMemsetAsync in the specialized path where k >> m,n.
It is fixed in CUBLAS6.0 RC. And you can have access to it if you are a registered developer.
Btw, I wonder why you use __constant__ __device__ for alpha,beta.
Are you using pointerMode = DEVICE?
If not, you could simply use alpha,beta on the host.
Try the code below. The code is conceived to have only a cublasSgemm call, apart from unavoidable memory allocations and copies. You will see that
You have only one kernel launched (gemm_kernel1x1_core);
The two calls to cublasSgemm run perfectly in two different streams.
In the picture, the Visual Profiler timeline is shown.
My system: GeForce 540M, Windows 7, CUDA 5.5.
#include <conio.h>
#include <stdio.h>
#include <assert.h>
#include <cublas_v2.h>
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) { getchar(); exit(code); }
}
}
/**********************/
/* cuBLAS ERROR CHECK */
/**********************/
#ifndef cublasSafeCall
#define cublasSafeCall(err) __cublasSafeCall(err, __FILE__, __LINE__)
#endif
inline void __cublasSafeCall(cublasStatus_t err, const char *file, const int line)
{
if( CUBLAS_STATUS_SUCCESS != err) {
fprintf(stderr, "CUBLAS error in file '%s', line %d\n \nerror %d \nterminating!\n",__FILE__, __LINE__,err);
getch(); cudaDeviceReset(); assert(0);
}
}
/********/
/* MAIN */
/********/
int main()
{
int N = 5;
float *A1, *A2, *B1, *B2, *C1, *C2;
float *d_A1, *d_A2, *d_B1, *d_B2, *d_C1, *d_C2;
A1 = (float*)malloc(N*N*sizeof(float));
B1 = (float*)malloc(N*N*sizeof(float));
C1 = (float*)malloc(N*N*sizeof(float));
A2 = (float*)malloc(N*N*sizeof(float));
B2 = (float*)malloc(N*N*sizeof(float));
C2 = (float*)malloc(N*N*sizeof(float));
gpuErrchk(cudaMalloc((void**)&d_A1,N*N*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_B1,N*N*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_C1,N*N*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_A2,N*N*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_B2,N*N*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_C2,N*N*sizeof(float)));
for (int i=0; i<N*N; i++) {
A1[i] = ((float)rand()/(float)RAND_MAX);
A2[i] = ((float)rand()/(float)RAND_MAX);
B1[i] = ((float)rand()/(float)RAND_MAX);
B2[i] = ((float)rand()/(float)RAND_MAX);
}
gpuErrchk(cudaMemcpy(d_A1, A1, N*N*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_B1, B1, N*N*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_A2, A2, N*N*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_B2, B2, N*N*sizeof(float), cudaMemcpyHostToDevice));
cublasHandle_t handle;
cublasSafeCall(cublasCreate(&handle));
cudaStream_t stream1, stream2;
gpuErrchk(cudaStreamCreate(&stream1));
gpuErrchk(cudaStreamCreate(&stream2));
float alpha = 1.f;
float beta = 1.f;
cublasSafeCall(cublasSetStream(handle,stream1));
cublasSafeCall(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A1, N, d_B1, N, &beta, d_C1, N));
cublasSafeCall(cublasSetStream(handle,stream2));
cublasSafeCall(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A2, N, d_B2, N, &beta, d_C2, N));
gpuErrchk(cudaDeviceReset());
return 0;
}
In CUDA, given the value of a pointer, or the address of a variable, is there an intrinsic or another API which will introspect which address space the pointer refers to?
The CUDA header file sm_20_intrinsics.h defines the function
__device__ unsigned int __isGlobal(const void *ptr)
{
unsigned int ret;
asm volatile ("{ \n\t"
" .reg .pred p; \n\t"
" isspacep.global p, %1; \n\t"
" selp.u32 %0, 1, 0, p; \n\t"
#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__)
"} \n\t" : "=r"(ret) : "l"(ptr));
#else
"} \n\t" : "=r"(ret) : "r"(ptr));
#endif
return ret;
}
This function returns 1 if generic address ptr is in global memory space.
It returns 0 if ptr is in shared, local or constant memory space.
The PTX instruction isspacep does the heavy lifting. It seems like we should be able to build the analogous function this way:
__device__ unsigned int __isShared(const void *ptr)
{
unsigned int ret;
asm volatile ("{ \n\t"
" .reg .pred p; \n\t"
" isspacep.shared p, %1; \n\t"
" selp.u32 %0, 1, 0, p; \n\t"
#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__)
"} \n\t" : "=r"(ret) : "l"(ptr));
#else
"} \n\t" : "=r"(ret) : "r"(ptr));
#endif
return ret;
}