Is there any performance downside when passing a struct to a kernel?

I have a kernel that takes several arrays as input. To improve readability, it would be nice to group them into a struct and (after proper memory allocation and copy for each input) pass the struct to the kernel instead of the long list of pointers.
Is it going to be the same in the two cases, memory-wise, when accessing the arrays inside the kernel?
Can anyone recommend some documentation on this topic? (I couldn't find it in the programming guide.)

No, there should be no difference. You can read the PTX output to make sure. Here is a simple example:
struct Foo
{
int* a, *b, *c;
};
__global__ void bar(Foo f)
{ f.a[0] = f.b[0] + f.c[0]; }
__global__ void baz(int* a, int* b, int* c)
{ a[0] = b[0] + c[0]; }
struct Quz
{
int* a, *b, *c;
~Quz() {}
};
__global__ void quuz(Quz f)
{ f.a[0] = f.b[0] + f.c[0]; }
And here is the PTX assembly. Note how there is basically no difference between the functions.
.visible .entry _Z3bar3Foo(
.param .align 8 .b8 _Z3bar3Foo_param_0[24]
)
{
.reg .b32 %r<4>;
.reg .b64 %rd<7>;
ld.param.u64 %rd1, [_Z3bar3Foo_param_0+16];
ld.param.u64 %rd2, [_Z3bar3Foo_param_0+8];
ld.param.u64 %rd3, [_Z3bar3Foo_param_0];
cvta.to.global.u64 %rd4, %rd3;
cvta.to.global.u64 %rd5, %rd2;
cvta.to.global.u64 %rd6, %rd1;
ld.global.u32 %r1, [%rd5];
ld.global.u32 %r2, [%rd6];
add.s32 %r3, %r2, %r1;
st.global.u32 [%rd4], %r3;
ret;
}
.visible .entry _Z3bazPiS_S_(
.param .u64 _Z3bazPiS_S__param_0,
.param .u64 _Z3bazPiS_S__param_1,
.param .u64 _Z3bazPiS_S__param_2
)
{
.reg .b32 %r<4>;
.reg .b64 %rd<7>;
ld.param.u64 %rd1, [_Z3bazPiS_S__param_0];
ld.param.u64 %rd2, [_Z3bazPiS_S__param_1];
ld.param.u64 %rd3, [_Z3bazPiS_S__param_2];
cvta.to.global.u64 %rd4, %rd1;
cvta.to.global.u64 %rd5, %rd3;
cvta.to.global.u64 %rd6, %rd2;
ld.global.u32 %r1, [%rd6];
ld.global.u32 %r2, [%rd5];
add.s32 %r3, %r2, %r1;
st.global.u32 [%rd4], %r3;
ret;
}
.visible .entry _Z4quuz3Quz(
.param .align 8 .b8 _Z4quuz3Quz_param_0[24]
)
{
.reg .b32 %r<4>;
.reg .b64 %rd<7>;
ld.param.u64 %rd1, [_Z4quuz3Quz_param_0+16];
ld.param.u64 %rd2, [_Z4quuz3Quz_param_0+8];
ld.param.u64 %rd3, [_Z4quuz3Quz_param_0];
cvta.to.global.u64 %rd4, %rd3;
cvta.to.global.u64 %rd5, %rd2;
cvta.to.global.u64 %rd6, %rd1;
ld.global.u32 %r1, [%rd5];
ld.global.u32 %r2, [%rd6];
add.s32 %r3, %r2, %r1;
st.global.u32 [%rd4], %r3;
ret;
}
It all works the same because CUDA puts all kernel arguments into "constant memory" and accesses them through specialized load instructions that go through the "constant cache."
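For reference, a minimal host-side sketch of how such a struct can be set up and passed (the length n and the host buffers h_a, h_b, h_c are placeholders; error checking omitted). The struct itself is passed by value, so only the arrays it points to need to be allocated and copied:
Foo f;
cudaMalloc((void**)&f.a, n * sizeof(int));                       // one device allocation per array
cudaMalloc((void**)&f.b, n * sizeof(int));
cudaMalloc((void**)&f.c, n * sizeof(int));
cudaMemcpy(f.b, h_b, n * sizeof(int), cudaMemcpyHostToDevice);   // copy the inputs
cudaMemcpy(f.c, h_c, n * sizeof(int), cudaMemcpyHostToDevice);
bar<<<1, 1>>>(f);                                                // the 24-byte struct lands in the kernel parameter space
cudaMemcpy(h_a, f.a, n * sizeof(int), cudaMemcpyDeviceToHost);   // read back the result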

String matching in CUDA shows a different result when I increase the number of blocks

I'm trying to implement a string matching program with CUDA in C and I have the following issue.
When I set 1 block and 1 thread per block, the result for the pattern dfh is 2. That's correct, but when I increase the number of blocks the result is 4.
The text file is:
ffskdfhksdjhfksdfksjdfhksdhfksjdhfkjer654yrkhjkfgjhdsrtrhkjchgkjthyoirthygfnbkjgkjdhykhkjchgkjfdhsfykhkbhkjfghkfgjy
This is my code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>
__global__ void string_matching(char *buffer, char *pattern, int match_size, int pattern_size, int *result){
int tid, i;
__shared__ int local_matches;
if(threadIdx.x == 0) local_matches = 0;
__syncthreads();
for(tid=blockIdx.x*blockDim.x+threadIdx.x; tid<match_size; tid+=blockDim.x){
for (i = 0; i < pattern_size && pattern[i] == buffer[i + tid]; ++i);
if(i >= pattern_size){
atomicAdd(&local_matches, 1);
}
}
__syncthreads();
if(threadIdx.x == 0)
atomicAdd(result, local_matches);
}
int main(int argc, char *argv[]){
FILE *pFile;
long file_size, match_size, pattern_size;
char * buffer;
char * filename, *pattern;
size_t result;
int *match, total_matches;
//CUDA variables
int blocks, threads_per_block;
int *result_dev;
char *buffer_dev, *pattern_dev;
float total_time, comp_time;
cudaEvent_t total_start, total_stop, comp_start, comp_stop;
cudaEventCreate(&total_start);
cudaEventCreate(&total_stop);
cudaEventCreate(&comp_start);
cudaEventCreate(&comp_stop);
if (argc != 5) {
printf ("Usage : %s <file_name> <string> <blocks> <threads_per_block>\n", argv[0]);
return 1;
}
filename = argv[1];
pattern = argv[2];
blocks = strtol(argv[3], NULL, 10);
threads_per_block = strtol(argv[4], NULL, 10);
pFile = fopen ( filename , "rb" );
if (pFile==NULL) {printf ("File error\n"); return 2;}
// obtain file size:
fseek (pFile , 0 , SEEK_END);
file_size = ftell (pFile);
rewind (pFile);
printf("file size is %ld\n", file_size);
// allocate memory to contain the file:
buffer = (char*) malloc (sizeof(char)*file_size);
if (buffer == NULL) {printf ("Memory error\n"); return 3;}
// copy the file into the buffer:
result = fread (buffer,1,file_size,pFile);
if (result != file_size) {printf ("Reading error\n"); return 4;}
pattern_size = strlen(pattern);
match_size = file_size - pattern_size + 1;
match = (int *) malloc (sizeof(int)*match_size);
if (match == NULL) {printf ("Malloc error\n"); return 5;}
cudaMalloc((void **)&result_dev, sizeof(int));
cudaMalloc((void **)&buffer_dev, file_size*sizeof(char));
cudaMalloc((void **)&pattern_dev, pattern_size*sizeof(char));
cudaEventRecord(total_start);
cudaEventRecord(comp_start);
cudaMemcpy(buffer_dev, buffer, file_size*sizeof(char), cudaMemcpyHostToDevice);
cudaMemcpy(pattern_dev, pattern, pattern_size*sizeof(char), cudaMemcpyHostToDevice);
string_matching<<<blocks, threads_per_block>>>(buffer_dev, pattern_dev, match_size, pattern_size, result_dev);
cudaThreadSynchronize();
cudaEventRecord(comp_stop);
cudaEventSynchronize(comp_stop);
cudaEventElapsedTime(&comp_time, comp_start, comp_stop);
cudaMemcpy(&total_matches, result_dev, sizeof(int), cudaMemcpyDeviceToHost);
cudaEventRecord(total_stop);
cudaEventSynchronize(total_stop);
cudaEventElapsedTime(&total_time, total_start, total_stop);
cudaFree(result_dev);
cudaFree(buffer_dev);
cudaFree(pattern_dev);
fclose (pFile);
free (buffer);
//Print result
printf("Total matches: %d\n", total_matches);
printf("\n\n\nN: %d, Blocks: %d, Threads: %d\n", file_size, blocks, blocks*threads_per_block);
printf("Total time (ms): %.3f\n", total_time);
printf("Kernel time (ms): %.3f\n", comp_time);
printf("Data transfer time(ms): %.3f\n\n\n", total_time-comp_time);
}
You need to synchronize the threads of the same block using __syncthreads. For example, local_matches = 0 can theoretically be executed concurrently with the atomicAdd of the main loop, so a __syncthreads is needed between the two. For the same reason, you also need a __syncthreads before the last if(threadIdx.x == 0). I am not sure this is the only error.
I advise you to use CUDA-GDB to track down such bugs, since the kernel is quite short and relatively simple.
Note that local_matches does not need to be shared. In fact, it is not efficient to do that. One can accumulate in a per-thread local variable and then perform a final atomicAdd. Additionally, you do not need the innermost conditional: you can simply do local_matches += i >= pattern_size; (the compiler might already do such an optimization).
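For illustration, a minimal sketch of that suggestion (per-thread counter in a register, a single atomicAdd per thread at the end, and the blockDim.x*gridDim.x grid stride discussed in the last answer below; it assumes *result is zeroed before the launch, e.g. with cudaMemset):
__global__ void string_matching(char *buffer, char *pattern, int match_size, int pattern_size, int *result){
    int local_matches = 0;  // per-thread counter in a register, not in shared memory
    for(int tid = blockIdx.x*blockDim.x + threadIdx.x; tid < match_size; tid += blockDim.x*gridDim.x){
        int i;
        for (i = 0; i < pattern_size && pattern[i] == buffer[i + tid]; ++i);
        local_matches += (i >= pattern_size);  // no conditional needed
    }
    if(local_matches)  // at most one atomic per thread
        atomicAdd(result, local_matches);
}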
I finally found a solution for this.
I set up a match table with a 0 for every buffer position, set a 1 at every position where the pattern is found, and summed the 1's on the CPU.
If you can think of something better, please add an answer.
This is the code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>
__global__ void string_matching(char *buffer, char *pattern, int match_size, int pattern_size, int *match){
int tid, i;
for(tid=blockIdx.x*blockDim.x+threadIdx.x; tid<match_size; tid+=blockDim.x){
for (i = 0; i < pattern_size && pattern[i] == buffer[i + tid]; ++i);
if(i >= pattern_size){
match[tid] = 1;
}
else{
match[tid] = 0;
}
}
}
int main(int argc, char *argv[]){
FILE *pFile;
int i;
long file_size, match_size, pattern_size;
char * buffer;
char * filename, *pattern;
size_t result;
int *match, total_matches;
//CUDA variables
int blocks, threads_per_block;
int *match_dev;
char *buffer_dev, *pattern_dev;
float total_time, comp_time;
cudaEvent_t total_start, total_stop, comp_start, comp_stop;
cudaEventCreate(&total_start);
cudaEventCreate(&total_stop);
cudaEventCreate(&comp_start);
cudaEventCreate(&comp_stop);
if (argc != 5) {
printf ("Usage : %s <file_name> <string> <blocks> <threads_per_block>\n", argv[0]);
return 1;
}
filename = argv[1];
pattern = argv[2];
blocks = strtol(argv[3], NULL, 10);
threads_per_block = strtol(argv[4], NULL, 10);
pFile = fopen ( filename , "rb" );
if (pFile==NULL) {printf ("File error\n"); return 2;}
// obtain file size:
fseek (pFile , 0 , SEEK_END);
file_size = ftell (pFile);
rewind (pFile);
printf("file size is %ld\n", file_size);
// allocate memory to contain the file:
buffer = (char*) malloc (sizeof(char)*file_size);
if (buffer == NULL) {printf ("Memory error\n"); return 3;}
// copy the file into the buffer:
result = fread (buffer,1,file_size,pFile);
if (result != file_size) {printf ("Reading error\n"); return 4;}
pattern_size = strlen(pattern);
match_size = file_size - pattern_size + 1;
match = (int *) malloc (sizeof(int)*match_size);
if (match == NULL) {printf ("Malloc error\n"); return 5;}
cudaMalloc((void **)&match_dev, match_size*sizeof(int));
cudaMalloc((void **)&buffer_dev, file_size*sizeof(char));
cudaMalloc((void **)&pattern_dev, pattern_size*sizeof(char));
cudaEventRecord(total_start);
cudaEventRecord(comp_start);
cudaMemcpy(buffer_dev, buffer, file_size*sizeof(char), cudaMemcpyHostToDevice);
cudaMemcpy(pattern_dev, pattern, pattern_size*sizeof(char), cudaMemcpyHostToDevice);
string_matching<<<blocks, threads_per_block>>>(buffer_dev, pattern_dev, match_size, pattern_size, match_dev);
cudaThreadSynchronize();
cudaEventRecord(comp_stop);
cudaEventSynchronize(comp_stop);
cudaEventElapsedTime(&comp_time, comp_start, comp_stop);
cudaMemcpy(match, match_dev, match_size*sizeof(int), cudaMemcpyDeviceToHost);
cudaEventRecord(total_stop);
cudaEventSynchronize(total_stop);
cudaEventElapsedTime(&total_time, total_start, total_stop);
total_matches = 0;
for(i=0; i<match_size; i++){
total_matches += match[i];
}
cudaFree(match_dev);
cudaFree(buffer_dev);
cudaFree(pattern_dev);
fclose (pFile);
free (buffer);
//Print result
printf("Total matches: %d\n", total_matches);
printf("\n\n\nN: %d, Blocks: %d, Threads: %d\n", file_size, blocks, blocks*threads_per_block);
printf("Total time (ms): %.3f\n", total_time);
printf("Kernel time (ms): %.3f\n", comp_time);
printf("Data transfer time(ms): %.3f\n\n\n", total_time-comp_time);
}
I think this is a better solution.
It works only for a power-of-two number of threads per block.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>
#define MAX_THREADS_PER_BLOCK 100
__global__ void string_matching(char *buffer, char *pattern, int match_size, int pattern_size, int blocks, int slice, int extra, int *gout){
int tid, i;
int thread_index = blockIdx.x*blockDim.x + threadIdx.x;
int start = thread_index*slice;
int stop = start + slice;
if(thread_index == blocks*blockDim.x - 1){
stop += extra;
}
if(stop > match_size){
stop = match_size;
}
__shared__ int r[MAX_THREADS_PER_BLOCK];
int sum = 0;
for(tid=start; tid<stop; tid++){
for (i = 0; i < pattern_size && pattern[i] == buffer[i + tid]; ++i);
if(i >= pattern_size){
sum++;
}
}
r[threadIdx.x] = sum;
__syncthreads();
//works only for power of 2 threads_per_block
for (int size = blockDim.x/2; size>0; size/=2) { //uniform
if (threadIdx.x<size)
r[threadIdx.x] += r[threadIdx.x+size];
__syncthreads();
}
printf("Block: %d, Thread: %d, Global Thread: %d, Start: %d, Stop: %d, Matches: %d, Block Matches: %d\n", blockIdx.x, threadIdx.x, thread_index, start, stop, r[threadIdx.x], r[0]);
if(threadIdx.x == 0){
gout[blockIdx.x] = r[0];
}
}
int main(int argc, char *argv[]){
int i;
FILE *pFile;
long file_size, match_size, pattern_size;
char * buffer;
char * filename, *pattern;
size_t result;
int *results;
int total_matches;
//CUDA variables
int blocks, threads_per_block, total_threads, slice, extra;
int *results_dev;
char *buffer_dev, *pattern_dev;
float total_time, comp_time;
cudaEvent_t total_start, total_stop, comp_start, comp_stop;
cudaEventCreate(&total_start);
cudaEventCreate(&total_stop);
cudaEventCreate(&comp_start);
cudaEventCreate(&comp_stop);
if (argc != 5) {
printf ("Usage : %s <file_name> <string> <blocks> <threads_per_block>\n", argv[0]);
return 1;
}
filename = argv[1];
pattern = argv[2];
blocks = strtol(argv[3], NULL, 10);
threads_per_block = strtol(argv[4], NULL, 10);
pFile = fopen ( filename , "rb" );
if (pFile==NULL) {printf ("File error\n"); return 2;}
// obtain file size:
fseek (pFile , 0 , SEEK_END);
file_size = ftell (pFile);
rewind (pFile);
printf("file size is %ld\n", file_size);
// allocate memory to contain the file:
buffer = (char*) malloc (sizeof(char)*file_size);
if (buffer == NULL) {printf ("Memory error\n"); return 3;}
// copy the file into the buffer:
result = fread (buffer,1,file_size,pFile);
if (result != file_size) {printf ("Reading error\n"); return 4;}
pattern_size = strlen(pattern);
match_size = file_size - pattern_size + 1;
results = (int *)malloc(blocks*sizeof(int));
cudaMalloc((void **)&results_dev, blocks*sizeof(int));
cudaMalloc((void **)&buffer_dev, file_size*sizeof(char));
cudaMalloc((void **)&pattern_dev, pattern_size*sizeof(char));
cudaEventRecord(total_start);
cudaEventRecord(comp_start);
cudaMemcpy(buffer_dev, buffer, file_size*sizeof(char), cudaMemcpyHostToDevice);
cudaMemcpy(pattern_dev, pattern, pattern_size*sizeof(char), cudaMemcpyHostToDevice);
total_threads = blocks*threads_per_block;
slice = match_size/total_threads;
extra = match_size%total_threads;
string_matching<<<blocks, threads_per_block>>>(buffer_dev, pattern_dev, match_size, pattern_size, blocks, slice, extra, results_dev);
cudaEventRecord(comp_stop);
cudaEventSynchronize(comp_stop);
cudaEventElapsedTime(&comp_time, comp_start, comp_stop);
cudaMemcpy(results, results_dev, blocks*sizeof(int), cudaMemcpyDeviceToHost);
total_matches = 0;
for(i=0; i<blocks; i++){
total_matches += results[i];
}
cudaEventRecord(total_stop);
cudaEventSynchronize(total_stop);
cudaEventElapsedTime(&total_time, total_start, total_stop);
cudaFree(results_dev);
cudaFree(buffer_dev);
cudaFree(pattern_dev);
fclose (pFile);
free (buffer);
//Print result
printf("Total matches: %d\n", total_matches);
printf("\n\n\nN: %d, Blocks: %d, Threads: %d\n", file_size, blocks, blocks*threads_per_block);
printf("Total time (ms): %.3f\n", total_time);
printf("Kernel time (ms): %.3f\n", comp_time);
printf("Data transfer time(ms): %.3f\n\n\n", total_time-comp_time);
}
The stride of your loop is not correct for multiple blocks, i.e. both blocks were doing the full amount of work (and therefore each finding the two matches). The correct grid-stride loop looks like the following:
for (int tid = blockIdx.x * blockDim.x + threadIdx.x;
tid < match_size;
tid += blockDim.x * gridDim.x /* <-- fix */) {
// ...
}

LU factorization gives different results in LAPACK and cuBLAS/cuSOLVER

I am testing out some scenarios where the function dgetrf returns different results when used with cuBLAS/cuSOLVER compared to LAPACK. For example, I am looking at the LU factorization of the following matrix:
2.0 4.0 1.0 -3.0 0.0
-1.0 -2.0 2.0 4.0 0.0
4.0 2.0 -3.0 5.0 0.0
5.0 -4.0 -3.0 1.0 0.0
0.0 0.0 0.0 0.0 0.0
I first try to call dgetrf from cuBLAS/cuSOLVER as follows (warning, ugly testing code ahead!)
#include <cblas.h>
#include <time.h>
#include <stdio.h>
#include <string.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cusolverDn.h>
int main(int argc, char** argv){
const int matrixSize = 5;
int i, j;
double arrA[matrixSize][matrixSize] = {
{2.0, 4.0, 1.0, -3.0, 0.0},
{-1.0, -2.0, 2.0, 4.0, 0.0},
{4.0, 2.0, -3.0, 5.0, 0.0},
{5.0, -4.0, -3.0, 1.0, 0.0},
{0.0, 0.0, 0.0, 0.0, 0.0}
};
double *arrADev, *workArray;
double **matrixArray;
int *pivotArray;
int *infoArray;
double flat[matrixSize*matrixSize] = {0};
cublasHandle_t cublasHandle;
cublasStatus_t cublasStatus;
cudaError_t error;
cudaError cudaStatus;
cusolverStatus_t cusolverStatus;
cusolverDnHandle_t cusolverHandle;
double *matrices[2];
error = cudaMalloc(&arrADev, sizeof(double) * matrixSize*matrixSize);
if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));
error = cudaMalloc(&matrixArray, sizeof(double*) * 2);
if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));
error = cudaMalloc(&pivotArray, sizeof(int) * matrixSize*matrixSize);
if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));
error = cudaMalloc(&infoArray, sizeof(int) * matrixSize*matrixSize);
if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));
cublasStatus = cublasCreate(&cublasHandle);
if (cublasStatus != CUBLAS_STATUS_SUCCESS) fprintf(stderr,"error %i\n",cublasStatus);
//maps matrix to flat vector
for(i=0; i<matrixSize; i++){
for(j=0; j<matrixSize; j++){
flat[i+j*matrixSize] = arrA[i][j];
}
}
//copy matrix A to device
cublasStatus = cublasSetMatrix(matrixSize, matrixSize, sizeof(double), flat, matrixSize, arrADev, matrixSize);
if (cublasStatus != CUBLAS_STATUS_SUCCESS) fprintf(stderr,"error %i\n",cublasStatus);
//save matrix address
matrices[0] = arrADev;
//copy matrices references to device
error = cudaMemcpy(matrixArray, matrices, sizeof(double*)*1, cudaMemcpyHostToDevice);
if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));
int Lwork;
// calculate buffer size for cuSOLVER LU factorization
cusolverStatus = cusolverDnDgetrf_bufferSize(cusolverHandle, matrixSize, matrixSize, arrADev, matrixSize, &Lwork);
cudaStatus = cudaMalloc((void**)&workArray, Lwork*sizeof(double));
// cuBLAS LU factorization
cublasStatus = cublasDgetrfBatched(cublasHandle, matrixSize, matrixArray, matrixSize, pivotArray, infoArray, 1);
if (cublasStatus == CUBLAS_STATUS_SUCCESS)
printf("cuBLAS DGETRF SUCCESSFUL! \n");
else
printf("cuBLAS DGETRF UNSUCCESSFUL! \n");
// cuSOLVER LU factorization
cusolverStatus = cusolverDnCreate(&cusolverHandle);
cusolverStatus = cusolverDnDgetrf(cusolverHandle, matrixSize, matrixSize, arrADev, matrixSize, workArray, pivotArray, infoArray);
if (cusolverStatus == CUSOLVER_STATUS_SUCCESS)
printf("cuSOLVER DGETRF SUCCESSFUL! \n");
else
printf("cuSOLVER DGETRF UNSUCCESSFUL! \n");
return 0;
}
The output from the code above is
cuBLAS DGETRF SUCCESSFUL!
cuSOLVER DGETRF SUCCESSFUL!
When I try to do the same with LAPACK (warning: more ugly code!):
#include <iostream>
#include <vector>
using namespace std;
extern "C" void dgetrf_(int* dim1, int* dim2, double* a, int* lda, int* ipiv, int* info);
extern "C" void dgetrs_(char *TRANS, int *N, int *NRHS, double *A, int *LDA, int *IPIV, double *B, int *LDB, int *INFO );
int main()
{
char trans = 'N';
int dim = 5;
int LDA = dim;
int info;
vector<double> a,b;
a.push_back(2.0); a.push_back(4.0); a.push_back(1.0); a.push_back(-3.0); a.push_back(0.0);
a.push_back(-1.0); a.push_back(-2.0); a.push_back(2.0); a.push_back(4.0); a.push_back(0.0);
a.push_back(4.0); a.push_back(2.0); a.push_back(-3.0); a.push_back(5.0); a.push_back(0.0);
a.push_back(5.0); a.push_back(-4.0); a.push_back(-3.0); a.push_back(1.0); a.push_back(0.0);
a.push_back(0.0); a.push_back(0.0); a.push_back(0.0); a.push_back(0.0); a.push_back(0.0);
int ipiv[5];
dgetrf_(&dim, &dim, &*a.begin(), &LDA, ipiv, &info);
if (info == 0)
printf("dgetrf successful\n");
else
printf("dgetrf unsuccessful\n");
return 0;
}
The output I get is
dgetrf unsuccessful
I understand that they are different libraries, but is this behaviour expected?
When I compile your CUDA code, I get a warning that the cusolver handle is being used before its value is set. You shouldn't ignore such warnings, because your usage in the sizing function is not correct. However, that is not the problem here.
I don't think there's any difference between your two test cases. You seem to be interpreting the results incorrectly.
Looking at the netlib documentation, we see that an info value of 5 means U(5,5) is zero, which would be problematic for future use, but it doesn't mean the dgetrf factorization was "successful" or "unsuccessful" as you are printing out; instead it tells you something about your input data. In fact the factorization was completed, as clearly indicated in the docs.
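For reference, a sketch of how that info value could be interpreted in the LAPACK test program (following the netlib convention: info < 0 flags an illegal argument, info > 0 means U(info,info) is exactly zero even though the factorization completed):
if (info < 0)
    printf("dgetrf: argument %d had an illegal value\n", -info);
else if (info > 0)
    printf("dgetrf completed, but U(%d,%d) is exactly zero (U is singular)\n", info, info);
else
    printf("dgetrf successful\n");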
Likewise, we get no information about that condition simply by looking at the return value of the cusolver function. In order to discover information similar to what is being reported by LAPACK, it's necessary to look at the infoArray values.
With those changes, your codes are reporting the same thing (info value of 5):
$ cat t1556.cu
#include <time.h>
#include <stdio.h>
#include <string.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cusolverDn.h>
int main(int argc, char** argv){
const int matrixSize = 5;
int i, j;
double arrA[matrixSize][matrixSize] = {
{2.0, 4.0, 1.0, -3.0, 0.0},
{-1.0, -2.0, 2.0, 4.0, 0.0},
{4.0, 2.0, -3.0, 5.0, 0.0},
{5.0, -4.0, -3.0, 1.0, 0.0},
{0.0, 0.0, 0.0, 0.0, 0.0}
};
double *arrADev, *workArray;
double **matrixArray;
int *pivotArray;
int *infoArray;
double flat[matrixSize*matrixSize] = {0};
cublasHandle_t cublasHandle;
cublasStatus_t cublasStatus;
cudaError_t error;
cudaError cudaStatus;
cusolverStatus_t cusolverStatus;
cusolverDnHandle_t cusolverHandle;
double *matrices[2];
error = cudaMalloc(&arrADev, sizeof(double) * matrixSize*matrixSize);
if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));
error = cudaMalloc(&matrixArray, sizeof(double*) * 2);
if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));
error = cudaMalloc(&pivotArray, sizeof(int) * matrixSize*matrixSize);
if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));
error = cudaMalloc(&infoArray, sizeof(int) * matrixSize*matrixSize);
if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));
cublasStatus = cublasCreate(&cublasHandle);
if (cublasStatus != CUBLAS_STATUS_SUCCESS) fprintf(stderr,"error %i\n",cublasStatus);
//maps matrix to flat vector
for(i=0; i<matrixSize; i++){
for(j=0; j<matrixSize; j++){
flat[i+j*matrixSize] = arrA[i][j];
}
}
//copy matrix A to device
cublasStatus = cublasSetMatrix(matrixSize, matrixSize, sizeof(double), flat, matrixSize, arrADev, matrixSize);
if (cublasStatus != CUBLAS_STATUS_SUCCESS) fprintf(stderr,"error %i\n",cublasStatus);
//save matrix address
matrices[0] = arrADev;
//copy matrices references to device
error = cudaMemcpy(matrixArray, matrices, sizeof(double*)*1, cudaMemcpyHostToDevice);
if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));
int Lwork;
// calculate buffer size for cuSOLVER LU factorization
cusolverStatus = cusolverDnCreate(&cusolverHandle);
cusolverStatus = cusolverDnDgetrf_bufferSize(cusolverHandle, matrixSize, matrixSize, arrADev, matrixSize, &Lwork);
cudaStatus = cudaMalloc((void**)&workArray, Lwork*sizeof(double));
// cuBLAS LU factorization
cublasStatus = cublasDgetrfBatched(cublasHandle, matrixSize, matrixArray, matrixSize, pivotArray, infoArray, 1);
if (cublasStatus == CUBLAS_STATUS_SUCCESS)
printf("cuBLAS DGETRF SUCCESSFUL! \n");
else
printf("cuBLAS DGETRF UNSUCCESSFUL! \n");
// cuSOLVER LU factorization
cusolverStatus = cusolverDnDgetrf(cusolverHandle, matrixSize, matrixSize, arrADev, matrixSize, workArray, pivotArray, infoArray);
if (cusolverStatus == CUSOLVER_STATUS_SUCCESS)
printf("cuSOLVER DGETRF SUCCESSFUL! \n");
else
printf("cuSOLVER DGETRF UNSUCCESSFUL! \n");
int *hinfoArray = (int *)malloc(matrixSize*matrixSize*sizeof(int));
cudaMemcpy(hinfoArray, infoArray, matrixSize*matrixSize*sizeof(int), cudaMemcpyDeviceToHost);
for (int i = 0; i < matrixSize*matrixSize; i++) printf("%d,", hinfoArray[i]);
printf("\n");
return 0;
}
$ nvcc -o t1556 t1556.cu -lcublas -lcusolver
t1556.cu(30): warning: variable "cudaStatus" was set but never used
$ ./t1556
cuBLAS DGETRF SUCCESSFUL!
cuSOLVER DGETRF SUCCESSFUL!
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
$ cat t1557.cpp
#include <iostream>
#include <vector>
#include <lapacke/lapacke.h>
using namespace std;
// extern "C" void dgetrf_(int* dim1, int* dim2, double* a, int* lda, int* ipiv, int* info);
// extern "C" void dgetrs_(char *TRANS, int *N, int *NRHS, double *A, int *LDA, int *IPIV, double *B, int *LDB, int *INFO );
int main()
{
char trans = 'N';
int dim = 5;
int LDA = dim;
int info;
vector<double> a,b;
a.push_back(2.0); a.push_back(4.0); a.push_back(1.0); a.push_back(-3.0); a.push_back(0.0);
a.push_back(-1.0); a.push_back(-2.0); a.push_back(2.0); a.push_back(4.0); a.push_back(0.0);
a.push_back(4.0); a.push_back(2.0); a.push_back(-3.0); a.push_back(5.0); a.push_back(0.0);
a.push_back(5.0); a.push_back(-4.0); a.push_back(-3.0); a.push_back(1.0); a.push_back(0.0);
a.push_back(0.0); a.push_back(0.0); a.push_back(0.0); a.push_back(0.0); a.push_back(0.0);
int ipiv[5];
LAPACK_dgetrf(&dim, &dim, &*a.begin(), &LDA, ipiv, &info);
printf("info = %d\n", info);
if (info == 0)
printf("dgetrf successful\n");
else
printf("dgetrf unsuccessful\n");
return 0;
}
$ g++ t1557.cpp -o t1557 -llapack
$ ./t1557
info = 5
dgetrf unsuccessful
$
I'm using the LAPACK installed by CentOS.
CentOS 7, CUDA 10.1.243, Tesla V100.

Why doesn't the nvrtc compiler emit these NVVM code fragments to PTX?

I have some NVVM code that I am trying to compile to PTX using nvrtc (i.e. using nvvmCompileProgram, nvvmGetCompiledResult).
Here is the nvvm code:
; ModuleID = 'test_warp_reduce'
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64-unknown-cuda"
define ptx_kernel void @lambda_crit_4197([0 x float]* %_4200_4590, [0 x i64]* %_4201_4591, [0 x float]* %_4202_4592) {
acc_bidx:
%0 = tail call ptx_device i32 @llvm.nvvm.read.ptx.sreg.tid.x()
%1 = tail call ptx_device i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
%2 = tail call ptx_device i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
%3 = mul nsw i32 %2, %1
%4 = add nsw i32 %3, %0
%5 = icmp slt i32 %4, 32
br i1 %5, label %if_then12, label %next
if_then12: ; preds = %acc_bidx
%6 = getelementptr inbounds [0 x float]* %_4202_4592, i64 0, i32 %4
%7 = load float* %6
%8 = tail call i64 @clock()
%9 = tail call float @reduce_step(float %7, i32 1, i32 31)
%10 = tail call float @reduce_step(float %9, i32 2, i32 31)
%11 = tail call float @reduce_step(float %10, i32 4, i32 31)
%12 = tail call float @reduce_step(float %11, i32 8, i32 31)
%13 = tail call float @reduce_step(float %12, i32 16, i32 31)
%14 = tail call i64 @clock()
%15 = getelementptr inbounds [0 x float]* %_4200_4590, i64 0, i32 %4
%16 = getelementptr inbounds [0 x i64]* %_4201_4591, i64 0, i32 %0
%17 = sub nsw i64 %14, %8
store i64 %17, i64* %16
store float %13, float* %15
br label %next
next: ; preds = %acc_bidx, %if_then12
ret void
}
declare i64 @llvm.nvvm.texsurf.handle.p1i64(metadata, i64 addrspace(1)*)
; Function Attrs: nounwind readnone
declare ptx_device i32 @llvm.nvvm.read.ptx.sreg.tid.x()
; Function Attrs: nounwind readnone
declare ptx_device i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
; Function Attrs: nounwind readnone
declare ptx_device i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
define i64 @clock() {
%1 = call i64 asm "mov.u32 $0, %clock;", "=r" ()
ret i64 %1
}
define float @reduce_step(float %a, i32 %b, i32 %c) {
%1 = call float asm
"{ .reg .pred p;
.reg .f32 r0;
shfl.down.b32 r0|p, $1, $2, $3;
@p add.f32 r0, r0, $1;
mov.f32 $0, r0;
}", "=f, f, r, r" (float %a, i32 %b, i32 %c)
ret float %1
}
!nvvmir.version = !{!0}
!nvvm.annotations = !{!1}
!0 = metadata !{i32 1, i32 2}
!1 = metadata !{void ([0 x float]*, [0 x i64]*, [0 x float]*)* @lambda_crit_4197, metadata !"kernel", i64 1}
And here is the generated ptx code:
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-19324574
// Cuda compilation tools, release 7.0, V7.0.27
// Based on LLVM 3.4svn
//
.version 4.2
.target sm_52
.address_size 64
// .globl lambda_crit_4197
.visible .entry lambda_crit_4197(
.param .u64 lambda_crit_4197_param_0,
.param .u64 lambda_crit_4197_param_1,
.param .u64 lambda_crit_4197_param_2
)
{
.reg .pred %p<2>;
.reg .f32 %f<11>;
.reg .s32 %r<15>;
.reg .s64 %rd<13>;
ld.param.u64 %rd1, [lambda_crit_4197_param_0];
ld.param.u64 %rd2, [lambda_crit_4197_param_1];
ld.param.u64 %rd3, [lambda_crit_4197_param_2];
mov.u32 %r1, %tid.x;
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mad.lo.s32 %r2, %r3, %r4, %r1;
setp.gt.s32 %p1, %r2, 31;
@%p1 bra BB0_2;
cvta.to.global.u64 %rd4, %rd3;
mul.wide.s32 %rd5, %r2, 4;
add.s64 %rd6, %rd4, %rd5;
ld.global.f32 %f2, [%rd6];
mov.u32 %r5, 1;
mov.u32 %r14, 31;
// inline asm
{ .reg .pred p;
.reg .f32 r0;
shfl.down.b32 r0|p, %f2, %r5, %r14;
@p add.f32 r0, r0, %f2;
mov.f32 %f1, r0;
}
// inline asm
mov.u32 %r7, 2;
// inline asm
{ .reg .pred p;
.reg .f32 r0;
shfl.down.b32 r0|p, %f1, %r7, %r14;
@p add.f32 r0, r0, %f1;
mov.f32 %f3, r0;
}
// inline asm
mov.u32 %r9, 4;
// inline asm
{ .reg .pred p;
.reg .f32 r0;
shfl.down.b32 r0|p, %f3, %r9, %r14;
@p add.f32 r0, r0, %f3;
mov.f32 %f5, r0;
}
// inline asm
mov.u32 %r11, 8;
// inline asm
{ .reg .pred p;
.reg .f32 r0;
shfl.down.b32 r0|p, %f5, %r11, %r14;
@p add.f32 r0, r0, %f5;
mov.f32 %f7, r0;
}
// inline asm
mov.u32 %r13, 16;
// inline asm
{ .reg .pred p;
.reg .f32 r0;
shfl.down.b32 r0|p, %f7, %r13, %r14;
@p add.f32 r0, r0, %f7;
mov.f32 %f9, r0;
}
// inline asm
cvta.to.global.u64 %rd7, %rd1;
add.s64 %rd8, %rd7, %rd5;
cvta.to.global.u64 %rd9, %rd2;
mul.wide.s32 %rd10, %r1, 8;
add.s64 %rd11, %rd9, %rd10;
mov.u64 %rd12, 0;
st.global.u64 [%rd11], %rd12;
st.global.f32 [%rd8], %f9;
BB0_2:
ret;
}
// .globl clock
.visible .func (.param .b64 func_retval0) clock(
)
{
.reg .s32 %r<2>;
.reg .s64 %rd<2>;
// inline asm
mov.u32 %r1, %clock;
// inline asm
cvt.u64.u32 %rd1, %r1;
st.param.b64 [func_retval0+0], %rd1;
ret;
}
// .globl reduce_step
.visible .func (.param .b32 func_retval0) reduce_step(
.param .b32 reduce_step_param_0,
.param .b32 reduce_step_param_1,
.param .b32 reduce_step_param_2
)
{
.reg .f32 %f<3>;
.reg .s32 %r<3>;
ld.param.f32 %f2, [reduce_step_param_0];
ld.param.u32 %r1, [reduce_step_param_1];
ld.param.u32 %r2, [reduce_step_param_2];
// inline asm
{ .reg .pred p;
.reg .f32 r0;
shfl.down.b32 r0|p, %f2, %r1, %r2;
@p add.f32 r0, r0, %f2;
mov.f32 %f1, r0;
}
// inline asm
st.param.f32 [func_retval0+0], %f1;
ret;
}
It seems that the nvvm compiler just eliminates code for mysterious reasons. For example, the calls to the clock function weren't emitted at all.
Whether I use compiler optimizations or not makes no difference for the provided code.
Somebody told me that CUDA 7.5 had similar issues (assembly not being emitted) on Windows, so I downgraded to 7.0. However, the problem is still there.
Any clue why this might be the case?
I can tell from experience that PTX code only calls built-in functions; user-defined functions get inlined into the calling functions.
I can't seem to find the proper documentation for this right now, but I'll link it in when I find it.
In your generated code there are many places where code segments like this repeat:
// inline asm
mov.u32 %r7, 2;
// inline asm
{ .reg .pred p;
.reg .f32 r0;
shfl.down.b32 r0|p, %f1, %r7, %r14;
@p add.f32 r0, r0, %f1;
mov.f32 %f3, r0;
}
Does this look familiar? The first one is coming from clock, the second block is coming from reduce_step.
TL;DR: You don't see the calls because they got inlined.

Batched FFTs using cufftPlanMany

I want to perform 441 2D, 32-by-32 FFTs using the batched method provided by the cuFFT library. The parameters of the transform are the following:
int n[2] = {32,32};
int inembed[] = {32,32};
int onembed[] = {32,32/2+1};
cufftPlanMany(&plan,2,n,inembed,1,32*32,onembed,1,32*(32/2+1),CUFFT_D2Z,441);
cufftPlanMany(&inverse_plan,2,n,onembed,1,32*32,inembed,1,32*32,CUFFT_Z2D,441);
After I did the forward and inverse FFTs using the above plans, I could not get the original data back.
Can anyone advise me how to set the parameters correctly for cufftPlanMany? Many thanks in advance.
By the way, is cufftPlanMany the best way to do this for my situation?
Here is a full example of how to use cufftPlanMany to perform batched forward and inverse transformations in CUDA. The example performs float to cufftComplex transformations and back. The final result of the forward+inverse transformation is correct up to a multiplicative constant equal to the overall number of matrix elements nRows*nCols.
#include <stdio.h>
#include <stdlib.h>
#include <cufft.h>
#include <assert.h>
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) { getchar(); exit(code); }
}
}
/*********************/
/* CUFFT ERROR CHECK */
/*********************/
static const char *_cudaGetErrorEnum(cufftResult error)
{
switch (error)
{
case CUFFT_SUCCESS:
return "CUFFT_SUCCESS";
case CUFFT_INVALID_PLAN:
return "CUFFT_INVALID_PLAN";
case CUFFT_ALLOC_FAILED:
return "CUFFT_ALLOC_FAILED";
case CUFFT_INVALID_TYPE:
return "CUFFT_INVALID_TYPE";
case CUFFT_INVALID_VALUE:
return "CUFFT_INVALID_VALUE";
case CUFFT_INTERNAL_ERROR:
return "CUFFT_INTERNAL_ERROR";
case CUFFT_EXEC_FAILED:
return "CUFFT_EXEC_FAILED";
case CUFFT_SETUP_FAILED:
return "CUFFT_SETUP_FAILED";
case CUFFT_INVALID_SIZE:
return "CUFFT_INVALID_SIZE";
case CUFFT_UNALIGNED_DATA:
return "CUFFT_UNALIGNED_DATA";
}
return "<unknown>";
}
#define cufftSafeCall(err) __cufftSafeCall(err, __FILE__, __LINE__)
inline void __cufftSafeCall(cufftResult err, const char *file, const int line)
{
if( CUFFT_SUCCESS != err) {
fprintf(stderr, "CUFFT error in file '%s', line %d\n %s\nerror %d: %s\nterminating!\n",__FILE__, __LINE__,err, \
_cudaGetErrorEnum(err)); \
cudaDeviceReset(); assert(0); \
}
}
/********/
/* MAIN */
/********/
int main() {
cufftHandle forward_plan, inverse_plan;
int batch = 3;
int rank = 2;
int nRows = 5;
int nCols = 5;
int n[2] = {nRows, nCols};
int idist = nRows*nCols;
int odist = nRows*(nCols/2+1);
int inembed[] = {nRows, nCols};
int onembed[] = {nRows, nCols/2+1};
int istride = 1;
int ostride = 1;
cufftSafeCall(cufftPlanMany(&forward_plan, rank, n, inembed, istride, idist, onembed, ostride, odist, CUFFT_R2C, batch));
float *h_in = (float*)malloc(sizeof(float)*nRows*nCols*batch);
for(int i=0; i<nRows*nCols*batch; i++) h_in[i] = 1.f;
float2* h_freq = (float2*)malloc(sizeof(float2)*nRows*(nCols/2+1)*batch);
float* d_in; gpuErrchk(cudaMalloc(&d_in, sizeof(float)*nRows*nCols*batch));
float2* d_freq; gpuErrchk(cudaMalloc(&d_freq, sizeof(float2)*nRows*(nCols/2+1)*batch));
gpuErrchk(cudaMemcpy(d_in,h_in,sizeof(float)*nRows*nCols*batch,cudaMemcpyHostToDevice));
cufftSafeCall(cufftExecR2C(forward_plan, d_in, d_freq));
gpuErrchk(cudaMemcpy(h_freq,d_freq,sizeof(float2)*nRows*(nCols/2+1)*batch,cudaMemcpyDeviceToHost));
for(int i=0; i<nRows*(nCols/2+1)*batch; i++) printf("Direct transform: %i %f %f\n",i,h_freq[i].x,h_freq[i].y);
cufftSafeCall(cufftPlanMany(&inverse_plan, rank, n, onembed, ostride, odist, inembed, istride, idist, CUFFT_C2R, batch));
cufftSafeCall(cufftExecC2R(inverse_plan, d_freq, d_in));
gpuErrchk(cudaMemcpy(h_in,d_in,sizeof(float)*nRows*nCols*batch,cudaMemcpyDeviceToHost));
for(int i=0; i<nRows*nCols*batch; i++) printf("Inverse transform: %i %f \n",i,h_in[i]);
getchar();
}
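Applying the same parameter pattern to the sizes in the question (441 batches of 32-by-32, double precision) would look roughly like the sketch below; d_in (cufftDoubleReal*) and d_freq (cufftDoubleComplex*) are assumed to be suitably sized device buffers, and the round-trip result has to be divided by 32*32 to recover the input:
int n[2]       = {32, 32};
int inembed[2] = {32, 32};
int onembed[2] = {32, 32/2 + 1};
int idist = 32*32, odist = 32*(32/2 + 1), istride = 1, ostride = 1, batch = 441;
cufftHandle fwd, inv;
cufftSafeCall(cufftPlanMany(&fwd, 2, n, inembed, istride, idist, onembed, ostride, odist, CUFFT_D2Z, batch));
cufftSafeCall(cufftPlanMany(&inv, 2, n, onembed, ostride, odist, inembed, istride, idist, CUFFT_Z2D, batch));
cufftSafeCall(cufftExecD2Z(fwd, d_in, d_freq));   // forward transform
cufftSafeCall(cufftExecZ2D(inv, d_freq, d_in));   // inverse transform
// finally scale every element of d_in by 1.0/(32.0*32.0)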

memset in CUBLAS gemm is always launched in default stream

I noticed that when calling the cublasSgemm function, for each call of gemm from the host there are 3 kernel invocations: memset, scal_kernel and the gemm kernel itself (e.g. sgemm_large). This happens even if I use constant alpha/beta allocated in device memory. While the overhead of memset and scal_kernel is relatively small, the problem is that memset is always launched in the default stream, which causes unnecessary synchronization.
The code:
__constant__ __device__ float alpha = 1;
__constant__ __device__ float beta = 1;
int main()
{
// ... memory allocation skipped ...
float* px = thrust::raw_pointer_cast(x.data());
float* py = thrust::raw_pointer_cast(y.data());
float* pmat = thrust::raw_pointer_cast(mat.data());
for (int iter = 0; iter < 3; ++iter)
{
cbstatus = cublasSgemm(cbh, CUBLAS_OP_N, CUBLAS_OP_N, crow, ccol, cshared, &alpha, px, crow, py, cshared, &beta, pmat, crow);
assert(0 == cbstatus);
}
}
This is what I see in the profiler:
The question: is there a way to avoid the memset, or to make it run in the stream assigned to the CUBLAS handle?
One idea is to use dynamic parallelism and run a device version of the gemm function, but this will work only on CC 3.5 and higher.
There was a bug in CUBLAS 5.5 where a cudaMemset was used instead of cudaMemsetAsync in the specialized path where k >> m,n.
It is fixed in CUBLAS 6.0 RC, and you can get access to it if you are a registered developer.
By the way, I wonder why you use __constant__ __device__ for alpha, beta.
Are you using pointerMode = DEVICE?
If not, you could simply use alpha, beta on the host.
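If device-resident alpha/beta is really what you want, the handle has to be told so; a one-line sketch (otherwise cuBLAS assumes host pointer mode and dereferences &alpha on the host):
cublasSetPointerMode(cbh, CUBLAS_POINTER_MODE_DEVICE);  // alpha/beta are now read from device memory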
Try the code below. The code is conceived to have only a cublasSgemm call, apart from unavoidable memory allocations and copies. You will see that
You have only one kernel launched (gemm_kernel1x1_core);
The two calls to cublasSgemm run perfectly in two different streams.
In the picture, the Visual Profiler timeline is shown.
My system: GeForce 540M, Windows 7, CUDA 5.5.
#include <conio.h>
#include <stdio.h>
#include <assert.h>
#include <cublas_v2.h>
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) { getchar(); exit(code); }
}
}
/**********************/
/* cuBLAS ERROR CHECK */
/**********************/
#ifndef cublasSafeCall
#define cublasSafeCall(err) __cublasSafeCall(err, __FILE__, __LINE__)
#endif
inline void __cublasSafeCall(cublasStatus_t err, const char *file, const int line)
{
if( CUBLAS_STATUS_SUCCESS != err) {
fprintf(stderr, "CUBLAS error in file '%s', line %d\n \nerror %d \nterminating!\n",__FILE__, __LINE__,err);
getch(); cudaDeviceReset(); assert(0);
}
}
/********/
/* MAIN */
/********/
int main()
{
int N = 5;
float *A1, *A2, *B1, *B2, *C1, *C2;
float *d_A1, *d_A2, *d_B1, *d_B2, *d_C1, *d_C2;
A1 = (float*)malloc(N*N*sizeof(float));
B1 = (float*)malloc(N*N*sizeof(float));
C1 = (float*)malloc(N*N*sizeof(float));
A2 = (float*)malloc(N*N*sizeof(float));
B2 = (float*)malloc(N*N*sizeof(float));
C2 = (float*)malloc(N*N*sizeof(float));
gpuErrchk(cudaMalloc((void**)&d_A1,N*N*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_B1,N*N*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_C1,N*N*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_A2,N*N*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_B2,N*N*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_C2,N*N*sizeof(float)));
for (int i=0; i<N*N; i++) {
A1[i] = ((float)rand()/(float)RAND_MAX);
A2[i] = ((float)rand()/(float)RAND_MAX);
B1[i] = ((float)rand()/(float)RAND_MAX);
B2[i] = ((float)rand()/(float)RAND_MAX);
}
gpuErrchk(cudaMemcpy(d_A1, A1, N*N*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_B1, B1, N*N*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_A2, A2, N*N*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_B2, B2, N*N*sizeof(float), cudaMemcpyHostToDevice));
cublasHandle_t handle;
cublasSafeCall(cublasCreate(&handle));
cudaStream_t stream1, stream2;
gpuErrchk(cudaStreamCreate(&stream1));
gpuErrchk(cudaStreamCreate(&stream2));
float alpha = 1.f;
float beta = 1.f;
cublasSafeCall(cublasSetStream(handle,stream1));
cublasSafeCall(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A1, N, d_B1, N, &beta, d_C1, N));
cublasSafeCall(cublasSetStream(handle,stream2));
cublasSafeCall(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A2, N, d_B2, N, &beta, d_C2, N));
gpuErrchk(cudaDeviceReset());
return 0;
}