I have a linear array of unsigned chars representing a 2D array. I would like to place it into a CUDA 2D texture and perform (floating point) linear interpolation on it, i.e., have the texture call fetch the 4 nearest unsigned char neighbors, internally convert them to float, interpolate between them, and return the resulting floating point value.
I am having some difficulty setting up the texture and binding it to a texture reference. I have been through the CUDA reference manual & appendices, but I'm just not having any luck.
Below is runnable code to set up and bind 1) a floating point texture and 2) an unsigned char texture. The floating point code runs just fine. However, if you uncomment the two commented unsigned char lines toward the bottom, an "invalid argument" error is thrown.
#include <cstdio>
#include <cuda_runtime.h>
// Shorthand for the 8-bit texel type used by the second texture
typedef unsigned char uchar;
// Define (global) texture references, one per texel type. Both are declared
// with "cudaReadModeNormalizedFloat", which the author states is required
// for ordinal (integer-typed) textures.
texture<float, cudaTextureType2D, cudaReadModeNormalizedFloat> texRefFloat;
texture<uchar, cudaTextureType2D, cudaReadModeNormalizedFloat> texRefUChar;
// Define size of (row major) textures, in texels
size_t const WIDTH = 1000;
size_t const HEIGHT = 1000;
// Total number of texels in one texture
size_t const TOT_PIX = WIDTH*HEIGHT;
// Allocates one packed device buffer per texel type, binds each to its
// global 2D texture reference with linear filtering, and reports any
// binding error. Exits with -1 on a binding failure, returns 0 otherwise.
int main(void)
{
    // Describe the texel layout of each texture
    cudaChannelFormatDesc fmtFloat = cudaCreateChannelDesc<float>();
    cudaChannelFormatDesc fmtUChar = cudaCreateChannelDesc<uchar>();

    // Ask for linear interpolation on both texture references
    texRefFloat.filterMode = cudaFilterModeLinear;
    texRefUChar.filterMode = cudaFilterModeLinear;

    // Device buffers backing the textures (rows packed back to back)
    float * devFloat;
    uchar * devUChar;
    cudaMalloc(&devFloat, sizeof(float)*TOT_PIX);
    cudaMalloc(&devUChar, sizeof(uchar)*TOT_PIX);

    // Row pitch, in bytes, of the packed float buffer
    size_t const pitchFloat = sizeof(float)*WIDTH;

    // Bind the texture references to the buffers
    cudaError_t statusFloat = cudaBindTexture2D(0, texRefFloat, devFloat,
                                                fmtFloat, WIDTH, HEIGHT,
                                                pitchFloat);
    cudaError_t statusUChar = cudaSuccess;
    // Uncomment the following two lines for an error
    //statusUChar = cudaBindTexture2D(0, texRefUChar, devUChar, fmtUChar,
    //                                WIDTH, HEIGHT, sizeof(uchar)*WIDTH);

    // Report the first binding that failed, float texture first
    if (cudaSuccess != statusFloat)
    {
        printf("Error binding float texture reference: %s\n",
               cudaGetErrorString(statusFloat));
        exit(-1);
    }
    if (cudaSuccess != statusUChar)
    {
        printf("Error binding unsigned char texture reference: %s\n",
               cudaGetErrorString(statusUChar));
        exit(-1);
    }
    return 0;
}
Any help/insight would be most appreciated!
Aaron
Each row of a texture must be properly aligned. This cannot be guaranteed in general if you bind the texture to a plain array (as opposed to a CUDA array). To bind plain memory to a 2D texture, you would want to allocate the memory with cudaMallocPitch(). This sets the row pitch such that it is suitable for binding to a texture. Note that it is not good practice to pass 0 as the first argument to a texture binding API call. This argument is for CUDA to return an offset to the app. If the offset is non-zero you will need to add it to the texture coordinate during texture access.
Here is a quick example that shows how to read interpolated values from a texture whose elements are unsigned char.
#include <stdlib.h>
#include <stdio.h>
// Macro to catch CUDA errors in CUDA runtime calls.
// Evaluates `call` exactly once. If the result is not cudaSuccess, prints the
// source file, line and CUDA error string to stderr and terminates the
// process with EXIT_FAILURE. The do/while(0) wrapper makes the expansion a
// single statement, so it is safe inside unbraced if/else.
#define CUDA_SAFE_CALL(call)                                                  \
do {                                                                          \
    /* Parenthesized argument + suffixed local name: an expression that   */ \
    /* itself mentions a variable called `err` is not captured by ours.   */ \
    cudaError_t cuda_safe_call_err_ = (call);                                 \
    if (cudaSuccess != cuda_safe_call_err_) {                                 \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",        \
                 __FILE__, __LINE__,                                          \
                 cudaGetErrorString(cuda_safe_call_err_) );                   \
        exit(EXIT_FAILURE);                                                   \
    }                                                                         \
} while (0)
// Macro to catch CUDA errors in kernel launches.
// Place it directly after a <<<...>>> launch. It first reads the error left
// by the launch itself, then blocks until the device is idle so that a
// failure during kernel execution is reported here as well. On any error it
// prints file, line and error string to stderr and exits with EXIT_FAILURE.
#define CHECK_LAUNCH_ERROR()                                                  \
do {                                                                          \
    /* Check synchronous errors, i.e. pre-launch */                           \
    cudaError_t check_launch_err_ = cudaGetLastError();                       \
    if (cudaSuccess != check_launch_err_) {                                   \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",        \
                 __FILE__, __LINE__,                                          \
                 cudaGetErrorString(check_launch_err_) );                     \
        exit(EXIT_FAILURE);                                                   \
    }                                                                         \
    /* Check asynchronous errors, i.e. kernel failed (ULF).                */ \
    /* cudaDeviceSynchronize replaces the deprecated cudaThreadSynchronize */ \
    check_launch_err_ = cudaDeviceSynchronize();                              \
    if (cudaSuccess != check_launch_err_) {                                   \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",        \
                 __FILE__, __LINE__,                                          \
                 cudaGetErrorString(check_launch_err_) );                     \
        exit(EXIT_FAILURE);                                                   \
    }                                                                         \
} while (0)
// 2D texture of unsigned char texels; fetches through it return float
texture<unsigned char, 2, cudaReadModeNormalizedFloat> tex;

// Debug kernel: walks an m-row by n-column grid of sample points and prints
// the texture value fetched at each one, one text line per row. Every sample
// sits at the texel centre (+0.5) displaced by (shift_x, shift_y). The loops
// are run in full by each thread that executes the kernel.
__global__ void kernel (int m, int n, float shift_x, float shift_y)
{
    for (int r = 0; r < m; ++r) {
        // The y coordinate is the same for the whole row
        const float y = r+0.5f+shift_y;
        for (int c = 0; c < n; ++c) {
            const float x = c+0.5f+shift_x;
            const float sample = tex2D (tex, x, y);
            printf ("%.2f ", sample);
        }
        printf ("\n");
    }
}
// Uploads a 4x3 unsigned char image into pitched device memory, binds it to
// the 2D texture reference `tex` with linear filtering and unnormalized
// coordinates, and prints the texture contents three times: unshifted,
// shifted by half a texel in x, and shifted by half a texel in y.
// Returns EXIT_SUCCESS, or EXIT_FAILURE if the binding reports a non-zero
// texture offset. CUDA errors terminate the process via the checking macros.
int main (void)
{
    int m = 4; // height = #rows
    int n = 3; // width = #columns
    size_t pitch, tex_ofs;
    unsigned char arr[4][3]= {{11,12,13},{21,22,23},{31,32,33},{251,252,253}};
    unsigned char *arr_d = 0;

    // Pitched allocation: the runtime chooses the row pitch, and the same
    // pitch is passed to the bind call below.
    CUDA_SAFE_CALL(cudaMallocPitch((void**)&arr_d,&pitch,n*sizeof(*arr_d),m));
    CUDA_SAFE_CALL(cudaMemcpy2D(arr_d, pitch, arr, n*sizeof(arr[0][0]),
                                n*sizeof(arr[0][0]),m,cudaMemcpyHostToDevice));

    tex.normalized = false;
    tex.filterMode = cudaFilterModeLinear;
    CUDA_SAFE_CALL (cudaBindTexture2D (&tex_ofs, &tex, arr_d, &tex.channelDesc,
                                       n, m, pitch));
    if (tex_ofs !=0) {
        // The kernel does not add an offset to its coordinates, so give up,
        // but release the binding and the allocation first.
        printf ("tex_ofs = %zu\n", tex_ofs);
        CUDA_SAFE_CALL (cudaUnbindTexture (&tex));
        CUDA_SAFE_CALL (cudaFree (arr_d));
        return EXIT_FAILURE;
    }

    printf ("reading array straight\n");
    kernel<<<1,1>>>(m, n, 0.0f, 0.0f);
    CHECK_LAUNCH_ERROR();
    CUDA_SAFE_CALL (cudaDeviceSynchronize());

    printf ("reading array shifted in x-direction\n");
    kernel<<<1,1>>>(m, n, 0.5f, 0.0f);
    CHECK_LAUNCH_ERROR();
    CUDA_SAFE_CALL (cudaDeviceSynchronize());

    printf ("reading array shifted in y-direction\n");
    kernel<<<1,1>>>(m, n, 0.0f, 0.5f);
    // Check this launch like the two before it
    CHECK_LAUNCH_ERROR();
    CUDA_SAFE_CALL (cudaDeviceSynchronize());

    // Drop the binding before freeing the memory it refers to
    CUDA_SAFE_CALL (cudaUnbindTexture (&tex));
    CUDA_SAFE_CALL (cudaFree (arr_d));
    return EXIT_SUCCESS;
}
The output of this program is as follows:
reading array straight
0.04 0.05 0.05
0.08 0.09 0.09
0.12 0.13 0.13
0.98 0.99 0.99
reading array shifted in x-direction
0.05 0.05 0.05
0.08 0.09 0.09
0.12 0.13 0.13
0.99 0.99 0.99
reading array shifted in y-direction
0.06 0.07 0.07
0.10 0.11 0.11
0.55 0.56 0.56
0.98 0.99 0.99
Related
I am trying to perform a sum reduction using CUB and 2D arrays of type float/double.
Although it works for certain combinations of rows+columns, for relatively larger arrays, I get an illegal memory access error during the last transfer.
A minimal example is the following:
#include <stdio.h>
#include <stdlib.h>
#include <cub/device/device_reduce.cuh>
#include "cuda_runtime.h"
// Floating-point type used throughout: double when DP is defined at compile
// time, float otherwise.
#ifdef DP
#define real double
#else
#define real float
#endif

// Fill vec[0..num) with pseudo-random values spread over [start, finish].
// Uses rand(), so the sequence depends on the current srand() seed.
void generatedata(const int num, real* vec, real start, real finish) {
    const real rrange = finish - start;
    for (int i = 0; i < num; ++i) {
        // Divide in `real`, not float, so the DP build keeps double precision
        vec[i] = rand() / real(RAND_MAX) * rrange + start;
    }
}

// Host reference: return the sum of vec[0..num).
// The running total is kept in double even when `real` is float. A float
// accumulator stops absorbing small addends once the total grows large,
// which makes the reference less accurate than the result it is compared to.
real reduce_to_sum(const int num, const real* vec) {
    double total = 0.0;
    for (int i = 0; i < num; ++i)
        total += vec[i];
    return real(total);
}
// Reproducer: fills a rows x cols host matrix with random values, copies it
// into pitched device memory, sums it with cub::DeviceReduce::Sum and prints
// the difference to a host-side reference sum.
// Returns 0 on success and -999 on any reported failure.
int main() {
int rows = 2001;
int cols = 3145;
// Number of logical elements (no row padding)
size_t msize = rows * cols;
real* data = (real*)malloc(msize * sizeof(real));
if (!data)
return -999;
generatedata(msize, data, 0., 50.);
// Host reference value for the final comparison
real ref_sum = reduce_to_sum(msize, data);
real* d_data_in = nullptr;
real* d_data_out = nullptr;
// Row pitches, in bytes, as chosen by cudaMallocPitch
size_t pitch_in, pitch_out;
cudaError_t err = cudaMallocPitch(&d_data_in, &pitch_in, cols * sizeof(real), rows);
if (err != cudaSuccess) {
printf("data_in :: %s \n", cudaGetErrorString(err));
return -999;
}
// NOTE(review): the output buffer gets the same pitched 2D shape as the
// input, although only its first element is read back at the end.
err = cudaMallocPitch(&d_data_out, &pitch_out, cols * sizeof(real), rows);
if (err != cudaSuccess) {
printf("data_out :: %s \n", cudaGetErrorString(err));
return -999;
}
// Zero the whole pitched allocation, padding included, before the copy
err = cudaMemset(d_data_in, 0, rows * pitch_in);
if (err != cudaSuccess) {
printf("set data_in :: %s \n", cudaGetErrorString(err));
return -999;
}
// Host rows are densely packed (cols * sizeof(real)); device rows use pitch_in
err = cudaMemcpy2D(d_data_in, pitch_in, data, cols * sizeof(real), cols * sizeof(real), rows, cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
printf("copy data :: %s \n", cudaGetErrorString(err));
return -999;
}
void* d_temp = nullptr;
size_t temp_bytes = 0;
// d_temp is still nullptr here and temp_bytes is used as the allocation size
// right after, so this first call is presumably a size query only; confirm
// against the CUB documentation.
// NOTE(review): the last argument is rows * pitch_out, a byte count. The
// reply after this listing says it must be the number of input elements.
cub::DeviceReduce::Sum(d_temp, temp_bytes, d_data_in, d_data_out, rows * pitch_out);
err = cudaMalloc(&d_temp, temp_bytes);
if (err != cudaSuccess) {
printf("temp :: %s \n", cudaGetErrorString(err));
return -999;
}
err = cudaMemset(d_data_out, 0, rows * pitch_out);
if (err != cudaSuccess) {
printf("set temp :: %s \n", cudaGetErrorString(err));
return -999;
}
// Run sum-reduction
// NOTE(review): same rows * pitch_out item count as in the call above; the
// status returned by Sum itself is not inspected.
cub::DeviceReduce::Sum(d_temp, temp_bytes, d_data_in, d_data_out, rows * pitch_out);
// Only sees errors already recorded at this point. The reply after this
// listing notes Sum may return before the reduction has finished, in which
// case a failure is reported by the cudaMemcpy below instead.
err = cudaGetLastError();
if (err != cudaSuccess) {
printf("reduction :: %s \n", cudaGetErrorString(err));
return -999;
}
real gpu_sum = real(0.0);
// Read back the single result value from the start of the output buffer
err = cudaMemcpy(&gpu_sum, d_data_out, sizeof(real), cudaMemcpyDeviceToHost);
if (err != cudaSuccess) {
printf("copy final :: %s \n", cudaGetErrorString(err));
return -999;
}
printf("Difference in sum (h)%f - (d)%f = %f \n", ref_sum, gpu_sum, ref_sum - gpu_sum);
// Release host and device buffers; the early returns above skip this cleanup
if (data) free(data);
if (d_data_in) cudaFree(d_data_in);
if (d_data_out) cudaFree(d_data_out);
if (d_temp) cudaFree(d_temp);
cudaDeviceReset();
return 0;
}
The error is thrown at "copy final ::". I am a bit confused as to why certain combinations of rows x columns work and others don't. I did notice that it is the larger values that cause it, but I can't get my head around why.
Any suggestions would be much appreciated.
The 5th parameter of cub::DeviceReduce::Sum should be the number of input elements. However, rows * pitch_out is the size of the output buffer in bytes.
Assuming pitch_in % sizeof(real) == 0, the following call may work.
cub::DeviceReduce::Sum(d_temp, temp_bytes, d_data_in, d_data_out, rows * (pitch_in / sizeof(real)));
Also note that cub::DeviceReduce::Sum may return before the reduction is complete. In this case, if any error happened during execution, this error will be reported by cudaMemcpy.
I am trying to solve a least squares problem via the "magma_dgels_gpu()" function of the MAGMA library. My GPU is a "Tesla C2050 / C2075" and I have installed MAGMA.
I am trying to compile the code below, "testMagmaDGELS.cu", but I get errors:
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cublas.h>
#include "magma.h"
/* Allocate room for __size elements of __type on the host and store the
 * address in __ptr. If malloc fails, print a diagnostic naming the pointer
 * and terminate with exit(-1).
 * The body is wrapped in do/while(0) so that the macro expands to exactly
 * one statement and can be used in an unbraced if/else. */
#define UTILS_MALLOC(__ptr, __type, __size) \
do { \
    (__ptr) = (__type*)malloc((__size) * sizeof(__type)); \
    if ((__ptr) == 0) { \
        fprintf (stderr, "!!!! Malloc failed for: %s\n", #__ptr ); \
        exit(-1); \
    } \
} while (0)
/* Allocate room for __size elements of __type on the device with cudaMalloc
 * and store the address in __ptr. If the allocation fails, print a
 * diagnostic naming the pointer and terminate with exit(-1).
 * Wrapped in do/while(0): as a bare `if`, a following `else` in the calling
 * code would silently attach to the macro's own condition. */
#define UTILS_DEVALLOC(__ptr, __type, __size) \
do { \
    if( cudaSuccess != cudaMalloc( (void**)&(__ptr), (__size)*sizeof(__type) ) ){ \
        fprintf (stderr, "!!!! cudaMalloc failed for: %s\n", #__ptr ); \
        exit(-1); \
    } \
} while (0)
/* Solve a small linear least squares problem (M = 5 rows, N = 3 unknowns,
 * one right-hand side) on the GPU: upload A and B through CUBLAS, call
 * magma_dgels_gpu once to query the workspace size and once to solve, then
 * copy the result back and print it. Exits with a non-zero status on failure. */
int main(int argc, char** argv)
{
if( CUBLAS_STATUS_SUCCESS != cublasInit( ) ) {
fprintf(stderr, "CUBLAS: Not initialized\n"); exit(-1);
}
/* lWorkQuery receives the workspace size reported by the query call below */
double *devA, *devB, *pWork, lWorkQuery[1];
const int M = 5, N = 3;
int ret, info;
/* Allocate device memory for the matrix (column-major) */
/* Host leading dimension: columns are stored back to back */
int lda = M;
/* Device leading dimension: M rounded up to the next multiple of 32 */
int ldda = ((M + 31) / 32) * 32;
UTILS_DEVALLOC(devA, double, ldda * N);
UTILS_DEVALLOC(devB, double, M);
/* Initialize the matrix */
/* A is declared as N arrays of M doubles, so each inner initializer list is
 * one column of the M-by-N matrix when read with leading dimension lda. */
double A[N][M] = {{ 0.6, 5.0, 1.0, -1.0, -4.2 },
{ 1.2, 4.0, -4.0, -2.0, -8.4 },
{ 3.9, 2.5, -5.5, -6.5, -4.8 }};
/* NOTE(review): the status returned by cublasSetMatrix is not checked */
cublasSetMatrix(M, N, sizeof(double), A, lda, devA, ldda);
/* Right-hand side, uploaded as an M-by-1 matrix */
double B[M] = {3.0, 4.0, -1.0, -5.0, -1.0};
cublasSetMatrix(M, 1, sizeof(double), B, M, devB, M);
/* Resolve the LLSP using MAGMA */
/* NOTE(review): the first argument is the char 'N'; the compiler output
 * quoted after this listing rejects it as incompatible with magma_trans_t. */
ret = magma_dgels_gpu('N', M, N, 1 /* number of columns in the matrix B */,
devA, ldda, devB, M,
lWorkQuery, -1, // query the optimal work space
&info);
if (info < 0) {
printf("Argument %d of magma_dgels_gpu had an illegal value.\n", -info);
exit(1);
} else if (ret != MAGMA_SUCCESS) {
printf("magma_dgels_gpu failed (code %d).\n", ret);
exit(1);
}
/* The workspace size comes back as a double and is truncated to int */
int lwork = (int)lWorkQuery[0];
printf("Optimal work space %d\n", lwork);
UTILS_MALLOC(pWork, double, lwork);
/* Second call: same arguments, now with the real workspace */
ret = magma_dgels_gpu('N', M, N, 1 /* number of columns in the matrix B */,
devA, ldda, devB, M,
pWork, lwork,
&info);
if (info < 0) {
printf("Argument %d of magma_dgels_gpu had an illegal value.\n", -info);
exit(1);
} else if (ret != MAGMA_SUCCESS) {
printf("magma_dgels_gpu failed (code %d).\n", ret);
exit(1);
} else {
printf("LLSP solved successfully\n");
}
/* Copy devB back into B; the loop below prints its first N entries */
cublasGetMatrix(M, 1, sizeof(double), devB, M, B, M);
/* Expected solution vector: 0.953333 -0.843333 0.906667 */
printf("Solution vector:\n");
for (int i = 0; i < N; i++) {
printf("\t%lf\n", B[i]);
}
/* Memory clean up */
free( pWork );
cudaFree( devA );
cudaFree( devB );
/* Shutdown */
cublasShutdown();
return 0;
}
I compile it as follows:
nvcc -arch=sm_20 testMagmaDGELS.cu -o testMagmaDGELS -lcublas -I/opt/magma/1.7.0/openblas/gcc/include
And I get these errors:
team24@tesla:~$ nvcc -arch=sm_20 testMagmaDGELS.cu -o testMagmaDGELS -lcublas -I/opt/magma/1.7.0/openblas/gcc/include
testMagmaDGELS.cu(54): error: argument of type "char" is incompatible with parameter of type "magma_trans_t"
testMagmaDGELS.cu(70): error: argument of type "char" is incompatible with parameter of type "magma_trans_t"
2 errors detected in the compilation of "/tmp/tmpxft_00002d95_00000000-8_testMagmaDGELS.cpp1.ii".
Could anyone help me?
Use the magma type for indication of transpose/no transpose, instead of using a char type.
so instead of this:
ret = magma_dgels_gpu('N', ...
do this:
magma_trans_t my_trans = MagmaNoTrans;
ret = magma_dgels_gpu(my_trans, ...
See the documentation here.
magma_trans_t magma_trans_const ( character ) Map 'N', 'T', 'C'
to MagmaNoTrans, MagmaTrans, MagmaConjTrans
I am receiving the error:
Cufft error in file
I am using this file in order to load the FFT and pass them to another file.
//----function to check for errors-------------------------------------------------
// gpuErrchk(ans) passes the result of a CUDA runtime call to gpuAssert
// together with the source location of the call site.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

// Report a failed CUDA runtime call.
//   code  - status returned by the call
//   file  - source file of the call site (the macro passes __FILE__)
//   line  - source line of the call site (the macro passes __LINE__)
//   abort - when true (the default), terminate with `code` as exit status
// Does nothing when code == cudaSuccess.
// `file` is const because __FILE__ is a string literal, which must not be
// bound to a plain char*.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"\nGPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
//function to check for cuFFT errors --------------------------------------------------
// Evaluates `call` once. If the result is not CUFFT_SUCCESS, prints the
// source file, line and the numeric cufftResult code to stderr and exits
// with EXIT_FAILURE. The code is printed instead of a fixed placeholder so
// that the failure can be looked up in the cufftResult enumeration.
#define CUFFT_SAFE_CALL( call) do {                                         \
    cufftResult cufft_safe_call_err_ = (call);                              \
    if (cufft_safe_call_err_ != CUFFT_SUCCESS) {                            \
        fprintf(stderr,                                                     \
                "Cufft error in file '%s' in line %i : error code %d.\n",   \
                __FILE__, __LINE__, (int)cufft_safe_call_err_ );            \
        exit(EXIT_FAILURE);                                                 \
    }                                                                       \
} while (0)
// FFT problem dimensions.
// NX is parenthesized so it stays a single value inside larger expressions
// (for example `x / NX` or `x % NX`).
#define NX (128*128)   // first transform dimension
#define NY 16          // second transform dimension
#define BATCH 16       // batch count passed to cufftPlanMany
#define NRANK 2        // number of transform dimensions
// Allocates a host array of NX*NY*BATCH double-complex values, fills it with
// a ramp, copies it to the device, runs a forward Z2Z transform and copies
// the device buffer back into the host array.
//   B_in - out parameter; receives the malloc'ed host array. It is not freed
//          here, so the caller presumably owns it.
// NOTE(review): the early `return` paths below leave B_dev (and, once
// created, the plan) unreleased.
void FFT_transform(cufftDoubleComplex** B_in)
{
// Transform dimensions, handed to cufftPlanMany
int n[NRANK] = {NX, NY};
//size of B
int Bsize=NX*NY*BATCH;
//allocate host memory
*B_in=(cufftDoubleComplex*)malloc(Bsize*sizeof(cufftDoubleComplex));
// Element k = i*BATCH+j gets real part 2k and imaginary part 2k+1
for (int i=0;i<NX*NY;i++){
for (int j=0;j<BATCH;j++){
(*B_in)[i*BATCH+j].x=(i*BATCH+j)*2;
(*B_in)[i*BATCH+j].y=(i*BATCH+j)*2+1;
}
}
//allocate device memory
cufftDoubleComplex* B_dev;
gpuErrchk(cudaMalloc((void**) &B_dev,Bsize* sizeof(cufftDoubleComplex)));
// gpuErrchk above already exits on failure with its default abort=true
if (cudaGetLastError() != cudaSuccess){
fprintf(stderr, "Cuda error: Failed to allocate\n");
return;
}
// copy arrays from host to device
gpuErrchk(cudaMemcpy(B_dev, *B_in,Bsize* sizeof(cufftDoubleComplex), cudaMemcpyHostToDevice));
// Create a 2D FFT plan
cufftHandle plan;
CUFFT_SAFE_CALL(cufftPlan2d(&plan,NX,NY,CUFFT_Z2Z));
// NOTE(review): a second plan is created into the same handle; the one from
// cufftPlan2d is presumably never destroyed.
if (cufftPlanMany(&plan, NRANK, n,NULL, 1, 0,NULL, 1, 0,CUFFT_Z2Z,BATCH) != CUFFT_SUCCESS){
fprintf(stderr, "CUFFT Error: Unable to create plan\n");
return;
}
if (cufftSetCompatibilityMode(plan, CUFFT_COMPATIBILITY_NATIVE)!= CUFFT_SUCCESS){
fprintf(stderr, "CUFFT Error: Unable to set compatibility mode to native\n");
return;
}
// perform transform
// NOTE(review): the input argument is *B_in, the host array from malloc
// above; the reply after this listing identifies this as the cause of the
// reported error and says to pass B_dev as both input and output.
CUFFT_SAFE_CALL(cufftExecZ2Z(plan,(cufftDoubleComplex *)(*B_in), (cufftDoubleComplex *)B_dev, CUFFT_FORWARD));
// NOTE(review): the same transform is issued a second time here
if (cufftExecZ2Z(plan,*B_in,B_dev,CUFFT_FORWARD) != CUFFT_SUCCESS){
fprintf(stderr, "CUFFT Error: Unable to execute plan\n");
return;
}
// Wait for the device before reading the result
if (cudaThreadSynchronize() != cudaSuccess){
fprintf(stderr, "Cuda error: Failed to synchronize\n");
return;
}
// copy result from device to host
gpuErrchk(cudaMemcpy(*B_in, B_dev,Bsize*sizeof(cufftDoubleComplex), cudaMemcpyDeviceToHost));
//Destroy CUFFT context
CUFFT_SAFE_CALL(cufftDestroy(plan));
//clean up device memory
gpuErrchk(cudaFree(B_dev));
}
I am receiving the error at line:
CUFFT_SAFE_CALL(cufftExecZ2Z(plan,(cufftDoubleComplex *)(*B_in), (cufftDoubleComplex *)B_dev, CUFFT_FORWARD));
You are getting the error because B_in is a pointer to host memory and not to device memory, which is illegal. In CUFFT, inputs are always in device memory. You need to use cudaMemcpy to transfer the contents of B_in to B_dev before performing the transform, and then supply B_dev as both the input and output, which will result in an in place transform. This is clearly described in the CUFFT API documentation here.
I seem to have an issue with the function cudaMemcpyToArray. I have the following commands:
float *h_data = new float[bmp.width * bmp.height];
...
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaArray *cuArray;
cudaMallocArray(&cuArray, &channelDesc, bmp.width, bmp.height);
cudaMemcpyToArray(cuArray, 0, 0, h_data, bmp.width * bmp.height, cudaMemcpyHostToDevice);
As far as I understand, this should give me a 2D array in cuArray that has dimensions bmp.width by bmp.height from the data in h_data, which is a 1D array with dimensions bmp.width * bmp.height. Unfortunately, it just seg-faults on the last command. Am I doing something horribly wrong?
I think @lmortenson was on the right track, but we don't multiply both width and height by sizeof(float), just one of them.
You need to make sure that your bmp.width and bmp.height parameters conform to the limits specified here under Valid extents. These extents are in elements, not bytes.
You need to pass width and height parameters to cudaMallocArray that are in elements, not bytes.
You need to pass an overall size parameter to cudaMemcpyToArray that is in bytes, not elements, but this would generally be of the form width*height*sizeof(float)
I created a simple reproducer based on your code and was able to reproduce the seg fault. The following code was my adaptation with the errors fixed, I believe:
#include <stdio.h>
// Abort with a diagnostic if the CUDA runtime has an error pending.
// `msg` labels the operation being checked; it is printed together with the
// runtime's error string and the source location of the macro invocation.
// Reading the error with cudaGetLastError() also clears it.
#define cudaCheckErrors(msg)                                         \
    do {                                                             \
        const cudaError_t lastError_ = cudaGetLastError();           \
        if (lastError_ == cudaSuccess)                               \
            break; /* nothing pending: leave the macro */            \
        fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n",           \
                msg, cudaGetErrorString(lastError_),                 \
                __FILE__, __LINE__);                                 \
        fprintf(stderr, "*** FAILED - ABORTING\n");                  \
        exit(1);                                                     \
    } while (0)
// Minimal reproducer for copying a host float image into a 2D CUDA array.
// cudaMallocArray takes width and height in elements; cudaMemcpyToArray
// takes the total size in bytes. Returns 0 on success; any CUDA error
// aborts through cudaCheckErrors.
int main(){
    const int width = 256;
    const int height = 256;

    // Value-initialized (all zeros) so the copy below never reads
    // indeterminate host memory.
    float *h_data = new float[width * height]();

    // One 32-bit float channel per element
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaArray *cArray;
    cudaMallocArray(&cArray, &channelDesc, width, height, cudaArrayDefault);
    cudaCheckErrors("cudaMallocArray");

    // Size argument is in bytes: width * height * sizeof(float)
    cudaMemcpyToArray(cArray, 0, 0, h_data, width*height*sizeof(float), cudaMemcpyHostToDevice);
    cudaCheckErrors("cudaMemcpyToArray");

    // Release the device array and the host buffer
    cudaFreeArray(cArray);
    cudaCheckErrors("cudaFreeArray");
    delete[] h_data;
    return 0;
}
After implementing matrix multiplication with CUDA, I tried to implement it with CUBLAS (thanks to the advice of some people here in the forum).
I can multiply square matrices but (yes, once again...) I am having difficulties working with non-square matrices. The only type of non-square matrix multiplication that works is when you vary matrix A's width (A*B=C).
I don't get any errors but the resulting matrix returns wrong values. Here is my code(it is basically an adaptation of the simpleCUBLAS SDK example):
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "cublas.h"
/* Matrix dimensions for C = A * B. H* is the row count and W* the column
 * count of the matrix named by the second letter. */
#define HA 2
#define WA 9
#define WB 2
/* B must have as many rows as A has columns */
#define HB WA
/* C has the rows of A and the columns of B */
#define WC WB
#define HC HA
/* Linear offset of element (row i, column j) in a column-major matrix whose
 * leading dimension (rows per stored column) is ld */
#define index(i,j,ld) (((j)*(ld))+(i))
/* Print a matrix to stdout, one text line per matrix row.
 *   P   - matrix data, read in column-major order with leading dimension uHP
 *   uWP - number of columns
 *   uHP - number of rows
 * Each row is preceded by a newline and each value is followed by a space;
 * no newline is written after the last row. */
void printMat(float*P,int uWP,int uHP){
    for (int row = 0; row < uHP; ++row) {
        printf("\n");
        for (int col = 0; col < uWP; ++col) {
            const float value = P[index(row,col,uHP)];
            printf("%f ", value);
        }
    }
}
/* Multiply two column-major matrices with the legacy CUBLAS API:
 * C (HC x WC) = A (HA x WA) * B (HB x WB).
 * A and B are filled with their own linear indices, uploaded, multiplied
 * with cublasSgemm, and all three matrices are printed.
 * Unless the first command-line argument is "-noprompt" or "-qatest", the
 * program waits for ENTER when no arguments are given.
 * Returns EXIT_SUCCESS, or EXIT_FAILURE as soon as any step reports an error. */
int main (int argc, char** argv) {
    cublasStatus status;
    int i,j;

    /* Bring up CUBLAS and make sure it actually initialized */
    status = cublasInit();
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! CUBLAS initialization error\n");
        return EXIT_FAILURE;
    }

    /* Host matrices */
    float *A = (float*)malloc(HA*WA*sizeof(float));
    float *B = (float*)malloc(HB*WB*sizeof(float));
    float *C = (float*)malloc(HC*WC*sizeof(float));
    if (A == 0) {
        fprintf (stderr, "!!!! host memory allocation error (A)\n");
        return EXIT_FAILURE;
    }
    if (B == 0) {
        fprintf (stderr, "!!!! host memory allocation error (B)\n");
        return EXIT_FAILURE;
    }
    if (C == 0) {
        fprintf (stderr, "!!!! host memory allocation error (C)\n");
        return EXIT_FAILURE;
    }

    /* Each element holds its own column-major linear index */
    for (i=0;i<HA;i++)
        for (j=0;j<WA;j++)
            A[index(i,j,HA)] = (float) index(i,j,HA);
    for (i=0;i<HB;i++)
        for (j=0;j<WB;j++)
            B[index(i,j,HB)] = (float) index(i,j,HB);

    float* AA; float* BB; float* CC;

    /*ALLOCATE ON THE DEVICE*/
    status=cublasAlloc(HA*WA,sizeof(float),(void**)&AA);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device memory allocation error (A)\n");
        return EXIT_FAILURE;
    }
    status=cublasAlloc(HB*WB,sizeof(float),(void**)&BB);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device memory allocation error (B)\n");
        return EXIT_FAILURE;
    }
    status=cublasAlloc(HC*WC,sizeof(float),(void**)&CC);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device memory allocation error (C)\n");
        return EXIT_FAILURE;
    }

    /*SET MATRIX*/
    status=cublasSetMatrix(HA,WA,sizeof(float),A,HA,AA,HA);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device write error (A)\n");
        return EXIT_FAILURE;
    }
    status=cublasSetMatrix(HB,WB,sizeof(float),B,HB,BB,HB);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device write error (B)\n");
        return EXIT_FAILURE;
    }

    /*KERNEL*/
    /* No transposes; leading dimensions are the row counts HA, HB and HC */
    cublasSgemm('n','n',HA,WB,WA,1.0f,AA,HA,BB,HB,0.0f,CC,HC);
    /* Legacy Sgemm returns void; its status is fetched separately */
    status = cublasGetError();
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! kernel execution error.\n");
        return EXIT_FAILURE;
    }

    /* Store the copy-back status so the check below tests this call and
     * not the stale result of the Sgemm check */
    status = cublasGetMatrix(HC,WC,sizeof(float),CC,HC,C,HC);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device read error (C)\n");
        return EXIT_FAILURE;
    }

    /* PERFORMANCE OUTPUT*/
    printf("\nMatriz A:\n");
    printMat(A,WA,HA);
    printf("\nMatriz B:\n");
    printMat(B,WB,HB);
    printf("\nMatriz C:\n");
    printMat(C,WC,HC);

    free( A ); free( B ); free ( C );
    status = cublasFree(AA);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! memory free error (A)\n");
        return EXIT_FAILURE;
    }
    status = cublasFree(BB);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! memory free error (B)\n");
        return EXIT_FAILURE;
    }
    status = cublasFree(CC);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! memory free error (C)\n");
        return EXIT_FAILURE;
    }

    /* Shutdown */
    status = cublasShutdown();
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! shutdown error (A)\n");
        return EXIT_FAILURE;
    }

    if (argc > 1) {
        if (!strcmp(argv[1], "-noprompt") ||!strcmp(argv[1], "-qatest") )
        {
            return EXIT_SUCCESS;
        }
    }
    else
    {
        printf("\nPress ENTER to exit...\n");
        getchar();
    }
    return EXIT_SUCCESS;
}
Any thoughts? Also, does anyone have a working matrix multiplication implementation in CUBLAS that I could compare against? Thanks in advance.
I don't understand why you think that code you posted doesn't work. When I compile and run it, the resulting executable produces the same output that I get if I enter the same matrices into matlab and compute the product of them.
CUBLAS is a FORTRAN BLAS, it expects inputs in column major order (and your code is column major ordered). If the results don't match what you want, you must be confusing column and row major ordering somewhere.