CUBLAS works unpredictably - cuda

Wrote my first program using CUDA+CUBLAS. It just uses a 'cublasDgemm' function and computes a product of 2 N*N matrices.
However, all the time I was launching my program, it keeped producing the same wrong answer (e.g. when multiplying 1*1 matrix containing 5 as a single element by 1*1 matrix containing element 6, it always said the result is 36, not 30).
I checked the program several times with no success. But, when I came back to it the nexy day (i.e. after reboot), it worked just fine. I don't remember whether I recompiled it or not, but the truth is that it is the same VS project, same code, same computer with its GPU.
So, can anyone explain me why could that have happened? And do I have to expect same strange behaviour further?
Here is the code I was launching:
#include <iostream>
#include <string>
#include <iomanip>
#include <cuda_runtime.h>
#include <cublas_v2.h>
const int N = 5;
#define IDX2F(i,j) ((i) * N + j)
void fail(const cudaError_t& cudaStatus, const std::string& errorMessage) {
if (cudaStatus != cudaSuccess) {
std::cerr << errorMessage << std::endl;
exit(EXIT_FAILURE);
}
}
void fail(const cublasStatus_t& status, const std::string& errorMessage) {
if (status != CUBLAS_STATUS_SUCCESS) {
std::cerr << errorMessage << std::endl;
exit(EXIT_FAILURE);
}
}
void printMatrix(const double *C) {
for (int i=0; i<N; i++) {
for (int j=0; j<N; j++) {
std::cout << std::fixed << std::setprecision(2) << C[IDX2F(i,j)] << ' ';
}
std::cout << std::endl;
}
std::cout << std::endl;
}
int main(int argc, char **argv) {
cudaError_t cudaStatus;
cublasStatus_t status;
cublasHandle_t handle;
double *A = new double[N*N];
double *devPtrA;
double *B = new double[N*N];
double *devPtrB;
double *C = new double[N*N];
double *devPtrC;
for (int i=0; i<N; i++)
for (int j=0; j<N; j++)
A[IDX2F(i,j)] = i + j;
for (int i=0; i<N; i++)
for (int j=0; j<N; j++)
B[IDX2F(i,j)] = i + j * 0.5;
// do not have to set anything into matrix C, because beta = 0
// allocate mamory on GPU
cudaStatus = cudaMalloc((void**)&devPtrC, N*N*sizeof(*C));
fail(cudaStatus, "device memory allocation failed");
cudaStatus = cudaMalloc((void**)&devPtrA, N*N*sizeof(*A));
fail(cudaStatus, "device memory allocation failed");
cudaStatus = cudaMalloc((void**)&devPtrB, N*N*sizeof(*B));
fail(cudaStatus, "device memory allocation failed");
// create GPU handle
status = cublasCreate(&handle);
fail(status, "CUBLAS initialization failed");
// copying matrices from host to GPU
status = cublasSetMatrix(N, N, sizeof (*B), B, N, devPtrB, N);
fail(status, "failed to load data from host to GPU");
status = cublasSetMatrix(N, N, sizeof (*A), A, N, devPtrA, N);
fail(status, "failed to load data from host to GPU");
const double ONE = 1;
const double ZERO = 0;
printMatrix(A);
printMatrix(B);
status = cublasDgemm( handle,
CUBLAS_OP_N, CUBLAS_OP_N,
N, N, N,
&ONE,
devPtrA, N,
devPtrB, N,
&ZERO,
devPtrC, N);
fail(status, "error cublasDgemm");
status = cublasGetMatrix(N, N, sizeof (*C), devPtrC, N, C, N);
fail(status, "could not load result back from GPU to host");
printMatrix(C);
status = cublasDestroy(handle);
fail(status, "could not destroy CUBLAS handle");
cudaStatus = cudaFree(devPtrC);
fail(cudaStatus, "device memory freeing failed");
cudaStatus = cudaFree(devPtrB);
fail(cudaStatus, "device memory freeing failed");
cudaStatus = cudaFree(devPtrA);
fail(cudaStatus, "device memory freeing failed");
delete[] C;
delete[] B;
delete[] A;
return EXIT_SUCCESS;
}

op(B) must be CUBLAS_OP_T
.
.
status = cublasDgemm( handle,
CUBLAS_OP_N, CUBLAS_OP_T,
N, N, N,
&ONE,
devPtrA, N,
devPtrB, N,
&ZERO,
devPtrC, N);
.
.
.
.
definition is : C = α op ( A ) op ( B ) + β C
http://docs.nvidia.com/cuda/cublas/index.html#topic_8_1

Related

How to cope with "cudaErrorMissingConfiguration" from "cudaMallocPitch" function of CUDA?

I'm making a Mandelbrot set program with CUDA. However I can't step more unless cudaErrorMissingConfiguration from cudaMallocPitch() function of CUDA is to be solved. Could you tell me something about it?
My GPU is GeForce RTX 2060 SUPER.
I'll show you my command lines below.
> nvcc MandelbrotCUDA.cu -o MandelbrotCUDA -O3
I tried cudaDeviceSetLimit( cudaLimitMallocHeapSize, 7*1024*1024*1024 ) to
resize heap size.
cudaDeviceSetLimit was success.
However I cannot step one more. I cannot print "CUDA malloc done!"
#include <iostream>
#include <thrust/complex.h>
#include <fstream>
#include <string>
#include <stdlib.h>
using namespace std;
#define D 0.0000025 // Tick
#define LIMIT_N 255
#define INF_NUM 2
#define PLOT_METHOD 2 // dat file : 0, ppm file : 1, ppm file with C : 2
__global__
void calculation(const int indexTotalX, const int indexTotalY, int ***n, thrust::complex<double> ***c){ // n, c are the pointers of dN, dC.
for(int i = 0; i < indexTotalY ; i++){
for(int j = 0; j < indexTotalX; j++){
thrust::complex<double> z(0.0f, 0.0f);
n[i][j] = 0;
for(int ctr=1; ctr <= LIMIT_N ; ctr++){
z = z*z + (*(c[i][j]));
n[i][j] = n[i][j] + (abs(z) < INF_NUM);
}
}
}
}
int main(){
// Data Path
string filePath = "Y:\\Documents\\Programming\\mandelbrot\\";
string fileName = "mandelbrot4.ppm";
string filename = filePath+fileName;
//complex<double> c[N][M];
double xRange[2] = {-0.76, -0.74};
double yRange[2] = {0.05, 0.1};
const int indexTotalX = (xRange[1]-xRange[0])/D;
const int indexTotalY = (yRange[1]-yRange[0])/D;
thrust::complex<double> **c;
//c = new complex<double> [N];
cout << "debug_n" << endl;
int **n;
n = new int* [indexTotalY];
c = new thrust::complex<double> * [indexTotalY];
for(int i=0;i<indexTotalY;i++){
n[i] = new int [indexTotalX];
c[i] = new thrust::complex<double> [indexTotalX];
}
cout << "debug_n_end" << endl;
for(int i = 0; i < indexTotalY; i++){
for(int j = 0; j < indexTotalX; j++){
thrust::complex<double> tmp( xRange[0]+j*D, yRange[0]+i*D );
c[i][j] = tmp;
//n[i*sqrt(N)+j] = 0;
}
}
// CUDA malloc
cout << "CUDA malloc initializing..." << endl;
int **dN;
thrust::complex<double> **dC;
cudaError_t error;
error = cudaDeviceSetLimit(cudaLimitMallocHeapSize, 7*1024*1024*1024);
if(error != cudaSuccess){
cout << "cudaDeviceSetLimit's ERROR CODE = " << error << endl;
return 0;
}
size_t tmpPitch;
error = cudaMallocPitch((void **)dN, &tmpPitch,(size_t)(indexTotalY*sizeof(int)), (size_t)(indexTotalX*sizeof(int)));
if(error != cudaSuccess){
cout << "CUDA ERROR CODE = " << error << endl;
cout << "indexTotalX = " << indexTotalX << endl;
cout << "indexTotalY = " << indexTotalY << endl;
return 0;
}
cout << "CUDA malloc done!" << endl;
This is console messages below.
debug_n
debug_n_end
CUDA malloc initializing...
CUDA ERROR CODE = 1
indexTotalX = 8000
indexTotalY = 20000
There are several problems here:
int **dN;
...
error = cudaMallocPitch((void **)dN, &tmpPitch,(size_t)(indexTotalY*sizeof(int)), (size_t)(indexTotalX*sizeof(int)));
The correct type of pointer to use in CUDA allocations is a single pointer:
int *dN;
not a double pointer:
int **dN;
(so your kernel where you are trying pass triple-pointers:
void calculation(const int indexTotalX, const int indexTotalY, int ***n, thrust::complex<double> ***c){ // n, c are the pointers of dN, dC.
is almost certainly not going to work, and should not be designed that way, but that is not the question you are asking.)
The pointer is passed to the allocating function by its address:
error = cudaMallocPitch((void **)&dN,
For cudaMallocPitch, only the horizontal requested dimension is scaled by the size of the data element. The allocation height is not scaled this way. Also, I will assume X corresponds to your allocation width, and Y corresponds to your allocation height, so you also have those parameters reversed:
error = cudaMallocPitch((void **)&dN, &tmpPitch,(size_t)(indexTotalX*sizeof(int)), (size_t)(indexTotalY));
The cudaLimitMallocHeapSize should not be necessary to set to make any of this work. It applies only to in-kernel allocations. Reserving 7GB on an 8GB card may also cause problems. Until you are sure you need that (it's not needed for what you have shown) I would simply remove that.
$ cat t1488.cu
#include <iostream>
#include <thrust/complex.h>
#include <fstream>
#include <string>
#include <stdlib.h>
using namespace std;
#define D 0.0000025 // Tick
#define LIMIT_N 255
#define INF_NUM 2
#define PLOT_METHOD 2 // dat file : 0, ppm file : 1, ppm file with C : 2
__global__
void calculation(const int indexTotalX, const int indexTotalY, int ***n, thrust::complex<double> ***c){ // n, c are the pointers of dN, dC.
for(int i = 0; i < indexTotalY ; i++){
for(int j = 0; j < indexTotalX; j++){
thrust::complex<double> z(0.0f, 0.0f);
n[i][j] = 0;
for(int ctr=1; ctr <= LIMIT_N ; ctr++){
z = z*z + (*(c[i][j]));
n[i][j] = n[i][j] + (abs(z) < INF_NUM);
}
}
}
}
int main(){
// Data Path
string filePath = "Y:\\Documents\\Programming\\mandelbrot\\";
string fileName = "mandelbrot4.ppm";
string filename = filePath+fileName;
//complex<double> c[N][M];
double xRange[2] = {-0.76, -0.74};
double yRange[2] = {0.05, 0.1};
const int indexTotalX = (xRange[1]-xRange[0])/D;
const int indexTotalY = (yRange[1]-yRange[0])/D;
thrust::complex<double> **c;
//c = new complex<double> [N];
cout << "debug_n" << endl;
int **n;
n = new int* [indexTotalY];
c = new thrust::complex<double> * [indexTotalY];
for(int i=0;i<indexTotalY;i++){
n[i] = new int [indexTotalX];
c[i] = new thrust::complex<double> [indexTotalX];
}
cout << "debug_n_end" << endl;
for(int i = 0; i < indexTotalY; i++){
for(int j = 0; j < indexTotalX; j++){
thrust::complex<double> tmp( xRange[0]+j*D, yRange[0]+i*D );
c[i][j] = tmp;
//n[i*sqrt(N)+j] = 0;
}
}
// CUDA malloc
cout << "CUDA malloc initializing..." << endl;
int *dN;
thrust::complex<double> **dC;
cudaError_t error;
size_t tmpPitch;
error = cudaMallocPitch((void **)&dN, &tmpPitch,(size_t)(indexTotalX*sizeof(int)), (size_t)(indexTotalY));
if(error != cudaSuccess){
cout << "CUDA ERROR CODE = " << error << endl;
cout << "indexTotalX = " << indexTotalX << endl;
cout << "indexTotalY = " << indexTotalY << endl;
return 0;
}
cout << "CUDA malloc done!" << endl;
}
$ nvcc -o t1488 t1488.cu
t1488.cu(68): warning: variable "dC" was declared but never referenced
$ cuda-memcheck ./t1488
========= CUDA-MEMCHECK
debug_n
debug_n_end
CUDA malloc initializing...
CUDA malloc done!
========= ERROR SUMMARY: 0 errors
$

Cublas Thrust Segmentation Fault

I am new to CUDA programming. I was working on a sample code which multiplies a matrix with a vector and prints the results. I am using Cublas Dgemv API for doing the multiplication. On running the program using cuda-memcheck I get the following error,
Error: process didn't terminate successfully
========= The application may have hit an error when dereferencing Unified Memory from the host. Please rerun the application under cuda-gdb or Nsight Eclipse Edition to catch host side errors.
========= Internal error (20)
========= No CUDA-MEMCHECK results found
The minimal complete code is here,
#include <thrust/device_vector.h>
#include <cublas_v2.h>
#include <iostream>
int main(void)
{
int rowDimension = 3; // number of rows
int columnDimension = 6; // number of columns
// initialize data
thrust::device_vector<double> weightMatrix;
weightMatrix.resize(rowDimension * columnDimension);
thrust::device_vector<double> inputVector;
inputVector.resize(columnDimension);
thrust::device_vector<double> F;
F.resize(rowDimension);
for (size_t i = 0; i < rowDimension; i++)
for (size_t j = 0; j < columnDimension; j++)
weightMatrix[j * rowDimension + i]=i;
for (size_t j = 0; j < columnDimension; j++)
inputVector[j] = j;
for (size_t i = 0; i < rowDimension; i++)
F[i]=0;
cublasHandle_t handle;
/* Initialize CUBLAS */
cublasStatus_t status = cublasCreate(&handle);
if (status != CUBLAS_STATUS_SUCCESS)
std::cerr << "!!!! CUBLAS initialization error\n";
double alpha = 1.0f;
// cudaDeviceSynchronize();
status = cublasDgemv(handle, CUBLAS_OP_N, rowDimension, columnDimension, &alpha, thrust::raw_pointer_cast(weightMatrix.data()), rowDimension,
thrust::raw_pointer_cast(inputVector.data()), 1, 0, thrust::raw_pointer_cast(F.data()), 1) ;;
// cudaDeviceSynchronize();
if (status != CUBLAS_STATUS_SUCCESS)
std::cerr << "!!!! kernel execution error.\n";
for (size_t j = 0; j < rowDimension; j++)
std::cout << F[j] << " ";
status = cublasDestroy(handle);
if (status != CUBLAS_STATUS_SUCCESS)
std::cerr << "!!!! shutdown error (A)\n";
return 0;
}
The above prgram produces a segmentation fault at the cublasDgemv function. Onrunning cuda-memcheck i get the message reported above. On Googling i couldn't find much help.
Can someone please help me resolve this issue.
Have a look at the documentation of cublasDgemv.
The signature is:
cublasDgemv(cublasHandle_t handle,
cublasOperation_t trans,
int m,
int n,
const double *alpha,
const double *A,
int lda,
const double *x,
int incx,
const double *beta,
double *y,
int incy)
beta has to be supplied as a pointer. But you pass a NULL pointer to it instead of a pointer pointing to the value 0.
So the following will fix your problem:
double alpha = 1.0;
double beta = 0;
status = cublasDgemv(handle,
CUBLAS_OP_N,
rowDimension,
columnDimension,
&alpha,
thrust::raw_pointer_cast(weightMatrix.data()),
rowDimension,
thrust::raw_pointer_cast(inputVector.data()),
1,
&beta, // note the change here!
thrust::raw_pointer_cast(F.data()),
1);

Comparison of Cuda Shared Memory Copies: Which Approach Better

When coping an array to another array in shared memory, I tried six different approaches (see comments in the program). After discussions and testings, my conclusions are:
(1) memcpy is not faster than element-wise copy of an array.
(2) For small array, approach 3 is the best. For larger array, approach 6 is the best.
#include <stdio.h>
#include <iostream>
#include <time.h>
#include <stdlib.h>
#include <assert.h>
const int NUM_OF_BLOCKS = 1;
const int NUM_OF_THREADS_PER_BLOCK = 8;
const int NUM_OF_DATA = 6000;
const int NUM_OF_COPIES= 1000;
//const int NUM_OF_COPIES= 1000000;
cudaError_t cuda_status;
__shared__ int start_index[NUM_OF_THREADS_PER_BLOCK];
__shared__ int end_index[NUM_OF_THREADS_PER_BLOCK];
__shared__ int src[NUM_OF_DATA];
__shared__ int dest[NUM_OF_DATA];
using namespace std;
__device__ void init(){
unsigned int num_of_data_per_thread = NUM_OF_DATA / NUM_OF_THREADS_PER_BLOCK;
unsigned int extra_data = NUM_OF_DATA % NUM_OF_THREADS_PER_BLOCK;
int size[NUM_OF_THREADS_PER_BLOCK];
start_index[threadIdx.x] = threadIdx.x * num_of_data_per_thread;
if (threadIdx.x < extra_data){
start_index[threadIdx.x] = start_index[threadIdx.x] + threadIdx.x;
size[threadIdx.x] = num_of_data_per_thread + 1;
}else{
start_index[threadIdx.x] = start_index[threadIdx.x] + extra_data;
size[threadIdx.x] = num_of_data_per_thread ;
}
end_index[threadIdx.x] = start_index[threadIdx.x] + size[threadIdx.x] -1;
//printf("start_index[%d] = %d, end_index[%d] = %d\n", threadIdx.x, start_index[threadIdx.x], threadIdx.x, end_index[threadIdx.x]);
}
__device__ void inc_src_data(int* src){
int i;
for (i = 0; i < NUM_OF_DATA; i++, src++){
*src += 1;
}
//__threadfence_block();
}
template <int sel>
__device__ void copy_to_dest_array(int* src, int* dest){
int i;
switch (sel){
case 1:
// Approach 1: every thread executes memcpy
memcpy(dest, src, NUM_OF_DATA * sizeof(int));
break;
case 2:
// Approach 2: one thread executes memcpy and then threadfence
if (threadIdx.x == 0){
memcpy(dest, src, NUM_OF_DATA * sizeof(int));
__threadfence_block();
}
break;
case 3:
// Approach 3: every thread copies each element individually
for (i = 0; i < NUM_OF_DATA; i++, dest++, src++)
*dest = *src;
//__threadfence_block(); // added this line to demonstrate timing difference
break;
case 4:
// Approach 4: one thread copy each element individually and then threadfence
if (threadIdx.x == 0)
for (i = 0; i < NUM_OF_DATA; i++, dest++, src++)
*dest = *src;
__threadfence_block();
break;
case 5:
// Approach 5: every thread execute memcpy and then threadfence
memcpy(dest+start_index[threadIdx.x], src + start_index[threadIdx.x], (end_index[threadIdx.x] - start_index[threadIdx.x] + 1) * sizeof(int));
__threadfence_block();
break;
case 6:
// Approach 6: every thread copies each element individually and then threadfence
for (i = start_index[threadIdx.x]; i <= end_index[threadIdx.x]; i++){
dest[i] = src[i];
}
__threadfence_block();
break;
default:
assert(0);
break;
}
}
template <int sel>
__global__ void copy_data_test(int* data){
init();
copy_to_dest_array<sel>(data, src);
for (int i = 0; i < NUM_OF_COPIES; i++){
inc_src_data(src);
copy_to_dest_array<sel>(&src[0], &dest[0]);
}
copy_to_dest_array<sel>(dest, data);
}
template <int sel>
__global__ void copy_data_test(int* data){
init();
copy_to_dest_array<sel>(data, src);
for (int i = 0; i < NUM_OF_COPIES; i++){
inc_src_data(src);
copy_to_dest_array<sel>(&src[0], &dest[0]);
}
copy_to_dest_array<sel>(dest, data);
}
template <int sel>
void run_test(int *rdata, int *hdata, int *ddata){
cudaEvent_t start, stop;
cudaEventCreate(&start); cudaEventCreate(&stop);
cudaMemcpy(ddata, hdata, NUM_OF_DATA * sizeof(int), cudaMemcpyHostToDevice);
cudaEventRecord(start);
copy_data_test<sel><<<NUM_OF_BLOCKS, NUM_OF_THREADS_PER_BLOCK>>>(ddata);
cudaEventRecord(stop);
cout << "kernel error: " << cudaGetErrorString(cudaPeekAtLastError()) << "---" << cudaGetErrorString(cudaDeviceSynchronize()) << endl;
cudaMemcpy(rdata, ddata, NUM_OF_DATA * sizeof(int), cudaMemcpyDeviceToHost);
cudaEventSynchronize(stop);
float et;
cudaEventElapsedTime(&et, start, stop);
cout << "Trial " << sel << " elapsed time: " << et << "ms" << endl;
/*
cout << "after kernel processing" << endl;
for (int i = 0; i < NUM_OF_DATA; i++)
cout << rdata[i] << " ";
cout << endl;
*/
cudaEventDestroy(start);
cudaEventDestroy(stop);
}
int main(int argc, char **argv){
int h_data[NUM_OF_DATA];
int r_data[NUM_OF_DATA];
int* d_data;
int i;
cudaSetDevice(0);
srand(time(NULL));
/*
cout << "before kernel processing" << endl;
for (i = 0; i < NUM_OF_DATA; i++){
h_data[i] = rand()%100;
cout << h_data[i] << " ";
}
cout << endl;
*/
cudaMalloc(&d_data, sizeof(int) * NUM_OF_DATA);
run_test<1>(r_data, h_data, d_data);
run_test<2>(r_data, h_data, d_data);
run_test<3>(r_data, h_data, d_data);
run_test<4>(r_data, h_data, d_data);
run_test<5>(r_data, h_data, d_data);
run_test<6>(r_data, h_data, d_data);
return 0;
}
It seems fairly clear that the __threadfence_block() is an expensive operation. The 4 longest test cases all use __threadfence_block(). The two shortest test cases do not.
If I add __threadfence_block() to the 3rd (i.e. the shortest) test case, the timing (for me) changes from ~2 sec to ~17 sec.
Note that your test cases are not all doing the exact same thing, as evidenced by the difference in the output results. I made a modified version of your code that more clearly demonstrates this:
#include <stdio.h>
#include <iostream>
#include <time.h>
#include <stdlib.h>
#include <assert.h>
const int NUM_OF_BLOCKS = 1;
const int NUM_OF_THREADS_PER_BLOCK = 8;
const int NUM_OF_DATA = 50;
const int NUM_OF_COPIES= 10000000;
cudaError_t cuda_status;
__shared__ int start_index[NUM_OF_THREADS_PER_BLOCK];
__shared__ int end_index[NUM_OF_THREADS_PER_BLOCK];
__shared__ int src[NUM_OF_DATA];
__shared__ int dest[NUM_OF_DATA];
using namespace std;
__device__ void init(){
unsigned int num_of_data_per_thread = NUM_OF_DATA / NUM_OF_THREADS_PER_BLOCK;
unsigned int extra_data = NUM_OF_DATA % NUM_OF_THREADS_PER_BLOCK;
int size[NUM_OF_THREADS_PER_BLOCK];
start_index[threadIdx.x] = threadIdx.x * num_of_data_per_thread;
if (threadIdx.x < extra_data){
start_index[threadIdx.x] = start_index[threadIdx.x] + threadIdx.x;
size[threadIdx.x] = num_of_data_per_thread + 1;
}else{
start_index[threadIdx.x] = start_index[threadIdx.x] + extra_data;
size[threadIdx.x] = num_of_data_per_thread ;
}
end_index[threadIdx.x] = start_index[threadIdx.x] + size[threadIdx.x] -1;
}
__device__ void inc_src_data(int* src){
int i;
for (i = 0; i < NUM_OF_DATA; i++, src++){
*src += 1;
}
//__threadfence_block();
}
template <int sel>
__device__ void copy_to_dest_array(int* src, int* dest){
int i;
switch (sel){
case 1:
// Approach 1: every thread executes memcpy
memcpy(dest, src, NUM_OF_DATA);
break;
case 2:
// Approach 2: one thread executes memcpy and then threadfence
if (threadIdx.x == 0){
memcpy(dest, src, NUM_OF_DATA);
__threadfence_block();
}
break;
case 3:
// Approach 3: every thread copies each element individually
for (i = 0; i < NUM_OF_DATA; i++, dest++, src++)
*dest = *src;
__threadfence_block(); // added this line to demonstrate timing difference
break;
case 4:
// Approach 4: one thread copy each element individually and then threadfence
if (threadIdx.x == 0)
for (i = 0; i < NUM_OF_DATA; i++, dest++, src++)
*dest = *src;
__threadfence_block();
break;
case 5:
// Approach 5: every thread execute memcpy and then threadfence
memcpy(dest+start_index[threadIdx.x], src + start_index[threadIdx.x], end_index[threadIdx.x] - start_index[threadIdx.x] + 1);
__threadfence_block();
break;
case 6:
// Approach 6: every thread copies each element individually and then threadfence
for (i = start_index[threadIdx.x]; i <= end_index[threadIdx.x]; i++){
*(dest + i) = *(src + i);
}
__threadfence_block();
break;
default:
assert(0);
break;
}
}
template <int sel>
__global__ void copy_data_test(int* data){
init();
copy_to_dest_array<sel>(data, src);
for (int i = 0; i < NUM_OF_COPIES; i++){
inc_src_data(src);
copy_to_dest_array<sel>(&src[0], &dest[0]);
}
copy_to_dest_array<sel>(dest, data);
}
template <int sel>
void run_test(int *rdata, int *hdata, int *ddata){
cudaEvent_t start, stop;
cudaEventCreate(&start); cudaEventCreate(&stop);
cudaMemcpy(ddata, hdata, NUM_OF_DATA * sizeof(int), cudaMemcpyHostToDevice);
cudaEventRecord(start);
copy_data_test<sel><<<NUM_OF_BLOCKS, NUM_OF_THREADS_PER_BLOCK>>>(ddata);
cudaEventRecord(stop);
cout << "kernel error: " << cudaGetErrorString(cudaPeekAtLastError()) << "---" << cudaGetErrorString(cudaDeviceSynchronize()) << endl;
cudaMemcpy(rdata, ddata, NUM_OF_DATA * sizeof(int), cudaMemcpyDeviceToHost);
cudaEventSynchronize(stop);
float et;
cudaEventElapsedTime(&et, start, stop);
cout << "Trial " << sel << " elapsed time: " << et << "ms" << endl;
cout << "after kernel processing" << endl;
for (int i = 0; i < NUM_OF_DATA; i++)
cout << rdata[i] << " ";
cout << endl;
cudaEventDestroy(start);
cudaEventDestroy(stop);
}
int main(int argc, char **argv){
int h_data[NUM_OF_DATA];
int r_data[NUM_OF_DATA];
int* d_data;
int i;
cudaSetDevice(0);
srand(time(NULL));
cout << "before kernel processing" << endl;
for (i = 0; i < NUM_OF_DATA; i++){
h_data[i] = rand()%100;
cout << h_data[i] << " ";
}
cout << endl;
cudaMalloc(&d_data, sizeof(int) * NUM_OF_DATA);
run_test<1>(r_data, h_data, d_data);
run_test<2>(r_data, h_data, d_data);
run_test<3>(r_data, h_data, d_data);
run_test<4>(r_data, h_data, d_data);
run_test<5>(r_data, h_data, d_data);
run_test<6>(r_data, h_data, d_data);
return 0;
}

Cuda Reduction in 2d Array

I want to calculate the average of the values over the whole image in Cuda. To test how reduction in 2D array work, I write this kernel below. The final output o should be the sum of all the image values. The input g is a 2D array with value 1 in every pixel. But the result of this program is 0 as the sum. A bit weird to me.
I imitate the reduction in 1D array in this tutorial http://developer.download.nvidia.com/compute/cuda/1.1-Beta/x86_website/projects/reduction/doc/reduction.pdf I write this 2D form. I am new to Cuda. And suggestions to potential bugs and improvement are welcomed!
Just add one comment. I know it makes sense just to calculate the average in 1D array. But I want to exploit more and test more complicated reduction behaviours. It might not be right. But just a test. Hope anyone can give me suggestions more about reduction common practices.
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
cudaEvent_t start, stop;
float elapsedTime;
__global__ void
reduce(float *g, float *o, const int dimx, const int dimy)
{
extern __shared__ float sdata[];
unsigned int tid_x = threadIdx.x;
unsigned int tid_y = threadIdx.y;
unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
unsigned int j = blockDim.y * blockIdx.y + threadIdx.y;
if (i >= dimx || j >= dimy)
return;
sdata[tid_x*blockDim.y + tid_y] = g[i*dimy + j];
__syncthreads();
for(unsigned int s_y = blockDim.y/2; s_y > 0; s_y >>= 1)
{
if (tid_y < s_y)
{
sdata[tid_x * dimy + tid_y] += sdata[tid_x * dimy + tid_y + s_y];
}
__syncthreads();
}
for(unsigned int s_x = blockDim.x/2; s_x > 0; s_x >>= 1 )
{
if(tid_x < s_x)
{
sdata[tid_x * dimy] += sdata[(tid_x + s_x) * dimy];
}
__syncthreads();
}
float sum;
if( tid_x == 0 && tid_y == 0)
{
sum = sdata[0];
atomicAdd (o, sum); // The result should be the sum of all pixel values. But the program produce 0
}
//if(tid_x==0 && tid__y == 0 )
//o[blockIdx.x] = sdata[0];
}
int
main()
{
int dimx = 320;
int dimy = 160;
int num_bytes = dimx*dimy*sizeof(float);
float *d_a, *h_a, // device and host pointers
*d_o=0, *h_o=0;
h_a = (float*)malloc(num_bytes);
h_o = (float*)malloc(sizeof(float));
srand(time(NULL));
for (int i=0; i < dimx; i++)
{
for (int j=0; j < dimy; j++)
{
h_a[i*dimy + j] = 1;
}
}
cudaMalloc( (void**)&d_a, num_bytes );
cudaMalloc( (void**)&d_o, sizeof(int) );
cudaMemcpy( d_a, h_a, num_bytes, cudaMemcpyHostToDevice);
cudaMemcpy( d_o, h_o, sizeof(int), cudaMemcpyHostToDevice);
dim3 grid, block;
block.x = 4;
block.y = 4;
grid.x = dimx / block.x;
grid.y = dimy / block.y;
cudaEventCreate(&start);
cudaEventRecord(start, 0);
int sizeofSharedMemory = dimx*dimy*sizeof(float);
reduce<<<grid, block, sizeofSharedMemory>>> (d_a, d_o, block.x, block.y);
cudaEventCreate(&stop);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
std::cout << "This kernel runs: " << elapsedTime << "ms" << std::endl;
std::cout << block.x << " " << block.y << std::endl;
std::cout << grid.x << " " << grid.y << std::endl;
std::cout << dimx << " " << dimy << " " << dimx*dimy << std::endl;
cudaMemcpy( h_a, d_a, num_bytes, cudaMemcpyDeviceToHost );
cudaMemcpy( h_o, d_o, sizeof(int), cudaMemcpyDeviceToHost );
std::cout << "The sum is:" << *h_o << std::endl;
free(h_a);
free(h_o);
cudaFree(d_a);
cudaFree(d_o);
}
If you do basic cuda error checking you will discover that your reduce kernel is not even running. The reason is as follows:
int dimx = 320;
int dimy = 160;
...
int sizeofSharedMemory = dimx*dimy*sizeof(float); // = 204800
reduce<<<grid, block, sizeofSharedMemory>>> (d_a, d_o, block.x, block.y);
^
|
204800 is illegal here
You cannot request 204800 bytes of shared memory dynamically (or any other way). The maximum is slightly less than 48K bytes.
If you had done proper cuda error checking, you would discover your kernel is not running and would have gotten an instructive error message which suggests the launch configuration (the numbers between the <<< ... >>> ) is invalid. Shared memory is requested on a per-block basis, and it's probably not sensible that you need to request enough shared memory to cover your entire 2D data set, when each block only consists of a 4x4 thread array. You probably just need enough data for what will be accessed by each 4x4 thread array.
After you have properly instrumented your code with cuda error checking, and detected and corrected all the errors, then run your code with cuda-memcheck. This will do an additional level of error checking to point out any kernel access errors. You may also use cuda-memcheck if you are getting an unspecified launch failure, and it may help pinpoint the issue.
After you have done these basic trouble shooting steps, then it might make sense to ask others for help. But use the power of the tools you have been given first.
I also want to point out one other error before you come back and post this code again, asking for help.
This will not be useful:
std::cout << "The sum is:" << *h_o << std::endl;
cudaMemcpy( h_a, d_a, num_bytes, cudaMemcpyDeviceToHost );
cudaMemcpy( h_o, d_o, sizeof(int), cudaMemcpyDeviceToHost );
You are printing out the sum before you have copied the sum from the device to the host.
Reverse the order of these steps:
cudaMemcpy( h_a, d_a, num_bytes, cudaMemcpyDeviceToHost );
cudaMemcpy( h_o, d_o, sizeof(int), cudaMemcpyDeviceToHost );
std::cout << "The sum is:" << *h_o << std::endl;

Cuda call won't allocate more than 8 threads per block, regardless of specification

I am creating a parallel version of the Sieve of Eratosthenes in c++. The problem is my kernel call (reduce0) seems to only ever assign 8 threads per block instead of the 256 I specify. Since even the first CUDA version allows 512 threads per block, there must be some error in my code for it. Any help would be appreciated.
#include <iostream>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <cutil.h>
//#include <sieve_kernel.cu>
using namespace std;
////////////////////////////////////////////////////
int psum(int arg[], double n);
int call_kernel(int primes[], int n);
int findsmallest(int arg[], int f, double n);
int sieve(int n);
__global__ void reduce0(int *g_idata, int *g_odata);
////////////////////////////////////////////////////
int main(){
int n = pow((double) 2, 8);
int total = sieve(n);
cout << "# primes" << endl << total << endl;
return 0;
}
///////////////////////////////////////////////////
__global__ void reduce0(int *g_idata, int *g_odata) {
extern __shared__ int sdata[];
// each thread loads one element from global to shared mem
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
sdata[tid] = g_idata[i];
__syncthreads();
// do reduction in shared mem
for (int s = 1; s < blockDim.x; s *= 2) { // step = s x 2
if (tid % (s*2) == 0) { // only threadIDs divisible by the step participate
sdata[tid] += sdata[tid + s];
}
__syncthreads();
}
// write result for this block to global mem
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
/////////////////////////////////////////////////////
int call_kernel(int *primes, int n){
// Allocate and copy device arrays
int *g_idevice;
int *g_odevice;
int size = n * sizeof(int);
cudaMalloc(&g_idevice, size);
cudaMemcpy(g_idevice, primes, size, cudaMemcpyHostToDevice);
cudaMalloc(&g_odevice, size);
// Specify grid/block dimenstions and invoke the kernel
dim3 dimGrid(1,1);
dim3 dimBlock(256,1);
reduce0<<<dimGrid, dimBlock>>>(g_idevice, g_odevice);
// Copy device data back to primes
cudaMemcpy(primes, g_odevice, size, cudaMemcpyDeviceToHost);
//for (int i = 0; i < n; i++) {
// cout << i << " " << primes[i] << endl;
//}
int total = primes[0];
cudaFree(g_idevice);
cudaFree(g_odevice);
return total;
}
/////////////////////////////////////////////////////////////////////
int findsmallest(int arg[], int f, double n){
int i = f;
while(arg[i]!= 1 && i < n) {
i++;
}
return i;
}
//////////////////////////////////////////////////////////////////////
int psum(int arg[], double n){
int total = 0;
int i = 2;
while(i < n){
if(arg[i] == 1){
total = total + 1;
}
i++;
}
return total;
}
/////////////////////////////////////////////////////////////////////////
int sieve(int n){
int* primes = NULL;
int mult = 0;
int k = 2;
int i; int total;
//primes = new int[n];
primes = new int[256];
for(i = 0; i < n; i++){
primes[i] = 1;
}
primes[0] = primes[1] = 0;
while (k * k < n){
mult = k * k;
while (mult < n) {
primes[mult] = 0;
mult = mult + k;
}
k = findsmallest(primes,k+1, n);
}
total = call_kernel(primes, n);
//delete [] primes;
//primes = NULL;
return total;
}
Your kernel is using dynamically allocated shared memory, but the kernel launch does not include any allocation, so the result is the kernel will be aborting because of illegal memory operations on that shared memory buffer. You should find it works if you modify this part of call_kernel as follows:
// Specify grid/block dimenstions and invoke the kernel
dim3 dimGrid(1,1);
dim3 dimBlock(256,1);
size_t shmsize = size_t(dimBlock.x * dimBlock.y * dimBlock.z) * sizeof(int);
reduce0<<<dimGrid, dimBlock, shmsize>>>(g_idevice, g_odevice);
If you had of included some basic error checking around the function call, perhaps like this:
reduce0<<<dimGrid, dimBlock>>>(g_idevice, g_odevice);
if (cudaPeekAtLastError() != cudaSuccess) {
cout << "kernel launch error: " << cudaGetErrorString(cudaGetLastError()) << endl;
}
// Copy device data back to primes
cudaError_t err = cudaMemcpy(primes, g_odevice, size, cudaMemcpyDeviceToHost);
if (err != cudaSuccess) {
cout << "CUDA error: " << cudaGetErrorString(err) << endl;
}
it would have been immediately obvious that the kernel launch or execution was failing with an error.