I'm attempting to write a simple matrix multiplication program that continually adds the product of two matrices to a third result matrix (I'm essentially giving the GPU a workout while I measure power consumption with a separate apparatus).
My problem occurs when I specify a large number of iterations. I've tried this with several combinations of BLOCK_SIZE and matrix dimension values, and I've noted that the number of iterations can be increased with smaller matrix dimensions, but the BLOCK_SIZE must be the square root of the matrix dimensions (square matrices).
The resulting error in this case is a 39 second (regardless of iteration value, as long as it is 'too much') freeze followed by all zero matrix output. Interestingly, I ran this once with an iteration of 20000 and it worked fine. I ran it again and got the freeze error.
Any ideas? Thanks in advance!
Kernel:
//********************************************************************
// matrixMultiplication_kernel.cu
//
// Kernel for a basic CUDA matrix multiplication program.
//********************************************************************
#ifndef MATRIXMULTIPLICATION_KERNEL
#define MATRIXMULTIPLICATION_KERNEL
#define BLOCK_SIZE 16 // Set thread block size
#define colsA 256 // Set matrix A column dimension
#define rowsA 256 // Set matrix A row dimension
#define colsB 256 // Set matrix B column dimension
#define rowsB colsA // Set matrix B row dimension
#define colsC colsB // Set matrix C column dimension
#define rowsC rowsA // Set matrix C row dimension
//--------------------------------------------------------------------
// matrixMultiplication() - Repeatedly multiplies matrixA by matrixB
// using BLOCK_SIZE x BLOCK_SIZE shared-memory tiles, accumulating
// every product into a per-thread running total that is stored in
// matrixC after each pass.
//
// PRE: matrixA, matrixB, and matrixC are device float pointers to
// row-major matrices; numColsA and numColsB are the column
// counts of A and B. The kernel must be launched with
// BLOCK_SIZE x BLOCK_SIZE threads per block and one block per
// BLOCK_SIZE x BLOCK_SIZE tile of C; there is no bounds
// checking, so all dimensions must be multiples of BLOCK_SIZE.
// POST: Each element of matrixC holds the sum, over all 20000
// passes, of the corresponding element of A * B.
//--------------------------------------------------------------------
__global__ void matrixMultiplication(float * matrixA, float * matrixB,
                                     float * matrixC, int numColsA,
                                     int numColsB) {
    /* Shared memory tiles for the current sub-matrices of A and B */
    __shared__ float subA[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float subB[BLOCK_SIZE][BLOCK_SIZE];

    /*
       Running total for this thread's element of C. It must start at
       zero: it is only ever updated with +=, so an uninitialised
       value would leak garbage into every result.
    */
    float val = 0.0f;

    /* Set block and thread index positions */
    int blockX = blockIdx.x;
    int blockY = blockIdx.y;
    int threadX = threadIdx.x;
    int threadY = threadIdx.y;

    /*
       Starting and ending indices of the first sub-matrix of A and
       the step between consecutive sub-matrices of A
    */
    int startA = numColsA * BLOCK_SIZE * blockY;
    int endA = startA + numColsA - 1;
    int subSizeA = BLOCK_SIZE;

    /*
       Starting index of the first sub-matrix of B and the step
       between consecutive sub-matrices of B
    */
    int startB = BLOCK_SIZE * blockX;
    int subSizeB = BLOCK_SIZE * numColsB;

    /* Index in C of the top-left element of this block's tile */
    int position = numColsB * BLOCK_SIZE * blockY + BLOCK_SIZE * blockX;

    /* Perform matrix multiplication 20000 times */
    for (int iteration = 0; iteration < 20000; iteration++) {
        /* Loop through matrix A and matrix B's sub-matrices */
        for (int i = startA, j = startB; i <= endA; i += subSizeA,
                                                    j += subSizeB) {
            /* Fill sub-matrices: each thread loads one element of each */
            subA[threadY][threadX] =
                matrixA[i + numColsA * threadY + threadX];
            subB[threadY][threadX] =
                matrixB[j + numColsB * threadY + threadX];

            /* Ensure that the tiles are fully loaded */
            __syncthreads();

            /* Accumulate this tile pair's contribution */
            for (int k = 0; k < BLOCK_SIZE; ++k) {
                val += subA[threadY][k] * subB[k][threadX];
            }

            /*
               Ensure completion before the next set of sub-matrices
               overwrites the shared tiles
            */
            __syncthreads();
        }

        /* Store the running total for this element of C */
        matrixC[position + numColsB * threadY + threadX] = val;
    }
}
#endif
Host:
//********************************************************************
// matrixMultiplication.cu
//
// A basic CUDA matrix multiplication program.
//********************************************************************
/* Include necessary libraries and kernel */
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <matrixMultiplication_kernel.cu>
/* Function declarations */
void fillMatrix(float * matrix, int numIndices);
/*
   Reports a failed CUDA runtime call on stderr.
   Returns 1 when status is an error and 0 when it is cudaSuccess.
*/
static int cudaFailed(cudaError_t status, const char * what) {
    if (status == cudaSuccess) {
        return 0;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return 1;
}

//*************
// Main Program
//*************
// Fills matrices A and B with random values, runs the
// matrixMultiplication kernel on the device and prints the first
// entries of the result. Returns EXIT_SUCCESS, or EXIT_FAILURE when a
// host allocation or any CUDA call fails (for example when the display
// watchdog kills a long-running kernel), instead of silently printing
// meaningless data.
int main(int argc, char** argv) {
    /* Declare device memory */
    float * deviceA = NULL;
    float * deviceB = NULL;
    float * deviceC = NULL;

    srand(2013); // Set random seed

    /* Determine total number of indices in each matrix */
    unsigned int numIndicesA = colsA * rowsA;
    unsigned int numIndicesB = colsB * rowsB;
    unsigned int numIndicesC = colsC * rowsC;

    /* Determine memory size of each matrix */
    unsigned int memoryA = sizeof(float) * numIndicesA;
    unsigned int memoryB = sizeof(float) * numIndicesB;
    unsigned int memoryC = sizeof(float) * numIndicesC;

    /*
       Allocate host memory for each matrix. calloc is used for C so
       that it really does start out as all zeros.
    */
    float * matrixA = (float *) malloc(memoryA);
    float * matrixB = (float *) malloc(memoryB);
    float * matrixC = (float *) calloc(numIndicesC, sizeof(float));

    int failed = (matrixA == NULL) || (matrixB == NULL)
                 || (matrixC == NULL);
    if (failed) {
        fprintf(stderr, "host memory allocation failed\n");
    }

    if (!failed) {
        /* Set contents of matrices A and B */
        fillMatrix(matrixA, numIndicesA);
        fillMatrix(matrixB, numIndicesB);

        /* Allocate device memory and upload matrices A and B */
        failed =
            cudaFailed(cudaMalloc((void **) &deviceA, memoryA),
                       "cudaMalloc(deviceA)")
            || cudaFailed(cudaMalloc((void **) &deviceB, memoryB),
                          "cudaMalloc(deviceB)")
            || cudaFailed(cudaMalloc((void **) &deviceC, memoryC),
                          "cudaMalloc(deviceC)")
            || cudaFailed(cudaMemcpy(deviceA, matrixA, memoryA,
                                     cudaMemcpyHostToDevice),
                          "copy of matrix A to device")
            || cudaFailed(cudaMemcpy(deviceB, matrixB, memoryB,
                                     cudaMemcpyHostToDevice),
                          "copy of matrix B to device");
    }

    if (!failed) {
        /* Set thread count to BLOCK_SIZE x BLOCK_SIZE */
        dim3 tCount(BLOCK_SIZE, BLOCK_SIZE);
        /* Set thread block count */
        dim3 tbCount((colsC / tCount.x), (rowsC / tCount.y));

        /* Run kernel */
        matrixMultiplication <<< tbCount, tCount >>> (deviceA, deviceB,
                                                      deviceC, colsA,
                                                      colsB);

        /*
           A bad launch configuration is reported by cudaGetLastError();
           a fault or timeout during execution is reported by the
           blocking copy that follows.
        */
        failed =
            cudaFailed(cudaGetLastError(), "kernel launch")
            || cudaFailed(cudaMemcpy(matrixC, deviceC, memoryC,
                                     cudaMemcpyDeviceToHost),
                          "kernel execution / copy of matrix C to host");
    }

    if (!failed) {
        /* Print at most the first 256 entries of matrix C */
        unsigned int shown = (numIndicesC < 256) ? numIndicesC : 256;
        for (unsigned int i = 0; i < shown; i++) {
            printf("%f ", matrixC[i]);
        }
        printf("\n");
    }

    /* Free up host and device memory for each matrix */
    free(matrixA);
    free(matrixB);
    free(matrixC);
    cudaFree(deviceA);
    cudaFree(deviceB);
    cudaFree(deviceC);

    return failed ? EXIT_FAILURE : EXIT_SUCCESS;
}
//--------------------------------------------------------------------
// fillMatrix - Populates a matrix with pseudo-random floats.
//
// PRE: matrix points to storage for at least numIndices floats;
// numIndices is the number of elements to fill.
// POST: matrix[0] .. matrix[numIndices - 1] each hold
// rand() / (float)RAND_MAX. Nothing is written when numIndices
// is zero or negative.
//--------------------------------------------------------------------
void fillMatrix(float * matrix, int numIndices) {
    int slot = 0;
    /* Walk the matrix front to back, one rand() draw per element */
    while (slot < numIndices) {
        matrix[slot] = rand() / (float)RAND_MAX;
        slot++;
    }
}
Makefile:
# Build rules for the matrixMultiplication CUDA program.
# GCC holds the compiler driver (nvcc); recipes must be indented with a tab.
GCC = nvcc
CUDA_INSTALL_PATH := /usr/local/cuda
INCLUDES := -I. -I$(CUDA_INSTALL_PATH)/include
CUDA_LIBS := -L$(CUDA_INSTALL_PATH)/lib -lcudart

# $@ is the automatic variable for the rule's target ("$#" is not a make
# variable: "#" starts a comment, leaving "-o" without a file name).
matrixMultiplication.o: matrixMultiplication.cu
	$(GCC) $(INCLUDES) -c matrixMultiplication.cu -o $@

matrixMultiplication: matrixMultiplication.o
	$(GCC) -o $@ matrixMultiplication.o $(CUDA_LIBS)

.PHONY: clean
clean:
	$(RM) *.o *~
Problem solved! It was a system timeout issue due to the long duration of the kernel. By switching terminal only mode, I was able to circumvent the issue.
Thanks for all the help guys!
Related
I am trying to sum the values of many vectors using CUDA C++. I found a solution for two vectors (shown below). As you can see, it can only add two vectors, but I want to generate the vectors dynamically, all with the same length.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
// CUDA kernel: element-wise vector sum c = a + b over n elements.
// Each thread produces at most one element of c.
__global__ void vecAdd(double *a, double *b, double *c, int n)
{
    // Global thread index across the whole 1-D grid
    const int gid = blockIdx.x*blockDim.x+threadIdx.x;

    // Threads past the end of the vectors have nothing to do
    if (gid >= n)
        return;

    c[gid] = a[gid] + b[gid];
}
// Host driver for vecAdd: builds two vectors whose element-wise sum is
// sin^2(i) + cos^2(i), adds them on the GPU and prints the mean of the
// result (expected to be 1 within rounding error).
// Returns 0 on success and 1 when a host allocation or CUDA call fails.
int main( int argc, char* argv[] )
{
    // Size of vectors
    int n = 100000;

    // Host input vectors
    double *h_a;
    double *h_b;
    // Host output vector
    double *h_c;

    // Device input vectors
    double *d_a = NULL;
    double *d_b = NULL;
    // Device output vector
    double *d_c = NULL;

    // Size, in bytes, of each vector
    size_t bytes = n*sizeof(double);

    // Allocate memory for each vector on host
    h_a = (double*)malloc(bytes);
    h_b = (double*)malloc(bytes);
    h_c = (double*)malloc(bytes);
    if (h_a == NULL || h_b == NULL || h_c == NULL) {
        fprintf(stderr, "host memory allocation failed\n");
        free(h_a);
        free(h_b);
        free(h_c);
        return 1;
    }

    int i;
    // Initialize vectors on host
    for( i = 0; i < n; i++ ) {
        h_a[i] = sin(i)*sin(i);
        h_b[i] = cos(i)*cos(i);
    }

    // Allocate memory for each vector on GPU and copy the inputs over;
    // the first failing call stops the chain and is reported below
    cudaError_t status = cudaMalloc(&d_a, bytes);
    if (status == cudaSuccess) status = cudaMalloc(&d_b, bytes);
    if (status == cudaSuccess) status = cudaMalloc(&d_c, bytes);
    if (status == cudaSuccess)
        status = cudaMemcpy( d_a, h_a, bytes, cudaMemcpyHostToDevice);
    if (status == cudaSuccess)
        status = cudaMemcpy( d_b, h_b, bytes, cudaMemcpyHostToDevice);

    if (status == cudaSuccess) {
        int blockSize, gridSize;

        // Number of threads in each thread block
        blockSize = 1024;

        // Number of thread blocks in grid (integer ceiling division)
        gridSize = (n + blockSize - 1) / blockSize;

        // Execute the kernel
        vecAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);

        // Launch-configuration errors are only visible through
        // cudaGetLastError(); execution errors surface in the copy
        status = cudaGetLastError();
        if (status == cudaSuccess)
            status = cudaMemcpy( h_c, d_c, bytes, cudaMemcpyDeviceToHost );
    }

    if (status == cudaSuccess) {
        // Sum up vector c and print the result divided by n; this
        // should equal 1 within error
        double sum = 0;
        for(i=0; i<n; i++)
            sum += h_c[i];
        printf("final result: %f\n", sum/n);
    } else {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
    }

    // Release device memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    // Release host memory
    free(h_a);
    free(h_b);
    free(h_c);

    return (status == cudaSuccess) ? 0 : 1;
}
Is there a way to do this for many vectors? My vectors size are:
#vector length
N = 1000
#number of vectors
i = 300000
v[i] = [1,2,..., N]
As result i need to get:
out[i]= [sum(v[1]), sum(v[2]),..., sum(v[i])]
Thanks for any advice.
Summing multiple vectors together in a fashion similar to the code you have shown (i.e. generating elementwise sums) is equivalent to summing the columns of a matrix. And this idea represents a sensible way to realize the solution.
We will treat your vectors as a matrix, where each vector is a row in the matrix. The CUDA kernel will assign one thread to each column, and will sum the elements of that column, producing a single number result. That single number result will become one element of the vector result of the entire problem.
Here is a fully worked example demonstrating one possible approach:
$ cat t2.cu
#include <iostream>
typedef double mt;
const int nTPB = 64;
// Sums each column of a row-major matrix holding n_vectors rows of
// vector_length entries. Every thread whose global index is below
// vector_length adds up one column and stores the total in
// sums[column]; the remaining threads exit immediately.
template <typename T>
__global__ void column_sum(T *matrix, T *sums, unsigned n_vectors, unsigned vector_length){
  const unsigned column = blockIdx.x*blockDim.x + threadIdx.x;
  if (column >= vector_length) return;

  T total = 0;
  unsigned row = 0;
  while (row < n_vectors){
    total += matrix[row*vector_length+column];
    ++row;
  }
  sums[column] = total;
}
// Builds nvec identical vectors (element j holds the value j), sums
// them column-wise on the GPU and checks that every sum equals nvec*j.
// Prints the CUDA status string ("no error" on success) and returns 0,
// or prints the first mismatch / CUDA error and returns -1.
int main(){
  const unsigned vlen = 1000;
  const unsigned nvec = 300000;
  const size_t n_elems = (size_t)vlen*nvec;
  mt *h_matrix, *d_matrix = 0, *h_sums, *d_sums = 0;
  // create the desired number of vectors as a single matrix
  h_sums = new mt[vlen];
  h_matrix = new mt[n_elems];
  cudaMalloc(&d_matrix, n_elems*sizeof(mt));
  cudaMalloc(&d_sums, vlen*sizeof(mt));
  size_t count = 0;
  for (unsigned i = 0; i < nvec; i++)
    for (unsigned j = 0; j < vlen; j++)
      h_matrix[count++] = j;
  cudaMemcpy(d_matrix, h_matrix, n_elems*sizeof(mt), cudaMemcpyHostToDevice);
  column_sum<<<(vlen+nTPB-1)/nTPB,nTPB>>>(d_matrix, d_sums, nvec, vlen);
  // The blocking copy reports kernel execution failures;
  // cudaGetLastError() picks up anything recorded by the earlier calls.
  cudaError_t status = cudaMemcpy(h_sums, d_sums, vlen*sizeof(mt), cudaMemcpyDeviceToHost);
  if (status == cudaSuccess) status = cudaGetLastError();

  int rc = 0;
  if (status != cudaSuccess) {
    // Do not validate h_sums: it was never filled in, so any
    // "mismatch" report would hide the real problem.
    std::cout << cudaGetErrorString(status) << std::endl;
    rc = -1;
  } else {
    for (unsigned i = 0; i < vlen; i++)
      if (h_sums[i] != ((mt)nvec)*i) {
        std::cout << " mismatch at " << i << " was: " << h_sums[i] << " should be: " << ((mt)nvec)*i << std::endl;
        rc = -1;
        break;
      }
    if (rc == 0) std::cout << cudaGetErrorString(status) << std::endl;
  }

  // Release device and host buffers on every path
  cudaFree(d_matrix);
  cudaFree(d_sums);
  delete[] h_matrix;
  delete[] h_sums;
  return rc;
}
$ nvcc -o t2 t2.cu
$ cuda-memcheck ./t2
========= CUDA-MEMCHECK
no error
========= ERROR SUMMARY: 0 errors
$
Note that this methodology only creates as many threads on the GPU as there are vector elements (1000 in the above example). 1000 threads would be enough to keep only the smallest GPUs busy. However this algorithm will be efficient on most GPUs if your vector length is 10,000 or longer. If you'd like to explore creating more efficient algorithms for small problem sizes, you can study the idea of a classical parallel reduction.
I am new to CUDA. I have written some simple code, which tries to copy a random initialized matrix to device memory, increments the value of each matrix entry by one, and transfer it back to the host memory.
There is no error while compiling or running the code. But, it seems that the kernel does not launch as the value of matrix entries are the same after launching the kernel.
Any idea what is happening there?
#include <iostream>
using namespace std;
#define SIZE 2
void print_matrix (int size, float *array);
void matrix_initialize(int size, float *array);
// Adds one to a single matrix entry per thread. The entry is chosen by
// the thread's (x, y) index within its block, with `size` used as the
// row stride.
__global__ void LU(float * m, int size){
    const unsigned offset = threadIdx.y*size + threadIdx.x;
    float * const cell = m + offset;
    *cell = *cell + 1.0f;
}
// Fills a SIZE x SIZE host matrix with random values, increments every
// entry on the GPU with the LU kernel and prints the matrix before and
// after. Returns 0 on success and 1 when a CUDA call fails.
int main(){
    srand(0);
    //variables
    float *a = new float[SIZE*SIZE];
    // One block of SIZE x SIZE threads. Unused dimensions must be 1:
    // a 0 in any component makes the launch configuration invalid, so
    // the kernel never runs.
    dim3 blockdim(SIZE,SIZE,1);
    dim3 griddim(1,1,1);
    //initialize
    matrix_initialize(SIZE, a);
    print_matrix (SIZE, a);
    //allocate space on device memory (size in BYTES, not elements):
    float * Ad = 0;
    size_t bytes = SIZE * SIZE * sizeof(float);
    cudaError_t status = cudaMalloc ((void **)&Ad, bytes);
    //transfer data to device memory:
    if (status == cudaSuccess)
        status = cudaMemcpy(Ad , a, bytes, cudaMemcpyHostToDevice);
    if (status == cudaSuccess){
        //run the kernel
        LU<<<griddim,blockdim>>>(Ad, SIZE);
        // launch errors are only reported through cudaGetLastError()
        status = cudaGetLastError();
    }
    // transfer the data back to the host memory
    if (status == cudaSuccess)
        status = cudaMemcpy(a , Ad, bytes, cudaMemcpyDeviceToHost);
    if (status == cudaSuccess){
        //show whether running the kernel has changed the values
        print_matrix (SIZE, a);
    } else {
        std::cerr << "CUDA error: " << cudaGetErrorString(status) << std::endl;
    }
    // free device and host memory :
    cudaFree (Ad);
    delete[] a;
    return (status == cudaSuccess) ? 0 : 1;
}
// Writes a size x size row-major matrix to standard output. A line
// break is emitted before the first entry of every row and each entry
// is followed by a single space.
void print_matrix (int size, float *array){
    const int total = size*size;
    int pos = 0;
    while (pos < total){
        // every `size` entries start a fresh output line
        if(pos % size == 0)
            std::cout << std::endl;
        std::cout << array [pos] << " ";
        ++pos;
    }
}
// Fills a size x size matrix with rand() / (float) RAND_MAX values.
//   size  - number of rows (and columns) of the square matrix
//   array - storage for at least size*size floats
// The element count comes from the `size` argument rather than the
// global SIZE macro, so the function honours the size it is given.
void matrix_initialize(int size, float *array){
    const int count = size*size;
    for (int i = 0; i < count; i++){
        array[i] = rand()/(float) RAND_MAX;
    }
}
Unused dimensions should be set to 1 instead of 0:
dim3 blockdim(2, 2, 1);
dim3 griddim(1, 1, 1);
Your code launches a grid of 1 x 0 x 0 = 0 blocks with 2 x 2 x 0 = 0 threads each, which is an invalid launch configuration, so the kernel never runs.
Your size calculation is wrong:
int size = SIZE * SIZE * sizeof(float);
Your code does not take array element size into account.
I am new to CUDA programming. In my program (matrix multiplication using shared memory) I defined BLOCK_SIZE = 20. With 1200*1200 matrices the program works when the elements are double, but it does not work when they are float (with float elements it works for 840*840 matrices). My question is: why does this happen, even though float is smaller than double?
// Matrices are stored in row-major order:
// M(row, col) = *(M.elements + row * M.stride + col)
#include <stdio.h>
#define BLOCK_SIZE 20
// Descriptor for a row-major float matrix or a view into one.
// Element (row, col) lives at elements[row * stride + col].
typedef struct {
int width;       // number of columns (MatMul sizes buffers as width * height)
int height;      // number of rows
int stride;      // distance, in elements, between the starts of consecutive rows
float* elements; // first element; host or device memory depending on the user
} Matrix;
// Read the entry at (row, col) of A, using A.stride as the row pitch
__device__ float GetElement(const Matrix A, int row, int col)
{
    const int offset = row * A.stride + col;
    return *(A.elements + offset);
}
// Store value at (row, col) of A, using A.stride as the row pitch
__device__ void SetElement(Matrix A, int row, int col,
                           float value)
{
    const int offset = row * A.stride + col;
    *(A.elements + offset) = value;
}
// Build a view of the BLOCK_SIZExBLOCK_SIZE tile of A whose top-left
// corner lies `row` tiles down and `col` tiles to the right of A's
// upper-left corner. The view shares A's storage and keeps A's stride.
__device__ Matrix GetSubMatrix(Matrix A, int row, int col)
{
    const int firstElement = A.stride * BLOCK_SIZE * row+ BLOCK_SIZE * col;

    Matrix tile;
    tile.width = tile.height = BLOCK_SIZE;
    tile.stride = A.stride;
    tile.elements = A.elements + firstElement;
    return tile;
}
// Thread block size
// Forward declaration of the matrix multiplication kernel
__global__ void MatMulKernel(const Matrix, const Matrix, Matrix);
// Reports a failed CUDA runtime call made by MatMul() on stderr.
// Returns true when status is cudaSuccess, false otherwise.
static bool cudaStepOk(cudaError_t status, const char *step)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "MatMul: %s failed: %s\n", step,
            cudaGetErrorString(status));
    return false;
}

// Matrix multiplication - Host code
// Computes C = A * B on the GPU: uploads A and B, launches
// MatMulKernel and copies the result back into C.elements.
// Matrix dimensions are assumed to be multiples of BLOCK_SIZE (the
// kernel performs no bounds checks).
// A, B and C describe host matrices; C.elements must have room for
// C.width * C.height floats. On a CUDA failure a message is printed to
// stderr and C.elements is left unmodified.
void MatMul(const Matrix A, const Matrix B, Matrix C)
{
    // Device-side descriptors mirroring A, B and C
    Matrix d_A;
    d_A.width = d_A.stride = A.width; d_A.height = A.height;
    d_A.elements = NULL;
    Matrix d_B;
    d_B.width = d_B.stride = B.width; d_B.height = B.height;
    d_B.elements = NULL;
    Matrix d_C;
    d_C.width = d_C.stride = C.width; d_C.height = C.height;
    d_C.elements = NULL;

    const size_t sizeA = (size_t)A.width * A.height * sizeof(float);
    const size_t sizeB = (size_t)B.width * B.height * sizeof(float);
    const size_t sizeC = (size_t)C.width * C.height * sizeof(float);

    // Load A and B to device memory and allocate C; && stops at the
    // first failing step
    bool ok =
        cudaStepOk(cudaMalloc((void **)&d_A.elements, sizeA),
                   "allocating A")
        && cudaStepOk(cudaMemcpy(d_A.elements, A.elements, sizeA,
                                 cudaMemcpyHostToDevice),
                      "copying A to the device")
        && cudaStepOk(cudaMalloc((void **)&d_B.elements, sizeB),
                      "allocating B")
        && cudaStepOk(cudaMemcpy(d_B.elements, B.elements, sizeB,
                                 cudaMemcpyHostToDevice),
                      "copying B to the device")
        && cudaStepOk(cudaMalloc((void **)&d_C.elements, sizeC),
                      "allocating C");

    if (ok) {
        // Invoke kernel: one BLOCK_SIZE x BLOCK_SIZE block per tile of C
        dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);
        dim3 dimGrid((B.width+dimBlock.x-1) / dimBlock.x,
                     (A.height+dimBlock.y-1) / dimBlock.y);
        MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);

        // Launch errors come from cudaGetLastError(); execution errors
        // (including a watchdog timeout) surface in the blocking copy
        // that reads C from device memory
        ok = cudaStepOk(cudaGetLastError(), "launching MatMulKernel")
             && cudaStepOk(cudaMemcpy(C.elements, d_C.elements, sizeC,
                                      cudaMemcpyDeviceToHost),
                           "running the kernel / copying C to the host");
    }

    // Free device memory (cudaFree accepts pointers left at NULL)
    cudaFree(d_A.elements);
    cudaFree(d_B.elements);
    cudaFree(d_C.elements);
}
// Matrix multiplication kernel called by MatMul()
// Each thread block produces one BLOCK_SIZE x BLOCK_SIZE tile of C and
// each thread one element of that tile. The block walks across the
// tiles of A (left to right) and B (top to bottom), staging each pair
// in shared memory before multiplying them.
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
    // Staging buffers for the current tiles of A and B
    __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

    // Position of this block's tile within C
    const int tileRow = blockIdx.y;
    const int tileCol = blockIdx.x;

    // Position of this thread's element within the tile
    const int ty = threadIdx.y;
    const int tx = threadIdx.x;

    // Running dot product for this thread's element of C
    float acc = 0;

    const int tileCount = A.width / BLOCK_SIZE;
    int t = 0;
    while (t < tileCount) {
        // Every thread copies one element of each tile into shared memory
        As[ty][tx] = GetElement(GetSubMatrix(A, tileRow, t), ty, tx);
        Bs[ty][tx] = GetElement(GetSubMatrix(B, t, tileCol), ty, tx);

        // Both tiles must be complete before anyone reads them
        __syncthreads();

        // Row of the A tile times column of the B tile
        for (int k = 0; k != BLOCK_SIZE; ++k)
            acc += As[ty][k] * Bs[k][tx];

        // Everyone must finish reading before the tiles are overwritten
        // by the next pass
        __syncthreads();

        ++t;
    }

    // Each thread writes its single element of the C tile
    SetElement(GetSubMatrix(C, tileRow, tileCol), ty, tx, acc);
}
//////////////////////////////////////////////////////////
/// print_matrix: prints a row x col row-major matrix,
/// one matrix row per output line followed by a blank line
//////////////////////////////////////////////////////////
void print_matrix(float *c,int row,int col){
    int r = 0;
    while (r < row){
        int k = 0;
        while (k < col){
            printf("%f ",c[col*r +k]);
            ++k;
        }
        printf("\n\n");
        ++r;
    }
}
//////////////////////////////////////////////////////////
/// random_init: fills a[0 .. size-1] with rand()%10,
/// i.e. whole numbers from 0 to 9 stored as floats
//////////////////////////////////////////////////////////
void random_init(float *a,int size){
    int slot = 0;
    while (slot < size){
        a[slot]=rand()%10;
        ++slot;
    }
}
////////////////////////////////////////////////////////
// Multiplies two 1200 x 1200 all-ones matrices with MatMul(), times the
// call with CUDA events and prints three sample entries of the result.
//
// NOTE: the decorative separator comments must never end in a
// backslash. A trailing '\' splices the following source line into the
// comment, which silently removed the declaration of A, B and C, the
// MatMul() call and a printf from the program.
int main(void){
    // --- Timing events bracketing the MatMul() call
    cudaEvent_t start,stop;

    Matrix A,B,C;
    A.width=1200;
    A.height=1200;
    B.width=1200;
    B.height=1200;
    C.width=B.width;
    C.height=A.height;

    size_t size = A.width * A.height * sizeof(float);
    A.elements = (float *)malloc(size);
    //random_init(A.elements,A.width * A.height );
    size = B.width * B.height * sizeof(float);
    B.elements= (float *)malloc(size);
    //random_init(B.elements,B.width * B.height);
    size = C.width * C.height * sizeof(float);
    C.elements= (float *)malloc(size);

    if (A.elements == NULL || B.elements == NULL || C.elements == NULL) {
        fprintf(stderr, "host memory allocation failed\n");
        free(A.elements);
        free(B.elements);
        free(C.elements);
        return(1);
    }

    for(int i=0;i<A.width*A.height;i++)
        A.elements[i]=1;
    for(int i=0;i<B.width*B.height;i++)
        B.elements[i]=1;

    printf("matrix A(%d,%d) & matrix B(%d,%d) & matrix C(%d,%d)\n",A.width,A.height,B.width,
           B.height,C.width,C.height);

    // --- Start timing
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start,0);

    MatMul(A,B,C);

    // --- Stop timing and report the elapsed time
    cudaEventRecord(stop,0);
    cudaEventSynchronize(stop);
    float elapsedTime;
    cudaEventElapsedTime(&elapsedTime,start,stop);
    printf("Time to genreat : %3.5f ms\n",elapsedTime);

    printf("\nC\n");
    //print_matrix(C.elements,C.height,C.width);
    printf("C[%d]=%f\n",0,C.elements[0]);
    printf("C[%d]=%f\n",C.width -1,C.elements[C.width-1]);
    printf("C[%d]=%f\n",(C.width * C.height)-1,C.elements[(C.width * C.height)-1]);

    // --- Release the events and the host matrices
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    free(A.elements);
    free(B.elements);
    free(C.elements);

    getchar();
    return(0);
}
The following message:
"“display driver stopped responding and has recovered”"
is an indication that you have run into a windows TDR event.
Under windows, kernels that take too long to execute will cause the windows display watchdog timer to reset the display device, which will cause CUDA code execution to be terminated. Kernels that require more than about 2 seconds to execute may run into this.
If you search on "windows TDR" you will find other descriptions and possible methods to work around this. You might also investigate why your code is taking longer to execute after you make the changes.
I'm attempting to compile a basic CUDA matrix multiplication program, but I'm running into this error:
nvcc -I. -I/usr/local/cuda/include -c matrixMult1.cu -o matrixMult1.o
make: nvcc: Command not found
make: *** [matrixMult1.o] Error 127
I was getting another error originally and it was recommended that I use nvcc, the only catch being that I know absolutely nothing about nvcc. Anyone have an idea? Thanks in advance!
Makefile:
# Build rules for the matrixMult1 CUDA program.
# GCC holds the compiler driver (nvcc), which must be on PATH;
# recipes must be indented with a tab.
GCC = nvcc
CUDA_INSTALL_PATH := /usr/local/cuda
INCLUDES := -I. -I$(CUDA_INSTALL_PATH)/include
CUDA_LIBS := -L$(CUDA_INSTALL_PATH)/lib -lcudart

# $@ is the automatic variable for the rule's target ("$#" is not a make
# variable: "#" starts a comment, leaving "-o" without a file name).
matrixMult1.o: matrixMult1.cu
	$(GCC) $(INCLUDES) -c matrixMult1.cu -o $@

matrixMult1: matrixMult1.o
	$(GCC) -o $@ matrixMult1.o $(CUDA_LIBS)

.PHONY: clean
clean:
	$(RM) *.o *~
Kernel:
//********************************************************************
// matrixMul_kernel.cu
//
// Kernel for a basic matrix multiplication program.
//********************************************************************
#ifndef _MATRIXMUL_KERNEL_H_
#define _MATRIXMUL_KERNEL_H_
#include <stdio.h>
/* Thread block size */
#define BLOCK_SIZE 3
#define WA 3
#define HA 3
#define WB 3
#define HB WA
#define WC WB
#define HC HA
/*
   CUDA Kernel: C = A * B for row-major matrices, one thread per
   element of C.

   C  - output matrix, wB columns wide
   A  - left operand, wA columns wide
   B  - right operand, wB columns wide (and wA rows high)
   wA - width of A
   wB - width of B

   Only threadIdx is used to pick the element, so the kernel expects a
   single thread block whose x/y extents match the width/height of C.
*/
__global__ void matrixMul (float * C, float * A, float * B, int wA,
                           int wB) {
    /* Two dimensional thread ID: tx is the column, ty the row of C */
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    /* Computation holder variable */
    float value = 0;

    /* Loop through row of A and column of B to compute cell of C */
    for (int i = 0; i < wA; ++i) {
        float elementA = A[ty * wA + i];
        float elementB = B[i * wB + tx];
        value += elementA * elementB;
    }

    /*
       Write the result to C. C has wB columns, so its row stride is
       wB; using wA only works when A and B happen to have equal width.
    */
    C[ty * wB + tx] = value;
}
#endif
Main Program:
//********************************************************************
// matrixMult1.c
//
// A basic matrix multiplication program.
//********************************************************************
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <matrixMul_kernel.cu>
#define WA 3
#define HA 3
#define WB 3
#define HB WA
#define WC WB
#define HC HA
void initMatrix(float * matrix, int numIndices);
//*************
// Main Program
//*************
// Creates random matrices A and B, multiplies them on the GPU with the
// matrixMul kernel and prints A, B and the product C.
// Returns 0 on success and 1 when the kernel or the result copy fails.
int main(int argc, char** argv) {
    /* Set random seed */
    srand(2013);

    /* Compute memory sizes for matrices A, B, and C */
    unsigned int sizeA = WA * HA;
    unsigned int sizeB = WB * HB;
    unsigned int sizeC = WC * HC;
    unsigned int memoryA = sizeof(float) * sizeA;
    unsigned int memoryB = sizeof(float) * sizeB;
    unsigned int memoryC = sizeof(float) * sizeC;

    /* Allocate memory for matrices A, B, and C */
    float * matrixA = (float *) malloc(memoryA);
    float * matrixB = (float *) malloc(memoryB);
    float * matrixC = (float *) malloc(memoryC);

    /* Initialize matrices A and B */
    initMatrix(matrixA, sizeA);
    initMatrix(matrixB, sizeB);

    /* Print matrix A (rows are WA entries wide) */
    printf("\nMatrix A:\n");
    for (unsigned int i = 0; i < sizeA; i++) {
        printf("%f ", matrixA[i]);
        if (((i + 1) % WA) == 0) {
            printf("\n");
        } else {
            printf(" | ");
        }
    }

    /* Print matrix B (rows are WB entries wide, not WA) */
    printf("\nMatrix B:\n");
    for (unsigned int i = 0; i < sizeB; i++) {
        printf("%f ", matrixB[i]);
        if (((i + 1) % WB) == 0) {
            printf("\n");
        } else {
            printf(" | ");
        }
    }

    /* Allocate device memory */
    float* deviceMemA = NULL;
    float* deviceMemB = NULL;
    float* deviceMemC = NULL;
    cudaMalloc((void**) &deviceMemA, memoryA);
    cudaMalloc((void**) &deviceMemB, memoryB);
    cudaMalloc((void**) &deviceMemC, memoryC);

    /* Copy host memory to device */
    cudaMemcpy(deviceMemA, matrixA, memoryA,
               cudaMemcpyHostToDevice);
    cudaMemcpy(deviceMemB, matrixB, memoryB,
               cudaMemcpyHostToDevice);

    dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
    dim3 grid(WC / threads.x, HC / threads.y);

    /* Execute kernel */
    matrixMul<<< grid, threads >>>(deviceMemC, deviceMemA,
                                   deviceMemB, WA, WB);

    /*
       Copy the result from the device back to the host. (Copying in
       the other direction would overwrite the result with the
       uninitialised host buffer and print garbage.)
    */
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) {
        status = cudaMemcpy(matrixC, deviceMemC, memoryC,
                            cudaMemcpyDeviceToHost);
    }

    if (status == cudaSuccess) {
        /* Print matrix C */
        printf("\nMatrix C:\n");
        for (unsigned int i = 0; i < sizeC; i++) {
            printf("%f ", matrixC[i]);
            if (((i + 1) % WC) == 0) {
                printf("\n");
            } else {
                printf(" | ");
            }
        }
        printf("\n");
    } else {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
    }

    /* Free up memory */
    free(matrixA);
    free(matrixB);
    free(matrixC);
    cudaFree(deviceMemA);
    cudaFree(deviceMemB);
    cudaFree(deviceMemC);

    return (status == cudaSuccess) ? 0 : 1;
}
//--------------------------------------------------------------------
// initMatrix - Populates a matrix with pseudo-random floats.
//
// PRE: matrix points to storage for at least numIndices floats;
// numIndices is the number of elements to initialise.
// POST: matrix[0] .. matrix[numIndices - 1] each hold
// rand() / (float)RAND_MAX. Nothing is written when numIndices
// is zero or negative.
//--------------------------------------------------------------------
void initMatrix(float * matrix, int numIndices) {
    int slot = 0;
    /* One rand() draw per element, front to back */
    while (slot < numIndices) {
        matrix[slot] = rand() / (float)RAND_MAX;
        slot++;
    }
}
This error:
nvcc: Command not found
indicates that nvcc is not in your shell's PATH.
To fix it, assuming it's bash or similar:
PATH=$PATH:/usr/local/cuda/bin
make
...or add it to the system or your user's profile.
I would like to implement a Differential Evolutionary Algorithm in CUDA.
How can I get two random vectors from matrix, knowing that they cannot be accessed again or, conversely, that they can? Is there an easy way of shuffling vectors in matrices?
I would also need to compute something using values from such a vector, and put new values in the bottom cell of each vector. It is easy to do? How to do it?
Maybe there is something like a stack implementation library (get by id, peek by id, ...)?
Maybe you should have a look at thrust library, which is sort of a C++ STL equivalent for CUDA. It has been integrated in the latest release of the CUDA toolkit, but if you have an older version of CUDA you can still download it for free at: http://code.google.com/p/thrust/
in this library, you'll find easy ways to handle vectors and to generate random numbers.
Concerning the implementation of the Differential Evolutionary Algorithm in CUDA as proposed in
R. Storn and K. Price, "Differential evolution: a simple and efficient heuristic for global optimization over continuous spaces," Journal of Global Optimization, vol. 11, no. 4, pp. 341-359, 1997
you seem to be concerned with the crossover operation. The only significant CUDA implementation of the Differential Evolutionary Algorithm in CUDA I'm aware is that in
L.P. de Veronese, R.A. Krohling, "Differential evolution algorithm on the GPU with C-CUDA," Proc. of the IEEE Congress on Evolutionary Computation, Barcelona, Spain, July 18-23, 2010, pp. 1-7.
Below, I'm showing a full CUDA code based on the implementation suggested in the latter paper.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <thrust/device_vector.h>
#include <thrust/extrema.h>
#include <curand.h>
#include <curand_kernel.h>
using namespace thrust;
#include <stdio.h>
#include <time.h>
#include <fstream>
#include "Utilities.cuh"
#define pi 3.14159265358979f
#define BLOCK_SIZE_POP 32
#define BLOCK_SIZE_RAND 64
#define BLOCK_SIZE_UNKN 8
#define BLOCK_SIZE 256
//#define DEBUG
// --- REFERENCES
// [1] R. Storn and K. Price, “Differential evolution – a simple and efficient heuristic for global optimization over continuous spaces,”
// Journal of Global Optimization, vol. 11, no. 4, pp. 341–359, 1997
// [2] Lucas de P. Veronese and Renato A. Krohling, “Differential Evolution Algorithm on the GPU with C-CUDA,”
// Proc. of the IEEE Congress on Evolutionary Computation, Barcelona, Spain, Jul. 18-23, 2010, pp. 1-7.
// Conventions: the index j addresses the population member while the index i addresses the member component
// the homologous host and device variables have the same name with a "h_" or "d_" prefix, respectively
// the __host__ and __device__ functions pointer parameters have the same name for comparison purposes. it is up to the caller to use
// host or device pointers, as appropriate
/**************************************/
/* SETUP OF THE cuRAND GENERATOR STATES */
/**************************************/
// Each thread initialises the state entry at its own global index,
// passing that index to curand_init together with the common seed and
// an offset of 0. There is no bounds check, so `state` must hold at
// least as many entries as there are launched threads.
__global__ void curand_setup_kernel(curandState * __restrict state, const unsigned long int seed)
{
    const int slot = blockIdx.x * blockDim.x + threadIdx.x;
    curandState * const mine = state + slot;
    curand_init(seed, slot, 0, mine);
}
/********************************/
/* INITIALIZE POPULATION ON GPU */
/********************************/
// 2-D launch: x indexes the component (0..D-1), y the population member
// (0..Np-1). Each in-range thread draws one curand_uniform value from
// its own generator state and maps it onto [minima[i], maxima[i]]
// scaled as (max - min) * u + min.
__global__ void initialize_population_GPU(float * __restrict pop, const float * __restrict minima, const float * __restrict maxima,
                                          curandState * __restrict state, const int D, const int Np) {

    const int comp   = threadIdx.x + blockIdx.x * blockDim.x;
    const int member = threadIdx.y + blockIdx.y * blockDim.y;

    if ((comp >= D) || (member >= Np)) return;

    const int cell = member*D+comp;
    pop[cell] = (maxima[comp] - minima[comp]) * curand_uniform(&state[cell]) + minima[comp];
}
/****************************************/
/* EVALUATION OF THE OBJECTIVE FUNCTION */
/****************************************/
// Returns the cost of the D-component vector x. The active objective is
// Rosenbrock's saddle, summed over consecutive component pairs; the
// result is 0 when D < 2.
__host__ __device__ float functional(const float * __restrict x, const int D) {

    // --- De Jong function (alternative objective, disabled)
    //for (int i=0; i<D; i++) total = total + x[i] * x[i];

    // --- Rosenbrock's saddle
    float total = 0.f;
    int k = 1;
    while (k < D) {
        const float prev  = x[k-1];
        const float curve = x[k] - prev * prev;
        const float shift = prev - 1.f;
        total = total + 100.f * curve * curve + shift * shift;
        ++k;
    }

    return total;
}
/********************************/
/* POPULATION EVALUATION ON GPU */
/********************************/
// 1-D launch with one thread per population member: thread j evaluates
// the objective on the D components starting at pop[j*D] and stores
// the cost in fobj[j].
__global__ void evaluation_GPU(const int Np, const int D, const float * __restrict pop, float * __restrict fobj) {

    const int member = blockIdx.x * blockDim.x + threadIdx.x;
    if (member >= Np) return;

    const float * individual = &pop[member*D];
    fobj[member] = functional(individual, D);
}
/**********************************************************/
/* GENERATE MUTATION INDICES AND CROSS-OVER VALUES ON GPU */
/**********************************************************/
// 1-D launch with one thread per population member j. For each member
// the kernel draws three mutually distinct population indices a, b, c,
// all different from j and all inside [0, Np-1], stores them in
// mutation[3*j .. 3*j+2], and stores one uniform crossover draw in
// Rand[j]. All draws for member j use the generator state at
// state[j*D].
//
// Requires Np >= 4, otherwise the rejection loops cannot terminate.
__global__ void generate_mutation_indices_and_crossover_values_GPU(float * __restrict Rand, int * __restrict mutation, const int Np, const int D,
                                                                   curandState * __restrict state) {

    int j = threadIdx.x + blockIdx.x * blockDim.x;

    int a, b, c;

    if (j < Np) {

        curandState * const rng = &state[j*D];

        // curand_uniform returns values in (0, 1]: 1.0 is included, so
        // Np * u can evaluate to Np, which is one past the last valid
        // member. Such draws are rejected and redrawn rather than
        // clamped, which keeps the index distribution uniform.
        do a=Np*(curand_uniform(rng)); while(a>=Np||a==j);
        do b=Np*(curand_uniform(rng)); while(b>=Np||b==j||b==a);
        do c=Np*(curand_uniform(rng)); while(c>=Np||c==j||c==a||c==b);
        mutation[j*3]=a;
        mutation[j*3+1]=b;
        mutation[j*3+2]=c;

        Rand[j]=curand_uniform(rng);
    }
}
/**********************************/
/* GENERATION OF A NEW POPULATION */
/**********************************/
// 2-D launch: x indexes the component (0..D-1), y the population member
// (0..NP-1). Each in-range thread builds one component of the trial
// vector for its member: when rand[j] < CR the component is the
// differential mutation pop[a] + F*(pop[b] - pop[c]), otherwise it is
// copied from the current member. The value is then saturated to
// [minimum[i], maximum[i]] and written to npop.
__global__ void generation_new_population_GPU(const float * __restrict pop, const int NP, const int D, float * __restrict npop, const float F,
                                              const float CR, const float * __restrict rand, const int * __restrict mutation,
                                              const float * __restrict minimum, const float * __restrict maximum) {

    const int comp   = threadIdx.x + blockIdx.x * blockDim.x;
    const int member = threadIdx.y + blockIdx.y * blockDim.y;

    if ((comp >= D) || (member >= NP)) return;

    // --- Mutation indices
    const int a = mutation[member*3];
    const int b = mutation[member*3+1];
    const int c = mutation[member*3+2];

    const int cell = member*D+comp;

    // --- Mutation and crossover
    // --- One of the best strategies. Try F = 0.7 and CR = 0.5 as a first guess.
    float trial;
    if (rand[member] < CR) trial = pop[a*D+comp]+F*(pop[b*D+comp]-pop[c*D+comp]);
    else                   trial = pop[cell];

    // --- Other possible approaches to mutation and crossover
    // --- Not bad, but found several optimization problems where misconvergence occurs.
    //npop[j*D+i] = pop[best_old_gen_ind*D+i] + F*(pop[b*D+i]-pop_old[c*D+i]);
    // --- One of the best strategies. Try F = 0.85 and CR = 1. In case of misconvergence, try to increase NP. If this doesn't help,
    //     play around with all the control variables.
    //npop[j*D+i] = pop[j*D+i] + F*(pop[best_old_gen_ind*D+i] - pop[j*D+i]) + F*(pop[a*D+i]-pop[b*D+i]);
    // --- Powerful strategy worth trying.
    //npop[j*D+i] = pop[best_old_gen_ind*D+i] + (pop[a*D+i]+pop[b*D+i]-pop[c*D+i]-pop[d*D+i])*F;
    // --- Robust optimizer for many functions.
    //npop[j*D+i] = pop[e*D+i] + (pop[a*D+i]+pop[b*D+i]-pop[c*D+i]-pop[d*D+i])*F;

    // --- Saturation due to constraints on the unknown parameters
    const float upper = maximum[comp];
    if (trial > upper) {
        trial = upper;
    } else {
        const float lower = minimum[comp];
        if (trial < lower) trial = lower;
    }

    npop[cell] = trial;
}
/*******************************/
/* POPULATION SELECTION ON GPU */
/*******************************/
// Selection step: evaluates each trial vector and keeps it if it is better.
//
// For member j the cost of the trial vector npop[j*D .. j*D+D-1] is computed
// with functional(). If it is lower than the stored cost fobj[j], the trial
// vector replaces the member in pop and fobj[j] is updated.
//
// Launch layout: threadIdx.x selects the unknown i, the y index selects the
// member j. blockIdx.x is NOT used, so all the optimization variables of a
// member must be handled by the same thread block (blockDim.x >= D).
//
// Every thread of a member compares against fobj[j], and fobj[j] is also
// overwritten when the trial vector wins. The comparison and the update are
// therefore separated by a block-wide barrier: without it a thread could read
// the already-updated cost, see "nfobj < nfobj" as false and skip copying its
// component, leaving pop[j] half-updated.
//
//   Np, D - population size and number of unknowns
//   pop   - current population, updated in place (Np*D floats)
//   npop  - trial population, read only (Np*D floats)
//   fobj  - cost of each member, updated in place (Np floats)
__global__ void selection_and_evaluation_GPU(const int Np, const int D, float * __restrict pop, const float * __restrict npop, float * __restrict fobj) {
    int i = threadIdx.x;
    int j = threadIdx.y + blockIdx.y * blockDim.y;

    // --- Phase 1: every in-range thread decides, from the OLD cost, whether the trial vector wins
    float nfobj = 0.0f;
    bool improved = false;
    if ((i < D) && (j < Np)) {
        nfobj = functional(&npop[j*D], D);
        improved = (nfobj < fobj[j]);
    }

    // --- The barrier is outside the branch so that all threads of the block reach it
    __syncthreads();

    // --- Phase 2: commit the winning trial vector component by component
    if (improved) {
        pop[j*D+i] = npop[j*D+i];
        // --- All threads of member j hold the same nfobj; one write is enough
        if (i == 0) fobj[j] = nfobj;
    }
}
/***********************/
/* FIND MINIMUM ON GPU */
/***********************/
// Host-side helper: locates the smallest of the N floats stored at device
// pointer t by means of thrust::min_element.
//
//   N      - number of elements to scan
//   t      - raw device pointer to the data
//   minval - receives the minimum value in minval[0] (written from host code)
//   index  - receives the position of the minimum within t in index[0]
//            (written from host code)
void find_minimum_GPU(const int N, float *t, float * __restrict minval, int * __restrict index) {
    // --- View the raw device pointer as the thrust range [first, last)
    device_ptr<float> first = device_pointer_cast(t);
    device_ptr<float> last  = first + N;
    // --- Search for the smallest element on the device
    device_ptr<float> smallest = thrust::min_element(first, last);
    // --- Report the value (read back through the device_ptr) and its offset from the start
    minval[0] = *smallest;
    index[0]  = smallest - first;
}
/********/
/* MAIN */
/********/
// Runs the differential-evolution optimizer on the GPU.
//
// Allocates the population and auxiliary buffers, initializes the cuRAND
// states and the population, then performs Gmax generations of
// mutation/crossover, selection and best-member search, printing the best
// cost of every generation. Finally the whole population is copied back to
// the host and the unknowns of the best member of the last generation are
// printed. Returns 0 on success and 1 if a host allocation fails.
int main()
{
    // --- Number of individuals in the population (Np >=4 for mutation purposes)
    int Np = 80;
    // --- Dimensionality of each individual (number of unknowns)
    int D = 5;
    // --- Mutation factor (0 < F <= 2). Typically chosen in [0.5, 1], see Ref. [1]
    float F = 0.7f;
    // --- Maximum number of generations
    int Gmax = 2000;
    // --- Crossover constant (0 < CR <= 1)
    float CR = 0.4f;

    int   *d_mutation,          // --- Device side mutation vector (three mutually different random indices per member)
          *h_best_index_dev;    // --- Host side current optimal member index of device side
    float *d_pop,               // --- Device side population
          *d_npop,              // --- Device side new population (trial vectors)
          *d_Rand,              // --- Device side crossover rand vector (uniformly distributed random numbers)
          *d_fobj,              // --- Device side objective function value
          *d_maxima,            // --- Device side maximum constraints vector
          *d_minima,            // --- Device side minimum constraints vector
          *h_pop_dev_res,       // --- Host side population result of GPU computations
          *h_best_dev,          // --- Host side population best value history of device side
          *h_maxima,            // --- Host side maximum constraints vector
          *h_minima;            // --- Host side minimum constraints vector
    curandState *devState;      // --- Device side random generator state vector

    // --- Device side memory allocations
    gpuErrchk(cudaMalloc((void**)&d_pop,      D*Np*sizeof(float)));
    gpuErrchk(cudaMalloc((void**)&d_npop,     D*Np*sizeof(float)));
    gpuErrchk(cudaMalloc((void**)&d_Rand,     Np*sizeof(float)));
    gpuErrchk(cudaMalloc((void**)&d_fobj,     Np*sizeof(float)));
    gpuErrchk(cudaMalloc((void**)&d_mutation, 3*Np*sizeof(int)));
    gpuErrchk(cudaMalloc((void**)&d_maxima,   D*sizeof(float)));
    gpuErrchk(cudaMalloc((void**)&d_minima,   D*sizeof(float)));
    gpuErrchk(cudaMalloc((void**)&devState,   D*Np*sizeof(curandState)));

    // --- Host side memory allocations
    h_pop_dev_res    = (float*)malloc(D*Np*sizeof(float));
    h_best_dev       = (float*)malloc(Gmax*sizeof(float));
    h_best_index_dev = (int*)malloc(Gmax*sizeof(int));
    h_maxima         = (float*)malloc(D*sizeof(float));
    h_minima         = (float*)malloc(D*sizeof(float));
    if (!h_pop_dev_res || !h_best_dev || !h_best_index_dev || !h_maxima || !h_minima) {
        fprintf(stderr, "Host memory allocation failed\n");
        return 1;
    }

    // --- Define grid sizes
    int Num_Blocks_Rand2 = iDivUp(Np,BLOCK_SIZE_RAND);
    dim3 Grid(iDivUp(D,BLOCK_SIZE_UNKN),iDivUp(Np,BLOCK_SIZE_POP));
    dim3 Block(BLOCK_SIZE_UNKN,BLOCK_SIZE_POP);

    // --- Set maxima and minima
    for (int i=0; i<D; i++) {
        h_maxima[i] =  2.0f;
        h_minima[i] = -2.0f;
    }
    gpuErrchk(cudaMemcpy(d_maxima, h_maxima, D*sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_minima, h_minima, D*sizeof(float), cudaMemcpyHostToDevice));

    // --- Initialize cuRAND states
    curand_setup_kernel<<<iDivUp(D*Np, BLOCK_SIZE), BLOCK_SIZE>>>(devState, time(NULL));

    // --- Initialize population
    initialize_population_GPU<<<Grid, Block>>>(d_pop, d_minima, d_maxima, devState, D, Np);
#ifdef DEBUG
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
#endif

    // --- Evaluate population
    evaluation_GPU<<<iDivUp(Np, BLOCK_SIZE), BLOCK_SIZE>>>(Np, D, d_pop, d_fobj);
#ifdef DEBUG
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
#endif

    for(int i=0;i<Gmax;i++) {
        // --- Generate mutation indices and cross-over uniformly distributed random vector
        generate_mutation_indices_and_crossover_values_GPU<<<Num_Blocks_Rand2,BLOCK_SIZE_RAND>>>(d_Rand, d_mutation, Np, D, devState);
#ifdef DEBUG
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
#endif

        // --- Generate new population
        generation_new_population_GPU<<<Grid,Block>>>(d_pop, Np, D, d_npop, F, CR, d_Rand, d_mutation, d_minima, d_maxima);
#ifdef DEBUG
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
#endif

        // --- Select new population and evaluate it
        selection_and_evaluation_GPU<<<Grid,Block>>>(Np, D, d_pop, d_npop, d_fobj);
#ifdef DEBUG
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
#endif

        // --- Record the best cost of this generation and the member that attains it
        find_minimum_GPU(Np, d_fobj, &h_best_dev[i], &h_best_index_dev[i]);
        printf("Iteration: %i; best member value: %f: best member index: %i\n", i, h_best_dev[i], h_best_index_dev[i]);
    }

    // --- Copy back the WHOLE population (D values for each of the Np members): the best
    //     member is read below at offset best_index*D, which can lie anywhere in the buffer
    gpuErrchk(cudaMemcpy(h_pop_dev_res, d_pop, D*Np*sizeof(float), cudaMemcpyDeviceToHost));
    for (int i=0; i<D; i++) printf("Variable nr. %i = %f\n", i, h_pop_dev_res[h_best_index_dev[Gmax-1]*D+i]);

    // --- Release device side memory
    gpuErrchk(cudaFree(d_pop));
    gpuErrchk(cudaFree(d_npop));
    gpuErrchk(cudaFree(d_Rand));
    gpuErrchk(cudaFree(d_fobj));
    gpuErrchk(cudaFree(d_mutation));
    gpuErrchk(cudaFree(d_maxima));
    gpuErrchk(cudaFree(d_minima));
    gpuErrchk(cudaFree(devState));

    // --- Release host side memory
    free(h_pop_dev_res);
    free(h_best_dev);
    free(h_best_index_dev);
    free(h_maxima);
    free(h_minima);

    return 0;
}