How to compute grouped sums in CUDA

I would like to know what techniques I could apply to sum along one dimension of an array and save the result to a new, smaller vector, as in the following example:
A -> [1,2], [3,4], [5,6]
B -> [3], [7], [11]
figure:
http://snag.gy/83Qwl.jpg

If you want to write your own CUDA kernel, take a look at the vectorAdd sample. Instead of passing 2 input vectors to the kernel, you would pass just A, and each thread would loop over the elements of one "row" of A to produce its sum:
__global__ void mykernel(int *A, int *B, int rows, int cols){
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx < rows) {
        int sum = 0;
        for (int i = 0; i < cols; i++)
            sum += A[(idx*cols)+i];   // row-major: each thread strides across its own row
        B[idx] = sum;
    }
}
This won't be terribly efficient (adjacent threads read addresses that are cols apart, so the loads are uncoalesced), but you can improve on it if you can store your A array in column-major order:
A -> [1,3,5], [2,4,6]
then a modification to the above kernel becomes pretty efficient:
__global__ void mykernel(int *A, int *B, int rows, int cols){
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx < rows) {
        int sum = 0;
        for (int i = 0; i < cols; i++)
            sum += A[(i*rows)+idx];   // column-major: adjacent threads read adjacent addresses (coalesced)
        B[idx] = sum;
    }
}
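Either kernel can be exercised with a host-side harness along these lines (a minimal sketch using the example data with the row-major version; the grid/block sizes and names are illustrative):

#include <cstdio>

int main(){
    const int rows = 3, cols = 2;
    int hA[rows*cols] = {1, 2, 3, 4, 5, 6};   // row-major: [1,2],[3,4],[5,6]
    int hB[rows];
    int *dA, *dB;
    cudaMalloc(&dA, sizeof(hA));
    cudaMalloc(&dB, sizeof(hB));
    cudaMemcpy(dA, hA, sizeof(hA), cudaMemcpyHostToDevice);
    mykernel<<<(rows + 255)/256, 256>>>(dA, dB, rows, cols);   // one thread per row
    cudaMemcpy(hB, dB, sizeof(hB), cudaMemcpyDeviceToHost);
    for (int i = 0; i < rows; i++) printf("%d\n", hB[i]);      // prints 3, 7, 11
    cudaFree(dA); cudaFree(dB);
    return 0;
}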
If you're looking for efficiency but can't reorganize your data, then a segmented parallel reduction will be fastest. You could build something from the CUDA sample codes, but I would suggest using Thrust, specifically thrust::reduce_by_key.
You would leave your A array as is and use it as the "values":
A -> [1,2], [3,4], [5,6]
And you would create a "key" array which corresponds to the rows of your A array:
K -> [0,0], [1,1], [2,2]
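Putting it together, a minimal sketch of the reduce_by_key approach (here the keys are generated on the fly with a placeholder expression rather than stored by hand; all names are illustrative):

#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <thrust/transform.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/functional.h>
#include <iostream>

int main(){
    const int rows = 3, cols = 2;
    int a[rows*cols] = {1, 2, 3, 4, 5, 6};    // A in row-major order
    thrust::device_vector<int> A(a, a + rows*cols);
    thrust::device_vector<int> K(rows*cols);  // keys: 0,0,1,1,2,2
    thrust::transform(thrust::counting_iterator<int>(0),
                      thrust::counting_iterator<int>(rows*cols),
                      K.begin(),
                      thrust::placeholders::_1 / cols);
    thrust::device_vector<int> B(rows);       // row sums
    thrust::device_vector<int> Kout(rows);    // reduced keys (unused here)
    thrust::reduce_by_key(K.begin(), K.end(), A.begin(), Kout.begin(), B.begin());
    for (int i = 0; i < rows; i++)
        std::cout << B[i] << std::endl;       // prints 3, 7, 11
    return 0;
}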

Related

How to guarantee the random number generator seed to be different each time in thrust

When we write a CUDA kernel, we usually do something like this so that the random state is carried over between launches:
__global__ void kernel(curandState *globalState){
    int id = threadIdx.x + blockDim.x * blockIdx.x;
    curandState localState = globalState[id];  // copy this thread's state to a register
    // generate random numbers with localState...
    globalState[id] = localState;              // write the advanced state back
}
and if we run the kernel several times, the random numbers are always different.
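(For completeness, this pattern assumes a one-time initialization of globalState; a hedged sketch of such a setup kernel, where curand_init gives every thread its own subsequence of a single seed:)

#include <curand_kernel.h>

__global__ void setup(curandState *globalState, unsigned long long seed){
    int id = threadIdx.x + blockDim.x * blockIdx.x;
    curand_init(seed, id, 0, &globalState[id]);  // seed, subsequence, offset, state
}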
My question is: if we want to use Thrust to generate random numbers based on this question:
Generating a random number vector between 0 and 1.0 using Thrust
and talonmies' answer, and we need to run the transform several times with the same functor prg, how can we get a different seed for each operation?
I tried to rewrite the code as follows:
#include <thrust/random.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/iterator/counting_iterator.h>
#include <iostream>
#include <time.h>

struct prg
{
    float a, b;
    unsigned int N;

    __host__ __device__
    prg(float _a=0.f, float _b=1.f, unsigned int _N = time(NULL)) : a(_a), b(_b), N(_N) {}

    __host__ __device__
    float operator()(const unsigned int n) const
    {
        thrust::default_random_engine rng(N);
        thrust::uniform_real_distribution<float> dist(a, b);
        rng.discard(n);
        return dist(rng);
    }
};

int main(void)
{
    const int N = 5;
    thrust::device_vector<float> numbers(N);
    thrust::counting_iterator<unsigned int> index_sequence_begin(0);

    // first operation
    thrust::transform(index_sequence_begin, index_sequence_begin + N, numbers.begin(), prg(1.f,2.f));
    for (int i = 0; i < N; i++)
    {
        std::cout << numbers[i] << std::endl;
    }

    // second operation
    thrust::transform(index_sequence_begin, index_sequence_begin + N, numbers.begin(), prg(1.f,2.f));
    for (int i = 0; i < N; i++)
    {
        std::cout << numbers[i] << std::endl;
    }
    return 0;
}
The first operation and the second operation generate the same numbers. I know it is because the two calls to time(NULL) fall within the same second, so both functors get the same seed. How should I modify the code to get different random numbers for these two operations? I guess it is possible to assign the seed based on an operation counter (1, 2, ..., 10000, 10001, ..., N), but would that be expensive to do?
To paraphrase John von Neumann: "Nothing as important as random numbers should be left to chance."
If you cannot guarantee that the seeds for the random generators are different (and it appears you cannot in this case), then don't try to have different seeds. Use one seeded generator instance and take different sequences from it.
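In the transform-functor pattern above, one way to read that advice (a sketch; the offset parameter is my addition, not part of any Thrust API) is to fix the seed once and have each operation discard into a disjoint part of the engine's sequence:

struct prg
{
    float a, b;
    unsigned int seed, offset;

    __host__ __device__
    prg(float _a, float _b, unsigned int _seed, unsigned int _offset)
        : a(_a), b(_b), seed(_seed), offset(_offset) {}

    __host__ __device__
    float operator()(const unsigned int n) const
    {
        thrust::default_random_engine rng(seed);   // one fixed seed for all operations
        thrust::uniform_real_distribution<float> dist(a, b);
        rng.discard(n + offset);                   // disjoint subsequence per operation
        return dist(rng);
    }
};

// usage: the k-th operation over N elements skips past all previous ones, e.g.
// thrust::transform(index_sequence_begin, index_sequence_begin + N, numbers.begin(),
//                   prg(1.f, 2.f, seed, k * N));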

Can't get matrix*vector multiplication to go faster in CUDA than in CPU

#include <iostream>
#include <assert.h>
#include <sys/time.h>
#include <stdlib.h>
#include <time.h>

#define BLOCK_SIZE 32 // CUDA block size

__device__ inline int getValFromMatrix(int* matrix, int row, int col, int matSize) {
    if (row<matSize && col<matSize) {return matrix[row*matSize + col];}
    return 0;
}

__device__ inline int getValFromVector(int* vector, int row, int matSize) {
    if (row<matSize) {return vector[row];}
    return 0;
}

__global__ void matVecMultCUDAKernel(int* aOnGPU, int* bOnGPU, int* cOnGPU, int matSize) {
    __shared__ int aRowShared[BLOCK_SIZE];
    __shared__ int bShared[BLOCK_SIZE];
    __shared__ int myRow;
    __shared__ double rowSum;

    int myIndexInBlock = threadIdx.x;
    myRow = blockIdx.x;
    rowSum = 0;

    for (int m = 0; m < (matSize / BLOCK_SIZE + 1); m++) {
        aRowShared[myIndexInBlock] = getValFromMatrix(aOnGPU, myRow, m*BLOCK_SIZE+myIndexInBlock, matSize);
        bShared[myIndexInBlock] = getValFromVector(bOnGPU, m*BLOCK_SIZE+myIndexInBlock, matSize);
        __syncthreads(); // Sync threads to make sure all fields have been written to aRowShared and bShared
        if (myIndexInBlock==0) {
            for (int k=0; k<BLOCK_SIZE; k++) {
                rowSum += aRowShared[k] * bShared[k];
            }
        }
        __syncthreads(); // Sync again so no thread overwrites the shared tiles while thread 0 is still reading them
    }
    if (myIndexInBlock==0) {cOnGPU[myRow] = rowSum;}
}

static inline void cudaCheckReturn(cudaError_t result) {
    if (result != cudaSuccess) {
        std::cerr << "CUDA Runtime Error: " << cudaGetErrorString(result) << std::endl;
        assert(result == cudaSuccess);
    }
}

static void matVecMultCUDA(int* aOnGPU, int* bOnGPU, int* cOnGPU, int* c, int sizeOfc, int matSize) {
    matVecMultCUDAKernel<<<matSize,BLOCK_SIZE>>>(aOnGPU,bOnGPU,cOnGPU,matSize); // Launch 1 block per row
    cudaCheckReturn(cudaMemcpy(c,cOnGPU,sizeOfc,cudaMemcpyDeviceToHost));
}

static void matVecMult(int** A, int* b, int* c, int matSize) {
    // Sequential implementation:
    for (int i=0; i<matSize; i++) {
        c[i] = 0;
        for (int j=0; j<matSize; j++) {
            c[i] += (A[i][j] * b[j]);
        }
    }
}

int main() {
    int matSize = 1000;
    int **A, *b, *c;
    int *aOnGPU, *bOnGPU, *cOnGPU;

    A = new int*[matSize];
    for (int i = 0; i < matSize; i++) {A[i] = new int[matSize]();}
    b = new int[matSize]();
    c = new int[matSize]();

    int aSizeOnGPU = matSize * matSize * sizeof(int), bcSizeOnGPU = matSize * sizeof(int);
    cudaCheckReturn(cudaMalloc(&aOnGPU,aSizeOnGPU)); // cudaMallocPitch?
    cudaCheckReturn(cudaMalloc(&bOnGPU,bcSizeOnGPU));
    cudaCheckReturn(cudaMalloc(&cOnGPU,bcSizeOnGPU));

    srand(time(NULL));
    for (int i=0; i<matSize; i++) {
        b[i] = rand()%100;
        for (int j=0; j<matSize; j++) {
            A[i][j] = rand()%100;
        }
    }
    for (int i=0; i<matSize; i++) {cudaCheckReturn(cudaMemcpy((aOnGPU+i*matSize),A[i],bcSizeOnGPU,cudaMemcpyHostToDevice));}
    cudaCheckReturn(cudaMemcpy(bOnGPU,b,bcSizeOnGPU,cudaMemcpyHostToDevice));

    int iters = 1;
    timeval start, end;

    // Sequential run:
    gettimeofday(&start,NULL);
    for (int i=0; i<iters; i++) {matVecMult(A,b,c,matSize);}
    gettimeofday(&end,NULL);
    std::cout << (end.tv_sec*1000000 + end.tv_usec) - (start.tv_sec*1000000 + start.tv_usec) << std::endl;

    // CUDA run:
    gettimeofday(&start,NULL);
    for (int i=0; i<iters; i++) {matVecMultCUDA(aOnGPU,bOnGPU,cOnGPU,c,bcSizeOnGPU,matSize);}
    gettimeofday(&end,NULL);
    std::cout << (end.tv_sec*1000000 + end.tv_usec) - (start.tv_sec*1000000 + start.tv_usec) << std::endl;

    cudaCheckReturn(cudaFree(aOnGPU));
    cudaCheckReturn(cudaFree(bOnGPU));
    cudaCheckReturn(cudaFree(cOnGPU));

    for (int i = 0; i < matSize; ++i) {
        delete[] A[i];
    }
    delete[] A;
    delete[] b;
    delete[] c;
}
Gives (timings in microseconds):
267171
580253
I've followed the guide at http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory on how to do a matrix multiplication. I used shared memory for both the matrix (A) and the vector (B), but no matter what matrix size (100x100 to 20000x20000) or block size (32 to 1024) I choose, the sequential implementation always outperforms the CUDA implementation; it is about twice as fast.
Since I'm doing a matrix*vector multiplication, the shared arrays and blocks are handled a bit differently: I'm using one block per row of the matrix instead of a 2D block over a part of the matrix.
Is my implementation wrong, or is CUDA simply not faster than the CPU?
First: you perform boundary checks in the CUDA implementation that you don't perform on the CPU. Branching is expensive on a GPU.
Second: you count the cudaMemcpy in the CUDA timing. It's very uncommon to perform only one multiplication before copying the result back to the CPU.
Usually (in conjugate gradient solvers, for example), you perform several hundred multiplications on the GPU before having to copy back.
Third: don't try to implement this yourself (except for educational purposes); use vendor libraries (like cuBLAS, which ships with every CUDA release), which are extremely hard to outperform.
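For reference, a minimal sketch of the library route with cuBLAS (gemv works on float/double rather than int, so this assumes the data type can change and that the matrix is stored in column-major order as cuBLAS expects; all names here are illustrative):

#include <cublas_v2.h>

// c = 1.0 * A * b + 0.0 * c, with A (matSize x matSize) column-major on the device
void matVecMultCublas(cublasHandle_t handle, const float* d_A,
                      const float* d_b, float* d_c, int matSize)
{
    const float alpha = 1.0f, beta = 0.0f;
    cublasSgemv(handle, CUBLAS_OP_N,
                matSize, matSize,   // rows and columns of A
                &alpha,
                d_A, matSize,       // leading dimension of A
                d_b, 1,             // input vector and its stride
                &beta,
                d_c, 1);            // output vector and its stride
}

The handle is created once with cublasCreate(), and the program is linked against -lcublas.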

scan for large arrays

I want to use the scan function in the Thrust library for large arrays, but I get a core dump for arrays larger than 32768 elements. I was wondering if there is another option besides thrust::inclusive_scan.
Here is a snippet of my code:
#include <thrust/scan.h>
#include <stdio.h>

int main()
{
    int *x;
    int n = 65536;
    x = (int *) malloc(n);
    for (int i=0; i<n; i++)
        x[i] = i;
    thrust::inclusive_scan(x, x+n, x);
    for (int i=0; i<n; i++)
        printf(" %d ", x[i]);
    printf("\n");
}
This:
x = (int *) malloc(n);
allocates n bytes of storage. You want storage for n integers:
x = (int *) malloc(n*sizeof(int));
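With that one-line change the program runs for n = 65536. As an aside, a sketch that sidesteps manual byte counts entirely by using thrust::host_vector, whose constructor takes an element count:

#include <thrust/scan.h>
#include <thrust/host_vector.h>
#include <stdio.h>

int main()
{
    const int n = 65536;
    thrust::host_vector<int> x(n);   // sized in elements, not bytes
    for (int i = 0; i < n; i++)
        x[i] = i;
    thrust::inclusive_scan(x.begin(), x.end(), x.begin());
    printf("%d\n", (int)x[n-1]);     // 0+1+...+65535 = 2147450880
    return 0;
}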

CUDA function to increase array values

When I use this code in CUDA, it only increases a[0], a[1], a[2]; the others stay 0 (not incremented):
__global__ void inc2(int * a){
    int i = threadIdx.x;
    i %= 10;
    atomicAdd(&(a[i]), 1);
}
When I write:
__global__ void inc2(int * a){
    int i = threadIdx.x;
    i %= 10;
    atomicAdd(&(a[6]), 1);
}
it doesn't increase a[6] either.
What's wrong?
The complete code is:
__global__ void inc2(int * a){
    int i = threadIdx.x;
    i %= 10;
    atomicAdd(&(a[6]), 1);
}

int main()
{
    int aaa[10] = {0};
    int *q;

    cudaMalloc((void**)&q, 100);
    cudaMemcpy(q, aaa, 10, cudaMemcpyHostToDevice);
    inc2<<<100,100>>>(q);
    cudaMemcpy(aaa, q, 10, cudaMemcpyDeviceToHost);

    printf("\n\n");
    for (int i=0; i<10; i++){
        printf("%d\t", aaa[i]);
    }
    cudaFree(q);
    return 0;
}
First of all, you should use proper CUDA error checking any time you are having trouble with CUDA code.
You may be confused about the size parameters associated with functions like cudaMalloc or cudaMemcpy. They represent a size in bytes. So this:
cudaMemcpy(aaa,q,10,cudaMemcpyDeviceToHost);
only transfers 10 bytes, which is 2.5 int quantities. If you want to see the modified value of a[6], you're going to have to transfer more than the first 2 int quantities in a.
If you modify these lines:
cudaMemcpy(q,aaa,40,cudaMemcpyHostToDevice);
^^
and:
cudaMemcpy(aaa,q,40,cudaMemcpyDeviceToHost);
^^
I think you'll have better results.
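Put together, a minimal corrected sketch with all sizes written as element counts times sizeof(int) (using the first kernel from the question; each slot should then come back as 1000, since 100 blocks x 100 threads map 10 increments apiece onto 10 slots):

#include <stdio.h>

__global__ void inc2(int * a){
    int i = threadIdx.x;
    i %= 10;
    atomicAdd(&(a[i]), 1);
}

int main()
{
    int aaa[10] = {0};
    int *q;

    cudaMalloc((void**)&q, 10 * sizeof(int));                     // 10 ints, in bytes
    cudaMemcpy(q, aaa, 10 * sizeof(int), cudaMemcpyHostToDevice);
    inc2<<<100,100>>>(q);
    cudaMemcpy(aaa, q, 10 * sizeof(int), cudaMemcpyDeviceToHost); // copy all 10 ints back

    for (int i = 0; i < 10; i++){
        printf("%d\t", aaa[i]);                                   // each slot prints 1000
    }
    printf("\n");
    cudaFree(q);
    return 0;
}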

CUDA Thrust: Finding the index of the first element in a vector satisfying a predicate (e.g., zero or negative) [Matlab's syntax min(find(x<=0))]

I am attempting to find the the index of the first zero or negative value of an array using CUDA Thrust. The serial CPU code I am attempting to write using CUDA Thrust is the following:
for (int i = StartIndex; i <= ArrayLimitIndex; i++)
{
if (Array[i] <= 0) { DesiredIndex = i; break; }
}
I am thinking that the easiest way to do this on the GPU will be using the find_if function within the Thrust library.
The array is already on the GPU and I am attempting to search for the index on this array using Thrust as such:
struct less_than_or_eq_zero
{
    __host__ __device__
    bool operator() (double x)
    {
        return x <= 0;
    }
};

thrust::device_vector<double>::iterator iter;
thrust::device_ptr<double> dev_ptr_Col46 = thrust::device_pointer_cast(dev_Col46);

iter = thrust::find_if(thrust::device, dev_ptr_Col46, dev_ptr_Col46 + size, less_than_or_eq_zero());
Now I would like to use the value of iter as an argument for my next kernel:
newKernel<<<size, 1>>>(*dev_array, iter)
where the newKernel definition is of the form:
__global__ void newKernel(double *dev_array, iter)
{
    int x = blockIdx.x;
    if (x <= iter)
    {
        // process data here...
    }
}
I know that the code I have here is incorrect, and I have a few questions regarding the use of iter. First, iter is an iterator into a device_vector. Is there any way I can make iter just one value rather than an iterator? Also, once I have executed find_if, how can I use the value of iter in my next kernel call?
Any help with this would be greatly appreciated.
Thanks
I'm summarizing the comments by talonmies and Jared Hoberock above, as well as the answer by Sebastian Dressler, in a fully compilable and executable example. The code uses CUDA Thrust to calculate the index of the first element of a vector satisfying a predicate (x <= 0 in this case); I hope it will be helpful to future readers.
#include <thrust/device_vector.h>
#include <stdio.h>

struct less_than_or_eq_zero
{
    __host__ __device__ bool operator() (double x) { return x <= 0.; }
};

int main(void)
{
    int N = 6;
    thrust::device_vector<float> D(N);
    D[0] = 3.;
    D[1] = 2.3;
    D[2] = -1.3;
    D[3] = 0.;
    D[4] = 3.;
    D[5] = -44.;

    thrust::device_vector<float>::iterator iter1 = D.begin();
    thrust::device_vector<float>::iterator iter2 = thrust::find_if(D.begin(), D.begin() + N, less_than_or_eq_zero());

    int d = thrust::distance(iter1, iter2);
    printf("Index = %i\n", d);

    getchar();
    return 0;
}
As you do not use a device_vector in your kernel but a raw array, you have to pass it an index and not an iterator. You can obtain the index by using thrust::distance to calculate the distance between dev_ptr_Col46 and iter.
You'll also want to read thrust iterators documentation, where distance is documented.
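Applied to the names from the question, that index-based route looks like this sketch (the int parameter replacing the iterator is the change):

thrust::device_ptr<double> dev_ptr_Col46 = thrust::device_pointer_cast(dev_Col46);
thrust::device_ptr<double> iter =
    thrust::find_if(dev_ptr_Col46, dev_ptr_Col46 + size, less_than_or_eq_zero());

// index of the first match (equals size if no element satisfies the predicate)
int idx = thrust::distance(dev_ptr_Col46, iter);

newKernel<<<size, 1>>>(dev_array, idx);

// with the kernel now taking a plain index:
// __global__ void newKernel(double *dev_array, int idx) { ... }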
Try this:
thrust::device_ptr<double> val_ptr = thrust::find_if(dev_ptr_Col46, dev_ptr_Col46 + size, less_than_or_eq_zero());
double * val = thrust::raw_pointer_cast(val_ptr);
newKernel<<<size, 1>>>(dev_array, val);
Your kernel will have to have signature
__global__ void newKernel(double * dev_array, double * val)
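How the kernel body uses that pointer depends on your processing; one sketch, assuming dev_array is the same buffer that was searched (so val - dev_array recovers the found index, mirroring the if (x <= iter) intent from the question):

__global__ void newKernel(double * dev_array, double * val)
{
    int x = blockIdx.x;
    int idx = val - dev_array;   // position of the first element with value <= 0
    if (x <= idx)
    {
        // process data here...
    }
}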