CUDA random number generating - cuda

I would like to generate random numbers in my __device__ function, and keep it in my int Board[500] , I found some examples , but they used some type named curandState. I only need a function like a rand() in C++.

here is my code , I have N3[40000] array in my device memory , I generate some random numbers in my kernel working for one thread (I mean this "kernel <<<1,1>>> ... ") , then I copy it to my N2[40000] from CPU, and print it ,so here is the code
#include <iostream>
#include <Cuda.h>
#include<curand.h>
#include<curand_kernel.h>
int n = 200;
using namespace std;
__device__ float generate( curandState* globalState, int ind )
{
//int ind = threadIdx.x;
curandState localState = globalState[ind];
float RANDOM = curand_uniform( &localState );
globalState[ind] = localState;
return RANDOM;
}
__global__ void setup_kernel ( curandState * state, unsigned long seed )
{
int id = threadIdx.x;
curand_init ( seed, id, 0, &state[id] );
}
__global__ void kernel(float* N, curandState* globalState, int n)
{
// generate random numbers
for(int i=0;i<40000;i++)
{
int k = generate(globalState, i) * 100000;
while(k > n*n-1)
{
k-=(n*n-1);
}
N[i] = k;
}
}
int main()
{
int N=40000;
curandState* devStates;
cudaMalloc ( &devStates, N*sizeof( curandState ) );
// setup seeds
setup_kernel <<< 1, N >>> ( devStates,unsigned(time(NULL)) );
float N2[40000];
float* N3;
cudaMalloc((void**) &N3, sizeof(float)*N);
kernel<<<1,1>>> (N3, devStates, n);
cudaMemcpy(N2, N3, sizeof(float)*N, cudaMemcpyDeviceToHost);
for(int i=0;i<N;i++)
{
cout<<N2[i]<<endl;
}
return 0;
}

You may use curand library to generate random numbers in device memory and then run your kernel without even having to copy those values to the host.

Related

Atomic Operation failed in CUDA

As the compute ability is 2.1, the atomicAdd and atomicMax operations do not support double precision, then I define both functions based on some answers on stack overflow.
It is strange that the atomicAdd function works well but the atomicMax doesn't work, here is my code.
The test of my code is to generate random number on each block, and then sum the random numbers on each block, we have block sum, I want to test the atomicAdd and atomicMax on the block sum.
#include <iostream>
#include <curand.h>
#include <curand_kernel.h>
#include <stdio.h>
#include <stdlib.h>
#define num_of_blocks 2
#define threads_per_block 2
#define tot_threads 4
__device__ double gsum[num_of_blocks];
__device__ double dev_sum;
__device__ double dev_max;
// set seed for random number generator
__global__ void initcuRand(curandState* globalState, unsigned long seed){
int idx = threadIdx.x + blockIdx.x * blockDim.x;
curand_init(seed, idx, 0, &globalState[idx]);
}
// atomiMax for double
__device__ double atomicMax_d(double* address, double val)
{
unsigned long long int* address_as_i = (unsigned long long int*)address;
unsigned long long int old = *address_as_i, assumed;
do {
assumed = old;
old = ::atomicCAS(address_as_i, assumed, __double_as_longlong(::fmax(val, __longlong_as_double(assumed))));
} while (assumed != old);
return __longlong_as_double(old);
}
// atomicAdd for double
__device__ double atomicAdd_d(double* address, double val)
{
unsigned long long int* address_as_ull = (unsigned long long int*)address;
unsigned long long int old = *address_as_ull, assumed;
do{
assumed = old;
old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
}while(assumed != old);
return __longlong_as_double(old);
}
__global__ void kernel(curandState *globalState){
// global id
int gidx = threadIdx.x + blockIdx.x * blockDim.x;
// local id
int lidx = threadIdx.x;
// creat shared memory to store seeds
__shared__ curandState localState[tot_threads];
__shared__ double srandnum[threads_per_block];
// copy global seed to local
localState[lidx] = globalState[gidx];
//synchronize the local threads writing to the local memory cache
__syncthreads();
// generate random number from normal distribution in shared memory
srandnum[lidx] = curand_normal(&localState[lidx]);
__syncthreads();
if(lidx == 0){srandnum[lidx] += srandnum[lidx + 1];} // sum of each block
if(lidx == 0){gsum[blockIdx.x] = srandnum[lidx];} // copy the sums back to global memory
__threadfence();
if( gidx < num_of_blocks){
atomicAdd_d(&dev_sum, gsum[gidx]);
}
if( gidx < num_of_blocks){
atomicMax_d(&dev_max, gsum[gidx]);
}
if( gidx == 0){
printf("Sum is: %lf\n", dev_sum);
}
if( gidx == 1){
printf("Max is: %lf\n", dev_max);
}
}
int main(){
// set seed on device
curandState *globalState;
cudaMalloc((void**)&globalState, tot_threads*sizeof(curandState));
initcuRand<<<num_of_blocks, threads_per_block>>>(globalState, 1);
// launch kernel
kernel<<<num_of_blocks, threads_per_block>>>(globalState);
double randnum[num_of_blocks];
cudaMemcpyFromSymbol(randnum, gsum, num_of_blocks*sizeof(double), 0, cudaMemcpyDeviceToHost);
std::cout << "Sum of each block:\n";
for (int i = 0; i < num_of_blocks; ++i){
std::cout << randnum[i] << std::endl;
}
cudaFree(globalState);
return 0;
}
The result I get is
Sum is: -0.898329
Max is: 0.000000
Sum of each block:
-0.0152994
-0.88303
From the result, I know that the atomicAdd function works but the atomicMax function doesn't work, I have no idea of this. Thanks beforehand.
You don't ever initialize dev_max or dev_sum. You can't sensibly do these types of atomic operations on them if they don't start with a known value.
Try something like this instead:
__device__ double dev_sum = 0.0;
__device__ double dev_max = -1e99;
and I think you'll be happier with the results.

Can't get matrix*vector multiplication to go faster in CUDA than in CPU

#include <iostream>
#include <assert.h>
#include <sys/time.h>
#define BLOCK_SIZE 32 // CUDA block size
__device__ inline int getValFromMatrix(int* matrix, int row, int col,int matSize) {
if (row<matSize && col<matSize) {return matrix[row*matSize + col];}
return 0;
}
__device__ inline int getValFromVector(int* vector, int row, int matSize) {
if (row<matSize) {return vector[row];}
return 0;
}
__global__ void matVecMultCUDAKernel(int* aOnGPU, int* bOnGPU, int* cOnGPU, int matSize) {
__shared__ int aRowShared[BLOCK_SIZE];
__shared__ int bShared[BLOCK_SIZE];
__shared__ int myRow;
__shared__ double rowSum;
int myIndexInBlock = threadIdx.x;
myRow = blockIdx.x;
rowSum = 0;
for (int m = 0; m < (matSize / BLOCK_SIZE + 1);m++) {
aRowShared[myIndexInBlock] = getValFromMatrix(aOnGPU,myRow,m*BLOCK_SIZE+myIndexInBlock,matSize);
bShared[myIndexInBlock] = getValFromVector(bOnGPU,m*BLOCK_SIZE+myIndexInBlock,matSize);
__syncthreads(); // Sync threads to make sure all fields have been written by all threads in the block to cShared and xShared
if (myIndexInBlock==0) {
for (int k=0;k<BLOCK_SIZE;k++) {
rowSum += aRowShared[k] * bShared[k];
}
}
}
if (myIndexInBlock==0) {cOnGPU[myRow] = rowSum;}
}
static inline void cudaCheckReturn(cudaError_t result) {
if (result != cudaSuccess) {
std::cerr <<"CUDA Runtime Error: " << cudaGetErrorString(result) << std::endl;
assert(result == cudaSuccess);
}
}
static void matVecMultCUDA(int* aOnGPU,int* bOnGPU, int* cOnGPU, int* c, int sizeOfc, int matSize) {
matVecMultCUDAKernel<<<matSize,BLOCK_SIZE>>>(aOnGPU,bOnGPU,cOnGPU,matSize); // Launch 1 block per row
cudaCheckReturn(cudaMemcpy(c,cOnGPU,sizeOfc,cudaMemcpyDeviceToHost));
}
static void matVecMult(int** A,int* b, int* c, int matSize) {
// Sequential implementation:
for (int i=0;i<matSize;i++) {
c[i]=0;
for (int j=0;j<matSize;j++) {
c[i]+=(A[i][j] * b[j]);
}
}
}
int main() {
int matSize = 1000;
int** A,* b,* c;
int* aOnGPU,* bOnGPU,* cOnGPU;
A = new int*[matSize];
for (int i = 0; i < matSize;i++) {A[i] = new int[matSize]();}
b = new int[matSize]();
c = new int[matSize]();
int aSizeOnGPU = matSize * matSize * sizeof(int), bcSizeOnGPU = matSize * sizeof(int);
cudaCheckReturn(cudaMalloc(&aOnGPU,aSizeOnGPU)); // cudaMallocPitch?
cudaCheckReturn(cudaMalloc(&bOnGPU,bcSizeOnGPU));
cudaCheckReturn(cudaMalloc(&cOnGPU,bcSizeOnGPU));
srand(time(NULL));
for (int i=0;i<matSize;i++) {
b[i] = rand()%100;
for (int j=0;j<matSize;j++) {
A[i][j] = rand()%100;
}
}
for (int i=0;i<matSize;i++) {cudaCheckReturn(cudaMemcpy((aOnGPU+i*matSize),A[i],bcSizeOnGPU,cudaMemcpyHostToDevice));}
cudaCheckReturn(cudaMemcpy(bOnGPU,b,bcSizeOnGPU,cudaMemcpyHostToDevice));
int iters=1;
timeval start,end;
// Sequential run:
gettimeofday(&start,NULL);
for (int i=0;i<iters;i++) {matVecMult(A,b,c,matSize);}
gettimeofday(&end,NULL);
std::cout << (end.tv_sec*1000000 + end.tv_usec) - (start.tv_sec*1000000 + start.tv_usec) << std::endl;
// CUDA run:
gettimeofday(&start,NULL);
for (int i=0;i<iters;i++) {matVecMultCUDA(aOnGPU,bOnGPU,cOnGPU,c,bcSizeOnGPU,matSize);}
gettimeofday(&end,NULL);
std::cout << (end.tv_sec*1000000 + end.tv_usec) - (start.tv_sec*1000000 + start.tv_usec) << std::endl;
cudaCheckReturn(cudaFree(aOnGPU));
cudaCheckReturn(cudaFree(bOnGPU));
cudaCheckReturn(cudaFree(cOnGPU));
for (int i = 0; i < matSize; ++i) {
delete[] A[i];
}
delete[] A;
delete[] b;
delete[] c;
}
Gives:
267171
580253
I've followed the guide on http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory, on how to do a matrix multiplication. I used shared memory for both the matrix (A) and the vector (B), but no matter what matrix size (100*100-20000*20000) or block size (32-1024) i choose, the sequential implementation always outperforms the CUDA implementation in terms of speed, it is about twice as fast.
Since I'm using matrix*vector multiplication, the shared arrays and blocks are handled a bit different; I'm using one block per row of the matrix instead of a 2D block over a part of the matrix.
Is my implementation wrong, or is simply CUDA not faster than the CPU?
First item: You perform checks on boundaries in the cuda implementation where you don't on CPU. Branching are really expensive on a GPU.
Second : You count the cudamemcpy in the cuda performance. It's very uncommon to perform only one multiplication before having to get the result back to cpu.
Usually (on CG for example), you perform several hundreds of multiplication on GPU before having to copy back.
Third: Dont try to implement that (except for educational purposes) and use vendor libraries (like CUBLAS, which ships with every CUDA release), which are extremely hard to outperform.

Using of shared memory not showing desired result

I am trying to learn the usuage of Shared memory with a view to increase the performance . here I am trying to copy the global memory to shared memory. but when I have single block(256 thread) it gives the result and with more than 1 block it gives random result.
#include <cuda.h>
#include <stdio.h>
__global__ void staticReverse(int *d, int n)
{
__shared__ int s[400];
int t = blockIdx.x * blockDim.x + threadIdx.x;
d[t] = d[t]*d[t];
s[t] =d[t];
__syncthreads();
d[t] = s[t];
}
__global__ void dynamicReverse(int *d, int n)
{
extern __shared__ int s[];
int t = threadIdx.x;
s[t] = d[t]*d[t];
__syncthreads();
d[t] = s[t];
}
int main(void)
{
const int n = 400;
int a[n], d[n];
for (int i = 0; i < n; i++)
{
a[i] = i;
}
int *d_d;
cudaMalloc(&d_d, n * sizeof(int));
// run version with static shared memory
int block_size = 256;
int n_blocks = n/block_size + (n%block_size == 0 ? 0:1);
cudaMemcpy(d_d, a, n*sizeof(int), cudaMemcpyHostToDevice);
staticReverse<<<n_blocks,block_size>>>(d_d, n);
cudaMemcpy(d, d_d, n*sizeof(int), cudaMemcpyDeviceToHost);
for (int i = 0; i < n; i++)
{
printf("%d\n",d[i]);
}
}
1)what does the third argument in dynamicReverse<<<n_blocks,block_size,n*sizeof(int)>>>(d_d, n);
kernal call does? does it allocates shared memory for entire block or thread.
2) if I required more than 64kb of shared memory per multiprocessor in compute capability 5.0 what I need to do?
In your static shared memory allocation code you had three issues:
The size of the statically allocated shared memory should comply with the block size, not with the size of the input array,
You should use local thread index for indexing shared memory, instead of the global one;
You had no array out of bounds checking.
The dynamic shared memory allocation code had the same issues #2 and #3 as above, plus the fact that you were indexing global memory with local thread index, instead of global. You can use the third argument to specify the size of the shared memory to be allocated. In particular, you should allocate an amount of 256 ints, i.e., related to the block size, similarly to the static shared memory allocation case.
Here is the complete working code:
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/***********************************/
/* SHARED MEMORY STATIC ALLOCATION */
/***********************************/
#include <cuda.h>
#include <stdio.h>
__global__ void staticReverse(int *d, int n)
{
__shared__ int s[256];
int t = blockIdx.x * blockDim.x + threadIdx.x;
if (t < n) {
d[t] = d[t]*d[t];
s[threadIdx.x] =d[t];
__syncthreads();
d[t] = s[threadIdx.x];
}
}
/************************************/
/* SHARED MEMORY DYNAMIC ALLOCATION */
/************************************/
__global__ void dynamicReverse(int *d, int n)
{
extern __shared__ int s[];
int t = blockIdx.x * blockDim.x + threadIdx.x;
if (t < n) {
s[threadIdx.x] = d[t]*d[t];
__syncthreads();
d[t] = s[threadIdx.x];
}
}
int main(void)
{
const int n = 400;
int* a = (int*) malloc(n*sizeof(int));
int* d = (int*) malloc(n*sizeof(int));
for (int i = 0; i < n; i++) { a[i] = i; }
int *d_d; gpuErrchk(cudaMalloc(&d_d, n * sizeof(int)));
// run version with static shared memory
int block_size = 256;
int n_blocks = n/block_size + (n%block_size == 0 ? 0:1);
gpuErrchk(cudaMemcpy(d_d, a, n*sizeof(int), cudaMemcpyHostToDevice));
//staticReverse<<<n_blocks,block_size>>>(d_d, n);
dynamicReverse<<<n_blocks,block_size,256*sizeof(int)>>>(d_d, n);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaMemcpy(d, d_d, n*sizeof(int), cudaMemcpyDeviceToHost));
for (int i = 0; i < n; i++) { printf("%d\n",d[i]); }
}

Simpson's method to integrate real valued functions with CUDA

I'm trying to code integration by Simpson's method in CUDA.
This is the formula for Simpson's rule
where x_k = a + k*h.
Here's my code
__device__ void initThreadBounds(int *n_start, int *n_end, int n,
int totalBlocks, int blockWidth)
{
int threadId = blockWidth * blockIdx.x + threadIdx.x;
int nextThreadId = threadId + 1;
int threads = blockWidth * totalBlocks;
*n_start = (threadId * n)/ threads;
*n_end = (nextThreadId * n)/ threads;
}
__device__ float reg_func (float x)
{
return x;
}
typedef float (*p_func) (float);
__device__ p_func integrale_f = reg_func;
__device__ void integralSimpsonMethod(int totalBlocks, int totalThreads,
double a, double b, int n, float p_function(float), float* result)
{
*result = 0;
float h = (b - a)/n;
//*result = p_function(a)+p_function(a + h * n);
//parallel
int idx_start;
int idx_end;
initThreadBounds(&idx_start, &idx_end, n-1, totalBlocks, totalThreads);
//parallel_ends
for (int i = idx_start; i < idx_end; i+=2) {
*result += ( p_function(a + h*(i-1)) +
4 * p_function(a + h*(i)) +
p_function(a + h*(i+1)) ) * h/3;
}
}
__global__ void integralSimpson(int totalBlocks, int totalThreads, float* result)
{
float res = 0;
integralSimpsonMethod(totalBlocks, totalThreads, 0, 10, 1000, integrale_f, &res);
result[(blockIdx.x*totalThreads + threadIdx.x)] = res;
//printf ("Simpson method\n");
}
__host__ void inttest()
{
const int blocksNum = 32;
const int threadNum = 32;
float *device_resultf;
float host_resultf[threadNum*blocksNum]={0};
cudaMalloc((void**) &device_resultf, sizeof(float)*threadNum*blocksNum);
integralSimpson<<<blocksNum, threadNum>>>(blocksNum, threadNum, device_resultf);
cudaThreadSynchronize();
cudaMemcpy(host_resultf, device_resultf, sizeof(float) *threadNum*blocksNum,
cudaMemcpyDeviceToHost);
float sum = 0;
for (int i = 0; i != blocksNum*threadNum; ++i) {
sum += host_resultf[i];
// printf ("result in %i cell = %f \n", i, host_resultf[i]);
}
printf ("sum = %f \n", sum);
cudaFree(device_resultf);
}
int main(int argc, char* argv[])
{
inttest();
int i;
scanf ("%d",&i);
}
The problem is: it works wrong when n is lower than 100000. For an integral from 0 to 10, the result is ~99, but when n = 100000 or larger it works fine and the result is ~50.
What's wrong, guys?
The basic problem here is that you don't understand your own algorithm.
Your integralSimpsonMethod() function is designed such that each thread is sampling at least 3 quadrature points per sub-interval in the integral domain. Therefore, if you choose n so that it is less than four times the number of threads in the kernel call, it is inevitable that each sub interval will overlap and the resulting integral will be incorrect. You need to make sure that the code checks and scales the thread count or n so that they don't produce overlap when the integral is computed.
If you are doing this for anything other than self-edification, then I recommend you look up the composite version of Simpson's rule. This is much better suited to parallel implementation and will be considerably more performant if implemented correctly.
I would propose an approach to Simpson's integration by using CUDA Thrust. You basically need five steps:
Generate the Simpson's quadrature weights;
Generate the function sampling points;
Generate the function values;
Calculate the elementwise product between the quadrature weights and the function values;
Sum the above products.
Step #1 requires creating an array with elements repeated many times, namely, 1 4 2 4 2 4 ... 1 for the Simpson's case. This can be accomplished by borrowing Robert Crovella's approach in cuda thrust library repeat vector multiple times.
Step #2 can be accomplished by using couting_iterators and borrowing talonmies approach in Purpose and usage of counting_iterators in CUDA Thrust library.
Step #3 is an application of thrust::transform.
Steps #4 and #5 can be accomplished together by thrust::inner_product.
This approach can be exploited also for use when other quadrature integration rules are of interest.
Here is the code
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/inner_product.h>
#include <thrust/functional.h>
#include <thrust/fill.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
// for printing
#include <thrust/copy.h>
#include <ostream>
#define STRIDE 2
#define N 100
#define pi_f 3.14159265358979f // Greek pi in single precision
struct sin_functor
{
__host__ __device__
float operator()(float x) const
{
return sin(2.f*pi_f*x);
}
};
template <typename Iterator>
class strided_range
{
public:
typedef typename thrust::iterator_difference<Iterator>::type difference_type;
struct stride_functor : public thrust::unary_function<difference_type,difference_type>
{
difference_type stride;
stride_functor(difference_type stride)
: stride(stride) {}
__host__ __device__
difference_type operator()(const difference_type& i) const
{
return stride * i;
}
};
typedef typename thrust::counting_iterator<difference_type> CountingIterator;
typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
typedef typename thrust::permutation_iterator<Iterator,TransformIterator> PermutationIterator;
// type of the strided_range iterator
typedef PermutationIterator iterator;
// construct strided_range for the range [first,last)
strided_range(Iterator first, Iterator last, difference_type stride)
: first(first), last(last), stride(stride) {}
iterator begin(void) const
{
return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride)));
}
iterator end(void) const
{
return begin() + ((last - first) + (stride - 1)) / stride;
}
protected:
Iterator first;
Iterator last;
difference_type stride;
};
int main(void)
{
// --- Generate the integration coefficients
thrust::host_vector<float> h_coefficients(STRIDE);
h_coefficients[0] = 4.f;
h_coefficients[1] = 2.f;
thrust::device_vector<float> d_coefficients(N);
typedef thrust::device_vector<float>::iterator Iterator;
strided_range<Iterator> pos1(d_coefficients.begin()+1, d_coefficients.end()-2, STRIDE);
strided_range<Iterator> pos2(d_coefficients.begin()+2, d_coefficients.end()-1, STRIDE);
thrust::fill(pos1.begin(), pos1.end(), h_coefficients[0]);
thrust::fill(pos2.begin(), pos2.end(), h_coefficients[1]);
d_coefficients[0] = 1.f;
d_coefficients[N-1] = 1.f;
// print the generated d_coefficients
std::cout << "d_coefficients: ";
thrust::copy(d_coefficients.begin(), d_coefficients.end(), std::ostream_iterator<float>(std::cout, " ")); std::cout << std::endl;
// --- Generate sampling points
float a = 0.f;
float b = .5f;
float Dx = (b-a)/(float)(N-1);
thrust::device_vector<float> d_x(N);
thrust::transform(thrust::make_counting_iterator(a/Dx),
thrust::make_counting_iterator((b+1.f)/Dx),
thrust::make_constant_iterator(Dx),
d_x.begin(),
thrust::multiplies<float>());
// --- Calculate function values
thrust::device_vector<float> d_y(N);
thrust::transform(d_x.begin(), d_x.end(), d_y.begin(), sin_functor());
// --- Calculate integral
float integral = (Dx/3.f) * thrust::inner_product(d_y.begin(), d_y.begin() + N, d_coefficients.begin(), 0.0f);
printf("The integral is = %f\n", integral);
getchar();
return 0;
}

Adding values on GPU

i have a class called Product.
Each product has a value and i want to add these values on GPU. I filled my array on host side
int * h_A, * d_A;
h_A = (int*) malloc(enterNum * sizeof(int));
cudaMalloc((void **) &d_A, enterNum * sizeof(int));
Product p("Product", price);
h_A[i] = p.getValue();
while (i < enterNum) {
i++;
cout << "Enter product name:";
cin >> desc;
cout << "Enter product price:";
cin >> price;
Product p("Product", price);
h_A[i] = p.getValue();
}
cudaMemcpy(d_A, h_A, enterNum, cudaMemcpyHostToDevice);
priceSum<<<enterNum, 1024>>>(d_A,enterNum,result);
int result2 = 0;
cudaMemcpy(result, result2, enterNum, cudaMemcpyDeviceToHost);
here cudaMemcpy function gives error because i dont use pointer. What can i do here? I dont need to use pointer here isn't it?
this is my summation function:
__global__ void priceSum(int *dA, int count, int result) {
int tid = blockIdx.x;
if (tid < count){
result+= dA[tid];
}
}
full code:
using namespace std;
#include "cuda_runtime.h"
#include <stdio.h>
#include <string.h>
#include <iostream>
#include <stdlib.h>
class Product {
private:
char * description;
int productCode;
int value;
static int lastCode;
public:
Product(char* descriptionP, int valueP) {
productCode = ++lastCode;
value = valueP;
description = new char[strlen(descriptionP) + 1];
strcpy(description, descriptionP);
}
Product(Product& other) {
productCode = ++lastCode;
description = new char[strlen(other.description) + 1];
strcpy(description, other.description);
}
~Product() {
delete[] description;
}
char* getDescription() const {
return description;
}
void setDescription(char* description) {
this->description = description;
}
int getValue() const {
return value;
}
void setValue(int value) {
this->value = value;
}
};
int Product::lastCode = 1000;
__global__ void priceSum(int *dA, int count, int * result) {
int tid = blockIdx.x;
if (tid < count)
result+= dA[tid];
}
int main(void) {
int enterNum, price, * result = 0;
string desc;
const char * desc2;
cout << "How many products do you want to enter?";
cin >> enterNum;
int * h_A, * d_A;
h_A = (int*) malloc(enterNum * sizeof(int));
cudaMalloc((void **) &d_A, enterNum * sizeof(int));
int i = 0;
while (i < enterNum) {
cout << "Enter product name:";
cin >> desc;
cout << "Enter product price:";
cin >> price;
Product p("Product", price);
h_A[i] = p.getValue();
i++;
}
cudaMemcpy(d_A, h_A, enterNum * sizeof(int), cudaMemcpyHostToDevice);
priceSum<<<enterNum, 1>>>(d_A,enterNum,result);
int result2 = 0;
cudaMemcpy(&result2, result, enterNum, cudaMemcpyDeviceToHost);
cout << result2;
return 0;
}
You should show the definition of result in your host code, but I assume it is:
int result;
based on how you are passing it to your priceSum kernel.
You have more than 1 problem here.
In your priceSum kernel, you are summing the values in dA[] and storing the answer in result. But you have passed the variable result to the kernel by value instead of by reference so the value you are modifying is local to the function, and will not show up anywhere else. When a function in C needs to modify a variable that is passed to it via the parameter list, and the modified variable is to show up in the function calling context, it's necessary to pass that parameter by reference (i.e. using a pointer) rather than by value. Note this is based on the C programming language and is not specific to CUDA. So you should rewrite your kernel definition as:
__global__ void priceSum(int *dA, int count, int *result) {
Regarding your cudaMemcpy call, there are several issues that need to be cleaned up. First, we need the storage for result to be properly created using cudaMalloc (before the kernel is called, because the kernel will store something there.) Next, we need to fix the parameter list of the cudaMemcpy call itself. So your host code should be rewritten as:
cudaMemcpy(d_A, h_A, enterNum, cudaMemcpyHostToDevice);
int *result;
cudaMalloc((void **)&result, sizeof(int));
priceSum<<<enterNum, 1024>>>(d_A,enterNum,result);
int result2 = 0;
cudaMemcpy(&result2, result, sizeof(int), cudaMemcpyDeviceToHost);
There appear to be other problems with your code, around the grouping of data for threads and blocks. But you haven't shown enough of your program for me to make sense of it. So let me point out that your code shows only a single value for result (and result2), yet the way your kernel is written, each thread will add its value of dA[tid] to result. You can't have a bunch of threads all updating a single value in global memory with no control mechanism, and expect to get a sensible result. Problems like this are usually best handled with a classical parallel reduction algorithm, but for the sake of simplicity, to try and get something working, you can use atomics:
atomicAdd(result, dA[tid]);
Sorry, but your kernel just makes no sense at all. You are using blockIdx.x as your tid variable, but let's note that blockIdx.x is a number that is the same for every thread in a particular block. So then going on to have every thread add dA[tid] to result in this fashion just doesn't make sense. I believe it will make more sense if you change your kernel invocation to:
priceSum<<<enterNum, 1>>>(d_A,enterNum,result);