scan for large arrays - cuda

I want to use the scan function in the Thrust library for large arrays, but I get a core dump for arrays larger than 32768. I was wondering if there is another option other than Thrust's scan.
Here is a snippet of my code:
#include <thrust/scan.h>
#include <stdio.h>
int main()
{
int *x;
int n = 65536;
x = (int *) malloc(n);
for (int i=0;i<n;i++)
x[i]=i;
thrust::inclusive_scan(x,x+n,x);
for (int i=0;i<n;i++)
printf(" %d ", x[i]);
printf("\n");
}

This:
x = (int *) malloc(n);
allocates n bytes of storage. You want storage for n integers:
x = (int *) malloc(n*sizeof(int));
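Note also that with raw host pointers the scan runs on the host. If the intent is to run it on the GPU, a minimal sketch (one option among several) using thrust::device_vector would be:

#include <thrust/scan.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/sequence.h>
#include <cstdio>

int main()
{
    const int n = 65536;
    thrust::device_vector<int> x(n);                        // storage for n ints on the device
    thrust::sequence(x.begin(), x.end());                   // x[i] = i
    thrust::inclusive_scan(x.begin(), x.end(), x.begin());  // in-place scan, executed on the GPU
    thrust::host_vector<int> h = x;                         // copy the result back to the host
    printf("last element: %d\n", h[n - 1]);
    return 0;
}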


Cublas - Column/Row wise operations

I am looking for a way to perform operations over columns.
I have an MxN matrix, and I want to apply a CUBLAS function (for example nrm2) to each column.
The result I expect to get is M x 1.
How can I do that?
CUBLAS has no batched Level 1 routines, so there is no direct way to compute the column or row norms in a single call. You can do it by calling nrm2 many times in a loop over all the rows or columns of the matrix, for example:
#include <cublas_v2.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/transform.h>
#include <thrust/random.h>
#include <thrust/device_vector.h>
#include <iostream>
struct prg
{
float a, b;
__host__ __device__
prg(float _a=0.f, float _b=1.f) : a(_a), b(_b) {};
__host__ __device__
float operator()(const unsigned int n) const
{
thrust::default_random_engine rng;
thrust::uniform_real_distribution<float> dist(a, b);
rng.discard(n);
return dist(rng);
}
};
int main(void)
{
const int M = 1024, N = M;
const int num = N * M;
thrust::device_vector<float> matrix(num);
thrust::device_vector<float> vector(N, -1.0f);
thrust::counting_iterator<unsigned int> index_sequence_begin(0);
thrust::transform(index_sequence_begin,
index_sequence_begin + num,
matrix.begin(),
prg(1.f,2.f));
float* m_d = thrust::raw_pointer_cast(matrix.data());
float* v_d = thrust::raw_pointer_cast(vector.data());
cudaStream_t stream;
cudaStreamCreate(&stream);
cublasHandle_t handle;
cublasCreate(&handle);
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
cublasSetStream(handle, stream);
for(int col=0; col < N; col++) {
cublasSnrm2(handle, M, m_d + col*M, 1, v_d + col);
}
cudaDeviceSynchronize();
for(auto x : vector) {
float normval = x;
std::cout << normval << std::endl;
}
return 0;
}
Unless you have very large rows or columns, there is little scope to exploit streams to run simultaneous kernels and reduce the overall runtime, because each nrm2 call will be too short. So there is a lot of latency in running lots of individual kernels, which will negatively affect performance.
A much better alternative would be to write your own kernel to do this.
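As a rough illustration of that route, here is a minimal sketch (hypothetical, untuned) that assigns one thread per column and assumes the same column-major layout as the example above:

// One thread per column; each thread reduces its own column serially.
// Element (row, col) is assumed to be at matrix[col * M + row] (column-major).
__global__ void colNorms(const float* matrix, float* norms, int M, int N)
{
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (col < N) {
        float sum = 0.0f;
        for (int row = 0; row < M; row++) {
            float v = matrix[col * M + row];
            sum += v * v;
        }
        norms[col] = sqrtf(sum);
    }
}

// Hypothetical launch: colNorms<<<(N + 255) / 256, 256>>>(m_d, v_d, M, N);

Note that this access pattern is not coalesced for column-major data; a block-per-column reduction in shared memory would normally be faster, but the sketch keeps the idea minimal.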

Dynamic 2D array using double pointer in CUDA [duplicate]

I'm new to CUDA. I want to add two 2D arrays into a third array.
I use the following code:
cudaMallocPitch((void**)&device_a, &pitch, 2*sizeof(int),2);
cudaMallocPitch((void**)&device_b, &pitch, 2*sizeof(int),2);
cudaMallocPitch((void**)&device_c, &pitch, 2*sizeof(int),2);
Now my problem is that I don't want to use these arrays as flattened 2D arrays.
In my kernel code all I want to do is use two for loops and put the result in the third array, like this:
__global__ void add(int *dev_a, int *dev_b, int *dev_c)
{
    for (int i = 0; i < 2; i++)
    {
        for (int j = 0; j < 2; j++)
        {
            dev_c[i][j] = dev_a[i][j] + dev_b[i][j];
        }
    }
}
How can I do this in CUDA?
Please tell me how to use a 2D array in this way.
What should the kernel call look like for a 2D array?
If possible, please explain using code samples.
The short answer is, you can't. The cudaMallocPitch() function does exactly what its name implies: it allocates pitched linear memory, where the pitch is chosen to be optimal for the GPU memory controller and texture hardware.
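For completeness, pitched memory is still indexed with flat pointer arithmetic inside a kernel. A sketch of the standard addressing pattern (the pitch is in bytes, as returned by cudaMallocPitch) looks like this:

__global__ void addPitched(int* a, int* b, int* c, size_t pitch, int rows, int cols)
{
    for (int i = 0; i < rows; i++) {
        // The pitch is a byte count, so row starts are computed on a char* base.
        int* rowA = (int*)((char*)a + i * pitch);
        int* rowB = (int*)((char*)b + i * pitch);
        int* rowC = (int*)((char*)c + i * pitch);
        for (int j = 0; j < cols; j++) {
            rowC[j] = rowA[j] + rowB[j];
        }
    }
}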
If you wanted to use arrays of pointers in the kernel, the kernel code would have to look like this:
__global__ void add(int *dev_a[], int *dev_b[], int *dev_c[])
{
    for (int i = 0; i < 2; i++) {
        for (int j = 0; j < 2; j++) {
            dev_c[i][j] = dev_a[i][j] + dev_b[i][j];
        }
    }
}
and then you would need nested cudaMalloc calls on the host side to construct the array of pointers and copy it to device memory. For your rather trivial 2x2 example, the code to allocate a single array would look like this:
int ** h_a = (int **)malloc(2 * sizeof(int *));
cudaMalloc((void**)&h_a[0], 2*sizeof(int));
cudaMalloc((void**)&h_a[1], 2*sizeof(int));
int **d_a;
cudaMalloc((void ***)&d_a, 2 * sizeof(int *));
cudaMemcpy(d_a, h_a, 2*sizeof(int *), cudaMemcpyHostToDevice);
That would leave the allocated device array of pointers in d_a, and you would pass that to your kernel.
For code complexity and performance reasons, you really don't want to do that: using arrays of pointers in CUDA code is both harder and slower than the alternative of using linear memory.
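For comparison, the linear-memory version of the same kernel is just flat index arithmetic (the second answer below develops this approach with one thread per element):

__global__ void addLinear(const int* a, const int* b, int* c, int rows, int cols)
{
    // Row i, column j of a rows x cols matrix stored contiguously in row-major order.
    for (int i = 0; i < rows; i++)
        for (int j = 0; j < cols; j++)
            c[i * cols + j] = a[i * cols + j] + b[i * cols + j];
}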
To show what a folly it is to use arrays of pointers in CUDA, here is a complete working example of your sample problem which combines the two ideas above:
#include <cstdio>
__global__ void add(int * dev_a[], int * dev_b[], int * dev_c[])
{
for(int i=0;i<2;i++)
{
for(int j=0;j<2;j++)
{
dev_c[i][j]=dev_a[i][j]+dev_b[i][j];
}
}
}
inline void GPUassert(cudaError_t code, const char * file, int line, bool Abort=true)
{
if (code != 0) {
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code),file,line);
if (Abort) exit(code);
}
}
#define GPUerrchk(ans) { GPUassert((ans), __FILE__, __LINE__); }
int main(void)
{
const int aa[2][2]={{1,2},{3,4}};
const int bb[2][2]={{5,6},{7,8}};
int cc[2][2];
int ** h_a = (int **)malloc(2 * sizeof(int *));
for(int i=0; i<2;i++){
GPUerrchk(cudaMalloc((void**)&h_a[i], 2*sizeof(int)));
GPUerrchk(cudaMemcpy(h_a[i], &aa[i][0], 2*sizeof(int), cudaMemcpyHostToDevice));
}
int **d_a;
GPUerrchk(cudaMalloc((void ***)&d_a, 2 * sizeof(int *)));
GPUerrchk(cudaMemcpy(d_a, h_a, 2*sizeof(int *), cudaMemcpyHostToDevice));
int ** h_b = (int **)malloc(2 * sizeof(int *));
for(int i=0; i<2;i++){
GPUerrchk(cudaMalloc((void**)&h_b[i], 2*sizeof(int)));
GPUerrchk(cudaMemcpy(h_b[i], &bb[i][0], 2*sizeof(int), cudaMemcpyHostToDevice));
}
int ** d_b;
GPUerrchk(cudaMalloc((void ***)&d_b, 2 * sizeof(int *)));
GPUerrchk(cudaMemcpy(d_b, h_b, 2*sizeof(int *), cudaMemcpyHostToDevice));
int ** h_c = (int **)malloc(2 * sizeof(int *));
for(int i=0; i<2;i++){
GPUerrchk(cudaMalloc((void**)&h_c[i], 2*sizeof(int)));
}
int ** d_c;
GPUerrchk(cudaMalloc((void ***)&d_c, 2 * sizeof(int *)));
GPUerrchk(cudaMemcpy(d_c, h_c, 2*sizeof(int *), cudaMemcpyHostToDevice));
add<<<1,1>>>(d_a,d_b,d_c);
GPUerrchk(cudaPeekAtLastError());
for(int i=0; i<2;i++){
GPUerrchk(cudaMemcpy(&cc[i][0], h_c[i], 2*sizeof(int), cudaMemcpyDeviceToHost));
}
for(int i=0;i<2;i++) {
for(int j=0;j<2;j++) {
printf("(%d,%d):%d\n",i,j,cc[i][j]);
}
}
return cudaThreadExit();
}
I recommend you study it until you understand what it does, and why it is such a poor idea compared to using linear memory.
You don't need to use for loops inside the device. Try this code.
#include <stdio.h>
#include <cuda.h>
#include <stdlib.h>
#include <time.h>
#define N 800
__global__ void matrixAdd(float* A, float* B, float* C){
int i = threadIdx.x;
int j = blockIdx.x;
C[N*j+i] = A[N*j+i] + B[N*j+i];
}
int main (void) {
clock_t start = clock();
static float a[N][N], b[N][N], c[N][N]; // static: three 800x800 float arrays would otherwise overflow a typical stack
float *dev_a, *dev_b, *dev_c;
cudaMalloc((void **)&dev_a, N * N * sizeof(float));
cudaMalloc((void **)&dev_b, N * N * sizeof(float));
cudaMalloc((void **)&dev_c, N * N * sizeof(float));
for (int i = 0; i < N; i++){
for (int j = 0; j < N; j++){
a[i][j] = rand() % 10;
b[i][j] = rand() % 10;
}
}
cudaMemcpy(dev_a, a, N * N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, N * N * sizeof(float), cudaMemcpyHostToDevice);
matrixAdd <<<N,N>>> (dev_a, dev_b, dev_c);
cudaMemcpy(c, dev_c, N * N * sizeof(float), cudaMemcpyDeviceToHost);
for (int i = 0; i < N; i++){
for (int j = 0; j < N; j++){
printf("[%d, %d ]= %f + %f = %f\n",i,j, a[i][j], b[i][j], c[i][j]);
}
}
printf("Time elapsed: %f\n", ((double)clock() - start) / CLOCKS_PER_SEC);
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
return 0;
}

CUDA outputs always 0

The printed output is always 0 after executing the kernel function.
After some testing, cudaMemcpy still seems correct, but the kernel does not seem to be working; it cannot get correct data from d_inputs.
Could somebody help explain? Thanks!
#include <cuda_runtime.h>
#include <cuda.h>
#include <stdio.h>
#include <sys/time.h>
#include <math.h>
#define N 32
__global__ void Kernel_double(int niters, int* d_inputs,double* d_outputs)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid<N) {
double val =(double) d_inputs[tid];
/*for (int iter=0; iter < niters; iter++){
val = (sqrt(pow(val,2.0)) + 5.0) - 101.0;
val = (val / 3.0) + 102.0;
val = (val + 1.07) - 103.0;
val = (val / 1.037) + 104.0;
val = (val + 3.00) - 105.0;
val = (val / 0.22) + 106.0;
}*/
val = val + 1.0;
//printf("This is %f\n",val);
d_outputs[tid] = val;
}
}
int main(int argc, char **argv)
{
int niters = 10;
printf("Iterate %d times with GPU 0 or CPU 1: %d\n", niters, cpu);
int inputs[N];
for (int i = 0; i<N; i++){
inputs[i] = i+1;
}
int d_inputs[N];
double d_outputs[N];
double outputs[N];
cudaMalloc( (void**)&d_inputs, N*sizeof(int));
cudaMalloc( (void**)&d_outputs, N*sizeof(double));
printf("test %d \n", inputs[3]);
cudaMemcpy(d_inputs, inputs, N*sizeof(int), cudaMemcpyHostToDevice);
printf("test %d \n", d_inputs[1]);
Kernel_double<<<16,2>>>(niters, d_inputs,d_outputs);
//cudaDeviceSynchronize();
cudaMemcpy(outputs, d_outputs, N*sizeof(double), cudaMemcpyDeviceToHost);
for(int j =0;j<10; j++){
printf("Outputs[%d] is: %f and %f\n",j, d_outputs[j], outputs[j]);
}
cudaFree(d_inputs);
cudaFree(d_outputs);
return EXIT_SUCCESS;
}
1. Any time you are having trouble with a CUDA code, you should use proper cuda error checking and run your code with cuda-memcheck, before asking others for help. Even if you don't understand the error output, it will be useful for others trying to help you. If you had used proper cuda error checking here, you would have been informed that your cudaMemcpy operations are reporting an invalid argument, due to item 3 below. (A minimal error-checking macro is sketched after this list.)
2. Your code will not compile. cpu is not defined anywhere.
3. We don't allocate for, or create, device pointers like this:
int d_inputs[N];
double d_outputs[N];
Those create stack variables (arrays) that the compiler is allowed to treat as if they were constant pointers. Instead you should do it like this:
int *d_inputs;
double *d_outputs;
The compiler understands that these are modifiable pointers (which you will modify later with cudaMalloc).
4. Once you fix the issue in item 3, this will not be legal:
printf("test %d \n", d_inputs[1]);
as this requires dereferencing a device pointer (d_inputs) in host code, which is illegal in CUDA, at least as you have done so here. You have a similar problem in the printf statement later in your code as well (with d_outputs).
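For reference, a minimal error-checking macro in the same spirit as the GPUerrchk macro shown in an earlier answer on this page (a sketch; the name is illustrative):

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Wrap every CUDA runtime call; print file/line and abort on error.
#define cudaCheck(call)                                                     \
    do {                                                                    \
        cudaError_t err = (call);                                           \
        if (err != cudaSuccess) {                                           \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                     \
                    cudaGetErrorString(err), __FILE__, __LINE__);           \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// Example usage (hypothetical):
// cudaCheck(cudaMemcpy(d_inputs, inputs, N*sizeof(int), cudaMemcpyHostToDevice));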
The following code has the above items addressed to some degree, and seems to run correctly for me:
$ cat t44.cu
#include <cuda_runtime.h>
#include <cuda.h>
#include <stdio.h>
#include <sys/time.h>
#include <math.h>
#define N 32
__global__ void Kernel_double(int niters, int* d_inputs,double* d_outputs)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid<N) {
double val =(double) d_inputs[tid];
/*for (int iter=0; iter < niters; iter++){
val = (sqrt(pow(val,2.0)) + 5.0) - 101.0;
val = (val / 3.0) + 102.0;
val = (val + 1.07) - 103.0;
val = (val / 1.037) + 104.0;
val = (val + 3.00) - 105.0;
val = (val / 0.22) + 106.0;
}*/
val = val + 1.0;
//printf("This is %f\n",val);
d_outputs[tid] = val;
}
}
int main(int argc, char **argv)
{
int niters = 10;
int cpu = 0;
printf("Iterate %d times with GPU 0 or CPU 1: %d\n", niters, cpu);
int inputs[N];
for (int i = 0; i<N; i++){
inputs[i] = i+1;
}
int *d_inputs;
double *d_outputs;
double outputs[N];
cudaMalloc( (void**)&d_inputs, N*sizeof(int));
cudaMalloc( (void**)&d_outputs, N*sizeof(double));
printf("test %d \n", inputs[3]);
cudaMemcpy(d_inputs, inputs, N*sizeof(int), cudaMemcpyHostToDevice);
// printf("test %d \n", d_inputs[1]);
Kernel_double<<<16,2>>>(niters, d_inputs,d_outputs);
//cudaDeviceSynchronize();
cudaMemcpy(outputs, d_outputs, N*sizeof(double), cudaMemcpyDeviceToHost);
for(int j =0;j<10; j++){
printf("Outputs[%d] is: %f\n",j, outputs[j]);
}
cudaFree(d_inputs);
cudaFree(d_outputs);
return EXIT_SUCCESS;
}
$ nvcc -lineinfo -arch=sm_61 -o t44 t44.cu
$ cuda-memcheck ./t44
========= CUDA-MEMCHECK
Iterate 10 times with GPU 0 or CPU 1: 0
test 4
Outputs[0] is: 2.000000
Outputs[1] is: 3.000000
Outputs[2] is: 4.000000
Outputs[3] is: 5.000000
Outputs[4] is: 6.000000
Outputs[5] is: 7.000000
Outputs[6] is: 8.000000
Outputs[7] is: 9.000000
Outputs[8] is: 10.000000
Outputs[9] is: 11.000000
========= ERROR SUMMARY: 0 errors
$

Can't get matrix*vector multiplication to go faster in CUDA than in CPU

#include <iostream>
#include <assert.h>
#include <sys/time.h>
#define BLOCK_SIZE 32 // CUDA block size
__device__ inline int getValFromMatrix(int* matrix, int row, int col,int matSize) {
if (row<matSize && col<matSize) {return matrix[row*matSize + col];}
return 0;
}
__device__ inline int getValFromVector(int* vector, int row, int matSize) {
if (row<matSize) {return vector[row];}
return 0;
}
__global__ void matVecMultCUDAKernel(int* aOnGPU, int* bOnGPU, int* cOnGPU, int matSize) {
__shared__ int aRowShared[BLOCK_SIZE];
__shared__ int bShared[BLOCK_SIZE];
__shared__ int myRow;
__shared__ double rowSum;
int myIndexInBlock = threadIdx.x;
myRow = blockIdx.x;
rowSum = 0;
for (int m = 0; m < (matSize / BLOCK_SIZE + 1);m++) {
aRowShared[myIndexInBlock] = getValFromMatrix(aOnGPU,myRow,m*BLOCK_SIZE+myIndexInBlock,matSize);
bShared[myIndexInBlock] = getValFromVector(bOnGPU,m*BLOCK_SIZE+myIndexInBlock,matSize);
__syncthreads(); // Sync threads to make sure all fields have been written by all threads in the block to cShared and xShared
if (myIndexInBlock==0) {
for (int k=0;k<BLOCK_SIZE;k++) {
rowSum += aRowShared[k] * bShared[k];
}
}
}
if (myIndexInBlock==0) {cOnGPU[myRow] = rowSum;}
}
static inline void cudaCheckReturn(cudaError_t result) {
if (result != cudaSuccess) {
std::cerr <<"CUDA Runtime Error: " << cudaGetErrorString(result) << std::endl;
assert(result == cudaSuccess);
}
}
static void matVecMultCUDA(int* aOnGPU,int* bOnGPU, int* cOnGPU, int* c, int sizeOfc, int matSize) {
matVecMultCUDAKernel<<<matSize,BLOCK_SIZE>>>(aOnGPU,bOnGPU,cOnGPU,matSize); // Launch 1 block per row
cudaCheckReturn(cudaMemcpy(c,cOnGPU,sizeOfc,cudaMemcpyDeviceToHost));
}
static void matVecMult(int** A,int* b, int* c, int matSize) {
// Sequential implementation:
for (int i=0;i<matSize;i++) {
c[i]=0;
for (int j=0;j<matSize;j++) {
c[i]+=(A[i][j] * b[j]);
}
}
}
int main() {
int matSize = 1000;
int** A,* b,* c;
int* aOnGPU,* bOnGPU,* cOnGPU;
A = new int*[matSize];
for (int i = 0; i < matSize;i++) {A[i] = new int[matSize]();}
b = new int[matSize]();
c = new int[matSize]();
int aSizeOnGPU = matSize * matSize * sizeof(int), bcSizeOnGPU = matSize * sizeof(int);
cudaCheckReturn(cudaMalloc(&aOnGPU,aSizeOnGPU)); // cudaMallocPitch?
cudaCheckReturn(cudaMalloc(&bOnGPU,bcSizeOnGPU));
cudaCheckReturn(cudaMalloc(&cOnGPU,bcSizeOnGPU));
srand(time(NULL));
for (int i=0;i<matSize;i++) {
b[i] = rand()%100;
for (int j=0;j<matSize;j++) {
A[i][j] = rand()%100;
}
}
for (int i=0;i<matSize;i++) {cudaCheckReturn(cudaMemcpy((aOnGPU+i*matSize),A[i],bcSizeOnGPU,cudaMemcpyHostToDevice));}
cudaCheckReturn(cudaMemcpy(bOnGPU,b,bcSizeOnGPU,cudaMemcpyHostToDevice));
int iters=1;
timeval start,end;
// Sequential run:
gettimeofday(&start,NULL);
for (int i=0;i<iters;i++) {matVecMult(A,b,c,matSize);}
gettimeofday(&end,NULL);
std::cout << (end.tv_sec*1000000 + end.tv_usec) - (start.tv_sec*1000000 + start.tv_usec) << std::endl;
// CUDA run:
gettimeofday(&start,NULL);
for (int i=0;i<iters;i++) {matVecMultCUDA(aOnGPU,bOnGPU,cOnGPU,c,bcSizeOnGPU,matSize);}
gettimeofday(&end,NULL);
std::cout << (end.tv_sec*1000000 + end.tv_usec) - (start.tv_sec*1000000 + start.tv_usec) << std::endl;
cudaCheckReturn(cudaFree(aOnGPU));
cudaCheckReturn(cudaFree(bOnGPU));
cudaCheckReturn(cudaFree(cOnGPU));
for (int i = 0; i < matSize; ++i) {
delete[] A[i];
}
delete[] A;
delete[] b;
delete[] c;
}
Gives:
267171
580253
I've followed the guide on http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory on how to do a matrix multiplication. I used shared memory for both the matrix (A) and the vector (B), but no matter what matrix size (100*100 to 20000*20000) or block size (32 to 1024) I choose, the sequential implementation always outperforms the CUDA implementation in terms of speed; it is about twice as fast.
Since I'm using matrix*vector multiplication, the shared arrays and blocks are handled a bit differently; I'm using one block per row of the matrix instead of a 2D block over a part of the matrix.
Is my implementation wrong, or is CUDA simply not faster than the CPU?
First: you perform boundary checks in the CUDA implementation that you don't perform on the CPU. Branching is really expensive on a GPU.
Second: you count the cudaMemcpy in the CUDA timing. It's very uncommon to perform only one multiplication before having to get the result back to the CPU.
Usually (in CG, for example), you perform several hundred multiplications on the GPU before having to copy back.
Third: don't try to implement that yourself (except for educational purposes); use vendor libraries (like CUBLAS, which ships with every CUDA release), which are extremely hard to outperform.
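For the CUBLAS route, the matrix-vector product maps onto a single cublasSgemv call. A minimal sketch, assuming the matrix has been converted to float and stored column-major on the device (the question uses int, which CUBLAS does not handle):

#include <cublas_v2.h>

// y = A * x, with A an n x n column-major matrix already resident on the device.
// aOnGPU, bOnGPU, cOnGPU are hypothetical float device pointers; the handle is
// assumed to be created elsewhere with cublasCreate.
void matVecMultCublas(cublasHandle_t handle, const float* aOnGPU,
                      const float* bOnGPU, float* cOnGPU, int n)
{
    const float alpha = 1.0f, beta = 0.0f;
    cublasSgemv(handle, CUBLAS_OP_N, n, n, &alpha,
                aOnGPU, n, bOnGPU, 1, &beta, cOnGPU, 1);
}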

CUDA: How can I run the parallel reduction code for summation that is described in the NVIDIA paper by Mark Harris?

Although I understand the logic behind the parallel reduction described in this paper, I can't seem to be able to run it for a simple example where the input array is filled with 1s.
Here is what I have achieved so far. Keep in mind that I'm using the Thrust library to manage input and output data.
#include <iostream>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <ctime>
#include <sys/time.h>
#include <sstream>
#include <string>
#include <fstream>
using namespace std;
__global__ void reduce0(int *g_idata, int *g_odata){
extern __shared__ int sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
sdata[tid] = g_idata[i];
__syncthreads();
for(unsigned int s=1; s < blockDim.x; s *= 2) {
if (tid % (2*s) == 0) {
sdata[tid] += sdata[tid + s];
}
__syncthreads();
}
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
int main(void){
int size = 10;
thrust::host_vector<int> data_h_i(size, 1);
//initialize the data, all values will be 1
//so the final sum will be equal to 10
int threadsPerBlock = 256;
int totalBlocks = size/threadsPerBlock + 1;
dim3 dimGrid(totalBlocks,1,1);
dim3 dimBlock(threadsPerBlock, 1, 1);
thrust::device_vector<int> data_v_i = data_h_i;
thrust::device_vector<int> data_v_o(size);
int* output = thrust::raw_pointer_cast(data_v_o.data());
int* input = thrust::raw_pointer_cast(data_v_i.data());
reduce0<<<dimGrid, dimBlock>>>(input, output);
data_v_i.clear();
data_v_i.shrink_to_fit();
thrust::host_vector<int> data_h_o = data_v_o;
data_v_o.clear();
data_v_o.shrink_to_fit();
cout<<data_h_o[0]<<endl;
return 0;
}
The code is simple: I create a host_vector of size size and initialize all the values to 1.
Then I say that we need 256 threads per block and find dynamically the number of blocks needed for my example.
To keep things simple, I create an array of 10 values only, which means that we are going to require only one block. So one kernel invocation will be enough to produce the final result.
My questions are the following:
Question 1
After compiling the above example (nvcc -O3 reduction.cu -arch=sm_21) and entering ./a.out I get the following message:
terminate called after throwing an instance of 'thrust::system::system_error'
what(): unspecified launch failure
I'm not sure what's going on here, but it seems to me that the error comes from the line
sdata[tid] = g_idata[i]
The kernel is an exact copy of the kernel described in the paper so I'm not sure what changes are required in order to fix this problem.
Question 2
If we fix the first problem, how could we make the above code work for an arbitrary size of input array? If for example our size is more than 256, then we would need at least two blocks, so each block would give an output that would then have to be combined with the outputs of other blocks. The paper says that we would need multiple invocations of the kernel; however, I'm not sure how this can be done dynamically.
Thank you in advance
EDIT1: For Question 1, it seems that I wasn't allocating the shared memory correctly. Calling the kernel like this: reduce0<<<dimGrid, dimBlock, size*sizeof(int)>>>(input, output); and also checking that tid is not out of range makes the code work properly.
The new kernel is the following:
__global__ void reduce0(int *g_idata, int *g_odata, int size){
extern __shared__ int sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
if(tid<size){
sdata[tid] = g_idata[i];
__syncthreads();
for(unsigned int s=1; s < size; s *= 2) {
if (tid % (2*s) == 0) {
sdata[tid] += sdata[tid + s];
}
__syncthreads();
}
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
}
I'm still not sure about Question 2 though.
Question 1
Your kernel is using dynamically allocated shared memory:
extern __shared__ int sdata[];
...
sdata[tid] = g_idata[i];
But you are not allocating any dynamic shared memory in your kernel call:
reduce0<<<dimGrid, dimBlock>>>(input, output);   // <-- the third launch-configuration argument (the dynamic shared memory size) is missing
So when you attempt to access the shared memory, you get a kernel fault.
By the way, you can still do cuda error checking on your kernel calls (even though you are using thrust elsewhere).
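A minimal sketch of that check, slotted in around the (fixed) launch in main(), could look like this:

// Launch with the dynamic shared memory size supplied, then check for errors.
reduce0<<<dimGrid, dimBlock, threadsPerBlock*sizeof(int)>>>(input, output);
cudaError_t err = cudaGetLastError();      // catches launch-configuration errors
if (err == cudaSuccess)
    err = cudaDeviceSynchronize();         // catches faults during kernel execution
if (err != cudaSuccess)
    std::cout << "CUDA error: " << cudaGetErrorString(err) << std::endl;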
Question 2
Question 2 is pretty well answered in Mark's paper here. You can see at the bottom of slide 9 that each block writes its partial result to an array in global memory (g_odata[]) which stores one result per block. We then simply launch another kernel of essentially the same type that operates on g_odata[] instead of the original input data. We can repeat this process successively until our partial results (e.g. g_odata[]) contain only 256 results, or however many threads we are launching in a threadblock. We can then sum that final result with a single threadblock and produce a single answer value.
Examples are given in the CUDA sample code here.
Here's an edited version of your code, that shows how to call the two kernels in sequence to handle a larger size. I don't consider this a paragon of reduction programming, just a simple extension of what you already wrote to illustrate the concept. Note that there are a variety of changes throughout the kernel and main code to facilitate the use of the kernel to handle larger data sizes. This method still won't scale beyond a data size of (threadsPerBlock ^2), but again it's just to illustrate the concept of calling multiple kernels in sequence to sum partial results, with fewest modifications to your code.
#include <iostream>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <ctime>
#include <sys/time.h>
#include <sstream>
#include <string>
#include <fstream>
using namespace std;
__global__ void reduce0(int *g_idata, int *g_odata, int size){
extern __shared__ int sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
sdata[tid] = 0;
if(i<size)
sdata[tid] = g_idata[i];
__syncthreads();
for(unsigned int s=1; s < blockDim.x; s *= 2) {
if (tid % (2*s) == 0) {
sdata[tid] += sdata[tid + s];
}
__syncthreads();
}
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
int main(void){
int size = 40000;
thrust::host_vector<int> data_h_i(size, 1);
//initialize the data, all values will be 1
//so the final sum will be equal to size
int threadsPerBlock = 256;
int totalBlocks = (size+(threadsPerBlock-1))/threadsPerBlock;
thrust::device_vector<int> data_v_i = data_h_i;
thrust::device_vector<int> data_v_o(totalBlocks);
int* output = thrust::raw_pointer_cast(data_v_o.data());
int* input = thrust::raw_pointer_cast(data_v_i.data());
reduce0<<<totalBlocks, threadsPerBlock, threadsPerBlock*sizeof(int)>>>(input, output, size);
reduce0<<<1, threadsPerBlock, threadsPerBlock*sizeof(int)>>>(output, input, totalBlocks);
data_v_o[0] = data_v_i[0];
data_v_i.clear();
data_v_i.shrink_to_fit();
thrust::host_vector<int> data_h_o = data_v_o;
data_v_o.clear();
data_v_o.shrink_to_fit();
cout<<data_h_o[0]<<endl;
return 0;
}
After modifying the code made by Robert Crovella to answer my question, here is the final version, which supports an arbitrary number of input values.
#include <iostream>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <ctime>
#include <sys/time.h>
#include <sstream>
#include <string>
#include <fstream>
using namespace std;
__global__ void reduce0(int *g_idata, int *g_odata, int size){
extern __shared__ int sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
sdata[tid] = 0;
if(i<size)
sdata[tid] = g_idata[i];
__syncthreads();
for(unsigned int s=1; s < blockDim.x; s *= 2) {
if (tid % (2*s) == 0) {
sdata[tid] += sdata[tid + s];
}
__syncthreads();
}
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
int main(void){
int size = 939289;
thrust::host_vector<int> data_h_i(size, 1);
//initialize the data, all values will be 1
//so the final sum will be equal to size
int threadsPerBlock = 256;
int totalBlocks = (size+(threadsPerBlock-1))/threadsPerBlock;
thrust::device_vector<int> data_v_i = data_h_i;
thrust::device_vector<int> data_v_o(totalBlocks);
int* output = thrust::raw_pointer_cast(data_v_o.data());
int* input = thrust::raw_pointer_cast(data_v_i.data());
bool turn = true;
while(true){
if(turn){
reduce0<<<totalBlocks, threadsPerBlock, threadsPerBlock*sizeof(int)>>>(input, output, size);
turn = false;
}
else{
reduce0<<<totalBlocks, threadsPerBlock, threadsPerBlock*sizeof(int)>>>(output, input, size);
turn = true;
}
if(totalBlocks == 1) break;
size = totalBlocks;
totalBlocks = ceil((double)totalBlocks/threadsPerBlock);
}
thrust::host_vector<int> data_h_o;
if(turn)
data_h_o = data_v_i;
else
data_h_o = data_v_o;
data_v_i.clear();
data_v_i.shrink_to_fit();
data_v_o.clear();
data_v_o.shrink_to_fit();
cout<<data_h_o[0]<<endl;
return 0;
}