Result of reduction#1 is wrong - cuda

I implemented the reduction#1 form the well-known slides by Mark Harris, but I obtain 0 as result. I filled the input array with the same values shown in the slides. I compiled with cuda 7.0 using the command nvcc reduction1.cu -o red1. Where is the mistake? Thanks.
#include <stdio.h>
#include <cuda_runtime.h>
#define THREADS_PER_BLOCK 16
__global__ void reduce1(int *g_idata, int *g_odata) {
extern __shared__ int sdata[];
// each thread loads one element from global to shared mem
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
sdata[tid] = g_idata[i];
__syncthreads();
// do reduction in shared mem
for(unsigned int s=1; s < blockDim.x; s *= 2)
{
if (tid % (2*s) == 0) sdata[tid] += sdata[tid + s];
__syncthreads();
}
// write result for this block to global mem
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
int main()
{
int inputLength=16;
int hostInput[16]={10,1,8,-1,0,-2,3,5,-2,-3,2,7,0,11,0,2};
int hostOutput=0;
int *deviceInput;
int *deviceOutput;
cudaMalloc((void **)&deviceInput, inputLength * sizeof(int));
cudaMalloc((void **)&deviceOutput, sizeof(int));
cudaMemcpy(deviceInput, hostInput, inputLength * sizeof(int),cudaMemcpyHostToDevice);
reduce1<<<1,THREADS_PER_BLOCK>>>(deviceInput, deviceOutput);
cudaDeviceSynchronize();
cudaMemcpy(&hostOutput, deviceOutput,sizeof(int), cudaMemcpyDeviceToHost);
printf("%d\n",hostOutput);
cudaFree(deviceInput);
cudaFree(deviceOutput);
return 0;
}

As talonmies said, you are using dynamic shared memory, but you are not allocating any memory space for it. You have to specify the size of this memory as the third argument of your kernel execution configuration.
reduce1<<<1, THREADS_PER_BLOCK, 64>>>(deviceInput, deviceOutput);
^^
Another way to fix this code is to use static shared memory. Declare your shared memory like this:
__shared__ int sdata[16];
Please read this before asking questions for CUDA.

Related

Compare Thrust fill with kernel launch speed [duplicate]

I want to add 128-bit vectors with carry. My 128-bit version (addKernel128 in the code below) is twice slower than the basic 32-bit version (addKernel32 below).
Do I have memory coalescing problems ? How can I get better performance ?
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#define UADDO(c, a, b) asm volatile("add.cc.u32 %0, %1, %2;" : "=r"(c) : "r"(a) , "r"(b));
#define UADDC(c, a, b) asm volatile("addc.cc.u32 %0, %1, %2;" : "=r"(c) : "r"(a) , "r"(b));
__global__ void addKernel32(unsigned int *c, const unsigned int *a, const unsigned int *b, const int size)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
while (tid < size)
{
c[tid] = a[tid] + b[tid];
tid += blockDim.x * gridDim.x;
}
}
__global__ void addKernel128(unsigned *c, const unsigned *a, const unsigned *b, const int size)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
while (tid < size / 4)
{
uint4 a4 = ((const uint4 *)a)[tid],
b4 = ((const uint4 *)b)[tid],
c4;
UADDO(c4.x, a4.x, b4.x)
UADDC(c4.y, a4.y, b4.y) // add with carry
UADDC(c4.z, a4.z, b4.z) // add with carry
UADDC(c4.w, a4.w, b4.w) // add with carry (no overflow checking for clarity)
((uint4 *)c)[tid] = c4;
tid += blockDim.x * gridDim.x;
}
}
int main()
{
const int size = 10000000; // 10 million
unsigned int *d_a, *d_b, *d_c;
cudaMalloc((void**)&d_a, size * sizeof(int));
cudaMalloc((void**)&d_b, size * sizeof(int));
cudaMalloc((void**)&d_c, size * sizeof(int));
cudaMemset(d_a, 1, size * sizeof(int)); // dummy init just for the example
cudaMemset(d_b, 2, size * sizeof(int)); // dummy init just for the example
cudaMemset(d_c, 0, size * sizeof(int));
int nbThreads = 512;
int nbBlocks = 1024; // for example
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
addKernel128<<<nbBlocks, nbThreads>>>(d_c, d_a, d_b, size);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float m = 0;
cudaEventElapsedTime(&m, start, stop);
cudaFree(d_c);
cudaFree(d_b);
cudaFree(d_a);
cudaDeviceReset();
printf("Elapsed = %g\n", m);
return 0;
}
Timing CUDA code on a WDDM GPU can be quite difficult for a variety of reasons. Most of these revolve around the fact that the GPU is being managed as a display device by Windows, and this can introduce a variety of artifacts into the timing. One example is that the windows driver and WDDM will batch work for the GPU, and may interleave display work in the middle of CUDA GPU work.
if possible, time your cuda code on linux, or else on a windows GPU
in TCC mode.
for performance, always build without the -G switch. In visual studio, this usually corresponds to building the release, not the debug version of the project.
To get a good performance comparison, it's usually advisable to do some "warm up runs" before actually measuring the timing results. These will eliminate "start-up" and other one-time measurement issues, are you are more likely to get sensible results. You may also wish to run your code a number of times and average the results.
It's also usually advisable to compile with an arch flag that corresponds to your GPU, so for example -arch=sm_20 for a cc2.0 GPU.

Using of shared memory not showing desired result

I am trying to learn the usuage of Shared memory with a view to increase the performance . here I am trying to copy the global memory to shared memory. but when I have single block(256 thread) it gives the result and with more than 1 block it gives random result.
#include <cuda.h>
#include <stdio.h>
__global__ void staticReverse(int *d, int n)
{
__shared__ int s[400];
int t = blockIdx.x * blockDim.x + threadIdx.x;
d[t] = d[t]*d[t];
s[t] =d[t];
__syncthreads();
d[t] = s[t];
}
__global__ void dynamicReverse(int *d, int n)
{
extern __shared__ int s[];
int t = threadIdx.x;
s[t] = d[t]*d[t];
__syncthreads();
d[t] = s[t];
}
int main(void)
{
const int n = 400;
int a[n], d[n];
for (int i = 0; i < n; i++)
{
a[i] = i;
}
int *d_d;
cudaMalloc(&d_d, n * sizeof(int));
// run version with static shared memory
int block_size = 256;
int n_blocks = n/block_size + (n%block_size == 0 ? 0:1);
cudaMemcpy(d_d, a, n*sizeof(int), cudaMemcpyHostToDevice);
staticReverse<<<n_blocks,block_size>>>(d_d, n);
cudaMemcpy(d, d_d, n*sizeof(int), cudaMemcpyDeviceToHost);
for (int i = 0; i < n; i++)
{
printf("%d\n",d[i]);
}
}
1)what does the third argument in dynamicReverse<<<n_blocks,block_size,n*sizeof(int)>>>(d_d, n);
kernal call does? does it allocates shared memory for entire block or thread.
2) if I required more than 64kb of shared memory per multiprocessor in compute capability 5.0 what I need to do?
In your static shared memory allocation code you had three issues:
The size of the statically allocated shared memory should comply with the block size, not with the size of the input array,
You should use local thread index for indexing shared memory, instead of the global one;
You had no array out of bounds checking.
The dynamic shared memory allocation code had the same issues #2 and #3 as above, plus the fact that you were indexing global memory with local thread index, instead of global. You can use the third argument to specify the size of the shared memory to be allocated. In particular, you should allocate an amount of 256 ints, i.e., related to the block size, similarly to the static shared memory allocation case.
Here is the complete working code:
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/***********************************/
/* SHARED MEMORY STATIC ALLOCATION */
/***********************************/
#include <cuda.h>
#include <stdio.h>
__global__ void staticReverse(int *d, int n)
{
__shared__ int s[256];
int t = blockIdx.x * blockDim.x + threadIdx.x;
if (t < n) {
d[t] = d[t]*d[t];
s[threadIdx.x] =d[t];
__syncthreads();
d[t] = s[threadIdx.x];
}
}
/************************************/
/* SHARED MEMORY DYNAMIC ALLOCATION */
/************************************/
__global__ void dynamicReverse(int *d, int n)
{
extern __shared__ int s[];
int t = blockIdx.x * blockDim.x + threadIdx.x;
if (t < n) {
s[threadIdx.x] = d[t]*d[t];
__syncthreads();
d[t] = s[threadIdx.x];
}
}
int main(void)
{
const int n = 400;
int* a = (int*) malloc(n*sizeof(int));
int* d = (int*) malloc(n*sizeof(int));
for (int i = 0; i < n; i++) { a[i] = i; }
int *d_d; gpuErrchk(cudaMalloc(&d_d, n * sizeof(int)));
// run version with static shared memory
int block_size = 256;
int n_blocks = n/block_size + (n%block_size == 0 ? 0:1);
gpuErrchk(cudaMemcpy(d_d, a, n*sizeof(int), cudaMemcpyHostToDevice));
//staticReverse<<<n_blocks,block_size>>>(d_d, n);
dynamicReverse<<<n_blocks,block_size,256*sizeof(int)>>>(d_d, n);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaMemcpy(d, d_d, n*sizeof(int), cudaMemcpyDeviceToHost));
for (int i = 0; i < n; i++) { printf("%d\n",d[i]); }
}

Finding minimum in GPU slower than CPU

I have implemented this code: http://www.cuvilib.com/Reduction.pdf in order to calculate the sum of the elements of a matrix.
However in GPU it runs much slower than in CPU.
I got i7 processor and NVIDIA GT 540M graphics card.
Is it supposed to be that way or something else?
EDIT: I use version 3 of the above code in Ubuntu 13.04 and I compile it using Eclipse Nsight. The size of the matrix is 2097152 elements. It executes in 3.6 ms whereas the CPU version in around 1.0 ms. Below is the whole code:
#include <stdio.h>
#include <stdlib.h>
#include <thrust/sort.h>
#include <sys/time.h>
#include <omp.h>
#include <iostream>
#include <algorithm>
#define MIN(a,b) (((a)<(b))?(a):(b))
static const int WORK_SIZE = 2097152;
int find_min(int *a,int length){
int min = a[0];
for (int i=1;i<length;i++)
if (a[i]<min)
min=a[i];
return min;
}
__global__ static void red_min(int *g_idata,int *g_odata) {
extern __shared__ int sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
sdata[tid]= g_idata[i];
__syncthreads();
for(unsigned int s=blockDim.x/2; s > 0; s >>= 1) {
if (tid<s) {
sdata[tid] = MIN(sdata[tid],sdata[tid + s]);
}
__syncthreads();
}
if (tid == 0)
g_odata[blockIdx.x] = sdata[0];
}
int main(void) {
int *d1,*d2;
int i,*result;
int *idata,*fdata;
srand ( time(NULL) );
result = (int *)malloc(sizeof(int));
idata = (int *)malloc(WORK_SIZE*sizeof(int));
fdata = (int *)malloc(WORK_SIZE*sizeof(int));
cudaMalloc((int**)&d1,WORK_SIZE*sizeof(int));
cudaMalloc((int**)&d2,WORK_SIZE*sizeof(int));
for (i = 0; i < WORK_SIZE; i++){
idata[i] = rand();
fdata[i] = i;
}
struct timeval begin, end;
gettimeofday(&begin, NULL);
*result = find_min(idata,WORK_SIZE);
printf( "Minimum Element CPU: %d \n", *result);
gettimeofday(&end, NULL);
int time = (end.tv_sec * (unsigned int)1e6 + end.tv_usec) - (begin.tv_sec * (unsigned int)1e6 + begin.tv_usec);
printf("Microseconds elapsed CPU: %d\n", time);
cudaMemcpy(d1,idata,WORK_SIZE*sizeof(int),cudaMemcpyHostToDevice);
cudaEvent_t start, stop;
cudaEventCreate( &start);
cudaEventCreate( &stop);
cudaEventRecord(start,0);
int num_blocks = 16384;
bool flag = true;
while (num_blocks>0){
if (flag) {
red_min<<<num_blocks,128,128*sizeof(int)>>>(d1,d2);
}
else {
red_min<<<num_blocks,128,128*sizeof(int)>>>(d2,d1);
}
num_blocks /= 128;
flag = !flag;
}
GT540M is a mobile GPU, so I assume you're running on a laptop, and furthermore you may be hosting the X display on the 540M GPU.
I built a complete version of your code:
#include <stdio.h>
#include <stdlib.h>
#include <thrust/sort.h>
#include <sys/time.h>
#include <omp.h>
#include <iostream>
#include <algorithm>
#define MIN(a,b) (((a)<(b))?(a):(b))
static const int WORK_SIZE = 2097152;
int find_min(int *a,int length){
int min = a[0];
for (int i=1;i<length;i++)
if (a[i]<min)
min=a[i];
return min;
}
__global__ static void red_min(int *g_idata,int *g_odata) {
extern __shared__ int sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
sdata[tid]= g_idata[i];
__syncthreads();
for(unsigned int s=blockDim.x/2; s > 0; s >>= 1) {
if (tid<s) {
sdata[tid] = MIN(sdata[tid],sdata[tid + s]);
}
__syncthreads();
}
if (tid == 0)
g_odata[blockIdx.x] = sdata[0];
}
int main(void) {
int *d1,*d2;
int i,*result;
int *idata,*fdata;
srand ( time(NULL) );
result = (int *)malloc(sizeof(int));
idata = (int *)malloc(WORK_SIZE*sizeof(int));
fdata = (int *)malloc(WORK_SIZE*sizeof(int));
cudaMalloc((int**)&d1,WORK_SIZE*sizeof(int));
cudaMalloc((int**)&d2,WORK_SIZE*sizeof(int));
for (i = 0; i < WORK_SIZE; i++){
idata[i] = rand();
fdata[i] = i;
}
struct timeval begin, end;
gettimeofday(&begin, NULL);
*result = find_min(idata,WORK_SIZE);
printf( "Minimum Element CPU: %d \n", *result);
gettimeofday(&end, NULL);
int time = (end.tv_sec * (unsigned int)1e6 + end.tv_usec) - (begin.tv_sec * (unsigned int)1e6 + begin.tv_usec);
printf("Microseconds elapsed CPU: %d\n", time);
cudaMemcpy(d1,idata,WORK_SIZE*sizeof(int),cudaMemcpyHostToDevice);
cudaEvent_t start, stop;
cudaEventCreate( &start);
cudaEventCreate( &stop);
cudaEventRecord(start,0);
int num_blocks = 16384;
bool flag = true;
int loops = 0;
while (num_blocks>0){
if (flag) {
red_min<<<num_blocks,128,128*sizeof(int)>>>(d1,d2);
}
else {
red_min<<<num_blocks,128,128*sizeof(int)>>>(d2,d1);
}
num_blocks /= 128;
flag = !flag;
loops++;
}
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float et = 0.0f;
cudaEventElapsedTime(&et, start, stop);
printf("GPU time: %fms, in %d loops\n", et, loops);
int gpuresult;
if (flag)
cudaMemcpy(&gpuresult, d1, sizeof(int), cudaMemcpyDeviceToHost);
else
cudaMemcpy(&gpuresult, d2, sizeof(int), cudaMemcpyDeviceToHost);
printf("GPU min: %d\n", gpuresult);
return 0;
}
compiled it:
$ nvcc -O3 -arch=sm_20 -o t264 t264.cu
and ran it on a M2050 GPU, RHEL 5.5, CUDA 5.5, Xeon X5650 CPU
$ ./t264
Minimum Element CPU: 288
Microseconds elapsed CPU: 1217
GPU time: 0.621408ms, in 3 loops
GPU min: 288
$
So my CPU results were pretty close to yours, but my GPU results were about 5-6x faster. If we compare M2050 to your GT540M, we see that the M2050 has 14 SMs whereas the GT540M has 2. More importantly, the M2050 has about 5x the memory bandwidth of your GT540M GPU (28.8GB/s peak theoretical for GT540M vs. ~150GB/s peak theoretical for M2050)
Since a well written parallel reduction is a memory bandwidth constrained code on GPUs, the speed difference between your GPU and my GPU makes sense.
So I would say your results are probably about what is expected, and to get better results you will probably need a faster GPU.
Also, if your GT540M is also hosting an X display, it's possible that the GPU timing is corrupted by display activity. If we are timing a single kernel, this is not normally an issue - the kernel execution interrupts the display processing briefly. But when we are timing a sequence of kernels in succession, it's possible for the display tasks to jump in and execute in-between kernel calls (the GPU is multi-tasking when it is asked to both support a display and also process CUDA code). Therefore, this may be a possible performance impact in your case as well.

CUDA: How can I run the parallel reduction code for summation that is described in the NVIDIA paper by Mark Harris?

Although I understand the logic behind parallel reduction described in this paper, I can't seem to be able to run it for a simple example where the input array has size 1s.
Here is what I achieved so far. Keep in mind that I'm using the thrust library to manage input and output data.
#include <iostream>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <ctime>
#include <sys/time.h>
#include <sstream>
#include <string>
#include <fstream>
using namespace std;
__global__ void reduce0(int *g_idata, int *g_odata){
extern __shared__ int sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
sdata[tid] = g_idata[i];
__syncthreads();
for(unsigned int s=1; s < blockDim.x; s *= 2) {
if (tid % (2*s) == 0) {
sdata[tid] += sdata[tid + s];
}
__syncthreads();
}
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
int main(void){
int size = 10;
thrust::host_vector<int> data_h_i(size, 1);
//initialize the data, all values will be 1
//so the final sum will be equal to 10
int threadsPerBlock = 256;
int totalBlocks = size/threadsPerBlock + 1;
dim3 dimGrid(totalBlocks,1,1);
dim3 dimBlock(threadsPerBlock, 1, 1);
thrust::device_vector<int> data_v_i = data_h_i;
thrust::device_vector<int> data_v_o(size);
int* output = thrust::raw_pointer_cast(data_v_o.data());
int* input = thrust::raw_pointer_cast(data_v_i.data());
reduce0<<<dimGrid, dimBlock>>>(input, output);
data_v_i.clear();
data_v_i.shrink_to_fit();
thrust::host_vector<int> data_h_o = data_v_o;
data_v_o.clear();
data_v_o.shrink_to_fit();
cout<<data_h_o[0]<<endl;
return 0;
}
the code is simple, I create a host_vector of size size and initialize all the values to 1.
then I say that we need 256 threads per each block and find dynamically the amount of blocks needed for my example.
To keep things simple, I create an array of 10 values only, which means that we are going to require only one block. So one kernel invocation will be enough to produce the final result.
My questions are the following:
Question 1
After compiling the above example (nvcc -O3 reduction.cu -arch=sm_21) and entering ./a.out I get the following message:
terminate called after throwing an instance of 'thrust::system::system_error'
what(): unspecified launch failure
I'm not sure what's going on here, but it seems to me that the error comes from the line
sdata[tid] = g_idata[i]
The kernel is an exact copy of the kernel described in the paper so I'm not sure what changes are required in order to fix this problem.
Question 2
If we fix the first problem, how could we make the above code work for arbitrary size of input array? If for example our size is more than 256, then we would need at least two blocks, so each block will give an output that will then have to be combined with the outputs of other blocks. In the paper it says that we would need multiple invocations of the kernel, however I'm not sure how this can be done dynamically.
Thank you in advance
EDIT1: For Question 1 it seems that I don't allocate memory for the shared memory correctly. Calling the kernel like that: reduce0<<<dimGrid, dimBlock, size*sizeof(int)>>>(input, output); and also checking to see if tid is not out of range.
makes the code work properly.
The new kernel is the following:
__global__ void reduce0(int *g_idata, int *g_odata, int size){
extern __shared__ int sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
if(tid<size){
sdata[tid] = g_idata[i];
__syncthreads();
for(unsigned int s=1; s < size; s *= 2) {
if (tid % (2*s) == 0) {
sdata[tid] += sdata[tid + s];
}
__syncthreads();
}
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
}
I'm still not sure about Question 2 though.
Question 1
Your kernel is using dynamically allocated shared memory:
extern __shared__ int sdata[];
...
sdata[tid] = g_idata[i];
But you are not allocating any dynamic shared memory in your kernel call:
reduce0<<<dimGrid, dimBlock>>>(input, output);
^
|
missing shared memory parameter.
So when you attempt to access the shared memory, you get a kernel fault.
By the way you can still do cuda error checking on your kernel calls (even though you are using thrust elsewhere).
Question 2
Question 2 is pretty well answered in Mark's paper here You can see at the bottom of slide 9 that each block writes it's partial result to an array in global memory (g_odata[]) which stores one result per block. We then simply launch another kernel of essentially the same type that is operating on g_odata[] instead of the original input data. We can do this process successively until our partial results (e.g. g_odata[]) only contain 256 results, or how many threads we are launching in a threadblock. We can then sum that final result with a single threadblock and produce a single answer value.
examples are given in the cuda sample code here.
Here's an edited version of your code, that shows how to call the two kernels in sequence to handle a larger size. I don't consider this a paragon of reduction programming, just a simple extension of what you already wrote to illustrate the concept. Note that there are a variety of changes throughout the kernel and main code to facilitate the use of the kernel to handle larger data sizes. This method still won't scale beyond a data size of (threadsPerBlock ^2), but again it's just to illustrate the concept of calling multiple kernels in sequence to sum partial results, with fewest modifications to your code.
#include <iostream>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <ctime>
#include <sys/time.h>
#include <sstream>
#include <string>
#include <fstream>
using namespace std;
__global__ void reduce0(int *g_idata, int *g_odata, int size){
extern __shared__ int sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
sdata[tid] = 0;
if(i<size)
sdata[tid] = g_idata[i];
__syncthreads();
for(unsigned int s=1; s < blockDim.x; s *= 2) {
if (tid % (2*s) == 0) {
sdata[tid] += sdata[tid + s];
}
__syncthreads();
}
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
int main(void){
int size = 40000;
thrust::host_vector<int> data_h_i(size, 1);
//initialize the data, all values will be 1
//so the final sum will be equal to size
int threadsPerBlock = 256;
int totalBlocks = (size+(threadsPerBlock-1))/threadsPerBlock;
thrust::device_vector<int> data_v_i = data_h_i;
thrust::device_vector<int> data_v_o(totalBlocks);
int* output = thrust::raw_pointer_cast(data_v_o.data());
int* input = thrust::raw_pointer_cast(data_v_i.data());
reduce0<<<totalBlocks, threadsPerBlock, threadsPerBlock*sizeof(int)>>>(input, output, size);
reduce0<<<1, threadsPerBlock, threadsPerBlock*sizeof(int)>>>(output, input, totalBlocks);
data_v_o[0] = data_v_i[0];
data_v_i.clear();
data_v_i.shrink_to_fit();
thrust::host_vector<int> data_h_o = data_v_o;
data_v_o.clear();
data_v_o.shrink_to_fit();
cout<<data_h_o[0]<<endl;
return 0;
}
After modifying the code made by Robert Crovella to answer my question, here is the final version which supports arbitrary amount of input values.
#include <iostream>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <ctime>
#include <sys/time.h>
#include <sstream>
#include <string>
#include <fstream>
using namespace std;
__global__ void reduce0(int *g_idata, int *g_odata, int size){
extern __shared__ int sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
sdata[tid] = 0;
if(i<size)
sdata[tid] = g_idata[i];
__syncthreads();
for(unsigned int s=1; s < blockDim.x; s *= 2) {
if (tid % (2*s) == 0) {
sdata[tid] += sdata[tid + s];
}
__syncthreads();
}
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
int main(void){
int size = 939289;
thrust::host_vector<int> data_h_i(size, 1);
//initialize the data, all values will be 1
//so the final sum will be equal to size
int threadsPerBlock = 256;
int totalBlocks = (size+(threadsPerBlock-1))/threadsPerBlock;
thrust::device_vector<int> data_v_i = data_h_i;
thrust::device_vector<int> data_v_o(totalBlocks);
int* output = thrust::raw_pointer_cast(data_v_o.data());
int* input = thrust::raw_pointer_cast(data_v_i.data());
bool turn = true;
while(true){
if(turn){
reduce0<<<totalBlocks, threadsPerBlock, threadsPerBlock*sizeof(int)>>>(input, output, size);
turn = false;
}
else{
reduce0<<<totalBlocks, threadsPerBlock, threadsPerBlock*sizeof(int)>>>(output, input, size);
turn = true;
}
if(totalBlocks == 1) break;
size = totalBlocks;
totalBlocks = ceil((double)totalBlocks/threadsPerBlock);
}
thrust::host_vector<int> data_h_o;
if(turn)
data_h_o = data_v_i;
else
data_h_o = data_v_o;
data_v_i.clear();
data_v_i.shrink_to_fit();
data_v_o.clear();
data_v_o.shrink_to_fit();
cout<<data_h_o[0]<<endl;
return 0;
}

cudaMalloc global array cause seg fault

I found some difficulty when I try to access a global array from function that's executed from device:
float globTemp[3][3] = "some value in here";
__device__ float* globTemp_d;
__global__ void compute(int *a, int w)
{
int x = threadIdx.x + blockDim.x * blockIdx.x;
int y = threadIdx.y + blockDim.y * blockIdx.y;
int i = y*w+x;
if(x<3 && y<3)
a[i] = 1+globTemp_d[i];
}
int hostFunc(){
float *a_d;
cudaMalloc((void**)&a_d, 3*3*sizeof(int));
cudaMalloc((void**)&globTemp_d, 3*3*sizeof(int));
cudaMemcpy(globTemp_d,globTemp, 3*3*sizeof(float), cudaMemcpyHostToDevice);
compute<<<1,1>>>(a_d,3);
cudaMemcpy(a,a_d, 3*3*sizeof(float), cudaMemcpyDeviceToHost);
}
However, I get seg fault when i try to access globTemp_d[i]. Am I doing something wrong in here?
There are a variety of problems with your code:
Your grid is a 1D grid of 1D threadblocks (in fact you are launching a single block of 1 thread) but your kernel is written as if it were expecting a 2D threadblock structure (using .x and .y built-in variables). A single thread won't get the work done certainly, and a 1D threadblock won't work with your kernel code.
__device__ variables are not accessed with cudaMalloc and cudaMemcpy. We use a different set of API calls like cudaMemcpyToSymbol.
You're not doing any cuda error checking which is always recommended when you're having difficulty. You should do cuda error checking on both API calls and kernel calls.
You're mixing float variables (a_d ) with int variables in the kernel parameters (int *a) so I don't think this code would compile without at least a warning. And that can lead to strange behavior of course if you ignore it.
This is the closest I could come to your code while fixing all the errors:
#include <stdio.h>
__device__ float* globTemp_d;
__global__ void compute(float *a, int w)
{
int x = threadIdx.x + blockDim.x * blockIdx.x;
int y = threadIdx.y + blockDim.y * blockIdx.y;
int i = (y*w)+x;
if((x<3) && (y<3))
a[i] = 1.0f+globTemp_d[i];
}
int main(){
float *a_d, *d_globTemp;
float globTemp[3][3] = {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f, 0.9f};
float a[(3*3)];
dim3 threads(3,3);
dim3 blocks(1);
cudaMalloc((void**)&a_d, 3*3*sizeof(float));
cudaMalloc((void**)&d_globTemp, 3*3*sizeof(float));
cudaMemcpy(d_globTemp,globTemp, 3*3*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(globTemp_d, &d_globTemp, sizeof(float *));
compute<<<blocks,threads>>>(a_d,3);
cudaMemcpy(a,a_d, 3*3*sizeof(float), cudaMemcpyDeviceToHost);
printf("results:\n");
for (int i = 0; i<(3*3); i++)
printf("a[%d] = %f\n", i, a[i]);
return 0;
}
This code can be simplified by dispensing with the __device__ variable and just passing d_globTemp as a parameter to the kernel, and using it in place of references to globTemp_d. However I did not make that simplification.