Separating even and odd numbers in CUDA - cuda

I have an array of numbers as {1,2,3,4,5,6,7,8,9,10} and I want to separate even and odd numbers as:
even = {2,4,6,8}
and:
odd = {1,3,5,7}
I am aware of atomic operations in CUDA, and also aware that the output is not expected to suffer from race conditions. I don't want to use atomic operations. How can I achieve this without using atomic keywords?
CODE:
#include <stdio.h>
#include <cuda.h>
// Kernel that executes on the CUDA device
__global__ void square_array(float *total,float *even,float *odd, int N)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int a=total[idx];
if ((a%2)==0)
{
for (int i=0;i<=idx;i++)
{
int b = even[i];
if(b==0)
{
even[i] = total[idx];
break;
}
}
}
else
{
for (int i=0;i<idx;i++)
{
int c = odd[i];
odd[i] = total[idx];
break;
}
}
}
// main routine that executes on the host
int main(void)
{
float *total_h,*even_h, *odd_h,*total_d, *even_d,*odd_d; // Pointer to host & device arrays
const int N = 10; // Number of elements in arrays
size_t size = N * sizeof(float);
total_h = (float *)malloc(size); // Allocate array on host
even_h = (float *)malloc(size); // Allocate array on host
odd_h = (float *)malloc(size); // Allocate array on host
cudaMalloc((void **) &total_d, size);
cudaMalloc((void **) &even_d, size);
cudaMemset(even_d,0,size);
cudaMalloc((void **) &odd_d, size); // Allocate array on device
cudaMemset(odd_d,0,size);
// Initialize host array and copy it to CUDA device
for (int i=0; i<N; i++) total_h[i] = (float)i+1;
cudaMemcpy(total_d, total_h, size, cudaMemcpyHostToDevice);
// Do calculation on device:
square_array <<< 1,10 >>> (total_d,even_d,odd_d, N);
// Retrieve result from device and store it in host array
cudaMemcpy(even_h, even_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
cudaMemcpy(odd_h, odd_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
// Print results
printf("total Numbers\n");
for (int i=0; i<N; i++) printf("%f\n",total_h[i]);
printf("EVEN Numbers\n");
for (int i=0; i<N; i++) printf("%f\n",even_h[i]);
printf("ODD Numbers\n");
for (int i=0; i<N; i++) printf("%f\n",odd_h[i]);
// Cleanup
free(total_h);
free(even_h);
free(odd_h);
cudaFree(total_d);
cudaFree(even_d);
cudaFree(odd_d);
}
OUTPUT:

As suggested by Jared Hoberock, it would be much more easy to use the efficient partitioning algorithm available in CUDA Thrust instead of starting the development of a partitioning routine of your own. Below, please find a complete worked example.
#include <thrust\device_vector.h>
#include <thrust\partition.h>
#include <thrust\execution_policy.h>
struct is_even { __host__ __device__ bool operator()(const int &x) { return (x % 2) == 0; } };
void main() {
const int N = 10;
thrust::host_vector<int> h_data(N);
for (int i=0; i<N; i++) h_data[i] = i;
thrust::device_vector<int> d_data(h_data);
thrust::device_vector<int> d_evens(N/2);
thrust::device_vector<int> d_odds(N/2);
thrust::partition_copy(d_data.begin(), d_data.end(), d_evens.begin(), d_odds.begin(), is_even());
printf("Even numbers\n");
for (int i=0; i<N/2; i++) {
int val = d_evens[i];
printf("evens[%i] = %i\n",i,val);
}
printf("Odd numbers\n");
for (int i=0; i<N/2; i++) {
int val = d_odds[i];
printf("odds[%i] = %i\n",i,val);
}
}

Related

cuda copy data which dynamic malloc in kernel from device memory

I met a problem about using cudaMemcpy with cudaMemcpyDeviceToHost.
There is a struct which have a pointer int* a, It will malloc in the kernel function.
And then I need copy this int* a to host memory.
My question is: I didn't know how it can not work by using cudaMemcpy.
There my codes:
#include <cuda_runtime.h>
#include <stdio.h>
typedef struct { int n, m; int *a; } myst;
__global__ void xthread(myst *st)
{
unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
myst *mst = &st[idx];
mst->n = idx;
mst->m = idx+1;
mst->a = (int *)malloc((mst->m)*sizeof(int));
mst->a[0] = idx;
}
int main(int argc,char **argv)
{
dim3 dimGrid(1);
dim3 dimBlock(2);
myst *mst = NULL;
myst *hst = (myst *)malloc(2 * sizeof(myst));
cudaMalloc(&mst, 2 * sizeof(myst));
xthread<<<dimGrid, dimBlock>>>(mst);
cudaDeviceSynchronize();
cudaMemcpy(&hst[0],&mst[0],sizeof(myst),cudaMemcpyDeviceToHost);
cudaMemcpy(&hst[1],&mst[1],sizeof(myst),cudaMemcpyDeviceToHost);
int *pInt1 = (int *)malloc((hst[0].m)*sizeof(int)) ;
int *pInt2 = (int *)malloc((hst[1].m)*sizeof(int)) ;
cudaMemcpy(pInt1, hst[0].a, (hst[0].m)*sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(pInt2, hst[1].a, (hst[1].m)*sizeof(int), cudaMemcpyDeviceToHost);
printf("%d\t%d\t%d\n",hst[0].n,hst[0].m, pInt1[0]);
printf("%d\t%d\t%d\n",hst[1].n,hst[1].m, pInt2[0]);
free(pInt1);
free(pInt2);
return 0;
}
The codes will go warning about "Cuda API error detected: cudaMemcpy returned (0xb)"
I saw a similar question : copy data which is allocated in device from device to host
But it seem that can not solve my problem.
Thx.
Alright, I work it out with a stupid way (-.-!!).
While return form the kernel function, I count how many space I have to malloc in Host and Device, and cudaMalloc again a big space . Next, in other kernel function named ythread, copy the data which in the Heap to the big space.
typedef struct { int n, m; int *a; } myst;
__global__ void xthread(myst *st) {
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
myst *mst = &st[idx];
mst->n = idx;
mst->m = idx + 1;
mst->a = (int *) malloc((mst->m) * sizeof(int));
for (int i = 0; i < mst->m; i++) {
mst->a[i] = idx + 900 + i * 10;
}
}
__global__ void ythread(myst *st, int *total_a) {
unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
myst *mst = &st[idx];
int offset=0;
for(int i=0; i<idx; i++) {
offset += st[i].m;
}
for(int i=0; i<mst->m; i++) {
total_a[offset+i] = mst->a[i];
}
}
int main(int argc,char **argv) {
dim3 dimGrid(1);
dim3 dimBlock(2);
myst *mst = NULL;
cudaMalloc((void**)&mst, dimBlock.x * sizeof(myst));
xthread<<<dimGrid, dimBlock>>>(mst);
cudaDeviceSynchronize();
myst *hst = (myst *)malloc(dimBlock.x * sizeof(myst));
cudaMemcpy(hst, mst, dimBlock.x*sizeof(myst),cudaMemcpyDeviceToHost);
int t_size = 0;
for(int i=0; i<dimBlock.x; i++) {
t_size += hst[i].m;
}
printf("t_size:%d\n", t_size);
int * t_a_h = (int *)malloc(t_size*sizeof(int));
int * t_a_d = NULL;
cudaMalloc((void**)&t_a_d, t_size*sizeof(int));
ythread<<<dimGrid, dimBlock>>>(mst, t_a_d);
cudaDeviceSynchronize();
cudaMemcpy(t_a_h, t_a_d, t_size*sizeof(int),cudaMemcpyDeviceToHost);
for(int i=0; i<t_size; i++) {
printf("t_a_h[%d]:%d\n", i, t_a_h[i]);
}
free(t_a_h);
cudaFree(mst);
cudaFree(t_a_d);
return 0;
}
Emmmmmm, it work, but I think there is a better way to solve this problem.

Computing the mean of 2000 2D-arrays with CUDA C

I have 2000 2D-arrays (each array is 1000x1000). I need to compute the mean of each one and put the result in one 2000 vector.
I tried to do that by calling the kernel for each 2D-array, but it is naive, and I want to do the computation once.
What I have been doing is a kernel for one 2D-array. I want to make my kernel do this for 2000 2D-arrays, but in one kernel.
#include <stdio.h>
#include <cuda.h>
#include <time.h>
void init_mat(float *a, const int N, const int M);
void print_mat(float *a, const int N, const int M, char *d);
void print_array(float *a, const int N, char *d);
const int threadsPerBlock=256;
__global__
void kernel(float *mat, float *out, const int N, const int M){
__shared__ float cache[threadsPerBlock];
int tid=threadIdx.x+blockIdx.x*blockDim.x;
int cacheIndex = threadIdx.x;
float sum=0;
if (tid<M) {
for(int i=0; i<N; i++)
sum += mat[(i*M)+tid];
cache[cacheIndex] = sum;
out[tid] =cache[cacheIndex];
}
__syncthreads();
int i = blockDim.x/2;
while(i != 0) {
if(cacheIndex<i)
cache[cacheIndex]+= cache[cacheIndex +i];
__syncthreads();
I /= 2;
}
if (cacheIndex == 0)
out[blockIdx.x]=cache[0];
}
int main (void) {
srand( time(NULL) );
float *a, *b, *c;
float *dev_a, *dev_b, *dev_c;
int N=1000;
int M=1000;
b=(float*)malloc(sizeof(float)*N*M);
c=(float*)malloc(sizeof(float)*M);
init_mat(b, N, M);
printf("<<<<<<<<<< initial data:\n");
print_mat(b, N, M, "matrix");
cudaMalloc((void**)&dev_b, sizeof(float)*N*M);
cudaMalloc((void**)&dev_c, sizeof(float)*M);
cudaMemcpy(dev_b, b, sizeof(float)*N*M, cudaMemcpyHostToDevice);
printf("\n\nRunning Kernel...\n\n");
kernel<<<M/256+1, 256>>>(dev_b, dev_c, N, M);
cudaMemcpy(c, dev_c, sizeof(float)*M, cudaMemcpyDeviceToHost);
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
printf(">>>>>>>>>> final data:\n");
print_array(c, M, "out-vector");
};
void init_mat(float *a, const int N, const int M) {
int i, j;
for(i=0; i<N; i++)
for(j=0; j<M; j++)
a[i*M+j] = rand() % 100 + 1;
}
void print_mat(float *a, const int N, const int M, char *d) {
int i, j;
for(i=0; i<N; i++){
printf("\n%s[%d]:", d, i);
for (j=0; j<M; j++)
printf("\t%6.4f", a[i*M+j]);
}
printf("\n");
}
void print_array(float *a, const int N, char *d) {
int i;
for(i=0; i<N; i++)
printf("\n%s[%d]: %f",d, i, a[i]);
printf("\n");
}
For a reasonably large number of arrays (e.g. 2000) and reasonably large sized arrays (e.g. 2000), the GPU can be fairly efficient if we assign a block to perform the sum reduction (and mean calculation) for each array. This means if you have 2000 arrays we will launch 2000 blocks.
In order to handle arbitrary sized arrays with a fixed number of threads per block, we will use an idea like the grid-striding loop but instead we will cause each block to use a block-striding loop to load all the data associated with a particular array. This means the threads of each block will "stride" through the assigned array, to load all the elements of that array.
Apart from this, the main reduction operation is similar to what you have written, and calculation of the mean is trivial this way - we can calculate the mean before writing the result to global memory, once we have the sum calculated via reduction.
Here is a worked example. If you compile with -DMEAN the code will output the mean of each array. If you omit that compile switch, the code will output the sum of each array. Let N be the number of arrays, and let K be the size of each array.
$ cat t1285.cu
#include <stdio.h>
const size_t N = 1000; // number of arrays
const size_t K = 1000; // size of each array
const int nTPB = 256; // number of threads per block, must be a power-of-2
typedef float mytype; // type of data to be summed
// produce the sum or mean of each array
template <typename T>
__global__ void breduce(const T * __restrict__ idata, T * __restrict__ odata, const int bsize){
__shared__ T sdata[nTPB];
T sum = 0;
//block-striding loop
size_t offset = blockIdx.x*bsize + threadIdx.x;
while (offset < (blockIdx.x+1)*bsize){
sum += idata[offset];
offset += blockDim.x;}
sdata[threadIdx.x] = sum;
__syncthreads();
//shared memory reduction sweep
for (int i = nTPB>>1; i > 0; i>>=1){
if (threadIdx.x < i) sdata[threadIdx.x] += sdata[threadIdx.x+i];
__syncthreads();}
// write output sum for this block/array
#ifndef MEAN
if (!threadIdx.x) odata[blockIdx.x] = sdata[0];
#else
if (!threadIdx.x) odata[blockIdx.x] = sdata[0]/bsize;
#endif
}
int main(){
mytype *h_idata, *h_odata, *d_idata, *d_odata;
h_idata=(mytype *)malloc(N*K*sizeof(mytype));
h_odata=(mytype *)malloc(N*sizeof(mytype));
cudaMalloc(&d_idata, N*K*sizeof(mytype));
cudaMalloc(&d_odata, N*sizeof(mytype));
for (size_t i = 0; i < N; i++)
for (size_t j = 0; j < K; j++)
h_idata[i*K+j] = 1 + (i&1); // fill alternating arrays with 1 and 2
memset(h_odata, 0, N*sizeof(mytype)); // zero out
cudaMemset(d_odata, 0, N*sizeof(mytype)); // zero out
cudaMemcpy(d_idata, h_idata, N*K*sizeof(mytype), cudaMemcpyHostToDevice);
breduce<<<N, nTPB>>>(d_idata, d_odata, K);
cudaMemcpy(h_odata, d_odata, N*sizeof(mytype), cudaMemcpyDeviceToHost);
// validate
for (size_t i = 0; i < N; i++)
#ifndef MEAN
if (h_odata[i] != (K*(1 + (i&1)))) {printf("mismatch at %d, was: %f, should be: %f\n", i, (float)h_odata[i], (float)(K*(1 + (i&1)))); return 1;}
#else
if (h_odata[i] != ((1 + (i&1)))) {printf("mismatch at %d, was: %f, should be: %f\n", i, (float)h_odata[i], (float)((1 + (i&1)))); return 1;}
#endif
return 0;
}
$ nvcc -arch=sm_35 -o t1285 t1285.cu -DMEAN
$ cuda-memcheck ./t1285
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$ nvcc -arch=sm_35 -o t1285 t1285.cu
$ cuda-memcheck ./t1285
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$

How to create and use a 1D layered texture in CUDA

I am new to CUDA. I have figured out how to do 1D and 2D textures in CUDA. However, I am struggling with how to use a 1D layered texture. The output of my kernel which uses the texture is all zeros, which is definitely incorrect. However, I am not sure what I am doing wrong. I have serious doubts that I set up this texture correctly, but I checked for cuda errors everywhere and couldn't find any issues. Can someone show me how to correctly set up a 1D layered texture and use it. Here is my code. Thanks in advance:
// To Compile: nvcc backproj.cu -o backproj.out
// To Run: ./backproj.out
// Includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// Includes CUDA
#include <cuda_runtime.h>
#include <cuda_profiler_api.h>
#define pi acos(-1)
// 1D float textures
texture<float, cudaTextureType1DLayered, cudaReadModeElementType> texRef;
// 1D interpolation kernel: Should be very similar to what you get if you used 1D interpolation on MATLAB
__global__ void interp1Kernel(float* d_output, float* d_locations, int numlocations, int layer) {
unsigned int location_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (location_idx < numlocations) {
// Get the location you want to interpolate from the array
float loc2find = (float) d_locations[location_idx] + 0.5f;
// Read from texture and write to global memory
d_output[location_idx] = tex1DLayered(texRef, loc2find, layer);
}
}
// Host code
int main()
{
// Setup h_data and locations to interpolate from
const unsigned int len = 10;
const unsigned int numlayers = 3;
const unsigned int upsamp = 3;
const unsigned int loclen = 1 + (len - 1) * upsamp;
float idx_spacing = 1/(float)upsamp;
float h_data[len][numlayers], h_loc[loclen];
for (int i = 0; i < len; i++)
for (int j = 0; j < numlayers; j++)
h_data[i][j] = 1+cosf((float) pi*i/(j+1.0f));
for (int i = 0; i < loclen; i ++)
h_loc[i] = i*idx_spacing;
// Get the memory locations you want
float* d_loc;
cudaMalloc(&d_loc, loclen * sizeof(float));
cudaMemcpy(d_loc, h_loc, loclen*sizeof(float), cudaMemcpyHostToDevice);
// Allocate CUDA array in device memory
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaArray* cuArray;
cudaMallocArray(&cuArray, &channelDesc, len, numlayers);
// Copy to device memory some data located at address h_data in host memory
cudaMemcpyToArray(cuArray, 0, 0, h_data, len * numlayers * sizeof(float), cudaMemcpyHostToDevice);
// Set texture reference parameters
texRef.addressMode[0] = cudaAddressModeBorder;
texRef.filterMode = cudaFilterModeLinear;
texRef.normalized = false;
// Bind the array to the texture reference
cudaBindTextureToArray(texRef, cuArray, channelDesc);
// Allocate result of transformation in device memory
float* d_output;
cudaMalloc(&d_output, loclen * sizeof(float));
// Invoke kernel
int thdsPerBlk = 256;
int blksPerGrid = (int) (loclen / thdsPerBlk) + 1;
printf("Threads Per Block: %d, Blocks Per Grid: %d\n", thdsPerBlk, blksPerGrid);
interp1Kernel <<<blksPerGrid, thdsPerBlk >>>(d_output, d_loc, loclen, 0);
// Print Results
printf("\n Original Indices \n");
for (int i = 0; i < len; i++) printf(" %d ", i);
printf("\n Original array \n");
for (int i = 0; i < len; i++) printf("%5.3f ", h_data[i][0]);
printf("\n Output Indices \n");
for (int i = 0; i < loclen; i++) printf("%5.3f ", h_loc[i]);
printf("\n Output Array \n");
cudaMemcpy(h_loc, d_output, loclen * sizeof(float), cudaMemcpyDeviceToHost);
for (int i = 0; i < loclen; i++) printf("%5.3f ", h_loc[i]);
printf("\n");
// Free device memory
cudaFreeArray(cuArray);
cudaFree(d_output);
return 0;
}
You must use cudaMalloc3DArray with the cudaArrayLayered flag set to allocate memory for layered textures. There is a complete example of layered texture usage in the toolkit samples which you can study to see how they work.
Unfortunately, the CUDA SDK only shows you how to do it when you have 2D layered texture. There is some more trickiness when it comes to 1D layered textures. It turns out you have to put a 0 into the second argument for make_cudaExtent when making the extentDesc as follows:
cudaExtent extentDesc = make_cudaExtent(len, 0, numlayers); // <-- 0 height required for 1Dlayered
However, when using make_cudaExtent for mParams.extent for cudaMemcpy3D, you still need to put a 1 for the second argument:
mParams.extent = make_cudaExtent(len, 1, numlayers); // <<-- non zero height required for memcpy to do anything
Furthermore, there are some other non-obvious details such as the pitch for make_cudaPitchedPtr. So I have included my complete and functioning code for the 1D layered texture. I couldn't find an example of this anywhere. So hopefully this will help out others who are in the same boat:
// To Compile: nvcc layeredTexture1D.cu -o layeredTexture1D.out
// To Run: ./layeredTexture1D.out
// Includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// Includes CUDA
#include <cuda_runtime.h>
#include <cuda_profiler_api.h>
#define pi acos(-1)
// 1D float textures: x is for input values, y is for corresponding output values
texture<float, cudaTextureType1DLayered, cudaReadModeElementType> texRef;
// 1D interpolation kernel: Should be very similar to what you get if you used 1D interpolation on MATLAB
__global__ void interp1Kernel(float* d_output, float* d_locations, int numlocations, int numlayers) {
unsigned int location_idx = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int layer = blockIdx.y * blockDim.y + threadIdx.y;
if (location_idx < numlocations && layer < numlayers) {
// Get the location you want to interpolate from the array
float loc2find = (float)d_locations[location_idx] + 0.5f;
// Read from texture and write to global memory
d_output[location_idx + layer*numlocations] = tex1DLayered(texRef, loc2find, layer);
//printf("location=%d layer=%d loc2find=%f result=%f \n", location_idx, layer, loc2find, d_output[location_idx]);
}
}
// Host code
int main()
{
// Setup h_data and locations to interpolate from
const unsigned int len = 7;
const unsigned int numlayers = 3;
const unsigned int upsamp = 4;
const unsigned int loclen = 1 + (len - 1) * upsamp;
float idx_spacing = 1 / (float)upsamp;
float h_data[numlayers*len], h_loc[loclen];
for (int i = 0; i < len; i++)
for (int j = 0; j < numlayers; j++)
h_data[len*j + i] = 1 + cosf((float)pi*i / (j + 1.0f));
for (int i = 0; i < loclen; i++)
h_loc[i] = i*idx_spacing;
// Get the memory locations you want
float* d_loc;
cudaMalloc(&d_loc, loclen * sizeof(float));
cudaMemcpy(d_loc, h_loc, loclen*sizeof(float), cudaMemcpyHostToDevice);
// Allocate CUDA array in device memory
cudaExtent extentDesc = make_cudaExtent(len, 0, numlayers); // <-- 0 height required for 1Dlayered
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaMemcpy3DParms mParams = { 0 };
mParams.srcPtr = make_cudaPitchedPtr(h_data, len*sizeof(float), len, 1);
mParams.kind = cudaMemcpyHostToDevice;
mParams.extent = make_cudaExtent(len, 1, numlayers); // <<-- non zero height required for memcpy to do anything
cudaArray* cuArray;
cudaMalloc3DArray(&cuArray, &channelDesc, extentDesc, cudaArrayLayered);
mParams.dstArray = cuArray;
cudaMemcpy3D(&mParams);
// Set texture reference parameters
texRef.addressMode[0] = cudaAddressModeBorder;
texRef.filterMode = cudaFilterModeLinear;
texRef.normalized = false;
// Bind the array to the texture reference
cudaBindTextureToArray(texRef, cuArray, channelDesc);
// Allocate result of transformation in device memory
float *d_output;
cudaMalloc(&d_output, loclen * numlayers * sizeof(float));
float h_output[loclen * numlayers];
// Invoke kernel
dim3 dimBlock(16, 16, 1);
dim3 dimGrid((loclen + dimBlock.x - 1) / dimBlock.x,
(numlayers + dimBlock.y - 1) / dimBlock.y, 1);
interp1Kernel<<<dimGrid, dimBlock>>>(d_output, d_loc, loclen, numlayers);
// Print Results
printf("\n Original Indices \n");
for (int i = 0; i < len; i++) printf(" %d ", i);
printf("\n Original array \n");
for (int j = 0; j < numlayers; j++) {
for (int i = 0; i < len; i++) {
printf("%5.3f ", h_data[i + j*len]);
}
printf("\n");
}
printf("\n Output Indices \n");
for (int i = 0; i < loclen; i++) printf("%5.3f ", h_loc[i]);
printf("\n Output Array \n");
cudaMemcpy(h_output, d_output, loclen * numlayers * sizeof(float), cudaMemcpyDeviceToHost);
for (int j = 0; j < numlayers; j++) {
for (int i = 0; i < loclen; i++) {
printf("%5.3f ", h_output[i + j*loclen]);
}
printf("\n");
}
printf("\n");
// Free device memory
cudaFreeArray(cuArray);
cudaFree(d_output);
return 0;
}

maximum supported size for cub library

Does anyone know what is the maximum supported size for cub::scan ? I got core dump for input sizes over 500 million. I wanted to make sure I'm not doing anything wrong...
Here is my code:
#define CUB_STDERR
#include <stdio.h>
#include "cub/util_allocator.cuh"
#include "cub/device/device_scan.cuh"
#include <sys/time.h>
using namespace cub;
bool g_verbose = false; // Whether to display input/output to console
CachingDeviceAllocator g_allocator(true); // Caching allocator for device memory
typedef int mytype;
/**
* Solve inclusive-scan problem
*/
static void solve(mytype *h_in, mytype *h_cpu, int n)
{
mytype inclusive = 0;
for (int i = 0; i < n; ++i) {
inclusive += h_in[i];
h_cpu[i] = inclusive;
}
}
static int compare(mytype *h_cpu, mytype *h_o, int n)
{
for (int i = 0; i < n; i++) {
if (h_cpu[i] != h_o[i]) {
return i + 1;
}
}
return 0;
}
/**
* Main
*/
int main(int argc, char** argv)
{
cudaSetDevice(0);
struct timeval start, end;
int num_items = 1073741824;
const int repetitions = 5;
mytype *h_in, *h_out, *h_cpu;
const int size = num_items * sizeof(mytype);
// Allocate host arrays
h_in = (mytype *)malloc(size);
h_out = (mytype *)malloc(size);
h_cpu = (mytype *)malloc(size);
// Initialize problem and solution
for (int i = 0; i < num_items; i++) {
h_in[i] = i;
h_out[i] = 0;
h_cpu[i] = 0;
}
solve(h_in, h_cpu, num_items);
// Allocate problem device arrays
mytype *d_in = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(mytype) * num_items));
// Initialize device input
CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(mytype) * num_items, cudaMemcpyHostToDevice));
// Allocate device output array
mytype *d_out = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(mytype) * num_items));
// Allocate temporary storage
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
CubDebugExit(DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items));
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
// Run
gettimeofday(&start, NULL);
for (long i = 0; i < repetitions; i++)
DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
cudaThreadSynchronize();
gettimeofday(&end, NULL);
double ctime = end.tv_sec + end.tv_usec / 1000000.0 - start.tv_sec - start.tv_usec / 1000000.0;
cudaMemcpy(h_out, d_out, sizeof(mytype) * num_items, cudaMemcpyDeviceToHost);
int cmp = compare(h_cpu, h_out, num_items);
printf("%d\t", num_items);
if (!cmp)
printf("\t%7.4fs \n", ctime);
printf("\n");
if (h_in) delete[] h_in;
if (h_out) delete[] h_out;
if (h_cpu) delete[] h_cpu;
if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
printf("\n\n");
return 0;
}
The problem is here:
const int size = num_items * sizeof(mytype);
And it can be fixed by changing it to:
const size_t size = num_items * sizeof(mytype);
The value of num_items in the code is over 1 Billion. When we multiply that by sizeof(mytype) we are multiplying it by 4, so the result is over 4 Billion. This value cannot be stored in an int variable. If you try to use it anyway like that, then your subsequent host code will do bad things. This problem (the core dump) actually has nothing to do with CUDA. The code would core dump if you removed all the CUB elements.
When I modify the line of code above, and compile for the correct GPU (e.g. -arch=sm_35 in my case, or -arch=sm_52 for a Titan X GPU), then I get the correct answer (and no seg fault/core dump).
In general, the correct starting point when chasing a seg fault/core dump type error, is to recognize that this error arises from host code and you should attempt to localize the exact line of source code that is generating this error. This can be done trivially/tediously by putting many printf statements in your code, until you identify the line of your code after which you don't see any printf output, or by using a host code debugger, such as gdb on linux.
Also note that this code as written will require slightly more than 12GB of memory on the host, and slightly more than 8GB of memory on the GPU, so it will only run properly in such settings.
For reference, here is the fixed code (based on what OP posted here):
#define CUB_STDERR
#include <stdio.h>
#include "cub/util_allocator.cuh"
#include "cub/device/device_scan.cuh"
#include <sys/time.h>
using namespace cub;
bool g_verbose = false; // Whether to display input/output to console
CachingDeviceAllocator g_allocator(true); // Caching allocator for device memory
typedef int mytype;
/**
* Solve inclusive-scan problem
*/
static void solve(mytype *h_in, mytype *h_cpu, int n)
{
mytype inclusive = 0;
for (int i = 0; i < n; ++i) {
inclusive += h_in[i];
h_cpu[i] = inclusive;
}
}
static int compare(mytype *h_cpu, mytype *h_o, int n)
{
for (int i = 0; i < n; i++) {
if (h_cpu[i] != h_o[i]) {
return i + 1;
}
}
return 0;
}
/**
* Main
*/
int main(int argc, char** argv)
{
cudaSetDevice(0);
struct timeval start, end;
int num_items = 1073741824;
const int repetitions = 5;
mytype *h_in, *h_out, *h_cpu;
const size_t size = num_items * sizeof(mytype);
// Allocate host arrays
h_in = (mytype *)malloc(size);
h_out = (mytype *)malloc(size);
h_cpu = (mytype *)malloc(size);
// Initialize problem and solution
for (int i = 0; i < num_items; i++) {
h_in[i] = i;
h_out[i] = 0;
h_cpu[i] = 0;
}
solve(h_in, h_cpu, num_items);
// Allocate problem device arrays
mytype *d_in = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(mytype) * num_items));
// Initialize device input
CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(mytype) * num_items, cudaMemcpyHostToDevice));
// Allocate device output array
mytype *d_out = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(mytype) * num_items));
// Allocate temporary storage
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
CubDebugExit(DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items));
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
// Run
gettimeofday(&start, NULL);
for (long i = 0; i < repetitions; i++)
DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
cudaThreadSynchronize();
gettimeofday(&end, NULL);
double ctime = end.tv_sec + end.tv_usec / 1000000.0 - start.tv_sec - start.tv_usec / 1000000.0;
cudaMemcpy(h_out, d_out, sizeof(mytype) * num_items, cudaMemcpyDeviceToHost);
int cmp = compare(h_cpu, h_out, num_items);
printf("%d\t", num_items);
if (!cmp)
printf("\t%7.4fs \n", ctime);
printf("\n");
if (h_in) delete[] h_in;
if (h_out) delete[] h_out;
if (h_cpu) delete[] h_cpu;
if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
printf("\n\n");
return 0;
}

cuda programming with pthread

#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#define ARR_SIZE 10
#define NUM_DEVICE 1
typedef struct {
int *arr;
int *dev_arr;
int *dev_result;
int *result;
int num;
} cuda_st;
__global__ void kernel_fc(int *dev_arr, int *dev_result)
{
int idx = threadIdx.x;
printf("dev_arr[%d] = %d\n", idx, dev_arr[idx]);
atomicAdd(dev_result, dev_arr[idx]);
}
void *thread_func(void* struc)
{
cuda_st * data = (cuda_st*)struc;
printf("thread %d func start\n", data->num);
printf("arr %d = ", data->num);
for(int i=0; i<10; i++) {
printf("%d ", data->arr[i]);
}
printf("\n");
cudaSetDevice(data->num);
cudaMemcpy(data->dev_arr, data->arr, sizeof(int)*ARR_SIZE, cudaMemcpyHostToDevice);
kernel_fc<<<1,ARR_SIZE>>>(data->dev_arr, data->dev_result);
cudaMemcpy(data->result, data->dev_result, sizeof(int), cudaMemcpyDeviceToHost);
printf("thread %d func exit\n", data->num);
return NULL;
}
int main(void)
{
// Make object
cuda_st cuda[NUM_DEVICE];
// Make thread
pthread_t pthread[NUM_DEVICE];
// Host array memory allocation
int *arr[NUM_DEVICE];
for(int i=0; i<NUM_DEVICE; i++) {
arr[i] = (int*)malloc(sizeof(int)*ARR_SIZE);
}
// Fill this host array up with specified data
for(int i=0; i<NUM_DEVICE; i++) {
for(int j=0; j<ARR_SIZE; j++) {
arr[i][j] = i*ARR_SIZE+j;
}
}
// To confirm host array data
for(int i=0; i<NUM_DEVICE; i++) {
printf("arr[%d] = ", i);
for(int j=0; j<ARR_SIZE; j++) {
printf("%d ", arr[i][j]);
}
printf("\n");
}
// Result memory allocation
int *result[NUM_DEVICE];
for(int i=0; i<NUM_DEVICE; i++) {
result[i] = (int*)malloc(sizeof(int));
memset(result[i], 0, sizeof(int));
}
// Device array memory allocation
int *dev_arr[NUM_DEVICE];
for(int i=0; i<NUM_DEVICE; i++) {
cudaMalloc(&dev_arr[i], sizeof(int)*ARR_SIZE);
}
// Device result memory allocation
int *dev_result[NUM_DEVICE];
for(int i=0; i<NUM_DEVICE; i++) {
cudaMalloc(&dev_result[i], sizeof(int));
cudaMemset(dev_result[i], 0, sizeof(int));
}
// Connect these pointers with object
for(int i=0; i<NUM_DEVICE; i++) {
cuda[i].arr = arr[i];
cuda[i].dev_arr = dev_arr[i];
cuda[i].result = result[i];
cuda[i].dev_result = dev_result[i];
cuda[i].num = i;
}
// Create and excute pthread
for(int i=0; i<NUM_DEVICE; i++) {
pthread_create(&pthread[i], NULL, thread_func, (void*)&cuda[i]);
}
// Join pthread
for(int i=0; i<NUM_DEVICE; i++) {
pthread_join(pthread[i], NULL);
}
for(int i=0; i<NUM_DEVICE; i++) {
printf("result[%d] = %d\n", i, (*cuda[i].result));
}
return 0;
}
I make my simple-test-program like this to test pthread with multi device cuda code.
When the NUM_DEVICE set as 1, it works well but when set as 2 program stopped.
I guess beacause multiple threads access cudaSetDevice but I don't know how to handle this.
I tried to make my program with single host thread and multi device(with Async function) before, but in my case(not above simple code), there are many host code between kernel functions so it doesn't work well asynchronously.
So I test to use multi thread on host before apply this manner to my real code but I have trouble like this.
Do I have to use asynchonous function in cuda functions and kernels?
Give me some advise.
The problem is that you allocate memory on one device. You need to call cudaSetDevice before cudaMalloc calls:
// Device array memory allocation
int *dev_arr[NUM_DEVICE];
for(int i=0; i<NUM_DEVICE; i++) {
cudaSetDevice(i);
cudaMalloc(&dev_arr[i], sizeof(int)*ARR_SIZE);
}
// Device result memory allocation
int *dev_result[NUM_DEVICE];
for(int i=0; i<NUM_DEVICE; i++) {
cudaSetDevice(i);
cudaMalloc(&dev_result[i], sizeof(int));
cudaMemset(dev_result[i], 0, sizeof(int));
}