cuda programming with pthread - cuda

#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#define ARR_SIZE 10
#define NUM_DEVICE 1
typedef struct {
int *arr;
int *dev_arr;
int *dev_result;
int *result;
int num;
} cuda_st;
__global__ void kernel_fc(int *dev_arr, int *dev_result)
{
int idx = threadIdx.x;
printf("dev_arr[%d] = %d\n", idx, dev_arr[idx]);
atomicAdd(dev_result, dev_arr[idx]);
}
void *thread_func(void* struc)
{
cuda_st * data = (cuda_st*)struc;
printf("thread %d func start\n", data->num);
printf("arr %d = ", data->num);
for(int i=0; i<10; i++) {
printf("%d ", data->arr[i]);
}
printf("\n");
cudaSetDevice(data->num);
cudaMemcpy(data->dev_arr, data->arr, sizeof(int)*ARR_SIZE, cudaMemcpyHostToDevice);
kernel_fc<<<1,ARR_SIZE>>>(data->dev_arr, data->dev_result);
cudaMemcpy(data->result, data->dev_result, sizeof(int), cudaMemcpyDeviceToHost);
printf("thread %d func exit\n", data->num);
return NULL;
}
int main(void)
{
// Make object
cuda_st cuda[NUM_DEVICE];
// Make thread
pthread_t pthread[NUM_DEVICE];
// Host array memory allocation
int *arr[NUM_DEVICE];
for(int i=0; i<NUM_DEVICE; i++) {
arr[i] = (int*)malloc(sizeof(int)*ARR_SIZE);
}
// Fill this host array up with specified data
for(int i=0; i<NUM_DEVICE; i++) {
for(int j=0; j<ARR_SIZE; j++) {
arr[i][j] = i*ARR_SIZE+j;
}
}
// To confirm host array data
for(int i=0; i<NUM_DEVICE; i++) {
printf("arr[%d] = ", i);
for(int j=0; j<ARR_SIZE; j++) {
printf("%d ", arr[i][j]);
}
printf("\n");
}
// Result memory allocation
int *result[NUM_DEVICE];
for(int i=0; i<NUM_DEVICE; i++) {
result[i] = (int*)malloc(sizeof(int));
memset(result[i], 0, sizeof(int));
}
// Device array memory allocation
int *dev_arr[NUM_DEVICE];
for(int i=0; i<NUM_DEVICE; i++) {
cudaMalloc(&dev_arr[i], sizeof(int)*ARR_SIZE);
}
// Device result memory allocation
int *dev_result[NUM_DEVICE];
for(int i=0; i<NUM_DEVICE; i++) {
cudaMalloc(&dev_result[i], sizeof(int));
cudaMemset(dev_result[i], 0, sizeof(int));
}
// Connect these pointers with object
for(int i=0; i<NUM_DEVICE; i++) {
cuda[i].arr = arr[i];
cuda[i].dev_arr = dev_arr[i];
cuda[i].result = result[i];
cuda[i].dev_result = dev_result[i];
cuda[i].num = i;
}
// Create and excute pthread
for(int i=0; i<NUM_DEVICE; i++) {
pthread_create(&pthread[i], NULL, thread_func, (void*)&cuda[i]);
}
// Join pthread
for(int i=0; i<NUM_DEVICE; i++) {
pthread_join(pthread[i], NULL);
}
for(int i=0; i<NUM_DEVICE; i++) {
printf("result[%d] = %d\n", i, (*cuda[i].result));
}
return 0;
}
I make my simple-test-program like this to test pthread with multi device cuda code.
When the NUM_DEVICE set as 1, it works well but when set as 2 program stopped.
I guess beacause multiple threads access cudaSetDevice but I don't know how to handle this.
I tried to make my program with single host thread and multi device(with Async function) before, but in my case(not above simple code), there are many host code between kernel functions so it doesn't work well asynchronously.
So I test to use multi thread on host before apply this manner to my real code but I have trouble like this.
Do I have to use asynchonous function in cuda functions and kernels?
Give me some advise.

The problem is that you allocate memory on one device. You need to call cudaSetDevice before cudaMalloc calls:
// Device array memory allocation
int *dev_arr[NUM_DEVICE];
for(int i=0; i<NUM_DEVICE; i++) {
cudaSetDevice(i);
cudaMalloc(&dev_arr[i], sizeof(int)*ARR_SIZE);
}
// Device result memory allocation
int *dev_result[NUM_DEVICE];
for(int i=0; i<NUM_DEVICE; i++) {
cudaSetDevice(i);
cudaMalloc(&dev_result[i], sizeof(int));
cudaMemset(dev_result[i], 0, sizeof(int));
}

Related

Is changing the device in a CUDA Graph node unavailable?

I have tried to change the current device in CUDA graphs by creating this host node:
cudaGraph_t graph;
// Node #1: Create the 1st setDevice
cudaHostNodeParams hostNodeParams = {0};
memset(&hostNodeParams, 0, sizeof(hostNodeParams));
hostNodeParams.fn = [](void *data) {
int passed_device_ordinal = *(int *)(data);
cout << "CUDA-Graph: in the host node: changing the device to: "
<< passed_device_ordinal << endl;
CUDA_CHECK(cudaSetDevice(passed_device_ordinal));
};
hostNodeParams.userData = (void *)&device_1;
// Node #1: Add the 1st setDevice
CUDA_CHECK(cudaGraphAddHostNode(&setDevice_1, graph, &copy_0to1, 1,
&hostNodeParams));
When running the code, I get this output:
CUDA-Graph: in the host node: changing the device to: 1
Error operation not permitted at line 68 in file src/MultiGPU.cu
Is it possible to change the device within a CUDA graph?
During the execution of a graph, the current device cannot be changed via a host callback, since callbacks are not allowed to make cuda api calls.
There are two ways to specify the device on which a kernel within the graph will execute.
Use stream-capture to create a multi-gpu graph.
When manually constructing the graph, nodes will be assigned to the currently active device. Use cudaSetDevice before adding your kernel.
The following code demonstrates both with a simple pipeline which executes (kernel, memcpy to host, host callback) on each gpu.
#include <thread>
#include <future>
#include <chrono>
#include <array>
#include <vector>
#include <cassert>
__global__
void kernel(int* data){
*data = 42;
}
struct CallbackData{
int* pinnedBuffer;
std::vector<int>* vec;
};
void callback(void* args){
CallbackData* data = static_cast<CallbackData*>(args);
data->vec->push_back(*data->pinnedBuffer);
}
int main(){
constexpr int numDevices = 2;
std::array<int, numDevices> deviceIds{0,1};
constexpr int numIterations = 100;
std::array<cudaStream_t, numDevices> streams{};
std::array<cudaEvent_t, numDevices> events{};
std::array<int*, numDevices> deviceBuffers{};
std::array<int*, numDevices> pinnedBuffers{};
std::array<std::vector<int>, numDevices> vectors{};
std::array<CallbackData, numDevices> callbackArgs{};
for(int i = 0; i < numDevices; i++){
cudaSetDevice(deviceIds[i]);
cudaStreamCreate(&streams[i]);
cudaEventCreate(&events[i], cudaEventDisableTiming);
cudaMalloc(&deviceBuffers[i], sizeof(int));
cudaMallocHost(&pinnedBuffers[i], sizeof(int));
vectors[i].reserve(numIterations);
callbackArgs[i].pinnedBuffer = pinnedBuffers[i];
callbackArgs[i].vec = &vectors[i];
}
cudaSetDevice(deviceIds[0]);
cudaStream_t mainstream;
cudaStreamCreate(&mainstream);
cudaEvent_t mainevent;
cudaEventCreate(&mainevent, cudaEventDisableTiming);
auto launch = [&](){
cudaEventRecord(mainevent, mainstream);
for(int i = 0; i < numDevices; i++){
cudaSetDevice(deviceIds[i]);
auto& stream = streams[i];
cudaStreamWaitEvent(stream, mainevent);
for(int k = 0; k < numIterations; k++){
kernel<<<1,1,0,stream>>>(deviceBuffers[i]);
cudaMemcpyAsync(pinnedBuffers[i], deviceBuffers[i], sizeof(int), cudaMemcpyDeviceToHost, stream);
cudaLaunchHostFunc(stream, callback, (void*)&callbackArgs[i]);
}
cudaEventRecord(events[i], stream);
cudaStreamWaitEvent(mainstream, events[i]);
}
cudaSetDevice(deviceIds[0]);
};
// no graph
launch();
cudaStreamSynchronize(mainstream);
for(int i = 0; i < numDevices; i++){
assert(vectors[i].size() == numIterations);
for(auto x : vectors[i]){
assert(x == 42);
}
vectors[i].clear();
}
//stream capture graph
{
cudaStreamBeginCapture(mainstream, cudaStreamCaptureModeRelaxed);
launch();
cudaGraph_t graph;
cudaStreamEndCapture(mainstream, &graph);
cudaGraphExec_t execGraph;
cudaGraphNode_t errorNode;
cudaError_t status = cudaGraphInstantiate(&execGraph, graph, &errorNode, nullptr, 0);
assert(status == cudaSuccess) ;
cudaGraphDestroy(graph);
cudaGraphLaunch(execGraph, mainstream);
cudaStreamSynchronize(mainstream);
for(int i = 0; i < numDevices; i++){
assert(vectors[i].size() == numIterations);
for(auto x : vectors[i]){
assert(x == 42);
}
vectors[i].clear();
}
cudaGraphExecDestroy(execGraph);
}
//construct graph manually
{
cudaGraph_t graph;
cudaGraphCreate(&graph, 0);
for(int i = 0; i < numDevices; i++){
cudaSetDevice(deviceIds[i]);
cudaGraphNode_t* prev = nullptr;
cudaGraphNode_t kernelNode;
cudaGraphNode_t memcpyNode;
cudaGraphNode_t hostNode;
cudaKernelNodeParams kernelNodeParams{};
kernelNodeParams.func = (void *)kernel;
kernelNodeParams.gridDim = dim3(1, 1, 1);
kernelNodeParams.blockDim = dim3(1, 1, 1);
kernelNodeParams.sharedMemBytes = 0;
void *kernelArgs[1] = {(void *)&deviceBuffers[i]};
kernelNodeParams.kernelParams = kernelArgs;
kernelNodeParams.extra = NULL;
cudaHostNodeParams hostNodeParams{};
hostNodeParams.fn = callback;
hostNodeParams.userData = &callbackArgs[i];
for(int k = 0; k < numIterations; k++){
cudaGraphAddKernelNode(&kernelNode, graph, prev, (prev == nullptr ? 0 : 1), &kernelNodeParams);
cudaGraphAddMemcpyNode1D(&memcpyNode, graph, &kernelNode, 1, pinnedBuffers[i], deviceBuffers[i], sizeof(int), cudaMemcpyDeviceToHost);
cudaGraphAddHostNode(&hostNode, graph, &memcpyNode, 1, &hostNodeParams);
prev = &hostNode;
}
cudaSetDevice(deviceIds[0]);
}
cudaGraphExec_t execGraph;
cudaGraphNode_t errorNode;
cudaError_t status = cudaGraphInstantiate(&execGraph, graph, &errorNode, nullptr, 0);
assert(status == cudaSuccess) ;
cudaGraphDestroy(graph);
cudaGraphLaunch(execGraph, mainstream);
cudaStreamSynchronize(mainstream);
for(int i = 0; i < numDevices; i++){
assert(vectors[i].size() == numIterations);
for(auto x : vectors[i]){
assert(x == 42);
}
vectors[i].clear();
}
cudaGraphExecDestroy(execGraph);
}
cudaEventDestroy(mainevent);
cudaStreamDestroy(mainstream);
for(int i = 0; i < numDevices; i++){
cudaSetDevice(deviceIds[i]);
cudaStreamDestroy(streams[i]);
cudaEventDestroy(events[i]);
cudaFree(deviceBuffers[i]);
cudaFreeHost(pinnedBuffers[i]);
}
}

how to create a matrix in gpu and print it on cpu?

This is a code to create a matrix on gpu and print it out on cpu. Can anyone tell me where am I going wrong. Thank you.
# include <stdio.h>
__global__ void create(int **d_a){
int i = threadIdx.x;
int j = threadIdx.y;
d_a[i][j] = 1;
}
void errorCheck(){
cudaError_t error = cudaGetLastError();
if(error != cudaSuccess){
// print the CUDA error message and exit
printf("CUDA error: %s\n", cudaGetErrorString(error));
exit(-1);
}
}
# define N 5
int main(){
int **d_a, **a;
a = (int**)malloc(N * sizeof(int*));
for (int i =0; i < N; i++){
a[i] = (int*)malloc(N*sizeof(int));
}
cudaMalloc((void***)&d_a, N*sizeof(int*));
for (int i =0; i < N; i++){
cudaMalloc((void**)&d_a,N*sizeof(int));
}
errorCheck();
create <<<1, N>>>(d_a);
errorCheck();
cudaMemcpy(a, d_a, (N*N)*sizeof(int),cudaMemcpyDeviceToHost);
for (int i =0; i < N; i++ ){
for (int j = 0; j < N; j++ ){
printf("%d", a[i][j]);
}
printf("\n");
}
cudaFree(d_a);
free(a);
return 0;
}
Is there something wrong with memory allocation or memcpy ?
Is there something wrong with memory allocation or memcpy ?
Yes on both counts.
This:
cudaMalloc((void***)&d_a, N*sizeof(int*));
for (int i =0; i < N; i++){
cudaMalloc((void**)&d_a,N*sizeof(int));
}
would have to be done like this:
cudaMalloc((void***)&d_a, N*sizeof(int*));
for (int i=0; i < N; i++){
int *row;
cudaMalloc((void**)&row, N*sizeof(int));
cudaMemcpy(d_a+i, &row, sizeof(int*), cudaMemcpyHostToDevice);
}
And then this:
create <<<1, N>>>(d_a);
errorCheck();
cudaMemcpy(a, d_a, (N*N)*sizeof(int),cudaMemcpyDeviceToHost);
would have to be done like this:
create <<<1, dim3(N,N)>>>(d_a);
errorCheck();
for(int i=0; i<N; i++) {
int* row;
cudaMemcpy(&row, d_a+i, sizeof(int*), cudaMemcpyDeviceToHost);
cudaMemcpy(a[i], row, sizeof(int) * N, cudaMemcpyDeviceToHost);
}
[All code written in browser and not tested, use at own risk]
In short, you have decided to work with an array of pointers. This requires additional CUDA API operations because the row pointers in the GPU copy are not accessible on the host by standard assignment. You must use cudaMemcpy in every case.

Is Concurrent cudaMemcpyAsync possible?

I'm writing some test code to get familiar with the concurrent attributes of cudaMemcpyAsync.
When I was trying to do concurrent cudaMemcpyAsync in a single context, the copy operations are queuing up and get executed one by one with throughput 12.4 GB/s, which is consistent with the answer here:
But when I tried to do concurrent cudaMemcpyAsync in different contexts (by separating them into 4 processes), it seems that the first and the last one are running concurrently:
The first 2 sequential cudaMemcpyAsync are running with a throughput 12.4 GB/s while the last 2 concurrent ones are running with a throughput 5.3 GB/s.
How can I do concurrent cudaMemcpyAsync within single context?
I'm using CUDA9.0 on TITAN Xp, which has 2 copy engines.
EDIT:
Code for scenario 1:
#include <stdio.h>
#include <pthread.h>
#include <stdlib.h>
#include <assert.h>
#include <time.h>
inline
cudaError_t checkCuda(cudaError_t result)
{
if (result != cudaSuccess) {
fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
assert(result == cudaSuccess);
}
return result;
}
const int nStreams = 8;
const int N = 100000000;
const int bytes = N * sizeof(int);
int* arr_H;
int* arr_D[nStreams];
cudaStream_t stream[nStreams];
int args[nStreams];
pthread_t threads[nStreams];
void* worker(void *arg)
{
int i = *((int *)arg);
checkCuda(cudaMemcpyAsync(arr_D[i], arr_H, bytes, cudaMemcpyHostToDevice, stream[i]));
return NULL;
}
int main()
{
for(int i = 0; i < nStreams; i++)
checkCuda(cudaStreamCreate(&stream[i]));
checkCuda(cudaMallocHost((void**)&arr_H, bytes));
for (int i = 0; i < N; i++)
arr_H[i] = random();
for (int i = 0; i < nStreams; i++)
checkCuda(cudaMalloc((void**)&arr_D[i], bytes));
for (int i = 0; i < nStreams; i++) {
args[i] = i;
pthread_create(&threads[i], NULL, worker, &args[i]);
}
for (int i = 0; i < nStreams; i++)
pthread_join(threads[i], NULL);
cudaFreeHost(arr_H);
for (int i = 0; i < nStreams; i++) {
checkCuda(cudaStreamDestroy(stream[i]));
cudaFree(arr_D[i]);
}
return 0;
Code for scenario 2:
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <time.h>
inline
cudaError_t checkCuda(cudaError_t result)
{
if (result != cudaSuccess) {
fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
assert(result == cudaSuccess);
}
return result;
}
int main()
{
const int nStreams = 1;
const int N = 100000000;
const int bytes = N * sizeof(int);
int* arr_H;
int* arr_D[nStreams];
cudaStream_t stream[nStreams];
for(int i = 0; i < nStreams; i++)
checkCuda(cudaStreamCreate(&stream[i]));
checkCuda(cudaMallocHost((void**)&arr_H, bytes));
for (int i = 0; i < N; i++)
arr_H[i] = random();
for (int i = 0; i < nStreams; i++)
checkCuda(cudaMalloc((void**)&arr_D[i], bytes));
for (int i = 0; i < nStreams; i++)
checkCuda(cudaMemcpyAsync(arr_D[i], arr_H, bytes, cudaMemcpyHostToDevice, stream[i]));
cudaFreeHost(arr_H);
for (int i = 0; i < nStreams; i++) {
checkCuda(cudaStreamDestroy(stream[i]));
cudaFree(arr_D[i]);
}
return 0;
}
Code 2 is basically copied from Code 1. I used a python script to run multiple processes concurrently:
#!/usr/bin/env python3
import subprocess
N = 4
processes = [subprocess.Popen('./a.out', shell=True) for _ in range(N)]
for process in processes:
process.wait()

How to create and use a 1D layered texture in CUDA

I am new to CUDA. I have figured out how to do 1D and 2D textures in CUDA. However, I am struggling with how to use a 1D layered texture. The output of my kernel which uses the texture is all zeros, which is definitely incorrect. However, I am not sure what I am doing wrong. I have serious doubts that I set up this texture correctly, but I checked for cuda errors everywhere and couldn't find any issues. Can someone show me how to correctly set up a 1D layered texture and use it. Here is my code. Thanks in advance:
// To Compile: nvcc backproj.cu -o backproj.out
// To Run: ./backproj.out
// Includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// Includes CUDA
#include <cuda_runtime.h>
#include <cuda_profiler_api.h>
#define pi acos(-1)
// 1D float textures
texture<float, cudaTextureType1DLayered, cudaReadModeElementType> texRef;
// 1D interpolation kernel: Should be very similar to what you get if you used 1D interpolation on MATLAB
__global__ void interp1Kernel(float* d_output, float* d_locations, int numlocations, int layer) {
unsigned int location_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (location_idx < numlocations) {
// Get the location you want to interpolate from the array
float loc2find = (float) d_locations[location_idx] + 0.5f;
// Read from texture and write to global memory
d_output[location_idx] = tex1DLayered(texRef, loc2find, layer);
}
}
// Host code
int main()
{
// Setup h_data and locations to interpolate from
const unsigned int len = 10;
const unsigned int numlayers = 3;
const unsigned int upsamp = 3;
const unsigned int loclen = 1 + (len - 1) * upsamp;
float idx_spacing = 1/(float)upsamp;
float h_data[len][numlayers], h_loc[loclen];
for (int i = 0; i < len; i++)
for (int j = 0; j < numlayers; j++)
h_data[i][j] = 1+cosf((float) pi*i/(j+1.0f));
for (int i = 0; i < loclen; i ++)
h_loc[i] = i*idx_spacing;
// Get the memory locations you want
float* d_loc;
cudaMalloc(&d_loc, loclen * sizeof(float));
cudaMemcpy(d_loc, h_loc, loclen*sizeof(float), cudaMemcpyHostToDevice);
// Allocate CUDA array in device memory
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaArray* cuArray;
cudaMallocArray(&cuArray, &channelDesc, len, numlayers);
// Copy to device memory some data located at address h_data in host memory
cudaMemcpyToArray(cuArray, 0, 0, h_data, len * numlayers * sizeof(float), cudaMemcpyHostToDevice);
// Set texture reference parameters
texRef.addressMode[0] = cudaAddressModeBorder;
texRef.filterMode = cudaFilterModeLinear;
texRef.normalized = false;
// Bind the array to the texture reference
cudaBindTextureToArray(texRef, cuArray, channelDesc);
// Allocate result of transformation in device memory
float* d_output;
cudaMalloc(&d_output, loclen * sizeof(float));
// Invoke kernel
int thdsPerBlk = 256;
int blksPerGrid = (int) (loclen / thdsPerBlk) + 1;
printf("Threads Per Block: %d, Blocks Per Grid: %d\n", thdsPerBlk, blksPerGrid);
interp1Kernel <<<blksPerGrid, thdsPerBlk >>>(d_output, d_loc, loclen, 0);
// Print Results
printf("\n Original Indices \n");
for (int i = 0; i < len; i++) printf(" %d ", i);
printf("\n Original array \n");
for (int i = 0; i < len; i++) printf("%5.3f ", h_data[i][0]);
printf("\n Output Indices \n");
for (int i = 0; i < loclen; i++) printf("%5.3f ", h_loc[i]);
printf("\n Output Array \n");
cudaMemcpy(h_loc, d_output, loclen * sizeof(float), cudaMemcpyDeviceToHost);
for (int i = 0; i < loclen; i++) printf("%5.3f ", h_loc[i]);
printf("\n");
// Free device memory
cudaFreeArray(cuArray);
cudaFree(d_output);
return 0;
}
You must use cudaMalloc3DArray with the cudaArrayLayered flag set to allocate memory for layered textures. There is a complete example of layered texture usage in the toolkit samples which you can study to see how they work.
Unfortunately, the CUDA SDK only shows you how to do it when you have 2D layered texture. There is some more trickiness when it comes to 1D layered textures. It turns out you have to put a 0 into the second argument for make_cudaExtent when making the extentDesc as follows:
cudaExtent extentDesc = make_cudaExtent(len, 0, numlayers); // <-- 0 height required for 1Dlayered
However, when using make_cudaExtent for mParams.extent for cudaMemcpy3D, you still need to put a 1 for the second argument:
mParams.extent = make_cudaExtent(len, 1, numlayers); // <<-- non zero height required for memcpy to do anything
Furthermore, there are some other non-obvious details such as the pitch for make_cudaPitchedPtr. So I have included my complete and functioning code for the 1D layered texture. I couldn't find an example of this anywhere. So hopefully this will help out others who are in the same boat:
// To Compile: nvcc layeredTexture1D.cu -o layeredTexture1D.out
// To Run: ./layeredTexture1D.out
// Includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// Includes CUDA
#include <cuda_runtime.h>
#include <cuda_profiler_api.h>
#define pi acos(-1)
// 1D float textures: x is for input values, y is for corresponding output values
texture<float, cudaTextureType1DLayered, cudaReadModeElementType> texRef;
// 1D interpolation kernel: Should be very similar to what you get if you used 1D interpolation on MATLAB
__global__ void interp1Kernel(float* d_output, float* d_locations, int numlocations, int numlayers) {
unsigned int location_idx = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int layer = blockIdx.y * blockDim.y + threadIdx.y;
if (location_idx < numlocations && layer < numlayers) {
// Get the location you want to interpolate from the array
float loc2find = (float)d_locations[location_idx] + 0.5f;
// Read from texture and write to global memory
d_output[location_idx + layer*numlocations] = tex1DLayered(texRef, loc2find, layer);
//printf("location=%d layer=%d loc2find=%f result=%f \n", location_idx, layer, loc2find, d_output[location_idx]);
}
}
// Host code
int main()
{
// Setup h_data and locations to interpolate from
const unsigned int len = 7;
const unsigned int numlayers = 3;
const unsigned int upsamp = 4;
const unsigned int loclen = 1 + (len - 1) * upsamp;
float idx_spacing = 1 / (float)upsamp;
float h_data[numlayers*len], h_loc[loclen];
for (int i = 0; i < len; i++)
for (int j = 0; j < numlayers; j++)
h_data[len*j + i] = 1 + cosf((float)pi*i / (j + 1.0f));
for (int i = 0; i < loclen; i++)
h_loc[i] = i*idx_spacing;
// Get the memory locations you want
float* d_loc;
cudaMalloc(&d_loc, loclen * sizeof(float));
cudaMemcpy(d_loc, h_loc, loclen*sizeof(float), cudaMemcpyHostToDevice);
// Allocate CUDA array in device memory
cudaExtent extentDesc = make_cudaExtent(len, 0, numlayers); // <-- 0 height required for 1Dlayered
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaMemcpy3DParms mParams = { 0 };
mParams.srcPtr = make_cudaPitchedPtr(h_data, len*sizeof(float), len, 1);
mParams.kind = cudaMemcpyHostToDevice;
mParams.extent = make_cudaExtent(len, 1, numlayers); // <<-- non zero height required for memcpy to do anything
cudaArray* cuArray;
cudaMalloc3DArray(&cuArray, &channelDesc, extentDesc, cudaArrayLayered);
mParams.dstArray = cuArray;
cudaMemcpy3D(&mParams);
// Set texture reference parameters
texRef.addressMode[0] = cudaAddressModeBorder;
texRef.filterMode = cudaFilterModeLinear;
texRef.normalized = false;
// Bind the array to the texture reference
cudaBindTextureToArray(texRef, cuArray, channelDesc);
// Allocate result of transformation in device memory
float *d_output;
cudaMalloc(&d_output, loclen * numlayers * sizeof(float));
float h_output[loclen * numlayers];
// Invoke kernel
dim3 dimBlock(16, 16, 1);
dim3 dimGrid((loclen + dimBlock.x - 1) / dimBlock.x,
(numlayers + dimBlock.y - 1) / dimBlock.y, 1);
interp1Kernel<<<dimGrid, dimBlock>>>(d_output, d_loc, loclen, numlayers);
// Print Results
printf("\n Original Indices \n");
for (int i = 0; i < len; i++) printf(" %d ", i);
printf("\n Original array \n");
for (int j = 0; j < numlayers; j++) {
for (int i = 0; i < len; i++) {
printf("%5.3f ", h_data[i + j*len]);
}
printf("\n");
}
printf("\n Output Indices \n");
for (int i = 0; i < loclen; i++) printf("%5.3f ", h_loc[i]);
printf("\n Output Array \n");
cudaMemcpy(h_output, d_output, loclen * numlayers * sizeof(float), cudaMemcpyDeviceToHost);
for (int j = 0; j < numlayers; j++) {
for (int i = 0; i < loclen; i++) {
printf("%5.3f ", h_output[i + j*loclen]);
}
printf("\n");
}
printf("\n");
// Free device memory
cudaFreeArray(cuArray);
cudaFree(d_output);
return 0;
}

Separating even and odd numbers in CUDA

I have an array of numbers as {1,2,3,4,5,6,7,8,9,10} and I want to separate even and odd numbers as:
even = {2,4,6,8}
and:
odd = {1,3,5,7}
I am aware of atomic operations in CUDA, and also aware that the output is not expected to suffer from race conditions. I don't want to use atomic operations. How can I achieve this without using atomic keywords?
CODE:
#include <stdio.h>
#include <cuda.h>
// Kernel that executes on the CUDA device
__global__ void square_array(float *total,float *even,float *odd, int N)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int a=total[idx];
if ((a%2)==0)
{
for (int i=0;i<=idx;i++)
{
int b = even[i];
if(b==0)
{
even[i] = total[idx];
break;
}
}
}
else
{
for (int i=0;i<idx;i++)
{
int c = odd[i];
odd[i] = total[idx];
break;
}
}
}
// main routine that executes on the host
int main(void)
{
float *total_h,*even_h, *odd_h,*total_d, *even_d,*odd_d; // Pointer to host & device arrays
const int N = 10; // Number of elements in arrays
size_t size = N * sizeof(float);
total_h = (float *)malloc(size); // Allocate array on host
even_h = (float *)malloc(size); // Allocate array on host
odd_h = (float *)malloc(size); // Allocate array on host
cudaMalloc((void **) &total_d, size);
cudaMalloc((void **) &even_d, size);
cudaMemset(even_d,0,size);
cudaMalloc((void **) &odd_d, size); // Allocate array on device
cudaMemset(odd_d,0,size);
// Initialize host array and copy it to CUDA device
for (int i=0; i<N; i++) total_h[i] = (float)i+1;
cudaMemcpy(total_d, total_h, size, cudaMemcpyHostToDevice);
// Do calculation on device:
square_array <<< 1,10 >>> (total_d,even_d,odd_d, N);
// Retrieve result from device and store it in host array
cudaMemcpy(even_h, even_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
cudaMemcpy(odd_h, odd_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
// Print results
printf("total Numbers\n");
for (int i=0; i<N; i++) printf("%f\n",total_h[i]);
printf("EVEN Numbers\n");
for (int i=0; i<N; i++) printf("%f\n",even_h[i]);
printf("ODD Numbers\n");
for (int i=0; i<N; i++) printf("%f\n",odd_h[i]);
// Cleanup
free(total_h);
free(even_h);
free(odd_h);
cudaFree(total_d);
cudaFree(even_d);
cudaFree(odd_d);
}
OUTPUT:
As suggested by Jared Hoberock, it would be much more easy to use the efficient partitioning algorithm available in CUDA Thrust instead of starting the development of a partitioning routine of your own. Below, please find a complete worked example.
#include <thrust\device_vector.h>
#include <thrust\partition.h>
#include <thrust\execution_policy.h>
struct is_even { __host__ __device__ bool operator()(const int &x) { return (x % 2) == 0; } };
void main() {
const int N = 10;
thrust::host_vector<int> h_data(N);
for (int i=0; i<N; i++) h_data[i] = i;
thrust::device_vector<int> d_data(h_data);
thrust::device_vector<int> d_evens(N/2);
thrust::device_vector<int> d_odds(N/2);
thrust::partition_copy(d_data.begin(), d_data.end(), d_evens.begin(), d_odds.begin(), is_even());
printf("Even numbers\n");
for (int i=0; i<N/2; i++) {
int val = d_evens[i];
printf("evens[%i] = %i\n",i,val);
}
printf("Odd numbers\n");
for (int i=0; i<N/2; i++) {
int val = d_odds[i];
printf("odds[%i] = %i\n",i,val);
}
}