GPU reduction code only runs one time - cuda

I have been using the code sample supplied by Robert Crovella in:
thrust::max_element slow in comparison cublasIsamax - More efficient implementation?
which is a very fast reduction code. I modified it to also return the index of the max in the input array of floats. When I use it in my code, it only executes correctly once; if I call the routine again it does not find a new max value, it just returns the previous max. Is there something about the volatile global memory that the routine uses that needs to be reset before it can be called again?
#include <cuda.h>
#include <cublas_v2.h>
#include <thrust/extrema.h>
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <stdio.h>
#include <stdlib.h>
#define DSIZE 4096*4 // nTPB should be a power-of-2
#define nTPB 512
#define MAX_KERNEL_BLOCKS 30
#define MAX_BLOCKS ((DSIZE/nTPB)+1)
#define MIN(a,b) ((a>b)?b:a)
#define FLOAT_MIN -1.0f
#include <helper_functions.h>
#include <helper_cuda.h>
// this code has been modified to return the index of the max instead of the actual max value - for my application
__device__ volatile float blk_vals[MAX_BLOCKS];
__device__ volatile int blk_idxs[MAX_BLOCKS];
__device__ int blk_num = 0;
//template <typename T>
__global__ void max_idx_kernel(const float *data, const int dsize, int *result){
__shared__ volatile float vals[nTPB];
__shared__ volatile int idxs[nTPB];
__shared__ volatile int last_block;
int idx = threadIdx.x+blockDim.x*blockIdx.x;
last_block = 0;
float my_val = FLOAT_MIN;
int my_idx = -1;
// sweep from global memory
while (idx < dsize){
if (data[idx] > my_val) {my_val = data[idx]; my_idx = idx;}
idx += blockDim.x*gridDim.x;}
// populate shared memory
vals[threadIdx.x] = my_val;
idxs[threadIdx.x] = my_idx;
__syncthreads();
// sweep in shared memory
for (int i = (nTPB>>1); i > 0; i>>=1){
if (threadIdx.x < i)
if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }
__syncthreads();}
// perform block-level reduction
if (!threadIdx.x){
blk_vals[blockIdx.x] = vals[0];
blk_idxs[blockIdx.x] = idxs[0];
if (atomicAdd(&blk_num, 1) == gridDim.x - 1) // then I am the last block
last_block = 1;}
__syncthreads();
if (last_block){
idx = threadIdx.x;
my_val = FLOAT_MIN;
my_idx = -1;
while (idx < gridDim.x){
if (blk_vals[idx] > my_val) {my_val = blk_vals[idx]; my_idx = blk_idxs[idx]; }
idx += blockDim.x;}
// populate shared memory
vals[threadIdx.x] = my_val;
idxs[threadIdx.x] = my_idx;
__syncthreads();
// sweep in shared memory
for (int i = (nTPB>>1); i > 0; i>>=1){
if (threadIdx.x < i)
if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }
__syncthreads();}
if (!threadIdx.x)
*result = idxs[0];
}
}
int main(){
int nrElements = DSIZE;
float *d_vector, *h_vector;
StopWatchInterface *hTimer = NULL;
sdkCreateTimer(&hTimer);
double gpuTime;
int k;
int max_index;
int *d_max_index;
cudaMalloc(&d_max_index, sizeof(int));
h_vector = new float[DSIZE];
for(k=0; k < 5; k++){
for (int i = 0; i < DSIZE; i++) h_vector[i] = rand()/(float)RAND_MAX;
h_vector[10+k] = 10; // create definite max element that changes with each loop iteration
cublasHandle_t my_handle;
cublasStatus_t my_status = cublasCreate(&my_handle);
cudaMalloc(&d_vector, DSIZE*sizeof(float));
cudaMemcpy(d_vector, h_vector, DSIZE*sizeof(float), cudaMemcpyHostToDevice);
max_index = 0;
sdkResetTimer(&hTimer);
sdkStartTimer(&hTimer);
//d_vector is a pointer on the device pointing to the beginning of the vector, containing nrElements floats.
thrust::device_ptr<float> d_ptr = thrust::device_pointer_cast(d_vector);
thrust::device_vector<float>::iterator d_it = thrust::max_element(d_ptr, d_ptr + nrElements);
max_index = d_it - (thrust::device_vector<float>::iterator)d_ptr;
cudaDeviceSynchronize();
gpuTime = sdkGetTimerValue(&hTimer);
std::cout << "loop: " << k << " thrust time: " << gpuTime << " max index: " << max_index << std::endl;
max_index = 0;
sdkResetTimer(&hTimer);
sdkStartTimer(&hTimer);
my_status = cublasIsamax(my_handle, DSIZE, d_vector, 1, &max_index);
cudaDeviceSynchronize();
gpuTime = sdkGetTimerValue(&hTimer);
std::cout << "loop: " << k << " cublas time: " << gpuTime << " max index: " << max_index-1 << std::endl;
max_index = 0;
sdkResetTimer(&hTimer);
sdkStartTimer(&hTimer);
max_idx_kernel<<<MIN(MAX_KERNEL_BLOCKS, ((DSIZE+nTPB-1)/nTPB)), nTPB>>>(d_vector, DSIZE, d_max_index);
cudaMemcpy(&max_index, d_max_index, sizeof(int), cudaMemcpyDeviceToHost);
gpuTime = sdkGetTimerValue(&hTimer);
std::cout << "loop: " << k << " idx kern time: " << gpuTime << " max index: " << max_index << std::endl;
std::cout << std::endl;
} // end for loop on k
cudaFree(d_max_index);
cudaFree(d_vector);
return 0;
}

The primary issue in re-using this code as-is across multiple loop iterations is this static initialization of a device (global) variable:
__device__ int blk_num = 0;
That's OK if you're only going to run the routine once. But if you intend to re-use it, you will need to re-initialize this variable to zero before each call to the kernel.
We could fix this by putting an explicit initialization of this variable to zero before each call to the reduction kernel:
cudaMemcpyToSymbol(blk_num, &max_index, sizeof(int));
(I'm using max_index here simply because it is a convenient host int variable that has just been set to zero.)
That's the only change needed to get the code "working".
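As an aside, a sketch of mine (not part of the original fix): another option is to have the last block zero blk_num itself at the end of the kernel, so no host-side reset is needed between launches. Roughly:
if (last_block){
  // ... final sweep over blk_vals / blk_idxs as in the kernel above ...
  if (!threadIdx.x){
    *result = idxs[0];
    blk_num = 0;  // re-arm the counter for the next kernel call
  }
}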
However, the introduction of the loop has created some other "issues" that I would point out. These 3 lines of code:
cublasHandle_t my_handle;
cublasStatus_t my_status = cublasCreate(&my_handle);
cudaMalloc(&d_vector, DSIZE*sizeof(float));
don't belong inside the for-loop on k. That is effectively creating a memory leak and unnecessarily re-initializing the cublas library.
The following code has those changes and seems to work for me:
$ cat t1183.cu
#include <cuda.h>
#include <cublas_v2.h>
#include <thrust/extrema.h>
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <stdio.h>
#include <stdlib.h>
#define DSIZE 4096*4 // nTPB should be a power-of-2
#define nTPB 512
#define MAX_KERNEL_BLOCKS 30
#define MAX_BLOCKS ((DSIZE/nTPB)+1)
#define MIN(a,b) ((a>b)?b:a)
#define FLOAT_MIN -1.0f
#include <helper_functions.h>
#include <helper_cuda.h>
// this code has been modified to return the index of the max instead of the actual max value - for my application
__device__ volatile float blk_vals[MAX_BLOCKS];
__device__ volatile int blk_idxs[MAX_BLOCKS];
__device__ int blk_num;
//template <typename T>
__global__ void max_idx_kernel(const float *data, const int dsize, int *result){
__shared__ volatile float vals[nTPB];
__shared__ volatile int idxs[nTPB];
__shared__ volatile int last_block;
int idx = threadIdx.x+blockDim.x*blockIdx.x;
last_block = 0;
float my_val = FLOAT_MIN;
int my_idx = -1;
// sweep from global memory
while (idx < dsize){
if (data[idx] > my_val) {my_val = data[idx]; my_idx = idx;}
idx += blockDim.x*gridDim.x;}
// populate shared memory
vals[threadIdx.x] = my_val;
idxs[threadIdx.x] = my_idx;
__syncthreads();
// sweep in shared memory
for (int i = (nTPB>>1); i > 0; i>>=1){
if (threadIdx.x < i)
if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }
__syncthreads();}
// perform block-level reduction
if (!threadIdx.x){
blk_vals[blockIdx.x] = vals[0];
blk_idxs[blockIdx.x] = idxs[0];
if (atomicAdd(&blk_num, 1) == gridDim.x - 1) // then I am the last block
last_block = 1;}
__syncthreads();
if (last_block){
idx = threadIdx.x;
my_val = FLOAT_MIN;
my_idx = -1;
while (idx < gridDim.x){
if (blk_vals[idx] > my_val) {my_val = blk_vals[idx]; my_idx = blk_idxs[idx]; }
idx += blockDim.x;}
// populate shared memory
vals[threadIdx.x] = my_val;
idxs[threadIdx.x] = my_idx;
__syncthreads();
// sweep in shared memory
for (int i = (nTPB>>1); i > 0; i>>=1){
if (threadIdx.x < i)
if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }
__syncthreads();}
if (!threadIdx.x)
*result = idxs[0];
}
}
int main(){
int nrElements = DSIZE;
float *d_vector, *h_vector;
StopWatchInterface *hTimer = NULL;
sdkCreateTimer(&hTimer);
double gpuTime;
int k;
int max_index;
int *d_max_index;
cudaMalloc(&d_max_index, sizeof(int));
h_vector = new float[DSIZE];
cublasHandle_t my_handle;
cublasStatus_t my_status = cublasCreate(&my_handle);
cudaMalloc(&d_vector, DSIZE*sizeof(float));
for(k=0; k < 5; k++){
for (int i = 0; i < DSIZE; i++) h_vector[i] = rand()/(float)RAND_MAX;
h_vector[10+k] = 10; // create definite max element that changes with each loop iteration
cudaMemcpy(d_vector, h_vector, DSIZE*sizeof(float), cudaMemcpyHostToDevice);
max_index = 0;
sdkResetTimer(&hTimer);
sdkStartTimer(&hTimer);
//d_vector is a pointer on the device pointing to the beginning of the vector, containing nrElements floats.
thrust::device_ptr<float> d_ptr = thrust::device_pointer_cast(d_vector);
thrust::device_vector<float>::iterator d_it = thrust::max_element(d_ptr, d_ptr + nrElements);
max_index = d_it - (thrust::device_vector<float>::iterator)d_ptr;
cudaDeviceSynchronize();
gpuTime = sdkGetTimerValue(&hTimer);
std::cout << "loop: " << k << " thrust time: " << gpuTime << " max index: " << max_index << std::endl;
max_index = 0;
sdkResetTimer(&hTimer);
sdkStartTimer(&hTimer);
my_status = cublasIsamax(my_handle, DSIZE, d_vector, 1, &max_index);
cudaDeviceSynchronize();
gpuTime = sdkGetTimerValue(&hTimer);
std::cout << "loop: " << k << " cublas time: " << gpuTime << " max index: " << max_index-1 << std::endl;
max_index = 0;
sdkResetTimer(&hTimer);
sdkStartTimer(&hTimer);
cudaMemcpyToSymbol(blk_num, &max_index, sizeof(int));
max_idx_kernel<<<MIN(MAX_KERNEL_BLOCKS, ((DSIZE+nTPB-1)/nTPB)), nTPB>>>(d_vector, DSIZE, d_max_index);
cudaMemcpy(&max_index, d_max_index, sizeof(int), cudaMemcpyDeviceToHost);
gpuTime = sdkGetTimerValue(&hTimer);
std::cout << "loop: " << k << " idx kern time: " << gpuTime << " max index: " << max_index << std::endl;
std::cout << std::endl;
} // end for loop on k
cudaFree(d_max_index);
cudaFree(d_vector);
return 0;
}
$ nvcc -I/usr/local/cuda/samples/common/inc t1183.cu -o t1183 -lcublas
$ cuda-memcheck ./t1183
========= CUDA-MEMCHECK
loop: 0 thrust time: 2.806 max index: 10
loop: 0 cublas time: 0.441 max index: 10
loop: 0 idx kern time: 0.395 max index: 10
loop: 1 thrust time: 1.298 max index: 11
loop: 1 cublas time: 0.419 max index: 11
loop: 1 idx kern time: 0.424 max index: 11
loop: 2 thrust time: 1.303 max index: 12
loop: 2 cublas time: 0.43 max index: 12
loop: 2 idx kern time: 0.419 max index: 12
loop: 3 thrust time: 1.291 max index: 13
loop: 3 cublas time: 0.423 max index: 13
loop: 3 idx kern time: 0.415 max index: 13
loop: 4 thrust time: 1.299 max index: 14
loop: 4 cublas time: 0.423 max index: 14
loop: 4 idx kern time: 0.417 max index: 14
========= ERROR SUMMARY: 0 errors
$


cuda - can't access blockDim.x?

I'm working on a CUDA program to process a 2D image.
The problem is that when I try to access blockDim.x and blockIdx.x, the kernel always fails to launch and outputs an unknown error.
Besides, if I use a 3x5 image I can access threadIdx.x, while with a 2048x2048 image I can't.
My kernel code runs OK when I use PyCUDA, but now I have to switch to CUDA C.
I think the problem may be related to:
- the way I pass the array pointer, and something being wrong with cudaMalloc
- the configuration of my block size and grid size (but the same configuration works well in PyCUDA, so I don't know how to correct it).
When I run with cuda-memcheck, I get unknown error 30; I googled for solutions but found no helpful information.
__global__ void extractor(const unsigned char* in, unsigned char* out, int* debug)
{
int idx = (threadIdx.x) + blockDim.x * blockIdx.x ;
debug[idx] = threadIdx.x; // debug variable is used for debugging
}
int main(int arg, char* args[])
{
// ...
int size = w*h; // w is image width and h is image height
unsigned char *in = 0;
unsigned char *out = 0;
int* debug = 0;
// Allocate GPU buffers for the images
cudaMalloc((void**)&in, size * sizeof(unsigned char));
cudaMalloc((void**)&out, num_sample_per_point * size * sizeof(unsigned char));
cudaMalloc((void**)&debug, size * sizeof(int));
// Copy image data from host memory to GPU buffers.
cudaMemcpy(in, &img_data[0], size * sizeof(unsigned char),cudaMemcpyHostToDevice);
dim3 b_dim(BLOCK_SIZE, 1, 1); // (1024, 1, 1)
dim3 g_dim(int(w*h/BLOCK_SIZE)+1, 1, 1); // (4097, 1, 1)
extractor<<<g_dim, b_dim>>>(in, out, debug);
// clean up code and processing result
}
Right now I can't get the expected index, so I can't do the processing in the kernel. What can the problem be?
EDIT
I want to use a 1D index, which means I treat the image array as a "flattened" 1D array and do the indexing accordingly.
EDIT
After I added the thread check, there's still something wrong.
__global__ void extractor(const unsigned char* in, unsigned char* out, int* debug)
{
int idx = (threadIdx.x) + blockDim.x * blockIdx.x ;
int y; int x;
int temp_x; int temp_y; int temp_idx;
int check = width*height;
if (idx < check) {
debug[0] = 1; // get kernel launch failed "unknown error"
}
}
I've tried putting the debug[0] = 1; expression both inside the thread-check block and outside it, and both get the same error.
So I suspect the memory allocation has not been done correctly?
BTW, I used nvprof and it said
==22344== Warning: Found 2 invalid records in the result.
==22344== Warning: This can happen if device ran out of memory or if a device kernel was stopped due to an assertion.
EDIT
complete code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cmath>
#include <iostream>
#include "PNG.h"
#define L 3
#define INC1 1
#define INC2 1
#define R_IN 2
#define N_P 4
#define BLOCK_SIZE 1024
#define PI 3.14159265358979323846
using namespace std;
__global__ void extractor(const unsigned char* in, unsigned char* out, int* debug, int* disX, int* disY, int width, int height, int pad, int num_sample)
{
int idx = (threadIdx.x) + blockDim.x * blockIdx.x ;
int y; int x;
int temp_x; int temp_y; int temp_idx;
int check = width*height;
if (idx < check) {
debug[idx] = threadIdx.x;
y = idx/width;
x = idx%width;
if ((x < pad) || (x >= (width-pad)) || (y < pad) || (y >= (height-pad))) {
// need padding
for (int i = 0; i < num_sample; ++i){
temp_x = x + disX[i];
temp_y = y + disY[i];
if (!((temp_x < 0)||(temp_x > (width-1)) || (temp_y < 0) ||(temp_y>(height-1)))) {
temp_idx = temp_y*width + temp_x; // sampled index
out[(idx*num_sample)+i] = in[temp_idx]; // copy sampled value to result
}
}
} else {
for (int i = 0; i < num_sample; ++i)
{
temp_x = x + disX[i];
temp_y = y + disY[i];
temp_idx = temp_y*width + temp_x; // sampled index
out[(idx*num_sample)+i] = in[temp_idx]; // copy sampled value to result
}
}
}
}
vector<int> getCirclePos() {
int r = 0;
vector <int> circlePos;
while (!(r>(L/2))) {
circlePos.push_back(r);
if (r < R_IN) r += INC1;
else r += INC2;
}
cout << "circlePos:" << endl;
for (auto i = circlePos.begin(); i != circlePos.end(); ++i)
{cout << *i << ' ';}
cout << endl;
return circlePos;
}
int main(int arg, char* args[])
{
cudaError_t cudaStatus;
vector<int> circlePos = getCirclePos();
// get disX, disY
int num_sample_per_point = circlePos.size() * N_P;
int* disX = new int[num_sample_per_point];
int* disY = new int[num_sample_per_point];
int r; int cnt = 0;
for (int i = 0; i < circlePos.size(); ++i)
{
r = circlePos[i];
float angle;
for (int j = 0; j < N_P; ++j)
{
angle = j*360.0/N_P;
disX[cnt] = r*cos(angle*M_PI/180.0);
disY[cnt] = r*sin(angle*M_PI/180.0);
// cout nvpro << disX[cnt] << "|" << disY[cnt]<< endl;
cnt++;
}
}
PNG inPng("test.png");
// PNG outPng;
// outPng.Create(inPng.w, inPng.h);
//store width and height so we can use them for our output image later
const unsigned int w = inPng.w;
const unsigned int h = inPng.h;
cout << "w: " << w << " h: " << h << endl;
//4 because there are 4 color channels R, G, B, and A
int size = w * h;
unsigned char *in = 0;
unsigned char *out = 0;
int* debug = 0;
// Allocate GPU buffers for the images
cudaMalloc((void**)&in, size * sizeof(unsigned char));
cudaMalloc((void**)&out, num_sample_per_point * size * sizeof(unsigned char));
cudaMalloc((void**)&debug, size * sizeof(int));
vector<unsigned char> img_data;
for (int i = 0; i < size; ++i)
{
img_data.push_back(inPng.data[i*4]);
}
// debug
cout << "========= img_data ==========" << endl;
for (int i = 0; i < size; ++i)
{
cout << int(img_data[i]) << "," ;
}
cout << endl;
// Copy image data from host memory to GPU buffers.
cudaMemcpy(in, &img_data[0], size * sizeof(unsigned char), cudaMemcpyHostToDevice);
//free the input image because we do not need it anymore
inPng.Free();
// Launch a kernel on the GPU with one thread for each element.
dim3 b_dim(BLOCK_SIZE, 1, 1); // (1024, 1, 1)
dim3 g_dim(int(w*h/BLOCK_SIZE)+1, 1, 1); // (4097, 1, 1)
int pad = L/2;
// __global__ void extractor(const unsigned char* in, unsigned char* out, vector<int> disX, vector<int> disY, int width, int height, int pad, int num_sample)
extractor<<<g_dim, b_dim>>>(in, out, debug, disX, disY, w, h, pad, num_sample_per_point);
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
{
std::cout << "Kernel launch failed: " << cudaGetErrorString(cudaStatus) << std::endl;
cudaFree(in);
cudaFree(out);
cudaFree(debug);
exit(1);
}
auto tmp = new unsigned char[size*num_sample_per_point];
auto tmp_debug = new int [size];
cudaMemcpy(tmp_debug, debug, size * sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(tmp, out, num_sample_per_point * size * sizeof(unsigned char), cudaMemcpyDeviceToHost);
cout << "========= out =========" << endl;
for (int i = 0; i < size*num_sample_per_point; ++i)
{
cout << int(tmp[i]) << ", ";
}
cout << endl;
cout << "========debug=======" << endl;
for (int i = 0; i < size; ++i)
{
cout << tmp_debug[i] << ", ";
}
cout << endl;
cudaFree(in);
cudaFree(out);
cudaFree(debug);
delete[] tmp; delete[] tmp_debug;
return 0;
}
This (according to your comment) is defining 1024 threads per block:
dim3 b_dim(BLOCK_SIZE, 1, 1); // (1024, 1, 1)
According to your question text, w and h are each 2048 in the failing case, so this:
dim3 g_dim(int(w*h/BLOCK_SIZE)+1, 1, 1); // (4097, 1, 1)
is creating 4097 blocks, just as you indicate in your comment.
4097 blocks of 1024 threads each is 4195328 threads total, but your allocation sizes are only providing 2048*2048 elements, or 4194304 elements total. So you are launching 4195328 threads with only 4194304 elements, leaving 1024 threads left over.
So what do those 1024 extra threads do? They still run the kernel code and attempt to access your debug array beyond the end of the allocated space.
This results in undefined behavior in C and in C++.
The customary method to fix this is to pass the problem size to your kernel and add a "thread check" in your kernel code, like this:
__global__ void extractor(const unsigned char* in, unsigned char* out, int* debug, int n)
{
int idx = (threadIdx.x) + blockDim.x * blockIdx.x ;
if (idx < n)
debug[idx] = threadIdx.x; // debug variable is used for debugging
}
which prevents the "extra" threads from doing anything.
If you search here on the cuda tag for "thread check" you will find many other examples of questions like this.
As an example, based on the code pieces you have shown, the following runs without error for me:
$ cat t147.cu
const int width = 2048;
const int height = 2048;
const int BLOCK_SIZE = 1024;
__global__ void extractor(const unsigned char* in, unsigned char* out, int* debug)
{
int idx = (threadIdx.x) + blockDim.x * blockIdx.x ;
// int y; int x;
// int temp_x; int temp_y; int temp_idx;
int check = width*height;
if (idx < check) {
debug[idx] = 1; // get kernel launch failed "unknown error"
}
}
int main(int arg, char* args[])
{
const int w = width;
const int h = height;
const int num_sample_per_point = 1;
int size = w*h; // w is image width and h is image height
unsigned char *in = 0;
unsigned char *out = 0;
int* debug = 0;
// Allocate GPU buffers for the images
cudaMalloc((void**)&in, size * sizeof(unsigned char));
cudaMalloc((void**)&out, num_sample_per_point * size * sizeof(unsigned char));
cudaMalloc((void**)&debug, size * sizeof(int));
// Copy image data from host memory to GPU buffers.
// cudaMemcpy(in, &img_data[0], size * sizeof(unsigned char),cudaMemcpyHostToDevice);
dim3 b_dim(BLOCK_SIZE, 1, 1); // (1024, 1, 1)
dim3 g_dim(int(w*h/BLOCK_SIZE)+1, 1, 1); // (4097, 1, 1)
extractor<<<g_dim, b_dim>>>(in, out, debug);
cudaDeviceSynchronize();
}
$ nvcc -arch=sm_61 -o t147 t147.cu
$ cuda-memcheck ./t147
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$
In your complete code, you simply have an illegal access problem in your kernel. I've modified it to remove the dependency on PNG, and if we omit the kernel code other than the debug setting, it runs fine. However if we include your kernel code, and run with cuda-memcheck we get all sorts of out-of-bounds accesses. In the future, you could use the method described here to debug these:
$ cat t146.cu
#include <cmath>
#include <iostream>
#include <vector>
#define L 3
#define INC1 1
#define INC2 1
#define R_IN 2
#define N_P 4
#define BLOCK_SIZE 1024
#define PI 3.14159265358979323846
using namespace std;
__global__ void extractor(const unsigned char* in, unsigned char* out, int* debug, int* disX, int* disY, int width, int height, int pad, int num_sample)
{
int idx = (threadIdx.x) + blockDim.x * blockIdx.x ;
int y; int x;
int temp_x; int temp_y; int temp_idx;
int check = width*height;
if (idx < check) {
debug[idx] = threadIdx.x;
y = idx/width;
x = idx%width;
#ifdef FAIL
if ((x < pad) || (x >= (width-pad)) || (y < pad) || (y >= (height-pad))) {
// need padding
for (int i = 0; i < num_sample; ++i){
temp_x = x + disX[i];
temp_y = y + disY[i];
if (!((temp_x < 0)||(temp_x > (width-1)) || (temp_y < 0) ||(temp_y>(height-1)))) {
temp_idx = temp_y*width + temp_x; // sampled index
out[(idx*num_sample)+i] = in[temp_idx]; // copy sampled value to result
}
}
} else {
for (int i = 0; i < num_sample; ++i)
{
temp_x = x + disX[i];
temp_y = y + disY[i];
temp_idx = temp_y*width + temp_x; // sampled index
out[(idx*num_sample)+i] = in[temp_idx]; // copy sampled value to result
}
}
#endif
}
}
vector<int> getCirclePos() {
int r = 0;
vector <int> circlePos;
while (!(r>(L/2))) {
circlePos.push_back(r);
if (r < R_IN) r += INC1;
else r += INC2;
}
cout << "circlePos:" << endl;
for (auto i = circlePos.begin(); i != circlePos.end(); ++i)
{//cout << *i << ' ';
}
cout << endl;
return circlePos;
}
int main(int arg, char* args[])
{
cudaError_t cudaStatus;
vector<int> circlePos = getCirclePos();
// get disX, disY
int num_sample_per_point = circlePos.size() * N_P;
int* disX = new int[num_sample_per_point];
int* disY = new int[num_sample_per_point];
int r; int cnt = 0;
for (int i = 0; i < circlePos.size(); ++i)
{
r = circlePos[i];
float angle;
for (int j = 0; j < N_P; ++j)
{
angle = j*360.0/N_P;
disX[cnt] = r*cos(angle*M_PI/180.0);
disY[cnt] = r*sin(angle*M_PI/180.0);
// cout nvpro << disX[cnt] << "|" << disY[cnt]<< endl;
cnt++;
}
}
const unsigned int w = 2048;
const unsigned int h = 2048;
cout << "w: " << w << " h: " << h << endl;
//4 because there are 4 color channels R, G, B, and A
int size = w * h;
unsigned char *in = 0;
unsigned char *out = 0;
int* debug = 0;
// Allocate GPU buffers for the images
cudaMalloc((void**)&in, size * sizeof(unsigned char));
cudaMalloc((void**)&out, num_sample_per_point * size * sizeof(unsigned char));
cudaMalloc((void**)&debug, size * sizeof(int));
vector<unsigned char> img_data;
for (int i = 0; i < size; ++i)
{
img_data.push_back(0);
}
// debug
cout << "========= img_data ==========" << endl;
for (int i = 0; i < size; ++i)
{
// cout << int(img_data[i]) << "," ;
}
cout << endl;
// Copy image data from host memory to GPU buffers.
cudaMemcpy(in, &img_data[0], size * sizeof(unsigned char), cudaMemcpyHostToDevice);
// Launch a kernel on the GPU with one thread for each element.
dim3 b_dim(BLOCK_SIZE, 1, 1); // (1024, 1, 1)
dim3 g_dim(int(w*h/BLOCK_SIZE)+1, 1, 1); // (4097, 1, 1)
int pad = L/2;
// __global__ void extractor(const unsigned char* in, unsigned char* out, vector<int> disX, vector<int> disY, int width, int height, int pad, int num_sample)
extractor<<<g_dim, b_dim>>>(in, out, debug, disX, disY, w, h, pad, num_sample_per_point);
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
{
std::cout << "Kernel launch failed: " << cudaGetErrorString(cudaStatus) << std::endl;
cudaFree(in);
cudaFree(out);
cudaFree(debug);
exit(1);
}
auto tmp = new unsigned char[size*num_sample_per_point];
auto tmp_debug = new int [size];
cudaMemcpy(tmp_debug, debug, size * sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(tmp, out, num_sample_per_point * size * sizeof(unsigned char), cudaMemcpyDeviceToHost);
cout << "========= out =========" << endl;
for (int i = 0; i < size*num_sample_per_point; ++i)
{
// cout << int(tmp[i]) << ", ";
}
cout << endl;
cout << "========debug=======" << endl;
for (int i = 0; i < size; ++i)
{
// cout << tmp_debug[i] << ", ";
}
cout << endl;
cudaFree(in);
cudaFree(out);
cudaFree(debug);
delete[] tmp; delete[] tmp_debug;
return 0;
}
$ nvcc -std=c++11 -o t146 t146.cu -arch=sm_61 -lineinfo
t146.cu(18): warning: variable "y" was set but never used
t146.cu(18): warning: variable "x" was set but never used
t146.cu(19): warning: variable "temp_x" was declared but never referenced
t146.cu(19): warning: variable "temp_y" was declared but never referenced
t146.cu(19): warning: variable "temp_idx" was declared but never referenced
t146.cu(18): warning: variable "y" was set but never used
t146.cu(18): warning: variable "x" was set but never used
t146.cu(19): warning: variable "temp_x" was declared but never referenced
t146.cu(19): warning: variable "temp_y" was declared but never referenced
t146.cu(19): warning: variable "temp_idx" was declared but never referenced
$ cuda-memcheck ./t146
========= CUDA-MEMCHECK
circlePos:
w: 2048 h: 2048
========= img_data ==========
========= out =========
========debug=======
========= ERROR SUMMARY: 0 errors
$ nvcc -std=c++11 -o t146 t146.cu -arch=sm_61 -lineinfo -DFAIL
$ cuda-memcheck ./t146
...
========= Invalid __global__ read of size 4
========= at 0x00000418 in /home/ubuntu/bobc/misc/t146.cu:41:extractor(unsigned char const *, unsigned char*, int*, int*, int*, int, int, int, int)
========= by thread (197,0,0) in block (17,0,0)
========= Address 0x00c8b290 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x2c5)
...
(and much more output like this)
The above output points to line 41 in the code, which is reading from disX.
As it turns out, your disX is a host-allocated variable:
int* disX = new int[num_sample_per_point];
but you are attempting to pass it to device code:
extractor<<<g_dim, b_dim>>>(in, out, debug, disX, disY, w, h, pad, num_sample_per_point);
^^^^
That is just completely broken. You can't do that in CUDA. You need to make a device copy of that variable, and also of disY. When I fix that problem, the modified code runs without error for me:
$ cat t146.cu
#include <cmath>
#include <iostream>
#include <vector>
#define L 3
#define INC1 1
#define INC2 1
#define R_IN 2
#define N_P 4
#define BLOCK_SIZE 1024
#define PI 3.14159265358979323846
using namespace std;
__global__ void extractor(const unsigned char* in, unsigned char* out, int* debug, int* disX, int* disY, int width, int height, int pad, int num_sample)
{
int idx = (threadIdx.x) + blockDim.x * blockIdx.x ;
int y; int x;
int temp_x; int temp_y; int temp_idx;
int check = width*height;
if (idx < check) {
debug[idx] = threadIdx.x;
y = idx/width;
x = idx%width;
#ifdef FAIL
if ((x < pad) || (x >= (width-pad)) || (y < pad) || (y >= (height-pad))) {
// need padding
for (int i = 0; i < num_sample; ++i){
temp_x = x + disX[i];
temp_y = y + disY[i];
if (!((temp_x < 0)||(temp_x > (width-1)) || (temp_y < 0) ||(temp_y>(height-1)))) {
temp_idx = temp_y*width + temp_x; // sampled index
out[(idx*num_sample)+i] = in[temp_idx]; // copy sampled value to result
}
}
} else {
for (int i = 0; i < num_sample; ++i)
{
temp_x = x + disX[i];
temp_y = y + disY[i];
temp_idx = temp_y*width + temp_x; // sampled index
out[(idx*num_sample)+i] = in[temp_idx]; // copy sampled value to result
}
}
#endif
}
}
vector<int> getCirclePos() {
int r = 0;
vector <int> circlePos;
while (!(r>(L/2))) {
circlePos.push_back(r);
if (r < R_IN) r += INC1;
else r += INC2;
}
cout << "circlePos:" << endl;
for (auto i = circlePos.begin(); i != circlePos.end(); ++i)
{//cout << *i << ' ';
}
cout << endl;
return circlePos;
}
int main(int arg, char* args[])
{
cudaError_t cudaStatus;
vector<int> circlePos = getCirclePos();
// get disX, disY
int num_sample_per_point = circlePos.size() * N_P;
int* disX = new int[num_sample_per_point];
int* disY = new int[num_sample_per_point];
int r; int cnt = 0;
for (int i = 0; i < circlePos.size(); ++i)
{
r = circlePos[i];
float angle;
for (int j = 0; j < N_P; ++j)
{
angle = j*360.0/N_P;
disX[cnt] = r*cos(angle*M_PI/180.0);
disY[cnt] = r*sin(angle*M_PI/180.0);
// cout nvpro << disX[cnt] << "|" << disY[cnt]<< endl;
cnt++;
}
}
int *d_disX, *d_disY;
cudaMalloc(&d_disX, num_sample_per_point*sizeof(int));
cudaMalloc(&d_disY, num_sample_per_point*sizeof(int));
cudaMemcpy(d_disX, disX, num_sample_per_point*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_disY, disY, num_sample_per_point*sizeof(int), cudaMemcpyHostToDevice);
const unsigned int w = 2048;
const unsigned int h = 2048;
cout << "w: " << w << " h: " << h << endl;
//4 because there are 4 color channels R, G, B, and A
int size = w * h;
unsigned char *in = 0;
unsigned char *out = 0;
int* debug = 0;
// Allocate GPU buffers for the images
cudaMalloc((void**)&in, size * sizeof(unsigned char));
cudaMalloc((void**)&out, num_sample_per_point * size * sizeof(unsigned char));
cudaMalloc((void**)&debug, size * sizeof(int));
vector<unsigned char> img_data;
for (int i = 0; i < size; ++i)
{
img_data.push_back(0);
}
// debug
cout << "========= img_data ==========" << endl;
for (int i = 0; i < size; ++i)
{
// cout << int(img_data[i]) << "," ;
}
cout << endl;
// Copy image data from host memory to GPU buffers.
cudaMemcpy(in, &img_data[0], size * sizeof(unsigned char), cudaMemcpyHostToDevice);
// Launch a kernel on the GPU with one thread for each element.
dim3 b_dim(BLOCK_SIZE, 1, 1); // (1024, 1, 1)
dim3 g_dim(int(w*h/BLOCK_SIZE)+1, 1, 1); // (4097, 1, 1)
int pad = L/2;
// __global__ void extractor(const unsigned char* in, unsigned char* out, vector<int> disX, vector<int> disY, int width, int height, int pad, int num_sample)
extractor<<<g_dim, b_dim>>>(in, out, debug, d_disX, d_disY, w, h, pad, num_sample_per_point);
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
{
std::cout << "Kernel launch failed: " << cudaGetErrorString(cudaStatus) << std::endl;
cudaFree(in);
cudaFree(out);
cudaFree(debug);
exit(1);
}
auto tmp = new unsigned char[size*num_sample_per_point];
auto tmp_debug = new int [size];
cudaMemcpy(tmp_debug, debug, size * sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(tmp, out, num_sample_per_point * size * sizeof(unsigned char), cudaMemcpyDeviceToHost);
cout << "========= out =========" << endl;
for (int i = 0; i < size*num_sample_per_point; ++i)
{
// cout << int(tmp[i]) << ", ";
}
cout << endl;
cout << "========debug=======" << endl;
for (int i = 0; i < size; ++i)
{
// cout << tmp_debug[i] << ", ";
}
cout << endl;
cudaFree(in);
cudaFree(out);
cudaFree(debug);
delete[] tmp; delete[] tmp_debug;
return 0;
}
$ nvcc -std=c++11 -o t146 t146.cu -arch=sm_61 -lineinfo -DFAIL
$ cuda-memcheck ./t146
========= CUDA-MEMCHECK
circlePos:
w: 2048 h: 2048
========= img_data ==========
========= out =========
========debug=======
========= ERROR SUMMARY: 0 errors
$

Summing the rows of a matrix (stored in either row-major or column-major order) in CUDA

I'm working on the problem of summing the rows of a matrix in CUDA, and I'm giving the following example.
Suppose we have the following 20 * 4 array:
1 2 3 4
4 1 2 3
3 4 1 2
.
1 2 3 4
.
.
.
2 1 3 4
After flattening the 2D array to a 1D array (in either row-major or column-major order), I need to assign each thread to a different row and calculate the cost for that row.
For example
- thread 1 should calculate the cost for 1 2 3 4
- thread 2 should calculate the cost for 4 1 2 3
How can I do that in CUDA?
Thank you all for the replies.
#include <stdio.h>
#include <stdlib.h>
#define MROWS 20
#define NCOLS 4
#define nTPB 256
__global__ void mykernel(int *costdata, int rows, int cols, int *results){
int tidx = threadIdx.x + blockDim.x*blockIdx.x;
if (tidx < rows){
int mycost = 0;
for (int i = 0; i < cols; i++)
mycost += costdata[(tidx*cols)+i];
results[tidx] = mycost;
}
}
int main(){
//define and initialize host and device storage for cost and results
int *d_costdata, *h_costdata, *d_results, *h_results;
h_results = (int *)malloc(MROWS*sizeof(int));
h_costdata = (int *)malloc(MROWS*NCOLS*sizeof(int));
for (int i=0; i<(MROWS*NCOLS); i++)
h_costdata[i] = rand()%4;
cudaMalloc((void **)&d_results, MROWS*sizeof(int));
cudaMalloc((void **)&d_costdata, MROWS*NCOLS*sizeof(int));
//copy cost data from host to device
cudaMemcpy(d_costdata, h_costdata, MROWS*NCOLS*sizeof(int), cudaMemcpyHostToDevice);
mykernel<<<(MROWS + nTPB - 1)/nTPB, nTPB>>>(d_costdata, MROWS, NCOLS, d_results);
// copy results back from device to host
cudaMemcpy(h_results, d_results, MROWS*sizeof(int), cudaMemcpyDeviceToHost);
for (int i=0; i<MROWS; i++){
int loc_cost = 0;
for (int j=0; j<NCOLS; j++) loc_cost += h_costdata[(i*NCOLS)+j];
printf("cost[%d]: host= %d, device = %d\n", i, loc_cost, h_results[i]);
}
}
This assumes the "cost" of each row is just the sum of the elements in each row. If you have a different "cost" function, you can modify the activity in the kernel for-loop accordingly (see the sketch just below). This also assumes C-style row-major data storage (1 2 3 4 4 1 2 3 3 4 1 2 etc.)
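For instance (my own illustration, not from the original answer), a sum-of-squares cost would only change the accumulation inside the loop:
for (int i = 0; i < cols; i++){
  int v = costdata[(tidx*cols)+i];
  mycost += v*v;}  // sum of squares instead of a plain sum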
If you instead use column-major storage (1 4 3 etc.), you can slightly improve the performance, since the data reads can be fully coalesced. Then your kernel code could look like this:
for (int i = 0; i < cols; i++)
mycost += costdata[(i*rows)+tidx];
You should also use proper CUDA error checking on all CUDA API calls and kernel calls.
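A minimal sketch of what that could look like (the cudaCheck macro name is mine; the other questions on this page use a similar checkCudaErrors helper):
#define cudaCheck(call) do { \
  cudaError_t err = (call); \
  if (err != cudaSuccess) { \
    printf("CUDA error: %s at %s:%d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
    exit(1); } } while (0)
cudaCheck(cudaMemcpy(d_costdata, h_costdata, MROWS*NCOLS*sizeof(int), cudaMemcpyHostToDevice));
mykernel<<<(MROWS + nTPB - 1)/nTPB, nTPB>>>(d_costdata, MROWS, NCOLS, d_results);
cudaCheck(cudaGetLastError());       // catches launch-configuration errors
cudaCheck(cudaDeviceSynchronize());  // catches errors during kernel execution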
EDIT: As discussed in the comments below, for the row-major storage case, in some situations it may be possible to improve memory efficiency by electing to load 16-byte quantities rather than the base type. Following is a modified version that implements this idea for arbitrary dimensions and (more or less) arbitrary base types:
#include <iostream>
#include <typeinfo>
#include <cstdlib>
#include <vector_types.h>
#define MROWS 1742
#define NCOLS 801
#define nTPB 256
typedef double mytype;
__host__ int sizetype(){
int size = 0;
if ((typeid(mytype) == typeid(float)) || (typeid(mytype) == typeid(int)) || (typeid(mytype) == typeid(unsigned int)))
size = 4;
else if (typeid(mytype) == typeid(double))
size = 8;
else if ((typeid(mytype) == typeid(unsigned char)) || (typeid(mytype) == typeid(char)))
size = 1;
return size;
}
template<typename T>
__global__ void mykernel(const T *costdata, int rows, int cols, T *results, int size, size_t pitch){
int chunk = 16/size; // assumes size is a factor of 16
int tidx = threadIdx.x + blockDim.x*blockIdx.x;
if (tidx < rows){
T *myrowptr = (T *)(((unsigned char *)costdata) + tidx*pitch);
T mycost = (T)0;
int count = 0;
while (count < cols){
if ((cols-count)>=chunk){
// read 16 bytes
int4 temp = *((int4 *)(myrowptr + count));
int bcount = 16;
int j = 0;
while (bcount > 0){
mycost += *(((T *)(&temp)) + j++);
bcount -= size;
count++;}
}
else {
// read one quantity at a time
for (; count < cols; count++)
mycost += myrowptr[count];
}
results[tidx] = mycost;
}
}
}
int main(){
int typesize = sizetype();
if (typesize == 0) {std::cout << "invalid type selected" << std::endl; return 1;}
//define and initialize host and device storage for cost and results
mytype *d_costdata, *h_costdata, *d_results, *h_results;
h_results = (mytype *)malloc(MROWS*sizeof(mytype));
h_costdata = (mytype *)malloc(MROWS*NCOLS*sizeof(mytype));
for (int i=0; i<(MROWS*NCOLS); i++)
h_costdata[i] = (mytype)(rand()%4);
size_t pitch = 0;
cudaMalloc((void **)&d_results, MROWS*sizeof(mytype));
cudaMallocPitch((void **)&d_costdata, &pitch, NCOLS*sizeof(mytype), MROWS);
//copy cost data from host to device
cudaMemcpy2D(d_costdata, pitch, h_costdata, NCOLS*sizeof(mytype), NCOLS*sizeof(mytype), MROWS, cudaMemcpyHostToDevice);
mykernel<<<(MROWS + nTPB - 1)/nTPB, nTPB>>>(d_costdata, MROWS, NCOLS, d_results, typesize, pitch);
// copy results back from device to host
cudaMemcpy(h_results, d_results, MROWS*sizeof(mytype), cudaMemcpyDeviceToHost);
for (int i=0; i<MROWS; i++){
mytype loc_cost = (mytype)0;
for (int j=0; j<NCOLS; j++) loc_cost += h_costdata[(i*NCOLS)+j];
if ((i < 10) && (typesize > 1))
std::cout <<"cost[" << i << "]: host= " << loc_cost << ", device = " << h_results[i] << std::endl;
if (loc_cost != h_results[i]){ std::cout << "mismatch at index" << i << "should be:" << loc_cost << "was:" << h_results[i] << std::endl; return 1; }
}
std::cout << "Results are correct!" << std::endl;
}

Max GPU kernel function only work with one block

#include <iostream>
#include <stdio.h>
#include <stdlib.h>
using namespace std;
const int threadsPerBlock = 256;
const int N = 40000;
void generateArray(double *data, int count) {
for (int i = 0; i < count; i++)
data[i] = rand() / ((rand() + rand()) / 2.0 + 1);
}
double maxCPU(double *arr, int count) {
double max = arr[0];
for (int i = 0; i < count; i++)
if (arr[i] > max)
max = arr[i];
return max;
}
__global__ void MaxGPU(double *a, int count, double *result){
__shared__ double cache[threadsPerBlock];
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int cacheIndex = threadIdx.x;
double temp = a[tid];
tid+= blockDim.x * gridDim.x;
while(tid < count){
if(a[tid] > temp)
temp = a[tid];
tid+= blockDim.x * gridDim.x;
}
cache[cacheIndex] = temp;
__syncthreads();
int i = blockDim.x/2;
while(i!=0){
if(cacheIndex < i)
if(cache[cacheIndex + i] > cache[cacheIndex])
cache[cacheIndex] = cache[cacheIndex + i];
__syncthreads();
i/=2;
}
if(cacheIndex == 0)
result[blockIdx.x] = cache[0];
}
int main(void) {
double *arr = new double[N], resultGPU;
generateArray(arr, N);
double *devA, *dev_partial_result;
double resultCPU = maxCPU(arr, N);
cudaMalloc((void**)&devA, N * sizeof(double));
cudaMalloc((void**)&dev_partial_result, 512 * sizeof(double));
cudaMemcpy(devA, arr, N * sizeof(double), cudaMemcpyHostToDevice);
MaxGPU<<<1, 256>>>(devA, N, dev_partial_result);
cudaMemcpy(&resultGPU, dev_partial_result,sizeof(double), cudaMemcpyDeviceToHost);
cout << "Max CPU: " << resultCPU << endl;
cout << "Max GPU: " << resultGPU << endl;
cudaFree(devA);
cudaFree(dev_partial_result);
delete [] arr;
return 0;
}
I wrote the above code. I don't know why, but it only works with one block; it does not work with, say, 256 or 512 blocks. Why? What's wrong?
Try changing
double resultGPU;
to
double* resultGPU = new double[blocks_count];
and
cudaMemcpy(&resultGPU, dev_partial_result, sizeof(double), cudaMemcpyDeviceToHost);
to
cudaMemcpy(resultGPU, dev_partial_result, blocks_count*sizeof(double), cudaMemcpyDeviceToHost);
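With more than one block, the kernel writes one partial maximum per block into result[blockIdx.x], so a final pass over those partial results is still needed (note also that the kernel's unguarded initial read of a[tid] assumes the total thread count does not exceed count). A minimal host-side sketch of finishing the reduction, assuming blocks_count is whatever grid size was launched:
int blocks_count = 256;                               // e.g. MaxGPU<<<256, 256>>>(devA, N, dev_partial_result);
double *resultGPU = new double[blocks_count];
cudaMemcpy(resultGPU, dev_partial_result, blocks_count * sizeof(double), cudaMemcpyDeviceToHost);
double maxGPU = resultGPU[0];
for (int i = 1; i < blocks_count; i++)
    if (resultGPU[i] > maxGPU)
        maxGPU = resultGPU[i];
cout << "Max GPU: " << maxGPU << endl;
delete [] resultGPU;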

CUDA median calculation through reduction [closed]

I'm probably doing something incredibly stupid, but I can't seem to make this reduction work (there is probably a library that does this already, but this is for self-learning, so please bear with me). I'm trying to find the median of an array of integer entries by taking the median of medians approach, which I've coded below:
__global__ void gpuMedOdd(int *entries, int *med) {
extern __shared__ int sdata[];
int tid = threadIdx.x;
int i = blockIdx.x * blockDim.x + threadIdx.x;
sdata[tid] = entries[i];
__syncthreads();
for(int s = blockDim.x / 3; s > 0; s /= 3) {
if(tid < s) {
int list[3];
list[0] = sdata[tid], list[1] = sdata[tid + s], list[2] = sdata[tid + 2 * s];
if(list[1] < list[0])
swapGpu(list[1], list[0]);
if(list[2] < list[0])
swapGpu(list[2], list[0]);
if(list[2] < list[1])
swapGpu(list[2], list[1]);
sdata[tid] = list[1];
}
__syncthreads();
}
*med = sdata[0];
}
I invoke this kernel function as:
gpuMedOdd<<<9, numEntries / 9>>>(d_entries, d_med);
I then copy the value in d_med over into med and print out that value. Unfortunately, this value is always 0, regardless of input. What am I doing wrong?
Edit: I forgot to mention, swapGpu(a, b) is defined as below:
__device__ inline void swapGpu(int a, int b) {
int dum = a;
a = b;
b = dum;
}
Edit2: As suggested below, here is the entirety of the code.
#include <iostream>
#include <fstream>
#include <cstdlib>
#define checkCudaErrors(err) __checkCudaErrors(err, __FILE__, __LINE__)
#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__)
inline void __checkCudaErrors(cudaError err, const char *file, const int line) {
if(cudaSuccess != err) {
std::cout << file << "(" << line << ") : CUDA Runtime API error " << (int) err << ": " << cudaGetErrorString(err) << std::endl;
exit(3);
}
}
inline void __getLastCudaError(const char *errorMsg, const char *file, const int line) {
cudaError_t err = cudaGetLastError();
if(cudaSuccess != err) {
std::cout << file << "(" << line << ") : getLastCudaError() CUDA error : " << errorMsg << " : (" << (int) err << ") " << cudaGetErrorString(err) << std::endl;
exit(3);
}
}
int cpuMin(int *entries, int numEntries) {
int minVal = entries[0];
for(int i = 1; i < numEntries; i++)
if(entries[i] < minVal)
minVal = entries[i];
return minVal;
}
int cpuMax(int *entries, int numEntries) {
int maxVal = entries[0];
for(int i = 1; i < numEntries; i++)
if(entries[i] > maxVal)
maxVal = entries[i];
return maxVal;
}
inline void swap(int a, int b) {
int dum = a;
a = b;
b = dum;
}
__device__ inline void swapGpu(int a, int b) {
int dum = a;
a = b;
b = dum;
}
__global__ void gpuMedOdd(int *entries, int *med, int numEntries) {
extern __shared__ int sdata[];
int tid = threadIdx.x;
int i = blockIdx.x * (blockDim.x * 3) + threadIdx.x;
if(i + 2 * blockDim.x < numEntries) {
int list[3];
list[0] = entries[i], list[1] = entries[i + blockDim.x], list[2] = entries[i + 2 * blockDim.x];
if(list[1] < list[0])
swapGpu(list[1], list[0]);
if(list[2] < list[0])
swapGpu(list[2], list[0]);
if(list[2] < list[1])
swapGpu(list[2], list[1]);
sdata[tid] = list[1];
}
__syncthreads();
for(int s = blockDim.x / 3; s > 0; s /= 3) {
if(tid < s && tid + 2 * s < blockDim.x) {
int list[3];
list[0] = sdata[tid], list[1] = sdata[tid + s], list[2] = sdata[tid + 2 * s];
if(list[1] < list[0])
swapGpu(list[1], list[0]);
if(list[2] < list[0])
swapGpu(list[2], list[0]);
if(list[2] < list[1])
swapGpu(list[2], list[1]);
sdata[tid] = list[1];
}
__syncthreads();
}
*med = sdata[0];
}
__global__ void gpuMin(int *entries, int *min, int numEntries) {
extern __shared__ int sdata[];
int tid = threadIdx.x;
int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
if(i + blockDim.x < numEntries)
sdata[tid] = (entries[i] < entries[i + blockDim.x]) ? entries[i] : entries[i + blockDim.x];
__syncthreads();
for(int s = blockDim.x / 2; s > 0; s >>= 1) {
if(tid < s)
sdata[tid] = (sdata[tid] < sdata[tid + s]) ? sdata[tid] : sdata[tid + s];
__syncthreads();
}
*min = sdata[0];
}
__global__ void gpuMax(int *entries, int *max, int numEntries) {
extern __shared__ int sdata[];
int tid = threadIdx.x;
int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
if(i + blockDim.x < numEntries)
sdata[tid] = (entries[i] > entries[i + blockDim.x]) ? entries[i] : entries[i + blockDim.x];
__syncthreads();
for(int s = blockDim.x / 2; s > 0; s >>= 1) {
if(tid < s)
sdata[tid] = (sdata[tid] > sdata[tid + s]) ? sdata[tid] : sdata[tid + s];
__syncthreads();
}
*max = sdata[0];
}
int partition(int *entries, int left, int right, int pivotIdx) {
int i, storeIdx = left, pivot = entries[pivotIdx];
swap(entries[pivotIdx], entries[right]);
for(i = left; i < right; i++)
if(entries[i] < pivot) {
swap(entries[i], entries[storeIdx]);
storeIdx++;
}
return storeIdx;
}
int cpuSelect(int *entries, int left, int right, int k) {
if(left == right)
return entries[left];
int pivotIdx = ((left + right) >> 2) + 1, pivotNewIdx, pivotDist;
pivotNewIdx = partition(entries, left, right, pivotIdx);
pivotDist = pivotNewIdx - left + 1;
if(pivotDist == k)
return entries[pivotNewIdx];
else if(k < pivotDist)
return cpuSelect(entries, left, pivotNewIdx - 1, k);
else
return cpuSelect(entries, pivotNewIdx + 1, right, k - pivotDist);
}
int main(int argc, char *argv[]) {
if(argc != 3) {
std::cout << "ERROR: Incorrect number of input arguments" << std::endl;
std::cout << "Proper usage: " << argv[0] << " fileName numEntries" << std::endl;
exit(1);
}
std::ifstream inp(argv[1]);
if(!inp.is_open()) {
std::cout << "ERROR: File I/O error" << std::endl;
std::cout << "Could not find file " << argv[1] << std::endl;
exit(2);
}
int numEntries = atoi(argv[2]), i = 0;
int *entries = new int[numEntries];
while(inp >> entries[i] && i < numEntries)
i++;
if(i < numEntries) {
std::cout << "ERROR: File I/O error" << std::endl;
std::cout << "Command-line input suggested " << numEntries << " entries, but only found " << i << " entries" << std::endl;
exit(2);
}
if(inp >> i) {
std::cout << "ERROR: File I/O error" << std::endl;
std::cout << "Command-line input suggested " << numEntries << " entries, but file contains more entries" << std::endl;
exit(2);
}
int min, max;
int *d_entries, *d_min, *d_max;
checkCudaErrors(cudaMalloc(&d_entries, sizeof(int) * numEntries));
checkCudaErrors(cudaMalloc(&d_min, sizeof(int)));
checkCudaErrors(cudaMalloc(&d_max, sizeof(int)));
checkCudaErrors(cudaMemcpy(d_entries, entries, sizeof(int) * numEntries, cudaMemcpyHostToDevice));
gpuMin<<<16, numEntries / 16, numEntries / 16 * sizeof(int)>>>(d_entries, d_min, numEntries);
getLastCudaError("kernel launch failure");
gpuMax<<<16, numEntries / 16, numEntries / 16 * sizeof(int)>>>(d_entries, d_max, numEntries);
getLastCudaError("kernel launch failure");
checkCudaErrors(cudaMemcpy(&min, d_min, sizeof(int), cudaMemcpyDeviceToHost));
checkCudaErrors(cudaMemcpy(&max, d_max, sizeof(int), cudaMemcpyDeviceToHost));
std::cout << "The minimum value is: " << min << std::endl;
std::cout << "The maximum value is: " << max << std::endl;
if(numEntries % 2) {
int med, *d_med;
checkCudaErrors(cudaMalloc(&d_med, sizeof(int)));
gpuMedOdd<<<16, numEntries / 16, 16 * sizeof(int)>>>(d_entries, d_med, numEntries);
getLastCudaError("kernel launch failure");
checkCudaErrors(cudaMemcpy(&med, d_med, sizeof(int), cudaMemcpyDeviceToHost));
std::cout << "The median value is: " << med << std::endl;
}
else {
int *d_med;
cudaMalloc(&d_med, sizeof(int));
gpuMedOdd<<<16, numEntries / 16>>>(d_entries, d_med, numEntries);
}
min = cpuMin(entries, numEntries);
max = cpuMax(entries, numEntries);
if(numEntries % 2) {
int median = cpuSelect(entries, 0, numEntries - 1, (numEntries - 1) / 2 + 1);
std::cout << "The median value is: " << median << std::endl;
}
else {
int med2 = cpuSelect(entries, 0, numEntries - 1, numEntries / 2);
int med1 = cpuSelect(entries, 0, numEntries - 1, numEntries / 2 + 1);
float median = 0.5 * (med1 + med2);
std::cout << "The median value is: " << median << std::endl;
}
std::cout << "The minimum value is: " << min << std::endl;
std::cout << "The maximum value is: " << max << std::endl;
exit(0);
}
One thing that jumps out is that your shared memory size isn't set; that is, you declare your shared memory to be
extern __shared__ int sdata[];
but when you invoke your kernel your launch parameters are
gpuMedOdd<<<9, numEntries / 9>>>(...)
If you're setting your __shared__ memory to be extern, then it's expecting to get the number of bytes for shared memory as the 3rd kernel launch parameter. You should instead have
gpuMedOdd<<<9, numEntries / 9, smem_in_bytes>>>(...)
where smem_in_bytes is the size of shared memory for the kernel. If you don't specify a size, it'll default to 0. Hence in your current code, your __shared__ memory array sdata will be 0 bytes long.
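For this kernel, a sketch of computing that size (one int of shared storage per thread; the variable names are mine):
int threads = numEntries / 9;                  // threads per block, matching the launch above
size_t smem_in_bytes = threads * sizeof(int);  // sdata[] needs one int per thread
gpuMedOdd<<<9, threads, smem_in_bytes>>>(d_entries, d_med);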
EDIT: here's the link to the relevant part of the CUDA Programming Guide:
http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#execution-configuration
A problem I see in your code is that you seem to have your launch parameters reversed:
gpuMedOdd<<<16, numEntries / 16, 16 * sizeof(int)>>>(d_entries, d_med, numEntries);
I think you intended:
gpuMedOdd<<< numEntries/16, 16, 16 * sizeof(int)>>>(d_entries, d_med, numEntries);
The first launch parameter is blocks per grid. The second launch parameter is threads per block. Here I'm assuming you wanted to launch 16 threads per block. If in fact your intent was to launch a fixed number of blocks (16) and have the threads per block vary based on input size, then I think this is not typical of good cuda coding, and it will blow up if your input size gets too large, because you will exceed the max threads per block limit. Also, since your shared memory allocation is fixed (64 bytes), I assume you had intended a fixed number of threads per block.
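As a sketch of what I mean (my own illustration, rounding the block count up so every entry is covered):
int threads = 16;                                    // fixed threads per block
int blocks  = (numEntries + threads - 1) / threads;  // round up
gpuMedOdd<<<blocks, threads, threads * sizeof(int)>>>(d_entries, d_med, numEntries);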
Another suggestion I have is that rather than just reporting "CUDA Runtime Error" you should parse the error code returned. Take a look at the example link I already mentioned.

Cannot read out Values from Texture Memory

Hi, I'm writing a simple program to practice working with texture memory. I just want to write my data into texture memory and write it back into global memory, but I cannot read out the values. Here is the code.
#include <stdio.h>
#include <iostream>
#include "cuda.h"
#include <stdlib.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "HelloWorld.h"
#include "linearInterpolation_kernel4.cu"
using namespace std;
using std::cout;
const int blocksize = 16;
__global__
void hello(char *a, int *b) {
a[threadIdx.x] += b[threadIdx.x];
}
////////////////////////////////////////////////////////////////////////////////
// These are CUDA Helper functions
// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)
inline void __checkCudaErrors( cudaError err, const char *file, const int line )
{
if( cudaSuccess != err) {
printf("%s(%i) : CUDA Runtime API error %d: %s.\n",file, line, (int)err, cudaGetErrorString( err ) );
}
}
// This will output the proper error string when calling cudaGetLastError
#define getLastCudaError(msg) __getLastCudaError (msg, __FILE__, __LINE__)
inline void __getLastCudaError( const char *errorMessage, const char *file, const int line )
{
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err) {
printf("%s(%i) : getLastCudaError() CUDA error : %s : (%d) %s.\n", file, line, errorMessage, (int)err, cudaGetErrorString( err ) );
}
}
int main()
{
int N = 40;
float *A;
A = (float *) malloc(N*sizeof(float));
float *B;
B = (float *) malloc(N*sizeof(float));
float *result;
result = (float *) malloc(N*sizeof(float));
float angle = 0.8f;
for(int i = 0; i < N; i++){
A[i] = i; //(float)rand();
B[i] = i+1; //(float)rand();
}
ipLinearTexture2(A,B,result,angle,N);
float result2;
result2 = (angle)*A[4] + (1-angle)*B[4];
printf(" A %f B %f Result %f\n", A[4], B[4], result[4]);
cout << result2 << endl;
return 1;
}
void ipLinearTexture2(float *A, float* B, float* result, float angle, int N)
{
float cuTime;
int N2 = N * 2;
float *dev_result;
float **AB;
AB = (float **) malloc( N * sizeof(float *));
if(AB)
{
for(int i = 0; i < N; i++)
{
AB[i] = (float *) malloc( 2 * sizeof(float *));
}
}
for (int i = 0; i < N; i++)
{
AB[i][0] = A[i];
AB[i][1] = B[i];
}
cudaMalloc(&dev_result, N * sizeof(float));
unsigned int size = N2 * sizeof(float);
//cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
cudaArray* cu_array;
checkCudaErrors(cudaMallocArray( &cu_array, &channelDesc,N,2));
cudaMemcpy2DToArray(cu_array,0,0,AB,N * sizeof(float), N * sizeof(float), 2, cudaMemcpyHostToDevice);
// set texture parameters
tex2.normalized = true;
tex2.filterMode = cudaFilterModeLinear;
tex2.addressMode[0] = cudaAddressModeWrap; //cudaAddressModeWrap;
tex2.addressMode[1] = cudaAddressModeWrap; //cudaAddressModeClamp;
checkCudaErrors(cudaBindTextureToArray( tex2, cu_array, channelDesc));
dim3 dimBlock(10, 1, 1);
dim3 dimGrid((int)ceil((double)N*2/dimBlock.x), 1, 1);
transformKernel4<<< 256, 256, 0 >>>( dev_result, N, 2, angle);
checkCudaErrors(cudaMemcpy(result, dev_result, N * sizeof(float), cudaMemcpyDeviceToHost));
cout << "==================================================" << endl;
for (int i = 0 ; i < N ;i++)
{
cout << result[i] << " on " << i << endl;
}
cout << "==================================================" << endl;
checkCudaErrors(cudaUnbindTexture(tex));
checkCudaErrors(cudaFree(dev_result));
checkCudaErrors(cudaFreeArray(cu_array));
}
and here is the kernel code
#ifndef _SIMPLETEXTURE_KERNEL5_H_
#define _SIMPLETEXTURE_KERNEL5_H_
// Texture references
texture<float, 2, cudaReadModeElementType> tex2;
__global__ void
transformKernel4(float* g_odata, int width, int height, float theta)
{
unsigned int xid = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int yid = blockIdx.y * blockDim.y + threadIdx.y;
if (xid >= width || yid >= height) return;
float dx = 1.0f / (float)width;
float dy = 1.0f / (float)height;
float x = ((float)xid + 0.5f) * dx;
float y = ((float)yid + 0.5f) * dy;
float value = tex2D(tex2, x , y);
printf("wert %f xid %i yid %i \n",value, xid, yid);
g_odata[yid * width + xid] = value;
}
#endif // #ifndef _SIMPLETEXTURE_KERNEL_H_
Can somebody tell me what I am doing wrong?
I have edited it to remove the first 2 logical mistakes. But why am I still not able to read out my data?
It was the wrong binding of the arrays. You cannot copy a multidimensional C array (an array of row pointers) this way; you have to use a one-dimensional array that represents the multidimensional data.
I can see 2 logical errors here.
The first one is the one pointed out by #asm.
The output should be stored by calculating a linear index from the 2D x and y indices.
outputIndex = yid * width + xid;
The second one is that the memory allocation for the cudaArray structure is internally aligned.
You should consider using the cudaMemcpy2DToArray function to avoid erroneous data copying.
cudaMemcpy2DToArray(cu_array,0,0,AB,N * sizeof(float), N * sizeof(float), 2, cudaMemcpyHostToDevice);
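Since the host data passed to cudaMemcpy2DToArray has to be one contiguous block (as noted above, an array of row pointers cannot be copied this way), a sketch of preparing it could look like the following; ABflat and the row-0 = A / row-1 = B layout are my own assumptions about the intended arrangement:
float *ABflat = (float *) malloc(2 * N * sizeof(float));
for (int i = 0; i < N; i++) {
    ABflat[i]     = A[i];   // row 0 of the 2D texture holds A
    ABflat[N + i] = B[i];   // row 1 holds B
}
cudaMemcpy2DToArray(cu_array, 0, 0, ABflat,
                    N * sizeof(float),     // source pitch: bytes per row of ABflat
                    N * sizeof(float), 2,  // copy width in bytes, height in rows
                    cudaMemcpyHostToDevice);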