I'm working on a cuda program to process a 2D image.
The problem is when I try to access blockDim.x and blockId.x, the kernel always failed to launch and output unknown error.
Besides, if I use a 3x5 image, I can access the threadId.x, while I use a 2048x2048 image I can't.
My kernel code runs OK when I use PyCuda, but now I have to switch to cuda C.
I think the problem may be related to
the way I pass the array pointer and there's something wrong with cudaMalloc
the configuration with my block size and grid size( but the same configuration works well in PyCuda so I don't know how to correct it).
And I use cuda-memcheck, I got unknown error 30 and I googled for solutions but no helpful information.
__global__ void extractor(const unsigned char* in, unsigned char* out, int* debug)
{
int idx = (threadIdx.x) + blockDim.x * blockIdx.x ;
debug[idx] = threadIdx.x; // debug variable is used for debugging
}
int main(int arg, char* args[])
{
// ...
int size = w*h; // w is image width and h is image height
unsigned char *in = 0;
unsigned char *out = 0;
int* debug = 0;
// Allocate GPU buffers for the images
cudaMalloc((void**)&in, size * sizeof(unsigned char));
cudaMalloc((void**)&out, num_sample_per_point * size * sizeof(unsigned char));
cudaMalloc((void**)&debug, size * sizeof(int));
// Copy image data from host memory to GPU buffers.
cudaMemcpy(in, &img_data[0], size * sizeof(unsigned char),cudaMemcpyHostToDevice);
dim3 b_dim(BLOCK_SIZE, 1, 1); // (1024, 1, 1)
dim3 g_dim(int(w*h/BLOCK_SIZE)+1, 1, 1); // (4097, 1, 1)
extractor<<<g_dim, b_dim>>>(in, out, debug);
// clean up code and processing result
}
Now I can't get expected index so I can't do processing in the kernel, what can be the problem?
EDIT
I want to use 1D index, which means I assume the image array is a "flattened" 1D array and do indexing.
EDIT
After I added the thread check, there's still something wrong.
__global__ void extractor(const unsigned char* in, unsigned char* out, int* debug)
{
int idx = (threadIdx.x) + blockDim.x * blockIdx.x ;
int y; int x;
int temp_x; int temp_y; int temp_idx;
int check = width*height;
if (idx < check) {
debug[0] = 1; // get kernel launch failed "unknown error"
}
}
I've tried to put the debug[0]=1; expression both in the thread check block and out the block, both of them get the same error.
So I doubt the memalloc is not been done correctly?
BTW, I used nvprof and it said
=22344== Warning: Found 2 invalid records in the result.
==22344== Warning: This can happen if device ran out of memory or if a device kernel was stopped due to an assertion.
EDIT
complete code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cmath>
#include <iostream>
#include "PNG.h"
#define L 3
#define INC1 1
#define INC2 1
#define R_IN 2
#define N_P 4
#define BLOCK_SIZE 1024
#define PI 3.14159265358979323846
using namespace std;
__global__ void extractor(const unsigned char* in, unsigned char* out, int* debug, int* disX, int* disY, int width, int height, int pad, int num_sample)
{
int idx = (threadIdx.x) + blockDim.x * blockIdx.x ;
int y; int x;
int temp_x; int temp_y; int temp_idx;
int check = width*height;
if (idx < check) {
debug[idx] = threadIdx.x;
y = idx/width;
x = idx%width;
if ((x < pad) || (x >= (width-pad)) || (y < pad) || (y >= (height-pad))) {
// need padding
for (int i = 0; i < num_sample; ++i){
temp_x = x + disX[i];
temp_y = y + disY[i];
if (!((temp_x < 0)||(temp_x > (width-1)) || (temp_y < 0) ||(temp_y>(height-1)))) {
temp_idx = temp_y*width + temp_x; // sampled index
out[(idx*num_sample)+i] = in[temp_idx]; // copy sampled value to result
}
}
} else {
for (int i = 0; i < num_sample; ++i)
{
temp_x = x + disX[i];
temp_y = y + disY[i];
temp_idx = temp_y*width + temp_x; // sampled index
out[(idx*num_sample)+i] = in[temp_idx]; // copy sampled value to result
}
}
}
}
vector<int> getCirclePos() {
int r = 0;
vector <int> circlePos;
while (!(r>(L/2))) {
circlePos.push_back(r);
if (r < R_IN) r += INC1;
else r += INC2;
}
cout << "circlePos:" << endl;
for (auto i = circlePos.begin(); i != circlePos.end(); ++i)
{cout << *i << ' ';}
cout << endl;
return circlePos;
}
int main(int arg, char* args[])
{
cudaError_t cudaStatus;
vector<int> circlePos = getCirclePos();
// get disX, disY
int num_sample_per_point = circlePos.size() * N_P;
int* disX = new int[num_sample_per_point];
int* disY = new int[num_sample_per_point];
int r; int cnt = 0;
for (int i = 0; i < circlePos.size(); ++i)
{
r = circlePos[i];
float angle;
for (int j = 0; j < N_P; ++j)
{
angle = j*360.0/N_P;
disX[cnt] = r*cos(angle*M_PI/180.0);
disY[cnt] = r*sin(angle*M_PI/180.0);
// cout nvpro << disX[cnt] << "|" << disY[cnt]<< endl;
cnt++;
}
}
PNG inPng("test.png");
// PNG outPng;
// outPng.Create(inPng.w, inPng.h);
//store width and height so we can use them for our output image later
const unsigned int w = inPng.w;
const unsigned int h = inPng.h;
cout << "w: " << w << " h: " << h << endl;
//4 because there are 4 color channels R, G, B, and A
int size = w * h;
unsigned char *in = 0;
unsigned char *out = 0;
int* debug = 0;
// Allocate GPU buffers for the images
cudaMalloc((void**)&in, size * sizeof(unsigned char));
cudaMalloc((void**)&out, num_sample_per_point * size * sizeof(unsigned char));
cudaMalloc((void**)&debug, size * sizeof(int));
vector<unsigned char> img_data;
for (int i = 0; i < size; ++i)
{
img_data.push_back(inPng.data[i*4]);
}
// debug
cout << "========= img_data ==========" << endl;
for (int i = 0; i < size; ++i)
{
cout << int(img_data[i]) << "," ;
}
cout << endl;
// Copy image data from host memory to GPU buffers.
cudaMemcpy(in, &img_data[0], size * sizeof(unsigned char), cudaMemcpyHostToDevice);
//free the input image because we do not need it anymore
inPng.Free();
// Launch a kernel on the GPU with one thread for each element.
dim3 b_dim(BLOCK_SIZE, 1, 1); // (1024, 1, 1)
dim3 g_dim(int(w*h/BLOCK_SIZE)+1, 1, 1); // (4097, 1, 1)
int pad = L/2;
// __global__ void extractor(const unsigned char* in, unsigned char* out, vector<int> disX, vector<int> disY, int width, int height, int pad, int num_sample)
extractor<<<g_dim, b_dim>>>(in, out, debug, disX, disY, w, h, pad, num_sample_per_point);
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
{
std::cout << "Kernel launch failed: " << cudaGetErrorString(cudaStatus) << std::endl;
cudaFree(in);
cudaFree(out);
cudaFree(debug);
exit(1);
}
auto tmp = new unsigned char[size*num_sample_per_point];
auto tmp_debug = new int [size];
cudaMemcpy(tmp_debug, debug, size * sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(tmp, out, num_sample_per_point * size * sizeof(unsigned char), cudaMemcpyDeviceToHost);
cout << "========= out =========" << endl;
for (int i = 0; i < size*num_sample_per_point; ++i)
{
cout << int(tmp[i]) << ", ";
}
cout << endl;
cout << "========debug=======" << endl;
for (int i = 0; i < size; ++i)
{
cout << tmp_debug[i] << ", ";
}
cout << endl;
cudaFree(in);
cudaFree(out);
cudaFree(debug);
delete[] tmp; delete[] tmp_debug;
return 0;
}
This (according to your comment) is defining 1024 threads per block:
dim3 b_dim(BLOCK_SIZE, 1, 1); // (1024, 1, 1)
According to your question text, w and h are each 2048 in the failing case, so this:
dim3 g_dim(int(w*h/BLOCK_SIZE)+1, 1, 1); // (4097, 1, 1)
is creating 4097 blocks, just as you indicate in your comment.
4097 blocks of 1024 threads each is 4195328 threads total, but your allocation sizes are only providing 2048*2048 elements, or 4194304 elements total. So you are launching 4195328 threads with only 4194304 elements, leaving 1024 threads left over.
So what do those 1024 extra threads do? They still run the kernel code and attempt to access your debug array beyond the end of the allocated space.
This results in undefined behavior in C and in C++.
The customary method to fix this is to pass the problem size to your kernel and add a "thread check" in your kernel code, like this:
__global__ void extractor(const unsigned char* in, unsigned char* out, int* debug, int n)
{
int idx = (threadIdx.x) + blockDim.x * blockIdx.x ;
if (idx < n)
debug[idx] = threadIdx.x; // debug variable is used for debugging
}
which prevents the "extra" threads from doing anything.
If you search here on the cuda tag for "thread check" you will find many other examples of questions like this.
As an example, based on the code pieces you have shown, the following runs without error for me:
$ cat t147.cu
const int width = 2048;
const int height = 2048;
const int BLOCK_SIZE = 1024;
__global__ void extractor(const unsigned char* in, unsigned char* out, int* debug)
{
int idx = (threadIdx.x) + blockDim.x * blockIdx.x ;
// int y; int x;
// int temp_x; int temp_y; int temp_idx;
int check = width*height;
if (idx < check) {
debug[idx] = 1; // get kernel launch failed "unknown error"
}
}
int main(int arg, char* args[])
{
const int w = width;
const int h = height;
const int num_sample_per_point = 1;
int size = w*h; // w is image width and h is image height
unsigned char *in = 0;
unsigned char *out = 0;
int* debug = 0;
// Allocate GPU buffers for the images
cudaMalloc((void**)&in, size * sizeof(unsigned char));
cudaMalloc((void**)&out, num_sample_per_point * size * sizeof(unsigned char));
cudaMalloc((void**)&debug, size * sizeof(int));
// Copy image data from host memory to GPU buffers.
// cudaMemcpy(in, &img_data[0], size * sizeof(unsigned char),cudaMemcpyHostToDevice);
dim3 b_dim(BLOCK_SIZE, 1, 1); // (1024, 1, 1)
dim3 g_dim(int(w*h/BLOCK_SIZE)+1, 1, 1); // (4097, 1, 1)
extractor<<<g_dim, b_dim>>>(in, out, debug);
cudaDeviceSynchronize();
}
$ nvcc -arch=sm_61 -o t147 t147.cu
$ cuda-memcheck ./t147
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$
In your complete code, you simply have an illegal access problem in your kernel. I've modified it to remove the dependency on PNG, and if we omit the kernel code other than the debug setting, it runs fine. However if we include your kernel code, and run with cuda-memcheck we get all sorts of out-of-bounds accesses. In the future, you could use the method described here to debug these:
$ cat t146.cu
#include <cmath>
#include <iostream>
#include <vector>
#define L 3
#define INC1 1
#define INC2 1
#define R_IN 2
#define N_P 4
#define BLOCK_SIZE 1024
#define PI 3.14159265358979323846
using namespace std;
__global__ void extractor(const unsigned char* in, unsigned char* out, int* debug, int* disX, int* disY, int width, int height, int pad, int num_sample)
{
int idx = (threadIdx.x) + blockDim.x * blockIdx.x ;
int y; int x;
int temp_x; int temp_y; int temp_idx;
int check = width*height;
if (idx < check) {
debug[idx] = threadIdx.x;
y = idx/width;
x = idx%width;
#ifdef FAIL
if ((x < pad) || (x >= (width-pad)) || (y < pad) || (y >= (height-pad))) {
// need padding
for (int i = 0; i < num_sample; ++i){
temp_x = x + disX[i];
temp_y = y + disY[i];
if (!((temp_x < 0)||(temp_x > (width-1)) || (temp_y < 0) ||(temp_y>(height-1)))) {
temp_idx = temp_y*width + temp_x; // sampled index
out[(idx*num_sample)+i] = in[temp_idx]; // copy sampled value to result
}
}
} else {
for (int i = 0; i < num_sample; ++i)
{
temp_x = x + disX[i];
temp_y = y + disY[i];
temp_idx = temp_y*width + temp_x; // sampled index
out[(idx*num_sample)+i] = in[temp_idx]; // copy sampled value to result
}
}
#endif
}
}
vector<int> getCirclePos() {
int r = 0;
vector <int> circlePos;
while (!(r>(L/2))) {
circlePos.push_back(r);
if (r < R_IN) r += INC1;
else r += INC2;
}
cout << "circlePos:" << endl;
for (auto i = circlePos.begin(); i != circlePos.end(); ++i)
{//cout << *i << ' ';
}
cout << endl;
return circlePos;
}
int main(int arg, char* args[])
{
cudaError_t cudaStatus;
vector<int> circlePos = getCirclePos();
// get disX, disY
int num_sample_per_point = circlePos.size() * N_P;
int* disX = new int[num_sample_per_point];
int* disY = new int[num_sample_per_point];
int r; int cnt = 0;
for (int i = 0; i < circlePos.size(); ++i)
{
r = circlePos[i];
float angle;
for (int j = 0; j < N_P; ++j)
{
angle = j*360.0/N_P;
disX[cnt] = r*cos(angle*M_PI/180.0);
disY[cnt] = r*sin(angle*M_PI/180.0);
// cout nvpro << disX[cnt] << "|" << disY[cnt]<< endl;
cnt++;
}
}
const unsigned int w = 2048;
const unsigned int h = 2048;
cout << "w: " << w << " h: " << h << endl;
//4 because there are 4 color channels R, G, B, and A
int size = w * h;
unsigned char *in = 0;
unsigned char *out = 0;
int* debug = 0;
// Allocate GPU buffers for the images
cudaMalloc((void**)&in, size * sizeof(unsigned char));
cudaMalloc((void**)&out, num_sample_per_point * size * sizeof(unsigned char));
cudaMalloc((void**)&debug, size * sizeof(int));
vector<unsigned char> img_data;
for (int i = 0; i < size; ++i)
{
img_data.push_back(0);
}
// debug
cout << "========= img_data ==========" << endl;
for (int i = 0; i < size; ++i)
{
// cout << int(img_data[i]) << "," ;
}
cout << endl;
// Copy image data from host memory to GPU buffers.
cudaMemcpy(in, &img_data[0], size * sizeof(unsigned char), cudaMemcpyHostToDevice);
// Launch a kernel on the GPU with one thread for each element.
dim3 b_dim(BLOCK_SIZE, 1, 1); // (1024, 1, 1)
dim3 g_dim(int(w*h/BLOCK_SIZE)+1, 1, 1); // (4097, 1, 1)
int pad = L/2;
// __global__ void extractor(const unsigned char* in, unsigned char* out, vector<int> disX, vector<int> disY, int width, int height, int pad, int num_sample)
extractor<<<g_dim, b_dim>>>(in, out, debug, disX, disY, w, h, pad, num_sample_per_point);
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
{
std::cout << "Kernel launch failed: " << cudaGetErrorString(cudaStatus) << std::endl;
cudaFree(in);
cudaFree(out);
cudaFree(debug);
exit(1);
}
auto tmp = new unsigned char[size*num_sample_per_point];
auto tmp_debug = new int [size];
cudaMemcpy(tmp_debug, debug, size * sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(tmp, out, num_sample_per_point * size * sizeof(unsigned char), cudaMemcpyDeviceToHost);
cout << "========= out =========" << endl;
for (int i = 0; i < size*num_sample_per_point; ++i)
{
// cout << int(tmp[i]) << ", ";
}
cout << endl;
cout << "========debug=======" << endl;
for (int i = 0; i < size; ++i)
{
// cout << tmp_debug[i] << ", ";
}
cout << endl;
cudaFree(in);
cudaFree(out);
cudaFree(debug);
delete[] tmp; delete[] tmp_debug;
return 0;
}
$ nvcc -std=c++11 -o t146 t146.cu -arch=sm_61 -lineinfo
t146.cu(18): warning: variable "y" was set but never used
t146.cu(18): warning: variable "x" was set but never used
t146.cu(19): warning: variable "temp_x" was declared but never referenced
t146.cu(19): warning: variable "temp_y" was declared but never referenced
t146.cu(19): warning: variable "temp_idx" was declared but never referenced
t146.cu(18): warning: variable "y" was set but never used
t146.cu(18): warning: variable "x" was set but never used
t146.cu(19): warning: variable "temp_x" was declared but never referenced
t146.cu(19): warning: variable "temp_y" was declared but never referenced
t146.cu(19): warning: variable "temp_idx" was declared but never referenced
$ cuda-memcheck ./t146
========= CUDA-MEMCHECK
circlePos:
w: 2048 h: 2048
========= img_data ==========
========= out =========
========debug=======
========= ERROR SUMMARY: 0 errors
$ nvcc -std=c++11 -o t146 t146.cu -arch=sm_61 -lineinfo -DFAIL
$ cuda-memcheck ./t146
...
========= Invalid __global__ read of size 4
========= at 0x00000418 in /home/ubuntu/bobc/misc/t146.cu:41:extractor(unsigned char const *, unsigned char*, int*, int*, int*, int, int, int, int)
========= by thread (197,0,0) in block (17,0,0)
========= Address 0x00c8b290 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x2c5)
...
(and much more output like this)
The above output points to line 41 in the code, which is reading from disX.
As it turns out, your disX is a host-allocated variable:
int* disX = new int[num_sample_per_point];
but you are attempting to pass it to device code:
extractor<<<g_dim, b_dim>>>(in, out, debug, disX, disY, w, h, pad, num_sample_per_point);
^^^^
That is just completely broken. You can't do that in CUDA. You need to make a device copy of that variable, and also disY When I fix that problem, the modified code runs without error for me:
$ cat t146.cu
#include <cmath>
#include <iostream>
#include <vector>
#define L 3
#define INC1 1
#define INC2 1
#define R_IN 2
#define N_P 4
#define BLOCK_SIZE 1024
#define PI 3.14159265358979323846
using namespace std;
__global__ void extractor(const unsigned char* in, unsigned char* out, int* debug, int* disX, int* disY, int width, int height, int pad, int num_sample)
{
int idx = (threadIdx.x) + blockDim.x * blockIdx.x ;
int y; int x;
int temp_x; int temp_y; int temp_idx;
int check = width*height;
if (idx < check) {
debug[idx] = threadIdx.x;
y = idx/width;
x = idx%width;
#ifdef FAIL
if ((x < pad) || (x >= (width-pad)) || (y < pad) || (y >= (height-pad))) {
// need padding
for (int i = 0; i < num_sample; ++i){
temp_x = x + disX[i];
temp_y = y + disY[i];
if (!((temp_x < 0)||(temp_x > (width-1)) || (temp_y < 0) ||(temp_y>(height-1)))) {
temp_idx = temp_y*width + temp_x; // sampled index
out[(idx*num_sample)+i] = in[temp_idx]; // copy sampled value to result
}
}
} else {
for (int i = 0; i < num_sample; ++i)
{
temp_x = x + disX[i];
temp_y = y + disY[i];
temp_idx = temp_y*width + temp_x; // sampled index
out[(idx*num_sample)+i] = in[temp_idx]; // copy sampled value to result
}
}
#endif
}
}
vector<int> getCirclePos() {
int r = 0;
vector <int> circlePos;
while (!(r>(L/2))) {
circlePos.push_back(r);
if (r < R_IN) r += INC1;
else r += INC2;
}
cout << "circlePos:" << endl;
for (auto i = circlePos.begin(); i != circlePos.end(); ++i)
{//cout << *i << ' ';
}
cout << endl;
return circlePos;
}
int main(int arg, char* args[])
{
cudaError_t cudaStatus;
vector<int> circlePos = getCirclePos();
// get disX, disY
int num_sample_per_point = circlePos.size() * N_P;
int* disX = new int[num_sample_per_point];
int* disY = new int[num_sample_per_point];
int r; int cnt = 0;
for (int i = 0; i < circlePos.size(); ++i)
{
r = circlePos[i];
float angle;
for (int j = 0; j < N_P; ++j)
{
angle = j*360.0/N_P;
disX[cnt] = r*cos(angle*M_PI/180.0);
disY[cnt] = r*sin(angle*M_PI/180.0);
// cout nvpro << disX[cnt] << "|" << disY[cnt]<< endl;
cnt++;
}
}
int *d_disX, *d_disY;
cudaMalloc(&d_disX, num_sample_per_point*sizeof(int));
cudaMalloc(&d_disY, num_sample_per_point*sizeof(int));
cudaMemcpy(d_disX, disX, num_sample_per_point*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_disY, disY, num_sample_per_point*sizeof(int), cudaMemcpyHostToDevice);
const unsigned int w = 2048;
const unsigned int h = 2048;
cout << "w: " << w << " h: " << h << endl;
//4 because there are 4 color channels R, G, B, and A
int size = w * h;
unsigned char *in = 0;
unsigned char *out = 0;
int* debug = 0;
// Allocate GPU buffers for the images
cudaMalloc((void**)&in, size * sizeof(unsigned char));
cudaMalloc((void**)&out, num_sample_per_point * size * sizeof(unsigned char));
cudaMalloc((void**)&debug, size * sizeof(int));
vector<unsigned char> img_data;
for (int i = 0; i < size; ++i)
{
img_data.push_back(0);
}
// debug
cout << "========= img_data ==========" << endl;
for (int i = 0; i < size; ++i)
{
// cout << int(img_data[i]) << "," ;
}
cout << endl;
// Copy image data from host memory to GPU buffers.
cudaMemcpy(in, &img_data[0], size * sizeof(unsigned char), cudaMemcpyHostToDevice);
// Launch a kernel on the GPU with one thread for each element.
dim3 b_dim(BLOCK_SIZE, 1, 1); // (1024, 1, 1)
dim3 g_dim(int(w*h/BLOCK_SIZE)+1, 1, 1); // (4097, 1, 1)
int pad = L/2;
// __global__ void extractor(const unsigned char* in, unsigned char* out, vector<int> disX, vector<int> disY, int width, int height, int pad, int num_sample)
extractor<<<g_dim, b_dim>>>(in, out, debug, d_disX, d_disY, w, h, pad, num_sample_per_point);
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
{
std::cout << "Kernel launch failed: " << cudaGetErrorString(cudaStatus) << std::endl;
cudaFree(in);
cudaFree(out);
cudaFree(debug);
exit(1);
}
auto tmp = new unsigned char[size*num_sample_per_point];
auto tmp_debug = new int [size];
cudaMemcpy(tmp_debug, debug, size * sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(tmp, out, num_sample_per_point * size * sizeof(unsigned char), cudaMemcpyDeviceToHost);
cout << "========= out =========" << endl;
for (int i = 0; i < size*num_sample_per_point; ++i)
{
// cout << int(tmp[i]) << ", ";
}
cout << endl;
cout << "========debug=======" << endl;
for (int i = 0; i < size; ++i)
{
// cout << tmp_debug[i] << ", ";
}
cout << endl;
cudaFree(in);
cudaFree(out);
cudaFree(debug);
delete[] tmp; delete[] tmp_debug;
return 0;
}
$ nvcc -std=c++11 -o t146 t146.cu -arch=sm_61 -lineinfo -DFAIL
$ cuda-memcheck ./t146
========= CUDA-MEMCHECK
circlePos:
w: 2048 h: 2048
========= img_data ==========
========= out =========
========debug=======
========= ERROR SUMMARY: 0 errors
$
Related
I have been using the code sample supplied by Robert Crovella:
thrust::max_element slow in comparison cublasIsamax - More efficient implementation?
Which is a very fast reduction code. I modified it to also return the index of the max in the input array of floats. When I use it in my code, it will only execute one time. If I try calling the routine again it does not find a new max value, it just returns the previous max. Is there something about the volatile global memory that the routine uses that needs to be reset before it can be called again?
#include <cuda.h>
#include <cublas_v2.h>
#include <thrust/extrema.h>
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <stdio.h>
#include <stdlib.h>
#define DSIZE 4096*4 // nTPB should be a power-of-2
#define nTPB 512
#define MAX_KERNEL_BLOCKS 30
#define MAX_BLOCKS ((DSIZE/nTPB)+1)
#define MIN(a,b) ((a>b)?b:a)
#define FLOAT_MIN -1.0f
#include <helper_functions.h>
#include <helper_cuda.h>
// this code has been modified to return the index of the max instead of the actual max value - for my application
__device__ volatile float blk_vals[MAX_BLOCKS];
__device__ volatile int blk_idxs[MAX_BLOCKS];
__device__ int blk_num = 0;
//template <typename T>
__global__ void max_idx_kernel(const float *data, const int dsize, int *result){
__shared__ volatile float vals[nTPB];
__shared__ volatile int idxs[nTPB];
__shared__ volatile int last_block;
int idx = threadIdx.x+blockDim.x*blockIdx.x;
last_block = 0;
float my_val = FLOAT_MIN;
int my_idx = -1;
// sweep from global memory
while (idx < dsize){
if (data[idx] > my_val) {my_val = data[idx]; my_idx = idx;}
idx += blockDim.x*gridDim.x;}
// populate shared memory
vals[threadIdx.x] = my_val;
idxs[threadIdx.x] = my_idx;
__syncthreads();
// sweep in shared memory
for (int i = (nTPB>>1); i > 0; i>>=1){
if (threadIdx.x < i)
if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }
__syncthreads();}
// perform block-level reduction
if (!threadIdx.x){
blk_vals[blockIdx.x] = vals[0];
blk_idxs[blockIdx.x] = idxs[0];
if (atomicAdd(&blk_num, 1) == gridDim.x - 1) // then I am the last block
last_block = 1;}
__syncthreads();
if (last_block){
idx = threadIdx.x;
my_val = FLOAT_MIN;
my_idx = -1;
while (idx < gridDim.x){
if (blk_vals[idx] > my_val) {my_val = blk_vals[idx]; my_idx = blk_idxs[idx]; }
idx += blockDim.x;}
// populate shared memory
vals[threadIdx.x] = my_val;
idxs[threadIdx.x] = my_idx;
__syncthreads();
// sweep in shared memory
for (int i = (nTPB>>1); i > 0; i>>=1){
if (threadIdx.x < i)
if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }
__syncthreads();}
if (!threadIdx.x)
*result = idxs[0];
}
}
int main(){
int nrElements = DSIZE;
float *d_vector, *h_vector;
StopWatchInterface *hTimer = NULL;
sdkCreateTimer(&hTimer);
double gpuTime;
int k;
int max_index;
int *d_max_index;
cudaMalloc(&d_max_index, sizeof(int));
h_vector = new float[DSIZE];
for(k=0; k < 5; k++){
for (int i = 0; i < DSIZE; i++) h_vector[i] = rand()/(float)RAND_MAX;
h_vector[10+k] = 10; // create definite max element that changes with each loop iteration
cublasHandle_t my_handle;
cublasStatus_t my_status = cublasCreate(&my_handle);
cudaMalloc(&d_vector, DSIZE*sizeof(float));
cudaMemcpy(d_vector, h_vector, DSIZE*sizeof(float), cudaMemcpyHostToDevice);
max_index = 0;
sdkResetTimer(&hTimer);
sdkStartTimer(&hTimer);
//d_vector is a pointer on the device pointing to the beginning of the vector, containing nrElements floats.
thrust::device_ptr<float> d_ptr = thrust::device_pointer_cast(d_vector);
thrust::device_vector<float>::iterator d_it = thrust::max_element(d_ptr, d_ptr + nrElements);
max_index = d_it - (thrust::device_vector<float>::iterator)d_ptr;
cudaDeviceSynchronize();
gpuTime = sdkGetTimerValue(&hTimer);
std::cout << "loop: " << k << " thrust time: " << gpuTime << " max index: " << max_index << std::endl;
max_index = 0;
sdkResetTimer(&hTimer);
sdkStartTimer(&hTimer);
my_status = cublasIsamax(my_handle, DSIZE, d_vector, 1, &max_index);
cudaDeviceSynchronize();
gpuTime = sdkGetTimerValue(&hTimer);
std::cout << "loop: " << k << " cublas time: " << gpuTime << " max index: " << max_index-1 << std::endl;
max_index = 0;
sdkResetTimer(&hTimer);
sdkStartTimer(&hTimer);
max_idx_kernel<<<MIN(MAX_KERNEL_BLOCKS, ((DSIZE+nTPB-1)/nTPB)), nTPB>>>(d_vector, DSIZE, d_max_index);
cudaMemcpy(&max_index, d_max_index, sizeof(int), cudaMemcpyDeviceToHost);
gpuTime = sdkGetTimerValue(&hTimer);
std::cout << "loop: " << k << " idx kern time: " << gpuTime << " max index: " << max_index << std::endl;
std::cout << std::endl;
} // end for loop on k
cudaFree(d_max_index);
cudaFree(d_vector);
return 0;
}
The primary issue in re-using this code for multiple loops as-is is in this static initialization of a device (global) variable:
__device__ int blk_num = 0;
That's OK if you're only going to run the routine once. But if you intend to re-use it, you will need to re-initialize this variable to zero before each call to the kernel.
We could fix this by putting an explicit initialization of this variable to zero before each call to the reduction kernel:
cudaMemcpyToSymbol(blk_num, &max_index, sizeof(int));
(I'm using max_index here simply because it is a convenient host int variable that has just been set to zero.)
That's the only change needed to get the code "working".
However the introduction of the loop has created some other "issues" that I would point out. These 3 lines of code:
cublasHandle_t my_handle;
cublasStatus_t my_status = cublasCreate(&my_handle);
cudaMalloc(&d_vector, DSIZE*sizeof(float));
don't belong inside the for-loop on k. That is effectively creating a memory leak and unnecessarily re-initializing the cublas library.
The following code has those changes and seems to work for me:
$ cat t1183.cu
#include <cuda.h>
#include <cublas_v2.h>
#include <thrust/extrema.h>
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <stdio.h>
#include <stdlib.h>
#define DSIZE 4096*4 // nTPB should be a power-of-2
#define nTPB 512
#define MAX_KERNEL_BLOCKS 30
#define MAX_BLOCKS ((DSIZE/nTPB)+1)
#define MIN(a,b) ((a>b)?b:a)
#define FLOAT_MIN -1.0f
#include <helper_functions.h>
#include <helper_cuda.h>
// this code has been modified to return the index of the max instead of the actual max value - for my application
__device__ volatile float blk_vals[MAX_BLOCKS];
__device__ volatile int blk_idxs[MAX_BLOCKS];
__device__ int blk_num;
//template <typename T>
__global__ void max_idx_kernel(const float *data, const int dsize, int *result){
__shared__ volatile float vals[nTPB];
__shared__ volatile int idxs[nTPB];
__shared__ volatile int last_block;
int idx = threadIdx.x+blockDim.x*blockIdx.x;
last_block = 0;
float my_val = FLOAT_MIN;
int my_idx = -1;
// sweep from global memory
while (idx < dsize){
if (data[idx] > my_val) {my_val = data[idx]; my_idx = idx;}
idx += blockDim.x*gridDim.x;}
// populate shared memory
vals[threadIdx.x] = my_val;
idxs[threadIdx.x] = my_idx;
__syncthreads();
// sweep in shared memory
for (int i = (nTPB>>1); i > 0; i>>=1){
if (threadIdx.x < i)
if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }
__syncthreads();}
// perform block-level reduction
if (!threadIdx.x){
blk_vals[blockIdx.x] = vals[0];
blk_idxs[blockIdx.x] = idxs[0];
if (atomicAdd(&blk_num, 1) == gridDim.x - 1) // then I am the last block
last_block = 1;}
__syncthreads();
if (last_block){
idx = threadIdx.x;
my_val = FLOAT_MIN;
my_idx = -1;
while (idx < gridDim.x){
if (blk_vals[idx] > my_val) {my_val = blk_vals[idx]; my_idx = blk_idxs[idx]; }
idx += blockDim.x;}
// populate shared memory
vals[threadIdx.x] = my_val;
idxs[threadIdx.x] = my_idx;
__syncthreads();
// sweep in shared memory
for (int i = (nTPB>>1); i > 0; i>>=1){
if (threadIdx.x < i)
if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }
__syncthreads();}
if (!threadIdx.x)
*result = idxs[0];
}
}
int main(){
int nrElements = DSIZE;
float *d_vector, *h_vector;
StopWatchInterface *hTimer = NULL;
sdkCreateTimer(&hTimer);
double gpuTime;
int k;
int max_index;
int *d_max_index;
cudaMalloc(&d_max_index, sizeof(int));
h_vector = new float[DSIZE];
cublasHandle_t my_handle;
cublasStatus_t my_status = cublasCreate(&my_handle);
cudaMalloc(&d_vector, DSIZE*sizeof(float));
for(k=0; k < 5; k++){
for (int i = 0; i < DSIZE; i++) h_vector[i] = rand()/(float)RAND_MAX;
h_vector[10+k] = 10; // create definite max element that changes with each loop iteration
cudaMemcpy(d_vector, h_vector, DSIZE*sizeof(float), cudaMemcpyHostToDevice);
max_index = 0;
sdkResetTimer(&hTimer);
sdkStartTimer(&hTimer);
//d_vector is a pointer on the device pointing to the beginning of the vector, containing nrElements floats.
thrust::device_ptr<float> d_ptr = thrust::device_pointer_cast(d_vector);
thrust::device_vector<float>::iterator d_it = thrust::max_element(d_ptr, d_ptr + nrElements);
max_index = d_it - (thrust::device_vector<float>::iterator)d_ptr;
cudaDeviceSynchronize();
gpuTime = sdkGetTimerValue(&hTimer);
std::cout << "loop: " << k << " thrust time: " << gpuTime << " max index: " << max_index << std::endl;
max_index = 0;
sdkResetTimer(&hTimer);
sdkStartTimer(&hTimer);
my_status = cublasIsamax(my_handle, DSIZE, d_vector, 1, &max_index);
cudaDeviceSynchronize();
gpuTime = sdkGetTimerValue(&hTimer);
std::cout << "loop: " << k << " cublas time: " << gpuTime << " max index: " << max_index-1 << std::endl;
max_index = 0;
sdkResetTimer(&hTimer);
sdkStartTimer(&hTimer);
cudaMemcpyToSymbol(blk_num, &max_index, sizeof(int));
max_idx_kernel<<<MIN(MAX_KERNEL_BLOCKS, ((DSIZE+nTPB-1)/nTPB)), nTPB>>>(d_vector, DSIZE, d_max_index);
cudaMemcpy(&max_index, d_max_index, sizeof(int), cudaMemcpyDeviceToHost);
gpuTime = sdkGetTimerValue(&hTimer);
std::cout << "loop: " << k << " idx kern time: " << gpuTime << " max index: " << max_index << std::endl;
std::cout << std::endl;
} // end for loop on k
cudaFree(d_max_index);
cudaFree(d_vector);
return 0;
}
$ nvcc -I/usr/local/cuda/samples/common/inc t1183.cu -o t1183 -lcublas
$ cuda-memcheck ./t1183
========= CUDA-MEMCHECK
loop: 0 thrust time: 2.806 max index: 10
loop: 0 cublas time: 0.441 max index: 10
loop: 0 idx kern time: 0.395 max index: 10
loop: 1 thrust time: 1.298 max index: 11
loop: 1 cublas time: 0.419 max index: 11
loop: 1 idx kern time: 0.424 max index: 11
loop: 2 thrust time: 1.303 max index: 12
loop: 2 cublas time: 0.43 max index: 12
loop: 2 idx kern time: 0.419 max index: 12
loop: 3 thrust time: 1.291 max index: 13
loop: 3 cublas time: 0.423 max index: 13
loop: 3 idx kern time: 0.415 max index: 13
loop: 4 thrust time: 1.299 max index: 14
loop: 4 cublas time: 0.423 max index: 14
loop: 4 idx kern time: 0.417 max index: 14
========= ERROR SUMMARY: 0 errors
$
This question already has an answer here:
Unable to execute device kernel in CUDA
(1 answer)
Closed 7 years ago.
What I am attempting to do is Multiply Matrix A & Matrix B and then from the product matrix I get the index of the maximum value per column. But unfortunately, only the first 128*128 values of the matrix multiplication are correct while others are just garbage. I do not quite understand how this works. I request you to kindly guide me with this ..
#include<stdio.h>
#include "cuda.h"
#include<stdlib.h>
#define blockD 32
const int wA = 128;
const int hA = 4096;
const int wB = 4096;
const int hB = wA;
main(void){
void MatrixMultiplication(float *, float *, float *, float *);
int size_A = wA * hA * sizeof(float);
int size_B = wB * hB * sizeof(float);
int size_C = wB * hA * sizeof(float);
int size_max = 2 * wB * sizeof(float);
float *M, *N, *P, *C;
// allocate memory on the CPU
M = (float*)malloc(size_A);
N = (float*)malloc(size_B);
P = (float*)malloc(size_max);
C = (float*)malloc(size_C);
// initialize the matrices
for (int y=0; y < hA; y++) {
for (int x=0; x < wA; x++){
M[y*wA + x] = 32; //x + y*wA;
}
}
for (int y=0; y<hB; y++) {
for (int x=0; x<wB; x++){
N[y*wB + x] = 21; //x + y*wB;
}
}
MatrixMultiplication(M, N, P, C);
//Write
FILE *f1;
int i,j;
f1 = fopen("C.txt","w");
for(i = hA - 2 ; i < hA; i ++){
for(j = 0; j < wB; j++){
fprintf(f1,"%d\t",int(C[i*wB + j]));
}
fprintf(f1,"\n");
}
fclose(f1);
// free the memory allocated on the CPU
free( M );
free( N );
free( P );
free( C );
cudaDeviceReset();
return 0;
}
__device__ void MaxFunction(float* Pd, float* max)
{
int x = (threadIdx.x + blockIdx.x * blockDim.x);
int y = (threadIdx.y + blockIdx.y * blockDim.y);
int k = 0;
int temp = 0; int temp_idx = 0;
for (k = 0; k < wB; ++k) {
if(Pd[x*wB + k] > temp){
temp = Pd[x*wB + k];
temp_idx = x*wB + k;
}
}
max[y*2 + 0] = temp;
max[y*2 + 1] = temp_idx;
}
__global__ void MatrixMulKernel(float* Md, float* Nd, float* Pd, float* max)
{
// declare cache in the shared memory
__shared__ float Mds[blockD][blockD];
__shared__ float Nds[blockD][blockD];
float Pvalue = 0;
// Loop over the Md and Nd block dimension required to compute the Pd element
for (int m = (wA * blockD * blockIdx.y), n = (blockD * blockIdx.x);
m < ((wA * blockD * blockIdx.y)+wA-1);
m += blockD, n += (blockD*hB)){
// collaboratively loading of Md and Nd blocks into shared memory
Mds[threadIdx.y][threadIdx.x] = Md[m + wA * threadIdx.y + threadIdx.x];
Nds[threadIdx.y][threadIdx.x] = Nd[n + wA * threadIdx.y + threadIdx.x];
__syncthreads();
// keep track of the running sum
for (int k = 0; k < blockD; k++)
Pvalue += Mds[threadIdx.y][k] * Nds[k][threadIdx.x];
__syncthreads();
}
// write back to the global memory
int p = hB * blockD * blockIdx.y + blockD * blockIdx.x;
Pd[p + hB * threadIdx.y + threadIdx.x] = Pvalue;
__syncthreads();
MaxFunction(Pd, max);
}
void MatrixMultiplication(float *M, float *N, float *P, float *C) {
int size_A = wA * hA * sizeof(float);
int size_B = wB * hB * sizeof(float);
int size_C = wB * hA * sizeof(float);
int size_max = 2 * wB * sizeof(float);
float *Md, *Nd, *Pd, *max;
// allocate memory on the GPU
cudaMalloc((void**)&Md, size_A);
cudaMalloc((void**)&Nd, size_B);
cudaMalloc((void**)&Pd, size_C);
cudaMalloc((void**)&max, size_max);
// transfer M and N to device memory
cudaMemcpy(Md, M, size_A, cudaMemcpyHostToDevice);
cudaMemcpy(Nd, N, size_B, cudaMemcpyHostToDevice);
// kernel invocation code
dim3 dimBlock(blockD, blockD);
dim3 dimGrid(wA/blockD, hB/blockD);
//Execute Kernel
MatrixMulKernel<<<dimGrid, dimBlock>>>( Md, Nd, Pd, max);
// transfer P from device
cudaMemcpy(P, max, size_max, cudaMemcpyDeviceToHost);
cudaMemcpy(C, Pd, size_C, cudaMemcpyDeviceToHost);
// free the memory allocated on the GPU
cudaFree(Md);
cudaFree(Nd);
cudaFree(Pd);
cudaFree(max);
}
In your code you seem to have more than one problem. One of the problems is, in place of this:
dim3 dimGrid(wA/blockD, hB/blockD);
You should have this:
dim3 dimGrid(wB/blockD, hA/blockD);
Ultimately you need one thread in your grid for each output point. Your formulation was giving you a grid of 4 blocks by 4 blocks, whereas you need a grid of 128 blocks by 128 blocks.
The other problem I found with your code was in these lines in the kernel:
int p = hB * blockD * blockIdx.y + blockD * blockIdx.x;
Pd[p + hB * threadIdx.y + threadIdx.x] = Pvalue;
They are not indexing properly through the output array. Rather than try to sort it out using your scheme, I used this instead:
Pd[(threadIdx.x + (blockIdx.x * blockDim.x)) + ((threadIdx.y + (blockIdx.y * blockDim.y))*(gridDim.x*blockDim.x))] = Pvalue;
When I made the above two changes to your code, I got what I believe are correct results throughout the array. And it took about 32 seconds on my machine to run it. (Note that I haven't tried fixing your original max-finding code -- see below for a better approach.)
Based on your previous question, you seemed to be concerned about speed. If you want to do fast matrix multiply, you should use cublas. The following code shows how to use cublas to multiply two ordinary C-style matrices (they don't have to be square). I've also included a column-max finding kernel that will be fast when the number of columns is large (say, over 500 or so. You have 4096 columns in your example). For small numbers of columns, there may be quicker ways to perform this function, but small numbers of columns also suggests that the overall problem size may be small and so speed (of this piece of code) will not really be an issue.
Here's the code:
#include <stdio.h>
#include <cublas_v2.h>
#define VERBOSE 1
#define nTPB 64
#define ROW_A 4
#define COL_A 4
#define ROW_B COL_A
#define COL_B 4
#define ROW_C ROW_A
#define COL_C COL_B
#define SIZ_A (ROW_A*COL_A)
#define SIZ_B (ROW_B*COL_B)
#define SIZ_C (ROW_C*COL_C)
// error check macros
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
// for CUBLAS V2 API
#define cublasCheckErrors(fn) \
do { \
cublasStatus_t __err = fn; \
if (__err != CUBLAS_STATUS_SUCCESS) { \
fprintf(stderr, "Fatal cublas error: %d (at %s:%d)\n", \
(int)(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
__global__ void col_max(float *mat, float *max, unsigned int *midx, unsigned int rows, unsigned int cols){
int idx = threadIdx.x + blockDim.x*blockIdx.x;
if (idx < cols){
float tempmax = mat[idx];
unsigned int tempmidx = 0;
for (int i = 1; i< rows; i++)
if (mat[idx + (i*cols)] > tempmax){
tempmax = mat[idx + (i*cols)];
tempmidx = i;}
max[idx] = tempmax;
midx[idx] = tempmidx;
}
}
int main(){
float *h_A, *h_B, *h_C, *d_A, *d_B, *d_C, *h_max, *d_max;
unsigned int *h_idx, *d_idx;
h_A = (float *)malloc(SIZ_A*sizeof(float));
if (h_A==0) {printf("malloc fail\n"); return -1;}
h_B = (float *)malloc(SIZ_B*sizeof(float));
if (h_B==0) {printf("malloc fail\n"); return -1;}
h_C = (float *)malloc(SIZ_C*sizeof(float));
if (h_C==0) {printf("malloc fail\n"); return -1;}
h_max = (float *)malloc(COL_C*sizeof(float));
if (h_max==0) {printf("malloc fail\n"); return -1;}
h_idx = (unsigned int*)malloc(COL_C*sizeof(unsigned int));
if (h_idx==0) {printf("malloc fail\n"); return -1;}
cudaMalloc((void **)&d_A, SIZ_A*sizeof(float));
cudaMalloc((void **)&d_B, SIZ_B*sizeof(float));
cudaMalloc((void **)&d_C, SIZ_C*sizeof(float));
cudaMalloc((void **)&d_max, COL_C*sizeof(float));
cudaMalloc((void **)&d_idx, COL_C*sizeof(unsigned int));
cudaCheckErrors("cuda malloc fail");
// initialize data
for (int i=0; i< SIZ_A; i++) h_A[i] = (float)(i+1);
for (int i=0; i< SIZ_B; i++) h_B[i] = (float)(i+2);
cudaMemcpy(d_A, h_A, SIZ_A*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, SIZ_B*sizeof(float), cudaMemcpyHostToDevice);
cudaCheckErrors("cuda memcpy 1 fail");
const float alpha = 1.0f;
const float beta = 0.0f;
cublasHandle_t handle;
cublasCheckErrors(cublasCreate(&handle));
// C = A*B
// due to cublas expecting column-major storage, parameters
// are scrambled
cublasCheckErrors(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, COL_B, ROW_A, COL_A, &alpha, d_B, COL_B, d_A, COL_A, &beta, d_C, COL_C));
cudaMemcpy(h_C, d_C, SIZ_C*sizeof(float), cudaMemcpyDeviceToHost);
cudaCheckErrors("cuda memcpy 2 fail");
col_max<<<(COL_C + nTPB - 1)/nTPB, nTPB>>>(d_C, d_max, d_idx, ROW_C, COL_C);
cudaCheckErrors("kernel launch fail");
cudaMemcpy(h_max, d_max, COL_C*sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(h_idx, d_idx, COL_C*sizeof(unsigned int), cudaMemcpyDeviceToHost);
cudaCheckErrors("cuda memcpy 3 fail/kernel fail");
if (VERBOSE){
printf("A: \n");
for (int i=0; i< ROW_A; i++){
for (int j=0; j< COL_A; j++)
printf("%7.5G", h_A[j+(i*COL_A)]);
printf("\n");}
printf("B: \n");
for (int i=0; i< ROW_B; i++){
for (int j=0; j< COL_B; j++)
printf("%7.5G", h_B[j+(i*COL_B)]);
printf("\n");}
printf("C = A*B: \n");
for (int i=0; i< ROW_C; i++){
for (int j=0; j< COL_C; j++)
printf("%7.5G", h_C[j+(i*COL_C)]);
printf("\n");}
printf("COLUMN MAX:\n");
for (int i=0; i< COL_C; i++)
printf("%7.5G", h_max[i]);
printf("\nCOLUMN MAX IDX:\n");
for (int i=0; i< COL_C; i++)
printf("%7d", h_idx[i]);
}
printf("\n finished!\n");
return 0;
}
Here's what I used to compile:
$ nvcc -arch=sm_20 -O3 -o t221 t221.cu -lcublas
And here's the sample output:
$ cuda-memcheck ./t221
========= CUDA-MEMCHECK
A:
1 2 3 4
5 6 7 8
9 10 11 12
13 14 15 16
B:
2 3 4 5
6 7 8 9
10 11 12 13
14 15 16 17
C = A*B:
100 110 120 130
228 254 280 306
356 398 440 482
484 542 600 658
COLUMN MAX:
484 542 600 658
COLUMN MAX IDX:
3 3 3 3
finished!
========= ERROR SUMMARY: 0 errors
$
When I extended my code to handle the same sizes you indicated, (A = 4096x128, B=128x4096) it took about 1 second on my machine. So it's much faster than your code. However, when I take your code and comment out your call to MaxFunction in the kernel, it also only takes about 1 second to compute the matrix multiply result. So if you wanted to keep your matrix multiply code (i.e. not use cublas) you could break the code into 2 kernels, and use your multiply routine in the first kernel with my max-finding routine (col_max) in the second kernel, and also probably get a pretty fast result.
As #talonmies indicated, if you are running on a windows machine, be sure you are aware of the ramifications of windows TDR. (search that in the upper right corner search box if needed)
using namespace std;
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
const int threadsPerBlock = 256;
const int N = 40000;
void generateArray(double *data, int count) {
for (int i = 0; i < count; i++)
data[i] = rand() / ((rand() + rand()) / 2.0 + 1);
}
double maxCPU(double *arr, int count) {
int max = arr[0];
for (int i = 0; i < count; i++)
if (arr[i] > max)
max = arr[i];
return max;
}
__global__ void MaxGPU(double *a, int count, double *result){
__shared__ double cache[threadsPerBlock];
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int cacheIndex = threadIdx.x;
int temp = a[tid];
tid+= blockDim.x * gridDim.x;
while(tid < count){
if(a[tid] > temp)
temp = a[tid];
tid+= blockDim.x * gridDim.x;
}
cache[cacheIndex] = temp;
__syncthreads();
int i = blockDim.x/2;
while(i!=0){
if(cacheIndex < i)
if(cache[cacheIndex + i] > cache[cacheIndex])
cache[cacheIndex] = cache[cacheIndex + i];
__syncthreads();
i/=2;
}
if(cacheIndex == 0)
result[blockIdx.x] = cache[0];
}
int main(void) {
double *arr = new double[N], resultGPU;
generateArray(arr, N);
double *devA, *dev_partial_result;
double resultCPU = maxCPU(arr, N);
cudaMalloc((void**)&devA, N * sizeof(double));
cudaMalloc((void**)&dev_partial_result, 512 * sizeof(double));
cudaMemcpy(devA, arr, N * sizeof(double), cudaMemcpyHostToDevice);
MaxGPU<<<1, 256>>>(devA, N, dev_partial_result);
cudaMemcpy(&resultGPU, dev_partial_result,sizeof(double), cudaMemcpyDeviceToHost);
cout << "Max CPU: " << resultCPU << endl;
cout << "Max GPU: " << resultGPU << endl;
cudaFree(devA);
cudaFree(dev_partial_result);
delete [] arr;
return 0;
}
I wrote above code. I don't why but it only works with one block. It does not work with say, 256 or 512 blocks. Why? What's wrong?
Try change
double resultGPU; to
double* resultGPU = new double[blocks_count];
and
cudaMemcpy(&resultGPU, dev_partial_result,sizeof(double), cudaMemcpyDeviceToHost); to
cudaMemcpy(resultGPU, dev_partial_result,blocks_count*sizeof(double), cudaMemcpyDeviceToHost);
This question is unlikely to help any future visitors; it is only relevant to a small geographic area, a specific moment in time, or an extraordinarily narrow situation that is not generally applicable to the worldwide audience of the internet. For help making this question more broadly applicable, visit the help center.
Closed 9 years ago.
I'm probably doing something incredibly stupid, but I can't seem to make this reduction work (there is probably a library that does this already, but this is for self-learning, so please bear with me). I'm trying to find the median of an array of integer entries by taking the median of medians approach, which I've coded below:
__global__ void gpuMedOdd(int *entries, int *med) {
extern __shared__ int sdata[];
int tid = threadIdx.x;
int i = blockIdx.x * blockDim.x + threadIdx.x;
sdata[tid] = entries[i];
__syncthreads();
for(int s = blockDim.x / 3; s > 0; s /= 3) {
if(tid < s) {
int list[3];
list[0] = sdata[tid], list[1] = sdata[tid + s], list[2] = sdata[tid + 2 * s];
if(list[1] < list[0])
swapGpu(list[1], list[0]);
if(list[2] < list[0])
swapGpu(list[2], list[0]);
if(list[2] < list[1])
swapGpu(list[2], list[1]);
sdata[tid] = list[1];
}
__syncthreads();
}
*med = sdata[0];
}
I invoke this kernel function as:
gpuMedOdd<<<9, numEntries / 9>>>(d_entries, d_med);
I then copy the value in d_med over into med and print out that value. Unfortunately, this value is always 0, regardless of input. What am I doing wrong?
Edit: I forgot to mention, swapGpu(a, b) is defined as below:
__device__ inline void swapGpu(int a, int b) {
int dum = a;
a = b;
b = dum;
}
Edit2: As suggested below, here is the entirety of the code.
#include <iostream>
#include <fstream>
#include <cstdlib>
#define checkCudaErrors(err) __checkCudaErrors(err, __FILE__, __LINE__)
#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__)
inline void __checkCudaErrors(cudaError err, const char *file, const int line) {
if(cudaSuccess != err) {
std::cout << file << "(" << line << ") : CUDA Runtime API error " << (int) err << ": " << cudaGetErrorString(err) << std::endl;
exit(3);
}
}
inline void __getLastCudaError(const char *errorMsg, const char *file, const int line) {
cudaError_t err = cudaGetLastError();
if(cudaSuccess != err) {
std::cout << file << "(" << line << ") : getLastCudaError() CUDA error : " << errorMsg << " : (" << (int) err << ") " << cudaGetErrorString(err) << std::endl;
exit(3);
}
}
int cpuMin(int *entries, int numEntries) {
int minVal = entries[0];
for(int i = 1; i < numEntries; i++)
if(entries[i] < minVal)
minVal = entries[i];
return minVal;
}
int cpuMax(int *entries, int numEntries) {
int maxVal = entries[0];
for(int i = 1; i < numEntries; i++)
if(entries[i] > maxVal)
maxVal = entries[i];
return maxVal;
}
inline void swap(int a, int b) {
int dum = a;
a = b;
b = dum;
}
__device__ inline void swapGpu(int a, int b) {
int dum = a;
a = b;
b = dum;
}
__global__ void gpuMedOdd(int *entries, int *med, int numEntries) {
extern __shared__ int sdata[];
int tid = threadIdx.x;
int i = blockIdx.x * (blockDim.x * 3) + threadIdx.x;
if(i + 2 * blockDim.x < numEntries) {
int list[3];
list[0] = entries[i], list[1] = entries[i + blockDim.x], list[2] = entries[i + 2 * blockDim.x];
if(list[1] < list[0])
swapGpu(list[1], list[0]);
if(list[2] < list[0])
swapGpu(list[2], list[0]);
if(list[2] < list[1])
swapGpu(list[2], list[1]);
sdata[tid] = list[1];
}
__syncthreads();
for(int s = blockDim.x / 3; s > 0; s /= 3) {
if(tid < s && tid + 2 * s < blockDim.x) {
int list[3];
list[0] = sdata[tid], list[1] = sdata[tid + s], list[2] = sdata[tid + 2 * s];
if(list[1] < list[0])
swapGpu(list[1], list[0]);
if(list[2] < list[0])
swapGpu(list[2], list[0]);
if(list[2] < list[1])
swapGpu(list[2], list[1]);
sdata[tid] = list[1];
}
__syncthreads();
}
*med = sdata[0];
}
__global__ void gpuMin(int *entries, int *min, int numEntries) {
extern __shared__ int sdata[];
int tid = threadIdx.x;
int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
if(i + blockDim.x < numEntries)
sdata[tid] = (entries[i] < entries[i + blockDim.x]) ? entries[i] : entries[i + blockDim.x];
__syncthreads();
for(int s = blockDim.x / 2; s > 0; s >>= 1) {
if(tid < s)
sdata[tid] = (sdata[tid] < sdata[tid + s]) ? sdata[tid] : sdata[tid + s];
__syncthreads();
}
*min = sdata[0];
}
__global__ void gpuMax(int *entries, int *max, int numEntries) {
extern __shared__ int sdata[];
int tid = threadIdx.x;
int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
if(i + blockDim.x < numEntries)
sdata[tid] = (entries[i] > entries[i + blockDim.x]) ? entries[i] : entries[i + blockDim.x];
__syncthreads();
for(int s = blockDim.x / 2; s > 0; s >>= 1) {
if(tid < s)
sdata[tid] = (sdata[tid] > sdata[tid + s]) ? sdata[tid] : sdata[tid + s];
__syncthreads();
}
*max = sdata[0];
}
int partition(int *entries, int left, int right, int pivotIdx) {
int i, storeIdx = left, pivot = entries[pivotIdx];
swap(entries[pivotIdx], entries[right]);
for(i = left; i < right; i++)
if(entries[i] < pivot) {
swap(entries[i], entries[storeIdx]);
storeIdx++;
}
return storeIdx;
}
int cpuSelect(int *entries, int left, int right, int k) {
if(left == right)
return entries[left];
int pivotIdx = ((left + right) >> 2) + 1, pivotNewIdx, pivotDist;
pivotNewIdx = partition(entries, left, right, pivotIdx);
pivotDist = pivotNewIdx - left + 1;
if(pivotDist == k)
return entries[pivotNewIdx];
else if(k < pivotDist)
return cpuSelect(entries, left, pivotNewIdx - 1, k);
else
return cpuSelect(entries, pivotNewIdx + 1, right, k - pivotDist);
}
int main(int argc, char *argv[]) {
if(argc != 3) {
std::cout << "ERROR: Incorrect number of input arguments" << std::endl;
std::cout << "Proper usage: " << argv[0] << " fileName numEntries" << std::endl;
exit(1);
}
std::ifstream inp(argv[1]);
if(!inp.is_open()) {
std::cout << "ERROR: File I/O error" << std::endl;
std::cout << "Could not find file " << argv[1] << std::endl;
exit(2);
}
int numEntries = atoi(argv[2]), i = 0;
int *entries = new int[numEntries];
while(inp >> entries[i] && i < numEntries)
i++;
if(i < numEntries) {
std::cout << "ERROR: File I/O error" << std::endl;
std::cout << "Command-line input suggested " << numEntries << " entries, but only found " << i << " entries" << std::endl;
exit(2);
}
if(inp >> i) {
std::cout << "ERROR: File I/O error" << std::endl;
std::cout << "Command-line input suggested " << numEntries << " entries, but file contains more entries" << std::endl;
exit(2);
}
int min, max;
int *d_entries, *d_min, *d_max;
checkCudaErrors(cudaMalloc(&d_entries, sizeof(int) * numEntries));
checkCudaErrors(cudaMalloc(&d_min, sizeof(int)));
checkCudaErrors(cudaMalloc(&d_max, sizeof(int)));
checkCudaErrors(cudaMemcpy(d_entries, entries, sizeof(int) * numEntries, cudaMemcpyHostToDevice));
gpuMin<<<16, numEntries / 16, numEntries / 16 * sizeof(int)>>>(d_entries, d_min, numEntries);
getLastCudaError("kernel launch failure");
gpuMax<<<16, numEntries / 16, numEntries / 16 * sizeof(int)>>>(d_entries, d_max, numEntries);
getLastCudaError("kernel launch failure");
checkCudaErrors(cudaMemcpy(&min, d_min, sizeof(int), cudaMemcpyDeviceToHost));
checkCudaErrors(cudaMemcpy(&max, d_max, sizeof(int), cudaMemcpyDeviceToHost));
std::cout << "The minimum value is: " << min << std::endl;
std::cout << "The maximum value is: " << max << std::endl;
if(numEntries % 2) {
int med, *d_med;
checkCudaErrors(cudaMalloc(&d_med, sizeof(int)));
gpuMedOdd<<<16, numEntries / 16, 16 * sizeof(int)>>>(d_entries, d_med, numEntries);
getLastCudaError("kernel launch failure");
checkCudaErrors(cudaMemcpy(&med, d_med, sizeof(int), cudaMemcpyDeviceToHost));
std::cout << "The median value is: " << med << std::endl;
}
else {
int *d_med;
cudaMalloc(&d_med, sizeof(int));
gpuMedOdd<<<16, numEntries / 16>>>(d_entries, d_med, numEntries);
}
min = cpuMin(entries, numEntries);
max = cpuMax(entries, numEntries);
if(numEntries % 2) {
int median = cpuSelect(entries, 0, numEntries - 1, (numEntries - 1) / 2 + 1);
std::cout << "The median value is: " << median << std::endl;
}
else {
int med2 = cpuSelect(entries, 0, numEntries - 1, numEntries / 2);
int med1 = cpuSelect(entries, 0, numEntries - 1, numEntries / 2 + 1);
float median = 0.5 * (med1 + med2);
std::cout << "The median value is: " << median << std::endl;
}
std::cout << "The minimum value is: " << min << std::endl;
std::cout << "The maximum value is: " << max << std::endl;
exit(0);
}
One thing that jumps out is that your shared memory size isn't set; that is, you declare your shared memory to be
extern __shared__ int sdata[];
but when you invoke your kernel your launch parameters are
gpuMedOdd<<<9, numEntries / 9>>>(...)
If you're setting your __shared__ memory to be extern, then it's expecting to get the number of bytes for shared memory as the 3rd kernel launch parameter. you should instead have
gpuMedOdd<<<9, numEntries / 9, smem_in_bytes>>>(...)
where smem_in_bytes is the size of shared memory for the kernel. If you don't specify a size, it'll default to 0. Hence in your current code, your __shared__ memory array sdata will be 0 bytes long.
EDIT: here's the link to the relevant part of the CUDA Programming Guide:
http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#execution-configuration
A problem I see in your code is that you seem to have your launch parameters reversed:
gpuMedOdd<<<16, numEntries / 16, 16 * sizeof(int)>>>(d_entries, d_med, numEntries);
I think you intended:
gpuMedOdd<<< numEntries/16, 16, 16 * sizeof(int)>>>(d_entries, d_med, numEntries);
The first launch parameter is blocks per grid. The second launch parameter is threads per block. Here I'm assuming you wanted to launch 16 threads per block. If in fact your intent was to launch a fixed number of blocks (16) and have the threads per block vary based on input size, then I think this is not typical of good cuda coding, and it will blow up if your input size gets too large, because you will exceed the max threads per block limit. Also, since your shared memory allocation is fixed (64 bytes), I assume you had intended a fixed number of threads per block.
Another suggestion I have is that rather than just reporting "CUDA Runtime Error" you should parse the error code returned. Take a look at the example link I already mentioned.
Hi I'm writing a simple Program for practicing to work with texture memory. I Just want to write my data into Texture Memory and write it back into Global Memory. But i cannont read out the Values. Here is the code.
#include <stdio.h>
#include <iostream>
#include "cuda.h"
#include <stdlib.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "HelloWorld.h"
#include "linearInterpolation_kernel4.cu"
using namespace std;
using std::cout;
const int blocksize = 16;
__global__
void hello(char *a, int *b) {
a[threadIdx.x] += b[threadIdx.x];
}
////////////////////////////////////////////////////////////////////////////////
// These are CUDA Helper functions
// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)
inline void __checkCudaErrors( cudaError err, const char *file, const int line )
{
if( cudaSuccess != err) {
printf("%s(%i) : CUDA Runtime API error %d: %s.\n",file, line, (int)err, cudaGetErrorString( err ) );
}
}
// This will output the proper error string when calling cudaGetLastError
#define getLastCudaError(msg) __getLastCudaError (msg, __FILE__, __LINE__)
inline void __getLastCudaError( const char *errorMessage, const char *file, const int line )
{
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err) {
printf("%s(%i) : getLastCudaError() CUDA error : %s : (%d) %s.\n", file, line, errorMessage, (int)err, cudaGetErrorString( err ) );
}
}
int main()
{
int N = 40;
float *A;
A = (float *) malloc(N*sizeof(float));
float *B;
B = (float *) malloc(N*sizeof(float));
float *result;
result = (float *) malloc(N*sizeof(float));
float angle = 0.8f;
for(int i = 0; i < N; i++){
A[i] = i; //(float)rand();
B[i] = i+1; //(float)rand();
}
ipLinearTexture2(A,B,result,angle,N);
float result2;
result2 = (angle)*A[4] + (1-angle)*B[4];
printf(" A %f B %f Result %f\n", A[4], B[4], result[4]);
cout << result2 << endl;
return 1;
}
void ipLinearTexture2(float *A, float* B, float* result, float angle, int N)
{
float cuTime;
int N2 = N * 2;
float *dev_result;
float **AB;
AB = (float **) malloc( N * sizeof(float *));
if(AB)
{
for(int i = 0; i < N; i++)
{
AB[i] = (float *) malloc( 2 * sizeof(float *));
}
}
for (int i = 0; i < N; i = i++)
{
AB[i][0] = A[i];
AB[i][1] = B[i];
}
cudaMalloc(&dev_result, N * sizeof(float));
unsigned int size = N2 * sizeof(float);
//cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
cudaArray* cu_array;
checkCudaErrors(cudaMallocArray( &cu_array, &channelDesc,N,2));
cudaMemcpy2DToArray(cu_array,0,0,AB,N * sizeof(float), N * sizeof(float), 2, cudaMemcpyHostToDevice);
// set texture parameters
tex2.normalized = true;
tex2.filterMode = cudaFilterModeLinear;
tex2.addressMode[0] = cudaAddressModeWrap; //cudaAddressModeWrap;
tex2.addressMode[1] = cudaAddressModeWrap; //cudaAddressModeClamp;
checkCudaErrors(cudaBindTextureToArray( tex2, cu_array, channelDesc));
dim3 dimBlock(10, 1, 1);
dim3 dimGrid((int)ceil((double)N*2/dimBlock.x), 1, 1);
transformKernel4<<< 256, 256, 0 >>>( dev_result, N, 2, angle);
checkCudaErrors(cudaMemcpy(result, dev_result, N * sizeof(float), cudaMemcpyDeviceToHost));
cout << "==================================================" << endl;
for (int i = 0 ; i < N ;i++)
{
cout << result[i] << " on " << i << endl;
}
cout << "==================================================" << endl;
checkCudaErrors(cudaUnbindTexture(tex));
checkCudaErrors(cudaFree(dev_result));
checkCudaErrors(cudaFreeArray(cu_array));
}
and here is the kernel code
#ifndef _SIMPLETEXTURE_KERNEL5_H_
#define _SIMPLETEXTURE_KERNEL5_H_
// Texture references
texture<float, 2, cudaReadModeElementType> tex2;
__global__ void
transformKernel4(float* g_odata, int width, int height, float theta)
{
unsigned int xid = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int yid = blockIdx.y * blockDim.y + threadIdx.y;
if (xid >= width || yid >= height) return;
float dx = 1.0f / (float)width;
float dy = 1.0f / (float)height;
float x = ((float)xid + 0.5f) * dx;
float y = ((float)yid + 0.5f) * dy;
float value = tex2D(tex2, x , y);
printf("wert %f xid %i yid %i \n",value, xid, yid);
g_odata[yid * width + xid] = value;
}
#endif // #ifndef _SIMPLETEXTURE_KERNEL_H_
Can somebody tell what i am doing wrong?
I have edited it to remove the first 2 logical mistake. Put why am I need able to print out my data?
It was the wrong binding of the Arrays. You can not use multidimensional Arrays in C that can be copied. You have to use a onedimensional array that respresents a multidimensional.
I can see 2 logical errors here.
The first one is the one pointed out by #asm.
The output should be stored by calculating linear index from 2D x and y indices.
outputIndex = yid * width + xid;
The second one is that the memory allocation for the cudaArray structure is internally aligned.
You should consider using cudaMemcpy2DToArray function to avoid erroneous data copying.
cudaMemcpy2DToArray(cu_array,0,0,AB,N * sizeof(float), N * sizeof(float), 2, cudaMemcpyHostToDevice);