I'm writing a kernel using PyCUDA. My GPU only supports compute capability 1.1 (arch sm_11), so I can only use floats in my code. I've gone to great lengths to ensure I'm doing everything with floats, but despite that, there is a particular line in my code that keeps causing a compiler error.
The chunk of code is:
// Gradient magnitude, so 1 <= x <= width, 1 <= y <= height.
if( j > 0 && j < im_width && i > 0 && i < im_height){
gradient_mag[idx(i,j)] = float(sqrt(x_gradient[idx(i,j)]*x_gradient[idx(i,j)] + y_gradient[idx(i,j)]*y_gradient[idx(i,j)]));
}
Here, idx() is a __device__ helper function that returns a linear index from pixel indices i and j, and it operates only on integers. I use it throughout and it doesn't give errors anywhere else, so I strongly suspect idx() is not the problem. The sqrt() call is just from the standard C math functions, which support floats. The arrays involved, x_gradient, y_gradient, and gradient_mag, are all float* and are part of the input to my function (i.e. declared in Python, then converted to device variables, etc.).
I've tried removing the extra cast to float in my code above, with no luck. I've also tried doing something completely stupid like this:
// Gradient magnitude, so 1 <= x <= width, 1 <= y <= height.
if( j > 0 && j < im_width && i > 0 && i < im_height){
gradient_mag[idx(i,j)] = 3.0f; // also tried float(3.0) here
}
All of these variations give the same error:
pycuda.driver.CompileError: nvcc said it demoted types in source code it compiled--this is likely not what you want.
[command: nvcc --cubin -arch sm_11 -I/usr/local/lib/python2.7/dist-packages/pycuda-2011.1.2-py2.7-linux-x86_64.egg/pycuda/../include/pycuda kernel.cu]
[stderr:
ptxas /tmp/tmpxft_00004329_00000000-2_kernel.ptx, line 128; warning : Double is not supported. Demoting to float
]
Any ideas? I've debugged many errors in my code and was hoping to get it working tonight, but this has proved to be a bug that I cannot understand.
Added -- Here is a truncated version of the kernel that produces the same error above on my machine.
every_pixel_hog_kernel_source = \
"""
#include <math.h>
#include <stdio.h>
__device__ int idx(int ii, int jj){
return gridDim.x*blockDim.x*ii+jj;
}
__device__ int bin_number(float angle_val, int total_angles, int num_bins){
float angle1;
float min_dist;
float this_dist;
int bin_indx;
angle1 = 0.0;
min_dist = abs(angle_val - angle1);
bin_indx = 0;
for(int kk=1; kk < num_bins; kk++){
angle1 = angle1 + float(total_angles)/float(num_bins);
this_dist = abs(angle_val - angle1);
if(this_dist < min_dist){
min_dist = this_dist;
bin_indx = kk;
}
}
return bin_indx;
}
__device__ int hist_number(int ii, int jj){
int hist_num = 0;
if(jj >= 0 && jj < 11){
if(ii >= 0 && ii < 11){
hist_num = 0;
}
else if(ii >= 11 && ii < 22){
hist_num = 3;
}
else if(ii >= 22 && ii < 33){
hist_num = 6;
}
}
else if(jj >= 11 && jj < 22){
if(ii >= 0 && ii < 11){
hist_num = 1;
}
else if(ii >= 11 && ii < 22){
hist_num = 4;
}
else if(ii >= 22 && ii < 33){
hist_num = 7;
}
}
else if(jj >= 22 && jj < 33){
if(ii >= 0 && ii < 11){
hist_num = 2;
}
else if(ii >= 11 && ii < 22){
hist_num = 5;
}
else if(ii >= 22 && ii < 33){
hist_num = 8;
}
}
return hist_num;
}
__global__ void every_pixel_hog_kernel(float* input_image, int im_width, int im_height, float* gaussian_array, float* x_gradient, float* y_gradient, float* gradient_mag, float* angles, float* output_array)
{
/////
// Setup the thread indices and linear offset.
/////
int i = blockDim.y * blockIdx.y + threadIdx.y;
int j = blockDim.x * blockIdx.x + threadIdx.x;
int ang_limit = 180;
int ang_bins = 9;
float pi_val = 3.141592653589f; //91
/////
// Compute a Gaussian smoothing of the current pixel and save it into a new image array
// Use sync threads to make sure everyone does the Gaussian smoothing before moving on.
/////
if( j > 1 && i > 1 && j < im_width-2 && i < im_height-2 ){
// Hard-coded unit standard deviation 5-by-5 Gaussian smoothing filter.
gaussian_array[idx(i,j)] = float(1.0/273.0) *(
input_image[idx(i-2,j-2)] + float(4.0)*input_image[idx(i-2,j-1)] + float(7.0)*input_image[idx(i-2,j)] + float(4.0)*input_image[idx(i-2,j+1)] + input_image[idx(i-2,j+2)] +
float(4.0)*input_image[idx(i-1,j-2)] + float(16.0)*input_image[idx(i-1,j-1)] + float(26.0)*input_image[idx(i-1,j)] + float(16.0)*input_image[idx(i-1,j+1)] + float(4.0)*input_image[idx(i-1,j+2)] +
float(7.0)*input_image[idx(i,j-2)] + float(26.0)*input_image[idx(i,j-1)] + float(41.0)*input_image[idx(i,j)] + float(26.0)*input_image[idx(i,j+1)] + float(7.0)*input_image[idx(i,j+2)] +
float(4.0)*input_image[idx(i+1,j-2)] + float(16.0)*input_image[idx(i+1,j-1)] + float(26.0)*input_image[idx(i+1,j)] + float(16.0)*input_image[idx(i+1,j+1)] + float(4.0)*input_image[idx(i+1,j+2)] +
input_image[idx(i+2,j-2)] + float(4.0)*input_image[idx(i+2,j-1)] + float(7.0)*input_image[idx(i+2,j)] + float(4.0)*input_image[idx(i+2,j+1)] + input_image[idx(i+2,j+2)]);
}
__syncthreads();
/////
// Compute the simple x and y gradients of the image and store these into new images
// again using syncthreads before moving on.
/////
// X-gradient, ensure x is between 1 and width-1
if( j > 0 && j < im_width){
x_gradient[idx(i,j)] = float(input_image[idx(i,j)] - input_image[idx(i,j-1)]);
}
else if(j == 0){
x_gradient[idx(i,j)] = float(0.0);
}
// Y-gradient, ensure y is between 1 and height-1
if( i > 0 && i < im_height){
y_gradient[idx(i,j)] = float(input_image[idx(i,j)] - input_image[idx(i-1,j)]);
}
else if(i == 0){
y_gradient[idx(i,j)] = float(0.0);
}
__syncthreads();
// Gradient magnitude, so 1 <= x <= width, 1 <= y <= height.
if( j < im_width && i < im_height){
gradient_mag[idx(i,j)] = float(sqrt(x_gradient[idx(i,j)]*x_gradient[idx(i,j)] + y_gradient[idx(i,j)]*y_gradient[idx(i,j)]));
}
__syncthreads();
/////
// Compute the orientation angles
/////
if( j < im_width && i < im_height){
if(ang_limit == 360){
angles[idx(i,j)] = float((atan2(y_gradient[idx(i,j)],x_gradient[idx(i,j)])+pi_val)*float(180.0)/pi_val);
}
else{
angles[idx(i,j)] = float((atan( y_gradient[idx(i,j)]/x_gradient[idx(i,j)] )+(pi_val/float(2.0)))*float(180.0)/pi_val);
}
}
__syncthreads();
// Compute the HoG using the above arrays, over a 3x3 grid with 9 angle bins per cell,
// forming an 81-vector, and write this 81-vector as a row of the large output array.
int top_bound, bot_bound, left_bound, right_bound, offset;
int window = 32;
if(i-window/2 > 0){
top_bound = i-window/2;
bot_bound = top_bound + window;
}
else{
top_bound = 0;
bot_bound = top_bound + window;
}
if(j-window/2 > 0){
left_bound = j-window/2;
right_bound = left_bound + window;
}
else{
left_bound = 0;
right_bound = left_bound + window;
}
if(bot_bound - im_height > 0){
offset = bot_bound - im_height;
top_bound = top_bound - offset;
bot_bound = bot_bound - offset;
}
if(right_bound - im_width > 0){
offset = right_bound - im_width;
right_bound = right_bound - offset;
left_bound = left_bound - offset;
}
int counter_i = 0;
int counter_j = 0;
int bin_indx, hist_indx, glob_col_indx, glob_row_indx;
int row_width = 81;
for(int pix_i = top_bound; pix_i < bot_bound; pix_i++){
for(int pix_j = left_bound; pix_j < right_bound; pix_j++){
bin_indx = bin_number(angles[idx(pix_i,pix_j)], ang_limit, ang_bins);
hist_indx = hist_number(counter_i,counter_j);
glob_col_indx = ang_bins*hist_indx + bin_indx;
glob_row_indx = idx(i,j);
output_array[glob_row_indx*row_width + glob_col_indx] = float(output_array[glob_row_indx*row_width + glob_col_indx] + float(gradient_mag[idx(pix_i,pix_j)]));
counter_j = counter_j + 1;
}
counter_i = counter_i + 1;
counter_j = 0;
}
}
"""
Here's an unmistakable case of using doubles:
gaussian_array[idx(i,j)] = float(1.0/273.0) *
See the double literals being divided?
But really, use float literals instead of double literals cast to float: the casts are ugly, and as you can see they will hide bugs like this.
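For example, the smoothing expression can be written with single-precision literals throughout; nothing is ever double, so the casts disappear (a sketch of the same line from the kernel above):
// All constants carry the f suffix, so every intermediate stays a float;
// (1.0f/273.0f) is folded to a single float constant at compile time.
gaussian_array[idx(i,j)] = (1.0f/273.0f) * (
input_image[idx(i-2,j-2)] + 4.0f*input_image[idx(i-2,j-1)] + 7.0f*input_image[idx(i-2,j)] + 4.0f*input_image[idx(i-2,j+1)] + input_image[idx(i-2,j+2)] +
4.0f*input_image[idx(i-1,j-2)] + 16.0f*input_image[idx(i-1,j-1)] + 26.0f*input_image[idx(i-1,j)] + 16.0f*input_image[idx(i-1,j+1)] + 4.0f*input_image[idx(i-1,j+2)] +
7.0f*input_image[idx(i,j-2)] + 26.0f*input_image[idx(i,j-1)] + 41.0f*input_image[idx(i,j)] + 26.0f*input_image[idx(i,j+1)] + 7.0f*input_image[idx(i,j+2)] +
4.0f*input_image[idx(i+1,j-2)] + 16.0f*input_image[idx(i+1,j-1)] + 26.0f*input_image[idx(i+1,j)] + 16.0f*input_image[idx(i+1,j+1)] + 4.0f*input_image[idx(i+1,j+2)] +
input_image[idx(i+2,j-2)] + 4.0f*input_image[idx(i+2,j-1)] + 7.0f*input_image[idx(i+2,j)] + 4.0f*input_image[idx(i+2,j+1)] + input_image[idx(i+2,j+2)]);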
-------Edit 1/Dec---------
Firstly, thanks @CygnusX1: constant folding would eliminate that division at compile time. I didn't even think of it.
I've tried to reproduce the environment of the error: I installed CUDA SDK 3.2 (which @EMS has mentioned they seem to use in the lab) and compiled the truncated kernel above. nvopencc did indeed optimize the above calculation away (thanks @CygnusX1), and it didn't use doubles anywhere in the generated PTX code. Further, ptxas didn't give the error @EMS received. From that, I thought the problem was outside the every_pixel_hog_kernel_source code itself, perhaps in PyCUDA. However, compiling with PyCUDA 2011.1.2 still does not produce a warning like the one in @EMS's question. I can get the error in the question, but only by introducing a double calculation, such as removing the cast from gaussian_array[idx(i,j)] = float(1.0/273.0) *
To get to the same Python case, does the following produce your error?
import pycuda.driver as cuda
from pycuda.compiler import compile
x=compile("""put your truncated kernel code here""",options=[],arch="sm_11",keep=True)
It doesn't produce an error in my environment, so it is possible I simply can't replicate your result.
However, I can give some advice. When using compile (or SourceModule), if you pass keep=True, Python will print the folder where the .ptx file is generated just before showing the error message.
Then examine the .ptx file generated in that folder and look for where .f64 appears; that should give some idea of what is being treated as a double. Deciphering which code in your original kernel it corresponds to is difficult, though, so having the simplest example that produces your error will help you.
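A minimal sketch of that workflow (assuming kernel_source holds your kernel string; the .ptx path is a placeholder for whatever directory keep=True prints):
import pycuda.driver as cuda
from pycuda.compiler import compile

# keep=True preserves nvcc's intermediate files and prints their directory
# just before any error message.
cubin = compile(kernel_source, options=[], arch="sm_11", keep=True)

# Scan the kept PTX for double-precision instructions; each hit is a place
# where something is being treated as a double.
with open("/tmp/<directory-printed-by-keep>/kernel.ptx") as ptx:  # placeholder path
    for num, line in enumerate(ptx, 1):
        if ".f64" in line:
            print(num, line.rstrip())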
Your problem is here:
angle1 = 0.0;
0.0 is a double-precision constant; 0.0f is a single-precision constant.
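Applied to bin_number(), that means using float constants throughout (a sketch of the corrected function; note also that fabsf() keeps the absolute value in single precision, whereas a bare abs() can resolve to the integer overload):
__device__ int bin_number(float angle_val, int total_angles, int num_bins){
    float angle1 = 0.0f;                         // 0.0f: single-precision constant
    float min_dist = fabsf(angle_val - angle1);  // fabsf stays in float
    float this_dist;
    int bin_indx = 0;
    for(int kk = 1; kk < num_bins; kk++){
        angle1 = angle1 + float(total_angles)/float(num_bins);
        this_dist = fabsf(angle_val - angle1);
        if(this_dist < min_dist){
            min_dist = this_dist;
            bin_indx = kk;
        }
    }
    return bin_indx;
}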
(This is a comment, not an answer, but it is too big to post as a comment.)
Could you provide the PTX code around the line where the error occurs?
I tried compiling a simple kernel using the code you provided:
__constant__ int im_width;
__constant__ int im_height;
__device__ int idx(int i,int j) {
return i+j*im_width;
}
__global__ void kernel(float* gradient_mag, float* x_gradient, float* y_gradient) {
int i = threadIdx.x;
int j = threadIdx.y;
// Gradient magnitude, so 1 <= x <= width, 1 <= y <= height.
if( j > 0 && j < im_width && i > 0 && i < im_height){
gradient_mag[idx(i,j)] = float(sqrt(x_gradient[idx(i,j)]*x_gradient[idx(i,j)] + y_gradient[idx(i,j)]*y_gradient[idx(i,j)]));
}
}
using:
nvcc.exe -m32 -maxrregcount=32 -gencode=arch=compute_11,code=\"sm_11,compute_11\" --compile -o "Debug\main.cu.obj" main.cu
got no errors.
I was using the CUDA 4.1 beta compiler.
Update
I tried compiling your new code (I am working in CUDA/C++, not PyCUDA, but this shouldn't matter). It didn't catch the error either! I used CUDA 4.1 and CUDA 4.0.
Which version of CUDA do you have installed?
C:\>nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2011 NVIDIA Corporation
Built on Wed_Oct_19_23:13:02_PDT_2011
Cuda compilation tools, release 4.1, V0.2.1221
Related
I have a CUDA program that seems to be hitting some sort of limit of some resource, but I can't figure out what that resource is. Here is the kernel function:
__global__ void DoCheck(float2* points, int* segmentToPolylineIndexMap,
int segmentCount, int* output)
{
int segmentIndex = threadIdx.x + blockIdx.x * blockDim.x;
int pointCount = segmentCount + 1;
if(segmentIndex >= segmentCount)
return;
int polylineIndex = segmentToPolylineIndexMap[segmentIndex];
int result = 0;
if(polylineIndex >= 0)
{
float2 p1 = points[segmentIndex];
float2 p2 = points[segmentIndex+1];
float2 A = p2;
float2 a;
a.x = p2.x - p1.x;
a.y = p2.y - p1.y;
for(int i = segmentIndex+2; i < segmentCount; i++)
{
int currentPolylineIndex = segmentToPolylineIndexMap[i];
// if not a different segment within our polyline and
// not a fake segment
bool isLegit = (currentPolylineIndex != polylineIndex &&
currentPolylineIndex >= 0);
float2 p3 = points[i];
float2 p4 = points[i+1];
float2 B = p4;
float2 b;
b.x = p4.x - p3.x;
b.y = p4.y - p3.y;
float2 c;
c.x = B.x - A.x;
c.y = B.y - A.y;
float2 b_perp;
b_perp.x = -b.y;
b_perp.y = b.x;
float numerator = dot(b_perp, c);
float denominator = dot(b_perp, a);
bool isParallel = (denominator == 0.0);
float quotient = numerator / denominator;
float2 intersectionPoint;
intersectionPoint.x = quotient * a.x + A.x;
intersectionPoint.y = quotient * a.y + A.y;
result = result | (isLegit && !isParallel &&
intersectionPoint.x > min(p1.x, p2.x) &&
intersectionPoint.x > min(p3.x, p4.x) &&
intersectionPoint.x < max(p1.x, p2.x) &&
intersectionPoint.x < max(p3.x, p4.x) &&
intersectionPoint.y > min(p1.y, p2.y) &&
intersectionPoint.y > min(p3.y, p4.y) &&
intersectionPoint.y < max(p1.y, p2.y) &&
intersectionPoint.y < max(p3.y, p4.y));
}
}
output[segmentIndex] = result;
}
Here is the call to execute the kernel function:
DoCheck<<<702, 32>>>(
(float2*)devicePoints,
deviceSegmentsToPolylineIndexMap,
numSegments,
deviceOutput);
The sizes of the parameters are as follows:
devicePoints = 22,464 float2s = 179,712 bytes
deviceSegmentsToPolylineIndexMap = 22,463 ints = 89,852 bytes
numSegments = 1 int = 4 bytes
deviceOutput = 22,463 ints = 89,852 bytes
When I execute this kernel, it crashes the video card. It would appear that I am hitting some sort of limit, because if I execute the kernel using DoCheck<<<300, 32>>>(...);, it works. Just to be clear, the parameters are the same, just the number of blocks is different.
Any idea why one crashes the video driver and the other doesn't? The one that fails seems to be well within the card's limit on the number of blocks.
Update
More information on my system configuration:
Video Card: nVidia 8800GT
CUDA Version: 1.1
OS: Windows Server 2008 R2
I also tried it on a laptop with the following configuration, but got the same results:
Video Card: nVidia Quadro FX 880M
CUDA Version: 1.2
OS: Windows 7 64-bit
The resource being exhausted is time. On all current CUDA platforms, the display driver includes a watchdog timer that will kill any kernel taking more than a few seconds to execute. Code running on a card that is also driving a display is subject to this limit.
On the WDDM Windows platforms you are using, there are three possible solutions/work-arounds:
Get a Tesla card and use the TCC driver, which eliminates the problem completely
Try modifying registry settings to increase the timer limit (google for the TdrDelay registry key for more information, but I am not a Windows user and can't be more specific than that)
Modify your kernel code to be "re-entrant" and process the data-parallel workload in several kernel launches rather than one. Kernel launch overhead isn't all that large, and processing the workload over several kernel runs is often pretty easy to achieve, depending on the algorithm you are using (see the sketch below).
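As an illustration of that third option, here is a host-side sketch (DoCheckChunk and baseSegment are hypothetical names, not from your code): the same 702-block grid is issued in fixed-size slices, one launch per slice, so each launch finishes well inside the watchdog window.
// Host-side sketch: issue the 702 blocks in slices so that no single launch
// runs long enough to trip the display watchdog.
const int threadsPerBlock = 32;
const int totalBlocks     = 702;
const int blocksPerLaunch = 100;   // tune so one slice stays well under the timeout
for (int firstBlock = 0; firstBlock < totalBlocks; firstBlock += blocksPerLaunch)
{
    int blocks = (totalBlocks - firstBlock < blocksPerLaunch)
               ? (totalBlocks - firstBlock) : blocksPerLaunch;
    // DoCheckChunk is DoCheck plus one extra parameter, baseSegment, used as:
    //   int segmentIndex = baseSegment + threadIdx.x + blockIdx.x * blockDim.x;
    DoCheckChunk<<<blocks, threadsPerBlock>>>(
        (float2*)devicePoints,
        deviceSegmentsToPolylineIndexMap,
        numSegments,
        deviceOutput,
        firstBlock * threadsPerBlock);
    cudaDeviceSynchronize();   // each slice finishes before the next is queued
}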
I have p.ntp test particles, and the i-th particle has Cartesian coordinates tp.rh[i].x, tp.rh[i].y, tp.rh[i].z. Within this set I need to find CLUSTERS: that is, I am looking for particles whose squared distance to the i-th particle is less than hill2 (tp.D_rel < hill2). The number of such members is stored in N_conv.
I use the loop for (int i = 0; i < p.ntp; i++), which goes through the data set. For each i-th particle I calculate the squared distances tp.D_rel[idx] relative to the other members of the set. Then I use the first thread (idx == 0) to count the cases that satisfy my condition. At the end, if there is more than one positive case (N_conv > 1), I need to write out all the particles forming a possible cluster together (triplets, ...).
My code works well only in cases where i < blockDim.x. Why? Is there a general way to find clusters in a set of data, but write out only triplets and larger?
Note: I know that some cases will be found twice.
__global__ void check_conv_system(double t, struct s_tp tp, struct s_mp mp, struct s_param p, double *time_step)
{
const uint bid = blockIdx.y * gridDim.x + blockIdx.x;
const uint tid = threadIdx.x;
const uint idx = bid * blockDim.x + tid;
double hill2 = 1.0e+6;
__shared__ double D[200];
__shared__ int ID1[200];
__shared__ int ID2[200];
if (idx >= p.ntp) return;
int N_conv;
for (int i = 0; i < p.ntp; i++)
{
tp.D_rel[idx] = (double)((tp.rh[i].x - tp.rh[idx].x)*(tp.rh[i].x - tp.rh[idx].x) +
(tp.rh[i].y - tp.rh[idx].y)*(tp.rh[i].y - tp.rh[idx].y) +
(tp.rh[i].z - tp.rh[idx].z)*(tp.rh[i].z - tp.rh[idx].z));
__syncthreads();
N_conv = 0;
if (idx == 0)
{
for (int n = 0; n < p.ntp; n++) {
if ((tp.D_rel[n] < hill2) && (i != n)) {
N_conv = N_conv + 1;
D[N_conv] = tp.D_rel[n];
ID1[N_conv] = i;
ID2[N_conv] = n;
}
}
if (N_conv > 0) {
for(int k = 1; k < N_conv; k++) {
printf("%lf %lf %d %d \n",t/365.2422, D[k], ID1[k], ID2[k]);
}
}
} //end idx == 0
} //end for cycle for i
}
As RobertCrovella mentioned, without an MCVE it is hard to tell.
However, the tp.D_rel array is written at index idx and then read back, after a __syncthreads(), over the full range of indices n. Note that __syncthreads() only synchronizes within a block, not across the whole device. As a result, some threads/blocks will read data that has not been calculated yet, hence the failure.
You want to restructure your code so that values computed by one block do not depend on another (see the sketch below).
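One way to get a device-wide synchronization point is to split the computation into separate kernel launches: all writes from one launch are guaranteed visible to the next. A sketch under that assumption, reusing the structs from the question (compute_distances, scan_for_clusters, grid, and block are hypothetical names):
// Kernel 1: every thread writes its own squared distance to particle i.
__global__ void compute_distances(struct s_tp tp, struct s_param p, int i)
{
    const uint bid = blockIdx.y * gridDim.x + blockIdx.x;
    const uint idx = bid * blockDim.x + threadIdx.x;
    if (idx >= p.ntp) return;
    double dx = tp.rh[i].x - tp.rh[idx].x;
    double dy = tp.rh[i].y - tp.rh[idx].y;
    double dz = tp.rh[i].z - tp.rh[idx].z;
    tp.D_rel[idx] = dx*dx + dy*dy + dz*dz;
}

// Host side: the launch boundary replaces the impossible device-wide sync.
// for (int i = 0; i < ntp; i++) {
//     compute_distances<<<grid, block>>>(tp, p, i);
//     scan_for_clusters<<<1, 1>>>(t, tp, p, i);  // the old idx == 0 logic
// }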
I use a GTX 280, which has compute capability 1.3 and supports atomic operations on shared memory. I am using CUDA SDK 2.2 and VS 2005. In my program I have to use atomic operations extensively because there is simply no other way.
One example is that I have to calculate the running sum of an array and find the index where the sum exceeds a given cutoff value. For this I am using a variant of the scan algorithm together with atomicMin to store the index while the value is less than the threshold, so that at the end shared memory holds the index where the value is just less than the threshold.
This is just one component of the kernel; there are many similar code blocks in the kernel call.
I am having 3 problems:
Firstly, I have not been able to compile the code, as it says atomic operations are not defined; I have searched but not found which file I have to add.
Second, I somehow managed to compile the code by copying it into a project provided by the CUDA SDK, but then it says atomic operations are not supported on shared memory, whereas they do run in the program below.
Third, even when I worked around it with a hack, passing -arch sm_12 on the compilation command line, the code snippets using these atomic operations take an awful lot of time.
I believe that even in the worst case I should get some sort of speed-up, because there are not very many atomic operations and I am using 1 block of 16x16. Unfortunately, the serial code runs 10x faster.
Below I am posting the kernel code. This kernel call seems to be the bottleneck, so if anyone could help me optimize it, that would be nice. The serial code just performs these actions in a serial manner. I am using a block configuration of 16 x 16.
The code seems lengthy, but it actually contains an if block and a while block that perform almost the same task; they could not be merged.
#define limit (int)(log((float)256)/log((float)2))
// This receives a pointer to an image, some variables and 4 more arrays cont(of size 256) vars(some constants), lim and buf(of image size)
// block configuration 1 block of 16x16
__global__ void kernel_Main(unsigned char* in, int height,int width, int bs,int th, double cutoff, uint* cont,int* vars, unsigned int* lim,unsigned int* buf)
{
int j = threadIdx.x;
int i = threadIdx.y;
int k = i*blockDim.x+j;
__shared__ int prefix_sum[256];
__shared__ int sum_s[256];
__shared__ int ary_shared[256];
__shared__ int he_shared[256];
// this is the threshold
int cutval = (2*width*height)*cutoff;
prefix_sum[k] = cont[k];
int l;
// a variant of scan algorithm
for(l=0;l<=limit;l++)
{
sum_s[k]=prefix_sum[k];
if(k >= (int)pow((float)2,(float)l))
{
prefix_sum[k]+=sum_s[k-(int)pow((float)2,(float)l)];
// Find out the minimum index for which the cumulative sum crosses the threshold
if(prefix_sum[k] > cutval)
{
atomicMin(&vars[cut],k);
}
}
__syncthreads();
}
// The first thread will store the value in global array
if(k==0)
{
vars[cuts]=prefix_sum[vars[cut]];
}
__syncthreads();
if(vars[n])
{
// bs = 7 in this case
if(i<bs && j<bs)
{
// using atomic add because the index could be same for 2 different threads
atomicAdd(&ary_shared[in[i*(width) + j]],1);
}
__syncthreads();
int minth = 1>((bs*bs)/20)? 1: ((bs*bs)/20);
prefix_sum[k] = ary_shared[k];
sum_s[k] = 0;
// Again prefix sum
int l;
for(l=0;l<=limit;l++)
{
sum_s[k]=prefix_sum[k];
if(k >= (int)pow((float)2,(float)l))
{
prefix_sum[k]+=sum_s[k-(int)pow((float)2,(float)l)];
// Find out the minimum index for which the cumulative sum crosses the threshold
if(prefix_sum[k] > minth)
{
atomicMin(&vars[hmin],k);
}
}
__syncthreads();
}
// set the maximum value here
if(k==0)
{
vars[hminc]=prefix_sum[255];
// because we will always overshoot by 1
vars[hmin]--;
}
__syncthreads();
int maxth = 1>((bs*bs)/20)? 1: ((bs*bs)/20);
prefix_sum[k] = ary_shared[255-k];
for(l=0;l<=limit;l++)
{
sum_s[k]=prefix_sum[k];
if(k >= (int)pow((float)2,(float)l))
{
prefix_sum[k]+=sum_s[k-(int)pow((float)2,(float)l)];
// Find out the minimum index for which the cumulative sum crosses the threshold
if(prefix_sum[k] > maxth)
{
atomicMin(&vars[hmax], k);
}
}
__syncthreads();
}
// set the maximum value here
if(k==0)
{
vars[hmaxc]=prefix_sum[255];
vars[hmax]--;
vars[hmax]=255-vars[hmax];
}
__syncthreads();
int rng = vars[hmax] - vars[hmin];
if(rng >= vars[cut])
{
if( k <= vars[hmin] )
he_shared[k] = 0;
else if( k >= vars[hmax])
he_shared[k] = 255;
else
he_shared[k] = (255 * (k - vars[hmin])) / rng;
}
__syncthreads();
// only 7x7 = 49 threads will do this
if(i>0 && i<=bs && j>0 && j<=bs)
{
int base = (vars[oy]*width+vars[ox])+ (i-1)*width + (j-1);
if(rng >= vars[cut])
{
int value = he_shared[in[base]];
buf[base]+=value;
lim[base]++;
}
else
{
buf[base]+=255;
lim[base]++;
}
}
if(k==0)
vars[n]--;
__syncthreads();
}// if(n) block closes here
while(vars[n])
{
if(k==0)
{
if( vars[ox]==0 && vars[d1] ==3 )
vars[d1] = 0; // l2r
else if( vars[ox]==0 && vars[d1]==2 )
vars[d1] = 3; // l u2d
else if( vars[ox]==width-bs && vars[d1]==0)
vars[d1] = 1; // r u2d
else if( vars[ox]==width-bs && vars[d1]==1)
vars[d1] = 2; // r2l
}
// Because this value will be changed so
// all the threads should set their registers before
// they move forward
int ox_d = vars[ox];
int oy_d = vars[oy];
// Just putting it here so that all the threads should have set their
// values before moving on, as this value will be changed
__syncthreads();
if(vars[d1]==0)
{
if(i == 0 && j < bs)
{
int index = j*width + ox_d + oy_d*width;
int index2 = j*width + ox_d + oy_d*width +bs;
atomicSub(&ary_shared[in[index]],1);
atomicAdd(&ary_shared[in[index2]],1);
}
// The first thread of the first block should set this value
if(k==0)
vars[ox]++;
}
else if(vars[d1]==1||vars[d1]==3)
{
if(i == 0 && j < bs)
{
/*if(j==0)
printf("Entered 1||3\n");*/
int index = j*width + ox_d + oy_d*width;
int index2 = j*width + ox_d + (oy_d+bs)*width;
atomicSub(&ary_shared[in[index]],1);
atomicAdd(&ary_shared[in[index2]],1);
}
// The first thread of the first block should set this value
if(k==0)
vars[oy]++;
}
else if(vars[d1]==2)
{
if(i == 0 && j < bs)
{
int index = j*width + ox_d-1 + oy_d*width;
int index2 = j*width + ox_d-1 + oy_d*width +bs;
atomicAdd(&ary_shared[in[index]],1);
atomicSub(&ary_shared[in[index2]],1);
}
// The first thread of the first block should set this value
if(k==0 )
vars[ox]--;
}
__syncthreads();
//ary_shared has been calculated
// Reset the hmin and hminc values
// again the same task as done in the if(n) loop
if(k==0)
{
vars[hmin]=0;
vars[hminc]=0;
vars[hmax]=0;
vars[hmaxc]=0;
}
__syncthreads();
int minth = 1>((bs*bs)/20)? 1: ((bs*bs)/20);
prefix_sum[k] = ary_shared[k];
int l;
for(l=0;l<=limit;l++)
{
sum_s[k]=prefix_sum[k];
if(k >= (int)pow((float)2,(float)l))
{
prefix_sum[k]+=sum_s[k-(int)pow((float)2,(float)l)];
// Find out the minimum index for which the cumulative sum crosses the threshold
if(prefix_sum[k] > minth)
{
atomicMin(&vars[hmin],k);
}
}
__syncthreads();
}
// set the maximum value here
if(k==0)
{
vars[hminc]=prefix_sum[255];
vars[hmin]--;
}
__syncthreads();
// Calculate maxth
int maxth = 1>((bs*bs)/20)? 1: ((bs*bs)/20);
prefix_sum[k] = ary_shared[255-k];
for(l=0;l<=limit;l++)
{
sum_s[k]=prefix_sum[k];
if(k >= (int)pow((float)2,(float)l))
{
prefix_sum[k]+=sum_s[k-(int)pow((float)2,(float)l)];
// Find out the minimum index for which the cumulative sum crosses the threshold
if(prefix_sum[k] > maxth)
{
atomicMin(&vars[hmax], k);
}
}
__syncthreads();
}
// set the maximum value here
if(k==0)
{
vars[hmaxc]=prefix_sum[255];
vars[hmax]--;
vars[hmax]=255-vars[hmax];
}
__syncthreads();
int rng = vars[hmax] - vars[hmin];
if(rng >= vars[cut])
{
if( k <= vars[hmin] )
he_shared[k] = 0;
else if( k >= vars[hmax])
he_shared[k] = 255;
else
he_shared[k] = (255 * (k - vars[hmin])) / rng;
}
__syncthreads();
if(i>0 && i<=bs && j>0 && j<=bs)
{
int base = (vars[oy]*width+vars[ox])+ (i-1)*width + (j-1);
if(rng >= vars[cut])
{
int value = he_shared[in[base]];
buf[base]+=value;
lim[base]++;
}
else
{
buf[base]+=255;
lim[base]++;
}
}
// This just might cause a little bit of problem
if(k==0)
vars[n]--;
// All threads will wait here before continuing the while loop
__syncthreads();
}// end of while(n)
}
Firstly, you need -arch sm_12 (or in your case it should really be -arch sm_13) to enable shared-memory atomic operations.
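For example (a sketch of a compile line; the file names are placeholders):
nvcc -arch=sm_13 --compile -o main.cu.obj main.cu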
As for performance, there is no guarantee that your kernel will be any faster than normal code on the CPU. There are many problems that really do not fit well into the CUDA model, and these may indeed run much slower than on the CPU. You need to do some analysis/design/modelling before coding any CUDA kernels, to prevent yourself from wasting a lot of time on something that is never going to fly.
Having said that, there may be a more efficient way to implement your algorithm; maybe you could post the CPU code and invite ideas on how to implement it efficiently in CUDA?