Identifying functions writing to memory at LLVM-IR level - llvm-clang

How can I retrieve a pointer to the memory area that is accessed by a function call?
I have written a Pass that can intercept function calls and detect whether a call writes to memory or not using Instruction::mayWriteToMemory(). Whether a function accesses memory or not can be checked with
Instruction::mayReadFromMemory() and Instruction::mayWriteToMemory(), but how can I get the starting address of the memory areas (program variables) that are being written to?
For example, for the c code
/* Example program: copies the string in arrs over arrd with strcpy, then prints arrd. */
int main(){
/* Destination buffer: 20 characters plus the terminating NUL, i.e. 21 bytes. */
char arrd[]="This is destination ";
/* Source string: 13 characters plus the terminating NUL, i.e. 14 bytes. */
char arrs[]="COPIED STRING";
/* The call of interest: strcpy writes into the memory of arrd. */
strcpy(arrd, arrs);
printf("Final copied string : %s\n", arrd);
return 0;
}
I want to get the starting address and end address of the memory areas (variables) being written by the strcpy function.
The LLVM-IR code generated for this is
; Stack slots for the two local arrays of main.
%arrd = alloca [21 x i8], align 16
%arrs = alloca [14 x i8], align 1
; Initialise arrd by copying 21 bytes from the constant @main.arrd.
%0 = bitcast [21 x i8]* %arrd to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* getelementptr inbounds ([21 x i8]* @main.arrd, i32 0, i32 0), i64 21, i32 16, i1 false)
; Initialise arrs by copying 14 bytes from the constant @main.arrs.
%1 = bitcast [14 x i8]* %arrs to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* getelementptr inbounds ([14 x i8]* @main.arrs, i32 0, i32 0), i64 14, i32 1, i1 false)
; Decay both arrays to i8* and call strcpy; the first argument (%arraydecay)
; is the pointer into %arrd that strcpy writes through.
%arraydecay = getelementptr inbounds [21 x i8]* %arrd, i32 0, i32 0
%arraydecay1 = getelementptr inbounds [14 x i8]* %arrs, i32 0, i32 0
%call = call i8* @strcpy(i8* %arraydecay, i8* %arraydecay1) #1
; Print the destination buffer using the format string @.str.
%arraydecay2 = getelementptr inbounds [21 x i8]* %arrd, i32 0, i32 0
%call3 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([26 x i8]* @.str, i32 0, i32 0), i8* %arraydecay2)
ret i32 0

Related

CUDA graph does not run as expected

I'm using the following code to learn how to use "CUDA graphs". The parameter NSTEP is set to 1000, and the parameter NKERNEL is set to 20. The kernel function shortKernel has three parameters and performs a simple calculation.
#include <cuda_runtime.h>
#include <iostream>
#define N 131072 // tuned such that kernel takes a few microseconds
#define NSTEP 1000
#define NKERNEL 20
#define BLOCKS 256
#define THREADS 512
// Evaluates a CUDA runtime call exactly once. If the call does not return
// cudaSuccess, prints the file, line, numeric error code and error text, then
// terminates the process with exit status 1.
// The argument is parenthesized so that any expression can be passed safely,
// and the do { ... } while (0) wrapper makes the macro usable as one statement.
#define CHECK(call) \
do { \
    const cudaError_t error_code = (call); \
    if (error_code != cudaSuccess) { \
        printf("CUDA Error\n"); \
        printf(" File: %s\n", __FILE__); \
        printf(" Line: %d\n", __LINE__); \
        printf(" Error code: %d\n", static_cast<int>(error_code)); \
        printf(" Error text: %s\n", cudaGetErrorString(error_code)); \
        exit(1); \
    } \
} while (0)
// Element-wise kernel: out_d[idx] = 1.23 * in_d[idx] + i for every idx < N.
// Launch layout: 1D grid of 1D blocks, one thread per element; threads whose
// global index is >= N do nothing. The same offset i is applied by every
// thread, so all N output elements receive the same formula.
__global__ void shortKernel(float * out_d, float * in_d, int i){
    const int idx = blockIdx.x*blockDim.x + threadIdx.x;
    // Float literal keeps the arithmetic in single precision (a bare 1.23 is a
    // double and would promote the whole expression).
    if (idx < N) out_d[idx] = 1.23f*in_d[idx] + i;
}
// Captures NKERNEL launches of shortKernel from `stream` into a CUDA graph
// (once, on the first step), replays that graph NSTEP times, prints the time
// measured between two events and finally prints every element of y.
// Every captured launch writes y from the unchanged x, so after a replay y
// holds the result of the last captured launch (i = NKERNEL - 1).
void test2() {
    // Select the device before creating the stream and the other resources.
    CHECK(cudaSetDevice(0));
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // Host buffers are allocated as pinned heap memory rather than as local
    // arrays: two float[N] locals would need 1 MiB of stack.
    float *x_host = NULL, *y_host = NULL;
    CHECK(cudaMallocHost((void**)&x_host, N * sizeof(float)));
    CHECK(cudaMallocHost((void**)&y_host, N * sizeof(float)));
    // initialize x and y arrays on the host
    for (int i = 0; i < N; i++) {
        x_host[i] = 2.0f;
        y_host[i] = 2.0f;
    }

    float *x = NULL, *y = NULL;
    CHECK(cudaMalloc((void**)&x, N * sizeof(float)));
    CHECK(cudaMalloc((void**)&y, N * sizeof(float)));
    CHECK(cudaMemcpy(x, x_host, sizeof(float) * N, cudaMemcpyHostToDevice));

    cudaEvent_t begin, end;
    CHECK(cudaEventCreate(&begin));
    CHECK(cudaEventCreate(&end));
    // start recording
    CHECK(cudaEventRecord(begin, stream));

    bool graphCreated = false;
    cudaGraph_t graph;
    cudaGraphExec_t instance;
    // Run graphs
    for (int istep = 0; istep < NSTEP; istep++) {
        if (!graphCreated) {
            // Record the kernel sequence instead of executing it.
            CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
            for (int ikrnl = 0; ikrnl < NKERNEL; ikrnl++) {
                shortKernel<<<BLOCKS, THREADS, 0, stream>>>(y, x, ikrnl);
                // A launch does not return a status; query it explicitly.
                CHECK(cudaGetLastError());
            }
            CHECK(cudaStreamEndCapture(stream, &graph));

            // Passing a NULL node array only queries the node count.
            cudaGraphNode_t* nodes = NULL;
            size_t num_nodes = 0;
            CHECK(cudaGraphGetNodes(graph, nodes, &num_nodes));
            std::cout << "Num of nodes in the graph: " << num_nodes
                      << std::endl;
            CHECK(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));
            graphCreated = true;
        }
        CHECK(cudaGraphLaunch(instance, stream));
        CHECK(cudaStreamSynchronize(stream));
    } // End run graphs

    CHECK(cudaEventRecord(end, stream));
    CHECK(cudaEventSynchronize(end));
    float time_ms = 0;
    CHECK(cudaEventElapsedTime(&time_ms, begin, end));
    std::cout << "CUDA Graph - CUDA Kernel overall time: " << time_ms << " ms" << std::endl;

    CHECK(cudaMemcpy(y_host, y, sizeof(float) * N, cudaMemcpyDeviceToHost));
    for (int i = 0; i < N; i++) {
        std::cout << "res " << y_host[i] << std::endl;
    }

    // Release everything that was created above.
    if (graphCreated) {
        CHECK(cudaGraphExecDestroy(instance));
        CHECK(cudaGraphDestroy(graph));
    }
    CHECK(cudaEventDestroy(begin));
    CHECK(cudaEventDestroy(end));
    CHECK(cudaFree(x));
    CHECK(cudaFree(y));
    CHECK(cudaFreeHost(x_host));
    CHECK(cudaFreeHost(y_host));
    CHECK(cudaStreamDestroy(stream));
}
// Entry point: runs the CUDA graph experiment and then prints a completion marker.
int main() {
    test2();

    std::cout << "end";
    std::cout << std::endl;
    return 0;
}
My expected results are shown as the following:
res 2.46
res 3.46
res 4.46
res 5.46
res 6.46
...
However, the actual results are shown like this:
res 21.46
res 21.46
res 21.46
res 21.46
res 21.46
res 21.46
...
It seems that the all kernels' parameter i is set as NKERNEL-1. I am very confused about it, could someone give any explanations? Thanks!
I had changed the for loop as follows:
// Run graphs
for(int istep=0; istep<NSTEP; istep++){
if(!graphCreated){
cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
for(int ikrnl=0; ikrnl<NKERNEL; ikrnl++){
if(ikrnl == 0)
shortKernel<<<BLOCKS, THREADS, 0, stream>>>(y, x, 0);
else if(ikrnl == 1)
shortKernel<<<BLOCKS, THREADS, 0, stream>>>(y, x, 1);
else if(ikrnl == 2)
shortKernel<<<BLOCKS, THREADS, 0, stream>>>(y, x, 2);
else
shortKernel<<<BLOCKS, THREADS, 0, stream>>>(y, x, ikrnl);
}
cudaStreamEndCapture(stream, &graph);
cudaGraphNode_t* nodes = NULL;
size_t num_nodes = 0;
CHECK(cudaGraphGetNodes(graph, nodes, &num_nodes));
std::cout << "Num of nodes in the graph: " << num_nodes
<< std::endl;
CHECK(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));
graphCreated=true;
}
CHECK(cudaGraphLaunch(instance, stream));
cudaStreamSynchronize(stream);
} // End run graphs
However, the results are still the same:
res 21.46
res 21.46
res 21.46
res 21.46
res 21.46
res 21.46
...
The results are expected and correct.
Every time you run the graph, this entire for-loop gets executed:
for(int ikrnl=0; ikrnl<NKERNEL; ikrnl++){
shortKernel<<<BLOCKS, THREADS, 0, stream>>>(y, x, ikrnl);
}
After the first iteration of that for-loop, the results will all be 2.46, after the second iteration the results will all be 3.46, and after the 20th iteration (ikrnl = 19) the results will all be 21.46.
Every time you run the graph, you will get that same result.
Expecting any kind of variation in the result such as this:
res 2.46
res 3.46
res 4.46
res 5.46
res 6.46
Is completely illogical, because every thread is doing precisely the same thing. Every thread starts with the same value in x, and does the same calculation on it. There is no reason to expect any difference between y[0] and y[1], for example.
Rather than trying to wade through CUDA graphs, it's clear you don't have a good grasp of what the kernel is doing. My suggestion would be that you write an ordinary CUDA code that calls that kernel just once, without any CUDA graph usage, and study the output. After that, you can put a for-loop around the kernel, and watch the result behavior after every iteration of the for-loop. You don't need CUDA graphs to understand what is going on here.

Iterate_level after SetRectangle in tesseract Python wrapper

I have an image with a few lines of Japanese kanji. I am trying to modify the bounds of the detected boxes before iterating over them to get the font and its attributes.
I checked that Pyplot displays the boxes correctly. The text is also returned correctly if I remove the loop that calls SetRectangle.
My function is as below:
def font_attr(resized):
    """Recognize each detected text line and collect its text, confidence and font attributes.

    Args:
        resized: image array accepted by ``Image.fromarray``.

    Returns:
        A tuple ``(font, attributes)``: ``font`` is a list of
        ``[text, 'confidence: ', confidence]`` entries and ``attributes`` is the
        list of the matching ``WordFontAttributes()`` results, accumulated over
        all text-line boxes.

    NOTE(review): the block structure was reconstructed from a listing without
    indentation; confirm the nesting (in particular where the plot is shown and
    where the result lists are created) against the original file.
    """
    img = resized
    image = Image.fromarray(img)
    with PyTessBaseAPI(path='C:\\Users\\mdelal001\\fast_format_assist\\tessdata\\', lang='jpn+msp+hgp', oem=0, psm=3) as api:
        api.SetImage(image)
        boxes = api.GetComponentImages(RIL.TEXTLINE, True)
        delta = 5
        image_array = np.array(image)
        img_w, img_h = image.size

        # Draw every padded box so the regions can be inspected visually.
        for box in boxes:
            print(box)
            box = box[1]
            x, y, w, h = box['x'] - delta, box['y'] - delta, box['w'] + 2 * delta, box['h'] + 2 * delta
            cv2.line(image_array, (x, y), (x + w, y), (0, 0, 0), 2)
            cv2.line(image_array, (x, y), (x, y + h), (0, 0, 0), 2)
            cv2.line(image_array, (x + w, y), (x + w, y + h), (0, 0, 0), 2)
            cv2.line(image_array, (x, y + h), (x + w, y + h), (0, 0, 0), 2)
        plt.imshow(image_array)
        plt.show()

        # Created once, before the loop, so results of all boxes are kept.
        font = []
        attributes = []
        for i, (im, box, _, _) in enumerate(boxes):
            print(i, (im, box, _, _))
            # Keep the padded rectangle inside the image.
            left = max(box['x'] - delta, 0)
            top = max(box['y'] - delta, 0)
            right = min(box['x'] + box['w'] + delta, img_w)
            bottom = min(box['y'] + box['h'] + delta, img_h)
            api.SetRectangle(left, top, right - left, bottom - top)
            api.Recognize()
            ri = api.GetIterator()
            for r in iterate_level(ri, RIL.BLOCK):
                try:
                    symbol = r.GetUTF8Text(RIL.BLOCK)
                except RuntimeError:
                    # tesserocr raises RuntimeError("No text returned") when
                    # nothing was recognized at this level; skip the element
                    # instead of aborting the whole scan.
                    continue
                conf = r.Confidence(RIL.BLOCK)
                symbol = symbol.replace('\n',' ').replace(' ', '')
                word_attributes = r.WordFontAttributes()
                if not symbol:
                    continue
                font.append([symbol, 'confidence: ',conf])
                attributes.append(word_attributes)
    return font, attributes
The error:
Traceback (most recent call last):
File "c:\Users\m1\fast_format_assist\font_reader.py", line 338, in <module>
attr = font_attr(resized)
File "c:\Users\m1\fast_format_assist\font_reader.py", line 216, in font_attr
symbol = r.GetUTF8Text(RIL.TEXTLINE)
File "tesserocr.pyx", line 820, in tesserocr._tesserocr.PyLTRResultIterator.GetUTF8Text
RuntimeError: No text returned
Better you go with easyocr for this case.
import easyocr

# Build a CPU-only reader and run detection + recognition on the image file.
# NOTE(review): 'ch_sim' selects Simplified Chinese; for Japanese text the
# easyocr language code would presumably be 'ja' - confirm before reuse.
reader = easyocr.Reader(['ch_sim'],gpu=False)
result = reader.readtext('5.png')
# Each detection is printed as returned: (bounding box, text, confidence).
for detection in result:
    print(detection)
the result is,
([[0, 0], [384, 0], [384, 70], [0, 70]], '二九办一番目', 0.04030529339493764)
([[0, 75], [381, 75], [381, 155], [0, 155]], '二九加二番目', 0.06939456423064959)
([[0, 164], [380, 164], [380, 242], [0, 242]], '二九|4别0去', 0.02179895938530989)
([[0, 249], [385, 249], [385, 335], [0, 335]], '二九|寸危害学', 0.01723975047661578)
([[0, 335], [385, 335], [385, 421], [0, 421]], '二九(丈著盥目', 0.004155675980245961)
([[0, 424], [382, 424], [382, 499], [0, 499]], '二札|击六番目', 0.022589517592635434)
You can install easyocr by pip install easyocr

CUDA C - CRC32 - Finding unknown polynom and crcxor - program speed up

I was looking for questions related to my problem but only found questions regarding CRC32 reversing. My topic is a bit different.
I am a novice programmer and I have the following task to do. I have input data (3 strings of 4 bytes each). For this data, I know three checksums computed using a hash function similar to CRC32. However, it is not a standard CRC32, because the polynomial and the crcxor parameter differ from the default values and are unknown.
So for the 4-byte input data I calculate the CRC using different values of the polynomial from 0 to 0xFFFFFFFF and different values of the crcxor parameter between 0 and 0xFFFF. I wrote this program in CUDA C because it runs faster than on the CPU. This is my third CUDA C program right after "Hello World" and "VectorAdd" :). Calculating all possible 0xFFFF x 0xFFFFFFFF variants takes about 5 hours on my NVIDIA GTX1060 card.
I wanted to ask if it is possible to modify or optimize the following program code in order to do this task faster?
Ultimately, I would like to calculate 0xFFFFFFFF x 0xFFFFFFFF but I don't know yet if it can be done in a short time.
If anyone would like to have a look at my program code and provide valuable feedback, I would be extremely grateful.
#include <stdio.h>
#include <cuda_runtime.h>
#include <cuda_profiler_api.h>
__device__ unsigned long calculate_crc(unsigned long data, unsigned long poly, unsigned long cxor)
// Bitwise CRC over the low 32 bits of data (most significant bit first),
// specialised for crcinit = 0, refin = 0, refout = 0, direct = 0.
// The register is XORed with cxor at the end and the result is truncated to 32 bits.
{
    unsigned long reg = 0;
    for (int shift = 31; shift >= 0; --shift)
    {
        // Feedback is the register's bit 31 XOR the next message bit.
        const bool feedback = (((reg >> 31) ^ (data >> shift)) & 1UL) != 0;
        reg <<= 1;
        if (feedback)
        {
            reg ^= poly;
        }
    }
    return (reg ^ cxor) & 0xFFFFFFFF;
}
// Brute-force search kernel: every thread derives one (polynomial, crcxor)
// candidate from its global x/y/z index and tests it against three known
// (data, CRC) pairs, printing the candidate when all three CRCs match.
//
// Launch layout: the global x index supplies the upper 16 bits of the
// polynomial, the global y index the lower 16 bits and the global z index the
// crcxor value. Each thread tests exactly one candidate (there is no loop over
// further candidates), so the grid must span n threads on every axis that is
// to be searched. Threads whose index is >= n on any axis do nothing.
__global__ void calculate_crc_parameters(unsigned long n)
{
    //Input data:
    const unsigned long data1 = 0x928F640C;
    const unsigned long data2 = 0x0121B30E;
    const unsigned long data3 = 0xCB652607;
    // calculated CRC for the above input data and for polynom 0xFF7A1DB7: crcinit: 0, crcxor: 0x00000000, refin: 0, refout: 0, direct: 0:
    // for these CRCs, the function should find the polynomial 0xFF7A1DB7 and crcxor = 0
    // finds it right away because crcxor = 0
    const unsigned long crc1 = 0x7076BCEB;
    const unsigned long crc2 = 0x1F719D7A;
    const unsigned long crc3 = 0x8369D986;
    // other example crc - for crcxor> 0
    // computed CRC for polynom 0xFF7A1DB7: crcinit: 0, crcxor: 0x000000FF, refin: 0, refout: 0, direct: 0:
    // for these CRCs, the function should find the polynomial 0xFF7A1DB7 and crcxor = 0x000000FF
    // Program find it after 1m 12sec.
    /*
    const unsigned long crc1 = 0x7076BC14;
    const unsigned long crc2 = 0x1F719D85;
    const unsigned long crc3 = 0x8369D979;
    */
    // computed CRC for polynom 0xFF7A1DB7: crcinit: 0, crcxor: 0x0000FFFE, refin: 0, refout: 0, direct: 0:
    // for these CRCs, the function should find the polynomial 0xFF7A1DB7 and crcxor = 0x0000FFFE
    // searches for 5 hours
    /*
    const unsigned long crc1 = 0x70764315;
    const unsigned long crc2 = 0x1F716284;
    const unsigned long crc3 = 0x83692678;
    */
    // CRCs - polynom 0xFF7A1DB7: crcinit: 0, crcxor: 0xFF7A1DB7, refin: 0, refout: 0, direct: 0:
    // no implementation for 8-byte crcxor yet - and it would count for a long time
    /*
    const unsigned long crc1 = 0x8F0CA15C;
    const unsigned long crc2 = 0xE00B80CD;
    const unsigned long crc3 = 0x7C13C431;
    */
    const unsigned int index_x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int index_y = blockIdx.y * blockDim.y + threadIdx.y;
    const unsigned int index_z = blockIdx.z * blockDim.z + threadIdx.z;
    if ((index_x >= n) || (index_y >= n) || (index_z >= n))
    {
        return;
    }

    // "gluing" the polynomial:
    // to get polynom e.g. 0xFF7A1DB7 we have to "glue it" with index_x and index_y
    // if index_x == 0xFF7A then LSH by 16 places and we get 0xFF7A0000
    // then xor from index_y: 0xFF7A0000 xor 0x00001DB7 and is 0xFF7A1DB7
    const unsigned int polynom = (index_x << 16) ^ index_y;
    // crcxor will take the values of index_z that is from 0x0000 to 0xFFFF
    const unsigned int crcxor = index_z;

    // compute the three checksums and compare them; the candidate is reported
    // only if all three agree with the known CRCs. Both values are 32-bit
    // unsigned ints, which is what the %08X conversions expect.
    if (calculate_crc(data1, polynom, crcxor) == crc1 &&
        calculate_crc(data2, polynom, crcxor) == crc2 &&
        calculate_crc(data3, polynom, crcxor) == crc3)
    {
        printf("\nCRC parameters found ---> polynom: 0x%08X, crcxor: 0x%08X\n", polynom, crcxor);
    }

    // Progress marker: printed by the last polynomial of every 256th crcxor
    // value, so a full 0x10000 sweep of crcxor displays '#' 256 times.
    if (((crcxor & 0xFF) == 0) && (polynom == 0xFFFFFFFF)) printf("#");
}
// Launches the brute-force CRC parameter search and waits for it to finish.
// Returns 0 on success and 1 if the launch or the kernel execution failed.
int main(void)
{
    const unsigned long N = 0x10000; // 0xFFFF + 0x01 = 65536dec
    ////////////////////////////////////////////////
    // for computing only in X and Y axes - for crcxor = zero all the time
    dim3 dimBlock( 4, 4, 1);
    dim3 dimGrid(16384, 16384, 1);
    ////////////////////////////////////////////////
    // for computing on the X, Y and Z axes, i.e. for crcxor taking values from the Z axis from 0 to 65535
    //dim3 dimBlock( 4, 4, 64); // 4 * 4 * 64 = 1024 --- maximum block size
    //dim3 dimGrid(16384, 16384, 1024); //uncomment this 2 lines for crcxor > 0
    // 4 4 64
    // * * *
    // 16384 16384 1024
    // = = =
    // 0x10000 0x10000 0x10000
    // x, y, and z will trigger 65,536 times each
    cudaProfilerStart();
    calculate_crc_parameters<<<dimGrid, dimBlock>>>(N);
    // A launch returns no status: configuration errors are reported by
    // cudaGetLastError(), execution errors by the following synchronization.
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess)
    {
        status = cudaDeviceSynchronize();
    }
    // Stop profiling before the device is reset.
    cudaProfilerStop();
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
    }
    cudaDeviceReset();
    return (status == cudaSuccess) ? 0 : 1;
}
I compile it in cmd by: nvcc name.cu -o name
I work on win10 with Cuda Toolkit 11.5
Card is NVIDIA GTX 1060.
Could the use of pointers or memory allocations somehow speed up this program?
I compute the test CRC values here.
Optimization should begin with the algorithm, as opposed to optimizing a painfully pointless brute-force approach.
You can factor the search for a polynomial and a final exclusive-or, doing the polynomial first, and then (trivially) finding the exclusive-or value. All you need to do is take the exclusive-or of two of your data values, and then find the polynomial that produces the exclusive-or of the two CRCs of those values, assuming a zero final exclusive-or. You will need to try at least two pairs in order to narrow it down to one choice for the polynomial.
Once you have the polynomial, now compute the CRC on one of your data values, exclusive-or that with the desired CRC, and now you have your final exclusive-or value. No search needed for the second step.
The polynomial search is fast enough that you can just use your CPU. No GPU or CUDA or whatever is needed. It took 40 seconds on my three-year old laptop. You only need to try odd polynomials. Even polynomials are not valid.
Exclusive-oring the data and the CRCs also cancels the initial value. So you can find the polynomial this way for CRCs that have both a non-zero initial value and a non-zero final exclusive-or. However, in order to then solve for both the initial value and final exclusive-or, you will need examples with different length messages, i.e. other than all four-byte messages. There are $2^{32}$ possible combinations of initial value and final exclusive-or that will match any and all CRCs of four-byte messages.
As an aside, your CRC routine is needlessly complicated. See equivalent below. This prints poly = ff7a1db7, xor = 0000fffe:
#include <stdio.h>
#include <stdint.h>
/* CRC of one 32-bit message: the message is loaded into the register and
 * shifted out through 32 steps, XORing in poly whenever the bit shifted out
 * is set; the final register is XORed with xor. */
uint32_t calculate_crc(uint32_t data, uint32_t poly, uint32_t xor) {
    uint32_t reg = data;
    int remaining = 32;
    while (remaining-- > 0) {
        const uint32_t top = reg & 0x80000000;
        reg <<= 1;
        if (top)
            reg ^= poly;
    }
    return reg ^ xor;
}
/* Searches all odd 32-bit polynomials for those consistent with three
 * (data, crc) samples and prints each match together with the final
 * exclusive-or value it implies.
 *
 * The samples are first XORed pairwise (data and CRC alike) so the candidates
 * can be tested with a final exclusive-or of zero; the exclusive-or value is
 * then recovered from the untouched second sample. */
void findp(uint32_t data1, uint32_t data2, uint32_t data3,
uint32_t crc1, uint32_t crc2, uint32_t crc3) {
    /* Keep the original second sample for computing the final exclusive-or. */
    const uint32_t ref_data = data2;
    const uint32_t ref_crc = crc2;

    data1 ^= data3;
    crc1 ^= crc3;
    data2 ^= data3;
    crc2 ^= crc3;
    data3 ^= ref_data;
    crc3 ^= ref_crc;

    /* poly runs over 1, 3, 5, ... and stops once it wraps around to 1 again. */
    for (uint32_t poly = 1;;) {
        const int matches = calculate_crc(data1, poly, 0) == crc1 &&
                            calculate_crc(data2, poly, 0) == crc2 &&
                            calculate_crc(data3, poly, 0) == crc3;
        if (matches) {
            printf("poly = %08x, xor = %08x\n",
                   poly, calculate_crc(ref_data, poly, 0) ^ ref_crc);
        }
        poly += 2;
        if (poly == 1)
            break;
    }
}
/* Runs the polynomial search on the three sample words and their CRCs. */
int main(void) {
    const uint32_t data[3] = {0x928F640C, 0x0121B30E, 0xCB652607};
    const uint32_t crc[3] = {0x70764315, 0x1F716284, 0x83692678};

    findp(data[0], data[1], data[2],
          crc[0], crc[1], crc[2]);
    return 0;
}
There is an even faster, in fact massively faster, approach by solving a set of linear equations over GF(2). However it would take me longer than 40 seconds to write that code, so this is where I would stop. Unless I had many, many of these CRCs to find. Or unless I was trying to find, for example, a 64-bit CRC polynomial.

CUDA texture object -- incorrect interpolation in non-normalized mode

Non-normalized linear interpolation from a CUDA texture object bound to a CUDA array appears to be returning incorrect results. It appears that the interpolated values are a factor of 0.5 smaller than expected. Normalized linear interpolation appears to be working properly.
Is there something wrong in this code? Are we expected to multiply by 2 when doing non-normalized texture interpolation?
The code:
#include <iostream>
#include <cstdio>
// Writes the first `length` elements of `a` to std::cout, one per line,
// in the form "a[index]: value".
template <typename T>
void print_array(const T *a, const size_t length) {
    size_t pos = 0;
    while (pos != length) {
        std::cout << "a[" << pos << "]: " << a[pos] << std::endl;
        ++pos;
    }
}
// Samples the 1D texture object `tex` at count + 1 positions, starting at
// `start` and advancing by (stop - start) / count each time, and prints every
// (x, y) pair with device printf. A count below 1 is treated as 1.
// All sampling happens sequentially inside each thread that runs the kernel.
__global__
void cuda_texture_interpolate(cudaTextureObject_t tex,
                              float start,
                              float stop,
                              int count) {
    const int steps = (count < 1) ? 1 : count;
    const float h = (stop-start)/((float)steps);
    float x = start;
    // steps + 1 samples: one at the start and one after every increment.
    for (int k = 0; k <= steps; ++k) {
        const float y = tex1D<float>(tex,x);
        printf("x: %4g ; y: %4g\n",x,y);
        x = x + h;
    }
}
// Reports a failed CUDA runtime call on stderr; returns true when err is cudaSuccess.
static bool cuda_ok(cudaError_t err, const char *what) {
    if (err == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Copies a 5-element host array into a CUDA array, binds it to a texture
// object with linear filtering and non-normalized coordinates, and runs the
// interpolation kernel over [0, 5]. Returns 1 as soon as a CUDA call fails.
int main(void) {
    // set up host array
    int n = 5;
    float a_host[5] = {3,2,1,2,3};
    printf("printing array on host.\n");
    print_array(a_host,n);

    // allocate and copy to cuda array
    cudaChannelFormatDesc channelDesc =
        cudaCreateChannelDesc(32, 0, 0, 0,
                              cudaChannelFormatKindFloat);
    cudaArray* cuArray;
    if (!cuda_ok(cudaMallocArray(&cuArray, &channelDesc, n), "cudaMallocArray")) {
        return 1;
    }
    // Copy the host data into the array as a single row of n floats
    // (cudaMemcpy2DToArray replaces the deprecated cudaMemcpyToArray).
    if (!cuda_ok(cudaMemcpy2DToArray(cuArray, 0, 0, a_host,
                                     n*sizeof(float), n*sizeof(float), 1,
                                     cudaMemcpyHostToDevice),
                 "cudaMemcpy2DToArray")) {
        return 1;
    }

    // describe the resource and the sampling mode
    cudaResourceDesc resDesc;
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType = cudaResourceTypeArray;
    resDesc.res.array.array = cuArray;
    cudaTextureDesc texDesc;
    memset(&texDesc, 0, sizeof(texDesc));
    texDesc.addressMode[0] = cudaAddressModeClamp;
    texDesc.filterMode = cudaFilterModeLinear;
    texDesc.readMode = cudaReadModeElementType;
    //texDesc.normalizedCoords = 1;
    texDesc.normalizedCoords = 0;
    cudaResourceViewDesc resViewDesc;
    memset(&resViewDesc, 0, sizeof(resViewDesc));
    resViewDesc.format = cudaResViewFormatFloat1;
    resViewDesc.width = n;

    // create texture object
    cudaTextureObject_t tex;
    if (!cuda_ok(cudaCreateTextureObject(&tex, &resDesc, &texDesc, &resViewDesc),
                 "cudaCreateTextureObject")) {
        return 1;
    }

    // call interpolation kernel
    printf("interpolate (f(x) -> y).\n");
    //cuda_texture_interpolate<<<1,1>>>(tex,0.0,1.0,10);
    cuda_texture_interpolate<<<1,1>>>(tex,0.0f,5.0f,10);
    if (!cuda_ok(cudaGetLastError(), "kernel launch")) {
        return 1;
    }
    // The launch is asynchronous: wait for the kernel to finish before the
    // texture object and the array it samples are destroyed.
    if (!cuda_ok(cudaDeviceSynchronize(), "cudaDeviceSynchronize")) {
        return 1;
    }

    // clean up
    bool cleaned = cuda_ok(cudaDestroyTextureObject(tex), "cudaDestroyTextureObject");
    cleaned = cuda_ok(cudaFreeArray(cuArray), "cudaFreeArray") && cleaned;
    printf("end of texture_object_interpolation.\n");
    return cleaned ? 0 : 1;
}
The result:
$ ./texture_object_interpolation
printing array on host.
a[0]: 3
a[1]: 2
a[2]: 1
a[3]: 2
a[4]: 3
interpolate (f(x) -> y).
x: 0 ; y: 1.5
x: 0.5 ; y: 1.5
x: 1 ; y: 1.25
x: 1.5 ; y: 1
x: 2 ; y: 0.75
x: 2.5 ; y: 0.5
x: 3 ; y: 0.75
x: 3.5 ; y: 1
x: 4 ; y: 1.25
x: 4.5 ; y: 1.5
x: 5 ; y: 1.5
end of texture_object_interpolation.
Please see this gist for the above code, a makefile, and code for normalized interpolation.
This was apparently caused by a bug in the CUDA 5.0 compiler, and fixed in the CUDA 5.5 release.
[This answer has been assembled from comments to get the question off the unanswered queue for the CUDA tag]

Launch out of resources

I wrote the following simple CUDA kernel:
// For every x < N computes
//   O[x] = 0.15 / N + 0.85 * sum over i != x of W[x*N + i] * I[x].
// Launch layout: the element index is threadIdx.x only, so the kernel expects
// a single 1D block with at least N threads; threads with x >= N do nothing.
// W is indexed as a row-major N x N matrix; I and O are indexed up to N - 1.
__global__ void pr_kernel(float* O, const float* I, const float* W, int N)
{
    const int x = threadIdx.x;
    if (x >= N) {
        return;
    }

    // The accumulator must start at zero; it was previously read uninitialized.
    float sum = 0.0f;
    for (int i = 0; i < N; i++) {
        if (i == x) continue;
        // NOTE(review): every term is scaled by I[x]; if this is meant to be a
        // matrix-vector product the factor would presumably be I[i] - confirm.
        sum += W[x*N+i] * I[x];
    }
    // Float literals keep the arithmetic in single precision.
    O[x] = (0.15f / N) + 0.85f * sum;
}
The variables are allocated in Python as follows:
# Problem size, passed to the kernel as its int N argument.
N = np.int32(4)
# Weight values as a flat float32 array (15 values are listed here).
# NOTE(review): the kernel indexes W as W[x*N+i], i.e. as an N x N = 16 entry
# matrix for N = 4 - confirm that no value is missing.
W = np.float32(np.asarray(
[0, 1, 0, 1, 1, 0, 1, 1,
0, 1, 0, 1,1, 1, 0]))
# Input vector: four equal float32 entries of 0.25.
I = np.float32(np.asarray(
[0.25, 0.25, 0.25, 0.25]))
# Output vector, zero-initialised with N entries.
O = np.float32(np.zeros(N))
I'm transferring the variables using gpuarray.to_gpu, and I'm calling the kernel on a Tesla C2070 with the following line:
pr_kernel(O_d, I_d, W_d, N_d, block=blocksize, grid=gridsize)
Where:
blocksize = (128, 1, 1)
gridsize = (1, 1)
I get the error message:
pycuda.driver.LaunchError: cuLaunchKernel failed: launch out of resources.
This happens even if I reduce blocksize to something like (8, 1, 1). I can run other CUDA programs on the GPU with a blocksize of (512, 1, 1) so I'm confident this is not due to a GPU configuration issue.
What am I doing wrong? Thanks for any help.
The problem was that I was transferring the integer N to the GPU using gpuarray.to_gpu, where I should have been directly passing N to the pr_kernel function.
I got a similar problem when I used a different type in definition and as an argument to the kernel. Probably the fact that the latter required more resources generates an error.