Rank of each element in a matrix row using CUDA

Is there any way to find the rank of each element in a matrix row using CUDA, or is there any function for this provided by NVIDIA?

I don't know of a built-in ranking or argsort function in CUDA or any of the libraries I am familiar with.
You could certainly build such a function out of lower-level operations using thrust for example.
Here is a (non-optimized) outline of a possible solution approach using thrust:
$ cat t84.cu
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <thrust/sort.h>
#include <thrust/sequence.h>
#include <thrust/functional.h>
#include <thrust/adjacent_difference.h>
#include <thrust/transform.h>
#include <thrust/scan.h>   // for thrust::inclusive_scan
#include <thrust/iterator/permutation_iterator.h>
#include <iostream>
typedef int mytype;
struct clamp
{
template <typename T>
__host__ __device__
T operator()(T data){
if (data == 0) return 0;
return 1;}
};
int main(){
mytype data[] = {4,1,7,1};
int dsize = sizeof(data)/sizeof(data[0]);
thrust::device_vector<mytype> d_data(data, data+dsize);
thrust::device_vector<int> d_idx(dsize);
thrust::device_vector<int> d_result(dsize);
thrust::sequence(d_idx.begin(), d_idx.end());
thrust::sort_by_key(d_data.begin(), d_data.end(), d_idx.begin(), thrust::less<mytype>());
thrust::device_vector<int> d_diff(dsize);
thrust::adjacent_difference(d_data.begin(), d_data.end(), d_diff.begin());
d_diff[0] = 0;
thrust::transform(d_diff.begin(), d_diff.end(), d_diff.begin(), clamp());
thrust::inclusive_scan(d_diff.begin(), d_diff.end(), d_diff.begin());
thrust::copy(d_diff.begin(), d_diff.end(), thrust::make_permutation_iterator(d_result.begin(), d_idx.begin()));
thrust::copy(d_result.begin(), d_result.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
}
$ nvcc -arch=sm_61 -o t84 t84.cu
$ ./t84
1,0,2,0,
$

If you are working in CUDA, the concept of "rank" is not the same as in other parallel programming models such as OpenMP or MPI, where a rank identifies a process or thread. In CUDA you identify each thread globally within the grid by working with the built-in threadIdx.x and blockIdx.x parameters, as sketched below.
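To illustrate this, here is a minimal sketch (my own addition, not from the thread) in which each thread combines blockIdx.x, blockDim.x and threadIdx.x into a unique global index:

#include <cstdio>
// each thread computes and prints its own global index within the grid
__global__ void show_global_index()
{
    int global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    printf("block %d, thread %d -> global index %d\n", blockIdx.x, threadIdx.x, global_idx);
}
int main()
{
    show_global_index<<<2, 4>>>();   // 2 blocks of 4 threads: global indices 0..7
    cudaDeviceSynchronize();         // wait for the kernel (and its device-side printf) to finish
    return 0;
}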

Related

Conditional copying in CUDA, where data vector is longer than stencil

I would like to conditionally copy data from a vector, based on a stencil vector which is N times shorter. Every element of the stencil would be responsible for N elements of the data vector.
Suppose that the vectors look as follows (N=3)
data = {1,2,3,4,5,6,7,8,9}
stencil = {1,0,1}
What I would like to get in result:
result = {1,2,3,7,8,9}
Is there a way to achieve this using functions from Thrust library?
I know, that there is:
thrust::copy_if (InputIterator1 first, InputIterator1 last, InputIterator2 stencil, OutputIterator result, Predicate pred)
but this doesn't allow me to copy N values from the data vector based on one element of the stencil.
As is often the case, I imagine there are many possible ways to do this.
The approach that occurs to me (using copy_if) is to use the stencil vector as part of a thrust::permutation_iterator, whose index into the stencil is generated by a thrust::transform_iterator. If we imagine a copying index that goes from 0..8 for this example, then we can index into the "source" (i.e. stencil) vector using a "map" index computed with a thrust::counting_iterator and integer division by N (using thrust placeholders). The copying predicate just tests whether the stencil value == 1.
The thrust quick start guide gives a concise description of how to use these fancy iterators.
Here is a worked example:
$ cat t471.cu
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <iostream>
using namespace thrust::placeholders;
int main(){
int data[] = {1,2,3,4,5,6,7,8,9};
int stencil[] = {1,0,1};
int ds = sizeof(data)/sizeof(data[0]);
int ss = sizeof(stencil)/sizeof(stencil[0]);
int N = ds/ss; // assume ds is evenly divisible by ss
thrust::device_vector<int> d_data(data, data+ds);
thrust::device_vector<int> d_stencil(stencil, stencil+ss);
thrust::device_vector<int> d_result(ds);
int rs = thrust::copy_if(d_data.begin(), d_data.end(), thrust::make_permutation_iterator(d_stencil.begin(), thrust::make_transform_iterator(thrust::counting_iterator<int>(0), _1 / N)), d_result.begin(), _1 == 1) - d_result.begin();
thrust::copy_n(d_result.begin(), rs, std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
return 0;
}
$ nvcc -o t471 t471.cu
$ ./t471
1,2,3,7,8,9,
$
With the assumptions about stencil organization made here, we could also pre-compute the result size rs with thrust::reduce, and use that to allocate the result vector size:
$ cat t471.cu
#include <thrust/copy.h>
#include <thrust/reduce.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <iostream>
using namespace thrust::placeholders;
int main(){
int data[] = {1,2,3,4,5,6,7,8,9};
int stencil[] = {1,0,1};
int ds = sizeof(data)/sizeof(data[0]);
int ss = sizeof(stencil)/sizeof(stencil[0]);
int N = ds/ss; // assume ds is evenly divisible by ss
thrust::device_vector<int> d_data(data, data+ds);
thrust::device_vector<int> d_stencil(stencil, stencil+ss);
int rs = thrust::reduce(d_stencil.begin(), d_stencil.end())*N;
thrust::device_vector<int> d_result(rs);
thrust::copy_if(d_data.begin(), d_data.end(), thrust::make_permutation_iterator(d_stencil.begin(), thrust::make_transform_iterator(thrust::counting_iterator<int>(0), _1 / N)), d_result.begin(), _1 == 1);
thrust::copy_n(d_result.begin(), rs, std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
return 0;
}
$ nvcc -o t471 t471.cu
$ ./t471
1,2,3,7,8,9,
$

Memory mapping in Octave with mex-functions

I have plain C code (running on Linux) and I would like to use it from Octave, so I thought I could use a mex-file to handle the memory mapping and pass the information I receive (or send) back and forth between my Octave script and my sensors. The C code looks like this:
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <poll.h>
#include <fcntl.h>
#include <errno.h>
#include <unistd.h>
#include <sys/mman.h>
#define CUSTOM_IP_MAP_SIZE 0x10000
#define CUSTOM_IP_BASEADDR 0x43C00000
#define CUSTOM_IP_S00_AXI_SLV_REG0_OFFSET 0
#define CUSTOM_IP_S00_AXI_SLV_REG1_OFFSET 4
int main(void)
{
uint32_t leds=0x0;
int fd = open("/dev/uio0", O_RDWR);
void *ptr;
if (fd < 0) {
perror("open");
exit(EXIT_FAILURE);
}
ptr = mmap(NULL, CUSTOM_IP_MAP_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
while (1) {
leds = *((unsigned *)(ptr + CUSTOM_IP_S00_AXI_SLV_REG1_OFFSET)); //Read from the IP (slv_reg1).
*((unsigned *)(ptr + CUSTOM_IP_S00_AXI_SLV_REG0_OFFSET)) = leds; //Write to the IP (slv_reg0).
}
close(fd);
exit(EXIT_SUCCESS);
}
I compiled the code, with no errors, using the following command:
mkoctfile --mex mmap.c
I get the following error when I run it in Octave:
error: failed to install .mex file function 'mmap'
Should I keep trying to do this with a mex-function, or is there a better option for this?
Thank you for any help.

Get reverse_iterator from device_ptr in CUDA

For a device_vector, I can use its rbegin() method to get its reverse iterator. But how to construct a reverse iterator directly from device_ptr?
Maybe this can be achieved by constructing a device_vector from the device_ptr; the code is as follows:
thrust::device_ptr<int> ptr = get_ptr();
thrust::device_vector<int> tmpVector(ptr, ptr + N);
thrust::inclusive_scan_by_key(tmpVector.rbegin(), tmpVector.rend(), ......);
But I don't know whether thrust::device_vector<int> tmpVector(ptr, ptr + N) will construct a new vector and copy the data from ptr, or whether it will just hold a reference to ptr. The documentation of Thrust doesn't mention this.
Any ideas?
Providing an answer based on the comment by Jared, to get this off the unanswered list, and to preserve the question for future readers.
To make a reverse iterator from any kind of iterator, including thrust::device_ptr, use the thrust::make_reverse_iterator function.
Here is a simple example:
$ cat t615.cu
#include <thrust/device_vector.h>
#include <thrust/iterator/reverse_iterator.h>
#include <thrust/device_ptr.h>
#include <thrust/sequence.h>
#include <thrust/copy.h>
#include <iostream>
#define DSIZE 4
int main(){
int *data;
cudaMalloc(&data, DSIZE*sizeof(int));
thrust::device_ptr<int> my_data = thrust::device_pointer_cast<int>(data);
thrust::sequence(my_data, my_data+DSIZE);
thrust::copy_n(my_data, DSIZE, std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
typedef thrust::device_vector<int>::iterator Iterator;
thrust::reverse_iterator<Iterator> r_iter = make_reverse_iterator(my_data+DSIZE); // note that we point the iterator to the "end" of the device pointer area
thrust::copy_n(r_iter, DSIZE, std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
return 0;
}
$ nvcc -arch=sm_35 -o t615 t615.cu
$ ./t615
0,1,2,3,
3,2,1,0,
$
The creation of a reverse iterator does not create any "extra array" or copy any data. (By contrast, constructing a thrust::device_vector from the device_ptr range, as suggested in the question, does allocate new storage and copy the data into it.)
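Since the question specifically mentions scan operations, here is a minimal sketch (my own addition, not part of the original answer) showing a reverse iterator built directly from a device_ptr being passed to a Thrust scan, with no temporary device_vector:

#include <thrust/device_ptr.h>
#include <thrust/iterator/reverse_iterator.h>
#include <thrust/scan.h>
#include <thrust/sequence.h>
#include <thrust/copy.h>
#include <iostream>
#include <iterator>
#define DSIZE 4
int main(){
    int *data;
    cudaMalloc(&data, DSIZE*sizeof(int));
    thrust::device_ptr<int> ptr = thrust::device_pointer_cast(data);
    thrust::sequence(ptr, ptr + DSIZE, 1);                        // data = 1,2,3,4
    // reverse "begin" refers to the last element, reverse "end" to just before the first
    thrust::reverse_iterator<thrust::device_ptr<int> > r_begin = thrust::make_reverse_iterator(ptr + DSIZE);
    thrust::reverse_iterator<thrust::device_ptr<int> > r_end   = thrust::make_reverse_iterator(ptr);
    thrust::inclusive_scan(r_begin, r_end, r_begin);              // in-place suffix sums
    thrust::copy_n(ptr, DSIZE, std::ostream_iterator<int>(std::cout, ","));  // expected: 10,9,7,4,
    std::cout << std::endl;
    cudaFree(data);
    return 0;
}

The same reverse iterators could be passed to inclusive_scan_by_key in place of tmpVector.rbegin() and tmpVector.rend() from the question.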

Matrix not copied back from device to host successfully in CUDA

I am new to CUDA. I wrote a kernel (GPUsetIdentity) to create an identity matrix of dimension size x size. Then, inside a function GPUfunctioncall, I called my kernel. The identity matrix should be stored in dDataInv. But when I copy it back to dataOut (size x size), all the values are zero. I know I am doing something very stupid somewhere, but couldn't find it; I am new to CUDA. Can anyone point out my mistake? Thanks.
#include <stdio.h>
#include <malloc.h>
#include <memory.h>
#include <math.h>
#include <stdlib.h>
#include <iostream>
#include <stdlib.h>
#include <string>
#include <fstream>
#include <iterator>
#include <sstream>
#include <vector>
#include <cstring>
#include <cstdlib>
#include <ctime>
#include <stdlib.h>
#include <cuda_runtime.h>
#include "cuda.h"
#define BLOCKSIZE 16
using namespace std;
__global__ void GPUsetIdentity (float* matrix, int width)
{
int tx = threadIdx.x;
int bx = blockIdx.x;
int offset = bx * BLOCKSIZE + tx;
matrix[offset + width * offset] = 1;
}
void print_matrix_host(float* A , int nr_rows_A, int nr_cols_A) {
for(int i = 0; i < nr_rows_A; ++i){
for(int j = 0; j < nr_cols_A; ++j){
std::cout << A[i * nr_rows_A + j ] << " ";
}
std::cout << std::endl;
}
std::cout << std::endl;
}
int GPUfunctioncall (float* hDataOut, int size){
float *dDataInv;
cudaMalloc ((void **) &dDataInv, size);
cudaMemset ((void *) dDataInv, 0, size);
dim3 idyThreads (BLOCKSIZE);
dim3 idyBlocks (size / BLOCKSIZE);
GPUsetIdentity <<< idyBlocks, idyThreads >>> (dDataInv, size);
cudaThreadSynchronize ();
cudaMemcpy ((void *) hDataOut, (void *) dDataInv, size, cudaMemcpyDeviceToHost);
cudaFree (dDataInv);
return 0;
}
int main()
{
int size = 4;
float* dataOut;
dataOut = new float[size*size];
GPUfunctioncall(dataOut, size);
print_matrix_host(dataOut, size, size);
}
Any time you are having trouble with a CUDA code, it's good practice to use proper cuda error checking. You can also run your code with cuda-memcheck to get a quick read on whether there are any errors.
Using either of these methods, you would have discovered an "invalid configuration error" on your kernel launch. This usually means that the parameters in the <<< >>> syntax are incorrect. When you run into this type of error, simply printing out those values may indicate the problem.
In your case, this line of code:
dim3 idyBlocks (size / BLOCKSIZE);
results in a value of 0 for idyBlocks when size is 4 and BLOCKSIZE is 16. So you are requesting a kernel launch of 0 blocks which is illegal. Therefore your kernel is not running and your results are not what you expect.
There are a variety of ways to solve this, many of them involving detecting this condition and adding an "extra block" when size is not evenly divisible by BLOCKSIZE. Using this approach, we may be launching "extra threads", so we must include a "thread check" in the kernel to prevent those extra threads from doing anything (such as accessing arrays out of bounds). For this, we often need to know the intended size in the kernel, and we can pass this value as an extra kernel parameter.
You've also made some errors in your handling of device variables. The following code:
dataOut = new float[size*size];
allocates enough space for a square matrix of dimension size. But the following code:
cudaMalloc ((void **) &dDataInv, size);
only allocates enough space for size bytes. You want size*size*sizeof(float) instead of size here, and likewise in the following cudaMemset and cudaMemcpy operations: cudaMalloc, cudaMemset and cudaMemcpy all take a size parameter in bytes, just like malloc, memset, and memcpy.
The following code has those modifications, and seems to work correctly for me:
$ cat t580.cu
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#define BLOCKSIZE 16
using namespace std;
__global__ void GPUsetIdentity (float* matrix, int width, int size)
{
int tx = threadIdx.x;
int bx = blockIdx.x;
int offset = bx * BLOCKSIZE + tx;
if (offset < size)
matrix[offset + width * offset] = 1;
}
void print_matrix_host(float* A , int nr_rows_A, int nr_cols_A) {
for(int i = 0; i < nr_rows_A; ++i){
for(int j = 0; j < nr_cols_A; ++j){
std::cout << A[i * nr_rows_A + j ] << " ";
}
std::cout << std::endl;
}
std::cout << std::endl;
}
int GPUfunctioncall (float* hDataOut, int size){
float *dDataInv;
cudaMalloc ((void **) &dDataInv, size*size*sizeof(float));
cudaMemset ((void *) dDataInv, 0, size*size*sizeof(float));
dim3 idyThreads (BLOCKSIZE);
int num_blocks = size/BLOCKSIZE + ((size%BLOCKSIZE)?1:0); // round up when size is not a multiple of BLOCKSIZE
dim3 idyBlocks (num_blocks);
GPUsetIdentity <<< idyBlocks, idyThreads >>> (dDataInv, size, size);
cudaThreadSynchronize ();
cudaMemcpy ((void *) hDataOut, (void *) dDataInv, size*size*sizeof(float), cudaMemcpyDeviceToHost);
cudaFree (dDataInv);
return 0;
}
int main()
{
int size = 4;
float* dataOut;
dataOut = new float[size*size];
GPUfunctioncall(dataOut, size);
print_matrix_host(dataOut, size, size);
}
$ nvcc -arch=sm_20 -o t580 t580.cu
$ cuda-memcheck ./t580
========= CUDA-MEMCHECK
1 0 0 0
0 1 0 0
0 0 1 0
0 0 0 1
========= ERROR SUMMARY: 0 errors
$
Note that it may be redundant to pass size twice to the kernel. For this particular example, we could have easily used the width parameter to do our kernel "thread check". But for educational purposes, I chose to call it out as a separate parameter, because in the general case you will often pass it as a separate parameter to other kernels that you write.
Finally, note that cudaThreadSynchronize() is deprecated and should be replaced with cudaDeviceSynchronize() instead. In this particular example, neither is actually necessary, as the next cudaMemcpy operation will force the same kind of synchronization, but you may use it if you decide to add cuda error checking to your code (recommended).
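As an aside, here is a minimal sketch of the kind of error checking referred to above (my own addition, not part of the original answer; the macro name cudaCheck is arbitrary). It deliberately reproduces the zero-block launch, which should be reported as an invalid configuration error:

#include <cstdio>
#include <cstdlib>
// wrap every CUDA runtime call, e.g. cudaCheck(cudaMalloc(&p, bytes));
#define cudaCheck(call) do { \
    cudaError_t err = (call); \
    if (err != cudaSuccess) { \
        fprintf(stderr, "CUDA error \"%s\" at %s:%d\n", \
                cudaGetErrorString(err), __FILE__, __LINE__); \
        exit(EXIT_FAILURE); \
    } } while (0)
__global__ void empty_kernel() { }
int main(){
    empty_kernel<<<0, 16>>>();            // 0 blocks: the same illegal configuration as in the question
    cudaCheck(cudaGetLastError());        // catches launch/configuration errors
    cudaCheck(cudaDeviceSynchronize());   // catches asynchronous execution errors
    return 0;
}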

Performing Fourier Transform with Thrust

Thrust is an amazing wrapper for getting started with CUDA programming.
I wonder whether there is anything that encapsulates NVIDIA cuFFT with Thrust, or do we need to implement it ourselves?
This is a very late answer, just to remove this question from the unanswered list.
Using cuFFT with Thrust should be very simple; the only thing to do should be to cast the thrust::device_vector to a raw pointer. A very simple example is reported below:
#include <iostream>
#include <cufft.h>
#include <stdlib.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/transform.h>
int main(void){
int N=4;
// --- Setting up input device vector
thrust::device_vector<cuFloatComplex> d_in(N,make_cuComplex(1.f,2.f)), d_out(N);
cufftHandle plan;
cufftPlan1d(&plan, N, CUFFT_C2C, 1);
cufftExecC2C(plan, thrust::raw_pointer_cast(d_in.data()), thrust::raw_pointer_cast(d_out.data()), CUFFT_FORWARD);
// --- Setting up output host vector
thrust::host_vector<cuFloatComplex> h_out(d_out);
for (int i=0; i<N; i++) printf("Element #%i; Real part = %f; Imaginary part: %f\n",i,h_out[i].x,h_out[i].y);
getchar();
}