My task is to take a math function, such as
f(x) = 10 ∗ sin(x)
and export a subset of its coordinates into a .txt file. Is there a good way to do this?
You can export the coordinates into a text file by using std::ofstream.
#include <iostream>
#include <fstream>
#include <cmath>
double func(double x){
    return 10 * std::sin(x); // your function here
}

int main(){
    std::ofstream fout("coordinates.txt"); // declares an output file stream to
                                           // "coordinates.txt" called fout
    for(int i = 0; i < 100; ++i){
        fout << i << ": " << func(i) << std::endl;
    }
    return 0;
}
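If you want the coordinates at non-integer x values, or only over a chosen interval, you can sample with a step size. A minimal sketch, writing space-separated "x y" pairs (the interval [0, 10] and step 0.1 are just example values):

#include <fstream>
#include <cmath>

double func(double x){
    return 10 * std::sin(x); // your function here
}

int main(){
    std::ofstream fout("coordinates.txt");
    // sample f(x) on [0, 10] with step 0.1 -- adjust to the subset you need
    for(double x = 0.0; x <= 10.0; x += 0.1){
        fout << x << " " << func(x) << "\n"; // one "x y" pair per line
    }
    return 0;
}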
Is there a performant way in CUDA to find the maximum/minimum of multiple arrays (which live in different structures) in parallel? The structures are laid out according to the Structure of Arrays (SoA) format.
A simple idea would be to assign each array to a thread block, which is used to calculate the maximum/minimum using the parallel reduction approach. The problem here is the size of the shared memory, which is why I regard this approach as problematic.
Another approach would be to calculate the minimum/maximum separately for each array, but I think that is too slow.
struct Cube {
    int* x;
    int* y;
    int* z;
    int size;
};

int main() {
    Cube* c1 = new Cube(); // c1 represents 100 cubes (SoA layout)
    c1->x = new int[100];
    c1->y = new int[100];
    c1->z = new int[100];

    Cube* c2 = new Cube();
    c2->x = new int[1047];
    c2->y = new int[1047];
    c2->z = new int[1047];

    Cube* c3 = new Cube();
    c3->x = new int[5000];
    c3->y = new int[5000];
    c3->z = new int[5000];

    // My goal now is to find the smallest/largest x dimension of all cubes in c1, c2, ..., and cn,
    // with one kernel launch.
    // So the smallest/largest x in c1, the smallest/largest x in c2, etc.
}
Does anyone know an efficient approach? Thanks.
A simple idea would be to assign each array to a thread block, which is used to calculate the maximum/minimum using the parallel reduction approach. The problem here is the size of the shared memory, which is why I regard this approach as problematic.
There is no problem with shared memory size. You may wish to review Mark Harris' canonical parallel reduction tutorial and look at the later methods to understand how we can use a loop to populate shared memory, reducing values into shared memory as we go. Once the input loop is completed, then we begin the block-sweep phase of the reduction. This doesn't impose any special requirements on the shared memory per block.
Here's a worked example demonstrating both a thrust::reduce_by_key method (single call) and a CUDA block-segmented method (single kernel call):
$ cat t1535.cu
#include <iostream>
#include <thrust/reduce.h>
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/functional.h>
#include <cstdlib>
#define IMAX(x,y) (((x)>(y))?(x):(y))
#define IMIN(x,y) (((x)<(y))?(x):(y))
typedef int dtype;
const int ncubes = 3;
struct Cube {
  dtype* x;
  dtype* y;
  dtype* z;
  int size;
};

struct my_f
{
  template <typename T1, typename T2>
  __host__ __device__
  thrust::tuple<dtype,dtype> operator()(T1 t1, T2 t2){
    thrust::tuple<dtype,dtype> r;
    thrust::get<0>(r) = IMAX(thrust::get<0>(t1),thrust::get<0>(t2));
    thrust::get<1>(r) = IMIN(thrust::get<1>(t1),thrust::get<1>(t2));
    return r;
  }
};
const int MIN = -1;          // initial value for the max-reduction (the rand() test data is non-negative)
const int MAX = 0x7FFFFFFF;  // INT_MAX: initial value for the min-reduction
const int BS = 512;          // threads per block
template <typename T>
__global__ void block_segmented_minmax_reduce(const T * __restrict__ in, T * __restrict__ max, T * __restrict__ min, const size_t * __restrict__ slen){

  __shared__ T smax[BS];
  __shared__ T smin[BS];

  size_t my_seg_start = slen[blockIdx.x];
  size_t my_seg_size  = slen[blockIdx.x+1] - my_seg_start;
  smax[threadIdx.x] = MIN;
  smin[threadIdx.x] = MAX;
  // input loop: each thread strides through this block's segment, reducing into shared memory
  for (size_t idx = my_seg_start+threadIdx.x; idx < my_seg_start+my_seg_size; idx += BS){
    T my_val = in[idx];
    smax[threadIdx.x] = IMAX(my_val, smax[threadIdx.x]);
    smin[threadIdx.x] = IMIN(my_val, smin[threadIdx.x]);}
  // block-sweep phase of the reduction
  for (int s = BS>>1; s > 0; s>>=1){
    __syncthreads();
    if (threadIdx.x < s){
      smax[threadIdx.x] = IMAX(smax[threadIdx.x], smax[threadIdx.x+s]);
      smin[threadIdx.x] = IMIN(smin[threadIdx.x], smin[threadIdx.x+s]);}
    }
  if (!threadIdx.x){
    max[blockIdx.x] = smax[0];
    min[blockIdx.x] = smin[0];}
}
int main() {
  // data setup
  Cube *c = new Cube[ncubes];
  thrust::host_vector<size_t> csize(ncubes+1);
  csize[0] = 100;
  csize[1] = 1047;
  csize[2] = 5000;
  csize[3] = 0;
  c[0].x = new dtype[csize[0]];
  c[1].x = new dtype[csize[1]];
  c[2].x = new dtype[csize[2]];
  size_t ctot = 0;
  for (int i = 0; i < ncubes; i++) ctot += csize[i];
  // method 1: thrust
  // concatenate
  thrust::host_vector<dtype> h_d(ctot);
  size_t start = 0;
  for (int i = 0; i < ncubes; i++) {thrust::copy_n(c[i].x, csize[i], h_d.begin()+start); start += csize[i];}
  for (size_t i = 0; i < ctot; i++) h_d[i] = rand();
  thrust::device_vector<dtype> d_d = h_d;
  // build flag vector
  thrust::device_vector<int> d_f(d_d.size());
  thrust::host_vector<size_t> coff(csize.size());
  thrust::exclusive_scan(csize.begin(), csize.end(), coff.begin());
  thrust::device_vector<size_t> d_coff = coff;
  thrust::scatter(thrust::constant_iterator<int>(1), thrust::constant_iterator<int>(1)+ncubes, d_coff.begin(), d_f.begin());
  thrust::inclusive_scan(d_f.begin(), d_f.end(), d_f.begin());
  // min/max reduction
  thrust::device_vector<dtype> d_max(ncubes);
  thrust::device_vector<dtype> d_min(ncubes);
  thrust::reduce_by_key(d_f.begin(), d_f.end(), thrust::make_zip_iterator(thrust::make_tuple(d_d.begin(), d_d.begin())), thrust::make_discard_iterator(), thrust::make_zip_iterator(thrust::make_tuple(d_max.begin(), d_min.begin())), thrust::equal_to<int>(), my_f());
  thrust::host_vector<dtype> h_max = d_max;
  thrust::host_vector<dtype> h_min = d_min;
  std::cout << "Thrust Maxima: " << std::endl;
  thrust::copy_n(h_max.begin(), ncubes, std::ostream_iterator<dtype>(std::cout, ","));
  std::cout << std::endl << "Thrust Minima: " << std::endl;
  thrust::copy_n(h_min.begin(), ncubes, std::ostream_iterator<dtype>(std::cout, ","));
  std::cout << std::endl;
  // method 2: CUDA kernel (block reduce)
  block_segmented_minmax_reduce<<<ncubes, BS>>>(thrust::raw_pointer_cast(d_d.data()), thrust::raw_pointer_cast(d_max.data()), thrust::raw_pointer_cast(d_min.data()), thrust::raw_pointer_cast(d_coff.data()));
  thrust::copy_n(d_max.begin(), ncubes, h_max.begin());
  thrust::copy_n(d_min.begin(), ncubes, h_min.begin());
  std::cout << "CUDA Maxima: " << std::endl;
  thrust::copy_n(h_max.begin(), ncubes, std::ostream_iterator<dtype>(std::cout, ","));
  std::cout << std::endl << "CUDA Minima: " << std::endl;
  thrust::copy_n(h_min.begin(), ncubes, std::ostream_iterator<dtype>(std::cout, ","));
  std::cout << std::endl;
  return 0;
}
$ nvcc -o t1535 t1535.cu
$ ./t1535
Thrust Maxima:
2145174067,2147469841,2146753918,
Thrust Minima:
35005211,2416949,100669,
CUDA Maxima:
2145174067,2147469841,2146753918,
CUDA Minima:
35005211,2416949,100669,
$
For a small number of Cube objects, the thrust method is likely to be faster. It will tend to make better use of medium to large GPUs than the block method will. For a large number of Cube objects, the block method should also be fairly efficient.
I am looking for advice on how to turn functions into a class. I will put the program below; it is long, but I feel I should include the whole thing for context. I need to rewrite it so that it uses a class in place of the three functions.
#include <iostream>
#include <string>
using namespace std;
// Do not change these function prototypes:
void readBig(int[]);
void printBig(int[]);
void addBig(int[], int[], int[]);
// This constant should be 100 when the program is finished.
const int MAX_DIGITS = 100;
int main()
{
    // Declare the three numbers, the first, second and the sum:
    int number1[MAX_DIGITS], number2[MAX_DIGITS], sum[MAX_DIGITS];
    bool finished = false;
    char response;
    while (!finished)
    {
        cout << "Please enter a number up to " << MAX_DIGITS << " digits: ";
        readBig(number1);
        cout << "Please enter a number up to " << MAX_DIGITS << " digits: ";
        readBig(number2);
        addBig(number1, number2, sum);
        printBig(number1);
        cout << "\n+\n";
        printBig(number2);
        cout << "\n=\n";
        printBig(sum);
        cout << "\n";
        cout << "test again?";
        cin >> response;
        cin.ignore(900, '\n');
        finished = toupper(response) != 'Y';
    }
    return 0;
}
//ReadBig will read a number as a string,
//It then converts each element of the string to an integer and stores it in an integer array.
//Finally, it reverses the elements of the array so that the ones digit is in element zero,
//the tens digit is in element 1, the hundreds digit is in element 2, etc.
//AddBig adds the corresponding digits of the first two arrays and stores the answer in the third.
//In a second loop, it performs the carry operation.
//PrintBig uses a while loop to skip leading zeros and then uses a for loop to print the number.
//FUNCTIONS GO BELOW
void readBig(int number[MAX_DIGITS])
{
    string read = "";
    cin >> read;
    int len, i = 0, save = 0;
    len = read.length();
    while (i < MAX_DIGITS){
        number[i] = 0;
        i++;
    }
    for (i = 0; i <= len - 1; i++){
        number[i] = int(read.at(i) - '0');
    }
    for (i = 0; i <= len/2 - 1; i++){
        save = number[i];
        number[i] = number[len-1-i];
        number[len-1-i] = save;
    }
}

void printBig(int number[MAX_DIGITS])
{
    int digit = MAX_DIGITS - 1;
    while (digit > 0 && number[digit] == 0){
        digit--;
    }
    for (int i = digit; i >= 0; i--)
    {
        cout << number[i];
    }
}

void addBig(int number1[MAX_DIGITS], int number2[MAX_DIGITS], int sum[MAX_DIGITS])
{
    // The first loop sums the corresponding digits of the two arrays.
    for (int i = 0; i < MAX_DIGITS; i++)
    {
        sum[i] = number1[i] + number2[i];
    }
    // The second loop performs the carry operation, starting at the ones digit.
    for (int i = 0; i < MAX_DIGITS - 1; i++)
    {
        if (sum[i] > 9)
        {
            sum[i + 1] += 1;
            sum[i] -= 10;
        }
    }
}
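One possible direction (just a sketch; the class name BigNumber and the method names are my own choices, not anything required by the assignment) is to make the digit array a data member and turn the three functions into member functions:

#include <iostream>
#include <string>
using namespace std;

const int MAX_DIGITS = 100;

class BigNumber
{
public:
    BigNumber() { for (int i = 0; i < MAX_DIGITS; i++) digits[i] = 0; }

    // Reads the number as a string and stores its digits in reverse order,
    // so the ones digit ends up in element zero.
    void read()
    {
        string text;
        cin >> text;
        for (int i = 0; i < MAX_DIGITS; i++) digits[i] = 0;
        int len = text.length();
        for (int i = 0; i < len; i++)
            digits[i] = text.at(len - 1 - i) - '0';
    }

    // Skips leading zeros, then prints the remaining digits.
    void print() const
    {
        int digit = MAX_DIGITS - 1;
        while (digit > 0 && digits[digit] == 0) digit--;
        for (int i = digit; i >= 0; i--) cout << digits[i];
    }

    // Adds the corresponding digits of a and b, then performs the carry.
    void add(const BigNumber& a, const BigNumber& b)
    {
        for (int i = 0; i < MAX_DIGITS; i++) digits[i] = a.digits[i] + b.digits[i];
        for (int i = 0; i < MAX_DIGITS - 1; i++)
            if (digits[i] > 9) { digits[i + 1] += 1; digits[i] -= 10; }
    }

private:
    int digits[MAX_DIGITS];
};

main() would then declare BigNumber number1, number2, sum; and call number1.read();, sum.add(number1, number2);, and sum.print(); in place of the free functions.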
#define _CRT_SECURE_NO_WARNINGS
#define NAME_LENGTH 10
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
double myFunc(double x)
{
    double y = (pow(x, 3)) - (3 * (pow(x, 2))) + 2;
    return y;
}

int main()
{
    double y = 3;
    myFunc(y);
    printf("%d \n", y);
    system("pause");
    return 0;
}
To me this should be a very simple formula. I'm not asking it to jump through many hoops, but it only outputs "0" no matter what value I give for "y".
I am using Visual Studio 2013.
I hovered my mouse over "y" and it said "y" equaled "2.0000" in the function, which is the correct output for this particular value of "y."
Assign the myFunc return value to y: y = myFunc(y); and use the format specifier %lf (or %f) instead of %d, since y is a double.
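Put together, the corrected main would look roughly like this:

int main()
{
    double y = 3;
    y = myFunc(y);        // keep the return value instead of discarding it
    printf("%lf \n", y);  // %lf (or %f) prints a double; %d expects an int
    system("pause");
    return 0;
}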
I'm trying to get my head around CUB, and having a bit of trouble following the (rather incomplete) worked examples. CUB looks like a fantastic tool; I just can't make sense of the example code.
I've built a simple proto-warp reduce example:
#include <cub/cub.cuh>
#include <cuda.h>
#include <vector>
using std::vector;
#include <iostream>
using std::cout;
using std::endl;
const int N = 128;
__global__ void sum(float *indata, float *outdata) {
    typedef cub::WarpReduce<float,4> WarpReduce;
    __shared__ typename WarpReduce::TempStorage temp_storage;
    int id = blockIdx.x*blockDim.x + threadIdx.x;
    if( id < 128 ) {
        outdata[id] = WarpReduce(temp_storage).Sum(indata[id]);
    }
}

int main() {
    vector<float> y(N), sol(N);
    float *dev_y, *dev_sol;
    cudaMalloc((void**)&dev_y, N*sizeof(float));
    cudaMalloc((void**)&dev_sol, N*sizeof(float));
    for( int i = 0; i < N; i++ ) {
        y[i] = (float)i;
    }
    cout << "input: ";
    for( int i = 0; i < N; i++ ) cout << y[i] << " ";
    cout << endl;
    cudaMemcpy(&y[0], dev_y, N*sizeof(float), cudaMemcpyHostToDevice);
    sum<<<1,32>>>(dev_y, dev_sol);
    cudaMemcpy(dev_sol, &sol[0], N*sizeof(float), cudaMemcpyDeviceToHost);
    cout << "output: ";
    for( int i = 0; i < N; i++ ) cout << sol[i] << " ";
    cout << endl;
    cudaFree(dev_y);
    cudaFree(dev_sol);
    return 0;
}
which returns all zeros.
I'm aware that this code would return a reduction that was banded with every 32nd element being the sum of a warp and the other elements being undefined - I just want to get a feel for how CUB works. Can someone point out what I'm doing wrong?
(also, does CUB deserve its own tag yet?)
Your cudaMemcpy arguments are back to front: the destination comes first (to be consistent with memcpy).
cudaError_t cudaMemcpy ( void* dst, const void* src, size_t count, cudaMemcpyKind kind )
See the API reference for more info.
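Applied to the example above, the two copies would be written with the destination first:

// host -> device: destination (dev_y) first, source (&y[0]) second
cudaMemcpy(dev_y, &y[0], N*sizeof(float), cudaMemcpyHostToDevice);
sum<<<1,32>>>(dev_y, dev_sol);
// device -> host: destination (&sol[0]) first, source (dev_sol) second
cudaMemcpy(&sol[0], dev_sol, N*sizeof(float), cudaMemcpyDeviceToHost);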