Related
I'm trying to understand how to parallelise a recursive calculation. Serially, the calculation takes the form:
for (int i = 2; i<size; i++)
{
result[i] = oldArray[i] + k * result[i-2];
}
For the i-1 index there's a solution here to a previous question of mine: CUDA force instruction execution order
I want to modify this to use the i-2 and I can't understand how to apply the same process to a 2nd order calculation. It should be possible using the thrust::inclusive_scan function, but I can't work out how. Does anyone know the solution?
Picking up where we left off in the previous question/answer, we shift our attention to equation 1.11 in the referenced paper by Blelloch. We observe that your problem formulation:
for (int i = 2; i<size; i++)
{
result[i] = oldArray[i] + k * result[i-2];
}
seems to match that in equation 1.11 if we set m=2 and in that case we can also observe that for your formulation, all ai,1 are zero (and, as previously, all ai,2 are k).
As per equation 1.12 in that paper, our state variable si now becomes a two-tuple:
si = |xi xi-1|
Taking note of these things, we observe the "correctness" of equation 1.13:
si = |xi-1 xi-2| . |0 1, k 0| + |bi 0|
rewriting:
si,1 = xi = k*xi-2 + bi
si,2 = xi-1 = xi-1
(In my view, the other answer leaves you at this point. That realization, i.e. result.data[0] = right + k * left.data[1]; is sufficient for a serial scan but not for a parallel scan. It's also evident that the functor/scan op there is not associative.)
We now need to come up with a binary operator bop that is an extension of the definition in (1.7) to this case. Referring to the previous definition in equation 1.7, we extend that based on the treatment in 1.13 as follows:
Ci = |Ai , Bi|
where:
Ai = |0 1, k 0|
and:
Bi = |bi 0|
We then have:
Ci bop Cj = | Ai . Aj , Bi . Aj + Bj |
This then becomes the formula for our functor/scan operator. We will need to carry 6 scalar "state" quantities throughout: 2 for the B vector and 4 for the A matrix.
What follows then is a realization of the above:
$ cat t1930.cu
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/scan.h>
#include <thrust/copy.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/constant_iterator.h>
#include <cstdlib>
#include <cstdio>
template <typename T>
void cpufunction(T *result, T *oldArray, size_t size, T k){
for (int i = 2; i<size; i++)
{
result[i] = oldArray[i] + k * result[i-2];
}
}
struct scan_op // as per blelloch (1.7)
{
template <typename T1, typename T2>
__host__ __device__
T1 operator()(const T1 &t1, const T2 &t2){
T1 ret;
thrust::get<0>(ret) = thrust::get<0>(t1)*thrust::get<2>(t2) + thrust::get<1>(t1)*thrust::get<4>(t2)+thrust::get<0>(t2);
thrust::get<1>(ret) = thrust::get<0>(t1)*thrust::get<3>(t2) + thrust::get<1>(t1)*thrust::get<5>(t2)+thrust::get<1>(t2);
thrust::get<2>(ret) = thrust::get<2>(t1)*thrust::get<2>(t2) + thrust::get<3>(t1)*thrust::get<4>(t2);
thrust::get<3>(ret) = thrust::get<2>(t1)*thrust::get<3>(t2) + thrust::get<3>(t1)*thrust::get<5>(t2);
thrust::get<4>(ret) = thrust::get<4>(t1)*thrust::get<2>(t2) + thrust::get<5>(t1)*thrust::get<4>(t2);
thrust::get<5>(ret) = thrust::get<4>(t1)*thrust::get<3>(t2) + thrust::get<5>(t1)*thrust::get<5>(t2);
return ret;
}
};
typedef float mt;
const size_t ds = 512;
const mt k = 1.01;
const int snip = 10;
int main(){
mt *b1 = new mt[ds]; // b as in blelloch (1.5)
mt *cr = new mt[ds]; // cpu result
for (int i = 0; i < ds; i++) { b1[i] = rand()/(float)RAND_MAX;}
cr[0] = b1[0];
cr[1] = b1[1];
cpufunction(cr, b1, ds, k);
for (int i = 0; i < snip; i++) std::cout << cr[i] << ",";
for (int i = ds-snip; i < ds; i++) std::cout << cr[i] << ",";
std::cout << std::endl;
thrust::device_vector<mt> db(b1, b1+ds);
auto b0 = thrust::constant_iterator<mt>(0);
auto a0 = thrust::constant_iterator<mt>(0);
auto a1 = thrust::constant_iterator<mt>(1);
auto a2 = thrust::constant_iterator<mt>(k);
auto a3 = thrust::constant_iterator<mt>(0);
thrust::device_vector<mt> dx1(ds);
thrust::device_vector<mt> dx0(ds);
thrust::device_vector<mt> dy0(ds);
thrust::device_vector<mt> dy1(ds);
thrust::device_vector<mt> dy2(ds);
thrust::device_vector<mt> dy3(ds);
auto my_i_zip = thrust::make_zip_iterator(thrust::make_tuple(db.begin(), b0, a0, a1, a2, a3));
auto my_o_zip = thrust::make_zip_iterator(thrust::make_tuple(dx1.begin(), dx0.begin(), dy0.begin(), dy1.begin(), dy2.begin(), dy3.begin()));
thrust::inclusive_scan(my_i_zip, my_i_zip+ds, my_o_zip, scan_op());
thrust::host_vector<mt> hx1 = dx1;
thrust::copy_n(hx1.begin(), snip, std::ostream_iterator<mt>(std::cout, ","));
thrust::copy_n(hx1.begin()+ds-snip, snip, std::ostream_iterator<mt>(std::cout, ","));
std::cout << std::endl;
}
$ nvcc -std=c++14 t1930.cu -o t1930
$ cuda-memcheck ./t1930
========= CUDA-MEMCHECK
0.840188,0.394383,1.63169,1.19677,2.55965,1.40629,2.92047,2.18858,3.22745,2.76443,570.218,601.275,576.315,607.993,582.947,614.621,589.516,621.699,595.644,628.843,
0.840188,0.394383,1.63169,1.19677,2.55965,1.40629,2.92047,2.18858,3.22745,2.76443,570.219,601.275,576.316,607.994,582.948,614.621,589.516,621.7,595.644,628.843,
========= ERROR SUMMARY: 0 errors
$
Yes, there are some results above that differ in the 6th digit. I attribute this to the limitations of float resolution when taking into account the very different order of operations between the serial and parallel method. If you change the typedef to double, the results will appear to match exactly.
Since you've asked about it here's an equivalent realization where it is demonstrated using device data previously allocated using cudaMalloc:
$ cat t1930.cu
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/scan.h>
#include <thrust/copy.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/constant_iterator.h>
#include <cstdlib>
#include <cstdio>
template <typename T>
void cpufunction(T *result, T *oldArray, size_t size, T k){
for (int i = 2; i<size; i++)
{
result[i] = oldArray[i] + k * result[i-2];
}
}
struct scan_op // as per blelloch (1.7)
{
template <typename T1, typename T2>
__host__ __device__
T1 operator()(const T1 &t1, const T2 &t2){
T1 ret;
thrust::get<0>(ret) = thrust::get<0>(t1)*thrust::get<2>(t2) + thrust::get<1>(t1)*thrust::get<4>(t2)+thrust::get<0>(t2);
thrust::get<1>(ret) = thrust::get<0>(t1)*thrust::get<3>(t2) + thrust::get<1>(t1)*thrust::get<5>(t2)+thrust::get<1>(t2);
thrust::get<2>(ret) = thrust::get<2>(t1)*thrust::get<2>(t2) + thrust::get<3>(t1)*thrust::get<4>(t2);
thrust::get<3>(ret) = thrust::get<2>(t1)*thrust::get<3>(t2) + thrust::get<3>(t1)*thrust::get<5>(t2);
thrust::get<4>(ret) = thrust::get<4>(t1)*thrust::get<2>(t2) + thrust::get<5>(t1)*thrust::get<4>(t2);
thrust::get<5>(ret) = thrust::get<4>(t1)*thrust::get<3>(t2) + thrust::get<5>(t1)*thrust::get<5>(t2);
return ret;
}
};
typedef double mt;
const size_t ds = 512;
const mt k = 1.01;
const int snip = 10;
int main(){
mt *b1 = new mt[ds]; // b as in blelloch (1.5)
mt *cr = new mt[ds]; // cpu result
for (int i = 0; i < ds; i++) { b1[i] = rand()/(float)RAND_MAX;}
cr[0] = b1[0];
cr[1] = b1[1];
cpufunction(cr, b1, ds, k);
for (int i = 0; i < snip; i++) std::cout << cr[i] << ",";
for (int i = ds-snip; i < ds; i++) std::cout << cr[i] << ",";
std::cout << std::endl;
mt *db;
cudaMalloc(&db, ds*sizeof(db[0]));
cudaMemcpy(db, b1, ds*sizeof(db[0]), cudaMemcpyHostToDevice);
thrust::device_ptr<mt> dp_db = thrust::device_pointer_cast(db);
auto b0 = thrust::constant_iterator<mt>(0);
auto a0 = thrust::constant_iterator<mt>(0);
auto a1 = thrust::constant_iterator<mt>(1);
auto a2 = thrust::constant_iterator<mt>(k);
auto a3 = thrust::constant_iterator<mt>(0);
thrust::device_vector<mt> dx1(ds);
thrust::device_vector<mt> dx0(ds);
thrust::device_vector<mt> dy0(ds);
thrust::device_vector<mt> dy1(ds);
thrust::device_vector<mt> dy2(ds);
thrust::device_vector<mt> dy3(ds);
auto my_i_zip = thrust::make_zip_iterator(thrust::make_tuple(dp_db, b0, a0, a1, a2, a3));
auto my_o_zip = thrust::make_zip_iterator(thrust::make_tuple(dx1.begin(), dx0.begin(), dy0.begin(), dy1.begin(), dy2.begin(), dy3.begin()));
thrust::inclusive_scan(my_i_zip, my_i_zip+ds, my_o_zip, scan_op());
cudaMemcpy(cr, thrust::raw_pointer_cast(dx1.data()), ds*sizeof(cr[0]), cudaMemcpyDeviceToHost);
for (int i = 0; i < snip; i++) std::cout << cr[i] << ",";
for (int i = ds-snip; i < ds; i++) std::cout << cr[i] << ",";
std::cout << std::endl;
}
$ nvcc -std=c++14 t1930.cu -o t1930
$ cuda-memcheck ./t1930
========= CUDA-MEMCHECK
0.840188,0.394383,1.63169,1.19677,2.55965,1.40629,2.92047,2.18858,3.22745,2.76443,570.219,601.275,576.316,607.994,582.948,614.622,589.516,621.7,595.645,628.844,
0.840188,0.394383,1.63169,1.19677,2.55965,1.40629,2.92047,2.18858,3.22745,2.76443,570.219,601.275,576.316,607.994,582.948,614.622,589.516,621.7,595.645,628.844,
========= ERROR SUMMARY: 0 errors
There should be no significant performance difference between these two approaches. (However I happened to switch the typedef to double for this example, so that makes a difference.) Using cudaMalloc as an alternative to the device_vector for the various state vectors (dx0, dx1, dy0, dy1 ...) may be slightly faster, because device_vector first does a cudaMalloc style allocation, then launches a kernel to zero out the allocation. This zero-ing step is unnecessary for the state vectors. The pattern given here should demonstrate how you could do that, if you are interested.
Here's a version that eliminates use of thrust::device_vector and thrust::host_vector altogether:
#include <iostream>
#include <thrust/device_ptr.h>
#include <thrust/scan.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/constant_iterator.h>
#include <cstdlib>
template <typename T>
void cpufunction(T *result, T *oldArray, size_t size, T k){
for (int i = 2; i<size; i++)
{
result[i] = oldArray[i] + k * result[i-2];
}
}
struct scan_op // as per blelloch (1.7)
{
template <typename T1, typename T2>
__host__ __device__
T1 operator()(const T1 &t1, const T2 &t2){
T1 ret;
thrust::get<0>(ret) = thrust::get<0>(t1)*thrust::get<2>(t2) + thrust::get<1>(t1)*thrust::get<4>(t2)+thrust::get<0>(t2);
thrust::get<1>(ret) = thrust::get<0>(t1)*thrust::get<3>(t2) + thrust::get<1>(t1)*thrust::get<5>(t2)+thrust::get<1>(t2);
thrust::get<2>(ret) = thrust::get<2>(t1)*thrust::get<2>(t2) + thrust::get<3>(t1)*thrust::get<4>(t2);
thrust::get<3>(ret) = thrust::get<2>(t1)*thrust::get<3>(t2) + thrust::get<3>(t1)*thrust::get<5>(t2);
thrust::get<4>(ret) = thrust::get<4>(t1)*thrust::get<2>(t2) + thrust::get<5>(t1)*thrust::get<4>(t2);
thrust::get<5>(ret) = thrust::get<4>(t1)*thrust::get<3>(t2) + thrust::get<5>(t1)*thrust::get<5>(t2);
return ret;
}
};
typedef float mt;
const size_t ds = 32768*4;
const mt k = 1.001;
const int snip = 10;
int main(){
mt *b1 = new mt[ds]; // b as in blelloch (1.5)
mt *cr = new mt[ds]; // result
for (int i = 0; i < ds; i++) { b1[i] = (rand()/(float)RAND_MAX)-0.5;}
cr[0] = b1[0];
cr[1] = b1[1];
cpufunction(cr, b1, ds, k);
for (int i = 0; i < snip; i++) std::cout << cr[i] << ",";
for (int i = ds-snip; i < ds; i++) std::cout << cr[i] << ",";
std::cout << std::endl;
mt *db, *dstate;
cudaMalloc(&db, ds*sizeof(db[0]));
cudaMalloc(&dstate, 6*ds*sizeof(dstate[0]));
cudaMemcpy(db, b1, ds*sizeof(db[0]), cudaMemcpyHostToDevice);
thrust::device_ptr<mt> dp_db = thrust::device_pointer_cast(db);
auto b0 = thrust::constant_iterator<mt>(0);
auto a0 = thrust::constant_iterator<mt>(0);
auto a1 = thrust::constant_iterator<mt>(1);
auto a2 = thrust::constant_iterator<mt>(k);
auto a3 = thrust::constant_iterator<mt>(0);
thrust::device_ptr<mt> dx1 = thrust::device_pointer_cast(dstate);
thrust::device_ptr<mt> dx0 = thrust::device_pointer_cast(dstate+ds);
thrust::device_ptr<mt> dy0 = thrust::device_pointer_cast(dstate+2*ds);
thrust::device_ptr<mt> dy1 = thrust::device_pointer_cast(dstate+3*ds);
thrust::device_ptr<mt> dy2 = thrust::device_pointer_cast(dstate+4*ds);
thrust::device_ptr<mt> dy3 = thrust::device_pointer_cast(dstate+5*ds);
auto my_i_zip = thrust::make_zip_iterator(thrust::make_tuple(dp_db, b0, a0, a1, a2, a3));
auto my_o_zip = thrust::make_zip_iterator(thrust::make_tuple(dx1, dx0, dy0, dy1, dy2, dy3));
thrust::inclusive_scan(my_i_zip, my_i_zip+ds, my_o_zip, scan_op());
cudaMemcpy(cr, dstate, ds*sizeof(cr[0]), cudaMemcpyDeviceToHost);
for (int i = 0; i < snip; i++) std::cout << cr[i] << ",";
for (int i = ds-snip; i < ds; i++) std::cout << cr[i] << ",";
std::cout << std::endl;
}
Here is some cpu code which shows a possible implementation of the formular derived from https://www.cs.cmu.edu/~guyb/papers/Ble93.pdf to express higher-order recurrences as a scan operation.
The key idea is that each element of the scan result is not a scalar, but a vector which contains the n previous scalar results. This way, all the required previous results are available in the scan operator to compute the next result.
#include <iostream>
#include <algorithm>
#include <numeric>
#include <array>
void calculate1(std::vector<int> vec, int k){
std::vector<int> result(vec.size(), 0);
for(int i = 2; i < vec.size(); i++){
result[i] = vec[i] + k * result[i-2];
}
std::cerr << "calculate1 result: ";
for(auto x : result){
std::cerr << x << ", ";
}
std::cerr << "\n";
}
struct S{
//data[0] stores result of last iteration
//data[1] stores result of second last iteration
std::array<int, 2> data;
};
std::ostream& operator<<(std::ostream& os, S s){
os << "(" << s.data[0] << "," << s.data[1] << ")";
}
void calculate2(std::vector<int> vec, int k){
S initvalue{{0,0}};
std::vector<S> result(vec.size(), initvalue);
std::exclusive_scan(
vec.begin() + 2,
vec.end(),
result.begin(),
initvalue,
[k](S left, int right){
S result;
/*A = (
0 1
k 0
)
Compute result = left * A + (right 0)
*/
result.data[0] = right + k * left.data[1];
result.data[1] = left.data[0];
return result;
}
);
std::cerr << "calculate2 result: ";
for(auto x : result){
std::cerr << x << ", ";
}
std::cerr << "\n";
}
int main(){
const int k = 5;
const std::vector<int> vec1{1,3,5,7,9,11,3,6,7,1,2,4};
calculate1(vec1, k);
calculate2(vec1, k);
}
https://godbolt.org/z/cszzn8Ec8
Output:
calculate1 result: 0, 0, 5, 7, 34, 46, 173, 236, 872, 1181, 4362, 5909,
calculate2 result: (0,0), (5,0), (7,5), (34,7), (46,34), (173,46), (236,173), (872,236), (1181,872), (4362,1181), (0,0), (0,0),
There is still an of-by-one error somewhere, but one can get the idea behind it.
I previously said that this approach can be used for a parallel scan in CUDA. This is not correct. For a parallel scan, the scan operator must have an additional property, which is associativity, i.e. (a OP b) OP c == a OP (b OP c). This is not the case in this approach.
Robert Crovella's answer shows how to derive an associative scan operator which can be used for a parallel scan.
/*Description: Write a function called getMax that takes three parameters of type int, and returns the biggest of the
three parameters which is of type int.
*/
#include <iostream>
using namespace std;
// Declare the function getMax and put in three variables.
int getMax(int number, int number2, int number3){
if( number >= number2 && number >= number3){
cout << number;
if(number2 >= number && number2 >= number3)
cout << number2 ;
}
else {
cout << number3;
}
return number, number2, number3;
}
// we now use the function to check for largest values below:
int main(){
cout << getMax(-13, -22, -3) << endl; //Prints -3
cout << getMax(9, 8, 9) << endl; //prints 9
cout << getMax(-5, 4, -7) << endl; //prints 4
cout << getMax(15, 15, 15) << endl; //prints 15
return 0;
}
You can solve it this way:
int getMax(int number, int number2, int number3){
return number > number2
? (number > number3 ? number : number3)
: (number2 > number3 ? number2 : number3);
}
Without ternary operators you can write:
int getMax(int number, int number2, int number3){
if (number > number2) {
if (number > number3) {
return number;
}
return number3;
}
if (number2 > number3) {
return number2;
}
return number3;
}
I'm a first year CS student trying to understand functions in C++ better because I am weak in that area as of right now. I'm trying to create a program that will ask the user for two integers that will then be passed to a calculation function that will finally be passed to a display function to show the calculations. As of right now here is my code with the output at the bottom. I'm not really sure why the num1 and num2 aren't properly being passed to the calculation function? Any help is appreciated and please disregard the style, I usually try and clean it up after I get it to work.
#include <iostream>
#include <cmath>
#include <iomanip>
using namespace std;
void getData();
void doTheMath(int num1, int num2);
void displayResults(int num1, int num2, int& sum, int& diff, int& prod, int& quot, int& rem);
int main()
{
int num1;
int num2;
int sum;
int diff;
int prod;
int quot;
int rem;
getData();
doTheMath(num1, num2);
displayResults(num1, num2, sum, diff, prod, quot, rem);
system("pause");
return 0;
}
void getData()
{
int num1;
int num2;
cout << "Please enter two integer values:\n";
cin >> num1;
cin >> num2;
cout << "The first number is " << num1
<< " and the second is "<< num2 << "\n\n";
}
void doTheMath(int num1, int num2)
{
int sum = num1 + num2;
int diff = num1 - num2;
int prod = num1 * num2;
int quot = num1 / num2;
int rem = num1 % num2;
}
void displayResults(int num1, int num2, int& sum, int& diff, int& prod, int& quot, int& rem)
{
if (num2 == 0)
{
cout << "Here are the results:\n\n";
cout << "The sum of " << num1 << " and " << num2
<< " is " << sum << ".\n";
cout << "The difference, (" << num1 << " minus "
<< num2 << ") is " << diff << ".\n";
cout << "The product of " << num1 << " and "
<< num2 << " is " << prod << ".\n";
cout << "Cannot divide by zero.\n\n";
}
else
{
cout << "Here are the results:\n\n";
cout << "The sum of " << num1 << " and " << num2
<< " is " << sum << ".\n";
cout << "The difference, (" << num1 << " minus "
<< num2 << ") is " << diff << ".\n";
cout << "The product of " << num1 << " and "
<< num2 << " is " << prod << ".\n";
cout << num1 << " divided by " << num2 << " is "
<< quot << " with a remainder of " << rem
<< ".\n\n";
}
}
//Output
/*Please enter two integer values:
12
0
The first number is 12 and the second is 0
Here are the results:
The sum of -858993460 and -858993460 is -858993460.
The difference, (-858993460 minus -858993460) is -858993460.
The product of -858993460 and -858993460 is -858993460.
-858993460 divided by -858993460 is -858993460 with a remainder of -858993460.
Press any key to continue . . .*/
The num1 and num2 variables in main() are different variables than num1 and num2 in getData(). So you're setting these in getData() but doing nothing with them except displaying. The num1 and num2 in main() are not affected. Pass these (as a reference) to getData(int &num1, int &num2) and don't declare the ones in getData() itself. Read up in 'auto' variable declaration (declared on the stack).
//==========================================================
/*Description:
This program is to showcase my understanding of
functions that I learned from our Lecture 7b. The user
is prompted to enter two integer values, where they
are then passed to a calculation function to calculate
the sum, difference, product, quotient, and remainder
of the two numbers entered. After all the values are
calculated, they are showcased in a display function
to the user in the output stream.*/
//==========================================================
#include <iostream>
#include <cmath>
#include <iomanip>
using namespace std;
void getData(int& num1, int& num2);
void doTheMath(int num1, int num2, int& sum, int& diff, int& prod, int& quot, int& rem);
void displayResults(int num1, int num2, int sum, int diff, int prod, int quot, int rem);
//====================== main ===========================
//
//=======================================================
int main()
{
int num1;
int num2;
int sum;
int diff;
int prod;
int quot;
int rem;
//Gets two integers from user
getData(num1, num2);
//Does the calculation from integers received
doTheMath(num1, num2, sum, diff, prod, quot, rem);
//Displays calculated results from two integers
displayResults(num1, num2, sum, diff, prod, quot, rem);
system("pause");
return 0;
}
/*===================== getData ==========================
This function gets the information from the user of the
two integers they wish to input. It assigns the user's
numbers to num1 and num2.
Input:
num1 - First integer assigned by user
num2 - Second integer assigned by user
Output:
The values being assigned to be used in the doTheMath
function.*/
//========================================================
void getData(int& num1, int& num2)
{
cout << "Please enter two integer values:\n";
cin >> num1;
cin >> num2;
}
/*==================== doTheMath =========================
This function calculates the user's two integers inputted
into the previous function and assigns the calculated
answers to variables named by the calculation performed.
It first checks to see if num2 is 0, because this system
can't divide by zero without crashing.
Input:
sum - adds the two integers
diff - subtracts the two integers
prod - multiplies the two integers
quot - divides the two integers
rem - gets the remainder of the two integers
Output:
Variables are now assigned new values to be displayed
inside of the displayResults function.*/
//========================================================
void doTheMath(int num1, int num2, int& sum, int& diff, int& prod, int& quot, int& rem)
{
if (num2 == 0)
{
sum = (num1 + num2);
diff = (num1 - num2);
prod = (num1 * num2);
}
else
{
sum = (num1 + num2);
diff = (num1 - num2);
prod = (num1 * num2);
quot = (num1 / num2);
rem = (num1 % num2);
}
}
/*================= displayResults ======================
This function takes the calculations from the doTheMath
function and displays them to the user in a standard
output stream. It first checks to see if num2 is 0,
because this system can't divide by zero without
crashing.
Input:
Calculations from the doTheMath function, as well as
num1 and num2.
(sum, diff, prod, quot, rem).
Output:
Displays the calculations from the doTheMath function
to the user in a standard output stream.*/
//========================================================
void displayResults(int num1, int num2, int sum, int diff, int prod, int quot, int rem)
{
if (num2 == 0)
{
cout << "Here are the results:\n\n";
cout << "The sum of " << num1 << " and " << num2
<< " is " << sum << ".\n";
cout << "The difference, (" << num1 << " minus "
<< num2 << ") is " << diff << ".\n";
cout << "The product of " << num1 << " and "
<< num2 << " is " << prod << ".\n";
cout << "Cannot divide by zero.\n\n";
}
else
{
cout << "Here are the results:\n\n";
cout << "The sum of " << num1 << " and " << num2
<< " is " << sum << ".\n";
cout << "The difference, (" << num1 << " minus "
<< num2 << ") is " << diff << ".\n";
cout << "The product of " << num1 << " and "
<< num2 << " is " << prod << ".\n";
cout << num1 << " divided by " << num2 << " is "
<< quot << " with a remainder of " << rem
<< ".\n\n";
}
}
//==========================================================
/*OUTPUT (When num2 != 0):
Please enter two integer values:
12
3
Here are the results:
The sum of 12 and 3 is 15.
The difference, (12 minus 3) is 9.
The product of 12 and 3 is 36.
12 divided by 3 is 4 with a remainder of 0.
Press any key to continue . . .*/
//==========================================================
//==========================================================
/*OUTPUT (When num2 == 0):
Please enter two integer values:
12
0
Here are the results:
The sum of 12 and 0 is 12.
The difference, (12 minus 0) is 12.
The product of 12 and 0 is 0.
Cannot divide by zero.
Press any key to continue . . .*/
//==========================================================
Thanks for helping everyone. I will continue looking at it so I can better understand! I am still struggling with recursion but I will study it more. Thanks again for all your time and effort for trying to help me
- /
int countEven(int arr[i]){
//I'm not sure what to do here... how to fix it...
int evens = 0;
if(arr[i] <= 0) return 0; //base case
while(arr[i] > 0){
int digit = arr[i]%10; //get the last digit
if(digit%2 == 0){
evens = evens+1;
}
arr[i] = arr[i]/10;
}
cout << evens;
}
}
}
int main(){
cout << "Part A:\n";
int arr[3] = { 5050155, 5, 707070 };
for (int i = 0; i < 3; i++){
cout << "countEven(" << arr[i] << ") = " << countEven(arr[i]) << endl;
cout << "removeEven(" << arr[i] << ") = " << removeEven(arr[i]) << endl;
cout << "hasEven(" << arr[i] << ") = ";
if (hasEven(arr[i])) cout << "Yes" << endl;
else cout << "No" << endl;
printStarDigit(arr[i]);
cout << endl << endl;
}
cout << "Part B:\n";
int a[4] = { 7, 2, 8, 3 };
int b[5] = { 3, 4, 5, 6, 7 };
cout << "The range of array a is " << range(a, 4) << endl;
cout << "The range of array b is " << range(b, 5) << endl;
reverse(a, 4);
reverse(b, 5);
cout << "Array a reversed: ";
for (int i = 0; i < 4; ++i)
cout << a[i] << " ";
cout << endl;
cout << "Array b reversed: ";
for (int i = 0; i < 5; ++i)
cout << b[i] << " ";
cout << endl;
return 0;
}
int countEven(int arr[i]){
Parameters must have simple names, and do not need to be identical to the expressions passed in. arr[i] is not a valid name. A simple name to use here is n.
//I'm not sure what to do here... how to fix it...
int evens = 0;
if(arr[i] <= 0) return 0; //base case
This base case is wrong for two reasons. Firstly, you treat all negative integers as the base case, but -202020 has six even digits. Secondly, you return the wrong value: 0 has one even digit, but you return zero.
A possible base case could be n > -10 && n < 10 (single digit number). I'll let you figure out the expression to return for that base case.
while(arr[i] > 0){
If your task is to write a recursive function, then you shouldn't use a loop here. Instead, see below.
int digit = arr[i]%10; //get the last digit
...
arr[i] = arr[i]/10;
This is a correct way of obtaining the last digit, and everything other than the last digit.
if(digit%2 == 0)
This is a correct way of determining whether the last digit is even.
Now, you need to combine what you have, by observing that the count of even digits is equal to "1 if the last digit is even, else 0", plus the count of even non-last digits. The goal of the exercise is to get you to write "the count of even non-last digits" as countEven(n / 10).
See these code snippets.
int countEven(int n, bool first_time = true){
static int total ;
int num =n;
if (first_time)
{
total = 0;
}
if (num <= 9) {
return num % 2 == 0 ? 1 : 0;
}
int temp = num - (num / 10) * 10;
num = num / 10;
total += temp % 2 == 0 ? 1 : 0;
countEven(num,false);
return total;
}
int removeEven(int n){
int result = 0;
/*TODO*/
return result;
}
bool hasEven(int n){
return countEven(n);
}
void printStarDigit(int* arr){
/*TODO*/
}
int range(int* arr, int n){
int result = *arr;
for (int i = 0; i < n; i++) {
if (result < *(arr + i)) result = *(arr + i);
}
return result;
}
void reverse(int* arr, int n){
int* temp = new int[n];
for (int i = 0; i < n; i++) {
*(temp + i) = *(arr + i);
}
for (int i = 0; i < n; i++) {
*(arr + i) = *(temp + n - 1 - i);
}
delete[] temp;
}
Main implementation :
int main() {
cout << "Part A:\n";
int arr[3] = { 5050155, 5, 707070 };
for (int i = 0; i < 3; i++) {
cout << "countEven(" << arr[i] << ") = " << countEven(arr[i]) << endl;
cout << "removeEven(" << arr[i] << ") = " << removeEven(arr[i]) << endl;
cout << "hasEven(" << arr[i] << ") = ";
if (hasEven(arr[i])) cout << "Yes" << endl;
else cout << "No" << endl;
printStarDigit(arr + i);
cout << endl << endl;
}
cout << "Part B:\n";
int a[4] = { 7, 2, 8, 3 };
int b[5] = { 3, 4, 5, 6, 7 };
cout << "The range of array a is " << range(a, 4) << endl;
cout << "The range of array b is " << range(b, 5) << endl;
reverse(a, 4);
reverse(b, 5);
cout << "Array a reversed: ";
for (int i = 0; i < 4; ++i)
cout << a[i] << " ";
cout << endl;
cout << "Array b reversed: ";
for (int i = 0; i < 5; ++i)
cout << b[i] << " ";
cout << endl;
return 0;
}
Output:
I am to write a cuda code which searches set of keyword strings inside set of data strings and returns an array of boolean for keyword-data string pairs. Data strings: at the moment, 10000(may vary) strings and each of them has max 250 chars.
Keyword strings: at the moment, 100(may vary) strings and each of them has max 100 chars.
Length of each string is known.
My question is which of the following approaches might be more suitable in this case.
1st:
gridDim.x => # of keyword strings
gridDim.y => # of data strings
blockDim => (max string size(250 in this case),1,1)
Naive algorithm will be used for search
Each thread will load the chars of keyword and data to shared mem from global mem.
Each thread will be responsible for one of the windows in naive search algorithm.
Result will be written to the boolean array.
So, each block will be responsible for keyword-data pair.
2nd:
gridDim => (# of data strings,1,1)
blockDim => (# of keyword strings,1,1)
In each block, data string will be loaded to shared mem.
In this case, each thread will be responsible for keyword-data pair instead of block.
Each thread will search corresponding keyword inside the data string.
Naive algorithm is not necessary in this case, Boyer-Moore might be used.
For searches inside huge files, since length of the data is much bigger than the length of the keyword, 1st approach is used. But in this case, I am not sure if the 1st appraoch is better. On the other hand, for 2nd approach, coalescing the keywords might be a problem, since the lengths are not fixed. There is an upper boundry for the size of the keywords. So, padding might ease the coalescing but it would consume more memory.
Anyhow, if you have worked on a similar case or know about a better approach than those I described above, please help me out.
Thank you in advance.
So, I've implemented both of the cases. Code for approach 1:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "stdio.h"
#include "iostream"
#include "chrono"
#include "cstdlib"
#define SEARCHTERMSIZE 100
#define SEARCHITEMSIZE 65000
#define MAXDATASTRINGSIZE 250
#define MAXKEYWORDSTRINGSSIZE 50
using namespace std;
__global__ void searchKeywordKernel(bool* resultPtr, const char * dataPtr, const short* dataLengths, const char * keywordPtr, const short* keywordLengths)
{
int dataIndex = blockIdx.x;
int keywordIndex = blockIdx.y;
int dataLength = dataLengths[dataIndex];
int keywordLength = keywordLengths[keywordIndex];
__shared__ char sData[MAXDATASTRINGSIZE];
__shared__ char sKeyword[MAXKEYWORDSTRINGSSIZE];
__shared__ bool isFound;
if (dataIndex < SEARCHITEMSIZE && keywordIndex < SEARCHTERMSIZE)
{
if (dataLength < keywordLength)
{
resultPtr[keywordIndex*SEARCHITEMSIZE + dataIndex] = false;
}
else
{
isFound = false;
sData[threadIdx.x] = dataPtr[dataIndex*MAXDATASTRINGSIZE + threadIdx.x];
if (threadIdx.x < keywordLength)
sKeyword[threadIdx.x] = keywordPtr[keywordIndex*MAXKEYWORDSTRINGSSIZE + threadIdx.x];
__syncthreads();
if (threadIdx.x <= dataLength - keywordLength)
{
for (int i = 0; i < keywordLength && !isFound; i++)
{
if (sData[threadIdx.x + i] != sKeyword[i])
break;
if (i == keywordLength - 1)
isFound = true;
}
}
resultPtr[keywordIndex*SEARCHITEMSIZE + dataIndex] = isFound;
}
}
}
int main()
{
chrono::steady_clock::time_point startTime;
chrono::steady_clock::time_point endTime;
typedef chrono::duration<int, milli> millisecs_t;
//////////Search Data Init/////////////////
cout << "Before Search Data Init" << endl;
startTime = chrono::steady_clock::now();
char* dataPtr = (char*)malloc(sizeof(char)*MAXDATASTRINGSIZE*SEARCHITEMSIZE);
short* dataLengths = new short[SEARCHITEMSIZE];
short temp;
short tempChar;
for (int i = 0; i < SEARCHITEMSIZE; i++)
{
temp = rand() % (MAXDATASTRINGSIZE - 20) + 20;
for (int k = 0; k < temp; k++)
{
tempChar = rand() % 26;
dataPtr[i*MAXDATASTRINGSIZE + k] = 97 + tempChar; //97->a, 98->b, 122->z
}
dataLengths[i] = temp;
}
endTime = chrono::steady_clock::now();
millisecs_t duration(chrono::duration_cast<millisecs_t>(endTime - startTime));
cout << "After Search Data Init: " << duration.count() << "ms" << endl;
//////////Search Data Init/////////////////
//////////Search Keyword Init/////////////////
cout << "Before Search Keyword Init" << endl;
startTime = chrono::steady_clock::now();
char* keywordPtr = (char*)malloc(sizeof(char)*MAXKEYWORDSTRINGSSIZE*SEARCHTERMSIZE);
short* keywordLengths = new short[SEARCHTERMSIZE]; //lenghts, not the start positions
for (int i = 0; i < SEARCHTERMSIZE; i++)
{
temp = rand() % (MAXKEYWORDSTRINGSSIZE - 10) + 10;
for (int k = 0; k < temp; k++)
{
tempChar = rand() % 26;
keywordPtr[i*MAXKEYWORDSTRINGSSIZE + k] = 97 + tempChar; //97->a, 98->b, 122->z
}
keywordLengths[i] = temp;
}
endTime = chrono::steady_clock::now();
millisecs_t duration1(chrono::duration_cast<millisecs_t>(endTime - startTime));
cout << "After Search Keyword Init: " << duration1.count() << "ms" << endl;
//////////Search Keyword Init/////////////////
char* d_dataPtr;
short* d_dataLengths;
char* d_keywordPtr;
short* d_keywordLengths;
bool* d_resultPtr;
/////////////////////////CudaMalloc/////////////////////////////////
cout << "Before Malloc" << endl;
startTime = chrono::steady_clock::now();
cudaMalloc(&d_dataPtr, sizeof(char) * SEARCHITEMSIZE * MAXDATASTRINGSIZE);
cudaMalloc(&d_dataLengths, sizeof(short) * SEARCHITEMSIZE);
cudaMalloc(&d_keywordPtr, sizeof(char) * SEARCHTERMSIZE*MAXKEYWORDSTRINGSSIZE);
cudaMalloc(&d_keywordLengths, sizeof(short) * SEARCHTERMSIZE);
cudaMalloc(&d_resultPtr, sizeof(bool)*SEARCHITEMSIZE * SEARCHTERMSIZE);
endTime = chrono::steady_clock::now();
millisecs_t duration2(chrono::duration_cast<millisecs_t>(endTime - startTime));
cout << "After Malloc: " << duration2.count() << "ms" << endl;
/////////////////////////CudaMalloc/////////////////////////////////
cudaEvent_t start, stop;
float elapsedTime;
/////////////////////////CudaMemCpy///////////////////////////////////
cout << "Before Memcpy" << endl;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
cudaMemcpy(d_dataPtr, dataPtr, sizeof(char) * SEARCHITEMSIZE * MAXDATASTRINGSIZE, cudaMemcpyHostToDevice);
cudaMemcpy(d_dataLengths, dataLengths, sizeof(short) * SEARCHITEMSIZE, cudaMemcpyHostToDevice);
cudaMemcpy(d_keywordPtr, keywordPtr, sizeof(char) * SEARCHTERMSIZE*MAXKEYWORDSTRINGSSIZE, cudaMemcpyHostToDevice);
cudaMemcpy(d_keywordLengths, keywordLengths, sizeof(short) * SEARCHTERMSIZE, cudaMemcpyHostToDevice);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
cout << "After Memcpy: " << elapsedTime << "ms" << endl;
/////////////////////////CudaMemCpy///////////////////////////////////
////////////////////////Kernel//////////////////////////////////////////
cout << "Before Kernel" << endl;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
dim3 dimGrid(SEARCHITEMSIZE,SEARCHTERMSIZE);
searchKeywordKernel << < dimGrid, MAXDATASTRINGSIZE >> >(d_resultPtr, d_dataPtr, d_dataLengths, d_keywordPtr, d_keywordLengths);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
cout << "After Kernel: " << elapsedTime << "ms" << endl;
////////////////////////Kernel//////////////////////////////////////////
bool* result = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
cudaMemcpy(result, d_resultPtr, sizeof(bool) * SEARCHITEMSIZE * SEARCHTERMSIZE, cudaMemcpyDeviceToHost);
/////////////////////////////////// CPU code //////////////////////////////////////////
bool* cpuResult = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
cout << "CPU code starts" << endl;
startTime = chrono::steady_clock::now();
for (int i = 0; i < SEARCHTERMSIZE; i++)
{
for (int j = 0; j < SEARCHITEMSIZE; j++)
{
if (dataLengths[j] < keywordLengths[i])
{
cpuResult[i*SEARCHITEMSIZE + j] = false;
break;
}
else
{
for (int k = 0; k <= dataLengths[j] - keywordLengths[i]; k++)
{
cpuResult[i*SEARCHITEMSIZE + j] = true;
for (int l = 0; l < keywordLengths[i]; l++)
{
if (dataPtr[j*MAXDATASTRINGSIZE + k + l] != keywordPtr[i*MAXKEYWORDSTRINGSSIZE + l])
{
cpuResult[i*SEARCHITEMSIZE + j] = false;
break;
}
}
if (cpuResult[i*SEARCHTERMSIZE + j])
break;
}
}
}
}
endTime = chrono::steady_clock::now();
millisecs_t duration3(chrono::duration_cast<millisecs_t>(endTime - startTime));
cout << "CPU code ends: " << duration3.count() << "ms" << endl;
/////////////////////////////////// CPU code //////////////////////////////////////////
////////////////////////////////////Result Comparison////////////////////////////////////////
bool kernelRes = true;
for (int i = 0; i < SEARCHITEMSIZE*SEARCHTERMSIZE; i++)
{
if (cpuResult[i] != result[i])
{
kernelRes = false;
break;
}
}
////////////////////////////////////Result Comparison////////////////////////////////////////
cout << boolalpha << "Kernel computation: " << kernelRes << endl;
cout << "Before Deleting arrays" << endl;
delete[] dataPtr;
delete[] keywordPtr;
delete[] dataLengths;
delete[] keywordLengths;
delete[] result;
delete[] cpuResult;
cout << "After Deleting arrays" << endl;
cout << "Before Freeing device memory" << endl;
cudaFree(d_dataPtr);
cudaFree(d_keywordPtr);
cudaFree(d_dataLengths);
cudaFree(d_keywordLengths);
cudaFree(d_resultPtr);
cout << "After Freeing device memory" << endl;
cudaDeviceReset();
system("pause");
return 0;
}
Code for approach 2:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
#include <chrono>
#include <cstdlib>
#define SEARCHTERMSIZE 198
#define SEARCHITEMSIZE 65000
#define MAXDATASTRINGSIZE 250
#define MAXKEYWORDSTRINGSSIZE 50
using namespace std;
__global__ void searchKeywordKernel(bool* resultPtr, const char * __restrict__ dataPtr, const short* dataLengths, const char * keywordPtr, const short* keywordLengths)
{
int dataIndex = blockIdx.x;
int keywordIndex = threadIdx.x;
int dataLength = dataLengths[dataIndex];
int keywordLength = keywordLengths[keywordIndex];
__shared__ char sData[MAXDATASTRINGSIZE];
if (dataIndex < SEARCHITEMSIZE)
{
int my_tid = keywordIndex;
while (my_tid < dataLength)
{
sData[my_tid] = dataPtr[dataIndex*MAXDATASTRINGSIZE + my_tid];
my_tid += blockDim.x;
}
__syncthreads();
if (keywordIndex < SEARCHTERMSIZE)
{
if (dataLength < keywordLength)
{
resultPtr[dataIndex*SEARCHTERMSIZE + keywordIndex] = false;
}
else
{
bool isFound = true;
for (int i = 0; i <= dataLength - keywordLength; i++)
{
for (int j = 0; j < keywordLength; j++)
{
if (sData[i + j] != keywordPtr[j*SEARCHTERMSIZE + keywordIndex])
{
isFound = false;
break;
}
}
if (isFound)
break;
}
resultPtr[dataIndex*SEARCHTERMSIZE + keywordIndex] = isFound;
}
}
}
}
int main()
{
chrono::steady_clock::time_point startTime;
chrono::steady_clock::time_point endTime;
typedef chrono::duration<int, milli> millisecs_t;
//////////Search Data Init/////////////////
cout << "Before Search Data Init" << endl;
startTime = chrono::steady_clock::now();
char* dataPtr = (char*)malloc(sizeof(char)*MAXDATASTRINGSIZE*SEARCHITEMSIZE);
short* dataLengths = new short[SEARCHITEMSIZE];
short temp;
short tempChar;
for (int i = 0; i < SEARCHITEMSIZE; i++)
{
temp = rand() % (MAXDATASTRINGSIZE - 20) + 20;
for (int k = 0; k < temp; k++)
{
tempChar = rand() % 26;
dataPtr[i*MAXDATASTRINGSIZE + k] = 97 + tempChar; //97->a, 98->b, 122->z
}
dataLengths[i] = temp;
}
endTime = chrono::steady_clock::now();
millisecs_t duration(chrono::duration_cast<millisecs_t>(endTime - startTime));
cout << "After Search Data Init: " << duration.count() << "ms" << endl;
//////////Search Data Init/////////////////
//////////Search Keyword Init/////////////////
cout << "Before Search Keyword Init" << endl;
startTime = chrono::steady_clock::now();
char* keywordPtr = (char*)malloc(sizeof(char)*MAXKEYWORDSTRINGSSIZE*SEARCHTERMSIZE);
short* keywordLengths = new short[SEARCHTERMSIZE]; //lenghts, not the start positions
for (int i = 0; i < SEARCHTERMSIZE; i++)
{
temp = rand() % (MAXKEYWORDSTRINGSSIZE - 10) + 10;
for (int k = 0; k < temp; k++)
{
tempChar = rand() % 26;
keywordPtr[i*MAXKEYWORDSTRINGSSIZE + k] = 97 + tempChar; //97->a, 98->b, 122->z
}
keywordLengths[i] = temp;
}
endTime = chrono::steady_clock::now();
millisecs_t duration1(chrono::duration_cast<millisecs_t>(endTime - startTime));
cout << "After Search Keyword Init: " << duration1.count() << "ms" << endl;
//////////Search Keyword Init/////////////////
////////////////////Traverse Keyword Array////////////////////////////
char* keywordPtr_T = new char[SEARCHTERMSIZE*MAXKEYWORDSTRINGSSIZE];
for (int i = 0; i < SEARCHTERMSIZE; i++)
for (int j = 0; j < MAXKEYWORDSTRINGSSIZE; j++)
keywordPtr_T[j*SEARCHTERMSIZE + i] = keywordPtr[i*MAXKEYWORDSTRINGSSIZE + j];
////////////////////Traverse Keyword Array////////////////////////////
char* d_dataPtr;
short* d_dataLengths;
char* d_keywordPtr;
short* d_keywordLengths;
bool* d_resultPtr;
/////////////////////////CudaMalloc/////////////////////////////////
cout << "Before Malloc" << endl;
startTime = chrono::steady_clock::now();
cudaMalloc(&d_dataPtr, sizeof(char) * SEARCHITEMSIZE * MAXDATASTRINGSIZE);
cudaMalloc(&d_dataLengths, sizeof(short) * SEARCHITEMSIZE);
cudaMalloc(&d_keywordPtr, sizeof(char) * SEARCHTERMSIZE*MAXKEYWORDSTRINGSSIZE);
cudaMalloc(&d_keywordLengths, sizeof(short) * SEARCHTERMSIZE);
cudaMalloc(&d_resultPtr, sizeof(bool)*SEARCHITEMSIZE * SEARCHTERMSIZE);
endTime = chrono::steady_clock::now();
millisecs_t duration2(chrono::duration_cast<millisecs_t>(endTime - startTime));
cout << "After Malloc: " << duration2.count() << "ms" << endl;
/////////////////////////CudaMalloc/////////////////////////////////
cudaEvent_t start, stop;
float elapsedTime;
/////////////////////////CudaMemCpy///////////////////////////////////
cout << "Before Memcpy" << endl;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
cudaMemcpy(d_dataPtr, dataPtr, sizeof(char) * SEARCHITEMSIZE * MAXDATASTRINGSIZE, cudaMemcpyHostToDevice);
cudaMemcpy(d_dataLengths, dataLengths, sizeof(short) * SEARCHITEMSIZE, cudaMemcpyHostToDevice);
cudaMemcpy(d_keywordPtr, keywordPtr_T, sizeof(char) * SEARCHTERMSIZE*MAXKEYWORDSTRINGSSIZE, cudaMemcpyHostToDevice);
cudaMemcpy(d_keywordLengths, keywordLengths, sizeof(short) * SEARCHTERMSIZE, cudaMemcpyHostToDevice);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
cout << "After Memcpy: " << elapsedTime << "ms" << endl;
/////////////////////////CudaMemCpy///////////////////////////////////
////////////////////////Kernel//////////////////////////////////////////
cout << "Before Kernel" << endl;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
searchKeywordKernel << < SEARCHITEMSIZE, SEARCHTERMSIZE >> >(d_resultPtr, d_dataPtr, d_dataLengths, d_keywordPtr, d_keywordLengths);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
cout << "After Kernel: " << elapsedTime << "ms" << endl;
////////////////////////Kernel//////////////////////////////////////////
bool* result_T = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
bool* result = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
cudaMemcpy(result_T, d_resultPtr, sizeof(bool) * SEARCHITEMSIZE * SEARCHTERMSIZE, cudaMemcpyDeviceToHost);
for (int i = 0; i < SEARCHTERMSIZE; i++)
for (int j = 0; j < SEARCHITEMSIZE; j++)
result[j*SEARCHTERMSIZE + i] = result_T[i*SEARCHITEMSIZE + j];
/////////////////////////////////// CPU code //////////////////////////////////////////
bool* cpuResult = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
cout << "CPU code starts" << endl;
startTime = chrono::steady_clock::now();
for (int i = 0; i < SEARCHTERMSIZE; i++)
{
for (int j = 0; j < SEARCHITEMSIZE; j++)
{
if (dataLengths[j] < keywordLengths[i])
{
cpuResult[i*SEARCHITEMSIZE + j] = false;
break;
}
else
{
for (int k = 0; k <= dataLengths[j] - keywordLengths[i]; k++)
{
cpuResult[i*SEARCHITEMSIZE + j] = true;
for (int l = 0; l < keywordLengths[i]; l++)
{
if (dataPtr[j*MAXDATASTRINGSIZE + k + l] != keywordPtr[i*MAXKEYWORDSTRINGSSIZE + l])
{
cpuResult[i*SEARCHITEMSIZE + j] = false;
break;
}
}
if (cpuResult[i*SEARCHTERMSIZE + j])
break;
}
}
}
}
endTime = chrono::steady_clock::now();
millisecs_t duration3(chrono::duration_cast<millisecs_t>(endTime - startTime));
cout << "CPU code ends: " << duration3.count() << "ms" << endl;
/////////////////////////////////// CPU code //////////////////////////////////////////
////////////////////////////////////Result Comparison////////////////////////////////////////
bool kernelRes = true;
for (int i = 0; i < SEARCHITEMSIZE*SEARCHTERMSIZE; i++)
{
if (cpuResult[i] != result[i])
{
kernelRes = false;
break;
}
}
////////////////////////////////////Result Comparison////////////////////////////////////////
cout << boolalpha << "Kernel computation: " << kernelRes << endl;
cout << "Before Deleting arrays" << endl;
delete[] dataPtr;
delete[] keywordPtr;
delete[] keywordPtr_T;
delete[] dataLengths;
delete[] keywordLengths;
delete[] result;
delete[] result_T;
delete[] cpuResult;
cout << "After Deleting arrays" << endl;
cout << "Before Freeing device memory" << endl;
cudaFree(d_dataPtr);
cudaFree(d_keywordPtr);
cudaFree(d_dataLengths);
cudaFree(d_keywordLengths);
cudaFree(d_resultPtr);
cout << "After Freeing device memory" << endl;
cudaDeviceReset();
system("pause");
return 0;
}
Second approach gave better results than the first approach. Yet the performance of the second approach depends on the number of keywords. If the number of the keywords is multiple of 192, gpu has performance than cpu (time of malloc+memcpy+kernel < time of cpu).
What should I do to overcome such dependancy? Would it be viable to increase the number of threads and to pass multiple data strings rather than one in each block?
I suggest blockDim = (16, 16, 1) and gridDim = (# of data strings / 16, # of keyword strings / 16, 1). In your case, where tens of strings can ideally fit in shared memory, such block-grid division will lead to minimum global memory access while introducing no computation overhead.
Padding is not a good choice, unless each string is expected to have its length quite close to the maximum (80% of maximum for example). If you keep a array of offset of every string (CPU is good at generating it), coalescing global memory read is just trivial.