Advice on string search with Cuda - cuda

I am to write a cuda code which searches set of keyword strings inside set of data strings and returns an array of boolean for keyword-data string pairs. Data strings: at the moment, 10000(may vary) strings and each of them has max 250 chars.
Keyword strings: at the moment, 100(may vary) strings and each of them has max 100 chars.
Length of each string is known.
My question is which of the following approaches might be more suitable in this case.
1st:
gridDim.x => # of keyword strings
gridDim.y => # of data strings
blockDim => (max string size(250 in this case),1,1)
Naive algorithm will be used for search
Each thread will load the chars of keyword and data to shared mem from global mem.
Each thread will be responsible for one of the windows in naive search algorithm.
Result will be written to the boolean array.
So, each block will be responsible for keyword-data pair.
2nd:
gridDim => (# of data strings,1,1)
blockDim => (# of keyword strings,1,1)
In each block, data string will be loaded to shared mem.
In this case, each thread will be responsible for keyword-data pair instead of block.
Each thread will search corresponding keyword inside the data string.
Naive algorithm is not necessary in this case, Boyer-Moore might be used.
For searches inside huge files, since length of the data is much bigger than the length of the keyword, 1st approach is used. But in this case, I am not sure if the 1st appraoch is better. On the other hand, for 2nd approach, coalescing the keywords might be a problem, since the lengths are not fixed. There is an upper boundry for the size of the keywords. So, padding might ease the coalescing but it would consume more memory.
Anyhow, if you have worked on a similar case or know about a better approach than those I described above, please help me out.
Thank you in advance.
So, I've implemented both of the cases. Code for approach 1:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "stdio.h"
#include "iostream"
#include "chrono"
#include "cstdlib"
#define SEARCHTERMSIZE 100
#define SEARCHITEMSIZE 65000
#define MAXDATASTRINGSIZE 250
#define MAXKEYWORDSTRINGSSIZE 50
using namespace std;
__global__ void searchKeywordKernel(bool* resultPtr, const char * dataPtr, const short* dataLengths, const char * keywordPtr, const short* keywordLengths)
{
int dataIndex = blockIdx.x;
int keywordIndex = blockIdx.y;
int dataLength = dataLengths[dataIndex];
int keywordLength = keywordLengths[keywordIndex];
__shared__ char sData[MAXDATASTRINGSIZE];
__shared__ char sKeyword[MAXKEYWORDSTRINGSSIZE];
__shared__ bool isFound;
if (dataIndex < SEARCHITEMSIZE && keywordIndex < SEARCHTERMSIZE)
{
if (dataLength < keywordLength)
{
resultPtr[keywordIndex*SEARCHITEMSIZE + dataIndex] = false;
}
else
{
isFound = false;
sData[threadIdx.x] = dataPtr[dataIndex*MAXDATASTRINGSIZE + threadIdx.x];
if (threadIdx.x < keywordLength)
sKeyword[threadIdx.x] = keywordPtr[keywordIndex*MAXKEYWORDSTRINGSSIZE + threadIdx.x];
__syncthreads();
if (threadIdx.x <= dataLength - keywordLength)
{
for (int i = 0; i < keywordLength && !isFound; i++)
{
if (sData[threadIdx.x + i] != sKeyword[i])
break;
if (i == keywordLength - 1)
isFound = true;
}
}
resultPtr[keywordIndex*SEARCHITEMSIZE + dataIndex] = isFound;
}
}
}
int main()
{
chrono::steady_clock::time_point startTime;
chrono::steady_clock::time_point endTime;
typedef chrono::duration<int, milli> millisecs_t;
//////////Search Data Init/////////////////
cout << "Before Search Data Init" << endl;
startTime = chrono::steady_clock::now();
char* dataPtr = (char*)malloc(sizeof(char)*MAXDATASTRINGSIZE*SEARCHITEMSIZE);
short* dataLengths = new short[SEARCHITEMSIZE];
short temp;
short tempChar;
for (int i = 0; i < SEARCHITEMSIZE; i++)
{
temp = rand() % (MAXDATASTRINGSIZE - 20) + 20;
for (int k = 0; k < temp; k++)
{
tempChar = rand() % 26;
dataPtr[i*MAXDATASTRINGSIZE + k] = 97 + tempChar; //97->a, 98->b, 122->z
}
dataLengths[i] = temp;
}
endTime = chrono::steady_clock::now();
millisecs_t duration(chrono::duration_cast<millisecs_t>(endTime - startTime));
cout << "After Search Data Init: " << duration.count() << "ms" << endl;
//////////Search Data Init/////////////////
//////////Search Keyword Init/////////////////
cout << "Before Search Keyword Init" << endl;
startTime = chrono::steady_clock::now();
char* keywordPtr = (char*)malloc(sizeof(char)*MAXKEYWORDSTRINGSSIZE*SEARCHTERMSIZE);
short* keywordLengths = new short[SEARCHTERMSIZE]; //lenghts, not the start positions
for (int i = 0; i < SEARCHTERMSIZE; i++)
{
temp = rand() % (MAXKEYWORDSTRINGSSIZE - 10) + 10;
for (int k = 0; k < temp; k++)
{
tempChar = rand() % 26;
keywordPtr[i*MAXKEYWORDSTRINGSSIZE + k] = 97 + tempChar; //97->a, 98->b, 122->z
}
keywordLengths[i] = temp;
}
endTime = chrono::steady_clock::now();
millisecs_t duration1(chrono::duration_cast<millisecs_t>(endTime - startTime));
cout << "After Search Keyword Init: " << duration1.count() << "ms" << endl;
//////////Search Keyword Init/////////////////
char* d_dataPtr;
short* d_dataLengths;
char* d_keywordPtr;
short* d_keywordLengths;
bool* d_resultPtr;
/////////////////////////CudaMalloc/////////////////////////////////
cout << "Before Malloc" << endl;
startTime = chrono::steady_clock::now();
cudaMalloc(&d_dataPtr, sizeof(char) * SEARCHITEMSIZE * MAXDATASTRINGSIZE);
cudaMalloc(&d_dataLengths, sizeof(short) * SEARCHITEMSIZE);
cudaMalloc(&d_keywordPtr, sizeof(char) * SEARCHTERMSIZE*MAXKEYWORDSTRINGSSIZE);
cudaMalloc(&d_keywordLengths, sizeof(short) * SEARCHTERMSIZE);
cudaMalloc(&d_resultPtr, sizeof(bool)*SEARCHITEMSIZE * SEARCHTERMSIZE);
endTime = chrono::steady_clock::now();
millisecs_t duration2(chrono::duration_cast<millisecs_t>(endTime - startTime));
cout << "After Malloc: " << duration2.count() << "ms" << endl;
/////////////////////////CudaMalloc/////////////////////////////////
cudaEvent_t start, stop;
float elapsedTime;
/////////////////////////CudaMemCpy///////////////////////////////////
cout << "Before Memcpy" << endl;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
cudaMemcpy(d_dataPtr, dataPtr, sizeof(char) * SEARCHITEMSIZE * MAXDATASTRINGSIZE, cudaMemcpyHostToDevice);
cudaMemcpy(d_dataLengths, dataLengths, sizeof(short) * SEARCHITEMSIZE, cudaMemcpyHostToDevice);
cudaMemcpy(d_keywordPtr, keywordPtr, sizeof(char) * SEARCHTERMSIZE*MAXKEYWORDSTRINGSSIZE, cudaMemcpyHostToDevice);
cudaMemcpy(d_keywordLengths, keywordLengths, sizeof(short) * SEARCHTERMSIZE, cudaMemcpyHostToDevice);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
cout << "After Memcpy: " << elapsedTime << "ms" << endl;
/////////////////////////CudaMemCpy///////////////////////////////////
////////////////////////Kernel//////////////////////////////////////////
cout << "Before Kernel" << endl;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
dim3 dimGrid(SEARCHITEMSIZE,SEARCHTERMSIZE);
searchKeywordKernel << < dimGrid, MAXDATASTRINGSIZE >> >(d_resultPtr, d_dataPtr, d_dataLengths, d_keywordPtr, d_keywordLengths);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
cout << "After Kernel: " << elapsedTime << "ms" << endl;
////////////////////////Kernel//////////////////////////////////////////
bool* result = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
cudaMemcpy(result, d_resultPtr, sizeof(bool) * SEARCHITEMSIZE * SEARCHTERMSIZE, cudaMemcpyDeviceToHost);
/////////////////////////////////// CPU code //////////////////////////////////////////
bool* cpuResult = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
cout << "CPU code starts" << endl;
startTime = chrono::steady_clock::now();
for (int i = 0; i < SEARCHTERMSIZE; i++)
{
for (int j = 0; j < SEARCHITEMSIZE; j++)
{
if (dataLengths[j] < keywordLengths[i])
{
cpuResult[i*SEARCHITEMSIZE + j] = false;
break;
}
else
{
for (int k = 0; k <= dataLengths[j] - keywordLengths[i]; k++)
{
cpuResult[i*SEARCHITEMSIZE + j] = true;
for (int l = 0; l < keywordLengths[i]; l++)
{
if (dataPtr[j*MAXDATASTRINGSIZE + k + l] != keywordPtr[i*MAXKEYWORDSTRINGSSIZE + l])
{
cpuResult[i*SEARCHITEMSIZE + j] = false;
break;
}
}
if (cpuResult[i*SEARCHTERMSIZE + j])
break;
}
}
}
}
endTime = chrono::steady_clock::now();
millisecs_t duration3(chrono::duration_cast<millisecs_t>(endTime - startTime));
cout << "CPU code ends: " << duration3.count() << "ms" << endl;
/////////////////////////////////// CPU code //////////////////////////////////////////
////////////////////////////////////Result Comparison////////////////////////////////////////
bool kernelRes = true;
for (int i = 0; i < SEARCHITEMSIZE*SEARCHTERMSIZE; i++)
{
if (cpuResult[i] != result[i])
{
kernelRes = false;
break;
}
}
////////////////////////////////////Result Comparison////////////////////////////////////////
cout << boolalpha << "Kernel computation: " << kernelRes << endl;
cout << "Before Deleting arrays" << endl;
delete[] dataPtr;
delete[] keywordPtr;
delete[] dataLengths;
delete[] keywordLengths;
delete[] result;
delete[] cpuResult;
cout << "After Deleting arrays" << endl;
cout << "Before Freeing device memory" << endl;
cudaFree(d_dataPtr);
cudaFree(d_keywordPtr);
cudaFree(d_dataLengths);
cudaFree(d_keywordLengths);
cudaFree(d_resultPtr);
cout << "After Freeing device memory" << endl;
cudaDeviceReset();
system("pause");
return 0;
}
Code for approach 2:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
#include <chrono>
#include <cstdlib>
#define SEARCHTERMSIZE 198
#define SEARCHITEMSIZE 65000
#define MAXDATASTRINGSIZE 250
#define MAXKEYWORDSTRINGSSIZE 50
using namespace std;
__global__ void searchKeywordKernel(bool* resultPtr, const char * __restrict__ dataPtr, const short* dataLengths, const char * keywordPtr, const short* keywordLengths)
{
int dataIndex = blockIdx.x;
int keywordIndex = threadIdx.x;
int dataLength = dataLengths[dataIndex];
int keywordLength = keywordLengths[keywordIndex];
__shared__ char sData[MAXDATASTRINGSIZE];
if (dataIndex < SEARCHITEMSIZE)
{
int my_tid = keywordIndex;
while (my_tid < dataLength)
{
sData[my_tid] = dataPtr[dataIndex*MAXDATASTRINGSIZE + my_tid];
my_tid += blockDim.x;
}
__syncthreads();
if (keywordIndex < SEARCHTERMSIZE)
{
if (dataLength < keywordLength)
{
resultPtr[dataIndex*SEARCHTERMSIZE + keywordIndex] = false;
}
else
{
bool isFound = true;
for (int i = 0; i <= dataLength - keywordLength; i++)
{
for (int j = 0; j < keywordLength; j++)
{
if (sData[i + j] != keywordPtr[j*SEARCHTERMSIZE + keywordIndex])
{
isFound = false;
break;
}
}
if (isFound)
break;
}
resultPtr[dataIndex*SEARCHTERMSIZE + keywordIndex] = isFound;
}
}
}
}
int main()
{
chrono::steady_clock::time_point startTime;
chrono::steady_clock::time_point endTime;
typedef chrono::duration<int, milli> millisecs_t;
//////////Search Data Init/////////////////
cout << "Before Search Data Init" << endl;
startTime = chrono::steady_clock::now();
char* dataPtr = (char*)malloc(sizeof(char)*MAXDATASTRINGSIZE*SEARCHITEMSIZE);
short* dataLengths = new short[SEARCHITEMSIZE];
short temp;
short tempChar;
for (int i = 0; i < SEARCHITEMSIZE; i++)
{
temp = rand() % (MAXDATASTRINGSIZE - 20) + 20;
for (int k = 0; k < temp; k++)
{
tempChar = rand() % 26;
dataPtr[i*MAXDATASTRINGSIZE + k] = 97 + tempChar; //97->a, 98->b, 122->z
}
dataLengths[i] = temp;
}
endTime = chrono::steady_clock::now();
millisecs_t duration(chrono::duration_cast<millisecs_t>(endTime - startTime));
cout << "After Search Data Init: " << duration.count() << "ms" << endl;
//////////Search Data Init/////////////////
//////////Search Keyword Init/////////////////
cout << "Before Search Keyword Init" << endl;
startTime = chrono::steady_clock::now();
char* keywordPtr = (char*)malloc(sizeof(char)*MAXKEYWORDSTRINGSSIZE*SEARCHTERMSIZE);
short* keywordLengths = new short[SEARCHTERMSIZE]; //lenghts, not the start positions
for (int i = 0; i < SEARCHTERMSIZE; i++)
{
temp = rand() % (MAXKEYWORDSTRINGSSIZE - 10) + 10;
for (int k = 0; k < temp; k++)
{
tempChar = rand() % 26;
keywordPtr[i*MAXKEYWORDSTRINGSSIZE + k] = 97 + tempChar; //97->a, 98->b, 122->z
}
keywordLengths[i] = temp;
}
endTime = chrono::steady_clock::now();
millisecs_t duration1(chrono::duration_cast<millisecs_t>(endTime - startTime));
cout << "After Search Keyword Init: " << duration1.count() << "ms" << endl;
//////////Search Keyword Init/////////////////
////////////////////Traverse Keyword Array////////////////////////////
char* keywordPtr_T = new char[SEARCHTERMSIZE*MAXKEYWORDSTRINGSSIZE];
for (int i = 0; i < SEARCHTERMSIZE; i++)
for (int j = 0; j < MAXKEYWORDSTRINGSSIZE; j++)
keywordPtr_T[j*SEARCHTERMSIZE + i] = keywordPtr[i*MAXKEYWORDSTRINGSSIZE + j];
////////////////////Traverse Keyword Array////////////////////////////
char* d_dataPtr;
short* d_dataLengths;
char* d_keywordPtr;
short* d_keywordLengths;
bool* d_resultPtr;
/////////////////////////CudaMalloc/////////////////////////////////
cout << "Before Malloc" << endl;
startTime = chrono::steady_clock::now();
cudaMalloc(&d_dataPtr, sizeof(char) * SEARCHITEMSIZE * MAXDATASTRINGSIZE);
cudaMalloc(&d_dataLengths, sizeof(short) * SEARCHITEMSIZE);
cudaMalloc(&d_keywordPtr, sizeof(char) * SEARCHTERMSIZE*MAXKEYWORDSTRINGSSIZE);
cudaMalloc(&d_keywordLengths, sizeof(short) * SEARCHTERMSIZE);
cudaMalloc(&d_resultPtr, sizeof(bool)*SEARCHITEMSIZE * SEARCHTERMSIZE);
endTime = chrono::steady_clock::now();
millisecs_t duration2(chrono::duration_cast<millisecs_t>(endTime - startTime));
cout << "After Malloc: " << duration2.count() << "ms" << endl;
/////////////////////////CudaMalloc/////////////////////////////////
cudaEvent_t start, stop;
float elapsedTime;
/////////////////////////CudaMemCpy///////////////////////////////////
cout << "Before Memcpy" << endl;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
cudaMemcpy(d_dataPtr, dataPtr, sizeof(char) * SEARCHITEMSIZE * MAXDATASTRINGSIZE, cudaMemcpyHostToDevice);
cudaMemcpy(d_dataLengths, dataLengths, sizeof(short) * SEARCHITEMSIZE, cudaMemcpyHostToDevice);
cudaMemcpy(d_keywordPtr, keywordPtr_T, sizeof(char) * SEARCHTERMSIZE*MAXKEYWORDSTRINGSSIZE, cudaMemcpyHostToDevice);
cudaMemcpy(d_keywordLengths, keywordLengths, sizeof(short) * SEARCHTERMSIZE, cudaMemcpyHostToDevice);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
cout << "After Memcpy: " << elapsedTime << "ms" << endl;
/////////////////////////CudaMemCpy///////////////////////////////////
////////////////////////Kernel//////////////////////////////////////////
cout << "Before Kernel" << endl;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
searchKeywordKernel << < SEARCHITEMSIZE, SEARCHTERMSIZE >> >(d_resultPtr, d_dataPtr, d_dataLengths, d_keywordPtr, d_keywordLengths);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
cout << "After Kernel: " << elapsedTime << "ms" << endl;
////////////////////////Kernel//////////////////////////////////////////
bool* result_T = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
bool* result = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
cudaMemcpy(result_T, d_resultPtr, sizeof(bool) * SEARCHITEMSIZE * SEARCHTERMSIZE, cudaMemcpyDeviceToHost);
for (int i = 0; i < SEARCHTERMSIZE; i++)
for (int j = 0; j < SEARCHITEMSIZE; j++)
result[j*SEARCHTERMSIZE + i] = result_T[i*SEARCHITEMSIZE + j];
/////////////////////////////////// CPU code //////////////////////////////////////////
bool* cpuResult = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
cout << "CPU code starts" << endl;
startTime = chrono::steady_clock::now();
for (int i = 0; i < SEARCHTERMSIZE; i++)
{
for (int j = 0; j < SEARCHITEMSIZE; j++)
{
if (dataLengths[j] < keywordLengths[i])
{
cpuResult[i*SEARCHITEMSIZE + j] = false;
break;
}
else
{
for (int k = 0; k <= dataLengths[j] - keywordLengths[i]; k++)
{
cpuResult[i*SEARCHITEMSIZE + j] = true;
for (int l = 0; l < keywordLengths[i]; l++)
{
if (dataPtr[j*MAXDATASTRINGSIZE + k + l] != keywordPtr[i*MAXKEYWORDSTRINGSSIZE + l])
{
cpuResult[i*SEARCHITEMSIZE + j] = false;
break;
}
}
if (cpuResult[i*SEARCHTERMSIZE + j])
break;
}
}
}
}
endTime = chrono::steady_clock::now();
millisecs_t duration3(chrono::duration_cast<millisecs_t>(endTime - startTime));
cout << "CPU code ends: " << duration3.count() << "ms" << endl;
/////////////////////////////////// CPU code //////////////////////////////////////////
////////////////////////////////////Result Comparison////////////////////////////////////////
bool kernelRes = true;
for (int i = 0; i < SEARCHITEMSIZE*SEARCHTERMSIZE; i++)
{
if (cpuResult[i] != result[i])
{
kernelRes = false;
break;
}
}
////////////////////////////////////Result Comparison////////////////////////////////////////
cout << boolalpha << "Kernel computation: " << kernelRes << endl;
cout << "Before Deleting arrays" << endl;
delete[] dataPtr;
delete[] keywordPtr;
delete[] keywordPtr_T;
delete[] dataLengths;
delete[] keywordLengths;
delete[] result;
delete[] result_T;
delete[] cpuResult;
cout << "After Deleting arrays" << endl;
cout << "Before Freeing device memory" << endl;
cudaFree(d_dataPtr);
cudaFree(d_keywordPtr);
cudaFree(d_dataLengths);
cudaFree(d_keywordLengths);
cudaFree(d_resultPtr);
cout << "After Freeing device memory" << endl;
cudaDeviceReset();
system("pause");
return 0;
}
Second approach gave better results than the first approach. Yet the performance of the second approach depends on the number of keywords. If the number of the keywords is multiple of 192, gpu has performance than cpu (time of malloc+memcpy+kernel < time of cpu).
What should I do to overcome such dependancy? Would it be viable to increase the number of threads and to pass multiple data strings rather than one in each block?

I suggest blockDim = (16, 16, 1) and gridDim = (# of data strings / 16, # of keyword strings / 16, 1). In your case, where tens of strings can ideally fit in shared memory, such block-grid division will lead to minimum global memory access while introducing no computation overhead.
Padding is not a good choice, unless each string is expected to have its length quite close to the maximum (80% of maximum for example). If you keep a array of offset of every string (CPU is good at generating it), coalescing global memory read is just trivial.

Related

CUDA 2nd order recursion with thrust inclusive_scan

I'm trying to understand how to parallelise a recursive calculation. Serially, the calculation takes the form:
for (int i = 2; i<size; i++)
{
result[i] = oldArray[i] + k * result[i-2];
}
For the i-1 index there's a solution here to a previous question of mine: CUDA force instruction execution order
I want to modify this to use the i-2 and I can't understand how to apply the same process to a 2nd order calculation. It should be possible using the thrust::inclusive_scan function, but I can't work out how. Does anyone know the solution?
Picking up where we left off in the previous question/answer, we shift our attention to equation 1.11 in the referenced paper by Blelloch. We observe that your problem formulation:
for (int i = 2; i<size; i++)
{
result[i] = oldArray[i] + k * result[i-2];
}
seems to match that in equation 1.11 if we set m=2 and in that case we can also observe that for your formulation, all ai,1 are zero (and, as previously, all ai,2 are k).
As per equation 1.12 in that paper, our state variable si now becomes a two-tuple:
si = |xi xi-1|
Taking note of these things, we observe the "correctness" of equation 1.13:
si = |xi-1 xi-2| . |0 1, k 0| + |bi 0|
rewriting:
si,1 = xi = k*xi-2 + bi
si,2 = xi-1 = xi-1
(In my view, the other answer leaves you at this point. That realization, i.e. result.data[0] = right + k * left.data[1]; is sufficient for a serial scan but not for a parallel scan. It's also evident that the functor/scan op there is not associative.)
We now need to come up with a binary operator bop that is an extension of the definition in (1.7) to this case. Referring to the previous definition in equation 1.7, we extend that based on the treatment in 1.13 as follows:
Ci = |Ai , Bi|
where:
Ai = |0 1, k 0|
and:
Bi = |bi 0|
We then have:
Ci bop Cj = | Ai . Aj , Bi . Aj + Bj |
This then becomes the formula for our functor/scan operator. We will need to carry 6 scalar "state" quantities throughout: 2 for the B vector and 4 for the A matrix.
What follows then is a realization of the above:
$ cat t1930.cu
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/scan.h>
#include <thrust/copy.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/constant_iterator.h>
#include <cstdlib>
#include <cstdio>
template <typename T>
void cpufunction(T *result, T *oldArray, size_t size, T k){
for (int i = 2; i<size; i++)
{
result[i] = oldArray[i] + k * result[i-2];
}
}
struct scan_op // as per blelloch (1.7)
{
template <typename T1, typename T2>
__host__ __device__
T1 operator()(const T1 &t1, const T2 &t2){
T1 ret;
thrust::get<0>(ret) = thrust::get<0>(t1)*thrust::get<2>(t2) + thrust::get<1>(t1)*thrust::get<4>(t2)+thrust::get<0>(t2);
thrust::get<1>(ret) = thrust::get<0>(t1)*thrust::get<3>(t2) + thrust::get<1>(t1)*thrust::get<5>(t2)+thrust::get<1>(t2);
thrust::get<2>(ret) = thrust::get<2>(t1)*thrust::get<2>(t2) + thrust::get<3>(t1)*thrust::get<4>(t2);
thrust::get<3>(ret) = thrust::get<2>(t1)*thrust::get<3>(t2) + thrust::get<3>(t1)*thrust::get<5>(t2);
thrust::get<4>(ret) = thrust::get<4>(t1)*thrust::get<2>(t2) + thrust::get<5>(t1)*thrust::get<4>(t2);
thrust::get<5>(ret) = thrust::get<4>(t1)*thrust::get<3>(t2) + thrust::get<5>(t1)*thrust::get<5>(t2);
return ret;
}
};
typedef float mt;
const size_t ds = 512;
const mt k = 1.01;
const int snip = 10;
int main(){
mt *b1 = new mt[ds]; // b as in blelloch (1.5)
mt *cr = new mt[ds]; // cpu result
for (int i = 0; i < ds; i++) { b1[i] = rand()/(float)RAND_MAX;}
cr[0] = b1[0];
cr[1] = b1[1];
cpufunction(cr, b1, ds, k);
for (int i = 0; i < snip; i++) std::cout << cr[i] << ",";
for (int i = ds-snip; i < ds; i++) std::cout << cr[i] << ",";
std::cout << std::endl;
thrust::device_vector<mt> db(b1, b1+ds);
auto b0 = thrust::constant_iterator<mt>(0);
auto a0 = thrust::constant_iterator<mt>(0);
auto a1 = thrust::constant_iterator<mt>(1);
auto a2 = thrust::constant_iterator<mt>(k);
auto a3 = thrust::constant_iterator<mt>(0);
thrust::device_vector<mt> dx1(ds);
thrust::device_vector<mt> dx0(ds);
thrust::device_vector<mt> dy0(ds);
thrust::device_vector<mt> dy1(ds);
thrust::device_vector<mt> dy2(ds);
thrust::device_vector<mt> dy3(ds);
auto my_i_zip = thrust::make_zip_iterator(thrust::make_tuple(db.begin(), b0, a0, a1, a2, a3));
auto my_o_zip = thrust::make_zip_iterator(thrust::make_tuple(dx1.begin(), dx0.begin(), dy0.begin(), dy1.begin(), dy2.begin(), dy3.begin()));
thrust::inclusive_scan(my_i_zip, my_i_zip+ds, my_o_zip, scan_op());
thrust::host_vector<mt> hx1 = dx1;
thrust::copy_n(hx1.begin(), snip, std::ostream_iterator<mt>(std::cout, ","));
thrust::copy_n(hx1.begin()+ds-snip, snip, std::ostream_iterator<mt>(std::cout, ","));
std::cout << std::endl;
}
$ nvcc -std=c++14 t1930.cu -o t1930
$ cuda-memcheck ./t1930
========= CUDA-MEMCHECK
0.840188,0.394383,1.63169,1.19677,2.55965,1.40629,2.92047,2.18858,3.22745,2.76443,570.218,601.275,576.315,607.993,582.947,614.621,589.516,621.699,595.644,628.843,
0.840188,0.394383,1.63169,1.19677,2.55965,1.40629,2.92047,2.18858,3.22745,2.76443,570.219,601.275,576.316,607.994,582.948,614.621,589.516,621.7,595.644,628.843,
========= ERROR SUMMARY: 0 errors
$
Yes, there are some results above that differ in the 6th digit. I attribute this to the limitations of float resolution when taking into account the very different order of operations between the serial and parallel method. If you change the typedef to double, the results will appear to match exactly.
Since you've asked about it here's an equivalent realization where it is demonstrated using device data previously allocated using cudaMalloc:
$ cat t1930.cu
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/scan.h>
#include <thrust/copy.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/constant_iterator.h>
#include <cstdlib>
#include <cstdio>
template <typename T>
void cpufunction(T *result, T *oldArray, size_t size, T k){
for (int i = 2; i<size; i++)
{
result[i] = oldArray[i] + k * result[i-2];
}
}
struct scan_op // as per blelloch (1.7)
{
template <typename T1, typename T2>
__host__ __device__
T1 operator()(const T1 &t1, const T2 &t2){
T1 ret;
thrust::get<0>(ret) = thrust::get<0>(t1)*thrust::get<2>(t2) + thrust::get<1>(t1)*thrust::get<4>(t2)+thrust::get<0>(t2);
thrust::get<1>(ret) = thrust::get<0>(t1)*thrust::get<3>(t2) + thrust::get<1>(t1)*thrust::get<5>(t2)+thrust::get<1>(t2);
thrust::get<2>(ret) = thrust::get<2>(t1)*thrust::get<2>(t2) + thrust::get<3>(t1)*thrust::get<4>(t2);
thrust::get<3>(ret) = thrust::get<2>(t1)*thrust::get<3>(t2) + thrust::get<3>(t1)*thrust::get<5>(t2);
thrust::get<4>(ret) = thrust::get<4>(t1)*thrust::get<2>(t2) + thrust::get<5>(t1)*thrust::get<4>(t2);
thrust::get<5>(ret) = thrust::get<4>(t1)*thrust::get<3>(t2) + thrust::get<5>(t1)*thrust::get<5>(t2);
return ret;
}
};
typedef double mt;
const size_t ds = 512;
const mt k = 1.01;
const int snip = 10;
int main(){
mt *b1 = new mt[ds]; // b as in blelloch (1.5)
mt *cr = new mt[ds]; // cpu result
for (int i = 0; i < ds; i++) { b1[i] = rand()/(float)RAND_MAX;}
cr[0] = b1[0];
cr[1] = b1[1];
cpufunction(cr, b1, ds, k);
for (int i = 0; i < snip; i++) std::cout << cr[i] << ",";
for (int i = ds-snip; i < ds; i++) std::cout << cr[i] << ",";
std::cout << std::endl;
mt *db;
cudaMalloc(&db, ds*sizeof(db[0]));
cudaMemcpy(db, b1, ds*sizeof(db[0]), cudaMemcpyHostToDevice);
thrust::device_ptr<mt> dp_db = thrust::device_pointer_cast(db);
auto b0 = thrust::constant_iterator<mt>(0);
auto a0 = thrust::constant_iterator<mt>(0);
auto a1 = thrust::constant_iterator<mt>(1);
auto a2 = thrust::constant_iterator<mt>(k);
auto a3 = thrust::constant_iterator<mt>(0);
thrust::device_vector<mt> dx1(ds);
thrust::device_vector<mt> dx0(ds);
thrust::device_vector<mt> dy0(ds);
thrust::device_vector<mt> dy1(ds);
thrust::device_vector<mt> dy2(ds);
thrust::device_vector<mt> dy3(ds);
auto my_i_zip = thrust::make_zip_iterator(thrust::make_tuple(dp_db, b0, a0, a1, a2, a3));
auto my_o_zip = thrust::make_zip_iterator(thrust::make_tuple(dx1.begin(), dx0.begin(), dy0.begin(), dy1.begin(), dy2.begin(), dy3.begin()));
thrust::inclusive_scan(my_i_zip, my_i_zip+ds, my_o_zip, scan_op());
cudaMemcpy(cr, thrust::raw_pointer_cast(dx1.data()), ds*sizeof(cr[0]), cudaMemcpyDeviceToHost);
for (int i = 0; i < snip; i++) std::cout << cr[i] << ",";
for (int i = ds-snip; i < ds; i++) std::cout << cr[i] << ",";
std::cout << std::endl;
}
$ nvcc -std=c++14 t1930.cu -o t1930
$ cuda-memcheck ./t1930
========= CUDA-MEMCHECK
0.840188,0.394383,1.63169,1.19677,2.55965,1.40629,2.92047,2.18858,3.22745,2.76443,570.219,601.275,576.316,607.994,582.948,614.622,589.516,621.7,595.645,628.844,
0.840188,0.394383,1.63169,1.19677,2.55965,1.40629,2.92047,2.18858,3.22745,2.76443,570.219,601.275,576.316,607.994,582.948,614.622,589.516,621.7,595.645,628.844,
========= ERROR SUMMARY: 0 errors
There should be no significant performance difference between these two approaches. (However I happened to switch the typedef to double for this example, so that makes a difference.) Using cudaMalloc as an alternative to the device_vector for the various state vectors (dx0, dx1, dy0, dy1 ...) may be slightly faster, because device_vector first does a cudaMalloc style allocation, then launches a kernel to zero out the allocation. This zero-ing step is unnecessary for the state vectors. The pattern given here should demonstrate how you could do that, if you are interested.
Here's a version that eliminates use of thrust::device_vector and thrust::host_vector altogether:
#include <iostream>
#include <thrust/device_ptr.h>
#include <thrust/scan.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/constant_iterator.h>
#include <cstdlib>
template <typename T>
void cpufunction(T *result, T *oldArray, size_t size, T k){
for (int i = 2; i<size; i++)
{
result[i] = oldArray[i] + k * result[i-2];
}
}
struct scan_op // as per blelloch (1.7)
{
template <typename T1, typename T2>
__host__ __device__
T1 operator()(const T1 &t1, const T2 &t2){
T1 ret;
thrust::get<0>(ret) = thrust::get<0>(t1)*thrust::get<2>(t2) + thrust::get<1>(t1)*thrust::get<4>(t2)+thrust::get<0>(t2);
thrust::get<1>(ret) = thrust::get<0>(t1)*thrust::get<3>(t2) + thrust::get<1>(t1)*thrust::get<5>(t2)+thrust::get<1>(t2);
thrust::get<2>(ret) = thrust::get<2>(t1)*thrust::get<2>(t2) + thrust::get<3>(t1)*thrust::get<4>(t2);
thrust::get<3>(ret) = thrust::get<2>(t1)*thrust::get<3>(t2) + thrust::get<3>(t1)*thrust::get<5>(t2);
thrust::get<4>(ret) = thrust::get<4>(t1)*thrust::get<2>(t2) + thrust::get<5>(t1)*thrust::get<4>(t2);
thrust::get<5>(ret) = thrust::get<4>(t1)*thrust::get<3>(t2) + thrust::get<5>(t1)*thrust::get<5>(t2);
return ret;
}
};
typedef float mt;
const size_t ds = 32768*4;
const mt k = 1.001;
const int snip = 10;
int main(){
mt *b1 = new mt[ds]; // b as in blelloch (1.5)
mt *cr = new mt[ds]; // result
for (int i = 0; i < ds; i++) { b1[i] = (rand()/(float)RAND_MAX)-0.5;}
cr[0] = b1[0];
cr[1] = b1[1];
cpufunction(cr, b1, ds, k);
for (int i = 0; i < snip; i++) std::cout << cr[i] << ",";
for (int i = ds-snip; i < ds; i++) std::cout << cr[i] << ",";
std::cout << std::endl;
mt *db, *dstate;
cudaMalloc(&db, ds*sizeof(db[0]));
cudaMalloc(&dstate, 6*ds*sizeof(dstate[0]));
cudaMemcpy(db, b1, ds*sizeof(db[0]), cudaMemcpyHostToDevice);
thrust::device_ptr<mt> dp_db = thrust::device_pointer_cast(db);
auto b0 = thrust::constant_iterator<mt>(0);
auto a0 = thrust::constant_iterator<mt>(0);
auto a1 = thrust::constant_iterator<mt>(1);
auto a2 = thrust::constant_iterator<mt>(k);
auto a3 = thrust::constant_iterator<mt>(0);
thrust::device_ptr<mt> dx1 = thrust::device_pointer_cast(dstate);
thrust::device_ptr<mt> dx0 = thrust::device_pointer_cast(dstate+ds);
thrust::device_ptr<mt> dy0 = thrust::device_pointer_cast(dstate+2*ds);
thrust::device_ptr<mt> dy1 = thrust::device_pointer_cast(dstate+3*ds);
thrust::device_ptr<mt> dy2 = thrust::device_pointer_cast(dstate+4*ds);
thrust::device_ptr<mt> dy3 = thrust::device_pointer_cast(dstate+5*ds);
auto my_i_zip = thrust::make_zip_iterator(thrust::make_tuple(dp_db, b0, a0, a1, a2, a3));
auto my_o_zip = thrust::make_zip_iterator(thrust::make_tuple(dx1, dx0, dy0, dy1, dy2, dy3));
thrust::inclusive_scan(my_i_zip, my_i_zip+ds, my_o_zip, scan_op());
cudaMemcpy(cr, dstate, ds*sizeof(cr[0]), cudaMemcpyDeviceToHost);
for (int i = 0; i < snip; i++) std::cout << cr[i] << ",";
for (int i = ds-snip; i < ds; i++) std::cout << cr[i] << ",";
std::cout << std::endl;
}
Here is some cpu code which shows a possible implementation of the formular derived from https://www.cs.cmu.edu/~guyb/papers/Ble93.pdf to express higher-order recurrences as a scan operation.
The key idea is that each element of the scan result is not a scalar, but a vector which contains the n previous scalar results. This way, all the required previous results are available in the scan operator to compute the next result.
#include <iostream>
#include <algorithm>
#include <numeric>
#include <array>
void calculate1(std::vector<int> vec, int k){
std::vector<int> result(vec.size(), 0);
for(int i = 2; i < vec.size(); i++){
result[i] = vec[i] + k * result[i-2];
}
std::cerr << "calculate1 result: ";
for(auto x : result){
std::cerr << x << ", ";
}
std::cerr << "\n";
}
struct S{
//data[0] stores result of last iteration
//data[1] stores result of second last iteration
std::array<int, 2> data;
};
std::ostream& operator<<(std::ostream& os, S s){
os << "(" << s.data[0] << "," << s.data[1] << ")";
}
void calculate2(std::vector<int> vec, int k){
S initvalue{{0,0}};
std::vector<S> result(vec.size(), initvalue);
std::exclusive_scan(
vec.begin() + 2,
vec.end(),
result.begin(),
initvalue,
[k](S left, int right){
S result;
/*A = (
0 1
k 0
)
Compute result = left * A + (right 0)
*/
result.data[0] = right + k * left.data[1];
result.data[1] = left.data[0];
return result;
}
);
std::cerr << "calculate2 result: ";
for(auto x : result){
std::cerr << x << ", ";
}
std::cerr << "\n";
}
int main(){
const int k = 5;
const std::vector<int> vec1{1,3,5,7,9,11,3,6,7,1,2,4};
calculate1(vec1, k);
calculate2(vec1, k);
}
https://godbolt.org/z/cszzn8Ec8
Output:
calculate1 result: 0, 0, 5, 7, 34, 46, 173, 236, 872, 1181, 4362, 5909,
calculate2 result: (0,0), (5,0), (7,5), (34,7), (46,34), (173,46), (236,173), (872,236), (1181,872), (4362,1181), (0,0), (0,0),
There is still an of-by-one error somewhere, but one can get the idea behind it.
I previously said that this approach can be used for a parallel scan in CUDA. This is not correct. For a parallel scan, the scan operator must have an additional property, which is associativity, i.e. (a OP b) OP c == a OP (b OP c). This is not the case in this approach.
Robert Crovella's answer shows how to derive an associative scan operator which can be used for a parallel scan.

Recursion and functions

Thanks for helping everyone. I will continue looking at it so I can better understand! I am still struggling with recursion but I will study it more. Thanks again for all your time and effort for trying to help me
- /
int countEven(int arr[i]){
//I'm not sure what to do here... how to fix it...
int evens = 0;
if(arr[i] <= 0) return 0; //base case
while(arr[i] > 0){
int digit = arr[i]%10; //get the last digit
if(digit%2 == 0){
evens = evens+1;
}
arr[i] = arr[i]/10;
}
cout << evens;
}
}
}
int main(){
cout << "Part A:\n";
int arr[3] = { 5050155, 5, 707070 };
for (int i = 0; i < 3; i++){
cout << "countEven(" << arr[i] << ") = " << countEven(arr[i]) << endl;
cout << "removeEven(" << arr[i] << ") = " << removeEven(arr[i]) << endl;
cout << "hasEven(" << arr[i] << ") = ";
if (hasEven(arr[i])) cout << "Yes" << endl;
else cout << "No" << endl;
printStarDigit(arr[i]);
cout << endl << endl;
}
cout << "Part B:\n";
int a[4] = { 7, 2, 8, 3 };
int b[5] = { 3, 4, 5, 6, 7 };
cout << "The range of array a is " << range(a, 4) << endl;
cout << "The range of array b is " << range(b, 5) << endl;
reverse(a, 4);
reverse(b, 5);
cout << "Array a reversed: ";
for (int i = 0; i < 4; ++i)
cout << a[i] << " ";
cout << endl;
cout << "Array b reversed: ";
for (int i = 0; i < 5; ++i)
cout << b[i] << " ";
cout << endl;
return 0;
}
int countEven(int arr[i]){
Parameters must have simple names, and do not need to be identical to the expressions passed in. arr[i] is not a valid name. A simple name to use here is n.
//I'm not sure what to do here... how to fix it...
int evens = 0;
if(arr[i] <= 0) return 0; //base case
This base case is wrong for two reasons. Firstly, you treat all negative integers as the base case, but -202020 has six even digits. Secondly, you return the wrong value: 0 has one even digit, but you return zero.
A possible base case could be n > -10 && n < 10 (single digit number). I'll let you figure out the expression to return for that base case.
while(arr[i] > 0){
If your task is to write a recursive function, then you shouldn't use a loop here. Instead, see below.
int digit = arr[i]%10; //get the last digit
...
arr[i] = arr[i]/10;
This is a correct way of obtaining the last digit, and everything other than the last digit.
if(digit%2 == 0)
This is a correct way of determining whether the last digit is even.
Now, you need to combine what you have, by observing that the count of even digits is equal to "1 if the last digit is even, else 0", plus the count of even non-last digits. The goal of the exercise is to get you to write "the count of even non-last digits" as countEven(n / 10).
See these code snippets.
int countEven(int n, bool first_time = true){
static int total ;
int num =n;
if (first_time)
{
total = 0;
}
if (num <= 9) {
return num % 2 == 0 ? 1 : 0;
}
int temp = num - (num / 10) * 10;
num = num / 10;
total += temp % 2 == 0 ? 1 : 0;
countEven(num,false);
return total;
}
int removeEven(int n){
int result = 0;
/*TODO*/
return result;
}
bool hasEven(int n){
return countEven(n);
}
void printStarDigit(int* arr){
/*TODO*/
}
int range(int* arr, int n){
int result = *arr;
for (int i = 0; i < n; i++) {
if (result < *(arr + i)) result = *(arr + i);
}
return result;
}
void reverse(int* arr, int n){
int* temp = new int[n];
for (int i = 0; i < n; i++) {
*(temp + i) = *(arr + i);
}
for (int i = 0; i < n; i++) {
*(arr + i) = *(temp + n - 1 - i);
}
delete[] temp;
}
Main implementation :
int main() {
cout << "Part A:\n";
int arr[3] = { 5050155, 5, 707070 };
for (int i = 0; i < 3; i++) {
cout << "countEven(" << arr[i] << ") = " << countEven(arr[i]) << endl;
cout << "removeEven(" << arr[i] << ") = " << removeEven(arr[i]) << endl;
cout << "hasEven(" << arr[i] << ") = ";
if (hasEven(arr[i])) cout << "Yes" << endl;
else cout << "No" << endl;
printStarDigit(arr + i);
cout << endl << endl;
}
cout << "Part B:\n";
int a[4] = { 7, 2, 8, 3 };
int b[5] = { 3, 4, 5, 6, 7 };
cout << "The range of array a is " << range(a, 4) << endl;
cout << "The range of array b is " << range(b, 5) << endl;
reverse(a, 4);
reverse(b, 5);
cout << "Array a reversed: ";
for (int i = 0; i < 4; ++i)
cout << a[i] << " ";
cout << endl;
cout << "Array b reversed: ";
for (int i = 0; i < 5; ++i)
cout << b[i] << " ";
cout << endl;
return 0;
}
Output:

GPU reduction code only runs one time

I have been using the code sample supplied by Robert Crovella:
thrust::max_element slow in comparison cublasIsamax - More efficient implementation?
Which is a very fast reduction code. I modified it to also return the index of the max in the input array of floats. When I use it in my code, it will only execute one time. If I try calling the routine again it does not find a new max value, it just returns the previous max. Is there something about the volatile global memory that the routine uses that needs to be reset before it can be called again?
#include <cuda.h>
#include <cublas_v2.h>
#include <thrust/extrema.h>
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <stdio.h>
#include <stdlib.h>
#define DSIZE 4096*4 // nTPB should be a power-of-2
#define nTPB 512
#define MAX_KERNEL_BLOCKS 30
#define MAX_BLOCKS ((DSIZE/nTPB)+1)
#define MIN(a,b) ((a>b)?b:a)
#define FLOAT_MIN -1.0f
#include <helper_functions.h>
#include <helper_cuda.h>
// this code has been modified to return the index of the max instead of the actual max value - for my application
__device__ volatile float blk_vals[MAX_BLOCKS];
__device__ volatile int blk_idxs[MAX_BLOCKS];
__device__ int blk_num = 0;
//template <typename T>
__global__ void max_idx_kernel(const float *data, const int dsize, int *result){
__shared__ volatile float vals[nTPB];
__shared__ volatile int idxs[nTPB];
__shared__ volatile int last_block;
int idx = threadIdx.x+blockDim.x*blockIdx.x;
last_block = 0;
float my_val = FLOAT_MIN;
int my_idx = -1;
// sweep from global memory
while (idx < dsize){
if (data[idx] > my_val) {my_val = data[idx]; my_idx = idx;}
idx += blockDim.x*gridDim.x;}
// populate shared memory
vals[threadIdx.x] = my_val;
idxs[threadIdx.x] = my_idx;
__syncthreads();
// sweep in shared memory
for (int i = (nTPB>>1); i > 0; i>>=1){
if (threadIdx.x < i)
if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }
__syncthreads();}
// perform block-level reduction
if (!threadIdx.x){
blk_vals[blockIdx.x] = vals[0];
blk_idxs[blockIdx.x] = idxs[0];
if (atomicAdd(&blk_num, 1) == gridDim.x - 1) // then I am the last block
last_block = 1;}
__syncthreads();
if (last_block){
idx = threadIdx.x;
my_val = FLOAT_MIN;
my_idx = -1;
while (idx < gridDim.x){
if (blk_vals[idx] > my_val) {my_val = blk_vals[idx]; my_idx = blk_idxs[idx]; }
idx += blockDim.x;}
// populate shared memory
vals[threadIdx.x] = my_val;
idxs[threadIdx.x] = my_idx;
__syncthreads();
// sweep in shared memory
for (int i = (nTPB>>1); i > 0; i>>=1){
if (threadIdx.x < i)
if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }
__syncthreads();}
if (!threadIdx.x)
*result = idxs[0];
}
}
int main(){
int nrElements = DSIZE;
float *d_vector, *h_vector;
StopWatchInterface *hTimer = NULL;
sdkCreateTimer(&hTimer);
double gpuTime;
int k;
int max_index;
int *d_max_index;
cudaMalloc(&d_max_index, sizeof(int));
h_vector = new float[DSIZE];
for(k=0; k < 5; k++){
for (int i = 0; i < DSIZE; i++) h_vector[i] = rand()/(float)RAND_MAX;
h_vector[10+k] = 10; // create definite max element that changes with each loop iteration
cublasHandle_t my_handle;
cublasStatus_t my_status = cublasCreate(&my_handle);
cudaMalloc(&d_vector, DSIZE*sizeof(float));
cudaMemcpy(d_vector, h_vector, DSIZE*sizeof(float), cudaMemcpyHostToDevice);
max_index = 0;
sdkResetTimer(&hTimer);
sdkStartTimer(&hTimer);
//d_vector is a pointer on the device pointing to the beginning of the vector, containing nrElements floats.
thrust::device_ptr<float> d_ptr = thrust::device_pointer_cast(d_vector);
thrust::device_vector<float>::iterator d_it = thrust::max_element(d_ptr, d_ptr + nrElements);
max_index = d_it - (thrust::device_vector<float>::iterator)d_ptr;
cudaDeviceSynchronize();
gpuTime = sdkGetTimerValue(&hTimer);
std::cout << "loop: " << k << " thrust time: " << gpuTime << " max index: " << max_index << std::endl;
max_index = 0;
sdkResetTimer(&hTimer);
sdkStartTimer(&hTimer);
my_status = cublasIsamax(my_handle, DSIZE, d_vector, 1, &max_index);
cudaDeviceSynchronize();
gpuTime = sdkGetTimerValue(&hTimer);
std::cout << "loop: " << k << " cublas time: " << gpuTime << " max index: " << max_index-1 << std::endl;
max_index = 0;
sdkResetTimer(&hTimer);
sdkStartTimer(&hTimer);
max_idx_kernel<<<MIN(MAX_KERNEL_BLOCKS, ((DSIZE+nTPB-1)/nTPB)), nTPB>>>(d_vector, DSIZE, d_max_index);
cudaMemcpy(&max_index, d_max_index, sizeof(int), cudaMemcpyDeviceToHost);
gpuTime = sdkGetTimerValue(&hTimer);
std::cout << "loop: " << k << " idx kern time: " << gpuTime << " max index: " << max_index << std::endl;
std::cout << std::endl;
} // end for loop on k
cudaFree(d_max_index);
cudaFree(d_vector);
return 0;
}
The primary issue in re-using this code for multiple loops as-is is in this static initialization of a device (global) variable:
__device__ int blk_num = 0;
That's OK if you're only going to run the routine once. But if you intend to re-use it, you will need to re-initialize this variable to zero before each call to the kernel.
We could fix this by putting an explicit initialization of this variable to zero before each call to the reduction kernel:
cudaMemcpyToSymbol(blk_num, &max_index, sizeof(int));
(I'm using max_index here simply because it is a convenient host int variable that has just been set to zero.)
That's the only change needed to get the code "working".
However the introduction of the loop has created some other "issues" that I would point out. These 3 lines of code:
cublasHandle_t my_handle;
cublasStatus_t my_status = cublasCreate(&my_handle);
cudaMalloc(&d_vector, DSIZE*sizeof(float));
don't belong inside the for-loop on k. That is effectively creating a memory leak and unnecessarily re-initializing the cublas library.
The following code has those changes and seems to work for me:
$ cat t1183.cu
#include <cuda.h>
#include <cublas_v2.h>
#include <thrust/extrema.h>
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <stdio.h>
#include <stdlib.h>
#define DSIZE 4096*4 // nTPB should be a power-of-2
#define nTPB 512
#define MAX_KERNEL_BLOCKS 30
#define MAX_BLOCKS ((DSIZE/nTPB)+1)
#define MIN(a,b) ((a>b)?b:a)
#define FLOAT_MIN -1.0f
#include <helper_functions.h>
#include <helper_cuda.h>
// this code has been modified to return the index of the max instead of the actual max value - for my application
__device__ volatile float blk_vals[MAX_BLOCKS];
__device__ volatile int blk_idxs[MAX_BLOCKS];
__device__ int blk_num;
//template <typename T>
__global__ void max_idx_kernel(const float *data, const int dsize, int *result){
__shared__ volatile float vals[nTPB];
__shared__ volatile int idxs[nTPB];
__shared__ volatile int last_block;
int idx = threadIdx.x+blockDim.x*blockIdx.x;
last_block = 0;
float my_val = FLOAT_MIN;
int my_idx = -1;
// sweep from global memory
while (idx < dsize){
if (data[idx] > my_val) {my_val = data[idx]; my_idx = idx;}
idx += blockDim.x*gridDim.x;}
// populate shared memory
vals[threadIdx.x] = my_val;
idxs[threadIdx.x] = my_idx;
__syncthreads();
// sweep in shared memory
for (int i = (nTPB>>1); i > 0; i>>=1){
if (threadIdx.x < i)
if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }
__syncthreads();}
// perform block-level reduction
if (!threadIdx.x){
blk_vals[blockIdx.x] = vals[0];
blk_idxs[blockIdx.x] = idxs[0];
if (atomicAdd(&blk_num, 1) == gridDim.x - 1) // then I am the last block
last_block = 1;}
__syncthreads();
if (last_block){
idx = threadIdx.x;
my_val = FLOAT_MIN;
my_idx = -1;
while (idx < gridDim.x){
if (blk_vals[idx] > my_val) {my_val = blk_vals[idx]; my_idx = blk_idxs[idx]; }
idx += blockDim.x;}
// populate shared memory
vals[threadIdx.x] = my_val;
idxs[threadIdx.x] = my_idx;
__syncthreads();
// sweep in shared memory
for (int i = (nTPB>>1); i > 0; i>>=1){
if (threadIdx.x < i)
if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }
__syncthreads();}
if (!threadIdx.x)
*result = idxs[0];
}
}
int main(){
int nrElements = DSIZE;
float *d_vector, *h_vector;
StopWatchInterface *hTimer = NULL;
sdkCreateTimer(&hTimer);
double gpuTime;
int k;
int max_index;
int *d_max_index;
cudaMalloc(&d_max_index, sizeof(int));
h_vector = new float[DSIZE];
cublasHandle_t my_handle;
cublasStatus_t my_status = cublasCreate(&my_handle);
cudaMalloc(&d_vector, DSIZE*sizeof(float));
for(k=0; k < 5; k++){
for (int i = 0; i < DSIZE; i++) h_vector[i] = rand()/(float)RAND_MAX;
h_vector[10+k] = 10; // create definite max element that changes with each loop iteration
cudaMemcpy(d_vector, h_vector, DSIZE*sizeof(float), cudaMemcpyHostToDevice);
max_index = 0;
sdkResetTimer(&hTimer);
sdkStartTimer(&hTimer);
//d_vector is a pointer on the device pointing to the beginning of the vector, containing nrElements floats.
thrust::device_ptr<float> d_ptr = thrust::device_pointer_cast(d_vector);
thrust::device_vector<float>::iterator d_it = thrust::max_element(d_ptr, d_ptr + nrElements);
max_index = d_it - (thrust::device_vector<float>::iterator)d_ptr;
cudaDeviceSynchronize();
gpuTime = sdkGetTimerValue(&hTimer);
std::cout << "loop: " << k << " thrust time: " << gpuTime << " max index: " << max_index << std::endl;
max_index = 0;
sdkResetTimer(&hTimer);
sdkStartTimer(&hTimer);
my_status = cublasIsamax(my_handle, DSIZE, d_vector, 1, &max_index);
cudaDeviceSynchronize();
gpuTime = sdkGetTimerValue(&hTimer);
std::cout << "loop: " << k << " cublas time: " << gpuTime << " max index: " << max_index-1 << std::endl;
max_index = 0;
sdkResetTimer(&hTimer);
sdkStartTimer(&hTimer);
cudaMemcpyToSymbol(blk_num, &max_index, sizeof(int));
max_idx_kernel<<<MIN(MAX_KERNEL_BLOCKS, ((DSIZE+nTPB-1)/nTPB)), nTPB>>>(d_vector, DSIZE, d_max_index);
cudaMemcpy(&max_index, d_max_index, sizeof(int), cudaMemcpyDeviceToHost);
gpuTime = sdkGetTimerValue(&hTimer);
std::cout << "loop: " << k << " idx kern time: " << gpuTime << " max index: " << max_index << std::endl;
std::cout << std::endl;
} // end for loop on k
cudaFree(d_max_index);
cudaFree(d_vector);
return 0;
}
$ nvcc -I/usr/local/cuda/samples/common/inc t1183.cu -o t1183 -lcublas
$ cuda-memcheck ./t1183
========= CUDA-MEMCHECK
loop: 0 thrust time: 2.806 max index: 10
loop: 0 cublas time: 0.441 max index: 10
loop: 0 idx kern time: 0.395 max index: 10
loop: 1 thrust time: 1.298 max index: 11
loop: 1 cublas time: 0.419 max index: 11
loop: 1 idx kern time: 0.424 max index: 11
loop: 2 thrust time: 1.303 max index: 12
loop: 2 cublas time: 0.43 max index: 12
loop: 2 idx kern time: 0.419 max index: 12
loop: 3 thrust time: 1.291 max index: 13
loop: 3 cublas time: 0.423 max index: 13
loop: 3 idx kern time: 0.415 max index: 13
loop: 4 thrust time: 1.299 max index: 14
loop: 4 cublas time: 0.423 max index: 14
loop: 4 idx kern time: 0.417 max index: 14
========= ERROR SUMMARY: 0 errors
$

Char Function, number to letter grades

I am relatively new to c++ programming and I am struggling with my code. The objective of this code is to take scores input by the user and calculate the mean, the standard deviation and converting it to a letter grade using the calculations under char gradeFunction. When i try to debug this program using visual studios 2013, i am having a couple problems with the the gradefunction. Again i am new to programming so troubleshooting errors is very hard for me and I would appreciate any help or advice! The program looks like this so far.
#include <iostream>
#include <iomanip>
#include <cmath>
#include <string.h>
#include <string>
using namespace std;
void printArray(int Array[], int count);
double average(double scoreTotal, int count);
double stddev(int Array[], int count, double mean);
char gradeFunction(int scores, double stddev, double mean);
int main()
{
int scores[8];
int count;
double scoreTotal = 0;
int standarddev[8];
double mean;
cout << "Enter scores seperated by blanks:" " ";
for (count = 0; count <= 7; count++)
{
cin >> scores[count];
scoreTotal += scores[count];
mean = scoreTotal / 8;
}
cout << endl;
cout << "Grade Scores by Student" << endl;
cout << "Score" "\t" "Grade" << endl;
cout << "----------------------------------" << endl;
printArray(scores, 8);
cout << gradeFunction(scores, stddev, mean);
cout << endl;
cout << "The mean is" " "<< fixed << setprecision(1) << average(scoreTotal, count) << endl;
cout << "The standard deviation is" " " << stddev(scores, count, mean) << endl;
cout << endl;
system("pause");
return 0;
}
void printArray(int Array[], int count)
{
for (int x = 0; x < count; x++)
{
cout << fixed << setprecision(1) << Array[x] << endl;
}
}
char gradeFunction(int scores, double stddev, double mean)
{
char F, D, C, B, A;
if (scores <= (mean - (1.5 * stddev)))
return 'F';
else if (scores <= (mean - (.5 * stddev)))
return 'D';
else if (scores <= (mean + (.5 * stddev)))
return 'C';
else if (scores <= (mean + (1.5 * stddev)))
return 'B';
else return 'A';
}
double average(double scoreTotal, int count)
{
return scoreTotal / count;
}
double stddev(int Array[], int count , double mean)
{
double stddev;
double sum2 = 0;
for (int i = 0; i < count; i++)
{
sum2 += pow((Array[i] - mean), 2);
}
stddev = sqrt(sum2 / (count - 1));
return stddev;
}
The error messages this leaves me with are...
3 IntelliSense: argument of type "double (*)(int *Array, int count, double mean)" is incompatible with parameter of type "double"
Error 1 error C2664: 'char gradeFunction(int [],double,double)' : cannot convert argument 2 from 'double (__cdecl *)(int [],int,double)' to 'double'

CUDA median calculation through reduction [closed]

This question is unlikely to help any future visitors; it is only relevant to a small geographic area, a specific moment in time, or an extraordinarily narrow situation that is not generally applicable to the worldwide audience of the internet. For help making this question more broadly applicable, visit the help center.
Closed 9 years ago.
I'm probably doing something incredibly stupid, but I can't seem to make this reduction work (there is probably a library that does this already, but this is for self-learning, so please bear with me). I'm trying to find the median of an array of integer entries by taking the median of medians approach, which I've coded below:
__global__ void gpuMedOdd(int *entries, int *med) {
extern __shared__ int sdata[];
int tid = threadIdx.x;
int i = blockIdx.x * blockDim.x + threadIdx.x;
sdata[tid] = entries[i];
__syncthreads();
for(int s = blockDim.x / 3; s > 0; s /= 3) {
if(tid < s) {
int list[3];
list[0] = sdata[tid], list[1] = sdata[tid + s], list[2] = sdata[tid + 2 * s];
if(list[1] < list[0])
swapGpu(list[1], list[0]);
if(list[2] < list[0])
swapGpu(list[2], list[0]);
if(list[2] < list[1])
swapGpu(list[2], list[1]);
sdata[tid] = list[1];
}
__syncthreads();
}
*med = sdata[0];
}
I invoke this kernel function as:
gpuMedOdd<<<9, numEntries / 9>>>(d_entries, d_med);
I then copy the value in d_med over into med and print out that value. Unfortunately, this value is always 0, regardless of input. What am I doing wrong?
Edit: I forgot to mention, swapGpu(a, b) is defined as below:
__device__ inline void swapGpu(int a, int b) {
int dum = a;
a = b;
b = dum;
}
Edit2: As suggested below, here is the entirety of the code.
#include <iostream>
#include <fstream>
#include <cstdlib>
#define checkCudaErrors(err) __checkCudaErrors(err, __FILE__, __LINE__)
#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__)
inline void __checkCudaErrors(cudaError err, const char *file, const int line) {
if(cudaSuccess != err) {
std::cout << file << "(" << line << ") : CUDA Runtime API error " << (int) err << ": " << cudaGetErrorString(err) << std::endl;
exit(3);
}
}
inline void __getLastCudaError(const char *errorMsg, const char *file, const int line) {
cudaError_t err = cudaGetLastError();
if(cudaSuccess != err) {
std::cout << file << "(" << line << ") : getLastCudaError() CUDA error : " << errorMsg << " : (" << (int) err << ") " << cudaGetErrorString(err) << std::endl;
exit(3);
}
}
int cpuMin(int *entries, int numEntries) {
int minVal = entries[0];
for(int i = 1; i < numEntries; i++)
if(entries[i] < minVal)
minVal = entries[i];
return minVal;
}
int cpuMax(int *entries, int numEntries) {
int maxVal = entries[0];
for(int i = 1; i < numEntries; i++)
if(entries[i] > maxVal)
maxVal = entries[i];
return maxVal;
}
inline void swap(int a, int b) {
int dum = a;
a = b;
b = dum;
}
__device__ inline void swapGpu(int a, int b) {
int dum = a;
a = b;
b = dum;
}
__global__ void gpuMedOdd(int *entries, int *med, int numEntries) {
extern __shared__ int sdata[];
int tid = threadIdx.x;
int i = blockIdx.x * (blockDim.x * 3) + threadIdx.x;
if(i + 2 * blockDim.x < numEntries) {
int list[3];
list[0] = entries[i], list[1] = entries[i + blockDim.x], list[2] = entries[i + 2 * blockDim.x];
if(list[1] < list[0])
swapGpu(list[1], list[0]);
if(list[2] < list[0])
swapGpu(list[2], list[0]);
if(list[2] < list[1])
swapGpu(list[2], list[1]);
sdata[tid] = list[1];
}
__syncthreads();
for(int s = blockDim.x / 3; s > 0; s /= 3) {
if(tid < s && tid + 2 * s < blockDim.x) {
int list[3];
list[0] = sdata[tid], list[1] = sdata[tid + s], list[2] = sdata[tid + 2 * s];
if(list[1] < list[0])
swapGpu(list[1], list[0]);
if(list[2] < list[0])
swapGpu(list[2], list[0]);
if(list[2] < list[1])
swapGpu(list[2], list[1]);
sdata[tid] = list[1];
}
__syncthreads();
}
*med = sdata[0];
}
__global__ void gpuMin(int *entries, int *min, int numEntries) {
extern __shared__ int sdata[];
int tid = threadIdx.x;
int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
if(i + blockDim.x < numEntries)
sdata[tid] = (entries[i] < entries[i + blockDim.x]) ? entries[i] : entries[i + blockDim.x];
__syncthreads();
for(int s = blockDim.x / 2; s > 0; s >>= 1) {
if(tid < s)
sdata[tid] = (sdata[tid] < sdata[tid + s]) ? sdata[tid] : sdata[tid + s];
__syncthreads();
}
*min = sdata[0];
}
__global__ void gpuMax(int *entries, int *max, int numEntries) {
extern __shared__ int sdata[];
int tid = threadIdx.x;
int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
if(i + blockDim.x < numEntries)
sdata[tid] = (entries[i] > entries[i + blockDim.x]) ? entries[i] : entries[i + blockDim.x];
__syncthreads();
for(int s = blockDim.x / 2; s > 0; s >>= 1) {
if(tid < s)
sdata[tid] = (sdata[tid] > sdata[tid + s]) ? sdata[tid] : sdata[tid + s];
__syncthreads();
}
*max = sdata[0];
}
int partition(int *entries, int left, int right, int pivotIdx) {
int i, storeIdx = left, pivot = entries[pivotIdx];
swap(entries[pivotIdx], entries[right]);
for(i = left; i < right; i++)
if(entries[i] < pivot) {
swap(entries[i], entries[storeIdx]);
storeIdx++;
}
return storeIdx;
}
int cpuSelect(int *entries, int left, int right, int k) {
if(left == right)
return entries[left];
int pivotIdx = ((left + right) >> 2) + 1, pivotNewIdx, pivotDist;
pivotNewIdx = partition(entries, left, right, pivotIdx);
pivotDist = pivotNewIdx - left + 1;
if(pivotDist == k)
return entries[pivotNewIdx];
else if(k < pivotDist)
return cpuSelect(entries, left, pivotNewIdx - 1, k);
else
return cpuSelect(entries, pivotNewIdx + 1, right, k - pivotDist);
}
int main(int argc, char *argv[]) {
if(argc != 3) {
std::cout << "ERROR: Incorrect number of input arguments" << std::endl;
std::cout << "Proper usage: " << argv[0] << " fileName numEntries" << std::endl;
exit(1);
}
std::ifstream inp(argv[1]);
if(!inp.is_open()) {
std::cout << "ERROR: File I/O error" << std::endl;
std::cout << "Could not find file " << argv[1] << std::endl;
exit(2);
}
int numEntries = atoi(argv[2]), i = 0;
int *entries = new int[numEntries];
while(inp >> entries[i] && i < numEntries)
i++;
if(i < numEntries) {
std::cout << "ERROR: File I/O error" << std::endl;
std::cout << "Command-line input suggested " << numEntries << " entries, but only found " << i << " entries" << std::endl;
exit(2);
}
if(inp >> i) {
std::cout << "ERROR: File I/O error" << std::endl;
std::cout << "Command-line input suggested " << numEntries << " entries, but file contains more entries" << std::endl;
exit(2);
}
int min, max;
int *d_entries, *d_min, *d_max;
checkCudaErrors(cudaMalloc(&d_entries, sizeof(int) * numEntries));
checkCudaErrors(cudaMalloc(&d_min, sizeof(int)));
checkCudaErrors(cudaMalloc(&d_max, sizeof(int)));
checkCudaErrors(cudaMemcpy(d_entries, entries, sizeof(int) * numEntries, cudaMemcpyHostToDevice));
gpuMin<<<16, numEntries / 16, numEntries / 16 * sizeof(int)>>>(d_entries, d_min, numEntries);
getLastCudaError("kernel launch failure");
gpuMax<<<16, numEntries / 16, numEntries / 16 * sizeof(int)>>>(d_entries, d_max, numEntries);
getLastCudaError("kernel launch failure");
checkCudaErrors(cudaMemcpy(&min, d_min, sizeof(int), cudaMemcpyDeviceToHost));
checkCudaErrors(cudaMemcpy(&max, d_max, sizeof(int), cudaMemcpyDeviceToHost));
std::cout << "The minimum value is: " << min << std::endl;
std::cout << "The maximum value is: " << max << std::endl;
if(numEntries % 2) {
int med, *d_med;
checkCudaErrors(cudaMalloc(&d_med, sizeof(int)));
gpuMedOdd<<<16, numEntries / 16, 16 * sizeof(int)>>>(d_entries, d_med, numEntries);
getLastCudaError("kernel launch failure");
checkCudaErrors(cudaMemcpy(&med, d_med, sizeof(int), cudaMemcpyDeviceToHost));
std::cout << "The median value is: " << med << std::endl;
}
else {
int *d_med;
cudaMalloc(&d_med, sizeof(int));
gpuMedOdd<<<16, numEntries / 16>>>(d_entries, d_med, numEntries);
}
min = cpuMin(entries, numEntries);
max = cpuMax(entries, numEntries);
if(numEntries % 2) {
int median = cpuSelect(entries, 0, numEntries - 1, (numEntries - 1) / 2 + 1);
std::cout << "The median value is: " << median << std::endl;
}
else {
int med2 = cpuSelect(entries, 0, numEntries - 1, numEntries / 2);
int med1 = cpuSelect(entries, 0, numEntries - 1, numEntries / 2 + 1);
float median = 0.5 * (med1 + med2);
std::cout << "The median value is: " << median << std::endl;
}
std::cout << "The minimum value is: " << min << std::endl;
std::cout << "The maximum value is: " << max << std::endl;
exit(0);
}
One thing that jumps out is that your shared memory size isn't set; that is, you declare your shared memory to be
extern __shared__ int sdata[];
but when you invoke your kernel your launch parameters are
gpuMedOdd<<<9, numEntries / 9>>>(...)
If you're setting your __shared__ memory to be extern, then it's expecting to get the number of bytes for shared memory as the 3rd kernel launch parameter. you should instead have
gpuMedOdd<<<9, numEntries / 9, smem_in_bytes>>>(...)
where smem_in_bytes is the size of shared memory for the kernel. If you don't specify a size, it'll default to 0. Hence in your current code, your __shared__ memory array sdata will be 0 bytes long.
EDIT: here's the link to the relevant part of the CUDA Programming Guide:
http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#execution-configuration
A problem I see in your code is that you seem to have your launch parameters reversed:
gpuMedOdd<<<16, numEntries / 16, 16 * sizeof(int)>>>(d_entries, d_med, numEntries);
I think you intended:
gpuMedOdd<<< numEntries/16, 16, 16 * sizeof(int)>>>(d_entries, d_med, numEntries);
The first launch parameter is blocks per grid. The second launch parameter is threads per block. Here I'm assuming you wanted to launch 16 threads per block. If in fact your intent was to launch a fixed number of blocks (16) and have the threads per block vary based on input size, then I think this is not typical of good cuda coding, and it will blow up if your input size gets too large, because you will exceed the max threads per block limit. Also, since your shared memory allocation is fixed (64 bytes), I assume you had intended a fixed number of threads per block.
Another suggestion I have is that rather than just reporting "CUDA Runtime Error" you should parse the error code returned. Take a look at the example link I already mentioned.