thrust ignores long long - thrust

Thrust treats long long int as if it were long int. Here's a demo program:
#include <thrust/reduce.h>
#include <thrust/iterator/constant_iterator.h>
// Sums `n` copies of the constant 1 with thrust::reduce, twice, and prints
// each element count next to the sum that was obtained:
//   1. iterator value type `long long int`, count kept as `long long int`;
//   2. iterator value type `int`, count narrowed to `long int`.
void tryit(long long int n) {
    // Pass 1: long long values, long long count.
    thrust::constant_iterator<long long int> ones_ll(1LL);
    long long int total_ll = thrust::reduce(ones_ll, ones_ll + n);
    std::cout << "long long: " << n << ' ' << total_ll << std::endl;

    // Pass 2: int values, long count.
    long int count_l = n;
    thrust::constant_iterator<int> ones_i(1);
    long int total_l = thrust::reduce(ones_i, ones_i + count_l);
    std::cout << "long: " << count_l << ' ' << total_l << std::endl;
}
// Runs the demo for three element counts; the last one (10^10) does not fit
// in 32 bits.
int main() {
    const long long int counts[] = {1000000LL, 1000000000LL, 10000000000LL};
    for (long long int count : counts) {
        tryit(count);
    }
}
The output is:
long long: 1000000 1000000
long: 1000000 1000000
long long: 1000000000 1000000000
long: 1000000000 1000000000
long long: 10000000000 1410065408
long: 10000000000 1410065408
The first 1410065408 should be 10000000000.
I compiled it thus:
nvcc -arch=compute_30 -std=c++11 longlongb.cu -o longlongb

Related

Should I declare a double array with the GPU block number on the inner or outer dimension?

Should I declare a double array with the GPU block number on the inner or outer dimension?
E.g., should I do
int payload[LEN][BLOCKS];
or
int payload[BLOCKS][LEN];
where LEN is a very large number.
I plan to have each block traverse the double array, holding the block dimension constant and iterating over the LEN dimension.
Assuming you're going to access the data in a block-oriented manner, you want to do the latter. This is presumably because when you load the first element of the "len" dimension, you've already paid the cost for missing in the cache for the subsequent 7ish elements. In the first option, there's probably sharing of cache lines between GPU blocks, but the sharing is relatively limited and not as low level.
Indeed, the below code reports that the second option requires 0.481 seconds to execute, and the first option requires 0.979 seconds. Arranging the data with the block on the outer dimension is about twice as performant.
#include <cuda_runtime_api.h>
#include <cuda.h>
#include <string>
#include <chrono>
#include <iostream>
#define BLOCKS 80
#define LEN (1 << 20)
// Reports a failed CUDA runtime call on std::cerr and terminates the process
// with exit status 1. Does nothing when `err` is cudaSuccess.
//   file, line - source location of the checked call
//   statement  - text of the checked expression
//   err        - status returned by that expression
void CheckCudaErrorAux (const char *file, unsigned line, const char *statement, cudaError_t err) {
    if (err != cudaSuccess) {
        std::cerr << statement << " returned " << cudaGetErrorString(err)
                  << "(" << err << ") at " << file << ":" << line << std::endl;
        exit (1);
    }
}
#define CUDA_CHECK_RETURN(value) CheckCudaErrorAux(__FILE__,__LINE__, #value, value)
// Layout 1: the block index is the LAST (fastest-varying) dimension, so the
// values belonging to one block are BLOCKS ints apart in memory.
struct Data1 {
int payload[LEN][BLOCKS];
};
// Layout 2: the block index is the FIRST dimension, so the LEN values
// belonging to one block are contiguous in memory.
struct Data2 {
int payload[BLOCKS][LEN];
};
// Kernel for layout 1: every thread sums column `blockIdx.x` of
// data1->payload (LEN rows) and prints the total with device printf.
//   Launch layout: one block per column, gridDim.x <= BLOCKS. Every thread of
//   a block repeats the same work, so the benchmark launches 1 thread/block.
//   data1 must point to device-accessible memory.
__global__ void f1(Data1 * data1) {
    // Blocks beyond the array's column count would read out of bounds.
    if (blockIdx.x >= BLOCKS) {
        return;
    }
    // A 32-bit int accumulator overflows here (signed overflow is undefined
    // behaviour), so accumulate in long long instead.
    long long sum = 0;
    for (int i = 0; i < LEN; ++i) {
        sum += data1->payload[i][blockIdx.x];
    }
    printf("block %i has f1 sum %lld\n", blockIdx.x, sum);
}
// Kernel for layout 2: every thread sums row `blockIdx.x` of data2->payload
// (LEN contiguous ints) and prints the total with device printf.
//   Launch layout: one block per row, gridDim.x <= BLOCKS. Every thread of a
//   block repeats the same work, so the benchmark launches 1 thread/block.
//   data2 must point to device-accessible memory.
__global__ void f2(Data2 * data2) {
    // Blocks beyond the array's row count would read out of bounds.
    if (blockIdx.x >= BLOCKS) {
        return;
    }
    // A 32-bit int accumulator overflows here (signed overflow is undefined
    // behaviour), so accumulate in long long instead.
    long long sum = 0;
    for (int i = 0; i < LEN; ++i) {
        sum += data2->payload[blockIdx.x][i];
    }
    printf("block %i has f2 sum %lld\n", blockIdx.x, sum);
}
// Benchmark driver: fills both layouts with the same values (i * b) on the
// host, copies them to the device, then times one launch of f1 and one launch
// of f2 (BLOCKS blocks of 1 thread each) and prints both durations in seconds.
// Returns 0 on success; any CUDA failure terminates via CUDA_CHECK_RETURN.
int main() {
    // Each buffer holds LEN * BLOCKS ints, so both live on the heap.
    Data1 * data1 = (Data1 *) malloc(sizeof(Data1));
    Data2 * data2 = (Data2 *) malloc(sizeof(Data2));
    if (data1 == nullptr || data2 == nullptr) {
        std::cerr << "host allocation failed" << std::endl;
        free(data1);
        free(data2);
        return 1;
    }

    for (int i = 0; i < LEN; ++i) {
        for (int b = 0; b < BLOCKS; ++b) {
            data1->payload[i][b] = i * b;
            data2->payload[b][i] = i * b;
        }
    }

    Data1 * data1_on_gpu;
    CUDA_CHECK_RETURN(cudaMalloc(&data1_on_gpu, sizeof(Data1)));
    Data2 * data2_on_gpu;
    CUDA_CHECK_RETURN(cudaMalloc(&data2_on_gpu, sizeof(Data2)));

    // Each copy uses the size of its own struct type.
    CUDA_CHECK_RETURN(cudaMemcpy(data1_on_gpu, data1, sizeof(Data1), cudaMemcpyHostToDevice));
    CUDA_CHECK_RETURN(cudaMemcpy(data2_on_gpu, data2, sizeof(Data2), cudaMemcpyHostToDevice));
    CUDA_CHECK_RETURN(cudaDeviceSynchronize());

    // steady_clock is monotonic, which is what interval measurement needs.
    auto t1 = std::chrono::steady_clock::now();
    f1<<<BLOCKS,1>>>(data1_on_gpu);
    CUDA_CHECK_RETURN(cudaGetLastError());        // launch-configuration errors
    CUDA_CHECK_RETURN(cudaDeviceSynchronize());   // wait for completion / execution errors
    auto t2 = std::chrono::steady_clock::now();
    f2<<<BLOCKS,1>>>(data2_on_gpu);
    CUDA_CHECK_RETURN(cudaGetLastError());
    CUDA_CHECK_RETURN(cudaDeviceSynchronize());
    auto t3 = std::chrono::steady_clock::now();

    std::chrono::duration<double> duration_1_to_2 = t2 - t1;
    std::chrono::duration<double> duration_2_to_3 = t3 - t2;
    printf("timer for 1st took %.3lf\n", duration_1_to_2.count());
    printf("timer for 2nd took %.3lf\n", duration_2_to_3.count());

    CUDA_CHECK_RETURN(cudaFree(data1_on_gpu));
    CUDA_CHECK_RETURN(cudaFree(data2_on_gpu));
    free(data1);
    free(data2);
    return 0;
}

Passing multiple variables in a function?

I'm a first year CS student trying to understand functions in C++ better because I am weak in that area as of right now. I'm trying to create a program that will ask the user for two integers that will then be passed to a calculation function that will finally be passed to a display function to show the calculations. As of right now here is my code with the output at the bottom. I'm not really sure why the num1 and num2 aren't properly being passed to the calculation function? Any help is appreciated and please disregard the style, I usually try and clean it up after I get it to work.
#include <iostream>
#include <cmath>
#include <iomanip>
using namespace std;
void getData();
void doTheMath(int num1, int num2);
void displayResults(int num1, int num2, int& sum, int& diff, int& prod, int& quot, int& rem);
// Entry point of the question's program (the version that shows the problem).
// None of the locals below is ever assigned inside main().
int main()
{
int num1;
int num2;
int sum;
int diff;
int prod;
int quot;
int rem;
// getData() takes no arguments, so it cannot store anything in the locals above.
getData();
// num1/num2 are passed by value while still uninitialized; doTheMath() has no
// parameters through which it could hand results back.
doTheMath(num1, num2);
// Prints the still-uninitialized locals (the sample output shows -858993460
// for every value).
displayResults(num1, num2, sum, diff, prod, quot, rem);
system("pause");
return 0;
}
// Prompts for two integers, reads them from cin and echoes them back.
// The values are stored in this function's own locals, which are separate
// variables from main()'s num1/num2, so the caller never sees them.
void getData()
{
int num1;
int num2;
cout << "Please enter two integer values:\n";
cin >> num1;
cin >> num2;
cout << "The first number is " << num1
<< " and the second is "<< num2 << "\n\n";
}
// Computes sum, difference, product, quotient and remainder of num1 and num2.
// All five results are stored in locals of this function and are discarded
// when it returns; nothing is passed back to the caller.
// NOTE(review): the division and modulo are not guarded against num2 == 0.
void doTheMath(int num1, int num2)
{
int sum = num1 + num2;
int diff = num1 - num2;
int prod = num1 * num2;
int quot = num1 / num2;
int rem = num1 % num2;
}
// Writes the results for num1 and num2 to cout: sum, difference and product
// always; then either the quotient/remainder line or, when num2 is 0, a
// "Cannot divide by zero." line. quot and rem are not read when num2 is 0.
void displayResults(int num1, int num2, int& sum, int& diff, int& prod, int& quot, int& rem)
{
    // Lines shared by both cases.
    cout << "Here are the results:\n\n"
         << "The sum of " << num1 << " and " << num2 << " is " << sum << ".\n"
         << "The difference, (" << num1 << " minus " << num2 << ") is " << diff << ".\n"
         << "The product of " << num1 << " and " << num2 << " is " << prod << ".\n";

    if (num2 != 0)
    {
        cout << num1 << " divided by " << num2 << " is " << quot
             << " with a remainder of " << rem << ".\n\n";
        return;
    }
    cout << "Cannot divide by zero.\n\n";
}
//Output
/*Please enter two integer values:
12
0
The first number is 12 and the second is 0
Here are the results:
The sum of -858993460 and -858993460 is -858993460.
The difference, (-858993460 minus -858993460) is -858993460.
The product of -858993460 and -858993460 is -858993460.
-858993460 divided by -858993460 is -858993460 with a remainder of -858993460.
Press any key to continue . . .*/
The num1 and num2 variables in main() are different variables from the num1 and num2 in getData(). You set them in getData() but do nothing with them except display them, so the num1 and num2 in main() are never affected. Pass main()'s variables by reference to getData(int &num1, int &num2) and don't declare new ones inside getData() itself. The same applies to the results computed in doTheMath(): they are locals that vanish when the function returns. Read up on automatic (local) variables, which are created on the stack and exist only inside the function that declares them.
//==========================================================
/*Description:
This program is to showcase my understanding of
functions that I learned from our Lecture 7b. The user
is prompted to enter two integer values, where they
are then passed to a calculation function to calculate
the sum, difference, product, quotient, and remainder
of the two numbers entered. After all the values are
calculated, they are showcased in a display function
to the user in the output stream.*/
//==========================================================
#include <iostream>
#include <cmath>
#include <iomanip>
using namespace std;
void getData(int& num1, int& num2);
void doTheMath(int num1, int num2, int& sum, int& diff, int& prod, int& quot, int& rem);
void displayResults(int num1, int num2, int sum, int diff, int prod, int quot, int rem);
//====================== main ===========================
//
//=======================================================
int main()
{
    // Initialise every local so that no indeterminate value is ever passed by
    // value to displayResults, whatever getData/doTheMath choose to assign
    // (e.g. after a failed read, or when the divisor is zero).
    int num1 = 0;
    int num2 = 0;
    int sum = 0;
    int diff = 0;
    int prod = 0;
    int quot = 0;
    int rem = 0;
    //Gets two integers from user
    getData(num1, num2);
    //Does the calculation from integers received
    doTheMath(num1, num2, sum, diff, prod, quot, rem);
    //Displays calculated results from two integers
    displayResults(num1, num2, sum, diff, prod, quot, rem);
    system("pause");
    return 0;
}
/*===================== getData ==========================
  Prompts on the standard output stream and reads two
  integers from the standard input stream.
   Input:
     (none - both parameters are written, never read)
   Output:
     num1 - first integer read from the user
     num2 - second integer read from the user*/
//========================================================
void getData(int& num1, int& num2)
{
    cout << "Please enter two integer values:\n";
    cin >> num1 >> num2;
}
/*==================== doTheMath =========================
  Calculates the sum, difference, product, quotient and
  remainder of two integers and stores each result in the
  matching reference parameter. Integer division by zero
  is never attempted: when num2 is 0 the quotient and the
  remainder are both set to 0 so that every output has a
  defined value.
   Input:
     num1 - first operand
     num2 - second operand (divisor)
   Output:
     sum  - num1 + num2
     diff - num1 - num2
     prod - num1 * num2
     quot - num1 / num2, or 0 when num2 is 0
     rem  - num1 % num2, or 0 when num2 is 0*/
//========================================================
void doTheMath(int num1, int num2, int& sum, int& diff, int& prod, int& quot, int& rem)
{
    // These three do not depend on whether the divisor is zero.
    sum = num1 + num2;
    diff = num1 - num2;
    prod = num1 * num2;

    if (num2 != 0)
    {
        quot = num1 / num2;
        rem = num1 % num2;
    }
    else
    {
        // No quotient exists; give the outputs a defined placeholder value.
        quot = 0;
        rem = 0;
    }
}
/*================= displayResults ======================
  Writes the already-calculated results to the standard
  output stream. The sum, difference and product lines
  are always printed; the last line is either the
  quotient with its remainder or, when num2 is 0, a
  message saying that division by zero is not possible.
   Input:
     num1, num2 - the two integers entered by the user
     sum, diff, prod, quot, rem - the calculated results
     (quot and rem are not read when num2 is 0)
   Output:
     Text on the standard output stream only.*/
//========================================================
void displayResults(int num1, int num2, int sum, int diff, int prod, int quot, int rem)
{
    // Lines shared by both cases.
    cout << "Here are the results:\n\n"
         << "The sum of " << num1 << " and " << num2 << " is " << sum << ".\n"
         << "The difference, (" << num1 << " minus " << num2 << ") is " << diff << ".\n"
         << "The product of " << num1 << " and " << num2 << " is " << prod << ".\n";

    if (num2 != 0)
    {
        cout << num1 << " divided by " << num2 << " is " << quot
             << " with a remainder of " << rem << ".\n\n";
        return;
    }
    cout << "Cannot divide by zero.\n\n";
}
//==========================================================
/*OUTPUT (When num2 != 0):
Please enter two integer values:
12
3
Here are the results:
The sum of 12 and 3 is 15.
The difference, (12 minus 3) is 9.
The product of 12 and 3 is 36.
12 divided by 3 is 4 with a remainder of 0.
Press any key to continue . . .*/
//==========================================================
//==========================================================
/*OUTPUT (When num2 == 0):
Please enter two integer values:
12
0
Here are the results:
The sum of 12 and 0 is 12.
The difference, (12 minus 0) is 12.
The product of 12 and 0 is 0.
Cannot divide by zero.
Press any key to continue . . .*/
//==========================================================

float2 cufftcomplex to fftw_complex

I want to use the FFTW library in my code. I cast my float2 data to fftw_complex, but I get:
Segmentation fault
This is my code.
test.cu
typedef float2 cplx;
// Plans and executes an in-place 1-D backward FFT over dim[0]*dim[1] complex
// samples with FFTW, passes the buffer to DoSomething(), then releases the
// plan. Always returns 0.
// NOTE(review): cplx is float2 (a pair of floats), yet the buffer is
// reinterpreted as fftw_complex and handed to the fftw_ API, which works on
// double data. The accompanying text reports a segmentation fault for this
// version and resolves it by switching to the fftwf_ (float) API.
int DoFFT_Operation( cplx* DatafftOneSlice, float* out, int *dim)
{
cout << "DO CPU FFT RSS Operation" << endl;
int xdim = dim[0];
int ydim = dim[1];
cout << "XDIM " << std::to_string(xdim) << " YDIM " << std::to_string(ydim) << endl;
// int slicedim = dim[2];
int bitdim = 1;
// int sizeOneSlice = xdim*ydim*bitdim;
int sizeOneImage = xdim*ydim;
//FFTW PLAN
fftw_plan pfftw;
// Input and output pointers are the same buffer, i.e. an in-place transform.
pfftw = fftw_plan_dft_1d(sizeOneImage, reinterpret_cast<fftw_complex*>(DatafftOneSlice), reinterpret_cast<fftw_complex*>(DatafftOneSlice), FFTW_BACKWARD, FFTW_ESTIMATE);
fftw_execute(pfftw);
// fft_it(DatafftOneSlice, sizeOneImage);
// cplx* input, float* out, int N, int x, int y, int bit
DoSomething(DatafftOneSlice, out, sizeOneImage, xdim, ydim, bitdim);
fftw_destroy_plan(pfftw);
fftw_cleanup();
return 0;
}
How do I cast float2 (CUDA) to fftw_complex?
I have solved my problem.
I just changed fftw_ to fftwf_, because the fftw_ functions operate on double-precision data while the fftwf_ functions operate on single-precision (float) data, which is what float2 holds.
test.cu
typedef float2 cplx;
// Runs an in-place 1-D backward FFT over dim[0]*dim[1] single-precision
// complex samples using the fftwf_ (float) API, then passes the transformed
// buffer to DoSomething().
//   DatafftOneSlice - complex samples (cplx is float2); overwritten in place
//   out             - forwarded unchanged to DoSomething()
//   dim             - dim[0] is read as xdim and dim[1] as ydim
// Returns 0 on success, 1 when the size is not positive or no plan could be
// created (in which case nothing is transformed and DoSomething() is not called).
int DoFFT_Operation( cplx* DatafftOneSlice, float* out, int *dim)
{
    cout << "DO CPU FFT RSS Operation" << endl;
    int xdim = dim[0];
    int ydim = dim[1];
    cout << "XDIM " << std::to_string(xdim) << " YDIM " << std::to_string(ydim) << endl;
    int bitdim = 1;
    int sizeOneImage = xdim*ydim;
    if (sizeOneImage <= 0)
    {
        cerr << "DoFFT_Operation: invalid transform size " << sizeOneImage << endl;
        return 1;
    }

    // float2 and fftwf_complex are both two consecutive floats, which is what
    // makes this reinterpretation usable with the single-precision API.
    fftwf_complex* samples = reinterpret_cast<fftwf_complex*>(DatafftOneSlice);

    //FFTW PLAN (same pointer for input and output: in-place transform)
    fftwf_plan pfftw = fftwf_plan_dft_1d(sizeOneImage, samples, samples, FFTW_BACKWARD, FFTW_ESTIMATE);
    if (pfftw == nullptr)
    {
        // The planner returns a null plan on failure; executing it would crash.
        cerr << "DoFFT_Operation: fftwf_plan_dft_1d failed for size " << sizeOneImage << endl;
        return 1;
    }
    fftwf_execute(pfftw);

    // cplx* input, float* out, int N, int x, int y, int bit
    DoSomething(DatafftOneSlice, out, sizeOneImage, xdim, ydim, bitdim);

    fftwf_destroy_plan(pfftw);
    fftwf_cleanup();
    return 0;
}

GPU reduction code only runs one time

I have been using the code sample supplied by Robert Crovella:
thrust::max_element slow in comparison cublasIsamax - More efficient implementation?
Which is a very fast reduction code. I modified it to also return the index of the max in the input array of floats. When I use it in my code, it will only execute one time. If I try calling the routine again it does not find a new max value, it just returns the previous max. Is there something about the volatile global memory that the routine uses that needs to be reset before it can be called again?
#include <cuda.h>
#include <cublas_v2.h>
#include <thrust/extrema.h>
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <stdio.h>
#include <stdlib.h>
#define DSIZE 4096*4 // nTPB should be a power-of-2
#define nTPB 512
#define MAX_KERNEL_BLOCKS 30
#define MAX_BLOCKS ((DSIZE/nTPB)+1)
#define MIN(a,b) ((a>b)?b:a)
#define FLOAT_MIN -1.0f
#include <helper_functions.h>
#include <helper_cuda.h>
// this code has been modified to return the index of the max instead of the actual max value - for my application
__device__ volatile float blk_vals[MAX_BLOCKS];
__device__ volatile int blk_idxs[MAX_BLOCKS];
__device__ int blk_num = 0;
//template <typename T>
// Finds the index of the largest element of data[0..dsize) and writes it to
// *result. Elements not greater than FLOAT_MIN are never selected; if none
// qualifies, -1 is written.
// Launch requirements visible in the code: blockDim.x must equal nTPB (the
// shared arrays and the reduction loops are sized by nTPB) and gridDim.x must
// not exceed MAX_BLOCKS (size of blk_vals/blk_idxs).
// Each block reduces its slice, publishes one candidate to blk_vals/blk_idxs,
// and the block that observes the final count in blk_num reduces those
// candidates.
// NOTE: blk_num is only ever incremented here and is never reset; the
// accompanying answer identifies this as the reason a second launch returns a
// stale result.
// NOTE(review): there is no __threadfence() between the blk_vals/blk_idxs
// stores and the atomicAdd - confirm the last block is guaranteed to see them.
__global__ void max_idx_kernel(const float *data, const int dsize, int *result){
__shared__ volatile float vals[nTPB];
__shared__ volatile int idxs[nTPB];
__shared__ volatile int last_block;
int idx = threadIdx.x+blockDim.x*blockIdx.x;
// Every thread stores the same 0; the __syncthreads() calls below order this
// before thread 0 may set the flag to 1.
last_block = 0;
float my_val = FLOAT_MIN;
int my_idx = -1;
// sweep from global memory
while (idx < dsize){
if (data[idx] > my_val) {my_val = data[idx]; my_idx = idx;}
idx += blockDim.x*gridDim.x;}
// populate shared memory
vals[threadIdx.x] = my_val;
idxs[threadIdx.x] = my_idx;
__syncthreads();
// sweep in shared memory
for (int i = (nTPB>>1); i > 0; i>>=1){
if (threadIdx.x < i)
if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }
__syncthreads();}
// perform block-level reduction
if (!threadIdx.x){
blk_vals[blockIdx.x] = vals[0];
blk_idxs[blockIdx.x] = idxs[0];
if (atomicAdd(&blk_num, 1) == gridDim.x - 1) // then I am the last block
last_block = 1;}
__syncthreads();
// last_block has the same value for every thread of the block, so the
// __syncthreads() calls inside this branch are reached by all of them or none.
if (last_block){
idx = threadIdx.x;
my_val = FLOAT_MIN;
my_idx = -1;
while (idx < gridDim.x){
if (blk_vals[idx] > my_val) {my_val = blk_vals[idx]; my_idx = blk_idxs[idx]; }
idx += blockDim.x;}
// populate shared memory
vals[threadIdx.x] = my_val;
idxs[threadIdx.x] = my_idx;
__syncthreads();
// sweep in shared memory
for (int i = (nTPB>>1); i > 0; i>>=1){
if (threadIdx.x < i)
if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }
__syncthreads();}
if (!threadIdx.x)
*result = idxs[0];
}
}
// Test driver from the question: for five iterations it fills a host vector
// with random values, plants a known maximum at index 10+k, and then finds the
// index of the maximum three ways (thrust::max_element, cublasIsamax and
// max_idx_kernel), timing and printing each.
// NOTE: the accompanying answer points out that blk_num is not reset between
// kernel launches, and that the cuBLAS handle and d_vector are re-created on
// every loop iteration (a leak and a needless re-initialisation).
int main(){
int nrElements = DSIZE;
float *d_vector, *h_vector;
StopWatchInterface *hTimer = NULL;
sdkCreateTimer(&hTimer);
double gpuTime;
int k;
int max_index;
int *d_max_index;
cudaMalloc(&d_max_index, sizeof(int));
h_vector = new float[DSIZE];
for(k=0; k < 5; k++){
for (int i = 0; i < DSIZE; i++) h_vector[i] = rand()/(float)RAND_MAX;
h_vector[10+k] = 10; // create definite max element that changes with each loop iteration
// Created inside the loop: a new handle and a new device buffer per iteration.
cublasHandle_t my_handle;
cublasStatus_t my_status = cublasCreate(&my_handle);
cudaMalloc(&d_vector, DSIZE*sizeof(float));
cudaMemcpy(d_vector, h_vector, DSIZE*sizeof(float), cudaMemcpyHostToDevice);
max_index = 0;
sdkResetTimer(&hTimer);
sdkStartTimer(&hTimer);
//d_vector is a pointer on the device pointing to the beginning of the vector, containing nrElements floats.
thrust::device_ptr<float> d_ptr = thrust::device_pointer_cast(d_vector);
thrust::device_vector<float>::iterator d_it = thrust::max_element(d_ptr, d_ptr + nrElements);
max_index = d_it - (thrust::device_vector<float>::iterator)d_ptr;
cudaDeviceSynchronize();
gpuTime = sdkGetTimerValue(&hTimer);
std::cout << "loop: " << k << " thrust time: " << gpuTime << " max index: " << max_index << std::endl;
max_index = 0;
sdkResetTimer(&hTimer);
sdkStartTimer(&hTimer);
my_status = cublasIsamax(my_handle, DSIZE, d_vector, 1, &max_index);
cudaDeviceSynchronize();
gpuTime = sdkGetTimerValue(&hTimer);
// 1 is subtracted from the cuBLAS result before printing so it lines up with the other two.
std::cout << "loop: " << k << " cublas time: " << gpuTime << " max index: " << max_index-1 << std::endl;
max_index = 0;
sdkResetTimer(&hTimer);
sdkStartTimer(&hTimer);
// The grid is capped at MAX_KERNEL_BLOCKS blocks of nTPB threads.
max_idx_kernel<<<MIN(MAX_KERNEL_BLOCKS, ((DSIZE+nTPB-1)/nTPB)), nTPB>>>(d_vector, DSIZE, d_max_index);
cudaMemcpy(&max_index, d_max_index, sizeof(int), cudaMemcpyDeviceToHost);
gpuTime = sdkGetTimerValue(&hTimer);
std::cout << "loop: " << k << " idx kern time: " << gpuTime << " max index: " << max_index << std::endl;
std::cout << std::endl;
} // end for loop on k
cudaFree(d_max_index);
cudaFree(d_vector);
return 0;
}
The primary issue in re-using this code for multiple loops as-is is in this static initialization of a device (global) variable:
__device__ int blk_num = 0;
That's OK if you're only going to run the routine once. But if you intend to re-use it, you will need to re-initialize this variable to zero before each call to the kernel.
We could fix this by putting an explicit initialization of this variable to zero before each call to the reduction kernel:
cudaMemcpyToSymbol(blk_num, &max_index, sizeof(int));
(I'm using max_index here simply because it is a convenient host int variable that has just been set to zero.)
That's the only change needed to get the code "working".
However the introduction of the loop has created some other "issues" that I would point out. These 3 lines of code:
cublasHandle_t my_handle;
cublasStatus_t my_status = cublasCreate(&my_handle);
cudaMalloc(&d_vector, DSIZE*sizeof(float));
don't belong inside the for-loop on k. That is effectively creating a memory leak and unnecessarily re-initializing the cublas library.
The following code has those changes and seems to work for me:
$ cat t1183.cu
#include <cuda.h>
#include <cublas_v2.h>
#include <thrust/extrema.h>
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <stdio.h>
#include <stdlib.h>
#define DSIZE 4096*4 // nTPB should be a power-of-2
#define nTPB 512
#define MAX_KERNEL_BLOCKS 30
#define MAX_BLOCKS ((DSIZE/nTPB)+1)
#define MIN(a,b) ((a>b)?b:a)
#define FLOAT_MIN -1.0f
#include <helper_functions.h>
#include <helper_cuda.h>
// this code has been modified to return the index of the max instead of the actual max value - for my application
__device__ volatile float blk_vals[MAX_BLOCKS];
__device__ volatile int blk_idxs[MAX_BLOCKS];
__device__ int blk_num;
//template <typename T>
// Finds the index of the largest element of data[0..dsize) and writes it to
// *result. Elements not greater than FLOAT_MIN are never selected; if none
// qualifies, -1 is written.
//
// Launch requirements:
//   * blockDim.x == nTPB (shared arrays and reduction loops are sized by nTPB,
//     which must be a power of two);
//   * gridDim.x <= MAX_BLOCKS (size of blk_vals / blk_idxs);
//   * blk_num must be 0 when the kernel starts. The kernel resets it to 0
//     before finishing, so back-to-back launches on the same stream need no
//     host-side reset (a host-side reset remains harmless).
//
// Algorithm: every thread scans a strided slice of the input, each block
// reduces its threads' candidates in shared memory and publishes one candidate
// to global memory; the block that increments blk_num last reduces the
// published candidates and stores the final index.
__global__ void max_idx_kernel(const float *data, const int dsize, int *result){
  __shared__ volatile float vals[nTPB];
  __shared__ volatile int idxs[nTPB];
  __shared__ volatile int last_block;
  int idx = threadIdx.x+blockDim.x*blockIdx.x;
  // Every thread stores the same 0; the barriers below order this store before
  // thread 0 may raise the flag.
  last_block = 0;
  float my_val = FLOAT_MIN;
  int my_idx = -1;
  // sweep from global memory
  while (idx < dsize){
    if (data[idx] > my_val) {my_val = data[idx]; my_idx = idx;}
    idx += blockDim.x*gridDim.x;}
  // populate shared memory
  vals[threadIdx.x] = my_val;
  idxs[threadIdx.x] = my_idx;
  __syncthreads();
  // sweep in shared memory
  for (int i = (nTPB>>1); i > 0; i>>=1){
    if (threadIdx.x < i)
      if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }
    __syncthreads();}
  // perform block-level reduction
  if (!threadIdx.x){
    blk_vals[blockIdx.x] = vals[0];
    blk_idxs[blockIdx.x] = idxs[0];
    // Make the two stores above visible device-wide before this block is
    // counted as finished; otherwise the last block could read stale partials.
    __threadfence();
    if (atomicAdd(&blk_num, 1) == gridDim.x - 1) // then I am the last block
      last_block = 1;}
  __syncthreads();
  // last_block is uniform across the block, so the barriers inside this branch
  // are reached by all of its threads or by none.
  if (last_block){
    idx = threadIdx.x;
    my_val = FLOAT_MIN;
    my_idx = -1;
    while (idx < gridDim.x){
      if (blk_vals[idx] > my_val) {my_val = blk_vals[idx]; my_idx = blk_idxs[idx]; }
      idx += blockDim.x;}
    // populate shared memory
    vals[threadIdx.x] = my_val;
    idxs[threadIdx.x] = my_idx;
    __syncthreads();
    // sweep in shared memory
    for (int i = (nTPB>>1); i > 0; i>>=1){
      if (threadIdx.x < i)
        if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }
      __syncthreads();}
    if (!threadIdx.x){
      *result = idxs[0];
      // Every other block has already performed its atomicAdd, so nobody else
      // touches the counter any more: re-arm it for the next launch.
      blk_num = 0;}
  }
}
// Test driver: for five iterations, fills a host vector with random values in
// [0, 1], plants a known maximum (10) at index 10+k, copies it to the device
// and finds the index of the maximum three ways - thrust::max_element,
// cublasIsamax and max_idx_kernel - printing the elapsed time and the index
// found by each. Returns 0 on success, 1 if a cuBLAS call fails; CUDA runtime
// failures are reported and terminate via checkCudaErrors (helper_cuda.h).
int main(){
  const int nrElements = DSIZE;
  const int zero = 0;               // source value for resetting blk_num
  float *d_vector = NULL, *h_vector = NULL;
  StopWatchInterface *hTimer = NULL;
  sdkCreateTimer(&hTimer);
  double gpuTime;
  int max_index;
  int *d_max_index = NULL;
  checkCudaErrors(cudaMalloc(&d_max_index, sizeof(int)));
  h_vector = new float[DSIZE];

  // Created once and reused by every iteration.
  cublasHandle_t my_handle;
  cublasStatus_t my_status = cublasCreate(&my_handle);
  if (my_status != CUBLAS_STATUS_SUCCESS){
    fprintf(stderr, "cublasCreate failed with status %d\n", (int)my_status);
    return 1;}
  checkCudaErrors(cudaMalloc(&d_vector, DSIZE*sizeof(float)));

  for (int k = 0; k < 5; k++){
    for (int i = 0; i < DSIZE; i++) h_vector[i] = rand()/(float)RAND_MAX;
    h_vector[10+k] = 10; // create definite max element that changes with each loop iteration
    checkCudaErrors(cudaMemcpy(d_vector, h_vector, DSIZE*sizeof(float), cudaMemcpyHostToDevice));

    // --- thrust ---
    max_index = 0;
    sdkResetTimer(&hTimer);
    sdkStartTimer(&hTimer);
    //d_vector is a pointer on the device pointing to the beginning of the vector, containing nrElements floats.
    thrust::device_ptr<float> d_ptr = thrust::device_pointer_cast(d_vector);
    thrust::device_ptr<float> d_it = thrust::max_element(d_ptr, d_ptr + nrElements);
    max_index = static_cast<int>(d_it - d_ptr);
    checkCudaErrors(cudaDeviceSynchronize());
    gpuTime = sdkGetTimerValue(&hTimer);
    std::cout << "loop: " << k << " thrust time: " << gpuTime << " max index: " << max_index << std::endl;

    // --- cuBLAS ---
    max_index = 0;
    sdkResetTimer(&hTimer);
    sdkStartTimer(&hTimer);
    my_status = cublasIsamax(my_handle, DSIZE, d_vector, 1, &max_index);
    if (my_status != CUBLAS_STATUS_SUCCESS){
      fprintf(stderr, "cublasIsamax failed with status %d\n", (int)my_status);
      return 1;}
    checkCudaErrors(cudaDeviceSynchronize());
    gpuTime = sdkGetTimerValue(&hTimer);
    // cuBLAS reports a 1-based index, hence the -1 when printing.
    std::cout << "loop: " << k << " cublas time: " << gpuTime << " max index: " << max_index-1 << std::endl;

    // --- custom kernel ---
    max_index = 0;
    sdkResetTimer(&hTimer);
    sdkStartTimer(&hTimer);
    // The block counter must be zero before every launch.
    checkCudaErrors(cudaMemcpyToSymbol(blk_num, &zero, sizeof(int)));
    max_idx_kernel<<<MIN(MAX_KERNEL_BLOCKS, ((DSIZE+nTPB-1)/nTPB)), nTPB>>>(d_vector, DSIZE, d_max_index);
    checkCudaErrors(cudaGetLastError());   // launch-configuration errors
    // Blocking copy: also waits for the kernel and surfaces execution errors.
    checkCudaErrors(cudaMemcpy(&max_index, d_max_index, sizeof(int), cudaMemcpyDeviceToHost));
    gpuTime = sdkGetTimerValue(&hTimer);
    std::cout << "loop: " << k << " idx kern time: " << gpuTime << " max index: " << max_index << std::endl;
    std::cout << std::endl;
  } // end for loop on k

  // Release everything that was acquired above.
  cublasDestroy(my_handle);
  sdkDeleteTimer(&hTimer);
  delete[] h_vector;
  checkCudaErrors(cudaFree(d_max_index));
  checkCudaErrors(cudaFree(d_vector));
  return 0;
}
$ nvcc -I/usr/local/cuda/samples/common/inc t1183.cu -o t1183 -lcublas
$ cuda-memcheck ./t1183
========= CUDA-MEMCHECK
loop: 0 thrust time: 2.806 max index: 10
loop: 0 cublas time: 0.441 max index: 10
loop: 0 idx kern time: 0.395 max index: 10
loop: 1 thrust time: 1.298 max index: 11
loop: 1 cublas time: 0.419 max index: 11
loop: 1 idx kern time: 0.424 max index: 11
loop: 2 thrust time: 1.303 max index: 12
loop: 2 cublas time: 0.43 max index: 12
loop: 2 idx kern time: 0.419 max index: 12
loop: 3 thrust time: 1.291 max index: 13
loop: 3 cublas time: 0.423 max index: 13
loop: 3 idx kern time: 0.415 max index: 13
loop: 4 thrust time: 1.299 max index: 14
loop: 4 cublas time: 0.423 max index: 14
loop: 4 idx kern time: 0.417 max index: 14
========= ERROR SUMMARY: 0 errors
$

Char Function, number to letter grades

I am relatively new to C++ programming and I am struggling with my code. The objective of this code is to take scores input by the user, calculate the mean and the standard deviation, and convert each score to a letter grade using the calculations in char gradeFunction. When I try to debug this program using Visual Studio 2013, I run into a couple of problems with gradeFunction. Again, I am new to programming, so troubleshooting errors is very hard for me and I would appreciate any help or advice! The program looks like this so far.
#include <iostream>
#include <iomanip>
#include <cmath>
#include <string.h>
#include <string>
using namespace std;
void printArray(int Array[], int count);
double average(double scoreTotal, int count);
double stddev(int Array[], int count, double mean);
char gradeFunction(int scores, double stddev, double mean);
// Entry point of the question's program: reads 8 scores, prints them, and is
// meant to print a grade, the mean and the standard deviation.
// NOTE: this version does not compile - see the gradeFunction call below and
// the error messages quoted after the listing.
int main()
{
int scores[8];
int count;
double scoreTotal = 0;
// Declared but never used in this function.
int standarddev[8];
double mean;
cout << "Enter scores seperated by blanks:" " ";
for (count = 0; count <= 7; count++)
{
cin >> scores[count];
scoreTotal += scores[count];
// Recomputed on every pass; only the value from the last pass is the mean of all 8 scores.
mean = scoreTotal / 8;
}
cout << endl;
cout << "Grade Scores by Student" << endl;
cout << "Score" "\t" "Grade" << endl;
cout << "----------------------------------" << endl;
printArray(scores, 8);
// Source of the reported errors: `scores` is the whole array and `stddev` is
// the name of a function, not a double holding its result.
cout << gradeFunction(scores, stddev, mean);
cout << endl;
// count is 8 here (the loop above ended when count reached 8).
cout << "The mean is" " "<< fixed << setprecision(1) << average(scoreTotal, count) << endl;
cout << "The standard deviation is" " " << stddev(scores, count, mean) << endl;
cout << endl;
system("pause");
return 0;
}
// Writes the first `count` elements of Array to cout, one per line.
// Nothing is written (and cout's format flags are left untouched) when
// count is zero or negative.
void printArray(int Array[], int count)
{
    const int* cursor = Array;
    int remaining = count;
    while (remaining-- > 0)
    {
        cout << fixed << setprecision(1) << *cursor++ << endl;
    }
}
// Converts one score to a letter grade relative to the class mean and
// standard deviation:
//   score <= mean - 1.5*stddev  -> 'F'
//   score <= mean - 0.5*stddev  -> 'D'
//   score <= mean + 0.5*stddev  -> 'C'
//   score <= mean + 1.5*stddev  -> 'B'
//   otherwise                   -> 'A'
// Parameters:
//   scores - the single score to grade
//   stddev - standard deviation of all scores
//   mean   - mean of all scores
char gradeFunction(int scores, double stddev, double mean)
{
    // The thresholds are tested from lowest to highest, so the first match wins.
    if (scores <= (mean - (1.5 * stddev)))
        return 'F';
    if (scores <= (mean - (.5 * stddev)))
        return 'D';
    if (scores <= (mean + (.5 * stddev)))
        return 'C';
    if (scores <= (mean + (1.5 * stddev)))
        return 'B';
    return 'A';
}
// Returns scoreTotal divided by count (floating-point division).
double average(double scoreTotal, int count)
{
    const double divisor = count;
    return scoreTotal / divisor;
}
// Returns the sample standard deviation of the first `count` elements of
// Array around `mean`: sqrt( sum((x - mean)^2) / (count - 1) ).
// Fewer than two elements have no sample deviation (the divisor would be
// zero or negative), so 0.0 is returned in that case.
double stddev(int Array[], int count , double mean)
{
    if (count < 2)
    {
        return 0.0;
    }

    double squaredDeviations = 0;
    for (int i = 0; i < count; i++)
    {
        const double deviation = Array[i] - mean;
        squaredDeviations += deviation * deviation;
    }
    return std::sqrt(squaredDeviations / (count - 1));
}
The error messages this leaves me with are...
3 IntelliSense: argument of type "double (*)(int *Array, int count, double mean)" is incompatible with parameter of type "double"
Error 1 error C2664: 'char gradeFunction(int [],double,double)' : cannot convert argument 2 from 'double (__cdecl *)(int [],int,double)' to 'double'