Thrust: synchronize: launch_closure_by_value: unknown error

I am experimenting with the Thrust example monte_carlo.cu from here:
https://github.com/thrust/thrust/blob/master/examples/monte_carlo.cu .
The problem appears in this piece of code:
float estimate = thrust::transform_reduce(thrust::counting_iterator<int>(0),
thrust::counting_iterator<int>(M),
estimate_pi(),
0.0f,
thrust::plus<float>());
When I increase the length of the input sequence beyond M=87000 for the transform_reduce call, I get an error:
"synchronize: launch_closure_by_value: unknown error"
Just before the error, the screen goes black for several seconds; then a message appears in the system tray, "The NVIDIA video driver stopped responding and was successfully restored" (my back translation), and then I reboot my computer because its behavior is unstable.
When I run under cuda-memcheck the situation changes: I get the same error already at M=30000, although when I run the .exe without cuda-memcheck the program finishes successfully for that length.
Here are several lines from the cuda-memcheck output:
========= Program hit cudaErrorUnknown (error 30) due to "unknown error" on CUDA API call to cudaThreadSynchronize.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:C:\Windows\system32\nvcuda.dll (cuProfilerStop + 0xc2d92) [0xe06b2]
========= Host Frame:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5\bin\cudart64_65.dll (cudaThreadSynchronize + 0xf5) [0x19585]
========= Host Frame:C:\test\Monte_carlo.exe (thrust::system::cuda::detail::synchronize + 0x47) [0x11117]
...
========= Program hit cudaErrorUnknown (error 30) due to "unknown error" on CUDA API call to cudaFree.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:C:\Windows\system32\nvcuda.dll (cuProfilerStop + 0xc2d92) [0xe06b2]
========= Host Frame:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5\bin\cudart64_65.dll (cudaFree + 0xfd) [0x1d28d]
========= Host Frame:C:\test\Monte_carlo.exe (thrust::system::cuda::detail::free > + 0x50) [0x5fa0]
Below is the full code of the program. I made only two changes to the original: a try-catch around transform_reduce, and reading M from the console.
How can I determine the cause of this error?
#include <thrust/random.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/functional.h>
#include <thrust/transform_reduce.h>
#include <iostream>
#include <iomanip>
#include <cmath>
// we could vary M & N to find the perf sweet spot
__host__ __device__
unsigned int hash(unsigned int a)
{
a = (a+0x7ed55d16) + (a<<12);
a = (a^0xc761c23c) ^ (a>>19);
a = (a+0x165667b1) + (a<<5);
a = (a+0xd3a2646c) ^ (a<<9);
a = (a+0xfd7046c5) + (a<<3);
a = (a^0xb55a4f09) ^ (a>>16);
return a;
}
struct estimate_pi : public thrust::unary_function<unsigned int,float>
{
__host__ __device__
float operator()(unsigned int thread_id)
{
float sum = 0;
unsigned int N = 10000; // samples per thread
unsigned int seed = hash(thread_id);
// seed a random number generator
thrust::default_random_engine rng(seed);
// create a mapping from random numbers to [0,1)
thrust::uniform_real_distribution<float> u01(0,1);
// take N samples in a quarter circle
for(unsigned int i = 0; i < N; ++i)
{
// draw a sample from the unit square
float x = u01(rng);
float y = u01(rng);
// measure distance from the origin
float dist = sqrtf(x*x + y*y);
// add 1.0f if (u0,u1) is inside the quarter circle
if(dist <= 1.0f)
sum += 1.0f;
}
// multiply by 4 to get the area of the whole circle
sum *= 4.0f;
// divide by N
return sum / N;
}
};
int main(void)
{
// use 30K independent seeds
int M;
std::cout << "M: ";
std::cin >> M;
try
{
float estimate = thrust::transform_reduce(thrust::counting_iterator<int>(0),
thrust::counting_iterator<int>(M),
estimate_pi(),
0.0f,
thrust::plus<float>());
estimate /= M;
std::cout << "M = " << std::setw(6) << M << " " << std::endl;
std::cout << std::setprecision(6);
std::cout << "pi is approximately " << estimate << std::endl;
}
catch (thrust::system_error &e)
{
// output an error message and exit
std::cerr << "Error: " << e.what() << std::endl;
exit(-1);
}
return 0;
}
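The black screen followed by the driver-recovery message is the usual symptom of the Windows display watchdog (TDR) resetting the GPU when a kernel runs for more than a couple of seconds. A minimal workaround sketch, assuming that is the cause and reusing the estimate_pi functor above (the chunk size of 10000 is an arbitrary illustration, not a tuned value):
// Workaround sketch: split the reduction into chunks so that each
// underlying kernel launch finishes well within the watchdog limit.
// Requires <algorithm> for std::min; the chunk size is illustrative only.
float estimate = 0.0f;
const int chunk = 10000;
for (int start = 0; start < M; start += chunk)
{
    int stop = std::min(start + chunk, M);
    estimate += thrust::transform_reduce(thrust::counting_iterator<int>(start),
                                         thrust::counting_iterator<int>(stop),
                                         estimate_pi(),
                                         0.0f,
                                         thrust::plus<float>());
}
estimate /= M;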


CUDA graph does not run as expected

I'm using the following code to learn how to use CUDA graphs. The parameter NSTEP is set to 1000, and the parameter NKERNEL is set to 20. The kernel function shortKernel has three parameters and performs a simple calculation.
#include <cuda_runtime.h>
#include <cstdio>
#include <iostream>
#define N 131072 // tuned such that kernel takes a few microseconds
#define NSTEP 1000
#define NKERNEL 20
#define BLOCKS 256
#define THREADS 512
#define CHECK(call) \
do { \
const cudaError_t error_code = call; \
if (error_code != cudaSuccess) { \
printf("CUDA Error\n"); \
printf(" File: %s\n", __FILE__); \
printf(" Line: %d\n", __LINE__); \
printf(" Error code: %d\n", error_code); \
printf(" Error text: %s\n", cudaGetErrorString(error_code)); \
exit(1); \
} \
} while (0)
__global__ void shortKernel(float * out_d, float * in_d, int i){
int idx=blockIdx.x*blockDim.x+threadIdx.x;
if(idx<N) out_d[idx]=1.23*in_d[idx] + i;
}
void test2() {
cudaStream_t stream;
cudaStreamCreate(&stream);
cudaSetDevice(0);
float x_host[N], y_host[N];
// initialize x and y arrays on the host
for (int i = 0; i < N; i++) {
x_host[i] = 2.0f;
y_host[i] = 2.0f;
}
float *x, *y, *z;
CHECK(cudaMalloc((void**)&x, N*sizeof(float)));
CHECK(cudaMalloc((void**)&y, N*sizeof(float)));
CHECK(cudaMalloc((void**)&z, N*sizeof(float)));
cudaMemcpy(x, x_host, sizeof(float) * N, cudaMemcpyHostToDevice);
cudaEvent_t begin, end;
CHECK(cudaEventCreate(&begin));
CHECK(cudaEventCreate(&end));
// start recording
cudaEventRecord(begin, stream);
bool graphCreated=false;
cudaGraph_t graph;
cudaGraphExec_t instance;
// Run graphs
for(int istep=0; istep<NSTEP; istep++){
if(!graphCreated){
cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
for(int ikrnl=0; ikrnl<NKERNEL; ikrnl++){
shortKernel<<<BLOCKS, THREADS, 0, stream>>>(y, x, ikrnl);
}
cudaStreamEndCapture(stream, &graph);
cudaGraphNode_t* nodes = NULL;
size_t num_nodes = 0;
CHECK(cudaGraphGetNodes(graph, nodes, &num_nodes));
std::cout << "Num of nodes in the graph: " << num_nodes
<< std::endl;
CHECK(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));
graphCreated=true;
}
CHECK(cudaGraphLaunch(instance, stream));
cudaStreamSynchronize(stream);
} // End run graphs
cudaEventRecord(end, stream);
cudaEventSynchronize(end);
float time_ms = 0;
cudaEventElapsedTime(&time_ms, begin, end);
std::cout << "CUDA Graph - CUDA Kernel overall time: " << time_ms << " ms" << std::endl;
cudaMemcpy(y_host, y, sizeof(float) * N, cudaMemcpyDeviceToHost);
for(int i = 0; i < N; i++) {
std::cout << "res " << y_host[i] << std::endl;
}
// Free memory
cudaFree(x);
cudaFree(y);
}
int main() {
test2();
std::cout << "end" << std::endl;
return 0;
}
My expected results are as follows:
res 2.46
res 3.46
res 4.46
res 5.46
res 6.46
...
However, the actual results look like this:
res 21.46
res 21.46
res 21.46
res 21.46
res 21.46
res 21.46
...
It seems that every kernel's parameter i is set to NKERNEL-1. I am very confused about this; could someone give an explanation? Thanks!
I then changed the for loop as follows:
// Run graphs
for(int istep=0; istep<NSTEP; istep++){
if(!graphCreated){
cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
for(int ikrnl=0; ikrnl<NKERNEL; ikrnl++){
if(ikrnl == 0)
shortKernel<<<BLOCKS, THREADS, 0, stream>>>(y, x, 0);
else if(ikrnl == 1)
shortKernel<<<BLOCKS, THREADS, 0, stream>>>(y, x, 1);
else if(ikrnl == 2)
shortKernel<<<BLOCKS, THREADS, 0, stream>>>(y, x, 2);
else
shortKernel<<<BLOCKS, THREADS, 0, stream>>>(y, x, ikrnl);
}
cudaStreamEndCapture(stream, &graph);
cudaGraphNode_t* nodes = NULL;
size_t num_nodes = 0;
CHECK(cudaGraphGetNodes(graph, nodes, &num_nodes));
std::cout << "Num of nodes in the graph: " << num_nodes
<< std::endl;
CHECK(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));
graphCreated=true;
}
CHECK(cudaGraphLaunch(instance, stream));
cudaStreamSynchronize(stream);
} // End run graphs
However, the results are still the same:
res 21.46
res 21.46
res 21.46
res 21.46
res 21.46
res 21.46
...
The results are expected and correct.
Every time you run the graph, this entire for-loop gets executed:
for(int ikrnl=0; ikrnl<NKERNEL; ikrnl++){
shortKernel<<<BLOCKS, THREADS, 0, stream>>>(y, x, ikrnl);
}
After the first iteration of that for-loop, the results will all be 2.46, after the second iteration the results will all be 3.46, and after the 20th iteration (ikrnl = 19) the results will all be 21.46.
Every time you run the graph, you will get that same result.
Expecting any kind of variation in the result such as this:
res 2.46
res 3.46
res 4.46
res 5.46
res 6.46
is completely illogical, because every thread is doing precisely the same thing. Every thread starts with the same value in x and does the same calculation on it. There is no reason to expect any difference between y[0] and y[1], for example.
Rather than trying to wade through CUDA graphs, it's clear you don't yet have a good grasp of what the kernel is doing. My suggestion would be to write an ordinary CUDA program that calls that kernel just once, without any CUDA graph usage, and study the output. After that, you can put a for-loop around the kernel and watch how the result changes after every iteration, as in the sketch below. You don't need CUDA graphs to understand what is going on here.
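A minimal sketch of that suggestion, assuming the same shortKernel, BLOCKS, THREADS, CHECK macro, and device buffers x and y as in the question; because in_d is filled with 2.0f, every element of y equals 1.23 * 2.0 + ikrnl after launch ikrnl:
// Sketch only: run the kernel sequence without CUDA graphs and inspect
// the result after each launch. y[0] is representative because every
// thread computes the same value.
float y0 = 0.0f;
for (int ikrnl = 0; ikrnl < NKERNEL; ikrnl++) {
    shortKernel<<<BLOCKS, THREADS>>>(y, x, ikrnl);
    CHECK(cudaDeviceSynchronize());
    CHECK(cudaMemcpy(&y0, y, sizeof(float), cudaMemcpyDeviceToHost));
    std::cout << "after kernel " << ikrnl << ": y[0] = " << y0 << std::endl; // 2.46, 3.46, ..., 21.46
}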

cuDNN Status Not Supported when trying to use FFT Convolution

I am trying to use the cuDNN library to do an FFT convolution. The code runs when I use the Winograd convolution or the cuDNN method that selects the fastest convolution method, but when I try to run it with the FFT convolution method it does not work.
I set the forward method to FFT convolution myself.
I checked the documents and my input is in NCHW format as required for the FFT convolution. From the docs:
CUDNN_CONVOLUTION_FWD_ALGO_FFT
xDesc Format Support: NCHW HW-packed
yDesc Format Support: NCHW HW-packed
The error "CUDNN_STATUS_NOT_SUPPORTED" happens during the cudnnGetConvolutionForwardWorkspaceSize function call.
What causes this error when I use the FFT convolution versus the fastest or Winograd algorithms?
For reference, I am using CUDA 9.1 and cuDNN 7. I compile with the following command on Ubuntu 16.04: nvcc -arch=sm_35 -std=c++11 -O2 -lcudnn FFT_cuDNN.cu -o conv `pkg-config --cflags --libs opencv`; ./conv TF.png
#include <cudnn.h>
#include <cassert>
#include <cstdlib>
#include <iostream>
#include <opencv2/opencv.hpp>
#include <opencv2/dnn.hpp>
using namespace cv;
using namespace cv::dnn;
#define checkCUDNN(expression) \
{ \
cudnnStatus_t status = (expression); \
if (status != CUDNN_STATUS_SUCCESS) { \
std::cerr << "Error on line " << __LINE__ << ": " \
<< cudnnGetErrorString(status) << std::endl; \
std::exit(EXIT_FAILURE); \
} \
}
cv::Mat load_image_NCHW(const char* image_path)
{
cv::Mat image = cv::imread(image_path, cv::IMREAD_COLOR);
image.convertTo(image, CV_32FC3);
cv::normalize(image,image,0,1, cv::NORM_MINMAX);
cv::Mat inputBlob = blobFromImage(image, 1.0f, cv::Size(image.rows,image.cols), cv::Scalar(0,0,0));
return inputBlob;
}
void save_image(const char* output_filename,
float* buffer,
int height,
int width) {
cv::Mat output_image(height, width, CV_32FC3, buffer);
// Make negative values zero.
cv::threshold(output_image,
output_image,
/*threshold=*/0,
/*maxval=*/0,
cv::THRESH_TOZERO);
cv::normalize(output_image, output_image, 0.0, 255.0, cv::NORM_MINMAX);
output_image.convertTo(output_image, CV_8UC3);
cv::imwrite(output_filename, output_image);
std::cerr << "Wrote output to " << output_filename << std::endl;
}
int main(int argc, const char* argv[]) {
if (argc < 2) {
std::cerr << "usage: conv <image> [gpu=0] [sigmoid=0]" << std::endl;
std::exit(EXIT_FAILURE);
}
int gpu_id = (argc > 2) ? std::atoi(argv[2]) : 0;
std::cerr << "GPU: " << gpu_id << std::endl;
bool with_sigmoid = (argc > 3) ? std::atoi(argv[3]) : 0;
std::cerr << "With sigmoid: " << std::boolalpha << with_sigmoid << std::endl;
// Load the image
cv::Mat image = load_image_NCHW(argv[1]);
int imgH = 600;
int imgW = 561;
int inC = 3;
// Set GPU to use
cudaSetDevice(gpu_id);
// Create the cudnn Handle
cudnnHandle_t cudnn;
checkCUDNN(cudnnCreate(&cudnn));
// Need a descriptor for
// The input, kernel, and convolution
cudnnTensorDescriptor_t input_descriptor;
checkCUDNN(cudnnCreateTensorDescriptor(&input_descriptor));
checkCUDNN(cudnnSetTensor4dDescriptor(input_descriptor,
/*format=*/CUDNN_TENSOR_NCHW,
/*dataType=*/CUDNN_DATA_FLOAT,
/*batch_size=*/1,
/*channels=*/inC,
/*image_height=*/imgH,
/*image_width=*/imgW));
cudnnFilterDescriptor_t kernel_descriptor;
checkCUDNN(cudnnCreateFilterDescriptor(&kernel_descriptor));
checkCUDNN(cudnnSetFilter4dDescriptor(kernel_descriptor,
/*dataType=*/CUDNN_DATA_FLOAT,
/*format=*/CUDNN_TENSOR_NCHW,
/*out_channels=*/3,
/*in_channels=*/inC,
/*kernel_height=*/3,
/*kernel_width=*/3));
cudnnConvolutionDescriptor_t convolution_descriptor;
checkCUDNN(cudnnCreateConvolutionDescriptor(&convolution_descriptor));
checkCUDNN(cudnnSetConvolution2dDescriptor(convolution_descriptor,
/*pad_height=*/1,
/*pad_width=*/1,
/*vertical_stride=*/1,
/*horizontal_stride=*/1,
/*dilation_height=*/1,
/*dilation_width=*/1,
/*mode=*/CUDNN_CROSS_CORRELATION,
/*computeType=*/CUDNN_DATA_FLOAT));
// Need to compute the output size
int batch_size{0}, channels{0}, height{0}, width{0};
checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convolution_descriptor,
input_descriptor,
kernel_descriptor,
&batch_size,
&channels,
&height,
&width));
std::cerr << "Output Image: " << height << " x " << width << " x " << channels
<< std::endl;
// Need an output descriptor
cudnnTensorDescriptor_t output_descriptor;
checkCUDNN(cudnnCreateTensorDescriptor(&output_descriptor));
checkCUDNN(cudnnSetTensor4dDescriptor(output_descriptor,
/*format=*/CUDNN_TENSOR_NCHW,
/*dataType=*/CUDNN_DATA_FLOAT,
/*batch_size=*/1,
/*channels=*/3,
/*image_height=*/imgH,
/*image_width=*/imgW));
// Need to define the forward algorithm
cudnnConvolutionFwdAlgo_t convolution_algorithm = CUDNN_CONVOLUTION_FWD_ALGO_FFT;
// Have to compute the workspace size
size_t workspace_bytes{0};
checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnn,
input_descriptor,
kernel_descriptor,
convolution_descriptor,
output_descriptor,
convolution_algorithm,
&workspace_bytes));
std::cerr << "Workspace size: " << (workspace_bytes / 1048576.0) << "MB"
<< std::endl;
assert(workspace_bytes > 0);
// Allocate the memory needed for the workspace
void* d_workspace{nullptr};
cudaMalloc(&d_workspace, workspace_bytes);
// Allocate memory for the batch of images
// and copy from host to device
int image_bytes = batch_size * channels * height * width * sizeof(float);
float* d_input{nullptr};
cudaMalloc(&d_input, image_bytes);
cudaMemcpy(d_input, image.ptr<float>(0), image_bytes, cudaMemcpyHostToDevice);
// Allocate memory for the output images
// Copy from host to device
float* d_output{nullptr};
cudaMalloc(&d_output, image_bytes);
cudaMemset(d_output, 0, image_bytes);
// clang-format off
const float kernel_template[3][3] = {
{1, 1, 1},
{1, -8, 1},
{1, 1, 1}
};
// clang-format on
float h_kernel[3][3][3][3];
for (int kernel = 0; kernel < 3; ++kernel) {
for (int channel = 0; channel < 3; ++channel) {
for (int row = 0; row < 3; ++row) {
for (int column = 0; column < 3; ++column) {
h_kernel[kernel][channel][row][column] = kernel_template[row][column];
}
}
}
}
float* d_kernel{nullptr};
cudaMalloc(&d_kernel, sizeof(h_kernel));
cudaMemcpy(d_kernel, h_kernel, sizeof(h_kernel), cudaMemcpyHostToDevice);
// Perform actual convolution
const float alpha = 1.0f, beta = 0.0f;
checkCUDNN(cudnnConvolutionForward(cudnn,
&alpha,
input_descriptor,
d_input,
kernel_descriptor,
d_kernel,
convolution_descriptor,
convolution_algorithm,
d_workspace,
workspace_bytes,
&beta,
output_descriptor,
d_output));
// If wish to use sigmoid activation
if (with_sigmoid) {
cudnnActivationDescriptor_t activation_descriptor;
checkCUDNN(cudnnCreateActivationDescriptor(&activation_descriptor));
checkCUDNN(cudnnSetActivationDescriptor(activation_descriptor,
CUDNN_ACTIVATION_SIGMOID,
CUDNN_PROPAGATE_NAN,
/*relu_coef=*/0));
checkCUDNN(cudnnActivationForward(cudnn,
activation_descriptor,
&alpha,
output_descriptor,
d_output,
&beta,
output_descriptor,
d_output));
cudnnDestroyActivationDescriptor(activation_descriptor);
}
// Move results to host
float* h_output = new float[image_bytes];
cudaMemcpy(h_output, d_output, image_bytes, cudaMemcpyDeviceToHost);
save_image("cudnn-out.png", h_output, height, width);
// Free memory
delete[] h_output;
cudaFree(d_kernel);
cudaFree(d_input);
cudaFree(d_output);
cudaFree(d_workspace);
cudnnDestroyTensorDescriptor(input_descriptor);
cudnnDestroyTensorDescriptor(output_descriptor);
cudnnDestroyFilterDescriptor(kernel_descriptor);
cudnnDestroyConvolutionDescriptor(convolution_descriptor);
cudnnDestroy(cudnn);
}
I figured it out. From the docs: xDesc's feature map height + 2 * convDesc's zero-padding height must equal 256 or less, and xDesc's feature map width + 2 * convDesc's zero-padding width must equal 256 or less.
When I initially read this, I was under the impression that the zero-padding height meant kernelH-1, when in fact it refers to the total padded image height/width.
My image was too large. If I resize it, it works, e.g.:
cv::Mat inputBlob = blobFromImage(image, 1.0f, cv::Size(100,100), cv::Scalar(0,0,0));
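A small sketch of how that size limit could be checked up front, using the question's imgH, imgW and the pad values passed to cudnnSetConvolution2dDescriptor (the fallback algorithm here is just one possible choice):
// Sketch: only request the FFT algorithm when the documented constraint
// (feature map height/width + 2 * zero-padding <= 256) is satisfied;
// otherwise fall back to implicit GEMM.
const int padH = 1, padW = 1;
cudnnConvolutionFwdAlgo_t convolution_algorithm;
if (imgH + 2 * padH <= 256 && imgW + 2 * padW <= 256) {
    convolution_algorithm = CUDNN_CONVOLUTION_FWD_ALGO_FFT;
} else {
    convolution_algorithm = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
}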

Copy part of memory allocated using cudamallocHost

I would like to copy part of an array from pinned memory on the host to a CUDA device. For example, I've allocated pinned memory for 500 elements, and I would like to copy elements 100-399 to an array of size 300 on the device.
This is my code:
#include <cuda_runtime.h>
#include <iostream>
using namespace std;
int main()
{
const unsigned int N = 500;
const unsigned int bytes = N * sizeof(int);
cudaError_t status = cudaSuccess;
int *h_a;
int *d_a;
status = cudaMallocHost((void**) &h_a, bytes);
if (status != cudaSuccess)
cout << "Error allocating pinned host memory\n";
status = cudaMalloc((void**) &d_a, bytes);
if (status != cudaSuccess)
cout << "Error allocating pinned device memory\n";
for (int i = 0; i < N; i++) {
h_a[i] = i;
}
status = cudaMemcpy(d_a, h_a + 100, bytes - (200 * sizeof(int)), cudaMemcpyHostToDevice);
if (status != cudaSuccess)
cout << "Error copying to device: " << cudaGetErrorString(status) << "\n";
status = cudaMemcpy(h_a + 100, d_a, bytes - (200 * sizeof(int)), cudaMemcpyDeviceToHost);
if (status != cudaSuccess)
cout << "Error copying to host: " << cudaGetErrorString(status) << "\n";
cudaFree(d_a);
cudaFreeHost(h_a);
return 0;
}
When I run this, I get this error for the host-to-device copy:
Error copying to device: invalid argument
Only the host-to-device copy fails. The device-to-host copy works fine. Also, the same code works correctly if I use non-pinned host memory. Is there any way to achieve this using pinned memory?
The above code actually compiles and runs properly. Perhaps I was using an old executable while I was testing it.
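For reference, a sketch of the same partial copy done asynchronously, which is the main reason to use pinned memory in the first place (the stream is introduced here purely for illustration; d_a and h_a are the buffers from the code above):
// Sketch: asynchronously copy elements 100..399 from the pinned host
// buffer. Copies from pageable memory would not be truly asynchronous.
cudaStream_t stream;
cudaStreamCreate(&stream);
cudaMemcpyAsync(d_a, h_a + 100, 300 * sizeof(int),
                cudaMemcpyHostToDevice, stream);
cudaStreamSynchronize(stream); // wait for the copy to finish
cudaStreamDestroy(stream);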

Cublas Thrust Segmentation Fault

I am new to CUDA programming. I was working on a sample code that multiplies a matrix with a vector and prints the result. I am using the cuBLAS Dgemv API to do the multiplication. When I run the program under cuda-memcheck I get the following error:
Error: process didn't terminate successfully
========= The application may have hit an error when dereferencing Unified Memory from the host. Please rerun the application under cuda-gdb or Nsight Eclipse Edition to catch host side errors.
========= Internal error (20)
========= No CUDA-MEMCHECK results found
The minimal complete code is here:
#include <thrust/device_vector.h>
#include <cublas_v2.h>
#include <iostream>
int main(void)
{
int rowDimension = 3; // number of rows
int columnDimension = 6; // number of columns
// initialize data
thrust::device_vector<double> weightMatrix;
weightMatrix.resize(rowDimension * columnDimension);
thrust::device_vector<double> inputVector;
inputVector.resize(columnDimension);
thrust::device_vector<double> F;
F.resize(rowDimension);
for (size_t i = 0; i < rowDimension; i++)
for (size_t j = 0; j < columnDimension; j++)
weightMatrix[j * rowDimension + i]=i;
for (size_t j = 0; j < columnDimension; j++)
inputVector[j] = j;
for (size_t i = 0; i < rowDimension; i++)
F[i]=0;
cublasHandle_t handle;
/* Initialize CUBLAS */
cublasStatus_t status = cublasCreate(&handle);
if (status != CUBLAS_STATUS_SUCCESS)
std::cerr << "!!!! CUBLAS initialization error\n";
double alpha = 1.0f;
// cudaDeviceSynchronize();
status = cublasDgemv(handle, CUBLAS_OP_N, rowDimension, columnDimension, &alpha, thrust::raw_pointer_cast(weightMatrix.data()), rowDimension,
thrust::raw_pointer_cast(inputVector.data()), 1, 0, thrust::raw_pointer_cast(F.data()), 1) ;;
// cudaDeviceSynchronize();
if (status != CUBLAS_STATUS_SUCCESS)
std::cerr << "!!!! kernel execution error.\n";
for (size_t j = 0; j < rowDimension; j++)
std::cout << F[j] << " ";
status = cublasDestroy(handle);
if (status != CUBLAS_STATUS_SUCCESS)
std::cerr << "!!!! shutdown error (A)\n";
return 0;
}
The above program produces a segmentation fault at the cublasDgemv call. When running cuda-memcheck I get the message reported above. Googling didn't turn up much help.
Can someone please help me resolve this issue?
Have a look at the documentation of cublasDgemv.
The signature is:
cublasDgemv(cublasHandle_t handle,
cublasOperation_t trans,
int m,
int n,
const double *alpha,
const double *A,
int lda,
const double *x,
int incx,
const double *beta,
double *y,
int incy)
beta has to be supplied as a pointer. But you pass a NULL pointer to it instead of a pointer pointing to the value 0.
So the following will fix your problem:
double alpha = 1.0;
double beta = 0;
status = cublasDgemv(handle,
CUBLAS_OP_N,
rowDimension,
columnDimension,
&alpha,
thrust::raw_pointer_cast(weightMatrix.data()),
rowDimension,
thrust::raw_pointer_cast(inputVector.data()),
1,
&beta, // note the change here!
thrust::raw_pointer_cast(F.data()),
1);
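A side note, as a sketch: the literal 0 compiled only because it converts to a null const double*. cuBLAS reads alpha and beta from host memory by default; making that explicit is optional:
// Optional: with CUBLAS_POINTER_MODE_HOST (the default), alpha and beta
// must point to host memory, as in the corrected call above.
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);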

How to use CUB and Thrust in one CUDA code

I'm trying to introduce some CUB into my "old" Thrust code, and so have started with a small example to compare thrust::reduce_by_key with cub::DeviceReduce::ReduceByKey, both applied to thrust::device_vectors.
The thrust part of the code is fine, but the CUB part, which naively uses raw pointers obtained via thrust::raw_pointer_cast, crashes after the CUB calls. I put in a cudaDeviceSynchronize() to try to solve this problem, but it didn't help. The CUB part of the code was cribbed from the CUB web pages.
On OSX the runtime error is:
libc++abi.dylib: terminate called throwing an exception
Abort trap: 6
On Linux the runtime error is:
terminate called after throwing an instance of 'thrust::system::system_error'
what(): an illegal memory access was encountered
The first few lines of cuda-memcheck are:
========= CUDA-MEMCHECK
========= Invalid __global__ write of size 4
========= at 0x00127010 in /home/sdettrick/codes/MCthrust/tests/../cub-1.3.2/cub/device/dispatch/../../block_range/block_range_reduce_by_key.cuh:1017:void cub::ReduceByKeyRegionKernel<cub::DeviceReduceByKeyDispatch<unsigned int*, unsigned int*, float*, float*, int*, cub::Equality, CustomSum, int>::PtxReduceByKeyPolicy, unsigned int*, unsigned int*, float*, float*, int*, cub::ReduceByKeyScanTileState<float, int, bool=1>, cub::Equality, CustomSum, int>(unsigned int*, float*, float*, int*, cub::Equality, CustomSum, int, cub::DeviceReduceByKeyDispatch<unsigned int*, unsigned int*, float*, float*, int*, cub::Equality, CustomSum, int>::PtxReduceByKeyPolicy, unsigned int*, int, cub::GridQueue<int>)
========= by thread (0,0,0) in block (0,0,0)
========= Address 0x7fff7dbb3e88 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
Unfortunately I'm not too sure what to do about that.
Any help would be greatly appreciated. I tried this on the NVIDIA developer zone but didn't get any responses. The complete example code is below. It should compile with CUDA 6.5 and cub 1.3.2:
#include <iostream>
#include <thrust/sort.h>
#include <thrust/gather.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <cub/cub.cuh> // or equivalently <cub/device/device_radix_sort.cuh>
//========================================
// for CUB:
struct CustomSum
{
template <typename T>
CUB_RUNTIME_FUNCTION __host__ __device__ __forceinline__
//__host__ __device__ __forceinline__
T operator()(const T &a, const T &b) const {
return b+a;
}
};
//========================================
int main()
{
const int Nkey=20;
int Nseg=9;
int ikey[Nkey] = {0, 0, 0, 6, 8, 0, 2, 4, 6, 8, 1, 3, 5, 7, 8, 1, 3, 5, 7, 8};
thrust::device_vector<unsigned int> key(ikey,ikey+Nkey);
thrust::device_vector<unsigned int> keysout(Nkey);
// Let's reduce x, by key:
float xval[Nkey];
for (int i=0; i<Nkey; i++) xval[i]=ikey[i]+0.1f;
thrust::device_vector<float> x(xval,xval+Nkey);
// First, sort x by key:
thrust::sort_by_key(key.begin(),key.end(),x.begin());
//---------------------------------------------------------------------
std::cout<<"=================================================================="<<std::endl
<<" THRUST reduce_by_key:"<<std::endl
<<"=================================================================="<<std::endl;
thrust::device_vector<float> output(Nseg,0.0f);
thrust::reduce_by_key(key.begin(),
key.end(),
x.begin(),
keysout.begin(),
output.begin());
for (int i=0;i<Nkey;i++) std::cout << x[i] <<" "; std::cout<<std::endl;
for (int i=0;i<Nkey;i++) std::cout << key[i] <<" "; std::cout<<std::endl;
for (int i=0;i<Nseg;i++) std::cout << output[i] <<" "; std::cout<<std::endl;
float ototal=thrust::reduce(output.begin(),output.end());
float xtotal=thrust::reduce(x.begin(),x.end());
std::cout << "total="<< ototal <<", should be "<<xtotal<<std::endl;
//---------------------------------------------------------------------
std::cout<<"=================================================================="<<std::endl
<<" CUB ReduceByKey:"<<std::endl
<<"=================================================================="<<std::endl;
unsigned int *d_keys_in =thrust::raw_pointer_cast(&key[0]);
float *d_values_in =thrust::raw_pointer_cast(&x[0]);
unsigned int *d_keys_out =thrust::raw_pointer_cast(&keysout[0]);
float *d_values_out=thrust::raw_pointer_cast(&output[0]);
int *d_num_segments=&Nseg;
CustomSum reduction_op;
std::cout << "CUB input" << std::endl;
for (int i=0; i<Nkey; ++i) std::cout << key[i] << " "; std::cout<<std::endl;
for (int i=0; i<Nkey; ++i) std::cout << x[i] << " "; std::cout<< std::endl;
for (int i=0; i<Nkey; ++i) std::cout << keysout[i] << " "; std::cout<< std::endl;
for (int i=0; i<Nseg; ++i) std::cout << output[i] << " "; std::cout<< std::endl;
// Determine temporary device storage requirements
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_segments, reduction_op, Nkey);
// Allocate temporary storage
cudaMalloc(&d_temp_storage, temp_storage_bytes);
std::cout << "temp_storage_bytes = " << temp_storage_bytes << std::endl;
// Run reduce-by-key
cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_segments, reduction_op, Nkey);
cudaDeviceSynchronize();
std::cout << "CUB output" << std::endl;
std::cout<<Nkey<<" "<<Nseg<<std::endl;
std::cout<<key.size() << " "<<x.size() << " "<<keysout.size() << " "<<output.size() << std::endl;
// At this point onward it dies:
//libc++abi.dylib: terminate called throwing an exception
//Abort trap: 6
// If the next line is uncommented, it crashes the Mac!
for (int i=0; i<Nkey; ++i) std::cout << key[i] << " "; std::cout<<std::endl;
// for (int i=0; i<Nkey; ++i) std::cout << x[i] << " "; std::cout<< std::endl;
// for (int i=0; i<Nkey; ++i) std::cout << keysout[i] << " "; std::cout<< std::endl;
// for (int i=0; i<Nseg; ++i) std::cout << output[i] << " "; std::cout<< std::endl;
cudaFree(d_temp_storage);
ototal=thrust::reduce(output.begin(),output.end());
xtotal=thrust::reduce(x.begin(),x.end());
std::cout << "total="<< ototal <<", should be "<<xtotal<<std::endl;
return 1;
}
This is not appropriate:
int *d_num_segments=&Nseg;
You cannot take the address of a host variable and use it as a device pointer.
Instead do this:
int *d_num_segments;
cudaMalloc(&d_num_segments, sizeof(int));
This allocates space on the device for the data (a single integer that CUB will write to) and assigns the address of that allocation to your d_num_segments variable, which then becomes a valid device pointer.
In ordinary (non-UM) CUDA, it is illegal to dereference a host address in device code, or a device address in host code.
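With that fix in place, a short sketch of how the segment count CUB writes to device memory could be read back on the host (d_num_segments is the device allocation from the lines above):
// Sketch: after the second ReduceByKey call, copy the number of unique
// keys back to the host before using it, then release the allocation.
int h_num_segments = 0;
cudaMemcpy(&h_num_segments, d_num_segments, sizeof(int), cudaMemcpyDeviceToHost);
std::cout << "CUB wrote " << h_num_segments << " segments" << std::endl;
cudaFree(d_num_segments);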