Error memcpy in device code - cuda

I write a code that get first _var positions of a vector of possibilities (i.e., matrix _size*_var with _var=3 and _size=27) and calling this function in my kernel (32 threads, ie, each has an object) but I do not get any return value of the function neither the NULL pointer.
The program exit without error but the printf lines in the kernel is not executed or displayed (even compiled with sm_20 or higher) as if the program stopped before.
dataIntern.h:
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#define _MIN -1
#define _MAX 1
#ifdef __CUDACC__
#define CUDA_CALLABLE_MEMBER __host__ __device__
#else
#define CUDA_CALLABLE_MEMBER
#endif
template <class a_type>
class dataIntern{
private:
a_type *possibilities;
int _assign;
int _size;
int _var;
int _maxsize;
public:
CUDA_CALLABLE_MEMBER dataIntern(){
}
CUDA_CALLABLE_MEMBER dataIntern(int var){
_var = var;
_size = (int)pow(3.0, (double)_var);
_maxsize = _size * _var;
_assign = 1;
possibilities = (a_type*)malloc(_maxsize*sizeof(a_type));
if(!possibilities){
exit(1);
}
createTable();
}
CUDA_CALLABLE_MEMBER void createTable(){
int i, j, k, limit, pos;
a_type value;
if(_assign == 1){
for(i=0; i<_var; i++){
#ifdef __CUDA_ARCH__
limit = (int)pow(3.0, _var-i-1);
#else
limit = (int)pow(3.0, (double)_var-i-1);
#endif
value = (a_type)_MIN;
k = 0;
for(j=0; j<_size; j++){
pos = _var*j+i;
if(k >= limit){
value++;
if(value > _MAX){
value = (a_type)_MIN;
}
k = 0;
}
possibilities[pos] = value;
k++;
}
}
}
}
CUDA_CALLABLE_MEMBER void print(){
int i;
printf("Printing.\n");
if(_assign == 1){
for(i=0; i<_size*_var; i++){
printf("%d ", possibilities[i]);
if(i%_var == _var-1){
printf("\n");
}
}
}
else{
printf("Not assigned.\n");
}
}
CUDA_CALLABLE_MEMBER void retify(int posChanged, a_type valueRetified){
int i, pos, count, initpos, attrib;
a_type *newnode;
a_type *newlist = NULL, *morelist = NULL;
pos = posChanged;
initpos = 0;
count = 0;
if(_assign == 1){
attrib = 0;
newnode = (a_type*)malloc(_var*sizeof(a_type));
for(i=0; i<_size; i++){
if(possibilities[pos] == valueRetified){
memcpy(newnode, &possibilities[i*_var], _var*sizeof(a_type));
count++;
if(newlist!=NULL){
morelist = (a_type*)malloc(count*_var*sizeof(a_type));
memcpy(morelist, newlist, (count-1)*_var*sizeof(a_type));
}
newlist = (a_type*)malloc(count*_var*sizeof(a_type));
memcpy(newlist, morelist, (count-1)*_var*sizeof(a_type));
memcpy(&newlist[initpos], newnode, _var*sizeof(a_type));
initpos+=_var;
attrib = 1;
}
pos+=_var;
}
if(attrib == 1){
_size = count;
possibilities = (a_type*)malloc(_size*_var*sizeof(a_type));
if(possibilities == NULL){
printf("Allocation fail in newlist retify.\n");
exit(1);
}
memcpy(possibilities, newlist, _size*_var*sizeof(a_type));
}
else{
_assign = 0;
}
}
}
CUDA_CALLABLE_MEMBER a_type* unstack(){
a_type* solution = NULL, *backup = NULL;
if(_assign == 1){
if(_size>0){
backup = (a_type*)malloc(_var*_size*sizeof(a_type));
if(backup == NULL){
printf("Erro to alloc backup pointer on unstack function in data intern\n");
return NULL;
}
solution = (a_type*)malloc(_var*sizeof(a_type));
if(solution == NULL){
printf("Erro to alloc solution pointer on unstack function in data intern\n");
return NULL;
}
memcpy(backup, possibilities, _size*_var*sizeof(a_type));
memcpy(solution, possibilities, _var*sizeof(a_type));
free(possibilities);
_size--;
possibilities = (a_type*)malloc(_size*_var*sizeof(a_type));
if(possibilities == NULL){
printf("Error to realloc possibilities pointer in data intern\n");
return NULL;
}
memcpy(possibilities, &backup[_var], _size*_var*sizeof(a_type));
free(backup);
return solution;
}
}
return NULL;
}
CUDA_CALLABLE_MEMBER int get_size(){
return _size;
}
CUDA_CALLABLE_MEMBER ~dataIntern(){
_assign = 0;
if(possibilities)
free(possibilities);
}
};
deviceCode.h:
#ifndef DEVICECODE_H
#define DEVICECODE_H
void CallingInMain();
__global__ void kernel();
#endif
deviceCode.cu:
#include "deviceCode.h"
#include "dataIntern.h"
#include <iostream>
#include <stdio.h>
//I declared like this to my kernel:
__global__ void kernel(){
__shared__ dataIntern<int> data[32];
int *vetor;
vetor = NULL;
data[threadIdx.x] = dataIntern<int>(3);
//_var == 3 in the class above
vetor = (int*)malloc(sizeof(int)*3);
vetor = data[threadIdx.x].unstack();
while(vetor!=NULL){
//never past here
printf("%d %d %d %d\n", threadIdx.x, vetor[0], vetor[1], vetor[2]);
vetor = data[threadIdx.x].unstack();
}
//neither here in if or else
if(vetor)
printf("Not null\n");
else
printf("Null final\n");
free(vetor);
}
void CallingInMain(){
kernel<<<1, 32>>>();
cudaDeviceSynchronize();
}
main.cu:
#include <iostream>
#include <stdio.h>
#ifndef deviceCode_H
#include "deviceCode.h"
#endif
int main(int argc, char* argv[]){
CallingInMain();
return 0;
}

Some colleagues pointed out to me that your code seems to have an error in it.
Consider this line in your kernel:
data[threadIdx.x] = dataIntern<int>(3);
This line instantiates a temporary dataIntern<int> object, runs the constructor with a value of 3 on it, and then does a copy from that object to the storage in data[threadIdx.x]. Note that the constructor performs a malloc operation:
CUDA_CALLABLE_MEMBER dataIntern(int var){
...
possibilities = (a_type*)malloc(_maxsize*sizeof(a_type));
But since the original object is temporary, the C++ standard allows the object to be deleted at the termination of the statement, i.e. at the semicolon here:
data[threadIdx.x] = dataIntern<int>(3);
^
after the copy-construction process is complete. But the deletion of the object triggers the destructor, which does a free operation on possibilities:
CUDA_CALLABLE_MEMBER ~dataIntern(){
_assign = 0;
if(possibilities)
free(possibilities);
}
Therefore usage of the pointer so allocated subsequent to this line of code:
data[threadIdx.x] = dataIntern<int>(3);
such as in unstack here:
vetor = data[threadIdx.x].unstack();
will be invalid.
This is a violation of C++ programming rules, and the error is not specific to CUDA.

Related

Blynk Button is not getting actuated

I, a complete beginner, was doing a project with ESP8266 where I interfaced with a relay and two sensors. The sensors are working fine but the relay is not getting actuated by the Blynk button when I press it. Below is the code where I got proper output for the sensors but not the relay.
I get all the values of sensors in the Blynk app but not the relay actuation where I connected it to a motor and in the D2 pin. Thanks in advance.:)
#define BLYNK_PRINT Serial
#define BLYNK_TEMPLATE_ID "my_template"
#define BLYNK_DEVICE_NAME "my_device name"
#define BLYNK_AUTH_TOKEN "Auth token"
#include <SPI.h>
#include <ESP8266WiFi.h>
#include <BlynkSimpleEsp8266.h>
#include <DHT.h>
#define BLYNK_PRINT Serial
#include <OneWire.h>
#include <DallasTemperature.h>
#define ONE_WIRE_BUS D2
OneWire oneWire(ONE_WIRE_BUS);
DallasTemperature sensors(&oneWire);
char auth[] = "Authtoken";
char ssid[] = "my_ssid";
char pass[] = "my_pass";
#define sensorPin D3
int sensorState = 0;
int lastState = 0;
#define DHTPIN 2
#define DHTTYPE DHT11
DHT dht(DHTPIN, DHTTYPE);
BlynkTimer timer;
void sendSensor()
{
float h = dht.readHumidity();
float t = dht.readTemperature();
Blynk.virtualWrite(V5, h); //hum
Blynk.virtualWrite(V6, t); //temp
}
void setup()
{
pinMode(D2,OUTPUT); //these two lines are the one which use for actuating the relay
digitalWrite(D2, HIGH);//
Blynk.begin(auth, ssid, pass);
pinMode(sensorPin, INPUT);
dht.begin();
timer.setInterval(1000L, sendSensor);
Blynk.begin(auth, ssid, pass);
sensors.begin();
}
int sensor = 0;
void sendTemps()
{
sensor = analogRead(A0);
sensors.requestTemperatures();
float temp = sensors.getTempCByIndex(0);
Blynk.virtualWrite(V1, temp);
Blynk.virtualWrite(V2, sensor);
delay(1000);
}
void loop()
{
Blynk.run();
timer.run();
sendTemps();
sensorState = digitalRead(sensorPin);
if (sensorState == 1 && lastState == 0) {
lastState = 1;
delay(1000);
}
else if (sensorState == 1 && lastState == 1) {
delay(1000);
}
else {
lastState = 0;
delay(1000);
}
delay(100);
}

RISC-V fuzzing emulation

I am new to this but I need to emulate RISC-V using qemu. As a start for my fuzzing project, how can I do give qemu an instruction set and get the changes in the registries as an output.
I probably understand your question. Because I don't have a riscv-related environment here, I can only provide a solution.
For example, in riscv, we design a function to get the values of all registers, relying on qemu's plugin module (such as qemu_plugin_register_vcpu_insn_exec_cb()).
plugin_test.c
#include <inttypes.h>
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <stdio.h>
#include <glib.h>
#include <qemu-plugin.h>
QEMU_PLUGIN_EXPORT int qemu_plugin_version = QEMU_PLUGIN_VERSION;
#define CPU_SIZE 32
static int cpu_num;
static int cpu_value[CPU_SIZE]={0};
static void vcpu_insn_exec_before(unsigned int cpu_index, void *)
{
for (size_t i = 0; i < cpu_num; i++)
{
/* code */
for (size_t j = 0; j < CPU_SIZE; i++)
{
if(cpu_value[j] != get_cpu_register(i,j)) {
// The value of cpu has changed
...
} else {
// The value of cpu has not changed
...
}
}
}
}
static void vcpu_tb_trans(qemu_plugin_id_t id, struct qemu_plugin_tb *tb)
{
size_t n = qemu_plugin_tb_n_insns(tb);
size_t i;
for (i = 0; i < n; i++) {
struct qemu_plugin_insn *insn = qemu_plugin_tb_get_insn(tb, i);
qemu_plugin_register_vcpu_insn_exec_cb(
insn, vcpu_insn_exec_before, QEMU_PLUGIN_CB_NO_REGS,void *);
}
}
static void plugin_exit(qemu_plugin_id_t id, void *p)
{
}
QEMU_PLUGIN_EXPORT int qemu_plugin_install(qemu_plugin_id_t id,
const qemu_info_t *info,
int argc, char **argv)
{
if(info->system_emulation) {
cpu_num = info->system.smp_vcpus;
} else {
cpu_num = 1;
}
qemu_plugin_register_vcpu_tb_trans_cb(id, vcpu_tb_trans);
qemu_plugin_register_atexit_cb(id, plugin_exit, NULL);
return 0;
}
api-ext.c
void *qemu_get_cpu(int index);
static uint32_t get_cpu_register(unsigned int cpu_index, unsigned int reg) {
uint8_t* cpu = qemu_get_cpu(cpu_index);
return *(uint32_t*)(cpu + 33488 + 5424 + reg * 4);
}
It should be noted that the content in api-ext.c is obtained from others. This is the function used to obtain the value of arm cpu. You need to check the source code or documentation for riscv.

Cuda get gpu load percent

I want to calculate the GPU load. How get gpu load percent in cuda?
enter image description here
http://eliang.blogspot.com.by/2011/05/getting-nvidia-gpu-usage-in-c.html?m=1
//
// Getting Nvidia GPU Usage
//
// Reference: Open Hardware Monitor (http://code.google.com/p/open-hardware-monitor)
//
#include <windows.h>
#include <iostream>
// magic numbers, do not change them
#define NVAPI_MAX_PHYSICAL_GPUS 64
#define NVAPI_MAX_USAGES_PER_GPU 34
// function pointer types
typedef int *(*NvAPI_QueryInterface_t)(unsigned int offset);
typedef int (*NvAPI_Initialize_t)();
typedef int (*NvAPI_EnumPhysicalGPUs_t)(int **handles, int *count);
typedef int (*NvAPI_GPU_GetUsages_t)(int *handle, unsigned int *usages);
int main()
{
HMODULE hmod = LoadLibraryA("nvapi.dll");
if (hmod == NULL)
{
std::cerr << "Couldn't find nvapi.dll" << std::endl;
return 1;
}
// nvapi.dll internal function pointers
NvAPI_QueryInterface_t NvAPI_QueryInterface = NULL;
NvAPI_Initialize_t NvAPI_Initialize = NULL;
NvAPI_EnumPhysicalGPUs_t NvAPI_EnumPhysicalGPUs = NULL;
NvAPI_GPU_GetUsages_t NvAPI_GPU_GetUsages = NULL;
// nvapi_QueryInterface is a function used to retrieve other internal functions in nvapi.dll
NvAPI_QueryInterface = (NvAPI_QueryInterface_t) GetProcAddress(hmod, "nvapi_QueryInterface");
// some useful internal functions that aren't exported by nvapi.dll
NvAPI_Initialize = (NvAPI_Initialize_t) (*NvAPI_QueryInterface)(0x0150E828);
NvAPI_EnumPhysicalGPUs = (NvAPI_EnumPhysicalGPUs_t) (*NvAPI_QueryInterface)(0xE5AC921F);
NvAPI_GPU_GetUsages = (NvAPI_GPU_GetUsages_t) (*NvAPI_QueryInterface)(0x189A1FDF);
if (NvAPI_Initialize == NULL || NvAPI_EnumPhysicalGPUs == NULL ||
NvAPI_EnumPhysicalGPUs == NULL || NvAPI_GPU_GetUsages == NULL)
{
std::cerr << "Couldn't get functions in nvapi.dll" << std::endl;
return 2;
}
// initialize NvAPI library, call it once before calling any other NvAPI functions
(*NvAPI_Initialize)();
int gpuCount = 0;
int *gpuHandles[NVAPI_MAX_PHYSICAL_GPUS] = { NULL };
unsigned int gpuUsages[NVAPI_MAX_USAGES_PER_GPU] = { 0 };
// gpuUsages[0] must be this value, otherwise NvAPI_GPU_GetUsages won't work
gpuUsages[0] = (NVAPI_MAX_USAGES_PER_GPU * 4) | 0x10000;
(*NvAPI_EnumPhysicalGPUs)(gpuHandles, &gpuCount);
// print GPU usage every second
for (int i = 0; i < 100; i++)
{
(*NvAPI_GPU_GetUsages)(gpuHandles[0], gpuUsages);
int usage = gpuUsages[3];
std::cout << "GPU Usage: " << usage << std::endl;
Sleep(1000);
}
return 0;
}

cublassgemm for row-major matrix

I really tried to implement a function in C to multiply to row-major matrix in cublas. I don't know where I mistaking.
In the function below A, B and C are pointers to an row matrix correctly
allocated.
I'd like to keep the option of translate a matrix before perform the product.
The function below is not working.
void matrixMul(cublasHandle_t handle,float *A,float *B,float *C, int m,int n,int k,int transA,int transB){
cublasStatus_t stat ; // CUBLAS functions status
float alfa = 1;
float beta = 0;
int
ma = transA ? n:m,
na = transA ? m:n,
nb = transB ? k:n,
mb = transB ? n:k;
if(na!=mb){
puts("Something wrong");
}
//(mb,nb)(ma,na) = (mb,na)
stat= cublasSgemm_v2(handle, (cublasOperation_t) transB, (cublasOperation_t)transA,
nb,ma,mb,&alfa,
B,k,
A,n,&beta,
C,m);
switch (stat) {
case CUBLAS_STATUS_SUCCESS:
puts("Sucess");
break;
default:
printf(">>>>ERRO %d<<<<\n",stat);
break;
}
}
The entire source code
// Utilities and system includes
#include <assert.h>
#include <helper_string.h> // helper for shared functions common to CUDA Samples
// CUDA runtime
#include <cuda_runtime.h>
#include <cublas_v2.h>
// CUDA and CUBLAS functions
#include <helper_functions.h>
void getFromDevice(float *h_A,float *d_A,int size){
//printf("Copy input data from the host memory to the CUDA device\n");
cudaError_t err = cudaMemcpy(h_A, d_A, size, cudaMemcpyDeviceToHost);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
}
//A = (m,n)
//B = (n,k)
//C = (m,k)
void matrixMul(cublasHandle_t handle,float *A,float *B,float *C, int m,int n,int k,int transA,int transB){
cublasStatus_t stat ; // CUBLAS functions status
float alfa = 1;
float beta = 0;
int
ma = transA ? n:m,
na = transA ? m:n,
nb = transB ? k:n,
mb = transB ? n:k;
if(na!=mb){
puts("Something wrong");
}
//(mb,nb)(ma,na) = (mb,na)
stat= cublasSgemm_v2(handle, (cublasOperation_t) transB, (cublasOperation_t)transA,
nb,ma,mb,&alfa,
B,k,
A,n,&beta,
C,m);
switch (stat) {
case CUBLAS_STATUS_SUCCESS:
puts("Sucess");
break;
default:
printf(">>>>ERRO %d<<<<\n",stat);
break;
}
}
float *mallocfDevice(int size){
float *d_C = NULL;
cudaError_t err = cudaMalloc((void **)&d_C, size * sizeof(float));
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}else{
size_t freeM, total;
cudaMemGetInfo ( &freeM, &total);
printf("MEM:%.3f\n",freeM,total,100 - ((double)freeM/total)*100 );
}
return d_C;
}
void printHostMatrix(int nl, int nc, float *h_s){
for(int j = 0; j < nl ; j++) {
for(int i = 0; i < (nc) ; i++){
int idx = j*nc + i;
printf("%.2f ", h_s[idx]);
}
printf("\n");
}
}
void printfDeviceMatrix(float *d_s,int m, int p){
float *h_s =(float*) malloc(sizeof(float)*m*p);
getFromDevice(h_s,d_s,sizeof(float)*m*p);
printHostMatrix(m,p,h_s);
free(h_s);
}
void sendTofDevice(float *h_A,float *d_A,int size){
//printf("Copy input data from the host memory to the CUDA device\n");
cudaError_t err = cudaMemcpy(d_A, h_A, size*sizeof(float), cudaMemcpyHostToDevice);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
}
int main(int argc,char **argv){
int ma = 2,
na = 3,
mb = 3,
nb = 2;
float A[] = { 1,2,3,
4,5,6};
float B[] = {7, 8,
9,10,
11,12};
float *C = new float[ma*nb];
float *d_a = mallocfDevice(ma*mb),
*d_b = mallocfDevice(mb*nb),
*d_c = mallocfDevice(ma*nb);
sendTofDevice(A,d_a,ma*na);
sendTofDevice(B,d_b,mb*nb);
cublasHandle_t handle ; // CUBLAS context
cublasCreate (&handle );
puts("A");
printfDeviceMatrix(d_a,ma,na);
puts("B");
printfDeviceMatrix(d_b,mb,nb);
matrixMul(handle, d_a,d_b,d_c,
ma,na,nb,0,0);
puts("AB=C");
printfDeviceMatrix(d_c,ma,nb);
}
CUBLAS assumes that the matrix in the device is stored in column major:
"
where α and β are scalars, and A , B and C are matrices stored in column-major format with dimensions op ( A ) m × k , op ( B ) k × n and C m × n , respectively. Also, for matrix A
Read more at: http://docs.nvidia.com/cuda/cublas/index.html#ixzz3mSDJTWrM "
That means the matrix needs to be treated as differently on the device than on the host.

Thrust: selectively move elements to another vector

I'm trying to figure out the best way to do the following using Thrust: vector A has a million floats, they have some particular order. I want to move to vector B every element x in A for which x>7.0 such that the order of elements is maintain in both vectors A and B. Importantly, only a tiny fraction of elements need be moved. Efficiency is more important for my code than elegance.
My idea was to use thrust::copy_if from A to B and then thrust::remove_if on A. But I don't know the exact number of elements to be copy, and since apparently the memory for B must be allocated in advance, another counting operation is necessary. An inelegant way to skip the counting operation is to pre-allocate "enough" memory for vector B.
Using thrust::remove_copy_if has much the same problems: you need to allocate memory for B in advance, and also it doesn't actually remove anything from A so another thrust::remove_if is required anyway.
Another idea I had was to use thrust::stable_sort with some custom-made comparison functor, to push all elements I want out to the end of A, and then somehow figure out how many there are and thrust::copy them to B. This also looks pretty inelegant...
You're on the right track with thrust::copy_if. Just allocate two more buffers of the same size as the first one. Then copy_if > 7.0f to the first one and copy_if <= 7.0f to the second one. Allocating buffers of the same size as the original buffer is fine as long as you know there's room, and 1 million floats only takes up 4MB.
Edit:
I did a performance comparison of the copy_if and stable_partition approaches. On my card, a GTX660, stable_partition took around 150% as long as copy_if for "split" values of 0.1f, 0.5f and 0.9f. I added tests to ensure that both methods are stable (maintain the order of the values).
#include <cuda.h>
#include <curand.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/copy.h>
#include <thrust/partition.h>
#include <iostream>
#include <cassert>
#define CHECK_CUDA_CALL(x) do { if((x)!=cudaSuccess) { \
printf("Error at %s:%d\n",__FILE__,__LINE__);\
return EXIT_FAILURE;}} while(0)
#define CHECK_CURAND_CALL(x) do { if((x)!=CURAND_STATUS_SUCCESS) { \
printf("Error at %s:%d\n",__FILE__,__LINE__);\
return EXIT_FAILURE;}} while(0)
#define SPLIT 0.1f
struct is_low
{
__host__ __device__ bool operator()(const float x)
{
return x <= SPLIT;
}
};
struct is_high
{
__host__ __device__ bool operator()(const float x)
{
return x > SPLIT;
}
};
class EventTimer {
public:
EventTimer() : mStarted(false), mStopped(false) {
cudaEventCreate(&mStart);
cudaEventCreate(&mStop);
}
~EventTimer() {
cudaEventDestroy(mStart);
cudaEventDestroy(mStop);
}
void start(cudaStream_t s = 0) {
cudaEventRecord(mStart, s);
mStarted = true;
mStopped = false;
}
void stop(cudaStream_t s = 0) {
assert(mStarted);
cudaEventRecord(mStop, s);
mStarted = false;
mStopped = true;
}
float elapsed() {
assert(mStopped);
if (!mStopped) return 0;
cudaEventSynchronize(mStop);
float elapsed = 0;
cudaEventElapsedTime(&elapsed, mStart, mStop);
return elapsed;
}
private:
bool mStarted, mStopped;
cudaEvent_t mStart, mStop;
};
int main(int argc, char *argv[])
{
const size_t n = 1024 * 1024 * 50;
// Create prng
curandGenerator_t gen;
CHECK_CURAND_CALL(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT));
// Set seed
CHECK_CURAND_CALL(curandSetPseudoRandomGeneratorSeed(gen, 1234ULL));
// Generate n floats on device
thrust::device_vector<float> vec_rnd_d(n);
float* ptr_rnd_d = thrust::raw_pointer_cast(vec_rnd_d.data());
CHECK_CURAND_CALL(curandGenerateUniform(gen, ptr_rnd_d, n));
thrust::device_vector<float> vec_low_d(n);
thrust::device_vector<float> vec_high_d(n);
for (int i = 0; i < 5; ++i) {
EventTimer timer;
timer.start();
thrust::device_vector<float>::iterator iter_end;
iter_end = thrust::copy_if(vec_rnd_d.begin(), vec_rnd_d.end(), vec_low_d.begin(), is_low());
thrust::copy_if(vec_rnd_d.begin(), vec_rnd_d.end(), vec_high_d.begin(), is_high());
timer.stop();
std::cout << "copy_if: " << timer.elapsed() << "ms" << std::endl;
// check result
thrust::host_vector<float> vec_rnd_h = vec_rnd_d;
thrust::host_vector<float> vec_low_h = vec_low_d;
thrust::host_vector<float> vec_high_h = vec_high_d;
thrust::host_vector<float>::iterator low_iter_h = vec_low_h.begin();
thrust::host_vector<float>::iterator high_iter_h = vec_high_h.begin();
for (thrust::host_vector<float>::iterator rnd_iter_h = vec_rnd_h.begin();
rnd_iter_h != vec_rnd_h.end(); ++rnd_iter_h) {
if (*rnd_iter_h <= SPLIT) {
assert(*low_iter_h == *rnd_iter_h);
++low_iter_h;
}
else {
assert(*high_iter_h == *rnd_iter_h);
++high_iter_h;
}
}
}
for (int i = 0; i < 5; ++i) {
thrust::device_vector<float> vec_rnd_copy = vec_rnd_d;
EventTimer timer;
timer.start();
thrust::device_vector<float>::iterator iter_split =
thrust::stable_partition(vec_rnd_copy.begin(), vec_rnd_copy.end(), is_low());
timer.stop();
size_t n_low = iter_split - vec_rnd_copy.begin();
std::cout << "stable_partition: " << timer.elapsed() << "ms" << std::endl;
// check result
thrust::host_vector<float> vec_rnd_h = vec_rnd_d;
thrust::host_vector<float> vec_partitioned_h = vec_rnd_copy;
thrust::host_vector<float>::iterator low_iter_h = vec_partitioned_h.begin();
thrust::host_vector<float>::iterator high_iter_h = vec_partitioned_h.begin() + n_low;
for (thrust::host_vector<float>::iterator rnd_iter_h = vec_rnd_h.begin();
rnd_iter_h != vec_rnd_h.end(); ++rnd_iter_h) {
if (*rnd_iter_h <= SPLIT) {
assert(*low_iter_h == *rnd_iter_h);
++low_iter_h;
}
else {
assert(*high_iter_h == *rnd_iter_h);
++high_iter_h;
}
}
}
CHECK_CURAND_CALL(curandDestroyGenerator(gen));
return EXIT_SUCCESS;
}
Output:
C:\rd\projects\cpp\test_cuda\Release>test_cuda.exe
copy_if: 40.2919ms
copy_if: 38.0157ms
copy_if: 38.5036ms
copy_if: 37.6751ms
copy_if: 38.1054ms
stable_partition: 59.5473ms
stable_partition: 61.4016ms
stable_partition: 59.1854ms
stable_partition: 61.3195ms
stable_partition: 59.1205ms
To answer my own question, I finally found thrust::stable_partition, which is more efficient and elegant than all "copy_if"-alternatives. It just moves all elements that fail to satisfy a predicate to the end of the array and returns the start of the second sequence. Pointer arithmetic gives the size of B, but in fact it's not necessary anymore:
thrust::device_vector<float>::iterator iter = thrust::stable_partition(A.begin(), A.end(), pred)
thrust::device_vector<float> B(iter, A.end())
A.erase(iter, A.end());