cudaMemcpy returns success but does not copy anything - cuda

Below are the things I have checked with cuda-gdb:
the contents of src are correct
cudaMalloc, malloc, and file I/O are successful
cudaMemcpy returns cudaSuccess
the problematic cudaMemcpy is called and throws no errors or exceptions
destination is allocated (cudaMalloc) successfully
Below are the relevant parts of the code: wavenet_server.cc mallocs the source, copies data from a file into it, and calls make_wavenet. wavenet_infer.cu calls the constructor of MyWaveNet and then calls setEmbeddings.
wavenet_server.cc:
#include "wavenet_infer.h"
void readArrayFromBinary(void* array, size_t len, size_t num_bytes_per_elem, const char* file_name) {
FILE* file = fopen(file_name, "rb");
fread(array, num_bytes_per_elem, len, file);
fclose(file);
}
void setEmbeddingCurr(const char* fileName, size_t len) {
this->embedding_curr = (float*)malloc(sizeof(float) * len);
readArrayFromBinary((void*)this->embedding_curr, len, sizeof(float), fileName);
}
void setWavenet(void) {
this->wavenet = make_wavenet(this->num_samples,
this->batch_size,
this->embedding_prev,
this->embedding_curr,
this->num_layers,
this->max_dilation,
this->dilate_weights_prev,
this->dilate_weights_curr,
this->dilate_biases,
this->res_weights,
this->res_biases,
this->skip_weights,
this->skip_biases,
this->conv_out,
this->conv_end,
this->is_using_embed_tanh,
this->implementation);
}
wavenet_infer.cu:
#include "nv_wavenet.cuh"
typedef nvWavenetInfer<float,float, R, S, A> MyWaveNet;
void* make_wavenet(int sample_count,
int batch_size,
float* embedding_prev,
float* embedding_curr,
int num_layers,
int max_dilation,
float** in_layer_weights_prev,
float** in_layer_weights_curr,
float** in_layer_biases,
float** res_layer_weights,
float** res_layer_biases,
float** skip_layer_weights,
float** skip_layer_biases,
float* conv_out_weight,
float* conv_end_weight,
bool use_embed_tanh,
int implementation
) {
MyWaveNet* wavenet = new MyWaveNet(num_layers, max_dilation, batch_size, sample_count,
implementation, use_embed_tanh);
wavenet->setEmbeddings(embedding_prev, embedding_curr);
// We didn't use biases on our outputs
std::vector<float> dummy_bias_first(S, 0);
std::vector<float> dummy_bias_second(A, 0);
wavenet->setOutWeights(conv_out_weight,
dummy_bias_first.data(),
conv_end_weight,
dummy_bias_second.data());
for (int l = 0; l < num_layers; l++) {
wavenet->setLayerWeights(l, in_layer_weights_prev[l],
in_layer_weights_curr[l],
in_layer_biases[l],
res_layer_weights[l],
res_layer_biases[l],
skip_layer_weights[l],
skip_layer_biases[l]);
}
return (void*)wavenet;
}
nv_wavenet.cuh:
nvWavenetInfer (int numLayers, int maxDilation, int batchSize, int numSamples, int impl=0, bool tanhEmbed=true) : m_numLayers(numLayers), m_maxBatch(batchSize), m_maxSamples(numSamples), m_implementation((nvWavenetInfer::Implementation)impl), m_tanhEmbed(tanhEmbed) {
m_maxDilation = maxDilation;
/*
gpuErrChk(cudaMalloc(&m_yOut, numSamples*batchSize*sizeof(int))); // one-hot vector represented as single value indicating which value is set
gpuErrChk(cudaMemset(m_yOut, 0, numSamples*batchSize*sizeof(int)));
*/
gpuErrChk(cudaMalloc(&m_outputSelectors, numSamples*batchSize*sizeof(float)));
gpuErrChk(cudaMalloc(&m_embedPrev, A*R*sizeof(T_data)));
gpuErrChk(cudaMalloc(&m_embedCur, A*R*sizeof(T_data)));
gpuErrChk(cudaMalloc(&m_Wprev, numLayers*2*R*R*sizeof(T_weight)));
gpuErrChk(cudaMalloc(&m_Wcur, numLayers*2*R*R*sizeof(T_data)));
gpuErrChk(cudaMalloc(&m_Bh, numLayers*2*R*sizeof(T_data)));
gpuErrChk(cudaMalloc(&m_Lh, numSamples*numLayers*batchSize*2*R*sizeof(T_data)));
gpuErrChk(cudaMalloc(&m_Wres, numLayers*R*R*sizeof(T_data)));
gpuErrChk(cudaMalloc(&m_Bres, numLayers*R*sizeof(T_data)));
gpuErrChk(cudaMalloc(&m_Wskip, numLayers*S*R*sizeof(T_data)));
gpuErrChk(cudaMalloc(&m_Bskip, numLayers*S*sizeof(T_data)));
gpuErrChk(cudaMalloc(&m_XtOut, numLayers*R*batchSize*sizeof(T_data)));
gpuErrChk(cudaMalloc(&m_skipOut, numLayers*S*batchSize*sizeof(T_data)));
// For now, just burn memory as though all layers had the maximum dilation value
gpuErrChk(cudaMalloc(&m_XtIn, (m_maxDilation+1)*(numLayers+1)*R*batchSize*sizeof(T_data)));
gpuErrChk(cudaMalloc(&m_hOut, numLayers*batchSize*R*sizeof(T_data)));
gpuErrChk(cudaMalloc(&m_aPrev, numLayers*batchSize*2*R*sizeof(T_data)));
gpuErrChk(cudaMalloc(&m_skipIn, numLayers*S*batchSize*sizeof(T_data)));
gpuErrChk(cudaMalloc(&m_skipOutFinalAccumulate, A*batchSize*S/R*sizeof(T_data)));
gpuErrChk(cudaMalloc(&m_outAccumulate, A*batchSize*A/R*sizeof(T_data)));
gpuErrChk(cudaMalloc(&m_yInPrev, batchSize*sizeof(int))); // one-hot vector represented as single value indicating which value is set
gpuErrChk(cudaMalloc(&m_yInCur, batchSize*sizeof(int))); // one-hot vector represented as single value indicating which value is set
gpuErrChk(cudaMalloc(&m_WskipOut, A*S*sizeof(T_data)));
gpuErrChk(cudaMalloc(&m_BskipOut, A*sizeof(T_data)));
gpuErrChk(cudaMalloc(&m_Wout, A*A*sizeof(T_data)));
gpuErrChk(cudaMalloc(&m_Bout, A*sizeof(T_data)));
gpuErrChk(cudaMalloc(&m_skipOutFinal, A*batchSize*S/R*sizeof(T_data)));
gpuErrChk(cudaMalloc(&m_out, A*batchSize*A/R*sizeof(T_data)));
gpuErrChk(cudaMalloc(&m_p, A*batchSize*sizeof(T_data)));
gpuErrChk(cudaMalloc(&m_h, numLayers*batchSize*R*sizeof(T_data)));
gpuErrChk(cudaMalloc(&m_hSample, numLayers*batchSize*sizeof(int)));
gpuErrChk(cudaMalloc(&m_ySample, batchSize*sizeof(int)));
if (impl == PERSISTENT) {
gpuErrChk(cudaMalloc(&m_skipOutFinalAccumulate, A*batchSize*S/R*sizeof(T_data)));
gpuErrChk(cudaMalloc(&m_outAccumulate, A*batchSize*A/R*sizeof(T_data)));
}
}
virtual void setEmbeddings (float* embedPrev, float* embedCur) {
setActivation(m_embedPrev, embedPrev, A*R);
setActivation(m_embedCur, embedCur, A*R);
}
void setActivation(float* dst, float* src, size_t size) {
gpuErrChk(cudaMemcpy(dst, src, size*sizeof(float), cudaMemcpyHostToDevice));
}

Turns out that cudaMemcpy was not the issue. When examining device global memory using cuda-gdb, one cannot do: x/10fw float_array. It will give incorrect values. To view the values, try this instead: p ((@global float*) float_array)[0]@10
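For completeness, the copy can also be verified without cuda-gdb by copying the buffer straight back to the host and comparing it against the source. A minimal sketch, assuming the gpuErrChk macro from nv_wavenet.cuh; checkActivation is a hypothetical helper, not part of the original code:
#include <cstdio>
#include <vector>
// Hypothetical helper: read back a device buffer and print the first elements
// next to the host source they were copied from.
void checkActivation(const float* d_dst, const float* h_src, size_t size) {
    std::vector<float> back(size);
    gpuErrChk(cudaMemcpy(back.data(), d_dst, size*sizeof(float), cudaMemcpyDeviceToHost));
    for (size_t i = 0; i < size && i < 10; i++) {
        printf("dst[%zu]=%f src[%zu]=%f\n", i, back[i], i, h_src[i]); // should match element-wise
    }
}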

Related

Zero padding on the fly with cuFFT

I have a float array and want to FFT a given amount of data from it, zero-padding each segment to a length of 2^N. I also want to overlap the segments by a selectable factor.
So far I have a CUDA kernel that builds a second array holding the overlapped and padded data; afterwards a cufftPlanMany plan is executed on it.
Because of these two factors the amount of data becomes very large, and since it consists only of copies of the original data plus zeros, I waste my entire memory bandwidth on it.
I could not find anything on whether cuFFT supports zero padding directly, or whether there is a way to hook in custom code.
(Nvidia Quadro P5000, C++14, Kubuntu)
Update
I have written a callback function which is called when the data is loaded into the FFT. Unfortunately this is still a little slower than my previous solution with a kernel that prepares the data in another array and then calls the FFT.
I need an average of 2.4 ms for the example with the given values.
My hope was that if I process the data on the fly, my memory bandwidth would no longer limit me. Unfortunately that does not seem to be the case at the moment.
Does anyone have an idea how I can speed this up even more?
// Don't forget to link against cufft_static (not cufft) and culibos, and to compile with the -dc flag
#include <stdio.h>
#include <cstdint>
#include <unistd.h>
#include <cuda_runtime.h>
#include <cufft.h>
#include <cufftXt.h>
#include <math.h>
struct fft_CB_LD_callerInfo{
uint16_t rgLen;
uint16_t rgDataLen;
uint16_t overlapFactor;
};
static __device__ cufftReal myOwnCallback(void *dataIn,
size_t offset,
void *callerInfo,
void *sharedPtr) {
const fft_CB_LD_callerInfo *fftInfo = (fft_CB_LD_callerInfo*)callerInfo;
int idx_rg = offset/fftInfo->rgLen;
int idx_realRg = idx_rg/fftInfo->overlapFactor;
int idx_posInRg = offset-(size_t)idx_rg*fftInfo->rgLen;
if(idx_posInRg < fftInfo->rgDataLen){
const size_t idx_data = idx_posInRg
+ idx_realRg*fftInfo->rgDataLen
+ idx_rg - (idx_realRg*fftInfo->overlapFactor)*fftInfo->rgDataLen/fftInfo->overlapFactor;
return ((cufftReal*)dataIn)[idx_data];
}
else{
return 0.0f;
}
}
__device__ cufftCallbackLoadR myOwnCallbackPtr = myOwnCallback;
int main(){
// Data
float *dataHost;
float *data;
cufftComplex *spectrum;
cufftComplex *spectrumHost;
unsigned int rgDataLen = 400;
unsigned int rgLen = 2048;
unsigned int overlap = 8;
int peakPosHost[] = {0};
int *peakPos;
unsigned int rgCountClean = 52*16*4;
unsigned int rgCount = rgCountClean*overlap-(overlap-1);
int peakCountHost = 1;
int *peakCount;
// for FFT
cudaStream_t stream;
cufftHandle plan;
cufftResult result;
int fftRank = 1; // --- 1D FFTs
int fftIRide = 1, fftORide = 1; // --- Distance between two successive input/output elements
int fftInembed[] = { 0 }; // --- Input size with pitch (ignored for 1D transforms)
int fftOnembed[] = { 0 }; // --- Output size with pitch (ignored for 1D transforms)
int fftEachLen[] = { (int)rgLen }; // --- Size of the Fourier transform
int fftIDist = rgLen;
int fftODist = rgLen/2+1; // --- Distance between batches
// for Custom callback
cufftCallbackLoadR hostCopyOfCallbackPtr;
size_t worksize;
fft_CB_LD_callerInfo *fftInfo;
fft_CB_LD_callerInfo *fftInfoHost;
// Allocate host memory
dataHost = new float[rgDataLen*rgCountClean*peakCountHost];
spectrumHost = new cufftComplex[fftODist*rgCount];
fftInfoHost = new fft_CB_LD_callerInfo;
// create array with example data
for(int k=0; k<rgDataLen;k++){
for(int i=0; i<rgCountClean; i++){
dataHost[i*rgDataLen + k] = sin((2+i*4)*M_PI*k/rgDataLen);
}
}
fftInfoHost->overlapFactor = overlap;
fftInfoHost->rgDataLen = rgDataLen;
fftInfoHost->rgLen = rgLen;
// allocate device memory
cudaMalloc((void **)&data, sizeof(float) * rgDataLen*rgCountClean*peakCountHost);
cudaMalloc((void **)&peakPos, sizeof(int) * peakCountHost);
cudaMalloc((void **)&peakCount, sizeof(int));
cudaMalloc((void **)&spectrum, sizeof(cufftComplex)*fftODist*rgCount);
cudaMalloc((void **)&fftInfo, sizeof(fft_CB_LD_callerInfo));
// copy data from host to device
cudaMemcpy(data, dataHost, sizeof(float)*rgDataLen*rgCountClean*peakCountHost, cudaMemcpyHostToDevice);
cudaMemcpy(peakPos, peakPosHost, sizeof(int)*peakCountHost, cudaMemcpyHostToDevice);
cudaMemcpy(peakCount, &peakCountHost, sizeof(peakCountHost), cudaMemcpyHostToDevice);
cudaMemcpy(fftInfo, fftInfoHost, sizeof(fft_CB_LD_callerInfo), cudaMemcpyHostToDevice);
// get device pointer to custom callback function
cudaError_t error = cudaMemcpyFromSymbol(&hostCopyOfCallbackPtr, myOwnCallbackPtr, sizeof(hostCopyOfCallbackPtr));
if(error != 0) printf("cudaMemcpyFromSymbol failed with %d!\n", (int)error);
// Create a plan of FFTs so they can be executed quickly later
cufftCreate(&plan);
result = cufftMakePlanMany(plan, fftRank, fftEachLen, fftInembed, fftIRide, fftIDist, fftOnembed, fftORide, fftODist, CUFFT_R2C, rgCount, &worksize);
if(result != CUFFT_SUCCESS) printf("cufftMakePlanMany failed with %d!\n", (int)result);
result = cufftXtSetCallback(plan, (void**)&hostCopyOfCallbackPtr, CUFFT_CB_LD_REAL, (void**)&fftInfo);
if(result != CUFFT_SUCCESS) printf("cufftXtSetCallback failed with %d!\n", (int)result);
// ----- Begin test area ---------------------------------------------------
if(cufftExecR2C(plan, data, spectrum) != CUFFT_SUCCESS)
printf("cufftExecR2C is failed!\n");
// ----- End test area ---------------------------------------------------
return 0;
}
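A minimal timing sketch (not part of the original post): wrapping the transform in CUDA events reproduces the ~2.4 ms measurement quoted above. It assumes the plan, data, and spectrum set up in main():
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
cufftExecR2C(plan, data, spectrum);
cudaEventRecord(stop);
cudaEventSynchronize(stop); // wait for the transform to finish
float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);
printf("cufftExecR2C took %.3f ms\n", ms);
cudaEventDestroy(start);
cudaEventDestroy(stop);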

Does bool variable in kernel need to be synchronized

I have a kernel consisting of a for loop that searches through an array for a specific int value. I'm using a grid block of 256 threads to do this. However, when one thread finds the value, I want to let the other threads know so they can exit. Currently I'm using a boolean flag, but I'm not sure if it's working properly. My concern is synchronization.
__device__ bool found;
__global__
void search()
{
for(int i = threadIdx.x; i<1000000; i += stride)
{
if(found == true)
{
break;
}
else if(arr[i] == x)
{
found = true;
break;
}
}
}
int main()
{
bool flag = false;
cudaMemcpyToSymbol(found, &flag, sizeof(bool), 0,cudaMemcpyHostToDevice);
}
As pointed out in comments, you can probably achieve what you want by declaring the global device flag to be volatile, which will inhibit caching, and by using a memory fence function. There really isn't a global synchronization primitive which would do what you want other than the new grid synchronization mechanism introduced in CUDA 9 and new hardware, but that probably isn't necessary in this case. Turning your pseudocode into a toy example:
#include <iostream>
#include <thrust/device_vector.h>
__device__ volatile bool found;
__device__ volatile size_t idx;
template<bool docheck>
__global__
void search(const int* arr, int x, size_t N)
{
size_t i = threadIdx.x + blockIdx.x * blockDim.x;
size_t stride = blockDim.x * gridDim.x;
for(; (i<N) && (!found); i += stride)
{
if(arr[i] == x)
{
if (docheck) found = true;
idx = i;
__threadfence();
break;
}
}
}
int main()
{
const size_t N = 1 << 24;
const size_t findidx = 280270;
const int findval = 0xdeadbeef;
thrust::device_vector<int> data(N,1);
data[findidx] = findval;
bool flag = false;
size_t zero = 0;
{
cudaMemcpyToSymbol(found, &flag, sizeof(bool));
cudaMemcpyToSymbol(idx, &zero, sizeof(size_t));
int blocks, threads;
cudaOccupancyMaxPotentialBlockSize(&blocks, &threads, search<false>);
search<false><<<blocks, threads>>>(thrust::raw_pointer_cast(data.data()), findval, N);
cudaDeviceSynchronize();
size_t result = 0;
cudaMemcpyFromSymbol(&result, idx, sizeof(size_t));
std::cout << "result = " << result << std::endl;
}
{
cudaMemcpyToSymbol(found, &flag, sizeof(bool));
cudaMemcpyToSymbol(idx, &zero, sizeof(size_t));
int blocks, threads;
cudaOccupancyMaxPotentialBlockSize(&blocks, &threads, search<true>);
search<true><<<blocks, threads>>>(thrust::raw_pointer_cast(data.data()), findval, N);
cudaDeviceSynchronize();
size_t result = 0;
cudaMemcpyFromSymbol(&result, idx, sizeof(size_t));
std::cout << "result = " << result << std::endl;
}
return 0;
}
and profiling it gives the following:
$ nvcc -arch=sm_52 -o notify notify.cu
$ nvprof ./notify
==3916== NVPROF is profiling process 3916, command: ./notify
result = 280270
result = 280270
==3916== Profiling application: ./notify
==3916== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 78.00% 1.6773ms 1 1.6773ms 1.6773ms 1.6773ms void search<bool=0>(int const *, int, unsigned long)
19.93% 428.63us 1 428.63us 428.63us 428.63us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__uninitialized_fill::functor<thrust::device_ptr<int>, int>, unsigned long>, thrust::cuda_cub::__uninitialized_fill::functor<thrust::device_ptr<int>, int>, unsigned long>(thrust::device_ptr<int>, int)
1.82% 39.199us 1 39.199us 39.199us 39.199us void search<bool=1>(int const *, int, unsigned long)
As you can see, the version which sets the found flag completes the search in about 40 microseconds, whereas the version which does not set the flag takes 1.7 milliseconds. Given that the kernel is run with the maximum number of resident blocks in both cases, we can conclude that the early-exit mechanism worked correctly and that the running blocks detected that the required value had been found.
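If the volatile-based flag feels fragile, the same early exit can be written with a plain int flag and atomic operations, which have well-defined visibility across blocks. The following is a sketch of that alternative (mine, not part of the original answer); note that the per-iteration atomic read costs some extra memory traffic:
__device__ int found_flag; // 0 = not found, 1 = found

__global__ void search_atomic(const int* arr, int x, size_t N)
{
    size_t i = threadIdx.x + blockIdx.x * blockDim.x;
    size_t stride = blockDim.x * gridDim.x;
    // atomicAdd(&found_flag, 0) performs an atomic read of the flag
    for (; (i < N) && (atomicAdd(&found_flag, 0) == 0); i += stride)
    {
        if (arr[i] == x)
        {
            atomicExch(&found_flag, 1); // atomically publish "found" to all blocks
            break;
        }
    }
}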

cudaMemcpy error from Device to Host

I am returning a two-dimensional structure after computation on a kernel, from device to host.
HANDLE_ERROR(cudaMemcpy(Pixel,Pixel_gpu,img_wd*img_ht*sizeof(pixel),cudaMemcpyDeviceToHost));
Pixel is declared on host, Pixel_gpu is allocated on device, as below:
pixel **Pixel_gpu;
HANDLE_ERROR(cudaMalloc(&Pixel_gpu,img_wd*img_ht*sizeof(pixel)));
pixel **Pixel = (pixel**)malloc((img_ht)*sizeof(pixel*));
for(int i=0;i<(img_ht);i++)
Pixel[i]=(pixel*)malloc((img_wd)*sizeof(pixel));
Using this I end up getting an illegal memory access error.
Trying a similar memory alignment for the result doesn't help either:
pixel *Pixel_res = (pixel*)malloc(img_wd*img_ht*sizeof(pixel));
HANDLE_ERROR(cudaMemcpy(Pixel_res,Pixel_gpu,img_wd*img_ht*sizeof(pixel),cudaMemcpyDeviceToHost));
Kernel launching:
cudaDeviceProp prop;
HANDLE_ERROR(cudaGetDeviceProperties(&prop, 0));
int thread_block=sqrt(prop.maxThreadsPerBlock);
dim3 DimGrid(ceil(img_wd/thread_block),ceil(img_ht/thread_block),1);
dim3 DimBlock(sqrt(prop.maxThreadsPerBlock),sqrt(prop.maxThreadsPerBlock),1);
//allocating gpu memory
pixel **Pixel_tmp_gpu, **Pixel_gpu;
HANDLE_ERROR(cudaMalloc(&Pixel_tmp_gpu,img_wd*img_ht*sizeof(pixel)));
HANDLE_ERROR(cudaMalloc(&Pixel_gpu,img_wd*img_ht*sizeof(pixel)));
float **kernel0_gpu, **kernel1_gpu;
HANDLE_ERROR(cudaMalloc(&kernel0_gpu,k*1*sizeof(float)));
HANDLE_ERROR(cudaMalloc(&kernel1_gpu,1*k*sizeof(float)));
cout<<"memory allocated"<<endl;
//copying needed data
HANDLE_ERROR(cudaMemcpy(Pixel_tmp_gpu,Pixel_tmp,img_wd*img_ht*sizeof(pixel),cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaMemcpy(Pixel_gpu,Pixel,img_wd*img_ht*sizeof(pixel),cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaMemcpy(kernel0_gpu,kernel0,k*1*sizeof(float),cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaMemcpy(kernel1_gpu,kernel1,1*k*sizeof(float),cudaMemcpyHostToDevice));
cout<<"memory transfers done"<<endl;
vertical_conv<<<DimGrid,DimBlock>>>(Pixel_gpu, Pixel_tmp_gpu,img_wd, img_ht,kernel0_gpu,k);
time_t vertical_convolution=time(NULL);
cout<<" vertical_convolution time: "<<double(vertical_convolution - reading_file)<<"sec"<<endl;
horizontal_conv<<<DimGrid,DimBlock>>>(Pixel_tmp_gpu, Pixel_gpu, img_wd, img_ht, kernel1_gpu, k);
time_t horizontal_convolution=time(NULL);
cout<<" horizontal convolution time:" <<double(horizontal_convolution-vertical_convolution)<<" sec"<<endl;
pixel *Pixel_res = (pixel*)malloc(img_wd*img_ht*sizeof(pixel));
HANDLE_ERROR(cudaMemcpy(Pixel_res,Pixel_gpu,img_wd*img_ht*sizeof(pixel),cudaMemcpyDeviceToHost));
The functions used:
struct pixel //to store RGB values
{
unsigned char r;
unsigned char g;
unsigned char b;
};
static void HandleError( cudaError_t err, const char *file, int line ) {
if (err != cudaSuccess) {
cout<<cudaGetErrorString(err)<<" in "<< file <<" at line "<< line<<endl;
}
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
__device__ void padding(pixel** Pixel_val, int x_coord, int y_coord, int img_width, int img_height, pixel Px) //padding the image,depending on pixel coordinates, can be replaced by reflect for better result //currently zero padding
{
if(x_coord<img_width && y_coord<img_height && x_coord>=0 && y_coord>=0)
Px=Pixel_val[y_coord][x_coord];
}
The vertical convolution:
__global__ void vertical_conv(pixel** Pixel_in, pixel** Pixel_out,int img_wd, int img_ht, float** kernel, int k)
{
float tmp_r, tmp_g, tmp_b;
pixel pix_val;
pix_val.r=0;pix_val.g=0;pix_val.b=0;
int row=blockIdx.y*blockDim.y + threadIdx.y;
int col = blockIdx.x*blockDim.x + threadIdx.x;
if(row<img_ht && col<img_wd){
tmp_r=0, tmp_g=0, tmp_b=0;
for(int l=0;l<k;l++)
{
padding(Pixel_in, col, row+l-(k-1)/2, img_wd, img_ht, pix_val);
tmp_r+=pix_val.r * kernel[l][0];
tmp_b+=pix_val.b * kernel[l][0];
tmp_g+=pix_val.g * kernel[l][0];
}
Pixel_out[row][col].r=tmp_r;
Pixel_out[row][col].g=tmp_g;
Pixel_out[row][col].b=tmp_b;
}
}
The horizontal convolution:
__global__ void horizontal_conv(pixel** Pixel_in, pixel** Pixel_out, int img_wd, int img_ht, float** kernel, int k)
{
float tmp_r, tmp_b, tmp_g;
pixel pix_val;
pix_val.r=0;pix_val.g=0;pix_val.b=0;
//horizontal convolution
int row=blockIdx.y*blockDim.y + threadIdx.y;
int col = blockIdx.x*blockDim.x + threadIdx.x;
tmp_r=0, tmp_g=0, tmp_b=0;
if(row<img_ht && col<img_wd)
{
for(int l=0; l<k;l++)
{
padding(Pixel_in, col+l-(k-1)/2, row, img_wd, img_ht, pix_val);
tmp_r+=pix_val.r * kernel[0][l];
tmp_g+=pix_val.g * kernel[0][l];
tmp_b+=pix_val.b * kernel[0][l];
}
Pixel_out[row][col].r=tmp_r;
Pixel_out[row][col].g=tmp_g;
Pixel_out[row][col].b=tmp_b;
}
}
Can someone help me know what could be wrong here?
Pixel_gpu is one contiguous memory block, consisting of img_wd*img_ht elements of type pixel. Its size is
sizeOfDeviceMemory = img_wd * img_ht * sizeof(pixel)
In contrast to that, Pixel on the CPU side is an "array of pointers": the Pixel pointer points to img_ht elements of type pixel*. Its size is
sizeOfHostMemory = img_ht * sizeof(pixel*)
Clearly, these sizes are different, and trying to write sizeOfDeviceMemory bytes to this pointer causes an illegal access.
Usually, you should allocate your memory on the host as one contiguous block as well:
pixel* Pixel = (pixel*)malloc(img_wd * img_ht * sizeof(pixel));
Then you can copy the memory to this pointer using the cudaMemcpy call that you already have.
If having a pixel* on the host is not OK for you, and you urgently need a pixel** (for example, to pass it to some other function), then you can create an "array of pointers" like you had before, but instead of allocating new memory for each row, let each pointer point to one "row" of the single, contiguous pixel block, as sketched below.
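A minimal sketch of that last variant, using the names from the question (my illustration, not the answerer's code): allocate one contiguous block and let the row pointers index into it:
pixel* block = (pixel*)malloc(img_wd * img_ht * sizeof(pixel)); // single contiguous allocation
pixel** Pixel = (pixel**)malloc(img_ht * sizeof(pixel*));       // array of row pointers
for (int i = 0; i < img_ht; i++)
    Pixel[i] = block + i * img_wd; // row i starts at offset i*img_wd inside the block
// Pixel[y][x] still works on the host, and the device copy targets the contiguous block:
HANDLE_ERROR(cudaMemcpy(block, Pixel_gpu, img_wd * img_ht * sizeof(pixel), cudaMemcpyDeviceToHost));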

Adding values on GPU

I have a class called Product.
Each product has a value and I want to add these values on the GPU. I filled my array on the host side:
int * h_A, * d_A;
h_A = (int*) malloc(enterNum * sizeof(int));
cudaMalloc((void **) &d_A, enterNum * sizeof(int));
Product p("Product", price);
h_A[i] = p.getValue();
while (i < enterNum) {
i++;
cout << "Enter product name:";
cin >> desc;
cout << "Enter product price:";
cin >> price;
Product p("Product", price);
h_A[i] = p.getValue();
}
cudaMemcpy(d_A, h_A, enterNum * sizeof(int), cudaMemcpyHostToDevice);
priceSum<<<enterNum, 1024>>>(d_A,enterNum,result);
int result2 = 0;
cudaMemcpy(result, result2, enterNum, cudaMemcpyDeviceToHost);
Here the cudaMemcpy call gives an error because I don't use a pointer. What can I do here? I don't need to use a pointer here, do I?
this is my summation function:
__global__ void priceSum(int *dA, int count, int result) {
int tid = blockIdx.x;
if (tid < count){
result+= dA[tid];
}
}
full code:
using namespace std;
#include "cuda_runtime.h"
#include <stdio.h>
#include <string.h>
#include <iostream>
#include <stdlib.h>
class Product {
private:
char * description;
int productCode;
int value;
static int lastCode;
public:
Product(char* descriptionP, int valueP) {
productCode = ++lastCode;
value = valueP;
description = new char[strlen(descriptionP) + 1];
strcpy(description, descriptionP);
}
Product(Product& other) {
productCode = ++lastCode;
description = new char[strlen(other.description) + 1];
strcpy(description, other.description);
}
~Product() {
delete[] description;
}
char* getDescription() const {
return description;
}
void setDescription(char* description) {
this->description = description;
}
int getValue() const {
return value;
}
void setValue(int value) {
this->value = value;
}
};
int Product::lastCode = 1000;
__global__ void priceSum(int *dA, int count, int * result) {
int tid = blockIdx.x;
if (tid < count)
result+= dA[tid];
}
int main(void) {
int enterNum, price, * result = 0;
string desc;
const char * desc2;
cout << "How many products do you want to enter?";
cin >> enterNum;
int * h_A, * d_A;
h_A = (int*) malloc(enterNum * sizeof(int));
cudaMalloc((void **) &d_A, enterNum * sizeof(int));
int i = 0;
while (i < enterNum) {
cout << "Enter product name:";
cin >> desc;
cout << "Enter product price:";
cin >> price;
Product p("Product", price);
h_A[i] = p.getValue();
i++;
}
cudaMemcpy(d_A, h_A, enterNum * sizeof(int), cudaMemcpyHostToDevice);
priceSum<<<enterNum, 1>>>(d_A,enterNum,result);
int result2 = 0;
cudaMemcpy(&result2, result, enterNum, cudaMemcpyDeviceToHost);
cout << result2;
return 0;
}
You should show the definition of result in your host code, but I assume it is:
int result;
based on how you are passing it to your priceSum kernel.
You have more than 1 problem here.
In your priceSum kernel, you are summing the values in dA[] and storing the answer in result. But you have passed the variable result to the kernel by value instead of by reference so the value you are modifying is local to the function, and will not show up anywhere else. When a function in C needs to modify a variable that is passed to it via the parameter list, and the modified variable is to show up in the function calling context, it's necessary to pass that parameter by reference (i.e. using a pointer) rather than by value. Note this is based on the C programming language and is not specific to CUDA. So you should rewrite your kernel definition as:
__global__ void priceSum(int *dA, int count, int *result) {
Regarding your cudaMemcpy call, there are several issues that need to be cleaned up. First, we need the storage for result to be properly created using cudaMalloc (before the kernel is called, because the kernel will store something there.) Next, we need to fix the parameter list of the cudaMemcpy call itself. So your host code should be rewritten as:
cudaMemcpy(d_A, h_A, enterNum * sizeof(int), cudaMemcpyHostToDevice);
int *result;
cudaMalloc((void **)&result, sizeof(int));
priceSum<<<enterNum, 1024>>>(d_A,enterNum,result);
int result2 = 0;
cudaMemcpy(&result2, result, sizeof(int), cudaMemcpyDeviceToHost);
There appear to be other problems with your code, around the grouping of data for threads and blocks. But you haven't shown enough of your program for me to make sense of it. So let me point out that your code shows only a single value for result (and result2), yet the way your kernel is written, each thread will add its value of dA[tid] to result. You can't have a bunch of threads all updating a single value in global memory with no control mechanism, and expect to get a sensible result. Problems like this are usually best handled with a classical parallel reduction algorithm, but for the sake of simplicity, to try and get something working, you can use atomics:
atomicAdd(result, dA[tid]);
Sorry, but your kernel just makes no sense at all. You are using blockIdx.x as your tid variable, but let's note that blockIdx.x is a number that is the same for every thread in a particular block. So then going on to have every thread add dA[tid] to result in this fashion just doesn't make sense. I believe it will make more sense if you change your kernel invocation to:
priceSum<<<enterNum, 1>>>(d_A,enterNum,result);

CUDA random number generating

I would like to generate random numbers in my __device__ function and keep them in my int Board[500]. I found some examples, but they used a type named curandState. I only need a function like rand() in C++.
Here is my code: I have an N3[40000] array in device memory, I generate some random numbers in my kernel running on one thread (I mean this: kernel<<<1,1>>>...), then I copy it to my N2[40000] on the CPU and print it.
#include <iostream>
#include <Cuda.h>
#include<curand.h>
#include<curand_kernel.h>
int n = 200;
using namespace std;
__device__ float generate( curandState* globalState, int ind )
{
//int ind = threadIdx.x;
curandState localState = globalState[ind];
float RANDOM = curand_uniform( &localState );
globalState[ind] = localState;
return RANDOM;
}
__global__ void setup_kernel ( curandState * state, unsigned long seed )
{
int id = threadIdx.x;
curand_init ( seed, id, 0, &state[id] );
}
__global__ void kernel(float* N, curandState* globalState, int n)
{
// generate random numbers
for(int i=0;i<40000;i++)
{
int k = generate(globalState, i) * 100000;
while(k > n*n-1)
{
k-=(n*n-1);
}
N[i] = k;
}
}
int main()
{
int N=40000;
curandState* devStates;
cudaMalloc ( &devStates, N*sizeof( curandState ) );
// setup seeds
setup_kernel <<< 1, N >>> ( devStates,unsigned(time(NULL)) );
float N2[40000];
float* N3;
cudaMalloc((void**) &N3, sizeof(float)*N);
kernel<<<1,1>>> (N3, devStates, n);
cudaMemcpy(N2, N3, sizeof(float)*N, cudaMemcpyDeviceToHost);
for(int i=0;i<N;i++)
{
cout<<N2[i]<<endl;
}
return 0;
}
You may use the cuRAND host API to generate random numbers directly in device memory and then run your kernel without even having to copy those values to the host.
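For example, the host-side generator API can fill N3 directly on the device before the kernel runs. A short sketch (mine, not from the original answer), reusing N3 and N from the question:
curandGenerator_t gen;
curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
curandSetPseudoRandomGeneratorSeed(gen, (unsigned long long)time(NULL));
curandGenerateUniform(gen, N3, N); // fills N3 with uniform floats in (0, 1]
curandDestroyGenerator(gen);
// kernel<<<1,1>>>(N3, ...) can now consume the values without any curandState setup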