CUDA cudaMemcpy2D not giving expected results [duplicate] - cuda

How do I initialize a device array which is allocated using cudaMalloc()?
I tried cudaMemset, but it fails for any value other than 0. My call to cudaMemset looks like the line below, where value is initialized to 5.
cudaMemset(devPtr,value,number_bytes)

As you are discovering, cudaMemset works like the C standard library memset. Quoting from the documentation:
cudaError_t cudaMemset ( void * devPtr,
int value,
size_t count
)
Fills the first count bytes of the memory area pointed to by devPtr
with the constant byte value value.
So value is a byte value. If you do something like:
int *devPtr;
cudaMalloc((void **)&devPtr,number_bytes);
const int value = 5;
cudaMemset(devPtr,value,number_bytes);
what you are asking for is that each byte of devPtr will be set to 5. If devPtr is an array of integers, the result is that each integer word will have the value 84215045 (0x05050505). This is probably not what you had in mind.
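To see where that number comes from, here is a tiny host-side sketch (my illustration, not part of the original answer) using the ordinary C memset, which has the same byte-wise semantics:
#include <cstdio>
#include <cstring>

int main()
{
    int word;
    memset(&word, 5, sizeof(word));        // every byte of 'word' becomes 0x05
    printf("%d (0x%08X)\n", word, word);   // prints 84215045 (0x05050505)
    return 0;
}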
Using the runtime API, what you could do is write your own generic kernel to do this. It could be as simple as
template<typename T>
__global__ void initKernel(T * devPtr, const T val, const size_t nwords)
{
int tidx = threadIdx.x + blockDim.x * blockIdx.x;
int stride = blockDim.x * gridDim.x;
for(; tidx < nwords; tidx += stride)
devPtr[tidx] = val;
}
(standard disclaimer: written in browser, never compiled, never tested, use at own risk).
Just instantiate the template for the types you need and call it with a suitable grid and block size, paying attention to the last argument now being a word count, not a byte count as in cudaMemset. This isn't really any different from what cudaMemset does anyway; using that API call results in a kernel launch which isn't much different from what I posted above.
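For example, a launch might look like this (a sketch only; the 256-thread block size and the grid calculation are arbitrary choices on my part):
int *devPtr;
const size_t nwords = 1 << 20;
cudaMalloc((void **)&devPtr, nwords * sizeof(int));

const int value = 5;
const int threads = 256;   // arbitrary block size
const int blocks = (int)((nwords + threads - 1) / threads);
initKernel<int><<<blocks, threads>>>(devPtr, value, nwords);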
Alternatively, if you can use the driver API, there are cuMemsetD16 and cuMemsetD32, which do the same thing but for half-word and full 32 bit word types. If you need to set 64 bit or larger types (so doubles or vector types), your best option is to use your own kernel.
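For reference, the driver API version might look like this (a sketch; dptr is assumed to have been allocated with cuMemAlloc and a context set up already):
#include <cuda.h>

CUdeviceptr dptr;                 // assumed allocated with cuMemAlloc
const size_t nwords = 1 << 20;
cuMemsetD32(dptr, 5, nwords);     // value is a full 32 bit word, count is in words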

I also needed a solution to this question and I didn't really understand the other proposed solution. In particular, I didn't understand why it iterates over the grid with for(; tidx < nwords; tidx += stride), how the kernel should be invoked, and why it uses the counter-intuitive word sizes.
Therefore I created a much simpler, monolithic generic kernel and customized it with a stride, i.e. you may use it to initialize a matrix in multiple ways, e.g. setting rows or columns to any value:
template <typename T>
__global__ void kernelInitializeArray(T* __restrict__ a, const T value,
const size_t n, const size_t incx) {
int tid = threadIdx.x + blockDim.x * blockIdx.x;
if (tid*incx < n) {
a[tid*incx] = value;
}
}
Then you may invoke the kernel like this:
template <typename T>
void deviceInitializeArray(T* a, const T value, const size_t n, const size_t incx) {
int number_of_blocks = ((n / incx) + BLOCK_SIZE - 1) / BLOCK_SIZE;
dim3 gridDim(number_of_blocks, 1);
dim3 blockDim(BLOCK_SIZE, 1);
kernelInitializeArray<T> <<<gridDim, blockDim>>>(a, value, n, incx);
}
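For instance (a sketch assuming BLOCK_SIZE is defined, e.g. 256; the matrix shape and names are illustrative, not from the original answer), you could fill a whole array or just the first column of a row-major matrix:
#define BLOCK_SIZE 256

float *d_mat;   // assumed allocated with cudaMalloc, rows*cols elements
const size_t rows = 1024, cols = 512;

// fill every element with 5.0f (stride 1)
deviceInitializeArray<float>(d_mat, 5.0f, rows * cols, 1);

// set the first column of the row-major matrix to 1.0f (stride = cols)
deviceInitializeArray<float>(d_mat, 1.0f, rows * cols, cols);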

Related

argument of type "int *" is incompatible with parameter of type "int" in cuda kernel call

I've been trying for a while and have come across seemingly similar issues already posted; however, for some reason I'm still failing to clear the error. I effectively want to pass a 2D matrix to the kernel as a 1D array, as I have seen suggested. I'm not sure where I've gone wrong in my syntax, but there is a clash between the variables I supply to the kernel and the parameters the kernel expects.
__global__ void calculatePath(int source, int target, int *cost, int distance){
int t_id = blockIdx.x * blockDim.x + threadIdx.x;
int dist[50];
int prev[50];
int selected[50]={0};
int num_path[50];
int d, m, min, start, j;
if ((t_id > 0) && (t_id < N)){
dist[t_id] = IN;
prev[t_id] = -1;
}
This is my kernel function whose parameters are all integers except "cost" which is a pointer to an integer array.
int main(int argc, char **argv){
int h_num_path[N];
int h_distance = 0;
int h_cost[N][N],i,j,co;
int h_source;
int h_target;
printf("\tShortest Path Algorithm(DIJKSRTRA's ALGORITHM\n\n");
for(i=0;i< N;i++)
for(j=0;j< N;j++)
h_cost[i][j] = IN;
//*********************
srand ( time(NULL));
for(int x=1;x< N;x++) {
for (int y = x + 1; y < N; y++) {
h_cost[x][y] = h_cost[y][x] = (rand() % 100) + 1;
}
}
printf("\nEnter The Source: ");
scanf("%d", &h_source);
printf("\nEnter The target: ");
scanf("%d", &h_target);
int *d_num_path;
int *d_cost;
int *d_source;
int *d_target;
int *d_dist;
int *d_prev;
int *d_distance;
cudaMalloc(&d_num_path, sizeof(int)*N);
cudaMalloc(&d_cost, sizeof(int)*N*N);
cudaMalloc((void**) &d_source, sizeof(int));
cudaMalloc((void**) &d_target, sizeof(int));
cudaMalloc((void**) &d_dist, sizeof(int)*N);
cudaMalloc((void**) &d_distance, sizeof(int));
cudaMemcpy(d_source, &h_source, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_target, &h_target, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_cost, h_cost, sizeof(int)*N*N, cudaMemcpyHostToDevice);
cudaMemcpy(d_distance, &h_distance, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_num_path, &h_num_path, sizeof(int)*N, cudaMemcpyHostToDevice);
clock_t before;
before = clock();
calculatePath<<<N/512 + 1, 512>>>(d_source, d_target, d_cost, d_distance);
clock_t time_taken = clock() - before;
cudaMemcpy(&h_num_path, d_num_path, sizeof(int)*N, cudaMemcpyDeviceToHost);
cudaMemcpy(&h_distance, d_distance, sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(d_num_path);
cudaFree(d_cost);
cudaFree(d_source);
cudaFree(d_target);
cudaFree(d_dist);
cudaFree(d_prev);
cudaFree(d_distance);
printf("\nShortest Path: %d \n",co);
printf("%s %.4f %s", "Time taken:", time_taken/1000.0, "seconds");
return 0;
}
On the kernel call, however, I receive the error "argument of type 'int *' is incompatible with parameter of type 'int'", yet I believe my d_cost already is a pointer. I'd appreciate being set straight, as I'm sure I'm overlooking something small.
It is not d_cost you are having trouble with. The other three arguments are int*, but the corresponding parameters are declared as int.
The C Programming Language by K&R at page 25 says:
We will generally use parameter for a variable named in the parenthesized list in a function definition, and argument for the value used in a call of the function.
Since your source and target are just single integer values, you don't really need to define device-side variables for them. Just pass the integer values themselves as arguments. By doing so, you'll also get a performance improvement, as talonmies commented:
(With pass by value) there is constant memory cache broadcast within the kernel if it is done that way. Passing pointers for simple constants just increases latency by forcing every thread to dereference the pointer to retrieve the value from global memory, plus all the additional host side memory APIs to allocate them in the first place.
Also, you seem to expect the parameter distance to carry an output value of your kernel; in that case it must be declared as a pointer, so you can do a cudaMemcpyDeviceToHost after the kernel:
__global__ void calculatePath(int source, int target, int *cost, int *distance) // kernel definition
calculatePath<<< (N + 511) / 512, 512 >>>(h_source, h_target, d_cost, d_distance); // kernel launch
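To make the change concrete, a host-side sketch of what this implies (my own summary, not part of the original answer): the device-side copies of source and target disappear entirely.
// d_source and d_target are no longer needed
int *d_num_path, *d_cost, *d_distance;
cudaMalloc(&d_num_path, sizeof(int) * N);
cudaMalloc(&d_cost, sizeof(int) * N * N);
cudaMalloc(&d_distance, sizeof(int));

cudaMemcpy(d_cost, h_cost, sizeof(int) * N * N, cudaMemcpyHostToDevice);
cudaMemcpy(d_distance, &h_distance, sizeof(int), cudaMemcpyHostToDevice);

calculatePath<<<(N + 511) / 512, 512>>>(h_source, h_target, d_cost, d_distance);

cudaMemcpy(&h_distance, d_distance, sizeof(int), cudaMemcpyDeviceToHost);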
Three of your arguments need to be integers, but you are passing pointers to integers. You need to change your method signature:
__global__ void calculatePath(int *source, int *target, int *cost, int *distance)

Why does this CUDA example kernel have a for loop?

I have been looking at the following example from the official CUDA website:
http://docs.nvidia.com/cuda/cuda-samples/index.html#simple-cufft
Download here: http://developer.download.nvidia.com/compute/DevZone/C/Projects/x64/simpleCUFFT.zip
It contains the following kernel:
// Complex pointwise multiplication
static __global__ void ComplexPointwiseMulAndScale(Complex *a, const Complex *b, int size, float scale)
{
const int numThreads = blockDim.x * gridDim.x;
const int threadID = blockIdx.x * blockDim.x + threadIdx.x;
for (int i = threadID; i < size; i += numThreads)
{
a[i] = ComplexScale(ComplexMul(a[i], b[i]), scale);
}
}
My question is, why is there a for loop here? Doesn't CUDA launch an array of threads that run simultaneously? I removed the loop, replacing the kernel with the following code, and it produced the same output.
// Complex pointwise multiplication
static __global__ void ComplexPointwiseMulAndScale(Complex *a, const Complex *b, int size, float scale)
{
const int threadID = blockIdx.x * blockDim.x + threadIdx.x;
a[threadID] = ComplexScale(ComplexMul(a[threadID], b[threadID]), scale);
}
As this is an official example on the CUDA website, I imagine I must be missing something.
Your version is basically what happens when numThreads is equal to size (but only then).
What the official example does is the following: Suppose numThreads is equal to 4 (for simplicity, usually it will be much larger), and consider the array positions (both for a and b):
a or b                  x x x x x x x x
thread that works here  0 1 2 3 0 1 2 3
Then the first thread will work on all array positions divisible by 4, et cetera.
The problem with your version is that the caller of your function will have to make sure there are at least as many threads as there are elements. For example, if you call your version with a 1-D grid where both gridDim.x and blockDim.x are 2, but on vectors of length 8, then half of each vector isn't processed!
The official example works regardless - no matter how many threads the caller assigns to it, the entire vector will be processed.
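As an illustration (my own sketch, not from the CUDA sample; d_a and d_b are assumed device pointers to Complex arrays of length size), both of these launches process the whole array with the grid-stride kernel, the second simply making each thread loop over more elements:
const int size = 1 << 20;
// enough threads for one element each
ComplexPointwiseMulAndScale<<<(size + 255) / 256, 256>>>(d_a, d_b, size, 1.0f / size);
// far fewer threads: each thread handles many elements via the for loop
ComplexPointwiseMulAndScale<<<32, 256>>>(d_a, d_b, size, 1.0f / size);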

how to get the index of thrust foreach

I am trying to use thrust::for_each to give a device vector certain values.
Here is the code:
const uint N = 222222;
struct assign_functor
{
template <typename Tuple>
__device__
void operator()(Tuple t)
{
uint x = threadIdx.x + blockIdx.x * blockDim.x;
uint y = threadIdx.y + blockIdx.y * blockDim.y;
uint offset = x + y * blockDim.x * gridDim.x;
thrust::get<0>(t) = offset;
}
};
int main(int argc, char** argv)
{
thrust::device_vector <float> d_float_vec(N);
thrust::for_each(
thrust::make_zip_iterator(
thrust::make_tuple(d_float_vec.begin())
),
thrust::make_zip_iterator(
thrust::make_tuple(d_float_vec.end())
),
assign_functor()
);
std::cout<<d_float_vec[10]<<" "<<d_float_vec[N-2];
}
The output of d_float_vec[N-2] is supposed to be 222220, but it turns out to be 1036. What's wrong with my code?
I know I could use thrust::sequence to assign a sequence of values to the vector. I just want to know how to get the real index inside the thrust::for_each functor. Thanks!
As noted in comments, your approach is never likely to work because you have assumed a number of things about the way thrust::for_each works internally which are probably not true, including:
You are implicitly assuming that for_each uses a single thread to process each input element. This is almost certainly not the case; it is much more likely that thrust will process multiple elements per thread during the operation.
You are also assuming that execution happens in order, so that the Nth thread processes the Nth array element. That may not be the case, and execution may occur in an order which cannot be known a priori.
You are assuming for_each processes the whole input data set in a single kernel launch.
Thrust algorithms should be treated as black boxes whose internal operations are undefined, and no knowledge of them is required to implement user-defined functors. In your example, if you require a sequential index inside a functor, pass a counting iterator. One way to rewrite your example would be like this:
#include "thrust/device_vector.h"
#include "thrust/for_each.h"
#include "thrust/tuple.h"
#include "thrust/iterator/counting_iterator.h"
typedef unsigned int uint;
const uint N = 222222;
struct assign_functor
{
template <typename Tuple>
__device__
void operator()(Tuple t)
{
thrust::get<1>(t) = (float)thrust::get<0>(t);
}
};
int main(int argc, char** argv)
{
thrust::device_vector <float> d_float_vec(N);
thrust::counting_iterator<uint> first(0);
thrust::counting_iterator<uint> last = first + N;
thrust::for_each(
thrust::make_zip_iterator(
thrust::make_tuple(first, d_float_vec.begin())
),
thrust::make_zip_iterator(
thrust::make_tuple(last, d_float_vec.end())
),
assign_functor()
);
std::cout<<d_float_vec[10]<<" "<<d_float_vec[N-2]<<std::endl;
}
Here the counting iterator gets passed in a tuple along with the data array, allowing the functor access to a sequential index which corresponds to the data array entry it is dealing with.
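For this particular case, where the value written is just the index itself, a simpler alternative (not part of the original answer, but offered as a sketch) is thrust::sequence:
#include "thrust/device_vector.h"
#include "thrust/sequence.h"

thrust::device_vector<float> d_float_vec(N);
thrust::sequence(d_float_vec.begin(), d_float_vec.end());   // fills 0, 1, 2, ..., N-1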

CUDA: bad performance with shared memory and no parallelism

I'm trying to exploit shared memory in this kernel function, but the performance is not as good as I was expecting. This function is called many times in my application (about 1000 times or more), so I was thinking of exploiting shared memory to avoid the global memory latency. But apparently something is wrong, because my application has become really slow since I started using shared memory.
This is the kernel:
__global__ void AndBitwiseOperation(int* _memory_device, int b1_size, int* b1_memory, int* b2_memory){
int j = 0;
// index GPU - Transaction-wise
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int tid = threadIdx.x;
// shared variable
extern __shared__ int shared_memory_data[];
extern __shared__ int shared_b1_data[];
extern __shared__ int shared_b2_data[];
// copy from global memory into shared memory and sync threads
shared_b1_data[tid] = b1_memory[tid];
shared_b2_data[tid] = b2_memory[tid];
__syncthreads();
// AND each int bitwise
for(j = 0; j < b1_size; j++)
shared_memory_data[tid] = (shared_b1_data[tid] & shared_b2_data[tid]);
// write result for this block to global memory
_memory_device[i] = shared_memory_data[i];
}
The shared variables are declared extern because I don't know the size of b1 and b2, since they depend on the number of customers, which I only know at runtime (but both always have the same size).
This is how I call the kernel:
void Bitmap::And(const Bitmap &b1, const Bitmap &b2)
{
int* _memory_device;
int* b1_memory;
int* b2_memory;
int b1_size = b1.getIntSize();
// allocate memory on GPU
(cudaMalloc((void **)&b1_memory, _memSizeInt * SIZE_UINT));
(cudaMalloc((void **)&b2_memory, _memSizeInt * SIZE_UINT));
(cudaMalloc((void **)&_memory_device, _memSizeInt * SIZE_UINT));
// copy values on GPU
(cudaMemcpy(b1_memory, b1._memory, _memSizeInt * SIZE_UINT, cudaMemcpyHostToDevice ));
(cudaMemcpy(b2_memory, b2._memory, _memSizeInt * SIZE_UINT, cudaMemcpyHostToDevice ));
(cudaMemcpy(_memory_device, _memory, _memSizeInt * SIZE_UINT, cudaMemcpyHostToDevice ));
dim3 dimBlock(1, 1);
dim3 dimGrid(1, 1);
AndBitwiseOperation<<<dimGrid, dimBlock>>>(_memory_device, b1_size, b1_memory, b2_memory);
// return values
(cudaMemcpy(_memory, _memory_device, _memSizeInt * SIZE_UINT, cudaMemcpyDeviceToHost ));
// Free Memory
(cudaFree(b1_memory));
(cudaFree(b2_memory));
(cudaFree(_memory_device));
}
b1 and b2 are bitmaps with 4 bits per element. The number of elements depends on the number of customers. I also have a problem with the kernel's launch parameters, because if I add more blocks or threads, AndBitwiseOperation() stops giving me the correct result. With just 1 block and 1 thread per block the result is correct, but then the kernel does not run in parallel.
Any advice is welcome :)
Thank you
I did not really understand what your kernel is supposed to do.
You should read more about CUDA and GPU programming.
I tried to point out some of the mistakes.
Shared memory (sm) should reduce global memory reads.
Analyze your global memory (gm) read and write operations per thread:
a. You read global memory twice and write sm twice.
b. (Ignoring that the loop is nonsense and never uses its counter:) you read sm twice and write sm once.
c. You read sm once and write gm once.
So in total you have gained nothing. You could use global memory directly.
You use all threads to write out one value at the block index "i".
You should only use one thread to write this data out.
It makes no sense to output the same data from multiple threads; the writes just get serialized.
You use a loop and don't use the loop counter at all.
You write at "tid" and read at "i", seemingly at random.
This assignment is overhead.
unsigned int tid = threadIdx.x;
The results cannot be correct with more than one block, since with one block tid == i!
All the wrong indexing results in wrong calculations when using more than one block.
The shared memory at "i" was never written!
_memory_device[i] = shared_memory_data[i];
My assumption of what your kernel should do:
/*
* Call kernel with x-block usage and up to 3D Grid
*/
__global__ void bitwiseAnd(int* outData_g,
const long long int inSize_s,
const int* inData1_g,
const int* inData2_g)
{
//get unique block index
const unsigned long long int blockId = blockIdx.x //1D
+ blockIdx.y * gridDim.x //2D
+ gridDim.x * gridDim.y * blockIdx.z; //3D
//get unique thread index
const unsigned long long int threadId = blockId * blockDim.x + threadIdx.x;
//check global unique thread range
if(threadId >= inSize_s)
return;
//output bitwise and
outData_g[threadId] = inData1_g[threadId] & inData2_g[threadId];
}
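A possible host-side launch for this corrected kernel (a sketch; the block size and the assumption that b1_size is the number of int words to process are mine, not the answer's):
const int blockSize = 256;                        // assumed block size
const long long int nWords = b1_size;             // number of int words to AND
const int gridSize = (int)((nWords + blockSize - 1) / blockSize);
bitwiseAnd<<<gridSize, blockSize>>>(_memory_device, nWords, b1_memory, b2_memory);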
When you declare an extern __shared__ array, you must also specify its size in the kernel call.
The kernel configuration is:
<<< Dg, Db, Ns, S >>>
Ns is the size of the extern __shared__ arrays, and defaults to 0.
I don't think you can define more than one extern __shared__ array in your kernel. An example in the Programming Guide defines a single extern __shared__ array and manually sets up arrays at offsets within it:
extern __shared__ float array[];
__device__ void func() // __device__ or __global__ function
{
short* array0 = (short*)array;
float* array1 = (float*)&array0[128];
int* array2 = (int*)&array1[64];
}
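Putting the two points together, a sketch of how the original kernel could be adapted (my assumptions: one block of nThreads threads and a single extern __shared__ buffer carved into two halves; this is not code from the Programming Guide):
// host side: the third launch parameter is the extern __shared__ size in bytes
const int nThreads = 256;
const size_t shmemBytes = 2 * nThreads * sizeof(int);   // room for both input tiles
AndBitwiseOperation<<<1, nThreads, shmemBytes>>>(_memory_device, b1_size, b1_memory, b2_memory);

// device side: one extern __shared__ buffer, split into two views
extern __shared__ int shared_data[];
int* shared_b1 = shared_data;                 // first blockDim.x ints
int* shared_b2 = shared_data + blockDim.x;    // next blockDim.x ints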

CUDA Dot Product

I'm trying to implement the classic dot-product kernel for double precision arrays, with atomic computation of the final sum across the various blocks. I used the atomicAdd for double precision as stated on page 116 of the programming guide. Probably I'm doing something wrong. The partial sums across the threads in every block are computed correctly, but afterwards the atomic operation doesn't seem to be working properly, since every time I run my kernel with the same data, I receive different results. I'll be grateful if somebody could spot the mistake or provide an alternative solution!
Here is my kernel:
__global__ void cuda_dot_kernel(int *n,double *a, double *b, double *dot_res)
{
__shared__ double cache[threadsPerBlock]; //thread shared memory
int global_tid=threadIdx.x + blockIdx.x * blockDim.x;
int i=0,cacheIndex=0;
double temp = 0;
cacheIndex = threadIdx.x;
while (global_tid < (*n)) {
temp += a[global_tid] * b[global_tid];
global_tid += blockDim.x * gridDim.x;
}
cache[cacheIndex] = temp;
__syncthreads();
for (i=blockDim.x/2; i>0; i>>=1) {
if (threadIdx.x < i) {
cache[threadIdx.x] += cache[threadIdx.x + i];
}
__syncthreads();
}
__syncthreads();
if (cacheIndex==0) {
*dot_res=cuda_atomicAdd(dot_res,cache[0]);
}
}
And here is my device function atomicAdd:
__device__ double cuda_atomicAdd(double *address, double val)
{
double assumed,old=*address;
do {
assumed=old;
old= __longlong_as_double(atomicCAS((unsigned long long int*)address,
__double_as_longlong(assumed),
__double_as_longlong(val+assumed)));
}while (assumed!=old);
return old;
}
Getting a reduction right using ad hoc CUDA code can be tricky, so here's an alternative solution using a Thrust algorithm, which is included with the CUDA Toolkit:
#include <thrust/inner_product.h>
#include <thrust/device_ptr.h>
double do_dot_product(int n, double *a, double *b)
{
// wrap raw pointers to device memory with device_ptr
thrust::device_ptr<double> d_a(a), d_b(b);
// inner_product implements a mathematical dot product
return thrust::inner_product(d_a, d_a + n, d_b, 0.0);
}
You are using the cuda_atomicAdd function incorrectly. This section of your kernel:
if (cacheIndex==0) {
*dot_res=cuda_atomicAdd(dot_res,cache[0]);
}
is the culprit. Here, you atomically add to dot_res, then non-atomically overwrite dot_res with the result the function returns. The return value of this function is the previous value of the location being atomically updated, and it is supplied for "information" or local use by the caller only. You don't assign it to the location you are atomically updating; that completely defeats the purpose of using atomic memory access in the first place. Do something like this instead:
if (cacheIndex==0) {
double result=cuda_atomicAdd(dot_res,cache[0]);
}
I did not check your code in that much depth, but here is some advice.
I would only advise relying on Thrust if you use your GPU only for such generic tasks, since if a more complex problem arises you will have no idea how to program it efficiently in parallel on the GPU.
Start a new parallel reduction kernel to sum up the per-block dot products.
Since the data is already on the device, you won't see a decrease in performance from starting a new kernel.
Your kernel does not seem to scale across the maximum number of possible blocks on the newest GPUs. If it did, and it had to calculate the dot product of millions of values, the performance would decrease dramatically because of the serialized atomic operation.
Beginner mistake: are your input data and shared memory accesses range-checked? Or are you sure the input data is always a multiple of your block size? Otherwise you will read garbage. Most of my wrong results were due to this fault.
Optimise your parallel reduction: see my thesis or Mark Harris's optimisations.
Untested, I just wrote it down in Notepad:
/*
* #param inCount_s unsigned long long int Length of both input arrays
* #param inValues1_g double* First value array
* #param inValues2_g double* Second value array
* #param outDots_g double* Output dots of each block, length equals the number of blocks
*/
__global__ void dotProduct(const unsigned long long int inCount_s,
const double* inValuesA_g,
const double* inValuesB_g,
double* outDots_g)
{
//get unique block index in a possible 3D Grid
const unsigned long long int blockId = blockIdx.x //1D
+ blockIdx.y * gridDim.x //2D
+ gridDim.x * gridDim.y * blockIdx.z; //3D
//block dimension uses only x-coordinate
const unsigned long long int tId = blockId * blockDim.x + threadIdx.x;
/*
* shared value pair products array, where BLOCK_SIZE power of 2
*
* To improve performance increase its size by multiple of BLOCK_SIZE, so that each threads loads more then 1 element!
* (outDots_g length decreases by same factor, and you need to range check and initialize memory)
* -> see harris gpu optimisations / parallel reduction slides for more informations.
*/
__shared__ double dots_s[BLOCK_SIZE];
/*
* initialize shared memory array and calculate dot product of two values,
* shared memory always needs to be initialized, its never 0 by default, else garbage is read later!
*/
if(tId < inCount_s)
dots_s[threadIdx.x] = inValuesA_g[tId] * inValuesB_g[tId];
else
dots_s[threadIdx.x] = 0;
__syncthreads();
//do parallel reduction on shared memory array to sum up values
reductionAdd(dots_s, dots_s[0]); //see my thesis link
//output value
if(threadIdx.x == 0)
outDots_g[blockId] = dots_s[0];
//start new parallel reduction kernel to sum up outDots_g!
}
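Since reductionAdd is only referenced and not defined above, here is a minimal in-place shared-memory reduction one could substitute (my own sketch, assuming BLOCK_SIZE is a power of two; it is not the code from the linked thesis, and its signature differs slightly, so the call above would become reductionAdd(dots_s);):
// sums dots_s[0 .. BLOCK_SIZE-1] into dots_s[0]; must be called by every thread of the block
__device__ void reductionAdd(double* data_s)
{
    for (unsigned int stride = BLOCK_SIZE / 2; stride > 0; stride >>= 1)
    {
        if (threadIdx.x < stride)
            data_s[threadIdx.x] += data_s[threadIdx.x + stride];
        __syncthreads();
    }
}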
Edit: removed unnecessary points.