I'm having trouble using atomicMin to find the minimum value in a matrix in cuda. I'm sure it has something to do with the parameters I'm passing into the atomicMin function. The findMin function is the function to focus on, the popmatrix function is just to populate the matrix.
#include <stdio.h>
#include <cuda.h>
#include <curand.h>
#include <curand_kernel.h>
#define SIZE 4
__global__ void popMatrix(unsigned *matrix) {
unsigned id, num;
curandState_t state;
id = threadIdx.x * blockDim.x + threadIdx.y;
// Populate matrix with random numbers
curand_init(id, 0, 0, &state);
num = curand(&state)%100;
matrix[id] = num;
}
__global__ void findMin(unsigned *matrix, unsigned *temp) {
unsigned id;
id = threadIdx.x * blockDim.y + threadIdx.y;
atomicMin(temp, matrix[id]);
printf("old: %d, new: %d", matrix[id], temp);
}
int main() {
dim3 block(SIZE, SIZE, 1);
unsigned *arr, *harr, *temp;
cudaMalloc(&arr, SIZE*SIZE*sizeof(unsigned));
popMatrix<<<1,block>>>(arr);
// Print matrix of random numbers to see if min number was picked right
cudaMemcpy(harr, arr, SIZE*SIZE*sizeof(unsigned), cudaMemcpyDeviceToHost);
for (unsigned i = 0; i < SIZE; i++) {
for (unsigned j = 0; j < SIZE; j++) {
printf("%d ", harr[i*SIZE+j]);
}
printf("\n");
}
temp = harr[0];
findMin<<<1, block>>>(harr);
return 0;
}
harr is not allocated. You should allocated it on the host side using for example malloc before calling cudaMemcpy. As a result, the printed values you look are garbage. This is quite surprising that the program did not segfault on your machine.
Moreover, when you call the kernel findMin at the end, its parameter is harr (which is supposed to be on the host side regarding its name) should be on the device to perform the atomic operation correctly. As a result, the current kernel call is invalid.
As pointed out by #RobertCrovella, a cudaDeviceSynchronize() call is missing at the end. Moreover, you need to free your memory using cudaFree.
Related
If one wants to copy the arrays to device from host one does cudamalloc and cudaMemcpy. But to lessen the hassle one just does cudaMallocManaged without the former two things and life was never simpler before.
The code looks like this(more or less)
__global__ void convert(float kelvin[], float celsius[]) //can pass
arrays in kernel
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i<N)
kelvin[i]=celsius[i]+273.15;
}
int main()
{
float *celsius =(float *)malloc(N*sizeof(float));
float *kelvin =(float *)malloc(N*sizeof(float));
cudaMallocManaged(&celsius, N*sizeof(float));
cudaMallocManaged(&kelvin, N*sizeof(float));
// init celsius here
dim3 blocksPerGrid(1,1,1); //use only one block
dim3 threadsPerBlock(N,1,1); //use N threads in the block
convert<<<blocksPerGrid, threadsPerBlock>>>(kelvin,celsius);
cudaDeviceSynchronize();
//Doing stuff with the output here
return 0;
}
The previous example seems clear to me. But, how to do cudaMallocManaged for 2D and 3D array? I've been trying
__global__ void MatAdd(float A[N][N], float B[N][N], float C[N][N])
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
if (i < N && j < N)
C[i][j] = A[i][j] + B[i][j];
}
int main()
{ // I thonk, 2D arrays can be passed as pointer to pointers
float **A = (float **)malloc(N*N*sizeof(float));
float **B = (float **)malloc(N*N*sizeof(float));
float **C = (float **)malloc(N*N*sizeof(float));
cudaMallocManaged(&A, N*N*sizeof(float));
cudaMallocManaged(&B, N*N*sizeof(float));
cudaMallocManaged(&C, N*N*sizeof(float));
A[N][N]={{1,0,0},{0,1,0},{0,0,1}};
B[N][N]={{1,0,0},{0,1,0},{0,0,1}};
dim3 threadsPerBlock(16, 16);
dim3 numBlocks(N / threadsPerBlock.x, N / threadsPerBlock.y);
MatAdd<<<numBlocks, threadsPerBlock>>>(A, B, C);
//outputs and all
}
But, It shows the following error
matrix_add.cu(22): error: too many initializer values
matrix_add.cu(25): error: argument of type "float **" is incompatible with parameter of type "float (*)[3]"
Your help is highly appreciated.
You got a lot wrong in your attempt, so much that it was faster to write a working version than list out all the individual problems in the code in your question. So here is a working version of what it appears you were trying to do:
#include <algorithm>
#include <iostream>
const int N = 3;
__global__ void MatAdd(float A[][N], float B[][N], float C[][N])
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
if (i < N && j < N)
C[i][j] = A[i][j] + B[i][j];
}
int main()
{
float* A; cudaMallocManaged(&A, N*N*sizeof(float));
float* B; cudaMallocManaged(&B, N*N*sizeof(float));
float* C; cudaMallocManaged(&C, N*N*sizeof(float));
const float A_vals[N][N]={{1,0,0},{0,1,0},{0,0,1}};
const float B_vals[N][N]={{1,0,0},{0,1,0},{0,0,1}};
float (*C_vals)[N] = reinterpret_cast<float (*)[N]>(C);
std::copy(&A_vals[0][0], &A_vals[0][0] + N*N, A);
std::copy(&B_vals[0][0], &B_vals[0][0] + N*N, B);
dim3 threadsPerBlock(16, 16);
dim3 numBlocks(1, 1);
MatAdd<<<numBlocks, threadsPerBlock>>>( reinterpret_cast<float (*)[N]>(A),
reinterpret_cast<float (*)[N]>(B),
C_vals );
cudaDeviceSynchronize();
for(int i=0; i<N; i++) {
for(int j=0; j<N; j++) {
std::cout << C_vals[i][j] << " ";
}
std::cout << std::endl;
}
return 0;
}
Some important points:
Managed memory allocation replaces standard host memory allocation and produces memory which is directly accessible on both the host and the device.
All arrays decay to a pointer when passed as arguments to a function by value. That decay is not recursive. See here for more details.
You can (and will need to) cast in order to use the [][] access syntax on linear memory allocated dynamically at runtime (this applies to malloc, new, or any of the CUDA host memory allocation APIs. See here for more details).
Initialization syntax and assignment syntax for arrays are not interchangeable.
All I can suggest is that you study it thoroughly until you understand how it works.
I've spent several hours struggling with unspecified launch failure.
I've come up with a tiny task for myself in order to understand how shared memory works.
Task is to divide array [1, 2, 3, ... , N] into K group of (N / K) elements and find the sum of each group. (Difference between current and previous element of the array equals 1)
I was planning to use N threads in grid divided between K blocks. So every threadblock contains (N / K) threads. Thus one threadblock could be used to compute sum of one group. Also I wanted to dynamically allocate shared memory.
When I start program I got unspecified launch failure after cudaDeviceSynchronize() call. But when I try step-through debugging everthing is ok and works fine.
What am I doing wrong? (Visual Studio 2012 Professional, Compute Capability 2.1) I would very appreciate any help.
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define CUDA_CALL(x) do { if((x) != cudaSuccess) { \
printf("Error at %s:%d\n",__FILE__,__LINE__); \
printf("%s\n",cudaGetErrorString(x)); \
system("pause"); \
return EXIT_FAILURE;}} while(0)
extern __shared__ double shrd[];
__global__ void kernel(double * a){
size_t threadID_block = blockDim.x * threadIdx.y + threadIdx.x;
size_t blockID_global = (gridDim.x * blockIdx.y + blockIdx.x );
size_t threadID_global = blockID_global * blockDim.x * blockDim.y + threadID_block;
double * temp = &shrd[blockID_global * blockDim.x * blockDim.y];
temp[threadID_block] = static_cast<double>(threadID_global);
__syncthreads();
if (threadID_block == 0){
a[blockID_global] = 0.0;
for (size_t index = 0; index < blockDim.x * blockDim.y; index++){
a[blockID_global] += temp[index];
}
}
}
int main(){
int devNum = 0;
CUDA_CALL(cudaGetDevice(&devNum));
CUDA_CALL(cudaSetDevice(devNum));
dim3 gridSize(2,2,1);
dim3 blockSize(4,4,1);
double * dev_a = NULL;
size_t length = gridSize.x * gridSize.y ;
size_t byteSize = length * sizeof(double);
CUDA_CALL(cudaMalloc(&dev_a,byteSize));
size_t shmem_perBlock = blockSize.x * blockSize.y * sizeof(double);
kernel <<< gridSize, blockSize, shmem_perBlock >>> (dev_a);
CUDA_CALL(cudaGetLastError());
CUDA_CALL(cudaDeviceSynchronize());
double * a = new double [length];
CUDA_CALL(cudaMemcpy(a,dev_a,byteSize,cudaMemcpyDeviceToHost));
for (size_t index = 0; index < length; index++){
printf("%.3f\n",a[index]);
}
printf("\n");
CUDA_CALL(cudaFree(dev_a));
CUDA_CALL(cudaDeviceReset());
delete[]a;
system("pause");
return 0;
}
If you are on kepler or later first read this:
http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/
Otherwise if you are pre-kepler read this:
http://developer.download.nvidia.com/compute/cuda/1.1-Beta/x86_website/projects/reduction/doc/reduction.pdf
There are some fundamentals you are missing in terms of CUDA programming. I have given you a template of your code below. It is for clarification of some of these fundamentals. Do not expect this to be optimized as I am expecting you to program the parallel reduction. This will get you started with an understanding on how to use shared memory.
Good Luck!
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define N 10000
#define K 100
#define CUDA_CALL(x) do { if((x) != cudaSuccess) { \
printf("Error at %s:%d\n",__FILE__,__LINE__); \
printf("%s\n",cudaGetErrorString(x)); \
system("pause"); \
return EXIT_FAILURE;}} while(0)
__global__ void kernel(double* a, double* results){
extern __shared__ double shared[];
size_t tid, tid_local, stride;
tid = blockDim.x*blockIdx.x+threadIdx.x; //thread id within all blocks
tid_local = threadIdx.x; //thread id within a block
stride = blockDim.x*gridDim.x; //total number of threads
double *start = &a[K*blockIdx.x]; //each block will get K of a block.
shared[tid_local]=start[tid_local]; //copy K elements into shared memory
__syncthreads();
//Perform Parallel reduction, you will have to implement this
//After parallel reduction, result should be in shared[0]
//for demonstration I made the code serial for each block on thread 0.
//This is for demonstration only.
double sum=0;
if(tid_local==0){
for(int i=0; i<K; i++){
sum+=shared[i];
}
a[blockIdx.x]=sum;
}
}
int main(){
int devNum = 0;
CUDA_CALL(cudaGetDevice(&devNum));
CUDA_CALL(cudaSetDevice(devNum));
double * dev_a = NULL;
double * dev_results=NULL;
CUDA_CALL(cudaMalloc(&dev_a, N*sizeof(double) ));
CUDA_CALL(cudaMalloc(&dev_results, (N/K)*sizeof(double)));
//copy dev_a onto GPU (this is the array you are summing).
dim3 block_size(K, 1, 1);
dim3 grid_size (N/K, 1, 1);
size_t shmem_perBlock = K * sizeof(double);
kernel <<< grid_size, block_size, shmem_perBlock >>> (dev_a, dev_results);
CUDA_CALL(cudaGetLastError());
CUDA_CALL(cudaDeviceSynchronize());
//copy dev_results back to CPU, this is your result.
CUDA_CALL(cudaFree(dev_a));
CUDA_CALL(cudaFree(dev_results));
system("pause");
return 0;
}
I tried summation in cuda . I cant find what i did wrong here.
The sum is always returned 0. Can anyone help.
The shared tag defines the variable common in each block.
So i tried to sum one block at a time and finally sum up the result for overall sum.
But the sum doesnt work for block. And i am stuck.
Can anyone help.
#include <stdio.h>
#include <iostream>
#include <cuda.h>
#include <stdlib.h>
//#define BLOCK_SIZE 32 // size of vectors
__global__ void add( float * i_data, float * sum){
int tid = blockIdx.x * blockDim.x + threadIdx.x;
__shared__ float s_data;
s_data = 0;
// must be synchronized
__syncthreads();
// reduce and sum
// typical in GPU computings
for (int i = 0; i<blockDim.x; i++)
{
__syncthreads();
if (tid <= i)
{
//s_data[blockIdx.x]+ = s_data[tid] + s_data[i+tid];
s_data+= i_data[tid];
}
}
if (tid == 0)
sum[blockIdx.x]=s_data;
}
int main() {
int T = 10, B = 5; // threads per block and blocks per grid
float *a,*b; // host pointers
float *dev_a, *dev_b; // device pointers to host memory
int sizeIN = T*B*sizeof(int);
int sizeOUT = B*sizeof(int);
a= new float[T*B];
b= new float[B];
for(int i = 0;i<B;i++)
{
for (int j=0;j<T;j++)
{
a[i*T+j]=i;
}
}
for(int i = 0;i<B;i++)
{
b[i]=0;
}
cudaMalloc((void **) &dev_a, sizeIN);
cudaMalloc((void **) &dev_b, sizeOUT);
cudaMemcpy(dev_a, a, sizeIN, cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, sizeOUT, cudaMemcpyHostToDevice);
add<<< B, T >>> (dev_a, dev_b);
cudaMemcpy(a,dev_a, sizeIN, cudaMemcpyDeviceToHost);
cudaMemcpy(b,dev_b, sizeOUT, cudaMemcpyDeviceToHost);
for(int i = 0;i<B;i++)
{
for (int j=0;j<T;j++)
{
std::cout<< a[i*T+j]<<"\t";
std::cout<<std::endl;
}
std::cout<<std::endl<<std::endl<<"sum is: "<<b[i]<<std::endl;
}
std::cout<<std::endl<<std::endl;
cudaFree(dev_a);
cudaFree(dev_b);
free(a);
free(b);
return 0;
}
This is wrong in 2 ways:
if (tid = 0)
First, you should be doing a comparison == not an assignment =. I don't know why your compiler didn't warn you about this.
Second, tid is only zero for one thread in the entire grid:
int tid = blockIdx.x * blockDim.x + threadIdx.x;
You want one thread in each block to write the block result out to global memory:
if (threadIdx.x == 0)
This is also a problem, similarly:
if (tid <= i)
This is only satisfied for threads in the first block. Beyond that, I have to start to guess at what you want. I guess you're trying to sum the values in each block. Your construction is not a parallel reduction, but to make the minimum changes to get it "functional" I would rewrite the end of your kernel like this:
// reduce and sum
// typical in GPU computings
for (int i = 0; i<blockDim.x; i++)
{
if (threadIdx.x == i)
{
//s_data[blockIdx.x]+ = s_data[tid] + s_data[i+tid];
s_data+= i_data[tid];
}
__syncthreads();
}
if (threadIdx.x == 0)
sum[blockIdx.x]=s_data;
}
Although you didn't have any CUDA API errors, it's good practice to use proper cuda error checking and also run your code with cuda-memcheck any time you are having trouble with a cuda code.
I mentioned that your code above is not a classical reduction. Its just an unoptimal for-loop.
To learn about a CUDA parallel reduction, study the cuda sample code and the accompanying presentation, and there are many examples here on the CUDA tag on SO as well that you can search on.
I was checking out this sum_reduction.cu example and tutorial and noticed that for certain problem sizes it doesn't work e.g. it works with problem size n=2000 but not with n=3000. Apparently it always work with problem sizes that are multiple of the block size but neither the tutorial nor the example code states so. The question is, does this reduction algorithm only works for certain problem sizes? the example they chose N=256k which is even, a power of two and also multiple of the block size 512.
For self containment I paste the most important bits of (a template version of) the code here:
template<typename T>
__global__ void kernelSum(const T* __restrict__ input, T* __restrict__ per_block_results, const size_t n) {
extern __shared__ T sdata[];
size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
// load input into __shared__ memory
T x = 0.0;
if (tid < n) {
x = input[tid];
}
sdata[threadIdx.x] = x;
__syncthreads();
// contiguous range pattern
for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) {
if(threadIdx.x < offset) {
// add a partial sum upstream to our own
sdata[threadIdx.x] += sdata[threadIdx.x + offset];
}
// wait until all threads in the block have
// updated their partial sums
__syncthreads();
}
// thread 0 writes the final result
if(threadIdx.x == 0) {
per_block_results[blockIdx.x] = sdata[0];
}
}
and to invoke the kernel:
// launch one kernel to compute, per-block, a partial sum
block_sum<double> <<<num_blocks,block_size,block_size * sizeof(double)>>>(d_input, d_partial_sums_and_total, num_elements);
// launch a single block to compute the sum of the partial sums
block_sum<double> <<<1,num_blocks,num_blocks * sizeof(double)>>>(d_partial_sums_and_total, d_partial_sums_and_total + num_blocks, num_blocks);
To my understanding if the problem size is smaller than the block reduction this statement T x = 0.0; ensures that the element is zeroed out and thus should work but it doesn't?
UPDATE: I am sorry the float/double thing was a typo while preparing the question and not the real problem.
The code you have posted is not consistent, as your templated kernel
is called kernelSum but you are invoking something called
block_sum.
Furthermore, I don't believe your usage of the templated kernel
function could possibly be correct as written:
block_sum<double> <<<num_blocks,block_size,block_size * sizeof(float)>>>(d_input, d_partial_sums_and_total, num_elements);
^ ^
| these types are required to match |
The kernel template is being instantiated with type double. Therefore it is expecting enough shared memory to store block_size double quantities, based on this line:
extern __shared__ T sdata[];
But you are only passing half of the required storage:
block_size * sizeof(float)
I believe that's going to give you unexpected results.
The reduction as written does expect that the block
dimension is a power of 2, due to this loop:
// contiguous range pattern
for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) {
This is not likely to be an issue on the first kernel call, because you are probably choosing a power of two for the number of threads per block (block_size):
block_sum<double> <<<num_blocks,block_size,...
However, for the second kernel call, this will depend on whether num_blocks is a power of two, which depends on your grid calculations, which you haven't shown:
block_sum<double> <<<1,num_blocks,...
Finally, the first kernel launch will fail if num_blocks exceeds the limit for your device. This may happen for very large data sets but probably not for size 3000, and it depends on your grid calculations which you haven't shown.
Item 3 above is a difficult requirement to satisfy on the fly for arbitrary vector sizes. Therefore I would suggest an alternate reduction strategy to handle arbitrary sized vectors. For this I would suggest that you study the CUDA reduction sample code and presentation.
Here's a complete program, mostly based on the code you have shown, that has the above issues addressed, and seems to work for me for a size of 3000:
#include <stdio.h>
#include <stdlib.h>
#define DSIZE 3000
#define nTPB 256
template<typename T>
__global__ void block_sum(const T* __restrict__ input, T* __restrict__ per_block_results, const size_t n) {
extern __shared__ T sdata[];
size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
// load input into __shared__ memory
T x = 0.0;
if (tid < n) {
x = input[tid];
}
sdata[threadIdx.x] = x;
__syncthreads();
// contiguous range pattern
for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) {
if(threadIdx.x < offset) {
// add a partial sum upstream to our own
sdata[threadIdx.x] += sdata[threadIdx.x + offset];
}
// wait until all threads in the block have
// updated their partial sums
__syncthreads();
}
// thread 0 writes the final result
if(threadIdx.x == 0) {
per_block_results[blockIdx.x] = sdata[0];
}
}
int main(){
double *d_input, *d_partial_sums_and_total, *h_input, *h_partial_sums_and_total;
int num_elements=DSIZE;
int block_size = nTPB;
int num_blocks = (num_elements + block_size -1)/block_size;
// bump num_blocks up to the next power of 2
int done = 0;
int test_val = 1;
while (!done){
if (test_val >= num_blocks){
num_blocks = test_val;
done = 1;}
else test_val *= 2;
if (test_val > 65535) {printf("blocks failure\n"); exit(1);}
}
h_input = (double *)malloc(num_elements * sizeof(double));
h_partial_sums_and_total = (double *)malloc((num_blocks+1)*sizeof(double));
cudaMalloc((void **)&d_input, num_elements * sizeof(double));
cudaMalloc((void **)&d_partial_sums_and_total, (num_blocks+1)*sizeof(double));
double h_result = 0.0;
for (int i = 0; i < num_elements; i++) {
h_input[i] = rand()/(double)RAND_MAX;
h_result += h_input[i];}
cudaMemcpy(d_input, h_input, num_elements*sizeof(double), cudaMemcpyHostToDevice);
cudaMemset(d_partial_sums_and_total, 0, (num_blocks+1)*sizeof(double));
// launch one kernel to compute, per-block, a partial sum
block_sum<double> <<<num_blocks,block_size,block_size * sizeof(double)>>>(d_input, d_partial_sums_and_total, num_elements);
// launch a single block to compute the sum of the partial sums
block_sum<double> <<<1,num_blocks,num_blocks * sizeof(double)>>>(d_partial_sums_and_total, d_partial_sums_and_total + num_blocks, num_blocks);
cudaMemcpy(h_partial_sums_and_total, d_partial_sums_and_total, (num_blocks+1)*sizeof(double), cudaMemcpyDeviceToHost);
printf("host result = %lf\n", h_result);
printf("device result = %lf\n", h_partial_sums_and_total[num_blocks]);
}
For brevity/readability, I have dispensed with error checking in the above code. When having difficulty with a cuda code, you should always do proper cuda error checking.
Also, in the future, you will make it easier for others to help you if you post a complete code to demonstrate what you are doing, as I have done above.
I'm new in CUDA, I appreciate your help and hope you can help me.
I need to store multiple elements of a 2D array into a vector, and then work with the vector, but my code does not work well, when I debug, I find a mistake in allocating the 2D array in the device with cudaMallocPitch and copying to that array with cudaMemcpy2D. This is my code:
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cmath>
#define maxThreads 96
__global__ void extract(int mSize, float* dev_vector, float* dev_matrix, int N)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
while(idx<N)
{
dev_vector[idx] = *(dev_matrix+(mSize*idx+N));
idx += blockDim.x * gridDim.x;
}
}
int main()
{
//CPU variables
int mSize = 5;
float* matrix;
int N = 4; // Vector size
int i,j;
float* vector;
int blocks, threads;
float* dev_matrix;
float* dev_vector;
blocks = 1+((N-1)/maxThreads);
threads = 1+((N-1)/blocks);
unsigned long int pitch;
unsigned long int memsize_vector = N*sizeof(float);
unsigned long int memsize_matrix = mSize*sizeof(float);
matrix = new float[memsize_matrix*memsize_matrix];
vector = new float[memsize_vector];
//Create 2D array
for(i=0; i<mSize; i++)
for(j=0; j<mSize; j++)
{
matrix[i+mSize*j] = ((i+1)+(j+1));
}
printf("\n");
for (i=0; i<mSize; i++){
for(j=0; j<mSize; j++){
printf("% 1.5f ", matrix[i+mSize*j]);
}
printf("\n");
}
printf("\n");
cudaMallocPitch((void **)&dev_matrix, &pitch, memsize_matrix, mSize);
cudaMalloc((void **)&dev_vector, memsize_vector);
cudaMemcpy2D(dev_matrix, pitch, matrix, memsize_matrix, memsize_matrix, mSize,
cudaMemcpyHostToDevice);
extract<<<blocks,threads>>>(mSize, dev_vector, dev_matrix, N);
cudaDeviceSynchronize();
cudaMemcpy(vector, dev_vector, memsize_vector, cudaMemcpyDeviceToHost);
printf("Vector values are:\n");
for(i=0; i<N; i++)
printf(" % 1.5f ", vector[i]);
printf("\n");
cudaFree(dev_matrix);
cudaFree(dev_vector);
}
There are lots of problems in this code, including but not limited to using array sizes in bytes and word sizes interchangeably in several places in code, using incorrect types (note that size_t exists for a very good reason) , potential truncation and type casting problems, and more.
But the core problem is the addressing of pitched memory inside the kernel, to which you are never even passing the pitch value. Reading the documentation for cudaMallocPitch will give you the correct method for addressing pitched memory inside a kernel. Your kernel might then look like this:
__global__ void extract(size_t mpitch, float* dev_vector, float* dev_matrix, int N)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
int stride = blockDim.x * gridDim.x;
while(idx<N)
{
dev_vector[idx] = *(float *)( ((char*)dev_matrix + idx * mpitch) + N );
idx += stride;
}
}
[disclaimer: never compiled or tested, use at own risk].
You will have to fix then all the problems in the host code to reflect whatever kernel changes you make.
Thanks to all, Alex I had not seen that, and fix it, thanks.
talonmies, thank you, my code works, with your suggestions. thanks a lot, finally this my kernel:
__global__ void sumreduct(size_t pitch, float* dev_vector, float* dev_matrix, int columns, int N)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
int stride = blockDim.x * gridDim.x;
while(idx<N)
{
dev_vector[idx] = *(float *)( ((char*)dev_matrix + idx * pitch) + columns);
idx += stride;
}
}
About "size_t", I was using "Unsigned int" because Nsight show me the next warning:
Type 'size_t' could not be resolved
Thanks
Did you really mean to declare a source matrix of length [memsizeMatrix*memsizeMatrix] ?
This will allocate 400 floats, or 1600 bytes. This means your source-pitch is off, and the Memcpy2D call is failing.
I'm assuming you meant to say
matrix = new float[mSize*mSize];