How to fix cudaError 77 when copying back from device to host - cuda

I am writing a simple example program to test memCpy and kernel run concurrency for a larger program. While writing this example, I stumbled upon error 77, aka cudaErrorIllegalAddress.
I read somewhere that this error comes from the kernel accessing an invalid address, not from the memcpy itself. So I tried indexing only the lowest element of my input array (0), but the error remained.
As it only is a small sample program, I will provide the whole code;
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#define BLOCKS 32
#define THREADS 16
__global__ void kernel(double *d_in, double *d_out) {
int index = threadIdx.x + blockDim.x * blockIdx.x;
d_out[index] = d_in[index] + 5;
}
int main() {
const int GPU_N = 2;
const int data_size = 2048;
const int cycles = 2;
double *h_in, *h_out, *d_in, *d_out;
h_in = (double*)malloc(sizeof(double) * data_size);
h_out = (double*)malloc(sizeof(double) * data_size);
for (int i = 0; i < data_size; i++) {
h_in[i] = 21;
}
cudaError_t error;
printf("1\n");
for (int i = 0; i < cycles; i++) {
//cuMalloc
for (int j = 0; j < GPU_N; j++) {
cudaSetDevice(j);
cudaMalloc((void**)&d_in, sizeof(double) * data_size / 4);
cudaMalloc((void**)&d_out, sizeof(double) * data_size / 4);
printf("2\n");
}
for (int j = 0; j < GPU_N; j++) {
cudaSetDevice(j);
cudaMemcpyAsync(d_in, h_in, sizeof(double) * data_size / 4, cudaMemcpyHostToDevice);
printf("3\n");
}
for (int j = 0; j < GPU_N; j++) {
cudaSetDevice(j);
kernel<<< BLOCKS, THREADS, 0, 0 >>>(d_in, d_out);
error = cudaGetLastError();
printf("4\n");
}
for (int j = 0; j < GPU_N; j++) {
cudaSetDevice(j);
error = cudaMemcpyAsync(h_out, d_out, sizeof(double) * data_size / 4, cudaMemcpyDeviceToHost);
printf("D2H %i\n", error);
printf("5\n");
}
for (int j = 0; j < GPU_N; j++) {
cudaSetDevice(j);
cudaFree(d_in);
cudaFree(d_out);
printf("6\n");
}
}
for (int i = 0; i < data_size; i++) {
printf("%i\n", h_out[i]);
}
getchar();
}
So the output should be something like:
1
1
2
2
3
3
4
4
5
5
6
6
1
1
2
2
3
3
4
4
5
5
6
6
26
26
26
26
26
.....
and then a spam of the result. It does so until the time it has to print 5, then it outputs the error 77. Also, the output of the result is not 26 as expected, but -842150451

There are several problems with this code.
As already pointed out in the comments, the printf format specifier here (%i) is wrong:
printf("%i\n", h_out[i]);
the quantity being printed is a double quantity, an appropriate format specifier would be %f.
This code will not work (for GPU_N greater than 1):
for (int j = 0; j < GPU_N; j++) {
cudaSetDevice(j);
cudaMalloc((void**)&d_in, sizeof(double) * data_size / 4);
cudaMalloc((void**)&d_out, sizeof(double) * data_size / 4);
printf("2\n");
}
d_in and d_out are individual variables. You don't get to somehow reuse them in this way. When this loop goes through its 2nd (or later) iteration, it will overwrite the pointer values that were previously assigned. Later on this will result in trouble, because for at least one of your kernel launches, you will be passing pointers to data that is not resident on that particular GPU (and this particular aspect of the problem is the proximate reason for the error 77 report).
One solution would be to provide arrays of pointers to make this work.
Some of the CUDA activity you are issuing in your loops may be asynchronous. Therefore, to be sure that your final printout of h_out shows expected results, you should wait for all work on the GPU to be finished. One way to accomplish this is with another set of calls to cudaDeviceSynchronize(). (I don't wish to argue about whether cudaFree is asynchronous or not. I think this item is a sensible suggestion and noteworthy. If you feel you can skip this item, do what you wish. For learning purposes, I think it is important to point this out.) For the reasons indicated in comments below, this item is not necessary/mandatory to get expected results for this particular code. This answer isn't intended to be a complete treatise on asynchronous work issuance; for that I suggest further study of any of the relevant questions here on the cuda tag, and/or study of relevant CUDA sample codes.
Here's a modified code that has the above issues addressed (I have shortened the final print-out loop):
$ cat t1477.cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#define BLOCKS 32
#define THREADS 16
__global__ void kernel(double *d_in, double *d_out) {
int index = threadIdx.x + blockDim.x * blockIdx.x;
d_out[index] = d_in[index] + 5;
}
// Host driver for the multi-GPU example. For each cycle it allocates a buffer
// pair on every GPU, copies input to each device, launches the kernel there,
// copies the result back and frees the buffers. After all cycles it
// synchronizes every device and prints the first 10 values of h_out.
int main() {
const int GPU_N = 2;
const int data_size = 2048;
const int cycles = 2;
// One device pointer per GPU: slot j holds the allocation made while device j
// was current, and is only used again while device j is current.
double *h_in, *h_out, *d_in[GPU_N], *d_out[GPU_N];
h_in = (double*)malloc(sizeof(double) * data_size);
h_out = (double*)malloc(sizeof(double) * data_size);
for (int i = 0; i < data_size; i++) {
h_in[i] = 21;
}
cudaError_t error;
printf("1\n");
for (int i = 0; i < cycles; i++) {
//cuMalloc
// Each device gets room for data_size / 4 doubles of input and of output.
for (int j = 0; j < GPU_N; j++) {
cudaSetDevice(j);
cudaMalloc((void**)(&(d_in[j])), sizeof(double) * data_size / 4);
cudaMalloc((void**)(&(d_out[j])), sizeof(double) * data_size / 4);
printf("2\n");
}
// Every device receives the same first data_size / 4 elements of h_in.
for (int j = 0; j < GPU_N; j++) {
cudaSetDevice(j);
cudaMemcpyAsync(d_in[j], h_in, sizeof(double) * data_size / 4, cudaMemcpyHostToDevice);
printf("3\n");
}
// Launch on each device with that device's own buffers (no dynamic shared
// memory, stream 0). The launch status is stored in error but is overwritten
// before anything inspects it.
for (int j = 0; j < GPU_N; j++) {
cudaSetDevice(j);
kernel<<< BLOCKS, THREADS, 0, 0 >>>(d_in[j], d_out[j]);
error = cudaGetLastError();
printf("4\n");
}
// Every device copies its result to the start of the same h_out buffer, so a
// later device's copy overwrites an earlier one's. The return code of the
// copy is printed as "D2H <code>".
for (int j = 0; j < GPU_N; j++) {
cudaSetDevice(j);
error = cudaMemcpyAsync(h_out, d_out[j], sizeof(double) * data_size / 4, cudaMemcpyDeviceToHost);
printf("D2H %i\n", error);
printf("5\n");
}
// Release this cycle's buffers on the device that owns them.
for (int j = 0; j < GPU_N; j++) {
cudaSetDevice(j);
cudaFree(d_in[j]);
cudaFree(d_out[j]);
printf("6\n");
}
}
// Wait for all outstanding work on every device before reading h_out.
for (int i = 0; i < GPU_N; i++){
cudaSetDevice(i);
cudaDeviceSynchronize();}
// Shortened print-out: only the first 10 results, printed as doubles.
for (int i = 0; i < 10; i++) {
printf("%f\n", h_out[i]);
}
}
$ nvcc -o t1477 t1477.cu
$ cuda-memcheck ./t1477
========= CUDA-MEMCHECK
1
2
2
3
3
4
4
D2H 0
5
D2H 0
5
6
6
2
2
3
3
4
4
D2H 0
5
D2H 0
5
6
6
26.000000
26.000000
26.000000
26.000000
26.000000
26.000000
26.000000
26.000000
26.000000
26.000000
========= ERROR SUMMARY: 0 errors
$

Related

Cuda passing an array of structs

I am new to cuda and am trying to parallelize a very simple program shown below that was inspired from this link: https://devblogs.nvidia.com/even-easier-introduction-cuda/
typedef struct{
int temp;
int newtemp;
int neighbors[20];
} S;
void add(int n, S * s){
for(int i = 0; i < n; i++){
int newTemp = 0;
for(int j = 0; j < 20; j++){
newTemp += s[s[i].neighbors[j]].temp;
}
newTemp /= 3;
s[i].newtemp = newTemp;
}
}
int main(int argc, char *argv[]){
int n = 1<<21;
S grid[n];
for(int i = 0; i < n; i++){
S tmp1;
tmp1.temp = rand();
for(int j = 0; j<20; j++){
tmp1.neighbors[j] = rand()%n;
}
grid[i] = tmp1;
}
struct timespec start, end;
double gettime_diff, time_diff;
clock_t t, starttime, endtime;
clock_gettime(CLOCK_REALTIME, &start);
t = clock();
time(&starttime);
add(n,grid);
for(int i = 0; i < n; i++){
grid[i].temp = grid[i].newtemp;
if(i%83940==1)printf("%d\n",grid[i].temp);
}
return 0;
}
I am not getting desired results however as when I am updating temp all the new values are 0. I think that the issue is because the array of structs I am passing to my add function cannot be accessed in device memory. I, however, am having a hard time figuring out how to fix this. I found this post on stackoverflow and am a little unsure what the suggested answer did to fix the issue: Array of structs of arrays CUDA C
The cuda code I have for reference is here:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#define SIZE 1000
#define NS_PER_US 1000
typedef struct{
int temp;
int newtemp;
int neighbors[20];
} S;
__global__ void add(int n, S * s){
int index = threadIdx.x;
int stride = blockDim.x;
//printf("%d\n",(n-index)/stride);
//printf("%d\n",s[0].temp);
for(int i = index; i < n; i+=stride){
printf("%d\n",index);
int newTemp = 0;
for(int j = 0; j < 20; j++){
newTemp += s[s[i].neighbors[j]].temp;
}
printf("%d\n",index);
newTemp /= 3;
s[i].newtemp = newTemp;
}
}
int main(int argc, char *argv[]){
int *h_a;
int *d_a;
int num_blocks= 2;
int num_th_per_blk= 5;
int n = 1<<21;
S grid[n];
for(int i = 0; i < n; i++){
S tmp1;
tmp1.temp = rand();
for(int j = 0; j<20; j++){
tmp1.neighbors[j] = rand()%n;
}
grid[i] = tmp1;
}
struct timespec start, end;
double gettime_diff, time_diff;
clock_t t, starttime, endtime;
clock_gettime(CLOCK_REALTIME, &start);
t = clock();
time(&starttime);
size_t memSize;
memSize = num_blocks* num_th_per_blk* sizeof(int);
h_a= (int*) malloc(memSize);
cudaMallocManaged((void **)&grid, n * sizeof(S));
cudaMalloc( (void**) &d_a, memSize);
dim3 dimGrid(num_blocks);
dim3 dimBlock(num_th_per_blk);
add<<< dimGrid, dimBlock >>>(n,grid);
cudaMemcpy( h_a, d_a, memSize,cudaMemcpyDeviceToHost);
for(int i = 0; i < n; i++){
grid[i].temp = grid[i].newtemp;
if(i%83940==1)printf("%d\n",grid[i].newtemp);
}
clock_gettime(CLOCK_REALTIME, &end);
t = clock() - t;
time(&endtime);
gettime_diff = (double) ((end.tv_sec - start.tv_sec)*CLOCKS_PER_SEC) + (double)((end.tv_nsec - start.tv_nsec)/NS_PER_US);
time_diff = difftime(endtime, starttime);
printf("\ttime (clock_gettime) %f\n", gettime_diff);
printf("\ttime (clock) %f\n", ((float)t)/CLOCKS_PER_SEC);
printf("\ttime (time) %f\n", time_diff);
return 0;
}
I feel like there is a simple fix here that I am not seeing, or maybe I am missing a key concept. Whatever the case any help would be greatly appreciated.
There is actually a lot wrong in your code, so much so that it is easier to post a working version than point out all the individual mistakes:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#define NS_PER_US 1000
typedef struct{
int temp;
int newtemp;
int neighbors[20];
} S;
// For every element this thread visits, sums the temp field of the 20 elements
// listed in that element's neighbors array, divides the sum by 3 (integer
// division) and stores it in the element's newtemp field.
// Each thread starts at its flat global index and advances by the total number
// of launched threads until it passes n.
__global__
void add(int n, S * s)
{
    const int first = blockIdx.x * blockDim.x + threadIdx.x;
    const int step = gridDim.x * blockDim.x;
    int cell = first;
    while (cell < n) {
        const int *nbr = s[cell].neighbors;
        int total = 0;
        for (int k = 0; k != 20; ++k)
            total += s[nbr[k]].temp;
        s[cell].newtemp = total / 3;
        cell += step;
    }
}
// Host driver: fills a managed array of S with random data, runs add() over it
// on the GPU, copies newtemp into temp on the host (printing every 10th entry)
// and reports the elapsed time measured three different ways.
// Returns 0 on success, 1 if any CUDA call fails.
int main(int argc, char *argv[]){
    const int n = 1<<10;
    S* grid = NULL;
    // Managed memory is reachable from host and device, so the same pointer is
    // initialised here and then handed to the kernel.
    cudaError_t err = cudaMallocManaged((void **)&grid, n * sizeof(S));
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMallocManaged failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    for(int i = 0; i < n; i++){
        S tmp1;
        // Keeping temp below n keeps the sum of 20 neighbours far from int overflow.
        tmp1.temp = rand()%n;
        for(int j = 0; j<20; j++){
            tmp1.neighbors[j] = rand()%n;
        }
        grid[i] = tmp1;
    }
    struct timespec start, end;
    double gettime_diff, time_diff;
    clock_t t;
    time_t starttime, endtime;   // time() takes a time_t*, not a clock_t*
    clock_gettime(CLOCK_REALTIME, &start);
    t = clock();
    time(&starttime);
    int num_th_per_blk= 32;
    // Ceiling division. The earlier form "a + b ? 1 : 0" parsed as
    // "(a + b) ? 1 : 0" and therefore always launched exactly one block.
    int num_blocks= (n + num_th_per_blk - 1) / num_th_per_blk;
    dim3 dimGrid(num_blocks);
    dim3 dimBlock(num_th_per_blk);
    add<<< dimGrid, dimBlock >>>(n,grid);
    // Launch-configuration errors show up in cudaGetLastError(); faults during
    // execution show up at the synchronize, which is also required before the
    // host touches managed memory again.
    err = cudaGetLastError();
    if (err == cudaSuccess) err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "add kernel failed: %s\n", cudaGetErrorString(err));
        cudaFree(grid);
        return 1;
    }
    for(int i = 0; i < n; i++){
        grid[i].temp = grid[i].newtemp;
        if(i%10==1)printf("%d %d\n",i,grid[i].temp);
    }
    clock_gettime(CLOCK_REALTIME, &end);
    t = clock() - t;
    time(&endtime);
    // Wall-clock difference in microseconds. CLOCKS_PER_SEC belongs to clock()
    // only, so the seconds-to-microseconds factor is spelled out here.
    const double us_per_sec = 1e6;
    gettime_diff = (double)(end.tv_sec - start.tv_sec) * us_per_sec
                 + (double)(end.tv_nsec - start.tv_nsec) / NS_PER_US;
    time_diff = difftime(endtime, starttime);
    printf("\ttime (clock_gettime) %f\n", gettime_diff);
    printf("\ttime (clock) %f\n", ((float)t)/CLOCKS_PER_SEC);
    printf("\ttime (time) %f\n", time_diff);
    cudaFree(grid);
    return 0;
}
The most egregious error is how you handle grid in the host code. Doing this:
S grid[n];
// code initializing grid
cudaMallocManaged((void **)&grid, n * sizeof(S));
is both illegal (you shouldn't try and set grid to another pointer value, it isn't a pointer), and nonsensical. cudaMallocManaged allocates new memory, so all you are doing is initializing grid, then throwing away all the carefully initialized memory and replacing it with uninitialized memory which you pass to the kernel. The kernel then operates on random data. Note also that the grid stride loop within the kernel is also incorrect, and both the original code and CUDA version potentially suffer from integer overflow due to how you initialize the temp members of the structure in both versions using rand().

Incorrect addition of Prime numbers in CUDA [duplicate]

This question already has an answer here:
How to find the sum of array in CUDA by reduction
(1 answer)
Closed 3 years ago.
I use reduction logic in code by referring How to find the sum of array in CUDA by reduction.
But it is giving some errors. I cannot find my mistake; could you please help me out?
required specification:
1.Cuda toolkit v6.5
2. graphics: GTX 210 (compute capability 1.2)
3. visual studio 2013
#include<stdio.h>
#include<cuda.h>
#include<malloc.h>
#include<conio.h>
#include<time.h>
#include<windows.h>
#define SIZE 10
#define N 100
__global__ void vectoreAdd(int *d_a, int *d_b, int *d_c)
{
__shared__ int sdata[256];
int i = threadIdx.x + (blockIdx.x*blockDim.x);
sdata[threadIdx.x] = d_a[i];
__syncthreads();
if (i<SIZE)
for (i = 2; i<SIZE; i++)
{
int counter = 0;
for (int j = 2; j<d_a[i]; j++)
{
if (d_a[i] % j == 0)
{
counter = 1; break;
}
}
if (counter == 0)
{
d_b[i] = d_a[i];
}
}
// do reduction in shared mem
for (int s = 1; s < blockDim.x; s *= 2)
{
int index = 2 * s * threadIdx.x;;
if (index < blockDim.x)
{
sdata[index] += sdata[index + s];
}
__syncthreads();
}
// write result for this block to global mem
if (threadIdx.x == 0)
atomicAdd(d_c, sdata[0]);
}
}
int main()
{
clock_t tic = clock();
int *a, *b, *summation=0, sum = 0,count=-1; //declare summation as double/long if needed
int *d_a, *d_b, *d_c;
//int blocks, block_size = 512;
int size = N * sizeof(int);
a = (int *)malloc(SIZE*sizeof(int));
b = (int *)malloc(SIZE*sizeof(int));
summation = (int *)malloc(SIZE*sizeof(int));
cudaMalloc((void**)&d_a, SIZE * sizeof(int));
cudaMalloc((void**)&d_b, SIZE * sizeof(int));
cudaMalloc((void**)&d_c, SIZE * sizeof(int));
for (int i = 1; i<SIZE; i++)
{
a[i] = i;
b[i] = 0;
}
cudaMemcpy(d_a, a, SIZE*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b, SIZE*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_c, c, SIZE*sizeof(int), cudaMemcpyHostToDevice);
/*blocks = SIZE / block_size;
if (SIZE% block_size != 0)
blocks++; */
dim3 blocksize(256); // create 1D threadblock
dim3 gridsize(N / blocksize.x); //create 1D grid
vectoreAdd << < gridsize, blocksize >> >(d_a, d_b, d_c);
//cudaThreadSynchronize();
cudaMemcpy(b, d_b, SIZE*sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(summation, d_c, SIZE*sizeof(int), cudaMemcpyDeviceToHost);
for (int m = 0; m < SIZE; m++)
{
if (b[m] != 0)
{
printf("\n prime no is:%d", b[m]);
count = count + 1;
}
}
printf("\n\n Total prime no. are: %d", count);
/* for (int j = 1; j<SIZE; j++)
{
sum = sum + b[j];
}*/
printf("\n \nsum of all prime no upto %d is:%d", SIZE, summation);
clock_t toc = clock();
printf("\n\nElapsed: %f seconds\n", (double)(toc - tic) / CLOCKS_PER_SEC);
free(a); free(b); free(summation);
cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
getchar(); return 0;
}
There are lots of mistakes in your code :
cudaMalloc((void**)&d_a, SIZE * sizeof(int));
should be :
cudaMalloc((void**)&d_a, N * sizeof(int)); //OR
cudaMalloc((void**)&d_a, size);
since you already computed size but did not use it. The same applies to the malloc() calls in the host code.

Summing the rows of a matrix (stored in either row-major or column-major order) in CUDA

I'm working on the problem summing the rows of a matrix in CUDA. I'm giving the following example.
Suppose to have the following 20 * 4 array:
1 2 3 4
4 1 2 3
3 4 1 2
.
1 2 3 4
.
.
.
.
.
.
.
.
2 1 3 4
After flattening the 2D array to a 1D array (in either row-major or column-major order), I need to assign each thread to a different row and calculate the cost for that row.
For example
- thread 1 should calculate the cost for 1 2 3 4
- thread 2 should calculate the cost for 4 1 2 3
How can I do that in CUDA?
Thank you all for the reply
#include <stdio.h>
#include <stdlib.h>
#define MROWS 20
#define NCOLS 4
#define nTPB 256
// One thread per row: the thread with flat index r adds up the cols
// consecutive ints starting at costdata[r*cols] and stores the total in
// results[r]. Threads whose index is not below rows do nothing.
__global__ void mykernel(int *costdata, int rows, int cols, int *results){
    const int row = blockIdx.x*blockDim.x + threadIdx.x;
    if (row >= rows) return;
    int total = 0;
    int c = 0;
    while (c < cols){
        total += costdata[(row*cols)+c];
        ++c;
    }
    results[row] = total;
}
int main(){
//define and initialize host and device storage for cost and results
int *d_costdata, *h_costdata, *d_results, *h_results;
h_results = (int *)malloc(MROWS*sizeof(int));
h_costdata = (int *)malloc(MROWS*NCOLS*sizeof(int));
for (int i=0; i<(MROWS*NCOLS); i++)
h_costdata[i] = rand()%4;
cudaMalloc((void **)&d_results, MROWS*sizeof(int));
cudaMalloc((void **)&d_costdata, MROWS*NCOLS*sizeof(int));
//copy cost data from host to device
cudaMemcpy(d_costdata, h_costdata, MROWS*NCOLS*sizeof(int), cudaMemcpyHostToDevice);
mykernel<<<(MROWS + nTPB - 1)/nTPB, nTPB>>>(d_costdata, MROWS, NCOLS, d_results);
// copy results back from device to host
cudaMemcpy(h_results, d_results, MROWS*sizeof(int), cudaMemcpyDeviceToHost);
for (int i=0; i<MROWS; i++){
int loc_cost = 0;
for (int j=0; j<NCOLS; j++) loc_cost += h_costdata[(i*NCOLS)+j];
printf("cost[%d]: host= %d, device = %d\n", i, loc_cost, h_results[i]);
}
}
This assumes "cost" of each row is just the sum of the elements in each row. If you have a different "cost" function, you can modify the activity in the kernel for-loop accordingly. This also assumes C-style row-major data storage (1 2 3 4 4 1 2 3 3 4 1 2 etc.)
If you instead use column-major storage (1 4 3 etc.), you can slightly improve the performance, since the data reads can be fully coalesced. Then your kernel code could look like this:
for (int i = 0; i < cols; i++)
mycost += costdata[(i*rows)+tidx];
You should also use proper cuda error checking on all CUDA API calls and kernel calls.
EDIT: As discussed in the comments below, for the row-major storage case, in some situations it might give an increase in memory efficiency by electing to load 16-byte quantities rather than the base type. Following is a modified version that implements this idea for arbitrary dimensions and (more or less) arbitrary base types:
#include <iostream>
#include <typeinfo>
#include <cstdlib>
#include <vector_types.h>
#define MROWS 1742
#define NCOLS 801
#define nTPB 256
typedef double mytype;
// Reports the byte width associated with mytype for the supported element
// types: 4 for float / int / unsigned int, 8 for double, 1 for char /
// unsigned char. Returns 0 when mytype is none of these.
__host__ int sizetype(){
    const std::type_info &selected = typeid(mytype);
    if (selected == typeid(float) || selected == typeid(int) || selected == typeid(unsigned int))
        return 4;
    if (selected == typeid(double))
        return 8;
    if (selected == typeid(unsigned char) || selected == typeid(char))
        return 1;
    return 0;
}
// Row-sum kernel for pitched storage, one thread per row.
//   costdata : base of the pitched allocation
//   rows/cols: logical matrix dimensions in elements
//   results  : one T per row, receives the row total
//   size     : sizeof(T) in bytes; must be a factor of 16
//   pitch    : byte distance between consecutive rows
// While at least 16 bytes of a row remain they are fetched with one int4 load
// and summed element by element; the tail is read one element at a time.
// The int4 load assumes every row start is 16-byte aligned (true for memory
// from cudaMallocPitch; confirm for any other allocator).
template<typename T>
__global__ void mykernel(const T *costdata, int rows, int cols, T *results, int size, size_t pitch){
    int chunk = 16/size; // elements per 16-byte load; assumes size is a factor of 16
    int tidx = threadIdx.x + blockDim.x*blockIdx.x;
    if (tidx < rows){
        // Rows are pitch bytes apart, so step through the allocation in bytes.
        const T *myrowptr = (const T *)(((const unsigned char *)costdata) + tidx*pitch);
        T mycost = (T)0;
        int count = 0;
        while (count < cols){
            if ((cols-count)>=chunk){
                // read 16 bytes
                int4 temp = *((const int4 *)(myrowptr + count));
                int bcount = 16;
                int j = 0;
                while (bcount > 0){
                    mycost += *(((T *)(&temp)) + j++);
                    bcount -= size;
                    count++;
                }
            }
            else {
                // read one quantity at a time
                for (; count < cols; count++)
                    mycost += myrowptr[count];
            }
        }
        // Store once, after the whole row has been accumulated. Previously the
        // store sat inside the loop, so it was repeated on every pass and was
        // skipped altogether when cols == 0.
        results[tidx] = mycost;
    }
}
int main(){
int typesize = sizetype();
if (typesize == 0) {std::cout << "invalid type selected" << std::endl; return 1;}
//define and initialize host and device storage for cost and results
mytype *d_costdata, *h_costdata, *d_results, *h_results;
h_results = (mytype *)malloc(MROWS*sizeof(mytype));
h_costdata = (mytype *)malloc(MROWS*NCOLS*sizeof(mytype));
for (int i=0; i<(MROWS*NCOLS); i++)
h_costdata[i] = (mytype)(rand()%4);
size_t pitch = 0;
cudaMalloc((void **)&d_results, MROWS*sizeof(mytype));
cudaMallocPitch((void **)&d_costdata, &pitch, NCOLS*sizeof(mytype), MROWS);
//copy cost data from host to device
cudaMemcpy2D(d_costdata, pitch, h_costdata, NCOLS*sizeof(mytype), NCOLS*sizeof(mytype), MROWS, cudaMemcpyHostToDevice);
mykernel<<<(MROWS + nTPB - 1)/nTPB, nTPB>>>(d_costdata, MROWS, NCOLS, d_results, typesize, pitch);
// copy results back from device to host
cudaMemcpy(h_results, d_results, MROWS*sizeof(mytype), cudaMemcpyDeviceToHost);
for (int i=0; i<MROWS; i++){
mytype loc_cost = (mytype)0;
for (int j=0; j<NCOLS; j++) loc_cost += h_costdata[(i*NCOLS)+j];
if ((i < 10) && (typesize > 1))
std::cout <<"cost[" << i << "]: host= " << loc_cost << ", device = " << h_results[i] << std::endl;
if (loc_cost != h_results[i]){ std::cout << "mismatch at index" << i << "should be:" << loc_cost << "was:" << h_results[i] << std::endl; return 1; }
}
std::cout << "Results are correct!" << std::endl;
}

Matrix Multiplication giving wrong output [duplicate]

This question already has an answer here:
Unable to execute device kernel in CUDA
(1 answer)
Closed 7 years ago.
What I am attempting to do is Multiply Matrix A & Matrix B and then from the product matrix I get the index of the maximum value per column. But unfortunately, only the first 128*128 values of the matrix multiplication are correct while others are just garbage. I do not quite understand how this works. I request you to kindly guide me with this ..
#include<stdio.h>
#include "cuda.h"
#include<stdlib.h>
#define blockD 32
const int wA = 128;
const int hA = 4096;
const int wB = 4096;
const int hB = wA;
main(void){
void MatrixMultiplication(float *, float *, float *, float *);
int size_A = wA * hA * sizeof(float);
int size_B = wB * hB * sizeof(float);
int size_C = wB * hA * sizeof(float);
int size_max = 2 * wB * sizeof(float);
float *M, *N, *P, *C;
// allocate memory on the CPU
M = (float*)malloc(size_A);
N = (float*)malloc(size_B);
P = (float*)malloc(size_max);
C = (float*)malloc(size_C);
// initialize the matrices
for (int y=0; y < hA; y++) {
for (int x=0; x < wA; x++){
M[y*wA + x] = 32; //x + y*wA;
}
}
for (int y=0; y<hB; y++) {
for (int x=0; x<wB; x++){
N[y*wB + x] = 21; //x + y*wB;
}
}
MatrixMultiplication(M, N, P, C);
//Write
FILE *f1;
int i,j;
f1 = fopen("C.txt","w");
for(i = hA - 2 ; i < hA; i ++){
for(j = 0; j < wB; j++){
fprintf(f1,"%d\t",int(C[i*wB + j]));
}
fprintf(f1,"\n");
}
fclose(f1);
// free the memory allocated on the CPU
free( M );
free( N );
free( P );
free( C );
cudaDeviceReset();
return 0;
}
__device__ void MaxFunction(float* Pd, float* max)
{
int x = (threadIdx.x + blockIdx.x * blockDim.x);
int y = (threadIdx.y + blockIdx.y * blockDim.y);
int k = 0;
int temp = 0; int temp_idx = 0;
for (k = 0; k < wB; ++k) {
if(Pd[x*wB + k] > temp){
temp = Pd[x*wB + k];
temp_idx = x*wB + k;
}
}
max[y*2 + 0] = temp;
max[y*2 + 1] = temp_idx;
}
__global__ void MatrixMulKernel(float* Md, float* Nd, float* Pd, float* max)
{
// declare cache in the shared memory
__shared__ float Mds[blockD][blockD];
__shared__ float Nds[blockD][blockD];
float Pvalue = 0;
// Loop over the Md and Nd block dimension required to compute the Pd element
for (int m = (wA * blockD * blockIdx.y), n = (blockD * blockIdx.x);
m < ((wA * blockD * blockIdx.y)+wA-1);
m += blockD, n += (blockD*hB)){
// collaboratively loading of Md and Nd blocks into shared memory
Mds[threadIdx.y][threadIdx.x] = Md[m + wA * threadIdx.y + threadIdx.x];
Nds[threadIdx.y][threadIdx.x] = Nd[n + wA * threadIdx.y + threadIdx.x];
__syncthreads();
// keep track of the running sum
for (int k = 0; k < blockD; k++)
Pvalue += Mds[threadIdx.y][k] * Nds[k][threadIdx.x];
__syncthreads();
}
// write back to the global memory
int p = hB * blockD * blockIdx.y + blockD * blockIdx.x;
Pd[p + hB * threadIdx.y + threadIdx.x] = Pvalue;
__syncthreads();
MaxFunction(Pd, max);
}
void MatrixMultiplication(float *M, float *N, float *P, float *C) {
int size_A = wA * hA * sizeof(float);
int size_B = wB * hB * sizeof(float);
int size_C = wB * hA * sizeof(float);
int size_max = 2 * wB * sizeof(float);
float *Md, *Nd, *Pd, *max;
// allocate memory on the GPU
cudaMalloc((void**)&Md, size_A);
cudaMalloc((void**)&Nd, size_B);
cudaMalloc((void**)&Pd, size_C);
cudaMalloc((void**)&max, size_max);
// transfer M and N to device memory
cudaMemcpy(Md, M, size_A, cudaMemcpyHostToDevice);
cudaMemcpy(Nd, N, size_B, cudaMemcpyHostToDevice);
// kernel invocation code
dim3 dimBlock(blockD, blockD);
dim3 dimGrid(wA/blockD, hB/blockD);
//Execute Kernel
MatrixMulKernel<<<dimGrid, dimBlock>>>( Md, Nd, Pd, max);
// transfer P from device
cudaMemcpy(P, max, size_max, cudaMemcpyDeviceToHost);
cudaMemcpy(C, Pd, size_C, cudaMemcpyDeviceToHost);
// free the memory allocated on the GPU
cudaFree(Md);
cudaFree(Nd);
cudaFree(Pd);
cudaFree(max);
}
In your code you seem to have more than one problem. One of the problems is, in place of this:
dim3 dimGrid(wA/blockD, hB/blockD);
You should have this:
dim3 dimGrid(wB/blockD, hA/blockD);
Ultimately you need one thread in your grid for each output point. Your formulation was giving you a grid of 4 blocks by 4 blocks, whereas you need a grid of 128 blocks by 128 blocks.
The other problem I found with your code was in these lines in the kernel:
int p = hB * blockD * blockIdx.y + blockD * blockIdx.x;
Pd[p + hB * threadIdx.y + threadIdx.x] = Pvalue;
They are not indexing properly through the output array. Rather than try to sort it out using your scheme, I used this instead:
Pd[(threadIdx.x + (blockIdx.x * blockDim.x)) + ((threadIdx.y + (blockIdx.y * blockDim.y))*(gridDim.x*blockDim.x))] = Pvalue;
When I made the above two changes to your code, I got what I believe are correct results throughout the array. And it took about 32 seconds on my machine to run it. (Note that I haven't tried fixing your original max-finding code -- see below for a better approach.)
Based on your previous question, you seemed to be concerned about speed. If you want to do fast matrix multiply, you should use cublas. The following code shows how to use cublas to multiply two ordinary C-style matrices (they don't have to be square). I've also included a column-max finding kernel that will be fast when the number of columns is large (say, over 500 or so. You have 4096 columns in your example). For small numbers of columns, there may be quicker ways to perform this function, but small numbers of columns also suggests that the overall problem size may be small and so speed (of this piece of code) will not really be an issue.
Here's the code:
#include <stdio.h>
#include <cublas_v2.h>
#define VERBOSE 1
#define nTPB 64
#define ROW_A 4
#define COL_A 4
#define ROW_B COL_A
#define COL_B 4
#define ROW_C ROW_A
#define COL_C COL_B
#define SIZ_A (ROW_A*COL_A)
#define SIZ_B (ROW_B*COL_B)
#define SIZ_C (ROW_C*COL_C)
// error check macros
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
// for CUBLAS V2 API
#define cublasCheckErrors(fn) \
do { \
cublasStatus_t __err = fn; \
if (__err != CUBLAS_STATUS_SUCCESS) { \
fprintf(stderr, "Fatal cublas error: %d (at %s:%d)\n", \
(int)(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
// One thread per column of a row-major rows x cols matrix: walks down the
// column, keeping the largest value seen and the row where it first occurred
// (strict > comparison, so ties keep the earliest row). Writes the value to
// max[col] and the row to midx[col]. Threads at or beyond cols do nothing.
__global__ void col_max(float *mat, float *max, unsigned int *midx, unsigned int rows, unsigned int cols){
    const int col = blockIdx.x*blockDim.x + threadIdx.x;
    if (!(col < cols)) return;
    float best = mat[col];
    unsigned int best_row = 0;
    for (unsigned int r = 1; r < rows; ++r){
        const float candidate = mat[col + (r*cols)];
        if (candidate > best){
            best = candidate;
            best_row = r;
        }
    }
    max[col] = best;
    midx[col] = best_row;
}
int main(){
float *h_A, *h_B, *h_C, *d_A, *d_B, *d_C, *h_max, *d_max;
unsigned int *h_idx, *d_idx;
h_A = (float *)malloc(SIZ_A*sizeof(float));
if (h_A==0) {printf("malloc fail\n"); return -1;}
h_B = (float *)malloc(SIZ_B*sizeof(float));
if (h_B==0) {printf("malloc fail\n"); return -1;}
h_C = (float *)malloc(SIZ_C*sizeof(float));
if (h_C==0) {printf("malloc fail\n"); return -1;}
h_max = (float *)malloc(COL_C*sizeof(float));
if (h_max==0) {printf("malloc fail\n"); return -1;}
h_idx = (unsigned int*)malloc(COL_C*sizeof(unsigned int));
if (h_idx==0) {printf("malloc fail\n"); return -1;}
cudaMalloc((void **)&d_A, SIZ_A*sizeof(float));
cudaMalloc((void **)&d_B, SIZ_B*sizeof(float));
cudaMalloc((void **)&d_C, SIZ_C*sizeof(float));
cudaMalloc((void **)&d_max, COL_C*sizeof(float));
cudaMalloc((void **)&d_idx, COL_C*sizeof(unsigned int));
cudaCheckErrors("cuda malloc fail");
// initialize data
for (int i=0; i< SIZ_A; i++) h_A[i] = (float)(i+1);
for (int i=0; i< SIZ_B; i++) h_B[i] = (float)(i+2);
cudaMemcpy(d_A, h_A, SIZ_A*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, SIZ_B*sizeof(float), cudaMemcpyHostToDevice);
cudaCheckErrors("cuda memcpy 1 fail");
const float alpha = 1.0f;
const float beta = 0.0f;
cublasHandle_t handle;
cublasCheckErrors(cublasCreate(&handle));
// C = A*B
// due to cublas expecting column-major storage, parameters
// are scrambled
cublasCheckErrors(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, COL_B, ROW_A, COL_A, &alpha, d_B, COL_B, d_A, COL_A, &beta, d_C, COL_C));
cudaMemcpy(h_C, d_C, SIZ_C*sizeof(float), cudaMemcpyDeviceToHost);
cudaCheckErrors("cuda memcpy 2 fail");
col_max<<<(COL_C + nTPB - 1)/nTPB, nTPB>>>(d_C, d_max, d_idx, ROW_C, COL_C);
cudaCheckErrors("kernel launch fail");
cudaMemcpy(h_max, d_max, COL_C*sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(h_idx, d_idx, COL_C*sizeof(unsigned int), cudaMemcpyDeviceToHost);
cudaCheckErrors("cuda memcpy 3 fail/kernel fail");
if (VERBOSE){
printf("A: \n");
for (int i=0; i< ROW_A; i++){
for (int j=0; j< COL_A; j++)
printf("%7.5G", h_A[j+(i*COL_A)]);
printf("\n");}
printf("B: \n");
for (int i=0; i< ROW_B; i++){
for (int j=0; j< COL_B; j++)
printf("%7.5G", h_B[j+(i*COL_B)]);
printf("\n");}
printf("C = A*B: \n");
for (int i=0; i< ROW_C; i++){
for (int j=0; j< COL_C; j++)
printf("%7.5G", h_C[j+(i*COL_C)]);
printf("\n");}
printf("COLUMN MAX:\n");
for (int i=0; i< COL_C; i++)
printf("%7.5G", h_max[i]);
printf("\nCOLUMN MAX IDX:\n");
for (int i=0; i< COL_C; i++)
printf("%7d", h_idx[i]);
}
printf("\n finished!\n");
return 0;
}
Here's what I used to compile:
$ nvcc -arch=sm_20 -O3 -o t221 t221.cu -lcublas
And here's the sample output:
$ cuda-memcheck ./t221
========= CUDA-MEMCHECK
A:
1 2 3 4
5 6 7 8
9 10 11 12
13 14 15 16
B:
2 3 4 5
6 7 8 9
10 11 12 13
14 15 16 17
C = A*B:
100 110 120 130
228 254 280 306
356 398 440 482
484 542 600 658
COLUMN MAX:
484 542 600 658
COLUMN MAX IDX:
3 3 3 3
finished!
========= ERROR SUMMARY: 0 errors
$
When I extended my code to handle the same sizes you indicated, (A = 4096x128, B=128x4096) it took about 1 second on my machine. So it's much faster than your code. However, when I take your code and comment out your call to MaxFunction in the kernel, it also only takes about 1 second to compute the matrix multiply result. So if you wanted to keep your matrix multiply code (i.e. not use cublas) you could break the code into 2 kernels, and use your multiply routine in the first kernel with my max-finding routine (col_max) in the second kernel, and also probably get a pretty fast result.
As @talonmies indicated, if you are running on a Windows machine, be sure you are aware of the ramifications of Windows TDR (search for that term in the search box in the upper right corner if needed).

scan-array CUDA

I'm trying to scan a simple array using CUDA, but it seems there is something wrong with the code below. I am trying to find what I am doing wrong but I can't. Can anyone please help me?
#include <stdio.h>
#include <stdlib.h>
__global__ void prescan(int *g_odata, int *g_idata, int n){
extern __shared__ int temp[];// allocated on invocation
int thid = threadIdx.x;
int offset = 1;
temp[2*thid] = g_idata[2*thid]; // load input into shared memory
temp[2*thid+1] = g_idata[2*thid+1];
for (int d = n>>1; d > 0; d >>= 1){ // build sum in place up the tree
__syncthreads();
if (thid < d){
int ai = offset*(2*thid+1)-1;
int bi = offset*(2*thid+2)-1;
temp[bi] += temp[ai];
}
offset *= 2;
}
if (thid == 0) { temp[n - 1] = 0; } // clear the last element
for (int d = 1; d < n; d *= 2){ // traverse down tree & build scan
offset >>= 1;
__syncthreads();
if (thid < d){
int ai = offset*(2*thid+1)-1;
int bi = offset*(2*thid+2)-1;
int t = temp[ai];
temp[ai] = temp[bi];
temp[bi] += t;
}
}
__syncthreads();
g_odata[2*thid] = temp[2*thid]; // write results to device memory
g_odata[2*thid+1] = temp[2*thid+1];
}
int main(int argc, char *argv[]){
int i;
int *input = 0;
int *output = 0;
int *g_idata = 0;
int *g_odata = 0;
int numblocks = 1;
int radix = 16;
input = (int*)malloc(numblocks*radix*sizeof(int));
output = (int*)malloc(numblocks*radix*sizeof(int));
cudaMalloc((void**)&g_idata, numblocks*radix*sizeof(int));
cudaMalloc((void**)&g_odata, numblocks*radix*sizeof(int));
for(i=0; i<numblocks*radix; i++){
input[i] = 1 + 2*i;
}
for(i=0; i<numblocks*radix; i++){
printf("%d ", input[i]);
}
cudaMemcpy(g_idata, input, numblocks*radix*sizeof(int), cudaMemcpyHostToDevice);
prescan<<<1,8>>>(g_odata, g_idata, numblocks*radix);
cudaThreadSynchronize();
cudaMemcpy(output, g_odata, numblocks*radix*sizeof(int), cudaMemcpyDeviceToHost);
for(i=0; i<numblocks*radix; i++){
printf("%d ", output[i]);
}
free(input);
free(output);
cudaFree(g_idata);
cudaFree(g_odata);
return 0;
}
The output is this: 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0. I want to have this output: 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 0 1 4 9 16 25 36 49 64 81 100 121 144 169 196 225
Just go through this code to implement scan in parallel environment.
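The algorithm I implemented here is the Hillis-Steele scan. As written it produces an inclusive scan (1 4 9 16 ...); to obtain the exclusive result asked for in the question, shift the output right by one element and put 0 in the first position. I implemented the algorithm in shared memory, which should improve the execution time for large data sets.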
#include<stdio.h>
#include<math.h>
// Inclusive prefix sum (Hillis-Steele) of the first n ints of d_in, written
// to d_out.
// Launch requirements: a single block with blockDim.x >= n threads and at
// least blockDim.x * sizeof(int) bytes of dynamic shared memory.
__global__ void scan(int *d_in,int *d_out,int n)
{
    extern __shared__ int sdata[];
    int tid = threadIdx.x;
    // Threads past the end of the input contribute 0 but still take part in
    // every barrier below, so no __syncthreads() is ever skipped.
    sdata[tid] = (tid < n) ? d_in[tid] : 0;
    __syncthreads();   // every element is loaded before anyone reads a neighbour
    for (int i = 1; i < n; i <<= 1)
    {
        // Read the partner value into a register first. Adding in place
        // without this would race with the thread that owns sdata[tid-i].
        int addend = 0;
        if (tid >= i)
        {
            addend = sdata[tid-i];
        }
        __syncthreads();   // all reads of this step finish before any write
        sdata[tid] += addend;
        __syncthreads();   // all writes finish before the next step reads
    }
    if (tid < n)
    {
        d_out[tid] = sdata[tid];
    }
}
// Host driver for scan(): builds the 16 odd numbers 1, 3, ..., 31, prints
// them, runs the scan on the GPU with one block of 16 threads and prints the
// result on the same line. Returns 0 on success, 1 if any CUDA call fails.
int main()
{
    const int count = 16;
    const size_t bytes = sizeof(int) * count;
    int h_in[count], h_out[count];
    int i;
    for (i = 0; i < count; i++)
        h_in[i] = 2*i+1;
    for (i = 0; i < count; i++)
        printf("%d ", h_in[i]);
    int *d_in = NULL;
    int *d_out = NULL;
    // Each step runs only if everything before it succeeded; the first
    // failure is kept in err and reported after cleanup.
    cudaError_t err = cudaMalloc((void**)&d_in, bytes);
    if (err == cudaSuccess)
        err = cudaMalloc((void**)&d_out, bytes);
    if (err == cudaSuccess)
        err = cudaMemcpy(d_in, h_in, bytes, cudaMemcpyHostToDevice);
    if (err == cudaSuccess)
    {
        // Third launch argument: one int of dynamic shared memory per thread.
        scan <<<1, count, bytes >>>(d_in, d_out, count);
        err = cudaGetLastError();
    }
    // The blocking copy also waits for the kernel and reports any fault it hit.
    if (err == cudaSuccess)
        err = cudaMemcpy(h_out, d_out, bytes, cudaMemcpyDeviceToHost);
    // cudaFree on a null pointer does nothing, so this is safe on every path.
    cudaFree(d_in);
    cudaFree(d_out);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        return 1;
    }
    for (i = 0; i < count; i++)
        printf("%d ", h_out[i]);
    return 0;
}