How to cope with "cudaErrorMissingConfiguration" from "cudaMallocPitch" function of CUDA?

How to cope with "cudaErrorMissingConfiguration" from "cudaMallocPitch" function of CUDA? - cuda

I'm writing a Mandelbrot set program with CUDA. However, I can't make any further progress until I resolve the cudaErrorMissingConfiguration error returned by the cudaMallocPitch() function. Could you tell me what is causing it?
My GPU is GeForce RTX 2060 SUPER.
I'll show you my command lines below.
> nvcc MandelbrotCUDA.cu -o MandelbrotCUDA -O3
I tried cudaDeviceSetLimit( cudaLimitMallocHeapSize, 7*1024*1024*1024 ) to
resize the heap.
The cudaDeviceSetLimit call succeeded.
However, the program still gets no further: it never prints "CUDA malloc done!"
#include <iostream>
#include <thrust/complex.h>
#include <fstream>
#include <string>
#include <stdlib.h>
using namespace std;
#define D 0.0000025 // Tick
#define LIMIT_N 255
#define INF_NUM 2
#define PLOT_METHOD 2 // dat file : 0, ppm file : 1, ppm file with C : 2
// Mandelbrot escape-count kernel as posted in the question.
// For every (i, j) with i < indexTotalY and j < indexTotalX it iterates
// z = z*z + c up to LIMIT_N times and counts the steps where |z| < INF_NUM.
// The body never reads threadIdx/blockIdx, so every launched thread walks the
// whole index range by itself.
// NOTE(review): with n declared as int***, n[i][j] is an int*, so the "= 0"
// below stores a null pointer and the "+" is pointer arithmetic rather than an
// integer count; likewise c[i][j] is a pointer that gets dereferenced. The
// signature presumably has to change before this can work -- confirm.
__global__
void calculation(const int indexTotalX, const int indexTotalY, int ***n, thrust::complex<double> ***c){ // n, c are the pointers of dN, dC.
for(int i = 0; i < indexTotalY ; i++){
for(int j = 0; j < indexTotalX; j++){
// z starts at the origin for every sample point.
thrust::complex<double> z(0.0f, 0.0f);
n[i][j] = 0;
for(int ctr=1; ctr <= LIMIT_N ; ctr++){
z = z*z + (*(c[i][j]));
// Adds 1 while |z| is still below the INF_NUM threshold, 0 otherwise.
n[i][j] = n[i][j] + (abs(z) < INF_NUM);
}
}
}
}
int main(){
// Data Path
string filePath = "Y:\\Documents\\Programming\\mandelbrot\\";
string fileName = "mandelbrot4.ppm";
string filename = filePath+fileName;
//complex<double> c[N][M];
double xRange[2] = {-0.76, -0.74};
double yRange[2] = {0.05, 0.1};
const int indexTotalX = (xRange[1]-xRange[0])/D;
const int indexTotalY = (yRange[1]-yRange[0])/D;
thrust::complex<double> **c;
//c = new complex<double> [N];
cout << "debug_n" << endl;
int **n;
n = new int* [indexTotalY];
c = new thrust::complex<double> * [indexTotalY];
for(int i=0;i<indexTotalY;i++){
n[i] = new int [indexTotalX];
c[i] = new thrust::complex<double> [indexTotalX];
}
cout << "debug_n_end" << endl;
for(int i = 0; i < indexTotalY; i++){
for(int j = 0; j < indexTotalX; j++){
thrust::complex<double> tmp( xRange[0]+j*D, yRange[0]+i*D );
c[i][j] = tmp;
//n[i*sqrt(N)+j] = 0;
}
}
// CUDA malloc
cout << "CUDA malloc initializing..." << endl;
int **dN;
thrust::complex<double> **dC;
cudaError_t error;
error = cudaDeviceSetLimit(cudaLimitMallocHeapSize, 7*1024*1024*1024);
if(error != cudaSuccess){
cout << "cudaDeviceSetLimit's ERROR CODE = " << error << endl;
return 0;
}
size_t tmpPitch;
error = cudaMallocPitch((void **)dN, &tmpPitch,(size_t)(indexTotalY*sizeof(int)), (size_t)(indexTotalX*sizeof(int)));
if(error != cudaSuccess){
cout << "CUDA ERROR CODE = " << error << endl;
cout << "indexTotalX = " << indexTotalX << endl;
cout << "indexTotalY = " << indexTotalY << endl;
return 0;
}
cout << "CUDA malloc done!" << endl;
This is console messages below.
debug_n
debug_n_end
CUDA malloc initializing...
CUDA ERROR CODE = 1
indexTotalX = 8000
indexTotalY = 20000

There are several problems here:
int **dN;
...
error = cudaMallocPitch((void **)dN, &tmpPitch,(size_t)(indexTotalY*sizeof(int)), (size_t)(indexTotalX*sizeof(int)));
The correct type of pointer to use in CUDA allocations is a single pointer:
int *dN;
not a double pointer:
int **dN;
(so your kernel where you are trying pass triple-pointers:
void calculation(const int indexTotalX, const int indexTotalY, int ***n, thrust::complex<double> ***c){ // n, c are the pointers of dN, dC.
is almost certainly not going to work, and should not be designed that way, but that is not the question you are asking.)
The pointer is passed to the allocating function by its address:
error = cudaMallocPitch((void **)&dN,
For cudaMallocPitch, only the horizontal requested dimension is scaled by the size of the data element. The allocation height is not scaled this way. Also, I will assume X corresponds to your allocation width, and Y corresponds to your allocation height, so you also have those parameters reversed:
error = cudaMallocPitch((void **)&dN, &tmpPitch,(size_t)(indexTotalX*sizeof(int)), (size_t)(indexTotalY));
The cudaLimitMallocHeapSize should not be necessary to set to make any of this work. It applies only to in-kernel allocations. Reserving 7GB on an 8GB card may also cause problems. Until you are sure you need that (it's not needed for what you have shown) I would simply remove that.
$ cat t1488.cu
#include <iostream>
#include <thrust/complex.h>
#include <fstream>
#include <string>
#include <stdlib.h>
using namespace std;
#define D 0.0000025 // Tick
#define LIMIT_N 255
#define INF_NUM 2
#define PLOT_METHOD 2 // dat file : 0, ppm file : 1, ppm file with C : 2
// Kernel carried over unchanged from the question; it is never launched in
// this listing. It iterates z = z*z + c up to LIMIT_N times per (i, j) and
// counts the steps where |z| < INF_NUM.
// NOTE(review): n and c are triple pointers, so n[i][j] is an int* (the "= 0"
// stores a null pointer, the "+" is pointer arithmetic) and c[i][j] is a
// pointer that gets dereferenced. Presumably this needs a redesign around
// single pointers before it can be used -- confirm.
__global__
void calculation(const int indexTotalX, const int indexTotalY, int ***n, thrust::complex<double> ***c){ // n, c are the pointers of dN, dC.
for(int i = 0; i < indexTotalY ; i++){
for(int j = 0; j < indexTotalX; j++){
// z starts at the origin for every sample point.
thrust::complex<double> z(0.0f, 0.0f);
n[i][j] = 0;
for(int ctr=1; ctr <= LIMIT_N ; ctr++){
z = z*z + (*(c[i][j]));
// Adds 1 while |z| is still below the INF_NUM threshold, 0 otherwise.
n[i][j] = n[i][j] + (abs(z) < INF_NUM);
}
}
}
}
// Host driver for the corrected allocation: builds the grid of complex sample
// points on the host, makes one pitched device allocation for the iteration
// counts, and reports whether that allocation succeeded. No kernel is launched
// and nothing is copied to the device in this listing.
int main(){
// Data Path
// NOTE: filename is assembled here but not used anywhere below.
string filePath = "Y:\\Documents\\Programming\\mandelbrot\\";
string fileName = "mandelbrot4.ppm";
string filename = filePath+fileName;
//complex<double> c[N][M];
double xRange[2] = {-0.76, -0.74};
double yRange[2] = {0.05, 0.1};
// Number of samples per axis: range width divided by the tick D, truncated
// to int by the implicit double-to-int conversion.
const int indexTotalX = (xRange[1]-xRange[0])/D;
const int indexTotalY = (yRange[1]-yRange[0])/D;
thrust::complex<double> **c;
//c = new complex<double> [N];
cout << "debug_n" << endl;
int **n;
// Host-side storage: one separately allocated row per Y index for both the
// counts (n) and the sample points (c). These rows are never freed here.
n = new int* [indexTotalY];
c = new thrust::complex<double> * [indexTotalY];
for(int i=0;i<indexTotalY;i++){
n[i] = new int [indexTotalX];
c[i] = new thrust::complex<double> [indexTotalX];
}
cout << "debug_n_end" << endl;
// Fill c with the complex coordinate of every sample: real part follows j
// (X), imaginary part follows i (Y).
for(int i = 0; i < indexTotalY; i++){
for(int j = 0; j < indexTotalX; j++){
thrust::complex<double> tmp( xRange[0]+j*D, yRange[0]+i*D );
c[i][j] = tmp;
//n[i*sqrt(N)+j] = 0;
}
}
// CUDA malloc
cout << "CUDA malloc initializing..." << endl;
// Single-level device pointer; its address is what cudaMallocPitch fills in.
int *dN;
// Declared but never referenced (the compiler warns about this).
thrust::complex<double> **dC;
cudaError_t error;
size_t tmpPitch;
// Width is given in bytes (X elements * sizeof(int)); height is a plain row
// count (Y). tmpPitch receives the pitch value written by cudaMallocPitch.
error = cudaMallocPitch((void **)&dN, &tmpPitch,(size_t)(indexTotalX*sizeof(int)), (size_t)(indexTotalY));
if(error != cudaSuccess){
cout << "CUDA ERROR CODE = " << error << endl;
cout << "indexTotalX = " << indexTotalX << endl;
cout << "indexTotalY = " << indexTotalY << endl;
return 0;
}
cout << "CUDA malloc done!" << endl;
}
$ nvcc -o t1488 t1488.cu
t1488.cu(68): warning: variable "dC" was declared but never referenced
$ cuda-memcheck ./t1488
========= CUDA-MEMCHECK
debug_n
debug_n_end
CUDA malloc initializing...
CUDA malloc done!
========= ERROR SUMMARY: 0 errors
$

Related

Error: 2.5e-1 cannot be used as a function

i wrote a simple program and i'm getting this error which i never encountered yet. Can you help me out?
line 13: error: 2.5e-1 cannot be used as a function
#include <iostream>
#include <iomanip>
using namespace std;
int dirac(int);
// Prints y for k = 0..4 with fixed 3-digit precision, where y is built from
// shifted dirac() impulses.
int main()
{
float y;
for(int k = 0; k <= 4; k++){
// NOTE(review): "0.25(" has no operator between the literal and the opening
// parenthesis, so the compiler parses it as a call on 0.25 -- this is what
// produces "2.5e-1 cannot be used as a function". An operator (presumably
// "*") is missing at that spot.
y = 2*dirac(k)-0.5*dirac(k-1)*0.25(2*dirac(k-2)-0.5*dirac(k-3));
cout << "k = " << k << ": ";
// Pad the value to a width of 8 using spaces.
cout << setw(8) << setfill(' ');
cout << setprecision(3) << fixed << y << endl;
}
return 0;
}
// Discrete unit impulse: yields 1 when x is zero and 0 for any other integer.
int dirac(int x){
    return (x == 0) ? 1 : 0;
}

y = 2*dirac(k)-0.5*dirac(k-1)*0.25(2*dirac(k-2)-0.5*dirac(k-3));
^---
You probably forgot a * at the indicated spot.

Copying an array from host to constant memory [duplicate]

I have the following code to copy from a host variable to a __constant__ variable in CUDA
/*
 * Color-lookup-table (CLUT) quantization driver.
 *
 * Command line: <input> <output> <nColors>
 *   input   - text image read with readText2RGB()
 *   output  - text image written with writeRGB2Text()
 *   nColors - number of representative colors (must be > 0)
 *
 * Phases:
 *   1. clut_distributePixels assigns every pixel an initial group.
 *   2. (host) each group's representative becomes the average color of its
 *      pixels; an empty group gets (255, 255, 255).
 *   3. clut_checkDistances moves each pixel to the nearest representative and
 *      raises d_change when any pixel moved. Phases 2 and 3 repeat until no
 *      pixel moves.
 *   4. clut_createImage builds the quantized image, which is written out.
 *
 * Returns 1 on a usage error, FAILURE on bad input / I/O errors and SUCCESS
 * otherwise (these values are kept as the original program returned them).
 */
int main(int argc, char **argv){
    int exit_code;
    if (argc < 4) {
        std::cout << "Usage: \n " << argv[0] << " <input> <output> <nColors>" << std::endl;
        return 1;
    }

    // Declared before its first use; the original listing started timer1
    // several lines ahead of its declaration, which does not compile.
    CpuTimer timer1;

    Color *h_input;
    int h_rows, h_cols;
    timer1.Start();
    exit_code = readText2RGB(argv[1], &h_input, &h_rows, &h_cols);
    timer1.Stop();
    std::cout << "Reading: " << timer1.Elapsed() << std::endl;
    if (exit_code != SUCCESS){
        std::cout << "Error trying to read file." << std::endl;
        return FAILURE;
    }

    // A non-positive color count would make the allocations below invalid.
    int h_numColors = atoi(argv[3]);
    if (h_numColors <= 0) {
        std::cout << "<nColors> must be a positive integer." << std::endl;
        delete[] h_input;
        return FAILURE;
    }

    GpuTimer timer2;
    float timeStep2 = 0, timeStep3 = 0;
    int h_change = 0;
    int *h_pixelGroup = new int[h_rows*h_cols];
    Color *h_groupRep = new Color[h_numColors];
    Color *h_output = new Color[h_rows*h_cols];
    Color *d_input;
    int *d_pixelGroup;
    Color *d_groupRep;
    Color *d_output;

    // One thread per pixel, rounded up to whole B_WIDTH x B_HEIGHT blocks.
    dim3 block(B_WIDTH, B_HEIGHT);
    dim3 grid((h_cols+B_WIDTH-1)/B_WIDTH, (h_rows+B_HEIGHT-1)/B_HEIGHT);

    checkCudaError(cudaMalloc((void**)&d_input, sizeof(Color)*h_rows*h_cols));
    checkCudaError(cudaMalloc((void**)&d_pixelGroup, sizeof(int)*h_rows*h_cols));
    checkCudaError(cudaMalloc((void**)&d_groupRep, sizeof(Color)*h_numColors));
    checkCudaError(cudaMalloc((void**)&d_output, sizeof(Color)*h_rows*h_cols));

    // STEP 1
    //Evenly distribute all pixels of the image onto the color set
    timer2.Start();
    checkCudaError(cudaMemcpyToSymbol(c_rows, &h_rows, sizeof(int)));
    checkCudaError(cudaMemcpyToSymbol(c_cols, &h_cols, sizeof(int)));
    checkCudaError(cudaMemcpyToSymbol(c_numColors, &h_numColors, sizeof(int)));
    checkCudaError(cudaMemcpy(d_input, h_input, sizeof(Color)*h_rows*h_cols, cudaMemcpyHostToDevice));
    clut_distributePixels<<<grid, block>>>(d_pixelGroup);
    // A bad launch configuration is only reported through cudaGetLastError().
    checkCudaError(cudaGetLastError());
    checkCudaError(cudaMemcpy(h_pixelGroup, d_pixelGroup, sizeof(int)*h_rows*h_cols, cudaMemcpyDeviceToHost));
    timer2.Stop();
    std::cout << "Phase 1: " << timer2.Elapsed() << std::endl;
    // Debug sample of the initial assignment; skipped for images with fewer
    // than nine pixels so the indices stay in bounds.
    if (h_rows*h_cols > 8) {
        std::cout << h_pixelGroup[0] << ","
                  << h_pixelGroup[3] << ","
                  << h_pixelGroup[4] << ","
                  << h_pixelGroup[7] << ","
                  << h_pixelGroup[8] << std::endl;
    }

    //Do the STEP 2 and STEP 3 as long as there is at least one change of representative in a group
    do {
        // STEP 2
        //Set the representative value to the average colour of all pixels in the same set
        timer1.Start();
        for (int ng = 0; ng < h_numColors; ng++) {
            int r = 0, g = 0, b = 0;
            int elem = 0;
            for (int i = 0; i < h_rows; i++) {
                for (int j = 0; j < h_cols; j++) {
                    if (h_pixelGroup[i*h_cols+j] == ng) {
                        r += h_input[i*h_cols+j].r;
                        g += h_input[i*h_cols+j].g;
                        b += h_input[i*h_cols+j].b;
                        elem++;
                    }
                }
            }
            if (elem == 0) {
                // Empty group: fall back to white.
                h_groupRep[ng].r = 255;
                h_groupRep[ng].g = 255;
                h_groupRep[ng].b = 255;
            }else{
                h_groupRep[ng].r = r/elem;
                h_groupRep[ng].g = g/elem;
                h_groupRep[ng].b = b/elem;
            }
        }
        timer1.Stop();
        timeStep2 += timer1.Elapsed();

        // STEP 3
        //For each pixel in the image, compute Euclidean's distance to each representative
        //and assign it to the set which is closest
        h_change = 0;
        timer2.Start();
        checkCudaError(cudaMemcpyToSymbol(d_change, &h_change, sizeof(int)));
        checkCudaError(cudaMemcpy(d_groupRep, h_groupRep, sizeof(Color)*h_numColors, cudaMemcpyHostToDevice));
        clut_checkDistances<<<grid, block>>>(d_input, d_pixelGroup, d_groupRep);
        checkCudaError(cudaGetLastError());
        checkCudaError(cudaMemcpy(h_pixelGroup, d_pixelGroup, sizeof(int)*h_rows*h_cols, cudaMemcpyDeviceToHost));
        checkCudaError(cudaMemcpyFromSymbol(&h_change, d_change, sizeof(int)));
        timer2.Stop();
        timeStep3 += timer2.Elapsed();
        std::cout << "Chunche" << std::endl;
    } while (h_change == 1);
    std::cout << "Phase 2: " << timeStep2 << std::endl;
    std::cout << "Phase 3: " << timeStep3 << std::endl;

    // STEP 4
    //Create the new image with the resulting color lookup table
    timer2.Start();
    clut_createImage<<<grid, block>>>(d_output, d_pixelGroup, d_groupRep);
    checkCudaError(cudaGetLastError());
    checkCudaError(cudaMemcpy(h_output, d_output, sizeof(Color)*h_rows*h_cols, cudaMemcpyDeviceToHost));
    timer2.Stop();
    std::cout << "Phase 4: " << timer2.Elapsed() << std::endl;

    checkCudaError(cudaFree(d_input));
    checkCudaError(cudaFree(d_pixelGroup));
    checkCudaError(cudaFree(d_groupRep));
    checkCudaError(cudaFree(d_output));

    // Write the quantized result. The original listing passed h_input here,
    // which wrote the unmodified source image and left h_output unused.
    timer1.Start();
    exit_code = writeRGB2Text(argv[2], h_output, h_rows, h_cols);
    timer1.Stop();
    std::cout << "Writing: " << timer1.Elapsed() << std::endl;

    delete[] h_input;   // allocated by readText2RGB()
    delete[] h_pixelGroup;
    delete[] h_groupRep;
    delete[] h_output;

    if (exit_code != SUCCESS) {
        std::cout << "Error trying to write file." << std::endl;
        return FAILURE;
    }
    return SUCCESS;
}
when I print from within the kernel I get zeros for the three values
// Debug variant of the initial pixel distribution: thread (0, 0) first prints
// the three values it sees in c_rows, c_cols and c_numColors, then every
// thread assigns group (linear index / c_numColors) to the pixels it visits,
// stepping by the full launch extent in each dimension.
__global__
void clut_distributePixels(int *pixelGroup){
    const int firstRow = blockDim.y * blockIdx.y + threadIdx.y;
    const int firstCol = blockDim.x * blockIdx.x + threadIdx.x;

    if (firstRow == 0 && firstCol == 0) {
        printf("a: %d\n", c_rows);
        printf("b: %d\n", c_cols);
        printf("c: %d\n", c_numColors);
    }

    for (int row = firstRow; row < c_rows; row += gridDim.y * blockDim.y) {
        for (int col = firstCol; col < c_cols; col += gridDim.x * blockDim.x) {
            const int pixel = row*c_cols + col;
            pixelGroup[pixel] = pixel/c_numColors;
        }
    }
}
Either I am not copying correctly to constant memory or ... I don't know what could be wrong. Any advice?
I have posted the entire host code in case something else is interfering with the constant-memory copies.
UPDATE
Main.cu
#include "Imageproc.cuh"
// Minimal reproduction: copies h_rows into the c_rows symbol, runs chunche
// (which stores c_rows + 1 into d_change) and prints the value read back from
// d_change.
// NOTE(review): c_rows and d_change come from Imageproc.cuh while chunche is
// defined in Imageproc.cu; whether both files refer to the same device symbols
// is exactly what this test is probing.
int main(){
    int h_change = 0;
    int h_rows = 512;
    checkCudaError(cudaMemcpyToSymbol(c_rows, &h_rows, sizeof(int)));
    chunche<<<1,1>>>();
    // Launch failures are only visible through cudaGetLastError().
    checkCudaError(cudaGetLastError());
    checkCudaError(cudaMemcpyFromSymbol(&h_change, d_change, sizeof(int)));
    std::cout << "H = " << h_change << std::endl;
    return 0;
}
Imageproc.cuh
// Imageproc.cuh -- shared declarations for the CLUT kernels: block
// dimensions, device-side parameters and the kernel prototypes.
#ifndef _IMAGEPROC_CUH_
#define _IMAGEPROC_CUH_
#include "Utilities.cuh"
// Thread-block dimensions used by the host code when building dim3 block/grid.
#define B_WIDTH 16
#define B_HEIGHT 16
// Image dimensions and palette size read by the kernels; written from the
// host with cudaMemcpyToSymbol.
// NOTE(review): these are definitions, not extern declarations, so every
// source file that includes this header presumably gets its own copy of each
// symbol -- confirm against how the project is compiled and linked.
__constant__ int c_rows;
__constant__ int c_cols;
__constant__ int c_numColors;
// Device flag the kernels write and the host reads back with
// cudaMemcpyFromSymbol.
__device__ int d_change;
#ifdef __cplusplus
extern "C"
{
#endif
// Stores c_rows + 1 into d_change (test kernel).
__global__
void chunche();
// Writes an initial group index for every pixel into pixelGroup.
__global__
void clut_distributePixels(int *pixelGroup);
// Reassigns each pixel to the representative with the smallest squared RGB
// distance and sets d_change when an assignment changed.
__global__
void clut_checkDistances(Color *input, int *pixelGroup, Color *groupRep);
// Fills clutImage with the representative color of each pixel's group.
__global__
void clut_createImage(Color *clutImage, int *pixelGroup, Color *groupRep);
#ifdef __cplusplus
}
#endif
#endif
Imageproc.cu
#include "Imageproc.cuh"
// Test kernel: publishes (c_rows + 1) through d_change so the host can check
// which value of c_rows the device code sees.
__global__
void chunche(){
    const int rowsPlusOne = c_rows + 1;
    d_change = rowsPlusOne;
}
// Initial pixel distribution: every visited pixel gets the group
// (linear index / c_numColors). Each thread starts at its own (row, col) and
// steps by the full launch extent in each dimension until it leaves the
// c_rows x c_cols image.
__global__
void clut_distributePixels(int *pixelGroup){
    const int firstRow = blockDim.y * blockIdx.y + threadIdx.y;
    const int firstCol = blockDim.x * blockIdx.x + threadIdx.x;

    for (int row = firstRow; row < c_rows; row += gridDim.y * blockDim.y) {
        for (int col = firstCol; col < c_cols; col += gridDim.x * blockDim.x) {
            const int pixel = row*c_cols + col;
            pixelGroup[pixel] = pixel/c_numColors;
        }
    }
}
// Nearest-representative assignment. For every visited pixel the kernel picks
// the group whose representative has the smallest squared RGB distance to the
// pixel (ties keep the lower group index), stores it in pixelGroup and sets
// d_change to 1 whenever the stored group differed from the new one.
__global__
void clut_checkDistances(Color *input, int *pixelGroup, Color *groupRep){
    const int firstRow = blockDim.y * blockIdx.y + threadIdx.y;
    const int firstCol = blockDim.x * blockIdx.x + threadIdx.x;

    for (int row = firstRow; row < c_rows; row += gridDim.y * blockDim.y) {
        for (int col = firstCol; col < c_cols; col += gridDim.x * blockDim.x) {
            const int pixel = row*c_cols + col;
            int best = 0;

            for (int cand = 1; cand < c_numColors; cand++) {
                // Squared distance from the pixel to the candidate group.
                const int cr = groupRep[cand].r - input[pixel].r;
                const int cg = groupRep[cand].g - input[pixel].g;
                const int cb = groupRep[cand].b - input[pixel].b;
                const int candDist = cr*cr + cg*cg + cb*cb;

                // Squared distance from the pixel to the best group so far.
                const int br = groupRep[best].r - input[pixel].r;
                const int bg = groupRep[best].g - input[pixel].g;
                const int bb = groupRep[best].b - input[pixel].b;
                const int bestDist = br*br + bg*bg + bb*bb;

                if (candDist < bestDist) {
                    best = cand;
                }
            }

            if (pixelGroup[pixel] != best) {
                pixelGroup[pixel] = best;
                d_change = 1;
            }
        }
    }
}
// Builds the quantized image: each visited pixel of clutImage receives the
// r/g/b of the representative color of the group recorded for it in
// pixelGroup.
__global__
void clut_createImage(Color *clutImage, int *pixelGroup, Color *groupRep){
    const int firstRow = blockDim.y * blockIdx.y + threadIdx.y;
    const int firstCol = blockDim.x * blockIdx.x + threadIdx.x;

    for (int row = firstRow; row < c_rows; row += gridDim.y * blockDim.y) {
        for (int col = firstCol; col < c_cols; col += gridDim.x * blockDim.x) {
            const int pixel = row*c_cols + col;
            const Color &rep = groupRep[pixelGroup[pixel]];
            clutImage[pixel].r = rep.r;
            clutImage[pixel].g = rep.g;
            clutImage[pixel].b = rep.b;
        }
    }
}
Utilities.cuh
#ifndef _UTILITIES_CUH_
#define _UTILITIES_CUH_
#include <iostream>
#include <fstream>
#include <string>
#define SUCCESS 1
#define FAILURE 0
#define checkCudaError(val) check( (val), #val, __FILE__, __LINE__)
typedef struct {
int r;
int g;
int b;
} vec3u;
typedef vec3u Color;
typedef unsigned char uchar;
typedef uchar Grayscale;
// Timer built on a pair of CUDA events recorded on stream 0.
// Usage: Start(); ...GPU work...; Stop(); Elapsed().
// NOTE: the struct owns its two events and destroys them in the destructor,
// so instances should not be copied.
struct GpuTimer{
    cudaEvent_t start;
    cudaEvent_t stop;

    // Creates both events.
    GpuTimer(){
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
    }

    // Destroys both events.
    ~GpuTimer(){
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
    }

    // Records the start event on stream 0.
    void Start(){
        cudaEventRecord(start, 0);
    }

    // Records the stop event on stream 0.
    void Stop(){
        cudaEventRecord(stop, 0);
    }

    // Waits for the stop event and returns the time between the two events as
    // reported by cudaEventElapsedTime. Returns 0.0f instead of an
    // uninitialized value when the stop event cannot be synchronized or the
    // query fails.
    float Elapsed(){
        float elapsed = 0.0f;
        if (cudaEventSynchronize(stop) == cudaSuccess) {
            if (cudaEventElapsedTime(&elapsed, start, stop) != cudaSuccess) {
                elapsed = 0.0f;
            }
        }
        return elapsed;
    }
};
// Backend of the checkCudaError macro: when err differs from cudaSuccess it
// reports the failing expression (func) with its file/line and the CUDA error
// string on std::cerr, then terminates the process with status 1.
template<typename T>
void check(T err, const char* const func, const char* const file, const int line) {
    const bool failed = (err != cudaSuccess);
    if (!failed) {
        return;
    }
    std::cerr << "CUDA error at: " << file << ":" << line << std::endl;
    std::cerr << cudaGetErrorString(err) << " " << func << std::endl;
    exit(1);
}
int writeGrayscale2Text(const std::string filename, const Grayscale *image, const int rows, const int cols);
int readText2Grayscale(const std::string filename, Grayscale **image, int *rows, int *cols);
int writeRGB2Text(const std::string filename, const Color *image, const int rows, const int cols);
int readText2RGB(const std::string filename, Color **image, int *rows, int *cols);
// Host-side timer based on clock().
// Usage: Start(); ...work...; Stop(); Elapsed().
struct CpuTimer{
    clock_t start;
    clock_t stop;

    // Samples clock() as the start point.
    void Start(){
        start = clock();
    }

    // Samples clock() as the end point.
    void Stop(){
        stop = clock();
    }

    // Returns the interval between Start() and Stop() in milliseconds.
    // The tick difference is taken in clock_t first and only then converted
    // to float; converting each sample separately would round large tick
    // counts and make short intervals collapse to zero.
    float Elapsed(){
        return (float)(stop - start) / CLOCKS_PER_SEC * 1000.0f;
    }
};
#endif
Utilities.cu
#include "Utilities.cuh"
int writeGrayscale2Text(const std::string filename, const Grayscale *image, const int rows, const int cols){
std::ofstream fileWriter(filename.c_str());
if (!fileWriter.is_open()) {
std::cerr << "** writeGrayscale2Text() ** : Unable to open file." << std::endl;
return FAILURE;
}
fileWriter << rows << "\n";
fileWriter << cols << "\n";
for (int i = 0; i < rows; i++) {
for (int j = 0; j < cols; j++) {
fileWriter << (int)image[i*cols+j] << "\n";
}
}
fileWriter.close();
return SUCCESS;
}
// Reads a grayscale image stored as text: row count, column count, then one
// pixel value per entry in row-major order.
//
// On success *image points to a new[]-allocated array of (*rows)*(*cols)
// pixels that the caller must release with delete[], and SUCCESS is returned.
// FAILURE is returned when the file cannot be opened, when the header is
// missing or has a negative dimension, or when the pixel data ends early; in
// the last case the partial buffer is freed and *image is set to 0.
int readText2Grayscale(const std::string filename, Grayscale **image, int *rows, int *cols){
    std::ifstream fileReader(filename.c_str());
    if (!fileReader.is_open()) {
        std::cerr << "** readText2Grayscale() ** : Unable to open file." << std::endl;
        return FAILURE;
    }

    // Validate the header before it is used to size the allocation.
    if (!(fileReader >> *rows >> *cols) || *rows < 0 || *cols < 0) {
        std::cerr << "** readText2Grayscale() ** : Invalid image dimensions." << std::endl;
        return FAILURE;
    }

    // Size and index in size_t so large images do not overflow int.
    const size_t width = (size_t)(*cols);
    const size_t height = (size_t)(*rows);
    *image = new Grayscale[height*width];

    int value;
    for (size_t i = 0; i < height; i++) {
        for (size_t j = 0; j < width; j++) {
            if (!(fileReader >> value)) {
                std::cerr << "** readText2Grayscale() ** : Unexpected end of pixel data." << std::endl;
                delete[] *image;
                *image = 0;
                return FAILURE;
            }
            (*image)[i*width+j] = (Grayscale)value;
        }
    }

    fileReader.close();
    return SUCCESS;
}
int writeRGB2Text(const std::string filename, const Color *image, const int rows, const int cols){
std::ofstream fileWriter(filename.c_str());
if (!fileWriter.is_open()) {
std::cerr << "** writeRGB2Text() ** : Unable to open file." << std::endl;
return FAILURE;
}
fileWriter << rows << "\n";
fileWriter << cols << "\n";
for (int k = 0; k < 3; k++) {
for (int i = 0; i < rows; i++) {
for (int j = 0; j < cols; j++) {
switch (k) {
case 0:
fileWriter << image[i*cols+j].r << "\n";
break;
case 1:
fileWriter << image[i*cols+j].g << "\n";
break;
case 2:
fileWriter << image[i*cols+j].b << "\n";
break;
}
}
}
}
fileWriter.close();
return SUCCESS;
}
int readText2RGB(const std::string filename, Color **image, int *rows, int *cols){
std::ifstream fileReader(filename.c_str());
if (!fileReader.is_open()) {
std::cerr << "** readText2Grayscale() ** : Unable to open file." << std::endl;
return FAILURE;
}
fileReader >> *rows;
fileReader >> *cols;
*image = new Color[(*rows)*(*cols)];
for (int k = 0; k < 3; k++) {
for (int i = 0; i < *rows; i++) {
for (int j = 0; j < *cols; j++) {
switch (k) {
case 0:
fileReader >> (*image)[i*(*cols)+j].r;
break;
case 1:
fileReader >> (*image)[i*(*cols)+j].g;
break;
case 2:
fileReader >> (*image)[i*(*cols)+j].b;
break;
}
}
}
}
fileReader.close();
return SUCCESS;
}

Constant memory has implicit local-scope linkage - see the answer to this on Stack Overflow.
This means that the cudaMemcpyToSymbol call has to be in the same generated .obj file as the kernel in which you want to use the symbol.
You do your memcpy in Main.cu, but the kernel that uses your constant memory is in Imageproc.cu. So the constant values are unknown to the kernel chunche.
One option to solve your problem is to implement a wrapper. Just add a function in Imageproc.cu that does the cudaMemcpyToSymbol, call that wrapper from Main.cu, and pass it the values you want stored in constant memory.

False dependency issue for the Fermi architecture

I am trying to achieve "3-way overlapping" using 3 streams as in the examples in CUDA streams and concurrency webinar. But I couldn't achieve it.
I have Geforce GT 550M (Fermi Architecture with one copy engine) and I am using Windows 7 (64 bit).
Here is the code that I have written.
#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// includes, project
#include "helper_cuda.h"
#include "helper_functions.h" // helper utility functions
#include <stdio.h>
using namespace std;
#define DATA_SIZE 6000000
#define NUM_THREADS 32
#define NUM_BLOCKS 16
#define NUM_STREAMS 3
// Element-wise square: out[i] = in[i] * in[i] for i < dataSize. Each thread
// starts at its global index and advances by the total number of launched
// threads, so any launch size covers the whole range.
__global__ void kernel(const int *in, int *out, int dataSize)
{
    const unsigned int stride = blockDim.x * gridDim.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    while (idx < dataSize)
    {
        const int value = in[idx];
        out[idx] = value * value;
        idx += stride;
    }
}
// Breadth-first stream test: squares DATA_SIZE integers on the GPU using
// NUM_STREAMS streams. All host-to-device copies are issued first, then all
// kernels (two half-size launches per stream), then all device-to-host
// copies. The elapsed GPU time and the accumulated absolute difference from a
// CPU reference are printed.
// NOTE: the per-stream split assumes DATA_SIZE is divisible by
// 2 * NUM_STREAMS; any remainder elements would not be processed.
int main()
{
    const int dataSize = DATA_SIZE;
    int *h_in = new int[dataSize];
    int *h_out = new int[dataSize];
    int *h_groundTruth = new int[dataSize];
    // Input population
    for(int i = 0; i < dataSize; i++)
        h_in[i] = 5;
    for(int i = 0; i < dataSize; i++)
        h_out[i] = 0;
    // CPU calculation for ground truth
    for(int i = 0; i < dataSize; i++)
        h_groundTruth[i] = h_in[i] * h_in[i];
    // Choose which GPU to run on, change this on a multi-GPU system.
    checkCudaErrors( cudaSetDevice(0) );
    int *d_in = 0;
    int *d_out = 0;
    int streamSize = dataSize / NUM_STREAMS;
    size_t memSize = dataSize * sizeof(int);
    size_t streamMemSize = memSize / NUM_STREAMS;
    checkCudaErrors( cudaMalloc( (void **)&d_in, memSize) );
    checkCudaErrors( cudaMalloc( (void **)&d_out, memSize) );
    // registers host memory as page-locked (required for asynch cudaMemcpyAsync)
    checkCudaErrors(cudaHostRegister(h_in, memSize, cudaHostRegisterPortable));
    checkCudaErrors(cudaHostRegister(h_out, memSize, cudaHostRegisterPortable));
    // set kernel launch config
    dim3 nThreads = dim3(NUM_THREADS,1,1);
    dim3 nBlocks = dim3(NUM_BLOCKS,1,1);
    // Each stream issues two launches, each with half of the blocks.
    dim3 subKernelBlock = dim3((int)ceil((float)nBlocks.x / 2));
    cout << "GPU Kernel Configuration : " << endl;
    cout << "Number of Streams :\t" << NUM_STREAMS << " with size: \t" << streamSize << endl;
    cout << "Number of Threads :\t" << nThreads.x << "\t" << nThreads.y << "\t" << nThreads.z << endl;
    cout << "Number of Blocks :\t" << nBlocks.x << "\t" << nBlocks.y << "\t" << nBlocks.z << endl;
    // create cuda stream
    cudaStream_t streams[NUM_STREAMS];
    for(int i = 0; i < NUM_STREAMS; i++)
        checkCudaErrors(cudaStreamCreate(&streams[i]));
    // create cuda event handles
    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));
    checkCudaErrors(cudaEventRecord(start, 0));
    // overlapped execution using version 2
    for(int i = 0; i < NUM_STREAMS; i++)
    {
        int offset = i * streamSize;
        checkCudaErrors(cudaMemcpyAsync(&d_in[offset], &h_in[offset], streamMemSize, cudaMemcpyHostToDevice, streams[i]));
    }
    //cudaMemcpy(d_in, h_in, memSize, cudaMemcpyHostToDevice);
    for(int i = 0; i < NUM_STREAMS; i++)
    {
        int offset = i * streamSize;
        //kernel<<<nBlocks, nThreads, 0, streams[i]>>>(&d_in[offset], &d_out[offset], streamSize);
        kernel<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset], &d_out[offset], streamSize/2);
        // Launch-configuration errors only surface through cudaGetLastError().
        checkCudaErrors(cudaGetLastError());
        kernel<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset + streamSize/2], &d_out[offset + streamSize/2], streamSize/2);
        checkCudaErrors(cudaGetLastError());
    }
    for(int i = 0; i < NUM_STREAMS; i++)
    {
        int offset = i * streamSize;
        checkCudaErrors(cudaMemcpyAsync(&h_out[offset], &d_out[offset], streamMemSize, cudaMemcpyDeviceToHost, streams[i]));
    }
    for(int i = 0; i < NUM_STREAMS; i++)
        checkCudaErrors(cudaStreamSynchronize(streams[i]));
    checkCudaErrors(cudaEventRecord(stop, 0));
    checkCudaErrors(cudaStreamSynchronize(0));
    checkCudaErrors(cudaDeviceSynchronize());
    float gpu_time = 0;
    checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));
    // release resources
    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));
    checkCudaErrors(cudaHostUnregister(h_in));
    checkCudaErrors(cudaHostUnregister(h_out));
    checkCudaErrors(cudaFree(d_in));
    checkCudaErrors(cudaFree(d_out));
    for(int i = 0; i < NUM_STREAMS; i++)
        checkCudaErrors(cudaStreamDestroy(streams[i]));
    checkCudaErrors(cudaDeviceReset());
    cout << "Execution Time of GPU: " << gpu_time << "ms" << endl;
    // GPU output check: accumulate absolute differences so that positive and
    // negative errors cannot cancel each other out.
    int sum = 0;
    for(int i = 0; i < dataSize; i++)
    {
        const int diff = h_groundTruth[i] - h_out[i];
        sum += (diff < 0) ? -diff : diff;
    }
    cout << "Error between CPU and GPU: " << sum << endl;
    delete[] h_in;
    delete[] h_out;
    delete[] h_groundTruth;
    return 0;
}
Using Nsight for profiling, I have this result:
It may seem correct, but why does the D2H transfer in stream #1 only start when the last kernel launch of stream #2 and not before?
I tried also to use 8 streams (just by changing NUM_STREAM to 8) to achieve such a "3-way overlap" and here is the result:
The interesting thing is that when I use 8 streams, the overlappings between computation and memory transfers seem to be much better.
What is the reason for this problem? Is it due to WDDM driver or is there something wrong with my program?

From the comments above, it seems that the OP's problem is a false dependency issue, suffered by the Fermi architecture and solved by the Hyper-Q feature of the Kepler architecture.
To summarize, the OP is highlighting the fact that the first D2H transfer (stream #1) does not start immediately after the last H2D (stream #3) finishes, while in principle it could. The time gap is highlighted by the red circle in the following figure (henceforth, unless otherwise specified, all the tests refer to a GeForce GT540M belonging to the Fermi family):
The OP's approach is a breadth-first approach, which operates according to the following scheme:
for(int i = 0; i < NUM_STREAMS; i++)
cudaMemcpyAsync(..., cudaMemcpyHostToDevice, streams[i]);
for(int i = 0; i < NUM_STREAMS; i++)
{
kernel_launch_1<<<..., 0, streams[i]>>>(...);
kernel_launch_2<<<..., 0, streams[i]>>>(...);
}
for(int i = 0; i < NUM_STREAMS; i++)
cudaMemcpyAsync(..., cudaMemcpyDeviceToHost, streams[i]);
Using a depth-first approach, operating according to the following scheme
for(int i = 0; i < NUM_STREAMS; i++)
{
cudaMemcpyAsync(...., cudaMemcpyHostToDevice, streams[i]);
kernel_launch_1<<<...., 0, streams[i]>>>(....);
kernel_launch_2<<<...., 0, streams[i]>>>(....);
cudaMemcpyAsync(...., cudaMemcpyDeviceToHost, streams[i]);
}
does not seem to improve the situation, according to the following timeline (the depth-first code is reported at the bottom of the answer), but it seems to show a worse overlapping:
Under the breadth-first approach, and commenting the second kernel launch, the first D2H copy starts immediately as it can, as reported by the following timeline:
Finally, running the code on a Kepler K20c, the problem does not show up, as illustrated by the following figure:
Here is the code for the depth-first approach:
#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// includes, project
#include "helper_cuda.h"
#include "helper_functions.h" // helper utility functions
#include <stdio.h>
using namespace std;
#define DATA_SIZE 6000000
#define NUM_THREADS 32
#define NUM_BLOCKS 16
#define NUM_STREAMS 3
// Element-wise square: out[i] = in[i] * in[i] for i < dataSize. Each thread
// starts at its global index and advances by the total number of launched
// threads, so any launch size covers the whole range.
__global__ void kernel(const int *in, int *out, int dataSize)
{
    const unsigned int stride = blockDim.x * gridDim.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    while (idx < dataSize)
    {
        const int value = in[idx];
        out[idx] = value * value;
        idx += stride;
    }
}
// Depth-first stream test: squares DATA_SIZE integers on the GPU using
// NUM_STREAMS streams. For each stream in turn it issues the host-to-device
// copy, two half-size kernel launches and the device-to-host copy before
// moving on to the next stream. The elapsed GPU time and the accumulated
// absolute difference from a CPU reference are printed.
// NOTE: the per-stream split assumes DATA_SIZE is divisible by
// 2 * NUM_STREAMS; any remainder elements would not be processed.
int main()
{
    const int dataSize = DATA_SIZE;
    int *h_in = new int[dataSize];
    int *h_out = new int[dataSize];
    int *h_groundTruth = new int[dataSize];
    // Input population
    for(int i = 0; i < dataSize; i++)
        h_in[i] = 5;
    for(int i = 0; i < dataSize; i++)
        h_out[i] = 0;
    // CPU calculation for ground truth
    for(int i = 0; i < dataSize; i++)
        h_groundTruth[i] = h_in[i] * h_in[i];
    // Choose which GPU to run on, change this on a multi-GPU system.
    checkCudaErrors( cudaSetDevice(0) );
    int *d_in = 0;
    int *d_out = 0;
    int streamSize = dataSize / NUM_STREAMS;
    size_t memSize = dataSize * sizeof(int);
    size_t streamMemSize = memSize / NUM_STREAMS;
    checkCudaErrors( cudaMalloc( (void **)&d_in, memSize) );
    checkCudaErrors( cudaMalloc( (void **)&d_out, memSize) );
    // registers host memory as page-locked (required for asynch cudaMemcpyAsync)
    checkCudaErrors(cudaHostRegister(h_in, memSize, cudaHostRegisterPortable));
    checkCudaErrors(cudaHostRegister(h_out, memSize, cudaHostRegisterPortable));
    // set kernel launch config
    dim3 nThreads = dim3(NUM_THREADS,1,1);
    dim3 nBlocks = dim3(NUM_BLOCKS,1,1);
    // Each stream issues two launches, each with half of the blocks.
    dim3 subKernelBlock = dim3((int)ceil((float)nBlocks.x / 2));
    cout << "GPU Kernel Configuration : " << endl;
    cout << "Number of Streams :\t" << NUM_STREAMS << " with size: \t" << streamSize << endl;
    cout << "Number of Threads :\t" << nThreads.x << "\t" << nThreads.y << "\t" << nThreads.z << endl;
    cout << "Number of Blocks :\t" << nBlocks.x << "\t" << nBlocks.y << "\t" << nBlocks.z << endl;
    // create cuda stream
    cudaStream_t streams[NUM_STREAMS];
    for(int i = 0; i < NUM_STREAMS; i++)
        checkCudaErrors(cudaStreamCreate(&streams[i]));
    // create cuda event handles
    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));
    checkCudaErrors(cudaEventRecord(start, 0));
    for(int i = 0; i < NUM_STREAMS; i++)
    {
        int offset = i * streamSize;
        checkCudaErrors(cudaMemcpyAsync(&d_in[offset], &h_in[offset], streamMemSize, cudaMemcpyHostToDevice, streams[i]));
        kernel<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset], &d_out[offset], streamSize/2);
        // Launch-configuration errors only surface through cudaGetLastError().
        checkCudaErrors(cudaGetLastError());
        kernel<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset + streamSize/2], &d_out[offset + streamSize/2], streamSize/2);
        checkCudaErrors(cudaGetLastError());
        checkCudaErrors(cudaMemcpyAsync(&h_out[offset], &d_out[offset], streamMemSize, cudaMemcpyDeviceToHost, streams[i]));
    }
    for(int i = 0; i < NUM_STREAMS; i++)
        checkCudaErrors(cudaStreamSynchronize(streams[i]));
    checkCudaErrors(cudaEventRecord(stop, 0));
    checkCudaErrors(cudaStreamSynchronize(0));
    checkCudaErrors(cudaDeviceSynchronize());
    float gpu_time = 0;
    checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));
    // release resources
    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));
    checkCudaErrors(cudaHostUnregister(h_in));
    checkCudaErrors(cudaHostUnregister(h_out));
    checkCudaErrors(cudaFree(d_in));
    checkCudaErrors(cudaFree(d_out));
    for(int i = 0; i < NUM_STREAMS; i++)
        checkCudaErrors(cudaStreamDestroy(streams[i]));
    checkCudaErrors(cudaDeviceReset());
    cout << "Execution Time of GPU: " << gpu_time << "ms" << endl;
    // GPU output check: accumulate absolute differences so that positive and
    // negative errors cannot cancel each other out.
    int sum = 0;
    for(int i = 0; i < dataSize; i++)
    {
        const int diff = h_groundTruth[i] - h_out[i];
        sum += (diff < 0) ? -diff : diff;
    }
    cout << "Error between CPU and GPU: " << sum << endl;
    delete[] h_in;
    delete[] h_out;
    delete[] h_groundTruth;
    return 0;
}

reduction example using cuda and CUB

I'm trying to get my head around CUB, and having a bit of trouble following the (rather incomplete) worked examples. CUB looks like it is a fantastic tool, I just can't make sense of the example code.
I've built a simple proto-warp reduce example:
#include <cub/cub.cuh>
#include <cuda.h>
#include <vector>
using std::vector;
#include <iostream>
using std::cout;
using std::endl;
const int N = 128;
// Warp-level sum using cub::WarpReduce: every thread with id < 128 feeds
// indata[id] into the reduction and stores the value returned to it in
// outdata[id].
// NOTE(review): the meaning of the second template argument (4) depends on
// the CUB version in use -- confirm against the installed headers.
__global__ void sum(float *indata, float *outdata) {
typedef cub::WarpReduce<float,4> WarpReduce;
// One TempStorage instance in shared memory, used by every thread of the block.
__shared__ typename WarpReduce::TempStorage temp_storage;
int id = blockIdx.x*blockDim.x+threadIdx.x;
// Bound is hard-coded; it matches the value of N declared above the kernel.
if( id < 128 ) {
outdata[id] = WarpReduce(temp_storage).Sum(indata[id]);
}
}
// Host driver as posted in the question: fills y with 0..N-1, prints it,
// runs sum<<<1,32>>> and prints the N values of sol.
int main() {
vector<float> y(N), sol(N);
float *dev_y, *dev_sol;
cudaMalloc((void**)&dev_y,N*sizeof(float));
cudaMalloc((void**)&dev_sol,N*sizeof(float));
for( int i = 0; i < N; i++ ) {
y[i] = (float)i;
}
cout << "input: ";
for( int i = 0; i < N; i++ ) cout << y[i] << " ";
cout << endl;
// NOTE(review): cudaMemcpy takes (dst, src, count, kind). Here the host
// buffer is passed as dst and the device buffer as src although the kind is
// HostToDevice, so the arguments look reversed. The return value is not
// checked, so a failure would go unnoticed.
cudaMemcpy(&y[0],dev_y,N*sizeof(float),cudaMemcpyHostToDevice);
// One block of 32 threads: only ids 0..31 execute the kernel body.
sum<<<1,32>>>(dev_y,dev_sol);
// NOTE(review): same pattern -- device buffer as dst, host buffer as src,
// with kind DeviceToHost; sol is therefore presumably never filled.
cudaMemcpy(dev_sol,&sol[0],N*sizeof(float),cudaMemcpyDeviceToHost);
cout << "output: ";
for( int i = 0; i < N; i++ ) cout << sol[i] << " ";
cout << endl;
cudaFree(dev_y);
cudaFree(dev_sol);
return 0;
}
which returns all zeros.
I'm aware that this code would return a reduction that was banded with every 32nd element being the sum of a warp and the other elements being undefined - I just want to get a feel for how CUB works. Can someone point out what I'm doing wrong?
(also, does CUB deserve its own tag yet?)

Your cudaMemcpy arguments are back to front, the destination comes first (to be consistent with memcpy).
cudaError_t cudaMemcpy ( void* dst, const void* src, size_t count, cudaMemcpyKind kind )
See the API reference for more info.

CUDA writing to constant memory wrong value

I have the following code to copy from a host variable to a __constant__ variable in CUDA
// Colour-quantisation driver.
//
// Usage: <program> <input> <output> <nColors>
//   1. Reads an RGB image from the text file <input>.
//   2. STEP 1 (GPU): assigns every pixel an initial group.
//   3. STEP 2 (CPU) / STEP 3 (GPU): repeatedly averages each group into a
//      representative colour and re-assigns pixels to the nearest
//      representative until no pixel changes group.
//   4. STEP 4 (GPU): builds the quantised image and writes it to <output>.
//
// Returns 1 on a usage error, FAILURE on read/validation/write errors and
// SUCCESS otherwise (exit codes kept as in the original program).
//
// NOTE(review): c_rows, c_cols, c_numColors and d_change are defined in
// Imageproc.cuh. If the kernels are compiled in a different translation unit
// than this function, the cudaMemcpyToSymbol calls below may update a
// different copy of those symbols than the kernels read - confirm the build
// setup.
int main(int argc, char **argv){
    int exit_code;
    if (argc < 4) {
        std::cout << "Usage: \n " << argv[0] << " <input> <output> <nColors>" << std::endl;
        return 1;
    }
    // The CPU timer must exist before it times the file read below.
    CpuTimer timer1;
    Color *h_input = 0;
    int h_rows, h_cols;
    timer1.Start();
    exit_code = readText2RGB(argv[1], &h_input, &h_rows, &h_cols);
    timer1.Stop();
    std::cout << "Reading: " << timer1.Elapsed() << std::endl;
    if (exit_code != SUCCESS){
        std::cout << "Error trying to read file." << std::endl;
        return FAILURE;
    }
    int h_numColors = atoi(argv[3]);
    // A non-positive size would give an empty grid or a division by zero in
    // the kernels, so reject it up front.
    if (h_rows <= 0 || h_cols <= 0 || h_numColors <= 0) {
        std::cout << "Error: image dimensions and <nColors> must be positive." << std::endl;
        delete[] h_input;
        return FAILURE;
    }
    GpuTimer timer2;
    float timeStep2 = 0, timeStep3 = 0;
    int h_change = 0;
    // Pixel count as size_t so byte sizes cannot overflow int on large images.
    const size_t numPixels = (size_t)h_rows * (size_t)h_cols;
    int *h_pixelGroup = new int[numPixels];
    Color *h_groupRep = new Color[h_numColors];
    Color *h_output = new Color[numPixels];
    Color *d_input;
    int *d_pixelGroup;
    Color *d_groupRep;
    Color *d_output;
    dim3 block(B_WIDTH, B_HEIGHT);
    dim3 grid((h_cols+B_WIDTH-1)/B_WIDTH, (h_rows+B_HEIGHT-1)/B_HEIGHT);
    checkCudaError(cudaMalloc((void**)&d_input, sizeof(Color)*numPixels));
    checkCudaError(cudaMalloc((void**)&d_pixelGroup, sizeof(int)*numPixels));
    checkCudaError(cudaMalloc((void**)&d_groupRep, sizeof(Color)*h_numColors));
    checkCudaError(cudaMalloc((void**)&d_output, sizeof(Color)*numPixels));
    // STEP 1
    //Evenly distribute all pixels of the image onto the color set
    timer2.Start();
    checkCudaError(cudaMemcpyToSymbol(c_rows, &h_rows, sizeof(int)));
    checkCudaError(cudaMemcpyToSymbol(c_cols, &h_cols, sizeof(int)));
    checkCudaError(cudaMemcpyToSymbol(c_numColors, &h_numColors, sizeof(int)));
    checkCudaError(cudaMemcpy(d_input, h_input, sizeof(Color)*numPixels, cudaMemcpyHostToDevice));
    clut_distributePixels<<<grid, block>>>(d_pixelGroup);
    // Launches return no status; query it explicitly.
    checkCudaError(cudaGetLastError());
    checkCudaError(cudaMemcpy(h_pixelGroup, d_pixelGroup, sizeof(int)*numPixels, cudaMemcpyDeviceToHost));
    timer2.Stop();
    std::cout << "Phase 1: " << timer2.Elapsed() << std::endl;
    // Debug sample of the initial assignment; skipped on images too small to
    // contain index 8.
    if (numPixels > 8) {
        std::cout << h_pixelGroup[0] << ","
                  << h_pixelGroup[3] << ","
                  << h_pixelGroup[4] << ","
                  << h_pixelGroup[7] << ","
                  << h_pixelGroup[8] << std::endl;
    }
    //Do the STEP 2 and STEP 3 as long as there is at least one change of representative in a group
    do {
        // STEP 2
        //Set the representative value to the average colour of all pixels in the same set
        timer1.Start();
        for (int ng = 0; ng < h_numColors; ng++) {
            // 64-bit sums: a large group of bright pixels can exceed INT_MAX.
            long long r = 0, g = 0, b = 0;
            long long elem = 0;
            for (size_t p = 0; p < numPixels; p++) {
                if (h_pixelGroup[p] == ng) {
                    r += h_input[p].r;
                    g += h_input[p].g;
                    b += h_input[p].b;
                    elem++;
                }
            }
            if (elem == 0) {
                // Empty group: fall back to white.
                h_groupRep[ng].r = 255;
                h_groupRep[ng].g = 255;
                h_groupRep[ng].b = 255;
            }else{
                h_groupRep[ng].r = (int)(r/elem);
                h_groupRep[ng].g = (int)(g/elem);
                h_groupRep[ng].b = (int)(b/elem);
            }
        }
        timer1.Stop();
        timeStep2 += timer1.Elapsed();
        // STEP 3
        //For each pixel in the image, compute Euclidean's distance to each representative
        //and assign it to the set which is closest
        h_change = 0;
        timer2.Start();
        checkCudaError(cudaMemcpyToSymbol(d_change, &h_change, sizeof(int)));
        checkCudaError(cudaMemcpy(d_groupRep, h_groupRep, sizeof(Color)*h_numColors, cudaMemcpyHostToDevice));
        clut_checkDistances<<<grid, block>>>(d_input, d_pixelGroup, d_groupRep);
        checkCudaError(cudaGetLastError());
        checkCudaError(cudaMemcpy(h_pixelGroup, d_pixelGroup, sizeof(int)*numPixels, cudaMemcpyDeviceToHost));
        checkCudaError(cudaMemcpyFromSymbol(&h_change, d_change, sizeof(int)));
        timer2.Stop();
        timeStep3 += timer2.Elapsed();
        std::cout << "Chunche" << std::endl;
    } while (h_change == 1);
    std::cout << "Phase 2: " << timeStep2 << std::endl;
    std::cout << "Phase 3: " << timeStep3 << std::endl;
    // STEP 4
    //Create the new image with the resulting color lookup table
    timer2.Start();
    clut_createImage<<<grid, block>>>(d_output, d_pixelGroup, d_groupRep);
    checkCudaError(cudaGetLastError());
    checkCudaError(cudaMemcpy(h_output, d_output, sizeof(Color)*numPixels, cudaMemcpyDeviceToHost));
    timer2.Stop();
    std::cout << "Phase 4: " << timer2.Elapsed() << std::endl;
    checkCudaError(cudaFree(d_input));
    checkCudaError(cudaFree(d_pixelGroup));
    checkCudaError(cudaFree(d_groupRep));
    checkCudaError(cudaFree(d_output));
    timer1.Start();
    // Write the quantised image (h_output), not the unmodified input.
    exit_code = writeRGB2Text(argv[2], h_output, h_rows, h_cols);
    timer1.Stop();
    std::cout << "Writing: " << timer1.Elapsed() << std::endl;
    delete[] h_input;
    delete[] h_pixelGroup;
    delete[] h_groupRep;
    delete[] h_output;
    if (exit_code != SUCCESS) {
        std::cout << "Error trying to write file." << std::endl;
        return FAILURE;
    }
    return SUCCESS;
}
when I print from within the kernel I get zeros for the three values
// Assigns an initial group index to every pixel.
//
// pixelGroup - device buffer with c_rows * c_cols ints; element (row, col)
//              receives (row * c_cols + col) / c_numColors.
//
// Works with any 2D launch: each thread starts at its own (row, col) and
// advances by the full grid extent in each dimension. The thread at (0, 0)
// also prints the three constants for debugging.
__global__
void clut_distributePixels(int *pixelGroup){
    const int firstRow = blockDim.y * blockIdx.y + threadIdx.y;
    const int firstCol = blockDim.x * blockIdx.x + threadIdx.x;
    if(firstRow == 0 && firstCol == 0){
        printf("a: %d\n", c_rows);
        printf("b: %d\n", c_cols);
        printf("c: %d\n", c_numColors);
    }
    for (int row = firstRow; row < c_rows; row += gridDim.y * blockDim.y) {
        for (int col = firstCol; col < c_cols; col += gridDim.x * blockDim.x) {
            const int flat = row * c_cols + col;
            pixelGroup[flat] = flat / c_numColors;
        }
    }
}
Either I am not copying correctly to constant memory or ... I don't know what could be wrong. Any advice?
I posted the entire host code because something else is probably interfering with the constant copies.
UPDATE
Main.cu
#include "Imageproc.cuh"
// Minimal reproduction: copies h_rows into c_rows, runs `chunche` (which
// stores c_rows + 1 in d_change) and prints the value read back.
//
// NOTE(review): c_rows and d_change are defined in Imageproc.cuh while
// `chunche` is compiled from Imageproc.cu; the symbol copies issued here may
// not reach the instance the kernel reads - confirm the build setup.
int main(){
    int h_change = 0;
    int h_rows = 512;
    checkCudaError(cudaMemcpyToSymbol(c_rows, &h_rows, sizeof(int)));
    chunche<<<1,1>>>();
    // Kernel launches return no status; query it explicitly.
    checkCudaError(cudaGetLastError());
    checkCudaError(cudaMemcpyFromSymbol(&h_change, d_change, sizeof(int)));
    std::cout << "H = " << h_change << std::endl;
    return 0;
}
Imageproc.cuh
// Declarations shared by the host driver and the colour-quantisation kernels.
#ifndef _IMAGEPROC_CUH_
#define _IMAGEPROC_CUH_
#include "Utilities.cuh"
// Thread-block dimensions used by the host code when launching the kernels.
#define B_WIDTH 16
#define B_HEIGHT 16
// Image dimensions and palette size, read by the kernels and written from the
// host with cudaMemcpyToSymbol.
// NOTE(review): these are definitions, not extern declarations, so each .cu
// file that includes this header appears to get its own copy of the symbols -
// confirm that the file issuing the symbol copies is the one containing the
// kernels.
__constant__ int c_rows;
__constant__ int c_cols;
__constant__ int c_numColors;
// Flag written by kernels and read back on the host with
// cudaMemcpyFromSymbol.
__device__ int d_change;
#ifdef __cplusplus
extern "C"
{
#endif
// Debug kernel: stores c_rows + 1 in d_change.
__global__
void chunche();
// Writes an initial group index for every pixel into pixelGroup.
__global__
void clut_distributePixels(int *pixelGroup);
// Re-assigns each pixel to the closest representative colour in groupRep and
// sets d_change to 1 when any assignment changes.
__global__
void clut_checkDistances(Color *input, int *pixelGroup, Color *groupRep);
// Fills clutImage with the representative colour of each pixel's group.
__global__
void clut_createImage(Color *clutImage, int *pixelGroup, Color *groupRep);
#ifdef __cplusplus
}
#endif
#endif
Imageproc.cu
#include "Imageproc.cuh"
// Debug kernel: publishes c_rows + 1 through d_change so the host can check
// which value of c_rows the device actually sees.
__global__
void chunche(){
    const int rowsSeenOnDevice = c_rows;
    d_change = rowsSeenOnDevice + 1;
}
// Assigns an initial group index to every pixel.
//
// pixelGroup - device buffer with c_rows * c_cols ints; element (row, col)
//              receives (row * c_cols + col) / c_numColors.
//
// Works with any 2D launch: each thread starts at its own (row, col) and
// advances by the full grid extent in each dimension.
__global__
void clut_distributePixels(int *pixelGroup){
    const int firstRow = blockDim.y * blockIdx.y + threadIdx.y;
    const int firstCol = blockDim.x * blockIdx.x + threadIdx.x;
    for (int row = firstRow; row < c_rows; row += gridDim.y * blockDim.y) {
        for (int col = firstCol; col < c_cols; col += gridDim.x * blockDim.x) {
            const int flat = row * c_cols + col;
            pixelGroup[flat] = flat / c_numColors;
        }
    }
}
// Squared Euclidean distance between a representative colour and a pixel,
// computed in int arithmetic over the r, g and b channels.
static __device__ __forceinline__
int clut_squaredDistance(const Color &rep, const Color &px){
    const int dr = rep.r - px.r;
    const int dg = rep.g - px.g;
    const int db = rep.b - px.b;
    return dr*dr + dg*dg + db*db;
}

// Re-assigns every pixel to the representative colour closest to it.
//
// input      - device image, c_rows * c_cols pixels, row-major.
// pixelGroup - current group of each pixel; updated in place.
// groupRep   - c_numColors representative colours.
//
// Ties keep the lowest group index because a candidate only wins when it is
// strictly closer. d_change is set to 1 whenever a pixel changes group; many
// threads may store the same value.
__global__
void clut_checkDistances(Color *input, int *pixelGroup, Color *groupRep){
    const int firstRow = blockDim.y * blockIdx.y + threadIdx.y;
    const int firstCol = blockDim.x * blockIdx.x + threadIdx.x;
    for (int row = firstRow; row < c_rows; row += gridDim.y * blockDim.y) {
        for (int col = firstCol; col < c_cols; col += gridDim.x * blockDim.x) {
            const int idx = row * c_cols + col;
            int closest = 0;
            for (int candidate = 1; candidate < c_numColors; candidate++) {
                // Switch groups only when the candidate is strictly closer
                // than the best group found so far.
                if (clut_squaredDistance(groupRep[candidate], input[idx]) <
                    clut_squaredDistance(groupRep[closest], input[idx])) {
                    closest = candidate;
                }
            }
            if (pixelGroup[idx] != closest) {
                pixelGroup[idx] = closest;
                d_change = 1;
            }
        }
    }
}
// Builds the quantised image: every pixel takes the representative colour of
// the group it belongs to.
//
// clutImage  - output image, c_rows * c_cols pixels, row-major.
// pixelGroup - group index of each pixel.
// groupRep   - representative colour of each group.
__global__
void clut_createImage(Color *clutImage, int *pixelGroup, Color *groupRep){
    const int firstRow = blockDim.y * blockIdx.y + threadIdx.y;
    const int firstCol = blockDim.x * blockIdx.x + threadIdx.x;
    for (int row = firstRow; row < c_rows; row += gridDim.y * blockDim.y) {
        for (int col = firstCol; col < c_cols; col += gridDim.x * blockDim.x) {
            const int idx = row * c_cols + col;
            // Color holds only r, g and b, so a whole-struct copy matches the
            // three per-channel stores.
            clutImage[idx] = groupRep[pixelGroup[idx]];
        }
    }
}
Utilities.cuh
#ifndef _UTILITIES_CUH_
#define _UTILITIES_CUH_
#include <cstddef>
#include <cstdlib>
#include <ctime>
#include <fstream>
#include <iostream>
#include <string>
#define SUCCESS 1
#define FAILURE 0
#define checkCudaError(val) check( (val), #val, __FILE__, __LINE__)
// Three-channel colour value. Despite the "u" in the name, each channel is
// stored as a signed int.
typedef struct {
int r;
int g;
int b;
} vec3u;
// RGB pixel type used by the host code and the kernels.
typedef vec3u Color;
typedef unsigned char uchar;
// Single-channel pixel type used by the grayscale text reader/writer.
typedef uchar Grayscale;
// Timer based on a pair of CUDA events recorded on stream 0.
//
// Usage: Start(); ...GPU work...; Stop(); then Elapsed() returns the time
// between the two events as reported by cudaEventElapsedTime (milliseconds).
//
// NOTE(review): the struct owns its events but is still copyable; copying an
// instance would destroy the same events twice. Keep instances unique.
struct GpuTimer{
    cudaEvent_t start;
    cudaEvent_t stop;
    GpuTimer(){
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
    }
    ~GpuTimer(){
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
    }
    // Records the start event on stream 0.
    void Start(){
        cudaEventRecord(start, 0);
    }
    // Records the stop event on stream 0.
    void Stop(){
        cudaEventRecord(stop, 0);
    }
    // Blocks until the stop event has completed and returns the elapsed time.
    // Returns 0 if either event call fails, instead of an uninitialised value.
    float Elapsed(){
        float elapsed = 0.0f;
        if (cudaEventSynchronize(stop) != cudaSuccess) {
            return 0.0f;
        }
        if (cudaEventElapsedTime(&elapsed, start, stop) != cudaSuccess) {
            return 0.0f;
        }
        return elapsed;
    }
};
// Aborts the program when a CUDA call did not succeed.
//
// err  - status returned by the call.
// func - source text of the call (supplied by the checkCudaError macro).
// file - source file of the call site.
// line - line number of the call site.
//
// On failure the location and the CUDA error string are written to std::cerr
// and the process exits with status 1; on success nothing happens.
template<typename T>
void check(T err, const char* const func, const char* const file, const int line) {
    if (err == cudaSuccess) {
        return;
    }
    std::cerr << "CUDA error at: " << file << ":" << line << std::endl;
    std::cerr << cudaGetErrorString(err) << " " << func << std::endl;
    exit(1);
}
int writeGrayscale2Text(const std::string filename, const Grayscale *image, const int rows, const int cols);
int readText2Grayscale(const std::string filename, Grayscale **image, int *rows, int *cols);
int writeRGB2Text(const std::string filename, const Color *image, const int rows, const int cols);
int readText2RGB(const std::string filename, Color **image, int *rows, int *cols);
// Processor-time stopwatch built on clock().
//
// Usage: Start(); ...host work...; Stop(); then Elapsed() returns the time
// between the two samples in milliseconds.
struct CpuTimer{
    clock_t start;
    clock_t stop;
    // Zero both samples so Elapsed() is well defined before the first
    // Start()/Stop() pair.
    CpuTimer() : start(0), stop(0) {}
    void Start(){
        start = clock();
    }
    void Stop(){
        stop = clock();
    }
    // Milliseconds between Start() and Stop().
    // The tick difference is taken before any floating-point conversion:
    // converting the raw tick counts to float first would round them once
    // they exceed float's 24-bit mantissa and distort the interval.
    float Elapsed(){
        const double ticks = static_cast<double>(stop - start);
        return static_cast<float>(ticks * 1000.0 / CLOCKS_PER_SEC);
    }
};
#endif
Utilities.cu
#include "Utilities.cuh"
// Writes a grayscale image as text: the row count, the column count, then one
// pixel value per line in row-major order.
//
// filename - path of the file to create or overwrite.
// image    - rows * cols pixels, row-major.
// Returns SUCCESS, or FAILURE if the file cannot be opened.
int writeGrayscale2Text(const std::string filename, const Grayscale *image, const int rows, const int cols){
    std::ofstream out(filename.c_str());
    if (out.is_open()) {
        out << rows << "\n" << cols << "\n";
        for (int row = 0; row < rows; ++row) {
            for (int col = 0; col < cols; ++col) {
                // Cast so the pixel is written as a number, not a character.
                const int pixel = (int)image[row*cols+col];
                out << pixel << "\n";
            }
        }
        out.close();
        return SUCCESS;
    }
    std::cerr << "** writeGrayscale2Text() ** : Unable to open file." << std::endl;
    return FAILURE;
}
// Reads a grayscale image written by writeGrayscale2Text: the row count, the
// column count, then rows * cols pixel values in row-major order.
//
// filename - path of the text file to read.
// image    - receives a new[]-allocated buffer of rows * cols pixels on
//            success (caller releases it with delete[]); left untouched on
//            failure.
// rows     - receives the row count from the header.
// cols     - receives the column count from the header.
// Returns SUCCESS, or FAILURE if the file cannot be opened, the header is
// missing or negative, or the pixel data ends early.
int readText2Grayscale(const std::string filename, Grayscale **image, int *rows, int *cols){
    std::ifstream fileReader(filename.c_str());
    if (!fileReader.is_open()) {
        std::cerr << "** readText2Grayscale() ** : Unable to open file." << std::endl;
        return FAILURE;
    }
    // Validate the header before it is used as an allocation size.
    if (!(fileReader >> *rows >> *cols) || *rows < 0 || *cols < 0) {
        std::cerr << "** readText2Grayscale() ** : Invalid image header." << std::endl;
        return FAILURE;
    }
    const std::size_t count = static_cast<std::size_t>(*rows) * static_cast<std::size_t>(*cols);
    Grayscale *pixels = new Grayscale[count];
    int value;
    for (std::size_t p = 0; p < count; p++) {
        // Stop on a truncated or malformed file instead of storing stale
        // values.
        if (!(fileReader >> value)) {
            std::cerr << "** readText2Grayscale() ** : Unexpected end of pixel data." << std::endl;
            delete[] pixels;
            return FAILURE;
        }
        pixels[p] = (Grayscale)value;
    }
    fileReader.close();
    *image = pixels;
    return SUCCESS;
}
// Writes an RGB image as text: the row count, the column count, then the
// complete red plane, the complete green plane and the complete blue plane,
// one value per line, each plane in row-major order.
//
// filename - path of the file to create or overwrite.
// image    - rows * cols pixels, row-major.
// Returns SUCCESS, or FAILURE if the file cannot be opened.
int writeRGB2Text(const std::string filename, const Color *image, const int rows, const int cols){
    std::ofstream out(filename.c_str());
    if (!out.is_open()) {
        std::cerr << "** writeRGB2Text() ** : Unable to open file." << std::endl;
        return FAILURE;
    }
    out << rows << "\n" << cols << "\n";
    // Channel selectors in output order: r, then g, then b.
    int Color::* const channels[3] = { &Color::r, &Color::g, &Color::b };
    for (int plane = 0; plane < 3; ++plane) {
        int Color::* const channel = channels[plane];
        for (int row = 0; row < rows; ++row) {
            for (int col = 0; col < cols; ++col) {
                out << image[row*cols+col].*channel << "\n";
            }
        }
    }
    out.close();
    return SUCCESS;
}
// Reads an RGB image written by writeRGB2Text: the row count, the column
// count, then the complete red, green and blue planes, each holding
// rows * cols values in row-major order.
//
// filename - path of the text file to read.
// image    - receives a new[]-allocated buffer of rows * cols pixels on
//            success (caller releases it with delete[]); left untouched on
//            failure.
// rows     - receives the row count from the header.
// cols     - receives the column count from the header.
// Returns SUCCESS, or FAILURE if the file cannot be opened, the header is
// missing or negative, or the channel data ends early.
int readText2RGB(const std::string filename, Color **image, int *rows, int *cols){
    std::ifstream fileReader(filename.c_str());
    if (!fileReader.is_open()) {
        // Report under this function's own name.
        std::cerr << "** readText2RGB() ** : Unable to open file." << std::endl;
        return FAILURE;
    }
    // Validate the header before it is used as an allocation size.
    if (!(fileReader >> *rows >> *cols) || *rows < 0 || *cols < 0) {
        std::cerr << "** readText2RGB() ** : Invalid image header." << std::endl;
        return FAILURE;
    }
    const std::size_t count = static_cast<std::size_t>(*rows) * static_cast<std::size_t>(*cols);
    Color *pixels = new Color[count];
    for (int k = 0; k < 3; k++) {
        for (std::size_t p = 0; p < count; p++) {
            // Pick the channel of pixel p that belongs to plane k.
            int &channel = (k == 0) ? pixels[p].r
                         : (k == 1) ? pixels[p].g
                                    : pixels[p].b;
            // Stop on a truncated or malformed file instead of leaving
            // channels unset.
            if (!(fileReader >> channel)) {
                std::cerr << "** readText2RGB() ** : Unexpected end of pixel data." << std::endl;
                delete[] pixels;
                return FAILURE;
            }
        }
    }
    fileReader.close();
    *image = pixels;
    return SUCCESS;
}

Constant memory has implicit local scope linkage - answer to this on stack overflow.
This means that the cudaMemcpyToSymbol call has to be in the same generated .obj file as the kernel that uses the symbol.
You do your memcpy in Main.cu, but the kernel that uses the constant memory is in Imageproc.cu, so the constant values are unknown to the kernel chunche.
One option to solve your problem is to implement a wrapper: add a function in Imageproc.cu that performs the cudaMemcpyToSymbol, call that wrapper from Main.cu, and pass it the values you want stored in constant memory.

We Keep Coding

html mysql json google-apps-script actionscript-3 ms-access google-chrome google-maps reporting-services sql-server-2008

How to cope with "cudaErrorMissingConfiguration" from "cudaMallocPitch" function of CUDA? - cuda

Related

Error: 2.5e-1 cannot be used as a function

Copying an array from host to constant memory [duplicate]

False dependency issue for the Fermi architecture

reduction example using cuda and CUB

CUDA writing to constant memory wrong value

Categories

Resources