CUDA writing to constant memory wrong value

I have the following code to copy from a host variable to a __constant__ variable in CUDA
int main(int argc, char **argv){
int exit_code;
if (argc < 4) {
std::cout << "Usage: \n " << argv[0] << " <input> <output> <nColors>" << std::endl;
return 1;
}
Color *h_input;
int h_rows, h_cols;
CpuTimer timer1;
GpuTimer timer2;
float timeStep2 = 0, timeStep3 = 0;
timer1.Start();
exit_code = readText2RGB(argv[1], &h_input, &h_rows, &h_cols);
timer1.Stop();
std::cout << "Reading: " << timer1.Elapsed() << std::endl;
if (exit_code != SUCCESS){
std::cout << "Error trying to read file." << std::endl;
return FAILURE;
}
int h_numColors = atoi(argv[3]);
int h_change = 0;
int *h_pixelGroup = new int[h_rows*h_cols];
Color *h_groupRep = new Color[h_numColors];
Color *h_output = new Color[h_rows*h_cols];
Color *d_input;
int *d_pixelGroup;
Color *d_groupRep;
Color *d_output;
dim3 block(B_WIDTH, B_HEIGHT);
dim3 grid((h_cols+B_WIDTH-1)/B_WIDTH, (h_rows+B_HEIGHT-1)/B_HEIGHT);
checkCudaError(cudaMalloc((void**)&d_input, sizeof(Color)*h_rows*h_cols));
checkCudaError(cudaMalloc((void**)&d_pixelGroup, sizeof(int)*h_rows*h_cols));
checkCudaError(cudaMalloc((void**)&d_groupRep, sizeof(Color)*h_numColors));
checkCudaError(cudaMalloc((void**)&d_output, sizeof(Color)*h_rows*h_cols));
// STEP 1
//Evenly distribute all pixels of the image onto the color set
timer2.Start();
checkCudaError(cudaMemcpyToSymbol(c_rows, &h_rows, sizeof(int)));
checkCudaError(cudaMemcpyToSymbol(c_cols, &h_cols, sizeof(int)));
checkCudaError(cudaMemcpyToSymbol(c_numColors, &h_numColors, sizeof(int)));
checkCudaError(cudaMemcpy(d_input, h_input, sizeof(Color)*h_rows*h_cols, cudaMemcpyHostToDevice));
clut_distributePixels<<<grid, block>>>(d_pixelGroup);
checkCudaError(cudaMemcpy(h_pixelGroup, d_pixelGroup, sizeof(int)*h_rows*h_cols, cudaMemcpyDeviceToHost));
timer2.Stop();
std::cout << "Phase 1: " << timer2.Elapsed() << std::endl;
std::cout << h_pixelGroup[0] << ","
<< h_pixelGroup[3] << ","
<< h_pixelGroup[4] << ","
<< h_pixelGroup[7] << ","
<< h_pixelGroup[8] << std::endl;
//Do the STEP 2 and STEP 3 as long as there is at least one change of representative in a group
do {
// STEP 2
//Set the representative value to the average colour of all pixels in the same set
timer1.Start();
for (int ng = 0; ng < h_numColors; ng++) {
int r = 0, g = 0, b = 0;
int elem = 0;
for (int i = 0; i < h_rows; i++) {
for (int j = 0; j < h_cols; j++) {
if (h_pixelGroup[i*h_cols+j] == ng) {
r += h_input[i*h_cols+j].r;
g += h_input[i*h_cols+j].g;
b += h_input[i*h_cols+j].b;
elem++;
}
}
}
if (elem == 0) {
h_groupRep[ng].r = 255;
h_groupRep[ng].g = 255;
h_groupRep[ng].b = 255;
}else{
h_groupRep[ng].r = r/elem;
h_groupRep[ng].g = g/elem;
h_groupRep[ng].b = b/elem;
}
}
timer1.Stop();
timeStep2 += timer1.Elapsed();
// STEP 3
//For each pixel in the image, compute the Euclidean distance to each representative
//and assign it to the set which is closest
h_change = 0;
timer2.Start();
checkCudaError(cudaMemcpyToSymbol(d_change, &h_change, sizeof(int)));
checkCudaError(cudaMemcpy(d_groupRep, h_groupRep, sizeof(Color)*h_numColors, cudaMemcpyHostToDevice));
clut_checkDistances<<<grid, block>>>(d_input, d_pixelGroup, d_groupRep);
checkCudaError(cudaMemcpy(h_pixelGroup, d_pixelGroup, sizeof(int)*h_rows*h_cols, cudaMemcpyDeviceToHost));
checkCudaError(cudaMemcpyFromSymbol(&h_change, d_change, sizeof(int)));
timer2.Stop();
timeStep3 += timer2.Elapsed();
std::cout << "Chunche" << std::endl;
} while (h_change == 1);
std::cout << "Phase 2: " << timeStep2 << std::endl;
std::cout << "Phase 3: " << timeStep3 << std::endl;
// STEP 4
//Create the new image with the resulting color lookup table
timer2.Start();
clut_createImage<<<grid, block>>>(d_output, d_pixelGroup, d_groupRep);
checkCudaError(cudaMemcpy(h_output, d_output, sizeof(Color)*h_rows*h_cols, cudaMemcpyDeviceToHost));
timer2.Stop();
std::cout << "Phase 4: " << timer2.Elapsed() << std::endl;
checkCudaError(cudaFree(d_input));
checkCudaError(cudaFree(d_pixelGroup));
checkCudaError(cudaFree(d_groupRep));
checkCudaError(cudaFree(d_output));
timer1.Start();
exit_code = writeRGB2Text(argv[2], h_output, h_rows, h_cols);
timer1.Stop();
std::cout << "Writing: " << timer1.Elapsed() << std::endl;
delete[] h_pixelGroup;
delete[] h_groupRep;
delete[] h_output;
return SUCCESS;
}
When I print from within the kernel, I get zeros for the three values:
__global__
void clut_distributePixels(int *pixelGroup){
int i = blockDim.y * blockIdx.y + threadIdx.y;
int j = blockDim.x * blockIdx.x + threadIdx.x;
if(i == 0 && j == 0){
printf("a: %d\n", c_rows);
printf("b: %d\n", c_cols);
printf("c: %d\n", c_numColors);
}
while (i < c_rows) {
while (j < c_cols) {
pixelGroup[i*c_cols+j] = (i*c_cols+j)/c_numColors;
j += gridDim.x * blockDim.x;
}
j = blockDim.x * blockIdx.x + threadIdx.x;
i += gridDim.y * blockDim.y;
}
}
Either I am not copying correctly to constant memory, or ... I don't know what could be wrong. Any advice?
I posted the entire host code; probably something else is messing with the constant copies.
UPDATE
Main.cu
#include "Imageproc.cuh"
int main(){
int h_change = 0;
int h_rows = 512;
cudaMemcpyToSymbol(c_rows, &h_rows, sizeof(int));
chunche<<<1,1>>>();
cudaMemcpyFromSymbol(&h_change, d_change, sizeof(int));
std::cout << "H = " << h_change << std::endl;
return 0;
}
Imageproc.cuh
#ifndef _IMAGEPROC_CUH_
#define _IMAGEPROC_CUH_
#include "Utilities.cuh"
#define B_WIDTH 16
#define B_HEIGHT 16
__constant__ int c_rows;
__constant__ int c_cols;
__constant__ int c_numColors;
__device__ int d_change;
#ifdef __cplusplus
extern "C"
{
#endif
__global__
void chunche();
__global__
void clut_distributePixels(int *pixelGroup);
__global__
void clut_checkDistances(Color *input, int *pixelGroup, Color *groupRep);
__global__
void clut_createImage(Color *clutImage, int *pixelGroup, Color *groupRep);
#ifdef __cplusplus
}
#endif
#endif
Imageproc.cu
#include "Imageproc.cuh"
__global__
void chunche(){
d_change = c_rows + 1;
}
__global__
void clut_distributePixels(int *pixelGroup){
int i = blockDim.y * blockIdx.y + threadIdx.y;
int j = blockDim.x * blockIdx.x + threadIdx.x;
while (i < c_rows) {
while (j < c_cols) {
pixelGroup[i*c_cols+j] = (i*c_cols+j)/c_numColors;
j += gridDim.x * blockDim.x;
}
j = blockDim.x * blockIdx.x + threadIdx.x;
i += gridDim.y * blockDim.y;
}
}
__global__
void clut_checkDistances(Color *input, int *pixelGroup, Color *groupRep){
int i = blockDim.y * blockIdx.y + threadIdx.y;
int j = blockDim.x * blockIdx.x + threadIdx.x;
int newGroup;
while (i < c_rows) {
while (j < c_cols) {
newGroup = 0;
for (int ng = 1; ng < c_numColors; ng++) {
if (
/*If distance from color to group ng is less than distance from color to group idx
then color should belong to ng*/
(groupRep[ng].r-input[i*c_cols+j].r)*(groupRep[ng].r-input[i*c_cols+j].r) +
(groupRep[ng].g-input[i*c_cols+j].g)*(groupRep[ng].g-input[i*c_cols+j].g) +
(groupRep[ng].b-input[i*c_cols+j].b)*(groupRep[ng].b-input[i*c_cols+j].b)
<
(groupRep[newGroup].r-input[i*c_cols+j].r)*(groupRep[newGroup].r-input[i*c_cols+j].r)+
(groupRep[newGroup].g-input[i*c_cols+j].g)*(groupRep[newGroup].g-input[i*c_cols+j].g)+
(groupRep[newGroup].b-input[i*c_cols+j].b)*(groupRep[newGroup].b-input[i*c_cols+j].b)
)
{
newGroup = ng;
}
}
if (pixelGroup[i*c_cols+j] != newGroup) {
pixelGroup[i*c_cols+j] = newGroup;
d_change = 1;
}
j += gridDim.x * blockDim.x;
}
j = blockDim.x * blockIdx.x + threadIdx.x;
i += gridDim.y * blockDim.y;
}
}
__global__
void clut_createImage(Color *clutImage, int *pixelGroup, Color *groupRep){
int i = blockDim.y * blockIdx.y + threadIdx.y;
int j = blockDim.x * blockIdx.x + threadIdx.x;
while (i < c_rows) {
while (j < c_cols) {
clutImage[i*c_cols+j].r = groupRep[pixelGroup[i*c_cols+j]].r;
clutImage[i*c_cols+j].g = groupRep[pixelGroup[i*c_cols+j]].g;
clutImage[i*c_cols+j].b = groupRep[pixelGroup[i*c_cols+j]].b;
j += gridDim.x * blockDim.x;
}
j = blockDim.x * blockIdx.x + threadIdx.x;
i += gridDim.y * blockDim.y;
}
}
Utilities.cuh
#ifndef _UTILITIES_CUH_
#define _UTILITIES_CUH_
#include <iostream>
#include <fstream>
#include <string>
#define SUCCESS 1
#define FAILURE 0
#define checkCudaError(val) check( (val), #val, __FILE__, __LINE__)
typedef struct {
int r;
int g;
int b;
} vec3u;
typedef vec3u Color;
typedef unsigned char uchar;
typedef uchar Grayscale;
struct GpuTimer{
cudaEvent_t start;
cudaEvent_t stop;
GpuTimer(){
cudaEventCreate(&start);
cudaEventCreate(&stop);
}
~GpuTimer(){
cudaEventDestroy(start);
cudaEventDestroy(stop);
}
void Start(){
cudaEventRecord(start, 0);
}
void Stop(){
cudaEventRecord(stop, 0);
}
float Elapsed(){
float elapsed;
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsed, start, stop);
return elapsed;
}
};
template<typename T>
void check(T err, const char* const func, const char* const file, const int line) {
if (err != cudaSuccess) {
std::cerr << "CUDA error at: " << file << ":" << line << std::endl;
std::cerr << cudaGetErrorString(err) << " " << func << std::endl;
exit(1);
}
}
int writeGrayscale2Text(const std::string filename, const Grayscale *image, const int rows, const int cols);
int readText2Grayscale(const std::string filename, Grayscale **image, int *rows, int *cols);
int writeRGB2Text(const std::string filename, const Color *image, const int rows, const int cols);
int readText2RGB(const std::string filename, Color **image, int *rows, int *cols);
struct CpuTimer{
clock_t start;
clock_t stop;
void Start(){
start = clock();
}
void Stop(){
stop = clock();
}
float Elapsed(){
return ((float)stop-start)/CLOCKS_PER_SEC * 1000.0f;
}
};
#endif
Utilities.cu
#include "Utilities.cuh"
int writeGrayscale2Text(const std::string filename, const Grayscale *image, const int rows, const int cols){
std::ofstream fileWriter(filename.c_str());
if (!fileWriter.is_open()) {
std::cerr << "** writeGrayscale2Text() ** : Unable to open file." << std::endl;
return FAILURE;
}
fileWriter << rows << "\n";
fileWriter << cols << "\n";
for (int i = 0; i < rows; i++) {
for (int j = 0; j < cols; j++) {
fileWriter << (int)image[i*cols+j] << "\n";
}
}
fileWriter.close();
return SUCCESS;
}
int readText2Grayscale(const std::string filename, Grayscale **image, int *rows, int *cols){
std::ifstream fileReader(filename.c_str());
if (!fileReader.is_open()) {
std::cerr << "** readText2Grayscale() ** : Unable to open file." << std::endl;
return FAILURE;
}
fileReader >> *rows;
fileReader >> *cols;
*image = new Grayscale[(*rows)*(*cols)];
int value;
for (int i = 0; i < *rows; i++) {
for (int j = 0; j < *cols; j++) {
fileReader >> value;
(*image)[i*(*cols)+j] = (Grayscale)value;
}
}
fileReader.close();
return SUCCESS;
}
int writeRGB2Text(const std::string filename, const Color *image, const int rows, const int cols){
std::ofstream fileWriter(filename.c_str());
if (!fileWriter.is_open()) {
std::cerr << "** writeRGB2Text() ** : Unable to open file." << std::endl;
return FAILURE;
}
fileWriter << rows << "\n";
fileWriter << cols << "\n";
for (int k = 0; k < 3; k++) {
for (int i = 0; i < rows; i++) {
for (int j = 0; j < cols; j++) {
switch (k) {
case 0:
fileWriter << image[i*cols+j].r << "\n";
break;
case 1:
fileWriter << image[i*cols+j].g << "\n";
break;
case 2:
fileWriter << image[i*cols+j].b << "\n";
break;
}
}
}
}
fileWriter.close();
return SUCCESS;
}
int readText2RGB(const std::string filename, Color **image, int *rows, int *cols){
std::ifstream fileReader(filename.c_str());
if (!fileReader.is_open()) {
std::cerr << "** readText2Grayscale() ** : Unable to open file." << std::endl;
return FAILURE;
}
fileReader >> *rows;
fileReader >> *cols;
*image = new Color[(*rows)*(*cols)];
for (int k = 0; k < 3; k++) {
for (int i = 0; i < *rows; i++) {
for (int j = 0; j < *cols; j++) {
switch (k) {
case 0:
fileReader >> (*image)[i*(*cols)+j].r;
break;
case 1:
fileReader >> (*image)[i*(*cols)+j].g;
break;
case 2:
fileReader >> (*image)[i*(*cols)+j].b;
break;
}
}
}
}
fileReader.close();
return SUCCESS;
}

Constant memory has implicit local scope linkage (see the answer to this question on Stack Overflow).
This means that the cudaMemcpyToSymbol has to be in the same generated .obj file as the kernel where you want to use the symbol.
You do your memcopy in Main.cu, but the kernel where you use your constant memory is in Imageproc.cu, so the constant values are unknown to the kernel chunche.
One option to solve your problem is to implement a wrapper: add a function in Imageproc.cu that does the cudaMemcpyToSymbol, then call that wrapper from Main.cu, passing the desired values for the constant memory.
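For illustration, a minimal sketch of such a wrapper (the function name setClutConstants is made up; the sketch reuses the checkCudaError macro from Utilities.cuh, and you would declare the wrapper in Imageproc.cuh so Main.cu can call it):

// In Imageproc.cu -- the same compilation unit as the __constant__ symbols
// and the kernels, so cudaMemcpyToSymbol resolves to the right instances.
extern "C" void setClutConstants(int rows, int cols, int numColors){
    checkCudaError(cudaMemcpyToSymbol(c_rows, &rows, sizeof(int)));
    checkCudaError(cudaMemcpyToSymbol(c_cols, &cols, sizeof(int)));
    checkCudaError(cudaMemcpyToSymbol(c_numColors, &numColors, sizeof(int)));
}

// In Main.cu, call the wrapper instead of cudaMemcpyToSymbol directly:
//   setClutConstants(h_rows, h_cols, h_numColors);
//   chunche<<<1,1>>>();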

Related

How to cope with "cudaErrorMissingConfiguration" from "cudaMallocPitch" function of CUDA?

I'm making a Mandelbrot set program with CUDA. However, I can't make any further progress until the cudaErrorMissingConfiguration error from CUDA's cudaMallocPitch() function is solved. Could you tell me something about it?
My GPU is GeForce RTX 2060 SUPER.
I'll show you my command lines below.
> nvcc MandelbrotCUDA.cu -o MandelbrotCUDA -O3
I tried cudaDeviceSetLimit( cudaLimitMallocHeapSize, 7*1024*1024*1024 ) to resize the heap.
cudaDeviceSetLimit succeeded.
However, I cannot get one step further: "CUDA malloc done!" is never printed.
#include <iostream>
#include <thrust/complex.h>
#include <fstream>
#include <string>
#include <stdlib.h>
using namespace std;
#define D 0.0000025 // Tick
#define LIMIT_N 255
#define INF_NUM 2
#define PLOT_METHOD 2 // dat file : 0, ppm file : 1, ppm file with C : 2
__global__
void calculation(const int indexTotalX, const int indexTotalY, int ***n, thrust::complex<double> ***c){ // n, c are the pointers of dN, dC.
for(int i = 0; i < indexTotalY ; i++){
for(int j = 0; j < indexTotalX; j++){
thrust::complex<double> z(0.0f, 0.0f);
n[i][j] = 0;
for(int ctr=1; ctr <= LIMIT_N ; ctr++){
z = z*z + (*(c[i][j]));
n[i][j] = n[i][j] + (abs(z) < INF_NUM);
}
}
}
}
int main(){
// Data Path
string filePath = "Y:\\Documents\\Programming\\mandelbrot\\";
string fileName = "mandelbrot4.ppm";
string filename = filePath+fileName;
//complex<double> c[N][M];
double xRange[2] = {-0.76, -0.74};
double yRange[2] = {0.05, 0.1};
const int indexTotalX = (xRange[1]-xRange[0])/D;
const int indexTotalY = (yRange[1]-yRange[0])/D;
thrust::complex<double> **c;
//c = new complex<double> [N];
cout << "debug_n" << endl;
int **n;
n = new int* [indexTotalY];
c = new thrust::complex<double> * [indexTotalY];
for(int i=0;i<indexTotalY;i++){
n[i] = new int [indexTotalX];
c[i] = new thrust::complex<double> [indexTotalX];
}
cout << "debug_n_end" << endl;
for(int i = 0; i < indexTotalY; i++){
for(int j = 0; j < indexTotalX; j++){
thrust::complex<double> tmp( xRange[0]+j*D, yRange[0]+i*D );
c[i][j] = tmp;
//n[i*sqrt(N)+j] = 0;
}
}
// CUDA malloc
cout << "CUDA malloc initializing..." << endl;
int **dN;
thrust::complex<double> **dC;
cudaError_t error;
error = cudaDeviceSetLimit(cudaLimitMallocHeapSize, 7*1024*1024*1024);
if(error != cudaSuccess){
cout << "cudaDeviceSetLimit's ERROR CODE = " << error << endl;
return 0;
}
size_t tmpPitch;
error = cudaMallocPitch((void **)dN, &tmpPitch,(size_t)(indexTotalY*sizeof(int)), (size_t)(indexTotalX*sizeof(int)));
if(error != cudaSuccess){
cout << "CUDA ERROR CODE = " << error << endl;
cout << "indexTotalX = " << indexTotalX << endl;
cout << "indexTotalY = " << indexTotalY << endl;
return 0;
}
cout << "CUDA malloc done!" << endl;
These are the console messages:
debug_n
debug_n_end
CUDA malloc initializing...
CUDA ERROR CODE = 1
indexTotalX = 8000
indexTotalY = 20000
There are several problems here:
int **dN;
...
error = cudaMallocPitch((void **)dN, &tmpPitch,(size_t)(indexTotalY*sizeof(int)), (size_t)(indexTotalX*sizeof(int)));
The correct type of pointer to use in CUDA allocations is a single pointer:
int *dN;
not a double pointer:
int **dN;
(so your kernel, where you are trying to pass triple pointers:
void calculation(const int indexTotalX, const int indexTotalY, int ***n, thrust::complex<double> ***c){ // n, c are the pointers of dN, dC.
is almost certainly not going to work, and should not be designed that way, but that is not the question you are asking.)
The pointer is passed to the allocating function by its address:
error = cudaMallocPitch((void **)&dN,
For cudaMallocPitch, only the horizontal requested dimension is scaled by the size of the data element. The allocation height is not scaled this way. Also, I will assume X corresponds to your allocation width, and Y corresponds to your allocation height, so you also have those parameters reversed:
error = cudaMallocPitch((void **)&dN, &tmpPitch,(size_t)(indexTotalX*sizeof(int)), (size_t)(indexTotalY));
Setting cudaLimitMallocHeapSize should not be necessary to make any of this work; it applies only to in-kernel allocations. Reserving 7GB on an 8GB card may also cause problems. Until you are sure you need it (it's not needed for what you have shown), I would simply remove that call.
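As an aside, once the pitched allocation succeeds, rows must be addressed through the returned pitch (in bytes), not through indexTotalX. A minimal sketch of the usual idiom (the kernel name usePitched is illustrative, not part of the original program):

// Element (row, col) of a pitched 2D allocation: offset the base pointer by
// row * pitch bytes, then index within the row. pitch >= width in bytes.
__global__ void usePitched(int *dN, size_t pitch, int width, int height)
{
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    if (row < height && col < width) {
        int *rowPtr = (int *)((char *)dN + row * pitch);
        rowPtr[col] = 0;  // write element (row, col)
    }
}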
$ cat t1488.cu
#include <iostream>
#include <thrust/complex.h>
#include <fstream>
#include <string>
#include <stdlib.h>
using namespace std;
#define D 0.0000025 // Tick
#define LIMIT_N 255
#define INF_NUM 2
#define PLOT_METHOD 2 // dat file : 0, ppm file : 1, ppm file with C : 2
__global__
void calculation(const int indexTotalX, const int indexTotalY, int ***n, thrust::complex<double> ***c){ // n, c are the pointers of dN, dC.
for(int i = 0; i < indexTotalY ; i++){
for(int j = 0; j < indexTotalX; j++){
thrust::complex<double> z(0.0f, 0.0f);
n[i][j] = 0;
for(int ctr=1; ctr <= LIMIT_N ; ctr++){
z = z*z + (*(c[i][j]));
n[i][j] = n[i][j] + (abs(z) < INF_NUM);
}
}
}
}
int main(){
// Data Path
string filePath = "Y:\\Documents\\Programming\\mandelbrot\\";
string fileName = "mandelbrot4.ppm";
string filename = filePath+fileName;
//complex<double> c[N][M];
double xRange[2] = {-0.76, -0.74};
double yRange[2] = {0.05, 0.1};
const int indexTotalX = (xRange[1]-xRange[0])/D;
const int indexTotalY = (yRange[1]-yRange[0])/D;
thrust::complex<double> **c;
//c = new complex<double> [N];
cout << "debug_n" << endl;
int **n;
n = new int* [indexTotalY];
c = new thrust::complex<double> * [indexTotalY];
for(int i=0;i<indexTotalY;i++){
n[i] = new int [indexTotalX];
c[i] = new thrust::complex<double> [indexTotalX];
}
cout << "debug_n_end" << endl;
for(int i = 0; i < indexTotalY; i++){
for(int j = 0; j < indexTotalX; j++){
thrust::complex<double> tmp( xRange[0]+j*D, yRange[0]+i*D );
c[i][j] = tmp;
//n[i*sqrt(N)+j] = 0;
}
}
// CUDA malloc
cout << "CUDA malloc initializing..." << endl;
int *dN;
thrust::complex<double> **dC;
cudaError_t error;
size_t tmpPitch;
error = cudaMallocPitch((void **)&dN, &tmpPitch,(size_t)(indexTotalX*sizeof(int)), (size_t)(indexTotalY));
if(error != cudaSuccess){
cout << "CUDA ERROR CODE = " << error << endl;
cout << "indexTotalX = " << indexTotalX << endl;
cout << "indexTotalY = " << indexTotalY << endl;
return 0;
}
cout << "CUDA malloc done!" << endl;
}
$ nvcc -o t1488 t1488.cu
t1488.cu(68): warning: variable "dC" was declared but never referenced
$ cuda-memcheck ./t1488
========= CUDA-MEMCHECK
debug_n
debug_n_end
CUDA malloc initializing...
CUDA malloc done!
========= ERROR SUMMARY: 0 errors
$

Copying an array from host to constant memory [duplicate]


Comparison of CUDA Shared Memory Copies: Which Approach Is Better

When copying an array to another array in shared memory, I tried six different approaches (see the comments in the program). After discussion and testing, my conclusions are:
(1) memcpy is not faster than element-wise copy of an array.
(2) For small arrays, approach 3 is the best. For larger arrays, approach 6 is the best.
#include <stdio.h>
#include <iostream>
#include <time.h>
#include <stdlib.h>
#include <assert.h>
const int NUM_OF_BLOCKS = 1;
const int NUM_OF_THREADS_PER_BLOCK = 8;
const int NUM_OF_DATA = 6000;
const int NUM_OF_COPIES= 1000;
//const int NUM_OF_COPIES= 1000000;
cudaError_t cuda_status;
__shared__ int start_index[NUM_OF_THREADS_PER_BLOCK];
__shared__ int end_index[NUM_OF_THREADS_PER_BLOCK];
__shared__ int src[NUM_OF_DATA];
__shared__ int dest[NUM_OF_DATA];
using namespace std;
__device__ void init(){
unsigned int num_of_data_per_thread = NUM_OF_DATA / NUM_OF_THREADS_PER_BLOCK;
unsigned int extra_data = NUM_OF_DATA % NUM_OF_THREADS_PER_BLOCK;
int size[NUM_OF_THREADS_PER_BLOCK];
start_index[threadIdx.x] = threadIdx.x * num_of_data_per_thread;
if (threadIdx.x < extra_data){
start_index[threadIdx.x] = start_index[threadIdx.x] + threadIdx.x;
size[threadIdx.x] = num_of_data_per_thread + 1;
}else{
start_index[threadIdx.x] = start_index[threadIdx.x] + extra_data;
size[threadIdx.x] = num_of_data_per_thread ;
}
end_index[threadIdx.x] = start_index[threadIdx.x] + size[threadIdx.x] -1;
//printf("start_index[%d] = %d, end_index[%d] = %d\n", threadIdx.x, start_index[threadIdx.x], threadIdx.x, end_index[threadIdx.x]);
}
__device__ void inc_src_data(int* src){
int i;
for (i = 0; i < NUM_OF_DATA; i++, src++){
*src += 1;
}
//__threadfence_block();
}
template <int sel>
__device__ void copy_to_dest_array(int* src, int* dest){
int i;
switch (sel){
case 1:
// Approach 1: every thread executes memcpy
memcpy(dest, src, NUM_OF_DATA * sizeof(int));
break;
case 2:
// Approach 2: one thread executes memcpy and then threadfence
if (threadIdx.x == 0){
memcpy(dest, src, NUM_OF_DATA * sizeof(int));
__threadfence_block();
}
break;
case 3:
// Approach 3: every thread copies each element individually
for (i = 0; i < NUM_OF_DATA; i++, dest++, src++)
*dest = *src;
//__threadfence_block(); // added this line to demonstrate timing difference
break;
case 4:
// Approach 4: one thread copy each element individually and then threadfence
if (threadIdx.x == 0)
for (i = 0; i < NUM_OF_DATA; i++, dest++, src++)
*dest = *src;
__threadfence_block();
break;
case 5:
// Approach 5: every thread execute memcpy and then threadfence
memcpy(dest+start_index[threadIdx.x], src + start_index[threadIdx.x], (end_index[threadIdx.x] - start_index[threadIdx.x] + 1) * sizeof(int));
__threadfence_block();
break;
case 6:
// Approach 6: every thread copies each element individually and then threadfence
for (i = start_index[threadIdx.x]; i <= end_index[threadIdx.x]; i++){
dest[i] = src[i];
}
__threadfence_block();
break;
default:
assert(0);
break;
}
}
template <int sel>
__global__ void copy_data_test(int* data){
init();
copy_to_dest_array<sel>(data, src);
for (int i = 0; i < NUM_OF_COPIES; i++){
inc_src_data(src);
copy_to_dest_array<sel>(&src[0], &dest[0]);
}
copy_to_dest_array<sel>(dest, data);
}
template <int sel>
void run_test(int *rdata, int *hdata, int *ddata){
cudaEvent_t start, stop;
cudaEventCreate(&start); cudaEventCreate(&stop);
cudaMemcpy(ddata, hdata, NUM_OF_DATA * sizeof(int), cudaMemcpyHostToDevice);
cudaEventRecord(start);
copy_data_test<sel><<<NUM_OF_BLOCKS, NUM_OF_THREADS_PER_BLOCK>>>(ddata);
cudaEventRecord(stop);
cout << "kernel error: " << cudaGetErrorString(cudaPeekAtLastError()) << "---" << cudaGetErrorString(cudaDeviceSynchronize()) << endl;
cudaMemcpy(rdata, ddata, NUM_OF_DATA * sizeof(int), cudaMemcpyDeviceToHost);
cudaEventSynchronize(stop);
float et;
cudaEventElapsedTime(&et, start, stop);
cout << "Trial " << sel << " elapsed time: " << et << "ms" << endl;
/*
cout << "after kernel processing" << endl;
for (int i = 0; i < NUM_OF_DATA; i++)
cout << rdata[i] << " ";
cout << endl;
*/
cudaEventDestroy(start);
cudaEventDestroy(stop);
}
int main(int argc, char **argv){
int h_data[NUM_OF_DATA];
int r_data[NUM_OF_DATA];
int* d_data;
int i;
cudaSetDevice(0);
srand(time(NULL));
/*
cout << "before kernel processing" << endl;
for (i = 0; i < NUM_OF_DATA; i++){
h_data[i] = rand()%100;
cout << h_data[i] << " ";
}
cout << endl;
*/
cudaMalloc(&d_data, sizeof(int) * NUM_OF_DATA);
run_test<1>(r_data, h_data, d_data);
run_test<2>(r_data, h_data, d_data);
run_test<3>(r_data, h_data, d_data);
run_test<4>(r_data, h_data, d_data);
run_test<5>(r_data, h_data, d_data);
run_test<6>(r_data, h_data, d_data);
return 0;
}
It seems fairly clear that __threadfence_block() is an expensive operation. The four longest test cases all use __threadfence_block(). The two shortest test cases do not.
If I add __threadfence_block() to the 3rd (i.e. the shortest) test case, the timing (for me) changes from ~2 sec to ~17 sec.
Note that your test cases are not all doing the exact same thing, as evidenced by the difference in the output results. I made a modified version of your code that more clearly demonstrates this:
#include <stdio.h>
#include <iostream>
#include <time.h>
#include <stdlib.h>
#include <assert.h>
const int NUM_OF_BLOCKS = 1;
const int NUM_OF_THREADS_PER_BLOCK = 8;
const int NUM_OF_DATA = 50;
const int NUM_OF_COPIES= 10000000;
cudaError_t cuda_status;
__shared__ int start_index[NUM_OF_THREADS_PER_BLOCK];
__shared__ int end_index[NUM_OF_THREADS_PER_BLOCK];
__shared__ int src[NUM_OF_DATA];
__shared__ int dest[NUM_OF_DATA];
using namespace std;
__device__ void init(){
unsigned int num_of_data_per_thread = NUM_OF_DATA / NUM_OF_THREADS_PER_BLOCK;
unsigned int extra_data = NUM_OF_DATA % NUM_OF_THREADS_PER_BLOCK;
int size[NUM_OF_THREADS_PER_BLOCK];
start_index[threadIdx.x] = threadIdx.x * num_of_data_per_thread;
if (threadIdx.x < extra_data){
start_index[threadIdx.x] = start_index[threadIdx.x] + threadIdx.x;
size[threadIdx.x] = num_of_data_per_thread + 1;
}else{
start_index[threadIdx.x] = start_index[threadIdx.x] + extra_data;
size[threadIdx.x] = num_of_data_per_thread ;
}
end_index[threadIdx.x] = start_index[threadIdx.x] + size[threadIdx.x] -1;
}
__device__ void inc_src_data(int* src){
int i;
for (i = 0; i < NUM_OF_DATA; i++, src++){
*src += 1;
}
//__threadfence_block();
}
template <int sel>
__device__ void copy_to_dest_array(int* src, int* dest){
int i;
switch (sel){
case 1:
// Approach 1: every thread executes memcpy
memcpy(dest, src, NUM_OF_DATA);
break;
case 2:
// Approach 2: one thread executes memcpy and then threadfence
if (threadIdx.x == 0){
memcpy(dest, src, NUM_OF_DATA);
__threadfence_block();
}
break;
case 3:
// Approach 3: every thread copies each element individually
for (i = 0; i < NUM_OF_DATA; i++, dest++, src++)
*dest = *src;
__threadfence_block(); // added this line to demonstrate timing difference
break;
case 4:
// Approach 4: one thread copy each element individually and then threadfence
if (threadIdx.x == 0)
for (i = 0; i < NUM_OF_DATA; i++, dest++, src++)
*dest = *src;
__threadfence_block();
break;
case 5:
// Approach 5: every thread execute memcpy and then threadfence
memcpy(dest+start_index[threadIdx.x], src + start_index[threadIdx.x], end_index[threadIdx.x] - start_index[threadIdx.x] + 1);
__threadfence_block();
break;
case 6:
// Approach 6: every thread copies each element individually and then threadfence
for (i = start_index[threadIdx.x]; i <= end_index[threadIdx.x]; i++){
*(dest + i) = *(src + i);
}
__threadfence_block();
break;
default:
assert(0);
break;
}
}
template <int sel>
__global__ void copy_data_test(int* data){
init();
copy_to_dest_array<sel>(data, src);
for (int i = 0; i < NUM_OF_COPIES; i++){
inc_src_data(src);
copy_to_dest_array<sel>(&src[0], &dest[0]);
}
copy_to_dest_array<sel>(dest, data);
}
template <int sel>
void run_test(int *rdata, int *hdata, int *ddata){
cudaEvent_t start, stop;
cudaEventCreate(&start); cudaEventCreate(&stop);
cudaMemcpy(ddata, hdata, NUM_OF_DATA * sizeof(int), cudaMemcpyHostToDevice);
cudaEventRecord(start);
copy_data_test<sel><<<NUM_OF_BLOCKS, NUM_OF_THREADS_PER_BLOCK>>>(ddata);
cudaEventRecord(stop);
cout << "kernel error: " << cudaGetErrorString(cudaPeekAtLastError()) << "---" << cudaGetErrorString(cudaDeviceSynchronize()) << endl;
cudaMemcpy(rdata, ddata, NUM_OF_DATA * sizeof(int), cudaMemcpyDeviceToHost);
cudaEventSynchronize(stop);
float et;
cudaEventElapsedTime(&et, start, stop);
cout << "Trial " << sel << " elapsed time: " << et << "ms" << endl;
cout << "after kernel processing" << endl;
for (int i = 0; i < NUM_OF_DATA; i++)
cout << rdata[i] << " ";
cout << endl;
cudaEventDestroy(start);
cudaEventDestroy(stop);
}
int main(int argc, char **argv){
int h_data[NUM_OF_DATA];
int r_data[NUM_OF_DATA];
int* d_data;
int i;
cudaSetDevice(0);
srand(time(NULL));
cout << "before kernel processing" << endl;
for (i = 0; i < NUM_OF_DATA; i++){
h_data[i] = rand()%100;
cout << h_data[i] << " ";
}
cout << endl;
cudaMalloc(&d_data, sizeof(int) * NUM_OF_DATA);
run_test<1>(r_data, h_data, d_data);
run_test<2>(r_data, h_data, d_data);
run_test<3>(r_data, h_data, d_data);
run_test<4>(r_data, h_data, d_data);
run_test<5>(r_data, h_data, d_data);
run_test<6>(r_data, h_data, d_data);
return 0;
}

False dependency issue for the Fermi architecture

I am trying to achieve "3-way overlapping" using 3 streams, as in the examples in the CUDA streams and concurrency webinar, but I couldn't achieve it.
I have Geforce GT 550M (Fermi Architecture with one copy engine) and I am using Windows 7 (64 bit).
Here is the code that I have written.
#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// includes, project
#include "helper_cuda.h"
#include "helper_functions.h" // helper utility functions
#include <stdio.h>
using namespace std;
#define DATA_SIZE 6000000
#define NUM_THREADS 32
#define NUM_BLOCKS 16
#define NUM_STREAMS 3
__global__ void kernel(const int *in, int *out, int dataSize)
{
int start = blockIdx.x * blockDim.x + threadIdx.x;
int end = dataSize;
for (int i = start; i < end; i += blockDim.x * gridDim.x)
{
out[i] = in[i] * in[i];
}
}
int main()
{
const int dataSize = DATA_SIZE;
int *h_in = new int[dataSize];
int *h_out = new int[dataSize];
int *h_groundTruth = new int[dataSize];
// Input population
for(int i = 0; i < dataSize; i++)
h_in[i] = 5;
for(int i = 0; i < dataSize; i++)
h_out[i] = 0;
// CPU calculation for ground truth
for(int i = 0; i < dataSize; i++)
h_groundTruth[i] = h_in[i] * h_in[i];
// Choose which GPU to run on, change this on a multi-GPU system.
checkCudaErrors( cudaSetDevice(0) );
int *d_in = 0;
int *d_out = 0;
int streamSize = dataSize / NUM_STREAMS;
size_t memSize = dataSize * sizeof(int);
size_t streamMemSize = memSize / NUM_STREAMS;
checkCudaErrors( cudaMalloc( (void **)&d_in, memSize) );
checkCudaErrors( cudaMalloc( (void **)&d_out, memSize) );
// registers host memory as page-locked (required for asynch cudaMemcpyAsync)
checkCudaErrors(cudaHostRegister(h_in, memSize, cudaHostRegisterPortable));
checkCudaErrors(cudaHostRegister(h_out, memSize, cudaHostRegisterPortable));
// set kernel launch config
dim3 nThreads = dim3(NUM_THREADS,1,1);
dim3 nBlocks = dim3(NUM_BLOCKS,1,1);
cout << "GPU Kernel Configuration : " << endl;
cout << "Number of Streams :\t" << NUM_STREAMS << " with size: \t" << streamSize << endl;
cout << "Number of Threads :\t" << nThreads.x << "\t" << nThreads.y << "\t" << nThreads.z << endl;
cout << "Number of Blocks :\t" << nBlocks.x << "\t" << nBlocks.y << "\t" << nBlocks.z << endl;
// create cuda stream
cudaStream_t streams[NUM_STREAMS];
for(int i = 0; i < NUM_STREAMS; i++)
checkCudaErrors(cudaStreamCreate(&streams[i]));
// create cuda event handles
cudaEvent_t start, stop;
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
cudaEventRecord(start, 0);
// overlapped execution using version 2
for(int i = 0; i < NUM_STREAMS; i++)
{
int offset = i * streamSize;
cudaMemcpyAsync(&d_in[offset], &h_in[offset], streamMemSize, cudaMemcpyHostToDevice, streams[i]);
}
//cudaMemcpy(d_in, h_in, memSize, cudaMemcpyHostToDevice);
for(int i = 0; i < NUM_STREAMS; i++)
{
int offset = i * streamSize;
dim3 subKernelBlock = dim3((int)ceil((float)nBlocks.x / 2));
//kernel<<<nBlocks, nThreads, 0, streams[i]>>>(&d_in[offset], &d_out[offset], streamSize);
kernel<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset], &d_out[offset], streamSize/2);
kernel<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset + streamSize/2], &d_out[offset + streamSize/2], streamSize/2);
}
for(int i = 0; i < NUM_STREAMS; i++)
{
int offset = i * streamSize;
cudaMemcpyAsync(&h_out[offset], &d_out[offset], streamMemSize, cudaMemcpyDeviceToHost, streams[i]);
}
for(int i = 0; i < NUM_STREAMS; i++)
checkCudaErrors(cudaStreamSynchronize(streams[i]));
cudaEventRecord(stop, 0);
checkCudaErrors(cudaStreamSynchronize(0));
checkCudaErrors(cudaDeviceSynchronize());
float gpu_time = 0;
checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));
// release resources
checkCudaErrors(cudaEventDestroy(start));
checkCudaErrors(cudaEventDestroy(stop));
checkCudaErrors(cudaHostUnregister(h_in));
checkCudaErrors(cudaHostUnregister(h_out));
checkCudaErrors(cudaFree(d_in));
checkCudaErrors(cudaFree(d_out));
for(int i = 0; i < NUM_STREAMS; i++)
checkCudaErrors(cudaStreamDestroy(streams[i]));
cudaDeviceReset();
cout << "Execution Time of GPU: " << gpu_time << "ms" << endl;
// GPU output check
int sum = 0;
for(int i = 0; i < dataSize; i++)
sum += h_groundTruth[i] - h_out[i];
cout << "Error between CPU and GPU: " << sum << endl;
delete[] h_in;
delete[] h_out;
delete[] h_groundTruth;
return 0;
}
Using Nsight for profiling, I have this result:
It may seem correct, but why does the D2H transfer in stream #1 only start after the last kernel launch of stream #2, and not before?
I also tried using 8 streams (just by changing NUM_STREAMS to 8) to achieve such a "3-way overlap", and here is the result:
The interesting thing is that when I use 8 streams, the overlap between computation and memory transfers seems to be much better.
What is the reason for this problem? Is it due to the WDDM driver, or is there something wrong with my program?
From the comments above, it seems that the OP's problem is a false dependency issue, suffered by the Fermi architecture and solved by the Hyper-Q feature of the Kepler architecture.
To summarize, the OP is highlighting the fact that the first D2H transfer (stream #1) does not start immediately after the last H2D (stream #3) finishes, while in principle it could. The time gap is highlighted by the red circle in the following figure (henceforth, unless otherwise specified, all the tests refer to a GeForce GT540M belonging to the Fermi family):
The OP's approach is a breadth-first approach, which operates according to the following scheme:
for(int i = 0; i < NUM_STREAMS; i++)
cudaMemcpyAsync(..., cudaMemcpyHostToDevice, streams[i]);
for(int i = 0; i < NUM_STREAMS; i++)
{
kernel_launch_1<<<..., 0, streams[i]>>>(...);
kernel_launch_2<<<..., 0, streams[i]>>>(...);
}
for(int i = 0; i < NUM_STREAMS; i++)
cudaMemcpyAsync(..., cudaMemcpyDeviceToHost, streams[i]);
Using a depth-first approach, operating according to the following scheme
for(int i = 0; i < NUM_STREAMS; i++)
{
cudaMemcpyAsync(...., cudaMemcpyHostToDevice, streams[i]);
kernel_launch_1<<<...., 0, streams[i]>>>(....);
kernel_launch_2<<<...., 0, streams[i]>>>(....);
cudaMemcpyAsync(...., cudaMemcpyDeviceToHost, streams[i]);
}
does not seem to improve the situation, according to the following timeline (the depth-first code is reported at the bottom of the answer); if anything, it seems to show worse overlapping:
Under the breadth-first approach, commenting out the second kernel launch, the first D2H copy starts as soon as it can, as reported by the following timeline:
Finally, running the code on a Kepler K20c, the problem does not show up, as illustrated by the following figure:
Here is the code for the depth-first approach:
#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// includes, project
#include "helper_cuda.h"
#include "helper_functions.h" // helper utility functions
#include <stdio.h>
using namespace std;
#define DATA_SIZE 6000000
#define NUM_THREADS 32
#define NUM_BLOCKS 16
#define NUM_STREAMS 3
__global__ void kernel(const int *in, int *out, int dataSize)
{
int start = blockIdx.x * blockDim.x + threadIdx.x;
int end = dataSize;
for (int i = start; i < end; i += blockDim.x * gridDim.x)
{
out[i] = in[i] * in[i];
}
}
int main()
{
const int dataSize = DATA_SIZE;
int *h_in = new int[dataSize];
int *h_out = new int[dataSize];
int *h_groundTruth = new int[dataSize];
// Input population
for(int i = 0; i < dataSize; i++)
h_in[i] = 5;
for(int i = 0; i < dataSize; i++)
h_out[i] = 0;
// CPU calculation for ground truth
for(int i = 0; i < dataSize; i++)
h_groundTruth[i] = h_in[i] * h_in[i];
// Choose which GPU to run on, change this on a multi-GPU system.
checkCudaErrors( cudaSetDevice(0) );
int *d_in = 0;
int *d_out = 0;
int streamSize = dataSize / NUM_STREAMS;
size_t memSize = dataSize * sizeof(int);
size_t streamMemSize = memSize / NUM_STREAMS;
checkCudaErrors( cudaMalloc( (void **)&d_in, memSize) );
checkCudaErrors( cudaMalloc( (void **)&d_out, memSize) );
// registers host memory as page-locked (required for asynch cudaMemcpyAsync)
checkCudaErrors(cudaHostRegister(h_in, memSize, cudaHostRegisterPortable));
checkCudaErrors(cudaHostRegister(h_out, memSize, cudaHostRegisterPortable));
// set kernel launch config
dim3 nThreads = dim3(NUM_THREADS,1,1);
dim3 nBlocks = dim3(NUM_BLOCKS,1,1);
cout << "GPU Kernel Configuration : " << endl;
cout << "Number of Streams :\t" << NUM_STREAMS << " with size: \t" << streamSize << endl;
cout << "Number of Threads :\t" << nThreads.x << "\t" << nThreads.y << "\t" << nThreads.z << endl;
cout << "Number of Blocks :\t" << nBlocks.x << "\t" << nBlocks.y << "\t" << nBlocks.z << endl;
// create cuda stream
cudaStream_t streams[NUM_STREAMS];
for(int i = 0; i < NUM_STREAMS; i++)
checkCudaErrors(cudaStreamCreate(&streams[i]));
// create cuda event handles
cudaEvent_t start, stop;
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
cudaEventRecord(start, 0);
for(int i = 0; i < NUM_STREAMS; i++)
{
int offset = i * streamSize;
cudaMemcpyAsync(&d_in[offset], &h_in[offset], streamMemSize, cudaMemcpyHostToDevice, streams[i]);
dim3 subKernelBlock = dim3((int)ceil((float)nBlocks.x / 2));
kernel<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset], &d_out[offset], streamSize/2);
kernel<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset + streamSize/2], &d_out[offset + streamSize/2], streamSize/2);
cudaMemcpyAsync(&h_out[offset], &d_out[offset], streamMemSize, cudaMemcpyDeviceToHost, streams[i]);
}
for(int i = 0; i < NUM_STREAMS; i++)
checkCudaErrors(cudaStreamSynchronize(streams[i]));
cudaEventRecord(stop, 0);
checkCudaErrors(cudaStreamSynchronize(0));
checkCudaErrors(cudaDeviceSynchronize());
float gpu_time = 0;
checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));
// release resources
checkCudaErrors(cudaEventDestroy(start));
checkCudaErrors(cudaEventDestroy(stop));
checkCudaErrors(cudaHostUnregister(h_in));
checkCudaErrors(cudaHostUnregister(h_out));
checkCudaErrors(cudaFree(d_in));
checkCudaErrors(cudaFree(d_out));
for(int i = 0; i < NUM_STREAMS; i++)
checkCudaErrors(cudaStreamDestroy(streams[i]));
cudaDeviceReset();
cout << "Execution Time of GPU: " << gpu_time << "ms" << endl;
// GPU output check
int sum = 0;
for(int i = 0; i < dataSize; i++)
sum += h_groundTruth[i] - h_out[i];
cout << "Error between CPU and GPU: " << sum << endl;
delete[] h_in;
delete[] h_out;
delete[] h_groundTruth;
return 0;
}

CUDA call won't allocate more than 8 threads per block, regardless of specification

I am creating a parallel version of the Sieve of Eratosthenes in C++. The problem is that my kernel call (reduce0) seems to only ever assign 8 threads per block instead of the 256 I specify. Since even the first CUDA version allows 512 threads per block, there must be some error in my code. Any help would be appreciated.
#include <iostream>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <cutil.h>
//#include <sieve_kernel.cu>
using namespace std;
////////////////////////////////////////////////////
int psum(int arg[], double n);
int call_kernel(int primes[], int n);
int findsmallest(int arg[], int f, double n);
int sieve(int n);
__global__ void reduce0(int *g_idata, int *g_odata);
////////////////////////////////////////////////////
int main(){
int n = pow((double) 2, 8);
int total = sieve(n);
cout << "# primes" << endl << total << endl;
return 0;
}
///////////////////////////////////////////////////
__global__ void reduce0(int *g_idata, int *g_odata) {
extern __shared__ int sdata[];
// each thread loads one element from global to shared mem
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
sdata[tid] = g_idata[i];
__syncthreads();
// do reduction in shared mem
for (int s = 1; s < blockDim.x; s *= 2) { // step = s x 2
if (tid % (s*2) == 0) { // only threadIDs divisible by the step participate
sdata[tid] += sdata[tid + s];
}
__syncthreads();
}
// write result for this block to global mem
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
/////////////////////////////////////////////////////
int call_kernel(int *primes, int n){
// Allocate and copy device arrays
int *g_idevice;
int *g_odevice;
int size = n * sizeof(int);
cudaMalloc(&g_idevice, size);
cudaMemcpy(g_idevice, primes, size, cudaMemcpyHostToDevice);
cudaMalloc(&g_odevice, size);
// Specify grid/block dimensions and invoke the kernel
dim3 dimGrid(1,1);
dim3 dimBlock(256,1);
reduce0<<<dimGrid, dimBlock>>>(g_idevice, g_odevice);
// Copy device data back to primes
cudaMemcpy(primes, g_odevice, size, cudaMemcpyDeviceToHost);
//for (int i = 0; i < n; i++) {
// cout << i << " " << primes[i] << endl;
//}
int total = primes[0];
cudaFree(g_idevice);
cudaFree(g_odevice);
return total;
}
/////////////////////////////////////////////////////////////////////
int findsmallest(int arg[], int f, double n){
int i = f;
while(arg[i]!= 1 && i < n) {
i++;
}
return i;
}
//////////////////////////////////////////////////////////////////////
int psum(int arg[], double n){
int total = 0;
int i = 2;
while(i < n){
if(arg[i] == 1){
total = total + 1;
}
i++;
}
return total;
}
/////////////////////////////////////////////////////////////////////////
int sieve(int n){
int* primes = NULL;
int mult = 0;
int k = 2;
int i; int total;
//primes = new int[n];
primes = new int[256];
for(i = 0; i < n; i++){
primes[i] = 1;
}
primes[0] = primes[1] = 0;
while (k * k < n){
mult = k * k;
while (mult < n) {
primes[mult] = 0;
mult = mult + k;
}
k = findsmallest(primes,k+1, n);
}
total = call_kernel(primes, n);
//delete [] primes;
//primes = NULL;
return total;
}
Your kernel is using dynamically allocated shared memory, but the kernel launch does not include any allocation, so the result is that the kernel aborts because of illegal memory operations on that shared memory buffer. You should find it works if you modify this part of call_kernel as follows:
// Specify grid/block dimensions and invoke the kernel
dim3 dimGrid(1,1);
dim3 dimBlock(256,1);
size_t shmsize = size_t(dimBlock.x * dimBlock.y * dimBlock.z) * sizeof(int);
reduce0<<<dimGrid, dimBlock, shmsize>>>(g_idevice, g_odevice);
If you had included some basic error checking around the function call, perhaps like this:
reduce0<<<dimGrid, dimBlock>>>(g_idevice, g_odevice);
if (cudaPeekAtLastError() != cudaSuccess) {
cout << "kernel launch error: " << cudaGetErrorString(cudaGetLastError()) << endl;
}
// Copy device data back to primes
cudaError_t err = cudaMemcpy(primes, g_odevice, size, cudaMemcpyDeviceToHost);
if (err != cudaSuccess) {
cout << "CUDA error: " << cudaGetErrorString(err) << endl;
}
it would have been immediately obvious that the kernel launch or execution was failing with an error.
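A minimal sketch of folding that pattern into a reusable helper (the CHECK_LAUNCH name is made up, not from the original code; place it after any kernel launch):

// cudaPeekAtLastError catches launch-configuration errors;
// cudaDeviceSynchronize surfaces errors raised during kernel execution.
#define CHECK_LAUNCH()                                                  \
    do {                                                                \
        cudaError_t e = cudaPeekAtLastError();                          \
        if (e == cudaSuccess) e = cudaDeviceSynchronize();              \
        if (e != cudaSuccess)                                           \
            cout << "CUDA error: " << cudaGetErrorString(e) << endl;    \
    } while (0)

// Usage:
//   reduce0<<<dimGrid, dimBlock, shmsize>>>(g_idevice, g_odevice);
//   CHECK_LAUNCH();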