Request
I am writing to request guidance on optimizing my method "CalculateConvolutionOutputTensor__im2col". I would like help determining the best strategy for moving beyond my naive approach; intuition about any relevant GPU processes and how they apply (e.g., bank conflicts); and help interpreting the included profile in terms of what I can tweak.
A first run of the method takes 0.774 seconds using a GeForce RTX 2080 Ti. I have included a screenshot of the Nsight Compute profile of the only CUDA C++ kernel I have written: im2col.
Things I Could Do
I could have each GPU thread access shared memory instead of global memory. I could transfer GPU "heap" variables to kernel "stack" instead of dereferencing for every thread and in-kernel for-loop iteration. I could put small parameters into arrays in GPU memory and pass single pointers to those arrays. I could use a more sophisticated version of im2col.
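As an example of the second idea, here is a minimal sketch with a hypothetical kernel (not my actual im2col): the small constants are passed by value, so each thread reads them from registers instead of dereferencing int* pointers into global memory.
// Hypothetical sketch: scalars passed by value rather than as int* device pointers.
__global__
void scaleTensor(float* data, int elements, float scale)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    // elements and scale arrive as kernel arguments;
    // no per-thread global-memory dereference is needed to read them.
    if (index < elements) {
        data[index] *= scale;
    }
}
// Example launch: scaleTensor<<<blocks, threadsPerBlock>>>(tensor_GPU, elementCount, 0.5f);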
Things I Have Tried
I would prefer not to use cuDNN 7.6.5; when I do use it and write the statement "cudnnCreate(&cudnnHandle);", Nsight Compute indicates that the method cuModuleGetFunction returns CUDA_ERROR_NOT_FOUND.
Recreating Solution
The procedure I used to create this project:
1. Create a new CUDA 10.2 Runtime project using Visual Studio Community 2019.
2. Rename the default source file to "main.cu" and replace all of its contents with the first code block below.
3. Add "CalculateConvolutionOutputTensor__im2col.h" to the project and paste in the second code block below.
4. Add "CalculateConvolutionOutputTensor__im2col.cu" to the project and paste in the third code block below.
5. Add "cublas.lib;" to Project Properties -> Linker -> Input -> Additional Dependencies.
main.cu
// Allow use of cudaMalloc.
#include <cuda_runtime.h>
// Allow use of printf.
#include <cstdio>
// Allow use of sqrt.
#include <cmath>
// Allow use of time(NULL) as a seed.
#include <ctime>
// Allow construction of a default_random_engine.
#include <random>
// Allow use of CalculateConvolutionOutputTensor__im2col.
#include "CalculateConvolutionOutputTensor__im2col.h"
int main()
{
// --------------------------------------------------------------------------
// Declare and define parameters of CalculateConvolutionOutputTensor__im2col.
// --------------------------------------------------------------------------
float* convolutionOutputTensor;
cudaMalloc(&convolutionOutputTensor, 6 * 3 * 19 * 19 * 4 * sizeof(float));
int elementsInFilter = 3 * 590 * 590;
int elementsInChannelOfOutputTensor = 19 * 19;
int imagesInSubdivision = 4;
int channelsInFilter_host = 3;
int* channelsInFilter_GPU;
cudaMalloc(&channelsInFilter_GPU, sizeof(int));
cudaMemcpy(channelsInFilter_GPU, &channelsInFilter_host, sizeof(int), cudaMemcpyHostToDevice);
int widthOfFilter_host = 590;
int* widthOfFilter_GPU;
cudaMalloc(&widthOfFilter_GPU, sizeof(int));
cudaMemcpy(widthOfFilter_GPU, &widthOfFilter_host, sizeof(int), cudaMemcpyHostToDevice);
int heightOfOutputTensor_host = 19;
int* heightOfOutputTensor_GPU;
cudaMalloc(&heightOfOutputTensor_GPU, sizeof(int));
cudaMemcpy(heightOfOutputTensor_GPU, &heightOfOutputTensor_host, sizeof(int), cudaMemcpyHostToDevice);
int widthOfOutputTensor_host = 19;
int* widthOfOutputTensor_GPU;
cudaMalloc(&widthOfOutputTensor_GPU, sizeof(int));
cudaMemcpy(widthOfOutputTensor_GPU, &widthOfOutputTensor_host, sizeof(int), cudaMemcpyHostToDevice);
int elementsInChannelOfOutputTensor_host = 19 * 19;
int* elementsInChannelOfOutputTensor_GPU;
cudaMalloc(&elementsInChannelOfOutputTensor_GPU, sizeof(int));
cudaMemcpy(
elementsInChannelOfOutputTensor_GPU,
&elementsInChannelOfOutputTensor_host,
sizeof(int),
cudaMemcpyHostToDevice);
int channelsInFilter_times_elementsInChannelOfOutputTensor_host = 3 * 19 * 19;
int* channelsInFilter_times_elementsInChannelOfOutputTensor_GPU;
cudaMalloc(&channelsInFilter_times_elementsInChannelOfOutputTensor_GPU, sizeof(int));
cudaMemcpy(
channelsInFilter_times_elementsInChannelOfOutputTensor_GPU,
&channelsInFilter_times_elementsInChannelOfOutputTensor_host,
sizeof(int),
cudaMemcpyHostToDevice);
int elementsInCrossSectionOfFilter_times_elementsInChannelOfOutputTensor_host = 3 * 590 * 19 * 19;
int* elementsInCrossSectionOfFilter_times_elementsInChannelOfOutputTensor_GPU;
cudaMalloc(&elementsInCrossSectionOfFilter_times_elementsInChannelOfOutputTensor_GPU, sizeof(int));
cudaMemcpy(
elementsInCrossSectionOfFilter_times_elementsInChannelOfOutputTensor_GPU,
&elementsInCrossSectionOfFilter_times_elementsInChannelOfOutputTensor_host,
sizeof(int),
cudaMemcpyHostToDevice);
int elementsInInputTensor = 3 * 608 * 608 * 4;
float* inputTensor_host = new float[elementsInInputTensor];
for (int i = 0; i < elementsInInputTensor; ++i) {
inputTensor_host[i] = ((float)(i % 255)) / 255.0;
}
float* inputTensor_GPU;
cudaMalloc(&inputTensor_GPU, elementsInInputTensor * sizeof(float));
cudaMemcpy(
inputTensor_GPU,
inputTensor_host,
elementsInInputTensor * sizeof(float),
cudaMemcpyHostToDevice);
delete[] inputTensor_host;
int horizontalFilterStride_host = 1;
int* horizontalFilterStride_GPU;
cudaMalloc(&horizontalFilterStride_GPU, sizeof(int));
cudaMemcpy(
horizontalFilterStride_GPU,
&horizontalFilterStride_host,
sizeof(int),
cudaMemcpyHostToDevice);
int channelsInImage_host = 3;
int* channelsInImage_GPU;
cudaMalloc(&channelsInImage_GPU, sizeof(int));
cudaMemcpy(channelsInImage_GPU, &channelsInImage_host, sizeof(int), cudaMemcpyHostToDevice);
int verticalFilterStride_host = 1;
int* verticalFilterStride_GPU;
cudaMalloc(&verticalFilterStride_GPU, sizeof(int));
cudaMemcpy(
verticalFilterStride_GPU,
&verticalFilterStride_host,
sizeof(int),
cudaMemcpyHostToDevice);
int elementsInCrossSectionOfImage_host = 3 * 608;
int* elementsInCrossSectionOfImage_GPU;
cudaMalloc(&elementsInCrossSectionOfImage_GPU, sizeof(int));
cudaMemcpy(
elementsInCrossSectionOfImage_GPU,
&elementsInCrossSectionOfImage_host,
sizeof(int),
cudaMemcpyHostToDevice);
int elementsInImage_host = 3 * 608 * 608;
int* elementsInImage_GPU;
cudaMalloc(&elementsInImage_GPU, sizeof(int));
cudaMemcpy(elementsInImage_GPU, &elementsInImage_host, sizeof(int), cudaMemcpyHostToDevice);
int filters = 6 * 3;
int elementsInFilterTensor = 6 * 3 * 3 * 590 * 590;
float* filterTensor_host = new float[elementsInFilterTensor];
std::default_random_engine randomNumberGenerator(time(NULL));
std::normal_distribution<float> normalDistribution(0.0, 1.0);
for (int i = 0; i < elementsInFilterTensor; ++i) {
filterTensor_host[i] = normalDistribution(randomNumberGenerator) / sqrt((float)elementsInFilterTensor);
}
float* filterTensor_GPU;
cudaMalloc(&filterTensor_GPU, elementsInFilterTensor * sizeof(float));
cudaMemcpy(
filterTensor_GPU,
filterTensor_host,
elementsInFilterTensor * sizeof(float),
cudaMemcpyHostToDevice);
delete[] filterTensor_host;
int elementsInOutputSubtensor = 6 * 3 * 19 * 19;
// -------------------------------------------------
// Execute CalculateConvolutionOutputTensor__im2col.
// -------------------------------------------------
CalculateConvolutionOutputTensor__im2col(
convolutionOutputTensor,
elementsInFilter,
elementsInChannelOfOutputTensor_host,
imagesInSubdivision,
channelsInFilter_GPU,
widthOfFilter_GPU,
heightOfOutputTensor_GPU,
widthOfOutputTensor_GPU,
elementsInChannelOfOutputTensor_GPU,
channelsInFilter_times_elementsInChannelOfOutputTensor_GPU,
elementsInCrossSectionOfFilter_times_elementsInChannelOfOutputTensor_GPU,
inputTensor_GPU,
horizontalFilterStride_GPU,
channelsInImage_GPU,
verticalFilterStride_GPU,
elementsInCrossSectionOfImage_GPU,
elementsInImage_GPU,
filters,
filterTensor_GPU,
elementsInOutputSubtensor);
cudaFree(channelsInFilter_GPU);
cudaFree(widthOfFilter_GPU);
cudaFree(heightOfOutputTensor_GPU);
cudaFree(widthOfOutputTensor_GPU);
cudaFree(elementsInChannelOfOutputTensor_GPU);
cudaFree(channelsInFilter_times_elementsInChannelOfOutputTensor_GPU);
cudaFree(elementsInCrossSectionOfFilter_times_elementsInChannelOfOutputTensor_GPU);
cudaFree(inputTensor_GPU);
cudaFree(horizontalFilterStride_GPU);
cudaFree(channelsInImage_GPU);
cudaFree(verticalFilterStride_GPU);
cudaFree(elementsInCrossSectionOfImage_GPU);
cudaFree(elementsInImage_GPU);
cudaFree(filterTensor_GPU);
// --------------------------------------------------
// Make sure that convolutionOutputTensor is correct.
// --------------------------------------------------
float* convolutionOutputTensor_test = new float[6 * 3 * 19 * 19 * 4];
cudaMemcpy(
convolutionOutputTensor_test,
convolutionOutputTensor,
6 * 3 * 19 * 19 * 4 * sizeof(float),
cudaMemcpyDeviceToHost);
printf("convolutionOutputTensor_test: {");
for (int i = 0; i < 18; ++i) {
printf("%f, ", convolutionOutputTensor_test[i]);
}
printf("...}\n");
delete[] convolutionOutputTensor_test;
cudaFree(convolutionOutputTensor);
return 0;
}
CalculateConvolutionOutputTensor__im2col.h
void CalculateConvolutionOutputTensor__im2col(
float* convolutionOutputTensor_child,
int elementsInFilter_child,
int elementsInChannelOfOutputTensor_host_child,
int imagesInSubdivision_child,
int* channelsInFilter,
int* widthOfFilter,
int* heightOfOutputTensor,
int* widthOfOutputTensor,
int* elementsInChannelOfOutputTensor_GPU_child,
int* channelsInFilter_times_elementsInChannelOfOutputTensor,
int* elementsInCrossSectionOfFilter_times_elementsInChannelOfOutputTensor,
float* inputTensor_child,
int* horizontalFilterStride,
int* channelsInImage,
int* verticalFilterStride,
int* elementsInCrossSectionOfImage,
int* elementsInImage,
int filters_child,
float* filterTensor,
int elementsInOutputSubtensor_child);
CalculateConvolutionOutputTensor__im2col.cu
// Allow use of __global__.
#include <cuda_runtime.h>
// Allow declaration of cublasHandle.
#include "cublas_v2.h"
// Allow use of blockIdx.x, blockDim.x, and threadIdx.x.
#include <device_launch_parameters.h>
__global__
void im2col(
float* col_child,
int* channelsInFilter_child,
int* widthOfFilter_child,
int* heightOfOutputTensor_child,
int* widthOfOutputTensor_child,
int* elementsInChannelOfOutputTensor_child,
int* channelsInFilter_times_elementsInChannelOfOutputTensor_child,
int* elementsInCrossSectionOfFilter_times_elementsInChannelOfOutputTensor_child,
float* inputTensor_child_child,
int* horizontalFilterStride_child,
int* channelsInImage_child,
int* verticalFilterStride_child,
int* elementsInCrossSectionOfImage_child,
int* image_child,
int* elementsInImage_child);
void CalculateConvolutionOutputTensor__im2col(
float* convolutionOutputTensor_child,
int elementsInFilter_child,
int elementsInChannelOfOutputTensor_host_child,
int imagesInSubdivision_child,
int* channelsInFilter,
int* widthOfFilter,
int* heightOfOutputTensor,
int* widthOfOutputTensor,
int* elementsInChannelOfOutputTensor_GPU_child,
int* channelsInFilter_times_elementsInChannelOfOutputTensor,
int* elementsInCrossSectionOfFilter_times_elementsInChannelOfOutputTensor,
float* inputTensor_child,
int* horizontalFilterStride,
int* channelsInImage,
int* verticalFilterStride,
int* elementsInCrossSectionOfImage,
int* elementsInImage,
int filters_child,
float* filterTensor,
int elementsInOutputSubtensor_child)
{
// -----------------------------------------
// Define and declare parameters for im2col.
// -----------------------------------------
// Define parameters for the execution configuration of im2col.
int threads_per_block_for_im2col = 885;
int blocks_for_im2col =
(elementsInFilter_child + threads_per_block_for_im2col - 1) / threads_per_block_for_im2col;
// Declare col.
float* col;
// The following statement is required to
// prevent automatic casting of a product to an eight-byte integer.
int elementsInFilter_times_elementsInChannelOfOutputTensor =
elementsInFilter_child * elementsInChannelOfOutputTensor_host_child;
cudaMalloc(&col, elementsInFilter_times_elementsInChannelOfOutputTensor * sizeof(float));
// -----------------------------------------------------------------------------
// Define parameters for calculating the matrix product of filterTensor and col.
// -----------------------------------------------------------------------------
// Define a cublasHandle_t object called cublasHandle.
// Declaring cublasHandle requires '#include "cublas_v2.h"'.
// Defining cublasHandle requires adding "cublas.lib" to
// Properties -> Linker -> Input -> Additional Dependencies.
cublasHandle_t cublasHandle;
cublasCreate(&cublasHandle);
// Define parameters for (not) including
// a portion of a third matrix in product_filterTensor_and_col.
float one = 1.0;
float zero = 0.0;
// ------------------------------------------------------------
// For each image in subdivision,
// sculpt image into matrix col.
// Calculate the matrix product of filterTensor and col and
// store the product as a subtensor of convolutionOutputTensor.
// ------------------------------------------------------------
// The following statement is required to
// prevent automatic casting of a product to an eight-byte integer.
int image_times_elementsInOutputSubtensor;
int* image_GPU;
cudaMalloc(&image_GPU, sizeof(int));
for (int image_host = 0; image_host < imagesInSubdivision_child; ++image_host) {
cudaMemcpy(image_GPU, &image_host, sizeof(int), cudaMemcpyHostToDevice);
im2col<<<blocks_for_im2col, threads_per_block_for_im2col>>>
(col,
channelsInFilter,
widthOfFilter,
heightOfOutputTensor,
widthOfOutputTensor,
elementsInChannelOfOutputTensor_GPU_child,
channelsInFilter_times_elementsInChannelOfOutputTensor,
elementsInCrossSectionOfFilter_times_elementsInChannelOfOutputTensor,
inputTensor_child,
horizontalFilterStride,
channelsInImage,
verticalFilterStride,
elementsInCrossSectionOfImage,
image_GPU,
elementsInImage);
cudaDeviceSynchronize();
// The following statement is required to
// prevent automatic casting of a product to an eight-byte integer.
image_times_elementsInOutputSubtensor = image_host * elementsInOutputSubtensor_child;
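// cuBLAS assumes column-major storage: this call computes
// col (elementsInChannelOfOutputTensor x elementsInFilter) times
// filterTensor (elementsInFilter x filters) and stores the product in
// this image's subtensor of convolutionOutputTensor.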
cublasSgemm(
cublasHandle,
CUBLAS_OP_N,
CUBLAS_OP_N,
elementsInChannelOfOutputTensor_host_child,
filters_child,
elementsInFilter_child,
&one,
col,
elementsInChannelOfOutputTensor_host_child,
filterTensor,
elementsInFilter_child,
&zero,
convolutionOutputTensor_child + image_times_elementsInOutputSubtensor,
elementsInChannelOfOutputTensor_host_child);
}
cudaFree(col);
cudaFree(image_GPU);
}
__global__
void im2col(
float* col_child,
int* channelsInFilter_child,
int* widthOfFilter_child,
int* heightOfOutputTensor_child,
int* widthOfOutputTensor_child,
int* elementsInChannelOfOutputTensor_child,
int* channelsInFilter_times_elementsInChannelOfOutputTensor_child,
int* elementsInCrossSectionOfFilter_times_elementsInChannelOfOutputTensor_child,
float* inputTensor_child_child,
int* horizontalFilterStride_child,
int* channelsInImage_child,
int* verticalFilterStride_child,
int* elementsInCrossSectionOfImage_child,
int* image,
int* elementsInImage_child)
{
int index = blockIdx.x * blockDim.x + threadIdx.x;
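// Decompose the flat thread index into filter coordinates:
// channel c_prime, filter column w_prime, and filter row h_prime.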
int c_prime = index % (*channelsInFilter_child);
int temp = (index - c_prime) / (*channelsInFilter_child);
int w_prime = temp % (*widthOfFilter_child);
int h_prime = temp / (*widthOfFilter_child);
for (int h = 0; h < (*heightOfOutputTensor_child); ++h) {
for (int w = 0; w < (*widthOfOutputTensor_child); ++w) {
col_child[
w +
h * (*widthOfOutputTensor_child) +
c_prime * (*elementsInChannelOfOutputTensor_child) +
w_prime * (*channelsInFilter_times_elementsInChannelOfOutputTensor_child) +
h_prime * (*elementsInCrossSectionOfFilter_times_elementsInChannelOfOutputTensor_child)] =
inputTensor_child_child[
c_prime +
(w * (*horizontalFilterStride_child) + w_prime) * (*channelsInImage_child) +
(h * (*verticalFilterStride_child) + h_prime) * (*elementsInCrossSectionOfImage_child) +
(*image) * (*elementsInImage_child)];
}
}
}
After reading through the NVIDIA articles that Robert Crovella provided me, I rewrote my solution "CalculateConvolutionOutputTensor__im2col" to have threads in each block load from contiguous global memory. I used less indexing arithmetic and fewer parameters. I saw a method speed-up of (1 method / 0.445 s) / (1 method / 0.774 s) = 1.7, and an im2col kernel speed-up of (1 kernel / 35.27 ms) / (1 kernel / 128.15 ms) = 3.6. Thanks for pointing me to useful specific reading.
im2col used to take 128.15 ms; now it takes only 32.12 ms. Sgemm takes 6.34 ms now, and probably took about the same then. Their total is 38.46 ms. The pair is run four times, for a total of 153.84 ms. I wonder how to speed up im2col further, and how to reduce the 274.16 ms of "overhead".
To sculpt an image into matrix col, I had the (3*590/2) threads in each of (2*590*19*19) blocks transfer half cross sections of a filter-shaped portion of an image sequentially to col. I believe that each thread loaded from global memory physically adjacent to the memory accessed by the previous thread, and that each thread stored to global memory physically adjacent to the memory stored to by the previous thread. I did notice that 11 threads in the last warp in each block went unused.
I think I might take th31 up on their suggestion and move this optimization thread to Code Review.
Nsight Compute profile of im2col with coalesced global memory loads and stores
main.cu
// Allow use of cudaMalloc.
#include <cuda_runtime.h>
// Allow use of printf.
#include <cstdio>
// Allow use of sqrt.
#include <cmath>
// Allow use of time(NULL) as a seed.
#include <ctime>
// Allow construction of a default_random_engine.
#include <random>
// Allow use of CalculateConvolutionOutputTensor__im2col.
#include "CalculateConvolutionOutputTensor__im2col.h"
int main()
{
// --------------------------------------------------------------------------
// Declare and define parameters of CalculateConvolutionOutputTensor__im2col.
// --------------------------------------------------------------------------
float* convolutionOutputTensor;
cudaMalloc(&convolutionOutputTensor, 6 * 3 * 19 * 19 * 4 * sizeof(float));
int elementsInFilter = 3 * 590 * 590;
int elementsInChannelOfOutputTensor = 19 * 19;
int imagesInSubdivision = 4;
int elementsInInputTensor = 3 * 608 * 608 * 4;
float* inputTensor_host = new float[elementsInInputTensor];
for (int i = 0; i < elementsInInputTensor; ++i) {
inputTensor_host[i] = ((float)(i % 255)) / 255.0;
}
float* inputTensor_GPU;
cudaMalloc(&inputTensor_GPU, elementsInInputTensor * sizeof(float));
cudaMemcpy(
inputTensor_GPU,
inputTensor_host,
elementsInInputTensor * sizeof(float),
cudaMemcpyHostToDevice);
delete[] inputTensor_host;
int heightOfFilter_host = 590;
int* heightOfFilter_GPU;
cudaMalloc(&heightOfFilter_GPU, sizeof(int));
cudaMemcpy(heightOfFilter_GPU, &heightOfFilter_host, sizeof(int), cudaMemcpyHostToDevice);
int channelsInImage_host = 3;
int* channelsInImage_GPU;
cudaMalloc(&channelsInImage_GPU, sizeof(int));
cudaMemcpy(channelsInImage_GPU, &channelsInImage_host, sizeof(int), cudaMemcpyHostToDevice);
int widthOfImage_host = 608;
int* widthOfImage_GPU;
cudaMalloc(&widthOfImage_GPU, sizeof(int));
cudaMemcpy(widthOfImage_GPU, &widthOfImage_host, sizeof(int), cudaMemcpyHostToDevice);
int widthOfOutputTensor_host = 19;
int* widthOfOutputTensor_GPU;
cudaMalloc(&widthOfOutputTensor_GPU, sizeof(int));
cudaMemcpy(widthOfOutputTensor_GPU, &widthOfOutputTensor_host, sizeof(int), cudaMemcpyHostToDevice);
int heightOfImage_host = 608;
int* heightOfImage_GPU;
cudaMalloc(&heightOfImage_GPU, sizeof(int));
cudaMemcpy(heightOfImage_GPU, &heightOfImage_host, sizeof(int), cudaMemcpyHostToDevice);
int filters = 6 * 3;
int elementsInFilterTensor = 6 * 3 * 3 * 590 * 590;
float* filterTensor_host = new float[elementsInFilterTensor];
std::default_random_engine randomNumberGenerator(time(NULL));
std::normal_distribution<float> normalDistribution(0.0, 1.0);
for (int i = 0; i < elementsInFilterTensor; ++i) {
filterTensor_host[i] = normalDistribution(randomNumberGenerator) / sqrt((float)elementsInFilterTensor);
}
float* filterTensor_GPU;
cudaMalloc(&filterTensor_GPU, elementsInFilterTensor * sizeof(float));
cudaMemcpy(
filterTensor_GPU,
filterTensor_host,
elementsInFilterTensor * sizeof(float),
cudaMemcpyHostToDevice);
delete[] filterTensor_host;
int elementsInOutputSubtensor = 6 * 3 * 19 * 19;
// -------------------------------------------------
// Execute CalculateConvolutionOutputTensor__im2col.
// -------------------------------------------------
CalculateConvolutionOutputTensor__im2col(
convolutionOutputTensor,
elementsInFilter,
elementsInChannelOfOutputTensor,
imagesInSubdivision,
inputTensor_GPU,
heightOfFilter_GPU,
channelsInImage_GPU,
widthOfImage_GPU,
widthOfOutputTensor_GPU,
heightOfImage_GPU,
filters,
filterTensor_GPU,
elementsInOutputSubtensor);
cudaFree(inputTensor_GPU);
cudaFree(heightOfFilter_GPU);
cudaFree(channelsInImage_GPU);
cudaFree(widthOfImage_GPU);
cudaFree(widthOfOutputTensor_GPU);
cudaFree(heightOfImage_GPU);
cudaFree(filterTensor_GPU);
// --------------------------------------------------
// Make sure that convolutionOutputTensor is correct.
// --------------------------------------------------
float* convolutionOutputTensor_test = new float[6 * 3 * 19 * 19 * 4];
cudaMemcpy(
convolutionOutputTensor_test,
convolutionOutputTensor,
6 * 3 * 19 * 19 * 4 * sizeof(float),
cudaMemcpyDeviceToHost);
printf("convolutionOutputTensor_test: {");
for (int i = 0; i < 18; ++i) {
printf("%f, ", convolutionOutputTensor_test[i]);
}
printf("...}\n");
delete[] convolutionOutputTensor_test;
cudaFree(convolutionOutputTensor);
return 0;
}
CalculateConvolutionOutputTensor__im2col.h
void CalculateConvolutionOutputTensor__im2col(
float* convolutionOutputTensor_child,
int elementsInFilter_child,
int elementsInChannelOfOutputTensor_host_child,
int imagesInSubdivision_child,
float* inputTensor_child,
int* heightOfFilter,
int* channelsInImage,
int* widthOfImage,
int* widthOfOutputTensor,
int* heightOfImage,
int filters_child,
float* filterTensor,
int elementsInOutputSubtensor_child);
CalculateConvolutionOutputTensor__im2col.cu
// Allow use of __global__.
#include <cuda_runtime.h>
// Allow declaration of cublasHandle.
#include "cublas_v2.h"
// Allow use of blockIdx.x, blockDim.x, and threadIdx.x.
#include <device_launch_parameters.h>
__global__
void im2col(
float* col_child,
float* inputTensor_child_child,
int* heightOfFilter_child,
int* channelsInImage_child,
int* widthOfImage_child,
int* widthOfOutputTensor_child,
int* image,
int* heightOfImage_child);
void CalculateConvolutionOutputTensor__im2col(
float* convolutionOutputTensor_child,
int elementsInFilter_child,
int elementsInChannelOfOutputTensor_host_child,
int imagesInSubdivision_child,
float* inputTensor_child,
int* heightOfFilter,
int* channelsInImage,
int* widthOfImage,
int* widthOfOutputTensor,
int* heightOfImage,
int filters_child,
float* filterTensor,
int elementsInOutputSubtensor_child)
{
// -----------------------------------------
// Define and declare parameters for im2col.
// -----------------------------------------
// Define parameters for the execution configuration of im2col.
int threads_per_block_for_im2col = 3 * 590 / 2;
int blocks_for_im2col = 2 * 590 * 19 * 19;
// Declare col.
float* col;
// The following statement is required to
// prevent automatic casting of a product to an eight-byte integer.
int elementsInFilter_times_elementsInChannelOfOutputTensor =
elementsInFilter_child * elementsInChannelOfOutputTensor_host_child;
cudaMalloc(&col, elementsInFilter_times_elementsInChannelOfOutputTensor * sizeof(float));
// -----------------------------------------------------------------------------
// Define parameters for calculating the matrix product of filterTensor and col.
// -----------------------------------------------------------------------------
// Define a cublasHandle_t object called cublasHandle.
// Declaring cublasHandle requires '#include "cublas_v2.h"'.
// Defining cublasHandle requires adding "cublas.lib" to
// Properties -> Linker -> Input -> Additional Dependencies.
cublasHandle_t cublasHandle;
cublasCreate(&cublasHandle);
// Define parameters for (not) including
// a portion of a third matrix in product_filterTensor_and_col.
float one = 1.0;
float zero = 0.0;
// ------------------------------------------------------------
// For each image in subdivision,
// sculpt image into matrix col.
// Calculate the matrix product of filterTensor and col and
// store the product as a subtensor of convolutionOutputTensor.
// ------------------------------------------------------------
// The following statement is required to
// prevent automatic casting of a product to an eight-byte integer.
int image_times_elementsInOutputSubtensor;
int* image_GPU;
cudaMalloc(&image_GPU, sizeof(int));
for (int image_host = 0; image_host < imagesInSubdivision_child; ++image_host) {
cudaMemcpy(image_GPU, &image_host, sizeof(int), cudaMemcpyHostToDevice);
im2col
<<<blocks_for_im2col,
threads_per_block_for_im2col>>>
(col,
inputTensor_child,
heightOfFilter,
channelsInImage,
widthOfImage,
widthOfOutputTensor,
image_GPU,
heightOfImage);
cudaDeviceSynchronize();
// The following statement is required to
// prevent automatic casting of a product to an eight-byte integer.
image_times_elementsInOutputSubtensor = image_host * elementsInOutputSubtensor_child;
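// Operands are swapped relative to the first version: this computes
// filterTensor (filters x elementsInFilter) times col
// (elementsInFilter x elementsInChannelOfOutputTensor), so the output
// subtensor is stored with the filter index varying fastest.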
cublasSgemm(
cublasHandle,
CUBLAS_OP_N,
CUBLAS_OP_N,
filters_child,
elementsInChannelOfOutputTensor_host_child,
elementsInFilter_child,
&one,
filterTensor,
filters_child,
col,
elementsInFilter_child,
&zero,
convolutionOutputTensor_child + image_times_elementsInOutputSubtensor,
filters_child);
}
cudaFree(col);
cudaFree(image_GPU);
}
__global__
void im2col(
float* col_child,
float* inputTensor_child_child,
int* heightOfFilter_child,
int* channelsInImage_child,
int* widthOfImage_child,
int* widthOfOutputTensor_child,
int* image,
int* heightOfImage_child)
{
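// Each block copies half of one cross-section row of a filter-shaped
// window for one output position; consecutive threads read and write
// consecutive global-memory addresses.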
col_child[blockIdx.x * blockDim.x + threadIdx.x] =
inputTensor_child_child[
threadIdx.x +
(blockIdx.x % 2) * blockDim.x +
((blockIdx.x % (2 * (*heightOfFilter_child))) / 2) * (*channelsInImage_child) * (*widthOfImage_child) +
(blockIdx.x / (2 * (*heightOfFilter_child))) * (*channelsInImage_child) +
(blockIdx.x / (2 * (*heightOfFilter_child) * (*widthOfOutputTensor_child))) * (*channelsInImage_child) * (*widthOfImage_child) +
(*image) * (*channelsInImage_child) * (*widthOfImage_child) * (*heightOfImage_child)];
}
Related
#define TS 32
int num_devices = 0;
__global__ void shared_kernel(float* A, float* B, float* C, int M, int N, int K) {
int global_col = blockDim.x * blockIdx.x + threadIdx.x;
int global_row = blockDim.y * blockIdx.y + threadIdx.y;
int local_col = threadIdx.x;
int local_row = threadIdx.y;
if (global_row >= M || global_col >= N) return;
__shared__ float Asub[TS][TS];
__shared__ float Bsub[TS][TS];
const int num_tiles = K / TS;
float acc = 0;
for(int t = 0; t < num_tiles; t++){
const int t_row = TS * t + local_row;
const int t_col = TS * t + local_col;
Asub[local_row][local_col] = A[global_row * K + t_col];
Bsub[local_row][local_col] = B[t_row * N + global_col];
__syncthreads();
printf("[DEBUG] first sync threads, global_row: %d, global_col: %d\n", global_row, global_col);
for (int k = 0; k < K; ++k) {
acc += Asub[local_row][k] * Bsub[k][local_col];
}
__syncthreads();
printf("[DEBUG] second sync threads, global_row: %d, global_col: %d\n", global_row, global_col);
}
C[global_row * N + global_col] = acc;
}
static float *a_d, *b_d, *c_d;
void mat_mul(float *A, float *B, float *C, int M, int N, int K) {
cudaMemcpy(a_d, A, M * K * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(b_d, B, K * N * sizeof(float), cudaMemcpyHostToDevice);
dim3 blockDim(TS, TS);
dim3 gridDim(M/TS, N/TS);
shared_kernel<<<gridDim, blockDim>>>(a_d, b_d, c_d, M, N, K);
cudaMemcpy(C, c_d, M * N * sizeof(float), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
}
void mat_mul_init(float *A, float *B, float *C, int M, int N, int K) {
cudaGetDeviceCount(&num_devices);
cudaSetDevice(0);
cudaMalloc(&a_d, M * K * sizeof(float));
cudaMalloc(&b_d, K * N * sizeof(float));
cudaMalloc(&c_d, M * N * sizeof(float));
}
The above example is a matrix multiplication using shared memory.
I ran the above kernel with dim3 blockDim(TS, TS), dim3 gridDim(M/TS, N/TS), and M = N = K = 128.
I checked that float* C contains only zeros after launching the kernel. Also, I found that only a few values of global_row are printed (from 37 to 81) after the first __syncthreads(), and there is no printf DEBUG message after the second __syncthreads().
I suspect that __syncthreads() is causing the problem, but I don't know how to fix it. My code is almost the same as matrix multiplication code on other sites.
Would you give me some hint how to solve this?
Any time you are having trouble with a CUDA code, I recommend using proper CUDA error checking and running your code with compute-sanitizer or cuda-memcheck. For this type of analysis, it will be easier if you don't use in-kernel printf.
If you did that, you would see output like this:
========= Invalid __shared__ read of size 4
========= at 0x000002f0 in shared_kernel(float*, float*, float*, int, int, int)
========= by thread (0,2,0) in block (0,1,0)
========= Address 0x00002000 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
... (and more output)
So from that, we can see that your kernel is making invalid __shared__ read operations. Where is that happening in your kernel? You could use the methodology here to identify a specific line of code. However, this is a fairly simple kernel, and there is only one line that reads from shared memory; it is here:
for (int k = 0; k < K; ++k) {
acc += Asub[local_row][k] * Bsub[k][local_col]; // shared reads here
A quick inspection will show that if you let this loop iterate over a range of K=128, then you will index out of bounds here:
for (int k = 0; k < K; ++k) {
acc += Asub[local_row][k] * Bsub[k][local_col];
^ ^
when k is greater than 31, because this would exceed your shared array dimensions:
#define TS 32
__shared__ float Asub[TS][TS];
__shared__ float Bsub[TS][TS];
I'm not going to bother writing a fixed kernel/code for you, because as you've already pointed out, this topic is covered in many other places, and a canonical example is already provided in the programming guide.
FWIW, if I change your for-loop to this:
for (int k = 0; k < TS; ++k) {
then the run-time errors go away for me. cuda-memcheck reports no errors.
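For reference, a minimal sketch of the kind of error checking recommended above (one common pattern among several):
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Wrap every CUDA runtime call, e.g. checkCuda(cudaMalloc(&ptr, bytes)),
// and call checkCuda(cudaGetLastError()) after each kernel launch.
#define checkCuda(call)                                               \
    do {                                                              \
        cudaError_t err = (call);                                     \
        if (err != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error %s at %s:%d\n",               \
                    cudaGetErrorString(err), __FILE__, __LINE__);     \
            exit(1);                                                  \
        }                                                             \
    } while (0)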
Hi, I have a problem with timing: I am getting a longer response time on the GPU than on the CPU.
The algorithm used is a matrix multiplication algorithm,
using the following functions:
// Start timers
cudaEvent_t timer1, timer2;
cudaEventCreate(&timer1);
cudaEventCreate(&timer2);
cudaEventRecord(timer1, 0);
cudaEventSynchronize(timer1);
// Stop timers
cudaEventRecord(timer2, 0);
cudaEventSynchronize(timer1);
cudaEventSynchronize(timer2);
float elapsed;
cudaEventElapsedTime(&elapsed, timer1, timer2);
cudaDeviceReset();
return elapsed;
here is my code on GPU:
float Mult_gpu(float* hostPtr, float* hostPtr2, float* hostPtr3, int size, int Ncols, int Nrows, int n) {
size_t pitch;
check("Creating timers");
cudaEvent_t timer1, timer2;
cudaEventCreate(&timer1);
cudaEventCreate(&timer2);
cudaEventRecord(timer1, 0);
cudaEventSynchronize(timer1);
/******************************************/
/***Configuration of the matrices on GPU***/
/******************************************/
float* devPtr;
cudaMallocPitch(&devPtr, &pitch, n * sizeof(float), Nrows);
cudaMemcpy2D(devPtr, pitch, hostPtr, n * sizeof(float), n * sizeof(float), Nrows, cudaMemcpyHostToDevice);
float* devPtr2;
cudaMallocPitch(&devPtr2, &pitch, Ncols * sizeof(float), n);
cudaMemcpy2D(devPtr2, pitch, hostPtr2, Ncols * sizeof(float), Ncols * sizeof(float), n, cudaMemcpyHostToDevice);
float* devPtr3;
cudaMallocPitch(&devPtr3, &pitch, Ncols * sizeof(float), Nrows);
//dim3 gridSize(iDivUp(Ncols3, BLOCKSIZE_x), iDivUp(Nrows3, BLOCKSIZE_y));
//dim3 blockSize(BLOCKSIZE_y, BLOCKSIZE_x);
dim3 block(32, 32); //threads per block
dim3 grid((size / block.x) + 1, (size / block.y) + 1); //number of blocks
/**************************/
/******Kernel launch*******/
/**************************/
Mult << <grid, block >> > (devPtr, devPtr2, devPtr3, pitch, Ncols, Nrows, n);
cudaDeviceSynchronize();
/***********************************/
/***Copy from devPtr3 to hostPtr3***/
/***********************************/
cudaMemcpy2D(hostPtr3, Ncols * sizeof(float), devPtr3, pitch, Ncols * sizeof(float), Nrows, cudaMemcpyDeviceToHost);
//cudaMemcpy(hostPtr3, devPtr3, size * sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(devPtr);
cudaFree(devPtr2);
cudaFree(devPtr3);
// Stop timers
cudaEventRecord(timer2, 0);
cudaEventSynchronize(timer1);
cudaEventSynchronize(timer2);
float elapsed;
cudaEventElapsedTime(&elapsed, timer1, timer2);
cudaDeviceReset();
return elapsed;
}
and here is my code on CPU
float Mult_cpu(float* hostPtrA, float* HostPtrB, float* hostPtrC, int Ncols, int Nrows, int n)
{
cudaEvent_t timer1, timer2;
cudaEventCreate(&timer1);
cudaEventCreate(&timer2);
cudaEventRecord(timer1, 0);
cudaEventSynchronize(timer1);
for (int i = 0; i < Nrows; ++i) {
for (int j = 0; j < Ncols; ++j) {
float suma = 0;
for (int k = 0; k < n; ++k) {
suma += hostPtrA[i * n + k] * HostPtrB[k * Ncols + j];
}
hostPtrC[i * Ncols + j] = suma;
}
}
// Stop timers
cudaEventRecord(timer2, 0);
cudaEventSynchronize(timer1);
cudaEventSynchronize(timer2);
float elapsed;
cudaEventElapsedTime(&elapsed, timer1, timer2);
return elapsed;
}
When I use a 500x500 matrix, or any other size, the CPU is faster than the GPU, and I don't understand why. I don't know if the problem is in my kernel or in the CUDA functions I'm using.
my kernel code
__global__ void Mult(float* devPtrA, float* devPtrB, float* devPtrC, size_t pitch, int Ncols, int Nrows, int n)
{
float temp = 0.0f;
int r = blockDim.y * blockIdx.y + threadIdx.y; //loop equivalent: for (int r = 0; r <= rows - 1; r += 1)
int c = blockDim.x * blockIdx.x + threadIdx.x; //loop equivalent: for (int c = 0; c <= cols - 1; c += 1)
if ((r < Ncols) && (c < Nrows)) {
for (int c2 = 0; c2 < n; c2++) {
float* vertical = (float*)((char*)devPtrA + r * pitch);
float element1 = vertical[c2];
float* horizontal = (float*)((char*)devPtrB + c2 * pitch);
float element2 = horizontal[c];
temp += element1 * element2;
}
//printf("\nla fila es: %d la columna es: %d el valor es: %8.4f\n\n", r, c, temp);
float* vertical2 = (float*)((char*)devPtrC + r * pitch);
vertical2[c] = temp;
}
}
You should read up on the SIMT architecture, the CUDA execution model, and branch divergence. Analyze your CUDA kernel's performance with a profiler. I suspect that the condition if ((r < Ncols) && (c < Nrows)) in your kernel causes threads in each warp to diverge and hence reduces performance. Also, pitch affects the global memory access pattern in your code, which is another factor in the performance of CUDA kernels. Some other excellent optimization tips can be found here.
cudaMalloc is really slow. If you know the size of your matrices beforehand, do the allocation once at the beginning of your program.
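A minimal sketch of that suggestion (hypothetical names): pay the allocation cost once at startup and reuse the buffers, so the timed region contains only the copies and the kernel.
#include <cuda_runtime.h>

// Allocated once at startup and reused across calls to Mult_gpu.
static float *devA, *devB, *devC;

void init_buffers(int maxElems) {
    // One-time cost, paid here instead of inside every timed call.
    cudaMalloc(&devA, maxElems * sizeof(float));
    cudaMalloc(&devB, maxElems * sizeof(float));
    cudaMalloc(&devC, maxElems * sizeof(float));
}

void free_buffers() {
    cudaFree(devA);
    cudaFree(devB);
    cudaFree(devC);
}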
I am trying to simply increment a few matrix values in parallel in CUDA and then copy them back to main memory. However, when I print them out after the kernel returns, the values are unchanged. I have even tried running the program with just 1 thread, but have had no luck. Any help would be greatly appreciated.
My code:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>
#include <cuda.h>
#define BLOCK_SIZE 1024
#define MAX_N 100000000
#define MAX_THREADS 1024
int num_threads;
int count; // Count of threads that have updated their partition
int size;
//int increment; // VS
int * inc2;
//int my_start;
//Host data
int * thread_ids;
//nvcc -arch=sm_20 -o nbody.exe nbody.cu (compilation)
__global__ void pcyc_red(float * a, float * b, float * c, float * D, float * X,
float * a2, float * b2, float * c2, float * D2,
int * inc2_dev, int * size_dev, int * num_threads_dev){
//__threadfence();
int thread_id = threadIdx.x + (blockIdx.x * blockDim.x);
float k1;
float k2;
int i;
int start = 0;
//int end = size_dev-1;
//int inc2_dev = inc2_dev1[0];
//int inc_dev = *inc_dev1;
//int size_dev = size_dev1[0];
int nthreads = num_threads_dev[0];
//Thread work assignment
int chunk_size = size_dev[0]/nthreads;
int my_start = thread_id*(chunk_size);
int my_end = start + ((thread_id + 1)*chunk_size - 1);
//__threadfence();
__syncthreads();
//Forward Reduction
for(i = my_start; i <= my_end; ++i){
a[i] += 1.0f;
b[i] += 1.0f;
c[i] += 1.0f;
D[i] += 1.0f;
X[i] += 1.0f;
}
__threadfence();
//__syncthreads();
}//Device Function
float* init_vector(int size){
float* output;
output = (float*) calloc(size, sizeof(float));
int i;
for(i = 0; i < size; ++i){
output[i] = 2.0;
}
return output;
}
float* init_vector_ac(int s){
//s will be used for size-1 not to be confused for size.
float* output;
output = (float*) calloc(s, sizeof(float));
int i;
for(i = 0; i < s; ++i){
output[i] = -1.0;
}
return output;
}
// Main program
int main(int argc, char *argv[]) {
//num_threads -> atoi(argv[argc-1]);
//struct timeval start, stop;
float total_time;
int i;
///Host structures
float* a;
float* b;
float* c;
float* D;
float* X;
//increment = 2; // VS
inc2 = (int*) malloc(sizeof(int));
inc2[0] = 1;
//size = (int*) malloc(sizeof(int));
//num_threads = (int*) malloc(sizeof(int));
//my_start = 0;
//wait_flag = false;
///Device Data
//SYSTEM * sys_dev;
float * a_dev;
float * b_dev;
float * c_dev;
float * D_dev;
float * X_dev;
float * a2_dev;
float * b2_dev;
float * c2_dev;
float * D2_dev;
//float * X2_dev;
//int * inc_dev;
int * inc2_dev;
//int * mstart_dev;
int * size_dev;
int * num_threads_dev;
int result_var;
//int final_inc2;
cudaEvent_t start, stop; // GPU timing variables
//struct timeval cpu_start, cpu_stop; // CPU timing variables
// float time_array[10];
// Timing initializations
cudaEventCreate(&start);
cudaEventCreate(&stop);
if (argc != 3)
{
printf("Use: <executable_name> <size> <num_threads>\n");
exit(0);
}
if ((size = atoi(argv[argc-2])) > MAX_N)
{
printf("Maximum number of nodes allowed: %d\n", MAX_N);
exit(0);
};
if ((num_threads = atoi(argv[argc-1])) > MAX_THREADS)
{
printf("Maximum number of threads allowed: %d.\n", MAX_THREADS);
exit(0);
};
int size_array = (size) * sizeof(float);
int size_array2 = (size - 1) * sizeof(float);
// Initialize host tridiagonal matrix
a = init_vector_ac(size-1); // a[i] = -1.0
b = init_vector(size); // b[i] = 2.0
c = init_vector_ac(size-1); // c[i] = -1.0
D = init_vector(size); // D[i] = 2.0
X = init_vector(size); // X[i] = 2.0
//xs = init_vector_err(size);
// Shift elements of a by 1
for(i = size-1; i > 0; i--) a[i] = a[i-1];
a[0] = 0.0;
thread_ids = (int*) calloc(num_threads, sizeof(int));
count = 0;
for(i = 0; i < num_threads; ++i){
thread_ids[i] = i;
}
//Cuda Operation
cudaEventRecord( start, 0);
cudaMalloc((void **) &a_dev, size);
cudaMalloc((void **) &b_dev, size);
cudaMalloc((void **) &c_dev, size);
cudaMalloc((void **) &D_dev, size);
cudaMalloc((void **) &X_dev, size);
cudaMalloc((void **) &a2_dev, size);
cudaMalloc((void **) &b2_dev, size);
cudaMalloc((void **) &c2_dev, size);
cudaMalloc((void **) &D2_dev, size);
//cudaMalloc((void**)&inc_dev, sizeof(int));
cudaMalloc((void**)&inc2_dev, sizeof(int));
//cudaMalloc((void**)&mstart_dev, sizeof(int));
cudaMalloc((void**)&size_dev, sizeof(int));
cudaMalloc((void**)&num_threads_dev, sizeof(int));
cudaMemcpy(a_dev, a, size_array2, cudaMemcpyHostToDevice);
cudaMemcpy(b_dev, b, size_array, cudaMemcpyHostToDevice);
cudaMemcpy(c_dev, c, size_array2, cudaMemcpyHostToDevice);
cudaMemcpy(D_dev, D, size_array, cudaMemcpyHostToDevice);
cudaMemcpy(X_dev, X, size_array, cudaMemcpyHostToDevice);
cudaMemcpy(a2_dev, a, size_array2, cudaMemcpyHostToDevice);
cudaMemcpy(b2_dev, b, size_array, cudaMemcpyHostToDevice);
cudaMemcpy(c2_dev, c, size_array2, cudaMemcpyHostToDevice);
cudaMemcpy(D2_dev, D, size_array, cudaMemcpyHostToDevice);
//cudaMemcpy(inc_dev, &increment, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(inc2_dev, inc2, sizeof(int), cudaMemcpyHostToDevice);
//cudaMemcpy(mstart_dev, &my_start, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(size_dev, &size, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(num_threads_dev, &num_threads, sizeof(int), cudaMemcpyHostToDevice);
cudaDeviceSynchronize();
pcyc_red<<<1, num_threads>>>(a_dev, b_dev, c_dev, D_dev, X_dev,
a2_dev, b2_dev, c2_dev, D2_dev,
inc2_dev, size_dev, num_threads_dev);
cudaDeviceSynchronize();
cudaMemcpy(X, X_dev, size_array, cudaMemcpyDeviceToHost);
cudaMemcpy(a, a_dev, size_array, cudaMemcpyDeviceToHost);
cudaMemcpy(b, b_dev, size_array, cudaMemcpyDeviceToHost);
cudaMemcpy(c, c_dev, size_array, cudaMemcpyDeviceToHost);
cudaMemcpy(D, D_dev, size_array, cudaMemcpyDeviceToHost);
cudaMemcpy(inc2, inc2_dev, sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(&result_var, num_threads_dev, sizeof(int), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&total_time, start, stop);
printf("Final Var: %d\n\n", inc2[0]);
printf("Num Threads Var: %d\n\n", result_var);
for(i = 0; i < size; ++i){
printf("a=%8.4f \n", a[i]);
printf("b=%8.4f \n", b[i]);
printf("c=%8.4f \n", c[i]);
printf("D=%8.4f \n", D[i]);
printf("X=%8.4f \n", X[i]);
}
printf("Threads = %d, matrix_size = %d, time = %f\n",
num_threads, size, total_time);
cudaFree(a_dev);
cudaFree(b_dev);
cudaFree(c_dev);
cudaFree(D_dev);
cudaFree(X_dev);
//cudaFree(inc_dev);
cudaFree(inc2_dev);
//cudaFree(mstart_dev);
//cudaFree(size_dev);
//cudaFree(num_threads_dev);
}//end of main
Add proper CUDA error checking to your code.
One problem I can see is that your allocation sizes are not matched to your array sizes. To pick just one example:
int size_array = (size) * sizeof(float);
...
cudaMalloc((void **) &b_dev, size); // size should probably be size_array here
... ^^^^
cudaMemcpy(b_dev, b, size_array, cudaMemcpyHostToDevice); // this won't work, will throw error
^^^^^^^^^^
The above is certainly an error, and there are several of that type in your code. You may also have a machine configuration issue (CUDA not properly installed, etc.) which the error checking will also indicate.
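Concretely, the example picked out above would need the allocation size to match the copy, along these lines:
int size_array = (size) * sizeof(float);
...
cudaMalloc((void **) &b_dev, size_array); // allocation now matches the copy below
...
cudaMemcpy(b_dev, b, size_array, cudaMemcpyHostToDevice);
The same correction applies to each of the other cudaMalloc calls that pass size (an element count) instead of a byte count.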
I'm trying to write matrix multiplication code in CUDA, which is pretty similar to NVIDIA's CUDA programming guide, but it is not working. It is supposed to compute C = alpha*A*B + beta*C, but for every A and B, C remains unchanged.
__global__ void MatMulKernel(int m,int n,int k,double *A,double *B,double *C,double alpha,double beta)
{
double Ctemp = 0.0;
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
int ind;
for (ind = 0; ind < k; ++ind)
{
Ctemp += A[row+ind*m]*B[ind+col*k];
}
C[row+m*col] = alpha*Ctemp+beta*C[row+m*col];
//C[row+m*col] = Ctemp;
__syncthreads();
}
extern "C" void
local_mm_cuda (const int m, const int n, const int k, const double alpha,
const double *A, const int lda, const double *B, const int ldb,
const double beta, double *C, const int ldc)
{
int row, col;
/* Verify the sizes of lda, ldb, and ldc */
assert (lda >= m);
assert (ldb >= k);
assert (ldc >= m);
// allocating memory for device array
double *dA,*dB,*dC;
size_t sizeA = sizeof(double)*m*k;
size_t sizeB = sizeof(double)*n*k;
size_t sizeC = sizeof(double)*m*n;
cudaMalloc((void**)&dA,sizeA);
cudaMalloc((void**)&dB,sizeB);
cudaMalloc((void**)&dC,sizeC);
cudaMemcpy(dA, A, sizeA, cudaMemcpyHostToDevice);
cudaMemcpy(dB, B, sizeB, cudaMemcpyHostToDevice);
cudaMemcpy(dC, C, sizeC, cudaMemcpyHostToDevice);
// calling matrix multiplication kernal
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid( n/dimBlock.x, m/dimBlock.y);
MatMulKernel<<<dimGrid, dimBlock>>>(m,n,k,dA,dB,dC,alpha,beta);
cudaDeviceSynchronize();
// saving C calculated back in C
cudaMemcpy(dC,C, sizeC,cudaMemcpyDeviceToHost);
cudaFree(dA);
cudaFree(dB);
cudaFree(dC);
}
Try to modify
"dim3 dimGrid( n/dimBlock.x, m/dimBlock.y);"
to
"dim3 dimGrid( (n+dimBlock.x-1)/dimBlock.x, (m+dimBlock.y-1)/dimBlock.y); "
I have two programs. The only difference is that one uses constant memory to store the input while the other uses global memory. I want to know why the global memory one is faster than the constant memory one. They both compute the dot product of 2 matrices.
#include<cuda_runtime.h>
#include<cuda.h>
#include<stdio.h>
#include<stdlib.h>
#define intMin(a,b) ((a<b)?a:b)
//Threads per block
#define TPB 128
//blocks per grid
#define BPG intMin(128, ((n+TPB-1)/TPB))
const int n = 4;
__constant__ float deva[n],devb[n];
__global__ void addVal( float *c){
int tid = blockIdx.x * blockDim.x + threadIdx.x;
//Using shared memory to temporary store results
__shared__ float cache[TPB];
float temp = 0;
while(tid < n){
temp += deva[tid] * devb[tid];
tid += gridDim.x * blockDim.x;
}
cache[threadIdx.x] = temp;
__syncthreads();
int i = blockDim.x/2;
while( i !=0){
if(threadIdx.x < i){
cache[threadIdx.x] = cache[threadIdx.x] +cache[threadIdx.x + i] ;
}
__syncthreads();
i = i/2;
}
if(threadIdx.x == 1){
c[blockIdx.x ] = cache[0];
}
}
int main(){
float a[n] , b[n] , c[BPG];
//float *deva, *devb, *devc;
float *devc;
int i;
//Filling with random values to test
for( i =0; i< n; i++){
a[i] = i;
b[i] = i*2;
}
//cudaMalloc((void**)&deva, n * sizeof(float));
//cudaMalloc((void**)&devb, n * sizeof(float));
cudaMalloc((void**)&devc, BPG * sizeof(float));
//cudaMemcpy(deva, a, n *sizeof(float), cudaMemcpyHostToDevice);
//cudaMemcpy(devb, b, n *sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(deva, a, n * sizeof(float));
cudaMemcpyToSymbol(devb, b, n * sizeof(float));
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
//Call function to do dot product
addVal<<<BPG, TPB>>>( devc);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float time;
cudaEventElapsedTime(&time,start, stop);
printf("The elapsed time is: %f\n", time);
//copy result back
cudaMemcpy(c, devc, BPG * sizeof(float), cudaMemcpyDeviceToHost);
float sum =0 ;
for ( i = 0 ; i< BPG; i++){
sum+=c[i];
}
//display answer
printf("%f\n",sum);
getchar();
return 0;
}
Below is the global memory version.
#include<cuda_runtime.h>
#include<cuda.h>
#include<stdio.h>
#include<stdlib.h>
#define intMin(a,b) ((a<b)?a:b)
//Threads per block
#define TPB 128
//blocks per grid
#define BPG intMin(128, ((n+TPB-1)/TPB))
const int n = 4;
__global__ void addVal(float *a, float *b, float *c){
int tid = blockIdx.x * blockDim.x + threadIdx.x;
//Using shared memory to temporary store results
__shared__ float cache[TPB];
float temp = 0;
while(tid < n){
temp += a[tid] * b[tid];
tid += gridDim.x * blockDim.x;
}
cache[threadIdx.x] = temp;
__syncthreads();
int i = blockDim.x/2;
while( i !=0){
if(threadIdx.x < i){
cache[threadIdx.x] = cache[threadIdx.x] +cache[threadIdx.x + i] ;
}
__syncthreads();
i = i/2;
}
if(threadIdx.x == 1){
c[blockIdx.x ] = cache[0];
}
}
int main(){
float a[n] , b[n] , c[BPG];
float *deva, *devb, *devc;
int i;
//Filling with random values to test
for( i =0; i< n; i++){
a[i] = i;
b[i] = i*2;
}
printf("Not using constant memory\n");
cudaMalloc((void**)&deva, n * sizeof(float));
cudaMalloc((void**)&devb, n * sizeof(float));
cudaMalloc((void**)&devc, BPG * sizeof(float));
cudaMemcpy(deva, a, n *sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(devb, b, n *sizeof(float), cudaMemcpyHostToDevice);
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
//Call function to do dot product
addVal<<<BPG, TPB>>>(deva, devb, devc);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float time;
cudaEventElapsedTime(&time,start, stop);
printf("The elapsed time is: %f\n", time);
//copy result back
cudaMemcpy(c, devc, BPG * sizeof(float), cudaMemcpyDeviceToHost);
float sum =0 ;
for ( i = 0 ; i< BPG; i++){
sum+=c[i];
}
//display answer
printf("%f\n",sum);
getchar();
return 0;
}
You are not taking advantage of the constant memory.
A single read from constant memory can be broadcast to a half-warp (not your case, as every thread loads from its own tid).
Constant memory is cached (not used in your case, as you only read once from each position in the constant memory array).
As each thread in a half-warp does a single read to different data, the 16 different reads get serialized, taking 16 times the amount of time to place the request.
If they are reading from global memory, the requests are done at the same time, coalesced. That's why your global memory example is better than the constant memory one.
Of course, this conclusion can vary with devices of compute capability 2.x with a L1 and L2 cache.
Regards!
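A minimal sketch of a pattern where constant memory does pay off (illustrative code, not the question's): every thread in the warp reads the same coeff[k] on each iteration, so the read is broadcast and then served from the constant cache.
#include <cuda_runtime.h>

#define TAPS 128

// Filter taps in constant memory: on each loop iteration, every thread
// reads coeff[k] -- the same address across the warp -- so the access
// is broadcast instead of serialized.
__constant__ float coeff[TAPS];

__global__ void applyFilter(const float *in, float *out, int len)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= len) return;
    float acc = 0.0f;
    for (int k = 0; k < TAPS; ++k) {
        acc += coeff[k] * in[(tid + k) % len]; // coeff[k] is uniform across the warp
    }
    out[tid] = acc;
}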