Values of array after cudaMemcpy do not change, any idea why? [duplicate] - cuda

Thank you very much for reading my thread.
I am doing CUDA work, but I keep getting cudaDeviceSynchronize() error code 77: cudaErrorIllegalAddress, and I have no idea why. I searched for both the error code and the function name and, surprisingly, only a few results showed up. Very strange.
I basically sum up all the pixels of images. To give my question as much context as possible, I am showing all my CUDA code here:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "thorcalgpu.h"
#include <stdio.h>
#include "math.h"
#include <vector>
#include <algorithm>
#include <stdlib.h>
#include <stdio.h>
#include <vector>
#include <numeric>
#include <iostream>
using namespace std;
float random_float(void)
{
return static_cast<float>(rand()) / RAND_MAX;
}
__global__ void reduceSum(unsigned short *input,
unsigned long long *per_block_results,
const int n)
{
extern __shared__ unsigned long long sdata[];
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
// load input into __shared__ memory
unsigned short x = 0;
if(i < n)
{
x = input[i];
}
sdata[threadIdx.x] = x;
__syncthreads();
// contiguous range pattern
for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
{
if(threadIdx.x < offset)
{
// add a partial sum upstream to our own
sdata[threadIdx.x] += sdata[threadIdx.x + offset];
}
// wait until all threads in the block have
// updated their partial sums
__syncthreads();
}
// thread 0 writes the final result
if(threadIdx.x == 0)
{
per_block_results[blockIdx.x] = sdata[0];
}
}
// Helper function for using CUDA to add vectors in parallel.
//template <class T>
cudaError_t gpuWrapper(float *mean, int N, vector<string> filelist)
{
int size = N*N;
unsigned long long* dev_sum = 0;
unsigned short* dev_img = 0;
cudaError_t cudaStatus;
const int block_size = 512;
const int num_blocks = (size/block_size) + ((size%block_size) ? 1 : 0);
int L = filelist.size();
// Choose which GPU to run on, change this on a multi-GPU system.
double totalgpuinittime = 0;
StartCounter(7);
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
// Allocate GPU buffers for three vectors (two input, one output) .
cudaStatus = cudaMalloc((void**)&dev_img, size * sizeof(unsigned short));
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_sum, num_blocks*sizeof(unsigned long long));
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
totalgpuinittime = GetCounter(7);
unsigned short* img;
unsigned short* pimg;
unsigned long long* sum = new unsigned long long[num_blocks];
unsigned long long* psum = sum;
cout<<endl;
cout << "gpu looping starts, and in progress ..." << endl;
StartCounter(6);
double totalfileiotime = 0;
double totalh2dcpytime = 0;
double totalkerneltime = 0;
double totald2hcpytime = 0;
double totalcpusumtime = 0;
double totalloopingtime = 0;
for (int k = 0; k < L; k++)
{
StartCounter(1);
img = (unsigned short*)LoadTIFF(filelist[k].c_str());
totalfileiotime += GetCounter(1);
psum = sum;
pimg = img;
float gpumean = 0;
memset(psum, 0, sizeof(unsigned long long)*num_blocks);
StartCounter(2);
// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(dev_img, pimg, size * sizeof(unsigned short), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_sum, psum, num_blocks*sizeof(unsigned long long), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
totalh2dcpytime += GetCounter(2);
StartCounter(3);
//reduceSum<<<num_blocks,block_size,num_blocks * sizeof(unsigned long long)>>>(dev_img, dev_sum, size);
//reduceSum<<<num_blocks,block_size,block_size * sizeof(unsigned short)>>>(dev_img, dev_sum, size);
reduceSum<<<num_blocks,block_size>>>(dev_img, dev_sum, size);
totalkerneltime += GetCounter(3);
// Check for any errors launching the kernel
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "reduction Kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
// !!!!!! following is where the code 77 error occurs!!!!!!!
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
goto Error;
}
// Copy output vector from GPU buffer to host memory.
StartCounter(4);
cudaStatus = cudaMemcpy(psum, dev_sum, num_blocks * sizeof(unsigned long long ), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
totald2hcpytime += GetCounter(4);
StartCounter(5);
for (int i = 0; i < num_blocks; i++)
{
gpumean += *psum;
psum++;
}
gpumean /= N*N;
totalcpusumtime += GetCounter(5);
delete img;
img = NULL;
cout<<gpumean<<endl;
}
int S = 1e+6;
int F = filelist.size();
float R = S/F;
totalloopingtime = GetCounter(6);
cout<<"gpu looping ends."<<endl<<endl;
cout<< "analysis:"<<endl;
cout<<"gpu initialization time: "<<totalgpuinittime<<" sec"<<endl<<endl;
cout<<"file I/O time: "<<endl;
cout<<" total "<<totalfileiotime<<" sec | average "<<totalfileiotime*R<<" usec/frame"<<endl<<endl;
cout<<"host-to-device copy time: "<<endl;
cout<<" total "<<totalh2dcpytime<<" sec | average "<<totalh2dcpytime*R<<" usec/frame"<<endl<<endl;
cout<<"pure gpu kerneling time: "<<endl;
cout<<" total "<<totalkerneltime<<" sec | average "<<totalkerneltime*R<<" usec/frame"<<endl<<endl;
cout<<"device-to-host copy time: "<<endl;
cout<<" total "<<totald2hcpytime<<" sec | average "<<totald2hcpytime*R<<" usec/frame"<<endl<<endl;
/*cout<<"cpu summing time: "<<endl;
cout<<" total: "<<totalcpusumtime<<" sec | average: "<<totalcpusumtime*R<<" usec/frame"<<endl<<endl;;*/
/*cout <<"gpu looping time: " << endl;
cout<<" total: "<<totalloopingtime<<" sec | average: "<<totalloopingtime*R<<" usec/frame"<<endl;*/
Error:
cudaFree(dev_sum);
cudaFree(dev_img);
delete sum;
sum = NULL;
return cudaStatus;
}
void kernel(float* &mean, int N, vector<string> filelist)
{
// wrapper and kernel
cudaError_t cudaStatus = gpuWrapper(mean, N, filelist);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "gpuWapper failed!");
}
// printf("mean is: %f\n", mean);
// cudaDeviceReset must be called before exiting in order for profiling and
// tracing tools such as Nsight and Visual Profiler to show complete traces.
StartCounter(8);
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaDeviceReset failed!");
}
cout<<"gpu reset time: "<<GetCounter(8)<<" sec"<<endl<<endl;
//return *mean;
}
I have allocated enough, and matching, memory space on both the host and the device. Any comments are appreciated.

While this may not be the only source of error in the code, you are not allocating any dynamic shared memory for the reduction kernel, leading to the illegal addressing error you see. The correct kernel launch should be something like
size_t shm_size = block_size * sizeof(unsigned long long);
reduceSum<<<num_blocks,block_size,shm_size>>>(dev_img, dev_sum, size);
This allocates the equivalent of one unsigned long long for each thread running in the reduction kernel, which (by my very cursory reading of your code) should make the shared memory array sdata the correct size for the kernel to run without out-of-bounds access to that array.
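Alternatively, since block_size is a compile-time constant (512) in this code, the dynamic allocation can be avoided entirely by giving the shared array a static size inside the kernel. A minimal sketch of that variant:
__global__ void reduceSum(unsigned short *input,
                          unsigned long long *per_block_results,
                          const int n)
{
    // statically sized shared memory: no third launch parameter required
    __shared__ unsigned long long sdata[512]; // must match the launch block size
    // ... the rest of the kernel body is unchanged ...
}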

Related

Achieved Occupancy column is not shown in Nsight Profiling result

I have faced a problem that is very weird to me: I cannot see the Achieved Occupancy column in the Nsight Performance Analysis output. I am using a GeForce 920M GPU, NVIDIA driver version 425.31, Nsight version 6.0.0.18296, and Visual Studio 2017. The Nsight version is compatible with the driver version.
Can anyone help me out? I have no idea why this happens.
I use Nsight Performance Analysis with the CUDA trace option checked (screenshot omitted).
I also used the Visual Profiler, but Achieved Occupancy could not be seen there either.
And the GPU examination gives out an error (screenshot omitted).
Note that, as talonmies mentioned, the error above was due to not running the profiler in administrator mode. That is solved, but Achieved Occupancy is still not shown.
And here is my code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <math.h>
#include <iostream>
#define MAX_HISTORGRAM_NUMBER 10000
#define ARRAY_SIZE 102400000
#define CHUNK_SIZE 100
#define THREAD_COUNT 8
#define SCALER 80
cudaError_t histogramWithCuda(int *a, unsigned long long int *c);
__global__ void histogramKernelSingle(unsigned long long int *c, int *a)
{
unsigned long long int worker = blockIdx.x*blockDim.x + threadIdx.x;
unsigned long long int start = worker * CHUNK_SIZE;
unsigned long long int end = start + CHUNK_SIZE;
for (int ex = 0; ex < SCALER; ex++)
for (long long int i = start; i < end; i++)
{
if (i < ARRAY_SIZE)
atomicAdd(&c[a[i]], 1);
else
{
break;
}
}
}
int main()
{
int* a = (int*)malloc(sizeof(int)*ARRAY_SIZE);
unsigned long long int* c = (unsigned long long int*)malloc(sizeof(unsigned long long int)*MAX_HISTORGRAM_NUMBER);
for (unsigned long long i = 0; i < ARRAY_SIZE;i++)
a[i] = rand() % MAX_HISTORGRAM_NUMBER;
for (unsigned long long i = 0; i < MAX_HISTORGRAM_NUMBER; i++)
c[i] = 0;
// Add vectors in parallel.
double start_time = omp_get_wtime();
cudaError_t cudaStatus=histogramWithCuda(a,c);
double end_time = omp_get_wtime();
std::cout << end_time - start_time;
// =
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addWithCuda failed!");
return 1;
}
// cudaDeviceReset must be called before exiting in order for profiling and
// tracing tools such as Nsight and Visual Profiler to show complete traces.
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceReset failed!");
return 1;
}
unsigned long long int R = 0;
for (int i = 0; i < MAX_HISTORGRAM_NUMBER; i++)
{
R += c[i];
//printf("%d ", c[i]);
}
printf("\nCORRECT:%ld ", R/(SCALER));
return 0;
}
// Helper function for using CUDA to add vectors in parallel.
cudaError_t histogramWithCuda(int *a, unsigned long long int *c)
{
int *dev_a = 0;
unsigned long long int *dev_c = 0;
cudaError_t cudaStatus;
// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
// Allocate GPU buffers for three vectors (two input, one output) .
cudaStatus = cudaMalloc((void**)&dev_c, MAX_HISTORGRAM_NUMBER * sizeof(unsigned long long int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_a, ARRAY_SIZE * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(dev_a, a, ARRAY_SIZE * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
// Launch a kernel on the GPU with one thread for each element.
//// BLOCK CALCULATOR HERE
////BLOCK CALCULATOR HERE
histogramKernelSingle << < ARRAY_SIZE / (THREAD_COUNT*CHUNK_SIZE), THREAD_COUNT>> > (dev_c, dev_a);
// Check for any errors launching the kernel
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
goto Error;
}
// Copy output vector from GPU buffer to host memory.
cudaStatus = cudaMemcpy(c, dev_c, MAX_HISTORGRAM_NUMBER * sizeof(unsigned long long int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
Error:
cudaFree(dev_c);
cudaFree(dev_a);
return cudaStatus;
}
Thanks in advance.
Achieved Occupancy is only captured in the Profile Activity. The Trace Activity does not support capturing GPU performance counters. Achieved Occupancy is sm__active_warps_sum / sm__active_cycles_sum / SM__MAX_WARPS * 100.
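For example, with hypothetical counter values of sm__active_warps_sum = 48,000,000 and sm__active_cycles_sum = 1,000,000 on a GPU whose SMs support a maximum of 64 resident warps, that works out to 48,000,000 / 1,000,000 / 64 * 100 = 75% achieved occupancy.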
Nsight Visual Studio Edition
The Trace Activity cannot collect Achieved Occupancy. Run the command Nsight | Start Performance Analysis ... and in the Activity window select Profile CUDA Application (not Trace Application). The default Profile CUDA Application contains the experiment Achieved Occupancy.
NVIDIA Visual Profiler
In NVVP ensure that you are collecting GPU performance counters. The default activity will collect the timeline but will not collect GPU events.
Run | Generate Timeline will not collect Achieved Occupancy
Run | Analyze Application will collect Achieved Occupancy
If you continue to have issues then you may have an issue with permissions on the system. Please try collecting another set of performance counters using Nsight Profile CUDA Application or NVVP | Collect Metrics and Events...

Weird CUDA error in a very simple test [duplicate]

I am new to CUDA. When I multiply two 1024x1024 matrices and launch the kernel with:
multiplyKernel << <dim3(32,32, 1), dim3(32, 32, 1) >> >(dev_c, dev_a, dev_b, size);
everything works. But when I multiply two 2048 x 2048 matrices, with
dim3(64,64,1)
I get this error:
cudaDeviceSynchronize returned error code 4 after launching addKernel!
unspecified launch failure
From tinkering with the code, I think that the error is in this statement
result += a[row * size + ind] * b[col + size * ind];
in the part
b[col+size*ind]
If I take that out, I don't get a kernel launch error (just the wrong answer, obviously). I cannot figure out what's wrong. Any suggestions would be most appreciated.
I am using Visual Studio 2013. I am using the debugger, but this does not help me find the error.
This seems to be a similar problem:
cudaDeviceSynchronize returned error code 4 after launching
Many thanks. Here is the code:
cudaError_t multiplyWithCuda(int *c, const int *a, const int *b, unsigned int size);
__global__ void multiplyKernel(int *c, const int *a, const int *b, unsigned int size)
{
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
if (row > size || col > size) return;
// target field in 1-D
int z = row * size + col;
int result = 0;
for (int ind = 0; ind < size ; ++ind) {
result += a[row * size + ind] * b[col + size * ind];
}
c[z] = result;
}
int main(){
const int sizeMatrix = 2048;
int* a = new int[sizeMatrix * sizeMatrix];
int* b = new int[sizeMatrix * sizeMatrix];
int* c = new int[sizeMatrix * sizeMatrix];
for (int i = 0; i < sizeMatrix * sizeMatrix; i++) {
a[i] = rand() % 2;
b[i] = rand() % 2;
}
cudaError_t cudaStatus = multiplyWithCuda(c, a, b, sizeMatrix);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addWithCuda failed!");
return 1;
}
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceReset failed!");
return 1;
}
return 0;
}
cudaError_t multiplyWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
int *dev_a ;
int *dev_b;
int *dev_c;
cudaError_t cudaStatus;
// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
fprintf(stdout, "device set");
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
// Allocate GPU buffers for three vectors (two input, one output) .
cudaStatus = cudaMalloc((void**)&dev_c, size * size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
fprintf(stdout, "buffer for c allocated \n");
cudaStatus = cudaMalloc((void**)&dev_a, size * size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
fprintf(stdout, "buffer for a allocated \n");
cudaStatus = cudaMalloc((void**)&dev_b, size * size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
fprintf(stdout, "buffer for b allocated \n");
// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(dev_a, a, size * size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
fprintf(stdout, "cudaMemcpy a done \n");
cudaStatus = cudaMemcpy(dev_b, b, size * size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
fprintf(stdout, "cudaMemcpy b done\n");
fprintf(stdout, "about to launch kernel \n");
// Launch a kernel on the GPU with one thread for each element.
multiplyKernel << <dim3(64,64, 1), dim3(32, 32, 1) >> >(dev_c, dev_a, dev_b, size);
fprintf(stdout, "kernel launched\n");
// Check for any errors launching the kernel
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
; fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
fprintf(stderr, " %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// Copy output vector from GPU buffer to host memory.
cudaStatus = cudaMemcpy(c, dev_c, size * size * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
Error:
cudaFree(dev_c);
cudaFree(dev_a);
cudaFree(dev_b);
return cudaStatus;
}
On Windows, I right-clicked the Nsight Monitor icon in the system tray and chose Options > General, which shows the WDDM TDR delay. It was at 2, and I increased it to 10. Then I ran my program again, and it worked fine.
This was according to Robert's link (see above):
http://http.developer.nvidia.com/NsightVisualStudio/2.2/Documentation/UserGuide/HTML/Content/Timeout_Detection_Recovery.htm
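For reference, the WDDM TDR settings that the Nsight Monitor changes live in the registry, so the same adjustment can be made there directly. A sketch, assuming a standard Windows/WDDM setup (a reboot is typically required for the change to take effect):
Windows Registry Editor Version 5.00

; TdrDelay is the GPU timeout in seconds (10 here, up from the Windows default of 2)
[HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Control\GraphicsDrivers]
"TdrDelay"=dword:0000000a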

memory location error: thrust::stable_sort when using big array and user-defined comparison operator

I'm running this code to sort a big array of IPs, using thrust::stable_sort and a user-defined operator to compare the IPs.
This code works for arrays of fewer than 50000 IPs, but I get a memory error for bigger arrays.
Here is the code I used:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <stdio.h>
#include <time.h>
#include <device_functions.h>
template<typename T>
struct vector_less
{
typedef T first_argument_type;
typedef T second_argument_type;
typedef bool result_type;
__host__ __device__ bool operator()(const T &lhs, const T &rhs) const {
if (lhs[0] == rhs[0])
if (lhs[1] == rhs[1])
if (lhs[2] == rhs[2])
return lhs[3] < rhs[3];
else
return lhs[2] < rhs[2];
else
return lhs[1] < rhs[1];
else
return lhs[0] < rhs[0];
}
};
__global__ void prepare_ips_list(unsigned char ** dev_sorted_Ips, unsigned char * ip_b1, unsigned char * ip_b2, unsigned char * ip_b3, unsigned char * ip_b4, unsigned int searchedIpsSize)
{
int thread = threadIdx.x + blockIdx.x * blockDim.x;
if (thread < searchedIpsSize)
{
dev_sorted_Ips[thread] = new unsigned char[4];
dev_sorted_Ips[thread][0] = ip_b1[thread];
dev_sorted_Ips[thread][1] = ip_b2[thread];
dev_sorted_Ips[thread][2] = ip_b3[thread];
dev_sorted_Ips[thread][3] = ip_b4[thread];
}
}
int main()
{
const int size = 1000000;
unsigned char * ip_b1 = new unsigned char[size];
unsigned char * ip_b2 = new unsigned char[size];;
unsigned char * ip_b3 = new unsigned char[size];;
unsigned char * ip_b4 = new unsigned char[size];;
unsigned char * dev_ip_b1;
unsigned char * dev_ip_b2;
unsigned char * dev_ip_b3;
unsigned char * dev_ip_b4;
unsigned char ** dev_sortedIps;
for (int i = 0; i < size; i++)
{
ip_b1[i] = rand() % 240;
ip_b2[i] = rand() % 240;
ip_b3[i] = rand() % 240;
ip_b4[i] = rand() % 240;
}
cudaError_t cudaStatus;
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_ip_b1, size * sizeof(unsigned char));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_ip_b1, ip_b1, size * sizeof(unsigned char), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_ip_b2, size * sizeof(unsigned char));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_ip_b2, ip_b2, size * sizeof(unsigned char), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_ip_b3, size * sizeof(unsigned char));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_ip_b3, ip_b3, size * sizeof(unsigned char), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_ip_b4, size * sizeof(unsigned char));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_ip_b4, ip_b4, size * sizeof(unsigned char), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_sortedIps, size * sizeof(unsigned char *));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
int resetThreads = size;
int resetBlocks = 1;
if (size > 1024)
{
resetThreads = 1024;
resetBlocks = size / 1024;
if (size % 1024 > 0)
resetBlocks++;
}
prepare_ips_list << <resetBlocks, resetThreads >> >(dev_sortedIps, dev_ip_b1, dev_ip_b2, dev_ip_b3, dev_ip_b4, size);
thrust::device_ptr<unsigned char *> sorted_list_ptr1(dev_sortedIps);
thrust::stable_sort(sorted_list_ptr1, sorted_list_ptr1 + size, vector_less<unsigned char *>());
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "launch failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching !\n", cudaStatus);
goto Error;
}
return 0;
Error:
cudaFree(dev_ip_b1);
cudaFree(dev_ip_b2);
cudaFree(dev_ip_b3);
cudaFree(dev_ip_b4);
cudaFree(dev_sortedIps);
}
The error I got is:
Microsoft C++ exception: thrust::system::system_error at memory location
How can I solve this problem for big arrays?
Should I use another technique to achieve this sorting, such as dividing the data, sorting the parts, and then merging?
The proximal issue is that in-kernel malloc and new are limited in the size of the device heap that they have available to allocate from. This limit can be raised. Please read the documentation.
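A minimal sketch of raising that limit (the 64 MB figure is illustrative only; the call must come before the device heap is first used, i.e., before any kernel that calls new or malloc is launched):
size_t heap_bytes = 0;
cudaDeviceGetLimit(&heap_bytes, cudaLimitMallocHeapSize);      // query the current limit (8 MB by default)
cudaDeviceSetLimit(cudaLimitMallocHeapSize, 64 * 1024 * 1024); // raise it to 64 MB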
A few other suggestions:
You're not doing any error checking after your kernel (before the first thrust call). If you added error checking on the kernel, you would discover that your kernel is what is failing, and thrust is merely reporting the error for you. Avoid the confusion: do rigorous, proper CUDA error checking any time you are having trouble with a CUDA code.
As a good practice, it's not a bad idea, at least for debugging purposes, to test any pointer returned by new or malloc for NULL. This is how the API informs you that an allocation failure occurred.
The code below demonstrates a possible workaround for the proximal issue, by adjusting the device heap for the input size. It also demonstrates possible ways to address the other two suggestions:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <device_functions.h>
#include <assert.h>
template<typename T>
struct vector_less
{
typedef T first_argument_type;
typedef T second_argument_type;
typedef bool result_type;
__host__ __device__ bool operator()(const T &lhs, const T &rhs) const {
if (lhs[0] == rhs[0])
if (lhs[1] == rhs[1])
if (lhs[2] == rhs[2])
return lhs[3] < rhs[3];
else
return lhs[2] < rhs[2];
else
return lhs[1] < rhs[1];
else
return lhs[0] < rhs[0];
}
};
__global__ void prepare_ips_list(unsigned char ** dev_sorted_Ips, unsigned char * ip_b1, unsigned char * ip_b2, unsigned char * ip_b3, unsigned char * ip_b4, unsigned int searchedIpsSize)
{
int thread = threadIdx.x + blockIdx.x * blockDim.x;
if (thread < searchedIpsSize)
{
dev_sorted_Ips[thread] = new unsigned char[4];
if (dev_sorted_Ips[thread] == NULL) assert(0);
dev_sorted_Ips[thread][0] = ip_b1[thread];
dev_sorted_Ips[thread][1] = ip_b2[thread];
dev_sorted_Ips[thread][2] = ip_b3[thread];
dev_sorted_Ips[thread][3] = ip_b4[thread];
}
}
int main(int argc, char *argv[])
{
int size = 50000;
if (argc > 1) size = atoi(argv[1]);
int chunks = size/50000 + 1;
cudaError_t cudaStatus;
cudaStatus = cudaDeviceSetLimit(cudaLimitMallocHeapSize, 8000000 * chunks);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "set device heap limit failed!");
}
unsigned char * ip_b1 = new unsigned char[size];
unsigned char * ip_b2 = new unsigned char[size];;
unsigned char * ip_b3 = new unsigned char[size];;
unsigned char * ip_b4 = new unsigned char[size];;
unsigned char * dev_ip_b1;
unsigned char * dev_ip_b2;
unsigned char * dev_ip_b3;
unsigned char * dev_ip_b4;
unsigned char ** dev_sortedIps;
for (int i = 0; i < size; i++)
{
ip_b1[i] = rand() % 240;
ip_b2[i] = rand() % 240;
ip_b3[i] = rand() % 240;
ip_b4[i] = rand() % 240;
}
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
}
cudaStatus = cudaMalloc((void**)&dev_ip_b1, size * sizeof(unsigned char));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
}
cudaStatus = cudaMemcpy(dev_ip_b1, ip_b1, size * sizeof(unsigned char), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
}
cudaStatus = cudaMalloc((void**)&dev_ip_b2, size * sizeof(unsigned char));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
}
cudaStatus = cudaMemcpy(dev_ip_b2, ip_b2, size * sizeof(unsigned char), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
}
cudaStatus = cudaMalloc((void**)&dev_ip_b3, size * sizeof(unsigned char));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
}
cudaStatus = cudaMemcpy(dev_ip_b3, ip_b3, size * sizeof(unsigned char), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
}
cudaStatus = cudaMalloc((void**)&dev_ip_b4, size * sizeof(unsigned char));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
}
cudaStatus = cudaMemcpy(dev_ip_b4, ip_b4, size * sizeof(unsigned char), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
}
cudaStatus = cudaMalloc((void**)&dev_sortedIps, size * sizeof(unsigned char *));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
}
int resetThreads = size;
int resetBlocks = 1;
if (size > 1024)
{
resetThreads = 1024;
resetBlocks = size / 1024;
if (size % 1024 > 0)
resetBlocks++;
}
prepare_ips_list << <resetBlocks, resetThreads >> >(dev_sortedIps, dev_ip_b1, dev_ip_b2, dev_ip_b3, dev_ip_b4, size);
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess){
printf(" kernel fail\n");
exit(0);}
thrust::device_ptr<unsigned char *> sorted_list_ptr1(dev_sortedIps);
thrust::stable_sort(sorted_list_ptr1, sorted_list_ptr1 + size, vector_less<unsigned char *>());
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "launch failed: %s\n", cudaGetErrorString(cudaStatus));
}
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching !\n", cudaStatus);
}
return 0;
}
Note that you can test various sizes by passing the desired size as a command-line parameter. I tested up to 1000000, and it seemed to work fine. Eventually, for a large enough problem size, you will run out of memory on your GPU. You don't indicate which GPU you have.
I've removed the goto statements because I am working on Linux (apparently you've switched back to Windows). I would suggest you come up with a different error-handling process than using goto, if for no other reason than that it causes difficulty with thrust constructs.
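One common alternative (a sketch; the CHECK_CUDA name is arbitrary) is a checking macro that reports the failing call's location and exits, which composes cleanly with thrust-based code:
#define CHECK_CUDA(call) do { \
    cudaError_t err_ = (call); \
    if (err_ != cudaSuccess) { \
        fprintf(stderr, "CUDA error: %s at %s:%d\n", \
                cudaGetErrorString(err_), __FILE__, __LINE__); \
        exit(1); \
    } \
} while (0)
// usage: CHECK_CUDA(cudaMalloc((void**)&dev_ip_b1, size * sizeof(unsigned char)));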
Also note that in-kernel new or malloc is kind of "slow". You could probably speed this up for larger sizes substantially by doing your necessary allocation up-front, with a single cudaMalloc call of the appropriate size. Unfortunately this is complicated by your use of the double-pointer array dev_sorted_Ips. I would suggest that you instead flatten that to a single pointer array, allocate the necessary size once via cudaMalloc, and do the necessary array indexing in your kernel to make it work. If you profile this code, you'll discover that the vast majority of execution time for longer cases (e.g. size = 1000000) is consumed by your prepare_ips_list kernel, not the sorting operation. So the focus of your efforts for performance improvement should begin there.
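A sketch of that flattening idea (hypothetical names; as one variant, each 4-byte IP is packed here into a single unsigned int key so the sort can use the default integer comparison instead of the pointer-based functor):
__global__ void prepare_ips_flat(unsigned int *keys, const unsigned char *b1,
                                 const unsigned char *b2, const unsigned char *b3,
                                 const unsigned char *b4, unsigned int n)
{
    unsigned int t = threadIdx.x + blockIdx.x * blockDim.x;
    if (t < n)
        // big-endian packing: integer comparison of the keys reproduces the
        // byte-by-byte lexicographic order of the original vector_less functor
        keys[t] = ((unsigned int)b1[t] << 24) | ((unsigned int)b2[t] << 16) |
                  ((unsigned int)b3[t] << 8)  |  (unsigned int)b4[t];
}
// host side: one up-front cudaMalloc replaces one in-kernel new per thread
unsigned int *dev_keys;
cudaMalloc((void**)&dev_keys, size * sizeof(unsigned int));
prepare_ips_flat<<<resetBlocks, resetThreads>>>(dev_keys, dev_ip_b1, dev_ip_b2, dev_ip_b3, dev_ip_b4, size);
thrust::device_ptr<unsigned int> key_ptr(dev_keys);
thrust::stable_sort(key_ptr, key_ptr + size); // default operator< on the packed keys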

How can I check the progress of matrix multiplication?

I now need to show the intermediate progress of a matrix multiplication.
for(unsigned int col=0; col<mtxSize; col++) {
unsigned tmp = 0;
for(unsigned int row=0; row<mtxSize; row++) {
for(unsigned int idx=0; idx<mtxSize; idx++) {
tmp += h_A[col*mtxSize+idx] * h_B[idx*mtxSize+row];
}
h_Rs[col*mtxSize+row] = tmp;
tmp = 0;
int rate_tmp = (col*mtxSize + (row+1))*100;
// Maybe like this...
fprintf(stdout, "Progress : %d.%d %%\r", rate_tmp/actMtxSize, rate_tmp%actMtxSize);
fflush(stdout);
}
}
In the case of host code (using the CPU), this is very easy because it processes sequentially, so we can check the progress very easily.
But in the case of the GPU, which processes in parallel, what should I do?
Once the kernel is launched, it does not return until the kernel execution is finished, so I can't check intermediate data during the kernel execution time.
I think I need to use an asynchronous kernel call, but I do not know that approach well.
And even if an asynchronous kernel call is used, to gather the data from all of the blocks across the processors, do I have to use atomicAdd() (in other words, global memory access), which carries some overhead?
Give me some advice or hints.
And I want to know how to do it in the case of CUDA.
Here is some code that demonstrates how to check progress from a matrix multiply kernel:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define TIME_INC 100000000
#define INCS 10
#define USE_PROGRESS 1
#define MAT_DIMX 4000
#define MAT_DIMY MAT_DIMX
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
__global__ void mykernel(volatile int *data){
unsigned long time;
for (int i = 0; i < INCS; i++){
atomicAdd((int *)data,1);
__threadfence_system();
time = clock64();
while((clock64() - time)<TIME_INC) {};
}
printf("progress check finished\n");
}
__global__ void matmult(float *a, float *b, float *c, unsigned int rowA, unsigned int colA, unsigned int colB, volatile int *progress){
unsigned int row = threadIdx.x+blockDim.x*blockIdx.x;
unsigned int col = threadIdx.y+blockDim.y*blockIdx.y;
if ((row < rowA) && (col < colB)){
float temp = 0.0f;
for (unsigned int k = 0; k < colA; k++)
temp += a[(row*colA)+k] * b[(k*colB) + col];
c[(row*colB)+col] = temp;
#if USE_PROGRESS
if (!(threadIdx.x || threadIdx.y)){
atomicAdd((int *)progress, 1);
__threadfence_system();
}
#endif
}
}
int main(){
// simple test to demonstrate reading progress data from kernel
volatile int *d_data, *h_data;
cudaSetDeviceFlags(cudaDeviceMapHost);
cudaCheckErrors("cudaSetDeviceFlags error");
cudaHostAlloc((void **)&h_data, sizeof(int), cudaHostAllocMapped);
cudaCheckErrors("cudaHostAlloc error");
cudaHostGetDevicePointer((int **)&d_data, (int *)h_data, 0);
cudaCheckErrors("cudaHostGetDevicePointer error");
*h_data = 0;
printf("kernel starting\n");
mykernel<<<1,1>>>(d_data);
cudaCheckErrors("kernel fail");
int value = 0;
do{
int value1 = *h_data;
if (value1 > value){
printf("h_data = %d\n", value1);
value = value1;}}
while (value < (INCS-1));
cudaDeviceSynchronize();
cudaCheckErrors("kernel fail 2");
// now try matrix multiply with progress
float *h_c, *d_a, *d_b, *d_c;
h_c = (float *)malloc(MAT_DIMX*MAT_DIMY*sizeof(float));
if (h_c == NULL) {printf("malloc fail\n"); return 1;}
cudaMalloc((void **)&d_a, MAT_DIMX*MAT_DIMY*sizeof(float));
cudaCheckErrors("cudaMalloc a fail");
cudaMalloc((void **)&d_b, MAT_DIMX*MAT_DIMY*sizeof(float));
cudaCheckErrors("cudaMalloc b fail");
cudaMalloc((void **)&d_c, MAT_DIMX*MAT_DIMY*sizeof(float));
cudaCheckErrors("cudaMalloc c fail");
for (int i = 0; i < MAT_DIMX*MAT_DIMY; i++) h_c[i] = rand()/(float)RAND_MAX;
cudaMemcpy(d_a, h_c, MAT_DIMX*MAT_DIMY*sizeof(float), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy a fail");
cudaMemcpy(d_b, h_c, MAT_DIMX*MAT_DIMY*sizeof(float), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy b fail");
cudaEvent_t start, stop;
cudaEventCreate(&start); cudaEventCreate(&stop);
*h_data=0;
dim3 block(16,16);
dim3 grid(((MAT_DIMX+block.x-1)/block.x), ((MAT_DIMY+block.y-1)/block.y));
printf("matrix multiply kernel starting\n");
cudaEventRecord(start);
matmult<<<grid,block>>>(d_a, d_b, d_c, MAT_DIMY, MAT_DIMX, MAT_DIMX, d_data);
cudaEventRecord(stop);
#if USE_PROGRESS
unsigned int num_blocks = grid.x*grid.y;
float my_progress = 0.0f;
value = 0;
printf("Progress:\n");
do{
cudaEventQuery(stop); // may help WDDM scenario
int value1 = *h_data;
float kern_progress = (float)value1/(float)num_blocks;
if ((kern_progress - my_progress)> 0.1f) {
printf("percent complete = %2.1f\n", (kern_progress*100));
my_progress = kern_progress;}}
while (my_progress < 0.9f);
printf("\n");
#endif
cudaEventSynchronize(stop);
cudaCheckErrors("event sync fail");
float et;
cudaEventElapsedTime(&et, start, stop);
cudaCheckErrors("event elapsed time fail");
cudaDeviceSynchronize();
cudaCheckErrors("mat mult kernel fail");
printf("matrix multiply finished. elapsed time = %f milliseconds\n", et);
return 0;
}
The code associated with the first kernel call is just to demonstrate the basic idea of having a kernel report its progress back.
The second part of the code shows a sample, naive matrix multiply on the GPU, with the GPU reporting its progress back. I have included the ability to remove the progress-check code via a preprocessor macro, as well as the ability to time the matrix multiply kernel. For the case I have here, there was no discernible difference in timing with or without the progress code. So while the progress-reporting code probably does add some overhead, compared to the scope of a reasonably sized matrix multiply kernel, it adds no significant time that I can see.
Some other uses of signalling are discussed here

I lost data after __syncthreads() in cuda

I am trying to find the maximum of an array. I took help from CUDA Maximum Reduction Algorithm Not Working and made some modifications of my own. However, when I run it for 16 data elements, I find that shared memory in the kernel receives only the first 4 elements; the rest are lost. I put in two cuPrintf calls: the first shows the data is there in shared memory, but the second, placed just after __syncthreads(), prints 0 for thread IDs 4 onwards. Please help.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include "cuPrintf.cu"
#include "cuPrintf.cuh"
__device__ float MaxOf2(float a, float b)
{
if(a > b) return a;
else return b;
}
__global__ void findMax(int size,float *array_device , float *outPut)
{
extern __shared__ float sdata[];
int tid = threadIdx.x;
int i = blockIdx.x*blockDim.x + threadIdx.x;
if(i< size)
{
sdata[tid] = array_device[i];
cuPrintf(" array_d[%d]===%f, sdata[%d]===%f\n ",i,array_device[i],tid,sdata[tid]);
__threadfence();
}
__syncthreads();
if(tid<size)
cuPrintf(" array_d[%d]===%f, sdata[%d]===%f\n ",i,array_device[i],tid,sdata[tid]);
for ( int s=blockDim.x/2; s>0; s=s>>1)//s=blockDim.x/2
{
if (tid < s)
{
sdata[tid]= MaxOf2(sdata[tid],sdata[tid+s]);
}
__syncthreads();
}
if (tid == 0) outPut[blockIdx.x] = sdata[0];
}
int main()
{
long double M = pow(2,20);
long double N = 2;
int noThreadsPerBlock = 512 ;
printf("\n Provide the array Size N.(array will be of size N * 2^20 ) :-");
scanf("%Lf",&N);
long int size = 16;
int numOfBlock = (int)size /noThreadsPerBlock + 1;
printf("\n num of blocks==%ld",numOfBlock);
float *array_device , *outPut;
float array_host[]={221,100,2,340,47,36,500,1,33,4460,5,6,7,8,9,11};
cudaMalloc((void **)&array_device, size*sizeof(float));
cudaMalloc((void **)&outPut, size*sizeof(float));
cudaError_t error0 = cudaGetLastError();
printf("\n 0CUDA error: %s\n", cudaGetErrorString(error0));
printf("size===%ld",size);
cudaMemcpy(array_device, array_host, size*sizeof(float), cudaMemcpyHostToDevice);
cudaError_t error1 = cudaGetLastError();
printf("\n1CUDA error: %s\n", cudaGetErrorString(error1));
while(size>1 )
{
cudaPrintfInit();
findMax<<< numOfBlock,noThreadsPerBlock>>>(size,array_device, outPut);cudaPrintfDisplay(stdout, true);
cudaPrintfEnd();
cudaError_t error2 = cudaGetLastError();
printf(" 2CUDA error: %s\n", cudaGetErrorString(error2));
cudaMemcpy(array_device, outPut, size*sizeof(float), cudaMemcpyDeviceToDevice);
size = numOfBlock;
printf("\n ****size==%ld\n",size);
numOfBlock = (int)size /noThreadsPerBlock + 1;
}
cudaMemcpy(array_host, outPut, size*sizeof(float), cudaMemcpyDeviceToHost);
cudaError_t error3 = cudaGetLastError();
printf("\n3CUDA error: %s\n", cudaGetErrorString(error3));
for(int i=0;i<size;i++)
printf("\n index==%d ;data=%f ",i,array_host[i]);
return 0;
}
I'm posting my comment as an answer, as requested.
Firstly, you haven't specified the dynamic shared memory size in the kernel launch. It should look something like:
findMax<<< numOfBlock, noThreadsPerBlock, sizeof(float)*noThreadsPerBlock >>>(size, array_device, outPut);
Secondly, what was the concept behind the condition if(tid<size) on the second cuPrintf? Providing the output of the program would also help.
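Regarding the reduction loop itself: even after the shared memory size is fixed, with noThreadsPerBlock = 512 and size = 16, threads with tid >= size never write sdata[tid], so the loop reads uninitialized shared memory. A minimal sketch of one fix (assuming <float.h> is included for FLT_MAX) is to have every thread initialize its slot with the identity element for max:
sdata[tid] = (i < size) ? array_device[i] : -FLT_MAX; // out-of-range threads write the max-identity
__syncthreads();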