I'm trying to implement string matching program with CUDA in C and I have th following issue.
When I set 1 block and 1 thread per block the result for pattern dfh is 2. That's correct, but when I increase the blocks the result is 4.
The text file is:
ffskdfhksdjhfksdfksjdfhksdhfksjdhfkjer654yrkhjkfgjhdsrtrhkjchgkjthyoirthygfnbkjgkjdhykhkjchgkjfdhsfykhkbhkjfghkfgjy
This is my code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>
__global__ void string_matching(char *buffer, char *pattern, int match_size, int pattern_size, int *result){
int tid, i;
__shared__ int local_matches;
if(threadIdx.x == 0) local_matches = 0;
__syncthreads();
for(tid=blockIdx.x*blockDim.x+threadIdx.x; tid<match_size; tid+=blockDim.x){
for (i = 0; i < pattern_size && pattern[i] == buffer[i + tid]; ++i);
if(i >= pattern_size){
atomicAdd(&local_matches, 1);
}
}
__syncthreads();
if(threadIdx.x == 0)
atomicAdd(result, local_matches);
}
int main(int argc, char *argv[]){
FILE *pFile;
long file_size, match_size, pattern_size;
char * buffer;
char * filename, *pattern;
size_t result;
int *match, total_matches;
//CUDA variables
int blocks, threads_per_block;
int *result_dev;
char *buffer_dev, *pattern_dev;
float total_time, comp_time;
cudaEvent_t total_start, total_stop, comp_start, comp_stop;
cudaEventCreate(&total_start);
cudaEventCreate(&total_stop);
cudaEventCreate(&comp_start);
cudaEventCreate(&comp_stop);
if (argc != 5) {
printf ("Usage : %s <file_name> <string> <blocks> <threads_per_block>\n", argv[0]);
return 1;
}
filename = argv[1];
pattern = argv[2];
blocks = strtol(argv[3], NULL, 10);
threads_per_block = strtol(argv[4], NULL, 10);
pFile = fopen ( filename , "rb" );
if (pFile==NULL) {printf ("File error\n"); return 2;}
// obtain file size:
fseek (pFile , 0 , SEEK_END);
file_size = ftell (pFile);
rewind (pFile);
printf("file size is %ld\n", file_size);
// allocate memory to contain the file:
buffer = (char*) malloc (sizeof(char)*file_size);
if (buffer == NULL) {printf ("Memory error\n"); return 3;}
// copy the file into the buffer:
result = fread (buffer,1,file_size,pFile);
if (result != file_size) {printf ("Reading error\n"); return 4;}
pattern_size = strlen(pattern);
match_size = file_size - pattern_size + 1;
match = (int *) malloc (sizeof(int)*match_size);
if (match == NULL) {printf ("Malloc error\n"); return 5;}
cudaMalloc((void **)&result_dev, sizeof(int));
cudaMalloc((void **)&buffer_dev, file_size*sizeof(char));
cudaMalloc((void **)&pattern_dev, pattern_size*sizeof(char));
cudaEventRecord(total_start);
cudaEventRecord(comp_start);
cudaMemcpy(buffer_dev, buffer, file_size*sizeof(char), cudaMemcpyHostToDevice);
cudaMemcpy(pattern_dev, pattern, pattern_size*sizeof(char), cudaMemcpyHostToDevice);
string_matching<<<blocks, threads_per_block>>>(buffer_dev, pattern_dev, match_size, pattern_size, result_dev);
cudaThreadSynchronize();
cudaEventRecord(comp_stop);
cudaEventSynchronize(comp_stop);
cudaEventElapsedTime(&comp_time, comp_start, comp_stop);
cudaMemcpy(&total_matches, result_dev, sizeof(int), cudaMemcpyDeviceToHost);
cudaEventRecord(total_stop);
cudaEventSynchronize(total_stop);
cudaEventElapsedTime(&total_time, total_start, total_stop);
cudaFree(result_dev);
cudaFree(buffer_dev);
cudaFree(pattern_dev);
fclose (pFile);
free (buffer);
//Print result
printf("Total matches: %d\n", total_matches);
printf("\n\n\nN: %d, Blocks: %d, Threads: %d\n", file_size, blocks, blocks*threads_per_block);
printf("Total time (ms): %.3f\n", total_time);
printf("Kernel time (ms): %.3f\n", comp_time);
printf("Data transfer time(ms): %.3f\n\n\n", total_time-comp_time);
}
You need to synchronize threads of the same block using __syncthreads. For example, local_matches = 0 can theoretically be done concurrently to the atomicAdd of the main loop. Thus, a __syncthreads is needed between both. For the same reason, you also need a __syncthreads before the last if(threadIdx.x == 0). I am not sure this is the only error.
I advise you to use CUDA-GDB to track such bug since the kernel is quite short and relatively simple.
Note that a local_matches do not need to be shared. In fact, it is not efficient to do that. On can perform the reduction in local memory and then perform a final atomicAdd. Additionally, you do not need the innermost conditional. You can simply do: local_matches += i >= pattern_size; (the compiler might already do such optimization).
I finally found a solution for this.
I set a match table with 0 values for every buffer position, set 1 for every position that pattern found, and added the 1's in the CPU.
If you think something better please add an answer.
This is the code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>
__global__ void string_matching(char *buffer, char *pattern, int match_size, int pattern_size, int *match){
int tid, i;
for(tid=blockIdx.x*blockDim.x+threadIdx.x; tid<match_size; tid+=blockDim.x){
for (i = 0; i < pattern_size && pattern[i] == buffer[i + tid]; ++i);
if(i >= pattern_size){
match[tid] = 1;
}
else{
match[tid] = 0;
}
}
}
int main(int argc, char *argv[]){
FILE *pFile;
int i;
long file_size, match_size, pattern_size;
char * buffer;
char * filename, *pattern;
size_t result;
int *match, total_matches;
//CUDA variables
int blocks, threads_per_block;
int *match_dev;
char *buffer_dev, *pattern_dev;
float total_time, comp_time;
cudaEvent_t total_start, total_stop, comp_start, comp_stop;
cudaEventCreate(&total_start);
cudaEventCreate(&total_stop);
cudaEventCreate(&comp_start);
cudaEventCreate(&comp_stop);
if (argc != 5) {
printf ("Usage : %s <file_name> <string> <blocks> <threads_per_block>\n", argv[0]);
return 1;
}
filename = argv[1];
pattern = argv[2];
blocks = strtol(argv[3], NULL, 10);
threads_per_block = strtol(argv[4], NULL, 10);
pFile = fopen ( filename , "rb" );
if (pFile==NULL) {printf ("File error\n"); return 2;}
// obtain file size:
fseek (pFile , 0 , SEEK_END);
file_size = ftell (pFile);
rewind (pFile);
printf("file size is %ld\n", file_size);
// allocate memory to contain the file:
buffer = (char*) malloc (sizeof(char)*file_size);
if (buffer == NULL) {printf ("Memory error\n"); return 3;}
// copy the file into the buffer:
result = fread (buffer,1,file_size,pFile);
if (result != file_size) {printf ("Reading error\n"); return 4;}
pattern_size = strlen(pattern);
match_size = file_size - pattern_size + 1;
match = (int *) malloc (sizeof(int)*match_size);
if (match == NULL) {printf ("Malloc error\n"); return 5;}
cudaMalloc((void **)&match_dev, match_size*sizeof(int));
cudaMalloc((void **)&buffer_dev, file_size*sizeof(char));
cudaMalloc((void **)&pattern_dev, pattern_size*sizeof(char));
cudaEventRecord(total_start);
cudaEventRecord(comp_start);
cudaMemcpy(buffer_dev, buffer, file_size*sizeof(char), cudaMemcpyHostToDevice);
cudaMemcpy(pattern_dev, pattern, pattern_size*sizeof(char), cudaMemcpyHostToDevice);
string_matching<<<blocks, threads_per_block>>>(buffer_dev, pattern_dev, match_size, pattern_size, match_dev);
cudaThreadSynchronize();
cudaEventRecord(comp_stop);
cudaEventSynchronize(comp_stop);
cudaEventElapsedTime(&comp_time, comp_start, comp_stop);
cudaMemcpy(match, match_dev, match_size*sizeof(int), cudaMemcpyDeviceToHost);
cudaEventRecord(total_stop);
cudaEventSynchronize(total_stop);
cudaEventElapsedTime(&total_time, total_start, total_stop);
total_matches = 0;
for(i=0; i<match_size; i++){
total_matches += match[i];
}
cudaFree(match_dev);
cudaFree(buffer_dev);
cudaFree(pattern_dev);
fclose (pFile);
free (buffer);
//Print result
printf("Total matches: %d\n", total_matches);
printf("\n\n\nN: %d, Blocks: %d, Threads: %d\n", file_size, blocks, blocks*threads_per_block);
printf("Total time (ms): %.3f\n", total_time);
printf("Kernel time (ms): %.3f\n", comp_time);
printf("Data transfer time(ms): %.3f\n\n\n", total_time-comp_time);
}
I think this is a better solution.
It works only for power of 2 threads per block.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>
#define MAX_THREADS_PER_BLOCK 100
__global__ void string_matching(char *buffer, char *pattern, int match_size, int pattern_size, int blocks, int slice, int extra, int *gout){
int tid, i;
int thread_index = blockIdx.x*blockDim.x + threadIdx.x;
int start = thread_index*slice;
int stop = start + slice;
if(thread_index == blocks*blockDim.x - 1){
stop += extra;
}
if(stop > match_size){
stop = match_size;
}
__shared__ int r[MAX_THREADS_PER_BLOCK];
int sum = 0;
for(tid=start; tid<stop; tid++){
for (i = 0; i < pattern_size && pattern[i] == buffer[i + tid]; ++i);
if(i >= pattern_size){
sum++;
}
}
r[threadIdx.x] = sum;
__syncthreads();
//works only for power of 2 threads_per_block
for (int size = blockDim.x/2; size>0; size/=2) { //uniform
if (threadIdx.x<size)
r[threadIdx.x] += r[threadIdx.x+size];
__syncthreads();
}
printf("Block: %d, Thread: %d, Global Thread: %d, Start: %d, Stop: %d, Matches: %d, Block Matches: %d\n", blockIdx.x, threadIdx.x, thread_index, start, stop, r[threadIdx.x], r[0]);
if(threadIdx.x == 0){
gout[blockIdx.x] = r[0];
}
}
int main(int argc, char *argv[]){
int i;
FILE *pFile;
long file_size, match_size, pattern_size;
char * buffer;
char * filename, *pattern;
size_t result;
int *results;
int total_matches;
//CUDA variables
int blocks, threads_per_block, total_threads, slice, extra;
int *results_dev;
char *buffer_dev, *pattern_dev;
float total_time, comp_time;
cudaEvent_t total_start, total_stop, comp_start, comp_stop;
cudaEventCreate(&total_start);
cudaEventCreate(&total_stop);
cudaEventCreate(&comp_start);
cudaEventCreate(&comp_stop);
if (argc != 5) {
printf ("Usage : %s <file_name> <string> <blocks> <threads_per_block>\n", argv[0]);
return 1;
}
filename = argv[1];
pattern = argv[2];
blocks = strtol(argv[3], NULL, 10);
threads_per_block = strtol(argv[4], NULL, 10);
pFile = fopen ( filename , "rb" );
if (pFile==NULL) {printf ("File error\n"); return 2;}
// obtain file size:
fseek (pFile , 0 , SEEK_END);
file_size = ftell (pFile);
rewind (pFile);
printf("file size is %ld\n", file_size);
// allocate memory to contain the file:
buffer = (char*) malloc (sizeof(char)*file_size);
if (buffer == NULL) {printf ("Memory error\n"); return 3;}
// copy the file into the buffer:
result = fread (buffer,1,file_size,pFile);
if (result != file_size) {printf ("Reading error\n"); return 4;}
pattern_size = strlen(pattern);
match_size = file_size - pattern_size + 1;
results = (int *)malloc(blocks*sizeof(int));
cudaMalloc((void **)&results_dev, blocks*sizeof(int));
cudaMalloc((void **)&buffer_dev, file_size*sizeof(char));
cudaMalloc((void **)&pattern_dev, pattern_size*sizeof(char));
cudaEventRecord(total_start);
cudaEventRecord(comp_start);
cudaMemcpy(buffer_dev, buffer, file_size*sizeof(char), cudaMemcpyHostToDevice);
cudaMemcpy(pattern_dev, pattern, pattern_size*sizeof(char), cudaMemcpyHostToDevice);
total_threads = blocks*threads_per_block;
slice = match_size/total_threads;
extra = match_size%total_threads;
string_matching<<<blocks, threads_per_block>>>(buffer_dev, pattern_dev, match_size, pattern_size, blocks, slice, extra, results_dev);
cudaEventRecord(comp_stop);
cudaEventSynchronize(comp_stop);
cudaEventElapsedTime(&comp_time, comp_start, comp_stop);
cudaMemcpy(results, results_dev, blocks*sizeof(int), cudaMemcpyDeviceToHost);
total_matches = 0;
for(i=0; i<blocks; i++){
total_matches += results[i];
}
cudaEventRecord(total_stop);
cudaEventSynchronize(total_stop);
cudaEventElapsedTime(&total_time, total_start, total_stop);
cudaFree(results_dev);
cudaFree(buffer_dev);
cudaFree(pattern_dev);
fclose (pFile);
free (buffer);
//Print result
printf("Total matches: %d\n", total_matches);
printf("\n\n\nN: %d, Blocks: %d, Threads: %d\n", file_size, blocks, blocks*threads_per_block);
printf("Total time (ms): %.3f\n", total_time);
printf("Kernel time (ms): %.3f\n", comp_time);
printf("Data transfer time(ms): %.3f\n\n\n", total_time-comp_time);
}
The stride on your loop is not correct for using multiple blocks, i.e. both blocks were doing the full amount of work (and therefore finding two entries each). The correct grid-stride loop looks like the following
for (int tid = blockIdx.x * blockDim.x + threadIdx.x;
tid < match_size;
tid += blockDim.x * gridDim.x /* <-- fix */) {
// ...
}
Related
I need to sum up a vector, which is longer than the number of threads in a cuda block. So I use multi blocks to handle the task. I sum up a part of the vector within each block, after which I have two options, one is to use atomicAdd to combine the sum of each block, and the other is to write the result in some global memory and launch another kernel to sum up. Which method do you recommand me to use ?
Is cuda atomicAdd operation faster than launch another kernel when we do reduce sum?
For the following test case, using the code lifted from slides 16 and 17 in the training here (video), it seems to be a bit faster. The difference is about the cost of kernel launch overhead, which makes sense:
$ cat t1834.cu
#include <time.h>
#include <sys/time.h>
#include <iostream>
const int BLOCK_SIZE = 1024;
template <typename T>
__global__ void reduce(const T * __restrict__ gdata, T * __restrict__ out, const int N){
__shared__ T sdata[BLOCK_SIZE];
int tid = threadIdx.x;
sdata[tid] = 0.0;
size_t idx = threadIdx.x+blockDim.x*blockIdx.x;
while (idx < N) { // grid stride loop to load data
sdata[tid] += gdata[idx];
idx += gridDim.x*blockDim.x;
}
for (unsigned int s=blockDim.x/2; s>0; s>>=1) {
__syncthreads();
if (tid < s) // parallel sweep reduction
sdata[tid] += sdata[tid + s];
}
if (tid == 0)
#ifndef USE_ATOMIC
out[blockIdx.x] = sdata[0];
#else
atomicAdd(out, sdata[0]);
#endif
}
#define USECPSEC 1000000ULL
unsigned long long dtime_usec(unsigned long long start){
timeval tv;
gettimeofday(&tv, 0);
return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
typedef float mt;
const int ds = 1048576*8;
int main(){
mt *h_gdata, *d_gdata, *h_out, *d_out;
h_gdata = new mt[ds];
cudaMalloc(&d_gdata, ds*sizeof(mt));
const int nblocks = 160;
h_out = new mt[1];
cudaMalloc(&d_out, nblocks*sizeof(mt));
for (int i = 0; i < ds; i++) h_gdata[i] = 1;
cudaMemcpy(d_gdata, h_gdata, ds*sizeof(mt), cudaMemcpyHostToDevice);
reduce<<<nblocks, BLOCK_SIZE>>>(d_gdata, d_out, ds); // warm-up
cudaDeviceSynchronize();
cudaMemset(d_out, 0, sizeof(mt));
unsigned long long dt = dtime_usec(0);
reduce<<<nblocks, BLOCK_SIZE>>>(d_gdata, d_out, ds);
#ifndef USE_ATOMIC
reduce<<<1, BLOCK_SIZE>>>(d_out, d_out, nblocks);
#endif
cudaDeviceSynchronize();
dt = dtime_usec(dt);
cudaMemcpy(h_out, d_out, sizeof(mt), cudaMemcpyDeviceToHost);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {std::cout << "CUDA error: " << cudaGetErrorString(err) << std::endl; return 0;}
if (h_out[0] != ds) {std::cout << "Reduce Error: " << h_out[0] << std::endl; return 0;}
std::cout << "Timing: " << dt << "us" << std::endl;
return 0;
}
$ nvcc -lineinfo -arch=sm_70 -O3 -o t1834 t1834.cu -std=c++14 -Wno-deprecated-gpu-targets
$ ./t1834
Timing: 69us
$ nvcc -lineinfo -arch=sm_70 -O3 -o t1834 t1834.cu -std=c++14 -Wno-deprecated-gpu-targets -DUSE_ATOMIC
$ ./t1834
Timing: 66us
$
(CUDA 11.2, Centos 7, V100 GPU)
I have an array of 300,000 points and I want the fft of every 600 points. I'm attempting to use cufftPlanMany to execute, but I'm getting an unknown error here:
cufftSafeCall(cufftPlanMany(&plan, rank, n, NULL, istride, idist, NULL, 1,1, CUFFT_C2C, 500));
retrevialfft.cu(82) : cufftSafeCall() CUFFT error: <unknown>
Here's the code in context
cudaSetDevice(0);
// Allocate host memory for the signal
cufftComplex* h_signal=(cufftComplex*)malloc(sizeof(cufftComplex) * SIGNAL_SIZE);
// Initalize the memory for the signal
for (unsigned int i = 0; i < SIGNAL_SIZE; ++i) {
h_signal[i].x = rand() / (float)RAND_MAX;
h_signal[i].y = 0;
// printf("Orignal: %f %f \n", h_signal[i].x, h_signal[i].y);
}
int mem_size = sizeof(cufftComplex) * SIGNAL_SIZE;
// Allocate device memory for signal
cufftComplex* d_signal;
cudaMalloc((void**)&d_signal, mem_size);
int rank = 1; //1d plan
int numCols = 300000;
int n[] = {numCols};
int batch = 500;
int istride = 1;
int ostride = 1;
int idist = numCols;
// CUFFT plan
cufftHandle plan;
cufftSafeCall(cufftPlanMany(&plan, rank, n, NULL, istride, idist, NULL, 1,1, CUFFT_C2C, 500));
// Transform signal
printf("Transforming signal cufftExecC2C\n");
cufftSafeCall(cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_FORWARD));
// Copy device memory to host
cufftComplex* h_transformed = (cufftComplex*)malloc(sizeof(cufftComplex) * SIGNAL_SIZE);;
cudaMemcpy(h_transformed, d_signal, mem_size,
cudaMemcpyDeviceToHost);
//Destroy CUFFT context
cufftDestroy(plan);
// cleanup memory
free(h_signal);
free(h_transformed);
cudaFree(d_signal);
cudaDeviceReset();
Any idea of what the error actually is?
You decided not to show any more detail on your question. Below, I'm providing a full working code using cufftPlanMany() to execute batched 1D FFTs. I hope it helps.
#include <stdio.h>
#include <stdlib.h>
#include <cufft.h>
#include <assert.h>
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) { getchar(); exit(code); }
}
}
/*********************/
/* CUFFT ERROR CHECK */
/*********************/
static const char *_cudaGetErrorEnum(cufftResult error)
{
switch (error)
{
case CUFFT_SUCCESS:
return "CUFFT_SUCCESS";
case CUFFT_INVALID_PLAN:
return "CUFFT_INVALID_PLAN";
case CUFFT_ALLOC_FAILED:
return "CUFFT_ALLOC_FAILED";
case CUFFT_INVALID_TYPE:
return "CUFFT_INVALID_TYPE";
case CUFFT_INVALID_VALUE:
return "CUFFT_INVALID_VALUE";
case CUFFT_INTERNAL_ERROR:
return "CUFFT_INTERNAL_ERROR";
case CUFFT_EXEC_FAILED:
return "CUFFT_EXEC_FAILED";
case CUFFT_SETUP_FAILED:
return "CUFFT_SETUP_FAILED";
case CUFFT_INVALID_SIZE:
return "CUFFT_INVALID_SIZE";
case CUFFT_UNALIGNED_DATA:
return "CUFFT_UNALIGNED_DATA";
}
return "<unknown>";
}
#define cufftSafeCall(err) __cufftSafeCall(err, __FILE__, __LINE__)
inline void __cufftSafeCall(cufftResult err, const char *file, const int line)
{
if( CUFFT_SUCCESS != err) {
fprintf(stderr, "CUFFT error in file '%s', line %d\n %s\nerror %d: %s\nterminating!\n",__FILE__, __LINE__,err, \
_cudaGetErrorEnum(err)); \
cudaDeviceReset(); assert(0); \
}
}
/********/
/* MAIN */
/********/
void main() {
int batch = 3; // --- How many transforms to be performed
int numCols = 16; // --- Size of each transform
int SIGNAL_SIZE = batch * numCols; // --- Overall size for all the signals
// --- Allocate host memory for all the signals
cufftComplex* h_signal=(cufftComplex*)malloc(sizeof(cufftComplex) * SIGNAL_SIZE);
// --- Initalize host memory for all the signals
for (unsigned int i = 0; i < SIGNAL_SIZE; ++i) {
h_signal[i].x = 1.f;
h_signal[i].y = 0.f;
}
// --- Allocate device memory for all the signals
cufftComplex* d_signal; gpuErrchk(cudaMalloc((void**)&d_signal, sizeof(cufftComplex) * SIGNAL_SIZE));
// --- Host to Device memcopy
gpuErrchk(cudaMemcpy(d_signal, h_signal, sizeof(cufftComplex) * SIGNAL_SIZE, cudaMemcpyHostToDevice));
int rank = 1; // --- 1d plan
int n[] = {numCols};
int istride = 1;
int ostride = 1;
int idist = numCols;
int odist = numCols;
// --- CUFFT plan
cufftHandle plan;
cufftSafeCall(cufftPlanMany(&plan, rank, n, NULL, istride, idist, NULL, ostride, odist, CUFFT_C2C, 500));
// --- Signals transformations
cufftSafeCall(cufftExecC2C(plan, (cufftComplex*)d_signal, (cufftComplex*)d_signal, CUFFT_FORWARD));
// --- Device to Host memcopy
gpuErrchk(cudaMemcpy(h_signal, d_signal, sizeof(cufftComplex) * SIGNAL_SIZE, cudaMemcpyDeviceToHost));
for (unsigned int i = 0; i < SIGNAL_SIZE; ++i) printf("Real part = %f; Imaginar part = %f\n", h_signal[i].x, h_signal[i].y);
// --- Destroy CUFFT context
cufftSafeCall(cufftDestroy(plan));
// --- Memory cleanup
free(h_signal);
gpuErrchk(cudaFree(d_signal));
cudaDeviceReset();
}
I want to perform 441 2D, 32-by-32 FFTs using the batched method provided by the cuFFT library. The parameters of the transform are the following:
int n[2] = {32,32};
int inembed[] = {32,32};
int onembed[] = {32,32/2+1};
cufftPlanMany(&plan,2,n,inembed,1,32*32,onembed,1,32*(32/2+1),CUFFT_D2Z,441);
cufftPlanMany(&inverse_plan,2,n,onembed,1,32*32,inembed,1,32*32,CUFFT_Z2D,441);
After I did the forward and inverse FFTs using the above plans, I could not get the original data back.
Can anyone advise me how to set the parameters correctly for cudaPlanMany? Many thanks in advance.
By the way, is it the best way to use cudaPlanMany for my situation?
Here is a full example on how using cufftPlanMany to perform batched direct and inverse transformations in CUDA. The example refers to float to cufftComplex transformations and back. The final result of the direct+inverse transformation is correct but for a multiplicative constant equal to the overall number of matrix elements nRows*nCols.
#include <stdio.h>
#include <stdlib.h>
#include <cufft.h>
#include <assert.h>
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) { getchar(); exit(code); }
}
}
/*********************/
/* CUFFT ERROR CHECK */
/*********************/
static const char *_cudaGetErrorEnum(cufftResult error)
{
switch (error)
{
case CUFFT_SUCCESS:
return "CUFFT_SUCCESS";
case CUFFT_INVALID_PLAN:
return "CUFFT_INVALID_PLAN";
case CUFFT_ALLOC_FAILED:
return "CUFFT_ALLOC_FAILED";
case CUFFT_INVALID_TYPE:
return "CUFFT_INVALID_TYPE";
case CUFFT_INVALID_VALUE:
return "CUFFT_INVALID_VALUE";
case CUFFT_INTERNAL_ERROR:
return "CUFFT_INTERNAL_ERROR";
case CUFFT_EXEC_FAILED:
return "CUFFT_EXEC_FAILED";
case CUFFT_SETUP_FAILED:
return "CUFFT_SETUP_FAILED";
case CUFFT_INVALID_SIZE:
return "CUFFT_INVALID_SIZE";
case CUFFT_UNALIGNED_DATA:
return "CUFFT_UNALIGNED_DATA";
}
return "<unknown>";
}
#define cufftSafeCall(err) __cufftSafeCall(err, __FILE__, __LINE__)
inline void __cufftSafeCall(cufftResult err, const char *file, const int line)
{
if( CUFFT_SUCCESS != err) {
fprintf(stderr, "CUFFT error in file '%s', line %d\n %s\nerror %d: %s\nterminating!\n",__FILE__, __LINE__,err, \
_cudaGetErrorEnum(err)); \
cudaDeviceReset(); assert(0); \
}
}
/********/
/* MAIN */
/********/
void main() {
cufftHandle forward_plan, inverse_plan;
int batch = 3;
int rank = 2;
int nRows = 5;
int nCols = 5;
int n[2] = {nRows, nCols};
int idist = nRows*nCols;
int odist = nRows*(nCols/2+1);
int inembed[] = {nRows, nCols};
int onembed[] = {nRows, nCols/2+1};
int istride = 1;
int ostride = 1;
cufftSafeCall(cufftPlanMany(&forward_plan, rank, n, inembed, istride, idist, onembed, ostride, odist, CUFFT_R2C, batch));
float *h_in = (float*)malloc(sizeof(float)*nRows*nCols*batch);
for(int i=0; i<nRows*nCols*batch; i++) h_in[i] = 1.f;
float2* h_freq = (float2*)malloc(sizeof(float2)*nRows*(nCols/2+1)*batch);
float* d_in; gpuErrchk(cudaMalloc(&d_in, sizeof(float)*nRows*nCols*batch));
float2* d_freq; gpuErrchk(cudaMalloc(&d_freq, sizeof(float2)*nRows*(nCols/2+1)*batch));
gpuErrchk(cudaMemcpy(d_in,h_in,sizeof(float)*nRows*nCols*batch,cudaMemcpyHostToDevice));
cufftSafeCall(cufftExecR2C(forward_plan, d_in, d_freq));
gpuErrchk(cudaMemcpy(h_freq,d_freq,sizeof(float2)*nRows*(nCols/2+1)*batch,cudaMemcpyDeviceToHost));
for(int i=0; i<nRows*(nCols/2+1)*batch; i++) printf("Direct transform: %i %f %f\n",i,h_freq[i].x,h_freq[i].y);
cufftSafeCall(cufftPlanMany(&inverse_plan, rank, n, onembed, ostride, odist, inembed, istride, idist, CUFFT_C2R, batch));
cufftSafeCall(cufftExecC2R(inverse_plan, d_freq, d_in));
gpuErrchk(cudaMemcpy(h_in,d_in,sizeof(float)*nRows*nCols*batch,cudaMemcpyDeviceToHost));
for(int i=0; i<nRows*nCols*batch; i++) printf("Inverse transform: %i %f \n",i,h_in[i]);
getchar();
}
I'm now only need to show an intermediate progress of matrix multiplication.
for(unsigned int col=0; col<mtxSize; col++) {
unsigned tmp = 0;
for(unsigned int row=0; row<mtxSize; row++) {
for(unsigned int idx=0; idx<mtxSize; idx++) {
tmp += h_A[col*mtxSize+idx] * h_B[idx*mtxSize+row];
}
h_Rs[col*mtxSize+row] = tmp;
tmp = 0;
int rate_tmp = (col*mtxSize + (row+1))*100;
// Maybe like this...
fprintf(stdout, "Progress : %d.%d %%\r", rate_tmp/actMtxSize, rate_tmp%actMtxSize);
fflush(stdout);
}
}
In the case of the host code(use CPU), it is very easy beacause it process sequentially so we can check very easily.
But in the case of the GPU which process in parallel, what should I do?
Once the kernel is running, it does not return until finish the kernel execution.
So I can't check mid-data during the kernel execution time.
I think I need to use asynchronous kernel call, but I do not know well.
And even if the asynchronous kernel call is used, to see all of the data into several blocks over processors, do I have to write atomicAdd() (in other words, global memory access) function which is including some overhead?
Give me some advice or hint.
And I want to know in the case of CUDA.
Here is a code which demonstrates how to check progress from a matrix multiply kernel:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define TIME_INC 100000000
#define INCS 10
#define USE_PROGRESS 1
#define MAT_DIMX 4000
#define MAT_DIMY MAT_DIMX
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
__global__ void mykernel(volatile int *data){
unsigned long time;
for (int i = 0; i < INCS; i++){
atomicAdd((int *)data,1);
__threadfence_system();
time = clock64();
while((clock64() - time)<TIME_INC) {};
}
printf("progress check finished\n");
}
__global__ void matmult(float *a, float *b, float *c, unsigned int rowA, unsigned int colA, unsigned int colB, volatile int *progress){
unsigned int row = threadIdx.x+blockDim.x*blockIdx.x;
unsigned int col = threadIdx.y+blockDim.y*blockIdx.y;
if ((row < rowA) && (col < colB)){
float temp = 0.0f;
for (unsigned int k = 0; k < colA; k++)
temp += a[(row*colA)+k] * b[(k*colB) + col];
c[(row*colB)+col] = temp;
#if USE_PROGRESS
if (!(threadIdx.x || threadIdx.y)){
atomicAdd((int *)progress, 1);
__threadfence_system();
}
#endif
}
}
int main(){
// simple test to demonstrate reading progress data from kernel
volatile int *d_data, *h_data;
cudaSetDeviceFlags(cudaDeviceMapHost);
cudaCheckErrors("cudaSetDeviceFlags error");
cudaHostAlloc((void **)&h_data, sizeof(int), cudaHostAllocMapped);
cudaCheckErrors("cudaHostAlloc error");
cudaHostGetDevicePointer((int **)&d_data, (int *)h_data, 0);
cudaCheckErrors("cudaHostGetDevicePointer error");
*h_data = 0;
printf("kernel starting\n");
mykernel<<<1,1>>>(d_data);
cudaCheckErrors("kernel fail");
int value = 0;
do{
int value1 = *h_data;
if (value1 > value){
printf("h_data = %d\n", value1);
value = value1;}}
while (value < (INCS-1));
cudaDeviceSynchronize();
cudaCheckErrors("kernel fail 2");
// now try matrix multiply with progress
float *h_c, *d_a, *d_b, *d_c;
h_c = (float *)malloc(MAT_DIMX*MAT_DIMY*sizeof(float));
if (h_c == NULL) {printf("malloc fail\n"); return 1;}
cudaMalloc((void **)&d_a, MAT_DIMX*MAT_DIMY*sizeof(float));
cudaCheckErrors("cudaMalloc a fail");
cudaMalloc((void **)&d_b, MAT_DIMX*MAT_DIMY*sizeof(float));
cudaCheckErrors("cudaMalloc b fail");
cudaMalloc((void **)&d_c, MAT_DIMX*MAT_DIMY*sizeof(float));
cudaCheckErrors("cudaMalloc c fail");
for (int i = 0; i < MAT_DIMX*MAT_DIMY; i++) h_c[i] = rand()/(float)RAND_MAX;
cudaMemcpy(d_a, h_c, MAT_DIMX*MAT_DIMY*sizeof(float), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy a fail");
cudaMemcpy(d_b, h_c, MAT_DIMX*MAT_DIMY*sizeof(float), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy b fail");
cudaEvent_t start, stop;
cudaEventCreate(&start); cudaEventCreate(&stop);
*h_data=0;
dim3 block(16,16);
dim3 grid(((MAT_DIMX+block.x-1)/block.x), ((MAT_DIMY+block.y-1)/block.y));
printf("matrix multiply kernel starting\n");
cudaEventRecord(start);
matmult<<<grid,block>>>(d_a, d_b, d_c, MAT_DIMY, MAT_DIMX, MAT_DIMX, d_data);
cudaEventRecord(stop);
#if USE_PROGRESS
unsigned int num_blocks = grid.x*grid.y;
float my_progress = 0.0f;
value = 0;
printf("Progress:\n");
do{
cudaEventQuery(stop); // may help WDDM scenario
int value1 = *h_data;
float kern_progress = (float)value1/(float)num_blocks;
if ((kern_progress - my_progress)> 0.1f) {
printf("percent complete = %2.1f\n", (kern_progress*100));
my_progress = kern_progress;}}
while (my_progress < 0.9f);
printf("\n");
#endif
cudaEventSynchronize(stop);
cudaCheckErrors("event sync fail");
float et;
cudaEventElapsedTime(&et, start, stop);
cudaCheckErrors("event elapsed time fail");
cudaDeviceSynchronize();
cudaCheckErrors("mat mult kernel fail");
printf("matrix multiply finished. elapsed time = %f milliseconds\n", et);
return 0;
}
The code associated with the first kernel call is just to demonstrate the basic idea of having a kernel report it's progress back.
The second part of the code shows a sample, naive matrix multiply on the GPU, with the GPU reporting it's progress back. I have included the ability to remove the progress check code via a preprocessor macro, as well as the ability to time the matrix multiply kernel. For the case I have here, there was no discernible difference in timing with or without the progress code. So while the progress reporting code probably does add some overhead, when compared to the scope of a reasonable sized matrix multiply kernel, it adds no significant time that I can see.
Some other uses of signalling are discussed here
I am trying to find the maximum of an array.. I took the help from CUDA Maximum Reduction Algorithm Not Working. and do some own modification. However I am running it for 16 data. I am finding that in kernel code shared memory copies only 1st 4data. rest are lost. I put two cuPrintf..1st printf shows data is their in the shared memory. But the 2nd cuPrintf is just after __syncthreads.. and that shows 0 from thread ids 4 onwords.. pls help
#include
#include
#include
#include
#include
#include "cuPrintf.cu"
#include "cuPrintf.cuh"
__device__ float MaxOf2(float a, float b)
{
if(a > b) return a;
else return b;
}
__global__ void findMax(int size,float *array_device , float *outPut)
{
extern __shared__ float sdata[];
int tid = threadIdx.x;
int i = blockIdx.x*blockDim.x + threadIdx.x;
if(i< size)
{
sdata[tid] = array_device[i];
cuPrintf(" array_d[%d]===%f, sdata[%d]===%f\n ",i,array_device[i],tid,sdata[tid]);
__threadfence();
}
__syncthreads();
if(tid<size)
cuPrintf(" array_d[%d]===%f, sdata[%d]===%f\n ",i,array_device[i],tid,sdata[tid]);
for ( int s=blockDim.x/2; s>0; s=s>>1)//s=blockDim.x/2
{
if (tid < s)
{
sdata[tid]= MaxOf2(sdata[tid],sdata[tid+s]);
}
__syncthreads();
}
if (tid == 0) outPut[blockIdx.x] = sdata[0];
}
int main()
{
long double M = pow(2,20);
long double N = 2;
int noThreadsPerBlock = 512 ;
printf("\n Provide the array Size N.(array will be of size N * 2^20 ) :-");
scanf("%Lf",&N);
long int size = 16;
int numOfBlock = (int)size /noThreadsPerBlock + 1;
printf("\n num of blocks==%ld",numOfBlock);
float *array_device , *outPut;
float array_host[]={221,100,2,340,47,36,500,1,33,4460,5,6,7,8,9,11};
cudaMalloc((void **)&array_device, size*sizeof(float));
cudaMalloc((void **)&outPut, size*sizeof(float));
cudaError_t error0 = cudaGetLastError();
printf("\n 0CUDA error: %s\n", cudaGetErrorString(error0));
printf("size===%ld",size);
cudaMemcpy(array_device, array_host, size*sizeof(float), cudaMemcpyHostToDevice);
cudaError_t error1 = cudaGetLastError();
printf("\n1CUDA error: %s\n", cudaGetErrorString(error1));
while(size>1 )
{
cudaPrintfInit();
findMax<<< numOfBlock,noThreadsPerBlock>>>(size,array_device, outPut);cudaPrintfDisplay(stdout, true);
cudaPrintfEnd();
cudaError_t error2 = cudaGetLastError();
printf(" 2CUDA error: %s\n", cudaGetErrorString(error2));
cudaMemcpy(array_device, outPut, size*sizeof(float), cudaMemcpyDeviceToDevice);
size = numOfBlock;
printf("\n ****size==%ld\n",size);
numOfBlock = (int)size /noThreadsPerBlock + 1;
}
cudaMemcpy(array_host, outPut, size*sizeof(float), cudaMemcpyDeviceToHost);
cudaError_t error3 = cudaGetLastError();
printf("\n3CUDA error: %s\n", cudaGetErrorString(error3));
for(int i=0;i<size;i++)
printf("\n index==%d ;data=%f ",i,array_host[i]);
return 0;
}
I'm posting my comment as an answer as requested.
Firstly, you havent specified dynamic size of shared memory in kernel launch. It should look something like:
findMax<<< numOfBlock,noThreadsPerBlock,sizeof(float)*noThreadsPerBlock>>>
Secondly, what was the concept behind condition if(tid<size) on second cuPrintf? Providing output of the program could also help.