CUB sum reduction with 2D pitched arrays - cuda

I am trying to perform a sum reduction using CUB and 2D arrays of type float/double.
Although it works for certain combinations of rows+columns, for relatively larger arrays, I get an illegal memory access error during the last transfer.
A minimal example is the following:
#include <stdio.h>
#include <stdlib.h>
#include <cub/device/device_reduce.cuh>
#include "cuda_runtime.h"
#ifdef DP
#define real double
#else
#define real float
#endif
// Fills vec[0 .. num) with pseudo-random values: each rand() result is scaled
// by 1/RAND_MAX, stretched by (finish - start) and shifted by start.
void generatedata(const int num, real* vec, real start, real finish) {
    const real span = finish - start;
    int idx = 0;
    while (idx < num) {
        vec[idx] = rand() / float(RAND_MAX) * span + start;
        ++idx;
    }
}
// Host reference: accumulates vec[0 .. num) front to back and returns the total.
real reduce_to_sum(const int num, const real* vec) {
    real acc = real(0.0);
    int idx = 0;
    while (idx < num) {
        acc += vec[idx];
        ++idx;
    }
    return acc;
}
// Host driver of the reproducer: builds a rows x cols host array, copies it into
// a pitched device allocation, runs cub::DeviceReduce::Sum on it and prints the
// difference between the host reference sum and the device result.
// NOTE(review): every early `return -999` below leaves the earlier allocations
// unreleased; the cleanup at the end of main is only reached on success.
int main() {
int rows = 2001;
int cols = 3145;
// The product is evaluated in int and only then converted to size_t.
size_t msize = rows * cols;
real* data = (real*)malloc(msize * sizeof(real));
if (!data)
return -999;
// Both helpers take `const int num`, so msize is narrowed back to int here.
generatedata(msize, data, 0., 50.);
real ref_sum = reduce_to_sum(msize, data);
real* d_data_in = nullptr;
real* d_data_out = nullptr;
// Row pitches of the two device allocations; used below as byte counts.
size_t pitch_in, pitch_out;
cudaError_t err = cudaMallocPitch(&d_data_in, &pitch_in, cols * sizeof(real), rows);
if (err != cudaSuccess) {
printf("data_in :: %s \n", cudaGetErrorString(err));
return -999;
}
err = cudaMallocPitch(&d_data_out, &pitch_out, cols * sizeof(real), rows);
if (err != cudaSuccess) {
printf("data_out :: %s \n", cudaGetErrorString(err));
return -999;
}
// Clear the whole pitched input allocation (rows * pitch_in bytes) before
// the payload is copied in.
err = cudaMemset(d_data_in, 0, rows * pitch_in);
if (err != cudaSuccess) {
printf("set data_in :: %s \n", cudaGetErrorString(err));
return -999;
}
// Host rows are tightly packed (cols * sizeof(real)); device rows are pitch_in apart.
err = cudaMemcpy2D(d_data_in, pitch_in, data, cols * sizeof(real), cols * sizeof(real), rows, cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
printf("copy data :: %s \n", cudaGetErrorString(err));
return -999;
}
void* d_temp = nullptr;
size_t temp_bytes = 0;
// First call is made with d_temp == nullptr; temp_bytes is then used as the
// size of the temporary buffer allocated right after.
// NOTE(review): the last argument is rows * pitch_out, a size in bytes of the
// output buffer. The answer following this listing points out that this
// parameter must be the number of input elements.
cub::DeviceReduce::Sum(d_temp, temp_bytes, d_data_in, d_data_out, rows * pitch_out);
err = cudaMalloc(&d_temp, temp_bytes);
if (err != cudaSuccess) {
printf("temp :: %s \n", cudaGetErrorString(err));
return -999;
}
// The message below says "set temp", but the buffer being cleared is d_data_out.
err = cudaMemset(d_data_out, 0, rows * pitch_out);
if (err != cudaSuccess) {
printf("set temp :: %s \n", cudaGetErrorString(err));
return -999;
}
// Run sum-reduction
// NOTE(review): same byte-count-as-element-count argument as in the size query above.
cub::DeviceReduce::Sum(d_temp, temp_bytes, d_data_in, d_data_out, rows * pitch_out);
// The return value of Sum itself is not inspected. Per the answer below, the
// call may return before the reduction has finished, so an execution error
// can surface at the cudaMemcpy that follows instead of here.
err = cudaGetLastError();
if (err != cudaSuccess) {
printf("reduction :: %s \n", cudaGetErrorString(err));
return -999;
}
real gpu_sum = real(0.0);
// Only the first element of d_data_out is read back as the result.
err = cudaMemcpy(&gpu_sum, d_data_out, sizeof(real), cudaMemcpyDeviceToHost);
if (err != cudaSuccess) {
printf("copy final :: %s \n", cudaGetErrorString(err));
return -999;
}
printf("Difference in sum (h)%f - (d)%f = %f \n", ref_sum, gpu_sum, ref_sum - gpu_sum);
// Release host and device buffers, then reset the device.
if (data) free(data);
if (d_data_in) cudaFree(d_data_in);
if (d_data_out) cudaFree(d_data_out);
if (d_temp) cudaFree(d_temp);
cudaDeviceReset();
return 0;
}
The error is thrown at "copy final ::". I am a bit confused as to why certain rows x columns combinations work and others don't. I did notice that it is the larger values that cause it, but I can't get my head around why.
Any suggestions would be much appreciated.

The 5th parameter of cub::DeviceReduce::Sum should be the number of input elements. However, rows * pitch_out is the size of the output buffer in bytes.
Assuming pitch_in % sizeof(real) == 0, the following call may work.
cub::DeviceReduce::Sum(d_temp, temp_bytes, d_data_in, d_data_out, rows * (pitch_in / sizeof(real)));
Also note that cub::DeviceReduce::Sum may return before the reduction is complete. In this case, if any error happened during execution, this error will be reported by cudaMemcpy.

Related

MAGMA: function "magma_dgels_gpu" --> error "magma_trans_t"

I am trying to solve a least squares problem via the "magma_dgels_gpu()" function of the MAGMA library. My GPU is a "Tesla C2050 / C2075" and I have installed MAGMA.
I am trying to compile the below code "testMagmaDGELS.cu", but i get error:
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cublas.h>
#include "magma.h"
/*
 * Allocates room for (size_) objects of type_ on the host heap and stores the
 * result in ptr_. On allocation failure it reports the variable name on
 * stderr and terminates the process with exit(-1).
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement and can be used safely in an unbraced if/else.
 */
#define UTILS_MALLOC(ptr_, type_, size_) \
    do { \
        (ptr_) = (type_*)malloc((size_) * sizeof(type_)); \
        if ((ptr_) == 0) { \
            fprintf(stderr, "!!!! Malloc failed for: %s\n", #ptr_); \
            exit(-1); \
        } \
    } while (0)
/*
 * Allocates room for (size_) objects of type_ with cudaMalloc and stores the
 * device pointer in ptr_. If cudaMalloc does not return cudaSuccess, it
 * reports the variable name on stderr and terminates the process with exit(-1).
 *
 * Wrapped in do { } while (0) so the macro behaves as one statement, e.g.
 * inside an unbraced if/else.
 */
#define UTILS_DEVALLOC(ptr_, type_, size_) \
    do { \
        if (cudaSuccess != cudaMalloc((void**)&(ptr_), (size_) * sizeof(type_))) { \
            fprintf(stderr, "!!!! cudaMalloc failed for: %s\n", #ptr_); \
            exit(-1); \
        } \
    } while (0)
/*
 * Test driver: uploads a small M x N system to the device, asks
 * magma_dgels_gpu for its workspace size, solves the least-squares problem
 * and prints the first N entries of the returned right-hand side.
 */
int main(int argc, char** argv)
{
if( CUBLAS_STATUS_SUCCESS != cublasInit( ) ) {
fprintf(stderr, "CUBLAS: Not initialized\n"); exit(-1);
}
double *devA, *devB, *pWork, lWorkQuery[1];
const int M = 5, N = 3;
int ret, info;
/* Allocate device memory for the matrix (column-major) */
int lda = M;
/* Device leading dimension: M rounded up to the next multiple of 32. */
int ldda = ((M + 31) / 32) * 32;
UTILS_DEVALLOC(devA, double, ldda * N);
UTILS_DEVALLOC(devB, double, M);
/* Initialize the matrix */
/* A is declared [N][M]: each initializer row holds M values, i.e. one column
   of the column-major M x N matrix (host leading dimension lda = M). */
double A[N][M] = {{ 0.6, 5.0, 1.0, -1.0, -4.2 },
{ 1.2, 4.0, -4.0, -2.0, -8.4 },
{ 3.9, 2.5, -5.5, -6.5, -4.8 }};
/* NOTE(review): the status returned by cublasSetMatrix is not checked here
   or for B below. */
cublasSetMatrix(M, N, sizeof(double), A, lda, devA, ldda);
double B[M] = {3.0, 4.0, -1.0, -5.0, -1.0};
cublasSetMatrix(M, 1, sizeof(double), B, M, devB, M);
/* Resolve the LLSP using MAGMA */
/* First call passes lwork = -1 and a one-element buffer to query the size.
   NOTE(review): 'N' is a char literal; the compiler output quoted after this
   listing rejects it because the parameter type is magma_trans_t. */
ret = magma_dgels_gpu('N', M, N, 1 /* nb of colums in the matrix B */,
devA, ldda, devB, M,
lWorkQuery, -1, // query the optimal work space
&info);
if (info < 0) {
printf("Argument %d of magma_dgels_gpu had an illegal value.\n", -info);
exit(1);
} else if (ret != MAGMA_SUCCESS) {
printf("magma_dgels_gpu failed (code %d).\n", ret);
exit(1);
}
/* The queried size is read back from lWorkQuery[0] and truncated to int. */
int lwork = (int)lWorkQuery[0];
printf("Optimal work space %d\n", lwork);
UTILS_MALLOC(pWork, double, lwork);
/* Second call performs the actual solve with the host workspace pWork. */
ret = magma_dgels_gpu('N', M, N, 1 /* nb of colums in the matrix B */,
devA, ldda, devB, M,
pWork, lwork,
&info);
if (info < 0) {
printf("Argument %d of magma_dgels_gpu had an illegal value.\n", -info);
exit(1);
} else if (ret != MAGMA_SUCCESS) {
printf("magma_dgels_gpu failed (code %d).\n", ret);
exit(1);
} else {
printf("LLSP solved successfully\n");
}
/* Copy all M entries of devB back; only the first N are printed below. */
cublasGetMatrix(M, 1, sizeof(double), devB, M, B, M);
/* Expected solution vector: 0.953333 -0.843333 0.906667 */
printf("Solution vector:\n");
for (int i = 0; i < N; i++) {
printf("\t%lf\n", B[i]);
}
/* Memory clean up */
free( pWork );
cudaFree( devA );
cudaFree( devB );
/* Shutdown */
cublasShutdown();
return 0;
}
I compile it as follows:
nvcc -arch=sm_20 testMagmaDGELS.cu -o testMagmaDGELS -lcublas -I/opt/magma/1.7.0/openblas/gcc/include
And I get these errors:
team24#tesla:~$ nvcc -arch=sm_20 testMagmaDGELS.cu -o testMagmaDGELS -lcublas -I/opt/magma/1.7.0/openblas/gcc/include
testMagmaDGELS.cu(54): error: argument of type "char" is incompatible with parameter of type "magma_trans_t"
testMagmaDGELS.cu(70): error: argument of type "char" is incompatible with parameter of type "magma_trans_t"
2 errors detected in the compilation of "/tmp/tmpxft_00002d95_00000000-8_testMagmaDGELS.cpp1.ii".
Could anyone help me?
Use the magma type for indication of transpose/no transpose, instead of using a char type.
so instead of this:
ret = magma_dgels_gpu('N', ...
do this:
magma_trans_t my_trans = MagmaNoTrans;
ret = magma_dgels_gpu(my_trans, ...
See the documentation here.
magma_trans_t magma_trans_const(character): maps 'N', 'T', 'C'
to MagmaNoTrans, MagmaTrans, MagmaConjTrans respectively.

How to download a file from http using C?

I spent the last days trying to figure out how to download a file from an URL.
This is my first challenge with socket and I'm using it to have an understanding of protocols so I would like to do it without cURL libraries and only in C language!!
I searched a lot....now I'm able to printf the source code of a page but I think it's different with a file, I don't have only to put the received data from a buffer to a file, right?
any tips?
#include <sys/socket.h>
#include <sys/types.h>
#include <netinet/in.h>
#include <netdb.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <errno.h>
#include <arpa/inet.h>
/*
 * First attempt: resolves `domain`, opens a TCP connection to port 80, sends
 * one GET request, performs a single recv() and writes whatever follows the
 * first blank line of the response to "received_file".
 */
int main(void)
{
char domain[] = "www.sstatic.net", path[]="stackexchange/img/logos/so/so-logo-med.png"; //example
int sock, bytes_received;
char send_data[1024],recv_data[9999], *p;
struct sockaddr_in server_addr;
struct hostent *he;
FILE *fp;
he = gethostbyname(domain);
if (he == NULL){
herror("gethostbyname");
exit(1);
}
if ((sock = socket(AF_INET, SOCK_STREAM, 0))== -1){
perror("Socket");
exit(1);
}
/* Fill in the IPv4 endpoint: port 80 at the first address returned for the host. */
server_addr.sin_family = AF_INET;
server_addr.sin_port = htons(80);
server_addr.sin_addr = *((struct in_addr *)he->h_addr);
bzero(&(server_addr.sin_zero),8);
if (connect(sock, (struct sockaddr *)&server_addr,sizeof(struct sockaddr)) == -1){
perror("Connect");
exit(1);
}
/* NOTE(review): the Host header is formatted as "Host: /%s", i.e. with a '/'
   in front of the domain name. */
snprintf(send_data, sizeof(send_data), "GET /%s HTTP/1.1\r\nHost: /%s\r\n\r\n", path, domain);
//printf("%s\n", send_data);
/* The return value of send() is not checked. */
send(sock, send_data, strlen(send_data), 0);
printf("Data sended.\n");
/* NOTE(review): fp is used below without checking that fopen succeeded. */
fp=fopen("received_file","wb");
/* Only one recv() is issued, so at most 9999 bytes of the response are seen.
   NOTE(review): recv may fill all 9999 bytes, in which case the terminator
   below is written at index 9999, one past the end of recv_data; a return
   of -1 would index before the array. */
bytes_received = recv(sock, recv_data, 9999, 0);
recv_data[bytes_received] = '\0';
printf("Data receieved.\n");
printf("%s\n", recv_data);
/* NOTE(review): strstr returns NULL when the separator is absent; p is
   advanced and dereferenced without a check. */
p = strstr(recv_data, "\r\n\r\n"); //to find "\r\n\r\n" sequence and put the pointer p after that
p=p+4;
/* NOTE(review): strlen(p) stops at the first zero byte, so a binary body is
   cut short at its first 0x00. */
fwrite(p,strlen(p),1,fp);
close(sock);
fclose(fp);
return 0;
}
UPDATE 1: thanks to milevyo for some improvements!
It works well with a txt file, but it doesn't work with other kinds of files (a png in this case).
#include <sys/socket.h>
#include <sys/types.h>
#include <netinet/in.h>
#include <netdb.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <errno.h>
#include <arpa/inet.h>
/*
 * Second attempt: same connection setup as before, but the response is read in
 * a loop and every received chunk is appended to "received_file" and echoed
 * to stdout.
 */
int main(void){
//char domain[] = "www.gnu.org", path[]="/licenses/gpl.txt"; //example
char domain[] = "sstatic.net", path[]="stackexchange/img/logos/so/so-logo-med.png"; //example
int sock, bytes_received;
char send_data[1024],recv_data[9999];
struct sockaddr_in server_addr;
struct hostent *he;
FILE *fp;
he = gethostbyname(domain);
if (he == NULL){
herror("gethostbyname");
exit(1);
}
if ((sock = socket(AF_INET, SOCK_STREAM, 0))== -1){
perror("Socket");
exit(1);
}
/* IPv4 endpoint: port 80 at the first address returned for the host. */
server_addr.sin_family = AF_INET;
server_addr.sin_port = htons(80);
server_addr.sin_addr = *((struct in_addr *)he->h_addr);
bzero(&(server_addr.sin_zero),8);
printf("Connecting ...\n");
if (connect(sock, (struct sockaddr *)&server_addr,sizeof(struct sockaddr)) == -1){
perror("Connect");
exit(1);
}
printf("Sending data ...\n");
/* NOTE(review): the Host header is formatted as "Host: /%s", i.e. with a '/'
   in front of the domain name. */
snprintf(send_data, sizeof(send_data), "GET /%s HTTP/1.1\r\nHost: /%s\r\n\r\n", path, domain);
if(send(sock, send_data, strlen(send_data), 0)==-1){
perror("send");
exit(2);
}
printf("Data sent.\n");
/* NOTE(review): fp is used below without checking that fopen succeeded. */
fp=fopen("received_file","wb");
printf("Recieving data...\n\n");
/* The loop runs only while recv returns a positive count, so the -1 branch
   inside it can never be taken; a recv error simply ends the loop. */
while((bytes_received = recv(sock, recv_data, 9999, 0))>0){
if(bytes_received==-1){
perror("recieve");
exit(3);
}
/* NOTE(review): when recv fills all 9999 bytes this writes index 9999, one
   past the end of recv_data. */
recv_data[bytes_received] = '\0';
/* Everything received is written, response headers included. The item size
   is bytes_received and the item count is 1. */
fwrite(recv_data,bytes_received,1,fp);
printf("%s", recv_data);
}
close(sock);
fclose(fp);
printf("\n\nDone.\n\n");
return 0;
}
This code produces a 334-byte file (instead of the 12.4 kB of the original file) with the following content:
HTTP/1.1 400 Bad Request
Date: Sat, 28 Nov 2015 16:20:45 GMT
Content-Type: text/html
Content-Length: 177
Connection: close
Server: -nginx
CF-RAY: -
<html>
<head><title>400 Bad Request</title></head>
<body bgcolor="white">
<center><h1>400 Bad Request</h1></center>
<hr><center>cloudflare-nginx</center>
</body>
</html>
somebody knows how to fix this "400 Bad Request"?
This is an update to the previously posted code. The HTTP protocol is far too large to be implemented completely in such a small example.
Reformatting the code, or suggesting modifications to it, is more than welcome.
#include <sys/socket.h>
#include <sys/types.h>
#include <netinet/in.h>
#include <netdb.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <errno.h>
#include <arpa/inet.h>
#include <string.h>
/*
 * Reads the HTTP status line from `sock` one byte at a time, stopping right
 * after its terminating "\r\n", echoes the line to stdout and returns the
 * numeric status code.
 *
 * Returns 0 when no complete status line was obtained: the peer closed the
 * connection first, the line did not fit into the local buffer, or no status
 * number could be parsed. Calls exit(1) if recv() reports an error.
 */
int ReadHttpStatus(int sock){
    char buff[1024] = "";
    /* Start at buff[1]: buff[0] stays '\0', so ptr[-1] is always readable. */
    char *ptr = buff + 1;
    /* Last position that may receive a byte; it doubles as the slot for the
       terminating '\0', so the buffer can never be overrun. */
    char *const limit = buff + sizeof(buff) - 1;
    int bytes_received = 0;
    int status = 0;
    int complete = 0;   /* set once the closing "\r\n" has been seen */

    printf("Begin Response ..\n");
    while (ptr < limit && (bytes_received = recv(sock, ptr, 1, 0)) != 0) {
        if (bytes_received == -1) {
            perror("ReadHttpStatus");
            exit(1);
        }
        if ((ptr[-1] == '\r') && (*ptr == '\n')) {
            complete = 1;
            break;
        }
        ptr++;
    }
    *ptr = 0;           /* overwrites the final '\n' (or terminates a partial line) */
    ptr = buff + 1;

    /* "<version> <code> ...": skip the first token, read the number. */
    if (sscanf(ptr, "%*s %d ", &status) != 1)
        status = 0;

    printf("%s\n", ptr);
    printf("status=%d\n", status);
    printf("End Response ..\n");
    return complete ? status : 0;
}
//the only filed that it parsed is 'Content-Length'
/*
 * Consumes the HTTP header block from `sock` one byte at a time, up to and
 * including the blank line ("\r\n\r\n") that ends it, and extracts the value
 * of the "Content-Length:" field.
 *
 * Returns the Content-Length value, -1 when the field is absent or has no
 * parsable number, or 0 when the peer closed the connection before the end
 * of the header. Calls exit(1) if recv() reports an error.
 *
 * Only the first sizeof(buff)-1 header bytes are kept for the field search;
 * anything beyond that is still read from the socket but discarded, so an
 * oversized header cannot overrun the buffer.
 */
int ParseHeader(int sock){
    static const char field_name[] = "Content-Length:";
    char buff[1024] = "";
    size_t used = 0;                    /* bytes stored in buff, always < sizeof(buff) */
    char tail[4] = { 0, 0, 0, 0 };      /* the four most recently received bytes */
    char ch;
    int bytes_received;
    int content_length = -1;            /* -1 means "unknown size" */
    const char *field;

    printf("Begin HEADER ..\n");
    while ((bytes_received = recv(sock, &ch, 1, 0)) != 0) {
        if (bytes_received == -1) {
            perror("Parse Header");
            exit(1);
        }
        tail[0] = tail[1];
        tail[1] = tail[2];
        tail[2] = tail[3];
        tail[3] = ch;
        if ((tail[0] == '\r') && (tail[1] == '\n') &&
            (tail[2] == '\r') && (tail[3] == '\n'))
            break;
        if (used < sizeof(buff) - 1)
            buff[used++] = ch;
    }
    buff[used] = 0;

    if (bytes_received == 0) {
        /* Connection closed before the header ended: nothing to parse. */
        printf("End HEADER ..\n");
        return 0;
    }

    field = strstr(buff, field_name);
    if (field != NULL) {
        /* %d skips any whitespace between the colon and the number. */
        if (sscanf(field + strlen(field_name), "%d", &content_length) != 1)
            content_length = -1;
    }
    printf("Content-Length: %d\n", content_length);
    printf("End HEADER ..\n");
    return content_length;
}
/*
 * Downloads http://<domain>/<path> over a plain TCP connection and stores the
 * response body in "test.png".
 *
 * The status line and the headers are consumed by ReadHttpStatus() and
 * ParseHeader(); the body is then copied to the file until Content-Length
 * bytes have arrived or the server closes the connection.
 * Exits with a non-zero status on any socket or file error.
 */
int main(void){
    char domain[] = "sstatic.net", path[]="stackexchange/img/logos/so/so-logo-med.png";
    int sock, bytes_received;
    char send_data[1024], recv_data[1024];
    struct sockaddr_in server_addr;
    struct hostent *he;

    he = gethostbyname(domain);
    if (he == NULL){
        herror("gethostbyname");
        exit(1);
    }

    if ((sock = socket(AF_INET, SOCK_STREAM, 0)) == -1){
        perror("Socket");
        exit(1);
    }

    /* IPv4 endpoint: port 80 at the first address returned for the host. */
    server_addr.sin_family = AF_INET;
    server_addr.sin_port = htons(80);
    server_addr.sin_addr = *((struct in_addr *)he->h_addr);
    bzero(&(server_addr.sin_zero), 8);

    printf("Connecting ...\n");
    if (connect(sock, (struct sockaddr *)&server_addr, sizeof(struct sockaddr)) == -1){
        perror("Connect");
        exit(1);
    }

    printf("Sending data ...\n");
    snprintf(send_data, sizeof(send_data), "GET /%s HTTP/1.1\r\nHost: %s\r\n\r\n", path, domain);
    if (send(sock, send_data, strlen(send_data), 0) == -1){
        perror("send");
        exit(2);
    }
    printf("Data sent.\n");

    printf("Recieving data...\n\n");
    int contentlengh;
    /* A zero status or a zero/closed header result skips the download. */
    if (ReadHttpStatus(sock) && (contentlengh = ParseHeader(sock))){
        int bytes = 0;
        FILE* fd = fopen("test.png", "wb");
        /* Without this check a failed fopen would hand NULL to fwrite/fclose. */
        if (fd == NULL){
            perror("fopen");
            close(sock);
            exit(4);
        }
        printf("Saving data...\n\n");
        while ((bytes_received = recv(sock, recv_data, 1024, 0)) != 0){
            if (bytes_received == -1){
                perror("recieve");
                exit(3);
            }
            fwrite(recv_data, 1, bytes_received, fd);
            bytes += bytes_received;
            printf("Bytes recieved: %d from %d\n", bytes, contentlengh);
            /* contentlengh is -1 when the size is unknown; the loop then
               ends only when the server closes the connection. */
            if (bytes == contentlengh)
                break;
        }
        fclose(fd);
    }

    close(sock);
    printf("\n\nDone.\n\n");
    return 0;
}
Try something like the following:
#include <sys/socket.h>
#include <sys/errno.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <netdb.h>
#include <unistd.h>
#include <string.h>
#include <strings.h>
#include <stdlib.h>
#include <stdio.h>
#include <openssl/ssl.h>
#include <openssl/err.h>
#define BUFLEN 4096
#define HOST "www.t.edu.pk"
#define PORT 443
/*
 * Connects to HOST:PORT over TCP, performs a TLS handshake on that socket,
 * sends a fixed "GET /" request and prints everything the server sends back.
 * Returns 0 on success and -1 as soon as a step fails.
 */
int main()
{
    int sock, iResult;
    const char *cmd;            /* points at a string literal, hence const */
    char *ip;
    char recvbuf[BUFLEN];
    struct sockaddr_in sin;
    struct hostent* hent;

    /* Resolve the host name and print its first address. */
    hent = gethostbyname(HOST);
    if(hent == NULL)
    {
        printf("gethostbyname failed: %d\n", errno);
        return -1;
    }
    printf("gethostbyname succeeded\n");
    ip = inet_ntoa(*((struct in_addr*)hent->h_addr_list[0]));
    printf("Host IP: %s\n", ip);

    sock = socket(AF_INET, SOCK_STREAM, 0);
    if(sock == -1)
    {
        printf("socket failed: %d\n", errno);
        return -1;
    }
    printf("socket created\n");

    /* Zero the whole structure so no field is left uninitialised. */
    memset(&sin, 0, sizeof(sin));
    sin.sin_family = AF_INET;
    sin.sin_addr.s_addr = inet_addr(ip);
    sin.sin_port = htons(PORT);
    iResult = connect(sock, (struct sockaddr*)&sin, sizeof(sin));
    if(iResult < 0)
    {
        printf("connect failed: %d\n", errno);
        return -1;
    }
    printf("connect succeeded\n");

    /* Set up OpenSSL and create the context / connection objects. */
    iResult = SSL_library_init();
    if(iResult < 0)
    {
        printf("SSL failed\n");
        return -1;
    }
    printf("SSL library initialised\n");
    OpenSSL_add_all_algorithms();
    ERR_load_crypto_strings();
    SSL_load_error_strings();
    SSL_CTX* ctx = SSL_CTX_new(TLSv1_2_client_method());
    if(ctx == NULL)
    {
        printf("ctx failed\n");
        ERR_print_errors_fp(stderr);
        return -1;
    }
    printf("ctx loaded\n");
    SSL* ssl = SSL_new(ctx);
    if(ssl == NULL)
    {
        printf("ssl failed\n");
        ERR_print_errors_fp(stderr);
        return -1;
    }
    printf("ssl loaded\n");
    SSL_set_fd(ssl, sock);

    /* The handshake must succeed (return 1) before any SSL_write/SSL_read. */
    iResult = SSL_connect(ssl);
    if(iResult != 1)
    {
        printf("SSL connect failed\n");
        ERR_print_errors_fp(stderr);
        return -1;
    }

    cmd = "GET / HTTP/1.1\r\nHost: www.t.edu.pk\r\n\r\n";
    iResult = SSL_write(ssl, cmd, strlen(cmd));
    if(iResult <= 0)
    {
        printf("SSL write failed\n");
        ERR_print_errors_fp(stderr);
        return -1;
    }
    printf("Byte(s) sent: %d\n", iResult);

    /* Read until the peer closes the connection or an error occurs. */
    do
    {
        iResult = SSL_read(ssl, recvbuf, BUFLEN - 1);
        if(iResult < 0)
        {
            printf("error receiving data\n");
            break;
        }
        if(iResult == 0)
        {
            printf("host closed connection\n");
            break;
        }
        /* Terminate at the number of bytes actually read so that printf
           never shows leftovers from a previous, longer chunk. */
        recvbuf[iResult] = '\0';
        printf("%s\n", recvbuf);
    }while(iResult > 0);

    /* A return of 0 means the shutdown is not finished yet; only then is a
       second call made to complete it. */
    iResult = SSL_shutdown(ssl);
    if(iResult == 0)
    {
        printf("SSL shutdown in progress...\n");
        iResult = SSL_shutdown(ssl);
    }
    if(iResult == 1)
    {
        printf("SSL shutdown complete!\n");
    }
    if(iResult < 0)
    {
        printf("SSL shutdown unsuccessful!\n");
    }
    SSL_free(ssl);
    SSL_CTX_free(ctx);

    iResult = shutdown(sock, SHUT_RDWR);
    if(iResult == -1)
    {
        printf("Socket shutdown failed: %d\n", errno);
        return -1;
    }
    printf("Socket shutdown succeeded\n");
    iResult = close(sock);
    if(iResult != 0)
    {
        printf("error closing socket: %d\n", errno);
        return -1;
    }
    printf("Socket closed\n");

    return 0;
}
This works for C/C++ in a Linux environment. You can build it with a makefile, or simply pass the -lcurl option to g++.
Note that libcurl must be installed.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <curl/curl.h>
/* Text log receiving one line of chunk sizes per callback invocation.
   NULL if the file could not be opened. */
FILE *fp = fopen("file.txt", "w");
char outfilename[FILENAME_MAX] = "file_downloaded.txt";
/* Receives the raw downloaded bytes. NULL if the file could not be opened. */
FILE *fp1 = fopen(outfilename,"wb");

/* Growable, always NUL-terminated in-memory copy of the downloaded data. */
struct MemoryStruct {
  char *memory;   /* heap block holding `size` payload bytes plus a trailing 0 */
  size_t size;    /* number of payload bytes currently stored */
};

/*
 * Write callback: appends the size*nmemb bytes at `contents` to the
 * MemoryStruct passed via `userp`, mirrors them into fp1 and logs the chunk
 * sizes to fp.
 *
 * Returns the number of bytes consumed (size*nmemb), or 0 when the memory
 * block cannot be grown; in that case the MemoryStruct is left untouched.
 * The two output files are optional: a stream that failed to open is skipped.
 */
static size_t
WriteMemoryCallback(void *contents, size_t size, size_t nmemb, void *userp)
{
  size_t realsize = size * nmemb;
  struct MemoryStruct *mem = (struct MemoryStruct *)userp;

  /* +1 leaves room for the terminating zero written below. */
  char *ptr = (char*)realloc(mem->memory, mem->size + realsize + 1);
  if(!ptr) {
    /* out of memory! */
    printf("not enough memory (realloc returned NULL)\n");
    return 0;
  }

  if(fp)
    fprintf(fp, "%zu - %zu - %zu\n", realsize, size, nmemb);   /* size_t needs %zu */
  if(fp1 && realsize != 0 && fwrite(contents, size, nmemb, fp1) != nmemb)
    fprintf(stderr, "short write to %s\n", outfilename);

  mem->memory = ptr;
  memcpy(&(mem->memory[mem->size]), contents, realsize);
  mem->size += realsize;
  mem->memory[mem->size] = 0;
  return realsize;
}
/*
 * Fetches `link_download` with libcurl, routing every received chunk through
 * WriteMemoryCallback, reports how many bytes were retrieved and then
 * releases the curl handle, the in-memory buffer and the two global output
 * streams. Always returns 0.
 */
int main(void)
{
  char link_download[] = "https://www.example.com/";
  CURL *curl_handle;
  CURLcode res;
  struct MemoryStruct chunk;

  chunk.memory = (char*)malloc(1);  /* will be grown as needed by the realloc in the callback */
  chunk.size = 0;                   /* no data at this point */

  curl_global_init(CURL_GLOBAL_ALL);

  /* init the curl session */
  curl_handle = curl_easy_init();
  if(curl_handle) {
    /* specify URL to get */
    curl_easy_setopt(curl_handle, CURLOPT_URL, link_download);
    /* send all data to this function */
    curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);
    /* we pass our 'chunk' struct to the callback function */
    curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, (void *)&chunk);
    /* some servers do not like requests that are made without a user-agent
       field, so we provide one */
    curl_easy_setopt(curl_handle, CURLOPT_USERAGENT, "libcurl-agent/1.0");

    /* get it! */
    res = curl_easy_perform(curl_handle);

    /* check for errors */
    if(res != CURLE_OK) {
      fprintf(stderr, "curl_easy_perform() failed: %s\n",
              curl_easy_strerror(res));
    }
    else {
      /*
       * Now, our chunk.memory points to a memory block that is chunk.size
       * bytes big and contains the remote file.
       *
       * Do something nice with it!
       */
      printf("%lu bytes retrieved\n", (unsigned long)chunk.size);
    }

    /* cleanup curl stuff */
    curl_easy_cleanup(curl_handle);
  }
  else {
    fprintf(stderr, "curl_easy_init() failed\n");
  }

  free(chunk.memory);

  /* we are done with libcurl, so clean it up */
  curl_global_cleanup();

  /* Either stream is NULL if its fopen failed; fclose(NULL) is not allowed. */
  if(fp)
    fclose(fp);
  if(fp1)
    fclose(fp1);
  return 0;
}

CUDA volatile free

Could anyone please suggest a way to free a volatile global memory variable in CUDA?
// Fragment: allocates a device buffer reached through a pointer-to-volatile
// and frees it again, checking both calls.
volatile unsigned *d_queue_L12;
// The address of the pointer is cast to void** for cudaMalloc.
err = cudaMalloc((void **)&d_queue_L12, CORES*MAX_12*Cache_Sets_L2*sizeof(volatile unsigned));
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to allocate space to L12 QUEUE vector (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// NOTE(review): the pointer is handed to cudaFree without a cast; the compiler
// message quoted after this fragment rejects the implicit conversion from
// "volatile unsigned int *" to "void *".
err = cudaFree(d_queue_L12);
if (err != cudaSuccess)
{
// NOTE(review): the message names an "L2 FLAG COUNT vector" although the
// buffer being freed is d_queue_L12.
fprintf(stderr, "Failed to free L2 FLAG COUNT vector (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
gives an error:
error: argument of type "volatile unsigned int *" is incompatible with parameter of type "void *"
How about something like this:
err = cudaFree((void *)d_queue_L12);

Cufft error in file

I am receiving the error:
Cufft error in file
I am using this file in order to load the FFT and pass them to another file.
//----function to check for errors-------------------------------------------------
// Wraps a CUDA runtime call: evaluates `ans` once and reports a failure
// together with the file and line of the call site. Expands to a single
// statement (do/while(0)) so it is safe inside an unbraced if/else.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Prints the CUDA error string plus `file` and `line` to stderr when `code`
// is not cudaSuccess and, if `abort` is true, terminates the process using
// the error code as exit status. Does nothing on cudaSuccess.
// `file` is const because it receives __FILE__, a string literal.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code == cudaSuccess)
        return;

    fprintf(stderr,"\nGPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort)
        exit(code);
}
// Wraps a cuFFT call: evaluates `call` once and, if the result is not
// CUFFT_SUCCESS, prints the file, line and numeric cufftResult code to stderr
// and terminates the process with EXIT_FAILURE.
// The local uses a trailing underscore so it cannot collide with a variable
// named `err` that appears inside `call`.
#define CUFFT_SAFE_CALL(call) do { \
    cufftResult cufft_status_ = (call); \
    if (cufft_status_ != CUFFT_SUCCESS) { \
        fprintf(stderr, "Cufft error in file '%s' in line %i : error code %d.\n", \
                __FILE__, __LINE__, (int)cufft_status_); \
        exit(EXIT_FAILURE); \
    } \
} while (0)
#define NX 128*128
#define NY 16
#define BATCH 16
#define NRANK 2
// Allocates and fills a host array of NX*NY*BATCH complex doubles (returned
// through *B_in), copies it to the device, runs a forward Z2Z transform and
// copies the device buffer back into *B_in.
// NOTE(review): the early `return` paths below leave B_dev (and the plan)
// unreleased.
void FFT_transform(cufftDoubleComplex** B_in)
{
int n[NRANK] = {NX, NY};
//size of B
int Bsize=NX*NY*BATCH;
//allocate host memory
// NOTE(review): the malloc result is not checked before it is written to.
*B_in=(cufftDoubleComplex*)malloc(Bsize*sizeof(cufftDoubleComplex));
// Element k = i*BATCH+j gets real part 2k and imaginary part 2k+1.
for (int i=0;i<NX*NY;i++){
for (int j=0;j<BATCH;j++){
(*B_in)[i*BATCH+j].x=(i*BATCH+j)*2;
(*B_in)[i*BATCH+j].y=(i*BATCH+j)*2+1;
}
}
//allocate device memory
cufftDoubleComplex* B_dev;
gpuErrchk(cudaMalloc((void**) &B_dev,Bsize* sizeof(cufftDoubleComplex)));
if (cudaGetLastError() != cudaSuccess){
fprintf(stderr, "Cuda error: Failed to allocate\n");
return;
}
// copy arrays from host to device
gpuErrchk(cudaMemcpy(B_dev, *B_in,Bsize* sizeof(cufftDoubleComplex), cudaMemcpyHostToDevice));
// Create a 2D FFT plan
cufftHandle plan;
CUFFT_SAFE_CALL(cufftPlan2d(&plan,NX,NY,CUFFT_Z2Z));
// NOTE(review): the same handle is passed to cufftPlanMany right after
// cufftPlan2d; the first plan is not destroyed in between.
if (cufftPlanMany(&plan, NRANK, n,NULL, 1, 0,NULL, 1, 0,CUFFT_Z2Z,BATCH) != CUFFT_SUCCESS){
fprintf(stderr, "CUFFT Error: Unable to create plan\n");
return;
}
if (cufftSetCompatibilityMode(plan, CUFFT_COMPATIBILITY_NATIVE)!= CUFFT_SUCCESS){
fprintf(stderr, "CUFFT Error: Unable to set compatibility mode to native\n");
return;
}
// perform transform
// NOTE(review): the input argument is *B_in, the host buffer obtained from
// malloc above; the answer following this listing identifies this as the
// cause of the reported error and recommends passing B_dev as input and output.
CUFFT_SAFE_CALL(cufftExecZ2Z(plan,(cufftDoubleComplex *)(*B_in), (cufftDoubleComplex *)B_dev, CUFFT_FORWARD));
// NOTE(review): the transform is issued a second time here with the same arguments.
if (cufftExecZ2Z(plan,*B_in,B_dev,CUFFT_FORWARD) != CUFFT_SUCCESS){
fprintf(stderr, "CUFFT Error: Unable to execute plan\n");
return;
}
if (cudaThreadSynchronize() != cudaSuccess){
fprintf(stderr, "Cuda error: Failed to synchronize\n");
return;
}
// copy result from device to host
gpuErrchk(cudaMemcpy(*B_in, B_dev,Bsize*sizeof(cufftDoubleComplex), cudaMemcpyDeviceToHost));
//Destroy CUFFT context
CUFFT_SAFE_CALL(cufftDestroy(plan));
//clean up device memory
gpuErrchk(cudaFree(B_dev));
}
I am receiving the error at line:
CUFFT_SAFE_CALL(cufftExecZ2Z(plan,(cufftDoubleComplex *)(*B_in), (cufftDoubleComplex *)B_dev, CUFFT_FORWARD));
You are getting the error because B_in is a pointer to host memory and not to device memory, which is illegal. In CUFFT, inputs are always in device memory. You need to use cudaMemcpy to transfer the contents of B_in to B_dev before performing the transform, and then supply B_dev as both the input and output, which will result in an in place transform. This is clearly described in the CUFFT API documentation here.

CUDA NPP image dot product having cudaErrorUnknown

The function nppiDotProd_8u64f_C1R causes a cudaErrorUnknown. I'm able to compile and run properly boxFilterNPP and histEqualizationNPP so I assume my system is healthy. I'm running with a GTX470 (compute capability 2.0), CUDA 5.5 and VS2012 x64 on Windows7. I've also run many variations of it on two systems and having the same problem. Here is the code:
// Fragment: fills two 640x480 8-bit images with the value 1 and computes
// their dot product with nppiDotProd_8u64f_C1R.
// NOTE(review): the early returns hand back both NppStatus and cudaError_t
// values and skip the frees at the end of the fragment.
NppGpuComputeCapability capability = nppGetGpuComputeCapability();
NppiSize sizeROI;
sizeROI.width = 640;
sizeROI.height = 480;
// Ask NPP how large the scratch buffer for this ROI must be, then allocate it.
int nBufferSize = 0;
NppStatus status = nppiDotProdGetBufferHostSize_8u64f_C1R(sizeROI,&nBufferSize);
if(status != NPP_SUCCESS) return status;
unsigned char *pDeviceBuffer;
cudaError_t err = cudaMalloc((void**)&pDeviceBuffer,nBufferSize);
if(err != cudaSuccess) return err;
// First image; stepByte1 receives the row step reported by nppiMalloc.
// NOTE(review): the pointers returned by nppiMalloc_8u_C1 are not checked.
int stepByte1 = 0;
Npp8u * buf1 = nppiMalloc_8u_C1(sizeROI.width, sizeROI.height, &stepByte1);
status = nppiSet_8u_C1R(1,buf1,stepByte1,sizeROI);
if(status != NPP_SUCCESS) return status;
// Second image, set up the same way.
int stepByte2 = 0;
Npp8u * buf2 = nppiMalloc_8u_C1(sizeROI.width, sizeROI.height, &stepByte2);
status = nppiSet_8u_C1R(1,buf2,stepByte2,sizeROI);
if(status != NPP_SUCCESS) return status;
err = cudaDeviceSynchronize();
if(err != cudaSuccess) return err;
// NOTE(review): &dp is the address of a host local variable; the answer
// following this fragment states that the result argument must be a device
// pointer.
double dp = 0;
status = nppiDotProd_8u64f_C1R(buf1,stepByte1,buf2,stepByte2,sizeROI,&dp,pDeviceBuffer);
if(status != NPP_SUCCESS) return status;
err = cudaDeviceSynchronize(); // return cudaErrorUnknown
// CUDA memchecker gives me "OutOfRangeStore" exception
if(err != cudaSuccess) return err;
printf("result: %f\n", dp);
nppiFree(buf1);
nppiFree(buf2);
cudaFree(pDeviceBuffer);
Any idea about my problem?
Thank you very much!!
The result argument in that nppiDotProd call must be a device pointer, not a host pointer. You can fix it by allocating memory for dp on the device, something like :
double * dp ;
cudaMalloc((void **)(&dp), sizeof(Npp64f) * 1);
status = nppiDotProd_8u64f_C1R(buf1,stepByte1,buf2,stepByte2,sizeROI,dp,pDeviceBuffer);
if(status != NPP_SUCCESS) return status;
[disclaimer: written in browser, not compiled or tested, use at your own risk]
You will obviously need to copy the result of the dot product back to the host if you need it.