I am trying to perform a sum reduction using CUB and 2D arrays of type float/double.
Although it works for certain combinations of rows+columns, for relatively larger arrays, I get an illegal memory access error during the last transfer.
A minimal example is the following:
#include <stdio.h>
#include <stdlib.h>
#include <cub/device/device_reduce.cuh>
#include "cuda_runtime.h"
#ifdef DP
#define real double
#else
#define real float
#endif
void generatedata(const int num, real* vec, real start, real finish) {
real rrange = finish - start;
for (auto i = 0; i < num; ++i)
vec[i] = rand() / float(RAND_MAX) * rrange + start;
}
real reduce_to_sum(const int num, const real* vec) {
real total = real(0.0);
for (auto i = 0; i < num; ++i)
total += vec[i];
return total;
}
int main() {
int rows = 2001;
int cols = 3145;
size_t msize = rows * cols;
real* data = (real*)malloc(msize * sizeof(real));
if (!data)
return -999;
generatedata(msize, data, 0., 50.);
real ref_sum = reduce_to_sum(msize, data);
real* d_data_in = nullptr;
real* d_data_out = nullptr;
size_t pitch_in, pitch_out;
cudaError_t err = cudaMallocPitch(&d_data_in, &pitch_in, cols * sizeof(real), rows);
if (err != cudaSuccess) {
printf("data_in :: %s \n", cudaGetErrorString(err));
return -999;
}
err = cudaMallocPitch(&d_data_out, &pitch_out, cols * sizeof(real), rows);
if (err != cudaSuccess) {
printf("data_out :: %s \n", cudaGetErrorString(err));
return -999;
}
err = cudaMemset(d_data_in, 0, rows * pitch_in);
if (err != cudaSuccess) {
printf("set data_in :: %s \n", cudaGetErrorString(err));
return -999;
}
err = cudaMemcpy2D(d_data_in, pitch_in, data, cols * sizeof(real), cols * sizeof(real), rows, cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
printf("copy data :: %s \n", cudaGetErrorString(err));
return -999;
}
void* d_temp = nullptr;
size_t temp_bytes = 0;
cub::DeviceReduce::Sum(d_temp, temp_bytes, d_data_in, d_data_out, rows * pitch_out);
err = cudaMalloc(&d_temp, temp_bytes);
if (err != cudaSuccess) {
printf("temp :: %s \n", cudaGetErrorString(err));
return -999;
}
err = cudaMemset(d_data_out, 0, rows * pitch_out);
if (err != cudaSuccess) {
printf("set temp :: %s \n", cudaGetErrorString(err));
return -999;
}
// Run sum-reduction
cub::DeviceReduce::Sum(d_temp, temp_bytes, d_data_in, d_data_out, rows * pitch_out);
err = cudaGetLastError();
if (err != cudaSuccess) {
printf("reduction :: %s \n", cudaGetErrorString(err));
return -999;
}
real gpu_sum = real(0.0);
err = cudaMemcpy(&gpu_sum, d_data_out, sizeof(real), cudaMemcpyDeviceToHost);
if (err != cudaSuccess) {
printf("copy final :: %s \n", cudaGetErrorString(err));
return -999;
}
printf("Difference in sum (h)%f - (d)%f = %f \n", ref_sum, gpu_sum, ref_sum - gpu_sum);
if (data) free(data);
if (d_data_in) cudaFree(d_data_in);
if (d_data_out) cudaFree(d_data_out);
if (d_temp) cudaFree(d_temp);
cudaDeviceReset();
return 0;
}
The error is thrown at "copy final ::". I am bit confused as to why certain rows x columns work and others don't. I did notice it's the larger values that cause it, but can't get my head around.
Any suggestions would be much appreciated.
The 5th parameter of cub::DeviceReduce::Sum should be the number of input elements. However, rows * pitch_out is the size of the output buffer in bytes.
Assuming pitch_in % sizeof(real) == 0, the following call may work.
cub::DeviceReduce::Sum(d_temp, temp_bytes, d_data_in, d_data_out, rows * (pitch_in / sizeof(real)));
Also note that cub::DeviceReduce::Sum may return before the reduction is complete. In this case, if any error happened during execution, this error will be reported by cudaMemcpy.
Related
I am trying to solve a least squares problem via "magma_dgels_gpu()" function of MAGMA Library. My GPU is "Tesla C2050 / C2075" and i have installed MAGMA.
I am trying to compile the below code "testMagmaDGELS.cu", but i get error:
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cublas.h>
#include "magma.h"
#define UTILS_MALLOC(__ptr, __type, __size) \
__ptr = (__type*)malloc((__size) * sizeof(__type)); \
if (__ptr == 0) { \
fprintf (stderr, "!!!! Malloc failed for: %s\n", #__ptr ); \
exit(-1); \
}
#define UTILS_DEVALLOC(__ptr, __type, __size) \
if( cudaSuccess != cudaMalloc( (void**)&__ptr, (__size)*sizeof(__type) ) ){ \
fprintf (stderr, "!!!! cudaMalloc failed for: %s\n", #__ptr ); \
exit(-1); \
}
int main(int argc, char** argv)
{
if( CUBLAS_STATUS_SUCCESS != cublasInit( ) ) {
fprintf(stderr, "CUBLAS: Not initialized\n"); exit(-1);
}
double *devA, *devB, *pWork, lWorkQuery[1];
const int M = 5, N = 3;
int ret, info;
/* Allocate device memory for the matrix (column-major) */
int lda = M;
int ldda = ((M + 31) / 32) * 32;
UTILS_DEVALLOC(devA, double, ldda * N);
UTILS_DEVALLOC(devB, double, M);
/* Initialize the matrix */
double A[N][M] = {{ 0.6, 5.0, 1.0, -1.0, -4.2 },
{ 1.2, 4.0, -4.0, -2.0, -8.4 },
{ 3.9, 2.5, -5.5, -6.5, -4.8 }};
cublasSetMatrix(M, N, sizeof(double), A, lda, devA, ldda);
double B[M] = {3.0, 4.0, -1.0, -5.0, -1.0};
cublasSetMatrix(M, 1, sizeof(double), B, M, devB, M);
/* Resolve the LLSP using MAGMA */
ret = magma_dgels_gpu('N', M, N, 1 /* nb of colums in the matrix B */,
devA, ldda, devB, M,
lWorkQuery, -1, // query the optimal work space
&info);
if (info < 0) {
printf("Argument %d of magma_dgels_gpu had an illegal value.\n", -info);
exit(1);
} else if (ret != MAGMA_SUCCESS) {
printf("magma_dgels_gpu failed (code %d).\n", ret);
exit(1);
}
int lwork = (int)lWorkQuery[0];
printf("Optimal work space %d\n", lwork);
UTILS_MALLOC(pWork, double, lwork);
ret = magma_dgels_gpu('N', M, N, 1 /* nb of colums in the matrix B */,
devA, ldda, devB, M,
pWork, lwork,
&info);
if (info < 0) {
printf("Argument %d of magma_dgels_gpu had an illegal value.\n", -info);
exit(1);
} else if (ret != MAGMA_SUCCESS) {
printf("magma_dgels_gpu failed (code %d).\n", ret);
exit(1);
} else {
printf("LLSP solved successfully\n");
}
cublasGetMatrix(M, 1, sizeof(double), devB, M, B, M);
/* Expected solution vector: 0.953333 -0.843333 0.906667 */
printf("Solution vector:\n");
for (int i = 0; i < N; i++) {
printf("\t%lf\n", B[i]);
}
/* Memory clean up */
free( pWork );
cudaFree( devA );
cudaFree( devB );
/* Shutdown */
cublasShutdown();
return 0;
}
I make compile as follows:
nvcc -arch=sm_20 testMagmaDGELS.cu -o testMagmaDGELS -lcublas -I/opt/magma/1.7.0/openblas/gcc/include
And I get these errors:
team24#tesla:~$ nvcc -arch=sm_20 testMagmaDGELS.cu -o testMagmaDGELS -lcublas -I/opt/magma/1.7.0/openblas/gcc/include
testMagmaDGELS.cu(54): error: argument of type "char" is incompatible with parameter of type "magma_trans_t"
testMagmaDGELS.cu(70): error: argument of type "char" is incompatible with parameter of type "magma_trans_t"
2 errors detected in the compilation of "/tmp/tmpxft_00002d95_00000000-8_testMagmaDGELS.cpp1.ii".
Could anyone help me?
Use the magma type for indication of transpose/no transpose, instead of using a char type.
so instead of this:
ret = magma_dgels_gpu('N', ...
do this:
magma_trans_t my_trans = MagmaNoTrans;
ret = magma_dgels_gpu(my_trans, ...
See the documentation here.
magma_trans_t magma_trans_const ( character ) Map 'N', 'T', 'C'
to MagmaNoTrans, MagmaTrans, MagmaConjTrans
I spent the last days trying to figure out how to download a file from an URL.
This is my first challenge with socket and I'm using it to have an understanding of protocols so I would like to do it without cURL libraries and only in C language!!
I searched a lot....now I'm able to printf the source code of a page but I think it's different with a file, I don't have only to put the received data from a buffer to a file, right?
any tips?
#include <sys/socket.h>
#include <sys/types.h>
#include <netinet/in.h>
#include <netdb.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <errno.h>
#include <arpa/inet.h>
int main(void)
{
char domain[] = "www.sstatic.net", path[]="stackexchange/img/logos/so/so-logo-med.png"; //example
int sock, bytes_received;
char send_data[1024],recv_data[9999], *p;
struct sockaddr_in server_addr;
struct hostent *he;
FILE *fp;
he = gethostbyname(domain);
if (he == NULL){
herror("gethostbyname");
exit(1);
}
if ((sock = socket(AF_INET, SOCK_STREAM, 0))== -1){
perror("Socket");
exit(1);
}
server_addr.sin_family = AF_INET;
server_addr.sin_port = htons(80);
server_addr.sin_addr = *((struct in_addr *)he->h_addr);
bzero(&(server_addr.sin_zero),8);
if (connect(sock, (struct sockaddr *)&server_addr,sizeof(struct sockaddr)) == -1){
perror("Connect");
exit(1);
}
snprintf(send_data, sizeof(send_data), "GET /%s HTTP/1.1\r\nHost: /%s\r\n\r\n", path, domain);
//printf("%s\n", send_data);
send(sock, send_data, strlen(send_data), 0);
printf("Data sended.\n");
fp=fopen("received_file","wb");
bytes_received = recv(sock, recv_data, 9999, 0);
recv_data[bytes_received] = '\0';
printf("Data receieved.\n");
printf("%s\n", recv_data);
p = strstr(recv_data, "\r\n\r\n"); //to find "\r\n\r\n" sequence and put the pointer p after that
p=p+4;
fwrite(p,strlen(p),1,fp);
close(sock);
fclose(fp);
return 0;
}
UPDATE 1 thanks to milevyo for some improvements!
It works good with a txt file but it doesn't with other kinds of file (png in this case)
#include <sys/socket.h>
#include <sys/types.h>
#include <netinet/in.h>
#include <netdb.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <errno.h>
#include <arpa/inet.h>
int main(void){
//char domain[] = "www.gnu.org", path[]="/licenses/gpl.txt"; //example
char domain[] = "sstatic.net", path[]="stackexchange/img/logos/so/so-logo-med.png"; //example
int sock, bytes_received;
char send_data[1024],recv_data[9999];
struct sockaddr_in server_addr;
struct hostent *he;
FILE *fp;
he = gethostbyname(domain);
if (he == NULL){
herror("gethostbyname");
exit(1);
}
if ((sock = socket(AF_INET, SOCK_STREAM, 0))== -1){
perror("Socket");
exit(1);
}
server_addr.sin_family = AF_INET;
server_addr.sin_port = htons(80);
server_addr.sin_addr = *((struct in_addr *)he->h_addr);
bzero(&(server_addr.sin_zero),8);
printf("Connecting ...\n");
if (connect(sock, (struct sockaddr *)&server_addr,sizeof(struct sockaddr)) == -1){
perror("Connect");
exit(1);
}
printf("Sending data ...\n");
snprintf(send_data, sizeof(send_data), "GET /%s HTTP/1.1\r\nHost: /%s\r\n\r\n", path, domain);
if(send(sock, send_data, strlen(send_data), 0)==-1){
perror("send");
exit(2);
}
printf("Data sent.\n");
fp=fopen("received_file","wb");
printf("Recieving data...\n\n");
while((bytes_received = recv(sock, recv_data, 9999, 0))>0){
if(bytes_received==-1){
perror("recieve");
exit(3);
}
recv_data[bytes_received] = '\0';
fwrite(recv_data,bytes_received,1,fp);
printf("%s", recv_data);
}
close(sock);
fclose(fp);
printf("\n\nDone.\n\n");
return 0;
}
this code produce a 334 bytes file (instead of 12,4kb of the original file) with this inside:
HTTP/1.1 400 Bad Request
Date: Sat, 28 Nov 2015 16:20:45 GMT
Content-Type: text/html
Content-Length: 177
Connection: close
Server: -nginx
CF-RAY: -
<html>
<head><title>400 Bad Request</title></head>
<body bgcolor="white">
<center><h1>400 Bad Request</h1></center>
<hr><center>cloudflare-nginx</center>
</body>
</html>
somebody knows how to fix this "400 Bad Request"?
This is an update for the previous posted code. The http protocol is far to be implementation in just a small example.
reformatting the code , or giving a modification to it is more than welcome.
#include <sys/socket.h>
#include <sys/types.h>
#include <netinet/in.h>
#include <netdb.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <errno.h>
#include <arpa/inet.h>
#include <string.h>
int ReadHttpStatus(int sock){
char c;
char buff[1024]="",*ptr=buff+1;
int bytes_received, status;
printf("Begin Response ..\n");
while(bytes_received = recv(sock, ptr, 1, 0)){
if(bytes_received==-1){
perror("ReadHttpStatus");
exit(1);
}
if((ptr[-1]=='\r') && (*ptr=='\n' )) break;
ptr++;
}
*ptr=0;
ptr=buff+1;
sscanf(ptr,"%*s %d ", &status);
printf("%s\n",ptr);
printf("status=%d\n",status);
printf("End Response ..\n");
return (bytes_received>0)?status:0;
}
//the only filed that it parsed is 'Content-Length'
int ParseHeader(int sock){
char c;
char buff[1024]="",*ptr=buff+4;
int bytes_received, status;
printf("Begin HEADER ..\n");
while(bytes_received = recv(sock, ptr, 1, 0)){
if(bytes_received==-1){
perror("Parse Header");
exit(1);
}
if(
(ptr[-3]=='\r') && (ptr[-2]=='\n' ) &&
(ptr[-1]=='\r') && (*ptr=='\n' )
) break;
ptr++;
}
*ptr=0;
ptr=buff+4;
//printf("%s",ptr);
if(bytes_received){
ptr=strstr(ptr,"Content-Length:");
if(ptr){
sscanf(ptr,"%*s %d",&bytes_received);
}else
bytes_received=-1; //unknown size
printf("Content-Length: %d\n",bytes_received);
}
printf("End HEADER ..\n");
return bytes_received ;
}
int main(void){
char domain[] = "sstatic.net", path[]="stackexchange/img/logos/so/so-logo-med.png";
int sock, bytes_received;
char send_data[1024],recv_data[1024], *p;
struct sockaddr_in server_addr;
struct hostent *he;
he = gethostbyname(domain);
if (he == NULL){
herror("gethostbyname");
exit(1);
}
if ((sock = socket(AF_INET, SOCK_STREAM, 0))== -1){
perror("Socket");
exit(1);
}
server_addr.sin_family = AF_INET;
server_addr.sin_port = htons(80);
server_addr.sin_addr = *((struct in_addr *)he->h_addr);
bzero(&(server_addr.sin_zero),8);
printf("Connecting ...\n");
if (connect(sock, (struct sockaddr *)&server_addr,sizeof(struct sockaddr)) == -1){
perror("Connect");
exit(1);
}
printf("Sending data ...\n");
snprintf(send_data, sizeof(send_data), "GET /%s HTTP/1.1\r\nHost: %s\r\n\r\n", path, domain);
if(send(sock, send_data, strlen(send_data), 0)==-1){
perror("send");
exit(2);
}
printf("Data sent.\n");
//fp=fopen("received_file","wb");
printf("Recieving data...\n\n");
int contentlengh;
if(ReadHttpStatus(sock) && (contentlengh=ParseHeader(sock))){
int bytes=0;
FILE* fd=fopen("test.png","wb");
printf("Saving data...\n\n");
while(bytes_received = recv(sock, recv_data, 1024, 0)){
if(bytes_received==-1){
perror("recieve");
exit(3);
}
fwrite(recv_data,1,bytes_received,fd);
bytes+=bytes_received;
printf("Bytes recieved: %d from %d\n",bytes,contentlengh);
if(bytes==contentlengh)
break;
}
fclose(fd);
}
close(sock);
printf("\n\nDone.\n\n");
return 0;
}
Try some thing like below: -
#include <sys/socket.h>
#include <sys/errno.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <netdb.h>
#include <unistd.h>
#include <string.h>
#include <strings.h>
#include <stdlib.h>
#include <stdio.h>
#include <openssl/ssl.h>
#include <openssl/err.h>
#define BUFLEN 4096
#define HOST "www.t.edu.pk"
#define PORT 443
int main()
{
int sock, iResult;
char *cmd, *ip;
char recvbuf[BUFLEN];
//
struct sockaddr_in sin;
struct hostent* hent;
//
hent = gethostbyname(HOST);
if(hent == NULL)
{
printf("gethostbyname failed: %d\n", errno);
return -1;
}
printf("gethostbyname succeeded\n");
ip = inet_ntoa(*((struct in_addr*)hent->h_addr_list[0]));
printf("Host IP: %s\n", ip);
//
sock = socket(AF_INET, SOCK_STREAM, 0);
if(sock == -1)
{
printf("socket failed: %d\n", errno);
return -1;
}
printf("socket created\n");
//
sin.sin_family = AF_INET;
sin.sin_addr.s_addr = inet_addr(ip);
sin.sin_port = htons(PORT);
iResult = connect(sock, (struct sockaddr*)&sin, sizeof(sin));
if(iResult < 0)
{
printf("connect failed: %d\n", errno);
return -1;
}
printf("connect succeeded\n");
//
iResult = SSL_library_init();
if(iResult < 0)
{
printf("SSL failed\n");
return -1;
}
printf("SSL library initialised\n");
OpenSSL_add_all_algorithms();
ERR_load_crypto_strings();
SSL_load_error_strings();
SSL_CTX* ctx = SSL_CTX_new(TLSv1_2_client_method());
if(ctx == NULL)
{
printf("ctx failed\n");
ERR_print_errors_fp(stderr);
return -1;
}
printf("ctx loaded\n");
SSL* ssl = SSL_new(ctx);
if(ssl == NULL)
{
printf("ssl failed\n");
ERR_print_errors_fp(stderr);
return -1;
}
printf("ssl loaded\n");
SSL_set_fd(ssl, sock);
SSL_connect(ssl);
//
cmd = "GET / HTTP/1.1\r\nHost: www.t.edu.pk\r\n\r\n";
iResult = SSL_write(ssl, cmd, strlen(cmd));
if(iResult <= 0)
{
printf("SSL write failed\n");
ERR_print_errors_fp(stderr);
return -1;
}
printf("Byte(s) sent: %d\n", iResult);
bzero(recvbuf, BUFLEN);
do
{
iResult = SSL_read(ssl, recvbuf, BUFLEN - 1);
if(iResult < 0)
{
printf("error receiving data\n");
break;
}
if(iResult == 0)
{
printf("host closed connection\n");
break;
}
printf("%s\n", recvbuf);
}while(iResult > 0);
//
iResult = SSL_shutdown(ssl);
if(iResult == 0)
{
printf("SSL shutdown in progress...\n");
}
iResult = SSL_shutdown(ssl);
if(iResult == 1)
{
printf("SSL shutdown complete!\n");
}
if(iResult == -1)
{
printf("SSL shutdown unsuccessful!\n");
}
SSL_CTX_free(ctx);
//
iResult = shutdown(sock, SHUT_RDWR);
if(iResult == -1)
{
printf("Socket shutdown failed: %d\n", errno);
return -1;
}
printf("Socket shutdown succeeded\n");
iResult = close(sock);
if(iResult != 0)
{
printf("error closing socket: %d\n", errno);
return -1;
}
printf("Socket closed\n");
//
return 0;
}
This work for C/C++ in linux Environment. You can run it by makefile, or just add -lcurl option in g++.
Notice that should have lib cURL.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <curl/curl.h>
FILE *fp = fopen("file.txt", "w");
char outfilename[FILENAME_MAX] = "file_downloaded.txt";
FILE *fp1 = fopen(outfilename,"wb");
struct MemoryStruct {
char *memory;
size_t size;
};
static size_t
WriteMemoryCallback(void *contents, size_t size, size_t nmemb, void *userp)
{
size_t realsize = size * nmemb;
struct MemoryStruct *mem = (struct MemoryStruct *)userp;
char *ptr = (char*)realloc(mem->memory, mem->size + realsize + 1);
if(!ptr) {
/* out of memory! */
printf("not enough memory (realloc returned NULL)\n");
return 0;
}
fprintf(fp, "%ld - %ld - %ld\n", realsize, size, nmemb);
size_t written = fwrite(contents, size, nmemb, fp1);
mem->memory = ptr;
memcpy(&(mem->memory[mem->size]), contents, realsize);
mem->size += realsize;
mem->memory[mem->size] = 0;
return realsize;
}
int main(void)
{
char link_download[] = "https://www.example.com/";
CURL *curl_handle;
CURLcode res;
struct MemoryStruct chunk;
chunk.memory = (char*)malloc(1); /* will be grown as needed by the realloc above */
chunk.size = 0; /* no data at this point */
curl_global_init(CURL_GLOBAL_ALL);
/* init the curl session */
curl_handle = curl_easy_init();
/* specify URL to get */
curl_easy_setopt(curl_handle, CURLOPT_URL, link_download);
/* send all data to this function */
curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);
/* we pass our 'chunk' struct to the callback function */
curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, (void *)&chunk);
/* some servers do not like requests that are made without a user-agent
field, so we provide one */
curl_easy_setopt(curl_handle, CURLOPT_USERAGENT, "libcurl-agent/1.0");
/* get it! */
res = curl_easy_perform(curl_handle);
/* check for errors */
if(res != CURLE_OK) {
fprintf(stderr, "curl_easy_perform() failed: %s\n",
curl_easy_strerror(res));
}
else {
/*
* Now, our chunk.memory points to a memory block that is chunk.size
* bytes big and contains the remote file.
*
* Do something nice with it!
*/
printf("%lu bytes retrieved\n", (unsigned long)chunk.size);
}
/* cleanup curl stuff */
curl_easy_cleanup(curl_handle);
free(chunk.memory);
/* we are done with libcurl, so clean it up */
curl_global_cleanup();
fclose(fp);
fclose(fp1);
return 0;
}
Could anyone please suggest me a way to free a volatile global memory variable in CUDA...
volatile unsigned *d_queue_L12;
err = cudaMalloc((void **)&d_queue_L12, CORES*MAX_12*Cache_Sets_L2*sizeof(volatile unsigned));
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to allocate space to L12 QUEUE vector (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
err = cudaFree(d_queue_L12);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to free L2 FLAG COUNT vector (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
gives an error:
error: argument of type "volatile unsigned int *" is incompatible with parameter of type "void *"
How about something like this:
err = cudaFree((void *)d_queue_L12);
I am receiving the error:
Cufft error in file
I am using this file in order to load the FFT and pass them to another file.
//----function to check for errors-------------------------------------------------
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"\nGPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
//function to check for cuFFT errors --------------------------------------------------
#define CUFFT_SAFE_CALL( call) do { \
cufftResult err = call; \
if (err != CUFFT_SUCCESS) { \
fprintf(stderr, "Cufft error in file '%s' in line %i : %s.\n", \
__FILE__, __LINE__, "error" ); \
exit(EXIT_FAILURE); \
} \
} while (0)
#define NX 128*128
#define NY 16
#define BATCH 16
#define NRANK 2
void FFT_transform(cufftDoubleComplex** B_in)
{
int n[NRANK] = {NX, NY};
//size of B
int Bsize=NX*NY*BATCH;
//allocate host memory
*B_in=(cufftDoubleComplex*)malloc(Bsize*sizeof(cufftDoubleComplex));
for (int i=0;i<NX*NY;i++){
for (int j=0;j<BATCH;j++){
(*B_in)[i*BATCH+j].x=(i*BATCH+j)*2;
(*B_in)[i*BATCH+j].y=(i*BATCH+j)*2+1;
}
}
//allocate device memory
cufftDoubleComplex* B_dev;
gpuErrchk(cudaMalloc((void**) &B_dev,Bsize* sizeof(cufftDoubleComplex)));
if (cudaGetLastError() != cudaSuccess){
fprintf(stderr, "Cuda error: Failed to allocate\n");
return;
}
// copy arrays from host to device
gpuErrchk(cudaMemcpy(B_dev, *B_in,Bsize* sizeof(cufftDoubleComplex), cudaMemcpyHostToDevice));
// Create a 2D FFT plan
cufftHandle plan;
CUFFT_SAFE_CALL(cufftPlan2d(&plan,NX,NY,CUFFT_Z2Z));
if (cufftPlanMany(&plan, NRANK, n,NULL, 1, 0,NULL, 1, 0,CUFFT_Z2Z,BATCH) != CUFFT_SUCCESS){
fprintf(stderr, "CUFFT Error: Unable to create plan\n");
return;
}
if (cufftSetCompatibilityMode(plan, CUFFT_COMPATIBILITY_NATIVE)!= CUFFT_SUCCESS){
fprintf(stderr, "CUFFT Error: Unable to set compatibility mode to native\n");
return;
}
// perform transform
CUFFT_SAFE_CALL(cufftExecZ2Z(plan,(cufftDoubleComplex *)(*B_in), (cufftDoubleComplex *)B_dev, CUFFT_FORWARD));
if (cufftExecZ2Z(plan,*B_in,B_dev,CUFFT_FORWARD) != CUFFT_SUCCESS){
fprintf(stderr, "CUFFT Error: Unable to execute plan\n");
return;
}
if (cudaThreadSynchronize() != cudaSuccess){
fprintf(stderr, "Cuda error: Failed to synchronize\n");
return;
}
// copy result from device to host
gpuErrchk(cudaMemcpy(*B_in, B_dev,Bsize*sizeof(cufftDoubleComplex), cudaMemcpyDeviceToHost));
//Destroy CUFFT context
CUFFT_SAFE_CALL(cufftDestroy(plan));
//clean up device memory
gpuErrchk(cudaFree(B_dev));
}
I am receiving the error at line:
CUFFT_SAFE_CALL(cufftExecZ2Z(plan,(cufftDoubleComplex *)(*B_in), (cufftDoubleComplex *)B_dev, CUFFT_FORWARD));
You are getting the error because B_in is a pointer to host memory and not to device memory, which is illegal. In CUFFT, inputs are always in device memory. You need to use cudaMemcpy to transfer the contents of B_in to B_dev before performing the transform, and then supply B_dev as both the input and output, which will result in an in place transform. This is clearly described in the CUFFT API documentation here.
The function nppiDotProd_8u64f_C1R causes a cudaErrorUnknown. I'm able to compile and run properly boxFilterNPP and histEqualizationNPP so I assume my system is healthy. I'm running with a GTX470 (compute capability 2.0), CUDA 5.5 and VS2012 x64 on Windows7. I've also run many variations of it on two systems and having the same problem. Here is the code:
NppGpuComputeCapability capability = nppGetGpuComputeCapability();
NppiSize sizeROI;
sizeROI.width = 640;
sizeROI.height = 480;
int nBufferSize = 0;
NppStatus status = nppiDotProdGetBufferHostSize_8u64f_C1R(sizeROI,&nBufferSize);
if(status != NPP_SUCCESS) return status;
unsigned char *pDeviceBuffer;
cudaError_t err = cudaMalloc((void**)&pDeviceBuffer,nBufferSize);
if(err != cudaSuccess) return err;
int stepByte1 = 0;
Npp8u * buf1 = nppiMalloc_8u_C1(sizeROI.width, sizeROI.height, &stepByte1);
status = nppiSet_8u_C1R(1,buf1,stepByte1,sizeROI);
if(status != NPP_SUCCESS) return status;
int stepByte2 = 0;
Npp8u * buf2 = nppiMalloc_8u_C1(sizeROI.width, sizeROI.height, &stepByte2);
status = nppiSet_8u_C1R(1,buf2,stepByte2,sizeROI);
if(status != NPP_SUCCESS) return status;
err = cudaDeviceSynchronize();
if(err != cudaSuccess) return err;
double dp = 0;
status = nppiDotProd_8u64f_C1R(buf1,stepByte1,buf2,stepByte2,sizeROI,&dp,pDeviceBuffer);
if(status != NPP_SUCCESS) return status;
err = cudaDeviceSynchronize(); // return cudaErrorUnknown
// CUDA memchecker gives me "OutOfRangeStore" exception
if(err != cudaSuccess) return err;
printf("result: %f\n", dp);
nppiFree(buf1);
nppiFree(buf2);
cudaFree(pDeviceBuffer);
Any idea about my problem?
Thank you very much!!
The result argument in that nppiDotProd call must be a device pointer, not a host pointer. You can fix it by allocating memory for dp on the device, something like :
double * dp ;
cudaMalloc((void **)(&dp), sizeof(Npp64f) * 1);
status = nppiDotProd_8u64f_C1R(buf1,stepByte1,buf2,stepByte2,sizeROI,dp,pDeviceBuffer);
if(status != NPP_SUCCESS) return status;
[disclaimer: written in browser, not compiled or tested, use a own risk]
You will obviously need to copy the result of the dot product back to the host if you need it.