I am trying to solve a least squares problem via "magma_dgels_gpu()" function of MAGMA Library. My GPU is "Tesla C2050 / C2075" and i have installed MAGMA.
I am trying to compile the below code "testMagmaDGELS.cu", but i get error:
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cublas.h>
#include "magma.h"
#define UTILS_MALLOC(__ptr, __type, __size) \
__ptr = (__type*)malloc((__size) * sizeof(__type)); \
if (__ptr == 0) { \
fprintf (stderr, "!!!! Malloc failed for: %s\n", #__ptr ); \
exit(-1); \
}
#define UTILS_DEVALLOC(__ptr, __type, __size) \
if( cudaSuccess != cudaMalloc( (void**)&__ptr, (__size)*sizeof(__type) ) ){ \
fprintf (stderr, "!!!! cudaMalloc failed for: %s\n", #__ptr ); \
exit(-1); \
}
int main(int argc, char** argv)
{
if( CUBLAS_STATUS_SUCCESS != cublasInit( ) ) {
fprintf(stderr, "CUBLAS: Not initialized\n"); exit(-1);
}
double *devA, *devB, *pWork, lWorkQuery[1];
const int M = 5, N = 3;
int ret, info;
/* Allocate device memory for the matrix (column-major) */
int lda = M;
int ldda = ((M + 31) / 32) * 32;
UTILS_DEVALLOC(devA, double, ldda * N);
UTILS_DEVALLOC(devB, double, M);
/* Initialize the matrix */
double A[N][M] = {{ 0.6, 5.0, 1.0, -1.0, -4.2 },
{ 1.2, 4.0, -4.0, -2.0, -8.4 },
{ 3.9, 2.5, -5.5, -6.5, -4.8 }};
cublasSetMatrix(M, N, sizeof(double), A, lda, devA, ldda);
double B[M] = {3.0, 4.0, -1.0, -5.0, -1.0};
cublasSetMatrix(M, 1, sizeof(double), B, M, devB, M);
/* Resolve the LLSP using MAGMA */
ret = magma_dgels_gpu('N', M, N, 1 /* nb of colums in the matrix B */,
devA, ldda, devB, M,
lWorkQuery, -1, // query the optimal work space
&info);
if (info < 0) {
printf("Argument %d of magma_dgels_gpu had an illegal value.\n", -info);
exit(1);
} else if (ret != MAGMA_SUCCESS) {
printf("magma_dgels_gpu failed (code %d).\n", ret);
exit(1);
}
int lwork = (int)lWorkQuery[0];
printf("Optimal work space %d\n", lwork);
UTILS_MALLOC(pWork, double, lwork);
ret = magma_dgels_gpu('N', M, N, 1 /* nb of colums in the matrix B */,
devA, ldda, devB, M,
pWork, lwork,
&info);
if (info < 0) {
printf("Argument %d of magma_dgels_gpu had an illegal value.\n", -info);
exit(1);
} else if (ret != MAGMA_SUCCESS) {
printf("magma_dgels_gpu failed (code %d).\n", ret);
exit(1);
} else {
printf("LLSP solved successfully\n");
}
cublasGetMatrix(M, 1, sizeof(double), devB, M, B, M);
/* Expected solution vector: 0.953333 -0.843333 0.906667 */
printf("Solution vector:\n");
for (int i = 0; i < N; i++) {
printf("\t%lf\n", B[i]);
}
/* Memory clean up */
free( pWork );
cudaFree( devA );
cudaFree( devB );
/* Shutdown */
cublasShutdown();
return 0;
}
I make compile as follows:
nvcc -arch=sm_20 testMagmaDGELS.cu -o testMagmaDGELS -lcublas -I/opt/magma/1.7.0/openblas/gcc/include
And I get these errors:
team24#tesla:~$ nvcc -arch=sm_20 testMagmaDGELS.cu -o testMagmaDGELS -lcublas -I/opt/magma/1.7.0/openblas/gcc/include
testMagmaDGELS.cu(54): error: argument of type "char" is incompatible with parameter of type "magma_trans_t"
testMagmaDGELS.cu(70): error: argument of type "char" is incompatible with parameter of type "magma_trans_t"
2 errors detected in the compilation of "/tmp/tmpxft_00002d95_00000000-8_testMagmaDGELS.cpp1.ii".
Could anyone help me?
Use the magma type for indication of transpose/no transpose, instead of using a char type.
so instead of this:
ret = magma_dgels_gpu('N', ...
do this:
magma_trans_t my_trans = MagmaNoTrans;
ret = magma_dgels_gpu(my_trans, ...
See the documentation here.
magma_trans_t magma_trans_const ( character ) Map 'N', 'T', 'C'
to MagmaNoTrans, MagmaTrans, MagmaConjTrans
Related
I am trying to perform a sum reduction using CUB and 2D arrays of type float/double.
Although it works for certain combinations of rows+columns, for relatively larger arrays, I get an illegal memory access error during the last transfer.
A minimal example is the following:
#include <stdio.h>
#include <stdlib.h>
#include <cub/device/device_reduce.cuh>
#include "cuda_runtime.h"
#ifdef DP
#define real double
#else
#define real float
#endif
void generatedata(const int num, real* vec, real start, real finish) {
real rrange = finish - start;
for (auto i = 0; i < num; ++i)
vec[i] = rand() / float(RAND_MAX) * rrange + start;
}
real reduce_to_sum(const int num, const real* vec) {
real total = real(0.0);
for (auto i = 0; i < num; ++i)
total += vec[i];
return total;
}
int main() {
int rows = 2001;
int cols = 3145;
size_t msize = rows * cols;
real* data = (real*)malloc(msize * sizeof(real));
if (!data)
return -999;
generatedata(msize, data, 0., 50.);
real ref_sum = reduce_to_sum(msize, data);
real* d_data_in = nullptr;
real* d_data_out = nullptr;
size_t pitch_in, pitch_out;
cudaError_t err = cudaMallocPitch(&d_data_in, &pitch_in, cols * sizeof(real), rows);
if (err != cudaSuccess) {
printf("data_in :: %s \n", cudaGetErrorString(err));
return -999;
}
err = cudaMallocPitch(&d_data_out, &pitch_out, cols * sizeof(real), rows);
if (err != cudaSuccess) {
printf("data_out :: %s \n", cudaGetErrorString(err));
return -999;
}
err = cudaMemset(d_data_in, 0, rows * pitch_in);
if (err != cudaSuccess) {
printf("set data_in :: %s \n", cudaGetErrorString(err));
return -999;
}
err = cudaMemcpy2D(d_data_in, pitch_in, data, cols * sizeof(real), cols * sizeof(real), rows, cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
printf("copy data :: %s \n", cudaGetErrorString(err));
return -999;
}
void* d_temp = nullptr;
size_t temp_bytes = 0;
cub::DeviceReduce::Sum(d_temp, temp_bytes, d_data_in, d_data_out, rows * pitch_out);
err = cudaMalloc(&d_temp, temp_bytes);
if (err != cudaSuccess) {
printf("temp :: %s \n", cudaGetErrorString(err));
return -999;
}
err = cudaMemset(d_data_out, 0, rows * pitch_out);
if (err != cudaSuccess) {
printf("set temp :: %s \n", cudaGetErrorString(err));
return -999;
}
// Run sum-reduction
cub::DeviceReduce::Sum(d_temp, temp_bytes, d_data_in, d_data_out, rows * pitch_out);
err = cudaGetLastError();
if (err != cudaSuccess) {
printf("reduction :: %s \n", cudaGetErrorString(err));
return -999;
}
real gpu_sum = real(0.0);
err = cudaMemcpy(&gpu_sum, d_data_out, sizeof(real), cudaMemcpyDeviceToHost);
if (err != cudaSuccess) {
printf("copy final :: %s \n", cudaGetErrorString(err));
return -999;
}
printf("Difference in sum (h)%f - (d)%f = %f \n", ref_sum, gpu_sum, ref_sum - gpu_sum);
if (data) free(data);
if (d_data_in) cudaFree(d_data_in);
if (d_data_out) cudaFree(d_data_out);
if (d_temp) cudaFree(d_temp);
cudaDeviceReset();
return 0;
}
The error is thrown at "copy final ::". I am bit confused as to why certain rows x columns work and others don't. I did notice it's the larger values that cause it, but can't get my head around.
Any suggestions would be much appreciated.
The 5th parameter of cub::DeviceReduce::Sum should be the number of input elements. However, rows * pitch_out is the size of the output buffer in bytes.
Assuming pitch_in % sizeof(real) == 0, the following call may work.
cub::DeviceReduce::Sum(d_temp, temp_bytes, d_data_in, d_data_out, rows * (pitch_in / sizeof(real)));
Also note that cub::DeviceReduce::Sum may return before the reduction is complete. In this case, if any error happened during execution, this error will be reported by cudaMemcpy.
I spent the last days trying to figure out how to download a file from an URL.
This is my first challenge with socket and I'm using it to have an understanding of protocols so I would like to do it without cURL libraries and only in C language!!
I searched a lot....now I'm able to printf the source code of a page but I think it's different with a file, I don't have only to put the received data from a buffer to a file, right?
any tips?
#include <sys/socket.h>
#include <sys/types.h>
#include <netinet/in.h>
#include <netdb.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <errno.h>
#include <arpa/inet.h>
int main(void)
{
char domain[] = "www.sstatic.net", path[]="stackexchange/img/logos/so/so-logo-med.png"; //example
int sock, bytes_received;
char send_data[1024],recv_data[9999], *p;
struct sockaddr_in server_addr;
struct hostent *he;
FILE *fp;
he = gethostbyname(domain);
if (he == NULL){
herror("gethostbyname");
exit(1);
}
if ((sock = socket(AF_INET, SOCK_STREAM, 0))== -1){
perror("Socket");
exit(1);
}
server_addr.sin_family = AF_INET;
server_addr.sin_port = htons(80);
server_addr.sin_addr = *((struct in_addr *)he->h_addr);
bzero(&(server_addr.sin_zero),8);
if (connect(sock, (struct sockaddr *)&server_addr,sizeof(struct sockaddr)) == -1){
perror("Connect");
exit(1);
}
snprintf(send_data, sizeof(send_data), "GET /%s HTTP/1.1\r\nHost: /%s\r\n\r\n", path, domain);
//printf("%s\n", send_data);
send(sock, send_data, strlen(send_data), 0);
printf("Data sended.\n");
fp=fopen("received_file","wb");
bytes_received = recv(sock, recv_data, 9999, 0);
recv_data[bytes_received] = '\0';
printf("Data receieved.\n");
printf("%s\n", recv_data);
p = strstr(recv_data, "\r\n\r\n"); //to find "\r\n\r\n" sequence and put the pointer p after that
p=p+4;
fwrite(p,strlen(p),1,fp);
close(sock);
fclose(fp);
return 0;
}
UPDATE 1 thanks to milevyo for some improvements!
It works good with a txt file but it doesn't with other kinds of file (png in this case)
#include <sys/socket.h>
#include <sys/types.h>
#include <netinet/in.h>
#include <netdb.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <errno.h>
#include <arpa/inet.h>
int main(void){
//char domain[] = "www.gnu.org", path[]="/licenses/gpl.txt"; //example
char domain[] = "sstatic.net", path[]="stackexchange/img/logos/so/so-logo-med.png"; //example
int sock, bytes_received;
char send_data[1024],recv_data[9999];
struct sockaddr_in server_addr;
struct hostent *he;
FILE *fp;
he = gethostbyname(domain);
if (he == NULL){
herror("gethostbyname");
exit(1);
}
if ((sock = socket(AF_INET, SOCK_STREAM, 0))== -1){
perror("Socket");
exit(1);
}
server_addr.sin_family = AF_INET;
server_addr.sin_port = htons(80);
server_addr.sin_addr = *((struct in_addr *)he->h_addr);
bzero(&(server_addr.sin_zero),8);
printf("Connecting ...\n");
if (connect(sock, (struct sockaddr *)&server_addr,sizeof(struct sockaddr)) == -1){
perror("Connect");
exit(1);
}
printf("Sending data ...\n");
snprintf(send_data, sizeof(send_data), "GET /%s HTTP/1.1\r\nHost: /%s\r\n\r\n", path, domain);
if(send(sock, send_data, strlen(send_data), 0)==-1){
perror("send");
exit(2);
}
printf("Data sent.\n");
fp=fopen("received_file","wb");
printf("Recieving data...\n\n");
while((bytes_received = recv(sock, recv_data, 9999, 0))>0){
if(bytes_received==-1){
perror("recieve");
exit(3);
}
recv_data[bytes_received] = '\0';
fwrite(recv_data,bytes_received,1,fp);
printf("%s", recv_data);
}
close(sock);
fclose(fp);
printf("\n\nDone.\n\n");
return 0;
}
this code produce a 334 bytes file (instead of 12,4kb of the original file) with this inside:
HTTP/1.1 400 Bad Request
Date: Sat, 28 Nov 2015 16:20:45 GMT
Content-Type: text/html
Content-Length: 177
Connection: close
Server: -nginx
CF-RAY: -
<html>
<head><title>400 Bad Request</title></head>
<body bgcolor="white">
<center><h1>400 Bad Request</h1></center>
<hr><center>cloudflare-nginx</center>
</body>
</html>
somebody knows how to fix this "400 Bad Request"?
This is an update for the previous posted code. The http protocol is far to be implementation in just a small example.
reformatting the code , or giving a modification to it is more than welcome.
#include <sys/socket.h>
#include <sys/types.h>
#include <netinet/in.h>
#include <netdb.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <errno.h>
#include <arpa/inet.h>
#include <string.h>
int ReadHttpStatus(int sock){
char c;
char buff[1024]="",*ptr=buff+1;
int bytes_received, status;
printf("Begin Response ..\n");
while(bytes_received = recv(sock, ptr, 1, 0)){
if(bytes_received==-1){
perror("ReadHttpStatus");
exit(1);
}
if((ptr[-1]=='\r') && (*ptr=='\n' )) break;
ptr++;
}
*ptr=0;
ptr=buff+1;
sscanf(ptr,"%*s %d ", &status);
printf("%s\n",ptr);
printf("status=%d\n",status);
printf("End Response ..\n");
return (bytes_received>0)?status:0;
}
//the only filed that it parsed is 'Content-Length'
int ParseHeader(int sock){
char c;
char buff[1024]="",*ptr=buff+4;
int bytes_received, status;
printf("Begin HEADER ..\n");
while(bytes_received = recv(sock, ptr, 1, 0)){
if(bytes_received==-1){
perror("Parse Header");
exit(1);
}
if(
(ptr[-3]=='\r') && (ptr[-2]=='\n' ) &&
(ptr[-1]=='\r') && (*ptr=='\n' )
) break;
ptr++;
}
*ptr=0;
ptr=buff+4;
//printf("%s",ptr);
if(bytes_received){
ptr=strstr(ptr,"Content-Length:");
if(ptr){
sscanf(ptr,"%*s %d",&bytes_received);
}else
bytes_received=-1; //unknown size
printf("Content-Length: %d\n",bytes_received);
}
printf("End HEADER ..\n");
return bytes_received ;
}
int main(void){
char domain[] = "sstatic.net", path[]="stackexchange/img/logos/so/so-logo-med.png";
int sock, bytes_received;
char send_data[1024],recv_data[1024], *p;
struct sockaddr_in server_addr;
struct hostent *he;
he = gethostbyname(domain);
if (he == NULL){
herror("gethostbyname");
exit(1);
}
if ((sock = socket(AF_INET, SOCK_STREAM, 0))== -1){
perror("Socket");
exit(1);
}
server_addr.sin_family = AF_INET;
server_addr.sin_port = htons(80);
server_addr.sin_addr = *((struct in_addr *)he->h_addr);
bzero(&(server_addr.sin_zero),8);
printf("Connecting ...\n");
if (connect(sock, (struct sockaddr *)&server_addr,sizeof(struct sockaddr)) == -1){
perror("Connect");
exit(1);
}
printf("Sending data ...\n");
snprintf(send_data, sizeof(send_data), "GET /%s HTTP/1.1\r\nHost: %s\r\n\r\n", path, domain);
if(send(sock, send_data, strlen(send_data), 0)==-1){
perror("send");
exit(2);
}
printf("Data sent.\n");
//fp=fopen("received_file","wb");
printf("Recieving data...\n\n");
int contentlengh;
if(ReadHttpStatus(sock) && (contentlengh=ParseHeader(sock))){
int bytes=0;
FILE* fd=fopen("test.png","wb");
printf("Saving data...\n\n");
while(bytes_received = recv(sock, recv_data, 1024, 0)){
if(bytes_received==-1){
perror("recieve");
exit(3);
}
fwrite(recv_data,1,bytes_received,fd);
bytes+=bytes_received;
printf("Bytes recieved: %d from %d\n",bytes,contentlengh);
if(bytes==contentlengh)
break;
}
fclose(fd);
}
close(sock);
printf("\n\nDone.\n\n");
return 0;
}
Try some thing like below: -
#include <sys/socket.h>
#include <sys/errno.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <netdb.h>
#include <unistd.h>
#include <string.h>
#include <strings.h>
#include <stdlib.h>
#include <stdio.h>
#include <openssl/ssl.h>
#include <openssl/err.h>
#define BUFLEN 4096
#define HOST "www.t.edu.pk"
#define PORT 443
int main()
{
int sock, iResult;
char *cmd, *ip;
char recvbuf[BUFLEN];
//
struct sockaddr_in sin;
struct hostent* hent;
//
hent = gethostbyname(HOST);
if(hent == NULL)
{
printf("gethostbyname failed: %d\n", errno);
return -1;
}
printf("gethostbyname succeeded\n");
ip = inet_ntoa(*((struct in_addr*)hent->h_addr_list[0]));
printf("Host IP: %s\n", ip);
//
sock = socket(AF_INET, SOCK_STREAM, 0);
if(sock == -1)
{
printf("socket failed: %d\n", errno);
return -1;
}
printf("socket created\n");
//
sin.sin_family = AF_INET;
sin.sin_addr.s_addr = inet_addr(ip);
sin.sin_port = htons(PORT);
iResult = connect(sock, (struct sockaddr*)&sin, sizeof(sin));
if(iResult < 0)
{
printf("connect failed: %d\n", errno);
return -1;
}
printf("connect succeeded\n");
//
iResult = SSL_library_init();
if(iResult < 0)
{
printf("SSL failed\n");
return -1;
}
printf("SSL library initialised\n");
OpenSSL_add_all_algorithms();
ERR_load_crypto_strings();
SSL_load_error_strings();
SSL_CTX* ctx = SSL_CTX_new(TLSv1_2_client_method());
if(ctx == NULL)
{
printf("ctx failed\n");
ERR_print_errors_fp(stderr);
return -1;
}
printf("ctx loaded\n");
SSL* ssl = SSL_new(ctx);
if(ssl == NULL)
{
printf("ssl failed\n");
ERR_print_errors_fp(stderr);
return -1;
}
printf("ssl loaded\n");
SSL_set_fd(ssl, sock);
SSL_connect(ssl);
//
cmd = "GET / HTTP/1.1\r\nHost: www.t.edu.pk\r\n\r\n";
iResult = SSL_write(ssl, cmd, strlen(cmd));
if(iResult <= 0)
{
printf("SSL write failed\n");
ERR_print_errors_fp(stderr);
return -1;
}
printf("Byte(s) sent: %d\n", iResult);
bzero(recvbuf, BUFLEN);
do
{
iResult = SSL_read(ssl, recvbuf, BUFLEN - 1);
if(iResult < 0)
{
printf("error receiving data\n");
break;
}
if(iResult == 0)
{
printf("host closed connection\n");
break;
}
printf("%s\n", recvbuf);
}while(iResult > 0);
//
iResult = SSL_shutdown(ssl);
if(iResult == 0)
{
printf("SSL shutdown in progress...\n");
}
iResult = SSL_shutdown(ssl);
if(iResult == 1)
{
printf("SSL shutdown complete!\n");
}
if(iResult == -1)
{
printf("SSL shutdown unsuccessful!\n");
}
SSL_CTX_free(ctx);
//
iResult = shutdown(sock, SHUT_RDWR);
if(iResult == -1)
{
printf("Socket shutdown failed: %d\n", errno);
return -1;
}
printf("Socket shutdown succeeded\n");
iResult = close(sock);
if(iResult != 0)
{
printf("error closing socket: %d\n", errno);
return -1;
}
printf("Socket closed\n");
//
return 0;
}
This work for C/C++ in linux Environment. You can run it by makefile, or just add -lcurl option in g++.
Notice that should have lib cURL.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <curl/curl.h>
FILE *fp = fopen("file.txt", "w");
char outfilename[FILENAME_MAX] = "file_downloaded.txt";
FILE *fp1 = fopen(outfilename,"wb");
struct MemoryStruct {
char *memory;
size_t size;
};
static size_t
WriteMemoryCallback(void *contents, size_t size, size_t nmemb, void *userp)
{
size_t realsize = size * nmemb;
struct MemoryStruct *mem = (struct MemoryStruct *)userp;
char *ptr = (char*)realloc(mem->memory, mem->size + realsize + 1);
if(!ptr) {
/* out of memory! */
printf("not enough memory (realloc returned NULL)\n");
return 0;
}
fprintf(fp, "%ld - %ld - %ld\n", realsize, size, nmemb);
size_t written = fwrite(contents, size, nmemb, fp1);
mem->memory = ptr;
memcpy(&(mem->memory[mem->size]), contents, realsize);
mem->size += realsize;
mem->memory[mem->size] = 0;
return realsize;
}
int main(void)
{
char link_download[] = "https://www.example.com/";
CURL *curl_handle;
CURLcode res;
struct MemoryStruct chunk;
chunk.memory = (char*)malloc(1); /* will be grown as needed by the realloc above */
chunk.size = 0; /* no data at this point */
curl_global_init(CURL_GLOBAL_ALL);
/* init the curl session */
curl_handle = curl_easy_init();
/* specify URL to get */
curl_easy_setopt(curl_handle, CURLOPT_URL, link_download);
/* send all data to this function */
curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);
/* we pass our 'chunk' struct to the callback function */
curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, (void *)&chunk);
/* some servers do not like requests that are made without a user-agent
field, so we provide one */
curl_easy_setopt(curl_handle, CURLOPT_USERAGENT, "libcurl-agent/1.0");
/* get it! */
res = curl_easy_perform(curl_handle);
/* check for errors */
if(res != CURLE_OK) {
fprintf(stderr, "curl_easy_perform() failed: %s\n",
curl_easy_strerror(res));
}
else {
/*
* Now, our chunk.memory points to a memory block that is chunk.size
* bytes big and contains the remote file.
*
* Do something nice with it!
*/
printf("%lu bytes retrieved\n", (unsigned long)chunk.size);
}
/* cleanup curl stuff */
curl_easy_cleanup(curl_handle);
free(chunk.memory);
/* we are done with libcurl, so clean it up */
curl_global_cleanup();
fclose(fp);
fclose(fp1);
return 0;
}
I am receiving the error:
Cufft error in file
I am using this file in order to load the FFT and pass them to another file.
//----function to check for errors-------------------------------------------------
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"\nGPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
//function to check for cuFFT errors --------------------------------------------------
#define CUFFT_SAFE_CALL( call) do { \
cufftResult err = call; \
if (err != CUFFT_SUCCESS) { \
fprintf(stderr, "Cufft error in file '%s' in line %i : %s.\n", \
__FILE__, __LINE__, "error" ); \
exit(EXIT_FAILURE); \
} \
} while (0)
#define NX 128*128
#define NY 16
#define BATCH 16
#define NRANK 2
void FFT_transform(cufftDoubleComplex** B_in)
{
int n[NRANK] = {NX, NY};
//size of B
int Bsize=NX*NY*BATCH;
//allocate host memory
*B_in=(cufftDoubleComplex*)malloc(Bsize*sizeof(cufftDoubleComplex));
for (int i=0;i<NX*NY;i++){
for (int j=0;j<BATCH;j++){
(*B_in)[i*BATCH+j].x=(i*BATCH+j)*2;
(*B_in)[i*BATCH+j].y=(i*BATCH+j)*2+1;
}
}
//allocate device memory
cufftDoubleComplex* B_dev;
gpuErrchk(cudaMalloc((void**) &B_dev,Bsize* sizeof(cufftDoubleComplex)));
if (cudaGetLastError() != cudaSuccess){
fprintf(stderr, "Cuda error: Failed to allocate\n");
return;
}
// copy arrays from host to device
gpuErrchk(cudaMemcpy(B_dev, *B_in,Bsize* sizeof(cufftDoubleComplex), cudaMemcpyHostToDevice));
// Create a 2D FFT plan
cufftHandle plan;
CUFFT_SAFE_CALL(cufftPlan2d(&plan,NX,NY,CUFFT_Z2Z));
if (cufftPlanMany(&plan, NRANK, n,NULL, 1, 0,NULL, 1, 0,CUFFT_Z2Z,BATCH) != CUFFT_SUCCESS){
fprintf(stderr, "CUFFT Error: Unable to create plan\n");
return;
}
if (cufftSetCompatibilityMode(plan, CUFFT_COMPATIBILITY_NATIVE)!= CUFFT_SUCCESS){
fprintf(stderr, "CUFFT Error: Unable to set compatibility mode to native\n");
return;
}
// perform transform
CUFFT_SAFE_CALL(cufftExecZ2Z(plan,(cufftDoubleComplex *)(*B_in), (cufftDoubleComplex *)B_dev, CUFFT_FORWARD));
if (cufftExecZ2Z(plan,*B_in,B_dev,CUFFT_FORWARD) != CUFFT_SUCCESS){
fprintf(stderr, "CUFFT Error: Unable to execute plan\n");
return;
}
if (cudaThreadSynchronize() != cudaSuccess){
fprintf(stderr, "Cuda error: Failed to synchronize\n");
return;
}
// copy result from device to host
gpuErrchk(cudaMemcpy(*B_in, B_dev,Bsize*sizeof(cufftDoubleComplex), cudaMemcpyDeviceToHost));
//Destroy CUFFT context
CUFFT_SAFE_CALL(cufftDestroy(plan));
//clean up device memory
gpuErrchk(cudaFree(B_dev));
}
I am receiving the error at line:
CUFFT_SAFE_CALL(cufftExecZ2Z(plan,(cufftDoubleComplex *)(*B_in), (cufftDoubleComplex *)B_dev, CUFFT_FORWARD));
You are getting the error because B_in is a pointer to host memory and not to device memory, which is illegal. In CUFFT, inputs are always in device memory. You need to use cudaMemcpy to transfer the contents of B_in to B_dev before performing the transform, and then supply B_dev as both the input and output, which will result in an in place transform. This is clearly described in the CUFFT API documentation here.
I have a linear array of unsigned chars representing a 2D array. I would like to place it into a CUDA 2D texture and perform (floating point) linear interpolation on it, i.e., have the texture call fetch the 4 nearest unsigned char neighbors, internally convert them to float, interpolate between them, and return the resulting floating point value.
I am having some difficulty setting up the texture and binding it to a texture reference. I have been through the CUDA reference manual & appendices, but I'm just not having any luck.
Below is runnable code to set up and bind 1) a floating point texture and 2) an unsigned char texture. The floating point code runs just fine. However, if you uncomment the two commented unsigned char lines toward the bottom, an "invalid argument" error is thrown.
#include <cstdio>
#include <cuda_runtime.h>
typedef unsigned char uchar;
// Define (global) texture references; must use "cudaReadModeNormalizedFloat"
// for ordinal textures
texture<float, cudaTextureType2D, cudaReadModeNormalizedFloat> texRefFloat;
texture<uchar, cudaTextureType2D, cudaReadModeNormalizedFloat> texRefUChar;
// Define size of (row major) textures
size_t const WIDTH = 1000;
size_t const HEIGHT = 1000;
size_t const TOT_PIX = WIDTH*HEIGHT;
int main(void)
{
// Set texel formats
cudaChannelFormatDesc descFloat = cudaCreateChannelDesc<float>();
cudaChannelFormatDesc descUChar = cudaCreateChannelDesc<uchar>();
// Choose to perform texture 2D linear interpolation
texRefFloat.filterMode = cudaFilterModeLinear;
texRefUChar.filterMode = cudaFilterModeLinear;
// Allocate texture device memory
float * d_buffFloat; cudaMalloc(&d_buffFloat, sizeof(float)*TOT_PIX);
uchar * d_buffUChar; cudaMalloc(&d_buffUChar, sizeof(uchar)*TOT_PIX);
// Bind texture references to textures
cudaError_t errFloat = cudaSuccess;
cudaError_t errUChar = cudaSuccess;
errFloat = cudaBindTexture2D(0, texRefFloat, d_buffFloat, descFloat,
WIDTH, HEIGHT, sizeof(float)*WIDTH);
// Uncomment the following two lines for an error
//errUChar = cudaBindTexture2D(0, texRefUChar, d_buffUChar, descUChar,
// WIDTH, HEIGHT, sizeof(uchar)*WIDTH);
// Check for errors during binding
if (errFloat != cudaSuccess)
{
printf("Error binding float texture reference: %s\n",
cudaGetErrorString(errFloat));
exit(-1);
}
if (errUChar != cudaSuccess)
{
printf("Error binding unsigned char texture reference: %s\n",
cudaGetErrorString(errUChar));
exit(-1);
}
return 0;
}
Any help/insight would be most appreciated!
Aaron
Each row of a texture must be properly aligned. This cannot be guaranteed in general if you bind the texture to a plain array (as opposed to a CUDA array). To bind plain memory to a 2D texture, you would want to allocate the memory with cudaMallocPitch(). This sets the row pitch such that it is suitable for binding to a texture. Note that it is not good practice to pass 0 as the first argument to a texture binding API call. This argument is for CUDA to return an offset to the app. If the offset is non-zero you will need to add it to the texture coordinate during texture access.
Here is a quick example that shows how to read interpolated values from a texture whose elements are unsigned char.
#include <stdlib.h>
#include <stdio.h>
// Macro to catch CUDA errors in CUDA runtime calls
#define CUDA_SAFE_CALL(call) \
do { \
cudaError_t err = call; \
if (cudaSuccess != err) { \
fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
__FILE__, __LINE__, cudaGetErrorString(err) ); \
exit(EXIT_FAILURE); \
} \
} while (0)
// Macro to catch CUDA errors in kernel launches
#define CHECK_LAUNCH_ERROR() \
do { \
/* Check synchronous errors, i.e. pre-launch */ \
cudaError_t err = cudaGetLastError(); \
if (cudaSuccess != err) { \
fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
__FILE__, __LINE__, cudaGetErrorString(err) ); \
exit(EXIT_FAILURE); \
} \
/* Check asynchronous errors, i.e. kernel failed (ULF) */ \
err = cudaThreadSynchronize(); \
if (cudaSuccess != err) { \
fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
__FILE__, __LINE__, cudaGetErrorString( err) ); \
exit(EXIT_FAILURE); \
} \
} while (0)
texture<unsigned char, 2, cudaReadModeNormalizedFloat> tex;
__global__ void kernel (int m, int n, float shift_x, float shift_y)
{
float val;
for (int row = 0; row < m; row++) {
for (int col = 0; col < n; col++) {
val = tex2D (tex, col+0.5f+shift_x, row+0.5f+shift_y);
printf ("%.2f ", val);
}
printf ("\n");
}
}
int main (void)
{
int m = 4; // height = #rows
int n = 3; // width = #columns
size_t pitch, tex_ofs;
unsigned char arr[4][3]= {{11,12,13},{21,22,23},{31,32,33},{251,252,253}};
unsigned char *arr_d = 0;
CUDA_SAFE_CALL(cudaMallocPitch((void**)&arr_d,&pitch,n*sizeof(*arr_d),m));
CUDA_SAFE_CALL(cudaMemcpy2D(arr_d, pitch, arr, n*sizeof(arr[0][0]),
n*sizeof(arr[0][0]),m,cudaMemcpyHostToDevice));
tex.normalized = false;
tex.filterMode = cudaFilterModeLinear;
CUDA_SAFE_CALL (cudaBindTexture2D (&tex_ofs, &tex, arr_d, &tex.channelDesc,
n, m, pitch));
if (tex_ofs !=0) {
printf ("tex_ofs = %zu\n", tex_ofs);
return EXIT_FAILURE;
}
printf ("reading array straight\n");
kernel<<<1,1>>>(m, n, 0.0f, 0.0f);
CHECK_LAUNCH_ERROR();
CUDA_SAFE_CALL (cudaDeviceSynchronize());
printf ("reading array shifted in x-direction\n");
kernel<<<1,1>>>(m, n, 0.5f, 0.0f);
CHECK_LAUNCH_ERROR();
CUDA_SAFE_CALL (cudaDeviceSynchronize());
printf ("reading array shifted in y-direction\n");
kernel<<<1,1>>>(m, n, 0.0f, 0.5f);
CUDA_SAFE_CALL (cudaDeviceSynchronize());
CUDA_SAFE_CALL (cudaFree (arr_d));
return EXIT_SUCCESS;
}
The output of this program is as follows:
reading array straight
0.04 0.05 0.05
0.08 0.09 0.09
0.12 0.13 0.13
0.98 0.99 0.99
reading array shifted in x-direction
0.05 0.05 0.05
0.08 0.09 0.09
0.12 0.13 0.13
0.99 0.99 0.99
reading array shifted in y-direction
0.06 0.07 0.07
0.10 0.11 0.11
0.55 0.56 0.56
0.98 0.99 0.99
My flag for a missing value is 0, so from [0, A, B, 0, 0, C, 0] I want [0, A, B, B, B, C, C] (if no previous non-missing value exists, then just leave as 0).
I'm using the CUDA Thrust library, and was wondering if there's a quick way of doing this without looping though each element.
Many thanks.
seems work well.
#include <thrust/device_vector.h>
#include <thrust/scan.h>
#include <iterator>
template<class T>
struct FillMissing
{
__host__ __device__ T operator()(const T& res, const T& dat)
{
return dat == T(0) ? res : dat;
}
};
int main()
{
thrust::device_vector<double> vec(7);
vec[1] = 2;
vec[2] = 1;
vec[5] = 3;
thrust::inclusive_scan(
vec.begin(), vec.end(),
vec.begin(),
FillMissing<double>());
thrust::copy(
vec.begin(), vec.end(),
std::ostream_iterator<double>(std::cout, " "));
std::cout << std::endl;
}
output:
0 2 1 1 1 3 3