I am new to CUDA. I have figured out how to use 1D and 2D textures in CUDA, but I am struggling with 1D layered textures. The output of my kernel, which uses the texture, is all zeros, which is definitely incorrect, and I am not sure what I am doing wrong. I strongly suspect that I have not set up the texture correctly, but I checked for CUDA errors everywhere and couldn't find any. Can someone show me how to correctly set up and use a 1D layered texture? Here is my code. Thanks in advance:
// To Compile: nvcc backproj.cu -o backproj.out
// To Run: ./backproj.out
// Includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// Includes CUDA
#include <cuda_runtime.h>
#include <cuda_profiler_api.h>
#define pi acos(-1)
// 1D float textures
texture<float, cudaTextureType1DLayered, cudaReadModeElementType> texRef;
// 1D interpolation kernel: Should be very similar to what you get if you used 1D interpolation on MATLAB
// Launch layout: 1D grid of 1D blocks, one thread per entry of d_locations.
// d_output[k] receives the sample of texRef layer `layer` taken at
// d_locations[k] + 0.5.
__global__ void interp1Kernel(float* d_output, float* d_locations, int numlocations, int layer) {
    const unsigned int tid = threadIdx.x + blockDim.x * blockIdx.x;
    // Surplus threads in the last block have no location to process.
    if (!(tid < numlocations)) return;
    // Half-sample shift; presumably lines integer locations up with texel
    // centres under linear filtering (confirm against the texture docs).
    const float texCoord = (float) d_locations[tid] + 0.5f;
    // Fetch from the texture and store the result in global memory.
    d_output[tid] = tex1DLayered(texRef, texCoord, layer);
}
// Host code
// Host driver for the first attempt: fills a small table of cosine samples,
// uploads it to a CUDA array, binds the array to texRef, samples layer 0 at
// fractional positions with interp1Kernel and prints input and output.
// No CUDA return code is inspected anywhere in this function.
int main()
{
// Setup h_data and locations to interpolate from
const unsigned int len = 10;
const unsigned int numlayers = 3;
const unsigned int upsamp = 3;
const unsigned int loclen = 1 + (len - 1) * upsamp;
float idx_spacing = 1/(float)upsamp;
// h_data is indexed [sample][layer], so consecutive floats in memory belong
// to different layers of the same sample index.
float h_data[len][numlayers], h_loc[loclen];
for (int i = 0; i < len; i++)
for (int j = 0; j < numlayers; j++)
h_data[i][j] = 1+cosf((float) pi*i/(j+1.0f));
// Sampling positions: 0, 1/upsamp, 2/upsamp, ... up to len-1
for (int i = 0; i < loclen; i ++)
h_loc[i] = i*idx_spacing;
// Get the memory locations you want
float* d_loc;
cudaMalloc(&d_loc, loclen * sizeof(float));
cudaMemcpy(d_loc, h_loc, loclen*sizeof(float), cudaMemcpyHostToDevice);
// Allocate CUDA array in device memory
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaArray* cuArray;
// NOTE(review): len and numlayers are passed as the two size arguments and no
// layered flag is given; the reply after this listing states that layered
// textures must be allocated with cudaMalloc3DArray and cudaArrayLayered.
cudaMallocArray(&cuArray, &channelDesc, len, numlayers);
// Copy to device memory some data located at address h_data in host memory
cudaMemcpyToArray(cuArray, 0, 0, h_data, len * numlayers * sizeof(float), cudaMemcpyHostToDevice);
// Set texture reference parameters
texRef.addressMode[0] = cudaAddressModeBorder;
texRef.filterMode = cudaFilterModeLinear;
texRef.normalized = false;
// Bind the array to the texture reference
cudaBindTextureToArray(texRef, cuArray, channelDesc);
// Allocate result of transformation in device memory
float* d_output;
cudaMalloc(&d_output, loclen * sizeof(float));
// Invoke kernel
int thdsPerBlk = 256;
// Integer division plus one: yields one spare block when loclen is an exact
// multiple of thdsPerBlk; the kernel's bounds check makes that harmless.
int blksPerGrid = (int) (loclen / thdsPerBlk) + 1;
printf("Threads Per Block: %d, Blocks Per Grid: %d\n", thdsPerBlk, blksPerGrid);
// Only layer 0 is sampled.
interp1Kernel <<<blksPerGrid, thdsPerBlk >>>(d_output, d_loc, loclen, 0);
// Print Results
printf("\n Original Indices \n");
for (int i = 0; i < len; i++) printf(" %d ", i);
printf("\n Original array \n");
for (int i = 0; i < len; i++) printf("%5.3f ", h_data[i][0]);
printf("\n Output Indices \n");
for (int i = 0; i < loclen; i++) printf("%5.3f ", h_loc[i]);
printf("\n Output Array \n");
// The results are copied into h_loc, overwriting the sampling positions
// that were printed just above.
cudaMemcpy(h_loc, d_output, loclen * sizeof(float), cudaMemcpyDeviceToHost);
for (int i = 0; i < loclen; i++) printf("%5.3f ", h_loc[i]);
printf("\n");
// Free device memory
// NOTE(review): d_loc is not released here.
cudaFreeArray(cuArray);
cudaFree(d_output);
return 0;
}
You must use cudaMalloc3DArray with the cudaArrayLayered flag set to allocate memory for layered textures. There is a complete example of layered texture usage in the toolkit samples which you can study to see how they work.
Unfortunately, the CUDA SDK only shows you how to do it when you have 2D layered texture. There is some more trickiness when it comes to 1D layered textures. It turns out you have to put a 0 into the second argument for make_cudaExtent when making the extentDesc as follows:
cudaExtent extentDesc = make_cudaExtent(len, 0, numlayers); // <-- 0 height required for 1Dlayered
However, when using make_cudaExtent for mParams.extent for cudaMemcpy3D, you still need to put a 1 for the second argument:
mParams.extent = make_cudaExtent(len, 1, numlayers); // <<-- non zero height required for memcpy to do anything
Furthermore, there are some other non-obvious details such as the pitch for make_cudaPitchedPtr. So I have included my complete and functioning code for the 1D layered texture. I couldn't find an example of this anywhere. So hopefully this will help out others who are in the same boat:
// To Compile: nvcc layeredTexture1D.cu -o layeredTexture1D.out
// To Run: ./layeredTexture1D.out
// Includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// Includes CUDA
#include <cuda_runtime.h>
#include <cuda_profiler_api.h>
#define pi acos(-1)
// 1D float textures: x is for input values, y is for corresponding output values
texture<float, cudaTextureType1DLayered, cudaReadModeElementType> texRef;
// 1D interpolation kernel: Should be very similar to what you get if you used 1D interpolation on MATLAB
// Launch layout: 2D grid of 2D blocks. x indexes the entries of d_locations,
// y indexes the texture layer. Output is layer-major:
// d_output[layer * numlocations + k] holds the sample for location k.
__global__ void interp1Kernel(float* d_output, float* d_locations, int numlocations, int numlayers) {
    const unsigned int col = threadIdx.x + blockDim.x * blockIdx.x;  // location index
    const unsigned int lyr = threadIdx.y + blockDim.y * blockIdx.y;  // layer index
    // Threads outside the (numlocations x numlayers) rectangle do nothing.
    if (col >= numlocations || lyr >= numlayers) return;
    // Half-sample shift; presumably lines integer locations up with texel
    // centres under linear filtering (confirm against the texture docs).
    const float texCoord = (float)d_locations[col] + 0.5f;
    const float sample = tex1DLayered(texRef, texCoord, lyr);
    d_output[lyr * numlocations + col] = sample;
}
// Host code
// Abort with a readable message when a CUDA runtime call fails.
static void checkCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Host code: builds numlayers rows of cosine samples, uploads them into a
// 1D layered CUDA array, binds the array to texRef, resamples every layer at
// fractional positions with interp1Kernel and prints input and output.
// Every CUDA call is checked and every device resource is released.
int main()
{
    // Setup h_data and locations to interpolate from
    const unsigned int len = 7;
    const unsigned int numlayers = 3;
    const unsigned int upsamp = 4;
    const unsigned int loclen = 1 + (len - 1) * upsamp;
    const float idx_spacing = 1 / (float)upsamp;

    // Layer-major host layout: layer j occupies h_data[j*len .. j*len + len - 1]
    float h_data[numlayers*len], h_loc[loclen];
    for (unsigned int i = 0; i < len; i++)
        for (unsigned int j = 0; j < numlayers; j++)
            h_data[len*j + i] = 1 + cosf((float)pi*i / (j + 1.0f));
    for (unsigned int i = 0; i < loclen; i++)
        h_loc[i] = i*idx_spacing;

    // Get the memory locations you want
    float* d_loc = NULL;
    checkCuda(cudaMalloc(&d_loc, loclen * sizeof(float)), "cudaMalloc(d_loc)");
    checkCuda(cudaMemcpy(d_loc, h_loc, loclen*sizeof(float), cudaMemcpyHostToDevice),
              "cudaMemcpy(d_loc)");

    // Allocate CUDA array in device memory
    cudaExtent extentDesc = make_cudaExtent(len, 0, numlayers); // <-- 0 height required for 1Dlayered
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaArray* cuArray = NULL;
    checkCuda(cudaMalloc3DArray(&cuArray, &channelDesc, extentDesc, cudaArrayLayered),
              "cudaMalloc3DArray");

    // Describe the host source as rows of len floats, one row per layer
    cudaMemcpy3DParms mParams = { 0 };
    mParams.srcPtr = make_cudaPitchedPtr(h_data, len*sizeof(float), len, 1);
    mParams.kind = cudaMemcpyHostToDevice;
    mParams.extent = make_cudaExtent(len, 1, numlayers); // <<-- non zero height required for memcpy to do anything
    mParams.dstArray = cuArray;
    checkCuda(cudaMemcpy3D(&mParams), "cudaMemcpy3D");

    // Set texture reference parameters
    texRef.addressMode[0] = cudaAddressModeBorder;
    texRef.filterMode = cudaFilterModeLinear;
    texRef.normalized = false;
    // Bind the array to the texture reference
    checkCuda(cudaBindTextureToArray(texRef, cuArray, channelDesc), "cudaBindTextureToArray");

    // Allocate result of transformation in device memory
    float *d_output = NULL;
    checkCuda(cudaMalloc(&d_output, loclen * numlayers * sizeof(float)), "cudaMalloc(d_output)");
    float h_output[loclen * numlayers];

    // Invoke kernel: x covers the locations, y covers the layers
    dim3 dimBlock(16, 16, 1);
    dim3 dimGrid((loclen + dimBlock.x - 1) / dimBlock.x,
                 (numlayers + dimBlock.y - 1) / dimBlock.y, 1);
    interp1Kernel<<<dimGrid, dimBlock>>>(d_output, d_loc, loclen, numlayers);
    // A bad launch configuration is only reported through cudaGetLastError
    checkCuda(cudaGetLastError(), "interp1Kernel launch");

    // Print Results
    printf("\n Original Indices \n");
    for (unsigned int i = 0; i < len; i++) printf(" %u ", i);
    printf("\n Original array \n");
    for (unsigned int j = 0; j < numlayers; j++) {
        for (unsigned int i = 0; i < len; i++) {
            printf("%5.3f ", h_data[i + j*len]);
        }
        printf("\n");
    }
    printf("\n Output Indices \n");
    for (unsigned int i = 0; i < loclen; i++) printf("%5.3f ", h_loc[i]);
    printf("\n Output Array \n");
    // Blocking copy: also waits for the kernel and reports any execution error
    checkCuda(cudaMemcpy(h_output, d_output, loclen * numlayers * sizeof(float), cudaMemcpyDeviceToHost),
              "cudaMemcpy(h_output)");
    for (unsigned int j = 0; j < numlayers; j++) {
        for (unsigned int i = 0; i < loclen; i++) {
            printf("%5.3f ", h_output[i + j*loclen]);
        }
        printf("\n");
    }
    printf("\n");

    // Release the texture binding and all device memory
    checkCuda(cudaUnbindTexture(texRef), "cudaUnbindTexture");
    checkCuda(cudaFreeArray(cuArray), "cudaFreeArray");
    checkCuda(cudaFree(d_output), "cudaFree(d_output)");
    checkCuda(cudaFree(d_loc), "cudaFree(d_loc)");
    return 0;
}
Related
I have the following two mostly identical example codes. code1.cu uses cudaMalloc and cudaMemcpy to exchange values between device and host variables.
code2.cu uses cudaMallocManaged, so cudaMemcpy is not needed. When cudaMallocManaged is used, I have to include cudaDeviceSynchronize() to get the correct result, whereas with cudaMalloc this is not needed. I would appreciate a hint as to why this happens.
code2.cu
#include <iostream>
#include <math.h>
#include <vector>
//
using namespace std;
// Kernel function to do nested loops
// Accumulates x[i] + y[j] into *tot for every (i, j) pair inside the
// max_x by max_y rectangle. Launch layout: 2D grid of 2D blocks, x -> i, y -> j.
__global__
void add(int max_x, int max_y, float *tot, float *x, float *y)
{
    const int col = blockIdx.x*blockDim.x + threadIdx.x;
    const int row = blockIdx.y*blockDim.y + threadIdx.y;
    // Threads outside the rectangle contribute nothing.
    if (col >= max_x || row >= max_y) return;
    // All threads add into the same address, hence the atomic.
    atomicAdd(tot, x[col] + y[row]);
}
// Host driver (code2.cu): sums x[i] + y[j] over all pairs on the CPU, then
// launches `add` with the running total held in a managed allocation and
// reads the total back by dereferencing that pointer on the host.
int main(void)
{
int Nx = 1<<15;
int Ny = 1<<15;
float *d_x = NULL, *d_y = NULL;
float *d_tot = NULL;
// d_x and d_y are plain device buffers; only d_tot is a managed allocation.
cudaMalloc((void **)&d_x, sizeof(float)*Nx);
cudaMalloc((void **)&d_y, sizeof(float)*Ny);
cudaMallocManaged((void **)&d_tot, sizeof(float));
// Allocate Unified Memory – accessible from CPU or GPU
vector<float> vx;
vector<float> vy;
// initialize x and y arrays on the host
for (int i = 0; i < Nx; i++)
vx.push_back(i);
for (int i = 0; i < Ny; i++)
vy.push_back(i*10);
//
// CPU reference: the same pairwise sum the kernel computes
float tot = 0;
for(int i = 0; i<vx.size(); i++)
for(int j = 0; j<vy.size(); j++)
tot += vx[i] + vy[j];
cout<<"CPU: tot: "<<tot<<endl;
//
cudaMemcpy(d_x, vx.data(), vx.size()*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, vy.data(), vy.size()*sizeof(float), cudaMemcpyHostToDevice);
//
int blockSize; // The launch configurator returned block size
int minGridSize; // Output of the occupancy query below; not used afterwards
// Nx+Ny is passed as the fifth argument of the occupancy query.
cudaOccupancyMaxPotentialBlockSize( &minGridSize, &blockSize, add, 0, Nx+Ny);
//.. bx*by can not go beyond the blockSize, or hardware limit, which is 1024;
//.. bx*bx = blockSize && bx/by=Nx/Ny, solve the equation
int bx = sqrt(blockSize*Nx/(float)Ny);
int by = bx*Ny/(float)Nx;
dim3 blockSize_3D(bx, by);
// The y extent uses (Ny+by+1)/by rather than (Ny+by-1)/by; any surplus
// threads are rejected by the bounds check inside add.
dim3 gridSize_3D((Nx+bx-1)/bx, (Ny+by+1)/by);
cout<<"blockSize: "<<blockSize<<endl;
cout<<"bx: "<<bx<<" by: "<<by<<" gx: "<<gridSize_3D.x<<" gy: "<<gridSize_3D.y<<endl;
// calculate theoretical occupancy
int maxActiveBlocks;
cudaOccupancyMaxActiveBlocksPerMultiprocessor( &maxActiveBlocks, add, blockSize, 0);
int device;
cudaDeviceProp props;
cudaGetDevice(&device);
cudaGetDeviceProperties(&props, device);
// Ratio of resident warps to the maximum number of warps per multiprocessor
float occupancy = (maxActiveBlocks * blockSize / props.warpSize) /
(float)(props.maxThreadsPerMultiProcessor /
props.warpSize);
printf("Launched blocks of size %d. Theoretical occupancy: %f\n",
blockSize, occupancy);
// Run the kernel over all Nx*Ny (i, j) pairs on the GPU
// NOTE(review): *d_tot is never explicitly zeroed before the kernel
// accumulates into it.
tot = 0;
add<<<gridSize_3D, blockSize_3D>>>(Nx, Ny, d_tot, d_x, d_y);
// Wait for GPU to finish before accessing on host
// With the synchronize commented out, the host dereferences d_tot without
// waiting for the kernel; this is the behaviour the surrounding question asks about.
//cudaDeviceSynchronize();
tot =*d_tot;
//
//
cout<<" GPU: tot: "<<tot<<endl;
// Free memory
cudaFree(d_x);
cudaFree(d_y);
cudaFree(d_tot);
return 0;
}
code1.cu
#include <iostream>
#include <math.h>
#include <vector>
//
using namespace std;
// Kernel function to do nested loops
// Accumulates x[i] + y[j] into *tot for every (i, j) pair inside the
// max_x by max_y rectangle. Launch layout: 2D grid of 2D blocks, x -> i, y -> j.
__global__
void add(int max_x, int max_y, float *tot, float *x, float *y)
{
    const int col = blockIdx.x*blockDim.x + threadIdx.x;
    const int row = blockIdx.y*blockDim.y + threadIdx.y;
    // Threads outside the rectangle contribute nothing.
    if (col >= max_x || row >= max_y) return;
    // All threads add into the same address, hence the atomic.
    atomicAdd(tot, x[col] + y[row]);
}
// Host driver (code1.cu): sums x[i] + y[j] over all pairs on the CPU, then
// launches `add` with the running total held in a cudaMalloc'd float and
// copies the total back to the host with cudaMemcpy.
int main(void)
{
int Nx = 1<<15;
int Ny = 1<<15;
float *d_x = NULL, *d_y = NULL;
float *d_tot = NULL;
// All three buffers, including d_tot, are plain device allocations here.
cudaMalloc((void **)&d_x, sizeof(float)*Nx);
cudaMalloc((void **)&d_y, sizeof(float)*Ny);
cudaMalloc((void **)&d_tot, sizeof(float));
// Host-side input vectors (no unified memory is used in this variant)
vector<float> vx;
vector<float> vy;
// initialize x and y arrays on the host
for (int i = 0; i < Nx; i++)
vx.push_back(i);
for (int i = 0; i < Ny; i++)
vy.push_back(i*10);
//
// CPU reference: the same pairwise sum the kernel computes
float tot = 0;
for(int i = 0; i<vx.size(); i++)
for(int j = 0; j<vy.size(); j++)
tot += vx[i] + vy[j];
cout<<"CPU: tot: "<<tot<<endl;
//
cudaMemcpy(d_x, vx.data(), vx.size()*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, vy.data(), vy.size()*sizeof(float), cudaMemcpyHostToDevice);
//
int blockSize; // The launch configurator returned block size
int minGridSize; // Output of the occupancy query below; not used afterwards
// Nx+Ny is passed as the fifth argument of the occupancy query.
cudaOccupancyMaxPotentialBlockSize( &minGridSize, &blockSize, add, 0, Nx+Ny);
//.. bx*by can not go beyond the blockSize, or hardware limit, which is 1024;
//.. bx*bx = blockSize && bx/by=Nx/Ny, solve the equation
int bx = sqrt(blockSize*Nx/(float)Ny);
int by = bx*Ny/(float)Nx;
dim3 blockSize_3D(bx, by);
// The y extent uses (Ny+by+1)/by rather than (Ny+by-1)/by; any surplus
// threads are rejected by the bounds check inside add.
dim3 gridSize_3D((Nx+bx-1)/bx, (Ny+by+1)/by);
cout<<"blockSize: "<<blockSize<<endl;
cout<<"bx: "<<bx<<" by: "<<by<<" gx: "<<gridSize_3D.x<<" gy: "<<gridSize_3D.y<<endl;
// calculate theoretical occupancy
int maxActiveBlocks;
cudaOccupancyMaxActiveBlocksPerMultiprocessor( &maxActiveBlocks, add, blockSize, 0);
int device;
cudaDeviceProp props;
cudaGetDevice(&device);
cudaGetDeviceProperties(&props, device);
// Ratio of resident warps to the maximum number of warps per multiprocessor
float occupancy = (maxActiveBlocks * blockSize / props.warpSize) /
(float)(props.maxThreadsPerMultiProcessor /
props.warpSize);
printf("Launched blocks of size %d. Theoretical occupancy: %f\n",
blockSize, occupancy);
// Run the kernel over all Nx*Ny (i, j) pairs on the GPU
// NOTE(review): *d_tot is never explicitly zeroed before the kernel
// accumulates into it.
tot = 0;
add<<<gridSize_3D, blockSize_3D>>>(Nx, Ny, d_tot, d_x, d_y);
// Wait for GPU to finish before accessing on host
//cudaDeviceSynchronize();
//
// The reply after the listings notes that cudaMemcpy has a synchronizing
// effect, so this copy observes the finished kernel.
cudaMemcpy(&tot, d_tot, sizeof(float), cudaMemcpyDeviceToHost);
//
cout<<" GPU: tot: "<<tot<<endl;
// Free memory
cudaFree(d_x);
cudaFree(d_y);
cudaFree(d_tot);
return 0;
}
//Code2.cu has the following output:
//
//CPU: tot: 8.79609e+12
//blockSize: 1024
//bx: 32 by: 32 gx: 1024 gy: 1025
//Launched blocks of size 1024. Theoretical occupancy: 1.000000
//GPU: tot: 0
After uncommenting cudaDeviceSynchronize(), the output becomes:
GPU: tot: 8.79609e+12
CUDA kernel launches are asynchronous. That means that they execute independently of the CPU thread that launched them.
Because of this asynchronous launch, the CUDA kernel is not guaranteed to be finished (or even started) by the time your CPU thread code begins testing the result.
Therefore it is necessary to wait until the GPU kernel is complete, and cudaDeviceSynchronize() does exactly that. cudaMemcpy also has a synchronizing effect, so when you remove the cudaMemcpy operations, you lose that synchronization, but cudaDeviceSynchronize() restores it.
I'm trying to initialize in a random manner the weights ( stored as floats ) of a neural network using CURAND functions.
I first initialize the neural network with some values, and after that I attempt to copy the two matrices in the nn struct (nn stands for neural network) that should store the weight values (nn.wih and nn.who) into device memory.
Then I call a function that should randomize the matrices' values (assignRandomWeight), which launches two kernels that holds curand functions.
Finally I try to copy the resulting matrices back to the host memory through a cudaMemcpy call, but at this point I get the error "an illegal memory access was encountered".
I tried to print the values of the device copies of wih and who, which are d_wih and d_who. They seem to be correct. I left two functions in the code that are useful for debugging:
checkCudaError can be called to check the last cudaError_t string message
showValues is useful for printing the values of a device-allocated array
I extracted a sample of my code that compiles and shows the same error; please help me out.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <time.h>
#include<cuda.h>
#include <curand.h>
#include <curand_kernel.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// Host-side description of a three-layer network (input, hidden, output).
// All pointer members are host buffers allocated by initNeuralNetwork.
struct TNeuralNetwork {
int input_neurons; // number of input neurons
int hidden_neurons; // number of hidden neurons
int output_neurons; // number of output neurons
float *wih; //first layer of weights (from input layer to hidden layer)
float *who; //second layer of weights (from hidden layer to output layer)
float *wih_old; //for the momentum
float *who_old; //for the momentum
float *erro; // one float per output neuron; presumably the output-layer error (not used in this sample)
float *errh; // one float per hidden neuron; presumably the hidden-layer error (not used in this sample)
float l; //learning rate
float m; //momentum
float *i; //values into input neurons
float *h; //values into hidden neurons
float *o; //values into output neurons
};
__host__ void checkCudaError(char *str);
__global__ void showValues(float *d_v, int dim);
__global__ void init_rand(unsigned int seed, curandState_t state_wih);
__global__ void generateRandomValues(curandState_t state_wih, float *wih, float *who, int inp, int hid, int out);
void assignRandomWeight(struct TNeuralNetwork *nn, float *d_wih, float *d_who);
void initNeuralNetwork(struct TNeuralNetwork *nn, int bands, int nlabel);
// Host driver for the sample: builds the network, zeroes both host weight
// matrices, uploads wih to the device, asks assignRandomWeight to randomise
// the device copies and copies wih back to the host.
int main(int argc, char **argv) {
struct TNeuralNetwork nn;
//Declare Device variables
float *d_wih;
// NOTE(review): d_who is never allocated in this sample (its cudaMalloc is
// commented out below), yet it is passed to assignRandomWeight and cudaFree.
float *d_who;
unsigned int v; // element count of wih; only used by the commented-out showValues launches
cudaError_t cudaStatus; // declared but not used in this function
initNeuralNetwork(&nn, 102, 10);
//Allocate Device Memory
v = (nn.input_neurons + 1)*(nn.hidden_neurons);
cudaMalloc((void**)&d_wih, (nn.input_neurons + 1)*(nn.hidden_neurons)*sizeof(float));
checkCudaError("malloc1");
//cudaMalloc((void**)&d_who, (nn.hidden_neurons + 1)*nn.output_neurons * sizeof(float));
//checkCudaError("malloc2");
// Zero the host copy of wih, stored row-major as (input_neurons + 1) x hidden_neurons
for (int i = 0; i < (nn.input_neurons + 1); i++){
for (int j = 0; j < nn.hidden_neurons; j++){
nn.wih[i*nn.hidden_neurons + j] = 0;
}
}
// Zero the host copy of who, stored row-major as (hidden_neurons + 1) x output_neurons
for (int i = 0; i < (nn.hidden_neurons + 1); i++){
for (int j = 0; j < nn.output_neurons; j++){
nn.who[i*nn.output_neurons + j] = 0;
}
}
cudaMemcpy(d_wih, nn.wih, (nn.input_neurons + 1)*(nn.hidden_neurons)*sizeof(float), cudaMemcpyHostToDevice);
checkCudaError("memcpy0");
//showValues << <v, 1 >> >(d_wih, v); TEST
//cudaMemcpy(d_who, nn.who, (nn.hidden_neurons + 1)*nn.output_neurons*sizeof(float), cudaMemcpyHostToDevice);
//checkCudaError("memcpy0.1");
assignRandomWeight(&nn, d_wih, d_who);
// First runtime call after the kernel launches; per the question this is
// where "an illegal memory access was encountered" is reported.
cudaMemcpy(nn.wih, d_wih, (nn.input_neurons + 1)*(nn.hidden_neurons)*sizeof(float), cudaMemcpyDeviceToHost);
//showValues << <v, 1 >> >(d_wih, v); TEST
checkCudaError("memcpy1");
//cudaMemcpy(nn.who, d_who, (nn.hidden_neurons + 1)*nn.output_neurons*sizeof(float), cudaMemcpyDeviceToHost);
//checkCudaError("memcpy2");
//printf("WIH:\n");
//for (int i = 0; i < (nn.input_neurons + 1); i++){
// for (int j = 0; j < (nn.hidden_neurons); j++){
// printf("%.12f\t", nn.wih[i*(nn.hidden_neurons) + j]);
// }
// printf("\n\n");
//}
//printf("WHO:\n");
//for (int i = 0; i < (nn.hidden_neurons + 1); i++){
// for (int j = 0; j < nn.output_neurons; j++){
// printf("%.12f\t", nn.wih[i*nn.output_neurons + j]);
// }
// printf("\n\n");
//}
// NOTE(review): the host buffers owned by nn are not released before returning.
cudaFree(d_wih);
cudaFree(d_who);
return 0;
}
// Reads (and clears) the most recent CUDA runtime error. On any error it
// prints the caller-supplied tag with the error text and exits with -1.
__host__ void checkCudaError(char *str){
    const cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) {
        return;  // nothing pending
    }
    printf("Cuda Error at %s: %s \n", str, cudaGetErrorString(status));
    exit(-1);
}
// Debug kernel: each thread prints one element of d_v.
// Launch layout: 1D grid of 1D blocks covering at least `dim` threads.
__global__ void showValues(float *d_v, int dim){
    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
    // Threads beyond the end of the array print nothing.
    if (idx >= dim) return;
    printf("elemento[%d] = %.4f\n", idx, d_v[idx]);
}
// Seeds a cuRAND state for the calling thread.
// NOTE(review): state_wih is received by value, so curand_init initialises
// the kernel's own copy of the argument rather than memory the host
// allocated; the poster's follow-up replaces it with a pointer parameter.
__global__ void init_rand(unsigned int seed, curandState_t state_wih){
int tid = blockIdx.x*blockDim.x + threadIdx.x;
// seed is passed first, then 0, then the thread id
curand_init(seed, 0, tid, &state_wih);
}
// Fills wih and who with curand_uniform draws, one element per thread, and
// prints every value it produces.
// NOTE(review): state_wih is a single by-value object, so &state_wih + tid
// addresses memory past that object for every tid other than 0; the poster's
// follow-up replaces it with a pointer to a device array of states.
__global__ void generateRandomValues(curandState_t state_wih, float *wih, float *who, int inp, int hid, int out){
int tid = (blockIdx.x)*(blockDim.x) + threadIdx.x;
// Debug print of a raw draw; executed by every thread, before any bounds check
printf("%.7f", (float)curand(&state_wih + tid));
// The bound is inclusive, so tid == (inp + 1)*hid is also written
if (tid <= (inp + 1)*hid){
wih[tid] = (float)curand_uniform(&state_wih + tid);
printf("%.7f", wih[tid]);
}
// The bound is inclusive, so tid == (hid + 1)*out is also written
if (tid <= (hid + 1)*out){
who[tid] = (float)curand_uniform(&state_wih + tid);
printf("%.7f", who[tid]);
}
}
// Allocate `count` zero-initialised floats on the host. Terminates the
// program when the allocation fails so callers never receive an unusable
// buffer.
static float *allocZeroedFloats(size_t count) {
    float *buf = (float*)calloc(count, sizeof(float));
    if (buf == NULL && count != 0) {
        fprintf(stderr, "initNeuralNetwork: out of host memory (%lu floats)\n",
                (unsigned long)count);
        exit(EXIT_FAILURE);
    }
    return buf;
}

// Sets the layer sizes and hyper-parameters of *nn and allocates all of its
// host buffers, zero-filled.
//   nn     - network to initialise
//   bands  - number of input neurons
//   nlabel - number of output neurons
// The hidden layer gets (bands + nlabel) * 2 / 3 neurons, truncated.
void initNeuralNetwork(struct TNeuralNetwork *nn, int bands, int nlabel) {
    nn->input_neurons = bands;
    nn->output_neurons = nlabel;
    //nn->hidden_neurons = (int)((bands + nlabel)/2.0f);
    nn->hidden_neurons = (int)((bands + nlabel)*2.0f / 3.0f);
    nn->l = 0.001f;  // learning rate
    nn->m = 0.2f;    // momentum

    const int hidden = nn->hidden_neurons;
    const size_t wihCount = (size_t)(bands + 1) * hidden;   // +1 for the bias
    const size_t whoCount = (size_t)(hidden + 1) * nlabel;  // +1 for the bias

    // Weight matrices and their previous values (kept for the momentum term)
    nn->wih = allocZeroedFloats(wihCount);
    nn->who = allocZeroedFloats(whoCount);
    nn->wih_old = allocZeroedFloats(wihCount);
    nn->who_old = allocZeroedFloats(whoCount);

    // Per-neuron values
    nn->i = allocZeroedFloats((size_t)bands);
    nn->h = allocZeroedFloats((size_t)hidden);
    nn->o = allocZeroedFloats((size_t)nlabel);

    // Per-neuron error buffers
    nn->errh = allocZeroedFloats((size_t)hidden);
    nn->erro = allocZeroedFloats((size_t)nlabel);
}
//curand
// Launches init_rand and then generateRandomValues over a 1D grid sized to
// cover the (input_neurons + 1) * hidden_neurons elements of wih.
// NOTE(review): state_wih is declared as a curandState_t object, not a
// pointer, so cudaMalloc stores the device address inside that object and the
// kernels receive the object by value; the poster's follow-up changes it to a
// pointer.
void assignRandomWeight(struct TNeuralNetwork *nn, float *d_wih, float *d_who) {
cudaError_t cudaStatus; // declared but not used
curandState_t state_wih;
// Host-side seed taken from the C library generator, itself seeded with the current time
srand(time(NULL));
unsigned int seed = rand();
// Allocate the array of curandState_t used for the randomisation; it is no longer needed once this function returns
cudaMalloc((void**)&state_wih, (nn->input_neurons + 1)*(nn->hidden_neurons)* sizeof(curandState_t));
// Blocks of 32 threads, rounded up to cover every element of wih
dim3 gridSize(ceil((double)((nn->input_neurons + 1)*(nn->hidden_neurons)) / 32));
dim3 blockSize(32);
init_rand << < gridSize, blockSize >> >(seed, state_wih);
generateRandomValues << < gridSize, blockSize >> >(state_wih, d_wih, d_who, nn->input_neurons, nn->hidden_neurons, nn->output_neurons);
}
"Incorrect Indexing" will produce out-of-bounds memory access within the kernel. The CUDA runtime will destroy your context at the point where the error occurred within the kernel, after which no CUDA operations which rely on the context can be performed. The cudaMemcpy call fails because your context has been destroyed. There is no way to avoid this.
NVIDIA supply a utility called cuda-memcheck with the CUDA toolkit. Use that instead to diagnose what is going wrong with your kernel.
I've found my error:
I was misusing the "curandState_t" variable in the assignRandomWeight function; I had to use a pointer.
This is the correct version:
// Randomises the device weight matrices d_wih and d_who.
//   nn    - network whose layer sizes determine the launch size
//   d_wih - device buffer of (input_neurons + 1) * hidden_neurons floats
//   d_who - device buffer of (hidden_neurons + 1) * output_neurons floats
// Allocates a temporary array of cuRAND states, seeds one state per launched
// thread with init_rand, draws the weights with generateRandomValues and
// releases the states before returning. Any CUDA failure is reported through
// checkCudaError, which terminates the program.
void assignRandomWeight(struct TNeuralNetwork *nn, float *d_wih, float *d_who) {
    const int threadsPerBlock = 32;
    const int numWeights = (nn->input_neurons + 1) * (nn->hidden_neurons);
    if (numWeights <= 0) {
        return;  // nothing to randomise; a zero-sized grid is an invalid launch
    }
    const int numBlocks = (numWeights + threadsPerBlock - 1) / threadsPerBlock;
    // init_rand has no bounds check, so every launched thread writes a state:
    // size the array for the whole rounded-up grid, not just numWeights.
    const size_t numStates = (size_t)numBlocks * threadsPerBlock;

    srand(time(NULL));
    unsigned int seed = rand();

    // Temporary state array; it is only needed for the duration of this call.
    curandState_t *state_wih = NULL;
    cudaMalloc((void**)&state_wih, numStates * sizeof(curandState_t));
    checkCudaError("assignRandomWeight: cudaMalloc states");

    dim3 gridSize(numBlocks);
    dim3 blockSize(threadsPerBlock);
    init_rand << < gridSize, blockSize >> >(seed, state_wih);
    checkCudaError("assignRandomWeight: init_rand launch");
    generateRandomValues << < gridSize, blockSize >> >(state_wih, d_wih, d_who, nn->input_neurons, nn->hidden_neurons, nn->output_neurons);
    checkCudaError("assignRandomWeight: generateRandomValues launch");

    // Wait for both kernels so execution errors surface here and the states
    // are no longer in use when they are freed.
    cudaDeviceSynchronize();
    checkCudaError("assignRandomWeight: kernel execution");
    cudaFree(state_wih);
    checkCudaError("assignRandomWeight: cudaFree states");
}
and the correct version for the two kernels:
// Fills the weight matrices with uniform random values, one element per
// thread.
//   state_wih - device array of seeded cuRAND states; must hold a state for
//               every thread index that writes a weight
//   wih       - (inp + 1) * hid floats, or NULL to skip
//   who       - (hid + 1) * out floats, or NULL to skip
// Launch layout: 1D grid of 1D blocks covering at least
// max((inp + 1) * hid, (hid + 1) * out) threads.
__global__ void generateRandomValues( curandState_t *state_wih, float *wih, float *who, int inp, int hid, int out){
    const int tid = (blockIdx.x)*(blockDim.x) + threadIdx.x;
    const int numWih = (inp + 1) * hid;
    const int numWho = (hid + 1) * out;
    const bool writesWih = (wih != NULL) && (tid < numWih);
    const bool writesWho = (who != NULL) && (tid < numWho);
    // Strict bounds: index numWih / numWho is one past the end of its array.
    if (!writesWih && !writesWho) {
        return;
    }
    // Work on a local copy of the state and store it back once, so the
    // generator sequence advances consistently across calls.
    curandState_t local = state_wih[tid];
    if (writesWih) {
        wih[tid] = curand_uniform(&local);
    }
    if (writesWho) {
        who[tid] = curand_uniform(&local);
    }
    state_wih[tid] = local;
}
// Seeds one cuRAND state per thread: all threads share `seed` and each uses
// its global thread index as the sequence number, with offset 0.
// Every launched thread writes state_wih[tid]; there is no bounds check, so
// the array must hold at least gridDim.x * blockDim.x states.
__global__ void init_rand(unsigned int seed, curandState_t *state_wih){
    const int tid = threadIdx.x + blockIdx.x*blockDim.x;
    curandState_t *mine = state_wih + tid;
    curand_init(seed, tid, 0, mine);
}
I'm a beginner in CUDA programming. I'm trying out a simple program of my own, but it's not working and I don't know what else to do.
My code:
#include <mpi.h>
#include <cuda.h>
#include <stdio.h>
#include <sys/wait.h>
// Prototypes
__global__ void helloWorld(char*);
__device__ int getGlobalIdx_2D_2D();
// Host function
// Host driver: fills a buffer of N characters with 'c', copies it to the
// device, launches helloWorld over a 2D grid of 2D blocks, copies the buffer
// back and prints every character. No CUDA return code is checked.
int main(int argc, char** argv)
{
unsigned int i, N, gridX, gridY, blockX, blockY;
N = 4096000;
char *str = (char *) malloc(N*sizeof(char));
for(i=0; i < N; i++) str[i]='c';
MPI_Init (&argc, &argv);
char *d_str;
size_t size = (size_t) N*sizeof(char);
cudaMalloc((void**)&d_str, size);
cudaMemcpy(d_str, str, size, cudaMemcpyHostToDevice);
// 100 x 10 blocks of 64 x 64 threads = 4096000 threads, one per character
gridX = 100;
gridY = 10;
// NOTE(review): 64 x 64 is 4096 threads per block; the reply after this
// listing identifies that as an invalid launch configuration.
blockX = blockY = 64;
dim3 dimGrid(gridX, gridY); // 4096 chars per block
dim3 dimBlock(blockX, blockY); // one thread per character, 2D
printf("dimGrid(%d, %d)\t", gridX, gridY);
printf("dimBlock(%d, %d)\t", blockX, blockY);
helloWorld<<< dimGrid, dimBlock >>>(d_str);
// The copy back is issued before the explicit synchronize below
cudaMemcpy(str, d_str, size, cudaMemcpyDeviceToHost);
cudaThreadSynchronize();
MPI_Barrier (MPI_COMM_WORLD);
cudaFree(d_str);
printf("\nRes:\n");
for(i = 0; i < N; i++) printf("\t[%u] %c\n", i, str[i]);
MPI_Finalize ();
free(str);
return 0.0;
}
// Device kernel
// Each thread adjusts one character of str in place: characters at even
// global positions are decreased by 2, those at odd positions increased by 8.
// There is no bounds check, so the launch must supply exactly one thread per
// character.
__global__ void helloWorld(char* str)
{
    // Flat position of this thread within the whole 2D grid of 2D blocks
    const int pos = getGlobalIdx_2D_2D();
    const bool evenSlot = (pos % 2 == 0);
    if (evenSlot) {
        str[pos] -= 2;
    } else {
        str[pos] += 8;
    }
}
// Returns the flat global index of the calling thread for a 2D grid of 2D
// blocks: blocks are numbered row by row across the grid, and threads row by
// row inside each block.
__device__ int getGlobalIdx_2D_2D()
{
    const unsigned int threadsPerBlock = blockDim.x * blockDim.y;
    const int blockRank = blockIdx.y * gridDim.x + blockIdx.x;
    const unsigned int rankInBlock = threadIdx.y * blockDim.x + threadIdx.x;
    return blockRank * threadsPerBlock + rankInBlock;
}
My desired output is: jajajajajajaja... x4096000
I've read that '%' operation is not efficient, but I don't think is the problem there.
Thanks!
You are performing absolutely no CUDA error checking, it is really beneficial to do so. Once you enable it you can find that block dimensions 64 x 64 are invalid as it results into 4096 threads within one block, which is not a valid configuration.
Does anyone know what is the maximum supported size for cub::scan ? I got core dump for input sizes over 500 million. I wanted to make sure I'm not doing anything wrong...
Here is my code:
#define CUB_STDERR
#include <stdio.h>
#include "cub/util_allocator.cuh"
#include "cub/device/device_scan.cuh"
#include <sys/time.h>
using namespace cub;
bool g_verbose = false; // Whether to display input/output to console
CachingDeviceAllocator g_allocator(true); // Caching allocator for device memory
typedef int mytype;
/**
* Solve inclusive-scan problem
*/
// CPU reference for the inclusive prefix sum: h_cpu[k] = h_in[0] + ... + h_in[k]
// for k in [0, n). The running total is kept in mytype.
static void solve(mytype *h_in, mytype *h_cpu, int n)
{
    mytype running = 0;
    int k = 0;
    while (k < n) {
        running += h_in[k];
        h_cpu[k] = running;
        ++k;
    }
}
// Compares the first n elements of h_cpu and h_o.
// Returns 0 when they all match, otherwise the 1-based position of the first
// mismatch.
static int compare(mytype *h_cpu, mytype *h_o, int n)
{
    int idx = 0;
    // Advance past the matching prefix.
    while (idx < n && h_cpu[idx] == h_o[idx]) {
        ++idx;
    }
    return (idx < n) ? idx + 1 : 0;
}
/**
* Main
*/
// Benchmark driver: builds a host input of num_items integers, computes the
// CPU inclusive scan, runs cub::DeviceScan::InclusiveSum `repetitions` times
// on the GPU, and prints the elapsed time when the GPU result matches.
int main(int argc, char** argv)
{
cudaSetDevice(0);
struct timeval start, end;
int num_items = 1073741824;
const int repetitions = 5;
mytype *h_in, *h_out, *h_cpu;
// NOTE(review): the byte count is stored in an int; the reply after this
// listing identifies this line as the cause of the crash, because the product
// exceeds what an int can hold for this num_items.
const int size = num_items * sizeof(mytype);
// Allocate host arrays
h_in = (mytype *)malloc(size);
h_out = (mytype *)malloc(size);
h_cpu = (mytype *)malloc(size);
// Initialize problem and solution
for (int i = 0; i < num_items; i++) {
h_in[i] = i;
h_out[i] = 0;
h_cpu[i] = 0;
}
solve(h_in, h_cpu, num_items);
// Allocate problem device arrays
mytype *d_in = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(mytype) * num_items));
// Initialize device input
CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(mytype) * num_items, cudaMemcpyHostToDevice));
// Allocate device output array
mytype *d_out = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(mytype) * num_items));
// Allocate temporary storage
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
// First call with d_temp_storage == NULL; temp_storage_bytes is then used to
// size the scratch allocation on the next line
CubDebugExit(DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items));
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
// Run
gettimeofday(&start, NULL);
for (long i = 0; i < repetitions; i++)
DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
// Wait for the GPU before stopping the timer
cudaThreadSynchronize();
gettimeofday(&end, NULL);
// Elapsed wall-clock time in seconds for all repetitions
double ctime = end.tv_sec + end.tv_usec / 1000000.0 - start.tv_sec - start.tv_usec / 1000000.0;
cudaMemcpy(h_out, d_out, sizeof(mytype) * num_items, cudaMemcpyDeviceToHost);
int cmp = compare(h_cpu, h_out, num_items);
printf("%d\t", num_items);
// The time is printed only when the GPU result matches the CPU reference
if (!cmp)
printf("\t%7.4fs \n", ctime);
printf("\n");
// NOTE(review): these buffers come from malloc but are released with delete[].
if (h_in) delete[] h_in;
if (h_out) delete[] h_out;
if (h_cpu) delete[] h_cpu;
if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
printf("\n\n");
return 0;
}
The problem is here:
const int size = num_items * sizeof(mytype);
And it can be fixed by changing it to:
const size_t size = num_items * sizeof(mytype);
The value of num_items in the code is over 1 Billion. When we multiply that by sizeof(mytype) we are multiplying it by 4, so the result is over 4 Billion. This value cannot be stored in an int variable. If you try to use it anyway like that, then your subsequent host code will do bad things. This problem (the core dump) actually has nothing to do with CUDA. The code would core dump if you removed all the CUB elements.
When I modify the line of code above, and compile for the correct GPU (e.g. -arch=sm_35 in my case, or -arch=sm_52 for a Titan X GPU), then I get the correct answer (and no seg fault/core dump).
In general, the correct starting point when chasing a seg fault/core dump type error, is to recognize that this error arises from host code and you should attempt to localize the exact line of source code that is generating this error. This can be done trivially/tediously by putting many printf statements in your code, until you identify the line of your code after which you don't see any printf output, or by using a host code debugger, such as gdb on linux.
Also note that this code as written will require slightly more than 12GB of memory on the host, and slightly more than 8GB of memory on the GPU, so it will only run properly in such settings.
For reference, here is the fixed code (based on what OP posted here):
#define CUB_STDERR
#include <stdio.h>
#include "cub/util_allocator.cuh"
#include "cub/device/device_scan.cuh"
#include <sys/time.h>
using namespace cub;
bool g_verbose = false; // Whether to display input/output to console
CachingDeviceAllocator g_allocator(true); // Caching allocator for device memory
typedef int mytype;
/**
* Solve inclusive-scan problem
*/
// CPU reference for the inclusive prefix sum: h_cpu[k] = h_in[0] + ... + h_in[k]
// for k in [0, n). The running total is kept in mytype.
static void solve(mytype *h_in, mytype *h_cpu, int n)
{
    mytype running = 0;
    int k = 0;
    while (k < n) {
        running += h_in[k];
        h_cpu[k] = running;
        ++k;
    }
}
// Compares the first n elements of h_cpu and h_o.
// Returns 0 when they all match, otherwise the 1-based position of the first
// mismatch.
static int compare(mytype *h_cpu, mytype *h_o, int n)
{
    int idx = 0;
    // Advance past the matching prefix.
    while (idx < n && h_cpu[idx] == h_o[idx]) {
        ++idx;
    }
    return (idx < n) ? idx + 1 : 0;
}
/**
* Main
*/
// Benchmark driver: builds a host input of num_items integers, computes the
// CPU inclusive scan, runs cub::DeviceScan::InclusiveSum `repetitions` times
// on the GPU, and prints the elapsed time when the GPU result matches the CPU
// reference. Returns 1 if the host buffers cannot be allocated.
int main(int argc, char** argv)
{
    cudaSetDevice(0);
    struct timeval start, end;
    int num_items = 1073741824;
    const int repetitions = 5;
    mytype *h_in, *h_out, *h_cpu;
    // Byte count in size_t: num_items * 4 does not fit in an int.
    const size_t size = (size_t)num_items * sizeof(mytype);

    // Allocate host arrays
    h_in = (mytype *)malloc(size);
    h_out = (mytype *)malloc(size);
    h_cpu = (mytype *)malloc(size);
    if (!h_in || !h_out || !h_cpu) {
        fprintf(stderr, "Failed to allocate 3 x %lu bytes of host memory\n", (unsigned long)size);
        free(h_in);
        free(h_out);
        free(h_cpu);
        return 1;
    }

    // Initialize problem and solution
    for (int i = 0; i < num_items; i++) {
        h_in[i] = i;
        h_out[i] = 0;
        h_cpu[i] = 0;
    }
    solve(h_in, h_cpu, num_items);

    // Allocate problem device arrays
    mytype *d_in = NULL;
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, size));
    // Initialize device input
    CubDebugExit(cudaMemcpy(d_in, h_in, size, cudaMemcpyHostToDevice));
    // Allocate device output array
    mytype *d_out = NULL;
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, size));

    // Allocate temporary storage: the first call, made with a NULL buffer,
    // only reports the number of scratch bytes required.
    void *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    CubDebugExit(DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items));
    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));

    // Run
    gettimeofday(&start, NULL);
    for (long i = 0; i < repetitions; i++)
        CubDebugExit(DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items));
    // Wait for the GPU before stopping the timer
    CubDebugExit(cudaDeviceSynchronize());
    gettimeofday(&end, NULL);
    // Elapsed wall-clock time in seconds for all repetitions
    double ctime = end.tv_sec + end.tv_usec / 1000000.0 - start.tv_sec - start.tv_usec / 1000000.0;

    CubDebugExit(cudaMemcpy(h_out, d_out, size, cudaMemcpyDeviceToHost));
    int cmp = compare(h_cpu, h_out, num_items);
    printf("%d\t", num_items);
    if (!cmp)
        printf("\t%7.4fs \n", ctime);
    else
        fprintf(stderr, "GPU result differs from CPU reference at element %d\n", cmp - 1);
    printf("\n");

    // The host buffers came from malloc, so they are released with free.
    free(h_in);
    free(h_out);
    free(h_cpu);
    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
    printf("\n\n");
    return 0;
}
I have an array of numbers as {1,2,3,4,5,6,7,8,9,10} and I want to separate even and odd numbers as:
even = {2,4,6,8}
and:
odd = {1,3,5,7}
I am aware of atomic operations in CUDA, and also aware that the output is not expected to suffer from race conditions. I don't want to use atomic operations. How can I achieve this without using atomic keywords?
CODE:
#include <stdio.h>
#include <cuda.h>
// Kernel that executes on the CUDA device
// Attempts to split `total` into even and odd values, one thread per element.
// `even` and `odd` are expected to be zero-filled by the host; a zero in
// `even` is treated as a free slot.
// NOTE(review): N is not used and there is no bounds check on idx, so the
// launch must supply exactly one thread per element. Threads read and write
// the shared output slots without any synchronisation.
__global__ void square_array(float *total,float *even,float *odd, int N)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
// Float value truncated to int for the parity test
int a=total[idx];
if ((a%2)==0)
{
// Scan slots 0..idx for the first one that still reads as zero and claim it
for (int i=0;i<=idx;i++)
{
int b = even[i];
if(b==0)
{
even[i] = total[idx];
break;
}
}
}
else
{
// The break is unconditional, so at most the first iteration runs: every odd
// thread with idx > 0 writes odd[0], and the thread with idx == 0 writes nothing.
for (int i=0;i<idx;i++)
{
int c = odd[i]; // read but never used
odd[i] = total[idx];
break;
}
}
}
// main routine that executes on the host
// Host driver: uploads the values 1..N, runs square_array with a single block
// of 10 threads, then prints the input and both output arrays.
int main(void)
{
    const int N = 10; // Number of elements in arrays
    const size_t bytes = N * sizeof(float);

    // Host arrays
    float *total_h = (float *)malloc(bytes);
    float *even_h = (float *)malloc(bytes);
    float *odd_h = (float *)malloc(bytes);

    // Device arrays; both outputs start out zero-filled
    float *total_d, *even_d, *odd_d;
    cudaMalloc((void **) &total_d, bytes);
    cudaMalloc((void **) &even_d, bytes);
    cudaMemset(even_d, 0, bytes);
    cudaMalloc((void **) &odd_d, bytes);
    cudaMemset(odd_d, 0, bytes);

    // Fill the host input with 1..N and copy it to the device
    int k = 0;
    while (k < N) {
        total_h[k] = (float)k + 1;
        ++k;
    }
    cudaMemcpy(total_d, total_h, bytes, cudaMemcpyHostToDevice);

    // Do calculation on device:
    square_array <<< 1,10 >>> (total_d,even_d,odd_d, N);

    // Retrieve both result arrays from the device
    cudaMemcpy(even_h, even_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
    cudaMemcpy(odd_h, odd_d, sizeof(float)*N, cudaMemcpyDeviceToHost);

    // Print results
    printf("total Numbers\n");
    for (k = 0; k < N; ++k) printf("%f\n",total_h[k]);
    printf("EVEN Numbers\n");
    for (k = 0; k < N; ++k) printf("%f\n",even_h[k]);
    printf("ODD Numbers\n");
    for (k = 0; k < N; ++k) printf("%f\n",odd_h[k]);

    // Cleanup
    free(total_h);
    free(even_h);
    free(odd_h);
    cudaFree(total_d);
    cudaFree(even_d);
    cudaFree(odd_d);
}
OUTPUT:
As suggested by Jared Hoberock, it would be much easier to use the efficient partitioning algorithm available in CUDA Thrust than to develop a partitioning routine of your own. Below, please find a complete worked example.
#include <cstdio>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/host_vector.h>
#include <thrust/pair.h>
#include <thrust/partition.h>
// Predicate usable on host and device: true when x is divisible by 2.
// operator() is const so the functor can also be invoked through a const
// object or const reference.
struct is_even { __host__ __device__ bool operator()(const int &x) const { return (x % 2) == 0; } };
// Splits the integers 0..N-1 into even and odd values on the device with
// thrust::partition_copy and prints both groups.
int main() {
    const int N = 10;
    thrust::host_vector<int> h_data(N);
    for (int i=0; i<N; i++) h_data[i] = i;
    thrust::device_vector<int> d_data(h_data);

    // Either output may receive up to N elements, so size both for the worst
    // case instead of assuming an exact N/2 split.
    thrust::device_vector<int> d_evens(N);
    thrust::device_vector<int> d_odds(N);

    // partition_copy returns the ends of the two output ranges; the distance
    // from each begin() is the number of elements actually written.
    typedef thrust::device_vector<int>::iterator DeviceIter;
    thrust::pair<DeviceIter, DeviceIter> ends =
        thrust::partition_copy(d_data.begin(), d_data.end(), d_evens.begin(), d_odds.begin(), is_even());
    const int numEvens = (int)(ends.first - d_evens.begin());
    const int numOdds = (int)(ends.second - d_odds.begin());

    printf("Even numbers\n");
    for (int i=0; i<numEvens; i++) {
        int val = d_evens[i];  // element-wise device-to-host read
        printf("evens[%i] = %i\n",i,val);
    }
    printf("Odd numbers\n");
    for (int i=0; i<numOdds; i++) {
        int val = d_odds[i];  // element-wise device-to-host read
        printf("odds[%i] = %i\n",i,val);
    }
    return 0;
}