I'm trying to write a program that prompts the user to enter a positive integer N and prints the set of all binary strings of length N. For example, if the user enters 3, the program prints:
0 0 0
0 0 1
0 1 0
0 1 1
1 0 0
1 0 1
1 1 0
1 1 1
I have been trying this for a while but I was unable to do it. If someone can help me with this I will appreciate it!! Thanks
The problem is a typical backtrack problem in algorithm, I made the example code in C language here, you can have a reference.
#include <stdio.h>
#include <stdlib.h>
void getAllBi(int depth, int A[], int n) {
if(depth >= n) {
int i;
for(i = 0; i < n; ++i) {
printf("%d ", A[i]);
}
printf("\n");
return;
}
A[depth] = 0;
getAllBi(depth+1, A, n);
A[depth] = 1;
getAllBi(depth+1, A, n);
}
int main() {
int n;
scanf("%d", &n);
int *A = (int*)malloc(n * sizeof(int));
getAllBi(0, A, n);
return 0;
}
Related
I'm trying to use the a cuda graph to join a bunch of kernels which use __constant__ parameters to set values that won't change. A small test case:
#include <iostream>
#include <vector>
template<typename T>
__constant__ T scalar[1];
__constant__ size_t size_a[1];
template<typename T>
__global__ void set_buffer(T *buffer) {
const size_t index = blockIdx.x*blockDim.x + threadIdx.x;
buffer[min(index, size_a[0])] = scalar<T>[0];
}
int main(int argc, const char * argv[]) {
cudaGraph_t graph;
cudaGraphCreate(&graph, 0);
cudaStream_t stream;
cudaStreamCreate(&stream);
const double src = 10.0;
cudaGraphNode_t node_a;
cudaGraphAddMemcpyNodeToSymbol(&node_a, graph, NULL, 0,
scalar<double>, &src, sizeof(double),
0, cudaMemcpyHostToDevice);
const size_t length = 100;
cudaGraphNode_t node_b;
cudaGraphAddMemcpyNodeToSymbol(&node_b, graph, NULL, 0,
size_a, &length, sizeof(size_t),
0, cudaMemcpyHostToDevice);
double *result;
cudaMalloc(&result, length*sizeof(double));
cudaKernelNodeParams params;
params.func = (void *) set_buffer<double>;
params.gridDim = dim3(1, 1, 1);
params.blockDim = dim3(512, 1, 1);
params.sharedMemBytes = 0;
params.kernelParams = {reinterpret_cast<void **> (&result)};
params.extra = NULL;
cudaGraphNode_t node_c;
std::vector<cudaGraphNode_t> deps = {node_a, node_b};
cudaGraphAddKernelNode(&node_c, graph, deps.data(), deps.size(),
¶ms);
cudaGraphExec_t exec;
cudaGraphInstantiate(&exec, graph, NULL, NULL, 0);
cudaGraphLaunch(exec, stream);
cudaStreamSynchronize(stream);
std::vector<double> host_result(length);
cudaMemcpy(host_result.data(),
result, length*sizeof(double),
cudaMemcpyDeviceToHost);
for (size_t i = 0; i < length; i++) {
std::cout << i << " " << host_result[i] << std::endl;
}
}
When I run this, it doesn't look like it cudaGraphAddMemcpyNodeToSymbol is doing anything. Because when I run it, it prints out.
0 0
1 0
2 0
3 0
4 0
5 0
6 0
7 0
8 0
9 0
10 0
...
90 0
91 0
92 0
93 0
94 0
95 0
96 0
97 0
98 0
99 0
Why isn't the kernel changing values in my buffer?
When I run your code I get a seg fault. Whether you get a seg fault or not isn't really the issue.
The problem is here:
params.kernelParams = {reinterpret_cast<void **> (&result)};
That is not correct. A correct method to do it is as follows:
void *kernelargs[1] = {(void *)&result};
params.kernelParams = kernelargs;
You have an off-by-one error also:
buffer[min(index, size_a[0])] = scalar<T>[0];
^^^^^^^^^^^^^^^^^^^^^
The value you are putting into size_a[0] is 100, so any thread whose index is 100 or greater will attempt to index into buffer at element whose index is 100, which is out of range for your allocation.
I am writing a simple example program to test memCpy and kernel run concurrency for a larger program. While writing this example, I stumbled upon error 77, aka cudaErrorIllegalAddress.
I read somewhere that that comes from the kernel accessing an invalid address, and not the memcpy itself. So I tried to index the lowest element of my input array (0). The error remained.
As it only is a small sample program, I will provide the whole code;
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#define BLOCKS 32
#define THREADS 16
__global__ void kernel(double *d_in, double *d_out) {
int index = threadIdx.x + blockDim.x * blockIdx.x;
d_out[index] = d_in[index] + 5;
}
int main() {
const int GPU_N = 2;
const int data_size = 2048;
const int cycles = 2;
double *h_in, *h_out, *d_in, *d_out;
h_in = (double*)malloc(sizeof(double) * data_size);
h_out = (double*)malloc(sizeof(double) * data_size);
for (int i = 0; i < data_size; i++) {
h_in[i] = 21;
}
cudaError_t error;
printf("1\n");
for (int i = 0; i < cycles; i++) {
//cuMalloc
for (int j = 0; j < GPU_N; j++) {
cudaSetDevice(j);
cudaMalloc((void**)&d_in, sizeof(double) * data_size / 4);
cudaMalloc((void**)&d_out, sizeof(double) * data_size / 4);
printf("2\n");
}
for (int j = 0; j < GPU_N; j++) {
cudaSetDevice(j);
cudaMemcpyAsync(d_in, h_in, sizeof(double) * data_size / 4, cudaMemcpyHostToDevice);
printf("3\n");
}
for (int j = 0; j < GPU_N; j++) {
cudaSetDevice(j);
kernel<<< BLOCKS, THREADS, 0, 0 >>>(d_in, d_out);
error = cudaGetLastError();
printf("4\n");
}
for (int j = 0; j < GPU_N; j++) {
cudaSetDevice(j);
error = cudaMemcpyAsync(h_out, d_out, sizeof(double) * data_size / 4, cudaMemcpyDeviceToHost);
printf("D2H %i\n", error);
printf("5\n");
}
for (int j = 0; j < GPU_N; j++) {
cudaSetDevice(j);
cudaFree(d_in);
cudaFree(d_out);
printf("6\n");
}
}
for (int i = 0; i < data_size; i++) {
printf("%i\n", h_out[i]);
}
getchar();
}
So the output should be something like:
1
1
2
2
3
3
4
4
5
5
6
6
1
1
2
2
3
3
4
4
5
5
6
6
26
26
26
26
26
.....
and then a spam of the result. It does so until the time it has to print 5, then it outputs the error 77. Also, the output of the result is not 26 as expected, but -842150451
There are several problems with this code.
As already pointed out in the comments, the printf format specifier here (%i) is wrong:
printf("%i\n", h_out[i]);
the quantity being printed is a double quantity, an appropriate format specifier would be %f.
This code will not work (for GPU_N greater than 1):
for (int j = 0; j < GPU_N; j++) {
cudaSetDevice(j);
cudaMalloc((void**)&d_in, sizeof(double) * data_size / 4);
cudaMalloc((void**)&d_out, sizeof(double) * data_size / 4);
printf("2\n");
}
d_in and d_out are individual variables. You don't get to somehow reuse them in this way. When this loop goes through it's 2nd (or later) iteration, it will overwrite the pointer values that were previously assigned. Later on this will result in code trouble, because for at least one of your kernel launches, you will be passing pointers to data that is not resident on that particular GPU (and this particular aspect of the problem is the proximal reason for the error 77 report.)
One solution would be to provide arrays of pointers to make this work.
Some of the CUDA activity you are issuing in your loops may be asynchronous. Therefore, to be sure that your final printout of h_out shows expected results, you should wait for all work on the GPU to be finished. One way to accomplish this is with another set of calls to cudaDeviceSynchronize(). (I don't wish to argue about whether cudaFree is asynchronous or not. I think this item is a sensible suggestion and noteworthy. If you feel you can skip this item, do what you wish. For learning purposes, I think it is important to point this out.) For the reasons indicated in comments below, this item is not necessary/mandatory to get expected results for this particular code. This answer isn't intended to be a complete treatise on asynchronous work issuance; for that I suggest further study of any of the relevant questions here on the cuda tag, and/or study of relevant CUDA sample codes.
Here's a modified code that has the above issues addressed (I have shortened the final print-out loop):
$ cat t1477.cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#define BLOCKS 32
#define THREADS 16
__global__ void kernel(double *d_in, double *d_out) {
int index = threadIdx.x + blockDim.x * blockIdx.x;
d_out[index] = d_in[index] + 5;
}
int main() {
const int GPU_N = 2;
const int data_size = 2048;
const int cycles = 2;
double *h_in, *h_out, *d_in[GPU_N], *d_out[GPU_N];
h_in = (double*)malloc(sizeof(double) * data_size);
h_out = (double*)malloc(sizeof(double) * data_size);
for (int i = 0; i < data_size; i++) {
h_in[i] = 21;
}
cudaError_t error;
printf("1\n");
for (int i = 0; i < cycles; i++) {
//cuMalloc
for (int j = 0; j < GPU_N; j++) {
cudaSetDevice(j);
cudaMalloc((void**)(&(d_in[j])), sizeof(double) * data_size / 4);
cudaMalloc((void**)(&(d_out[j])), sizeof(double) * data_size / 4);
printf("2\n");
}
for (int j = 0; j < GPU_N; j++) {
cudaSetDevice(j);
cudaMemcpyAsync(d_in[j], h_in, sizeof(double) * data_size / 4, cudaMemcpyHostToDevice);
printf("3\n");
}
for (int j = 0; j < GPU_N; j++) {
cudaSetDevice(j);
kernel<<< BLOCKS, THREADS, 0, 0 >>>(d_in[j], d_out[j]);
error = cudaGetLastError();
printf("4\n");
}
for (int j = 0; j < GPU_N; j++) {
cudaSetDevice(j);
error = cudaMemcpyAsync(h_out, d_out[j], sizeof(double) * data_size / 4, cudaMemcpyDeviceToHost);
printf("D2H %i\n", error);
printf("5\n");
}
for (int j = 0; j < GPU_N; j++) {
cudaSetDevice(j);
cudaFree(d_in[j]);
cudaFree(d_out[j]);
printf("6\n");
}
}
for (int i = 0; i < GPU_N; i++){
cudaSetDevice(i);
cudaDeviceSynchronize();}
for (int i = 0; i < 10; i++) {
printf("%f\n", h_out[i]);
}
}
$ nvcc -o t1477 t1477.cu
$ cuda-memcheck ./t1477
========= CUDA-MEMCHECK
1
2
2
3
3
4
4
D2H 0
5
D2H 0
5
6
6
2
2
3
3
4
4
D2H 0
5
D2H 0
5
6
6
26.000000
26.000000
26.000000
26.000000
26.000000
26.000000
26.000000
26.000000
26.000000
26.000000
========= ERROR SUMMARY: 0 errors
$
I want to write a prefix scan for large arrays using the instruction in GPUgem, It's a homework for my parallel class. I did follow all the steps in the book but still my code's not working. I got it to work for array size 4096 but it's not working for larger arrays. Here is my code :
#include <stdio.h>
#include <sys/time.h>
#define THREADS 1024
typedef int mytype;
__global__ void phaseI(mytype *g_odata, mytype *g_idata, int n, mytype *aux)
{
__shared__ mytype temp[THREADS];
const int tid1 = threadIdx.x;
int offset = 1;
temp[2*tid1] = g_idata[2*tid1]; // load input into shared memory
temp[2*tid1+1] = g_idata[2*tid1+1];
for (int d = THREADS>>1; d > 0; d >>= 1) // build sum in place up the tree
{
__syncthreads();
if (tid1 < d)
{
int ai = offset*(2*tid1+1)-1;
int bi = offset*(2*tid1+2)-1;
temp[bi] += temp[ai];
}
offset *= 2;
}
__syncthreads();
if (tid1 == 0) {
aux[blockIdx.x] = temp[THREADS - 1];
temp[THREADS - 1] = 0;
}
for (int d = 1; d < THREADS; d *= 2) // traverse down tree & build scan
{
offset >>= 1;
__syncthreads();
if (tid1 < d)
{
int ai = offset*(2*tid1+1)-1;
int bi = offset*(2*tid1+2)-1;
mytype t = temp[ai];
temp[ai] = temp[bi];
temp[bi] += t;
}
}
__syncthreads();
g_odata[2*thid] = temp[2*thid]; // write results to device memory
g_odata[2*thid+1] = temp[2*thid+1];
}
__global__ void phaseII(mytype *g_odata, mytype *aux, int n)
{
const int tid1 = threadIdx.x;
const int B = (n / THREADS);
int offset = 1;
for (int d = B>>1; d > 0; d >>= 1) // build sum in place up the tree
{
__syncthreads();
if (tid1 < d)
{
int ai = offset*(2*tid1+1)-1;
int bi = offset*(2*tid1+2)-1;
temp[bi] += temp[ai];
}
offset *= 2;
}
__syncthreads();
if (tid1 == 0 && blockIdx.x == 0) {
aux[B - 1] = 0;
}
for (int d = 1; d < B; d *= 2) // traverse down tree & build scan
{
offset >>= 1;
__syncthreads();
if (tid1 < d)
{
int ai = offset*(2*tid1+1)-1;
int bi = offset*(2*tid1+2)-1;
mytype t = temp[ai];
temp[ai] = temp[bi];
temp[bi] += t;
}
}
__syncthreads();
g_odata[2*thid] += aux[blockIdx.x];
g_odata[2*thid+1] += aux[blockIdx.x];
}
int main(int argc, char *argv[])
{
if (argc != 2) {
printf("usage: %s n\n", argv[0]);
return -1;
}
const int n = atoi(argv[1]);
mytype *h_i, *d_i, *h_o, *d_o, *d_temp;
const int size = n * sizeof(mytype);
h_i = (mytype *)malloc(size);
h_o = (mytype *)malloc(size);
if ((h_i == NULL) || (h_o == NULL)) {
printf("malloc failed\n");
return -1;
}
for (int i = 0; i < n; i++) {
h_i[i] = i;
h_o[i] = 0;
}
cudaMalloc(&d_i, size);
cudaMalloc(&d_temp, (n / THREADS) );
cudaMalloc(&d_o, size);
cudaMemset(d_o, 0, size);
cudaMemset(d_temp, 0, (n / THREADS));
cudaMemcpy(d_i, h_i, size, cudaMemcpyHostToDevice);
int blocks = n / THREADS;
phaseI<<<blocks, THREADS / 2 >>>(d_o, d_i, n, d_temp);
phaseII<<<blocks, THREADS / 2>>>(d_o, d_temp, n);
cudaThreadSynchronize();
cudaMemcpy(h_o, d_o, size, cudaMemcpyDeviceToHost);
printf("\n");
for (int i = 0; i < n ; i++) {
printf(" %d", h_o[i]);
}
printf("\n\n");
return 0;
}
Does anyone have any idea what I'm doing wrong?
One possible error I see in your code is here:
aux[thid] = temp[THREADS];
If your temp array is temp[1024], as you say, and each block has 1024 threads, as you say, then if THREADS is 1024, temp[THREADS] will access your shared memory array out-of-bounds (one past the end.) An array of 1024 elements only has valid indices from 0 to 1023.
Beyond that, it seems like you're asking how to take the last element out of a shared memory array (temp) and place it in a position in a (presumably global) aux array, which has one element for each block.
Here's a fully worked example:
$ cat t831.cu
#include <stdio.h>
#define THREADS 1024
#define BLOCKS 20
__global__ void kernel(int *aux){
__shared__ int temp[THREADS];
temp[threadIdx.x] = threadIdx.x + blockIdx.x;
__syncthreads();
if (threadIdx.x == 0)
aux[blockIdx.x] = temp[THREADS-1];
}
int main(){
int *h_data, *d_data;
const int dsize = BLOCKS*sizeof(int);
h_data=(int *)malloc(dsize);
cudaMalloc(&d_data, dsize);
memset(h_data, 0, dsize);
cudaMemset(d_data, 0, dsize);
kernel<<<BLOCKS, THREADS>>>(d_data);
cudaMemcpy(h_data, d_data, dsize, cudaMemcpyDeviceToHost);
for (int i = 0; i < BLOCKS; i++) printf("%d, ", h_data[i]);
printf("\n");
return 0;
}
$ nvcc -o t831 t831.cu
$ cuda-memcheck ./t831
========= CUDA-MEMCHECK
1023, 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039, 1040, 1041, 1042,
========= ERROR SUMMARY: 0 errors
$
I'm working on the problem summing the rows of a matrix in CUDA. I'm giving the following example.
Suppose to have the following 20 * 4 array:
1 2 3 4
4 1 2 3
3 4 1 2
.
1 2 3 4
.
.
.
.
.
.
.
.
2 1 3 4
After flattened the 2d array to a 1d array (either in row-major or column-major order), I need to assign each thread to a different row and calculate the cost for that row.
For example
- thread 1 should calculate the cost for 1 2 3 4
- thread 2 should calculate the cost for 4 1 2 3
How can I that in CUDA?
Thank you all for the reply
#include <stdio.h>
#include <stdlib.h>
#define MROWS 20
#define NCOLS 4
#define nTPB 256
__global__ void mykernel(int *costdata, int rows, int cols, int *results){
int tidx = threadIdx.x + blockDim.x*blockIdx.x;
if (tidx < rows){
int mycost = 0;
for (int i = 0; i < cols; i++)
mycost += costdata[(tidx*cols)+i];
results[tidx] = mycost;
}
}
int main(){
//define and initialize host and device storage for cost and results
int *d_costdata, *h_costdata, *d_results, *h_results;
h_results = (int *)malloc(MROWS*sizeof(int));
h_costdata = (int *)malloc(MROWS*NCOLS*sizeof(int));
for (int i=0; i<(MROWS*NCOLS); i++)
h_costdata[i] = rand()%4;
cudaMalloc((void **)&d_results, MROWS*sizeof(int));
cudaMalloc((void **)&d_costdata, MROWS*NCOLS*sizeof(int));
//copy cost data from host to device
cudaMemcpy(d_costdata, h_costdata, MROWS*NCOLS*sizeof(int), cudaMemcpyHostToDevice);
mykernel<<<(MROWS + nTPB - 1)/nTPB, nTPB>>>(d_costdata, MROWS, NCOLS, d_results);
// copy results back from device to host
cudaMemcpy(h_results, d_results, MROWS*sizeof(int), cudaMemcpyDeviceToHost);
for (int i=0; i<MROWS; i++){
int loc_cost = 0;
for (int j=0; j<NCOLS; j++) loc_cost += h_costdata[(i*NCOLS)+j];
printf("cost[%d]: host= %d, device = %d\n", i, loc_cost, h_results[i]);
}
}
This assumes "cost" of each row is just the sum of the elements in each row. If you have a different "cost" function, you can modify the activity in the kernel for-loop accordingly. This also assumes C-style row-major data storage (1 2 3 4 4 1 2 3 3 4 1 2 etc.)
If you instead use column-major storage (1 4 3 etc.), you can slightly improve the performance, since the data reads can be fully coalesced. Then your kernel code could look like this:
for (int i = 0; i < cols; i++)
mycost += costdata[(i*rows)+tidx];
You should also use proper cuda error checking on all CUDA API calls and kernel calls.
EDIT: As discussed in the comments below, for the row-major storage case, in some situations it might give an increase in memory efficiency by electing to load 16-byte quantities rather than the base type. Following is a modified version that implements this idea for arbitrary dimensions and (more or less) arbitrary base types:
#include <iostream>
#include <typeinfo>
#include <cstdlib>
#include <vector_types.h>
#define MROWS 1742
#define NCOLS 801
#define nTPB 256
typedef double mytype;
__host__ int sizetype(){
int size = 0;
if ((typeid(mytype) == typeid(float)) || (typeid(mytype) == typeid(int)) || (typeid(mytype) == typeid(unsigned int)))
size = 4;
else if (typeid(mytype) == typeid(double))
size = 8;
else if ((typeid(mytype) == typeid(unsigned char)) || (typeid(mytype) == typeid(char)))
size = 1;
return size;
}
template<typename T>
__global__ void mykernel(const T *costdata, int rows, int cols, T *results, int size, size_t pitch){
int chunk = 16/size; // assumes size is a factor of 16
int tidx = threadIdx.x + blockDim.x*blockIdx.x;
if (tidx < rows){
T *myrowptr = (T *)(((unsigned char *)costdata) + tidx*pitch);
T mycost = (T)0;
int count = 0;
while (count < cols){
if ((cols-count)>=chunk){
// read 16 bytes
int4 temp = *((int4 *)(myrowptr + count));
int bcount = 16;
int j = 0;
while (bcount > 0){
mycost += *(((T *)(&temp)) + j++);
bcount -= size;
count++;}
}
else {
// read one quantity at a time
for (; count < cols; count++)
mycost += myrowptr[count];
}
results[tidx] = mycost;
}
}
}
int main(){
int typesize = sizetype();
if (typesize == 0) {std::cout << "invalid type selected" << std::endl; return 1;}
//define and initialize host and device storage for cost and results
mytype *d_costdata, *h_costdata, *d_results, *h_results;
h_results = (mytype *)malloc(MROWS*sizeof(mytype));
h_costdata = (mytype *)malloc(MROWS*NCOLS*sizeof(mytype));
for (int i=0; i<(MROWS*NCOLS); i++)
h_costdata[i] = (mytype)(rand()%4);
size_t pitch = 0;
cudaMalloc((void **)&d_results, MROWS*sizeof(mytype));
cudaMallocPitch((void **)&d_costdata, &pitch, NCOLS*sizeof(mytype), MROWS);
//copy cost data from host to device
cudaMemcpy2D(d_costdata, pitch, h_costdata, NCOLS*sizeof(mytype), NCOLS*sizeof(mytype), MROWS, cudaMemcpyHostToDevice);
mykernel<<<(MROWS + nTPB - 1)/nTPB, nTPB>>>(d_costdata, MROWS, NCOLS, d_results, typesize, pitch);
// copy results back from device to host
cudaMemcpy(h_results, d_results, MROWS*sizeof(mytype), cudaMemcpyDeviceToHost);
for (int i=0; i<MROWS; i++){
mytype loc_cost = (mytype)0;
for (int j=0; j<NCOLS; j++) loc_cost += h_costdata[(i*NCOLS)+j];
if ((i < 10) && (typesize > 1))
std::cout <<"cost[" << i << "]: host= " << loc_cost << ", device = " << h_results[i] << std::endl;
if (loc_cost != h_results[i]){ std::cout << "mismatch at index" << i << "should be:" << loc_cost << "was:" << h_results[i] << std::endl; return 1; }
}
std::cout << "Results are correct!" << std::endl;
}
I'm trying to scan a simple array using CUDA but it seems there is something wrong with the code below..I am trying to find what i am doing wrong but i can't.Can anyone please help me?
#include <stdio.h>
#include <stdlib.h>
__global__ void prescan(int *g_odata, int *g_idata, int n){
extern __shared__ int temp[];// allocated on invocation
int thid = threadIdx.x;
int offset = 1;
temp[2*thid] = g_idata[2*thid]; // load input into shared memory
temp[2*thid+1] = g_idata[2*thid+1];
for (int d = n>>1; d > 0; d >>= 1){ // build sum in place up the tree
__syncthreads();
if (thid < d){
int ai = offset*(2*thid+1)-1;
int bi = offset*(2*thid+2)-1;
temp[bi] += temp[ai];
}
offset *= 2;
}
if (thid == 0) { temp[n - 1] = 0; } // clear the last element
for (int d = 1; d < n; d *= 2){ // traverse down tree & build scan
offset >>= 1;
__syncthreads();
if (thid < d){
int ai = offset*(2*thid+1)-1;
int bi = offset*(2*thid+2)-1;
int t = temp[ai];
temp[ai] = temp[bi];
temp[bi] += t;
}
}
__syncthreads();
g_odata[2*thid] = temp[2*thid]; // write results to device memory
g_odata[2*thid+1] = temp[2*thid+1];
}
int main(int argc, char *argv[]){
int i;
int *input = 0;
int *output = 0;
int *g_idata = 0;
int *g_odata = 0;
int numblocks = 1;
int radix = 16;
input = (int*)malloc(numblocks*radix*sizeof(int));
output = (int*)malloc(numblocks*radix*sizeof(int));
cudaMalloc((void**)&g_idata, numblocks*radix*sizeof(int));
cudaMalloc((void**)&g_odata, numblocks*radix*sizeof(int));
for(i=0; i<numblocks*radix; i++){
input[i] = 1 + 2*i;
}
for(i=0; i<numblocks*radix; i++){
printf("%d ", input[i]);
}
cudaMemcpy(g_idata, input, numblocks*radix*sizeof(int), cudaMemcpyHostToDevice);
prescan<<<1,8>>>(g_odata, g_idata, numblocks*radix);
cudaThreadSynchronize();
cudaMemcpy(output, g_odata, numblocks*radix*sizeof(int), cudaMemcpyDeviceToHost);
for(i=0; i<numblocks*radix; i++){
printf("%d ", output[i]);
}
free(input);
free(output);
cudaFree(g_idata);
cudaFree(g_odata);
return 0;
}
The output is this: 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.I want to have this output: 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 0 1 4 9 16 25 36 49 64 81 100 121 144 169 196 225
Just go through this code to implement scan in parallel environment.
The algorithm which I implemented here is Hillis Steele exclusive scan.I implemented algorithm through shared memory, it will definitely improve the execution time for the large data set.
#include<stdio.h>
#include<math.h>
__global__ void scan(int *d_in,int *d_out,int n)
{
extern __shared__ int sdata[];
int i;
int tid = threadIdx.x;
sdata[tid] = d_in[tid];
for (i = 1; i <n; i <<= 1)
{
if (tid>=i)
{
sdata[tid] +=sdata[tid-i];
}
__syncthreads();
}
d_out[tid] = sdata[tid];
__syncthreads();
}
int main()
{
int h_in[16],h_out[16];
int i,j;
for (i = 0; i < 16; i++)
h_in[i] = 2*i+1;
for (i = 0; i < 16; i++)
printf("%d ", h_in[i]);
int *d_in;
int *d_out;
cudaMalloc((void**)&d_in, sizeof(int)* 16);
cudaMalloc((void**)&d_out, sizeof(int)* 16);
cudaMemcpy(d_in, h_in, sizeof(int) * 16, cudaMemcpyHostToDevice);
scan <<<1, 16, sizeof(int)*16 >>>(d_in,d_out, 16);
cudaMemcpy(h_out, d_out, sizeof(int) * 16, cudaMemcpyDeviceToHost);
for (i = 0; i < 16; i++)
printf("%d ", h_out[i]);
return 0;
}