Error at line 50: invalid operands to binary % (have 'int' and 'int *')

I get this error at line 50. I don't know what to use instead of (*p).
I am learning how to use pointers and am trying to pass arguments to a function by reference.
I've been staring at it for some time now.
# include "stdio.h"
int odd (int (*), int );
main(){
int i,n;
int size;
int main(){
int v[i];
int *p;
p = &v[0];
printf("Write the quantity of integers you want to ingress");
scanf("%d",&size);
for(i=0;i<size;i++){
printf("write a number");
scanf("%d",&n);
v[i]= n;
p = &v[i];
odd(&v[i],size);
printf("The value number %d is: %d \n",i,*p);
}
return 0;
}
int odd(int *p,int siz){
int i;
int counter = 0;
for(i=0;i<siz;i++){
/*50*/ if(*p % 2 = 0){ }
else counter++ ;
return counter;
}
}

You are confusing assignment (=) with testing for equality (==). Change:
if(*p % 2 = 0)
to:
if(*p % 2 == 0)
Also, your prototype for odd is wrong. Change:
int odd (int (*), int );
to:
int odd (int *, int );
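Beyond those two fixes, note that the posted odd returns from inside the loop and never moves past the first element. As a sketch only (an assumption about the intent, counting how many of the siz values are odd), a corrected version could look like this:

int odd(int *p, int siz){
    int i;
    int counter = 0;
    for(i = 0; i < siz; i++){
        if(p[i] % 2 == 0){ }   /* even value: nothing to count */
        else counter++;        /* odd value: count it */
    }
    return counter;            /* return only after the whole array was scanned */
}

Called once after the input loop as odd(v, size), it would return how many of the entered numbers are odd; in the question it is called once per iteration with a single element, which also needs rethinking.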


Reverse engineering history pattern length in branch predictor

I'm trying to find the length of the history pattern in the branch predictor of my computer's processor. I generate a variable-length array of bits and branch on the value of each bit. I then plot the run time of different executions of the function and search for a knee in the graph, but I don't see any such point. What am I doing wrong? Any ideas?
Here is my code:
#include <vector>
#include <fstream>
#include <string>
#include <cstdlib>
#include <ctime>
using namespace std;

vector<int> randomArr(int n)
{
    vector<int> arr(n);
    for (int i = 0; i < n; i++) {
        arr[i] = rand() % 2;
    }
    return arr;
}

int branchy(vector<int> &arr)
{
    int a = 0;
    int b = 0;
    for (int i = 0; i < arr.size(); i++) {
        if (arr[i] == 0)
            a++;
        else
            b++;
    }
    return a ^ b;
}

int main()
{
    long int iterations = 100000;
    int start_s;
    int stop_s;
    ofstream runtimesFile;
    runtimesFile.open("runtimesFile.txt");

    for (int j = 0; j < iterations; j++) {
        vector<int> arr = randomArr(j);
        start_s = clock();
        branchy(arr);
        stop_s = clock();
        runtimesFile << to_string(stop_s - start_s) << "\n";
    }

    runtimesFile.close();
    return 0;
}

Rearranging an array in CUDA

I have the following problem that I want to implement in CUDA:
I want to read an array (say flag[20]) and, based on a certain condition, write the indices of this array to another array (say pindex[]).
A simple implementation in C would be:
int N = 20;
int flag[N];
int pindex[N];

for(int i = 0; i < N; i++)
    flag[i] = -1;
for(int i = 0; i < N; i += 2)
    flag[i] = 0;
for(int i = 0; i < N; i++)
    pindex[i] = 0;

//operation: count # of times flag != -1 and write those indices in a different array
int pcount1 = 0;
for(int i = 0; i < N; i++)
{
    if(flag[i] != -1)
    {
        pindex[pcount1] = i;
        ++pcount1;
    }
}
How would I implement this in CUDA?
I can use atomicAdd() to count the total number of times my condition is satisfied, but how do I write the indices into a different array? For example, I tried the following:
__global__ void kernel_tryatomic(int N, int *pcount, int *flag, int *pindex)
{
    int tId = threadIdx.x;
    int n = (blockIdx.x*2 + blockIdx.y)*BlockSize + tId;
    if(n > N-1) return;

    if(flag[n] != -1)
    {
        atomicAdd(pcount, 1);
        atomicExch(&pindex[*pcount], n);
        //pindex[*pcount] = n;
    }
}
This code calculates pcount correctly, but does not update the pindex array.
I need help doing this operation on GPUs.
Thanks
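As an aside on the kernel above (a sketch, not the prefix-sum answer that follows): atomicAdd returns the value the counter held before the increment, so each thread can use that return value as its own private write slot; the indices then land in no particular order.

__global__ void kernel_tryatomic_fixed(int N, int *pcount, int *flag, int *pindex)
{
    int tId = threadIdx.x;
    int n = (blockIdx.x*2 + blockIdx.y)*BlockSize + tId;
    if(n > N-1) return;

    if(flag[n] != -1)
    {
        // atomicAdd returns the old value of *pcount, which is a
        // unique slot for this thread to write its index into.
        int slot = atomicAdd(pcount, 1);
        pindex[slot] = n;
    }
}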
Since your condition (flag) is conceptually binary, you can use a binary prefix sum (thoroughly explained here) to determine where each thread with a positive flag should write.
For example, if N is 20, with the help of the __device__ functions below:
__device__ int lanemask_lt(int lane) {
    return (1 << lane) - 1;
}

__device__ int warp_prefix_sum(int lane, int p) {
    const int mask = lanemask_lt(lane);
    int b = __ballot(p);
    return __popc(b & mask);
}
your __global__ function can simply be written like below:
__global__ void kernel_scan(int N, int *pcount, int *flag, int *pindex)
{
    int tId = threadIdx.x;
    if(tId >= N)
        return;

    int threadFlag = (flag[tId] == -1) ? 0 : 1;
    int position_to_write = warp_prefix_sum(tId & (warpSize-1), threadFlag);

    if(threadFlag)
        pindex[position_to_write] = tId;
}
If N is bigger than the warp size (32), you can use an intra-block binary prefix sum, which is explained in the provided link.
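As an illustration only, and under assumptions that go beyond the answer above (a single block whose blockDim.x is a multiple of warpSize and at least N, using the same pre-Volta __ballot/__popc intrinsics), a block-wide version could combine per-warp ranks with a small scan of per-warp totals in shared memory:

__global__ void kernel_block_scan(int N, int *pcount, int *flag, int *pindex)
{
    __shared__ int warpOffsets[32];            // one slot per warp of the block
    int tId  = threadIdx.x;
    int lane = tId & (warpSize - 1);
    int wid  = tId / warpSize;

    int threadFlag = (tId < N && flag[tId] != -1) ? 1 : 0;
    unsigned int ballot = __ballot(threadFlag);
    int lanePos   = __popc(ballot & ((1u << lane) - 1));  // rank within this warp
    int warpCount = __popc(ballot);                       // matches in this warp

    if (lane == 0) warpOffsets[wid] = warpCount;
    __syncthreads();

    // One thread turns the warp counts into exclusive offsets
    // (a serial scan is fine for at most 32 warps per block).
    if (tId == 0) {
        int running = 0;
        for (int w = 0; w < blockDim.x / warpSize; ++w) {
            int c = warpOffsets[w];
            warpOffsets[w] = running;
            running += c;
        }
        *pcount = running;                                 // total number of matches
    }
    __syncthreads();

    if (threadFlag)
        pindex[warpOffsets[wid] + lanePos] = tId;
}

For a multi-block grid, each block's total would in turn need a global offset, for example obtained with an atomicAdd on pcount.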

Memory Checker detected 1 access violations. error = access violation on load (global memory)

When I debug my code, this error is shown when the cursor reaches:
scoreMatrix[0] = dev_matrix[a]+similarityScore(dev_strA[a-1],dev_strB[b-1]);
and it is shown repeatedly for scoreMatrix[1], scoreMatrix[2], and scoreMatrix[3].
__global__ void kernel_ScoreMatrix(char *dev_strA, char *dev_strB, int *dev_matrix, int *dev_array, int *array_length)
{
    int x = blockIdx.x;
    int y = blockIdx.y;
    int m = COLUMNS*y + x;

    for (int i = 0; i < *(array_length); i++)
        if (m == dev_array[i]) {
            int a = COLUMNS*(y-1) + (x-1);
            int b = COLUMNS*(y-1) + (x);
            int c = COLUMNS*(y)   + (x-1);

            int scoreMatrix[4];
            scoreMatrix[0] = dev_matrix[a] + similarityScore(dev_strA[a-1], dev_strB[b-1]);
            scoreMatrix[1] = dev_matrix[b] + GAP;
            scoreMatrix[2] = dev_matrix[c] + GAP;
            scoreMatrix[3] = 0;
            dev_matrix[m] = findMax(scoreMatrix, 4);
        }
}
The value of the variable a equals zero, so the statement dev_strA[a-1] reads dev_strA[-1], an out-of-bounds access that causes the access violation.
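As a sketch of one way to avoid the out-of-bounds read (an assumption, since the correct neighbour indexing depends on how the score matrix and the strings are laid out), the kernel could skip cells whose neighbour indices fall out of range:

// Sketch only: inside the if (m == dev_array[i]) branch, before filling scoreMatrix.
// Skip cells whose neighbour indices would fall outside the matrix or the strings.
if (a - 1 < 0 || b - 1 < 0 || c < 0)
    continue;

More likely, though, the string lookups are meant to use the cell coordinates (something like dev_strA[x-1] and dev_strB[y-1]) rather than the flattened indices a-1 and b-1; that is a guess about the intent, not a confirmed fix.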

Implementing the exponential function with basic arithmetic operations

For the purpose of the exercise, I have to implement the exponential function with the most basic arithmetic operations. I came up with this, where x is the base and y the exponent:
function expAetB() {
    product = 1;
    for (i = 0; i < y; i++) {
        product = product * x;
    }
    return product;
};
However, product=product*x; is still not a basic enough operation. I should be able to replace it with another for loop that does the multiplication and passes back the result, but I can't find a way to do it without falling into an infinite loop.
In the same way that exponentiation is repeated multiplication, so multiplication is simply repeated addition.
Simply create another function mulAetB which does that for you, and watch out for things like negative inputs.
You could go even one more level and define adding in terms of increment and decrement, but that may be overkill.
See, for example, the following program which uses the overkill method of addition:
#include <stdio.h>

static unsigned int add (unsigned int a, unsigned int b) {
    unsigned int result = a;
    while (b-- != 0) result++;
    return result;
}

static unsigned int mul (unsigned int a, unsigned int b) {
    unsigned int result = 0;
    while (b-- != 0) result = add (result, a);
    return result;
}

static unsigned int pwr (unsigned int a, unsigned int b) {
    unsigned int result = 1;
    while (b-- != 0) result = mul (result, a);
    return result;
}

int main (void) {
    int test[] = {0,5, 1,9, 2,4, 3,5, 7,2, -1}, *ip = test;

    while (*ip != -1) {
        printf ("%d + %d = %3d\n" , *ip, *(ip+1), add (*ip, *(ip+1)));
        printf ("%d x %d = %3d\n" , *ip, *(ip+1), mul (*ip, *(ip+1)));
        printf ("%d ^ %d = %3d\n\n", *ip, *(ip+1), pwr (*ip, *(ip+1)));
        ip += 2;
    }
    return 0;
}
The output of this program shows that the calculations are correct:
0 + 5 = 5
0 x 5 = 0
0 ^ 5 = 0
1 + 9 = 10
1 x 9 = 9
1 ^ 9 = 1
2 + 4 = 6
2 x 4 = 8
2 ^ 4 = 16
3 + 5 = 8
3 x 5 = 15
3 ^ 5 = 243
7 + 2 = 9
7 x 2 = 14
7 ^ 2 = 49
If you really must have it in a single function, it's a simple matter of refactoring the function call to be inline:
static unsigned int pwr (unsigned int a, unsigned int b) {
    unsigned int xres, xa, result = 1;

    // Catch common cases, simplifies rest of function (a>1, b>0)
    if (b == 0) return 1;
    if (a == 0) return 0;
    if (a == 1) return 1;

    // Do power as repeated multiplication.
    result = a;
    while (--b != 0) {
        // Do multiplication as repeated addition.
        xres = result;
        xa = a;
        while (--xa != 0)
            result = result + xres;
    }
    return result;
}

Get statistics for a list of numbers using GPU

I have several lists of numbers in a file. For example,
.333, .324, .123 , .543, .00054
.2243, .333, .53343 , .4434
Now, I want to get the number of times each number occurs using the GPU. I believe this will be faster on the GPU than on the CPU because each thread can process one list. What data structure should I use on the GPU to easily get the above counts? For example, for the above, the answer would look as follows:
.333 = 2 times in entire file
.324 = 1 time
etc..
I am looking for a general solution, not one that works only on devices with a specific compute capability.
Here is the kernel suggested by Pavan, written out to check whether I have implemented it efficiently:
int uniqueEle = newend.valiter - d_A;
int* count;
cudaMalloc((void**)&count, uniqueEle * sizeof(int)); // stores the count of each unique element
int TPB = 256;
int blocks = uniqueEle + TPB -1 / TPB;
//Cast d_I to raw pointer called d_rawI
launch<<<blocks,TPB>>>(d_rawI, count, uniqueEle);

__global__ void launch(int *i, int* count, int n){
    int id = blockDim.x * blockIdx.x + threadIdx.x;
    __shared__ int indexes[256];

    if(id < n){
        indexes[threadIdx.x] = i[id];
        //as occurs between two blocks
        if(id % 255 == 0){
            count[indexes] = i[id+1] - i[id];
        }
    }
    __syncthreads();

    if(id < ele - 1){
        if(threadIdx.x < 255)
            count[id] = indexes[threadIdx.x+1] - indexes[threadIdx.x];
    }
}
Question: how do I modify this kernel so that it handles arrays of arbitrary size, i.e. handles the case where the total number of threads is less than the number of elements?
Here is how I would do it in Matlab:
A = [.333, .324, .123, .543, .00054, .2243, .333, .53343, .4434];
[values, locations] = unique(A);  % Find unique values and their locations
counts = diff([0, locations]);    % Find the count based on their locations
There is no easy way to do this in plain CUDA, but you can use existing libraries to do it.
1) Thrust
It is shipped with the CUDA toolkit as of CUDA 4.0.
The Matlab code can be roughly translated into Thrust using the following functions. I am not too proficient with Thrust, so take this only as an idea of which routines to look at.
float _A[] = {.333, .324, .123, .543, .00054, .2243, .333, .53343, .4434};
int _I[]   = {0, 1, 2, 3, 4, 5, 6, 7, 8};
int num = 9;

// Values vector (the constructor copies the host data to the device)
thrust::device_vector<float> d_A(_A, _A + num);
// Need to sort to get the same values together
thrust::stable_sort(d_A.begin(), d_A.end());
// Vector containing 0 to num-1
thrust::device_vector<int> d_I(_I, _I + num);
// Output vectors for the unique values and their first locations
thrust::device_vector<float> d_Values(num);
thrust::device_vector<int>   d_Locations(num);
// Find unique elements and the index of their first occurrence
typedef thrust::device_vector<float>::iterator valiter;
typedef thrust::device_vector<int>::iterator   idxiter;
thrust::pair<valiter, idxiter> new_end;
new_end = thrust::unique_by_key_copy(d_A.begin(), d_A.end(), d_I.begin(),
                                     d_Values.begin(), d_Locations.begin());
You now have the locations of the first instance of each unique value. You can now launch a kernel to find the differences between adjacent elements from 0 to new_end in d_Locations. Subtract the final value from num to get the count for the final location.
EDIT (Adding code that was provided over chat)
Here is how the difference kernel can be done:
#define MAX_BLOCKS 65535
#define roundup(A, B) (((A) + (B) - 1) / (B))

int uniqueEle = newend.valiter - d_A;
int* count;
cudaMalloc((void**)&count, uniqueEle * sizeof(int));

int TPB = 256;
int num_blocks = roundup(uniqueEle, TPB);
int blocks_y = roundup(num_blocks, MAX_BLOCKS);
int blocks_x = roundup(num_blocks, blocks_y);
dim3 blocks(blocks_x, blocks_y);

kernel<<<blocks,TPB>>>(d_rawI, count, uniqueEle);

__global__ void kernel(int *i, int* count, int n)
{
    int tx = threadIdx.x;
    int bid = blockIdx.y * gridDim.x + blockIdx.x;
    int id = blockDim.x * bid + tx;

    __shared__ int indexes[256];
    if (id < n) indexes[tx] = i[id];
    __syncthreads();

    if (id < n - 1) {
        if (tx < 255) count[id] = indexes[tx + 1] - indexes[tx];
        else count[id] = i[id + 1] - indexes[tx];
    }
    if (id == n - 1) count[id] = n - indexes[tx];
    return;
}
2) ArrayFire
This is an easy-to-use, free, array-based library.
You can do the following in ArrayFire.
using namespace af;

float h_A[] = {.333, .324, .123, .543, .00054, .2243, .333, .53343, .4434};
int num = 9;

// Transfer data to device
array A(9, 1, h_A);

array values, locations, original;
// Find the unique values and locations
setunique(values, locations, original, A);

// Locations are 0 based, add 1.
// Add *num* at the end to find the count of the last value.
array counts = diff1(join(locations + 1, num));
Disclosure: I work for AccelerEyes, which develops this software.
To answer the latest addendum to this question: the diff kernel which would complete the Thrust method proposed by Pavan could look something like this:
template<int blcksz>
__global__ void diffkernel(const int *i, int* count, const int n) {
    int id = blockDim.x * blockIdx.x + threadIdx.x;
    int strd = blockDim.x * gridDim.x;
    int nmax = blcksz * ((n/blcksz) + ((n%blcksz>0) ? 1 : 0));
    __shared__ int indices[blcksz+1];

    for(; id < nmax; id += strd) {
        // Data load
        indices[threadIdx.x] = (id < n) ? i[id] : n;
        if (threadIdx.x == (blcksz-1))
            indices[blcksz] = ((id+1) < n) ? i[id+1] : n;
        __syncthreads();

        // Differencing calculation
        int diff = indices[threadIdx.x+1] - indices[threadIdx.x];

        // Store
        if (id < n) count[id] = diff;
        __syncthreads();
    }
}
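For completeness, a hedged sketch of how such a kernel might be launched for the uniqueEle locations computed earlier (the grid size and the name d_locations_raw are assumptions carried over from the snippets above, not part of the original answer):

// Sketch only: launch with a block size equal to the template parameter.
// d_locations_raw is assumed to be a raw device pointer to d_Locations,
// e.g. thrust::raw_pointer_cast(d_Locations.data()).
const int TPB = 256;
int nblocks = (uniqueEle + TPB - 1) / TPB;
if (nblocks > 65535) nblocks = 65535;   // the grid-stride loop covers the rest
diffkernel<TPB><<<nblocks, TPB>>>(d_locations_raw, count, uniqueEle);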
Here is a solution:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

__global__ void counter(float* a, int* b, int N)
{
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    if(idx < N)
    {
        float my = a[idx];
        int count = 0;
        for(int i = 0; i < N; i++)
        {
            if(my == a[i])
                count++;
        }
        b[idx] = count;
    }
}

int main()
{
    int threads = 9;
    int blocks = 1;
    int N = blocks*threads;

    float* h_a;
    int* h_b;
    float* d_a;
    int* d_b;

    h_a = (float*)malloc(N*sizeof(float));
    h_b = (int*)malloc(N*sizeof(int));
    cudaMalloc((void**)&d_a, N*sizeof(float));
    cudaMalloc((void**)&d_b, N*sizeof(int));

    h_a[0] = .333f;
    h_a[1] = .324f;
    h_a[2] = .123f;
    h_a[3] = .543f;
    h_a[4] = .00054f;
    h_a[5] = .2243f;
    h_a[6] = .333f;
    h_a[7] = .53343f;
    h_a[8] = .4434f;

    cudaMemcpy(d_a, h_a, N*sizeof(float), cudaMemcpyHostToDevice);
    counter<<<blocks,threads>>>(d_a, d_b, N);
    cudaMemcpy(h_b, d_b, N*sizeof(int), cudaMemcpyDeviceToHost);

    for(int i = 0; i < N; i++)
    {
        printf("%f = %d times\n", h_a[i], h_b[i]);
    }

    cudaFree(d_a);
    cudaFree(d_b);
    free(h_a);
    free(h_b);
    getchar();
    return 0;
}