Can I parallelize my program? - cuda

This is my program:
program test
implicit none
integer n,m,k,i,j,Errorflag
real :: Yabs(39,39),angle(39,39)
real ,dimension(67,1) :: deltaA,A
real :: V(1,39),d(1,39),v1(29,1),d1(38,1),Ps(1,38),Qs(1,39),Jac(67,67),invJac(67,67)
real :: B1(1,38),B2(1,29),MF(1,67),trnsMF(67,1),P0(1,39),Q0(1,39)
real, dimension(38,38) :: dia1,offdia1,J1
real, dimension(29,29) :: dia2,dia3,dia4,offdia4,J4
real,dimension(38,29) ::offdia2,J2
real,dimension(29,38) ::offdia3,J3
real p,p1,q,q1
n=39;m=9
MF(1,1)=10
open(unit=3,file="ybus.dat",status="old")
open(unit=4,file="angle.dat",status="old")
do i=1,39
read(3,*) Yabs(i,1:39)
read(4,*)angle(i,1:39)
end do
close(3)
close(4)
open(unit=5,file="activepower.dat",status="old")
open(unit=8,file="reactivepower.dat",status="old")
read(5,*)Ps(1,1:38)
read(8,*)Qs(1,1:29)
close(5)
close(8)
do i=1,67
deltaA(i,1)=0
end do
v1(1:29,1)=1
d1(1:38,1)=0
A(1:38,1)=d1(1:38,1)
A(39:67,1)=v1(1:29,1)
!call cpu_time(t1)
do while(maxval(abs(MF))>0.0001)
V(1,1)=0.982
V(1,2:30)=v1(1:29,1)
V(1,31)=1.03
V(1,32)=0.9831
V(1,33)=1.0123
V(1,34)=0.9972
V(1,35)=1.0493
V(1,36)=1.0635
V(1,37)=1.0278
V(1,38)=1.0265
V(1,39)=1.0475
d(1,1)=0
d(1,2:39)=d1(1:38,1)
! % % % %------Active Power Calculation-----%
p1=0;p=0
do i=2,n
do j=1,n
p1=(V(i)*V(j)*Yabs(i,j)*cos(angle(i,j)-d(i)+d(j)))
p=p1+p
end do
P0(i-1)=p
p=0
end do
! % % % %------Reactive Power Calculation-----%
p=0;p1=0
do i=2,(n-m)
do j=1,n
p1=-(V(i)*V(j)*Yabs(i,j)*sin(angle(i,j)-d(i)+d(j)))
p=p1+p
end do
Q0(i-1)=p
p=0
end do
!!!!!!!!!!!mismatch factor
do i=1,(n-1)
B1(i)=Ps(i)-P0(i)
end do
do i=1,(n-m-1)
B2(i)=Qs(i)-Q0(i)
end do
MF(1,1:38)=B1(1,1:38)
MF(1,39:67)=B2(1,1:29)
!!!!!!!!jacobian calculation for predictor step
!!!!!!!!!!!!!!!!!!!!!!dia of j1
p=0;p1=0
do i=2,n
do j=1,n
if(j .ne. i)then
p1=V(i)*V(j)*Yabs(i,j)*sin(angle(i,j)-d(i)+d(j))
!print*,p1
p=p1+p
end if
end do
i=i-1
dia1(i,i)=p
p=0
i=i+1
end do
!!!!!!!!!!!!!!off dia. of j1
q=0;q1=0;
do k=2,n
i=k
do j=2,n
if(j .ne. i)then
q1=V(i)*V(j)*Yabs(i,j)*sin(angle(i,j)-d(i)+d(j))
end if
i=i-1;j=j-1
offdia1(i,j)=-q1
q1=0
i=i+1;j=j+1
end do
end do
do i=1,38
do j=1,38
J1(i,j)=offdia1(i,j)+dia1(i,j)
end do
end do
!!!!!!!!!!!!!!!!!!!dia. of j2
p=0;p1=0
do i=2,(n-m)
do j=1,n
if(j .ne. i)then
p1=V(j)*Yabs(i,j)*cos(angle(i,j)-d(i)+d(j))
p=p1+p
end if
end do
dia2(i-1,i-1)=p+(2*V(i)*Yabs(i,i)*cos(angle(i,i)))
p=0;
end do
!!!!!!!!!!!!!!!!!!off dia. of j2
p1=0;
do k=2,n
i=k
do j=2,(n-m)
if(j .ne. i)then
p1=V(i)*Yabs(i,j)*cos(angle(i,j)-d(i)+d(j));
end if
i=i-1;j=j-1
offdia2(i,j)=p1
p1=0;
i=i+1;j=j+1
end do
end do
do i=1,(n-m-1)
offdia2(i,i)=dia2(i,i)
end do
J2=offdia2
!!!!!!!!!!!!!!!!!!!!dia. of j3
p=0;p1=0
do i=2,(n-m)
do j=1,n
if(j .ne. i)then
p1=V(i)*V(j)*Yabs(i,j)*cos(angle(i,j)-d(i)+d(j))
p=p1+p;
end if
end do
i=i-1;
dia3(i,i)=p
p=0;
i=i+1;
end do
!!!!!!!!!!!!!!off dia of j3
p=0;p1=0
do k=2,(n-m)
i=k;
do j=2,n
if(j .ne. i)then
p1=V(i)*V(j)*Yabs(i,j)*cos(angle(i,j)-d(i)+d(j))
end if
i=i-1;j=j-1
offdia3(i,j)=-p1;
p1=0;
i=i+1;j=j+1
end do
end do
do i=1,(n-m-1)
offdia3(i,i)=dia3(i,i)
end do
J3=offdia3
!!!!!!!!!!dia of j4
p=0;p1=0
do i=2,(n-m)
do j=1,n
if(j .ne. i)then
p1=V(j)*Yabs(i,j)*sin(angle(i,j)-d(i)+d(j))
p=p1+p
end if
end do
dia4(i-1,i-1)=-(2*V(i)*Yabs(i,i)*sin(angle(i,i)))-p
p=0;p1=0
end do
!!!!!!!!!!!!!!!off dia of j4
p1=0;p=0
do k=2,(n-m)
i=k;
do j=2,(n-m)
if(j .ne. i)then
p1=V(i)*Yabs(i,j)*sin(angle(i,j)-d(i)+d(j))
end if
i=i-1;j=j-1
offdia4(i,j)=-p1
p1=0;
i=i+1;j=j+1
end do
end do
do i=1,(n-m-1)
offdia4(i,i)=dia4(i,i);
end do
J4=offdia4
!!!!!!!
!!!!!!!!!!!!!!!!!!!formation of final jacobian!!!!!!!!!!
Jac( 1:38, 1:38) = J1 (1:38,1:38)
Jac( 1:38,39:67) = J2 (1:38,1:29)
Jac(39:67, 1:38) = J3 (1:29,1:38)
Jac(39:67,39:67) = J4 (1:29,1:29)
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!print*,Jac(23,21)
CALL FindInv(Jac,invJac ,67, ErrorFlag)
trnsMF=transpose(MF)
deltaA=matmul( invJac, trnsMF)
do i=1,67
A(i)=A(i)+deltaA(i)
end do
!!!!!!!!!!!!updating values
do i=1,(n-1)
d1(i)=A(i)
end do
k=0
do i=n,(2*n-2-m)
k=1+k
v1(k)=A(i)
end do
end do
end program test
The array "Ps" contains some values. Now if I increase value of Ps(15) by Ps(15)+1 so for both values can I parallelize this code to get answer quickly.
I am using PGI compiler for CUDA FORTRAN.

Your code is fairly straightforward with lots of independent parallel loops. These parallel loops appear to be wrapped in an outer convergence do while loop, so as long as you keep the data on the device for all iterations of the convergence loop, you won't be bottlenecked by transfers.
I would recommend starting with compiler directives for this code rather than diving into CUDA Fortran. Compiler directives work well for simple independent loops like these -- they are hints, placed in code comments, that tell the compiler which loops to parallelize, which data to copy, and so on.
You can first try OpenMP to accelerate across multiple CPU cores. Then you can move to GPU directives such as OpenACC, which is going to be available soon in compilers from PGI, Cray, and CAPS. To get a head start, you could download a free trial of the PGI compiler and use their "Accelerator" directives; Accelerator is very similar in syntax to OpenACC.
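For example, a rough OpenMP sketch of your "dia of j1" loop (not tested; the i=i-1 / i=i+1 juggling is replaced by indexing dia1(i-1,i-1) directly so each outer iteration is independent, and V and d are indexed with their leading dimension since they are declared (1,39)):
!$omp parallel do private(j,p,p1)
do i=2,n
   p=0
   do j=1,n
      if(j .ne. i)then
         p1=V(1,i)*V(1,j)*Yabs(i,j)*sin(angle(i,j)-d(1,i)+d(1,j))
         p=p1+p
      end if
   end do
   dia1(i-1,i-1)=p
end do
!$omp end parallel do
Compile with -mp on the PGI compiler to enable OpenMP; the same restructured loop is then a good candidate for an !$acc region with the Accelerator/OpenACC directives, provided dia1, V, d, Yabs and angle stay on the device across the do while iterations.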

Yes, you can use the PGI compiler to write CUDA kernels and make CUDA API calls.
PGI Fortran CUDA Homepage
The question I think you mean to ask is "Should I parallelize this code?"
My answer would be that yes, at a glance, you could see some mild benefit from parallelization.
For example, segments like:
do i=2,n
do j=1,n
if(j .ne. i)then
p1=V(i)*V(j)*Yabs(i,j)*sin(angle(i,j)-d(i)+d(j))
!print*,p1
p=p1+p
end if
end do
i=i-1
dia1(i,i)=p
p=0
i=i+1
end do
These are an N^2 set of independent calculations (in this case you set n=39, but I assume it could change), so you're dealing with at least a couple hundred calculations. Ideally you'd want even MORE work to parallelize over, but you're in good shape in that many of your loops appear to be doing identical, independent work at each step -- ideal for a threaded application.
You could therefore see some mild benefit from writing CUDA kernels to replace these looping code segments. Beware, though: the latency of memory transfers across the PCI bus eats into the performance gains, particularly for small systems.
So I would say yes, by all means try this if you're game, but don't expect it to be 100x faster -- more like 2-10x if you code it well, depending on your loop bounds and the level of divergence within the particular loops.
Worst case you see no gain, or even a slowdown, but at least you've learned something!
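If you do go the CUDA Fortran route, a kernel for that same loop could look roughly like the sketch below. The module and kernel names and the *_d device arrays are mine, not from your program, so treat it as a starting point rather than a drop-in replacement; it assigns one thread per row i and lets each thread do the inner j sum.
module jacobian_kernels
   use cudafor
contains
   attributes(global) subroutine dia1_kernel(V_d,Yabs_d,angle_d,d_d,dia1_d,n)
      integer, value :: n
      real, device :: V_d(n), Yabs_d(n,n), angle_d(n,n), d_d(n), dia1_d(n-1,n-1)
      integer :: i, j
      real :: p
      i = (blockIdx%x-1)*blockDim%x + threadIdx%x + 1   ! one thread per row, rows 2..n
      if (i >= 2 .and. i <= n) then
         p = 0.0
         do j = 1, n
            if (j /= i) p = p + V_d(i)*V_d(j)*Yabs_d(i,j)*sin(angle_d(i,j)-d_d(i)+d_d(j))
         end do
         dia1_d(i-1,i-1) = p
      end if
   end subroutine dia1_kernel
end module jacobian_kernels
You would allocate device copies of the arrays once (real, device, allocatable on the host side), launch with something like call dia1_kernel<<<(n+63)/64,64>>>(V_d,Yabs_d,angle_d,d_d,dia1_d,n), and keep everything resident on the GPU for all iterations of the convergence loop to avoid the PCI transfer cost mentioned above.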

Related

How to use acc_set_cuda_stream(streamId, stream)?

I created a CUDA stream in this way:
integer(kind=cuda_stream_kind) :: stream1
istat = cudaStreamCreate(stream1)
to use it for the plan of a cufft:
err_dir = err_dir + cufftPlan2D(plan_dir1,NY,NY,CUFFT_D2Z)
err_dir = err_dir + cufftSetStream(plan_dir1,stream1)
In the routine that executes the cufft, I pass plan_dir1 and I have
subroutine new_fft_dir(z,plan)
!$acc host_data use_device(z)
ierr = ierr + cufftExecD2Z(plan,z,z)
!$acc end host_data
!$acc parallel loop collapse(2) present(z)
do i=1,NXP2
do j=1,NY
z(i,j) = z(i,j)/NY**2
enddo
enddo
!$acc end parallel loop
I would like to set an OpenACC stream equal to the CUDA stream stream1, but using :
integer(kind=cuda_stream_kind) :: stream1
istat = cudaStreamCreate(stream1)
integer :: stream
istat = cudaStreamCreate(stream1)
acc_set_cuda_stream(stream,stream1)
I get NVFORTRAN-S-0034-Syntax error at or near end of line (main.f90: 48)
My goal is to add the async clause to
!$acc parallel loop collapse(2) present(z) async(stream)
do i=1,NXP2
do j=1,NY
z(i,j) = z(i,j)/NY**2
enddo
enddo
!$acc end parallel loop
to have this loop and the fft on the same CUDA stream.
Could the problem be that I use integer(kind=cuda_stream_kind) instead of cudaStream_t stream?
"acc_set_cuda_stream" is a subroutine so you do need to add "call " before it. Also, variables need to be declared before executable code, hence "integer :: stream" needs to be moved up a line.
use cudafor
use openacc
integer(kind=cuda_stream_kind) :: stream1
integer :: stream
istat = cudaStreamCreate(stream1)
call acc_set_cuda_stream(stream,stream1)
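The only other piece needed is to give "stream" a value (it is just an async queue id; the choice is arbitrary) before mapping it to the CUDA stream. A sketch along those lines, reusing the loop from the question:
stream = 1                                  ! any async queue id
call acc_set_cuda_stream(stream,stream1)    ! map that queue onto the CUDA stream
!$acc parallel loop collapse(2) present(z) async(stream)
do i=1,NXP2
   do j=1,NY
      z(i,j) = z(i,j)/NY**2
   enddo
enddo
!$acc end parallel loop
!$acc wait(stream)
Since the cufft plan was already tied to stream1 with cufftSetStream, the FFT and this loop now run on the same CUDA stream.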

How to reinitialize a Fortran function at each time step

I have a program as follows:
Integer N,tmax
parameter (N=10,tmax=10)
real :: L,dt,noise,positions(tmax,N),positions0(N),s,force
integer :: param
force=0
L=100.0
dt=0.1
param=0
r=0
!initialization
do 1 i1=1,N
param=param+100
positions(1,i1)=(ran2(param)-0.5)*L
1 continue
positions0=positions(1,:)
!time dependent process
do 2 i2=1,tmax
do 3 i3=1,N
param=param+100
force=0
noise=0
call routine_interaction(i3,N,positions0,force)
noise=(ran2(param)-0.5)*L/10
positions(i2,i3)=positions0(i3)+dt*(noise+force)
3 continue
positions0=positions(i2,:)
2 continue
print*, positions(1,:)
print*, positions(tmax,:)
end
with the subroutine and function:
FUNCTION ran2(idum)
INTEGER idum
!The ran2() function from Numerical Recipes
! (C) Copr. 1986-92 Numerical Recipes Software, page 272
END
subroutine routine_interaction(ii,N,positions0,force)
integer N,ii
real :: r,s,force
real positions0(N)
do 4 i4=1,N
if (ii.ne.i4) then
r=abs(positions0(ii)-positions0(i4))
s=sign(1.0,positions0(i4)-positions0(ii))
force=force-(1/r**4-1/r**2)*s
end if
4 continue
return
end
As you can see, the subroutine routine_interaction is called at each time step, so each time step involves two nested loops and NxN = N² calculations.
Is there a way in Fortran to define, at each time step, a function such as new_function(ii) = routine_interaction(ii,N,positions0,force) that would then be called in do loop 3? That would lead to N+N calculations per time step.
If you want to use this algorithm, there is no way to reduce the force calculation (routine_interaction) to O(N) time. You have N elements in positions0, and you are calculating the distance r between each element and each other element (r=abs(positions(ii)-positions0(i4))). This requires at least N*(N-1)/2 calculations.
You can somewhat reduce the number of calculations using a triangular loop, e.g. as
program p
implicit none
integer, parameter :: N=10, tmax=10
real :: L,dt,noise,positions(tmax,N),positions0(N),force(N)
integer :: param
integer :: i1,i2,i3
real, external :: ran2
! routine_interaction now returns an array, so the caller needs an explicit interface for it:
interface
   function routine_interaction(N,positions0) result (force)
      integer, intent(in) :: N
      real, intent(in) :: positions0(N)
      real :: force(N)
   end function
end interface
L=100.0
dt=0.1
param=0
!initialization
do i1=1,N
param=param+100
positions(1,i1)=(ran2(param)-0.5)*L
enddo
positions0=positions(1,:)
!time dependent process
do i2=1,tmax
force = routine_interaction(N,positions0)
do i3=1,N
param=param+100
noise=(ran2(param)-0.5)*L/10
positions(i2,i3)=positions0(i3)+dt*(noise+force(i3))
enddo
positions0=positions(i2,:)
enddo
print*, positions(1,:)
print*, positions(tmax,:)
end program
function routine_interaction(N,positions0) result (force)
integer, intent(in) :: N
real, intent(in) :: positions0(N)
real :: r,s,force(N)
integer :: i3,i4
force = 0
do i3=1,N
do i4=1,i3-1
r=abs(positions0(i3)-positions0(i4))
s=sign(1.0,positions0(i4)-positions0(i3))
force(i3)=force(i3)-(1/r**4-1/r**2)*s
force(i4)=force(i4)+(1/r**4-1/r**2)*s
enddo
enddo
end function

Xilinx VHDL latch warning troubleshooting

Xilinx is inferring a latch for VHDL code I've written. I've looked up the possible causes and found that this is often due to incomplete if or case statements. I've gone through and made sure to include else and when others branches, but I'm still receiving the warning. I believe this is also affecting another project I'm working on, so I'd like to understand why this happens.
library IEEE;
use IEEE.STD_LOGIC_1164.ALL;
entity state_machine is
port(trig, en: in std_logic; cstate,nstate: out std_logic_vector(0 to 2));
end state_machine;
architecture Behavioral of state_machine is
signal cstate_s,nstate_s: std_logic_vector(0 to 2);
begin
cstate <= cstate_s;
nstate <= nstate_s;
process(en, cstate_s)
begin
if en = '1' then
nstate_s <= "111";
if cstate_s = "111" then
nstate_s <= "011";
elsif cstate_s = "011" then
nstate_s <= "100";
elsif cstate_s = "100" then
nstate_s <= "101";
elsif cstate_s = "101" then
nstate_s <= "110";
elsif cstate_s = "110" then
nstate_s <= "111";
else
null;
end if;
else
null;
end if;
end process;
process(trig, nstate_s)
begin
if rising_edge(trig) then
cstate_s <= nstate_s;
else
null;
end if;
end process;
end Behavioral;
WARNING:Xst:737 - Found 3-bit latch for signal . Latches may
be generated from incomplete case or if statements. We do not
recommend the use of latches in FPGA/CPLD designs, as they may lead to
timing problems.
For no latches to be synthesised from a combinational process, every output of the process must be assigned along every path between begin and end process. This is called complete assignment. An output of the process is any signal assigned anywhere within it.
You have paths that violate this: whenever a path containing one of your null statements is taken, the output of your first process (nstate_s) is not assigned, so latches will be synthesised. There is no point in a bare null statement here. If you genuinely don't care what value your outputs take on those paths, assign them '-', which means "don't care" in VHDL.
By the way (assuming trig is a clock), your second process is not combinational but sequential, so it does not need to obey complete assignment; its else branch is unnecessary.

Reduction in OpenACC

Here is a Fortran subroutine for matrix-vector multiply. It is probably old-fashioned and inefficient in a number of ways, but right now I am just trying to get it to work with OpenACC directives, and I'm trying to figure out how reduction works:
subroutine matrmult(matrix,invec,outvec,n)
integer:: n
real*8, intent(in):: matrix(n,n), invec(n)
real*8, intent(out) :: outvec(n)
real*8 :: tmpmat(n,n)
real*8 :: tmpsclr
integer :: i,j,k
!$acc declare create(matrix, invec, outvec, tmpmat)
outvec = 0.d0
!$acc update device(matrix, invec, tmpmat, outvec)
!$acc parallel
!$acc loop gang
do j=1,n
!$acc loop vector
do i=1,n
tmpmat(i,j) = matrix(i,j)*invec(j)
enddo
enddo
!$acc loop vector reduction(+:tmpsclr)
do j=1,n
tmpsclr = 0.d0
do i=1,n
tmpsclr = tmpsclr+tmpmat(j,i)
enddo
outvec(j) = tmpsclr
enddo
!$acc end parallel
!$acc update host(outvec)
end subroutine
This code actually gives correct results. But when I try a gang/vector combination on the last loops, like so:
!$acc loop gang reduction(+:tmpsclr)
do j=1,n
tmpsclr = 0.d0
!$acc loop vector
do i=1,n
tmpsclr = tmpsclr+tmpmat(j,i)
enddo
outvec(j) = tmpsclr
enddo
the results come back all wrong. It looks like the summation is incomplete for most, but not all, of the elements of outvec. This is the case no matter where I put the reduction clause, whether with the gang or the vector. Changing the location changes the results, but never gives correct results.
The results I am getting in a simple test are like the following. matrix is 10x10 and all 1's, and invec is 1,2,3,...10. So the elements of outvec should each just be the sum of the elements in invec, 55. If I run the gang/vector version of the code, each element of outvec is 1, not 55. If I put the reduction with the vector, well, then I get the right answer, 55. And this continues to work until I get past 90 elements. When I get to 91, every element of outvec should be equal to 4186. But only the last one is, and all the rest are equal to 4095 (the sum of 1 to 90). As the number of elements get bigger the variation of values and the discrepancy from the correct answer gets worse.
I clearly don't understand how the reduction works. Can anyone explain?
The reduction clause needs to be on the loop where the reduction actually occurs, i.e. the vector loop. I'd also recommend using the "kernels" directive here, since "parallel" will create one kernel launch for the two loops, while "kernels" will create two kernels, one for each loop.
For example:
subroutine foo(n,matrix,invec,outvec)
integer n
real*8, intent(in) :: matrix(n,n)
real*8, intent(in) :: invec(n)
real*8, intent(out) :: outvec(n)
real*8 :: tmpmat(n,n)
real*8 :: tmpsclr
integer :: i,j,k
!$acc declare create(matrix, invec, outvec, tmpmat)
outvec = 0.d0
!$acc update device(matrix, invec, tmpmat, outvec)
!$acc kernels
!$acc loop gang
do j=1,n
!$acc loop vector
do i=1,n
tmpmat(i,j) = matrix(i,j)*invec(j)
enddo
enddo
!$acc loop gang
do j=1,n
tmpsclr = 0.d0
!$acc loop vector reduction(+:tmpsclr)
do i=1,n
tmpsclr = tmpsclr+tmpmat(j,i)
enddo
outvec(j) = tmpsclr
enddo
!$acc end kernels
!$acc update host(outvec)
end subroutine foo
% pgf90 -c -acc -Minfo=accel test2.f90
foo:
11, Generating create(matrix(:,:),invec(:),outvec(:))
15, Generating update device(outvec(:),tmpmat(:,:),invec(:),matrix(:,:))
20, Loop is parallelizable
22, Loop is parallelizable
Accelerator kernel generated
Generating Tesla code
20, !$acc loop gang, vector(4) ! blockidx%y threadidx%y
22, !$acc loop gang, vector(32) ! blockidx%x threadidx%x
28, Loop is parallelizable
Accelerator kernel generated
Generating Tesla code
28, !$acc loop gang ! blockidx%x
31, !$acc loop vector(128) ! threadidx%x
Sum reduction generated for tmpsclr
31, Loop is parallelizable
39, Generating update host(outvec(:))
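For a quick sanity check, a small driver for the 10x10 case described in the question (matrix all 1's, invec = 1..10, so every element of outvec should be 55) might look like:
program testmv
   implicit none
   integer, parameter :: n = 10
   real*8 :: matrix(n,n), invec(n), outvec(n)
   integer :: i
   matrix = 1.d0
   invec  = (/ (dble(i), i=1,n) /)
   call foo(n,matrix,invec,outvec)
   print *, outvec        ! expect ten values of 55.0
end program testmv
Compile it together with the subroutine above using pgf90 -acc.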
Hope this helps,
Mat

How can I fill a matrix using a fortran subroutine (or function) and pass the matrix back to the main program?

I'm working on a Fortran 90 assignment, and I'm having a lot of issues learning how to use subroutines and functions; I'm hoping someone can help me. If it isn't obvious, I'm extremely new to Fortran and much more comfortable with languages like C and Java.
Anyway, here's what I have to do: The user selects what they would like to do: add, subtract, multiply, or transpose two matrices. I'm using a select case for this, which works great. However, I obviously don't want to duplicate the same code to fill two matrices four different times, so I'm trying to make it a separate function. Ideally, I'd like to do something like this:
integer matrix1(11,11), matrix2(11,11)
integer rows1,cols1,rows2,cols2,i,j
case (1)
matrix1 = fillmatrix(rows1,cols1)
matrix2 = fillmatrix(rows2,cols2)
.
.
.
function fillmatrix(rows,columns)
integer input
read *,rows,columns
do i = 1, rows
do j = 1, columns
fillmatrix(i,j) = read *,input
end do
end do
end
Is there any way to do something like this? And am i making myself clear, because sometimes I have trouble saying what I mean.
Or is this possible?
matrix1 = fillmatrix(rows1,cols1)
function fillmatrix(rows,columns)
integer input,matrix(11,11)
//fill matrix
return matrix
end
In C or Java, you just have functions, but Fortran has both functions and subroutines. In a case like this, it might be easier to write it as a subroutine instead of as a function, so your call would look something like
integer matrix1(11,11), matrix2(11,11)
integer rows1,cols1,rows2,cols2,i,j
...
case (1)
call fillmatrix(matrix1)
call fillmatrix(matrix2)
...
where the subroutine would look something like
subroutine fillmatrix(m)
implicit none
integer, intent(out) :: m(:,:)
integer :: i, j
do j = 1,size(m,2)
do i = 1,size(m,1)
read *, m(i,j)
end do
end do
end subroutine fillmatrix
Note that I'm not directly specifying the array bounds - instead I'm figuring them out inside the subroutine. This means that this subroutine needs an explicit interface - the easiest way to get this is to put it in either a contains block or a module.
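For example (program name and the surrounding details are mine), placing the subroutine after a contains statement in the main program gives it the explicit interface it needs:
program matrices
   implicit none
   integer :: matrix1(11,11), matrix2(11,11)
   call fillmatrix(matrix1)
   call fillmatrix(matrix2)
   print *, matrix1
contains
   subroutine fillmatrix(m)
      integer, intent(out) :: m(:,:)
      integer :: i, j
      do j = 1, size(m,2)
         do i = 1, size(m,1)
            read *, m(i,j)
         end do
      end do
   end subroutine fillmatrix
end program matrices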
If you want to use a function, you need to know the size of the matrix before calling it. Here is a small example:
module readMatrix
implicit none
contains
function fillmatrix(cols,rows)
implicit none
! Argument/return value
integer,intent(in) :: rows,cols
integer :: fillmatrix(cols,rows)
! Loop counters
integer :: i,j
do j = 1, rows
do i = 1, cols
write(*,*) 'Enter matrix element ',i,j
read *,fillmatrix(i,j)
enddo ! i
enddo ! j
end function
end module
program test
use readMatrix
implicit none
integer,allocatable :: matrix(:,:)
integer :: row,col, stat
write(*,*) 'Enter number of rows'
read *,row
write(*,*) 'Enter number of cols'
read *,col
allocate( matrix(col,row), stat=stat )
if (stat/=0) stop 'Cannot allocate memory'
matrix = fillmatrix(col,row)
write(*,*) matrix
deallocate(matrix)
end program
This is similar, using a subroutine and a static array (like in the question):
module readMatrix
implicit none
contains
subroutine fillmatrix(cols,rows,matrix)
implicit none
! Argument/return value
integer,intent(out) :: rows,cols
integer,intent(out) :: matrix(:,:)
! Loop counters
integer :: i,j
write(*,*) 'Enter number of rows, up to a maximum of ',size(matrix,2)
read *,rows
write(*,*) 'Enter number of cols, up to a maximum of ',size(matrix,1)
read *,cols
if ( rows > size(matrix,2) .or. cols > size(matrix,1) ) &
stop 'Invalid dimension specified'
do j = 1, rows
do i = 1, cols
write(*,*) 'Enter matrix element ',i,j
read *,matrix(i,j)
enddo ! i
enddo ! j
end subroutine
end module
program test
use readMatrix
implicit none
integer,parameter :: maxCol=10,maxRow=10
integer :: matrix(maxCol,maxRow)
integer :: row,col
call fillmatrix(col,row,matrix)
write(*,*) matrix(1:col,1:row)
end program
You could even pass an allocatable array to the subroutine and allocate it there, but that's a different story...
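For completeness, a rough sketch of that allocatable variant (the name fillmatrix_alloc is mine; it should live in the same module so the caller sees its explicit interface):
subroutine fillmatrix_alloc(matrix)
   implicit none
   integer, allocatable, intent(out) :: matrix(:,:)
   integer :: rows, cols, i, j
   write(*,*) 'Enter number of rows and cols'
   read *, rows, cols
   allocate( matrix(cols,rows) )
   do j = 1, rows
      do i = 1, cols
         write(*,*) 'Enter matrix element ',i,j
         read *, matrix(i,j)
      enddo ! i
   enddo ! j
end subroutine
The caller then simply declares integer, allocatable :: matrix(:,:) and does call fillmatrix_alloc(matrix); no size needs to be known in advance.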