!version: 2d decomposition using communicator
!version:   routines MPI_COMM_SPLIT
!version: processor 0 outputs the grid using MPI_Gatherv
!version: uses MPI_Sendrecv for communication.
! solves the 2d Stommel Model of Ocean Circulation
! using a Five-point stencil and Jacobi iteration
!
!      gamma*((d(d(psi)/dx)/dx) + (d(d(psi)/dy)/dy))
!           +beta(d(psi)/dx)=-alpha*sin(pi*y/(2*ly))
!
module numz
! module defines the basic real type and pi
    integer, parameter:: b8 = selected_real_kind(14)
    real(b8), parameter :: pi = 3.141592653589793239_b8
end module
!*********************
module input
! module contains the inputs
    use numz
    integer nx,ny              ! number of interior points for our grid (50 50)
    real(b8) lx,ly             ! physical size of our grid (2000 2000)
    real(b8) alpha,beta,gamma  ! parameters of the calculation (1.0e-9 2.25e-11 3.0e-6)
    integer steps              ! number of Jacobi iteration steps (60)
end module
!*********************
module constants
! this module contains the invariants (constants) of the
! calculation.  these values are determined in the main
! routine and used in the do_jacobi Jacobi iteration subroutine.
! a6 is used in the force function
    use numz
    real(b8) dx,dy,a1,a2,a3,a4,a5,a6
    real(b8), allocatable:: for(:,:)   ! our force grid
end module
!*********************
module face
! this module contains the interfaces for the subroutines
! that modify the grid.  an interface is a good idea in this
! case because we are passing allocatable arrays.
! the grid arrays are dimensioned to include the ghost (boundary) cells.
    interface bc
        subroutine bc (psi,i1,i2,j1,j2)
        ! sets the boundary conditions
        ! input is the grid and the indices for the interior cells
            use numz
            integer,intent(in):: i1,i2,j1,j2
            real(b8),dimension(i1-1:i2+1,j1-1:j2+1):: psi
        end subroutine
    end interface
    interface do_jacobi
        subroutine do_jacobi (psi,new_psi,diff,i1,i2,j1,j2)
        ! does a single Jacobi iteration step
        ! input is the grid and the indices for the interior cells
        ! new_psi is temp storage for the updated grid
        ! output is the updated grid in psi and diff which is
        ! the sum of the differences between the old and new grids
            use numz
            integer,intent(in):: i1,i2,j1,j2
            real(b8),dimension(i1-1:i2+1,j1-1:j2+1):: psi
            real(b8),dimension(i1-1:i2+1,j1-1:j2+1):: new_psi
            real(b8) diff
        end subroutine
    end interface
! interface for the forcing function
    interface force
        function force(y)
            use numz
            real(b8) force,y
        end function force
    end interface
! interface for the routine that writes the grid
    interface write_grid
        subroutine write_grid (psi,i1,i2,j1,j2)
        ! input is the grid and the indices for the interior cells
            use numz
            integer,intent(in):: i1,i2,j1,j2
            real(b8),dimension(i1-1:i2+1,j1-1:j2+1):: psi
        end subroutine
    end interface
    interface do_transfer
        subroutine do_transfer (psi,i1,i2,j1,j2)
        ! exchanges ghost cells with the neighboring processors
        ! input is the grid and the indices for the interior cells
            use numz
            integer,intent(in):: i1,i2,j1,j2
            real(b8),dimension(i1-1:i2+1,j1-1:j2+1):: psi
        end subroutine
    end interface
    interface do_force
        subroutine do_force (i1,i2,j1,j2)
        ! fills the force grid for the interior cells
        ! input is the indices for the interior cells
            use numz
            integer,intent(in):: i1,i2,j1,j2
        end subroutine
    end interface
    interface unique
        function unique(name)
        ! appends the processor id to a file name
            character (len=*) name
            character (len=20) unique
        end function
    end interface
end module
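!*********************
! the module mympi below holds the bookkeeping for the 2d decomposition:
! each processor gets a position (myrow,mycol) in an nrow x ncol logical
! processor grid, ROW_COMM and COL_COMM group the processors that share a
! row or a column, and mytop, mybot, myleft, myright hold the neighbor
! ranks within those communicators (MPI_PROC_NULL on the physical edges).
! for example, with 6 processors (the case worked out in the comments in
! the main program) the processor grid is nrow=2 by ncol=3; world rank 4
! sits at (myrow,mycol)=(2,2), its rank in ROW_COMM is 1 with myleft=0
! (world rank 3) and myright=2 (world rank 5), and its rank in COL_COMM
! is 1 with mytop=0 (world rank 1) and mybot=MPI_PROC_NULL because it is
! in the bottom row of processors.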
module mympi
! module contains the mpi bookkeeping for the decomposition
    use mpi
!   include "mpif.h"    ! older alternative to "use mpi"
    integer numnodes,myid,mpi_err
    integer, parameter::mpi_master=0
    integer status(MPI_STATUS_SIZE)
    integer nrow,ncol
    integer myrow,mycol
    integer myid_col,myid_row,nodes_row,nodes_col
    integer ROW_COMM,COL_COMM
    integer mytop,mybot,myleft,myright
end module
!*********************
program stommel
    use numz
    use input
    use constants
    use face
    use mympi
    implicit none
    real(b8)t1,t2
    real(b8)diff,mydiff
    real(b8)dx2,dy2,bottom
    real(b8), allocatable:: psi(:,:)      ! our calculation grid
    real(b8), allocatable:: new_psi(:,:)  ! temp storage for the grid
    integer i,j,i1,i2,j1,j2
    integer iout
    real(b8) di,dj
! do the mpi init stuff
    call MPI_INIT( mpi_err )
    call MPI_COMM_SIZE( MPI_COMM_WORLD, numnodes, mpi_err )
    call MPI_COMM_RANK( MPI_COMM_WORLD, myid, mpi_err )
! find a reasonable grid topology based on the number
! of processors
    nrow=nint(sqrt(float(numnodes)))
    ncol=numnodes/nrow
    do while (nrow*ncol .ne. numnodes)
        nrow=nrow+1
        ncol=numnodes/nrow
    enddo
    if(nrow .gt. ncol)then
        i=ncol
        ncol=nrow
        nrow=i
    endif
    myrow=myid/ncol+1
    mycol=myid - (myrow-1)*ncol + 1
! make the row and col communicators
! all processors with the same row will be in the same ROW_COMM
    call MPI_COMM_SPLIT(MPI_COMM_WORLD,myrow,mycol,ROW_COMM,mpi_err)
    call MPI_COMM_RANK( ROW_COMM, myid_row, mpi_err )
    call MPI_COMM_SIZE( ROW_COMM, nodes_row, mpi_err )
! all processors with the same col will be in the same COL_COMM
    call MPI_COMM_SPLIT(MPI_COMM_WORLD,mycol,myrow,COL_COMM,mpi_err)
    call MPI_COMM_RANK( COL_COMM, myid_col, mpi_err )
    call MPI_COMM_SIZE( COL_COMM, nodes_col, mpi_err )
! find the ids of the neighbors using the communicators created above
    mytop   = myid_col-1; if( mytop   .lt. 0         ) mytop   = MPI_PROC_NULL
    mybot   = myid_col+1; if( mybot   .eq. nodes_col ) mybot   = MPI_PROC_NULL
    myleft  = myid_row-1; if( myleft  .lt. 0         ) myleft  = MPI_PROC_NULL
    myright = myid_row+1; if( myright .eq. nodes_row ) myright = MPI_PROC_NULL
! get the input.  see the comments in module input for typical values
    if(myid .eq. mpi_master)then
        read(*,*)nx,ny
        read(*,*)lx,ly
        read(*,*)alpha,beta,gamma
        read(*,*)steps
    endif
! send the data to the other processors
    call MPI_BCAST(nx,   1,MPI_INTEGER,         mpi_master,MPI_COMM_WORLD,mpi_err)
    call MPI_BCAST(ny,   1,MPI_INTEGER,         mpi_master,MPI_COMM_WORLD,mpi_err)
    call MPI_BCAST(steps,1,MPI_INTEGER,         mpi_master,MPI_COMM_WORLD,mpi_err)
    call MPI_BCAST(lx,   1,MPI_DOUBLE_PRECISION,mpi_master,MPI_COMM_WORLD,mpi_err)
    call MPI_BCAST(ly,   1,MPI_DOUBLE_PRECISION,mpi_master,MPI_COMM_WORLD,mpi_err)
    call MPI_BCAST(alpha,1,MPI_DOUBLE_PRECISION,mpi_master,MPI_COMM_WORLD,mpi_err)
    call MPI_BCAST(beta, 1,MPI_DOUBLE_PRECISION,mpi_master,MPI_COMM_WORLD,mpi_err)
    call MPI_BCAST(gamma,1,MPI_DOUBLE_PRECISION,mpi_master,MPI_COMM_WORLD,mpi_err)
! calculate the constants for the calculations
    dx=lx/(nx+1)
    dy=ly/(ny+1)
    dx2=dx*dx
    dy2=dy*dy
    bottom=2.0_b8*(dx2+dy2)
    a1=(dy2/bottom)+(beta*dx2*dy2)/(2.0_b8*gamma*dx*bottom)
    a2=(dy2/bottom)-(beta*dx2*dy2)/(2.0_b8*gamma*dx*bottom)
    a3=dx2/bottom
    a4=dx2/bottom
    a5=dx2*dy2/(gamma*bottom)
    a6=pi/(ly)
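! where the coefficients come from (sketch of the derivation): writing the
! model equation as gamma*(psi_xx + psi_yy) + beta*psi_x = F(y) and using
! centered differences at an interior point (i,j),
!     psi_xx ~ (psi(i+1,j) - 2*psi(i,j) + psi(i-1,j))/dx2
!     psi_yy ~ (psi(i,j+1) - 2*psi(i,j) + psi(i,j-1))/dy2
!     psi_x  ~ (psi(i+1,j) - psi(i-1,j))/(2*dx)
! then multiplying through by dx2*dy2/gamma and solving for psi(i,j) gives
!     psi(i,j) = a1*psi(i+1,j) + a2*psi(i-1,j)
!              + a3*psi(i,j+1) + a4*psi(i,j-1) - a5*F
! with bottom = 2*(dx2+dy2), which is exactly the update applied in the
! do_jacobi subroutine.  a6 only scales the argument of the sine in the
! force function.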
! set the indices for the interior of the grid
! we block the grid across the processors
! for a 50 x 50 grid with 6 processors we get
!  nrow= 2 ncol= 3
!  myid= 0 myrow= 1 mycol= 1   (  1 <= i <=  25) , (  1 <= j <=  17)
!  myid= 1 myrow= 1 mycol= 2   (  1 <= i <=  25) , ( 18 <= j <=  33)
!  myid= 2 myrow= 1 mycol= 3   (  1 <= i <=  25) , ( 34 <= j <=  50)
!  myid= 3 myrow= 2 mycol= 1   ( 26 <= i <=  50) , (  1 <= j <=  17)
!  myid= 4 myrow= 2 mycol= 2   ( 26 <= i <=  50) , ( 18 <= j <=  33)
!  myid= 5 myrow= 2 mycol= 3   ( 26 <= i <=  50) , ( 34 <= j <=  50)
    dj=real(ny,b8)/real(nodes_row,b8)
    j1=nint(1.0_b8+myid_row*dj)
    j2=nint(1.0_b8+(myid_row+1)*dj)-1
    di=real(nx,b8)/real(nodes_col,b8)
    i1=nint(1.0_b8+myid_col*di)
    i2=nint(1.0_b8+(myid_col+1)*di)-1
    if(myid == mpi_master)write(*,'("rows= ",i4," columns= ",i4)')nodes_row,nodes_col
    write(*,'("myid= ",i4," myid_row= ",i4," myid_col= ",i4)')myid,myid_row,myid_col
    write(*,101)myid,myrow,mycol,i1,i2,j1,j2
101 format("myid= ",i4," myrow= ",i4," mycol= ",i4,3x, &
           " (",i3," <= i <= ",i3,") , ",              &
           " (",i3," <= j <= ",i3,")")
! allocate the grid to (i1-1:i2+1,j1-1:j2+1); this includes the boundary cells
    allocate(psi(i1-1:i2+1,j1-1:j2+1))
    allocate(new_psi(i1-1:i2+1,j1-1:j2+1))
    allocate(for(i1-1:i2+1,j1-1:j2+1))
! set initial guess for the value of the grid
    psi=1.0_b8
! set boundary conditions
    call bc(psi,i1,i2,j1,j2)
    call do_transfer(psi,i1,i2,j1,j2)
    call do_force(i1,i2,j1,j2)
! do the Jacobi iterations
    t1=MPI_Wtime()
    iout=steps/100
    if(iout == 0)iout=1
    do i=1,steps
        call do_jacobi(psi,new_psi,mydiff,i1,i2,j1,j2)
        call do_transfer(psi,i1,i2,j1,j2)
!       write(*,*)myid,i,mydiff
        call MPI_REDUCE(mydiff,diff,1,MPI_DOUBLE_PRECISION, &
                        MPI_SUM,mpi_master,MPI_COMM_WORLD,mpi_err)
        if(myid .eq. mpi_master .and. mod(i,iout) .eq. 0)write(*,'(i6,1x,g20.10)')i,diff
    enddo
    t2=MPI_Wtime()
    if(myid .eq. mpi_master)write(*,'("run time =",f10.2)')t2-t1
! write out the final grid
    call write_grid(psi,i1,i2,j1,j2)
100 continue
    call MPI_Finalize(mpi_err)
end program stommel
!*********************
subroutine bc(psi,i1,i2,j1,j2)
! sets the boundary conditions
! input is the grid and the indices for the interior cells
    use numz
    use mympi
    use input, only : nx,ny
    implicit none
    integer,intent(in):: i1,i2,j1,j2
    real(b8),dimension(i1-1:i2+1,j1-1:j2+1):: psi
! i runs over 1..nx and j over 1..ny (see the decomposition in the main program)
! do the top edges
    if(i1 .eq. 1) psi(i1-1,:)=0.0_b8
! do the bottom edges
    if(i2 .eq. nx) psi(i2+1,:)=0.0_b8
! do left edges
    if(j1 .eq. 1) psi(:,j1-1)=0.0_b8
! do right edges
    if(j2 .eq. ny) psi(:,j2+1)=0.0_b8
end subroutine bc
!*********************
subroutine do_jacobi(psi,new_psi,diff,i1,i2,j1,j2)
! does a single Jacobi iteration step
! input is the grid and the indices for the interior cells
! new_psi is temp storage for the updated grid
! output is the updated grid in psi and diff which is
! the sum of the differences between the old and new grids
    use numz
    use constants
    use face , only : force
    implicit none
    integer,intent(in) :: i1,i2,j1,j2
    real(b8),dimension(i1-1:i2+1,j1-1:j2+1):: psi
    real(b8),dimension(i1-1:i2+1,j1-1:j2+1):: new_psi
    real(b8) diff
    integer i,j
    real(b8) y
    diff=0.0_b8
    do j=j1,j2
        do i=i1,i2
!           y=j*dy
            new_psi(i,j)=a1*psi(i+1,j) + a2*psi(i-1,j) + &
                         a3*psi(i,j+1) + a4*psi(i,j-1) - &
                         a5*for(i,j)   !  a5*force(y)
            diff=diff+abs(new_psi(i,j)-psi(i,j))
        enddo
    enddo
    psi(i1:i2,j1:j2)=new_psi(i1:i2,j1:j2)
end subroutine do_jacobi
!*********************
function force(y)
    use numz
    use input
    use constants
    implicit none
    real(b8) force,y
    force=-alpha*sin(y*a6)
end function force
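! how write_grid assembles the output (sketch): the full grid, including
! the boundary cells, has rows i=0..nx+1 and columns j=0..ny+1.  for each
! output row i every processor reports how many columns of that row it
! holds (dj, which is zero when the row lies outside its block) with an
! MPI_GATHER; the master converts the counts into offsets and a following
! MPI_GATHERV delivers the row into arow in column order.  for example,
! with the 6-processor 50x50 layout shown in the main program, a row in
! the top half of the grid is held by world ranks 0, 1 and 2 (columns
! 0..17, 18..33 and 34..51 once the physical boundary columns are
! attached), so counts = 18,16,18,0,0,0 and offsets = 0,18,34,52,52,52.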
!*********************
subroutine write_grid(psi,i1,i2,j1,j2)
! processor 0 writes the complete grid to a file
! input is the grid and the indices for the interior cells
    use numz
    use mympi
    use face ,only : unique
    use input
    implicit none
    integer,intent(in):: i1,i2,j1,j2
    real(b8),dimension(i1-1:i2+1,j1-1:j2+1):: psi
    integer i,j,k,i0,j0,i3,j3
    integer istart,iend,jstart,jend,local_bounds(4)
    integer, allocatable :: bounds(:,:),counts(:),offsets(:)
    integer dj,mystart,myend,icol
    real(b8),allocatable :: arow(:)
! the master prints the whole grid a line at a time.
! for a given line, each processor checks to see if
! it holds part of that line and then sends the
! number of cells held using MPI_GATHER.  MPI_GATHERV
! is then used to send the data
    istart=i1
    iend=i2
    jstart=j1
    jend=j2
! processors on the physical boundary also own the boundary cells
    if(istart .eq. 1)istart=0
    if(jstart .eq. 1)jstart=0
    if(iend .eq. nx)iend=nx+1
    if(jend .eq. ny)jend=ny+1
    i0=0
    j0=0
    i3=nx+1
    j3=ny+1
! arow, counts and offsets are only used by the master but are allocated
! everywhere so that the gather calls below always see valid arguments
    allocate(arow(j0:j3))
    allocate(counts(0:numnodes-1))
    allocate(offsets(0:numnodes-1))
    offsets(0)=0
    if(myid .eq. mpi_master)then
        open(18,file=unique("out5d_"),recl=max(80,15*((ny)+3)+2))
!       write(18,101)i0,i3,j0,j3
!101    format(6x," (",i3," <= i <= ",i3,") , ", &
!              " (",i3," <= j <= ",i3,")")
        write(18,'(2i6)')i3-i0+1,j3-j0+1
    endif
    do i=i0,i3
        if(i .ge. istart .and. i .le. iend)then
            dj=jend-jstart+1   ! we hold dj cells of row i
            mystart=jstart
            myend=jend
            icol=i
        else
            dj=0               ! we send nothing for this row
            mystart=jstart
            myend=jstart
            icol=istart
        endif
        call MPI_GATHER(dj,    1,MPI_INTEGER, &
                        counts,1,MPI_INTEGER, &
                        mpi_master,MPI_COMM_WORLD,mpi_err)
        if(myid .eq. mpi_master)then
!           write(*,*)"for ",i," counts = ",counts," sum = ",sum(counts)
            do k=1,numnodes-1
                offsets(k)=counts(k-1)+offsets(k-1)
            enddo
        endif
        call MPI_GATHERV(psi(icol,mystart:myend),dj,MPI_DOUBLE_PRECISION, &
                         arow(0),counts,offsets,MPI_DOUBLE_PRECISION,     &
                         mpi_master,MPI_COMM_WORLD,mpi_err)
        if(myid .eq. mpi_master)then
            do j=j0,j3
                write(18,'(g14.7)',advance="no")arow(j)
                if(j .ne. j3)write(18,'(" ")',advance="no")
            enddo
            write(18,*)
        endif
    enddo
    if(myid .eq. mpi_master)close(18)
end subroutine write_grid
!*********************
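! about the ghost cell exchange in do_transfer (sketch): each block owns
! psi(i1:i2,j1:j2) plus one extra row and column of ghost cells on every
! side.  a processor sends its first and last owned columns, psi(:,j1)
! and psi(:,j2), to myleft and myright over ROW_COMM and receives the
! neighbors' edge columns into psi(:,j1-1) and psi(:,j2+1); the edge rows
! are exchanged the same way with mytop and mybot over COL_COMM.  on the
! physical boundary the neighbor is MPI_PROC_NULL, which turns the call
! into a no-op.  the even/odd split below only changes the order in which
! the two exchanges are posted; MPI_SENDRECV itself cannot deadlock, so
! either ordering is correct.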
subroutine do_transfer(psi,i1,i2,j1,j2)
! exchanges ghost cells with the neighboring processors
! input is the grid and the indices for the interior cells
    use numz
    use mympi
    use input
    implicit none
    integer,intent(in):: i1,i2,j1,j2
    real(b8),dimension(i1-1:i2+1,j1-1:j2+1):: psi
    integer num_x,num_y
    logical even
    num_x=i2-i1+3
    num_y=j2-j1+3
    if(even(myid_col))then
        call MPI_Sendrecv(psi(:,j1),  num_x,MPI_DOUBLE_PRECISION, myleft, 100, &
                          psi(:,j1-1),num_x,MPI_DOUBLE_PRECISION, myleft, 100, &
                          ROW_COMM,status,mpi_err)
        call MPI_Sendrecv(psi(:,j2),  num_x,MPI_DOUBLE_PRECISION, myright,100, &
                          psi(:,j2+1),num_x,MPI_DOUBLE_PRECISION, myright,100, &
                          ROW_COMM,status,mpi_err)
    else
! we are on an odd col processor
        call MPI_Sendrecv(psi(:,j2),  num_x,MPI_DOUBLE_PRECISION, myright,100, &
                          psi(:,j2+1),num_x,MPI_DOUBLE_PRECISION, myright,100, &
                          ROW_COMM,status,mpi_err)
        call MPI_Sendrecv(psi(:,j1),  num_x,MPI_DOUBLE_PRECISION, myleft, 100, &
                          psi(:,j1-1),num_x,MPI_DOUBLE_PRECISION, myleft, 100, &
                          ROW_COMM,status,mpi_err)
    endif
    if(even(myid_row))then
        call MPI_Sendrecv(psi(i1,:),  num_y,MPI_DOUBLE_PRECISION, mytop,10, &
                          psi(i1-1,:),num_y,MPI_DOUBLE_PRECISION, mytop,10, &
                          COL_COMM,status,mpi_err)
        call MPI_Sendrecv(psi(i2,:),  num_y,MPI_DOUBLE_PRECISION, mybot,10, &
                          psi(i2+1,:),num_y,MPI_DOUBLE_PRECISION, mybot,10, &
                          COL_COMM,status,mpi_err)
    else
! we are on an odd row processor
        call MPI_Sendrecv(psi(i2,:),  num_y,MPI_DOUBLE_PRECISION, mybot,10, &
                          psi(i2+1,:),num_y,MPI_DOUBLE_PRECISION, mybot,10, &
                          COL_COMM,status,mpi_err)
        call MPI_Sendrecv(psi(i1,:),  num_y,MPI_DOUBLE_PRECISION, mytop,10, &
                          psi(i1-1,:),num_y,MPI_DOUBLE_PRECISION, mytop,10, &
                          COL_COMM,status,mpi_err)
    endif
end subroutine do_transfer
!*********************
subroutine do_force (i1,i2,j1,j2)
! fills the force grid for the interior cells
! input is the indices for the interior cells
    use numz
    use constants, only:for,dy
    use face, only : force
    implicit none
    integer,intent(in):: i1,i2,j1,j2
    real(b8) y
    integer i,j
    do i=i1,i2
        do j=j1,j2
            y=j*dy
            for(i,j)=force(y)
        enddo
    enddo
end subroutine do_force
!*********************
function even(i)
! returns .true. if i is even
    implicit none
    integer i,j
    logical even
    j=i/2
    if(j*2 .eq. i)then
        even = .true.
    else
        even = .false.
    endif
    return
end function even
!*********************
function unique(name)
! returns the input name with the 3-digit processor id appended,
! e.g. unique("out5d_") on processor 3 gives "out5d_003"
    use numz
    use mympi
    character (len=*) name
    character (len=20) unique
    character (len=80) temp
    if(myid .gt. 99)then
        write(temp,"(a,i3)")trim(name),myid
    else
        if(myid .gt. 9)then
            write(temp,"(a,'0',i2)")trim(name),myid
        else
            write(temp,"(a,'00',i1)")trim(name),myid
        endif
    endif
    unique=temp
    return
end function unique
!*********************
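! example of a run (the input values are the ones suggested in the
! comments in module input; the launcher line is only one possible way to
! start the job and may differ on your system):
!
!     cat > stommel.in << EOF
!     50 50
!     2000 2000
!     1.0e-9 2.25e-11 3.0e-6
!     60
!     EOF
!     mpirun -np 6 ./stommel < stommel.in
!
! processor 0 reads the four input lines, broadcasts the values, reports
! the iteration count and summed difference every few steps, and finally
! writes the assembled grid to the file out5d_000 named by the unique
! function.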