!version: 2d decomposition using communicator
!version: routines MPI_COMM_SPLIT
! solves the 2d Stommel Model of Ocean Circulation
! using a Five-point stencil and Jacobi iteration
!
! gamma*((d(d(psi)/dx)/dx) + (d(d(psi)/dy)/dy))
!        +beta(d(psi)/dx)=-alpha*sin(pi*y/(2*ly))
!
module numz
! module defines the basic real type and pi
    integer, parameter:: b8 = selected_real_kind(14)
    real(b8), parameter :: pi = 3.141592653589793239_b8
end module
!*********************
module input
! module contains the inputs
    use numz
    integer nx,ny               ! number of interior points for our grid (50 50)
    real(b8) lx,ly              ! physical size of our grid (2000 2000)
    real(b8) alpha,beta,gamma   ! parameters of the calculation (1.0e-9 2.25e-11 3.0e-6)
    integer steps               ! number of Jacobi iteration steps (60)
end module
!*********************
module constants
! this module contains the invariants (constants) of the
! calculation. these values are determined in the main
! routine and used in the do_jacobi Jacobi iteration subroutine
! a6 is used in the force function
    use numz
    real(b8) dx,dy,a1,a2,a3,a4,a5,a6
    real(b8), allocatable:: for(:,:)   ! our force grid
end module
!*********************
module face
! this module contains the interfaces for the subroutines
! that modify the grid. an interface is a good idea in this
! case because we are passing allocatable arrays
    interface bc
        subroutine bc (psi,i1,i2,j1,j2)
        ! sets the boundary conditions
        ! input is the grid and the indices for the interior cells
            use numz
            integer,intent(in):: i1,i2,j1,j2
            real(b8),dimension(i1-1:i2+1,j1-1:j2+1):: psi
        end subroutine
    end interface
    interface do_jacobi
        subroutine do_jacobi (psi,new_psi,diff,i1,i2,j1,j2)
        ! does a single Jacobi iteration step
        ! input is the grid and the indices for the interior cells
        ! new_psi is temp storage for the updated grid
        ! output is the updated grid in psi and diff which is
        ! the sum of the differences between the old and new grids
            use numz
            integer,intent(in):: i1,i2,j1,j2
            real(b8),dimension(i1-1:i2+1,j1-1:j2+1):: psi
            real(b8),dimension(i1-1:i2+1,j1-1:j2+1):: new_psi
            real(b8) diff
        end subroutine
    end interface
! interface for the forcing function
    interface force
        function force(y)
            use numz
            real(b8) force,y
        end function force
    end interface
! interface for setting the force routine
    interface do_force
        subroutine do_force (i1,i2,j1,j2)
        ! sets the force array
        ! input is the grid and the indices for the interior cells
            use numz
            integer,intent(in):: i1,i2,j1,j2
        end subroutine
    end interface
! interface for routine to write the grid
    interface write_grid
        subroutine write_grid (psi,i1,i2,j1,j2)
        ! input is the grid and the indices for the interior cells
            use numz
            integer,intent(in):: i1,i2,j1,j2
            real(b8),dimension(i1-1:i2+1,j1-1:j2+1):: psi
        end subroutine
    end interface
! interface for the ghost cell exchange routine
    interface do_transfer
        subroutine do_transfer (psi,i1,i2,j1,j2)
        ! exchanges ghost cell values with neighboring processors
        ! input is the grid and the indices for the interior cells
            use numz
            integer,intent(in):: i1,i2,j1,j2
            real(b8),dimension(i1-1:i2+1,j1-1:j2+1):: psi
        end subroutine
    end interface
! interface for the routine that builds a unique file name for each task
    interface unique
        function unique(name)
            character (len=*) name
            character (len=20) unique
        end function
    end interface
end module
module mpi
!DEC$ NOFREEFORM
    include "mpif.h"
!DEC$ FREEFORM
    integer numnodes,myid,mpi_err
    integer, parameter::mpi_master=0
    integer status(MPI_STATUS_SIZE)
    integer nrow,ncol
    integer myrow,mycol
    integer myid_col,myid_row,nodes_row,nodes_col
    integer ROW_COMM,COL_COMM
    integer mytop,mybot,myleft,myright
end module
!*********************
module get_glob
contains
    subroutine get_index(on_proc,proc_index)
        use mpi
        implicit none
        character (len=MPI_MAX_PROCESSOR_NAME), allocatable :: the_names(:)
        integer on_proc,proc_index,i
        allocate(the_names(0:numnodes-1))
        call MPI_GET_PROCESSOR_NAME(the_names(myid),i,mpi_err)
        call MPI_allGather(the_names(myid),MPI_MAX_PROCESSOR_NAME,MPI_CHARACTER, &
                           the_names,      MPI_MAX_PROCESSOR_NAME,MPI_CHARACTER, &
                           MPI_COMM_WORLD,mpi_err)
        on_proc=0
        do i=0,numnodes-1
            if(LGE(the_names(myid),the_names(i)) .and. &
               LLE(the_names(myid),the_names(i)))then
                on_proc=on_proc+1
                if(i .eq. myid)proc_index=on_proc
            endif
        enddo
!        write(*,*)"myid = ",myid," which is mpi task ",proc_index," on ",the_names(myid)
        deallocate(the_names)
    end subroutine
end module

program stommel
    use numz
    use input
    use constants
    use face
    use mpi
    use get_glob
    implicit none
    character*(MPI_MAX_PROCESSOR_NAME) name
    integer on_proc,proc_index
    real(b8) t1,t2
    real(b8) diff,mydiff
    real(b8) dx2,dy2,bottom
    real(b8), allocatable:: psi(:,:)      ! our calculation grid
    real(b8), allocatable:: new_psi(:,:)  ! temp storage for the grid
    integer i,j,i1,i2,j1,j2
    integer omp_get_max_threads,omp_get_num_threads,omp_get_thread_num
    integer iout
    integer n_mpi,n_thread
    real(b8) di,dj
    character (len=80):: aline1,aline2,aline3,aline4
! do the mpi init stuff
    call MPI_INIT( mpi_err )
    call MPI_COMM_SIZE( MPI_COMM_WORLD, numnodes, mpi_err )
    call MPI_COMM_RANK( MPI_COMM_WORLD, myid, mpi_err )
    call MPI_GET_PROCESSOR_NAME(name,i,mpi_err)
!    write(*,*)"threads=",omp_get_max_threads()," on ",name
    n_thread=omp_get_num_threads()
    n_mpi=numnodes
! find a reasonable grid topology based on the number
! of processors
    nrow=nint(sqrt(float(numnodes)))
    ncol=numnodes/nrow
    do while (nrow*ncol .ne. numnodes)
        nrow=nrow+1
        ncol=numnodes/nrow
    enddo
    if(nrow .gt. ncol)then
        i=ncol
        ncol=nrow
        nrow=i
    endif
    myrow=myid/ncol+1
    mycol=myid - (myrow-1)*ncol + 1
! make the row and col communicators
! all processors with the same row will be in the same ROW_COMM
    call MPI_COMM_SPLIT(MPI_COMM_WORLD,myrow,mycol,ROW_COMM,mpi_err)
    call MPI_COMM_RANK( ROW_COMM, myid_row, mpi_err )
    call MPI_COMM_SIZE( ROW_COMM, nodes_row, mpi_err )
! all processors with the same col will be in the same COL_COMM
    call MPI_COMM_SPLIT(MPI_COMM_WORLD,mycol,myrow,COL_COMM,mpi_err)
    call MPI_COMM_RANK( COL_COMM, myid_col, mpi_err )
    call MPI_COMM_SIZE( COL_COMM, nodes_col, mpi_err )
! find id of neighbors using the communicators created above
    mytop   = myid_col-1; if( mytop   .lt. 0         )mytop   = MPI_PROC_NULL
    mybot   = myid_col+1; if( mybot   .eq. nodes_col )mybot   = MPI_PROC_NULL
    myleft  = myid_row-1; if( myleft  .lt. 0         )myleft  = MPI_PROC_NULL
    myright = myid_row+1; if( myright .eq. nodes_row )myright = MPI_PROC_NULL
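! illustration (added for reference, not used by the code): with 6
! processors the topology search above gives nrow= 2 and ncol= 3, so the
! two MPI_COMM_SPLIT calls produce
!     ROW_COMM groups (color=myrow, key=mycol): {0,1,2} and {3,4,5}
!     COL_COMM groups (color=mycol, key=myrow): {0,3}, {1,4} and {2,5}
! myid_row is a task's rank within its ROW_COMM (0..nodes_row-1) and
! myid_col its rank within its COL_COMM (0..nodes_col-1).  myleft and
! myright are the neighbor ranks inside ROW_COMM, mytop and mybot the
! neighbor ranks inside COL_COMM, with MPI_PROC_NULL substituted at the
! edges of the processor grid.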
! get the input. see above for typical values
    aline1="200 200"
    aline2="2000000 2000000"
    aline3="1.0e-9 2.25e-11 3.0e-6"
    aline4="7500"
!    write(*,*)"do read"
    if(myid .eq. mpi_master)then
        read(aline1,*)nx,ny
        read(aline2,*)lx,ly
        read(aline3,*)alpha,beta,gamma
        read(aline4,*)steps
! to take the parameters from standard input instead of the hard-coded
! strings above, uncomment the following reads
!        read(*,*)nx,ny
!        read(*,*)lx,ly
!        read(*,*)alpha,beta,gamma
!        read(*,*)steps
        write(*,*)nx,ny
        write(*,*)lx,ly
        write(*,*)alpha,beta,gamma
        write(*,*)steps
    endif
!    write(*,*)"do bcast"
! send the data to the other processors
    call MPI_BCAST(nx,   1,MPI_INTEGER,         mpi_master,MPI_COMM_WORLD,mpi_err)
    call MPI_BCAST(ny,   1,MPI_INTEGER,         mpi_master,MPI_COMM_WORLD,mpi_err)
    call MPI_BCAST(steps,1,MPI_INTEGER,         mpi_master,MPI_COMM_WORLD,mpi_err)
    call MPI_BCAST(lx,   1,MPI_DOUBLE_PRECISION,mpi_master,MPI_COMM_WORLD,mpi_err)
    call MPI_BCAST(ly,   1,MPI_DOUBLE_PRECISION,mpi_master,MPI_COMM_WORLD,mpi_err)
    call MPI_BCAST(alpha,1,MPI_DOUBLE_PRECISION,mpi_master,MPI_COMM_WORLD,mpi_err)
    call MPI_BCAST(beta, 1,MPI_DOUBLE_PRECISION,mpi_master,MPI_COMM_WORLD,mpi_err)
    call MPI_BCAST(gamma,1,MPI_DOUBLE_PRECISION,mpi_master,MPI_COMM_WORLD,mpi_err)
!    write(*,*)nx,ny
!    write(*,*)lx,ly
!    write(*,*)alpha,beta,gamma
!    write(*,*)steps
! calculate the constants for the calculations
    dx=lx/(nx+1)
    dy=ly/(ny+1)
    dx2=dx*dx
    dy2=dy*dy
    bottom=2.0_b8*(dx2+dy2)
    a1=(dy2/bottom)+(beta*dx2*dy2)/(2.0_b8*gamma*dx*bottom)
    a2=(dy2/bottom)-(beta*dx2*dy2)/(2.0_b8*gamma*dx*bottom)
    a3=dx2/bottom
    a4=dx2/bottom
    a5=dx2*dy2/(gamma*bottom)
    a6=pi/(ly)
! set the indices for the interior of the grid
! we block the grid across the processors
! for a 50 x 50 grid with 6 processors we get
! nrow= 2 ncol= 3
! myid=   0 myrow=   1 mycol=   1    (  1 <= i <=  25) ,  (  1 <= j <=  17)
! myid=   1 myrow=   1 mycol=   2    (  1 <= i <=  25) ,  ( 18 <= j <=  33)
! myid=   2 myrow=   1 mycol=   3    (  1 <= i <=  25) ,  ( 34 <= j <=  50)
! myid=   3 myrow=   2 mycol=   1    ( 26 <= i <=  50) ,  (  1 <= j <=  17)
! myid=   4 myrow=   2 mycol=   2    ( 26 <= i <=  50) ,  ( 18 <= j <=  33)
! myid=   5 myrow=   2 mycol=   3    ( 26 <= i <=  50) ,  ( 34 <= j <=  50)
    dj=real(nx,b8)/real(nodes_row,b8)
    j1=nint(1.0_b8+myid_row*dj)
    j2=nint(1.0_b8+(myid_row+1)*dj)-1
    di=real(ny,b8)/real(nodes_col,b8)
    i1=nint(1.0_b8+myid_col*di)
    i2=nint(1.0_b8+(myid_col+1)*di)-1
    if(myid == mpi_master)write(*,'("rows= ",i4," columns= ",i4)')nodes_row,nodes_col
    write(*,'("myid= ",i4," myid_row= ",i4," myid_col= ",i4)')myid,myid_row,myid_col
    write(*,101)myid,myrow,mycol,i1,i2,j1,j2
101 format("myid= ",i4," myrow= ",i4," mycol= ",i4,3x, &
           " (",i3," <= i <= ",i3,") , ", &
           " (",i3," <= j <= ",i3,")")
! allocate the grid to (i1-1:i2+1,j1-1:j2+1) this includes boundary cells
    allocate(psi(i1-1:i2+1,j1-1:j2+1))
    allocate(new_psi(i1-1:i2+1,j1-1:j2+1))
    allocate(for(i1-1:i2+1,j1-1:j2+1))
! set initial guess for the value of the grid
    psi=1.0_b8
! set boundary conditions
    call bc(psi,i1,i2,j1,j2)
! set the force array
    call do_force(i1,i2,j1,j2)
    call do_transfer(psi,i1,i2,j1,j2)
    iout=steps/100
    if(iout == 0)iout=1
    t1=MPI_Wtime()
! do the Jacobi iterations
    do i=1,steps
        call do_jacobi(psi,new_psi,mydiff,i1,i2,j1,j2)
        call do_transfer(psi,i1,i2,j1,j2)
!        write(*,*)myid,i,mydiff
        call MPI_REDUCE(mydiff,diff,1,MPI_DOUBLE_PRECISION, &
                        MPI_SUM,mpi_master,MPI_COMM_WORLD,mpi_err)
        if(myid .eq. mpi_master .and. mod(i,iout) .eq. 0)write(*,'(i6,1x,g20.10)')i,diff
    enddo
    t2=MPI_Wtime()
    if(myid .eq. mpi_master)write(*,*)"run time =",t2-t1,n_mpi,n_thread
! write out the final grid
!    call write_grid(psi,i1,i2,j1,j2)
100 continue
    call MPI_Finalize(mpi_err)
end program stommel
!*********************
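! overview of the routines below (a restatement of what the code above
! already does, added for reference): every pass of the main loop calls
! do_jacobi, which updates each interior point of the local block with
! the five-point stencil
!     new_psi(i,j) = a1*psi(i+1,j) + a2*psi(i-1,j)
!                  + a3*psi(i,j+1) + a4*psi(i,j-1) - a5*for(i,j)
! where a1..a5 are the constants computed in the main program from dx,
! dy, beta and gamma, and for(i,j) holds the forcing term
! -alpha*sin(y*a6) filled in by do_force.  do_transfer then refreshes
! the ghost cells from the neighboring tasks, and MPI_REDUCE sums the
! per-task residuals (mydiff) into diff on the master for printing.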
subroutine bc(psi,i1,i2,j1,j2)
! sets the boundary conditions
! input is the grid and the indices for the interior cells
    use numz
    use mpi
    use input, only : nx,ny
    implicit none
    integer,intent(in):: i1,i2,j1,j2
    real(b8),dimension(i1-1:i2+1,j1-1:j2+1):: psi
! do the top edges
    if(i1 .eq. 1)  psi(i1-1,:)=0.0_b8
! do the bottom edges
    if(i2 .eq. ny) psi(i2+1,:)=0.0_b8
! do left edges
    if(j1 .eq. 1)  psi(:,j1-1)=0.0_b8
! do right edges
    if(j2 .eq. nx) psi(:,j2+1)=0.0_b8
end subroutine bc
!*********************
subroutine do_jacobi(psi,new_psi,diff,i1,i2,j1,j2)
! does a single Jacobi iteration step
! input is the grid and the indices for the interior cells
! new_psi is temp storage for the updated grid
! output is the updated grid in psi and diff which is
! the sum of the differences between the old and new grids
    use numz
    use mpi, only: numnodes,myid
    use constants
    implicit none
    integer,intent(in) :: i1,i2,j1,j2
    real(b8),dimension(i1-1:i2+1,j1-1:j2+1):: psi
    real(b8),dimension(i1-1:i2+1,j1-1:j2+1):: new_psi
    real(b8) diff
    integer i,j
    integer omp_get_num_threads,omp_get_thread_num
    real(b8) y
    diff=0.0_b8
!$OMP PARALLEL
!$OMP DO SCHEDULE (STATIC) private(i) firstprivate(a1,a2,a3,a4,a5) reduction(+:diff)
    do j=j1,j2
        do i=i1,i2
!            y=j*dy
            new_psi(i,j)=a1*psi(i+1,j) + a2*psi(i-1,j) + &
                         a3*psi(i,j+1) + a4*psi(i,j-1) - &
                         a5*for(i,j)
!                         a5*force(y)
            diff=diff+abs(new_psi(i,j)-psi(i,j))
        enddo
    enddo
!$OMP END DO
!$OMP DO SCHEDULE (STATIC) private(i)
    do j=j1,j2
        do i=i1,i2
            psi(i,j)=new_psi(i,j)
        enddo
    enddo
!$OMP END DO
!$OMP END PARALLEL
end subroutine do_jacobi
!*********************
function force(y)
    use numz
    use input
    use constants
    implicit none
    real(b8) force,y
    force=-alpha*sin(y*a6)
end function force
!*********************
subroutine do_force (i1,i2,j1,j2)
! sets the force array
! input is the grid and the indices for the interior cells
    use numz
    use constants, only: for,dy
    use face, only : force
    implicit none
    integer,intent(in):: i1,i2,j1,j2
    real(b8) y
    integer i,j
    do i=i1,i2
        do j=j1,j2
            y=j*dy
            for(i,j)=force(y)
        enddo
    enddo
end subroutine
!*********************
subroutine write_grid(psi,i1,i2,j1,j2)
! input is the grid and the indices for the interior cells
    use numz
    use mpi
    use face, only : unique
    implicit none
    integer,intent(in):: i1,i2,j1,j2
    real(b8),dimension(i1-1:i2+1,j1-1:j2+1):: psi
    integer i,j
    integer istart,iend,jstart,jend
! each processor writes its section of the grid
    istart=i1-1
    iend=i2+1
    jstart=j1-1
    jend=j2+1
    open(18,file=unique("out_both"),recl=max(80,15*((jend-jstart)+3)+2))
    write(18,101)myid,istart,iend,jstart,jend
101 format("myid= ",i3,3x, &
           " (",i3," <= i <= ",i3,") , ", &
           " (",i3," <= j <= ",i3,")")
    do i=istart,iend
        do j=jstart,jend
            write(18,'(g14.7)',advance="no")psi(i,j)
            if(j .ne. jend)write(18,'(" ")',advance="no")
        enddo
        write(18,*)
    enddo
    close(18)
end subroutine write_grid
!*********************
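! note on the exchange below (descriptive only): do_transfer swaps ghost
! cells with the four neighbors.  Columns psi(:,j1) and psi(:,j2) are
! sent to the left and right neighbors within ROW_COMM, and the
! neighbors' edge columns are received into psi(:,j1-1) and psi(:,j2+1)
! (num_x values each, ghost rows included); likewise rows psi(i1,:) and
! psi(i2,:) travel up and down within COL_COMM into psi(i1-1,:) and
! psi(i2+1,:) (num_y values each).  The even/odd branches order the
! blocking MPI_SEND/MPI_RECV pairs, and calls addressed to MPI_PROC_NULL
! at the edges of the processor grid simply return, so the exchange does
! not stall at the domain boundaries.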
subroutine do_transfer(psi,i1,i2,j1,j2)
! exchanges ghost cell values with neighboring processors
! input is the grid and the indices for the interior cells
    use numz
    use mpi
    use input
    implicit none
    integer,intent(in):: i1,i2,j1,j2
    real(b8),dimension(i1-1:i2+1,j1-1:j2+1):: psi
    integer num_x,num_y
    logical even
    num_x=i2-i1+3
    num_y=j2-j1+3
    if(even(myid_col))then
! send to left
        call MPI_SEND(psi(:,j1),  num_x,MPI_DOUBLE_PRECISION,myleft, &
                      100,ROW_COMM,mpi_err)
!        write(*,*)"sl",myid,psi(:,j1)
! rec from left
        call MPI_RECV(psi(:,j1-1),num_x,MPI_DOUBLE_PRECISION,myleft, &
                      100,ROW_COMM,status,mpi_err)
!        write(*,*)"rl",myid,psi(:,j1-1)
! rec from right
        call MPI_RECV(psi(:,j2+1),num_x,MPI_DOUBLE_PRECISION,myright, &
                      100,ROW_COMM,status,mpi_err)
!        write(*,*)"rr",myid,psi(:,j2+1)
! send to right
        call MPI_SEND(psi(:,j2),  num_x,MPI_DOUBLE_PRECISION,myright, &
                      100,ROW_COMM,mpi_err)
!        write(*,*)"sr",myid,psi(:,j2)
    else
! we are on an odd col processor
! rec from right
        call MPI_RECV(psi(:,j2+1),num_x,MPI_DOUBLE_PRECISION,myright, &
                      100,ROW_COMM,status,mpi_err)
!        write(*,*)"rr",myid,psi(:,j2+1)
! send to right
        call MPI_SEND(psi(:,j2),  num_x,MPI_DOUBLE_PRECISION,myright, &
                      100,ROW_COMM,mpi_err)
!        write(*,*)"sr",myid,psi(:,j2)
! send to left
        call MPI_SEND(psi(:,j1),  num_x,MPI_DOUBLE_PRECISION,myleft, &
                      100,ROW_COMM,mpi_err)
!        write(*,*)"sl",myid,psi(:,j1)
! rec from left
        call MPI_RECV(psi(:,j1-1),num_x,MPI_DOUBLE_PRECISION,myleft, &
                      100,ROW_COMM,status,mpi_err)
!        write(*,*)"rl",myid,psi(:,j1-1)
    endif
    if(even(myid_row))then
! send to top
        call MPI_SEND(psi(i1,:),  num_y,MPI_DOUBLE_PRECISION,mytop, &
                      10,COL_COMM,mpi_err)
! rec from top
        call MPI_RECV(psi(i1-1,:),num_y,MPI_DOUBLE_PRECISION,mytop, &
                      10,COL_COMM,status,mpi_err)
! rec from bot
        call MPI_RECV(psi(i2+1,:),num_y,MPI_DOUBLE_PRECISION,mybot, &
                      10,COL_COMM,status,mpi_err)
! send to bot
        call MPI_SEND(psi(i2,:),  num_y,MPI_DOUBLE_PRECISION,mybot, &
                      10,COL_COMM,mpi_err)
    else
! rec from bot
        call MPI_RECV(psi(i2+1,:),num_y,MPI_DOUBLE_PRECISION,mybot, &
                      10,COL_COMM,status,mpi_err)
! send to bot
        call MPI_SEND(psi(i2,:),  num_y,MPI_DOUBLE_PRECISION,mybot, &
                      10,COL_COMM,mpi_err)
! send to top
        call MPI_SEND(psi(i1,:),  num_y,MPI_DOUBLE_PRECISION,mytop, &
                      10,COL_COMM,mpi_err)
! rec from top
        call MPI_RECV(psi(i1-1,:),num_y,MPI_DOUBLE_PRECISION,mytop, &
                      10,COL_COMM,status,mpi_err)
    endif
end subroutine do_transfer
!*********************
function even(i)
! returns .true. if i is even
    integer i,j
    logical even
    j=i/2
    if(j*2 .eq. i)then
        even = .true.
    else
        even = .false.
    endif
    return
end function even
!*********************
function unique(name)
! appends the processor id, zero padded to three digits, to a name
    use numz
    use mpi
    character (len=*) name
    character (len=20) unique
    character (len=80) temp
    if(myid .gt. 99)then
        write(temp,"(a,i3)")trim(name),myid
    else
        if(myid .gt. 9)then
            write(temp,"(a,'0',i2)")trim(name),myid
        else
            write(temp,"(a,'00',i1)")trim(name),myid
        endif
    endif
    unique=temp
    return
end function unique
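! usage note (for reference only, not part of the program): if the
! write_grid call in the main program is uncommented, each task writes
! its block of psi, ghost cells included, to its own file named by
! unique(), e.g. task 7 writes "out_both007" and task 42 writes
! "out_both042".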