!********************************************************************************
!* This program solves the Laplace equation in a square domain by Jacobi
!* iteration with MPI. An example MPI program for CFD beginners.
!*
!* written by Dr. Katate Masatsuka (info[at]cfdbooks.com),
!*
!* the author of useful CFD books, "I do like CFD" (http://www.cfdbooks.com).
!*
!*
!* This is Version 1 (2015).
!*
!* This F90 program is written and made available for an educational purpose.
!* This program is neither optimal nor elegant in any sense. You would want to
!* write one by yourself once you understand what is going on in this program.
!*
!* Please feel free to contact the author with suggestions.
!* You can also ask questions, but please don't expect good answers because
!* the author is not an expert in MPI programming.
!*
!* This file may be updated in the future.
!*
!* Note: I cannot advise you on how to compile and run this program.
!*       But here is what I do to run this program on the NIA cluster:
!*
!*        prompt% module load openmpi/gcc/64/1.8.1
!*        prompt% mpif90 jacobi_mpi_v1.f90
!*        prompt% salloc --ntasks=4 mpirun ./a.out   <- To run with 4 processors
!*
!*       (Please consult the manual for the cluster/computer you wish to run it on.)
!*
!* Katate Masatsuka, March 2015. http://www.cfdbooks.com
!********************************************************************************

!********************************************************************************
!
! Input : nx_global = number of nodes in x-coordinate direction
!         ny_global = number of nodes in y-coordinate direction
!         (to be specified in the program)
!
! Output: 1. Tecplot files for the grid and the solution:
!
!            tecplot_0.dat
!            tecplot_1.dat
!            tecplot_2.dat
!              :
!            tecplot_(npes-1).dat, where npes is the number of processors
!
!         2. Output files
!
!            fort.1000
!            fort.1001
!            fort.1002
!              :
!            fort.(1000+npes-1)
!
!
!----------------------------
! Laplace's equation for temperature (the steady heat equation):
!
!  (temperature)_xx + (temperature)_yy = 0,  in (x,y) = (0,1)x(0,1).
!
!
!----------------------------
! Exact solution:
!
!  temperature = ( sinh(pi*x)*sin(pi*y) + sinh(pi*y)*sin(pi*x) )/sinh(pi)
!
!
!----------------------------
! Domain:
!
!  y=1 _____________________________
!     |                             |
!     |                             |
!     |                             |
!     |                             |
!     |                             |
!     |                             |
!     |                             |
!     |                             |
!     |                             |
!     |                             |
!     |                             |
!  y=0|_____________________________|
!     x=0                           x=1
!
!
!----------------------------
! Jacobi iteration:
!
!  Jacobi iteration is equivalent to explicit time-stepping for dT/dt = Laplace(T):
!
!  (Tnew-Told)/(time_step) = Laplace(Told)
!   -> Tnew = Told + (time_step)*(Laplace(Told))
!   -> Tnew = Told + (time_step)*(Finite-Difference(Told))
!   -> Tnew(i,j) = Told(i,j) + (time_step)*( Told(i-1,j)+Told(i+1,j)+Told(i,j-1)+Told(i,j+1)-4*Told(i,j) )/h^2
!
!   With (time_step) = h^2/4, this reduces to
!
!   -> Tnew(i,j) = ( Told(i-1,j)+Told(i+1,j)+Told(i,j-1)+Told(i,j+1) )/4
!
!  So, it is just a simple arithmetic average of the neighbor values.
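!
!  For reference, a single sweep of this update on the full (serial) grid could
!  be written as in the sketch below (variable names are illustrative only; the
!  actual parallel loops used in this program appear further down in the code):
!
!   do j = 2, ny_global-1
!    do i = 2, nx_global-1
!     temperature(i,j) = 0.25*( temperature_old(i+1,j) + temperature_old(i-1,j) &
!                             + temperature_old(i,j+1) + temperature_old(i,j-1) )
!    end do
!   end do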
!
!
!----------------------------
! To perform the Jacobi iteration in parallel with MPI, the domain is
! decomposed into 'npes' sub-domains, also called partitions or processor
! elements (pe). Whatever it is called, it is just a piece of the domain.
! In this program, we call it a partition or partition element (pe).
!
! Here is an example of npes=4: mype is the ID of each pe.
!       _____________________________
!      |                             |
!      |           mype=3            | <-- Top partition, mype=npes-1
!      |_____________________________|
!      |                             |
!      |           mype=2            |
!      |_____________________________|
!      |                             |
!      |           mype=1            |
!      |_____________________________|
!      |                             |
!      |           mype=0            | <-- Bottom partition, mype=0
!      |_____________________________|
!
!
! Each processor executes the same program on the corresponding sub-domain.
! (Note: This is the basic idea of MPI. We write one program, and run it in parallel
!  on each decomposed sub-domain.)
!
! Each partition has ghost (or fringe, or off-processor) nodes as below:
!       _____________________________
!      |______top ghost nodes________| (j=ny+1)
!      |                             |
!      |          interior           |
!      |_____________________________|
!      |______bottom ghost nodes_____| (j=0)
!
!
! Note, however, that the top partition has no ghost nodes at the top:
!       _____________________________
!      |______top boundary nodes_____| (j=ny+1, y=1.0)
!      |                             |
!      |        mype = npes-1        | <-- Top partition, mype=npes-1
!      |_____________________________|
!      |______bottom ghost nodes_____| (j=0)
!
! and the bottom partition has unused ghost nodes:
!       _____________________________
!      |_______top ghost nodes_______| (j=ny+1)
!      |                             |
!      |          interior           | <-- Bottom partition, mype=0
!      |_____________________________|
!      |___bottom boundary nodes_____| (j=1, y=0.0)
!      |______bottom ghost nodes_____| (j=0)  <-- We don't use these nodes.
!
!
!********************************************************************************
!********************************************************************************
!********************************************************************************
 program jacobi_iteration_mpi

 implicit none
 include 'mpif.h'

 integer, parameter :: p2 = selected_real_kind(15) !Double precision

!Global domain
 integer , parameter :: nx_global = 512
 integer , parameter :: ny_global = 512
 real(p2), parameter :: max_temp_error = 1.0e-06
 integer , parameter :: max_iterations = 500000
 real(p2)            :: dt_global = 100.0 !Maximum iterative error (initialized as 100)

!Local domain ( in the sub-domain or partition, or processor element (pe) )
 integer  :: nx, ny

 real(p2) :: start_time, stop_time
 integer  :: i, j, iteration
 real(p2) :: dt                  !Local maximum iterative error
 real(p2) :: h, xp, yp, yp_min_loc

 real(p2), allocatable, dimension(:,:) :: temperature
 real(p2), allocatable, dimension(:,:) :: temperature_old
 real(p2), allocatable, dimension(:,:) :: temperature_exact
 real(p2), allocatable, dimension(:,:) :: x, y

!MPI related variables
 integer :: mype
 integer :: npes
 integer :: ierr
 integer, dimension(4) :: req = MPI_REQUEST_NULL
 integer :: status(MPI_STATUS_SIZE)
 integer :: status_array(MPI_STATUS_SIZE,4)
 integer, parameter :: id_top_to_bottom = 100 ! some number
 integer, parameter :: id_bottom_to_top = 101 ! some number different from the above

!----------------------------------------------------------------------
! MPI start up

  call MPI_INIT(ierr)                       ! <- ierr=0 if successful

  call MPI_Comm_size(MPI_COMM_WORLD,      & ! <- Communicator (here, all PEs)
                     npes,                & ! <- Get the total number of PEs
                     ierr                 ) ! <- ierr=0 if successful

  call MPI_Comm_rank(MPI_COMM_WORLD,      & ! <- Communicator (here, all PEs)
                     mype,                & ! <- Get the ID of this PE
                     ierr                 ) ! <- ierr=0 if successful

 if( mype == 0 ) then
  write(*,*) ' npes = ', npes, ' mype = ', mype
  write(*,*)
  write(*,*) '   nx_global = ', nx_global
  write(*,*) '   ny_global = ', ny_global
  write(*,*) ' Total nodes = ', nx_global*ny_global
 endif

!----------------------------------------------------------------------
! Allocate arrays and decompose the domain.
! Each partition has nx+2 nodes in the x-direction and ny nodes in the y-direction.
! The ny nodes in y cover the sub-domain.
!
! Here's an example of npes=4:
!
!       _____________________________  j_global=ny_global
!      |                             |  j=ny
!      |           mype=3            |  :
!      |_____________________________|  j=1
!      |                             |  j=ny
!      |           mype=2            |  :
!      |_____________________________|  j=1
!      |                             |  j=ny
!      |           mype=1            |  :
!      |_____________________________|  j=1
!      |                             |  j=ny
!      |           mype=0            |  :
!      |_____________________________|  j=1
!                                        j_global=1
!     i=0                          i=nx+1
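
!----------------------------------------------------------------------
! Note: this simple decomposition assumes that ny_global is evenly divisible
!       by npes, so that every partition gets the same number of rows
!       (local j=1..ny on partition mype then corresponds to global rows
!        j_global = mype*ny + j). The guard below is only an illustrative
!       sketch of how that assumption could be checked; it is not essential
!       to the method.

  if ( mod(ny_global,npes) /= 0 ) then
   if (mype == 0) write(*,*) ' Error: ny_global must be divisible by npes. Stopping.'
   call MPI_Finalize(ierr)
   stop
  endif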

! Local size
   nx = nx_global
   ny = ny_global/npes

! Grid spacing. Uniform grid, so h = 1/nx_global = 1/ny_global.
   h = 1.0_p2/real(nx_global,p2)

! Allocate the local arrays
  allocate( temperature(      0:nx+1, 0:ny+1) )
  allocate( temperature_old(  0:nx+1, 0:ny+1) )
  allocate( temperature_exact(0:nx+1, 0:ny+1) )
  allocate( x(                0:nx+1, 0:ny+1) )
  allocate( y(                0:nx+1, 0:ny+1) )

! Grid data (including the ghost points at j = 0 and ny+1)
  do j = 0, ny+1

   ! y-coordinate
     yp_min_loc = real(mype )*(1.0/real(npes))
     yp = (yp_min_loc) + h*real(j-1)

   do i = 0, nx+1
    x(i,j) = real(i)/real(nx+1) ! x(0)=0.0 and x(nx+1)=1.0
    y(i,j) = yp
   end do

  end do

 if( mype == 0 ) then
  write(*,*)
  write(*,*) '          nx_local = ', nx
  write(*,*) '          ny_local = ', ny
  write(*,*) ' Total local nodes = ', nx*ny
 endif

!----------------------------------------------------------------------
! Set the exact, initial, and boundary values.

  call initialize(x,y,temperature_old, temperature_exact, npes, mype, nx,ny)

  temperature = temperature_old

!----------------------------------------------------------------------
! Record the starting time in 'start_time' for timing purposes.

  call cpu_time(start_time)

!**********************************************************************
!----- Beginning of Jacobi iteration

! Initialize the iteration counter
  iteration = 1

! Continue the iteration as long as dt_global = Max|Tnew-Told| is greater than
! the specified maximum tolerance ('max_temp_error') and the maximum iteration
! count is not reached.

 jacobi_iteration : do while ( dt_global > max_temp_error .and. iteration <= max_iterations)

 !-------------------------------------------------------------- Jacobi
 ! Jacobi iteration (== simple average): Update the temperature in the interior.
 ! Note: For mype=0, skip j=1, which are the bottom boundary nodes where the
 !       solution is fixed by the boundary condition. (The ghost nodes at j=0
 !       lie outside the domain and are never touched by this loop.)

   do j=1,ny
    if (mype==0 .and. j==1) cycle ! Skip the bottom boundary nodes (solution fixed)
    do i=1,nx
     temperature(i,j) = 0.25*( temperature_old(i+1,j  ) + temperature_old(i-1,j  ) + &
                               temperature_old(i  ,j+1) + temperature_old(i  ,j-1)   )
    enddo
   enddo

 !-------------------------------------------------------------- Jacobi
 ! Communication: Update the solution at ghost nodes.

 !--------------------------------------------------------------
 ! Send data
 !--------------------------------------------------------------
 !
 ! (1)Send top interior data to the bottom ghost data of the partition above.
 !    Note: For npes=4, this sends from mype=0,1,2 to mype=1,2,3, respectively.
 !                ___________________
 !               |___________________|
 !               |                   |
 !               |      mype+1       |
 !               |___________________|
 !               |___temp(1:nx,0)____| <--+  bottom ghost data
 !                                        |
 !                                        |
 !                ___________________     |
 !               |___________________|    |
 !               |  temp(1:nx,ny)    | ---+  top interior data
 !               |       mype        |
 !               |___________________|
 !               |___________________|

  if (mype < npes-1) then
   call MPI_Isend(temperature(1,ny), nx, MPI_DOUBLE_PRECISION, &
                  mype+1, id_top_to_bottom, MPI_COMM_WORLD, req(1), ierr)
  endif

 ! (2)Send bottom interior data to the top ghost data of the partition below.
 !    Note: For npes=4, this sends from mype=1,2,3 to mype=0,1,2, respectively.
 !                ___________________
 !               |___________________|
 !               |       mype        |
 !               |                   |
 !               |___temp(1:nx,1)____| ---+  bottom interior data
 !               |___________________|    |
 !                                        |
 !                ___________________     |
 !               |__temp(1:nx,ny+1)__| <--+  top ghost data
 !               |                   |
 !               |      mype-1       |
 !               |___________________|
 !               |___________________|

  if (mype /= 0) then
   call MPI_Isend(temperature(1,1), nx, MPI_DOUBLE_PRECISION, &
                  mype-1, id_bottom_to_top, MPI_COMM_WORLD, req(2), ierr)
  endif
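
 !--------------------------------------------------------------
 ! Note: The non-blocking exchange in this section (the two MPI_Isend calls
 !       above, the two MPI_Irecv calls below, and the MPI_Waitall that
 !       completes them) could alternatively be written with the blocking
 !       call MPI_Sendrecv, one call per neighbor. The lines below are only
 !       a sketch of that alternative and are not used by this program:
 !
 !  if (mype < npes-1) then
 !   call MPI_Sendrecv(temperature(1,ny),       nx, MPI_DOUBLE_PRECISION, mype+1, id_top_to_bottom, &
 !                     temperature_old(1,ny+1), nx, MPI_DOUBLE_PRECISION, mype+1, id_bottom_to_top, &
 !                     MPI_COMM_WORLD, status, ierr)
 !  endif
 !
 !  if (mype /= 0) then
 !   call MPI_Sendrecv(temperature(1,1),        nx, MPI_DOUBLE_PRECISION, mype-1, id_bottom_to_top, &
 !                     temperature_old(1,0),    nx, MPI_DOUBLE_PRECISION, mype-1, id_top_to_bottom, &
 !                     MPI_COMM_WORLD, status, ierr)
 !  endif
 !--------------------------------------------------------------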

 !--------------------------------------------------------------
 ! Receive data
 !--------------------------------------------------------------

 ! (1)Receive top interior data and store them in the bottom ghost data.
 !    Note: For npes=4, the receiving PEs are mype=1,2,3.
 !                ___________________
 !               |___________________|
 !               |                   |
 !               |       mype        |
 !               |___________________|
 !               |___temp(1:nx,0)____| <--+  bottom ghost data
 !                                        |
 !                                        |
 !                ___________________     |
 !               |___________________|    |
 !               |  temp(1:nx,ny)    | ---+  top interior data
 !               |      mype-1       |
 !               |___________________|
 !               |___________________|

  if (mype /= 0) then
   call MPI_Irecv(temperature_old(1,0), nx, MPI_DOUBLE_PRECISION, &
                  mype-1, id_top_to_bottom, MPI_COMM_WORLD, req(3), ierr)
  endif

 ! (2)Receive bottom interior data and store them in the top ghost data.
 !    Note: For npes=4, the receiving PEs are mype=0,1,2.
 !                ___________________
 !               |___________________|
 !               |      mype+1       |
 !               |                   |
 !               |___temp(1:nx,1)____| ---+  bottom interior data
 !               |___________________|    |
 !                                        |
 !                ___________________     |
 !               |__temp(1:nx,ny+1)__| <--+  top ghost data
 !               |                   |
 !               |       mype        |
 !               |___________________|
 !               |___________________|

  if (mype /= npes-1) then
   call MPI_Irecv(temperature_old(1,ny+1), nx, MPI_DOUBLE_PRECISION, &
                  mype+1, id_bottom_to_top, MPI_COMM_WORLD, req(4), ierr)
  endif

 ! Wait until the Isend and Irecv calls above are completed.

   call MPI_Waitall(4, req, status_array, ierr)

 !-------------------------------------------------------------- Jacobi
 ! Compute the maximum iterative error within this PE (local).

   dt = 0.0

   do j=1,ny
    if (mype==0 .and. j==1) cycle
    do i=1,nx
                       dt = max( abs(temperature(i,j) - temperature_old(i,j)), dt )
     temperature_old(i,j) = temperature(i,j)
    enddo
   enddo

 !----------------------------------------------------------------- Jacobi
 ! Compute the maximum of the iterative error among all PEs.

 ! (1)Compute the maximum and store it in the master processor (mype=0).

   call MPI_Reduce( dt ,                  & ! <- Variable to be collected
                    dt_global,            & ! <- Variable in which the result is stored
                    1,                    & ! <- Number of values = 1
                    MPI_DOUBLE_PRECISION, & ! <- Type of the value to be reduced
                    MPI_MAX,              & ! <- Type of operation = max
                    0,                    & ! <- Get the value and store it for mype=0
                    MPI_COMM_WORLD,       & ! <- Communicator (all PEs)
                    ierr                  ) ! <- ierr=0 if successful

 !Note: At this point, only mype=0 has the value of dt_global.
 !        ____________________
 !       |    dt_global=?     |  mype = 3
 !       |____________________|
 !       |    dt_global=?     |  mype = 2
 !       |____________________|            <-- Example, when npes=4
 !       |    dt_global=?     |  mype = 1
 !       |____________________|
 !       |    dt_global=xx    |  mype = 0
 !       |____________________|

 ! (2)Broadcast dt_global so that all processors will have the same dt_global value.

   call MPI_Bcast( dt_global,            & ! <- Variable to be broadcast
                   1,                    & ! <- Number of values = 1
                   MPI_DOUBLE_PRECISION, & ! <- Type of the value 'dt_global'
                   0,                    & ! <- Broadcast from mype=0
                   MPI_COMM_WORLD,       & ! <- Communicator (all PEs)
                   ierr                  ) ! <- ierr=0 if successful

 !Note: Now all processors have the same value of dt_global.
 !        ____________________
 !       |    dt_global=xx    |  mype = 3
 !       |____________________|
 !       |    dt_global=xx    |  mype = 2
 !       |____________________|            <-- Example, when npes=4
 !       |    dt_global=xx    |  mype = 1
 !       |____________________|
 !       |    dt_global=xx    |  mype = 0
 !       |____________________|
 !
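 !-----------------------------------------------------------------
 ! Note: Steps (1) and (2) above (MPI_Reduce followed by MPI_Bcast) could be
 !       combined into a single collective call, MPI_Allreduce, which performs
 !       the max-reduction and gives the result to every PE at once. The line
 !       below is only a sketch of that alternative and is not used here:
 !
 !   call MPI_Allreduce( dt, dt_global, 1, MPI_DOUBLE_PRECISION, &
 !                       MPI_MAX, MPI_COMM_WORLD, ierr )
 !-----------------------------------------------------------------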
 !---------------------------------------------------------------------- Jacobi
 ! Display the iteration number and the maximum iterative error,
 ! only from the master PE, and only at every 1000 iterations.

  if ( mype == 0 .and. mod(iteration,1000)==0 ) then
   write(*,*) ' itr = ', iteration, '   Max|Tnew - Told| = ', dt_global
  endif

 !---------------------------------------------------------------------- Jacobi
 ! Update the iteration counter

   iteration = iteration + 1

 enddo jacobi_iteration

!----- End of Jacobi iteration
!**********************************************************************

!----------------------------------------------------------------------
! Wait until all processors come to this point.
! This is for accurate timing and clean output.

  call MPI_Barrier(MPI_COMM_WORLD, & ! <- Communicator (all PEs)
                   ierr            ) ! <- ierr=0 if successful

!----------------------------------------------------------------------
! Store the current time in 'stop_time'.

  call cpu_time(stop_time)

!----------------------------------------------------------------------
! Print the number of iterations taken, the iterative error,
! and the actual CPU time taken.

 if( mype == 0 ) then
  print*, ' Max|Tnew - Told| at iteration ', iteration-1, ' = ', dt_global
  print*, ' Total time = ', stop_time-start_time, ' seconds.', ' nnodes=', nx*ny
 endif

! Below will be printed in the file 'fort.xxx' where xxx = mype+1000.

  write(mype+1000,*) ' Total time = ', stop_time-start_time, ' seconds.', ' nnodes=', nx*ny

!----------------------------------------------------------------------
! Write a Tecplot file for plotting the grid and the solution.

  temperature = temperature_old
  call write_tecplot_file(x,y,temperature,temperature_exact,nx,ny,mype,npes)

!----------------------------------------------------------------------
! MPI final call

 call MPI_Finalize(ierr) ! ierr=0 if successful

!********************************************************************************

 contains

! Below, subroutines used in the above main program are listed.

!********************************************************************************
! Store the exact solution in temperature_exact(:,:).
! Set the exact solution at boundary nodes in temperature_old(:,:),
! which is the boundary condition. At the interior nodes, the solution,
! temperature_old(:,:), is set to zero as an initial condition for the iteration.
!
! ------------------------------------------------------------------------------
!  Input:         x(:,:), y(:,:) = (x,y)-coordinate arrays of size (0:nx+1, 0:ny+1)
!                           npes = number of partition elements (processors)
!                           mype = processor ID
!
! Output: temperature_exact(:,:) = exact solution
!           temperature_old(:,:) = initial and boundary values for the numerical solution
! ------------------------------------------------------------------------------
!
!********************************************************************************
 subroutine initialize(x,y,temperature_old, temperature_exact, npes, mype, nx,ny)

 implicit none

 integer , intent( in)                           :: nx, ny, npes, mype
 real(p2), intent( in), dimension(0:nx+1,0:ny+1) :: x, y
 real(p2), intent(out), dimension(0:nx+1,0:ny+1) :: temperature_old
 real(p2), intent(out), dimension(0:nx+1,0:ny+1) :: temperature_exact

 integer  :: i, j
 real(p2) :: xp, yp
 real(p2), parameter :: pi = 3.141592653589793238_p2

!---------------------------------------------------------------------------
! First, store the exact solution everywhere.

  do j = 0, ny+1
   do i = 0, nx+1

    xp = x(i,j)
    yp = y(i,j)
    temperature_exact(i,j) = ( sinh(pi*xp)*sin(pi*yp) + sinh(pi*yp)*sin(pi*xp) )/sinh(pi)
      temperature_old(i,j) = temperature_exact(i,j)

   end do
  end do

!---------------------------------------------------------------------------
! Well, let's start the iteration with zero values in the interior instead
! of the exact solution.
! So, overwrite the solution in the interior!

  do j = 0, ny+1

   ! Skip the bottom domain-boundary (BC) nodes and the unused ghost nodes.
     if (mype==0      .and. j  < 2   ) cycle

   ! Skip the top domain-boundary (BC) nodes.
     if (mype==npes-1 .and. j == ny+1) cycle

   do i = 1, nx ! <-- i=1..nx because we set the zero solution at interior nodes only.
    temperature_old(i,j) = 0.0
   end do

  end do

! So, now we have the exact solution only on the boundary nodes.

 end subroutine initialize
!********************************************************************************

!********************************************************************************
! This subroutine writes a Tecplot data file for plotting the grid
! and the solution, with the filename: tecplot_mype.dat
!
! For example, if npes=4, then four files will be generated:
!
!  tecplot_0.dat
!  tecplot_1.dat
!  tecplot_2.dat
!  tecplot_3.dat
!
! Plot all these files together, and you'll see the entire domain.
!
! ------------------------------------------------------------------------------
!  Input:         x(:,:), y(:,:) = (x,y)-coordinate arrays of size (0:nx+1, 0:ny+1)
!               temperature(:,:) = numerical solution
!         temperature_exact(:,:) = exact solution
!                           npes = number of partition elements (processors)
!                           mype = processor ID
!
! Output: tecplot_mype.dat = Tecplot file for plotting (mype = processor ID)
! ------------------------------------------------------------------------------
!
!********************************************************************************
 subroutine write_tecplot_file(x,y,temperature,temperature_exact,nx,ny,mype,npes)

 implicit none

 integer , intent(in)                           :: nx, ny, mype, npes
 real(p2), intent(in), dimension(0:nx+1,0:ny+1) :: x, y
 real(p2), intent(in), dimension(0:nx+1,0:ny+1) :: temperature, temperature_exact

 integer       :: i, j
 character(80) :: char_temp, filename

!---------------------------------------------------------------------------
! Store the value of 'mype' as a character string in the variable 'char_temp'.

  write(char_temp,'(i5)') mype

!---------------------------------------------------------------------------
! Define the file name

  filename = "tecplot_" // trim(adjustl(char_temp)) // '.dat'

!---------------------------------------------------------------------------
! Open the file and start writing the data!

  open(unit=1, file=filename, status="unknown")

!---------------------------------------------------------------------------
! Header

  write(1,*) 'title =', '"Partition_'// trim(adjustl(char_temp)), '"'
  write(1,*) 'variables = "x", "y", "Temperature", "Temperature exact", "T-Texact"'

 if (mype==0) then
  write(1,*) 'zone T=','Partition_'// trim(adjustl(char_temp)),' i=', nx+2, 'j=', ny+1, 'f=point'
 else
  write(1,*) 'zone T=','Partition_'// trim(adjustl(char_temp)),' i=', nx+2, 'j=', ny+2, 'f=point'
 endif

!---------------------------------------------------------------------------
! Note: Exclude the bottom ghost nodes in mype=0, which lie outside the domain,
!       are not used in the iteration, and are completely irrelevant to the solution.
!       (That is why the zone above has ny+1 points in j for mype=0, and ny+2 otherwise.)

! Write grid data (including the ghost points at j = 0 and ny+1)

  do j = 0, ny+1

   if (mype==0 .and. j==0) cycle

   do i = 0, nx+1
    write(1,*) x(i,j), y(i,j), temperature(i,j), temperature_exact(i,j), &
               temperature(i,j) - temperature_exact(i,j) ! <- error
   end do

  end do

!---------------------------------------------------------------------------
! Close the file

  close(1)

 end subroutine write_tecplot_file
!********************************************************************************

 end program jacobi_iteration_mpi
!********************************************************************************
!********************************************************************************