module general_mpi_module
   implicit none
   save
!#define COLLECT_TIMER
!#define DISTRIBUTE_TIMER
#ifdef USEMPI
   !
   ! NOTE: the matrices are considered to be bordered by a column
   !       left and right and a row up and down. So, in general,
   !       only b(2:mb-1,2:nb-1) is sent to the master process.
   !       HOWEVER: if matrix b is
   !         on the left:   b(:,1)  is also sent
   !         on the right:  b(:,nb) is also sent
   !         on the top:    b(1,:)  is also sent
   !         on the bottom: b(mb,:) is also sent
   !       The master process receives a(1:ma,1:na).
   !       dimension of b: (1:mb,1:nb)
   !       dimension of a: (1:ma,1:na)
   !
   interface matrix_distr
      module procedure matrix_distr_real8
      module procedure matrix_distr_integer
   end interface matrix_distr

   interface matrix_coll
      module procedure matrix_coll_real8
      module procedure matrix_coll_integer
   end interface matrix_coll

   interface shift_borders
      module procedure shift_borders_matrix_real8
   end interface shift_borders

   interface vector_distr
      module procedure vector_distr_real8
      module procedure block_vector_distr_real8
   end interface vector_distr

contains

   !_____________________________________________________________________________
   subroutine vector_distr_real8(a,b,is,lm,root,comm)
      !
      ! Distribute vector a on non-root processes
      ! Process P receives a(is(P):is(P)+lm(P)-1)
      !
      ! Parameters:
      ! real*8  a (in): vector to distribute, only used at root
      ! real*8  b (out): local vector
      ! integer is(number-of-processes) (in): see above: needed on all processes
      ! integer lm(number-of-processes) (in): see above: needed on all processes
      ! integer root (in): process that holds a, is, lm
      ! integer comm (in): MPI communicator
      !
      ! Complication: a and b are not necessarily contiguous, but when
      ! mpi_alltoallw is called with a and b, the compiler will generate
      ! contiguous arrays.
      !
      ! The root process must have rank .eq. 0
      !
      use mpi
      implicit none
      real*8,  dimension(:), intent(in)  :: a
      real*8,  dimension(:), intent(out) :: b
      integer, dimension(:), intent(in)  :: is
      integer, dimension(:), intent(in)  :: lm
      integer,               intent(in)  :: root
      integer,               intent(in)  :: comm

      integer :: ier,ra,sz,basic_type,i
      integer, allocatable, dimension(:) :: recvtypes,sendtypes,recvcounts,sendcounts,sdispls,rdispls
      integer, dimension(1) :: sizes,subsizes,starts

      basic_type = MPI_DOUBLE_PRECISION

      call MPI_Comm_size(comm, sz, ier)
      call MPI_Comm_rank(comm, ra, ier)

      if (root /= 0) then
         print *,'Error in vector_distr_real8: root must be 0, but is:',root
         call MPI_Abort(MPI_COMM_WORLD,1,ier)
      endif

      allocate(recvtypes(sz))
      allocate(sendtypes(sz))
      allocate(recvcounts(sz))
      allocate(sendcounts(sz))
      allocate(sdispls(sz))
      allocate(rdispls(sz))

      sdispls    = 0
      rdispls    = 0
      recvtypes  = MPI_CHARACTER
      sendtypes  = MPI_CHARACTER
      recvcounts = 0
      sendcounts = 0
      !
      ! Create MPI types
      !
      ! MPI_TYPE_CREATE_SUBARRAY(NDIMS, ARRAY_OF_SIZES, ARRAY_OF_SUBSIZES,
      !    ARRAY_OF_STARTS, ORDER, OLDTYPE, NEWTYPE, IERROR)
      !    INTEGER NDIMS, ARRAY_OF_SIZES(*), ARRAY_OF_SUBSIZES(*),
      !    ARRAY_OF_STARTS(*), ORDER, OLDTYPE, NEWTYPE, IERROR
      !
      ! Determine MPI types for the receive matrices:
      ! all processes will receive only from root
      !
      sizes    = shape(b)
      subsizes = lm(ra+1)
      starts   = 0
      call MPI_Type_create_subarray(1,sizes,subsizes,starts, &
           MPI_ORDER_FORTRAN, basic_type, recvtypes(1), ier)
      call MPI_Type_commit(recvtypes(1),ier)
      recvcounts(1) = 1

      ! Determine MPI types for the senders
      if (ra == root) then
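         !
         ! Only root reaches this branch: it builds one 1-d subarray type per
         ! destination rank i, describing the slice a(is(i):is(i)+lm(i)-1).
         ! Non-root ranks leave sendcounts at 0, so in the MPI_Alltoallw below
         ! they send nothing and only receive their own slice from root.
         !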
         ! root sends to everyone
         do i=1,sz
            sizes      = shape(a)
            subsizes   = lm(i)
            starts     = is(i) - 1
            sendcounts = 1
            call MPI_Type_create_subarray(1,sizes,subsizes,starts, &
                 MPI_ORDER_FORTRAN, basic_type, sendtypes(i), ier)
            call MPI_Type_commit(sendtypes(i),ier)
         enddo
      endif

      call MPI_Alltoallw(a,sendcounts,sdispls,sendtypes, &
           b,recvcounts,rdispls,recvtypes,comm,ier)

      do i=1,sz
         if (sendtypes(i) /= MPI_CHARACTER) then
            call MPI_Type_free(sendtypes(i),ier)
         endif
         if (recvtypes(i) /= MPI_CHARACTER) then
            call MPI_Type_free(recvtypes(i),ier)
         endif
      enddo

   end subroutine vector_distr_real8

   !_____________________________________________________________________________
   subroutine block_vector_distr_real8(a,b,is,lm,root,comm)
      !
      ! Distribute matrix a on non-root processes
      ! Process P receives a(is(P):is(P)+lm(P)-1,:)
      !
      ! Parameters:
      ! real*8  a(:,:) (in): matrix to distribute, only used at root
      ! real*8  b(:,:) (out): local matrix
      ! integer is(number-of-processes) (in): see above: needed on all processes
      ! integer lm(number-of-processes) (in): see above: needed on all processes
      ! integer root (in): process that holds a, is, lm
      ! integer comm (in): MPI communicator
      !
      ! Complication: a and b are not necessarily contiguous, but when
      ! mpi_alltoallw is called with a and b, the compiler will generate
      ! contiguous arrays.
      !
      ! The root process must have rank .eq. 0
      !
      use mpi
      implicit none
      real*8,  dimension(:,:), intent(in)  :: a
      real*8,  dimension(:,:), intent(out) :: b
      integer, dimension(:),   intent(in)  :: is
      integer, dimension(:),   intent(in)  :: lm
      integer,                 intent(in)  :: root
      integer,                 intent(in)  :: comm

      integer :: ier,ra,sz,basic_type,i
      integer, allocatable, dimension(:) :: recvtypes,sendtypes,recvcounts,sendcounts,sdispls,rdispls
      integer, dimension(2) :: sizes,subsizes,starts

      basic_type = MPI_DOUBLE_PRECISION

      call MPI_Comm_size(comm, sz, ier)
      call MPI_Comm_rank(comm, ra, ier)

      if (root /= 0) then
         print *,'Error in block_vector_distr_real8: root must be 0, but is:',root
         call MPI_Abort(MPI_COMM_WORLD,1,ier)
      endif

      allocate(recvtypes(sz))
      allocate(sendtypes(sz))
      allocate(recvcounts(sz))
      allocate(sendcounts(sz))
      allocate(sdispls(sz))
      allocate(rdispls(sz))

      sdispls    = 0
      rdispls    = 0
      recvtypes  = MPI_CHARACTER
      sendtypes  = MPI_CHARACTER
      recvcounts = 0
      sendcounts = 0
      !
      ! Create MPI types
      !
      ! MPI_TYPE_CREATE_SUBARRAY(NDIMS, ARRAY_OF_SIZES, ARRAY_OF_SUBSIZES,
      !    ARRAY_OF_STARTS, ORDER, OLDTYPE, NEWTYPE, IERROR)
      !    INTEGER NDIMS, ARRAY_OF_SIZES(*), ARRAY_OF_SUBSIZES(*),
      !    ARRAY_OF_STARTS(*), ORDER, OLDTYPE, NEWTYPE, IERROR
      !
      ! Determine MPI types for the receive matrices:
      ! all processes will receive only from root
      !
      sizes    = shape(b)
      subsizes = (/ lm(ra+1), size(b,2) /)
      starts   = 0
      call MPI_Type_create_subarray(2,sizes,subsizes,starts, &
           MPI_ORDER_FORTRAN, basic_type, recvtypes(1), ier)
      call MPI_Type_commit(recvtypes(1),ier)
      recvcounts(1) = 1

      ! Determine MPI types for the senders
      if (ra == root) then
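         !
         ! Only root reaches this branch: it builds one 2-d subarray type per
         ! destination rank i, describing the row block a(is(i):is(i)+lm(i)-1,:)
         ! (all columns). Non-root ranks leave sendcounts at 0, so they only
         ! receive their own block in the MPI_Alltoallw below.
         !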
         ! root sends to everyone
         do i=1,sz
            sizes      = shape(a)
            subsizes   = (/ lm(i), size(a,2) /)
            starts     = (/ is(i) - 1, 0 /)
            sendcounts = 1
            call MPI_Type_create_subarray(2,sizes,subsizes,starts, &
                 MPI_ORDER_FORTRAN, basic_type, sendtypes(i), ier)
            call MPI_Type_commit(sendtypes(i),ier)
         enddo
      endif

      call MPI_Alltoallw(a,sendcounts,sdispls,sendtypes, &
           b,recvcounts,rdispls,recvtypes,comm,ier)

      do i=1,sz
         if (sendtypes(i) /= MPI_CHARACTER) then
            call MPI_Type_free(sendtypes(i),ier)
         endif
         if (recvtypes(i) /= MPI_CHARACTER) then
            call MPI_Type_free(recvtypes(i),ier)
         endif
      enddo

   end subroutine block_vector_distr_real8

   !___________________________________________________________________________
   subroutine matrix_distr_real8(a,b,is,lm,js,ln, &
        root,comm)
      !
      ! This subroutine distributes the matrix on root to the non-root
      ! processes.
      ! Root will not send to itself.
      ! Root MUST be process #0 in comm.
      !
      ! a:     matrix to be distributed
      ! b:     receive matrix; on process p, b(1:lm(p),1:ln(p)) will be
      !        filled (p=1..size of comm). Normally: the whole of b
      ! is,js: on process p, b(1,1) coincides with a(is(p),js(p)),
      !        p=1..size of comm
      ! root:  root process, must be zero
      ! comm:  communicator
      !
      ! Note: the root process does not send or copy anything to itself,
      !       'there is no b on root'
      !
      use mpi
      implicit none
      real*8,  intent(in),  dimension(:,:) :: a
      real*8,  intent(out), dimension(:,:) :: b
      integer, intent(in),  dimension(:)   :: is, lm
      integer, intent(in),  dimension(:)   :: js, ln
      integer, intent(in)                  :: root, comm

      integer :: sz,ier,ra,i
      integer, allocatable :: recvtypes(:), sendtypes(:), recvcounts(:), sendcounts(:)
      integer, allocatable :: sdispls(:), rdispls(:)
      integer sizes(2),subsizes(2),starts(2)
      integer basic_type

      basic_type = MPI_DOUBLE_PRECISION

      include 'genmpi_distr.inc'

   end subroutine matrix_distr_real8

   !___________________________________________________________________________
   subroutine matrix_distr_integer(a,b,is,lm,js,ln, &
        root,comm)
      !
      ! This subroutine distributes the matrix on root to the non-root
      ! processes.
      ! Root will not send to itself.
      ! Root MUST be process #0 in comm.
      !
      ! a:     matrix to be distributed
      ! b:     receive matrix; on process p, b(1:lm(p),1:ln(p)) will be
      !        filled (p=1..size of comm). Normally: the whole of b
      ! is,js: on process p, b(1,1) coincides with a(is(p),js(p)),
      !        p=1..size of comm
      ! root:  root process, must be zero
      ! comm:  communicator
      !
      ! Note: the root process does not send or copy anything to itself,
      !       'there is no b on root'
      !
      use mpi
      implicit none
      integer, intent(in),  dimension(:,:) :: a
      integer, intent(out), dimension(:,:) :: b
      integer, intent(in),  dimension(:)   :: is, lm
      integer, intent(in),  dimension(:)   :: js, ln
      integer, intent(in)                  :: root, comm

      integer :: sz,ier,ra,i
      integer, allocatable :: recvtypes(:), sendtypes(:), recvcounts(:), sendcounts(:)
      integer, allocatable :: sdispls(:), rdispls(:)
      integer sizes(2),subsizes(2),starts(2)
      integer basic_type

      basic_type = MPI_INTEGER

      include 'genmpi_distr.inc'

   end subroutine matrix_distr_integer

   !___________________________________________________________________________
   subroutine matrix_coll_real8(a,b,is,lm,js,ln, &
        isleft,isright,istop,isbot,root,comm)
      !
      ! Collect matrix on the root process.
      ! NOTE: the collect is done from the non-root processes only.
      !
      ! Parameters:
      ! real*8  a(:,:) (out): matrix to be collected.
      !    This matrix has to be available; it is not allocated by this
      !    subroutine.
      ! real*8  b(:,:) (in): local scattered matrix
      ! integer is,lm,js,ln(number of non-root processes) (in): description of
      !    the location of the submatrices b in matrix a:
      !    Element b(1,1) coincides with a(is(rank+1),js(rank+1)), where rank is
      !    the rank of the process (0..procs-1).
      !    The length of the 1st dimension of b is given by lm(rank+1).
      !    The length of the 2nd dimension of b is given by ln(rank+1).
      ! logical isleft, isright, istop, isbot (in): is this submatrix at the
      !    left, right, top or bottom of the general matrix?
      ! integer root (in): the rank of the process where a is available
      ! integer comm (in): the MPI communicator to be used
      !
      ! Note: a, is, js are used at the root process only
      !
      ! Note: the root process must have rank .eq. 0
      !
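      !
      ! A minimal usage sketch (hypothetical variable names), assuming the
      ! decomposition arrays came from det_submatrices and each process owns
      ! a local block b of size lm(rank+1) x ln(rank+1):
      !
      !    call det_submatrices(ma,na,mp,np,is,lm,js,ln,isleft,isright,istop,isbot)
      !    ! ... distribute work, compute on the local block b ...
      !    call matrix_coll(a,b,is,lm,js,ln,isleft,isright,istop,isbot,0,comm)
      !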
      use mpi
      implicit none
      real*8,  intent(out), dimension(:,:) :: a
      real*8,  intent(in),  dimension(:,:) :: b
      integer, intent(in),  dimension(:)   :: is, lm
      integer, intent(in),  dimension(:)   :: js, ln
      logical, intent(in),  dimension(:)   :: isleft, isright, istop, isbot
      integer, intent(in)                  :: root, comm

      integer sz,ier,ra,i
      integer, allocatable :: recvtypes(:), sendtypes(:), recvcounts(:), sendcounts(:)
      integer, allocatable :: sdispls(:), rdispls(:)
      integer sizes(2),subsizes(2),starts(2)
      integer basic_type

      ! nover is the number of overlapping rows/columns of the submatrices,
      ! nbord is the number of borders.
      integer, parameter :: nbord = 2
      integer, parameter :: nover = 2*nbord

      basic_type = MPI_DOUBLE_PRECISION

      include 'genmpi_coll.inc'

   end subroutine matrix_coll_real8

   !___________________________________________________________________________
   subroutine matrix_coll_integer(a,b,is,lm,js,ln, &
        isleft,isright,istop,isbot,root,comm)
      !
      ! For a description, see matrix_coll_real8
      !
      use mpi
      implicit none
      integer, intent(out), dimension(:,:) :: a
      integer, intent(in),  dimension(:,:) :: b
      integer, intent(in),  dimension(:)   :: is, lm
      integer, intent(in),  dimension(:)   :: js, ln
      logical, intent(in),  dimension(:)   :: isleft, isright, istop, isbot
      integer, intent(in)                  :: root, comm

      integer sz,ier,ra,i
      integer, allocatable :: recvtypes(:), sendtypes(:), recvcounts(:), sendcounts(:)
      integer, allocatable :: sdispls(:), rdispls(:)
      integer sizes(2),subsizes(2),starts(2)
      integer basic_type

      ! nover is the number of overlapping rows/columns of the submatrices,
      ! nbord is the number of borders.
      integer, parameter :: nbord = 2
      integer, parameter :: nover = 2*nbord

      basic_type = MPI_INTEGER

      include 'genmpi_coll.inc'

   end subroutine matrix_coll_integer

   !___________________________________________________________________________
   subroutine decomp(n, numprocs, myid, s, e)
      ! From the mpich distribution: MPE_Decomp1d
      !
      ! n        : integer, input (for example: number of tasks or
      !            dimension of an array)
      ! numprocs : integer, input (number of processes)
      ! myid     : integer, input (MPI id of this process)
      ! s        : integer, output
      ! e        : integer, output
      !
      ! The purpose of this subroutine is to get an as equal as possible
      ! division of the integers 1..n among numprocs intervals.
      ! A common use is to divide work among MPI processes.
      !
      ! Example:
      !
      !   integer numprocs, ier, myid, n, s, e, i
      !   call MPI_Comm_size(MPI_COMM_WORLD, numprocs, ier)
      !   call MPI_Comm_rank(MPI_COMM_WORLD, myid, ier)
      !   n = 1000
      !   call decomp(n, numprocs, myid, s, e)
      !   do i = s,e
      !      ...
      !   enddo
      !
      integer, intent(in)  :: n, numprocs, myid
      integer, intent(out) :: s, e

      integer nlocal
      integer deficit

      nlocal  = n / numprocs
      s       = myid * nlocal + 1
      deficit = mod(n,numprocs)
      s       = s + min(myid,deficit)
      if (myid .lt. deficit) then
         nlocal = nlocal + 1
      endif
      e = s + nlocal - 1
      if (e .gt. n .or. myid .eq. numprocs-1) then
         e = n
      endif
   end subroutine decomp
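   ! Worked example: decomp(10, 3, myid, s, e) yields
   !    myid = 0: s = 1, e = 4
   !    myid = 1: s = 5, e = 7
   !    myid = 2: s = 8, e = 10
   ! The remainder is absorbed by the lowest ranks, so the pieces differ
   ! in length by at most one.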
   subroutine det_submatrices(ma,na,mp,np,is,lm,js,ln,isleft,isright,istop,isbot)
      !
      ! Determine a division of a ma*na matrix on mp*np processes.
      !
      ! ma: integer(in): 1st dimension of the matrix to divide
      ! na: integer(in): 2nd dimension of this matrix
      ! mp: integer(in): 1st dimension of the processor grid
      ! np: integer(in): 2nd dimension of the processor grid
      ! is(mp*np): integer(out): is(i) will be equal to the
      !    start row of the i-th submatrix
      ! lm(mp*np): integer(out): lm(i) will be equal to the
      !    first dimension of the i-th submatrix
      ! js(mp*np): integer(out): js(i) will be equal to the
      !    start column of the i-th submatrix
      ! ln(mp*np): integer(out): ln(i) will be equal to the
      !    2nd dimension of the i-th submatrix
      ! isleft(mp*np): logical(out): isleft(i) is true if the i-th submatrix
      !    shares the first column with the global matrix: a(:,1)
      ! isright(mp*np): logical(out): isright(i) is true if the i-th submatrix
      !    shares the last column with the global matrix: a(:,na)
      ! istop(mp*np): logical(out): istop(i) is true if the i-th submatrix
      !    shares the first row with the global matrix: a(1,:)
      ! isbot(mp*np): logical(out): isbot(i) is true if the i-th submatrix
      !    shares the last row with the global matrix: a(ma,:)
      !
      ! The submatrices overlap:
      !
      ! 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5
      ! 1 2 3 4 5 6 7 8 9           1 2 3 4 5 6 7 8 9 0 1
      !           1 2 3 4 5 6 7 8 9 0 1 2 3
      !
      ! + + + + + x x x x + + + + + x x x x + + + + + + +
      ! + + + + + x x x x + + + + + x x x x + + + + + + +
      ! + + + + + x x x x + + + + + x x x x + + + + + + +
      ! + + + + + x x x x + + + + + x x x x + + + + + + +
      ! + + + + + x x x x + + + + + x x x x + + + + + + +
      ! + + + + + x x x x + + + + + x x x x + + + + + + +
      ! + + + + + x x x x + + + + + x x x x + + + + + + +
      !
      ! This picture shows the overlap in the horizontal direction;
      ! the overlap is 4.
      !   + -> matrix element
      !   x -> overlapping matrix element
      ! The second ruler numbers the 1st and 3rd submatrices, the third
      ! ruler the 2nd submatrix.
      ! The orders of the submatrices are 9, 13 and 11.
      ! The order of the global matrix is 25.
      !
      implicit none
      integer, intent(in)                :: ma,na,mp,np
      integer, dimension(:), intent(out) :: is,lm,js,ln
      logical, dimension(:), intent(out) :: isleft,isright,istop,isbot

      integer i,j,s,e,k
      integer, parameter :: nover = 4

      isleft  = .false.
      isright = .false.
      istop   = .false.
      isbot   = .false.

      do i = 1, mp
         call decomp(ma-nover, mp, i - 1, s, e)
         k = 0
         do j = 1, np
            if (j .eq. 1) then
               isleft(i+k) = .true.
            endif
            if (j .eq. np) then
               isright(i+k) = .true.
            endif
            is(i + k) = s
            lm(i + k) = e - s + nover + 1
            k = k + mp
         enddo
      enddo

      k = 0
      do j = 1, np
         call decomp(na-nover, np, j - 1, s, e)
         do i = 1, mp
            if (i .eq. 1) then
               istop(i+k) = .true.
            endif
            if (i .eq. mp) then
               isbot(i+k) = .true.
            endif
            js(i + k) = s
            ln(i + k) = e - s + nover + 1
         enddo
         k = k + mp
      enddo

   end subroutine det_submatrices

   subroutine shift_borders_matrix_real8(a,left,right,top,bot,comm)
      !
      ! Shift borders to and from neighbours.
      !
      use mpi
      implicit none
      real*8, dimension(:,:), intent(inout) :: a
      integer,                intent(in)    :: left,right,top,bot,comm

      integer, parameter :: datatype = MPI_DOUBLE_PRECISION
      integer ierror,ma,na

      ma = ubound(a,1)
      na = ubound(a,2)

      ! send to left, receive from right
      call MPI_Sendrecv(a(2:ma-1,2),    ma-2, datatype, left,  1, &
                        a(2:ma-1,na),   ma-2, datatype, right, 1, &
                        comm, MPI_STATUS_IGNORE, ierror)
      ! send to right, receive from left
      call MPI_Sendrecv(a(2:ma-1,na-1), ma-2, datatype, right, 2, &
                        a(2:ma-1,1),    ma-2, datatype, left,  2, &
                        comm, MPI_STATUS_IGNORE, ierror)
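      ! The two exchanges above fill the left and right halo columns
      ! (a(2:ma-1,1) and a(2:ma-1,na)) of the interior rows; the two
      ! exchanges below fill the top and bottom halo rows (a(1,:) and
      ! a(ma,:)) using complete rows of length na.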
      ! send to bot, receive from top
      call MPI_Sendrecv(a(ma-1,:), na, datatype, bot, 3, &
                        a(1,:),    na, datatype, top, 3, &
                        comm, MPI_STATUS_IGNORE, ierror)
      ! send to top, receive from bot
      call MPI_Sendrecv(a(2,:),    na, datatype, top, 4, &
                        a(ma,:),   na, datatype, bot, 4, &
                        comm, MPI_STATUS_IGNORE, ierror)

   end subroutine shift_borders_matrix_real8
#endif
end module general_mpi_module
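! Typical call sequence (a sketch, not part of the module):
!    call det_submatrices(ma,na,mp,np,is,lm,js,ln,isleft,isright,istop,isbot)
!    call matrix_distr(a,b,is,lm,js,ln,0,comm)      ! scatter global a into local b
!    call shift_borders(b,left,right,top,bot,comm)  ! refresh halo rows/columns
!    call matrix_coll(a,b,is,lm,js,ln,isleft,isright,istop,isbot,0,comm)  ! gather back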