module general_mpi_module
! to tell the compiler that unused dummy parameters are ok,
! use for example in subroutine sub(x,i) (x is array):
!   FAKE(kind(x)+i)
! use kind() for everything that can't be added to a complex*16
#define FAKE(x) if(.false.) general_mpi_dummy=x
! to tell the compiler that intent(out) arguments are set, use
! for example FAKESET(x,0)
#define FAKESET(x,y) if(.false.) x=y
#ifdef USEMPI
   use mpi
#endif
   implicit none
   private
   save
   public matrix_coll, matrix_distr, shift_borders, det_submatrices, vector_distr, decomp
   public general_mpi_dummy
   complex*16 :: general_mpi_dummy
!#define COLLECT_TIMER
!#define DISTRIBUTE_TIMER
   !
   ! NOTE: the matrices are considered to be bordered by columns
   ! left and right and by rows up and down. So, in general,
   ! only b(2:mb-1,2:nb-1) is sent to the master process.
   ! HOWEVER: if matrix b is
   !   on the lowy  side: b(:,1)  is also sent
   !   on the highy side: b(:,nb) is also sent
   !   on the lowx  side: b(1,:)  is also sent
   !   on the highx side: b(mb,:) is also sent
   ! The master process receives a(1:ma,1:na).
   ! dimension of b: (1:mb,1:nb)
   ! dimension of a: (1:ma,1:na)
   !
   interface matrix_distr
      module procedure matrix_distr_real8
      module procedure matrix_distr_integer
   end interface matrix_distr

   interface matrix_coll
      module procedure matrix_coll_real8
      module procedure matrix_coll_integer
   end interface matrix_coll

   interface shift_borders
      module procedure shift_borders_matrix_real8
   end interface shift_borders

   interface vector_distr
      module procedure vector_distr_real8
      module procedure block_vector_distr_real8
   end interface vector_distr

contains
!_____________________________________________________________________________

subroutine vector_distr_real8(a,b,is,lm,root,comm)
   !
   ! Distribute vector a on processes in communicator comm
   ! Process P receives a(is(P+1):is(P+1)+lm(P+1)-1)   (P: 0..size_of_comm - 1)
   !
   ! Parameters:
   ! real*8  a (in):  vector to distribute, only used at root
   ! real*8  b (out): local vector
   ! integer is(number-of-processes) (in): see above: needed on root
   ! integer lm(number-of-processes) (in): see above: needed on all processes
   ! integer root (in): process that holds a, is
   ! integer comm (in): MPI communicator
   !
   ! complication: a and b are not necessarily contiguous,
   !  but when mpi_alltoallw is called with a and b, the compiler
   !  will generate contiguous arrays.
   !
   ! root process must have rank .eq. 0
   !
#ifdef USEMPI
   use xmpi_module, only : xmpi_ocomm
#endif
   implicit none
   real*8,  dimension(:), intent(in)  :: a
   real*8,  dimension(:), intent(out) :: b
   integer, dimension(:), intent(in)  :: is
   integer, dimension(:), intent(in)  :: lm
   integer, intent(in)                :: root
   integer, intent(in)                :: comm
#ifdef USEMPI
   integer                            :: ier,ra,sz,basic_type,i
   integer, allocatable, dimension(:) :: recvtypes,sendtypes,recvcounts,sendcounts,sdispls,rdispls
   integer, dimension(1)              :: sizes,subsizes,starts

   basic_type = MPI_DOUBLE_PRECISION

   call MPI_Comm_size(comm, sz, ier)
   call MPI_Comm_rank(comm, ra, ier)

   if (root /= 0) then
      print *,'Error in vector_distr_real8: root must be 0, but is:',root
      call MPI_Abort(xmpi_ocomm,1,ier)
   endif

   allocate(recvtypes(sz))
   allocate(sendtypes(sz))
   allocate(recvcounts(sz))
   allocate(sendcounts(sz))
   allocate(sdispls(sz))
   allocate(rdispls(sz))

   sdispls    = 0
   rdispls    = 0
   recvtypes  = MPI_CHARACTER
   sendtypes  = MPI_CHARACTER
   recvcounts = 0
   sendcounts = 0
   !
   ! Create MPI types
   !
   ! MPI_TYPE_CREATE_SUBARRAY(NDIMS, ARRAY_OF_SIZES, ARRAY_OF_SUBSIZES,
   !                          ARRAY_OF_STARTS, ORDER, OLDTYPE, NEWTYPE, IERROR)
   !    INTEGER NDIMS, ARRAY_OF_SIZES(*), ARRAY_OF_SUBSIZES(*),
   !            ARRAY_OF_STARTS(*), ORDER, OLDTYPE, NEWTYPE, IERROR
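   !
   ! Sketch of the types built below (descriptive only): the receive type on
   ! every rank is a 1-D subarray covering b(1:lm(ra+1)); on root, the send
   ! type for rank i-1 covers a(is(i):is(i)+lm(i)-1). For example, with two
   ! ranks, is = (/1,6/) and lm = (/5,5/), rank 0 gets a(1:5) and rank 1
   ! gets a(6:10), each landing in b(1:5) of that rank.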
   ! determine mpi types for the receive vectors:
   ! all processes will receive only from root
   sizes    = shape(b)
   subsizes = lm(ra+1)
   starts   = 0
   call MPI_Type_create_subarray(1,sizes,subsizes,starts, &
        MPI_ORDER_FORTRAN, basic_type, recvtypes(1), ier)
   call MPI_Type_commit(recvtypes(1),ier)
   recvcounts(1) = 1

   ! determine mpi types for the senders
   if (ra == root) then
      ! root sends to everyone
      do i=1,sz
         sizes      = shape(a)
         subsizes   = lm(i)
         starts     = is(i) - 1
         sendcounts = 1
         call MPI_Type_create_subarray(1,sizes,subsizes,starts, &
              MPI_ORDER_FORTRAN, basic_type, sendtypes(i), ier)
         call MPI_Type_commit(sendtypes(i),ier)
      enddo
   endif

   call MPI_Alltoallw(a,sendcounts,sdispls,sendtypes, &
        b,recvcounts,rdispls,recvtypes,comm,ier)

   do i=1,sz
      if (sendtypes(i) /= MPI_CHARACTER) then
         call MPI_Type_free(sendtypes(i),ier)
      endif
      if (recvtypes(i) /= MPI_CHARACTER) then
         call MPI_Type_free(recvtypes(i),ier)
      endif
   enddo
#else
   FAKE(kind(a)+kind(b)+kind(is)+kind(lm)+root+comm)
   FAKESET(b,0)
#endif
end subroutine vector_distr_real8
!_____________________________________________________________________________

subroutine block_vector_distr_real8(a,b,is,lm,root,comm)
   !
   ! Distribute the rows of matrix a over the processes in communicator comm
   ! Process P receives a(is(P+1):is(P+1)+lm(P+1)-1,:)   (P: 0..size_of_comm - 1)
   !
   ! Parameters:
   ! real*8  a(:,:) (in):  matrix to distribute, only used at root
   ! real*8  b(:,:) (out): local matrix
   ! integer is(number-of-processes) (in): see above: needed on root
   ! integer lm(number-of-processes) (in): see above: needed on all processes
   ! integer root (in): process that holds a, is
   ! integer comm (in): MPI communicator
   !
   ! complication: a and b are not necessarily contiguous,
   !  but when mpi_alltoallw is called with a and b, the compiler
   !  will generate contiguous arrays.
   !
   ! root process must have rank .eq. 0
   !
#ifdef USEMPI
   use xmpi_module, only : xmpi_ocomm
#endif
   implicit none
   real*8,  dimension(:,:), intent(in)  :: a
   real*8,  dimension(:,:), intent(out) :: b
   integer, dimension(:),   intent(in)  :: is
   integer, dimension(:),   intent(in)  :: lm
   integer, intent(in)                  :: root
   integer, intent(in)                  :: comm
#ifdef USEMPI
   integer                            :: ier,ra,sz,basic_type,i
   integer, allocatable, dimension(:) :: recvtypes,sendtypes,recvcounts,sendcounts,sdispls,rdispls
   integer, dimension(2)              :: sizes,subsizes,starts

   basic_type = MPI_DOUBLE_PRECISION

   call MPI_Comm_size(comm, sz, ier)
   call MPI_Comm_rank(comm, ra, ier)

   if (root /= 0) then
      print *,'Error in block_vector_distr_real8: root must be 0, but is:',root
      call MPI_Abort(xmpi_ocomm,1,ier)
   endif

   allocate(recvtypes(sz))
   allocate(sendtypes(sz))
   allocate(recvcounts(sz))
   allocate(sendcounts(sz))
   allocate(sdispls(sz))
   allocate(rdispls(sz))

   sdispls    = 0
   rdispls    = 0
   recvtypes  = MPI_CHARACTER
   sendtypes  = MPI_CHARACTER
   recvcounts = 0
   sendcounts = 0
   !
   ! Create MPI types
   !
   ! MPI_TYPE_CREATE_SUBARRAY(NDIMS, ARRAY_OF_SIZES, ARRAY_OF_SUBSIZES,
   !                          ARRAY_OF_STARTS, ORDER, OLDTYPE, NEWTYPE, IERROR)
   !    INTEGER NDIMS, ARRAY_OF_SIZES(*), ARRAY_OF_SUBSIZES(*),
   !            ARRAY_OF_STARTS(*), ORDER, OLDTYPE, NEWTYPE, IERROR
   !
   ! determine mpi types for the receive matrices:
   ! all processes will receive only from root
   sizes    = shape(b)
   subsizes = (/ lm(ra+1), size(b,2) /)
   starts   = 0
   call MPI_Type_create_subarray(2,sizes,subsizes,starts, &
        MPI_ORDER_FORTRAN, basic_type, recvtypes(1), ier)
   call MPI_Type_commit(recvtypes(1),ier)
   recvcounts(1) = 1

   ! determine mpi types for the senders
   if (ra == root) then
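      !
      ! Descriptive note: the send type for process i describes the row block
      ! a(is(i):is(i)+lm(i)-1, :), i.e. lm(i) full-width rows starting at row
      ! is(i) of the global matrix (starts is zero-based for MPI).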
      ! root sends to everyone
      do i=1,sz
         sizes      = shape(a)
         subsizes   = (/ lm(i), size(a,2) /)
         starts     = (/ is(i) - 1, 0 /)
         sendcounts = 1
         call MPI_Type_create_subarray(2,sizes,subsizes,starts, &
              MPI_ORDER_FORTRAN, basic_type, sendtypes(i), ier)
         call MPI_Type_commit(sendtypes(i),ier)
      enddo
   endif

   call MPI_Alltoallw(a,sendcounts,sdispls,sendtypes, &
        b,recvcounts,rdispls,recvtypes,comm,ier)

   do i=1,sz
      if (sendtypes(i) /= MPI_CHARACTER) then
         call MPI_Type_free(sendtypes(i),ier)
      endif
      if (recvtypes(i) /= MPI_CHARACTER) then
         call MPI_Type_free(recvtypes(i),ier)
      endif
   enddo
#else
   FAKE(kind(a)+kind(b)+kind(is)+kind(lm)+root+comm)
   FAKESET(b,0)
#endif
end subroutine block_vector_distr_real8
!___________________________________________________________________________

subroutine matrix_distr_real8(a,b,is,lm,js,ln, &
                              root,comm)
   !
   ! This subroutine distributes matrix a, available on root, to the
   ! processes in communicator comm.
   ! Root MUST be process #0 in comm
   !
   ! a:     matrix to be distributed
   ! b:     receive matrix, process p:
   !        (1:lm(p+1),1:ln(p+1)) will be filled (p=0..size_of_comm - 1)
   !        Normally: the whole of b
   ! is,js: on process p (p=1..size of comm), b(1,1) coincides with
   !        a(is(p),js(p))
   ! root:  root process, must be zero
   ! comm:  communicator
   !
   implicit none
   real*8,  intent(in),  dimension(:,:) :: a
   real*8,  intent(out), dimension(:,:) :: b
   integer, intent(in),  dimension(:)   :: is, lm
   integer, intent(in),  dimension(:)   :: js, ln
   integer, intent(in)                  :: root, comm
#ifdef USEMPI
   call doit_distr(is,lm,js,ln,root,MPI_DOUBLE_PRECISION,comm)
contains
   ! subroutine doit_distr is included here:
#include "genmpi_distr.inc"
#else
   FAKE(kind(a)+kind(b)+kind(is)+kind(lm)+kind(js)+kind(ln)+root+comm)
   FAKESET(b,0)
#endif
end subroutine matrix_distr_real8
!___________________________________________________________________________

subroutine matrix_distr_integer(a,b,is,lm,js,ln, &
                                root,comm)
   !
   ! This subroutine distributes matrix a, available on root, to the
   ! processes in communicator comm.
   ! Root MUST be process #0 in comm
   !
   ! a:     matrix to be distributed
   ! b:     receive matrix, process p:
   !        (1:lm(p+1),1:ln(p+1)) will be filled (p=0..size_of_comm - 1)
   !        Normally: the whole of b
   ! is,js: on process p (p=1..size of comm), b(1,1) coincides with
   !        a(is(p),js(p))
   ! root:  root process, must be zero
   ! comm:  communicator
   !
   implicit none
   integer, intent(in),  dimension(:,:) :: a
   integer, intent(out), dimension(:,:) :: b
   integer, intent(in),  dimension(:)   :: is, lm
   integer, intent(in),  dimension(:)   :: js, ln
   integer, intent(in)                  :: root, comm
#ifdef USEMPI
   call doit_distr(is,lm,js,ln,root,MPI_INTEGER,comm)
contains
   ! subroutine doit_distr is included here:
#include "genmpi_distr.inc"
#else
   FAKE(kind(a)+kind(b)+kind(is)+kind(lm)+kind(js)+kind(ln)+root+comm)
   FAKESET(b,0)
#endif
end subroutine matrix_distr_integer
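!___________________________________________________________________________
!
! Illustrative sketch (not part of the library API): a typical way to combine
! det_submatrices, matrix_distr and matrix_coll on an mp x np process grid.
! The names aglobal, alocal, ma, na, mp, np and comm are hypothetical.
!
!   integer              :: rank, ier
!   integer, allocatable :: is(:), lm(:), js(:), ln(:)
!   logical, allocatable :: islowy(:), ishighy(:), islowx(:), ishighx(:)
!   real*8,  allocatable :: aglobal(:,:), alocal(:,:)
!
!   call MPI_Comm_rank(comm, rank, ier)
!   allocate(is(mp*np), lm(mp*np), js(mp*np), ln(mp*np))
!   allocate(islowy(mp*np), ishighy(mp*np), islowx(mp*np), ishighx(mp*np))
!   call det_submatrices(ma, na, mp, np, is, lm, js, ln, &
!                        islowy, ishighy, islowx, ishighx)
!   allocate(alocal(lm(rank+1), ln(rank+1)))
!   call matrix_distr(aglobal, alocal, is, lm, js, ln, 0, comm)
!   ! ... work on alocal, exchanging borders with shift_borders ...
!   call matrix_coll(aglobal, alocal, is, lm, js, ln, &
!                    islowy, ishighy, islowx, ishighx, 0, comm)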
!___________________________________________________________________________

subroutine matrix_coll_real8(a,b,is,lm,js,ln, &
                             islowy,ishighy,islowx,ishighx,root,comm,w_only)
   !
   ! collect matrix on root process
   ! NOTE: the collect is done from the non-root processes only, but see
   !       w_only below
   !
   ! parameters:
   ! real*8 a(:,:) (out): matrix to be collected.
   !   This matrix has to be available, it is not allocated by this
   !   subroutine.
   ! real*8 b(:,:) (in): local scattered matrix
   ! integer is,lm,js,ln(number of non-root processes) (in): description of
   !   the location of the submatrices b in matrix a:
   !   Element b(1,1) coincides with a(is(rank+1),js(rank+1)), where rank is
   !   the rank of the process (0..procs-1)
   !   The length of the 1st dimension of b is given by lm(rank+1)
   !   The length of the 2nd dimension of b is given by ln(rank+1)
   ! logical islowy, ishighy, islowx, ishighx (in): is this submatrix at the
   !   lowy, highy, lowx or highx side of the general matrix.
   ! integer root (in): the rank of the process where a is available
   ! integer comm (in): the MPI communicator to be used.
   ! logical, optional, w_only (in): if present and .true., the collection
   !   is done on the root process (must be 0) of the worker processes,
   !   so, in this case, the root process contains data to be collected
   !
   ! Note: a,is,js are used at the root process only
   !
   ! Note: the root process must have rank .eq. 0
   !
   implicit none
   real*8,  intent(out), dimension(:,:) :: a
   real*8,  intent(in),  dimension(:,:) :: b
   integer, intent(in),  dimension(:)   :: is,lm
   integer, intent(in),  dimension(:)   :: js,ln
   logical, intent(in),  dimension(:)   :: islowy, ishighy, islowx, ishighx
   integer, intent(in)                  :: root,comm
   logical, intent(in), optional        :: w_only
#ifdef USEMPI
   call doit_coll(is,lm,js,ln,islowy,ishighy,islowx,ishighx,root,MPI_DOUBLE_PRECISION,comm,w_only)
contains
   ! subroutine doit_coll is included here:
#include "genmpi_coll.inc"
#else
   FAKE(kind(a)+kind(b)+kind(is)+kind(lm)+kind(js)+kind(ln)+kind(w_only))
   FAKE(kind(islowy)+kind(ishighy)+kind(islowx)+kind(ishighx)+root+comm)
   FAKESET(a,0)
#endif
end subroutine matrix_coll_real8
!___________________________________________________________________________

subroutine matrix_coll_integer(a,b,is,lm,js,ln, &
                               islowy,ishighy,islowx,ishighx,root,comm,w_only)
   !
   ! for description, see matrix_coll_real8
   !
   implicit none
   integer, intent(out), dimension(:,:) :: a
   integer, intent(in),  dimension(:,:) :: b
   integer, intent(in),  dimension(:)   :: is, lm
   integer, intent(in),  dimension(:)   :: js, ln
   logical, intent(in),  dimension(:)   :: islowy, ishighy, islowx, ishighx
   integer, intent(in)                  :: root, comm
   logical, intent(in), optional        :: w_only
#ifdef USEMPI
   call doit_coll(is,lm,js,ln,islowy,ishighy,islowx,ishighx,root,MPI_INTEGER,comm,w_only)
contains
   ! subroutine doit_coll is included here:
#include "genmpi_coll.inc"
#else
   FAKE(kind(a)+kind(b)+kind(is)+kind(lm)+kind(js)+kind(ln)+kind(w_only))
   FAKE(kind(islowy)+kind(ishighy)+kind(islowx)+kind(ishighx)+root+comm)
   FAKESET(a,0)
#endif
end subroutine matrix_coll_integer
!___________________________________________________________________________

subroutine decomp(n, numprocs, myid, s, e)
   ! From the mpich distribution: MPE_Decomp1d
   !
   ! n        : integer, input (for example: number of tasks or
   !            dimension of an array)
   ! numprocs : integer, input (number of processes)
   ! myid     : integer, input (MPI id of this process)
   ! s        : integer, output
   ! e        : integer, output
   !
   ! The purpose of this subroutine is to get an as equal as possible
   ! division of the integers 1..n among numprocs intervals.
   ! A common use is to divide work among MPI processes.
   !
   ! example:
   !
   !   integer numprocs, ier, myid, n, s, e, i
   !   call MPI_Comm_size(MPI_COMM_WORLD, numprocs, ier)
   !   call MPI_Comm_rank(MPI_COMM_WORLD, myid, ier)
   !   n = 1000
   !   call decomp(n, numprocs, myid, s, e)
   !   do i = s,e
   !      ...
   !   enddo
   !
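   ! A worked illustration of the division (not from the original source):
   ! n = 10, numprocs = 3 gives deficit = 1, so myid 0 gets s,e = 1,4
   ! (4 numbers) and myid 1 and 2 get 5,7 and 8,10 (3 numbers each).
   !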
   integer, intent(in)  :: n, numprocs, myid
   integer, intent(out) :: s, e

   integer :: nlocal
   integer :: deficit

   nlocal  = n / numprocs
   s       = myid * nlocal + 1
   deficit = mod(n,numprocs)
   s       = s + min(myid,deficit)
   if (myid .lt. deficit) then
      nlocal = nlocal + 1
   endif
   e = s + nlocal - 1
   if (e .gt. n .or. myid .eq. numprocs-1) then
      e = n
   endif
end subroutine decomp

subroutine det_submatrices(ma,na,mp,np,is,lm,js,ln,islowy,ishighy,islowx,ishighx)
   !
   ! determine a division of a ma*na matrix on mp*np processes
   !
   ! ma: integer(in): 1st dimension of matrix to divide
   ! na: integer(in): 2nd dimension of this matrix
   ! mp: integer(in): 1st dimension of processor grid
   ! np: integer(in): 2nd dimension of processor grid
   ! is(mp*np): integer(out): is(i) will be equal to the
   !            start row of the i-th submatrix
   ! lm(mp*np): integer(out): lm(i) will be equal to the
   !            first dimension of the i-th submatrix
   ! js(mp*np): integer(out): js(i) will be equal to the
   !            start column of the i-th submatrix
   ! ln(mp*np): integer(out): ln(i) will be equal to the
   !            2nd dimension of the i-th submatrix
   ! islowy(mp*np):  logical(out): islowy(i) is true if the i-th submatrix
   !                 shares the first column with the global matrix: a(:,1)
   ! ishighy(mp*np): logical(out): ishighy(i) is true if the i-th submatrix
   !                 shares the last column with the global matrix: a(:,na)
   ! islowx(mp*np):  logical(out): islowx(i) is true if the i-th submatrix
   !                 shares the first row with the global matrix: a(1,:)
   ! ishighx(mp*np): logical(out): ishighx(i) is true if the i-th submatrix
   !                 shares the last row with the global matrix: a(ma,:)
   !
   ! The submatrices overlap:
   !
   !   1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5
   !   1 2 3 4 5 6 7 8 9           1 2 3 4 5 6 7 8 9 0 1
   !             1 2 3 4 5 6 7 8 9 0 1 2 3
   !
   !   + + + + + x x x x + + + + + x x x x + + + + + + +
   !   + + + + + x x x x + + + + + x x x x + + + + + + +
   !   + + + + + x x x x + + + + + x x x x + + + + + + +
   !   + + + + + x x x x + + + + + x x x x + + + + + + +
   !   + + + + + x x x x + + + + + x x x x + + + + + + +
   !   + + + + + x x x x + + + + + x x x x + + + + + + +
   !   + + + + + x x x x + + + + + x x x x + + + + + + +
   !
   ! This picture is for the overlap in the horizontal direction;
   ! the overlap is 4.
   !   + -> matrix element
   !   x -> overlapping matrix element
   ! The orders of the submatrices are 9, 13 and 11.
   ! The order of the global matrix is 25.
   !
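   ! Worked illustration of the actual division computed below (the picture
   ! above is only schematic): for ma = 25, mp = 3, np = 1 and nover = 4,
   ! decomp(21,3,..) yields the row ranges 1..7, 8..14 and 15..21, so
   ! is = (/1,8,15/) and lm = (/11,11,11/): the submatrices cover rows
   ! 1:11, 8:18 and 15:25, each overlapping its neighbour by 4 rows.
   !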
   implicit none
   integer, intent(in)                :: ma,na,mp,np
   integer, dimension(:), intent(out) :: is,lm,js,ln
   logical, dimension(:), intent(out) :: islowy,ishighy,islowx,ishighx
#ifdef USEMPI
   integer            :: i,j,s,e,k
   integer, parameter :: nover = 4
#endif

   islowy  = .false.
   ishighy = .false.
   islowx  = .false.
   ishighx = .false.
   is      = 0
   lm      = 0
   js      = 0
   ln      = 0
#ifdef USEMPI
   do i = 1, mp
      call decomp(ma-nover, mp, i - 1, s, e)
      k = 0
      do j = 1, np
         if (j .eq. 1) then
            islowy(i+k) = .true.
         endif
         if (j .eq. np) then
            ishighy(i+k) = .true.
         endif
         is(i + k) = s
         lm(i + k) = e - s + nover + 1
         k = k + mp
      enddo
   enddo

   k = 0
   do j = 1, np
      call decomp(na-nover, np, j - 1, s, e)
      do i = 1, mp
         if (i .eq. 1) then
            islowx(i+k) = .true.
         endif
         if (i .eq. mp) then
            ishighx(i+k) = .true.
         endif
         js(i + k) = s
         ln(i + k) = e - s + nover + 1
      enddo
      k = k + mp
   enddo
#else
   FAKE(ma+na+mp+np)
#endif
end subroutine det_submatrices

subroutine shift_borders_matrix_real8(a,lowy,highy,lowx,highx,comm)
   !
   ! shift borders to and from neighbours.
   !
   implicit none
   real*8, dimension(:,:), intent(inout) :: a
   integer, intent(in)                   :: lowy,highy,lowx,highx,comm
#ifdef USEMPI
   integer, parameter :: datatype = MPI_DOUBLE_PRECISION
   integer            :: ierror,ma,na

   ma = ubound(a,1)
   na = ubound(a,2)

   ! send to lowy, receive from highy
   call MPI_Sendrecv(a(2:ma-1,2),    ma-2, datatype, lowy,  1, &
                     a(2:ma-1,na),   ma-2, datatype, highy, 1, &
                     comm, MPI_STATUS_IGNORE, ierror)
   ! send to highy, receive from lowy
   call MPI_Sendrecv(a(2:ma-1,na-1), ma-2, datatype, highy, 2, &
                     a(2:ma-1,1),    ma-2, datatype, lowy,  2, &
                     comm, MPI_STATUS_IGNORE, ierror)
   ! send to highx, receive from lowx
   call MPI_Sendrecv(a(ma-1,:),      na,   datatype, highx, 3, &
                     a(1,:),         na,   datatype, lowx,  3, &
                     comm, MPI_STATUS_IGNORE, ierror)
   ! send to lowx, receive from highx
   call MPI_Sendrecv(a(2,:),         na,   datatype, lowx,  4, &
                     a(ma,:),        na,   datatype, highx, 4, &
                     comm, MPI_STATUS_IGNORE, ierror)
#else
   FAKE(kind(a)+lowy+highy+lowx+highx+comm)
#endif
end subroutine shift_borders_matrix_real8

end module general_mpi_module
#undef FAKE
#undef FAKESET