module general_mpi_module
#ifdef USEMPI
  ! This module contains three variants of subroutines to scatter
  ! a matrix from the root process to the other processes:
  !
  ! matrix_distr_scatter: uses MPI_Scatterv, scattering groups
  !                       of columns
  ! matrix_distr_send:    uses MPI_Isend and MPI_Irecv, scattering
  !                       groups of columns
  ! matrix_distr_sendmat: uses MPI_Send and MPI_Recv, scattering
  !                       submatrices as a whole
  !
  ! and the opposite operation:
  !
  ! matrix_coll_gather:  uses MPI_Gatherv, gathering groups
  !                      of columns
  ! matrix_coll_recv:    uses MPI_Isend and MPI_Recv, gathering groups
  !                      of columns
  ! matrix_coll_recvmat: uses MPI_Send and MPI_Recv, gathering
  !                      submatrices as a whole
  !
  ! NOTE: the matrices are considered to be bordered by a column
  ! on the left and right and a row on the top and bottom. So, in
  ! general, only b(2:mb-1,2:nb-1) is sent to the master process.
  ! HOWEVER: if matrix b is
  !  on the left:   b(:,1)  is also sent
  !  on the right:  b(:,nb) is also sent
  !  on the top:    b(1,:)  is also sent
  !  on the bottom: b(mb,:) is also sent
  ! The master process receives a(1:ma,1:na).
  ! dimension of b: (1:mb,1:nb)
  ! dimension of a: (1:ma,1:na)
  !
  ! The parameter lists are the same.
  ! On the lisa system at sara (Xeon, infiniband), the send variant
  ! is about twice as fast as the scatter variant.
  !
  ! The most probable cause is that the scatter variant uses a number
  ! of MPI_Scatterv's, each of which only starts after the previous
  ! one has completed, while in the send variant as many MPI_Isend's
  ! and MPI_Irecv's as possible are executed in parallel, without
  ! waiting. There is no non-blocking MPI_Scatterv available, alas.
  !
  ! On the same system, there is no significant difference in
  ! performance between matrix_coll_gather and matrix_coll_recv.
  !
  ! The recvmat and sendmat variants are even faster than the
  ! recv and send variants. Moreover, the code is simpler, so it is
  ! best to use these, see interfaces matrix_distr and matrix_coll.
  !
  ! Most of the subroutines end with an MPI_Barrier; this was
  ! necessary for usage with openmpi, at least for the
  ! distribution routines. At first it was not clear why, but the
  ! cause turned out to be a bug in the shared memory interface of
  ! openmpi.
  !
  interface matrix_distr
    ! module procedure matrix_distr_send_real8
    ! module procedure matrix_distr_send_integer
    ! module procedure matrix_distr_scatter_real8
    ! module procedure matrix_distr_scatter_integer
    module procedure matrix_distr_sendmat_real8
    module procedure matrix_distr_sendmat_integer
  end interface matrix_distr

  interface matrix_coll
    ! module procedure matrix_coll_gather_real8
    ! module procedure matrix_coll_recv_real8
    module procedure matrix_coll_recvmat_real8
  end interface matrix_coll

  interface shift_borders
    module procedure shift_borders_matrix_real8
  end interface shift_borders

contains

  subroutine vector_distr_send(a,b,is,lm,root,comm)
    !
    ! Distribute vector a on processes.
    ! Process P receives a(is(P):is(P)+lm(P)-1)
    !
    ! Parameters:
    ! real*8 a (in):  vector to distribute, only used at root
    ! real*8 b (out): local vector
    ! integer is(number-of-processes) (in): see above; needed on all processes
    ! integer lm(number-of-processes) (in): see above; needed on all processes
    ! integer root (in): process that holds a, is, lm
    ! integer comm (in): MPI communicator
    !
    ! Method: the root process sends, using MPI_Isend, parts of a to
    !         the other processes. On the root process a copy is
    !         made.
    !
    use mpi
    implicit none
    real*8, dimension(:), intent(in)  :: a
    real*8, dimension(:), intent(out) :: b
    integer, dimension(:), intent(in) :: is
    integer, dimension(:), intent(in) :: lm
    integer, intent(in)               :: root
    integer, intent(in)               :: comm

    integer :: ierror,p,procs,rank,pp,mlocal
    integer, dimension(:), allocatable :: req
    real*8, dimension(:), allocatable  :: aa

    call MPI_Comm_rank(comm, rank, ierror)
    call MPI_Comm_size(comm, procs, ierror)

    allocate(req(procs-1))
    mlocal = lm(rank+1)
    pp = 0
    if ( rank .eq. root ) then
      allocate(aa(size(a)))
      aa = a                  ! must have contiguous memory for a because
                              ! of MPI_Isend
      do p = 1,procs
        if ( p-1 .eq. rank ) then
          b(1:lm(p)) = a(is(p) : is(p)+lm(p)-1)
        else
          pp = pp + 1
          call MPI_Isend(aa(is(p)), lm(p), MPI_DOUBLE_PRECISION, p-1, &
                         11, comm, req(pp), ierror)
        endif
      enddo
      call MPI_Waitall(pp,req,MPI_STATUSES_IGNORE,ierror)
      deallocate(aa)
    else
      call MPI_Recv(b, mlocal, MPI_DOUBLE_PRECISION, root, 11,&
                    comm, MPI_STATUS_IGNORE, ierror)
    endif
    deallocate(req)

  end subroutine vector_distr_send
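  ! ----------------------------------------------------------------
  ! Illustrative sketch (this subroutine is not referenced elsewhere
  ! in this module): the typical call sequence for the distribute and
  ! collect routines below. The global matrix size (ma,na), the 2x2
  ! process grid and the name example_distr_coll are assumptions made
  ! for this example only.
  ! ----------------------------------------------------------------
  subroutine example_distr_coll(comm)
    use mpi
    implicit none
    integer, intent(in) :: comm
    integer, parameter  :: ma = 100, na = 80   ! global matrix, example values
    integer, parameter  :: mp = 2,   np = 2    ! process grid, example values
    integer, parameter  :: root = 0
    integer             :: rank, procs, ierror
    integer, dimension(mp*np) :: is, lm, js, ln
    logical, dimension(mp*np) :: isleft, isright, istop, isbot
    real*8, allocatable :: a(:,:), b(:,:)

    call MPI_Comm_rank(comm, rank, ierror)
    call MPI_Comm_size(comm, procs, ierror)
    if (procs .ne. mp*np) return               ! sketch assumes exactly mp*np processes

    ! describe the submatrix layout (needed on all processes):
    call det_submatrices(ma, na, mp, np, is, lm, js, ln, &
                         isleft, isright, istop, isbot)

    if (rank .eq. root) then
      allocate(a(ma,na))
      a = 0.0d0                                ! root fills the global matrix
    else
      allocate(a(1,1))                         ! a is only referenced on root
    endif
    allocate(b(lm(rank+1), ln(rank+1)))

    call matrix_distr(a, b, is, lm, js, ln, root, comm)          ! scatter
    ! ... update the interior b(2:lm(rank+1)-1, 2:ln(rank+1)-1) here,
    !     exchanging borders with shift_borders as needed ...
    call matrix_coll(a, b, is, lm, js, ln, &                     ! gather
                     isleft, isright, istop, isbot, root, comm)

    deallocate(a, b)
  end subroutine example_distr_coll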
  subroutine matrix_distr_scatter_real8(a,b,is,lm,js,ln,root,comm)
    ! distribute matrix on processes
    !
    ! parameters:
    ! real*8 a(:,:) (in):  matrix to be scattered
    ! real*8 b(:,:) (out): local matrix to scatter to.
    !   This matrix has to be available; it is not allocated
    !   by this subroutine.
    ! integer is,lm,js,ln(number of processes) (in): description of
    !   the location of the submatrices b in matrix a:
    !   Element b(1,1) coincides with a(is(rank+1),js(rank+1)), where rank is
    !   the rank of the process (0..procs-1).
    !   The length of the 1st dimension of b is given by lm(rank+1).
    !   The length of the 2nd dimension of b is given by ln(rank+1).
    ! integer root (in): the rank of the process where a is available
    ! integer comm (in): the MPI communicator to be used
    !
    ! Note: a,is,js are used at the root process only
    !
    ! Method
    !
    ! Distribute the columns of the submatrices in a number of
    ! scatterv's. Per scatterv, distribute (number of processes)
    ! columns, so we need maxval(ln) scatters.
    !
    use mpi
    implicit none
    real*8, intent(in), dimension(:,:)  :: a
    real*8, intent(out), dimension(:,:) :: b
    integer, intent(in), dimension(:)   :: is,lm
    integer, intent(in), dimension(:)   :: js,ln
    integer, intent(in)                 :: root,comm

    integer :: j,p,rank,ierror,procs,jj,ma,jmax
    integer :: nlocal,mlocal,rcnt
    integer, allocatable, dimension(:)  :: displs, cnts
    real*8, allocatable, dimension(:,:) :: aa

    call MPI_Comm_rank(comm, rank, ierror)
    call MPI_Comm_size(comm, procs, ierror)

    nlocal = ln(rank+1)
    mlocal = lm(rank+1)

    if (rank .eq. root) then
      ma = size(a,1)
      allocate(displs(procs), cnts(procs))
      ! because the counts and displacements are only valid for a
      ! contiguous matrix, we take care of that:
      allocate(aa(size(a,1),size(a,2)))
      aa = a
    else
      ma = 1                       ! to quiet the compiler
      allocate(displs(1), cnts(1)) ! these are not needed in a non-root
      allocate(aa(1,1))            ! process, but it is safer to have
      displs(1) = 0                ! valid addresses
      cnts(1)   = 0
    endif

    jmax = maxval(ln(1:procs))     ! number of scatters needed

    do j=1,jmax                    ! scatter all j'th columns
      if (rank .eq. root) then
        do p=1,procs
          if (ln(p) .ge. j) then
            ! compute displacement of the start of the j-th
            ! column of submatrix(p)
            displs(p) = (js(p) -1 + j -1)*ma + is(p) - 1
            cnts(p)   = lm(p)      ! number of elements in this column
          else
            ! for submatrix(p), we are out of columns
            displs(p) = 0          ! put in some valid value for the displacement
            cnts(p)   = 0          ! and use zero for the number of elements in
                                   ! this column
          endif
        enddo
      endif
      if (nlocal .ge. j) then      ! this is for all processes
        jj   = j                   ! jj equals the column number of the local
                                   ! matrix
        rcnt = mlocal              ! number of elements in this column
      else
        jj   = 1                   ! we are out of columns, give jj a valid number
        rcnt = 0                   ! and set the number of elements to receive to zero
      endif
      call MPI_Scatterv(aa(1,1), cnts(1), displs(1), MPI_DOUBLE_PRECISION, &
                        b(:,jj), rcnt, MPI_DOUBLE_PRECISION,               &
                        root, comm, ierror)
    enddo

    deallocate(displs,cnts,aa)

  end subroutine matrix_distr_scatter_real8

  subroutine matrix_distr_scatter_integer(a,b,is,lm,js,ln,root,comm)
    ! distribute matrix on processes
    !
    ! parameters:
    ! integer a(:,:) (in):  matrix to be scattered
    ! integer b(:,:) (out): local matrix to scatter to.
    !   This matrix has to be available; it is not allocated
    !   by this subroutine.
    ! integer is,lm,js,ln(number of processes) (in): description of
    !   the location of the submatrices b in matrix a:
    !   Element b(1,1) coincides with a(is(rank+1),js(rank+1)), where rank is
    !   the rank of the process (0..procs-1).
    !   The length of the 1st dimension of b is given by lm(rank+1).
    !   The length of the 2nd dimension of b is given by ln(rank+1).
    ! integer root (in): the rank of the process where a is available
    ! integer comm (in): the MPI communicator to be used
    !
    ! Note: a,is,js are used at the root process only
    !
    ! Method
    !
    ! Distribute the columns of the submatrices in a number of
    ! scatterv's. Per scatterv, distribute (number of processes)
    ! columns, so we need maxval(ln) scatters.
    !
    use mpi
    implicit none
    integer, intent(in), dimension(:,:)  :: a
    integer, intent(out), dimension(:,:) :: b
    integer, intent(in), dimension(:)    :: is,lm
    integer, intent(in), dimension(:)    :: js,ln
    integer, intent(in)                  :: root,comm

    integer :: j,p,rank,ierror,procs,jj,ma,jmax
    integer :: nlocal,mlocal,rcnt
    integer, allocatable, dimension(:)   :: displs, cnts
    integer, allocatable, dimension(:,:) :: aa
    integer, parameter :: tag = 1

    call MPI_Comm_rank(comm, rank, ierror)
    call MPI_Comm_size(comm, procs, ierror)

    nlocal = ln(rank+1)
    mlocal = lm(rank+1)

    if (rank .eq. root) then
      ma = size(a,1)
      allocate(displs(procs), cnts(procs))
      allocate(aa(size(a,1),size(a,2)))
      aa = a
    else
      ma = 1                       ! to quiet the compiler
      allocate(displs(1), cnts(1)) ! these are not needed in a non-root
                                   ! process, but it is safer to have
                                   ! valid addresses
      allocate(aa(1,1))
    endif

    jmax = maxval(ln(1:procs))     ! number of scatters needed

    do j=1,jmax                    ! scatter all j'th columns
      if (rank .eq. root) then
        do p=1,procs
          if (ln(p) .ge. j) then
            ! compute displacement of the start of the j-th
            ! column of submatrix(p)
            displs(p) = (js(p) -1 + j -1)*ma + is(p) - 1
            cnts(p)   = lm(p)      ! number of elements in this column
          else
            ! for submatrix(p), we are out of columns
            displs(p) = 0          ! put in some valid value for the displacement
            cnts(p)   = 0          ! and use zero for the number of elements in
                                   ! this column
          endif
        enddo
      endif
      if (nlocal .ge. j) then      ! this is for all processes
        jj   = j                   ! jj equals the column number of the local
                                   ! matrix
        rcnt = mlocal              ! number of elements in this column
      else
        jj   = 1                   ! we are out of columns, give jj a valid number
        rcnt = 0                   ! and set the number of elements to receive to zero
      endif
      call MPI_Scatterv(aa(1,1), cnts(1), displs(1), MPI_INTEGER, &
                        b(:,jj), rcnt, MPI_INTEGER,               &
                        root, comm, ierror)
    enddo

    deallocate(displs,cnts,aa)

  end subroutine matrix_distr_scatter_integer
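  ! ----------------------------------------------------------------
  ! Note on the displacement formula used in the scatter variants
  ! above (a worked example; the numbers are illustrative only):
  ! with ma = 10 rows in a, is(p) = 3 and js(p) = 4, the j-th column
  ! of submatrix p starts at offset
  !   displs(p) = (js(p)-1 + j-1)*ma + is(p) - 1
  ! so for j = 1 this gives 3*10 + 2 = 32, which is indeed element
  ! a(3,4) counted in column-major order with a(1,1) at offset 0.
  ! ----------------------------------------------------------------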
  subroutine matrix_distr_send_real8(a,b,is,lm,js,ln,root,comm)
    ! distribute matrix on processes
    !
    ! parameters:
    ! real*8 a(:,:) (in):  matrix to be scattered
    ! real*8 b(:,:) (out): local matrix to scatter to.
    !   This matrix has to be available; it is not allocated
    !   by this subroutine.
    ! integer is,lm,js,ln(number of processes) (in): description of
    !   the location of the submatrices b in matrix a:
    !   Element b(1,1) coincides with a(is(rank+1),js(rank+1)), where rank is
    !   the rank of the process (0..procs-1).
    !   The length of the 1st dimension of b is given by lm(rank+1).
    !   The length of the 2nd dimension of b is given by ln(rank+1).
    ! integer root (in): the rank of the process where a is available
    ! integer comm (in): the MPI communicator to be used
    !
    ! Note: a,is,js are used at the root process only
    !
    ! Method
    !
    ! Sends columns of submatrices to the processes using MPI_Isend.
    ! This routine tries to send as many columns in parallel
    ! to different processes as possible, see the usage of req(:)
    ! and maxreq below.
    ! The columns are received using MPI_Irecv.
    !
    use mpi
    implicit none
    real*8, intent(in), dimension(:,:)  :: a
    real*8, intent(out), dimension(:,:) :: b
    integer, intent(in), dimension(:)   :: is,lm
    integer, intent(in), dimension(:)   :: js,ln
    integer, intent(in)                 :: root,comm

    integer :: j,rank,ierror,l,procs
    integer :: sends, sends_todo, length
    integer :: p,mlocal,nlocal
    logical :: done
    integer :: maxreq
    integer, allocatable, dimension(:) :: req
    integer, parameter :: tag = 1
    integer, parameter :: datatype = MPI_DOUBLE_PRECISION
    real*8, allocatable, dimension(:,:) :: aa,bb

    call MPI_Comm_rank(comm, rank, ierror)
    call MPI_Comm_size(comm, procs, ierror)

    nlocal = ln(rank+1)
    mlocal = lm(rank+1)

    ! calculate the total number of sends to be done: that is equal to
    ! the total number of columns in the submatrices
    if (rank .eq. root) then
      sends_todo = sum(ln(1:procs))
      ! maxreq is the maximum number of outstanding Isend or Irecv requests.
      ! Experiments showed that the value of maxreq is not critical;
      ! a value of 10 gave only somewhat worse results on the lisa
      ! system (Xeon, infiniband). Probably, the slower the network,
      ! the higher this value can be set. The maximum usable value is
      ! the total number of columns (sends_todo, see above).
      maxreq = 1000
    else
      sends_todo = 0    ! to quiet the compiler
      maxreq = nlocal   ! on non-root processes, we need only the number
                        ! of local columns
    endif

    ! distribute matrix.
    ! to ensure that every process is as busy as possible, we do the
    ! loops on the processes in the inner loop.

    allocate(req(maxreq))
    req = MPI_REQUEST_NULL    ! initialize req with a null-value
    if ( rank .eq. root) then
      ! we need a contiguous matrix a, because
      ! we are going to use isend.
      allocate(aa(size(a,1),size(a,2)))
      aa = a
      sends = 0
      l = 0
      j = 0
      loop: do
        j = j+1                          ! j runs over the columns
        do p = 0, procs - 1
          if (ln(p + 1) .ge. j) then     ! is j in the range of submatrix(p)?
            length = lm(p + 1)
            if (p .eq. rank) then
              ! submatrix(p) is on root, just copy the column
              b(1:length,j) = a(is(rank+1):is(rank+1)+length-1,j+js(rank+1)-1)
            else
              ! search a free request holder:
              done = .false.
              do while( .not. done)
                l = l+1
                if (l .gt. maxreq) l = 1
                if (req(l) .eq. MPI_REQUEST_NULL) then
                  done = .true.
                else
                  !
                  ! MPI_Test gives .true. if req(l) .eq. MPI_REQUEST_NULL,
                  ! but the standard says nothing about this case,
                  ! so we test first if req(l) is equal to MPI_REQUEST_NULL
                  !
                  call MPI_Test(req(l),done,MPI_STATUS_IGNORE,ierror)
                endif
              enddo
              ! now we have a free request holder req(l)
              call MPI_Isend(aa(is(p + 1), j + js(p + 1) - 1), length, &
                             datatype, p, tag, comm, req(l), ierror)
            endif
            sends = sends + 1
            if (sends .ge. sends_todo) exit loop   ! finish if the number of sends
                                                   ! equals the total number of
                                                   ! sends needed
          endif
        enddo
      enddo loop
      deallocate(aa)
    else
      ! below the code for non-root:
      ! because of irecv, we must be sure that we have a contiguous matrix
      allocate(bb(size(b,1),size(b,2)))
      do j=1,nlocal
        call MPI_Irecv(bb(1,j), mlocal, datatype, root, tag, comm, req(j), ierror)
      enddo
    endif

    call MPI_Waitall(maxreq,req,MPI_STATUSES_IGNORE,ierror)

    if (rank .ne. root) then
      b = bb
      deallocate(bb)
    endif
    deallocate(req)

  end subroutine matrix_distr_send_real8
  subroutine matrix_distr_send_integer(a,b,is,lm,js,ln,root,comm)
    ! distribute matrix on processes
    !
    ! parameters:
    ! integer a(:,:) (in):  matrix to be scattered
    ! integer b(:,:) (out): local matrix to scatter to.
    !   This matrix has to be available; it is not allocated
    !   by this subroutine.
    ! integer is,lm,js,ln(number of processes) (in): description of
    !   the location of the submatrices b in matrix a:
    !   Element b(1,1) coincides with a(is(rank+1),js(rank+1)), where rank is
    !   the rank of the process (0..procs-1).
    !   The length of the 1st dimension of b is given by lm(rank+1).
    !   The length of the 2nd dimension of b is given by ln(rank+1).
    ! integer root (in): the rank of the process where a is available
    ! integer comm (in): the MPI communicator to be used
    !
    ! Note: a,is,js are used at the root process only
    !
    ! Method
    !
    ! Sends columns of submatrices to the processes using MPI_Isend.
    ! This routine tries to send as many columns in parallel
    ! to different processes as possible, see the usage of req(:)
    ! and maxreq below.
    ! The columns are received using MPI_Irecv.
    !
    use mpi
    implicit none
    integer, intent(in), dimension(:,:)  :: a
    integer, intent(out), dimension(:,:) :: b
    integer, intent(in), dimension(:)    :: is,lm
    integer, intent(in), dimension(:)    :: js,ln
    integer, intent(in)                  :: root,comm

    integer :: j,rank,ierror,l,procs
    integer :: sends, sends_todo, length
    integer :: p,mlocal,nlocal
    logical :: done
    integer :: maxreq
    integer, allocatable, dimension(:) :: req
    integer, parameter :: tag = 1
    integer, parameter :: datatype = MPI_INTEGER
    integer, allocatable, dimension(:,:) :: aa,bb

    call MPI_Comm_rank(comm, rank, ierror)
    call MPI_Comm_size(comm, procs, ierror)

    nlocal = ln(rank+1)
    mlocal = lm(rank+1)

    ! calculate the total number of sends to be done: that is equal to
    ! the total number of columns in the submatrices
    if (rank .eq. root) then
      sends_todo = sum(ln(1:procs))
      ! maxreq is the maximum number of outstanding Isend or Irecv requests.
      ! Experiments showed that the value of maxreq is not critical;
      ! a value of 10 gave only somewhat worse results on the lisa
      ! system (Xeon, infiniband). Probably, the slower the network,
      ! the higher this value can be set. The maximum usable value is
      ! the total number of columns (sends_todo, see above).
      maxreq = 1000
    else
      sends_todo = 0    ! to quiet the compiler
      maxreq = nlocal   ! on non-root processes, we need only the number
                        ! of local columns
    endif

    ! distribute matrix.
    ! to ensure that every process is as busy as possible, we do the
    ! loops on the processes in the inner loop.

    allocate(req(maxreq))
    req = MPI_REQUEST_NULL    ! initialize req with a null-value
    if ( rank .eq. root) then
      allocate(aa(size(a,1),size(a,2)))
      aa = a
      sends = 0
      l = 0
      j = 0
      loop: do
        j = j+1                          ! j runs over the columns
        do p = 0, procs - 1
          if (ln(p + 1) .ge. j) then     ! is j in the range of submatrix(p)?
            length = lm(p + 1)
            if (p .eq. rank) then
              ! submatrix(p) is on root, just copy the column
              b(1:length,j) = a(is(rank+1):is(rank+1)+length-1,j+js(rank+1)-1)
            else
              ! search a free request holder:
              done = .false.
              do while( .not. done)
                l = l+1
                if (l .gt. maxreq) l = 1
                if (req(l) .eq. MPI_REQUEST_NULL) then
                  done = .true.
                else
                  !
                  ! MPI_Test gives .true. if req(l) .eq. MPI_REQUEST_NULL,
                  ! but the standard says nothing about this case,
                  ! so we test first if req(l) is equal to MPI_REQUEST_NULL
                  !
                  call MPI_Test(req(l),done,MPI_STATUS_IGNORE,ierror)
                endif
              enddo
              ! now we have a free request holder req(l)
              call MPI_Isend(aa(is(p + 1), j + js(p + 1) - 1), length, &
                             datatype, p, tag, comm, req(l), ierror)
            endif
            sends = sends + 1
            if (sends .ge. sends_todo) exit loop   ! finish if the number of sends
                                                   ! equals the total number of
                                                   ! sends needed
          endif
        enddo
      enddo loop
      deallocate(aa)
    else
      ! below the code for non-root:
      allocate(bb(size(b,1),size(b,2)))
      do j=1,nlocal
        call MPI_Irecv(bb(1,j), mlocal, datatype, root, tag, comm, req(j), ierror)
      enddo
    endif

    do l=1,maxreq    ! wait until all requests are done
      if (req(l) .ne. MPI_REQUEST_NULL) then
        call MPI_Wait(req(l), MPI_STATUS_IGNORE, ierror)
      endif
    enddo

    if (rank .ne. root) then
      b = bb
      deallocate(bb)
    endif
    deallocate(req)

  end subroutine matrix_distr_send_integer
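  ! ----------------------------------------------------------------
  ! Message flow of the sendmat/recvmat variants (a sketch of the
  ! distribute direction; in the collect direction the data message
  ! travels from non-root to root instead):
  !
  !   root                            non-root process p
  !   MPI_Send(count, tag1)   --->    MPI_Recv(count, tag1)
  !                                   sanity check: count == size(b)
  !   MPI_Send(data,  tag2)   --->    MPI_Recv(b,     tag2)
  !
  ! The small tag1 message lets the receiver verify the expected
  ! message size before the actual data transfer starts.
  ! ----------------------------------------------------------------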
  subroutine matrix_distr_sendmat_real8(a,b,is,lm,js,ln,&
                                        root,comm)
    ! distribute matrix on root process to all processes
    !
    ! parameters:
    ! real*8 a(:,:) (in):  matrix to be scattered
    ! real*8 b(:,:) (out): local scattered matrix
    ! integer is,lm,js,ln(number of processes) (in): description of
    !   the location of the submatrices b in matrix a:
    !   Element b(1,1) coincides with a(is(rank+1),js(rank+1)), where rank is
    !   the rank of the process (0..procs-1).
    !   The length of the 1st dimension of b is given by lm(rank+1).
    !   The length of the 2nd dimension of b is given by ln(rank+1).
    ! integer root (in): the rank of the process where a is available
    ! integer comm (in): the MPI communicator to be used
    !
    ! Note: a,is,js are used at the root process only
    !
    ! Method
    !
    ! root sends the submatrices as a whole, one by one. The local
    ! submatrix is copied.
    !
    use mpi
    implicit none
    real*8, intent(in), dimension(:,:)  :: a
    real*8, intent(out), dimension(:,:) :: b
    integer, intent(in), dimension(:)   :: is, lm
    integer, intent(in), dimension(:)   :: js, ln
    integer, intent(in)                 :: root, comm

    integer :: i, rank, ierror, procs, p
    integer :: mlocal, nlocal
    integer :: acolstart, acolend, arowstart, arowend
    integer :: mp, np, k, ia, ja
    integer, parameter :: tag1 = 125
    integer, parameter :: tag2 = 126

    call MPI_Comm_rank(comm, rank, ierror)
    call MPI_Comm_size(comm, procs, ierror)

    nlocal = ln(rank+1)
    mlocal = lm(rank+1)

    ! define start-end cols and rows for the general case:
    ! computations on arow.. and acol.. are only meaningful at root
    arowstart = is(root + 1)
    arowend   = is(root + 1) + mlocal - 1
    acolstart = js(root + 1)
    acolend   = js(root + 1) + nlocal - 1

    if (rank .eq. root) then
      ! copy the relevant part of A to the local submatrix B:
      b(:,:) = a(arowstart:arowend,acolstart:acolend)
      ! now, send to each non-root process
      ! its part of matrix a
      ploop: do p = 1, procs
        if ( p - 1 .eq. root ) then
          cycle ploop
        endif
        !
        ! how many elements to send to process p-1 (= mp*np)
        ! and where to find this in a (ia:ia+mp-1, ja:ja+np-1)
        !
        mp = lm(p)
        np = ln(p)
        ia = is(p)
        ja = js(p)
        ! to avoid deadlocks, master first sends a
        ! short message to p-1. After receiving that message,
        ! p-1 will start receiving its submatrix
        call MPI_Send(mp*np, 1, MPI_INTEGER, p - 1, tag1, comm, ierror)
        ! send a submatrix to process p-1:
        call MPI_Send(a(ia:ia+mp-1,ja:ja+np-1), mp*np, MPI_DOUBLE_PRECISION, &
                      p - 1, tag2, comm, ierror)
      enddo ploop    ! loop over non-root processes
    else
      ! above is the root code, below the non-root code
      ! receive the short message from root, so I can start receiving b
      call MPI_Recv(i, 1, MPI_INTEGER,&
                    root, tag1, comm, MPI_STATUS_IGNORE, ierror)
      ! sanity check
      k = size(b)
      if (i .ne. k) then
        write(*,*)'process ',rank,' has a problem in matrix_distr_sendmat_real8: '
        write(*,*)'number of words to receive:',k
        write(*,*)'root will send: ',i
        call MPI_Abort(MPI_COMM_WORLD,1,ierror)
      endif
      call MPI_Recv(b, k, MPI_DOUBLE_PRECISION, root, &
                    tag2, comm, MPI_STATUS_IGNORE, ierror)
    endif

  end subroutine matrix_distr_sendmat_real8
  subroutine matrix_distr_sendmat_integer(a,b,is,lm,js,ln,&
                                          root,comm)
    ! distribute matrix on root process to all processes
    !
    ! parameters:
    ! integer a(:,:) (in):  matrix to be scattered
    ! integer b(:,:) (out): local scattered matrix
    ! integer is,lm,js,ln(number of processes) (in): description of
    !   the location of the submatrices b in matrix a:
    !   Element b(1,1) coincides with a(is(rank+1),js(rank+1)), where rank is
    !   the rank of the process (0..procs-1).
    !   The length of the 1st dimension of b is given by lm(rank+1).
    !   The length of the 2nd dimension of b is given by ln(rank+1).
    ! integer root (in): the rank of the process where a is available
    ! integer comm (in): the MPI communicator to be used
    !
    ! Note: a,is,js are used at the root process only
    !
    ! Method
    !
    ! root sends the submatrices as a whole, one by one. The local
    ! submatrix is copied.
    !
    use mpi
    implicit none
    integer, intent(in), dimension(:,:)  :: a
    integer, intent(out), dimension(:,:) :: b
    integer, intent(in), dimension(:)    :: is, lm
    integer, intent(in), dimension(:)    :: js, ln
    integer, intent(in)                  :: root, comm

    integer :: i, rank, ierror, procs, p
    integer :: mlocal, nlocal
    integer :: acolstart, acolend, arowstart, arowend
    integer :: mp, np, k, ia, ja
    integer, parameter :: tag1 = 125
    integer, parameter :: tag2 = 126

    call MPI_Comm_rank(comm, rank, ierror)
    call MPI_Comm_size(comm, procs, ierror)

    nlocal = ln(rank+1)
    mlocal = lm(rank+1)

    ! define start-end cols and rows for the general case:
    ! computations on arow.. and acol.. are only meaningful at root
    arowstart = is(root + 1)
    arowend   = is(root + 1) + mlocal - 1
    acolstart = js(root + 1)
    acolend   = js(root + 1) + nlocal - 1

    if (rank .eq. root) then
      ! copy the relevant part of A to the local submatrix B:
      b(:,:) = a(arowstart:arowend,acolstart:acolend)
      ! now, send to each non-root process
      ! its part of matrix a
      ploop: do p = 1, procs
        if ( p - 1 .eq. root ) then
          cycle ploop
        endif
        !
        ! how many elements to send to process p-1 (= mp*np)
        ! and where to find this in a (ia:ia+mp-1, ja:ja+np-1)
        !
        mp = lm(p)
        np = ln(p)
        ia = is(p)
        ja = js(p)
        ! to avoid deadlocks, master first sends a
        ! short message to p-1. After receiving that message,
        ! p-1 will start receiving its submatrix
        call MPI_Send(mp*np, 1, MPI_INTEGER, p - 1, tag1, comm, ierror)
        ! send a submatrix to process p-1
        call MPI_Send(a(ia:ia+mp-1,ja:ja+np-1), mp*np, MPI_INTEGER, &
                      p - 1, tag2, comm, ierror)
      enddo ploop    ! loop over non-root processes
    else
      ! above is the root code, below the non-root code
      ! receive the short message from root, so I can start receiving b
      call MPI_Recv(i, 1, MPI_INTEGER,&
                    root, tag1, comm, MPI_STATUS_IGNORE, ierror)
      ! sanity check
      k = size(b)
      if (i .ne. k) then
        write(*,*)'process ',rank,' has a problem in matrix_distr_sendmat_integer: '
        write(*,*)'number of words to receive:',k
        write(*,*)'root will send: ',i
        call MPI_Abort(MPI_COMM_WORLD,1,ierror)
      endif
      call MPI_Recv(b, k, MPI_INTEGER, root, &
                    tag2, comm, MPI_STATUS_IGNORE, ierror)
    endif

  end subroutine matrix_distr_sendmat_integer
  subroutine matrix_coll_gather_real8(a,b,is,lm,js,ln, &
                                      isleft,isright,istop,isbot,root,comm)
    ! collect matrix on root process
    !
    ! parameters:
    ! real*8 a(:,:) (out): matrix to be collected.
    !   This matrix has to be available; it is not allocated
    !   by this subroutine.
    ! real*8 b(:,:) (in):  local scattered matrix
    ! integer is,lm,js,ln(number of processes) (in): description of
    !   the location of the submatrices b in matrix a:
    !   Element b(1,1) coincides with a(is(rank+1),js(rank+1)), where rank is
    !   the rank of the process (0..procs-1).
    !   The length of the 1st dimension of b is given by lm(rank+1).
    !   The length of the 2nd dimension of b is given by ln(rank+1).
    ! logical isleft, isright, istop, isbot (in): is this submatrix at the
    !   left, right, top or bottom of the general matrix
    ! integer root (in): the rank of the process where a is available
    ! integer comm (in): the MPI communicator to be used
    !
    ! Note: a,is,js are used at the root process only
    !
    ! Method
    !
    ! Use gatherv. Each gatherv call collects one column of
    ! all submatrices.
    !
    ! NOTE: the matrices are considered to be bordered by a column
    ! on the left and right and a row on the top and bottom. So, in
    ! general, only b(2:mb-1,2:nb-1) is sent to the master process.
    ! HOWEVER: if matrix b is
    !  on the left:   b(:,1)  is also sent
    !  on the right:  b(:,nb) is also sent
    !  on the top:    b(1,:)  is also sent
    !  on the bottom: b(mb,:) is also sent
    ! The master process receives a(1:ma,1:na).
    ! dimension of b: (1:mb,1:nb)
    ! dimension of a: (1:ma,1:na)
    !
    use mpi
    use xmpi_module
    implicit none
    real*8, intent(out), dimension(:,:) :: a
    real*8, intent(in), dimension(:,:)  :: b
    integer, intent(in), dimension(:)   :: is,lm
    integer, intent(in), dimension(:)   :: js,ln
    logical, intent(in), dimension(:)   :: isleft, isright, istop, isbot
    integer, intent(in)                 :: root,comm

    integer :: j,p,rank,ierror,procs,jj,ma,jmax,k
    integer :: nlocal,mlocal,scnt
    integer, allocatable, dimension(:)  :: displs, cnts
    logical :: isleftl, isrightl, istopl, isbotl
    real*8, allocatable, dimension(:,:) :: aa

#ifdef USEMPE
    call MPE_Log_event(event_coll_start,0,'cstart')
#endif
    call MPI_Comm_rank(comm, rank, ierror)
    call MPI_Comm_size(comm, procs, ierror)

    nlocal   = ln(rank+1)
    mlocal   = lm(rank+1)
    isleftl  = isleft(rank+1)
    isrightl = isright(rank+1)
    istopl   = istop(rank+1)
    isbotl   = isbot(rank+1)

    ma = 1    ! to quiet the compiler
    if (rank .eq. root) then
      ma = size(a,1)
    endif

    if (rank .eq. root) then
      allocate(displs(procs), cnts(procs))
      ! need a contiguous matrix, otherwise the counts and
      ! displacements will be in error
      allocate(aa(size(a,1),size(a,2)))
    else
      allocate(displs(1),cnts(1))   ! need valid addresses on non-root
      allocate(aa(1,1))
    endif

    jmax = maxval(ln(1:procs))      ! number of gathers needed

    do j=1,jmax
      if (rank .eq. root) then      ! displacements and counts only needed on root
        do p=1,procs
          if (ln(p) .ge. j) then
            !
            ! Normally, the first and last rows and columns (the borders)
            ! are not to be sent to root, but we make an exception for
            ! the rows and columns that are the first or last rows and
            ! columns of the global matrix. Start from the whole column
            ! j of submatrix p and correct for the borders below.
            !
            displs(p) = (js(p) -1 + j - 1)*ma + is(p) - 1  ! displacement of column j
                                                           ! of submatrix p
            cnts(p)   = lm(p)                              ! number of elements in
                                                           ! this column
            if ( .not. isleft(p) ) then
              ! this is a submatrix, not on the left.
              if ( j .eq. 1 ) then
                ! The first column is not to be sent,
                ! so make cnts(p) = 0, and give displs(p) a
                ! valid value
                displs(p) = 0
                cnts(p)   = 0
              endif
            endif
            if ( .not. isright(p) ) then
              ! this is a submatrix, not on the right
              if ( j .eq. ln(p) ) then
                ! The last column is not to be sent,
                ! so make cnts(p) = 0, and give displs(p) a
                ! valid value
                displs(p) = 0
                cnts(p)   = 0
              endif
            endif
            if ( .not. istop(p) ) then
              ! this is a submatrix, not on the top,
              ! so we don't send the first element of column j;
              ! adapt displs(p) and cnts(p) accordingly:
              displs(p) = displs(p)+1
              cnts(p)   = max(cnts(p)-1,0)
            endif
            if ( .not. isbot(p) ) then
              ! this is a submatrix, not on the bottom,
              ! so we don't send the last element of column j;
              ! adapt cnts(p) accordingly:
              cnts(p) = max(cnts(p)-1,0)
            endif
          else
            displs(p) = 0   ! we are out of columns, use a valid value for
                            ! the displacement
            cnts(p)   = 0   ! and set the number of elements to receive for
                            ! this submatrix to zero
          endif   ! ln(p) .ge. j
        enddo     ! do p = 1,procs
      endif       ! rank .eq. root

      jj   = j        ! column of submatrix to send
      scnt = mlocal   ! number of elements to send
      k    = 1        ! starting element of column j to send
      if (nlocal .ge. j) then
        if (.not. isleftl ) then
          if ( j .eq. 1 ) then
            jj   = 1
            scnt = 0
          endif
        endif
        if (.not. isrightl ) then
          if ( j .eq. nlocal ) then
            jj   = 1
            scnt = 0
          endif
        endif
        if (.not. istopl ) then
          k    = k+1
          scnt = max(scnt-1, 0)
        endif
        if (.not. isbotl ) then
          scnt = max(scnt-1, 0)
        endif
      else
        jj   = 1   ! out of columns, use a valid value for jj
        scnt = 0   ! and put the number of elements to send to zero
      endif
      call MPI_Gatherv(b(k:k+scnt-1,jj), scnt, MPI_DOUBLE_PRECISION, &
                       aa(1,1), cnts, displs, MPI_DOUBLE_PRECISION,  &
                       root, comm, ierror)
    enddo

    if (rank .eq. root) then
      a = aa
    endif
    deallocate(displs,cnts,aa)

  end subroutine matrix_coll_gather_real8
  subroutine matrix_coll_recv_real8 (a,b,is,lm,js,ln,&
                                     isleft,isright,istop,isbot,root,comm)
    ! collect matrix on root process
    !
    ! parameters:
    ! real*8 a(:,:) (out): matrix to be collected.
    !   This matrix has to be available; it is not allocated
    !   by this subroutine.
    ! real*8 b(:,:) (in):  local scattered matrix
    ! integer is,lm,js,ln(number of processes) (in): description of
    !   the location of the submatrices b in matrix a:
    !   Element b(1,1) coincides with a(is(rank+1),js(rank+1)), where rank is
    !   the rank of the process (0..procs-1).
    !   The length of the 1st dimension of b is given by lm(rank+1).
    !   The length of the 2nd dimension of b is given by ln(rank+1).
    ! logical isleft, isright, istop, isbot (in): is this submatrix at the
    !   left, right, top or bottom of the general matrix
    ! integer root (in): the rank of the process where a is available
    ! integer comm (in): the MPI communicator to be used
    !
    ! Note: a,is,js are used at the root process only
    !
    ! Method
    !
    ! Receive columns of submatrices using MPI_Probe and MPI_Recv.
    ! The non-root processes use MPI_Isend to send the columns.
    !
    ! NOTE: the matrices are considered to be bordered by a column
    ! on the left and right and a row on the top and bottom. So, in
    ! general, only b(2:mb-1,2:nb-1) is sent to the master process.
    ! HOWEVER: if matrix b is
    !  on the left:   b(:,1)  is also sent
    !  on the right:  b(:,nb) is also sent
    !  on the top:    b(1,:)  is also sent
    !  on the bottom: b(mb,:) is also sent
    ! The master process receives a(1:ma,1:na).
    ! dimension of b: (1:mb,1:nb)
    ! dimension of a: (1:ma,1:na)
    !
    use mpi
    implicit none
    real*8, intent(out), dimension(:,:) :: a
    real*8, intent(in), dimension(:,:)  :: b
    integer, intent(in), dimension(:)   :: is,lm
    integer, intent(in), dimension(:)   :: js,ln
    logical, intent(in), dimension(:)   :: isleft,isright,istop,isbot
    integer, intent(in)                 :: root,comm

    integer :: i,j,rank,ierror,l,procs,ii,jj
    integer :: jstart, jend, p, m, n
    integer :: recvs, recvs_todo
    integer :: mlocal,nlocal,sender
    logical :: isleftl, isrightl, istopl, isbotl
    integer, allocatable, dimension(:) :: col
    integer, allocatable, dimension(:) :: req
    integer, parameter :: tag = 2
    integer, dimension(MPI_STATUS_SIZE) :: mpistatus
    real*8, allocatable, dimension(:,:) :: bb

    call MPI_Comm_rank(comm, rank, ierror)
    call MPI_Comm_size(comm, procs, ierror)

    nlocal   = ln(rank+1)
    mlocal   = lm(rank+1)
    isleftl  = isleft(rank+1)
    isrightl = isright(rank+1)
    istopl   = istop(rank+1)
    isbotl   = isbot(rank+1)

    if (rank .eq. root) then
      ! calculate the total number of recvs to be done: that is equal to
      ! the total number of columns in the submatrices
      recvs_todo = sum(ln(1:procs))
      recvs_todo = recvs_todo - nlocal   ! correct for the locally available columns
      do p=1,procs                       ! correct for borders
        if (p .eq. root+1) then
          cycle
        endif
        if ( .not. isleft(p)) then
          recvs_todo = recvs_todo - 1
        endif
        if ( .not. isright(p)) then
          recvs_todo = recvs_todo - 1
        endif
      enddo

      allocate(col(procs))   ! to remember which was the last column
                             ! received for submatrix(p)
      col   = 0
      recvs = 0

      ! copy submatrix on root:
      i  = 1
      ii = is(rank+1)
      m  = mlocal
      j  = 1
      jj = js(rank+1)
      n  = nlocal
      ! take care of borders:
      if ( .not. isleftl ) then
        j  = j + 1
        jj = jj + 1
      endif
      if ( .not. isrightl ) then
        n = n - 1
      endif
      if ( .not. istopl ) then
        i  = i + 1
        ii = ii + 1
        m  = m - 1
      endif
      if ( .not. isbotl ) then
        m = m - 1
      endif
      a(ii:ii+m-1,jj:jj+n-1) = b(i:i+m-1,j:j+n-1)

      do recvs=1,recvs_todo   ! loop over the total number of receives needed
        ! wait until anybody has sent a column to root:
        call MPI_Probe(MPI_ANY_SOURCE, tag, comm, mpistatus, ierror)
        ! find out who was the sender:
        sender = mpistatus(MPI_SOURCE)
        ! update col, and compute i and j: the start location of
        ! the column to be received
        p      = sender + 1
        col(p) = col(p) + 1
        i      = is(p)
        j      = col(p)+js(p)-1
        m      = lm(p)
        ! correct for borders:
        if ( .not. isleft(p)) then
          j = j + 1
        endif
        if ( .not. isright(p)) then
          ! nothing to correct here
        endif
        if ( .not. istop(p)) then
          i = i + 1
          m = m - 1
        endif
        if ( .not. isbot(p)) then
          m = m - 1
        endif
        ! perform the actual receive:
        call MPI_Recv(a(i:i+m-1,j),m,MPI_DOUBLE_PRECISION, &
                      sender,tag,comm,MPI_STATUS_IGNORE,ierror)
      enddo
      deallocate(col)
      ! deallocate(isleft, isright, istop, isbot)
    else
      ! non-root code
      allocate(req(nlocal))    ! for holding the requests
      allocate(bb(size(b,1),size(b,2)))
      bb  = b                  ! want to be sure that the matrix to send is
                               ! contiguous, because isend's are used
      req = MPI_REQUEST_NULL

      jstart = 1
      jend   = nlocal
      i      = 1
      m      = mlocal
      ! take care of borders
      if ( .not. isleftl ) then
        jstart = jstart + 1
      endif
      if ( .not. isrightl ) then
        jend = jend - 1
      endif
      if ( .not. istopl ) then
        i = i + 1
        m = m - 1
      endif
      if ( .not. isbotl ) then
        m = m - 1
      endif
      do j = jstart, jend
        call MPI_Isend(bb(i,j), m, MPI_DOUBLE_PRECISION, root, tag, comm, req(j), ierror)
      enddo
      do l=1,nlocal   ! wait until all requests have been handled
        if (req(l) .ne. MPI_REQUEST_NULL) then
          call MPI_Wait(req(l),MPI_STATUS_IGNORE,ierror)
        endif
      enddo
      deallocate(req,bb)
    endif

  end subroutine matrix_coll_recv_real8
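  ! ----------------------------------------------------------------
  ! Border bookkeeping in the collect routines, a worked example
  ! (the sizes are illustrative only): a local b(1:5,1:4)
  ! (mb = 5, nb = 4) lying at the top-left of the global matrix
  ! (istop and isleft true, isbot and isright false) contributes
  ! b(1:4,1:3) to a: its first row and first column are borders of
  ! the global matrix itself, while its last row and last column
  ! are interior borders owned by neighbouring processes.
  ! ----------------------------------------------------------------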
  subroutine matrix_coll_recvmat_real8(a,b,is,lm,js,ln,&
                                       isleft,isright,istop,isbot,root,comm)
    ! collect matrix on root process
    !
    ! parameters:
    ! real*8 a(:,:) (out): matrix to be collected.
    !   This matrix has to be available; it is not allocated
    !   by this subroutine.
    ! real*8 b(:,:) (in):  local scattered matrix
    ! integer is,lm,js,ln(number of processes) (in): description of
    !   the location of the submatrices b in matrix a:
    !   Element b(1,1) coincides with a(is(rank+1),js(rank+1)), where rank is
    !   the rank of the process (0..procs-1).
    !   The length of the 1st dimension of b is given by lm(rank+1).
    !   The length of the 2nd dimension of b is given by ln(rank+1).
    ! logical isleft, isright, istop, isbot (in): is this submatrix at the
    !   left, right, top or bottom of the general matrix
    ! integer root (in): the rank of the process where a is available
    ! integer comm (in): the MPI communicator to be used
    !
    ! Note: a,is,js are used at the root process only
    !
    ! Method
    !
    ! root receives the submatrices as a whole, one by one. The local
    ! submatrix is copied.
    !
    ! NOTE: the matrices are considered to be bordered by a column
    ! on the left and right and a row on the top and bottom. So, in
    ! general, only b(2:mb-1,2:nb-1) is sent to the master process.
    ! HOWEVER: if matrix b is
    !  on the left:   b(:,1)  is also sent
    !  on the right:  b(:,nb) is also sent
    !  on the top:    b(1,:)  is also sent
    !  on the bottom: b(mb,:) is also sent
    ! The master process receives a(1:ma,1:na).
    ! dimension of b: (1:mb,1:nb)
    ! dimension of a: (1:ma,1:na)
    !
    use mpi
    implicit none
    real*8, intent(out), dimension(:,:) :: a
    real*8, intent(in), dimension(:,:)  :: b
    integer, intent(in), dimension(:)   :: is, lm
    integer, intent(in), dimension(:)   :: js, ln
    logical, intent(in), dimension(:)   :: isleft, isright, istop, isbot
    integer, intent(in)                 :: root, comm

    integer :: i, rank, ierror, procs, p
    integer :: mlocal, nlocal
    !logical, allocatable, dimension(:) :: isleft, isright, istop, isbot
    integer, parameter :: tag = 2
    integer :: acolstart, acolend, arowstart, arowend
    integer :: bcolstart, bcolend, browstart, browend
    integer :: mp, np, k, ia, ja
    integer, parameter :: tag1 = 123
    integer, parameter :: tag2 = 124
    logical :: isleftl, isrightl, istopl, isbotl

    call MPI_Comm_rank(comm, rank, ierror)
    call MPI_Comm_size(comm, procs, ierror)

    nlocal   = ln(rank+1)
    mlocal   = lm(rank+1)
    isleftl  = isleft(rank+1)
    isrightl = isright(rank+1)
    istopl   = istop(rank+1)
    isbotl   = isbot(rank+1)

    ! define start-end cols and rows for the general case:
    ! computations on arow.. and acol.. are only meaningful at root
    browstart = 2
    browend   = mlocal - 1
    bcolstart = 2
    bcolend   = nlocal - 1
    arowstart = is(root + 1) + 1
    arowend   = is(root + 1) + mlocal - 2
    acolstart = js(root + 1) + 1
    acolend   = js(root + 1) + nlocal - 2

    ! correct for the border cases:
    if (isleftl) then
      bcolstart = bcolstart - 1
      acolstart = acolstart - 1
    endif
    if (isrightl) then
      bcolend = bcolend + 1
      acolend = acolend + 1
    endif
    if (istopl) then
      browstart = browstart - 1
      arowstart = arowstart - 1
    endif
    if (isbotl) then
      browend = browend + 1
      arowend = arowend + 1
    endif

    if (rank .eq. root) then
      ! copy the local submatrix B to A:
      a(arowstart:arowend,acolstart:acolend) = &
      b(browstart:browend,bcolstart:bcolend)
      ! now, collect from each non-root process
      ! its matrix b, and put it in the proper place in a
      ploop: do p = 1, procs
        if ( p - 1 .eq. root ) then
          cycle ploop
        endif
        !
        ! how many elements to expect from process p-1 (= mp*np)
        ! and where to put this in a (ia:ia+mp-1, ja:ja+np-1)
        !
        mp = lm(p) - 2
        np = ln(p) - 2
        ia = is(p) + 1
        ja = js(p) + 1
        ! correction if b is at a border:
        if (isleft(p)) then
          ja = ja - 1
          np = np + 1
        endif
        if (isright(p)) then
          np = np + 1
        endif
        if (istop(p)) then
          ia = ia - 1
          mp = mp + 1
        endif
        if (isbot(p)) then
          mp = mp + 1
        endif
        ! to avoid deadlocks, master first sends a
        ! short message to p-1. After receiving that message,
        ! p-1 will start sending its submatrix
        call MPI_Send(mp*np, 1, MPI_INTEGER, p - 1, tag1, comm, ierror)
        ! receive a submatrix from process p-1
        call MPI_Recv(a(ia:ia+mp-1,ja:ja+np-1), mp*np, MPI_DOUBLE_PRECISION, &
                      p - 1, tag2, comm, MPI_STATUS_IGNORE, ierror)
      enddo ploop    ! loop over non-root processes
    else
      ! above is the root code, below the non-root code
      ! receive the short message from root, so I can start sending b
      call MPI_Recv(i, 1, MPI_INTEGER,&
                    root, tag1, comm, MPI_STATUS_IGNORE, ierror)
      ! which part of b to send: b(browstart:browend,bcolstart:bcolend)
      ! sanity check
      k = (browend - browstart + 1)*(bcolend - bcolstart + 1)
      if (i .ne. k) then
        write(*,*)'process ',rank,' has a problem in matrix_coll_recvmat_real8: '
        write(*,*)'number of words to send:',k
        write(*,*)'root expects: ',i
        do i=1,procs
          write(*,*)i-1,isleft(i),isright(i),istop(i),isbot(i)
        enddo
        write(*,*)isleftl,isrightl,istopl,isbotl
        call MPI_Abort(MPI_COMM_WORLD,1,ierror)
      endif
      call MPI_Send(b(browstart:browend,bcolstart:bcolend), k, &
                    MPI_DOUBLE_PRECISION, root, tag2, comm, ierror)
    endif

  end subroutine matrix_coll_recvmat_real8
  subroutine decomp(n, numprocs, myid, s, e)
    ! From the mpich distribution: MPE_Decomp1d
    !
    ! n        : integer, input (for example: number of tasks or
    !            dimension of an array)
    ! numprocs : integer, input (number of processes)
    ! myid     : integer, input (MPI id of this process)
    ! s        : integer, output
    ! e        : integer, output
    !
    ! The purpose of this subroutine is to get an as equal as possible
    ! division of the integers 1..n among numprocs intervals.
    ! A common use is to divide work among MPI processes.
    !
    ! example:
    !
    !  integer numprocs, ier, myid, n, s, e, i
    !  call MPI_Comm_size(MPI_COMM_WORLD, numprocs, ier)
    !  call MPI_Comm_rank(MPI_COMM_WORLD, myid, ier)
    !  n = 1000
    !  call decomp(n, numprocs, myid, s, e)
    !  do i = s,e
    !    ...
    !  enddo
    !
    implicit none
    integer, intent(in) :: n, numprocs, myid
    integer, intent(out):: s, e
    integer nlocal
    integer deficit

    nlocal  = n / numprocs
    s       = myid * nlocal + 1
    deficit = mod(n,numprocs)
    s       = s + min(myid,deficit)
    if (myid .lt. deficit) then
      nlocal = nlocal + 1
    endif
    e = s + nlocal - 1
    if (e .gt. n .or. myid .eq. numprocs-1) then
      e = n
    endif
  end subroutine decomp
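  ! ----------------------------------------------------------------
  ! Worked example of decomp's division (illustrative numbers):
  ! n = 10, numprocs = 3 gives nlocal = 3 and deficit = 1, so
  !   myid = 0: s = 1, e = 4   (gets the one extra element)
  !   myid = 1: s = 5, e = 7
  !   myid = 2: s = 8, e = 10
  ! det_submatrices below applies this per grid direction to the
  ! interior size: e.g. ma = na = 6 with mp = np = 2 calls
  ! decomp(4,2,..), giving (s,e) = (1,2) and (3,4), and hence
  ! is = (/1,3,1,3/), js = (/1,1,3,3/) and lm = ln = 4: neighbouring
  ! submatrices overlap in the two shared border rows/columns.
  ! ----------------------------------------------------------------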
  subroutine det_submatrices(ma,na,mp,np,is,lm,js,ln,isleft,isright,istop,isbot)
    !
    ! determine a division of a ma*na matrix on mp*np processes
    !
    ! ma: integer(in): 1st dimension of the matrix to divide
    ! na: integer(in): 2nd dimension of this matrix
    ! mp: integer(in): 1st dimension of the processor grid
    ! np: integer(in): 2nd dimension of the processor grid
    ! is(mp*np): integer(out): is(i) will be equal to the
    !   start row of the i-th submatrix
    ! lm(mp*np): integer(out): lm(i) will be equal to the
    !   first dimension of the i-th submatrix
    ! js(mp*np): integer(out): js(i) will be equal to the
    !   start column of the i-th submatrix
    ! ln(mp*np): integer(out): ln(i) will be equal to the
    !   2nd dimension of the i-th submatrix
    ! isleft(mp*np): logical(out): isleft(i) is true if the i-th submatrix
    !   shares the first column with the global matrix: a(:,1)
    ! isright(mp*np): logical(out): isright(i) is true if the i-th submatrix
    !   shares the last column with the global matrix: a(:,na)
    ! istop(mp*np): logical(out): istop(i) is true if the i-th submatrix
    !   shares the first row with the global matrix: a(1,:)
    ! isbot(mp*np): logical(out): isbot(i) is true if the i-th submatrix
    !   shares the last row with the global matrix: a(ma,:)
    !
    ! All matrices are 'bordered': the first and last columns and rows
    ! are the borders.
    ! So, the first column of a submatrix B, somewhere in the middle of
    ! the global matrix, shares its first column (the border) with
    ! the second-last column of its left neighbour A, while A
    ! shares its last column with the 2nd column of B.
    ! Later on, the process that holds A will compute A(2:m-1,2:n-1)
    ! (m and n the dimensions of A), using the values of
    ! A(1,:), A(m,:), A(:,1), A(:,n). These rows and columns are
    ! computed by the neighbour processes and communicated at
    ! each time step.
    !
    implicit none
    integer, intent(in)                :: ma,na,mp,np
    integer, dimension(:), intent(out) :: is,lm,js,ln
    logical, dimension(:), intent(out) :: isleft,isright,istop,isbot

    integer i,j,s,e,k

    isleft  = .false.
    isright = .false.
    istop   = .false.
    isbot   = .false.

    do i = 1, mp
      call decomp(ma-2, mp, i - 1, s, e)
      k = 0
      do j = 1, np
        if (j .eq. 1) then
          isleft(i+k) = .true.
        endif
        if (j .eq. np) then
          isright(i+k) = .true.
        endif
        is(i + k) = s
        lm(i + k) = e - s + 3
        k = k + mp
      enddo
    enddo

    k = 0
    do j=1, np
      call decomp(na-2, np, j - 1, s, e)
      do i = 1, mp
        if (i .eq. 1) then
          istop(i+k) = .true.
        endif
        if (i .eq. mp) then
          isbot(i+k) = .true.
        endif
        js(i + k) = s
        ln(i + k) = e - s + 3
      enddo
      k = k + mp
    enddo

  end subroutine det_submatrices

  subroutine shift_borders_matrix_real8(a,left,right,top,bot,comm)
    !
    ! shift borders to and from neighbours
    !
    use mpi
    implicit none
    real*8, dimension(:,:), intent(inout) :: a
    integer, intent(in)                   :: left,right,top,bot,comm

    integer, parameter :: datatype = MPI_DOUBLE_PRECISION
    integer ierror,ma,na

    ma = ubound(a,1)
    na = ubound(a,2)

    ! send to left, receive from right
    call MPI_Sendrecv(a(2:ma-1,2),    ma-2, datatype, &
                      left, 1,                        &
                      a(2:ma-1,na),   ma-2, datatype, &
                      right, 1,                       &
                      comm, MPI_STATUS_IGNORE, ierror)
    ! send to right, receive from left
    call MPI_Sendrecv(a(2:ma-1,na-1), ma-2, datatype, &
                      right, 2,                       &
                      a(2:ma-1,1),    ma-2, datatype, &
                      left, 2,                        &
                      comm, MPI_STATUS_IGNORE, ierror)
    ! send to bot, receive from top
    call MPI_Sendrecv(a(ma-1,:),      na,   datatype, &
                      bot, 3,                         &
                      a(1,:),         na,   datatype, &
                      top, 3,                         &
                      comm, MPI_STATUS_IGNORE, ierror)
    ! send to top, receive from bot
    call MPI_Sendrecv(a(2,:),         na,   datatype, &
                      top, 4,                         &
                      a(ma,:),        na,   datatype, &
                      bot, 4,                         &
                      comm, MPI_STATUS_IGNORE, ierror)

  end subroutine shift_borders_matrix_real8
#endif
end module general_mpi_module