XBeach
C:/repositories/XBeach/trunk/src/xbeachlibrary/general_mpi_new.F90
00001 module general_mpi_module
00002 #ifdef USEMPI
00003 
00004    ! This module contains three variants of subroutines to scatter
00005    ! a matrix from the root process to the other processes.
00006    ! matrix_distr_scatter: uses MPI_Scatterv, scattering groups
00007    !                       of columns
00008    ! matrix_distr_send:    uses MPI_Isend and MPI_Irecv, scattering
00009    !                       groups of columns
00010    ! matrix_distr_sendmat: uses MPI_Send and MPI_Recv, scattering
00011    !                       submatrices as a whole
00012    !
00013    ! and the opposite operation:
00014    ! matrix_coll_gather:   uses MPI_Gatherv, gathering groups
00015    !                       of columns
00016    ! matrix_coll_recv:     uses MPI_Isend and MPI_Recv, gathering groups
00017    !                       of columns
00018    ! matrix_coll_recvmat:  uses MPI_Send and MPI_Recv, gathering
00019    !                       submatrices as a whole
00020    !
00021    !  NOTE: the matrices are considered to be bordered by columns
00022    !   on the left and right and by rows at the top and bottom. So, in general,
00023    !   only b(2:mb-1,2:nb-1) is sent to the master process.
00024    !   HOWEVER: if matrix b is
00025    !   on the left: b(:,1) is also sent
00026    !   on the right: b(:,nb) is also sent
00027    !   on the top:   b(1,:)  is also sent
00028    !   on the bottom: b(mb,:) is also sent
00029    ! The master process receives a(1:ma,1:na).
00030    !   dimension of b (1:mb,1:nb).
00031    !   dimension of a (1:ma,1:na)
00032    !
00033 
00034    ! The parameter lists are the same.
00035    ! On the lisa system at sara (Xeon, infiniband), the send variety
00036    ! is about twice as fast as the scatter variety.
00037    !
00038    ! The most probable cause is that the scatter variety uses a number
00039    ! of MPI_Scatterv calls, each of which starts only after the previous
00040    ! one has completed, while in the send variety many MPI_Isend's and
00041    ! MPI_Irecv's are executed in parallel, without waiting.
00042    ! There is no non-blocking MPI_Scatterv available, alas.
00043    !
00044    ! On the same system, there is no significant difference in
00045    ! performance between matrix_coll_gather and matrix_coll_recv.
00046    !
00047    ! The recvmat and sendmat varieties are even faster than the
00048    ! recv and send varieties. Moreover, the code is simpler, so
00049    ! it is best to use these; see the interfaces matrix_distr and matrix_coll.
00050 
00051    ! Most of the subroutines end with an MPI_Barrier; this was
00052    ! necessary for usage with openmpi, at least for the
00053    ! distribution routines. The reason was unclear at first, but
00054    ! it turned out to be a bug in the shared memory interface of
00055    ! openmpi.
00056    !
00057 
00058 
00059    interface matrix_distr
00060       !  module procedure matrix_distr_send_real8
00061       !  module procedure matrix_distr_send_integer
00062       !  module procedure matrix_distr_scatter_real8
00063       !  module procedure matrix_distr_scatter_integer
00064       module procedure matrix_distr_sendmat_real8
00065       module procedure matrix_distr_sendmat_integer
00066    end interface matrix_distr
00067 
00068    interface matrix_coll
00069       !  module procedure matrix_coll_gather_real8
00070       !  module procedure matrix_coll_recv_real8
00071       module procedure matrix_coll_recvmat_int
00072       module procedure matrix_coll_recvmat_real8
00073    end interface matrix_coll
00074 
00075    interface shift_borders
00076       module procedure shift_borders_matrix_real8
00077    end interface shift_borders
00078 
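   ! Illustrative usage sketch: assuming a caller has already set up the
   ! decomposition arrays is, lm, js, ln (and, for collecting, the
   ! isleft/isright/istop/isbot flags), the generic interfaces above could be
   ! used roughly as follows; aglobal and blocal are placeholder names for
   ! this example only:
   !
   !    call matrix_distr(aglobal, blocal, is, lm, js, ln, root, comm)
   !    ! ... compute on the local submatrix blocal ...
   !    call matrix_coll(aglobal, blocal, is, lm, js, ln, &
   !                     isleft, isright, istop, isbot, root, comm)
   !
   ! With the module procedures selected above, matrix_distr resolves to the
   ! sendmat variants and matrix_coll to the recvmat variants.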
00079 contains
00080    subroutine vector_distr_send(a,b,is,lm,root,comm)
00081       !
00082       ! Distribute vector a on processes
00083       ! Process P receives a(is(P):is(P)+lm(P)-1)
00084       !
00085       ! Parameters:
00086       ! real*8 a (in): vector to distribute, only used at root
00087       ! real*8 b (out): local vector
00088       ! integer is(number-of-processes) (in): see above: needed on all processes
00089       ! integer lm(number-of-processes) (in): see above: needed on all processes
00090       ! integer root (in): process that holds a, is, lm
00091       ! integer comm (in): MPI communicator
00092       !
00093       ! Method: root process sends, using MPI_Isend parts of a to
00094       !         the other processes. On the root process a copy is
00095       !         made.
00096       !
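      ! For illustration (example values only): with procs = 3,
      ! is = (/1,5,9/), lm = (/4,4,4/) and root = 0,
      !   rank 0 copies   a(1:4)  into its b(1:4),
      !   rank 1 receives a(5:8)  into its b(1:4),
      !   rank 2 receives a(9:12) into its b(1:4).
      !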
00097       use mpi
00098       implicit none
00099       real*8, dimension(:), intent(in)  :: a
00100       real*8, dimension(:), intent(out) :: b
00101       integer, dimension(:), intent(in) :: is
00102       integer, dimension(:), intent(in) :: lm
00103       integer, intent(in)               :: root
00104       integer, intent(in)               :: comm
00105 
00106       integer                             :: ierror,p,procs,rank,pp,mlocal
00107       integer, dimension(:), allocatable  :: req
00108       real*8, dimension(:), allocatable   :: aa
00109 
00110       call MPI_Comm_rank(comm, rank, ierror)
00111       call MPI_Comm_size(comm, procs, ierror)
00112       allocate(req(procs-1))
00113 
00114       mlocal = lm(rank+1)
00115 
00116       pp=0
00117       if ( rank .eq. root ) then
00118          allocate(aa(size(a)))
00119          aa = a               ! must have contiguous memory for a because
00120          ! of mpi_isend
00121          do p = 1,procs
00122             if ( p-1 .eq. rank ) then
00123                b(1:lm(p)) = a(is(p) : is(p)+lm(p)-1)
00124             else
00125                pp = pp + 1
00126                call MPI_Isend(aa(is(p)), lm(p), MPI_DOUBLE_PRECISION, p-1, &
00127                11, comm, req(pp), ierror)
00128             endif
00129          enddo
00130          call MPI_Waitall(pp,req,MPI_STATUSES_IGNORE,ierror)
00131          deallocate(aa)
00132       else
00133          call MPI_Recv(b, mlocal, MPI_DOUBLE_PRECISION, root, 11,&
00134          comm, MPI_STATUS_IGNORE, ierror)
00135       endif
00136 
00137       deallocate(req)
00138 
00139 
00140    end subroutine vector_distr_send
00141 
00142    subroutine matrix_distr_scatter_real8(a,b,is,lm,js,ln,root,comm)
00143       ! distribute matrix on processes
00144       !
00145       ! parameters:
00146       ! real*8 a(:,:) (in): matrix to be scattered.
00147       ! real*8 b(:,:) (out): local matrix to scatter to.
00148       !                      This matrix has to be available, it is not
00149       !                      allocated by this subroutine.
00150       ! integer is,lm,js,ln(number of processes) (in): description of
00151       !   the location of the submatrices b in matrix a:
00152       !   Element b(1,1) coincides with a(is(rank+1),js(rank+1)), where rank is
00153       !   the rank of the process (0..procs-1)
00154       !   The length of the 1st dimension of b is given by lm(rank+1)
00155       !   The length of the 2nd dimension of b is given by ln(rank+1)
00156       ! integer root (in): the rank of the process where a is available
00157       ! integer comm (in): the MPI communicator to be used.
00158       !
00159       ! Note: a,is,js are used at the root process only
00160       !
00161       ! Method
00162       !
00163      !  Distribute the columns of the submatrices in a number of scatterv's.
00164       !  Per scatterv, distribute (number of processes) columns, so
00165       !  we need maxval(ln) scatters.
00166       !
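      !  For reference: Fortran stores a(1:ma,1:na) in column-major order, so
      !  element a(i,j) lies at linear offset (j-1)*ma + (i-1) from a(1,1).
      !  The j-th local column of submatrix p starts at a(is(p), js(p)+j-1),
      !  which gives the displacement (js(p)-1 + j-1)*ma + is(p) - 1 used below.
      !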
00167       use mpi
00168       implicit none
00169       real*8, intent(in),  dimension(:,:)  :: a
00170       real*8, intent(out), dimension(:,:)  :: b
00171       integer, intent(in), dimension(:)    :: is,lm
00172       integer, intent(in), dimension(:)    :: js,ln
00173       integer, intent(in)                  :: root,comm
00174 
00175       integer                              :: j,p,rank,ierror,procs,jj,ma,jmax
00176       integer                              :: nlocal,mlocal,rcnt
00177       integer, allocatable, dimension(:)   :: displs, cnts
00178       real*8, allocatable, dimension(:,:)  :: aa
00179 
00180       call MPI_Comm_rank(comm, rank, ierror)
00181       call MPI_Comm_size(comm, procs, ierror)
00182 
00183       nlocal = ln(rank+1)
00184       mlocal = lm(rank+1)
00185 
00186       if (rank .eq. root) then
00187          ma = size(a,1)
00188          allocate(displs(procs), cnts(procs))
00189          ! because the counts and displacements are only valid for a
00190          ! contiguous matrix, we take care of that:
00191 
00192          allocate(aa(size(a,1),size(a,2)))
00193          aa = a
00194 
00195       else
00196          ma = 1                       ! to quiet the compiler
00197          allocate(displs(1), cnts(1)) ! these are not needed in non-root process, but
00198          allocate(aa(1,1))
00199          displs(1) = 0
00200          cnts(1)   = 0
00201          ! it is safer to have valid addresses
00202       endif
00203 
00204       jmax = maxval(ln(1:procs))     ! number of scatters needed
00205 
00206       do j=1,jmax ! scatter all j'th columns
00207          if (rank .eq. root) then
00208             do p=1,procs
00209                if (ln(p) .ge. j) then ! compute displacement of the start of the j-th
00210                   ! column of submatrix(p)
00211                   displs(p) = (js(p) -1 + j -1)*ma + is(p) - 1
00212                   cnts(p) = lm(p)      ! number of elements in this column
00213                else                   ! for submatrix(p), we are out of columns
00214                   displs(p) = 0        ! put in some valid value for the displacement
00215                   cnts(p) = 0          ! and use zero for the number of elements in
00216                   ! this column
00217                endif
00218             enddo
00219          endif
00220          if (nlocal .ge. j) then    ! this is for all processes
00221             jj=j                     ! jj equals the column number of the local
00222             ! matrix
00223             rcnt = mlocal            ! number of elements in this column
00224          else
00225             jj=1                     ! we are out of columns, give jj a valid number
00226             rcnt = 0                 ! and set the number of elements to receive to zero
00227          endif
00228 
00229          call MPI_Scatterv(aa(1,1),  cnts(1), displs(1), MPI_DOUBLE_PRECISION,  &
00230          b(:,jj), rcnt,         MPI_DOUBLE_PRECISION,  &
00231          root,    comm, ierror)
00232       enddo
00233 
00234       deallocate(displs,cnts,aa)
00235 
00236    end subroutine matrix_distr_scatter_real8
00237 
00238    subroutine matrix_distr_scatter_integer(a,b,is,lm,js,ln,root,comm)
00239       ! distribute matrix on processes
00240       !
00241       ! parameters:
00242       ! integer a(:,:) (in): matrix to be scattered.
00243       ! integer b(:,:) (out): local matrix to scatter to.
00244       !                      This matrix has to be available, it is not
00245       !                      allocated by this subroutine.
00246       ! integer is,lm,js,ln(number of processes) (in): description of
00247       !   the location of the submatrices b in matrix a:
00248       !   Element b(1,1) coincides with a(is(rank+1),js(rank+1)), where rank is
00249       !   the rank of the process (0..procs-1)
00250       !   The length of the 1st dimension of b is given by lm(rank+1)
00251       !   The length of the 2nd dimension of b is given by ln(rank+1)
00252       ! integer root (in): the rank of the process where a is available
00253       ! integer comm (in): the MPI communicator to be used.
00254       !
00255       ! Note: a,is,js are used at the root process only
00256       !
00257       ! Method
00258       !
00259      !  Distribute the columns of the submatrices in a number of scatterv's.
00260       !  Per scatterv, distribute (number of processes) columns, so
00261       !  we need maxval(ln) scatters.
00262       !
00263       use mpi
00264       implicit none
00265       integer, intent(in),  dimension(:,:)  :: a
00266       integer, intent(out), dimension(:,:)  :: b
00267       integer, intent(in), dimension(:)    :: is,lm
00268       integer, intent(in), dimension(:)    :: js,ln
00269       integer, intent(in)                  :: root,comm
00270 
00271       integer                              :: j,p,rank,ierror,procs,jj,ma,jmax
00272       integer                              :: nlocal,mlocal,rcnt
00273       integer, allocatable, dimension(:)   :: displs, cnts
00274       integer, allocatable, dimension(:,:) :: aa
00275 
00276       integer, parameter                   :: tag = 1
00277 
00278       call MPI_Comm_rank(comm, rank, ierror)
00279       call MPI_Comm_size(comm, procs, ierror)
00280 
00281       nlocal = ln(rank+1)
00282       mlocal = lm(rank+1)
00283 
00284       if (rank .eq. root) then
00285          ma = size(a,1)
00286          allocate(displs(procs), cnts(procs))
00287          allocate(aa(size(a,1),size(a,2)))
00288          aa = a
00289       else
00290          ma = 1                       ! to quiet the compiler
00291          allocate(displs(1), cnts(1)) ! these are not needed in non-root process, but
00292          ! it is safer to have valid addresses
00293          allocate(aa(1,1))
00294       endif
00295 
00296       jmax = maxval(ln(1:procs))     ! number of scatters needed
00297 
00298       do j=1,jmax ! scatter all j'th columns
00299          if (rank .eq. root) then
00300             do p=1,procs
00301                if (ln(p) .ge. j) then ! compute displacement of the start of the j-th
00302                   ! column of submatrix(p)
00303                   displs(p) = (js(p) -1 + j -1)*ma + is(p) - 1
00304                   cnts(p) = lm(p)      ! number of elements in this column
00305                else                   ! for submatrix(p), we are out of columns
00306                   displs(p) = 0        ! put in some valid value for the displacement
00307                   cnts(p) = 0          ! and use zero for the number of elements in
00308                   ! this column
00309                endif
00310             enddo
00311          endif
00312          if (nlocal .ge. j) then    ! this is for all processes
00313             jj=j                     ! jj equals the column number of the local
00314             ! matrix
00315             rcnt = mlocal            ! number of elements in this column
00316          else
00317             jj=1                     ! we are out of columns, give jj a valid number
00318             rcnt = 0                 ! and set the number of elements to receive to zero
00319          endif
00320 
00321          call MPI_Scatterv(aa(1,1),  cnts(1), displs(1), MPI_INTEGER,  &
00322          b(:,jj), rcnt,         MPI_INTEGER,  &
00323          root,    comm, ierror)
00324       enddo
00325 
00326       deallocate(displs,cnts,aa)
00327 
00328    end subroutine matrix_distr_scatter_integer
00329 
00330    subroutine matrix_distr_send_real8(a,b,is,lm,js,ln,root,comm)
00331       ! distribute matrix on processes
00332       !
00333       ! parameters:
00334      ! real*8 a(:,:) (in): matrix to be scattered.
00335       ! real*8 b(:,:) (out): local matrix to scatter to.
00336       !                      This matrix has to be available, it is not
00337       !                      allocated by this subroutine.
00338       ! integer is,lm,js,ln(number of processes) (in): description of
00339       !   the location of the submatrices b in matrix a:
00340       !   Element b(1,1) coincides with a(is(rank+1),js(rank+1)), where rank is
00341       !   the rank of the process (0..procs-1)
00342       !   The length of the 1st dimension of b is given by lm(rank+1)
00343       !   The length of the 2nd dimension of b is given by ln(rank+1)
00344       ! integer root (in): the rank of the process where a is available
00345       ! integer comm (in): the MPI communicator to be used.
00346       !
00347       ! Note: a,is,js are used at the root process only
00348       !
00349       ! Method
00350       !
00351      ! Sends columns of submatrices to the processes using MPI_Isend.
00352       ! This routine tries to send as many columns in parallel
00353       ! to different processes as possible, see the usage of req(:)
00354       ! and maxreq below.
00355       ! The columns are received using MPI_Irecv.
00356       !
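      ! In other words: req(:) acts as a pool of at most maxreq outstanding
      ! send requests. When all slots are in use, the inner loop below cycles
      ! through the pool with MPI_Test until a completed request is found,
      ! and its slot is reused for the next MPI_Isend.
      !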
00357       use mpi
00358       implicit none
00359       real*8, intent(in),  dimension(:,:)  :: a
00360       real*8, intent(out), dimension(:,:)  :: b
00361       integer, intent(in), dimension(:)    :: is,lm
00362       integer, intent(in), dimension(:)    :: js,ln
00363       integer, intent(in)                  :: root,comm
00364 
00365       integer                              :: j,rank,ierror,l,procs
00366 
00367       integer                              :: sends, sends_todo, length
00368       integer                              :: p,mlocal,nlocal
00369 
00370       logical                              :: done
00371 
00372       integer                              :: maxreq
00373       integer, allocatable, dimension(:)   :: req
00374 
00375       integer, parameter                   :: tag = 1
00376 
00377       integer, parameter                   :: datatype = MPI_DOUBLE_PRECISION
00378       real*8, allocatable, dimension(:,:)  :: aa,bb
00379 
00380       call MPI_Comm_rank(comm, rank, ierror)
00381       call MPI_Comm_size(comm, procs, ierror)
00382 
00383       nlocal = ln(rank+1)
00384       mlocal = lm(rank+1)
00385 
00386       ! calculate total number of sends to be done: that is equal to
00387       ! the total number of columns in the submatrices
00388 
00389       if (rank .eq. root) then
00390          sends_todo = sum(ln(1:procs))
00391          ! maxreq is the maximum number of outstanding Isend or Irecv requests.
00392          ! Experiments showed that the value of maxreq is not critical:
00393          ! a value of 10 gave only somewhat worse results on the lisa
00394          ! system (Xeon, infiniband). Probably, the slower the network,
00395          ! the higher this value can be set. The maximum usable value is the
00396          ! total number of columns (sends_todo, see above).
00397          maxreq = 1000
00398       else
00399          sends_todo = 0    ! to quiet the compiler
00400          maxreq = nlocal   ! on non-root processes, we need only the number of local
00401          ! columns
00402       endif
00403 
00404       ! distribute matrix.
00405       ! to ensure that every process is kept as busy as possible, we put the
00406       ! loop over the processes in the inner loop.
00407 
00408       allocate(req(maxreq))
00409       req=MPI_REQUEST_NULL    ! initialize req with a null-value
00410 
00411       allocate(bb(size(b,1),size(b,2)))
00412       if ( rank .eq. root) then
00413          ! we need a contiguous matrix a, because
00414          !  we are going to use isend.
00415          allocate(aa(size(a,1),size(a,2)))
00416          aa = a
00417 
00418          sends=0
00419          l = 0
00420          j = 0
00421          loop: do
00422             j=j+1                 ! j runs over the columns
00423             do p=0, procs - 1
00424                if (ln(p + 1) .ge. j) then ! is j in the range of submatrix(p)
00425                   length = lm(p + 1)
00426                   if (p .eq. rank) then    ! submatrix(p) is on root, just copy the column
00427                      b(1:length,j) = a(is(rank+1):is(rank+1)+length-1,j+js(rank+1)-1)
00428                   else
00429                      ! search a free request holder:
00430                      done = .false.
00431                      do while( .not. done)
00432                         l=l+1
00433                         if (l .gt. maxreq) l=1
00434                         if (req(l) .eq. MPI_REQUEST_NULL) then
00435                            done = .true.
00436                         else
00437                            !
00438                            ! MPI_Test gives .true. if req(l).eq.MPI_REQUEST_NULL,
00439                            ! but the standard says nothing about this case,
00440                            ! so we test first if req(l) is equal to MPI_REQUEST_NULL
00441                            !
00442                            call MPI_Test(req(l),done,MPI_STATUS_IGNORE,ierror)
00443                         endif
00444                      enddo ! now we have a free request holder req(l)
00445                      call MPI_Isend(aa(is(p + 1), j + js(p + 1) - 1), length, &
00446                      datatype, p, tag, comm, req(l), ierror)
00447                   endif
00448                   sends = sends + 1
00449                   if(sends .ge. sends_todo) exit loop ! finish if the number of sends is
00450                   ! equal to the total number of sends
00451                   ! needed
00452                endif
00453             enddo
00454          enddo loop
00455          deallocate(aa)
00456       else   ! below the code for non-root:
00457          ! because of irecv, we must be sure that we have a contiguous matrix
00458          do j=1,nlocal
00459             call MPI_Irecv(bb(1,j), mlocal, datatype, root, tag, comm, req(j), ierror)
00460          enddo
00461       endif
00462 
00463       call MPI_Waitall(maxreq,req,MPI_STATUSES_IGNORE,ierror)
00464 
00465       if (rank .ne. root) then
00466          b = bb
00467       endif
00468 
00469       deallocate(bb)
00470       deallocate(req)
00471 
00472    end subroutine matrix_distr_send_real8
00473 
00474    subroutine matrix_distr_send_integer(a,b,is,lm,js,ln,root,comm)
00475       ! distribute matrix on processes
00476       !
00477       ! parameters:
00478      ! integer a(:,:) (in): matrix to be scattered.
00479       ! integer b(:,:) (out): local matrix to scatter to.
00480       !                      This matrix has to be available, it is not
00481       !                      allocated by this subroutine.
00482       ! integer is,lm,js,ln(number of processes) (in): description of
00483       !   the location of the submatrices b in matrix a:
00484       !   Element b(1,1) coincides with a(is(rank+1),js(rank+1)), where rank is
00485       !   the rank of the process (0..procs-1)
00486       !   The length of the 1st dimension of b is given by lm(rank+1)
00487       !   The length of the 2nd dimension of b is given by ln(rank+1)
00488       ! integer root (in): the rank of the process where a is available
00489       ! integer comm (in): the MPI communicator to be used.
00490       !
00491       ! Note: a,is,js are used at the root process only
00492       !
00493       ! Method
00494       !
00495      ! Sends columns of submatrices to the processes using MPI_Isend.
00496       ! This routine tries to send as many columns in parallel
00497       ! to different processes as possible, see the usage of req(:)
00498       ! and maxreq below.
00499       ! The columns are received using MPI_Irecv.
00500       !
00501       use mpi
00502       implicit none
00503       integer, intent(in),  dimension(:,:)  :: a
00504       integer, intent(out), dimension(:,:)  :: b
00505       integer, intent(in), dimension(:)    :: is,lm
00506       integer, intent(in), dimension(:)    :: js,ln
00507       integer, intent(in)                  :: root,comm
00508 
00509       integer                              :: j,rank,ierror,l,procs
00510 
00511       integer                              :: sends, sends_todo, length
00512       integer                              :: p,mlocal,nlocal
00513 
00514       logical                              :: done
00515 
00516       integer                              :: maxreq
00517       integer, allocatable, dimension(:)   :: req
00518 
00519       integer, parameter                   :: tag = 1
00520 
00521       integer, parameter                   :: datatype = MPI_INTEGER
00522       integer, allocatable, dimension(:,:) :: aa,bb
00523 
00524 
00525       call MPI_Comm_rank(comm, rank, ierror)
00526       call MPI_Comm_size(comm, procs, ierror)
00527 
00528       nlocal = ln(rank+1)
00529       mlocal = lm(rank+1)
00530 
00531       ! calculate total number of sends to be done: that is equal to
00532       ! the total number of columns in the submatrices
00533 
00534       if (rank .eq. root) then
00535          sends_todo = sum(ln(1:procs))
00536          ! maxreq is the maximum number of outstanding Isend or Irecv requests.
00537          ! Experiments showed that the value of maxreq is not critical:
00538          ! a value of 10 gave only somewhat worse results on the lisa
00539          ! system (Xeon, infiniband). Probably, the slower the network,
00540          ! the higher this value can be set. The maximum usable value is the
00541          ! total number of columns (sends_todo, see above).
00542          maxreq = 1000
00543       else
00544          sends_todo = 0    ! to quiet the compiler
00545          maxreq = nlocal   ! on non-root processes, we need only the number of local
00546          ! columns
00547       endif
00548 
00549       ! distribute matrix.
00550       ! to ensure that every process is kept as busy as possible, we put the
00551       ! loop over the processes in the inner loop.
00552 
00553       allocate(req(maxreq))
00554       req=MPI_REQUEST_NULL    ! initialize req with a null-value
00555 
00556       allocate(bb(size(b,1),size(b,2)))
00557       if ( rank .eq. root) then
00558          allocate(aa(size(a,1),size(a,2)))
00559          aa = a
00560          sends=0
00561          l = 0
00562          j = 0
00563          loop: do
00564             j=j+1                 ! j runs over the columns
00565             do p=0, procs - 1
00566                if (ln(p + 1) .ge. j) then ! is j in the range of submatrix(p)
00567                   length = lm(p + 1)
00568                   if (p .eq. rank) then    ! submatrix(p) is on root, just copy the column
00569                      b(1:length,j) = a(is(rank+1):is(rank+1)+length-1,j+js(rank+1)-1)
00570                   else
00571                      ! search a free request holder:
00572                      done = .false.
00573                      do while( .not. done)
00574                         l=l+1
00575                         if (l .gt. maxreq) l=1
00576                         if (req(l) .eq. MPI_REQUEST_NULL) then
00577                            done = .true.
00578                         else
00579                            !
00580                            ! MPI_Test gives .true. if req(l).eq.MPI_REQUEST_NULL,
00581                            ! but the standard says nothing about this case,
00582                            ! so we test first if req(l) is equal to MPI_REQUEST_NULL
00583                            !
00584                            call MPI_Test(req(l),done,MPI_STATUS_IGNORE,ierror)
00585                         endif
00586                      enddo ! now we have a free request holder req(l)
00587                      call MPI_Isend(aa(is(p + 1), j + js(p + 1) - 1), length, &
00588                      datatype, p, tag, comm, req(l), ierror)
00589                   endif
00590                   sends = sends + 1
00591                   if(sends .ge. sends_todo) exit loop ! finish if the number of sends is
00592                   ! equal to the total number of sends
00593                   ! needed
00594                endif
00595             enddo
00596          enddo loop
00597          deallocate(aa)
00598       else   ! below the code for non-root:
00599          do j=1,nlocal
00600             call MPI_Irecv(bb(1,j), mlocal, datatype, root, tag, comm, req(j), ierror)
00601          enddo
00602       endif
00603 
00604       do l=1,maxreq   ! wait until all requests are done
00605          if (req(l) .ne. MPI_REQUEST_NULL) then
00606             call MPI_Wait(req(l), MPI_STATUS_IGNORE, ierror)
00607          endif
00608       enddo
00609 
00610       if (rank .ne. root) then
00611          b = bb
00612       endif
00613       deallocate(bb)
00614 
00615       deallocate(req)
00616 
00617    end subroutine matrix_distr_send_integer
00618 
00619    subroutine matrix_distr_sendmat_real8(a,b,is,lm,js,ln,&
00620    root,comm)
00621 
00622       ! distribute matrix on root process to all processes
00623       !
00624       ! parameters:
00625       ! real*8 a(:,:) (in):  matrix to be scattered
00626       ! real*8 b(:,:) (out): local scattered matrix
00627       ! integer is,lm,js,ln(number of processes) (in): description of
00628       !   the location of the submatrices b in matrix a:
00629       !   Element b(1,1) coincides with a(is(rank+1),js(rank+1)), where rank is
00630       !   the rank of the process (0..procs-1)
00631       !   The length of the 1st dimension of b is given by lm(rank+1)
00632       !   The length of the 2nd dimension of b is given by ln(rank+1)
00633       ! integer root (in): the rank of the process where a is available
00634       ! integer comm (in): the MPI communicator to be used.
00635       !
00636       ! Note: a,is,js are used at the root process only
00637       !
00638       ! Method
00639       !
00640      !  root sends the submatrices as a whole, one by one. The local
00641      !  submatrix is copied.
00642       !
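      !  For illustration (example values only): if lm(p) = 5, ln(p) = 4,
      !  is(p) = 11 and js(p) = 21, then root sends the 5x4 block
      !  a(11:15,21:24) to process p-1, which receives it into its
      !  local b(1:5,1:4).
      !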
00643 
00644       use mpi
00645       implicit none
00646       real*8, intent(in),  dimension(:,:)  :: a
00647       real*8, intent(out), dimension(:,:)  :: b
00648       integer, intent(in), dimension(:)    :: is, lm
00649       integer, intent(in), dimension(:)    :: js, ln
00650       integer, intent(in)                  :: root, comm
00651 
00652       integer                              :: i, rank, ierror, procs, p
00653       integer                              :: mlocal, nlocal
00654       integer                              :: acolstart, acolend, arowstart, arowend
00655       integer                              :: mp, np, k, ia, ja
00656       integer, parameter                   :: tag1 = 125
00657       integer, parameter                   :: tag2 = 126
00658 
00659       call MPI_Comm_rank(comm, rank, ierror)
00660       call MPI_Comm_size(comm, procs, ierror)
00661 
00662       nlocal = ln(rank+1)
00663       mlocal = lm(rank+1)
00664 
00665       ! define start-end cols and rows for the general case:
00666       ! computations on arow.. and acol.. are only meaningful at root
00667 
00668       ! See also spaceparams comments:
00669      !    b coincides with
00670       !       a(is(p+1):is(p+1)+lm(p+1)-1,js(p+1):js(p+1)+ln(p+1)-1)
00671 
00672       arowstart  = is(root + 1)
00673       arowend    = is(root + 1) + mlocal - 1
00674       acolstart  = js(root + 1)
00675       acolend    = js(root + 1) + nlocal - 1
00676       if (rank .eq. root) then
00677          ! copy the relevant part of A to the local submatrix B:
00678          b(:,:) =  a(arowstart:arowend,acolstart:acolend)
00679 
00680          ! now, send to each non-root process
00681          ! it's part of matrix a
00682 
00683          ploop: do p = 1, procs
00684             if ( p - 1 .eq. root ) then
00685                cycle ploop
00686             endif
00687             ! how many elements to send to process p-1  (= mp*np)
00688             ! and where to put this in a  (ia:ia+mp-1, ja:ja+np-1)
00689             !
00690 
00691             mp = lm(p)
00692             np = ln(p)
00693             ia = is(p)
00694             ja = js(p)
00695 
00696             ! to avoid deadlocks, the master first sends a
00697             ! short message to p - 1. After receiving that message,
00698             ! p - 1 will start receiving its submatrix
00699 
00700             call MPI_Send(mp*np, 1, MPI_INTEGER, p - 1, tag1, comm, ierror)
00701 
00702             ! send a submatrix to process p-1:
00703 
00704             call MPI_Send(a(ia:ia+mp-1,ja:ja+np-1), mp*np, MPI_DOUBLE_PRECISION, &
00705             p - 1, tag2, comm, ierror)
00706 
00707          enddo  ploop ! loop over non-root processes
00708 
00709       else ! above is the root code, now the non-root code
00710          ! receive the short message from root, so I can start receiving b
00711          call MPI_Recv(i, 1, MPI_INTEGER,&
00712          root, tag1, comm, MPI_STATUS_IGNORE, ierror)
00713 
00714          ! sanity check
00715 
00716          k = size(b)
00717          if (i .ne. k) then
00718             write(*,*)'process ',rank,' has a problem in matrix_distr_sendmat_real8: '
00719             write(*,*)'number of words to receive:',k
00720             write(*,*)'root expects:              ',i
00721             call MPI_Abort(MPI_COMM_WORLD,1,ierror)
00722          endif
00723 
00724          call MPI_Recv(b, k, MPI_DOUBLE_PRECISION, root, &
00725          tag2, comm, MPI_STATUS_IGNORE, ierror)
00726       endif
00727 
00728    end subroutine matrix_distr_sendmat_real8
00729 
00730    subroutine matrix_distr_sendmat_integer(a,b,is,lm,js,ln,&
00731    root,comm)
00732 
00733       ! distribute matrix on root process to all processes
00734       !
00735       ! parameters:
00736       ! integer a(:,:) (in):  matrix to be scattered
00737       ! integer b(:,:) (out): local scattered matrix
00738       ! integer is,lm,js,ln(number of processes) (in): description of
00739       !   the location of the submatrices b in matrix a:
00740       !   Element b(1,1) coincides with a(is(rank+1),js(rank+1)), where rank is
00741       !   the rank of the process (0..procs-1)
00742       !   The length of the 1st dimension of b is given by lm(rank+1)
00743       !   The length of the 2nd dimension of b is given by ln(rank+1)
00744       ! integer root (in): the rank of the process where a is available
00745       ! integer comm (in): the MPI communicator to be used.
00746       !
00747       ! Note: a,is,js are used at the root process only
00748       !
00749       ! Method
00750       !
00751      !  root sends the submatrices as a whole, one by one. The local
00752      !  submatrix is copied.
00753       !
00754 
00755       use mpi
00756       implicit none
00757       integer, intent(in),  dimension(:,:) :: a
00758       integer, intent(out), dimension(:,:) :: b
00759       integer, intent(in), dimension(:)    :: is, lm
00760       integer, intent(in), dimension(:)    :: js, ln
00761       integer, intent(in)                  :: root, comm
00762       integer                              :: i, rank, ierror, procs, p
00763       integer                              :: mlocal, nlocal
00764       integer                              :: acolstart, acolend, arowstart, arowend
00765       integer                              :: mp, np, k, ia, ja
00766       integer, parameter                   :: tag1 = 125
00767       integer, parameter                   :: tag2 = 126
00768 
00769       call MPI_Comm_rank(comm, rank, ierror)
00770       call MPI_Comm_size(comm, procs, ierror)
00771 
00772       nlocal = ln(rank+1)
00773       mlocal = lm(rank+1)
00774 
00775       ! define start-end cols and rows for the general case:
00776       ! computations on arow.. and acol.. are only meaningful at root
00777 
00778       arowstart  = is(root + 1)
00779       arowend    = is(root + 1) + mlocal - 1
00780       acolstart  = js(root + 1)
00781       acolend    = js(root + 1) + nlocal - 1
00782 
00783       if (rank .eq. root) then
00784 
00785          ! copy the relevant part of A to the local submatrix B:
00786 
00787          b(:,:) =  a(arowstart:arowend,acolstart:acolend)
00788 
00789          ! now, send to each non-root process
00790          ! it's part of matrix a
00791 
00792          ploop: do p = 1, procs
00793             if ( p - 1 .eq. root ) then
00794                cycle ploop
00795             endif
00796 
00797             ! how many elements to send to process p-1  (= mp*np)
00798             ! and where to put this in a  (ia:ia+mp-1, ja:ja+np-1)
00799             !
00800 
00801             mp = lm(p)
00802             np = ln(p)
00803             ia = is(p)
00804             ja = js(p)
00805 
00806             ! to avoid deadlocks, master first sends a
00807             ! short message to p - 1. After receiving that message,
00808             ! p - 1 will start receiving its submatrix
00809 
00810             call MPI_Send(mp*np, 1, MPI_INTEGER, p - 1, tag1, comm, ierror)
00811 
00812             ! send a submatrix to process p-1
00813 
00814             call MPI_Send(a(ia:ia+mp-1,ja:ja+np-1), mp*np, MPI_INTEGER, &
00815             p - 1, tag2, comm, ierror)
00816 
00817          enddo  ploop ! loop over non-root processes
00818 
00819       else ! above is the root code, below the non-root code
00820 
00821          ! receive the short message from root, so I can start receiving b
00822 
00823          call MPI_Recv(i, 1, MPI_INTEGER,&
00824          root, tag1, comm, MPI_STATUS_IGNORE, ierror)
00825 
00826          ! sanity check
00827 
00828          k = size(b)
00829          if (i .ne. k) then
00830             write(*,*)'process ',rank,' has a problem in matrix_distr_sendmat_integer: '
00831             write(*,*)'number of words to receive:',k
00832             write(*,*)'root expects:              ',i
00833             call MPI_Abort(MPI_COMM_WORLD,1,ierror)
00834          endif
00835 
00836          call MPI_Recv(b, k, MPI_INTEGER, root, &
00837          tag2, comm, MPI_STATUS_IGNORE, ierror)
00838 
00839       endif
00840 
00841    end subroutine matrix_distr_sendmat_integer
00842 
00843    subroutine matrix_coll_gather_real8(a,b,is,lm,js,ln, &
00844    isleft,isright,istop,isbot,root,comm)
00845       ! collect matrix on root process
00846       !
00847       ! parameters:
00848       ! real*8 a(:,:) (out): matrix to be collected.
00849       !                      This matrix has to be available, it is not
00850       !                      allocated by this subroutine.
00851       ! real*8 b(:,:) (in): local scattered matrix
00852       ! integer is,lm,js,ln(number of processes) (in): description of
00853       !   the location of the submatrices b in matrix a:
00854       !   Element b(1,1) coincides with a(is(rank+1),js(rank+1)), where rank is
00855       !   the rank of the process (0..procs-1)
00856       !   The length of the 1st dimension of b is given by lm(rank+1)
00857       !   The length of the 2nd dimension of b is given by ln(rank+1)
00858       ! logical isleft, isright, istop, isbot (in): is this submatrix at the
00859       !  left, right top or bottom of the general matrix.
00860       ! integer root (in): the rank of the process where a is available
00861       ! integer comm (in): the MPI communicator to be used.
00862       !
00863       ! Note: a,is,js are used at the root process only
00864       !
00865       ! Method
00866       !
00867       ! Use gatherv. Each gatherv call collects one column of
00868       !  all submatrices
00869       !
00870       !  NOTE: the matrices are considered to be bordered by a columns
00871      !  NOTE: the matrices are considered to be bordered by columns
00872      !   on the left and right and by rows at the top and bottom. So, in general,
00873      !   only b(2:mb-1,2:nb-1) is sent to the master process.
00874       !   on the left: b(:,1) is also sent
00875       !   on the right: b(:,nb) is also sent
00876       !   on the top:   b(1,:)  is also sent
00877       !   on the bottom: b(mb,:) is also sent
00878       ! The master process receives a(1:ma,1:na).
00879       !   dimension of b (1:mb,1:nb).
00880       !   dimension of a (1:ma,1:na)
00881       !
00882       use mpi
00883       implicit none
00884       real*8, intent(out),  dimension(:,:) :: a
00885       real*8, intent(in), dimension(:,:)   :: b
00886       integer, intent(in), dimension(:)    :: is,lm
00887       integer, intent(in), dimension(:)    :: js,ln
00888       logical, intent(in), dimension(:)    :: isleft, isright, istop, isbot
00889       integer, intent(in)                  :: root,comm
00890 
00891       integer                              :: j,p,rank,ierror,procs,jj,ma,jmax,k
00892       integer                              :: nlocal,mlocal,scnt
00893       integer, allocatable, dimension(:)   :: displs, cnts
00894       logical                              :: isleftl, isrightl, istopl, isbotl
00895       real*8, allocatable, dimension(:,:)  :: aa
00896 
00897 #ifdef USEMPE
00898       call MPE_Log_event(event_coll_start,0,'cstart')
00899 #endif
00900       call MPI_Comm_rank(comm, rank, ierror)
00901       call MPI_Comm_size(comm, procs, ierror)
00902 
00903       nlocal   = ln(rank+1)
00904       mlocal   = lm(rank+1)
00905       isleftl  = isleft(rank+1)
00906       isrightl = isright(rank+1)
00907       istopl   = istop(rank+1)
00908       isbotl   = isbot(rank+1)
00909 
00910       ma = 1 ! to quiet the compiler
00911       if (rank .eq. root) then
00912          ma = size(a,1)
00913       endif
00914 
00915       if (rank .eq. root) then
00916          allocate(displs(procs), cnts(procs))
00917          ! need a contiguous matrix, otherwise the counts and
00918          ! displacements will be in error
00919          allocate(aa(size(a,1),size(a,2)))
00920       else
00921          allocate(displs(1),cnts(1))    ! need valid addresses on non-root
00922          allocate(aa(1,1))
00923       endif
00924 
00925       nlocal = ln(rank+1)
00926       mlocal = lm(rank+1)
00927 
00928       jmax = maxval(ln(1:procs))     ! number of gathers needed
00929 
00930       do j=1,jmax
00931          if (rank .eq. root) then  ! displacements and counts only needed on root
00932             do p=1,procs
00933                if (ln(p) .ge. j) then
00934                   displs(p) = (js(p) -1 + j -1)*ma + is(p)     ! compute displacement for
00935                   ! column j of submatrix p
00936                   ! note that first and last elements
00937                   ! of this column are borders, not
00938                   ! to be gathered
00939                   cnts(p) = lm(p)-2               ! number of non-border elements in
00940                   ! this column
00941                   ! Normally, the first and last rows and columns are not to
00942                   ! be sent to root, but we make an exception for the
00943                   ! rows and columns that are the first rows and columns
00944                   ! from the global matrix.
00945                   !
00946                   displs(p) = (js(p) -1 + j - 1)*ma + is(p) - 1
00947                   cnts(p) = lm(p)
00948                   if ( .not. isleft(p) ) then
00949                      ! this is a submatrix, not on the left.
00950                      if ( j .eq. 1 ) then
00951                         ! The first column is  not to be sent,
00952                         ! so make cnts(p) = 0, and give displs(p) a
00953                         ! valid value
00954                         displs(p) = 0
00955                         cnts(p)   = 0
00956                      endif
00957                   endif
00958                   if ( .not. isright(p) ) then
00959                      ! this is a submatrix not on the right
00960                      if ( j .eq. ln(p) ) then
00961                         ! The last column is not to be sent,
00962                         ! so make cnts(p) = 0, and give displs(p) a
00963                         ! valid value
00964                         displs(p) = 0
00965                         cnts(p)   = 0
00966                      endif
00967                   endif
00968                   if ( .not. istop(p) ) then
00969                      ! this is a submatrix, not on the top
00970                      ! so we don't send the first element of column j
00971                      ! adapt displs(p) and cnts(p) accordingly:
00972                      displs(p) = displs(p)+1
00973                      cnts(p) = max(cnts(p)-1,0)
00974                   endif
00975                   if ( .not. isbot(p) ) then
00976                      ! this is a submatrix, not on the bottom
00977                      ! so we don't send the last element of column j
00978                      ! adapt cnts(p) accordingly:
00979                      cnts(p) = max(cnts(p)-1,0)
00980                   endif
00981                else
00982                   displs(p) = 0  ! we are out of columns, use a valid value for
00983                   ! the displacement
00984                   cnts(p) = 0    ! and set number of elements to receive for
00985                   ! this submatrix to zero
00986                endif  !ln(p) .ge. j
00987             enddo  ! do p = 1,procs
00988          endif  ! rank .eq. root
00989          jj   = j            ! column of submatrix to send
00990          scnt = mlocal       ! number of elements to send
00991          k    = 1            ! starting element of column j to send
00992          if (nlocal .ge. j) then
00993             if (.not. isleftl ) then
00994                if ( j .eq. 1 ) then
00995                   jj   = 1
00996                   scnt = 0
00997                endif
00998             endif
00999             if (.not. isrightl ) then
01000                if ( j .eq. nlocal ) then
01001                   jj = 1
01002                   scnt = 0
01003                endif
01004             endif
01005             if (.not. istopl ) then
01006                k = k+1
01007                scnt = max(scnt-1, 0)
01008             endif
01009             if (.not. isbotl ) then
01010                scnt = max(scnt-1, 0)
01011             endif
01012          else
01013             jj   = 1              ! out of columns, use valid value for jj
01014             scnt = 0              ! and put number of elements to send to zero
01015          endif
01016 
01017          call MPI_Gatherv(b(k:k+scnt-1,jj), scnt, MPI_DOUBLE_PRECISION,  &
01018          aa(1,1), cnts, displs, MPI_DOUBLE_PRECISION,  &
01019          root, comm, ierror)
01020       enddo
01021 
01022       if(rank .eq. root) then
01023          a = aa
01024       endif
01025 
01026       deallocate(displs,cnts,aa)
01027 
01028    end subroutine matrix_coll_gather_real8
01029 
01030    subroutine matrix_coll_recv_real8    (a,b,is,lm,js,ln,&
01031    isleft,isright,istop,isbot,root,comm)
01032       ! collect matrix on root process
01033       !
01034       ! parameters:
01035       ! real*8 a(:,:) (out): matrix to be collected.
01036       !                      This matrix has to be available, it is not
01037       !                      allocated by this subroutine.
01038       ! real*8 b(:,:) (in): local scattered matrix
01039       ! integer is,lm,js,ln(number of processes) (in): description of
01040       !   the location of the submatrices b in matrix a:
01041       !   Element b(1,1) coincides with a(is(rank+1),js(rank+1)), where rank is
01042       !   the rank of the process (0..procs-1)
01043       !   The length of the 1st dimension of b is given by lm(rank+1)
01044       !   The length of the 2nd dimension of b is given by ln(rank+1)
01045       ! logical isleft, isright, istop, isbot (in): is this submatrix at the
01046       !  left, right top or bottom of the general matrix.
01047       ! integer root (in): the rank of the process where a is available
01048       ! integer comm (in): the MPI communicator to be used.
01049       !
01050       ! Note: a,is,js are used at the root process only
01051       !
01052       ! Method
01053       !
01054      ! Receive columns of submatrices using MPI_Probe and MPI_Recv.
01055       ! The non-root processes use MPI_Isend to send the columns
01056       !
01057      !  NOTE: the matrices are considered to be bordered by columns
01058      !   on the left and right and by rows at the top and bottom. So, in general,
01059      !   only b(2:mb-1,2:nb-1) is sent to the master process.
01060       !   HOWEVER: if matrix b is
01061       !   on the left: b(:,1) is also sent
01062       !   on the right: b(:,nb) is also sent
01063       !   on the top:   b(1,:)  is also sent
01064       !   on the bottom: b(mb,:) is also sent
01065       ! The master process receives a(1:ma,1:na).
01066       !   dimension of b (1:mb,1:nb).
01067       !   dimension of a (1:ma,1:na)
01068       !
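      ! Note on the receive side: root probes with MPI_ANY_SOURCE and uses the
      ! per-process counter col(:) to work out which column a message carries.
      ! This relies on MPI's ordering guarantee that messages from one sender
      ! to one receiver with the same tag are received in the order sent.
      !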
01069       use mpi
01070       implicit none
01071       real*8, intent(out), dimension(:,:)  :: a
01072       real*8, intent(in),  dimension(:,:)  :: b
01073       integer, intent(in), dimension(:)    :: is,lm
01074       integer, intent(in), dimension(:)    :: js,ln
01075       logical, intent(in), dimension(:)    :: isleft,isright,istop,isbot
01076       integer, intent(in)                  :: root,comm
01077 
01078       integer                              :: i,j,rank,ierror,l,procs,ii,jj
01079       integer                              :: jstart, jend, p, m, n
01080 
01081       integer                              :: recvs, recvs_todo
01082       integer                              :: mlocal,nlocal,sender
01083       logical                              :: isleftl, isrightl, istopl, isbotl
01084 
01085       integer, allocatable, dimension(:)   :: col
01086       integer, allocatable, dimension(:)   :: req
01087 
01088       integer, parameter                   :: tag = 2
01089       integer, dimension(MPI_STATUS_SIZE)  :: mpistatus
01090       real*8, allocatable, dimension(:,:)  :: bb
01091 
01092       call MPI_Comm_rank(comm, rank, ierror)
01093       call MPI_Comm_size(comm, procs, ierror)
01094 
01095       nlocal   = ln(rank+1)
01096       mlocal   = lm(rank+1)
01097       isleftl  = isleft(rank+1)
01098       isrightl = isright(rank+1)
01099       istopl   = istop(rank+1)
01100       isbotl   = isbot(rank+1)
01101 
01102       if (rank .eq. root) then
01103          ! calculate total number of recvs to be done: that is equal to
01104          ! the total number of columns in the submatrices
01105          recvs_todo = sum(ln(1:procs))
01106 
01107          recvs_todo =  recvs_todo - nlocal  ! correct for columns available locally
01108 
01109          do p=1,procs                       ! correct for borders
01110             if (p .eq. root+1) then
01111                cycle
01112             endif
01113             if ( .not. isleft(p)) then
01114                recvs_todo = recvs_todo -1
01115             endif
01116             if ( .not. isright(p)) then
01117                recvs_todo = recvs_todo -1
01118             endif
01119          enddo
01120          allocate(col(procs)) ! to remember which was the last column
01121          ! received for submatrix(p)
01122          col   = 0
01123          recvs = 0
01124          ! copy submatrix on root:
01125          i = 1
01126          ii = is(rank+1)
01127          m = mlocal
01128          j = 1
01129          jj = js(rank+1)
01130          n = nlocal
01131          ! take care of borders:
01132          if ( .not. isleftl ) then
01133             j  = j  + 1
01134             jj = jj + 1
01135          endif
01136          if ( .not. isrightl ) then
01137             n = n - 1
01138          endif
01139          if ( .not. istopl ) then
01140             i  = i  + 1
01141             ii = ii + 1
01142             m  = m  - 1
01143          endif
01144          if ( .not. isbotl ) then
01145             m = m - 1
01146          endif
01147          a(ii:ii+m-1,jj:jj+n-1) = b(i:i+m-1,j:j+n-1)
01148 
01149          do recvs=1,recvs_todo  ! loop over the total number of receives needed
01150             ! wait until some process has sent a column to root:
01151 
01152             call MPI_Probe(MPI_ANY_SOURCE, tag, comm, mpistatus, ierror)
01153 
01154             ! find out who was the sender:
01155 
01156             sender = mpistatus(MPI_SOURCE)
01157 
01158             ! update col, and compute i and j: the startlocation of
01159             ! to be received column
01160 
01161             p      = sender + 1
01162             col(p) = col(p) + 1
01163             i      = is(p)
01164             j      = col(p)+js(p)-1
01165             m      = lm(p)
01166 
01167             ! perform the actual receive:
01168             ! and correct for borders
01169 
01170             if ( .not. isleft(p)) then
01171                j = j + 1
01172             endif
01173             if ( .not. isright(p)) then
01174                ! nothing to correct here
01175             endif
01176             if ( .not. istop(p)) then
01177                i = i + 1
01178                m = m - 1
01179             endif
01180             if ( .not. isbot(p)) then
01181                m = m - 1
01182             endif
01183 
01184             call MPI_Recv(a(i:i+m-1,j),m,MPI_DOUBLE_PRECISION, &
01185             sender,tag,comm,MPI_STATUS_IGNORE,ierror)
01186          enddo
01187          deallocate(col)
01188          !    deallocate(isleft, isright, istop, isbot)
01189       else  ! non-root code
01190          allocate(req(nlocal))  ! for holding the requests
01191          allocate(bb(size(b,1),size(b,2)))
01192          bb = b     ! want to be sure that the matrix to send is
01193          ! contiguous, because isend's are used
01194          req = MPI_REQUEST_NULL
01195 
01196          jstart = 1
01197          jend   = nlocal
01198          i      = 1
01199          m      = mlocal
01200 
01201          ! take care of borders
01202          if ( .not. isleftl ) then
01203             jstart = jstart + 1
01204          endif
01205          if ( .not. isrightl ) then
01206             jend = jend - 1
01207          endif
01208          if ( .not. istopl ) then
01209             i = i + 1
01210             m = m - 1
01211          endif
01212          if ( .not. isbotl ) then
01213             m = m - 1
01214          endif
01215 
01216          do j = jstart, jend
01217             call MPI_Isend(bb(i,j), m, MPI_DOUBLE_PRECISION, root, tag, comm, req(j), ierror)
01218          enddo
01219 
01220          do l=1,nlocal ! wait until all requests have been handled
01221             if (req(l) .ne. MPI_REQUEST_NULL) then
01222                call MPI_Wait(req(l),MPI_STATUS_IGNORE,ierror)
01223             endif
01224          enddo
01225          deallocate(req,bb)
01226       endif
01227 
01228    end subroutine matrix_coll_recv_real8
01229 
01230    subroutine matrix_coll_recvmat_real8(a,b,is,lm,js,ln,&
01231    isleft,isright,istop,isbot,root,comm)
01232       ! collect matrix on root process
01233       !
01234       ! parameters:
01235       ! real*8 a(:,:) (out): matrix to be collected.
01236       !                      This matrix has to be available, it is not
01237       !                      allocated by this subroutine.
01238       ! real*8 b(:,:) (in): local scattered matrix
01239       ! integer is,lm,js,ln(number of processes) (in): description of
01240       !   the location of the submatrices b in matrix a:
01241       !   Element b(1,1) coincides with a(is(rank+1),js(rank+1)), where rank is
01242       !   the rank of the process (0..procs-1)
01243       !   The length of the 1st dimension of b is given by lm(rank+1)
01244       !   The length of the 2nd dimension of b is given by ln(rank+1)
01245       ! logical isleft, isright, istop, isbot (in): is this submatrix at the
01246       !  left, right top or bottom of the general matrix.
01247       ! integer root (in): the rank of the process where a is available
01248       ! integer comm (in): the MPI communicator to be used.
01249       !
01250       ! Note: a,is,js are used at the root process only
01251       !
01252       ! Method
01253       !
01254       !  root receives submatrices as a whole, one by one. Local
01255       !  submatrix is copied.
01256       !
01257      !  NOTE: the matrices are considered to be bordered by columns
01258      !   on the left and right and by rows at the top and bottom.
01259      !   The number of border columns/rows is nbord.
01260      !   The number of overlapping columns is nover = 2*nbord.
01261      !   So, in general,
01262      !   only b(nbord+1:mb-nbord,nbord+1:nb-nbord) is sent to the master process.
01263       !   HOWEVER: if matrix b is
01264       !   on the left: b(:,1:nbord) is also sent
01265       !   on the right: b(:,nb-nbord+1:nb) is also sent
01266       !   on the top:   b(1:nbord,:)  is also sent
01267       !   on the bottom: b(mb-nbord+1:mb,:) is also sent
01268       ! The master process receives a(1:ma,1:na).
01269       !   dimension of b (1:mb,1:nb).
01270       !   dimension of a (1:ma,1:na)
01271       !
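      !  For illustration, with nbord = 2 (as set below): a submatrix that is
      !  not at any edge of the global matrix sends only b(3:mb-2,3:nb-2);
      !  if it is, say, on the left and on the top, it sends b(1:mb-2,1:nb-2).
      !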
01272       use mpi
01273       implicit none
01274       real*8, intent(out), dimension(:,:)  :: a
01275       real*8, intent(in),  dimension(:,:)  :: b
01276       integer, intent(in), dimension(:)    :: is, lm
01277       integer, intent(in), dimension(:)    :: js, ln
01278       logical, intent(in), dimension(:)    :: isleft, isright, istop, isbot
01279       integer, intent(in)                  :: root, comm
01280 
01281       integer                              :: i, rank, ierror, procs, p
01282 
01283       integer                              :: mlocal, nlocal
01284 
01285       !logical, allocatable, dimension(:)   :: isleft, isright, istop, isbot
01286 
01287       integer, parameter                   :: tag = 2
01288 
01289       integer                              :: acolstart, acolend, arowstart, arowend
01290       integer                              :: bcolstart, bcolend, browstart, browend
01291       integer                              :: mp, np, k, ia, ja
01292       integer, parameter                   :: tag1 = 123
01293       integer, parameter                   :: tag2 = 124
01294       logical                              :: isleftl, isrightl, istopl, isbotl
01295 
01296       ! nover is the number of overlapping rows/columns of submatrices.
01297       ! nbord is the width of the border in rows/columns (nover = 2*nbord).
01298 
01299       integer, parameter                   :: nbord = 2
01300       integer, parameter                   :: nover = 2*nbord
01301 
01302 
01303       call MPI_Comm_rank(comm, rank, ierror)
01304       call MPI_Comm_size(comm, procs, ierror)
01305 
01306       nlocal   = ln(rank+1)
01307       mlocal   = lm(rank+1)
01308       isleftl  = isleft(rank+1)
01309       isrightl = isright(rank+1)
01310       istopl   = istop(rank+1)
01311       isbotl   = isbot(rank+1)
01312 
01313       ! define start-end cols and rows for the general case:
01314       ! computations on arow.. and acol.. are only meaningful at root
01315       browstart  = nbord  + 1
01316       browend    = mlocal - nbord
01317       bcolstart  = nbord + 1
01318       bcolend    = nlocal - nbord
01319       arowstart  = is(root + 1) + nbord
01320       arowend    = is(root + 1) + mlocal - 1 - nbord
01321       acolstart  = js(root + 1) + nbord
01322       acolend    = js(root + 1) + nlocal - 1 - nbord
01323 
01324       ! correct for the border cases:
01325 
01326       if (isleftl) then
01327          bcolstart = bcolstart - nbord
01328          acolstart = acolstart - nbord
01329       endif
01330       if (isrightl) then
01331          bcolend = bcolend + nbord
01332          acolend = acolend + nbord
01333       endif
01334       if (istopl) then
01335          browstart = browstart - nbord
01336          arowstart = arowstart - nbord
01337       endif
01338       if (isbotl) then
01339          browend = browend + nbord
01340          arowend = arowend + nbord
01341       endif
01342 
01343       if (rank .eq. root) then
01344 
01345          ! copy local submatrix
01346 
01347          ! copy of the local submatrix b into a:
01348          a(arowstart:arowend,acolstart:acolend) = &
01349          b(browstart:browend,bcolstart:bcolend)
01350 
01351          ! now, collect from each non-root process
01352          ! its matrix b, and put it in the proper place in a
01353 
01354          ploop: do p = 1, procs
01355             if ( p - 1 .eq. root ) then
01356                cycle ploop
01357             endif
01358             ! how many elements to expect from process p-1  (= mp*np)
01359             ! and where to put this in a  (ia:ia+mp-1, ja:ja+np-1)
01360             !
01361 
01362             mp = lm(p) - nover
01363             np = ln(p) - nover
01364             ia = is(p) + nbord
01365             ja = js(p) + nbord
01366 
01367             ! correction if b is at a border:
01368 
01369             if (isleft(p)) then
01370                ja = ja - nbord
01371                np = np + nbord
01372             endif
01373             if (isright(p)) then
01374                np = np + nbord
01375             endif
01376             if (istop(p)) then
01377                ia = ia - nbord
01378                mp = mp + nbord
01379             endif
01380             if (isbot(p)) then
01381                mp = mp + nbord
01382             endif
01383 
01384             ! to avoid deadlocks, the master first sends a
01385             ! short message to process p-1. After receiving that message,
01386             ! process p-1 will start sending its submatrix
01387 
01388             call MPI_Send(mp*np, 1, MPI_INTEGER, p - 1, tag1, comm, ierror)
01389 
01390             ! receive a submatrix from process p-1
01391 
01392             call MPI_Recv(a(ia:ia+mp-1,ja:ja+np-1), mp*np, MPI_DOUBLE_PRECISION, &
01393             p - 1, tag2, comm, MPI_STATUS_IGNORE, ierror)
01394 
01395          enddo  ploop ! loop over non-root processes
01396       else ! above is the root code, now the non-root code
01397          ! receive the short message from root, so I can start sending b
01398          call MPI_Recv(i, 1, MPI_INTEGER,&
01399          root, tag1, comm, MPI_STATUS_IGNORE, ierror)
01400 
01401          ! which part of b to send: b(browstart:browend,bcolstart:bcolend)
01402 
01403          ! sanity check
01404          k = (browend - browstart + 1)*(bcolend - bcolstart + 1)
01405          if (i .ne. k) then
01406             write(*,*)'process ',rank,' has a problem in matrix_coll_recvmat_real8: '
01407             write(*,*)'number of words to send:',k
01408             write(*,*)'root expects:           ',i
01409             do i=1,procs
01410                write(*,*)i-1,isleft(i),isright(i),istop(i),isbot(i)
01411             enddo
01412             write(*,*)isleftl,isrightl,istopl,isbotl
01413             call MPI_Abort(MPI_COMM_WORLD,1,ierror)
01414          endif
01415 
01416          call MPI_Send(b(browstart:browend,bcolstart:bcolend), k, &
01417          MPI_DOUBLE_PRECISION, root, tag2, comm, ierror)
01418       endif
01419 
01420    end subroutine matrix_coll_recvmat_real8
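
   ! --------------------------------------------------------------------
   ! Illustrative usage sketch (hypothetical driver, not part of the
   ! original code): every process calls the generic interface
   ! matrix_coll with the same decomposition description arrays; only
   ! the root process needs a valid global matrix a.
   ! --------------------------------------------------------------------
   subroutine example_collect_real8(a, b, is, lm, js, ln, &
   isleft, isright, istop, isbot, comm)
      implicit none
      real*8,  intent(out), dimension(:,:) :: a   ! global matrix, filled on root only
      real*8,  intent(in),  dimension(:,:) :: b   ! local submatrix on every process
      integer, intent(in),  dimension(:)   :: is, lm, js, ln
      logical, intent(in),  dimension(:)   :: isleft, isright, istop, isbot
      integer, intent(in)                  :: comm
      integer, parameter                   :: root = 0

      ! for real*8 arguments the generic name resolves to
      ! matrix_coll_recvmat_real8 above
      call matrix_coll(a, b, is, lm, js, ln, &
      isleft, isright, istop, isbot, root, comm)
   end subroutine example_collect_real8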
01421 
01422    subroutine matrix_coll_recvmat_int(a,b,is,lm,js,ln,&
01423    isleft,isright,istop,isbot,root,comm)
01424       ! This is a copy of matrix_coll_recvmat_real8 for integers
01425       use mpi
01426       implicit none
01427       integer, intent(out), dimension(:,:)  :: a
01428       integer, intent(in),  dimension(:,:)  :: b
01429       integer, intent(in), dimension(:)    :: is, lm
01430       integer, intent(in), dimension(:)    :: js, ln
01431       logical, intent(in), dimension(:)    :: isleft, isright, istop, isbot
01432       integer, intent(in)                  :: root, comm
01433 
01434       integer                              :: i, rank, ierror, procs, p
01435 
01436       integer                              :: mlocal, nlocal
01437 
01438       !logical, allocatable, dimension(:)   :: isleft, isright, istop, isbot
01439 
01440       integer, parameter                   :: tag = 2
01441 
01442       integer                              :: acolstart, acolend, arowstart, arowend
01443       integer                              :: bcolstart, bcolend, browstart, browend
01444       integer                              :: mp, np, k, ia, ja
01445       integer, parameter                   :: tag1 = 123
01446       integer, parameter                   :: tag2 = 124
01447       logical                              :: isleftl, isrightl, istopl, isbotl
01448 
01449       ! nover is the number of overlapping rows/columns of submatrices.
01450       ! nbord is the width of the border in rows/columns (nover = 2*nbord).
01451 
01452       integer, parameter                   :: nover=4
01453       integer, parameter                   :: nbord=nover/2
01454 
01455       call MPI_Comm_rank(comm, rank, ierror)
01456       call MPI_Comm_size(comm, procs, ierror)
01457 
01458       nlocal   = ln(rank+1)
01459       mlocal   = lm(rank+1)
01460       isleftl  = isleft(rank+1)
01461       isrightl = isright(rank+1)
01462       istopl   = istop(rank+1)
01463       isbotl   = isbot(rank+1)
01464 
01465       ! define start-end cols and rows for the general case:
01466       ! computations on arow.. and acol.. are only meaningful at root
01467       browstart  = nbord  + 1
01468       browend    = mlocal - nbord
01469       bcolstart  = nbord + 1
01470       bcolend    = nlocal - nbord
01471       arowstart  = is(root + 1) + nbord
01472       arowend    = is(root + 1) + mlocal - 1 - nbord
01473       acolstart  = js(root + 1) + nbord
01474       acolend    = js(root + 1) + nlocal - 1 - nbord
01475 
01476       ! correct for the border cases:
01477 
01478       if (isleftl) then
01479          bcolstart = bcolstart - nbord
01480          acolstart = acolstart - nbord
01481       endif
01482       if (isrightl) then
01483          bcolend = bcolend + nbord
01484          acolend = acolend + nbord
01485       endif
01486       if (istopl) then
01487          browstart = browstart - nbord
01488          arowstart = arowstart - nbord
01489       endif
01490       if (isbotl) then
01491          browend = browend + nbord
01492          arowend = arowend + nbord
01493       endif
01494 
01495       if (rank .eq. root) then
01496 
01497          ! copy local submatrix
01498 
01499          ! copy of the local submatrix b into a:
01500 
01501          a(arowstart:arowend,acolstart:acolend) = &
01502          b(browstart:browend,bcolstart:bcolend)
01503 
01504          ! now, collect from each non-root process
01505          ! its matrix b, and put it in the proper place in a
01506 
01507          ploop: do p = 1, procs
01508             if ( p - 1 .eq. root ) then
01509                cycle ploop
01510             endif
01511             ! how many elements to expect from process p-1  (= mp*np)
01512             ! and where to put this in a  (ia:ia+mp-1, ja:ja+np-1)
01513             !
01514 
01515             mp = lm(p) - nover
01516             np = ln(p) - nover
01517             ia = is(p) + nbord
01518             ja = js(p) + nbord
01519 
01520             ! correction if b is at a border:
01521 
01522             if (isleft(p)) then
01523                ja = ja - nbord
01524                np = np + nbord
01525             endif
01526             if (isright(p)) then
01527                np = np + nbord
01528             endif
01529             if (istop(p)) then
01530                ia = ia - nbord
01531                mp = mp + nbord
01532             endif
01533             if (isbot(p)) then
01534                mp = mp + nbord
01535             endif
01536 
01537             ! to avoid deadlocks, the master first sends a
01538             ! short message to process p-1. After receiving that message,
01539             ! process p-1 will start sending its submatrix
01540 
01541             call MPI_Send(mp*np, 1, MPI_INTEGER, p - 1, tag1, comm, ierror)
01542 
01543             ! receive a submatrix from process p-1
01544 
01545             call MPI_Recv(a(ia:ia+mp-1,ja:ja+np-1), mp*np, MPI_INTEGER, &
01546             p - 1, tag2, comm, MPI_STATUS_IGNORE, ierror)
01547 
01548          enddo  ploop ! loop over non-root processes
01549       else ! above is the root code, now the non-root code
01550          ! receive the short message from root, so I can start sending b
01551          call MPI_Recv(i, 1, MPI_INTEGER,&
01552          root, tag1, comm, MPI_STATUS_IGNORE, ierror)
01553 
01554          ! which part of b to send: b(browstart:browend,bcolstart:bcolend)
01555 
01556          ! sanity check
01557          k = (browend - browstart + 1)*(bcolend - bcolstart + 1)
01558          if (i .ne. k) then
01559             write(*,*)'process ',rank,' has a problem in matrix_coll_recvmat_int: '
01560             write(*,*)'number of words to send:',k
01561             write(*,*)'root expects:           ',i
01562             do i=1,procs
01563                write(*,*)i-1,isleft(i),isright(i),istop(i),isbot(i)
01564             enddo
01565             write(*,*)isleftl,isrightl,istopl,isbotl
01566             call MPI_Abort(MPI_COMM_WORLD,1,ierror)
01567          endif
01568 
01569          call MPI_Send(b(browstart:browend,bcolstart:bcolend), k, &
01570          MPI_INTEGER, root, tag2, comm, ierror)
01571       endif
01572 
01573    end subroutine matrix_coll_recvmat_int
01574 
01575    subroutine decomp(n, numprocs, myid, s, e)
01576 
01577       !  From the mpich-distribution: MPE_Decomp1d
01578       !
01579       !  n        : integer, input (for example: number of tasks or
01580       !                               dimension of an array)
01581       !  numprocs : integer, input (number of processes)
01582       !  myid     : integer, input (MPI id of this process)
01583       !  s        : integer, output
01584       !  e        : integer, output
01585       !
01586       !  The purpose of this subroutine is to divide the integers 1..n
01587       !  as evenly as possible over numprocs intervals.
01588       !  A common use is to divide work among MPI processes.
01589       !
01590       !  example:
01591       !
01592       !  integer numprocs, ier, myid, n, s, e, i
01593       !  call MPI_Comm_size(MPI_COMM_WORLD, numprocs, ier)
01594       !  call MPI_Comm_rank(MPI_COMM_WORLD, myid, ier)
01595       !  n = 1000
01596       !  call decomp(n, numprocs, myid, s, e)
01597       !  do i = s,e
01598       !    ...
01599       !  enddo
01600       !
01601       integer, intent(in) :: n, numprocs, myid
01602       integer, intent(out):: s, e
01603       integer nlocal
01604       integer deficit
01605 
01606       nlocal  = n / numprocs
01607       s       = myid * nlocal + 1
01608       deficit = mod(n,numprocs)
01609       s       = s + min(myid,deficit)
01610       if (myid .lt. deficit) then
01611          nlocal = nlocal + 1
01612       endif
01613       e = s + nlocal - 1
01614       if (e .gt. n .or. myid .eq. numprocs-1) then
01615          e = n
01616       endif
01617 
01618    end subroutine decomp
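
   ! --------------------------------------------------------------------
   ! Worked example (sketch, not part of the original code): dividing
   ! n = 10 over 3 processes gives the ranges 1..4, 5..7 and 8..10, so
   ! the remainder of the division goes to the lowest ranks.
   ! --------------------------------------------------------------------
   subroutine example_decomp
      implicit none
      integer :: myid, s, e

      do myid = 0, 2
         call decomp(10, 3, myid, s, e)
         write(*,*) 'rank', myid, 'handles', s, '..', e  ! 1..4, 5..7, 8..10
      enddo
   end subroutine example_decomp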
01619 
01620    subroutine det_submatrices(ma,na,mp,np,is,lm,js,ln,isleft,isright,istop,isbot)
01621       !
01622       ! determine a division of a ma*na matrix on mp*np processes
01623       ! ma: integer(in): 1st dimension of matrix to divide
01624       ! na: integer(in): 2nd dimension of this matrix
01625       ! mp: integer(in): 1st dimension of processor grid
01626       ! np: integer(in): 2nd dimension of processor grid
01627       ! is(mp*np): integer(out): is(i) will be equal to the
01628       !                          start row of the i-th submatrix
01629       ! lm(mp*np): integer(out): lm(i) will be equal to the
01630       !                          first dimension of the i-th submatrix
01631       ! js(mp*np): integer(out): js(i) will be equal to the
01632       !                          start column of the i-th submatrix
01633       ! ln(mp*np): integer(out): ln(i) will be equal to the
01634       !                          2nd dimension of the i-th submatrix
01635       ! isleft(mp*np): logical(out): isleft(i) is true if the i-th submatrix
01636       !                              shares the first column with the global matrix
01637       !                              a(:,1)
01638       ! isright(mp*np): logical(out): isright(i) is true if the i-th submatrix
01639       !                              shares the last column with the global matrix
01640       !                              a(:,na)
01641       ! istop(mp*np): logical(out): istop(i) is true if the i-th submatrix
01642       !                              shares the first row with the global matrix
01643       !                              a(1,:)
01644       ! isbot(mp*np): logical(out): isbot(i) is true if the i-th submatrix
01645       !                              shares the last row with the global matrix
01646       !                              a(ma,:)
01647       !
01648       ! The submatrices overlap:
01649       ! 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5
01650       ! 1 2 3 4 5 6 7 8 9           1 2 3 4 5 6 7 8 9 0 1
01651       !           1 2 3 4 5 6 7 8 9 0 1 2 3
01652       !
01653       ! + + + + + x x x x + + + + + x x x x + + + + + + +
01654       ! + + + + + x x x x + + + + + x x x x + + + + + + +
01655       ! + + + + + x x x x + + + + + x x x x + + + + + + +
01656       ! + + + + + x x x x + + + + + x x x x + + + + + + +
01657       ! + + + + + x x x x + + + + + x x x x + + + + + + +
01658       ! + + + + + x x x x + + + + + x x x x + + + + + + +
01659       ! + + + + + x x x x + + + + + x x x x + + + + + + +
01660       !
01661       ! This picture shows the overlap in the horizontal direction;
01662       ! the overlap is 4.
01663       ! + -> matrix element
01664       ! x -> overlapping matrix element
01665       ! The widths of the submatrices are 9, 13 and 11.
01666       ! The width of the global matrix is 25.
01667       !
01668       implicit none
01669       integer, intent(in)                :: ma,na,mp,np
01670       integer, dimension(:), intent(out) :: is,lm,js,ln
01671       logical, dimension(:), intent(out) :: isleft,isright,istop,isbot
01672 
01673       integer i,j,s,e,k
01674       integer, parameter :: nover = 4
01675 
01676       isleft = .false.
01677       isright = .false.
01678       istop = .false.
01679       isbot = .false.
01680 
01681       do i = 1, mp
01682          call decomp(ma-nover, mp, i - 1, s, e)
01683          k = 0
01684          do j = 1, np
01685             if (j .eq. 1) then
01686                isleft(i+k) = .true.
01687             endif
01688             if (j .eq. np) then
01689                isright(i+k) = .true.
01690             endif
01691             is(i + k) = s
01692             lm(i + k) = e - s + nover + 1
01693             k = k + mp
01694          enddo
01695       enddo
01696       k = 0
01697       do j=1, np
01698          call decomp(na-nover, np, j - 1, s, e)
01699          do i = 1, mp
01700             if (i .eq. 1) then
01701                istop(i+k) = .true.
01702             endif
01703             if (i .eq. mp) then
01704                isbot(i+k) = .true.
01705             endif
01706             js(i + k) = s
01707             ln(i + k) = e - s + nover + 1
01708          enddo
01709          k = k + mp
01710       enddo
01711 
01712    end subroutine det_submatrices
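
   ! --------------------------------------------------------------------
   ! Worked example (sketch, not part of the original code): dividing a
   ! 12 x 25 matrix over a 1 x 3 process grid with nover = 4 gives
   ! js = 1, 8, 15 and ln = 11, 11, 11, i.e. columns 1..11, 8..18 and
   ! 15..25, so neighbouring submatrices share 4 columns.
   ! --------------------------------------------------------------------
   subroutine example_det_submatrices
      implicit none
      integer, parameter        :: mp = 1, np = 3
      integer, dimension(mp*np) :: is, lm, js, ln
      logical, dimension(mp*np) :: isleft, isright, istop, isbot
      integer                   :: p

      call det_submatrices(12, 25, mp, np, is, lm, js, ln, &
      isleft, isright, istop, isbot)

      do p = 1, mp*np
         write(*,*) p, is(p), lm(p), js(p), ln(p), isleft(p), isright(p)
      enddo
   end subroutine example_det_submatrices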
01713 
01714    subroutine shift_borders_matrix_real8(a,left,right,top,bot,comm)
01715       !
01716       ! shift borders to and from neighbours.
01717       !
01718       use mpi
01719       implicit none
01720       real*8, dimension(:,:), intent(inout) :: a
01721       integer, intent(in)                   :: left,right,top,bot,comm
01722 
01723       integer, parameter                    :: datatype = MPI_DOUBLE_PRECISION
01724       integer ierror,ma,na
01725 
01726       ma = ubound(a,1)
01727       na = ubound(a,2)
01728 
01729       ! send to left, receive from right
01730       call MPI_Sendrecv(a(2:ma-1,2), ma-2, datatype,       &
01731       left, 1,                           &
01732       a(2:ma-1,na), ma-2, datatype,      &
01733       right, 1,                          &
01734       comm, MPI_STATUS_IGNORE, ierror)
01735       ! send to right, receive from left
01736       call MPI_Sendrecv(a(2:ma-1,na-1), ma-2, datatype,    &
01737       right, 2,                          &
01738       a(2:ma-1,1), ma-2, datatype,       &
01739       left, 2,                           &
01740       comm, MPI_STATUS_IGNORE, ierror)
01741 
01742       ! send to bot, receive from top
01743       call MPI_Sendrecv(a(ma-1,:), na, datatype,           &
01744       bot, 3,                            &
01745       a(1,:), na, datatype,              &
01746       top, 3,                            &
01747       comm, MPI_STATUS_IGNORE, ierror)
01748       ! send to top, receive from bot
01749       call MPI_Sendrecv(a(2,:), na, datatype,              &
01750       top, 4,                            &
01751       a(ma,:), na, datatype,             &
01752       bot, 4,                            &
01753       comm, MPI_STATUS_IGNORE, ierror)
01754 
01755    end subroutine shift_borders_matrix_real8
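
   ! --------------------------------------------------------------------
   ! Illustrative usage sketch (not part of the original code): the
   ! neighbour ranks might come, for example, from a Cartesian
   ! communicator.  MPI_PROC_NULL neighbours turn the corresponding
   ! Sendrecv transfers into no-ops at the edges of the domain.  The
   ! mapping of grid dimensions to top/bottom and left/right below is
   ! an assumption of this sketch.
   ! --------------------------------------------------------------------
   subroutine example_shift_borders(a, cartcomm)
      use mpi
      implicit none
      real*8, dimension(:,:), intent(inout) :: a
      integer, intent(in)                   :: cartcomm
      integer                               :: left, right, top, bot, ierror

      ! dimension 0: rows (top/bottom), dimension 1: columns (left/right)
      call MPI_Cart_shift(cartcomm, 0, 1, top, bot, ierror)
      call MPI_Cart_shift(cartcomm, 1, 1, left, right, ierror)

      call shift_borders_matrix_real8(a, left, right, top, bot, cartcomm)
   end subroutine example_shift_borders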
01756 
01757 #endif
01758 end module general_mpi_module