XBeach
module general_mpi_module
#ifdef USEMPI

! This module contains three variants of subroutines to scatter
! a matrix from the root process to the other processes.
! matrix_distr_scatter: uses MPI_Scatterv, scattering groups
!                       of columns
! matrix_distr_send:    uses MPI_Isend and MPI_Irecv, scattering
!                       groups of columns
! matrix_distr_sendmat: uses MPI_Send and MPI_Recv, scattering
!                       submatrices as a whole
!
! and the opposite operation:
! matrix_coll_gather:  uses MPI_Gatherv, gathering groups
!                      of columns
! matrix_coll_recv:    uses MPI_Isend and MPI_Recv, gathering groups
!                      of columns
! matrix_coll_recvmat: uses MPI_Send and MPI_Recv, gathering
!                      submatrices as a whole
!
! NOTE: the matrices are considered to be bordered by columns
!       left and right and rows top and bottom. So, in general,
!       only b(2:mb-1,2:nb-1) is sent to the master process.
!       HOWEVER: if matrix b is
!       on the left:   b(:,1)  is also sent
!       on the right:  b(:,nb) is also sent
!       on the top:    b(1,:)  is also sent
!       on the bottom: b(mb,:) is also sent
!       The master process receives a(1:ma,1:na).
!       dimension of b: (1:mb,1:nb)
!       dimension of a: (1:ma,1:na)
!
! The parameter lists of the variants are the same.
! On the Lisa system at SARA (Xeon, InfiniBand), the send variety
! is about twice as fast as the scatter variety.
!
! The most probable cause is that the scatter variety uses a number
! of MPI_Scatterv's, each of which only starts after the previous one
! has completed, while in the send variety as many MPI_Isend's and
! MPI_Irecv's as possible are executed in parallel, without waiting.
! There is no non-blocking MPI_Scatterv available, alas.
!
! On the same system, there is no significant difference in
! performance between matrix_coll_gather and matrix_coll_recv.
!
! The recvmat and sendmat varieties are even faster than the
! recv and send varieties. Moreover, the code is simpler, so
! it is best to use these, see interfaces matrix_distr and matrix_coll.
!
! Most of the subroutines end with an MPI_Barrier; this was
! necessary for usage with OpenMPI, at least for the
! distribution routines. At first the reason was unclear, but it
! turned out to be a bug in the shared-memory interface of OpenMPI.
!

interface matrix_distr
   ! module procedure matrix_distr_send_real8
   ! module procedure matrix_distr_send_integer
   ! module procedure matrix_distr_scatter_real8
   ! module procedure matrix_distr_scatter_integer
   module procedure matrix_distr_sendmat_real8
   module procedure matrix_distr_sendmat_integer
end interface matrix_distr

interface matrix_coll
   ! module procedure matrix_coll_gather_real8
   ! module procedure matrix_coll_recv_real8
   module procedure matrix_coll_recvmat_int
   module procedure matrix_coll_recvmat_real8
end interface matrix_coll

interface shift_borders
   module procedure shift_borders_matrix_real8
end interface shift_borders
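! Example (illustrative sketch): a typical calling sequence for the generic
! interfaces above. The array names a, b and the descriptor arrays is, lm,
! js, ln, isleft, isright, istop, isbot are assumed to have been filled with
! det_submatrices (see below); root and comm are the usual MPI root rank and
! communicator.
!
!   ! call det_submatrices(ma, na, mp, np, is, lm, js, ln, &
!   !                      isleft, isright, istop, isbot)
!   ! call matrix_distr(a, b, is, lm, js, ln, root, comm)
!   ! ... compute on the local submatrix b ...
!   ! call matrix_coll(a, b, is, lm, js, ln, &
!   !                  isleft, isright, istop, isbot, root, comm)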
contains

subroutine vector_distr_send(a,b,is,lm,root,comm)
   !
   ! Distribute vector a on processes
   ! Process P receives a(is(P):is(P)+lm(P)-1)
   !
   ! Parameters:
   ! real*8  a (in): vector to distribute, only used at root
   ! real*8  b (out): local vector
   ! integer is(number-of-processes) (in): see above: needed on all processes
   ! integer lm(number-of-processes) (in): see above: needed on all processes
   ! integer root (in): process that holds a, is, lm
   ! integer comm (in): MPI communicator
   !
   ! Method: the root process sends, using MPI_Isend, parts of a to
   !         the other processes. On the root process a copy is
   !         made.
   !
   use mpi
   implicit none
   real*8, dimension(:), intent(in)  :: a
   real*8, dimension(:), intent(out) :: b
   integer, dimension(:), intent(in) :: is
   integer, dimension(:), intent(in) :: lm
   integer, intent(in)               :: root
   integer, intent(in)               :: comm

   integer                            :: ierror,p,procs,rank,pp,mlocal
   integer, dimension(:), allocatable :: req
   real*8, dimension(:), allocatable  :: aa

   call MPI_Comm_rank(comm, rank, ierror)
   call MPI_Comm_size(comm, procs, ierror)
   allocate(req(procs-1))

   mlocal = lm(rank+1)

   pp = 0
   if ( rank .eq. root ) then
      allocate(aa(size(a)))
      aa = a            ! must have contiguous memory for a because
                        ! of mpi_isend
      do p = 1,procs
         if ( p-1 .eq. rank ) then
            b(1:lm(p)) = a(is(p) : is(p)+lm(p)-1)
         else
            pp = pp + 1
            call MPI_Isend(aa(is(p)), lm(p), MPI_DOUBLE_PRECISION, p-1, &
                           11, comm, req(pp), ierror)
         endif
      enddo
      call MPI_Waitall(pp,req,MPI_STATUSES_IGNORE,ierror)
      deallocate(aa)
   else
      call MPI_Recv(b, mlocal, MPI_DOUBLE_PRECISION, root, 11,&
                    comm, MPI_STATUS_IGNORE, ierror)
   endif

   deallocate(req)

end subroutine vector_distr_send
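! Example (illustrative sketch): one way to fill is and lm before calling
! vector_distr_send, using decomp (see below) to divide a vector of length n.
! The names n, s, e, p, procs, rank, a, b, root and comm are assumptions for
! the example only.
!
!   ! do p = 1, procs
!   !    call decomp(n, procs, p-1, s, e)
!   !    is(p) = s
!   !    lm(p) = e - s + 1
!   ! enddo
!   ! allocate(b(lm(rank+1)))
!   ! call vector_distr_send(a, b, is, lm, root, comm)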
subroutine matrix_distr_scatter_real8(a,b,is,lm,js,ln,root,comm)
   ! distribute matrix on processes
   !
   ! parameters:
   ! real*8 a(:,:) (in): matrix to be scattered.
   ! real*8 b(:,:) (out): local matrix to scatter to.
   !                      This matrix has to be available, it is not
   !                      allocated by this subroutine.
   ! integer is,lm,js,ln(number of processes) (in): description of
   !   the location of the submatrices b in matrix a:
   !   Element b(1,1) coincides with a(is(rank+1),js(rank+1)), where rank is
   !   the rank of the process (0..procs-1)
   !   The length of the 1st dimension of b is given by lm(rank+1)
   !   The length of the 2nd dimension of b is given by ln(rank+1)
   ! integer root (in): the rank of the process where a is available
   ! integer comm (in): the MPI communicator to be used.
   !
   ! Note: a,is,js are used at the root process only
   !
   ! Method
   !
   ! Distribute the columns of the submatrices in a number of scatterv's.
   ! Per scatterv, distribute (number of processes) columns, so
   ! we need maxval(ln) scatters.
   !
   use mpi
   implicit none
   real*8, intent(in), dimension(:,:)  :: a
   real*8, intent(out), dimension(:,:) :: b
   integer, intent(in), dimension(:)   :: is,lm
   integer, intent(in), dimension(:)   :: js,ln
   integer, intent(in)                 :: root,comm

   integer :: j,p,rank,ierror,procs,jj,ma,jmax
   integer :: nlocal,mlocal,rcnt
   integer, allocatable, dimension(:)  :: displs, cnts
   real*8, allocatable, dimension(:,:) :: aa

   call MPI_Comm_rank(comm, rank, ierror)
   call MPI_Comm_size(comm, procs, ierror)

   nlocal = ln(rank+1)
   mlocal = lm(rank+1)

   if (rank .eq. root) then
      ma = size(a,1)
      allocate(displs(procs), cnts(procs))
      ! because the counts and displacements are only valid for a
      ! contiguous matrix, we take care of that:
      allocate(aa(size(a,1),size(a,2)))
      aa = a
   else
      ma = 1                       ! to quiet the compiler
      allocate(displs(1), cnts(1)) ! these are not needed in non-root processes,
      allocate(aa(1,1))            ! but it is safer to have valid addresses
      displs(1) = 0
      cnts(1)   = 0
   endif

   jmax = maxval(ln(1:procs))  ! number of scatters needed

   do j=1,jmax   ! scatter all j'th columns
      if (rank .eq. root) then
         do p=1,procs
            if (ln(p) .ge. j) then ! compute displacement of the start of the j-th
                                   ! column of submatrix(p)
               displs(p) = (js(p) -1 + j -1)*ma + is(p) - 1
               cnts(p)   = lm(p)   ! number of elements in this column
            else                   ! for submatrix(p), we are out of columns
               displs(p) = 0       ! put in some valid value for the displacement
               cnts(p)   = 0       ! and use zero for the number of elements in
                                   ! this column
            endif
         enddo
      endif
      if (nlocal .ge. j) then  ! this is for all processes
         jj   = j              ! jj equals the column number of the local matrix
         rcnt = mlocal         ! number of elements in this column
      else
         jj   = 1              ! we are out of columns, give jj a valid number
         rcnt = 0              ! and set the number of elements to receive to zero
      endif

      call MPI_Scatterv(aa(1,1), cnts(1), displs(1), MPI_DOUBLE_PRECISION, &
                        b(:,jj), rcnt, MPI_DOUBLE_PRECISION,               &
                        root, comm, ierror)
   enddo

   deallocate(displs,cnts,aa)

end subroutine matrix_distr_scatter_real8
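! Worked example for the displacement formula used above (illustrative
! numbers): take ma = 10, is(p) = 3, js(p) = 4. The j-th column of
! submatrix p starts at global element a(3, 4+j-1); with column-major
! storage its zero-based offset from a(1,1) is
!
!   displs(p) = (js(p)-1 + j-1)*ma + is(p) - 1 = (j+2)*10 + 2
!
! so for j = 1 the offset is 32, which is indeed element a(3,4), and
! cnts(p) = lm(p) elements (one whole column of the submatrix) are sent.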
subroutine matrix_distr_scatter_integer(a,b,is,lm,js,ln,root,comm)
   ! distribute matrix on processes
   !
   ! parameters:
   ! integer a(:,:) (in): matrix to be scattered.
   ! integer b(:,:) (out): local matrix to scatter to.
   !                       This matrix has to be available, it is not
   !                       allocated by this subroutine.
   ! integer is,lm,js,ln(number of processes) (in): description of
   !   the location of the submatrices b in matrix a:
   !   Element b(1,1) coincides with a(is(rank+1),js(rank+1)), where rank is
   !   the rank of the process (0..procs-1)
   !   The length of the 1st dimension of b is given by lm(rank+1)
   !   The length of the 2nd dimension of b is given by ln(rank+1)
   ! integer root (in): the rank of the process where a is available
   ! integer comm (in): the MPI communicator to be used.
   !
   ! Note: a,is,js are used at the root process only
   !
   ! Method
   !
   ! Distribute the columns of the submatrices in a number of scatterv's.
   ! Per scatterv, distribute (number of processes) columns, so
   ! we need maxval(ln) scatters.
   !
   use mpi
   implicit none
   integer, intent(in), dimension(:,:)  :: a
   integer, intent(out), dimension(:,:) :: b
   integer, intent(in), dimension(:)    :: is,lm
   integer, intent(in), dimension(:)    :: js,ln
   integer, intent(in)                  :: root,comm

   integer :: j,p,rank,ierror,procs,jj,ma,jmax
   integer :: nlocal,mlocal,rcnt
   integer, allocatable, dimension(:)   :: displs, cnts
   integer, allocatable, dimension(:,:) :: aa

   integer, parameter :: tag = 1

   call MPI_Comm_rank(comm, rank, ierror)
   call MPI_Comm_size(comm, procs, ierror)

   nlocal = ln(rank+1)
   mlocal = lm(rank+1)

   if (rank .eq. root) then
      ma = size(a,1)
      allocate(displs(procs), cnts(procs))
      allocate(aa(size(a,1),size(a,2)))
      aa = a
   else
      ma = 1                       ! to quiet the compiler
      allocate(displs(1), cnts(1)) ! these are not needed in non-root processes,
                                   ! but it is safer to have valid addresses
      allocate(aa(1,1))
   endif

   jmax = maxval(ln(1:procs))  ! number of scatters needed

   do j=1,jmax   ! scatter all j'th columns
      if (rank .eq. root) then
         do p=1,procs
            if (ln(p) .ge. j) then ! compute displacement of the start of the j-th
                                   ! column of submatrix(p)
               displs(p) = (js(p) -1 + j -1)*ma + is(p) - 1
               cnts(p)   = lm(p)   ! number of elements in this column
            else                   ! for submatrix(p), we are out of columns
               displs(p) = 0       ! put in some valid value for the displacement
               cnts(p)   = 0       ! and use zero for the number of elements in
                                   ! this column
            endif
         enddo
      endif
      if (nlocal .ge. j) then  ! this is for all processes
         jj   = j              ! jj equals the column number of the local matrix
         rcnt = mlocal         ! number of elements in this column
      else
         jj   = 1              ! we are out of columns, give jj a valid number
         rcnt = 0              ! and set the number of elements to receive to zero
      endif

      call MPI_Scatterv(aa(1,1), cnts(1), displs(1), MPI_INTEGER, &
                        b(:,jj), rcnt, MPI_INTEGER,               &
                        root, comm, ierror)
   enddo

   deallocate(displs,cnts,aa)

end subroutine matrix_distr_scatter_integer
subroutine matrix_distr_send_real8(a,b,is,lm,js,ln,root,comm)
   ! distribute matrix on processes
   !
   ! parameters:
   ! real*8 a(:,:) (in): matrix to be scattered.
   ! real*8 b(:,:) (out): local matrix to scatter to.
   !                      This matrix has to be available, it is not
   !                      allocated by this subroutine.
   ! integer is,lm,js,ln(number of processes) (in): description of
   !   the location of the submatrices b in matrix a:
   !   Element b(1,1) coincides with a(is(rank+1),js(rank+1)), where rank is
   !   the rank of the process (0..procs-1)
   !   The length of the 1st dimension of b is given by lm(rank+1)
   !   The length of the 2nd dimension of b is given by ln(rank+1)
   ! integer root (in): the rank of the process where a is available
   ! integer comm (in): the MPI communicator to be used.
   !
   ! Note: a,is,js are used at the root process only
   !
   ! Method
   !
   ! Sends columns of submatrices to the processes using MPI_Isend.
   ! This routine tries to send as many columns in parallel
   ! to different processes as possible, see the usage of req(:)
   ! and maxreq below.
   ! The columns are received using MPI_Irecv.
   !
   use mpi
   implicit none
   real*8, intent(in), dimension(:,:)  :: a
   real*8, intent(out), dimension(:,:) :: b
   integer, intent(in), dimension(:)   :: is,lm
   integer, intent(in), dimension(:)   :: js,ln
   integer, intent(in)                 :: root,comm

   integer :: j,rank,ierror,l,procs

   integer :: sends, sends_todo, length
   integer :: p,mlocal,nlocal

   logical :: done

   integer :: maxreq
   integer, allocatable, dimension(:) :: req

   integer, parameter :: tag = 1

   integer, parameter :: datatype = MPI_DOUBLE_PRECISION
   real*8, allocatable, dimension(:,:) :: aa,bb

   call MPI_Comm_rank(comm, rank, ierror)
   call MPI_Comm_size(comm, procs, ierror)

   nlocal = ln(rank+1)
   mlocal = lm(rank+1)

   ! calculate total number of sends to be done: that is equal to
   ! the total number of columns in the submatrices

   if (rank .eq. root) then
      sends_todo = sum(ln(1:procs))
      ! maxreq is the maximum number of outstanding Isend or Irecv requests.
      ! Experiments showed that the value of maxreq is not critical:
      ! a value of 10 gave only somewhat worse results on the Lisa
      ! system (Xeon, InfiniBand). Probably, the slower the network,
      ! the higher this value can be set. The maximum usable value is the
      ! total number of columns (sends_todo, see above).
      maxreq = 1000
   else
      sends_todo = 0      ! to quiet the compiler
      maxreq     = nlocal ! on non-root processes, we need only the number
                          ! of local columns
   endif

   ! distribute matrix.
   ! to ensure that every process is as busy as possible, we do the
   ! loops on the processes in the inner loop.

   allocate(req(maxreq))
   req = MPI_REQUEST_NULL   ! initialize req with a null-value

   allocate(bb(size(b,1),size(b,2)))
   if ( rank .eq. root) then
      ! we need a contiguous matrix a, because
      ! we are going to use isend.
      allocate(aa(size(a,1),size(a,2)))
      aa = a

      sends = 0
      l     = 0
      j     = 0
      loop: do
         j = j+1                       ! j runs over the columns
         do p=0, procs - 1
            if (ln(p + 1) .ge. j) then ! is j in the range of submatrix(p)
               length = lm(p + 1)
               if (p .eq. rank) then   ! submatrix(p) is on root, just copy the column
                  b(1:length,j) = a(is(rank+1):is(rank+1)+length-1,j+js(rank+1)-1)
               else
                  ! search a free request holder:
                  done = .false.
                  do while( .not. done)
                     l = l+1
                     if (l .gt. maxreq) l = 1
                     if (req(l) .eq. MPI_REQUEST_NULL) then
                        done = .true.
                     else
                        !
                        ! MPI_Test gives .true. if req(l).eq.MPI_REQUEST_NULL,
                        ! but the standard says nothing about this case,
                        ! so we test first if req(l) is equal to MPI_REQUEST_NULL
                        !
                        call MPI_Test(req(l),done,MPI_STATUS_IGNORE,ierror)
                     endif
                  enddo  ! now we have a free request holder req(l)
                  call MPI_Isend(aa(is(p + 1), j + js(p + 1) - 1), length, &
                                 datatype, p, tag, comm, req(l), ierror)
               endif
               sends = sends + 1
               if(sends .ge. sends_todo) exit loop ! finish if the number of sends is
                                                   ! equal to the total number of
                                                   ! sends needed
            endif
         enddo
      enddo loop
      deallocate(aa)
   else  ! below the code for non-root:
      ! because of irecv, we must be sure that we have a contiguous matrix
      do j=1,nlocal
         call MPI_Irecv(bb(1,j), mlocal, datatype, root, tag, comm, req(j), ierror)
      enddo
   endif

   call MPI_Waitall(maxreq,req,MPI_STATUSES_IGNORE,ierror)

   if (rank .ne. root) then
      b = bb
   endif

   deallocate(bb)
   deallocate(req)

end subroutine matrix_distr_send_real8
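! Worked example of the send bookkeeping above (illustrative numbers):
! with procs = 3 and ln = (/4, 3, 5/), sends_todo = 12. The outer loop
! runs over j = 1..5; for j = 4 only submatrices 1 and 3 still have a
! j-th column, and for j = 5 only submatrix 3, so exactly 12 column
! transfers (including the local copies on root) are counted before
! "exit loop" is reached.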
subroutine matrix_distr_send_integer(a,b,is,lm,js,ln,root,comm)
   ! distribute matrix on processes
   !
   ! parameters:
   ! integer a(:,:) (in): matrix to be scattered.
   ! integer b(:,:) (out): local matrix to scatter to.
   !                       This matrix has to be available, it is not
   !                       allocated by this subroutine.
   ! integer is,lm,js,ln(number of processes) (in): description of
   !   the location of the submatrices b in matrix a:
   !   Element b(1,1) coincides with a(is(rank+1),js(rank+1)), where rank is
   !   the rank of the process (0..procs-1)
   !   The length of the 1st dimension of b is given by lm(rank+1)
   !   The length of the 2nd dimension of b is given by ln(rank+1)
   ! integer root (in): the rank of the process where a is available
   ! integer comm (in): the MPI communicator to be used.
   !
   ! Note: a,is,js are used at the root process only
   !
   ! Method
   !
   ! Sends columns of submatrices to the processes using MPI_Isend.
   ! This routine tries to send as many columns in parallel
   ! to different processes as possible, see the usage of req(:)
   ! and maxreq below.
   ! The columns are received using MPI_Irecv.
   !
   use mpi
   implicit none
   integer, intent(in), dimension(:,:)  :: a
   integer, intent(out), dimension(:,:) :: b
   integer, intent(in), dimension(:)    :: is,lm
   integer, intent(in), dimension(:)    :: js,ln
   integer, intent(in)                  :: root,comm

   integer :: j,rank,ierror,l,procs

   integer :: sends, sends_todo, length
   integer :: p,mlocal,nlocal

   logical :: done

   integer :: maxreq
   integer, allocatable, dimension(:) :: req

   integer, parameter :: tag = 1

   integer, parameter :: datatype = MPI_INTEGER
   integer, allocatable, dimension(:,:) :: aa,bb

   call MPI_Comm_rank(comm, rank, ierror)
   call MPI_Comm_size(comm, procs, ierror)

   nlocal = ln(rank+1)
   mlocal = lm(rank+1)

   ! calculate total number of sends to be done: that is equal to
   ! the total number of columns in the submatrices

   if (rank .eq. root) then
      sends_todo = sum(ln(1:procs))
      ! maxreq is the maximum number of outstanding Isend or Irecv requests.
      ! Experiments showed that the value of maxreq is not critical:
      ! a value of 10 gave only somewhat worse results on the Lisa
      ! system (Xeon, InfiniBand). Probably, the slower the network,
      ! the higher this value can be set. The maximum usable value is the
      ! total number of columns (sends_todo, see above).
      maxreq = 1000
   else
      sends_todo = 0      ! to quiet the compiler
      maxreq     = nlocal ! on non-root processes, we need only the number
                          ! of local columns
   endif

   ! distribute matrix.
   ! to ensure that every process is as busy as possible, we do the
   ! loops on the processes in the inner loop.

   allocate(req(maxreq))
   req = MPI_REQUEST_NULL   ! initialize req with a null-value

   allocate(bb(size(b,1),size(b,2)))
   if ( rank .eq. root) then
      allocate(aa(size(a,1),size(a,2)))
      aa = a
      sends = 0
      l     = 0
      j     = 0
      loop: do
         j = j+1                       ! j runs over the columns
         do p=0, procs - 1
            if (ln(p + 1) .ge. j) then ! is j in the range of submatrix(p)
               length = lm(p + 1)
               if (p .eq. rank) then   ! submatrix(p) is on root, just copy the column
                  b(1:length,j) = a(is(rank+1):is(rank+1)+length-1,j+js(rank+1)-1)
               else
                  ! search a free request holder:
                  done = .false.
                  do while( .not. done)
                     l = l+1
                     if (l .gt. maxreq) l = 1
                     if (req(l) .eq. MPI_REQUEST_NULL) then
                        done = .true.
                     else
                        !
                        ! MPI_Test gives .true. if req(l).eq.MPI_REQUEST_NULL,
                        ! but the standard says nothing about this case,
                        ! so we test first if req(l) is equal to MPI_REQUEST_NULL
                        !
                        call MPI_Test(req(l),done,MPI_STATUS_IGNORE,ierror)
                     endif
                  enddo  ! now we have a free request holder req(l)
                  call MPI_Isend(aa(is(p + 1), j + js(p + 1) - 1), length, &
                                 datatype, p, tag, comm, req(l), ierror)
               endif
               sends = sends + 1
               if(sends .ge. sends_todo) exit loop ! finish if the number of sends is
                                                   ! equal to the total number of
                                                   ! sends needed
            endif
         enddo
      enddo loop
      deallocate(aa)
   else  ! below the code for non-root:
      do j=1,nlocal
         call MPI_Irecv(bb(1,j), mlocal, datatype, root, tag, comm, req(j), ierror)
      enddo
   endif

   do l=1,maxreq   ! wait until all requests are done
      if (req(l) .ne. MPI_REQUEST_NULL) then
         call MPI_Wait(req(l), MPI_STATUS_IGNORE, ierror)
      endif
   enddo

   if (rank .ne. root) then
      b = bb
   endif
   deallocate(bb)

   deallocate(req)

end subroutine matrix_distr_send_integer
subroutine matrix_distr_sendmat_real8(a,b,is,lm,js,ln,&
                                      root,comm)

   ! distribute matrix on root process to all processes
   !
   ! parameters:
   ! real*8 a(:,:) (in): matrix to be scattered
   ! real*8 b(:,:) (out): local scattered matrix
   ! integer is,lm,js,ln(number of processes) (in): description of
   !   the location of the submatrices b in matrix a:
   !   Element b(1,1) coincides with a(is(rank+1),js(rank+1)), where rank is
   !   the rank of the process (0..procs-1)
   !   The length of the 1st dimension of b is given by lm(rank+1)
   !   The length of the 2nd dimension of b is given by ln(rank+1)
   ! integer root (in): the rank of the process where a is available
   ! integer comm (in): the MPI communicator to be used.
   !
   ! Note: a,is,js are used at the root process only
   !
   ! Method
   !
   ! root sends the submatrices as a whole, one by one. The local
   ! submatrix is copied.
   !

   use mpi
   implicit none
   real*8, intent(in), dimension(:,:)  :: a
   real*8, intent(out), dimension(:,:) :: b
   integer, intent(in), dimension(:)   :: is, lm
   integer, intent(in), dimension(:)   :: js, ln
   integer, intent(in)                 :: root, comm

   integer :: i, rank, ierror, procs, p
   integer :: mlocal, nlocal
   integer :: acolstart, acolend, arowstart, arowend
   integer :: mp, np, k, ia, ja
   integer, parameter :: tag1 = 125
   integer, parameter :: tag2 = 126

   call MPI_Comm_rank(comm, rank, ierror)
   call MPI_Comm_size(comm, procs, ierror)

   nlocal = ln(rank+1)
   mlocal = lm(rank+1)

   ! define start-end cols and rows for the general case:
   ! computations on arow.. and acol.. are only meaningful at root

   ! See also spaceparams comments:
   ! b coincides with
   ! a(is(p+1):is(p+1)+lm(p+1)-1,js(p+1):js(p+1)+ln(p+1)-1)

   arowstart = is(root + 1)
   arowend   = is(root + 1) + mlocal - 1
   acolstart = js(root + 1)
   acolend   = js(root + 1) + nlocal - 1
   if (rank .eq. root) then
      ! copy the relevant part of a to the local submatrix b:
      b(:,:) = a(arowstart:arowend,acolstart:acolend)

      ! now, send to each non-root process
      ! its part of matrix a

      ploop: do p = 1, procs
         if ( p - 1 .eq. root ) then
            cycle ploop
         endif
         ! how many elements to send to process p-1 (= mp*np)
         ! and where to find them in a (ia:ia+mp-1, ja:ja+np-1)
         !

         mp = lm(p)
         np = ln(p)
         ia = is(p)
         ja = js(p)

         ! to avoid deadlocks, master first sends a
         ! short message to p-1. After receiving that message,
         ! p-1 will start receiving its submatrix

         call MPI_Send(mp*np, 1, MPI_INTEGER, p - 1, tag1, comm, ierror)

         ! send a submatrix to process p-1:

         call MPI_Send(a(ia:ia+mp-1,ja:ja+np-1), mp*np, MPI_DOUBLE_PRECISION, &
                       p - 1, tag2, comm, ierror)

      enddo ploop   ! loop over non-root processes

   else   ! above is the root code, below the non-root code
      ! receive the short message from root, so I can start receiving b
      call MPI_Recv(i, 1, MPI_INTEGER,&
                    root, tag1, comm, MPI_STATUS_IGNORE, ierror)

      ! sanity check

      k = size(b)
      if (i .ne. k) then
         write(*,*)'process ',rank,' has a problem in matrix_distr_sendmat_real8: '
         write(*,*)'number of words to receive:',k
         write(*,*)'root expects: ',i
         call MPI_Abort(MPI_COMM_WORLD,1,ierror)
      endif

      call MPI_Recv(b, k, MPI_DOUBLE_PRECISION, root, &
                    tag2, comm, MPI_STATUS_IGNORE, ierror)
   endif

end subroutine matrix_distr_sendmat_real8
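! Worked example of the submatrix placement above (illustrative numbers):
! for a process p with is(p) = 5, js(p) = 9, lm(p) = 8, ln(p) = 6, root
! first sends the count mp*np = 48, then the block a(5:12, 9:14), which
! the receiver stores as its complete local submatrix b(1:8,1:6).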
subroutine matrix_distr_sendmat_integer(a,b,is,lm,js,ln,&
                                        root,comm)

   ! distribute matrix on root process to all processes
   !
   ! parameters:
   ! integer a(:,:) (in): matrix to be scattered
   ! integer b(:,:) (out): local scattered matrix
   ! integer is,lm,js,ln(number of processes) (in): description of
   !   the location of the submatrices b in matrix a:
   !   Element b(1,1) coincides with a(is(rank+1),js(rank+1)), where rank is
   !   the rank of the process (0..procs-1)
   !   The length of the 1st dimension of b is given by lm(rank+1)
   !   The length of the 2nd dimension of b is given by ln(rank+1)
   ! integer root (in): the rank of the process where a is available
   ! integer comm (in): the MPI communicator to be used.
   !
   ! Note: a,is,js are used at the root process only
   !
   ! Method
   !
   ! root sends the submatrices as a whole, one by one. The local
   ! submatrix is copied.
   !

   use mpi
   implicit none
   integer, intent(in), dimension(:,:)  :: a
   integer, intent(out), dimension(:,:) :: b
   integer, intent(in), dimension(:)    :: is, lm
   integer, intent(in), dimension(:)    :: js, ln
   integer, intent(in)                  :: root, comm
   integer :: i, rank, ierror, procs, p
   integer :: mlocal, nlocal
   integer :: acolstart, acolend, arowstart, arowend
   integer :: mp, np, k, ia, ja
   integer, parameter :: tag1 = 125
   integer, parameter :: tag2 = 126

   call MPI_Comm_rank(comm, rank, ierror)
   call MPI_Comm_size(comm, procs, ierror)

   nlocal = ln(rank+1)
   mlocal = lm(rank+1)

   ! define start-end cols and rows for the general case:
   ! computations on arow.. and acol.. are only meaningful at root

   arowstart = is(root + 1)
   arowend   = is(root + 1) + mlocal - 1
   acolstart = js(root + 1)
   acolend   = js(root + 1) + nlocal - 1

   if (rank .eq. root) then

      ! copy the relevant part of a to the local submatrix b:

      b(:,:) = a(arowstart:arowend,acolstart:acolend)

      ! now, send to each non-root process
      ! its part of matrix a

      ploop: do p = 1, procs
         if ( p - 1 .eq. root ) then
            cycle ploop
         endif

         ! how many elements to send to process p-1 (= mp*np)
         ! and where to find them in a (ia:ia+mp-1, ja:ja+np-1)
         !

         mp = lm(p)
         np = ln(p)
         ia = is(p)
         ja = js(p)

         ! to avoid deadlocks, master first sends a
         ! short message to p - 1. After receiving that message,
         ! p - 1 will start receiving its submatrix

         call MPI_Send(mp*np, 1, MPI_INTEGER, p - 1, tag1, comm, ierror)

         ! send a submatrix to process p-1

         call MPI_Send(a(ia:ia+mp-1,ja:ja+np-1), mp*np, MPI_INTEGER, &
                       p - 1, tag2, comm, ierror)

      enddo ploop   ! loop over non-root processes

   else   ! above is the root code, below the non-root code

      ! receive the short message from root, so I can start receiving b

      call MPI_Recv(i, 1, MPI_INTEGER,&
                    root, tag1, comm, MPI_STATUS_IGNORE, ierror)

      ! sanity check

      k = size(b)
      if (i .ne. k) then
         write(*,*)'process ',rank,' has a problem in matrix_distr_sendmat_integer: '
         write(*,*)'number of words to receive:',k
         write(*,*)'root expects: ',i
         call MPI_Abort(MPI_COMM_WORLD,1,ierror)
      endif

      call MPI_Recv(b, k, MPI_INTEGER, root, &
                    tag2, comm, MPI_STATUS_IGNORE, ierror)

   endif

end subroutine matrix_distr_sendmat_integer
subroutine matrix_coll_gather_real8(a,b,is,lm,js,ln, &
                                    isleft,isright,istop,isbot,root,comm)
   ! collect matrix on root process
   !
   ! parameters:
   ! real*8 a(:,:) (out): matrix to be collected.
   !                      This matrix has to be available, it is not
   !                      allocated by this subroutine.
   ! real*8 b(:,:) (in): local scattered matrix
   ! integer is,lm,js,ln(number of processes) (in): description of
   !   the location of the submatrices b in matrix a:
   !   Element b(1,1) coincides with a(is(rank+1),js(rank+1)), where rank is
   !   the rank of the process (0..procs-1)
   !   The length of the 1st dimension of b is given by lm(rank+1)
   !   The length of the 2nd dimension of b is given by ln(rank+1)
   ! logical isleft, isright, istop, isbot (in): is this submatrix at the
   !   left, right, top or bottom of the general matrix.
   ! integer root (in): the rank of the process where a is available
   ! integer comm (in): the MPI communicator to be used.
   !
   ! Note: a,is,js are used at the root process only
   !
   ! Method
   !
   ! Use gatherv. Each gatherv call collects one column of
   ! all submatrices
   !
   ! NOTE: the matrices are considered to be bordered by columns
   !       left and right and rows top and bottom. So, in general,
   !       only b(2:mb-1,2:nb-1) is sent to the master process.
   !       HOWEVER: if matrix b is
   !       on the left:   b(:,1)  is also sent
   !       on the right:  b(:,nb) is also sent
   !       on the top:    b(1,:)  is also sent
   !       on the bottom: b(mb,:) is also sent
   !       The master process receives a(1:ma,1:na).
   !       dimension of b: (1:mb,1:nb)
   !       dimension of a: (1:ma,1:na)
   !
   use mpi
   implicit none
   real*8, intent(out), dimension(:,:) :: a
   real*8, intent(in), dimension(:,:)  :: b
   integer, intent(in), dimension(:)   :: is,lm
   integer, intent(in), dimension(:)   :: js,ln
   logical, intent(in), dimension(:)   :: isleft, isright, istop, isbot
   integer, intent(in)                 :: root,comm

   integer :: j,p,rank,ierror,procs,jj,ma,jmax,k
   integer :: nlocal,mlocal,scnt
   integer, allocatable, dimension(:)  :: displs, cnts
   logical :: isleftl, isrightl, istopl, isbotl
   real*8, allocatable, dimension(:,:) :: aa

#ifdef USEMPE
   call MPE_Log_event(event_coll_start,0,'cstart')
#endif
   call MPI_Comm_rank(comm, rank, ierror)
   call MPI_Comm_size(comm, procs, ierror)

   nlocal   = ln(rank+1)
   mlocal   = lm(rank+1)
   isleftl  = isleft(rank+1)
   isrightl = isright(rank+1)
   istopl   = istop(rank+1)
   isbotl   = isbot(rank+1)

   ma = 1   ! to quiet the compiler
   if (rank .eq. root) then
      ma = size(a,1)
   endif

   if (rank .eq. root) then
      allocate(displs(procs), cnts(procs))
      ! need a contiguous matrix, otherwise the counts and
      ! displacements will be in error
      allocate(aa(size(a,1),size(a,2)))
   else
      allocate(displs(1),cnts(1))   ! need valid addresses on non-root
      allocate(aa(1,1))
   endif

   nlocal = ln(rank+1)
   mlocal = lm(rank+1)

   jmax = maxval(ln(1:procs))   ! number of gathers needed

   do j=1,jmax
      if (rank .eq. root) then   ! displacements and counts only needed on root
         do p=1,procs
            if (ln(p) .ge. j) then
               displs(p) = (js(p) -1 + j -1)*ma + is(p)  ! compute displacement for
                                                         ! column j of submatrix p;
                                                         ! note that the first and last
                                                         ! elements of this column are
                                                         ! borders, not to be gathered
               cnts(p)   = lm(p)-2   ! number of non-border elements in this column
               ! Normally, the first and last rows and columns are not to
               ! be sent to root, but we make an exception for the
               ! rows and columns that are the first rows and columns
               ! of the global matrix.
               !
               displs(p) = (js(p) -1 + j - 1)*ma + is(p) - 1
               cnts(p)   = lm(p)
               if ( .not. isleft(p) ) then
                  ! this is a submatrix, not on the left.
                  if ( j .eq. 1 ) then
                     ! The first column is not to be sent,
                     ! so make cnts(p) = 0, and give displs(p) a
                     ! valid value
                     displs(p) = 0
                     cnts(p)   = 0
                  endif
               endif
               if ( .not. isright(p) ) then
                  ! this is a submatrix, not on the right
                  if ( j .eq. ln(p) ) then
                     ! The last column is not to be sent,
                     ! so make cnts(p) = 0, and give displs(p) a
                     ! valid value
                     displs(p) = 0
                     cnts(p)   = 0
                  endif
               endif
               if ( .not. istop(p) ) then
                  ! this is a submatrix, not on the top,
                  ! so we don't send the first element of column j;
                  ! adapt displs(p) and cnts(p) accordingly:
                  displs(p) = displs(p)+1
                  cnts(p)   = max(cnts(p)-1,0)
               endif
               if ( .not. isbot(p) ) then
                  ! this is a submatrix, not on the bottom,
                  ! so we don't send the last element of column j;
                  ! adapt cnts(p) accordingly:
                  cnts(p) = max(cnts(p)-1,0)
               endif
            else
               displs(p) = 0   ! we are out of columns, use a valid value for
                               ! the displacement
               cnts(p)   = 0   ! and set number of elements to receive for
                               ! this submatrix to zero
            endif   ! ln(p) .ge. j
         enddo   ! do p = 1,procs
      endif   ! rank .eq. root
      jj   = j        ! column of submatrix to send
      scnt = mlocal   ! number of elements to send
      k    = 1        ! starting element of column j to send
      if (nlocal .ge. j) then
         if (.not. isleftl ) then
            if ( j .eq. 1 ) then
               jj   = 1
               scnt = 0
            endif
         endif
         if (.not. isrightl ) then
            if ( j .eq. nlocal ) then
               jj   = 1
               scnt = 0
            endif
         endif
         if (.not. istopl ) then
            k    = k+1
            scnt = max(scnt-1, 0)
         endif
         if (.not. isbotl ) then
            scnt = max(scnt-1, 0)
         endif
      else
         jj   = 1   ! out of columns, use valid value for jj
         scnt = 0   ! and put number of elements to send to zero
      endif

      call MPI_Gatherv(b(k:k+scnt-1,jj), scnt, MPI_DOUBLE_PRECISION, &
                       aa(1,1), cnts, displs, MPI_DOUBLE_PRECISION,  &
                       root, comm, ierror)
   enddo

   if(rank .eq. root) then
      a = aa
   endif

   deallocate(displs,cnts,aa)

end subroutine matrix_coll_gather_real8
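! Worked example of the border convention above (illustrative numbers):
! an interior submatrix (isleft = isright = istop = isbot = .false.) of
! size mb = 6, nb = 5 contributes only b(2:5,2:4), i.e. 4 elements of each
! of its 3 interior columns. A submatrix in the top-left corner
! (isleft = istop = .true.) contributes b(1:5,1:4): its first row and
! first column are real boundary data of the global matrix; only the last
! row and column are overlap.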
subroutine matrix_coll_recv_real8 (a,b,is,lm,js,ln,&
                                   isleft,isright,istop,isbot,root,comm)
   ! collect matrix on root process
   !
   ! parameters:
   ! real*8 a(:,:) (out): matrix to be collected.
   !                      This matrix has to be available, it is not
   !                      allocated by this subroutine.
   ! real*8 b(:,:) (in): local scattered matrix
   ! integer is,lm,js,ln(number of processes) (in): description of
   !   the location of the submatrices b in matrix a:
   !   Element b(1,1) coincides with a(is(rank+1),js(rank+1)), where rank is
   !   the rank of the process (0..procs-1)
   !   The length of the 1st dimension of b is given by lm(rank+1)
   !   The length of the 2nd dimension of b is given by ln(rank+1)
   ! logical isleft, isright, istop, isbot (in): is this submatrix at the
   !   left, right, top or bottom of the general matrix.
   ! integer root (in): the rank of the process where a is available
   ! integer comm (in): the MPI communicator to be used.
   !
   ! Note: a,is,js are used at the root process only
   !
   ! Method
   !
   ! Receive columns of submatrices using MPI_Probe and MPI_Recv.
   ! The non-root processes use MPI_Isend to send the columns
   !
   ! NOTE: the matrices are considered to be bordered by columns
   !       left and right and rows top and bottom. So, in general,
   !       only b(2:mb-1,2:nb-1) is sent to the master process.
   !       HOWEVER: if matrix b is
   !       on the left:   b(:,1)  is also sent
   !       on the right:  b(:,nb) is also sent
   !       on the top:    b(1,:)  is also sent
   !       on the bottom: b(mb,:) is also sent
   !       The master process receives a(1:ma,1:na).
   !       dimension of b: (1:mb,1:nb)
   !       dimension of a: (1:ma,1:na)
   !
   use mpi
   implicit none
   real*8, intent(out), dimension(:,:) :: a
   real*8, intent(in), dimension(:,:)  :: b
   integer, intent(in), dimension(:)   :: is,lm
   integer, intent(in), dimension(:)   :: js,ln
   logical, intent(in), dimension(:)   :: isleft,isright,istop,isbot
   integer, intent(in)                 :: root,comm

   integer :: i,j,rank,ierror,l,procs,ii,jj
   integer :: jstart, jend, p, m, n

   integer :: recvs, recvs_todo
   integer :: mlocal,nlocal,sender
   logical :: isleftl, isrightl, istopl, isbotl

   integer, allocatable, dimension(:) :: col
   integer, allocatable, dimension(:) :: req

   integer, parameter :: tag = 2
   integer, dimension(MPI_STATUS_SIZE) :: mpistatus
   real*8, allocatable, dimension(:,:) :: bb

   call MPI_Comm_rank(comm, rank, ierror)
   call MPI_Comm_size(comm, procs, ierror)

   nlocal   = ln(rank+1)
   mlocal   = lm(rank+1)
   isleftl  = isleft(rank+1)
   isrightl = isright(rank+1)
   istopl   = istop(rank+1)
   isbotl   = isbot(rank+1)

   if (rank .eq. root) then
      ! calculate total number of recvs to be done: that is equal to
      ! the total number of columns in the submatrices
      recvs_todo = sum(ln(1:procs))

      recvs_todo = recvs_todo - nlocal   ! correct for the columns that are
                                         ! locally available

      do p=1,procs   ! correct for borders
         if (p .eq. root+1) then
            cycle
         endif
         if ( .not. isleft(p)) then
            recvs_todo = recvs_todo -1
         endif
         if ( .not. isright(p)) then
            recvs_todo = recvs_todo -1
         endif
      enddo
      allocate(col(procs))   ! to remember which was the last column
                             ! received for submatrix(p)
      col   = 0
      recvs = 0
      ! copy submatrix on root:
      i  = 1
      ii = is(rank+1)
      m  = mlocal
      j  = 1
      jj = js(rank+1)
      n  = nlocal
      ! take care of borders:
      if ( .not. isleftl ) then
         j  = j + 1
         jj = jj + 1
      endif
      if ( .not. isrightl ) then
         n = n - 1
      endif
      if ( .not. istopl ) then
         i  = i + 1
         ii = ii + 1
         m  = m - 1
      endif
      if ( .not. isbotl ) then
         m = m - 1
      endif
      a(ii:ii+m-1,jj:jj+n-1) = b(i:i+m-1,j:j+n-1)

      do recvs=1,recvs_todo   ! loop over the total number of receives needed
         ! wait until anybody did send a column to root:

         call MPI_Probe(MPI_ANY_SOURCE, tag, comm, mpistatus, ierror)

         ! find out who was the sender:

         sender = mpistatus(MPI_SOURCE)

         ! update col, and compute i and j: the start location of
         ! the column to be received

         p      = sender + 1
         col(p) = col(p) + 1
         i      = is(p)
         j      = col(p)+js(p)-1
         m      = lm(p)

         ! perform the actual receive:
         ! and correct for borders

         if ( .not. isleft(p)) then
            j = j + 1
         endif
         if ( .not. isright(p)) then
            ! nothing to correct here
         endif
         if ( .not. istop(p)) then
            i = i + 1
            m = m - 1
         endif
         if ( .not. isbot(p)) then
            m = m - 1
         endif

         call MPI_Recv(a(i:i+m-1,j),m,MPI_DOUBLE_PRECISION, &
                       sender,tag,comm,MPI_STATUS_IGNORE,ierror)
      enddo
      deallocate(col)
      ! deallocate(isleft, isright, istop, isbot)
   else   ! non-root code
      allocate(req(nlocal))   ! for holding the requests
      allocate(bb(size(b,1),size(b,2)))
      bb  = b                 ! want to be sure that the matrix to send is
                              ! contiguous, because isend's are used
      req = MPI_REQUEST_NULL

      jstart = 1
      jend   = nlocal
      i      = 1
      m      = mlocal

      ! take care of borders
      if ( .not. isleftl ) then
         jstart = jstart + 1
      endif
      if ( .not. isrightl ) then
         jend = jend - 1
      endif
      if ( .not. istopl ) then
         i = i + 1
         m = m - 1
      endif
      if ( .not. isbotl ) then
         m = m - 1
      endif

      do j = jstart, jend
         call MPI_Isend(bb(i,j), m, MPI_DOUBLE_PRECISION, root, tag, comm, req(j), ierror)
      enddo

      do l=1,nlocal   ! wait until all requests have been handled
         if (req(l) .ne. MPI_REQUEST_NULL) then
            call MPI_Wait(req(l),MPI_STATUS_IGNORE,ierror)
         endif
      enddo
      deallocate(req,bb)
   endif

end subroutine matrix_coll_recv_real8
subroutine matrix_coll_recvmat_real8(a,b,is,lm,js,ln,&
                                     isleft,isright,istop,isbot,root,comm)
   ! collect matrix on root process
   !
   ! parameters:
   ! real*8 a(:,:) (out): matrix to be collected.
   !                      This matrix has to be available, it is not
   !                      allocated by this subroutine.
   ! real*8 b(:,:) (in): local scattered matrix
   ! integer is,lm,js,ln(number of processes) (in): description of
   !   the location of the submatrices b in matrix a:
   !   Element b(1,1) coincides with a(is(rank+1),js(rank+1)), where rank is
   !   the rank of the process (0..procs-1)
   !   The length of the 1st dimension of b is given by lm(rank+1)
   !   The length of the 2nd dimension of b is given by ln(rank+1)
   ! logical isleft, isright, istop, isbot (in): is this submatrix at the
   !   left, right, top or bottom of the general matrix.
   ! integer root (in): the rank of the process where a is available
   ! integer comm (in): the MPI communicator to be used.
   !
   ! Note: a,is,js are used at the root process only
   !
   ! Method
   !
   ! root receives the submatrices as a whole, one by one. The local
   ! submatrix is copied.
   !
   ! NOTE: the matrices are considered to be bordered by columns
   !       left and right and rows top and bottom.
   !       The number of border columns/rows is nbord.
   !       The number of overlapping columns nover = 2*nbord.
   !       So, in general, only b(nbord+1:mb-nbord,nbord+1:nb-nbord) is
   !       sent to the master process.
   !       HOWEVER: if matrix b is
   !       on the left:   b(:,1:nbord)       is also sent
   !       on the right:  b(:,nb-nbord+1:nb) is also sent
   !       on the top:    b(1:nbord,:)       is also sent
   !       on the bottom: b(mb-nbord+1:mb,:) is also sent
   !       The master process receives a(1:ma,1:na).
   !       dimension of b: (1:mb,1:nb)
   !       dimension of a: (1:ma,1:na)
   !
   use mpi
   implicit none
   real*8, intent(out), dimension(:,:) :: a
   real*8, intent(in), dimension(:,:)  :: b
   integer, intent(in), dimension(:)   :: is, lm
   integer, intent(in), dimension(:)   :: js, ln
   logical, intent(in), dimension(:)   :: isleft, isright, istop, isbot
   integer, intent(in)                 :: root, comm

   integer :: i, rank, ierror, procs, p

   integer :: mlocal, nlocal

   !logical, allocatable, dimension(:) :: isleft, isright, istop, isbot

   integer, parameter :: tag = 2

   integer :: acolstart, acolend, arowstart, arowend
   integer :: bcolstart, bcolend, browstart, browend
   integer :: mp, np, k, ia, ja
   integer, parameter :: tag1 = 123
   integer, parameter :: tag2 = 124
   logical :: isleftl, isrightl, istopl, isbotl

   ! nover is number of overlapping rows/columns of submatrices.
   ! nbord is number of borders.

   integer, parameter :: nbord = 2
   integer, parameter :: nover = 2*nbord

   call MPI_Comm_rank(comm, rank, ierror)
   call MPI_Comm_size(comm, procs, ierror)

   nlocal   = ln(rank+1)
   mlocal   = lm(rank+1)
   isleftl  = isleft(rank+1)
   isrightl = isright(rank+1)
   istopl   = istop(rank+1)
   isbotl   = isbot(rank+1)

   ! define start-end cols and rows for the general case:
   ! computations on arow.. and acol.. are only meaningful at root
   browstart = nbord + 1
   browend   = mlocal - nbord
   bcolstart = nbord + 1
   bcolend   = nlocal - nbord
   arowstart = is(root + 1) + nbord
   arowend   = is(root + 1) + mlocal - 1 - nbord
   acolstart = js(root + 1) + nbord
   acolend   = js(root + 1) + nlocal - 1 - nbord

   ! correct for the border cases:

   if (isleftl) then
      bcolstart = bcolstart - nbord
      acolstart = acolstart - nbord
   endif
   if (isrightl) then
      bcolend = bcolend + nbord
      acolend = acolend + nbord
   endif
   if (istopl) then
      browstart = browstart - nbord
      arowstart = arowstart - nbord
   endif
   if (isbotl) then
      browend = browend + nbord
      arowend = arowend + nbord
   endif

   if (rank .eq. root) then

      ! copy local submatrix

      ! copy the local submatrix b into a:
      a(arowstart:arowend,acolstart:acolend) = &
         b(browstart:browend,bcolstart:bcolend)

      ! now, collect from each non-root process
      ! its matrix b, and put it in the proper place in a

      ploop: do p = 1, procs
         if ( p - 1 .eq. root ) then
            cycle ploop
         endif
         ! how many elements to expect from process p-1 (= mp*np)
         ! and where to put this in a (ia:ia+mp-1, ja:ja+np-1)
         !

         mp = lm(p) - nover
         np = ln(p) - nover
         ia = is(p) + nbord
         ja = js(p) + nbord

         ! correction if b is at a border:

         if (isleft(p)) then
            ja = ja - nbord
            np = np + nbord
         endif
         if (isright(p)) then
            np = np + nbord
         endif
         if (istop(p)) then
            ia = ia - nbord
            mp = mp + nbord
         endif
         if (isbot(p)) then
            mp = mp + nbord
         endif

         ! to avoid deadlocks, master first sends a
         ! short message to p-1. After receiving that message,
         ! p-1 will start sending its submatrix

         call MPI_Send(mp*np, 1, MPI_INTEGER, p - 1, tag1, comm, ierror)

         ! receive a submatrix from process p-1

         call MPI_Recv(a(ia:ia+mp-1,ja:ja+np-1), mp*np, MPI_DOUBLE_PRECISION, &
                       p - 1, tag2, comm, MPI_STATUS_IGNORE, ierror)

      enddo ploop   ! loop over non-root processes
   else   ! above is the root code, now the non-root code
      ! receive the short message from root, so I can start sending b
      call MPI_Recv(i, 1, MPI_INTEGER,&
                    root, tag1, comm, MPI_STATUS_IGNORE, ierror)

      ! which part of b to send: b(browstart:browend,bcolstart:bcolend)

      ! sanity check
      k = (browend - browstart + 1)*(bcolend - bcolstart + 1)
      if (i .ne. k) then
         write(*,*)'process ',rank,' has a problem in matrix_coll_recvmat_real8: '
         write(*,*)'number of words to send:',k
         write(*,*)'root expects: ',i
         do i=1,procs
            write(*,*)i-1,isleft(i),isright(i),istop(i),isbot(i)
         enddo
         write(*,*)isleftl,isrightl,istopl,isbotl
         call MPI_Abort(MPI_COMM_WORLD,1,ierror)
      endif

      call MPI_Send(b(browstart:browend,bcolstart:bcolend), k, &
                    MPI_DOUBLE_PRECISION, root, tag2, comm, ierror)
   endif

end subroutine matrix_coll_recvmat_real8
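! Worked example of the nbord/nover bookkeeping above (illustrative
! numbers): with nbord = 2, a submatrix with lm(p) = 10, ln(p) = 8 that is
! not at any border sends b(3:8,3:6), i.e. mp*np = (10-4)*(8-4) = 24
! elements, and root stores them at a(is(p)+2:is(p)+7, js(p)+2:js(p)+5).
! If the same submatrix were on the left (isleft), the first two columns
! would be included as well: b(3:8,1:6), giving 6*6 = 36 elements.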
subroutine matrix_coll_recvmat_int(a,b,is,lm,js,ln,&
                                   isleft,isright,istop,isbot,root,comm)
   ! This is a copy of matrix_coll_recvmat_real8 for integers
   use mpi
   implicit none
   integer, intent(out), dimension(:,:) :: a
   integer, intent(in), dimension(:,:)  :: b
   integer, intent(in), dimension(:)    :: is, lm
   integer, intent(in), dimension(:)    :: js, ln
   logical, intent(in), dimension(:)    :: isleft, isright, istop, isbot
   integer, intent(in)                  :: root, comm

   integer :: i, rank, ierror, procs, p

   integer :: mlocal, nlocal

   !logical, allocatable, dimension(:) :: isleft, isright, istop, isbot

   integer, parameter :: tag = 2

   integer :: acolstart, acolend, arowstart, arowend
   integer :: bcolstart, bcolend, browstart, browend
   integer :: mp, np, k, ia, ja
   integer, parameter :: tag1 = 123
   integer, parameter :: tag2 = 124
   logical :: isleftl, isrightl, istopl, isbotl

   ! nover is number of overlapping rows/columns of submatrices.
   ! nbord is number of borders.

   integer, parameter :: nover = 4
   integer, parameter :: nbord = nover/2

   call MPI_Comm_rank(comm, rank, ierror)
   call MPI_Comm_size(comm, procs, ierror)

   nlocal   = ln(rank+1)
   mlocal   = lm(rank+1)
   isleftl  = isleft(rank+1)
   isrightl = isright(rank+1)
   istopl   = istop(rank+1)
   isbotl   = isbot(rank+1)

   ! define start-end cols and rows for the general case:
   ! computations on arow.. and acol.. are only meaningful at root
   browstart = nbord + 1
   browend   = mlocal - nbord
   bcolstart = nbord + 1
   bcolend   = nlocal - nbord
   arowstart = is(root + 1) + nbord
   arowend   = is(root + 1) + mlocal - 1 - nbord
   acolstart = js(root + 1) + nbord
   acolend   = js(root + 1) + nlocal - 1 - nbord

   ! correct for the border cases:

   if (isleftl) then
      bcolstart = bcolstart - nbord
      acolstart = acolstart - nbord
   endif
   if (isrightl) then
      bcolend = bcolend + nbord
      acolend = acolend + nbord
   endif
   if (istopl) then
      browstart = browstart - nbord
      arowstart = arowstart - nbord
   endif
   if (isbotl) then
      browend = browend + nbord
      arowend = arowend + nbord
   endif

   if (rank .eq. root) then

      ! copy local submatrix

      ! copy the local submatrix b into a:

      a(arowstart:arowend,acolstart:acolend) = &
         b(browstart:browend,bcolstart:bcolend)

      ! now, collect from each non-root process
      ! its matrix b, and put it in the proper place in a

      ploop: do p = 1, procs
         if ( p - 1 .eq. root ) then
            cycle ploop
         endif
         ! how many elements to expect from process p-1 (= mp*np)
         ! and where to put this in a (ia:ia+mp-1, ja:ja+np-1)
         !

         mp = lm(p) - nover
         np = ln(p) - nover
         ia = is(p) + nbord
         ja = js(p) + nbord

         ! correction if b is at a border:

         if (isleft(p)) then
            ja = ja - nbord
            np = np + nbord
         endif
         if (isright(p)) then
            np = np + nbord
         endif
         if (istop(p)) then
            ia = ia - nbord
            mp = mp + nbord
         endif
         if (isbot(p)) then
            mp = mp + nbord
         endif

         ! to avoid deadlocks, master first sends a
         ! short message to p-1. After receiving that message,
         ! p-1 will start sending its submatrix

         call MPI_Send(mp*np, 1, MPI_INTEGER, p - 1, tag1, comm, ierror)

         ! receive a submatrix from process p-1

         call MPI_Recv(a(ia:ia+mp-1,ja:ja+np-1), mp*np, MPI_INTEGER, &
                       p - 1, tag2, comm, MPI_STATUS_IGNORE, ierror)

      enddo ploop   ! loop over non-root processes
   else   ! above is the root code, now the non-root code
      ! receive the short message from root, so I can start sending b
      call MPI_Recv(i, 1, MPI_INTEGER,&
                    root, tag1, comm, MPI_STATUS_IGNORE, ierror)

      ! which part of b to send: b(browstart:browend,bcolstart:bcolend)

      ! sanity check
      k = (browend - browstart + 1)*(bcolend - bcolstart + 1)
      if (i .ne. k) then
         write(*,*)'process ',rank,' has a problem in matrix_coll_recvmat_int: '
         write(*,*)'number of words to send:',k
         write(*,*)'root expects: ',i
         do i=1,procs
            write(*,*)i-1,isleft(i),isright(i),istop(i),isbot(i)
         enddo
         write(*,*)isleftl,isrightl,istopl,isbotl
         call MPI_Abort(MPI_COMM_WORLD,1,ierror)
      endif

      call MPI_Send(b(browstart:browend,bcolstart:bcolend), k, &
                    MPI_INTEGER, root, tag2, comm, ierror)
   endif

end subroutine matrix_coll_recvmat_int
subroutine decomp(n, numprocs, myid, s, e)

   ! From the mpich distribution: MPE_Decomp1d
   !
   ! n        : integer, input (for example: number of tasks or
   !            dimension of an array)
   ! numprocs : integer, input (number of processes)
   ! myid     : integer, input (MPI id of this process)
   ! s        : integer, output
   ! e        : integer, output
   !
   ! The purpose of this subroutine is to get an as equal as possible
   ! division of the integers 1..n among numprocs intervals.
   ! A common use is to divide work among MPI processes.
   !
   ! example:
   !
   !   integer numprocs, ier, myid, n, s, e, i
   !   call MPI_Comm_size(MPI_COMM_WORLD, numprocs, ier)
   !   call MPI_Comm_rank(MPI_COMM_WORLD, myid, ier)
   !   n = 1000
   !   call decomp(n, numprocs, myid, s, e)
   !   do i = s,e
   !      ...
   !   enddo
   !
   integer, intent(in) :: n, numprocs, myid
   integer, intent(out):: s, e
   integer nlocal
   integer deficit

   nlocal  = n / numprocs
   s       = myid * nlocal + 1
   deficit = mod(n,numprocs)
   s       = s + min(myid,deficit)
   if (myid .lt. deficit) then
      nlocal = nlocal + 1
   endif
   e = s + nlocal - 1
   if (e .gt. n .or. myid .eq. numprocs-1) then
      e = n
   endif

end subroutine decomp
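! Worked example (illustrative numbers): decomp(10, 3, myid, s, e) divides
! 1..10 over 3 processes as
!   myid = 0: s = 1, e = 4    (4 elements)
!   myid = 1: s = 5, e = 7    (3 elements)
!   myid = 2: s = 8, e = 10   (3 elements)
! i.e. the remainder mod(n,numprocs) is given to the lowest ranks.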
subroutine det_submatrices(ma,na,mp,np,is,lm,js,ln,isleft,isright,istop,isbot)
   !
   ! determine a division of a ma*na matrix on a mp*np processor grid
   ! ma: integer(in): 1st dimension of matrix to divide
   ! na: integer(in): 2nd dimension of this matrix
   ! mp: integer(in): 1st dimension of processor grid
   ! np: integer(in): 2nd dimension of processor grid
   ! is(mp*np): integer(out): is(i) will be equal to the
   !            start row of the i-th submatrix
   ! lm(mp*np): integer(out): lm(i) will be equal to the
   !            first dimension of the i-th submatrix
   ! js(mp*np): integer(out): js(i) will be equal to the
   !            start column of the i-th submatrix
   ! ln(mp*np): integer(out): ln(i) will be equal to the
   !            2nd dimension of the i-th submatrix
   ! isleft(mp*np):  logical(out): isleft(i) is true if the i-th submatrix
   !                 shares the first column with the global matrix: a(:,1)
   ! isright(mp*np): logical(out): isright(i) is true if the i-th submatrix
   !                 shares the last column with the global matrix: a(:,na)
   ! istop(mp*np):   logical(out): istop(i) is true if the i-th submatrix
   !                 shares the first row with the global matrix: a(1,:)
   ! isbot(mp*np):   logical(out): isbot(i) is true if the i-th submatrix
   !                 shares the last row with the global matrix: a(ma,:)
   !
   ! The submatrices overlap:
   !
   !  1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5
   !  1 2 3 4 5 6 7 8 9           1 2 3 4 5 6 7 8 9 0 1
   !            1 2 3 4 5 6 7 8 9 0 1 2 3
   !
   !  + + + + + x x x x + + + + + x x x x + + + + + + +
   !  + + + + + x x x x + + + + + x x x x + + + + + + +
   !  + + + + + x x x x + + + + + x x x x + + + + + + +
   !  + + + + + x x x x + + + + + x x x x + + + + + + +
   !  + + + + + x x x x + + + + + x x x x + + + + + + +
   !  + + + + + x x x x + + + + + x x x x + + + + + + +
   !  + + + + + x x x x + + + + + x x x x + + + + + + +
   !
   ! This picture is for the overlap in the horizontal direction,
   ! the overlap is 4.
   !  + -> matrix element
   !  x -> overlapping matrix element
   ! The orders of the submatrices are 9, 13 and 11.
   ! The order of the global matrix is 25.
   !
   implicit none
   integer, intent(in)                :: ma,na,mp,np
   integer, dimension(:), intent(out) :: is,lm,js,ln
   logical, dimension(:), intent(out) :: isleft,isright,istop,isbot

   integer i,j,s,e,k
   integer, parameter :: nover = 4

   isleft  = .false.
   isright = .false.
   istop   = .false.
   isbot   = .false.

   do i = 1, mp
      call decomp(ma-nover, mp, i - 1, s, e)
      k = 0
      do j = 1, np
         if (j .eq. 1) then
            isleft(i+k) = .true.
         endif
         if (j .eq. np) then
            isright(i+k) = .true.
         endif
         is(i + k) = s
         lm(i + k) = e - s + nover + 1
         k = k + mp
      enddo
   enddo
   k = 0
   do j=1, np
      call decomp(na-nover, np, j - 1, s, e)
      do i = 1, mp
         if (i .eq. 1) then
            istop(i+k) = .true.
         endif
         if (i .eq. mp) then
            isbot(i+k) = .true.
         endif
         js(i + k) = s
         ln(i + k) = e - s + nover + 1
      enddo
      k = k + mp
   enddo

end subroutine det_submatrices
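! Worked example (illustrative numbers): det_submatrices(12, 9, 2, 2, ...)
! divides a 12x9 matrix over a 2x2 processor grid with overlap nover = 4:
!   is = (/1, 5, 1, 5/),  lm = (/8, 8, 8, 8/)
!   js = (/1, 1, 4, 4/),  ln = (/7, 7, 6, 6/)
!   isleft = (T,T,F,F), isright = (F,F,T,T), istop = (T,F,T,F), isbot = (F,T,F,T)
! so submatrix 1 covers a(1:8,1:7), submatrix 4 covers a(5:12,4:9), and
! neighbouring submatrices overlap in 4 rows/columns.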
subroutine shift_borders_matrix_real8(a,left,right,top,bot,comm)
   !
   ! shift borders to and from neighbours.
   !
   use mpi
   implicit none
   real*8, dimension(:,:), intent(inout) :: a
   integer, intent(in)                   :: left,right,top,bot,comm

   integer, parameter :: datatype = MPI_DOUBLE_PRECISION
   integer ierror,ma,na

   ma = ubound(a,1)
   na = ubound(a,2)

   ! send to left, receive from right
   call MPI_Sendrecv(a(2:ma-1,2), ma-2, datatype,    &
                     left, 1,                        &
                     a(2:ma-1,na), ma-2, datatype,   &
                     right, 1,                       &
                     comm, MPI_STATUS_IGNORE, ierror)
   ! send to right, receive from left
   call MPI_Sendrecv(a(2:ma-1,na-1), ma-2, datatype, &
                     right, 2,                       &
                     a(2:ma-1,1), ma-2, datatype,    &
                     left, 2,                        &
                     comm, MPI_STATUS_IGNORE, ierror)

   ! send to bot, receive from top
   call MPI_Sendrecv(a(ma-1,:), na, datatype,        &
                     bot, 3,                         &
                     a(1,:), na, datatype,           &
                     top, 3,                         &
                     comm, MPI_STATUS_IGNORE, ierror)
   ! send to top, receive from bot
   call MPI_Sendrecv(a(2,:), na, datatype,           &
                     top, 4,                         &
                     a(ma,:), na, datatype,          &
                     bot, 4,                         &
                     comm, MPI_STATUS_IGNORE, ierror)

end subroutine shift_borders_matrix_real8
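! Example (illustrative sketch): a typical halo exchange with the generic
! interface shift_borders. The neighbour ranks left, right, top and bot are
! assumed to have been determined by the caller (for instance from the
! processor grid); a missing neighbour would typically be MPI_PROC_NULL,
! which turns the corresponding send/receive into a no-op.
!
!   ! call shift_borders(b, left, right, top, bot, comm)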
#endif
end module general_mpi_module