#ifdef HAVE_CONFIG_H #include "config.h" #endif #undef DEBUGPART module schedule_mod use metagraph_mod, only : MetaEdge_t use kinds, only : int_kind, iulog implicit none private type, public :: Cycle_t sequence integer :: tag integer :: dest integer :: source integer :: lengthP integer :: lengthP_ghost integer :: type integer :: ptrP integer :: ptrP_ghost type (MetaEdge_t),pointer :: edge end type Cycle_t type, public :: Schedule_t sequence integer :: ncycles integer :: nelemd integer :: placeholder ! total integer count should be even integer :: nSendCycles integer :: nRecvCycles integer :: padding integer,pointer :: Local2Global(:) type (Cycle_t), pointer :: Cycle(:) type (Cycle_t), pointer :: SendCycle(:) type (Cycle_t), pointer :: RecvCycle(:) type (Cycle_t), pointer :: MoveCycle(:) end type Schedule_t type, public :: GraphStats_t sequence integer(kind=int_kind) :: offnode integer(kind=int_kind) :: onnode integer(kind=int_kind) :: LB integer(kind=int_kind) :: padding end type GraphStats_t type (Schedule_t), public, allocatable, target :: Schedule(:) type (Schedule_t), public, allocatable, target :: gSchedule(:) type (Schedule_t), public, allocatable, target :: sSchedule(:) integer,public,parameter :: BNDRY_EXCHANGE_MESSAGE=10 integer,private,allocatable,target :: Global2Local(:) integer :: MinNelemd,MaxNelemd public :: genEdgeSched ! Setup the communication schedule for the edge based boundary exchange public :: PrintSchedule, PrintCycle public :: CheckSchedule public :: FindBufferSlot ! public :: MessageStats ! public :: PrimMessageStats contains !********************** GENCOMSCHED.F ****************************************** subroutine genEdgeSched(elem, PartNumber,LSchedule,MetaVertex) use element_mod, only : element_t use metagraph_mod, only : metavertex_t use dimensions_mod, only : nelem #ifdef _MPI use parallel_mod, only : nComPoints, iam, mpi_status_size, rrequest, srequest, & status, npackpoints #else use parallel_mod, only : nComPoints, iam #endif use parallel_mod, only : haltmp, iam implicit none type(element_t), intent(inout) :: elem(:) integer, intent(in) :: PartNumber type (schedule_t), intent(inout) :: LSchedule type (MetaVertex_t),intent(inout) :: MetaVertex integer :: lengthP,total_length,lengthp_ghost integer :: i,j,is,ir,ncycle integer :: il,ie,ig integer :: nelemd0 integer :: jmd integer :: inbr integer :: nSched integer,allocatable :: tmpP(:,:) integer,allocatable :: tmpP_ghost(:,:) integer :: nSend,nRecv,nedges integer :: icycle integer :: iSched logical, parameter :: VerbosePrint=.FALSE. logical, parameter :: Debug=.false. integer :: ierr nSched=SIZE(schedule) ! ================================================ ! allocate some arrays for the call to MPI_gatherv ! ================================================ MinNelemd = nelem MaxNelemd = 0 ! ===================================================== ! It looks like this is only used in this routine... ! so no need to put it in the schedule data-structure ! ===================================================== allocate(Global2Local(nelem)) if(Debug) write(iulog,*)'genEdgeSched: point #1' iSched = PartNumber nelemd0 = MetaVertex%nmembers if(VerbosePrint) then if(iam .eq. 1) write(iulog,*)'genEdgeSched: Part # ',i,' has ',nelemd0, ' elements ' endif MaxNelemd = AMAX0(MaxNelemd,nelemd0) MinNelemd = AMIN0(MinNelemd,nelemd0) if(Debug) write(iulog,*)'genEdgeSched: point #2' if(Debug) write(iulog,*)'genEdgeSched: point #3' LSchedule%ncycles = MetaVertex%nedges LSchedule%nelemd = nelemd0 if(Debug) write(iulog,*)'genEdgeSched: point #4' ! Note the minus one is for the internal node nedges = MetaVertex%nedges if(2*(nedges/2) .eq. nedges) then nedges = nedges/2 else nedges = (nedges-1)/2 endif LSchedule%nSendCycles = nedges LSchedule%nRecvCycles = nedges if(Debug) write(iulog,*)'genEdgeSched: point #5' ! Temporary array to calculate the Buffer Slot allocate(tmpP(2,nedges+1)) allocate(tmpP_ghost(2,nedges+1)) tmpP(1,:) = -1 tmpP(2,:) = 0 tmpP_ghost(1,:) = -1 tmpP_ghost(2,:) = 0 ! Allocate all the cycle structures allocate(LSchedule%SendCycle(nedges)) allocate(LSchedule%RecvCycle(nedges)) allocate(LSchedule%MoveCycle(1)) if(Debug) write(iulog,*)'genEdgeSched: point #6' !================================================================== ! Allocate and initalized the index translation arrays Global2Local = -1 allocate(LSchedule%Local2Global(nelemd0)) if(Debug) write(iulog,*)'genEdgeSched: point #7' do il=1,nelemd0 ig = MetaVertex%members(il)%number Global2Local(ig)=il LSchedule%Local2Global(il)=ig #ifndef _PREDICT elem(il)%desc%putmapP=-1 elem(il)%desc%getmapP=-1 elem(il)%desc%putmapP_ghost=-1 elem(il)%desc%getmapP_ghost=-1 elem(il)%desc%reverse = .FALSE. #endif enddo !================================================================== if(Debug) write(iulog,*)'genEdgeSched: point #8' total_length = 0 ncycle = LSchedule%ncycles is=1 ir=1 do j=1,ncycle lengthP = MetaVertex%edges(j)%wgtP lengthP_ghost = MetaVertex%edges(j)%wgtP_ghost if((MetaVertex%edges(j)%HeadVertex == PartNumber) .AND. & (MetaVertex%edges(j)%TailVertex == PartNumber)) then inbr = PartNumber if(Debug) write(iulog,*)'genEdgeSched: point #9', iam LSchedule%MoveCycle%ptrP = FindBufferSlot(inbr,lengthP,tmpP) LSchedule%MoveCycle%ptrP_ghost = FindBufferSlot(inbr,lengthP_ghost,tmpP_ghost) call SetCycle(elem, LSchedule,LSchedule%MoveCycle(1),MetaVertex%edges(j)) if(Debug) write(iulog,*)'genEdgeSched: point #10',iam else if (MetaVertex%edges(j)%TailVertex == PartNumber) then inbr = MetaVertex%edges(j)%HeadVertex if(Debug) write(iulog,*)'genEdgeSched: point #11', iam LSchedule%SendCycle(is)%ptrP = FindBufferSlot(inbr,lengthP,tmpP) LSchedule%SendCycle(is)%ptrP_ghost= FindBufferSlot(inbr,lengthP_ghost,tmpP_ghost) call SetCycle(elem, LSchedule,LSchedule%SendCycle(is),MetaVertex%edges(j)) if(Debug) write(iulog,*)'genEdgeSched: point #12',iam is = is+1 else if (MetaVertex%edges(j)%HeadVertex == PartNumber) then inbr = MetaVertex%edges(j)%TailVertex if(Debug) write(iulog,*)'genEdgeSched: point #13',iam LSchedule%RecvCycle(ir)%ptrP = FindBufferSlot(inbr,lengthP,tmpP) LSchedule%RecvCycle(ir)%ptrP_ghost= FindBufferSlot(inbr,lengthP_ghost,tmpP_ghost) call SetCycle(elem, LSchedule,LSchedule%RecvCycle(ir),MetaVertex%edges(j)) if(Debug) write(iulog,*)'genEdgeSched: point #14',iam ir = ir+1 endif enddo deallocate(tmpP) deallocate(tmpP_ghost) do ie=1,nelemd0 elem(ie)%vertex = MetaVertex%members(ie) ig = MetaVertex%members(ie)%number elem(ie)%GlobalId = ig elem(ie)%LocalId = ie #if 0 call LLInsertEdge(eroot,ig,jmd) !DBG write(iulog,*)'After call to LLInsertEdge in schedule: ie,ig ',ie,ig,jmd #endif enddo deallocate(Global2Local) !S-JMD call CheckSchedule() #ifdef _MPI !================================================================ ! Allocate a couple of structures for bndry_exchange ! done here to remove it from the critical path !================================================================ nComPoints=0 nSend = nedges nRecv = nedges allocate(Rrequest(nRecv)) allocate(Srequest(nSend)) allocate(status(MPI_STATUS_SIZE,nRecv)) !=============================================================== ! Number of communication points ... to be used later to ! setup the size of the communication buffer for MPI_Ibsend !=============================================================== do icycle=1,nSend nComPoints = nComPoints + LSchedule%SendCycle(icycle)%lengthP enddo nPackPoints = nComPoints + LSchedule%MoveCycle(1)%lengthP ! nbuf = 4*(nv+1)*nelemd*8*4*nlev ! write(iulog,*)'before call to allocate(combuffer) ',nbuf ! allocate(combuffer(nbuf)) ! write(iulog,*)'IAM: ',iSched,'Before call to MPI_Buffer_Attach ' ! call MPI_Buffer_Attach(combuffer,nbuf,ierr) ! write(iulog,*)'IAM: ',iSched,'After call to MPI_Buffer_Attach ' #endif #ifdef DEBUGPART call haltmp("genEdgeSched: Just testing the partitioning algorithms") #endif end subroutine genEdgeSched #ifdef DOTHIS subroutine MessageStats(nlyr) use kinds, only : real_kind, log_kind use dimensions_mod, only : nmpi_per_node, nnodes, npart, nlev, np, nelem use perfmodel_mod, only : perf_t, commtime_t , setptopnetworkparams, & ! _EXTERNAL setserialparamsexplicit, setserialparamsimplicit use control_mod, only : integration !----------------- implicit none integer,intent(in) :: nlyr ! number of 2D layers in the communication integer :: icycle,ip real(kind=real_kind) :: lb_nelemd,lb_volume integer :: length,nSend,nRecv integer :: i real(kind=real_kind), allocatable :: Time_total(:),Time_calc(:), Time_comm(:) real(kind=real_kind), allocatable :: Time_calc1(:),Time_calc2(:) type (commtime_t), allocatable :: bndry1(:),bndry2(:),bndry3(:),bndryf(:) real(kind=real_kind) :: Time_comm1,Time_commf, & Time_comm2,Time_comm3 integer(kind=int_kind) :: bytes_per_point real(kind=real_kind) :: time_per_elem real(kind=real_kind) :: time_per_iter real(kind=real_kind) :: latency_tmp,bandwidth_tmp real(kind=real_kind) :: avg_cg_iters type (commtime_t) :: offnode,onnode type (perf_t) :: tcv type (GraphStats_t), allocatable :: count(:) real(kind=real_kind) :: Time_serial,Time_parallel,Speedup integer(kind=int_kind),allocatable :: offnode_count(:),onnode_count(:) integer(kind=int_kind),allocatable :: LB_count(:) integer(kind=int_kind) :: edgecut_offnode,edgecut_onnode,edgecut_total logical(kind=log_kind) :: FoundNetwork,FoundMachine character(len=80) :: networkname,machinename integer(kind=int_kind) :: indx_min,indx_comm(1),indx_calc(1) integer(kind=8) :: imin_volume, & imax_volume integer(kind=int_kind),parameter :: configuration = 5 integer :: node1,node2,nbr,in integer :: nelemd0 logical, parameter :: Debug = .FALSE. logical, parameter :: PredictPerformance = .TRUE. if(PredictPerformance) then networkname="protoBGL" call SetPtoPNetworkParams(offnode,onnode,networkname,FoundNetwork) write(iulog,*)'MessageStats: After SetSerialParamsExplicit FoundNetwork: ',FoundNetwork machinename="protoBGL" !JMD integration = "explicit" write(iulog,*)'MessageStats: integration: ',integration if(integration == "explicit") then call SetSerialParamsExplicit(time_per_elem,np,nlev,machinename,FoundMachine) write(iulog,*)'MessageStats: After SetSerialParamsExplicit FoundMachine: ',FoundMachine else if(integration == "semi_imp") then call SetSerialParamsImplicit(time_per_elem,time_per_iter,avg_cg_iters, & np,nelem,nlev,machinename,FoundMachine) endif endif allocate(Time_total(npart)) allocate(Time_calc(npart)) allocate(Time_comm(npart)) allocate(Time_calc1(npart)) allocate(Time_calc2(npart)) allocate(bndry1(npart)) allocate(bndry2(npart)) allocate(bndry3(npart)) allocate(bndryf(npart)) write(iulog,*)'MessageStats: nlev and npart nnodes nmpi_per_node are: ',nlev,npart,nnodes,nmpi_per_node if(npart .gt. 1) then allocate(count(nnodes)) call foo(schedule,count) !---------------------------------------------------- ! Call fooCalc for each boundary exchange !---------------------------------------------------- write(iulog,*)'MessageStats: ' write(iulog,*)'MessageStats: Boundary Exchange #1 ' bytes_per_point = (3*nlev)*8 call fooCalc(schedule,bytes_per_point,offnode,onnode,count,bndry1) write(iulog,*)'MessageStats: ' write(iulog,*)'MessageStats: Filter Boundary Exchange ' bytes_per_point = (2*nlev)*8 call fooCalc(schedule,bytes_per_point,offnode,onnode,count,bndryf) if(integration == "semi_imp") then write(iulog,*)'MessageStats: ' write(iulog,*)'MessageStats: Boundary Exchange #2 ' bytes_per_point = (2*nlev)*8 call fooCalc(schedule,bytes_per_point,offnode,onnode,count,bndry2) write(iulog,*)'MessageStats: ' write(iulog,*)'MessageStats: Helmholtz Boundary Exchange ' bytes_per_point = (2*nlev)*8 call fooCalc(schedule,bytes_per_point,offnode,onnode,count,bndry3) endif #ifdef _PREDICT if(PredictPerformance) then !------------------------------------------------------ ! Calculate information about execution time prediction !------------------------------------------------------ if(integration == "explicit") then do ip=1,npart nelemd0 = Schedule(ip)%nelemd Time_calc(ip) = time_per_elem*nelemd0 if(filter_freq > 0) then Time_commf = (bndryf(ip)%latency + bndryf(ip)%bandwidth)/real(filter_freq,kind=real_kind) else Time_commf = 0 endif Time_comm(ip) = Time_commf + bndry1(ip)%latency + bndry1(ip)%bandwidth enddo else if(integration == "semi_imp") then do ip=1,npart nelemd0 = Schedule(ip)%nelemd Time_calc1(ip) = time_per_elem*nelemd0 Time_calc2(ip) = time_per_iter*nelemd0*avg_cg_iters Time_calc(ip) = Time_calc1(ip) + Time_calc2(ip) if(filter_freq > 0) then Time_commf = (bndryf(ip)%latency + bndryf(ip)%bandwidth)/real(filter_freq,kind=real_kind) else Time_commf = 0 endif Time_comm(ip) = Time_commf & + bndry1(ip)%latency + bndry1(ip)%bandwidth & + bndry2(ip)%latency + bndry2(ip)%bandwidth & + avg_cg_iters*(bndry3(ip)%latency + bndry3(ip)%bandwidth) enddo endif !----------------------------------------------------------- ! Print out information about the execution time prediction !----------------------------------------------------------- indx_calc = maxloc(Time_Calc) indx_comm = maxloc(Time_Comm) Time_parallel = Time_Calc(indx_calc(1)) + Time_comm(indx_comm(1)) write(iulog,*)'MessageStats: ' write(iulog,*)'MessageStats: Execution Time Prediction ' if(FoundMachine) write(iulog,*)'MessageStats: For Machine: ',TRIM(machinename) if(FoundNetwork) write(iulog,*)'MessageStats: With Network: ',TRIM(networkname) if(integration == "explicit" ) then Time_serial = real(nelem,kind=real_kind)*time_per_elem Speedup=Time_serial/Time_parallel write(iulog,201) Time_serial,1.0D-6*Time_serial*(secpday/tstep) !--------------------------------------- ! First Boundary exchange in advance !--------------------------------------- if(FoundNetwork) write(iulog,65) bndry1(indx_comm(1))%latency + bndry1(indx_comm(1))%bandwidth, & bndry1(indx_comm(1))%latency,bndry1(indx_comm(1))%bandwidth !--------------------------------------- ! Filter Boundary exchange in advance !--------------------------------------- if(filter_freq > 0) then Time_commf = (bndryf(indx_comm(1))%latency & + bndryf(indx_comm(1))%bandwidth)/real(filter_freq,kind=real_kind) if(FoundNetwork) write(iulog,68) Time_commf, bndryf(indx_comm(1))%latency,bndryf(indx_comm(1))%bandwidth endif !--------------------------------------- ! Total Communication Cost !--------------------------------------- if(FoundMachine .and. FoundNetwork) & write(iulog,55) Time_Parallel,Time_calc(indx_calc(1)), Time_comm(indx_comm(1)) else if(integration == "semi_imp") then Time_serial = real(nelem,kind=real_kind)*(time_per_elem + avg_cg_iters*time_per_iter) Speedup = Time_serial/Time_parallel write(iulog,202) avg_cg_iters write(iulog,201) Time_serial,1.0D-6*Time_serial*(secpday/tstep) write(iulog,35) MAXVAL(Time_calc1) write(iulog,45) MAXVAL(Time_calc2) !--------------------------------------- ! Filter Boundary exchange in advance_si !--------------------------------------- if(filter_freq > 0) then Time_commf = (bndryf(indx_comm(1))%latency & + bndryf(indx_comm(1))%bandwidth)/real(filter_freq,kind=real_kind) if(FoundNetwork) write(iulog,68) Time_commf, bndryf(indx_comm(1))%latency,bndryf(indx_comm(1))%bandwidth endif !--------------------------------------- ! First Boundary exchange in advance_si !--------------------------------------- Time_comm1 = bndry1(indx_comm(1))%latency + bndry1(indx_comm(1))%bandwidth if(FoundNetwork) write(iulog,65) Time_comm1, bndry1(indx_comm(1))%latency,bndry1(indx_comm(1))%bandwidth !--------------------------------------- ! Second Boundary exchange in advance_si !--------------------------------------- Time_comm2 = bndry2(indx_comm(1))%latency + bndry2(indx_comm(1))%bandwidth if(FoundNetwork) write(iulog,66) Time_comm2, bndry2(indx_comm(1))%latency,bndry2(indx_comm(1))%bandwidth !--------------------------------------- ! Boundary exchange in solver_mod !--------------------------------------- Time_comm3 = bndry3(indx_comm(1))%latency + bndry3(indx_comm(1))%bandwidth if(FoundNetwork) then write(iulog,67) avg_cg_iters*Time_comm3, avg_cg_iters*bndry3(indx_comm(1))%latency, & avg_cg_iters*bndry3(indx_comm(1))%bandwidth endif !--------------------------------------- ! Total Communication Cost !--------------------------------------- if(FoundMachine .and. FoundNetwork) & write(iulog,55) Time_Parallel,Time_calc(indx_calc(1)), & Time_comm1 + Time_commf + Time_comm2 + avg_cg_iters*Time_comm3 endif if(FoundMachine .and. FoundNetwork) write(iulog,95) Speedup endif call haltmp("MessageStats: Just testing the partitioning algorithms") #endif endif 110 format(1x,A,I10,I10,f12.2,f12.2) 100 format(1x,A,I8,I8,f10.2,f8.5) 35 format(" MessageStats: Predicted SI STEP Time (usec/step) ",f12.1) 45 format(" MessageStats: Predicted PCG SOLVER Time (usec/step) ",f12.1) 55 format(" MessageStats: Predicted TOTAL Time (usec/step) total:= ",f12.1," calc:= ",f12.1," comm:= ",f12.1) 65 format(" MessageStats: Predicted BNDRY #1 Time (usec/step) Total:=",f12.1," latency:= ",f12.1," band:= ",f12.1) 165 format(" MessageStats: Predicted BNDRY #1 ",i5," Time (usec/step) Total:=",f12.1," latency:= ",f12.1," band:= ",f12.1) 66 format(" MessageStats: Predicted BNDRY #2 Time (usec/step) Total:=",f12.1," latency:= ",f12.1," band:= ",f12.1) 67 format(" MessageStats: Predicted HELM BNDRY Time (usec/step) Total:=",f12.1," latency:= ",f12.1," band:= ",f12.1) 68 format(" MessageStats: Predicted FLTR BNDRY Time (usec/step) Total:=",f12.1," latency:= ",f12.1," band:= ",f12.1) 75 format(" MessageStats: Edgecut total: ",i10," onnode: ",i10," offnode: ",i10) 95 format(" MessageStats: Speedup: ",f10.1) 201 format(" MessageStats: Serial Execution Time (usec/step)",f12.1," (sec/day) ",f12.1) 202 format(" MessageStats: Average CG iterations: ",f6.3) 120 format(A,I8,I8,f8.5) end subroutine MessageStats subroutine PrimMessageStats(nlyr) use kinds, only : real_kind, log_kind use perfmodel_mod, only : perf_t, commtime_t , setptopnetworkparams, & ! _EXTERNAL setserialparamsprimexplicit use dimensions_mod, only : nmpi_per_node, nnodes, npart, nlev, np, nelem, nelemd use control_mod, only : integration !----------------- implicit none integer,intent(in) :: nlyr ! number of 2D layers in the communication integer :: icycle,ip real(kind=real_kind) :: lb_nelemd,lb_volume integer :: length,nSend,nRecv integer :: i real(kind=real_kind), allocatable :: Time_total(:),Time_calc(:), Time_comm(:) real(kind=real_kind), allocatable :: Time_calc1(:),Time_calc2(:) type (commtime_t), allocatable :: bndry1(:),bndry2(:),bndry3(:),bndryf(:) real(kind=real_kind) :: Time_comm1,Time_commf, & Time_comm2,Time_comm3 integer(kind=int_kind) :: bytes_per_point real(kind=real_kind) :: time_per_elem real(kind=real_kind) :: time_per_iter real(kind=real_kind) :: latency_tmp,bandwidth_tmp real(kind=real_kind) :: avg_cg_iters type (commtime_t) :: offnode,onnode type (perf_t) :: tcv type (GraphStats_t), allocatable :: count(:) real(kind=real_kind) :: Time_serial,Time_parallel,Speedup integer(kind=int_kind),allocatable :: offnode_count(:),onnode_count(:) integer(kind=int_kind),allocatable :: LB_count(:) integer(kind=int_kind) :: edgecut_offnode,edgecut_onnode,edgecut_total logical(kind=log_kind) :: FoundNetwork,FoundMachine character(len=80) :: networkname,machinename integer(kind=int_kind) :: indx_min,indx_comm(1),indx_calc(1) integer(kind=8) :: imin_volume, & imax_volume integer(kind=int_kind),parameter :: configuration = 5 integer :: node1,node2,nbr,in integer :: nelemd0 logical, parameter :: Debug = .FALSE. logical, parameter :: PredictPerformance = .TRUE. if(PredictPerformance) then networkname="protoBGL" call SetPtoPNetworkParams(offnode,onnode,networkname,FoundNetwork) write(iulog,*)'PrimMessageStats: After SetSerialParamsExplicit FoundNetwork: ',FoundNetwork machinename="protoBGL" integration = "explicit" write(iulog,*)'PrimMessageStats: integration: ',integration if(integration == "explicit") then call SetSerialParamsPrimExplicit(time_per_elem,np,nlev,machinename,FoundMachine) write(iulog,*)'PrimMessageStats: After SetSerialParamsPrimExplicit FoundMachine: ',FoundMachine endif endif allocate(Time_total(npart)) allocate(Time_calc(npart)) allocate(Time_comm(npart)) allocate(Time_calc1(npart)) allocate(Time_calc2(npart)) allocate(bndry1(npart)) allocate(bndry2(npart)) allocate(bndry3(npart)) allocate(bndryf(npart)) write(iulog,*)'PrimMessageStats: nlev and npart nnodes nmpi_per_node are: ',nlev,npart,nnodes,nmpi_per_node if(npart .gt. 1) then allocate(count(nnodes)) call foo(schedule,count) !---------------------------------------------------- ! Call fooCalc for each boundary exchange !---------------------------------------------------- write(iulog,*)'PrimMessageStats: ' write(iulog,*)'PrimMessageStats: Boundary Exchange #1 ' bytes_per_point = 3*8 call fooCalc(schedule,bytes_per_point,offnode,onnode,count,bndry1) write(iulog,*)'PrimMessageStats: ' write(iulog,*)'PrimMessageStats: Boundary Exchange #2 ' bytes_per_point = (3*nlev+1)*8 call fooCalc(schedule,bytes_per_point,offnode,onnode,count,bndry2) write(iulog,*)'PrimMessageStats: ' write(iulog,*)'PrimMessageStats: Filter Boundary Exchange ' bytes_per_point = (3*nlev+1)*8 call fooCalc(schedule,bytes_per_point,offnode,onnode,count,bndryf) #ifdef _PREDICT if(PredictPerformance) then !------------------------------------------------------ ! Calculate information about execution time prediction !------------------------------------------------------ if(integration == "explicit") then do ip=1,npart nelemd0 = Schedule(ip)%nelemd Time_calc(ip) = time_per_elem*nelemd0 if(filter_freq > 0) then Time_commf = (bndryf(ip)%latency + bndryf(ip)%bandwidth)/real(filter_freq,kind=real_kind) else Time_commf = 0 endif Time_comm(ip) = Time_commf + bndry1(ip)%latency + bndry1(ip)%bandwidth & + 4.*(bndry2(ip)%latency + bndry2(ip)%bandwidth) enddo endif !----------------------------------------------------------- ! Print out information about the execution time prediction !----------------------------------------------------------- indx_calc = maxloc(Time_Calc) indx_comm = maxloc(Time_Comm) Time_parallel = Time_Calc(indx_calc(1)) + Time_comm(indx_comm(1)) write(iulog,*)'PrimMessageStats: ' write(iulog,*)'PrimMessageStats: Execution Time Prediction ' if(FoundMachine) write(iulog,*)'PrimMessageStats: For Machine: ',TRIM(machinename) if(FoundNetwork) write(iulog,*)'PrimMessageStats: With Network: ',TRIM(networkname) if(integration == "explicit" ) then Time_serial = real(nelem,kind=real_kind)*time_per_elem Speedup=Time_serial/Time_parallel write(iulog,201) Time_serial,1.0D-6*Time_serial*(secpday/tstep) !--------------------------------------- ! Small Boundary exchange in prim_advance !--------------------------------------- if(FoundNetwork) write(iulog,65) bndry1(indx_comm(1))%latency + bndry1(indx_comm(1))%bandwidth, & bndry1(indx_comm(1))%latency,bndry1(indx_comm(1))%bandwidth !------------------------------------------- ! 4 Large Boundary exchanges in prim_advance !------------------------------------------- if(FoundNetwork) write(iulog,65) 4.*(bndry2(indx_comm(1))%latency + bndry2(indx_comm(1))%bandwidth), & 4.*bndry2(indx_comm(1))%latency,4.*bndry1(indx_comm(1))%bandwidth !--------------------------------------- ! Filter Boundary exchange in advance !--------------------------------------- if(filter_freq > 0) then Time_commf = (bndryf(indx_comm(1))%latency & + bndryf(indx_comm(1))%bandwidth)/real(filter_freq,kind=real_kind) if(FoundNetwork) write(iulog,68) Time_commf, bndryf(indx_comm(1))%latency,bndryf(indx_comm(1))%bandwidth endif !--------------------------------------- ! Total Communication Cost !--------------------------------------- if(FoundMachine .and. FoundNetwork) & write(iulog,55) Time_Parallel,Time_calc(indx_calc(1)), Time_comm(indx_comm(1)) else if(integration == "semi_imp") then write(iulog,*)'PrimMessageStats: not yet support for Semi-Implicit time integration' endif if(FoundMachine .and. FoundNetwork) write(iulog,95) Speedup endif call haltmp("PrimMessageStats: Just testing the partitioning algorithms") #endif endif 110 format(1x,A,I10,I10,f12.2,f12.2) 100 format(1x,A,I8,I8,f10.2,f8.5) 35 format(" PrimMessageStats: Predicted SI STEP Time (usec/step) ",f12.1) 45 format(" PrimMessageStats: Predicted PCG SOLVER Time (usec/step) ",f12.1) 55 format(" PrimMessageStats: Predicted TOTAL Time (usec/step) total:= ",f12.1," calc:= ",f12.1," comm:= ",f12.1) 65 format(" PrimMessageStats: Predicted BNDRY #1 Time (usec/step) Total:=",f12.1," latency:= ",f12.1," band:= ",f12.1) 165 format(" PrimMessageStats: Predicted BNDRY #1 ",i5," Time (usec/step) Total:=",f12.1," latency:= ",f12.1," band:= ",f12.1) 66 format(" PrimMessageStats: Predicted BNDRY #2 Time (usec/step) Total:=",f12.1," latency:= ",f12.1," band:= ",f12.1) 67 format(" PrimMessageStats: Predicted HELM BNDRY Time (usec/step) Total:=",f12.1," latency:= ",f12.1," band:= ",f12.1) 68 format(" PrimMessageStats: Predicted FLTR BNDRY Time (usec/step) Total:=",f12.1," latency:= ",f12.1," band:= ",f12.1) 75 format(" PrimMessageStats: Edgecut total: ",i10," onnode: ",i10," offnode: ",i10) 95 format(" PrimMessageStats: Speedup: ",f10.1) 201 format(" PrimMessageStats: Serial Execution Time (usec/step)",f12.1," (sec/day) ",f12.1) 202 format(" PrimMessageStats: Average CG iterations: ",f6.3) 120 format(A,I8,I8,f8.5) end subroutine PrimMessageStats #endif subroutine CheckSchedule() implicit none integer :: i,nSched,nbufferwords_1,nbufferwords_2 type (Schedule_t), pointer :: pSchedule nSched = SIZE(Schedule) do i=1,nSched pSchedule => Schedule(i) nbufferwords_1 = SUM(pSchedule%SendCycle(:)%lengthP) nbufferwords_2 = SUM(pSchedule%RecvCycle(:)%lengthP) if(nbufferwords_1 .ne. nbufferwords_2) then write (*,100) i,nbufferwords_1, nbufferwords_2 endif enddo 100 format('CheckSchedule: ERR IAM:',I3,' SIZEOF(SendBuffer):',I10,' != SIZEOF(RecvBuffer) :',I10) 110 format('CheckSchedule: ERR IAM:',I3,' LENGTH(SendBuffer):',I10,' != LENGTH(RecvBuffer) :',I10) end subroutine CheckSchedule subroutine PrintSchedule(Schedule) use gridgraph_mod, only : printgridedge implicit none type (Schedule_t),intent(in),target :: Schedule(:) type (Schedule_t), pointer :: pSchedule type (Cycle_t),pointer :: pCycle integer :: i,j,nSched nSched = SIZE(Schedule) write(*,*) '------NEW SCHEDULE FORMAT---------------------' do i=1,nSched pSchedule => Schedule(i) write(*,*) write(*,*) '----------------------------------------------' write(*,90) i,pSchedule%ncycles write(*,*) '----------------------------------------------' write(*,*) '-----------SEND-------------------------------' do j=1,pSchedule%nSendCycles pCycle => pSchedule%SendCycle(j) call PrintCycle(pCycle) call PrintGridEdge(pCycle%edge%members) enddo write(*,*) '-----------RECV-------------------------------' do j=1,pSchedule%nRecvCycles pCycle => pSchedule%RecvCycle(j) call PrintCycle(pCycle) call PrintGridEdge(pCycle%edge%members) enddo write(*,*) '-----------MOVE-------------------------------' pCycle => pSchedule%MoveCycle(1) call PrintCycle(pCycle) call PrintGridEdge(pCycle%edge%members) enddo 90 format('NODE # ',I2,2x,'NCYCLES ',I2) 97 format(10x,'EDGE #',I2,2x,'TYPE ',I1,2x,'G.EDGES',I4,2x,'WORDS ',I5,2x, & 'SRC ',I3,2x,'DEST ',I3,2x,'PTR ',I4) 100 format(15x,I4,5x,I3,1x,'(',I1,') --',I1,'--> ',I3,1x,'(',I1,')') end subroutine PrintSchedule #ifdef DOTHIS subroutine fooCalc(Schedule,bytes_per_point,offnode,onnode,count,Time) use kinds, only : real_kind use perfmodel_mod, only : commtime_t, perf_t ! _EXTERNAL use flops_mod, only : min_number,max_number, avg_number, & ! _EXTERNAL min_volume,max_volume, avg_volume, & min_message,max_message, avg_message, & LocalComVolume,TotalComVolume, tnsend use parallel_mod, only : iam, ncompoints use dimensions_mod, only : nmpi_per_node, npart ! results:: TnSend,{min,max}_{volume,number},Time_{total,calc,latency,bandwidth} ! inputs:: results + Schedule + bytes_per_point type (Schedule_t),intent(in) :: Schedule(:) integer(kind=int_kind),intent(in) :: bytes_per_point type (commtime_t),intent(in) :: offnode,onnode type (GraphStats_t), intent(in) :: count(:) type (commtime_t),intent(inout) :: Time(:) real(kind=real_kind) :: bandwidth_tmp,latency_tmp integer(kind=int_kind) :: nbr,node1,node2,nsend,ip,icycle,nrecv integer(kind=int_kind) :: length real(kind=real_kind) :: lb_volume integer(kind=int_kind) :: imax_volume,imin_volume type (perf_t) :: tcv integer :: nSched integer :: nelemd0 logical, parameter :: Debug = .FALSE. nSched=SIZE(Schedule) min_message = 1000000 min_volume = 1000000 max_message = 0 max_volume = 0 TotalComVolume = 0 TnSend = 0 tcv%onnode = 0.0 tcv%offnode = 0.0 do ip=1,nSched nSend = Schedule(ip)%nSendCycles nRecv = Schedule(ip)%nRecvCycles nelemd0 = Schedule(ip)%nelemd nComPoints = 0 Time(ip)%latency = 0 Time(ip)%bandwidth = 0 do icycle=1,nSend length = Schedule(ip)%SendCycle(icycle)%lengthV nComPoints = nComPoints + length min_message = MIN(min_message, bytes_per_point*length) max_message = MAX(max_message, bytes_per_point*length) enddo do icycle=1,nRecv length = Schedule(ip)%RecvCycle(icycle)%lengthV nbr = Schedule(ip)%RecvCycle(icycle)%source node1 = ((ip-1)/nmpi_per_node) + 1 node2 = ((nbr-1)/nmpi_per_node) + 1 if(node1 .eq. node2 ) then ! Message is set to an on-node processor tcv%onnode = tcv%onnode + length*bytes_per_point bandwidth_tmp = onnode%bandwidth latency_tmp = onnode%latency else ! Message is set to an off-node processor tcv%offnode = tcv%offnode + length*bytes_per_point bandwidth_tmp = real(count(node1)%offnode,kind=real_kind)*offnode%bandwidth latency_tmp = offnode%latency if(Debug) write(iulog,*)'fooCalc: node # ',node1,' offnode count is ',real(count(node1)%offnode,kind=real_kind) if(Debug) write(iulog,*)'fooCalc: node # ',node1,' offnode ={total,effective} ',offnode%bandwidth,bandwidth_tmp endif Time(ip)%latency = Time(ip)%latency + latency_tmp if(Debug) then write(iulog,*)'fooCalc: node # ',node1,' Contribution in time ',length*bytes_per_point, & length*bytes_per_point*bandwidth_tmp endif Time(ip)%bandwidth = Time(ip)%bandwidth + length*bytes_per_point*bandwidth_tmp enddo !JMD Note: I am removing the size of the communication buffer out of the computation LocalComVolume = bytes_per_point*nComPoints min_volume = MIN(min_volume,LocalComVolume) max_volume = MAX(max_volume,LocalComVolume) TnSend = TnSend + nSend TotalComVolume = TotalComVolume + LocalComVolume enddo !------------------------------------------------------ ! This Outputs information about each boundary exchange !------------------------------------------------------ avg_volume = dble(TotalComVolume)/dble(npart) lb_volume = (max_volume-avg_volume)/max_volume avg_message = dble(TotalComVolume)/dble(TnSend) imin_volume = NINT(min_volume) imax_volume = NINT(max_volume) if(iam .eq. 1) then write(iulog,*)'MessageStats: Total Message volume is (Mbytes): ',1.0e-6*TotalComVolume #if 1 write(iulog,*) 'MessageStats: imin_volume',imin_volume write(iulog,*) 'MessageStats: imax_volume',imax_volume write(iulog,*) 'MessageStats: avg_volume',avg_volume write(iulog,*) 'MessageStats: lb_volume',lb_volume #else !JMD For some reason on AIX this line breaks when the numbers are high write(iulog,110) 'MessageStats: Single process volume (bytes) {MIN,MAX,AVG,LB} =: ', & imin_volume, imax_volume, avg_volume,lb_volume #endif write(iulog,*) 'MessageStats: Single message (bytes) {MIN,MAX,AVG} =: ', & min_message,max_message,avg_message write(iulog,85) 1.0E-3*TotalComVolume,1.0E-3*tcv%onnode,1.0E-3*tcv%offnode endif 85 format(" MessageStats: TCV total: ",f10.1," onnode: ",f10.1," offnode: ",f10.1) 110 format(1x,A,I10,I10,f12.2,f12.2) end subroutine fooCalc subroutine foo(Schedule,count) use kinds, only : real_kind, log_kind use flops_mod, only : min_number,max_number, avg_number ! _EXTERNAL use dimensions_mod, only : nmpi_per_node, npart, nelem use parallel_mod, only : iam type (Schedule_t),intent(in) :: Schedule(:) type (GraphStats_t),intent(inout) :: count(:) integer :: node2,nbr,ip,node1,icycle,nrecv,nsend logical :: offnode,found_offnode,found_onnode,found_lbnode integer(kind=int_kind) :: edgecut_total real(kind=real_kind) :: lb_nelemd,avg_nelemd integer(kind=int_kind) :: edgecut_offnode,edgecut_onnode integer(kind=int_kind) :: TnSend integer :: nSched integer :: nelemd0 logical(kind=log_kind),parameter :: Debug=.FALSE. nSched=SIZE(Schedule) count(:)%offnode=0 count(:)%onnode=0 count(:)%LB=0 edgecut_offnode=0 edgecut_onnode=0 min_number = 1000000 max_number = 0 TnSend=0 do ip=1,nSched node1=((ip-1)/nmpi_per_node)+1 found_offnode=.FALSE. found_onnode=.FALSE. found_lbnode=.FALSE. nRecv = Schedule(ip)%nRecvCycles nSend = Schedule(ip)%nSendCycles nelemd0 = Schedule(ip)%nelemd do icycle=1,nRecv nbr = Schedule(ip)%RecvCycle(icycle)%source node2=((nbr-1)/nmpi_per_node)+1 if(node2 .ne. node1 ) then ! Message is set off-node if(nelemd0 .eq. MaxNelemd) found_lbnode = .TRUE. found_offnode=.TRUE. edgecut_offnode = edgecut_offnode + 1 else ! Message is set to an on-node processor edgecut_onnode = edgecut_onnode + 1 endif if(node2 .eq. node1 ) found_onnode = .TRUE. enddo if(found_offnode) count(node1)%offnode = count(node1)%offnode + 1 if(found_onnode) count(node1)%onnode = count(node1)%onnode + 1 if(found_lbnode) count(node1)%LB = count(node1)%LB + 1 min_number = MIN(min_number,nSend) max_number = MAX(max_number,nSend) TnSend=TnSend+nSend enddo if(Debug) write(iulog,*)'count(:)%offnode : ',count(:)%offnode if(Debug) write(iulog,*)'count(:)%LB : ',count(:)%LB !------------------------------------------------- ! This Outputs general information about the Graph !------------------------------------------------- edgecut_offnode = edgecut_offnode/2 edgecut_onnode = edgecut_onnode/2 edgecut_total = edgecut_offnode + edgecut_onnode avg_nelemd=dble(nelem)/dble(npart) lb_nelemd=(MaxNelemd-avg_nelemd)/MaxNelemd avg_number = dble(TnSend)/dble(npart) if(iam .eq. 1) then write(iulog,*)'MessageStats: Number of MPI processes are: ',npart write(iulog,100) 'MessageStats: nelemd is {MIN,MAX,AVG,LB}: ',MinNelemd,MaxNelemd,avg_nelemd,lb_nelemd write(iulog,*) 'MessageStats: number of neighbors {MIN,MAX,AVG} ',min_number,max_number,avg_number write(iulog,75) edgecut_total,edgecut_onnode,edgecut_offnode endif 100 format(1x,A,I8,I8,f10.2,f8.5) 75 format(" MessageStats: Edgecut total: ",i10," onnode: ",i10," offnode: ",i10) end subroutine foo #endif subroutine PrintCycle(Cycle) implicit none type (Cycle_t),intent(in),target :: Cycle write(*,97) Cycle%edge%number,Cycle%type,Cycle%edge%nmembers, & Cycle%lengthP,Cycle%source, Cycle%dest,Cycle%ptrP 97 format(5x,'METAEDGE #',I2,2x,'TYPE ',I1,2x,'G.EDGES',I4,2x,'WORDS ',I5,2x, & 'SRC ',I3,2x,'DEST ',I3,2x,'PTR ',I5) end subroutine PrintCycle subroutine SetCycle(elem, schedule,Cycle,Edge) use element_mod, only : element_t use dimensions_mod, only : max_corner_elem, max_neigh_edges use parallel_mod, only : abortmp, iam implicit none type(element_t), intent(inout) :: elem(:) type (Schedule_t),intent(inout) :: Schedule type (Cycle_t),intent(inout) :: Cycle type (MetaEdge_t),intent(in),target :: Edge integer :: i,il,face ! allocate and initalize the index into the buffer slots !JMD allocate(Cycle%sEdgeIndex(Edge%nmembers)) !JMD allocate(Cycle%rEdgeIndex(Edge%nmembers)) #ifndef _PREDICT do i=1,Edge%nmembers ! Setup send index il = Global2Local(Edge%members(i)%tail%number) face = Edge%members(i)%tail_face if (face.ge.5) then face = face + (face-5)*(max_corner_elem-1)+(Edge%members(i)%tail_ind-1) end if if(il .gt. 0) then ! print*, "i = ", i, "face = ", face ! print*, "putmapV = ", elem(il)%desc%putmapV(face) ! print*, '-------------' elem(il)%desc%putmapP(face) = Edge%edgeptrP(i) + Cycle%ptrP - 1 ! offset, so start at 0 elem(il)%desc%putmapP_ghost(face) = Edge%edgeptrP_ghost(i) + Cycle%ptrP_ghost ! index, start at 1 elem(il)%desc%reverse(face) = Edge%members(i)%reverse endif ! Setup receive index il = Global2Local(Edge%members(i)%head%number) face = Edge%members(i)%head_face if (face.ge.5) then face = face + (face-5)*(max_corner_elem-1)+(Edge%members(i)%head_ind-1) if(face>max_neigh_edges) then print *,__FILE__,__LINE__,iam,face,i,max_corner_elem,edge%members(i)%head_ind call abortmp('Face value out of bounds') end if end if if(il .gt. 0) then elem(il)%desc%getmapP(face) = Edge%edgeptrP(i) + Cycle%ptrP - 1 elem(il)%desc%getmapP_ghost(face) = Edge%edgeptrP_ghost(i) + Cycle%ptrP_ghost endif enddo #endif Cycle%edge => Edge Cycle%type = Edge%type Cycle%dest = Edge%HeadVertex Cycle%source = Edge%TailVertex Cycle%tag = BNDRY_EXCHANGE_MESSAGE Cycle%lengthP = Edge%wgtP Cycle%lengthP_ghost = Edge%wgtP_ghost end subroutine SetCycle function FindBufferSlot(inbr,length,tmp) result(ptr) integer :: ptr integer,intent(in) :: inbr,length integer,intent(inout) :: tmp(:,:) integer :: i,n n = SIZE(tmp,2) ptr = 0 do i=1,n if( tmp(1,i) == inbr) then ptr = tmp(2,i) return endif if( tmp(1,i) == -1 ) then tmp(1,i) = inbr if(i .eq. 1) tmp(2,i) = 1 ptr = tmp(2,i) if(i .ne. n) tmp(2,i+1) = ptr +length return endif enddo end function FindBufferSlot end module schedule_mod