Pm7K-mpi

Summary

In subs and slatsm, even routines without #if MPI or MPIK are controlled through dummy MPI variables such as procid and master. They also call mpibc1 outside any #if MPI/MPIK block, so the same code runs both with and without MPI.

Most routines do nothing more than split the loop index over nsize processes just before the loop and broadcast/combine the results after it.
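
Schematically, the common pattern looks like this (a minimal sketch only; the routine name kloop_sketch and the array result are hypothetical, and the real routines below differ in detail):

      subroutine kloop_sketch(nkp,result,mlog)
C- Sketch of the split-loop-then-share pattern used by most routines
      implicit none
      integer nkp,iq,iqini,iqend,ierr,procid,nsize
      logical mlog
      double precision result(nkp)
      integer, allocatable :: kpproc(:)
#if MPI | MPIK
      include 'mpif.h'
      call mpi_comm_rank(MPI_COMM_WORLD,procid,ierr)
      call mpi_comm_size(MPI_COMM_WORLD,nsize,ierr)
      allocate (kpproc(0:nsize))
      call dstrbp(nkp,nsize,1,kpproc(0))  ! split 1..nkp over nsize
      iqini = kpproc(procid)
      iqend = kpproc(procid+1)-1
#else
      iqini = 1              ! serial build: loop over the whole range
      iqend = nkp
#endif
      result = 0d0           ! each rank fills only its own part
      do  iq = iqini, iqend
C       ... compute the contribution of k-point iq into result(iq)
      enddo
C ... mpibc2 (allreduce) can be called in both builds, so one call
C     combines the pieces computed on each process
      call mpibc2(result,nkp,4,mlog,'kloop_sketch','result')
      end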

For some reason bandplt is somewhat more complicated.

lmv7.F

#if MPI|MPIK
      integer  nsize, id 
      call mpi_init(ierr)       <--- start mpi communication
      call mpi_comm_size(MPI_COMM_WORLD, nsize,ierr) <--- get the number of MPI processes
      call mpi_comm_rank(MPI_COMM_WORLD, id,ierr)  <--- get my rank
      write(*,*) 'mpi: size and id=',nsize,id
#endif      
#if MPI | MPIK
      call MPI_GET_PROCESSOR_NAME(name, resultlen, ierr)
      call strcop(shortname(procid),name,10,'.',i)
      namelen(procid) = i-1
      mlog = cmdopt('--mlog',6,0,strn)   <--- mlog is broadcast later. Why not do it right away?
#if MPE

#if MPI | MPIK
      if (procid .eq. master) then <---- master is defined as 0; only the master handles input/output.
#endif
        if (cmdopt('--input',6,0,strn)) then
          if (nproc .gt. 0) call rx('--input not allowed with MPI')
        else
          if (fxst('CTRL') .ne. 1) then
            call awrit0(' '//prgnam//'%a:%9pmissing ctrl file',' ',80,
     .      i1mach(2))
            swtmp = .true.
          endif
        endif
#if MPI | MPIK
      endif
#endif

The block below is not wrapped in #if MPI|MPIK, but the procid.eq.master check is still there.


      if (procid .eq. master) then
        if (.not.cmdopt('--input',7,0,strn)) then
          nfilin = fopna('CTRL',-1,1)
ctakao
          call findctrlstart(nfilin)

          alabl = '#{}% ct '
          if (cmdopt('--show',6,0,strn)) alabl = '#{}% ctp'
          call rdfile(nfilin,alabl,recrd,mxrecs,strn,recln0,nrecs)
...


#if MPI | MPIK
      call mpibc1(nrecs,1,2,mlog,'main','nrecs')  <--- bcast of nrecs
      call MPI_BCAST(recrd,recln0*(nrecs+1),MPI_CHARACTER,  <-- bcast of recrd
     .master,MPI_COMM_WORLD,ierr)
#endif
C -------------- End of program -------------
 1000 continue

#if MPI | MPIK
      call MPI_BARRIER( MPI_COMM_WORLD, ierr )
      if ( procid .eq. master ) then
        call rx0(prgnam//' on '//shortname(procid)(1:namelen(procid)))
      else
        call fexit(0,0,' ',0)
      endif

      call mpi_finalize(ierr)
#endif

fp/bndfp.F

C ... MPI setup
      procid = mpipid(1)
      master = 0
      mlog = cmdopt('--mlog',6,0,strn)
#if MPI | MPIK
      call MPI_COMM_SIZE( MPI_COMM_WORLD, numprocs, ierr ) <- always done
      call MPI_GET_PROCESSOR_NAME(name, resultlen, ierr)
      call strcop(shortname(procid),name,10,'.',i)
      namelen(procid) = i-1
#endif

master and procid are set even when not running under MPI.
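
This works because mpipid is available in both builds; a trivial serial stub along the following lines would be enough (a sketch only; the actual mpipid in slatsm takes a mode argument whose full convention is not shown here, and from the call above mode 1 is assumed to return the process id):

      integer function mpipid(mode)
C- Serial sketch of mpipid: one process, rank 0 (assumed convention:
C  mode 1 returns the rank, anything else the number of processes)
      implicit none
      integer mode
      if (mode .eq. 1) then
        mpipid = 0
      else
        mpipid = 1
      endif
      end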

      lpdiag = isw(cmdopt('--pdiag',7,0,strn))

#if MPIK
      lpdiag = 0  <--- overridden here, for some reason
#endif
#if MPI
C MPI Process configuration
      if (lpdiag .eq. 1) then  <--- unclear what this is used for.
        nblk = 16
        dims(1) = 0
        dims(2) = 0
        call MPI_DIMS_CREATE(numprocs,2,dims,ierr)
        npcol = dims(1)
        nprow = dims(2)
        if (iprint() .ge. 30) then
          call awrit3(
     .    ' MPI creating process configuration .. nprow=%i npcol=%i,'//
     .    ' blocking factor %i',
     .    ' ',256,lgunit(1),nprow,npcol,nblk)
        endif
      endif
#endif
        iopq = 0
C       suqlst in MPIK mode; returns cumulative number of k-points
#if MPIK
        iopq = 2
#endif
C MPIK: Setup to assemble all k-points into single list with qp table
#if MPIK
        if (nfbn(1) .gt. 0 .or. nfbn(2) .gt. 0) then
          call rx('Cannot use color weights with MPIK')
        endif

        call suqlsm(i)
C       Re-allocate qp and evl arrays

         if (associated(sbz%rv_p_oqp)) deallocate(sbz%rv_p_oqp)
         allocate(sbz%rv_p_oqp(abs(3*nkp)))

         if (3*nkp<0) sbz%rv_p_oqp(:)=0.0d0


        call info2(20,1,1,
     .  ' bndfp:  MPIK band plotting mode %i:  %i q-points',i,nkp)
C       Loop through all qp; accumulate vector of qp.
C       Use i2 in place of nkp to preserve nkp
        if (procid .eq. master) then
C       iq = running index to big qp list, i1 = index to current line
          iq = 0
          i2 = 0
  199     continue
          i = 1
          call pshpr(0)
           call suqlst ( plbopt , 1 , ndhamx , ef0 , i , iwdummy , nfbn 
     .     , ifbls_iv , i2 , qp , onesp ) 
          call poppr
          if (i2 .gt. 0) then
            call pshpr(0)
            do  i1 = 1, i2
              iq = iq+1
               call suqlst ( plbopt , 1 , ndhamx , ef0 , i , iwdummy , nfbn 
     .         , ifbls_iv , i2 , qp , onesp ) 

               call dpscop ( qp , sbz%rv_p_oqp , 3 , 1 , 3 * iq - 2 , 1d0 ) 

            enddo
            call poppr
            call suqlsm(i)
            if (i .ne. 3) goto 199
          endif
        endif

         call mpibc1 ( sbz%rv_p_oqp , 3 * nkp , 4 , .false. , 'bndfp' , 'qp' 
     .   ) 
#endif

Index splitting

#if MPIK
      call info0(30,1,0, ' ... Start MPI k-loop')
      sttime = MPI_WTIME()
      allocate (kpproc(0:numprocs), stat=ierr)
      call dstrbp(nkp,numprocs,1,kpproc(0))  <--- split nkp over numprocs
      iqini = kpproc(procid)
      iqend = kpproc(procid+1)-1

#else
      iqini=1        <-- non-MPI case
      iqend=nkp
#endif

      do 2010 iq = iqini, iqend  <---- each process loops over its share of the k-points.
#if MPIK
        if (iq .eq. kpproc(procid)) then
          if (mlog) then
            call gettime(datim)
            call awrit4(' bndfp '//datim//' Process %i of %i on '
     .      //shortname(procid)(1:namelen(procid))//
     .      ' starting k-points %i to %i',' ',256,lgunit(3),
     .      procid,numprocs,kpproc(procid),kpproc(procid+1)-1)
          endif
        endif
#endif

#ifndef MPIK
        qp=qplist(:,iq) <--- non-MPIK case
#else
         call dpscop ( sbz%rv_p_oqp , qp , 3 , 3 * iq - 2 , 1 , 1d0 ) <--- MPIK case: uses the qp table assembled above.
#endif


C ...  In k-parallel mode, defer this section until all qp available
#ifndef MPIK            <-------- non-MPIK case.
            ebot = dmin1(ebot,evl(1,jsp))
            i = max(1,nint(qval-qbg)/(3-nspc))
            evtop = max(evtop,evl(i,jsp))        <--- in the MPIK case this should be computed later.
            ecbot = min(ecbot,evl(i+1,jsp))
            if (lmet .eq. 0 .and. iq .eq. 1 .and. jsp .eq. 1) ef0 = evtop
            if (plbnd .eq. 0) then

              if (lwtkb .ne. -1 .and. .not.lwndow) then
                if (iq .eq. 1 .and. jsp .eq. nsp .and.
     .          .not. cmdopt('--no-fixef0',11,0,strn)) then
                  ef00 = ef0
                  call fixef0(qval-qbg,jsp,1,nevl,ndham,evl,dosw,ef0)
                  if (jsp .eq. 2 .and. ef00 .ne. ef0 .and.
     .            lwtkb .eq. 0 .and. lmet .gt. 0 .and. lrout .ne. 0) then
                    if (procid .eq. master)
     .              call info0(10,1,1,
     .              ' ... Fermi level reset in second spin'//
     .              ' channel ... restart band pass')
                    goto 99
                  endif
                endif

              endif
            endif
#endif
#if MPIK
                    i1 = iomoms ( - nfilem , nl , nsp , nspc , 2 , ndimh , 1000 +  <--- 1000+i
     .              i , 1 , iq , isp , ndham , ndimh , nchan , nchan , nev , evl 
     .              ( 1 , jsp ) , 0d0 , doswt_rv , 0d0 , 0d0 , 0d0 ) 

#else
                    i1 = iomoms ( - nfilem , nl , nsp , nspc , 2 , ndimh , i , 1  <--- i 
     .              , 1 , 1 , ndham , ndimh , nchan , nchan , nev , evl ( 1 , jsp 
     .              ) , 0d0 , doswt_rv , 0d0 , 0d0 , 0d0 ) 

#endif

#ifndef MPIK
            elseif (plbnd .ne. 0) then  <---- the band-plotting case is complicated.

ckino             nfbn(1)>0 means fat band mode => nmx=ndimh, all the evec are calculated 
ckino             n_listwf>0 means listwf is set by --writewf
              if (nfbn(1)>0 .and. n_listwf>0) then

                call rsibl_ev( ssite,sspec,slat,nbas,isp,qp,iq,ndimh,nspc

     .          ,  napw,igv2x,sham%iv_a_oindxo,nev,t_zv,k1,k2,k3
     .          ,  n_listwf,listwf    )

              endif

              i = nsp
              if (onesp .ne. 0 .or. nspc .eq. 2) i = 1

              call suqlsw2(nevl,jsp,i,evl(1,jsp),qpo) !takao
              if (nfbn(1) .ne. 0) then
                if (ndimhx .ne. nevl)
     .          call rx('color weights not implemented when '//
     .          'nevl < hamiltonian dimension')
                call suqlse ( ndimhx , jsp , i , ndimhx , 1 , nfbn , ifbls_iv 
     .          , ndhamx , t_zv , evl ) 

              endif
              if (nfbn(2) .ne. 0) then
                if (ndimhx .ne. nevl)
     .          call rx('color weights not implemented when '//
     .          'nevl < hamiltonian dimension')
                call suqlse ( ndimhx , jsp , i , ndimhx , 2 , nfbn , ifbls_iv 
     .          , ndhamx , t_zv , evl ) 

              endif
#else

C   ... Save evals for this qp
            elseif (plbnd .ne. 0) then

              call dpscop(evl(1,jsp),evlall,ndhamx,
     .        1,1+ndham*(jsp-1+isqp),1d0)

#endif
C --- Second loop over qp, needed to make k-parallelisation possible: ---
C     You can't do this until you have all the evals.
#if MPIK
#if MPE
      ierr = MPE_LOG_EVENT(EVENT_END_KLOOP,procid,"k-loop")
#endif
      entime = MPI_WTIME()
      call info2(30,0,0, ' ... Done MPI k-loop: %;1d seconds elapsed',
     .entime-sttime,0)
      call info0(20,0,-1,' ... Sharing data between processes...')
      sttime = MPI_WTIME()

C     Share bands among all processors
      call xmpbnd(kpproc,ndham,nkp,nsp,evlall)

      if (icls .ne. 0 .and. lwtkb .ne. -1) then
        call rxx(nspc.ne.1,'CLS not implemented in noncoll case')
        call xmpbnd ( kpproc , nlmax * ndham * 3 * nsites , nkp , nsp 
     .  , ausc_zv )  <--- share ausc_zv

      endif
C     Allreduce density-related quantities
      if (lrout .ne. 0) then
        call mpibc2(sumqv,6,4,mlog,'bndfp','sumqv')
        call mpibc2(sumev,6,4,mlog,'bndfp','sumev')
        call mpibc2 ( srout_zv , k1 * k2 * k3 * nsp * numq , 6 , mlog 
     .  , 'bndfp' , 'smrho' ) 

        if (lswtk .eq. 1) then
ckino Dec.8.2011:             call mpibc2 ( rv_p_oswtk , ndhamx * nkp , 4 , mlog , 'bndfp' 
           call mpibc2 ( rv_a_oswtk , ndhamx * nkp , 4 , mlog , 'bndfp' 
     .     , 'swtk' ) 

        endif
...  <---- reduce other quantities

C     Repeat loop for printout.  Put evals back into local array
      do  iq = 1, nkp
...
C end second loop over iq
      enddo
ckino 05Feb09
      if (allocated(kpproc)) deallocate(kpproc, stat=ierr)
#endif

#if MPIK
      subroutine xmpbnd(kpproc,ndham,nkp,nsp,eb)

C- Collect eb from various processors (MPI)
C ----------------------------------------------------------------------
Ci Inputs
Ci   kpproc
Ci   ndham :leading dimension of eb
Ci   nkp   :number of irreducible k-points (bzmesh.f)
Ci   nsp   :2 for spin-polarized case, otherwise 1
Ci   eb    :energy bands; alias eband
Co Outputs
Cl Local variables

      allocate (offset(0:numprocs), stat=ierr)
      allocate (length(0:numprocs), stat=ierr)
      offset(0) = 0

      do  i = 0, numprocs-1
        ista = kpproc(i)
        iend = kpproc(i+1)-1
        length(i) = (iend - ista + 1)*nsp*ndham
        offset(i+1) = offset(i) + length(i)
      enddo
      ista = kpproc(procid)
      allocate(buf_rv(ndham*nkp*nsp))

      call mpi_allgatherv ( eb ( 1 , 1 + nsp * ( ista - 1 ) ) , length 
     .( procid ) , mpi_double_precision , buf_rv , length , offset 
     ., mpi_double_precision , mpi_comm_world , ierr ) 

      call dcopy ( ndham * nsp * nkp , buf_rv , 1 , eb , 1 ) 

      if (allocated(buf_rv)) deallocate(buf_rv)


      deallocate(offset, stat=ierr)
      deallocate(length, stat=ierr)

fp/augmbl.F

Allocation for MPI

#if MPI
      integer, dimension(:), allocatable :: bproc
      complex(8) ,allocatable :: h_zv(:),
     & hbuf_zv(:),  s_zv(:), sbuf_zv(:),  hso_zv(:)
#endif


This is always executed. The variables set here could simply be kept in a module. For the non-MPI case, numprocs=1 is set so that nothing breaks.
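
A minimal sketch of what such a module could look like (an illustration of the suggestion above, not code that exists in the package):

      module m_mpi_info
C- Sketch: rank/size bookkeeping that many routines currently repeat
      implicit none
      integer :: procid   = 0      ! rank of this process
      integer :: numprocs = 1      ! serial default, safe without MPI
      integer :: master   = 0      ! rank that does all I/O
      contains
      subroutine mpi_info_init()
#if MPI | MPIK
      include 'mpif.h'
      integer ierr
      call MPI_COMM_RANK(MPI_COMM_WORLD,procid,ierr)
      call MPI_COMM_SIZE(MPI_COMM_WORLD,numprocs,ierr)
#endif
      end subroutine mpi_info_init
      end module m_mpi_info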

#if MPI
      call MPI_COMM_RANK( MPI_COMM_WORLD, procid, ierr )
      call MPI_COMM_SIZE( MPI_COMM_WORLD, numprocs, ierr )
      call MPI_GET_PROCESSOR_NAME(name, resultlen, ierr)
      call strcop(shortname(procid),name,10,'.',i)
      namelen(procid) = i-1
      master = 0
      mlog = cmdopt('--mlog',6,0,strn)
#else
      numprocs=1
#endif
#if MPI
      allocate(h_zv(abs(-ndimh*ndimh)))    <---- data local to this process
      if (-ndimh*ndimh<0) h_zv(:)=0.0d0

      allocate(s_zv(abs(-ndimh*ndimh))) <---- data local to this process
      if (-ndimh*ndimh<0) s_zv(:)=0.0d0

      if (lcplxp .ne. 0) then
        allocate(hso_zv(abs(-ndimh*ndimh)))<---- data local to this process
        if (-ndimh*ndimh<0) hso_zv(:)=0.0d0

      endif
      allocate (bproc(0:numprocs), stat=ierr) <- index array used to split the loop index over numprocs
      call dstrbp(nbas,numprocs,1,bproc(0)) <- distribute over numprocs
      initbas=bproc(procid)        <- start of this process's loop
      endbas =bproc(procid+1)-1  <- end of this process's loop
#else
      initbas=1  <-- written in the same form as the MPI case
      endbas=nbas
#endif

      do  ia = initbas, endbas <--- the loop below runs over this split index range.

C   --- Add 1-center and 2-center terms ---

#if MPI
        if (lcplxp .eq. 0) then
            call augq12 ( mode0 , ia , isp , nkaph , iprmb , lmxha , nlmha 
     .      , kmax , nlma , sv_p_osig ( 3 , ia ) %v , sv_p_oppi( 3 , ia )%v 
     .      , sv_p_osig ( 2 , ia ) %v , sv_p_oppi( 2 , ia )%v , b , ndimh 
     .      , nlmto , s_zv , h_zv )   <--- in the MPI case, the per-process s and h are used.
        else
            call augq2z ( mode0 , mode1 , ia , isp , nkaph , iprmb , lmxha 
     .      , nlmha , kmax , nlma , sv_p_osig ( 3 , ia ) %v , sv_p_oppi( 3 , ia )%v 
     .      , sv_p_osig ( 2 , ia ) %v , sv_p_oppi( 2 , ia )%v , b , ndimh 
     .      , nlmto , s_zv , h_zv , hso_zv ) 
        endif
#else
#ifndef ALL3C
        if (lcplxp .eq. 0) then
            call augq12 ( mode0 , ia , isp , nkaph , iprmb , lmxha , nlmha 
     .      , kmax , nlma , sv_p_osig ( 3 , ia ) %v , sv_p_oppi( 3 , ia )%v 
     .      , sv_p_osig ( 2 , ia ) %v , sv_p_oppi( 2 , ia )%v , b , ndimh 
     .      , nlmto , s , h ) 
        else
            call augq2z ( mode0 , mode1 , ia , isp , nkaph , iprmb , lmxha 
     .      , nlmha , kmax , nlma , sv_p_osig ( 3 , ia ) %v , sv_p_oppi( 3 , ia )%v 
     .      , sv_p_osig ( 2 , ia ) %v , sv_p_oppi( 2 , ia )%v , b , ndimh 
     .      , nlmto , s , h , hso ) 
        endif
#endif
#endif

C ... end loop over ia
      enddo              <--- end of the loop

#if MPI

      call MPI_BARRIER(MPI_COMM_WORLD,ierr) <-- wait until every process reaches this point.

      allocate(hbuf_zv(ndimh*ndimh)) <-- buffer for the reduction

      call mpi_allreduce ( h_zv , hbuf_zv , 2 * ndimh * ndimh  <--- sum h_zv over all processes into hbuf_zv
     ., mpi_double_precision , mpi_sum , mpi_comm_world , ierr ) <--- the result is available on every node.

      call daxpy ( 2 * ndimh * ndimh , 1d0 , hbuf_zv , 1 , h , 1  <-- h=h+hbuf_zv
     .)

      if (allocated(hbuf_zv)) deallocate(hbuf_zv)  <-- hbuf_zv is no longer needed.

      allocate(sbuf_zv(ndimh*ndimh))  <---- buffer for the reduction

      call mpi_allreduce ( s_zv , sbuf_zv , 2 * ndimh * ndimh   <- into sbuf_zv
     ., mpi_double_precision , mpi_sum , mpi_comm_world , ierr )

...
        call mpi_allreduce ( hso_zv , sbuf_zv , 2 * ndimh * ndimh 
     .  , mpi_double_precision , mpi_sum , mpi_comm_world , ierr )

        call daxpy ( 2 * ndimh * ndimh , 1d0 , sbuf_zv , 1 , hso , 
     .  1 )

        if (allocated(sbuf_zv)) deallocate(sbuf_zv)  <--- free arrays that are no longer needed.

        if (allocated(hso_zv)) deallocate(hso_zv)

      endif
      if (allocated(s_zv)) deallocate(s_zv)
      if (allocated(h_zv)) deallocate(h_zv)

      deallocate(bproc, stat=ierr)  <-- free the index array

#endif

fp/dfrce.F

C ... Setup array iiv0 = (vector of iv0 for parallel); allocate work arrays
#if MPI | MPIK
      iv0 = 0
      allocate(iiv0(1:nbas), stat=ierr)
      do  12  ib = 1, nbas
        is = int(ssite(ib)%spec)

        lmxl = int(sspec(is)%lmxl)

        nlm = (lmxl+1)**2
        iiv0(ib) = iv0
        iv0 = iv0+nlm
   12 continue
#endif
#if MPI | MPIK

      allocate (bproc(0:numprocs), stat=ierr)  <--- index splitting
      call dstrbp(nbas,numprocs,1,bproc(0))
      ibini = bproc(procid)
      ibend = bproc(procid+1)-1
#else
      ibini=1
      ibend=nbas
#endif

      do  ib = ibini, ibend <--- run the loop.


#if MPI | MPIK
      enddo   <--- end of the loop

      call MPI_BARRIER(MPI_COMM_WORLD,ierr)

      do  pid = 0, numprocs-1
        ib = bproc(pid)
        jb = bproc(pid+1) - ib
        call MPI_BCAST(dfh(1,ib),3*jb,MPI_DOUBLE_PRECISION,pid, <--- every node holds dfh; it is broadcast one ib block at a time.
     .  MPI_COMM_WORLD,ierr)

      enddo
      deallocate (bproc, stat=ierr)
      deallocate (iiv0, stat=ierr)

fp/fsmbl.F


C --- Loop over first and second site indices ---
#if MPI

      allocate (index(0:numprocs-1,0:nbas-1), stat=ierr)
      call dstrbp(nbas,numprocs,-1,index(0,0))  <--- split nbas

      iloopend=index(procid,0)
#else
      iloopend=nbas
#endif


      do  iloop = 1, iloopend
#if MPI
        ib1 = index(procid,iloop)

#else
        ib1=iloop
#endif
#if MPI
                        xf(m,ib1,iq) = xf(m,ib1,iq) - 2*wt*sum <--- in the MPI case, accumulate into a local array first
                        xf(m,ib2,iq) = xf(m,ib2,iq) + 2*wt*sum
#else
                        f(m,ib1,iq) = f(m,ib1,iq) - 2*wt*sum
                        f(m,ib2,iq) = f(m,ib2,iq) + 2*wt*sum
#endif

#if MPI

      call MPI_BARRIER(MPI_COMM_WORLD,ierr)

      allocate(buffer(1:3*nbas*numq), stat=ierr)
      call MPI_ALLREDUCE(xf,buffer,3*nbas*numq,  <---- sum xf into buffer
     .MPI_DOUBLE_PRECISION,MPI_SUM,MPI_COMM_WORLD,ierr)

      call daxpy(3*nbas*numq,1d0,buffer,1,f,1)   <--- f = f + buffer
      deallocate(buffer, stat=ierr)

fp/fsmbpw.F

#if MPI
      allocate (index(0:numprocs-1,0:nbas-1), stat=ierr)
      call dstrbp(nbas,numprocs,-1,index(0,0))

      ibini=1
      ibend=index(procid,0)
#else
      ibini=1
      ibend=nbas
#endif

      do iloop = ibini,ibend
#if MPI
        ib1 = index(procid,iloop)
#else
        ib1=iloop
#endif

#if MPI
                      xf(m,ib1,iq) = xf(m,ib1,iq) - 2*wt*sum
#else
                      f(m,ib1,iq) = f(m,ib1,iq) - 2*wt*sum
#endif
#if MPI

      call MPI_BARRIER(MPI_COMM_WORLD,ierr)

      allocate(buffer(1:3*nbas*numq), stat=ierr)
      call MPI_ALLREDUCE(xf,buffer,3*nbas*numq,  
     .MPI_DOUBLE_PRECISION,MPI_SUM,MPI_COMM_WORLD,ierr)

      call daxpy(3*nbas*numq,1d0,buffer,1,f,1)  <---- xf is summed into buffer, then f = f + buffer
      deallocate(buffer, stat=ierr)

      deallocate(index, stat=ierr)
      deallocate(xf, stat=ierr)
#endif

fp/hsibl.F

#if SGI | MPI
        allocate(w_oc1 ( ng*ndimx,nnn),
     &           w_ocf1( nhblk*ndimx,nnn))
#else
        allocate(w_oc1( ng*nhblk,nnn),
     &           w_ocf1(  nhblk,nnn))
#endif
#if MPI

        allocate(h_zv(abs(-ndimh*ndimh)))
        if (-ndimh*ndimh<0) h_zv(:)=0.0d0

        allocate (index(0:numprocs-1,0:nbas-1), stat=ierr)
        call dstrbp(nbas,numprocs,-1,index(0,0))  <--- split the basis atoms (nbas)

        ibini=1
        ibend=index(procid,0)
#else

        ibini=1
        ibend=nbas
#endif

        do  iloop = ibini,ibend


C     ... Add to hamiltonian
#if MPI
            call hsibl5 ( norb1 , blks1 , offl1 , ndim1 , norb2 , blks2 , 
     .      offl2 , ndim2 , ndimh , wk2_zv , h_zv )  <--- h_zv is node-local

#else
            call hsibl5 ( norb1 , blks1 , offl1 , ndim1 , norb2 , blks2 , 
     .      offl2 , ndim2 , ndimh , wk2_zv , h )

#endif

#if MPI
          call hsibl6 ( ndimh , nlmto , norb1 , blks1 , offl1 , ng , iv_iv 
     .    , napw , igapw , w_oc1(1,ip) , w_ocf1(1,ip) , h_zv ) <--- h_zv is node-local

#else
          call hsibl6 ( ndimh , nlmto , norb1 , blks1 , offl1 , ng , iv_iv 
     .    , napw , igapw , w_oc1(1,ip) , w_ocf1(1,ip) , h )

#endif

fp/lmfp.F

#if MPI

      nbas=sctrl%nbas

c     mpipic() is defined in slatsm
      if ( nbas < nproc ) then
         if (procid == master ) then
            write(6,*)
            write(6,"(a,/,a,/,a,/,a)")
     &           '# For lmf-MPI, process>NBAS is not allowed.',
     &           '# If you remove this sanity check, it will fail',
     &           '# at hsibl.F and smhsbl.F.',
     &           '# You may need a better MPI version for efficent use of CPUs.'
         endif
c     call mpi_abort(MPI_COMM_WORLD,-1,ierr)
         call mpi_finalize(ierr)
         call exit(-1)
c     ---------- '-1'  is the return code of the program
         stop
c     ---------- may be stop is unnecessary.
      endif
#endif

fp/mixrho.F

#if ! (MPI | MPIK)
        if (lbin) read (ifi,err=131,end=131) nmixr, na   <--- the MPI version broadcasts variables such as these.
        if (.not. lbin) read (ifi,*,err=131,end=131) nmixr, na
        if (nda*nsp .ne. na) then
          call awrit2(' mixrho:  expecting %i elements but found %i ...'
     .    //' discarding file',' ',80,stdo,nda*nsp,na)
          nmixr = 0
          goto 131
        endif

fp/ovlocr.F


#if MPI | MPIK
      call info0(31,1,0,' ovlocr: make sphere densities from '//
     .'overlapping FA densities (parallel)')
      allocate (ibproc(0:numprocs), stat=ierr)
      call pshpr(ipr-10)
      call dstrbp(nbas,numprocs,1,ibproc(0))  <--- split nbas over numprocs
      call poppr
      ipr = 0
      call pshpr(10)

      ibini= ibproc(procid)
      ibend= ibproc(procid+1)-1
#else
      if (ipr .ge. 30) write (stdo,300)
  300 format(/' Free atom and overlapped crystal site charges:'
     ./'   ib    true(FA)    smooth(FA)  true(OV)',
     .'    smooth(OV)    local')

      ibini= 1
      ibend= nbas
#endif
      do  ib = ibini,ibend
    ...
      enddo

#if MPI | MPIK
      call poppr

C ... Combine sphere densities from separate threads  <--- the pieces split over the nodes are shared by all processes
      do  pid = 0, numprocs-1
        do  ib = ibproc(pid), ibproc(pid+1)-1

          is=ssite(ib)%spec


          nr=sspec(is)%nr
          lmxl=sspec(is)%lmxl

          nlml = (lmxl+1)**2
           call mpibc3 ( sv_p_orhoat( 1 , ib )%v , nr * nlml * nsp , 4 , 
     .     pid , 0 , 'ovlocr' , 'rhoat(1)' ) 

           call mpibc3 ( sv_p_orhoat( 2 , ib )%v , nr * nlml * nsp , 4 , 
     .     pid , 0 , 'ovlocr' , 'rhoat(1)' ) 

           call mpibc3 ( sv_p_orhoat( 3 , ib )%v , nr * nsp , 4 , pid , 
     .     0 , 'ovlocr' , 'rhoat(1)' ) 


        enddo
      enddo

C ... Combine sum-of-sphere charges from separate threads
      call mpibc2(sqloc,1,4,mlog,'ovlocr','sqloc')
      call mpibc2(slmom,1,4,mlog,'ovlocr','slmom')

      deallocate(ibproc, stat=ierr)

#endif

fp/ovlpfa.F

#if MPI | MPIK
      allocate (kpproc(0:numprocs), stat=ierr)
      call pshpr(ipr-10)
      call dstrbp(nbas,numprocs,1,kpproc(0))
      call poppr
      ipr = 0

      ibini = kpproc(procid)
      ibend = kpproc(procid+1)-1
#else

      ibini=1
      ibend=nbas
#endif

      do ib=ibini,ibend

      enddo
C ... Combine cv from separate threads
#if MPI | MPIK
      deallocate(kpproc, stat=ierr)
      call mpibc2(cv,ng*nsp*2,4,mlog,'ovlpfa','cv')
      call mpibc2(sum,2,4,mlog,'ovlpfa','sum')
#endif
For reference, mpibc2 is documented in the source as:

      subroutine mpibc2(vec,n,cast,mlog,funnam,label)
C- Performs MPI_ALLREDUCE on a vector (MPI)
C ----------------------------------------------------------------------
Ci Inputs
Ci   vec   :vector to broadcast
Ci   n     :length of vector
Ci   cast  :cast of vector:
Ci         : 2 int
Ci         : 4 double
Ci         : 6 double complex
Ci   mlog  : T write message to mlog file
Ci   funnam:string used in writing message (function name)
Ci   label :string used in writing message (variable name)
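
Judging from its use above, mpibc2 is essentially an in-place allreduce. A minimal sketch of the double-precision case (an illustration only; the real slatsm routine also handles the other casts and the mlog logging):

      subroutine mpibc2_sketch(vec,n,cast,mlog,funnam,label)
C- Sketch of an in-place MPI_ALLREDUCE wrapper (cast 4 = double only)
      implicit none
      include 'mpif.h'
      integer n,cast,ierr
      logical mlog
      character*(*) funnam,label
      double precision vec(n)
      double precision, allocatable :: buf(:)
      if (cast .ne. 4) call rx('mpibc2_sketch: only cast 4 handled')
      allocate (buf(n))
      call mpi_allreduce(vec,buf,n,mpi_double_precision,mpi_sum,
     .  mpi_comm_world,ierr)
      call dcopy(n,buf,1,vec,1)      ! copy the summed result back
      deallocate (buf)
      end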


fp/rlocbl.F

#if MPI

      allocate (bproc(0:numprocs), stat=ierr)
      call dstrbp(nbas,numprocs,1,bproc(0))
c      do  ia = bproc(procid), bproc(procid+1)-1
      iaini=bproc(procid)
      iaend=bproc(procid+1)-1
#else
      iaini=1
      iaend=nbas
#endif
      do ia = iaini,iaend


C ... end loop over ia
      enddo
#if MPI
      call MPI_BARRIER(MPI_COMM_WORLD,ierr)

      do  pid = 0, numprocs-1
        do  ia = bproc(pid), bproc(pid+1)-1
          is = int(ssite(ia)%spec)


          lmxa=sspec(is)%lmxa
          lmxh=sspec(is)%lmxb
          kmax=sspec(is)%kmxt

          nlma = (lmxa+1)**2
          nlmh = (lmxh+1)**2
          nelt(1) = (kmax+1)*(kmax+1)*nlma*nlma
          nelt(2) = (kmax+1)*nkaph*nlma*nlmh
          nelt(3) = nkaph*nkaph*nlmh*nlmh
          do  i = 1, 3
             call mpi_bcast ( sv_p_oqkkl( i , ia )%v , nelt ( i ) * numq *  <--- share each node's data with all processes.
     .       nsp * nspc , mpi_double_precision , pid , mpi_comm_world , ierr 
     .       ) 

fp/rsibl.F

#if MPI
      nblk = 0
      nvsub = nevec/6
      if (nvsub .le. nproc) nblk = 1
      if (cmdopt('--noblk',7,0,strn)) nblk = 1
      allocate (vproc(0:nproc), stat=ierr)
      call dstrbp(nevec,nproc,nblk,vproc(0))
      if (mlog .and. iq .eq. 1) then
        call awrit3(' RSIBL: nevec=%i, np=%i, nblk=%i%N  proc   index',
     .  ' ',256,lgunit(1),nevec,nproc,nblk)
        write(lgunit(1),1) (i,vproc(i),i=0,nproc-1)
    1   format (3x,i3,3x,i5)
      endif
#else
      nblk = 16
#endif
#if MPI
      allocate(xsmrho(k1,k2,k3,numq,nspc))
      call dpzero(xsmrho,k1*k2*k3*numq*nspc*2)
      if (lfrce .ne. 0) then
        allocate(fr(3,nbas,numq))
        call dpzero(fr,3*nbas*numq)
      endif
      ivecini= vproc(procid)
      ivecend= vproc(procid+1)-1
#else
      ivecini= 1
      ivecend= nevec
#endif
      do  ivec = ivecini,ivecend, nblk
#if MPI
        nvec = min(nblk, vproc(procid+1)-ivec)
#else
        nvec = min(nblk, nevec-ivec+1)
#endif
C   ... Add to real-space mesh, optionally make smpot*psi for forces
Ckino rsibl2 executes FFT to get psi(r), which is F0
Ckino and also calculates <psi|psi>(=F0F0) to get real space charge density.
#if MPI | OPENMP
         call rsibl2 ( ng , nspc , nvec , psi , n1 , n2 , n3 , k1 , k2
ckino Dec.28.2011:       .   , k3 , iv_p_okv , numq , ewgt ( 1 , ivec ) , lfrce , smpot (
     .   , k3 , iv_a_okv , numq , ewgt ( 1 , ivec ) , lfrce , smpot (
     .   1 , 1 , 1 , isp ) , psir , xsmrho , vpsi )

#else
         call rsibl2 ( ng , nspc , nvec , psi , n1 , n2 , n3 , k1 , k2
ckino Dec.28.2011:       .   , k3 , iv_p_okv , numq , ewgt ( 1 , ivec ) , lfrce , smpot (
     .   , k3 , iv_a_okv , numq , ewgt ( 1 , ivec ) , lfrce , smpot (
     .   1 , 1 , 1 , isp ) , psir , smrho ( 1 , 1 , 1 , 1 , isp ) , vpsi
     .   )

#endif
#if MPI | OPENMP
          call rsibl1(1,ssite,sspec,q,nbas,iprmb,ng,w_ogq,w_oiv,n1,n2,
     .    n3,qlat,cosi,sini,w_oyl,w_oylw,w_ohe,w_ohr,
     .    wk,wk2,vol,iprt,ipet,etab,rtab,ndimh,nlmto,nspc,
     .    numq,ewgt,ivec,nvec,evec,vpsi,psi,fr)
#else
          call rsibl1(1,ssite,sspec,q,nbas,iprmb,ng,w_ogq,w_oiv,n1,n2,
     .    n3,qlat,cosi,sini,w_oyl,w_oylw,w_ohe,w_ohr,
     .    wk,wk2,vol,iprt,ipet,etab,rtab,ndimh,nlmto,nspc,
     .    numq,ewgt,ivec,nvec,evec,vpsi,psi,f)
#endif
#if MPI

      call MPI_BARRIER(MPI_COMM_WORLD,ierr)

      allocate(w_osmbuf(k1*k2*k3*numq*nspc))
      call MPI_ALLREDUCE(xsmrho,w_osmbuf,2*k1*k2*k3*numq*nspc,
     .MPI_DOUBLE_PRECISION,MPI_SUM,MPI_COMM_WORLD,ierr)

      call daxpy(2*k1*k2*k3*numq*nspc,1d0,w_osmbuf,1,
     .smrho(1,1,1,1,isp),1)

      deallocate(w_osmbuf)

      if (lfrce .ne. 0) then

        allocate(w_ofrbuf(3*nbas*numq))
        call MPI_ALLREDUCE(fr,w_ofrbuf,3*nbas*numq,
     .  MPI_DOUBLE_PRECISION,MPI_SUM,MPI_COMM_WORLD,ierr)

        call daxpy(3*nbas*numq,1d0,w_ofrbuf,1,f,1)

        deallocate(w_ofrbuf)
      endif
      deallocate(xsmrho)
      if (allocated(fr)) deallocate(fr)
C
      deallocate(vproc, stat=ierr)

#endif

fp/smhsbl.F

      if (nlmto .ne. 0) then
#if MPI
        allocate (hbuf(ndimh,ndimh), stat=ierr)
        allocate (sbuf(ndimh,ndimh), stat=ierr)
        call dcopy(2*ndimh*ndimh,0d0,0,hbuf,1)
        call dcopy(2*ndimh*ndimh,0d0,0,sbuf,1)
        allocate (index(0:numprocs-1,0:nbas-1), stat=ierr)
        call dstrbp(nbas,numprocs,-1,index(0,0))

        iloopmx= index(procid,0)
#else
        iloopmx= nbas
#endif


        do iloop = 1,iloopmx
#if MPI
          ib1 = index(procid,iloop)
#else
          ib1=iloop
#endif
#if MPI
                          sbuf(i1,i2) = sbuf(i1,i2) + s0(ilm1,ilm2,0,ik1,ik2)
                          hbuf(i1,i2) = hbuf(i1,i2) - s0(ilm1,ilm2,1,ik1,ik2)
     .                    + vavg*s0(ilm1,ilm2,0,ik1,ik2)
#else
                          s(i1,i2) = s(i1,i2) + s0(ilm1,ilm2,0,ik1,ik2)
                          h(i1,i2) = h(i1,i2) - s0(ilm1,ilm2,1,ik1,ik2)
     .                    + vavg*s0(ilm1,ilm2,0,ik1,ik2)
#endif
#if MPI
                          sbuf(i1,i2) = sbuf(i1,i2) + s0(ilm1,ilm2,0,ik1,ik2)
#else
                          s(i1,i2) = s(i1,i2) + s0(ilm1,ilm2,0,ik1,ik2)
#endif
#if MPI
                sbuf(i1,i2) = sbuf(i1,i2) + ovl
                hbuf(i1,i2) = hbuf(i1,i2) + qpg2*ovl + vavg*ovl
#else
                s(i1,i2) = s(i1,i2) + ovl
                h(i1,i2) = h(i1,i2) + qpg2*ovl + vavg*ovl
#endif
#if MPI
       if (mode .eq. 0) then

          allocate(w_obuff(ndimh*ndimh))
          call MPI_ALLREDUCE(hbuf,w_obuff,2*ndimh*ndimh,
     .    MPI_DOUBLE_PRECISION,MPI_SUM,MPI_COMM_WORLD,ierr)

          call daxpy(2*ndimh*ndimh,1d0,w_obuff,1,h,1)
        endif
        call MPI_ALLREDUCE(sbuf,w_obuff,2*ndimh*ndimh,
     .  MPI_DOUBLE_PRECISION,MPI_SUM,MPI_COMM_WORLD,ierr)

        call daxpy(2*ndimh*ndimh,1d0,w_obuff,1,s,1)
        deallocate(w_obuff)

        deallocate(index, stat=ierr)
        deallocate(hbuf, stat=ierr)
        deallocate(sbuf, stat=ierr)
#endif

fp/ugcomp.F

C ... Setup array iiv0 = (vector of iv0 for parallel); allocate work arrays
      mp = 1

#if MPI | MPIK
      call setofl(0,ssite,sspec,nbas,nvl0,iiv0)
      if (nlmx*nbas .lt. nvl0) call rx('ugcomp: increase nlmx')
#endif
#if MPI | MPIK

      allocate (bproc(0:numprocs), stat=ierr)
      call dstrbp(nbas,numprocs,1,bproc(0))
      ibini= bproc(procid)
      ibend= bproc(procid+1)-1
#else
      ibini=1
      ibend=nbas
#endif

      do ib=ibini,ibend
#if MPI | MPIK
        iv0 = iiv0(ib)
#endif

#if MPI | MPIK
              jv0 = iiv0(jb)
#endif
#if MPI | MPIK
      nvl = nvl0
#else
      nvl = iv0
#endif
C ... Assemble data from separate threads
#if MPI | MPIK
      call MPI_BARRIER(MPI_COMM_WORLD,ierr)

      allocate(buffer(1:nvl), stat=ierr)
      call MPI_ALLREDUCE(xgpot0,buffer,nvl,
     .MPI_DOUBLE_PRECISION,MPI_SUM,MPI_COMM_WORLD,ierr)
      call daxpy(nvl,1d0,buffer,1,gpot0,1)
      deallocate(buffer, stat=ierr)
    ...
#else
      do  80  ip = 1, mp
        do  82  ib = 1, nbas
          f(1,ib) = f(1,ib) + xf(1,ib,ip)
          f(2,ib) = f(2,ib) + xf(2,ib,ip)
          f(3,ib) = f(3,ib) + xf(3,ib,ip)
          hpot0(ib) = hpot0(ib) + xhpot0(ib,ip)
   82   continue
        do  84  i = 1, nvl
          gpot0(i) = gpot0(i) + xgpot0(i,ip)
   84   continue
        ugg = ugg + xugg(ip)
   80 continue
#endif

Inside subs there is no #if MPI, but mpibc1 and similar routines are called there, so even a build without MPI goes through the MPI routines as dummies.
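
As a sketch, such a serial dummy only has to accept the arguments and return (an assumption about its shape, not the actual slatsm source):

      subroutine mpibc1(vec,n,cast,mlog,funnam,label)
C- Serial stub sketch: with a single process there is nothing to
C  broadcast, so the call simply returns
      implicit none
      integer n,cast
      logical mlog
      character*(*) funnam,label
      double precision vec(*)
      return
      end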

subs/dstrbp

dstrbp has an actual body only in the MPI build.
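
What dstrbp has to produce can be read off from the calls above: with third argument 1 it fills kpproc(0:np) with the first index owned by each process, so that process pid loops over kpproc(pid)..kpproc(pid+1)-1. A sketch of that contiguous-block case (the real dstrbp also supports the list mode selected with -1, used e.g. in fsmbl.F, and may balance the blocks differently):

      subroutine dstrbp_sketch(n,np,kpproc)
C- Sketch of the contiguous-block distribution (third argument 1)
      implicit none
      integer n,np,kpproc(0:np),pid,nlo,nextra
      nlo = n/np                    ! minimum block size
      nextra = mod(n,np)            ! first nextra blocks get one extra
      kpproc(0) = 1
      do  pid = 0, np-1
        if (pid .lt. nextra) then
          kpproc(pid+1) = kpproc(pid) + nlo + 1
        else
          kpproc(pid+1) = kpproc(pid) + nlo
        endif
      enddo
      end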

subs/iopos.F

#if MPI | MPIK
      call MPI_COMM_RANK( MPI_COMM_WORLD, procid, ierr )
      call MPI_COMM_SIZE( MPI_COMM_WORLD, numprocs, ierr )
      call MPI_GET_PROCESSOR_NAME(name, resultlen, ierr)
      call strcop(shortname(procid),name,10,'.',i)
      namelen(procid) = i-1
      master = 0
      mlog = cmdopt('--mlog',6,0,strn)
      if (procid .eq. master) then
#endif
#if MPI | MPIK
      endif
      if (.not. lio) then
        call MPI_BCAST(bas,3*nbas,MPI_DOUBLE_PRECISION,master,
     .  MPI_COMM_WORLD,ierr)
      endif
#endif

subs/iors.F

  1. There is no #if MPI, but the routine is MPI-aware.

subs/pqmix.F

  1. There is no #if MPI, but the routine is MPI-aware.

subs/relax.F

  1. There is no #if MPI, but the routine is MPI-aware.

subs/rsmsym.F

#if MPI | MPIK
      allocate (igproc(0:max(numprocs,1)))
      call dstrbp(ng,numprocs,1,igproc)
#endif
#if MPI | MPIK

      igini = max(igproc(procid),2)
      igend = igproc(procid+1)-1
#else

      igini = 2
      igend = ng
#endif

      do ig=igini,igend

subs/sudmtu.F

slatsm, too, calls the MPI dummies even when MPI is not used.
