! cc_r -c bind.c ! mpxlf90_r -O3 -qsmp=noauto:omp -qsource t_sort.f bind.o -o psort module galapagos type thefit sequence real val integer index,proc end type thefit end module module mpi include 'mpif.h' integer numnodes,myid,my_root,ierr end module module uname contains function unique(name) use mpi character (len=*) name character (len=20) unique character (len=80) temp if(myid .gt. 99)then write(temp,"(a,i3)")trim(name),myid else if(myid .gt. 9)then write(temp,"(a,'0',i2)")trim(name),myid else write(temp,"(a,'00',i1)")trim(name),myid endif endif unique=temp return end function unique end module uname module sort_mod ! recursive merge sort taken from pascal routine in ! moret & shapiro's book contains subroutine Sort(ain, n) use galapagos integer n type(thefit), target:: ain(n) type(thefit), pointer :: work(:) type(thefit), pointer :: a(:) common /bonk_it/ a,work !$OMP THREADPRIVATE (/bonk_it/) allocate(work(n)) nullify(a) a=>ain call RecMergeSort(1,n) deallocate(work) return end subroutine Sort recursive subroutine RecMergeSort(left, right) use galapagos type(thefit), pointer :: work(:) type(thefit), pointer :: a(:) common /bonk_it/ a,work !$OMP THREADPRIVATE (/bonk_it/) integer,intent(in):: left,right integer middle if (left < right) then middle = (left + right) / 2 call RecMergeSort(left,middle) call RecMergeSort(middle+1,right) call Merge(left,middle-left+1,right-middle) endif return end subroutine RecMergeSort subroutine Merge(s, n, m) use galapagos integer s,n,m integer i, j, k, t, u type(thefit), pointer :: work(:) type(thefit), pointer :: a(:) common /bonk_it/ a,work !$OMP THREADPRIVATE (/bonk_it/) k = 1 t = s + n u = t + m i = s j = t if ((i < t) .and. (j < u))then do while ((i < t) .and. (j < u)) if (A(i)%val .ge. A(j)%val)then work(k) = A(i) i = i + 1 k = k + 1 else work(k) = A(j) j = j + 1 k = k + 1 endif enddo endif if(i < t )then do while (i < t ) work(k) = A(i) i = i + 1 k = k + 1 enddo endif if(j < u)then do while (j < u ) work(k) = A(j) j = j + 1 k = k + 1 enddo endif i = s ! the next line is not in moret & shapiro's book k=k-1 do j = 1 , k A(i) = work(j) i = i + 1 enddo return end subroutine Merge ! this subroutine takes two sorted lists of type(thefit) and merges them ! input d1(n) , d2(m) ! output out(n+m) subroutine merge2(d1,n,d2,m,out) use galapagos implicit none integer n,m type(thefit),intent (in):: d1(n),d2(m) type(thefit), intent (out):: out(n+m) integer i,j,k i=1 j=1 do k=1,n+m if(i.gt.n)then out(k)=d2(j) j=j+1 elseif(j.gt.m)then out(k)=d1(i) i=i+1 else if(d1(i)%val .gt. d2(j)%val)then out(k)=d1(i) i=i+1 else out(k)=d2(j) j=j+1 endif endif enddo return end subroutine merge2 end module sort_mod module my_collective contains ! MY_MERGE a global merge routine ! MY_MERGE takes as input a sorted list of values from each ! processor ! it outputs on the root a sorted list of values which is a ! merged list from each processor ! when a processor recieves a list from another processor it ! merges its list with the new one before sending the list on ! this routine is a fortran 90 implimentation of a gather ! routine plus the merge step subroutine MY_MERGE(input,n,output,m,root,THETAG,THECOM,MPI_FIT) use galapagos use mpi use sort_mod implicit none integer, intent(in) :: n type(thefit), intent(inout) :: input(n) type(thefit), intent (inout):: output(*) integer, intent (inout):: m integer, intent(in) :: root integer THETAG,THECOM,MPI_FIT,NTAG integer iout type(thefit) , allocatable:: rec(:),temp(:),temp2(:) type(thefit) , allocatable :: data(:) integer p,active,rank,count,i,ijk integer status(MPI_STATUS_SIZE) call MPI_COMM_RANK( THECOM, rank, ierr ) call MPI_COMM_SIZE( THECOM, p, ierr ) iout=rank+100 NTAG=(THETAG+1) active=1 do while (2*active < p) active=active*2 enddo if(rank .ge. active)then ! send(input,rank-active) write(iout,*)"at 200 send ",rank-active call MPI_SEND(input,n,MPI_FIT,rank-active,THETAG,THECOM,ierr) return endif allocate(data(n)) data=input m=n if(rank + active < p)then ! rec(new_data,rank+active) call MPI_PROBE(rank+active,THETAG,THECOM,status,ierr) call MPI_get_count(status,MPI_FIT,count,ierr) allocate(rec(count)) write(iout,*)"at 211 recv ",rank+active call MPI_RECV(rec,count,MPI_FIT,rank+active,THETAG,THECOM,status,ierr) ! data=data+new_data allocate(temp(n+count)) call merge2(input,n,rec,count,temp) if(allocated(data))deallocate(data) allocate(data(n+count)) data=temp m=n+count endif do while (active > 1) active=active/2 if(rank .ge. active) then ! send(data,rank-active) write(iout,*)"at 225 send ",rank-active call MPI_SEND(data,m,MPI_FIT,rank-active,THETAG,THECOM,ierr) if(allocated(data))deallocate(data) if(allocated(rec))deallocate(rec) if(allocated(temp))deallocate(temp) if(allocated(temp2))deallocate(temp2) return else ! recv(new_data,rank+active) call MPI_PROBE(rank+active,THETAG,THECOM,status,ierr) call MPI_get_count(status,MPI_FIT,count,ierr) if(allocated(rec))deallocate(rec) allocate(rec(count)) write(iout,*)"at 238 recv ",rank+active call MPI_RECV(rec,count,MPI_FIT,rank+active,THETAG,THECOM,status,ierr) ! data=data+new_data if(allocated(temp2))deallocate(temp2) allocate(temp2(m+count)) call merge2(data,m,rec,count,temp2) deallocate(data);allocate(data(m+count)) data=temp2 m=m+count endif enddo if(rank .eq. root)then do i=1,m output(i)=data(i) enddo endif if(allocated(data))deallocate(data) if(allocated(rec))deallocate(rec) if(allocated(temp))deallocate(temp) if(allocated(temp2))deallocate(temp2) return end subroutine MY_MERGE end module my_collective subroutine doseed(myseed) use mpi integer :: my_seed ! optional argument not changed in the routine integer,allocatable :: seed(:) integer the_size,j call random_seed(size=the_size) ! how big is the intrisic seed? allocate(seed(the_size)) ! allocate space for seed do j=1,the_size ! create the seed seed(j)=abs(my_seed)+(j-1)+myid*10 ! abs is generic enddo call random_seed(put=seed) ! assign the seed deallocate(seed) ! deallocate space end subroutine module ran_mod integer, parameter:: b8 = selected_real_kind(14) contains function ran1(idum,r,rm1,rm2,j,ix1,ix2,ix3 ) implicit none real(b8) ran1 !************* real(b8) r(97),rm1,rm2 integer j integer ix1,ix2,ix3 ! common/ran_com/ r,rm1,rm2,j,ix1,ix2,ix3 !!$OMP THREADPRIVATE (/ran_com/) !************* integer, intent(inout), optional :: idum integer, parameter :: m1=259200,ia1=7141,ic1=54773 integer, parameter :: m2=134456,ia2=8121,ic2=28411 integer, parameter :: m3=243000,ia3=4561,ic3=51349 save ! corrects a bug in the original routine if(present(idum))then if (idum<0)then rm1=1.0_b8/m1 rm2=1.0_b8/m2 ix1=mod(ic1-idum,m1) ix1=mod(ia1*ix1+ic1,m1) ix2=mod(ix1,m2) ix1=mod(ia1*ix1+ic1,m1) ix3=mod(ix1,m3) do j=1,97 ix1=mod(ia1*ix1+ic1,m1) ix2=mod(ia2*ix2+ic2,m2) r(j)=(real(ix1,b8)+real(ix2,b8)*rm2)*rm1 enddo idum=1 endif endif ix1=mod(ia1*ix1+ic1,m1) ix2=mod(ia2*ix2+ic2,m2) ix3=mod(ia3*ix3+ic3,m3) j=1+(97*ix3)/m3 if(j>97.or.j<1)then write(*,*)' error in ran1 j=',j stop endif ran1=r(j) r(j)=(real(ix1,b8)+real(ix2,b8)*rm2)*rm1 return end function ran1 end module program test use galapagos use sort_mod use my_collective use mpi use uname use ran_mod implicit none integer jin integer ix1,ix2,ix3 real(b8) r(97),rm1,rm2 integer :: crate integer , dimension(0:4) :: scount,ecount real(b8) , dimension(0:4) :: rclock real x logical :: do_rite integer i,j,k,m,di,k1,k2,myseed,gsize,ithread integer OMP_GET_MAX_THREADS,OMP_GET_THREAD_NUM integer, allocatable :: kstart(:),kend(:) type(thefit),allocatable :: data(:),output1(:),output2(:),output(:) integer t(2),b(2),d(2),MPI_SORT character (len=MPI_MAX_PROCESSOR_NAME):: myname integer mylen call MPI_INIT( ierr ) call MPI_COMM_RANK( MPI_COMM_WORLD, myid, ierr ) call MPI_COMM_SIZE( MPI_COMM_WORLD, numnodes, ierr ) call MPI_Get_processor_name(myname,mylen,ierr) write(*,*)myid,"running on",trim(myname) do_rite=.true. scount=0;ecount=0; my_root=0 myseed=12345 call doseed(myseed) open(16,file=unique("out")) if(do_rite)then write(unit=*,fmt=*)"OMP_GET_MAX_THREADS=",OMP_GET_MAX_THREADS() endif b(1)=1;b(2)=2 d(1)=0;d(2)=4 t(1)=MPI_REAL;t(2)=MPI_INTEGER call MPI_TYPE_STRUCT(2,b,d,t,MPI_SORT,ierr) call MPI_TYPE_COMMIT(MPI_SORT,ierr) if(myid .eq. my_root)then i=6402 do_rite=.true. ! read(*,*)i,do_rite endif call MPI_Bcast(i,1,MPI_INTEGER,0,MPI_COMM_WORLD,ierr) call MPI_Bcast(do_rite,1,MPI_LOGICAL,0,MPI_COMM_WORLD,ierr) gsize=i*numnodes if(myid .eq. my_root)then allocate(output(gsize)) if(do_rite)then write(16,*)"starting list" endif do j=1,gsize call random_number(output(j)%val) output(j)%index=j enddo endif call MPI_Barrier(MPI_COMM_WORLD,ierr) call system_clock(scount(0)) call system_clock(scount(1)) allocate(data(i)) call MPI_Scatter(output,i,MPI_SORT, & data, i,MPI_SORT, & my_root,MPI_COMM_WORLD,ierr) call system_clock(ecount(1)) if(do_rite)then write(16,*)"i hold:" endif do j=1,i data(j)%proc=myid if(do_rite)then write(16,'(f10.8,1x,i5,i5)')data(j)%val, & data(j)%proc,& data(j)%index endif enddo m=4 di=(i/m)-1 allocate(kstart(m),kend(m)) kstart(1)=1 kend(1)=di+1 do j=2,m kstart(j)=kend(j-1)+1 kend(j)=kstart(j)+di enddo kend(m)=i if(do_rite)then write(16,"(8i5)")kstart write(16,"(8i5)")kend endif call system_clock(scount(2)) !$OMP PARALLEL SECTIONS !$OMP SECTION call sort(data(kstart(1):kend(1)), kend(1)-kstart(1)+1 ) !$OMP SECTION call sort(data(kstart(2):kend(2)), kend(2)-kstart(2)+1 ) !$OMP SECTION call sort(data(kstart(3):kend(3)), kend(3)-kstart(3)+1 ) !$OMP SECTION call sort(data(kstart(4):kend(4)), kend(4)-kstart(4)+1 ) !$OMP END PARALLEL SECTIONS call system_clock(ecount(2)) if(do_rite)then do k=1,m write(16,*)"sorted section ",k do j=kstart(k),kend(k) write(16,'(f10.8,1x,i5)')data(j)%val,data(j)%index enddo enddo endif !$OMP PARALLEL SECTIONS !$OMP SECTION call system_clock(scount(3)) k1=kend(2)-kstart(1)+1 allocate(output1(k1)) call merge2(data(kstart(1):kend(1)),kend(1)-kstart(1)+1, & data(kstart(2):kend(2)),kend(2)-kstart(2)+1,output1) !$OMP SECTION k2=kend(4)-kstart(3)+1 allocate(output2(k2)) call merge2(data(kstart(3):kend(3)),kend(3)-kstart(3)+1, & data(kstart(4):kend(4)),kend(4)-kstart(4)+1,output2) !$OMP END PARALLEL SECTIONS call merge2(output1,k1,output2,k2,data) call system_clock(ecount(3)) if(do_rite)then write(16,*)"merged single processor sorted list " do j=1,i write(16,'(f10.8,1x,i5)')data(j)%val,data(j)%index enddo endif m=0 call system_clock(scount(4)) call MY_MERGE(data,i,output,m,my_root,1234,MPI_COMM_WORLD,MPI_SORT) call system_clock(ecount(4)) if(do_rite)then if(myid .eq. my_root)then write(16,*)"merged list from all processors" do j=1,m write(16,'(f10.8,1x,i5,i5)')output(j)%val,output(j)%proc,output(j)%index enddo endif endif call system_clock(ecount(0)) call system_clock(count_rate=crate) rclock=ecount-scount rclock=rclock/real(crate,b8) write(unit=16,fmt="(""tpoints="",i10,"" nodes="",i3,1x)",advance="no")i*numnodes,numnodes write(unit=16,fmt="(""times="",5f8.5)")rclock call MPI_FINALIZE(ierr) end program