#ifndef ENABLE_THREADING #define ENABLE_THREADING #endif !#ifndef ENABLE_SIMD !#define ENABLE_SIMD !#endif #ifndef ENABLE_THREADED_INIT #define ENABLE_THREADED_INIT #endif !#ifndef USE_SUBROUTINES !#define USE_SUBROUTINES !#endif !#ifndef ALIGN_LEN !#define ALIGN_LEN 32 !#endif PROGRAM main IMPLICIT NONE TYPE block_data INTEGER(4) :: ng,npad,ne,nmodes,nv REAL(8) :: keff,bigr COMPLEX(8), POINTER, DIMENSION(:,:,:) :: alpha,alpr,alpz COMPLEX(8), POINTER, DIMENSION(:,:,:,:) :: & ve,be,ber,bez,eta,integrand END TYPE block_data INTEGER(4), PARAMETER :: nbl=16 TYPE(block_data) :: bl(nbl) INTEGER(4) :: ibl,iq,ie,iv,im,iloop REAL(8), PARAMETER :: mu0=1.25e-6 REAL(8) :: tst,ten COMPLEX(8) :: refval(3) CALL initialize_blocks CALL timer(tst) DO iloop=1,5000 #ifdef ENABLE_THREADING !$omp parallel do default(shared) schedule(static,1) #endif DO ibl=1,nbl CALL assemble_rhs(bl(ibl)) ENDDO #ifdef ENABLE_THREADING !$omp end parallel do #endif ENDDO CALL timer(ten) WRITE(*,*) 'Total time = ',ten-tst #ifdef ENABLE_SIMD refval(:)=bl(1)%integrand(1,1,1,:) #else refval(:)=bl(1)%integrand(:,1,1,1) #endif WRITE(*,*) 'Output = ',refval DO ibl=1,nbl DO iq=1,3 DO im=1,bl(ibl)%nmodes DO iv=1,bl(ibl)%nv DO ie=1,bl(ibl)%ne #ifdef ENABLE_SIMD IF (refval(iq) /= bl(ibl)%integrand(ie,iv,im,iq)) THEN WRITE(*,*) ibl,iq,ie,iv,im,bl(ibl)%integrand(ie,iv,im,iq) ENDIF #else IF (refval(iq) /= bl(ibl)%integrand(iq,ie,iv,im)) THEN WRITE(*,*) ibl,iq,ie,iv,im,bl(ibl)%integrand(iq,ie,iv,im) ENDIF #endif ENDDO ENDDO ENDDO ENDDO ENDDO CONTAINS SUBROUTINE assemble_rhs(bl) IMPLICIT NONE TYPE(block_data), INTENT(INOUT) :: bl CALL rhs_integrand(bl) ! TODO: assemble into rhs,rhsh,rhsv,rhsi ! assemble into a flat array END SUBROUTINE assemble_rhs SUBROUTINE rhs_integrand(bl) IMPLICIT NONE TYPE(block_data), INTENT(INOUT) :: bl #ifdef ENABLE_SIMD COMPLEX(8), TARGET, DIMENSION(bl%npad,bl%ne,bl%nmodes,3) :: w1 COMPLEX(8), POINTER, DIMENSION(:,:,:,:) :: ve,be,ef COMPLEX(8), POINTER, DIMENSION(:,:,:) :: alpha #else COMPLEX(8), DIMENSION(3,bl%ng,bl%ne,bl%nmodes) :: ef #endif !#ifndef USE_SUBROUTINES INTEGER(4) :: iv,im,ig,ie !#endif #ifdef ENABLE_SIMD ef=>w1 ve=>bl%ve be=>bl%be alpha=>bl%alpha #ifdef USE_SUBROUTINES CALL cross_prod(ef,ve,be,REAL(-1.,8)) CALL alpha_dot(bl%integrand,alpha,ef,bl%ng) #else ! vxb DO im=1,bl%nmodes DO ie=1,bl%ne !$omp simd aligned(ef,ve,be:ALIGN_LEN) linear(ig:1) DO ig=1,bl%ng ef(ig,ie,im,1) = -1.0*(ve(ig,ie,im,2)*be(ig,ie,im,3)-ve(ig,ie,im,3)*be(ig,ie,im,2)) ef(ig,ie,im,2) = -1.0*(ve(ig,ie,im,3)*be(ig,ie,im,1)-ve(ig,ie,im,1)*be(ig,ie,im,3)) ef(ig,ie,im,3) = -1.0*(ve(ig,ie,im,1)*be(ig,ie,im,2)-ve(ig,ie,im,2)*be(ig,ie,im,1)) ENDDO ENDDO ENDDO ! assemble DO im=1,bl%nmodes DO ie=1,bl%ne !$omp simd aligned(alpha,ef:ALIGN_LEN) linear(iv:1) DO iv=1,bl%nv bl%integrand(ie,iv,im,1) = SUM(alpha(1:bl%ng,ie,iv)*ef(1:bl%ng,ie,im,1),1) bl%integrand(ie,iv,im,2) = SUM(alpha(1:bl%ng,ie,iv)*ef(1:bl%ng,ie,im,2),1) bl%integrand(ie,iv,im,3) = SUM(alpha(1:bl%ng,ie,iv)*ef(1:bl%ng,ie,im,3),1) ENDDO ENDDO ENDDO #endif /* USE_SUBROUTINES */ #else ! vxb DO im=1,bl%nmodes DO ie=1,bl%ne DO ig=1,bl%ng ef(1,ig,ie,im) = -1.0*(bl%ve(2,ig,ie,im)*bl%be(3,ig,ie,im)-bl%ve(3,ig,ie,im)*bl%be(2,ig,ie,im)) ef(2,ig,ie,im) = -1.0*(bl%ve(3,ig,ie,im)*bl%be(1,ig,ie,im)-bl%ve(1,ig,ie,im)*bl%be(3,ig,ie,im)) ef(3,ig,ie,im) = -1.0*(bl%ve(1,ig,ie,im)*bl%be(2,ig,ie,im)-bl%ve(2,ig,ie,im)*bl%be(1,ig,ie,im)) ENDDO ENDDO ENDDO ! assemble DO im=1,bl%nmodes DO iv=1,bl%nv bl%integrand(1,:,iv,im) = SUM(bl%alpha(:,:,iv)*ef(1,:,:,im),1) bl%integrand(2,:,iv,im) = SUM(bl%alpha(:,:,iv)*ef(2,:,:,im),1) bl%integrand(3,:,iv,im) = SUM(bl%alpha(:,:,iv)*ef(3,:,:,im),1) ENDDO ENDDO #endif END SUBROUTINE rhs_integrand SUBROUTINE cross_prod(vout,v1,v2,fac) IMPLICIT NONE COMPLEX(8), TARGET, DIMENSION(:,:,:,:), CONTIGUOUS, INTENT(OUT) :: vout COMPLEX(8), TARGET, DIMENSION(:,:,:,:), CONTIGUOUS, INTENT(IN) :: v1,v2 REAL(8), INTENT(IN) :: fac COMPLEX(8), POINTER, DIMENSION(:,:,:) :: vout1,vout2,vout3, & v11,v12,v13,v21,v22,v23 INTEGER(4) :: im,ie,ig vout1=>vout(:,:,:,1) vout2=>vout(:,:,:,2) vout3=>vout(:,:,:,3) v11=>v1(:,:,:,1) v12=>v1(:,:,:,2) v13=>v1(:,:,:,3) v21=>v2(:,:,:,1) v22=>v2(:,:,:,2) v23=>v2(:,:,:,3) DO im=1,SIZE(vout,3) DO ie=1,SIZE(vout,2) !$omp simd aligned(vout1,vout2,vout3,v11,v12,v13,v21,v22,v23:ALIGN_LEN) !dir$ vector aligned DO ig=1,SIZE(vout,1) CALL cross_elemental_fac(vout1(ig,ie,im),vout2(ig,ie,im),vout3(ig,ie,im), & v11(ig,ie,im),v12(ig,ie,im),v13(ig,ie,im), & v21(ig,ie,im),v22(ig,ie,im),v23(ig,ie,im),fac) ENDDO ENDDO ENDDO !!dir$ vector aligned !CALL cross_elemental_fac(vout(:,:,:,1),vout(:,:,:,2),vout(:,:,:,3), & ! v1(:,:,:,1),v1(:,:,:,2),v1(:,:,:,3), & ! v2(:,:,:,1),v2(:,:,:,2),v2(:,:,:,3),fac) END SUBROUTINE cross_prod ELEMENTAL SUBROUTINE cross_elemental_fac(vout1,vout2,vout3,v11,v12,v13,v21,v22,v23,fac) #ifndef __GFORTRAN__ !$omp declare simd(cross_elemental_fac) aligned(vout1,vout2,vout3,v11,v12,v13,v21,v22,v23:ALIGN_LEN) uniform(fac) #endif IMPLICIT NONE COMPLEX(8), INTENT(OUT) :: vout1,vout2,vout3 COMPLEX(8), INTENT(IN) :: v11,v12,v13,v21,v22,v23 REAL(8), INTENT(IN) :: fac vout1=fac*(v12*v23-v13*v22) vout2=fac*(v13*v21-v11*v23) vout3=fac*(v11*v22-v12*v21) END SUBROUTINE cross_elemental_fac SUBROUTINE alpha_dot(integrand,alpha,vec,ng) IMPLICIT NONE COMPLEX(8), CONTIGUOUS, INTENT(OUT) :: integrand(:,:,:,:) COMPLEX(8), CONTIGUOUS, INTENT(IN) :: alpha(:,:,:),vec(:,:,:,:) INTEGER(4), INTENT(IN) :: ng INTEGER(4) :: im,ie,iv DO im=1,SIZE(vec,3) DO ie=1,SIZE(vec,2) #ifndef __GFORTRAN__ !$omp simd aligned(integrand,alpha,vec:ALIGN_LEN) linear(iv:1) #endif !dir$ vector aligned DO iv=1,SIZE(alpha,3) integrand(ie,iv,im,1) = SUM(alpha(1:ng,ie,iv)*vec(1:ng,ie,im,1)) integrand(ie,iv,im,2) = SUM(alpha(1:ng,ie,iv)*vec(1:ng,ie,im,2)) integrand(ie,iv,im,3) = SUM(alpha(1:ng,ie,iv)*vec(1:ng,ie,im,3)) ENDDO ENDDO ENDDO END SUBROUTINE alpha_dot SUBROUTINE initialize_blocks IMPLICIT NONE #ifdef ENABLE_THREADING #ifdef ENABLE_THREADED_INIT !$omp parallel do default(shared) schedule(static,1) #endif #endif DO ibl=1,nbl bl(ibl)%ng=36 ! poly_deg=6 bl(ibl)%npad=64 ! padded ng bl(ibl)%nv=49 ! nodes/element for pd=6: 4v + 10h + 10v + 25i bl(ibl)%ne=16 ! 4x4 blocks !bl(ibl)%ne=64 ! 8x8 blocks bl(ibl)%nmodes=1 ! full toroidal decomposition bl(ibl)%keff=2.0 bl(ibl)%bigr=1.3 ! allocate integrand #ifdef ENABLE_SIMD CALL aligned_allocate_c8d4(bl(ibl)%integrand,bl(ibl)%ne,bl(ibl)%nv,bl(ibl)%nmodes,3) #else ALLOCATE(bl(ibl)%integrand(3,bl(ibl)%ne,bl(ibl)%nv,bl(ibl)%nmodes)) #endif ! allocate test function ALLOCATE(bl(ibl)%alpha(bl(ibl)%ng,bl(ibl)%ne,bl(ibl)%nv)) bl(ibl)%alpha=(1.,2.) ALLOCATE(bl(ibl)%alpr(bl(ibl)%ng,bl(ibl)%ne,bl(ibl)%nv)) bl(ibl)%alpr=(3.,4.) ALLOCATE(bl(ibl)%alpz(bl(ibl)%ng,bl(ibl)%ne,bl(ibl)%nv)) bl(ibl)%alpz=(5.,6.) ! allocate fields CALL allocate_field(bl(ibl)%ve,3,bl(ibl)%ng,bl(ibl)%ne,bl(ibl)%nmodes) #ifdef ENABLE_SIMD bl(ibl)%ve(:,:,:,1)=(7.,8.) bl(ibl)%ve(:,:,:,2)=(9.,10.) bl(ibl)%ve(:,:,:,3)=(12.,11.) #else bl(ibl)%ve(1,:,:,:)=(7.,8.) bl(ibl)%ve(2,:,:,:)=(9.,10.) bl(ibl)%ve(3,:,:,:)=(12.,11.) #endif CALL allocate_field(bl(ibl)%be,3,bl(ibl)%ng,bl(ibl)%ne,bl(ibl)%nmodes) #ifdef ENABLE_SIMD bl(ibl)%be(:,:,:,1)=(13.,14.) bl(ibl)%be(:,:,:,2)=(15.,16.) bl(ibl)%be(:,:,:,3)=(17.,18.) #else bl(ibl)%be(1,:,:,:)=(13.,14.) bl(ibl)%be(2,:,:,:)=(15.,16.) bl(ibl)%be(3,:,:,:)=(17.,18.) #endif CALL allocate_field(bl(ibl)%ber,3,bl(ibl)%ng,bl(ibl)%ne,bl(ibl)%nmodes) bl(ibl)%ber=(11.,12.) CALL allocate_field(bl(ibl)%bez,3,bl(ibl)%ng,bl(ibl)%ne,bl(ibl)%nmodes) bl(ibl)%bez=(13.,14.) CALL allocate_field(bl(ibl)%eta,1,bl(ibl)%ng,bl(ibl)%ne,bl(ibl)%nmodes) bl(ibl)%eta=(15.,16.) ENDDO #ifdef ENABLE_THREADING #ifdef ENABLE_THREADED_INIT !$omp end parallel do #endif #endif END SUBROUTINE initialize_blocks SUBROUTINE allocate_field(field,nqty,ng,ne,nmodes) IMPLICIT NONE COMPLEX(8), POINTER, INTENT(OUT) :: field(:,:,:,:) INTEGER(4), INTENT(IN) :: nqty,ng,ne,nmodes #ifdef ENABLE_SIMD CALL aligned_allocate_c8d4(field,ng,ne,nmodes,nqty) #else ALLOCATE(field(nqty,ng,ne,nmodes)) #endif END SUBROUTINE allocate_field SUBROUTINE aligned_allocate_c8d4(array,n1,n2,n3,n4) IMPLICIT NONE COMPLEX(8), POINTER, INTENT(OUT) :: array(:,:,:,:) !!DIR$ ATTRIBUTES ALIGN : ALIGN_LEN :: array INTEGER(4), INTENT(IN) :: n1,n2,n3,n4 COMPLEX(8) :: complx INTEGER(4) :: npad,nlen nlen=INT(ALIGN_LEN,4)/INT(SIZEOF(complx),4) IF (MOD(n1,nlen)/=0) THEN npad=n1+nlen-MOD(n1,nlen) ELSE npad=n1 ENDIF ALLOCATE(array(npad,n2,n3,n4)) END SUBROUTINE aligned_allocate_c8d4 SUBROUTINE timer(time) IMPLICIT NONE REAL(8), INTENT(OUT) :: time INTEGER(4), SAVE :: count_rate,count_max,count INTEGER(8), SAVE :: last_count,count_tmp LOGICAL, SAVE :: first_call=.true. IF (first_call) THEN CALL SYSTEM_CLOCK(count=count,count_rate=count_rate, & count_max=count_max) count_tmp=count first_call=.false. ELSE CALL SYSTEM_CLOCK(count=count) count_tmp=count DO WHILE (count_tmp