diff --git a/src/atrip/Equations.cxx b/src/atrip/Equations.cxx index 209d0f4..ab9717e 100644 --- a/src/atrip/Equations.cxx +++ b/src/atrip/Equations.cxx @@ -552,8 +552,8 @@ double getEnergySame const size_t bs = Atrip::kernelDimensions.ooo.blocks, ths = Atrip::kernelDimensions.ooo.threads; - // cuda::zeroing<<>>((DataFieldType*)_t_buffer, NoNoNo); - // cuda::zeroing<<>>((DataFieldType*)_vhhh, NoNoNo); + cuda::zeroing<<>>((DataFieldType*)_t_buffer, NoNoNo); + cuda::zeroing<<>>((DataFieldType*)_vhhh, NoNoNo); #else DataFieldType* _t_buffer = (DataFieldType*)malloc(NoNoNo * sizeof(F)); DataFieldType* _vhhh = (DataFieldType*)malloc(NoNoNo * sizeof(F)); @@ -569,7 +569,6 @@ double getEnergySame WITH_CHRONO("double:reorder", cuda::zeroing<<>>((DataFieldType*)Tijk, NoNoNo); - // synchronize all initializations to zero ) #else WITH_CHRONO("double:reorder", @@ -584,37 +583,37 @@ double getEnergySame // VhhhC[i + k*No + L*NoNo] * TABhh[L + j*No]; H1 MAYBE_CONJ(_vhhh, VhhhC) WITH_CHRONO("doubles:holes:1", - DGEMM_HOLES(_vhhh, TABhh, "N") - REORDER(I, K, J) + DGEMM_HOLES(_vhhh, TABhh, "N"); + REORDER(I, K, J) ) // VhhhC[j + k*No + L*NoNo] * TABhh[i + L*No]; H0 WITH_CHRONO("doubles:holes:2", - DGEMM_HOLES(_vhhh, TABhh, "T") - REORDER(J, K, I) + DGEMM_HOLES(_vhhh, TABhh, "T"); + REORDER(J, K, I) ) // VhhhB[i + j*No + L*NoNo] * TAChh[L + k*No]; H5 MAYBE_CONJ(_vhhh, VhhhB) WITH_CHRONO("doubles:holes:3", - DGEMM_HOLES(_vhhh, TAChh, "N") - REORDER(I, J, K) + DGEMM_HOLES(_vhhh, TAChh, "N"); + REORDER(I, J, K) ) // VhhhB[k + j*No + L*NoNo] * TAChh[i + L*No]; H3 WITH_CHRONO("doubles:holes:4", - DGEMM_HOLES(_vhhh, TAChh, "T") - REORDER(K, J, I) + DGEMM_HOLES(_vhhh, TAChh, "T"); + REORDER(K, J, I) ) // VhhhA[j + i*No + L*NoNo] * TBChh[L + k*No]; H1 MAYBE_CONJ(_vhhh, VhhhA) WITH_CHRONO("doubles:holes:5", - DGEMM_HOLES(_vhhh, TBChh, "N") - REORDER(J, I, K) + DGEMM_HOLES(_vhhh, TBChh, "N"); + REORDER(J, I, K) ) // VhhhA[k + i*No + L*NoNo] * TBChh[j + L*No]; H4 WITH_CHRONO("doubles:holes:6", - DGEMM_HOLES(_vhhh, TBChh, "T") - REORDER(K, I, J) + DGEMM_HOLES(_vhhh, TBChh, "T"); + REORDER(K, I, J) ) } ) @@ -625,33 +624,33 @@ double getEnergySame { // TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv]; P0 WITH_CHRONO("doubles:particles:1", - DGEMM_PARTICLES(TAphh, VBCph) - REORDER(I, J, K) + DGEMM_PARTICLES(TAphh, VBCph); + REORDER(I, J, K) ) // TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv]; P3 WITH_CHRONO("doubles:particles:2", - DGEMM_PARTICLES(TAphh, VCBph) - REORDER(I, K, J) + DGEMM_PARTICLES(TAphh, VCBph); + REORDER(I, K, J) ) // TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv]; P5 WITH_CHRONO("doubles:particles:3", - DGEMM_PARTICLES(TCphh, VABph) - REORDER(K, I, J) + DGEMM_PARTICLES(TCphh, VABph); + REORDER(K, I, J) ) // TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv]; P2 WITH_CHRONO("doubles:particles:4", - DGEMM_PARTICLES(TCphh, VBAph) - REORDER(K, J, I) + DGEMM_PARTICLES(TCphh, VBAph); + REORDER(K, J, I) ) // TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv]; P1 WITH_CHRONO("doubles:particles:5", - DGEMM_PARTICLES(TBphh, VACph) - REORDER(J, I, K) + DGEMM_PARTICLES(TBphh, VACph); + REORDER(J, I, K) ) // TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv]; P4 WITH_CHRONO("doubles:particles:6", - DGEMM_PARTICLES(TBphh, VCAph) - REORDER(J, K, I) + DGEMM_PARTICLES(TBphh, VCAph); + REORDER(J, K, I) ) } )