diff --git a/src/atrip/Atrip.cxx b/src/atrip/Atrip.cxx index 5bf556c..d85cf20 100644 --- a/src/atrip/Atrip.cxx +++ b/src/atrip/Atrip.cxx @@ -646,6 +646,9 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { // COMPUTE SINGLES %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% {{{1 OCD_Barrier(universe); +#if defined(ATRIP_ONLY_DGEMM) + if (false) +#endif if (!isFakeTuple(i)) { WITH_CHRONO("oneshot-unwrap", WITH_CHRONO("unwrap", @@ -678,6 +681,9 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { // COMPUTE ENERGY %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% {{{1 +#if defined(ATRIP_ONLY_DGEMM) + if (false) +#endif if (!isFakeTuple(i)) { double tupleEnergy(0.); diff --git a/src/atrip/Equations.cxx b/src/atrip/Equations.cxx index ab9717e..4439383 100644 --- a/src/atrip/Equations.cxx +++ b/src/atrip/Equations.cxx @@ -151,12 +151,11 @@ namespace cuda { KJI }; - /* + /* * Please the c++ type checker and template creator * in order to have an argument in the signature of * the function that helps the compiler know which * instantiation it should take. - * */ template struct reorder_proxy {}; @@ -436,22 +435,20 @@ double getEnergySame , DataFieldType* Tijk_ ) { - const size_t a = abc[0], b = abc[1], c = abc[2] - , NoNo = No*No - ; + const size_t NoNo = No*No; DataFieldType* Tijk = (DataFieldType*)Tijk_; #if defined(ATRIP_USE_DGEMM) #if defined(HAVE_CUDA) -#define REORDER(__II, __JJ, __KK) \ - reorder<<>>(reorder_proxy< \ - DataFieldType, \ - __II ## __JJ ## __KK \ - >{}, \ - No, \ - Tijk, \ - _t_buffer); +#define REORDER(__II, __JJ, __KK) \ + reorder<<>>(reorder_proxy< \ + DataFieldType, \ + __II ## __JJ ## __KK \ + >{}, \ + No, \ + Tijk, \ + _t_buffer) #define DGEMM_PARTICLES(__A, __B) \ atrip::xgemm("T", \ "N", \ @@ -481,11 +478,18 @@ double getEnergySame _t_buffer, \ (int const*)&NoNo \ ) -#define MAYBE_CONJ(_conj, _buffer) \ - cuda::maybeConjugate<<< \ - Atrip::kernelDimensions.ooo.blocks, \ - Atrip::kernelDimensions.ooo.threads \ - >>>((DataFieldType*)_conj, (DataFieldType*)_buffer, NoNoNo); +#define MAYBE_CONJ(_conj, _buffer) \ + do { \ + cuda::maybeConjugate<<< \ + \ + Atrip::kernelDimensions.ooo.blocks, \ + \ + Atrip::kernelDimensions.ooo.threads \ + \ + >>>((DataFieldType*)_conj, \ + (DataFieldType*)_buffer, \ + NoNoNo); \ + } while (0) // END CUDA //////////////////////////////////////////////////////////////////// @@ -500,7 +504,9 @@ double getEnergySame #define REORDER(__II, __JJ, __KK) \ reorder(reorder_proxy, \ __II ## __JJ ## __KK >{}, \ - No, Tijk, _t_buffer); + No, \ + Tijk, \ + _t_buffer) #define DGEMM_PARTICLES(__A, __B) \ atrip::xgemm("T", \ "N", \ @@ -531,9 +537,13 @@ double getEnergySame _t_buffer, \ (int const*)&NoNo \ ) -#define MAYBE_CONJ(_conj, _buffer) \ - for (size_t __i = 0; __i < NoNoNo; ++__i) \ - _conj[__i] = maybeConjugate(_buffer[__i]); +#define MAYBE_CONJ(_conj, _buffer) \ + do { \ + for (size_t __i = 0; __i < NoNoNo; ++__i) { \ + _conj[__i] \ + = maybeConjugate(_buffer[__i]); \ + } \ + } while (0) #endif F one{1.0}, m_one{-1.0}, zero{0.0}; @@ -552,8 +562,12 @@ double getEnergySame const size_t bs = Atrip::kernelDimensions.ooo.blocks, ths = Atrip::kernelDimensions.ooo.threads; + +#if !defined(ATRIP_ONLY_DGEMM) cuda::zeroing<<>>((DataFieldType*)_t_buffer, NoNoNo); cuda::zeroing<<>>((DataFieldType*)_vhhh, NoNoNo); +#endif + #else DataFieldType* _t_buffer = (DataFieldType*)malloc(NoNoNo * sizeof(F)); DataFieldType* _vhhh = (DataFieldType*)malloc(NoNoNo * sizeof(F)); @@ -565,7 +579,7 @@ double getEnergySame #endif // Set Tijk to zero -#ifdef HAVE_CUDA +#if defined(HAVE_CUDA) && !defined(ATRIP_ONLY_DGEMM) WITH_CHRONO("double:reorder", cuda::zeroing<<>>((DataFieldType*)Tijk, NoNoNo); @@ -577,43 +591,51 @@ double getEnergySame }) #endif + +#if defined(ATRIP_ONLY_DGEMM) +#undef MAYBE_CONJ +#undef REORDER +#define MAYBE_CONJ(a, b) do {} while(0) +#define REORDER(i, j, k) do {} while(0) +#endif + // HOLES WITH_CHRONO("doubles:holes", { // VhhhC[i + k*No + L*NoNo] * TABhh[L + j*No]; H1 - MAYBE_CONJ(_vhhh, VhhhC) + MAYBE_CONJ(_vhhh, VhhhC); WITH_CHRONO("doubles:holes:1", DGEMM_HOLES(_vhhh, TABhh, "N"); - REORDER(I, K, J) + REORDER(I, K, J); ) // VhhhC[j + k*No + L*NoNo] * TABhh[i + L*No]; H0 WITH_CHRONO("doubles:holes:2", DGEMM_HOLES(_vhhh, TABhh, "T"); - REORDER(J, K, I) + REORDER(J, K, I); ) // VhhhB[i + j*No + L*NoNo] * TAChh[L + k*No]; H5 - MAYBE_CONJ(_vhhh, VhhhB) + MAYBE_CONJ(_vhhh, VhhhB); WITH_CHRONO("doubles:holes:3", DGEMM_HOLES(_vhhh, TAChh, "N"); - REORDER(I, J, K) + REORDER(I, J, K); ) // VhhhB[k + j*No + L*NoNo] * TAChh[i + L*No]; H3 WITH_CHRONO("doubles:holes:4", DGEMM_HOLES(_vhhh, TAChh, "T"); - REORDER(K, J, I) + REORDER(K, J, I); ) // VhhhA[j + i*No + L*NoNo] * TBChh[L + k*No]; H1 - MAYBE_CONJ(_vhhh, VhhhA) + MAYBE_CONJ(_vhhh, VhhhA); WITH_CHRONO("doubles:holes:5", DGEMM_HOLES(_vhhh, TBChh, "N"); - REORDER(J, I, K) + REORDER(J, I, K); ) // VhhhA[k + i*No + L*NoNo] * TBChh[j + L*No]; H4 WITH_CHRONO("doubles:holes:6", DGEMM_HOLES(_vhhh, TBChh, "T"); - REORDER(K, I, J) + REORDER(K, I, J); ) } ) @@ -625,32 +647,32 @@ double getEnergySame // TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv]; P0 WITH_CHRONO("doubles:particles:1", DGEMM_PARTICLES(TAphh, VBCph); - REORDER(I, J, K) + REORDER(I, J, K); ) // TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv]; P3 WITH_CHRONO("doubles:particles:2", DGEMM_PARTICLES(TAphh, VCBph); - REORDER(I, K, J) + REORDER(I, K, J); ) // TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv]; P5 WITH_CHRONO("doubles:particles:3", DGEMM_PARTICLES(TCphh, VABph); - REORDER(K, I, J) + REORDER(K, I, J); ) // TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv]; P2 WITH_CHRONO("doubles:particles:4", DGEMM_PARTICLES(TCphh, VBAph); - REORDER(K, J, I) + REORDER(K, J, I); ) // TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv]; P1 WITH_CHRONO("doubles:particles:5", DGEMM_PARTICLES(TBphh, VACph); - REORDER(J, I, K) + REORDER(J, I, K); ) // TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv]; P4 WITH_CHRONO("doubles:particles:6", DGEMM_PARTICLES(TBphh, VCAph); - REORDER(J, K, I) + REORDER(J, K, I); ) } )