diff --git a/.gitignore b/.gitignore index e602abe..91b3275 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,4 @@ .emacs doc/doxygen/ +*.o +*.d diff --git a/Sources.mk b/Sources.mk new file mode 100644 index 0000000..c63770b --- /dev/null +++ b/Sources.mk @@ -0,0 +1 @@ +ATRIP_SOURCES = include/atrip.hpp src/atrip/Atrip.cxx include/atrip/Atrip.hpp include/atrip/Debug.hpp include/atrip/Equations.hpp include/atrip/Tuples.hpp include/atrip/Unions.hpp include/atrip/RankMap.hpp include/atrip/Blas.hpp include/atrip/Utils.hpp include/atrip/SliceUnion.hpp include/atrip/Slice.hpp diff --git a/include/atrip.hpp b/include/atrip.hpp new file mode 100644 index 0000000..b3ef823 --- /dev/null +++ b/include/atrip.hpp @@ -0,0 +1,5 @@ +// [[file:../atrip.org::*Include header][Include header:1]] +#pragma once + +#include +// Include header:1 ends here diff --git a/include/atrip/Atrip.hpp b/include/atrip/Atrip.hpp new file mode 100644 index 0000000..fd20c01 --- /dev/null +++ b/include/atrip/Atrip.hpp @@ -0,0 +1,48 @@ +// [[file:../../atrip.org::*Atrip][Atrip:1]] +#pragma once +#include +#include +#include +#include + +#include + +namespace atrip { + + struct Atrip { + + static int rank; + static int np; + static void init(); + + struct Input { + CTF::Tensor *ei = nullptr + , *ea = nullptr + , *Tph = nullptr + , *Tpphh = nullptr + , *Vpphh = nullptr + , *Vhhhp = nullptr + , *Vppph = nullptr + ; + int maxIterations = 0, iterationMod = -1; + bool barrier = true; + Input& with_epsilon_i(CTF::Tensor * t) { ei = t; return *this; } + Input& with_epsilon_a(CTF::Tensor * t) { ea = t; return *this; } + Input& with_Tai(CTF::Tensor * t) { Tph = t; return *this; } + Input& with_Tabij(CTF::Tensor * t) { Tpphh = t; return *this; } + Input& with_Vabij(CTF::Tensor * t) { Vpphh = t; return *this; } + Input& with_Vijka(CTF::Tensor * t) { Vhhhp = t; return *this; } + Input& with_Vabci(CTF::Tensor * t) { Vppph = t; return *this; } + Input& with_maxIterations(int i) { maxIterations = i; return *this; } + Input& with_iterationMod(int i) { iterationMod = i; return *this; } + Input& with_barrier(bool i) { barrier = i; return *this; } + }; + + struct Output { + double energy; + }; + static Output run(Input const& in); + }; + +} +// Atrip:1 ends here diff --git a/include/atrip/Blas.hpp b/include/atrip/Blas.hpp new file mode 100644 index 0000000..fa63028 --- /dev/null +++ b/include/atrip/Blas.hpp @@ -0,0 +1,22 @@ +// [[file:../../atrip.org::*Blas][Blas:1]] +#pragma once +namespace atrip { + extern "C" { + void dgemm_( + const char *transa, + const char *transb, + const int *m, + const int *n, + const int *k, + double *alpha, + const double *A, + const int *lda, + const double *B, + const int *ldb, + double *beta, + double *C, + const int *ldc + ); + } +} +// Blas:1 ends here diff --git a/include/atrip/Debug.hpp b/include/atrip/Debug.hpp new file mode 100644 index 0000000..19ffb22 --- /dev/null +++ b/include/atrip/Debug.hpp @@ -0,0 +1,60 @@ +// [[file:../../atrip.org::*Debug][Debug:1]] +#pragma once +#define ATRIP_BENCHMARK +#define ATRIP_DONT_SLICE +#define ATRIP_DEBUG 1 +//#define ATRIP_WORKLOAD_DUMP +#define ATRIP_USE_DGEMM +//#define ATRIP_PRINT_TUPLES + +#define LOG(level, name) if (Atrip::rank == 0) std::cout << name << ": " + +#if ATRIP_DEBUG == 4 +# pragma message("WARNING: You have OCD debugging ABC triples "\ + "expect GB of output and consult your therapist") +# include +# define HAVE_OCD +# define OCD_Barrier(com) MPI_Barrier(com) +# define WITH_OCD +# define WITH_ROOT if (atrip::Atrip::rank == 0) +# define WITH_SPECIAL(r) 
if (atrip::Atrip::rank == r) +# define WITH_RANK std::cout << atrip::Atrip::rank << ": " +# define WITH_CRAZY_DEBUG +# define WITH_DBG +# define DBG(...) dbg(__VA_ARGS__) +#elif ATRIP_DEBUG == 3 +# pragma message("WARNING: You have crazy debugging ABC triples,"\ + " expect GB of output") +# include +# define OCD_Barrier(com) +# define WITH_OCD if (false) +# define WITH_ROOT if (atrip::Atrip::rank == 0) +# define WITH_SPECIAL(r) if (atrip::Atrip::rank == r) +# define WITH_RANK std::cout << atrip::Atrip::rank << ": " +# define WITH_CRAZY_DEBUG +# define WITH_DBG +# define DBG(...) dbg(__VA_ARGS__) +#elif ATRIP_DEBUG == 2 +# pragma message("WARNING: You have some debugging info for ABC triples") +# include +# define OCD_Barrier(com) +# define WITH_OCD if (false) +# define WITH_ROOT if (atrip::Atrip::rank == 0) +# define WITH_SPECIAL(r) if (atrip::Atrip::rank == r) +# define WITH_RANK std::cout << atrip::Atrip::rank << ": " +# define WITH_CRAZY_DEBUG if (false) +# define WITH_DBG +# define DBG(...) dbg(__VA_ARGS__) +#elif ATRIP_DEBUG == 1 +# define OCD_Barrier(com) +# define WITH_OCD if (false) +# define WITH_ROOT if (false) +# define WITH_SPECIAL(r) if (false) +# define WITH_RANK if (false) std::cout << atrip::Atrip::rank << ": " +# define WITH_DBG if (false) +# define WITH_CRAZY_DEBUG if (false) +# define DBG(...) +#else +# error("ATRIP_DEBUG is not defined!") +#endif +// Debug:1 ends here diff --git a/include/atrip/Equations.hpp b/include/atrip/Equations.hpp new file mode 100644 index 0000000..ca6a859 --- /dev/null +++ b/include/atrip/Equations.hpp @@ -0,0 +1,337 @@ +// [[file:../../atrip.org::*Equations][Equations:1]] +#pragma once + +#include +#include + +namespace atrip { + + double getEnergyDistinct + ( const double epsabc + , std::vector const& epsi + , std::vector const& Tijk_ + , std::vector const& Zijk_ + ) { + constexpr size_t blockSize=16; + double energy(0.); + const size_t No = epsi.size(); + for (size_t kk=0; kk k ? kk : k; + for (size_t j(jstart); j < jend; j++){ + const double ej(epsi[j]); + double facjk( j == k ? 0.5 : 1.0); + size_t istart = ii > j ? ii : j; + for (size_t i(istart); i < iend; i++){ + const double ei(epsi[i]); + double facij ( i==j ? 0.5 : 1.0); + double denominator(epsabc - ei - ej - ek); + double U(Zijk_[i + No*j + No*No*k]); + double V(Zijk_[i + No*k + No*No*j]); + double W(Zijk_[j + No*i + No*No*k]); + double X(Zijk_[j + No*k + No*No*i]); + double Y(Zijk_[k + No*i + No*No*j]); + double Z(Zijk_[k + No*j + No*No*i]); + + double A(Tijk_[i + No*j + No*No*k]); + double B(Tijk_[i + No*k + No*No*j]); + double C(Tijk_[j + No*i + No*No*k]); + double D(Tijk_[j + No*k + No*No*i]); + double E(Tijk_[k + No*i + No*No*j]); + double F(Tijk_[k + No*j + No*No*i]); + double value(3.0*(A*U+B*V+C*W+D*X+E*Y+F*Z) + +((U+X+Y)-2.0*(V+W+Z))*(A+D+E) + +((V+W+Z)-2.0*(U+X+Y))*(B+C+F)); + energy += 2.0*value / denominator * facjk * facij; + } // i + } // j + } // k + } // ii + } // jj + } // kk + return energy; + } + + + double getEnergySame + ( const double epsabc + , std::vector const& epsi + , std::vector const& Tijk_ + , std::vector const& Zijk_ + ) { + constexpr size_t blockSize = 16; + const size_t No = epsi.size(); + double energy(0.); + for (size_t kk=0; kk k ? jj : k; + for(size_t j(jstart); j < jend; j++){ + const double facjk( j == k ? 0.5 : 1.0); + const double ej(epsi[j]); + const size_t istart = ii > j ? ii : j; + for(size_t i(istart); i < iend; i++){ + double ei(epsi[i]); + double facij ( i==j ? 
0.5 : 1.0); + double denominator(epsabc - ei - ej - ek); + double U(Zijk_[i + No*j + No*No*k]); + double V(Zijk_[j + No*k + No*No*i]); + double W(Zijk_[k + No*i + No*No*j]); + double A(Tijk_[i + No*j + No*No*k]); + double B(Tijk_[j + No*k + No*No*i]); + double C(Tijk_[k + No*i + No*No*j]); + double value(3.0*( A*U + B*V + C*W) - (A+B+C)*(U+V+W)); + energy += 2.0*value / denominator * facjk * facij; + } // i + } // j + } // k + } // ii + } // jj + } // kk + return energy; + } + + void singlesContribution + ( size_t No + , size_t Nv + , const ABCTuple &abc + , double const* Tph + , double const* VABij + , double const* VACij + , double const* VBCij + , double *Zijk + ) { + const size_t a(abc[0]), b(abc[1]), c(abc[2]); + for (size_t k=0; k < No; k++) + for (size_t i=0; i < No; i++) + for (size_t j=0; j < No; j++) { + const size_t ijk = i + j*No + k*No*No + , jk = j + No * k + ; + Zijk[ijk] += Tph[ a + i * Nv ] * VBCij[ j + k * No ]; + Zijk[ijk] += Tph[ b + j * Nv ] * VACij[ i + k * No ]; + Zijk[ijk] += Tph[ c + k * Nv ] * VABij[ i + j * No ]; + } + } + + void doublesContribution + ( const ABCTuple &abc + , size_t const No + , size_t const Nv + // -- VABCI + , double const* VABph + , double const* VACph + , double const* VBCph + , double const* VBAph + , double const* VCAph + , double const* VCBph + // -- VHHHA + , double const* VhhhA + , double const* VhhhB + , double const* VhhhC + // -- TA + , double const* TAphh + , double const* TBphh + , double const* TCphh + // -- TABIJ + , double const* TABhh + , double const* TAChh + , double const* TBChh + // -- TIJK + , double *Tijk + , atrip::Timings& chrono + ) { + + auto& t_reorder = chrono["doubles:reorder"]; + const size_t a = abc[0], b = abc[1], c = abc[2] + , NoNo = No*No, NoNv = No*Nv + ; + + #if defined(ATRIP_USE_DGEMM) + #define _IJK_(i, j, k) i + j*No + k*NoNo + #define REORDER(__II, __JJ, __KK) \ + t_reorder.start(); \ + for (size_t k = 0; k < No; k++) \ + for (size_t j = 0; j < No; j++) \ + for (size_t i = 0; i < No; i++) { \ + Tijk[_IJK_(i, j, k)] += _t_buffer[_IJK_(__II, __JJ, __KK)]; \ + } \ + t_reorder.stop(); + #define DGEMM_PARTICLES(__A, __B) \ + atrip::dgemm_( "T" \ + , "N" \ + , (int const*)&NoNo \ + , (int const*)&No \ + , (int const*)&Nv \ + , &one \ + , __A \ + , (int const*)&Nv \ + , __B \ + , (int const*)&Nv \ + , &zero \ + , _t_buffer.data() \ + , (int const*)&NoNo \ + ); + #define DGEMM_HOLES(__A, __B, __TRANSB) \ + atrip::dgemm_( "N" \ + , __TRANSB \ + , (int const*)&NoNo \ + , (int const*)&No \ + , (int const*)&No \ + , &m_one \ + , __A \ + , (int const*)&NoNo \ + , __B \ + , (int const*)&No \ + , &zero \ + , _t_buffer.data() \ + , (int const*)&NoNo \ + ); + + using F = double; + const size_t NoNoNo = No*NoNo; + std::vector _t_buffer; + _t_buffer.reserve(NoNoNo); + F one{1.0}, m_one{-1.0}, zero{0.0}; + + t_reorder.start(); + for (size_t k = 0; k < NoNoNo; k++) { + // zero the Tijk + Tijk[k] = 0.0; + } + t_reorder.stop(); + + chrono["doubles:holes"].start(); + { // Holes part ============================================================ + // VhhhC[i + k*No + L*NoNo] * TABhh[L + j*No]; H1 + chrono["doubles:holes:1"].start(); + DGEMM_HOLES(VhhhC, TABhh, "N") + REORDER(i, k, j) + chrono["doubles:holes:1"].stop(); + // VhhhC[j + k*No + L*NoNo] * TABhh[i + L*No]; H0 + chrono["doubles:holes:2"].start(); + DGEMM_HOLES(VhhhC, TABhh, "T") + REORDER(j, k, i) + chrono["doubles:holes:2"].stop(); + // VhhhB[i + j*No + L*NoNo] * TAChh[L + k*No]; H5 + chrono["doubles:holes:3"].start(); + DGEMM_HOLES(VhhhB, TAChh, "N") + REORDER(i, j, 
k) + chrono["doubles:holes:3"].stop(); + // VhhhB[k + j*No + L*NoNo] * TAChh[i + L*No]; H3 + chrono["doubles:holes:4"].start(); + DGEMM_HOLES(VhhhB, TAChh, "T") + REORDER(k, j, i) + chrono["doubles:holes:4"].stop(); + // VhhhA[j + i*No + L*NoNo] * TBChh[L + k*No]; H1 + chrono["doubles:holes:5"].start(); + DGEMM_HOLES(VhhhA, TBChh, "N") + REORDER(j, i, k) + chrono["doubles:holes:5"].stop(); + // VhhhA[k + i*No + L*NoNo] * TBChh[j + L*No]; H4 + chrono["doubles:holes:6"].start(); + DGEMM_HOLES(VhhhA, TBChh, "T") + REORDER(k, i, j) + chrono["doubles:holes:6"].stop(); + } + chrono["doubles:holes"].stop(); + + chrono["doubles:particles"].start(); + { // Particle part ========================================================= + // TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv]; P0 + chrono["doubles:particles:1"].start(); + DGEMM_PARTICLES(TAphh, VBCph) + REORDER(i, j, k) + chrono["doubles:particles:1"].stop(); + // TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv]; P3 + chrono["doubles:particles:2"].start(); + DGEMM_PARTICLES(TAphh, VCBph) + REORDER(i, k, j) + chrono["doubles:particles:2"].stop(); + // TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv]; P5 + chrono["doubles:particles:3"].start(); + DGEMM_PARTICLES(TCphh, VABph) + REORDER(k, i, j) + chrono["doubles:particles:3"].stop(); + // TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv]; P2 + chrono["doubles:particles:4"].start(); + DGEMM_PARTICLES(TCphh, VBAph) + REORDER(k, j, i) + chrono["doubles:particles:4"].stop(); + // TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv]; P1 + chrono["doubles:particles:5"].start(); + DGEMM_PARTICLES(TBphh, VACph) + REORDER(j, i, k) + chrono["doubles:particles:5"].stop(); + // TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv]; P4 + chrono["doubles:particles:6"].start(); + DGEMM_PARTICLES(TBphh, VCAph) + REORDER(j, k, i) + chrono["doubles:particles:6"].stop(); + } + chrono["doubles:particles"].stop(); + + #undef REORDER + #undef DGEMM_HOLES + #undef DGEMM_PARTICLES + #undef _IJK_ + #else + for (size_t k = 0; k < No; k++) + for (size_t j = 0; j < No; j++) + for (size_t i = 0; i < No; i++){ + const size_t ijk = i + j*No + k*NoNo + , jk = j + k*No + ; + Tijk[ijk] = 0.0; // :important + // HOLE DIAGRAMS: TABHH and VHHHA + for (size_t L = 0; L < No; L++){ + // t[abLj] * V[Lcik] H1 + // t[baLi] * V[Lcjk] H0 TODO: conjugate T for complex + Tijk[ijk] -= TABhh[L + j*No] * VhhhC[i + k*No + L*NoNo]; + Tijk[ijk] -= TABhh[i + L*No] * VhhhC[j + k*No + L*NoNo]; + + // t[acLk] * V[Lbij] H5 + // t[caLi] * V[Lbkj] H3 + Tijk[ijk] -= TAChh[L + k*No] * VhhhB[i + j*No + L*NoNo]; + Tijk[ijk] -= TAChh[i + L*No] * VhhhB[k + j*No + L*NoNo]; + + // t[bcLk] * V[Laji] H2 + // t[cbLj] * V[Laki] H4 + Tijk[ijk] -= TBChh[L + k*No] * VhhhA[j + i*No + L*NoNo]; + Tijk[ijk] -= TBChh[j + L*No] * VhhhA[k + i*No + L*NoNo]; + } + // PARTILCE DIAGRAMS: TAPHH and VABPH + for (size_t E = 0; E < Nv; E++) { + // t[aEij] * V[bcEk] P0 + // t[aEik] * V[cbEj] P3 // TODO: CHECK THIS ONE, I DONT KNOW + Tijk[ijk] += TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv]; + Tijk[ijk] += TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv]; + + // t[cEki] * V[abEj] P5 + // t[cEkj] * V[baEi] P2 + Tijk[ijk] += TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv]; + Tijk[ijk] += TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv]; + + // t[bEji] * V[acEk] P1 + // t[bEjk] * V[caEi] P4 + Tijk[ijk] += TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv]; + Tijk[ijk] += TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv]; + } + + } + #endif + } + +} +// Equations:1 ends here diff --git a/include/atrip/Slice.hpp b/include/atrip/Slice.hpp new file mode 100644 index 
0000000..a7a5363 --- /dev/null +++ b/include/atrip/Slice.hpp @@ -0,0 +1,467 @@ +// [[file:../../atrip.org::*The slice][The slice:1]] +#pragma once +#include +#include +#include +#include + +#include +#include + +namespace atrip { + + +struct Slice { + + using F = double; +// The slice:1 ends here + +// [[file:../../atrip.org::*The slice][The slice:2]] +// ASSOCIATED TYPES %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + + struct Location { size_t rank; size_t source; }; + + enum Type + { A = 10 + , B + , C + // Two-parameter slices + , AB = 20 + , BC + , AC + // for abci and the doubles + , CB + , BA + , CA + // The non-typed slice + , Blank = 404 + }; + + enum State { + // Fetch represents the state where a slice is to be fetched + // and has a valid data pointer that can be written to + Fetch = 0, + // Dispatches represents the state that an MPI call has been + // dispatched in order to get the data, but the data has not been + // yet unwrapped, the data might be there or we might have to wait. + Dispatched = 2, + // Ready means that the data pointer can be read from + Ready = 1, + // Self sufficient is a slice when its contents are located + // in the same rank that it lives, so that it does not have to + // fetch from no one else. + SelfSufficient = 911, + // Recycled means that this slice gets its data pointer from another + // slice, so it should not be written to + Recycled = 123, + // Acceptor means that the Slice can accept a new Slice, it is + // the counterpart of the Blank type, but for states + Acceptor = 405 + }; + + struct Info { + // which part of a,b,c the slice holds + PartialTuple tuple; + // The type of slice for the user to retrieve the correct one + Type type; + // What is the state of the slice + State state; + // Where the slice is to be retrieved + // NOTE: this can actually be computed from tuple + Location from; + // If the data are actually to be found in this other slice + Type recycling; + + Info() : tuple{0,0} + , type{Blank} + , state{Acceptor} + , from{0,0} + , recycling{Blank} + {} + }; + + using Ty_x_Tu = std::pair< Type, PartialTuple >; + + // Names of the integrals that are considered in CCSD(T) + enum Name + { TA = 100 + , VIJKA = 101 + , VABCI = 200 + , TABIJ = 201 + , VABIJ = 202 + }; + + // DATABASE ==========================================================={{{1 + struct LocalDatabaseElement { + Slice::Name name; + Slice::Info info; + }; + using LocalDatabase = std::vector; + using Database = LocalDatabase; + + + // STATIC METHODS =========================================================== + // + // They are useful to organize the structure of slices + + struct mpi { + + static MPI_Datatype vector(size_t n, MPI_Datatype const& DT) { + MPI_Datatype dt; + MPI_Type_vector(n, 1, 1, DT, &dt); + MPI_Type_commit(&dt); + return dt; + } + + static MPI_Datatype sliceLocation () { + constexpr int n = 2; + // create a sliceLocation to measure in the current architecture + // the packing of the struct + Slice::Location measure; + MPI_Datatype dt; + const std::vector lengths(n, 1); + const MPI_Datatype types[n] = {usizeDt(), usizeDt()}; + + // measure the displacements in the struct + size_t j = 0; + MPI_Aint displacements[n]; + MPI_Get_address(&measure.rank, &displacements[j++]); + MPI_Get_address(&measure.source, &displacements[j++]); + for (size_t i = 1; i < n; i++) displacements[i] -= displacements[0]; + displacements[0] = 0; + + MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt); + MPI_Type_commit(&dt); + return dt; + } + + 
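    // Cross-reference: these hand-built MPI datatypes are consumed later by
    // communicateDatabase in src/atrip/Atrip.cxx, where every rank
    // contributes one LocalDatabaseElement per needed slice and the local
    // databases are exchanged with MPI_Allgather using the type returned by
    // localDatabaseElement().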
static MPI_Datatype enumDt() { return MPI_INT; } + static MPI_Datatype usizeDt() { return MPI_UINT64_T; } + + static MPI_Datatype sliceInfo () { + constexpr int n = 5; + MPI_Datatype dt; + Slice::Info measure; + const std::vector lengths(n, 1); + const MPI_Datatype types[n] + = { vector(2, usizeDt()) + , enumDt() + , enumDt() + , sliceLocation() + , enumDt() + }; + + // create the displacements from the info measurement struct + size_t j = 0; + MPI_Aint displacements[n]; + MPI_Get_address(measure.tuple.data(), &displacements[j++]); + MPI_Get_address(&measure.type, &displacements[j++]); + MPI_Get_address(&measure.state, &displacements[j++]); + MPI_Get_address(&measure.from, &displacements[j++]); + MPI_Get_address(&measure.recycling, &displacements[j++]); + for (size_t i = 1; i < n; i++) displacements[i] -= displacements[0]; + displacements[0] = 0; + + MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt); + MPI_Type_commit(&dt); + return dt; + } + + static MPI_Datatype localDatabaseElement () { + constexpr int n = 2; + MPI_Datatype dt; + LocalDatabaseElement measure; + const std::vector lengths(n, 1); + const MPI_Datatype types[n] + = { enumDt() + , sliceInfo() + }; + + // measure the displacements in the struct + size_t j = 0; + MPI_Aint displacements[n]; + MPI_Get_address(&measure.name, &displacements[j++]); + MPI_Get_address(&measure.info, &displacements[j++]); + for (size_t i = 1; i < n; i++) displacements[i] -= displacements[0]; + displacements[0] = 0; + + MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt); + MPI_Type_commit(&dt); + return dt; + } + + }; + + static + PartialTuple subtupleBySlice(ABCTuple abc, Type sliceType) { + switch (sliceType) { + case AB: return {abc[0], abc[1]}; + case BC: return {abc[1], abc[2]}; + case AC: return {abc[0], abc[2]}; + case CB: return {abc[2], abc[1]}; + case BA: return {abc[1], abc[0]}; + case CA: return {abc[2], abc[0]}; + case A: return {abc[0], 0}; + case B: return {abc[1], 0}; + case C: return {abc[2], 0}; + default: throw "Switch statement not exhaustive!"; + } + } + + + /** + * It is important here to return a reference to a Slice + * not to accidentally copy the associated buffer of the slice. 
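   * A call site later in SliceUnion::buildLocalDatabase shows the intended
   * usage, binding the result by reference:
   *
   *   auto& blank = Slice::findOneByType(slices, Slice::Blank);
   *
   * Binding the result by value would operate on a copy, so changes to its
   * Info or data pointer would never reach the slice stored in the vector.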
+ */ + static Slice& findOneByType(std::vector &slices, Slice::Type type) { + const auto sliceIt + = std::find_if(slices.begin(), slices.end(), + [&type](Slice const& s) { + return type == s.info.type; + }); + WITH_CRAZY_DEBUG + WITH_RANK + << "\t__ looking for " << type << "\n"; + if (sliceIt == slices.end()) + throw std::domain_error("Slice by type not found!"); + return *sliceIt; + } + + /* + * Check if an info has + * + */ + static std::vector hasRecycledReferencingToIt + ( std::vector &slices + , Info const& info + ) { + std::vector result; + + for (auto& s: slices) + if ( s.info.recycling == info.type + && s.info.tuple == info.tuple + && s.info.state == Recycled + ) result.push_back(&s); + + return result; + } + + static Slice& + findRecycledSource (std::vector &slices, Slice::Info info) { + const auto sliceIt + = std::find_if(slices.begin(), slices.end(), + [&info](Slice const& s) { + return info.recycling == s.info.type + && info.tuple == s.info.tuple + && State::Recycled != s.info.state + ; + }); + + WITH_CRAZY_DEBUG + WITH_RANK << "__slice__:find: recycling source of " + << pretty_print(info) << "\n"; + if (sliceIt == slices.end()) + throw std::domain_error( "Slice not found: " + + pretty_print(info) + + " rank: " + + pretty_print(Atrip::rank) + ); + WITH_RANK << "__slice__:find: " << pretty_print(sliceIt->info) << "\n"; + return *sliceIt; + } + + static Slice& findByTypeAbc + ( std::vector &slices + , Slice::Type type + , ABCTuple const& abc + ) { + const auto tuple = Slice::subtupleBySlice(abc, type); + const auto sliceIt + = std::find_if(slices.begin(), slices.end(), + [&type, &tuple](Slice const& s) { + return type == s.info.type + && tuple == s.info.tuple + ; + }); + WITH_CRAZY_DEBUG + WITH_RANK << "__slice__:find:" << type << " and tuple " + << pretty_print(tuple) + << "\n"; + if (sliceIt == slices.end()) + throw std::domain_error( "Slice not found: " + + pretty_print(tuple) + + ", " + + pretty_print(type) + + " rank: " + + pretty_print(Atrip::rank) + ); + return *sliceIt; + } + + static Slice& findByInfo(std::vector &slices, + Slice::Info const& info) { + const auto sliceIt + = std::find_if(slices.begin(), slices.end(), + [&info](Slice const& s) { + // TODO: maybe implement comparison in Info struct + return info.type == s.info.type + && info.state == s.info.state + && info.tuple == s.info.tuple + && info.from.rank == s.info.from.rank + && info.from.source == s.info.from.source + ; + }); + WITH_CRAZY_DEBUG + WITH_RANK << "__slice__:find:looking for " << pretty_print(info) << "\n"; + if (sliceIt == slices.end()) + throw std::domain_error( "Slice by info not found: " + + pretty_print(info)); + return *sliceIt; + } + + // SLICE DEFINITION =================================================={{{1 + + // ATTRIBUTES ============================================================ + Info info; + F *data; + MPI_Request request; + const size_t size; + + void markReady() noexcept { + info.state = Ready; + info.recycling = Blank; + } + + /* + * This means that the data is there + */ + bool isUnwrapped() const noexcept { + return info.state == Ready + || info.state == SelfSufficient + ; + } + + bool isUnwrappable() const noexcept { + return isUnwrapped() + || info.state == Recycled + || info.state == Dispatched + ; + } + + inline bool isDirectlyFetchable() const noexcept { + return info.state == Ready || info.state == Dispatched; + } + + void free() noexcept { + info.tuple = {0, 0}; + info.type = Blank; + info.state = Acceptor; + info.from = {0, 0}; + info.recycling = Blank; + data = 
nullptr; + } + + inline bool isFree() const noexcept { + return info.tuple == PartialTuple{0, 0} + && info.type == Blank + && info.state == Acceptor + && info.from.rank == 0 + && info.from.source == 0 + && info.recycling == Blank + && data == nullptr + ; + } + + + /* + * This function answers the question, which slices can be recycled. + * + * A slice can only be recycled if it is Fetch or Ready and has + * a valid datapointer. + * + * In particular, SelfSufficient are not recyclable, since it is easier + * just to create a SelfSufficient slice than deal with data dependencies. + * + * Furthermore, a recycled slice is not recyclable, if this is the case + * then it is either bad design or a bug. + */ + inline bool isRecyclable() const noexcept { + return ( info.state == Dispatched + || info.state == Ready + || info.state == Fetch + ) + && hasValidDataPointer() + ; + } + + /* + * This function describes if a slice has a valid data pointer. + * + * This is important to know if the slice has some data to it, also + * some structural checks are done, so that it should not be Acceptor + * or Blank, if this is the case then it is a bug. + */ + inline bool hasValidDataPointer() const noexcept { + return data != nullptr + && info.state != Acceptor + && info.type != Blank + ; + } + + void unwrapAndMarkReady() { + if (info.state == Ready) return; + if (info.state != Dispatched) + throw + std::domain_error("Can't unwrap a non-ready, non-dispatched slice!"); + markReady(); + MPI_Status status; +#ifdef HAVE_OCD + WITH_RANK << "__slice__:mpi: waiting " << "\n"; +#endif + const int errorCode = MPI_Wait(&request, &status); + if (errorCode != MPI_SUCCESS) + throw "MPI ERROR HAPPENED...."; + +#ifdef HAVE_OCD + char errorString[MPI_MAX_ERROR_STRING]; + int errorSize; + MPI_Error_string(errorCode, errorString, &errorSize); + + WITH_RANK << "__slice__:mpi: status " + << "{ .source=" << status.MPI_SOURCE + << ", .tag=" << status.MPI_TAG + << ", .error=" << status.MPI_ERROR + << ", .errCode=" << errorCode + << ", .err=" << errorString + << " }" + << "\n"; +#endif + } + + Slice(size_t size_) + : info({}) + , data(nullptr) + , size(size_) + {} + + + }; // struct Slice + + +std::ostream& operator<<(std::ostream& out, Slice::Location const& v) { + // TODO: remove me + out << "{.r(" << v.rank << "), .s(" << v.source << ")};"; + return out; +} + +std::ostream& operator<<(std::ostream& out, Slice::Info const& i) { + out << "«t" << i.type << ", s" << i.state << "»" + << " ⊙ {" << i.from.rank << ", " << i.from.source << "}" + << " ∴ {" << i.tuple[0] << ", " << i.tuple[1] << "}" + << " ♲t" << i.recycling + ; + return out; +} + +} // namespace atrip +// The slice:2 ends here diff --git a/include/atrip/SliceUnion.hpp b/include/atrip/SliceUnion.hpp new file mode 100644 index 0000000..09ff2c6 --- /dev/null +++ b/include/atrip/SliceUnion.hpp @@ -0,0 +1,543 @@ +// [[file:../../atrip.org::*The slice union][The slice union:1]] +#pragma once +#include +#include + +namespace atrip { + + struct SliceUnion { + using F = double; + using Tensor = CTF::Tensor; + + virtual void + sliceIntoBuffer(size_t iteration, Tensor &to, Tensor const& from) = 0; + + /* + * This function should enforce an important property of a SliceUnion. + * Namely, there can be no two Slices of the same nature. + * + * This means that there can be at most one slice with a given Ty_x_Tu. 
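   *
   * For example, for the triple abc = {5, 7, 12} the pair
   * (Slice::AB, {5, 7}) may be carried by at most one non-free slice of
   * this union; seeing it twice means the recycling or freeing bookkeeping
   * has gone wrong, which is why this check throws.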
+ */ + void checkForDuplicates() const { + std::vector tytus; + for (auto const& s: slices) { + if (s.isFree()) continue; + tytus.push_back({s.info.type, s.info.tuple}); + } + + for (auto const& tytu: tytus) { + if (std::count(tytus.begin(), tytus.end(), tytu) > 1) + throw "Invariance violated, more than one slice with same Ty_x_Tu"; + } + + } + + std::vector neededSlices(ABCTuple const& abc) { + std::vector needed(sliceTypes.size()); + // build the needed vector + std::transform(sliceTypes.begin(), sliceTypes.end(), + needed.begin(), + [&abc](Slice::Type const type) { + auto tuple = Slice::subtupleBySlice(abc, type); + return std::make_pair(type, tuple); + }); + return needed; + } + + /* buildLocalDatabase + * + * It should build a database of slices so that we know what is needed + * to fetch in the next iteration represented by the tuple 'abc'. + * + * 1. The algorithm works as follows, we build a database of the all + * the slice types that we need together with their tuple. + * + * 2. Look in the SliceUnion if we already have this tuple, + * if we already have it mark it (TODO) + * + * 3. If we don't have the tuple, look for a (state=acceptor, type=blank) + * slice and mark this slice as type=Fetch with the corresponding type + * and tuple. + * + * NOTE: The algorithm should certify that we always have enough blank + * slices. + * + */ + Slice::LocalDatabase buildLocalDatabase(ABCTuple const& abc) { + Slice::LocalDatabase result; + + auto const needed = neededSlices(abc); + + WITH_RANK << "__db__:needed:" << pretty_print(needed) << "\n"; + // BUILD THE DATABASE + // we need to loop over all sliceTypes that this TensorUnion + // is representing and find out how we will get the corresponding + // slice for the abc we are considering right now. + for (auto const& pair: needed) { + auto const type = pair.first; + auto const tuple = pair.second; + auto const from = rankMap.find(abc, type); + +#ifdef HAVE_OCD + WITH_RANK << "__db__:want:" << pretty_print(pair) << "\n"; + for (auto const& s: slices) + WITH_RANK << "__db__:guts:ocd " + << s.info << " pt " << s.data + << "\n"; +#endif + +#ifdef HAVE_OCD + WITH_RANK << "__db__: checking if exact match" << "\n"; +#endif + { + // FIRST: look up if there is already a *Ready* slice matching what we + // need + auto const& it + = std::find_if(slices.begin(), slices.end(), + [&tuple, &type](Slice const& other) { + return other.info.tuple == tuple + && other.info.type == type + // we only want another slice when it + // has already ready-to-use data + && other.isUnwrappable() + ; + }); + if (it != slices.end()) { + // if we find this slice, it means that we don't have to do anything + WITH_RANK << "__db__: EXACT: found EXACT in name=" << name + << " for tuple " << tuple[0] << ", " << tuple[1] + << " ptr " << it->data + << "\n"; + result.push_back({name, it->info}); + continue; + } + } + +#ifdef HAVE_OCD + WITH_RANK << "__db__: checking if recycle" << "\n"; +#endif + // Try to find a recyling possibility ie. find a slice with the same + // tuple and that has a valid data pointer. 
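      // For illustration: with a degenerate triple such as abc = {5, 5, 9}
      // the types AB and BA both map to the pair {5, 5} (see
      // subtupleBySlice), so once the AB slice has been set up, the BA entry
      // can typically reuse that buffer as a Recycled slice instead of
      // fetching a second copy of the same data.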
+      auto const& recycleIt
+        = std::find_if(slices.begin(), slices.end(),
+                       [&tuple, &type](Slice const& other) {
+                         return other.info.tuple == tuple
+                             && other.info.type != type
+                             && other.isRecyclable()
+                             ;
+                       });
+
+      // if we can recycle, then we look for a Blank slice
+      // (which should exist by construction :THINK)
+      //
+      if (recycleIt != slices.end()) {
+        auto& blank = Slice::findOneByType(slices, Slice::Blank);
+        // TODO: formalize this through a method to copy information
+        // from another slice
+        blank.data = recycleIt->data;
+        blank.info.type = type;
+        blank.info.tuple = tuple;
+        blank.info.state = Slice::Recycled;
+        blank.info.from = from;
+        blank.info.recycling = recycleIt->info.type;
+        result.push_back({name, blank.info});
+        WITH_RANK << "__db__: RECYCLING: n" << name
+                  << " " << pretty_print(abc)
+                  << " get " << pretty_print(blank.info)
+                  << " from " << pretty_print(recycleIt->info)
+                  << " ptr " << recycleIt->data
+                  << "\n"
+                  ;
+        continue;
+      }
+
+      // in this case we have to create a new slice
+      // this means that we should have a blank slice at our disposal
+      // and also the freePointers should have some elements inside,
+      // so we pop a data pointer from the freePointers container
+#ifdef HAVE_OCD
+      WITH_RANK << "__db__: none work, doing new" << "\n";
+#endif
+      {
+        WITH_RANK << "__db__: NEW: finding blank in " << name
+                  << " for type " << type
+                  << " for tuple " << tuple[0] << ", " << tuple[1]
+                  << "\n"
+                  ;
+        auto& blank = Slice::findOneByType(slices, Slice::Blank);
+        blank.info.type = type;
+        blank.info.tuple = tuple;
+        blank.info.from = from;
+
+        // Handle self sufficiency
+        blank.info.state = Atrip::rank == from.rank
+                         ? Slice::SelfSufficient
+                         : Slice::Fetch
+                         ;
+        if (blank.info.state == Slice::SelfSufficient) {
+          blank.data = sources[from.source].data();
+        } else {
+          if (freePointers.size() == 0)
+            throw std::domain_error("No more free pointers!");
+          // read the pointer before erasing the iterator, otherwise we
+          // would dereference an iterator that has just been invalidated
+          auto dataPointer = freePointers.begin();
+          blank.data = *dataPointer;
+          freePointers.erase(dataPointer);
+        }
+
+        result.push_back({name, blank.info});
+        continue;
+      }
+
+    }
+
+#ifdef HAVE_OCD
+    for (auto const& s: slices)
+      WITH_RANK << "__db__:guts:ocd:__end__ " << s.info << "\n";
+#endif
+
+
+    return result;
+
+  }
+
+  /*
+   * Garbage collect slices not needed for the next iteration.
+   *
+   * It will throw if it tries to gc a slice that has not been
+   * previously unwrapped, as a safety mechanism.
+   */
+  void clearUnusedSlicesForNext(ABCTuple const& abc) {
+    auto const needed = neededSlices(abc);
+
+    // CLEAN UP SLICES, FREE THE ONES THAT ARE NOT NEEDED ANYMORE
+    for (auto& slice: slices) {
+      // if the slice is free, then it was not used anyway
+      if (slice.isFree()) continue;
+
+
+      // try to find the slice in the needed slices list
+      auto const found
+        = std::find_if(needed.begin(), needed.end(),
+                       [&slice] (Slice::Ty_x_Tu const& tytu) {
+                         return slice.info.tuple == tytu.second
+                             && slice.info.type == tytu.first
+                             ;
+                       });
+
+      // if we did not find the slice in needed, then free it
+      if (found == needed.end()) {
+
+        // We have to be careful about the data pointer,
+        // for SelfSufficient, the data pointer is a source pointer
+        // of the slice, so we should just wipe it.
+        //
+        // For Ready slices, we have to be careful if there are some
+        // recycled slices depending on it.
+        bool freeSlicePointer = true;
+
+        // allow to gc unwrapped and recycled, never Fetch,
+        // if we have a Fetch slice then something has gone very wrong.
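        // (A slice that is still in the Fetch state here was announced in
        // the database exchange but its receive was never dispatched;
        // freeing it silently would hide that communication bug, hence the
        // hard error below.)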
+ if (!slice.isUnwrapped() && slice.info.state != Slice::Recycled) + throw + std::domain_error("Trying to garbage collect " + " a non-unwrapped slice! " + + pretty_print(&slice) + + pretty_print(slice.info)); + + // it can be that our slice is ready, but it has some hanging + // references lying around in the form of a recycled slice. + // Of course if we need the recycled slice the next iteration + // this would be fatal, because we would then free the pointer + // of the slice and at some point in the future we would + // overwrite it. Therefore, we must check if slice has some + // references in slices and if so then + // + // - we should mark those references as the original (since the data + // pointer should be the same) + // + // - we should make sure that the data pointer of slice + // does not get freed. + // + if (slice.info.state == Slice::Ready) { + WITH_OCD WITH_RANK + << "__gc__:" << "checking for data recycled dependencies\n"; + auto recycled + = Slice::hasRecycledReferencingToIt(slices, slice.info); + if (recycled.size()) { + Slice* newReady = recycled[0]; + WITH_OCD WITH_RANK + << "__gc__:" << "swaping recycled " + << pretty_print(newReady->info) + << " and " + << pretty_print(slice.info) + << "\n"; + newReady->markReady(); + assert(newReady->data == slice.data); + freeSlicePointer = false; + + for (size_t i = 1; i < recycled.size(); i++) { + auto newRecyled = recycled[i]; + newRecyled->info.recycling = newReady->info.type; + WITH_OCD WITH_RANK + << "__gc__:" << "updating recycled " + << pretty_print(newRecyled->info) + << "\n"; + } + + } + } + + // if the slice is self sufficient, do not dare touching the + // pointer, since it is a pointer to our sources in our rank. + if ( slice.info.state == Slice::SelfSufficient + || slice.info.state == Slice::Recycled + ) { + freeSlicePointer = false; + } + + // make sure we get its data pointer to be used later + // only for non-recycled, since it can be that we need + // for next iteration the data of the slice that the recycled points + // to + if (freeSlicePointer) { + freePointers.insert(slice.data); + WITH_RANK << "~~~:cl(" << name << ")" + << " added to freePointer " + << pretty_print(freePointers) + << "\n"; + } else { + WITH_OCD WITH_RANK << "__gc__:not touching the free Pointer\n"; + } + + // at this point, let us blank the slice + WITH_RANK << "~~~:cl(" << name << ")" + << " freeing up slice " + << " info " << slice.info + << "\n"; + slice.free(); + } + + } + } + + // CONSTRUCTOR + SliceUnion( Tensor const& sourceTensor + , std::vector sliceTypes_ + , std::vector sliceLength_ + , std::vector paramLength + , size_t np + , MPI_Comm child_world + , MPI_Comm global_world + , Slice::Name name_ + , size_t nSliceBuffers = 4 + ) + : rankMap(paramLength, np) + , world(child_world) + , universe(global_world) + , sliceLength(sliceLength_) + , sources(rankMap.nSources(), + std::vector + (std::accumulate(sliceLength.begin(), + sliceLength.end(), + 1, std::multiplies()))) + , name(name_) + , sliceTypes(sliceTypes_) + , sliceBuffers(nSliceBuffers, sources[0]) + //, slices(2 * sliceTypes.size(), Slice{ sources[0].size() }) + { // constructor begin + + LOG(0,"Atrip") << "INIT SliceUnion: " << name << "\n"; + + slices + = std::vector(2 * sliceTypes.size(), { sources[0].size() }); + // TODO: think exactly ^------------------- about this number + + // initialize the freePointers with the pointers to the buffers + std::transform(sliceBuffers.begin(), sliceBuffers.end(), + std::inserter(freePointers, freePointers.begin()), + [](std::vector 
&vec) { return vec.data(); });
+
+
+
+      LOG(1,"Atrip") << "rankMap.nSources "
+                     << rankMap.nSources() << "\n";
+      LOG(1,"Atrip") << "#slices "
+                     << slices.size() << "\n";
+      LOG(1,"Atrip") << "#slices[0] "
+                     << slices[0].size << "\n";
+      LOG(1,"Atrip") << "#sources "
+                     << sources.size() << "\n";
+      LOG(1,"Atrip") << "#sources[0] "
+                     << sources[0].size() << "\n";
+      LOG(1,"Atrip") << "#freePointers "
+                     << freePointers.size() << "\n";
+      LOG(1,"Atrip") << "#sliceBuffers "
+                     << sliceBuffers.size() << "\n";
+      LOG(1,"Atrip") << "#sliceBuffers[0] "
+                     << sliceBuffers[0].size() << "\n";
+      LOG(1,"Atrip") << "#sliceLength "
+                     << sliceLength.size() << "\n";
+      LOG(1,"Atrip") << "#paramLength "
+                     << paramLength.size() << "\n";
+      LOG(1,"Atrip") << "GB*" << np << " "
+                     << double(sources.size() + sliceBuffers.size())
+                      * sources[0].size()
+                      * 8 * np
+                      / 1073741824.0
+                     << "\n";
+    } // constructor ends
+
+    void init(Tensor const& sourceTensor) {
+
+      CTF::World w(world);
+      const int rank = Atrip::rank
+              , order = sliceLength.size()
+              ;
+      std::vector const syms(order, NS);
+      std::vector __sliceLength(sliceLength.begin(), sliceLength.end());
+      Tensor toSliceInto(order,
+                         __sliceLength.data(),
+                         syms.data(),
+                         w);
+      LOG(1,"Atrip") << "slicing... \n";
+
+      // set up the sources
+      for (size_t it(0); it < rankMap.nSources(); ++it) {
+        const size_t
+          source = rankMap.isSourcePadding(rank, it) ? 0 : it;
+        WITH_OCD
+        WITH_RANK
+          << "Init:toSliceInto it-" << it
+          << " :: source " << source << "\n";
+        sliceIntoBuffer(source, toSliceInto, sourceTensor);
+      }
+
+    }
+
+    /**
+     * \brief Send asynchronously only if the state is Fetch
+     */
+    void send( size_t otherRank
+             , Slice::Info const& info
+             , size_t tag) const noexcept {
+      MPI_Request request;
+      bool sendData_p = false;
+
+      if (info.state == Slice::Fetch) sendData_p = true;
+      // TODO: remove this because I have SelfSufficient
+      if (otherRank == info.from.rank) sendData_p = false;
+      if (!sendData_p) return;
+
+      MPI_Isend( sources[info.from.source].data()
+               , sources[info.from.source].size()
+               , MPI_DOUBLE /* TODO: adapt this with traits */
+               , otherRank
+               , tag
+               , universe
+               , &request
+               );
+      WITH_CRAZY_DEBUG
+      WITH_RANK << "sent to " << otherRank << "\n";
+
+    }
+
+    /**
+     * \brief Receive asynchronously only if the state is Fetch
+     */
+    void receive(Slice::Info const& info, size_t tag) noexcept {
+      auto& slice = Slice::findByInfo(slices, info);
+
+      if (Atrip::rank == info.from.rank) return;
+
+      if (slice.info.state == Slice::Fetch) {
+        // TODO: do it through the slice class
+        slice.info.state = Slice::Dispatched;
+        slice.request = MPI_REQUEST_NULL;
+        MPI_Irecv( slice.data
+                 , slice.size
+                 , MPI_DOUBLE // TODO: Adapt this with traits
+                 , info.from.rank
+                 , tag
+                 , universe
+                 , &slice.request
+                 //, MPI_STATUS_IGNORE
+                 );
+      }
+    }
+
+    void unwrapAll(ABCTuple const& abc) {
+      for (auto type: sliceTypes) unwrapSlice(type, abc);
+    }
+
+    F* unwrapSlice(Slice::Type type, ABCTuple const& abc) {
+      WITH_CRAZY_DEBUG
+      WITH_RANK << "__unwrap__:slice " << type << " w n "
+                << name
+                << " abc" << pretty_print(abc)
+                << "\n";
+      auto& slice = Slice::findByTypeAbc(slices, type, abc);
+      WITH_RANK << "__unwrap__:info " << slice.info << "\n";
+      switch (slice.info.state) {
+        case Slice::Dispatched:
+          WITH_RANK << "__unwrap__:Fetch: " << &slice
+                    << " info " << pretty_print(slice.info)
+                    << "\n";
+          slice.unwrapAndMarkReady();
+          return slice.data;
+          break;
+        case Slice::SelfSufficient:
+          WITH_RANK << "__unwrap__:SelfSufficient: " << &slice
+                    << " info " <<
pretty_print(slice.info) + << "\n"; + return slice.data; + break; + case Slice::Ready: + WITH_RANK << "__unwrap__:READY: UNWRAPPED ALREADY" << &slice + << " info " << pretty_print(slice.info) + << "\n"; + return slice.data; + break; + case Slice::Recycled: + WITH_RANK << "__unwrap__:RECYCLED " << &slice + << " info " << pretty_print(slice.info) + << "\n"; + return unwrapSlice(slice.info.recycling, abc); + break; + case Slice::Fetch: + case Slice::Acceptor: + throw std::domain_error("Can't unwrap an acceptor or fetch slice!"); + break; + default: + throw std::domain_error("Unknown error unwrapping slice!"); + } + return slice.data; + } + + const RankMap rankMap; + const MPI_Comm world; + const MPI_Comm universe; + const std::vector sliceLength; + std::vector< std::vector > sources; + std::vector< Slice > slices; + Slice::Name name; + const std::vector sliceTypes; + std::vector< std::vector > sliceBuffers; + std::set freePointers; + + }; + + SliceUnion& + unionByName(std::vector const& unions, Slice::Name name) { + const auto sliceUnionIt + = std::find_if(unions.begin(), unions.end(), + [&name](SliceUnion const* s) { + return name == s->name; + }); + if (sliceUnionIt == unions.end()) + throw std::domain_error("SliceUnion not found!"); + return **sliceUnionIt; + } + +} +// The slice union:1 ends here diff --git a/include/atrip/Tuples.hpp b/include/atrip/Tuples.hpp new file mode 100644 index 0000000..d32bf33 --- /dev/null +++ b/include/atrip/Tuples.hpp @@ -0,0 +1,73 @@ +// [[file:../../atrip.org::*Tuples][Tuples:1]] +#pragma once + +#include +#include + +#include +#include + +namespace atrip { + + using ABCTuple = std::array; + using PartialTuple = std::array; + using ABCTuples = std::vector; + + ABCTuples getTuplesList(size_t Nv) { + const size_t n = Nv * (Nv + 1) * (Nv + 2) / 6 - Nv; + ABCTuples result(n); + size_t u(0); + + for (size_t a(0); a < Nv; a++) + for (size_t b(a); b < Nv; b++) + for (size_t c(b); c < Nv; c++){ + if ( a == b && b == c ) continue; + result[u++] = {a, b, c}; + } + + return result; + + } + + + std::pair + getABCRange(size_t np, size_t rank, ABCTuples const& tuplesList) { + + std::vector n_tuples_per_rank(np, tuplesList.size()/np); + const size_t + // how many valid tuples should we still verteilen to nodes + // since the number of tuples is not divisible by the number of nodes + nRoundRobin = tuplesList.size() % np + // every node must have the sanme amount of tuples in order for the + // other nodes to receive and send somewhere, therefore + // some nodes will get extra tuples but that are dummy tuples + , nExtraInvalid = (np - nRoundRobin) % np + ; + + if (nRoundRobin) for (int i = 0; i < np; i++) n_tuples_per_rank[i]++; + + #if defined(TODO) + assert( tuplesList.size() + == + ( std::accumulate(n_tuples_per_rank.begin(), + n_tuples_per_rank.end(), + 0) + + nExtraInvalid + )); + #endif + + WITH_RANK << "nRoundRobin = " << nRoundRobin << "\n"; + WITH_RANK << "nExtraInvalid = " << nExtraInvalid << "\n"; + WITH_RANK << "ntuples = " << n_tuples_per_rank[rank] << "\n"; + + auto const& it = n_tuples_per_rank.begin(); + + return + { std::accumulate(it, it + rank , 0) + , std::accumulate(it, it + rank + 1, 0) + }; + + } + +} +// Tuples:1 ends here diff --git a/include/atrip/Unions.hpp b/include/atrip/Unions.hpp new file mode 100644 index 0000000..8961a21 --- /dev/null +++ b/include/atrip/Unions.hpp @@ -0,0 +1,240 @@ +// [[file:../../atrip.org::*Unions][Unions:1]] +#pragma once +#include + +namespace atrip { + + void sliceIntoVector + ( std::vector &v + , CTF::Tensor &toSlice 
+ , std::vector const low + , std::vector const up + , CTF::Tensor const& origin + , std::vector const originLow + , std::vector const originUp + ) { + // Thank you CTF for forcing me to do this + struct { std::vector up, low; } + toSlice_ = { {up.begin(), up.end()} + , {low.begin(), low.end()} } + , origin_ = { {originUp.begin(), originUp.end()} + , {originLow.begin(), originLow.end()} } + ; + + WITH_OCD + WITH_RANK << "slicing into " << pretty_print(toSlice_.up) + << "," << pretty_print(toSlice_.low) + << " from " << pretty_print(origin_.up) + << "," << pretty_print(origin_.low) + << "\n"; + +#ifndef ATRIP_DONT_SLICE + toSlice.slice( toSlice_.low.data() + , toSlice_.up.data() + , 0.0 + , origin + , origin_.low.data() + , origin_.up.data() + , 1.0); + memcpy(v.data(), toSlice.data, sizeof(double) * v.size()); +#endif + + } + + + struct TAPHH : public SliceUnion { + TAPHH( Tensor const& sourceTensor + , size_t No + , size_t Nv + , size_t np + , MPI_Comm child_world + , MPI_Comm global_world + ) : SliceUnion( sourceTensor + , {Slice::A, Slice::B, Slice::C} + , {Nv, No, No} // size of the slices + , {Nv} + , np + , child_world + , global_world + , Slice::TA + , 4) { + init(sourceTensor); + } + + void sliceIntoBuffer(size_t it, Tensor &to, Tensor const& from) override + { + const int rank = Atrip::rank + , Nv = sliceLength[0] + , No = sliceLength[1] + , a = rankMap.find({static_cast(rank), it}); + ; + + + sliceIntoVector( sources[it] + , to, {0, 0, 0}, {Nv, No, No} + , from, {a, 0, 0, 0}, {a+1, Nv, No, No} + ); + + } + + }; + + + struct HHHA : public SliceUnion { + HHHA( Tensor const& sourceTensor + , size_t No + , size_t Nv + , size_t np + , MPI_Comm child_world + , MPI_Comm global_world + ) : SliceUnion( sourceTensor + , {Slice::A, Slice::B, Slice::C} + , {No, No, No} // size of the slices + , {Nv} // size of the parametrization + , np + , child_world + , global_world + , Slice::VIJKA + , 4) { + init(sourceTensor); + } + + void sliceIntoBuffer(size_t it, Tensor &to, Tensor const& from) override + { + + const int rank = Atrip::rank + , No = sliceLength[0] + , a = rankMap.find({static_cast(rank), it}) + ; + + sliceIntoVector( sources[it] + , to, {0, 0, 0}, {No, No, No} + , from, {0, 0, 0, a}, {No, No, No, a+1} + ); + + } + }; + + struct ABPH : public SliceUnion { + ABPH( Tensor const& sourceTensor + , size_t No + , size_t Nv + , size_t np + , MPI_Comm child_world + , MPI_Comm global_world + ) : SliceUnion( sourceTensor + , { Slice::AB, Slice::BC, Slice::AC + , Slice::BA, Slice::CB, Slice::CA + } + , {Nv, No} // size of the slices + , {Nv, Nv} // size of the parametrization + , np + , child_world + , global_world + , Slice::VABCI + , 2*6) { + init(sourceTensor); + } + + void sliceIntoBuffer(size_t it, Tensor &to, Tensor const& from) override { + + const int Nv = sliceLength[0] + , No = sliceLength[1] + , rank = Atrip::rank + , el = rankMap.find({static_cast(rank), it}) + , a = el % Nv + , b = el / Nv + ; + + + sliceIntoVector( sources[it] + , to, {0, 0}, {Nv, No} + , from, {a, b, 0, 0}, {a+1, b+1, Nv, No} + ); + + } + + }; + + struct ABHH : public SliceUnion { + ABHH( Tensor const& sourceTensor + , size_t No + , size_t Nv + , size_t np + , MPI_Comm child_world + , MPI_Comm global_world + ) : SliceUnion( sourceTensor + , {Slice::AB, Slice::BC, Slice::AC} + , {No, No} // size of the slices + , {Nv, Nv} // size of the parametrization + , np + , child_world + , global_world + , Slice::VABIJ + , 6) { + init(sourceTensor); + } + + void sliceIntoBuffer(size_t it, Tensor &to, Tensor const& from) 
override { + + const int Nv = from.lens[0] + , No = sliceLength[1] + , rank = Atrip::rank + , el = rankMap.find({static_cast(rank), it}) + , a = el % Nv + , b = el / Nv + ; + + sliceIntoVector( sources[it] + , to, {0, 0}, {No, No} + , from, {a, b, 0, 0}, {a+1, b+1, No, No} + ); + + + } + + }; + + + struct TABHH : public SliceUnion { + TABHH( Tensor const& sourceTensor + , size_t No + , size_t Nv + , size_t np + , MPI_Comm child_world + , MPI_Comm global_world + ) : SliceUnion( sourceTensor + , {Slice::AB, Slice::BC, Slice::AC} + , {No, No} // size of the slices + , {Nv, Nv} // size of the parametrization + , np + , child_world + , global_world + , Slice::TABIJ + , 6) { + init(sourceTensor); + } + + void sliceIntoBuffer(size_t it, Tensor &to, Tensor const& from) override { + // TODO: maybe generalize this with ABHH + + const int Nv = from.lens[0] + , No = sliceLength[1] + , rank = Atrip::rank + , el = rankMap.find({static_cast(rank), it}) + , a = el % Nv + , b = el / Nv + ; + + sliceIntoVector( sources[it] + , to, {0, 0}, {No, No} + , from, {a, b, 0, 0}, {a+1, b+1, No, No} + ); + + + } + + }; + +} +// Unions:1 ends here diff --git a/include/atrip/Utils.hpp b/include/atrip/Utils.hpp new file mode 100644 index 0000000..1efe179 --- /dev/null +++ b/include/atrip/Utils.hpp @@ -0,0 +1,99 @@ +// [[file:../../atrip.org::*Utils][Utils:1]] +#pragma once +#include +#include +#include +#include + +#include + +namespace atrip { + + + template + std::string pretty_print(T&& value) { + std::stringstream stream; +#if ATRIP_DEBUG > 1 + dbg::pretty_print(stream, std::forward(value)); +#endif + return stream.str(); + } + +#define WITH_CHRONO(__chrono, ...) \ + __chrono.start(); __VA_ARGS__ __chrono.stop(); + + struct Timer { + using Clock = std::chrono::high_resolution_clock; + using Event = std::chrono::time_point; + std::chrono::duration duration; + Event _start; + inline void start() noexcept { _start = Clock::now(); } + inline void stop() noexcept { duration += Clock::now() - _start; } + inline void clear() noexcept { duration *= 0; } + inline double count() const noexcept { return duration.count(); } + }; + using Timings = std::map; +} +// Utils:1 ends here + +// [[file:../../atrip.org::*The rank mapping][The rank mapping:1]] +#pragma once + +#include +#include + +#include + +namespace atrip { + struct RankMap { + + std::vector const lengths; + size_t const np, size; + + RankMap(std::vector lens, size_t np_) + : lengths(lens) + , np(np_) + , size(std::accumulate(lengths.begin(), lengths.end(), + 1, std::multiplies())) + { assert(lengths.size() <= 2); } + + size_t find(Slice::Location const& p) const noexcept { + return p.source * np + p.rank; + } + + size_t nSources() const noexcept { + return size / np + size_t(size % np != 0); + } + + + bool isPaddingRank(size_t rank) const noexcept { + return size % np == 0 + ? false + : rank > (size % np - 1) + ; + } + + bool isSourcePadding(size_t rank, size_t source) const noexcept { + return source == nSources() && isPaddingRank(rank); + } + + Slice::Location + find(ABCTuple const& abc, Slice::Type sliceType) const noexcept { + // tuple = {11, 8} when abc = {11, 8, 9} and sliceType = AB + const auto tuple = Slice::subtupleBySlice(abc, sliceType); + + const size_t index + = tuple[0] + + tuple[1] * (lengths.size() > 1 ? 
lengths[0] : 0) + ; + + return + { index % np + , index / np + }; + } + + }; + +} +// The rank mapping:1 ends here diff --git a/src/atrip/Atrip.cxx b/src/atrip/Atrip.cxx new file mode 100644 index 0000000..60aaddb --- /dev/null +++ b/src/atrip/Atrip.cxx @@ -0,0 +1,563 @@ +// [[file:../../atrip.org::*Atrip][Atrip:2]] +#include + +#include +#include +#include +#include +#include + +using namespace atrip; + +int Atrip::rank; +int Atrip::np; + +void Atrip::init() { + MPI_Comm_rank(MPI_COMM_WORLD, &Atrip::rank); + MPI_Comm_size(MPI_COMM_WORLD, &Atrip::np); +} + +Atrip::Output Atrip::run(Atrip::Input const& in) { + + const int np = Atrip::np; + const int rank = Atrip::rank; + MPI_Comm universe = in.ei->wrld->comm; + + // Timings in seconds ================================================{{{1 + Timings chrono{}; + + const size_t No = in.ei->lens[0]; + const size_t Nv = in.ea->lens[0]; + LOG(0,"Atrip") << "No: " << No << "\n"; + LOG(0,"Atrip") << "Nv: " << Nv << "\n"; + + // allocate the three scratches, see piecuch + std::vector Tijk(No*No*No) // doubles only (see piecuch) + , Zijk(No*No*No) // singles + doubles (see piecuch) + // we need local copies of the following tensors on every + // rank + , epsi(No) + , epsa(Nv) + , Tai(No * Nv) + ; + + in.ei->read_all(epsi.data()); + in.ea->read_all(epsa.data()); + in.Tph->read_all(Tai.data()); + + // COMMUNICATOR CONSTRUCTION ========================================={{{1 + // + // Construct a new communicator living only on a single rank + int child_size = 1 + , child_rank + ; + const + int color = rank / child_size + , crank = rank % child_size + ; + MPI_Comm child_comm; + if (np == 1) { + child_comm = universe; + } else { + MPI_Comm_split(universe, color, crank, &child_comm); + MPI_Comm_rank(child_comm, &child_rank); + MPI_Comm_size(child_comm, &child_size); + //CTF::World child_world(child_comm); + } + + + chrono["nv-slices"].start(); + // BUILD SLICES PARAMETRIZED BY NV ==================================={{{1 + LOG(0,"Atrip") << "BUILD NV-SLICES\n"; + TAPHH taphh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); + HHHA hhha(*in.Vhhhp, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); + chrono["nv-slices"].stop(); + + chrono["nv-nv-slices"].start(); + // BUILD SLICES PARAMETRIZED BY NV x NV =============================={{{1 + LOG(0,"Atrip") << "BUILD NV x NV-SLICES\n"; + ABPH abph(*in.Vppph, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); + ABHH abhh(*in.Vpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); + TABHH tabhh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); + chrono["nv-nv-slices"].stop(); + + // all tensors + std::vector< SliceUnion* > unions = {&taphh, &hhha, &abph, &abhh, &tabhh}; + + //CONSTRUCT TUPLE LIST ==============================================={{{1 + LOG(0,"Atrip") << "BUILD TUPLE LIST\n"; + const auto tuplesList = std::move(getTuplesList(Nv)); + WITH_RANK << "tupList.size() = " << tuplesList.size() << "\n"; + + // GET ABC INDEX RANGE FOR RANK ======================================{{{1 + auto abcIndex = getABCRange(np, rank, tuplesList); + size_t nIterations = abcIndex.second - abcIndex.first; + +#ifdef ATRIP_BENCHMARK + { const size_t maxIterations = in.maxIterations; + if (maxIterations != 0) { + abcIndex.second = abcIndex.first + maxIterations % (nIterations + 1); + nIterations = maxIterations % (nIterations + 1); + } + } +#endif + + WITH_RANK << "abcIndex = " << pretty_print(abcIndex) << "\n"; + LOG(0,"Atrip") << "#iterations: " + << nIterations << 
"\n"; + + // first abc + const ABCTuple firstAbc = tuplesList[abcIndex.first]; + + + double energy(0.); + + + auto const isFakeTuple + = [&tuplesList](size_t const i) { return i >= tuplesList.size(); }; + + + auto communicateDatabase + = [ &unions + , np + , &chrono + ] (ABCTuple const& abc, MPI_Comm const& c) -> Slice::Database { + + chrono["db:comm:type:do"].start(); + auto MPI_LDB_ELEMENT = Slice::mpi::localDatabaseElement(); + chrono["db:comm:type:do"].stop(); + + chrono["db:comm:ldb"].start(); + Slice::LocalDatabase ldb; + + for (auto const& tensor: unions) { + auto const& tensorDb = tensor->buildLocalDatabase(abc); + ldb.insert(ldb.end(), tensorDb.begin(), tensorDb.end()); + } + chrono["db:comm:ldb"].stop(); + + Slice::Database db(np * ldb.size(), ldb[0]); + + chrono["oneshot-db:comm:allgather"].start(); + chrono["db:comm:allgather"].start(); + MPI_Allgather( ldb.data() + , ldb.size() + , MPI_LDB_ELEMENT + , db.data() + , ldb.size() + , MPI_LDB_ELEMENT + , c); + chrono["db:comm:allgather"].stop(); + chrono["oneshot-db:comm:allgather"].stop(); + + chrono["db:comm:type:free"].start(); + MPI_Type_free(&MPI_LDB_ELEMENT); + chrono["db:comm:type:free"].stop(); + + return db; + }; + + auto doIOPhase + = [&unions, &rank, &np, &universe, &chrono] (Slice::Database const& db) { + + const size_t localDBLength = db.size() / np; + + size_t sendTag = 0 + , recvTag = rank * localDBLength + ; + + // RECIEVE PHASE ====================================================== + { + // At this point, we have already send to everyone that fits + auto const& begin = &db[rank * localDBLength] + , end = begin + localDBLength + ; + for (auto it = begin; it != end; ++it) { + recvTag++; + auto const& el = *it; + auto& u = unionByName(unions, el.name); + + WITH_DBG std::cout + << rank << ":r" + << "♯" << recvTag << " =>" + << " «n" << el.name + << ", t" << el.info.type + << ", s" << el.info.state + << "»" + << " ⊙ {" << rank << "⇐" << el.info.from.rank + << ", " + << el.info.from.source << "}" + << " ∴ {" << el.info.tuple[0] + << ", " + << el.info.tuple[1] + << "}" + << "\n" + ; + + chrono["db:io:recv"].start(); + u.receive(el.info, recvTag); + chrono["db:io:recv"].stop(); + + } // recv + } + + // SEND PHASE ========================================================= + for (size_t otherRank = 0; otherRank" + << " «n" << el.name + << ", t" << el.info.type + << ", s" << el.info.state + << "»" + << " ⊙ {" << el.info.from.rank << "⇒" << otherRank + << ", " + << el.info.from.source << "}" + << " ∴ {" << el.info.tuple[0] + << ", " + << el.info.tuple[1] + << "}" + << "\n" + ; + + chrono["db:io:send"].start(); + u.send(otherRank, el.info, sendTag); + chrono["db:io:send"].stop(); + + } // send phase + + } // otherRank + + + }; + +#if defined(HAVE_OCD) || defined(ATRIP_PRINT_TUPLES) + std::map tupleEnergies; +#endif + + const double doublesFlops + = double(No) + * double(No) + * double(No) + * (double(No) + double(Nv)) + * 2 + * 6 + / 1e9 + ; + + // START MAIN LOOP ======================================================{{{1 + + Slice::Database db; + + for ( size_t i = abcIndex.first, iteration = 1 + ; i < abcIndex.second + ; i++, iteration++ + ) { + chrono["iterations"].start(); + + // check overhead from chrono over all iterations + chrono["start:stop"].start(); chrono["start:stop"].stop(); + + // check overhead of doing a barrier at the beginning + chrono["oneshot-mpi:barrier"].start(); + chrono["mpi:barrier"].start(); + // TODO: REMOVE + if (in.barrier == 1) + MPI_Barrier(universe); + chrono["mpi:barrier"].stop(); + 
chrono["oneshot-mpi:barrier"].stop(); + + if (iteration % in.iterationMod == 0) { + LOG(0,"Atrip") + << "iteration " << iteration + << " [" << 100 * iteration / nIterations << "%]" + << " (" << doublesFlops * iteration / chrono["doubles"].count() + << "GF)" + << " (" << doublesFlops * iteration / chrono["iterations"].count() + << "GF)" + << " ===========================\n"; + + // PRINT TIMINGS + for (auto const& pair: chrono) + LOG(1, " ") << pair.first << " :: " + << pair.second.count() + << std::endl; + + } + + const ABCTuple abc = isFakeTuple(i) + ? tuplesList[tuplesList.size() - 1] + : tuplesList[i] + , *abcNext = i == (abcIndex.second - 1) + ? nullptr + : isFakeTuple(i + 1) + ? &tuplesList[tuplesList.size() - 1] + : &tuplesList[i + 1] + ; + + chrono["with_rank"].start(); + WITH_RANK << " :it " << iteration + << " :abc " << pretty_print(abc) + << " :abcN " + << (abcNext ? pretty_print(*abcNext) : "None") + << "\n"; + chrono["with_rank"].stop(); + + + // COMM FIRST DATABASE ================================================{{{1 + if (i == abcIndex.first) { + WITH_RANK << "__first__:first database ............ \n"; + const auto __db = communicateDatabase(abc, universe); + WITH_RANK << "__first__:first database communicated \n"; + WITH_RANK << "__first__:first database io phase \n"; + doIOPhase(__db); + WITH_RANK << "__first__:first database io phase DONE\n"; + WITH_RANK << "__first__::::Unwrapping all slices for first database\n"; + for (auto& u: unions) u->unwrapAll(abc); + WITH_RANK << "__first__::::Unwrapping all slices for first database DONE\n"; + MPI_Barrier(universe); + } + + // COMM NEXT DATABASE ================================================={{{1 + if (abcNext) { + WITH_RANK << "__comm__:" << iteration << "th communicating database\n"; + chrono["db:comm"].start(); + //const auto db = communicateDatabase(*abcNext, universe); + db = communicateDatabase(*abcNext, universe); + chrono["db:comm"].stop(); + chrono["db:io"].start(); + doIOPhase(db); + chrono["db:io"].stop(); + WITH_RANK << "__comm__:" << iteration << "th database io phase DONE\n"; + } + + // COMPUTE DOUBLES ===================================================={{{1 + OCD_Barrier(universe); + if (!isFakeTuple(i)) { + WITH_RANK << iteration << "-th doubles\n"; + WITH_CHRONO(chrono["oneshot-unwrap"], + WITH_CHRONO(chrono["unwrap"], + WITH_CHRONO(chrono["unwrap:doubles"], + for (auto& u: decltype(unions){&abph, &hhha, &taphh, &tabhh}) { + u->unwrapAll(abc); + } + ))) + chrono["oneshot-doubles"].start(); + chrono["doubles"].start(); + doublesContribution( abc, (size_t)No, (size_t)Nv + // -- VABCI + , abph.unwrapSlice(Slice::AB, abc) + , abph.unwrapSlice(Slice::AC, abc) + , abph.unwrapSlice(Slice::BC, abc) + , abph.unwrapSlice(Slice::BA, abc) + , abph.unwrapSlice(Slice::CA, abc) + , abph.unwrapSlice(Slice::CB, abc) + // -- VHHHA + , hhha.unwrapSlice(Slice::A, abc) + , hhha.unwrapSlice(Slice::B, abc) + , hhha.unwrapSlice(Slice::C, abc) + // -- TA + , taphh.unwrapSlice(Slice::A, abc) + , taphh.unwrapSlice(Slice::B, abc) + , taphh.unwrapSlice(Slice::C, abc) + // -- TABIJ + , tabhh.unwrapSlice(Slice::AB, abc) + , tabhh.unwrapSlice(Slice::AC, abc) + , tabhh.unwrapSlice(Slice::BC, abc) + // -- TIJK + , Tijk.data() + , chrono + ); + WITH_RANK << iteration << "-th doubles done\n"; + chrono["doubles"].stop(); + chrono["oneshot-doubles"].stop(); + } + + // COMPUTE SINGLES =================================================== {{{1 + OCD_Barrier(universe); + if (!isFakeTuple(i)) { + WITH_CHRONO(chrono["oneshot-unwrap"], + 
WITH_CHRONO(chrono["unwrap"], + WITH_CHRONO(chrono["unwrap:singles"], + abhh.unwrapAll(abc); + ))) + chrono["reorder"].start(); + for (size_t I(0); I < Zijk.size(); I++) Zijk[I] = Tijk[I]; + chrono["reorder"].stop(); + chrono["singles"].start(); + singlesContribution( No, Nv, abc + , Tai.data() + , abhh.unwrapSlice(Slice::AB, abc) + , abhh.unwrapSlice(Slice::AC, abc) + , abhh.unwrapSlice(Slice::BC, abc) + , Zijk.data()); + chrono["singles"].stop(); + } + + + // COMPUTE ENERGY ==================================================== {{{1 + if (!isFakeTuple(i)) { + double tupleEnergy(0.); + + int distinct(0); + if (abc[0] == abc[1]) distinct++; + if (abc[1] == abc[2]) distinct--; + const double epsabc(epsa[abc[0]] + epsa[abc[1]] + epsa[abc[2]]); + + chrono["energy"].start(); + if ( distinct == 0) + tupleEnergy = getEnergyDistinct(epsabc, epsi, Tijk, Zijk); + else + tupleEnergy = getEnergySame(epsabc, epsi, Tijk, Zijk); + chrono["energy"].stop(); + +#if defined(HAVE_OCD) || defined(ATRIP_PRINT_TUPLES) + tupleEnergies[abc] = tupleEnergy; +#endif + + energy += tupleEnergy; + +#ifdef HAVE_OCD + auto const print_slices + = [](ABCTuple const& abc, ABCTuple const& want, SliceUnion& u) { + if (abc != want) return; + + for (auto type: u.sliceTypes) { + auto const& ptr = u.unwrapSlice(type, abc); + auto const& slice = Slice::findByTypeAbc(u.slices, type, abc); + WITH_RANK << "__print_slice__:n" << u.name << " " + << pretty_print(abc) << " " + << pretty_print(slice.info) + ; + for (size_t i = 0; i < 20; i++) std::cout << ptr[i] << ", "; + std::cout << std::endl; + } + }; +#endif + + if (isFakeTuple(i)) { + // fake iterations should also unwrap whatever they got + WITH_RANK << iteration + << "th unwrapping because of fake in " + << i << "\n"; + for (auto& u: unions) u->unwrapAll(abc); + } + +#ifdef HAVE_OCD + for (auto const& u: unions) { + WITH_RANK << "__dups__:" + << iteration + << "-th n" << u->name << " checking duplicates\n"; + u->checkForDuplicates(); + } +#endif + + + // CLEANUP UNIONS ===================================================={{{1 + OCD_Barrier(universe); + if (abcNext) { + chrono["gc"].start(); + WITH_RANK << "__gc__:" << iteration << "-th cleaning up.......\n"; + for (auto& u: unions) { + + u->unwrapAll(abc); + WITH_RANK << "__gc__:n" << u->name << " :it " << iteration + << " :abc " << pretty_print(abc) + << " :abcN " << pretty_print(*abcNext) + << "\n"; + for (auto const& slice: u->slices) + WITH_RANK << "__gc__:guts:" << slice.info << "\n"; + u->clearUnusedSlicesForNext(*abcNext); + + WITH_RANK << "__gc__: checking validity\n"; + +#ifdef HAVE_OCD + // check for validity of the slices + for (auto type: u->sliceTypes) { + auto tuple = Slice::subtupleBySlice(abc, type); + for (auto& slice: u->slices) { + if ( slice.info.type == type + && slice.info.tuple == tuple + && slice.isDirectlyFetchable() + ) { + if (slice.info.state == Slice::Dispatched) + throw std::domain_error( "This slice should not be undispatched! " + + pretty_print(slice.info)); + } + } + } +#endif + + + } + chrono["gc"].stop(); + } + + WITH_RANK << iteration << "-th cleaning up....... 
DONE\n"; + } + + // CLEAN CHRONO ======================================================{{{1 + chrono["iterations"].stop(); + { // TODO: REMOVEME + chrono["oneshot-doubles"].clear(); + chrono["oneshot-mpi:barrier"].clear(); + chrono["oneshot-db:comm:allgather"].clear(); + chrono["oneshot-unwrap"].clear(); + } + + // ITERATION END ====================================================={{{1 + } // END OF MAIN LOOP + + MPI_Barrier(universe); + + // PRINT TUPLES ========================================================={{{1 +#if defined(HAVE_OCD) || defined(ATRIP_PRINT_TUPLES) + LOG(0,"Atrip") << "tuple energies" << "\n"; + for (size_t i = 0; i < np; i++) { + MPI_Barrier(universe); + for (auto const& pair: tupleEnergies) { + if (i == rank) + std::cout << pair.first[0] + << " " << pair.first[1] + << " " << pair.first[2] + << std::setprecision(15) << std::setw(23) + << " tupleEnergy: " << pair.second + << "\n" + ; + } + } +#endif + + // COMMUNICATE THE ENERGIES ============================================={{{1 + LOG(0,"Atrip") << "COMMUNICATING ENERGIES \n"; + double globalEnergy = 0; + MPI_Reduce(&energy, &globalEnergy, 1, MPI_DOUBLE, MPI_SUM, 0, universe); + + WITH_RANK << "local energy " << energy << "\n"; + LOG(0,"LOOP FINISHED, energy") + << std::setprecision(15) << std::setw(23) + << globalEnergy << std::endl; + + // PRINT TIMINGS {{{1 + for (auto const& pair: chrono) + LOG(0,"atrip:chrono") << pair.first << " " + << pair.second.count() << std::endl; + + + LOG(0, "atrip:flops") + << nIterations * doublesFlops / chrono["doubles"].count() << "\n"; + + return { energy }; + +} +// Atrip:2 ends here