From bbbfb30c6f33bb842b209fe00e3706a71211983d Mon Sep 17 00:00:00 2001 From: Alejandro Gallo Date: Fri, 18 Feb 2022 12:54:59 +0100 Subject: [PATCH] Add tangled sources --- include/atrip/Atrip.hpp | 32 +- include/atrip/Debug.hpp | 1 - include/atrip/Equations.hpp | 190 +++++---- include/atrip/RankMap.hpp | 67 +++- include/atrip/Slice.hpp | 758 ++++++++++++++++++----------------- include/atrip/SliceUnion.hpp | 27 +- include/atrip/Tuples.hpp | 569 +++++++++++++++++++++++--- include/atrip/Unions.hpp | 4 +- include/atrip/Utils.hpp | 45 ++- src/atrip/Atrip.cxx | 332 +++++++-------- 10 files changed, 1298 insertions(+), 727 deletions(-) diff --git a/include/atrip/Atrip.hpp b/include/atrip/Atrip.hpp index 6f3859c..2a0f340 100644 --- a/include/atrip/Atrip.hpp +++ b/include/atrip/Atrip.hpp @@ -7,12 +7,22 @@ #include +#include + +#define ADD_ATTRIBUTE(_type, _name, _default) \ + _type _name = _default; \ + Input& with_ ## _name(_type i) { \ + _name = i; \ + return *this; \ + } + namespace atrip { struct Atrip { static int rank; static int np; + static Timings chrono; static void init(); template @@ -25,9 +35,6 @@ namespace atrip { , *Vhhhp = nullptr , *Vppph = nullptr ; - int maxIterations = 0, iterationMod = -1, percentageMod = -1; - bool barrier = false; - bool chrono = false; Input& with_epsilon_i(CTF::Tensor * t) { ei = t; return *this; } Input& with_epsilon_a(CTF::Tensor * t) { ea = t; return *this; } Input& with_Tai(CTF::Tensor * t) { Tph = t; return *this; } @@ -35,11 +42,20 @@ namespace atrip { Input& with_Vabij(CTF::Tensor * t) { Vpphh = t; return *this; } Input& with_Vijka(CTF::Tensor * t) { Vhhhp = t; return *this; } Input& with_Vabci(CTF::Tensor * t) { Vppph = t; return *this; } - Input& with_maxIterations(int i) { maxIterations = i; return *this; } - Input& with_iterationMod(int i) { iterationMod = i; return *this; } - Input& with_percentageMod(int i) { percentageMod = i; return *this; } - Input& with_barrier(bool i) { barrier = i; return *this; } - Input& with_chrono(bool i) { chrono = i; return *this; } + + enum TuplesDistribution { + NAIVE, + GROUP_AND_SORT, + }; + + ADD_ATTRIBUTE(bool, rankRoundRobin, false) + ADD_ATTRIBUTE(bool, chrono, false) + ADD_ATTRIBUTE(bool, barrier, false) + ADD_ATTRIBUTE(int, maxIterations, 0) + ADD_ATTRIBUTE(int, iterationMod, -1) + ADD_ATTRIBUTE(int, percentageMod, -1) + ADD_ATTRIBUTE(TuplesDistribution, tuplesDistribution, NAIVE) + }; struct Output { diff --git a/include/atrip/Debug.hpp b/include/atrip/Debug.hpp index 4347824..e567d5c 100644 --- a/include/atrip/Debug.hpp +++ b/include/atrip/Debug.hpp @@ -41,7 +41,6 @@ # define DBG(...) dbg(__VA_ARGS__) #elif ATRIP_DEBUG == 2 # pragma message("WARNING: You have some debugging info for ABC triples") -# include # define OCD_Barrier(com) # define WITH_OCD if (false) # define WITH_ROOT if (atrip::Atrip::rank == 0) diff --git a/include/atrip/Equations.hpp b/include/atrip/Equations.hpp index 2b90736..e907592 100644 --- a/include/atrip/Equations.hpp +++ b/include/atrip/Equations.hpp @@ -40,12 +40,12 @@ namespace atrip { , X(Zijk_[j + No*k + No*No*i]) , Y(Zijk_[k + No*i + No*No*j]) , Z(Zijk_[k + No*j + No*No*i]) - , A(std::conj(Tijk_[i + No*j + No*No*k])) - , B(std::conj(Tijk_[i + No*k + No*No*j])) - , C(std::conj(Tijk_[j + No*i + No*No*k])) - , D(std::conj(Tijk_[j + No*k + No*No*i])) - , E(std::conj(Tijk_[k + No*i + No*No*j])) - , F(std::conj(Tijk_[k + No*j + No*No*i])) + , A(maybeConjugate(Tijk_[i + No*j + No*No*k])) + , B(maybeConjugate(Tijk_[i + No*k + No*No*j])) + , C(maybeConjugate(Tijk_[j + No*i + No*No*k])) + , D(maybeConjugate(Tijk_[j + No*k + No*No*i])) + , E(maybeConjugate(Tijk_[k + No*i + No*No*j])) + , F(maybeConjugate(Tijk_[k + No*j + No*No*i])) , value = 3.0 * ( A * U + B * V @@ -102,9 +102,9 @@ namespace atrip { , U(Zijk_[i + No*j + No*No*k]) , V(Zijk_[j + No*k + No*No*i]) , W(Zijk_[k + No*i + No*No*j]) - , A(std::conj(Tijk_[i + No*j + No*No*k])) - , B(std::conj(Tijk_[j + No*k + No*No*i])) - , C(std::conj(Tijk_[k + No*i + No*No*j])) + , A(maybeConjugate(Tijk_[i + No*j + No*No*k])) + , B(maybeConjugate(Tijk_[j + No*k + No*No*i])) + , C(maybeConjugate(Tijk_[k + No*i + No*No*j])) , value = F(3.0) * ( A * U + B * V @@ -172,10 +172,8 @@ namespace atrip { , F const* TBChh // -- TIJK , F *Tijk - , atrip::Timings& chrono ) { - auto& t_reorder = chrono["doubles:reorder"]; const size_t a = abc[0], b = abc[1], c = abc[2] , NoNo = No*No, NoNv = No*Nv ; @@ -183,13 +181,13 @@ namespace atrip { #if defined(ATRIP_USE_DGEMM) #define _IJK_(i, j, k) i + j*No + k*NoNo #define REORDER(__II, __JJ, __KK) \ - t_reorder.start(); \ + WITH_CHRONO("doubles:reorder", \ for (size_t k = 0; k < No; k++) \ for (size_t j = 0; j < No; j++) \ for (size_t i = 0; i < No; i++) { \ Tijk[_IJK_(i, j, k)] += _t_buffer[_IJK_(__II, __JJ, __KK)]; \ } \ - t_reorder.stop(); + ) #define DGEMM_PARTICLES(__A, __B) \ atrip::xgemm( "T" \ , "N" \ @@ -220,106 +218,100 @@ namespace atrip { , _t_buffer.data() \ , (int const*)&NoNo \ ); - #define MAYBE_CONJ(_conj, _buffer) \ - if (traits::isComplex()) { \ - for (size_t __i = 0; __i < NoNoNo; ++__i) \ - _conj[__i] = std::conj(_buffer[__i]); \ - } else { \ - for (size_t __i = 0; __i < NoNoNo; ++__i) \ - _conj[__i] = _buffer[__i]; \ - } + #define MAYBE_CONJ(_conj, _buffer) \ + for (size_t __i = 0; __i < NoNoNo; ++__i) \ + _conj[__i] = maybeConjugate(_buffer[__i]); \ const size_t NoNoNo = No*NoNo; std::vector _t_buffer; _t_buffer.reserve(NoNoNo); F one{1.0}, m_one{-1.0}, zero{0.0}; - t_reorder.start(); - for (size_t k = 0; k < NoNoNo; k++) { - // zero the Tijk - Tijk[k] = 0.0; - } - t_reorder.stop(); + WITH_CHRONO("double:reorder", + for (size_t k = 0; k < NoNoNo; k++) { + Tijk[k] = 0.0; + }) - chrono["doubles:holes"].start(); - { // Holes part ============================================================ + // TOMERGE: replace chronos + WITH_CHRONO("doubles:holes", + { // Holes part ======================================================== - std::vector _vhhh(NoNoNo); + std::vector _vhhh(NoNoNo); - // VhhhC[i + k*No + L*NoNo] * TABhh[L + j*No]; H1 - MAYBE_CONJ(_vhhh, VhhhC) - chrono["doubles:holes:1"].start(); - DGEMM_HOLES(_vhhh.data(), TABhh, "N") - REORDER(i, k, j) - chrono["doubles:holes:1"].stop(); - // VhhhC[j + k*No + L*NoNo] * TABhh[i + L*No]; H0 - chrono["doubles:holes:2"].start(); - DGEMM_HOLES(_vhhh.data(), TABhh, "T") - REORDER(j, k, i) - chrono["doubles:holes:2"].stop(); + // VhhhC[i + k*No + L*NoNo] * TABhh[L + j*No]; H1 + MAYBE_CONJ(_vhhh, VhhhC) + WITH_CHRONO("doubles:holes:1", + DGEMM_HOLES(_vhhh.data(), TABhh, "N") + REORDER(i, k, j) + ) + // VhhhC[j + k*No + L*NoNo] * TABhh[i + L*No]; H0 + WITH_CHRONO("doubles:holes:2", + DGEMM_HOLES(_vhhh.data(), TABhh, "T") + REORDER(j, k, i) + ) - // VhhhB[i + j*No + L*NoNo] * TAChh[L + k*No]; H5 - MAYBE_CONJ(_vhhh, VhhhB) - chrono["doubles:holes:3"].start(); - DGEMM_HOLES(_vhhh.data(), TAChh, "N") - REORDER(i, j, k) - chrono["doubles:holes:3"].stop(); - // VhhhB[k + j*No + L*NoNo] * TAChh[i + L*No]; H3 - chrono["doubles:holes:4"].start(); - DGEMM_HOLES(_vhhh.data(), TAChh, "T") - REORDER(k, j, i) - chrono["doubles:holes:4"].stop(); + // VhhhB[i + j*No + L*NoNo] * TAChh[L + k*No]; H5 + MAYBE_CONJ(_vhhh, VhhhB) + WITH_CHRONO("doubles:holes:3", + DGEMM_HOLES(_vhhh.data(), TAChh, "N") + REORDER(i, j, k) + ) + // VhhhB[k + j*No + L*NoNo] * TAChh[i + L*No]; H3 + WITH_CHRONO("doubles:holes:4", + DGEMM_HOLES(_vhhh.data(), TAChh, "T") + REORDER(k, j, i) + ) - // VhhhA[j + i*No + L*NoNo] * TBChh[L + k*No]; H1 - MAYBE_CONJ(_vhhh, VhhhA) - chrono["doubles:holes:5"].start(); - DGEMM_HOLES(_vhhh.data(), TBChh, "N") - REORDER(j, i, k) - chrono["doubles:holes:5"].stop(); - // VhhhA[k + i*No + L*NoNo] * TBChh[j + L*No]; H4 - chrono["doubles:holes:6"].start(); - DGEMM_HOLES(_vhhh.data(), TBChh, "T") - REORDER(k, i, j) - chrono["doubles:holes:6"].stop(); + // VhhhA[j + i*No + L*NoNo] * TBChh[L + k*No]; H1 + MAYBE_CONJ(_vhhh, VhhhA) + WITH_CHRONO("doubles:holes:5", + DGEMM_HOLES(_vhhh.data(), TBChh, "N") + REORDER(j, i, k) + ) + // VhhhA[k + i*No + L*NoNo] * TBChh[j + L*No]; H4 + WITH_CHRONO("doubles:holes:6", + DGEMM_HOLES(_vhhh.data(), TBChh, "T") + REORDER(k, i, j) + ) - } - chrono["doubles:holes"].stop(); + } + ) #undef MAYBE_CONJ - chrono["doubles:particles"].start(); - { // Particle part ========================================================= - // TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv]; P0 - chrono["doubles:particles:1"].start(); - DGEMM_PARTICLES(TAphh, VBCph) - REORDER(i, j, k) - chrono["doubles:particles:1"].stop(); - // TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv]; P3 - chrono["doubles:particles:2"].start(); - DGEMM_PARTICLES(TAphh, VCBph) - REORDER(i, k, j) - chrono["doubles:particles:2"].stop(); - // TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv]; P5 - chrono["doubles:particles:3"].start(); - DGEMM_PARTICLES(TCphh, VABph) - REORDER(k, i, j) - chrono["doubles:particles:3"].stop(); - // TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv]; P2 - chrono["doubles:particles:4"].start(); - DGEMM_PARTICLES(TCphh, VBAph) - REORDER(k, j, i) - chrono["doubles:particles:4"].stop(); - // TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv]; P1 - chrono["doubles:particles:5"].start(); - DGEMM_PARTICLES(TBphh, VACph) - REORDER(j, i, k) - chrono["doubles:particles:5"].stop(); - // TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv]; P4 - chrono["doubles:particles:6"].start(); - DGEMM_PARTICLES(TBphh, VCAph) - REORDER(j, k, i) - chrono["doubles:particles:6"].stop(); - } - chrono["doubles:particles"].stop(); + WITH_CHRONO("doubles:particles", + { // Particle part ===================================================== + // TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv]; P0 + WITH_CHRONO("doubles:particles:1", + DGEMM_PARTICLES(TAphh, VBCph) + REORDER(i, j, k) + ) + // TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv]; P3 + WITH_CHRONO("doubles:particles:2", + DGEMM_PARTICLES(TAphh, VCBph) + REORDER(i, k, j) + ) + // TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv]; P5 + WITH_CHRONO("doubles:particles:3", + DGEMM_PARTICLES(TCphh, VABph) + REORDER(k, i, j) + ) + // TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv]; P2 + WITH_CHRONO("doubles:particles:4", + DGEMM_PARTICLES(TCphh, VBAph) + REORDER(k, j, i) + ) + // TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv]; P1 + WITH_CHRONO("doubles:particles:5", + DGEMM_PARTICLES(TBphh, VACph) + REORDER(j, i, k) + ) + // TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv]; P4 + WITH_CHRONO("doubles:particles:6", + DGEMM_PARTICLES(TBphh, VCAph) + REORDER(j, k, i) + ) + } + ) #undef REORDER #undef DGEMM_HOLES diff --git a/include/atrip/RankMap.hpp b/include/atrip/RankMap.hpp index 8564f9e..0e31a61 100644 --- a/include/atrip/RankMap.hpp +++ b/include/atrip/RankMap.hpp @@ -5,24 +5,38 @@ #include #include +#include namespace atrip { template struct RankMap { + static bool RANK_ROUND_ROBIN; std::vector const lengths; size_t const np, size; + ClusterInfo const clusterInfo; - RankMap(std::vector lens, size_t np_) + RankMap(std::vector lens, size_t np_, MPI_Comm comm) : lengths(lens) , np(np_) , size(std::accumulate(lengths.begin(), lengths.end(), 1UL, std::multiplies())) + , clusterInfo(getClusterInfo(comm)) { assert(lengths.size() <= 2); } size_t find(typename Slice::Location const& p) const noexcept { - return p.source * np + p.rank; + if (RANK_ROUND_ROBIN) { + return p.source * np + p.rank; + } else { + const size_t + rankPosition = p.source * clusterInfo.ranksPerNode + + clusterInfo.rankInfos[p.rank].localRank + ; + return rankPosition * clusterInfo.nNodes + + clusterInfo.rankInfos[p.rank].nodeId + ; + } } size_t nSources() const noexcept { @@ -42,8 +56,9 @@ namespace atrip { } typename Slice::Location - find(ABCTuple const& abc, typename Slice::Type sliceType) const noexcept { + find(ABCTuple const& abc, typename Slice::Type sliceType) const { // tuple = {11, 8} when abc = {11, 8, 9} and sliceType = AB + // tuple = {11, 0} when abc = {11, 8, 9} and sliceType = A const auto tuple = Slice::subtupleBySlice(abc, sliceType); const size_t index @@ -51,9 +66,51 @@ namespace atrip { + tuple[1] * (lengths.size() > 1 ? lengths[0] : 0) ; + size_t rank, source; + + if (RANK_ROUND_ROBIN) { + + rank = index % np; + source = index / np; + + } else { + + size_t const + + // the node that will be assigned to + nodeId = index % clusterInfo.nNodes + + // how many times it has been assigned to the node + , s_n = index / clusterInfo.nNodes + + // which local rank in the node should be + , localRank = s_n % clusterInfo.ranksPerNode + + // and the local source (how many times we chose this local rank) + , localSource = s_n / clusterInfo.ranksPerNode + ; + + // find the localRank-th entry in clusterInfo + auto const& it = + std::find_if(clusterInfo.rankInfos.begin(), + clusterInfo.rankInfos.end(), + [nodeId, localRank](RankInfo const& ri) { + return ri.nodeId == nodeId + && ri.localRank == localRank + ; + }); + if (it == clusterInfo.rankInfos.end()) { + throw "FATAL! Error in node distribution of the slices"; + } + + rank = (*it).globalRank; + source = localSource; + + } + return - { index % np - , index / np + { rank + , source }; } diff --git a/include/atrip/Slice.hpp b/include/atrip/Slice.hpp index 877d72a..1f5889e 100644 --- a/include/atrip/Slice.hpp +++ b/include/atrip/Slice.hpp @@ -1,4 +1,4 @@ -// [[file:~/cc4s/src/atrip/complex/atrip.org::*The%20slice][The slice:1]] +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Prolog][Prolog:1]] #pragma once #include #include @@ -11,6 +11,9 @@ namespace atrip { +template FF maybeConjugate(const FF a) { return a; } +template <> Complex maybeConjugate(const Complex a) { return std::conj(a); } + namespace traits { template bool isComplex() { return false; }; template <> bool isComplex() { return true; }; @@ -24,401 +27,409 @@ namespace mpi { template struct Slice { -// The slice:1 ends here +// Prolog:1 ends here -// [[file:~/cc4s/src/atrip/complex/atrip.org::*The%20slice][The slice:2]] -// ASSOCIATED TYPES %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Location][Location:1]] +struct Location { size_t rank; size_t source; }; +// Location:1 ends here - struct Location { size_t rank; size_t source; }; - - enum Type - { A = 10 - , B - , C - // Two-parameter slices - , AB = 20 - , BC - , AC - // for abci and the doubles - , CB - , BA - , CA - // The non-typed slice - , Blank = 404 - }; - - enum State { - // Fetch represents the state where a slice is to be fetched - // and has a valid data pointer that can be written to - Fetch = 0, - // Dispatches represents the state that an MPI call has been - // dispatched in order to get the data, but the data has not been - // yet unwrapped, the data might be there or we might have to wait. - Dispatched = 2, - // Ready means that the data pointer can be read from - Ready = 1, - // Self sufficient is a slice when its contents are located - // in the same rank that it lives, so that it does not have to - // fetch from no one else. - SelfSufficient = 911, - // Recycled means that this slice gets its data pointer from another - // slice, so it should not be written to - Recycled = 123, - // Acceptor means that the Slice can accept a new Slice, it is - // the counterpart of the Blank type, but for states - Acceptor = 405 +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Type][Type:1]] +enum Type + { A = 10 + , B + , C + // Two-parameter slices + , AB = 20 + , BC + , AC + // for abci and the doubles + , CB + , BA + , CA + // The non-typed slice + , Blank = 404 }; +// Type:1 ends here - struct Info { - // which part of a,b,c the slice holds - PartialTuple tuple; - // The type of slice for the user to retrieve the correct one - Type type; - // What is the state of the slice - State state; - // Where the slice is to be retrieved - // NOTE: this can actually be computed from tuple - Location from; - // If the data are actually to be found in this other slice - Type recycling; +// [[file:~/cc4s/src/atrip/complex/atrip.org::*State][State:1]] +enum State { + Fetch = 0, + Dispatched = 2, + Ready = 1, + SelfSufficient = 911, + Recycled = 123, + Acceptor = 405 +}; +// State:1 ends here - Info() : tuple{0,0} - , type{Blank} - , state{Acceptor} - , from{0,0} - , recycling{Blank} - {} +// [[file:~/cc4s/src/atrip/complex/atrip.org::*The%20Info%20structure][The Info structure:1]] +struct Info { + // which part of a,b,c the slice holds + PartialTuple tuple; + // The type of slice for the user to retrieve the correct one + Type type; + // What is the state of the slice + State state; + // Where the slice is to be retrieved + Location from; + // If the data are actually to be found in this other slice + Type recycling; + + Info() : tuple{0,0} + , type{Blank} + , state{Acceptor} + , from{0,0} + , recycling{Blank} + {} +}; + +using Ty_x_Tu = std::pair< Type, PartialTuple >; +// The Info structure:1 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Name][Name:1]] +enum Name + { TA = 100 + , VIJKA = 101 + , VABCI = 200 + , TABIJ = 201 + , VABIJ = 202 }; +// Name:1 ends here - using Ty_x_Tu = std::pair< Type, PartialTuple >; +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Database][Database:1]] +struct LocalDatabaseElement { + Slice::Name name; + Slice::Info info; +}; +// Database:1 ends here - // Names of the integrals that are considered in CCSD(T) - enum Name - { TA = 100 - , VIJKA = 101 - , VABCI = 200 - , TABIJ = 201 - , VABIJ = 202 - }; +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Database][Database:2]] +using LocalDatabase = std::vector; +using Database = LocalDatabase; +// Database:2 ends here - // DATABASE ==========================================================={{{1 - struct LocalDatabaseElement { - Slice::Name name; - Slice::Info info; - }; - using LocalDatabase = std::vector; - using Database = LocalDatabase; +// [[file:~/cc4s/src/atrip/complex/atrip.org::*MPI%20Types][MPI Types:1]] +struct mpi { - - // STATIC METHODS =========================================================== - // - // They are useful to organize the structure of slices - - struct mpi { - - static MPI_Datatype vector(size_t n, MPI_Datatype const& DT) { - MPI_Datatype dt; - MPI_Type_vector(n, 1, 1, DT, &dt); - MPI_Type_commit(&dt); - return dt; - } - - static MPI_Datatype sliceLocation () { - constexpr int n = 2; - // create a sliceLocation to measure in the current architecture - // the packing of the struct - Slice::Location measure; - MPI_Datatype dt; - const std::vector lengths(n, 1); - const MPI_Datatype types[n] = {usizeDt(), usizeDt()}; - - // measure the displacements in the struct - size_t j = 0; - MPI_Aint displacements[n]; - MPI_Get_address(&measure.rank, &displacements[j++]); - MPI_Get_address(&measure.source, &displacements[j++]); - for (size_t i = 1; i < n; i++) displacements[i] -= displacements[0]; - displacements[0] = 0; - - MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt); - MPI_Type_commit(&dt); - return dt; - } - - static MPI_Datatype enumDt() { return MPI_INT; } - static MPI_Datatype usizeDt() { return MPI_UINT64_T; } - - static MPI_Datatype sliceInfo () { - constexpr int n = 5; - MPI_Datatype dt; - Slice::Info measure; - const std::vector lengths(n, 1); - const MPI_Datatype types[n] - = { vector(2, usizeDt()) - , enumDt() - , enumDt() - , sliceLocation() - , enumDt() - }; - - // create the displacements from the info measurement struct - size_t j = 0; - MPI_Aint displacements[n]; - MPI_Get_address(measure.tuple.data(), &displacements[j++]); - MPI_Get_address(&measure.type, &displacements[j++]); - MPI_Get_address(&measure.state, &displacements[j++]); - MPI_Get_address(&measure.from, &displacements[j++]); - MPI_Get_address(&measure.recycling, &displacements[j++]); - for (size_t i = 1; i < n; i++) displacements[i] -= displacements[0]; - displacements[0] = 0; - - MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt); - MPI_Type_commit(&dt); - return dt; - } - - static MPI_Datatype localDatabaseElement () { - constexpr int n = 2; - MPI_Datatype dt; - LocalDatabaseElement measure; - const std::vector lengths(n, 1); - const MPI_Datatype types[n] - = { enumDt() - , sliceInfo() - }; - - // measure the displacements in the struct - size_t j = 0; - MPI_Aint displacements[n]; - MPI_Get_address(&measure.name, &displacements[j++]); - MPI_Get_address(&measure.info, &displacements[j++]); - for (size_t i = 1; i < n; i++) displacements[i] -= displacements[0]; - displacements[0] = 0; - - MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt); - MPI_Type_commit(&dt); - return dt; - } - - }; - - static - PartialTuple subtupleBySlice(ABCTuple abc, Type sliceType) { - switch (sliceType) { - case AB: return {abc[0], abc[1]}; - case BC: return {abc[1], abc[2]}; - case AC: return {abc[0], abc[2]}; - case CB: return {abc[2], abc[1]}; - case BA: return {abc[1], abc[0]}; - case CA: return {abc[2], abc[0]}; - case A: return {abc[0], 0}; - case B: return {abc[1], 0}; - case C: return {abc[2], 0}; - default: throw "Switch statement not exhaustive!"; - } + static MPI_Datatype vector(size_t n, MPI_Datatype const& DT) { + MPI_Datatype dt; + MPI_Type_vector(n, 1, 1, DT, &dt); + MPI_Type_commit(&dt); + return dt; } + static MPI_Datatype sliceLocation () { + constexpr int n = 2; + // create a sliceLocation to measure in the current architecture + // the packing of the struct + Slice::Location measure; + MPI_Datatype dt; + const std::vector lengths(n, 1); + const MPI_Datatype types[n] = {usizeDt(), usizeDt()}; - /** - * It is important here to return a reference to a Slice - * not to accidentally copy the associated buffer of the slice. - */ - static Slice& findOneByType(std::vector> &slices, Slice::Type type) { - const auto sliceIt - = std::find_if(slices.begin(), slices.end(), - [&type](Slice const& s) { - return type == s.info.type; - }); - WITH_CRAZY_DEBUG - WITH_RANK - << "\t__ looking for " << type << "\n"; - if (sliceIt == slices.end()) - throw std::domain_error("Slice by type not found!"); - return *sliceIt; - } + static_assert(sizeof(Slice::Location) == 2 * sizeof(size_t), + "The Location packing is wrong in your compiler"); - /* - * Check if an info has - * - */ - static std::vector*> hasRecycledReferencingToIt - ( std::vector> &slices - , Info const& info - ) { - std::vector*> result; + // measure the displacements in the struct + size_t j = 0; + MPI_Aint base_address, displacements[n]; + MPI_Get_address(&measure, &base_address); + MPI_Get_address(&measure.rank, &displacements[j++]); + MPI_Get_address(&measure.source, &displacements[j++]); + for (size_t i = 0; i < n; i++) + displacements[i] = MPI_Aint_diff(displacements[i], base_address); - for (auto& s: slices) - if ( s.info.recycling == info.type - && s.info.tuple == info.tuple - && s.info.state == Recycled - ) result.push_back(&s); + MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt); + MPI_Type_commit(&dt); + return dt; + } - return result; - } + static MPI_Datatype usizeDt() { return MPI_UINT64_T; } - static Slice& - findRecycledSource (std::vector> &slices, Slice::Info info) { - const auto sliceIt - = std::find_if(slices.begin(), slices.end(), - [&info](Slice const& s) { - return info.recycling == s.info.type - && info.tuple == s.info.tuple - && State::Recycled != s.info.state - ; - }); + static MPI_Datatype sliceInfo () { + constexpr int n = 5; + MPI_Datatype dt; + Slice::Info measure; + const std::vector lengths(n, 1); + const MPI_Datatype types[n] + = { vector(2, usizeDt()) + , vector(sizeof(enum Type), MPI_CHAR) + , vector(sizeof(enum State), MPI_CHAR) + , sliceLocation() + , vector(sizeof(enum Type), MPI_CHAR) + // TODO: Why this does not work on intel mpi? + /*, MPI_UINT64_T*/ + }; - WITH_CRAZY_DEBUG - WITH_RANK << "__slice__:find: recycling source of " - << pretty_print(info) << "\n"; - if (sliceIt == slices.end()) - throw std::domain_error( "Slice not found: " - + pretty_print(info) - + " rank: " - + pretty_print(Atrip::rank) - ); - WITH_RANK << "__slice__:find: " << pretty_print(sliceIt->info) << "\n"; - return *sliceIt; - } + static_assert(sizeof(enum Type) == 4, "Enum type not 4 bytes long"); + static_assert(sizeof(enum State) == 4, "Enum State not 4 bytes long"); + static_assert(sizeof(enum Name) == 4, "Enum Name not 4 bytes long"); - static Slice& findByTypeAbc - ( std::vector> &slices - , Slice::Type type - , ABCTuple const& abc - ) { - const auto tuple = Slice::subtupleBySlice(abc, type); - const auto sliceIt - = std::find_if(slices.begin(), slices.end(), - [&type, &tuple](Slice const& s) { - return type == s.info.type - && tuple == s.info.tuple - ; - }); - WITH_CRAZY_DEBUG - WITH_RANK << "__slice__:find:" << type << " and tuple " - << pretty_print(tuple) - << "\n"; - if (sliceIt == slices.end()) - throw std::domain_error( "Slice not found: " - + pretty_print(tuple) - + ", " - + pretty_print(type) - + " rank: " - + pretty_print(Atrip::rank) - ); - return *sliceIt; - } + // create the displacements from the info measurement struct + size_t j = 0; + MPI_Aint base_address, displacements[n]; + MPI_Get_address(&measure, &base_address); + MPI_Get_address(&measure.tuple[0], &displacements[j++]); + MPI_Get_address(&measure.type, &displacements[j++]); + MPI_Get_address(&measure.state, &displacements[j++]); + MPI_Get_address(&measure.from, &displacements[j++]); + MPI_Get_address(&measure.recycling, &displacements[j++]); + for (size_t i = 0; i < n; i++) + displacements[i] = MPI_Aint_diff(displacements[i], base_address); - static Slice& findByInfo(std::vector> &slices, - Slice::Info const& info) { - const auto sliceIt - = std::find_if(slices.begin(), slices.end(), - [&info](Slice const& s) { - // TODO: maybe implement comparison in Info struct - return info.type == s.info.type - && info.state == s.info.state - && info.tuple == s.info.tuple - && info.from.rank == s.info.from.rank - && info.from.source == s.info.from.source - ; - }); - WITH_CRAZY_DEBUG - WITH_RANK << "__slice__:find:looking for " << pretty_print(info) << "\n"; - if (sliceIt == slices.end()) - throw std::domain_error( "Slice by info not found: " - + pretty_print(info)); - return *sliceIt; - } + MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt); + MPI_Type_commit(&dt); + return dt; + } - // SLICE DEFINITION =================================================={{{1 + static MPI_Datatype localDatabaseElement () { + constexpr int n = 2; + MPI_Datatype dt; + LocalDatabaseElement measure; + const std::vector lengths(n, 1); + const MPI_Datatype types[n] + = { vector(sizeof(enum Name), MPI_CHAR) + , sliceInfo() + }; - // ATTRIBUTES ============================================================ - Info info; - F *data; - MPI_Request request; - const size_t size; + // measure the displacements in the struct + size_t j = 0; + MPI_Aint base_address, displacements[n]; + MPI_Get_address(&measure, &base_address); + MPI_Get_address(&measure.name, &displacements[j++]); + MPI_Get_address(&measure.info, &displacements[j++]); + for (size_t i = 0; i < n; i++) + displacements[i] = MPI_Aint_diff(displacements[i], base_address); - void markReady() noexcept { - info.state = Ready; - info.recycling = Blank; - } + static_assert( sizeof(LocalDatabaseElement) == sizeof(measure) + , "Measure has bad size"); - /* - * This means that the data is there - */ - bool isUnwrapped() const noexcept { - return info.state == Ready - || info.state == SelfSufficient - ; - } + MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt); + MPI_Type_commit(&dt); + return vector(sizeof(LocalDatabaseElement), MPI_CHAR); + // TODO: write tests in order to know if this works + return dt; + } - bool isUnwrappable() const noexcept { - return isUnwrapped() - || info.state == Recycled - || info.state == Dispatched - ; - } +}; +// MPI Types:1 ends here - inline bool isDirectlyFetchable() const noexcept { - return info.state == Ready || info.state == Dispatched; - } +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Static%20utilities][Static utilities:1]] +static +PartialTuple subtupleBySlice(ABCTuple abc, Type sliceType) { + switch (sliceType) { + case AB: return {abc[0], abc[1]}; + case BC: return {abc[1], abc[2]}; + case AC: return {abc[0], abc[2]}; + case CB: return {abc[2], abc[1]}; + case BA: return {abc[1], abc[0]}; + case CA: return {abc[2], abc[0]}; + case A: return {abc[0], 0}; + case B: return {abc[1], 0}; + case C: return {abc[2], 0}; + default: throw "Switch statement not exhaustive!"; + } +} +// Static utilities:1 ends here - void free() noexcept { - info.tuple = {0, 0}; - info.type = Blank; - info.state = Acceptor; - info.from = {0, 0}; - info.recycling = Blank; - data = nullptr; - } +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Static%20utilities][Static utilities:2]] +static std::vector*> hasRecycledReferencingToIt + ( std::vector> &slices + , Info const& info + ) { + std::vector*> result; - inline bool isFree() const noexcept { - return info.tuple == PartialTuple{0, 0} - && info.type == Blank - && info.state == Acceptor - && info.from.rank == 0 - && info.from.source == 0 - && info.recycling == Blank - && data == nullptr - ; - } + for (auto& s: slices) + if ( s.info.recycling == info.type + && s.info.tuple == info.tuple + && s.info.state == Recycled + ) result.push_back(&s); + return result; +} +// Static utilities:2 ends here - /* - * This function answers the question, which slices can be recycled. - * - * A slice can only be recycled if it is Fetch or Ready and has - * a valid datapointer. - * - * In particular, SelfSufficient are not recyclable, since it is easier - * just to create a SelfSufficient slice than deal with data dependencies. - * - * Furthermore, a recycled slice is not recyclable, if this is the case - * then it is either bad design or a bug. - */ - inline bool isRecyclable() const noexcept { - return ( info.state == Dispatched - || info.state == Ready - || info.state == Fetch - ) - && hasValidDataPointer() - ; - } +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Static%20utilities][Static utilities:3]] +static Slice& findOneByType(std::vector> &slices, Slice::Type type) { + const auto sliceIt + = std::find_if(slices.begin(), slices.end(), + [&type](Slice const& s) { + return type == s.info.type; + }); + WITH_CRAZY_DEBUG + WITH_RANK + << "\t__ looking for " << type << "\n"; + if (sliceIt == slices.end()) + throw std::domain_error("Slice by type not found!"); + return *sliceIt; +} +// Static utilities:3 ends here - /* - * This function describes if a slice has a valid data pointer. - * - * This is important to know if the slice has some data to it, also - * some structural checks are done, so that it should not be Acceptor - * or Blank, if this is the case then it is a bug. - */ - inline bool hasValidDataPointer() const noexcept { - return data != nullptr - && info.state != Acceptor - && info.type != Blank - ; - } +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Static%20utilities][Static utilities:4]] +static Slice& +findRecycledSource (std::vector> &slices, Slice::Info info) { + const auto sliceIt + = std::find_if(slices.begin(), slices.end(), + [&info](Slice const& s) { + return info.recycling == s.info.type + && info.tuple == s.info.tuple + && State::Recycled != s.info.state + ; + }); - void unwrapAndMarkReady() { + WITH_CRAZY_DEBUG + WITH_RANK << "__slice__:find: recycling source of " + << pretty_print(info) << "\n"; + if (sliceIt == slices.end()) + throw std::domain_error( "Slice not found: " + + pretty_print(info) + + " rank: " + + pretty_print(Atrip::rank) + ); + WITH_RANK << "__slice__:find: " << pretty_print(sliceIt->info) << "\n"; + return *sliceIt; +} +// Static utilities:4 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Static%20utilities][Static utilities:5]] +static Slice& findByTypeAbc + ( std::vector> &slices + , Slice::Type type + , ABCTuple const& abc + ) { + const auto tuple = Slice::subtupleBySlice(abc, type); + const auto sliceIt + = std::find_if(slices.begin(), slices.end(), + [&type, &tuple](Slice const& s) { + return type == s.info.type + && tuple == s.info.tuple + ; + }); + WITH_CRAZY_DEBUG + WITH_RANK << "__slice__:find:" << type << " and tuple " + << pretty_print(tuple) + << "\n"; + if (sliceIt == slices.end()) + throw std::domain_error( "Slice not found: " + + pretty_print(tuple) + + ", " + + pretty_print(type) + + " rank: " + + pretty_print(Atrip::rank) + ); + return *sliceIt; +} +// Static utilities:5 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Static%20utilities][Static utilities:6]] +static Slice& findByInfo(std::vector> &slices, + Slice::Info const& info) { + const auto sliceIt + = std::find_if(slices.begin(), slices.end(), + [&info](Slice const& s) { + // TODO: maybe implement comparison in Info struct + return info.type == s.info.type + && info.state == s.info.state + && info.tuple == s.info.tuple + && info.from.rank == s.info.from.rank + && info.from.source == s.info.from.source + ; + }); + WITH_CRAZY_DEBUG + WITH_RANK << "__slice__:find:looking for " << pretty_print(info) << "\n"; + if (sliceIt == slices.end()) + throw std::domain_error( "Slice by info not found: " + + pretty_print(info)); + return *sliceIt; +} +// Static utilities:6 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Attributes][Attributes:1]] +Info info; +// Attributes:1 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Attributes][Attributes:2]] +F *data; +// Attributes:2 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Attributes][Attributes:3]] +MPI_Request request; +// Attributes:3 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Attributes][Attributes:4]] +const size_t size; +// Attributes:4 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Member%20functions][Member functions:1]] +void markReady() noexcept { + info.state = Ready; + info.recycling = Blank; +} +// Member functions:1 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Member%20functions][Member functions:2]] +bool isUnwrapped() const noexcept { + return info.state == Ready + || info.state == SelfSufficient + ; +} +// Member functions:2 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Member%20functions][Member functions:3]] +bool isUnwrappable() const noexcept { + return isUnwrapped() + || info.state == Recycled + || info.state == Dispatched + ; +} + +inline bool isDirectlyFetchable() const noexcept { + return info.state == Ready || info.state == Dispatched; +} + +void free() noexcept { + info.tuple = {0, 0}; + info.type = Blank; + info.state = Acceptor; + info.from = {0, 0}; + info.recycling = Blank; + data = nullptr; +} + +inline bool isFree() const noexcept { + return info.tuple == PartialTuple{0, 0} + && info.type == Blank + && info.state == Acceptor + && info.from.rank == 0 + && info.from.source == 0 + && info.recycling == Blank + && data == nullptr + ; +} +// Member functions:3 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Member%20functions][Member functions:4]] +inline bool isRecyclable() const noexcept { + return ( info.state == Dispatched + || info.state == Ready + || info.state == Fetch + ) + && hasValidDataPointer() + ; +} +// Member functions:4 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Member%20functions][Member functions:5]] +inline bool hasValidDataPointer() const noexcept { + return data != nullptr + && info.state != Acceptor + && info.type != Blank + ; +} +// Member functions:5 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Member%20functions][Member functions:6]] +void unwrapAndMarkReady() { if (info.state == Ready) return; if (info.state != Dispatched) throw @@ -447,17 +458,20 @@ struct Slice { << "\n"; #endif } +// Member functions:6 ends here - Slice(size_t size_) - : info({}) - , data(nullptr) - , size(size_) - {} +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Epilog][Epilog:1]] +Slice(size_t size_) + : info({}) + , data(nullptr) + , size(size_) + {} - }; // struct Slice - +}; // struct Slice +// Epilog:1 ends here +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Debug][Debug:1]] template std::ostream& operator<<(std::ostream& out, typename Slice::Location const& v) { // TODO: remove me @@ -476,4 +490,4 @@ std::ostream& operator<<(std::ostream& out, typename Slice::Info const& i) { } } // namespace atrip -// The slice:2 ends here +// Debug:1 ends here diff --git a/include/atrip/SliceUnion.hpp b/include/atrip/SliceUnion.hpp index ec7aff6..365ad51 100644 --- a/include/atrip/SliceUnion.hpp +++ b/include/atrip/SliceUnion.hpp @@ -179,8 +179,14 @@ namespace atrip { if (blank.info.state == Slice::SelfSufficient) { blank.data = sources[from.source].data(); } else { - if (freePointers.size() == 0) - throw std::domain_error("No more free pointers!"); + if (freePointers.size() == 0) { + std::stringstream stream; + stream << "No more free pointers " + << "for type " << type + << " and name " << name + ; + throw std::domain_error(stream.str()); + } auto dataPointer = freePointers.begin(); freePointers.erase(dataPointer); blank.data = *dataPointer; @@ -314,7 +320,8 @@ namespace atrip { // at this point, let us blank the slice WITH_RANK << "~~~:cl(" << name << ")" << " freeing up slice " - // TODO: make this possible + // TODO: make this possible because of Templates + // TODO: there is a deduction error here // << " info " << slice.info << "\n"; slice.free(); @@ -334,7 +341,7 @@ namespace atrip { , typename Slice::Name name_ , size_t nSliceBuffers = 4 ) - : rankMap(paramLength, np) + : rankMap(paramLength, np, global_world) , world(child_world) , universe(global_world) , sliceLength(sliceLength_) @@ -353,7 +360,7 @@ namespace atrip { slices = std::vector>(2 * sliceTypes.size(), { sources[0].size() }); - // TODO: think exactly ^------------------- about this number + // TODO: think exactly ^------------------- about this number // initialize the freePointers with the pointers to the buffers std::transform(sliceBuffers.begin(), sliceBuffers.end(), @@ -421,10 +428,11 @@ namespace atrip { * \brief Send asynchronously only if the state is Fetch */ void send( size_t otherRank - , typename Slice::Info const& info + , typename Slice::LocalDatabaseElement const& el , size_t tag) const noexcept { MPI_Request request; bool sendData_p = false; + auto const& info = el.info; if (info.state == Slice::Fetch) sendData_p = true; // TODO: remove this because I have SelfSufficient @@ -539,8 +547,11 @@ namespace atrip { [&name](SliceUnion const* s) { return name == s->name; }); - if (sliceUnionIt == unions.end()) - throw std::domain_error("SliceUnion not found!"); + if (sliceUnionIt == unions.end()) { + std::stringstream stream; + stream << "SliceUnion(" << name << ") not found!"; + throw std::domain_error(stream.str()); + } return **sliceUnionIt; } diff --git a/include/atrip/Tuples.hpp b/include/atrip/Tuples.hpp index 5d4b69f..c41b78a 100644 --- a/include/atrip/Tuples.hpp +++ b/include/atrip/Tuples.hpp @@ -1,75 +1,538 @@ -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Tuples][Tuples:1]] +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Prolog][Prolog:1]] #pragma once #include #include #include +// TODO: remove some +#include +#include +#include +#include +#include +#include +#include +#include + #include #include namespace atrip { +// Prolog:1 ends here - using ABCTuple = std::array; - using PartialTuple = std::array; - using ABCTuples = std::vector; +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Tuples%20types][Tuples types:1]] +using ABCTuple = std::array; +using PartialTuple = std::array; +using ABCTuples = std::vector; - ABCTuples getTuplesList(size_t Nv) { - const size_t n = Nv * (Nv + 1) * (Nv + 2) / 6 - Nv; - ABCTuples result(n); - size_t u(0); +constexpr ABCTuple FAKE_TUPLE = {0, 0, 0}; +constexpr ABCTuple INVALID_TUPLE = {1, 1, 1}; +// Tuples types:1 ends here - for (size_t a(0); a < Nv; a++) - for (size_t b(a); b < Nv; b++) - for (size_t c(b); c < Nv; c++){ - if ( a == b && b == c ) continue; - result[u++] = {a, b, c}; - } +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Distributing%20the%20tuples][Distributing the tuples:1]] +struct TuplesDistribution { + virtual ABCTuples getTuples(size_t Nv, MPI_Comm universe) = 0; + virtual bool tupleIsFake(ABCTuple const& t) { return t == FAKE_TUPLE; } +}; +// Distributing the tuples:1 ends here - return result; +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Node%20information][Node information:1]] +std::vector getNodeNames(MPI_Comm comm){ + int rank, np; + MPI_Comm_rank(comm, &rank); + MPI_Comm_size(comm, &np); + std::vector nodeList(np); + char nodeName[MPI_MAX_PROCESSOR_NAME] + , nodeNames[np*MPI_MAX_PROCESSOR_NAME] + ; + std::vector nameLengths(np) + , off(np) + ; + int nameLength; + MPI_Get_processor_name(nodeName, &nameLength); + MPI_Allgather(&nameLength, + 1, + MPI_INT, + nameLengths.data(), + 1, + MPI_INT, + comm); + for (int i(1); i < np; i++) + off[i] = off[i-1] + nameLengths[i-1]; + MPI_Allgatherv(nodeName, + nameLengths[rank], + MPI_BYTE, + nodeNames, + nameLengths.data(), + off.data(), + MPI_BYTE, + comm); + for (int i(0); i < np; i++) { + std::string const s(&nodeNames[off[i]], nameLengths[i]); + nodeList[i] = s; } + return nodeList; +} +// Node information:1 ends here +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Node%20information][Node information:2]] +struct RankInfo { + const std::string name; + const size_t nodeId; + const size_t globalRank; + const size_t localRank; + const size_t ranksPerNode; +}; - std::pair - getABCRange(size_t np, size_t rank, ABCTuples const& tuplesList) { - - std::vector n_tuples_per_rank(np, tuplesList.size()/np); - const size_t - // how many valid tuples should we still verteilen to nodes - // since the number of tuples is not divisible by the number of nodes - nRoundRobin = tuplesList.size() % np - // every node must have the sanme amount of tuples in order for the - // other nodes to receive and send somewhere, therefore - // some nodes will get extra tuples but that are dummy tuples - , nExtraInvalid = (np - nRoundRobin) % np - ; - - if (nRoundRobin) for (int i = 0; i < np; i++) n_tuples_per_rank[i]++; - - #if defined(TODO) - assert( tuplesList.size() - == - ( std::accumulate(n_tuples_per_rank.begin(), - n_tuples_per_rank.end(), - 0UL, - std::plus()) - + nExtraInvalid - )); - #endif - - WITH_RANK << "nRoundRobin = " << nRoundRobin << "\n"; - WITH_RANK << "nExtraInvalid = " << nExtraInvalid << "\n"; - WITH_RANK << "ntuples = " << n_tuples_per_rank[rank] << "\n"; - - auto const& it = n_tuples_per_rank.begin(); - - return - { std::accumulate(it, it + rank , 0) - , std::accumulate(it, it + rank + 1, 0) - }; +template +A unique(A const &xs) { + auto result = xs; + std::sort(std::begin(result), std::end(result)); + auto const& last = std::unique(std::begin(result), std::end(result)); + result.erase(last, std::end(result)); + return result; +} +std::vector +getNodeInfos(std::vector const& nodeNames) { + std::vector result; + auto const uniqueNames = unique(nodeNames); + auto const index = [&uniqueNames](std::string const& s) { + auto const& it = std::find(uniqueNames.begin(), uniqueNames.end(), s); + return std::distance(uniqueNames.begin(), it); + }; + std::vector localRanks(uniqueNames.size(), 0); + size_t globalRank = 0; + for (auto const& name: nodeNames) { + const size_t nodeId = index(name); + result.push_back({name, + nodeId, + globalRank++, + localRanks[nodeId]++, + std::count(nodeNames.begin(), + nodeNames.end(), + name) + }); } + return result; +} + +struct ClusterInfo { + const size_t nNodes, np, ranksPerNode; + const std::vector rankInfos; +}; + +ClusterInfo +getClusterInfo(MPI_Comm comm) { + auto const names = getNodeNames(comm); + auto const rankInfos = getNodeInfos(names); + + return ClusterInfo { + unique(names).size(), + names.size(), + rankInfos[0].ranksPerNode, + rankInfos + }; } -// Tuples:1 ends here +// Node information:2 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Naive%20list][Naive list:1]] +ABCTuples getTuplesList(size_t Nv, size_t rank, size_t np) { + + const size_t + // total number of tuples for the problem + n = Nv * (Nv + 1) * (Nv + 2) / 6 - Nv + + // all ranks should have the same number of tuples_per_rank + , tuples_per_rank = n / np + size_t(n % np != 0) + + // start index for the global tuples list + , start = tuples_per_rank * rank + + // end index for the global tuples list + , end = tuples_per_rank * (rank + 1) + ; + + LOG(1,"Atrip") << "tuples_per_rank = " << tuples_per_rank << "\n"; + WITH_RANK << "start, end = " << start << ", " << end << "\n"; + ABCTuples result(tuples_per_rank, FAKE_TUPLE); + + for (size_t a(0), r(0), g(0); a < Nv; a++) + for (size_t b(a); b < Nv; b++) + for (size_t c(b); c < Nv; c++){ + if ( a == b && b == c ) continue; + if ( start <= g && g < end) result[r++] = {a, b, c}; + g++; + } + + return result; + +} +// Naive list:1 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Naive%20list][Naive list:2]] +ABCTuples getAllTuplesList(const size_t Nv) { + const size_t n = Nv * (Nv + 1) * (Nv + 2) / 6 - Nv; + ABCTuples result(n); + + for (size_t a(0), u(0); a < Nv; a++) + for (size_t b(a); b < Nv; b++) + for (size_t c(b); c < Nv; c++){ + if ( a == b && b == c ) continue; + result[u++] = {a, b, c}; + } + + return result; +} +// Naive list:2 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Naive%20list][Naive list:3]] +struct NaiveDistribution : public TuplesDistribution { + ABCTuples getTuples(size_t Nv, MPI_Comm universe) override { + int rank, np; + MPI_Comm_rank(universe, &rank); + MPI_Comm_size(universe, &np); + return getTuplesList(Nv, (size_t)rank, (size_t)np); + } +}; +// Naive list:3 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Prolog][Prolog:1]] +namespace group_and_sort { +// Prolog:1 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Utils][Utils:1]] +// Provides the node on which the slice-element is found +// Right now we distribute the slices in a round robin fashion +// over the different nodes (NOTE: not mpi ranks but nodes) +inline +size_t isOnNode(size_t tuple, size_t nNodes) { return tuple % nNodes; } + + +// return the node (or all nodes) where the elements of this +// tuple are located +std::vector getTupleNodes(ABCTuple const& t, size_t nNodes) { + std::vector + nTuple = { isOnNode(t[0], nNodes) + , isOnNode(t[1], nNodes) + , isOnNode(t[2], nNodes) + }; + return unique(nTuple); +} + +struct Info { + size_t nNodes; + size_t nodeId; +}; +// Utils:1 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Distribution][Distribution:1]] +ABCTuples specialDistribution(Info const& info, ABCTuples const& allTuples) { + + ABCTuples nodeTuples; + size_t const nNodes(info.nNodes); + + std::vector + container1d(nNodes) + , container2d(nNodes * nNodes) + , container3d(nNodes * nNodes * nNodes) + ; + + if (info.nodeId == 0) + std::cout << "\tGoing through all " + << allTuples.size() + << " tuples in " + << nNodes + << " nodes\n"; + + // build container-n-d's + for (auto const& t: allTuples) { + // one which node(s) are the tuple elements located... + // put them into the right container + auto const _nodes = getTupleNodes(t, nNodes); + + switch (_nodes.size()) { + case 1: + container1d[_nodes[0]].push_back(t); + break; + case 2: + container2d[ _nodes[0] + + _nodes[1] * nNodes + ].push_back(t); + break; + case 3: + container3d[ _nodes[0] + + _nodes[1] * nNodes + + _nodes[2] * nNodes * nNodes + ].push_back(t); + break; + } + + } + + if (info.nodeId == 0) + std::cout << "\tBuilding 1-d containers\n"; + // DISTRIBUTE 1-d containers + // every tuple which is only located at one node belongs to this node + { + auto const& _tuples = container1d[info.nodeId]; + nodeTuples.resize(_tuples.size(), INVALID_TUPLE); + std::copy(_tuples.begin(), _tuples.end(), nodeTuples.begin()); + } + + if (info.nodeId == 0) + std::cout << "\tBuilding 2-d containers\n"; + // DISTRIBUTE 2-d containers + //the tuples which are located at two nodes are half/half given to these nodes + for (size_t yx = 0; yx < container2d.size(); yx++) { + + auto const& _tuples = container2d[yx]; + const + size_t idx = yx % nNodes + // remeber: yx = idy * nNodes + idx + , idy = yx / nNodes + , n_half = _tuples.size() / 2 + , size = nodeTuples.size() + ; + + size_t nbeg, nend; + if (info.nodeId == idx) { + nbeg = 0 * n_half; + nend = n_half; + } else if (info.nodeId == idy) { + nbeg = 1 * n_half; + nend = _tuples.size(); + } else { + // either idx or idy is my node + continue; + } + + size_t const nextra = nend - nbeg; + nodeTuples.resize(size + nextra, INVALID_TUPLE); + std::copy(_tuples.begin() + nbeg, + _tuples.begin() + nend, + nodeTuples.begin() + size); + + } + + if (info.nodeId == 0) + std::cout << "\tBuilding 3-d containers\n"; + // DISTRIBUTE 3-d containers + for (size_t zyx = 0; zyx < container3d.size(); zyx++) { + auto const& _tuples = container3d[zyx]; + + const + size_t idx = zyx % nNodes + , idy = (zyx / nNodes) % nNodes + // remember: zyx = idx + idy * nNodes + idz * nNodes^2 + , idz = zyx / nNodes / nNodes + , n_third = _tuples.size() / 3 + , size = nodeTuples.size() + ; + + size_t nbeg, nend; + if (info.nodeId == idx) { + nbeg = 0 * n_third; + nend = 1 * n_third; + } else if (info.nodeId == idy) { + nbeg = 1 * n_third; + nend = 2 * n_third; + } else if (info.nodeId == idz) { + nbeg = 2 * n_third; + nend = _tuples.size(); + } else { + // either idx or idy or idz is my node + continue; + } + + size_t const nextra = nend - nbeg; + nodeTuples.resize(size + nextra, INVALID_TUPLE); + std::copy(_tuples.begin() + nbeg, + _tuples.begin() + nend, + nodeTuples.begin() + size); + + } + + + if (info.nodeId == 0) std::cout << "\tswapping tuples...\n"; + /* + * sort part of group-and-sort algorithm + * every tuple on a given node is sorted in a way that + * the 'home elements' are the fastest index. + * 1:yyy 2:yyn(x) 3:yny(x) 4:ynn(x) 5:nyy 6:nyn(x) 7:nny 8:nnn + */ + for (auto &nt: nodeTuples){ + if ( isOnNode(nt[0], nNodes) == info.nodeId ){ // 1234 + if ( isOnNode(nt[2], nNodes) != info.nodeId ){ // 24 + size_t const x(nt[0]); + nt[0] = nt[2]; // switch first and last + nt[2] = x; + } + else if ( isOnNode(nt[1], nNodes) != info.nodeId){ // 3 + size_t const x(nt[0]); + nt[0] = nt[1]; // switch first two + nt[1] = x; + } + } else { + if ( isOnNode(nt[1], nNodes) == info.nodeId // 56 + && isOnNode(nt[2], nNodes) != info.nodeId + ) { // 6 + size_t const x(nt[1]); + nt[1] = nt[2]; // switch last two + nt[2] = x; + } + } + } + + if (info.nodeId == 0) std::cout << "\tsorting list of tuples...\n"; + //now we sort the list of tuples + std::sort(nodeTuples.begin(), nodeTuples.end()); + + if (info.nodeId == 0) std::cout << "\trestoring tuples...\n"; + // we bring the tuples abc back in the order a 1 + if (info.nodeId == 0) + std::cout << "checking for validity of " << nodeTuples.size() << std::endl; + const bool anyInvalid + = std::any_of(nodeTuples.begin(), + nodeTuples.end(), + [](ABCTuple const& t) { return t == INVALID_TUPLE; }); + if (anyInvalid) throw "Some tuple is invalid in group-and-sort algorithm"; +#endif + + if (info.nodeId == 0) std::cout << "\treturning tuples...\n"; + return nodeTuples; + +} +// Distribution:1 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Main][Main:1]] +std::vector main(MPI_Comm universe, size_t Nv) { + + int rank, np; + MPI_Comm_rank(universe, &rank); + MPI_Comm_size(universe, &np); + + std::vector result; + + auto const nodeNames(getNodeNames(universe)); + size_t const nNodes = unique(nodeNames).size(); + auto const nodeInfos = getNodeInfos(nodeNames); + + // We want to construct a communicator which only contains of one + // element per node + bool const computeDistribution + = nodeInfos[rank].localRank == 0; + + std::vector + nodeTuples + = computeDistribution + ? specialDistribution(Info{nNodes, nodeInfos[rank].nodeId}, + getAllTuplesList(Nv)) + : std::vector() + ; + + LOG(1,"Atrip") << "got nodeTuples\n"; + + // now we have to send the data from **one** rank on each node + // to all others ranks of this node + const + int color = nodeInfos[rank].nodeId + , key = nodeInfos[rank].localRank + ; + + + MPI_Comm INTRA_COMM; + MPI_Comm_split(universe, color, key, &INTRA_COMM); +// Main:1 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Main][Main:2]] +size_t const + tuplesPerRankLocal + = nodeTuples.size() / nodeInfos[rank].ranksPerNode + + size_t(nodeTuples.size() % nodeInfos[rank].ranksPerNode != 0) + ; + +size_t tuplesPerRankGlobal; + +MPI_Reduce(&tuplesPerRankLocal, + &tuplesPerRankGlobal, + 1, + MPI_UINT64_T, + MPI_MAX, + 0, + universe); + +MPI_Bcast(&tuplesPerRankGlobal, + 1, + MPI_UINT64_T, + 0, + universe); + +LOG(1,"Atrip") << "Tuples per rank: " << tuplesPerRankGlobal << "\n"; +LOG(1,"Atrip") << "ranks per node " << nodeInfos[rank].ranksPerNode << "\n"; +LOG(1,"Atrip") << "#nodes " << nNodes << "\n"; +// Main:2 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Main][Main:3]] +size_t const totalTuples + = tuplesPerRankGlobal * nodeInfos[rank].ranksPerNode; + +if (computeDistribution) { + // pad with FAKE_TUPLEs + nodeTuples.insert(nodeTuples.end(), + totalTuples - nodeTuples.size(), + FAKE_TUPLE); +} +// Main:3 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Main][Main:4]] +{ + // construct mpi type for abctuple + MPI_Datatype MPI_ABCTUPLE; + MPI_Type_vector(nodeTuples[0].size(), 1, 1, MPI_UINT64_T, &MPI_ABCTUPLE); + MPI_Type_commit(&MPI_ABCTUPLE); + + LOG(1,"Atrip") << "scattering tuples \n"; + + result.resize(tuplesPerRankGlobal); + MPI_Scatter(nodeTuples.data(), + tuplesPerRankGlobal, + MPI_ABCTUPLE, + result.data(), + tuplesPerRankGlobal, + MPI_ABCTUPLE, + 0, + INTRA_COMM); + + MPI_Type_free(&MPI_ABCTUPLE); + +} +// Main:4 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Main][Main:5]] +return result; + +} +// Main:5 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Interface][Interface:1]] +struct Distribution : public TuplesDistribution { + ABCTuples getTuples(size_t Nv, MPI_Comm universe) override { + return main(universe, Nv); + } +}; +// Interface:1 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Epilog][Epilog:1]] +} // namespace group_and_sort +// Epilog:1 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Epilog][Epilog:1]] +} +// Epilog:1 ends here diff --git a/include/atrip/Unions.hpp b/include/atrip/Unions.hpp index db3b6b7..e651ef9 100644 --- a/include/atrip/Unions.hpp +++ b/include/atrip/Unions.hpp @@ -59,7 +59,7 @@ namespace atrip { , child_world , global_world , Slice::TA - , 4) { + , 6) { init(sourceTensor); } @@ -97,7 +97,7 @@ namespace atrip { , child_world , global_world , Slice::VIJKA - , 4) { + , 6) { init(sourceTensor); } diff --git a/include/atrip/Utils.hpp b/include/atrip/Utils.hpp index bff3d19..83656c6 100644 --- a/include/atrip/Utils.hpp +++ b/include/atrip/Utils.hpp @@ -1,4 +1,4 @@ -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Utils][Utils:1]] +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Prolog][Prolog:1]] #pragma once #include #include @@ -6,32 +6,41 @@ #include #include +#include namespace atrip { +// Prolog:1 ends here - - template +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Pretty%20printing][Pretty printing:1]] +template std::string pretty_print(T&& value) { std::stringstream stream; -#if ATRIP_DEBUG > 1 +#if ATRIP_DEBUG > 2 dbg::pretty_print(stream, std::forward(value)); #endif return stream.str(); } +// Pretty printing:1 ends here -#define WITH_CHRONO(__chrono, ...) \ - __chrono.start(); __VA_ARGS__ __chrono.stop(); +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Chrono][Chrono:1]] +#define WITH_CHRONO(__chrono_name, ...) \ + Atrip::chrono[__chrono_name].start(); \ + __VA_ARGS__ \ + Atrip::chrono[__chrono_name].stop(); - struct Timer { - using Clock = std::chrono::high_resolution_clock; - using Event = std::chrono::time_point; - std::chrono::duration duration; - Event _start; - inline void start() noexcept { _start = Clock::now(); } - inline void stop() noexcept { duration += Clock::now() - _start; } - inline void clear() noexcept { duration *= 0; } - inline double count() const noexcept { return duration.count(); } - }; - using Timings = std::map; +struct Timer { + using Clock = std::chrono::high_resolution_clock; + using Event = std::chrono::time_point; + std::chrono::duration duration; + Event _start; + inline void start() noexcept { _start = Clock::now(); } + inline void stop() noexcept { duration += Clock::now() - _start; } + inline void clear() noexcept { duration *= 0; } + inline double count() const noexcept { return duration.count(); } +}; +using Timings = std::map; +// Chrono:1 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Epilog][Epilog:1]] } -// Utils:1 ends here +// Epilog:1 ends here diff --git a/src/atrip/Atrip.cxx b/src/atrip/Atrip.cxx index fc613b6..b7823de 100644 --- a/src/atrip/Atrip.cxx +++ b/src/atrip/Atrip.cxx @@ -9,8 +9,11 @@ using namespace atrip; +bool RankMap::RANK_ROUND_ROBIN; +bool RankMap::RANK_ROUND_ROBIN; int Atrip::rank; int Atrip::np; +Timings Atrip::chrono; // user printing block IterationDescriptor IterationDescription::descriptor; @@ -30,28 +33,35 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { const int rank = Atrip::rank; MPI_Comm universe = in.ei->wrld->comm; - // Timings in seconds ================================================{{{1 - Timings chrono{}; - const size_t No = in.ei->lens[0]; const size_t Nv = in.ea->lens[0]; LOG(0,"Atrip") << "No: " << No << "\n"; LOG(0,"Atrip") << "Nv: " << Nv << "\n"; + LOG(0,"Atrip") << "np: " << np << "\n"; // allocate the three scratches, see piecuch - std::vector Tijk(No*No*No) // doubles only (see piecuch) - , Zijk(No*No*No) // singles + doubles (see piecuch) - // we need local copies of the following tensors on every - // rank - , epsi(No) - , epsa(Nv) - , Tai(No * Nv) - ; + std::vector Tijk(No*No*No) // doubles only (see piecuch) + , Zijk(No*No*No) // singles + doubles (see piecuch) + // we need local copies of the following tensors on every + // rank + , epsi(No) + , epsa(Nv) + , Tai(No * Nv) + ; in.ei->read_all(epsi.data()); in.ea->read_all(epsa.data()); in.Tph->read_all(Tai.data()); + RankMap::RANK_ROUND_ROBIN = in.rankRoundRobin; + if (RankMap::RANK_ROUND_ROBIN) { + LOG(0,"Atrip") << "Doing rank round robin slices distribution" << "\n"; + } else { + LOG(0,"Atrip") + << "Doing node > local rank round robin slices distribution" << "\n"; + } + + // COMMUNICATOR CONSTRUCTION ========================================={{{1 // // Construct a new communicator living only on a single rank @@ -72,41 +82,49 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { } - chrono["nv-slices"].start(); // BUILD SLICES PARAMETRIZED BY NV ==================================={{{1 - LOG(0,"Atrip") << "BUILD NV-SLICES\n"; - TAPHH taphh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); - HHHA hhha(*in.Vhhhp, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); - chrono["nv-slices"].stop(); + WITH_CHRONO("nv-slices", + LOG(0,"Atrip") << "BUILD NV-SLICES\n"; + TAPHH taphh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); + HHHA hhha(*in.Vhhhp, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); + ) - chrono["nv-nv-slices"].start(); // BUILD SLICES PARAMETRIZED BY NV x NV =============================={{{1 - LOG(0,"Atrip") << "BUILD NV x NV-SLICES\n"; - ABPH abph(*in.Vppph, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); - ABHH abhh(*in.Vpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); - TABHH tabhh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); - chrono["nv-nv-slices"].stop(); + WITH_CHRONO("nv-nv-slices", + LOG(0,"Atrip") << "BUILD NV x NV-SLICES\n"; + ABPH abph(*in.Vppph, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); + ABHH abhh(*in.Vpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); + TABHH tabhh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); + ) // all tensors std::vector< SliceUnion* > unions = {&taphh, &hhha, &abph, &abhh, &tabhh}; - //CONSTRUCT TUPLE LIST ==============================================={{{1 - LOG(0,"Atrip") << "BUILD TUPLE LIST\n"; - const auto tuplesList = std::move(getTuplesList(Nv)); - WITH_RANK << "tupList.size() = " << tuplesList.size() << "\n"; + // get tuples for the current rank + TuplesDistribution *distribution; - // GET ABC INDEX RANGE FOR RANK ======================================{{{1 - auto abcIndex = getABCRange(np, rank, tuplesList); - size_t nIterations = abcIndex.second - abcIndex.first; + if (in.tuplesDistribution == Atrip::Input::TuplesDistribution::NAIVE) { + LOG(0,"Atrip") << "Using the naive distribution\n"; + distribution = new NaiveDistribution(); + } else { + LOG(0,"Atrip") << "Using the group-and-sort distribution\n"; + distribution = new group_and_sort::Distribution(); + } - WITH_RANK << "abcIndex = " << pretty_print(abcIndex) << "\n"; - LOG(0,"Atrip") << "#iterations: " << nIterations << "\n"; + LOG(0,"Atrip") << "BUILDING TUPLE LIST\n"; + WITH_CHRONO("tuples:build", + auto const tuplesList = distribution->getTuples(Nv, universe); + ) + const size_t nIterations = tuplesList.size(); - // first abc - const ABCTuple firstAbc = tuplesList[abcIndex.first]; - - - double energy(0.); + { + const size_t _all_tuples = Nv * (Nv + 1) * (Nv + 2) / 6 - Nv; + LOG(0,"Atrip") << "#iterations: " + << nIterations + << "/" + << nIterations * np + << "\n"; + } const size_t iterationMod = (in.percentageMod > 0) @@ -119,7 +137,9 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { auto const isFakeTuple - = [&tuplesList](size_t const i) { return i >= tuplesList.size(); }; + = [&tuplesList, distribution](size_t const i) { + return distribution->tupleIsFake(tuplesList[i]); + }; using Database = typename Slice::Database; @@ -127,45 +147,42 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { auto communicateDatabase = [ &unions , np - , &chrono ] (ABCTuple const& abc, MPI_Comm const& c) -> Database { - chrono["db:comm:type:do"].start(); - auto MPI_LDB_ELEMENT = Slice::mpi::localDatabaseElement(); - chrono["db:comm:type:do"].stop(); + WITH_CHRONO("db:comm:type:do", + auto MPI_LDB_ELEMENT = Slice::mpi::localDatabaseElement(); + ) - chrono["db:comm:ldb"].start(); - LocalDatabase ldb; - - for (auto const& tensor: unions) { - auto const& tensorDb = tensor->buildLocalDatabase(abc); - ldb.insert(ldb.end(), tensorDb.begin(), tensorDb.end()); - } - chrono["db:comm:ldb"].stop(); + WITH_CHRONO("db:comm:ldb", + typename Slice::LocalDatabase ldb; + for (auto const& tensor: unions) { + auto const& tensorDb = tensor->buildLocalDatabase(abc); + ldb.insert(ldb.end(), tensorDb.begin(), tensorDb.end()); + } + ) Database db(np * ldb.size(), ldb[0]); - chrono["oneshot-db:comm:allgather"].start(); - chrono["db:comm:allgather"].start(); - MPI_Allgather( ldb.data() - , ldb.size() - , MPI_LDB_ELEMENT - , db.data() - , ldb.size() - , MPI_LDB_ELEMENT - , c); - chrono["db:comm:allgather"].stop(); - chrono["oneshot-db:comm:allgather"].stop(); + WITH_CHRONO("oneshot-db:comm:allgather", + WITH_CHRONO("db:comm:allgather", + MPI_Allgather( ldb.data() + , ldb.size() + , MPI_LDB_ELEMENT + , db.data() + , ldb.size() + , MPI_LDB_ELEMENT + , c); + )) - chrono["db:comm:type:free"].start(); - MPI_Type_free(&MPI_LDB_ELEMENT); - chrono["db:comm:type:free"].stop(); + WITH_CHRONO("db:comm:type:free", + MPI_Type_free(&MPI_LDB_ELEMENT); + ) return db; }; auto doIOPhase - = [&unions, &rank, &np, &universe, &chrono] (Database const& db) { + = [&unions, &rank, &np, &universe] (Database const& db) { const size_t localDBLength = db.size() / np; @@ -201,9 +218,9 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { << "\n" ; - chrono["db:io:recv"].start(); - u.receive(el.info, recvTag); - chrono["db:io:recv"].stop(); + WITH_CHRONO("db:io:recv", + u.receive(el.info, recvTag); + ) } // recv } @@ -237,9 +254,9 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { << "\n" ; - chrono["db:io:send"].start(); - u.send(otherRank, el.info, sendTag); - chrono["db:io:send"].stop(); + WITH_CHRONO("db:io:send", + u.send(otherRank, el, sendTag); + ) } // send phase @@ -257,31 +274,30 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { * double(No) * double(No) * (double(No) + double(Nv)) - * 2 - * 6 + * 2.0 + * (traits::isComplex() ? 2.0 : 1.0) + * 6.0 / 1e9 ; // START MAIN LOOP ======================================================{{{1 - for ( size_t i = abcIndex.first, iteration = 1 - ; i < abcIndex.second + double energy(0.); + + for ( size_t i = 0, iteration = 1 + ; i < tuplesList.size() ; i++, iteration++ ) { - chrono["iterations"].start(); - + Atrip::chrono["iterations"].start(); // check overhead from chrono over all iterations - chrono["start:stop"].start(); chrono["start:stop"].stop(); + WITH_CHRONO("start:stop", {}) // check overhead of doing a barrier at the beginning - chrono["oneshot-mpi:barrier"].start(); - chrono["mpi:barrier"].start(); - // TODO: REMOVE - if (in.barrier == 1) - MPI_Barrier(universe); - chrono["mpi:barrier"].stop(); - chrono["oneshot-mpi:barrier"].stop(); + WITH_CHRONO("oneshot-mpi:barrier", + WITH_CHRONO("mpi:barrier", + if (in.barrier) MPI_Barrier(universe); + )) if (iteration % iterationMod == 0 || iteration == iteration1Percent) { @@ -289,22 +305,22 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { IterationDescription::descriptor({ iteration, nIterations, - chrono["iterations"].count() + Atrip::chrono["iterations"].count() }); } LOG(0,"Atrip") << "iteration " << iteration << " [" << 100 * iteration / nIterations << "%]" - << " (" << doublesFlops * iteration / chrono["doubles"].count() + << " (" << doublesFlops * iteration / Atrip::chrono["doubles"].count() << "GF)" - << " (" << doublesFlops * iteration / chrono["iterations"].count() + << " (" << doublesFlops * iteration / Atrip::chrono["iterations"].count() << "GF)" << " ===========================\n"; // PRINT TIMINGS if (in.chrono) - for (auto const& pair: chrono) + for (auto const& pair: Atrip::chrono) LOG(1, " ") << pair.first << " :: " << pair.second.count() << std::endl; @@ -314,46 +330,43 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { const ABCTuple abc = isFakeTuple(i) ? tuplesList[tuplesList.size() - 1] : tuplesList[i] - , *abcNext = i == (abcIndex.second - 1) + , *abcNext = i == (tuplesList.size() - 1) ? nullptr - : isFakeTuple(i + 1) - ? &tuplesList[tuplesList.size() - 1] : &tuplesList[i + 1] ; - chrono["with_rank"].start(); - WITH_RANK << " :it " << iteration - << " :abc " << pretty_print(abc) - << " :abcN " - << (abcNext ? pretty_print(*abcNext) : "None") - << "\n"; - chrono["with_rank"].stop(); + WITH_CHRONO("with_rank", + WITH_RANK << " :it " << iteration + << " :abc " << pretty_print(abc) + << " :abcN " + << (abcNext ? pretty_print(*abcNext) : "None") + << "\n"; + ) // COMM FIRST DATABASE ================================================{{{1 - if (i == abcIndex.first) { + if (i == 0) { WITH_RANK << "__first__:first database ............ \n"; - const auto __db = communicateDatabase(abc, universe); + const auto db = communicateDatabase(abc, universe); WITH_RANK << "__first__:first database communicated \n"; WITH_RANK << "__first__:first database io phase \n"; - doIOPhase(__db); + doIOPhase(db); WITH_RANK << "__first__:first database io phase DONE\n"; WITH_RANK << "__first__::::Unwrapping all slices for first database\n"; for (auto& u: unions) u->unwrapAll(abc); - WITH_RANK << "__first__::::Unwrapping all slices for first database DONE\n"; + WITH_RANK << "__first__::::Unwrapping slices for first database DONE\n"; MPI_Barrier(universe); } // COMM NEXT DATABASE ================================================={{{1 if (abcNext) { WITH_RANK << "__comm__:" << iteration << "th communicating database\n"; - chrono["db:comm"].start(); - //const auto db = communicateDatabase(*abcNext, universe); - Database db = communicateDatabase(*abcNext, universe); - chrono["db:comm"].stop(); - chrono["db:io"].start(); - doIOPhase(db); - chrono["db:io"].stop(); + WITH_CHRONO("db:comm", + const auto db = communicateDatabase(*abcNext, universe); + ) + WITH_CHRONO("db:io", + doIOPhase(db); + ) WITH_RANK << "__comm__:" << iteration << "th database io phase DONE\n"; } @@ -361,63 +374,61 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { OCD_Barrier(universe); if (!isFakeTuple(i)) { WITH_RANK << iteration << "-th doubles\n"; - WITH_CHRONO(chrono["oneshot-unwrap"], - WITH_CHRONO(chrono["unwrap"], - WITH_CHRONO(chrono["unwrap:doubles"], + WITH_CHRONO("oneshot-unwrap", + WITH_CHRONO("unwrap", + WITH_CHRONO("unwrap:doubles", for (auto& u: decltype(unions){&abph, &hhha, &taphh, &tabhh}) { u->unwrapAll(abc); } ))) - chrono["oneshot-doubles"].start(); - chrono["doubles"].start(); - doublesContribution( abc, (size_t)No, (size_t)Nv - // -- VABCI - , abph.unwrapSlice(Slice::AB, abc) - , abph.unwrapSlice(Slice::AC, abc) - , abph.unwrapSlice(Slice::BC, abc) - , abph.unwrapSlice(Slice::BA, abc) - , abph.unwrapSlice(Slice::CA, abc) - , abph.unwrapSlice(Slice::CB, abc) - // -- VHHHA - , hhha.unwrapSlice(Slice::A, abc) - , hhha.unwrapSlice(Slice::B, abc) - , hhha.unwrapSlice(Slice::C, abc) - // -- TA - , taphh.unwrapSlice(Slice::A, abc) - , taphh.unwrapSlice(Slice::B, abc) - , taphh.unwrapSlice(Slice::C, abc) - // -- TABIJ - , tabhh.unwrapSlice(Slice::AB, abc) - , tabhh.unwrapSlice(Slice::AC, abc) - , tabhh.unwrapSlice(Slice::BC, abc) - // -- TIJK - , Tijk.data() - , chrono - ); - WITH_RANK << iteration << "-th doubles done\n"; - chrono["doubles"].stop(); - chrono["oneshot-doubles"].stop(); + WITH_CHRONO("oneshot-doubles", + WITH_CHRONO("doubles", + doublesContribution( abc, (size_t)No, (size_t)Nv + // -- VABCI + , abph.unwrapSlice(Slice::AB, abc) + , abph.unwrapSlice(Slice::AC, abc) + , abph.unwrapSlice(Slice::BC, abc) + , abph.unwrapSlice(Slice::BA, abc) + , abph.unwrapSlice(Slice::CA, abc) + , abph.unwrapSlice(Slice::CB, abc) + // -- VHHHA + , hhha.unwrapSlice(Slice::A, abc) + , hhha.unwrapSlice(Slice::B, abc) + , hhha.unwrapSlice(Slice::C, abc) + // -- TA + , taphh.unwrapSlice(Slice::A, abc) + , taphh.unwrapSlice(Slice::B, abc) + , taphh.unwrapSlice(Slice::C, abc) + // -- TABIJ + , tabhh.unwrapSlice(Slice::AB, abc) + , tabhh.unwrapSlice(Slice::AC, abc) + , tabhh.unwrapSlice(Slice::BC, abc) + // -- TIJK + , Tijk.data() + ); + WITH_RANK << iteration << "-th doubles done\n"; + )) } // COMPUTE SINGLES =================================================== {{{1 OCD_Barrier(universe); if (!isFakeTuple(i)) { - WITH_CHRONO(chrono["oneshot-unwrap"], - WITH_CHRONO(chrono["unwrap"], - WITH_CHRONO(chrono["unwrap:singles"], + WITH_CHRONO("oneshot-unwrap", + WITH_CHRONO("unwrap", + WITH_CHRONO("unwrap:singles", abhh.unwrapAll(abc); ))) - chrono["reorder"].start(); - for (size_t I(0); I < Zijk.size(); I++) Zijk[I] = Tijk[I]; - chrono["reorder"].stop(); - chrono["singles"].start(); + WITH_CHRONO("reorder", + for (size_t I(0); I < Zijk.size(); I++) Zijk[I] = Tijk[I]; + ) + WITH_CHRONO("singles", singlesContribution( No, Nv, abc , Tai.data() , abhh.unwrapSlice(Slice::AB, abc) , abhh.unwrapSlice(Slice::AC, abc) , abhh.unwrapSlice(Slice::BC, abc) , Zijk.data()); - chrono["singles"].stop(); + ) } @@ -430,12 +441,12 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { if (abc[1] == abc[2]) distinct--; const F epsabc(epsa[abc[0]] + epsa[abc[1]] + epsa[abc[2]]); - chrono["energy"].start(); - if ( distinct == 0) - tupleEnergy = getEnergyDistinct(epsabc, epsi, Tijk, Zijk); - else - tupleEnergy = getEnergySame(epsabc, epsi, Tijk, Zijk); - chrono["energy"].stop(); + WITH_CHRONO("energy", + if ( distinct == 0) + tupleEnergy = getEnergyDistinct(epsabc, epsi, Tijk, Zijk); + else + tupleEnergy = getEnergySame(epsabc, epsi, Tijk, Zijk); + ) #if defined(HAVE_OCD) || defined(ATRIP_PRINT_TUPLES) tupleEnergies[abc] = tupleEnergy; @@ -445,6 +456,7 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { } + // TODO: remove this if (isFakeTuple(i)) { // fake iterations should also unwrap whatever they got WITH_RANK << iteration @@ -466,7 +478,6 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { // CLEANUP UNIONS ===================================================={{{1 OCD_Barrier(universe); if (abcNext) { - chrono["gc"].start(); WITH_RANK << "__gc__:" << iteration << "-th cleaning up.......\n"; for (auto& u: unions) { @@ -500,12 +511,11 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { } - chrono["gc"].stop(); } WITH_RANK << iteration << "-th cleaning up....... DONE\n"; - chrono["iterations"].stop(); + Atrip::chrono["iterations"].stop(); // ITERATION END ====================================================={{{1 } @@ -543,15 +553,15 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { // PRINT TIMINGS {{{1 if (in.chrono) - for (auto const& pair: chrono) + for (auto const& pair: Atrip::chrono) LOG(0,"atrip:chrono") << pair.first << " " << pair.second.count() << std::endl; LOG(0, "atrip:flops(doubles)") - << nIterations * doublesFlops / chrono["doubles"].count() << "\n"; + << nIterations * doublesFlops / Atrip::chrono["doubles"].count() << "\n"; LOG(0, "atrip:flops(iterations)") - << nIterations * doublesFlops / chrono["iterations"].count() << "\n"; + << nIterations * doublesFlops / Atrip::chrono["iterations"].count() << "\n"; // TODO: change the sign in the getEnergy routines return { - globalEnergy };