From cdbad963b01b7205b1065f6ca6e90b65318fe387 Mon Sep 17 00:00:00 2001 From: Alejandro Gallo Date: Tue, 30 Nov 2021 12:04:44 +0100 Subject: [PATCH 01/22] Add user printing mechanism (cherry pick) --- atrip.org | 97 +++++++++++++++++++++++++++++++++-------- include/atrip/Atrip.hpp | 3 +- include/atrip/Debug.hpp | 47 ++++++++++++++++---- src/atrip/Atrip.cxx | 32 +++++++++----- 4 files changed, 142 insertions(+), 37 deletions(-) diff --git a/atrip.org b/atrip.org index a69fa03..72d2601 100644 --- a/atrip.org +++ b/atrip.org @@ -1847,7 +1847,7 @@ namespace atrip { , *Vhhhp = nullptr , *Vppph = nullptr ; - int maxIterations = 0, iterationMod = -1; + int maxIterations = 0, iterationMod = -1, percentageMod = -1; bool barrier = false; bool chrono = false; Input& with_epsilon_i(CTF::Tensor * t) { ei = t; return *this; } @@ -1859,6 +1859,7 @@ namespace atrip { Input& with_Vabci(CTF::Tensor * t) { Vppph = t; return *this; } Input& with_maxIterations(int i) { maxIterations = i; return *this; } Input& with_iterationMod(int i) { iterationMod = i; return *this; } + Input& with_percentageMod(int i) { percentageMod = i; return *this; } Input& with_barrier(bool i) { barrier = i; return *this; } Input& with_chrono(bool i) { chrono = i; return *this; } }; @@ -1888,6 +1889,12 @@ using namespace atrip; int Atrip::rank; int Atrip::np; +// user printing block +IterationDescriptor IterationDescription::descriptor; +void atrip::registerIterationDescriptor(IterationDescriptor d) { + IterationDescription::descriptor = d; +} + void Atrip::init() { MPI_Comm_rank(MPI_COMM_WORLD, &Atrip::rank); MPI_Comm_size(MPI_COMM_WORLD, &Atrip::np); @@ -1968,15 +1975,6 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { auto abcIndex = getABCRange(np, rank, tuplesList); size_t nIterations = abcIndex.second - abcIndex.first; -#ifdef ATRIP_BENCHMARK - { const size_t maxIterations = in.maxIterations; - if (maxIterations != 0) { - abcIndex.second = abcIndex.first + maxIterations % (nIterations + 1); - 
nIterations = maxIterations % (nIterations + 1); - } - } -#endif - WITH_RANK << "abcIndex = " << pretty_print(abcIndex) << "\n"; LOG(0,"Atrip") << "#iterations: " << nIterations << "\n"; @@ -1986,6 +1984,12 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { double energy(0.); + size_t iterationMod + = (in.percentageMod > 0) + ? nIterations * in.percentageMod / 100 + : in.iterationMod + ; + auto const isFakeTuple = [&tuplesList](size_t const i) { return i >= tuplesList.size(); }; @@ -2151,7 +2155,16 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { chrono["mpi:barrier"].stop(); chrono["oneshot-mpi:barrier"].stop(); - if (iteration % in.iterationMod == 0) { + if (iteration % iterationMod == 0) { + + if (IterationDescription::descriptor) { + IterationDescription::descriptor({ + iteration, + nIterations, + chrono["iterations"].count() + }); + } + LOG(0,"Atrip") << "iteration " << iteration << " [" << 100 * iteration / nIterations << "%]" @@ -2419,9 +2432,12 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { #+end_src -** Debug +** Debug and Logging +*** Macros + #+begin_src c++ :tangle (atrip-debug-h) #pragma once +#include #define ATRIP_BENCHMARK //#define ATRIP_DONT_SLICE #define ATRIP_DEBUG 1 @@ -2429,10 +2445,12 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { #define ATRIP_USE_DGEMM //#define ATRIP_PRINT_TUPLES -#define LOG(level, name) if (Atrip::rank == 0) std::cout << name << ": " +#ifndef ATRIP_DEBUG +#define ATRIP_DEBUG 1 +#endif #if ATRIP_DEBUG == 4 -# pragma message("WARNING: You have OCD debugging ABC triples "\ +# pragma message("WARNING: You have OCD debugging ABC triples " \ "expect GB of output and consult your therapist") # include # define HAVE_OCD @@ -2445,7 +2463,7 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { # define WITH_DBG # define DBG(...) 
dbg(__VA_ARGS__) #elif ATRIP_DEBUG == 3 -# pragma message("WARNING: You have crazy debugging ABC triples,"\ +# pragma message("WARNING: You have crazy debugging ABC triples," \ " expect GB of output") # include # define OCD_Barrier(com) @@ -2467,7 +2485,7 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { # define WITH_CRAZY_DEBUG if (false) # define WITH_DBG # define DBG(...) dbg(__VA_ARGS__) -#elif ATRIP_DEBUG == 1 +#else # define OCD_Barrier(com) # define WITH_OCD if (false) # define WITH_ROOT if (false) @@ -2476,11 +2494,54 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { # define WITH_DBG if (false) # define WITH_CRAZY_DEBUG if (false) # define DBG(...) -#else -# error("ATRIP_DEBUG is not defined!") #endif #+end_src +And users of the library can redefine the =LOG= macro +which in case of not being defined is defined as follows: + +#+begin_src c++ :tangle (atrip-debug-h) +#ifndef LOG +#define LOG(level, name) if (Atrip::rank == 0) std::cout << name << ": " +#endif +#+end_src + +Furthermore, if you do not wish to see any output from ATRIP, simply +define =ATRIP_NO_OUTPUT= + + +#+begin_src c++ :tangle (atrip-debug-h) +#ifdef ATRIP_NO_OUTPUT +# undef LOG +# define LOG(level, name) if (false) std::cout << name << ": " +#endif +#+end_src + +*** Iteration informer + +In general a code writer will want to write some messages in every iteration. +A developer then can register a function to be used in this sense. +The input of the function is an [[IterationDescriptor]] structure and the output +should be nothing. 
+ +#+name: IterationDescriptor +#+begin_src c++ :tangle (atrip-debug-h) +namespace atrip { + + struct IterationDescription; + using IterationDescriptor = std::function; + struct IterationDescription { + static IterationDescriptor descriptor; + size_t currentIteration; + size_t totalIterations; + double currentElapsedTime; + }; + + void registerIterationDescriptor(IterationDescriptor); + +} +#+end_src + ** Include header #+begin_src c++ :tangle (atrip-main-h) diff --git a/include/atrip/Atrip.hpp b/include/atrip/Atrip.hpp index a0cad96..a8bcd78 100644 --- a/include/atrip/Atrip.hpp +++ b/include/atrip/Atrip.hpp @@ -24,7 +24,7 @@ namespace atrip { , *Vhhhp = nullptr , *Vppph = nullptr ; - int maxIterations = 0, iterationMod = -1; + int maxIterations = 0, iterationMod = -1, percentageMod = -1; bool barrier = false; bool chrono = false; Input& with_epsilon_i(CTF::Tensor * t) { ei = t; return *this; } @@ -36,6 +36,7 @@ namespace atrip { Input& with_Vabci(CTF::Tensor * t) { Vppph = t; return *this; } Input& with_maxIterations(int i) { maxIterations = i; return *this; } Input& with_iterationMod(int i) { iterationMod = i; return *this; } + Input& with_percentageMod(int i) { percentageMod = i; return *this; } Input& with_barrier(bool i) { barrier = i; return *this; } Input& with_chrono(bool i) { chrono = i; return *this; } }; diff --git a/include/atrip/Debug.hpp b/include/atrip/Debug.hpp index 9153954..6bdfde2 100644 --- a/include/atrip/Debug.hpp +++ b/include/atrip/Debug.hpp @@ -1,5 +1,6 @@ -// [[file:../../atrip.org::*Debug][Debug:1]] +// [[file:../../atrip.org::*Macros][Macros:1]] #pragma once +#include #define ATRIP_BENCHMARK //#define ATRIP_DONT_SLICE #define ATRIP_DEBUG 1 @@ -7,10 +8,12 @@ #define ATRIP_USE_DGEMM //#define ATRIP_PRINT_TUPLES -#define LOG(level, name) if (Atrip::rank == 0) std::cout << name << ": " +#ifndef ATRIP_DEBUG +#define ATRIP_DEBUG 1 +#endif #if ATRIP_DEBUG == 4 -# pragma message("WARNING: You have OCD debugging ABC triples "\ +# pragma 
message("WARNING: You have OCD debugging ABC triples " \ "expect GB of output and consult your therapist") # include # define HAVE_OCD @@ -23,7 +26,7 @@ # define WITH_DBG # define DBG(...) dbg(__VA_ARGS__) #elif ATRIP_DEBUG == 3 -# pragma message("WARNING: You have crazy debugging ABC triples,"\ +# pragma message("WARNING: You have crazy debugging ABC triples," \ " expect GB of output") # include # define OCD_Barrier(com) @@ -45,7 +48,7 @@ # define WITH_CRAZY_DEBUG if (false) # define WITH_DBG # define DBG(...) dbg(__VA_ARGS__) -#elif ATRIP_DEBUG == 1 +#else # define OCD_Barrier(com) # define WITH_OCD if (false) # define WITH_ROOT if (false) @@ -54,7 +57,35 @@ # define WITH_DBG if (false) # define WITH_CRAZY_DEBUG if (false) # define DBG(...) -#else -# error("ATRIP_DEBUG is not defined!") #endif -// Debug:1 ends here +// Macros:1 ends here + +// [[file:../../atrip.org::*Macros][Macros:2]] +#ifndef LOG +#define LOG(level, name) if (Atrip::rank == 0) std::cout << name << ": " +#endif +// Macros:2 ends here + +// [[file:../../atrip.org::*Macros][Macros:3]] +#ifdef ATRIP_NO_OUTPUT +# undef LOG +# define LOG(level, name) if (false) std::cout << name << ": " +#endif +// Macros:3 ends here + +// [[file:../../atrip.org::IterationDescriptor][IterationDescriptor]] +namespace atrip { + + struct IterationDescription; + using IterationDescriptor = std::function; + struct IterationDescription { + static IterationDescriptor descriptor; + size_t currentIteration; + size_t totalIterations; + double currentElapsedTime; + }; + + void registerIterationDescriptor(IterationDescriptor); + +} +// IterationDescriptor ends here diff --git a/src/atrip/Atrip.cxx b/src/atrip/Atrip.cxx index 06c4079..a6addc6 100644 --- a/src/atrip/Atrip.cxx +++ b/src/atrip/Atrip.cxx @@ -12,6 +12,12 @@ using namespace atrip; int Atrip::rank; int Atrip::np; +// user printing block +IterationDescriptor IterationDescription::descriptor; +void atrip::registerIterationDescriptor(IterationDescriptor d) { + 
IterationDescription::descriptor = d; +} + void Atrip::init() { MPI_Comm_rank(MPI_COMM_WORLD, &Atrip::rank); MPI_Comm_size(MPI_COMM_WORLD, &Atrip::np); @@ -92,15 +98,6 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { auto abcIndex = getABCRange(np, rank, tuplesList); size_t nIterations = abcIndex.second - abcIndex.first; -#ifdef ATRIP_BENCHMARK - { const size_t maxIterations = in.maxIterations; - if (maxIterations != 0) { - abcIndex.second = abcIndex.first + maxIterations % (nIterations + 1); - nIterations = maxIterations % (nIterations + 1); - } - } -#endif - WITH_RANK << "abcIndex = " << pretty_print(abcIndex) << "\n"; LOG(0,"Atrip") << "#iterations: " << nIterations << "\n"; @@ -110,6 +107,12 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { double energy(0.); + size_t iterationMod + = (in.percentageMod > 0) + ? nIterations * in.percentageMod / 100 + : in.iterationMod + ; + auto const isFakeTuple = [&tuplesList](size_t const i) { return i >= tuplesList.size(); }; @@ -275,7 +278,16 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { chrono["mpi:barrier"].stop(); chrono["oneshot-mpi:barrier"].stop(); - if (iteration % in.iterationMod == 0) { + if (iteration % iterationMod == 0) { + + if (IterationDescription::descriptor) { + IterationDescription::descriptor({ + iteration, + nIterations, + chrono["iterations"].count() + }); + } + LOG(0,"Atrip") << "iteration " << iteration << " [" << 100 * iteration / nIterations << "%]" From 6fa915db3ae4ba06463b684dfa879421fa2463a2 Mon Sep 17 00:00:00 2001 From: Alejandro Gallo Date: Tue, 14 Dec 2021 17:50:22 +0100 Subject: [PATCH 02/22] Add 1% printing --- atrip.org | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/atrip.org b/atrip.org index 72d2601..30be0ca 100644 --- a/atrip.org +++ b/atrip.org @@ -1821,6 +1821,7 @@ namespace atrip { #+end_src ** Atrip +*** Header #+begin_src c++ :tangle (atrip-atrip-h) #pragma once #include @@ -1984,13 +1985,16 @@ Atrip::Output Atrip::run(Atrip::Input 
const& in) { double energy(0.); - size_t iterationMod - = (in.percentageMod > 0) - ? nIterations * in.percentageMod / 100 - : in.iterationMod + const size_t + iterationMod = (in.percentageMod > 0) + ? nIterations * in.percentageMod / 100 + : in.iterationMod + + , iteration1Percent = nIterations * 0.01 ; + auto const isFakeTuple = [&tuplesList](size_t const i) { return i >= tuplesList.size(); }; @@ -2155,7 +2159,7 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { chrono["mpi:barrier"].stop(); chrono["oneshot-mpi:barrier"].stop(); - if (iteration % iterationMod == 0) { + if (iteration % iterationMod == 0 || iteration == iteration1Percent) { if (IterationDescription::descriptor) { IterationDescription::descriptor({ From b1175997af0128a2e9b2b3acf66daefc4741e960 Mon Sep 17 00:00:00 2001 From: Alejandro Gallo Date: Tue, 14 Dec 2021 18:00:25 +0100 Subject: [PATCH 03/22] Add tangled code --- src/atrip/Atrip.cxx | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/atrip/Atrip.cxx b/src/atrip/Atrip.cxx index a6addc6..64dea9b 100644 --- a/src/atrip/Atrip.cxx +++ b/src/atrip/Atrip.cxx @@ -107,13 +107,16 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { double energy(0.); - size_t iterationMod - = (in.percentageMod > 0) - ? nIterations * in.percentageMod / 100 - : in.iterationMod + const size_t + iterationMod = (in.percentageMod > 0) + ? 
nIterations * in.percentageMod / 100 + : in.iterationMod + + , iteration1Percent = nIterations * 0.01 ; + auto const isFakeTuple = [&tuplesList](size_t const i) { return i >= tuplesList.size(); }; @@ -278,7 +281,7 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { chrono["mpi:barrier"].stop(); chrono["oneshot-mpi:barrier"].stop(); - if (iteration % iterationMod == 0) { + if (iteration % iterationMod == 0 || iteration == iteration1Percent) { if (IterationDescription::descriptor) { IterationDescription::descriptor({ From 8c39827061c12598130e21129f162b6273a2a22b Mon Sep 17 00:00:00 2001 From: Alejandro Gallo Date: Thu, 27 Jan 2022 20:39:00 +0100 Subject: [PATCH 04/22] Add type traits --- atrip.org | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/atrip.org b/atrip.org index 30be0ca..43a7f04 100644 --- a/atrip.org +++ b/atrip.org @@ -8,7 +8,6 @@ The algorithm uses two main data types, the =Slice= and the ** The slice - #+begin_src c++ :tangle (atrip-slice-h) #pragma once #include @@ -18,9 +17,20 @@ The algorithm uses two main data types, the =Slice= and the #include #include +#include namespace atrip { +namespace traits { + template bool isComplex() { return false; }; + template <> bool isComplex() { return true; }; +namespace mpi { + template MPI_Datatype datatypeOf(void); + template <> MPI_Datatype datatypeOf() { return MPI_DOUBLE; } + template <> MPI_Datatype datatypeOf() { return MPI_DOUBLE_COMPLEX; } +} +} + struct Slice { From 61662e27175b091c2fda40b0ac349c8bbd69e955 Mon Sep 17 00:00:00 2001 From: Alejandro Gallo Date: Thu, 27 Jan 2022 20:40:37 +0100 Subject: [PATCH 05/22] Templatize Slice --- atrip.org | 48 +++++++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/atrip.org b/atrip.org index 43a7f04..8d06e58 100644 --- a/atrip.org +++ b/atrip.org @@ -32,9 +32,9 @@ namespace mpi { } +template struct Slice { - using F = double; #+end_src A slice is the concept of a subset of 
values of a given tensor. @@ -124,8 +124,8 @@ As an example, for the doubles amplitudes \( T^{ab}_{ij} \), one need two kinds // DATABASE ==========================================================={{{1 struct LocalDatabaseElement { - Slice::Name name; - Slice::Info info; + Slice::Name name; + Slice::Info info; }; using LocalDatabase = std::vector; using Database = LocalDatabase; @@ -148,7 +148,7 @@ As an example, for the doubles amplitudes \( T^{ab}_{ij} \), one need two kinds constexpr int n = 2; // create a sliceLocation to measure in the current architecture // the packing of the struct - Slice::Location measure; + Slice::Location measure; MPI_Datatype dt; const std::vector lengths(n, 1); const MPI_Datatype types[n] = {usizeDt(), usizeDt()}; @@ -172,7 +172,7 @@ As an example, for the doubles amplitudes \( T^{ab}_{ij} \), one need two kinds static MPI_Datatype sliceInfo () { constexpr int n = 5; MPI_Datatype dt; - Slice::Info measure; + Slice::Info measure; const std::vector lengths(n, 1); const MPI_Datatype types[n] = { vector(2, usizeDt()) @@ -244,10 +244,10 @@ As an example, for the doubles amplitudes \( T^{ab}_{ij} \), one need two kinds ,* It is important here to return a reference to a Slice ,* not to accidentally copy the associated buffer of the slice. 
,*/ - static Slice& findOneByType(std::vector &slices, Slice::Type type) { + static Slice& findOneByType(std::vector> &slices, Slice::Type type) { const auto sliceIt = std::find_if(slices.begin(), slices.end(), - [&type](Slice const& s) { + [&type](Slice const& s) { return type == s.info.type; }); WITH_CRAZY_DEBUG @@ -262,11 +262,11 @@ As an example, for the doubles amplitudes \( T^{ab}_{ij} \), one need two kinds ,* Check if an info has ,* ,*/ - static std::vector hasRecycledReferencingToIt - ( std::vector &slices + static std::vector*> hasRecycledReferencingToIt + ( std::vector> &slices , Info const& info ) { - std::vector result; + std::vector*> result; for (auto& s: slices) if ( s.info.recycling == info.type @@ -277,11 +277,11 @@ As an example, for the doubles amplitudes \( T^{ab}_{ij} \), one need two kinds return result; } - static Slice& - findRecycledSource (std::vector &slices, Slice::Info info) { + static Slice& + findRecycledSource (std::vector> &slices, Slice::Info info) { const auto sliceIt = std::find_if(slices.begin(), slices.end(), - [&info](Slice const& s) { + [&info](Slice const& s) { return info.recycling == s.info.type && info.tuple == s.info.tuple && State::Recycled != s.info.state @@ -301,15 +301,15 @@ As an example, for the doubles amplitudes \( T^{ab}_{ij} \), one need two kinds return *sliceIt; } - static Slice& findByTypeAbc - ( std::vector &slices - , Slice::Type type + static Slice& findByTypeAbc + ( std::vector> &slices + , Slice::Type type , ABCTuple const& abc ) { - const auto tuple = Slice::subtupleBySlice(abc, type); + const auto tuple = Slice::subtupleBySlice(abc, type); const auto sliceIt = std::find_if(slices.begin(), slices.end(), - [&type, &tuple](Slice const& s) { + [&type, &tuple](Slice const& s) { return type == s.info.type && tuple == s.info.tuple ; @@ -329,11 +329,11 @@ As an example, for the doubles amplitudes \( T^{ab}_{ij} \), one need two kinds return *sliceIt; } - static Slice& findByInfo(std::vector &slices, - 
Slice::Info const& info) { + static Slice& findByInfo(std::vector> &slices, + Slice::Info const& info) { const auto sliceIt = std::find_if(slices.begin(), slices.end(), - [&info](Slice const& s) { + [&info](Slice const& s) { // TODO: maybe implement comparison in Info struct return info.type == s.info.type && info.state == s.info.state @@ -479,13 +479,15 @@ As an example, for the doubles amplitudes \( T^{ab}_{ij} \), one need two kinds }; // struct Slice -std::ostream& operator<<(std::ostream& out, Slice::Location const& v) { +template +std::ostream& operator<<(std::ostream& out, typename Slice::Location const& v) { // TODO: remove me out << "{.r(" << v.rank << "), .s(" << v.source << ")};"; return out; } -std::ostream& operator<<(std::ostream& out, Slice::Info const& i) { +template +std::ostream& operator<<(std::ostream& out, typename Slice::Info const& i) { out << "«t" << i.type << ", s" << i.state << "»" << " ⊙ {" << i.from.rank << ", " << i.from.source << "}" << " ∴ {" << i.tuple[0] << ", " << i.tuple[1] << "}" From 4543e712b3b576535991d1210bdb6a645abd823d Mon Sep 17 00:00:00 2001 From: Alejandro Gallo Date: Thu, 27 Jan 2022 20:41:08 +0100 Subject: [PATCH 06/22] Templatize RankMap --- atrip.org | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/atrip.org b/atrip.org index 8d06e58..e38133b 100644 --- a/atrip.org +++ b/atrip.org @@ -549,6 +549,8 @@ namespace atrip { #include namespace atrip { + + template struct RankMap { std::vector const lengths; @@ -561,7 +563,7 @@ namespace atrip { 1UL, std::multiplies())) { assert(lengths.size() <= 2); } - size_t find(Slice::Location const& p) const noexcept { + size_t find(typename Slice::Location const& p) const noexcept { return p.source * np + p.rank; } @@ -581,10 +583,10 @@ namespace atrip { return source == nSources() && isPaddingRank(rank); } - Slice::Location - find(ABCTuple const& abc, Slice::Type sliceType) const noexcept { + typename Slice::Location + find(ABCTuple const& abc, typename 
Slice::Type sliceType) const noexcept { // tuple = {11, 8} when abc = {11, 8, 9} and sliceType = AB - const auto tuple = Slice::subtupleBySlice(abc, sliceType); + const auto tuple = Slice::subtupleBySlice(abc, sliceType); const size_t index = tuple[0] From 6776a7134c92eccba0a7e6b0643d2ec0b59337c9 Mon Sep 17 00:00:00 2001 From: Alejandro Gallo Date: Thu, 27 Jan 2022 20:43:41 +0100 Subject: [PATCH 07/22] Templatize SliceUnion --- atrip.org | 108 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 56 insertions(+), 52 deletions(-) diff --git a/atrip.org b/atrip.org index e38133b..52398c1 100644 --- a/atrip.org +++ b/atrip.org @@ -613,8 +613,8 @@ namespace atrip { namespace atrip { + template struct SliceUnion { - using F = double; using Tensor = CTF::Tensor; virtual void @@ -627,7 +627,7 @@ namespace atrip { * This means that there can be at most one slice with a given Ty_x_Tu. */ void checkForDuplicates() const { - std::vector tytus; + std::vector::Ty_x_Tu> tytus; for (auto const& s: slices) { if (s.isFree()) continue; tytus.push_back({s.info.type, s.info.tuple}); @@ -640,13 +640,13 @@ namespace atrip { } - std::vector neededSlices(ABCTuple const& abc) { - std::vector needed(sliceTypes.size()); + std::vector::Ty_x_Tu> neededSlices(ABCTuple const& abc) { + std::vector::Ty_x_Tu> needed(sliceTypes.size()); // build the needed vector std::transform(sliceTypes.begin(), sliceTypes.end(), needed.begin(), - [&abc](Slice::Type const type) { - auto tuple = Slice::subtupleBySlice(abc, type); + [&abc](typename Slice::Type const type) { + auto tuple = Slice::subtupleBySlice(abc, type); return std::make_pair(type, tuple); }); return needed; @@ -671,8 +671,9 @@ namespace atrip { * slices. 
* */ - Slice::LocalDatabase buildLocalDatabase(ABCTuple const& abc) { - Slice::LocalDatabase result; + typename + Slice::LocalDatabase buildLocalDatabase(ABCTuple const& abc) { + typename Slice::LocalDatabase result; auto const needed = neededSlices(abc); @@ -702,7 +703,7 @@ namespace atrip { // need auto const& it = std::find_if(slices.begin(), slices.end(), - [&tuple, &type](Slice const& other) { + [&tuple, &type](Slice const& other) { return other.info.tuple == tuple && other.info.type == type // we only want another slice when it @@ -728,7 +729,7 @@ namespace atrip { // tuple and that has a valid data pointer. auto const& recycleIt = std::find_if(slices.begin(), slices.end(), - [&tuple, &type](Slice const& other) { + [&tuple, &type](Slice const& other) { return other.info.tuple == tuple && other.info.type != type && other.isRecyclable() @@ -739,13 +740,13 @@ namespace atrip { // (which should exist by construction :THINK) // if (recycleIt != slices.end()) { - auto& blank = Slice::findOneByType(slices, Slice::Blank); + auto& blank = Slice::findOneByType(slices, Slice::Blank); // TODO: formalize this through a method to copy information // from another slice blank.data = recycleIt->data; blank.info.type = type; blank.info.tuple = tuple; - blank.info.state = Slice::Recycled; + blank.info.state = Slice::Recycled; blank.info.from = from; blank.info.recycling = recycleIt->info.type; result.push_back({name, blank.info}); @@ -772,17 +773,17 @@ namespace atrip { << " for tuple " << tuple[0] << ", " << tuple[1] << "\n" ; - auto& blank = Slice::findOneByType(slices, Slice::Blank); + auto& blank = Slice::findOneByType(slices, Slice::Blank); blank.info.type = type; blank.info.tuple = tuple; blank.info.from = from; // Handle self sufficiency blank.info.state = Atrip::rank == from.rank - ? Slice::SelfSufficient - : Slice::Fetch + ? 
Slice::SelfSufficient + : Slice::Fetch ; - if (blank.info.state == Slice::SelfSufficient) { + if (blank.info.state == Slice::SelfSufficient) { blank.data = sources[from.source].data(); } else { if (freePointers.size() == 0) @@ -826,7 +827,7 @@ namespace atrip { // try to find the slice in the needed slices list auto const found = std::find_if(needed.begin(), needed.end(), - [&slice] (Slice::Ty_x_Tu const& tytu) { + [&slice] (typename Slice::Ty_x_Tu const& tytu) { return slice.info.tuple == tytu.second && slice.info.type == tytu.first ; @@ -845,7 +846,7 @@ namespace atrip { // allow to gc unwrapped and recycled, never Fetch, // if we have a Fetch slice then something has gone very wrong. - if (!slice.isUnwrapped() && slice.info.state != Slice::Recycled) + if (!slice.isUnwrapped() && slice.info.state != Slice::Recycled) throw std::domain_error("Trying to garbage collect " " a non-unwrapped slice! " @@ -866,13 +867,13 @@ namespace atrip { // - we should make sure that the data pointer of slice // does not get freed. // - if (slice.info.state == Slice::Ready) { + if (slice.info.state == Slice::Ready) { WITH_OCD WITH_RANK << "__gc__:" << "checking for data recycled dependencies\n"; auto recycled - = Slice::hasRecycledReferencingToIt(slices, slice.info); + = Slice::hasRecycledReferencingToIt(slices, slice.info); if (recycled.size()) { - Slice* newReady = recycled[0]; + Slice* newReady = recycled[0]; WITH_OCD WITH_RANK << "__gc__:" << "swaping recycled " << pretty_print(newReady->info) @@ -897,8 +898,8 @@ namespace atrip { // if the slice is self sufficient, do not dare touching the // pointer, since it is a pointer to our sources in our rank. 
- if ( slice.info.state == Slice::SelfSufficient - || slice.info.state == Slice::Recycled + if ( slice.info.state == Slice::SelfSufficient + || slice.info.state == Slice::Recycled ) { freeSlicePointer = false; } @@ -920,7 +921,8 @@ namespace atrip { // at this point, let us blank the slice WITH_RANK << "~~~:cl(" << name << ")" << " freeing up slice " - << " info " << slice.info + // TODO: make this possible + // << " info " << slice.info << "\n"; slice.free(); } @@ -930,13 +932,13 @@ namespace atrip { // CONSTRUCTOR SliceUnion( Tensor const& sourceTensor - , std::vector sliceTypes_ + , std::vector::Type> sliceTypes_ , std::vector sliceLength_ , std::vector paramLength , size_t np , MPI_Comm child_world , MPI_Comm global_world - , Slice::Name name_ + , typename Slice::Name name_ , size_t nSliceBuffers = 4 ) : rankMap(paramLength, np) @@ -951,13 +953,13 @@ namespace atrip { , name(name_) , sliceTypes(sliceTypes_) , sliceBuffers(nSliceBuffers, sources[0]) - //, slices(2 * sliceTypes.size(), Slice{ sources[0].size() }) + //, slices(2 * sliceTypes.size(), Slice{ sources[0].size() }) { // constructor begin LOG(0,"Atrip") << "INIT SliceUnion: " << name << "\n"; slices - = std::vector(2 * sliceTypes.size(), { sources[0].size() }); + = std::vector>(2 * sliceTypes.size(), { sources[0].size() }); // TODO: think exactly ^------------------- about this number // initialize the freePointers with the pointers to the buffers @@ -1026,19 +1028,19 @@ namespace atrip { * \brief Send asynchronously only if the state is Fetch */ void send( size_t otherRank - , Slice::Info const& info + , typename Slice::Info const& info , size_t tag) const noexcept { MPI_Request request; bool sendData_p = false; - if (info.state == Slice::Fetch) sendData_p = true; + if (info.state == Slice::Fetch) sendData_p = true; // TODO: remove this because I have SelfSufficient if (otherRank == info.from.rank) sendData_p = false; if (!sendData_p) return; MPI_Isend( sources[info.from.source].data() , 
sources[info.from.source].size() - , MPI_DOUBLE /* TODO: adapt this with traits */ + , traits::mpi::datatypeOf() , otherRank , tag , universe @@ -1052,19 +1054,19 @@ namespace atrip { /** * \brief Receive asynchronously only if the state is Fetch */ - void receive(Slice::Info const& info, size_t tag) noexcept { - auto& slice = Slice::findByInfo(slices, info); + void receive(typename Slice::Info const& info, size_t tag) noexcept { + auto& slice = Slice::findByInfo(slices, info); if (Atrip::rank == info.from.rank) return; - if (slice.info.state == Slice::Fetch) { + if (slice.info.state == Slice::Fetch) { // TODO: do it through the slice class - slice.info.state = Slice::Dispatched; + slice.info.state = Slice::Dispatched; MPI_Request request; slice.request = request; MPI_Irecv( slice.data , slice.size - , MPI_DOUBLE // TODO: Adapt this with traits + , traits::mpi::datatypeOf() , info.from.rank , tag , universe @@ -1078,42 +1080,42 @@ namespace atrip { for (auto type: sliceTypes) unwrapSlice(type, abc); } - F* unwrapSlice(Slice::Type type, ABCTuple const& abc) { + F* unwrapSlice(typename Slice::Type type, ABCTuple const& abc) { WITH_CRAZY_DEBUG WITH_RANK << "__unwrap__:slice " << type << " w n " << name << " abc" << pretty_print(abc) << "\n"; - auto& slice = Slice::findByTypeAbc(slices, type, abc); - WITH_RANK << "__unwrap__:info " << slice.info << "\n"; + auto& slice = Slice::findByTypeAbc(slices, type, abc); + //WITH_RANK << "__unwrap__:info " << slice.info << "\n"; switch (slice.info.state) { - case Slice::Dispatched: + case Slice::Dispatched: WITH_RANK << "__unwrap__:Fetch: " << &slice << " info " << pretty_print(slice.info) << "\n"; slice.unwrapAndMarkReady(); return slice.data; break; - case Slice::SelfSufficient: + case Slice::SelfSufficient: WITH_RANK << "__unwrap__:SelfSufficient: " << &slice << " info " << pretty_print(slice.info) << "\n"; return slice.data; break; - case Slice::Ready: + case Slice::Ready: WITH_RANK << "__unwrap__:READY: UNWRAPPED ALREADY" << 
&slice << " info " << pretty_print(slice.info) << "\n"; return slice.data; break; - case Slice::Recycled: + case Slice::Recycled: WITH_RANK << "__unwrap__:RECYCLED " << &slice << " info " << pretty_print(slice.info) << "\n"; return unwrapSlice(slice.info.recycling, abc); break; - case Slice::Fetch: - case Slice::Acceptor: + case Slice::Fetch: + case Slice::Acceptor: throw std::domain_error("Can't unwrap an acceptor or fetch slice!"); break; default: @@ -1122,24 +1124,26 @@ namespace atrip { return slice.data; } - const RankMap rankMap; + const RankMap rankMap; const MPI_Comm world; const MPI_Comm universe; const std::vector sliceLength; std::vector< std::vector > sources; - std::vector< Slice > slices; - Slice::Name name; - const std::vector sliceTypes; + std::vector< Slice > slices; + typename Slice::Name name; + const std::vector::Type> sliceTypes; std::vector< std::vector > sliceBuffers; std::set freePointers; }; - SliceUnion& - unionByName(std::vector const& unions, Slice::Name name) { + template + SliceUnion& + unionByName(std::vector*> const& unions, + typename Slice::Name name) { const auto sliceUnionIt = std::find_if(unions.begin(), unions.end(), - [&name](SliceUnion const* s) { + [&name](SliceUnion const* s) { return name == s->name; }); if (sliceUnionIt == unions.end()) From 05f5bb6104276ebb7ea36f241a19849568c0d16b Mon Sep 17 00:00:00 2001 From: Alejandro Gallo Date: Thu, 27 Jan 2022 20:44:09 +0100 Subject: [PATCH 08/22] Templatize unions --- atrip.org | 202 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 104 insertions(+), 98 deletions(-) diff --git a/atrip.org b/atrip.org index 52398c1..e2e6e69 100644 --- a/atrip.org +++ b/atrip.org @@ -1241,12 +1241,13 @@ and define subclasses of slice unions. 
namespace atrip { + template void sliceIntoVector - ( std::vector &v - , CTF::Tensor &toSlice + ( std::vector &v + , CTF::Tensor &toSlice , std::vector const low , std::vector const up - , CTF::Tensor const& origin + , CTF::Tensor const& origin , std::vector const originLow , std::vector const originUp ) { @@ -1273,155 +1274,159 @@ namespace atrip { , origin_.low.data() , origin_.up.data() , 1.0); - memcpy(v.data(), toSlice.data, sizeof(double) * v.size()); + memcpy(v.data(), toSlice.data, sizeof(F) * v.size()); #endif } - struct TAPHH : public SliceUnion { - TAPHH( Tensor const& sourceTensor + template + struct TAPHH : public SliceUnion { + TAPHH( CTF::Tensor const& sourceTensor , size_t No , size_t Nv , size_t np , MPI_Comm child_world , MPI_Comm global_world - ) : SliceUnion( sourceTensor - , {Slice::A, Slice::B, Slice::C} - , {Nv, No, No} // size of the slices - , {Nv} - , np - , child_world - , global_world - , Slice::TA - , 4) { + ) : SliceUnion( sourceTensor + , {Slice::A, Slice::B, Slice::C} + , {Nv, No, No} // size of the slices + , {Nv} + , np + , child_world + , global_world + , Slice::TA + , 4) { init(sourceTensor); } - void sliceIntoBuffer(size_t it, Tensor &to, Tensor const& from) override + void sliceIntoBuffer(size_t it, CTF::Tensor &to, CTF::Tensor const& from) override { - const int Nv = sliceLength[0] - , No = sliceLength[1] - , a = rankMap.find({static_cast(Atrip::rank), it}); + const int Nv = this->sliceLength[0] + , No = this->sliceLength[1] + , a = this->rankMap.find({static_cast(Atrip::rank), it}); ; - sliceIntoVector( sources[it] - , to, {0, 0, 0}, {Nv, No, No} - , from, {a, 0, 0, 0}, {a+1, Nv, No, No} - ); + sliceIntoVector( this->sources[it] + , to, {0, 0, 0}, {Nv, No, No} + , from, {a, 0, 0, 0}, {a+1, Nv, No, No} + ); } }; - struct HHHA : public SliceUnion { - HHHA( Tensor const& sourceTensor + template + struct HHHA : public SliceUnion { + HHHA( CTF::Tensor const& sourceTensor , size_t No , size_t Nv , size_t np , MPI_Comm child_world , 
MPI_Comm global_world - ) : SliceUnion( sourceTensor - , {Slice::A, Slice::B, Slice::C} - , {No, No, No} // size of the slices - , {Nv} // size of the parametrization - , np - , child_world - , global_world - , Slice::VIJKA - , 4) { + ) : SliceUnion( sourceTensor + , {Slice::A, Slice::B, Slice::C} + , {No, No, No} // size of the slices + , {Nv} // size of the parametrization + , np + , child_world + , global_world + , Slice::VIJKA + , 4) { init(sourceTensor); } - void sliceIntoBuffer(size_t it, Tensor &to, Tensor const& from) override + void sliceIntoBuffer(size_t it, CTF::Tensor &to, CTF::Tensor const& from) override { - const int No = sliceLength[0] - , a = rankMap.find({static_cast(Atrip::rank), it}) + const int No = this->sliceLength[0] + , a = this->rankMap.find({static_cast(Atrip::rank), it}) ; - sliceIntoVector( sources[it] - , to, {0, 0, 0}, {No, No, No} - , from, {0, 0, 0, a}, {No, No, No, a+1} - ); + sliceIntoVector( this->sources[it] + , to, {0, 0, 0}, {No, No, No} + , from, {0, 0, 0, a}, {No, No, No, a+1} + ); } }; - struct ABPH : public SliceUnion { - ABPH( Tensor const& sourceTensor + template + struct ABPH : public SliceUnion { + ABPH( CTF::Tensor const& sourceTensor , size_t No , size_t Nv , size_t np , MPI_Comm child_world , MPI_Comm global_world - ) : SliceUnion( sourceTensor - , { Slice::AB, Slice::BC, Slice::AC - , Slice::BA, Slice::CB, Slice::CA - } - , {Nv, No} // size of the slices - , {Nv, Nv} // size of the parametrization - , np - , child_world - , global_world - , Slice::VABCI - , 2*6) { + ) : SliceUnion( sourceTensor + , { Slice::AB, Slice::BC, Slice::AC + , Slice::BA, Slice::CB, Slice::CA + } + , {Nv, No} // size of the slices + , {Nv, Nv} // size of the parametrization + , np + , child_world + , global_world + , Slice::VABCI + , 2*6) { init(sourceTensor); } - void sliceIntoBuffer(size_t it, Tensor &to, Tensor const& from) override { + void sliceIntoBuffer(size_t it, CTF::Tensor &to, CTF::Tensor const& from) override { - const int Nv = 
sliceLength[0] - , No = sliceLength[1] - , el = rankMap.find({static_cast(Atrip::rank), it}) + const int Nv = this->sliceLength[0] + , No = this->sliceLength[1] + , el = this->rankMap.find({static_cast(Atrip::rank), it}) , a = el % Nv , b = el / Nv ; - sliceIntoVector( sources[it] - , to, {0, 0}, {Nv, No} - , from, {a, b, 0, 0}, {a+1, b+1, Nv, No} - ); + sliceIntoVector( this->sources[it] + , to, {0, 0}, {Nv, No} + , from, {a, b, 0, 0}, {a+1, b+1, Nv, No} + ); } }; - struct ABHH : public SliceUnion { - ABHH( Tensor const& sourceTensor + template + struct ABHH : public SliceUnion { + ABHH( CTF::Tensor const& sourceTensor , size_t No , size_t Nv , size_t np , MPI_Comm child_world , MPI_Comm global_world - ) : SliceUnion( sourceTensor - , {Slice::AB, Slice::BC, Slice::AC} - , {No, No} // size of the slices - , {Nv, Nv} // size of the parametrization - , np - , child_world - , global_world - , Slice::VABIJ - , 6) { + ) : SliceUnion( sourceTensor + , {Slice::AB, Slice::BC, Slice::AC} + , {No, No} // size of the slices + , {Nv, Nv} // size of the parametrization + , np + , child_world + , global_world + , Slice::VABIJ + , 6) { init(sourceTensor); } - void sliceIntoBuffer(size_t it, Tensor &to, Tensor const& from) override { + void sliceIntoBuffer(size_t it, CTF::Tensor &to, CTF::Tensor const& from) override { const int Nv = from.lens[0] - , No = sliceLength[1] - , el = rankMap.find({static_cast(Atrip::rank), it}) + , No = this->sliceLength[1] + , el = this->rankMap.find({static_cast(Atrip::rank), it}) , a = el % Nv , b = el / Nv ; - sliceIntoVector( sources[it] - , to, {0, 0}, {No, No} - , from, {a, b, 0, 0}, {a+1, b+1, No, No} - ); + sliceIntoVector( this->sources[it] + , to, {0, 0}, {No, No} + , from, {a, b, 0, 0}, {a+1, b+1, No, No} + ); } @@ -1429,39 +1434,40 @@ namespace atrip { }; - struct TABHH : public SliceUnion { - TABHH( Tensor const& sourceTensor + template + struct TABHH : public SliceUnion { + TABHH( CTF::Tensor const& sourceTensor , size_t No , size_t Nv , 
size_t np , MPI_Comm child_world , MPI_Comm global_world - ) : SliceUnion( sourceTensor - , {Slice::AB, Slice::BC, Slice::AC} - , {No, No} // size of the slices - , {Nv, Nv} // size of the parametrization - , np - , child_world - , global_world - , Slice::TABIJ - , 6) { + ) : SliceUnion( sourceTensor + , {Slice::AB, Slice::BC, Slice::AC} + , {No, No} // size of the slices + , {Nv, Nv} // size of the parametrization + , np + , child_world + , global_world + , Slice::TABIJ + , 6) { init(sourceTensor); } - void sliceIntoBuffer(size_t it, Tensor &to, Tensor const& from) override { + void sliceIntoBuffer(size_t it, CTF::Tensor &to, CTF::Tensor const& from) override { // TODO: maybe generalize this with ABHH const int Nv = from.lens[0] - , No = sliceLength[1] - , el = rankMap.find({static_cast(Atrip::rank), it}) + , No = this->sliceLength[1] + , el = this->rankMap.find({static_cast(Atrip::rank), it}) , a = el % Nv , b = el / Nv ; - sliceIntoVector( sources[it] - , to, {0, 0}, {No, No} - , from, {a, b, 0, 0}, {a+1, b+1, No, No} - ); + sliceIntoVector( this->sources[it] + , to, {0, 0}, {No, No} + , from, {a, b, 0, 0}, {a+1, b+1, No, No} + ); } From 9d684b6624798c01724759fab160c58d9b87c1c0 Mon Sep 17 00:00:00 2001 From: Alejandro Gallo Date: Thu, 27 Jan 2022 20:45:03 +0100 Subject: [PATCH 09/22] Templatize energy functions --- atrip.org | 119 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 70 insertions(+), 49 deletions(-) diff --git a/atrip.org b/atrip.org index e2e6e69..2d218b7 100644 --- a/atrip.org +++ b/atrip.org @@ -1487,14 +1487,15 @@ namespace atrip { namespace atrip { + template double getEnergyDistinct - ( const double epsabc - , std::vector const& epsi - , std::vector const& Tijk_ - , std::vector const& Zijk_ + ( const F epsabc + , std::vector const& epsi + , std::vector const& Tijk_ + , std::vector const& Zijk_ ) { constexpr size_t blockSize=16; - double energy(0.); + F energy(0.); const size_t No = epsi.size(); for (size_t kk=0; kk k ? 
jj : k; for (size_t j(jstart); j < jend; j++){ - const double ej(epsi[j]); - double facjk( j == k ? 0.5 : 1.0); + F const ej(epsi[j]); + F const facjk = j == k ? F(0.5) : F(1.0); size_t istart = ii > j ? ii : j; for (size_t i(istart); i < iend; i++){ - const double ei(epsi[i]); - double facij ( i==j ? 0.5 : 1.0); - double denominator(epsabc - ei - ej - ek); - double U(Zijk_[i + No*j + No*No*k]); - double V(Zijk_[i + No*k + No*No*j]); - double W(Zijk_[j + No*i + No*No*k]); - double X(Zijk_[j + No*k + No*No*i]); - double Y(Zijk_[k + No*i + No*No*j]); - double Z(Zijk_[k + No*j + No*No*i]); - - double A(Tijk_[i + No*j + No*No*k]); - double B(Tijk_[i + No*k + No*No*j]); - double C(Tijk_[j + No*i + No*No*k]); - double D(Tijk_[j + No*k + No*No*i]); - double E(Tijk_[k + No*i + No*No*j]); - double F(Tijk_[k + No*j + No*No*i]); - double value(3.0*(A*U+B*V+C*W+D*X+E*Y+F*Z) - +((U+X+Y)-2.0*(V+W+Z))*(A+D+E) - +((V+W+Z)-2.0*(U+X+Y))*(B+C+F)); - energy += 2.0*value / denominator * facjk * facij; + const F + ei(epsi[i]) + , facij = i == j ? 
F(0.5) : F(1.0) + , denominator(epsabc - ei - ej - ek) + , U(Zijk_[i + No*j + No*No*k]) + , V(Zijk_[i + No*k + No*No*j]) + , W(Zijk_[j + No*i + No*No*k]) + , X(Zijk_[j + No*k + No*No*i]) + , Y(Zijk_[k + No*i + No*No*j]) + , Z(Zijk_[k + No*j + No*No*i]) + , A(std::conj(Tijk_[i + No*j + No*No*k])) + , B(std::conj(Tijk_[i + No*k + No*No*j])) + , C(std::conj(Tijk_[j + No*i + No*No*k])) + , D(std::conj(Tijk_[j + No*k + No*No*i])) + , E(std::conj(Tijk_[k + No*i + No*No*j])) + , F(std::conj(Tijk_[k + No*j + No*No*i])) + , value + = 3.0 * ( A * U + + B * V + + C * W + + D * X + + E * Y + + F * Z ) + + ( ( U + X + Y ) + - 2.0 * ( V + W + Z ) + ) * ( A + D + E ) + + ( ( V + W + Z ) + - 2.0 * ( U + X + Y ) + ) * ( B + C + F ) + ; + energy += 2.0 * value / denominator * facjk * facij; } // i } // j } // k } // ii } // jj } // kk - return energy; + return std::real(energy); } + template double getEnergySame - ( const double epsabc - , std::vector const& epsi - , std::vector const& Tijk_ - , std::vector const& Zijk_ + ( const F epsabc + , std::vector const& epsi + , std::vector const& Tijk_ + , std::vector const& Zijk_ ) { constexpr size_t blockSize = 16; const size_t No = epsi.size(); - double energy(0.); + F energy = F(0.); for (size_t kk=0; kk k ? jj : k; for(size_t j(jstart); j < jend; j++){ - const double facjk( j == k ? 0.5 : 1.0); - const double ej(epsi[j]); + const F facjk( j == k ? F(0.5) : F(1.0)); + const F ej(epsi[j]); const size_t istart = ii > j ? ii : j; for(size_t i(istart); i < iend; i++){ - double ei(epsi[i]); - double facij ( i==j ? 
0.5 : 1.0); - double denominator(epsabc - ei - ej - ek); - double U(Zijk_[i + No*j + No*No*k]); - double V(Zijk_[j + No*k + No*No*i]); - double W(Zijk_[k + No*i + No*No*j]); - double A(Tijk_[i + No*j + No*No*k]); - double B(Tijk_[j + No*k + No*No*i]); - double C(Tijk_[k + No*i + No*No*j]); - double value(3.0*( A*U + B*V + C*W) - (A+B+C)*(U+V+W)); - energy += 2.0*value / denominator * facjk * facij; + const F + ei(epsi[i]) + , facij ( i==j ? F(0.5) : F(1.0)) + , denominator(epsabc - ei - ej - ek) + , U(Zijk_[i + No*j + No*No*k]) + , V(Zijk_[j + No*k + No*No*i]) + , W(Zijk_[k + No*i + No*No*j]) + , A(std::conj(Tijk_[i + No*j + No*No*k])) + , B(std::conj(Tijk_[j + No*k + No*No*i])) + , C(std::conj(Tijk_[k + No*i + No*No*j])) + , value + = F(3.0) * ( A * U + + B * V + + C * W + ) + - ( A + B + C ) * ( U + V + W ) + ; + energy += F(2.0) * value / denominator * facjk * facij; } // i } // j } // k } // ii } // jj } // kk - return energy; + return std::real(energy); } + template void singlesContribution ( size_t No , size_t Nv From c7c6db77dce8003513bec54e595d34cb5426ee29 Mon Sep 17 00:00:00 2001 From: Alejandro Gallo Date: Thu, 27 Jan 2022 20:45:38 +0100 Subject: [PATCH 10/22] Templatize doubles --- atrip.org | 136 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 77 insertions(+), 59 deletions(-) diff --git a/atrip.org b/atrip.org index 2d218b7..1fd7d91 100644 --- a/atrip.org +++ b/atrip.org @@ -1608,11 +1608,11 @@ namespace atrip { ( size_t No , size_t Nv , const ABCTuple &abc - , double const* Tph - , double const* VABij - , double const* VACij - , double const* VBCij - , double *Zijk + , F const* Tph + , F const* VABij + , F const* VACij + , F const* VBCij + , F *Zijk ) { const size_t a(abc[0]), b(abc[1]), c(abc[2]); for (size_t k=0; k < No; k++) @@ -1627,31 +1627,32 @@ namespace atrip { } } + template void doublesContribution ( const ABCTuple &abc , size_t const No , size_t const Nv // -- VABCI - , double const* VABph - , double const* VACph - , 
double const* VBCph - , double const* VBAph - , double const* VCAph - , double const* VCBph + , F const* VABph + , F const* VACph + , F const* VBCph + , F const* VBAph + , F const* VCAph + , F const* VCBph // -- VHHHA - , double const* VhhhA - , double const* VhhhB - , double const* VhhhC + , F const* VhhhA + , F const* VhhhB + , F const* VhhhC // -- TA - , double const* TAphh - , double const* TBphh - , double const* TCphh + , F const* TAphh + , F const* TBphh + , F const* TCphh // -- TABIJ - , double const* TABhh - , double const* TAChh - , double const* TBChh + , F const* TABhh + , F const* TAChh + , F const* TBChh // -- TIJK - , double *Tijk + , F *Tijk , atrip::Timings& chrono ) { @@ -1670,40 +1671,47 @@ namespace atrip { Tijk[_IJK_(i, j, k)] += _t_buffer[_IJK_(__II, __JJ, __KK)]; \ } \ t_reorder.stop(); - #define DGEMM_PARTICLES(__A, __B) \ - atrip::dgemm_( "T" \ - , "N" \ - , (int const*)&NoNo \ - , (int const*)&No \ - , (int const*)&Nv \ - , &one \ - , __A \ - , (int const*)&Nv \ - , __B \ - , (int const*)&Nv \ - , &zero \ - , _t_buffer.data() \ - , (int const*)&NoNo \ - ); - #define DGEMM_HOLES(__A, __B, __TRANSB) \ - atrip::dgemm_( "N" \ - , __TRANSB \ - , (int const*)&NoNo \ - , (int const*)&No \ - , (int const*)&No \ - , &m_one \ - , __A \ - , (int const*)&NoNo \ - , __B \ - , (int const*)&No \ - , &zero \ - , _t_buffer.data() \ - , (int const*)&NoNo \ - ); + #define DGEMM_PARTICLES(__A, __B) \ + atrip::xgemm( "T" \ + , "N" \ + , (int const*)&NoNo \ + , (int const*)&No \ + , (int const*)&Nv \ + , &one \ + , __A \ + , (int const*)&Nv \ + , __B \ + , (int const*)&Nv \ + , &zero \ + , _t_buffer.data() \ + , (int const*)&NoNo \ + ); + #define DGEMM_HOLES(__A, __B, __TRANSB) \ + atrip::xgemm( "N" \ + , __TRANSB \ + , (int const*)&NoNo \ + , (int const*)&No \ + , (int const*)&No \ + , &m_one \ + , __A \ + , (int const*)&NoNo \ + , __B \ + , (int const*)&No \ + , &zero \ + , _t_buffer.data() \ + , (int const*)&NoNo \ + ); + #define MAYBE_CONJ(_conj, _buffer) \ 
+ if (traits::isComplex()) { \ + for (size_t __i = 0; __i < NoNoNo; ++__i) \ + _conj[__i] = std::conj(_buffer[__i]); \ + } else { \ + for (size_t __i = 0; __i < NoNoNo; ++__i) \ + _conj[__i] = _buffer[__i]; \ + } - using F = double; const size_t NoNoNo = No*NoNo; - std::vector _t_buffer; + std::vector _t_buffer; _t_buffer.reserve(NoNoNo); F one{1.0}, m_one{-1.0}, zero{0.0}; @@ -1716,38 +1724,48 @@ namespace atrip { chrono["doubles:holes"].start(); { // Holes part ============================================================ + + std::vector _vhhh(NoNoNo); + // VhhhC[i + k*No + L*NoNo] * TABhh[L + j*No]; H1 + MAYBE_CONJ(_vhhh, VhhhC) chrono["doubles:holes:1"].start(); - DGEMM_HOLES(VhhhC, TABhh, "N") + DGEMM_HOLES(_vhhh.data(), TABhh, "N") REORDER(i, k, j) chrono["doubles:holes:1"].stop(); // VhhhC[j + k*No + L*NoNo] * TABhh[i + L*No]; H0 chrono["doubles:holes:2"].start(); - DGEMM_HOLES(VhhhC, TABhh, "T") + DGEMM_HOLES(_vhhh.data(), TABhh, "T") REORDER(j, k, i) chrono["doubles:holes:2"].stop(); + // VhhhB[i + j*No + L*NoNo] * TAChh[L + k*No]; H5 + MAYBE_CONJ(_vhhh, VhhhB) chrono["doubles:holes:3"].start(); - DGEMM_HOLES(VhhhB, TAChh, "N") + DGEMM_HOLES(_vhhh.data(), TAChh, "N") REORDER(i, j, k) chrono["doubles:holes:3"].stop(); // VhhhB[k + j*No + L*NoNo] * TAChh[i + L*No]; H3 chrono["doubles:holes:4"].start(); - DGEMM_HOLES(VhhhB, TAChh, "T") + DGEMM_HOLES(_vhhh.data(), TAChh, "T") REORDER(k, j, i) chrono["doubles:holes:4"].stop(); + // VhhhA[j + i*No + L*NoNo] * TBChh[L + k*No]; H1 + MAYBE_CONJ(_vhhh, VhhhA) chrono["doubles:holes:5"].start(); - DGEMM_HOLES(VhhhA, TBChh, "N") + DGEMM_HOLES(_vhhh.data(), TBChh, "N") REORDER(j, i, k) chrono["doubles:holes:5"].stop(); // VhhhA[k + i*No + L*NoNo] * TBChh[j + L*No]; H4 chrono["doubles:holes:6"].start(); - DGEMM_HOLES(VhhhA, TBChh, "T") + DGEMM_HOLES(_vhhh.data(), TBChh, "T") REORDER(k, i, j) chrono["doubles:holes:6"].stop(); + } chrono["doubles:holes"].stop(); + #undef MAYBE_CONJ chrono["doubles:particles"].start(); { // 
Particle part ========================================================= From 75ecd53e18b52f3279806164510b6ba5c088f530 Mon Sep 17 00:00:00 2001 From: Alejandro Gallo Date: Thu, 27 Jan 2022 20:45:54 +0100 Subject: [PATCH 11/22] Add xgemm --- atrip.org | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 65 insertions(+), 3 deletions(-) diff --git a/atrip.org b/atrip.org index 1fd7d91..2c14694 100644 --- a/atrip.org +++ b/atrip.org @@ -1863,6 +1863,9 @@ is mainly using the =DGEMM= function, which we declare as #+begin_src c++ :tangle (atrip-blas-h) #pragma once namespace atrip { + + using Complex = std::complex; + extern "C" { void dgemm_( const char *transa, @@ -1871,14 +1874,73 @@ namespace atrip { const int *n, const int *k, double *alpha, - const double *A, + const double *a, const int *lda, - const double *B, + const double *b, const int *ldb, double *beta, - double *C, + double *c, const int *ldc ); + + void zgemm_( + const char *transa, + const char *transb, + const int *m, + const int *n, + const int *k, + Complex *alpha, + const Complex *A, + const int *lda, + const Complex *B, + const int *ldb, + Complex *beta, + Complex *C, + const int *ldc + ); + } + + + template + void xgemm(const char *transa, + const char *transb, + const int *m, + const int *n, + const int *k, + F *alpha, + const F *A, + const int *lda, + const F *B, + const int *ldb, + F *beta, + F *C, + const int *ldc) { + dgemm_(transa, transb, + m, n, k, + alpha, A, lda, + B, ldb, beta, + C, ldc); + } + + template <> + void xgemm(const char *transa, + const char *transb, + const int *m, + const int *n, + const int *k, + Complex *alpha, + const Complex *A, + const int *lda, + const Complex *B, + const int *ldb, + Complex *beta, + Complex *C, + const int *ldc) { + zgemm_(transa, transb, + m, n, k, + alpha, A, lda, + B, ldb, beta, + C, ldc); } } #+end_src From f8dd6b3f3179175c847b53c81d8d8702d4abc590 Mon Sep 17 00:00:00 2001 From: Alejandro Gallo Date: Thu, 27 Jan 2022 20:46:23 
+0100 Subject: [PATCH 12/22] Templatize Input --- atrip.org | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/atrip.org b/atrip.org index 2c14694..a3aa110 100644 --- a/atrip.org +++ b/atrip.org @@ -1964,8 +1964,9 @@ namespace atrip { static int np; static void init(); + template struct Input { - CTF::Tensor *ei = nullptr + CTF::Tensor *ei = nullptr , *ea = nullptr , *Tph = nullptr , *Tpphh = nullptr @@ -1976,13 +1977,13 @@ namespace atrip { int maxIterations = 0, iterationMod = -1, percentageMod = -1; bool barrier = false; bool chrono = false; - Input& with_epsilon_i(CTF::Tensor * t) { ei = t; return *this; } - Input& with_epsilon_a(CTF::Tensor * t) { ea = t; return *this; } - Input& with_Tai(CTF::Tensor * t) { Tph = t; return *this; } - Input& with_Tabij(CTF::Tensor * t) { Tpphh = t; return *this; } - Input& with_Vabij(CTF::Tensor * t) { Vpphh = t; return *this; } - Input& with_Vijka(CTF::Tensor * t) { Vhhhp = t; return *this; } - Input& with_Vabci(CTF::Tensor * t) { Vppph = t; return *this; } + Input& with_epsilon_i(CTF::Tensor * t) { ei = t; return *this; } + Input& with_epsilon_a(CTF::Tensor * t) { ea = t; return *this; } + Input& with_Tai(CTF::Tensor * t) { Tph = t; return *this; } + Input& with_Tabij(CTF::Tensor * t) { Tpphh = t; return *this; } + Input& with_Vabij(CTF::Tensor * t) { Vpphh = t; return *this; } + Input& with_Vijka(CTF::Tensor * t) { Vhhhp = t; return *this; } + Input& with_Vabci(CTF::Tensor * t) { Vppph = t; return *this; } Input& with_maxIterations(int i) { maxIterations = i; return *this; } Input& with_iterationMod(int i) { iterationMod = i; return *this; } Input& with_percentageMod(int i) { percentageMod = i; return *this; } @@ -1993,7 +1994,8 @@ namespace atrip { struct Output { double energy; }; - static Output run(Input const& in); + template + static Output run(Input const& in); }; } From 5c177a85bc4d490517c2612b76eb6c685161bef6 Mon Sep 17 00:00:00 2001 From: Alejandro Gallo Date: Thu, 27 Jan 2022 
20:48:38 +0100 Subject: [PATCH 13/22] Templatize main algorithm --- atrip.org | 127 +++++++++++++++++++++++++++++------------------------- 1 file changed, 68 insertions(+), 59 deletions(-) diff --git a/atrip.org b/atrip.org index a3aa110..7f19a7a 100644 --- a/atrip.org +++ b/atrip.org @@ -2028,7 +2028,8 @@ void Atrip::init() { MPI_Comm_size(MPI_COMM_WORLD, &Atrip::np); } -Atrip::Output Atrip::run(Atrip::Input const& in) { +template +Atrip::Output Atrip::run(Atrip::Input const& in) { const int np = Atrip::np; const int rank = Atrip::rank; @@ -2043,14 +2044,14 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { LOG(0,"Atrip") << "Nv: " << Nv << "\n"; // allocate the three scratches, see piecuch - std::vector Tijk(No*No*No) // doubles only (see piecuch) - , Zijk(No*No*No) // singles + doubles (see piecuch) - // we need local copies of the following tensors on every - // rank - , epsi(No) - , epsa(Nv) - , Tai(No * Nv) - ; + std::vector Tijk(No*No*No) // doubles only (see piecuch) + , Zijk(No*No*No) // singles + doubles (see piecuch) + // we need local copies of the following tensors on every + // rank + , epsi(No) + , epsa(Nv) + , Tai(No * Nv) + ; in.ei->read_all(epsi.data()); in.ea->read_all(epsa.data()); @@ -2079,20 +2080,20 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { chrono["nv-slices"].start(); // BUILD SLICES PARAMETRIZED BY NV ==================================={{{1 LOG(0,"Atrip") << "BUILD NV-SLICES\n"; - TAPHH taphh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); - HHHA hhha(*in.Vhhhp, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); + TAPHH taphh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); + HHHA hhha(*in.Vhhhp, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); chrono["nv-slices"].stop(); chrono["nv-nv-slices"].start(); // BUILD SLICES PARAMETRIZED BY NV x NV =============================={{{1 LOG(0,"Atrip") << "BUILD NV x NV-SLICES\n"; - ABPH abph(*in.Vppph, (size_t)No, (size_t)Nv, 
(size_t)np, child_comm, universe); - ABHH abhh(*in.Vpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); - TABHH tabhh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); + ABPH abph(*in.Vppph, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); + ABHH abhh(*in.Vpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); + TABHH tabhh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); chrono["nv-nv-slices"].stop(); // all tensors - std::vector< SliceUnion* > unions = {&taphh, &hhha, &abph, &abhh, &tabhh}; + std::vector< SliceUnion* > unions = {&taphh, &hhha, &abph, &abhh, &tabhh}; //CONSTRUCT TUPLE LIST ==============================================={{{1 LOG(0,"Atrip") << "BUILD TUPLE LIST\n"; @@ -2126,18 +2127,20 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { = [&tuplesList](size_t const i) { return i >= tuplesList.size(); }; + using Database = typename Slice::Database; + using LocalDatabase = typename Slice::LocalDatabase; auto communicateDatabase = [ &unions , np , &chrono - ] (ABCTuple const& abc, MPI_Comm const& c) -> Slice::Database { + ] (ABCTuple const& abc, MPI_Comm const& c) -> typename Slice::Database { chrono["db:comm:type:do"].start(); - auto MPI_LDB_ELEMENT = Slice::mpi::localDatabaseElement(); + auto MPI_LDB_ELEMENT = Slice::mpi::localDatabaseElement(); chrono["db:comm:type:do"].stop(); chrono["db:comm:ldb"].start(); - Slice::LocalDatabase ldb; + LocalDatabase ldb; for (auto const& tensor: unions) { auto const& tensorDb = tensor->buildLocalDatabase(abc); @@ -2145,7 +2148,8 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { } chrono["db:comm:ldb"].stop(); - Slice::Database db(np * ldb.size(), ldb[0]); + typename + Slice::Database db(np * ldb.size(), ldb[0]); chrono["oneshot-db:comm:allgather"].start(); chrono["db:comm:allgather"].start(); @@ -2167,7 +2171,7 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { }; auto doIOPhase - = [&unions, &rank, &np, &universe, &chrono] 
(Slice::Database const& db) { + = [&unions, &rank, &np, &universe, &chrono] (typename Slice::Database const& db) { const size_t localDBLength = db.size() / np; @@ -2217,7 +2221,7 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { ; for (auto it = begin; it != end; ++it) { sendTag++; - Slice::LocalDatabaseElement const& el = *it; + typename Slice::LocalDatabaseElement const& el = *it; if (el.info.from.rank != rank) continue; @@ -2266,7 +2270,7 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { // START MAIN LOOP ======================================================{{{1 - Slice::Database db; + typename Slice::Database db; for ( size_t i = abcIndex.first, iteration = 1 ; i < abcIndex.second @@ -2373,30 +2377,31 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { ))) chrono["oneshot-doubles"].start(); chrono["doubles"].start(); - doublesContribution( abc, (size_t)No, (size_t)Nv - // -- VABCI - , abph.unwrapSlice(Slice::AB, abc) - , abph.unwrapSlice(Slice::AC, abc) - , abph.unwrapSlice(Slice::BC, abc) - , abph.unwrapSlice(Slice::BA, abc) - , abph.unwrapSlice(Slice::CA, abc) - , abph.unwrapSlice(Slice::CB, abc) - // -- VHHHA - , hhha.unwrapSlice(Slice::A, abc) - , hhha.unwrapSlice(Slice::B, abc) - , hhha.unwrapSlice(Slice::C, abc) - // -- TA - , taphh.unwrapSlice(Slice::A, abc) - , taphh.unwrapSlice(Slice::B, abc) - , taphh.unwrapSlice(Slice::C, abc) - // -- TABIJ - , tabhh.unwrapSlice(Slice::AB, abc) - , tabhh.unwrapSlice(Slice::AC, abc) - , tabhh.unwrapSlice(Slice::BC, abc) - // -- TIJK - , Tijk.data() - , chrono - ); + LOGREMOVE << "doubles " << iteration << "\n"; + doublesContribution( abc, (size_t)No, (size_t)Nv + // -- VABCI + , abph.unwrapSlice(Slice::AB, abc) + , abph.unwrapSlice(Slice::AC, abc) + , abph.unwrapSlice(Slice::BC, abc) + , abph.unwrapSlice(Slice::BA, abc) + , abph.unwrapSlice(Slice::CA, abc) + , abph.unwrapSlice(Slice::CB, abc) + // -- VHHHA + , hhha.unwrapSlice(Slice::A, abc) + , hhha.unwrapSlice(Slice::B, abc) + , 
hhha.unwrapSlice(Slice::C, abc) + // -- TA + , taphh.unwrapSlice(Slice::A, abc) + , taphh.unwrapSlice(Slice::B, abc) + , taphh.unwrapSlice(Slice::C, abc) + // -- TABIJ + , tabhh.unwrapSlice(Slice::AB, abc) + , tabhh.unwrapSlice(Slice::AC, abc) + , tabhh.unwrapSlice(Slice::BC, abc) + // -- TIJK + , Tijk.data() + , chrono + ); WITH_RANK << iteration << "-th doubles done\n"; chrono["doubles"].stop(); chrono["oneshot-doubles"].stop(); @@ -2414,12 +2419,12 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { for (size_t I(0); I < Zijk.size(); I++) Zijk[I] = Tijk[I]; chrono["reorder"].stop(); chrono["singles"].start(); - singlesContribution( No, Nv, abc - , Tai.data() - , abhh.unwrapSlice(Slice::AB, abc) - , abhh.unwrapSlice(Slice::AC, abc) - , abhh.unwrapSlice(Slice::BC, abc) - , Zijk.data()); + singlesContribution( No, Nv, abc + , Tai.data() + , abhh.unwrapSlice(Slice::AB, abc) + , abhh.unwrapSlice(Slice::AC, abc) + , abhh.unwrapSlice(Slice::BC, abc) + , Zijk.data()); chrono["singles"].stop(); } @@ -2431,13 +2436,13 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { int distinct(0); if (abc[0] == abc[1]) distinct++; if (abc[1] == abc[2]) distinct--; - const double epsabc(epsa[abc[0]] + epsa[abc[1]] + epsa[abc[2]]); + const F epsabc(epsa[abc[0]] + epsa[abc[1]] + epsa[abc[2]]); chrono["energy"].start(); if ( distinct == 0) - tupleEnergy = getEnergyDistinct(epsabc, epsi, Tijk, Zijk); + tupleEnergy = getEnergyDistinct(epsabc, epsi, Tijk, Zijk); else - tupleEnergy = getEnergySame(epsabc, epsi, Tijk, Zijk); + tupleEnergy = getEnergySame(epsabc, epsi, Tijk, Zijk); chrono["energy"].stop(); #if defined(HAVE_OCD) || defined(ATRIP_PRINT_TUPLES) @@ -2478,8 +2483,8 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { << " :abc " << pretty_print(abc) << " :abcN " << pretty_print(*abcNext) << "\n"; - for (auto const& slice: u->slices) - WITH_RANK << "__gc__:guts:" << slice.info << "\n"; + // for (auto const& slice: u->slices) + // WITH_RANK << "__gc__:guts:" << slice.info << 
"\n"; u->clearUnusedSlicesForNext(*abcNext); WITH_RANK << "__gc__: checking validity\n"; @@ -2487,13 +2492,13 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { #ifdef HAVE_OCD // check for validity of the slices for (auto type: u->sliceTypes) { - auto tuple = Slice::subtupleBySlice(abc, type); + auto tuple = Slice::subtupleBySlice(abc, type); for (auto& slice: u->slices) { if ( slice.info.type == type && slice.info.tuple == tuple && slice.isDirectlyFetchable() ) { - if (slice.info.state == Slice::Dispatched) + if (slice.info.state == Slice::Dispatched) throw std::domain_error( "This slice should not be undispatched! " + pretty_print(slice.info)); } @@ -2560,6 +2565,10 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { return { - globalEnergy }; } +// instantiate +template Atrip::Output Atrip::run(Atrip::Input const& in); +template Atrip::Output Atrip::run(Atrip::Input const& in); + #+end_src From e161e4c0d6a2f34b732cbc3c144caf47d8f9ea4b Mon Sep 17 00:00:00 2001 From: Alejandro Gallo Date: Thu, 27 Jan 2022 20:49:07 +0100 Subject: [PATCH 14/22] Update Debug --- atrip.org | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/atrip.org b/atrip.org index 7f19a7a..561a375 100644 --- a/atrip.org +++ b/atrip.org @@ -2580,7 +2580,9 @@ template Atrip::Output Atrip::run(Atrip::Input const& in); #include #define ATRIP_BENCHMARK //#define ATRIP_DONT_SLICE -#define ATRIP_DEBUG 1 +#ifndef ATRIP_DEBUG +# define ATRIP_DEBUG 1 +#endif //#define ATRIP_WORKLOAD_DUMP #define ATRIP_USE_DGEMM //#define ATRIP_PRINT_TUPLES From 7f455d54fdb65122425b9dd85ff3867d7222575c Mon Sep 17 00:00:00 2001 From: Alejandro Gallo Date: Thu, 27 Jan 2022 20:59:42 +0100 Subject: [PATCH 15/22] Clean up couple of things --- atrip.org | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/atrip.org b/atrip.org index 561a375..8e4ec91 100644 --- a/atrip.org +++ b/atrip.org @@ -2133,7 +2133,7 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { = [ &unions , np , 
&chrono - ] (ABCTuple const& abc, MPI_Comm const& c) -> typename Slice::Database { + ] (ABCTuple const& abc, MPI_Comm const& c) -> Database { chrono["db:comm:type:do"].start(); auto MPI_LDB_ELEMENT = Slice::mpi::localDatabaseElement(); @@ -2148,8 +2148,7 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { } chrono["db:comm:ldb"].stop(); - typename - Slice::Database db(np * ldb.size(), ldb[0]); + Database db(np * ldb.size(), ldb[0]); chrono["oneshot-db:comm:allgather"].start(); chrono["db:comm:allgather"].start(); @@ -2171,7 +2170,7 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { }; auto doIOPhase - = [&unions, &rank, &np, &universe, &chrono] (typename Slice::Database const& db) { + = [&unions, &rank, &np, &universe, &chrono] (Database const& db) { const size_t localDBLength = db.size() / np; @@ -2270,14 +2269,13 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { // START MAIN LOOP ======================================================{{{1 - typename Slice::Database db; - for ( size_t i = abcIndex.first, iteration = 1 ; i < abcIndex.second ; i++, iteration++ ) { chrono["iterations"].start(); + // check overhead from chrono over all iterations chrono["start:stop"].start(); chrono["start:stop"].stop(); @@ -2356,7 +2354,7 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { WITH_RANK << "__comm__:" << iteration << "th communicating database\n"; chrono["db:comm"].start(); //const auto db = communicateDatabase(*abcNext, universe); - db = communicateDatabase(*abcNext, universe); + Database db = communicateDatabase(*abcNext, universe); chrono["db:comm"].stop(); chrono["db:io"].start(); doIOPhase(db); @@ -2377,7 +2375,6 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { ))) chrono["oneshot-doubles"].start(); chrono["doubles"].start(); - LOGREMOVE << "doubles " << iteration << "\n"; doublesContribution( abc, (size_t)No, (size_t)Nv // -- VABCI , abph.unwrapSlice(Slice::AB, abc) From c2b1c78c67d756c29b6a0f9b884248491a38f4d7 Mon Sep 17 00:00:00 2001 From: 
Alejandro Gallo Date: Mon, 7 Feb 2022 22:29:47 +0100 Subject: [PATCH 16/22] Tanlge source files for complex --- include/atrip.hpp | 2 +- include/atrip/Atrip.hpp | 24 ++-- include/atrip/Blas.hpp | 70 +++++++++- include/atrip/Debug.hpp | 12 +- include/atrip/Equations.hpp | 257 ++++++++++++++++++++--------------- include/atrip/RankMap.hpp | 12 +- include/atrip/Slice.hpp | 64 +++++---- include/atrip/SliceUnion.hpp | 110 +++++++-------- include/atrip/Tuples.hpp | 2 +- include/atrip/Unions.hpp | 204 +++++++++++++-------------- include/atrip/Utils.hpp | 2 +- src/atrip/Atrip.cxx | 129 +++++++++--------- 12 files changed, 511 insertions(+), 377 deletions(-) diff --git a/include/atrip.hpp b/include/atrip.hpp index b3ef823..8ecf6ce 100644 --- a/include/atrip.hpp +++ b/include/atrip.hpp @@ -1,4 +1,4 @@ -// [[file:../atrip.org::*Include header][Include header:1]] +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Include%20header][Include header:1]] #pragma once #include diff --git a/include/atrip/Atrip.hpp b/include/atrip/Atrip.hpp index a8bcd78..6f3859c 100644 --- a/include/atrip/Atrip.hpp +++ b/include/atrip/Atrip.hpp @@ -1,4 +1,4 @@ -// [[file:../../atrip.org::*Atrip][Atrip:1]] +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Header][Header:1]] #pragma once #include #include @@ -15,8 +15,9 @@ namespace atrip { static int np; static void init(); + template struct Input { - CTF::Tensor *ei = nullptr + CTF::Tensor *ei = nullptr , *ea = nullptr , *Tph = nullptr , *Tpphh = nullptr @@ -27,13 +28,13 @@ namespace atrip { int maxIterations = 0, iterationMod = -1, percentageMod = -1; bool barrier = false; bool chrono = false; - Input& with_epsilon_i(CTF::Tensor * t) { ei = t; return *this; } - Input& with_epsilon_a(CTF::Tensor * t) { ea = t; return *this; } - Input& with_Tai(CTF::Tensor * t) { Tph = t; return *this; } - Input& with_Tabij(CTF::Tensor * t) { Tpphh = t; return *this; } - Input& with_Vabij(CTF::Tensor * t) { Vpphh = t; return *this; } - Input& with_Vijka(CTF::Tensor * t) { 
Vhhhp = t; return *this; } - Input& with_Vabci(CTF::Tensor * t) { Vppph = t; return *this; } + Input& with_epsilon_i(CTF::Tensor * t) { ei = t; return *this; } + Input& with_epsilon_a(CTF::Tensor * t) { ea = t; return *this; } + Input& with_Tai(CTF::Tensor * t) { Tph = t; return *this; } + Input& with_Tabij(CTF::Tensor * t) { Tpphh = t; return *this; } + Input& with_Vabij(CTF::Tensor * t) { Vpphh = t; return *this; } + Input& with_Vijka(CTF::Tensor * t) { Vhhhp = t; return *this; } + Input& with_Vabci(CTF::Tensor * t) { Vppph = t; return *this; } Input& with_maxIterations(int i) { maxIterations = i; return *this; } Input& with_iterationMod(int i) { iterationMod = i; return *this; } Input& with_percentageMod(int i) { percentageMod = i; return *this; } @@ -44,8 +45,9 @@ namespace atrip { struct Output { double energy; }; - static Output run(Input const& in); + template + static Output run(Input const& in); }; } -// Atrip:1 ends here +// Header:1 ends here diff --git a/include/atrip/Blas.hpp b/include/atrip/Blas.hpp index fa63028..df81d74 100644 --- a/include/atrip/Blas.hpp +++ b/include/atrip/Blas.hpp @@ -1,6 +1,9 @@ -// [[file:../../atrip.org::*Blas][Blas:1]] +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Blas][Blas:1]] #pragma once namespace atrip { + + using Complex = std::complex; + extern "C" { void dgemm_( const char *transa, @@ -9,14 +12,73 @@ namespace atrip { const int *n, const int *k, double *alpha, - const double *A, + const double *a, const int *lda, - const double *B, + const double *b, const int *ldb, double *beta, - double *C, + double *c, + const int *ldc + ); + + void zgemm_( + const char *transa, + const char *transb, + const int *m, + const int *n, + const int *k, + Complex *alpha, + const Complex *A, + const int *lda, + const Complex *B, + const int *ldb, + Complex *beta, + Complex *C, const int *ldc ); } + + + template + void xgemm(const char *transa, + const char *transb, + const int *m, + const int *n, + const int *k, + F *alpha, + const F 
*A, + const int *lda, + const F *B, + const int *ldb, + F *beta, + F *C, + const int *ldc) { + dgemm_(transa, transb, + m, n, k, + alpha, A, lda, + B, ldb, beta, + C, ldc); + } + + template <> + void xgemm(const char *transa, + const char *transb, + const int *m, + const int *n, + const int *k, + Complex *alpha, + const Complex *A, + const int *lda, + const Complex *B, + const int *ldb, + Complex *beta, + Complex *C, + const int *ldc) { + zgemm_(transa, transb, + m, n, k, + alpha, A, lda, + B, ldb, beta, + C, ldc); + } } // Blas:1 ends here diff --git a/include/atrip/Debug.hpp b/include/atrip/Debug.hpp index 6bdfde2..4347824 100644 --- a/include/atrip/Debug.hpp +++ b/include/atrip/Debug.hpp @@ -1,9 +1,11 @@ -// [[file:../../atrip.org::*Macros][Macros:1]] +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Macros][Macros:1]] #pragma once #include #define ATRIP_BENCHMARK //#define ATRIP_DONT_SLICE -#define ATRIP_DEBUG 1 +#ifndef ATRIP_DEBUG +# define ATRIP_DEBUG 1 +#endif //#define ATRIP_WORKLOAD_DUMP #define ATRIP_USE_DGEMM //#define ATRIP_PRINT_TUPLES @@ -60,20 +62,20 @@ #endif // Macros:1 ends here -// [[file:../../atrip.org::*Macros][Macros:2]] +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Macros][Macros:2]] #ifndef LOG #define LOG(level, name) if (Atrip::rank == 0) std::cout << name << ": " #endif // Macros:2 ends here -// [[file:../../atrip.org::*Macros][Macros:3]] +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Macros][Macros:3]] #ifdef ATRIP_NO_OUTPUT # undef LOG # define LOG(level, name) if (false) std::cout << name << ": " #endif // Macros:3 ends here -// [[file:../../atrip.org::IterationDescriptor][IterationDescriptor]] +// [[file:~/cc4s/src/atrip/complex/atrip.org::IterationDescriptor][IterationDescriptor]] namespace atrip { struct IterationDescription; diff --git a/include/atrip/Equations.hpp b/include/atrip/Equations.hpp index b8496f6..2b90736 100644 --- a/include/atrip/Equations.hpp +++ b/include/atrip/Equations.hpp @@ -1,4 +1,4 @@ -// 
[[file:../../atrip.org::*Equations][Equations:1]] +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Equations][Equations:1]] #pragma once #include @@ -6,14 +6,15 @@ namespace atrip { + template double getEnergyDistinct - ( const double epsabc - , std::vector const& epsi - , std::vector const& Tijk_ - , std::vector const& Zijk_ + ( const F epsabc + , std::vector const& epsi + , std::vector const& Tijk_ + , std::vector const& Zijk_ ) { constexpr size_t blockSize=16; - double energy(0.); + F energy(0.); const size_t No = epsi.size(); for (size_t kk=0; kk k ? jj : k; for (size_t j(jstart); j < jend; j++){ - const double ej(epsi[j]); - double facjk( j == k ? 0.5 : 1.0); + F const ej(epsi[j]); + F const facjk = j == k ? F(0.5) : F(1.0); size_t istart = ii > j ? ii : j; for (size_t i(istart); i < iend; i++){ - const double ei(epsi[i]); - double facij ( i==j ? 0.5 : 1.0); - double denominator(epsabc - ei - ej - ek); - double U(Zijk_[i + No*j + No*No*k]); - double V(Zijk_[i + No*k + No*No*j]); - double W(Zijk_[j + No*i + No*No*k]); - double X(Zijk_[j + No*k + No*No*i]); - double Y(Zijk_[k + No*i + No*No*j]); - double Z(Zijk_[k + No*j + No*No*i]); - - double A(Tijk_[i + No*j + No*No*k]); - double B(Tijk_[i + No*k + No*No*j]); - double C(Tijk_[j + No*i + No*No*k]); - double D(Tijk_[j + No*k + No*No*i]); - double E(Tijk_[k + No*i + No*No*j]); - double F(Tijk_[k + No*j + No*No*i]); - double value(3.0*(A*U+B*V+C*W+D*X+E*Y+F*Z) - +((U+X+Y)-2.0*(V+W+Z))*(A+D+E) - +((V+W+Z)-2.0*(U+X+Y))*(B+C+F)); - energy += 2.0*value / denominator * facjk * facij; + const F + ei(epsi[i]) + , facij = i == j ? 
F(0.5) : F(1.0) + , denominator(epsabc - ei - ej - ek) + , U(Zijk_[i + No*j + No*No*k]) + , V(Zijk_[i + No*k + No*No*j]) + , W(Zijk_[j + No*i + No*No*k]) + , X(Zijk_[j + No*k + No*No*i]) + , Y(Zijk_[k + No*i + No*No*j]) + , Z(Zijk_[k + No*j + No*No*i]) + , A(std::conj(Tijk_[i + No*j + No*No*k])) + , B(std::conj(Tijk_[i + No*k + No*No*j])) + , C(std::conj(Tijk_[j + No*i + No*No*k])) + , D(std::conj(Tijk_[j + No*k + No*No*i])) + , E(std::conj(Tijk_[k + No*i + No*No*j])) + , F(std::conj(Tijk_[k + No*j + No*No*i])) + , value + = 3.0 * ( A * U + + B * V + + C * W + + D * X + + E * Y + + F * Z ) + + ( ( U + X + Y ) + - 2.0 * ( V + W + Z ) + ) * ( A + D + E ) + + ( ( V + W + Z ) + - 2.0 * ( U + X + Y ) + ) * ( B + C + F ) + ; + energy += 2.0 * value / denominator * facjk * facij; } // i } // j } // k } // ii } // jj } // kk - return energy; + return std::real(energy); } + template double getEnergySame - ( const double epsabc - , std::vector const& epsi - , std::vector const& Tijk_ - , std::vector const& Zijk_ + ( const F epsabc + , std::vector const& epsi + , std::vector const& Tijk_ + , std::vector const& Zijk_ ) { constexpr size_t blockSize = 16; const size_t No = epsi.size(); - double energy(0.); + F energy = F(0.); for (size_t kk=0; kk k ? jj : k; for(size_t j(jstart); j < jend; j++){ - const double facjk( j == k ? 0.5 : 1.0); - const double ej(epsi[j]); + const F facjk( j == k ? F(0.5) : F(1.0)); + const F ej(epsi[j]); const size_t istart = ii > j ? ii : j; for(size_t i(istart); i < iend; i++){ - double ei(epsi[i]); - double facij ( i==j ? 
0.5 : 1.0); - double denominator(epsabc - ei - ej - ek); - double U(Zijk_[i + No*j + No*No*k]); - double V(Zijk_[j + No*k + No*No*i]); - double W(Zijk_[k + No*i + No*No*j]); - double A(Tijk_[i + No*j + No*No*k]); - double B(Tijk_[j + No*k + No*No*i]); - double C(Tijk_[k + No*i + No*No*j]); - double value(3.0*( A*U + B*V + C*W) - (A+B+C)*(U+V+W)); - energy += 2.0*value / denominator * facjk * facij; + const F + ei(epsi[i]) + , facij ( i==j ? F(0.5) : F(1.0)) + , denominator(epsabc - ei - ej - ek) + , U(Zijk_[i + No*j + No*No*k]) + , V(Zijk_[j + No*k + No*No*i]) + , W(Zijk_[k + No*i + No*No*j]) + , A(std::conj(Tijk_[i + No*j + No*No*k])) + , B(std::conj(Tijk_[j + No*k + No*No*i])) + , C(std::conj(Tijk_[k + No*i + No*No*j])) + , value + = F(3.0) * ( A * U + + B * V + + C * W + ) + - ( A + B + C ) * ( U + V + W ) + ; + energy += F(2.0) * value / denominator * facjk * facij; } // i } // j } // k } // ii } // jj } // kk - return energy; + return std::real(energy); } + template void singlesContribution ( size_t No , size_t Nv , const ABCTuple &abc - , double const* Tph - , double const* VABij - , double const* VACij - , double const* VBCij - , double *Zijk + , F const* Tph + , F const* VABij + , F const* VACij + , F const* VBCij + , F *Zijk ) { const size_t a(abc[0]), b(abc[1]), c(abc[2]); for (size_t k=0; k < No; k++) @@ -125,31 +146,32 @@ namespace atrip { } } + template void doublesContribution ( const ABCTuple &abc , size_t const No , size_t const Nv // -- VABCI - , double const* VABph - , double const* VACph - , double const* VBCph - , double const* VBAph - , double const* VCAph - , double const* VCBph + , F const* VABph + , F const* VACph + , F const* VBCph + , F const* VBAph + , F const* VCAph + , F const* VCBph // -- VHHHA - , double const* VhhhA - , double const* VhhhB - , double const* VhhhC + , F const* VhhhA + , F const* VhhhB + , F const* VhhhC // -- TA - , double const* TAphh - , double const* TBphh - , double const* TCphh + , F const* TAphh + , F const* 
TBphh + , F const* TCphh // -- TABIJ - , double const* TABhh - , double const* TAChh - , double const* TBChh + , F const* TABhh + , F const* TAChh + , F const* TBChh // -- TIJK - , double *Tijk + , F *Tijk , atrip::Timings& chrono ) { @@ -168,40 +190,47 @@ namespace atrip { Tijk[_IJK_(i, j, k)] += _t_buffer[_IJK_(__II, __JJ, __KK)]; \ } \ t_reorder.stop(); - #define DGEMM_PARTICLES(__A, __B) \ - atrip::dgemm_( "T" \ - , "N" \ - , (int const*)&NoNo \ - , (int const*)&No \ - , (int const*)&Nv \ - , &one \ - , __A \ - , (int const*)&Nv \ - , __B \ - , (int const*)&Nv \ - , &zero \ - , _t_buffer.data() \ - , (int const*)&NoNo \ - ); - #define DGEMM_HOLES(__A, __B, __TRANSB) \ - atrip::dgemm_( "N" \ - , __TRANSB \ - , (int const*)&NoNo \ - , (int const*)&No \ - , (int const*)&No \ - , &m_one \ - , __A \ - , (int const*)&NoNo \ - , __B \ - , (int const*)&No \ - , &zero \ - , _t_buffer.data() \ - , (int const*)&NoNo \ - ); + #define DGEMM_PARTICLES(__A, __B) \ + atrip::xgemm( "T" \ + , "N" \ + , (int const*)&NoNo \ + , (int const*)&No \ + , (int const*)&Nv \ + , &one \ + , __A \ + , (int const*)&Nv \ + , __B \ + , (int const*)&Nv \ + , &zero \ + , _t_buffer.data() \ + , (int const*)&NoNo \ + ); + #define DGEMM_HOLES(__A, __B, __TRANSB) \ + atrip::xgemm( "N" \ + , __TRANSB \ + , (int const*)&NoNo \ + , (int const*)&No \ + , (int const*)&No \ + , &m_one \ + , __A \ + , (int const*)&NoNo \ + , __B \ + , (int const*)&No \ + , &zero \ + , _t_buffer.data() \ + , (int const*)&NoNo \ + ); + #define MAYBE_CONJ(_conj, _buffer) \ + if (traits::isComplex()) { \ + for (size_t __i = 0; __i < NoNoNo; ++__i) \ + _conj[__i] = std::conj(_buffer[__i]); \ + } else { \ + for (size_t __i = 0; __i < NoNoNo; ++__i) \ + _conj[__i] = _buffer[__i]; \ + } - using F = double; const size_t NoNoNo = No*NoNo; - std::vector _t_buffer; + std::vector _t_buffer; _t_buffer.reserve(NoNoNo); F one{1.0}, m_one{-1.0}, zero{0.0}; @@ -214,38 +243,48 @@ namespace atrip { chrono["doubles:holes"].start(); { // Holes 
part ============================================================ + + std::vector _vhhh(NoNoNo); + // VhhhC[i + k*No + L*NoNo] * TABhh[L + j*No]; H1 + MAYBE_CONJ(_vhhh, VhhhC) chrono["doubles:holes:1"].start(); - DGEMM_HOLES(VhhhC, TABhh, "N") + DGEMM_HOLES(_vhhh.data(), TABhh, "N") REORDER(i, k, j) chrono["doubles:holes:1"].stop(); // VhhhC[j + k*No + L*NoNo] * TABhh[i + L*No]; H0 chrono["doubles:holes:2"].start(); - DGEMM_HOLES(VhhhC, TABhh, "T") + DGEMM_HOLES(_vhhh.data(), TABhh, "T") REORDER(j, k, i) chrono["doubles:holes:2"].stop(); + // VhhhB[i + j*No + L*NoNo] * TAChh[L + k*No]; H5 + MAYBE_CONJ(_vhhh, VhhhB) chrono["doubles:holes:3"].start(); - DGEMM_HOLES(VhhhB, TAChh, "N") + DGEMM_HOLES(_vhhh.data(), TAChh, "N") REORDER(i, j, k) chrono["doubles:holes:3"].stop(); // VhhhB[k + j*No + L*NoNo] * TAChh[i + L*No]; H3 chrono["doubles:holes:4"].start(); - DGEMM_HOLES(VhhhB, TAChh, "T") + DGEMM_HOLES(_vhhh.data(), TAChh, "T") REORDER(k, j, i) chrono["doubles:holes:4"].stop(); + // VhhhA[j + i*No + L*NoNo] * TBChh[L + k*No]; H1 + MAYBE_CONJ(_vhhh, VhhhA) chrono["doubles:holes:5"].start(); - DGEMM_HOLES(VhhhA, TBChh, "N") + DGEMM_HOLES(_vhhh.data(), TBChh, "N") REORDER(j, i, k) chrono["doubles:holes:5"].stop(); // VhhhA[k + i*No + L*NoNo] * TBChh[j + L*No]; H4 chrono["doubles:holes:6"].start(); - DGEMM_HOLES(VhhhA, TBChh, "T") + DGEMM_HOLES(_vhhh.data(), TBChh, "T") REORDER(k, i, j) chrono["doubles:holes:6"].stop(); + } chrono["doubles:holes"].stop(); + #undef MAYBE_CONJ chrono["doubles:particles"].start(); { // Particle part ========================================================= diff --git a/include/atrip/RankMap.hpp b/include/atrip/RankMap.hpp index 82bb674..8564f9e 100644 --- a/include/atrip/RankMap.hpp +++ b/include/atrip/RankMap.hpp @@ -1,4 +1,4 @@ -// [[file:../../atrip.org::*The rank mapping][The rank mapping:1]] +// [[file:~/cc4s/src/atrip/complex/atrip.org::*The%20rank%20mapping][The rank mapping:1]] #pragma once #include @@ -7,6 +7,8 @@ #include 
namespace atrip { + + template struct RankMap { std::vector const lengths; @@ -19,7 +21,7 @@ namespace atrip { 1UL, std::multiplies())) { assert(lengths.size() <= 2); } - size_t find(Slice::Location const& p) const noexcept { + size_t find(typename Slice::Location const& p) const noexcept { return p.source * np + p.rank; } @@ -39,10 +41,10 @@ namespace atrip { return source == nSources() && isPaddingRank(rank); } - Slice::Location - find(ABCTuple const& abc, Slice::Type sliceType) const noexcept { + typename Slice::Location + find(ABCTuple const& abc, typename Slice::Type sliceType) const noexcept { // tuple = {11, 8} when abc = {11, 8, 9} and sliceType = AB - const auto tuple = Slice::subtupleBySlice(abc, sliceType); + const auto tuple = Slice::subtupleBySlice(abc, sliceType); const size_t index = tuple[0] diff --git a/include/atrip/Slice.hpp b/include/atrip/Slice.hpp index a7a5363..877d72a 100644 --- a/include/atrip/Slice.hpp +++ b/include/atrip/Slice.hpp @@ -1,4 +1,4 @@ -// [[file:../../atrip.org::*The slice][The slice:1]] +// [[file:~/cc4s/src/atrip/complex/atrip.org::*The%20slice][The slice:1]] #pragma once #include #include @@ -7,16 +7,26 @@ #include #include +#include namespace atrip { +namespace traits { + template bool isComplex() { return false; }; + template <> bool isComplex() { return true; }; +namespace mpi { + template MPI_Datatype datatypeOf(void); + template <> MPI_Datatype datatypeOf() { return MPI_DOUBLE; } + template <> MPI_Datatype datatypeOf() { return MPI_DOUBLE_COMPLEX; } +} +} + +template struct Slice { - - using F = double; // The slice:1 ends here -// [[file:../../atrip.org::*The slice][The slice:2]] +// [[file:~/cc4s/src/atrip/complex/atrip.org::*The%20slice][The slice:2]] // ASSOCIATED TYPES %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% struct Location { size_t rank; size_t source; }; @@ -93,8 +103,8 @@ struct Slice { // DATABASE ==========================================================={{{1 struct LocalDatabaseElement { - 
Slice::Name name; - Slice::Info info; + Slice::Name name; + Slice::Info info; }; using LocalDatabase = std::vector; using Database = LocalDatabase; @@ -117,7 +127,7 @@ struct Slice { constexpr int n = 2; // create a sliceLocation to measure in the current architecture // the packing of the struct - Slice::Location measure; + Slice::Location measure; MPI_Datatype dt; const std::vector lengths(n, 1); const MPI_Datatype types[n] = {usizeDt(), usizeDt()}; @@ -141,7 +151,7 @@ struct Slice { static MPI_Datatype sliceInfo () { constexpr int n = 5; MPI_Datatype dt; - Slice::Info measure; + Slice::Info measure; const std::vector lengths(n, 1); const MPI_Datatype types[n] = { vector(2, usizeDt()) @@ -213,10 +223,10 @@ struct Slice { * It is important here to return a reference to a Slice * not to accidentally copy the associated buffer of the slice. */ - static Slice& findOneByType(std::vector &slices, Slice::Type type) { + static Slice& findOneByType(std::vector> &slices, Slice::Type type) { const auto sliceIt = std::find_if(slices.begin(), slices.end(), - [&type](Slice const& s) { + [&type](Slice const& s) { return type == s.info.type; }); WITH_CRAZY_DEBUG @@ -231,11 +241,11 @@ struct Slice { * Check if an info has * */ - static std::vector hasRecycledReferencingToIt - ( std::vector &slices + static std::vector*> hasRecycledReferencingToIt + ( std::vector> &slices , Info const& info ) { - std::vector result; + std::vector*> result; for (auto& s: slices) if ( s.info.recycling == info.type @@ -246,11 +256,11 @@ struct Slice { return result; } - static Slice& - findRecycledSource (std::vector &slices, Slice::Info info) { + static Slice& + findRecycledSource (std::vector> &slices, Slice::Info info) { const auto sliceIt = std::find_if(slices.begin(), slices.end(), - [&info](Slice const& s) { + [&info](Slice const& s) { return info.recycling == s.info.type && info.tuple == s.info.tuple && State::Recycled != s.info.state @@ -270,15 +280,15 @@ struct Slice { return *sliceIt; } - 
static Slice& findByTypeAbc - ( std::vector &slices - , Slice::Type type + static Slice& findByTypeAbc + ( std::vector> &slices + , Slice::Type type , ABCTuple const& abc ) { - const auto tuple = Slice::subtupleBySlice(abc, type); + const auto tuple = Slice::subtupleBySlice(abc, type); const auto sliceIt = std::find_if(slices.begin(), slices.end(), - [&type, &tuple](Slice const& s) { + [&type, &tuple](Slice const& s) { return type == s.info.type && tuple == s.info.tuple ; @@ -298,11 +308,11 @@ struct Slice { return *sliceIt; } - static Slice& findByInfo(std::vector &slices, - Slice::Info const& info) { + static Slice& findByInfo(std::vector> &slices, + Slice::Info const& info) { const auto sliceIt = std::find_if(slices.begin(), slices.end(), - [&info](Slice const& s) { + [&info](Slice const& s) { // TODO: maybe implement comparison in Info struct return info.type == s.info.type && info.state == s.info.state @@ -448,13 +458,15 @@ struct Slice { }; // struct Slice -std::ostream& operator<<(std::ostream& out, Slice::Location const& v) { +template +std::ostream& operator<<(std::ostream& out, typename Slice::Location const& v) { // TODO: remove me out << "{.r(" << v.rank << "), .s(" << v.source << ")};"; return out; } -std::ostream& operator<<(std::ostream& out, Slice::Info const& i) { +template +std::ostream& operator<<(std::ostream& out, typename Slice::Info const& i) { out << "«t" << i.type << ", s" << i.state << "»" << " ⊙ {" << i.from.rank << ", " << i.from.source << "}" << " ∴ {" << i.tuple[0] << ", " << i.tuple[1] << "}" diff --git a/include/atrip/SliceUnion.hpp b/include/atrip/SliceUnion.hpp index 060dcc2..ec7aff6 100644 --- a/include/atrip/SliceUnion.hpp +++ b/include/atrip/SliceUnion.hpp @@ -1,4 +1,4 @@ -// [[file:../../atrip.org::*The slice union][The slice union:1]] +// [[file:~/cc4s/src/atrip/complex/atrip.org::*The%20slice%20union][The slice union:1]] #pragma once #include #include @@ -6,8 +6,8 @@ namespace atrip { + template struct SliceUnion { - using F 
= double; using Tensor = CTF::Tensor; virtual void @@ -20,7 +20,7 @@ namespace atrip { * This means that there can be at most one slice with a given Ty_x_Tu. */ void checkForDuplicates() const { - std::vector tytus; + std::vector::Ty_x_Tu> tytus; for (auto const& s: slices) { if (s.isFree()) continue; tytus.push_back({s.info.type, s.info.tuple}); @@ -33,13 +33,13 @@ namespace atrip { } - std::vector neededSlices(ABCTuple const& abc) { - std::vector needed(sliceTypes.size()); + std::vector::Ty_x_Tu> neededSlices(ABCTuple const& abc) { + std::vector::Ty_x_Tu> needed(sliceTypes.size()); // build the needed vector std::transform(sliceTypes.begin(), sliceTypes.end(), needed.begin(), - [&abc](Slice::Type const type) { - auto tuple = Slice::subtupleBySlice(abc, type); + [&abc](typename Slice::Type const type) { + auto tuple = Slice::subtupleBySlice(abc, type); return std::make_pair(type, tuple); }); return needed; @@ -64,8 +64,9 @@ namespace atrip { * slices. * */ - Slice::LocalDatabase buildLocalDatabase(ABCTuple const& abc) { - Slice::LocalDatabase result; + typename + Slice::LocalDatabase buildLocalDatabase(ABCTuple const& abc) { + typename Slice::LocalDatabase result; auto const needed = neededSlices(abc); @@ -95,7 +96,7 @@ namespace atrip { // need auto const& it = std::find_if(slices.begin(), slices.end(), - [&tuple, &type](Slice const& other) { + [&tuple, &type](Slice const& other) { return other.info.tuple == tuple && other.info.type == type // we only want another slice when it @@ -121,7 +122,7 @@ namespace atrip { // tuple and that has a valid data pointer. 
auto const& recycleIt = std::find_if(slices.begin(), slices.end(), - [&tuple, &type](Slice const& other) { + [&tuple, &type](Slice const& other) { return other.info.tuple == tuple && other.info.type != type && other.isRecyclable() @@ -132,13 +133,13 @@ namespace atrip { // (which should exist by construction :THINK) // if (recycleIt != slices.end()) { - auto& blank = Slice::findOneByType(slices, Slice::Blank); + auto& blank = Slice::findOneByType(slices, Slice::Blank); // TODO: formalize this through a method to copy information // from another slice blank.data = recycleIt->data; blank.info.type = type; blank.info.tuple = tuple; - blank.info.state = Slice::Recycled; + blank.info.state = Slice::Recycled; blank.info.from = from; blank.info.recycling = recycleIt->info.type; result.push_back({name, blank.info}); @@ -165,17 +166,17 @@ namespace atrip { << " for tuple " << tuple[0] << ", " << tuple[1] << "\n" ; - auto& blank = Slice::findOneByType(slices, Slice::Blank); + auto& blank = Slice::findOneByType(slices, Slice::Blank); blank.info.type = type; blank.info.tuple = tuple; blank.info.from = from; // Handle self sufficiency blank.info.state = Atrip::rank == from.rank - ? Slice::SelfSufficient - : Slice::Fetch + ? Slice::SelfSufficient + : Slice::Fetch ; - if (blank.info.state == Slice::SelfSufficient) { + if (blank.info.state == Slice::SelfSufficient) { blank.data = sources[from.source].data(); } else { if (freePointers.size() == 0) @@ -219,7 +220,7 @@ namespace atrip { // try to find the slice in the needed slices list auto const found = std::find_if(needed.begin(), needed.end(), - [&slice] (Slice::Ty_x_Tu const& tytu) { + [&slice] (typename Slice::Ty_x_Tu const& tytu) { return slice.info.tuple == tytu.second && slice.info.type == tytu.first ; @@ -238,7 +239,7 @@ namespace atrip { // allow to gc unwrapped and recycled, never Fetch, // if we have a Fetch slice then something has gone very wrong. 
- if (!slice.isUnwrapped() && slice.info.state != Slice::Recycled) + if (!slice.isUnwrapped() && slice.info.state != Slice::Recycled) throw std::domain_error("Trying to garbage collect " " a non-unwrapped slice! " @@ -259,13 +260,13 @@ namespace atrip { // - we should make sure that the data pointer of slice // does not get freed. // - if (slice.info.state == Slice::Ready) { + if (slice.info.state == Slice::Ready) { WITH_OCD WITH_RANK << "__gc__:" << "checking for data recycled dependencies\n"; auto recycled - = Slice::hasRecycledReferencingToIt(slices, slice.info); + = Slice::hasRecycledReferencingToIt(slices, slice.info); if (recycled.size()) { - Slice* newReady = recycled[0]; + Slice* newReady = recycled[0]; WITH_OCD WITH_RANK << "__gc__:" << "swaping recycled " << pretty_print(newReady->info) @@ -290,8 +291,8 @@ namespace atrip { // if the slice is self sufficient, do not dare touching the // pointer, since it is a pointer to our sources in our rank. - if ( slice.info.state == Slice::SelfSufficient - || slice.info.state == Slice::Recycled + if ( slice.info.state == Slice::SelfSufficient + || slice.info.state == Slice::Recycled ) { freeSlicePointer = false; } @@ -313,7 +314,8 @@ namespace atrip { // at this point, let us blank the slice WITH_RANK << "~~~:cl(" << name << ")" << " freeing up slice " - << " info " << slice.info + // TODO: make this possible + // << " info " << slice.info << "\n"; slice.free(); } @@ -323,13 +325,13 @@ namespace atrip { // CONSTRUCTOR SliceUnion( Tensor const& sourceTensor - , std::vector sliceTypes_ + , std::vector::Type> sliceTypes_ , std::vector sliceLength_ , std::vector paramLength , size_t np , MPI_Comm child_world , MPI_Comm global_world - , Slice::Name name_ + , typename Slice::Name name_ , size_t nSliceBuffers = 4 ) : rankMap(paramLength, np) @@ -344,13 +346,13 @@ namespace atrip { , name(name_) , sliceTypes(sliceTypes_) , sliceBuffers(nSliceBuffers, sources[0]) - //, slices(2 * sliceTypes.size(), Slice{ sources[0].size() }) 
+ //, slices(2 * sliceTypes.size(), Slice{ sources[0].size() }) { // constructor begin LOG(0,"Atrip") << "INIT SliceUnion: " << name << "\n"; slices - = std::vector(2 * sliceTypes.size(), { sources[0].size() }); + = std::vector>(2 * sliceTypes.size(), { sources[0].size() }); // TODO: think exactly ^------------------- about this number // initialize the freePointers with the pointers to the buffers @@ -419,19 +421,19 @@ namespace atrip { * \brief Send asynchronously only if the state is Fetch */ void send( size_t otherRank - , Slice::Info const& info + , typename Slice::Info const& info , size_t tag) const noexcept { MPI_Request request; bool sendData_p = false; - if (info.state == Slice::Fetch) sendData_p = true; + if (info.state == Slice::Fetch) sendData_p = true; // TODO: remove this because I have SelfSufficient if (otherRank == info.from.rank) sendData_p = false; if (!sendData_p) return; MPI_Isend( sources[info.from.source].data() , sources[info.from.source].size() - , MPI_DOUBLE /* TODO: adapt this with traits */ + , traits::mpi::datatypeOf() , otherRank , tag , universe @@ -445,19 +447,19 @@ namespace atrip { /** * \brief Receive asynchronously only if the state is Fetch */ - void receive(Slice::Info const& info, size_t tag) noexcept { - auto& slice = Slice::findByInfo(slices, info); + void receive(typename Slice::Info const& info, size_t tag) noexcept { + auto& slice = Slice::findByInfo(slices, info); if (Atrip::rank == info.from.rank) return; - if (slice.info.state == Slice::Fetch) { + if (slice.info.state == Slice::Fetch) { // TODO: do it through the slice class - slice.info.state = Slice::Dispatched; + slice.info.state = Slice::Dispatched; MPI_Request request; slice.request = request; MPI_Irecv( slice.data , slice.size - , MPI_DOUBLE // TODO: Adapt this with traits + , traits::mpi::datatypeOf() , info.from.rank , tag , universe @@ -471,42 +473,42 @@ namespace atrip { for (auto type: sliceTypes) unwrapSlice(type, abc); } - F* unwrapSlice(Slice::Type type, 
ABCTuple const& abc) { + F* unwrapSlice(typename Slice::Type type, ABCTuple const& abc) { WITH_CRAZY_DEBUG WITH_RANK << "__unwrap__:slice " << type << " w n " << name << " abc" << pretty_print(abc) << "\n"; - auto& slice = Slice::findByTypeAbc(slices, type, abc); - WITH_RANK << "__unwrap__:info " << slice.info << "\n"; + auto& slice = Slice::findByTypeAbc(slices, type, abc); + //WITH_RANK << "__unwrap__:info " << slice.info << "\n"; switch (slice.info.state) { - case Slice::Dispatched: + case Slice::Dispatched: WITH_RANK << "__unwrap__:Fetch: " << &slice << " info " << pretty_print(slice.info) << "\n"; slice.unwrapAndMarkReady(); return slice.data; break; - case Slice::SelfSufficient: + case Slice::SelfSufficient: WITH_RANK << "__unwrap__:SelfSufficient: " << &slice << " info " << pretty_print(slice.info) << "\n"; return slice.data; break; - case Slice::Ready: + case Slice::Ready: WITH_RANK << "__unwrap__:READY: UNWRAPPED ALREADY" << &slice << " info " << pretty_print(slice.info) << "\n"; return slice.data; break; - case Slice::Recycled: + case Slice::Recycled: WITH_RANK << "__unwrap__:RECYCLED " << &slice << " info " << pretty_print(slice.info) << "\n"; return unwrapSlice(slice.info.recycling, abc); break; - case Slice::Fetch: - case Slice::Acceptor: + case Slice::Fetch: + case Slice::Acceptor: throw std::domain_error("Can't unwrap an acceptor or fetch slice!"); break; default: @@ -515,24 +517,26 @@ namespace atrip { return slice.data; } - const RankMap rankMap; + const RankMap rankMap; const MPI_Comm world; const MPI_Comm universe; const std::vector sliceLength; std::vector< std::vector > sources; - std::vector< Slice > slices; - Slice::Name name; - const std::vector sliceTypes; + std::vector< Slice > slices; + typename Slice::Name name; + const std::vector::Type> sliceTypes; std::vector< std::vector > sliceBuffers; std::set freePointers; }; - SliceUnion& - unionByName(std::vector const& unions, Slice::Name name) { + template + SliceUnion& + 
unionByName(std::vector*> const& unions, + typename Slice::Name name) { const auto sliceUnionIt = std::find_if(unions.begin(), unions.end(), - [&name](SliceUnion const* s) { + [&name](SliceUnion const* s) { return name == s->name; }); if (sliceUnionIt == unions.end()) diff --git a/include/atrip/Tuples.hpp b/include/atrip/Tuples.hpp index 090eb9b..5d4b69f 100644 --- a/include/atrip/Tuples.hpp +++ b/include/atrip/Tuples.hpp @@ -1,4 +1,4 @@ -// [[file:../../atrip.org::*Tuples][Tuples:1]] +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Tuples][Tuples:1]] #pragma once #include diff --git a/include/atrip/Unions.hpp b/include/atrip/Unions.hpp index de924ee..db3b6b7 100644 --- a/include/atrip/Unions.hpp +++ b/include/atrip/Unions.hpp @@ -1,15 +1,16 @@ -// [[file:../../atrip.org::*Unions][Unions:1]] +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Unions][Unions:1]] #pragma once #include namespace atrip { + template void sliceIntoVector - ( std::vector &v - , CTF::Tensor &toSlice + ( std::vector &v + , CTF::Tensor &toSlice , std::vector const low , std::vector const up - , CTF::Tensor const& origin + , CTF::Tensor const& origin , std::vector const originLow , std::vector const originUp ) { @@ -36,155 +37,159 @@ namespace atrip { , origin_.low.data() , origin_.up.data() , 1.0); - memcpy(v.data(), toSlice.data, sizeof(double) * v.size()); + memcpy(v.data(), toSlice.data, sizeof(F) * v.size()); #endif } - struct TAPHH : public SliceUnion { - TAPHH( Tensor const& sourceTensor + template + struct TAPHH : public SliceUnion { + TAPHH( CTF::Tensor const& sourceTensor , size_t No , size_t Nv , size_t np , MPI_Comm child_world , MPI_Comm global_world - ) : SliceUnion( sourceTensor - , {Slice::A, Slice::B, Slice::C} - , {Nv, No, No} // size of the slices - , {Nv} - , np - , child_world - , global_world - , Slice::TA - , 4) { + ) : SliceUnion( sourceTensor + , {Slice::A, Slice::B, Slice::C} + , {Nv, No, No} // size of the slices + , {Nv} + , np + , child_world + , global_world + , 
Slice::TA + , 4) { init(sourceTensor); } - void sliceIntoBuffer(size_t it, Tensor &to, Tensor const& from) override + void sliceIntoBuffer(size_t it, CTF::Tensor &to, CTF::Tensor const& from) override { - const int Nv = sliceLength[0] - , No = sliceLength[1] - , a = rankMap.find({static_cast(Atrip::rank), it}); + const int Nv = this->sliceLength[0] + , No = this->sliceLength[1] + , a = this->rankMap.find({static_cast(Atrip::rank), it}); ; - sliceIntoVector( sources[it] - , to, {0, 0, 0}, {Nv, No, No} - , from, {a, 0, 0, 0}, {a+1, Nv, No, No} - ); + sliceIntoVector( this->sources[it] + , to, {0, 0, 0}, {Nv, No, No} + , from, {a, 0, 0, 0}, {a+1, Nv, No, No} + ); } }; - struct HHHA : public SliceUnion { - HHHA( Tensor const& sourceTensor + template + struct HHHA : public SliceUnion { + HHHA( CTF::Tensor const& sourceTensor , size_t No , size_t Nv , size_t np , MPI_Comm child_world , MPI_Comm global_world - ) : SliceUnion( sourceTensor - , {Slice::A, Slice::B, Slice::C} - , {No, No, No} // size of the slices - , {Nv} // size of the parametrization - , np - , child_world - , global_world - , Slice::VIJKA - , 4) { + ) : SliceUnion( sourceTensor + , {Slice::A, Slice::B, Slice::C} + , {No, No, No} // size of the slices + , {Nv} // size of the parametrization + , np + , child_world + , global_world + , Slice::VIJKA + , 4) { init(sourceTensor); } - void sliceIntoBuffer(size_t it, Tensor &to, Tensor const& from) override + void sliceIntoBuffer(size_t it, CTF::Tensor &to, CTF::Tensor const& from) override { - const int No = sliceLength[0] - , a = rankMap.find({static_cast(Atrip::rank), it}) + const int No = this->sliceLength[0] + , a = this->rankMap.find({static_cast(Atrip::rank), it}) ; - sliceIntoVector( sources[it] - , to, {0, 0, 0}, {No, No, No} - , from, {0, 0, 0, a}, {No, No, No, a+1} - ); + sliceIntoVector( this->sources[it] + , to, {0, 0, 0}, {No, No, No} + , from, {0, 0, 0, a}, {No, No, No, a+1} + ); } }; - struct ABPH : public SliceUnion { - ABPH( Tensor const& 
sourceTensor + template + struct ABPH : public SliceUnion { + ABPH( CTF::Tensor const& sourceTensor , size_t No , size_t Nv , size_t np , MPI_Comm child_world , MPI_Comm global_world - ) : SliceUnion( sourceTensor - , { Slice::AB, Slice::BC, Slice::AC - , Slice::BA, Slice::CB, Slice::CA - } - , {Nv, No} // size of the slices - , {Nv, Nv} // size of the parametrization - , np - , child_world - , global_world - , Slice::VABCI - , 2*6) { + ) : SliceUnion( sourceTensor + , { Slice::AB, Slice::BC, Slice::AC + , Slice::BA, Slice::CB, Slice::CA + } + , {Nv, No} // size of the slices + , {Nv, Nv} // size of the parametrization + , np + , child_world + , global_world + , Slice::VABCI + , 2*6) { init(sourceTensor); } - void sliceIntoBuffer(size_t it, Tensor &to, Tensor const& from) override { + void sliceIntoBuffer(size_t it, CTF::Tensor &to, CTF::Tensor const& from) override { - const int Nv = sliceLength[0] - , No = sliceLength[1] - , el = rankMap.find({static_cast(Atrip::rank), it}) + const int Nv = this->sliceLength[0] + , No = this->sliceLength[1] + , el = this->rankMap.find({static_cast(Atrip::rank), it}) , a = el % Nv , b = el / Nv ; - sliceIntoVector( sources[it] - , to, {0, 0}, {Nv, No} - , from, {a, b, 0, 0}, {a+1, b+1, Nv, No} - ); + sliceIntoVector( this->sources[it] + , to, {0, 0}, {Nv, No} + , from, {a, b, 0, 0}, {a+1, b+1, Nv, No} + ); } }; - struct ABHH : public SliceUnion { - ABHH( Tensor const& sourceTensor + template + struct ABHH : public SliceUnion { + ABHH( CTF::Tensor const& sourceTensor , size_t No , size_t Nv , size_t np , MPI_Comm child_world , MPI_Comm global_world - ) : SliceUnion( sourceTensor - , {Slice::AB, Slice::BC, Slice::AC} - , {No, No} // size of the slices - , {Nv, Nv} // size of the parametrization - , np - , child_world - , global_world - , Slice::VABIJ - , 6) { + ) : SliceUnion( sourceTensor + , {Slice::AB, Slice::BC, Slice::AC} + , {No, No} // size of the slices + , {Nv, Nv} // size of the parametrization + , np + , child_world + , 
global_world + , Slice::VABIJ + , 6) { init(sourceTensor); } - void sliceIntoBuffer(size_t it, Tensor &to, Tensor const& from) override { + void sliceIntoBuffer(size_t it, CTF::Tensor &to, CTF::Tensor const& from) override { const int Nv = from.lens[0] - , No = sliceLength[1] - , el = rankMap.find({static_cast(Atrip::rank), it}) + , No = this->sliceLength[1] + , el = this->rankMap.find({static_cast(Atrip::rank), it}) , a = el % Nv , b = el / Nv ; - sliceIntoVector( sources[it] - , to, {0, 0}, {No, No} - , from, {a, b, 0, 0}, {a+1, b+1, No, No} - ); + sliceIntoVector( this->sources[it] + , to, {0, 0}, {No, No} + , from, {a, b, 0, 0}, {a+1, b+1, No, No} + ); } @@ -192,39 +197,40 @@ namespace atrip { }; - struct TABHH : public SliceUnion { - TABHH( Tensor const& sourceTensor + template + struct TABHH : public SliceUnion { + TABHH( CTF::Tensor const& sourceTensor , size_t No , size_t Nv , size_t np , MPI_Comm child_world , MPI_Comm global_world - ) : SliceUnion( sourceTensor - , {Slice::AB, Slice::BC, Slice::AC} - , {No, No} // size of the slices - , {Nv, Nv} // size of the parametrization - , np - , child_world - , global_world - , Slice::TABIJ - , 6) { + ) : SliceUnion( sourceTensor + , {Slice::AB, Slice::BC, Slice::AC} + , {No, No} // size of the slices + , {Nv, Nv} // size of the parametrization + , np + , child_world + , global_world + , Slice::TABIJ + , 6) { init(sourceTensor); } - void sliceIntoBuffer(size_t it, Tensor &to, Tensor const& from) override { + void sliceIntoBuffer(size_t it, CTF::Tensor &to, CTF::Tensor const& from) override { // TODO: maybe generalize this with ABHH const int Nv = from.lens[0] - , No = sliceLength[1] - , el = rankMap.find({static_cast(Atrip::rank), it}) + , No = this->sliceLength[1] + , el = this->rankMap.find({static_cast(Atrip::rank), it}) , a = el % Nv , b = el / Nv ; - sliceIntoVector( sources[it] - , to, {0, 0}, {No, No} - , from, {a, b, 0, 0}, {a+1, b+1, No, No} - ); + sliceIntoVector( this->sources[it] + , to, {0, 0}, {No, 
No} + , from, {a, b, 0, 0}, {a+1, b+1, No, No} + ); } diff --git a/include/atrip/Utils.hpp b/include/atrip/Utils.hpp index a6bd743..bff3d19 100644 --- a/include/atrip/Utils.hpp +++ b/include/atrip/Utils.hpp @@ -1,4 +1,4 @@ -// [[file:../../atrip.org::*Utils][Utils:1]] +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Utils][Utils:1]] #pragma once #include #include diff --git a/src/atrip/Atrip.cxx b/src/atrip/Atrip.cxx index 64dea9b..fc613b6 100644 --- a/src/atrip/Atrip.cxx +++ b/src/atrip/Atrip.cxx @@ -1,4 +1,4 @@ -// [[file:../../atrip.org::*Main][Main:1]] +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Main][Main:1]] #include #include @@ -23,7 +23,8 @@ void Atrip::init() { MPI_Comm_size(MPI_COMM_WORLD, &Atrip::np); } -Atrip::Output Atrip::run(Atrip::Input const& in) { +template +Atrip::Output Atrip::run(Atrip::Input const& in) { const int np = Atrip::np; const int rank = Atrip::rank; @@ -38,14 +39,14 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { LOG(0,"Atrip") << "Nv: " << Nv << "\n"; // allocate the three scratches, see piecuch - std::vector Tijk(No*No*No) // doubles only (see piecuch) - , Zijk(No*No*No) // singles + doubles (see piecuch) - // we need local copies of the following tensors on every - // rank - , epsi(No) - , epsa(Nv) - , Tai(No * Nv) - ; + std::vector Tijk(No*No*No) // doubles only (see piecuch) + , Zijk(No*No*No) // singles + doubles (see piecuch) + // we need local copies of the following tensors on every + // rank + , epsi(No) + , epsa(Nv) + , Tai(No * Nv) + ; in.ei->read_all(epsi.data()); in.ea->read_all(epsa.data()); @@ -74,20 +75,20 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { chrono["nv-slices"].start(); // BUILD SLICES PARAMETRIZED BY NV ==================================={{{1 LOG(0,"Atrip") << "BUILD NV-SLICES\n"; - TAPHH taphh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); - HHHA hhha(*in.Vhhhp, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); + TAPHH taphh(*in.Tpphh, (size_t)No, 
(size_t)Nv, (size_t)np, child_comm, universe); + HHHA hhha(*in.Vhhhp, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); chrono["nv-slices"].stop(); chrono["nv-nv-slices"].start(); // BUILD SLICES PARAMETRIZED BY NV x NV =============================={{{1 LOG(0,"Atrip") << "BUILD NV x NV-SLICES\n"; - ABPH abph(*in.Vppph, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); - ABHH abhh(*in.Vpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); - TABHH tabhh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); + ABPH abph(*in.Vppph, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); + ABHH abhh(*in.Vpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); + TABHH tabhh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); chrono["nv-nv-slices"].stop(); // all tensors - std::vector< SliceUnion* > unions = {&taphh, &hhha, &abph, &abhh, &tabhh}; + std::vector< SliceUnion* > unions = {&taphh, &hhha, &abph, &abhh, &tabhh}; //CONSTRUCT TUPLE LIST ==============================================={{{1 LOG(0,"Atrip") << "BUILD TUPLE LIST\n"; @@ -121,18 +122,20 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { = [&tuplesList](size_t const i) { return i >= tuplesList.size(); }; + using Database = typename Slice::Database; + using LocalDatabase = typename Slice::LocalDatabase; auto communicateDatabase = [ &unions , np , &chrono - ] (ABCTuple const& abc, MPI_Comm const& c) -> Slice::Database { + ] (ABCTuple const& abc, MPI_Comm const& c) -> Database { chrono["db:comm:type:do"].start(); - auto MPI_LDB_ELEMENT = Slice::mpi::localDatabaseElement(); + auto MPI_LDB_ELEMENT = Slice::mpi::localDatabaseElement(); chrono["db:comm:type:do"].stop(); chrono["db:comm:ldb"].start(); - Slice::LocalDatabase ldb; + LocalDatabase ldb; for (auto const& tensor: unions) { auto const& tensorDb = tensor->buildLocalDatabase(abc); @@ -140,7 +143,7 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { } 
chrono["db:comm:ldb"].stop(); - Slice::Database db(np * ldb.size(), ldb[0]); + Database db(np * ldb.size(), ldb[0]); chrono["oneshot-db:comm:allgather"].start(); chrono["db:comm:allgather"].start(); @@ -162,7 +165,7 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { }; auto doIOPhase - = [&unions, &rank, &np, &universe, &chrono] (Slice::Database const& db) { + = [&unions, &rank, &np, &universe, &chrono] (Database const& db) { const size_t localDBLength = db.size() / np; @@ -212,7 +215,7 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { ; for (auto it = begin; it != end; ++it) { sendTag++; - Slice::LocalDatabaseElement const& el = *it; + typename Slice::LocalDatabaseElement const& el = *it; if (el.info.from.rank != rank) continue; @@ -261,14 +264,13 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { // START MAIN LOOP ======================================================{{{1 - Slice::Database db; - for ( size_t i = abcIndex.first, iteration = 1 ; i < abcIndex.second ; i++, iteration++ ) { chrono["iterations"].start(); + // check overhead from chrono over all iterations chrono["start:stop"].start(); chrono["start:stop"].stop(); @@ -347,7 +349,7 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { WITH_RANK << "__comm__:" << iteration << "th communicating database\n"; chrono["db:comm"].start(); //const auto db = communicateDatabase(*abcNext, universe); - db = communicateDatabase(*abcNext, universe); + Database db = communicateDatabase(*abcNext, universe); chrono["db:comm"].stop(); chrono["db:io"].start(); doIOPhase(db); @@ -368,30 +370,30 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { ))) chrono["oneshot-doubles"].start(); chrono["doubles"].start(); - doublesContribution( abc, (size_t)No, (size_t)Nv - // -- VABCI - , abph.unwrapSlice(Slice::AB, abc) - , abph.unwrapSlice(Slice::AC, abc) - , abph.unwrapSlice(Slice::BC, abc) - , abph.unwrapSlice(Slice::BA, abc) - , abph.unwrapSlice(Slice::CA, abc) - , abph.unwrapSlice(Slice::CB, abc) - // -- VHHHA - , 
hhha.unwrapSlice(Slice::A, abc) - , hhha.unwrapSlice(Slice::B, abc) - , hhha.unwrapSlice(Slice::C, abc) - // -- TA - , taphh.unwrapSlice(Slice::A, abc) - , taphh.unwrapSlice(Slice::B, abc) - , taphh.unwrapSlice(Slice::C, abc) - // -- TABIJ - , tabhh.unwrapSlice(Slice::AB, abc) - , tabhh.unwrapSlice(Slice::AC, abc) - , tabhh.unwrapSlice(Slice::BC, abc) - // -- TIJK - , Tijk.data() - , chrono - ); + doublesContribution( abc, (size_t)No, (size_t)Nv + // -- VABCI + , abph.unwrapSlice(Slice::AB, abc) + , abph.unwrapSlice(Slice::AC, abc) + , abph.unwrapSlice(Slice::BC, abc) + , abph.unwrapSlice(Slice::BA, abc) + , abph.unwrapSlice(Slice::CA, abc) + , abph.unwrapSlice(Slice::CB, abc) + // -- VHHHA + , hhha.unwrapSlice(Slice::A, abc) + , hhha.unwrapSlice(Slice::B, abc) + , hhha.unwrapSlice(Slice::C, abc) + // -- TA + , taphh.unwrapSlice(Slice::A, abc) + , taphh.unwrapSlice(Slice::B, abc) + , taphh.unwrapSlice(Slice::C, abc) + // -- TABIJ + , tabhh.unwrapSlice(Slice::AB, abc) + , tabhh.unwrapSlice(Slice::AC, abc) + , tabhh.unwrapSlice(Slice::BC, abc) + // -- TIJK + , Tijk.data() + , chrono + ); WITH_RANK << iteration << "-th doubles done\n"; chrono["doubles"].stop(); chrono["oneshot-doubles"].stop(); @@ -409,12 +411,12 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { for (size_t I(0); I < Zijk.size(); I++) Zijk[I] = Tijk[I]; chrono["reorder"].stop(); chrono["singles"].start(); - singlesContribution( No, Nv, abc - , Tai.data() - , abhh.unwrapSlice(Slice::AB, abc) - , abhh.unwrapSlice(Slice::AC, abc) - , abhh.unwrapSlice(Slice::BC, abc) - , Zijk.data()); + singlesContribution( No, Nv, abc + , Tai.data() + , abhh.unwrapSlice(Slice::AB, abc) + , abhh.unwrapSlice(Slice::AC, abc) + , abhh.unwrapSlice(Slice::BC, abc) + , Zijk.data()); chrono["singles"].stop(); } @@ -426,13 +428,13 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { int distinct(0); if (abc[0] == abc[1]) distinct++; if (abc[1] == abc[2]) distinct--; - const double epsabc(epsa[abc[0]] + epsa[abc[1]] + 
epsa[abc[2]]); + const F epsabc(epsa[abc[0]] + epsa[abc[1]] + epsa[abc[2]]); chrono["energy"].start(); if ( distinct == 0) - tupleEnergy = getEnergyDistinct(epsabc, epsi, Tijk, Zijk); + tupleEnergy = getEnergyDistinct(epsabc, epsi, Tijk, Zijk); else - tupleEnergy = getEnergySame(epsabc, epsi, Tijk, Zijk); + tupleEnergy = getEnergySame(epsabc, epsi, Tijk, Zijk); chrono["energy"].stop(); #if defined(HAVE_OCD) || defined(ATRIP_PRINT_TUPLES) @@ -473,8 +475,8 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { << " :abc " << pretty_print(abc) << " :abcN " << pretty_print(*abcNext) << "\n"; - for (auto const& slice: u->slices) - WITH_RANK << "__gc__:guts:" << slice.info << "\n"; + // for (auto const& slice: u->slices) + // WITH_RANK << "__gc__:guts:" << slice.info << "\n"; u->clearUnusedSlicesForNext(*abcNext); WITH_RANK << "__gc__: checking validity\n"; @@ -482,13 +484,13 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { #ifdef HAVE_OCD // check for validity of the slices for (auto type: u->sliceTypes) { - auto tuple = Slice::subtupleBySlice(abc, type); + auto tuple = Slice::subtupleBySlice(abc, type); for (auto& slice: u->slices) { if ( slice.info.type == type && slice.info.tuple == tuple && slice.isDirectlyFetchable() ) { - if (slice.info.state == Slice::Dispatched) + if (slice.info.state == Slice::Dispatched) throw std::domain_error( "This slice should not be undispatched! 
" + pretty_print(slice.info)); } @@ -555,4 +557,7 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { return { - globalEnergy }; } +// instantiate +template Atrip::Output Atrip::run(Atrip::Input const& in); +template Atrip::Output Atrip::run(Atrip::Input const& in); // Main:1 ends here From e89bd8f150c261357a0e6896c7eb5ed9bfb56332 Mon Sep 17 00:00:00 2001 From: Alejandro Gallo Date: Wed, 9 Feb 2022 19:35:00 +0100 Subject: [PATCH 17/22] Add correct conjugate templated function --- atrip.org | 43 +++++++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/atrip.org b/atrip.org index 8e4ec91..32b0b89 100644 --- a/atrip.org +++ b/atrip.org @@ -21,6 +21,9 @@ The algorithm uses two main data types, the =Slice= and the namespace atrip { +template FF maybeConjugate(const FF a) { return a; } +template <> Complex maybeConjugate(const Complex a) { return std::conj(a); } + namespace traits { template bool isComplex() { return false; }; template <> bool isComplex() { return true; }; @@ -1521,12 +1524,12 @@ namespace atrip { , X(Zijk_[j + No*k + No*No*i]) , Y(Zijk_[k + No*i + No*No*j]) , Z(Zijk_[k + No*j + No*No*i]) - , A(std::conj(Tijk_[i + No*j + No*No*k])) - , B(std::conj(Tijk_[i + No*k + No*No*j])) - , C(std::conj(Tijk_[j + No*i + No*No*k])) - , D(std::conj(Tijk_[j + No*k + No*No*i])) - , E(std::conj(Tijk_[k + No*i + No*No*j])) - , F(std::conj(Tijk_[k + No*j + No*No*i])) + , A(maybeConjugate(Tijk_[i + No*j + No*No*k])) + , B(maybeConjugate(Tijk_[i + No*k + No*No*j])) + , C(maybeConjugate(Tijk_[j + No*i + No*No*k])) + , D(maybeConjugate(Tijk_[j + No*k + No*No*i])) + , E(maybeConjugate(Tijk_[k + No*i + No*No*j])) + , F(maybeConjugate(Tijk_[k + No*j + No*No*i])) , value = 3.0 * ( A * U + B * V @@ -1583,9 +1586,9 @@ namespace atrip { , U(Zijk_[i + No*j + No*No*k]) , V(Zijk_[j + No*k + No*No*i]) , W(Zijk_[k + No*i + No*No*j]) - , A(std::conj(Tijk_[i + No*j + No*No*k])) - , B(std::conj(Tijk_[j + No*k + No*No*i])) - , 
C(std::conj(Tijk_[k + No*i + No*No*j])) + , A(maybeConjugate(Tijk_[i + No*j + No*No*k])) + , B(maybeConjugate(Tijk_[j + No*k + No*No*i])) + , C(maybeConjugate(Tijk_[k + No*i + No*No*j])) , value = F(3.0) * ( A * U + B * V @@ -1701,14 +1704,9 @@ namespace atrip { , _t_buffer.data() \ , (int const*)&NoNo \ ); - #define MAYBE_CONJ(_conj, _buffer) \ - if (traits::isComplex()) { \ - for (size_t __i = 0; __i < NoNoNo; ++__i) \ - _conj[__i] = std::conj(_buffer[__i]); \ - } else { \ - for (size_t __i = 0; __i < NoNoNo; ++__i) \ - _conj[__i] = _buffer[__i]; \ - } + #define MAYBE_CONJ(_conj, _buffer) \ + for (size_t __i = 0; __i < NoNoNo; ++__i) \ + _conj[__i] = maybeConjugate(_buffer[__i]); \ const size_t NoNoNo = No*NoNo; std::vector _t_buffer; @@ -2259,11 +2257,12 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { const double doublesFlops = double(No) - ,* double(No) - ,* double(No) - ,* (double(No) + double(Nv)) - ,* 2 - ,* 6 + * double(No) + * double(No) + * (double(No) + double(Nv)) + * 1 + * (traits::isComplex() ? 
2.0 : 1.0) + * 6 / 1e9 ; From 66f2de1083a26bec8cf72b23b05d0e1782fcaafa Mon Sep 17 00:00:00 2001 From: Alejandro Gallo Date: Mon, 14 Feb 2022 11:26:35 +0100 Subject: [PATCH 18/22] Improve MPI handling for enums --- atrip.org | 47 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 15 deletions(-) diff --git a/atrip.org b/atrip.org index 32b0b89..734972a 100644 --- a/atrip.org +++ b/atrip.org @@ -156,20 +156,23 @@ As an example, for the doubles amplitudes \( T^{ab}_{ij} \), one need two kinds const std::vector lengths(n, 1); const MPI_Datatype types[n] = {usizeDt(), usizeDt()}; + static_assert(sizeof(Slice::Location) == 2 * sizeof(size_t), + "The Location packing is wrong in your compiler"); + // measure the displacements in the struct size_t j = 0; - MPI_Aint displacements[n]; + MPI_Aint base_address, displacements[n]; + MPI_Get_address(&measure, &base_address); MPI_Get_address(&measure.rank, &displacements[j++]); MPI_Get_address(&measure.source, &displacements[j++]); - for (size_t i = 1; i < n; i++) displacements[i] -= displacements[0]; - displacements[0] = 0; + for (size_t i = 0; i < n; i++) + displacements[i] = MPI_Aint_diff(displacements[i], base_address); MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt); MPI_Type_commit(&dt); return dt; } - static MPI_Datatype enumDt() { return MPI_INT; } static MPI_Datatype usizeDt() { return MPI_UINT64_T; } static MPI_Datatype sliceInfo () { @@ -179,22 +182,31 @@ As an example, for the doubles amplitudes \( T^{ab}_{ij} \), one need two kinds const std::vector lengths(n, 1); const MPI_Datatype types[n] = { vector(2, usizeDt()) - , enumDt() - , enumDt() + /*, MPI_UINT64_T*/ + , vector(sizeof(enum Type), MPI_CHAR) + /*, MPI_UINT64_T*/ + , vector(sizeof(enum State), MPI_CHAR) + /*, vector(sizeof(Location), MPI_CHAR)*/ , sliceLocation() - , enumDt() + , vector(sizeof(enum Type), MPI_CHAR) + /*, MPI_UINT64_T*/ }; + static_assert(sizeof(enum Type) == 4, "Enum type not 4 bytes long"); 
+ static_assert(sizeof(enum State) == 4, "Enum State not 4 bytes long"); + static_assert(sizeof(enum Name) == 4, "Enum Name not 4 bytes long"); + // create the displacements from the info measurement struct size_t j = 0; - MPI_Aint displacements[n]; - MPI_Get_address(measure.tuple.data(), &displacements[j++]); + MPI_Aint base_address, displacements[n]; + MPI_Get_address(&measure, &base_address); + MPI_Get_address(&measure.tuple[0], &displacements[j++]); MPI_Get_address(&measure.type, &displacements[j++]); MPI_Get_address(&measure.state, &displacements[j++]); MPI_Get_address(&measure.from, &displacements[j++]); MPI_Get_address(&measure.recycling, &displacements[j++]); - for (size_t i = 1; i < n; i++) displacements[i] -= displacements[0]; - displacements[0] = 0; + for (size_t i = 0; i < n; i++) + displacements[i] = MPI_Aint_diff(displacements[i], base_address); MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt); MPI_Type_commit(&dt); @@ -207,13 +219,15 @@ As an example, for the doubles amplitudes \( T^{ab}_{ij} \), one need two kinds LocalDatabaseElement measure; const std::vector lengths(n, 1); const MPI_Datatype types[n] - = { enumDt() + = { vector(sizeof(enum Name), MPI_CHAR) + /*= { MPI_UINT64_T*/ , sliceInfo() }; // measure the displacements in the struct size_t j = 0; - MPI_Aint displacements[n]; + MPI_Aint base_address, displacements[n]; + MPI_Get_address(&measure, &base_address); MPI_Get_address(&measure.name, &displacements[j++]); MPI_Get_address(&measure.info, &displacements[j++]); for (size_t i = 1; i < n; i++) displacements[i] -= displacements[0]; @@ -221,6 +235,9 @@ As an example, for the doubles amplitudes \( T^{ab}_{ij} \), one need two kinds MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt); MPI_Type_commit(&dt); + /*return vector( 4 + 4 + 48, MPI_CHAR);*/ + // TODO + return vector(sizeof(LocalDatabaseElement), MPI_CHAR); return dt; } @@ -2260,9 +2277,9 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { * 
double(No) * double(No) * (double(No) + double(Nv)) - * 1 + * 2.0 * (traits::isComplex() ? 2.0 : 1.0) - * 6 + * 6.0 / 1e9 ; From 728c27074532df00f5a70fb421a3a7fd40dbd67e Mon Sep 17 00:00:00 2001 From: Alejandro Gallo Date: Mon, 14 Feb 2022 11:36:58 +0100 Subject: [PATCH 19/22] Add the pertinents todos --- atrip.org | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/atrip.org b/atrip.org index 734972a..20346d6 100644 --- a/atrip.org +++ b/atrip.org @@ -192,9 +192,9 @@ As an example, for the doubles amplitudes \( T^{ab}_{ij} \), one need two kinds /*, MPI_UINT64_T*/ }; - static_assert(sizeof(enum Type) == 4, "Enum type not 4 bytes long"); + static_assert(sizeof(enum Type) == 4, "Enum type not 4 bytes long"); static_assert(sizeof(enum State) == 4, "Enum State not 4 bytes long"); - static_assert(sizeof(enum Name) == 4, "Enum Name not 4 bytes long"); + static_assert(sizeof(enum Name) == 4, "Enum Name not 4 bytes long"); // create the displacements from the info measurement struct size_t j = 0; @@ -230,14 +230,16 @@ As an example, for the doubles amplitudes \( T^{ab}_{ij} \), one need two kinds MPI_Get_address(&measure, &base_address); MPI_Get_address(&measure.name, &displacements[j++]); MPI_Get_address(&measure.info, &displacements[j++]); - for (size_t i = 1; i < n; i++) displacements[i] -= displacements[0]; - displacements[0] = 0; + for (size_t i = 0; i < n; i++) + displacements[i] = MPI_Aint_diff(displacements[i], base_address); + + static_assert( sizeof(LocalDatabaseElement) == sizeof(measure) + , "Measure has bad size"); MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt); MPI_Type_commit(&dt); - /*return vector( 4 + 4 + 48, MPI_CHAR);*/ - // TODO return vector(sizeof(LocalDatabaseElement), MPI_CHAR); + // TODO: write tests in order to know if this works return dt; } From 3dc38a43b5be004b714ef723172c53b54d2f457a Mon Sep 17 00:00:00 2001 From: Alejandro Gallo Date: Fri, 18 Feb 2022 12:44:01 +0100 Subject: [PATCH 20/22] 
Merge group-and-sort with complex --- atrip.org | 2211 ++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 1585 insertions(+), 626 deletions(-) diff --git a/atrip.org b/atrip.org index 20346d6..c6ea744 100644 --- a/atrip.org +++ b/atrip.org @@ -8,6 +8,9 @@ The algorithm uses two main data types, the =Slice= and the ** The slice +The following section introduces the idea of a slice. + +*** Prolog :noexport: #+begin_src c++ :tangle (atrip-slice-h) #pragma once #include @@ -39,6 +42,7 @@ template struct Slice { #+end_src +*** Introduction A slice is the concept of a subset of values of a given tensor. As an example, for the doubles amplitudes \( T^{ab}_{ij} \), one need two kinds of objects: @@ -48,13 +52,63 @@ As an example, for the doubles amplitudes \( T^{ab}_{ij} \), one need two kinds - the object \( \mathsf{T}(a,b)_{ij} \) which for every pair of \( a, b \) corresponds the \( N_\mathrm{o}^2 \)-sized tensor \( T^{ab}_{ij} \). +*** Location + +Every slice set, for instance, +\( S_k = \left\{ + a \mapsto \mathsf{T}(a)^{b}_{ij} + \mid + a \in A_k +\right\} \) +where \( A_k \) is some subset of +\( \mathsf{N}_\mathrm{v} \), +gets stored in some rank \( k \). +In general however, the number of elements in \( A_k \) can be bigger +than the number of processes \( n_p \). Therefore in order to uniquely +identify a given slice in \( S_k \) we need two identifiers, +the rank \( k \), which tells us in which core's memory the slice is +allocated, and an additional tag which we will call =source=.
+ +The datatype that simply models this state of affairs +is therefore a simple structure: + +#+begin_src c++ :tangle (atrip-slice-h) + struct Location { size_t rank; size_t source; }; +#+end_src + +*** Type + +Due to the permutation operators in the equations +it is noticeable that for every one dimensional +slice and triple \( (a,b,c) \) +\begin{equation*} +a \mapsto \mathsf{t}(a) +\end{equation*} +one needs at the same time +\( \mathsf{t}(a) \), +\( \mathsf{t}(b) \) and +\( \mathsf{t}(c) \). +For two dimensional slices, i.e., slices of the form +\begin{equation*} +(a,b) \mapsto \mathsf{t}(a,b) +\end{equation*} +one needs in the equations the slices +\( \mathsf{t}(a,b) \), +\( \mathsf{t}(b,c) \) and +\( \mathsf{t}(a,c) \). +In addition, in the case of diagrams where +the integral \( V^{ab}_{ci} \) appears, +we additionally need the permuted slices +from before, i.e. +\( \mathsf{t}(b,a) \), +\( \mathsf{t}(c,b) \) and +\( \mathsf{t}(c,a) \). + +This means, every slice has associated with it +a type which denotes which permutation it is. #+begin_src c++ :tangle (atrip-slice-h) - // ASSOCIATED TYPES %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - - struct Location { size_t rank; size_t source; }; - enum Type { A = 10 , B @@ -70,53 +124,102 @@ As an example, for the doubles amplitudes \( T^{ab}_{ij} \), one need two kinds // The non-typed slice , Blank = 404 }; +#+end_src +*** State + +Every slice can be in different states and every state +denotes which function the slice is going to provide +and which relations they have between themselves. + +- Fetch :: + A slice is in state =Fetch= when it + has a valid data pointer that **must** be written to. + A =Fetch= slice should not live very long, this means + that after the database send and receive phase, + =Fetch= slices should be changed into =Dispatched= + in order to start the process of writing to the + data pointer from some other rank.
+- Dispatched :: + A =Dispatched= slice indicates that at some point + send and receive MPI calls have been dispatched + in order to get the data. + However, the calls have just been dispatched and there + is no guarantee that the data is already there; for that, + the slice must be unwrapped. +- Ready :: + =Ready= means that the data pointer can be read from + directly. +- SelfSufficient :: + A slice is =SelfSufficient= when its contents are located + in the same rank that it lives, so that it does not have to + fetch from any other rank. + This is important in order to handle the data pointers correctly + and in order to save calls to MPI receive and send functions. +- Recycled :: + =Recycled= means that this slice gets its data pointer from another + slice, so it should not be written to +- Acceptor :: + =Acceptor= means that the slice can accept a new slice, it is + the counterpart of the =Blank= type, but for states + +Again the implementation is a simple enum type. + +#+begin_src c++ :tangle (atrip-slice-h) enum State { - // Fetch represents the state where a slice is to be fetched - // and has a valid data pointer that can be written to Fetch = 0, - // Dispatches represents the state that an MPI call has been - // dispatched in order to get the data, but the data has not been - // yet unwrapped, the data might be there or we might have to wait. Dispatched = 2, - // Ready means that the data pointer can be read from Ready = 1, - // Self sufficient is a slice when its contents are located - // in the same rank that it lives, so that it does not have to - // fetch from no one else.
SelfSufficient = 911, - // Recycled means that this slice gets its data pointer from another - // slice, so it should not be written to Recycled = 123, - // Acceptor means that the Slice can accept a new Slice, it is - // the counterpart of the Blank type, but for states Acceptor = 405 }; +#+end_src - struct Info { - // which part of a,b,c the slice holds - PartialTuple tuple; - // The type of slice for the user to retrieve the correct one - Type type; - // What is the state of the slice - State state; - // Where the slice is to be retrieved - // NOTE: this can actually be computed from tuple - Location from; - // If the data are actually to be found in this other slice - Type recycling; +*** The Info structure - Info() : tuple{0,0} - , type{Blank} - , state{Acceptor} - , from{0,0} - , recycling{Blank} - {} - }; +Every slice has an information structure associated with it +that keeps track of the **variable** type, state and so on. - using Ty_x_Tu = std::pair< Type, PartialTuple >; +#+begin_src c++ :tangle (atrip-slice-h) +struct Info { + // which part of a,b,c the slice holds + PartialTuple tuple; + // The type of slice for the user to retrieve the correct one + Type type; + // What is the state of the slice + State state; + // Where the slice is to be retrieved + Location from; + // If the data are actually to be found in this other slice + Type recycling; - // Names of the integrals that are considered in CCSD(T) + Info() : tuple{0,0} + , type{Blank} + , state{Acceptor} + , from{0,0} + , recycling{Blank} + {} +}; + +using Ty_x_Tu = std::pair< Type, PartialTuple >; +#+end_src + +*** Name + +CCSD(T) needs in this algorithm 5 types of tensor slices, +namely +\( V^{ij}_{ka} \), \( V^{ab}_{ci} \), +\( V^{ab}_{ij} \) +and two times \( T^{ab}_{ij} \). 
+The reason why we need two times the doubles +amplitudes is because in the doubles contribution +to the energy, the \( T \) amplitudes will be sliced +through one parameter for the particle contribution +and through two parameters for the hole contribution. + + +#+begin_src c++ :tangle (atrip-slice-h) enum Name { TA = 100 , VIJKA = 101 @@ -124,276 +227,369 @@ As an example, for the doubles amplitudes \( T^{ab}_{ij} \), one need two kinds , TABIJ = 201 , VABIJ = 202 }; +#+end_src - // DATABASE ==========================================================={{{1 +*** Database + +The database is a simple representation of the slices of a slice union. +Every element of the database is given by the name of the tensor it +represents and the internal information structure. + +#+begin_src c++ :tangle (atrip-slice-h) struct LocalDatabaseElement { Slice::Name name; Slice::Info info; }; +#+end_src + +A local database (of a given rank) and the global database is thus simply +a vector of these elements.
+ +#+begin_src c++ :tangle (atrip-slice-h) using LocalDatabase = std::vector; using Database = LocalDatabase; +#+end_src +*** MPI Types +#+begin_src c++ :tangle (atrip-slice-h) +struct mpi { - // STATIC METHODS =========================================================== - // - // They are useful to organize the structure of slices - - struct mpi { - - static MPI_Datatype vector(size_t n, MPI_Datatype const& DT) { - MPI_Datatype dt; - MPI_Type_vector(n, 1, 1, DT, &dt); - MPI_Type_commit(&dt); - return dt; - } - - static MPI_Datatype sliceLocation () { - constexpr int n = 2; - // create a sliceLocation to measure in the current architecture - // the packing of the struct - Slice::Location measure; - MPI_Datatype dt; - const std::vector lengths(n, 1); - const MPI_Datatype types[n] = {usizeDt(), usizeDt()}; - - static_assert(sizeof(Slice::Location) == 2 * sizeof(size_t), - "The Location packing is wrong in your compiler"); - - // measure the displacements in the struct - size_t j = 0; - MPI_Aint base_address, displacements[n]; - MPI_Get_address(&measure, &base_address); - MPI_Get_address(&measure.rank, &displacements[j++]); - MPI_Get_address(&measure.source, &displacements[j++]); - for (size_t i = 0; i < n; i++) - displacements[i] = MPI_Aint_diff(displacements[i], base_address); - - MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt); - MPI_Type_commit(&dt); - return dt; - } - - static MPI_Datatype usizeDt() { return MPI_UINT64_T; } - - static MPI_Datatype sliceInfo () { - constexpr int n = 5; - MPI_Datatype dt; - Slice::Info measure; - const std::vector lengths(n, 1); - const MPI_Datatype types[n] - = { vector(2, usizeDt()) - /*, MPI_UINT64_T*/ - , vector(sizeof(enum Type), MPI_CHAR) - /*, MPI_UINT64_T*/ - , vector(sizeof(enum State), MPI_CHAR) - /*, vector(sizeof(Location), MPI_CHAR)*/ - , sliceLocation() - , vector(sizeof(enum Type), MPI_CHAR) - /*, MPI_UINT64_T*/ - }; - - static_assert(sizeof(enum Type) == 4, "Enum type not 4 bytes long"); - 
static_assert(sizeof(enum State) == 4, "Enum State not 4 bytes long"); - static_assert(sizeof(enum Name) == 4, "Enum Name not 4 bytes long"); - - // create the displacements from the info measurement struct - size_t j = 0; - MPI_Aint base_address, displacements[n]; - MPI_Get_address(&measure, &base_address); - MPI_Get_address(&measure.tuple[0], &displacements[j++]); - MPI_Get_address(&measure.type, &displacements[j++]); - MPI_Get_address(&measure.state, &displacements[j++]); - MPI_Get_address(&measure.from, &displacements[j++]); - MPI_Get_address(&measure.recycling, &displacements[j++]); - for (size_t i = 0; i < n; i++) - displacements[i] = MPI_Aint_diff(displacements[i], base_address); - - MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt); - MPI_Type_commit(&dt); - return dt; - } - - static MPI_Datatype localDatabaseElement () { - constexpr int n = 2; - MPI_Datatype dt; - LocalDatabaseElement measure; - const std::vector lengths(n, 1); - const MPI_Datatype types[n] - = { vector(sizeof(enum Name), MPI_CHAR) - /*= { MPI_UINT64_T*/ - , sliceInfo() - }; - - // measure the displacements in the struct - size_t j = 0; - MPI_Aint base_address, displacements[n]; - MPI_Get_address(&measure, &base_address); - MPI_Get_address(&measure.name, &displacements[j++]); - MPI_Get_address(&measure.info, &displacements[j++]); - for (size_t i = 0; i < n; i++) - displacements[i] = MPI_Aint_diff(displacements[i], base_address); - - static_assert( sizeof(LocalDatabaseElement) == sizeof(measure) - , "Measure has bad size"); - - MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt); - MPI_Type_commit(&dt); - return vector(sizeof(LocalDatabaseElement), MPI_CHAR); - // TODO: write tests in order to know if this works - return dt; - } - - }; - - static - PartialTuple subtupleBySlice(ABCTuple abc, Type sliceType) { - switch (sliceType) { - case AB: return {abc[0], abc[1]}; - case BC: return {abc[1], abc[2]}; - case AC: return {abc[0], abc[2]}; - case CB: return 
{abc[2], abc[1]}; - case BA: return {abc[1], abc[0]}; - case CA: return {abc[2], abc[0]}; - case A: return {abc[0], 0}; - case B: return {abc[1], 0}; - case C: return {abc[2], 0}; - default: throw "Switch statement not exhaustive!"; - } + static MPI_Datatype vector(size_t n, MPI_Datatype const& DT) { + MPI_Datatype dt; + MPI_Type_vector(n, 1, 1, DT, &dt); + MPI_Type_commit(&dt); + return dt; } + static MPI_Datatype sliceLocation () { + constexpr int n = 2; + // create a sliceLocation to measure in the current architecture + // the packing of the struct + Slice::Location measure; + MPI_Datatype dt; + const std::vector lengths(n, 1); + const MPI_Datatype types[n] = {usizeDt(), usizeDt()}; - /** - ,* It is important here to return a reference to a Slice - ,* not to accidentally copy the associated buffer of the slice. - ,*/ - static Slice& findOneByType(std::vector> &slices, Slice::Type type) { - const auto sliceIt - = std::find_if(slices.begin(), slices.end(), - [&type](Slice const& s) { - return type == s.info.type; - }); - WITH_CRAZY_DEBUG - WITH_RANK - << "\t__ looking for " << type << "\n"; - if (sliceIt == slices.end()) - throw std::domain_error("Slice by type not found!"); - return *sliceIt; - } + static_assert(sizeof(Slice::Location) == 2 * sizeof(size_t), + "The Location packing is wrong in your compiler"); - /* - ,* Check if an info has - ,* - ,*/ - static std::vector*> hasRecycledReferencingToIt - ( std::vector> &slices - , Info const& info - ) { - std::vector*> result; + // measure the displacements in the struct + size_t j = 0; + MPI_Aint base_address, displacements[n]; + MPI_Get_address(&measure, &base_address); + MPI_Get_address(&measure.rank, &displacements[j++]); + MPI_Get_address(&measure.source, &displacements[j++]); + for (size_t i = 0; i < n; i++) + displacements[i] = MPI_Aint_diff(displacements[i], base_address); - for (auto& s: slices) - if ( s.info.recycling == info.type - && s.info.tuple == info.tuple - && s.info.state == Recycled - ) 
result.push_back(&s); + MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt); + MPI_Type_commit(&dt); + return dt; + } - return result; - } + static MPI_Datatype usizeDt() { return MPI_UINT64_T; } - static Slice& - findRecycledSource (std::vector> &slices, Slice::Info info) { - const auto sliceIt - = std::find_if(slices.begin(), slices.end(), - [&info](Slice const& s) { - return info.recycling == s.info.type - && info.tuple == s.info.tuple - && State::Recycled != s.info.state - ; - }); + static MPI_Datatype sliceInfo () { + constexpr int n = 5; + MPI_Datatype dt; + Slice::Info measure; + const std::vector lengths(n, 1); + const MPI_Datatype types[n] + = { vector(2, usizeDt()) + , vector(sizeof(enum Type), MPI_CHAR) + , vector(sizeof(enum State), MPI_CHAR) + , sliceLocation() + , vector(sizeof(enum Type), MPI_CHAR) + // TODO: Why this does not work on intel mpi? + /*, MPI_UINT64_T*/ + }; - WITH_CRAZY_DEBUG - WITH_RANK << "__slice__:find: recycling source of " - << pretty_print(info) << "\n"; - if (sliceIt == slices.end()) - throw std::domain_error( "Slice not found: " - + pretty_print(info) - + " rank: " - + pretty_print(Atrip::rank) - ); - WITH_RANK << "__slice__:find: " << pretty_print(sliceIt->info) << "\n"; - return *sliceIt; - } + static_assert(sizeof(enum Type) == 4, "Enum type not 4 bytes long"); + static_assert(sizeof(enum State) == 4, "Enum State not 4 bytes long"); + static_assert(sizeof(enum Name) == 4, "Enum Name not 4 bytes long"); - static Slice& findByTypeAbc - ( std::vector> &slices - , Slice::Type type - , ABCTuple const& abc - ) { - const auto tuple = Slice::subtupleBySlice(abc, type); - const auto sliceIt - = std::find_if(slices.begin(), slices.end(), - [&type, &tuple](Slice const& s) { - return type == s.info.type - && tuple == s.info.tuple - ; - }); - WITH_CRAZY_DEBUG - WITH_RANK << "__slice__:find:" << type << " and tuple " - << pretty_print(tuple) - << "\n"; - if (sliceIt == slices.end()) - throw std::domain_error( "Slice not 
found: " - + pretty_print(tuple) - + ", " - + pretty_print(type) - + " rank: " - + pretty_print(Atrip::rank) - ); - return *sliceIt; - } + // create the displacements from the info measurement struct + size_t j = 0; + MPI_Aint base_address, displacements[n]; + MPI_Get_address(&measure, &base_address); + MPI_Get_address(&measure.tuple[0], &displacements[j++]); + MPI_Get_address(&measure.type, &displacements[j++]); + MPI_Get_address(&measure.state, &displacements[j++]); + MPI_Get_address(&measure.from, &displacements[j++]); + MPI_Get_address(&measure.recycling, &displacements[j++]); + for (size_t i = 0; i < n; i++) + displacements[i] = MPI_Aint_diff(displacements[i], base_address); - static Slice& findByInfo(std::vector> &slices, - Slice::Info const& info) { - const auto sliceIt - = std::find_if(slices.begin(), slices.end(), - [&info](Slice const& s) { - // TODO: maybe implement comparison in Info struct - return info.type == s.info.type - && info.state == s.info.state - && info.tuple == s.info.tuple - && info.from.rank == s.info.from.rank - && info.from.source == s.info.from.source - ; - }); - WITH_CRAZY_DEBUG - WITH_RANK << "__slice__:find:looking for " << pretty_print(info) << "\n"; - if (sliceIt == slices.end()) - throw std::domain_error( "Slice by info not found: " - + pretty_print(info)); - return *sliceIt; - } + MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt); + MPI_Type_commit(&dt); + return dt; + } - // SLICE DEFINITION =================================================={{{1 + static MPI_Datatype localDatabaseElement () { + constexpr int n = 2; + MPI_Datatype dt; + LocalDatabaseElement measure; + const std::vector lengths(n, 1); + const MPI_Datatype types[n] + = { vector(sizeof(enum Name), MPI_CHAR) + , sliceInfo() + }; - // ATTRIBUTES ============================================================ + // measure the displacements in the struct + size_t j = 0; + MPI_Aint base_address, displacements[n]; + MPI_Get_address(&measure, 
&base_address); + MPI_Get_address(&measure.name, &displacements[j++]); + MPI_Get_address(&measure.info, &displacements[j++]); + for (size_t i = 0; i < n; i++) + displacements[i] = MPI_Aint_diff(displacements[i], base_address); + + static_assert( sizeof(LocalDatabaseElement) == sizeof(measure) + , "Measure has bad size"); + + MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt); + MPI_Type_commit(&dt); + return vector(sizeof(LocalDatabaseElement), MPI_CHAR); + // TODO: write tests in order to know if this works + return dt; + } + +}; +#+end_src + +*** Static utilities + +This section presents some functions which are useful to work with +slices and are inside the namespace created by the slice struct. + + +The function =subtupleBySlice= gives to every =Slice::Type= +its meaning in terms of the triples \( (a,b,c) \). + +Notice that since in general the relation +\( a < b < c \) holds (in our implementation), the case +of one-dimensional parametrizations =A=, =B= and =C= is well +defined. + +The function should only throw if there is an implementation +error where the =Slice::Type= enum has been expanded and this +function has not been updated accordingly. + +#+begin_src c++ :tangle (atrip-slice-h) +static +PartialTuple subtupleBySlice(ABCTuple abc, Type sliceType) { + switch (sliceType) { + case AB: return {abc[0], abc[1]}; + case BC: return {abc[1], abc[2]}; + case AC: return {abc[0], abc[2]}; + case CB: return {abc[2], abc[1]}; + case BA: return {abc[1], abc[0]}; + case CA: return {abc[2], abc[0]}; + case A: return {abc[0], 0}; + case B: return {abc[1], 0}; + case C: return {abc[2], 0}; + default: throw "Switch statement not exhaustive!"; + } +} +#+end_src + +In the context of cleaning up slices during the main loop, +it is important to check if a given slice has some slices +referencing to it in quality of recycled slices. 
+ +This function should therefore return a vector of pointers +of slices referencing to the given slice's info, when +the length of the vector is zero, then there are no dangling +links. + +#+begin_src c++ :tangle (atrip-slice-h) +static std::vector*> hasRecycledReferencingToIt + ( std::vector> &slices + , Info const& info + ) { + std::vector*> result; + + for (auto& s: slices) + if ( s.info.recycling == info.type + && s.info.tuple == info.tuple + && s.info.state == Recycled + ) result.push_back(&s); + + return result; +} +#+end_src + +The rest of the coming functions are utilities in order to find in a vector +of slices a given slice by reference. Mostly they are merely convenience +wrappers to the standard library function =std::find_if=. + +They are named as =find<...>=, where =<...>= represents some condition +and must always return a reference to the found slice, i.e., =Slice&=. +=Atrip= relies on these functions to find the sought for slices, +therefore these functions will throw a =std::domain_error= if the +given slice could not be found. 
+ +#+begin_src c++ :tangle (atrip-slice-h) +static Slice& findOneByType(std::vector> &slices, Slice::Type type) { + const auto sliceIt + = std::find_if(slices.begin(), slices.end(), + [&type](Slice const& s) { + return type == s.info.type; + }); + WITH_CRAZY_DEBUG + WITH_RANK + << "\t__ looking for " << type << "\n"; + if (sliceIt == slices.end()) + throw std::domain_error("Slice by type not found!"); + return *sliceIt; +} +#+end_src + +#+begin_src c++ :tangle (atrip-slice-h) +static Slice& +findRecycledSource (std::vector> &slices, Slice::Info info) { + const auto sliceIt + = std::find_if(slices.begin(), slices.end(), + [&info](Slice const& s) { + return info.recycling == s.info.type + && info.tuple == s.info.tuple + && State::Recycled != s.info.state + ; + }); + + WITH_CRAZY_DEBUG + WITH_RANK << "__slice__:find: recycling source of " + << pretty_print(info) << "\n"; + if (sliceIt == slices.end()) + throw std::domain_error( "Slice not found: " + + pretty_print(info) + + " rank: " + + pretty_print(Atrip::rank) + ); + WITH_RANK << "__slice__:find: " << pretty_print(sliceIt->info) << "\n"; + return *sliceIt; +} +#+end_src + +#+begin_src c++ :tangle (atrip-slice-h) +static Slice& findByTypeAbc + ( std::vector> &slices + , Slice::Type type + , ABCTuple const& abc + ) { + const auto tuple = Slice::subtupleBySlice(abc, type); + const auto sliceIt + = std::find_if(slices.begin(), slices.end(), + [&type, &tuple](Slice const& s) { + return type == s.info.type + && tuple == s.info.tuple + ; + }); + WITH_CRAZY_DEBUG + WITH_RANK << "__slice__:find:" << type << " and tuple " + << pretty_print(tuple) + << "\n"; + if (sliceIt == slices.end()) + throw std::domain_error( "Slice not found: " + + pretty_print(tuple) + + ", " + + pretty_print(type) + + " rank: " + + pretty_print(Atrip::rank) + ); + return *sliceIt; +} +#+end_src + +#+begin_src c++ :tangle (atrip-slice-h) +static Slice& findByInfo(std::vector> &slices, + Slice::Info const& info) { + const auto sliceIt + = 
std::find_if(slices.begin(), slices.end(), + [&info](Slice const& s) { + // TODO: maybe implement comparison in Info struct + return info.type == s.info.type + && info.state == s.info.state + && info.tuple == s.info.tuple + && info.from.rank == s.info.from.rank + && info.from.source == s.info.from.source + ; + }); + WITH_CRAZY_DEBUG + WITH_RANK << "__slice__:find:looking for " << pretty_print(info) << "\n"; + if (sliceIt == slices.end()) + throw std::domain_error( "Slice by info not found: " + + pretty_print(info)); + return *sliceIt; +} +#+end_src + +*** Attributes + +A slice object does not own data, it is just a container +or a pointer to data together with additional bookkeeping facilities. + +It includes an info structure with the information about the slice, +=Type=, =State= etc, which will be later communicated to other ranks. + +#+begin_src c++ :tangle (atrip-slice-h) Info info; - F *data; - MPI_Request request; - const size_t size; +#+end_src +A pointer to data is also necessary for the =Slice= but not necessary +to be communicated to other ranks. The =Slice= should never allocate +or deallocate itself the pointer. +#+begin_src c++ :tangle (atrip-slice-h) + F *data; +#+end_src + +An =MPI_Request= handle is also included so that the slices that are +to receive data through MPI can know which request they belong to. +#+begin_src c++ :tangle (atrip-slice-h) + MPI_Request request; +#+end_src + +For practical purposes in MPI calls, the number of elements in =data= is also included. +#+begin_src c++ :tangle (atrip-slice-h) + const size_t size; +#+end_src + +*** Member functions + +It is important to note that a ready slice should not be recycled from +any other slice, so that it can have access by itself to the data. 
+#+begin_src c++ :tangle (atrip-slice-h) void markReady() noexcept { info.state = Ready; info.recycling = Blank; } +#+end_src - /* - ,* This means that the data is there - ,*/ + +The following function asks whether +the slice has effectively been unwrapped or not, +i.e., whether or not the data are accessible and already +there. This can only happen in two ways, either +the slice is =Ready= or it is =SelfSufficient=, +i.e., the data pointed to was pre-distributed to the current node. +#+begin_src c++ :tangle (atrip-slice-h) bool isUnwrapped() const noexcept { return info.state == Ready || info.state == SelfSufficient ; } +#+end_src +The function =isUnwrappable= answers which slices can be unwrapped +potentially. Unwrapped slices can be unwrapped again idempotently. +Also =Recycled= slices can be unwrapped, i.e. the slices pointed to by them +will be unwrapped. +The only other possibility is that the slice has been dispatched +in the past and can be unwrapped. The case where the state +is =Dispatched= is the canonical intuitive case where a real process +of unwrapping, i.e. waiting for the data to get through the network, +is done. +#+begin_src c++ :tangle (atrip-slice-h) bool isUnwrappable() const noexcept { return isUnwrapped() || info.state == Recycled @@ -425,19 +621,20 @@ As an example, for the doubles amplitudes \( T^{ab}_{ij} \), one need two kinds ; } +#+end_src - /* - ,* This function answers the question, which slices can be recycled. - ,* - ,* A slice can only be recycled if it is Fetch or Ready and has - ,* a valid datapointer. - ,* - ,* In particular, SelfSufficient are not recyclable, since it is easier - ,* just to create a SelfSufficient slice than deal with data dependencies. - ,* - ,* Furthermore, a recycled slice is not recyclable, if this is the case - ,* then it is either bad design or a bug. - ,*/ +The function =isRecyclable= answers the question, which slices can be recycled.
+ +A slice can only be recycled if it is =Dispatched= or =Ready= and has +a valid datapointer. + +In particular, SelfSufficient are not recyclable, since it is easier +just to create a SelfSufficient slice than deal with data dependencies. + +Furthermore, a recycled slice is not recyclable, if this is the case +then it is either bad design or a bug. + +#+begin_src c++ :tangle (atrip-slice-h) inline bool isRecyclable() const noexcept { return ( info.state == Dispatched || info.state == Ready @@ -446,21 +643,38 @@ As an example, for the doubles amplitudes \( T^{ab}_{ij} \), one need two kinds && hasValidDataPointer() ; } +#+end_src - /* - ,* This function describes if a slice has a valid data pointer. - ,* - ,* This is important to know if the slice has some data to it, also - ,* some structural checks are done, so that it should not be Acceptor - ,* or Blank, if this is the case then it is a bug. - ,*/ + +The function =hasValidDataPointer= describes if a slice has a valid +data pointer. + +This is important to know if the slice has some data to it, also +some structural checks are done, so that it should not be =Acceptor= +or =Blank=, if this is the case then it is a bug. + +#+begin_src c++ :tangle (atrip-slice-h) inline bool hasValidDataPointer() const noexcept { return data != nullptr && info.state != Acceptor && info.type != Blank ; } +#+end_src + +The function +=unwrapAndMarkReady= +calls the low-level MPI functions +in order to wait whenever the state of the slice is correct. +The function should +- return if state is =Ready=, since then there is nothing to be done. +- throw if the state is not =Dispatched=, only a dispatched slice + can be unwrapped through MPI. +- throw if an MPI error happens.
+ + +#+begin_src c++ :tangle (atrip-slice-h) void unwrapAndMarkReady() { if (info.state == Ready) return; if (info.state != Dispatched) @@ -490,7 +704,10 @@ As an example, for the doubles amplitudes \( T^{ab}_{ij} \), one need two kinds << "\n"; #endif } +#+end_src +*** Epilog :noexport: +#+begin_src c++ :tangle (atrip-slice-h) Slice(size_t size_) : info({}) , data(nullptr) @@ -500,7 +717,11 @@ As an example, for the doubles amplitudes \( T^{ab}_{ij} \), one need two kinds }; // struct Slice +#+end_src +*** Debug :noexport: + +#+begin_src c++ :tangle (atrip-slice-h) template std::ostream& operator<<(std::ostream& out, typename Slice::Location const& v) { // TODO: remove me @@ -522,6 +743,9 @@ std::ostream& operator<<(std::ostream& out, typename Slice::Info const& i) { #+end_src ** Utils + +This section presents some utilities +*** Prolog :noexport: #+begin_src c++ :tangle (atrip-utils-h) #pragma once #include @@ -530,38 +754,64 @@ std::ostream& operator<<(std::ostream& out, typename Slice::Info const& i) { #include #include +#include namespace atrip { +#+end_src +*** Pretty printing +The pretty printing uses the [[https://github.com/sharkdp/dbg-macro][dbg-macro]] package. + +#+begin_src c++ :tangle (atrip-utils-h) template std::string pretty_print(T&& value) { std::stringstream stream; -#if ATRIP_DEBUG > 1 +#if ATRIP_DEBUG > 2 dbg::pretty_print(stream, std::forward(value)); #endif return stream.str(); } -#define WITH_CHRONO(__chrono, ...) 
\ - __chrono.start(); __VA_ARGS__ __chrono.stop(); +#+end_src - struct Timer { - using Clock = std::chrono::high_resolution_clock; - using Event = std::chrono::time_point; - std::chrono::duration duration; - Event _start; - inline void start() noexcept { _start = Clock::now(); } - inline void stop() noexcept { duration += Clock::now() - _start; } - inline void clear() noexcept { duration *= 0; } - inline double count() const noexcept { return duration.count(); } - }; - using Timings = std::map; -} +*** Chrono + +The chrono is just a simple wrapper for a high resolution clock +that can be found in the =std::chrono= namespace of the standard library. + +#+begin_src c++ :tangle (atrip-utils-h) +#define WITH_CHRONO(__chrono_name, ...) \ + Atrip::chrono[__chrono_name].start(); \ + __VA_ARGS__ \ + Atrip::chrono[__chrono_name].stop(); + +struct Timer { + using Clock = std::chrono::high_resolution_clock; + using Event = std::chrono::time_point; + std::chrono::duration duration; + Event _start; + inline void start() noexcept { _start = Clock::now(); } + inline void stop() noexcept { duration += Clock::now() - _start; } + inline void clear() noexcept { duration *= 0; } + inline double count() const noexcept { return duration.count(); } +}; +using Timings = std::map; #+end_src + +*** Epilog :noexport: +#+begin_src c++ :tangle (atrip-utils-h) +} +#+end_src + ** The rank mapping + +This section introduces the concept of rank mapping, +which defines how slices will be allocated to every +rank. 
+ #+begin_src c++ :tangle (atrip-rankmap-h) #pragma once @@ -569,24 +819,38 @@ namespace atrip { #include #include +#include namespace atrip { template struct RankMap { + static bool RANK_ROUND_ROBIN; std::vector const lengths; size_t const np, size; + ClusterInfo const clusterInfo; - RankMap(std::vector lens, size_t np_) + RankMap(std::vector lens, size_t np_, MPI_Comm comm) : lengths(lens) , np(np_) , size(std::accumulate(lengths.begin(), lengths.end(), 1UL, std::multiplies())) + , clusterInfo(getClusterInfo(comm)) { assert(lengths.size() <= 2); } size_t find(typename Slice::Location const& p) const noexcept { - return p.source * np + p.rank; + if (RANK_ROUND_ROBIN) { + return p.source * np + p.rank; + } else { + const size_t + rankPosition = p.source * clusterInfo.ranksPerNode + + clusterInfo.rankInfos[p.rank].localRank + ; + return rankPosition * clusterInfo.nNodes + + clusterInfo.rankInfos[p.rank].nodeId + ; + } } size_t nSources() const noexcept { @@ -606,8 +870,9 @@ namespace atrip { } typename Slice::Location - find(ABCTuple const& abc, typename Slice::Type sliceType) const noexcept { + find(ABCTuple const& abc, typename Slice::Type sliceType) const { // tuple = {11, 8} when abc = {11, 8, 9} and sliceType = AB + // tuple = {11, 0} when abc = {11, 8, 9} and sliceType = A const auto tuple = Slice::subtupleBySlice(abc, sliceType); const size_t index @@ -615,9 +880,51 @@ namespace atrip { + tuple[1] * (lengths.size() > 1 ? 
lengths[0] : 0) ; + size_t rank, source; + + if (RANK_ROUND_ROBIN) { + + rank = index % np; + source = index / np; + + } else { + + size_t const + + // the node that will be assigned to + nodeId = index % clusterInfo.nNodes + + // how many times it has been assigned to the node + , s_n = index / clusterInfo.nNodes + + // which local rank in the node should be + , localRank = s_n % clusterInfo.ranksPerNode + + // and the local source (how many times we chose this local rank) + , localSource = s_n / clusterInfo.ranksPerNode + ; + + // find the localRank-th entry in clusterInfo + auto const& it = + std::find_if(clusterInfo.rankInfos.begin(), + clusterInfo.rankInfos.end(), + [nodeId, localRank](RankInfo const& ri) { + return ri.nodeId == nodeId + && ri.localRank == localRank + ; + }); + if (it == clusterInfo.rankInfos.end()) { + throw "FATAL! Error in node distribution of the slices"; + } + + rank = (*it).globalRank; + source = localSource; + + } + return - { index % np - , index / np + { rank + , source }; } @@ -808,8 +1115,14 @@ namespace atrip { if (blank.info.state == Slice::SelfSufficient) { blank.data = sources[from.source].data(); } else { - if (freePointers.size() == 0) - throw std::domain_error("No more free pointers!"); + if (freePointers.size() == 0) { + std::stringstream stream; + stream << "No more free pointers " + << "for type " << type + << " and name " << name + ; + throw std::domain_error(stream.str()); + } auto dataPointer = freePointers.begin(); freePointers.erase(dataPointer); blank.data = *dataPointer; @@ -943,7 +1256,8 @@ namespace atrip { // at this point, let us blank the slice WITH_RANK << "~~~:cl(" << name << ")" << " freeing up slice " - // TODO: make this possible + // TODO: make this possible because of Templates + // TODO: there is a deduction error here // << " info " << slice.info << "\n"; slice.free(); @@ -963,7 +1277,7 @@ namespace atrip { , typename Slice::Name name_ , size_t nSliceBuffers = 4 ) - : rankMap(paramLength, np) + : 
rankMap(paramLength, np, global_world) , world(child_world) , universe(global_world) , sliceLength(sliceLength_) @@ -982,7 +1296,7 @@ namespace atrip { slices = std::vector>(2 * sliceTypes.size(), { sources[0].size() }); - // TODO: think exactly ^------------------- about this number + // TODO: think exactly ^------------------- about this number // initialize the freePointers with the pointers to the buffers std::transform(sliceBuffers.begin(), sliceBuffers.end(), @@ -1050,10 +1364,11 @@ namespace atrip { * \brief Send asynchronously only if the state is Fetch */ void send( size_t otherRank - , typename Slice::Info const& info + , typename Slice::LocalDatabaseElement const& el , size_t tag) const noexcept { MPI_Request request; bool sendData_p = false; + auto const& info = el.info; if (info.state == Slice::Fetch) sendData_p = true; // TODO: remove this because I have SelfSufficient @@ -1168,8 +1483,11 @@ namespace atrip { [&name](SliceUnion const* s) { return name == s->name; }); - if (sliceUnionIt == unions.end()) - throw std::domain_error("SliceUnion not found!"); + if (sliceUnionIt == unions.end()) { + std::stringstream stream; + stream << "SliceUnion(" << name << ") not found!"; + throw std::domain_error(stream.str()); + } return **sliceUnionIt; } @@ -1177,6 +1495,12 @@ namespace atrip { #+end_src ** Tuples + +This section introduces the types for tuples \( (a,b,c) \) +as well as their distribution to nodes and cores. 
+ + +*** Prolog :noexport: #+begin_src c++ :tangle (atrip-tuples-h) #pragma once @@ -1184,78 +1508,692 @@ namespace atrip { #include #include +// TODO: remove some +#include +#include +#include +#include +#include +#include +#include +#include + #include #include namespace atrip { +#+end_src - using ABCTuple = std::array; - using PartialTuple = std::array; - using ABCTuples = std::vector; +*** Tuples types - ABCTuples getTuplesList(size_t Nv) { - const size_t n = Nv * (Nv + 1) * (Nv + 2) / 6 - Nv; - ABCTuples result(n); - size_t u(0); +The main tuple types are simple type aliases for finite-size arrays. +A tuple is thus simply 3 natural numbers \( (a,b,c) \) +whereas a partial tuple is a two-element subset of these three. - for (size_t a(0); a < Nv; a++) - for (size_t b(a); b < Nv; b++) - for (size_t c(b); c < Nv; c++){ - if ( a == b && b == c ) continue; - result[u++] = {a, b, c}; - } +#+begin_src c++ :tangle (atrip-tuples-h) +using ABCTuple = std::array; +using PartialTuple = std::array; +using ABCTuples = std::vector; - return result; +constexpr ABCTuple FAKE_TUPLE = {0, 0, 0}; +constexpr ABCTuple INVALID_TUPLE = {1, 1, 1}; +#+end_src +*** Distributing the tuples + +In general it is our task to distribute all the tuples +\( (a,b,c) \) among the ranks. Every distribution should +make sure to allocate the same number of tuples to every rank, +padding the list with =FAKE_TUPLE= elements as necessary. + +The interface that we propose for this is simply + +#+begin_src c++ :tangle (atrip-tuples-h) +struct TuplesDistribution { + virtual ABCTuples getTuples(size_t Nv, MPI_Comm universe) = 0; + virtual bool tupleIsFake(ABCTuple const& t) { return t == FAKE_TUPLE; } +}; +#+end_src + + + +*** Node information + +- nodeList :: + List of hostnames of size \( N_n \) +- nodeInfos :: + List of (hostname, local rank Id) + of size \( N_p \), i.e., size of ranks + where local rank id goes from 0 to 48.
+ + + +=getNodeNames= gets the names of the nodes used, +i.e., the size of the resulting vector gives the +number of nodes. +#+begin_src c++ :tangle (atrip-tuples-h) +std::vector getNodeNames(MPI_Comm comm){ + int rank, np; + MPI_Comm_rank(comm, &rank); + MPI_Comm_size(comm, &np); + + std::vector nodeList(np); + char nodeName[MPI_MAX_PROCESSOR_NAME] + , nodeNames[np*MPI_MAX_PROCESSOR_NAME] + ; + std::vector nameLengths(np) + , off(np) + ; + int nameLength; + MPI_Get_processor_name(nodeName, &nameLength); + MPI_Allgather(&nameLength, + 1, + MPI_INT, + nameLengths.data(), + 1, + MPI_INT, + comm); + for (int i(1); i < np; i++) + off[i] = off[i-1] + nameLengths[i-1]; + MPI_Allgatherv(nodeName, + nameLengths[rank], + MPI_BYTE, + nodeNames, + nameLengths.data(), + off.data(), + MPI_BYTE, + comm); + for (int i(0); i < np; i++) { + std::string const s(&nodeNames[off[i]], nameLengths[i]); + nodeList[i] = s; } + return nodeList; +} +#+end_src +=getNodeInfos= +#+begin_src c++ :tangle (atrip-tuples-h) +struct RankInfo { + const std::string name; + const size_t nodeId; + const size_t globalRank; + const size_t localRank; + const size_t ranksPerNode; +}; - std::pair - getABCRange(size_t np, size_t rank, ABCTuples const& tuplesList) { - - std::vector n_tuples_per_rank(np, tuplesList.size()/np); - const size_t - // how many valid tuples should we still verteilen to nodes - // since the number of tuples is not divisible by the number of nodes - nRoundRobin = tuplesList.size() % np - // every node must have the sanme amount of tuples in order for the - // other nodes to receive and send somewhere, therefore - // some nodes will get extra tuples but that are dummy tuples - , nExtraInvalid = (np - nRoundRobin) % np - ; - - if (nRoundRobin) for (int i = 0; i < np; i++) n_tuples_per_rank[i]++; - - #if defined(TODO) - assert( tuplesList.size() - == - ( std::accumulate(n_tuples_per_rank.begin(), - n_tuples_per_rank.end(), - 0UL, - std::plus()) - + nExtraInvalid - )); - #endif - - 
WITH_RANK << "nRoundRobin = " << nRoundRobin << "\n"; - WITH_RANK << "nExtraInvalid = " << nExtraInvalid << "\n"; - WITH_RANK << "ntuples = " << n_tuples_per_rank[rank] << "\n"; - - auto const& it = n_tuples_per_rank.begin(); - - return - { std::accumulate(it, it + rank , 0) - , std::accumulate(it, it + rank + 1, 0) - }; +template +A unique(A const &xs) { + auto result = xs; + std::sort(std::begin(result), std::end(result)); + auto const& last = std::unique(std::begin(result), std::end(result)); + result.erase(last, std::end(result)); + return result; +} +std::vector +getNodeInfos(std::vector const& nodeNames) { + std::vector result; + auto const uniqueNames = unique(nodeNames); + auto const index = [&uniqueNames](std::string const& s) { + auto const& it = std::find(uniqueNames.begin(), uniqueNames.end(), s); + return std::distance(uniqueNames.begin(), it); + }; + std::vector localRanks(uniqueNames.size(), 0); + size_t globalRank = 0; + for (auto const& name: nodeNames) { + const size_t nodeId = index(name); + result.push_back({name, + nodeId, + globalRank++, + localRanks[nodeId]++, + std::count(nodeNames.begin(), + nodeNames.end(), + name) + }); } + return result; +} + +struct ClusterInfo { + const size_t nNodes, np, ranksPerNode; + const std::vector rankInfos; +}; + +ClusterInfo +getClusterInfo(MPI_Comm comm) { + auto const names = getNodeNames(comm); + auto const rankInfos = getNodeInfos(names); + + return ClusterInfo { + unique(names).size(), + names.size(), + rankInfos[0].ranksPerNode, + rankInfos + }; } #+end_src +*** Naive list + +The naive implementation of the global tuples list is simply +three for loops creating tuples of the sort +\( (a,b,c) \) where the following conditions are met at the same time: +- \( a \leq b \leq c \) +- \( + \neg (a = b \land b = c) + \) + +This means, +\( (1, 2, 3) + , (1, 1, 3) + , (1, 2, 2) +\) are acceptable tuples whereas \( (2, 1, 1) \) and \( (1, 1, 1) \) are not. 
+ + +#+begin_src c++ :tangle (atrip-tuples-h) +ABCTuples getTuplesList(size_t Nv, size_t rank, size_t np) { + + const size_t + // total number of tuples for the problem + n = Nv * (Nv + 1) * (Nv + 2) / 6 - Nv + + // all ranks should have the same number of tuples_per_rank + , tuples_per_rank = n / np + size_t(n % np != 0) + + // start index for the global tuples list + , start = tuples_per_rank * rank + + // end index for the global tuples list + , end = tuples_per_rank * (rank + 1) + ; + + LOG(1,"Atrip") << "tuples_per_rank = " << tuples_per_rank << "\n"; + WITH_RANK << "start, end = " << start << ", " << end << "\n"; + ABCTuples result(tuples_per_rank, FAKE_TUPLE); + + for (size_t a(0), r(0), g(0); a < Nv; a++) + for (size_t b(a); b < Nv; b++) + for (size_t c(b); c < Nv; c++){ + if ( a == b && b == c ) continue; + if ( start <= g && g < end) result[r++] = {a, b, c}; + g++; + } + + return result; + +} +#+end_src + +and all tuples would simply be + +#+begin_src c++ :tangle (atrip-tuples-h) +ABCTuples getAllTuplesList(const size_t Nv) { + const size_t n = Nv * (Nv + 1) * (Nv + 2) / 6 - Nv; + ABCTuples result(n); + + for (size_t a(0), u(0); a < Nv; a++) + for (size_t b(a); b < Nv; b++) + for (size_t c(b); c < Nv; c++){ + if ( a == b && b == c ) continue; + result[u++] = {a, b, c}; + } + + return result; +} +#+end_src + + +With =getTuplesList= we can easily define a tuple distribution like + +#+begin_src c++ :tangle (atrip-tuples-h) +struct NaiveDistribution : public TuplesDistribution { + ABCTuples getTuples(size_t Nv, MPI_Comm universe) override { + int rank, np; + MPI_Comm_rank(universe, &rank); + MPI_Comm_size(universe, &np); + return getTuplesList(Nv, (size_t)rank, (size_t)np); + } +}; +#+end_src + + +*** Group and sort list +**** Prolog :noexport: +#+begin_src c++ :tangle (atrip-tuples-h) +namespace group_and_sort { +#+end_src + +**** Utils + +#+begin_src c++ :tangle (atrip-tuples-h) + +// Provides the node on which the slice-element is found +// Right now we 
distribute the slices in a round robin fashion +// over the different nodes (NOTE: not mpi ranks but nodes) +inline +size_t isOnNode(size_t tuple, size_t nNodes) { return tuple % nNodes; } + + +// return the node (or all nodes) where the elements of this +// tuple are located +std::vector getTupleNodes(ABCTuple const& t, size_t nNodes) { + std::vector + nTuple = { isOnNode(t[0], nNodes) + , isOnNode(t[1], nNodes) + , isOnNode(t[2], nNodes) + }; + return unique(nTuple); +} + +struct Info { + size_t nNodes; + size_t nodeId; +}; + +#+end_src + +**** Distribution + +wording: home element = element which is located on the given node +1. we distribute the tuples such that each tuple has at least one 'home element' +2. we sort each tuple in a way that the 'home element' are the fastest indices +3. we sort the list of tuples on every node +4. we resort the tuples that for every tuple abc the following holds: a + container1d(nNodes) + , container2d(nNodes * nNodes) + , container3d(nNodes * nNodes * nNodes) + ; + + if (info.nodeId == 0) + std::cout << "\tGoing through all " + << allTuples.size() + << " tuples in " + << nNodes + << " nodes\n"; + + // build container-n-d's + for (auto const& t: allTuples) { + // one which node(s) are the tuple elements located... 
+ // put them into the right container + auto const _nodes = getTupleNodes(t, nNodes); + + switch (_nodes.size()) { + case 1: + container1d[_nodes[0]].push_back(t); + break; + case 2: + container2d[ _nodes[0] + + _nodes[1] * nNodes + ].push_back(t); + break; + case 3: + container3d[ _nodes[0] + + _nodes[1] * nNodes + + _nodes[2] * nNodes * nNodes + ].push_back(t); + break; + } + + } + + if (info.nodeId == 0) + std::cout << "\tBuilding 1-d containers\n"; + // DISTRIBUTE 1-d containers + // every tuple which is only located at one node belongs to this node + { + auto const& _tuples = container1d[info.nodeId]; + nodeTuples.resize(_tuples.size(), INVALID_TUPLE); + std::copy(_tuples.begin(), _tuples.end(), nodeTuples.begin()); + } + + if (info.nodeId == 0) + std::cout << "\tBuilding 2-d containers\n"; + // DISTRIBUTE 2-d containers + //the tuples which are located at two nodes are half/half given to these nodes + for (size_t yx = 0; yx < container2d.size(); yx++) { + + auto const& _tuples = container2d[yx]; + const + size_t idx = yx % nNodes + // remeber: yx = idy * nNodes + idx + , idy = yx / nNodes + , n_half = _tuples.size() / 2 + , size = nodeTuples.size() + ; + + size_t nbeg, nend; + if (info.nodeId == idx) { + nbeg = 0 * n_half; + nend = n_half; + } else if (info.nodeId == idy) { + nbeg = 1 * n_half; + nend = _tuples.size(); + } else { + // either idx or idy is my node + continue; + } + + size_t const nextra = nend - nbeg; + nodeTuples.resize(size + nextra, INVALID_TUPLE); + std::copy(_tuples.begin() + nbeg, + _tuples.begin() + nend, + nodeTuples.begin() + size); + + } + + if (info.nodeId == 0) + std::cout << "\tBuilding 3-d containers\n"; + // DISTRIBUTE 3-d containers + for (size_t zyx = 0; zyx < container3d.size(); zyx++) { + auto const& _tuples = container3d[zyx]; + + const + size_t idx = zyx % nNodes + , idy = (zyx / nNodes) % nNodes + // remember: zyx = idx + idy * nNodes + idz * nNodes^2 + , idz = zyx / nNodes / nNodes + , n_third = _tuples.size() / 3 + , 
size = nodeTuples.size() + ; + + size_t nbeg, nend; + if (info.nodeId == idx) { + nbeg = 0 * n_third; + nend = 1 * n_third; + } else if (info.nodeId == idy) { + nbeg = 1 * n_third; + nend = 2 * n_third; + } else if (info.nodeId == idz) { + nbeg = 2 * n_third; + nend = _tuples.size(); + } else { + // either idx or idy or idz is my node + continue; + } + + size_t const nextra = nend - nbeg; + nodeTuples.resize(size + nextra, INVALID_TUPLE); + std::copy(_tuples.begin() + nbeg, + _tuples.begin() + nend, + nodeTuples.begin() + size); + + } + + + if (info.nodeId == 0) std::cout << "\tswapping tuples...\n"; + /* + * sort part of group-and-sort algorithm + * every tuple on a given node is sorted in a way that + * the 'home elements' are the fastest index. + * 1:yyy 2:yyn(x) 3:yny(x) 4:ynn(x) 5:nyy 6:nyn(x) 7:nny 8:nnn + */ + for (auto &nt: nodeTuples){ + if ( isOnNode(nt[0], nNodes) == info.nodeId ){ // 1234 + if ( isOnNode(nt[2], nNodes) != info.nodeId ){ // 24 + size_t const x(nt[0]); + nt[0] = nt[2]; // switch first and last + nt[2] = x; + } + else if ( isOnNode(nt[1], nNodes) != info.nodeId){ // 3 + size_t const x(nt[0]); + nt[0] = nt[1]; // switch first two + nt[1] = x; + } + } else { + if ( isOnNode(nt[1], nNodes) == info.nodeId // 56 + && isOnNode(nt[2], nNodes) != info.nodeId + ) { // 6 + size_t const x(nt[1]); + nt[1] = nt[2]; // switch last two + nt[2] = x; + } + } + } + + if (info.nodeId == 0) std::cout << "\tsorting list of tuples...\n"; + //now we sort the list of tuples + std::sort(nodeTuples.begin(), nodeTuples.end()); + + if (info.nodeId == 0) std::cout << "\trestoring tuples...\n"; + // we bring the tuples abc back in the order a 1 + if (info.nodeId == 0) + std::cout << "checking for validity of " << nodeTuples.size() << std::endl; + const bool anyInvalid + = std::any_of(nodeTuples.begin(), + nodeTuples.end(), + [](ABCTuple const& t) { return t == INVALID_TUPLE; }); + if (anyInvalid) throw "Some tuple is invalid in group-and-sort algorithm"; +#endif + + if 
(info.nodeId == 0) std::cout << "\treturning tuples...\n"; + return nodeTuples; + +} +#+end_src + + +**** Main + +The main routine should return the list of tuples to be handled by the current rank. + +Let \( N_p \) be the number of ranks or processes. +Let \( N_n \) be the number of nodes or sockets. + +Then we have the following + +#+begin_example +Global rank | 0 1 2 3 4 5 6 7 8 +key | global rank +nodeId | 0 1 0 1 1 0 2 2 2 +Local rank | 0 0 1 1 2 2 0 1 2 +intra color | 0 1 0 1 1 0 2 2 2 +#+end_example + + + + + +#+begin_src c++ :tangle (atrip-tuples-h) +std::vector main(MPI_Comm universe, size_t Nv) { + + int rank, np; + MPI_Comm_rank(universe, &rank); + MPI_Comm_size(universe, &np); + + std::vector result; + + auto const nodeNames(getNodeNames(universe)); + size_t const nNodes = unique(nodeNames).size(); + auto const nodeInfos = getNodeInfos(nodeNames); + + // We want to construct a communicator which only contains of one + // element per node + bool const computeDistribution + = nodeInfos[rank].localRank == 0; + + std::vector + nodeTuples + = computeDistribution + ? specialDistribution(Info{nNodes, nodeInfos[rank].nodeId}, + getAllTuplesList(Nv)) + : std::vector() + ; + + LOG(1,"Atrip") << "got nodeTuples\n"; + + // now we have to send the data from **one** rank on each node + // to all others ranks of this node + const + int color = nodeInfos[rank].nodeId + , key = nodeInfos[rank].localRank + ; + + + MPI_Comm INTRA_COMM; + MPI_Comm_split(universe, color, key, &INTRA_COMM); +#+end_src + +Every node has to distribute **at least** +=nodeTuples.size() / nodeInfos[rank].ranksPerNode= +tuples among the ranks. + +We have to communicate this quantity among all nodes. 
+ +#+begin_src c++ :tangle (atrip-tuples-h) + + size_t const + tuplesPerRankLocal + = nodeTuples.size() / nodeInfos[rank].ranksPerNode + + size_t(nodeTuples.size() % nodeInfos[rank].ranksPerNode != 0) + ; + + size_t tuplesPerRankGlobal; + + MPI_Reduce(&tuplesPerRankLocal, + &tuplesPerRankGlobal, + 1, + MPI_UINT64_T, + MPI_MAX, + 0, + universe); + + MPI_Bcast(&tuplesPerRankGlobal, + 1, + MPI_UINT64_T, + 0, + universe); + + LOG(1,"Atrip") << "Tuples per rank: " << tuplesPerRankGlobal << "\n"; + LOG(1,"Atrip") << "ranks per node " << nodeInfos[rank].ranksPerNode << "\n"; + LOG(1,"Atrip") << "#nodes " << nNodes << "\n"; +#+end_src + +Now we have the tuples that every rank has to have, i.e., +=tuplesPerRankGlobal=. + +However before this, +the tuples in =nodeTuples= now have to be sent from the local rank +in every node to all the ranks in the given node, +and we have to make sure that every rank inside a given node +gets the same amount of tuples, in this case it should be +=tuplesPerRankLocal=, and in our node the total number +of tuples should be =tuplesPerRankLocal * nodeInfos[rank].ranksPerNode=, +however this might not be the case up to now due to divisibility issues. + +Up to now we have exactly =nodeTuples.size()= tuples, we have to make sure by +resizing that the condition above is met, i.e., so we can resize +and add some fake tuples at the end as padding. 
+ +#+begin_src c++ :tangle (atrip-tuples-h) +size_t const totalTuples + = tuplesPerRankGlobal * nodeInfos[rank].ranksPerNode; + +if (computeDistribution) { + // pad with FAKE_TUPLEs + nodeTuples.insert(nodeTuples.end(), + totalTuples - nodeTuples.size(), + FAKE_TUPLE); +} +#+end_src + +And now we can simply scatter the tuples in nodeTuples and send +=tuplesPerRankGlobal= to the different ranks in the node, + +#+begin_src c++ :tangle (atrip-tuples-h) +{ + // construct mpi type for abctuple + MPI_Datatype MPI_ABCTUPLE; + MPI_Type_vector(nodeTuples[0].size(), 1, 1, MPI_UINT64_T, &MPI_ABCTUPLE); + MPI_Type_commit(&MPI_ABCTUPLE); + + LOG(1,"Atrip") << "scattering tuples \n"; + + result.resize(tuplesPerRankGlobal); + MPI_Scatter(nodeTuples.data(), + tuplesPerRankGlobal, + MPI_ABCTUPLE, + result.data(), + tuplesPerRankGlobal, + MPI_ABCTUPLE, + 0, + INTRA_COMM); + + MPI_Type_free(&MPI_ABCTUPLE); + +} +#+end_src + + +The next step is sending the tuples in the local root rank +to the other ranks in the node, this we do with the MPI function +=MPI_Scatterv=. +Every rank gets =tuplesPerRankLocal= tuples and +the =nodeTuples= vector is now homogeneous and divisible by the number +of ranks per node in our node. +Therefore, the =displacements= are simply the vector +\begin{equation*} + \left\{ + k * \mathrm{tuplesPerNodeLocal} + \mid + k \in + \left\{ 0 + , \ldots + , \#\text{ranks in node} - 1 + \right\} + \right\} +\end{equation*} + +and the =sendCounts= vector is simply the constant vector +=tuplesPerRankLocal= of size =ranksPerNode=. 
+ +#+begin_src c++ :tangle (atrip-tuples-h) + + return result; + +} +#+end_src + +**** Interface + +The distribution interface will then simply be + +#+begin_src c++ :tangle (atrip-tuples-h) +struct Distribution : public TuplesDistribution { + ABCTuples getTuples(size_t Nv, MPI_Comm universe) override { + return main(universe, Nv); + } +}; +#+end_src + + +**** Epilog :noexport: +#+begin_src c++ :tangle (atrip-tuples-h) +} // namespace group_and_sort +#+end_src + + +*** Epilog :noexport: +#+begin_src c++ :tangle (atrip-tuples-h) +} +#+end_src + ** Unions -Since every tensor slice in a different way, we can override the slicing procedure -and define subclasses of slice unions. + +Every slice pertaining to every different tensor +is sliced differently. + #+begin_src c++ :tangle (atrip-unions-h) #pragma once @@ -1318,7 +2256,7 @@ namespace atrip { , child_world , global_world , Slice::TA - , 4) { + , 6) { init(sourceTensor); } @@ -1356,7 +2294,7 @@ namespace atrip { , child_world , global_world , Slice::VIJKA - , 4) { + , 6) { init(sourceTensor); } @@ -1675,10 +2613,8 @@ namespace atrip { , F const* TBChh // -- TIJK , F *Tijk - , atrip::Timings& chrono ) { - auto& t_reorder = chrono["doubles:reorder"]; const size_t a = abc[0], b = abc[1], c = abc[2] , NoNo = No*No, NoNv = No*Nv ; @@ -1686,13 +2622,13 @@ namespace atrip { #if defined(ATRIP_USE_DGEMM) #define _IJK_(i, j, k) i + j*No + k*NoNo #define REORDER(__II, __JJ, __KK) \ - t_reorder.start(); \ + WITH_CHRONO("doubles:reorder", \ for (size_t k = 0; k < No; k++) \ for (size_t j = 0; j < No; j++) \ for (size_t i = 0; i < No; i++) { \ Tijk[_IJK_(i, j, k)] += _t_buffer[_IJK_(__II, __JJ, __KK)]; \ } \ - t_reorder.stop(); + ) #define DGEMM_PARTICLES(__A, __B) \ atrip::xgemm( "T" \ , "N" \ @@ -1732,92 +2668,91 @@ namespace atrip { _t_buffer.reserve(NoNoNo); F one{1.0}, m_one{-1.0}, zero{0.0}; - t_reorder.start(); - for (size_t k = 0; k < NoNoNo; k++) { - // zero the Tijk - Tijk[k] = 0.0; - } - t_reorder.stop(); + 
WITH_CHRONO("double:reorder", + for (size_t k = 0; k < NoNoNo; k++) { + Tijk[k] = 0.0; + }) - chrono["doubles:holes"].start(); - { // Holes part ============================================================ + // TOMERGE: replace chronos + WITH_CHRONO("doubles:holes", + { // Holes part ======================================================== - std::vector _vhhh(NoNoNo); + std::vector _vhhh(NoNoNo); - // VhhhC[i + k*No + L*NoNo] * TABhh[L + j*No]; H1 - MAYBE_CONJ(_vhhh, VhhhC) - chrono["doubles:holes:1"].start(); - DGEMM_HOLES(_vhhh.data(), TABhh, "N") - REORDER(i, k, j) - chrono["doubles:holes:1"].stop(); - // VhhhC[j + k*No + L*NoNo] * TABhh[i + L*No]; H0 - chrono["doubles:holes:2"].start(); - DGEMM_HOLES(_vhhh.data(), TABhh, "T") - REORDER(j, k, i) - chrono["doubles:holes:2"].stop(); + // VhhhC[i + k*No + L*NoNo] * TABhh[L + j*No]; H1 + MAYBE_CONJ(_vhhh, VhhhC) + WITH_CHRONO("doubles:holes:1", + DGEMM_HOLES(_vhhh.data(), TABhh, "N") + REORDER(i, k, j) + ) + // VhhhC[j + k*No + L*NoNo] * TABhh[i + L*No]; H0 + WITH_CHRONO("doubles:holes:2", + DGEMM_HOLES(_vhhh.data(), TABhh, "T") + REORDER(j, k, i) + ) - // VhhhB[i + j*No + L*NoNo] * TAChh[L + k*No]; H5 - MAYBE_CONJ(_vhhh, VhhhB) - chrono["doubles:holes:3"].start(); - DGEMM_HOLES(_vhhh.data(), TAChh, "N") - REORDER(i, j, k) - chrono["doubles:holes:3"].stop(); - // VhhhB[k + j*No + L*NoNo] * TAChh[i + L*No]; H3 - chrono["doubles:holes:4"].start(); - DGEMM_HOLES(_vhhh.data(), TAChh, "T") - REORDER(k, j, i) - chrono["doubles:holes:4"].stop(); + // VhhhB[i + j*No + L*NoNo] * TAChh[L + k*No]; H5 + MAYBE_CONJ(_vhhh, VhhhB) + WITH_CHRONO("doubles:holes:3", + DGEMM_HOLES(_vhhh.data(), TAChh, "N") + REORDER(i, j, k) + ) + // VhhhB[k + j*No + L*NoNo] * TAChh[i + L*No]; H3 + WITH_CHRONO("doubles:holes:4", + DGEMM_HOLES(_vhhh.data(), TAChh, "T") + REORDER(k, j, i) + ) - // VhhhA[j + i*No + L*NoNo] * TBChh[L + k*No]; H1 - MAYBE_CONJ(_vhhh, VhhhA) - chrono["doubles:holes:5"].start(); - DGEMM_HOLES(_vhhh.data(), TBChh, "N") - 
REORDER(j, i, k) - chrono["doubles:holes:5"].stop(); - // VhhhA[k + i*No + L*NoNo] * TBChh[j + L*No]; H4 - chrono["doubles:holes:6"].start(); - DGEMM_HOLES(_vhhh.data(), TBChh, "T") - REORDER(k, i, j) - chrono["doubles:holes:6"].stop(); + // VhhhA[j + i*No + L*NoNo] * TBChh[L + k*No]; H1 + MAYBE_CONJ(_vhhh, VhhhA) + WITH_CHRONO("doubles:holes:5", + DGEMM_HOLES(_vhhh.data(), TBChh, "N") + REORDER(j, i, k) + ) + // VhhhA[k + i*No + L*NoNo] * TBChh[j + L*No]; H4 + WITH_CHRONO("doubles:holes:6", + DGEMM_HOLES(_vhhh.data(), TBChh, "T") + REORDER(k, i, j) + ) - } - chrono["doubles:holes"].stop(); + } + ) #undef MAYBE_CONJ - chrono["doubles:particles"].start(); - { // Particle part ========================================================= - // TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv]; P0 - chrono["doubles:particles:1"].start(); - DGEMM_PARTICLES(TAphh, VBCph) - REORDER(i, j, k) - chrono["doubles:particles:1"].stop(); - // TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv]; P3 - chrono["doubles:particles:2"].start(); - DGEMM_PARTICLES(TAphh, VCBph) - REORDER(i, k, j) - chrono["doubles:particles:2"].stop(); - // TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv]; P5 - chrono["doubles:particles:3"].start(); - DGEMM_PARTICLES(TCphh, VABph) - REORDER(k, i, j) - chrono["doubles:particles:3"].stop(); - // TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv]; P2 - chrono["doubles:particles:4"].start(); - DGEMM_PARTICLES(TCphh, VBAph) - REORDER(k, j, i) - chrono["doubles:particles:4"].stop(); - // TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv]; P1 - chrono["doubles:particles:5"].start(); - DGEMM_PARTICLES(TBphh, VACph) - REORDER(j, i, k) - chrono["doubles:particles:5"].stop(); - // TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv]; P4 - chrono["doubles:particles:6"].start(); - DGEMM_PARTICLES(TBphh, VCAph) - REORDER(j, k, i) - chrono["doubles:particles:6"].stop(); - } - chrono["doubles:particles"].stop(); + WITH_CHRONO("doubles:particles", + { // Particle part ===================================================== + 
// TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv]; P0 + WITH_CHRONO("doubles:particles:1", + DGEMM_PARTICLES(TAphh, VBCph) + REORDER(i, j, k) + ) + // TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv]; P3 + WITH_CHRONO("doubles:particles:2", + DGEMM_PARTICLES(TAphh, VCBph) + REORDER(i, k, j) + ) + // TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv]; P5 + WITH_CHRONO("doubles:particles:3", + DGEMM_PARTICLES(TCphh, VABph) + REORDER(k, i, j) + ) + // TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv]; P2 + WITH_CHRONO("doubles:particles:4", + DGEMM_PARTICLES(TCphh, VBAph) + REORDER(k, j, i) + ) + // TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv]; P1 + WITH_CHRONO("doubles:particles:5", + DGEMM_PARTICLES(TBphh, VACph) + REORDER(j, i, k) + ) + // TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv]; P4 + WITH_CHRONO("doubles:particles:6", + DGEMM_PARTICLES(TBphh, VCAph) + REORDER(j, k, i) + ) + } + ) #undef REORDER #undef DGEMM_HOLES @@ -1973,12 +2908,22 @@ namespace atrip { #include +#include + +#define ADD_ATTRIBUTE(_type, _name, _default) \ + _type _name = _default; \ + Input& with_ ## _name(_type i) { \ + _name = i; \ + return *this; \ + } + namespace atrip { struct Atrip { static int rank; static int np; + static Timings chrono; static void init(); template @@ -1991,9 +2936,6 @@ namespace atrip { , *Vhhhp = nullptr , *Vppph = nullptr ; - int maxIterations = 0, iterationMod = -1, percentageMod = -1; - bool barrier = false; - bool chrono = false; Input& with_epsilon_i(CTF::Tensor * t) { ei = t; return *this; } Input& with_epsilon_a(CTF::Tensor * t) { ea = t; return *this; } Input& with_Tai(CTF::Tensor * t) { Tph = t; return *this; } @@ -2001,11 +2943,20 @@ namespace atrip { Input& with_Vabij(CTF::Tensor * t) { Vpphh = t; return *this; } Input& with_Vijka(CTF::Tensor * t) { Vhhhp = t; return *this; } Input& with_Vabci(CTF::Tensor * t) { Vppph = t; return *this; } - Input& with_maxIterations(int i) { maxIterations = i; return *this; } - Input& with_iterationMod(int i) { iterationMod = i; return *this; } - 
Input& with_percentageMod(int i) { percentageMod = i; return *this; } - Input& with_barrier(bool i) { barrier = i; return *this; } - Input& with_chrono(bool i) { chrono = i; return *this; } + + enum TuplesDistribution { + NAIVE, + GROUP_AND_SORT, + }; + + ADD_ATTRIBUTE(bool, rankRoundRobin, false) + ADD_ATTRIBUTE(bool, chrono, false) + ADD_ATTRIBUTE(bool, barrier, false) + ADD_ATTRIBUTE(int, maxIterations, 0) + ADD_ATTRIBUTE(int, iterationMod, -1) + ADD_ATTRIBUTE(int, percentageMod, -1) + ADD_ATTRIBUTE(TuplesDistribution, tuplesDistribution, NAIVE) + }; struct Output { @@ -2031,8 +2982,11 @@ namespace atrip { using namespace atrip; +bool RankMap::RANK_ROUND_ROBIN; +bool RankMap::RANK_ROUND_ROBIN; int Atrip::rank; int Atrip::np; +Timings Atrip::chrono; // user printing block IterationDescriptor IterationDescription::descriptor; @@ -2052,28 +3006,35 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { const int rank = Atrip::rank; MPI_Comm universe = in.ei->wrld->comm; - // Timings in seconds ================================================{{{1 - Timings chrono{}; - const size_t No = in.ei->lens[0]; const size_t Nv = in.ea->lens[0]; LOG(0,"Atrip") << "No: " << No << "\n"; LOG(0,"Atrip") << "Nv: " << Nv << "\n"; + LOG(0,"Atrip") << "np: " << np << "\n"; // allocate the three scratches, see piecuch - std::vector Tijk(No*No*No) // doubles only (see piecuch) - , Zijk(No*No*No) // singles + doubles (see piecuch) - // we need local copies of the following tensors on every - // rank - , epsi(No) - , epsa(Nv) - , Tai(No * Nv) - ; + std::vector Tijk(No*No*No) // doubles only (see piecuch) + , Zijk(No*No*No) // singles + doubles (see piecuch) + // we need local copies of the following tensors on every + // rank + , epsi(No) + , epsa(Nv) + , Tai(No * Nv) + ; in.ei->read_all(epsi.data()); in.ea->read_all(epsa.data()); in.Tph->read_all(Tai.data()); + RankMap::RANK_ROUND_ROBIN = in.rankRoundRobin; + if (RankMap::RANK_ROUND_ROBIN) { + LOG(0,"Atrip") << "Doing rank round robin 
slices distribution" << "\n"; + } else { + LOG(0,"Atrip") + << "Doing node > local rank round robin slices distribution" << "\n"; + } + + // COMMUNICATOR CONSTRUCTION ========================================={{{1 // // Construct a new communicator living only on a single rank @@ -2094,41 +3055,49 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { } - chrono["nv-slices"].start(); // BUILD SLICES PARAMETRIZED BY NV ==================================={{{1 - LOG(0,"Atrip") << "BUILD NV-SLICES\n"; - TAPHH taphh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); - HHHA hhha(*in.Vhhhp, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); - chrono["nv-slices"].stop(); + WITH_CHRONO("nv-slices", + LOG(0,"Atrip") << "BUILD NV-SLICES\n"; + TAPHH taphh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); + HHHA hhha(*in.Vhhhp, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); + ) - chrono["nv-nv-slices"].start(); // BUILD SLICES PARAMETRIZED BY NV x NV =============================={{{1 - LOG(0,"Atrip") << "BUILD NV x NV-SLICES\n"; - ABPH abph(*in.Vppph, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); - ABHH abhh(*in.Vpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); - TABHH tabhh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); - chrono["nv-nv-slices"].stop(); + WITH_CHRONO("nv-nv-slices", + LOG(0,"Atrip") << "BUILD NV x NV-SLICES\n"; + ABPH abph(*in.Vppph, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); + ABHH abhh(*in.Vpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); + TABHH tabhh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); + ) // all tensors std::vector< SliceUnion* > unions = {&taphh, &hhha, &abph, &abhh, &tabhh}; - //CONSTRUCT TUPLE LIST ==============================================={{{1 - LOG(0,"Atrip") << "BUILD TUPLE LIST\n"; - const auto tuplesList = std::move(getTuplesList(Nv)); - WITH_RANK << "tupList.size() = " 
<< tuplesList.size() << "\n"; + // get tuples for the current rank + TuplesDistribution *distribution; - // GET ABC INDEX RANGE FOR RANK ======================================{{{1 - auto abcIndex = getABCRange(np, rank, tuplesList); - size_t nIterations = abcIndex.second - abcIndex.first; + if (in.tuplesDistribution == Atrip::Input::TuplesDistribution::NAIVE) { + LOG(0,"Atrip") << "Using the naive distribution\n"; + distribution = new NaiveDistribution(); + } else { + LOG(0,"Atrip") << "Using the group-and-sort distribution\n"; + distribution = new group_and_sort::Distribution(); + } - WITH_RANK << "abcIndex = " << pretty_print(abcIndex) << "\n"; - LOG(0,"Atrip") << "#iterations: " << nIterations << "\n"; + LOG(0,"Atrip") << "BUILDING TUPLE LIST\n"; + WITH_CHRONO("tuples:build", + auto const tuplesList = distribution->getTuples(Nv, universe); + ) + const size_t nIterations = tuplesList.size(); - // first abc - const ABCTuple firstAbc = tuplesList[abcIndex.first]; - - - double energy(0.); + { + const size_t _all_tuples = Nv * (Nv + 1) * (Nv + 2) / 6 - Nv; + LOG(0,"Atrip") << "#iterations: " + << nIterations + << "/" + << nIterations * np + << "\n"; + } const size_t iterationMod = (in.percentageMod > 0) @@ -2141,7 +3110,9 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { auto const isFakeTuple - = [&tuplesList](size_t const i) { return i >= tuplesList.size(); }; + = [&tuplesList, distribution](size_t const i) { + return distribution->tupleIsFake(tuplesList[i]); + }; using Database = typename Slice::Database; @@ -2149,45 +3120,42 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { auto communicateDatabase = [ &unions , np - , &chrono ] (ABCTuple const& abc, MPI_Comm const& c) -> Database { - chrono["db:comm:type:do"].start(); - auto MPI_LDB_ELEMENT = Slice::mpi::localDatabaseElement(); - chrono["db:comm:type:do"].stop(); + WITH_CHRONO("db:comm:type:do", + auto MPI_LDB_ELEMENT = Slice::mpi::localDatabaseElement(); + ) - chrono["db:comm:ldb"].start(); - 
LocalDatabase ldb; - - for (auto const& tensor: unions) { - auto const& tensorDb = tensor->buildLocalDatabase(abc); - ldb.insert(ldb.end(), tensorDb.begin(), tensorDb.end()); - } - chrono["db:comm:ldb"].stop(); + WITH_CHRONO("db:comm:ldb", + typename Slice::LocalDatabase ldb; + for (auto const& tensor: unions) { + auto const& tensorDb = tensor->buildLocalDatabase(abc); + ldb.insert(ldb.end(), tensorDb.begin(), tensorDb.end()); + } + ) Database db(np * ldb.size(), ldb[0]); - chrono["oneshot-db:comm:allgather"].start(); - chrono["db:comm:allgather"].start(); - MPI_Allgather( ldb.data() - , ldb.size() - , MPI_LDB_ELEMENT - , db.data() - , ldb.size() - , MPI_LDB_ELEMENT - , c); - chrono["db:comm:allgather"].stop(); - chrono["oneshot-db:comm:allgather"].stop(); + WITH_CHRONO("oneshot-db:comm:allgather", + WITH_CHRONO("db:comm:allgather", + MPI_Allgather( ldb.data() + , ldb.size() + , MPI_LDB_ELEMENT + , db.data() + , ldb.size() + , MPI_LDB_ELEMENT + , c); + )) - chrono["db:comm:type:free"].start(); - MPI_Type_free(&MPI_LDB_ELEMENT); - chrono["db:comm:type:free"].stop(); + WITH_CHRONO("db:comm:type:free", + MPI_Type_free(&MPI_LDB_ELEMENT); + ) return db; }; auto doIOPhase - = [&unions, &rank, &np, &universe, &chrono] (Database const& db) { + = [&unions, &rank, &np, &universe] (Database const& db) { const size_t localDBLength = db.size() / np; @@ -2223,9 +3191,9 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { << "\n" ; - chrono["db:io:recv"].start(); - u.receive(el.info, recvTag); - chrono["db:io:recv"].stop(); + WITH_CHRONO("db:io:recv", + u.receive(el.info, recvTag); + ) } // recv } @@ -2259,9 +3227,9 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { << "\n" ; - chrono["db:io:send"].start(); - u.send(otherRank, el.info, sendTag); - chrono["db:io:send"].stop(); + WITH_CHRONO("db:io:send", + u.send(otherRank, el, sendTag); + ) } // send phase @@ -2287,24 +3255,22 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { // START MAIN LOOP 
======================================================{{{1 - for ( size_t i = abcIndex.first, iteration = 1 - ; i < abcIndex.second + double energy(0.); + + for ( size_t i = 0, iteration = 1 + ; i < tuplesList.size() ; i++, iteration++ ) { - chrono["iterations"].start(); - + Atrip::chrono["iterations"].start(); // check overhead from chrono over all iterations - chrono["start:stop"].start(); chrono["start:stop"].stop(); + WITH_CHRONO("start:stop", {}) // check overhead of doing a barrier at the beginning - chrono["oneshot-mpi:barrier"].start(); - chrono["mpi:barrier"].start(); - // TODO: REMOVE - if (in.barrier == 1) - MPI_Barrier(universe); - chrono["mpi:barrier"].stop(); - chrono["oneshot-mpi:barrier"].stop(); + WITH_CHRONO("oneshot-mpi:barrier", + WITH_CHRONO("mpi:barrier", + if (in.barrier) MPI_Barrier(universe); + )) if (iteration % iterationMod == 0 || iteration == iteration1Percent) { @@ -2312,22 +3278,22 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { IterationDescription::descriptor({ iteration, nIterations, - chrono["iterations"].count() + Atrip::chrono["iterations"].count() }); } LOG(0,"Atrip") << "iteration " << iteration << " [" << 100 * iteration / nIterations << "%]" - << " (" << doublesFlops * iteration / chrono["doubles"].count() + << " (" << doublesFlops * iteration / Atrip::chrono["doubles"].count() << "GF)" - << " (" << doublesFlops * iteration / chrono["iterations"].count() + << " (" << doublesFlops * iteration / Atrip::chrono["iterations"].count() << "GF)" << " ===========================\n"; // PRINT TIMINGS if (in.chrono) - for (auto const& pair: chrono) + for (auto const& pair: Atrip::chrono) LOG(1, " ") << pair.first << " :: " << pair.second.count() << std::endl; @@ -2337,46 +3303,43 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { const ABCTuple abc = isFakeTuple(i) ? tuplesList[tuplesList.size() - 1] : tuplesList[i] - , *abcNext = i == (abcIndex.second - 1) + , *abcNext = i == (tuplesList.size() - 1) ? 
nullptr - : isFakeTuple(i + 1) - ? &tuplesList[tuplesList.size() - 1] : &tuplesList[i + 1] ; - chrono["with_rank"].start(); - WITH_RANK << " :it " << iteration - << " :abc " << pretty_print(abc) - << " :abcN " - << (abcNext ? pretty_print(*abcNext) : "None") - << "\n"; - chrono["with_rank"].stop(); + WITH_CHRONO("with_rank", + WITH_RANK << " :it " << iteration + << " :abc " << pretty_print(abc) + << " :abcN " + << (abcNext ? pretty_print(*abcNext) : "None") + << "\n"; + ) // COMM FIRST DATABASE ================================================{{{1 - if (i == abcIndex.first) { + if (i == 0) { WITH_RANK << "__first__:first database ............ \n"; - const auto __db = communicateDatabase(abc, universe); + const auto db = communicateDatabase(abc, universe); WITH_RANK << "__first__:first database communicated \n"; WITH_RANK << "__first__:first database io phase \n"; - doIOPhase(__db); + doIOPhase(db); WITH_RANK << "__first__:first database io phase DONE\n"; WITH_RANK << "__first__::::Unwrapping all slices for first database\n"; for (auto& u: unions) u->unwrapAll(abc); - WITH_RANK << "__first__::::Unwrapping all slices for first database DONE\n"; + WITH_RANK << "__first__::::Unwrapping slices for first database DONE\n"; MPI_Barrier(universe); } // COMM NEXT DATABASE ================================================={{{1 if (abcNext) { WITH_RANK << "__comm__:" << iteration << "th communicating database\n"; - chrono["db:comm"].start(); - //const auto db = communicateDatabase(*abcNext, universe); - Database db = communicateDatabase(*abcNext, universe); - chrono["db:comm"].stop(); - chrono["db:io"].start(); - doIOPhase(db); - chrono["db:io"].stop(); + WITH_CHRONO("db:comm", + const auto db = communicateDatabase(*abcNext, universe); + ) + WITH_CHRONO("db:io", + doIOPhase(db); + ) WITH_RANK << "__comm__:" << iteration << "th database io phase DONE\n"; } @@ -2384,63 +3347,61 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { OCD_Barrier(universe); if (!isFakeTuple(i)) { 
WITH_RANK << iteration << "-th doubles\n"; - WITH_CHRONO(chrono["oneshot-unwrap"], - WITH_CHRONO(chrono["unwrap"], - WITH_CHRONO(chrono["unwrap:doubles"], + WITH_CHRONO("oneshot-unwrap", + WITH_CHRONO("unwrap", + WITH_CHRONO("unwrap:doubles", for (auto& u: decltype(unions){&abph, &hhha, &taphh, &tabhh}) { u->unwrapAll(abc); } ))) - chrono["oneshot-doubles"].start(); - chrono["doubles"].start(); - doublesContribution( abc, (size_t)No, (size_t)Nv - // -- VABCI - , abph.unwrapSlice(Slice::AB, abc) - , abph.unwrapSlice(Slice::AC, abc) - , abph.unwrapSlice(Slice::BC, abc) - , abph.unwrapSlice(Slice::BA, abc) - , abph.unwrapSlice(Slice::CA, abc) - , abph.unwrapSlice(Slice::CB, abc) - // -- VHHHA - , hhha.unwrapSlice(Slice::A, abc) - , hhha.unwrapSlice(Slice::B, abc) - , hhha.unwrapSlice(Slice::C, abc) - // -- TA - , taphh.unwrapSlice(Slice::A, abc) - , taphh.unwrapSlice(Slice::B, abc) - , taphh.unwrapSlice(Slice::C, abc) - // -- TABIJ - , tabhh.unwrapSlice(Slice::AB, abc) - , tabhh.unwrapSlice(Slice::AC, abc) - , tabhh.unwrapSlice(Slice::BC, abc) - // -- TIJK - , Tijk.data() - , chrono - ); - WITH_RANK << iteration << "-th doubles done\n"; - chrono["doubles"].stop(); - chrono["oneshot-doubles"].stop(); + WITH_CHRONO("oneshot-doubles", + WITH_CHRONO("doubles", + doublesContribution( abc, (size_t)No, (size_t)Nv + // -- VABCI + , abph.unwrapSlice(Slice::AB, abc) + , abph.unwrapSlice(Slice::AC, abc) + , abph.unwrapSlice(Slice::BC, abc) + , abph.unwrapSlice(Slice::BA, abc) + , abph.unwrapSlice(Slice::CA, abc) + , abph.unwrapSlice(Slice::CB, abc) + // -- VHHHA + , hhha.unwrapSlice(Slice::A, abc) + , hhha.unwrapSlice(Slice::B, abc) + , hhha.unwrapSlice(Slice::C, abc) + // -- TA + , taphh.unwrapSlice(Slice::A, abc) + , taphh.unwrapSlice(Slice::B, abc) + , taphh.unwrapSlice(Slice::C, abc) + // -- TABIJ + , tabhh.unwrapSlice(Slice::AB, abc) + , tabhh.unwrapSlice(Slice::AC, abc) + , tabhh.unwrapSlice(Slice::BC, abc) + // -- TIJK + , Tijk.data() + ); + WITH_RANK << iteration << "-th 
doubles done\n"; + )) } // COMPUTE SINGLES =================================================== {{{1 OCD_Barrier(universe); if (!isFakeTuple(i)) { - WITH_CHRONO(chrono["oneshot-unwrap"], - WITH_CHRONO(chrono["unwrap"], - WITH_CHRONO(chrono["unwrap:singles"], + WITH_CHRONO("oneshot-unwrap", + WITH_CHRONO("unwrap", + WITH_CHRONO("unwrap:singles", abhh.unwrapAll(abc); ))) - chrono["reorder"].start(); - for (size_t I(0); I < Zijk.size(); I++) Zijk[I] = Tijk[I]; - chrono["reorder"].stop(); - chrono["singles"].start(); + WITH_CHRONO("reorder", + for (size_t I(0); I < Zijk.size(); I++) Zijk[I] = Tijk[I]; + ) + WITH_CHRONO("singles", singlesContribution( No, Nv, abc , Tai.data() , abhh.unwrapSlice(Slice::AB, abc) , abhh.unwrapSlice(Slice::AC, abc) , abhh.unwrapSlice(Slice::BC, abc) , Zijk.data()); - chrono["singles"].stop(); + ) } @@ -2453,12 +3414,12 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { if (abc[1] == abc[2]) distinct--; const F epsabc(epsa[abc[0]] + epsa[abc[1]] + epsa[abc[2]]); - chrono["energy"].start(); - if ( distinct == 0) - tupleEnergy = getEnergyDistinct(epsabc, epsi, Tijk, Zijk); - else - tupleEnergy = getEnergySame(epsabc, epsi, Tijk, Zijk); - chrono["energy"].stop(); + WITH_CHRONO("energy", + if ( distinct == 0) + tupleEnergy = getEnergyDistinct(epsabc, epsi, Tijk, Zijk); + else + tupleEnergy = getEnergySame(epsabc, epsi, Tijk, Zijk); + ) #if defined(HAVE_OCD) || defined(ATRIP_PRINT_TUPLES) tupleEnergies[abc] = tupleEnergy; @@ -2468,6 +3429,7 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { } + // TODO: remove this if (isFakeTuple(i)) { // fake iterations should also unwrap whatever they got WITH_RANK << iteration @@ -2489,7 +3451,6 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { // CLEANUP UNIONS ===================================================={{{1 OCD_Barrier(universe); if (abcNext) { - chrono["gc"].start(); WITH_RANK << "__gc__:" << iteration << "-th cleaning up.......\n"; for (auto& u: unions) { @@ -2523,12 +3484,11 @@ 
Atrip::Output Atrip::run(Atrip::Input const& in) { } - chrono["gc"].stop(); } WITH_RANK << iteration << "-th cleaning up....... DONE\n"; - chrono["iterations"].stop(); + Atrip::chrono["iterations"].stop(); // ITERATION END ====================================================={{{1 } @@ -2566,15 +3526,15 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { // PRINT TIMINGS {{{1 if (in.chrono) - for (auto const& pair: chrono) + for (auto const& pair: Atrip::chrono) LOG(0,"atrip:chrono") << pair.first << " " << pair.second.count() << std::endl; LOG(0, "atrip:flops(doubles)") - << nIterations * doublesFlops / chrono["doubles"].count() << "\n"; + << nIterations * doublesFlops / Atrip::chrono["doubles"].count() << "\n"; LOG(0, "atrip:flops(iterations)") - << nIterations * doublesFlops / chrono["iterations"].count() << "\n"; + << nIterations * doublesFlops / Atrip::chrono["iterations"].count() << "\n"; // TODO: change the sign in the getEnergy routines return { - globalEnergy }; @@ -2633,7 +3593,6 @@ template Atrip::Output Atrip::run(Atrip::Input const& in); # define DBG(...) 
dbg(__VA_ARGS__) #elif ATRIP_DEBUG == 2 # pragma message("WARNING: You have some debugging info for ABC triples") -# include # define OCD_Barrier(com) # define WITH_OCD if (false) # define WITH_ROOT if (atrip::Atrip::rank == 0) From bbbfb30c6f33bb842b209fe00e3706a71211983d Mon Sep 17 00:00:00 2001 From: Alejandro Gallo Date: Fri, 18 Feb 2022 12:54:59 +0100 Subject: [PATCH 21/22] Add tangled sources --- include/atrip/Atrip.hpp | 32 +- include/atrip/Debug.hpp | 1 - include/atrip/Equations.hpp | 190 +++++---- include/atrip/RankMap.hpp | 67 +++- include/atrip/Slice.hpp | 758 ++++++++++++++++++----------------- include/atrip/SliceUnion.hpp | 27 +- include/atrip/Tuples.hpp | 569 +++++++++++++++++++++++--- include/atrip/Unions.hpp | 4 +- include/atrip/Utils.hpp | 45 ++- src/atrip/Atrip.cxx | 332 +++++++-------- 10 files changed, 1298 insertions(+), 727 deletions(-) diff --git a/include/atrip/Atrip.hpp b/include/atrip/Atrip.hpp index 6f3859c..2a0f340 100644 --- a/include/atrip/Atrip.hpp +++ b/include/atrip/Atrip.hpp @@ -7,12 +7,22 @@ #include +#include + +#define ADD_ATTRIBUTE(_type, _name, _default) \ + _type _name = _default; \ + Input& with_ ## _name(_type i) { \ + _name = i; \ + return *this; \ + } + namespace atrip { struct Atrip { static int rank; static int np; + static Timings chrono; static void init(); template @@ -25,9 +35,6 @@ namespace atrip { , *Vhhhp = nullptr , *Vppph = nullptr ; - int maxIterations = 0, iterationMod = -1, percentageMod = -1; - bool barrier = false; - bool chrono = false; Input& with_epsilon_i(CTF::Tensor * t) { ei = t; return *this; } Input& with_epsilon_a(CTF::Tensor * t) { ea = t; return *this; } Input& with_Tai(CTF::Tensor * t) { Tph = t; return *this; } @@ -35,11 +42,20 @@ namespace atrip { Input& with_Vabij(CTF::Tensor * t) { Vpphh = t; return *this; } Input& with_Vijka(CTF::Tensor * t) { Vhhhp = t; return *this; } Input& with_Vabci(CTF::Tensor * t) { Vppph = t; return *this; } - Input& with_maxIterations(int i) { maxIterations = i; 
return *this; } - Input& with_iterationMod(int i) { iterationMod = i; return *this; } - Input& with_percentageMod(int i) { percentageMod = i; return *this; } - Input& with_barrier(bool i) { barrier = i; return *this; } - Input& with_chrono(bool i) { chrono = i; return *this; } + + enum TuplesDistribution { + NAIVE, + GROUP_AND_SORT, + }; + + ADD_ATTRIBUTE(bool, rankRoundRobin, false) + ADD_ATTRIBUTE(bool, chrono, false) + ADD_ATTRIBUTE(bool, barrier, false) + ADD_ATTRIBUTE(int, maxIterations, 0) + ADD_ATTRIBUTE(int, iterationMod, -1) + ADD_ATTRIBUTE(int, percentageMod, -1) + ADD_ATTRIBUTE(TuplesDistribution, tuplesDistribution, NAIVE) + }; struct Output { diff --git a/include/atrip/Debug.hpp b/include/atrip/Debug.hpp index 4347824..e567d5c 100644 --- a/include/atrip/Debug.hpp +++ b/include/atrip/Debug.hpp @@ -41,7 +41,6 @@ # define DBG(...) dbg(__VA_ARGS__) #elif ATRIP_DEBUG == 2 # pragma message("WARNING: You have some debugging info for ABC triples") -# include # define OCD_Barrier(com) # define WITH_OCD if (false) # define WITH_ROOT if (atrip::Atrip::rank == 0) diff --git a/include/atrip/Equations.hpp b/include/atrip/Equations.hpp index 2b90736..e907592 100644 --- a/include/atrip/Equations.hpp +++ b/include/atrip/Equations.hpp @@ -40,12 +40,12 @@ namespace atrip { , X(Zijk_[j + No*k + No*No*i]) , Y(Zijk_[k + No*i + No*No*j]) , Z(Zijk_[k + No*j + No*No*i]) - , A(std::conj(Tijk_[i + No*j + No*No*k])) - , B(std::conj(Tijk_[i + No*k + No*No*j])) - , C(std::conj(Tijk_[j + No*i + No*No*k])) - , D(std::conj(Tijk_[j + No*k + No*No*i])) - , E(std::conj(Tijk_[k + No*i + No*No*j])) - , F(std::conj(Tijk_[k + No*j + No*No*i])) + , A(maybeConjugate(Tijk_[i + No*j + No*No*k])) + , B(maybeConjugate(Tijk_[i + No*k + No*No*j])) + , C(maybeConjugate(Tijk_[j + No*i + No*No*k])) + , D(maybeConjugate(Tijk_[j + No*k + No*No*i])) + , E(maybeConjugate(Tijk_[k + No*i + No*No*j])) + , F(maybeConjugate(Tijk_[k + No*j + No*No*i])) , value = 3.0 * ( A * U + B * V @@ -102,9 +102,9 @@ 
namespace atrip { , U(Zijk_[i + No*j + No*No*k]) , V(Zijk_[j + No*k + No*No*i]) , W(Zijk_[k + No*i + No*No*j]) - , A(std::conj(Tijk_[i + No*j + No*No*k])) - , B(std::conj(Tijk_[j + No*k + No*No*i])) - , C(std::conj(Tijk_[k + No*i + No*No*j])) + , A(maybeConjugate(Tijk_[i + No*j + No*No*k])) + , B(maybeConjugate(Tijk_[j + No*k + No*No*i])) + , C(maybeConjugate(Tijk_[k + No*i + No*No*j])) , value = F(3.0) * ( A * U + B * V @@ -172,10 +172,8 @@ namespace atrip { , F const* TBChh // -- TIJK , F *Tijk - , atrip::Timings& chrono ) { - auto& t_reorder = chrono["doubles:reorder"]; const size_t a = abc[0], b = abc[1], c = abc[2] , NoNo = No*No, NoNv = No*Nv ; @@ -183,13 +181,13 @@ namespace atrip { #if defined(ATRIP_USE_DGEMM) #define _IJK_(i, j, k) i + j*No + k*NoNo #define REORDER(__II, __JJ, __KK) \ - t_reorder.start(); \ + WITH_CHRONO("doubles:reorder", \ for (size_t k = 0; k < No; k++) \ for (size_t j = 0; j < No; j++) \ for (size_t i = 0; i < No; i++) { \ Tijk[_IJK_(i, j, k)] += _t_buffer[_IJK_(__II, __JJ, __KK)]; \ } \ - t_reorder.stop(); + ) #define DGEMM_PARTICLES(__A, __B) \ atrip::xgemm( "T" \ , "N" \ @@ -220,106 +218,100 @@ namespace atrip { , _t_buffer.data() \ , (int const*)&NoNo \ ); - #define MAYBE_CONJ(_conj, _buffer) \ - if (traits::isComplex()) { \ - for (size_t __i = 0; __i < NoNoNo; ++__i) \ - _conj[__i] = std::conj(_buffer[__i]); \ - } else { \ - for (size_t __i = 0; __i < NoNoNo; ++__i) \ - _conj[__i] = _buffer[__i]; \ - } + #define MAYBE_CONJ(_conj, _buffer) \ + for (size_t __i = 0; __i < NoNoNo; ++__i) \ + _conj[__i] = maybeConjugate(_buffer[__i]); \ const size_t NoNoNo = No*NoNo; std::vector _t_buffer; _t_buffer.reserve(NoNoNo); F one{1.0}, m_one{-1.0}, zero{0.0}; - t_reorder.start(); - for (size_t k = 0; k < NoNoNo; k++) { - // zero the Tijk - Tijk[k] = 0.0; - } - t_reorder.stop(); + WITH_CHRONO("double:reorder", + for (size_t k = 0; k < NoNoNo; k++) { + Tijk[k] = 0.0; + }) - chrono["doubles:holes"].start(); - { // Holes part 
============================================================ + // TOMERGE: replace chronos + WITH_CHRONO("doubles:holes", + { // Holes part ======================================================== - std::vector _vhhh(NoNoNo); + std::vector _vhhh(NoNoNo); - // VhhhC[i + k*No + L*NoNo] * TABhh[L + j*No]; H1 - MAYBE_CONJ(_vhhh, VhhhC) - chrono["doubles:holes:1"].start(); - DGEMM_HOLES(_vhhh.data(), TABhh, "N") - REORDER(i, k, j) - chrono["doubles:holes:1"].stop(); - // VhhhC[j + k*No + L*NoNo] * TABhh[i + L*No]; H0 - chrono["doubles:holes:2"].start(); - DGEMM_HOLES(_vhhh.data(), TABhh, "T") - REORDER(j, k, i) - chrono["doubles:holes:2"].stop(); + // VhhhC[i + k*No + L*NoNo] * TABhh[L + j*No]; H1 + MAYBE_CONJ(_vhhh, VhhhC) + WITH_CHRONO("doubles:holes:1", + DGEMM_HOLES(_vhhh.data(), TABhh, "N") + REORDER(i, k, j) + ) + // VhhhC[j + k*No + L*NoNo] * TABhh[i + L*No]; H0 + WITH_CHRONO("doubles:holes:2", + DGEMM_HOLES(_vhhh.data(), TABhh, "T") + REORDER(j, k, i) + ) - // VhhhB[i + j*No + L*NoNo] * TAChh[L + k*No]; H5 - MAYBE_CONJ(_vhhh, VhhhB) - chrono["doubles:holes:3"].start(); - DGEMM_HOLES(_vhhh.data(), TAChh, "N") - REORDER(i, j, k) - chrono["doubles:holes:3"].stop(); - // VhhhB[k + j*No + L*NoNo] * TAChh[i + L*No]; H3 - chrono["doubles:holes:4"].start(); - DGEMM_HOLES(_vhhh.data(), TAChh, "T") - REORDER(k, j, i) - chrono["doubles:holes:4"].stop(); + // VhhhB[i + j*No + L*NoNo] * TAChh[L + k*No]; H5 + MAYBE_CONJ(_vhhh, VhhhB) + WITH_CHRONO("doubles:holes:3", + DGEMM_HOLES(_vhhh.data(), TAChh, "N") + REORDER(i, j, k) + ) + // VhhhB[k + j*No + L*NoNo] * TAChh[i + L*No]; H3 + WITH_CHRONO("doubles:holes:4", + DGEMM_HOLES(_vhhh.data(), TAChh, "T") + REORDER(k, j, i) + ) - // VhhhA[j + i*No + L*NoNo] * TBChh[L + k*No]; H1 - MAYBE_CONJ(_vhhh, VhhhA) - chrono["doubles:holes:5"].start(); - DGEMM_HOLES(_vhhh.data(), TBChh, "N") - REORDER(j, i, k) - chrono["doubles:holes:5"].stop(); - // VhhhA[k + i*No + L*NoNo] * TBChh[j + L*No]; H4 - chrono["doubles:holes:6"].start(); - 
DGEMM_HOLES(_vhhh.data(), TBChh, "T") - REORDER(k, i, j) - chrono["doubles:holes:6"].stop(); + // VhhhA[j + i*No + L*NoNo] * TBChh[L + k*No]; H1 + MAYBE_CONJ(_vhhh, VhhhA) + WITH_CHRONO("doubles:holes:5", + DGEMM_HOLES(_vhhh.data(), TBChh, "N") + REORDER(j, i, k) + ) + // VhhhA[k + i*No + L*NoNo] * TBChh[j + L*No]; H4 + WITH_CHRONO("doubles:holes:6", + DGEMM_HOLES(_vhhh.data(), TBChh, "T") + REORDER(k, i, j) + ) - } - chrono["doubles:holes"].stop(); + } + ) #undef MAYBE_CONJ - chrono["doubles:particles"].start(); - { // Particle part ========================================================= - // TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv]; P0 - chrono["doubles:particles:1"].start(); - DGEMM_PARTICLES(TAphh, VBCph) - REORDER(i, j, k) - chrono["doubles:particles:1"].stop(); - // TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv]; P3 - chrono["doubles:particles:2"].start(); - DGEMM_PARTICLES(TAphh, VCBph) - REORDER(i, k, j) - chrono["doubles:particles:2"].stop(); - // TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv]; P5 - chrono["doubles:particles:3"].start(); - DGEMM_PARTICLES(TCphh, VABph) - REORDER(k, i, j) - chrono["doubles:particles:3"].stop(); - // TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv]; P2 - chrono["doubles:particles:4"].start(); - DGEMM_PARTICLES(TCphh, VBAph) - REORDER(k, j, i) - chrono["doubles:particles:4"].stop(); - // TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv]; P1 - chrono["doubles:particles:5"].start(); - DGEMM_PARTICLES(TBphh, VACph) - REORDER(j, i, k) - chrono["doubles:particles:5"].stop(); - // TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv]; P4 - chrono["doubles:particles:6"].start(); - DGEMM_PARTICLES(TBphh, VCAph) - REORDER(j, k, i) - chrono["doubles:particles:6"].stop(); - } - chrono["doubles:particles"].stop(); + WITH_CHRONO("doubles:particles", + { // Particle part ===================================================== + // TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv]; P0 + WITH_CHRONO("doubles:particles:1", + DGEMM_PARTICLES(TAphh, VBCph) + REORDER(i, j, k) + ) + 
// TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv]; P3 + WITH_CHRONO("doubles:particles:2", + DGEMM_PARTICLES(TAphh, VCBph) + REORDER(i, k, j) + ) + // TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv]; P5 + WITH_CHRONO("doubles:particles:3", + DGEMM_PARTICLES(TCphh, VABph) + REORDER(k, i, j) + ) + // TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv]; P2 + WITH_CHRONO("doubles:particles:4", + DGEMM_PARTICLES(TCphh, VBAph) + REORDER(k, j, i) + ) + // TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv]; P1 + WITH_CHRONO("doubles:particles:5", + DGEMM_PARTICLES(TBphh, VACph) + REORDER(j, i, k) + ) + // TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv]; P4 + WITH_CHRONO("doubles:particles:6", + DGEMM_PARTICLES(TBphh, VCAph) + REORDER(j, k, i) + ) + } + ) #undef REORDER #undef DGEMM_HOLES diff --git a/include/atrip/RankMap.hpp b/include/atrip/RankMap.hpp index 8564f9e..0e31a61 100644 --- a/include/atrip/RankMap.hpp +++ b/include/atrip/RankMap.hpp @@ -5,24 +5,38 @@ #include #include +#include namespace atrip { template struct RankMap { + static bool RANK_ROUND_ROBIN; std::vector const lengths; size_t const np, size; + ClusterInfo const clusterInfo; - RankMap(std::vector lens, size_t np_) + RankMap(std::vector lens, size_t np_, MPI_Comm comm) : lengths(lens) , np(np_) , size(std::accumulate(lengths.begin(), lengths.end(), 1UL, std::multiplies())) + , clusterInfo(getClusterInfo(comm)) { assert(lengths.size() <= 2); } size_t find(typename Slice::Location const& p) const noexcept { - return p.source * np + p.rank; + if (RANK_ROUND_ROBIN) { + return p.source * np + p.rank; + } else { + const size_t + rankPosition = p.source * clusterInfo.ranksPerNode + + clusterInfo.rankInfos[p.rank].localRank + ; + return rankPosition * clusterInfo.nNodes + + clusterInfo.rankInfos[p.rank].nodeId + ; + } } size_t nSources() const noexcept { @@ -42,8 +56,9 @@ namespace atrip { } typename Slice::Location - find(ABCTuple const& abc, typename Slice::Type sliceType) const noexcept { + find(ABCTuple const& abc, typename Slice::Type 
sliceType) const { // tuple = {11, 8} when abc = {11, 8, 9} and sliceType = AB + // tuple = {11, 0} when abc = {11, 8, 9} and sliceType = A const auto tuple = Slice::subtupleBySlice(abc, sliceType); const size_t index @@ -51,9 +66,51 @@ namespace atrip { + tuple[1] * (lengths.size() > 1 ? lengths[0] : 0) ; + size_t rank, source; + + if (RANK_ROUND_ROBIN) { + + rank = index % np; + source = index / np; + + } else { + + size_t const + + // the node that will be assigned to + nodeId = index % clusterInfo.nNodes + + // how many times it has been assigned to the node + , s_n = index / clusterInfo.nNodes + + // which local rank in the node should be + , localRank = s_n % clusterInfo.ranksPerNode + + // and the local source (how many times we chose this local rank) + , localSource = s_n / clusterInfo.ranksPerNode + ; + + // find the localRank-th entry in clusterInfo + auto const& it = + std::find_if(clusterInfo.rankInfos.begin(), + clusterInfo.rankInfos.end(), + [nodeId, localRank](RankInfo const& ri) { + return ri.nodeId == nodeId + && ri.localRank == localRank + ; + }); + if (it == clusterInfo.rankInfos.end()) { + throw "FATAL! 
Error in node distribution of the slices"; + } + + rank = (*it).globalRank; + source = localSource; + + } + return - { index % np - , index / np + { rank + , source }; } diff --git a/include/atrip/Slice.hpp b/include/atrip/Slice.hpp index 877d72a..1f5889e 100644 --- a/include/atrip/Slice.hpp +++ b/include/atrip/Slice.hpp @@ -1,4 +1,4 @@ -// [[file:~/cc4s/src/atrip/complex/atrip.org::*The%20slice][The slice:1]] +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Prolog][Prolog:1]] #pragma once #include #include @@ -11,6 +11,9 @@ namespace atrip { +template FF maybeConjugate(const FF a) { return a; } +template <> Complex maybeConjugate(const Complex a) { return std::conj(a); } + namespace traits { template bool isComplex() { return false; }; template <> bool isComplex() { return true; }; @@ -24,401 +27,409 @@ namespace mpi { template struct Slice { -// The slice:1 ends here +// Prolog:1 ends here -// [[file:~/cc4s/src/atrip/complex/atrip.org::*The%20slice][The slice:2]] -// ASSOCIATED TYPES %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Location][Location:1]] +struct Location { size_t rank; size_t source; }; +// Location:1 ends here - struct Location { size_t rank; size_t source; }; - - enum Type - { A = 10 - , B - , C - // Two-parameter slices - , AB = 20 - , BC - , AC - // for abci and the doubles - , CB - , BA - , CA - // The non-typed slice - , Blank = 404 - }; - - enum State { - // Fetch represents the state where a slice is to be fetched - // and has a valid data pointer that can be written to - Fetch = 0, - // Dispatches represents the state that an MPI call has been - // dispatched in order to get the data, but the data has not been - // yet unwrapped, the data might be there or we might have to wait. 
- Dispatched = 2, - // Ready means that the data pointer can be read from - Ready = 1, - // Self sufficient is a slice when its contents are located - // in the same rank that it lives, so that it does not have to - // fetch from no one else. - SelfSufficient = 911, - // Recycled means that this slice gets its data pointer from another - // slice, so it should not be written to - Recycled = 123, - // Acceptor means that the Slice can accept a new Slice, it is - // the counterpart of the Blank type, but for states - Acceptor = 405 +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Type][Type:1]] +enum Type + { A = 10 + , B + , C + // Two-parameter slices + , AB = 20 + , BC + , AC + // for abci and the doubles + , CB + , BA + , CA + // The non-typed slice + , Blank = 404 }; +// Type:1 ends here - struct Info { - // which part of a,b,c the slice holds - PartialTuple tuple; - // The type of slice for the user to retrieve the correct one - Type type; - // What is the state of the slice - State state; - // Where the slice is to be retrieved - // NOTE: this can actually be computed from tuple - Location from; - // If the data are actually to be found in this other slice - Type recycling; +// [[file:~/cc4s/src/atrip/complex/atrip.org::*State][State:1]] +enum State { + Fetch = 0, + Dispatched = 2, + Ready = 1, + SelfSufficient = 911, + Recycled = 123, + Acceptor = 405 +}; +// State:1 ends here - Info() : tuple{0,0} - , type{Blank} - , state{Acceptor} - , from{0,0} - , recycling{Blank} - {} +// [[file:~/cc4s/src/atrip/complex/atrip.org::*The%20Info%20structure][The Info structure:1]] +struct Info { + // which part of a,b,c the slice holds + PartialTuple tuple; + // The type of slice for the user to retrieve the correct one + Type type; + // What is the state of the slice + State state; + // Where the slice is to be retrieved + Location from; + // If the data are actually to be found in this other slice + Type recycling; + + Info() : tuple{0,0} + , type{Blank} + , 
state{Acceptor} + , from{0,0} + , recycling{Blank} + {} +}; + +using Ty_x_Tu = std::pair< Type, PartialTuple >; +// The Info structure:1 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Name][Name:1]] +enum Name + { TA = 100 + , VIJKA = 101 + , VABCI = 200 + , TABIJ = 201 + , VABIJ = 202 }; +// Name:1 ends here - using Ty_x_Tu = std::pair< Type, PartialTuple >; +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Database][Database:1]] +struct LocalDatabaseElement { + Slice::Name name; + Slice::Info info; +}; +// Database:1 ends here - // Names of the integrals that are considered in CCSD(T) - enum Name - { TA = 100 - , VIJKA = 101 - , VABCI = 200 - , TABIJ = 201 - , VABIJ = 202 - }; +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Database][Database:2]] +using LocalDatabase = std::vector; +using Database = LocalDatabase; +// Database:2 ends here - // DATABASE ==========================================================={{{1 - struct LocalDatabaseElement { - Slice::Name name; - Slice::Info info; - }; - using LocalDatabase = std::vector; - using Database = LocalDatabase; +// [[file:~/cc4s/src/atrip/complex/atrip.org::*MPI%20Types][MPI Types:1]] +struct mpi { - - // STATIC METHODS =========================================================== - // - // They are useful to organize the structure of slices - - struct mpi { - - static MPI_Datatype vector(size_t n, MPI_Datatype const& DT) { - MPI_Datatype dt; - MPI_Type_vector(n, 1, 1, DT, &dt); - MPI_Type_commit(&dt); - return dt; - } - - static MPI_Datatype sliceLocation () { - constexpr int n = 2; - // create a sliceLocation to measure in the current architecture - // the packing of the struct - Slice::Location measure; - MPI_Datatype dt; - const std::vector lengths(n, 1); - const MPI_Datatype types[n] = {usizeDt(), usizeDt()}; - - // measure the displacements in the struct - size_t j = 0; - MPI_Aint displacements[n]; - MPI_Get_address(&measure.rank, &displacements[j++]); - MPI_Get_address(&measure.source, 
&displacements[j++]); - for (size_t i = 1; i < n; i++) displacements[i] -= displacements[0]; - displacements[0] = 0; - - MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt); - MPI_Type_commit(&dt); - return dt; - } - - static MPI_Datatype enumDt() { return MPI_INT; } - static MPI_Datatype usizeDt() { return MPI_UINT64_T; } - - static MPI_Datatype sliceInfo () { - constexpr int n = 5; - MPI_Datatype dt; - Slice::Info measure; - const std::vector lengths(n, 1); - const MPI_Datatype types[n] - = { vector(2, usizeDt()) - , enumDt() - , enumDt() - , sliceLocation() - , enumDt() - }; - - // create the displacements from the info measurement struct - size_t j = 0; - MPI_Aint displacements[n]; - MPI_Get_address(measure.tuple.data(), &displacements[j++]); - MPI_Get_address(&measure.type, &displacements[j++]); - MPI_Get_address(&measure.state, &displacements[j++]); - MPI_Get_address(&measure.from, &displacements[j++]); - MPI_Get_address(&measure.recycling, &displacements[j++]); - for (size_t i = 1; i < n; i++) displacements[i] -= displacements[0]; - displacements[0] = 0; - - MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt); - MPI_Type_commit(&dt); - return dt; - } - - static MPI_Datatype localDatabaseElement () { - constexpr int n = 2; - MPI_Datatype dt; - LocalDatabaseElement measure; - const std::vector lengths(n, 1); - const MPI_Datatype types[n] - = { enumDt() - , sliceInfo() - }; - - // measure the displacements in the struct - size_t j = 0; - MPI_Aint displacements[n]; - MPI_Get_address(&measure.name, &displacements[j++]); - MPI_Get_address(&measure.info, &displacements[j++]); - for (size_t i = 1; i < n; i++) displacements[i] -= displacements[0]; - displacements[0] = 0; - - MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt); - MPI_Type_commit(&dt); - return dt; - } - - }; - - static - PartialTuple subtupleBySlice(ABCTuple abc, Type sliceType) { - switch (sliceType) { - case AB: return {abc[0], abc[1]}; - case BC: 
return {abc[1], abc[2]}; - case AC: return {abc[0], abc[2]}; - case CB: return {abc[2], abc[1]}; - case BA: return {abc[1], abc[0]}; - case CA: return {abc[2], abc[0]}; - case A: return {abc[0], 0}; - case B: return {abc[1], 0}; - case C: return {abc[2], 0}; - default: throw "Switch statement not exhaustive!"; - } + static MPI_Datatype vector(size_t n, MPI_Datatype const& DT) { + MPI_Datatype dt; + MPI_Type_vector(n, 1, 1, DT, &dt); + MPI_Type_commit(&dt); + return dt; } + static MPI_Datatype sliceLocation () { + constexpr int n = 2; + // create a sliceLocation to measure in the current architecture + // the packing of the struct + Slice::Location measure; + MPI_Datatype dt; + const std::vector lengths(n, 1); + const MPI_Datatype types[n] = {usizeDt(), usizeDt()}; - /** - * It is important here to return a reference to a Slice - * not to accidentally copy the associated buffer of the slice. - */ - static Slice& findOneByType(std::vector> &slices, Slice::Type type) { - const auto sliceIt - = std::find_if(slices.begin(), slices.end(), - [&type](Slice const& s) { - return type == s.info.type; - }); - WITH_CRAZY_DEBUG - WITH_RANK - << "\t__ looking for " << type << "\n"; - if (sliceIt == slices.end()) - throw std::domain_error("Slice by type not found!"); - return *sliceIt; - } + static_assert(sizeof(Slice::Location) == 2 * sizeof(size_t), + "The Location packing is wrong in your compiler"); - /* - * Check if an info has - * - */ - static std::vector*> hasRecycledReferencingToIt - ( std::vector> &slices - , Info const& info - ) { - std::vector*> result; + // measure the displacements in the struct + size_t j = 0; + MPI_Aint base_address, displacements[n]; + MPI_Get_address(&measure, &base_address); + MPI_Get_address(&measure.rank, &displacements[j++]); + MPI_Get_address(&measure.source, &displacements[j++]); + for (size_t i = 0; i < n; i++) + displacements[i] = MPI_Aint_diff(displacements[i], base_address); - for (auto& s: slices) - if ( s.info.recycling == info.type - 
&& s.info.tuple == info.tuple - && s.info.state == Recycled - ) result.push_back(&s); + MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt); + MPI_Type_commit(&dt); + return dt; + } - return result; - } + static MPI_Datatype usizeDt() { return MPI_UINT64_T; } - static Slice& - findRecycledSource (std::vector> &slices, Slice::Info info) { - const auto sliceIt - = std::find_if(slices.begin(), slices.end(), - [&info](Slice const& s) { - return info.recycling == s.info.type - && info.tuple == s.info.tuple - && State::Recycled != s.info.state - ; - }); + static MPI_Datatype sliceInfo () { + constexpr int n = 5; + MPI_Datatype dt; + Slice::Info measure; + const std::vector lengths(n, 1); + const MPI_Datatype types[n] + = { vector(2, usizeDt()) + , vector(sizeof(enum Type), MPI_CHAR) + , vector(sizeof(enum State), MPI_CHAR) + , sliceLocation() + , vector(sizeof(enum Type), MPI_CHAR) + // TODO: Why this does not work on intel mpi? + /*, MPI_UINT64_T*/ + }; - WITH_CRAZY_DEBUG - WITH_RANK << "__slice__:find: recycling source of " - << pretty_print(info) << "\n"; - if (sliceIt == slices.end()) - throw std::domain_error( "Slice not found: " - + pretty_print(info) - + " rank: " - + pretty_print(Atrip::rank) - ); - WITH_RANK << "__slice__:find: " << pretty_print(sliceIt->info) << "\n"; - return *sliceIt; - } + static_assert(sizeof(enum Type) == 4, "Enum type not 4 bytes long"); + static_assert(sizeof(enum State) == 4, "Enum State not 4 bytes long"); + static_assert(sizeof(enum Name) == 4, "Enum Name not 4 bytes long"); - static Slice& findByTypeAbc - ( std::vector> &slices - , Slice::Type type - , ABCTuple const& abc - ) { - const auto tuple = Slice::subtupleBySlice(abc, type); - const auto sliceIt - = std::find_if(slices.begin(), slices.end(), - [&type, &tuple](Slice const& s) { - return type == s.info.type - && tuple == s.info.tuple - ; - }); - WITH_CRAZY_DEBUG - WITH_RANK << "__slice__:find:" << type << " and tuple " - << pretty_print(tuple) - << "\n"; - if 
(sliceIt == slices.end()) - throw std::domain_error( "Slice not found: " - + pretty_print(tuple) - + ", " - + pretty_print(type) - + " rank: " - + pretty_print(Atrip::rank) - ); - return *sliceIt; - } + // create the displacements from the info measurement struct + size_t j = 0; + MPI_Aint base_address, displacements[n]; + MPI_Get_address(&measure, &base_address); + MPI_Get_address(&measure.tuple[0], &displacements[j++]); + MPI_Get_address(&measure.type, &displacements[j++]); + MPI_Get_address(&measure.state, &displacements[j++]); + MPI_Get_address(&measure.from, &displacements[j++]); + MPI_Get_address(&measure.recycling, &displacements[j++]); + for (size_t i = 0; i < n; i++) + displacements[i] = MPI_Aint_diff(displacements[i], base_address); - static Slice& findByInfo(std::vector> &slices, - Slice::Info const& info) { - const auto sliceIt - = std::find_if(slices.begin(), slices.end(), - [&info](Slice const& s) { - // TODO: maybe implement comparison in Info struct - return info.type == s.info.type - && info.state == s.info.state - && info.tuple == s.info.tuple - && info.from.rank == s.info.from.rank - && info.from.source == s.info.from.source - ; - }); - WITH_CRAZY_DEBUG - WITH_RANK << "__slice__:find:looking for " << pretty_print(info) << "\n"; - if (sliceIt == slices.end()) - throw std::domain_error( "Slice by info not found: " - + pretty_print(info)); - return *sliceIt; - } + MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt); + MPI_Type_commit(&dt); + return dt; + } - // SLICE DEFINITION =================================================={{{1 + static MPI_Datatype localDatabaseElement () { + constexpr int n = 2; + MPI_Datatype dt; + LocalDatabaseElement measure; + const std::vector lengths(n, 1); + const MPI_Datatype types[n] + = { vector(sizeof(enum Name), MPI_CHAR) + , sliceInfo() + }; - // ATTRIBUTES ============================================================ - Info info; - F *data; - MPI_Request request; - const size_t size; + // measure 
the displacements in the struct + size_t j = 0; + MPI_Aint base_address, displacements[n]; + MPI_Get_address(&measure, &base_address); + MPI_Get_address(&measure.name, &displacements[j++]); + MPI_Get_address(&measure.info, &displacements[j++]); + for (size_t i = 0; i < n; i++) + displacements[i] = MPI_Aint_diff(displacements[i], base_address); - void markReady() noexcept { - info.state = Ready; - info.recycling = Blank; - } + static_assert( sizeof(LocalDatabaseElement) == sizeof(measure) + , "Measure has bad size"); - /* - * This means that the data is there - */ - bool isUnwrapped() const noexcept { - return info.state == Ready - || info.state == SelfSufficient - ; - } + MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt); + MPI_Type_commit(&dt); + return vector(sizeof(LocalDatabaseElement), MPI_CHAR); + // TODO: write tests in order to know if this works + return dt; + } - bool isUnwrappable() const noexcept { - return isUnwrapped() - || info.state == Recycled - || info.state == Dispatched - ; - } +}; +// MPI Types:1 ends here - inline bool isDirectlyFetchable() const noexcept { - return info.state == Ready || info.state == Dispatched; - } +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Static%20utilities][Static utilities:1]] +static +PartialTuple subtupleBySlice(ABCTuple abc, Type sliceType) { + switch (sliceType) { + case AB: return {abc[0], abc[1]}; + case BC: return {abc[1], abc[2]}; + case AC: return {abc[0], abc[2]}; + case CB: return {abc[2], abc[1]}; + case BA: return {abc[1], abc[0]}; + case CA: return {abc[2], abc[0]}; + case A: return {abc[0], 0}; + case B: return {abc[1], 0}; + case C: return {abc[2], 0}; + default: throw "Switch statement not exhaustive!"; + } +} +// Static utilities:1 ends here - void free() noexcept { - info.tuple = {0, 0}; - info.type = Blank; - info.state = Acceptor; - info.from = {0, 0}; - info.recycling = Blank; - data = nullptr; - } +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Static%20utilities][Static 
utilities:2]] +static std::vector*> hasRecycledReferencingToIt + ( std::vector> &slices + , Info const& info + ) { + std::vector*> result; - inline bool isFree() const noexcept { - return info.tuple == PartialTuple{0, 0} - && info.type == Blank - && info.state == Acceptor - && info.from.rank == 0 - && info.from.source == 0 - && info.recycling == Blank - && data == nullptr - ; - } + for (auto& s: slices) + if ( s.info.recycling == info.type + && s.info.tuple == info.tuple + && s.info.state == Recycled + ) result.push_back(&s); + return result; +} +// Static utilities:2 ends here - /* - * This function answers the question, which slices can be recycled. - * - * A slice can only be recycled if it is Fetch or Ready and has - * a valid datapointer. - * - * In particular, SelfSufficient are not recyclable, since it is easier - * just to create a SelfSufficient slice than deal with data dependencies. - * - * Furthermore, a recycled slice is not recyclable, if this is the case - * then it is either bad design or a bug. - */ - inline bool isRecyclable() const noexcept { - return ( info.state == Dispatched - || info.state == Ready - || info.state == Fetch - ) - && hasValidDataPointer() - ; - } +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Static%20utilities][Static utilities:3]] +static Slice& findOneByType(std::vector> &slices, Slice::Type type) { + const auto sliceIt + = std::find_if(slices.begin(), slices.end(), + [&type](Slice const& s) { + return type == s.info.type; + }); + WITH_CRAZY_DEBUG + WITH_RANK + << "\t__ looking for " << type << "\n"; + if (sliceIt == slices.end()) + throw std::domain_error("Slice by type not found!"); + return *sliceIt; +} +// Static utilities:3 ends here - /* - * This function describes if a slice has a valid data pointer. - * - * This is important to know if the slice has some data to it, also - * some structural checks are done, so that it should not be Acceptor - * or Blank, if this is the case then it is a bug. 
- */ - inline bool hasValidDataPointer() const noexcept { - return data != nullptr - && info.state != Acceptor - && info.type != Blank - ; - } +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Static%20utilities][Static utilities:4]] +static Slice& +findRecycledSource (std::vector> &slices, Slice::Info info) { + const auto sliceIt + = std::find_if(slices.begin(), slices.end(), + [&info](Slice const& s) { + return info.recycling == s.info.type + && info.tuple == s.info.tuple + && State::Recycled != s.info.state + ; + }); - void unwrapAndMarkReady() { + WITH_CRAZY_DEBUG + WITH_RANK << "__slice__:find: recycling source of " + << pretty_print(info) << "\n"; + if (sliceIt == slices.end()) + throw std::domain_error( "Slice not found: " + + pretty_print(info) + + " rank: " + + pretty_print(Atrip::rank) + ); + WITH_RANK << "__slice__:find: " << pretty_print(sliceIt->info) << "\n"; + return *sliceIt; +} +// Static utilities:4 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Static%20utilities][Static utilities:5]] +static Slice& findByTypeAbc + ( std::vector> &slices + , Slice::Type type + , ABCTuple const& abc + ) { + const auto tuple = Slice::subtupleBySlice(abc, type); + const auto sliceIt + = std::find_if(slices.begin(), slices.end(), + [&type, &tuple](Slice const& s) { + return type == s.info.type + && tuple == s.info.tuple + ; + }); + WITH_CRAZY_DEBUG + WITH_RANK << "__slice__:find:" << type << " and tuple " + << pretty_print(tuple) + << "\n"; + if (sliceIt == slices.end()) + throw std::domain_error( "Slice not found: " + + pretty_print(tuple) + + ", " + + pretty_print(type) + + " rank: " + + pretty_print(Atrip::rank) + ); + return *sliceIt; +} +// Static utilities:5 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Static%20utilities][Static utilities:6]] +static Slice& findByInfo(std::vector> &slices, + Slice::Info const& info) { + const auto sliceIt + = std::find_if(slices.begin(), slices.end(), + [&info](Slice const& s) { + // TODO: maybe 
implement comparison in Info struct + return info.type == s.info.type + && info.state == s.info.state + && info.tuple == s.info.tuple + && info.from.rank == s.info.from.rank + && info.from.source == s.info.from.source + ; + }); + WITH_CRAZY_DEBUG + WITH_RANK << "__slice__:find:looking for " << pretty_print(info) << "\n"; + if (sliceIt == slices.end()) + throw std::domain_error( "Slice by info not found: " + + pretty_print(info)); + return *sliceIt; +} +// Static utilities:6 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Attributes][Attributes:1]] +Info info; +// Attributes:1 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Attributes][Attributes:2]] +F *data; +// Attributes:2 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Attributes][Attributes:3]] +MPI_Request request; +// Attributes:3 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Attributes][Attributes:4]] +const size_t size; +// Attributes:4 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Member%20functions][Member functions:1]] +void markReady() noexcept { + info.state = Ready; + info.recycling = Blank; +} +// Member functions:1 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Member%20functions][Member functions:2]] +bool isUnwrapped() const noexcept { + return info.state == Ready + || info.state == SelfSufficient + ; +} +// Member functions:2 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Member%20functions][Member functions:3]] +bool isUnwrappable() const noexcept { + return isUnwrapped() + || info.state == Recycled + || info.state == Dispatched + ; +} + +inline bool isDirectlyFetchable() const noexcept { + return info.state == Ready || info.state == Dispatched; +} + +void free() noexcept { + info.tuple = {0, 0}; + info.type = Blank; + info.state = Acceptor; + info.from = {0, 0}; + info.recycling = Blank; + data = nullptr; +} + +inline bool isFree() const noexcept { + return info.tuple == PartialTuple{0, 0} + && info.type 
== Blank + && info.state == Acceptor + && info.from.rank == 0 + && info.from.source == 0 + && info.recycling == Blank + && data == nullptr + ; +} +// Member functions:3 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Member%20functions][Member functions:4]] +inline bool isRecyclable() const noexcept { + return ( info.state == Dispatched + || info.state == Ready + || info.state == Fetch + ) + && hasValidDataPointer() + ; +} +// Member functions:4 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Member%20functions][Member functions:5]] +inline bool hasValidDataPointer() const noexcept { + return data != nullptr + && info.state != Acceptor + && info.type != Blank + ; +} +// Member functions:5 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Member%20functions][Member functions:6]] +void unwrapAndMarkReady() { if (info.state == Ready) return; if (info.state != Dispatched) throw @@ -447,17 +458,20 @@ struct Slice { << "\n"; #endif } +// Member functions:6 ends here - Slice(size_t size_) - : info({}) - , data(nullptr) - , size(size_) - {} +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Epilog][Epilog:1]] +Slice(size_t size_) + : info({}) + , data(nullptr) + , size(size_) + {} - }; // struct Slice - +}; // struct Slice +// Epilog:1 ends here +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Debug][Debug:1]] template std::ostream& operator<<(std::ostream& out, typename Slice::Location const& v) { // TODO: remove me @@ -476,4 +490,4 @@ std::ostream& operator<<(std::ostream& out, typename Slice::Info const& i) { } } // namespace atrip -// The slice:2 ends here +// Debug:1 ends here diff --git a/include/atrip/SliceUnion.hpp b/include/atrip/SliceUnion.hpp index ec7aff6..365ad51 100644 --- a/include/atrip/SliceUnion.hpp +++ b/include/atrip/SliceUnion.hpp @@ -179,8 +179,14 @@ namespace atrip { if (blank.info.state == Slice::SelfSufficient) { blank.data = sources[from.source].data(); } else { - if (freePointers.size() == 0) - throw 
std::domain_error("No more free pointers!"); + if (freePointers.size() == 0) { + std::stringstream stream; + stream << "No more free pointers " + << "for type " << type + << " and name " << name + ; + throw std::domain_error(stream.str()); + } auto dataPointer = freePointers.begin(); freePointers.erase(dataPointer); blank.data = *dataPointer; @@ -314,7 +320,8 @@ namespace atrip { // at this point, let us blank the slice WITH_RANK << "~~~:cl(" << name << ")" << " freeing up slice " - // TODO: make this possible + // TODO: make this possible because of Templates + // TODO: there is a deduction error here // << " info " << slice.info << "\n"; slice.free(); @@ -334,7 +341,7 @@ namespace atrip { , typename Slice::Name name_ , size_t nSliceBuffers = 4 ) - : rankMap(paramLength, np) + : rankMap(paramLength, np, global_world) , world(child_world) , universe(global_world) , sliceLength(sliceLength_) @@ -353,7 +360,7 @@ namespace atrip { slices = std::vector>(2 * sliceTypes.size(), { sources[0].size() }); - // TODO: think exactly ^------------------- about this number + // TODO: think exactly ^------------------- about this number // initialize the freePointers with the pointers to the buffers std::transform(sliceBuffers.begin(), sliceBuffers.end(), @@ -421,10 +428,11 @@ namespace atrip { * \brief Send asynchronously only if the state is Fetch */ void send( size_t otherRank - , typename Slice::Info const& info + , typename Slice::LocalDatabaseElement const& el , size_t tag) const noexcept { MPI_Request request; bool sendData_p = false; + auto const& info = el.info; if (info.state == Slice::Fetch) sendData_p = true; // TODO: remove this because I have SelfSufficient @@ -539,8 +547,11 @@ namespace atrip { [&name](SliceUnion const* s) { return name == s->name; }); - if (sliceUnionIt == unions.end()) - throw std::domain_error("SliceUnion not found!"); + if (sliceUnionIt == unions.end()) { + std::stringstream stream; + stream << "SliceUnion(" << name << ") not found!"; + throw 
std::domain_error(stream.str()); + } return **sliceUnionIt; } diff --git a/include/atrip/Tuples.hpp b/include/atrip/Tuples.hpp index 5d4b69f..c41b78a 100644 --- a/include/atrip/Tuples.hpp +++ b/include/atrip/Tuples.hpp @@ -1,75 +1,538 @@ -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Tuples][Tuples:1]] +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Prolog][Prolog:1]] #pragma once #include #include #include +// TODO: remove some +#include +#include +#include +#include +#include +#include +#include +#include + #include #include namespace atrip { +// Prolog:1 ends here - using ABCTuple = std::array; - using PartialTuple = std::array; - using ABCTuples = std::vector; +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Tuples%20types][Tuples types:1]] +using ABCTuple = std::array; +using PartialTuple = std::array; +using ABCTuples = std::vector; - ABCTuples getTuplesList(size_t Nv) { - const size_t n = Nv * (Nv + 1) * (Nv + 2) / 6 - Nv; - ABCTuples result(n); - size_t u(0); +constexpr ABCTuple FAKE_TUPLE = {0, 0, 0}; +constexpr ABCTuple INVALID_TUPLE = {1, 1, 1}; +// Tuples types:1 ends here - for (size_t a(0); a < Nv; a++) - for (size_t b(a); b < Nv; b++) - for (size_t c(b); c < Nv; c++){ - if ( a == b && b == c ) continue; - result[u++] = {a, b, c}; - } +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Distributing%20the%20tuples][Distributing the tuples:1]] +struct TuplesDistribution { + virtual ABCTuples getTuples(size_t Nv, MPI_Comm universe) = 0; + virtual bool tupleIsFake(ABCTuple const& t) { return t == FAKE_TUPLE; } +}; +// Distributing the tuples:1 ends here - return result; +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Node%20information][Node information:1]] +std::vector getNodeNames(MPI_Comm comm){ + int rank, np; + MPI_Comm_rank(comm, &rank); + MPI_Comm_size(comm, &np); + std::vector nodeList(np); + char nodeName[MPI_MAX_PROCESSOR_NAME] + , nodeNames[np*MPI_MAX_PROCESSOR_NAME] + ; + std::vector nameLengths(np) + , off(np) + ; + int nameLength; + 
MPI_Get_processor_name(nodeName, &nameLength); + MPI_Allgather(&nameLength, + 1, + MPI_INT, + nameLengths.data(), + 1, + MPI_INT, + comm); + for (int i(1); i < np; i++) + off[i] = off[i-1] + nameLengths[i-1]; + MPI_Allgatherv(nodeName, + nameLengths[rank], + MPI_BYTE, + nodeNames, + nameLengths.data(), + off.data(), + MPI_BYTE, + comm); + for (int i(0); i < np; i++) { + std::string const s(&nodeNames[off[i]], nameLengths[i]); + nodeList[i] = s; } + return nodeList; +} +// Node information:1 ends here +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Node%20information][Node information:2]] +struct RankInfo { + const std::string name; + const size_t nodeId; + const size_t globalRank; + const size_t localRank; + const size_t ranksPerNode; +}; - std::pair - getABCRange(size_t np, size_t rank, ABCTuples const& tuplesList) { - - std::vector n_tuples_per_rank(np, tuplesList.size()/np); - const size_t - // how many valid tuples should we still verteilen to nodes - // since the number of tuples is not divisible by the number of nodes - nRoundRobin = tuplesList.size() % np - // every node must have the sanme amount of tuples in order for the - // other nodes to receive and send somewhere, therefore - // some nodes will get extra tuples but that are dummy tuples - , nExtraInvalid = (np - nRoundRobin) % np - ; - - if (nRoundRobin) for (int i = 0; i < np; i++) n_tuples_per_rank[i]++; - - #if defined(TODO) - assert( tuplesList.size() - == - ( std::accumulate(n_tuples_per_rank.begin(), - n_tuples_per_rank.end(), - 0UL, - std::plus()) - + nExtraInvalid - )); - #endif - - WITH_RANK << "nRoundRobin = " << nRoundRobin << "\n"; - WITH_RANK << "nExtraInvalid = " << nExtraInvalid << "\n"; - WITH_RANK << "ntuples = " << n_tuples_per_rank[rank] << "\n"; - - auto const& it = n_tuples_per_rank.begin(); - - return - { std::accumulate(it, it + rank , 0) - , std::accumulate(it, it + rank + 1, 0) - }; +template +A unique(A const &xs) { + auto result = xs; + std::sort(std::begin(result), 
std::end(result)); + auto const& last = std::unique(std::begin(result), std::end(result)); + result.erase(last, std::end(result)); + return result; +} +std::vector +getNodeInfos(std::vector const& nodeNames) { + std::vector result; + auto const uniqueNames = unique(nodeNames); + auto const index = [&uniqueNames](std::string const& s) { + auto const& it = std::find(uniqueNames.begin(), uniqueNames.end(), s); + return std::distance(uniqueNames.begin(), it); + }; + std::vector localRanks(uniqueNames.size(), 0); + size_t globalRank = 0; + for (auto const& name: nodeNames) { + const size_t nodeId = index(name); + result.push_back({name, + nodeId, + globalRank++, + localRanks[nodeId]++, + std::count(nodeNames.begin(), + nodeNames.end(), + name) + }); } + return result; +} + +struct ClusterInfo { + const size_t nNodes, np, ranksPerNode; + const std::vector rankInfos; +}; + +ClusterInfo +getClusterInfo(MPI_Comm comm) { + auto const names = getNodeNames(comm); + auto const rankInfos = getNodeInfos(names); + + return ClusterInfo { + unique(names).size(), + names.size(), + rankInfos[0].ranksPerNode, + rankInfos + }; } -// Tuples:1 ends here +// Node information:2 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Naive%20list][Naive list:1]] +ABCTuples getTuplesList(size_t Nv, size_t rank, size_t np) { + + const size_t + // total number of tuples for the problem + n = Nv * (Nv + 1) * (Nv + 2) / 6 - Nv + + // all ranks should have the same number of tuples_per_rank + , tuples_per_rank = n / np + size_t(n % np != 0) + + // start index for the global tuples list + , start = tuples_per_rank * rank + + // end index for the global tuples list + , end = tuples_per_rank * (rank + 1) + ; + + LOG(1,"Atrip") << "tuples_per_rank = " << tuples_per_rank << "\n"; + WITH_RANK << "start, end = " << start << ", " << end << "\n"; + ABCTuples result(tuples_per_rank, FAKE_TUPLE); + + for (size_t a(0), r(0), g(0); a < Nv; a++) + for (size_t b(a); b < Nv; b++) + for (size_t c(b); c < Nv; 
c++){ + if ( a == b && b == c ) continue; + if ( start <= g && g < end) result[r++] = {a, b, c}; + g++; + } + + return result; + +} +// Naive list:1 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Naive%20list][Naive list:2]] +ABCTuples getAllTuplesList(const size_t Nv) { + const size_t n = Nv * (Nv + 1) * (Nv + 2) / 6 - Nv; + ABCTuples result(n); + + for (size_t a(0), u(0); a < Nv; a++) + for (size_t b(a); b < Nv; b++) + for (size_t c(b); c < Nv; c++){ + if ( a == b && b == c ) continue; + result[u++] = {a, b, c}; + } + + return result; +} +// Naive list:2 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Naive%20list][Naive list:3]] +struct NaiveDistribution : public TuplesDistribution { + ABCTuples getTuples(size_t Nv, MPI_Comm universe) override { + int rank, np; + MPI_Comm_rank(universe, &rank); + MPI_Comm_size(universe, &np); + return getTuplesList(Nv, (size_t)rank, (size_t)np); + } +}; +// Naive list:3 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Prolog][Prolog:1]] +namespace group_and_sort { +// Prolog:1 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Utils][Utils:1]] +// Provides the node on which the slice-element is found +// Right now we distribute the slices in a round robin fashion +// over the different nodes (NOTE: not mpi ranks but nodes) +inline +size_t isOnNode(size_t tuple, size_t nNodes) { return tuple % nNodes; } + + +// return the node (or all nodes) where the elements of this +// tuple are located +std::vector getTupleNodes(ABCTuple const& t, size_t nNodes) { + std::vector + nTuple = { isOnNode(t[0], nNodes) + , isOnNode(t[1], nNodes) + , isOnNode(t[2], nNodes) + }; + return unique(nTuple); +} + +struct Info { + size_t nNodes; + size_t nodeId; +}; +// Utils:1 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Distribution][Distribution:1]] +ABCTuples specialDistribution(Info const& info, ABCTuples const& allTuples) { + + ABCTuples nodeTuples; + size_t const nNodes(info.nNodes); + + 
std::vector + container1d(nNodes) + , container2d(nNodes * nNodes) + , container3d(nNodes * nNodes * nNodes) + ; + + if (info.nodeId == 0) + std::cout << "\tGoing through all " + << allTuples.size() + << " tuples in " + << nNodes + << " nodes\n"; + + // build container-n-d's + for (auto const& t: allTuples) { + // on which node(s) are the tuple elements located... + // put them into the right container + auto const _nodes = getTupleNodes(t, nNodes); + + switch (_nodes.size()) { + case 1: + container1d[_nodes[0]].push_back(t); + break; + case 2: + container2d[ _nodes[0] + + _nodes[1] * nNodes + ].push_back(t); + break; + case 3: + container3d[ _nodes[0] + + _nodes[1] * nNodes + + _nodes[2] * nNodes * nNodes + ].push_back(t); + break; + } + + } + + if (info.nodeId == 0) + std::cout << "\tBuilding 1-d containers\n"; + // DISTRIBUTE 1-d containers + // every tuple which is only located at one node belongs to this node + { + auto const& _tuples = container1d[info.nodeId]; + nodeTuples.resize(_tuples.size(), INVALID_TUPLE); + std::copy(_tuples.begin(), _tuples.end(), nodeTuples.begin()); + } + + if (info.nodeId == 0) + std::cout << "\tBuilding 2-d containers\n"; + // DISTRIBUTE 2-d containers + //the tuples which are located at two nodes are half/half given to these nodes + for (size_t yx = 0; yx < container2d.size(); yx++) { + + auto const& _tuples = container2d[yx]; + const + size_t idx = yx % nNodes + // remember: yx = idy * nNodes + idx + , idy = yx / nNodes + , n_half = _tuples.size() / 2 + , size = nodeTuples.size() + ; + + size_t nbeg, nend; + if (info.nodeId == idx) { + nbeg = 0 * n_half; + nend = n_half; + } else if (info.nodeId == idy) { + nbeg = 1 * n_half; + nend = _tuples.size(); + } else { + // either idx or idy is my node + continue; + } + + size_t const nextra = nend - nbeg; + nodeTuples.resize(size + nextra, INVALID_TUPLE); + std::copy(_tuples.begin() + nbeg, + _tuples.begin() + nend, + nodeTuples.begin() + size); + + } + + if (info.nodeId == 0) +
std::cout << "\tBuilding 3-d containers\n"; + // DISTRIBUTE 3-d containers + for (size_t zyx = 0; zyx < container3d.size(); zyx++) { + auto const& _tuples = container3d[zyx]; + + const + size_t idx = zyx % nNodes + , idy = (zyx / nNodes) % nNodes + // remember: zyx = idx + idy * nNodes + idz * nNodes^2 + , idz = zyx / nNodes / nNodes + , n_third = _tuples.size() / 3 + , size = nodeTuples.size() + ; + + size_t nbeg, nend; + if (info.nodeId == idx) { + nbeg = 0 * n_third; + nend = 1 * n_third; + } else if (info.nodeId == idy) { + nbeg = 1 * n_third; + nend = 2 * n_third; + } else if (info.nodeId == idz) { + nbeg = 2 * n_third; + nend = _tuples.size(); + } else { + // either idx or idy or idz is my node + continue; + } + + size_t const nextra = nend - nbeg; + nodeTuples.resize(size + nextra, INVALID_TUPLE); + std::copy(_tuples.begin() + nbeg, + _tuples.begin() + nend, + nodeTuples.begin() + size); + + } + + + if (info.nodeId == 0) std::cout << "\tswapping tuples...\n"; + /* + * sort part of group-and-sort algorithm + * every tuple on a given node is sorted in a way that + * the 'home elements' are the fastest index. 
+ * 1:yyy 2:yyn(x) 3:yny(x) 4:ynn(x) 5:nyy 6:nyn(x) 7:nny 8:nnn + */ + for (auto &nt: nodeTuples){ + if ( isOnNode(nt[0], nNodes) == info.nodeId ){ // 1234 + if ( isOnNode(nt[2], nNodes) != info.nodeId ){ // 24 + size_t const x(nt[0]); + nt[0] = nt[2]; // switch first and last + nt[2] = x; + } + else if ( isOnNode(nt[1], nNodes) != info.nodeId){ // 3 + size_t const x(nt[0]); + nt[0] = nt[1]; // switch first two + nt[1] = x; + } + } else { + if ( isOnNode(nt[1], nNodes) == info.nodeId // 56 + && isOnNode(nt[2], nNodes) != info.nodeId + ) { // 6 + size_t const x(nt[1]); + nt[1] = nt[2]; // switch last two + nt[2] = x; + } + } + } + + if (info.nodeId == 0) std::cout << "\tsorting list of tuples...\n"; + //now we sort the list of tuples + std::sort(nodeTuples.begin(), nodeTuples.end()); + + if (info.nodeId == 0) std::cout << "\trestoring tuples...\n"; + // we bring the tuples abc back in the order a 1 + if (info.nodeId == 0) + std::cout << "checking for validity of " << nodeTuples.size() << std::endl; + const bool anyInvalid + = std::any_of(nodeTuples.begin(), + nodeTuples.end(), + [](ABCTuple const& t) { return t == INVALID_TUPLE; }); + if (anyInvalid) throw "Some tuple is invalid in group-and-sort algorithm"; +#endif + + if (info.nodeId == 0) std::cout << "\treturning tuples...\n"; + return nodeTuples; + +} +// Distribution:1 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Main][Main:1]] +std::vector main(MPI_Comm universe, size_t Nv) { + + int rank, np; + MPI_Comm_rank(universe, &rank); + MPI_Comm_size(universe, &np); + + std::vector result; + + auto const nodeNames(getNodeNames(universe)); + size_t const nNodes = unique(nodeNames).size(); + auto const nodeInfos = getNodeInfos(nodeNames); + + // We want to construct a communicator which only consists of one + // element per node + bool const computeDistribution + = nodeInfos[rank].localRank == 0; + + std::vector + nodeTuples + = computeDistribution + ?
specialDistribution(Info{nNodes, nodeInfos[rank].nodeId}, + getAllTuplesList(Nv)) + : std::vector() + ; + + LOG(1,"Atrip") << "got nodeTuples\n"; + + // now we have to send the data from **one** rank on each node + // to all other ranks of this node + const + int color = nodeInfos[rank].nodeId + , key = nodeInfos[rank].localRank + ; + + + MPI_Comm INTRA_COMM; + MPI_Comm_split(universe, color, key, &INTRA_COMM); +// Main:1 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Main][Main:2]] +size_t const + tuplesPerRankLocal + = nodeTuples.size() / nodeInfos[rank].ranksPerNode + + size_t(nodeTuples.size() % nodeInfos[rank].ranksPerNode != 0) + ; + +size_t tuplesPerRankGlobal; + +MPI_Reduce(&tuplesPerRankLocal, + &tuplesPerRankGlobal, + 1, + MPI_UINT64_T, + MPI_MAX, + 0, + universe); + +MPI_Bcast(&tuplesPerRankGlobal, + 1, + MPI_UINT64_T, + 0, + universe); + +LOG(1,"Atrip") << "Tuples per rank: " << tuplesPerRankGlobal << "\n"; +LOG(1,"Atrip") << "ranks per node " << nodeInfos[rank].ranksPerNode << "\n"; +LOG(1,"Atrip") << "#nodes " << nNodes << "\n"; +// Main:2 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Main][Main:3]] +size_t const totalTuples + = tuplesPerRankGlobal * nodeInfos[rank].ranksPerNode; + +if (computeDistribution) { + // pad with FAKE_TUPLEs + nodeTuples.insert(nodeTuples.end(), + totalTuples - nodeTuples.size(), + FAKE_TUPLE); +} +// Main:3 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Main][Main:4]] +{ + // construct mpi type for abctuple + MPI_Datatype MPI_ABCTUPLE; + MPI_Type_vector(nodeTuples[0].size(), 1, 1, MPI_UINT64_T, &MPI_ABCTUPLE); + MPI_Type_commit(&MPI_ABCTUPLE); + + LOG(1,"Atrip") << "scattering tuples \n"; + + result.resize(tuplesPerRankGlobal); + MPI_Scatter(nodeTuples.data(), + tuplesPerRankGlobal, + MPI_ABCTUPLE, + result.data(), + tuplesPerRankGlobal, + MPI_ABCTUPLE, + 0, + INTRA_COMM); + + MPI_Type_free(&MPI_ABCTUPLE); + +} +// Main:4 ends here + +//
[[file:~/cc4s/src/atrip/complex/atrip.org::*Main][Main:5]] +return result; + +} +// Main:5 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Interface][Interface:1]] +struct Distribution : public TuplesDistribution { + ABCTuples getTuples(size_t Nv, MPI_Comm universe) override { + return main(universe, Nv); + } +}; +// Interface:1 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Epilog][Epilog:1]] +} // namespace group_and_sort +// Epilog:1 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Epilog][Epilog:1]] +} +// Epilog:1 ends here diff --git a/include/atrip/Unions.hpp b/include/atrip/Unions.hpp index db3b6b7..e651ef9 100644 --- a/include/atrip/Unions.hpp +++ b/include/atrip/Unions.hpp @@ -59,7 +59,7 @@ namespace atrip { , child_world , global_world , Slice::TA - , 4) { + , 6) { init(sourceTensor); } @@ -97,7 +97,7 @@ namespace atrip { , child_world , global_world , Slice::VIJKA - , 4) { + , 6) { init(sourceTensor); } diff --git a/include/atrip/Utils.hpp b/include/atrip/Utils.hpp index bff3d19..83656c6 100644 --- a/include/atrip/Utils.hpp +++ b/include/atrip/Utils.hpp @@ -1,4 +1,4 @@ -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Utils][Utils:1]] +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Prolog][Prolog:1]] #pragma once #include #include @@ -6,32 +6,41 @@ #include #include +#include namespace atrip { +// Prolog:1 ends here - - template +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Pretty%20printing][Pretty printing:1]] +template std::string pretty_print(T&& value) { std::stringstream stream; -#if ATRIP_DEBUG > 1 +#if ATRIP_DEBUG > 2 dbg::pretty_print(stream, std::forward(value)); #endif return stream.str(); } +// Pretty printing:1 ends here -#define WITH_CHRONO(__chrono, ...) \ - __chrono.start(); __VA_ARGS__ __chrono.stop(); +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Chrono][Chrono:1]] +#define WITH_CHRONO(__chrono_name, ...) 
\ + Atrip::chrono[__chrono_name].start(); \ + __VA_ARGS__ \ + Atrip::chrono[__chrono_name].stop(); - struct Timer { - using Clock = std::chrono::high_resolution_clock; - using Event = std::chrono::time_point; - std::chrono::duration duration; - Event _start; - inline void start() noexcept { _start = Clock::now(); } - inline void stop() noexcept { duration += Clock::now() - _start; } - inline void clear() noexcept { duration *= 0; } - inline double count() const noexcept { return duration.count(); } - }; - using Timings = std::map; +struct Timer { + using Clock = std::chrono::high_resolution_clock; + using Event = std::chrono::time_point; + std::chrono::duration duration; + Event _start; + inline void start() noexcept { _start = Clock::now(); } + inline void stop() noexcept { duration += Clock::now() - _start; } + inline void clear() noexcept { duration *= 0; } + inline double count() const noexcept { return duration.count(); } +}; +using Timings = std::map; +// Chrono:1 ends here + +// [[file:~/cc4s/src/atrip/complex/atrip.org::*Epilog][Epilog:1]] } -// Utils:1 ends here +// Epilog:1 ends here diff --git a/src/atrip/Atrip.cxx b/src/atrip/Atrip.cxx index fc613b6..b7823de 100644 --- a/src/atrip/Atrip.cxx +++ b/src/atrip/Atrip.cxx @@ -9,8 +9,11 @@ using namespace atrip; +bool RankMap::RANK_ROUND_ROBIN; +bool RankMap::RANK_ROUND_ROBIN; int Atrip::rank; int Atrip::np; +Timings Atrip::chrono; // user printing block IterationDescriptor IterationDescription::descriptor; @@ -30,28 +33,35 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { const int rank = Atrip::rank; MPI_Comm universe = in.ei->wrld->comm; - // Timings in seconds ================================================{{{1 - Timings chrono{}; - const size_t No = in.ei->lens[0]; const size_t Nv = in.ea->lens[0]; LOG(0,"Atrip") << "No: " << No << "\n"; LOG(0,"Atrip") << "Nv: " << Nv << "\n"; + LOG(0,"Atrip") << "np: " << np << "\n"; // allocate the three scratches, see piecuch - std::vector Tijk(No*No*No) // 
doubles only (see piecuch) - , Zijk(No*No*No) // singles + doubles (see piecuch) - // we need local copies of the following tensors on every - // rank - , epsi(No) - , epsa(Nv) - , Tai(No * Nv) - ; + std::vector Tijk(No*No*No) // doubles only (see piecuch) + , Zijk(No*No*No) // singles + doubles (see piecuch) + // we need local copies of the following tensors on every + // rank + , epsi(No) + , epsa(Nv) + , Tai(No * Nv) + ; in.ei->read_all(epsi.data()); in.ea->read_all(epsa.data()); in.Tph->read_all(Tai.data()); + RankMap::RANK_ROUND_ROBIN = in.rankRoundRobin; + if (RankMap::RANK_ROUND_ROBIN) { + LOG(0,"Atrip") << "Doing rank round robin slices distribution" << "\n"; + } else { + LOG(0,"Atrip") + << "Doing node > local rank round robin slices distribution" << "\n"; + } + + // COMMUNICATOR CONSTRUCTION ========================================={{{1 // // Construct a new communicator living only on a single rank @@ -72,41 +82,49 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { } - chrono["nv-slices"].start(); // BUILD SLICES PARAMETRIZED BY NV ==================================={{{1 - LOG(0,"Atrip") << "BUILD NV-SLICES\n"; - TAPHH taphh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); - HHHA hhha(*in.Vhhhp, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); - chrono["nv-slices"].stop(); + WITH_CHRONO("nv-slices", + LOG(0,"Atrip") << "BUILD NV-SLICES\n"; + TAPHH taphh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); + HHHA hhha(*in.Vhhhp, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); + ) - chrono["nv-nv-slices"].start(); // BUILD SLICES PARAMETRIZED BY NV x NV =============================={{{1 - LOG(0,"Atrip") << "BUILD NV x NV-SLICES\n"; - ABPH abph(*in.Vppph, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); - ABHH abhh(*in.Vpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); - TABHH tabhh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); - 
chrono["nv-nv-slices"].stop(); + WITH_CHRONO("nv-nv-slices", + LOG(0,"Atrip") << "BUILD NV x NV-SLICES\n"; + ABPH abph(*in.Vppph, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); + ABHH abhh(*in.Vpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); + TABHH tabhh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); + ) // all tensors std::vector< SliceUnion* > unions = {&taphh, &hhha, &abph, &abhh, &tabhh}; - //CONSTRUCT TUPLE LIST ==============================================={{{1 - LOG(0,"Atrip") << "BUILD TUPLE LIST\n"; - const auto tuplesList = std::move(getTuplesList(Nv)); - WITH_RANK << "tupList.size() = " << tuplesList.size() << "\n"; + // get tuples for the current rank + TuplesDistribution *distribution; - // GET ABC INDEX RANGE FOR RANK ======================================{{{1 - auto abcIndex = getABCRange(np, rank, tuplesList); - size_t nIterations = abcIndex.second - abcIndex.first; + if (in.tuplesDistribution == Atrip::Input::TuplesDistribution::NAIVE) { + LOG(0,"Atrip") << "Using the naive distribution\n"; + distribution = new NaiveDistribution(); + } else { + LOG(0,"Atrip") << "Using the group-and-sort distribution\n"; + distribution = new group_and_sort::Distribution(); + } - WITH_RANK << "abcIndex = " << pretty_print(abcIndex) << "\n"; - LOG(0,"Atrip") << "#iterations: " << nIterations << "\n"; + LOG(0,"Atrip") << "BUILDING TUPLE LIST\n"; + WITH_CHRONO("tuples:build", + auto const tuplesList = distribution->getTuples(Nv, universe); + ) + const size_t nIterations = tuplesList.size(); - // first abc - const ABCTuple firstAbc = tuplesList[abcIndex.first]; - - - double energy(0.); + { + const size_t _all_tuples = Nv * (Nv + 1) * (Nv + 2) / 6 - Nv; + LOG(0,"Atrip") << "#iterations: " + << nIterations + << "/" + << nIterations * np + << "\n"; + } const size_t iterationMod = (in.percentageMod > 0) @@ -119,7 +137,9 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { auto const isFakeTuple - = [&tuplesList](size_t 
const i) { return i >= tuplesList.size(); }; + = [&tuplesList, distribution](size_t const i) { + return distribution->tupleIsFake(tuplesList[i]); + }; using Database = typename Slice::Database; @@ -127,45 +147,42 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { auto communicateDatabase = [ &unions , np - , &chrono ] (ABCTuple const& abc, MPI_Comm const& c) -> Database { - chrono["db:comm:type:do"].start(); - auto MPI_LDB_ELEMENT = Slice::mpi::localDatabaseElement(); - chrono["db:comm:type:do"].stop(); + WITH_CHRONO("db:comm:type:do", + auto MPI_LDB_ELEMENT = Slice::mpi::localDatabaseElement(); + ) - chrono["db:comm:ldb"].start(); - LocalDatabase ldb; - - for (auto const& tensor: unions) { - auto const& tensorDb = tensor->buildLocalDatabase(abc); - ldb.insert(ldb.end(), tensorDb.begin(), tensorDb.end()); - } - chrono["db:comm:ldb"].stop(); + WITH_CHRONO("db:comm:ldb", + typename Slice::LocalDatabase ldb; + for (auto const& tensor: unions) { + auto const& tensorDb = tensor->buildLocalDatabase(abc); + ldb.insert(ldb.end(), tensorDb.begin(), tensorDb.end()); + } + ) Database db(np * ldb.size(), ldb[0]); - chrono["oneshot-db:comm:allgather"].start(); - chrono["db:comm:allgather"].start(); - MPI_Allgather( ldb.data() - , ldb.size() - , MPI_LDB_ELEMENT - , db.data() - , ldb.size() - , MPI_LDB_ELEMENT - , c); - chrono["db:comm:allgather"].stop(); - chrono["oneshot-db:comm:allgather"].stop(); + WITH_CHRONO("oneshot-db:comm:allgather", + WITH_CHRONO("db:comm:allgather", + MPI_Allgather( ldb.data() + , ldb.size() + , MPI_LDB_ELEMENT + , db.data() + , ldb.size() + , MPI_LDB_ELEMENT + , c); + )) - chrono["db:comm:type:free"].start(); - MPI_Type_free(&MPI_LDB_ELEMENT); - chrono["db:comm:type:free"].stop(); + WITH_CHRONO("db:comm:type:free", + MPI_Type_free(&MPI_LDB_ELEMENT); + ) return db; }; auto doIOPhase - = [&unions, &rank, &np, &universe, &chrono] (Database const& db) { + = [&unions, &rank, &np, &universe] (Database const& db) { const size_t localDBLength = db.size() / 
np; @@ -201,9 +218,9 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { << "\n" ; - chrono["db:io:recv"].start(); - u.receive(el.info, recvTag); - chrono["db:io:recv"].stop(); + WITH_CHRONO("db:io:recv", + u.receive(el.info, recvTag); + ) } // recv } @@ -237,9 +254,9 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { << "\n" ; - chrono["db:io:send"].start(); - u.send(otherRank, el.info, sendTag); - chrono["db:io:send"].stop(); + WITH_CHRONO("db:io:send", + u.send(otherRank, el, sendTag); + ) } // send phase @@ -257,31 +274,30 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { * double(No) * double(No) * (double(No) + double(Nv)) - * 2 - * 6 + * 2.0 + * (traits::isComplex() ? 2.0 : 1.0) + * 6.0 / 1e9 ; // START MAIN LOOP ======================================================{{{1 - for ( size_t i = abcIndex.first, iteration = 1 - ; i < abcIndex.second + double energy(0.); + + for ( size_t i = 0, iteration = 1 + ; i < tuplesList.size() ; i++, iteration++ ) { - chrono["iterations"].start(); - + Atrip::chrono["iterations"].start(); // check overhead from chrono over all iterations - chrono["start:stop"].start(); chrono["start:stop"].stop(); + WITH_CHRONO("start:stop", {}) // check overhead of doing a barrier at the beginning - chrono["oneshot-mpi:barrier"].start(); - chrono["mpi:barrier"].start(); - // TODO: REMOVE - if (in.barrier == 1) - MPI_Barrier(universe); - chrono["mpi:barrier"].stop(); - chrono["oneshot-mpi:barrier"].stop(); + WITH_CHRONO("oneshot-mpi:barrier", + WITH_CHRONO("mpi:barrier", + if (in.barrier) MPI_Barrier(universe); + )) if (iteration % iterationMod == 0 || iteration == iteration1Percent) { @@ -289,22 +305,22 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { IterationDescription::descriptor({ iteration, nIterations, - chrono["iterations"].count() + Atrip::chrono["iterations"].count() }); } LOG(0,"Atrip") << "iteration " << iteration << " [" << 100 * iteration / nIterations << "%]" - << " (" << doublesFlops * iteration / 
chrono["doubles"].count() + << " (" << doublesFlops * iteration / Atrip::chrono["doubles"].count() << "GF)" - << " (" << doublesFlops * iteration / chrono["iterations"].count() + << " (" << doublesFlops * iteration / Atrip::chrono["iterations"].count() << "GF)" << " ===========================\n"; // PRINT TIMINGS if (in.chrono) - for (auto const& pair: chrono) + for (auto const& pair: Atrip::chrono) LOG(1, " ") << pair.first << " :: " << pair.second.count() << std::endl; @@ -314,46 +330,43 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { const ABCTuple abc = isFakeTuple(i) ? tuplesList[tuplesList.size() - 1] : tuplesList[i] - , *abcNext = i == (abcIndex.second - 1) + , *abcNext = i == (tuplesList.size() - 1) ? nullptr - : isFakeTuple(i + 1) - ? &tuplesList[tuplesList.size() - 1] : &tuplesList[i + 1] ; - chrono["with_rank"].start(); - WITH_RANK << " :it " << iteration - << " :abc " << pretty_print(abc) - << " :abcN " - << (abcNext ? pretty_print(*abcNext) : "None") - << "\n"; - chrono["with_rank"].stop(); + WITH_CHRONO("with_rank", + WITH_RANK << " :it " << iteration + << " :abc " << pretty_print(abc) + << " :abcN " + << (abcNext ? pretty_print(*abcNext) : "None") + << "\n"; + ) // COMM FIRST DATABASE ================================================{{{1 - if (i == abcIndex.first) { + if (i == 0) { WITH_RANK << "__first__:first database ............ 
\n"; - const auto __db = communicateDatabase(abc, universe); + const auto db = communicateDatabase(abc, universe); WITH_RANK << "__first__:first database communicated \n"; WITH_RANK << "__first__:first database io phase \n"; - doIOPhase(__db); + doIOPhase(db); WITH_RANK << "__first__:first database io phase DONE\n"; WITH_RANK << "__first__::::Unwrapping all slices for first database\n"; for (auto& u: unions) u->unwrapAll(abc); - WITH_RANK << "__first__::::Unwrapping all slices for first database DONE\n"; + WITH_RANK << "__first__::::Unwrapping slices for first database DONE\n"; MPI_Barrier(universe); } // COMM NEXT DATABASE ================================================={{{1 if (abcNext) { WITH_RANK << "__comm__:" << iteration << "th communicating database\n"; - chrono["db:comm"].start(); - //const auto db = communicateDatabase(*abcNext, universe); - Database db = communicateDatabase(*abcNext, universe); - chrono["db:comm"].stop(); - chrono["db:io"].start(); - doIOPhase(db); - chrono["db:io"].stop(); + WITH_CHRONO("db:comm", + const auto db = communicateDatabase(*abcNext, universe); + ) + WITH_CHRONO("db:io", + doIOPhase(db); + ) WITH_RANK << "__comm__:" << iteration << "th database io phase DONE\n"; } @@ -361,63 +374,61 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { OCD_Barrier(universe); if (!isFakeTuple(i)) { WITH_RANK << iteration << "-th doubles\n"; - WITH_CHRONO(chrono["oneshot-unwrap"], - WITH_CHRONO(chrono["unwrap"], - WITH_CHRONO(chrono["unwrap:doubles"], + WITH_CHRONO("oneshot-unwrap", + WITH_CHRONO("unwrap", + WITH_CHRONO("unwrap:doubles", for (auto& u: decltype(unions){&abph, &hhha, &taphh, &tabhh}) { u->unwrapAll(abc); } ))) - chrono["oneshot-doubles"].start(); - chrono["doubles"].start(); - doublesContribution( abc, (size_t)No, (size_t)Nv - // -- VABCI - , abph.unwrapSlice(Slice::AB, abc) - , abph.unwrapSlice(Slice::AC, abc) - , abph.unwrapSlice(Slice::BC, abc) - , abph.unwrapSlice(Slice::BA, abc) - , abph.unwrapSlice(Slice::CA, abc) - , 
abph.unwrapSlice(Slice::CB, abc) - // -- VHHHA - , hhha.unwrapSlice(Slice::A, abc) - , hhha.unwrapSlice(Slice::B, abc) - , hhha.unwrapSlice(Slice::C, abc) - // -- TA - , taphh.unwrapSlice(Slice::A, abc) - , taphh.unwrapSlice(Slice::B, abc) - , taphh.unwrapSlice(Slice::C, abc) - // -- TABIJ - , tabhh.unwrapSlice(Slice::AB, abc) - , tabhh.unwrapSlice(Slice::AC, abc) - , tabhh.unwrapSlice(Slice::BC, abc) - // -- TIJK - , Tijk.data() - , chrono - ); - WITH_RANK << iteration << "-th doubles done\n"; - chrono["doubles"].stop(); - chrono["oneshot-doubles"].stop(); + WITH_CHRONO("oneshot-doubles", + WITH_CHRONO("doubles", + doublesContribution( abc, (size_t)No, (size_t)Nv + // -- VABCI + , abph.unwrapSlice(Slice::AB, abc) + , abph.unwrapSlice(Slice::AC, abc) + , abph.unwrapSlice(Slice::BC, abc) + , abph.unwrapSlice(Slice::BA, abc) + , abph.unwrapSlice(Slice::CA, abc) + , abph.unwrapSlice(Slice::CB, abc) + // -- VHHHA + , hhha.unwrapSlice(Slice::A, abc) + , hhha.unwrapSlice(Slice::B, abc) + , hhha.unwrapSlice(Slice::C, abc) + // -- TA + , taphh.unwrapSlice(Slice::A, abc) + , taphh.unwrapSlice(Slice::B, abc) + , taphh.unwrapSlice(Slice::C, abc) + // -- TABIJ + , tabhh.unwrapSlice(Slice::AB, abc) + , tabhh.unwrapSlice(Slice::AC, abc) + , tabhh.unwrapSlice(Slice::BC, abc) + // -- TIJK + , Tijk.data() + ); + WITH_RANK << iteration << "-th doubles done\n"; + )) } // COMPUTE SINGLES =================================================== {{{1 OCD_Barrier(universe); if (!isFakeTuple(i)) { - WITH_CHRONO(chrono["oneshot-unwrap"], - WITH_CHRONO(chrono["unwrap"], - WITH_CHRONO(chrono["unwrap:singles"], + WITH_CHRONO("oneshot-unwrap", + WITH_CHRONO("unwrap", + WITH_CHRONO("unwrap:singles", abhh.unwrapAll(abc); ))) - chrono["reorder"].start(); - for (size_t I(0); I < Zijk.size(); I++) Zijk[I] = Tijk[I]; - chrono["reorder"].stop(); - chrono["singles"].start(); + WITH_CHRONO("reorder", + for (size_t I(0); I < Zijk.size(); I++) Zijk[I] = Tijk[I]; + ) + WITH_CHRONO("singles", 
singlesContribution( No, Nv, abc , Tai.data() , abhh.unwrapSlice(Slice::AB, abc) , abhh.unwrapSlice(Slice::AC, abc) , abhh.unwrapSlice(Slice::BC, abc) , Zijk.data()); - chrono["singles"].stop(); + ) } @@ -430,12 +441,12 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { if (abc[1] == abc[2]) distinct--; const F epsabc(epsa[abc[0]] + epsa[abc[1]] + epsa[abc[2]]); - chrono["energy"].start(); - if ( distinct == 0) - tupleEnergy = getEnergyDistinct(epsabc, epsi, Tijk, Zijk); - else - tupleEnergy = getEnergySame(epsabc, epsi, Tijk, Zijk); - chrono["energy"].stop(); + WITH_CHRONO("energy", + if ( distinct == 0) + tupleEnergy = getEnergyDistinct(epsabc, epsi, Tijk, Zijk); + else + tupleEnergy = getEnergySame(epsabc, epsi, Tijk, Zijk); + ) #if defined(HAVE_OCD) || defined(ATRIP_PRINT_TUPLES) tupleEnergies[abc] = tupleEnergy; @@ -445,6 +456,7 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { } + // TODO: remove this if (isFakeTuple(i)) { // fake iterations should also unwrap whatever they got WITH_RANK << iteration @@ -466,7 +478,6 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { // CLEANUP UNIONS ===================================================={{{1 OCD_Barrier(universe); if (abcNext) { - chrono["gc"].start(); WITH_RANK << "__gc__:" << iteration << "-th cleaning up.......\n"; for (auto& u: unions) { @@ -500,12 +511,11 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { } - chrono["gc"].stop(); } WITH_RANK << iteration << "-th cleaning up....... 
DONE\n"; - chrono["iterations"].stop(); + Atrip::chrono["iterations"].stop(); // ITERATION END ====================================================={{{1 } @@ -543,15 +553,15 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { // PRINT TIMINGS {{{1 if (in.chrono) - for (auto const& pair: chrono) + for (auto const& pair: Atrip::chrono) LOG(0,"atrip:chrono") << pair.first << " " << pair.second.count() << std::endl; LOG(0, "atrip:flops(doubles)") - << nIterations * doublesFlops / chrono["doubles"].count() << "\n"; + << nIterations * doublesFlops / Atrip::chrono["doubles"].count() << "\n"; LOG(0, "atrip:flops(iterations)") - << nIterations * doublesFlops / chrono["iterations"].count() << "\n"; + << nIterations * doublesFlops / Atrip::chrono["iterations"].count() << "\n"; // TODO: change the sign in the getEnergy routines return { - globalEnergy }; From 10a796971012a1ce494d88d983ff6a1725a3dfbe Mon Sep 17 00:00:00 2001 From: Alejandro Gallo Date: Tue, 22 Feb 2022 12:09:41 +0100 Subject: [PATCH 22/22] Silence the logging in group-and-sort --- atrip.org | 18 ++++++------ include/atrip.hpp | 2 +- include/atrip/Atrip.hpp | 2 +- include/atrip/Blas.hpp | 2 +- include/atrip/Debug.hpp | 8 +++--- include/atrip/Equations.hpp | 2 +- include/atrip/RankMap.hpp | 2 +- include/atrip/Slice.hpp | 54 +++++++++++++++++----------------- include/atrip/SliceUnion.hpp | 2 +- include/atrip/Tuples.hpp | 56 ++++++++++++++++++------------------ include/atrip/Unions.hpp | 2 +- include/atrip/Utils.hpp | 8 +++--- src/atrip/Atrip.cxx | 2 +- 13 files changed, 80 insertions(+), 80 deletions(-) diff --git a/atrip.org b/atrip.org index c6ea744..491f810 100644 --- a/atrip.org +++ b/atrip.org @@ -1813,7 +1813,7 @@ ABCTuples specialDistribution(Info const& info, ABCTuples const& allTuples) { , container3d(nNodes * nNodes * nNodes) ; - if (info.nodeId == 0) + WITH_DBG if (info.nodeId == 0) std::cout << "\tGoing through all " << allTuples.size() << " tuples in " @@ -1845,7 +1845,7 @@ ABCTuples 
specialDistribution(Info const& info, ABCTuples const& allTuples) { } - if (info.nodeId == 0) + WITH_DBG if (info.nodeId == 0) std::cout << "\tBuilding 1-d containers\n"; // DISTRIBUTE 1-d containers // every tuple which is only located at one node belongs to this node @@ -1855,7 +1855,7 @@ ABCTuples specialDistribution(Info const& info, ABCTuples const& allTuples) { std::copy(_tuples.begin(), _tuples.end(), nodeTuples.begin()); } - if (info.nodeId == 0) + WITH_DBG if (info.nodeId == 0) std::cout << "\tBuilding 2-d containers\n"; // DISTRIBUTE 2-d containers //the tuples which are located at two nodes are half/half given to these nodes @@ -1890,7 +1890,7 @@ ABCTuples specialDistribution(Info const& info, ABCTuples const& allTuples) { } - if (info.nodeId == 0) + WITH_DBG if (info.nodeId == 0) std::cout << "\tBuilding 3-d containers\n"; // DISTRIBUTE 3-d containers for (size_t zyx = 0; zyx < container3d.size(); zyx++) { @@ -1929,7 +1929,7 @@ ABCTuples specialDistribution(Info const& info, ABCTuples const& allTuples) { } - if (info.nodeId == 0) std::cout << "\tswapping tuples...\n"; + WITH_DBG if (info.nodeId == 0) std::cout << "\tswapping tuples...\n"; /* * sort part of group-and-sort algorithm * every tuple on a given node is sorted in a way that @@ -1959,16 +1959,16 @@ ABCTuples specialDistribution(Info const& info, ABCTuples const& allTuples) { } } - if (info.nodeId == 0) std::cout << "\tsorting list of tuples...\n"; + WITH_DBG if (info.nodeId == 0) std::cout << "\tsorting list of tuples...\n"; //now we sort the list of tuples std::sort(nodeTuples.begin(), nodeTuples.end()); - if (info.nodeId == 0) std::cout << "\trestoring tuples...\n"; + WITH_DBG if (info.nodeId == 0) std::cout << "\trestoring tuples...\n"; // we bring the tuples abc back in the order a 1 - if (info.nodeId == 0) + WITH_DBG if (info.nodeId == 0) std::cout << "checking for validity of " << nodeTuples.size() << std::endl; const bool anyInvalid = std::any_of(nodeTuples.begin(), @@ -1977,7 +1977,7 @@ 
ABCTuples specialDistribution(Info const& info, ABCTuples const& allTuples) { if (anyInvalid) throw "Some tuple is invalid in group-and-sort algorithm"; #endif - if (info.nodeId == 0) std::cout << "\treturning tuples...\n"; + WITH_DBG if (info.nodeId == 0) std::cout << "\treturning tuples...\n"; return nodeTuples; } diff --git a/include/atrip.hpp b/include/atrip.hpp index 8ecf6ce..5deb277 100644 --- a/include/atrip.hpp +++ b/include/atrip.hpp @@ -1,4 +1,4 @@ -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Include%20header][Include header:1]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Include%20header][Include header:1]] #pragma once #include diff --git a/include/atrip/Atrip.hpp b/include/atrip/Atrip.hpp index 2a0f340..15c4ef5 100644 --- a/include/atrip/Atrip.hpp +++ b/include/atrip/Atrip.hpp @@ -1,4 +1,4 @@ -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Header][Header:1]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Header][Header:1]] #pragma once #include #include diff --git a/include/atrip/Blas.hpp b/include/atrip/Blas.hpp index df81d74..ea4e702 100644 --- a/include/atrip/Blas.hpp +++ b/include/atrip/Blas.hpp @@ -1,4 +1,4 @@ -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Blas][Blas:1]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Blas][Blas:1]] #pragma once namespace atrip { diff --git a/include/atrip/Debug.hpp b/include/atrip/Debug.hpp index e567d5c..18e56bb 100644 --- a/include/atrip/Debug.hpp +++ b/include/atrip/Debug.hpp @@ -1,4 +1,4 @@ -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Macros][Macros:1]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Macros][Macros:1]] #pragma once #include #define ATRIP_BENCHMARK @@ -61,20 +61,20 @@ #endif // Macros:1 ends here -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Macros][Macros:2]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Macros][Macros:2]] #ifndef LOG #define LOG(level, name) if (Atrip::rank == 0) std::cout << name << ": " #endif // Macros:2 ends here -// 
[[file:~/cc4s/src/atrip/complex/atrip.org::*Macros][Macros:3]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Macros][Macros:3]] #ifdef ATRIP_NO_OUTPUT # undef LOG # define LOG(level, name) if (false) std::cout << name << ": " #endif // Macros:3 ends here -// [[file:~/cc4s/src/atrip/complex/atrip.org::IterationDescriptor][IterationDescriptor]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::IterationDescriptor][IterationDescriptor]] namespace atrip { struct IterationDescription; diff --git a/include/atrip/Equations.hpp b/include/atrip/Equations.hpp index e907592..6ec67a9 100644 --- a/include/atrip/Equations.hpp +++ b/include/atrip/Equations.hpp @@ -1,4 +1,4 @@ -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Equations][Equations:1]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Equations][Equations:1]] #pragma once #include diff --git a/include/atrip/RankMap.hpp b/include/atrip/RankMap.hpp index 0e31a61..433fd8f 100644 --- a/include/atrip/RankMap.hpp +++ b/include/atrip/RankMap.hpp @@ -1,4 +1,4 @@ -// [[file:~/cc4s/src/atrip/complex/atrip.org::*The%20rank%20mapping][The rank mapping:1]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*The%20rank%20mapping][The rank mapping:1]] #pragma once #include diff --git a/include/atrip/Slice.hpp b/include/atrip/Slice.hpp index 1f5889e..6b319b7 100644 --- a/include/atrip/Slice.hpp +++ b/include/atrip/Slice.hpp @@ -1,4 +1,4 @@ -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Prolog][Prolog:1]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Prolog][Prolog:1]] #pragma once #include #include @@ -29,11 +29,11 @@ template struct Slice { // Prolog:1 ends here -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Location][Location:1]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Location][Location:1]] struct Location { size_t rank; size_t source; }; // Location:1 ends here -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Type][Type:1]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Type][Type:1]] enum Type { A = 10 , B @@ -51,7 +51,7 @@ 
enum Type }; // Type:1 ends here -// [[file:~/cc4s/src/atrip/complex/atrip.org::*State][State:1]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*State][State:1]] enum State { Fetch = 0, Dispatched = 2, @@ -62,7 +62,7 @@ enum State { }; // State:1 ends here -// [[file:~/cc4s/src/atrip/complex/atrip.org::*The%20Info%20structure][The Info structure:1]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*The%20Info%20structure][The Info structure:1]] struct Info { // which part of a,b,c the slice holds PartialTuple tuple; @@ -86,7 +86,7 @@ struct Info { using Ty_x_Tu = std::pair< Type, PartialTuple >; // The Info structure:1 ends here -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Name][Name:1]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Name][Name:1]] enum Name { TA = 100 , VIJKA = 101 @@ -96,19 +96,19 @@ enum Name }; // Name:1 ends here -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Database][Database:1]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Database][Database:1]] struct LocalDatabaseElement { Slice::Name name; Slice::Info info; }; // Database:1 ends here -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Database][Database:2]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Database][Database:2]] using LocalDatabase = std::vector; using Database = LocalDatabase; // Database:2 ends here -// [[file:~/cc4s/src/atrip/complex/atrip.org::*MPI%20Types][MPI Types:1]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*MPI%20Types][MPI Types:1]] struct mpi { static MPI_Datatype vector(size_t n, MPI_Datatype const& DT) { @@ -214,7 +214,7 @@ struct mpi { }; // MPI Types:1 ends here -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Static%20utilities][Static utilities:1]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Static%20utilities][Static utilities:1]] static PartialTuple subtupleBySlice(ABCTuple abc, Type sliceType) { switch (sliceType) { @@ -232,7 +232,7 @@ PartialTuple subtupleBySlice(ABCTuple abc, Type sliceType) { } // Static utilities:1 ends here -// 
[[file:~/cc4s/src/atrip/complex/atrip.org::*Static%20utilities][Static utilities:2]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Static%20utilities][Static utilities:2]] static std::vector*> hasRecycledReferencingToIt ( std::vector> &slices , Info const& info @@ -249,7 +249,7 @@ static std::vector*> hasRecycledReferencingToIt } // Static utilities:2 ends here -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Static%20utilities][Static utilities:3]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Static%20utilities][Static utilities:3]] static Slice& findOneByType(std::vector> &slices, Slice::Type type) { const auto sliceIt = std::find_if(slices.begin(), slices.end(), @@ -265,7 +265,7 @@ static Slice& findOneByType(std::vector> &slices, Slice::Type typ } // Static utilities:3 ends here -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Static%20utilities][Static utilities:4]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Static%20utilities][Static utilities:4]] static Slice& findRecycledSource (std::vector> &slices, Slice::Info info) { const auto sliceIt @@ -291,7 +291,7 @@ findRecycledSource (std::vector> &slices, Slice::Info info) { } // Static utilities:4 ends here -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Static%20utilities][Static utilities:5]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Static%20utilities][Static utilities:5]] static Slice& findByTypeAbc ( std::vector> &slices , Slice::Type type @@ -321,7 +321,7 @@ static Slice& findByTypeAbc } // Static utilities:5 ends here -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Static%20utilities][Static utilities:6]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Static%20utilities][Static utilities:6]] static Slice& findByInfo(std::vector> &slices, Slice::Info const& info) { const auto sliceIt @@ -344,30 +344,30 @@ static Slice& findByInfo(std::vector> &slices, } // Static utilities:6 ends here -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Attributes][Attributes:1]] +// 
[[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Attributes][Attributes:1]] Info info; // Attributes:1 ends here -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Attributes][Attributes:2]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Attributes][Attributes:2]] F *data; // Attributes:2 ends here -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Attributes][Attributes:3]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Attributes][Attributes:3]] MPI_Request request; // Attributes:3 ends here -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Attributes][Attributes:4]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Attributes][Attributes:4]] const size_t size; // Attributes:4 ends here -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Member%20functions][Member functions:1]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Member%20functions][Member functions:1]] void markReady() noexcept { info.state = Ready; info.recycling = Blank; } // Member functions:1 ends here -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Member%20functions][Member functions:2]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Member%20functions][Member functions:2]] bool isUnwrapped() const noexcept { return info.state == Ready || info.state == SelfSufficient @@ -375,7 +375,7 @@ bool isUnwrapped() const noexcept { } // Member functions:2 ends here -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Member%20functions][Member functions:3]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Member%20functions][Member functions:3]] bool isUnwrappable() const noexcept { return isUnwrapped() || info.state == Recycled @@ -408,7 +408,7 @@ inline bool isFree() const noexcept { } // Member functions:3 ends here -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Member%20functions][Member functions:4]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Member%20functions][Member functions:4]] inline bool isRecyclable() const noexcept { return ( info.state == Dispatched || info.state == Ready @@ -419,7 +419,7 @@ inline bool 
isRecyclable() const noexcept { } // Member functions:4 ends here -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Member%20functions][Member functions:5]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Member%20functions][Member functions:5]] inline bool hasValidDataPointer() const noexcept { return data != nullptr && info.state != Acceptor @@ -428,7 +428,7 @@ inline bool hasValidDataPointer() const noexcept { } // Member functions:5 ends here -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Member%20functions][Member functions:6]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Member%20functions][Member functions:6]] void unwrapAndMarkReady() { if (info.state == Ready) return; if (info.state != Dispatched) @@ -460,7 +460,7 @@ void unwrapAndMarkReady() { } // Member functions:6 ends here -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Epilog][Epilog:1]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Epilog][Epilog:1]] Slice(size_t size_) : info({}) , data(nullptr) @@ -471,7 +471,7 @@ Slice(size_t size_) }; // struct Slice // Epilog:1 ends here -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Debug][Debug:1]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Debug][Debug:1]] template std::ostream& operator<<(std::ostream& out, typename Slice::Location const& v) { // TODO: remove me diff --git a/include/atrip/SliceUnion.hpp b/include/atrip/SliceUnion.hpp index 365ad51..3bae8f8 100644 --- a/include/atrip/SliceUnion.hpp +++ b/include/atrip/SliceUnion.hpp @@ -1,4 +1,4 @@ -// [[file:~/cc4s/src/atrip/complex/atrip.org::*The%20slice%20union][The slice union:1]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*The%20slice%20union][The slice union:1]] #pragma once #include #include diff --git a/include/atrip/Tuples.hpp b/include/atrip/Tuples.hpp index c41b78a..74b609d 100644 --- a/include/atrip/Tuples.hpp +++ b/include/atrip/Tuples.hpp @@ -1,4 +1,4 @@ -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Prolog][Prolog:1]] +// 
[[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Prolog][Prolog:1]] #pragma once #include @@ -21,7 +21,7 @@ namespace atrip { // Prolog:1 ends here -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Tuples%20types][Tuples types:1]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Tuples%20types][Tuples types:1]] using ABCTuple = std::array; using PartialTuple = std::array; using ABCTuples = std::vector; @@ -30,14 +30,14 @@ constexpr ABCTuple FAKE_TUPLE = {0, 0, 0}; constexpr ABCTuple INVALID_TUPLE = {1, 1, 1}; // Tuples types:1 ends here -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Distributing%20the%20tuples][Distributing the tuples:1]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Distributing%20the%20tuples][Distributing the tuples:1]] struct TuplesDistribution { virtual ABCTuples getTuples(size_t Nv, MPI_Comm universe) = 0; virtual bool tupleIsFake(ABCTuple const& t) { return t == FAKE_TUPLE; } }; // Distributing the tuples:1 ends here -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Node%20information][Node information:1]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Node%20information][Node information:1]] std::vector getNodeNames(MPI_Comm comm){ int rank, np; MPI_Comm_rank(comm, &rank); @@ -77,7 +77,7 @@ std::vector getNodeNames(MPI_Comm comm){ } // Node information:1 ends here -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Node%20information][Node information:2]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Node%20information][Node information:2]] struct RankInfo { const std::string name; const size_t nodeId; @@ -139,7 +139,7 @@ getClusterInfo(MPI_Comm comm) { } // Node information:2 ends here -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Naive%20list][Naive list:1]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Naive%20list][Naive list:1]] ABCTuples getTuplesList(size_t Nv, size_t rank, size_t np) { const size_t @@ -173,7 +173,7 @@ ABCTuples getTuplesList(size_t Nv, size_t rank, size_t np) { } // Naive list:1 ends here -// 
[[file:~/cc4s/src/atrip/complex/atrip.org::*Naive%20list][Naive list:2]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Naive%20list][Naive list:2]] ABCTuples getAllTuplesList(const size_t Nv) { const size_t n = Nv * (Nv + 1) * (Nv + 2) / 6 - Nv; ABCTuples result(n); @@ -189,7 +189,7 @@ ABCTuples getAllTuplesList(const size_t Nv) { } // Naive list:2 ends here -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Naive%20list][Naive list:3]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Naive%20list][Naive list:3]] struct NaiveDistribution : public TuplesDistribution { ABCTuples getTuples(size_t Nv, MPI_Comm universe) override { int rank, np; @@ -200,11 +200,11 @@ struct NaiveDistribution : public TuplesDistribution { }; // Naive list:3 ends here -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Prolog][Prolog:1]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Prolog][Prolog:1]] namespace group_and_sort { // Prolog:1 ends here -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Utils][Utils:1]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Utils][Utils:1]] // Provides the node on which the slice-element is found // Right now we distribute the slices in a round robin fashion // over the different nodes (NOTE: not mpi ranks but nodes) @@ -229,7 +229,7 @@ struct Info { }; // Utils:1 ends here -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Distribution][Distribution:1]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Distribution][Distribution:1]] ABCTuples specialDistribution(Info const& info, ABCTuples const& allTuples) { ABCTuples nodeTuples; @@ -241,7 +241,7 @@ ABCTuples specialDistribution(Info const& info, ABCTuples const& allTuples) { , container3d(nNodes * nNodes * nNodes) ; - if (info.nodeId == 0) + WITH_DBG if (info.nodeId == 0) std::cout << "\tGoing through all " << allTuples.size() << " tuples in " @@ -273,7 +273,7 @@ ABCTuples specialDistribution(Info const& info, ABCTuples const& allTuples) { } - if (info.nodeId == 0) + WITH_DBG if (info.nodeId == 0) std::cout 
<< "\tBuilding 1-d containers\n"; // DISTRIBUTE 1-d containers // every tuple which is only located at one node belongs to this node @@ -283,7 +283,7 @@ ABCTuples specialDistribution(Info const& info, ABCTuples const& allTuples) { std::copy(_tuples.begin(), _tuples.end(), nodeTuples.begin()); } - if (info.nodeId == 0) + WITH_DBG if (info.nodeId == 0) std::cout << "\tBuilding 2-d containers\n"; // DISTRIBUTE 2-d containers //the tuples which are located at two nodes are half/half given to these nodes @@ -318,7 +318,7 @@ ABCTuples specialDistribution(Info const& info, ABCTuples const& allTuples) { } - if (info.nodeId == 0) + WITH_DBG if (info.nodeId == 0) std::cout << "\tBuilding 3-d containers\n"; // DISTRIBUTE 3-d containers for (size_t zyx = 0; zyx < container3d.size(); zyx++) { @@ -357,7 +357,7 @@ ABCTuples specialDistribution(Info const& info, ABCTuples const& allTuples) { } - if (info.nodeId == 0) std::cout << "\tswapping tuples...\n"; + WITH_DBG if (info.nodeId == 0) std::cout << "\tswapping tuples...\n"; /* * sort part of group-and-sort algorithm * every tuple on a given node is sorted in a way that @@ -387,16 +387,16 @@ ABCTuples specialDistribution(Info const& info, ABCTuples const& allTuples) { } } - if (info.nodeId == 0) std::cout << "\tsorting list of tuples...\n"; + WITH_DBG if (info.nodeId == 0) std::cout << "\tsorting list of tuples...\n"; //now we sort the list of tuples std::sort(nodeTuples.begin(), nodeTuples.end()); - if (info.nodeId == 0) std::cout << "\trestoring tuples...\n"; + WITH_DBG if (info.nodeId == 0) std::cout << "\trestoring tuples...\n"; // we bring the tuples abc back in the order a 1 - if (info.nodeId == 0) + WITH_DBG if (info.nodeId == 0) std::cout << "checking for validity of " << nodeTuples.size() << std::endl; const bool anyInvalid = std::any_of(nodeTuples.begin(), @@ -405,13 +405,13 @@ ABCTuples specialDistribution(Info const& info, ABCTuples const& allTuples) { if (anyInvalid) throw "Some tuple is invalid in group-and-sort 
algorithm"; #endif - if (info.nodeId == 0) std::cout << "\treturning tuples...\n"; + WITH_DBG if (info.nodeId == 0) std::cout << "\treturning tuples...\n"; return nodeTuples; } // Distribution:1 ends here -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Main][Main:1]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Main][Main:1]] std::vector main(MPI_Comm universe, size_t Nv) { int rank, np; @@ -451,7 +451,7 @@ std::vector main(MPI_Comm universe, size_t Nv) { MPI_Comm_split(universe, color, key, &INTRA_COMM); // Main:1 ends here -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Main][Main:2]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Main][Main:2]] size_t const tuplesPerRankLocal = nodeTuples.size() / nodeInfos[rank].ranksPerNode @@ -479,7 +479,7 @@ LOG(1,"Atrip") << "ranks per node " << nodeInfos[rank].ranksPerNode << "\n"; LOG(1,"Atrip") << "#nodes " << nNodes << "\n"; // Main:2 ends here -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Main][Main:3]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Main][Main:3]] size_t const totalTuples = tuplesPerRankGlobal * nodeInfos[rank].ranksPerNode; @@ -491,7 +491,7 @@ if (computeDistribution) { } // Main:3 ends here -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Main][Main:4]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Main][Main:4]] { // construct mpi type for abctuple MPI_Datatype MPI_ABCTUPLE; @@ -515,13 +515,13 @@ if (computeDistribution) { } // Main:4 ends here -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Main][Main:5]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Main][Main:5]] return result; } // Main:5 ends here -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Interface][Interface:1]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Interface][Interface:1]] struct Distribution : public TuplesDistribution { ABCTuples getTuples(size_t Nv, MPI_Comm universe) override { return main(universe, Nv); @@ -529,10 +529,10 @@ struct Distribution : public TuplesDistribution { }; // Interface:1 ends here -// 
[[file:~/cc4s/src/atrip/complex/atrip.org::*Epilog][Epilog:1]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Epilog][Epilog:1]] } // namespace group_and_sort // Epilog:1 ends here -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Epilog][Epilog:1]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Epilog][Epilog:1]] } // Epilog:1 ends here diff --git a/include/atrip/Unions.hpp b/include/atrip/Unions.hpp index e651ef9..6c5e058 100644 --- a/include/atrip/Unions.hpp +++ b/include/atrip/Unions.hpp @@ -1,4 +1,4 @@ -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Unions][Unions:1]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Unions][Unions:1]] #pragma once #include diff --git a/include/atrip/Utils.hpp b/include/atrip/Utils.hpp index 83656c6..b5b9d6c 100644 --- a/include/atrip/Utils.hpp +++ b/include/atrip/Utils.hpp @@ -1,4 +1,4 @@ -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Prolog][Prolog:1]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Prolog][Prolog:1]] #pragma once #include #include @@ -11,7 +11,7 @@ namespace atrip { // Prolog:1 ends here -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Pretty%20printing][Pretty printing:1]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Pretty%20printing][Pretty printing:1]] template std::string pretty_print(T&& value) { std::stringstream stream; @@ -22,7 +22,7 @@ template } // Pretty printing:1 ends here -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Chrono][Chrono:1]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Chrono][Chrono:1]] #define WITH_CHRONO(__chrono_name, ...) 
\ Atrip::chrono[__chrono_name].start(); \ __VA_ARGS__ \ @@ -41,6 +41,6 @@ struct Timer { using Timings = std::map; // Chrono:1 ends here -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Epilog][Epilog:1]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Epilog][Epilog:1]] } // Epilog:1 ends here diff --git a/src/atrip/Atrip.cxx b/src/atrip/Atrip.cxx index b7823de..71436b2 100644 --- a/src/atrip/Atrip.cxx +++ b/src/atrip/Atrip.cxx @@ -1,4 +1,4 @@ -// [[file:~/cc4s/src/atrip/complex/atrip.org::*Main][Main:1]] +// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Main][Main:1]] #include #include