diff --git a/Makefile b/Makefile
index b9cdb91..715056e 100644
--- a/Makefile
+++ b/Makefile
@@ -3,7 +3,7 @@ CONFIG ?= gcc
 SOURCES_FILE := Sources.mk
 include $(SOURCES_FILE)
-include ./etc/emacs.mk
+include ./etc/make/emacs.mk
 include ./etc/config/$(CONFIG).mk
 include ./bench/config.mk
diff --git a/README.org b/README.org
index 9144b63..c40e20f 100644
--- a/README.org
+++ b/README.org
@@ -9,6 +9,7 @@ The algorithm uses two main data types, the =Slice= and the
 ** The slice
+
 #+begin_src c++ :tangle (atrip-slice-h)
 #pragma once
 #include
@@ -286,7 +287,7 @@ As an example, for the doubles amplitudes \( T^{ab}_{ij} \), one need two kinds
       throw std::domain_error( "Slice not found: " + pretty_print(info)
                              + " rank: "
-                             + pretty_print(cc4s::Cc4s::world->rank)
+                             + pretty_print(Atrip::rank)
                              );
     WITH_RANK << "__slice__:find: " << pretty_print(sliceIt->info) << "\n";
     return *sliceIt;
@@ -315,7 +316,7 @@ As an example, for the doubles amplitudes \( T^{ab}_{ij} \), one need two kinds
                              + ", " + pretty_print(type)
                              + " rank: "
-                             + pretty_print(cc4s::Cc4s::world->rank)
+                             + pretty_print(Atrip::rank)
                              );
     return *sliceIt;
   }
@@ -484,5 +485,2063 @@ std::ostream& operator<<(std::ostream& out, Slice::Info const& i) {
     ;
   return out;
 }
+
+} // namespace atrip
 #+end_src
+** Utils
+#+begin_src c++ :tangle (atrip-utils-h)
+#pragma once
+#include <sstream>
+#include <string>
+#include <map>
+#include <chrono>
+
+namespace atrip {
+
+  struct Atrip {
+    static size_t rank;
+  };
+  size_t Atrip::rank;
+
+  template <typename T>
+  std::string pretty_print(T&& value) {
+    std::stringstream stream;
+#if TRIPLES_DEBUG > 1
+    dbg::pretty_print(stream, std::forward<T>(value));
+#endif
+    return stream.str();
+  }
+
+#define WITH_CHRONO(__chrono, ...) \
+  __chrono.start(); __VA_ARGS__ __chrono.stop();
+
+  struct Timer {
+    using Clock = std::chrono::high_resolution_clock;
+    using Event = std::chrono::time_point<Clock>;
+    std::chrono::duration<double> duration;
+    Event _start;
+    inline void start() noexcept { _start = Clock::now(); }
+    inline void stop() noexcept { duration += Clock::now() - _start; }
+    inline void clear() noexcept { duration *= 0; }
+    inline double count() const noexcept { return duration.count(); }
+  };
+  using Timings = std::map<std::string, Timer>;
+}
+
+#+end_src
+
+** Include header
+
+#+begin_src c++ :tangle (atrip-main-h)
+#pragma once
+
+#define TRIPLES_BENCHMARK
+#define TRIPLES_DEBUG 1
+//#define TRIPLES_WORKLOAD_DUMP
+#define TRIPLES_USE_DGEMM
+//#define TRIPLES_PRINT_TUPLES
+
+#if TRIPLES_DEBUG == 4
+# pragma message("WARNING: You have OCD debugging ABC triples "\
+                 "expect GB of output and consult your therapist")
+# include
+# define HAVE_OCD
+# define OCD_Barrier(com) MPI_Barrier(com)
+# define WITH_OCD
+# define WITH_ROOT if (atrip::Atrip::rank == 0)
+# define WITH_SPECIAL(r) if (atrip::Atrip::rank == r)
+# define WITH_RANK std::cout << atrip::Atrip::rank << ": "
+# define WITH_CRAZY_DEBUG
+# define WITH_DBG
+# define DBG(...) dbg(__VA_ARGS__)
+#elif TRIPLES_DEBUG == 3
+# pragma message("WARNING: You have crazy debugging ABC triples,"\
+                 " expect GB of output")
+# include
+# define OCD_Barrier(com)
+# define WITH_OCD if (false)
+# define WITH_ROOT if (atrip::Atrip::rank == 0)
+# define WITH_SPECIAL(r) if (atrip::Atrip::rank == r)
+# define WITH_RANK std::cout << atrip::Atrip::rank << ": "
+# define WITH_CRAZY_DEBUG
+# define WITH_DBG
+# define DBG(...)
dbg(__VA_ARGS__) +#elif TRIPLES_DEBUG == 2 +# pragma message("WARNING: You have some debugging info for ABC triples") +# include +# define OCD_Barrier(com) +# define WITH_OCD if (false) +# define WITH_ROOT if (atrip::Atrip::rank == 0) +# define WITH_SPECIAL(r) if (atrip::Atrip::rank == r) +# define WITH_RANK std::cout << atrip::Atrip::rank << ": " +# define WITH_CRAZY_DEBUG if (false) +# define WITH_DBG +# define DBG(...) dbg(__VA_ARGS__) +#elif TRIPLES_DEBUG == 1 +# define OCD_Barrier(com) +# define WITH_OCD if (false) +# define WITH_ROOT if (false) +# define WITH_SPECIAL(r) if (false) +# define WITH_RANK if (false) std::cout << atrip::Atrip::rank << ": " +# define WITH_DBG if (false) +# define WITH_CRAZY_DEBUG if (false) +# define DBG(...) +#else +# error("TRIPLES_DEBUG is not defined!") +#endif + +#include +#include + + +#+end_src + + +** Todo :noexport: + #+begin_src c++ :tangle src/main.hpp +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace cc4s; + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace atrip { + extern "C" { + void dgemm_( + const char *transa, + const char *transb, + const int *m, + const int *n, + const int *k, + double *alpha, + const double *A, + const int *lda, + const double *B, + const int *ldb, + double *beta, + double *C, + const int *ldc + ); + } +} + + + +namespace atrip { + + +} + + +ALGORITHM_REGISTRAR_DEFINITION(PerturbativeTriplesAbcijk) + + + +// SLICE MAIN DEFININITION =============================================={{{1 +#+end_src + + +#+begin_src c++ :tangle src/main.hpp + + // RANKMAP =============================================================={{{1 + +struct RankMap { + + std::vector const lengths; + size_t const np, size; + + RankMap(std::vector lens, size_t np_) + : lengths(lens) + , np(np_) + , size(std::accumulate(lengths.begin(), lengths.end(), + 1, std::multiplies())) + { assert(lengths.size() <= 2); } + + size_t find(Slice::Location const& p) const noexcept { + return p.source * np + p.rank; + } + + size_t nSources() const noexcept { + return size / np + size_t(size % np != 0); + } + + + bool isPaddingRank(size_t rank) const noexcept { + return size % np == 0 + ? false + : rank > (size % np - 1) + ; + } + + bool isSourcePadding(size_t rank, size_t source) const noexcept { + return source == nSources() && isPaddingRank(rank); + } + + Slice::Location + find(ABCTuple const& abc, Slice::Type sliceType) const noexcept { + // tuple = {11, 8} when abc = {11, 8, 9} and sliceType = AB + const auto tuple = Slice::subtupleBySlice(abc, sliceType); + + const size_t index + = tuple[0] + + tuple[1] * (lengths.size() > 1 ? lengths[0] : 0) + ; + + return + { index % np + , index / np + }; + } + +}; + + + + // SLICEUNION ==========================================================={{{1 + + struct SliceUnion { + using F = double; + using Tensor = CTF::Tensor; + + virtual void + sliceIntoBuffer(size_t iteration, Tensor &to, Tensor const& from) = 0; + + /* + ,* This function should enforce an important property of a SliceUnion. + ,* Namely, there can be no two Slices of the same nature. + ,* + ,* This means that there can be at most one slice with a given Ty_x_Tu. 
+ ,*/ + void checkForDuplicates() const { + std::vector tytus; + for (auto const& s: slices) { + if (s.isFree()) continue; + tytus.push_back({s.info.type, s.info.tuple}); + } + + for (auto const& tytu: tytus) { + if (std::count(tytus.begin(), tytus.end(), tytu) > 1) + throw "Invariance violated, more than one slice with same Ty_x_Tu"; + } + + } + + std::vector neededSlices(ABCTuple const& abc) { + std::vector needed(sliceTypes.size()); + // build the needed vector + std::transform(sliceTypes.begin(), sliceTypes.end(), + needed.begin(), + [&abc](Slice::Type const type) { + auto tuple = Slice::subtupleBySlice(abc, type); + return std::make_pair(type, tuple); + }); + return needed; + } + + /* buildLocalDatabase + ,* + ,* It should build a database of slices so that we know what is needed + ,* to fetch in the next iteration represented by the tuple 'abc'. + ,* + ,* 1. The algorithm works as follows, we build a database of the all + ,* the slice types that we need together with their tuple. + ,* + ,* 2. Look in the SliceUnion if we already have this tuple, + ,* if we already have it mark it (TODO) + ,* + ,* 3. If we don't have the tuple, look for a (state=acceptor, type=blank) + ,* slice and mark this slice as type=Fetch with the corresponding type + ,* and tuple. + ,* + ,* NOTE: The algorithm should certify that we always have enough blank + ,* slices. + ,* + ,*/ + Slice::LocalDatabase buildLocalDatabase(ABCTuple const& abc) { + Slice::LocalDatabase result; + + auto const needed = neededSlices(abc); + + WITH_RANK << "__db__:needed:" << pretty_print(needed) << "\n"; + // BUILD THE DATABASE + // we need to loop over all sliceTypes that this TensorUnion + // is representing and find out how we will get the corresponding + // slice for the abc we are considering right now. + for (auto const& pair: needed) { + auto const type = pair.first; + auto const tuple = pair.second; + auto const from = rankMap.find(abc, type); + +#ifdef HAVE_OCD + WITH_RANK << "__db__:want:" << pretty_print(pair) << "\n"; + for (auto const& s: slices) + WITH_RANK << "__db__:guts:ocd " + << s.info << " pt " << s.data + << "\n"; +#endif + +#ifdef HAVE_OCD + WITH_RANK << "__db__: checking if exact match" << "\n"; +#endif + { + // FIRST: look up if there is already a *Ready* slice matching what we + // need + auto const& it + = std::find_if(slices.begin(), slices.end(), + [&tuple, &type](Slice const& other) { + return other.info.tuple == tuple + && other.info.type == type + // we only want another slice when it + // has already ready-to-use data + && other.isUnwrappable() + ; + }); + if (it != slices.end()) { + // if we find this slice, it means that we don't have to do anything + WITH_RANK << "__db__: EXACT: found EXACT in name=" << name + << " for tuple " << tuple[0] << ", " << tuple[1] + << " ptr " << it->data + << "\n"; + result.push_back({name, it->info}); + continue; + } + } + +#ifdef HAVE_OCD + WITH_RANK << "__db__: checking if recycle" << "\n"; +#endif + // Try to find a recyling possibility ie. find a slice with the same + // tuple and that has a valid data pointer. 
+ auto const& recycleIt + = std::find_if(slices.begin(), slices.end(), + [&tuple, &type](Slice const& other) { + return other.info.tuple == tuple + && other.info.type != type + && other.isRecyclable() + ; + }); + + // if we find this recylce, then we find a Blank slice + // (which should exist by construction :THINK) + // + if (recycleIt != slices.end()) { + auto& blank = Slice::findOneByType(slices, Slice::Blank); + // TODO: formalize this through a method to copy information + // from another slice + blank.data = recycleIt->data; + blank.info.type = type; + blank.info.tuple = tuple; + blank.info.state = Slice::Recycled; + blank.info.from = from; + blank.info.recycling = recycleIt->info.type; + result.push_back({name, blank.info}); + WITH_RANK << "__db__: RECYCLING: n" << name + << " " << pretty_print(abc) + << " get " << pretty_print(blank.info) + << " from " << pretty_print(recycleIt->info) + << " ptr " << recycleIt->data + << "\n" + ; + continue; + } + + // in this case we have to create a new slice + // this means that we should have a blank slice at our disposal + // and also the freePointers should have some elements inside, + // so we pop a data pointer from the freePointers container +#ifdef HAVE_OCD + WITH_RANK << "__db__: none work, doing new" << "\n"; +#endif + { + WITH_RANK << "__db__: NEW: finding blank in " << name + << " for type " << type + << " for tuple " << tuple[0] << ", " << tuple[1] + << "\n" + ; + auto& blank = Slice::findOneByType(slices, Slice::Blank); + blank.info.type = type; + blank.info.tuple = tuple; + blank.info.from = from; + + // Handle self sufficiency + blank.info.state = cc4s::Cc4s::world->rank == from.rank + ? Slice::SelfSufficient + : Slice::Fetch + ; + if (blank.info.state == Slice::SelfSufficient) { + blank.data = sources[from.source].data(); + } else { + if (freePointers.size() == 0) + throw std::domain_error("No more free pointers!"); + auto dataPointer = freePointers.begin(); + freePointers.erase(dataPointer); + blank.data = *dataPointer; + } + + result.push_back({name, blank.info}); + continue; + } + + } + +#ifdef HAVE_OCD + for (auto const& s: slices) + WITH_RANK << "__db__:guts:ocd:__end__ " << s.info << "\n"; +#endif + + + return result; + + } + + /* + ,* Garbage collect slices not needed for the next iteration. + ,* + ,* It will throw if it tries to gc a slice that has not been + ,* previously unwrapped, as a safety mechanism. + ,*/ + void clearUnusedSlicesForNext(ABCTuple const& abc) { + auto const needed = neededSlices(abc); + + // CLEAN UP SLICES, FREE THE ONES THAT ARE NOT NEEDED ANYMORE + for (auto& slice: slices) { + // if the slice is free, then it was not used anyways + if (slice.isFree()) continue; + + + // try to find the slice in the needed slices list + auto const found + = std::find_if(needed.begin(), needed.end(), + [&slice] (Slice::Ty_x_Tu const& tytu) { + return slice.info.tuple == tytu.second + && slice.info.type == tytu.first + ; + }); + + // if we did not find slice in needed, then erase it + if (found == needed.end()) { + + // We have to be careful about the data pointer, + // for SelfSufficient, the data pointer is a source pointer + // of the slice, so we should just wipe it. + // + // For Ready slices, we have to be careful if there are some + // recycled slices depending on it. + bool freeSlicePointer = true; + + // allow to gc unwrapped and recycled, never Fetch, + // if we have a Fetch slice then something has gone very wrong. 
+ if (!slice.isUnwrapped() && slice.info.state != Slice::Recycled) + throw + std::domain_error("Trying to garbage collect " + " a non-unwrapped slice! " + + pretty_print(&slice) + + pretty_print(slice.info)); + + // it can be that our slice is ready, but it has some hanging + // references lying around in the form of a recycled slice. + // Of course if we need the recycled slice the next iteration + // this would be fatal, because we would then free the pointer + // of the slice and at some point in the future we would + // overwrite it. Therefore, we must check if slice has some + // references in slices and if so then + // + // - we should mark those references as the original (since the data + // pointer should be the same) + // + // - we should make sure that the data pointer of slice + // does not get freed. + // + if (slice.info.state == Slice::Ready) { + WITH_OCD WITH_RANK + << "__gc__:" << "checking for data recycled dependencies\n"; + auto recycled + = Slice::hasRecycledReferencingToIt(slices, slice.info); + if (recycled.size()) { + Slice* newReady = recycled[0]; + WITH_OCD WITH_RANK + << "__gc__:" << "swaping recycled " + << pretty_print(newReady->info) + << " and " + << pretty_print(slice.info) + << "\n"; + newReady->markReady(); + assert(newReady->data == slice.data); + freeSlicePointer = false; + + for (size_t i = 1; i < recycled.size(); i++) { + auto newRecyled = recycled[i]; + newRecyled->info.recycling = newReady->info.type; + WITH_OCD WITH_RANK + << "__gc__:" << "updating recycled " + << pretty_print(newRecyled->info) + << "\n"; + } + + } + } + + // if the slice is self sufficient, do not dare touching the + // pointer, since it is a pointer to our sources in our rank. + if ( slice.info.state == Slice::SelfSufficient + || slice.info.state == Slice::Recycled + ) { + freeSlicePointer = false; + } + + // make sure we get its data pointer to be used later + // only for non-recycled, since it can be that we need + // for next iteration the data of the slice that the recycled points + // to + if (freeSlicePointer) { + freePointers.insert(slice.data); + WITH_RANK << "~~~:cl(" << name << ")" + << " added to freePointer " + << pretty_print(freePointers) + << "\n"; + } else { + WITH_OCD WITH_RANK << "__gc__:not touching the free Pointer\n"; + } + + // at this point, let us blank the slice + WITH_RANK << "~~~:cl(" << name << ")" + << " freeing up slice " + << " info " << slice.info + << "\n"; + slice.free(); + } + + } + } + + // CONSTRUCTOR + SliceUnion( Tensor const& sourceTensor + , std::vector sliceTypes_ + , std::vector sliceLength_ + , std::vector paramLength + , size_t np + , MPI_Comm child_world + , MPI_Comm global_world + , Slice::Name name_ + , size_t nSliceBuffers = 4 + ) + : rankMap(paramLength, np) + , world(child_world) + , universe(global_world) + , sliceLength(sliceLength_) + , sources(rankMap.nSources(), + std::vector + (std::accumulate(sliceLength.begin(), + sliceLength.end(), + 1, std::multiplies()))) + , name(name_) + , sliceTypes(sliceTypes_) + , sliceBuffers(nSliceBuffers, sources[0]) + //, slices(2 * sliceTypes.size(), Slice{ sources[0].size() }) + { // constructor begin + + LOG(0,"NEW_TRIPLES") << "INIT SliceUnion: " << name << "\n"; + + slices + = std::vector(2 * sliceTypes.size(), { sources[0].size() }); + // TODO: think exactly ^------------------- about this number + + // initialize the freePointers with the pointers to the buffers + std::transform(sliceBuffers.begin(), sliceBuffers.end(), + std::inserter(freePointers, freePointers.begin()), + 
[](std::vector &vec) { return vec.data(); }); + + + + LOG(1,"NEW_TRIPLES") << "rankMap.nSources " + << rankMap.nSources() << "\n"; + LOG(1,"NEW_TRIPLES") << "#slices " + << slices.size() << "\n"; + LOG(1,"NEW_TRIPLES") << "#slices[0] " + << slices[0].size << "\n"; + LOG(1,"NEW_TRIPLES") << "#sources " + << sources.size() << "\n"; + LOG(1,"NEW_TRIPLES") << "#sources[0] " + << sources[0].size() << "\n"; + LOG(1,"NEW_TRIPLES") << "#freePointers " + << freePointers.size() << "\n"; + LOG(1,"NEW_TRIPLES") << "#sliceBuffers " + << sliceBuffers.size() << "\n"; + LOG(1,"NEW_TRIPLES") << "#sliceBuffers[0] " + << sliceBuffers[0].size() << "\n"; + LOG(1,"NEW_TRIPLES") << "#sliceLength " + << sliceLength.size() << "\n"; + LOG(1,"NEW_TRIPLES") << "#paramLength " + << paramLength.size() << "\n"; + LOG(1,"NEW_TRIPLES") << "GB*" << np << " " + << double(sources.size() + sliceBuffers.size()) + ,* sources[0].size() + ,* 8 * np + / 1073741824.0 + << "\n"; + } // constructor ends + + void init(Tensor const& sourceTensor) { + + CTF::World w(world); + const int rank = cc4s::Cc4s::world->rank + , order = sliceLength.size() + ; + std::vector const syms(order, NS); + std::vector __sliceLength(sliceLength.begin(), sliceLength.end()); + Tensor toSliceInto(order, + __sliceLength.data(), + syms.data(), + w); + LOG(1,"NEW_TRIPLES") << "slicing... \n"; + + // setUp sources + for (size_t it(0); it < rankMap.nSources(); ++it) { + const size_t + source = rankMap.isSourcePadding(rank, source) ? 0 : it; + WITH_OCD + WITH_RANK + << "Init:toSliceInto it-" << it + << " :: source " << source << "\n"; + sliceIntoBuffer(source, toSliceInto, sourceTensor); + } + + } + + /** + ,* \brief Send asynchronously only if the state is Fetch + ,*/ + void send( size_t otherRank + , Slice::Info const& info + , size_t tag) const noexcept { + MPI_Request request; + bool sendData_p = false; + + if (info.state == Slice::Fetch) sendData_p = true; + // TODO: remove this because I have SelfSufficient + if (otherRank == info.from.rank) sendData_p = false; + if (!sendData_p) return; + + MPI_Isend( sources[info.from.source].data() + , sources[info.from.source].size() + , MPI_DOUBLE /* TODO: adapt this with traits */ + , otherRank + , tag + , universe + , &request + ); + WITH_CRAZY_DEBUG + WITH_RANK << "sent to " << otherRank << "\n"; + + } + + /** + ,* \brief Receive asynchronously only if the state is Fetch + ,*/ + void receive(Slice::Info const& info, size_t tag) noexcept { + auto& slice = Slice::findByInfo(slices, info); + + if (cc4s::Cc4s::world->rank == info.from.rank) return; + + if (slice.info.state == Slice::Fetch) { + // TODO: do it through the slice class + slice.info.state = Slice::Dispatched; + MPI_Request request; + slice.request = request; + MPI_Irecv( slice.data + , slice.size + , MPI_DOUBLE // TODO: Adapt this with traits + , info.from.rank + , tag + , universe + , &slice.request + //, MPI_STATUS_IGNORE + ); + } + } + + void unwrapAll(ABCTuple const& abc) { + for (auto type: sliceTypes) unwrapSlice(type, abc); + } + + F* unwrapSlice(Slice::Type type, ABCTuple const& abc) { + WITH_CRAZY_DEBUG + WITH_RANK << "__unwrap__:slice " << type << " w n " + << name + << " abc" << pretty_print(abc) + << "\n"; + auto& slice = Slice::findByTypeAbc(slices, type, abc); + WITH_RANK << "__unwrap__:info " << slice.info << "\n"; + switch (slice.info.state) { + case Slice::Dispatched: + WITH_RANK << "__unwrap__:Fetch: " << &slice + << " info " << pretty_print(slice.info) + << "\n"; + slice.unwrapAndMarkReady(); + return slice.data; + break; + case 
Slice::SelfSufficient: + WITH_RANK << "__unwrap__:SelfSufficient: " << &slice + << " info " << pretty_print(slice.info) + << "\n"; + return slice.data; + break; + case Slice::Ready: + WITH_RANK << "__unwrap__:READY: UNWRAPPED ALREADY" << &slice + << " info " << pretty_print(slice.info) + << "\n"; + return slice.data; + break; + case Slice::Recycled: + WITH_RANK << "__unwrap__:RECYCLED " << &slice + << " info " << pretty_print(slice.info) + << "\n"; + return unwrapSlice(slice.info.recycling, abc); + break; + case Slice::Fetch: + case Slice::Acceptor: + throw std::domain_error("Can't unwrap an acceptor or fetch slice!"); + break; + default: + throw std::domain_error("Unknown error unwrapping slice!"); + } + return slice.data; + } + + const RankMap rankMap; + const MPI_Comm world; + const MPI_Comm universe; + const std::vector sliceLength; + std::vector< std::vector > sources; + std::vector< Slice > slices; + Slice::Name name; + const std::vector sliceTypes; + std::vector< std::vector > sliceBuffers; + std::set freePointers; + + }; + + + + void sliceIntoVector + ( std::vector &v + , CTF::Tensor &toSlice + , std::vector const low + , std::vector const up + , CTF::Tensor const& origin + , std::vector const originLow + , std::vector const originUp + ) { + // Thank you CTF for forcing me to do this + struct { std::vector up, low; } + toSlice_ = { {up.begin(), up.end()} + , {low.begin(), low.end()} } + , origin_ = { {originUp.begin(), originUp.end()} + , {originLow.begin(), originLow.end()} } + ; + + WITH_OCD + WITH_RANK << "slicing into " << pretty_print(toSlice_.up) + << "," << pretty_print(toSlice_.low) + << " from " << pretty_print(origin_.up) + << "," << pretty_print(origin_.low) + << "\n"; + +#ifdef TODO + toSlice.slice( toSlice_.low.data() + , toSlice_.up.data() + , 0.0 + , origin + , origin_.low.data() + , origin_.up.data() + , 1.0); + memcpy(v.data(), toSlice.data, sizeof(double) * v.size()); +#endif + + } + + + struct TAPHH : public SliceUnion { + TAPHH( Tensor const& sourceTensor + , size_t No + , size_t Nv + , size_t np + , MPI_Comm child_world + , MPI_Comm global_world + ) : SliceUnion( sourceTensor + , {Slice::A, Slice::B, Slice::C} + , {Nv, No, No} // size of the slices + , {Nv} + , np + , child_world + , global_world + , Slice::TA + , 4) { + init(sourceTensor); + } + + void sliceIntoBuffer(size_t it, Tensor &to, Tensor const& from) override + { + const int rank = cc4s::Cc4s::world->rank + , Nv = sliceLength[0] + , No = sliceLength[1] + , a = rankMap.find({rank, it}); + ; + + + sliceIntoVector( sources[it] + , to, {0, 0, 0}, {Nv, No, No} + , from, {a, 0, 0, 0}, {a+1, Nv, No, No} + ); + + } + + }; + + + struct HHHA : public SliceUnion { + HHHA( Tensor const& sourceTensor + , size_t No + , size_t Nv + , size_t np + , MPI_Comm child_world + , MPI_Comm global_world + ) : SliceUnion( sourceTensor + , {Slice::A, Slice::B, Slice::C} + , {No, No, No} // size of the slices + , {Nv} // size of the parametrization + , np + , child_world + , global_world + , Slice::VIJKA + , 4) { + init(sourceTensor); + } + + void sliceIntoBuffer(size_t it, Tensor &to, Tensor const& from) override + { + + const int rank = cc4s::Cc4s::world->rank + , No = sliceLength[0] + , a = rankMap.find({rank, it}) + ; + + sliceIntoVector( sources[it] + , to, {0, 0, 0}, {No, No, No} + , from, {0, 0, 0, a}, {No, No, No, a+1} + ); + + } + }; + + struct ABPH : public SliceUnion { + ABPH( Tensor const& sourceTensor + , size_t No + , size_t Nv + , size_t np + , MPI_Comm child_world + , MPI_Comm global_world + ) : SliceUnion( 
sourceTensor + , { Slice::AB, Slice::BC, Slice::AC + , Slice::BA, Slice::CB, Slice::CA + } + , {Nv, No} // size of the slices + , {Nv, Nv} // size of the parametrization + , np + , child_world + , global_world + , Slice::VABCI + , 2*6) { + init(sourceTensor); + } + + void sliceIntoBuffer(size_t it, Tensor &to, Tensor const& from) override { + + const int Nv = sliceLength[0] + , No = sliceLength[1] + , rank = cc4s::Cc4s::world->rank + , el = rankMap.find({rank, it}) + , a = el % Nv + , b = el / Nv + ; + + + sliceIntoVector( sources[it] + , to, {0, 0}, {Nv, No} + , from, {a, b, 0, 0}, {a+1, b+1, Nv, No} + ); + + } + + }; + + struct ABHH : public SliceUnion { + ABHH( Tensor const& sourceTensor + , size_t No + , size_t Nv + , size_t np + , MPI_Comm child_world + , MPI_Comm global_world + ) : SliceUnion( sourceTensor + , {Slice::AB, Slice::BC, Slice::AC} + , {No, No} // size of the slices + , {Nv, Nv} // size of the parametrization + , np + , child_world + , global_world + , Slice::VABIJ + , 6) { + init(sourceTensor); + } + + void sliceIntoBuffer(size_t it, Tensor &to, Tensor const& from) override { + + const int Nv = from.lens[0] + , No = sliceLength[1] + , rank = cc4s::Cc4s::world->rank + , el = rankMap.find({rank, it}) + , a = el % Nv + , b = el / Nv + ; + + sliceIntoVector( sources[it] + , to, {0, 0}, {No, No} + , from, {a, b, 0, 0}, {a+1, b+1, No, No} + ); + + + } + + }; + + + struct TABHH : public SliceUnion { + TABHH( Tensor const& sourceTensor + , size_t No + , size_t Nv + , size_t np + , MPI_Comm child_world + , MPI_Comm global_world + ) : SliceUnion( sourceTensor + , {Slice::AB, Slice::BC, Slice::AC} + , {No, No} // size of the slices + , {Nv, Nv} // size of the parametrization + , np + , child_world + , global_world + , Slice::TABIJ + , 6) { + init(sourceTensor); + } + + void sliceIntoBuffer(size_t it, Tensor &to, Tensor const& from) override { + // TODO: maybe generalize this with ABHH + + const int Nv = from.lens[0] + , No = sliceLength[1] + , rank = cc4s::Cc4s::world->rank + , el = rankMap.find({rank, it}) + , a = el % Nv + , b = el / Nv + ; + + sliceIntoVector( sources[it] + , to, {0, 0}, {No, No} + , from, {a, b, 0, 0}, {a+1, b+1, No, No} + ); + + + } + + }; + + // PHYSICS ENERGY, SINGLES ... =========================================={{{1 + +double cc4s::PerturbativeTriplesAbcijk::getEnergyDistinct + ( const double epsabc + , std::vector const& epsi + , std::vector const& Tijk_ + , std::vector const& Zijk_ + ) { + constexpr size_t blockSize=16; + double energy(0.); + const size_t No = epsi.size(); + for (size_t kk=0; kk k ? kk : k; + for (size_t j(jstart); j < jend; j++){ + const double ej(epsi[j]); + double facjk( j == k ? 0.5 : 1.0); + size_t istart = ii > j ? ii : j; + for (size_t i(istart); i < iend; i++){ + const double ei(epsi[i]); + double facij ( i==j ? 
0.5 : 1.0); + double denominator(epsabc - ei - ej - ek); + double U(Zijk_[i + No*j + No*No*k]); + double V(Zijk_[i + No*k + No*No*j]); + double W(Zijk_[j + No*i + No*No*k]); + double X(Zijk_[j + No*k + No*No*i]); + double Y(Zijk_[k + No*i + No*No*j]); + double Z(Zijk_[k + No*j + No*No*i]); + + double A(Tijk_[i + No*j + No*No*k]); + double B(Tijk_[i + No*k + No*No*j]); + double C(Tijk_[j + No*i + No*No*k]); + double D(Tijk_[j + No*k + No*No*i]); + double E(Tijk_[k + No*i + No*No*j]); + double F(Tijk_[k + No*j + No*No*i]); + double value(3.0*(A*U+B*V+C*W+D*X+E*Y+F*Z) + +((U+X+Y)-2.0*(V+W+Z))*(A+D+E) + +((V+W+Z)-2.0*(U+X+Y))*(B+C+F)); + energy += 2.0*value / denominator * facjk * facij; + } // i + } // j + } // k + } // ii + } // jj + } // kk + return energy; +} + + +double cc4s::PerturbativeTriplesAbcijk::getEnergySame + ( const double epsabc + , std::vector const& epsi + , std::vector const& Tijk_ + , std::vector const& Zijk_ + ) { + constexpr size_t blockSize = 16; + const size_t No = epsi.size(); + double energy(0.); + for (size_t kk=0; kk k ? jj : k; + for(size_t j(jstart); j < jend; j++){ + const double facjk( j == k ? 0.5 : 1.0); + const double ej(epsi[j]); + const size_t istart = ii > j ? ii : j; + for(size_t i(istart); i < iend; i++){ + double ei(epsi[i]); + double facij ( i==j ? 0.5 : 1.0); + double denominator(epsabc - ei - ej - ek); + double U(Zijk_[i + No*j + No*No*k]); + double V(Zijk_[j + No*k + No*No*i]); + double W(Zijk_[k + No*i + No*No*j]); + double A(Tijk_[i + No*j + No*No*k]); + double B(Tijk_[j + No*k + No*No*i]); + double C(Tijk_[k + No*i + No*No*j]); + double value(3.0*( A*U + B*V + C*W) - (A+B+C)*(U+V+W)); + energy += 2.0*value / denominator * facjk * facij; + } // i + } // j + } // k + } // ii + } // jj + } // kk + return energy; +} + +void cc4s::PerturbativeTriplesAbcijk::singlesContribution + ( const ABCTuple &abc + , double const* Tph + , double const* VABij + , double const* VACij + , double const* VBCij + , double *Zijk + ) { + const size_t a(abc[0]), b(abc[1]), c(abc[2]); + for (size_t k=0; k < No; k++) + for (size_t i=0; i < No; i++) + for (size_t j=0; j < No; j++) { + const size_t ijk = i + j*No + k*No*No + , jk = j + No * k + ; + Zijk[ijk] += Tph[ a + i * Nv ] * VBCij[ j + k * No ]; + Zijk[ijk] += Tph[ b + j * Nv ] * VACij[ i + k * No ]; + Zijk[ijk] += Tph[ c + k * Nv ] * VABij[ i + j * No ]; + } +} + +void doublesContribution + ( const ABCTuple &abc + , size_t const No + , size_t const Nv + // -- VABCI + , double const* VABph + , double const* VACph + , double const* VBCph + , double const* VBAph + , double const* VCAph + , double const* VCBph + // -- VHHHA + , double const* VhhhA + , double const* VhhhB + , double const* VhhhC + // -- TA + , double const* TAphh + , double const* TBphh + , double const* TCphh + // -- TABIJ + , double const* TABhh + , double const* TAChh + , double const* TBChh + // -- TIJK + , double *Tijk + , atrip::Timings& chrono + ) { + + auto& t_reorder = chrono["doubles:reorder"]; + const size_t a = abc[0], b = abc[1], c = abc[2] + , NoNo = No*No, NoNv = No*Nv + ; + +#if defined(TRIPLES_USE_DGEMM) +#define _IJK_(i, j, k) i + j*No + k*NoNo +#define REORDER(__II, __JJ, __KK) \ + t_reorder.start(); \ + for (size_t k = 0; k < No; k++) \ + for (size_t j = 0; j < No; j++) \ + for (size_t i = 0; i < No; i++) { \ + Tijk[_IJK_(i, j, k)] += _t_buffer[_IJK_(__II, __JJ, __KK)]; \ + } \ + t_reorder.stop(); +#define DGEMM_PARTICLES(__A, __B) \ + atrip::dgemm_( "T" \ + , "N" \ + , (int const*)&NoNo \ + , (int const*)&No \ + , (int const*)&Nv \ + , 
&one \ + , __A \ + , (int const*)&Nv \ + , __B \ + , (int const*)&Nv \ + , &zero \ + , _t_buffer.data() \ + , (int const*)&NoNo \ + ); +#define DGEMM_HOLES(__A, __B, __TRANSB) \ + atrip::dgemm_( "N" \ + , __TRANSB \ + , (int const*)&NoNo \ + , (int const*)&No \ + , (int const*)&No \ + , &m_one \ + , __A \ + , (int const*)&NoNo \ + , __B \ + , (int const*)&No \ + , &zero \ + , _t_buffer.data() \ + , (int const*)&NoNo \ + ); + + using F = double; + const size_t NoNoNo = No*NoNo; + std::vector _t_buffer; + _t_buffer.reserve(NoNoNo); + F one{1.0}, m_one{-1.0}, zero{0.0}; + + t_reorder.start(); + for (size_t k = 0; k < NoNoNo; k++) { + // zero the Tijk + Tijk[k] = 0.0; + } + t_reorder.stop(); + + chrono["doubles:holes"].start(); + { // Holes part ============================================================ + // VhhhC[i + k*No + L*NoNo] * TABhh[L + j*No]; H1 + chrono["doubles:holes:1"].start(); + DGEMM_HOLES(VhhhC, TABhh, "N") + REORDER(i, k, j) + chrono["doubles:holes:1"].stop(); + // VhhhC[j + k*No + L*NoNo] * TABhh[i + L*No]; H0 + chrono["doubles:holes:2"].start(); + DGEMM_HOLES(VhhhC, TABhh, "T") + REORDER(j, k, i) + chrono["doubles:holes:2"].stop(); + // VhhhB[i + j*No + L*NoNo] * TAChh[L + k*No]; H5 + chrono["doubles:holes:3"].start(); + DGEMM_HOLES(VhhhB, TAChh, "N") + REORDER(i, j, k) + chrono["doubles:holes:3"].stop(); + // VhhhB[k + j*No + L*NoNo] * TAChh[i + L*No]; H3 + chrono["doubles:holes:4"].start(); + DGEMM_HOLES(VhhhB, TAChh, "T") + REORDER(k, j, i) + chrono["doubles:holes:4"].stop(); + // VhhhA[j + i*No + L*NoNo] * TBChh[L + k*No]; H1 + chrono["doubles:holes:5"].start(); + DGEMM_HOLES(VhhhA, TBChh, "N") + REORDER(j, i, k) + chrono["doubles:holes:5"].stop(); + // VhhhA[k + i*No + L*NoNo] * TBChh[j + L*No]; H4 + chrono["doubles:holes:6"].start(); + DGEMM_HOLES(VhhhA, TBChh, "T") + REORDER(k, i, j) + chrono["doubles:holes:6"].stop(); + } + chrono["doubles:holes"].stop(); + + chrono["doubles:particles"].start(); + { // Particle part ========================================================= + // TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv]; P0 + chrono["doubles:particles:1"].start(); + DGEMM_PARTICLES(TAphh, VBCph) + REORDER(i, j, k) + chrono["doubles:particles:1"].stop(); + // TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv]; P3 + chrono["doubles:particles:2"].start(); + DGEMM_PARTICLES(TAphh, VCBph) + REORDER(i, k, j) + chrono["doubles:particles:2"].stop(); + // TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv]; P5 + chrono["doubles:particles:3"].start(); + DGEMM_PARTICLES(TCphh, VABph) + REORDER(k, i, j) + chrono["doubles:particles:3"].stop(); + // TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv]; P2 + chrono["doubles:particles:4"].start(); + DGEMM_PARTICLES(TCphh, VBAph) + REORDER(k, j, i) + chrono["doubles:particles:4"].stop(); + // TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv]; P1 + chrono["doubles:particles:5"].start(); + DGEMM_PARTICLES(TBphh, VACph) + REORDER(j, i, k) + chrono["doubles:particles:5"].stop(); + // TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv]; P4 + chrono["doubles:particles:6"].start(); + DGEMM_PARTICLES(TBphh, VCAph) + REORDER(j, k, i) + chrono["doubles:particles:6"].stop(); + } + chrono["doubles:particles"].stop(); + +#undef REORDER +#undef DGEMM_HOLES +#undef DGEMM_PARTICLES +#undef _IJK_ +#else + for (size_t k = 0; k < No; k++) + for (size_t j = 0; j < No; j++) + for (size_t i = 0; i < No; i++){ + const size_t ijk = i + j*No + k*NoNo + , jk = j + k*No + ; + Tijk[ijk] = 0.0; // :important + // HOLE DIAGRAMS: TABHH and VHHHA + for (size_t L = 0; L < No; L++){ + // t[abLj] * V[Lcik] 
H1 + // t[baLi] * V[Lcjk] H0 TODO: conjugate T for complex + Tijk[ijk] -= TABhh[L + j*No] * VhhhC[i + k*No + L*NoNo]; + Tijk[ijk] -= TABhh[i + L*No] * VhhhC[j + k*No + L*NoNo]; + + // t[acLk] * V[Lbij] H5 + // t[caLi] * V[Lbkj] H3 + Tijk[ijk] -= TAChh[L + k*No] * VhhhB[i + j*No + L*NoNo]; + Tijk[ijk] -= TAChh[i + L*No] * VhhhB[k + j*No + L*NoNo]; + + // t[bcLk] * V[Laji] H2 + // t[cbLj] * V[Laki] H4 + Tijk[ijk] -= TBChh[L + k*No] * VhhhA[j + i*No + L*NoNo]; + Tijk[ijk] -= TBChh[j + L*No] * VhhhA[k + i*No + L*NoNo]; + } + // PARTILCE DIAGRAMS: TAPHH and VABPH + for (size_t E = 0; E < Nv; E++) { + // t[aEij] * V[bcEk] P0 + // t[aEik] * V[cbEj] P3 // TODO: CHECK THIS ONE, I DONT KNOW + Tijk[ijk] += TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv]; + Tijk[ijk] += TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv]; + + // t[cEki] * V[abEj] P5 + // t[cEkj] * V[baEi] P2 + Tijk[ijk] += TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv]; + Tijk[ijk] += TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv]; + + // t[bEji] * V[acEk] P1 + // t[bEjk] * V[caEi] P4 + Tijk[ijk] += TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv]; + Tijk[ijk] += TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv]; + } + + } +#endif +} + + +// HELPER FUNCTIONS ====================================================={{{1 + +using ABCTuples = std::vector; + +ABCTuples getTuplesList(size_t Nv) { + const size_t n = Nv * (Nv + 1) * (Nv + 2) / 6 - Nv; + ABCTuples result(n); + size_t u(0); + + for (size_t a(0); a < Nv; a++) + for (size_t b(a); b < Nv; b++) + for (size_t c(b); c < Nv; c++){ + if ( a == b && b == c ) continue; + result[u++] = {a, b, c}; + } + + return result; + +} + + +std::pair +getABCRange(size_t np, size_t rank, ABCTuples const& tuplesList) { + + std::vector n_tuples_per_rank(np, tuplesList.size()/np); + const size_t + // how many valid tuples should we still verteilen to nodes + // since the number of tuples is not divisible by the number of nodes + nRoundRobin = tuplesList.size() % np + // every node must have the sanme amount of tuples in order for the + // other nodes to receive and send somewhere, therefore + // some nodes will get extra tuples but that are dummy tuples + , nExtraInvalid = (np - nRoundRobin) % np + ; + + if (nRoundRobin) for (int i = 0; i < np; i++) n_tuples_per_rank[i]++; + +#if defined(TODO) + assert( tuplesList.size() + == + ( std::accumulate(n_tuples_per_rank.begin(), + n_tuples_per_rank.end(), + 0) + + nExtraInvalid + )); +#endif + + WITH_RANK << "nRoundRobin = " << nRoundRobin << "\n"; + WITH_RANK << "nExtraInvalid = " << nExtraInvalid << "\n"; + WITH_RANK << "ntuples = " << n_tuples_per_rank[rank] << "\n"; + + auto const& it = n_tuples_per_rank.begin(); + + return + { std::accumulate(it, it + rank , 0) + , std::accumulate(it, it + rank + 1, 0) + }; + +} + + +SliceUnion& +unionByName(std::vector const& unions, Slice::Name name) { + const auto sliceUnionIt + = std::find_if(unions.begin(), unions.end(), + [&name](SliceUnion const* s) { + return name == s->name; + }); + if (sliceUnionIt == unions.end()) + throw std::domain_error("SliceUnion not found!"); + return **sliceUnionIt; +} + +#if defined(TRIPLES_WORKLOAD_DUMP) + struct WorkloadEntry { + size_t rank, iteration; + ABCTuple abc; + size_t name; + size_t send; + size_t recv; + size_t doubles; + size_t barrier; + size_t allgather; + size_t unwrap; + + static + size_t findRecv(SliceUnion const& u) { + return std::count_if(u.slices.begin(), u.slices.end(), + [&u](Slice const& slice) { + return slice.info.state == Slice::Dispatched; + }); + } + static + size_t findSend( Slice::Database 
const& db + , SliceUnion const& u + , size_t rank) { + using Element = Slice::LocalDatabaseElement; + return std::count_if(db.begin(), db.end(), + [&u, rank](Element const& el) { + return el.name == u.name + && el.info.from.rank == rank + // this should work since the database + // has the non-updated Slice::Info + && el.info.state == Slice::Fetch + ; + }); + } + }; +#endif + +// MAIN ALGORITHM ======================================================{{{1 +void cc4s::PerturbativeTriplesAbcijk::run(){ + + const int np = cc4s::Cc4s::world->np; + const int rank = cc4s::Cc4s::world->rank; + MPI_Comm universe = cc4s::Cc4s::world->comm; + + // Timings in seconds ================================================{{{1 + atrip::Timings chrono{}; + + // Get the distributed ctf tensor data + CTF::Tensor<> *ei(getTensorArgument("HoleEigenEnergies")) + , *ea(getTensorArgument("ParticleEigenEnergies")) + , *Tph(getTensorArgument("CcsdSinglesAmplitudes")) + , *Tpphh(getTensorArgument("CcsdDoublesAmplitudes")) + , *Vpphh(getTensorArgument("PPHHCoulombIntegrals")) + , *Vhhhp(getTensorArgument("HHHPCoulombIntegrals")) + , *Vppph(getTensorArgument("PPPHCoulombIntegrals")) + ; + + No = ei->lens[0]; + Nv = ea->lens[0]; + LOG(0,"NEW_TRIPLES") << "No: " << No << "\n"; + LOG(0,"NEW_TRIPLES") << "Nv: " << Nv << "\n"; + + // allocate the three scratches, see piecuch + std::vector Tijk(No*No*No) // doubles only (see piecuch) + , Zijk(No*No*No) // singles + doubles (see piecuch) + // we need local copies of the following tensors on every + // rank + , epsi(No) + , epsa(Nv) + , Tai(No * Nv) + ; + + ei->read_all(epsi.data()); + ea->read_all(epsa.data()); + Tph->read_all(Tai.data()); + + // COMMUNICATOR CONSTRUCTION ========================================={{{1 + // + // Construct a new communicator living only on a single rank + int child_size = 1 + , child_rank + ; + const + int color = rank / child_size + , crank = rank % child_size + ; + MPI_Comm child_comm; + if (np == 1) { + child_comm = universe; + } else { + MPI_Comm_split(cc4s::Cc4s::world->comm, color, crank, &child_comm); + MPI_Comm_rank(child_comm, &child_rank); + MPI_Comm_size(child_comm, &child_size); + //CTF::World child_world(child_comm); + } + + + chrono["nv-slices"].start(); + // BUILD SLICES PARAMETRIZED BY NV ==================================={{{1 + LOG(0,"NEW_TRIPLES") << "BUILD NV-SLICES\n"; + TAPHH taphh(*Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); + HHHA hhha(*Vhhhp, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); + chrono["nv-slices"].stop(); + + chrono["nv-nv-slices"].start(); + // BUILD SLICES PARAMETRIZED BY NV x NV =============================={{{1 + LOG(0,"NEW_TRIPLES") << "BUILD NV x NV-SLICES\n"; + ABPH abph(*Vppph, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); + ABHH abhh(*Vpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); + TABHH tabhh(*Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); + chrono["nv-nv-slices"].stop(); + + // all tensors + std::vector< SliceUnion* > unions = {&taphh, &hhha, &abph, &abhh, &tabhh}; + + //CONSTRUCT TUPLE LIST ==============================================={{{1 + LOG(0,"NEW_TRIPLES") << "BUILD TUPLE LIST\n"; + const auto tuplesList = std::move(getTuplesList(Nv)); + WITH_RANK << "tupList.size() = " << tuplesList.size() << "\n"; + + // GET ABC INDEX RANGE FOR RANK ======================================{{{1 + auto abcIndex = getABCRange(np, rank, tuplesList); + size_t nIterations = abcIndex.second - abcIndex.first; + +#ifdef 
TRIPLES_BENCHMARK + { const size_t maxIterations = getIntegerArgument("maxIterations", 0); + if (maxIterations != 0) { + abcIndex.second = abcIndex.first + maxIterations % (nIterations + 1); + nIterations = maxIterations % (nIterations + 1); + } + } +#endif + + WITH_RANK << "abcIndex = " << pretty_print(abcIndex) << "\n"; + LOG(0,"NEW_TRIPLES") << "#iterations: " + << nIterations << "\n"; + + // first abc + const ABCTuple firstAbc = tuplesList[abcIndex.first]; + + + double energy(0.); + + + auto const isFakeTuple + = [&tuplesList](size_t const i) { return i >= tuplesList.size(); }; + + + auto communicateDatabase + = [ &unions + , np + , &chrono + ] (ABCTuple const& abc, MPI_Comm const& c) -> Slice::Database { + + chrono["db:comm:type:do"].start(); + auto MPI_LDB_ELEMENT = Slice::mpi::localDatabaseElement(); + chrono["db:comm:type:do"].stop(); + + chrono["db:comm:ldb"].start(); + Slice::LocalDatabase ldb; + + for (auto const& tensor: unions) { + auto const& tensorDb = tensor->buildLocalDatabase(abc); + ldb.insert(ldb.end(), tensorDb.begin(), tensorDb.end()); + } + chrono["db:comm:ldb"].stop(); + + Slice::Database db(np * ldb.size(), ldb[0]); + + chrono["oneshot-db:comm:allgather"].start(); + chrono["db:comm:allgather"].start(); + MPI_Allgather( ldb.data() + , ldb.size() + , MPI_LDB_ELEMENT + , db.data() + , ldb.size() + , MPI_LDB_ELEMENT + , c); + chrono["db:comm:allgather"].stop(); + chrono["oneshot-db:comm:allgather"].stop(); + + chrono["db:comm:type:free"].start(); + MPI_Type_free(&MPI_LDB_ELEMENT); + chrono["db:comm:type:free"].stop(); + + return db; + }; + + auto doIOPhase + = [&unions, &rank, &np, &universe, &chrono] (Slice::Database const& db) { + + const size_t localDBLength = db.size() / np; + + size_t sendTag = 0 + , recvTag = rank * localDBLength + ; + + // RECIEVE PHASE ====================================================== + { + // At this point, we have already send to everyone that fits + auto const& begin = &db[rank * localDBLength] + , end = begin + localDBLength + ; + for (auto it = begin; it != end; ++it) { + recvTag++; + auto const& el = *it; + auto& u = unionByName(unions, el.name); + + WITH_DBG std::cout + << rank << ":r" + << "♯" << recvTag << " =>" + << " «n" << el.name + << ", t" << el.info.type + << ", s" << el.info.state + << "»" + << " ⊙ {" << rank << "⇐" << el.info.from.rank + << ", " + << el.info.from.source << "}" + << " ∴ {" << el.info.tuple[0] + << ", " + << el.info.tuple[1] + << "}" + << "\n" + ; + + chrono["db:io:recv"].start(); + u.receive(el.info, recvTag); + chrono["db:io:recv"].stop(); + + } // recv + } + + // SEND PHASE ========================================================= + for (size_t otherRank = 0; otherRank" + << " «n" << el.name + << ", t" << el.info.type + << ", s" << el.info.state + << "»" + << " ⊙ {" << el.info.from.rank << "⇒" << otherRank + << ", " + << el.info.from.source << "}" + << " ∴ {" << el.info.tuple[0] + << ", " + << el.info.tuple[1] + << "}" + << "\n" + ; + + chrono["db:io:send"].start(); + u.send(otherRank, el.info, sendTag); + chrono["db:io:send"].stop(); + + } // send phase + + } // otherRank + + + }; + +#if defined(HAVE_OCD) || defined(TRIPLES_PRINT_TUPLES) + std::map tupleEnergies; +#endif + + const double doublesFlops + = double(No) + ,* double(No) + ,* double(No) + ,* (double(No) + double(Nv)) + ,* 2 + ,* 6 + / 1e9 + ; + + // START MAIN LOOP ======================================================{{{1 + + Slice::Database db; +#if defined(TRIPLES_WORKLOAD_DUMP) + std::vector> workloadDB; +#endif + + for ( size_t i = 
abcIndex.first, iteration = 1 + ; i < abcIndex.second + ; i++, iteration++ + ) { + chrono["iterations"].start(); + + // check overhead from chrono over all iterations + chrono["start:stop"].start(); chrono["start:stop"].stop(); + + // check overhead of doing a barrier at the beginning + chrono["oneshot-mpi:barrier"].start(); + chrono["mpi:barrier"].start(); + // TODO: REMOVE + if (getIntegerArgument("barrier", 1) == 1) + MPI_Barrier(universe); + chrono["mpi:barrier"].stop(); + chrono["oneshot-mpi:barrier"].stop(); + + if (iteration % getIntegerArgument("iterationMod", 100) == 0) { + LOG(0,"NEW_TRIPLES") + << "iteration " << iteration + << " [" << 100 * iteration / nIterations << "%]" + << " (" << doublesFlops * iteration / chrono["doubles"].count() + << "GF)" + << " (" << doublesFlops * iteration / chrono["iterations"].count() + << "GF)" + << " ===========================\n"; + + // PRINT TIMINGS + for (auto const& pair: chrono) + LOG(1, " ") << pair.first << " :: " + << pair.second.count() + << std::endl; + + } + + const ABCTuple abc = isFakeTuple(i) + ? tuplesList[tuplesList.size() - 1] + : tuplesList[i] + , *abcNext = i == (abcIndex.second - 1) + ? nullptr + : isFakeTuple(i + 1) + ? &tuplesList[tuplesList.size() - 1] + : &tuplesList[i + 1] + ; + + chrono["with_rank"].start(); + WITH_RANK << " :it " << iteration + << " :abc " << pretty_print(abc) + << " :abcN " + << (abcNext ? pretty_print(*abcNext) : "None") + << "\n"; + chrono["with_rank"].stop(); + + + // COMM FIRST DATABASE ================================================{{{1 + if (i == abcIndex.first) { + WITH_RANK << "__first__:first database ............ \n"; + const auto __db = communicateDatabase(abc, universe); + WITH_RANK << "__first__:first database communicated \n"; + WITH_RANK << "__first__:first database io phase \n"; + doIOPhase(__db); + WITH_RANK << "__first__:first database io phase DONE\n"; + WITH_RANK << "__first__::::Unwrapping all slices for first database\n"; + for (auto& u: unions) u->unwrapAll(abc); + WITH_RANK << "__first__::::Unwrapping all slices for first database DONE\n"; + MPI_Barrier(universe); + } + + // COMM NEXT DATABASE ================================================={{{1 + if (abcNext) { + WITH_RANK << "__comm__:" << iteration << "th communicating database\n"; + chrono["db:comm"].start(); + //const auto db = communicateDatabase(*abcNext, universe); + db = communicateDatabase(*abcNext, universe); + chrono["db:comm"].stop(); + chrono["db:io"].start(); + doIOPhase(db); + chrono["db:io"].stop(); + WITH_RANK << "__comm__:" << iteration << "th database io phase DONE\n"; + } + + // COMPUTE DOUBLES ===================================================={{{1 + OCD_Barrier(universe); + if (!isFakeTuple(i)) { + WITH_RANK << iteration << "-th doubles\n"; + WITH_CHRONO(chrono["oneshot-unwrap"], + WITH_CHRONO(chrono["unwrap"], + WITH_CHRONO(chrono["unwrap:doubles"], + for (auto& u: decltype(unions){&abph, &hhha, &taphh, &tabhh}) { + u->unwrapAll(abc); + } + ))) + chrono["oneshot-doubles"].start(); + chrono["doubles"].start(); + // TODO: REMOVE + for (size_t __i=0; __i < getIntegerArgument("doublesLoops", 1); __i++) + doublesContribution( abc, (size_t)No, (size_t)Nv + // -- VABCI + , abph.unwrapSlice(Slice::AB, abc) + , abph.unwrapSlice(Slice::AC, abc) + , abph.unwrapSlice(Slice::BC, abc) + , abph.unwrapSlice(Slice::BA, abc) + , abph.unwrapSlice(Slice::CA, abc) + , abph.unwrapSlice(Slice::CB, abc) + // -- VHHHA + , hhha.unwrapSlice(Slice::A, abc) + , hhha.unwrapSlice(Slice::B, abc) + , hhha.unwrapSlice(Slice::C, abc) 
+ // -- TA + , taphh.unwrapSlice(Slice::A, abc) + , taphh.unwrapSlice(Slice::B, abc) + , taphh.unwrapSlice(Slice::C, abc) + // -- TABIJ + , tabhh.unwrapSlice(Slice::AB, abc) + , tabhh.unwrapSlice(Slice::AC, abc) + , tabhh.unwrapSlice(Slice::BC, abc) + // -- TIJK + , Tijk.data() + , chrono + ); + WITH_RANK << iteration << "-th doubles done\n"; + chrono["doubles"].stop(); + chrono["oneshot-doubles"].stop(); + } + + // COMPUTE SINGLES =================================================== {{{1 + OCD_Barrier(universe); + if (!isFakeTuple(i)) { + WITH_CHRONO(chrono["oneshot-unwrap"], + WITH_CHRONO(chrono["unwrap"], + WITH_CHRONO(chrono["unwrap:singles"], + abhh.unwrapAll(abc); + ))) + chrono["reorder"].start(); + for (size_t I(0); I < Zijk.size(); I++) Zijk[I] = Tijk[I]; + chrono["reorder"].stop(); + chrono["singles"].start(); + singlesContribution( abc + , Tai.data() + , abhh.unwrapSlice(Slice::AB, abc) + , abhh.unwrapSlice(Slice::AC, abc) + , abhh.unwrapSlice(Slice::BC, abc) + , Zijk.data()); + chrono["singles"].stop(); + } + + + // COMPUTE ENERGY ==================================================== {{{1 + if (!isFakeTuple(i)) { + double tupleEnergy(0.); + + int distinct(0); + if (abc[0] == abc[1]) distinct++; + if (abc[1] == abc[2]) distinct--; + const double epsabc(epsa[abc[0]] + epsa[abc[1]] + epsa[abc[2]]); + + chrono["energy"].start(); + if ( distinct == 0) + tupleEnergy = getEnergyDistinct(epsabc, epsi, Tijk, Zijk); + else + tupleEnergy = getEnergySame(epsabc, epsi, Tijk, Zijk); + chrono["energy"].stop(); + +#if defined(HAVE_OCD) || defined(TRIPLES_PRINT_TUPLES) + tupleEnergies[abc] = tupleEnergy; +#endif + + energy += tupleEnergy; + +#ifdef HAVE_OCD + auto const print_slices + = [](ABCTuple const& abc, ABCTuple const& want, SliceUnion& u) { + if (abc != want) return; + + for (auto type: u.sliceTypes) { + auto const& ptr = u.unwrapSlice(type, abc); + auto const& slice = Slice::findByTypeAbc(u.slices, type, abc); + WITH_RANK << "__print_slice__:n" << u.name << " " + << pretty_print(abc) << " " + << pretty_print(slice.info) + ; + for (size_t i = 0; i < 20; i++) std::cout << ptr[i] << ", "; + std::cout << std::endl; + } + }; +#endif + + if (isFakeTuple(i)) { + // fake iterations should also unwrap whatever they got + WITH_RANK << iteration + << "th unwrapping because of fake in " + << i << "\n"; + for (auto& u: unions) u->unwrapAll(abc); + } + +#ifdef HAVE_OCD + for (auto const& u: unions) { + WITH_RANK << "__dups__:" + << iteration + << "-th n" << u->name << " checking duplicates\n"; + u->checkForDuplicates(); + } +#endif + + + // WORKLOAD DUMP ====================================================={{{1 +#if defined(TRIPLES_WORKLOAD_DUMP) + { + chrono["workload"].start(); + std::vector entries; + for (auto const& u: unions) { + entries.push_back + ({ rank + , iteration + , abc + , (size_t)u->name + , WorkloadEntry::findSend(db, *u, rank) + , WorkloadEntry::findRecv(*u) + , (size_t)std::round(1e3*chrono["oneshot-doubles"].count()) + , (size_t)std::round(1e3*chrono["oneshot-mpi:barrier"].count()) + , (size_t)std::round(1e3*chrono["oneshot-db:comm:allgather"].count()) + , (size_t)std::round(1e3*chrono["oneshot-unwrap"].count()) + }); + } + workloadDB.push_back(entries); + chrono["workload"].stop(); + } +#endif + + // CLEANUP UNIONS ===================================================={{{1 + OCD_Barrier(universe); + if (abcNext) { + chrono["gc"].start(); + WITH_RANK << "__gc__:" << iteration << "-th cleaning up.......\n"; + for (auto& u: unions) { + + u->unwrapAll(abc); + WITH_RANK << "__gc__:n" << 
u->name << " :it " << iteration + << " :abc " << pretty_print(abc) + << " :abcN " << pretty_print(*abcNext) + << "\n"; + for (auto const& slice: u->slices) + WITH_RANK << "__gc__:guts:" << slice.info << "\n"; + u->clearUnusedSlicesForNext(*abcNext); + + WITH_RANK << "__gc__: checking validity\n"; + +#ifdef HAVE_OCD + // check for validity of the slices + for (auto type: u->sliceTypes) { + auto tuple = Slice::subtupleBySlice(abc, type); + for (auto& slice: u->slices) { + if ( slice.info.type == type + && slice.info.tuple == tuple + && slice.isDirectlyFetchable() + ) { + if (slice.info.state == Slice::Dispatched) + throw std::domain_error( "This slice should not be undispatched! " + + pretty_print(slice.info)); + } + } + } +#endif + + + } + chrono["gc"].stop(); + } + + WITH_RANK << iteration << "-th cleaning up....... DONE\n"; + } + + // CLEAN CHRONO ======================================================{{{1 + chrono["iterations"].stop(); + { // TODO: REMOVEME + chrono["oneshot-doubles"].clear(); + chrono["oneshot-mpi:barrier"].clear(); + chrono["oneshot-db:comm:allgather"].clear(); + chrono["oneshot-unwrap"].clear(); + } + + // ITERATION END ====================================================={{{1 + } // END OF MAIN LOOP + + MPI_Barrier(universe); + + // PRINT TUPLES ========================================================={{{1 +#if defined(HAVE_OCD) || defined(TRIPLES_PRINT_TUPLES) + LOG(0,"NEW_TRIPLES") << "tuple energies" << "\n"; + for (size_t i = 0; i < np; i++) { + MPI_Barrier(universe); + for (auto const& pair: tupleEnergies) { + if (i == rank) + std::cout << pair.first[0] + << " " << pair.first[1] + << " " << pair.first[2] + << std::setprecision(15) << std::setw(23) + << " tupleEnergy: " << pair.second + << "\n" + ; + } + } +#endif + + // COMMUNICATE THE ENERGIES ============================================={{{1 + LOG(0,"NEW_TRIPLES") << "COMMUNICATING ENERGIES \n"; + double globalEnergy = 0; + MPI_Reduce(&energy, &globalEnergy, 1, MPI_DOUBLE, MPI_SUM, 0, universe); + + WITH_RANK << "local energy " << energy << "\n"; + LOG(0,"LOOP FINISHED, energy") + << std::setprecision(15) << std::setw(23) + << globalEnergy << std::endl; + + // PRINT TIMINGS {{{1 + for (auto const& pair: chrono) + LOG(0,"atrip:chrono") << pair.first << " " + << pair.second.count() << std::endl; + + + LOG(0, "atrip:flops") + << nIterations * doublesFlops / chrono["doubles"].count() << "\n"; + + // WORKLOAD DUMP ====================================================={{{1 +#if defined(TRIPLES_WORKLOAD_DUMP) + { + constexpr char* _workload_file = "workload.triples.bin"; + LOG(0,"NEW_TRIPLES") << "Writing out WORKLOAD to " + << _workload_file << "\n"; + LOG(0,"NEW_TRIPLES") << "# workload entries = " + << workloadDB.size() << "\n"; + MPI_File handle; + MPI_Status status; + MPI_File_open(universe, + _workload_file, + MPI_MODE_CREATE | MPI_MODE_WRONLY, + MPI_INFO_NULL, + &handle); + + for (int _it = 1; _it <= nIterations; _it++) { + MPI_Offset const + chunk_size = sizeof(WorkloadEntry) * workloadDB[0].size() + , main_offset = chunk_size + ,* (rank * nIterations + _it - 1) + ; + + auto const& entries = workloadDB[_it - 1]; + for (size_t i = 0; i < entries.size(); i++) { + MPI_Offset const + local_offset = main_offset + i * sizeof(WorkloadEntry); + LOG(0,"NEW_TRIPLES") << entries[i].iteration + << "," + << entries[i].name + << "," + << entries[i].send + << "," + << entries[i].recv + << "," + << entries[i].doubles + << "," + << entries[i].barrier + << "\n" + ; + MPI_File_write_at(handle, + local_offset, + &entries[i], + 
+                            sizeof(WorkloadEntry),
+                            MPI_CHAR,
+                            &status);
+      }
+    } // iterations
+    MPI_Barrier(universe);
+    MPI_File_close(&handle);
+  }
+#endif
+
+
+}
+ #+end_src
diff --git a/bench/config.mk b/bench/config.mk
index e7a0b5e..4e09f2f 100644
--- a/bench/config.mk
+++ b/bench/config.mk
@@ -1,5 +1,7 @@
 BENCH_SOURCES = $(wildcard $(ATRIP_ROOT)/bench/test*.cxx)
 BENCH_TARGETS = $(patsubst %.cxx,%,$(BENCH_SOURCES))
+$(BENCH_TARGETS): CXXFLAGS += -I.
+$(BENCH_TARGETS): CXXFLAGS += -fopenmp
 
 bench-clean:
 	-rm -v $(BENCH_TARGETS)
diff --git a/etc/config/gcc.mk b/etc/config/gcc.mk
index 4086fe4..7b14a3e 100644
--- a/etc/config/gcc.mk
+++ b/etc/config/gcc.mk
@@ -1,8 +1,10 @@
-include etc/ctf.mk
+include etc/make/ctf.mk
 
 CXX = mpic++
 
 CXXFLAGS += -I$(ATRIP_ROOT)/include
 CXXFLAGS += -I$(CTF_INCLUDE_PATH)
 
-LDFLAGS += -L$(CTF_BUILD_PATH) -lctf
+LDFLAGS += -Wl,-Bstatic -L$(CTF_BUILD_PATH)/lib -lctf
+LDFLAGS += -fopenmp -L/usr/lib -lscalapack -L/opt/OpenBLAS/lib -lopenblas
+LDFLAGS += -Wl,-Bdynamic
diff --git a/etc/make/ctf.mk b/etc/make/ctf.mk
index 1142943..c3cf285 100644
--- a/etc/make/ctf.mk
+++ b/etc/make/ctf.mk
@@ -1,2 +1,2 @@
-include ./etc/ctf_vars.mk
-include ./etc/ctf_rules.mk
+include ./etc/make/ctf_vars.mk
+include ./etc/make/ctf_rules.mk
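The =Timer= and =WITH_CHRONO= helpers introduced by the Utils header above are used throughout the main loop to accumulate per-phase wall-clock timings under string keys (=chrono["doubles"]=, =chrono["db:comm"]=, and so on). The following is a minimal, self-contained sketch of that usage pattern; =work()=, =main()= and the phase names are illustrative only, and the =Timer= here is a trimmed copy of the one added in this patch (with its duration explicitly zero-initialized).

#+begin_src c++
#include <chrono>
#include <iostream>
#include <map>
#include <string>

// Trimmed copy of the Timer/WITH_CHRONO helpers from the Utils header.
#define WITH_CHRONO(__chrono, ...) \
  __chrono.start(); __VA_ARGS__ __chrono.stop();

struct Timer {
  using Clock = std::chrono::high_resolution_clock;
  std::chrono::duration<double> duration{};  // accumulated seconds
  Clock::time_point _start;
  void start() noexcept { _start = Clock::now(); }
  void stop() noexcept { duration += Clock::now() - _start; }
  double count() const noexcept { return duration.count(); }
};
using Timings = std::map<std::string, Timer>;

// Placeholder for any region one wants to time.
void work() { volatile double x = 0; for (int i = 0; i < 1000000; ++i) x += i; }

int main() {
  Timings chrono;

  // Either start/stop a phase explicitly ...
  chrono["phase:a"].start();
  work();
  chrono["phase:a"].stop();

  // ... or wrap the region with the macro; note the trailing ';'
  // inside the macro argument, as in the main loop of the patch.
  WITH_CHRONO(chrono["phase:b"], work();)

  // Print accumulated timings, as done at the end of run().
  for (auto const& pair : chrono)
    std::cout << pair.first << " :: " << pair.second.count() << " s\n";
}
#+end_src

Repeated =start()=/=stop()= pairs on the same key accumulate into one duration, which is why the main loop can time every iteration of a phase under a single entry and report the totals at the end.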