#+title: ATRIP: An MPI-asynchronous implementation of CCSD(T)
#+PROPERTY: header-args+ :noweb yes :comments noweb :mkdirp t

* Implementation

The algorithm uses two main data types: the =Slice= and the =SliceUnion=,
which acts as a container and resource manager for =Slice= objects.

** The slice

#+begin_src c++ :tangle (atrip-slice-h)
#pragma once
#include <vector>
#include <array>
#include <utility>
#include <algorithm>
#include <complex>
#include <stdexcept>
#include <ostream>
#include <mpi.h>

namespace atrip {

  template <typename F> F maybeConjugate(const F a) { return a; }
  template <> Complex maybeConjugate(const Complex a) { return std::conj(a); }

  namespace traits {

    template <typename F> bool isComplex() { return false; }
    template <> bool isComplex<Complex>() { return true; }

    namespace mpi {
      template <typename F> MPI_Datatype datatypeOf(void);
      template <> MPI_Datatype datatypeOf<double>() { return MPI_DOUBLE; }
      template <> MPI_Datatype datatypeOf<Complex>() { return MPI_DOUBLE_COMPLEX; }
    }

  }

  template <typename F>
  struct Slice {
#+end_src

A slice is a subset of the values of a given tensor.  As an example, for the
doubles amplitudes \( T^{ab}_{ij} \) one needs two kinds of objects:

- the object \( \mathsf{T}(a)^b_{ij} \), which for every \( a \) gets assigned
  the tensor \( T^{ab}_{ij} \) of size \( N_\mathrm{o}^2 \times N_\mathrm{v} \),
- the object \( \mathsf{T}(a,b)_{ij} \), to which for every pair \( (a, b) \)
  corresponds the \( N_\mathrm{o}^2 \)-sized tensor \( T^{ab}_{ij} \).

#+begin_src c++ :tangle (atrip-slice-h)
    // ASSOCIATED TYPES %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    struct Location { size_t rank; size_t source; };

    enum Type
      { A = 10
      , B
      , C
      // Two-parameter slices
      , AB = 20
      , BC
      , AC
      // for abci and the doubles
      , CB
      , BA
      , CA
      // The non-typed slice
      , Blank = 404
      };

    enum State {
      // Fetch represents the state where a slice is to be fetched
      // and has a valid data pointer that can be written to
      Fetch = 0,
      // Dispatched represents the state where an MPI call has been
      // dispatched in order to get the data, but the data has not been
      // unwrapped yet; the data might be there or we might have to wait.
      Dispatched = 2,
      // Ready means that the data pointer can be read from
      Ready = 1,
      // SelfSufficient marks a slice whose contents are located
      // on the same rank where it lives, so that it does not have to
      // fetch from anyone else.
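      // (its data pointer then refers directly to the local sources
      // buffer of the owning SliceUnion, see buildLocalDatabase)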
SelfSufficient = 911, // Recycled means that this slice gets its data pointer from another // slice, so it should not be written to Recycled = 123, // Acceptor means that the Slice can accept a new Slice, it is // the counterpart of the Blank type, but for states Acceptor = 405 }; struct Info { // which part of a,b,c the slice holds PartialTuple tuple; // The type of slice for the user to retrieve the correct one Type type; // What is the state of the slice State state; // Where the slice is to be retrieved // NOTE: this can actually be computed from tuple Location from; // If the data are actually to be found in this other slice Type recycling; Info() : tuple{0,0} , type{Blank} , state{Acceptor} , from{0,0} , recycling{Blank} {} }; using Ty_x_Tu = std::pair< Type, PartialTuple >; // Names of the integrals that are considered in CCSD(T) enum Name { TA = 100 , VIJKA = 101 , VABCI = 200 , TABIJ = 201 , VABIJ = 202 }; // DATABASE ==========================================================={{{1 struct LocalDatabaseElement { Slice::Name name; Slice::Info info; }; using LocalDatabase = std::vector; using Database = LocalDatabase; // STATIC METHODS =========================================================== // // They are useful to organize the structure of slices struct mpi { static MPI_Datatype vector(size_t n, MPI_Datatype const& DT) { MPI_Datatype dt; MPI_Type_vector(n, 1, 1, DT, &dt); MPI_Type_commit(&dt); return dt; } static MPI_Datatype sliceLocation () { constexpr int n = 2; // create a sliceLocation to measure in the current architecture // the packing of the struct Slice::Location measure; MPI_Datatype dt; const std::vector lengths(n, 1); const MPI_Datatype types[n] = {usizeDt(), usizeDt()}; // measure the displacements in the struct size_t j = 0; MPI_Aint displacements[n]; MPI_Get_address(&measure.rank, &displacements[j++]); MPI_Get_address(&measure.source, &displacements[j++]); for (size_t i = 1; i < n; i++) displacements[i] -= displacements[0]; displacements[0] = 0; MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt); MPI_Type_commit(&dt); return dt; } static MPI_Datatype enumDt() { return MPI_INT; } static MPI_Datatype usizeDt() { return MPI_UINT64_T; } static MPI_Datatype sliceInfo () { constexpr int n = 5; MPI_Datatype dt; Slice::Info measure; const std::vector lengths(n, 1); const MPI_Datatype types[n] = { vector(2, usizeDt()) , enumDt() , enumDt() , sliceLocation() , enumDt() }; // create the displacements from the info measurement struct size_t j = 0; MPI_Aint displacements[n]; MPI_Get_address(measure.tuple.data(), &displacements[j++]); MPI_Get_address(&measure.type, &displacements[j++]); MPI_Get_address(&measure.state, &displacements[j++]); MPI_Get_address(&measure.from, &displacements[j++]); MPI_Get_address(&measure.recycling, &displacements[j++]); for (size_t i = 1; i < n; i++) displacements[i] -= displacements[0]; displacements[0] = 0; MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt); MPI_Type_commit(&dt); return dt; } static MPI_Datatype localDatabaseElement () { constexpr int n = 2; MPI_Datatype dt; LocalDatabaseElement measure; const std::vector lengths(n, 1); const MPI_Datatype types[n] = { enumDt() , sliceInfo() }; // measure the displacements in the struct size_t j = 0; MPI_Aint displacements[n]; MPI_Get_address(&measure.name, &displacements[j++]); MPI_Get_address(&measure.info, &displacements[j++]); for (size_t i = 1; i < n; i++) displacements[i] -= displacements[0]; displacements[0] = 0; MPI_Type_create_struct(n, lengths.data(), 
displacements, types, &dt); MPI_Type_commit(&dt); return dt; } }; static PartialTuple subtupleBySlice(ABCTuple abc, Type sliceType) { switch (sliceType) { case AB: return {abc[0], abc[1]}; case BC: return {abc[1], abc[2]}; case AC: return {abc[0], abc[2]}; case CB: return {abc[2], abc[1]}; case BA: return {abc[1], abc[0]}; case CA: return {abc[2], abc[0]}; case A: return {abc[0], 0}; case B: return {abc[1], 0}; case C: return {abc[2], 0}; default: throw "Switch statement not exhaustive!"; } } /** ,* It is important here to return a reference to a Slice ,* not to accidentally copy the associated buffer of the slice. ,*/ static Slice& findOneByType(std::vector> &slices, Slice::Type type) { const auto sliceIt = std::find_if(slices.begin(), slices.end(), [&type](Slice const& s) { return type == s.info.type; }); WITH_CRAZY_DEBUG WITH_RANK << "\t__ looking for " << type << "\n"; if (sliceIt == slices.end()) throw std::domain_error("Slice by type not found!"); return *sliceIt; } /* ,* Check if an info has ,* ,*/ static std::vector*> hasRecycledReferencingToIt ( std::vector> &slices , Info const& info ) { std::vector*> result; for (auto& s: slices) if ( s.info.recycling == info.type && s.info.tuple == info.tuple && s.info.state == Recycled ) result.push_back(&s); return result; } static Slice& findRecycledSource (std::vector> &slices, Slice::Info info) { const auto sliceIt = std::find_if(slices.begin(), slices.end(), [&info](Slice const& s) { return info.recycling == s.info.type && info.tuple == s.info.tuple && State::Recycled != s.info.state ; }); WITH_CRAZY_DEBUG WITH_RANK << "__slice__:find: recycling source of " << pretty_print(info) << "\n"; if (sliceIt == slices.end()) throw std::domain_error( "Slice not found: " + pretty_print(info) + " rank: " + pretty_print(Atrip::rank) ); WITH_RANK << "__slice__:find: " << pretty_print(sliceIt->info) << "\n"; return *sliceIt; } static Slice& findByTypeAbc ( std::vector> &slices , Slice::Type type , ABCTuple const& abc ) { const auto tuple = Slice::subtupleBySlice(abc, type); const auto sliceIt = std::find_if(slices.begin(), slices.end(), [&type, &tuple](Slice const& s) { return type == s.info.type && tuple == s.info.tuple ; }); WITH_CRAZY_DEBUG WITH_RANK << "__slice__:find:" << type << " and tuple " << pretty_print(tuple) << "\n"; if (sliceIt == slices.end()) throw std::domain_error( "Slice not found: " + pretty_print(tuple) + ", " + pretty_print(type) + " rank: " + pretty_print(Atrip::rank) ); return *sliceIt; } static Slice& findByInfo(std::vector> &slices, Slice::Info const& info) { const auto sliceIt = std::find_if(slices.begin(), slices.end(), [&info](Slice const& s) { // TODO: maybe implement comparison in Info struct return info.type == s.info.type && info.state == s.info.state && info.tuple == s.info.tuple && info.from.rank == s.info.from.rank && info.from.source == s.info.from.source ; }); WITH_CRAZY_DEBUG WITH_RANK << "__slice__:find:looking for " << pretty_print(info) << "\n"; if (sliceIt == slices.end()) throw std::domain_error( "Slice by info not found: " + pretty_print(info)); return *sliceIt; } // SLICE DEFINITION =================================================={{{1 // ATTRIBUTES ============================================================ Info info; F *data; MPI_Request request; const size_t size; void markReady() noexcept { info.state = Ready; info.recycling = Blank; } /* ,* This means that the data is there ,*/ bool isUnwrapped() const noexcept { return info.state == Ready || info.state == SelfSufficient ; } bool isUnwrappable() const 
noexcept { return isUnwrapped() || info.state == Recycled || info.state == Dispatched ; } inline bool isDirectlyFetchable() const noexcept { return info.state == Ready || info.state == Dispatched; } void free() noexcept { info.tuple = {0, 0}; info.type = Blank; info.state = Acceptor; info.from = {0, 0}; info.recycling = Blank; data = nullptr; } inline bool isFree() const noexcept { return info.tuple == PartialTuple{0, 0} && info.type == Blank && info.state == Acceptor && info.from.rank == 0 && info.from.source == 0 && info.recycling == Blank && data == nullptr ; } /* ,* This function answers the question, which slices can be recycled. ,* ,* A slice can only be recycled if it is Fetch or Ready and has ,* a valid datapointer. ,* ,* In particular, SelfSufficient are not recyclable, since it is easier ,* just to create a SelfSufficient slice than deal with data dependencies. ,* ,* Furthermore, a recycled slice is not recyclable, if this is the case ,* then it is either bad design or a bug. ,*/ inline bool isRecyclable() const noexcept { return ( info.state == Dispatched || info.state == Ready || info.state == Fetch ) && hasValidDataPointer() ; } /* ,* This function describes if a slice has a valid data pointer. ,* ,* This is important to know if the slice has some data to it, also ,* some structural checks are done, so that it should not be Acceptor ,* or Blank, if this is the case then it is a bug. ,*/ inline bool hasValidDataPointer() const noexcept { return data != nullptr && info.state != Acceptor && info.type != Blank ; } void unwrapAndMarkReady() { if (info.state == Ready) return; if (info.state != Dispatched) throw std::domain_error("Can't unwrap a non-ready, non-dispatched slice!"); markReady(); MPI_Status status; #ifdef HAVE_OCD WITH_RANK << "__slice__:mpi: waiting " << "\n"; #endif const int errorCode = MPI_Wait(&request, &status); if (errorCode != MPI_SUCCESS) throw "MPI ERROR HAPPENED...."; #ifdef HAVE_OCD char errorString[MPI_MAX_ERROR_STRING]; int errorSize; MPI_Error_string(errorCode, errorString, &errorSize); WITH_RANK << "__slice__:mpi: status " << "{ .source=" << status.MPI_SOURCE << ", .tag=" << status.MPI_TAG << ", .error=" << status.MPI_ERROR << ", .errCode=" << errorCode << ", .err=" << errorString << " }" << "\n"; #endif } Slice(size_t size_) : info({}) , data(nullptr) , size(size_) {} }; // struct Slice template std::ostream& operator<<(std::ostream& out, typename Slice::Location const& v) { // TODO: remove me out << "{.r(" << v.rank << "), .s(" << v.source << ")};"; return out; } template std::ostream& operator<<(std::ostream& out, typename Slice::Info const& i) { out << "«t" << i.type << ", s" << i.state << "»" << " ⊙ {" << i.from.rank << ", " << i.from.source << "}" << " ∴ {" << i.tuple[0] << ", " << i.tuple[1] << "}" << " ♲t" << i.recycling ; return out; } } // namespace atrip #+end_src ** Utils #+begin_src c++ :tangle (atrip-utils-h) #pragma once #include #include #include #include #include namespace atrip { template std::string pretty_print(T&& value) { std::stringstream stream; #if ATRIP_DEBUG > 1 dbg::pretty_print(stream, std::forward(value)); #endif return stream.str(); } #define WITH_CHRONO(__chrono, ...) 
\ __chrono.start(); __VA_ARGS__ __chrono.stop(); struct Timer { using Clock = std::chrono::high_resolution_clock; using Event = std::chrono::time_point; std::chrono::duration duration; Event _start; inline void start() noexcept { _start = Clock::now(); } inline void stop() noexcept { duration += Clock::now() - _start; } inline void clear() noexcept { duration *= 0; } inline double count() const noexcept { return duration.count(); } }; using Timings = std::map; } #+end_src ** The rank mapping #+begin_src c++ :tangle (atrip-rankmap-h) #pragma once #include #include #include namespace atrip { template struct RankMap { std::vector const lengths; size_t const np, size; RankMap(std::vector lens, size_t np_) : lengths(lens) , np(np_) , size(std::accumulate(lengths.begin(), lengths.end(), 1UL, std::multiplies())) { assert(lengths.size() <= 2); } size_t find(typename Slice::Location const& p) const noexcept { return p.source * np + p.rank; } size_t nSources() const noexcept { return size / np + size_t(size % np != 0); } bool isPaddingRank(size_t rank) const noexcept { return size % np == 0 ? false : rank > (size % np - 1) ; } bool isSourcePadding(size_t rank, size_t source) const noexcept { return source == nSources() && isPaddingRank(rank); } typename Slice::Location find(ABCTuple const& abc, typename Slice::Type sliceType) const noexcept { // tuple = {11, 8} when abc = {11, 8, 9} and sliceType = AB const auto tuple = Slice::subtupleBySlice(abc, sliceType); const size_t index = tuple[0] + tuple[1] * (lengths.size() > 1 ? lengths[0] : 0) ; return { index % np , index / np }; } }; } #+end_src ** The slice union #+begin_src c++ :tangle (atrip-slice-union-h) #pragma once #include #include #include namespace atrip { template struct SliceUnion { using Tensor = CTF::Tensor; virtual void sliceIntoBuffer(size_t iteration, Tensor &to, Tensor const& from) = 0; /* * This function should enforce an important property of a SliceUnion. * Namely, there can be no two Slices of the same nature. * * This means that there can be at most one slice with a given Ty_x_Tu. */ void checkForDuplicates() const { std::vector::Ty_x_Tu> tytus; for (auto const& s: slices) { if (s.isFree()) continue; tytus.push_back({s.info.type, s.info.tuple}); } for (auto const& tytu: tytus) { if (std::count(tytus.begin(), tytus.end(), tytu) > 1) throw "Invariance violated, more than one slice with same Ty_x_Tu"; } } std::vector::Ty_x_Tu> neededSlices(ABCTuple const& abc) { std::vector::Ty_x_Tu> needed(sliceTypes.size()); // build the needed vector std::transform(sliceTypes.begin(), sliceTypes.end(), needed.begin(), [&abc](typename Slice::Type const type) { auto tuple = Slice::subtupleBySlice(abc, type); return std::make_pair(type, tuple); }); return needed; } /* buildLocalDatabase * * It should build a database of slices so that we know what is needed * to fetch in the next iteration represented by the tuple 'abc'. * * 1. The algorithm works as follows, we build a database of the all * the slice types that we need together with their tuple. * * 2. Look in the SliceUnion if we already have this tuple, * if we already have it mark it (TODO) * * 3. If we don't have the tuple, look for a (state=acceptor, type=blank) * slice and mark this slice as type=Fetch with the corresponding type * and tuple. * * NOTE: The algorithm should certify that we always have enough blank * slices. 
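     *
     * Between steps 2 and 3 the code also checks whether another slice with
     * the same tuple but a different type already holds (or is fetching)
     * usable data; if so, a blank slice is marked Recycled and reuses that
     * slice's data pointer instead of fetching the data again.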
* */ typename Slice::LocalDatabase buildLocalDatabase(ABCTuple const& abc) { typename Slice::LocalDatabase result; auto const needed = neededSlices(abc); WITH_RANK << "__db__:needed:" << pretty_print(needed) << "\n"; // BUILD THE DATABASE // we need to loop over all sliceTypes that this TensorUnion // is representing and find out how we will get the corresponding // slice for the abc we are considering right now. for (auto const& pair: needed) { auto const type = pair.first; auto const tuple = pair.second; auto const from = rankMap.find(abc, type); #ifdef HAVE_OCD WITH_RANK << "__db__:want:" << pretty_print(pair) << "\n"; for (auto const& s: slices) WITH_RANK << "__db__:guts:ocd " << s.info << " pt " << s.data << "\n"; #endif #ifdef HAVE_OCD WITH_RANK << "__db__: checking if exact match" << "\n"; #endif { // FIRST: look up if there is already a *Ready* slice matching what we // need auto const& it = std::find_if(slices.begin(), slices.end(), [&tuple, &type](Slice const& other) { return other.info.tuple == tuple && other.info.type == type // we only want another slice when it // has already ready-to-use data && other.isUnwrappable() ; }); if (it != slices.end()) { // if we find this slice, it means that we don't have to do anything WITH_RANK << "__db__: EXACT: found EXACT in name=" << name << " for tuple " << tuple[0] << ", " << tuple[1] << " ptr " << it->data << "\n"; result.push_back({name, it->info}); continue; } } #ifdef HAVE_OCD WITH_RANK << "__db__: checking if recycle" << "\n"; #endif // Try to find a recyling possibility ie. find a slice with the same // tuple and that has a valid data pointer. auto const& recycleIt = std::find_if(slices.begin(), slices.end(), [&tuple, &type](Slice const& other) { return other.info.tuple == tuple && other.info.type != type && other.isRecyclable() ; }); // if we find this recylce, then we find a Blank slice // (which should exist by construction :THINK) // if (recycleIt != slices.end()) { auto& blank = Slice::findOneByType(slices, Slice::Blank); // TODO: formalize this through a method to copy information // from another slice blank.data = recycleIt->data; blank.info.type = type; blank.info.tuple = tuple; blank.info.state = Slice::Recycled; blank.info.from = from; blank.info.recycling = recycleIt->info.type; result.push_back({name, blank.info}); WITH_RANK << "__db__: RECYCLING: n" << name << " " << pretty_print(abc) << " get " << pretty_print(blank.info) << " from " << pretty_print(recycleIt->info) << " ptr " << recycleIt->data << "\n" ; continue; } // in this case we have to create a new slice // this means that we should have a blank slice at our disposal // and also the freePointers should have some elements inside, // so we pop a data pointer from the freePointers container #ifdef HAVE_OCD WITH_RANK << "__db__: none work, doing new" << "\n"; #endif { WITH_RANK << "__db__: NEW: finding blank in " << name << " for type " << type << " for tuple " << tuple[0] << ", " << tuple[1] << "\n" ; auto& blank = Slice::findOneByType(slices, Slice::Blank); blank.info.type = type; blank.info.tuple = tuple; blank.info.from = from; // Handle self sufficiency blank.info.state = Atrip::rank == from.rank ? 
Slice::SelfSufficient : Slice::Fetch ; if (blank.info.state == Slice::SelfSufficient) { blank.data = sources[from.source].data(); } else { if (freePointers.size() == 0) throw std::domain_error("No more free pointers!"); auto dataPointer = freePointers.begin(); freePointers.erase(dataPointer); blank.data = *dataPointer; } result.push_back({name, blank.info}); continue; } } #ifdef HAVE_OCD for (auto const& s: slices) WITH_RANK << "__db__:guts:ocd:__end__ " << s.info << "\n"; #endif return result; } /* * Garbage collect slices not needed for the next iteration. * * It will throw if it tries to gc a slice that has not been * previously unwrapped, as a safety mechanism. */ void clearUnusedSlicesForNext(ABCTuple const& abc) { auto const needed = neededSlices(abc); // CLEAN UP SLICES, FREE THE ONES THAT ARE NOT NEEDED ANYMORE for (auto& slice: slices) { // if the slice is free, then it was not used anyways if (slice.isFree()) continue; // try to find the slice in the needed slices list auto const found = std::find_if(needed.begin(), needed.end(), [&slice] (typename Slice::Ty_x_Tu const& tytu) { return slice.info.tuple == tytu.second && slice.info.type == tytu.first ; }); // if we did not find slice in needed, then erase it if (found == needed.end()) { // We have to be careful about the data pointer, // for SelfSufficient, the data pointer is a source pointer // of the slice, so we should just wipe it. // // For Ready slices, we have to be careful if there are some // recycled slices depending on it. bool freeSlicePointer = true; // allow to gc unwrapped and recycled, never Fetch, // if we have a Fetch slice then something has gone very wrong. if (!slice.isUnwrapped() && slice.info.state != Slice::Recycled) throw std::domain_error("Trying to garbage collect " " a non-unwrapped slice! " + pretty_print(&slice) + pretty_print(slice.info)); // it can be that our slice is ready, but it has some hanging // references lying around in the form of a recycled slice. // Of course if we need the recycled slice the next iteration // this would be fatal, because we would then free the pointer // of the slice and at some point in the future we would // overwrite it. Therefore, we must check if slice has some // references in slices and if so then // // - we should mark those references as the original (since the data // pointer should be the same) // // - we should make sure that the data pointer of slice // does not get freed. // if (slice.info.state == Slice::Ready) { WITH_OCD WITH_RANK << "__gc__:" << "checking for data recycled dependencies\n"; auto recycled = Slice::hasRecycledReferencingToIt(slices, slice.info); if (recycled.size()) { Slice* newReady = recycled[0]; WITH_OCD WITH_RANK << "__gc__:" << "swaping recycled " << pretty_print(newReady->info) << " and " << pretty_print(slice.info) << "\n"; newReady->markReady(); assert(newReady->data == slice.data); freeSlicePointer = false; for (size_t i = 1; i < recycled.size(); i++) { auto newRecyled = recycled[i]; newRecyled->info.recycling = newReady->info.type; WITH_OCD WITH_RANK << "__gc__:" << "updating recycled " << pretty_print(newRecyled->info) << "\n"; } } } // if the slice is self sufficient, do not dare touching the // pointer, since it is a pointer to our sources in our rank. 
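        // The same applies to Recycled slices: their data pointer is owned
        // by the slice they recycle from, so it must not be handed back to
        // the freePointers pool here.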
if ( slice.info.state == Slice::SelfSufficient || slice.info.state == Slice::Recycled ) { freeSlicePointer = false; } // make sure we get its data pointer to be used later // only for non-recycled, since it can be that we need // for next iteration the data of the slice that the recycled points // to if (freeSlicePointer) { freePointers.insert(slice.data); WITH_RANK << "~~~:cl(" << name << ")" << " added to freePointer " << pretty_print(freePointers) << "\n"; } else { WITH_OCD WITH_RANK << "__gc__:not touching the free Pointer\n"; } // at this point, let us blank the slice WITH_RANK << "~~~:cl(" << name << ")" << " freeing up slice " // TODO: make this possible // << " info " << slice.info << "\n"; slice.free(); } } } // CONSTRUCTOR SliceUnion( Tensor const& sourceTensor , std::vector::Type> sliceTypes_ , std::vector sliceLength_ , std::vector paramLength , size_t np , MPI_Comm child_world , MPI_Comm global_world , typename Slice::Name name_ , size_t nSliceBuffers = 4 ) : rankMap(paramLength, np) , world(child_world) , universe(global_world) , sliceLength(sliceLength_) , sources(rankMap.nSources(), std::vector (std::accumulate(sliceLength.begin(), sliceLength.end(), 1UL, std::multiplies()))) , name(name_) , sliceTypes(sliceTypes_) , sliceBuffers(nSliceBuffers, sources[0]) //, slices(2 * sliceTypes.size(), Slice{ sources[0].size() }) { // constructor begin LOG(0,"Atrip") << "INIT SliceUnion: " << name << "\n"; slices = std::vector>(2 * sliceTypes.size(), { sources[0].size() }); // TODO: think exactly ^------------------- about this number // initialize the freePointers with the pointers to the buffers std::transform(sliceBuffers.begin(), sliceBuffers.end(), std::inserter(freePointers, freePointers.begin()), [](std::vector &vec) { return vec.data(); }); LOG(1,"Atrip") << "rankMap.nSources " << rankMap.nSources() << "\n"; LOG(1,"Atrip") << "#slices " << slices.size() << "\n"; LOG(1,"Atrip") << "#slices[0] " << slices[0].size << "\n"; LOG(1,"Atrip") << "#sources " << sources.size() << "\n"; LOG(1,"Atrip") << "#sources[0] " << sources[0].size() << "\n"; LOG(1,"Atrip") << "#freePointers " << freePointers.size() << "\n"; LOG(1,"Atrip") << "#sliceBuffers " << sliceBuffers.size() << "\n"; LOG(1,"Atrip") << "#sliceBuffers[0] " << sliceBuffers[0].size() << "\n"; LOG(1,"Atrip") << "#sliceLength " << sliceLength.size() << "\n"; LOG(1,"Atrip") << "#paramLength " << paramLength.size() << "\n"; LOG(1,"Atrip") << "GB*" << np << " " << double(sources.size() + sliceBuffers.size()) * sources[0].size() * 8 * np / 1073741824.0 << "\n"; } // constructor ends void init(Tensor const& sourceTensor) { CTF::World w(world); const int rank = Atrip::rank , order = sliceLength.size() ; std::vector const syms(order, NS); std::vector __sliceLength(sliceLength.begin(), sliceLength.end()); Tensor toSliceInto(order, __sliceLength.data(), syms.data(), w); LOG(1,"Atrip") << "slicing... \n"; // setUp sources for (size_t it(0); it < rankMap.nSources(); ++it) { const size_t source = rankMap.isSourcePadding(rank, source) ? 
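        // NOTE: `source` is read in its own initializer here, i.e. before it
        // is initialized; this is undefined behaviour and most likely
        // rankMap.isSourcePadding(rank, it) was meant.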
0 : it; WITH_OCD WITH_RANK << "Init:toSliceInto it-" << it << " :: source " << source << "\n"; sliceIntoBuffer(source, toSliceInto, sourceTensor); } } /** * \brief Send asynchronously only if the state is Fetch */ void send( size_t otherRank , typename Slice::Info const& info , size_t tag) const noexcept { MPI_Request request; bool sendData_p = false; if (info.state == Slice::Fetch) sendData_p = true; // TODO: remove this because I have SelfSufficient if (otherRank == info.from.rank) sendData_p = false; if (!sendData_p) return; MPI_Isend( sources[info.from.source].data() , sources[info.from.source].size() , traits::mpi::datatypeOf() , otherRank , tag , universe , &request ); WITH_CRAZY_DEBUG WITH_RANK << "sent to " << otherRank << "\n"; } /** * \brief Receive asynchronously only if the state is Fetch */ void receive(typename Slice::Info const& info, size_t tag) noexcept { auto& slice = Slice::findByInfo(slices, info); if (Atrip::rank == info.from.rank) return; if (slice.info.state == Slice::Fetch) { // TODO: do it through the slice class slice.info.state = Slice::Dispatched; MPI_Request request; slice.request = request; MPI_Irecv( slice.data , slice.size , traits::mpi::datatypeOf() , info.from.rank , tag , universe , &slice.request //, MPI_STATUS_IGNORE ); } } void unwrapAll(ABCTuple const& abc) { for (auto type: sliceTypes) unwrapSlice(type, abc); } F* unwrapSlice(typename Slice::Type type, ABCTuple const& abc) { WITH_CRAZY_DEBUG WITH_RANK << "__unwrap__:slice " << type << " w n " << name << " abc" << pretty_print(abc) << "\n"; auto& slice = Slice::findByTypeAbc(slices, type, abc); //WITH_RANK << "__unwrap__:info " << slice.info << "\n"; switch (slice.info.state) { case Slice::Dispatched: WITH_RANK << "__unwrap__:Fetch: " << &slice << " info " << pretty_print(slice.info) << "\n"; slice.unwrapAndMarkReady(); return slice.data; break; case Slice::SelfSufficient: WITH_RANK << "__unwrap__:SelfSufficient: " << &slice << " info " << pretty_print(slice.info) << "\n"; return slice.data; break; case Slice::Ready: WITH_RANK << "__unwrap__:READY: UNWRAPPED ALREADY" << &slice << " info " << pretty_print(slice.info) << "\n"; return slice.data; break; case Slice::Recycled: WITH_RANK << "__unwrap__:RECYCLED " << &slice << " info " << pretty_print(slice.info) << "\n"; return unwrapSlice(slice.info.recycling, abc); break; case Slice::Fetch: case Slice::Acceptor: throw std::domain_error("Can't unwrap an acceptor or fetch slice!"); break; default: throw std::domain_error("Unknown error unwrapping slice!"); } return slice.data; } const RankMap rankMap; const MPI_Comm world; const MPI_Comm universe; const std::vector sliceLength; std::vector< std::vector > sources; std::vector< Slice > slices; typename Slice::Name name; const std::vector::Type> sliceTypes; std::vector< std::vector > sliceBuffers; std::set freePointers; }; template SliceUnion& unionByName(std::vector*> const& unions, typename Slice::Name name) { const auto sliceUnionIt = std::find_if(unions.begin(), unions.end(), [&name](SliceUnion const* s) { return name == s->name; }); if (sliceUnionIt == unions.end()) throw std::domain_error("SliceUnion not found!"); return **sliceUnionIt; } } #+end_src ** Tuples #+begin_src c++ :tangle (atrip-tuples-h) #pragma once #include #include #include #include #include namespace atrip { using ABCTuple = std::array; using PartialTuple = std::array; using ABCTuples = std::vector; ABCTuples getTuplesList(size_t Nv) { const size_t n = Nv * (Nv + 1) * (Nv + 2) / 6 - Nv; ABCTuples result(n); size_t u(0); for (size_t a(0); 
a < Nv; a++) for (size_t b(a); b < Nv; b++) for (size_t c(b); c < Nv; c++){ if ( a == b && b == c ) continue; result[u++] = {a, b, c}; } return result; } std::pair getABCRange(size_t np, size_t rank, ABCTuples const& tuplesList) { std::vector n_tuples_per_rank(np, tuplesList.size()/np); const size_t // how many valid tuples should we still verteilen to nodes // since the number of tuples is not divisible by the number of nodes nRoundRobin = tuplesList.size() % np // every node must have the sanme amount of tuples in order for the // other nodes to receive and send somewhere, therefore // some nodes will get extra tuples but that are dummy tuples , nExtraInvalid = (np - nRoundRobin) % np ; if (nRoundRobin) for (int i = 0; i < np; i++) n_tuples_per_rank[i]++; #if defined(TODO) assert( tuplesList.size() == ( std::accumulate(n_tuples_per_rank.begin(), n_tuples_per_rank.end(), 0UL, std::plus()) + nExtraInvalid )); #endif WITH_RANK << "nRoundRobin = " << nRoundRobin << "\n"; WITH_RANK << "nExtraInvalid = " << nExtraInvalid << "\n"; WITH_RANK << "ntuples = " << n_tuples_per_rank[rank] << "\n"; auto const& it = n_tuples_per_rank.begin(); return { std::accumulate(it, it + rank , 0) , std::accumulate(it, it + rank + 1, 0) }; } } #+end_src ** Unions Since every tensor slice in a different way, we can override the slicing procedure and define subclasses of slice unions. #+begin_src c++ :tangle (atrip-unions-h) #pragma once #include namespace atrip { template void sliceIntoVector ( std::vector &v , CTF::Tensor &toSlice , std::vector const low , std::vector const up , CTF::Tensor const& origin , std::vector const originLow , std::vector const originUp ) { // Thank you CTF for forcing me to do this struct { std::vector up, low; } toSlice_ = { {up.begin(), up.end()} , {low.begin(), low.end()} } , origin_ = { {originUp.begin(), originUp.end()} , {originLow.begin(), originLow.end()} } ; WITH_OCD WITH_RANK << "slicing into " << pretty_print(toSlice_.up) << "," << pretty_print(toSlice_.low) << " from " << pretty_print(origin_.up) << "," << pretty_print(origin_.low) << "\n"; #ifndef ATRIP_DONT_SLICE toSlice.slice( toSlice_.low.data() , toSlice_.up.data() , 0.0 , origin , origin_.low.data() , origin_.up.data() , 1.0); memcpy(v.data(), toSlice.data, sizeof(F) * v.size()); #endif } template struct TAPHH : public SliceUnion { TAPHH( CTF::Tensor const& sourceTensor , size_t No , size_t Nv , size_t np , MPI_Comm child_world , MPI_Comm global_world ) : SliceUnion( sourceTensor , {Slice::A, Slice::B, Slice::C} , {Nv, No, No} // size of the slices , {Nv} , np , child_world , global_world , Slice::TA , 4) { init(sourceTensor); } void sliceIntoBuffer(size_t it, CTF::Tensor &to, CTF::Tensor const& from) override { const int Nv = this->sliceLength[0] , No = this->sliceLength[1] , a = this->rankMap.find({static_cast(Atrip::rank), it}); ; sliceIntoVector( this->sources[it] , to, {0, 0, 0}, {Nv, No, No} , from, {a, 0, 0, 0}, {a+1, Nv, No, No} ); } }; template struct HHHA : public SliceUnion { HHHA( CTF::Tensor const& sourceTensor , size_t No , size_t Nv , size_t np , MPI_Comm child_world , MPI_Comm global_world ) : SliceUnion( sourceTensor , {Slice::A, Slice::B, Slice::C} , {No, No, No} // size of the slices , {Nv} // size of the parametrization , np , child_world , global_world , Slice::VIJKA , 4) { init(sourceTensor); } void sliceIntoBuffer(size_t it, CTF::Tensor &to, CTF::Tensor const& from) override { const int No = this->sliceLength[0] , a = this->rankMap.find({static_cast(Atrip::rank), it}) ; sliceIntoVector( 
this->sources[it] , to, {0, 0, 0}, {No, No, No} , from, {0, 0, 0, a}, {No, No, No, a+1} ); } }; template struct ABPH : public SliceUnion { ABPH( CTF::Tensor const& sourceTensor , size_t No , size_t Nv , size_t np , MPI_Comm child_world , MPI_Comm global_world ) : SliceUnion( sourceTensor , { Slice::AB, Slice::BC, Slice::AC , Slice::BA, Slice::CB, Slice::CA } , {Nv, No} // size of the slices , {Nv, Nv} // size of the parametrization , np , child_world , global_world , Slice::VABCI , 2*6) { init(sourceTensor); } void sliceIntoBuffer(size_t it, CTF::Tensor &to, CTF::Tensor const& from) override { const int Nv = this->sliceLength[0] , No = this->sliceLength[1] , el = this->rankMap.find({static_cast(Atrip::rank), it}) , a = el % Nv , b = el / Nv ; sliceIntoVector( this->sources[it] , to, {0, 0}, {Nv, No} , from, {a, b, 0, 0}, {a+1, b+1, Nv, No} ); } }; template struct ABHH : public SliceUnion { ABHH( CTF::Tensor const& sourceTensor , size_t No , size_t Nv , size_t np , MPI_Comm child_world , MPI_Comm global_world ) : SliceUnion( sourceTensor , {Slice::AB, Slice::BC, Slice::AC} , {No, No} // size of the slices , {Nv, Nv} // size of the parametrization , np , child_world , global_world , Slice::VABIJ , 6) { init(sourceTensor); } void sliceIntoBuffer(size_t it, CTF::Tensor &to, CTF::Tensor const& from) override { const int Nv = from.lens[0] , No = this->sliceLength[1] , el = this->rankMap.find({static_cast(Atrip::rank), it}) , a = el % Nv , b = el / Nv ; sliceIntoVector( this->sources[it] , to, {0, 0}, {No, No} , from, {a, b, 0, 0}, {a+1, b+1, No, No} ); } }; template struct TABHH : public SliceUnion { TABHH( CTF::Tensor const& sourceTensor , size_t No , size_t Nv , size_t np , MPI_Comm child_world , MPI_Comm global_world ) : SliceUnion( sourceTensor , {Slice::AB, Slice::BC, Slice::AC} , {No, No} // size of the slices , {Nv, Nv} // size of the parametrization , np , child_world , global_world , Slice::TABIJ , 6) { init(sourceTensor); } void sliceIntoBuffer(size_t it, CTF::Tensor &to, CTF::Tensor const& from) override { // TODO: maybe generalize this with ABHH const int Nv = from.lens[0] , No = this->sliceLength[1] , el = this->rankMap.find({static_cast(Atrip::rank), it}) , a = el % Nv , b = el / Nv ; sliceIntoVector( this->sources[it] , to, {0, 0}, {No, No} , from, {a, b, 0, 0}, {a+1, b+1, No, No} ); } }; } #+end_src ** Equations #+begin_src c++ :tangle (atrip-equations-h) #pragma once #include #include namespace atrip { template double getEnergyDistinct ( const F epsabc , std::vector const& epsi , std::vector const& Tijk_ , std::vector const& Zijk_ ) { constexpr size_t blockSize=16; F energy(0.); const size_t No = epsi.size(); for (size_t kk=0; kk k ? jj : k; for (size_t j(jstart); j < jend; j++){ F const ej(epsi[j]); F const facjk = j == k ? F(0.5) : F(1.0); size_t istart = ii > j ? ii : j; for (size_t i(istart); i < iend; i++){ const F ei(epsi[i]) , facij = i == j ? 
F(0.5) : F(1.0) , denominator(epsabc - ei - ej - ek) , U(Zijk_[i + No*j + No*No*k]) , V(Zijk_[i + No*k + No*No*j]) , W(Zijk_[j + No*i + No*No*k]) , X(Zijk_[j + No*k + No*No*i]) , Y(Zijk_[k + No*i + No*No*j]) , Z(Zijk_[k + No*j + No*No*i]) , A(maybeConjugate(Tijk_[i + No*j + No*No*k])) , B(maybeConjugate(Tijk_[i + No*k + No*No*j])) , C(maybeConjugate(Tijk_[j + No*i + No*No*k])) , D(maybeConjugate(Tijk_[j + No*k + No*No*i])) , E(maybeConjugate(Tijk_[k + No*i + No*No*j])) , F(maybeConjugate(Tijk_[k + No*j + No*No*i])) , value = 3.0 * ( A * U + B * V + C * W + D * X + E * Y + F * Z ) + ( ( U + X + Y ) - 2.0 * ( V + W + Z ) ) * ( A + D + E ) + ( ( V + W + Z ) - 2.0 * ( U + X + Y ) ) * ( B + C + F ) ; energy += 2.0 * value / denominator * facjk * facij; } // i } // j } // k } // ii } // jj } // kk return std::real(energy); } template double getEnergySame ( const F epsabc , std::vector const& epsi , std::vector const& Tijk_ , std::vector const& Zijk_ ) { constexpr size_t blockSize = 16; const size_t No = epsi.size(); F energy = F(0.); for (size_t kk=0; kk k ? jj : k; for(size_t j(jstart); j < jend; j++){ const F facjk( j == k ? F(0.5) : F(1.0)); const F ej(epsi[j]); const size_t istart = ii > j ? ii : j; for(size_t i(istart); i < iend; i++){ const F ei(epsi[i]) , facij ( i==j ? F(0.5) : F(1.0)) , denominator(epsabc - ei - ej - ek) , U(Zijk_[i + No*j + No*No*k]) , V(Zijk_[j + No*k + No*No*i]) , W(Zijk_[k + No*i + No*No*j]) , A(maybeConjugate(Tijk_[i + No*j + No*No*k])) , B(maybeConjugate(Tijk_[j + No*k + No*No*i])) , C(maybeConjugate(Tijk_[k + No*i + No*No*j])) , value = F(3.0) * ( A * U + B * V + C * W ) - ( A + B + C ) * ( U + V + W ) ; energy += F(2.0) * value / denominator * facjk * facij; } // i } // j } // k } // ii } // jj } // kk return std::real(energy); } template void singlesContribution ( size_t No , size_t Nv , const ABCTuple &abc , F const* Tph , F const* VABij , F const* VACij , F const* VBCij , F *Zijk ) { const size_t a(abc[0]), b(abc[1]), c(abc[2]); for (size_t k=0; k < No; k++) for (size_t i=0; i < No; i++) for (size_t j=0; j < No; j++) { const size_t ijk = i + j*No + k*No*No , jk = j + No * k ; Zijk[ijk] += Tph[ a + i * Nv ] * VBCij[ j + k * No ]; Zijk[ijk] += Tph[ b + j * Nv ] * VACij[ i + k * No ]; Zijk[ijk] += Tph[ c + k * Nv ] * VABij[ i + j * No ]; } } template void doublesContribution ( const ABCTuple &abc , size_t const No , size_t const Nv // -- VABCI , F const* VABph , F const* VACph , F const* VBCph , F const* VBAph , F const* VCAph , F const* VCBph // -- VHHHA , F const* VhhhA , F const* VhhhB , F const* VhhhC // -- TA , F const* TAphh , F const* TBphh , F const* TCphh // -- TABIJ , F const* TABhh , F const* TAChh , F const* TBChh // -- TIJK , F *Tijk , atrip::Timings& chrono ) { auto& t_reorder = chrono["doubles:reorder"]; const size_t a = abc[0], b = abc[1], c = abc[2] , NoNo = No*No, NoNv = No*Nv ; #if defined(ATRIP_USE_DGEMM) #define _IJK_(i, j, k) i + j*No + k*NoNo #define REORDER(__II, __JJ, __KK) \ t_reorder.start(); \ for (size_t k = 0; k < No; k++) \ for (size_t j = 0; j < No; j++) \ for (size_t i = 0; i < No; i++) { \ Tijk[_IJK_(i, j, k)] += _t_buffer[_IJK_(__II, __JJ, __KK)]; \ } \ t_reorder.stop(); #define DGEMM_PARTICLES(__A, __B) \ atrip::xgemm( "T" \ , "N" \ , (int const*)&NoNo \ , (int const*)&No \ , (int const*)&Nv \ , &one \ , __A \ , (int const*)&Nv \ , __B \ , (int const*)&Nv \ , &zero \ , _t_buffer.data() \ , (int const*)&NoNo \ ); #define DGEMM_HOLES(__A, __B, __TRANSB) \ atrip::xgemm( "N" \ , __TRANSB \ , (int const*)&NoNo \ , (int const*)&No \ 
, (int const*)&No \ , &m_one \ , __A \ , (int const*)&NoNo \ , __B \ , (int const*)&No \ , &zero \ , _t_buffer.data() \ , (int const*)&NoNo \ ); #define MAYBE_CONJ(_conj, _buffer) \ for (size_t __i = 0; __i < NoNoNo; ++__i) \ _conj[__i] = maybeConjugate(_buffer[__i]); \ const size_t NoNoNo = No*NoNo; std::vector _t_buffer; _t_buffer.reserve(NoNoNo); F one{1.0}, m_one{-1.0}, zero{0.0}; t_reorder.start(); for (size_t k = 0; k < NoNoNo; k++) { // zero the Tijk Tijk[k] = 0.0; } t_reorder.stop(); chrono["doubles:holes"].start(); { // Holes part ============================================================ std::vector _vhhh(NoNoNo); // VhhhC[i + k*No + L*NoNo] * TABhh[L + j*No]; H1 MAYBE_CONJ(_vhhh, VhhhC) chrono["doubles:holes:1"].start(); DGEMM_HOLES(_vhhh.data(), TABhh, "N") REORDER(i, k, j) chrono["doubles:holes:1"].stop(); // VhhhC[j + k*No + L*NoNo] * TABhh[i + L*No]; H0 chrono["doubles:holes:2"].start(); DGEMM_HOLES(_vhhh.data(), TABhh, "T") REORDER(j, k, i) chrono["doubles:holes:2"].stop(); // VhhhB[i + j*No + L*NoNo] * TAChh[L + k*No]; H5 MAYBE_CONJ(_vhhh, VhhhB) chrono["doubles:holes:3"].start(); DGEMM_HOLES(_vhhh.data(), TAChh, "N") REORDER(i, j, k) chrono["doubles:holes:3"].stop(); // VhhhB[k + j*No + L*NoNo] * TAChh[i + L*No]; H3 chrono["doubles:holes:4"].start(); DGEMM_HOLES(_vhhh.data(), TAChh, "T") REORDER(k, j, i) chrono["doubles:holes:4"].stop(); // VhhhA[j + i*No + L*NoNo] * TBChh[L + k*No]; H1 MAYBE_CONJ(_vhhh, VhhhA) chrono["doubles:holes:5"].start(); DGEMM_HOLES(_vhhh.data(), TBChh, "N") REORDER(j, i, k) chrono["doubles:holes:5"].stop(); // VhhhA[k + i*No + L*NoNo] * TBChh[j + L*No]; H4 chrono["doubles:holes:6"].start(); DGEMM_HOLES(_vhhh.data(), TBChh, "T") REORDER(k, i, j) chrono["doubles:holes:6"].stop(); } chrono["doubles:holes"].stop(); #undef MAYBE_CONJ chrono["doubles:particles"].start(); { // Particle part ========================================================= // TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv]; P0 chrono["doubles:particles:1"].start(); DGEMM_PARTICLES(TAphh, VBCph) REORDER(i, j, k) chrono["doubles:particles:1"].stop(); // TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv]; P3 chrono["doubles:particles:2"].start(); DGEMM_PARTICLES(TAphh, VCBph) REORDER(i, k, j) chrono["doubles:particles:2"].stop(); // TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv]; P5 chrono["doubles:particles:3"].start(); DGEMM_PARTICLES(TCphh, VABph) REORDER(k, i, j) chrono["doubles:particles:3"].stop(); // TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv]; P2 chrono["doubles:particles:4"].start(); DGEMM_PARTICLES(TCphh, VBAph) REORDER(k, j, i) chrono["doubles:particles:4"].stop(); // TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv]; P1 chrono["doubles:particles:5"].start(); DGEMM_PARTICLES(TBphh, VACph) REORDER(j, i, k) chrono["doubles:particles:5"].stop(); // TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv]; P4 chrono["doubles:particles:6"].start(); DGEMM_PARTICLES(TBphh, VCAph) REORDER(j, k, i) chrono["doubles:particles:6"].stop(); } chrono["doubles:particles"].stop(); #undef REORDER #undef DGEMM_HOLES #undef DGEMM_PARTICLES #undef _IJK_ #else for (size_t k = 0; k < No; k++) for (size_t j = 0; j < No; j++) for (size_t i = 0; i < No; i++){ const size_t ijk = i + j*No + k*NoNo , jk = j + k*No ; Tijk[ijk] = 0.0; // :important // HOLE DIAGRAMS: TABHH and VHHHA for (size_t L = 0; L < No; L++){ // t[abLj] * V[Lcik] H1 // t[baLi] * V[Lcjk] H0 TODO: conjugate T for complex Tijk[ijk] -= TABhh[L + j*No] * VhhhC[i + k*No + L*NoNo]; Tijk[ijk] -= TABhh[i + L*No] * VhhhC[j + k*No + L*NoNo]; // t[acLk] * V[Lbij] H5 // t[caLi] 
* V[Lbkj] H3 Tijk[ijk] -= TAChh[L + k*No] * VhhhB[i + j*No + L*NoNo]; Tijk[ijk] -= TAChh[i + L*No] * VhhhB[k + j*No + L*NoNo]; // t[bcLk] * V[Laji] H2 // t[cbLj] * V[Laki] H4 Tijk[ijk] -= TBChh[L + k*No] * VhhhA[j + i*No + L*NoNo]; Tijk[ijk] -= TBChh[j + L*No] * VhhhA[k + i*No + L*NoNo]; } // PARTILCE DIAGRAMS: TAPHH and VABPH for (size_t E = 0; E < Nv; E++) { // t[aEij] * V[bcEk] P0 // t[aEik] * V[cbEj] P3 // TODO: CHECK THIS ONE, I DONT KNOW Tijk[ijk] += TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv]; Tijk[ijk] += TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv]; // t[cEki] * V[abEj] P5 // t[cEkj] * V[baEi] P2 Tijk[ijk] += TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv]; Tijk[ijk] += TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv]; // t[bEji] * V[acEk] P1 // t[bEjk] * V[caEi] P4 Tijk[ijk] += TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv]; Tijk[ijk] += TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv]; } } #endif } } #+end_src ** Blas The main matrix-matrix multiplication method used in this algorithm is mainly using the =DGEMM= function, which we declare as =extern= since it should be known only at link-time. #+begin_src c++ :tangle (atrip-blas-h) #pragma once namespace atrip { using Complex = std::complex; extern "C" { void dgemm_( const char *transa, const char *transb, const int *m, const int *n, const int *k, double *alpha, const double *a, const int *lda, const double *b, const int *ldb, double *beta, double *c, const int *ldc ); void zgemm_( const char *transa, const char *transb, const int *m, const int *n, const int *k, Complex *alpha, const Complex *A, const int *lda, const Complex *B, const int *ldb, Complex *beta, Complex *C, const int *ldc ); } template void xgemm(const char *transa, const char *transb, const int *m, const int *n, const int *k, F *alpha, const F *A, const int *lda, const F *B, const int *ldb, F *beta, F *C, const int *ldc) { dgemm_(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); } template <> void xgemm(const char *transa, const char *transb, const int *m, const int *n, const int *k, Complex *alpha, const Complex *A, const int *lda, const Complex *B, const int *ldb, Complex *beta, Complex *C, const int *ldc) { zgemm_(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); } } #+end_src ** Atrip *** Header #+begin_src c++ :tangle (atrip-atrip-h) #pragma once #include #include #include #include #include namespace atrip { struct Atrip { static int rank; static int np; static void init(); template struct Input { CTF::Tensor *ei = nullptr , *ea = nullptr , *Tph = nullptr , *Tpphh = nullptr , *Vpphh = nullptr , *Vhhhp = nullptr , *Vppph = nullptr ; int maxIterations = 0, iterationMod = -1, percentageMod = -1; bool barrier = false; bool chrono = false; Input& with_epsilon_i(CTF::Tensor * t) { ei = t; return *this; } Input& with_epsilon_a(CTF::Tensor * t) { ea = t; return *this; } Input& with_Tai(CTF::Tensor * t) { Tph = t; return *this; } Input& with_Tabij(CTF::Tensor * t) { Tpphh = t; return *this; } Input& with_Vabij(CTF::Tensor * t) { Vpphh = t; return *this; } Input& with_Vijka(CTF::Tensor * t) { Vhhhp = t; return *this; } Input& with_Vabci(CTF::Tensor * t) { Vppph = t; return *this; } Input& with_maxIterations(int i) { maxIterations = i; return *this; } Input& with_iterationMod(int i) { iterationMod = i; return *this; } Input& with_percentageMod(int i) { percentageMod = i; return *this; } Input& with_barrier(bool i) { barrier = i; return *this; } Input& with_chrono(bool i) { chrono = i; return *this; } }; struct Output { double energy; }; template static Output 
run(Input const& in); }; } #+end_src *** Main #+begin_src c++ :tangle (atrip-atrip-cxx) #include #include #include #include #include #include using namespace atrip; int Atrip::rank; int Atrip::np; // user printing block IterationDescriptor IterationDescription::descriptor; void atrip::registerIterationDescriptor(IterationDescriptor d) { IterationDescription::descriptor = d; } void Atrip::init() { MPI_Comm_rank(MPI_COMM_WORLD, &Atrip::rank); MPI_Comm_size(MPI_COMM_WORLD, &Atrip::np); } template Atrip::Output Atrip::run(Atrip::Input const& in) { const int np = Atrip::np; const int rank = Atrip::rank; MPI_Comm universe = in.ei->wrld->comm; // Timings in seconds ================================================{{{1 Timings chrono{}; const size_t No = in.ei->lens[0]; const size_t Nv = in.ea->lens[0]; LOG(0,"Atrip") << "No: " << No << "\n"; LOG(0,"Atrip") << "Nv: " << Nv << "\n"; // allocate the three scratches, see piecuch std::vector Tijk(No*No*No) // doubles only (see piecuch) , Zijk(No*No*No) // singles + doubles (see piecuch) // we need local copies of the following tensors on every // rank , epsi(No) , epsa(Nv) , Tai(No * Nv) ; in.ei->read_all(epsi.data()); in.ea->read_all(epsa.data()); in.Tph->read_all(Tai.data()); // COMMUNICATOR CONSTRUCTION ========================================={{{1 // // Construct a new communicator living only on a single rank int child_size = 1 , child_rank ; const int color = rank / child_size , crank = rank % child_size ; MPI_Comm child_comm; if (np == 1) { child_comm = universe; } else { MPI_Comm_split(universe, color, crank, &child_comm); MPI_Comm_rank(child_comm, &child_rank); MPI_Comm_size(child_comm, &child_size); } chrono["nv-slices"].start(); // BUILD SLICES PARAMETRIZED BY NV ==================================={{{1 LOG(0,"Atrip") << "BUILD NV-SLICES\n"; TAPHH taphh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); HHHA hhha(*in.Vhhhp, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); chrono["nv-slices"].stop(); chrono["nv-nv-slices"].start(); // BUILD SLICES PARAMETRIZED BY NV x NV =============================={{{1 LOG(0,"Atrip") << "BUILD NV x NV-SLICES\n"; ABPH abph(*in.Vppph, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); ABHH abhh(*in.Vpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); TABHH tabhh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); chrono["nv-nv-slices"].stop(); // all tensors std::vector< SliceUnion* > unions = {&taphh, &hhha, &abph, &abhh, &tabhh}; //CONSTRUCT TUPLE LIST ==============================================={{{1 LOG(0,"Atrip") << "BUILD TUPLE LIST\n"; const auto tuplesList = std::move(getTuplesList(Nv)); WITH_RANK << "tupList.size() = " << tuplesList.size() << "\n"; // GET ABC INDEX RANGE FOR RANK ======================================{{{1 auto abcIndex = getABCRange(np, rank, tuplesList); size_t nIterations = abcIndex.second - abcIndex.first; WITH_RANK << "abcIndex = " << pretty_print(abcIndex) << "\n"; LOG(0,"Atrip") << "#iterations: " << nIterations << "\n"; // first abc const ABCTuple firstAbc = tuplesList[abcIndex.first]; double energy(0.); const size_t iterationMod = (in.percentageMod > 0) ? 
nIterations * in.percentageMod / 100 : in.iterationMod , iteration1Percent = nIterations * 0.01 ; auto const isFakeTuple = [&tuplesList](size_t const i) { return i >= tuplesList.size(); }; using Database = typename Slice::Database; using LocalDatabase = typename Slice::LocalDatabase; auto communicateDatabase = [ &unions , np , &chrono ] (ABCTuple const& abc, MPI_Comm const& c) -> Database { chrono["db:comm:type:do"].start(); auto MPI_LDB_ELEMENT = Slice::mpi::localDatabaseElement(); chrono["db:comm:type:do"].stop(); chrono["db:comm:ldb"].start(); LocalDatabase ldb; for (auto const& tensor: unions) { auto const& tensorDb = tensor->buildLocalDatabase(abc); ldb.insert(ldb.end(), tensorDb.begin(), tensorDb.end()); } chrono["db:comm:ldb"].stop(); Database db(np * ldb.size(), ldb[0]); chrono["oneshot-db:comm:allgather"].start(); chrono["db:comm:allgather"].start(); MPI_Allgather( ldb.data() , ldb.size() , MPI_LDB_ELEMENT , db.data() , ldb.size() , MPI_LDB_ELEMENT , c); chrono["db:comm:allgather"].stop(); chrono["oneshot-db:comm:allgather"].stop(); chrono["db:comm:type:free"].start(); MPI_Type_free(&MPI_LDB_ELEMENT); chrono["db:comm:type:free"].stop(); return db; }; auto doIOPhase = [&unions, &rank, &np, &universe, &chrono] (Database const& db) { const size_t localDBLength = db.size() / np; size_t sendTag = 0 , recvTag = rank * localDBLength ; // RECIEVE PHASE ====================================================== { // At this point, we have already send to everyone that fits auto const& begin = &db[rank * localDBLength] , end = begin + localDBLength ; for (auto it = begin; it != end; ++it) { recvTag++; auto const& el = *it; auto& u = unionByName(unions, el.name); WITH_DBG std::cout << rank << ":r" << "♯" << recvTag << " =>" << " «n" << el.name << ", t" << el.info.type << ", s" << el.info.state << "»" << " ⊙ {" << rank << "⇐" << el.info.from.rank << ", " << el.info.from.source << "}" << " ∴ {" << el.info.tuple[0] << ", " << el.info.tuple[1] << "}" << "\n" ; chrono["db:io:recv"].start(); u.receive(el.info, recvTag); chrono["db:io:recv"].stop(); } // recv } // SEND PHASE ========================================================= for (size_t otherRank = 0; otherRank::LocalDatabaseElement const& el = *it; if (el.info.from.rank != rank) continue; auto& u = unionByName(unions, el.name); WITH_DBG std::cout << rank << ":s" << "♯" << sendTag << " =>" << " «n" << el.name << ", t" << el.info.type << ", s" << el.info.state << "»" << " ⊙ {" << el.info.from.rank << "⇒" << otherRank << ", " << el.info.from.source << "}" << " ∴ {" << el.info.tuple[0] << ", " << el.info.tuple[1] << "}" << "\n" ; chrono["db:io:send"].start(); u.send(otherRank, el.info, sendTag); chrono["db:io:send"].stop(); } // send phase } // otherRank }; #if defined(HAVE_OCD) || defined(ATRIP_PRINT_TUPLES) std::map tupleEnergies; #endif const double doublesFlops = double(No) * double(No) * double(No) * (double(No) + double(Nv)) * 1 * (traits::isComplex() ? 
2.0 : 1.0) * 6 / 1e9 ; // START MAIN LOOP ======================================================{{{1 for ( size_t i = abcIndex.first, iteration = 1 ; i < abcIndex.second ; i++, iteration++ ) { chrono["iterations"].start(); // check overhead from chrono over all iterations chrono["start:stop"].start(); chrono["start:stop"].stop(); // check overhead of doing a barrier at the beginning chrono["oneshot-mpi:barrier"].start(); chrono["mpi:barrier"].start(); // TODO: REMOVE if (in.barrier == 1) MPI_Barrier(universe); chrono["mpi:barrier"].stop(); chrono["oneshot-mpi:barrier"].stop(); if (iteration % iterationMod == 0 || iteration == iteration1Percent) { if (IterationDescription::descriptor) { IterationDescription::descriptor({ iteration, nIterations, chrono["iterations"].count() }); } LOG(0,"Atrip") << "iteration " << iteration << " [" << 100 * iteration / nIterations << "%]" << " (" << doublesFlops * iteration / chrono["doubles"].count() << "GF)" << " (" << doublesFlops * iteration / chrono["iterations"].count() << "GF)" << " ===========================\n"; // PRINT TIMINGS if (in.chrono) for (auto const& pair: chrono) LOG(1, " ") << pair.first << " :: " << pair.second.count() << std::endl; } const ABCTuple abc = isFakeTuple(i) ? tuplesList[tuplesList.size() - 1] : tuplesList[i] , *abcNext = i == (abcIndex.second - 1) ? nullptr : isFakeTuple(i + 1) ? &tuplesList[tuplesList.size() - 1] : &tuplesList[i + 1] ; chrono["with_rank"].start(); WITH_RANK << " :it " << iteration << " :abc " << pretty_print(abc) << " :abcN " << (abcNext ? pretty_print(*abcNext) : "None") << "\n"; chrono["with_rank"].stop(); // COMM FIRST DATABASE ================================================{{{1 if (i == abcIndex.first) { WITH_RANK << "__first__:first database ............ \n"; const auto __db = communicateDatabase(abc, universe); WITH_RANK << "__first__:first database communicated \n"; WITH_RANK << "__first__:first database io phase \n"; doIOPhase(__db); WITH_RANK << "__first__:first database io phase DONE\n"; WITH_RANK << "__first__::::Unwrapping all slices for first database\n"; for (auto& u: unions) u->unwrapAll(abc); WITH_RANK << "__first__::::Unwrapping all slices for first database DONE\n"; MPI_Barrier(universe); } // COMM NEXT DATABASE ================================================={{{1 if (abcNext) { WITH_RANK << "__comm__:" << iteration << "th communicating database\n"; chrono["db:comm"].start(); //const auto db = communicateDatabase(*abcNext, universe); Database db = communicateDatabase(*abcNext, universe); chrono["db:comm"].stop(); chrono["db:io"].start(); doIOPhase(db); chrono["db:io"].stop(); WITH_RANK << "__comm__:" << iteration << "th database io phase DONE\n"; } // COMPUTE DOUBLES ===================================================={{{1 OCD_Barrier(universe); if (!isFakeTuple(i)) { WITH_RANK << iteration << "-th doubles\n"; WITH_CHRONO(chrono["oneshot-unwrap"], WITH_CHRONO(chrono["unwrap"], WITH_CHRONO(chrono["unwrap:doubles"], for (auto& u: decltype(unions){&abph, &hhha, &taphh, &tabhh}) { u->unwrapAll(abc); } ))) chrono["oneshot-doubles"].start(); chrono["doubles"].start(); doublesContribution( abc, (size_t)No, (size_t)Nv // -- VABCI , abph.unwrapSlice(Slice::AB, abc) , abph.unwrapSlice(Slice::AC, abc) , abph.unwrapSlice(Slice::BC, abc) , abph.unwrapSlice(Slice::BA, abc) , abph.unwrapSlice(Slice::CA, abc) , abph.unwrapSlice(Slice::CB, abc) // -- VHHHA , hhha.unwrapSlice(Slice::A, abc) , hhha.unwrapSlice(Slice::B, abc) , hhha.unwrapSlice(Slice::C, abc) // -- TA , taphh.unwrapSlice(Slice::A, abc) , 
taphh.unwrapSlice(Slice::B, abc) , taphh.unwrapSlice(Slice::C, abc) // -- TABIJ , tabhh.unwrapSlice(Slice::AB, abc) , tabhh.unwrapSlice(Slice::AC, abc) , tabhh.unwrapSlice(Slice::BC, abc) // -- TIJK , Tijk.data() , chrono ); WITH_RANK << iteration << "-th doubles done\n"; chrono["doubles"].stop(); chrono["oneshot-doubles"].stop(); } // COMPUTE SINGLES =================================================== {{{1 OCD_Barrier(universe); if (!isFakeTuple(i)) { WITH_CHRONO(chrono["oneshot-unwrap"], WITH_CHRONO(chrono["unwrap"], WITH_CHRONO(chrono["unwrap:singles"], abhh.unwrapAll(abc); ))) chrono["reorder"].start(); for (size_t I(0); I < Zijk.size(); I++) Zijk[I] = Tijk[I]; chrono["reorder"].stop(); chrono["singles"].start(); singlesContribution( No, Nv, abc , Tai.data() , abhh.unwrapSlice(Slice::AB, abc) , abhh.unwrapSlice(Slice::AC, abc) , abhh.unwrapSlice(Slice::BC, abc) , Zijk.data()); chrono["singles"].stop(); } // COMPUTE ENERGY ==================================================== {{{1 if (!isFakeTuple(i)) { double tupleEnergy(0.); int distinct(0); if (abc[0] == abc[1]) distinct++; if (abc[1] == abc[2]) distinct--; const F epsabc(epsa[abc[0]] + epsa[abc[1]] + epsa[abc[2]]); chrono["energy"].start(); if ( distinct == 0) tupleEnergy = getEnergyDistinct(epsabc, epsi, Tijk, Zijk); else tupleEnergy = getEnergySame(epsabc, epsi, Tijk, Zijk); chrono["energy"].stop(); #if defined(HAVE_OCD) || defined(ATRIP_PRINT_TUPLES) tupleEnergies[abc] = tupleEnergy; #endif energy += tupleEnergy; } if (isFakeTuple(i)) { // fake iterations should also unwrap whatever they got WITH_RANK << iteration << "th unwrapping because of fake in " << i << "\n"; for (auto& u: unions) u->unwrapAll(abc); } #ifdef HAVE_OCD for (auto const& u: unions) { WITH_RANK << "__dups__:" << iteration << "-th n" << u->name << " checking duplicates\n"; u->checkForDuplicates(); } #endif // CLEANUP UNIONS ===================================================={{{1 OCD_Barrier(universe); if (abcNext) { chrono["gc"].start(); WITH_RANK << "__gc__:" << iteration << "-th cleaning up.......\n"; for (auto& u: unions) { u->unwrapAll(abc); WITH_RANK << "__gc__:n" << u->name << " :it " << iteration << " :abc " << pretty_print(abc) << " :abcN " << pretty_print(*abcNext) << "\n"; // for (auto const& slice: u->slices) // WITH_RANK << "__gc__:guts:" << slice.info << "\n"; u->clearUnusedSlicesForNext(*abcNext); WITH_RANK << "__gc__: checking validity\n"; #ifdef HAVE_OCD // check for validity of the slices for (auto type: u->sliceTypes) { auto tuple = Slice::subtupleBySlice(abc, type); for (auto& slice: u->slices) { if ( slice.info.type == type && slice.info.tuple == tuple && slice.isDirectlyFetchable() ) { if (slice.info.state == Slice::Dispatched) throw std::domain_error( "This slice should not be undispatched! " + pretty_print(slice.info)); } } } #endif } chrono["gc"].stop(); } WITH_RANK << iteration << "-th cleaning up....... 
DONE\n"; chrono["iterations"].stop(); // ITERATION END ====================================================={{{1 } // END OF MAIN LOOP MPI_Barrier(universe); // PRINT TUPLES ========================================================={{{1 #if defined(HAVE_OCD) || defined(ATRIP_PRINT_TUPLES) LOG(0,"Atrip") << "tuple energies" << "\n"; for (size_t i = 0; i < np; i++) { MPI_Barrier(universe); for (auto const& pair: tupleEnergies) { if (i == rank) std::cout << pair.first[0] << " " << pair.first[1] << " " << pair.first[2] << std::setprecision(15) << std::setw(23) << " tupleEnergy: " << pair.second << "\n" ; } } #endif // COMMUNICATE THE ENERGIES ============================================={{{1 LOG(0,"Atrip") << "COMMUNICATING ENERGIES \n"; double globalEnergy = 0; MPI_Reduce(&energy, &globalEnergy, 1, MPI_DOUBLE, MPI_SUM, 0, universe); WITH_RANK << "local energy " << energy << "\n"; LOG(0, "Atrip") << "Energy: " << std::setprecision(15) << std::setw(23) << (- globalEnergy) << std::endl; // PRINT TIMINGS {{{1 if (in.chrono) for (auto const& pair: chrono) LOG(0,"atrip:chrono") << pair.first << " " << pair.second.count() << std::endl; LOG(0, "atrip:flops(doubles)") << nIterations * doublesFlops / chrono["doubles"].count() << "\n"; LOG(0, "atrip:flops(iterations)") << nIterations * doublesFlops / chrono["iterations"].count() << "\n"; // TODO: change the sign in the getEnergy routines return { - globalEnergy }; } // instantiate template Atrip::Output Atrip::run(Atrip::Input const& in); template Atrip::Output Atrip::run(Atrip::Input const& in); #+end_src ** Debug and Logging *** Macros #+begin_src c++ :tangle (atrip-debug-h) #pragma once #include #define ATRIP_BENCHMARK //#define ATRIP_DONT_SLICE #ifndef ATRIP_DEBUG # define ATRIP_DEBUG 1 #endif //#define ATRIP_WORKLOAD_DUMP #define ATRIP_USE_DGEMM //#define ATRIP_PRINT_TUPLES #ifndef ATRIP_DEBUG #define ATRIP_DEBUG 1 #endif #if ATRIP_DEBUG == 4 # pragma message("WARNING: You have OCD debugging ABC triples " \ "expect GB of output and consult your therapist") # include # define HAVE_OCD # define OCD_Barrier(com) MPI_Barrier(com) # define WITH_OCD # define WITH_ROOT if (atrip::Atrip::rank == 0) # define WITH_SPECIAL(r) if (atrip::Atrip::rank == r) # define WITH_RANK std::cout << atrip::Atrip::rank << ": " # define WITH_CRAZY_DEBUG # define WITH_DBG # define DBG(...) dbg(__VA_ARGS__) #elif ATRIP_DEBUG == 3 # pragma message("WARNING: You have crazy debugging ABC triples," \ " expect GB of output") # include # define OCD_Barrier(com) # define WITH_OCD if (false) # define WITH_ROOT if (atrip::Atrip::rank == 0) # define WITH_SPECIAL(r) if (atrip::Atrip::rank == r) # define WITH_RANK std::cout << atrip::Atrip::rank << ": " # define WITH_CRAZY_DEBUG # define WITH_DBG # define DBG(...) dbg(__VA_ARGS__) #elif ATRIP_DEBUG == 2 # pragma message("WARNING: You have some debugging info for ABC triples") # include # define OCD_Barrier(com) # define WITH_OCD if (false) # define WITH_ROOT if (atrip::Atrip::rank == 0) # define WITH_SPECIAL(r) if (atrip::Atrip::rank == r) # define WITH_RANK std::cout << atrip::Atrip::rank << ": " # define WITH_CRAZY_DEBUG if (false) # define WITH_DBG # define DBG(...) dbg(__VA_ARGS__) #else # define OCD_Barrier(com) # define WITH_OCD if (false) # define WITH_ROOT if (false) # define WITH_SPECIAL(r) if (false) # define WITH_RANK if (false) std::cout << atrip::Atrip::rank << ": " # define WITH_DBG if (false) # define WITH_CRAZY_DEBUG if (false) # define DBG(...) 
#endif
#+end_src

Users of the library can redefine the =LOG= macro; if it is not defined, it
defaults to the following:

#+begin_src c++ :tangle (atrip-debug-h)
#ifndef LOG
#define LOG(level, name) if (Atrip::rank == 0) std::cout << name << ": "
#endif
#+end_src

Furthermore, if you do not wish to see any output from ATRIP, simply define
=ATRIP_NO_OUTPUT=:

#+begin_src c++ :tangle (atrip-debug-h)
#ifdef ATRIP_NO_OUTPUT
# undef LOG
# define LOG(level, name) if (false) std::cout << name << ": "
#endif
#+end_src

*** Iteration informer

In general one wants to print some progress message in every iteration.  For
this purpose a developer can register a callback function: its argument is the
=IterationDescription= structure defined in the [[IterationDescriptor]] block
below, and it returns nothing.

#+name: IterationDescriptor
#+begin_src c++ :tangle (atrip-debug-h)
#include <functional> // for std::function

namespace atrip {

  struct IterationDescription;
  using IterationDescriptor = std::function<void(IterationDescription const&)>;

  struct IterationDescription {
    static IterationDescriptor descriptor;
    size_t currentIteration;
    size_t totalIterations;
    double currentElapsedTime;
  };

  void registerIterationDescriptor(IterationDescriptor);

}
#+end_src

** Include header

#+begin_src c++ :tangle (atrip-main-h)
#pragma once
#include
#+end_src
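
** Usage sketch

The following is a minimal, non-tangled sketch of how a caller might drive the
interface above.  It assumes the main header is available under the
hypothetical name =atrip.hpp= and that the caller already owns CTF tensors
with the shapes expected by =Atrip::Input=.

#+begin_src c++
#include <iostream>
#include <atrip.hpp> // hypothetical name of the tangled main header

// The tensor pointers are assumed to be already-constructed CTF tensors
// matching the fields of Atrip::Input (see the Atrip header above).
double run_triples(CTF::Tensor<double> *ei, CTF::Tensor<double> *ea,
                   CTF::Tensor<double> *Tai, CTF::Tensor<double> *Tabij,
                   CTF::Tensor<double> *Vabij, CTF::Tensor<double> *Vijka,
                   CTF::Tensor<double> *Vabci) {
  // Sets Atrip::rank and Atrip::np from MPI_COMM_WORLD
  atrip::Atrip::init();

  // Optional: print a short progress line through the iteration informer
  atrip::registerIterationDescriptor(
      [](atrip::IterationDescription const& d) {
        if (atrip::Atrip::rank == 0)
          std::cout << "progress: " << d.currentIteration << "/"
                    << d.totalIterations << " after "
                    << d.currentElapsedTime << "s\n";
      });

  // Assemble the input with the builder methods of Atrip::Input
  auto in = atrip::Atrip::Input<double>()
                .with_epsilon_i(ei)
                .with_epsilon_a(ea)
                .with_Tai(Tai)
                .with_Tabij(Tabij)
                .with_Vabij(Vabij)
                .with_Vijka(Vijka)
                .with_Vabci(Vabci)
                .with_percentageMod(10) // report every 10% of the iterations
                .with_chrono(true);     // print the timings at the end

  const auto out = atrip::Atrip::run<double>(in);
  return out.energy;
}
#+end_src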