atrip/atrip.org

2710 lines
86 KiB
Org Mode

#+title: ATRIP: An MPI-asynchronous implementation of CCSD(T)
#+PROPERTY: header-args+ :noweb yes :comments noweb :mkdirp t
* Implementation
The algorithm uses two main data types, the =Slice= and the
=SliceUnion= as a container and resource manager of the =Slice=.
** The slice
#+begin_src c++ :tangle (atrip-slice-h)
#pragma once
#include <iostream>
#include <algorithm>
#include <vector>
#include <mpi.h>
#include <atrip/Tuples.hpp>
#include <atrip/Utils.hpp>
#include <atrip/Blas.hpp>
namespace atrip {
template <typename FF> FF maybeConjugate(const FF a) { return a; }
template <> Complex maybeConjugate(const Complex a) { return std::conj(a); }
namespace traits {
template <typename FF> bool isComplex() { return false; };
template <> bool isComplex<Complex>() { return true; };
namespace mpi {
template <typename FF> MPI_Datatype datatypeOf(void);
template <> MPI_Datatype datatypeOf<double>() { return MPI_DOUBLE; }
template <> MPI_Datatype datatypeOf<Complex>() { return MPI_DOUBLE_COMPLEX; }
}
}
template <typename F=double>
struct Slice {
#+end_src
A slice is the concept of a subset of values of a given tensor.
As an example, for the doubles amplitudes \( T^{ab}_{ij} \), one need two kinds of objects:
- the object \( \mathsf{T}(a)^b_{ij} \) which for every \( a \) gets assigned the
tensor \( T^{ab}_{ij} \) of size \( N_\mathrm{o}^2 \times N_\mathrm{v} \)
- the object \( \mathsf{T}(a,b)_{ij} \) which for every pair of \( a, b \)
corresponds the \( N_\mathrm{o}^2 \)-sized tensor \( T^{ab}_{ij} \).
#+begin_src c++ :tangle (atrip-slice-h)
// ASSOCIATED TYPES %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
struct Location { size_t rank; size_t source; };
enum Type
{ A = 10
, B
, C
// Two-parameter slices
, AB = 20
, BC
, AC
// for abci and the doubles
, CB
, BA
, CA
// The non-typed slice
, Blank = 404
};
enum State {
// Fetch represents the state where a slice is to be fetched
// and has a valid data pointer that can be written to
Fetch = 0,
// Dispatches represents the state that an MPI call has been
// dispatched in order to get the data, but the data has not been
// yet unwrapped, the data might be there or we might have to wait.
Dispatched = 2,
// Ready means that the data pointer can be read from
Ready = 1,
// Self sufficient is a slice when its contents are located
// in the same rank that it lives, so that it does not have to
// fetch from no one else.
SelfSufficient = 911,
// Recycled means that this slice gets its data pointer from another
// slice, so it should not be written to
Recycled = 123,
// Acceptor means that the Slice can accept a new Slice, it is
// the counterpart of the Blank type, but for states
Acceptor = 405
};
struct Info {
// which part of a,b,c the slice holds
PartialTuple tuple;
// The type of slice for the user to retrieve the correct one
Type type;
// What is the state of the slice
State state;
// Where the slice is to be retrieved
// NOTE: this can actually be computed from tuple
Location from;
// If the data are actually to be found in this other slice
Type recycling;
Info() : tuple{0,0}
, type{Blank}
, state{Acceptor}
, from{0,0}
, recycling{Blank}
{}
};
using Ty_x_Tu = std::pair< Type, PartialTuple >;
// Names of the integrals that are considered in CCSD(T)
enum Name
{ TA = 100
, VIJKA = 101
, VABCI = 200
, TABIJ = 201
, VABIJ = 202
};
// DATABASE ==========================================================={{{1
struct LocalDatabaseElement {
Slice<F>::Name name;
Slice<F>::Info info;
};
using LocalDatabase = std::vector<LocalDatabaseElement>;
using Database = LocalDatabase;
// STATIC METHODS ===========================================================
//
// They are useful to organize the structure of slices
struct mpi {
static MPI_Datatype vector(size_t n, MPI_Datatype const& DT) {
MPI_Datatype dt;
MPI_Type_vector(n, 1, 1, DT, &dt);
MPI_Type_commit(&dt);
return dt;
}
static MPI_Datatype sliceLocation () {
constexpr int n = 2;
// create a sliceLocation to measure in the current architecture
// the packing of the struct
Slice<F>::Location measure;
MPI_Datatype dt;
const std::vector<int> lengths(n, 1);
const MPI_Datatype types[n] = {usizeDt(), usizeDt()};
static_assert(sizeof(Slice<F>::Location) == 2 * sizeof(size_t),
"The Location packing is wrong in your compiler");
// measure the displacements in the struct
size_t j = 0;
MPI_Aint base_address, displacements[n];
MPI_Get_address(&measure, &base_address);
MPI_Get_address(&measure.rank, &displacements[j++]);
MPI_Get_address(&measure.source, &displacements[j++]);
for (size_t i = 0; i < n; i++)
displacements[i] = MPI_Aint_diff(displacements[i], base_address);
MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt);
MPI_Type_commit(&dt);
return dt;
}
static MPI_Datatype usizeDt() { return MPI_UINT64_T; }
static MPI_Datatype sliceInfo () {
constexpr int n = 5;
MPI_Datatype dt;
Slice<F>::Info measure;
const std::vector<int> lengths(n, 1);
const MPI_Datatype types[n]
= { vector(2, usizeDt())
/*, MPI_UINT64_T*/
, vector(sizeof(enum Type), MPI_CHAR)
/*, MPI_UINT64_T*/
, vector(sizeof(enum State), MPI_CHAR)
/*, vector(sizeof(Location), MPI_CHAR)*/
, sliceLocation()
, vector(sizeof(enum Type), MPI_CHAR)
/*, MPI_UINT64_T*/
};
static_assert(sizeof(enum Type) == 4, "Enum type not 4 bytes long");
static_assert(sizeof(enum State) == 4, "Enum State not 4 bytes long");
static_assert(sizeof(enum Name) == 4, "Enum Name not 4 bytes long");
// create the displacements from the info measurement struct
size_t j = 0;
MPI_Aint base_address, displacements[n];
MPI_Get_address(&measure, &base_address);
MPI_Get_address(&measure.tuple[0], &displacements[j++]);
MPI_Get_address(&measure.type, &displacements[j++]);
MPI_Get_address(&measure.state, &displacements[j++]);
MPI_Get_address(&measure.from, &displacements[j++]);
MPI_Get_address(&measure.recycling, &displacements[j++]);
for (size_t i = 0; i < n; i++)
displacements[i] = MPI_Aint_diff(displacements[i], base_address);
MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt);
MPI_Type_commit(&dt);
return dt;
}
static MPI_Datatype localDatabaseElement () {
constexpr int n = 2;
MPI_Datatype dt;
LocalDatabaseElement measure;
const std::vector<int> lengths(n, 1);
const MPI_Datatype types[n]
= { vector(sizeof(enum Name), MPI_CHAR)
/*= { MPI_UINT64_T*/
, sliceInfo()
};
// measure the displacements in the struct
size_t j = 0;
MPI_Aint base_address, displacements[n];
MPI_Get_address(&measure, &base_address);
MPI_Get_address(&measure.name, &displacements[j++]);
MPI_Get_address(&measure.info, &displacements[j++]);
for (size_t i = 0; i < n; i++)
displacements[i] = MPI_Aint_diff(displacements[i], base_address);
static_assert( sizeof(LocalDatabaseElement) == sizeof(measure)
, "Measure has bad size");
MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt);
MPI_Type_commit(&dt);
return vector(sizeof(LocalDatabaseElement), MPI_CHAR);
// TODO: write tests in order to know if this works
return dt;
}
};
static
PartialTuple subtupleBySlice(ABCTuple abc, Type sliceType) {
switch (sliceType) {
case AB: return {abc[0], abc[1]};
case BC: return {abc[1], abc[2]};
case AC: return {abc[0], abc[2]};
case CB: return {abc[2], abc[1]};
case BA: return {abc[1], abc[0]};
case CA: return {abc[2], abc[0]};
case A: return {abc[0], 0};
case B: return {abc[1], 0};
case C: return {abc[2], 0};
default: throw "Switch statement not exhaustive!";
}
}
/**
,* It is important here to return a reference to a Slice
,* not to accidentally copy the associated buffer of the slice.
,*/
static Slice<F>& findOneByType(std::vector<Slice<F>> &slices, Slice<F>::Type type) {
const auto sliceIt
= std::find_if(slices.begin(), slices.end(),
[&type](Slice<F> const& s) {
return type == s.info.type;
});
WITH_CRAZY_DEBUG
WITH_RANK
<< "\t__ looking for " << type << "\n";
if (sliceIt == slices.end())
throw std::domain_error("Slice by type not found!");
return *sliceIt;
}
/*
,* Check if an info has
,*
,*/
static std::vector<Slice<F>*> hasRecycledReferencingToIt
( std::vector<Slice<F>> &slices
, Info const& info
) {
std::vector<Slice<F>*> result;
for (auto& s: slices)
if ( s.info.recycling == info.type
&& s.info.tuple == info.tuple
&& s.info.state == Recycled
) result.push_back(&s);
return result;
}
static Slice<F>&
findRecycledSource (std::vector<Slice<F>> &slices, Slice<F>::Info info) {
const auto sliceIt
= std::find_if(slices.begin(), slices.end(),
[&info](Slice<F> const& s) {
return info.recycling == s.info.type
&& info.tuple == s.info.tuple
&& State::Recycled != s.info.state
;
});
WITH_CRAZY_DEBUG
WITH_RANK << "__slice__:find: recycling source of "
<< pretty_print(info) << "\n";
if (sliceIt == slices.end())
throw std::domain_error( "Slice not found: "
+ pretty_print(info)
+ " rank: "
+ pretty_print(Atrip::rank)
);
WITH_RANK << "__slice__:find: " << pretty_print(sliceIt->info) << "\n";
return *sliceIt;
}
static Slice<F>& findByTypeAbc
( std::vector<Slice<F>> &slices
, Slice<F>::Type type
, ABCTuple const& abc
) {
const auto tuple = Slice<F>::subtupleBySlice(abc, type);
const auto sliceIt
= std::find_if(slices.begin(), slices.end(),
[&type, &tuple](Slice<F> const& s) {
return type == s.info.type
&& tuple == s.info.tuple
;
});
WITH_CRAZY_DEBUG
WITH_RANK << "__slice__:find:" << type << " and tuple "
<< pretty_print(tuple)
<< "\n";
if (sliceIt == slices.end())
throw std::domain_error( "Slice not found: "
+ pretty_print(tuple)
+ ", "
+ pretty_print(type)
+ " rank: "
+ pretty_print(Atrip::rank)
);
return *sliceIt;
}
static Slice<F>& findByInfo(std::vector<Slice<F>> &slices,
Slice<F>::Info const& info) {
const auto sliceIt
= std::find_if(slices.begin(), slices.end(),
[&info](Slice<F> const& s) {
// TODO: maybe implement comparison in Info struct
return info.type == s.info.type
&& info.state == s.info.state
&& info.tuple == s.info.tuple
&& info.from.rank == s.info.from.rank
&& info.from.source == s.info.from.source
;
});
WITH_CRAZY_DEBUG
WITH_RANK << "__slice__:find:looking for " << pretty_print(info) << "\n";
if (sliceIt == slices.end())
throw std::domain_error( "Slice by info not found: "
+ pretty_print(info));
return *sliceIt;
}
// SLICE DEFINITION =================================================={{{1
// ATTRIBUTES ============================================================
Info info;
F *data;
MPI_Request request;
const size_t size;
void markReady() noexcept {
info.state = Ready;
info.recycling = Blank;
}
/*
,* This means that the data is there
,*/
bool isUnwrapped() const noexcept {
return info.state == Ready
|| info.state == SelfSufficient
;
}
bool isUnwrappable() const noexcept {
return isUnwrapped()
|| info.state == Recycled
|| info.state == Dispatched
;
}
inline bool isDirectlyFetchable() const noexcept {
return info.state == Ready || info.state == Dispatched;
}
void free() noexcept {
info.tuple = {0, 0};
info.type = Blank;
info.state = Acceptor;
info.from = {0, 0};
info.recycling = Blank;
data = nullptr;
}
inline bool isFree() const noexcept {
return info.tuple == PartialTuple{0, 0}
&& info.type == Blank
&& info.state == Acceptor
&& info.from.rank == 0
&& info.from.source == 0
&& info.recycling == Blank
&& data == nullptr
;
}
/*
,* This function answers the question, which slices can be recycled.
,*
,* A slice can only be recycled if it is Fetch or Ready and has
,* a valid datapointer.
,*
,* In particular, SelfSufficient are not recyclable, since it is easier
,* just to create a SelfSufficient slice than deal with data dependencies.
,*
,* Furthermore, a recycled slice is not recyclable, if this is the case
,* then it is either bad design or a bug.
,*/
inline bool isRecyclable() const noexcept {
return ( info.state == Dispatched
|| info.state == Ready
|| info.state == Fetch
)
&& hasValidDataPointer()
;
}
/*
,* This function describes if a slice has a valid data pointer.
,*
,* This is important to know if the slice has some data to it, also
,* some structural checks are done, so that it should not be Acceptor
,* or Blank, if this is the case then it is a bug.
,*/
inline bool hasValidDataPointer() const noexcept {
return data != nullptr
&& info.state != Acceptor
&& info.type != Blank
;
}
void unwrapAndMarkReady() {
if (info.state == Ready) return;
if (info.state != Dispatched)
throw
std::domain_error("Can't unwrap a non-ready, non-dispatched slice!");
markReady();
MPI_Status status;
#ifdef HAVE_OCD
WITH_RANK << "__slice__:mpi: waiting " << "\n";
#endif
const int errorCode = MPI_Wait(&request, &status);
if (errorCode != MPI_SUCCESS)
throw "MPI ERROR HAPPENED....";
#ifdef HAVE_OCD
char errorString[MPI_MAX_ERROR_STRING];
int errorSize;
MPI_Error_string(errorCode, errorString, &errorSize);
WITH_RANK << "__slice__:mpi: status "
<< "{ .source=" << status.MPI_SOURCE
<< ", .tag=" << status.MPI_TAG
<< ", .error=" << status.MPI_ERROR
<< ", .errCode=" << errorCode
<< ", .err=" << errorString
<< " }"
<< "\n";
#endif
}
Slice(size_t size_)
: info({})
, data(nullptr)
, size(size_)
{}
}; // struct Slice
template <typename F=double>
std::ostream& operator<<(std::ostream& out, typename Slice<F>::Location const& v) {
// TODO: remove me
out << "{.r(" << v.rank << "), .s(" << v.source << ")};";
return out;
}
template <typename F=double>
std::ostream& operator<<(std::ostream& out, typename Slice<F>::Info const& i) {
out << "«t" << i.type << ", s" << i.state << "»"
<< " ⊙ {" << i.from.rank << ", " << i.from.source << "}"
<< " ∴ {" << i.tuple[0] << ", " << i.tuple[1] << "}"
<< " ♲t" << i.recycling
;
return out;
}
} // namespace atrip
#+end_src
** Utils
#+begin_src c++ :tangle (atrip-utils-h)
#pragma once
#include <sstream>
#include <string>
#include <map>
#include <chrono>
#include <ctf.hpp>
namespace atrip {
template <typename T>
std::string pretty_print(T&& value) {
std::stringstream stream;
#if ATRIP_DEBUG > 1
dbg::pretty_print(stream, std::forward<T>(value));
#endif
return stream.str();
}
#define WITH_CHRONO(__chrono, ...) \
__chrono.start(); __VA_ARGS__ __chrono.stop();
struct Timer {
using Clock = std::chrono::high_resolution_clock;
using Event = std::chrono::time_point<Clock>;
std::chrono::duration<double> duration;
Event _start;
inline void start() noexcept { _start = Clock::now(); }
inline void stop() noexcept { duration += Clock::now() - _start; }
inline void clear() noexcept { duration *= 0; }
inline double count() const noexcept { return duration.count(); }
};
using Timings = std::map<std::string, Timer>;
}
#+end_src
** The rank mapping
#+begin_src c++ :tangle (atrip-rankmap-h)
#pragma once
#include <vector>
#include <algorithm>
#include <atrip/Slice.hpp>
namespace atrip {
template <typename F=double>
struct RankMap {
std::vector<size_t> const lengths;
size_t const np, size;
RankMap(std::vector<size_t> lens, size_t np_)
: lengths(lens)
, np(np_)
, size(std::accumulate(lengths.begin(), lengths.end(),
1UL, std::multiplies<size_t>()))
{ assert(lengths.size() <= 2); }
size_t find(typename Slice<F>::Location const& p) const noexcept {
return p.source * np + p.rank;
}
size_t nSources() const noexcept {
return size / np + size_t(size % np != 0);
}
bool isPaddingRank(size_t rank) const noexcept {
return size % np == 0
? false
: rank > (size % np - 1)
;
}
bool isSourcePadding(size_t rank, size_t source) const noexcept {
return source == nSources() && isPaddingRank(rank);
}
typename Slice<F>::Location
find(ABCTuple const& abc, typename Slice<F>::Type sliceType) const noexcept {
// tuple = {11, 8} when abc = {11, 8, 9} and sliceType = AB
const auto tuple = Slice<F>::subtupleBySlice(abc, sliceType);
const size_t index
= tuple[0]
+ tuple[1] * (lengths.size() > 1 ? lengths[0] : 0)
;
return
{ index % np
, index / np
};
}
};
}
#+end_src
** The slice union
#+begin_src c++ :tangle (atrip-slice-union-h)
#pragma once
#include <atrip/Debug.hpp>
#include <atrip/Slice.hpp>
#include <atrip/RankMap.hpp>
namespace atrip {
template <typename F=double>
struct SliceUnion {
using Tensor = CTF::Tensor<F>;
virtual void
sliceIntoBuffer(size_t iteration, Tensor &to, Tensor const& from) = 0;
/*
* This function should enforce an important property of a SliceUnion.
* Namely, there can be no two Slices of the same nature.
*
* This means that there can be at most one slice with a given Ty_x_Tu.
*/
void checkForDuplicates() const {
std::vector<typename Slice<F>::Ty_x_Tu> tytus;
for (auto const& s: slices) {
if (s.isFree()) continue;
tytus.push_back({s.info.type, s.info.tuple});
}
for (auto const& tytu: tytus) {
if (std::count(tytus.begin(), tytus.end(), tytu) > 1)
throw "Invariance violated, more than one slice with same Ty_x_Tu";
}
}
std::vector<typename Slice<F>::Ty_x_Tu> neededSlices(ABCTuple const& abc) {
std::vector<typename Slice<F>::Ty_x_Tu> needed(sliceTypes.size());
// build the needed vector
std::transform(sliceTypes.begin(), sliceTypes.end(),
needed.begin(),
[&abc](typename Slice<F>::Type const type) {
auto tuple = Slice<F>::subtupleBySlice(abc, type);
return std::make_pair(type, tuple);
});
return needed;
}
/* buildLocalDatabase
*
* It should build a database of slices so that we know what is needed
* to fetch in the next iteration represented by the tuple 'abc'.
*
* 1. The algorithm works as follows, we build a database of the all
* the slice types that we need together with their tuple.
*
* 2. Look in the SliceUnion if we already have this tuple,
* if we already have it mark it (TODO)
*
* 3. If we don't have the tuple, look for a (state=acceptor, type=blank)
* slice and mark this slice as type=Fetch with the corresponding type
* and tuple.
*
* NOTE: The algorithm should certify that we always have enough blank
* slices.
*
*/
typename
Slice<F>::LocalDatabase buildLocalDatabase(ABCTuple const& abc) {
typename Slice<F>::LocalDatabase result;
auto const needed = neededSlices(abc);
WITH_RANK << "__db__:needed:" << pretty_print(needed) << "\n";
// BUILD THE DATABASE
// we need to loop over all sliceTypes that this TensorUnion
// is representing and find out how we will get the corresponding
// slice for the abc we are considering right now.
for (auto const& pair: needed) {
auto const type = pair.first;
auto const tuple = pair.second;
auto const from = rankMap.find(abc, type);
#ifdef HAVE_OCD
WITH_RANK << "__db__:want:" << pretty_print(pair) << "\n";
for (auto const& s: slices)
WITH_RANK << "__db__:guts:ocd "
<< s.info << " pt " << s.data
<< "\n";
#endif
#ifdef HAVE_OCD
WITH_RANK << "__db__: checking if exact match" << "\n";
#endif
{
// FIRST: look up if there is already a *Ready* slice matching what we
// need
auto const& it
= std::find_if(slices.begin(), slices.end(),
[&tuple, &type](Slice<F> const& other) {
return other.info.tuple == tuple
&& other.info.type == type
// we only want another slice when it
// has already ready-to-use data
&& other.isUnwrappable()
;
});
if (it != slices.end()) {
// if we find this slice, it means that we don't have to do anything
WITH_RANK << "__db__: EXACT: found EXACT in name=" << name
<< " for tuple " << tuple[0] << ", " << tuple[1]
<< " ptr " << it->data
<< "\n";
result.push_back({name, it->info});
continue;
}
}
#ifdef HAVE_OCD
WITH_RANK << "__db__: checking if recycle" << "\n";
#endif
// Try to find a recyling possibility ie. find a slice with the same
// tuple and that has a valid data pointer.
auto const& recycleIt
= std::find_if(slices.begin(), slices.end(),
[&tuple, &type](Slice<F> const& other) {
return other.info.tuple == tuple
&& other.info.type != type
&& other.isRecyclable()
;
});
// if we find this recylce, then we find a Blank slice
// (which should exist by construction :THINK)
//
if (recycleIt != slices.end()) {
auto& blank = Slice<F>::findOneByType(slices, Slice<F>::Blank);
// TODO: formalize this through a method to copy information
// from another slice
blank.data = recycleIt->data;
blank.info.type = type;
blank.info.tuple = tuple;
blank.info.state = Slice<F>::Recycled;
blank.info.from = from;
blank.info.recycling = recycleIt->info.type;
result.push_back({name, blank.info});
WITH_RANK << "__db__: RECYCLING: n" << name
<< " " << pretty_print(abc)
<< " get " << pretty_print(blank.info)
<< " from " << pretty_print(recycleIt->info)
<< " ptr " << recycleIt->data
<< "\n"
;
continue;
}
// in this case we have to create a new slice
// this means that we should have a blank slice at our disposal
// and also the freePointers should have some elements inside,
// so we pop a data pointer from the freePointers container
#ifdef HAVE_OCD
WITH_RANK << "__db__: none work, doing new" << "\n";
#endif
{
WITH_RANK << "__db__: NEW: finding blank in " << name
<< " for type " << type
<< " for tuple " << tuple[0] << ", " << tuple[1]
<< "\n"
;
auto& blank = Slice<F>::findOneByType(slices, Slice<F>::Blank);
blank.info.type = type;
blank.info.tuple = tuple;
blank.info.from = from;
// Handle self sufficiency
blank.info.state = Atrip::rank == from.rank
? Slice<F>::SelfSufficient
: Slice<F>::Fetch
;
if (blank.info.state == Slice<F>::SelfSufficient) {
blank.data = sources[from.source].data();
} else {
if (freePointers.size() == 0)
throw std::domain_error("No more free pointers!");
auto dataPointer = freePointers.begin();
freePointers.erase(dataPointer);
blank.data = *dataPointer;
}
result.push_back({name, blank.info});
continue;
}
}
#ifdef HAVE_OCD
for (auto const& s: slices)
WITH_RANK << "__db__:guts:ocd:__end__ " << s.info << "\n";
#endif
return result;
}
/*
* Garbage collect slices not needed for the next iteration.
*
* It will throw if it tries to gc a slice that has not been
* previously unwrapped, as a safety mechanism.
*/
void clearUnusedSlicesForNext(ABCTuple const& abc) {
auto const needed = neededSlices(abc);
// CLEAN UP SLICES, FREE THE ONES THAT ARE NOT NEEDED ANYMORE
for (auto& slice: slices) {
// if the slice is free, then it was not used anyways
if (slice.isFree()) continue;
// try to find the slice in the needed slices list
auto const found
= std::find_if(needed.begin(), needed.end(),
[&slice] (typename Slice<F>::Ty_x_Tu const& tytu) {
return slice.info.tuple == tytu.second
&& slice.info.type == tytu.first
;
});
// if we did not find slice in needed, then erase it
if (found == needed.end()) {
// We have to be careful about the data pointer,
// for SelfSufficient, the data pointer is a source pointer
// of the slice, so we should just wipe it.
//
// For Ready slices, we have to be careful if there are some
// recycled slices depending on it.
bool freeSlicePointer = true;
// allow to gc unwrapped and recycled, never Fetch,
// if we have a Fetch slice then something has gone very wrong.
if (!slice.isUnwrapped() && slice.info.state != Slice<F>::Recycled)
throw
std::domain_error("Trying to garbage collect "
" a non-unwrapped slice! "
+ pretty_print(&slice)
+ pretty_print(slice.info));
// it can be that our slice is ready, but it has some hanging
// references lying around in the form of a recycled slice.
// Of course if we need the recycled slice the next iteration
// this would be fatal, because we would then free the pointer
// of the slice and at some point in the future we would
// overwrite it. Therefore, we must check if slice has some
// references in slices and if so then
//
// - we should mark those references as the original (since the data
// pointer should be the same)
//
// - we should make sure that the data pointer of slice
// does not get freed.
//
if (slice.info.state == Slice<F>::Ready) {
WITH_OCD WITH_RANK
<< "__gc__:" << "checking for data recycled dependencies\n";
auto recycled
= Slice<F>::hasRecycledReferencingToIt(slices, slice.info);
if (recycled.size()) {
Slice<F>* newReady = recycled[0];
WITH_OCD WITH_RANK
<< "__gc__:" << "swaping recycled "
<< pretty_print(newReady->info)
<< " and "
<< pretty_print(slice.info)
<< "\n";
newReady->markReady();
assert(newReady->data == slice.data);
freeSlicePointer = false;
for (size_t i = 1; i < recycled.size(); i++) {
auto newRecyled = recycled[i];
newRecyled->info.recycling = newReady->info.type;
WITH_OCD WITH_RANK
<< "__gc__:" << "updating recycled "
<< pretty_print(newRecyled->info)
<< "\n";
}
}
}
// if the slice is self sufficient, do not dare touching the
// pointer, since it is a pointer to our sources in our rank.
if ( slice.info.state == Slice<F>::SelfSufficient
|| slice.info.state == Slice<F>::Recycled
) {
freeSlicePointer = false;
}
// make sure we get its data pointer to be used later
// only for non-recycled, since it can be that we need
// for next iteration the data of the slice that the recycled points
// to
if (freeSlicePointer) {
freePointers.insert(slice.data);
WITH_RANK << "~~~:cl(" << name << ")"
<< " added to freePointer "
<< pretty_print(freePointers)
<< "\n";
} else {
WITH_OCD WITH_RANK << "__gc__:not touching the free Pointer\n";
}
// at this point, let us blank the slice
WITH_RANK << "~~~:cl(" << name << ")"
<< " freeing up slice "
// TODO: make this possible
// << " info " << slice.info
<< "\n";
slice.free();
}
}
}
// CONSTRUCTOR
SliceUnion( Tensor const& sourceTensor
, std::vector<typename Slice<F>::Type> sliceTypes_
, std::vector<size_t> sliceLength_
, std::vector<size_t> paramLength
, size_t np
, MPI_Comm child_world
, MPI_Comm global_world
, typename Slice<F>::Name name_
, size_t nSliceBuffers = 4
)
: rankMap(paramLength, np)
, world(child_world)
, universe(global_world)
, sliceLength(sliceLength_)
, sources(rankMap.nSources(),
std::vector<F>
(std::accumulate(sliceLength.begin(),
sliceLength.end(),
1UL, std::multiplies<size_t>())))
, name(name_)
, sliceTypes(sliceTypes_)
, sliceBuffers(nSliceBuffers, sources[0])
//, slices(2 * sliceTypes.size(), Slice<F>{ sources[0].size() })
{ // constructor begin
LOG(0,"Atrip") << "INIT SliceUnion: " << name << "\n";
slices
= std::vector<Slice<F>>(2 * sliceTypes.size(), { sources[0].size() });
// TODO: think exactly ^------------------- about this number
// initialize the freePointers with the pointers to the buffers
std::transform(sliceBuffers.begin(), sliceBuffers.end(),
std::inserter(freePointers, freePointers.begin()),
[](std::vector<F> &vec) { return vec.data(); });
LOG(1,"Atrip") << "rankMap.nSources "
<< rankMap.nSources() << "\n";
LOG(1,"Atrip") << "#slices "
<< slices.size() << "\n";
LOG(1,"Atrip") << "#slices[0] "
<< slices[0].size << "\n";
LOG(1,"Atrip") << "#sources "
<< sources.size() << "\n";
LOG(1,"Atrip") << "#sources[0] "
<< sources[0].size() << "\n";
LOG(1,"Atrip") << "#freePointers "
<< freePointers.size() << "\n";
LOG(1,"Atrip") << "#sliceBuffers "
<< sliceBuffers.size() << "\n";
LOG(1,"Atrip") << "#sliceBuffers[0] "
<< sliceBuffers[0].size() << "\n";
LOG(1,"Atrip") << "#sliceLength "
<< sliceLength.size() << "\n";
LOG(1,"Atrip") << "#paramLength "
<< paramLength.size() << "\n";
LOG(1,"Atrip") << "GB*" << np << " "
<< double(sources.size() + sliceBuffers.size())
* sources[0].size()
* 8 * np
/ 1073741824.0
<< "\n";
} // constructor ends
void init(Tensor const& sourceTensor) {
CTF::World w(world);
const int rank = Atrip::rank
, order = sliceLength.size()
;
std::vector<int> const syms(order, NS);
std::vector<int> __sliceLength(sliceLength.begin(), sliceLength.end());
Tensor toSliceInto(order,
__sliceLength.data(),
syms.data(),
w);
LOG(1,"Atrip") << "slicing... \n";
// setUp sources
for (size_t it(0); it < rankMap.nSources(); ++it) {
const size_t
source = rankMap.isSourcePadding(rank, source) ? 0 : it;
WITH_OCD
WITH_RANK
<< "Init:toSliceInto it-" << it
<< " :: source " << source << "\n";
sliceIntoBuffer(source, toSliceInto, sourceTensor);
}
}
/**
* \brief Send asynchronously only if the state is Fetch
*/
void send( size_t otherRank
, typename Slice<F>::Info const& info
, size_t tag) const noexcept {
MPI_Request request;
bool sendData_p = false;
if (info.state == Slice<F>::Fetch) sendData_p = true;
// TODO: remove this because I have SelfSufficient
if (otherRank == info.from.rank) sendData_p = false;
if (!sendData_p) return;
MPI_Isend( sources[info.from.source].data()
, sources[info.from.source].size()
, traits::mpi::datatypeOf<F>()
, otherRank
, tag
, universe
, &request
);
WITH_CRAZY_DEBUG
WITH_RANK << "sent to " << otherRank << "\n";
}
/**
* \brief Receive asynchronously only if the state is Fetch
*/
void receive(typename Slice<F>::Info const& info, size_t tag) noexcept {
auto& slice = Slice<F>::findByInfo(slices, info);
if (Atrip::rank == info.from.rank) return;
if (slice.info.state == Slice<F>::Fetch) {
// TODO: do it through the slice class
slice.info.state = Slice<F>::Dispatched;
MPI_Request request;
slice.request = request;
MPI_Irecv( slice.data
, slice.size
, traits::mpi::datatypeOf<F>()
, info.from.rank
, tag
, universe
, &slice.request
//, MPI_STATUS_IGNORE
);
}
}
void unwrapAll(ABCTuple const& abc) {
for (auto type: sliceTypes) unwrapSlice(type, abc);
}
F* unwrapSlice(typename Slice<F>::Type type, ABCTuple const& abc) {
WITH_CRAZY_DEBUG
WITH_RANK << "__unwrap__:slice " << type << " w n "
<< name
<< " abc" << pretty_print(abc)
<< "\n";
auto& slice = Slice<F>::findByTypeAbc(slices, type, abc);
//WITH_RANK << "__unwrap__:info " << slice.info << "\n";
switch (slice.info.state) {
case Slice<F>::Dispatched:
WITH_RANK << "__unwrap__:Fetch: " << &slice
<< " info " << pretty_print(slice.info)
<< "\n";
slice.unwrapAndMarkReady();
return slice.data;
break;
case Slice<F>::SelfSufficient:
WITH_RANK << "__unwrap__:SelfSufficient: " << &slice
<< " info " << pretty_print(slice.info)
<< "\n";
return slice.data;
break;
case Slice<F>::Ready:
WITH_RANK << "__unwrap__:READY: UNWRAPPED ALREADY" << &slice
<< " info " << pretty_print(slice.info)
<< "\n";
return slice.data;
break;
case Slice<F>::Recycled:
WITH_RANK << "__unwrap__:RECYCLED " << &slice
<< " info " << pretty_print(slice.info)
<< "\n";
return unwrapSlice(slice.info.recycling, abc);
break;
case Slice<F>::Fetch:
case Slice<F>::Acceptor:
throw std::domain_error("Can't unwrap an acceptor or fetch slice!");
break;
default:
throw std::domain_error("Unknown error unwrapping slice!");
}
return slice.data;
}
const RankMap<F> rankMap;
const MPI_Comm world;
const MPI_Comm universe;
const std::vector<size_t> sliceLength;
std::vector< std::vector<F> > sources;
std::vector< Slice<F> > slices;
typename Slice<F>::Name name;
const std::vector<typename Slice<F>::Type> sliceTypes;
std::vector< std::vector<F> > sliceBuffers;
std::set<F*> freePointers;
};
template <typename F=double>
SliceUnion<F>&
unionByName(std::vector<SliceUnion<F>*> const& unions,
typename Slice<F>::Name name) {
const auto sliceUnionIt
= std::find_if(unions.begin(), unions.end(),
[&name](SliceUnion<F> const* s) {
return name == s->name;
});
if (sliceUnionIt == unions.end())
throw std::domain_error("SliceUnion not found!");
return **sliceUnionIt;
}
}
#+end_src
** Tuples
#+begin_src c++ :tangle (atrip-tuples-h)
#pragma once
#include <vector>
#include <array>
#include <numeric>
#include <atrip/Utils.hpp>
#include <atrip/Debug.hpp>
namespace atrip {
using ABCTuple = std::array<size_t, 3>;
using PartialTuple = std::array<size_t, 2>;
using ABCTuples = std::vector<ABCTuple>;
ABCTuples getTuplesList(size_t Nv) {
const size_t n = Nv * (Nv + 1) * (Nv + 2) / 6 - Nv;
ABCTuples result(n);
size_t u(0);
for (size_t a(0); a < Nv; a++)
for (size_t b(a); b < Nv; b++)
for (size_t c(b); c < Nv; c++){
if ( a == b && b == c ) continue;
result[u++] = {a, b, c};
}
return result;
}
std::pair<size_t, size_t>
getABCRange(size_t np, size_t rank, ABCTuples const& tuplesList) {
std::vector<size_t> n_tuples_per_rank(np, tuplesList.size()/np);
const size_t
// how many valid tuples should we still verteilen to nodes
// since the number of tuples is not divisible by the number of nodes
nRoundRobin = tuplesList.size() % np
// every node must have the sanme amount of tuples in order for the
// other nodes to receive and send somewhere, therefore
// some nodes will get extra tuples but that are dummy tuples
, nExtraInvalid = (np - nRoundRobin) % np
;
if (nRoundRobin) for (int i = 0; i < np; i++) n_tuples_per_rank[i]++;
#if defined(TODO)
assert( tuplesList.size()
==
( std::accumulate(n_tuples_per_rank.begin(),
n_tuples_per_rank.end(),
0UL,
std::plus<size_t>())
+ nExtraInvalid
));
#endif
WITH_RANK << "nRoundRobin = " << nRoundRobin << "\n";
WITH_RANK << "nExtraInvalid = " << nExtraInvalid << "\n";
WITH_RANK << "ntuples = " << n_tuples_per_rank[rank] << "\n";
auto const& it = n_tuples_per_rank.begin();
return
{ std::accumulate(it, it + rank , 0)
, std::accumulate(it, it + rank + 1, 0)
};
}
}
#+end_src
** Unions
Since every tensor slice in a different way, we can override the slicing procedure
and define subclasses of slice unions.
#+begin_src c++ :tangle (atrip-unions-h)
#pragma once
#include <atrip/SliceUnion.hpp>
namespace atrip {
template <typename F=double>
void sliceIntoVector
( std::vector<F> &v
, CTF::Tensor<F> &toSlice
, std::vector<int64_t> const low
, std::vector<int64_t> const up
, CTF::Tensor<F> const& origin
, std::vector<int64_t> const originLow
, std::vector<int64_t> const originUp
) {
// Thank you CTF for forcing me to do this
struct { std::vector<int> up, low; }
toSlice_ = { {up.begin(), up.end()}
, {low.begin(), low.end()} }
, origin_ = { {originUp.begin(), originUp.end()}
, {originLow.begin(), originLow.end()} }
;
WITH_OCD
WITH_RANK << "slicing into " << pretty_print(toSlice_.up)
<< "," << pretty_print(toSlice_.low)
<< " from " << pretty_print(origin_.up)
<< "," << pretty_print(origin_.low)
<< "\n";
#ifndef ATRIP_DONT_SLICE
toSlice.slice( toSlice_.low.data()
, toSlice_.up.data()
, 0.0
, origin
, origin_.low.data()
, origin_.up.data()
, 1.0);
memcpy(v.data(), toSlice.data, sizeof(F) * v.size());
#endif
}
template <typename F=double>
struct TAPHH : public SliceUnion<F> {
TAPHH( CTF::Tensor<F> const& sourceTensor
, size_t No
, size_t Nv
, size_t np
, MPI_Comm child_world
, MPI_Comm global_world
) : SliceUnion<F>( sourceTensor
, {Slice<F>::A, Slice<F>::B, Slice<F>::C}
, {Nv, No, No} // size of the slices
, {Nv}
, np
, child_world
, global_world
, Slice<F>::TA
, 4) {
init(sourceTensor);
}
void sliceIntoBuffer(size_t it, CTF::Tensor<F> &to, CTF::Tensor<F> const& from) override
{
const int Nv = this->sliceLength[0]
, No = this->sliceLength[1]
, a = this->rankMap.find({static_cast<size_t>(Atrip::rank), it});
;
sliceIntoVector<F>( this->sources[it]
, to, {0, 0, 0}, {Nv, No, No}
, from, {a, 0, 0, 0}, {a+1, Nv, No, No}
);
}
};
template <typename F=double>
struct HHHA : public SliceUnion<F> {
HHHA( CTF::Tensor<F> const& sourceTensor
, size_t No
, size_t Nv
, size_t np
, MPI_Comm child_world
, MPI_Comm global_world
) : SliceUnion<F>( sourceTensor
, {Slice<F>::A, Slice<F>::B, Slice<F>::C}
, {No, No, No} // size of the slices
, {Nv} // size of the parametrization
, np
, child_world
, global_world
, Slice<F>::VIJKA
, 4) {
init(sourceTensor);
}
void sliceIntoBuffer(size_t it, CTF::Tensor<F> &to, CTF::Tensor<F> const& from) override
{
const int No = this->sliceLength[0]
, a = this->rankMap.find({static_cast<size_t>(Atrip::rank), it})
;
sliceIntoVector<F>( this->sources[it]
, to, {0, 0, 0}, {No, No, No}
, from, {0, 0, 0, a}, {No, No, No, a+1}
);
}
};
template <typename F=double>
struct ABPH : public SliceUnion<F> {
ABPH( CTF::Tensor<F> const& sourceTensor
, size_t No
, size_t Nv
, size_t np
, MPI_Comm child_world
, MPI_Comm global_world
) : SliceUnion<F>( sourceTensor
, { Slice<F>::AB, Slice<F>::BC, Slice<F>::AC
, Slice<F>::BA, Slice<F>::CB, Slice<F>::CA
}
, {Nv, No} // size of the slices
, {Nv, Nv} // size of the parametrization
, np
, child_world
, global_world
, Slice<F>::VABCI
, 2*6) {
init(sourceTensor);
}
void sliceIntoBuffer(size_t it, CTF::Tensor<F> &to, CTF::Tensor<F> const& from) override {
const int Nv = this->sliceLength[0]
, No = this->sliceLength[1]
, el = this->rankMap.find({static_cast<size_t>(Atrip::rank), it})
, a = el % Nv
, b = el / Nv
;
sliceIntoVector<F>( this->sources[it]
, to, {0, 0}, {Nv, No}
, from, {a, b, 0, 0}, {a+1, b+1, Nv, No}
);
}
};
template <typename F=double>
struct ABHH : public SliceUnion<F> {
ABHH( CTF::Tensor<F> const& sourceTensor
, size_t No
, size_t Nv
, size_t np
, MPI_Comm child_world
, MPI_Comm global_world
) : SliceUnion<F>( sourceTensor
, {Slice<F>::AB, Slice<F>::BC, Slice<F>::AC}
, {No, No} // size of the slices
, {Nv, Nv} // size of the parametrization
, np
, child_world
, global_world
, Slice<F>::VABIJ
, 6) {
init(sourceTensor);
}
void sliceIntoBuffer(size_t it, CTF::Tensor<F> &to, CTF::Tensor<F> const& from) override {
const int Nv = from.lens[0]
, No = this->sliceLength[1]
, el = this->rankMap.find({static_cast<size_t>(Atrip::rank), it})
, a = el % Nv
, b = el / Nv
;
sliceIntoVector<F>( this->sources[it]
, to, {0, 0}, {No, No}
, from, {a, b, 0, 0}, {a+1, b+1, No, No}
);
}
};
template <typename F=double>
struct TABHH : public SliceUnion<F> {
TABHH( CTF::Tensor<F> const& sourceTensor
, size_t No
, size_t Nv
, size_t np
, MPI_Comm child_world
, MPI_Comm global_world
) : SliceUnion<F>( sourceTensor
, {Slice<F>::AB, Slice<F>::BC, Slice<F>::AC}
, {No, No} // size of the slices
, {Nv, Nv} // size of the parametrization
, np
, child_world
, global_world
, Slice<F>::TABIJ
, 6) {
init(sourceTensor);
}
void sliceIntoBuffer(size_t it, CTF::Tensor<F> &to, CTF::Tensor<F> const& from) override {
// TODO: maybe generalize this with ABHH
const int Nv = from.lens[0]
, No = this->sliceLength[1]
, el = this->rankMap.find({static_cast<size_t>(Atrip::rank), it})
, a = el % Nv
, b = el / Nv
;
sliceIntoVector<F>( this->sources[it]
, to, {0, 0}, {No, No}
, from, {a, b, 0, 0}, {a+1, b+1, No, No}
);
}
};
}
#+end_src
** Equations
#+begin_src c++ :tangle (atrip-equations-h)
#pragma once
#include<atrip/Slice.hpp>
#include<atrip/Blas.hpp>
namespace atrip {
template <typename F=double>
double getEnergyDistinct
( const F epsabc
, std::vector<F> const& epsi
, std::vector<F> const& Tijk_
, std::vector<F> const& Zijk_
) {
constexpr size_t blockSize=16;
F energy(0.);
const size_t No = epsi.size();
for (size_t kk=0; kk<No; kk+=blockSize){
const size_t kend( std::min(No, kk+blockSize) );
for (size_t jj(kk); jj<No; jj+=blockSize){
const size_t jend( std::min( No, jj+blockSize) );
for (size_t ii(jj); ii<No; ii+=blockSize){
const size_t iend( std::min( No, ii+blockSize) );
for (size_t k(kk); k < kend; k++){
const F ek(epsi[k]);
const size_t jstart = jj > k ? jj : k;
for (size_t j(jstart); j < jend; j++){
F const ej(epsi[j]);
F const facjk = j == k ? F(0.5) : F(1.0);
size_t istart = ii > j ? ii : j;
for (size_t i(istart); i < iend; i++){
const F
ei(epsi[i])
, facij = i == j ? F(0.5) : F(1.0)
, denominator(epsabc - ei - ej - ek)
, U(Zijk_[i + No*j + No*No*k])
, V(Zijk_[i + No*k + No*No*j])
, W(Zijk_[j + No*i + No*No*k])
, X(Zijk_[j + No*k + No*No*i])
, Y(Zijk_[k + No*i + No*No*j])
, Z(Zijk_[k + No*j + No*No*i])
, A(maybeConjugate<F>(Tijk_[i + No*j + No*No*k]))
, B(maybeConjugate<F>(Tijk_[i + No*k + No*No*j]))
, C(maybeConjugate<F>(Tijk_[j + No*i + No*No*k]))
, D(maybeConjugate<F>(Tijk_[j + No*k + No*No*i]))
, E(maybeConjugate<F>(Tijk_[k + No*i + No*No*j]))
, F(maybeConjugate<F>(Tijk_[k + No*j + No*No*i]))
, value
= 3.0 * ( A * U
+ B * V
+ C * W
+ D * X
+ E * Y
+ F * Z )
+ ( ( U + X + Y )
- 2.0 * ( V + W + Z )
) * ( A + D + E )
+ ( ( V + W + Z )
- 2.0 * ( U + X + Y )
) * ( B + C + F )
;
energy += 2.0 * value / denominator * facjk * facij;
} // i
} // j
} // k
} // ii
} // jj
} // kk
return std::real(energy);
}
template <typename F=double>
double getEnergySame
( const F epsabc
, std::vector<F> const& epsi
, std::vector<F> const& Tijk_
, std::vector<F> const& Zijk_
) {
constexpr size_t blockSize = 16;
const size_t No = epsi.size();
F energy = F(0.);
for (size_t kk=0; kk<No; kk+=blockSize){
const size_t kend( std::min( kk+blockSize, No) );
for (size_t jj(kk); jj<No; jj+=blockSize){
const size_t jend( std::min( jj+blockSize, No) );
for (size_t ii(jj); ii<No; ii+=blockSize){
const size_t iend( std::min( ii+blockSize, No) );
for (size_t k(kk); k < kend; k++){
const F ek(epsi[k]);
const size_t jstart = jj > k ? jj : k;
for(size_t j(jstart); j < jend; j++){
const F facjk( j == k ? F(0.5) : F(1.0));
const F ej(epsi[j]);
const size_t istart = ii > j ? ii : j;
for(size_t i(istart); i < iend; i++){
const F
ei(epsi[i])
, facij ( i==j ? F(0.5) : F(1.0))
, denominator(epsabc - ei - ej - ek)
, U(Zijk_[i + No*j + No*No*k])
, V(Zijk_[j + No*k + No*No*i])
, W(Zijk_[k + No*i + No*No*j])
, A(maybeConjugate<F>(Tijk_[i + No*j + No*No*k]))
, B(maybeConjugate<F>(Tijk_[j + No*k + No*No*i]))
, C(maybeConjugate<F>(Tijk_[k + No*i + No*No*j]))
, value
= F(3.0) * ( A * U
+ B * V
+ C * W
)
- ( A + B + C ) * ( U + V + W )
;
energy += F(2.0) * value / denominator * facjk * facij;
} // i
} // j
} // k
} // ii
} // jj
} // kk
return std::real(energy);
}
template <typename F=double>
void singlesContribution
( size_t No
, size_t Nv
, const ABCTuple &abc
, F const* Tph
, F const* VABij
, F const* VACij
, F const* VBCij
, F *Zijk
) {
const size_t a(abc[0]), b(abc[1]), c(abc[2]);
for (size_t k=0; k < No; k++)
for (size_t i=0; i < No; i++)
for (size_t j=0; j < No; j++) {
const size_t ijk = i + j*No + k*No*No
, jk = j + No * k
;
Zijk[ijk] += Tph[ a + i * Nv ] * VBCij[ j + k * No ];
Zijk[ijk] += Tph[ b + j * Nv ] * VACij[ i + k * No ];
Zijk[ijk] += Tph[ c + k * Nv ] * VABij[ i + j * No ];
}
}
template <typename F=double>
void doublesContribution
( const ABCTuple &abc
, size_t const No
, size_t const Nv
// -- VABCI
, F const* VABph
, F const* VACph
, F const* VBCph
, F const* VBAph
, F const* VCAph
, F const* VCBph
// -- VHHHA
, F const* VhhhA
, F const* VhhhB
, F const* VhhhC
// -- TA
, F const* TAphh
, F const* TBphh
, F const* TCphh
// -- TABIJ
, F const* TABhh
, F const* TAChh
, F const* TBChh
// -- TIJK
, F *Tijk
, atrip::Timings& chrono
) {
auto& t_reorder = chrono["doubles:reorder"];
const size_t a = abc[0], b = abc[1], c = abc[2]
, NoNo = No*No, NoNv = No*Nv
;
#if defined(ATRIP_USE_DGEMM)
#define _IJK_(i, j, k) i + j*No + k*NoNo
#define REORDER(__II, __JJ, __KK) \
t_reorder.start(); \
for (size_t k = 0; k < No; k++) \
for (size_t j = 0; j < No; j++) \
for (size_t i = 0; i < No; i++) { \
Tijk[_IJK_(i, j, k)] += _t_buffer[_IJK_(__II, __JJ, __KK)]; \
} \
t_reorder.stop();
#define DGEMM_PARTICLES(__A, __B) \
atrip::xgemm<F>( "T" \
, "N" \
, (int const*)&NoNo \
, (int const*)&No \
, (int const*)&Nv \
, &one \
, __A \
, (int const*)&Nv \
, __B \
, (int const*)&Nv \
, &zero \
, _t_buffer.data() \
, (int const*)&NoNo \
);
#define DGEMM_HOLES(__A, __B, __TRANSB) \
atrip::xgemm<F>( "N" \
, __TRANSB \
, (int const*)&NoNo \
, (int const*)&No \
, (int const*)&No \
, &m_one \
, __A \
, (int const*)&NoNo \
, __B \
, (int const*)&No \
, &zero \
, _t_buffer.data() \
, (int const*)&NoNo \
);
#define MAYBE_CONJ(_conj, _buffer) \
for (size_t __i = 0; __i < NoNoNo; ++__i) \
_conj[__i] = maybeConjugate<F>(_buffer[__i]); \
const size_t NoNoNo = No*NoNo;
std::vector<F> _t_buffer;
_t_buffer.reserve(NoNoNo);
F one{1.0}, m_one{-1.0}, zero{0.0};
t_reorder.start();
for (size_t k = 0; k < NoNoNo; k++) {
// zero the Tijk
Tijk[k] = 0.0;
}
t_reorder.stop();
chrono["doubles:holes"].start();
{ // Holes part ============================================================
std::vector<F> _vhhh(NoNoNo);
// VhhhC[i + k*No + L*NoNo] * TABhh[L + j*No]; H1
MAYBE_CONJ(_vhhh, VhhhC)
chrono["doubles:holes:1"].start();
DGEMM_HOLES(_vhhh.data(), TABhh, "N")
REORDER(i, k, j)
chrono["doubles:holes:1"].stop();
// VhhhC[j + k*No + L*NoNo] * TABhh[i + L*No]; H0
chrono["doubles:holes:2"].start();
DGEMM_HOLES(_vhhh.data(), TABhh, "T")
REORDER(j, k, i)
chrono["doubles:holes:2"].stop();
// VhhhB[i + j*No + L*NoNo] * TAChh[L + k*No]; H5
MAYBE_CONJ(_vhhh, VhhhB)
chrono["doubles:holes:3"].start();
DGEMM_HOLES(_vhhh.data(), TAChh, "N")
REORDER(i, j, k)
chrono["doubles:holes:3"].stop();
// VhhhB[k + j*No + L*NoNo] * TAChh[i + L*No]; H3
chrono["doubles:holes:4"].start();
DGEMM_HOLES(_vhhh.data(), TAChh, "T")
REORDER(k, j, i)
chrono["doubles:holes:4"].stop();
// VhhhA[j + i*No + L*NoNo] * TBChh[L + k*No]; H1
MAYBE_CONJ(_vhhh, VhhhA)
chrono["doubles:holes:5"].start();
DGEMM_HOLES(_vhhh.data(), TBChh, "N")
REORDER(j, i, k)
chrono["doubles:holes:5"].stop();
// VhhhA[k + i*No + L*NoNo] * TBChh[j + L*No]; H4
chrono["doubles:holes:6"].start();
DGEMM_HOLES(_vhhh.data(), TBChh, "T")
REORDER(k, i, j)
chrono["doubles:holes:6"].stop();
}
chrono["doubles:holes"].stop();
#undef MAYBE_CONJ
chrono["doubles:particles"].start();
{ // Particle part =========================================================
// TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv]; P0
chrono["doubles:particles:1"].start();
DGEMM_PARTICLES(TAphh, VBCph)
REORDER(i, j, k)
chrono["doubles:particles:1"].stop();
// TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv]; P3
chrono["doubles:particles:2"].start();
DGEMM_PARTICLES(TAphh, VCBph)
REORDER(i, k, j)
chrono["doubles:particles:2"].stop();
// TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv]; P5
chrono["doubles:particles:3"].start();
DGEMM_PARTICLES(TCphh, VABph)
REORDER(k, i, j)
chrono["doubles:particles:3"].stop();
// TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv]; P2
chrono["doubles:particles:4"].start();
DGEMM_PARTICLES(TCphh, VBAph)
REORDER(k, j, i)
chrono["doubles:particles:4"].stop();
// TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv]; P1
chrono["doubles:particles:5"].start();
DGEMM_PARTICLES(TBphh, VACph)
REORDER(j, i, k)
chrono["doubles:particles:5"].stop();
// TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv]; P4
chrono["doubles:particles:6"].start();
DGEMM_PARTICLES(TBphh, VCAph)
REORDER(j, k, i)
chrono["doubles:particles:6"].stop();
}
chrono["doubles:particles"].stop();
#undef REORDER
#undef DGEMM_HOLES
#undef DGEMM_PARTICLES
#undef _IJK_
#else
for (size_t k = 0; k < No; k++)
for (size_t j = 0; j < No; j++)
for (size_t i = 0; i < No; i++){
const size_t ijk = i + j*No + k*NoNo
, jk = j + k*No
;
Tijk[ijk] = 0.0; // :important
// HOLE DIAGRAMS: TABHH and VHHHA
for (size_t L = 0; L < No; L++){
// t[abLj] * V[Lcik] H1
// t[baLi] * V[Lcjk] H0 TODO: conjugate T for complex
Tijk[ijk] -= TABhh[L + j*No] * VhhhC[i + k*No + L*NoNo];
Tijk[ijk] -= TABhh[i + L*No] * VhhhC[j + k*No + L*NoNo];
// t[acLk] * V[Lbij] H5
// t[caLi] * V[Lbkj] H3
Tijk[ijk] -= TAChh[L + k*No] * VhhhB[i + j*No + L*NoNo];
Tijk[ijk] -= TAChh[i + L*No] * VhhhB[k + j*No + L*NoNo];
// t[bcLk] * V[Laji] H2
// t[cbLj] * V[Laki] H4
Tijk[ijk] -= TBChh[L + k*No] * VhhhA[j + i*No + L*NoNo];
Tijk[ijk] -= TBChh[j + L*No] * VhhhA[k + i*No + L*NoNo];
}
// PARTILCE DIAGRAMS: TAPHH and VABPH
for (size_t E = 0; E < Nv; E++) {
// t[aEij] * V[bcEk] P0
// t[aEik] * V[cbEj] P3 // TODO: CHECK THIS ONE, I DONT KNOW
Tijk[ijk] += TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv];
Tijk[ijk] += TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv];
// t[cEki] * V[abEj] P5
// t[cEkj] * V[baEi] P2
Tijk[ijk] += TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv];
Tijk[ijk] += TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv];
// t[bEji] * V[acEk] P1
// t[bEjk] * V[caEi] P4
Tijk[ijk] += TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv];
Tijk[ijk] += TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv];
}
}
#endif
}
}
#+end_src
** Blas
The main matrix-matrix multiplication method used in this algorithm
is mainly using the =DGEMM= function, which we declare as
=extern= since it should be known only at link-time.
#+begin_src c++ :tangle (atrip-blas-h)
#pragma once
namespace atrip {
using Complex = std::complex<double>;
extern "C" {
void dgemm_(
const char *transa,
const char *transb,
const int *m,
const int *n,
const int *k,
double *alpha,
const double *a,
const int *lda,
const double *b,
const int *ldb,
double *beta,
double *c,
const int *ldc
);
void zgemm_(
const char *transa,
const char *transb,
const int *m,
const int *n,
const int *k,
Complex *alpha,
const Complex *A,
const int *lda,
const Complex *B,
const int *ldb,
Complex *beta,
Complex *C,
const int *ldc
);
}
template <typename F=double>
void xgemm(const char *transa,
const char *transb,
const int *m,
const int *n,
const int *k,
F *alpha,
const F *A,
const int *lda,
const F *B,
const int *ldb,
F *beta,
F *C,
const int *ldc) {
dgemm_(transa, transb,
m, n, k,
alpha, A, lda,
B, ldb, beta,
C, ldc);
}
template <>
void xgemm(const char *transa,
const char *transb,
const int *m,
const int *n,
const int *k,
Complex *alpha,
const Complex *A,
const int *lda,
const Complex *B,
const int *ldb,
Complex *beta,
Complex *C,
const int *ldc) {
zgemm_(transa, transb,
m, n, k,
alpha, A, lda,
B, ldb, beta,
C, ldc);
}
}
#+end_src
** Atrip
*** Header
#+begin_src c++ :tangle (atrip-atrip-h)
#pragma once
#include <sstream>
#include <string>
#include <map>
#include <chrono>
#include <ctf.hpp>
namespace atrip {
struct Atrip {
static int rank;
static int np;
static void init();
template <typename F=double>
struct Input {
CTF::Tensor<F> *ei = nullptr
, *ea = nullptr
, *Tph = nullptr
, *Tpphh = nullptr
, *Vpphh = nullptr
, *Vhhhp = nullptr
, *Vppph = nullptr
;
int maxIterations = 0, iterationMod = -1, percentageMod = -1;
bool barrier = false;
bool chrono = false;
Input& with_epsilon_i(CTF::Tensor<F> * t) { ei = t; return *this; }
Input& with_epsilon_a(CTF::Tensor<F> * t) { ea = t; return *this; }
Input& with_Tai(CTF::Tensor<F> * t) { Tph = t; return *this; }
Input& with_Tabij(CTF::Tensor<F> * t) { Tpphh = t; return *this; }
Input& with_Vabij(CTF::Tensor<F> * t) { Vpphh = t; return *this; }
Input& with_Vijka(CTF::Tensor<F> * t) { Vhhhp = t; return *this; }
Input& with_Vabci(CTF::Tensor<F> * t) { Vppph = t; return *this; }
Input& with_maxIterations(int i) { maxIterations = i; return *this; }
Input& with_iterationMod(int i) { iterationMod = i; return *this; }
Input& with_percentageMod(int i) { percentageMod = i; return *this; }
Input& with_barrier(bool i) { barrier = i; return *this; }
Input& with_chrono(bool i) { chrono = i; return *this; }
};
struct Output {
double energy;
};
template <typename F=double>
static Output run(Input<F> const& in);
};
}
#+end_src
*** Main
#+begin_src c++ :tangle (atrip-atrip-cxx)
#include <iomanip>
#include <atrip/Atrip.hpp>
#include <atrip/Utils.hpp>
#include <atrip/Equations.hpp>
#include <atrip/SliceUnion.hpp>
#include <atrip/Unions.hpp>
using namespace atrip;
int Atrip::rank;
int Atrip::np;
// user printing block
IterationDescriptor IterationDescription::descriptor;
void atrip::registerIterationDescriptor(IterationDescriptor d) {
IterationDescription::descriptor = d;
}
void Atrip::init() {
MPI_Comm_rank(MPI_COMM_WORLD, &Atrip::rank);
MPI_Comm_size(MPI_COMM_WORLD, &Atrip::np);
}
template <typename F>
Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
const int np = Atrip::np;
const int rank = Atrip::rank;
MPI_Comm universe = in.ei->wrld->comm;
// Timings in seconds ================================================{{{1
Timings chrono{};
const size_t No = in.ei->lens[0];
const size_t Nv = in.ea->lens[0];
LOG(0,"Atrip") << "No: " << No << "\n";
LOG(0,"Atrip") << "Nv: " << Nv << "\n";
// allocate the three scratches, see piecuch
std::vector<F> Tijk(No*No*No) // doubles only (see piecuch)
, Zijk(No*No*No) // singles + doubles (see piecuch)
// we need local copies of the following tensors on every
// rank
, epsi(No)
, epsa(Nv)
, Tai(No * Nv)
;
in.ei->read_all(epsi.data());
in.ea->read_all(epsa.data());
in.Tph->read_all(Tai.data());
// COMMUNICATOR CONSTRUCTION ========================================={{{1
//
// Construct a new communicator living only on a single rank
int child_size = 1
, child_rank
;
const
int color = rank / child_size
, crank = rank % child_size
;
MPI_Comm child_comm;
if (np == 1) {
child_comm = universe;
} else {
MPI_Comm_split(universe, color, crank, &child_comm);
MPI_Comm_rank(child_comm, &child_rank);
MPI_Comm_size(child_comm, &child_size);
}
chrono["nv-slices"].start();
// BUILD SLICES PARAMETRIZED BY NV ==================================={{{1
LOG(0,"Atrip") << "BUILD NV-SLICES\n";
TAPHH<F> taphh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
HHHA<F> hhha(*in.Vhhhp, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
chrono["nv-slices"].stop();
chrono["nv-nv-slices"].start();
// BUILD SLICES PARAMETRIZED BY NV x NV =============================={{{1
LOG(0,"Atrip") << "BUILD NV x NV-SLICES\n";
ABPH<F> abph(*in.Vppph, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
ABHH<F> abhh(*in.Vpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
TABHH<F> tabhh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
chrono["nv-nv-slices"].stop();
// all tensors
std::vector< SliceUnion<F>* > unions = {&taphh, &hhha, &abph, &abhh, &tabhh};
//CONSTRUCT TUPLE LIST ==============================================={{{1
LOG(0,"Atrip") << "BUILD TUPLE LIST\n";
const auto tuplesList = std::move(getTuplesList(Nv));
WITH_RANK << "tupList.size() = " << tuplesList.size() << "\n";
// GET ABC INDEX RANGE FOR RANK ======================================{{{1
auto abcIndex = getABCRange(np, rank, tuplesList);
size_t nIterations = abcIndex.second - abcIndex.first;
WITH_RANK << "abcIndex = " << pretty_print(abcIndex) << "\n";
LOG(0,"Atrip") << "#iterations: " << nIterations << "\n";
// first abc
const ABCTuple firstAbc = tuplesList[abcIndex.first];
double energy(0.);
const size_t
iterationMod = (in.percentageMod > 0)
? nIterations * in.percentageMod / 100
: in.iterationMod
, iteration1Percent = nIterations * 0.01
;
auto const isFakeTuple
= [&tuplesList](size_t const i) { return i >= tuplesList.size(); };
using Database = typename Slice<F>::Database;
using LocalDatabase = typename Slice<F>::LocalDatabase;
auto communicateDatabase
= [ &unions
, np
, &chrono
] (ABCTuple const& abc, MPI_Comm const& c) -> Database {
chrono["db:comm:type:do"].start();
auto MPI_LDB_ELEMENT = Slice<F>::mpi::localDatabaseElement();
chrono["db:comm:type:do"].stop();
chrono["db:comm:ldb"].start();
LocalDatabase ldb;
for (auto const& tensor: unions) {
auto const& tensorDb = tensor->buildLocalDatabase(abc);
ldb.insert(ldb.end(), tensorDb.begin(), tensorDb.end());
}
chrono["db:comm:ldb"].stop();
Database db(np * ldb.size(), ldb[0]);
chrono["oneshot-db:comm:allgather"].start();
chrono["db:comm:allgather"].start();
MPI_Allgather( ldb.data()
, ldb.size()
, MPI_LDB_ELEMENT
, db.data()
, ldb.size()
, MPI_LDB_ELEMENT
, c);
chrono["db:comm:allgather"].stop();
chrono["oneshot-db:comm:allgather"].stop();
chrono["db:comm:type:free"].start();
MPI_Type_free(&MPI_LDB_ELEMENT);
chrono["db:comm:type:free"].stop();
return db;
};
auto doIOPhase
= [&unions, &rank, &np, &universe, &chrono] (Database const& db) {
const size_t localDBLength = db.size() / np;
size_t sendTag = 0
, recvTag = rank * localDBLength
;
// RECIEVE PHASE ======================================================
{
// At this point, we have already send to everyone that fits
auto const& begin = &db[rank * localDBLength]
, end = begin + localDBLength
;
for (auto it = begin; it != end; ++it) {
recvTag++;
auto const& el = *it;
auto& u = unionByName(unions, el.name);
WITH_DBG std::cout
<< rank << ":r"
<< "" << recvTag << " =>"
<< " «n" << el.name
<< ", t" << el.info.type
<< ", s" << el.info.state
<< "»"
<< " ⊙ {" << rank << "" << el.info.from.rank
<< ", "
<< el.info.from.source << "}"
<< " ∴ {" << el.info.tuple[0]
<< ", "
<< el.info.tuple[1]
<< "}"
<< "\n"
;
chrono["db:io:recv"].start();
u.receive(el.info, recvTag);
chrono["db:io:recv"].stop();
} // recv
}
// SEND PHASE =========================================================
for (size_t otherRank = 0; otherRank<np; otherRank++) {
auto const& begin = &db[otherRank * localDBLength]
, end = begin + localDBLength
;
for (auto it = begin; it != end; ++it) {
sendTag++;
typename Slice<F>::LocalDatabaseElement const& el = *it;
if (el.info.from.rank != rank) continue;
auto& u = unionByName(unions, el.name);
WITH_DBG std::cout
<< rank << ":s"
<< "" << sendTag << " =>"
<< " «n" << el.name
<< ", t" << el.info.type
<< ", s" << el.info.state
<< "»"
<< " ⊙ {" << el.info.from.rank << "" << otherRank
<< ", "
<< el.info.from.source << "}"
<< " ∴ {" << el.info.tuple[0]
<< ", "
<< el.info.tuple[1]
<< "}"
<< "\n"
;
chrono["db:io:send"].start();
u.send(otherRank, el.info, sendTag);
chrono["db:io:send"].stop();
} // send phase
} // otherRank
};
#if defined(HAVE_OCD) || defined(ATRIP_PRINT_TUPLES)
std::map<ABCTuple, double> tupleEnergies;
#endif
const double doublesFlops
= double(No)
* double(No)
* double(No)
* (double(No) + double(Nv))
* 2.0
* (traits::isComplex<F>() ? 2.0 : 1.0)
* 6.0
/ 1e9
;
// START MAIN LOOP ======================================================{{{1
for ( size_t i = abcIndex.first, iteration = 1
; i < abcIndex.second
; i++, iteration++
) {
chrono["iterations"].start();
// check overhead from chrono over all iterations
chrono["start:stop"].start(); chrono["start:stop"].stop();
// check overhead of doing a barrier at the beginning
chrono["oneshot-mpi:barrier"].start();
chrono["mpi:barrier"].start();
// TODO: REMOVE
if (in.barrier == 1)
MPI_Barrier(universe);
chrono["mpi:barrier"].stop();
chrono["oneshot-mpi:barrier"].stop();
if (iteration % iterationMod == 0 || iteration == iteration1Percent) {
if (IterationDescription::descriptor) {
IterationDescription::descriptor({
iteration,
nIterations,
chrono["iterations"].count()
});
}
LOG(0,"Atrip")
<< "iteration " << iteration
<< " [" << 100 * iteration / nIterations << "%]"
<< " (" << doublesFlops * iteration / chrono["doubles"].count()
<< "GF)"
<< " (" << doublesFlops * iteration / chrono["iterations"].count()
<< "GF)"
<< " ===========================\n";
// PRINT TIMINGS
if (in.chrono)
for (auto const& pair: chrono)
LOG(1, " ") << pair.first << " :: "
<< pair.second.count()
<< std::endl;
}
const ABCTuple abc = isFakeTuple(i)
? tuplesList[tuplesList.size() - 1]
: tuplesList[i]
, *abcNext = i == (abcIndex.second - 1)
? nullptr
: isFakeTuple(i + 1)
? &tuplesList[tuplesList.size() - 1]
: &tuplesList[i + 1]
;
chrono["with_rank"].start();
WITH_RANK << " :it " << iteration
<< " :abc " << pretty_print(abc)
<< " :abcN "
<< (abcNext ? pretty_print(*abcNext) : "None")
<< "\n";
chrono["with_rank"].stop();
// COMM FIRST DATABASE ================================================{{{1
if (i == abcIndex.first) {
WITH_RANK << "__first__:first database ............ \n";
const auto __db = communicateDatabase(abc, universe);
WITH_RANK << "__first__:first database communicated \n";
WITH_RANK << "__first__:first database io phase \n";
doIOPhase(__db);
WITH_RANK << "__first__:first database io phase DONE\n";
WITH_RANK << "__first__::::Unwrapping all slices for first database\n";
for (auto& u: unions) u->unwrapAll(abc);
WITH_RANK << "__first__::::Unwrapping all slices for first database DONE\n";
MPI_Barrier(universe);
}
// COMM NEXT DATABASE ================================================={{{1
if (abcNext) {
WITH_RANK << "__comm__:" << iteration << "th communicating database\n";
chrono["db:comm"].start();
//const auto db = communicateDatabase(*abcNext, universe);
Database db = communicateDatabase(*abcNext, universe);
chrono["db:comm"].stop();
chrono["db:io"].start();
doIOPhase(db);
chrono["db:io"].stop();
WITH_RANK << "__comm__:" << iteration << "th database io phase DONE\n";
}
// COMPUTE DOUBLES ===================================================={{{1
OCD_Barrier(universe);
if (!isFakeTuple(i)) {
WITH_RANK << iteration << "-th doubles\n";
WITH_CHRONO(chrono["oneshot-unwrap"],
WITH_CHRONO(chrono["unwrap"],
WITH_CHRONO(chrono["unwrap:doubles"],
for (auto& u: decltype(unions){&abph, &hhha, &taphh, &tabhh}) {
u->unwrapAll(abc);
}
)))
chrono["oneshot-doubles"].start();
chrono["doubles"].start();
doublesContribution<F>( abc, (size_t)No, (size_t)Nv
// -- VABCI
, abph.unwrapSlice(Slice<F>::AB, abc)
, abph.unwrapSlice(Slice<F>::AC, abc)
, abph.unwrapSlice(Slice<F>::BC, abc)
, abph.unwrapSlice(Slice<F>::BA, abc)
, abph.unwrapSlice(Slice<F>::CA, abc)
, abph.unwrapSlice(Slice<F>::CB, abc)
// -- VHHHA
, hhha.unwrapSlice(Slice<F>::A, abc)
, hhha.unwrapSlice(Slice<F>::B, abc)
, hhha.unwrapSlice(Slice<F>::C, abc)
// -- TA
, taphh.unwrapSlice(Slice<F>::A, abc)
, taphh.unwrapSlice(Slice<F>::B, abc)
, taphh.unwrapSlice(Slice<F>::C, abc)
// -- TABIJ
, tabhh.unwrapSlice(Slice<F>::AB, abc)
, tabhh.unwrapSlice(Slice<F>::AC, abc)
, tabhh.unwrapSlice(Slice<F>::BC, abc)
// -- TIJK
, Tijk.data()
, chrono
);
WITH_RANK << iteration << "-th doubles done\n";
chrono["doubles"].stop();
chrono["oneshot-doubles"].stop();
}
// COMPUTE SINGLES =================================================== {{{1
OCD_Barrier(universe);
if (!isFakeTuple(i)) {
WITH_CHRONO(chrono["oneshot-unwrap"],
WITH_CHRONO(chrono["unwrap"],
WITH_CHRONO(chrono["unwrap:singles"],
abhh.unwrapAll(abc);
)))
chrono["reorder"].start();
for (size_t I(0); I < Zijk.size(); I++) Zijk[I] = Tijk[I];
chrono["reorder"].stop();
chrono["singles"].start();
singlesContribution<F>( No, Nv, abc
, Tai.data()
, abhh.unwrapSlice(Slice<F>::AB, abc)
, abhh.unwrapSlice(Slice<F>::AC, abc)
, abhh.unwrapSlice(Slice<F>::BC, abc)
, Zijk.data());
chrono["singles"].stop();
}
// COMPUTE ENERGY ==================================================== {{{1
if (!isFakeTuple(i)) {
double tupleEnergy(0.);
int distinct(0);
if (abc[0] == abc[1]) distinct++;
if (abc[1] == abc[2]) distinct--;
const F epsabc(epsa[abc[0]] + epsa[abc[1]] + epsa[abc[2]]);
chrono["energy"].start();
if ( distinct == 0)
tupleEnergy = getEnergyDistinct<F>(epsabc, epsi, Tijk, Zijk);
else
tupleEnergy = getEnergySame<F>(epsabc, epsi, Tijk, Zijk);
chrono["energy"].stop();
#if defined(HAVE_OCD) || defined(ATRIP_PRINT_TUPLES)
tupleEnergies[abc] = tupleEnergy;
#endif
energy += tupleEnergy;
}
if (isFakeTuple(i)) {
// fake iterations should also unwrap whatever they got
WITH_RANK << iteration
<< "th unwrapping because of fake in "
<< i << "\n";
for (auto& u: unions) u->unwrapAll(abc);
}
#ifdef HAVE_OCD
for (auto const& u: unions) {
WITH_RANK << "__dups__:"
<< iteration
<< "-th n" << u->name << " checking duplicates\n";
u->checkForDuplicates();
}
#endif
// CLEANUP UNIONS ===================================================={{{1
OCD_Barrier(universe);
if (abcNext) {
chrono["gc"].start();
WITH_RANK << "__gc__:" << iteration << "-th cleaning up.......\n";
for (auto& u: unions) {
u->unwrapAll(abc);
WITH_RANK << "__gc__:n" << u->name << " :it " << iteration
<< " :abc " << pretty_print(abc)
<< " :abcN " << pretty_print(*abcNext)
<< "\n";
// for (auto const& slice: u->slices)
// WITH_RANK << "__gc__:guts:" << slice.info << "\n";
u->clearUnusedSlicesForNext(*abcNext);
WITH_RANK << "__gc__: checking validity\n";
#ifdef HAVE_OCD
// check for validity of the slices
for (auto type: u->sliceTypes) {
auto tuple = Slice<F>::subtupleBySlice(abc, type);
for (auto& slice: u->slices) {
if ( slice.info.type == type
&& slice.info.tuple == tuple
&& slice.isDirectlyFetchable()
) {
if (slice.info.state == Slice<F>::Dispatched)
throw std::domain_error( "This slice should not be undispatched! "
+ pretty_print(slice.info));
}
}
}
#endif
}
chrono["gc"].stop();
}
WITH_RANK << iteration << "-th cleaning up....... DONE\n";
chrono["iterations"].stop();
// ITERATION END ====================================================={{{1
}
// END OF MAIN LOOP
MPI_Barrier(universe);
// PRINT TUPLES ========================================================={{{1
#if defined(HAVE_OCD) || defined(ATRIP_PRINT_TUPLES)
LOG(0,"Atrip") << "tuple energies" << "\n";
for (size_t i = 0; i < np; i++) {
MPI_Barrier(universe);
for (auto const& pair: tupleEnergies) {
if (i == rank)
std::cout << pair.first[0]
<< " " << pair.first[1]
<< " " << pair.first[2]
<< std::setprecision(15) << std::setw(23)
<< " tupleEnergy: " << pair.second
<< "\n"
;
}
}
#endif
// COMMUNICATE THE ENERGIES ============================================={{{1
LOG(0,"Atrip") << "COMMUNICATING ENERGIES \n";
double globalEnergy = 0;
MPI_Reduce(&energy, &globalEnergy, 1, MPI_DOUBLE, MPI_SUM, 0, universe);
WITH_RANK << "local energy " << energy << "\n";
LOG(0, "Atrip") << "Energy: "
<< std::setprecision(15) << std::setw(23)
<< (- globalEnergy) << std::endl;
// PRINT TIMINGS {{{1
if (in.chrono)
for (auto const& pair: chrono)
LOG(0,"atrip:chrono") << pair.first << " "
<< pair.second.count() << std::endl;
LOG(0, "atrip:flops(doubles)")
<< nIterations * doublesFlops / chrono["doubles"].count() << "\n";
LOG(0, "atrip:flops(iterations)")
<< nIterations * doublesFlops / chrono["iterations"].count() << "\n";
// TODO: change the sign in the getEnergy routines
return { - globalEnergy };
}
// instantiate
template Atrip::Output Atrip::run(Atrip::Input<double> const& in);
template Atrip::Output Atrip::run(Atrip::Input<Complex> const& in);
#+end_src
** Debug and Logging
*** Macros
#+begin_src c++ :tangle (atrip-debug-h)
#pragma once
#include <functional>
#define ATRIP_BENCHMARK
//#define ATRIP_DONT_SLICE
#ifndef ATRIP_DEBUG
# define ATRIP_DEBUG 1
#endif
//#define ATRIP_WORKLOAD_DUMP
#define ATRIP_USE_DGEMM
//#define ATRIP_PRINT_TUPLES
#ifndef ATRIP_DEBUG
#define ATRIP_DEBUG 1
#endif
#if ATRIP_DEBUG == 4
# pragma message("WARNING: You have OCD debugging ABC triples " \
"expect GB of output and consult your therapist")
# include <dbg.h>
# define HAVE_OCD
# define OCD_Barrier(com) MPI_Barrier(com)
# define WITH_OCD
# define WITH_ROOT if (atrip::Atrip::rank == 0)
# define WITH_SPECIAL(r) if (atrip::Atrip::rank == r)
# define WITH_RANK std::cout << atrip::Atrip::rank << ": "
# define WITH_CRAZY_DEBUG
# define WITH_DBG
# define DBG(...) dbg(__VA_ARGS__)
#elif ATRIP_DEBUG == 3
# pragma message("WARNING: You have crazy debugging ABC triples," \
" expect GB of output")
# include <dbg.h>
# define OCD_Barrier(com)
# define WITH_OCD if (false)
# define WITH_ROOT if (atrip::Atrip::rank == 0)
# define WITH_SPECIAL(r) if (atrip::Atrip::rank == r)
# define WITH_RANK std::cout << atrip::Atrip::rank << ": "
# define WITH_CRAZY_DEBUG
# define WITH_DBG
# define DBG(...) dbg(__VA_ARGS__)
#elif ATRIP_DEBUG == 2
# pragma message("WARNING: You have some debugging info for ABC triples")
# include <dbg.h>
# define OCD_Barrier(com)
# define WITH_OCD if (false)
# define WITH_ROOT if (atrip::Atrip::rank == 0)
# define WITH_SPECIAL(r) if (atrip::Atrip::rank == r)
# define WITH_RANK std::cout << atrip::Atrip::rank << ": "
# define WITH_CRAZY_DEBUG if (false)
# define WITH_DBG
# define DBG(...) dbg(__VA_ARGS__)
#else
# define OCD_Barrier(com)
# define WITH_OCD if (false)
# define WITH_ROOT if (false)
# define WITH_SPECIAL(r) if (false)
# define WITH_RANK if (false) std::cout << atrip::Atrip::rank << ": "
# define WITH_DBG if (false)
# define WITH_CRAZY_DEBUG if (false)
# define DBG(...)
#endif
#+end_src
And users of the library can redefine the =LOG= macro
which in case of not being defined is defined as follows:
#+begin_src c++ :tangle (atrip-debug-h)
#ifndef LOG
#define LOG(level, name) if (Atrip::rank == 0) std::cout << name << ": "
#endif
#+end_src
Furthermore, if you do not wish to see any output from ATRIP, simply
define =ATRIP_NO_OUTPUT=
#+begin_src c++ :tangle (atrip-debug-h)
#ifdef ATRIP_NO_OUTPUT
# undef LOG
# define LOG(level, name) if (false) std::cout << name << ": "
#endif
#+end_src
*** Iteration informer
In general a code writer will want to write some messages in every iteration.
A developer then can register a function to be used in this sense.
The input of the function is an [[IterationDescriptor]] structure and the output
should be nothing.
#+name: IterationDescriptor
#+begin_src c++ :tangle (atrip-debug-h)
namespace atrip {
struct IterationDescription;
using IterationDescriptor = std::function<void(IterationDescription const&)>;
struct IterationDescription {
static IterationDescriptor descriptor;
size_t currentIteration;
size_t totalIterations;
double currentElapsedTime;
};
void registerIterationDescriptor(IterationDescriptor);
}
#+end_src
** Include header
#+begin_src c++ :tangle (atrip-main-h)
#pragma once
#include <atrip/Atrip.hpp>
#+end_src