atrip/atrip.org

#+title: ATRIP: An MPI-asynchronous implementation of CCSD(T)
#+PROPERTY: header-args+ :noweb yes :comments noweb :mkdirp t

* Implementation

The algorithm uses two main data types, the =Slice= and the
=SliceUnion= as a container and resource manager of the =Slice=.

** The slice


#+begin_src c++ :tangle (atrip-slice-h)
#pragma once
#include <iostream>
#include <algorithm>
#include <vector>
#include <mpi.h>

#include <atrip/Tuples.hpp>
#include <atrip/Utils.hpp>

namespace atrip {


struct Slice {

  using F = double;
#+end_src

A slice is the concept of a subset of values of a given tensor.
As an example, for the doubles amplitudes \( T^{ab}_{ij} \), one need two kinds of objects:
- the object \( \mathsf{T}(a)^b_{ij} \) which for every \( a \) gets assigned the
  tensor \( T^{ab}_{ij} \) of size \( N_\mathrm{o}^2 \times N_\mathrm{v} \)

- the object \( \mathsf{T}(a,b)_{ij} \) which for every pair of \( a, b \)
  corresponds the \( N_\mathrm{o}^2 \)-sized tensor \( T^{ab}_{ij} \).


#+begin_src c++ :tangle (atrip-slice-h)
  // ASSOCIATED TYPES %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

  struct Location { size_t rank; size_t source; };

  enum Type
    { A = 10
    , B
    , C
    // Two-parameter slices
    , AB = 20
    , BC
    , AC
    // for abci and the doubles
    , CB
    , BA
    , CA
    // The non-typed slice
    , Blank = 404
    };

  enum State {
    // Fetch represents the state where a slice is to be fetched
    // and has a valid data pointer that can be written to
    Fetch = 0,
    // Dispatches represents the state that an MPI call has been
    // dispatched in order to get the data, but the data has not been
    // yet unwrapped, the data might be there or we might have to wait.
    Dispatched = 2,
    // Ready means that the data pointer can be read from
    Ready = 1,
    // Self sufficient is a slice when its contents are located
    // in the same rank that it lives, so that it does not have to
    // fetch from no one else.
    SelfSufficient = 911,
    // Recycled means that this slice gets its data pointer from another
    // slice, so it should not be written to
    Recycled = 123,
    // Acceptor means that the Slice can accept a new Slice, it is
    // the counterpart of the Blank type, but for states
    Acceptor = 405
  };

  struct Info {
    // which part of a,b,c the slice holds
    PartialTuple tuple;
    // The type of slice for the user to retrieve the correct one
    Type type;
    // What is the state of the slice
    State state;
    // Where the slice is to be retrieved
    // NOTE: this can actually be computed from tuple
    Location from;
    // If the data are actually to be found in this other slice
    Type recycling;

    Info() : tuple{0,0}
           , type{Blank}
           , state{Acceptor}
           , from{0,0}
           , recycling{Blank}
           {}
  };

  using Ty_x_Tu = std::pair< Type, PartialTuple >;

  // Names of the integrals that are considered in CCSD(T)
  enum Name
    { TA    = 100
    , VIJKA = 101
    , VABCI = 200
    , TABIJ = 201
    , VABIJ = 202
    };

  // DATABASE ==========================================================={{{1
  struct LocalDatabaseElement {
    Slice::Name name;
    Slice::Info info;
  };
  using LocalDatabase = std::vector<LocalDatabaseElement>;
  using Database = LocalDatabase;


    // STATIC METHODS ===========================================================
    //
    // They are useful to organize the structure of slices

    struct mpi {

      static MPI_Datatype vector(size_t n, MPI_Datatype const& DT) {
        MPI_Datatype dt;
        MPI_Type_vector(n, 1, 1, DT, &dt);
        MPI_Type_commit(&dt);
        return dt;
      }

      static MPI_Datatype sliceLocation () {
        constexpr int n = 2;
        // create a sliceLocation to measure in the current architecture
        // the packing of the struct
        Slice::Location measure;
        MPI_Datatype dt;
        const std::vector<int> lengths(n, 1);
        const MPI_Datatype types[n] = {usizeDt(), usizeDt()};

        // measure the displacements in the struct
        size_t j = 0;
        MPI_Aint displacements[n];
        MPI_Get_address(&measure.rank,   &displacements[j++]);
        MPI_Get_address(&measure.source, &displacements[j++]);
        for (size_t i = 1; i < n; i++) displacements[i] -= displacements[0];
        displacements[0] = 0;

        MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt);
        MPI_Type_commit(&dt);
        return dt;
      }

      static MPI_Datatype enumDt() { return MPI_INT; }
      static MPI_Datatype usizeDt() { return MPI_UINT64_T; }

      static MPI_Datatype sliceInfo () {
        constexpr int n = 5;
        MPI_Datatype dt;
        Slice::Info measure;
        const std::vector<int> lengths(n, 1);
        const MPI_Datatype types[n]
          = { vector(2, usizeDt())
            , enumDt()
            , enumDt()
            , sliceLocation()
            , enumDt()
            };

        // create the displacements from the info measurement struct
        size_t j = 0;
        MPI_Aint displacements[n];
        MPI_Get_address(measure.tuple.data(), &displacements[j++]);
        MPI_Get_address(&measure.type,        &displacements[j++]);
        MPI_Get_address(&measure.state,       &displacements[j++]);
        MPI_Get_address(&measure.from,        &displacements[j++]);
        MPI_Get_address(&measure.recycling,   &displacements[j++]);
        for (size_t i = 1; i < n; i++) displacements[i] -= displacements[0];
        displacements[0] = 0;

        MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt);
        MPI_Type_commit(&dt);
        return dt;
      }

      static MPI_Datatype localDatabaseElement () {
        constexpr int n = 2;
        MPI_Datatype dt;
        LocalDatabaseElement measure;
        const std::vector<int> lengths(n, 1);
        const MPI_Datatype types[n]
          = { enumDt()
            , sliceInfo()
            };

        // measure the displacements in the struct
        size_t j = 0;
        MPI_Aint displacements[n];
        MPI_Get_address(&measure.name, &displacements[j++]);
        MPI_Get_address(&measure.info, &displacements[j++]);
        for (size_t i = 1; i < n; i++) displacements[i] -= displacements[0];
        displacements[0] = 0;

        MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt);
        MPI_Type_commit(&dt);
        return dt;
      }

    };

  static
  PartialTuple subtupleBySlice(ABCTuple abc, Type sliceType) {
    switch (sliceType) {
      case AB: return {abc[0], abc[1]};
      case BC: return {abc[1], abc[2]};
      case AC: return {abc[0], abc[2]};
      case CB: return {abc[2], abc[1]};
      case BA: return {abc[1], abc[0]};
      case CA: return {abc[2], abc[0]};
      case  A: return {abc[0], 0};
      case  B: return {abc[1], 0};
      case  C: return {abc[2], 0};
      default: throw "Switch statement not exhaustive!";
    }
  }


    /**
     ,* It is important here to return a reference to a Slice
     ,* not to accidentally copy the associated buffer of the slice.
     ,*/
    static Slice& findOneByType(std::vector<Slice> &slices, Slice::Type type) {
        const auto sliceIt
          = std::find_if(slices.begin(), slices.end(),
                         [&type](Slice const& s) {
                           return type == s.info.type;
                         });
        WITH_CRAZY_DEBUG
        WITH_RANK
          << "\t__ looking for " << type << "\n";
        if (sliceIt == slices.end())
          throw std::domain_error("Slice by type not found!");
        return *sliceIt;
    }

    /*
     ,* Check if an info has
     ,*
     ,*/
    static std::vector<Slice*> hasRecycledReferencingToIt
      ( std::vector<Slice> &slices
      , Info const& info
      ) {
      std::vector<Slice*> result;

      for (auto& s: slices)
        if (  s.info.recycling == info.type
           && s.info.tuple == info.tuple
           && s.info.state == Recycled
           ) result.push_back(&s);

      return result;
    }

    static Slice&
    findRecycledSource (std::vector<Slice> &slices, Slice::Info info) {
      const auto sliceIt
        = std::find_if(slices.begin(), slices.end(),
                       [&info](Slice const& s) {
                         return info.recycling == s.info.type
                             && info.tuple == s.info.tuple
                             && State::Recycled != s.info.state
                             ;
                       });

      WITH_CRAZY_DEBUG
      WITH_RANK << "__slice__:find: recycling source of "
                << pretty_print(info) << "\n";
      if (sliceIt == slices.end())
        throw std::domain_error( "Slice not found: "
                               + pretty_print(info)
                               + " rank: "
                               + pretty_print(Atrip::rank)
                               );
      WITH_RANK << "__slice__:find: " << pretty_print(sliceIt->info) << "\n";
      return *sliceIt;
    }

    static Slice& findByTypeAbc
      ( std::vector<Slice> &slices
      , Slice::Type type
      , ABCTuple const& abc
      ) {
        const auto tuple = Slice::subtupleBySlice(abc, type);
        const auto sliceIt
          = std::find_if(slices.begin(), slices.end(),
                         [&type, &tuple](Slice const& s) {
                           return type == s.info.type
                               && tuple == s.info.tuple
                               ;
                         });
        WITH_CRAZY_DEBUG
        WITH_RANK << "__slice__:find:" << type << " and tuple "
                  << pretty_print(tuple)
                  << "\n";
        if (sliceIt == slices.end())
          throw std::domain_error( "Slice not found: "
                                 + pretty_print(tuple)
                                 + ", "
                                 + pretty_print(type)
                                 + " rank: "
                                 + pretty_print(Atrip::rank)
                                 );
        return *sliceIt;
    }

    static Slice& findByInfo(std::vector<Slice> &slices,
                             Slice::Info const& info) {
        const auto sliceIt
          = std::find_if(slices.begin(), slices.end(),
                         [&info](Slice const& s) {
                           // TODO: maybe implement comparison in Info struct
                           return info.type == s.info.type
                               && info.state == s.info.state
                               && info.tuple == s.info.tuple
                               && info.from.rank == s.info.from.rank
                               && info.from.source == s.info.from.source
                                ;
                         });
        WITH_CRAZY_DEBUG
        WITH_RANK << "__slice__:find:looking for " << pretty_print(info) << "\n";
        if (sliceIt == slices.end())
          throw std::domain_error( "Slice by info not found: "
                                 + pretty_print(info));
        return *sliceIt;
    }

    // SLICE DEFINITION  =================================================={{{1

    // ATTRIBUTES ============================================================
    Info info;
    F  *data;
    MPI_Request request;
    const size_t size;

    void markReady() noexcept {
      info.state = Ready;
      info.recycling = Blank;
    }

    /*
     ,* This means that the data is there
     ,*/
    bool isUnwrapped() const noexcept {
      return info.state == Ready
          || info.state == SelfSufficient
          ;
    }

    bool isUnwrappable() const noexcept {
      return isUnwrapped()
          || info.state == Recycled
          || info.state == Dispatched
          ;
    }

    inline bool isDirectlyFetchable() const noexcept {
      return info.state == Ready || info.state == Dispatched;
    }

    void free() noexcept {
      info.tuple      = {0, 0};
      info.type       = Blank;
      info.state      = Acceptor;
      info.from       = {0, 0};
      info.recycling  = Blank;
      data            = nullptr;
    }

    inline bool isFree() const noexcept {
      return info.tuple       == PartialTuple{0, 0}
          && info.type        == Blank
          && info.state       == Acceptor
          && info.from.rank   == 0
          && info.from.source == 0
          && info.recycling   == Blank
          && data             == nullptr
           ;
    }


    /*
     ,* This function answers the question, which slices can be recycled.
     ,*
     ,* A slice can only be recycled if it is Fetch or Ready and has
     ,* a valid datapointer.
     ,*
     ,* In particular, SelfSufficient are not recyclable, since it is easier
     ,* just to create a SelfSufficient slice than deal with data dependencies.
     ,*
     ,* Furthermore, a recycled slice is not recyclable, if this is the case
     ,* then it is either bad design or a bug.
     ,*/
    inline bool isRecyclable() const noexcept {
      return (  info.state == Dispatched
             || info.state == Ready
             || info.state == Fetch
             )
          && hasValidDataPointer()
          ;
    }

    /*
     ,* This function describes if a slice has a valid data pointer.
     ,*
     ,* This is important to know if the slice has some data to it, also
     ,* some structural checks are done, so that it should not be Acceptor
     ,* or Blank, if this is the case then it is a bug.
     ,*/
    inline bool hasValidDataPointer() const noexcept {
      return data       != nullptr
          && info.state != Acceptor
          && info.type  != Blank
          ;
    }

    void unwrapAndMarkReady() {
      if (info.state == Ready) return;
      if (info.state != Dispatched)
        throw
          std::domain_error("Can't unwrap a non-ready, non-dispatched slice!");
      markReady();
      MPI_Status status;
#ifdef HAVE_OCD
        WITH_RANK << "__slice__:mpi: waiting " << "\n";
#endif
      const int errorCode = MPI_Wait(&request, &status);
      if (errorCode != MPI_SUCCESS)
        throw "MPI ERROR HAPPENED....";

#ifdef HAVE_OCD
      char errorString[MPI_MAX_ERROR_STRING];
      int errorSize;
      MPI_Error_string(errorCode, errorString, &errorSize);

      WITH_RANK << "__slice__:mpi: status "
                << "{ .source="    << status.MPI_SOURCE
                << ", .tag="       << status.MPI_TAG
                << ", .error="     << status.MPI_ERROR
                << ", .errCode="   << errorCode
                << ", .err="       << errorString
                << " }"
                << "\n";
#endif
    }

    Slice(size_t size_)
      : info({})
      , data(nullptr)
      , size(size_)
      {}


  }; // struct Slice


std::ostream& operator<<(std::ostream& out, Slice::Location const& v) {
  // TODO: remove me
  out << "{.r(" << v.rank << "), .s(" << v.source << ")};";
  return out;
}

std::ostream& operator<<(std::ostream& out, Slice::Info const& i) {
  out << "«t" << i.type << ", s" << i.state << "»"
      << " ⊙ {" << i.from.rank << ", " << i.from.source << "}"
      << " ∴ {" << i.tuple[0] << ", " << i.tuple[1] << "}"
      << " ♲t" << i.recycling
      ;
  return out;
}

} // namespace atrip
#+end_src

** Utils
#+begin_src c++ :tangle (atrip-utils-h)
#pragma once
#include <sstream>
#include <string>
#include <map>
#include <chrono>

#include <ctf.hpp>

namespace atrip {


  template <typename T>
  std::string pretty_print(T&& value) {
    std::stringstream stream;
#if ATRIP_DEBUG > 1
    dbg::pretty_print(stream, std::forward<T>(value));
#endif
    return stream.str();
  }

#define WITH_CHRONO(__chrono, ...) \
  __chrono.start(); __VA_ARGS__ __chrono.stop();

  struct Timer {
    using Clock = std::chrono::high_resolution_clock;
    using Event = std::chrono::time_point<Clock>;
    std::chrono::duration<double> duration;
    Event _start;
    inline void start() noexcept { _start = Clock::now(); }
    inline void stop() noexcept { duration += Clock::now() - _start; }
    inline void clear() noexcept { duration *= 0; }
    inline double count() const noexcept { return duration.count(); }
  };
  using Timings = std::map<std::string, Timer>;
}

#+end_src

** The rank mapping
#+begin_src c++ :tangle (atrip-rankmap-h)
#pragma once

#include <vector>
#include <algorithm>

#include <atrip/Slice.hpp>

namespace atrip {
  struct RankMap {

    std::vector<size_t> const lengths;
    size_t const np, size;

    RankMap(std::vector<size_t> lens, size_t np_)
      : lengths(lens)
      , np(np_)
      , size(std::accumulate(lengths.begin(), lengths.end(),
                            1, std::multiplies<size_t>()))
    { assert(lengths.size() <= 2); }

    size_t find(Slice::Location const& p) const noexcept {
      return p.source * np + p.rank;
    }

    size_t nSources() const noexcept {
      return size / np + size_t(size % np != 0);
    }


    bool isPaddingRank(size_t rank) const noexcept {
      return size % np == 0
          ? false
          : rank > (size % np - 1)
          ;
    }

    bool isSourcePadding(size_t rank, size_t source) const noexcept {
      return source == nSources() && isPaddingRank(rank);
    }

    Slice::Location
    find(ABCTuple const& abc, Slice::Type sliceType) const noexcept {
      // tuple = {11, 8} when abc = {11, 8, 9} and sliceType = AB
      const auto tuple = Slice::subtupleBySlice(abc, sliceType);

      const size_t index
        = tuple[0]
        + tuple[1] * (lengths.size() > 1 ? lengths[0] : 0)
        ;

      return
        { index % np
        , index / np
        };
    }

  };

}
#+end_src

** The slice union
#+begin_src c++ :tangle (atrip-slice-union-h)
#pragma once
#include <atrip/Debug.hpp>
#include <atrip/Slice.hpp>
#include <atrip/RankMap.hpp>

namespace atrip {

  struct SliceUnion {
    using F = double;
    using Tensor = CTF::Tensor<F>;

    virtual void
    sliceIntoBuffer(size_t iteration, Tensor &to, Tensor const& from) = 0;

    /*
     * This function should enforce an important property of a SliceUnion.
     * Namely, there can be no two Slices of the same nature.
     *
     * This means that there can be at most one slice with a given Ty_x_Tu.
     */
    void checkForDuplicates() const {
      std::vector<Slice::Ty_x_Tu> tytus;
      for (auto const& s: slices) {
        if (s.isFree()) continue;
        tytus.push_back({s.info.type, s.info.tuple});
      }

      for (auto const& tytu: tytus) {
        if (std::count(tytus.begin(), tytus.end(), tytu) > 1)
          throw "Invariance violated, more than one slice with same Ty_x_Tu";
      }

    }

    std::vector<Slice::Ty_x_Tu> neededSlices(ABCTuple const& abc) {
      std::vector<Slice::Ty_x_Tu> needed(sliceTypes.size());
      // build the needed vector
      std::transform(sliceTypes.begin(), sliceTypes.end(),
                     needed.begin(),
                     [&abc](Slice::Type const type) {
                       auto tuple = Slice::subtupleBySlice(abc, type);
                       return std::make_pair(type, tuple);
                     });
      return needed;
    }

    /* buildLocalDatabase
     *
     * It should build a database of slices so that we know what is needed
     * to fetch in the next iteration represented by the tuple 'abc'.
     *
     * 1. The algorithm works as follows, we build a database of the all
     * the slice types that we need together with their tuple.
     *
     * 2. Look in the SliceUnion if we already have this tuple,
     * if we already have it mark it (TODO)
     *
     * 3. If we don't have the tuple, look for a (state=acceptor, type=blank)
     * slice and mark this slice as type=Fetch with the corresponding type
     * and tuple.
     *
     * NOTE: The algorithm should certify that we always have enough blank
     * slices.
     *
     */
    Slice::LocalDatabase buildLocalDatabase(ABCTuple const& abc) {
      Slice::LocalDatabase result;

      auto const needed = neededSlices(abc);

      WITH_RANK << "__db__:needed:" << pretty_print(needed) << "\n";
      // BUILD THE DATABASE
      // we need to loop over all sliceTypes that this TensorUnion
      // is representing and find out how we will get the corresponding
      // slice for the abc we are considering right now.
      for (auto const& pair: needed) {
        auto const type = pair.first;
        auto const tuple = pair.second;
        auto const from  = rankMap.find(abc, type);

#ifdef HAVE_OCD
        WITH_RANK << "__db__:want:" << pretty_print(pair) << "\n";
        for (auto const& s: slices)
          WITH_RANK << "__db__:guts:ocd "
                    << s.info << " pt " << s.data
                    << "\n";
#endif

#ifdef HAVE_OCD
        WITH_RANK << "__db__: checking if exact match" << "\n";
#endif
        {
          // FIRST: look up if there is already a *Ready* slice matching what we
          // need
          auto const& it
            = std::find_if(slices.begin(), slices.end(),
                           [&tuple, &type](Slice const& other) {
                             return other.info.tuple == tuple
                                 && other.info.type == type
                                    // we only want another slice when it
                                    // has already ready-to-use data
                                 && other.isUnwrappable()
                                 ;
                           });
          if (it != slices.end()) {
            // if we find this slice, it means that we don't have to do anything
            WITH_RANK << "__db__: EXACT: found EXACT in name=" << name
                      << " for tuple " << tuple[0] << ", " << tuple[1]
                      << " ptr " << it->data
                      << "\n";
            result.push_back({name, it->info});
            continue;
          }
        }

#ifdef HAVE_OCD
        WITH_RANK << "__db__: checking if recycle" << "\n";
#endif
        // Try to find a recyling possibility ie. find a slice with the same
        // tuple and that has a valid data pointer.
        auto const& recycleIt
          = std::find_if(slices.begin(), slices.end(),
                         [&tuple, &type](Slice const& other) {
                           return other.info.tuple == tuple
                               && other.info.type != type
                               && other.isRecyclable()
                               ;
                         });

        // if we find this recylce, then we find a Blank slice
        // (which should exist by construction :THINK)
        //
        if (recycleIt != slices.end()) {
          auto& blank = Slice::findOneByType(slices, Slice::Blank);
          // TODO: formalize this through a method to copy information
          //       from another slice
          blank.data = recycleIt->data;
          blank.info.type = type;
          blank.info.tuple = tuple;
          blank.info.state = Slice::Recycled;
          blank.info.from = from;
          blank.info.recycling = recycleIt->info.type;
          result.push_back({name, blank.info});
          WITH_RANK << "__db__: RECYCLING: n" << name
                    << " " << pretty_print(abc)
                    << " get " << pretty_print(blank.info)
                    << " from " << pretty_print(recycleIt->info)
                    << " ptr " << recycleIt->data
                    << "\n"
                    ;
          continue;
        }

        // in this case we have to create a new slice
        // this means that we should have a blank slice at our disposal
        // and also the freePointers should have some elements inside,
        // so we pop a data pointer from the freePointers container
#ifdef HAVE_OCD
        WITH_RANK << "__db__: none work, doing new" << "\n";
#endif
        {
          WITH_RANK << "__db__: NEW: finding blank in " << name
                    << " for type " << type
                    << " for tuple " << tuple[0] << ", " << tuple[1]
                    << "\n"
                    ;
          auto& blank = Slice::findOneByType(slices, Slice::Blank);
          blank.info.type = type;
          blank.info.tuple = tuple;
          blank.info.from = from;

          // Handle self sufficiency
          blank.info.state = Atrip::rank == from.rank
                           ? Slice::SelfSufficient
                           : Slice::Fetch
                           ;
          if (blank.info.state == Slice::SelfSufficient) {
            blank.data = sources[from.source].data();
          } else {
            if (freePointers.size() == 0)
              throw std::domain_error("No more free pointers!");
            auto dataPointer = freePointers.begin();
            freePointers.erase(dataPointer);
            blank.data = *dataPointer;
          }

          result.push_back({name, blank.info});
          continue;
        }

      }

#ifdef HAVE_OCD
      for (auto const& s: slices)
        WITH_RANK << "__db__:guts:ocd:__end__ " << s.info << "\n";
#endif


      return result;

    }

    /*
     * Garbage collect slices not needed for the next iteration.
     *
     * It will throw if it tries to gc a slice that has not been
     * previously unwrapped, as a safety mechanism.
     */
    void clearUnusedSlicesForNext(ABCTuple const& abc) {
      auto const needed = neededSlices(abc);

      // CLEAN UP SLICES, FREE THE ONES THAT ARE NOT NEEDED ANYMORE
      for (auto& slice: slices) {
        // if the slice is free, then it was not used anyways
        if (slice.isFree()) continue;


        // try to find the slice in the needed slices list
        auto const found
          = std::find_if(needed.begin(), needed.end(),
                         [&slice] (Slice::Ty_x_Tu const& tytu) {
                           return slice.info.tuple == tytu.second
                               && slice.info.type == tytu.first
                               ;
                         });

        // if we did not find slice in needed, then erase it
        if (found == needed.end()) {

          // We have to be careful about the data pointer,
          // for SelfSufficient, the data pointer is a source pointer
          // of the slice, so we should just wipe it.
          //
          // For Ready slices, we have to be careful if there are some
          // recycled slices depending on it.
          bool freeSlicePointer = true;

          // allow to gc unwrapped and recycled, never Fetch,
          // if we have a Fetch slice then something has gone very wrong.
          if (!slice.isUnwrapped() && slice.info.state != Slice::Recycled)
            throw
              std::domain_error("Trying to garbage collect "
                                " a non-unwrapped slice! "
                                + pretty_print(&slice)
                                + pretty_print(slice.info));

          // it can be that our slice is ready, but it has some hanging
          // references lying around in the form of a recycled slice.
          // Of course if we need the recycled slice the next iteration
          // this would be fatal, because we would then free the pointer
          // of the slice and at some point in the future we would
          // overwrite it. Therefore, we must check if slice has some
          // references in slices and if so then
          //
          //  - we should mark those references as the original (since the data
          //    pointer should be the same)
          //
          //  - we should make sure that the data pointer of slice
          //    does not get freed.
          //
          if (slice.info.state == Slice::Ready) {
            WITH_OCD WITH_RANK
              << "__gc__:" << "checking for data recycled dependencies\n";
            auto recycled
              = Slice::hasRecycledReferencingToIt(slices, slice.info);
            if (recycled.size()) {
              Slice* newReady = recycled[0];
              WITH_OCD WITH_RANK
                << "__gc__:" << "swaping recycled "
                << pretty_print(newReady->info)
                << " and "
                << pretty_print(slice.info)
                << "\n";
              newReady->markReady();
              assert(newReady->data == slice.data);
              freeSlicePointer = false;

              for (size_t i = 1; i < recycled.size(); i++) {
                auto newRecyled = recycled[i];
                newRecyled->info.recycling = newReady->info.type;
                WITH_OCD WITH_RANK
                  << "__gc__:" << "updating recycled "
                  << pretty_print(newRecyled->info)
                  << "\n";
              }

            }
          }

          // if the slice is self sufficient, do not dare touching the
          // pointer, since it is a pointer to our sources in our rank.
          if (  slice.info.state == Slice::SelfSufficient
             || slice.info.state == Slice::Recycled
             ) {
            freeSlicePointer = false;
          }

          // make sure we get its data pointer to be used later
          // only for non-recycled, since it can be that we need
          // for next iteration the data of the slice that the recycled points
          // to
          if (freeSlicePointer) {
            freePointers.insert(slice.data);
            WITH_RANK << "~~~:cl(" << name << ")"
                      << " added to freePointer "
                      << pretty_print(freePointers)
                      << "\n";
          } else {
            WITH_OCD WITH_RANK << "__gc__:not touching the free Pointer\n";
          }

          // at this point, let us blank the slice
          WITH_RANK << "~~~:cl(" << name << ")"
                    << " freeing up slice "
                    << " info " << slice.info
                    << "\n";
          slice.free();
        }

      }
    }

    // CONSTRUCTOR
    SliceUnion( Tensor const& sourceTensor
              , std::vector<Slice::Type> sliceTypes_
              , std::vector<size_t> sliceLength_
              , std::vector<size_t> paramLength
              , size_t np
              , MPI_Comm child_world
              , MPI_Comm global_world
              , Slice::Name name_
              , size_t nSliceBuffers = 4
              )
              : rankMap(paramLength, np)
              , world(child_world)
              , universe(global_world)
              , sliceLength(sliceLength_)
              , sources(rankMap.nSources(),
                        std::vector<F>
                          (std::accumulate(sliceLength.begin(),
                                           sliceLength.end(),
                                           1, std::multiplies<size_t>())))
              , name(name_)
              , sliceTypes(sliceTypes_)
              , sliceBuffers(nSliceBuffers, sources[0])
              //, slices(2 * sliceTypes.size(), Slice{ sources[0].size() })
    { // constructor begin

      LOG(0,"Atrip") << "INIT SliceUnion: " << name << "\n";

      slices
        = std::vector<Slice>(2 * sliceTypes.size(), { sources[0].size() });
      // TODO: think exactly ^------------------- about this number

      // initialize the freePointers with the pointers to the buffers
      std::transform(sliceBuffers.begin(), sliceBuffers.end(),
                     std::inserter(freePointers, freePointers.begin()),
                     [](std::vector<F> &vec) { return vec.data(); });


      LOG(1,"Atrip") << "rankMap.nSources "
                           << rankMap.nSources() << "\n";
      LOG(1,"Atrip") << "#slices "
                           << slices.size() << "\n";
      LOG(1,"Atrip") << "#slices[0] "
                           << slices[0].size << "\n";
      LOG(1,"Atrip") << "#sources "
                           << sources.size() << "\n";
      LOG(1,"Atrip") << "#sources[0] "
                           << sources[0].size() << "\n";
      LOG(1,"Atrip") << "#freePointers "
                           << freePointers.size() << "\n";
      LOG(1,"Atrip") << "#sliceBuffers "
                           << sliceBuffers.size() << "\n";
      LOG(1,"Atrip") << "#sliceBuffers[0] "
                           << sliceBuffers[0].size() << "\n";
      LOG(1,"Atrip") << "#sliceLength "
                           << sliceLength.size() << "\n";
      LOG(1,"Atrip") << "#paramLength "
                           << paramLength.size() << "\n";
      LOG(1,"Atrip") << "GB*" << np << " "
                           << double(sources.size() + sliceBuffers.size())
                            * sources[0].size()
                            * 8 * np
                            / 1073741824.0
                           << "\n";
    } // constructor ends

    void init(Tensor const& sourceTensor) {

      CTF::World w(world);
      const int rank = Atrip::rank
              , order = sliceLength.size()
              ;
      std::vector<int> const syms(order, NS);
      std::vector<int> __sliceLength(sliceLength.begin(), sliceLength.end());
      Tensor toSliceInto(order,
                         __sliceLength.data(),
                         syms.data(),
                         w);
      LOG(1,"Atrip") << "slicing... \n";

      // setUp sources
      for (size_t it(0); it < rankMap.nSources(); ++it) {
        const size_t
          source = rankMap.isSourcePadding(rank, source) ? 0 : it;
        WITH_OCD
        WITH_RANK
          << "Init:toSliceInto it-" << it
          << " :: source " << source << "\n";
        sliceIntoBuffer(source, toSliceInto, sourceTensor);
      }

    }

    /**
     * \brief Send asynchronously only if the state is Fetch
     */
    void send( size_t otherRank
             , Slice::Info const& info
             , size_t tag) const noexcept {
      MPI_Request request;
      bool sendData_p = false;

      if (info.state == Slice::Fetch) sendData_p = true;
      // TODO: remove this because I have SelfSufficient
      if (otherRank == info.from.rank)      sendData_p = false;
      if (!sendData_p) return;

      MPI_Isend( sources[info.from.source].data()
               , sources[info.from.source].size()
               , MPI_DOUBLE /* TODO: adapt this with traits */
               , otherRank
               , tag
               , universe
               , &request
               );
      WITH_CRAZY_DEBUG
      WITH_RANK << "sent to " << otherRank << "\n";

    }

    /**
     * \brief Receive asynchronously only if the state is Fetch
     */
    void receive(Slice::Info const& info, size_t tag) noexcept {
      auto& slice = Slice::findByInfo(slices, info);

      if (Atrip::rank == info.from.rank) return;

      if (slice.info.state == Slice::Fetch) {
        // TODO: do it through the slice class
        slice.info.state = Slice::Dispatched;
        MPI_Request request = nullptr;
        slice.request = request;
        MPI_Irecv( slice.data
                 , slice.size
                 , MPI_DOUBLE // TODO: Adapt this with traits
                 , info.from.rank
                 , tag
                 , universe
                 , &slice.request
                //, MPI_STATUS_IGNORE
                 );
      }
    }

    void unwrapAll(ABCTuple const& abc) {
      for (auto type: sliceTypes) unwrapSlice(type, abc);
    }

    F* unwrapSlice(Slice::Type type, ABCTuple const& abc) {
      WITH_CRAZY_DEBUG
      WITH_RANK << "__unwrap__:slice " << type << " w n "
                << name
                << " abc" << pretty_print(abc)
                << "\n";
      auto& slice = Slice::findByTypeAbc(slices, type, abc);
      WITH_RANK << "__unwrap__:info " << slice.info << "\n";
      switch  (slice.info.state) {
        case Slice::Dispatched:
          WITH_RANK << "__unwrap__:Fetch: " << &slice
                    << " info " << pretty_print(slice.info)
                    << "\n";
          slice.unwrapAndMarkReady();
          return slice.data;
          break;
        case Slice::SelfSufficient:
          WITH_RANK << "__unwrap__:SelfSufficient: " << &slice
                    << " info " << pretty_print(slice.info)
                    << "\n";
          return slice.data;
          break;
        case Slice::Ready:
          WITH_RANK << "__unwrap__:READY: UNWRAPPED ALREADY" << &slice
                    << " info " << pretty_print(slice.info)
                    << "\n";
          return slice.data;
          break;
        case Slice::Recycled:
          WITH_RANK << "__unwrap__:RECYCLED " << &slice
                    << " info " << pretty_print(slice.info)
                    << "\n";
          return unwrapSlice(slice.info.recycling, abc);
          break;
        case Slice::Fetch:
        case Slice::Acceptor:
          throw std::domain_error("Can't unwrap an acceptor or fetch slice!");
          break;
        default:
          throw std::domain_error("Unknown error unwrapping slice!");
      }
      return slice.data;
    }

    const RankMap rankMap;
    const MPI_Comm world;
    const MPI_Comm universe;
    const std::vector<size_t> sliceLength;
    std::vector< std::vector<F> > sources;
    std::vector< Slice > slices;
    Slice::Name name;
    const std::vector<Slice::Type> sliceTypes;
    std::vector< std::vector<F> > sliceBuffers;
    std::set<F*> freePointers;

  };

  SliceUnion&
  unionByName(std::vector<SliceUnion*> const& unions, Slice::Name name) {
      const auto sliceUnionIt
        = std::find_if(unions.begin(), unions.end(),
                      [&name](SliceUnion const* s) {
                        return name == s->name;
                      });
      if (sliceUnionIt == unions.end())
        throw std::domain_error("SliceUnion not found!");
      return **sliceUnionIt;
  }

}
#+end_src

** Tuples
#+begin_src c++ :tangle (atrip-tuples-h)
#pragma once

#include <vector>
#include <array>

#include <atrip/Utils.hpp>
#include <atrip/Debug.hpp>

namespace atrip {

  using ABCTuple = std::array<size_t, 3>;
  using PartialTuple = std::array<size_t, 2>;
  using ABCTuples = std::vector<ABCTuple>;

  ABCTuples getTuplesList(size_t Nv) {
    const size_t n = Nv * (Nv + 1) * (Nv + 2) / 6 - Nv;
    ABCTuples result(n);
    size_t u(0);

    for (size_t a(0); a < Nv; a++)
    for (size_t b(a); b < Nv; b++)
    for (size_t c(b); c < Nv; c++){
      if ( a == b && b == c ) continue;
      result[u++] = {a, b, c};
    }

    return result;

  }


  std::pair<size_t, size_t>
  getABCRange(size_t np, size_t rank, ABCTuples const& tuplesList) {

    std::vector<size_t> n_tuples_per_rank(np, tuplesList.size()/np);
    const size_t
        // how many valid tuples should we still verteilen to nodes
        // since the number of tuples is not divisible by the number of nodes
        nRoundRobin = tuplesList.size() % np
        // every node must have the sanme amount of tuples in order for the
        // other nodes to receive and send somewhere, therefore
        // some nodes will get extra tuples but that are dummy tuples
      , nExtraInvalid = (np - nRoundRobin) % np
      ;

    if (nRoundRobin) for (int i = 0; i < np; i++) n_tuples_per_rank[i]++;

  #if defined(TODO)
    assert( tuplesList.size()
            ==
            ( std::accumulate(n_tuples_per_rank.begin(),
                              n_tuples_per_rank.end(),
                              0)
            + nExtraInvalid
            ));
  #endif

    WITH_RANK << "nRoundRobin = " << nRoundRobin << "\n";
    WITH_RANK << "nExtraInvalid = " << nExtraInvalid << "\n";
    WITH_RANK << "ntuples = " << n_tuples_per_rank[rank] << "\n";

    auto const& it = n_tuples_per_rank.begin();

    return
      { std::accumulate(it, it + rank    , 0)
      , std::accumulate(it, it + rank + 1, 0)
      };

  }

}
#+end_src

** Unions
Since every tensor slice in a different way, we can override the slicing procedure
and define subclasses of slice unions.

#+begin_src c++ :tangle (atrip-unions-h)
#pragma once
#include <atrip/SliceUnion.hpp>

namespace atrip {

  void sliceIntoVector
    ( std::vector<double> &v
    , CTF::Tensor<double> &toSlice
    , std::vector<int64_t> const low
    , std::vector<int64_t> const up
    , CTF::Tensor<double> const& origin
    , std::vector<int64_t> const originLow
    , std::vector<int64_t> const originUp
    ) {
    // Thank you CTF for forcing me to do this
    struct { std::vector<int> up, low; }
        toSlice_ = { {up.begin(), up.end()}
                   , {low.begin(), low.end()} }
      , origin_ = { {originUp.begin(), originUp.end()}
                  , {originLow.begin(), originLow.end()} }
      ;

    WITH_OCD
    WITH_RANK << "slicing into " << pretty_print(toSlice_.up)
                          << "," << pretty_print(toSlice_.low)
              << " from " << pretty_print(origin_.up)
                   << "," << pretty_print(origin_.low)
              << "\n";

#ifndef ATRIP_DONT_SLICE
    toSlice.slice( toSlice_.low.data()
                 , toSlice_.up.data()
                 , 0.0
                 , origin
                 , origin_.low.data()
                 , origin_.up.data()
                 , 1.0);
    memcpy(v.data(), toSlice.data, sizeof(double) * v.size());
#endif

  }


  struct TAPHH : public SliceUnion {
    TAPHH( Tensor const& sourceTensor
         , size_t No
         , size_t Nv
         , size_t np
         , MPI_Comm child_world
         , MPI_Comm global_world
         ) : SliceUnion( sourceTensor
                       , {Slice::A, Slice::B, Slice::C}
                       , {Nv, No, No} // size of the slices
                       , {Nv}
                       , np
                       , child_world
                       , global_world
                       , Slice::TA
                       , 4) {
           init(sourceTensor);
         }

    void sliceIntoBuffer(size_t it, Tensor &to, Tensor const& from) override
    {
      const int rank = Atrip::rank
              , Nv = sliceLength[0]
              , No = sliceLength[1]
              , a = rankMap.find({static_cast<size_t>(rank), it});
              ;


      sliceIntoVector( sources[it]
                     , to,   {0, 0, 0},    {Nv, No, No}
                     , from, {a, 0, 0, 0}, {a+1, Nv, No, No}
                     );

    }

  };


  struct HHHA : public SliceUnion {
    HHHA( Tensor const& sourceTensor
        , size_t No
        , size_t Nv
        , size_t np
        , MPI_Comm child_world
        , MPI_Comm global_world
        ) : SliceUnion( sourceTensor
                      , {Slice::A, Slice::B, Slice::C}
                      , {No, No, No} // size of the slices
                      , {Nv}         // size of the parametrization
                      , np
                      , child_world
                      , global_world
                      , Slice::VIJKA
                      , 4) {
           init(sourceTensor);
         }

    void sliceIntoBuffer(size_t it, Tensor &to, Tensor const& from) override
    {

      const int rank = Atrip::rank
              , No = sliceLength[0]
              , a = rankMap.find({static_cast<size_t>(rank), it})
              ;

      sliceIntoVector( sources[it]
                     , to,   {0, 0, 0},    {No, No, No}
                     , from, {0, 0, 0, a}, {No, No, No, a+1}
                     );

    }
  };

  struct ABPH : public SliceUnion {
    ABPH( Tensor const& sourceTensor
        , size_t No
        , size_t Nv
        , size_t np
        , MPI_Comm child_world
        , MPI_Comm global_world
        ) : SliceUnion( sourceTensor
                      , { Slice::AB, Slice::BC, Slice::AC
                        , Slice::BA, Slice::CB, Slice::CA
                        }
                      , {Nv, No} // size of the slices
                      , {Nv, Nv} // size of the parametrization
                      , np
                      , child_world
                      , global_world
                      , Slice::VABCI
                      , 2*6) {
           init(sourceTensor);
         }

    void sliceIntoBuffer(size_t it, Tensor &to, Tensor const& from) override {

      const int Nv = sliceLength[0]
              , No = sliceLength[1]
              , rank = Atrip::rank
              , el = rankMap.find({static_cast<size_t>(rank), it})
              , a = el % Nv
              , b = el / Nv
              ;


      sliceIntoVector( sources[it]
                     , to,   {0, 0},       {Nv, No}
                     , from, {a, b, 0, 0}, {a+1, b+1, Nv, No}
                     );

    }

  };

  struct ABHH : public SliceUnion {
    ABHH( Tensor const& sourceTensor
        , size_t No
        , size_t Nv
        , size_t np
        , MPI_Comm child_world
        , MPI_Comm global_world
        ) : SliceUnion( sourceTensor
                      , {Slice::AB, Slice::BC, Slice::AC}
                      , {No, No} // size of the slices
                      , {Nv, Nv} // size of the parametrization
                      , np
                      , child_world
                      , global_world
                      , Slice::VABIJ
                      , 6) {
           init(sourceTensor);
         }

    void sliceIntoBuffer(size_t it, Tensor &to, Tensor const& from) override {

      const int Nv = from.lens[0]
              , No = sliceLength[1]
              , rank = Atrip::rank
              , el = rankMap.find({static_cast<size_t>(rank), it})
              , a = el % Nv
              , b = el / Nv
              ;

      sliceIntoVector( sources[it]
                     , to,   {0, 0},       {No, No}
                     , from, {a, b, 0, 0}, {a+1, b+1, No, No}
                     );


    }

  };


  struct TABHH : public SliceUnion {
    TABHH( Tensor const& sourceTensor
         , size_t No
         , size_t Nv
         , size_t np
         , MPI_Comm child_world
         , MPI_Comm global_world
         ) : SliceUnion( sourceTensor
                       , {Slice::AB, Slice::BC, Slice::AC}
                       , {No, No} // size of the slices
                       , {Nv, Nv} // size of the parametrization
                       , np
                       , child_world
                       , global_world
                       , Slice::TABIJ
                       , 6) {
           init(sourceTensor);
         }

    void sliceIntoBuffer(size_t it, Tensor &to, Tensor const& from) override {
      // TODO: maybe generalize this with ABHH

      const int Nv = from.lens[0]
              , No = sliceLength[1]
              , rank = Atrip::rank
              , el = rankMap.find({static_cast<size_t>(rank), it})
              , a = el % Nv
              , b = el / Nv
              ;

      sliceIntoVector( sources[it]
                     , to,   {0, 0},       {No, No}
                     , from, {a, b, 0, 0}, {a+1, b+1, No, No}
                     );


    }

  };

}
#+end_src


** Equations
#+begin_src c++ :tangle (atrip-equations-h)
#pragma once

#include<atrip/Slice.hpp>
#include<atrip/Blas.hpp>

namespace atrip {

  double getEnergyDistinct
    ( const double epsabc
    , std::vector<double> const& epsi
    , std::vector<double> const& Tijk_
    , std::vector<double> const& Zijk_
    ) {
    constexpr size_t blockSize=16;
    double energy(0.);
    const size_t No = epsi.size();
    for (size_t kk=0; kk<No; kk+=blockSize){
      const size_t kend( std::min(No, kk+blockSize) );
      for (size_t jj(kk); jj<No; jj+=blockSize){
        const size_t jend( std::min( No, jj+blockSize) );
        for (size_t ii(jj); ii<No; ii+=blockSize){
          const size_t iend( std::min( No, ii+blockSize) );
          for (size_t k(kk); k < kend; k++){
            const double ek(epsi[k]);
            const size_t jstart = jj > k ? kk : k;
            for (size_t j(jstart); j < jend; j++){
              const double ej(epsi[j]);
              double facjk( j == k ? 0.5 : 1.0);
              size_t istart = ii > j ? ii : j;
              for (size_t i(istart); i < iend; i++){
                const double ei(epsi[i]);
                double facij ( i==j ? 0.5 : 1.0);
                double denominator(epsabc - ei - ej - ek);
                double U(Zijk_[i + No*j + No*No*k]);
                double V(Zijk_[i + No*k + No*No*j]);
                double W(Zijk_[j + No*i + No*No*k]);
                double X(Zijk_[j + No*k + No*No*i]);
                double Y(Zijk_[k + No*i + No*No*j]);
                double Z(Zijk_[k + No*j + No*No*i]);

                double A(Tijk_[i + No*j + No*No*k]);
                double B(Tijk_[i + No*k + No*No*j]);
                double C(Tijk_[j + No*i + No*No*k]);
                double D(Tijk_[j + No*k + No*No*i]);
                double E(Tijk_[k + No*i + No*No*j]);
                double F(Tijk_[k + No*j + No*No*i]);
                double value(3.0*(A*U+B*V+C*W+D*X+E*Y+F*Z)
                            +((U+X+Y)-2.0*(V+W+Z))*(A+D+E)
                            +((V+W+Z)-2.0*(U+X+Y))*(B+C+F));
                energy += 2.0*value / denominator * facjk * facij;
              } // i
            } // j
          } // k
        } // ii
      } // jj
    } // kk
    return energy;
  }


  double getEnergySame
    ( const double epsabc
    , std::vector<double> const& epsi
    , std::vector<double> const& Tijk_
    , std::vector<double> const& Zijk_
    ) {
    constexpr size_t blockSize = 16;
    const size_t No = epsi.size();
    double energy(0.);
    for (size_t kk=0; kk<No; kk+=blockSize){
      const size_t kend( std::min( kk+blockSize, No) );
      for (size_t jj(kk); jj<No; jj+=blockSize){
        const size_t jend( std::min( jj+blockSize, No) );
        for (size_t ii(jj); ii<No; ii+=blockSize){
          const size_t iend( std::min( ii+blockSize, No) );
          for (size_t k(kk); k < kend; k++){
            const double ek(epsi[k]);
            const size_t jstart = jj > k ? jj : k;
            for(size_t j(jstart); j < jend; j++){
              const double facjk( j == k ? 0.5 : 1.0);
              const double ej(epsi[j]);
              const size_t istart = ii > j ? ii : j;
              for(size_t i(istart); i < iend; i++){
                double ei(epsi[i]);
                double facij ( i==j ? 0.5 : 1.0);
                double denominator(epsabc - ei - ej - ek);
                double U(Zijk_[i + No*j + No*No*k]);
                double V(Zijk_[j + No*k + No*No*i]);
                double W(Zijk_[k + No*i + No*No*j]);
                double A(Tijk_[i + No*j + No*No*k]);
                double B(Tijk_[j + No*k + No*No*i]);
                double C(Tijk_[k + No*i + No*No*j]);
                double value(3.0*( A*U + B*V + C*W) - (A+B+C)*(U+V+W));
                energy += 2.0*value / denominator * facjk * facij;
              } // i
            } // j
          } // k
        } // ii
      } // jj
    } // kk
    return energy;
  }

  void singlesContribution
    ( size_t No
    , size_t Nv
    , const ABCTuple &abc
    , double const* Tph
    , double const* VABij
    , double const* VACij
    , double const* VBCij
    , double *Zijk
    ) {
    const size_t a(abc[0]), b(abc[1]), c(abc[2]);
    for (size_t k=0; k < No; k++)
    for (size_t i=0; i < No; i++)
    for (size_t j=0; j < No; j++) {
      const size_t ijk = i + j*No + k*No*No
                ,  jk = j + No * k
                ;
      Zijk[ijk] += Tph[ a + i * Nv ] * VBCij[ j + k * No ];
      Zijk[ijk] += Tph[ b + j * Nv ] * VACij[ i + k * No ];
      Zijk[ijk] += Tph[ c + k * Nv ] * VABij[ i + j * No ];
    }
  }

  void doublesContribution
    ( const ABCTuple &abc
    , size_t const No
    , size_t const Nv
    // -- VABCI
    , double const* VABph
    , double const* VACph
    , double const* VBCph
    , double const* VBAph
    , double const* VCAph
    , double const* VCBph
    // -- VHHHA
    , double const* VhhhA
    , double const* VhhhB
    , double const* VhhhC
    // -- TA
    , double const* TAphh
    , double const* TBphh
    , double const* TCphh
    // -- TABIJ
    , double const* TABhh
    , double const* TAChh
    , double const* TBChh
    // -- TIJK
    , double *Tijk
    , atrip::Timings& chrono
    ) {

    auto& t_reorder = chrono["doubles:reorder"];
    const size_t a = abc[0], b = abc[1], c = abc[2]
              , NoNo = No*No, NoNv = No*Nv
              ;

  #if defined(ATRIP_USE_DGEMM)
  #define _IJK_(i, j, k) i + j*No + k*NoNo
  #define REORDER(__II, __JJ, __KK)                                 \
    t_reorder.start();                                              \
    for (size_t k = 0; k < No; k++)                                 \
    for (size_t j = 0; j < No; j++)                                 \
    for (size_t i = 0; i < No; i++) {                               \
      Tijk[_IJK_(i, j, k)] += _t_buffer[_IJK_(__II, __JJ, __KK)];   \
    }                                                               \
    t_reorder.stop();
  #define DGEMM_PARTICLES(__A, __B)    \
    atrip::dgemm_( "T"                 \
                , "N"                 \
                , (int const*)&NoNo   \
                , (int const*)&No     \
                , (int const*)&Nv     \
                , &one                \
                , __A                 \
                , (int const*)&Nv     \
                , __B                 \
                , (int const*)&Nv     \
                , &zero               \
                , _t_buffer.data()    \
                , (int const*)&NoNo   \
                );
  #define DGEMM_HOLES(__A, __B, __TRANSB)  \
    atrip::dgemm_( "N"                     \
                , __TRANSB                \
                , (int const*)&NoNo       \
                , (int const*)&No         \
                , (int const*)&No         \
                , &m_one                  \
                , __A                     \
                , (int const*)&NoNo       \
                , __B                     \
                , (int const*)&No         \
                , &zero                   \
                , _t_buffer.data()        \
                , (int const*)&NoNo       \
                );

    using F = double;
    const size_t NoNoNo = No*NoNo;
    std::vector<double> _t_buffer;
    _t_buffer.reserve(NoNoNo);
    F one{1.0}, m_one{-1.0}, zero{0.0};

    t_reorder.start();
    for (size_t k = 0; k < NoNoNo; k++) {
      // zero the Tijk
      Tijk[k] = 0.0;
    }
    t_reorder.stop();

    chrono["doubles:holes"].start();
    { // Holes part ============================================================
      // VhhhC[i + k*No + L*NoNo] * TABhh[L + j*No]; H1
      chrono["doubles:holes:1"].start();
      DGEMM_HOLES(VhhhC, TABhh, "N")
      REORDER(i, k, j)
      chrono["doubles:holes:1"].stop();
      // VhhhC[j + k*No + L*NoNo] * TABhh[i + L*No]; H0
      chrono["doubles:holes:2"].start();
      DGEMM_HOLES(VhhhC, TABhh, "T")
      REORDER(j, k, i)
      chrono["doubles:holes:2"].stop();
      // VhhhB[i + j*No + L*NoNo] * TAChh[L + k*No]; H5
      chrono["doubles:holes:3"].start();
      DGEMM_HOLES(VhhhB, TAChh, "N")
      REORDER(i, j, k)
      chrono["doubles:holes:3"].stop();
      // VhhhB[k + j*No + L*NoNo] * TAChh[i + L*No]; H3
      chrono["doubles:holes:4"].start();
      DGEMM_HOLES(VhhhB, TAChh, "T")
      REORDER(k, j, i)
      chrono["doubles:holes:4"].stop();
      // VhhhA[j + i*No + L*NoNo] * TBChh[L + k*No]; H1
      chrono["doubles:holes:5"].start();
      DGEMM_HOLES(VhhhA, TBChh, "N")
      REORDER(j, i, k)
      chrono["doubles:holes:5"].stop();
      // VhhhA[k + i*No + L*NoNo] * TBChh[j + L*No]; H4
      chrono["doubles:holes:6"].start();
      DGEMM_HOLES(VhhhA, TBChh, "T")
      REORDER(k, i, j)
      chrono["doubles:holes:6"].stop();
    }
    chrono["doubles:holes"].stop();

    chrono["doubles:particles"].start();
    { // Particle part =========================================================
      // TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv]; P0
      chrono["doubles:particles:1"].start();
      DGEMM_PARTICLES(TAphh, VBCph)
      REORDER(i, j, k)
      chrono["doubles:particles:1"].stop();
      // TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv]; P3
      chrono["doubles:particles:2"].start();
      DGEMM_PARTICLES(TAphh, VCBph)
      REORDER(i, k, j)
      chrono["doubles:particles:2"].stop();
      // TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv]; P5
      chrono["doubles:particles:3"].start();
      DGEMM_PARTICLES(TCphh, VABph)
      REORDER(k, i, j)
      chrono["doubles:particles:3"].stop();
      // TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv]; P2
      chrono["doubles:particles:4"].start();
      DGEMM_PARTICLES(TCphh, VBAph)
      REORDER(k, j, i)
      chrono["doubles:particles:4"].stop();
      // TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv]; P1
      chrono["doubles:particles:5"].start();
      DGEMM_PARTICLES(TBphh, VACph)
      REORDER(j, i, k)
      chrono["doubles:particles:5"].stop();
      // TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv]; P4
      chrono["doubles:particles:6"].start();
      DGEMM_PARTICLES(TBphh, VCAph)
      REORDER(j, k, i)
      chrono["doubles:particles:6"].stop();
    }
    chrono["doubles:particles"].stop();

  #undef REORDER
  #undef DGEMM_HOLES
  #undef DGEMM_PARTICLES
  #undef _IJK_
  #else
    for (size_t k = 0; k < No; k++)
    for (size_t j = 0; j < No; j++)
    for (size_t i = 0; i < No; i++){
      const size_t ijk = i + j*No + k*NoNo
                ,  jk = j + k*No
                ;
      Tijk[ijk] = 0.0; // :important
      // HOLE DIAGRAMS: TABHH and VHHHA
      for (size_t L = 0; L < No; L++){
        // t[abLj] * V[Lcik]        H1
        // t[baLi] * V[Lcjk]        H0      TODO: conjugate T for complex
        Tijk[ijk] -= TABhh[L + j*No] * VhhhC[i + k*No + L*NoNo];
        Tijk[ijk] -= TABhh[i + L*No] * VhhhC[j + k*No + L*NoNo];

        // t[acLk] * V[Lbij]        H5
        // t[caLi] * V[Lbkj]        H3
        Tijk[ijk] -= TAChh[L + k*No] * VhhhB[i + j*No + L*NoNo];
        Tijk[ijk] -= TAChh[i + L*No] * VhhhB[k + j*No + L*NoNo];

        // t[bcLk] * V[Laji]        H2
        // t[cbLj] * V[Laki]        H4
        Tijk[ijk] -= TBChh[L + k*No] * VhhhA[j + i*No + L*NoNo];
        Tijk[ijk] -= TBChh[j + L*No] * VhhhA[k + i*No + L*NoNo];
      }
      // PARTILCE DIAGRAMS: TAPHH and VABPH
      for (size_t E = 0; E < Nv; E++) {
        // t[aEij] * V[bcEk]        P0
        // t[aEik] * V[cbEj]        P3 // TODO: CHECK THIS ONE, I DONT KNOW
        Tijk[ijk] += TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv];
        Tijk[ijk] += TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv];

        // t[cEki] * V[abEj]        P5
        // t[cEkj] * V[baEi]        P2
        Tijk[ijk] += TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv];
        Tijk[ijk] += TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv];

        // t[bEji] * V[acEk]        P1
        // t[bEjk] * V[caEi]        P4
        Tijk[ijk] += TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv];
        Tijk[ijk] += TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv];
      }

    }
  #endif
  }

}
#+end_src

** Blas
The main matrix-matrix multiplication method used in this algorithm
is mainly using the =DGEMM= function, which we declare as
=extern= since it should be known only at link-time.
#+begin_src c++ :tangle (atrip-blas-h)
#pragma once
namespace atrip {
  extern "C" {
    void dgemm_(
      const char *transa,
      const char *transb,
      const int *m,
      const int *n,
      const int *k,
      double *alpha,
      const double *A,
      const int *lda,
      const double *B,
      const int *ldb,
      double *beta,
      double *C,
      const int *ldc
    );
  }
}
#+end_src

** Atrip
#+begin_src c++ :tangle (atrip-atrip-h)
#pragma once
#include <sstream>
#include <string>
#include <map>
#include <chrono>

#include <ctf.hpp>

namespace atrip {

  struct Atrip {

    static int rank;
    static int np;
    static void init();

    struct Input {
      CTF::Tensor<double> *ei = nullptr
                        , *ea = nullptr
                        , *Tph = nullptr
                        , *Tpphh = nullptr
                        , *Vpphh = nullptr
                        , *Vhhhp = nullptr
                        , *Vppph = nullptr
                        ;
      int maxIterations = 0, iterationMod = -1;
      bool barrier = true;
      Input& with_epsilon_i(CTF::Tensor<double> * t) { ei = t; return *this; }
      Input& with_epsilon_a(CTF::Tensor<double> * t) { ea = t; return *this; }
      Input& with_Tai(CTF::Tensor<double> * t) { Tph = t; return *this; }
      Input& with_Tabij(CTF::Tensor<double> * t) { Tpphh = t; return *this; }
      Input& with_Vabij(CTF::Tensor<double> * t) { Vpphh = t; return *this; }
      Input& with_Vijka(CTF::Tensor<double> * t) { Vhhhp = t; return *this; }
      Input& with_Vabci(CTF::Tensor<double> * t) { Vppph = t; return *this; }
      Input& with_maxIterations(int i) { maxIterations = i; return *this; }
      Input& with_iterationMod(int i) { iterationMod = i; return *this; }
      Input& with_barrier(bool i) { barrier = i; return *this; }
    };

    struct Output {
      double energy;
    };
    static Output run(Input const& in);
  };

}
#+end_src

*** Main

#+begin_src c++ :tangle (atrip-atrip-cxx)
#include <iomanip>

#include <atrip/Atrip.hpp>
#include <atrip/Utils.hpp>
#include <atrip/Equations.hpp>
#include <atrip/SliceUnion.hpp>
#include <atrip/Unions.hpp>

using namespace atrip;

int Atrip::rank;
int Atrip::np;

void Atrip::init()  {
  MPI_Comm_rank(MPI_COMM_WORLD, &Atrip::rank);
  MPI_Comm_size(MPI_COMM_WORLD, &Atrip::np);
}

Atrip::Output Atrip::run(Atrip::Input const& in) {

  const int np = Atrip::np;
  const int rank = Atrip::rank;
  MPI_Comm universe = in.ei->wrld->comm;

  // Timings in seconds ================================================{{{1
  Timings chrono{};

  const size_t No = in.ei->lens[0];
  const size_t Nv = in.ea->lens[0];
  LOG(0,"Atrip") << "No: " << No << "\n";
  LOG(0,"Atrip") << "Nv: " << Nv << "\n";

  // allocate the three scratches, see piecuch
  std::vector<double> Tijk(No*No*No) // doubles only (see piecuch)
                    , Zijk(No*No*No) // singles + doubles (see piecuch)
                    // we need local copies of the following tensors on every
                    // rank
                    , epsi(No)
                    , epsa(Nv)
                    , Tai(No * Nv)
                    ;

  in.ei->read_all(epsi.data());
  in.ea->read_all(epsa.data());
  in.Tph->read_all(Tai.data());

  // COMMUNICATOR CONSTRUCTION ========================================={{{1
  //
  // Construct a new communicator living only on a single rank
  int child_size = 1
    , child_rank
    ;
  const
  int color = rank / child_size
    , crank = rank % child_size
    ;
  MPI_Comm child_comm;
  if (np == 1) {
    child_comm = universe;
  } else {
    MPI_Comm_split(universe, color, crank, &child_comm);
    MPI_Comm_rank(child_comm, &child_rank);
    MPI_Comm_size(child_comm, &child_size);
  }


  chrono["nv-slices"].start();
  // BUILD SLICES PARAMETRIZED BY NV ==================================={{{1
  LOG(0,"Atrip") << "BUILD NV-SLICES\n";
  TAPHH taphh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
  HHHA  hhha(*in.Vhhhp, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
  chrono["nv-slices"].stop();

  chrono["nv-nv-slices"].start();
  // BUILD SLICES PARAMETRIZED BY NV x NV =============================={{{1
  LOG(0,"Atrip") << "BUILD NV x NV-SLICES\n";
  ABPH abph(*in.Vppph, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
  ABHH abhh(*in.Vpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
  TABHH tabhh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
  chrono["nv-nv-slices"].stop();

  // all tensors
  std::vector< SliceUnion* > unions = {&taphh, &hhha, &abph, &abhh, &tabhh};

  //CONSTRUCT TUPLE LIST ==============================================={{{1
  LOG(0,"Atrip") << "BUILD TUPLE LIST\n";
  const auto tuplesList = std::move(getTuplesList(Nv));
  WITH_RANK << "tupList.size() = " << tuplesList.size() << "\n";

  // GET ABC INDEX RANGE FOR RANK ======================================{{{1
  auto abcIndex = getABCRange(np, rank, tuplesList);
  size_t nIterations = abcIndex.second - abcIndex.first;

#ifdef ATRIP_BENCHMARK
  { const size_t maxIterations = in.maxIterations;
    if (maxIterations != 0) {
      abcIndex.second = abcIndex.first + maxIterations % (nIterations + 1);
      nIterations = maxIterations % (nIterations + 1);
    }
  }
#endif

  WITH_RANK << "abcIndex = " << pretty_print(abcIndex) << "\n";
  LOG(0,"Atrip") << "#iterations: "
                       << nIterations << "\n";

  // first abc
  const ABCTuple firstAbc = tuplesList[abcIndex.first];


  double energy(0.);


  auto const isFakeTuple
    = [&tuplesList](size_t const i) { return i >= tuplesList.size(); };


  auto communicateDatabase
    = [ &unions
      , np
      , &chrono
      ] (ABCTuple const& abc, MPI_Comm const& c) -> Slice::Database {

        chrono["db:comm:type:do"].start();
        auto MPI_LDB_ELEMENT = Slice::mpi::localDatabaseElement();
        chrono["db:comm:type:do"].stop();

        chrono["db:comm:ldb"].start();
        Slice::LocalDatabase ldb;

        for (auto const& tensor: unions) {
          auto const& tensorDb = tensor->buildLocalDatabase(abc);
          ldb.insert(ldb.end(), tensorDb.begin(), tensorDb.end());
        }
        chrono["db:comm:ldb"].stop();

        Slice::Database db(np * ldb.size(), ldb[0]);

        chrono["oneshot-db:comm:allgather"].start();
        chrono["db:comm:allgather"].start();
        MPI_Allgather( ldb.data()
                     , ldb.size()
                     , MPI_LDB_ELEMENT
                     , db.data()
                     , ldb.size()
                     , MPI_LDB_ELEMENT
                     , c);
        chrono["db:comm:allgather"].stop();
        chrono["oneshot-db:comm:allgather"].stop();

        chrono["db:comm:type:free"].start();
        MPI_Type_free(&MPI_LDB_ELEMENT);
        chrono["db:comm:type:free"].stop();

        return db;
      };

  auto doIOPhase
    = [&unions, &rank, &np, &universe, &chrono] (Slice::Database const& db) {

    const size_t localDBLength = db.size() / np;

    size_t sendTag = 0
         , recvTag = rank * localDBLength
         ;

    // RECIEVE PHASE ======================================================
    {
      // At this point, we have already send to everyone that fits
      auto const& begin = &db[rank * localDBLength]
                , end   = begin + localDBLength
                ;
      for (auto it = begin; it != end; ++it) {
        recvTag++;
        auto const& el = *it;
        auto& u = unionByName(unions, el.name);

        WITH_DBG std::cout
          << rank << ":r"
          << "♯" << recvTag << " =>"
          << " «n" << el.name
          << ", t" << el.info.type
          << ", s" << el.info.state
          << "»"
          << " ⊙ {" << rank << "⇐" << el.info.from.rank
                    << ", "
                    << el.info.from.source << "}"
          << " ∴ {" << el.info.tuple[0]
                    << ", "
                    << el.info.tuple[1]
                    << "}"
          << "\n"
          ;

        chrono["db:io:recv"].start();
        u.receive(el.info, recvTag);
        chrono["db:io:recv"].stop();

      } // recv
    }

    // SEND PHASE =========================================================
    for (size_t otherRank = 0; otherRank<np; otherRank++) {
      auto const& begin = &db[otherRank * localDBLength]
                , end = begin + localDBLength
                ;
      for (auto it = begin; it != end; ++it) {
        sendTag++;
        Slice::LocalDatabaseElement const& el = *it;

        if (el.info.from.rank != rank) continue;

        auto& u = unionByName(unions, el.name);
        WITH_DBG std::cout
          << rank << ":s"
          << "♯" << sendTag << " =>"
          << " «n" << el.name
          << ", t" << el.info.type
          << ", s" << el.info.state
          << "»"
          << " ⊙ {" << el.info.from.rank << "⇒" << otherRank
                    << ", "
                    << el.info.from.source << "}"
          << " ∴ {" << el.info.tuple[0]
                    << ", "
                    << el.info.tuple[1]
                    << "}"
          << "\n"
          ;

        chrono["db:io:send"].start();
        u.send(otherRank, el.info, sendTag);
        chrono["db:io:send"].stop();

      } // send phase

    } // otherRank


  };

#if defined(HAVE_OCD) || defined(ATRIP_PRINT_TUPLES)
  std::map<ABCTuple, double> tupleEnergies;
#endif

  const double doublesFlops
    = double(No)
    ,* double(No)
    ,* double(No)
    ,* (double(No) + double(Nv))
    ,* 2
    ,* 6
    / 1e9
    ;

  // START MAIN LOOP ======================================================{{{1

  Slice::Database db;

  for ( size_t i = abcIndex.first, iteration = 1
      ; i < abcIndex.second
      ; i++, iteration++
      ) {
    chrono["iterations"].start();

    // check overhead from chrono over all iterations
    chrono["start:stop"].start(); chrono["start:stop"].stop();

    // check overhead of doing a barrier at the beginning
    chrono["oneshot-mpi:barrier"].start();
    chrono["mpi:barrier"].start();
    // TODO: REMOVE
    if (in.barrier == 1)
    MPI_Barrier(universe);
    chrono["mpi:barrier"].stop();
    chrono["oneshot-mpi:barrier"].stop();

    if (iteration % in.iterationMod == 0) {
      LOG(0,"Atrip")
        << "iteration " << iteration
        << " [" << 100 * iteration / nIterations << "%]"
        << " (" << doublesFlops * iteration / chrono["doubles"].count()
        << "GF)"
        << " (" << doublesFlops * iteration / chrono["iterations"].count()
        << "GF)"
        << " ===========================\n";

      // PRINT TIMINGS
      for (auto const& pair: chrono)
        LOG(1, " ") << pair.first << " :: "
                    << pair.second.count()
                    << std::endl;

    }

    const ABCTuple abc = isFakeTuple(i)
                       ? tuplesList[tuplesList.size() - 1]
                       : tuplesList[i]
                 , *abcNext = i == (abcIndex.second - 1)
                            ? nullptr
                            : isFakeTuple(i + 1)
                            ? &tuplesList[tuplesList.size() - 1]
                            : &tuplesList[i + 1]
                 ;

    chrono["with_rank"].start();
    WITH_RANK << " :it " << iteration
              << " :abc " << pretty_print(abc)
              << " :abcN "
              << (abcNext ? pretty_print(*abcNext) : "None")
              << "\n";
    chrono["with_rank"].stop();


    // COMM FIRST DATABASE ================================================{{{1
    if (i == abcIndex.first) {
      WITH_RANK << "__first__:first database ............ \n";
      const auto __db = communicateDatabase(abc, universe);
      WITH_RANK << "__first__:first database communicated \n";
      WITH_RANK << "__first__:first database io phase \n";
      doIOPhase(__db);
      WITH_RANK << "__first__:first database io phase DONE\n";
      WITH_RANK << "__first__::::Unwrapping all slices for first database\n";
      for (auto& u: unions) u->unwrapAll(abc);
      WITH_RANK << "__first__::::Unwrapping all slices for first database DONE\n";
      MPI_Barrier(universe);
    }

    // COMM NEXT DATABASE ================================================={{{1
    if (abcNext) {
      WITH_RANK << "__comm__:" << iteration << "th communicating database\n";
      chrono["db:comm"].start();
      //const auto db = communicateDatabase(*abcNext, universe);
      db = communicateDatabase(*abcNext, universe);
      chrono["db:comm"].stop();
      chrono["db:io"].start();
      doIOPhase(db);
      chrono["db:io"].stop();
      WITH_RANK << "__comm__:" <<  iteration << "th database io phase DONE\n";
    }

    // COMPUTE DOUBLES ===================================================={{{1
    OCD_Barrier(universe);
    if (!isFakeTuple(i)) {
      WITH_RANK << iteration << "-th doubles\n";
      WITH_CHRONO(chrono["oneshot-unwrap"],
      WITH_CHRONO(chrono["unwrap"],
      WITH_CHRONO(chrono["unwrap:doubles"],
        for (auto& u: decltype(unions){&abph, &hhha, &taphh, &tabhh}) {
          u->unwrapAll(abc);
        }
      )))
      chrono["oneshot-doubles"].start();
      chrono["doubles"].start();
      doublesContribution( abc, (size_t)No, (size_t)Nv
                         // -- VABCI
                         , abph.unwrapSlice(Slice::AB, abc)
                         , abph.unwrapSlice(Slice::AC, abc)
                         , abph.unwrapSlice(Slice::BC, abc)
                         , abph.unwrapSlice(Slice::BA, abc)
                         , abph.unwrapSlice(Slice::CA, abc)
                         , abph.unwrapSlice(Slice::CB, abc)
                         // -- VHHHA
                         , hhha.unwrapSlice(Slice::A, abc)
                         , hhha.unwrapSlice(Slice::B, abc)
                         , hhha.unwrapSlice(Slice::C, abc)
                         // -- TA
                         , taphh.unwrapSlice(Slice::A, abc)
                         , taphh.unwrapSlice(Slice::B, abc)
                         , taphh.unwrapSlice(Slice::C, abc)
                         // -- TABIJ
                         , tabhh.unwrapSlice(Slice::AB, abc)
                         , tabhh.unwrapSlice(Slice::AC, abc)
                         , tabhh.unwrapSlice(Slice::BC, abc)
                         // -- TIJK
                         , Tijk.data()
                         , chrono
                         );
      WITH_RANK << iteration << "-th doubles done\n";
      chrono["doubles"].stop();
      chrono["oneshot-doubles"].stop();
    }

    // COMPUTE SINGLES =================================================== {{{1
    OCD_Barrier(universe);
    if (!isFakeTuple(i)) {
      WITH_CHRONO(chrono["oneshot-unwrap"],
      WITH_CHRONO(chrono["unwrap"],
      WITH_CHRONO(chrono["unwrap:singles"],
        abhh.unwrapAll(abc);
      )))
      chrono["reorder"].start();
      for (size_t I(0); I < Zijk.size(); I++) Zijk[I] = Tijk[I];
      chrono["reorder"].stop();
      chrono["singles"].start();
      singlesContribution( No, Nv, abc
                         , Tai.data()
                         , abhh.unwrapSlice(Slice::AB, abc)
                         , abhh.unwrapSlice(Slice::AC, abc)
                         , abhh.unwrapSlice(Slice::BC, abc)
                         , Zijk.data());
      chrono["singles"].stop();
    }


    // COMPUTE ENERGY ==================================================== {{{1
    if (!isFakeTuple(i)) {
      double tupleEnergy(0.);

      int distinct(0);
      if (abc[0] == abc[1]) distinct++;
      if (abc[1] == abc[2]) distinct--;
      const double epsabc(epsa[abc[0]] + epsa[abc[1]] + epsa[abc[2]]);

      chrono["energy"].start();
      if ( distinct == 0)
        tupleEnergy = getEnergyDistinct(epsabc, epsi, Tijk, Zijk);
      else
        tupleEnergy = getEnergySame(epsabc, epsi, Tijk, Zijk);
      chrono["energy"].stop();

#if defined(HAVE_OCD) || defined(ATRIP_PRINT_TUPLES)
      tupleEnergies[abc] = tupleEnergy;
#endif

      energy += tupleEnergy;

#ifdef HAVE_OCD
      auto const print_slices
        = [](ABCTuple const& abc, ABCTuple const& want, SliceUnion& u) {
          if (abc != want) return;

          for (auto type: u.sliceTypes) {
            auto const& ptr = u.unwrapSlice(type, abc);
            auto const& slice = Slice::findByTypeAbc(u.slices, type, abc);
            WITH_RANK << "__print_slice__:n" << u.name << " "
                      << pretty_print(abc) << " "
                      << pretty_print(slice.info)
                      ;
            for (size_t i = 0; i < 20; i++) std::cout << ptr[i] << ", ";
            std::cout << std::endl;
          }
        };
#endif

    if (isFakeTuple(i)) {
      // fake iterations should also unwrap whatever they got
      WITH_RANK << iteration
                << "th unwrapping because of fake in "
                << i << "\n";
      for (auto& u: unions) u->unwrapAll(abc);
    }

#ifdef HAVE_OCD
    for (auto const& u: unions) {
      WITH_RANK << "__dups__:"
                << iteration
                << "-th n" << u->name << " checking duplicates\n";
      u->checkForDuplicates();
    }
#endif


    // CLEANUP UNIONS ===================================================={{{1
    OCD_Barrier(universe);
    if (abcNext) {
      chrono["gc"].start();
      WITH_RANK << "__gc__:" << iteration << "-th cleaning up.......\n";
      for (auto& u: unions) {

        u->unwrapAll(abc);
        WITH_RANK << "__gc__:n" << u->name  << " :it " << iteration
                  << " :abc " << pretty_print(abc)
                  << " :abcN " << pretty_print(*abcNext)
                  << "\n";
        for (auto const& slice: u->slices)
          WITH_RANK << "__gc__:guts:" << slice.info << "\n";
        u->clearUnusedSlicesForNext(*abcNext);

        WITH_RANK << "__gc__: checking validity\n";

#ifdef HAVE_OCD
        // check for validity of the slices
        for (auto type: u->sliceTypes) {
          auto tuple = Slice::subtupleBySlice(abc, type);
        for (auto& slice: u->slices) {
          if ( slice.info.type == type
             && slice.info.tuple == tuple
             && slice.isDirectlyFetchable()
             ) {
            if (slice.info.state == Slice::Dispatched)
              throw std::domain_error( "This slice should not be undispatched! "
                                     + pretty_print(slice.info));
          }
        }
        }
#endif


      }
      chrono["gc"].stop();
    }

      WITH_RANK << iteration << "-th cleaning up....... DONE\n";
    }

    // CLEAN CHRONO ======================================================{{{1
    chrono["iterations"].stop();
    { // TODO: REMOVEME
      chrono["oneshot-doubles"].clear();
      chrono["oneshot-mpi:barrier"].clear();
      chrono["oneshot-db:comm:allgather"].clear();
      chrono["oneshot-unwrap"].clear();
    }

    // ITERATION END ====================================================={{{1
  }  // END OF MAIN LOOP

  MPI_Barrier(universe);

  // PRINT TUPLES ========================================================={{{1
#if defined(HAVE_OCD) || defined(ATRIP_PRINT_TUPLES)
  LOG(0,"Atrip") << "tuple energies" << "\n";
  for (size_t i = 0; i < np; i++) {
    MPI_Barrier(universe);
    for (auto const& pair: tupleEnergies) {
      if (i == rank)
        std::cout << pair.first[0]
                  << " " << pair.first[1]
                  << " " << pair.first[2]
                  << std::setprecision(15) << std::setw(23)
                  << " tupleEnergy: " << pair.second
                  << "\n"
                  ;
    }
  }
#endif

  // COMMUNICATE THE ENERGIES ============================================={{{1
  LOG(0,"Atrip") << "COMMUNICATING ENERGIES \n";
  double globalEnergy = 0;
  MPI_Reduce(&energy, &globalEnergy, 1, MPI_DOUBLE, MPI_SUM, 0, universe);

  WITH_RANK << "local energy " << energy << "\n";
  LOG(0,"LOOP FINISHED, energy")
    << std::setprecision(15) << std::setw(23)
    << globalEnergy << std::endl;

  // PRINT TIMINGS {{{1
  for (auto const& pair: chrono)
    LOG(0,"atrip:chrono") << pair.first << " "
                          << pair.second.count() << std::endl;


  LOG(0, "atrip:flops")
    << nIterations * doublesFlops / chrono["doubles"].count() << "\n";

  return { energy };

}
#+end_src


** Debug
#+begin_src c++ :tangle (atrip-debug-h)
#pragma once
#define ATRIP_BENCHMARK
//#define ATRIP_DONT_SLICE
#define ATRIP_DEBUG 1
//#define ATRIP_WORKLOAD_DUMP
#define ATRIP_USE_DGEMM
//#define ATRIP_PRINT_TUPLES

#define LOG(level, name) if (Atrip::rank == 0) std::cout << name << ": "

#if ATRIP_DEBUG == 4
#  pragma message("WARNING: You have OCD debugging ABC triples "\
                  "expect GB of output and consult your therapist")
#  include <dbg.h>
#  define HAVE_OCD
#  define OCD_Barrier(com) MPI_Barrier(com)
#  define WITH_OCD
#  define WITH_ROOT if (atrip::Atrip::rank == 0)
#  define WITH_SPECIAL(r) if (atrip::Atrip::rank == r)
#  define WITH_RANK std::cout << atrip::Atrip::rank << ": "
#  define WITH_CRAZY_DEBUG
#  define WITH_DBG
#  define DBG(...) dbg(__VA_ARGS__)
#elif ATRIP_DEBUG == 3
#  pragma message("WARNING: You have crazy debugging ABC triples,"\
                  " expect GB of output")
#  include <dbg.h>
#  define OCD_Barrier(com)
#  define WITH_OCD if (false)
#  define WITH_ROOT if (atrip::Atrip::rank == 0)
#  define WITH_SPECIAL(r) if (atrip::Atrip::rank == r)
#  define WITH_RANK std::cout << atrip::Atrip::rank << ": "
#  define WITH_CRAZY_DEBUG
#  define WITH_DBG
#  define DBG(...) dbg(__VA_ARGS__)
#elif ATRIP_DEBUG == 2
#  pragma message("WARNING: You have some debugging info for ABC triples")
#  include <dbg.h>
#  define OCD_Barrier(com)
#  define WITH_OCD if (false)
#  define WITH_ROOT if (atrip::Atrip::rank == 0)
#  define WITH_SPECIAL(r) if (atrip::Atrip::rank == r)
#  define WITH_RANK std::cout << atrip::Atrip::rank << ": "
#  define WITH_CRAZY_DEBUG if (false)
#  define WITH_DBG
#  define DBG(...) dbg(__VA_ARGS__)
#elif ATRIP_DEBUG == 1
#  define OCD_Barrier(com)
#  define WITH_OCD if (false)
#  define WITH_ROOT if (false)
#  define WITH_SPECIAL(r) if (false)
#  define WITH_RANK if (false) std::cout << atrip::Atrip::rank << ": "
#  define WITH_DBG if (false)
#  define WITH_CRAZY_DEBUG if (false)
#  define DBG(...)
#else
#  error("ATRIP_DEBUG is not defined!")
#endif
#+end_src

** Include header

#+begin_src c++ :tangle (atrip-main-h)
#pragma once

#include <atrip/Atrip.hpp>

#+end_src