Add tangled sources

2022-02-18 12:54:59 +01:00
parent 3dc38a43b5
commit bbbfb30c6f
10 changed files with 1298 additions and 727 deletions
--- a/include/atrip/Atrip.hpp
+++ b/include/atrip/Atrip.hpp
@@ -7,12 +7,22 @@
 #include <ctf.hpp>
 #include <atrip/Utils.hpp>
 #define ADD_ATTRIBUTE(_type, _name, _default)   \
  _type _name = _default;                       \
  Input& with_ ## _name(_type i) {              \
    _name = i;                                  \
    return *this;                               \
  }
 namespace atrip {
  struct Atrip {
    static int rank;
    static int np;
    static Timings chrono;
    static void init();
    template <typename F=double>
@@ -25,9 +35,6 @@ namespace atrip {
                        , *Vhhhp = nullptr
                        , *Vppph = nullptr
                        ;
      int maxIterations = 0, iterationMod = -1, percentageMod = -1;
      bool barrier = false;
      bool chrono = false;
      Input& with_epsilon_i(CTF::Tensor<F> * t) { ei = t; return *this; }
      Input& with_epsilon_a(CTF::Tensor<F> * t) { ea = t; return *this; }
      Input& with_Tai(CTF::Tensor<F> * t) { Tph = t; return *this; }
@@ -35,11 +42,20 @@ namespace atrip {
      Input& with_Vabij(CTF::Tensor<F> * t) { Vpphh = t; return *this; }
      Input& with_Vijka(CTF::Tensor<F> * t) { Vhhhp = t; return *this; }
      Input& with_Vabci(CTF::Tensor<F> * t) { Vppph = t; return *this; }
-      Input& with_maxIterations(int i) { maxIterations = i; return *this; }
+
-      Input& with_iterationMod(int i) { iterationMod = i; return *this; }
+      enum TuplesDistribution {
-      Input& with_percentageMod(int i) { percentageMod = i; return *this; }
+        NAIVE,
-      Input& with_barrier(bool i) { barrier = i; return *this; }
+        GROUP_AND_SORT,
-      Input& with_chrono(bool i) { chrono = i; return *this; }
+      };
      ADD_ATTRIBUTE(bool, rankRoundRobin, false)
      ADD_ATTRIBUTE(bool, chrono, false)
      ADD_ATTRIBUTE(bool, barrier, false)
      ADD_ATTRIBUTE(int, maxIterations, 0)
      ADD_ATTRIBUTE(int, iterationMod, -1)
      ADD_ATTRIBUTE(int, percentageMod, -1)
      ADD_ATTRIBUTE(TuplesDistribution, tuplesDistribution, NAIVE)
    };
    struct Output {
--- a/include/atrip/Debug.hpp
+++ b/include/atrip/Debug.hpp
@@ -41,7 +41,6 @@
 #  define DBG(...) dbg(__VA_ARGS__)
 #elif ATRIP_DEBUG == 2
 #  pragma message("WARNING: You have some debugging info for ABC triples")
 #  include <dbg.h>
 #  define OCD_Barrier(com)
 #  define WITH_OCD if (false)
 #  define WITH_ROOT if (atrip::Atrip::rank == 0)
--- a/include/atrip/Equations.hpp
+++ b/include/atrip/Equations.hpp
@@ -40,12 +40,12 @@ namespace atrip {
                  , X(Zijk_[j + No*k + No*No*i])
                  , Y(Zijk_[k + No*i + No*No*j])
                  , Z(Zijk_[k + No*j + No*No*i])
-                  , A(std::conj(Tijk_[i + No*j + No*No*k]))
+                  , A(maybeConjugate<F>(Tijk_[i + No*j + No*No*k]))
-                  , B(std::conj(Tijk_[i + No*k + No*No*j]))
+                  , B(maybeConjugate<F>(Tijk_[i + No*k + No*No*j]))
-                  , C(std::conj(Tijk_[j + No*i + No*No*k]))
+                  , C(maybeConjugate<F>(Tijk_[j + No*i + No*No*k]))
-                  , D(std::conj(Tijk_[j + No*k + No*No*i]))
+                  , D(maybeConjugate<F>(Tijk_[j + No*k + No*No*i]))
-                  , E(std::conj(Tijk_[k + No*i + No*No*j]))
+                  , E(maybeConjugate<F>(Tijk_[k + No*i + No*No*j]))
-                  , F(std::conj(Tijk_[k + No*j + No*No*i]))
+                  , F(maybeConjugate<F>(Tijk_[k + No*j + No*No*i]))
                  , value
                    = 3.0 * ( A * U
                              + B * V
@@ -102,9 +102,9 @@ namespace atrip {
                , U(Zijk_[i + No*j + No*No*k])
                , V(Zijk_[j + No*k + No*No*i])
                , W(Zijk_[k + No*i + No*No*j])
-                , A(std::conj(Tijk_[i + No*j + No*No*k]))
+                , A(maybeConjugate<F>(Tijk_[i + No*j + No*No*k]))
-                , B(std::conj(Tijk_[j + No*k + No*No*i]))
+                , B(maybeConjugate<F>(Tijk_[j + No*k + No*No*i]))
-                , C(std::conj(Tijk_[k + No*i + No*No*j]))
+                , C(maybeConjugate<F>(Tijk_[k + No*i + No*No*j]))
                , value
                  = F(3.0) * ( A * U
                             + B * V
@@ -172,10 +172,8 @@ namespace atrip {
    , F const* TBChh
    // -- TIJK
    , F *Tijk
    , atrip::Timings& chrono
    ) {
    auto& t_reorder = chrono["doubles:reorder"];
    const size_t a = abc[0], b = abc[1], c = abc[2]
              , NoNo = No*No, NoNv = No*Nv
              ;
@@ -183,13 +181,13 @@ namespace atrip {
  #if defined(ATRIP_USE_DGEMM)
  #define _IJK_(i, j, k) i + j*No + k*NoNo
  #define REORDER(__II, __JJ, __KK)                                 \
-    t_reorder.start();                                              \
+    WITH_CHRONO("doubles:reorder",                                  \
    for (size_t k = 0; k < No; k++)                                 \
    for (size_t j = 0; j < No; j++)                                 \
    for (size_t i = 0; i < No; i++) {                               \
      Tijk[_IJK_(i, j, k)] += _t_buffer[_IJK_(__II, __JJ, __KK)];   \
    }                                                               \
-    t_reorder.stop();
+    )
  #define DGEMM_PARTICLES(__A, __B)      \
    atrip::xgemm<F>( "T"                 \
                   , "N"                 \
@@ -220,106 +218,100 @@ namespace atrip {
                   , _t_buffer.data()        \
                   , (int const*)&NoNo       \
                   );
-  #define MAYBE_CONJ(_conj, _buffer)                          \
+  #define MAYBE_CONJ(_conj, _buffer)                 \
-    if (traits::isComplex<F>()) {                             \
+    for (size_t __i = 0; __i < NoNoNo; ++__i)        \
-      for (size_t __i = 0; __i < NoNoNo; ++__i)               \
+      _conj[__i] = maybeConjugate<F>(_buffer[__i]);  \
        _conj[__i] = std::conj(_buffer[__i]);                 \
    } else {                                                  \
      for (size_t __i = 0; __i < NoNoNo; ++__i)               \
        _conj[__i] = _buffer[__i];                            \
    }
    const size_t NoNoNo = No*NoNo;
    std::vector<F> _t_buffer;
    _t_buffer.reserve(NoNoNo);
    F one{1.0}, m_one{-1.0}, zero{0.0};
-    t_reorder.start();
+    WITH_CHRONO("double:reorder",
-    for (size_t k = 0; k < NoNoNo; k++) {
+      for (size_t k = 0; k < NoNoNo; k++) {
-      // zero the Tijk
+         Tijk[k] = 0.0;
-      Tijk[k] = 0.0;
+       })
    }
    t_reorder.stop();
-    chrono["doubles:holes"].start();
+    // TOMERGE: replace chronos
-    { // Holes part ============================================================
+    WITH_CHRONO("doubles:holes",
      { // Holes part ========================================================
-      std::vector<F> _vhhh(NoNoNo);
+        std::vector<F> _vhhh(NoNoNo);
-      // VhhhC[i + k*No + L*NoNo] * TABhh[L + j*No]; H1
+        // VhhhC[i + k*No + L*NoNo] * TABhh[L + j*No]; H1
-      MAYBE_CONJ(_vhhh, VhhhC)
+        MAYBE_CONJ(_vhhh, VhhhC)
-      chrono["doubles:holes:1"].start();
+        WITH_CHRONO("doubles:holes:1",
-      DGEMM_HOLES(_vhhh.data(), TABhh, "N")
+          DGEMM_HOLES(_vhhh.data(), TABhh, "N")
-      REORDER(i, k, j)
+          REORDER(i, k, j)
-      chrono["doubles:holes:1"].stop();
+        )
-      // VhhhC[j + k*No + L*NoNo] * TABhh[i + L*No]; H0
+        // VhhhC[j + k*No + L*NoNo] * TABhh[i + L*No]; H0
-      chrono["doubles:holes:2"].start();
+        WITH_CHRONO("doubles:holes:2",
-      DGEMM_HOLES(_vhhh.data(), TABhh, "T")
+          DGEMM_HOLES(_vhhh.data(), TABhh, "T")
-      REORDER(j, k, i)
+          REORDER(j, k, i)
-      chrono["doubles:holes:2"].stop();
+        )
-      // VhhhB[i + j*No + L*NoNo] * TAChh[L + k*No]; H5
+        // VhhhB[i + j*No + L*NoNo] * TAChh[L + k*No]; H5
-      MAYBE_CONJ(_vhhh, VhhhB)
+        MAYBE_CONJ(_vhhh, VhhhB)
-      chrono["doubles:holes:3"].start();
+        WITH_CHRONO("doubles:holes:3",
-      DGEMM_HOLES(_vhhh.data(), TAChh, "N")
+          DGEMM_HOLES(_vhhh.data(), TAChh, "N")
-      REORDER(i, j, k)
+          REORDER(i, j, k)
-      chrono["doubles:holes:3"].stop();
+        )
-      // VhhhB[k + j*No + L*NoNo] * TAChh[i + L*No]; H3
+        // VhhhB[k + j*No + L*NoNo] * TAChh[i + L*No]; H3
-      chrono["doubles:holes:4"].start();
+        WITH_CHRONO("doubles:holes:4",
-      DGEMM_HOLES(_vhhh.data(), TAChh, "T")
+          DGEMM_HOLES(_vhhh.data(), TAChh, "T")
-      REORDER(k, j, i)
+          REORDER(k, j, i)
-      chrono["doubles:holes:4"].stop();
+        )
-      // VhhhA[j + i*No + L*NoNo] * TBChh[L + k*No]; H1
+        // VhhhA[j + i*No + L*NoNo] * TBChh[L + k*No]; H1
-      MAYBE_CONJ(_vhhh, VhhhA)
+        MAYBE_CONJ(_vhhh, VhhhA)
-      chrono["doubles:holes:5"].start();
+        WITH_CHRONO("doubles:holes:5",
-      DGEMM_HOLES(_vhhh.data(), TBChh, "N")
+          DGEMM_HOLES(_vhhh.data(), TBChh, "N")
-      REORDER(j, i, k)
+          REORDER(j, i, k)
-      chrono["doubles:holes:5"].stop();
+        )
-      // VhhhA[k + i*No + L*NoNo] * TBChh[j + L*No]; H4
+        // VhhhA[k + i*No + L*NoNo] * TBChh[j + L*No]; H4
-      chrono["doubles:holes:6"].start();
+        WITH_CHRONO("doubles:holes:6",
-      DGEMM_HOLES(_vhhh.data(), TBChh, "T")
+          DGEMM_HOLES(_vhhh.data(), TBChh, "T")
-      REORDER(k, i, j)
+          REORDER(k, i, j)
-      chrono["doubles:holes:6"].stop();
+        )
-    }
+      }
-    chrono["doubles:holes"].stop();
+    )
  #undef MAYBE_CONJ
-    chrono["doubles:particles"].start();
+    WITH_CHRONO("doubles:particles",
-    { // Particle part =========================================================
+      { // Particle part =====================================================
-      // TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv]; P0
+        // TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv]; P0
-      chrono["doubles:particles:1"].start();
+        WITH_CHRONO("doubles:particles:1",
-      DGEMM_PARTICLES(TAphh, VBCph)
+          DGEMM_PARTICLES(TAphh, VBCph)
-      REORDER(i, j, k)
+          REORDER(i, j, k)
-      chrono["doubles:particles:1"].stop();
+        )
-      // TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv]; P3
+        // TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv]; P3
-      chrono["doubles:particles:2"].start();
+        WITH_CHRONO("doubles:particles:2",
-      DGEMM_PARTICLES(TAphh, VCBph)
+          DGEMM_PARTICLES(TAphh, VCBph)
-      REORDER(i, k, j)
+          REORDER(i, k, j)
-      chrono["doubles:particles:2"].stop();
+        )
-      // TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv]; P5
+        // TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv]; P5
-      chrono["doubles:particles:3"].start();
+        WITH_CHRONO("doubles:particles:3",
-      DGEMM_PARTICLES(TCphh, VABph)
+          DGEMM_PARTICLES(TCphh, VABph)
-      REORDER(k, i, j)
+          REORDER(k, i, j)
-      chrono["doubles:particles:3"].stop();
+        )
-      // TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv]; P2
+        // TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv]; P2
-      chrono["doubles:particles:4"].start();
+        WITH_CHRONO("doubles:particles:4",
-      DGEMM_PARTICLES(TCphh, VBAph)
+          DGEMM_PARTICLES(TCphh, VBAph)
-      REORDER(k, j, i)
+          REORDER(k, j, i)
-      chrono["doubles:particles:4"].stop();
+        )
-      // TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv]; P1
+        // TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv]; P1
-      chrono["doubles:particles:5"].start();
+        WITH_CHRONO("doubles:particles:5",
-      DGEMM_PARTICLES(TBphh, VACph)
+          DGEMM_PARTICLES(TBphh, VACph)
-      REORDER(j, i, k)
+          REORDER(j, i, k)
-      chrono["doubles:particles:5"].stop();
+        )
-      // TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv]; P4
+        // TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv]; P4
-      chrono["doubles:particles:6"].start();
+        WITH_CHRONO("doubles:particles:6",
-      DGEMM_PARTICLES(TBphh, VCAph)
+          DGEMM_PARTICLES(TBphh, VCAph)
-      REORDER(j, k, i)
+          REORDER(j, k, i)
-      chrono["doubles:particles:6"].stop();
+        )
-    }
+      }
-    chrono["doubles:particles"].stop();
+    )
  #undef REORDER
  #undef DGEMM_HOLES
--- a/include/atrip/RankMap.hpp
+++ b/include/atrip/RankMap.hpp
@@ -5,24 +5,38 @@
 #include <algorithm>
 #include <atrip/Slice.hpp>
 #include <atrip/Tuples.hpp>
 namespace atrip {
  template <typename F=double>
  struct RankMap {
    static bool RANK_ROUND_ROBIN;
    std::vector<size_t> const lengths;
    size_t const np, size;
    ClusterInfo const clusterInfo;
-    RankMap(std::vector<size_t> lens, size_t np_)
+    RankMap(std::vector<size_t> lens, size_t np_, MPI_Comm comm)
      : lengths(lens)
      , np(np_)
      , size(std::accumulate(lengths.begin(), lengths.end(),
                            1UL, std::multiplies<size_t>()))
      , clusterInfo(getClusterInfo(comm))
    { assert(lengths.size() <= 2); }
    size_t find(typename Slice<F>::Location const& p) const noexcept {
-      return p.source * np + p.rank;
+      if (RANK_ROUND_ROBIN) {
        return p.source * np + p.rank;
      } else {
        const size_t
          rankPosition = p.source * clusterInfo.ranksPerNode
                       + clusterInfo.rankInfos[p.rank].localRank
                       ;
        return rankPosition * clusterInfo.nNodes
             + clusterInfo.rankInfos[p.rank].nodeId
             ;
      }
    }
    size_t nSources() const noexcept {
@@ -42,8 +56,9 @@ namespace atrip {
    }
    typename Slice<F>::Location
-    find(ABCTuple const& abc, typename Slice<F>::Type sliceType) const noexcept {
+    find(ABCTuple const& abc, typename Slice<F>::Type sliceType) const {
      // tuple = {11, 8} when abc = {11, 8, 9} and sliceType = AB
      // tuple = {11, 0} when abc = {11, 8, 9} and sliceType = A
      const auto tuple = Slice<F>::subtupleBySlice(abc, sliceType);
      const size_t index
@@ -51,9 +66,51 @@ namespace atrip {
        + tuple[1] * (lengths.size() > 1 ? lengths[0] : 0)
        ;
      size_t rank, source;
      if (RANK_ROUND_ROBIN) {
        rank = index % np;
        source = index / np;
      } else {
        size_t const
          // the node that will be assigned to
            nodeId = index % clusterInfo.nNodes
          // how many times it has been assigned to the node
          , s_n = index / clusterInfo.nNodes
          // which local rank in the node should be
          , localRank = s_n % clusterInfo.ranksPerNode
          // and the local source (how many times we chose this local rank)
          , localSource = s_n / clusterInfo.ranksPerNode
          ;
        // find the localRank-th entry in clusterInfo
        auto const& it =
          std::find_if(clusterInfo.rankInfos.begin(),
                       clusterInfo.rankInfos.end(),
                       [nodeId, localRank](RankInfo const& ri) {
                         return ri.nodeId == nodeId
                             && ri.localRank == localRank
                             ;
                       });
        if (it == clusterInfo.rankInfos.end()) {
          throw "FATAL! Error in node distribution of the slices";
        }
        rank = (*it).globalRank;
        source = localSource;
      }
      return
-        { index % np
+        { rank
-        , index / np
+        , source
        };
    }
--- a/include/atrip/Slice.hpp
+++ b/include/atrip/Slice.hpp
@@ -1,4 +1,4 @@
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*The%20slice][The slice:1]]
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Prolog][Prolog:1]]
 #pragma once
 #include <iostream>
 #include <algorithm>
@@ -11,6 +11,9 @@
 namespace atrip {
 template <typename FF> FF maybeConjugate(const FF a) { return a; }
 template <> Complex maybeConjugate(const Complex a) { return std::conj(a); }
 namespace traits {
  template <typename FF> bool isComplex() { return false; };
  template <> bool isComplex<Complex>() { return true; };
@@ -24,401 +27,409 @@ namespace mpi {
 template <typename F=double>
 struct Slice {
-// The slice:1 ends here
+// Prolog:1 ends here
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*The%20slice][The slice:2]]
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Location][Location:1]]
-// ASSOCIATED TYPES %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+struct Location { size_t rank; size_t source; };
 // Location:1 ends here
-  struct Location { size_t rank; size_t source; };
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Type][Type:1]]
-
+enum Type
-  enum Type
+  { A = 10
-    { A = 10
+  , B
-    , B
+  , C
-    , C
+  // Two-parameter slices
-    // Two-parameter slices
+  , AB = 20
-    , AB = 20
+  , BC
-    , BC
+  , AC
-    , AC
+  // for abci and the doubles
-    // for abci and the doubles
+  , CB
-    , CB
+  , BA
-    , BA
+  , CA
-    , CA
+  // The non-typed slice
-    // The non-typed slice
+  , Blank = 404
    , Blank = 404
    };
  enum State {
    // Fetch represents the state where a slice is to be fetched
    // and has a valid data pointer that can be written to
    Fetch = 0,
    // Dispatches represents the state that an MPI call has been
    // dispatched in order to get the data, but the data has not been
    // yet unwrapped, the data might be there or we might have to wait.
    Dispatched = 2,
    // Ready means that the data pointer can be read from
    Ready = 1,
    // Self sufficient is a slice when its contents are located
    // in the same rank that it lives, so that it does not have to
    // fetch from no one else.
    SelfSufficient = 911,
    // Recycled means that this slice gets its data pointer from another
    // slice, so it should not be written to
    Recycled = 123,
    // Acceptor means that the Slice can accept a new Slice, it is
    // the counterpart of the Blank type, but for states
    Acceptor = 405
  };
 // Type:1 ends here
-  struct Info {
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*State][State:1]]
-    // which part of a,b,c the slice holds
+enum State {
-    PartialTuple tuple;
+  Fetch = 0,
-    // The type of slice for the user to retrieve the correct one
+  Dispatched = 2,
-    Type type;
+  Ready = 1,
-    // What is the state of the slice
+  SelfSufficient = 911,
-    State state;
+  Recycled = 123,
-    // Where the slice is to be retrieved
+  Acceptor = 405
-    // NOTE: this can actually be computed from tuple
+};
-    Location from;
+// State:1 ends here
    // If the data are actually to be found in this other slice
    Type recycling;
-    Info() : tuple{0,0}
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*The%20Info%20structure][The Info structure:1]]
-           , type{Blank}
+struct Info {
-           , state{Acceptor}
+  // which part of a,b,c the slice holds
-           , from{0,0}
+  PartialTuple tuple;
-           , recycling{Blank}
+  // The type of slice for the user to retrieve the correct one
-           {}
+  Type type;
  // What is the state of the slice
  State state;
  // Where the slice is to be retrieved
  Location from;
  // If the data are actually to be found in this other slice
  Type recycling;
  Info() : tuple{0,0}
          , type{Blank}
          , state{Acceptor}
          , from{0,0}
          , recycling{Blank}
          {}
 };
 using Ty_x_Tu = std::pair< Type, PartialTuple >;
 // The Info structure:1 ends here
 // [[file:~/cc4s/src/atrip/complex/atrip.org::*Name][Name:1]]
 enum Name
  { TA    = 100
  , VIJKA = 101
  , VABCI = 200
  , TABIJ = 201
  , VABIJ = 202
  };
 // Name:1 ends here
-  using Ty_x_Tu = std::pair< Type, PartialTuple >;
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Database][Database:1]]
 struct LocalDatabaseElement {
  Slice<F>::Name name;
  Slice<F>::Info info;
 };
 // Database:1 ends here
-  // Names of the integrals that are considered in CCSD(T)
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Database][Database:2]]
-  enum Name
+using LocalDatabase = std::vector<LocalDatabaseElement>;
-    { TA    = 100
+using Database = LocalDatabase;
-    , VIJKA = 101
+// Database:2 ends here
    , VABCI = 200
    , TABIJ = 201
    , VABIJ = 202
    };
-  // DATABASE ==========================================================={{{1
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*MPI%20Types][MPI Types:1]]
-  struct LocalDatabaseElement {
+struct mpi {
    Slice<F>::Name name;
    Slice<F>::Info info;
  };
  using LocalDatabase = std::vector<LocalDatabaseElement>;
  using Database = LocalDatabase;
-
+  static MPI_Datatype vector(size_t n, MPI_Datatype const& DT) {
-    // STATIC METHODS ===========================================================
+    MPI_Datatype dt;
-    //
+    MPI_Type_vector(n, 1, 1, DT, &dt);
-    // They are useful to organize the structure of slices
+    MPI_Type_commit(&dt);
-
+    return dt;
    struct mpi {
      static MPI_Datatype vector(size_t n, MPI_Datatype const& DT) {
        MPI_Datatype dt;
        MPI_Type_vector(n, 1, 1, DT, &dt);
        MPI_Type_commit(&dt);
        return dt;
      }
      static MPI_Datatype sliceLocation () {
        constexpr int n = 2;
        // create a sliceLocation to measure in the current architecture
        // the packing of the struct
        Slice<F>::Location measure;
        MPI_Datatype dt;
        const std::vector<int> lengths(n, 1);
        const MPI_Datatype types[n] = {usizeDt(), usizeDt()};
        // measure the displacements in the struct
        size_t j = 0;
        MPI_Aint displacements[n];
        MPI_Get_address(&measure.rank,   &displacements[j++]);
        MPI_Get_address(&measure.source, &displacements[j++]);
        for (size_t i = 1; i < n; i++) displacements[i] -= displacements[0];
        displacements[0] = 0;
        MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt);
        MPI_Type_commit(&dt);
        return dt;
      }
      static MPI_Datatype enumDt() { return MPI_INT; }
      static MPI_Datatype usizeDt() { return MPI_UINT64_T; }
      static MPI_Datatype sliceInfo () {
        constexpr int n = 5;
        MPI_Datatype dt;
        Slice<F>::Info measure;
        const std::vector<int> lengths(n, 1);
        const MPI_Datatype types[n]
          = { vector(2, usizeDt())
            , enumDt()
            , enumDt()
            , sliceLocation()
            , enumDt()
            };
        // create the displacements from the info measurement struct
        size_t j = 0;
        MPI_Aint displacements[n];
        MPI_Get_address(measure.tuple.data(), &displacements[j++]);
        MPI_Get_address(&measure.type,        &displacements[j++]);
        MPI_Get_address(&measure.state,       &displacements[j++]);
        MPI_Get_address(&measure.from,        &displacements[j++]);
        MPI_Get_address(&measure.recycling,   &displacements[j++]);
        for (size_t i = 1; i < n; i++) displacements[i] -= displacements[0];
        displacements[0] = 0;
        MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt);
        MPI_Type_commit(&dt);
        return dt;
      }
      static MPI_Datatype localDatabaseElement () {
        constexpr int n = 2;
        MPI_Datatype dt;
        LocalDatabaseElement measure;
        const std::vector<int> lengths(n, 1);
        const MPI_Datatype types[n]
          = { enumDt()
            , sliceInfo()
            };
        // measure the displacements in the struct
        size_t j = 0;
        MPI_Aint displacements[n];
        MPI_Get_address(&measure.name, &displacements[j++]);
        MPI_Get_address(&measure.info, &displacements[j++]);
        for (size_t i = 1; i < n; i++) displacements[i] -= displacements[0];
        displacements[0] = 0;
        MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt);
        MPI_Type_commit(&dt);
        return dt;
      }
    };
  static
  PartialTuple subtupleBySlice(ABCTuple abc, Type sliceType) {
    switch (sliceType) {
      case AB: return {abc[0], abc[1]};
      case BC: return {abc[1], abc[2]};
      case AC: return {abc[0], abc[2]};
      case CB: return {abc[2], abc[1]};
      case BA: return {abc[1], abc[0]};
      case CA: return {abc[2], abc[0]};
      case  A: return {abc[0], 0};
      case  B: return {abc[1], 0};
      case  C: return {abc[2], 0};
      default: throw "Switch statement not exhaustive!";
    }
  }
  static MPI_Datatype sliceLocation () {
    constexpr int n = 2;
    // create a sliceLocation to measure in the current architecture
    // the packing of the struct
    Slice<F>::Location measure;
    MPI_Datatype dt;
    const std::vector<int> lengths(n, 1);
    const MPI_Datatype types[n] = {usizeDt(), usizeDt()};
-    /**
+    static_assert(sizeof(Slice<F>::Location) == 2 * sizeof(size_t),
-     * It is important here to return a reference to a Slice
+                  "The Location packing is wrong in your compiler");
     * not to accidentally copy the associated buffer of the slice.
     */
    static Slice<F>& findOneByType(std::vector<Slice<F>> &slices, Slice<F>::Type type) {
        const auto sliceIt
          = std::find_if(slices.begin(), slices.end(),
                         [&type](Slice<F> const& s) {
                           return type == s.info.type;
                         });
        WITH_CRAZY_DEBUG
        WITH_RANK
          << "\t__ looking for " << type << "\n";
        if (sliceIt == slices.end())
          throw std::domain_error("Slice by type not found!");
        return *sliceIt;
    }
-    /*
+    // measure the displacements in the struct
-     * Check if an info has
+    size_t j = 0;
-     *
+    MPI_Aint base_address, displacements[n];
-     */
+    MPI_Get_address(&measure,        &base_address);
-    static std::vector<Slice<F>*> hasRecycledReferencingToIt
+    MPI_Get_address(&measure.rank,   &displacements[j++]);
-      ( std::vector<Slice<F>> &slices
+    MPI_Get_address(&measure.source, &displacements[j++]);
-      , Info const& info
+    for (size_t i = 0; i < n; i++)
-      ) {
+      displacements[i] = MPI_Aint_diff(displacements[i], base_address);
      std::vector<Slice<F>*> result;
-      for (auto& s: slices)
+    MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt);
-        if (  s.info.recycling == info.type
+    MPI_Type_commit(&dt);
-           && s.info.tuple == info.tuple
+    return dt;
-           && s.info.state == Recycled
+  }
           ) result.push_back(&s);
-      return result;
+  static MPI_Datatype usizeDt() { return MPI_UINT64_T; }
    }
-    static Slice<F>&
+  static MPI_Datatype sliceInfo () {
-    findRecycledSource (std::vector<Slice<F>> &slices, Slice<F>::Info info) {
+    constexpr int n = 5;
-      const auto sliceIt
+    MPI_Datatype dt;
-        = std::find_if(slices.begin(), slices.end(),
+    Slice<F>::Info measure;
-                       [&info](Slice<F> const& s) {
+    const std::vector<int> lengths(n, 1);
-                         return info.recycling == s.info.type
+    const MPI_Datatype types[n]
-                             && info.tuple == s.info.tuple
+      = { vector(2, usizeDt())
-                             && State::Recycled != s.info.state
+        , vector(sizeof(enum Type), MPI_CHAR)
-                             ;
+        , vector(sizeof(enum State), MPI_CHAR)
-                       });
+        , sliceLocation()
        , vector(sizeof(enum Type), MPI_CHAR)
        // TODO: Why this does not work on intel mpi?
        /*, MPI_UINT64_T*/
        };
-      WITH_CRAZY_DEBUG
+    static_assert(sizeof(enum Type)  == 4, "Enum type not 4 bytes long");
-      WITH_RANK << "__slice__:find: recycling source of "
+    static_assert(sizeof(enum State) == 4, "Enum State not 4 bytes long");
-                << pretty_print(info) << "\n";
+    static_assert(sizeof(enum Name)  == 4, "Enum Name not 4 bytes long");
      if (sliceIt == slices.end())
        throw std::domain_error( "Slice not found: "
                               + pretty_print(info)
                               + " rank: "
                               + pretty_print(Atrip::rank)
                               );
      WITH_RANK << "__slice__:find: " << pretty_print(sliceIt->info) << "\n";
      return *sliceIt;
    }
-    static Slice<F>& findByTypeAbc
+    // create the displacements from the info measurement struct
-      ( std::vector<Slice<F>> &slices
+    size_t j = 0;
-      , Slice<F>::Type type
+    MPI_Aint base_address, displacements[n];
-      , ABCTuple const& abc
+    MPI_Get_address(&measure,             &base_address);
-      ) {
+    MPI_Get_address(&measure.tuple[0],    &displacements[j++]);
-        const auto tuple = Slice<F>::subtupleBySlice(abc, type);
+    MPI_Get_address(&measure.type,        &displacements[j++]);
-        const auto sliceIt
+    MPI_Get_address(&measure.state,       &displacements[j++]);
-          = std::find_if(slices.begin(), slices.end(),
+    MPI_Get_address(&measure.from,        &displacements[j++]);
-                         [&type, &tuple](Slice<F> const& s) {
+    MPI_Get_address(&measure.recycling,   &displacements[j++]);
-                           return type == s.info.type
+    for (size_t i = 0; i < n; i++)
-                               && tuple == s.info.tuple
+      displacements[i] = MPI_Aint_diff(displacements[i], base_address);
                               ;
                         });
        WITH_CRAZY_DEBUG
        WITH_RANK << "__slice__:find:" << type << " and tuple "
                  << pretty_print(tuple)
                  << "\n";
        if (sliceIt == slices.end())
          throw std::domain_error( "Slice not found: "
                                 + pretty_print(tuple)
                                 + ", "
                                 + pretty_print(type)
                                 + " rank: "
                                 + pretty_print(Atrip::rank)
                                 );
        return *sliceIt;
    }
-    static Slice<F>& findByInfo(std::vector<Slice<F>> &slices,
+    MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt);
-                             Slice<F>::Info const& info) {
+    MPI_Type_commit(&dt);
-        const auto sliceIt
+    return dt;
-          = std::find_if(slices.begin(), slices.end(),
+  }
                         [&info](Slice<F> const& s) {
                           // TODO: maybe implement comparison in Info struct
                           return info.type == s.info.type
                               && info.state == s.info.state
                               && info.tuple == s.info.tuple
                               && info.from.rank == s.info.from.rank
                               && info.from.source == s.info.from.source
                                ;
                         });
        WITH_CRAZY_DEBUG
        WITH_RANK << "__slice__:find:looking for " << pretty_print(info) << "\n";
        if (sliceIt == slices.end())
          throw std::domain_error( "Slice by info not found: "
                                 + pretty_print(info));
        return *sliceIt;
    }
-    // SLICE DEFINITION  =================================================={{{1
+  static MPI_Datatype localDatabaseElement () {
    constexpr int n = 2;
    MPI_Datatype dt;
    LocalDatabaseElement measure;
    const std::vector<int> lengths(n, 1);
    const MPI_Datatype types[n]
      = { vector(sizeof(enum Name), MPI_CHAR)
        , sliceInfo()
        };
-    // ATTRIBUTES ============================================================
+    // measure the displacements in the struct
-    Info info;
+    size_t j = 0;
-    F  *data;
+    MPI_Aint base_address, displacements[n];
-    MPI_Request request;
+    MPI_Get_address(&measure,      &base_address);
-    const size_t size;
+    MPI_Get_address(&measure.name, &displacements[j++]);
    MPI_Get_address(&measure.info, &displacements[j++]);
    for (size_t i = 0; i < n; i++)
      displacements[i] = MPI_Aint_diff(displacements[i], base_address);
-    void markReady() noexcept {
+    static_assert( sizeof(LocalDatabaseElement) == sizeof(measure)
-      info.state = Ready;
+                 , "Measure has bad size");
      info.recycling = Blank;
    }
-    /*
+    MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt);
-     * This means that the data is there
+    MPI_Type_commit(&dt);
-     */
+    return vector(sizeof(LocalDatabaseElement), MPI_CHAR);
-    bool isUnwrapped() const noexcept {
+    // TODO: write tests in order to know if this works
-      return info.state == Ready
+    return dt;
-          || info.state == SelfSufficient
+  }
          ;
    }
-    bool isUnwrappable() const noexcept {
+};
-      return isUnwrapped()
+// MPI Types:1 ends here
          || info.state == Recycled
          || info.state == Dispatched
          ;
    }
-    inline bool isDirectlyFetchable() const noexcept {
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Static%20utilities][Static utilities:1]]
-      return info.state == Ready || info.state == Dispatched;
+static
-    }
+PartialTuple subtupleBySlice(ABCTuple abc, Type sliceType) {
  switch (sliceType) {
    case AB: return {abc[0], abc[1]};
    case BC: return {abc[1], abc[2]};
    case AC: return {abc[0], abc[2]};
    case CB: return {abc[2], abc[1]};
    case BA: return {abc[1], abc[0]};
    case CA: return {abc[2], abc[0]};
    case  A: return {abc[0], 0};
    case  B: return {abc[1], 0};
    case  C: return {abc[2], 0};
    default: throw "Switch statement not exhaustive!";
  }
 }
 // Static utilities:1 ends here
-    void free() noexcept {
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Static%20utilities][Static utilities:2]]
-      info.tuple      = {0, 0};
+static std::vector<Slice<F>*> hasRecycledReferencingToIt
-      info.type       = Blank;
+  ( std::vector<Slice<F>> &slices
-      info.state      = Acceptor;
+  , Info const& info
-      info.from       = {0, 0};
+  ) {
-      info.recycling  = Blank;
+  std::vector<Slice<F>*> result;
      data            = nullptr;
    }
-    inline bool isFree() const noexcept {
+  for (auto& s: slices)
-      return info.tuple       == PartialTuple{0, 0}
+    if (  s.info.recycling == info.type
-          && info.type        == Blank
+       && s.info.tuple == info.tuple
-          && info.state       == Acceptor
+       && s.info.state == Recycled
-          && info.from.rank   == 0
+       ) result.push_back(&s);
          && info.from.source == 0
          && info.recycling   == Blank
          && data             == nullptr
           ;
    }
  return result;
 }
 // Static utilities:2 ends here
-    /*
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Static%20utilities][Static utilities:3]]
-     * This function answers the question, which slices can be recycled.
+static Slice<F>& findOneByType(std::vector<Slice<F>> &slices, Slice<F>::Type type) {
-     *
+    const auto sliceIt
-     * A slice can only be recycled if it is Fetch or Ready and has
+      = std::find_if(slices.begin(), slices.end(),
-     * a valid datapointer.
+                     [&type](Slice<F> const& s) {
-     *
+                       return type == s.info.type;
-     * In particular, SelfSufficient are not recyclable, since it is easier
+                     });
-     * just to create a SelfSufficient slice than deal with data dependencies.
+    WITH_CRAZY_DEBUG
-     *
+    WITH_RANK
-     * Furthermore, a recycled slice is not recyclable, if this is the case
+      << "\t__ looking for " << type << "\n";
-     * then it is either bad design or a bug.
+    if (sliceIt == slices.end())
-     */
+      throw std::domain_error("Slice by type not found!");
-    inline bool isRecyclable() const noexcept {
+    return *sliceIt;
-      return (  info.state == Dispatched
+}
-             || info.state == Ready
+// Static utilities:3 ends here
             || info.state == Fetch
             )
          && hasValidDataPointer()
          ;
    }
-    /*
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Static%20utilities][Static utilities:4]]
-     * This function describes if a slice has a valid data pointer.
+static Slice<F>&
-     *
+findRecycledSource (std::vector<Slice<F>> &slices, Slice<F>::Info info) {
-     * This is important to know if the slice has some data to it, also
+  const auto sliceIt
-     * some structural checks are done, so that it should not be Acceptor
+    = std::find_if(slices.begin(), slices.end(),
-     * or Blank, if this is the case then it is a bug.
+                   [&info](Slice<F> const& s) {
-     */
+                     return info.recycling == s.info.type
-    inline bool hasValidDataPointer() const noexcept {
+                         && info.tuple == s.info.tuple
-      return data       != nullptr
+                         && State::Recycled != s.info.state
-          && info.state != Acceptor
+                         ;
-          && info.type  != Blank
+                   });
          ;
    }
-    void unwrapAndMarkReady() {
+  WITH_CRAZY_DEBUG
  WITH_RANK << "__slice__:find: recycling source of "
            << pretty_print(info) << "\n";
  if (sliceIt == slices.end())
    throw std::domain_error( "Slice not found: "
                           + pretty_print(info)
                           + " rank: "
                           + pretty_print(Atrip::rank)
                           );
  WITH_RANK << "__slice__:find: " << pretty_print(sliceIt->info) << "\n";
  return *sliceIt;
 }
 // Static utilities:4 ends here
 // [[file:~/cc4s/src/atrip/complex/atrip.org::*Static%20utilities][Static utilities:5]]
 static Slice<F>& findByTypeAbc
  ( std::vector<Slice<F>> &slices
  , Slice<F>::Type type
  , ABCTuple const& abc
  ) {
    const auto tuple = Slice<F>::subtupleBySlice(abc, type);
    const auto sliceIt
      = std::find_if(slices.begin(), slices.end(),
                     [&type, &tuple](Slice<F> const& s) {
                       return type == s.info.type
                           && tuple == s.info.tuple
                           ;
                     });
    WITH_CRAZY_DEBUG
    WITH_RANK << "__slice__:find:" << type << " and tuple "
              << pretty_print(tuple)
              << "\n";
    if (sliceIt == slices.end())
      throw std::domain_error( "Slice not found: "
                             + pretty_print(tuple)
                             + ", "
                             + pretty_print(type)
                             + " rank: "
                             + pretty_print(Atrip::rank)
                             );
    return *sliceIt;
 }
 // Static utilities:5 ends here
 // [[file:~/cc4s/src/atrip/complex/atrip.org::*Static%20utilities][Static utilities:6]]
 static Slice<F>& findByInfo(std::vector<Slice<F>> &slices,
                         Slice<F>::Info const& info) {
  const auto sliceIt
    = std::find_if(slices.begin(), slices.end(),
                   [&info](Slice<F> const& s) {
                     // TODO: maybe implement comparison in Info struct
                     return info.type == s.info.type
                         && info.state == s.info.state
                         && info.tuple == s.info.tuple
                         && info.from.rank == s.info.from.rank
                         && info.from.source == s.info.from.source
                          ;
                   });
  WITH_CRAZY_DEBUG
  WITH_RANK << "__slice__:find:looking for " << pretty_print(info) << "\n";
  if (sliceIt == slices.end())
    throw std::domain_error( "Slice by info not found: "
                           + pretty_print(info));
  return *sliceIt;
 }
 // Static utilities:6 ends here
 // [[file:~/cc4s/src/atrip/complex/atrip.org::*Attributes][Attributes:1]]
 Info info;
 // Attributes:1 ends here
 // [[file:~/cc4s/src/atrip/complex/atrip.org::*Attributes][Attributes:2]]
 F  *data;
 // Attributes:2 ends here
 // [[file:~/cc4s/src/atrip/complex/atrip.org::*Attributes][Attributes:3]]
 MPI_Request request;
 // Attributes:3 ends here
 // [[file:~/cc4s/src/atrip/complex/atrip.org::*Attributes][Attributes:4]]
 const size_t size;
 // Attributes:4 ends here
 // [[file:~/cc4s/src/atrip/complex/atrip.org::*Member%20functions][Member functions:1]]
 void markReady() noexcept {
  info.state = Ready;
  info.recycling = Blank;
 }
 // Member functions:1 ends here
 // [[file:~/cc4s/src/atrip/complex/atrip.org::*Member%20functions][Member functions:2]]
 bool isUnwrapped() const noexcept {
  return info.state == Ready
      || info.state == SelfSufficient
      ;
 }
 // Member functions:2 ends here
 // [[file:~/cc4s/src/atrip/complex/atrip.org::*Member%20functions][Member functions:3]]
 bool isUnwrappable() const noexcept {
  return isUnwrapped()
      || info.state == Recycled
      || info.state == Dispatched
      ;
 }
 inline bool isDirectlyFetchable() const noexcept {
  return info.state == Ready || info.state == Dispatched;
 }
 void free() noexcept {
  info.tuple      = {0, 0};
  info.type       = Blank;
  info.state      = Acceptor;
  info.from       = {0, 0};
  info.recycling  = Blank;
  data            = nullptr;
 }
 inline bool isFree() const noexcept {
  return info.tuple       == PartialTuple{0, 0}
      && info.type        == Blank
      && info.state       == Acceptor
      && info.from.rank   == 0
      && info.from.source == 0
      && info.recycling   == Blank
      && data             == nullptr
       ;
 }
 // Member functions:3 ends here
 // [[file:~/cc4s/src/atrip/complex/atrip.org::*Member%20functions][Member functions:4]]
 inline bool isRecyclable() const noexcept {
  return (  info.state == Dispatched
         || info.state == Ready
         || info.state == Fetch
         )
      && hasValidDataPointer()
      ;
 }
 // Member functions:4 ends here
 // [[file:~/cc4s/src/atrip/complex/atrip.org::*Member%20functions][Member functions:5]]
 inline bool hasValidDataPointer() const noexcept {
  return data       != nullptr
      && info.state != Acceptor
      && info.type  != Blank
      ;
 }
 // Member functions:5 ends here
 // [[file:~/cc4s/src/atrip/complex/atrip.org::*Member%20functions][Member functions:6]]
 void unwrapAndMarkReady() {
      if (info.state == Ready) return;
      if (info.state != Dispatched)
        throw
@@ -447,17 +458,20 @@ struct Slice {
                << "\n";
 #endif
    }
 // Member functions:6 ends here
-    Slice(size_t size_)
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Epilog][Epilog:1]]
-      : info({})
+Slice(size_t size_)
-      , data(nullptr)
+    : info({})
-      , size(size_)
+    , data(nullptr)
-      {}
+    , size(size_)
    {}
-  }; // struct Slice
+}; // struct Slice
-
+// Epilog:1 ends here
 // [[file:~/cc4s/src/atrip/complex/atrip.org::*Debug][Debug:1]]
 template <typename F=double>
 std::ostream& operator<<(std::ostream& out, typename Slice<F>::Location const& v) {
  // TODO: remove me
@@ -476,4 +490,4 @@ std::ostream& operator<<(std::ostream& out, typename Slice<F>::Info const& i) {
 }
 } // namespace atrip
-// The slice:2 ends here
+// Debug:1 ends here
--- a/include/atrip/SliceUnion.hpp
+++ b/include/atrip/SliceUnion.hpp
@@ -179,8 +179,14 @@ namespace atrip {
          if (blank.info.state == Slice<F>::SelfSufficient) {
            blank.data = sources[from.source].data();
          } else {
-            if (freePointers.size() == 0)
+            if (freePointers.size() == 0) {
-              throw std::domain_error("No more free pointers!");
+              std::stringstream stream;
              stream << "No more free pointers "
                     << "for type " << type
                     << " and name " << name
                      ;
              throw std::domain_error(stream.str());
            }
            auto dataPointer = freePointers.begin();
            freePointers.erase(dataPointer);
            blank.data = *dataPointer;
@@ -314,7 +320,8 @@ namespace atrip {
          // at this point, let us blank the slice
          WITH_RANK << "~~~:cl(" << name << ")"
                    << " freeing up slice "
-                    // TODO: make this possible
+                    // TODO: make this possible because of Templates
                    // TODO: there is a deduction error here
                    // << " info " << slice.info
                    << "\n";
          slice.free();
@@ -334,7 +341,7 @@ namespace atrip {
              , typename Slice<F>::Name name_
              , size_t nSliceBuffers = 4
              )
-              : rankMap(paramLength, np)
+              : rankMap(paramLength, np, global_world)
              , world(child_world)
              , universe(global_world)
              , sliceLength(sliceLength_)
@@ -353,7 +360,7 @@ namespace atrip {
      slices
        = std::vector<Slice<F>>(2 * sliceTypes.size(), { sources[0].size() });
-      // TODO: think exactly ^------------------- about this number
+      // TODO: think exactly    ^------------------- about this number
      // initialize the freePointers with the pointers to the buffers
      std::transform(sliceBuffers.begin(), sliceBuffers.end(),
@@ -421,10 +428,11 @@ namespace atrip {
     * \brief Send asynchronously only if the state is Fetch
     */
    void send( size_t otherRank
-             , typename Slice<F>::Info const& info
+             , typename Slice<F>::LocalDatabaseElement const& el
             , size_t tag) const noexcept {
      MPI_Request request;
      bool sendData_p = false;
      auto const& info = el.info;
      if (info.state == Slice<F>::Fetch) sendData_p = true;
      // TODO: remove this because I have SelfSufficient
@@ -539,8 +547,11 @@ namespace atrip {
                      [&name](SliceUnion<F> const* s) {
                        return name == s->name;
                      });
-      if (sliceUnionIt == unions.end())
+      if (sliceUnionIt == unions.end()) {
-        throw std::domain_error("SliceUnion not found!");
+        std::stringstream stream;
        stream << "SliceUnion(" << name << ") not found!";
        throw std::domain_error(stream.str());
      }
      return **sliceUnionIt;
  }
--- a/include/atrip/Tuples.hpp
+++ b/include/atrip/Tuples.hpp
@@ -1,75 +1,538 @@
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Tuples][Tuples:1]]
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Prolog][Prolog:1]]
 #pragma once
 #include <vector>
 #include <array>
 #include <numeric>
 // TODO: remove some
 #include <stdio.h>
 #include <math.h>
 #include <algorithm>
 #include <map>
 #include <cassert>
 #include <chrono>
 #include <climits>
 #include <mpi.h>
 #include <atrip/Utils.hpp>
 #include <atrip/Debug.hpp>
 namespace atrip {
 // Prolog:1 ends here
-  using ABCTuple = std::array<size_t, 3>;
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Tuples%20types][Tuples types:1]]
-  using PartialTuple = std::array<size_t, 2>;
+using ABCTuple = std::array<size_t, 3>;
-  using ABCTuples = std::vector<ABCTuple>;
+using PartialTuple = std::array<size_t, 2>;
 using ABCTuples = std::vector<ABCTuple>;
-  ABCTuples getTuplesList(size_t Nv) {
+constexpr ABCTuple FAKE_TUPLE = {0, 0, 0};
-    const size_t n = Nv * (Nv + 1) * (Nv + 2) / 6 - Nv;
+constexpr ABCTuple INVALID_TUPLE = {1, 1, 1};
-    ABCTuples result(n);
+// Tuples types:1 ends here
    size_t u(0);
-    for (size_t a(0); a < Nv; a++)
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Distributing%20the%20tuples][Distributing the tuples:1]]
-    for (size_t b(a); b < Nv; b++)
+struct TuplesDistribution {
-    for (size_t c(b); c < Nv; c++){
+  virtual ABCTuples getTuples(size_t Nv, MPI_Comm universe) = 0;
-      if ( a == b && b == c ) continue;
+  virtual bool tupleIsFake(ABCTuple const& t) { return t == FAKE_TUPLE; }
-      result[u++] = {a, b, c};
+};
-    }
+// Distributing the tuples:1 ends here
-    return result;
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Node%20information][Node information:1]]
 std::vector<std::string> getNodeNames(MPI_Comm comm){
  int rank, np;
  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &np);
  std::vector<std::string> nodeList(np);
  char nodeName[MPI_MAX_PROCESSOR_NAME]
     , nodeNames[np*MPI_MAX_PROCESSOR_NAME]
     ;
  std::vector<int> nameLengths(np)
                 , off(np)
                 ;
  int nameLength;
  MPI_Get_processor_name(nodeName, &nameLength);
  MPI_Allgather(&nameLength,
                1,
                MPI_INT,
                nameLengths.data(),
                1,
                MPI_INT,
                comm);
  for (int i(1); i < np; i++)
    off[i] = off[i-1] + nameLengths[i-1];
  MPI_Allgatherv(nodeName,
                 nameLengths[rank],
                 MPI_BYTE,
                 nodeNames,
                 nameLengths.data(),
                 off.data(),
                 MPI_BYTE,
                 comm);
  for (int i(0); i < np; i++) {
    std::string const s(&nodeNames[off[i]], nameLengths[i]);
    nodeList[i] = s;
  }
  return nodeList;
 }
 // Node information:1 ends here
 // [[file:~/cc4s/src/atrip/complex/atrip.org::*Node%20information][Node information:2]]
 struct RankInfo {
  const std::string name;
  const size_t nodeId;
  const size_t globalRank;
  const size_t localRank;
  const size_t ranksPerNode;
 };
-  std::pair<size_t, size_t>
+template <typename A>
-  getABCRange(size_t np, size_t rank, ABCTuples const& tuplesList) {
+A unique(A const &xs) {
-
+  auto result = xs;
-    std::vector<size_t> n_tuples_per_rank(np, tuplesList.size()/np);
+  std::sort(std::begin(result), std::end(result));
-    const size_t
+  auto const& last = std::unique(std::begin(result), std::end(result));
-        // how many valid tuples should we still verteilen to nodes
+  result.erase(last, std::end(result));
-        // since the number of tuples is not divisible by the number of nodes
+  return result;
-        nRoundRobin = tuplesList.size() % np
+}
        // every node must have the sanme amount of tuples in order for the
        // other nodes to receive and send somewhere, therefore
        // some nodes will get extra tuples but that are dummy tuples
      , nExtraInvalid = (np - nRoundRobin) % np
      ;
    if (nRoundRobin) for (int i = 0; i < np; i++) n_tuples_per_rank[i]++;
  #if defined(TODO)
    assert( tuplesList.size()
            ==
            ( std::accumulate(n_tuples_per_rank.begin(),
                              n_tuples_per_rank.end(),
                              0UL,
                              std::plus<size_t>())
            + nExtraInvalid
            ));
  #endif
    WITH_RANK << "nRoundRobin = " << nRoundRobin << "\n";
    WITH_RANK << "nExtraInvalid = " << nExtraInvalid << "\n";
    WITH_RANK << "ntuples = " << n_tuples_per_rank[rank] << "\n";
    auto const& it = n_tuples_per_rank.begin();
    return
      { std::accumulate(it, it + rank    , 0)
      , std::accumulate(it, it + rank + 1, 0)
      };
 std::vector<RankInfo>
 getNodeInfos(std::vector<string> const& nodeNames) {
  std::vector<RankInfo> result;
  auto const uniqueNames = unique(nodeNames);
  auto const index = [&uniqueNames](std::string const& s) {
    auto const& it = std::find(uniqueNames.begin(), uniqueNames.end(), s);
    return std::distance(uniqueNames.begin(), it);
  };
  std::vector<size_t> localRanks(uniqueNames.size(), 0);
  size_t globalRank = 0;
  for (auto const& name: nodeNames) {
    const size_t nodeId = index(name);
    result.push_back({name,
                      nodeId,
                      globalRank++,
                      localRanks[nodeId]++,
                      std::count(nodeNames.begin(),
                                 nodeNames.end(),
                                 name)
                      });
  }
  return result;
 }
 struct ClusterInfo {
  const size_t nNodes, np, ranksPerNode;
  const std::vector<RankInfo> rankInfos;
 };
 ClusterInfo
 getClusterInfo(MPI_Comm comm) {
  auto const names = getNodeNames(comm);
  auto const rankInfos = getNodeInfos(names);
  return ClusterInfo {
    unique(names).size(),
    names.size(),
    rankInfos[0].ranksPerNode,
    rankInfos
  };
 }
-// Tuples:1 ends here
+// Node information:2 ends here
 // [[file:~/cc4s/src/atrip/complex/atrip.org::*Naive%20list][Naive list:1]]
 ABCTuples getTuplesList(size_t Nv, size_t rank, size_t np) {
  const size_t
    // total number of tuples for the problem
       n = Nv * (Nv + 1) * (Nv + 2) / 6 - Nv
    // all ranks should have the same number of tuples_per_rank
    , tuples_per_rank = n / np + size_t(n % np != 0)
    // start index for the global tuples list
    , start = tuples_per_rank * rank
    // end index for the global tuples list
    , end = tuples_per_rank * (rank + 1)
    ;
  LOG(1,"Atrip") << "tuples_per_rank = " << tuples_per_rank << "\n";
  WITH_RANK << "start, end = " << start << ", " << end << "\n";
  ABCTuples result(tuples_per_rank, FAKE_TUPLE);
  for (size_t a(0), r(0), g(0); a < Nv; a++)
  for (size_t b(a);             b < Nv; b++)
  for (size_t c(b);             c < Nv; c++){
    if ( a == b && b == c ) continue;
    if ( start <= g && g < end) result[r++] = {a, b, c};
    g++;
  }
  return result;
 }
 // Naive list:1 ends here
 // [[file:~/cc4s/src/atrip/complex/atrip.org::*Naive%20list][Naive list:2]]
 ABCTuples getAllTuplesList(const size_t Nv) {
  const size_t n = Nv * (Nv + 1) * (Nv + 2) / 6 - Nv;
  ABCTuples result(n);
  for (size_t a(0), u(0); a < Nv; a++)
  for (size_t b(a); b < Nv; b++)
  for (size_t c(b); c < Nv; c++){
    if ( a == b && b == c ) continue;
    result[u++] = {a, b, c};
  }
  return result;
 }
 // Naive list:2 ends here
 // [[file:~/cc4s/src/atrip/complex/atrip.org::*Naive%20list][Naive list:3]]
 struct NaiveDistribution : public TuplesDistribution {
  ABCTuples getTuples(size_t Nv, MPI_Comm universe) override {
    int rank, np;
    MPI_Comm_rank(universe, &rank);
    MPI_Comm_size(universe, &np);
    return getTuplesList(Nv, (size_t)rank, (size_t)np);
  }
 };
 // Naive list:3 ends here
 // [[file:~/cc4s/src/atrip/complex/atrip.org::*Prolog][Prolog:1]]
 namespace group_and_sort {
 // Prolog:1 ends here
 // [[file:~/cc4s/src/atrip/complex/atrip.org::*Utils][Utils:1]]
 // Provides the node on which the slice-element is found
 // Right now we distribute the slices in a round robin fashion
 // over the different nodes (NOTE: not mpi ranks but nodes)
 inline
 size_t isOnNode(size_t tuple, size_t nNodes) { return tuple % nNodes; }
 // return the node (or all nodes) where the elements of this
 // tuple are located
 std::vector<size_t> getTupleNodes(ABCTuple const& t, size_t nNodes) {
  std::vector<size_t>
    nTuple = { isOnNode(t[0], nNodes)
             , isOnNode(t[1], nNodes)
             , isOnNode(t[2], nNodes)
             };
  return unique(nTuple);
 }
 struct Info {
  size_t nNodes;
  size_t nodeId;
 };
 // Utils:1 ends here
 // [[file:~/cc4s/src/atrip/complex/atrip.org::*Distribution][Distribution:1]]
 ABCTuples specialDistribution(Info const& info, ABCTuples const& allTuples) {
  ABCTuples nodeTuples;
  size_t const nNodes(info.nNodes);
  std::vector<ABCTuples>
      container1d(nNodes)
    , container2d(nNodes * nNodes)
    , container3d(nNodes * nNodes * nNodes)
    ;
  if (info.nodeId == 0)
    std::cout << "\tGoing through all "
              << allTuples.size()
              << " tuples in "
              << nNodes
              << " nodes\n";
  // build container-n-d's
  for (auto const& t: allTuples) {
    // one which node(s) are the tuple elements located...
    // put them into the right container
    auto const _nodes = getTupleNodes(t, nNodes);
    switch (_nodes.size()) {
      case 1:
        container1d[_nodes[0]].push_back(t);
        break;
      case 2:
        container2d[ _nodes[0]
                   + _nodes[1] * nNodes
                   ].push_back(t);
        break;
      case 3:
        container3d[ _nodes[0]
                   + _nodes[1] * nNodes
                   + _nodes[2] * nNodes * nNodes
                   ].push_back(t);
        break;
    }
  }
  if (info.nodeId == 0)
    std::cout << "\tBuilding 1-d containers\n";
  // DISTRIBUTE 1-d containers
  // every tuple which is only located at one node belongs to this node
  {
    auto const& _tuples = container1d[info.nodeId];
    nodeTuples.resize(_tuples.size(), INVALID_TUPLE);
    std::copy(_tuples.begin(), _tuples.end(), nodeTuples.begin());
  }
  if (info.nodeId == 0)
    std::cout << "\tBuilding 2-d containers\n";
  // DISTRIBUTE 2-d containers
  //the tuples which are located at two nodes are half/half given to these nodes
  for (size_t yx = 0; yx < container2d.size(); yx++) {
    auto const& _tuples = container2d[yx];
      const
    size_t idx = yx % nNodes
         // remeber: yx = idy * nNodes + idx
         , idy = yx / nNodes
         , n_half = _tuples.size() / 2
         , size = nodeTuples.size()
         ;
    size_t nbeg, nend;
    if (info.nodeId == idx) {
      nbeg = 0 * n_half;
      nend = n_half;
    } else if (info.nodeId == idy) {
      nbeg = 1 * n_half;
      nend = _tuples.size();
    } else {
      // either idx or idy is my node
      continue;
    }
    size_t const nextra = nend - nbeg;
    nodeTuples.resize(size + nextra, INVALID_TUPLE);
    std::copy(_tuples.begin() + nbeg,
              _tuples.begin() + nend,
              nodeTuples.begin() + size);
  }
  if (info.nodeId == 0)
    std::cout << "\tBuilding 3-d containers\n";
  // DISTRIBUTE 3-d containers
  for (size_t zyx = 0; zyx < container3d.size(); zyx++) {
    auto const& _tuples = container3d[zyx];
      const
    size_t idx = zyx % nNodes
         , idy = (zyx / nNodes) % nNodes
         // remember: zyx = idx + idy * nNodes + idz * nNodes^2
         , idz = zyx / nNodes / nNodes
         , n_third = _tuples.size() / 3
         , size = nodeTuples.size()
         ;
    size_t nbeg, nend;
    if (info.nodeId == idx) {
      nbeg = 0 * n_third;
      nend = 1 * n_third;
    } else if (info.nodeId == idy) {
      nbeg = 1 * n_third;
      nend = 2 * n_third;
    } else if (info.nodeId == idz) {
      nbeg = 2 * n_third;
      nend = _tuples.size();
    } else {
      // either idx or idy or idz is my node
      continue;
    }
    size_t const nextra = nend - nbeg;
    nodeTuples.resize(size + nextra, INVALID_TUPLE);
    std::copy(_tuples.begin() + nbeg,
              _tuples.begin() + nend,
              nodeTuples.begin() + size);
  }
  if (info.nodeId == 0) std::cout << "\tswapping tuples...\n";
  /*
   *  sort part of group-and-sort algorithm
   *  every tuple on a given node is sorted in a way that
   *  the 'home elements' are the fastest index.
   *  1:yyy 2:yyn(x) 3:yny(x) 4:ynn(x) 5:nyy 6:nyn(x) 7:nny 8:nnn
   */
  for (auto &nt: nodeTuples){
    if ( isOnNode(nt[0], nNodes) == info.nodeId ){ // 1234
      if ( isOnNode(nt[2], nNodes) != info.nodeId ){ // 24
        size_t const x(nt[0]);
        nt[0] = nt[2];         // switch first and last
        nt[2] = x;
      }
      else if ( isOnNode(nt[1], nNodes) != info.nodeId){ // 3
        size_t const x(nt[0]);
        nt[0] = nt[1];         // switch first two
        nt[1] = x;
      }
    } else {
      if ( isOnNode(nt[1], nNodes) == info.nodeId   // 56
        && isOnNode(nt[2], nNodes) != info.nodeId
        ) { // 6
        size_t const x(nt[1]);
        nt[1] = nt[2];         // switch last two
        nt[2] = x;
      }
    }
  }
  if (info.nodeId == 0) std::cout << "\tsorting list of tuples...\n";
  //now we sort the list of tuples
  std::sort(nodeTuples.begin(), nodeTuples.end());
  if (info.nodeId == 0) std::cout << "\trestoring tuples...\n";
  // we bring the tuples abc back in the order a<b<c
  for (auto &t: nodeTuples)  std::sort(t.begin(), t.end());
 #if ATRIP_DEBUG > 1
  if (info.nodeId == 0)
  std::cout << "checking for validity of " << nodeTuples.size() << std::endl;
  const bool anyInvalid
    = std::any_of(nodeTuples.begin(),
                  nodeTuples.end(),
                  [](ABCTuple const& t) { return t == INVALID_TUPLE; });
  if (anyInvalid) throw "Some tuple is invalid in group-and-sort algorithm";
 #endif
  if (info.nodeId == 0) std::cout << "\treturning tuples...\n";
  return nodeTuples;
 }
 // Distribution:1 ends here
 // [[file:~/cc4s/src/atrip/complex/atrip.org::*Main][Main:1]]
 std::vector<ABCTuple> main(MPI_Comm universe, size_t Nv) {
  int rank, np;
  MPI_Comm_rank(universe, &rank);
  MPI_Comm_size(universe, &np);
  std::vector<ABCTuple> result;
  auto const nodeNames(getNodeNames(universe));
  size_t const nNodes = unique(nodeNames).size();
  auto const nodeInfos = getNodeInfos(nodeNames);
  // We want to construct a communicator which only contains of one
  // element per node
  bool const computeDistribution
    = nodeInfos[rank].localRank == 0;
  std::vector<ABCTuple>
    nodeTuples
      = computeDistribution
      ? specialDistribution(Info{nNodes, nodeInfos[rank].nodeId},
                            getAllTuplesList(Nv))
      : std::vector<ABCTuple>()
      ;
  LOG(1,"Atrip") << "got nodeTuples\n";
  // now we have to send the data from **one** rank on each node
  // to all others ranks of this node
    const
  int color = nodeInfos[rank].nodeId
    , key = nodeInfos[rank].localRank
    ;
  MPI_Comm INTRA_COMM;
  MPI_Comm_split(universe, color, key, &INTRA_COMM);
 // Main:1 ends here
 // [[file:~/cc4s/src/atrip/complex/atrip.org::*Main][Main:2]]
 size_t const
  tuplesPerRankLocal
     = nodeTuples.size() / nodeInfos[rank].ranksPerNode
     + size_t(nodeTuples.size() % nodeInfos[rank].ranksPerNode != 0)
     ;
 size_t tuplesPerRankGlobal;
 MPI_Reduce(&tuplesPerRankLocal,
           &tuplesPerRankGlobal,
           1,
           MPI_UINT64_T,
           MPI_MAX,
           0,
           universe);
 MPI_Bcast(&tuplesPerRankGlobal,
          1,
          MPI_UINT64_T,
          0,
          universe);
 LOG(1,"Atrip") << "Tuples per rank: " << tuplesPerRankGlobal << "\n";
 LOG(1,"Atrip") << "ranks per node " << nodeInfos[rank].ranksPerNode << "\n";
 LOG(1,"Atrip") << "#nodes " << nNodes << "\n";
 // Main:2 ends here
 // [[file:~/cc4s/src/atrip/complex/atrip.org::*Main][Main:3]]
 size_t const totalTuples
  = tuplesPerRankGlobal * nodeInfos[rank].ranksPerNode;
 if (computeDistribution) {
  // pad with FAKE_TUPLEs
  nodeTuples.insert(nodeTuples.end(),
                    totalTuples - nodeTuples.size(),
                    FAKE_TUPLE);
 }
 // Main:3 ends here
 // [[file:~/cc4s/src/atrip/complex/atrip.org::*Main][Main:4]]
 {
  // construct mpi type for abctuple
  MPI_Datatype MPI_ABCTUPLE;
  MPI_Type_vector(nodeTuples[0].size(), 1, 1, MPI_UINT64_T, &MPI_ABCTUPLE);
  MPI_Type_commit(&MPI_ABCTUPLE);
  LOG(1,"Atrip") << "scattering tuples \n";
  result.resize(tuplesPerRankGlobal);
  MPI_Scatter(nodeTuples.data(),
              tuplesPerRankGlobal,
              MPI_ABCTUPLE,
              result.data(),
              tuplesPerRankGlobal,
              MPI_ABCTUPLE,
              0,
              INTRA_COMM);
  MPI_Type_free(&MPI_ABCTUPLE);
 }
 // Main:4 ends here
 // [[file:~/cc4s/src/atrip/complex/atrip.org::*Main][Main:5]]
 return result;
 }
 // Main:5 ends here
 // [[file:~/cc4s/src/atrip/complex/atrip.org::*Interface][Interface:1]]
 struct Distribution : public TuplesDistribution {
  ABCTuples getTuples(size_t Nv, MPI_Comm universe) override {
    return main(universe, Nv);
  }
 };
 // Interface:1 ends here
 // [[file:~/cc4s/src/atrip/complex/atrip.org::*Epilog][Epilog:1]]
 } // namespace group_and_sort
 // Epilog:1 ends here
 // [[file:~/cc4s/src/atrip/complex/atrip.org::*Epilog][Epilog:1]]
 }
 // Epilog:1 ends here
--- a/include/atrip/Unions.hpp
+++ b/include/atrip/Unions.hpp
@@ -59,7 +59,7 @@ namespace atrip {
                          , child_world
                          , global_world
                          , Slice<F>::TA
-                          , 4) {
+                          , 6) {
           init(sourceTensor);
         }
@@ -97,7 +97,7 @@ namespace atrip {
                         , child_world
                         , global_world
                         , Slice<F>::VIJKA
-                         , 4) {
+                         , 6) {
           init(sourceTensor);
         }
--- a/include/atrip/Utils.hpp
+++ b/include/atrip/Utils.hpp
@@ -1,4 +1,4 @@
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Utils][Utils:1]]
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Prolog][Prolog:1]]
 #pragma once
 #include <sstream>
 #include <string>
@@ -6,32 +6,41 @@
 #include <chrono>
 #include <ctf.hpp>
 #include <atrip/Debug.hpp>
 namespace atrip {
 // Prolog:1 ends here
-
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Pretty%20printing][Pretty printing:1]]
-  template <typename T>
+template <typename T>
  std::string pretty_print(T&& value) {
    std::stringstream stream;
-#if ATRIP_DEBUG > 1
+#if ATRIP_DEBUG > 2
    dbg::pretty_print(stream, std::forward<T>(value));
 #endif
    return stream.str();
  }
 // Pretty printing:1 ends here
-#define WITH_CHRONO(__chrono, ...) \
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Chrono][Chrono:1]]
-  __chrono.start(); __VA_ARGS__ __chrono.stop();
+#define WITH_CHRONO(__chrono_name, ...)         \
  Atrip::chrono[__chrono_name].start();         \
  __VA_ARGS__                                   \
  Atrip::chrono[__chrono_name].stop();
-  struct Timer {
+struct Timer {
-    using Clock = std::chrono::high_resolution_clock;
+  using Clock = std::chrono::high_resolution_clock;
-    using Event = std::chrono::time_point<Clock>;
+  using Event = std::chrono::time_point<Clock>;
-    std::chrono::duration<double> duration;
+  std::chrono::duration<double> duration;
-    Event _start;
+  Event _start;
-    inline void start() noexcept { _start = Clock::now(); }
+  inline void start() noexcept { _start = Clock::now(); }
-    inline void stop() noexcept { duration += Clock::now() - _start; }
+  inline void stop() noexcept { duration += Clock::now() - _start; }
-    inline void clear() noexcept { duration *= 0; }
+  inline void clear() noexcept { duration *= 0; }
-    inline double count() const noexcept { return duration.count(); }
+  inline double count() const noexcept { return duration.count(); }
-  };
+};
-  using Timings = std::map<std::string, Timer>;
+using Timings = std::map<std::string, Timer>;
 // Chrono:1 ends here
 // [[file:~/cc4s/src/atrip/complex/atrip.org::*Epilog][Epilog:1]]
 }
-// Utils:1 ends here
+// Epilog:1 ends here
--- a/src/atrip/Atrip.cxx
+++ b/src/atrip/Atrip.cxx
@@ -9,8 +9,11 @@
 using namespace atrip;
 bool RankMap<Complex>::RANK_ROUND_ROBIN;
 bool RankMap<double>::RANK_ROUND_ROBIN;
 int Atrip::rank;
 int Atrip::np;
 Timings Atrip::chrono;
 // user printing block
 IterationDescriptor IterationDescription::descriptor;
@@ -30,28 +33,35 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
  const int rank = Atrip::rank;
  MPI_Comm universe = in.ei->wrld->comm;
  // Timings in seconds ================================================{{{1
  Timings chrono{};
  const size_t No = in.ei->lens[0];
  const size_t Nv = in.ea->lens[0];
  LOG(0,"Atrip") << "No: " << No << "\n";
  LOG(0,"Atrip") << "Nv: " << Nv << "\n";
  LOG(0,"Atrip") << "np: " << np << "\n";
  // allocate the three scratches, see piecuch
-  std::vector<F>   Tijk(No*No*No) // doubles only (see piecuch)
+  std::vector<F> Tijk(No*No*No) // doubles only (see piecuch)
-                 , Zijk(No*No*No) // singles + doubles (see piecuch)
+               , Zijk(No*No*No) // singles + doubles (see piecuch)
-                 // we need local copies of the following tensors on every
+               // we need local copies of the following tensors on every
-                 // rank
+               // rank
-                 , epsi(No)
+               , epsi(No)
-                 , epsa(Nv)
+               , epsa(Nv)
-                 , Tai(No * Nv)
+               , Tai(No * Nv)
-                 ;
+               ;
  in.ei->read_all(epsi.data());
  in.ea->read_all(epsa.data());
  in.Tph->read_all(Tai.data());
  RankMap<F>::RANK_ROUND_ROBIN = in.rankRoundRobin;
  if (RankMap<F>::RANK_ROUND_ROBIN) {
    LOG(0,"Atrip") << "Doing rank round robin slices distribution" << "\n";
  } else {
    LOG(0,"Atrip")
      << "Doing node > local rank round robin slices distribution" << "\n";
  }
  // COMMUNICATOR CONSTRUCTION ========================================={{{1
  //
  // Construct a new communicator living only on a single rank
@@ -72,41 +82,49 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
  }
  chrono["nv-slices"].start();
  // BUILD SLICES PARAMETRIZED BY NV ==================================={{{1
-  LOG(0,"Atrip") << "BUILD NV-SLICES\n";
+  WITH_CHRONO("nv-slices",
-  TAPHH<F> taphh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
+    LOG(0,"Atrip") << "BUILD NV-SLICES\n";
-  HHHA<F>  hhha(*in.Vhhhp, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
+    TAPHH<F> taphh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
-  chrono["nv-slices"].stop();
+    HHHA<F>  hhha(*in.Vhhhp, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
  )
  chrono["nv-nv-slices"].start();
  // BUILD SLICES PARAMETRIZED BY NV x NV =============================={{{1
-  LOG(0,"Atrip") << "BUILD NV x NV-SLICES\n";
+  WITH_CHRONO("nv-nv-slices",
-  ABPH<F> abph(*in.Vppph, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
+    LOG(0,"Atrip") << "BUILD NV x NV-SLICES\n";
-  ABHH<F> abhh(*in.Vpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
+    ABPH<F> abph(*in.Vppph, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
-  TABHH<F> tabhh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
+    ABHH<F> abhh(*in.Vpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
-  chrono["nv-nv-slices"].stop();
+    TABHH<F> tabhh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
  )
  // all tensors
  std::vector< SliceUnion<F>* > unions = {&taphh, &hhha, &abph, &abhh, &tabhh};
-  //CONSTRUCT TUPLE LIST ==============================================={{{1
+  // get tuples for the current rank
-  LOG(0,"Atrip") << "BUILD TUPLE LIST\n";
+  TuplesDistribution *distribution;
  const auto tuplesList = std::move(getTuplesList(Nv));
  WITH_RANK << "tupList.size() = " << tuplesList.size() << "\n";
-  // GET ABC INDEX RANGE FOR RANK ======================================{{{1
+  if (in.tuplesDistribution == Atrip::Input<F>::TuplesDistribution::NAIVE) {
-  auto abcIndex = getABCRange(np, rank, tuplesList);
+    LOG(0,"Atrip") << "Using the naive distribution\n";
-  size_t nIterations = abcIndex.second - abcIndex.first;
+    distribution = new NaiveDistribution();
  } else {
    LOG(0,"Atrip") << "Using the group-and-sort distribution\n";
    distribution = new group_and_sort::Distribution();
  }
-  WITH_RANK << "abcIndex = " << pretty_print(abcIndex) << "\n";
+  LOG(0,"Atrip") << "BUILDING TUPLE LIST\n";
-  LOG(0,"Atrip") << "#iterations: " << nIterations << "\n";
+  WITH_CHRONO("tuples:build",
    auto const tuplesList = distribution->getTuples(Nv, universe);
    )
  const size_t nIterations = tuplesList.size();
-  // first abc
+  {
-  const ABCTuple firstAbc = tuplesList[abcIndex.first];
+    const size_t _all_tuples = Nv * (Nv + 1) * (Nv + 2) / 6 - Nv;
-
+    LOG(0,"Atrip") << "#iterations: "
-
+                  << nIterations
-  double energy(0.);
+                  << "/"
                  << nIterations * np
                  << "\n";
  }
  const size_t
      iterationMod = (in.percentageMod > 0)
@@ -119,7 +137,9 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
  auto const isFakeTuple
-    = [&tuplesList](size_t const i) { return i >= tuplesList.size(); };
+    = [&tuplesList, distribution](size_t const i) {
      return distribution->tupleIsFake(tuplesList[i]);
    };
  using Database = typename Slice<F>::Database;
@@ -127,45 +147,42 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
  auto communicateDatabase
    = [ &unions
      , np
      , &chrono
      ] (ABCTuple const& abc, MPI_Comm const& c) -> Database {
-        chrono["db:comm:type:do"].start();
+        WITH_CHRONO("db:comm:type:do",
-        auto MPI_LDB_ELEMENT = Slice<F>::mpi::localDatabaseElement();
+          auto MPI_LDB_ELEMENT = Slice<F>::mpi::localDatabaseElement();
-        chrono["db:comm:type:do"].stop();
+        )
-        chrono["db:comm:ldb"].start();
+        WITH_CHRONO("db:comm:ldb",
-        LocalDatabase ldb;
+          typename Slice<F>::LocalDatabase ldb;
-
+          for (auto const& tensor: unions) {
-        for (auto const& tensor: unions) {
+            auto const& tensorDb = tensor->buildLocalDatabase(abc);
-          auto const& tensorDb = tensor->buildLocalDatabase(abc);
+            ldb.insert(ldb.end(), tensorDb.begin(), tensorDb.end());
-          ldb.insert(ldb.end(), tensorDb.begin(), tensorDb.end());
+          }
-        }
+        )
        chrono["db:comm:ldb"].stop();
        Database db(np * ldb.size(), ldb[0]);
-        chrono["oneshot-db:comm:allgather"].start();
+        WITH_CHRONO("oneshot-db:comm:allgather",
-        chrono["db:comm:allgather"].start();
+        WITH_CHRONO("db:comm:allgather",
-        MPI_Allgather( ldb.data()
+          MPI_Allgather( ldb.data()
-                     , ldb.size()
+                       , ldb.size()
-                     , MPI_LDB_ELEMENT
+                       , MPI_LDB_ELEMENT
-                     , db.data()
+                       , db.data()
-                     , ldb.size()
+                       , ldb.size()
-                     , MPI_LDB_ELEMENT
+                       , MPI_LDB_ELEMENT
-                     , c);
+                       , c);
-        chrono["db:comm:allgather"].stop();
+        ))
        chrono["oneshot-db:comm:allgather"].stop();
-        chrono["db:comm:type:free"].start();
+        WITH_CHRONO("db:comm:type:free",
-        MPI_Type_free(&MPI_LDB_ELEMENT);
+          MPI_Type_free(&MPI_LDB_ELEMENT);
-        chrono["db:comm:type:free"].stop();
+        )
        return db;
      };
  auto doIOPhase
-    = [&unions, &rank, &np, &universe, &chrono] (Database const& db) {
+    = [&unions, &rank, &np, &universe] (Database const& db) {
    const size_t localDBLength = db.size() / np;
@@ -201,9 +218,9 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
          << "\n"
          ;
-        chrono["db:io:recv"].start();
+        WITH_CHRONO("db:io:recv",
-        u.receive(el.info, recvTag);
+          u.receive(el.info, recvTag);
-        chrono["db:io:recv"].stop();
+        )
      } // recv
    }
@@ -237,9 +254,9 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
          << "\n"
          ;
-        chrono["db:io:send"].start();
+        WITH_CHRONO("db:io:send",
-        u.send(otherRank, el.info, sendTag);
+          u.send(otherRank, el, sendTag);
-        chrono["db:io:send"].stop();
+        )
      } // send phase
@@ -257,31 +274,30 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
    * double(No)
    * double(No)
    * (double(No) + double(Nv))
-    * 2
+    * 2.0
-    * 6
+    * (traits::isComplex<F>() ? 2.0 : 1.0)
    * 6.0
    / 1e9
    ;
  // START MAIN LOOP ======================================================{{{1
-  for ( size_t i = abcIndex.first, iteration = 1
+  double energy(0.);
-      ; i < abcIndex.second
+
  for ( size_t i = 0, iteration = 1
      ; i < tuplesList.size()
      ; i++, iteration++
      ) {
-    chrono["iterations"].start();
+    Atrip::chrono["iterations"].start();
    // check overhead from chrono over all iterations
-    chrono["start:stop"].start(); chrono["start:stop"].stop();
+    WITH_CHRONO("start:stop", {})
    // check overhead of doing a barrier at the beginning
-    chrono["oneshot-mpi:barrier"].start();
+    WITH_CHRONO("oneshot-mpi:barrier",
-    chrono["mpi:barrier"].start();
+    WITH_CHRONO("mpi:barrier",
-    // TODO: REMOVE
+      if (in.barrier) MPI_Barrier(universe);
-    if (in.barrier == 1)
+    ))
    MPI_Barrier(universe);
    chrono["mpi:barrier"].stop();
    chrono["oneshot-mpi:barrier"].stop();
    if (iteration % iterationMod == 0 || iteration == iteration1Percent) {
@@ -289,22 +305,22 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
        IterationDescription::descriptor({
          iteration,
          nIterations,
-          chrono["iterations"].count()
+          Atrip::chrono["iterations"].count()
        });
      }
      LOG(0,"Atrip")
        << "iteration " << iteration
        << " [" << 100 * iteration / nIterations << "%]"
-        << " (" << doublesFlops * iteration / chrono["doubles"].count()
+        << " (" << doublesFlops * iteration / Atrip::chrono["doubles"].count()
        << "GF)"
-        << " (" << doublesFlops * iteration / chrono["iterations"].count()
+        << " (" << doublesFlops * iteration / Atrip::chrono["iterations"].count()
        << "GF)"
        << " ===========================\n";
      // PRINT TIMINGS
      if (in.chrono)
-      for (auto const& pair: chrono)
+      for (auto const& pair: Atrip::chrono)
        LOG(1, " ") << pair.first << " :: "
                    << pair.second.count()
                    << std::endl;
@@ -314,46 +330,43 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
    const ABCTuple abc = isFakeTuple(i)
                       ? tuplesList[tuplesList.size() - 1]
                       : tuplesList[i]
-                 , *abcNext = i == (abcIndex.second - 1)
+                 , *abcNext = i == (tuplesList.size() - 1)
                            ? nullptr
                            : isFakeTuple(i + 1)
                            ? &tuplesList[tuplesList.size() - 1]
                            : &tuplesList[i + 1]
                 ;
-    chrono["with_rank"].start();
+    WITH_CHRONO("with_rank",
-    WITH_RANK << " :it " << iteration
+      WITH_RANK << " :it " << iteration
-              << " :abc " << pretty_print(abc)
+                << " :abc " << pretty_print(abc)
-              << " :abcN "
+                << " :abcN "
-              << (abcNext ? pretty_print(*abcNext) : "None")
+                << (abcNext ? pretty_print(*abcNext) : "None")
-              << "\n";
+                << "\n";
-    chrono["with_rank"].stop();
+    )
    // COMM FIRST DATABASE ================================================{{{1
-    if (i == abcIndex.first) {
+    if (i == 0) {
      WITH_RANK << "__first__:first database ............ \n";
-      const auto __db = communicateDatabase(abc, universe);
+      const auto db = communicateDatabase(abc, universe);
      WITH_RANK << "__first__:first database communicated \n";
      WITH_RANK << "__first__:first database io phase \n";
-      doIOPhase(__db);
+      doIOPhase(db);
      WITH_RANK << "__first__:first database io phase DONE\n";
      WITH_RANK << "__first__::::Unwrapping all slices for first database\n";
      for (auto& u: unions) u->unwrapAll(abc);
-      WITH_RANK << "__first__::::Unwrapping all slices for first database DONE\n";
+      WITH_RANK << "__first__::::Unwrapping slices for first database DONE\n";
      MPI_Barrier(universe);
    }
    // COMM NEXT DATABASE ================================================={{{1
    if (abcNext) {
      WITH_RANK << "__comm__:" << iteration << "th communicating database\n";
-      chrono["db:comm"].start();
+      WITH_CHRONO("db:comm",
-      //const auto db = communicateDatabase(*abcNext, universe);
+        const auto db = communicateDatabase(*abcNext, universe);
-      Database db = communicateDatabase(*abcNext, universe);
+      )
-      chrono["db:comm"].stop();
+      WITH_CHRONO("db:io",
-      chrono["db:io"].start();
+        doIOPhase(db);
-      doIOPhase(db);
+      )
      chrono["db:io"].stop();
      WITH_RANK << "__comm__:" <<  iteration << "th database io phase DONE\n";
    }
@@ -361,63 +374,61 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
    OCD_Barrier(universe);
    if (!isFakeTuple(i)) {
      WITH_RANK << iteration << "-th doubles\n";
-      WITH_CHRONO(chrono["oneshot-unwrap"],
+      WITH_CHRONO("oneshot-unwrap",
-      WITH_CHRONO(chrono["unwrap"],
+      WITH_CHRONO("unwrap",
-      WITH_CHRONO(chrono["unwrap:doubles"],
+      WITH_CHRONO("unwrap:doubles",
        for (auto& u: decltype(unions){&abph, &hhha, &taphh, &tabhh}) {
          u->unwrapAll(abc);
        }
      )))
-      chrono["oneshot-doubles"].start();
+      WITH_CHRONO("oneshot-doubles",
-      chrono["doubles"].start();
+      WITH_CHRONO("doubles",
-      doublesContribution<F>( abc, (size_t)No, (size_t)Nv
+        doublesContribution<F>( abc, (size_t)No, (size_t)Nv
-                            // -- VABCI
+                              // -- VABCI
-                            , abph.unwrapSlice(Slice<F>::AB, abc)
+                              , abph.unwrapSlice(Slice<F>::AB, abc)
-                            , abph.unwrapSlice(Slice<F>::AC, abc)
+                              , abph.unwrapSlice(Slice<F>::AC, abc)
-                            , abph.unwrapSlice(Slice<F>::BC, abc)
+                              , abph.unwrapSlice(Slice<F>::BC, abc)
-                            , abph.unwrapSlice(Slice<F>::BA, abc)
+                              , abph.unwrapSlice(Slice<F>::BA, abc)
-                            , abph.unwrapSlice(Slice<F>::CA, abc)
+                              , abph.unwrapSlice(Slice<F>::CA, abc)
-                            , abph.unwrapSlice(Slice<F>::CB, abc)
+                              , abph.unwrapSlice(Slice<F>::CB, abc)
-                            // -- VHHHA
+                              // -- VHHHA
-                            , hhha.unwrapSlice(Slice<F>::A, abc)
+                              , hhha.unwrapSlice(Slice<F>::A, abc)
-                            , hhha.unwrapSlice(Slice<F>::B, abc)
+                              , hhha.unwrapSlice(Slice<F>::B, abc)
-                            , hhha.unwrapSlice(Slice<F>::C, abc)
+                              , hhha.unwrapSlice(Slice<F>::C, abc)
-                            // -- TA
+                              // -- TA
-                            , taphh.unwrapSlice(Slice<F>::A, abc)
+                              , taphh.unwrapSlice(Slice<F>::A, abc)
-                            , taphh.unwrapSlice(Slice<F>::B, abc)
+                              , taphh.unwrapSlice(Slice<F>::B, abc)
-                            , taphh.unwrapSlice(Slice<F>::C, abc)
+                              , taphh.unwrapSlice(Slice<F>::C, abc)
-                            // -- TABIJ
+                              // -- TABIJ
-                            , tabhh.unwrapSlice(Slice<F>::AB, abc)
+                              , tabhh.unwrapSlice(Slice<F>::AB, abc)
-                            , tabhh.unwrapSlice(Slice<F>::AC, abc)
+                              , tabhh.unwrapSlice(Slice<F>::AC, abc)
-                            , tabhh.unwrapSlice(Slice<F>::BC, abc)
+                              , tabhh.unwrapSlice(Slice<F>::BC, abc)
-                            // -- TIJK
+                              // -- TIJK
-                            , Tijk.data()
+                              , Tijk.data()
-                            , chrono
+                              );
-                            );
+        WITH_RANK << iteration << "-th doubles done\n";
-      WITH_RANK << iteration << "-th doubles done\n";
+      ))
      chrono["doubles"].stop();
      chrono["oneshot-doubles"].stop();
    }
    // COMPUTE SINGLES =================================================== {{{1
    OCD_Barrier(universe);
    if (!isFakeTuple(i)) {
-      WITH_CHRONO(chrono["oneshot-unwrap"],
+      WITH_CHRONO("oneshot-unwrap",
-      WITH_CHRONO(chrono["unwrap"],
+      WITH_CHRONO("unwrap",
-      WITH_CHRONO(chrono["unwrap:singles"],
+      WITH_CHRONO("unwrap:singles",
        abhh.unwrapAll(abc);
      )))
-      chrono["reorder"].start();
+      WITH_CHRONO("reorder",
-      for (size_t I(0); I < Zijk.size(); I++) Zijk[I] = Tijk[I];
+        for (size_t I(0); I < Zijk.size(); I++) Zijk[I] = Tijk[I];
-      chrono["reorder"].stop();
+      )
-      chrono["singles"].start();
+      WITH_CHRONO("singles",
      singlesContribution<F>( No, Nv, abc
                            , Tai.data()
                            , abhh.unwrapSlice(Slice<F>::AB, abc)
                            , abhh.unwrapSlice(Slice<F>::AC, abc)
                            , abhh.unwrapSlice(Slice<F>::BC, abc)
                            , Zijk.data());
-      chrono["singles"].stop();
+      )
    }
@@ -430,12 +441,12 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
      if (abc[1] == abc[2]) distinct--;
      const F epsabc(epsa[abc[0]] + epsa[abc[1]] + epsa[abc[2]]);
-      chrono["energy"].start();
+      WITH_CHRONO("energy",
-      if ( distinct == 0)
+        if ( distinct == 0)
-        tupleEnergy = getEnergyDistinct<F>(epsabc, epsi, Tijk, Zijk);
+          tupleEnergy = getEnergyDistinct<F>(epsabc, epsi, Tijk, Zijk);
-      else
+        else
-        tupleEnergy = getEnergySame<F>(epsabc, epsi, Tijk, Zijk);
+          tupleEnergy = getEnergySame<F>(epsabc, epsi, Tijk, Zijk);
-      chrono["energy"].stop();
+      )
 #if defined(HAVE_OCD) || defined(ATRIP_PRINT_TUPLES)
      tupleEnergies[abc] = tupleEnergy;
@@ -445,6 +456,7 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
    }
    // TODO: remove this
    if (isFakeTuple(i)) {
      // fake iterations should also unwrap whatever they got
      WITH_RANK << iteration
@@ -466,7 +478,6 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
    // CLEANUP UNIONS ===================================================={{{1
    OCD_Barrier(universe);
    if (abcNext) {
      chrono["gc"].start();
      WITH_RANK << "__gc__:" << iteration << "-th cleaning up.......\n";
      for (auto& u: unions) {
@@ -500,12 +511,11 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
      }
      chrono["gc"].stop();
    }
      WITH_RANK << iteration << "-th cleaning up....... DONE\n";
-    chrono["iterations"].stop();
+    Atrip::chrono["iterations"].stop();
    // ITERATION END ====================================================={{{1
  }
@@ -543,15 +553,15 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
  // PRINT TIMINGS {{{1
  if (in.chrono)
-  for (auto const& pair: chrono)
+  for (auto const& pair: Atrip::chrono)
    LOG(0,"atrip:chrono") << pair.first << " "
                          << pair.second.count() << std::endl;
  LOG(0, "atrip:flops(doubles)")
-    << nIterations * doublesFlops / chrono["doubles"].count() << "\n";
+    << nIterations * doublesFlops / Atrip::chrono["doubles"].count() << "\n";
  LOG(0, "atrip:flops(iterations)")
-    << nIterations * doublesFlops / chrono["iterations"].count() << "\n";
+    << nIterations * doublesFlops / Atrip::chrono["iterations"].count() << "\n";
  // TODO: change the sign in  the getEnergy routines
  return { - globalEnergy };