Add sources explicitly

2021-09-07 16:18:29 +02:00
parent 3d0f8a3a61
commit 1cc41f1dd1
13 changed files with 2460 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,4 @@
 .emacs
 doc/doxygen/
+*.o
+*.d
--- a/Sources.mk
+++ b/Sources.mk
@@ -0,0 +1 @@
+ATRIP_SOURCES = include/atrip.hpp src/atrip/Atrip.cxx include/atrip/Atrip.hpp include/atrip/Debug.hpp include/atrip/Equations.hpp include/atrip/Tuples.hpp include/atrip/Unions.hpp include/atrip/RankMap.hpp include/atrip/Blas.hpp include/atrip/Utils.hpp include/atrip/SliceUnion.hpp include/atrip/Slice.hpp
--- a/include/atrip.hpp
+++ b/include/atrip.hpp
@@ -0,0 +1,5 @@
+// [[file:../atrip.org::*Include header][Include header:1]]
+#pragma once
+
+#include <atrip/Atrip.hpp>
+// Include header:1 ends here
--- a/include/atrip/Atrip.hpp
+++ b/include/atrip/Atrip.hpp
@@ -0,0 +1,48 @@
+// [[file:../../atrip.org::*Atrip][Atrip:1]]
+#pragma once
+#include <sstream>
+#include <string>
+#include <map>
+#include <chrono>
+
+#include <ctf.hpp>
+
+namespace atrip {
+
+  struct Atrip {
+
+    static int rank;
+    static int np;
+    static void init();
+
+    struct Input {
+      CTF::Tensor<double> *ei = nullptr
+                        , *ea = nullptr
+                        , *Tph = nullptr
+                        , *Tpphh = nullptr
+                        , *Vpphh = nullptr
+                        , *Vhhhp = nullptr
+                        , *Vppph = nullptr
+                        ;
+      int maxIterations = 0, iterationMod = -1;
+      bool barrier = true;
+      Input& with_epsilon_i(CTF::Tensor<double> * t) { ei = t; return *this; }
+      Input& with_epsilon_a(CTF::Tensor<double> * t) { ea = t; return *this; }
+      Input& with_Tai(CTF::Tensor<double> * t) { Tph = t; return *this; }
+      Input& with_Tabij(CTF::Tensor<double> * t) { Tpphh = t; return *this; }
+      Input& with_Vabij(CTF::Tensor<double> * t) { Vpphh = t; return *this; }
+      Input& with_Vijka(CTF::Tensor<double> * t) { Vhhhp = t; return *this; }
+      Input& with_Vabci(CTF::Tensor<double> * t) { Vppph = t; return *this; }
+      Input& with_maxIterations(int i) { maxIterations = i; return *this; }
+      Input& with_iterationMod(int i) { iterationMod = i; return *this; }
+      Input& with_barrier(bool i) { barrier = i; return *this; }
+    };
+
+    struct Output {
+      double energy;
+    };
+    static Output run(Input const& in);
+  };
+
+}
+// Atrip:1 ends here
--- a/include/atrip/Blas.hpp
+++ b/include/atrip/Blas.hpp
@@ -0,0 +1,22 @@
+// [[file:../../atrip.org::*Blas][Blas:1]]
+#pragma once
+namespace atrip {
+  extern "C" {
+    void dgemm_(
+      const char *transa,
+      const char *transb,
+      const int *m,
+      const int *n,
+      const int *k,
+      double *alpha,
+      const double *A,
+      const int *lda,
+      const double *B,
+      const int *ldb,
+      double *beta,
+      double *C,
+      const int *ldc
+    );
+  }
+}
+// Blas:1 ends here
--- a/include/atrip/Debug.hpp
+++ b/include/atrip/Debug.hpp
@@ -0,0 +1,60 @@
+// [[file:../../atrip.org::*Debug][Debug:1]]
+#pragma once
+#define ATRIP_BENCHMARK
+#define ATRIP_DONT_SLICE
+#define ATRIP_DEBUG 1
+//#define ATRIP_WORKLOAD_DUMP
+#define ATRIP_USE_DGEMM
+//#define ATRIP_PRINT_TUPLES
+
+#define LOG(level, name) if (Atrip::rank == 0) std::cout << name << ": "
+
+#if ATRIP_DEBUG == 4
+#  pragma message("WARNING: You have OCD debugging ABC triples "\
+                  "expect GB of output and consult your therapist")
+#  include <dbg.h>
+#  define HAVE_OCD
+#  define OCD_Barrier(com) MPI_Barrier(com)
+#  define WITH_OCD
+#  define WITH_ROOT if (atrip::Atrip::rank == 0)
+#  define WITH_SPECIAL(r) if (atrip::Atrip::rank == r)
+#  define WITH_RANK std::cout << atrip::Atrip::rank << ": "
+#  define WITH_CRAZY_DEBUG
+#  define WITH_DBG
+#  define DBG(...) dbg(__VA_ARGS__)
+#elif ATRIP_DEBUG == 3
+#  pragma message("WARNING: You have crazy debugging ABC triples,"\
+                  " expect GB of output")
+#  include <dbg.h>
+#  define OCD_Barrier(com)
+#  define WITH_OCD if (false)
+#  define WITH_ROOT if (atrip::Atrip::rank == 0)
+#  define WITH_SPECIAL(r) if (atrip::Atrip::rank == r)
+#  define WITH_RANK std::cout << atrip::Atrip::rank << ": "
+#  define WITH_CRAZY_DEBUG
+#  define WITH_DBG
+#  define DBG(...) dbg(__VA_ARGS__)
+#elif ATRIP_DEBUG == 2
+#  pragma message("WARNING: You have some debugging info for ABC triples")
+#  include <dbg.h>
+#  define OCD_Barrier(com)
+#  define WITH_OCD if (false)
+#  define WITH_ROOT if (atrip::Atrip::rank == 0)
+#  define WITH_SPECIAL(r) if (atrip::Atrip::rank == r)
+#  define WITH_RANK std::cout << atrip::Atrip::rank << ": "
+#  define WITH_CRAZY_DEBUG if (false)
+#  define WITH_DBG
+#  define DBG(...) dbg(__VA_ARGS__)
+#elif ATRIP_DEBUG == 1
+#  define OCD_Barrier(com)
+#  define WITH_OCD if (false)
+#  define WITH_ROOT if (false)
+#  define WITH_SPECIAL(r) if (false)
+#  define WITH_RANK if (false) std::cout << atrip::Atrip::rank << ": "
+#  define WITH_DBG if (false)
+#  define WITH_CRAZY_DEBUG if (false)
+#  define DBG(...)
+#else
+#  error("ATRIP_DEBUG is not defined!")
+#endif
+// Debug:1 ends here
--- a/include/atrip/Equations.hpp
+++ b/include/atrip/Equations.hpp
@@ -0,0 +1,337 @@
+// [[file:../../atrip.org::*Equations][Equations:1]]
+#pragma once
+
+#include<atrip/Slice.hpp>
+#include<atrip/Blas.hpp>
+
+namespace atrip {
+
+  double getEnergyDistinct
+    ( const double epsabc
+    , std::vector<double> const& epsi
+    , std::vector<double> const& Tijk_
+    , std::vector<double> const& Zijk_
+    ) {
+    constexpr size_t blockSize=16;
+    double energy(0.);
+    const size_t No = epsi.size();
+    for (size_t kk=0; kk<No; kk+=blockSize){
+      const size_t kend( std::min(No, kk+blockSize) );
+      for (size_t jj(kk); jj<No; jj+=blockSize){
+        const size_t jend( std::min( No, jj+blockSize) );
+        for (size_t ii(jj); ii<No; ii+=blockSize){
+          const size_t iend( std::min( No, ii+blockSize) );
+          for (size_t k(kk); k < kend; k++){
+            const double ek(epsi[k]);
+            const size_t jstart = jj > k ? kk : k;
+            for (size_t j(jstart); j < jend; j++){
+              const double ej(epsi[j]);
+              double facjk( j == k ? 0.5 : 1.0);
+              size_t istart = ii > j ? ii : j;
+              for (size_t i(istart); i < iend; i++){
+                const double ei(epsi[i]);
+                double facij ( i==j ? 0.5 : 1.0);
+                double denominator(epsabc - ei - ej - ek);
+                double U(Zijk_[i + No*j + No*No*k]);
+                double V(Zijk_[i + No*k + No*No*j]);
+                double W(Zijk_[j + No*i + No*No*k]);
+                double X(Zijk_[j + No*k + No*No*i]);
+                double Y(Zijk_[k + No*i + No*No*j]);
+                double Z(Zijk_[k + No*j + No*No*i]);
+
+                double A(Tijk_[i + No*j + No*No*k]);
+                double B(Tijk_[i + No*k + No*No*j]);
+                double C(Tijk_[j + No*i + No*No*k]);
+                double D(Tijk_[j + No*k + No*No*i]);
+                double E(Tijk_[k + No*i + No*No*j]);
+                double F(Tijk_[k + No*j + No*No*i]);
+                double value(3.0*(A*U+B*V+C*W+D*X+E*Y+F*Z)
+                            +((U+X+Y)-2.0*(V+W+Z))*(A+D+E)
+                            +((V+W+Z)-2.0*(U+X+Y))*(B+C+F));
+                energy += 2.0*value / denominator * facjk * facij;
+              } // i
+            } // j
+          } // k
+        } // ii
+      } // jj
+    } // kk
+    return energy;
+  }
+
+
+  double getEnergySame
+    ( const double epsabc
+    , std::vector<double> const& epsi
+    , std::vector<double> const& Tijk_
+    , std::vector<double> const& Zijk_
+    ) {
+    constexpr size_t blockSize = 16;
+    const size_t No = epsi.size();
+    double energy(0.);
+    for (size_t kk=0; kk<No; kk+=blockSize){
+      const size_t kend( std::min( kk+blockSize, No) );
+      for (size_t jj(kk); jj<No; jj+=blockSize){
+        const size_t jend( std::min( jj+blockSize, No) );
+        for (size_t ii(jj); ii<No; ii+=blockSize){
+          const size_t iend( std::min( ii+blockSize, No) );
+          for (size_t k(kk); k < kend; k++){
+            const double ek(epsi[k]);
+            const size_t jstart = jj > k ? jj : k;
+            for(size_t j(jstart); j < jend; j++){
+              const double facjk( j == k ? 0.5 : 1.0);
+              const double ej(epsi[j]);
+              const size_t istart = ii > j ? ii : j;
+              for(size_t i(istart); i < iend; i++){
+                double ei(epsi[i]);
+                double facij ( i==j ? 0.5 : 1.0);
+                double denominator(epsabc - ei - ej - ek);
+                double U(Zijk_[i + No*j + No*No*k]);
+                double V(Zijk_[j + No*k + No*No*i]);
+                double W(Zijk_[k + No*i + No*No*j]);
+                double A(Tijk_[i + No*j + No*No*k]);
+                double B(Tijk_[j + No*k + No*No*i]);
+                double C(Tijk_[k + No*i + No*No*j]);
+                double value(3.0*( A*U + B*V + C*W) - (A+B+C)*(U+V+W));
+                energy += 2.0*value / denominator * facjk * facij;
+              } // i
+            } // j
+          } // k
+        } // ii
+      } // jj
+    } // kk
+    return energy;
+  }
+
+  void singlesContribution
+    ( size_t No
+    , size_t Nv
+    , const ABCTuple &abc
+    , double const* Tph
+    , double const* VABij
+    , double const* VACij
+    , double const* VBCij
+    , double *Zijk
+    ) {
+    const size_t a(abc[0]), b(abc[1]), c(abc[2]);
+    for (size_t k=0; k < No; k++)
+    for (size_t i=0; i < No; i++)
+    for (size_t j=0; j < No; j++) {
+      const size_t ijk = i + j*No + k*No*No
+                ,  jk = j + No * k
+                ;
+      Zijk[ijk] += Tph[ a + i * Nv ] * VBCij[ j + k * No ];
+      Zijk[ijk] += Tph[ b + j * Nv ] * VACij[ i + k * No ];
+      Zijk[ijk] += Tph[ c + k * Nv ] * VABij[ i + j * No ];
+    }
+  }
+
+  void doublesContribution
+    ( const ABCTuple &abc
+    , size_t const No
+    , size_t const Nv
+    // -- VABCI
+    , double const* VABph
+    , double const* VACph
+    , double const* VBCph
+    , double const* VBAph
+    , double const* VCAph
+    , double const* VCBph
+    // -- VHHHA
+    , double const* VhhhA
+    , double const* VhhhB
+    , double const* VhhhC
+    // -- TA
+    , double const* TAphh
+    , double const* TBphh
+    , double const* TCphh
+    // -- TABIJ
+    , double const* TABhh
+    , double const* TAChh
+    , double const* TBChh
+    // -- TIJK
+    , double *Tijk
+    , atrip::Timings& chrono
+    ) {
+
+    auto& t_reorder = chrono["doubles:reorder"];
+    const size_t a = abc[0], b = abc[1], c = abc[2]
+              , NoNo = No*No, NoNv = No*Nv
+              ;
+
+  #if defined(ATRIP_USE_DGEMM)
+  #define _IJK_(i, j, k) i + j*No + k*NoNo
+  #define REORDER(__II, __JJ, __KK)                                 \
+    t_reorder.start();                                              \
+    for (size_t k = 0; k < No; k++)                                 \
+    for (size_t j = 0; j < No; j++)                                 \
+    for (size_t i = 0; i < No; i++) {                               \
+      Tijk[_IJK_(i, j, k)] += _t_buffer[_IJK_(__II, __JJ, __KK)];   \
+    }                                                               \
+    t_reorder.stop();
+  #define DGEMM_PARTICLES(__A, __B)    \
+    atrip::dgemm_( "T"                 \
+                , "N"                 \
+                , (int const*)&NoNo   \
+                , (int const*)&No     \
+                , (int const*)&Nv     \
+                , &one                \
+                , __A                 \
+                , (int const*)&Nv     \
+                , __B                 \
+                , (int const*)&Nv     \
+                , &zero               \
+                , _t_buffer.data()    \
+                , (int const*)&NoNo   \
+                );
+  #define DGEMM_HOLES(__A, __B, __TRANSB)  \
+    atrip::dgemm_( "N"                     \
+                , __TRANSB                \
+                , (int const*)&NoNo       \
+                , (int const*)&No         \
+                , (int const*)&No         \
+                , &m_one                  \
+                , __A                     \
+                , (int const*)&NoNo       \
+                , __B                     \
+                , (int const*)&No         \
+                , &zero                   \
+                , _t_buffer.data()        \
+                , (int const*)&NoNo       \
+                );
+
+    using F = double;
+    const size_t NoNoNo = No*NoNo;
+    std::vector<double> _t_buffer;
+    _t_buffer.reserve(NoNoNo);
+    F one{1.0}, m_one{-1.0}, zero{0.0};
+
+    t_reorder.start();
+    for (size_t k = 0; k < NoNoNo; k++) {
+      // zero the Tijk
+      Tijk[k] = 0.0;
+    }
+    t_reorder.stop();
+
+    chrono["doubles:holes"].start();
+    { // Holes part ============================================================
+      // VhhhC[i + k*No + L*NoNo] * TABhh[L + j*No]; H1
+      chrono["doubles:holes:1"].start();
+      DGEMM_HOLES(VhhhC, TABhh, "N")
+      REORDER(i, k, j)
+      chrono["doubles:holes:1"].stop();
+      // VhhhC[j + k*No + L*NoNo] * TABhh[i + L*No]; H0
+      chrono["doubles:holes:2"].start();
+      DGEMM_HOLES(VhhhC, TABhh, "T")
+      REORDER(j, k, i)
+      chrono["doubles:holes:2"].stop();
+      // VhhhB[i + j*No + L*NoNo] * TAChh[L + k*No]; H5
+      chrono["doubles:holes:3"].start();
+      DGEMM_HOLES(VhhhB, TAChh, "N")
+      REORDER(i, j, k)
+      chrono["doubles:holes:3"].stop();
+      // VhhhB[k + j*No + L*NoNo] * TAChh[i + L*No]; H3
+      chrono["doubles:holes:4"].start();
+      DGEMM_HOLES(VhhhB, TAChh, "T")
+      REORDER(k, j, i)
+      chrono["doubles:holes:4"].stop();
+      // VhhhA[j + i*No + L*NoNo] * TBChh[L + k*No]; H1
+      chrono["doubles:holes:5"].start();
+      DGEMM_HOLES(VhhhA, TBChh, "N")
+      REORDER(j, i, k)
+      chrono["doubles:holes:5"].stop();
+      // VhhhA[k + i*No + L*NoNo] * TBChh[j + L*No]; H4
+      chrono["doubles:holes:6"].start();
+      DGEMM_HOLES(VhhhA, TBChh, "T")
+      REORDER(k, i, j)
+      chrono["doubles:holes:6"].stop();
+    }
+    chrono["doubles:holes"].stop();
+
+    chrono["doubles:particles"].start();
+    { // Particle part =========================================================
+      // TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv]; P0
+      chrono["doubles:particles:1"].start();
+      DGEMM_PARTICLES(TAphh, VBCph)
+      REORDER(i, j, k)
+      chrono["doubles:particles:1"].stop();
+      // TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv]; P3
+      chrono["doubles:particles:2"].start();
+      DGEMM_PARTICLES(TAphh, VCBph)
+      REORDER(i, k, j)
+      chrono["doubles:particles:2"].stop();
+      // TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv]; P5
+      chrono["doubles:particles:3"].start();
+      DGEMM_PARTICLES(TCphh, VABph)
+      REORDER(k, i, j)
+      chrono["doubles:particles:3"].stop();
+      // TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv]; P2
+      chrono["doubles:particles:4"].start();
+      DGEMM_PARTICLES(TCphh, VBAph)
+      REORDER(k, j, i)
+      chrono["doubles:particles:4"].stop();
+      // TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv]; P1
+      chrono["doubles:particles:5"].start();
+      DGEMM_PARTICLES(TBphh, VACph)
+      REORDER(j, i, k)
+      chrono["doubles:particles:5"].stop();
+      // TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv]; P4
+      chrono["doubles:particles:6"].start();
+      DGEMM_PARTICLES(TBphh, VCAph)
+      REORDER(j, k, i)
+      chrono["doubles:particles:6"].stop();
+    }
+    chrono["doubles:particles"].stop();
+
+  #undef REORDER
+  #undef DGEMM_HOLES
+  #undef DGEMM_PARTICLES
+  #undef _IJK_
+  #else
+    for (size_t k = 0; k < No; k++)
+    for (size_t j = 0; j < No; j++)
+    for (size_t i = 0; i < No; i++){
+      const size_t ijk = i + j*No + k*NoNo
+                ,  jk = j + k*No
+                ;
+      Tijk[ijk] = 0.0; // :important
+      // HOLE DIAGRAMS: TABHH and VHHHA
+      for (size_t L = 0; L < No; L++){
+        // t[abLj] * V[Lcik]        H1
+        // t[baLi] * V[Lcjk]        H0      TODO: conjugate T for complex
+        Tijk[ijk] -= TABhh[L + j*No] * VhhhC[i + k*No + L*NoNo];
+        Tijk[ijk] -= TABhh[i + L*No] * VhhhC[j + k*No + L*NoNo];
+
+        // t[acLk] * V[Lbij]        H5
+        // t[caLi] * V[Lbkj]        H3
+        Tijk[ijk] -= TAChh[L + k*No] * VhhhB[i + j*No + L*NoNo];
+        Tijk[ijk] -= TAChh[i + L*No] * VhhhB[k + j*No + L*NoNo];
+
+        // t[bcLk] * V[Laji]        H2
+        // t[cbLj] * V[Laki]        H4
+        Tijk[ijk] -= TBChh[L + k*No] * VhhhA[j + i*No + L*NoNo];
+        Tijk[ijk] -= TBChh[j + L*No] * VhhhA[k + i*No + L*NoNo];
+      }
+      // PARTILCE DIAGRAMS: TAPHH and VABPH
+      for (size_t E = 0; E < Nv; E++) {
+        // t[aEij] * V[bcEk]        P0
+        // t[aEik] * V[cbEj]        P3 // TODO: CHECK THIS ONE, I DONT KNOW
+        Tijk[ijk] += TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv];
+        Tijk[ijk] += TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv];
+
+        // t[cEki] * V[abEj]        P5
+        // t[cEkj] * V[baEi]        P2
+        Tijk[ijk] += TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv];
+        Tijk[ijk] += TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv];
+
+        // t[bEji] * V[acEk]        P1
+        // t[bEjk] * V[caEi]        P4
+        Tijk[ijk] += TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv];
+        Tijk[ijk] += TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv];
+      }
+
+    }
+  #endif
+  }
+
+}
+// Equations:1 ends here
--- a/include/atrip/Slice.hpp
+++ b/include/atrip/Slice.hpp
@@ -0,0 +1,467 @@
+// [[file:../../atrip.org::*The slice][The slice:1]]
+#pragma once
+#include <iostream>
+#include <algorithm>
+#include <vector>
+#include <mpi.h>
+
+#include <atrip/Tuples.hpp>
+#include <atrip/Utils.hpp>
+
+namespace atrip {
+
+
+struct Slice {
+
+  using F = double;
+// The slice:1 ends here
+
+// [[file:../../atrip.org::*The slice][The slice:2]]
+// ASSOCIATED TYPES %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+  struct Location { size_t rank; size_t source; };
+
+  enum Type
+    { A = 10
+    , B
+    , C
+    // Two-parameter slices
+    , AB = 20
+    , BC
+    , AC
+    // for abci and the doubles
+    , CB
+    , BA
+    , CA
+    // The non-typed slice
+    , Blank = 404
+    };
+
+  enum State {
+    // Fetch represents the state where a slice is to be fetched
+    // and has a valid data pointer that can be written to
+    Fetch = 0,
+    // Dispatches represents the state that an MPI call has been
+    // dispatched in order to get the data, but the data has not been
+    // yet unwrapped, the data might be there or we might have to wait.
+    Dispatched = 2,
+    // Ready means that the data pointer can be read from
+    Ready = 1,
+    // Self sufficient is a slice when its contents are located
+    // in the same rank that it lives, so that it does not have to
+    // fetch from no one else.
+    SelfSufficient = 911,
+    // Recycled means that this slice gets its data pointer from another
+    // slice, so it should not be written to
+    Recycled = 123,
+    // Acceptor means that the Slice can accept a new Slice, it is
+    // the counterpart of the Blank type, but for states
+    Acceptor = 405
+  };
+
+  struct Info {
+    // which part of a,b,c the slice holds
+    PartialTuple tuple;
+    // The type of slice for the user to retrieve the correct one
+    Type type;
+    // What is the state of the slice
+    State state;
+    // Where the slice is to be retrieved
+    // NOTE: this can actually be computed from tuple
+    Location from;
+    // If the data are actually to be found in this other slice
+    Type recycling;
+
+    Info() : tuple{0,0}
+           , type{Blank}
+           , state{Acceptor}
+           , from{0,0}
+           , recycling{Blank}
+           {}
+  };
+
+  using Ty_x_Tu = std::pair< Type, PartialTuple >;
+
+  // Names of the integrals that are considered in CCSD(T)
+  enum Name
+    { TA    = 100
+    , VIJKA = 101
+    , VABCI = 200
+    , TABIJ = 201
+    , VABIJ = 202
+    };
+
+  // DATABASE ==========================================================={{{1
+  struct LocalDatabaseElement {
+    Slice::Name name;
+    Slice::Info info;
+  };
+  using LocalDatabase = std::vector<LocalDatabaseElement>;
+  using Database = LocalDatabase;
+
+
+    // STATIC METHODS ===========================================================
+    //
+    // They are useful to organize the structure of slices
+
+    struct mpi {
+
+      static MPI_Datatype vector(size_t n, MPI_Datatype const& DT) {
+        MPI_Datatype dt;
+        MPI_Type_vector(n, 1, 1, DT, &dt);
+        MPI_Type_commit(&dt);
+        return dt;
+      }
+
+      static MPI_Datatype sliceLocation () {
+        constexpr int n = 2;
+        // create a sliceLocation to measure in the current architecture
+        // the packing of the struct
+        Slice::Location measure;
+        MPI_Datatype dt;
+        const std::vector<int> lengths(n, 1);
+        const MPI_Datatype types[n] = {usizeDt(), usizeDt()};
+
+        // measure the displacements in the struct
+        size_t j = 0;
+        MPI_Aint displacements[n];
+        MPI_Get_address(&measure.rank,   &displacements[j++]);
+        MPI_Get_address(&measure.source, &displacements[j++]);
+        for (size_t i = 1; i < n; i++) displacements[i] -= displacements[0];
+        displacements[0] = 0;
+
+        MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt);
+        MPI_Type_commit(&dt);
+        return dt;
+      }
+
+      static MPI_Datatype enumDt() { return MPI_INT; }
+      static MPI_Datatype usizeDt() { return MPI_UINT64_T; }
+
+      static MPI_Datatype sliceInfo () {
+        constexpr int n = 5;
+        MPI_Datatype dt;
+        Slice::Info measure;
+        const std::vector<int> lengths(n, 1);
+        const MPI_Datatype types[n]
+          = { vector(2, usizeDt())
+            , enumDt()
+            , enumDt()
+            , sliceLocation()
+            , enumDt()
+            };
+
+        // create the displacements from the info measurement struct
+        size_t j = 0;
+        MPI_Aint displacements[n];
+        MPI_Get_address(measure.tuple.data(), &displacements[j++]);
+        MPI_Get_address(&measure.type,        &displacements[j++]);
+        MPI_Get_address(&measure.state,       &displacements[j++]);
+        MPI_Get_address(&measure.from,        &displacements[j++]);
+        MPI_Get_address(&measure.recycling,   &displacements[j++]);
+        for (size_t i = 1; i < n; i++) displacements[i] -= displacements[0];
+        displacements[0] = 0;
+
+        MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt);
+        MPI_Type_commit(&dt);
+        return dt;
+      }
+
+      static MPI_Datatype localDatabaseElement () {
+        constexpr int n = 2;
+        MPI_Datatype dt;
+        LocalDatabaseElement measure;
+        const std::vector<int> lengths(n, 1);
+        const MPI_Datatype types[n]
+          = { enumDt()
+            , sliceInfo()
+            };
+
+        // measure the displacements in the struct
+        size_t j = 0;
+        MPI_Aint displacements[n];
+        MPI_Get_address(&measure.name, &displacements[j++]);
+        MPI_Get_address(&measure.info, &displacements[j++]);
+        for (size_t i = 1; i < n; i++) displacements[i] -= displacements[0];
+        displacements[0] = 0;
+
+        MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt);
+        MPI_Type_commit(&dt);
+        return dt;
+      }
+
+    };
+
+  static
+  PartialTuple subtupleBySlice(ABCTuple abc, Type sliceType) {
+    switch (sliceType) {
+      case AB: return {abc[0], abc[1]};
+      case BC: return {abc[1], abc[2]};
+      case AC: return {abc[0], abc[2]};
+      case CB: return {abc[2], abc[1]};
+      case BA: return {abc[1], abc[0]};
+      case CA: return {abc[2], abc[0]};
+      case  A: return {abc[0], 0};
+      case  B: return {abc[1], 0};
+      case  C: return {abc[2], 0};
+      default: throw "Switch statement not exhaustive!";
+    }
+  }
+
+
+    /**
+     * It is important here to return a reference to a Slice
+     * not to accidentally copy the associated buffer of the slice.
+     */
+    static Slice& findOneByType(std::vector<Slice> &slices, Slice::Type type) {
+        const auto sliceIt
+          = std::find_if(slices.begin(), slices.end(),
+                         [&type](Slice const& s) {
+                           return type == s.info.type;
+                         });
+        WITH_CRAZY_DEBUG
+        WITH_RANK
+          << "\t__ looking for " << type << "\n";
+        if (sliceIt == slices.end())
+          throw std::domain_error("Slice by type not found!");
+        return *sliceIt;
+    }
+
+    /*
+     * Check if an info has
+     *
+     */
+    static std::vector<Slice*> hasRecycledReferencingToIt
+      ( std::vector<Slice> &slices
+      , Info const& info
+      ) {
+      std::vector<Slice*> result;
+
+      for (auto& s: slices)
+        if (  s.info.recycling == info.type
+           && s.info.tuple == info.tuple
+           && s.info.state == Recycled
+           ) result.push_back(&s);
+
+      return result;
+    }
+
+    static Slice&
+    findRecycledSource (std::vector<Slice> &slices, Slice::Info info) {
+      const auto sliceIt
+        = std::find_if(slices.begin(), slices.end(),
+                       [&info](Slice const& s) {
+                         return info.recycling == s.info.type
+                             && info.tuple == s.info.tuple
+                             && State::Recycled != s.info.state
+                             ;
+                       });
+
+      WITH_CRAZY_DEBUG
+      WITH_RANK << "__slice__:find: recycling source of "
+                << pretty_print(info) << "\n";
+      if (sliceIt == slices.end())
+        throw std::domain_error( "Slice not found: "
+                               + pretty_print(info)
+                               + " rank: "
+                               + pretty_print(Atrip::rank)
+                               );
+      WITH_RANK << "__slice__:find: " << pretty_print(sliceIt->info) << "\n";
+      return *sliceIt;
+    }
+
+    static Slice& findByTypeAbc
+      ( std::vector<Slice> &slices
+      , Slice::Type type
+      , ABCTuple const& abc
+      ) {
+        const auto tuple = Slice::subtupleBySlice(abc, type);
+        const auto sliceIt
+          = std::find_if(slices.begin(), slices.end(),
+                         [&type, &tuple](Slice const& s) {
+                           return type == s.info.type
+                               && tuple == s.info.tuple
+                               ;
+                         });
+        WITH_CRAZY_DEBUG
+        WITH_RANK << "__slice__:find:" << type << " and tuple "
+                  << pretty_print(tuple)
+                  << "\n";
+        if (sliceIt == slices.end())
+          throw std::domain_error( "Slice not found: "
+                                 + pretty_print(tuple)
+                                 + ", "
+                                 + pretty_print(type)
+                                 + " rank: "
+                                 + pretty_print(Atrip::rank)
+                                 );
+        return *sliceIt;
+    }
+
+    static Slice& findByInfo(std::vector<Slice> &slices,
+                             Slice::Info const& info) {
+        const auto sliceIt
+          = std::find_if(slices.begin(), slices.end(),
+                         [&info](Slice const& s) {
+                           // TODO: maybe implement comparison in Info struct
+                           return info.type == s.info.type
+                               && info.state == s.info.state
+                               && info.tuple == s.info.tuple
+                               && info.from.rank == s.info.from.rank
+                               && info.from.source == s.info.from.source
+                                ;
+                         });
+        WITH_CRAZY_DEBUG
+        WITH_RANK << "__slice__:find:looking for " << pretty_print(info) << "\n";
+        if (sliceIt == slices.end())
+          throw std::domain_error( "Slice by info not found: "
+                                 + pretty_print(info));
+        return *sliceIt;
+    }
+
+    // SLICE DEFINITION  =================================================={{{1
+
+    // ATTRIBUTES ============================================================
+    Info info;
+    F  *data;
+    MPI_Request request;
+    const size_t size;
+
+    void markReady() noexcept {
+      info.state = Ready;
+      info.recycling = Blank;
+    }
+
+    /*
+     * This means that the data is there
+     */
+    bool isUnwrapped() const noexcept {
+      return info.state == Ready
+          || info.state == SelfSufficient
+          ;
+    }
+
+    bool isUnwrappable() const noexcept {
+      return isUnwrapped()
+          || info.state == Recycled
+          || info.state == Dispatched
+          ;
+    }
+
+    inline bool isDirectlyFetchable() const noexcept {
+      return info.state == Ready || info.state == Dispatched;
+    }
+
+    void free() noexcept {
+      info.tuple      = {0, 0};
+      info.type       = Blank;
+      info.state      = Acceptor;
+      info.from       = {0, 0};
+      info.recycling  = Blank;
+      data            = nullptr;
+    }
+
+    inline bool isFree() const noexcept {
+      return info.tuple       == PartialTuple{0, 0}
+          && info.type        == Blank
+          && info.state       == Acceptor
+          && info.from.rank   == 0
+          && info.from.source == 0
+          && info.recycling   == Blank
+          && data             == nullptr
+           ;
+    }
+
+
+    /*
+     * This function answers the question, which slices can be recycled.
+     *
+     * A slice can only be recycled if it is Fetch or Ready and has
+     * a valid datapointer.
+     *
+     * In particular, SelfSufficient are not recyclable, since it is easier
+     * just to create a SelfSufficient slice than deal with data dependencies.
+     *
+     * Furthermore, a recycled slice is not recyclable, if this is the case
+     * then it is either bad design or a bug.
+     */
+    inline bool isRecyclable() const noexcept {
+      return (  info.state == Dispatched
+             || info.state == Ready
+             || info.state == Fetch
+             )
+          && hasValidDataPointer()
+          ;
+    }
+
+    /*
+     * This function describes if a slice has a valid data pointer.
+     *
+     * This is important to know if the slice has some data to it, also
+     * some structural checks are done, so that it should not be Acceptor
+     * or Blank, if this is the case then it is a bug.
+     */
+    inline bool hasValidDataPointer() const noexcept {
+      return data       != nullptr
+          && info.state != Acceptor
+          && info.type  != Blank
+          ;
+    }
+
+    void unwrapAndMarkReady() {
+      if (info.state == Ready) return;
+      if (info.state != Dispatched)
+        throw
+          std::domain_error("Can't unwrap a non-ready, non-dispatched slice!");
+      markReady();
+      MPI_Status status;
+#ifdef HAVE_OCD
+        WITH_RANK << "__slice__:mpi: waiting " << "\n";
+#endif
+      const int errorCode = MPI_Wait(&request, &status);
+      if (errorCode != MPI_SUCCESS)
+        throw "MPI ERROR HAPPENED....";
+
+#ifdef HAVE_OCD
+      char errorString[MPI_MAX_ERROR_STRING];
+      int errorSize;
+      MPI_Error_string(errorCode, errorString, &errorSize);
+
+      WITH_RANK << "__slice__:mpi: status "
+                << "{ .source="    << status.MPI_SOURCE
+                << ", .tag="       << status.MPI_TAG
+                << ", .error="     << status.MPI_ERROR
+                << ", .errCode="   << errorCode
+                << ", .err="       << errorString
+                << " }"
+                << "\n";
+#endif
+    }
+
+    Slice(size_t size_)
+      : info({})
+      , data(nullptr)
+      , size(size_)
+      {}
+
+
+  }; // struct Slice
+
+
+std::ostream& operator<<(std::ostream& out, Slice::Location const& v) {
+  // TODO: remove me
+  out << "{.r(" << v.rank << "), .s(" << v.source << ")};";
+  return out;
+}
+
+std::ostream& operator<<(std::ostream& out, Slice::Info const& i) {
+  out << "«t" << i.type << ", s" << i.state << "»"
+      << " ⊙ {" << i.from.rank << ", " << i.from.source << "}"
+      << " ∴ {" << i.tuple[0] << ", " << i.tuple[1] << "}"
+      << " ♲t" << i.recycling
+      ;
+  return out;
+}
+
+} // namespace atrip
+// The slice:2 ends here
--- a/include/atrip/SliceUnion.hpp
+++ b/include/atrip/SliceUnion.hpp
@@ -0,0 +1,543 @@
+// [[file:../../atrip.org::*The slice union][The slice union:1]]
+#pragma once
+#include <atrip/Debug.hpp>
+#include <atrip/Slice.hpp>
+
+namespace atrip {
+
+  struct SliceUnion {
+    using F = double;
+    using Tensor = CTF::Tensor<F>;
+
+    virtual void
+    sliceIntoBuffer(size_t iteration, Tensor &to, Tensor const& from) = 0;
+
+    /*
+     * This function should enforce an important property of a SliceUnion.
+     * Namely, there can be no two Slices of the same nature.
+     *
+     * This means that there can be at most one slice with a given Ty_x_Tu.
+     */
+    void checkForDuplicates() const {
+      std::vector<Slice::Ty_x_Tu> tytus;
+      for (auto const& s: slices) {
+        if (s.isFree()) continue;
+        tytus.push_back({s.info.type, s.info.tuple});
+      }
+
+      for (auto const& tytu: tytus) {
+        if (std::count(tytus.begin(), tytus.end(), tytu) > 1)
+          throw "Invariance violated, more than one slice with same Ty_x_Tu";
+      }
+
+    }
+
+    std::vector<Slice::Ty_x_Tu> neededSlices(ABCTuple const& abc) {
+      std::vector<Slice::Ty_x_Tu> needed(sliceTypes.size());
+      // build the needed vector
+      std::transform(sliceTypes.begin(), sliceTypes.end(),
+                     needed.begin(),
+                     [&abc](Slice::Type const type) {
+                       auto tuple = Slice::subtupleBySlice(abc, type);
+                       return std::make_pair(type, tuple);
+                     });
+      return needed;
+    }
+
+    /* buildLocalDatabase
+     *
+     * It should build a database of slices so that we know what is needed
+     * to fetch in the next iteration represented by the tuple 'abc'.
+     *
+     * 1. The algorithm works as follows, we build a database of the all
+     * the slice types that we need together with their tuple.
+     *
+     * 2. Look in the SliceUnion if we already have this tuple,
+     * if we already have it mark it (TODO)
+     *
+     * 3. If we don't have the tuple, look for a (state=acceptor, type=blank)
+     * slice and mark this slice as type=Fetch with the corresponding type
+     * and tuple.
+     *
+     * NOTE: The algorithm should certify that we always have enough blank
+     * slices.
+     *
+     */
+    Slice::LocalDatabase buildLocalDatabase(ABCTuple const& abc) {
+      Slice::LocalDatabase result;
+
+      auto const needed = neededSlices(abc);
+
+      WITH_RANK << "__db__:needed:" << pretty_print(needed) << "\n";
+      // BUILD THE DATABASE
+      // we need to loop over all sliceTypes that this TensorUnion
+      // is representing and find out how we will get the corresponding
+      // slice for the abc we are considering right now.
+      for (auto const& pair: needed) {
+        auto const type = pair.first;
+        auto const tuple = pair.second;
+        auto const from  = rankMap.find(abc, type);
+
+#ifdef HAVE_OCD
+        WITH_RANK << "__db__:want:" << pretty_print(pair) << "\n";
+        for (auto const& s: slices)
+          WITH_RANK << "__db__:guts:ocd "
+                    << s.info << " pt " << s.data
+                    << "\n";
+#endif
+
+#ifdef HAVE_OCD
+        WITH_RANK << "__db__: checking if exact match" << "\n";
+#endif
+        {
+          // FIRST: look up if there is already a *Ready* slice matching what we
+          // need
+          auto const& it
+            = std::find_if(slices.begin(), slices.end(),
+                           [&tuple, &type](Slice const& other) {
+                             return other.info.tuple == tuple
+                                 && other.info.type == type
+                                    // we only want another slice when it
+                                    // has already ready-to-use data
+                                 && other.isUnwrappable()
+                                 ;
+                           });
+          if (it != slices.end()) {
+            // if we find this slice, it means that we don't have to do anything
+            WITH_RANK << "__db__: EXACT: found EXACT in name=" << name
+                      << " for tuple " << tuple[0] << ", " << tuple[1]
+                      << " ptr " << it->data
+                      << "\n";
+            result.push_back({name, it->info});
+            continue;
+          }
+        }
+
+#ifdef HAVE_OCD
+        WITH_RANK << "__db__: checking if recycle" << "\n";
+#endif
+        // Try to find a recyling possibility ie. find a slice with the same
+        // tuple and that has a valid data pointer.
+        auto const& recycleIt
+          = std::find_if(slices.begin(), slices.end(),
+                         [&tuple, &type](Slice const& other) {
+                           return other.info.tuple == tuple
+                               && other.info.type != type
+                               && other.isRecyclable()
+                               ;
+                         });
+
+        // if we find this recylce, then we find a Blank slice
+        // (which should exist by construction :THINK)
+        //
+        if (recycleIt != slices.end()) {
+          auto& blank = Slice::findOneByType(slices, Slice::Blank);
+          // TODO: formalize this through a method to copy information
+          //       from another slice
+          blank.data = recycleIt->data;
+          blank.info.type = type;
+          blank.info.tuple = tuple;
+          blank.info.state = Slice::Recycled;
+          blank.info.from = from;
+          blank.info.recycling = recycleIt->info.type;
+          result.push_back({name, blank.info});
+          WITH_RANK << "__db__: RECYCLING: n" << name
+                    << " " << pretty_print(abc)
+                    << " get " << pretty_print(blank.info)
+                    << " from " << pretty_print(recycleIt->info)
+                    << " ptr " << recycleIt->data
+                    << "\n"
+                    ;
+          continue;
+        }
+
+        // in this case we have to create a new slice
+        // this means that we should have a blank slice at our disposal
+        // and also the freePointers should have some elements inside,
+        // so we pop a data pointer from the freePointers container
+#ifdef HAVE_OCD
+        WITH_RANK << "__db__: none work, doing new" << "\n";
+#endif
+        {
+          WITH_RANK << "__db__: NEW: finding blank in " << name
+                    << " for type " << type
+                    << " for tuple " << tuple[0] << ", " << tuple[1]
+                    << "\n"
+                    ;
+          auto& blank = Slice::findOneByType(slices, Slice::Blank);
+          blank.info.type = type;
+          blank.info.tuple = tuple;
+          blank.info.from = from;
+
+          // Handle self sufficiency
+          blank.info.state = Atrip::rank == from.rank
+                           ? Slice::SelfSufficient
+                           : Slice::Fetch
+                           ;
+          if (blank.info.state == Slice::SelfSufficient) {
+            blank.data = sources[from.source].data();
+          } else {
+            if (freePointers.size() == 0)
+              throw std::domain_error("No more free pointers!");
+            auto dataPointer = freePointers.begin();
+            freePointers.erase(dataPointer);
+            blank.data = *dataPointer;
+          }
+
+          result.push_back({name, blank.info});
+          continue;
+        }
+
+      }
+
+#ifdef HAVE_OCD
+      for (auto const& s: slices)
+        WITH_RANK << "__db__:guts:ocd:__end__ " << s.info << "\n";
+#endif
+
+
+      return result;
+
+    }
+
+    /*
+     * Garbage collect slices not needed for the next iteration.
+     *
+     * It will throw if it tries to gc a slice that has not been
+     * previously unwrapped, as a safety mechanism.
+     */
+    void clearUnusedSlicesForNext(ABCTuple const& abc) {
+      auto const needed = neededSlices(abc);
+
+      // CLEAN UP SLICES, FREE THE ONES THAT ARE NOT NEEDED ANYMORE
+      for (auto& slice: slices) {
+        // if the slice is free, then it was not used anyways
+        if (slice.isFree()) continue;
+
+
+        // try to find the slice in the needed slices list
+        auto const found
+          = std::find_if(needed.begin(), needed.end(),
+                         [&slice] (Slice::Ty_x_Tu const& tytu) {
+                           return slice.info.tuple == tytu.second
+                               && slice.info.type == tytu.first
+                               ;
+                         });
+
+        // if we did not find slice in needed, then erase it
+        if (found == needed.end()) {
+
+          // We have to be careful about the data pointer,
+          // for SelfSufficient, the data pointer is a source pointer
+          // of the slice, so we should just wipe it.
+          //
+          // For Ready slices, we have to be careful if there are some
+          // recycled slices depending on it.
+          bool freeSlicePointer = true;
+
+          // allow to gc unwrapped and recycled, never Fetch,
+          // if we have a Fetch slice then something has gone very wrong.
+          if (!slice.isUnwrapped() && slice.info.state != Slice::Recycled)
+            throw
+              std::domain_error("Trying to garbage collect "
+                                " a non-unwrapped slice! "
+                                + pretty_print(&slice)
+                                + pretty_print(slice.info));
+
+          // it can be that our slice is ready, but it has some hanging
+          // references lying around in the form of a recycled slice.
+          // Of course if we need the recycled slice the next iteration
+          // this would be fatal, because we would then free the pointer
+          // of the slice and at some point in the future we would
+          // overwrite it. Therefore, we must check if slice has some
+          // references in slices and if so then
+          //
+          //  - we should mark those references as the original (since the data
+          //    pointer should be the same)
+          //
+          //  - we should make sure that the data pointer of slice
+          //    does not get freed.
+          //
+          if (slice.info.state == Slice::Ready) {
+            WITH_OCD WITH_RANK
+              << "__gc__:" << "checking for data recycled dependencies\n";
+            auto recycled
+              = Slice::hasRecycledReferencingToIt(slices, slice.info);
+            if (recycled.size()) {
+              Slice* newReady = recycled[0];
+              WITH_OCD WITH_RANK
+                << "__gc__:" << "swaping recycled "
+                << pretty_print(newReady->info)
+                << " and "
+                << pretty_print(slice.info)
+                << "\n";
+              newReady->markReady();
+              assert(newReady->data == slice.data);
+              freeSlicePointer = false;
+
+              for (size_t i = 1; i < recycled.size(); i++) {
+                auto newRecyled = recycled[i];
+                newRecyled->info.recycling = newReady->info.type;
+                WITH_OCD WITH_RANK
+                  << "__gc__:" << "updating recycled "
+                  << pretty_print(newRecyled->info)
+                  << "\n";
+              }
+
+            }
+          }
+
+          // if the slice is self sufficient, do not dare touching the
+          // pointer, since it is a pointer to our sources in our rank.
+          if (  slice.info.state == Slice::SelfSufficient
+             || slice.info.state == Slice::Recycled
+             ) {
+            freeSlicePointer = false;
+          }
+
+          // make sure we get its data pointer to be used later
+          // only for non-recycled, since it can be that we need
+          // for next iteration the data of the slice that the recycled points
+          // to
+          if (freeSlicePointer) {
+            freePointers.insert(slice.data);
+            WITH_RANK << "~~~:cl(" << name << ")"
+                      << " added to freePointer "
+                      << pretty_print(freePointers)
+                      << "\n";
+          } else {
+            WITH_OCD WITH_RANK << "__gc__:not touching the free Pointer\n";
+          }
+
+          // at this point, let us blank the slice
+          WITH_RANK << "~~~:cl(" << name << ")"
+                    << " freeing up slice "
+                    << " info " << slice.info
+                    << "\n";
+          slice.free();
+        }
+
+      }
+    }
+
+    // CONSTRUCTOR
+    SliceUnion( Tensor const& sourceTensor
+              , std::vector<Slice::Type> sliceTypes_
+              , std::vector<size_t> sliceLength_
+              , std::vector<size_t> paramLength
+              , size_t np
+              , MPI_Comm child_world
+              , MPI_Comm global_world
+              , Slice::Name name_
+              , size_t nSliceBuffers = 4
+              )
+              : rankMap(paramLength, np)
+              , world(child_world)
+              , universe(global_world)
+              , sliceLength(sliceLength_)
+              , sources(rankMap.nSources(),
+                        std::vector<F>
+                          (std::accumulate(sliceLength.begin(),
+                                           sliceLength.end(),
+                                           1, std::multiplies<size_t>())))
+              , name(name_)
+              , sliceTypes(sliceTypes_)
+              , sliceBuffers(nSliceBuffers, sources[0])
+              //, slices(2 * sliceTypes.size(), Slice{ sources[0].size() })
+    { // constructor begin
+
+      LOG(0,"Atrip") << "INIT SliceUnion: " << name << "\n";
+
+      slices
+        = std::vector<Slice>(2 * sliceTypes.size(), { sources[0].size() });
+      // TODO: think exactly ^------------------- about this number
+
+      // initialize the freePointers with the pointers to the buffers
+      std::transform(sliceBuffers.begin(), sliceBuffers.end(),
+                     std::inserter(freePointers, freePointers.begin()),
+                     [](std::vector<F> &vec) { return vec.data(); });
+
+
+
+      LOG(1,"Atrip") << "rankMap.nSources "
+                           << rankMap.nSources() << "\n";
+      LOG(1,"Atrip") << "#slices "
+                           << slices.size() << "\n";
+      LOG(1,"Atrip") << "#slices[0] "
+                           << slices[0].size << "\n";
+      LOG(1,"Atrip") << "#sources "
+                           << sources.size() << "\n";
+      LOG(1,"Atrip") << "#sources[0] "
+                           << sources[0].size() << "\n";
+      LOG(1,"Atrip") << "#freePointers "
+                           << freePointers.size() << "\n";
+      LOG(1,"Atrip") << "#sliceBuffers "
+                           << sliceBuffers.size() << "\n";
+      LOG(1,"Atrip") << "#sliceBuffers[0] "
+                           << sliceBuffers[0].size() << "\n";
+      LOG(1,"Atrip") << "#sliceLength "
+                           << sliceLength.size() << "\n";
+      LOG(1,"Atrip") << "#paramLength "
+                           << paramLength.size() << "\n";
+      LOG(1,"Atrip") << "GB*" << np << " "
+                           << double(sources.size() + sliceBuffers.size())
+                            * sources[0].size()
+                            * 8 * np
+                            / 1073741824.0
+                           << "\n";
+    } // constructor ends
+
+    void init(Tensor const& sourceTensor) {
+
+      CTF::World w(world);
+      const int rank = Atrip::rank
+              , order = sliceLength.size()
+              ;
+      std::vector<int> const syms(order, NS);
+      std::vector<int> __sliceLength(sliceLength.begin(), sliceLength.end());
+      Tensor toSliceInto(order,
+                         __sliceLength.data(),
+                         syms.data(),
+                         w);
+      LOG(1,"Atrip") << "slicing... \n";
+
+      // setUp sources
+      for (size_t it(0); it < rankMap.nSources(); ++it) {
+        const size_t
+          source = rankMap.isSourcePadding(rank, source) ? 0 : it;
+        WITH_OCD
+        WITH_RANK
+          << "Init:toSliceInto it-" << it
+          << " :: source " << source << "\n";
+        sliceIntoBuffer(source, toSliceInto, sourceTensor);
+      }
+
+    }
+
+    /**
+     * \brief Send asynchronously only if the state is Fetch
+     */
+    void send( size_t otherRank
+             , Slice::Info const& info
+             , size_t tag) const noexcept {
+      MPI_Request request;
+      bool sendData_p = false;
+
+      if (info.state == Slice::Fetch) sendData_p = true;
+      // TODO: remove this because I have SelfSufficient
+      if (otherRank == info.from.rank)      sendData_p = false;
+      if (!sendData_p) return;
+
+      MPI_Isend( sources[info.from.source].data()
+               , sources[info.from.source].size()
+               , MPI_DOUBLE /* TODO: adapt this with traits */
+               , otherRank
+               , tag
+               , universe
+               , &request
+               );
+      WITH_CRAZY_DEBUG
+      WITH_RANK << "sent to " << otherRank << "\n";
+
+    }
+
+    /**
+     * \brief Receive asynchronously only if the state is Fetch
+     */
+    void receive(Slice::Info const& info, size_t tag) noexcept {
+      auto& slice = Slice::findByInfo(slices, info);
+
+      if (Atrip::rank == info.from.rank) return;
+
+      if (slice.info.state == Slice::Fetch) {
+        // TODO: do it through the slice class
+        slice.info.state = Slice::Dispatched;
+        MPI_Request request = nullptr;
+        slice.request = request;
+        MPI_Irecv( slice.data
+                 , slice.size
+                 , MPI_DOUBLE // TODO: Adapt this with traits
+                 , info.from.rank
+                 , tag
+                 , universe
+                 , &slice.request
+                //, MPI_STATUS_IGNORE
+                 );
+      }
+    }
+
+    void unwrapAll(ABCTuple const& abc) {
+      for (auto type: sliceTypes) unwrapSlice(type, abc);
+    }
+
+    F* unwrapSlice(Slice::Type type, ABCTuple const& abc) {
+      WITH_CRAZY_DEBUG
+      WITH_RANK << "__unwrap__:slice " << type << " w n "
+                << name
+                << " abc" << pretty_print(abc)
+                << "\n";
+      auto& slice = Slice::findByTypeAbc(slices, type, abc);
+      WITH_RANK << "__unwrap__:info " << slice.info << "\n";
+      switch  (slice.info.state) {
+        case Slice::Dispatched:
+          WITH_RANK << "__unwrap__:Fetch: " << &slice
+                    << " info " << pretty_print(slice.info)
+                    << "\n";
+          slice.unwrapAndMarkReady();
+          return slice.data;
+          break;
+        case Slice::SelfSufficient:
+          WITH_RANK << "__unwrap__:SelfSufficient: " << &slice
+                    << " info " << pretty_print(slice.info)
+                    << "\n";
+          return slice.data;
+          break;
+        case Slice::Ready:
+          WITH_RANK << "__unwrap__:READY: UNWRAPPED ALREADY" << &slice
+                    << " info " << pretty_print(slice.info)
+                    << "\n";
+          return slice.data;
+          break;
+        case Slice::Recycled:
+          WITH_RANK << "__unwrap__:RECYCLED " << &slice
+                    << " info " << pretty_print(slice.info)
+                    << "\n";
+          return unwrapSlice(slice.info.recycling, abc);
+          break;
+        case Slice::Fetch:
+        case Slice::Acceptor:
+          throw std::domain_error("Can't unwrap an acceptor or fetch slice!");
+          break;
+        default:
+          throw std::domain_error("Unknown error unwrapping slice!");
+      }
+      return slice.data;
+    }
+
+    const RankMap rankMap;
+    const MPI_Comm world;
+    const MPI_Comm universe;
+    const std::vector<size_t> sliceLength;
+    std::vector< std::vector<F> > sources;
+    std::vector< Slice > slices;
+    Slice::Name name;
+    const std::vector<Slice::Type> sliceTypes;
+    std::vector< std::vector<F> > sliceBuffers;
+    std::set<F*> freePointers;
+
+  };
+
+  SliceUnion&
+  unionByName(std::vector<SliceUnion*> const& unions, Slice::Name name) {
+      const auto sliceUnionIt
+        = std::find_if(unions.begin(), unions.end(),
+                      [&name](SliceUnion const* s) {
+                        return name == s->name;
+                      });
+      if (sliceUnionIt == unions.end())
+        throw std::domain_error("SliceUnion not found!");
+      return **sliceUnionIt;
+  }
+
+}
+// The slice union:1 ends here
--- a/include/atrip/Tuples.hpp
+++ b/include/atrip/Tuples.hpp
@@ -0,0 +1,73 @@
+// [[file:../../atrip.org::*Tuples][Tuples:1]]
+#pragma once
+
+#include <vector>
+#include <array>
+
+#include <atrip/Utils.hpp>
+#include <atrip/Debug.hpp>
+
+namespace atrip {
+
+  using ABCTuple = std::array<size_t, 3>;
+  using PartialTuple = std::array<size_t, 2>;
+  using ABCTuples = std::vector<ABCTuple>;
+
+  ABCTuples getTuplesList(size_t Nv) {
+    const size_t n = Nv * (Nv + 1) * (Nv + 2) / 6 - Nv;
+    ABCTuples result(n);
+    size_t u(0);
+
+    for (size_t a(0); a < Nv; a++)
+    for (size_t b(a); b < Nv; b++)
+    for (size_t c(b); c < Nv; c++){
+      if ( a == b && b == c ) continue;
+      result[u++] = {a, b, c};
+    }
+
+    return result;
+
+  }
+
+
+  std::pair<size_t, size_t>
+  getABCRange(size_t np, size_t rank, ABCTuples const& tuplesList) {
+
+    std::vector<size_t> n_tuples_per_rank(np, tuplesList.size()/np);
+    const size_t
+        // how many valid tuples should we still verteilen to nodes
+        // since the number of tuples is not divisible by the number of nodes
+        nRoundRobin = tuplesList.size() % np
+        // every node must have the sanme amount of tuples in order for the
+        // other nodes to receive and send somewhere, therefore
+        // some nodes will get extra tuples but that are dummy tuples
+      , nExtraInvalid = (np - nRoundRobin) % np
+      ;
+
+    if (nRoundRobin) for (int i = 0; i < np; i++) n_tuples_per_rank[i]++;
+
+  #if defined(TODO)
+    assert( tuplesList.size()
+            ==
+            ( std::accumulate(n_tuples_per_rank.begin(),
+                              n_tuples_per_rank.end(),
+                              0)
+            + nExtraInvalid
+            ));
+  #endif
+
+    WITH_RANK << "nRoundRobin = " << nRoundRobin << "\n";
+    WITH_RANK << "nExtraInvalid = " << nExtraInvalid << "\n";
+    WITH_RANK << "ntuples = " << n_tuples_per_rank[rank] << "\n";
+
+    auto const& it = n_tuples_per_rank.begin();
+
+    return
+      { std::accumulate(it, it + rank    , 0)
+      , std::accumulate(it, it + rank + 1, 0)
+      };
+
+  }
+
+}
+// Tuples:1 ends here
--- a/include/atrip/Unions.hpp
+++ b/include/atrip/Unions.hpp
@@ -0,0 +1,240 @@
+// [[file:../../atrip.org::*Unions][Unions:1]]
+#pragma once
+#include <atrip/SliceUnion.hpp>
+
+namespace atrip {
+
+  void sliceIntoVector
+    ( std::vector<double> &v
+    , CTF::Tensor<double> &toSlice
+    , std::vector<int64_t> const low
+    , std::vector<int64_t> const up
+    , CTF::Tensor<double> const& origin
+    , std::vector<int64_t> const originLow
+    , std::vector<int64_t> const originUp
+    ) {
+    // Thank you CTF for forcing me to do this
+    struct { std::vector<int> up, low; }
+        toSlice_ = { {up.begin(), up.end()}
+                   , {low.begin(), low.end()} }
+      , origin_ = { {originUp.begin(), originUp.end()}
+                  , {originLow.begin(), originLow.end()} }
+      ;
+
+    WITH_OCD
+    WITH_RANK << "slicing into " << pretty_print(toSlice_.up)
+                          << "," << pretty_print(toSlice_.low)
+              << " from " << pretty_print(origin_.up)
+                   << "," << pretty_print(origin_.low)
+              << "\n";
+
+#ifndef ATRIP_DONT_SLICE
+    toSlice.slice( toSlice_.low.data()
+                 , toSlice_.up.data()
+                 , 0.0
+                 , origin
+                 , origin_.low.data()
+                 , origin_.up.data()
+                 , 1.0);
+    memcpy(v.data(), toSlice.data, sizeof(double) * v.size());
+#endif
+
+  }
+
+
+  struct TAPHH : public SliceUnion {
+    TAPHH( Tensor const& sourceTensor
+         , size_t No
+         , size_t Nv
+         , size_t np
+         , MPI_Comm child_world
+         , MPI_Comm global_world
+         ) : SliceUnion( sourceTensor
+                       , {Slice::A, Slice::B, Slice::C}
+                       , {Nv, No, No} // size of the slices
+                       , {Nv}
+                       , np
+                       , child_world
+                       , global_world
+                       , Slice::TA
+                       , 4) {
+           init(sourceTensor);
+         }
+
+    void sliceIntoBuffer(size_t it, Tensor &to, Tensor const& from) override
+    {
+      const int rank = Atrip::rank
+              , Nv = sliceLength[0]
+              , No = sliceLength[1]
+              , a = rankMap.find({static_cast<size_t>(rank), it});
+              ;
+
+
+      sliceIntoVector( sources[it]
+                     , to,   {0, 0, 0},    {Nv, No, No}
+                     , from, {a, 0, 0, 0}, {a+1, Nv, No, No}
+                     );
+
+    }
+
+  };
+
+
+  struct HHHA : public SliceUnion {
+    HHHA( Tensor const& sourceTensor
+        , size_t No
+        , size_t Nv
+        , size_t np
+        , MPI_Comm child_world
+        , MPI_Comm global_world
+        ) : SliceUnion( sourceTensor
+                      , {Slice::A, Slice::B, Slice::C}
+                      , {No, No, No} // size of the slices
+                      , {Nv}         // size of the parametrization
+                      , np
+                      , child_world
+                      , global_world
+                      , Slice::VIJKA
+                      , 4) {
+           init(sourceTensor);
+         }
+
+    void sliceIntoBuffer(size_t it, Tensor &to, Tensor const& from) override
+    {
+
+      const int rank = Atrip::rank
+              , No = sliceLength[0]
+              , a = rankMap.find({static_cast<size_t>(rank), it})
+              ;
+
+      sliceIntoVector( sources[it]
+                     , to,   {0, 0, 0},    {No, No, No}
+                     , from, {0, 0, 0, a}, {No, No, No, a+1}
+                     );
+
+    }
+  };
+
+  struct ABPH : public SliceUnion {
+    ABPH( Tensor const& sourceTensor
+        , size_t No
+        , size_t Nv
+        , size_t np
+        , MPI_Comm child_world
+        , MPI_Comm global_world
+        ) : SliceUnion( sourceTensor
+                      , { Slice::AB, Slice::BC, Slice::AC
+                        , Slice::BA, Slice::CB, Slice::CA
+                        }
+                      , {Nv, No} // size of the slices
+                      , {Nv, Nv} // size of the parametrization
+                      , np
+                      , child_world
+                      , global_world
+                      , Slice::VABCI
+                      , 2*6) {
+           init(sourceTensor);
+         }
+
+    void sliceIntoBuffer(size_t it, Tensor &to, Tensor const& from) override {
+
+      const int Nv = sliceLength[0]
+              , No = sliceLength[1]
+              , rank = Atrip::rank
+              , el = rankMap.find({static_cast<size_t>(rank), it})
+              , a = el % Nv
+              , b = el / Nv
+              ;
+
+
+      sliceIntoVector( sources[it]
+                     , to,   {0, 0},       {Nv, No}
+                     , from, {a, b, 0, 0}, {a+1, b+1, Nv, No}
+                     );
+
+    }
+
+  };
+
+  struct ABHH : public SliceUnion {
+    ABHH( Tensor const& sourceTensor
+        , size_t No
+        , size_t Nv
+        , size_t np
+        , MPI_Comm child_world
+        , MPI_Comm global_world
+        ) : SliceUnion( sourceTensor
+                      , {Slice::AB, Slice::BC, Slice::AC}
+                      , {No, No} // size of the slices
+                      , {Nv, Nv} // size of the parametrization
+                      , np
+                      , child_world
+                      , global_world
+                      , Slice::VABIJ
+                      , 6) {
+           init(sourceTensor);
+         }
+
+    void sliceIntoBuffer(size_t it, Tensor &to, Tensor const& from) override {
+
+      const int Nv = from.lens[0]
+              , No = sliceLength[1]
+              , rank = Atrip::rank
+              , el = rankMap.find({static_cast<size_t>(rank), it})
+              , a = el % Nv
+              , b = el / Nv
+              ;
+
+      sliceIntoVector( sources[it]
+                     , to,   {0, 0},       {No, No}
+                     , from, {a, b, 0, 0}, {a+1, b+1, No, No}
+                     );
+
+
+    }
+
+  };
+
+
+  struct TABHH : public SliceUnion {
+    TABHH( Tensor const& sourceTensor
+         , size_t No
+         , size_t Nv
+         , size_t np
+         , MPI_Comm child_world
+         , MPI_Comm global_world
+         ) : SliceUnion( sourceTensor
+                       , {Slice::AB, Slice::BC, Slice::AC}
+                       , {No, No} // size of the slices
+                       , {Nv, Nv} // size of the parametrization
+                       , np
+                       , child_world
+                       , global_world
+                       , Slice::TABIJ
+                       , 6) {
+           init(sourceTensor);
+         }
+
+    void sliceIntoBuffer(size_t it, Tensor &to, Tensor const& from) override {
+      // TODO: maybe generalize this with ABHH
+
+      const int Nv = from.lens[0]
+              , No = sliceLength[1]
+              , rank = Atrip::rank
+              , el = rankMap.find({static_cast<size_t>(rank), it})
+              , a = el % Nv
+              , b = el / Nv
+              ;
+
+      sliceIntoVector( sources[it]
+                     , to,   {0, 0},       {No, No}
+                     , from, {a, b, 0, 0}, {a+1, b+1, No, No}
+                     );
+
+
+    }
+
+  };
+
+}
+// Unions:1 ends here
--- a/include/atrip/Utils.hpp
+++ b/include/atrip/Utils.hpp
@@ -0,0 +1,99 @@
+// [[file:../../atrip.org::*Utils][Utils:1]]
+#pragma once
+#include <sstream>
+#include <string>
+#include <map>
+#include <chrono>
+
+#include <ctf.hpp>
+
+namespace atrip {
+
+
+  template <typename T>
+  std::string pretty_print(T&& value) {
+    std::stringstream stream;
+#if ATRIP_DEBUG > 1
+    dbg::pretty_print(stream, std::forward<T>(value));
+#endif
+    return stream.str();
+  }
+
+#define WITH_CHRONO(__chrono, ...) \
+  __chrono.start(); __VA_ARGS__ __chrono.stop();
+
+  struct Timer {
+    using Clock = std::chrono::high_resolution_clock;
+    using Event = std::chrono::time_point<Clock>;
+    std::chrono::duration<double> duration;
+    Event _start;
+    inline void start() noexcept { _start = Clock::now(); }
+    inline void stop() noexcept { duration += Clock::now() - _start; }
+    inline void clear() noexcept { duration *= 0; }
+    inline double count() const noexcept { return duration.count(); }
+  };
+  using Timings = std::map<std::string, Timer>;
+}
+// Utils:1 ends here
+
+// [[file:../../atrip.org::*The rank mapping][The rank mapping:1]]
+#pragma once
+
+#include <vector>
+#include <algorithm>
+
+#include <atrip/Slice.hpp>
+
+namespace atrip {
+  struct RankMap {
+
+    std::vector<size_t> const lengths;
+    size_t const np, size;
+
+    RankMap(std::vector<size_t> lens, size_t np_)
+      : lengths(lens)
+      , np(np_)
+      , size(std::accumulate(lengths.begin(), lengths.end(),
+                            1, std::multiplies<size_t>()))
+    { assert(lengths.size() <= 2); }
+
+    size_t find(Slice::Location const& p) const noexcept {
+      return p.source * np + p.rank;
+    }
+
+    size_t nSources() const noexcept {
+      return size / np + size_t(size % np != 0);
+    }
+
+
+    bool isPaddingRank(size_t rank) const noexcept {
+      return size % np == 0
+          ? false
+          : rank > (size % np - 1)
+          ;
+    }
+
+    bool isSourcePadding(size_t rank, size_t source) const noexcept {
+      return source == nSources() && isPaddingRank(rank);
+    }
+
+    Slice::Location
+    find(ABCTuple const& abc, Slice::Type sliceType) const noexcept {
+      // tuple = {11, 8} when abc = {11, 8, 9} and sliceType = AB
+      const auto tuple = Slice::subtupleBySlice(abc, sliceType);
+
+      const size_t index
+        = tuple[0]
+        + tuple[1] * (lengths.size() > 1 ? lengths[0] : 0)
+        ;
+
+      return
+        { index % np
+        , index / np
+        };
+    }
+
+  };
+
+}
+// The rank mapping:1 ends here
--- a/src/atrip/Atrip.cxx
+++ b/src/atrip/Atrip.cxx
@@ -0,0 +1,563 @@
+// [[file:../../atrip.org::*Atrip][Atrip:2]]
+#include <iomanip>
+
+#include <atrip/Atrip.hpp>
+#include <atrip/Utils.hpp>
+#include <atrip/Equations.hpp>
+#include <atrip/SliceUnion.hpp>
+#include <atrip/Unions.hpp>
+
+using namespace atrip;
+
+int Atrip::rank;
+int Atrip::np;
+
+void Atrip::init()  {
+  MPI_Comm_rank(MPI_COMM_WORLD, &Atrip::rank);
+  MPI_Comm_size(MPI_COMM_WORLD, &Atrip::np);
+}
+
+Atrip::Output Atrip::run(Atrip::Input const& in) {
+
+  const int np = Atrip::np;
+  const int rank = Atrip::rank;
+  MPI_Comm universe = in.ei->wrld->comm;
+
+  // Timings in seconds ================================================{{{1
+  Timings chrono{};
+
+  const size_t No = in.ei->lens[0];
+  const size_t Nv = in.ea->lens[0];
+  LOG(0,"Atrip") << "No: " << No << "\n";
+  LOG(0,"Atrip") << "Nv: " << Nv << "\n";
+
+  // allocate the three scratches, see piecuch
+  std::vector<double> Tijk(No*No*No) // doubles only (see piecuch)
+                    , Zijk(No*No*No) // singles + doubles (see piecuch)
+                    // we need local copies of the following tensors on every
+                    // rank
+                    , epsi(No)
+                    , epsa(Nv)
+                    , Tai(No * Nv)
+                    ;
+
+  in.ei->read_all(epsi.data());
+  in.ea->read_all(epsa.data());
+  in.Tph->read_all(Tai.data());
+
+  // COMMUNICATOR CONSTRUCTION ========================================={{{1
+  //
+  // Construct a new communicator living only on a single rank
+  int child_size = 1
+    , child_rank
+    ;
+  const
+  int color = rank / child_size
+    , crank = rank % child_size
+    ;
+  MPI_Comm child_comm;
+  if (np == 1) {
+    child_comm = universe;
+  } else {
+    MPI_Comm_split(universe, color, crank, &child_comm);
+    MPI_Comm_rank(child_comm, &child_rank);
+    MPI_Comm_size(child_comm, &child_size);
+    //CTF::World child_world(child_comm);
+  }
+
+
+  chrono["nv-slices"].start();
+  // BUILD SLICES PARAMETRIZED BY NV ==================================={{{1
+  LOG(0,"Atrip") << "BUILD NV-SLICES\n";
+  TAPHH taphh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
+  HHHA  hhha(*in.Vhhhp, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
+  chrono["nv-slices"].stop();
+
+  chrono["nv-nv-slices"].start();
+  // BUILD SLICES PARAMETRIZED BY NV x NV =============================={{{1
+  LOG(0,"Atrip") << "BUILD NV x NV-SLICES\n";
+  ABPH abph(*in.Vppph, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
+  ABHH abhh(*in.Vpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
+  TABHH tabhh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
+  chrono["nv-nv-slices"].stop();
+
+  // all tensors
+  std::vector< SliceUnion* > unions = {&taphh, &hhha, &abph, &abhh, &tabhh};
+
+  //CONSTRUCT TUPLE LIST ==============================================={{{1
+  LOG(0,"Atrip") << "BUILD TUPLE LIST\n";
+  const auto tuplesList = std::move(getTuplesList(Nv));
+  WITH_RANK << "tupList.size() = " << tuplesList.size() << "\n";
+
+  // GET ABC INDEX RANGE FOR RANK ======================================{{{1
+  auto abcIndex = getABCRange(np, rank, tuplesList);
+  size_t nIterations = abcIndex.second - abcIndex.first;
+
+#ifdef ATRIP_BENCHMARK
+  { const size_t maxIterations = in.maxIterations;
+    if (maxIterations != 0) {
+      abcIndex.second = abcIndex.first + maxIterations % (nIterations + 1);
+      nIterations = maxIterations % (nIterations + 1);
+    }
+  }
+#endif
+
+  WITH_RANK << "abcIndex = " << pretty_print(abcIndex) << "\n";
+  LOG(0,"Atrip") << "#iterations: "
+                       << nIterations << "\n";
+
+  // first abc
+  const ABCTuple firstAbc = tuplesList[abcIndex.first];
+
+
+  double energy(0.);
+
+
+  auto const isFakeTuple
+    = [&tuplesList](size_t const i) { return i >= tuplesList.size(); };
+
+
+  auto communicateDatabase
+    = [ &unions
+      , np
+      , &chrono
+      ] (ABCTuple const& abc, MPI_Comm const& c) -> Slice::Database {
+
+        chrono["db:comm:type:do"].start();
+        auto MPI_LDB_ELEMENT = Slice::mpi::localDatabaseElement();
+        chrono["db:comm:type:do"].stop();
+
+        chrono["db:comm:ldb"].start();
+        Slice::LocalDatabase ldb;
+
+        for (auto const& tensor: unions) {
+          auto const& tensorDb = tensor->buildLocalDatabase(abc);
+          ldb.insert(ldb.end(), tensorDb.begin(), tensorDb.end());
+        }
+        chrono["db:comm:ldb"].stop();
+
+        Slice::Database db(np * ldb.size(), ldb[0]);
+
+        chrono["oneshot-db:comm:allgather"].start();
+        chrono["db:comm:allgather"].start();
+        MPI_Allgather( ldb.data()
+                     , ldb.size()
+                     , MPI_LDB_ELEMENT
+                     , db.data()
+                     , ldb.size()
+                     , MPI_LDB_ELEMENT
+                     , c);
+        chrono["db:comm:allgather"].stop();
+        chrono["oneshot-db:comm:allgather"].stop();
+
+        chrono["db:comm:type:free"].start();
+        MPI_Type_free(&MPI_LDB_ELEMENT);
+        chrono["db:comm:type:free"].stop();
+
+        return db;
+      };
+
+  auto doIOPhase
+    = [&unions, &rank, &np, &universe, &chrono] (Slice::Database const& db) {
+
+    const size_t localDBLength = db.size() / np;
+
+    size_t sendTag = 0
+         , recvTag = rank * localDBLength
+         ;
+
+    // RECIEVE PHASE ======================================================
+    {
+      // At this point, we have already send to everyone that fits
+      auto const& begin = &db[rank * localDBLength]
+                , end   = begin + localDBLength
+                ;
+      for (auto it = begin; it != end; ++it) {
+        recvTag++;
+        auto const& el = *it;
+        auto& u = unionByName(unions, el.name);
+
+        WITH_DBG std::cout
+          << rank << ":r"
+          << "♯" << recvTag << " =>"
+          << " «n" << el.name
+          << ", t" << el.info.type
+          << ", s" << el.info.state
+          << "»"
+          << " ⊙ {" << rank << "⇐" << el.info.from.rank
+                    << ", "
+                    << el.info.from.source << "}"
+          << " ∴ {" << el.info.tuple[0]
+                    << ", "
+                    << el.info.tuple[1]
+                    << "}"
+          << "\n"
+          ;
+
+        chrono["db:io:recv"].start();
+        u.receive(el.info, recvTag);
+        chrono["db:io:recv"].stop();
+
+      } // recv
+    }
+
+    // SEND PHASE =========================================================
+    for (size_t otherRank = 0; otherRank<np; otherRank++) {
+      auto const& begin = &db[otherRank * localDBLength]
+                , end = begin + localDBLength
+                ;
+      for (auto it = begin; it != end; ++it) {
+        sendTag++;
+        Slice::LocalDatabaseElement const& el = *it;
+
+        if (el.info.from.rank != rank) continue;
+
+        auto& u = unionByName(unions, el.name);
+        WITH_DBG std::cout
+          << rank << ":s"
+          << "♯" << sendTag << " =>"
+          << " «n" << el.name
+          << ", t" << el.info.type
+          << ", s" << el.info.state
+          << "»"
+          << " ⊙ {" << el.info.from.rank << "⇒" << otherRank
+                    << ", "
+                    << el.info.from.source << "}"
+          << " ∴ {" << el.info.tuple[0]
+                    << ", "
+                    << el.info.tuple[1]
+                    << "}"
+          << "\n"
+          ;
+
+        chrono["db:io:send"].start();
+        u.send(otherRank, el.info, sendTag);
+        chrono["db:io:send"].stop();
+
+      } // send phase
+
+    } // otherRank
+
+
+  };
+
+#if defined(HAVE_OCD) || defined(ATRIP_PRINT_TUPLES)
+  std::map<ABCTuple, double> tupleEnergies;
+#endif
+
+  const double doublesFlops
+    = double(No)
+    * double(No)
+    * double(No)
+    * (double(No) + double(Nv))
+    * 2
+    * 6
+    / 1e9
+    ;
+
+  // START MAIN LOOP ======================================================{{{1
+
+  Slice::Database db;
+
+  for ( size_t i = abcIndex.first, iteration = 1
+      ; i < abcIndex.second
+      ; i++, iteration++
+      ) {
+    chrono["iterations"].start();
+
+    // check overhead from chrono over all iterations
+    chrono["start:stop"].start(); chrono["start:stop"].stop();
+
+    // check overhead of doing a barrier at the beginning
+    chrono["oneshot-mpi:barrier"].start();
+    chrono["mpi:barrier"].start();
+    // TODO: REMOVE
+    if (in.barrier == 1)
+    MPI_Barrier(universe);
+    chrono["mpi:barrier"].stop();
+    chrono["oneshot-mpi:barrier"].stop();
+
+    if (iteration % in.iterationMod == 0) {
+      LOG(0,"Atrip")
+        << "iteration " << iteration
+        << " [" << 100 * iteration / nIterations << "%]"
+        << " (" << doublesFlops * iteration / chrono["doubles"].count()
+        << "GF)"
+        << " (" << doublesFlops * iteration / chrono["iterations"].count()
+        << "GF)"
+        << " ===========================\n";
+
+      // PRINT TIMINGS
+      for (auto const& pair: chrono)
+        LOG(1, " ") << pair.first << " :: "
+                    << pair.second.count()
+                    << std::endl;
+
+    }
+
+    const ABCTuple abc = isFakeTuple(i)
+                       ? tuplesList[tuplesList.size() - 1]
+                       : tuplesList[i]
+                 , *abcNext = i == (abcIndex.second - 1)
+                            ? nullptr
+                            : isFakeTuple(i + 1)
+                            ? &tuplesList[tuplesList.size() - 1]
+                            : &tuplesList[i + 1]
+                 ;
+
+    chrono["with_rank"].start();
+    WITH_RANK << " :it " << iteration
+              << " :abc " << pretty_print(abc)
+              << " :abcN "
+              << (abcNext ? pretty_print(*abcNext) : "None")
+              << "\n";
+    chrono["with_rank"].stop();
+
+
+    // COMM FIRST DATABASE ================================================{{{1
+    if (i == abcIndex.first) {
+      WITH_RANK << "__first__:first database ............ \n";
+      const auto __db = communicateDatabase(abc, universe);
+      WITH_RANK << "__first__:first database communicated \n";
+      WITH_RANK << "__first__:first database io phase \n";
+      doIOPhase(__db);
+      WITH_RANK << "__first__:first database io phase DONE\n";
+      WITH_RANK << "__first__::::Unwrapping all slices for first database\n";
+      for (auto& u: unions) u->unwrapAll(abc);
+      WITH_RANK << "__first__::::Unwrapping all slices for first database DONE\n";
+      MPI_Barrier(universe);
+    }
+
+    // COMM NEXT DATABASE ================================================={{{1
+    if (abcNext) {
+      WITH_RANK << "__comm__:" << iteration << "th communicating database\n";
+      chrono["db:comm"].start();
+      //const auto db = communicateDatabase(*abcNext, universe);
+      db = communicateDatabase(*abcNext, universe);
+      chrono["db:comm"].stop();
+      chrono["db:io"].start();
+      doIOPhase(db);
+      chrono["db:io"].stop();
+      WITH_RANK << "__comm__:" <<  iteration << "th database io phase DONE\n";
+    }
+
+    // COMPUTE DOUBLES ===================================================={{{1
+    OCD_Barrier(universe);
+    if (!isFakeTuple(i)) {
+      WITH_RANK << iteration << "-th doubles\n";
+      WITH_CHRONO(chrono["oneshot-unwrap"],
+      WITH_CHRONO(chrono["unwrap"],
+      WITH_CHRONO(chrono["unwrap:doubles"],
+        for (auto& u: decltype(unions){&abph, &hhha, &taphh, &tabhh}) {
+          u->unwrapAll(abc);
+        }
+      )))
+      chrono["oneshot-doubles"].start();
+      chrono["doubles"].start();
+      doublesContribution( abc, (size_t)No, (size_t)Nv
+                         // -- VABCI
+                         , abph.unwrapSlice(Slice::AB, abc)
+                         , abph.unwrapSlice(Slice::AC, abc)
+                         , abph.unwrapSlice(Slice::BC, abc)
+                         , abph.unwrapSlice(Slice::BA, abc)
+                         , abph.unwrapSlice(Slice::CA, abc)
+                         , abph.unwrapSlice(Slice::CB, abc)
+                         // -- VHHHA
+                         , hhha.unwrapSlice(Slice::A, abc)
+                         , hhha.unwrapSlice(Slice::B, abc)
+                         , hhha.unwrapSlice(Slice::C, abc)
+                         // -- TA
+                         , taphh.unwrapSlice(Slice::A, abc)
+                         , taphh.unwrapSlice(Slice::B, abc)
+                         , taphh.unwrapSlice(Slice::C, abc)
+                         // -- TABIJ
+                         , tabhh.unwrapSlice(Slice::AB, abc)
+                         , tabhh.unwrapSlice(Slice::AC, abc)
+                         , tabhh.unwrapSlice(Slice::BC, abc)
+                         // -- TIJK
+                         , Tijk.data()
+                         , chrono
+                         );
+      WITH_RANK << iteration << "-th doubles done\n";
+      chrono["doubles"].stop();
+      chrono["oneshot-doubles"].stop();
+    }
+
+    // COMPUTE SINGLES =================================================== {{{1
+    OCD_Barrier(universe);
+    if (!isFakeTuple(i)) {
+      WITH_CHRONO(chrono["oneshot-unwrap"],
+      WITH_CHRONO(chrono["unwrap"],
+      WITH_CHRONO(chrono["unwrap:singles"],
+        abhh.unwrapAll(abc);
+      )))
+      chrono["reorder"].start();
+      for (size_t I(0); I < Zijk.size(); I++) Zijk[I] = Tijk[I];
+      chrono["reorder"].stop();
+      chrono["singles"].start();
+      singlesContribution( No, Nv, abc
+                         , Tai.data()
+                         , abhh.unwrapSlice(Slice::AB, abc)
+                         , abhh.unwrapSlice(Slice::AC, abc)
+                         , abhh.unwrapSlice(Slice::BC, abc)
+                         , Zijk.data());
+      chrono["singles"].stop();
+    }
+
+
+    // COMPUTE ENERGY ==================================================== {{{1
+    if (!isFakeTuple(i)) {
+      double tupleEnergy(0.);
+
+      int distinct(0);
+      if (abc[0] == abc[1]) distinct++;
+      if (abc[1] == abc[2]) distinct--;
+      const double epsabc(epsa[abc[0]] + epsa[abc[1]] + epsa[abc[2]]);
+
+      chrono["energy"].start();
+      if ( distinct == 0)
+        tupleEnergy = getEnergyDistinct(epsabc, epsi, Tijk, Zijk);
+      else
+        tupleEnergy = getEnergySame(epsabc, epsi, Tijk, Zijk);
+      chrono["energy"].stop();
+
+#if defined(HAVE_OCD) || defined(ATRIP_PRINT_TUPLES)
+      tupleEnergies[abc] = tupleEnergy;
+#endif
+
+      energy += tupleEnergy;
+
+#ifdef HAVE_OCD
+      auto const print_slices
+        = [](ABCTuple const& abc, ABCTuple const& want, SliceUnion& u) {
+          if (abc != want) return;
+
+          for (auto type: u.sliceTypes) {
+            auto const& ptr = u.unwrapSlice(type, abc);
+            auto const& slice = Slice::findByTypeAbc(u.slices, type, abc);
+            WITH_RANK << "__print_slice__:n" << u.name << " "
+                      << pretty_print(abc) << " "
+                      << pretty_print(slice.info)
+                      ;
+            for (size_t i = 0; i < 20; i++) std::cout << ptr[i] << ", ";
+            std::cout << std::endl;
+          }
+        };
+#endif
+
+    if (isFakeTuple(i)) {
+      // fake iterations should also unwrap whatever they got
+      WITH_RANK << iteration
+                << "th unwrapping because of fake in "
+                << i << "\n";
+      for (auto& u: unions) u->unwrapAll(abc);
+    }
+
+#ifdef HAVE_OCD
+    for (auto const& u: unions) {
+      WITH_RANK << "__dups__:"
+                << iteration
+                << "-th n" << u->name << " checking duplicates\n";
+      u->checkForDuplicates();
+    }
+#endif
+
+
+    // CLEANUP UNIONS ===================================================={{{1
+    OCD_Barrier(universe);
+    if (abcNext) {
+      chrono["gc"].start();
+      WITH_RANK << "__gc__:" << iteration << "-th cleaning up.......\n";
+      for (auto& u: unions) {
+
+        u->unwrapAll(abc);
+        WITH_RANK << "__gc__:n" << u->name  << " :it " << iteration
+                  << " :abc " << pretty_print(abc)
+                  << " :abcN " << pretty_print(*abcNext)
+                  << "\n";
+        for (auto const& slice: u->slices)
+          WITH_RANK << "__gc__:guts:" << slice.info << "\n";
+        u->clearUnusedSlicesForNext(*abcNext);
+
+        WITH_RANK << "__gc__: checking validity\n";
+
+#ifdef HAVE_OCD
+        // check for validity of the slices
+        for (auto type: u->sliceTypes) {
+          auto tuple = Slice::subtupleBySlice(abc, type);
+        for (auto& slice: u->slices) {
+          if ( slice.info.type == type
+             && slice.info.tuple == tuple
+             && slice.isDirectlyFetchable()
+             ) {
+            if (slice.info.state == Slice::Dispatched)
+              throw std::domain_error( "This slice should not be undispatched! "
+                                     + pretty_print(slice.info));
+          }
+        }
+        }
+#endif
+
+
+      }
+      chrono["gc"].stop();
+    }
+
+      WITH_RANK << iteration << "-th cleaning up....... DONE\n";
+    }
+
+    // CLEAN CHRONO ======================================================{{{1
+    chrono["iterations"].stop();
+    { // TODO: REMOVEME
+      chrono["oneshot-doubles"].clear();
+      chrono["oneshot-mpi:barrier"].clear();
+      chrono["oneshot-db:comm:allgather"].clear();
+      chrono["oneshot-unwrap"].clear();
+    }
+
+    // ITERATION END ====================================================={{{1
+  }  // END OF MAIN LOOP
+
+  MPI_Barrier(universe);
+
+  // PRINT TUPLES ========================================================={{{1
+#if defined(HAVE_OCD) || defined(ATRIP_PRINT_TUPLES)
+  LOG(0,"Atrip") << "tuple energies" << "\n";
+  for (size_t i = 0; i < np; i++) {
+    MPI_Barrier(universe);
+    for (auto const& pair: tupleEnergies) {
+      if (i == rank)
+        std::cout << pair.first[0]
+                  << " " << pair.first[1]
+                  << " " << pair.first[2]
+                  << std::setprecision(15) << std::setw(23)
+                  << " tupleEnergy: " << pair.second
+                  << "\n"
+                  ;
+    }
+  }
+#endif
+
+  // COMMUNICATE THE ENERGIES ============================================={{{1
+  LOG(0,"Atrip") << "COMMUNICATING ENERGIES \n";
+  double globalEnergy = 0;
+  MPI_Reduce(&energy, &globalEnergy, 1, MPI_DOUBLE, MPI_SUM, 0, universe);
+
+  WITH_RANK << "local energy " << energy << "\n";
+  LOG(0,"LOOP FINISHED, energy")
+    << std::setprecision(15) << std::setw(23)
+    << globalEnergy << std::endl;
+
+  // PRINT TIMINGS {{{1
+  for (auto const& pair: chrono)
+    LOG(0,"atrip:chrono") << pair.first << " "
+                          << pair.second.count() << std::endl;
+
+
+  LOG(0, "atrip:flops")
+    << nIterations * doublesFlops / chrono["doubles"].count() << "\n";
+
+  return { energy };
+
+}
+// Atrip:2 ends here
				`@@ -0,0 +1 @@`
				`ATRIP_SOURCES = include/atrip.hpp src/atrip/Atrip.cxx include/atrip/Atrip.hpp include/atrip/Debug.hpp include/atrip/Equations.hpp include/atrip/Tuples.hpp include/atrip/Unions.hpp include/atrip/RankMap.hpp include/atrip/Blas.hpp include/atrip/Utils.hpp include/atrip/SliceUnion.hpp include/atrip/Slice.hpp`