From cdbad963b01b7205b1065f6ca6e90b65318fe387 Mon Sep 17 00:00:00 2001
From: Alejandro Gallo <aamsgallo@gmail.com>
Date: Tue, 30 Nov 2021 12:04:44 +0100
Subject: [PATCH 01/22] Add user printing mechanism (cherry pick)

---
 atrip.org               | 97 +++++++++++++++++++++++++++++++++--------
 include/atrip/Atrip.hpp |  3 +-
 include/atrip/Debug.hpp | 47 ++++++++++++++++----
 src/atrip/Atrip.cxx     | 32 +++++++++-----
 4 files changed, 142 insertions(+), 37 deletions(-)
diff --git a/atrip.org b/atrip.org
index a69fa03..72d2601 100644
--- a/atrip.org
+++ b/atrip.org
@@ -1847,7 +1847,7 @@ namespace atrip {
                         , *Vhhhp = nullptr
                         , *Vppph = nullptr
                         ;
-      int maxIterations = 0, iterationMod = -1;
+      int maxIterations = 0, iterationMod = -1, percentageMod = -1;
       bool barrier = false;
       bool chrono = false;
       Input& with_epsilon_i(CTF::Tensor<double> * t) { ei = t; return *this; }
@@ -1859,6 +1859,7 @@ namespace atrip {
       Input& with_Vabci(CTF::Tensor<double> * t) { Vppph = t; return *this; }
       Input& with_maxIterations(int i) { maxIterations = i; return *this; }
       Input& with_iterationMod(int i) { iterationMod = i; return *this; }
+      Input& with_percentageMod(int i) { percentageMod = i; return *this; }
       Input& with_barrier(bool i) { barrier = i; return *this; }
       Input& with_chrono(bool i) { chrono = i; return *this; }
     };
@@ -1888,6 +1889,12 @@ using namespace atrip;
 int Atrip::rank;
 int Atrip::np;
 
+// user printing block
+IterationDescriptor IterationDescription::descriptor;
+void atrip::registerIterationDescriptor(IterationDescriptor d) {
+  IterationDescription::descriptor = d;
+}
+
 void Atrip::init()  {
   MPI_Comm_rank(MPI_COMM_WORLD, &Atrip::rank);
   MPI_Comm_size(MPI_COMM_WORLD, &Atrip::np);
@@ -1968,15 +1975,6 @@ Atrip::Output Atrip::run(Atrip::Input const& in) {
   auto abcIndex = getABCRange(np, rank, tuplesList);
   size_t nIterations = abcIndex.second - abcIndex.first;
 
-#ifdef ATRIP_BENCHMARK
-  { const size_t maxIterations = in.maxIterations;
-    if (maxIterations != 0) {
-      abcIndex.second = abcIndex.first + maxIterations % (nIterations + 1);
-      nIterations = maxIterations % (nIterations + 1);
-    }
-  }
-#endif
-
   WITH_RANK << "abcIndex = " << pretty_print(abcIndex) << "\n";
   LOG(0,"Atrip") << "#iterations: " << nIterations << "\n";
 
@@ -1986,6 +1984,12 @@ Atrip::Output Atrip::run(Atrip::Input const& in) {
 
   double energy(0.);
 
+  size_t iterationMod
+    = (in.percentageMod > 0)
+    ? nIterations * in.percentageMod / 100
+    : in.iterationMod
+    ;
+
 
   auto const isFakeTuple
     = [&tuplesList](size_t const i) { return i >= tuplesList.size(); };
@@ -2151,7 +2155,16 @@ Atrip::Output Atrip::run(Atrip::Input const& in) {
     chrono["mpi:barrier"].stop();
     chrono["oneshot-mpi:barrier"].stop();
 
-    if (iteration % in.iterationMod == 0) {
+    if (iteration % iterationMod == 0) {
+
+      if (IterationDescription::descriptor) {
+        IterationDescription::descriptor({
+          iteration,
+          nIterations,
+          chrono["iterations"].count()
+        });
+      }
+
       LOG(0,"Atrip")
         << "iteration " << iteration
         << " [" << 100 * iteration / nIterations << "%]"
@@ -2419,9 +2432,12 @@ Atrip::Output Atrip::run(Atrip::Input const& in) {
 #+end_src
 
 
-** Debug
+** Debug and Logging
+*** Macros
+
 #+begin_src c++ :tangle (atrip-debug-h)
 #pragma once
+#include <functional>
 #define ATRIP_BENCHMARK
 //#define ATRIP_DONT_SLICE
 #define ATRIP_DEBUG 1
@@ -2429,10 +2445,12 @@ Atrip::Output Atrip::run(Atrip::Input const& in) {
 #define ATRIP_USE_DGEMM
 //#define ATRIP_PRINT_TUPLES
 
-#define LOG(level, name) if (Atrip::rank == 0) std::cout << name << ": "
+#ifndef ATRIP_DEBUG
+#define ATRIP_DEBUG 1
+#endif
 
 #if ATRIP_DEBUG == 4
-#  pragma message("WARNING: You have OCD debugging ABC triples "\
+#  pragma message("WARNING: You have OCD debugging ABC triples "    \
                   "expect GB of output and consult your therapist")
 #  include <dbg.h>
 #  define HAVE_OCD
@@ -2445,7 +2463,7 @@ Atrip::Output Atrip::run(Atrip::Input const& in) {
 #  define WITH_DBG
 #  define DBG(...) dbg(__VA_ARGS__)
 #elif ATRIP_DEBUG == 3
-#  pragma message("WARNING: You have crazy debugging ABC triples,"\
+#  pragma message("WARNING: You have crazy debugging ABC triples,"  \
                   " expect GB of output")
 #  include <dbg.h>
 #  define OCD_Barrier(com)
@@ -2467,7 +2485,7 @@ Atrip::Output Atrip::run(Atrip::Input const& in) {
 #  define WITH_CRAZY_DEBUG if (false)
 #  define WITH_DBG
 #  define DBG(...) dbg(__VA_ARGS__)
-#elif ATRIP_DEBUG == 1
+#else
 #  define OCD_Barrier(com)
 #  define WITH_OCD if (false)
 #  define WITH_ROOT if (false)
@@ -2476,11 +2494,54 @@ Atrip::Output Atrip::run(Atrip::Input const& in) {
 #  define WITH_DBG if (false)
 #  define WITH_CRAZY_DEBUG if (false)
 #  define DBG(...)
-#else
-#  error("ATRIP_DEBUG is not defined!")
 #endif
 #+end_src
 
+Users of the library can provide their own =LOG= macro by defining it before
+this header is included; if it is not defined, it defaults to the following:
+
+#+begin_src c++ :tangle (atrip-debug-h)
+#ifndef LOG
+#define LOG(level, name) if (Atrip::rank == 0) std::cout << name << ": "
+#endif
+#+end_src
+
+Furthermore, if you do not wish to see any output from ATRIP's =LOG= macro,
+simply define =ATRIP_NO_OUTPUT= before this header is included.
+
+
+#+begin_src c++ :tangle (atrip-debug-h)
+#ifdef ATRIP_NO_OUTPUT
+#  undef LOG
+#  define LOG(level, name) if (false) std::cout << name << ": "
+#endif
+#+end_src
+
+*** Iteration informer
+
+In general a user of the library will want to write some messages as the iterations progress.
+A developer can register a callback for this purpose with =registerIterationDescriptor=.
+The callback receives an =IterationDescription= structure (see [[IterationDescriptor]])
+and should return nothing.
+
+#+name: IterationDescriptor
+#+begin_src c++ :tangle (atrip-debug-h)
+namespace atrip {
+
+  struct IterationDescription;
+  using IterationDescriptor = std::function<void(IterationDescription const&)>;
+  struct IterationDescription {
+    static IterationDescriptor descriptor;
+    size_t currentIteration;
+    size_t totalIterations;
+    double currentElapsedTime;
+  };
+
+  void registerIterationDescriptor(IterationDescriptor);
+
+}
+#+end_src
+
 ** Include header
 
 #+begin_src c++ :tangle (atrip-main-h)
diff --git a/include/atrip/Atrip.hpp b/include/atrip/Atrip.hpp
index a0cad96..a8bcd78 100644
--- a/include/atrip/Atrip.hpp
+++ b/include/atrip/Atrip.hpp
@@ -24,7 +24,7 @@ namespace atrip {
                         , *Vhhhp = nullptr
                         , *Vppph = nullptr
                         ;
-      int maxIterations = 0, iterationMod = -1;
+      int maxIterations = 0, iterationMod = -1, percentageMod = -1;
       bool barrier = false;
       bool chrono = false;
       Input& with_epsilon_i(CTF::Tensor<double> * t) { ei = t; return *this; }
@@ -36,6 +36,7 @@ namespace atrip {
       Input& with_Vabci(CTF::Tensor<double> * t) { Vppph = t; return *this; }
       Input& with_maxIterations(int i) { maxIterations = i; return *this; }
       Input& with_iterationMod(int i) { iterationMod = i; return *this; }
+      Input& with_percentageMod(int i) { percentageMod = i; return *this; }
       Input& with_barrier(bool i) { barrier = i; return *this; }
       Input& with_chrono(bool i) { chrono = i; return *this; }
     };
diff --git a/include/atrip/Debug.hpp b/include/atrip/Debug.hpp
index 9153954..6bdfde2 100644
--- a/include/atrip/Debug.hpp
+++ b/include/atrip/Debug.hpp
@@ -1,5 +1,6 @@
-// [[file:../../atrip.org::*Debug][Debug:1]]
+// [[file:../../atrip.org::*Macros][Macros:1]]
 #pragma once
+#include <functional>
 #define ATRIP_BENCHMARK
 //#define ATRIP_DONT_SLICE
 #define ATRIP_DEBUG 1
@@ -7,10 +8,12 @@
 #define ATRIP_USE_DGEMM
 //#define ATRIP_PRINT_TUPLES
 
-#define LOG(level, name) if (Atrip::rank == 0) std::cout << name << ": "
+#ifndef ATRIP_DEBUG
+#define ATRIP_DEBUG 1
+#endif
 
 #if ATRIP_DEBUG == 4
-#  pragma message("WARNING: You have OCD debugging ABC triples "\
+#  pragma message("WARNING: You have OCD debugging ABC triples "    \
                   "expect GB of output and consult your therapist")
 #  include <dbg.h>
 #  define HAVE_OCD
@@ -23,7 +26,7 @@
 #  define WITH_DBG
 #  define DBG(...) dbg(__VA_ARGS__)
 #elif ATRIP_DEBUG == 3
-#  pragma message("WARNING: You have crazy debugging ABC triples,"\
+#  pragma message("WARNING: You have crazy debugging ABC triples,"  \
                   " expect GB of output")
 #  include <dbg.h>
 #  define OCD_Barrier(com)
@@ -45,7 +48,7 @@
 #  define WITH_CRAZY_DEBUG if (false)
 #  define WITH_DBG
 #  define DBG(...) dbg(__VA_ARGS__)
-#elif ATRIP_DEBUG == 1
+#else
 #  define OCD_Barrier(com)
 #  define WITH_OCD if (false)
 #  define WITH_ROOT if (false)
@@ -54,7 +57,35 @@
 #  define WITH_DBG if (false)
 #  define WITH_CRAZY_DEBUG if (false)
 #  define DBG(...)
-#else
-#  error("ATRIP_DEBUG is not defined!")
 #endif
-// Debug:1 ends here
+// Macros:1 ends here
+
+// [[file:../../atrip.org::*Macros][Macros:2]]
+#ifndef LOG
+#define LOG(level, name) if (Atrip::rank == 0) std::cout << name << ": "
+#endif
+// Macros:2 ends here
+
+// [[file:../../atrip.org::*Macros][Macros:3]]
+#ifdef ATRIP_NO_OUTPUT
+#  undef LOG
+#  define LOG(level, name) if (false) std::cout << name << ": "
+#endif
+// Macros:3 ends here
+
+// [[file:../../atrip.org::IterationDescriptor][IterationDescriptor]]
+namespace atrip {
+
+  struct IterationDescription;
+  using IterationDescriptor = std::function<void(IterationDescription const&)>;
+  struct IterationDescription {
+    static IterationDescriptor descriptor;
+    size_t currentIteration;
+    size_t totalIterations;
+    double currentElapsedTime;
+  };
+
+  void registerIterationDescriptor(IterationDescriptor);
+
+}
+// IterationDescriptor ends here
diff --git a/src/atrip/Atrip.cxx b/src/atrip/Atrip.cxx
index 06c4079..a6addc6 100644
--- a/src/atrip/Atrip.cxx
+++ b/src/atrip/Atrip.cxx
@@ -12,6 +12,12 @@ using namespace atrip;
 int Atrip::rank;
 int Atrip::np;
 
+// user printing block
+IterationDescriptor IterationDescription::descriptor;
+void atrip::registerIterationDescriptor(IterationDescriptor d) {
+  IterationDescription::descriptor = d;
+}
+
 void Atrip::init()  {
   MPI_Comm_rank(MPI_COMM_WORLD, &Atrip::rank);
   MPI_Comm_size(MPI_COMM_WORLD, &Atrip::np);
@@ -92,15 +98,6 @@ Atrip::Output Atrip::run(Atrip::Input const& in) {
   auto abcIndex = getABCRange(np, rank, tuplesList);
   size_t nIterations = abcIndex.second - abcIndex.first;
 
-#ifdef ATRIP_BENCHMARK
-  { const size_t maxIterations = in.maxIterations;
-    if (maxIterations != 0) {
-      abcIndex.second = abcIndex.first + maxIterations % (nIterations + 1);
-      nIterations = maxIterations % (nIterations + 1);
-    }
-  }
-#endif
-
   WITH_RANK << "abcIndex = " << pretty_print(abcIndex) << "\n";
   LOG(0,"Atrip") << "#iterations: " << nIterations << "\n";
 
@@ -110,6 +107,12 @@ Atrip::Output Atrip::run(Atrip::Input const& in) {
 
   double energy(0.);
 
+  size_t iterationMod
+    = (in.percentageMod > 0)
+    ? nIterations * in.percentageMod / 100
+    : in.iterationMod
+    ;
+
 
   auto const isFakeTuple
     = [&tuplesList](size_t const i) { return i >= tuplesList.size(); };
@@ -275,7 +278,16 @@ Atrip::Output Atrip::run(Atrip::Input const& in) {
     chrono["mpi:barrier"].stop();
     chrono["oneshot-mpi:barrier"].stop();
 
-    if (iteration % in.iterationMod == 0) {
+    if (iteration % iterationMod == 0) {
+
+      if (IterationDescription::descriptor) {
+        IterationDescription::descriptor({
+          iteration,
+          nIterations,
+          chrono["iterations"].count()
+        });
+      }
+
       LOG(0,"Atrip")
         << "iteration " << iteration
         << " [" << 100 * iteration / nIterations << "%]"

From 6fa915db3ae4ba06463b684dfa879421fa2463a2 Mon Sep 17 00:00:00 2001
From: Alejandro Gallo <aamsgallo@gmail.com>
Date: Tue, 14 Dec 2021 17:50:22 +0100
Subject: [PATCH 02/22] Add 1% printing

---
 atrip.org | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/atrip.org b/atrip.org
index 72d2601..30be0ca 100644
--- a/atrip.org
+++ b/atrip.org
@@ -1821,6 +1821,7 @@ namespace atrip {
 #+end_src
 
 ** Atrip
+*** Header
 #+begin_src c++ :tangle (atrip-atrip-h)
 #pragma once
 #include <sstream>
@@ -1984,13 +1985,16 @@ Atrip::Output Atrip::run(Atrip::Input const& in) {
 
   double energy(0.);
 
-  size_t iterationMod
-    = (in.percentageMod > 0)
-    ? nIterations * in.percentageMod / 100
-    : in.iterationMod
+  const size_t
+      iterationMod = (in.percentageMod > 0)
+                  ? nIterations * in.percentageMod / 100
+                  : in.iterationMod
+
+    , iteration1Percent = nIterations * 0.01
     ;
 
 
+
   auto const isFakeTuple
     = [&tuplesList](size_t const i) { return i >= tuplesList.size(); };
 
@@ -2155,7 +2159,7 @@ Atrip::Output Atrip::run(Atrip::Input const& in) {
     chrono["mpi:barrier"].stop();
     chrono["oneshot-mpi:barrier"].stop();
 
-    if (iteration % iterationMod == 0) {
+    if (iteration % iterationMod == 0 || iteration == iteration1Percent) {
 
       if (IterationDescription::descriptor) {
         IterationDescription::descriptor({

From b1175997af0128a2e9b2b3acf66daefc4741e960 Mon Sep 17 00:00:00 2001
From: Alejandro Gallo <aamsgallo@gmail.com>
Date: Tue, 14 Dec 2021 18:00:25 +0100
Subject: [PATCH 03/22] Add tangled code

---
 src/atrip/Atrip.cxx | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/atrip/Atrip.cxx b/src/atrip/Atrip.cxx
index a6addc6..64dea9b 100644
--- a/src/atrip/Atrip.cxx
+++ b/src/atrip/Atrip.cxx
@@ -107,13 +107,16 @@ Atrip::Output Atrip::run(Atrip::Input const& in) {
 
   double energy(0.);
 
-  size_t iterationMod
-    = (in.percentageMod > 0)
-    ? nIterations * in.percentageMod / 100
-    : in.iterationMod
+  const size_t
+      iterationMod = (in.percentageMod > 0)
+                  ? nIterations * in.percentageMod / 100
+                  : in.iterationMod
+
+    , iteration1Percent = nIterations * 0.01
     ;
 
 
+
   auto const isFakeTuple
     = [&tuplesList](size_t const i) { return i >= tuplesList.size(); };
 
@@ -278,7 +281,7 @@ Atrip::Output Atrip::run(Atrip::Input const& in) {
     chrono["mpi:barrier"].stop();
     chrono["oneshot-mpi:barrier"].stop();
 
-    if (iteration % iterationMod == 0) {
+    if (iteration % iterationMod == 0 || iteration == iteration1Percent) {
 
       if (IterationDescription::descriptor) {
         IterationDescription::descriptor({

From 8c39827061c12598130e21129f162b6273a2a22b Mon Sep 17 00:00:00 2001
From: Alejandro Gallo <aamsgallo@gmail.com>
Date: Thu, 27 Jan 2022 20:39:00 +0100
Subject: [PATCH 04/22] Add type traits

---
 atrip.org | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/atrip.org b/atrip.org
index 30be0ca..43a7f04 100644
--- a/atrip.org
+++ b/atrip.org
@@ -8,7 +8,6 @@ The algorithm uses two main data types, the =Slice= and the
 
 ** The slice
 
-
 #+begin_src c++ :tangle (atrip-slice-h)
 #pragma once
 #include <iostream>
@@ -18,9 +17,20 @@ The algorithm uses two main data types, the =Slice= and the
 
 #include <atrip/Tuples.hpp>
 #include <atrip/Utils.hpp>
+#include <atrip/Blas.hpp>
 
 namespace atrip {
 
+namespace traits {
+  template <typename FF> bool isComplex() { return false; };
+  template <> bool isComplex<Complex>() { return true; };
+namespace mpi {
+  template <typename FF> MPI_Datatype datatypeOf(void);
+  template <> MPI_Datatype datatypeOf<double>() { return MPI_DOUBLE; }
+  template <> MPI_Datatype datatypeOf<Complex>() { return MPI_DOUBLE_COMPLEX; }
+}
+}
+
 
 struct Slice {
 

From 61662e27175b091c2fda40b0ac349c8bbd69e955 Mon Sep 17 00:00:00 2001
From: Alejandro Gallo <aamsgallo@gmail.com>
Date: Thu, 27 Jan 2022 20:40:37 +0100
Subject: [PATCH 05/22] Templatize Slice

---
 atrip.org | 48 +++++++++++++++++++++++++-----------------------
 1 file changed, 25 insertions(+), 23 deletions(-)

diff --git a/atrip.org b/atrip.org
index 43a7f04..8d06e58 100644
--- a/atrip.org
+++ b/atrip.org
@@ -32,9 +32,9 @@ namespace mpi {
 }
 
 
+template <typename F=double>
 struct Slice {
 
-  using F = double;
 #+end_src
 
 A slice is the concept of a subset of values of a given tensor.
@@ -124,8 +124,8 @@ As an example, for the doubles amplitudes \( T^{ab}_{ij} \), one need two kinds
 
   // DATABASE ==========================================================={{{1
   struct LocalDatabaseElement {
-    Slice::Name name;
-    Slice::Info info;
+    Slice<F>::Name name;
+    Slice<F>::Info info;
   };
   using LocalDatabase = std::vector<LocalDatabaseElement>;
   using Database = LocalDatabase;
@@ -148,7 +148,7 @@ As an example, for the doubles amplitudes \( T^{ab}_{ij} \), one need two kinds
         constexpr int n = 2;
         // create a sliceLocation to measure in the current architecture
         // the packing of the struct
-        Slice::Location measure;
+        Slice<F>::Location measure;
         MPI_Datatype dt;
         const std::vector<int> lengths(n, 1);
         const MPI_Datatype types[n] = {usizeDt(), usizeDt()};
@@ -172,7 +172,7 @@ As an example, for the doubles amplitudes \( T^{ab}_{ij} \), one need two kinds
       static MPI_Datatype sliceInfo () {
         constexpr int n = 5;
         MPI_Datatype dt;
-        Slice::Info measure;
+        Slice<F>::Info measure;
         const std::vector<int> lengths(n, 1);
         const MPI_Datatype types[n]
           = { vector(2, usizeDt())
@@ -244,10 +244,10 @@ As an example, for the doubles amplitudes \( T^{ab}_{ij} \), one need two kinds
      ,* It is important here to return a reference to a Slice
      ,* not to accidentally copy the associated buffer of the slice.
      ,*/
-    static Slice& findOneByType(std::vector<Slice> &slices, Slice::Type type) {
+    static Slice<F>& findOneByType(std::vector<Slice<F>> &slices, Slice<F>::Type type) {
         const auto sliceIt
           = std::find_if(slices.begin(), slices.end(),
-                         [&type](Slice const& s) {
+                         [&type](Slice<F> const& s) {
                            return type == s.info.type;
                          });
         WITH_CRAZY_DEBUG
@@ -262,11 +262,11 @@ As an example, for the doubles amplitudes \( T^{ab}_{ij} \), one need two kinds
      ,* Check if an info has
      ,*
      ,*/
-    static std::vector<Slice*> hasRecycledReferencingToIt
-      ( std::vector<Slice> &slices
+    static std::vector<Slice<F>*> hasRecycledReferencingToIt
+      ( std::vector<Slice<F>> &slices
       , Info const& info
       ) {
-      std::vector<Slice*> result;
+      std::vector<Slice<F>*> result;
 
       for (auto& s: slices)
         if (  s.info.recycling == info.type
@@ -277,11 +277,11 @@ As an example, for the doubles amplitudes \( T^{ab}_{ij} \), one need two kinds
       return result;
     }
 
-    static Slice&
-    findRecycledSource (std::vector<Slice> &slices, Slice::Info info) {
+    static Slice<F>&
+    findRecycledSource (std::vector<Slice<F>> &slices, Slice<F>::Info info) {
       const auto sliceIt
         = std::find_if(slices.begin(), slices.end(),
-                       [&info](Slice const& s) {
+                       [&info](Slice<F> const& s) {
                          return info.recycling == s.info.type
                              && info.tuple == s.info.tuple
                              && State::Recycled != s.info.state
@@ -301,15 +301,15 @@ As an example, for the doubles amplitudes \( T^{ab}_{ij} \), one need two kinds
       return *sliceIt;
     }
 
-    static Slice& findByTypeAbc
-      ( std::vector<Slice> &slices
-      , Slice::Type type
+    static Slice<F>& findByTypeAbc
+      ( std::vector<Slice<F>> &slices
+      , Slice<F>::Type type
       , ABCTuple const& abc
       ) {
-        const auto tuple = Slice::subtupleBySlice(abc, type);
+        const auto tuple = Slice<F>::subtupleBySlice(abc, type);
         const auto sliceIt
           = std::find_if(slices.begin(), slices.end(),
-                         [&type, &tuple](Slice const& s) {
+                         [&type, &tuple](Slice<F> const& s) {
                            return type == s.info.type
                                && tuple == s.info.tuple
                                ;
@@ -329,11 +329,11 @@ As an example, for the doubles amplitudes \( T^{ab}_{ij} \), one need two kinds
         return *sliceIt;
     }
 
-    static Slice& findByInfo(std::vector<Slice> &slices,
-                             Slice::Info const& info) {
+    static Slice<F>& findByInfo(std::vector<Slice<F>> &slices,
+                             Slice<F>::Info const& info) {
         const auto sliceIt
           = std::find_if(slices.begin(), slices.end(),
-                         [&info](Slice const& s) {
+                         [&info](Slice<F> const& s) {
                            // TODO: maybe implement comparison in Info struct
                            return info.type == s.info.type
                                && info.state == s.info.state
@@ -479,13 +479,15 @@ As an example, for the doubles amplitudes \( T^{ab}_{ij} \), one need two kinds
   }; // struct Slice
 
 
-std::ostream& operator<<(std::ostream& out, Slice::Location const& v) {
+template <typename F=double>
+std::ostream& operator<<(std::ostream& out, typename Slice<F>::Location const& v) {
   // TODO: remove me
   out << "{.r(" << v.rank << "), .s(" << v.source << ")};";
   return out;
 }
 
-std::ostream& operator<<(std::ostream& out, Slice::Info const& i) {
+template <typename F=double>
+std::ostream& operator<<(std::ostream& out, typename Slice<F>::Info const& i) {
   out << "«t" << i.type << ", s" << i.state << "»"
       << " ⊙ {" << i.from.rank << ", " << i.from.source << "}"
       << " ∴ {" << i.tuple[0] << ", " << i.tuple[1] << "}"

From 4543e712b3b576535991d1210bdb6a645abd823d Mon Sep 17 00:00:00 2001
From: Alejandro Gallo <aamsgallo@gmail.com>
Date: Thu, 27 Jan 2022 20:41:08 +0100
Subject: [PATCH 06/22] Templatize RankMap

---
 atrip.org | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/atrip.org b/atrip.org
index 8d06e58..e38133b 100644
--- a/atrip.org
+++ b/atrip.org
@@ -549,6 +549,8 @@ namespace atrip {
 #include <atrip/Slice.hpp>
 
 namespace atrip {
+
+  template <typename F=double>
   struct RankMap {
 
     std::vector<size_t> const lengths;
@@ -561,7 +563,7 @@ namespace atrip {
                             1UL, std::multiplies<size_t>()))
     { assert(lengths.size() <= 2); }
 
-    size_t find(Slice::Location const& p) const noexcept {
+    size_t find(typename Slice<F>::Location const& p) const noexcept {
       return p.source * np + p.rank;
     }
 
@@ -581,10 +583,10 @@ namespace atrip {
       return source == nSources() && isPaddingRank(rank);
     }
 
-    Slice::Location
-    find(ABCTuple const& abc, Slice::Type sliceType) const noexcept {
+    typename Slice<F>::Location
+    find(ABCTuple const& abc, typename Slice<F>::Type sliceType) const noexcept {
       // tuple = {11, 8} when abc = {11, 8, 9} and sliceType = AB
-      const auto tuple = Slice::subtupleBySlice(abc, sliceType);
+      const auto tuple = Slice<F>::subtupleBySlice(abc, sliceType);
 
       const size_t index
         = tuple[0]

From 6776a7134c92eccba0a7e6b0643d2ec0b59337c9 Mon Sep 17 00:00:00 2001
From: Alejandro Gallo <aamsgallo@gmail.com>
Date: Thu, 27 Jan 2022 20:43:41 +0100
Subject: [PATCH 07/22] Templatize SliceUnion

---
 atrip.org | 108 ++++++++++++++++++++++++++++--------------------------
 1 file changed, 56 insertions(+), 52 deletions(-)

diff --git a/atrip.org b/atrip.org
index e38133b..52398c1 100644
--- a/atrip.org
+++ b/atrip.org
@@ -613,8 +613,8 @@ namespace atrip {
 
 namespace atrip {
 
+  template <typename F=double>
   struct SliceUnion {
-    using F = double;
     using Tensor = CTF::Tensor<F>;
 
     virtual void
@@ -627,7 +627,7 @@ namespace atrip {
      * This means that there can be at most one slice with a given Ty_x_Tu.
      */
     void checkForDuplicates() const {
-      std::vector<Slice::Ty_x_Tu> tytus;
+      std::vector<typename Slice<F>::Ty_x_Tu> tytus;
       for (auto const& s: slices) {
         if (s.isFree()) continue;
         tytus.push_back({s.info.type, s.info.tuple});
@@ -640,13 +640,13 @@ namespace atrip {
 
     }
 
-    std::vector<Slice::Ty_x_Tu> neededSlices(ABCTuple const& abc) {
-      std::vector<Slice::Ty_x_Tu> needed(sliceTypes.size());
+    std::vector<typename Slice<F>::Ty_x_Tu> neededSlices(ABCTuple const& abc) {
+      std::vector<typename Slice<F>::Ty_x_Tu> needed(sliceTypes.size());
       // build the needed vector
       std::transform(sliceTypes.begin(), sliceTypes.end(),
                      needed.begin(),
-                     [&abc](Slice::Type const type) {
-                       auto tuple = Slice::subtupleBySlice(abc, type);
+                     [&abc](typename Slice<F>::Type const type) {
+                       auto tuple = Slice<F>::subtupleBySlice(abc, type);
                        return std::make_pair(type, tuple);
                      });
       return needed;
@@ -671,8 +671,9 @@ namespace atrip {
      * slices.
      *
      */
-    Slice::LocalDatabase buildLocalDatabase(ABCTuple const& abc) {
-      Slice::LocalDatabase result;
+    typename
+    Slice<F>::LocalDatabase buildLocalDatabase(ABCTuple const& abc) {
+      typename Slice<F>::LocalDatabase result;
 
       auto const needed = neededSlices(abc);
 
@@ -702,7 +703,7 @@ namespace atrip {
           // need
           auto const& it
             = std::find_if(slices.begin(), slices.end(),
-                           [&tuple, &type](Slice const& other) {
+                           [&tuple, &type](Slice<F> const& other) {
                              return other.info.tuple == tuple
                                  && other.info.type == type
                                     // we only want another slice when it
@@ -728,7 +729,7 @@ namespace atrip {
         // tuple and that has a valid data pointer.
         auto const& recycleIt
           = std::find_if(slices.begin(), slices.end(),
-                         [&tuple, &type](Slice const& other) {
+                         [&tuple, &type](Slice<F> const& other) {
                            return other.info.tuple == tuple
                                && other.info.type != type
                                && other.isRecyclable()
@@ -739,13 +740,13 @@ namespace atrip {
         // (which should exist by construction :THINK)
         //
         if (recycleIt != slices.end()) {
-          auto& blank = Slice::findOneByType(slices, Slice::Blank);
+          auto& blank = Slice<F>::findOneByType(slices, Slice<F>::Blank);
           // TODO: formalize this through a method to copy information
           //       from another slice
           blank.data = recycleIt->data;
           blank.info.type = type;
           blank.info.tuple = tuple;
-          blank.info.state = Slice::Recycled;
+          blank.info.state = Slice<F>::Recycled;
           blank.info.from = from;
           blank.info.recycling = recycleIt->info.type;
           result.push_back({name, blank.info});
@@ -772,17 +773,17 @@ namespace atrip {
                     << " for tuple " << tuple[0] << ", " << tuple[1]
                     << "\n"
                     ;
-          auto& blank = Slice::findOneByType(slices, Slice::Blank);
+          auto& blank = Slice<F>::findOneByType(slices, Slice<F>::Blank);
           blank.info.type = type;
           blank.info.tuple = tuple;
           blank.info.from = from;
 
           // Handle self sufficiency
           blank.info.state = Atrip::rank == from.rank
-                           ? Slice::SelfSufficient
-                           : Slice::Fetch
+                           ? Slice<F>::SelfSufficient
+                           : Slice<F>::Fetch
                            ;
-          if (blank.info.state == Slice::SelfSufficient) {
+          if (blank.info.state == Slice<F>::SelfSufficient) {
             blank.data = sources[from.source].data();
           } else {
             if (freePointers.size() == 0)
@@ -826,7 +827,7 @@ namespace atrip {
         // try to find the slice in the needed slices list
         auto const found
           = std::find_if(needed.begin(), needed.end(),
-                         [&slice] (Slice::Ty_x_Tu const& tytu) {
+                         [&slice] (typename Slice<F>::Ty_x_Tu const& tytu) {
                            return slice.info.tuple == tytu.second
                                && slice.info.type == tytu.first
                                ;
@@ -845,7 +846,7 @@ namespace atrip {
 
           // allow to gc unwrapped and recycled, never Fetch,
           // if we have a Fetch slice then something has gone very wrong.
-          if (!slice.isUnwrapped() && slice.info.state != Slice::Recycled)
+          if (!slice.isUnwrapped() && slice.info.state != Slice<F>::Recycled)
             throw
               std::domain_error("Trying to garbage collect "
                                 " a non-unwrapped slice! "
@@ -866,13 +867,13 @@ namespace atrip {
           //  - we should make sure that the data pointer of slice
           //    does not get freed.
           //
-          if (slice.info.state == Slice::Ready) {
+          if (slice.info.state == Slice<F>::Ready) {
             WITH_OCD WITH_RANK
               << "__gc__:" << "checking for data recycled dependencies\n";
             auto recycled
-              = Slice::hasRecycledReferencingToIt(slices, slice.info);
+              = Slice<F>::hasRecycledReferencingToIt(slices, slice.info);
             if (recycled.size()) {
-              Slice* newReady = recycled[0];
+              Slice<F>* newReady = recycled[0];
               WITH_OCD WITH_RANK
                 << "__gc__:" << "swaping recycled "
                 << pretty_print(newReady->info)
@@ -897,8 +898,8 @@ namespace atrip {
 
           // if the slice is self sufficient, do not dare touching the
           // pointer, since it is a pointer to our sources in our rank.
-          if (  slice.info.state == Slice::SelfSufficient
-             || slice.info.state == Slice::Recycled
+          if (  slice.info.state == Slice<F>::SelfSufficient
+             || slice.info.state == Slice<F>::Recycled
              ) {
             freeSlicePointer = false;
           }
@@ -920,7 +921,8 @@ namespace atrip {
           // at this point, let us blank the slice
           WITH_RANK << "~~~:cl(" << name << ")"
                     << " freeing up slice "
-                    << " info " << slice.info
+                    // TODO: make this possible
+                    // << " info " << slice.info
                     << "\n";
           slice.free();
         }
@@ -930,13 +932,13 @@ namespace atrip {
 
     // CONSTRUCTOR
     SliceUnion( Tensor const& sourceTensor
-              , std::vector<Slice::Type> sliceTypes_
+              , std::vector<typename Slice<F>::Type> sliceTypes_
               , std::vector<size_t> sliceLength_
               , std::vector<size_t> paramLength
               , size_t np
               , MPI_Comm child_world
               , MPI_Comm global_world
-              , Slice::Name name_
+              , typename Slice<F>::Name name_
               , size_t nSliceBuffers = 4
               )
               : rankMap(paramLength, np)
@@ -951,13 +953,13 @@ namespace atrip {
               , name(name_)
               , sliceTypes(sliceTypes_)
               , sliceBuffers(nSliceBuffers, sources[0])
-              //, slices(2 * sliceTypes.size(), Slice{ sources[0].size() })
+              //, slices(2 * sliceTypes.size(), Slice<F>{ sources[0].size() })
     { // constructor begin
 
       LOG(0,"Atrip") << "INIT SliceUnion: " << name << "\n";
 
       slices
-        = std::vector<Slice>(2 * sliceTypes.size(), { sources[0].size() });
+        = std::vector<Slice<F>>(2 * sliceTypes.size(), { sources[0].size() });
       // TODO: think exactly ^------------------- about this number
 
       // initialize the freePointers with the pointers to the buffers
@@ -1026,19 +1028,19 @@ namespace atrip {
      * \brief Send asynchronously only if the state is Fetch
      */
     void send( size_t otherRank
-             , Slice::Info const& info
+             , typename Slice<F>::Info const& info
              , size_t tag) const noexcept {
       MPI_Request request;
       bool sendData_p = false;
 
-      if (info.state == Slice::Fetch) sendData_p = true;
+      if (info.state == Slice<F>::Fetch) sendData_p = true;
       // TODO: remove this because I have SelfSufficient
       if (otherRank == info.from.rank)      sendData_p = false;
       if (!sendData_p) return;
 
       MPI_Isend( sources[info.from.source].data()
                , sources[info.from.source].size()
-               , MPI_DOUBLE /* TODO: adapt this with traits */
+               , traits::mpi::datatypeOf<F>()
                , otherRank
                , tag
                , universe
@@ -1052,19 +1054,19 @@ namespace atrip {
     /**
      * \brief Receive asynchronously only if the state is Fetch
      */
-    void receive(Slice::Info const& info, size_t tag) noexcept {
-      auto& slice = Slice::findByInfo(slices, info);
+    void receive(typename Slice<F>::Info const& info, size_t tag) noexcept {
+      auto& slice = Slice<F>::findByInfo(slices, info);
 
       if (Atrip::rank == info.from.rank) return;
 
-      if (slice.info.state == Slice::Fetch) {
+      if (slice.info.state == Slice<F>::Fetch) {
         // TODO: do it through the slice class
-        slice.info.state = Slice::Dispatched;
+        slice.info.state = Slice<F>::Dispatched;
         MPI_Request request;
         slice.request = request;
         MPI_Irecv( slice.data
                  , slice.size
-                 , MPI_DOUBLE // TODO: Adapt this with traits
+                 , traits::mpi::datatypeOf<F>()
                  , info.from.rank
                  , tag
                  , universe
@@ -1078,42 +1080,42 @@ namespace atrip {
       for (auto type: sliceTypes) unwrapSlice(type, abc);
     }
 
-    F* unwrapSlice(Slice::Type type, ABCTuple const& abc) {
+    F* unwrapSlice(typename Slice<F>::Type type, ABCTuple const& abc) {
       WITH_CRAZY_DEBUG
       WITH_RANK << "__unwrap__:slice " << type << " w n "
                 << name
                 << " abc" << pretty_print(abc)
                 << "\n";
-      auto& slice = Slice::findByTypeAbc(slices, type, abc);
-      WITH_RANK << "__unwrap__:info " << slice.info << "\n";
+      auto& slice = Slice<F>::findByTypeAbc(slices, type, abc);
+      //WITH_RANK << "__unwrap__:info " << slice.info << "\n";
       switch  (slice.info.state) {
-        case Slice::Dispatched:
+        case Slice<F>::Dispatched:
           WITH_RANK << "__unwrap__:Fetch: " << &slice
                     << " info " << pretty_print(slice.info)
                     << "\n";
           slice.unwrapAndMarkReady();
           return slice.data;
           break;
-        case Slice::SelfSufficient:
+        case Slice<F>::SelfSufficient:
           WITH_RANK << "__unwrap__:SelfSufficient: " << &slice
                     << " info " << pretty_print(slice.info)
                     << "\n";
           return slice.data;
           break;
-        case Slice::Ready:
+        case Slice<F>::Ready:
           WITH_RANK << "__unwrap__:READY: UNWRAPPED ALREADY" << &slice
                     << " info " << pretty_print(slice.info)
                     << "\n";
           return slice.data;
           break;
-        case Slice::Recycled:
+        case Slice<F>::Recycled:
           WITH_RANK << "__unwrap__:RECYCLED " << &slice
                     << " info " << pretty_print(slice.info)
                     << "\n";
           return unwrapSlice(slice.info.recycling, abc);
           break;
-        case Slice::Fetch:
-        case Slice::Acceptor:
+        case Slice<F>::Fetch:
+        case Slice<F>::Acceptor:
           throw std::domain_error("Can't unwrap an acceptor or fetch slice!");
           break;
         default:
@@ -1122,24 +1124,26 @@ namespace atrip {
       return slice.data;
     }
 
-    const RankMap rankMap;
+    const RankMap<F> rankMap;
     const MPI_Comm world;
     const MPI_Comm universe;
     const std::vector<size_t> sliceLength;
     std::vector< std::vector<F> > sources;
-    std::vector< Slice > slices;
-    Slice::Name name;
-    const std::vector<Slice::Type> sliceTypes;
+    std::vector< Slice<F> > slices;
+    typename Slice<F>::Name name;
+    const std::vector<typename Slice<F>::Type> sliceTypes;
     std::vector< std::vector<F> > sliceBuffers;
     std::set<F*> freePointers;
 
   };
 
-  SliceUnion&
-  unionByName(std::vector<SliceUnion*> const& unions, Slice::Name name) {
+  template <typename F=double>
+  SliceUnion<F>&
+  unionByName(std::vector<SliceUnion<F>*> const& unions,
+              typename Slice<F>::Name name) {
       const auto sliceUnionIt
         = std::find_if(unions.begin(), unions.end(),
-                      [&name](SliceUnion const* s) {
+                      [&name](SliceUnion<F> const* s) {
                         return name == s->name;
                       });
       if (sliceUnionIt == unions.end())

From 05f5bb6104276ebb7ea36f241a19849568c0d16b Mon Sep 17 00:00:00 2001
From: Alejandro Gallo <aamsgallo@gmail.com>
Date: Thu, 27 Jan 2022 20:44:09 +0100
Subject: [PATCH 08/22] Templatize unions

---
 atrip.org | 202 ++++++++++++++++++++++++++++--------------------------
 1 file changed, 104 insertions(+), 98 deletions(-)

diff --git a/atrip.org b/atrip.org
index 52398c1..e2e6e69 100644
--- a/atrip.org
+++ b/atrip.org
@@ -1241,12 +1241,13 @@ and define subclasses of slice unions.
 
 namespace atrip {
 
+  template <typename F=double>
   void sliceIntoVector
-    ( std::vector<double> &v
-    , CTF::Tensor<double> &toSlice
+    ( std::vector<F> &v
+    , CTF::Tensor<F> &toSlice
     , std::vector<int64_t> const low
     , std::vector<int64_t> const up
-    , CTF::Tensor<double> const& origin
+    , CTF::Tensor<F> const& origin
     , std::vector<int64_t> const originLow
     , std::vector<int64_t> const originUp
     ) {
@@ -1273,155 +1274,159 @@ namespace atrip {
                  , origin_.low.data()
                  , origin_.up.data()
                  , 1.0);
-    memcpy(v.data(), toSlice.data, sizeof(double) * v.size());
+    memcpy(v.data(), toSlice.data, sizeof(F) * v.size());
 #endif
 
   }
 
 
-  struct TAPHH : public SliceUnion {
-    TAPHH( Tensor const& sourceTensor
+  template <typename F=double>
+  struct TAPHH : public SliceUnion<F> {
+    TAPHH( CTF::Tensor<F> const& sourceTensor
          , size_t No
          , size_t Nv
          , size_t np
          , MPI_Comm child_world
          , MPI_Comm global_world
-         ) : SliceUnion( sourceTensor
-                       , {Slice::A, Slice::B, Slice::C}
-                       , {Nv, No, No} // size of the slices
-                       , {Nv}
-                       , np
-                       , child_world
-                       , global_world
-                       , Slice::TA
-                       , 4) {
+         ) : SliceUnion<F>( sourceTensor
+                          , {Slice<F>::A, Slice<F>::B, Slice<F>::C}
+                          , {Nv, No, No} // size of the slices
+                          , {Nv}
+                          , np
+                          , child_world
+                          , global_world
+                          , Slice<F>::TA
+                          , 4) {
            init(sourceTensor);
          }
 
-    void sliceIntoBuffer(size_t it, Tensor &to, Tensor const& from) override
+    void sliceIntoBuffer(size_t it, CTF::Tensor<F> &to, CTF::Tensor<F> const& from) override
     {
-      const int Nv = sliceLength[0]
-              , No = sliceLength[1]
-              , a = rankMap.find({static_cast<size_t>(Atrip::rank), it});
+      const int Nv = this->sliceLength[0]
+              , No = this->sliceLength[1]
+              , a = this->rankMap.find({static_cast<size_t>(Atrip::rank), it});
               ;
 
 
-      sliceIntoVector( sources[it]
-                     , to,   {0, 0, 0},    {Nv, No, No}
-                     , from, {a, 0, 0, 0}, {a+1, Nv, No, No}
-                     );
+      sliceIntoVector<F>( this->sources[it]
+                        , to,   {0, 0, 0},    {Nv, No, No}
+                        , from, {a, 0, 0, 0}, {a+1, Nv, No, No}
+                        );
 
     }
 
   };
 
 
-  struct HHHA : public SliceUnion {
-    HHHA( Tensor const& sourceTensor
+  template <typename F=double>
+  struct HHHA : public SliceUnion<F> {
+    HHHA( CTF::Tensor<F> const& sourceTensor
         , size_t No
         , size_t Nv
         , size_t np
         , MPI_Comm child_world
         , MPI_Comm global_world
-        ) : SliceUnion( sourceTensor
-                      , {Slice::A, Slice::B, Slice::C}
-                      , {No, No, No} // size of the slices
-                      , {Nv}         // size of the parametrization
-                      , np
-                      , child_world
-                      , global_world
-                      , Slice::VIJKA
-                      , 4) {
+        ) : SliceUnion<F>( sourceTensor
+                         , {Slice<F>::A, Slice<F>::B, Slice<F>::C}
+                         , {No, No, No} // size of the slices
+                         , {Nv}         // size of the parametrization
+                         , np
+                         , child_world
+                         , global_world
+                         , Slice<F>::VIJKA
+                         , 4) {
            init(sourceTensor);
          }
 
-    void sliceIntoBuffer(size_t it, Tensor &to, Tensor const& from) override
+    void sliceIntoBuffer(size_t it, CTF::Tensor<F> &to, CTF::Tensor<F> const& from) override
     {
 
-      const int No = sliceLength[0]
-              , a = rankMap.find({static_cast<size_t>(Atrip::rank), it})
+      const int No = this->sliceLength[0]
+              , a = this->rankMap.find({static_cast<size_t>(Atrip::rank), it})
               ;
 
-      sliceIntoVector( sources[it]
-                     , to,   {0, 0, 0},    {No, No, No}
-                     , from, {0, 0, 0, a}, {No, No, No, a+1}
-                     );
+      sliceIntoVector<F>( this->sources[it]
+                        , to,   {0, 0, 0},    {No, No, No}
+                        , from, {0, 0, 0, a}, {No, No, No, a+1}
+                        );
 
     }
   };
 
-  struct ABPH : public SliceUnion {
-    ABPH( Tensor const& sourceTensor
+  template <typename F=double>
+  struct ABPH : public SliceUnion<F> {
+    ABPH( CTF::Tensor<F> const& sourceTensor
         , size_t No
         , size_t Nv
         , size_t np
         , MPI_Comm child_world
         , MPI_Comm global_world
-        ) : SliceUnion( sourceTensor
-                      , { Slice::AB, Slice::BC, Slice::AC
-                        , Slice::BA, Slice::CB, Slice::CA
-                        }
-                      , {Nv, No} // size of the slices
-                      , {Nv, Nv} // size of the parametrization
-                      , np
-                      , child_world
-                      , global_world
-                      , Slice::VABCI
-                      , 2*6) {
+        ) : SliceUnion<F>( sourceTensor
+                         , { Slice<F>::AB, Slice<F>::BC, Slice<F>::AC
+                           , Slice<F>::BA, Slice<F>::CB, Slice<F>::CA
+                           }
+                         , {Nv, No} // size of the slices
+                         , {Nv, Nv} // size of the parametrization
+                         , np
+                         , child_world
+                         , global_world
+                         , Slice<F>::VABCI
+                         , 2*6) {
            init(sourceTensor);
          }
 
-    void sliceIntoBuffer(size_t it, Tensor &to, Tensor const& from) override {
+    void sliceIntoBuffer(size_t it, CTF::Tensor<F> &to, CTF::Tensor<F> const& from) override {
 
-      const int Nv = sliceLength[0]
-              , No = sliceLength[1]
-              , el = rankMap.find({static_cast<size_t>(Atrip::rank), it})
+      const int Nv = this->sliceLength[0]
+              , No = this->sliceLength[1]
+              , el = this->rankMap.find({static_cast<size_t>(Atrip::rank), it})
               , a = el % Nv
               , b = el / Nv
               ;
 
 
-      sliceIntoVector( sources[it]
-                     , to,   {0, 0},       {Nv, No}
-                     , from, {a, b, 0, 0}, {a+1, b+1, Nv, No}
-                     );
+      sliceIntoVector<F>( this->sources[it]
+                        , to,   {0, 0},       {Nv, No}
+                        , from, {a, b, 0, 0}, {a+1, b+1, Nv, No}
+                        );
 
     }
 
   };
 
-  struct ABHH : public SliceUnion {
-    ABHH( Tensor const& sourceTensor
+  template <typename F=double>
+  struct ABHH : public SliceUnion<F> {
+    ABHH( CTF::Tensor<F> const& sourceTensor
         , size_t No
         , size_t Nv
         , size_t np
         , MPI_Comm child_world
         , MPI_Comm global_world
-        ) : SliceUnion( sourceTensor
-                      , {Slice::AB, Slice::BC, Slice::AC}
-                      , {No, No} // size of the slices
-                      , {Nv, Nv} // size of the parametrization
-                      , np
-                      , child_world
-                      , global_world
-                      , Slice::VABIJ
-                      , 6) {
+        ) : SliceUnion<F>( sourceTensor
+                         , {Slice<F>::AB, Slice<F>::BC, Slice<F>::AC}
+                         , {No, No} // size of the slices
+                         , {Nv, Nv} // size of the parametrization
+                         , np
+                         , child_world
+                         , global_world
+                         , Slice<F>::VABIJ
+                         , 6) {
            init(sourceTensor);
          }
 
-    void sliceIntoBuffer(size_t it, Tensor &to, Tensor const& from) override {
+    void sliceIntoBuffer(size_t it, CTF::Tensor<F> &to, CTF::Tensor<F> const& from) override {
 
       const int Nv = from.lens[0]
-              , No = sliceLength[1]
-              , el = rankMap.find({static_cast<size_t>(Atrip::rank), it})
+              , No = this->sliceLength[1]
+              , el = this->rankMap.find({static_cast<size_t>(Atrip::rank), it})
               , a = el % Nv
               , b = el / Nv
               ;
 
-      sliceIntoVector( sources[it]
-                     , to,   {0, 0},       {No, No}
-                     , from, {a, b, 0, 0}, {a+1, b+1, No, No}
-                     );
+      sliceIntoVector<F>( this->sources[it]
+                        , to,   {0, 0},       {No, No}
+                        , from, {a, b, 0, 0}, {a+1, b+1, No, No}
+                        );
 
 
     }
@@ -1429,39 +1434,40 @@ namespace atrip {
   };
 
 
-  struct TABHH : public SliceUnion {
-    TABHH( Tensor const& sourceTensor
+  template <typename F=double>
+  struct TABHH : public SliceUnion<F> {
+    TABHH( CTF::Tensor<F> const& sourceTensor
          , size_t No
          , size_t Nv
          , size_t np
          , MPI_Comm child_world
          , MPI_Comm global_world
-         ) : SliceUnion( sourceTensor
-                       , {Slice::AB, Slice::BC, Slice::AC}
-                       , {No, No} // size of the slices
-                       , {Nv, Nv} // size of the parametrization
-                       , np
-                       , child_world
-                       , global_world
-                       , Slice::TABIJ
-                       , 6) {
+         ) : SliceUnion<F>( sourceTensor
+                          , {Slice<F>::AB, Slice<F>::BC, Slice<F>::AC}
+                          , {No, No} // size of the slices
+                          , {Nv, Nv} // size of the parametrization
+                          , np
+                          , child_world
+                          , global_world
+                          , Slice<F>::TABIJ
+                          , 6) {
            init(sourceTensor);
          }
 
-    void sliceIntoBuffer(size_t it, Tensor &to, Tensor const& from) override {
+    void sliceIntoBuffer(size_t it, CTF::Tensor<F> &to, CTF::Tensor<F> const& from) override {
       // TODO: maybe generalize this with ABHH
 
       const int Nv = from.lens[0]
-              , No = sliceLength[1]
-              , el = rankMap.find({static_cast<size_t>(Atrip::rank), it})
+              , No = this->sliceLength[1]
+              , el = this->rankMap.find({static_cast<size_t>(Atrip::rank), it})
               , a = el % Nv
               , b = el / Nv
               ;
 
-      sliceIntoVector( sources[it]
-                     , to,   {0, 0},       {No, No}
-                     , from, {a, b, 0, 0}, {a+1, b+1, No, No}
-                     );
+      sliceIntoVector<F>( this->sources[it]
+                        , to,   {0, 0},       {No, No}
+                        , from, {a, b, 0, 0}, {a+1, b+1, No, No}
+                        );
 
 
     }

From 9d684b6624798c01724759fab160c58d9b87c1c0 Mon Sep 17 00:00:00 2001
From: Alejandro Gallo <aamsgallo@gmail.com>
Date: Thu, 27 Jan 2022 20:45:03 +0100
Subject: [PATCH 09/22] Templatize energy functions

---
 atrip.org | 119 ++++++++++++++++++++++++++++++++----------------------
 1 file changed, 70 insertions(+), 49 deletions(-)

diff --git a/atrip.org b/atrip.org
index e2e6e69..2d218b7 100644
--- a/atrip.org
+++ b/atrip.org
@@ -1487,14 +1487,15 @@ namespace atrip {
 
 namespace atrip {
 
+  template <typename F=double>
   double getEnergyDistinct
-    ( const double epsabc
-    , std::vector<double> const& epsi
-    , std::vector<double> const& Tijk_
-    , std::vector<double> const& Zijk_
+    ( const F epsabc
+    , std::vector<F> const& epsi
+    , std::vector<F> const& Tijk_
+    , std::vector<F> const& Zijk_
     ) {
     constexpr size_t blockSize=16;
-    double energy(0.);
+    F energy(0.);
     const size_t No = epsi.size();
     for (size_t kk=0; kk<No; kk+=blockSize){
       const size_t kend( std::min(No, kk+blockSize) );
@@ -1503,52 +1504,64 @@ namespace atrip {
         for (size_t ii(jj); ii<No; ii+=blockSize){
           const size_t iend( std::min( No, ii+blockSize) );
           for (size_t k(kk); k < kend; k++){
-            const double ek(epsi[k]);
+            const F ek(epsi[k]);
             const size_t jstart = jj > k ? jj : k;
             for (size_t j(jstart); j < jend; j++){
-              const double ej(epsi[j]);
-              double facjk( j == k ? 0.5 : 1.0);
+              F const ej(epsi[j]);
+              F const facjk = j == k ? F(0.5) : F(1.0);
               size_t istart = ii > j ? ii : j;
               for (size_t i(istart); i < iend; i++){
-                const double ei(epsi[i]);
-                double facij ( i==j ? 0.5 : 1.0);
-                double denominator(epsabc - ei - ej - ek);
-                double U(Zijk_[i + No*j + No*No*k]);
-                double V(Zijk_[i + No*k + No*No*j]);
-                double W(Zijk_[j + No*i + No*No*k]);
-                double X(Zijk_[j + No*k + No*No*i]);
-                double Y(Zijk_[k + No*i + No*No*j]);
-                double Z(Zijk_[k + No*j + No*No*i]);
-
-                double A(Tijk_[i + No*j + No*No*k]);
-                double B(Tijk_[i + No*k + No*No*j]);
-                double C(Tijk_[j + No*i + No*No*k]);
-                double D(Tijk_[j + No*k + No*No*i]);
-                double E(Tijk_[k + No*i + No*No*j]);
-                double F(Tijk_[k + No*j + No*No*i]);
-                double value(3.0*(A*U+B*V+C*W+D*X+E*Y+F*Z)
-                            +((U+X+Y)-2.0*(V+W+Z))*(A+D+E)
-                            +((V+W+Z)-2.0*(U+X+Y))*(B+C+F));
-                energy += 2.0*value / denominator * facjk * facij;
+                const F
+                    ei(epsi[i])
+                  , facij = i == j ? F(0.5) : F(1.0)
+                  , denominator(epsabc - ei - ej - ek)
+                  , U(Zijk_[i + No*j + No*No*k])
+                  , V(Zijk_[i + No*k + No*No*j])
+                  , W(Zijk_[j + No*i + No*No*k])
+                  , X(Zijk_[j + No*k + No*No*i])
+                  , Y(Zijk_[k + No*i + No*No*j])
+                  , Z(Zijk_[k + No*j + No*No*i])
+                  , A(std::conj(Tijk_[i + No*j + No*No*k]))
+                  , B(std::conj(Tijk_[i + No*k + No*No*j]))
+                  , C(std::conj(Tijk_[j + No*i + No*No*k]))
+                  , D(std::conj(Tijk_[j + No*k + No*No*i]))
+                  , E(std::conj(Tijk_[k + No*i + No*No*j]))
+                  , F_(std::conj(Tijk_[k + No*j + No*No*i])) // F_, not F: a local named F would redeclare the template parameter (ill-formed)
+                  , value
+                    = 3.0 * ( A * U
+                              + B * V
+                              + C * W
+                              + D * X
+                              + E * Y
+                              + F_ * Z )
+                   + ( ( U + X + Y )
+                     - 2.0 * ( V + W + Z )
+                     ) * ( A + D + E )
+                   + ( ( V + W + Z )
+                     - 2.0 * ( U + X + Y )
+                     ) * ( B + C + F_ )
+                  ;
+                energy += 2.0 * value / denominator * facjk * facij;
               } // i
             } // j
           } // k
         } // ii
       } // jj
     } // kk
-    return energy;
+    return std::real(energy);
   }
 
 
+  template <typename F=double>
   double getEnergySame
-    ( const double epsabc
-    , std::vector<double> const& epsi
-    , std::vector<double> const& Tijk_
-    , std::vector<double> const& Zijk_
+    ( const F epsabc
+    , std::vector<F> const& epsi
+    , std::vector<F> const& Tijk_
+    , std::vector<F> const& Zijk_
     ) {
     constexpr size_t blockSize = 16;
     const size_t No = epsi.size();
-    double energy(0.);
+    F energy = F(0.);
     for (size_t kk=0; kk<No; kk+=blockSize){
       const size_t kend( std::min( kk+blockSize, No) );
       for (size_t jj(kk); jj<No; jj+=blockSize){
@@ -1556,33 +1569,41 @@ namespace atrip {
         for (size_t ii(jj); ii<No; ii+=blockSize){
           const size_t iend( std::min( ii+blockSize, No) );
           for (size_t k(kk); k < kend; k++){
-            const double ek(epsi[k]);
+            const F ek(epsi[k]);
             const size_t jstart = jj > k ? jj : k;
             for(size_t j(jstart); j < jend; j++){
-              const double facjk( j == k ? 0.5 : 1.0);
-              const double ej(epsi[j]);
+              const F facjk( j == k ? F(0.5) : F(1.0));
+              const F ej(epsi[j]);
               const size_t istart = ii > j ? ii : j;
               for(size_t i(istart); i < iend; i++){
-                double ei(epsi[i]);
-                double facij ( i==j ? 0.5 : 1.0);
-                double denominator(epsabc - ei - ej - ek);
-                double U(Zijk_[i + No*j + No*No*k]);
-                double V(Zijk_[j + No*k + No*No*i]);
-                double W(Zijk_[k + No*i + No*No*j]);
-                double A(Tijk_[i + No*j + No*No*k]);
-                double B(Tijk_[j + No*k + No*No*i]);
-                double C(Tijk_[k + No*i + No*No*j]);
-                double value(3.0*( A*U + B*V + C*W) - (A+B+C)*(U+V+W));
-                energy += 2.0*value / denominator * facjk * facij;
+                const F
+                  ei(epsi[i])
+                , facij ( i==j ? F(0.5) : F(1.0))
+                , denominator(epsabc - ei - ej - ek)
+                , U(Zijk_[i + No*j + No*No*k])
+                , V(Zijk_[j + No*k + No*No*i])
+                , W(Zijk_[k + No*i + No*No*j])
+                , A(std::conj(Tijk_[i + No*j + No*No*k]))
+                , B(std::conj(Tijk_[j + No*k + No*No*i]))
+                , C(std::conj(Tijk_[k + No*i + No*No*j]))
+                , value
+                  = F(3.0) * ( A * U
+                             + B * V
+                             + C * W
+                             )
+                  - ( A + B + C ) * ( U + V + W )
+                ;
+                energy += F(2.0) * value / denominator * facjk * facij;
               } // i
             } // j
           } // k
         } // ii
       } // jj
     } // kk
-    return energy;
+    return std::real(energy);
   }
 
+  template <typename F=double>
   void singlesContribution
     ( size_t No
     , size_t Nv

From c7c6db77dce8003513bec54e595d34cb5426ee29 Mon Sep 17 00:00:00 2001
From: Alejandro Gallo <aamsgallo@gmail.com>
Date: Thu, 27 Jan 2022 20:45:38 +0100
Subject: [PATCH 10/22] Templatize doubles

---
 atrip.org | 136 +++++++++++++++++++++++++++++++-----------------------
 1 file changed, 77 insertions(+), 59 deletions(-)

diff --git a/atrip.org b/atrip.org
index 2d218b7..1fd7d91 100644
--- a/atrip.org
+++ b/atrip.org
@@ -1608,11 +1608,11 @@ namespace atrip {
     ( size_t No
     , size_t Nv
     , const ABCTuple &abc
-    , double const* Tph
-    , double const* VABij
-    , double const* VACij
-    , double const* VBCij
-    , double *Zijk
+    , F const* Tph
+    , F const* VABij
+    , F const* VACij
+    , F const* VBCij
+    , F *Zijk
     ) {
     const size_t a(abc[0]), b(abc[1]), c(abc[2]);
     for (size_t k=0; k < No; k++)
@@ -1627,31 +1627,32 @@ namespace atrip {
     }
   }
 
+  template <typename F=double>
   void doublesContribution
     ( const ABCTuple &abc
     , size_t const No
     , size_t const Nv
     // -- VABCI
-    , double const* VABph
-    , double const* VACph
-    , double const* VBCph
-    , double const* VBAph
-    , double const* VCAph
-    , double const* VCBph
+    , F const* VABph
+    , F const* VACph
+    , F const* VBCph
+    , F const* VBAph
+    , F const* VCAph
+    , F const* VCBph
     // -- VHHHA
-    , double const* VhhhA
-    , double const* VhhhB
-    , double const* VhhhC
+    , F const* VhhhA
+    , F const* VhhhB
+    , F const* VhhhC
     // -- TA
-    , double const* TAphh
-    , double const* TBphh
-    , double const* TCphh
+    , F const* TAphh
+    , F const* TBphh
+    , F const* TCphh
     // -- TABIJ
-    , double const* TABhh
-    , double const* TAChh
-    , double const* TBChh
+    , F const* TABhh
+    , F const* TAChh
+    , F const* TBChh
     // -- TIJK
-    , double *Tijk
+    , F *Tijk
     , atrip::Timings& chrono
     ) {
 
@@ -1670,40 +1671,47 @@ namespace atrip {
       Tijk[_IJK_(i, j, k)] += _t_buffer[_IJK_(__II, __JJ, __KK)];   \
     }                                                               \
     t_reorder.stop();
-  #define DGEMM_PARTICLES(__A, __B)    \
-    atrip::dgemm_( "T"                 \
-                , "N"                 \
-                , (int const*)&NoNo   \
-                , (int const*)&No     \
-                , (int const*)&Nv     \
-                , &one                \
-                , __A                 \
-                , (int const*)&Nv     \
-                , __B                 \
-                , (int const*)&Nv     \
-                , &zero               \
-                , _t_buffer.data()    \
-                , (int const*)&NoNo   \
-                );
-  #define DGEMM_HOLES(__A, __B, __TRANSB)  \
-    atrip::dgemm_( "N"                     \
-                , __TRANSB                \
-                , (int const*)&NoNo       \
-                , (int const*)&No         \
-                , (int const*)&No         \
-                , &m_one                  \
-                , __A                     \
-                , (int const*)&NoNo       \
-                , __B                     \
-                , (int const*)&No         \
-                , &zero                   \
-                , _t_buffer.data()        \
-                , (int const*)&NoNo       \
-                );
+  #define DGEMM_PARTICLES(__A, __B)      \
+    atrip::xgemm<F>( "T"                 \
+                   , "N"                 \
+                   , (int const*)&NoNo   \
+                   , (int const*)&No     \
+                   , (int const*)&Nv     \
+                   , &one                \
+                   , __A                 \
+                   , (int const*)&Nv     \
+                   , __B                 \
+                   , (int const*)&Nv     \
+                   , &zero               \
+                   , _t_buffer.data()    \
+                   , (int const*)&NoNo   \
+                   );
+  #define DGEMM_HOLES(__A, __B, __TRANSB)    \
+    atrip::xgemm<F>( "N"                     \
+                   , __TRANSB                \
+                   , (int const*)&NoNo       \
+                   , (int const*)&No         \
+                   , (int const*)&No         \
+                   , &m_one                  \
+                   , __A                     \
+                   , (int const*)&NoNo       \
+                   , __B                     \
+                   , (int const*)&No         \
+                   , &zero                   \
+                   , _t_buffer.data()        \
+                   , (int const*)&NoNo       \
+                   );
+  #define MAYBE_CONJ(_conj, _buffer)                          \
+    if (traits::isComplex<F>()) {                             \
+      for (size_t __i = 0; __i < NoNoNo; ++__i)               \
+        _conj[__i] = std::conj(_buffer[__i]);                 \
+    } else {                                                  \
+      for (size_t __i = 0; __i < NoNoNo; ++__i)               \
+        _conj[__i] = _buffer[__i];                            \
+    }
 
-    using F = double;
     const size_t NoNoNo = No*NoNo;
-    std::vector<double> _t_buffer;
+    std::vector<F> _t_buffer;
     _t_buffer.reserve(NoNoNo);
     F one{1.0}, m_one{-1.0}, zero{0.0};
 
@@ -1716,38 +1724,48 @@ namespace atrip {
 
     chrono["doubles:holes"].start();
     { // Holes part ============================================================
+
+      std::vector<F> _vhhh(NoNoNo);
+
       // VhhhC[i + k*No + L*NoNo] * TABhh[L + j*No]; H1
+      MAYBE_CONJ(_vhhh, VhhhC)
       chrono["doubles:holes:1"].start();
-      DGEMM_HOLES(VhhhC, TABhh, "N")
+      DGEMM_HOLES(_vhhh.data(), TABhh, "N")
       REORDER(i, k, j)
       chrono["doubles:holes:1"].stop();
       // VhhhC[j + k*No + L*NoNo] * TABhh[i + L*No]; H0
       chrono["doubles:holes:2"].start();
-      DGEMM_HOLES(VhhhC, TABhh, "T")
+      DGEMM_HOLES(_vhhh.data(), TABhh, "T")
       REORDER(j, k, i)
       chrono["doubles:holes:2"].stop();
+
       // VhhhB[i + j*No + L*NoNo] * TAChh[L + k*No]; H5
+      MAYBE_CONJ(_vhhh, VhhhB)
       chrono["doubles:holes:3"].start();
-      DGEMM_HOLES(VhhhB, TAChh, "N")
+      DGEMM_HOLES(_vhhh.data(), TAChh, "N")
       REORDER(i, j, k)
       chrono["doubles:holes:3"].stop();
       // VhhhB[k + j*No + L*NoNo] * TAChh[i + L*No]; H3
       chrono["doubles:holes:4"].start();
-      DGEMM_HOLES(VhhhB, TAChh, "T")
+      DGEMM_HOLES(_vhhh.data(), TAChh, "T")
       REORDER(k, j, i)
       chrono["doubles:holes:4"].stop();
+
       // VhhhA[j + i*No + L*NoNo] * TBChh[L + k*No]; H1
+      MAYBE_CONJ(_vhhh, VhhhA)
       chrono["doubles:holes:5"].start();
-      DGEMM_HOLES(VhhhA, TBChh, "N")
+      DGEMM_HOLES(_vhhh.data(), TBChh, "N")
       REORDER(j, i, k)
       chrono["doubles:holes:5"].stop();
       // VhhhA[k + i*No + L*NoNo] * TBChh[j + L*No]; H4
       chrono["doubles:holes:6"].start();
-      DGEMM_HOLES(VhhhA, TBChh, "T")
+      DGEMM_HOLES(_vhhh.data(), TBChh, "T")
       REORDER(k, i, j)
       chrono["doubles:holes:6"].stop();
+
     }
     chrono["doubles:holes"].stop();
+  #undef MAYBE_CONJ
 
     chrono["doubles:particles"].start();
     { // Particle part =========================================================

From 75ecd53e18b52f3279806164510b6ba5c088f530 Mon Sep 17 00:00:00 2001
From: Alejandro Gallo <aamsgallo@gmail.com>
Date: Thu, 27 Jan 2022 20:45:54 +0100
Subject: [PATCH 11/22] Add xgemm

---
 atrip.org | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 65 insertions(+), 3 deletions(-)

diff --git a/atrip.org b/atrip.org
index 1fd7d91..2c14694 100644
--- a/atrip.org
+++ b/atrip.org
@@ -1863,6 +1863,9 @@ is mainly using the =DGEMM= function, which we declare as
 #+begin_src c++ :tangle (atrip-blas-h)
 #pragma once
 namespace atrip {
+
+  using Complex = std::complex<double>;
+
   extern "C" {
     void dgemm_(
       const char *transa,
@@ -1871,14 +1874,73 @@ namespace atrip {
       const int *n,
       const int *k,
       double *alpha,
-      const double *A,
+      const double *a,
       const int *lda,
-      const double *B,
+      const double *b,
       const int *ldb,
       double *beta,
-      double *C,
+      double *c,
       const int *ldc
     );
+
+    void zgemm_(
+      const char *transa,
+      const char *transb,
+      const int *m,
+      const int *n,
+      const int *k,
+      Complex *alpha,
+      const Complex *A,
+      const int *lda,
+      const Complex *B,
+      const int *ldb,
+      Complex *beta,
+      Complex *C,
+      const int *ldc
+    );
+  }
+
+
+  template <typename F=double>
+  void xgemm(const char *transa,
+             const char *transb,
+             const int *m,
+             const int *n,
+             const int *k,
+             F *alpha,
+             const F *A,
+             const int *lda,
+             const F *B,
+             const int *ldb,
+             F *beta,
+             F *C,
+             const int *ldc) {
+    dgemm_(transa, transb,
+           m, n, k,
+           alpha, A, lda,
+           B, ldb, beta,
+           C, ldc);
+  }
+
+  template <>
+  void xgemm(const char *transa,
+             const char *transb,
+             const int *m,
+             const int *n,
+             const int *k,
+             Complex *alpha,
+             const Complex *A,
+             const int *lda,
+             const Complex *B,
+             const int *ldb,
+             Complex *beta,
+             Complex *C,
+             const int *ldc) {
+    zgemm_(transa, transb,
+           m, n, k,
+           alpha, A, lda,
+           B, ldb, beta,
+           C, ldc);
   }
 }
 #+end_src

From f8dd6b3f3179175c847b53c81d8d8702d4abc590 Mon Sep 17 00:00:00 2001
From: Alejandro Gallo <aamsgallo@gmail.com>
Date: Thu, 27 Jan 2022 20:46:23 +0100
Subject: [PATCH 12/22] Templatize Input

---
 atrip.org | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/atrip.org b/atrip.org
index 2c14694..a3aa110 100644
--- a/atrip.org
+++ b/atrip.org
@@ -1964,8 +1964,9 @@ namespace atrip {
     static int np;
     static void init();
 
+    template <typename F=double>
     struct Input {
-      CTF::Tensor<double> *ei = nullptr
+      CTF::Tensor<F> *ei = nullptr
                         , *ea = nullptr
                         , *Tph = nullptr
                         , *Tpphh = nullptr
@@ -1976,13 +1977,13 @@ namespace atrip {
       int maxIterations = 0, iterationMod = -1, percentageMod = -1;
       bool barrier = false;
       bool chrono = false;
-      Input& with_epsilon_i(CTF::Tensor<double> * t) { ei = t; return *this; }
-      Input& with_epsilon_a(CTF::Tensor<double> * t) { ea = t; return *this; }
-      Input& with_Tai(CTF::Tensor<double> * t) { Tph = t; return *this; }
-      Input& with_Tabij(CTF::Tensor<double> * t) { Tpphh = t; return *this; }
-      Input& with_Vabij(CTF::Tensor<double> * t) { Vpphh = t; return *this; }
-      Input& with_Vijka(CTF::Tensor<double> * t) { Vhhhp = t; return *this; }
-      Input& with_Vabci(CTF::Tensor<double> * t) { Vppph = t; return *this; }
+      Input& with_epsilon_i(CTF::Tensor<F> * t) { ei = t; return *this; }
+      Input& with_epsilon_a(CTF::Tensor<F> * t) { ea = t; return *this; }
+      Input& with_Tai(CTF::Tensor<F> * t) { Tph = t; return *this; }
+      Input& with_Tabij(CTF::Tensor<F> * t) { Tpphh = t; return *this; }
+      Input& with_Vabij(CTF::Tensor<F> * t) { Vpphh = t; return *this; }
+      Input& with_Vijka(CTF::Tensor<F> * t) { Vhhhp = t; return *this; }
+      Input& with_Vabci(CTF::Tensor<F> * t) { Vppph = t; return *this; }
       Input& with_maxIterations(int i) { maxIterations = i; return *this; }
       Input& with_iterationMod(int i) { iterationMod = i; return *this; }
       Input& with_percentageMod(int i) { percentageMod = i; return *this; }
@@ -1993,7 +1994,8 @@ namespace atrip {
     struct Output {
       double energy;
     };
-    static Output run(Input const& in);
+    template <typename F=double>
+    static Output run(Input<F> const& in);
   };
 
 }

From 5c177a85bc4d490517c2612b76eb6c685161bef6 Mon Sep 17 00:00:00 2001
From: Alejandro Gallo <aamsgallo@gmail.com>
Date: Thu, 27 Jan 2022 20:48:38 +0100
Subject: [PATCH 13/22] Templatize main algorithm

---
 atrip.org | 127 +++++++++++++++++++++++++++++-------------------------
 1 file changed, 68 insertions(+), 59 deletions(-)

diff --git a/atrip.org b/atrip.org
index a3aa110..7f19a7a 100644
--- a/atrip.org
+++ b/atrip.org
@@ -2028,7 +2028,8 @@ void Atrip::init()  {
   MPI_Comm_size(MPI_COMM_WORLD, &Atrip::np);
 }
 
-Atrip::Output Atrip::run(Atrip::Input const& in) {
+template <typename F>
+Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
 
   const int np = Atrip::np;
   const int rank = Atrip::rank;
@@ -2043,14 +2044,14 @@ Atrip::Output Atrip::run(Atrip::Input const& in) {
   LOG(0,"Atrip") << "Nv: " << Nv << "\n";
 
   // allocate the three scratches, see piecuch
-  std::vector<double> Tijk(No*No*No) // doubles only (see piecuch)
-                    , Zijk(No*No*No) // singles + doubles (see piecuch)
-                    // we need local copies of the following tensors on every
-                    // rank
-                    , epsi(No)
-                    , epsa(Nv)
-                    , Tai(No * Nv)
-                    ;
+  std::vector<F>   Tijk(No*No*No) // doubles only (see piecuch)
+                 , Zijk(No*No*No) // singles + doubles (see piecuch)
+                 // we need local copies of the following tensors on every
+                 // rank
+                 , epsi(No)
+                 , epsa(Nv)
+                 , Tai(No * Nv)
+                 ;
 
   in.ei->read_all(epsi.data());
   in.ea->read_all(epsa.data());
@@ -2079,20 +2080,20 @@ Atrip::Output Atrip::run(Atrip::Input const& in) {
   chrono["nv-slices"].start();
   // BUILD SLICES PARAMETRIZED BY NV ==================================={{{1
   LOG(0,"Atrip") << "BUILD NV-SLICES\n";
-  TAPHH taphh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
-  HHHA  hhha(*in.Vhhhp, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
+  TAPHH<F> taphh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
+  HHHA<F>  hhha(*in.Vhhhp, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
   chrono["nv-slices"].stop();
 
   chrono["nv-nv-slices"].start();
   // BUILD SLICES PARAMETRIZED BY NV x NV =============================={{{1
   LOG(0,"Atrip") << "BUILD NV x NV-SLICES\n";
-  ABPH abph(*in.Vppph, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
-  ABHH abhh(*in.Vpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
-  TABHH tabhh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
+  ABPH<F> abph(*in.Vppph, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
+  ABHH<F> abhh(*in.Vpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
+  TABHH<F> tabhh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
   chrono["nv-nv-slices"].stop();
 
   // all tensors
-  std::vector< SliceUnion* > unions = {&taphh, &hhha, &abph, &abhh, &tabhh};
+  std::vector< SliceUnion<F>* > unions = {&taphh, &hhha, &abph, &abhh, &tabhh};
 
   //CONSTRUCT TUPLE LIST ==============================================={{{1
   LOG(0,"Atrip") << "BUILD TUPLE LIST\n";
@@ -2126,18 +2127,20 @@ Atrip::Output Atrip::run(Atrip::Input const& in) {
     = [&tuplesList](size_t const i) { return i >= tuplesList.size(); };
 
 
+  using Database = typename Slice<F>::Database;
+  using LocalDatabase = typename Slice<F>::LocalDatabase;
   auto communicateDatabase
     = [ &unions
       , np
       , &chrono
-      ] (ABCTuple const& abc, MPI_Comm const& c) -> Slice::Database {
+      ] (ABCTuple const& abc, MPI_Comm const& c) -> typename Slice<F>::Database {
 
         chrono["db:comm:type:do"].start();
-        auto MPI_LDB_ELEMENT = Slice::mpi::localDatabaseElement();
+        auto MPI_LDB_ELEMENT = Slice<F>::mpi::localDatabaseElement();
         chrono["db:comm:type:do"].stop();
 
         chrono["db:comm:ldb"].start();
-        Slice::LocalDatabase ldb;
+        LocalDatabase ldb;
 
         for (auto const& tensor: unions) {
           auto const& tensorDb = tensor->buildLocalDatabase(abc);
@@ -2145,7 +2148,8 @@ Atrip::Output Atrip::run(Atrip::Input const& in) {
         }
         chrono["db:comm:ldb"].stop();
 
-        Slice::Database db(np * ldb.size(), ldb[0]);
+        typename
+        Slice<F>::Database db(np * ldb.size(), ldb[0]);
 
         chrono["oneshot-db:comm:allgather"].start();
         chrono["db:comm:allgather"].start();
@@ -2167,7 +2171,7 @@ Atrip::Output Atrip::run(Atrip::Input const& in) {
       };
 
   auto doIOPhase
-    = [&unions, &rank, &np, &universe, &chrono] (Slice::Database const& db) {
+    = [&unions, &rank, &np, &universe, &chrono] (typename Slice<F>::Database const& db) {
 
     const size_t localDBLength = db.size() / np;
 
@@ -2217,7 +2221,7 @@ Atrip::Output Atrip::run(Atrip::Input const& in) {
                 ;
       for (auto it = begin; it != end; ++it) {
         sendTag++;
-        Slice::LocalDatabaseElement const& el = *it;
+        typename Slice<F>::LocalDatabaseElement const& el = *it;
 
         if (el.info.from.rank != rank) continue;
 
@@ -2266,7 +2270,7 @@ Atrip::Output Atrip::run(Atrip::Input const& in) {
 
   // START MAIN LOOP ======================================================{{{1
 
-  Slice::Database db;
+  typename Slice<F>::Database db;
 
   for ( size_t i = abcIndex.first, iteration = 1
       ; i < abcIndex.second
@@ -2373,30 +2377,31 @@ Atrip::Output Atrip::run(Atrip::Input const& in) {
       )))
       chrono["oneshot-doubles"].start();
       chrono["doubles"].start();
-      doublesContribution( abc, (size_t)No, (size_t)Nv
-                         // -- VABCI
-                         , abph.unwrapSlice(Slice::AB, abc)
-                         , abph.unwrapSlice(Slice::AC, abc)
-                         , abph.unwrapSlice(Slice::BC, abc)
-                         , abph.unwrapSlice(Slice::BA, abc)
-                         , abph.unwrapSlice(Slice::CA, abc)
-                         , abph.unwrapSlice(Slice::CB, abc)
-                         // -- VHHHA
-                         , hhha.unwrapSlice(Slice::A, abc)
-                         , hhha.unwrapSlice(Slice::B, abc)
-                         , hhha.unwrapSlice(Slice::C, abc)
-                         // -- TA
-                         , taphh.unwrapSlice(Slice::A, abc)
-                         , taphh.unwrapSlice(Slice::B, abc)
-                         , taphh.unwrapSlice(Slice::C, abc)
-                         // -- TABIJ
-                         , tabhh.unwrapSlice(Slice::AB, abc)
-                         , tabhh.unwrapSlice(Slice::AC, abc)
-                         , tabhh.unwrapSlice(Slice::BC, abc)
-                         // -- TIJK
-                         , Tijk.data()
-                         , chrono
-                         );
+      LOGREMOVE << "doubles " << iteration << "\n";
+      doublesContribution<F>( abc, (size_t)No, (size_t)Nv
+                            // -- VABCI
+                            , abph.unwrapSlice(Slice<F>::AB, abc)
+                            , abph.unwrapSlice(Slice<F>::AC, abc)
+                            , abph.unwrapSlice(Slice<F>::BC, abc)
+                            , abph.unwrapSlice(Slice<F>::BA, abc)
+                            , abph.unwrapSlice(Slice<F>::CA, abc)
+                            , abph.unwrapSlice(Slice<F>::CB, abc)
+                            // -- VHHHA
+                            , hhha.unwrapSlice(Slice<F>::A, abc)
+                            , hhha.unwrapSlice(Slice<F>::B, abc)
+                            , hhha.unwrapSlice(Slice<F>::C, abc)
+                            // -- TA
+                            , taphh.unwrapSlice(Slice<F>::A, abc)
+                            , taphh.unwrapSlice(Slice<F>::B, abc)
+                            , taphh.unwrapSlice(Slice<F>::C, abc)
+                            // -- TABIJ
+                            , tabhh.unwrapSlice(Slice<F>::AB, abc)
+                            , tabhh.unwrapSlice(Slice<F>::AC, abc)
+                            , tabhh.unwrapSlice(Slice<F>::BC, abc)
+                            // -- TIJK
+                            , Tijk.data()
+                            , chrono
+                            );
       WITH_RANK << iteration << "-th doubles done\n";
       chrono["doubles"].stop();
       chrono["oneshot-doubles"].stop();
@@ -2414,12 +2419,12 @@ Atrip::Output Atrip::run(Atrip::Input const& in) {
       for (size_t I(0); I < Zijk.size(); I++) Zijk[I] = Tijk[I];
       chrono["reorder"].stop();
       chrono["singles"].start();
-      singlesContribution( No, Nv, abc
-                         , Tai.data()
-                         , abhh.unwrapSlice(Slice::AB, abc)
-                         , abhh.unwrapSlice(Slice::AC, abc)
-                         , abhh.unwrapSlice(Slice::BC, abc)
-                         , Zijk.data());
+      singlesContribution<F>( No, Nv, abc
+                            , Tai.data()
+                            , abhh.unwrapSlice(Slice<F>::AB, abc)
+                            , abhh.unwrapSlice(Slice<F>::AC, abc)
+                            , abhh.unwrapSlice(Slice<F>::BC, abc)
+                            , Zijk.data());
       chrono["singles"].stop();
     }
 
@@ -2431,13 +2436,13 @@ Atrip::Output Atrip::run(Atrip::Input const& in) {
       int distinct(0);
       if (abc[0] == abc[1]) distinct++;
       if (abc[1] == abc[2]) distinct--;
-      const double epsabc(epsa[abc[0]] + epsa[abc[1]] + epsa[abc[2]]);
+      const F epsabc(epsa[abc[0]] + epsa[abc[1]] + epsa[abc[2]]);
 
       chrono["energy"].start();
       if ( distinct == 0)
-        tupleEnergy = getEnergyDistinct(epsabc, epsi, Tijk, Zijk);
+        tupleEnergy = getEnergyDistinct<F>(epsabc, epsi, Tijk, Zijk);
       else
-        tupleEnergy = getEnergySame(epsabc, epsi, Tijk, Zijk);
+        tupleEnergy = getEnergySame<F>(epsabc, epsi, Tijk, Zijk);
       chrono["energy"].stop();
 
 #if defined(HAVE_OCD) || defined(ATRIP_PRINT_TUPLES)
@@ -2478,8 +2483,8 @@ Atrip::Output Atrip::run(Atrip::Input const& in) {
                   << " :abc " << pretty_print(abc)
                   << " :abcN " << pretty_print(*abcNext)
                   << "\n";
-        for (auto const& slice: u->slices)
-          WITH_RANK << "__gc__:guts:" << slice.info << "\n";
+        // for (auto const& slice: u->slices)
+        //   WITH_RANK << "__gc__:guts:" << slice.info << "\n";
         u->clearUnusedSlicesForNext(*abcNext);
 
         WITH_RANK << "__gc__: checking validity\n";
@@ -2487,13 +2492,13 @@ Atrip::Output Atrip::run(Atrip::Input const& in) {
 #ifdef HAVE_OCD
         // check for validity of the slices
         for (auto type: u->sliceTypes) {
-          auto tuple = Slice::subtupleBySlice(abc, type);
+          auto tuple = Slice<F>::subtupleBySlice(abc, type);
         for (auto& slice: u->slices) {
           if ( slice.info.type == type
              && slice.info.tuple == tuple
              && slice.isDirectlyFetchable()
              ) {
-            if (slice.info.state == Slice::Dispatched)
+            if (slice.info.state == Slice<F>::Dispatched)
               throw std::domain_error( "This slice should not be undispatched! "
                                      + pretty_print(slice.info));
           }
@@ -2560,6 +2565,10 @@ Atrip::Output Atrip::run(Atrip::Input const& in) {
   return { - globalEnergy };
 
 }
+// instantiate
+template Atrip::Output Atrip::run(Atrip::Input<double> const& in);
+template Atrip::Output Atrip::run(Atrip::Input<Complex> const& in);
+
 #+end_src
 
 

From e161e4c0d6a2f34b732cbc3c144caf47d8f9ea4b Mon Sep 17 00:00:00 2001
From: Alejandro Gallo <aamsgallo@gmail.com>
Date: Thu, 27 Jan 2022 20:49:07 +0100
Subject: [PATCH 14/22] Update Debug

---
 atrip.org | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/atrip.org b/atrip.org
index 7f19a7a..561a375 100644
--- a/atrip.org
+++ b/atrip.org
@@ -2580,7 +2580,9 @@ template Atrip::Output Atrip::run(Atrip::Input<Complex> const& in);
 #include <functional>
 #define ATRIP_BENCHMARK
 //#define ATRIP_DONT_SLICE
-#define ATRIP_DEBUG 1
+#ifndef ATRIP_DEBUG
+#  define ATRIP_DEBUG 1
+#endif
 //#define ATRIP_WORKLOAD_DUMP
 #define ATRIP_USE_DGEMM
 //#define ATRIP_PRINT_TUPLES

From 7f455d54fdb65122425b9dd85ff3867d7222575c Mon Sep 17 00:00:00 2001
From: Alejandro Gallo <aamsgallo@gmail.com>
Date: Thu, 27 Jan 2022 20:59:42 +0100
Subject: [PATCH 15/22] Clean up a couple of things

---
 atrip.org | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/atrip.org b/atrip.org
index 561a375..8e4ec91 100644
--- a/atrip.org
+++ b/atrip.org
@@ -2133,7 +2133,7 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
     = [ &unions
       , np
       , &chrono
-      ] (ABCTuple const& abc, MPI_Comm const& c) -> typename Slice<F>::Database {
+      ] (ABCTuple const& abc, MPI_Comm const& c) -> Database {
 
         chrono["db:comm:type:do"].start();
         auto MPI_LDB_ELEMENT = Slice<F>::mpi::localDatabaseElement();
@@ -2148,8 +2148,7 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
         }
         chrono["db:comm:ldb"].stop();
 
-        typename
-        Slice<F>::Database db(np * ldb.size(), ldb[0]);
+        Database db(np * ldb.size(), ldb[0]);
 
         chrono["oneshot-db:comm:allgather"].start();
         chrono["db:comm:allgather"].start();
@@ -2171,7 +2170,7 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
       };
 
   auto doIOPhase
-    = [&unions, &rank, &np, &universe, &chrono] (typename Slice<F>::Database const& db) {
+    = [&unions, &rank, &np, &universe, &chrono] (Database const& db) {
 
     const size_t localDBLength = db.size() / np;
 
@@ -2270,14 +2269,13 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
 
   // START MAIN LOOP ======================================================{{{1
 
-  typename Slice<F>::Database db;
-
   for ( size_t i = abcIndex.first, iteration = 1
       ; i < abcIndex.second
       ; i++, iteration++
       ) {
     chrono["iterations"].start();
 
+
     // check overhead from chrono over all iterations
     chrono["start:stop"].start(); chrono["start:stop"].stop();
 
@@ -2356,7 +2354,7 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
       WITH_RANK << "__comm__:" << iteration << "th communicating database\n";
       chrono["db:comm"].start();
       //const auto db = communicateDatabase(*abcNext, universe);
-      db = communicateDatabase(*abcNext, universe);
+      Database db = communicateDatabase(*abcNext, universe);
       chrono["db:comm"].stop();
       chrono["db:io"].start();
       doIOPhase(db);
@@ -2377,7 +2375,6 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
       )))
       chrono["oneshot-doubles"].start();
       chrono["doubles"].start();
-      LOGREMOVE << "doubles " << iteration << "\n";
       doublesContribution<F>( abc, (size_t)No, (size_t)Nv
                             // -- VABCI
                             , abph.unwrapSlice(Slice<F>::AB, abc)

From c2b1c78c67d756c29b6a0f9b884248491a38f4d7 Mon Sep 17 00:00:00 2001
From: Alejandro Gallo <aamsgallo@gmail.com>
Date: Mon, 7 Feb 2022 22:29:47 +0100
Subject: [PATCH 16/22] Tangle source files for complex

---
 include/atrip.hpp            |   2 +-
 include/atrip/Atrip.hpp      |  24 ++--
 include/atrip/Blas.hpp       |  70 +++++++++-
 include/atrip/Debug.hpp      |  12 +-
 include/atrip/Equations.hpp  | 257 ++++++++++++++++++++---------------
 include/atrip/RankMap.hpp    |  12 +-
 include/atrip/Slice.hpp      |  64 +++++----
 include/atrip/SliceUnion.hpp | 110 +++++++--------
 include/atrip/Tuples.hpp     |   2 +-
 include/atrip/Unions.hpp     | 204 +++++++++++++--------------
 include/atrip/Utils.hpp      |   2 +-
 src/atrip/Atrip.cxx          | 129 +++++++++---------
 12 files changed, 511 insertions(+), 377 deletions(-)

diff --git a/include/atrip.hpp b/include/atrip.hpp
index b3ef823..8ecf6ce 100644
--- a/include/atrip.hpp
+++ b/include/atrip.hpp
@@ -1,4 +1,4 @@
-// [[file:../atrip.org::*Include header][Include header:1]]
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Include%20header][Include header:1]]
 #pragma once
 
 #include <atrip/Atrip.hpp>
diff --git a/include/atrip/Atrip.hpp b/include/atrip/Atrip.hpp
index a8bcd78..6f3859c 100644
--- a/include/atrip/Atrip.hpp
+++ b/include/atrip/Atrip.hpp
@@ -1,4 +1,4 @@
-// [[file:../../atrip.org::*Atrip][Atrip:1]]
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Header][Header:1]]
 #pragma once
 #include <sstream>
 #include <string>
@@ -15,8 +15,9 @@ namespace atrip {
     static int np;
     static void init();
 
+    template <typename F=double>
     struct Input {
-      CTF::Tensor<double> *ei = nullptr
+      CTF::Tensor<F> *ei = nullptr
                         , *ea = nullptr
                         , *Tph = nullptr
                         , *Tpphh = nullptr
@@ -27,13 +28,13 @@ namespace atrip {
       int maxIterations = 0, iterationMod = -1, percentageMod = -1;
       bool barrier = false;
       bool chrono = false;
-      Input& with_epsilon_i(CTF::Tensor<double> * t) { ei = t; return *this; }
-      Input& with_epsilon_a(CTF::Tensor<double> * t) { ea = t; return *this; }
-      Input& with_Tai(CTF::Tensor<double> * t) { Tph = t; return *this; }
-      Input& with_Tabij(CTF::Tensor<double> * t) { Tpphh = t; return *this; }
-      Input& with_Vabij(CTF::Tensor<double> * t) { Vpphh = t; return *this; }
-      Input& with_Vijka(CTF::Tensor<double> * t) { Vhhhp = t; return *this; }
-      Input& with_Vabci(CTF::Tensor<double> * t) { Vppph = t; return *this; }
+      Input& with_epsilon_i(CTF::Tensor<F> * t) { ei = t; return *this; }
+      Input& with_epsilon_a(CTF::Tensor<F> * t) { ea = t; return *this; }
+      Input& with_Tai(CTF::Tensor<F> * t) { Tph = t; return *this; }
+      Input& with_Tabij(CTF::Tensor<F> * t) { Tpphh = t; return *this; }
+      Input& with_Vabij(CTF::Tensor<F> * t) { Vpphh = t; return *this; }
+      Input& with_Vijka(CTF::Tensor<F> * t) { Vhhhp = t; return *this; }
+      Input& with_Vabci(CTF::Tensor<F> * t) { Vppph = t; return *this; }
       Input& with_maxIterations(int i) { maxIterations = i; return *this; }
       Input& with_iterationMod(int i) { iterationMod = i; return *this; }
       Input& with_percentageMod(int i) { percentageMod = i; return *this; }
@@ -44,8 +45,9 @@ namespace atrip {
     struct Output {
       double energy;
     };
-    static Output run(Input const& in);
+    template <typename F=double>
+    static Output run(Input<F> const& in);
   };
 
 }
-// Atrip:1 ends here
+// Header:1 ends here
diff --git a/include/atrip/Blas.hpp b/include/atrip/Blas.hpp
index fa63028..df81d74 100644
--- a/include/atrip/Blas.hpp
+++ b/include/atrip/Blas.hpp
@@ -1,6 +1,9 @@
-// [[file:../../atrip.org::*Blas][Blas:1]]
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Blas][Blas:1]]
 #pragma once
 namespace atrip {
+
+  using Complex = std::complex<double>;
+
   extern "C" {
     void dgemm_(
       const char *transa,
@@ -9,14 +12,73 @@ namespace atrip {
       const int *n,
       const int *k,
       double *alpha,
-      const double *A,
+      const double *a,
       const int *lda,
-      const double *B,
+      const double *b,
       const int *ldb,
       double *beta,
-      double *C,
+      double *c,
+      const int *ldc
+    );
+
+    void zgemm_(
+      const char *transa,
+      const char *transb,
+      const int *m,
+      const int *n,
+      const int *k,
+      Complex *alpha,
+      const Complex *A,
+      const int *lda,
+      const Complex *B,
+      const int *ldb,
+      Complex *beta,
+      Complex *C,
       const int *ldc
     );
   }
+
+
+  template <typename F=double>
+  void xgemm(const char *transa,
+             const char *transb,
+             const int *m,
+             const int *n,
+             const int *k,
+             F *alpha,
+             const F *A,
+             const int *lda,
+             const F *B,
+             const int *ldb,
+             F *beta,
+             F *C,
+             const int *ldc) {
+    dgemm_(transa, transb,
+           m, n, k,
+           alpha, A, lda,
+           B, ldb, beta,
+           C, ldc);
+  }
+
+  template <>
+  void xgemm(const char *transa,
+             const char *transb,
+             const int *m,
+             const int *n,
+             const int *k,
+             Complex *alpha,
+             const Complex *A,
+             const int *lda,
+             const Complex *B,
+             const int *ldb,
+             Complex *beta,
+             Complex *C,
+             const int *ldc) {
+    zgemm_(transa, transb,
+           m, n, k,
+           alpha, A, lda,
+           B, ldb, beta,
+           C, ldc);
+  }
 }
 // Blas:1 ends here
diff --git a/include/atrip/Debug.hpp b/include/atrip/Debug.hpp
index 6bdfde2..4347824 100644
--- a/include/atrip/Debug.hpp
+++ b/include/atrip/Debug.hpp
@@ -1,9 +1,11 @@
-// [[file:../../atrip.org::*Macros][Macros:1]]
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Macros][Macros:1]]
 #pragma once
 #include <functional>
 #define ATRIP_BENCHMARK
 //#define ATRIP_DONT_SLICE
-#define ATRIP_DEBUG 1
+#ifndef ATRIP_DEBUG
+#  define ATRIP_DEBUG 1
+#endif
 //#define ATRIP_WORKLOAD_DUMP
 #define ATRIP_USE_DGEMM
 //#define ATRIP_PRINT_TUPLES
@@ -60,20 +62,20 @@
 #endif
 // Macros:1 ends here
 
-// [[file:../../atrip.org::*Macros][Macros:2]]
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Macros][Macros:2]]
 #ifndef LOG
 #define LOG(level, name) if (Atrip::rank == 0) std::cout << name << ": "
 #endif
 // Macros:2 ends here
 
-// [[file:../../atrip.org::*Macros][Macros:3]]
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Macros][Macros:3]]
 #ifdef ATRIP_NO_OUTPUT
 #  undef LOG
 #  define LOG(level, name) if (false) std::cout << name << ": "
 #endif
 // Macros:3 ends here
 
-// [[file:../../atrip.org::IterationDescriptor][IterationDescriptor]]
+// [[file:~/cc4s/src/atrip/complex/atrip.org::IterationDescriptor][IterationDescriptor]]
 namespace atrip {
 
   struct IterationDescription;
diff --git a/include/atrip/Equations.hpp b/include/atrip/Equations.hpp
index b8496f6..2b90736 100644
--- a/include/atrip/Equations.hpp
+++ b/include/atrip/Equations.hpp
@@ -1,4 +1,4 @@
-// [[file:../../atrip.org::*Equations][Equations:1]]
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Equations][Equations:1]]
 #pragma once
 
 #include<atrip/Slice.hpp>
@@ -6,14 +6,15 @@
 
 namespace atrip {
 
+  template <typename F=double>
   double getEnergyDistinct
-    ( const double epsabc
-    , std::vector<double> const& epsi
-    , std::vector<double> const& Tijk_
-    , std::vector<double> const& Zijk_
+    ( const F epsabc
+    , std::vector<F> const& epsi
+    , std::vector<F> const& Tijk_
+    , std::vector<F> const& Zijk_
     ) {
     constexpr size_t blockSize=16;
-    double energy(0.);
+    F energy(0.);
     const size_t No = epsi.size();
     for (size_t kk=0; kk<No; kk+=blockSize){
       const size_t kend( std::min(No, kk+blockSize) );
@@ -22,52 +23,64 @@ namespace atrip {
         for (size_t ii(jj); ii<No; ii+=blockSize){
           const size_t iend( std::min( No, ii+blockSize) );
           for (size_t k(kk); k < kend; k++){
-            const double ek(epsi[k]);
+            const F ek(epsi[k]);
             const size_t jstart = jj > k ? jj : k;
             for (size_t j(jstart); j < jend; j++){
-              const double ej(epsi[j]);
-              double facjk( j == k ? 0.5 : 1.0);
+              F const ej(epsi[j]);
+              F const facjk = j == k ? F(0.5) : F(1.0);
               size_t istart = ii > j ? ii : j;
               for (size_t i(istart); i < iend; i++){
-                const double ei(epsi[i]);
-                double facij ( i==j ? 0.5 : 1.0);
-                double denominator(epsabc - ei - ej - ek);
-                double U(Zijk_[i + No*j + No*No*k]);
-                double V(Zijk_[i + No*k + No*No*j]);
-                double W(Zijk_[j + No*i + No*No*k]);
-                double X(Zijk_[j + No*k + No*No*i]);
-                double Y(Zijk_[k + No*i + No*No*j]);
-                double Z(Zijk_[k + No*j + No*No*i]);
-
-                double A(Tijk_[i + No*j + No*No*k]);
-                double B(Tijk_[i + No*k + No*No*j]);
-                double C(Tijk_[j + No*i + No*No*k]);
-                double D(Tijk_[j + No*k + No*No*i]);
-                double E(Tijk_[k + No*i + No*No*j]);
-                double F(Tijk_[k + No*j + No*No*i]);
-                double value(3.0*(A*U+B*V+C*W+D*X+E*Y+F*Z)
-                            +((U+X+Y)-2.0*(V+W+Z))*(A+D+E)
-                            +((V+W+Z)-2.0*(U+X+Y))*(B+C+F));
-                energy += 2.0*value / denominator * facjk * facij;
+                const F
+                    ei(epsi[i])
+                  , facij = i == j ? F(0.5) : F(1.0)
+                  , denominator(epsabc - ei - ej - ek)
+                  , U(Zijk_[i + No*j + No*No*k])
+                  , V(Zijk_[i + No*k + No*No*j])
+                  , W(Zijk_[j + No*i + No*No*k])
+                  , X(Zijk_[j + No*k + No*No*i])
+                  , Y(Zijk_[k + No*i + No*No*j])
+                  , Z(Zijk_[k + No*j + No*No*i])
+                  , A(std::conj(Tijk_[i + No*j + No*No*k]))
+                  , B(std::conj(Tijk_[i + No*k + No*No*j]))
+                  , C(std::conj(Tijk_[j + No*i + No*No*k]))
+                  , D(std::conj(Tijk_[j + No*k + No*No*i]))
+                  , E(std::conj(Tijk_[k + No*i + No*No*j]))
+                  , F(std::conj(Tijk_[k + No*j + No*No*i]))
+                  , value
+                    = 3.0 * ( A * U
+                              + B * V
+                              + C * W
+                              + D * X
+                              + E * Y
+                              + F * Z )
+                   + ( ( U + X + Y )
+                     - 2.0 * ( V + W + Z )
+                     ) * ( A + D + E )
+                   + ( ( V + W + Z )
+                     - 2.0 * ( U + X + Y )
+                     ) * ( B + C + F )
+                  ;
+                energy += 2.0 * value / denominator * facjk * facij;
               } // i
             } // j
           } // k
         } // ii
       } // jj
     } // kk
-    return energy;
+    return std::real(energy);
   }
 
 
+  template <typename F=double>
   double getEnergySame
-    ( const double epsabc
-    , std::vector<double> const& epsi
-    , std::vector<double> const& Tijk_
-    , std::vector<double> const& Zijk_
+    ( const F epsabc
+    , std::vector<F> const& epsi
+    , std::vector<F> const& Tijk_
+    , std::vector<F> const& Zijk_
     ) {
     constexpr size_t blockSize = 16;
     const size_t No = epsi.size();
-    double energy(0.);
+    F energy = F(0.);
     for (size_t kk=0; kk<No; kk+=blockSize){
       const size_t kend( std::min( kk+blockSize, No) );
       for (size_t jj(kk); jj<No; jj+=blockSize){
@@ -75,42 +88,50 @@ namespace atrip {
         for (size_t ii(jj); ii<No; ii+=blockSize){
           const size_t iend( std::min( ii+blockSize, No) );
           for (size_t k(kk); k < kend; k++){
-            const double ek(epsi[k]);
+            const F ek(epsi[k]);
             const size_t jstart = jj > k ? jj : k;
             for(size_t j(jstart); j < jend; j++){
-              const double facjk( j == k ? 0.5 : 1.0);
-              const double ej(epsi[j]);
+              const F facjk( j == k ? F(0.5) : F(1.0));
+              const F ej(epsi[j]);
               const size_t istart = ii > j ? ii : j;
               for(size_t i(istart); i < iend; i++){
-                double ei(epsi[i]);
-                double facij ( i==j ? 0.5 : 1.0);
-                double denominator(epsabc - ei - ej - ek);
-                double U(Zijk_[i + No*j + No*No*k]);
-                double V(Zijk_[j + No*k + No*No*i]);
-                double W(Zijk_[k + No*i + No*No*j]);
-                double A(Tijk_[i + No*j + No*No*k]);
-                double B(Tijk_[j + No*k + No*No*i]);
-                double C(Tijk_[k + No*i + No*No*j]);
-                double value(3.0*( A*U + B*V + C*W) - (A+B+C)*(U+V+W));
-                energy += 2.0*value / denominator * facjk * facij;
+                const F
+                  ei(epsi[i])
+                , facij ( i==j ? F(0.5) : F(1.0))
+                , denominator(epsabc - ei - ej - ek)
+                , U(Zijk_[i + No*j + No*No*k])
+                , V(Zijk_[j + No*k + No*No*i])
+                , W(Zijk_[k + No*i + No*No*j])
+                , A(std::conj(Tijk_[i + No*j + No*No*k]))
+                , B(std::conj(Tijk_[j + No*k + No*No*i]))
+                , C(std::conj(Tijk_[k + No*i + No*No*j]))
+                , value
+                  = F(3.0) * ( A * U
+                             + B * V
+                             + C * W
+                             )
+                  - ( A + B + C ) * ( U + V + W )
+                ;
+                energy += F(2.0) * value / denominator * facjk * facij;
               } // i
             } // j
           } // k
         } // ii
       } // jj
     } // kk
-    return energy;
+    return std::real(energy);
   }
 
+  template <typename F=double>
   void singlesContribution
     ( size_t No
     , size_t Nv
     , const ABCTuple &abc
-    , double const* Tph
-    , double const* VABij
-    , double const* VACij
-    , double const* VBCij
-    , double *Zijk
+    , F const* Tph
+    , F const* VABij
+    , F const* VACij
+    , F const* VBCij
+    , F *Zijk
     ) {
     const size_t a(abc[0]), b(abc[1]), c(abc[2]);
     for (size_t k=0; k < No; k++)
@@ -125,31 +146,32 @@ namespace atrip {
     }
   }
 
+  template <typename F=double>
   void doublesContribution
     ( const ABCTuple &abc
     , size_t const No
     , size_t const Nv
     // -- VABCI
-    , double const* VABph
-    , double const* VACph
-    , double const* VBCph
-    , double const* VBAph
-    , double const* VCAph
-    , double const* VCBph
+    , F const* VABph
+    , F const* VACph
+    , F const* VBCph
+    , F const* VBAph
+    , F const* VCAph
+    , F const* VCBph
     // -- VHHHA
-    , double const* VhhhA
-    , double const* VhhhB
-    , double const* VhhhC
+    , F const* VhhhA
+    , F const* VhhhB
+    , F const* VhhhC
     // -- TA
-    , double const* TAphh
-    , double const* TBphh
-    , double const* TCphh
+    , F const* TAphh
+    , F const* TBphh
+    , F const* TCphh
     // -- TABIJ
-    , double const* TABhh
-    , double const* TAChh
-    , double const* TBChh
+    , F const* TABhh
+    , F const* TAChh
+    , F const* TBChh
     // -- TIJK
-    , double *Tijk
+    , F *Tijk
     , atrip::Timings& chrono
     ) {
 
@@ -168,40 +190,47 @@ namespace atrip {
       Tijk[_IJK_(i, j, k)] += _t_buffer[_IJK_(__II, __JJ, __KK)];   \
     }                                                               \
     t_reorder.stop();
-  #define DGEMM_PARTICLES(__A, __B)    \
-    atrip::dgemm_( "T"                 \
-                , "N"                 \
-                , (int const*)&NoNo   \
-                , (int const*)&No     \
-                , (int const*)&Nv     \
-                , &one                \
-                , __A                 \
-                , (int const*)&Nv     \
-                , __B                 \
-                , (int const*)&Nv     \
-                , &zero               \
-                , _t_buffer.data()    \
-                , (int const*)&NoNo   \
-                );
-  #define DGEMM_HOLES(__A, __B, __TRANSB)  \
-    atrip::dgemm_( "N"                     \
-                , __TRANSB                \
-                , (int const*)&NoNo       \
-                , (int const*)&No         \
-                , (int const*)&No         \
-                , &m_one                  \
-                , __A                     \
-                , (int const*)&NoNo       \
-                , __B                     \
-                , (int const*)&No         \
-                , &zero                   \
-                , _t_buffer.data()        \
-                , (int const*)&NoNo       \
-                );
+  #define DGEMM_PARTICLES(__A, __B)      \
+    atrip::xgemm<F>( "T"                 \
+                   , "N"                 \
+                   , (int const*)&NoNo   \
+                   , (int const*)&No     \
+                   , (int const*)&Nv     \
+                   , &one                \
+                   , __A                 \
+                   , (int const*)&Nv     \
+                   , __B                 \
+                   , (int const*)&Nv     \
+                   , &zero               \
+                   , _t_buffer.data()    \
+                   , (int const*)&NoNo   \
+                   );
+  #define DGEMM_HOLES(__A, __B, __TRANSB)    \
+    atrip::xgemm<F>( "N"                     \
+                   , __TRANSB                \
+                   , (int const*)&NoNo       \
+                   , (int const*)&No         \
+                   , (int const*)&No         \
+                   , &m_one                  \
+                   , __A                     \
+                   , (int const*)&NoNo       \
+                   , __B                     \
+                   , (int const*)&No         \
+                   , &zero                   \
+                   , _t_buffer.data()        \
+                   , (int const*)&NoNo       \
+                   );
+  #define MAYBE_CONJ(_conj, _buffer)                          \
+    if (traits::isComplex<F>()) {                             \
+      for (size_t __i = 0; __i < NoNoNo; ++__i)               \
+        _conj[__i] = std::conj(_buffer[__i]);                 \
+    } else {                                                  \
+      for (size_t __i = 0; __i < NoNoNo; ++__i)               \
+        _conj[__i] = _buffer[__i];                            \
+    }
 
-    using F = double;
     const size_t NoNoNo = No*NoNo;
-    std::vector<double> _t_buffer;
+    std::vector<F> _t_buffer;
     _t_buffer.reserve(NoNoNo);
     F one{1.0}, m_one{-1.0}, zero{0.0};
 
@@ -214,38 +243,48 @@ namespace atrip {
 
     chrono["doubles:holes"].start();
     { // Holes part ============================================================
+
+      std::vector<F> _vhhh(NoNoNo);
+
       // VhhhC[i + k*No + L*NoNo] * TABhh[L + j*No]; H1
+      MAYBE_CONJ(_vhhh, VhhhC)
       chrono["doubles:holes:1"].start();
-      DGEMM_HOLES(VhhhC, TABhh, "N")
+      DGEMM_HOLES(_vhhh.data(), TABhh, "N")
       REORDER(i, k, j)
       chrono["doubles:holes:1"].stop();
       // VhhhC[j + k*No + L*NoNo] * TABhh[i + L*No]; H0
       chrono["doubles:holes:2"].start();
-      DGEMM_HOLES(VhhhC, TABhh, "T")
+      DGEMM_HOLES(_vhhh.data(), TABhh, "T")
       REORDER(j, k, i)
       chrono["doubles:holes:2"].stop();
+
       // VhhhB[i + j*No + L*NoNo] * TAChh[L + k*No]; H5
+      MAYBE_CONJ(_vhhh, VhhhB)
       chrono["doubles:holes:3"].start();
-      DGEMM_HOLES(VhhhB, TAChh, "N")
+      DGEMM_HOLES(_vhhh.data(), TAChh, "N")
       REORDER(i, j, k)
       chrono["doubles:holes:3"].stop();
       // VhhhB[k + j*No + L*NoNo] * TAChh[i + L*No]; H3
       chrono["doubles:holes:4"].start();
-      DGEMM_HOLES(VhhhB, TAChh, "T")
+      DGEMM_HOLES(_vhhh.data(), TAChh, "T")
       REORDER(k, j, i)
       chrono["doubles:holes:4"].stop();
+
       // VhhhA[j + i*No + L*NoNo] * TBChh[L + k*No]; H1
+      MAYBE_CONJ(_vhhh, VhhhA)
       chrono["doubles:holes:5"].start();
-      DGEMM_HOLES(VhhhA, TBChh, "N")
+      DGEMM_HOLES(_vhhh.data(), TBChh, "N")
       REORDER(j, i, k)
       chrono["doubles:holes:5"].stop();
       // VhhhA[k + i*No + L*NoNo] * TBChh[j + L*No]; H4
       chrono["doubles:holes:6"].start();
-      DGEMM_HOLES(VhhhA, TBChh, "T")
+      DGEMM_HOLES(_vhhh.data(), TBChh, "T")
       REORDER(k, i, j)
       chrono["doubles:holes:6"].stop();
+
     }
     chrono["doubles:holes"].stop();
+  #undef MAYBE_CONJ
 
     chrono["doubles:particles"].start();
     { // Particle part =========================================================
diff --git a/include/atrip/RankMap.hpp b/include/atrip/RankMap.hpp
index 82bb674..8564f9e 100644
--- a/include/atrip/RankMap.hpp
+++ b/include/atrip/RankMap.hpp
@@ -1,4 +1,4 @@
-// [[file:../../atrip.org::*The rank mapping][The rank mapping:1]]
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*The%20rank%20mapping][The rank mapping:1]]
 #pragma once
 
 #include <vector>
@@ -7,6 +7,8 @@
 #include <atrip/Slice.hpp>
 
 namespace atrip {
+
+  template <typename F=double>
   struct RankMap {
 
     std::vector<size_t> const lengths;
@@ -19,7 +21,7 @@ namespace atrip {
                             1UL, std::multiplies<size_t>()))
     { assert(lengths.size() <= 2); }
 
-    size_t find(Slice::Location const& p) const noexcept {
+    size_t find(typename Slice<F>::Location const& p) const noexcept {
       return p.source * np + p.rank;
     }
 
@@ -39,10 +41,10 @@ namespace atrip {
       return source == nSources() && isPaddingRank(rank);
     }
 
-    Slice::Location
-    find(ABCTuple const& abc, Slice::Type sliceType) const noexcept {
+    typename Slice<F>::Location
+    find(ABCTuple const& abc, typename Slice<F>::Type sliceType) const noexcept {
       // tuple = {11, 8} when abc = {11, 8, 9} and sliceType = AB
-      const auto tuple = Slice::subtupleBySlice(abc, sliceType);
+      const auto tuple = Slice<F>::subtupleBySlice(abc, sliceType);
 
       const size_t index
         = tuple[0]
diff --git a/include/atrip/Slice.hpp b/include/atrip/Slice.hpp
index a7a5363..877d72a 100644
--- a/include/atrip/Slice.hpp
+++ b/include/atrip/Slice.hpp
@@ -1,4 +1,4 @@
-// [[file:../../atrip.org::*The slice][The slice:1]]
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*The%20slice][The slice:1]]
 #pragma once
 #include <iostream>
 #include <algorithm>
@@ -7,16 +7,26 @@
 
 #include <atrip/Tuples.hpp>
 #include <atrip/Utils.hpp>
+#include <atrip/Blas.hpp>
 
 namespace atrip {
 
+namespace traits { // per-scalar-type facts for the field F (double or Complex)
+  template <typename FF> bool isComplex() { return false; } // primary template: every type without a specialization reports "not complex"
+  template <> inline bool isComplex<Complex>() { return true; } // inline: a full specialization defined in a header is an ordinary function for the ODR
+namespace mpi { // maps a scalar type to the MPI datatype handle describing it
+  template <typename FF> MPI_Datatype datatypeOf(void); // declared only: a type without a specialization below fails at link time
+  template <> inline MPI_Datatype datatypeOf<double>() { return MPI_DOUBLE; } // inline for the same ODR reason as above
+  template <> inline MPI_Datatype datatypeOf<Complex>() { return MPI_DOUBLE_COMPLEX; } // NOTE(review): Fortran-named handle; confirm whether MPI_CXX_DOUBLE_COMPLEX is the intended one
+}
+}
 
+
+template <typename F=double>
 struct Slice {
-
-  using F = double;
 // The slice:1 ends here
 
-// [[file:../../atrip.org::*The slice][The slice:2]]
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*The%20slice][The slice:2]]
 // ASSOCIATED TYPES %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 
   struct Location { size_t rank; size_t source; };
@@ -93,8 +103,8 @@ struct Slice {
 
   // DATABASE ==========================================================={{{1
   struct LocalDatabaseElement {
-    Slice::Name name;
-    Slice::Info info;
+    Slice<F>::Name name;
+    Slice<F>::Info info;
   };
   using LocalDatabase = std::vector<LocalDatabaseElement>;
   using Database = LocalDatabase;
@@ -117,7 +127,7 @@ struct Slice {
         constexpr int n = 2;
         // create a sliceLocation to measure in the current architecture
         // the packing of the struct
-        Slice::Location measure;
+        Slice<F>::Location measure;
         MPI_Datatype dt;
         const std::vector<int> lengths(n, 1);
         const MPI_Datatype types[n] = {usizeDt(), usizeDt()};
@@ -141,7 +151,7 @@ struct Slice {
       static MPI_Datatype sliceInfo () {
         constexpr int n = 5;
         MPI_Datatype dt;
-        Slice::Info measure;
+        Slice<F>::Info measure;
         const std::vector<int> lengths(n, 1);
         const MPI_Datatype types[n]
           = { vector(2, usizeDt())
@@ -213,10 +223,10 @@ struct Slice {
      * It is important here to return a reference to a Slice
      * not to accidentally copy the associated buffer of the slice.
      */
-    static Slice& findOneByType(std::vector<Slice> &slices, Slice::Type type) {
+    static Slice<F>& findOneByType(std::vector<Slice<F>> &slices, Slice<F>::Type type) {
         const auto sliceIt
           = std::find_if(slices.begin(), slices.end(),
-                         [&type](Slice const& s) {
+                         [&type](Slice<F> const& s) {
                            return type == s.info.type;
                          });
         WITH_CRAZY_DEBUG
@@ -231,11 +241,11 @@ struct Slice {
      * Check if an info has
      *
      */
-    static std::vector<Slice*> hasRecycledReferencingToIt
-      ( std::vector<Slice> &slices
+    static std::vector<Slice<F>*> hasRecycledReferencingToIt
+      ( std::vector<Slice<F>> &slices
       , Info const& info
       ) {
-      std::vector<Slice*> result;
+      std::vector<Slice<F>*> result;
 
       for (auto& s: slices)
         if (  s.info.recycling == info.type
@@ -246,11 +256,11 @@ struct Slice {
       return result;
     }
 
-    static Slice&
-    findRecycledSource (std::vector<Slice> &slices, Slice::Info info) {
+    static Slice<F>&
+    findRecycledSource (std::vector<Slice<F>> &slices, Slice<F>::Info info) {
       const auto sliceIt
         = std::find_if(slices.begin(), slices.end(),
-                       [&info](Slice const& s) {
+                       [&info](Slice<F> const& s) {
                          return info.recycling == s.info.type
                              && info.tuple == s.info.tuple
                              && State::Recycled != s.info.state
@@ -270,15 +280,15 @@ struct Slice {
       return *sliceIt;
     }
 
-    static Slice& findByTypeAbc
-      ( std::vector<Slice> &slices
-      , Slice::Type type
+    static Slice<F>& findByTypeAbc
+      ( std::vector<Slice<F>> &slices
+      , Slice<F>::Type type
       , ABCTuple const& abc
       ) {
-        const auto tuple = Slice::subtupleBySlice(abc, type);
+        const auto tuple = Slice<F>::subtupleBySlice(abc, type);
         const auto sliceIt
           = std::find_if(slices.begin(), slices.end(),
-                         [&type, &tuple](Slice const& s) {
+                         [&type, &tuple](Slice<F> const& s) {
                            return type == s.info.type
                                && tuple == s.info.tuple
                                ;
@@ -298,11 +308,11 @@ struct Slice {
         return *sliceIt;
     }
 
-    static Slice& findByInfo(std::vector<Slice> &slices,
-                             Slice::Info const& info) {
+    static Slice<F>& findByInfo(std::vector<Slice<F>> &slices,
+                             Slice<F>::Info const& info) {
         const auto sliceIt
           = std::find_if(slices.begin(), slices.end(),
-                         [&info](Slice const& s) {
+                         [&info](Slice<F> const& s) {
                            // TODO: maybe implement comparison in Info struct
                            return info.type == s.info.type
                                && info.state == s.info.state
@@ -448,13 +458,15 @@ struct Slice {
   }; // struct Slice
 
 
-std::ostream& operator<<(std::ostream& out, Slice::Location const& v) {
+template <typename F=double> // NOTE: F only appears in a non-deduced context below, so `out << loc` can only ever pick the default F=double
+std::ostream& operator<<(std::ostream& out, typename Slice<F>::Location const& v) { // writes the rank and source fields of v to out and returns out
   // TODO: remove me
   out << "{.r(" << v.rank << "), .s(" << v.source << ")};";
   return out;
 }
 
-std::ostream& operator<<(std::ostream& out, Slice::Info const& i) {
+template <typename F=double>
+std::ostream& operator<<(std::ostream& out, typename Slice<F>::Info const& i) {
   out << "«t" << i.type << ", s" << i.state << "»"
       << " ⊙ {" << i.from.rank << ", " << i.from.source << "}"
       << " ∴ {" << i.tuple[0] << ", " << i.tuple[1] << "}"
diff --git a/include/atrip/SliceUnion.hpp b/include/atrip/SliceUnion.hpp
index 060dcc2..ec7aff6 100644
--- a/include/atrip/SliceUnion.hpp
+++ b/include/atrip/SliceUnion.hpp
@@ -1,4 +1,4 @@
-// [[file:../../atrip.org::*The slice union][The slice union:1]]
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*The%20slice%20union][The slice union:1]]
 #pragma once
 #include <atrip/Debug.hpp>
 #include <atrip/Slice.hpp>
@@ -6,8 +6,8 @@
 
 namespace atrip {
 
+  template <typename F=double>
   struct SliceUnion {
-    using F = double;
     using Tensor = CTF::Tensor<F>;
 
     virtual void
@@ -20,7 +20,7 @@ namespace atrip {
      * This means that there can be at most one slice with a given Ty_x_Tu.
      */
     void checkForDuplicates() const {
-      std::vector<Slice::Ty_x_Tu> tytus;
+      std::vector<typename Slice<F>::Ty_x_Tu> tytus;
       for (auto const& s: slices) {
         if (s.isFree()) continue;
         tytus.push_back({s.info.type, s.info.tuple});
@@ -33,13 +33,13 @@ namespace atrip {
 
     }
 
-    std::vector<Slice::Ty_x_Tu> neededSlices(ABCTuple const& abc) {
-      std::vector<Slice::Ty_x_Tu> needed(sliceTypes.size());
+    std::vector<typename Slice<F>::Ty_x_Tu> neededSlices(ABCTuple const& abc) {
+      std::vector<typename Slice<F>::Ty_x_Tu> needed(sliceTypes.size());
       // build the needed vector
       std::transform(sliceTypes.begin(), sliceTypes.end(),
                      needed.begin(),
-                     [&abc](Slice::Type const type) {
-                       auto tuple = Slice::subtupleBySlice(abc, type);
+                     [&abc](typename Slice<F>::Type const type) {
+                       auto tuple = Slice<F>::subtupleBySlice(abc, type);
                        return std::make_pair(type, tuple);
                      });
       return needed;
@@ -64,8 +64,9 @@ namespace atrip {
      * slices.
      *
      */
-    Slice::LocalDatabase buildLocalDatabase(ABCTuple const& abc) {
-      Slice::LocalDatabase result;
+    typename
+    Slice<F>::LocalDatabase buildLocalDatabase(ABCTuple const& abc) {
+      typename Slice<F>::LocalDatabase result;
 
       auto const needed = neededSlices(abc);
 
@@ -95,7 +96,7 @@ namespace atrip {
           // need
           auto const& it
             = std::find_if(slices.begin(), slices.end(),
-                           [&tuple, &type](Slice const& other) {
+                           [&tuple, &type](Slice<F> const& other) {
                              return other.info.tuple == tuple
                                  && other.info.type == type
                                     // we only want another slice when it
@@ -121,7 +122,7 @@ namespace atrip {
         // tuple and that has a valid data pointer.
         auto const& recycleIt
           = std::find_if(slices.begin(), slices.end(),
-                         [&tuple, &type](Slice const& other) {
+                         [&tuple, &type](Slice<F> const& other) {
                            return other.info.tuple == tuple
                                && other.info.type != type
                                && other.isRecyclable()
@@ -132,13 +133,13 @@ namespace atrip {
         // (which should exist by construction :THINK)
         //
         if (recycleIt != slices.end()) {
-          auto& blank = Slice::findOneByType(slices, Slice::Blank);
+          auto& blank = Slice<F>::findOneByType(slices, Slice<F>::Blank);
           // TODO: formalize this through a method to copy information
           //       from another slice
           blank.data = recycleIt->data;
           blank.info.type = type;
           blank.info.tuple = tuple;
-          blank.info.state = Slice::Recycled;
+          blank.info.state = Slice<F>::Recycled;
           blank.info.from = from;
           blank.info.recycling = recycleIt->info.type;
           result.push_back({name, blank.info});
@@ -165,17 +166,17 @@ namespace atrip {
                     << " for tuple " << tuple[0] << ", " << tuple[1]
                     << "\n"
                     ;
-          auto& blank = Slice::findOneByType(slices, Slice::Blank);
+          auto& blank = Slice<F>::findOneByType(slices, Slice<F>::Blank);
           blank.info.type = type;
           blank.info.tuple = tuple;
           blank.info.from = from;
 
           // Handle self sufficiency
           blank.info.state = Atrip::rank == from.rank
-                           ? Slice::SelfSufficient
-                           : Slice::Fetch
+                           ? Slice<F>::SelfSufficient
+                           : Slice<F>::Fetch
                            ;
-          if (blank.info.state == Slice::SelfSufficient) {
+          if (blank.info.state == Slice<F>::SelfSufficient) {
             blank.data = sources[from.source].data();
           } else {
             if (freePointers.size() == 0)
@@ -219,7 +220,7 @@ namespace atrip {
         // try to find the slice in the needed slices list
         auto const found
           = std::find_if(needed.begin(), needed.end(),
-                         [&slice] (Slice::Ty_x_Tu const& tytu) {
+                         [&slice] (typename Slice<F>::Ty_x_Tu const& tytu) {
                            return slice.info.tuple == tytu.second
                                && slice.info.type == tytu.first
                                ;
@@ -238,7 +239,7 @@ namespace atrip {
 
           // allow to gc unwrapped and recycled, never Fetch,
           // if we have a Fetch slice then something has gone very wrong.
-          if (!slice.isUnwrapped() && slice.info.state != Slice::Recycled)
+          if (!slice.isUnwrapped() && slice.info.state != Slice<F>::Recycled)
             throw
               std::domain_error("Trying to garbage collect "
                                 " a non-unwrapped slice! "
@@ -259,13 +260,13 @@ namespace atrip {
           //  - we should make sure that the data pointer of slice
           //    does not get freed.
           //
-          if (slice.info.state == Slice::Ready) {
+          if (slice.info.state == Slice<F>::Ready) {
             WITH_OCD WITH_RANK
               << "__gc__:" << "checking for data recycled dependencies\n";
             auto recycled
-              = Slice::hasRecycledReferencingToIt(slices, slice.info);
+              = Slice<F>::hasRecycledReferencingToIt(slices, slice.info);
             if (recycled.size()) {
-              Slice* newReady = recycled[0];
+              Slice<F>* newReady = recycled[0];
               WITH_OCD WITH_RANK
                 << "__gc__:" << "swaping recycled "
                 << pretty_print(newReady->info)
@@ -290,8 +291,8 @@ namespace atrip {
 
           // if the slice is self sufficient, do not dare touching the
           // pointer, since it is a pointer to our sources in our rank.
-          if (  slice.info.state == Slice::SelfSufficient
-             || slice.info.state == Slice::Recycled
+          if (  slice.info.state == Slice<F>::SelfSufficient
+             || slice.info.state == Slice<F>::Recycled
              ) {
             freeSlicePointer = false;
           }
@@ -313,7 +314,8 @@ namespace atrip {
           // at this point, let us blank the slice
           WITH_RANK << "~~~:cl(" << name << ")"
                     << " freeing up slice "
-                    << " info " << slice.info
+                    // TODO: make this possible
+                    // << " info " << slice.info
                     << "\n";
           slice.free();
         }
@@ -323,13 +325,13 @@ namespace atrip {
 
     // CONSTRUCTOR
     SliceUnion( Tensor const& sourceTensor
-              , std::vector<Slice::Type> sliceTypes_
+              , std::vector<typename Slice<F>::Type> sliceTypes_
               , std::vector<size_t> sliceLength_
               , std::vector<size_t> paramLength
               , size_t np
               , MPI_Comm child_world
               , MPI_Comm global_world
-              , Slice::Name name_
+              , typename Slice<F>::Name name_
               , size_t nSliceBuffers = 4
               )
               : rankMap(paramLength, np)
@@ -344,13 +346,13 @@ namespace atrip {
               , name(name_)
               , sliceTypes(sliceTypes_)
               , sliceBuffers(nSliceBuffers, sources[0])
-              //, slices(2 * sliceTypes.size(), Slice{ sources[0].size() })
+              //, slices(2 * sliceTypes.size(), Slice<F>{ sources[0].size() })
     { // constructor begin
 
       LOG(0,"Atrip") << "INIT SliceUnion: " << name << "\n";
 
       slices
-        = std::vector<Slice>(2 * sliceTypes.size(), { sources[0].size() });
+        = std::vector<Slice<F>>(2 * sliceTypes.size(), { sources[0].size() });
       // TODO: think exactly ^------------------- about this number
 
       // initialize the freePointers with the pointers to the buffers
@@ -419,19 +421,19 @@ namespace atrip {
      * \brief Send asynchronously only if the state is Fetch
      */
     void send( size_t otherRank
-             , Slice::Info const& info
+             , typename Slice<F>::Info const& info
              , size_t tag) const noexcept {
       MPI_Request request;
       bool sendData_p = false;
 
-      if (info.state == Slice::Fetch) sendData_p = true;
+      if (info.state == Slice<F>::Fetch) sendData_p = true;
       // TODO: remove this because I have SelfSufficient
       if (otherRank == info.from.rank)      sendData_p = false;
       if (!sendData_p) return;
 
       MPI_Isend( sources[info.from.source].data()
                , sources[info.from.source].size()
-               , MPI_DOUBLE /* TODO: adapt this with traits */
+               , traits::mpi::datatypeOf<F>()
                , otherRank
                , tag
                , universe
@@ -445,19 +447,19 @@ namespace atrip {
     /**
      * \brief Receive asynchronously only if the state is Fetch
      */
-    void receive(Slice::Info const& info, size_t tag) noexcept {
-      auto& slice = Slice::findByInfo(slices, info);
+    void receive(typename Slice<F>::Info const& info, size_t tag) noexcept {
+      auto& slice = Slice<F>::findByInfo(slices, info);
 
       if (Atrip::rank == info.from.rank) return;
 
-      if (slice.info.state == Slice::Fetch) {
+      if (slice.info.state == Slice<F>::Fetch) {
         // TODO: do it through the slice class
-        slice.info.state = Slice::Dispatched;
+        slice.info.state = Slice<F>::Dispatched;
         MPI_Request request;
         slice.request = request;
         MPI_Irecv( slice.data
                  , slice.size
-                 , MPI_DOUBLE // TODO: Adapt this with traits
+                 , traits::mpi::datatypeOf<F>()
                  , info.from.rank
                  , tag
                  , universe
@@ -471,42 +473,42 @@ namespace atrip {
       for (auto type: sliceTypes) unwrapSlice(type, abc);
     }
 
-    F* unwrapSlice(Slice::Type type, ABCTuple const& abc) {
+    F* unwrapSlice(typename Slice<F>::Type type, ABCTuple const& abc) {
       WITH_CRAZY_DEBUG
       WITH_RANK << "__unwrap__:slice " << type << " w n "
                 << name
                 << " abc" << pretty_print(abc)
                 << "\n";
-      auto& slice = Slice::findByTypeAbc(slices, type, abc);
-      WITH_RANK << "__unwrap__:info " << slice.info << "\n";
+      auto& slice = Slice<F>::findByTypeAbc(slices, type, abc);
+      //WITH_RANK << "__unwrap__:info " << slice.info << "\n";
       switch  (slice.info.state) {
-        case Slice::Dispatched:
+        case Slice<F>::Dispatched:
           WITH_RANK << "__unwrap__:Fetch: " << &slice
                     << " info " << pretty_print(slice.info)
                     << "\n";
           slice.unwrapAndMarkReady();
           return slice.data;
           break;
-        case Slice::SelfSufficient:
+        case Slice<F>::SelfSufficient:
           WITH_RANK << "__unwrap__:SelfSufficient: " << &slice
                     << " info " << pretty_print(slice.info)
                     << "\n";
           return slice.data;
           break;
-        case Slice::Ready:
+        case Slice<F>::Ready:
           WITH_RANK << "__unwrap__:READY: UNWRAPPED ALREADY" << &slice
                     << " info " << pretty_print(slice.info)
                     << "\n";
           return slice.data;
           break;
-        case Slice::Recycled:
+        case Slice<F>::Recycled:
           WITH_RANK << "__unwrap__:RECYCLED " << &slice
                     << " info " << pretty_print(slice.info)
                     << "\n";
           return unwrapSlice(slice.info.recycling, abc);
           break;
-        case Slice::Fetch:
-        case Slice::Acceptor:
+        case Slice<F>::Fetch:
+        case Slice<F>::Acceptor:
           throw std::domain_error("Can't unwrap an acceptor or fetch slice!");
           break;
         default:
@@ -515,24 +517,26 @@ namespace atrip {
       return slice.data;
     }
 
-    const RankMap rankMap;
+    const RankMap<F> rankMap;
     const MPI_Comm world;
     const MPI_Comm universe;
     const std::vector<size_t> sliceLength;
     std::vector< std::vector<F> > sources;
-    std::vector< Slice > slices;
-    Slice::Name name;
-    const std::vector<Slice::Type> sliceTypes;
+    std::vector< Slice<F> > slices;
+    typename Slice<F>::Name name;
+    const std::vector<typename Slice<F>::Type> sliceTypes;
     std::vector< std::vector<F> > sliceBuffers;
     std::set<F*> freePointers;
 
   };
 
-  SliceUnion&
-  unionByName(std::vector<SliceUnion*> const& unions, Slice::Name name) {
+  template <typename F=double>
+  SliceUnion<F>&
+  unionByName(std::vector<SliceUnion<F>*> const& unions,
+              typename Slice<F>::Name name) {
       const auto sliceUnionIt
         = std::find_if(unions.begin(), unions.end(),
-                      [&name](SliceUnion const* s) {
+                      [&name](SliceUnion<F> const* s) {
                         return name == s->name;
                       });
       if (sliceUnionIt == unions.end())
diff --git a/include/atrip/Tuples.hpp b/include/atrip/Tuples.hpp
index 090eb9b..5d4b69f 100644
--- a/include/atrip/Tuples.hpp
+++ b/include/atrip/Tuples.hpp
@@ -1,4 +1,4 @@
-// [[file:../../atrip.org::*Tuples][Tuples:1]]
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Tuples][Tuples:1]]
 #pragma once
 
 #include <vector>
diff --git a/include/atrip/Unions.hpp b/include/atrip/Unions.hpp
index de924ee..db3b6b7 100644
--- a/include/atrip/Unions.hpp
+++ b/include/atrip/Unions.hpp
@@ -1,15 +1,16 @@
-// [[file:../../atrip.org::*Unions][Unions:1]]
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Unions][Unions:1]]
 #pragma once
 #include <atrip/SliceUnion.hpp>
 
 namespace atrip {
 
+  template <typename F=double>
   void sliceIntoVector
-    ( std::vector<double> &v
-    , CTF::Tensor<double> &toSlice
+    ( std::vector<F> &v
+    , CTF::Tensor<F> &toSlice
     , std::vector<int64_t> const low
     , std::vector<int64_t> const up
-    , CTF::Tensor<double> const& origin
+    , CTF::Tensor<F> const& origin
     , std::vector<int64_t> const originLow
     , std::vector<int64_t> const originUp
     ) {
@@ -36,155 +37,159 @@ namespace atrip {
                  , origin_.low.data()
                  , origin_.up.data()
                  , 1.0);
-    memcpy(v.data(), toSlice.data, sizeof(double) * v.size());
+    memcpy(v.data(), toSlice.data, sizeof(F) * v.size());
 #endif
 
   }
 
 
-  struct TAPHH : public SliceUnion {
-    TAPHH( Tensor const& sourceTensor
+  template <typename F=double>
+  struct TAPHH : public SliceUnion<F> {
+    TAPHH( CTF::Tensor<F> const& sourceTensor
          , size_t No
          , size_t Nv
          , size_t np
          , MPI_Comm child_world
          , MPI_Comm global_world
-         ) : SliceUnion( sourceTensor
-                       , {Slice::A, Slice::B, Slice::C}
-                       , {Nv, No, No} // size of the slices
-                       , {Nv}
-                       , np
-                       , child_world
-                       , global_world
-                       , Slice::TA
-                       , 4) {
+         ) : SliceUnion<F>( sourceTensor
+                          , {Slice<F>::A, Slice<F>::B, Slice<F>::C}
+                          , {Nv, No, No} // size of the slices
+                          , {Nv}
+                          , np
+                          , child_world
+                          , global_world
+                          , Slice<F>::TA
+                          , 4) {
            init(sourceTensor);
          }
 
-    void sliceIntoBuffer(size_t it, Tensor &to, Tensor const& from) override
+    void sliceIntoBuffer(size_t it, CTF::Tensor<F> &to, CTF::Tensor<F> const& from) override
     {
-      const int Nv = sliceLength[0]
-              , No = sliceLength[1]
-              , a = rankMap.find({static_cast<size_t>(Atrip::rank), it});
+      const int Nv = this->sliceLength[0]
+              , No = this->sliceLength[1]
+              , a = this->rankMap.find({static_cast<size_t>(Atrip::rank), it});
               ;
 
 
-      sliceIntoVector( sources[it]
-                     , to,   {0, 0, 0},    {Nv, No, No}
-                     , from, {a, 0, 0, 0}, {a+1, Nv, No, No}
-                     );
+      sliceIntoVector<F>( this->sources[it]
+                        , to,   {0, 0, 0},    {Nv, No, No}
+                        , from, {a, 0, 0, 0}, {a+1, Nv, No, No}
+                        );
 
     }
 
   };
 
 
-  struct HHHA : public SliceUnion {
-    HHHA( Tensor const& sourceTensor
+  template <typename F=double>
+  struct HHHA : public SliceUnion<F> {
+    HHHA( CTF::Tensor<F> const& sourceTensor
         , size_t No
         , size_t Nv
         , size_t np
         , MPI_Comm child_world
         , MPI_Comm global_world
-        ) : SliceUnion( sourceTensor
-                      , {Slice::A, Slice::B, Slice::C}
-                      , {No, No, No} // size of the slices
-                      , {Nv}         // size of the parametrization
-                      , np
-                      , child_world
-                      , global_world
-                      , Slice::VIJKA
-                      , 4) {
+        ) : SliceUnion<F>( sourceTensor
+                         , {Slice<F>::A, Slice<F>::B, Slice<F>::C}
+                         , {No, No, No} // size of the slices
+                         , {Nv}         // size of the parametrization
+                         , np
+                         , child_world
+                         , global_world
+                         , Slice<F>::VIJKA
+                         , 4) {
            init(sourceTensor);
          }
 
-    void sliceIntoBuffer(size_t it, Tensor &to, Tensor const& from) override
+    void sliceIntoBuffer(size_t it, CTF::Tensor<F> &to, CTF::Tensor<F> const& from) override
     {
 
-      const int No = sliceLength[0]
-              , a = rankMap.find({static_cast<size_t>(Atrip::rank), it})
+      const int No = this->sliceLength[0]
+              , a = this->rankMap.find({static_cast<size_t>(Atrip::rank), it})
               ;
 
-      sliceIntoVector( sources[it]
-                     , to,   {0, 0, 0},    {No, No, No}
-                     , from, {0, 0, 0, a}, {No, No, No, a+1}
-                     );
+      sliceIntoVector<F>( this->sources[it]
+                        , to,   {0, 0, 0},    {No, No, No}
+                        , from, {0, 0, 0, a}, {No, No, No, a+1}
+                        );
 
     }
   };
 
-  struct ABPH : public SliceUnion {
-    ABPH( Tensor const& sourceTensor
+  template <typename F=double>
+  struct ABPH : public SliceUnion<F> {
+    ABPH( CTF::Tensor<F> const& sourceTensor
         , size_t No
         , size_t Nv
         , size_t np
         , MPI_Comm child_world
         , MPI_Comm global_world
-        ) : SliceUnion( sourceTensor
-                      , { Slice::AB, Slice::BC, Slice::AC
-                        , Slice::BA, Slice::CB, Slice::CA
-                        }
-                      , {Nv, No} // size of the slices
-                      , {Nv, Nv} // size of the parametrization
-                      , np
-                      , child_world
-                      , global_world
-                      , Slice::VABCI
-                      , 2*6) {
+        ) : SliceUnion<F>( sourceTensor
+                         , { Slice<F>::AB, Slice<F>::BC, Slice<F>::AC
+                           , Slice<F>::BA, Slice<F>::CB, Slice<F>::CA
+                           }
+                         , {Nv, No} // size of the slices
+                         , {Nv, Nv} // size of the parametrization
+                         , np
+                         , child_world
+                         , global_world
+                         , Slice<F>::VABCI
+                         , 2*6) {
            init(sourceTensor);
          }
 
-    void sliceIntoBuffer(size_t it, Tensor &to, Tensor const& from) override {
+    void sliceIntoBuffer(size_t it, CTF::Tensor<F> &to, CTF::Tensor<F> const& from) override {
 
-      const int Nv = sliceLength[0]
-              , No = sliceLength[1]
-              , el = rankMap.find({static_cast<size_t>(Atrip::rank), it})
+      const int Nv = this->sliceLength[0]
+              , No = this->sliceLength[1]
+              , el = this->rankMap.find({static_cast<size_t>(Atrip::rank), it})
               , a = el % Nv
               , b = el / Nv
               ;
 
 
-      sliceIntoVector( sources[it]
-                     , to,   {0, 0},       {Nv, No}
-                     , from, {a, b, 0, 0}, {a+1, b+1, Nv, No}
-                     );
+      sliceIntoVector<F>( this->sources[it]
+                        , to,   {0, 0},       {Nv, No}
+                        , from, {a, b, 0, 0}, {a+1, b+1, Nv, No}
+                        );
 
     }
 
   };
 
-  struct ABHH : public SliceUnion {
-    ABHH( Tensor const& sourceTensor
+  template <typename F=double>
+  struct ABHH : public SliceUnion<F> {
+    ABHH( CTF::Tensor<F> const& sourceTensor
         , size_t No
         , size_t Nv
         , size_t np
         , MPI_Comm child_world
         , MPI_Comm global_world
-        ) : SliceUnion( sourceTensor
-                      , {Slice::AB, Slice::BC, Slice::AC}
-                      , {No, No} // size of the slices
-                      , {Nv, Nv} // size of the parametrization
-                      , np
-                      , child_world
-                      , global_world
-                      , Slice::VABIJ
-                      , 6) {
+        ) : SliceUnion<F>( sourceTensor
+                         , {Slice<F>::AB, Slice<F>::BC, Slice<F>::AC}
+                         , {No, No} // size of the slices
+                         , {Nv, Nv} // size of the parametrization
+                         , np
+                         , child_world
+                         , global_world
+                         , Slice<F>::VABIJ
+                         , 6) {
            init(sourceTensor);
          }
 
-    void sliceIntoBuffer(size_t it, Tensor &to, Tensor const& from) override {
+    void sliceIntoBuffer(size_t it, CTF::Tensor<F> &to, CTF::Tensor<F> const& from) override {
 
       const int Nv = from.lens[0]
-              , No = sliceLength[1]
-              , el = rankMap.find({static_cast<size_t>(Atrip::rank), it})
+              , No = this->sliceLength[1]
+              , el = this->rankMap.find({static_cast<size_t>(Atrip::rank), it})
               , a = el % Nv
               , b = el / Nv
               ;
 
-      sliceIntoVector( sources[it]
-                     , to,   {0, 0},       {No, No}
-                     , from, {a, b, 0, 0}, {a+1, b+1, No, No}
-                     );
+      sliceIntoVector<F>( this->sources[it]
+                        , to,   {0, 0},       {No, No}
+                        , from, {a, b, 0, 0}, {a+1, b+1, No, No}
+                        );
 
 
     }
@@ -192,39 +197,40 @@ namespace atrip {
   };
 
 
-  struct TABHH : public SliceUnion {
-    TABHH( Tensor const& sourceTensor
+  template <typename F=double>
+  struct TABHH : public SliceUnion<F> {
+    TABHH( CTF::Tensor<F> const& sourceTensor
          , size_t No
          , size_t Nv
          , size_t np
          , MPI_Comm child_world
          , MPI_Comm global_world
-         ) : SliceUnion( sourceTensor
-                       , {Slice::AB, Slice::BC, Slice::AC}
-                       , {No, No} // size of the slices
-                       , {Nv, Nv} // size of the parametrization
-                       , np
-                       , child_world
-                       , global_world
-                       , Slice::TABIJ
-                       , 6) {
+         ) : SliceUnion<F>( sourceTensor
+                          , {Slice<F>::AB, Slice<F>::BC, Slice<F>::AC}
+                          , {No, No} // size of the slices
+                          , {Nv, Nv} // size of the parametrization
+                          , np
+                          , child_world
+                          , global_world
+                          , Slice<F>::TABIJ
+                          , 6) {
            init(sourceTensor);
          }
 
-    void sliceIntoBuffer(size_t it, Tensor &to, Tensor const& from) override {
+    void sliceIntoBuffer(size_t it, CTF::Tensor<F> &to, CTF::Tensor<F> const& from) override {
       // TODO: maybe generalize this with ABHH
 
       const int Nv = from.lens[0]
-              , No = sliceLength[1]
-              , el = rankMap.find({static_cast<size_t>(Atrip::rank), it})
+              , No = this->sliceLength[1]
+              , el = this->rankMap.find({static_cast<size_t>(Atrip::rank), it})
               , a = el % Nv
               , b = el / Nv
               ;
 
-      sliceIntoVector( sources[it]
-                     , to,   {0, 0},       {No, No}
-                     , from, {a, b, 0, 0}, {a+1, b+1, No, No}
-                     );
+      sliceIntoVector<F>( this->sources[it]
+                        , to,   {0, 0},       {No, No}
+                        , from, {a, b, 0, 0}, {a+1, b+1, No, No}
+                        );
 
 
     }
diff --git a/include/atrip/Utils.hpp b/include/atrip/Utils.hpp
index a6bd743..bff3d19 100644
--- a/include/atrip/Utils.hpp
+++ b/include/atrip/Utils.hpp
@@ -1,4 +1,4 @@
-// [[file:../../atrip.org::*Utils][Utils:1]]
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Utils][Utils:1]]
 #pragma once
 #include <sstream>
 #include <string>
diff --git a/src/atrip/Atrip.cxx b/src/atrip/Atrip.cxx
index 64dea9b..fc613b6 100644
--- a/src/atrip/Atrip.cxx
+++ b/src/atrip/Atrip.cxx
@@ -1,4 +1,4 @@
-// [[file:../../atrip.org::*Main][Main:1]]
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Main][Main:1]]
 #include <iomanip>
 
 #include <atrip/Atrip.hpp>
@@ -23,7 +23,8 @@ void Atrip::init()  {
   MPI_Comm_size(MPI_COMM_WORLD, &Atrip::np);
 }
 
-Atrip::Output Atrip::run(Atrip::Input const& in) {
+template <typename F>
+Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
 
   const int np = Atrip::np;
   const int rank = Atrip::rank;
@@ -38,14 +39,14 @@ Atrip::Output Atrip::run(Atrip::Input const& in) {
   LOG(0,"Atrip") << "Nv: " << Nv << "\n";
 
   // allocate the three scratches, see piecuch
-  std::vector<double> Tijk(No*No*No) // doubles only (see piecuch)
-                    , Zijk(No*No*No) // singles + doubles (see piecuch)
-                    // we need local copies of the following tensors on every
-                    // rank
-                    , epsi(No)
-                    , epsa(Nv)
-                    , Tai(No * Nv)
-                    ;
+  std::vector<F>   Tijk(No*No*No) // doubles only (see piecuch)
+                 , Zijk(No*No*No) // singles + doubles (see piecuch)
+                 // we need local copies of the following tensors on every
+                 // rank
+                 , epsi(No)
+                 , epsa(Nv)
+                 , Tai(No * Nv)
+                 ;
 
   in.ei->read_all(epsi.data());
   in.ea->read_all(epsa.data());
@@ -74,20 +75,20 @@ Atrip::Output Atrip::run(Atrip::Input const& in) {
   chrono["nv-slices"].start();
   // BUILD SLICES PARAMETRIZED BY NV ==================================={{{1
   LOG(0,"Atrip") << "BUILD NV-SLICES\n";
-  TAPHH taphh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
-  HHHA  hhha(*in.Vhhhp, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
+  TAPHH<F> taphh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
+  HHHA<F>  hhha(*in.Vhhhp, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
   chrono["nv-slices"].stop();
 
   chrono["nv-nv-slices"].start();
   // BUILD SLICES PARAMETRIZED BY NV x NV =============================={{{1
   LOG(0,"Atrip") << "BUILD NV x NV-SLICES\n";
-  ABPH abph(*in.Vppph, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
-  ABHH abhh(*in.Vpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
-  TABHH tabhh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
+  ABPH<F> abph(*in.Vppph, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
+  ABHH<F> abhh(*in.Vpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
+  TABHH<F> tabhh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
   chrono["nv-nv-slices"].stop();
 
   // all tensors
-  std::vector< SliceUnion* > unions = {&taphh, &hhha, &abph, &abhh, &tabhh};
+  std::vector< SliceUnion<F>* > unions = {&taphh, &hhha, &abph, &abhh, &tabhh};
 
   //CONSTRUCT TUPLE LIST ==============================================={{{1
   LOG(0,"Atrip") << "BUILD TUPLE LIST\n";
@@ -121,18 +122,20 @@ Atrip::Output Atrip::run(Atrip::Input const& in) {
     = [&tuplesList](size_t const i) { return i >= tuplesList.size(); };
 
 
+  using Database = typename Slice<F>::Database;
+  using LocalDatabase = typename Slice<F>::LocalDatabase;
   auto communicateDatabase
     = [ &unions
       , np
       , &chrono
-      ] (ABCTuple const& abc, MPI_Comm const& c) -> Slice::Database {
+      ] (ABCTuple const& abc, MPI_Comm const& c) -> Database {
 
         chrono["db:comm:type:do"].start();
-        auto MPI_LDB_ELEMENT = Slice::mpi::localDatabaseElement();
+        auto MPI_LDB_ELEMENT = Slice<F>::mpi::localDatabaseElement();
         chrono["db:comm:type:do"].stop();
 
         chrono["db:comm:ldb"].start();
-        Slice::LocalDatabase ldb;
+        LocalDatabase ldb;
 
         for (auto const& tensor: unions) {
           auto const& tensorDb = tensor->buildLocalDatabase(abc);
@@ -140,7 +143,7 @@ Atrip::Output Atrip::run(Atrip::Input const& in) {
         }
         chrono["db:comm:ldb"].stop();
 
-        Slice::Database db(np * ldb.size(), ldb[0]);
+        Database db(np * ldb.size(), ldb[0]);
 
         chrono["oneshot-db:comm:allgather"].start();
         chrono["db:comm:allgather"].start();
@@ -162,7 +165,7 @@ Atrip::Output Atrip::run(Atrip::Input const& in) {
       };
 
   auto doIOPhase
-    = [&unions, &rank, &np, &universe, &chrono] (Slice::Database const& db) {
+    = [&unions, &rank, &np, &universe, &chrono] (Database const& db) {
 
     const size_t localDBLength = db.size() / np;
 
@@ -212,7 +215,7 @@ Atrip::Output Atrip::run(Atrip::Input const& in) {
                 ;
       for (auto it = begin; it != end; ++it) {
         sendTag++;
-        Slice::LocalDatabaseElement const& el = *it;
+        typename Slice<F>::LocalDatabaseElement const& el = *it;
 
         if (el.info.from.rank != rank) continue;
 
@@ -261,14 +264,13 @@ Atrip::Output Atrip::run(Atrip::Input const& in) {
 
   // START MAIN LOOP ======================================================{{{1
 
-  Slice::Database db;
-
   for ( size_t i = abcIndex.first, iteration = 1
       ; i < abcIndex.second
       ; i++, iteration++
       ) {
     chrono["iterations"].start();
 
+
     // check overhead from chrono over all iterations
     chrono["start:stop"].start(); chrono["start:stop"].stop();
 
@@ -347,7 +349,7 @@ Atrip::Output Atrip::run(Atrip::Input const& in) {
       WITH_RANK << "__comm__:" << iteration << "th communicating database\n";
       chrono["db:comm"].start();
       //const auto db = communicateDatabase(*abcNext, universe);
-      db = communicateDatabase(*abcNext, universe);
+      Database db = communicateDatabase(*abcNext, universe);
       chrono["db:comm"].stop();
       chrono["db:io"].start();
       doIOPhase(db);
@@ -368,30 +370,30 @@ Atrip::Output Atrip::run(Atrip::Input const& in) {
       )))
       chrono["oneshot-doubles"].start();
       chrono["doubles"].start();
-      doublesContribution( abc, (size_t)No, (size_t)Nv
-                         // -- VABCI
-                         , abph.unwrapSlice(Slice::AB, abc)
-                         , abph.unwrapSlice(Slice::AC, abc)
-                         , abph.unwrapSlice(Slice::BC, abc)
-                         , abph.unwrapSlice(Slice::BA, abc)
-                         , abph.unwrapSlice(Slice::CA, abc)
-                         , abph.unwrapSlice(Slice::CB, abc)
-                         // -- VHHHA
-                         , hhha.unwrapSlice(Slice::A, abc)
-                         , hhha.unwrapSlice(Slice::B, abc)
-                         , hhha.unwrapSlice(Slice::C, abc)
-                         // -- TA
-                         , taphh.unwrapSlice(Slice::A, abc)
-                         , taphh.unwrapSlice(Slice::B, abc)
-                         , taphh.unwrapSlice(Slice::C, abc)
-                         // -- TABIJ
-                         , tabhh.unwrapSlice(Slice::AB, abc)
-                         , tabhh.unwrapSlice(Slice::AC, abc)
-                         , tabhh.unwrapSlice(Slice::BC, abc)
-                         // -- TIJK
-                         , Tijk.data()
-                         , chrono
-                         );
+      doublesContribution<F>( abc, (size_t)No, (size_t)Nv
+                            // -- VABCI
+                            , abph.unwrapSlice(Slice<F>::AB, abc)
+                            , abph.unwrapSlice(Slice<F>::AC, abc)
+                            , abph.unwrapSlice(Slice<F>::BC, abc)
+                            , abph.unwrapSlice(Slice<F>::BA, abc)
+                            , abph.unwrapSlice(Slice<F>::CA, abc)
+                            , abph.unwrapSlice(Slice<F>::CB, abc)
+                            // -- VHHHA
+                            , hhha.unwrapSlice(Slice<F>::A, abc)
+                            , hhha.unwrapSlice(Slice<F>::B, abc)
+                            , hhha.unwrapSlice(Slice<F>::C, abc)
+                            // -- TA
+                            , taphh.unwrapSlice(Slice<F>::A, abc)
+                            , taphh.unwrapSlice(Slice<F>::B, abc)
+                            , taphh.unwrapSlice(Slice<F>::C, abc)
+                            // -- TABIJ
+                            , tabhh.unwrapSlice(Slice<F>::AB, abc)
+                            , tabhh.unwrapSlice(Slice<F>::AC, abc)
+                            , tabhh.unwrapSlice(Slice<F>::BC, abc)
+                            // -- TIJK
+                            , Tijk.data()
+                            , chrono
+                            );
       WITH_RANK << iteration << "-th doubles done\n";
       chrono["doubles"].stop();
       chrono["oneshot-doubles"].stop();
@@ -409,12 +411,12 @@ Atrip::Output Atrip::run(Atrip::Input const& in) {
       for (size_t I(0); I < Zijk.size(); I++) Zijk[I] = Tijk[I];
       chrono["reorder"].stop();
       chrono["singles"].start();
-      singlesContribution( No, Nv, abc
-                         , Tai.data()
-                         , abhh.unwrapSlice(Slice::AB, abc)
-                         , abhh.unwrapSlice(Slice::AC, abc)
-                         , abhh.unwrapSlice(Slice::BC, abc)
-                         , Zijk.data());
+      singlesContribution<F>( No, Nv, abc
+                            , Tai.data()
+                            , abhh.unwrapSlice(Slice<F>::AB, abc)
+                            , abhh.unwrapSlice(Slice<F>::AC, abc)
+                            , abhh.unwrapSlice(Slice<F>::BC, abc)
+                            , Zijk.data());
       chrono["singles"].stop();
     }
 
@@ -426,13 +428,13 @@ Atrip::Output Atrip::run(Atrip::Input const& in) {
       int distinct(0);
       if (abc[0] == abc[1]) distinct++;
       if (abc[1] == abc[2]) distinct--;
-      const double epsabc(epsa[abc[0]] + epsa[abc[1]] + epsa[abc[2]]);
+      const F epsabc(epsa[abc[0]] + epsa[abc[1]] + epsa[abc[2]]);
 
       chrono["energy"].start();
       if ( distinct == 0)
-        tupleEnergy = getEnergyDistinct(epsabc, epsi, Tijk, Zijk);
+        tupleEnergy = getEnergyDistinct<F>(epsabc, epsi, Tijk, Zijk);
       else
-        tupleEnergy = getEnergySame(epsabc, epsi, Tijk, Zijk);
+        tupleEnergy = getEnergySame<F>(epsabc, epsi, Tijk, Zijk);
       chrono["energy"].stop();
 
 #if defined(HAVE_OCD) || defined(ATRIP_PRINT_TUPLES)
@@ -473,8 +475,8 @@ Atrip::Output Atrip::run(Atrip::Input const& in) {
                   << " :abc " << pretty_print(abc)
                   << " :abcN " << pretty_print(*abcNext)
                   << "\n";
-        for (auto const& slice: u->slices)
-          WITH_RANK << "__gc__:guts:" << slice.info << "\n";
+        // for (auto const& slice: u->slices)
+        //   WITH_RANK << "__gc__:guts:" << slice.info << "\n";
         u->clearUnusedSlicesForNext(*abcNext);
 
         WITH_RANK << "__gc__: checking validity\n";
@@ -482,13 +484,13 @@ Atrip::Output Atrip::run(Atrip::Input const& in) {
 #ifdef HAVE_OCD
         // check for validity of the slices
         for (auto type: u->sliceTypes) {
-          auto tuple = Slice::subtupleBySlice(abc, type);
+          auto tuple = Slice<F>::subtupleBySlice(abc, type);
         for (auto& slice: u->slices) {
           if ( slice.info.type == type
              && slice.info.tuple == tuple
              && slice.isDirectlyFetchable()
              ) {
-            if (slice.info.state == Slice::Dispatched)
+            if (slice.info.state == Slice<F>::Dispatched)
               throw std::domain_error( "This slice should not be undispatched! "
                                      + pretty_print(slice.info));
           }
@@ -555,4 +557,7 @@ Atrip::Output Atrip::run(Atrip::Input const& in) {
   return { - globalEnergy };
 
 }
+// instantiate
+template Atrip::Output Atrip::run(Atrip::Input<double> const& in);
+template Atrip::Output Atrip::run(Atrip::Input<Complex> const& in);
 // Main:1 ends here

From e89bd8f150c261357a0e6896c7eb5ed9bfb56332 Mon Sep 17 00:00:00 2001
From: Alejandro Gallo <aamsgallo@gmail.com>
Date: Wed, 9 Feb 2022 19:35:00 +0100
Subject: [PATCH 17/22] Add correct conjugate templated function

---
 atrip.org | 43 +++++++++++++++++++++----------------------
 1 file changed, 21 insertions(+), 22 deletions(-)

diff --git a/atrip.org b/atrip.org
index 8e4ec91..32b0b89 100644
--- a/atrip.org
+++ b/atrip.org
@@ -21,6 +21,9 @@ The algorithm uses two main data types, the =Slice= and the
 
 namespace atrip {
 
+template <typename FF> FF maybeConjugate(const FF a) { return a; }
+template <> Complex maybeConjugate(const Complex a) { return std::conj(a); }
+
 namespace traits {
   template <typename FF> bool isComplex() { return false; };
   template <> bool isComplex<Complex>() { return true; };
@@ -1521,12 +1524,12 @@ namespace atrip {
                   , X(Zijk_[j + No*k + No*No*i])
                   , Y(Zijk_[k + No*i + No*No*j])
                   , Z(Zijk_[k + No*j + No*No*i])
-                  , A(std::conj(Tijk_[i + No*j + No*No*k]))
-                  , B(std::conj(Tijk_[i + No*k + No*No*j]))
-                  , C(std::conj(Tijk_[j + No*i + No*No*k]))
-                  , D(std::conj(Tijk_[j + No*k + No*No*i]))
-                  , E(std::conj(Tijk_[k + No*i + No*No*j]))
-                  , F(std::conj(Tijk_[k + No*j + No*No*i]))
+                  , A(maybeConjugate<F>(Tijk_[i + No*j + No*No*k]))
+                  , B(maybeConjugate<F>(Tijk_[i + No*k + No*No*j]))
+                  , C(maybeConjugate<F>(Tijk_[j + No*i + No*No*k]))
+                  , D(maybeConjugate<F>(Tijk_[j + No*k + No*No*i]))
+                  , E(maybeConjugate<F>(Tijk_[k + No*i + No*No*j]))
+                  , F(maybeConjugate<F>(Tijk_[k + No*j + No*No*i]))
                   , value
                     = 3.0 * ( A * U
                               + B * V
@@ -1583,9 +1586,9 @@ namespace atrip {
                 , U(Zijk_[i + No*j + No*No*k])
                 , V(Zijk_[j + No*k + No*No*i])
                 , W(Zijk_[k + No*i + No*No*j])
-                , A(std::conj(Tijk_[i + No*j + No*No*k]))
-                , B(std::conj(Tijk_[j + No*k + No*No*i]))
-                , C(std::conj(Tijk_[k + No*i + No*No*j]))
+                , A(maybeConjugate<F>(Tijk_[i + No*j + No*No*k]))
+                , B(maybeConjugate<F>(Tijk_[j + No*k + No*No*i]))
+                , C(maybeConjugate<F>(Tijk_[k + No*i + No*No*j]))
                 , value
                   = F(3.0) * ( A * U
                              + B * V
@@ -1701,14 +1704,9 @@ namespace atrip {
                    , _t_buffer.data()        \
                    , (int const*)&NoNo       \
                    );
-  #define MAYBE_CONJ(_conj, _buffer)                          \
-    if (traits::isComplex<F>()) {                             \
-      for (size_t __i = 0; __i < NoNoNo; ++__i)               \
-        _conj[__i] = std::conj(_buffer[__i]);                 \
-    } else {                                                  \
-      for (size_t __i = 0; __i < NoNoNo; ++__i)               \
-        _conj[__i] = _buffer[__i];                            \
-    }
+  #define MAYBE_CONJ(_conj, _buffer)                 \
+    for (size_t __i = 0; __i < NoNoNo; ++__i)        \
+      _conj[__i] = maybeConjugate<F>(_buffer[__i]);  \
 
     const size_t NoNoNo = No*NoNo;
     std::vector<F> _t_buffer;
@@ -2259,11 +2257,12 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
 
   const double doublesFlops
     = double(No)
-    ,* double(No)
-    ,* double(No)
-    ,* (double(No) + double(Nv))
-    ,* 2
-    ,* 6
+    * double(No)
+    * double(No)
+    * (double(No) + double(Nv))
+    * 1
+    * (traits::isComplex<F>() ? 2.0 : 1.0)
+    * 6
     / 1e9
     ;
 

From 66f2de1083a26bec8cf72b23b05d0e1782fcaafa Mon Sep 17 00:00:00 2001
From: Alejandro Gallo <aamsgallo@gmail.com>
Date: Mon, 14 Feb 2022 11:26:35 +0100
Subject: [PATCH 18/22] Improve MPI handling for enums

---
 atrip.org | 47 ++++++++++++++++++++++++++++++++---------------
 1 file changed, 32 insertions(+), 15 deletions(-)

diff --git a/atrip.org b/atrip.org
index 32b0b89..734972a 100644
--- a/atrip.org
+++ b/atrip.org
@@ -156,20 +156,23 @@ As an example, for the doubles amplitudes \( T^{ab}_{ij} \), one need two kinds
         const std::vector<int> lengths(n, 1);
         const MPI_Datatype types[n] = {usizeDt(), usizeDt()};
 
+        static_assert(sizeof(Slice<F>::Location) == 2 * sizeof(size_t),
+                      "The Location packing is wrong in your compiler");
+
         // measure the displacements in the struct
         size_t j = 0;
-        MPI_Aint displacements[n];
+        MPI_Aint base_address, displacements[n];
+        MPI_Get_address(&measure,        &base_address);
         MPI_Get_address(&measure.rank,   &displacements[j++]);
         MPI_Get_address(&measure.source, &displacements[j++]);
-        for (size_t i = 1; i < n; i++) displacements[i] -= displacements[0];
-        displacements[0] = 0;
+        for (size_t i = 0; i < n; i++)
+          displacements[i] = MPI_Aint_diff(displacements[i], base_address);
 
         MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt);
         MPI_Type_commit(&dt);
         return dt;
       }
 
-      static MPI_Datatype enumDt() { return MPI_INT; }
       static MPI_Datatype usizeDt() { return MPI_UINT64_T; }
 
       static MPI_Datatype sliceInfo () {
@@ -179,22 +182,31 @@ As an example, for the doubles amplitudes \( T^{ab}_{ij} \), one need two kinds
         const std::vector<int> lengths(n, 1);
         const MPI_Datatype types[n]
           = { vector(2, usizeDt())
-            , enumDt()
-            , enumDt()
+            /*, MPI_UINT64_T*/
+            , vector(sizeof(enum Type), MPI_CHAR)
+            /*, MPI_UINT64_T*/
+            , vector(sizeof(enum State), MPI_CHAR)
+            /*, vector(sizeof(Location), MPI_CHAR)*/
             , sliceLocation()
-            , enumDt()
+            , vector(sizeof(enum Type), MPI_CHAR)
+            /*, MPI_UINT64_T*/
             };
 
+        static_assert(sizeof(enum Type) == 4, "Enum type not 4 bytes long");
+        static_assert(sizeof(enum State) == 4, "Enum State not 4 bytes long");
+        static_assert(sizeof(enum Name) == 4, "Enum Name not 4 bytes long");
+
         // create the displacements from the info measurement struct
         size_t j = 0;
-        MPI_Aint displacements[n];
-        MPI_Get_address(measure.tuple.data(), &displacements[j++]);
+        MPI_Aint base_address, displacements[n];
+        MPI_Get_address(&measure,             &base_address);
+        MPI_Get_address(&measure.tuple[0],    &displacements[j++]);
         MPI_Get_address(&measure.type,        &displacements[j++]);
         MPI_Get_address(&measure.state,       &displacements[j++]);
         MPI_Get_address(&measure.from,        &displacements[j++]);
         MPI_Get_address(&measure.recycling,   &displacements[j++]);
-        for (size_t i = 1; i < n; i++) displacements[i] -= displacements[0];
-        displacements[0] = 0;
+        for (size_t i = 0; i < n; i++)
+          displacements[i] = MPI_Aint_diff(displacements[i], base_address);
 
         MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt);
         MPI_Type_commit(&dt);
@@ -207,13 +219,15 @@ As an example, for the doubles amplitudes \( T^{ab}_{ij} \), one need two kinds
         LocalDatabaseElement measure;
         const std::vector<int> lengths(n, 1);
         const MPI_Datatype types[n]
-          = { enumDt()
+          = { vector(sizeof(enum Name), MPI_CHAR)
+          /*= { MPI_UINT64_T*/
             , sliceInfo()
             };
 
         // measure the displacements in the struct
         size_t j = 0;
-        MPI_Aint displacements[n];
+        MPI_Aint base_address, displacements[n];
+        MPI_Get_address(&measure,      &base_address);
         MPI_Get_address(&measure.name, &displacements[j++]);
         MPI_Get_address(&measure.info, &displacements[j++]);
         for (size_t i = 1; i < n; i++) displacements[i] -= displacements[0];
@@ -221,6 +235,9 @@ As an example, for the doubles amplitudes \( T^{ab}_{ij} \), one need two kinds
 
         MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt);
         MPI_Type_commit(&dt);
+        /*return vector( 4 + 4 + 48, MPI_CHAR);*/
+        // TODO
+        return vector(sizeof(LocalDatabaseElement), MPI_CHAR);
         return dt;
       }
 
@@ -2260,9 +2277,9 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
     * double(No)
     * double(No)
     * (double(No) + double(Nv))
-    * 1
+    * 2.0
     * (traits::isComplex<F>() ? 2.0 : 1.0)
-    * 6
+    * 6.0
     / 1e9
     ;
 

From 728c27074532df00f5a70fb421a3a7fd40dbd67e Mon Sep 17 00:00:00 2001
From: Alejandro Gallo <aamsgallo@gmail.com>
Date: Mon, 14 Feb 2022 11:36:58 +0100
Subject: [PATCH 19/22] Add the pertinents todos

---
 atrip.org | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/atrip.org b/atrip.org
index 734972a..20346d6 100644
--- a/atrip.org
+++ b/atrip.org
@@ -192,9 +192,9 @@ As an example, for the doubles amplitudes \( T^{ab}_{ij} \), one need two kinds
             /*, MPI_UINT64_T*/
             };
 
-        static_assert(sizeof(enum Type) == 4, "Enum type not 4 bytes long");
+        static_assert(sizeof(enum Type)  == 4, "Enum type not 4 bytes long");
         static_assert(sizeof(enum State) == 4, "Enum State not 4 bytes long");
-        static_assert(sizeof(enum Name) == 4, "Enum Name not 4 bytes long");
+        static_assert(sizeof(enum Name)  == 4, "Enum Name not 4 bytes long");
 
         // create the displacements from the info measurement struct
         size_t j = 0;
@@ -230,14 +230,16 @@ As an example, for the doubles amplitudes \( T^{ab}_{ij} \), one need two kinds
         MPI_Get_address(&measure,      &base_address);
         MPI_Get_address(&measure.name, &displacements[j++]);
         MPI_Get_address(&measure.info, &displacements[j++]);
-        for (size_t i = 1; i < n; i++) displacements[i] -= displacements[0];
-        displacements[0] = 0;
+        for (size_t i = 0; i < n; i++)
+          displacements[i] = MPI_Aint_diff(displacements[i], base_address);
+
+        static_assert( sizeof(LocalDatabaseElement) == sizeof(measure)
+                     , "Measure has bad size");
 
         MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt);
         MPI_Type_commit(&dt);
-        /*return vector( 4 + 4 + 48, MPI_CHAR);*/
-        // TODO
         return vector(sizeof(LocalDatabaseElement), MPI_CHAR);
+        // TODO: write tests in order to know if this works
         return dt;
       }
 

From 3dc38a43b5be004b714ef723172c53b54d2f457a Mon Sep 17 00:00:00 2001
From: Alejandro Gallo <aamsgallo@gmail.com>
Date: Fri, 18 Feb 2022 12:44:01 +0100
Subject: [PATCH 20/22] Merge group-and-sort with complex

---
 atrip.org | 2211 ++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 1585 insertions(+), 626 deletions(-)

diff --git a/atrip.org b/atrip.org
index 20346d6..c6ea744 100644
--- a/atrip.org
+++ b/atrip.org
@@ -8,6 +8,9 @@ The algorithm uses two main data types, the =Slice= and the
 
 ** The slice
 
+The following section introduces the idea of a slice.
+
+*** Prolog                                                         :noexport:
 #+begin_src c++ :tangle (atrip-slice-h)
 #pragma once
 #include <iostream>
@@ -39,6 +42,7 @@ template <typename F=double>
 struct Slice {
 
 #+end_src
+*** Introduction
 
 A slice is the concept of a subset of values of a given tensor.
 As an example, for the doubles amplitudes \( T^{ab}_{ij} \), one need two kinds of objects:
@@ -48,13 +52,63 @@ As an example, for the doubles amplitudes \( T^{ab}_{ij} \), one need two kinds
 - the object \( \mathsf{T}(a,b)_{ij} \) which for every pair of \( a, b \)
   corresponds the \( N_\mathrm{o}^2 \)-sized tensor \( T^{ab}_{ij} \).
 
+*** Location
+
+Every slice set, for instance,
+\( S_k = \left\{
+    a \mapsto \mathsf{T}(a)^{b}_{ij}
+    \mid
+    a \in A_k
+\right\} \)
+where \( A_k \) is some subset of
+\( \mathsf{N}_\mathrm{v} \),
+gets stored in some rank \( k \).
+In general however, the number of elements in \( A_k \) can be bigger
+than the number of processes \( n_p \). Therefore in order to uniquely
+identify a given slice in \( S_k \) we need two identifiers,
+the rank \( k \), which tells us in which core's memory the slice is
+allocated, and an additional tag which we will call =source=.
+
+The datatype that simply models this state of affairs
+is therefore a simple structure:
+
+#+begin_src c++ :tangle (atrip-slice-h)
+  struct Location { size_t rank; size_t source; };
+#+end_src
+
+*** Type
+
+Due to the permutation operators in the equations
+it is noticeable that for every one dimensional
+slice and triple \( (a,b,c) \)
+\begin{equation*}
+a \mapsto \mathsf{t}(a)
+\end{equation*}
+one needs at the same time
+\( \mathsf{t}(a) \),
+\( \mathsf{t}(b) \) and
+\( \mathsf{t}(c) \).
+For two dimensional slices, i.e., slices of the form
+\begin{equation*}
+(a,b) \mapsto \mathsf{t}(a,b)
+\end{equation*}
+one needs in the equations the slices
+\( \mathsf{t}(a,b) \),
+\( \mathsf{t}(b,c) \) and
+\( \mathsf{t}(a,c) \).
+In addition, in the case of diagrams where
+the integral \( V^{ab}_{ci} \) appears,
+we additionally need the permuted slices
+from before, i.e.
+\( \mathsf{t}(b,a) \),
+\( \mathsf{t}(c,b) \) and
+\( \mathsf{t}(c,a) \).
+
+This means, every slice has associated with it
+a type which denotes which permutation it is.
 
 
 #+begin_src c++ :tangle (atrip-slice-h)
-  // ASSOCIATED TYPES %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-
-  struct Location { size_t rank; size_t source; };
-
   enum Type
     { A = 10
     , B
@@ -70,53 +124,102 @@ As an example, for the doubles amplitudes \( T^{ab}_{ij} \), one need two kinds
     // The non-typed slice
     , Blank = 404
     };
+#+end_src
 
+*** State
+
+Every slice can be in different states and every state
+denotes which function the slice is going to provide
+and which relations they have between themselves.
+
+- Fetch ::
+  A slice is in state =Fetch= when it
+  has a valid data pointer that **must** be written to.
+  A =Fetch= slice should not live very long, this means
+  that after the database send and receive phase,
+  =Fetch= slices should be changed into =Dispatched=
+  in order to start the process of writing to the
+  data pointer from some other rank.
+- Dispatched ::
+  A =Dispatched= slice indicates that at some point
+  send and receive MPI calls have been dispatched
+  in order to get the data.
+  However, the calls have only just been dispatched and there
+  is no guarantee that the data has arrived; for that,
+  the slice must be unwrapped.
+- Ready ::
+  =Ready= means that the data pointer can be read from
+  directly.
+- SelfSufficient ::
+  A slice is =SelfSufficient= when its contents are located
+  in the same rank where it lives, so that it does not have to
+  fetch from any other rank.
+  This is important in order to handle the data pointers correctly
+  and in order to save calls to MPI receive and send functions.
+- Recycled ::
+  =Recycled= means that this slice gets its data pointer from another
+  slice, so it should not be written to
+- Acceptor ::
+  =Acceptor= means that the slice can accept a new slice, it is
+  the counterpart of the =Blank= type, but for states
+
+Again the implementation is a simple enum type.
+
+#+begin_src c++ :tangle (atrip-slice-h)
   enum State {
-    // Fetch represents the state where a slice is to be fetched
-    // and has a valid data pointer that can be written to
     Fetch = 0,
-    // Dispatches represents the state that an MPI call has been
-    // dispatched in order to get the data, but the data has not been
-    // yet unwrapped, the data might be there or we might have to wait.
     Dispatched = 2,
-    // Ready means that the data pointer can be read from
     Ready = 1,
-    // Self sufficient is a slice when its contents are located
-    // in the same rank that it lives, so that it does not have to
-    // fetch from no one else.
     SelfSufficient = 911,
-    // Recycled means that this slice gets its data pointer from another
-    // slice, so it should not be written to
     Recycled = 123,
-    // Acceptor means that the Slice can accept a new Slice, it is
-    // the counterpart of the Blank type, but for states
     Acceptor = 405
   };
+#+end_src
 
-  struct Info {
-    // which part of a,b,c the slice holds
-    PartialTuple tuple;
-    // The type of slice for the user to retrieve the correct one
-    Type type;
-    // What is the state of the slice
-    State state;
-    // Where the slice is to be retrieved
-    // NOTE: this can actually be computed from tuple
-    Location from;
-    // If the data are actually to be found in this other slice
-    Type recycling;
+*** The Info structure
 
-    Info() : tuple{0,0}
-           , type{Blank}
-           , state{Acceptor}
-           , from{0,0}
-           , recycling{Blank}
-           {}
-  };
+Every slice has an information structure associated with it
+that keeps track of the **variable** type, state and so on.
 
-  using Ty_x_Tu = std::pair< Type, PartialTuple >;
+#+begin_src c++ :tangle (atrip-slice-h)
+struct Info {
+  // which part of a,b,c the slice holds
+  PartialTuple tuple;
+  // The type of slice for the user to retrieve the correct one
+  Type type;
+  // What is the state of the slice
+  State state;
+  // Where the slice is to be retrieved
+  Location from;
+  // If the data are actually to be found in this other slice
+  Type recycling;
 
-  // Names of the integrals that are considered in CCSD(T)
+  Info() : tuple{0,0}
+          , type{Blank}
+          , state{Acceptor}
+          , from{0,0}
+          , recycling{Blank}
+          {}
+};
+
+using Ty_x_Tu = std::pair< Type, PartialTuple >;
+#+end_src
+
+*** Name
+
+CCSD(T) needs in this algorithm 5 types of tensor slices,
+namely
+\( V^{ij}_{ka} \), \( V^{ab}_{ci} \),
+\( V^{ab}_{ij} \)
+and two times \( T^{ab}_{ij} \).
+The reason why we need two times the doubles
+amplitudes is because in the doubles contribution
+to the energy, the \( T \) amplitudes will be sliced
+through one parameter for the particle contribution
+and through two parameters for the hole contribution.
+
+
+#+begin_src c++ :tangle (atrip-slice-h)
   enum Name
     { TA    = 100
     , VIJKA = 101
@@ -124,276 +227,369 @@ As an example, for the doubles amplitudes \( T^{ab}_{ij} \), one need two kinds
     , TABIJ = 201
     , VABIJ = 202
     };
+#+end_src
 
-  // DATABASE ==========================================================={{{1
+*** Database
+
+The database is a simple representation of the slices of a slice union.
+Every element of the database is given by the name of the tensor it
+represents and the internal information structure.
+
+#+begin_src c++ :tangle (atrip-slice-h)
   struct LocalDatabaseElement {
     Slice<F>::Name name;
     Slice<F>::Info info;
   };
+#+end_src
+
+A local database (of a given rank) and the global database are thus simply
+vectors of these elements.
+
+#+begin_src c++ :tangle (atrip-slice-h)
   using LocalDatabase = std::vector<LocalDatabaseElement>;
   using Database = LocalDatabase;
+#+end_src
 
+*** MPI Types
+#+begin_src c++ :tangle (atrip-slice-h)
+struct mpi {
 
-    // STATIC METHODS ===========================================================
-    //
-    // They are useful to organize the structure of slices
-
-    struct mpi {
-
-      static MPI_Datatype vector(size_t n, MPI_Datatype const& DT) {
-        MPI_Datatype dt;
-        MPI_Type_vector(n, 1, 1, DT, &dt);
-        MPI_Type_commit(&dt);
-        return dt;
-      }
-
-      static MPI_Datatype sliceLocation () {
-        constexpr int n = 2;
-        // create a sliceLocation to measure in the current architecture
-        // the packing of the struct
-        Slice<F>::Location measure;
-        MPI_Datatype dt;
-        const std::vector<int> lengths(n, 1);
-        const MPI_Datatype types[n] = {usizeDt(), usizeDt()};
-
-        static_assert(sizeof(Slice<F>::Location) == 2 * sizeof(size_t),
-                      "The Location packing is wrong in your compiler");
-
-        // measure the displacements in the struct
-        size_t j = 0;
-        MPI_Aint base_address, displacements[n];
-        MPI_Get_address(&measure,        &base_address);
-        MPI_Get_address(&measure.rank,   &displacements[j++]);
-        MPI_Get_address(&measure.source, &displacements[j++]);
-        for (size_t i = 0; i < n; i++)
-          displacements[i] = MPI_Aint_diff(displacements[i], base_address);
-
-        MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt);
-        MPI_Type_commit(&dt);
-        return dt;
-      }
-
-      static MPI_Datatype usizeDt() { return MPI_UINT64_T; }
-
-      static MPI_Datatype sliceInfo () {
-        constexpr int n = 5;
-        MPI_Datatype dt;
-        Slice<F>::Info measure;
-        const std::vector<int> lengths(n, 1);
-        const MPI_Datatype types[n]
-          = { vector(2, usizeDt())
-            /*, MPI_UINT64_T*/
-            , vector(sizeof(enum Type), MPI_CHAR)
-            /*, MPI_UINT64_T*/
-            , vector(sizeof(enum State), MPI_CHAR)
-            /*, vector(sizeof(Location), MPI_CHAR)*/
-            , sliceLocation()
-            , vector(sizeof(enum Type), MPI_CHAR)
-            /*, MPI_UINT64_T*/
-            };
-
-        static_assert(sizeof(enum Type)  == 4, "Enum type not 4 bytes long");
-        static_assert(sizeof(enum State) == 4, "Enum State not 4 bytes long");
-        static_assert(sizeof(enum Name)  == 4, "Enum Name not 4 bytes long");
-
-        // create the displacements from the info measurement struct
-        size_t j = 0;
-        MPI_Aint base_address, displacements[n];
-        MPI_Get_address(&measure,             &base_address);
-        MPI_Get_address(&measure.tuple[0],    &displacements[j++]);
-        MPI_Get_address(&measure.type,        &displacements[j++]);
-        MPI_Get_address(&measure.state,       &displacements[j++]);
-        MPI_Get_address(&measure.from,        &displacements[j++]);
-        MPI_Get_address(&measure.recycling,   &displacements[j++]);
-        for (size_t i = 0; i < n; i++)
-          displacements[i] = MPI_Aint_diff(displacements[i], base_address);
-
-        MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt);
-        MPI_Type_commit(&dt);
-        return dt;
-      }
-
-      static MPI_Datatype localDatabaseElement () {
-        constexpr int n = 2;
-        MPI_Datatype dt;
-        LocalDatabaseElement measure;
-        const std::vector<int> lengths(n, 1);
-        const MPI_Datatype types[n]
-          = { vector(sizeof(enum Name), MPI_CHAR)
-          /*= { MPI_UINT64_T*/
-            , sliceInfo()
-            };
-
-        // measure the displacements in the struct
-        size_t j = 0;
-        MPI_Aint base_address, displacements[n];
-        MPI_Get_address(&measure,      &base_address);
-        MPI_Get_address(&measure.name, &displacements[j++]);
-        MPI_Get_address(&measure.info, &displacements[j++]);
-        for (size_t i = 0; i < n; i++)
-          displacements[i] = MPI_Aint_diff(displacements[i], base_address);
-
-        static_assert( sizeof(LocalDatabaseElement) == sizeof(measure)
-                     , "Measure has bad size");
-
-        MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt);
-        MPI_Type_commit(&dt);
-        return vector(sizeof(LocalDatabaseElement), MPI_CHAR);
-        // TODO: write tests in order to know if this works
-        return dt;
-      }
-
-    };
-
-  static
-  PartialTuple subtupleBySlice(ABCTuple abc, Type sliceType) {
-    switch (sliceType) {
-      case AB: return {abc[0], abc[1]};
-      case BC: return {abc[1], abc[2]};
-      case AC: return {abc[0], abc[2]};
-      case CB: return {abc[2], abc[1]};
-      case BA: return {abc[1], abc[0]};
-      case CA: return {abc[2], abc[0]};
-      case  A: return {abc[0], 0};
-      case  B: return {abc[1], 0};
-      case  C: return {abc[2], 0};
-      default: throw "Switch statement not exhaustive!";
-    }
+  static MPI_Datatype vector(size_t n, MPI_Datatype const& DT) {
+    MPI_Datatype dt;
+    MPI_Type_vector(n, 1, 1, DT, &dt);
+    MPI_Type_commit(&dt);
+    return dt;
   }
 
+  static MPI_Datatype sliceLocation () {
+    constexpr int n = 2;
+    // create a sliceLocation to measure in the current architecture
+    // the packing of the struct
+    Slice<F>::Location measure;
+    MPI_Datatype dt;
+    const std::vector<int> lengths(n, 1);
+    const MPI_Datatype types[n] = {usizeDt(), usizeDt()};
 
-    /**
-     ,* It is important here to return a reference to a Slice
-     ,* not to accidentally copy the associated buffer of the slice.
-     ,*/
-    static Slice<F>& findOneByType(std::vector<Slice<F>> &slices, Slice<F>::Type type) {
-        const auto sliceIt
-          = std::find_if(slices.begin(), slices.end(),
-                         [&type](Slice<F> const& s) {
-                           return type == s.info.type;
-                         });
-        WITH_CRAZY_DEBUG
-        WITH_RANK
-          << "\t__ looking for " << type << "\n";
-        if (sliceIt == slices.end())
-          throw std::domain_error("Slice by type not found!");
-        return *sliceIt;
-    }
+    static_assert(sizeof(Slice<F>::Location) == 2 * sizeof(size_t),
+                  "The Location packing is wrong in your compiler");
 
-    /*
-     ,* Check if an info has
-     ,*
-     ,*/
-    static std::vector<Slice<F>*> hasRecycledReferencingToIt
-      ( std::vector<Slice<F>> &slices
-      , Info const& info
-      ) {
-      std::vector<Slice<F>*> result;
+    // measure the displacements in the struct
+    size_t j = 0;
+    MPI_Aint base_address, displacements[n];
+    MPI_Get_address(&measure,        &base_address);
+    MPI_Get_address(&measure.rank,   &displacements[j++]);
+    MPI_Get_address(&measure.source, &displacements[j++]);
+    for (size_t i = 0; i < n; i++)
+      displacements[i] = MPI_Aint_diff(displacements[i], base_address);
 
-      for (auto& s: slices)
-        if (  s.info.recycling == info.type
-           && s.info.tuple == info.tuple
-           && s.info.state == Recycled
-           ) result.push_back(&s);
+    MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt);
+    MPI_Type_commit(&dt);
+    return dt;
+  }
 
-      return result;
-    }
+  static MPI_Datatype usizeDt() { return MPI_UINT64_T; }
 
-    static Slice<F>&
-    findRecycledSource (std::vector<Slice<F>> &slices, Slice<F>::Info info) {
-      const auto sliceIt
-        = std::find_if(slices.begin(), slices.end(),
-                       [&info](Slice<F> const& s) {
-                         return info.recycling == s.info.type
-                             && info.tuple == s.info.tuple
-                             && State::Recycled != s.info.state
-                             ;
-                       });
+  static MPI_Datatype sliceInfo () {
+    constexpr int n = 5;
+    MPI_Datatype dt;
+    Slice<F>::Info measure;
+    const std::vector<int> lengths(n, 1);
+    const MPI_Datatype types[n]
+      = { vector(2, usizeDt())
+        , vector(sizeof(enum Type), MPI_CHAR)
+        , vector(sizeof(enum State), MPI_CHAR)
+        , sliceLocation()
+        , vector(sizeof(enum Type), MPI_CHAR)
+        // TODO: Why this does not work on intel mpi?
+        /*, MPI_UINT64_T*/
+        };
 
-      WITH_CRAZY_DEBUG
-      WITH_RANK << "__slice__:find: recycling source of "
-                << pretty_print(info) << "\n";
-      if (sliceIt == slices.end())
-        throw std::domain_error( "Slice not found: "
-                               + pretty_print(info)
-                               + " rank: "
-                               + pretty_print(Atrip::rank)
-                               );
-      WITH_RANK << "__slice__:find: " << pretty_print(sliceIt->info) << "\n";
-      return *sliceIt;
-    }
+    static_assert(sizeof(enum Type)  == 4, "Enum type not 4 bytes long");
+    static_assert(sizeof(enum State) == 4, "Enum State not 4 bytes long");
+    static_assert(sizeof(enum Name)  == 4, "Enum Name not 4 bytes long");
 
-    static Slice<F>& findByTypeAbc
-      ( std::vector<Slice<F>> &slices
-      , Slice<F>::Type type
-      , ABCTuple const& abc
-      ) {
-        const auto tuple = Slice<F>::subtupleBySlice(abc, type);
-        const auto sliceIt
-          = std::find_if(slices.begin(), slices.end(),
-                         [&type, &tuple](Slice<F> const& s) {
-                           return type == s.info.type
-                               && tuple == s.info.tuple
-                               ;
-                         });
-        WITH_CRAZY_DEBUG
-        WITH_RANK << "__slice__:find:" << type << " and tuple "
-                  << pretty_print(tuple)
-                  << "\n";
-        if (sliceIt == slices.end())
-          throw std::domain_error( "Slice not found: "
-                                 + pretty_print(tuple)
-                                 + ", "
-                                 + pretty_print(type)
-                                 + " rank: "
-                                 + pretty_print(Atrip::rank)
-                                 );
-        return *sliceIt;
-    }
+    // create the displacements from the info measurement struct
+    size_t j = 0;
+    MPI_Aint base_address, displacements[n];
+    MPI_Get_address(&measure,             &base_address);
+    MPI_Get_address(&measure.tuple[0],    &displacements[j++]);
+    MPI_Get_address(&measure.type,        &displacements[j++]);
+    MPI_Get_address(&measure.state,       &displacements[j++]);
+    MPI_Get_address(&measure.from,        &displacements[j++]);
+    MPI_Get_address(&measure.recycling,   &displacements[j++]);
+    for (size_t i = 0; i < n; i++)
+      displacements[i] = MPI_Aint_diff(displacements[i], base_address);
 
-    static Slice<F>& findByInfo(std::vector<Slice<F>> &slices,
-                             Slice<F>::Info const& info) {
-        const auto sliceIt
-          = std::find_if(slices.begin(), slices.end(),
-                         [&info](Slice<F> const& s) {
-                           // TODO: maybe implement comparison in Info struct
-                           return info.type == s.info.type
-                               && info.state == s.info.state
-                               && info.tuple == s.info.tuple
-                               && info.from.rank == s.info.from.rank
-                               && info.from.source == s.info.from.source
-                                ;
-                         });
-        WITH_CRAZY_DEBUG
-        WITH_RANK << "__slice__:find:looking for " << pretty_print(info) << "\n";
-        if (sliceIt == slices.end())
-          throw std::domain_error( "Slice by info not found: "
-                                 + pretty_print(info));
-        return *sliceIt;
-    }
+    MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt);
+    MPI_Type_commit(&dt);
+    return dt;
+  }
 
-    // SLICE DEFINITION  =================================================={{{1
+  static MPI_Datatype localDatabaseElement () {
+    constexpr int n = 2;
+    MPI_Datatype dt;
+    LocalDatabaseElement measure;
+    const std::vector<int> lengths(n, 1);
+    const MPI_Datatype types[n]
+      = { vector(sizeof(enum Name), MPI_CHAR)
+        , sliceInfo()
+        };
 
-    // ATTRIBUTES ============================================================
+    // measure the displacements in the struct
+    size_t j = 0;
+    MPI_Aint base_address, displacements[n];
+    MPI_Get_address(&measure,      &base_address);
+    MPI_Get_address(&measure.name, &displacements[j++]);
+    MPI_Get_address(&measure.info, &displacements[j++]);
+    for (size_t i = 0; i < n; i++)
+      displacements[i] = MPI_Aint_diff(displacements[i], base_address);
+
+    static_assert( sizeof(LocalDatabaseElement) == sizeof(measure)
+                 , "Measure has bad size");
+
+    MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt);
+    MPI_Type_commit(&dt);
+    return vector(sizeof(LocalDatabaseElement), MPI_CHAR);
+    // TODO: write tests in order to know if this works
+    return dt;
+  }
+
+};
+#+end_src
+
+*** Static utilities
+
+This section presents some functions which are useful to work with
+slices and are inside the namespace created by the slice struct.
+
+
+The function =subtupleBySlice= gives to every =Slice::Type=
+its meaning in terms of the triples \( (a,b,c) \).
+
+Notice that since in general the relation
+\( a < b < c \) holds (in our implementation), the case
+of one-dimensional parametrizations =A=, =B= and =C= is well
+defined.
+
+The function should only throw if there is an implementation
+error where the =Slice::Type= enum has been expanded and this
+function has not been updated accordingly.
+
+#+begin_src c++ :tangle (atrip-slice-h)
+static
+PartialTuple subtupleBySlice(ABCTuple abc, Type sliceType) {
+  switch (sliceType) {
+    case AB: return {abc[0], abc[1]};
+    case BC: return {abc[1], abc[2]};
+    case AC: return {abc[0], abc[2]};
+    case CB: return {abc[2], abc[1]};
+    case BA: return {abc[1], abc[0]};
+    case CA: return {abc[2], abc[0]};
+    case  A: return {abc[0], 0};
+    case  B: return {abc[1], 0};
+    case  C: return {abc[2], 0};
+    default: throw "Switch statement not exhaustive!";
+  }
+}
+#+end_src
+
+In the context of cleaning up slices during the main loop,
+it is important to check whether a given slice has other slices
+referencing it as recycled slices.
+
+This function therefore returns a vector of pointers to the
+slices that reference the given slice's info. When
+the length of the vector is zero, there are no dangling
+links.
+
+#+begin_src c++ :tangle (atrip-slice-h)
+static std::vector<Slice<F>*> hasRecycledReferencingToIt
+  ( std::vector<Slice<F>> &slices
+  , Info const& info
+  ) {
+  std::vector<Slice<F>*> result;
+
+  for (auto& s: slices)
+    if (  s.info.recycling == info.type
+       && s.info.tuple == info.tuple
+       && s.info.state == Recycled
+       ) result.push_back(&s);
+
+  return result;
+}
+#+end_src
+
+The rest of the coming functions are utilities in order to find in a vector
+of slices a given slice by reference. Mostly they are merely convenience
+wrappers to the standard library function =std::find_if=.
+
+They are named as =find<...>=, where =<...>= represents some condition
+and must always return a reference to the found slice, i.e., =Slice&=.
+=Atrip= relies on these functions to find the sought for slices,
+therefore these functions will throw a =std::domain_error= if the
+given slice could not be found.
+
+#+begin_src c++ :tangle (atrip-slice-h)
+static Slice<F>& findOneByType(std::vector<Slice<F>> &slices, Slice<F>::Type type) {
+    const auto sliceIt
+      = std::find_if(slices.begin(), slices.end(),
+                     [&type](Slice<F> const& s) {
+                       return type == s.info.type;
+                     });
+    WITH_CRAZY_DEBUG
+    WITH_RANK
+      << "\t__ looking for " << type << "\n";
+    if (sliceIt == slices.end())
+      throw std::domain_error("Slice by type not found!");
+    return *sliceIt;
+}
+#+end_src
+
+#+begin_src c++ :tangle (atrip-slice-h)
+static Slice<F>&
+findRecycledSource (std::vector<Slice<F>> &slices, Slice<F>::Info info) {
+  const auto sliceIt
+    = std::find_if(slices.begin(), slices.end(),
+                   [&info](Slice<F> const& s) {
+                     return info.recycling == s.info.type
+                         && info.tuple == s.info.tuple
+                         && State::Recycled != s.info.state
+                         ;
+                   });
+
+  WITH_CRAZY_DEBUG
+  WITH_RANK << "__slice__:find: recycling source of "
+            << pretty_print(info) << "\n";
+  if (sliceIt == slices.end())
+    throw std::domain_error( "Slice not found: "
+                           + pretty_print(info)
+                           + " rank: "
+                           + pretty_print(Atrip::rank)
+                           );
+  WITH_RANK << "__slice__:find: " << pretty_print(sliceIt->info) << "\n";
+  return *sliceIt;
+}
+#+end_src
+
+#+begin_src c++ :tangle (atrip-slice-h)
+static Slice<F>& findByTypeAbc
+  ( std::vector<Slice<F>> &slices
+  , Slice<F>::Type type
+  , ABCTuple const& abc
+  ) {
+    const auto tuple = Slice<F>::subtupleBySlice(abc, type);
+    const auto sliceIt
+      = std::find_if(slices.begin(), slices.end(),
+                     [&type, &tuple](Slice<F> const& s) {
+                       return type == s.info.type
+                           && tuple == s.info.tuple
+                           ;
+                     });
+    WITH_CRAZY_DEBUG
+    WITH_RANK << "__slice__:find:" << type << " and tuple "
+              << pretty_print(tuple)
+              << "\n";
+    if (sliceIt == slices.end())
+      throw std::domain_error( "Slice not found: "
+                             + pretty_print(tuple)
+                             + ", "
+                             + pretty_print(type)
+                             + " rank: "
+                             + pretty_print(Atrip::rank)
+                             );
+    return *sliceIt;
+}
+#+end_src
+
+#+begin_src c++ :tangle (atrip-slice-h)
+static Slice<F>& findByInfo(std::vector<Slice<F>> &slices,
+                         Slice<F>::Info const& info) {
+  const auto sliceIt
+    = std::find_if(slices.begin(), slices.end(),
+                   [&info](Slice<F> const& s) {
+                     // TODO: maybe implement comparison in Info struct
+                     return info.type == s.info.type
+                         && info.state == s.info.state
+                         && info.tuple == s.info.tuple
+                         && info.from.rank == s.info.from.rank
+                         && info.from.source == s.info.from.source
+                          ;
+                   });
+  WITH_CRAZY_DEBUG
+  WITH_RANK << "__slice__:find:looking for " << pretty_print(info) << "\n";
+  if (sliceIt == slices.end())
+    throw std::domain_error( "Slice by info not found: "
+                           + pretty_print(info));
+  return *sliceIt;
+}
+#+end_src
+
+*** Attributes
+
+A slice object does not own data, it is just a container
+or a pointer to data together with additional bookkeeping facilities.
+
+It includes an info structure with the information about the slice,
+=Type=, =State= etc, which will be later communicated to other ranks.
+
+#+begin_src c++ :tangle (atrip-slice-h)
     Info info;
-    F  *data;
-    MPI_Request request;
-    const size_t size;
+#+end_src
 
+A pointer to data is also necessary for the =Slice=, but it does not need
+to be communicated to other ranks. The =Slice= should never allocate
+or deallocate the pointer itself.
+#+begin_src c++ :tangle (atrip-slice-h)
+    F  *data;
+#+end_src
+
+An =MPI_Request= handle is also included so that the slices that are
+to receive data through MPI can know which request they belong to.
+#+begin_src c++ :tangle (atrip-slice-h)
+    MPI_Request request;
+#+end_src
+
+For practical purposes in MPI calls, the number of elements in =data= is also included.
+#+begin_src c++ :tangle (atrip-slice-h)
+    const size_t size;
+#+end_src
+
+*** Member functions
+
+It is important to note that a ready slice should not be recycled from
+any other slice, so that it can have access by itself to the data.
+#+begin_src c++ :tangle (atrip-slice-h)
     void markReady() noexcept {
       info.state = Ready;
       info.recycling = Blank;
     }
+#+end_src
 
-    /*
-     ,* This means that the data is there
-     ,*/
+
+The following function asks whether or not
+the slice has effectively been unwrapped,
+i.e., whether or not the data are accessible and already
+there. This can only happen in two ways: either
+the slice is =Ready= or it is =SelfSufficient=,
+i.e., the data pointed to was pre-distributed to the current node.
+#+begin_src c++ :tangle (atrip-slice-h)
     bool isUnwrapped() const noexcept {
       return info.state == Ready
           || info.state == SelfSufficient
           ;
     }
+#+end_src
 
+The function =isUnwrappable= answers which slices can potentially be
+unwrapped. Unwrapped slices can be unwrapped again idempotently.
+Also =Recycled= slices can be unwrapped, i.e. the slices pointed to by them
+will be unwrapped.
+The only other possibility is that the slice has been dispatched
+in the past and can be unwrapped. The case where the state
+is =Dispatched= is the canonical intuitive case where a real process
+of unwrapping, i.e. waiting for the data to get through the network,
+is done.
+#+begin_src c++ :tangle (atrip-slice-h)
     bool isUnwrappable() const noexcept {
       return isUnwrapped()
           || info.state == Recycled
@@ -425,19 +621,20 @@ As an example, for the doubles amplitudes \( T^{ab}_{ij} \), one need two kinds
            ;
     }
 
+#+end_src
 
-    /*
-     ,* This function answers the question, which slices can be recycled.
-     ,*
-     ,* A slice can only be recycled if it is Fetch or Ready and has
-     ,* a valid datapointer.
-     ,*
-     ,* In particular, SelfSufficient are not recyclable, since it is easier
-     ,* just to create a SelfSufficient slice than deal with data dependencies.
-     ,*
-     ,* Furthermore, a recycled slice is not recyclable, if this is the case
-     ,* then it is either bad design or a bug.
-     ,*/
+The function =isRecyclable= answers the question of which slices can be recycled.
+
+A slice can only be recycled if it is Fetch or Ready and has
+a valid datapointer.
+
+In particular, SelfSufficient are not recyclable, since it is easier
+just to create a SelfSufficient slice than deal with data dependencies.
+
+Furthermore, a recycled slice is not recyclable, if this is the case
+then it is either bad design or a bug.
+
+#+begin_src c++ :tangle (atrip-slice-h)
     inline bool isRecyclable() const noexcept {
       return (  info.state == Dispatched
              || info.state == Ready
@@ -446,21 +643,38 @@ As an example, for the doubles amplitudes \( T^{ab}_{ij} \), one need two kinds
           && hasValidDataPointer()
           ;
     }
+#+end_src
 
-    /*
-     ,* This function describes if a slice has a valid data pointer.
-     ,*
-     ,* This is important to know if the slice has some data to it, also
-     ,* some structural checks are done, so that it should not be Acceptor
-     ,* or Blank, if this is the case then it is a bug.
-     ,*/
+
+The function =hasValidDataPointer= describes if a slice has a valid
+data pointer.
+
+This is important to know if the slice has some data to it, also
+some structural checks are done, so that it should not be =Acceptor=
+or =Blank=, if this is the case then it is a bug.
+
+#+begin_src c++ :tangle (atrip-slice-h)
     inline bool hasValidDataPointer() const noexcept {
       return data       != nullptr
           && info.state != Acceptor
           && info.type  != Blank
           ;
     }
+#+end_src
 
+
+The function
+=unwrapAndMarkReady=
+calls the low-level MPI functions
+in order to wait whenever the state of the slice is correct.
+The main behaviour of the function should
+- return if state is =Ready=, since then there is nothing to be done.
+- throw if the state is not =Dispatched=, only a dispatched slice
+  can be unwrapped through MPI.
+- throw if an MPI error happens.
+
+
+#+begin_src c++ :tangle (atrip-slice-h)
     void unwrapAndMarkReady() {
       if (info.state == Ready) return;
       if (info.state != Dispatched)
@@ -490,7 +704,10 @@ As an example, for the doubles amplitudes \( T^{ab}_{ij} \), one need two kinds
                 << "\n";
 #endif
     }
+#+end_src
 
+*** Epilog                                                         :noexport:
+#+begin_src c++ :tangle (atrip-slice-h)
     Slice(size_t size_)
       : info({})
       , data(nullptr)
@@ -500,7 +717,11 @@ As an example, for the doubles amplitudes \( T^{ab}_{ij} \), one need two kinds
 
   }; // struct Slice
 
+#+end_src
 
+*** Debug                                                          :noexport:
+
+#+begin_src c++ :tangle (atrip-slice-h)
 template <typename F=double>
 std::ostream& operator<<(std::ostream& out, typename Slice<F>::Location const& v) {
   // TODO: remove me
@@ -522,6 +743,9 @@ std::ostream& operator<<(std::ostream& out, typename Slice<F>::Info const& i) {
 #+end_src
 
 ** Utils
+
+This section presents some utilities
+*** Prolog                                                         :noexport:
 #+begin_src c++ :tangle (atrip-utils-h)
 #pragma once
 #include <sstream>
@@ -530,38 +754,64 @@ std::ostream& operator<<(std::ostream& out, typename Slice<F>::Info const& i) {
 #include <chrono>
 
 #include <ctf.hpp>
+#include <atrip/Debug.hpp>
 
 namespace atrip {
+#+end_src
 
+*** Pretty printing
 
+The pretty printing uses the [[https://github.com/sharkdp/dbg-macro][dbg-macro]] package.
+
+#+begin_src c++ :tangle (atrip-utils-h)
   template <typename T>
   std::string pretty_print(T&& value) {
     std::stringstream stream;
-#if ATRIP_DEBUG > 1
+#if ATRIP_DEBUG > 2
     dbg::pretty_print(stream, std::forward<T>(value));
 #endif
     return stream.str();
   }
 
-#define WITH_CHRONO(__chrono, ...) \
-  __chrono.start(); __VA_ARGS__ __chrono.stop();
+#+end_src
 
-  struct Timer {
-    using Clock = std::chrono::high_resolution_clock;
-    using Event = std::chrono::time_point<Clock>;
-    std::chrono::duration<double> duration;
-    Event _start;
-    inline void start() noexcept { _start = Clock::now(); }
-    inline void stop() noexcept { duration += Clock::now() - _start; }
-    inline void clear() noexcept { duration *= 0; }
-    inline double count() const noexcept { return duration.count(); }
-  };
-  using Timings = std::map<std::string, Timer>;
-}
+*** Chrono
+
+The chrono is just a simple wrapper for a high resolution clock
+that can be found in the =std::chrono= namespace of the standard library.
+
+#+begin_src c++ :tangle (atrip-utils-h)
+#define WITH_CHRONO(__chrono_name, ...)         \
+  Atrip::chrono[__chrono_name].start();         \
+  __VA_ARGS__                                   \
+  Atrip::chrono[__chrono_name].stop();
+
+struct Timer { // stopwatch that accumulates the time of all start()/stop() pairs
+  using Clock = std::chrono::high_resolution_clock;
+  using Event = std::chrono::time_point<Clock>;
+  std::chrono::duration<double> duration{}; // seconds; a plain `Timer t;` starts at 0
+  Event _start; // written by start(), read by stop()
+  void start() noexcept { _start = Clock::now(); }
+  void stop() noexcept { duration += Clock::now() - _start; }
+  void clear() noexcept { duration = duration.zero(); } // reset even if NaN/inf
+  double count() const noexcept { return duration.count(); }
+};
+using Timings = std::map<std::string, Timer>;
 
 #+end_src
 
+
+*** Epilog                                                         :noexport:
+#+begin_src c++ :tangle (atrip-utils-h)
+}
+#+end_src
+
 ** The rank mapping
+
+This section introduces the concept of rank mapping,
+which defines how slices will be allocated to every
+rank.
+
 #+begin_src c++ :tangle (atrip-rankmap-h)
 #pragma once
 
@@ -569,24 +819,38 @@ namespace atrip {
 #include <algorithm>
 
 #include <atrip/Slice.hpp>
+#include <atrip/Tuples.hpp>
 
 namespace atrip {
 
   template <typename F=double>
   struct RankMap {
 
+    static bool RANK_ROUND_ROBIN;
     std::vector<size_t> const lengths;
     size_t const np, size;
+    ClusterInfo const clusterInfo;
 
-    RankMap(std::vector<size_t> lens, size_t np_)
+    RankMap(std::vector<size_t> lens, size_t np_, MPI_Comm comm)
       : lengths(lens)
       , np(np_)
       , size(std::accumulate(lengths.begin(), lengths.end(),
                             1UL, std::multiplies<size_t>()))
+      , clusterInfo(getClusterInfo(comm))
     { assert(lengths.size() <= 2); }
 
     size_t find(typename Slice<F>::Location const& p) const noexcept {
-      return p.source * np + p.rank;
+      if (RANK_ROUND_ROBIN) {
+        return p.source * np + p.rank;
+      } else {
+        const size_t
+          rankPosition = p.source * clusterInfo.ranksPerNode
+                       + clusterInfo.rankInfos[p.rank].localRank
+                       ;
+        return rankPosition * clusterInfo.nNodes
+             + clusterInfo.rankInfos[p.rank].nodeId
+             ;
+      }
     }
 
     size_t nSources() const noexcept {
@@ -606,8 +870,9 @@ namespace atrip {
     }
 
     typename Slice<F>::Location
-    find(ABCTuple const& abc, typename Slice<F>::Type sliceType) const noexcept {
+    find(ABCTuple const& abc, typename Slice<F>::Type sliceType) const {
       // tuple = {11, 8} when abc = {11, 8, 9} and sliceType = AB
+      // tuple = {11, 0} when abc = {11, 8, 9} and sliceType = A
       const auto tuple = Slice<F>::subtupleBySlice(abc, sliceType);
 
       const size_t index
@@ -615,9 +880,51 @@ namespace atrip {
         + tuple[1] * (lengths.size() > 1 ? lengths[0] : 0)
         ;
 
+      size_t rank, source;
+
+      if (RANK_ROUND_ROBIN) {
+
+        rank = index % np;
+        source = index / np;
+
+      } else {
+
+        size_t const
+
+          // the node that will be assigned to
+            nodeId = index % clusterInfo.nNodes
+
+          // how many times it has been assigned to the node
+          , s_n = index / clusterInfo.nNodes
+
+          // which local rank in the node should be
+          , localRank = s_n % clusterInfo.ranksPerNode
+
+          // and the local source (how many times we chose this local rank)
+          , localSource = s_n / clusterInfo.ranksPerNode
+          ;
+
+        // find the localRank-th entry in clusterInfo
+        auto const& it =
+          std::find_if(clusterInfo.rankInfos.begin(),
+                       clusterInfo.rankInfos.end(),
+                       [nodeId, localRank](RankInfo const& ri) {
+                         return ri.nodeId == nodeId
+                             && ri.localRank == localRank
+                             ;
+                       });
+        if (it == clusterInfo.rankInfos.end()) {
+          throw "FATAL! Error in node distribution of the slices";
+        }
+
+        rank = (*it).globalRank;
+        source = localSource;
+
+      }
+
       return
-        { index % np
-        , index / np
+        { rank
+        , source
         };
     }
 
@@ -808,8 +1115,14 @@ namespace atrip {
           if (blank.info.state == Slice<F>::SelfSufficient) {
             blank.data = sources[from.source].data();
           } else {
-            if (freePointers.size() == 0)
-              throw std::domain_error("No more free pointers!");
+            if (freePointers.size() == 0) {
+              std::stringstream stream;
+              stream << "No more free pointers "
+                     << "for type " << type
+                     << " and name " << name
+                      ;
+              throw std::domain_error(stream.str());
+            }
             auto dataPointer = freePointers.begin();
             freePointers.erase(dataPointer);
             blank.data = *dataPointer;
@@ -943,7 +1256,8 @@ namespace atrip {
           // at this point, let us blank the slice
           WITH_RANK << "~~~:cl(" << name << ")"
                     << " freeing up slice "
-                    // TODO: make this possible
+                    // TODO: make this possible because of Templates
+                    // TODO: there is a deduction error here
                     // << " info " << slice.info
                     << "\n";
           slice.free();
@@ -963,7 +1277,7 @@ namespace atrip {
               , typename Slice<F>::Name name_
               , size_t nSliceBuffers = 4
               )
-              : rankMap(paramLength, np)
+              : rankMap(paramLength, np, global_world)
               , world(child_world)
               , universe(global_world)
               , sliceLength(sliceLength_)
@@ -982,7 +1296,7 @@ namespace atrip {
 
       slices
         = std::vector<Slice<F>>(2 * sliceTypes.size(), { sources[0].size() });
-      // TODO: think exactly ^------------------- about this number
+      // TODO: think exactly    ^------------------- about this number
 
       // initialize the freePointers with the pointers to the buffers
       std::transform(sliceBuffers.begin(), sliceBuffers.end(),
@@ -1050,10 +1364,11 @@ namespace atrip {
      * \brief Send asynchronously only if the state is Fetch
      */
     void send( size_t otherRank
-             , typename Slice<F>::Info const& info
+             , typename Slice<F>::LocalDatabaseElement const& el
              , size_t tag) const noexcept {
       MPI_Request request;
       bool sendData_p = false;
+      auto const& info = el.info;
 
       if (info.state == Slice<F>::Fetch) sendData_p = true;
       // TODO: remove this because I have SelfSufficient
@@ -1168,8 +1483,11 @@ namespace atrip {
                       [&name](SliceUnion<F> const* s) {
                         return name == s->name;
                       });
-      if (sliceUnionIt == unions.end())
-        throw std::domain_error("SliceUnion not found!");
+      if (sliceUnionIt == unions.end()) {
+        std::stringstream stream;
+        stream << "SliceUnion(" << name << ") not found!";
+        throw std::domain_error(stream.str());
+      }
       return **sliceUnionIt;
   }
 
@@ -1177,6 +1495,12 @@ namespace atrip {
 #+end_src
 
 ** Tuples
+
+This section introduces the types for tuples \( (a,b,c) \)
+as well as their distribution to nodes and cores.
+
+
+*** Prolog                                                         :noexport:
 #+begin_src c++ :tangle (atrip-tuples-h)
 #pragma once
 
@@ -1184,78 +1508,692 @@ namespace atrip {
 #include <array>
 #include <numeric>
 
+// TODO: remove some
+#include <stdio.h>
+#include <math.h>
+#include <algorithm>
+#include <map>
+#include <cassert>
+#include <chrono>
+#include <climits>
+#include <mpi.h>
+
 #include <atrip/Utils.hpp>
 #include <atrip/Debug.hpp>
 
 namespace atrip {
+#+end_src
 
-  using ABCTuple = std::array<size_t, 3>;
-  using PartialTuple = std::array<size_t, 2>;
-  using ABCTuples = std::vector<ABCTuple>;
+*** Tuples types
 
-  ABCTuples getTuplesList(size_t Nv) {
-    const size_t n = Nv * (Nv + 1) * (Nv + 2) / 6 - Nv;
-    ABCTuples result(n);
-    size_t u(0);
+The main tuple types are simple type aliases for finite-size arrays.
+A tuple is thus simply 3 natural numbers \( (a,b,c) \)
+whereas a partial tuple is a two dimensional subset of these three.
 
-    for (size_t a(0); a < Nv; a++)
-    for (size_t b(a); b < Nv; b++)
-    for (size_t c(b); c < Nv; c++){
-      if ( a == b && b == c ) continue;
-      result[u++] = {a, b, c};
-    }
+#+begin_src c++ :tangle (atrip-tuples-h)
+using ABCTuple = std::array<size_t, 3>;      // the three indices (a, b, c)
+using PartialTuple = std::array<size_t, 2>;  // two of the three indices
+using ABCTuples = std::vector<ABCTuple>;     // list of tuples
 
-    return result;
+constexpr ABCTuple FAKE_TUPLE = {0, 0, 0};     // fill value for padding in getTuplesList
+constexpr ABCTuple INVALID_TUPLE = {1, 1, 1};  // fill value when resizing in specialDistribution
+#+end_src
 
+*** Distributing the tuples
+
+In general it is our task to distribute all the tuples
+\( (a,b,c) \) among the ranks. Every distribution should
+make sure to allocate the same amount of tuples to every rank,
+padding the list with =FAKE_TUPLE= elements as necessary.
+
+The interface that we propose for this is simply
+
+#+begin_src c++ :tangle (atrip-tuples-h)
+// Interface for strategies that decide which tuples each rank handles.
+struct TuplesDistribution {
+  virtual ~TuplesDistribution() = default; // safe deletion through a base pointer
+  virtual ABCTuples getTuples(size_t Nv, MPI_Comm universe) = 0;
+  virtual bool tupleIsFake(ABCTuple const& t) { return t == FAKE_TUPLE; }
+};
+#+end_src
+
+*** Node information
+
+- nodeList ::
+    List of hostnames of size \( N_n \)
+- nodeInfos ::
+    List of (hostname, local rank Id)
+    of size \( N_p \), i.e., size of ranks
+    where local rank id goes from 0 to 48.
+
+
+
+=getNodeNames= gets the names of the nodes used,
+i.e., the size of the resulting vector gives the
+number of nodes.
+#+begin_src c++ :tangle (atrip-tuples-h)
+// Collect the processor name reported by every rank of comm.
+// Entry i of the result is the name of rank i, so ranks that share a
+// node repeat the same name.
+std::vector<std::string> getNodeNames(MPI_Comm comm){
+  int rank, np;
+  MPI_Comm_rank(comm, &rank);
+  MPI_Comm_size(comm, &np);
+
+  std::vector<std::string> nodeList(np);
+  char nodeName[MPI_MAX_PROCESSOR_NAME];
+  // Heap buffer instead of a variable-length array: VLAs are not standard
+  // C++ and np * MPI_MAX_PROCESSOR_NAME bytes may not fit on the stack.
+  std::vector<char> nodeNames(size_t(np) * MPI_MAX_PROCESSOR_NAME);
+  std::vector<int> nameLengths(np), off(np); // off is zero-initialized
+  int nameLength;
+  MPI_Get_processor_name(nodeName, &nameLength);
+
+  // every rank learns the name length of all other ranks
+  MPI_Allgather(&nameLength, 1, MPI_INT,
+                nameLengths.data(), 1, MPI_INT,
+                comm);
+
+  // off[i]: start of the name of rank i in the packed buffer
+  for (int i(1); i < np; i++)
+    off[i] = off[i-1] + nameLengths[i-1];
+
+  // gather all names back to back into nodeNames
+  MPI_Allgatherv(nodeName, nameLengths[rank], MPI_BYTE,
+                 nodeNames.data(), nameLengths.data(), off.data(), MPI_BYTE,
+                 comm);
+
+  // cut the packed buffer into one string per rank
+  for (int i(0); i < np; i++) {
+    nodeList[i] = std::string(nodeNames.data() + off[i], nameLengths[i]);
   }
+  return nodeList;
+}
+#+end_src
 
+=getNodeInfos=
+#+begin_src c++ :tangle (atrip-tuples-h)
+struct RankInfo { // per-rank record built by getNodeInfos
+  const std::string name;     // processor name of the rank's node
+  const size_t nodeId;        // position of name among the sorted distinct names
+  const size_t globalRank;    // position of the rank in the list of names
+  const size_t localRank;     // running index among the ranks with the same name
+  const size_t ranksPerNode;  // number of ranks that share this name
+};
 
-  std::pair<size_t, size_t>
-  getABCRange(size_t np, size_t rank, ABCTuples const& tuplesList) {
-
-    std::vector<size_t> n_tuples_per_rank(np, tuplesList.size()/np);
-    const size_t
-        // how many valid tuples should we still verteilen to nodes
-        // since the number of tuples is not divisible by the number of nodes
-        nRoundRobin = tuplesList.size() % np
-        // every node must have the sanme amount of tuples in order for the
-        // other nodes to receive and send somewhere, therefore
-        // some nodes will get extra tuples but that are dummy tuples
-      , nExtraInvalid = (np - nRoundRobin) % np
-      ;
-
-    if (nRoundRobin) for (int i = 0; i < np; i++) n_tuples_per_rank[i]++;
-
-  #if defined(TODO)
-    assert( tuplesList.size()
-            ==
-            ( std::accumulate(n_tuples_per_rank.begin(),
-                              n_tuples_per_rank.end(),
-                              0UL,
-                              std::plus<size_t>())
-            + nExtraInvalid
-            ));
-  #endif
-
-    WITH_RANK << "nRoundRobin = " << nRoundRobin << "\n";
-    WITH_RANK << "nExtraInvalid = " << nExtraInvalid << "\n";
-    WITH_RANK << "ntuples = " << n_tuples_per_rank[rank] << "\n";
-
-    auto const& it = n_tuples_per_rank.begin();
-
-    return
-      { std::accumulate(it, it + rank    , 0)
-      , std::accumulate(it, it + rank + 1, 0)
-      };
+// Return a sorted copy of xs in which every value appears only once.
+template <typename A>
+A unique(A const &xs) {
+  A sorted(xs);
+  std::sort(sorted.begin(), sorted.end());
+  sorted.erase(std::unique(sorted.begin(), sorted.end()), sorted.end());
+  return sorted;
+}
 
+// One RankInfo per entry of nodeNames (entry i describes global rank i); nodeId
+// indexes the sorted distinct names, localRank counts ranks within a node.
+std::vector<RankInfo>
+getNodeInfos(std::vector<std::string> const& nodeNames) {
+  std::vector<RankInfo> result;
+  result.reserve(nodeNames.size());
+  auto const uniqueNames = unique(nodeNames);
+  // uniqueNames is sorted, so a binary search gives the node index
+  auto const index = [&uniqueNames](std::string const& s) {
+    return size_t(std::lower_bound(uniqueNames.begin(), uniqueNames.end(), s)
+                  - uniqueNames.begin());
+  };
+  std::vector<size_t> localRanks(uniqueNames.size(), 0);
+  size_t globalRank = 0;
+  for (auto const& name: nodeNames) {
+    const size_t nodeId = index(name);
+    // std::count returns a signed difference_type: cast explicitly, since
+    // narrowing inside a braced initializer is ill-formed
+    result.push_back({name, nodeId, globalRank++, localRanks[nodeId]++,
+      static_cast<size_t>(std::count(nodeNames.begin(), nodeNames.end(), name))});
   }
+  return result;
+}
+
+struct ClusterInfo { // node layout of a communicator, filled by getClusterInfo
+  const size_t nNodes, np, ranksPerNode; // distinct names, ranks, ranksPerNode of rank 0
+  const std::vector<RankInfo> rankInfos; // one record per rank (see getNodeInfos)
+};
+
+// Describe how the ranks of comm are spread over nodes: number of distinct
+// processor names, number of ranks and the per-rank records of getNodeInfos.
+ClusterInfo
+getClusterInfo(MPI_Comm comm) {
+  std::vector<std::string> const nodeNames = getNodeNames(comm);
+  std::vector<RankInfo> const infos = getNodeInfos(nodeNames);
+  size_t const nNodes = unique(nodeNames).size();
+  size_t const nRanks = nodeNames.size();
+  // ranksPerNode is taken from the record of rank 0
+  size_t const ranksPerNode = infos[0].ranksPerNode;
+  return ClusterInfo {nNodes, nRanks, ranksPerNode, infos};
 
 }
 #+end_src
 
+*** Naive list
+
+The naive implementation of the global tuples list is simply
+three nested for loops creating tuples of the sort
+\( (a,b,c) \) where the following conditions are met at the same time:
+- \( a \leq b \leq c \)
+- \(
+  a \neq b \lor b \neq c
+  \)
+
+This means,
+\( (1, 2, 3)
+ , (1, 1, 3)
+ , (1, 2, 2)
+\) are acceptable tuples whereas \( (2, 1, 1) \) and \( (1, 1, 1) \) are not.
+
+
+#+begin_src c++ :tangle (atrip-tuples-h)
+// Return the chunk of the global tuple list that belongs to rank.
+// The global list enumerates a <= b <= c (skipping a == b == c) and is cut
+// into np chunks of equal length; unused trailing entries stay FAKE_TUPLE.
+ABCTuples getTuplesList(size_t Nv, size_t rank, size_t np) {
+  // total number of tuples for the problem
+  const size_t n = Nv * (Nv + 1) * (Nv + 2) / 6 - Nv;
+  // chunk length, rounded up so that all ranks hold equally many tuples
+  const size_t tuples_per_rank = n / np + size_t(n % np != 0);
+  // half-open window [start, end) of this rank in the global list
+  const size_t start = tuples_per_rank * rank;
+  const size_t end = start + tuples_per_rank;
+
+  LOG(1,"Atrip") << "tuples_per_rank = " << tuples_per_rank << "\n";
+  WITH_RANK << "start, end = " << start << ", " << end << "\n";
+  ABCTuples result(tuples_per_rank, FAKE_TUPLE);
+
+  // walk the global list in order and keep the entries inside the window;
+  // `global` is the position in the global list, `local` the one in result
+  size_t local = 0, global = 0;
+  for (size_t a = 0; a < Nv; ++a) {
+    for (size_t b = a; b < Nv; ++b) {
+      for (size_t c = b; c < Nv; ++c) {
+        if (a == b && b == c) continue;
+        if (global >= start && global < end) result[local++] = {a, b, c};
+        ++global;
+      }
+    }
+  }
+
+  return result;
+}
+#+end_src
+
+and all tuples would simply be
+
+#+begin_src c++ :tangle (atrip-tuples-h)
+// Enumerate every tuple a <= b <= c below Nv except a == b == c,
+// in lexicographic order.
+ABCTuples getAllTuplesList(const size_t Nv) {
+  // number of ordered triples minus the Nv fully diagonal ones
+  const size_t nTuples = Nv * (Nv + 1) * (Nv + 2) / 6 - Nv;
+  ABCTuples tuples(nTuples);
+  auto next = tuples.begin();
+  for (size_t a = 0; a < Nv; ++a)
+    for (size_t b = a; b < Nv; ++b)
+      for (size_t c = b; c < Nv; ++c)
+        if (!(a == b && b == c)) *next++ = {a, b, c};
+  return tuples;
+}
+#+end_src
+
+
+With =getTuplesList= we can easily define a tuple distribution like
+
+#+begin_src c++ :tangle (atrip-tuples-h)
+struct NaiveDistribution : public TuplesDistribution { // chunks of the naive list
+  ABCTuples getTuples(size_t Nv, MPI_Comm universe) override {
+    int nRanks, myRank;
+    MPI_Comm_size(universe, &nRanks);
+    MPI_Comm_rank(universe, &myRank);
+    return getTuplesList(Nv, static_cast<size_t>(myRank), static_cast<size_t>(nRanks));
+  }
+};
+#+end_src
+
+
+*** Group and sort list
+**** Prolog                                                        :noexport:
+#+begin_src c++ :tangle (atrip-tuples-h)
+namespace group_and_sort {
+#+end_src
+
+**** Utils
+
+#+begin_src c++ :tangle (atrip-tuples-h)
+
+// Node on which the slice element `tuple` is found. Elements are dealt
+// round robin over the nodes (NOTE: nodes, not MPI ranks).
+inline size_t isOnNode(size_t tuple, size_t nNodes) {
+  return tuple % nNodes;
+}
+
+
+// Return the nodes holding the three elements of t.
+// The result is sorted and free of duplicates, hence it has between one
+// and three entries.
+std::vector<size_t> getTupleNodes(ABCTuple const& t, size_t nNodes) {
+  std::vector<size_t> nodes;
+  nodes.reserve(t.size());
+  for (size_t const element: t)
+    nodes.push_back(isOnNode(element, nNodes));
+  return unique(nodes);
+}
+
+struct Info { // node layout handed to specialDistribution
+  size_t nNodes; // total number of nodes
+  size_t nodeId; // node whose tuples are selected
+};
+
+#+end_src
+
+**** Distribution
+
+wording: home element = element which is located on the given node
+1. we distribute the tuples such that each tuple has at least one 'home element'
+2. we sort each tuple in a way that the 'home elements' are the fastest indices
+3. we sort the list of tuples on every node
+4. we re-sort every tuple so that for each tuple abc the following holds: a<=b<=c
+
+#+begin_src c++ :tangle (atrip-tuples-h)
+ABCTuples specialDistribution(Info const& info, ABCTuples const& allTuples) {
+  // Builds, out of allTuples, the list of tuples assigned to node info.nodeId.
+  ABCTuples nodeTuples;
+  size_t const nNodes(info.nNodes);
+
+  std::vector<ABCTuples>
+      container1d(nNodes)
+    , container2d(nNodes * nNodes)
+    , container3d(nNodes * nNodes * nNodes)
+    ;
+
+  if (info.nodeId == 0)
+    std::cout << "\tGoing through all "
+              << allTuples.size()
+              << " tuples in "
+              << nNodes
+              << " nodes\n";
+
+  // build container-n-d's
+  for (auto const& t: allTuples) {
+    // on which node(s) are the tuple elements located...
+    // put them into the right container
+    auto const _nodes = getTupleNodes(t, nNodes);
+    // _nodes is sorted and duplicate-free, so it has 1, 2 or 3 entries
+    switch (_nodes.size()) {
+      case 1:
+        container1d[_nodes[0]].push_back(t);
+        break;
+      case 2:
+        container2d[ _nodes[0]
+                   + _nodes[1] * nNodes
+                   ].push_back(t);
+        break;
+      case 3:
+        container3d[ _nodes[0]
+                   + _nodes[1] * nNodes
+                   + _nodes[2] * nNodes * nNodes
+                   ].push_back(t);
+        break;
+    }
+
+  }
+
+  if (info.nodeId == 0)
+    std::cout << "\tBuilding 1-d containers\n";
+  // DISTRIBUTE 1-d containers
+  // every tuple which is only located at one node belongs to this node
+  {
+    auto const& _tuples = container1d[info.nodeId];
+    nodeTuples.resize(_tuples.size(), INVALID_TUPLE);
+    std::copy(_tuples.begin(), _tuples.end(), nodeTuples.begin());
+  }
+
+  if (info.nodeId == 0)
+    std::cout << "\tBuilding 2-d containers\n";
+  // DISTRIBUTE 2-d containers
+  // the tuples which are located at two nodes are split half/half between them
+  for (size_t yx = 0; yx < container2d.size(); yx++) {
+
+    auto const& _tuples = container2d[yx];
+      const
+    size_t idx = yx % nNodes
+         // remember: yx = idy * nNodes + idx
+         , idy = yx / nNodes
+         , n_half = _tuples.size() / 2
+         , size = nodeTuples.size()
+         ;
+    // node idx takes the first half, node idy the remainder
+    size_t nbeg, nend;
+    if (info.nodeId == idx) {
+      nbeg = 0 * n_half;
+      nend = n_half;
+    } else if (info.nodeId == idy) {
+      nbeg = 1 * n_half;
+      nend = _tuples.size();
+    } else {
+      // neither idx nor idy is my node
+      continue;
+    }
+
+    size_t const nextra = nend - nbeg;
+    nodeTuples.resize(size + nextra, INVALID_TUPLE);
+    std::copy(_tuples.begin() + nbeg,
+              _tuples.begin() + nend,
+              nodeTuples.begin() + size);
+
+  }
+
+  if (info.nodeId == 0)
+    std::cout << "\tBuilding 3-d containers\n";
+  // DISTRIBUTE 3-d containers
+  for (size_t zyx = 0; zyx < container3d.size(); zyx++) {
+    auto const& _tuples = container3d[zyx];
+
+      const
+    size_t idx = zyx % nNodes
+         , idy = (zyx / nNodes) % nNodes
+         // remember: zyx = idx + idy * nNodes + idz * nNodes^2
+         , idz = zyx / nNodes / nNodes
+         , n_third = _tuples.size() / 3
+         , size = nodeTuples.size()
+         ;
+    // nodes idx and idy take one third each, node idz the remainder
+    size_t nbeg, nend;
+    if (info.nodeId == idx) {
+      nbeg = 0 * n_third;
+      nend = 1 * n_third;
+    } else if (info.nodeId == idy) {
+      nbeg = 1 * n_third;
+      nend = 2 * n_third;
+    } else if (info.nodeId == idz) {
+      nbeg = 2 * n_third;
+      nend = _tuples.size();
+    } else {
+      // neither idx, idy nor idz is my node
+      continue;
+    }
+
+    size_t const nextra = nend - nbeg;
+    nodeTuples.resize(size + nextra, INVALID_TUPLE);
+    std::copy(_tuples.begin() + nbeg,
+              _tuples.begin() + nend,
+              nodeTuples.begin() + size);
+
+  }
+
+
+  if (info.nodeId == 0) std::cout << "\tswapping tuples...\n";
+  /*
+   *  sort part of group-and-sort algorithm
+   *  every tuple on a given node is sorted in a way that
+   *  the 'home elements' are the fastest index.
+   *  1:yyy 2:yyn(x) 3:yny(x) 4:ynn(x) 5:nyy 6:nyn(x) 7:nny 8:nnn
+   */
+  for (auto &nt: nodeTuples){
+    if ( isOnNode(nt[0], nNodes) == info.nodeId ){ // 1234
+      if ( isOnNode(nt[2], nNodes) != info.nodeId ){ // 24
+        size_t const x(nt[0]);
+        nt[0] = nt[2];         // switch first and last
+        nt[2] = x;
+      }
+      else if ( isOnNode(nt[1], nNodes) != info.nodeId){ // 3
+        size_t const x(nt[0]);
+        nt[0] = nt[1];         // switch first two
+        nt[1] = x;
+      }
+    } else {
+      if ( isOnNode(nt[1], nNodes) == info.nodeId   // 56
+        && isOnNode(nt[2], nNodes) != info.nodeId
+        ) { // 6
+        size_t const x(nt[1]);
+        nt[1] = nt[2];         // switch last two
+        nt[2] = x;
+      }
+    }
+  }
+
+  if (info.nodeId == 0) std::cout << "\tsorting list of tuples...\n";
+  // now we sort the list of tuples (lexicographically, on the permuted entries)
+  std::sort(nodeTuples.begin(), nodeTuples.end());
+
+  if (info.nodeId == 0) std::cout << "\trestoring tuples...\n";
+  // we bring the tuples abc back in the order a<=b<=c
+  for (auto &t: nodeTuples)  std::sort(t.begin(), t.end());
+
+#if ATRIP_DEBUG > 1
+  if (info.nodeId == 0)
+  std::cout << "checking for validity of " << nodeTuples.size() << std::endl;
+  const bool anyInvalid
+    = std::any_of(nodeTuples.begin(),
+                  nodeTuples.end(),
+                  [](ABCTuple const& t) { return t == INVALID_TUPLE; });
+  if (anyInvalid) throw "Some tuple is invalid in group-and-sort algorithm";
+#endif
+
+  if (info.nodeId == 0) std::cout << "\treturning tuples...\n";
+  return nodeTuples;
+
+}
+#+end_src
+
+
+**** Main
+
+The main routine should return the list of tuples to be handled by the current rank.
+
+Let \( N_p \) be the number of ranks or processes.
+Let \( N_n \) be the number of nodes or sockets.
+
+Then we have the following
+
+#+begin_example
+Global rank | 0 1 2 3 4 5 6 7 8
+key         | global rank
+nodeId      | 0 1 0 1 1 0 2 2 2
+Local rank  | 0 0 1 1 2 2 0 1 2
+intra color | 0 1 0 1 1 0 2 2 2
+#+end_example
+
+
+
+
+
+#+begin_src c++ :tangle (atrip-tuples-h)
+std::vector<ABCTuple> main(MPI_Comm universe, size_t Nv) {
+  // Entry point of the group-and-sort distribution for the calling rank.
+  int rank, np;
+  MPI_Comm_rank(universe, &rank);
+  MPI_Comm_size(universe, &np);
+
+  std::vector<ABCTuple> result;
+
+  auto const nodeNames(getNodeNames(universe));
+  size_t const nNodes = unique(nodeNames).size();
+  auto const nodeInfos = getNodeInfos(nodeNames);
+
+  // Only one rank per node, the one with local rank 0, computes the
+  // distribution of its node; all other ranks start with an empty list
+  bool const computeDistribution
+    = nodeInfos[rank].localRank == 0;
+
+  std::vector<ABCTuple>
+    nodeTuples
+      = computeDistribution
+      ? specialDistribution(Info{nNodes, nodeInfos[rank].nodeId},
+                            getAllTuplesList(Nv))
+      : std::vector<ABCTuple>()
+      ;
+
+  LOG(1,"Atrip") << "got nodeTuples\n";
+
+  // now we have to send the data from **one** rank on each node
+  // to all other ranks of this node
+    const
+  int color = nodeInfos[rank].nodeId
+    , key = nodeInfos[rank].localRank
+    ;
+
+  // one communicator per node (color), ordered by local rank (key)
+  MPI_Comm INTRA_COMM;
+  MPI_Comm_split(universe, color, key, &INTRA_COMM);
+#+end_src
+
+Every node has to distribute **at least**
+=nodeTuples.size() / nodeInfos[rank].ranksPerNode=
+tuples among the ranks.
+
+We have to communicate this quantity among all nodes.
+
+#+begin_src c++ :tangle (atrip-tuples-h)
+
+  // tuples each rank of this node would hold if the node list were split
+  // evenly (rounded up); ranks that did not compute a list contribute 0
+  size_t const
+    tuplesPerRankLocal
+       = nodeTuples.size() / nodeInfos[rank].ranksPerNode
+       + size_t(nodeTuples.size() % nodeInfos[rank].ranksPerNode != 0)
+       ;
+
+  // MPI_UINT64_T is only the right datatype if size_t is 64 bits wide
+  static_assert(sizeof(size_t) == 8,
+                "MPI_UINT64_T is used to communicate size_t values");
+
+  // every rank needs the maximum over all ranks: a single MPI_Allreduce
+  // gives the same result as an MPI_Reduce to rank 0 followed by an
+  // MPI_Bcast, with one collective call instead of two
+  size_t tuplesPerRankGlobal = 0;
+
+  MPI_Allreduce(&tuplesPerRankLocal, &tuplesPerRankGlobal,
+                1, MPI_UINT64_T, MPI_MAX,
+                universe);
+
+
+  LOG(1,"Atrip") << "Tuples per rank: " << tuplesPerRankGlobal << "\n";
+  LOG(1,"Atrip") << "ranks per node " << nodeInfos[rank].ranksPerNode << "\n";
+  LOG(1,"Atrip") << "#nodes " << nNodes << "\n";
+#+end_src
+
+Now we have the tuples that every rank has to have, i.e.,
+=tuplesPerRankGlobal=.
+
+However before this,
+the tuples in =nodeTuples= now have to be sent from the local rank
+in every node to all the ranks in the given node,
+and we have to make sure that every rank inside a given node
+gets the same amount of tuples, in this case it should be
+=tuplesPerRankLocal=, and in our node the total number
+of tuples should be =tuplesPerRankLocal * nodeInfos[rank].ranksPerNode=,
+however this might not be the case up to now due to divisibility issues.
+
+Up to now we have exactly =nodeTuples.size()= tuples, we have to make sure by
+resizing that the condition above is met, i.e., so we can resize
+and add some fake tuples at the end as padding.
+
+#+begin_src c++ :tangle (atrip-tuples-h)
+size_t const totalTuples
+  = tuplesPerRankGlobal * nodeInfos[rank].ranksPerNode;
+// only the rank that holds the node list pads it, up to totalTuples entries
+if (computeDistribution) {
+  // pad with FAKE_TUPLEs
+  nodeTuples.insert(nodeTuples.end(),
+                    totalTuples - nodeTuples.size(),
+                    FAKE_TUPLE);
+}
+#+end_src
+
+And now we can simply scatter the tuples in nodeTuples and send
+=tuplesPerRankGlobal= to the different ranks in the node,
+
+#+begin_src c++ :tangle (atrip-tuples-h)
+{
+  // MPI datatype describing one ABCTuple. The element count comes from the
+  // type itself: nodeTuples is empty on ranks that did not compute the
+  // distribution, so nodeTuples[0] must not be touched there.
+  MPI_Datatype MPI_ABCTUPLE;
+  MPI_Type_vector(std::tuple_size<ABCTuple>::value, 1, 1,
+                  MPI_UINT64_T, &MPI_ABCTUPLE);
+  MPI_Type_commit(&MPI_ABCTUPLE);
+
+  LOG(1,"Atrip") << "scattering tuples \n";
+
+  // rank 0 of INTRA_COMM hands tuplesPerRankGlobal tuples to every rank of it
+  result.resize(tuplesPerRankGlobal);
+  MPI_Scatter(nodeTuples.data(), tuplesPerRankGlobal, MPI_ABCTUPLE,
+              result.data(), tuplesPerRankGlobal, MPI_ABCTUPLE,
+              0, INTRA_COMM);
+
+  MPI_Type_free(&MPI_ABCTUPLE);
+  // the node-local communicator is not used after the scatter
+  MPI_Comm_free(&INTRA_COMM);
+}
+#+end_src
+
+
+The next step is sending the tuples in the local root rank
+to the other ranks in the node, this we do with the MPI function
+=MPI_Scatterv=.
+Every rank gets =tuplesPerRankLocal= tuples and
+the =nodeTuples= vector is now homogeneous and divisible by the number
+of ranks per node in our node.
+Therefore, the =displacements= are simply the vector
+\begin{equation*}
+  \left\{
+  k \cdot \mathrm{tuplesPerRankLocal}
+  \mid
+  k \in
+  \left\{ 0
+        , \ldots
+        , \#\text{ranks in node} - 1
+        \right\}
+  \right\}
+\end{equation*}
+
+and the =sendCounts= vector is simply the constant vector
+=tuplesPerRankLocal= of size =ranksPerNode=.
+
+#+begin_src c++ :tangle (atrip-tuples-h)
+
+  return result;
+
+}
+#+end_src
+
+**** Interface
+
+The distribution interface will then simply be
+
+#+begin_src c++ :tangle (atrip-tuples-h)
+struct Distribution : public TuplesDistribution { // group-and-sort distribution
+  ABCTuples getTuples(size_t Nv, MPI_Comm universe) override {
+    return main(universe, Nv); // forwards to main of this namespace
+  }
+};
+#+end_src
+
+
+**** Epilog                                                        :noexport:
+#+begin_src c++ :tangle (atrip-tuples-h)
+} // namespace group_and_sort
+#+end_src
+
+
+*** Epilog                                                         :noexport:
+#+begin_src c++ :tangle (atrip-tuples-h)
+}
+#+end_src
+
 ** Unions
-Since every tensor slice in a different way, we can override the slicing procedure
-and define subclasses of slice unions.
+
+Every slice pertaining to every different tensor
+is sliced differently.
+
 
 #+begin_src c++ :tangle (atrip-unions-h)
 #pragma once
@@ -1318,7 +2256,7 @@ namespace atrip {
                           , child_world
                           , global_world
                           , Slice<F>::TA
-                          , 4) {
+                          , 6) {
            init(sourceTensor);
          }
 
@@ -1356,7 +2294,7 @@ namespace atrip {
                          , child_world
                          , global_world
                          , Slice<F>::VIJKA
-                         , 4) {
+                         , 6) {
            init(sourceTensor);
          }
 
@@ -1675,10 +2613,8 @@ namespace atrip {
     , F const* TBChh
     // -- TIJK
     , F *Tijk
-    , atrip::Timings& chrono
     ) {
 
-    auto& t_reorder = chrono["doubles:reorder"];
     const size_t a = abc[0], b = abc[1], c = abc[2]
               , NoNo = No*No, NoNv = No*Nv
               ;
@@ -1686,13 +2622,13 @@ namespace atrip {
   #if defined(ATRIP_USE_DGEMM)
   #define _IJK_(i, j, k) i + j*No + k*NoNo
   #define REORDER(__II, __JJ, __KK)                                 \
-    t_reorder.start();                                              \
+    WITH_CHRONO("doubles:reorder",                                  \
     for (size_t k = 0; k < No; k++)                                 \
     for (size_t j = 0; j < No; j++)                                 \
     for (size_t i = 0; i < No; i++) {                               \
       Tijk[_IJK_(i, j, k)] += _t_buffer[_IJK_(__II, __JJ, __KK)];   \
     }                                                               \
-    t_reorder.stop();
+    )
   #define DGEMM_PARTICLES(__A, __B)      \
     atrip::xgemm<F>( "T"                 \
                    , "N"                 \
@@ -1732,92 +2668,91 @@ namespace atrip {
     _t_buffer.reserve(NoNoNo);
     F one{1.0}, m_one{-1.0}, zero{0.0};
 
-    t_reorder.start();
-    for (size_t k = 0; k < NoNoNo; k++) {
-      // zero the Tijk
-      Tijk[k] = 0.0;
-    }
-    t_reorder.stop();
+    WITH_CHRONO("doubles:reorder", // zero Tijk; same timer key as REORDER
+      for (size_t k = 0; k < NoNoNo; k++) {
+         Tijk[k] = 0.0;
+       })
 
-    chrono["doubles:holes"].start();
-    { // Holes part ============================================================
+    // TOMERGE: replace chronos
+    WITH_CHRONO("doubles:holes",
+      { // Holes part ========================================================
 
-      std::vector<F> _vhhh(NoNoNo);
+        std::vector<F> _vhhh(NoNoNo);
 
-      // VhhhC[i + k*No + L*NoNo] * TABhh[L + j*No]; H1
-      MAYBE_CONJ(_vhhh, VhhhC)
-      chrono["doubles:holes:1"].start();
-      DGEMM_HOLES(_vhhh.data(), TABhh, "N")
-      REORDER(i, k, j)
-      chrono["doubles:holes:1"].stop();
-      // VhhhC[j + k*No + L*NoNo] * TABhh[i + L*No]; H0
-      chrono["doubles:holes:2"].start();
-      DGEMM_HOLES(_vhhh.data(), TABhh, "T")
-      REORDER(j, k, i)
-      chrono["doubles:holes:2"].stop();
+        // VhhhC[i + k*No + L*NoNo] * TABhh[L + j*No]; H1
+        MAYBE_CONJ(_vhhh, VhhhC)
+        WITH_CHRONO("doubles:holes:1",
+          DGEMM_HOLES(_vhhh.data(), TABhh, "N")
+          REORDER(i, k, j)
+        )
+        // VhhhC[j + k*No + L*NoNo] * TABhh[i + L*No]; H0
+        WITH_CHRONO("doubles:holes:2",
+          DGEMM_HOLES(_vhhh.data(), TABhh, "T")
+          REORDER(j, k, i)
+        )
 
-      // VhhhB[i + j*No + L*NoNo] * TAChh[L + k*No]; H5
-      MAYBE_CONJ(_vhhh, VhhhB)
-      chrono["doubles:holes:3"].start();
-      DGEMM_HOLES(_vhhh.data(), TAChh, "N")
-      REORDER(i, j, k)
-      chrono["doubles:holes:3"].stop();
-      // VhhhB[k + j*No + L*NoNo] * TAChh[i + L*No]; H3
-      chrono["doubles:holes:4"].start();
-      DGEMM_HOLES(_vhhh.data(), TAChh, "T")
-      REORDER(k, j, i)
-      chrono["doubles:holes:4"].stop();
+        // VhhhB[i + j*No + L*NoNo] * TAChh[L + k*No]; H5
+        MAYBE_CONJ(_vhhh, VhhhB)
+        WITH_CHRONO("doubles:holes:3",
+          DGEMM_HOLES(_vhhh.data(), TAChh, "N")
+          REORDER(i, j, k)
+        )
+        // VhhhB[k + j*No + L*NoNo] * TAChh[i + L*No]; H3
+        WITH_CHRONO("doubles:holes:4",
+          DGEMM_HOLES(_vhhh.data(), TAChh, "T")
+          REORDER(k, j, i)
+        )
 
-      // VhhhA[j + i*No + L*NoNo] * TBChh[L + k*No]; H1
-      MAYBE_CONJ(_vhhh, VhhhA)
-      chrono["doubles:holes:5"].start();
-      DGEMM_HOLES(_vhhh.data(), TBChh, "N")
-      REORDER(j, i, k)
-      chrono["doubles:holes:5"].stop();
-      // VhhhA[k + i*No + L*NoNo] * TBChh[j + L*No]; H4
-      chrono["doubles:holes:6"].start();
-      DGEMM_HOLES(_vhhh.data(), TBChh, "T")
-      REORDER(k, i, j)
-      chrono["doubles:holes:6"].stop();
+        // VhhhA[j + i*No + L*NoNo] * TBChh[L + k*No]; H1
+        MAYBE_CONJ(_vhhh, VhhhA)
+        WITH_CHRONO("doubles:holes:5",
+          DGEMM_HOLES(_vhhh.data(), TBChh, "N")
+          REORDER(j, i, k)
+        )
+        // VhhhA[k + i*No + L*NoNo] * TBChh[j + L*No]; H4
+        WITH_CHRONO("doubles:holes:6",
+          DGEMM_HOLES(_vhhh.data(), TBChh, "T")
+          REORDER(k, i, j)
+        )
 
-    }
-    chrono["doubles:holes"].stop();
+      }
+    )
   #undef MAYBE_CONJ
 
-    chrono["doubles:particles"].start();
-    { // Particle part =========================================================
-      // TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv]; P0
-      chrono["doubles:particles:1"].start();
-      DGEMM_PARTICLES(TAphh, VBCph)
-      REORDER(i, j, k)
-      chrono["doubles:particles:1"].stop();
-      // TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv]; P3
-      chrono["doubles:particles:2"].start();
-      DGEMM_PARTICLES(TAphh, VCBph)
-      REORDER(i, k, j)
-      chrono["doubles:particles:2"].stop();
-      // TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv]; P5
-      chrono["doubles:particles:3"].start();
-      DGEMM_PARTICLES(TCphh, VABph)
-      REORDER(k, i, j)
-      chrono["doubles:particles:3"].stop();
-      // TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv]; P2
-      chrono["doubles:particles:4"].start();
-      DGEMM_PARTICLES(TCphh, VBAph)
-      REORDER(k, j, i)
-      chrono["doubles:particles:4"].stop();
-      // TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv]; P1
-      chrono["doubles:particles:5"].start();
-      DGEMM_PARTICLES(TBphh, VACph)
-      REORDER(j, i, k)
-      chrono["doubles:particles:5"].stop();
-      // TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv]; P4
-      chrono["doubles:particles:6"].start();
-      DGEMM_PARTICLES(TBphh, VCAph)
-      REORDER(j, k, i)
-      chrono["doubles:particles:6"].stop();
-    }
-    chrono["doubles:particles"].stop();
+    WITH_CHRONO("doubles:particles",
+      { // Particle part =====================================================
+        // TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv]; P0
+        WITH_CHRONO("doubles:particles:1",
+          DGEMM_PARTICLES(TAphh, VBCph)
+          REORDER(i, j, k)
+        )
+        // TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv]; P3
+        WITH_CHRONO("doubles:particles:2",
+          DGEMM_PARTICLES(TAphh, VCBph)
+          REORDER(i, k, j)
+        )
+        // TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv]; P5
+        WITH_CHRONO("doubles:particles:3",
+          DGEMM_PARTICLES(TCphh, VABph)
+          REORDER(k, i, j)
+        )
+        // TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv]; P2
+        WITH_CHRONO("doubles:particles:4",
+          DGEMM_PARTICLES(TCphh, VBAph)
+          REORDER(k, j, i)
+        )
+        // TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv]; P1
+        WITH_CHRONO("doubles:particles:5",
+          DGEMM_PARTICLES(TBphh, VACph)
+          REORDER(j, i, k)
+        )
+        // TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv]; P4
+        WITH_CHRONO("doubles:particles:6",
+          DGEMM_PARTICLES(TBphh, VCAph)
+          REORDER(j, k, i)
+        )
+      }
+    )
 
   #undef REORDER
   #undef DGEMM_HOLES
@@ -1973,12 +2908,22 @@ namespace atrip {
 
 #include <ctf.hpp>
 
+#include <atrip/Utils.hpp>
+
+#define ADD_ATTRIBUTE(_type, _name, _default)   \
+  _type _name = _default;                       \
+  Input& with_ ## _name(_type i) {              \
+    _name = i;                                  \
+    return *this;                               \
+  }
+
 namespace atrip {
 
   struct Atrip {
 
     static int rank;
     static int np;
+    static Timings chrono;
     static void init();
 
     template <typename F=double>
@@ -1991,9 +2936,6 @@ namespace atrip {
                         , *Vhhhp = nullptr
                         , *Vppph = nullptr
                         ;
-      int maxIterations = 0, iterationMod = -1, percentageMod = -1;
-      bool barrier = false;
-      bool chrono = false;
       Input& with_epsilon_i(CTF::Tensor<F> * t) { ei = t; return *this; }
       Input& with_epsilon_a(CTF::Tensor<F> * t) { ea = t; return *this; }
       Input& with_Tai(CTF::Tensor<F> * t) { Tph = t; return *this; }
@@ -2001,11 +2943,20 @@ namespace atrip {
       Input& with_Vabij(CTF::Tensor<F> * t) { Vpphh = t; return *this; }
       Input& with_Vijka(CTF::Tensor<F> * t) { Vhhhp = t; return *this; }
       Input& with_Vabci(CTF::Tensor<F> * t) { Vppph = t; return *this; }
-      Input& with_maxIterations(int i) { maxIterations = i; return *this; }
-      Input& with_iterationMod(int i) { iterationMod = i; return *this; }
-      Input& with_percentageMod(int i) { percentageMod = i; return *this; }
-      Input& with_barrier(bool i) { barrier = i; return *this; }
-      Input& with_chrono(bool i) { chrono = i; return *this; }
+
+      enum TuplesDistribution {
+        NAIVE,
+        GROUP_AND_SORT,
+      };
+
+      ADD_ATTRIBUTE(bool, rankRoundRobin, false)
+      ADD_ATTRIBUTE(bool, chrono, false)
+      ADD_ATTRIBUTE(bool, barrier, false)
+      ADD_ATTRIBUTE(int, maxIterations, 0)
+      ADD_ATTRIBUTE(int, iterationMod, -1)
+      ADD_ATTRIBUTE(int, percentageMod, -1)
+      ADD_ATTRIBUTE(TuplesDistribution, tuplesDistribution, NAIVE)
+
     };
 
     struct Output {
@@ -2031,8 +2982,11 @@ namespace atrip {
 
 using namespace atrip;
 
+bool RankMap<Complex>::RANK_ROUND_ROBIN;
+bool RankMap<double>::RANK_ROUND_ROBIN;
 int Atrip::rank;
 int Atrip::np;
+Timings Atrip::chrono;
 
 // user printing block
 IterationDescriptor IterationDescription::descriptor;
@@ -2052,28 +3006,35 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
   const int rank = Atrip::rank;
   MPI_Comm universe = in.ei->wrld->comm;
 
-  // Timings in seconds ================================================{{{1
-  Timings chrono{};
-
   const size_t No = in.ei->lens[0];
   const size_t Nv = in.ea->lens[0];
   LOG(0,"Atrip") << "No: " << No << "\n";
   LOG(0,"Atrip") << "Nv: " << Nv << "\n";
+  LOG(0,"Atrip") << "np: " << np << "\n";
 
   // allocate the three scratches, see piecuch
-  std::vector<F>   Tijk(No*No*No) // doubles only (see piecuch)
-                 , Zijk(No*No*No) // singles + doubles (see piecuch)
-                 // we need local copies of the following tensors on every
-                 // rank
-                 , epsi(No)
-                 , epsa(Nv)
-                 , Tai(No * Nv)
-                 ;
+  std::vector<F> Tijk(No*No*No) // doubles only (see piecuch)
+               , Zijk(No*No*No) // singles + doubles (see piecuch)
+               // we need local copies of the following tensors on every
+               // rank
+               , epsi(No)
+               , epsa(Nv)
+               , Tai(No * Nv)
+               ;
 
   in.ei->read_all(epsi.data());
   in.ea->read_all(epsa.data());
   in.Tph->read_all(Tai.data());
 
+  RankMap<F>::RANK_ROUND_ROBIN = in.rankRoundRobin;
+  if (RankMap<F>::RANK_ROUND_ROBIN) {
+    LOG(0,"Atrip") << "Doing rank round robin slices distribution" << "\n";
+  } else {
+    LOG(0,"Atrip")
+      << "Doing node > local rank round robin slices distribution" << "\n";
+  }
+
+
   // COMMUNICATOR CONSTRUCTION ========================================={{{1
   //
   // Construct a new communicator living only on a single rank
@@ -2094,41 +3055,49 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
   }
 
 
-  chrono["nv-slices"].start();
   // BUILD SLICES PARAMETRIZED BY NV ==================================={{{1
-  LOG(0,"Atrip") << "BUILD NV-SLICES\n";
-  TAPHH<F> taphh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
-  HHHA<F>  hhha(*in.Vhhhp, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
-  chrono["nv-slices"].stop();
+  WITH_CHRONO("nv-slices",
+    LOG(0,"Atrip") << "BUILD NV-SLICES\n";
+    TAPHH<F> taphh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
+    HHHA<F>  hhha(*in.Vhhhp, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
+  )
 
-  chrono["nv-nv-slices"].start();
   // BUILD SLICES PARAMETRIZED BY NV x NV =============================={{{1
-  LOG(0,"Atrip") << "BUILD NV x NV-SLICES\n";
-  ABPH<F> abph(*in.Vppph, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
-  ABHH<F> abhh(*in.Vpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
-  TABHH<F> tabhh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
-  chrono["nv-nv-slices"].stop();
+  WITH_CHRONO("nv-nv-slices",
+    LOG(0,"Atrip") << "BUILD NV x NV-SLICES\n";
+    ABPH<F> abph(*in.Vppph, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
+    ABHH<F> abhh(*in.Vpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
+    TABHH<F> tabhh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
+  )
 
   // all tensors
   std::vector< SliceUnion<F>* > unions = {&taphh, &hhha, &abph, &abhh, &tabhh};
 
-  //CONSTRUCT TUPLE LIST ==============================================={{{1
-  LOG(0,"Atrip") << "BUILD TUPLE LIST\n";
-  const auto tuplesList = std::move(getTuplesList(Nv));
-  WITH_RANK << "tupList.size() = " << tuplesList.size() << "\n";
+  // get tuples for the current rank
+  TuplesDistribution *distribution;
 
-  // GET ABC INDEX RANGE FOR RANK ======================================{{{1
-  auto abcIndex = getABCRange(np, rank, tuplesList);
-  size_t nIterations = abcIndex.second - abcIndex.first;
+  if (in.tuplesDistribution == Atrip::Input<F>::TuplesDistribution::NAIVE) {
+    LOG(0,"Atrip") << "Using the naive distribution\n";
+    distribution = new NaiveDistribution();
+  } else {
+    LOG(0,"Atrip") << "Using the group-and-sort distribution\n";
+    distribution = new group_and_sort::Distribution();
+  }
 
-  WITH_RANK << "abcIndex = " << pretty_print(abcIndex) << "\n";
-  LOG(0,"Atrip") << "#iterations: " << nIterations << "\n";
+  LOG(0,"Atrip") << "BUILDING TUPLE LIST\n";
+  WITH_CHRONO("tuples:build",
+    auto const tuplesList = distribution->getTuples(Nv, universe);
+    )
+  const size_t nIterations = tuplesList.size();
 
-  // first abc
-  const ABCTuple firstAbc = tuplesList[abcIndex.first];
-
-
-  double energy(0.);
+  {
+    const size_t _all_tuples = Nv * (Nv + 1) * (Nv + 2) / 6 - Nv;
+    LOG(0,"Atrip") << "#iterations: "
+                  << nIterations
+                  << "/"
+                  << nIterations * np
+                  << "\n";
+  }
 
   const size_t
       iterationMod = (in.percentageMod > 0)
@@ -2141,7 +3110,9 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
 
 
   auto const isFakeTuple
-    = [&tuplesList](size_t const i) { return i >= tuplesList.size(); };
+    = [&tuplesList, distribution](size_t const i) {
+      return distribution->tupleIsFake(tuplesList[i]);
+    };
 
 
   using Database = typename Slice<F>::Database;
@@ -2149,45 +3120,42 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
   auto communicateDatabase
     = [ &unions
       , np
-      , &chrono
       ] (ABCTuple const& abc, MPI_Comm const& c) -> Database {
 
-        chrono["db:comm:type:do"].start();
-        auto MPI_LDB_ELEMENT = Slice<F>::mpi::localDatabaseElement();
-        chrono["db:comm:type:do"].stop();
+        WITH_CHRONO("db:comm:type:do",
+          auto MPI_LDB_ELEMENT = Slice<F>::mpi::localDatabaseElement();
+        )
 
-        chrono["db:comm:ldb"].start();
-        LocalDatabase ldb;
-
-        for (auto const& tensor: unions) {
-          auto const& tensorDb = tensor->buildLocalDatabase(abc);
-          ldb.insert(ldb.end(), tensorDb.begin(), tensorDb.end());
-        }
-        chrono["db:comm:ldb"].stop();
+        WITH_CHRONO("db:comm:ldb",
+          typename Slice<F>::LocalDatabase ldb;
+          for (auto const& tensor: unions) {
+            auto const& tensorDb = tensor->buildLocalDatabase(abc);
+            ldb.insert(ldb.end(), tensorDb.begin(), tensorDb.end());
+          }
+        )
 
         Database db(np * ldb.size(), ldb[0]);
 
-        chrono["oneshot-db:comm:allgather"].start();
-        chrono["db:comm:allgather"].start();
-        MPI_Allgather( ldb.data()
-                     , ldb.size()
-                     , MPI_LDB_ELEMENT
-                     , db.data()
-                     , ldb.size()
-                     , MPI_LDB_ELEMENT
-                     , c);
-        chrono["db:comm:allgather"].stop();
-        chrono["oneshot-db:comm:allgather"].stop();
+        WITH_CHRONO("oneshot-db:comm:allgather",
+        WITH_CHRONO("db:comm:allgather",
+          MPI_Allgather( ldb.data()
+                       , ldb.size()
+                       , MPI_LDB_ELEMENT
+                       , db.data()
+                       , ldb.size()
+                       , MPI_LDB_ELEMENT
+                       , c);
+        ))
 
-        chrono["db:comm:type:free"].start();
-        MPI_Type_free(&MPI_LDB_ELEMENT);
-        chrono["db:comm:type:free"].stop();
+        WITH_CHRONO("db:comm:type:free",
+          MPI_Type_free(&MPI_LDB_ELEMENT);
+        )
 
         return db;
       };
 
   auto doIOPhase
-    = [&unions, &rank, &np, &universe, &chrono] (Database const& db) {
+    = [&unions, &rank, &np, &universe] (Database const& db) {
 
     const size_t localDBLength = db.size() / np;
 
@@ -2223,9 +3191,9 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
           << "\n"
           ;
 
-        chrono["db:io:recv"].start();
-        u.receive(el.info, recvTag);
-        chrono["db:io:recv"].stop();
+        WITH_CHRONO("db:io:recv",
+          u.receive(el.info, recvTag);
+        )
 
       } // recv
     }
@@ -2259,9 +3227,9 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
           << "\n"
           ;
 
-        chrono["db:io:send"].start();
-        u.send(otherRank, el.info, sendTag);
-        chrono["db:io:send"].stop();
+        WITH_CHRONO("db:io:send",
+          u.send(otherRank, el, sendTag);
+        )
 
       } // send phase
 
@@ -2287,24 +3255,22 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
 
   // START MAIN LOOP ======================================================{{{1
 
-  for ( size_t i = abcIndex.first, iteration = 1
-      ; i < abcIndex.second
+  double energy(0.);
+
+  for ( size_t i = 0, iteration = 1
+      ; i < tuplesList.size()
       ; i++, iteration++
       ) {
-    chrono["iterations"].start();
-
+    Atrip::chrono["iterations"].start();
 
     // check overhead from chrono over all iterations
-    chrono["start:stop"].start(); chrono["start:stop"].stop();
+    WITH_CHRONO("start:stop", {})
 
     // check overhead of doing a barrier at the beginning
-    chrono["oneshot-mpi:barrier"].start();
-    chrono["mpi:barrier"].start();
-    // TODO: REMOVE
-    if (in.barrier == 1)
-    MPI_Barrier(universe);
-    chrono["mpi:barrier"].stop();
-    chrono["oneshot-mpi:barrier"].stop();
+    WITH_CHRONO("oneshot-mpi:barrier",
+    WITH_CHRONO("mpi:barrier",
+      if (in.barrier) MPI_Barrier(universe);
+    ))
 
     if (iteration % iterationMod == 0 || iteration == iteration1Percent) {
 
@@ -2312,22 +3278,22 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
         IterationDescription::descriptor({
           iteration,
           nIterations,
-          chrono["iterations"].count()
+          Atrip::chrono["iterations"].count()
         });
       }
 
       LOG(0,"Atrip")
         << "iteration " << iteration
         << " [" << 100 * iteration / nIterations << "%]"
-        << " (" << doublesFlops * iteration / chrono["doubles"].count()
+        << " (" << doublesFlops * iteration / Atrip::chrono["doubles"].count()
         << "GF)"
-        << " (" << doublesFlops * iteration / chrono["iterations"].count()
+        << " (" << doublesFlops * iteration / Atrip::chrono["iterations"].count()
         << "GF)"
         << " ===========================\n";
 
       // PRINT TIMINGS
       if (in.chrono)
-      for (auto const& pair: chrono)
+      for (auto const& pair: Atrip::chrono)
         LOG(1, " ") << pair.first << " :: "
                     << pair.second.count()
                     << std::endl;
@@ -2337,46 +3303,43 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
     const ABCTuple abc = isFakeTuple(i)
                        ? tuplesList[tuplesList.size() - 1]
                        : tuplesList[i]
-                 , *abcNext = i == (abcIndex.second - 1)
+                 , *abcNext = i == (tuplesList.size() - 1)
                             ? nullptr
-                            : isFakeTuple(i + 1)
-                            ? &tuplesList[tuplesList.size() - 1]
                             : &tuplesList[i + 1]
                  ;
 
-    chrono["with_rank"].start();
-    WITH_RANK << " :it " << iteration
-              << " :abc " << pretty_print(abc)
-              << " :abcN "
-              << (abcNext ? pretty_print(*abcNext) : "None")
-              << "\n";
-    chrono["with_rank"].stop();
+    WITH_CHRONO("with_rank",
+      WITH_RANK << " :it " << iteration
+                << " :abc " << pretty_print(abc)
+                << " :abcN "
+                << (abcNext ? pretty_print(*abcNext) : "None")
+                << "\n";
+    )
 
 
     // COMM FIRST DATABASE ================================================{{{1
-    if (i == abcIndex.first) {
+    if (i == 0) {
       WITH_RANK << "__first__:first database ............ \n";
-      const auto __db = communicateDatabase(abc, universe);
+      const auto db = communicateDatabase(abc, universe);
       WITH_RANK << "__first__:first database communicated \n";
       WITH_RANK << "__first__:first database io phase \n";
-      doIOPhase(__db);
+      doIOPhase(db);
       WITH_RANK << "__first__:first database io phase DONE\n";
       WITH_RANK << "__first__::::Unwrapping all slices for first database\n";
       for (auto& u: unions) u->unwrapAll(abc);
-      WITH_RANK << "__first__::::Unwrapping all slices for first database DONE\n";
+      WITH_RANK << "__first__::::Unwrapping slices for first database DONE\n";
       MPI_Barrier(universe);
     }
 
     // COMM NEXT DATABASE ================================================={{{1
     if (abcNext) {
       WITH_RANK << "__comm__:" << iteration << "th communicating database\n";
-      chrono["db:comm"].start();
-      //const auto db = communicateDatabase(*abcNext, universe);
-      Database db = communicateDatabase(*abcNext, universe);
-      chrono["db:comm"].stop();
-      chrono["db:io"].start();
-      doIOPhase(db);
-      chrono["db:io"].stop();
+      WITH_CHRONO("db:comm",
+        const auto db = communicateDatabase(*abcNext, universe);
+      )
+      WITH_CHRONO("db:io",
+        doIOPhase(db);
+      )
       WITH_RANK << "__comm__:" <<  iteration << "th database io phase DONE\n";
     }
 
@@ -2384,63 +3347,61 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
     OCD_Barrier(universe);
     if (!isFakeTuple(i)) {
       WITH_RANK << iteration << "-th doubles\n";
-      WITH_CHRONO(chrono["oneshot-unwrap"],
-      WITH_CHRONO(chrono["unwrap"],
-      WITH_CHRONO(chrono["unwrap:doubles"],
+      WITH_CHRONO("oneshot-unwrap",
+      WITH_CHRONO("unwrap",
+      WITH_CHRONO("unwrap:doubles",
         for (auto& u: decltype(unions){&abph, &hhha, &taphh, &tabhh}) {
           u->unwrapAll(abc);
         }
       )))
-      chrono["oneshot-doubles"].start();
-      chrono["doubles"].start();
-      doublesContribution<F>( abc, (size_t)No, (size_t)Nv
-                            // -- VABCI
-                            , abph.unwrapSlice(Slice<F>::AB, abc)
-                            , abph.unwrapSlice(Slice<F>::AC, abc)
-                            , abph.unwrapSlice(Slice<F>::BC, abc)
-                            , abph.unwrapSlice(Slice<F>::BA, abc)
-                            , abph.unwrapSlice(Slice<F>::CA, abc)
-                            , abph.unwrapSlice(Slice<F>::CB, abc)
-                            // -- VHHHA
-                            , hhha.unwrapSlice(Slice<F>::A, abc)
-                            , hhha.unwrapSlice(Slice<F>::B, abc)
-                            , hhha.unwrapSlice(Slice<F>::C, abc)
-                            // -- TA
-                            , taphh.unwrapSlice(Slice<F>::A, abc)
-                            , taphh.unwrapSlice(Slice<F>::B, abc)
-                            , taphh.unwrapSlice(Slice<F>::C, abc)
-                            // -- TABIJ
-                            , tabhh.unwrapSlice(Slice<F>::AB, abc)
-                            , tabhh.unwrapSlice(Slice<F>::AC, abc)
-                            , tabhh.unwrapSlice(Slice<F>::BC, abc)
-                            // -- TIJK
-                            , Tijk.data()
-                            , chrono
-                            );
-      WITH_RANK << iteration << "-th doubles done\n";
-      chrono["doubles"].stop();
-      chrono["oneshot-doubles"].stop();
+      WITH_CHRONO("oneshot-doubles",
+      WITH_CHRONO("doubles",
+        doublesContribution<F>( abc, (size_t)No, (size_t)Nv
+                              // -- VABCI
+                              , abph.unwrapSlice(Slice<F>::AB, abc)
+                              , abph.unwrapSlice(Slice<F>::AC, abc)
+                              , abph.unwrapSlice(Slice<F>::BC, abc)
+                              , abph.unwrapSlice(Slice<F>::BA, abc)
+                              , abph.unwrapSlice(Slice<F>::CA, abc)
+                              , abph.unwrapSlice(Slice<F>::CB, abc)
+                              // -- VHHHA
+                              , hhha.unwrapSlice(Slice<F>::A, abc)
+                              , hhha.unwrapSlice(Slice<F>::B, abc)
+                              , hhha.unwrapSlice(Slice<F>::C, abc)
+                              // -- TA
+                              , taphh.unwrapSlice(Slice<F>::A, abc)
+                              , taphh.unwrapSlice(Slice<F>::B, abc)
+                              , taphh.unwrapSlice(Slice<F>::C, abc)
+                              // -- TABIJ
+                              , tabhh.unwrapSlice(Slice<F>::AB, abc)
+                              , tabhh.unwrapSlice(Slice<F>::AC, abc)
+                              , tabhh.unwrapSlice(Slice<F>::BC, abc)
+                              // -- TIJK
+                              , Tijk.data()
+                              );
+        WITH_RANK << iteration << "-th doubles done\n";
+      ))
     }
 
     // COMPUTE SINGLES =================================================== {{{1
     OCD_Barrier(universe);
     if (!isFakeTuple(i)) {
-      WITH_CHRONO(chrono["oneshot-unwrap"],
-      WITH_CHRONO(chrono["unwrap"],
-      WITH_CHRONO(chrono["unwrap:singles"],
+      WITH_CHRONO("oneshot-unwrap",
+      WITH_CHRONO("unwrap",
+      WITH_CHRONO("unwrap:singles",
         abhh.unwrapAll(abc);
       )))
-      chrono["reorder"].start();
-      for (size_t I(0); I < Zijk.size(); I++) Zijk[I] = Tijk[I];
-      chrono["reorder"].stop();
-      chrono["singles"].start();
+      WITH_CHRONO("reorder",
+        for (size_t I(0); I < Zijk.size(); I++) Zijk[I] = Tijk[I];
+      )
+      WITH_CHRONO("singles",
       singlesContribution<F>( No, Nv, abc
                             , Tai.data()
                             , abhh.unwrapSlice(Slice<F>::AB, abc)
                             , abhh.unwrapSlice(Slice<F>::AC, abc)
                             , abhh.unwrapSlice(Slice<F>::BC, abc)
                             , Zijk.data());
-      chrono["singles"].stop();
+      )
     }
 
 
@@ -2453,12 +3414,12 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
       if (abc[1] == abc[2]) distinct--;
       const F epsabc(epsa[abc[0]] + epsa[abc[1]] + epsa[abc[2]]);
 
-      chrono["energy"].start();
-      if ( distinct == 0)
-        tupleEnergy = getEnergyDistinct<F>(epsabc, epsi, Tijk, Zijk);
-      else
-        tupleEnergy = getEnergySame<F>(epsabc, epsi, Tijk, Zijk);
-      chrono["energy"].stop();
+      WITH_CHRONO("energy",
+        if ( distinct == 0)
+          tupleEnergy = getEnergyDistinct<F>(epsabc, epsi, Tijk, Zijk);
+        else
+          tupleEnergy = getEnergySame<F>(epsabc, epsi, Tijk, Zijk);
+      )
 
 #if defined(HAVE_OCD) || defined(ATRIP_PRINT_TUPLES)
       tupleEnergies[abc] = tupleEnergy;
@@ -2468,6 +3429,7 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
 
     }
 
+    // TODO: remove this
     if (isFakeTuple(i)) {
       // fake iterations should also unwrap whatever they got
       WITH_RANK << iteration
@@ -2489,7 +3451,6 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
     // CLEANUP UNIONS ===================================================={{{1
     OCD_Barrier(universe);
     if (abcNext) {
-      chrono["gc"].start();
       WITH_RANK << "__gc__:" << iteration << "-th cleaning up.......\n";
       for (auto& u: unions) {
 
@@ -2523,12 +3484,11 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
 
 
       }
-      chrono["gc"].stop();
     }
 
       WITH_RANK << iteration << "-th cleaning up....... DONE\n";
 
-    chrono["iterations"].stop();
+    Atrip::chrono["iterations"].stop();
     // ITERATION END ====================================================={{{1
 
   }
@@ -2566,15 +3526,15 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
 
   // PRINT TIMINGS {{{1
   if (in.chrono)
-  for (auto const& pair: chrono)
+  for (auto const& pair: Atrip::chrono)
     LOG(0,"atrip:chrono") << pair.first << " "
                           << pair.second.count() << std::endl;
 
 
   LOG(0, "atrip:flops(doubles)")
-    << nIterations * doublesFlops / chrono["doubles"].count() << "\n";
+    << nIterations * doublesFlops / Atrip::chrono["doubles"].count() << "\n";
   LOG(0, "atrip:flops(iterations)")
-    << nIterations * doublesFlops / chrono["iterations"].count() << "\n";
+    << nIterations * doublesFlops / Atrip::chrono["iterations"].count() << "\n";
 
   // TODO: change the sign in  the getEnergy routines
   return { - globalEnergy };
@@ -2633,7 +3593,6 @@ template Atrip::Output Atrip::run(Atrip::Input<Complex> const& in);
 #  define DBG(...) dbg(__VA_ARGS__)
 #elif ATRIP_DEBUG == 2
 #  pragma message("WARNING: You have some debugging info for ABC triples")
-#  include <dbg.h>
 #  define OCD_Barrier(com)
 #  define WITH_OCD if (false)
 #  define WITH_ROOT if (atrip::Atrip::rank == 0)

From bbbfb30c6f33bb842b209fe00e3706a71211983d Mon Sep 17 00:00:00 2001
From: Alejandro Gallo <aamsgallo@gmail.com>
Date: Fri, 18 Feb 2022 12:54:59 +0100
Subject: [PATCH 21/22] Add tangled sources

---
 include/atrip/Atrip.hpp      |  32 +-
 include/atrip/Debug.hpp      |   1 -
 include/atrip/Equations.hpp  | 190 +++++----
 include/atrip/RankMap.hpp    |  67 +++-
 include/atrip/Slice.hpp      | 758 ++++++++++++++++++-----------------
 include/atrip/SliceUnion.hpp |  27 +-
 include/atrip/Tuples.hpp     | 569 +++++++++++++++++++++++---
 include/atrip/Unions.hpp     |   4 +-
 include/atrip/Utils.hpp      |  45 ++-
 src/atrip/Atrip.cxx          | 332 +++++++--------
 10 files changed, 1298 insertions(+), 727 deletions(-)

diff --git a/include/atrip/Atrip.hpp b/include/atrip/Atrip.hpp
index 6f3859c..2a0f340 100644
--- a/include/atrip/Atrip.hpp
+++ b/include/atrip/Atrip.hpp
@@ -7,12 +7,22 @@
 
 #include <ctf.hpp>
 
+#include <atrip/Utils.hpp>
+
+#define ADD_ATTRIBUTE(_type, _name, _default)   \
+  _type _name = _default;                       \
+  Input& with_ ## _name(_type i) {              \
+    _name = i;                                  \
+    return *this;                               \
+  }
+
 namespace atrip {
 
   struct Atrip {
 
     static int rank;
     static int np;
+    static Timings chrono;
     static void init();
 
     template <typename F=double>
@@ -25,9 +35,6 @@ namespace atrip {
                         , *Vhhhp = nullptr
                         , *Vppph = nullptr
                         ;
-      int maxIterations = 0, iterationMod = -1, percentageMod = -1;
-      bool barrier = false;
-      bool chrono = false;
       Input& with_epsilon_i(CTF::Tensor<F> * t) { ei = t; return *this; }
       Input& with_epsilon_a(CTF::Tensor<F> * t) { ea = t; return *this; }
       Input& with_Tai(CTF::Tensor<F> * t) { Tph = t; return *this; }
@@ -35,11 +42,20 @@ namespace atrip {
       Input& with_Vabij(CTF::Tensor<F> * t) { Vpphh = t; return *this; }
       Input& with_Vijka(CTF::Tensor<F> * t) { Vhhhp = t; return *this; }
       Input& with_Vabci(CTF::Tensor<F> * t) { Vppph = t; return *this; }
-      Input& with_maxIterations(int i) { maxIterations = i; return *this; }
-      Input& with_iterationMod(int i) { iterationMod = i; return *this; }
-      Input& with_percentageMod(int i) { percentageMod = i; return *this; }
-      Input& with_barrier(bool i) { barrier = i; return *this; }
-      Input& with_chrono(bool i) { chrono = i; return *this; }
+
+      enum TuplesDistribution {
+        NAIVE,
+        GROUP_AND_SORT,
+      };
+
+      ADD_ATTRIBUTE(bool, rankRoundRobin, false)
+      ADD_ATTRIBUTE(bool, chrono, false)
+      ADD_ATTRIBUTE(bool, barrier, false)
+      ADD_ATTRIBUTE(int, maxIterations, 0)
+      ADD_ATTRIBUTE(int, iterationMod, -1)
+      ADD_ATTRIBUTE(int, percentageMod, -1)
+      ADD_ATTRIBUTE(TuplesDistribution, tuplesDistribution, NAIVE)
+
     };
 
     struct Output {
diff --git a/include/atrip/Debug.hpp b/include/atrip/Debug.hpp
index 4347824..e567d5c 100644
--- a/include/atrip/Debug.hpp
+++ b/include/atrip/Debug.hpp
@@ -41,7 +41,6 @@
 #  define DBG(...) dbg(__VA_ARGS__)
 #elif ATRIP_DEBUG == 2
 #  pragma message("WARNING: You have some debugging info for ABC triples")
-#  include <dbg.h>
 #  define OCD_Barrier(com)
 #  define WITH_OCD if (false)
 #  define WITH_ROOT if (atrip::Atrip::rank == 0)
diff --git a/include/atrip/Equations.hpp b/include/atrip/Equations.hpp
index 2b90736..e907592 100644
--- a/include/atrip/Equations.hpp
+++ b/include/atrip/Equations.hpp
@@ -40,12 +40,12 @@ namespace atrip {
                   , X(Zijk_[j + No*k + No*No*i])
                   , Y(Zijk_[k + No*i + No*No*j])
                   , Z(Zijk_[k + No*j + No*No*i])
-                  , A(std::conj(Tijk_[i + No*j + No*No*k]))
-                  , B(std::conj(Tijk_[i + No*k + No*No*j]))
-                  , C(std::conj(Tijk_[j + No*i + No*No*k]))
-                  , D(std::conj(Tijk_[j + No*k + No*No*i]))
-                  , E(std::conj(Tijk_[k + No*i + No*No*j]))
-                  , F(std::conj(Tijk_[k + No*j + No*No*i]))
+                  , A(maybeConjugate<F>(Tijk_[i + No*j + No*No*k]))
+                  , B(maybeConjugate<F>(Tijk_[i + No*k + No*No*j]))
+                  , C(maybeConjugate<F>(Tijk_[j + No*i + No*No*k]))
+                  , D(maybeConjugate<F>(Tijk_[j + No*k + No*No*i]))
+                  , E(maybeConjugate<F>(Tijk_[k + No*i + No*No*j]))
+                  , F(maybeConjugate<F>(Tijk_[k + No*j + No*No*i]))
                   , value
                     = 3.0 * ( A * U
                               + B * V
@@ -102,9 +102,9 @@ namespace atrip {
                 , U(Zijk_[i + No*j + No*No*k])
                 , V(Zijk_[j + No*k + No*No*i])
                 , W(Zijk_[k + No*i + No*No*j])
-                , A(std::conj(Tijk_[i + No*j + No*No*k]))
-                , B(std::conj(Tijk_[j + No*k + No*No*i]))
-                , C(std::conj(Tijk_[k + No*i + No*No*j]))
+                , A(maybeConjugate<F>(Tijk_[i + No*j + No*No*k]))
+                , B(maybeConjugate<F>(Tijk_[j + No*k + No*No*i]))
+                , C(maybeConjugate<F>(Tijk_[k + No*i + No*No*j]))
                 , value
                   = F(3.0) * ( A * U
                              + B * V
@@ -172,10 +172,8 @@ namespace atrip {
     , F const* TBChh
     // -- TIJK
     , F *Tijk
-    , atrip::Timings& chrono
     ) {
 
-    auto& t_reorder = chrono["doubles:reorder"];
     const size_t a = abc[0], b = abc[1], c = abc[2]
               , NoNo = No*No, NoNv = No*Nv
               ;
@@ -183,13 +181,13 @@ namespace atrip {
   #if defined(ATRIP_USE_DGEMM)
   #define _IJK_(i, j, k) i + j*No + k*NoNo
   #define REORDER(__II, __JJ, __KK)                                 \
-    t_reorder.start();                                              \
+    WITH_CHRONO("doubles:reorder",                                  \
     for (size_t k = 0; k < No; k++)                                 \
     for (size_t j = 0; j < No; j++)                                 \
     for (size_t i = 0; i < No; i++) {                               \
       Tijk[_IJK_(i, j, k)] += _t_buffer[_IJK_(__II, __JJ, __KK)];   \
     }                                                               \
-    t_reorder.stop();
+    )
   #define DGEMM_PARTICLES(__A, __B)      \
     atrip::xgemm<F>( "T"                 \
                    , "N"                 \
@@ -220,106 +218,100 @@ namespace atrip {
                    , _t_buffer.data()        \
                    , (int const*)&NoNo       \
                    );
-  #define MAYBE_CONJ(_conj, _buffer)                          \
-    if (traits::isComplex<F>()) {                             \
-      for (size_t __i = 0; __i < NoNoNo; ++__i)               \
-        _conj[__i] = std::conj(_buffer[__i]);                 \
-    } else {                                                  \
-      for (size_t __i = 0; __i < NoNoNo; ++__i)               \
-        _conj[__i] = _buffer[__i];                            \
-    }
+  #define MAYBE_CONJ(_conj, _buffer)                 \
+    for (size_t __i = 0; __i < NoNoNo; ++__i)        \
+      _conj[__i] = maybeConjugate<F>(_buffer[__i]);  // copy _buffer into _conj via maybeConjugate<F>; no trailing '\' so the next source line stays out of the macro
 
     const size_t NoNoNo = No*NoNo;
     std::vector<F> _t_buffer;
     _t_buffer.reserve(NoNoNo);
     F one{1.0}, m_one{-1.0}, zero{0.0};
 
-    t_reorder.start();
-    for (size_t k = 0; k < NoNoNo; k++) {
-      // zero the Tijk
-      Tijk[k] = 0.0;
-    }
-    t_reorder.stop();
+    WITH_CHRONO("doubles:reorder",  // same timer key the REORDER macro uses, so zeroing and reordering are accounted together
+      for (size_t k = 0; k < NoNoNo; k++) {
+         Tijk[k] = 0.0;  // zero the Tijk before the contractions accumulate into it
+       })
 
-    chrono["doubles:holes"].start();
-    { // Holes part ============================================================
+    // TOMERGE: replace chronos
+    WITH_CHRONO("doubles:holes",
+      { // Holes part ========================================================
 
-      std::vector<F> _vhhh(NoNoNo);
+        std::vector<F> _vhhh(NoNoNo);
 
-      // VhhhC[i + k*No + L*NoNo] * TABhh[L + j*No]; H1
-      MAYBE_CONJ(_vhhh, VhhhC)
-      chrono["doubles:holes:1"].start();
-      DGEMM_HOLES(_vhhh.data(), TABhh, "N")
-      REORDER(i, k, j)
-      chrono["doubles:holes:1"].stop();
-      // VhhhC[j + k*No + L*NoNo] * TABhh[i + L*No]; H0
-      chrono["doubles:holes:2"].start();
-      DGEMM_HOLES(_vhhh.data(), TABhh, "T")
-      REORDER(j, k, i)
-      chrono["doubles:holes:2"].stop();
+        // VhhhC[i + k*No + L*NoNo] * TABhh[L + j*No]; H1
+        MAYBE_CONJ(_vhhh, VhhhC)
+        WITH_CHRONO("doubles:holes:1",
+          DGEMM_HOLES(_vhhh.data(), TABhh, "N")
+          REORDER(i, k, j)
+        )
+        // VhhhC[j + k*No + L*NoNo] * TABhh[i + L*No]; H0
+        WITH_CHRONO("doubles:holes:2",
+          DGEMM_HOLES(_vhhh.data(), TABhh, "T")
+          REORDER(j, k, i)
+        )
 
-      // VhhhB[i + j*No + L*NoNo] * TAChh[L + k*No]; H5
-      MAYBE_CONJ(_vhhh, VhhhB)
-      chrono["doubles:holes:3"].start();
-      DGEMM_HOLES(_vhhh.data(), TAChh, "N")
-      REORDER(i, j, k)
-      chrono["doubles:holes:3"].stop();
-      // VhhhB[k + j*No + L*NoNo] * TAChh[i + L*No]; H3
-      chrono["doubles:holes:4"].start();
-      DGEMM_HOLES(_vhhh.data(), TAChh, "T")
-      REORDER(k, j, i)
-      chrono["doubles:holes:4"].stop();
+        // VhhhB[i + j*No + L*NoNo] * TAChh[L + k*No]; H5
+        MAYBE_CONJ(_vhhh, VhhhB)
+        WITH_CHRONO("doubles:holes:3",
+          DGEMM_HOLES(_vhhh.data(), TAChh, "N")
+          REORDER(i, j, k)
+        )
+        // VhhhB[k + j*No + L*NoNo] * TAChh[i + L*No]; H3
+        WITH_CHRONO("doubles:holes:4",
+          DGEMM_HOLES(_vhhh.data(), TAChh, "T")
+          REORDER(k, j, i)
+        )
 
-      // VhhhA[j + i*No + L*NoNo] * TBChh[L + k*No]; H1
-      MAYBE_CONJ(_vhhh, VhhhA)
-      chrono["doubles:holes:5"].start();
-      DGEMM_HOLES(_vhhh.data(), TBChh, "N")
-      REORDER(j, i, k)
-      chrono["doubles:holes:5"].stop();
-      // VhhhA[k + i*No + L*NoNo] * TBChh[j + L*No]; H4
-      chrono["doubles:holes:6"].start();
-      DGEMM_HOLES(_vhhh.data(), TBChh, "T")
-      REORDER(k, i, j)
-      chrono["doubles:holes:6"].stop();
+        // VhhhA[j + i*No + L*NoNo] * TBChh[L + k*No]; H1
+        MAYBE_CONJ(_vhhh, VhhhA)
+        WITH_CHRONO("doubles:holes:5",
+          DGEMM_HOLES(_vhhh.data(), TBChh, "N")
+          REORDER(j, i, k)
+        )
+        // VhhhA[k + i*No + L*NoNo] * TBChh[j + L*No]; H4
+        WITH_CHRONO("doubles:holes:6",
+          DGEMM_HOLES(_vhhh.data(), TBChh, "T")
+          REORDER(k, i, j)
+        )
 
-    }
-    chrono["doubles:holes"].stop();
+      }
+    )
   #undef MAYBE_CONJ
 
-    chrono["doubles:particles"].start();
-    { // Particle part =========================================================
-      // TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv]; P0
-      chrono["doubles:particles:1"].start();
-      DGEMM_PARTICLES(TAphh, VBCph)
-      REORDER(i, j, k)
-      chrono["doubles:particles:1"].stop();
-      // TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv]; P3
-      chrono["doubles:particles:2"].start();
-      DGEMM_PARTICLES(TAphh, VCBph)
-      REORDER(i, k, j)
-      chrono["doubles:particles:2"].stop();
-      // TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv]; P5
-      chrono["doubles:particles:3"].start();
-      DGEMM_PARTICLES(TCphh, VABph)
-      REORDER(k, i, j)
-      chrono["doubles:particles:3"].stop();
-      // TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv]; P2
-      chrono["doubles:particles:4"].start();
-      DGEMM_PARTICLES(TCphh, VBAph)
-      REORDER(k, j, i)
-      chrono["doubles:particles:4"].stop();
-      // TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv]; P1
-      chrono["doubles:particles:5"].start();
-      DGEMM_PARTICLES(TBphh, VACph)
-      REORDER(j, i, k)
-      chrono["doubles:particles:5"].stop();
-      // TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv]; P4
-      chrono["doubles:particles:6"].start();
-      DGEMM_PARTICLES(TBphh, VCAph)
-      REORDER(j, k, i)
-      chrono["doubles:particles:6"].stop();
-    }
-    chrono["doubles:particles"].stop();
+    WITH_CHRONO("doubles:particles",
+      { // Particle part =====================================================
+        // TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv]; P0
+        WITH_CHRONO("doubles:particles:1",
+          DGEMM_PARTICLES(TAphh, VBCph)
+          REORDER(i, j, k)
+        )
+        // TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv]; P3
+        WITH_CHRONO("doubles:particles:2",
+          DGEMM_PARTICLES(TAphh, VCBph)
+          REORDER(i, k, j)
+        )
+        // TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv]; P5
+        WITH_CHRONO("doubles:particles:3",
+          DGEMM_PARTICLES(TCphh, VABph)
+          REORDER(k, i, j)
+        )
+        // TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv]; P2
+        WITH_CHRONO("doubles:particles:4",
+          DGEMM_PARTICLES(TCphh, VBAph)
+          REORDER(k, j, i)
+        )
+        // TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv]; P1
+        WITH_CHRONO("doubles:particles:5",
+          DGEMM_PARTICLES(TBphh, VACph)
+          REORDER(j, i, k)
+        )
+        // TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv]; P4
+        WITH_CHRONO("doubles:particles:6",
+          DGEMM_PARTICLES(TBphh, VCAph)
+          REORDER(j, k, i)
+        )
+      }
+    )
 
   #undef REORDER
   #undef DGEMM_HOLES
diff --git a/include/atrip/RankMap.hpp b/include/atrip/RankMap.hpp
index 8564f9e..0e31a61 100644
--- a/include/atrip/RankMap.hpp
+++ b/include/atrip/RankMap.hpp
@@ -5,24 +5,38 @@
 #include <algorithm>
 
 #include <atrip/Slice.hpp>
+#include <atrip/Tuples.hpp>
 
 namespace atrip {
 
   template <typename F=double>
   struct RankMap {
 
+    static bool RANK_ROUND_ROBIN;
     std::vector<size_t> const lengths;
     size_t const np, size;
+    ClusterInfo const clusterInfo;
 
-    RankMap(std::vector<size_t> lens, size_t np_)
+    RankMap(std::vector<size_t> lens, size_t np_, MPI_Comm comm)
       : lengths(lens)
       , np(np_)
       , size(std::accumulate(lengths.begin(), lengths.end(),
                             1UL, std::multiplies<size_t>()))
+      , clusterInfo(getClusterInfo(comm))
     { assert(lengths.size() <= 2); }
 
     size_t find(typename Slice<F>::Location const& p) const noexcept {
-      return p.source * np + p.rank;
+      if (RANK_ROUND_ROBIN) {
+        return p.source * np + p.rank;
+      } else {
+        const size_t
+          rankPosition = p.source * clusterInfo.ranksPerNode
+                       + clusterInfo.rankInfos[p.rank].localRank
+                       ;
+        return rankPosition * clusterInfo.nNodes
+             + clusterInfo.rankInfos[p.rank].nodeId
+             ;
+      }
     }
 
     size_t nSources() const noexcept {
@@ -42,8 +56,9 @@ namespace atrip {
     }
 
     typename Slice<F>::Location
-    find(ABCTuple const& abc, typename Slice<F>::Type sliceType) const noexcept {
+    find(ABCTuple const& abc, typename Slice<F>::Type sliceType) const {
       // tuple = {11, 8} when abc = {11, 8, 9} and sliceType = AB
+      // tuple = {11, 0} when abc = {11, 8, 9} and sliceType = A
       const auto tuple = Slice<F>::subtupleBySlice(abc, sliceType);
 
       const size_t index
@@ -51,9 +66,51 @@ namespace atrip {
         + tuple[1] * (lengths.size() > 1 ? lengths[0] : 0)
         ;
 
+      size_t rank, source;
+
+      if (RANK_ROUND_ROBIN) {
+
+        rank = index % np;
+        source = index / np;
+
+      } else {
+
+        size_t const
+
+          // the node that will be assigned to
+            nodeId = index % clusterInfo.nNodes
+
+          // how many times it has been assigned to the node
+          , s_n = index / clusterInfo.nNodes
+
+          // which local rank in the node should be
+          , localRank = s_n % clusterInfo.ranksPerNode
+
+          // and the local source (how many times we chose this local rank)
+          , localSource = s_n / clusterInfo.ranksPerNode
+          ;
+
+        // find the localRank-th entry in clusterInfo
+        auto const& it =
+          std::find_if(clusterInfo.rankInfos.begin(),
+                       clusterInfo.rankInfos.end(),
+                       [nodeId, localRank](RankInfo const& ri) {
+                         return ri.nodeId == nodeId
+                             && ri.localRank == localRank
+                             ;
+                       });
+        if (it == clusterInfo.rankInfos.end()) {
+          throw "FATAL! Error in node distribution of the slices";
+        }
+
+        rank = (*it).globalRank;
+        source = localSource;
+
+      }
+
       return
-        { index % np
-        , index / np
+        { rank
+        , source
         };
     }
 
diff --git a/include/atrip/Slice.hpp b/include/atrip/Slice.hpp
index 877d72a..1f5889e 100644
--- a/include/atrip/Slice.hpp
+++ b/include/atrip/Slice.hpp
@@ -1,4 +1,4 @@
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*The%20slice][The slice:1]]
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Prolog][Prolog:1]]
 #pragma once
 #include <iostream>
 #include <algorithm>
@@ -11,6 +11,9 @@
 
 namespace atrip {
 
+template <typename FF> FF maybeConjugate(const FF a) { return a; }  // generic case: return the value unchanged
+template <> inline Complex maybeConjugate(const Complex a) { return std::conj(a); }  // Complex: conjugate; 'inline' because a full specialization defined in a header is otherwise a multiple definition
+
 namespace traits {
   template <typename FF> bool isComplex() { return false; };
   template <> bool isComplex<Complex>() { return true; };
@@ -24,401 +27,409 @@ namespace mpi {
 
 template <typename F=double>
 struct Slice {
-// The slice:1 ends here
+// Prolog:1 ends here
 
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*The%20slice][The slice:2]]
-// ASSOCIATED TYPES %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Location][Location:1]]
+struct Location { size_t rank; size_t source; };
+// Location:1 ends here
 
-  struct Location { size_t rank; size_t source; };
-
-  enum Type
-    { A = 10
-    , B
-    , C
-    // Two-parameter slices
-    , AB = 20
-    , BC
-    , AC
-    // for abci and the doubles
-    , CB
-    , BA
-    , CA
-    // The non-typed slice
-    , Blank = 404
-    };
-
-  enum State {
-    // Fetch represents the state where a slice is to be fetched
-    // and has a valid data pointer that can be written to
-    Fetch = 0,
-    // Dispatches represents the state that an MPI call has been
-    // dispatched in order to get the data, but the data has not been
-    // yet unwrapped, the data might be there or we might have to wait.
-    Dispatched = 2,
-    // Ready means that the data pointer can be read from
-    Ready = 1,
-    // Self sufficient is a slice when its contents are located
-    // in the same rank that it lives, so that it does not have to
-    // fetch from no one else.
-    SelfSufficient = 911,
-    // Recycled means that this slice gets its data pointer from another
-    // slice, so it should not be written to
-    Recycled = 123,
-    // Acceptor means that the Slice can accept a new Slice, it is
-    // the counterpart of the Blank type, but for states
-    Acceptor = 405
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Type][Type:1]]
+enum Type
+  { A = 10
+  , B
+  , C
+  // Two-parameter slices
+  , AB = 20
+  , BC
+  , AC
+  // for abci and the doubles
+  , CB
+  , BA
+  , CA
+  // The non-typed slice
+  , Blank = 404
   };
+// Type:1 ends here
 
-  struct Info {
-    // which part of a,b,c the slice holds
-    PartialTuple tuple;
-    // The type of slice for the user to retrieve the correct one
-    Type type;
-    // What is the state of the slice
-    State state;
-    // Where the slice is to be retrieved
-    // NOTE: this can actually be computed from tuple
-    Location from;
-    // If the data are actually to be found in this other slice
-    Type recycling;
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*State][State:1]]
+enum State {
+  Fetch = 0,  // the slice is to be fetched and has a valid data pointer that can be written to
+  Dispatched = 2,  // an MPI call has been dispatched to get the data, but the data has not been unwrapped yet
+  Ready = 1,  // the data pointer can be read from
+  SelfSufficient = 911,  // the contents are located in the same rank where the slice lives, so nothing has to be fetched
+  Recycled = 123,  // the slice gets its data pointer from another slice, so it should not be written to
+  Acceptor = 405  // the slice can accept a new slice; counterpart of the Blank type, but for states
+};
+// State:1 ends here
 
-    Info() : tuple{0,0}
-           , type{Blank}
-           , state{Acceptor}
-           , from{0,0}
-           , recycling{Blank}
-           {}
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*The%20Info%20structure][The Info structure:1]]
+struct Info {
+  // which part of a,b,c the slice holds
+  PartialTuple tuple;
+  // The type of slice for the user to retrieve the correct one
+  Type type;
+  // What is the state of the slice
+  State state;
+  // Where the slice is to be retrieved
+  Location from;
+  // If the data are actually to be found in this other slice
+  Type recycling;
+
+  Info() : tuple{0,0}
+          , type{Blank}
+          , state{Acceptor}
+          , from{0,0}
+          , recycling{Blank}
+          {}
+};
+
+using Ty_x_Tu = std::pair< Type, PartialTuple >;
+// The Info structure:1 ends here
+
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Name][Name:1]]
+enum Name
+  { TA    = 100
+  , VIJKA = 101
+  , VABCI = 200
+  , TABIJ = 201
+  , VABIJ = 202
   };
+// Name:1 ends here
 
-  using Ty_x_Tu = std::pair< Type, PartialTuple >;
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Database][Database:1]]
+struct LocalDatabaseElement {
+  Slice<F>::Name name;
+  Slice<F>::Info info;
+};
+// Database:1 ends here
 
-  // Names of the integrals that are considered in CCSD(T)
-  enum Name
-    { TA    = 100
-    , VIJKA = 101
-    , VABCI = 200
-    , TABIJ = 201
-    , VABIJ = 202
-    };
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Database][Database:2]]
+using LocalDatabase = std::vector<LocalDatabaseElement>;
+using Database = LocalDatabase;
+// Database:2 ends here
 
-  // DATABASE ==========================================================={{{1
-  struct LocalDatabaseElement {
-    Slice<F>::Name name;
-    Slice<F>::Info info;
-  };
-  using LocalDatabase = std::vector<LocalDatabaseElement>;
-  using Database = LocalDatabase;
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*MPI%20Types][MPI Types:1]]
+struct mpi {
 
-
-    // STATIC METHODS ===========================================================
-    //
-    // They are useful to organize the structure of slices
-
-    struct mpi {
-
-      static MPI_Datatype vector(size_t n, MPI_Datatype const& DT) {
-        MPI_Datatype dt;
-        MPI_Type_vector(n, 1, 1, DT, &dt);
-        MPI_Type_commit(&dt);
-        return dt;
-      }
-
-      static MPI_Datatype sliceLocation () {
-        constexpr int n = 2;
-        // create a sliceLocation to measure in the current architecture
-        // the packing of the struct
-        Slice<F>::Location measure;
-        MPI_Datatype dt;
-        const std::vector<int> lengths(n, 1);
-        const MPI_Datatype types[n] = {usizeDt(), usizeDt()};
-
-        // measure the displacements in the struct
-        size_t j = 0;
-        MPI_Aint displacements[n];
-        MPI_Get_address(&measure.rank,   &displacements[j++]);
-        MPI_Get_address(&measure.source, &displacements[j++]);
-        for (size_t i = 1; i < n; i++) displacements[i] -= displacements[0];
-        displacements[0] = 0;
-
-        MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt);
-        MPI_Type_commit(&dt);
-        return dt;
-      }
-
-      static MPI_Datatype enumDt() { return MPI_INT; }
-      static MPI_Datatype usizeDt() { return MPI_UINT64_T; }
-
-      static MPI_Datatype sliceInfo () {
-        constexpr int n = 5;
-        MPI_Datatype dt;
-        Slice<F>::Info measure;
-        const std::vector<int> lengths(n, 1);
-        const MPI_Datatype types[n]
-          = { vector(2, usizeDt())
-            , enumDt()
-            , enumDt()
-            , sliceLocation()
-            , enumDt()
-            };
-
-        // create the displacements from the info measurement struct
-        size_t j = 0;
-        MPI_Aint displacements[n];
-        MPI_Get_address(measure.tuple.data(), &displacements[j++]);
-        MPI_Get_address(&measure.type,        &displacements[j++]);
-        MPI_Get_address(&measure.state,       &displacements[j++]);
-        MPI_Get_address(&measure.from,        &displacements[j++]);
-        MPI_Get_address(&measure.recycling,   &displacements[j++]);
-        for (size_t i = 1; i < n; i++) displacements[i] -= displacements[0];
-        displacements[0] = 0;
-
-        MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt);
-        MPI_Type_commit(&dt);
-        return dt;
-      }
-
-      static MPI_Datatype localDatabaseElement () {
-        constexpr int n = 2;
-        MPI_Datatype dt;
-        LocalDatabaseElement measure;
-        const std::vector<int> lengths(n, 1);
-        const MPI_Datatype types[n]
-          = { enumDt()
-            , sliceInfo()
-            };
-
-        // measure the displacements in the struct
-        size_t j = 0;
-        MPI_Aint displacements[n];
-        MPI_Get_address(&measure.name, &displacements[j++]);
-        MPI_Get_address(&measure.info, &displacements[j++]);
-        for (size_t i = 1; i < n; i++) displacements[i] -= displacements[0];
-        displacements[0] = 0;
-
-        MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt);
-        MPI_Type_commit(&dt);
-        return dt;
-      }
-
-    };
-
-  static
-  PartialTuple subtupleBySlice(ABCTuple abc, Type sliceType) {
-    switch (sliceType) {
-      case AB: return {abc[0], abc[1]};
-      case BC: return {abc[1], abc[2]};
-      case AC: return {abc[0], abc[2]};
-      case CB: return {abc[2], abc[1]};
-      case BA: return {abc[1], abc[0]};
-      case CA: return {abc[2], abc[0]};
-      case  A: return {abc[0], 0};
-      case  B: return {abc[1], 0};
-      case  C: return {abc[2], 0};
-      default: throw "Switch statement not exhaustive!";
-    }
+  static MPI_Datatype vector(size_t n, MPI_Datatype const& DT) {
+    MPI_Datatype dt;
+    MPI_Type_vector(n, 1, 1, DT, &dt);
+    MPI_Type_commit(&dt);
+    return dt;
   }
 
+  static MPI_Datatype sliceLocation () {
+    constexpr int n = 2;
+    // create a sliceLocation to measure in the current architecture
+    // the packing of the struct
+    Slice<F>::Location measure;
+    MPI_Datatype dt;
+    const std::vector<int> lengths(n, 1);
+    const MPI_Datatype types[n] = {usizeDt(), usizeDt()};
 
-    /**
-     * It is important here to return a reference to a Slice
-     * not to accidentally copy the associated buffer of the slice.
-     */
-    static Slice<F>& findOneByType(std::vector<Slice<F>> &slices, Slice<F>::Type type) {
-        const auto sliceIt
-          = std::find_if(slices.begin(), slices.end(),
-                         [&type](Slice<F> const& s) {
-                           return type == s.info.type;
-                         });
-        WITH_CRAZY_DEBUG
-        WITH_RANK
-          << "\t__ looking for " << type << "\n";
-        if (sliceIt == slices.end())
-          throw std::domain_error("Slice by type not found!");
-        return *sliceIt;
-    }
+    static_assert(sizeof(Slice<F>::Location) == 2 * sizeof(size_t),
+                  "The Location packing is wrong in your compiler");
 
-    /*
-     * Check if an info has
-     *
-     */
-    static std::vector<Slice<F>*> hasRecycledReferencingToIt
-      ( std::vector<Slice<F>> &slices
-      , Info const& info
-      ) {
-      std::vector<Slice<F>*> result;
+    // measure the displacements in the struct
+    size_t j = 0;
+    MPI_Aint base_address, displacements[n];
+    MPI_Get_address(&measure,        &base_address);
+    MPI_Get_address(&measure.rank,   &displacements[j++]);
+    MPI_Get_address(&measure.source, &displacements[j++]);
+    for (size_t i = 0; i < n; i++)
+      displacements[i] = MPI_Aint_diff(displacements[i], base_address);
 
-      for (auto& s: slices)
-        if (  s.info.recycling == info.type
-           && s.info.tuple == info.tuple
-           && s.info.state == Recycled
-           ) result.push_back(&s);
+    MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt);
+    MPI_Type_commit(&dt);
+    return dt;
+  }
 
-      return result;
-    }
+  static MPI_Datatype usizeDt() { return MPI_UINT64_T; }
 
-    static Slice<F>&
-    findRecycledSource (std::vector<Slice<F>> &slices, Slice<F>::Info info) {
-      const auto sliceIt
-        = std::find_if(slices.begin(), slices.end(),
-                       [&info](Slice<F> const& s) {
-                         return info.recycling == s.info.type
-                             && info.tuple == s.info.tuple
-                             && State::Recycled != s.info.state
-                             ;
-                       });
+  static MPI_Datatype sliceInfo () {
+    constexpr int n = 5;
+    MPI_Datatype dt;
+    Slice<F>::Info measure;
+    const std::vector<int> lengths(n, 1);
+    const MPI_Datatype types[n]
+      = { vector(2, usizeDt())
+        , vector(sizeof(enum Type), MPI_CHAR)
+        , vector(sizeof(enum State), MPI_CHAR)
+        , sliceLocation()
+        , vector(sizeof(enum Type), MPI_CHAR)
+        // TODO: Why this does not work on intel mpi?
+        /*, MPI_UINT64_T*/
+        };
 
-      WITH_CRAZY_DEBUG
-      WITH_RANK << "__slice__:find: recycling source of "
-                << pretty_print(info) << "\n";
-      if (sliceIt == slices.end())
-        throw std::domain_error( "Slice not found: "
-                               + pretty_print(info)
-                               + " rank: "
-                               + pretty_print(Atrip::rank)
-                               );
-      WITH_RANK << "__slice__:find: " << pretty_print(sliceIt->info) << "\n";
-      return *sliceIt;
-    }
+    static_assert(sizeof(enum Type)  == 4, "Enum type not 4 bytes long");
+    static_assert(sizeof(enum State) == 4, "Enum State not 4 bytes long");
+    static_assert(sizeof(enum Name)  == 4, "Enum Name not 4 bytes long");
 
-    static Slice<F>& findByTypeAbc
-      ( std::vector<Slice<F>> &slices
-      , Slice<F>::Type type
-      , ABCTuple const& abc
-      ) {
-        const auto tuple = Slice<F>::subtupleBySlice(abc, type);
-        const auto sliceIt
-          = std::find_if(slices.begin(), slices.end(),
-                         [&type, &tuple](Slice<F> const& s) {
-                           return type == s.info.type
-                               && tuple == s.info.tuple
-                               ;
-                         });
-        WITH_CRAZY_DEBUG
-        WITH_RANK << "__slice__:find:" << type << " and tuple "
-                  << pretty_print(tuple)
-                  << "\n";
-        if (sliceIt == slices.end())
-          throw std::domain_error( "Slice not found: "
-                                 + pretty_print(tuple)
-                                 + ", "
-                                 + pretty_print(type)
-                                 + " rank: "
-                                 + pretty_print(Atrip::rank)
-                                 );
-        return *sliceIt;
-    }
+    // create the displacements from the info measurement struct
+    size_t j = 0;
+    MPI_Aint base_address, displacements[n];
+    MPI_Get_address(&measure,             &base_address);
+    MPI_Get_address(&measure.tuple[0],    &displacements[j++]);
+    MPI_Get_address(&measure.type,        &displacements[j++]);
+    MPI_Get_address(&measure.state,       &displacements[j++]);
+    MPI_Get_address(&measure.from,        &displacements[j++]);
+    MPI_Get_address(&measure.recycling,   &displacements[j++]);
+    for (size_t i = 0; i < n; i++)
+      displacements[i] = MPI_Aint_diff(displacements[i], base_address);
 
-    static Slice<F>& findByInfo(std::vector<Slice<F>> &slices,
-                             Slice<F>::Info const& info) {
-        const auto sliceIt
-          = std::find_if(slices.begin(), slices.end(),
-                         [&info](Slice<F> const& s) {
-                           // TODO: maybe implement comparison in Info struct
-                           return info.type == s.info.type
-                               && info.state == s.info.state
-                               && info.tuple == s.info.tuple
-                               && info.from.rank == s.info.from.rank
-                               && info.from.source == s.info.from.source
-                                ;
-                         });
-        WITH_CRAZY_DEBUG
-        WITH_RANK << "__slice__:find:looking for " << pretty_print(info) << "\n";
-        if (sliceIt == slices.end())
-          throw std::domain_error( "Slice by info not found: "
-                                 + pretty_print(info));
-        return *sliceIt;
-    }
+    MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt);
+    MPI_Type_commit(&dt);
+    return dt;
+  }
 
-    // SLICE DEFINITION  =================================================={{{1
+  static MPI_Datatype localDatabaseElement () {
+    constexpr int n = 2;
+    MPI_Datatype dt;
+    LocalDatabaseElement measure;
+    const std::vector<int> lengths(n, 1);
+    const MPI_Datatype types[n]
+      = { vector(sizeof(enum Name), MPI_CHAR)
+        , sliceInfo()
+        };
 
-    // ATTRIBUTES ============================================================
-    Info info;
-    F  *data;
-    MPI_Request request;
-    const size_t size;
+    // measure the displacements in the struct
+    size_t j = 0;
+    MPI_Aint base_address, displacements[n];
+    MPI_Get_address(&measure,      &base_address);
+    MPI_Get_address(&measure.name, &displacements[j++]);
+    MPI_Get_address(&measure.info, &displacements[j++]);
+    for (size_t i = 0; i < n; i++)
+      displacements[i] = MPI_Aint_diff(displacements[i], base_address);
 
-    void markReady() noexcept {
-      info.state = Ready;
-      info.recycling = Blank;
-    }
+    static_assert( sizeof(LocalDatabaseElement) == sizeof(measure)
+                 , "Measure has bad size");
 
-    /*
-     * This means that the data is there
-     */
-    bool isUnwrapped() const noexcept {
-      return info.state == Ready
-          || info.state == SelfSufficient
-          ;
-    }
+    MPI_Type_create_struct(n, lengths.data(), displacements, types, &dt);
+    MPI_Type_commit(&dt);
+    return vector(sizeof(LocalDatabaseElement), MPI_CHAR);
+    // TODO: write tests in order to know if this works
+    return dt;
+  }
 
-    bool isUnwrappable() const noexcept {
-      return isUnwrapped()
-          || info.state == Recycled
-          || info.state == Dispatched
-          ;
-    }
+};
+// MPI Types:1 ends here
 
-    inline bool isDirectlyFetchable() const noexcept {
-      return info.state == Ready || info.state == Dispatched;
-    }
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Static%20utilities][Static utilities:1]]
+static
+PartialTuple subtupleBySlice(ABCTuple abc, Type sliceType) {
+  switch (sliceType) {
+    case AB: return {abc[0], abc[1]};
+    case BC: return {abc[1], abc[2]};
+    case AC: return {abc[0], abc[2]};
+    case CB: return {abc[2], abc[1]};
+    case BA: return {abc[1], abc[0]};
+    case CA: return {abc[2], abc[0]};
+    case  A: return {abc[0], 0};
+    case  B: return {abc[1], 0};
+    case  C: return {abc[2], 0};
+    default: throw "Switch statement not exhaustive!";
+  }
+}
+// Static utilities:1 ends here
 
-    void free() noexcept {
-      info.tuple      = {0, 0};
-      info.type       = Blank;
-      info.state      = Acceptor;
-      info.from       = {0, 0};
-      info.recycling  = Blank;
-      data            = nullptr;
-    }
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Static%20utilities][Static utilities:2]]
+static std::vector<Slice<F>*> hasRecycledReferencingToIt
+  ( std::vector<Slice<F>> &slices
+  , Info const& info
+  ) {
+  std::vector<Slice<F>*> result;
 
-    inline bool isFree() const noexcept {
-      return info.tuple       == PartialTuple{0, 0}
-          && info.type        == Blank
-          && info.state       == Acceptor
-          && info.from.rank   == 0
-          && info.from.source == 0
-          && info.recycling   == Blank
-          && data             == nullptr
-           ;
-    }
+  for (auto& s: slices)
+    if (  s.info.recycling == info.type
+       && s.info.tuple == info.tuple
+       && s.info.state == Recycled
+       ) result.push_back(&s);
 
+  return result;
+}
+// Static utilities:2 ends here
 
-    /*
-     * This function answers the question, which slices can be recycled.
-     *
-     * A slice can only be recycled if it is Fetch or Ready and has
-     * a valid datapointer.
-     *
-     * In particular, SelfSufficient are not recyclable, since it is easier
-     * just to create a SelfSufficient slice than deal with data dependencies.
-     *
-     * Furthermore, a recycled slice is not recyclable, if this is the case
-     * then it is either bad design or a bug.
-     */
-    inline bool isRecyclable() const noexcept {
-      return (  info.state == Dispatched
-             || info.state == Ready
-             || info.state == Fetch
-             )
-          && hasValidDataPointer()
-          ;
-    }
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Static%20utilities][Static utilities:3]]
+static Slice<F>& findOneByType(std::vector<Slice<F>> &slices, Slice<F>::Type type) {
+    const auto sliceIt
+      = std::find_if(slices.begin(), slices.end(),
+                     [&type](Slice<F> const& s) {
+                       return type == s.info.type;
+                     });
+    WITH_CRAZY_DEBUG
+    WITH_RANK
+      << "\t__ looking for " << type << "\n";
+    if (sliceIt == slices.end())
+      throw std::domain_error("Slice by type not found!");
+    return *sliceIt;
+}
+// Static utilities:3 ends here
 
-    /*
-     * This function describes if a slice has a valid data pointer.
-     *
-     * This is important to know if the slice has some data to it, also
-     * some structural checks are done, so that it should not be Acceptor
-     * or Blank, if this is the case then it is a bug.
-     */
-    inline bool hasValidDataPointer() const noexcept {
-      return data       != nullptr
-          && info.state != Acceptor
-          && info.type  != Blank
-          ;
-    }
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Static%20utilities][Static utilities:4]]
+static Slice<F>&
+findRecycledSource (std::vector<Slice<F>> &slices, Slice<F>::Info info) {
+  const auto sliceIt
+    = std::find_if(slices.begin(), slices.end(),
+                   [&info](Slice<F> const& s) {
+                     return info.recycling == s.info.type
+                         && info.tuple == s.info.tuple
+                         && State::Recycled != s.info.state
+                         ;
+                   });
 
-    void unwrapAndMarkReady() {
+  WITH_CRAZY_DEBUG
+  WITH_RANK << "__slice__:find: recycling source of "
+            << pretty_print(info) << "\n";
+  if (sliceIt == slices.end())
+    throw std::domain_error( "Slice not found: "
+                           + pretty_print(info)
+                           + " rank: "
+                           + pretty_print(Atrip::rank)
+                           );
+  WITH_RANK << "__slice__:find: " << pretty_print(sliceIt->info) << "\n";
+  return *sliceIt;
+}
+// Static utilities:4 ends here
+
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Static%20utilities][Static utilities:5]]
+static Slice<F>& findByTypeAbc
+  ( std::vector<Slice<F>> &slices
+  , Slice<F>::Type type
+  , ABCTuple const& abc
+  ) {
+    const auto tuple = Slice<F>::subtupleBySlice(abc, type);
+    const auto sliceIt
+      = std::find_if(slices.begin(), slices.end(),
+                     [&type, &tuple](Slice<F> const& s) {
+                       return type == s.info.type
+                           && tuple == s.info.tuple
+                           ;
+                     });
+    WITH_CRAZY_DEBUG
+    WITH_RANK << "__slice__:find:" << type << " and tuple "
+              << pretty_print(tuple)
+              << "\n";
+    if (sliceIt == slices.end())
+      throw std::domain_error( "Slice not found: "
+                             + pretty_print(tuple)
+                             + ", "
+                             + pretty_print(type)
+                             + " rank: "
+                             + pretty_print(Atrip::rank)
+                             );
+    return *sliceIt;
+}
+// Static utilities:5 ends here
+
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Static%20utilities][Static utilities:6]]
+static Slice<F>& findByInfo(std::vector<Slice<F>> &slices,
+                         Slice<F>::Info const& info) {
+  const auto sliceIt
+    = std::find_if(slices.begin(), slices.end(),
+                   [&info](Slice<F> const& s) {
+                     // TODO: maybe implement comparison in Info struct
+                     return info.type == s.info.type
+                         && info.state == s.info.state
+                         && info.tuple == s.info.tuple
+                         && info.from.rank == s.info.from.rank
+                         && info.from.source == s.info.from.source
+                          ;
+                   });
+  WITH_CRAZY_DEBUG
+  WITH_RANK << "__slice__:find:looking for " << pretty_print(info) << "\n";
+  if (sliceIt == slices.end())
+    throw std::domain_error( "Slice by info not found: "
+                           + pretty_print(info));
+  return *sliceIt;
+}
+// Static utilities:6 ends here
+
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Attributes][Attributes:1]]
+Info info;
+// Attributes:1 ends here
+
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Attributes][Attributes:2]]
+F  *data;
+// Attributes:2 ends here
+
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Attributes][Attributes:3]]
+MPI_Request request;
+// Attributes:3 ends here
+
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Attributes][Attributes:4]]
+const size_t size;
+// Attributes:4 ends here
+
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Member%20functions][Member functions:1]]
+void markReady() noexcept {
+  info.state = Ready;
+  info.recycling = Blank;
+}
+// Member functions:1 ends here
+
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Member%20functions][Member functions:2]]
+bool isUnwrapped() const noexcept {
+  return info.state == Ready
+      || info.state == SelfSufficient
+      ;
+}
+// Member functions:2 ends here
+
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Member%20functions][Member functions:3]]
+bool isUnwrappable() const noexcept {
+  return isUnwrapped()
+      || info.state == Recycled
+      || info.state == Dispatched
+      ;
+}
+
+inline bool isDirectlyFetchable() const noexcept {
+  return info.state == Ready || info.state == Dispatched;
+}
+
+void free() noexcept {
+  info.tuple      = {0, 0};
+  info.type       = Blank;
+  info.state      = Acceptor;
+  info.from       = {0, 0};
+  info.recycling  = Blank;
+  data            = nullptr;
+}
+
+inline bool isFree() const noexcept {
+  return info.tuple       == PartialTuple{0, 0}
+      && info.type        == Blank
+      && info.state       == Acceptor
+      && info.from.rank   == 0
+      && info.from.source == 0
+      && info.recycling   == Blank
+      && data             == nullptr
+       ;
+}
+// Member functions:3 ends here
+
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Member%20functions][Member functions:4]]
+inline bool isRecyclable() const noexcept {
+  return (  info.state == Dispatched
+         || info.state == Ready
+         || info.state == Fetch
+         )
+      && hasValidDataPointer()
+      ;
+}
+// Member functions:4 ends here
+
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Member%20functions][Member functions:5]]
+inline bool hasValidDataPointer() const noexcept {
+  return data       != nullptr
+      && info.state != Acceptor
+      && info.type  != Blank
+      ;
+}
+// Member functions:5 ends here
+
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Member%20functions][Member functions:6]]
+void unwrapAndMarkReady() {
       if (info.state == Ready) return;
       if (info.state != Dispatched)
         throw
@@ -447,17 +458,20 @@ struct Slice {
                 << "\n";
 #endif
     }
+// Member functions:6 ends here
 
-    Slice(size_t size_)
-      : info({})
-      , data(nullptr)
-      , size(size_)
-      {}
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Epilog][Epilog:1]]
+Slice(size_t size_)
+    : info({})
+    , data(nullptr)
+    , size(size_)
+    {}
 
 
-  }; // struct Slice
-
+}; // struct Slice
+// Epilog:1 ends here
 
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Debug][Debug:1]]
 template <typename F=double>
 std::ostream& operator<<(std::ostream& out, typename Slice<F>::Location const& v) {
   // TODO: remove me
@@ -476,4 +490,4 @@ std::ostream& operator<<(std::ostream& out, typename Slice<F>::Info const& i) {
 }
 
 } // namespace atrip
-// The slice:2 ends here
+// Debug:1 ends here
diff --git a/include/atrip/SliceUnion.hpp b/include/atrip/SliceUnion.hpp
index ec7aff6..365ad51 100644
--- a/include/atrip/SliceUnion.hpp
+++ b/include/atrip/SliceUnion.hpp
@@ -179,8 +179,14 @@ namespace atrip {
           if (blank.info.state == Slice<F>::SelfSufficient) {
             blank.data = sources[from.source].data();
           } else {
-            if (freePointers.size() == 0)
-              throw std::domain_error("No more free pointers!");
+            if (freePointers.size() == 0) {
+              std::stringstream stream;
+              stream << "No more free pointers "
+                     << "for type " << type
+                     << " and name " << name
+                      ;
+              throw std::domain_error(stream.str());
+            }
             auto dataPointer = freePointers.begin();
             freePointers.erase(dataPointer);
             blank.data = *dataPointer;
@@ -314,7 +320,8 @@ namespace atrip {
           // at this point, let us blank the slice
           WITH_RANK << "~~~:cl(" << name << ")"
                     << " freeing up slice "
-                    // TODO: make this possible
+                    // TODO: make this possible because of Templates
+                    // TODO: there is a deduction error here
                     // << " info " << slice.info
                     << "\n";
           slice.free();
@@ -334,7 +341,7 @@ namespace atrip {
               , typename Slice<F>::Name name_
               , size_t nSliceBuffers = 4
               )
-              : rankMap(paramLength, np)
+              : rankMap(paramLength, np, global_world)
               , world(child_world)
               , universe(global_world)
               , sliceLength(sliceLength_)
@@ -353,7 +360,7 @@ namespace atrip {
 
       slices
         = std::vector<Slice<F>>(2 * sliceTypes.size(), { sources[0].size() });
-      // TODO: think exactly ^------------------- about this number
+      // TODO: think exactly    ^------------------- about this number
 
       // initialize the freePointers with the pointers to the buffers
       std::transform(sliceBuffers.begin(), sliceBuffers.end(),
@@ -421,10 +428,11 @@ namespace atrip {
      * \brief Send asynchronously only if the state is Fetch
      */
     void send( size_t otherRank
-             , typename Slice<F>::Info const& info
+             , typename Slice<F>::LocalDatabaseElement const& el
              , size_t tag) const noexcept {
       MPI_Request request;
       bool sendData_p = false;
+      auto const& info = el.info;
 
       if (info.state == Slice<F>::Fetch) sendData_p = true;
       // TODO: remove this because I have SelfSufficient
@@ -539,8 +547,11 @@ namespace atrip {
                       [&name](SliceUnion<F> const* s) {
                         return name == s->name;
                       });
-      if (sliceUnionIt == unions.end())
-        throw std::domain_error("SliceUnion not found!");
+      if (sliceUnionIt == unions.end()) {
+        std::stringstream stream;
+        stream << "SliceUnion(" << name << ") not found!";
+        throw std::domain_error(stream.str());
+      }
       return **sliceUnionIt;
   }
 
diff --git a/include/atrip/Tuples.hpp b/include/atrip/Tuples.hpp
index 5d4b69f..c41b78a 100644
--- a/include/atrip/Tuples.hpp
+++ b/include/atrip/Tuples.hpp
@@ -1,75 +1,538 @@
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Tuples][Tuples:1]]
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Prolog][Prolog:1]]
 #pragma once
 
 #include <vector>
 #include <array>
 #include <numeric>
 
+// TODO: remove some
+#include <stdio.h>
+#include <math.h>
+#include <algorithm>
+#include <map>
+#include <cassert>
+#include <chrono>
+#include <climits>
+#include <mpi.h>
+
 #include <atrip/Utils.hpp>
 #include <atrip/Debug.hpp>
 
 namespace atrip {
+// Prolog:1 ends here
 
-  using ABCTuple = std::array<size_t, 3>;
-  using PartialTuple = std::array<size_t, 2>;
-  using ABCTuples = std::vector<ABCTuple>;
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Tuples%20types][Tuples types:1]]
+using ABCTuple = std::array<size_t, 3>;
+using PartialTuple = std::array<size_t, 2>;
+using ABCTuples = std::vector<ABCTuple>;
 
-  ABCTuples getTuplesList(size_t Nv) {
-    const size_t n = Nv * (Nv + 1) * (Nv + 2) / 6 - Nv;
-    ABCTuples result(n);
-    size_t u(0);
+constexpr ABCTuple FAKE_TUPLE = {0, 0, 0};
+constexpr ABCTuple INVALID_TUPLE = {1, 1, 1};
+// Tuples types:1 ends here
 
-    for (size_t a(0); a < Nv; a++)
-    for (size_t b(a); b < Nv; b++)
-    for (size_t c(b); c < Nv; c++){
-      if ( a == b && b == c ) continue;
-      result[u++] = {a, b, c};
-    }
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Distributing%20the%20tuples][Distributing the tuples:1]]
+struct TuplesDistribution {
+  virtual ABCTuples getTuples(size_t Nv, MPI_Comm universe) = 0;
+  virtual bool tupleIsFake(ABCTuple const& t) { return t == FAKE_TUPLE; }
+};
+// Distributing the tuples:1 ends here
 
-    return result;
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Node%20information][Node information:1]]
+// Gather the processor name of every rank in comm.
+// Returns a vector indexed by rank holding the name reported by
+// MPI_Get_processor_name on that rank. Collective over comm.
+std::vector<std::string> getNodeNames(MPI_Comm comm){
+  int rank, np;
+  MPI_Comm_rank(comm, &rank);
+  MPI_Comm_size(comm, &np);
 
+  std::vector<std::string> nodeList(np);
+  char nodeName[MPI_MAX_PROCESSOR_NAME];
+  // Receive buffer on the heap: a runtime-sized array is not standard C++
+  // and np * MPI_MAX_PROCESSOR_NAME bytes may not fit on the stack.
+  std::vector<char> nodeNames(size_t(np) * MPI_MAX_PROCESSOR_NAME);
+  // nameLengths[i]: length of rank i's name, off[i]: its offset in nodeNames
+  std::vector<int> nameLengths(np), off(np);
+  int nameLength;
+  MPI_Get_processor_name(nodeName, &nameLength);
+  MPI_Allgather(&nameLength, 1, MPI_INT,
+                nameLengths.data(), 1, MPI_INT,
+                comm);
+  // exclusive prefix sum of the lengths, off[0] stays 0
+  for (int i(1); i < np; i++)
+    off[i] = off[i-1] + nameLengths[i-1];
+  MPI_Allgatherv(nodeName,
+                 nameLengths[rank],
+                 MPI_BYTE,
+                 nodeNames.data(),
+                 nameLengths.data(),
+                 off.data(),
+                 MPI_BYTE,
+                 comm);
+  for (int i(0); i < np; i++) {
+    // rank i's name occupies nameLengths[i] bytes starting at off[i]
+    nodeList[i].assign(&nodeNames[off[i]], nameLengths[i]);
   }
+  return nodeList;
+}
+// Node information:1 ends here
 
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Node%20information][Node information:2]]
+struct RankInfo {
+  const std::string name;
+  const size_t nodeId;
+  const size_t globalRank;
+  const size_t localRank;
+  const size_t ranksPerNode;
+};
 
-  std::pair<size_t, size_t>
-  getABCRange(size_t np, size_t rank, ABCTuples const& tuplesList) {
-
-    std::vector<size_t> n_tuples_per_rank(np, tuplesList.size()/np);
-    const size_t
-        // how many valid tuples should we still verteilen to nodes
-        // since the number of tuples is not divisible by the number of nodes
-        nRoundRobin = tuplesList.size() % np
-        // every node must have the sanme amount of tuples in order for the
-        // other nodes to receive and send somewhere, therefore
-        // some nodes will get extra tuples but that are dummy tuples
-      , nExtraInvalid = (np - nRoundRobin) % np
-      ;
-
-    if (nRoundRobin) for (int i = 0; i < np; i++) n_tuples_per_rank[i]++;
-
-  #if defined(TODO)
-    assert( tuplesList.size()
-            ==
-            ( std::accumulate(n_tuples_per_rank.begin(),
-                              n_tuples_per_rank.end(),
-                              0UL,
-                              std::plus<size_t>())
-            + nExtraInvalid
-            ));
-  #endif
-
-    WITH_RANK << "nRoundRobin = " << nRoundRobin << "\n";
-    WITH_RANK << "nExtraInvalid = " << nExtraInvalid << "\n";
-    WITH_RANK << "ntuples = " << n_tuples_per_rank[rank] << "\n";
-
-    auto const& it = n_tuples_per_rank.begin();
-
-    return
-      { std::accumulate(it, it + rank    , 0)
-      , std::accumulate(it, it + rank + 1, 0)
-      };
+// Sorted copy of xs in which every value appears exactly once
+template <typename A>
+A unique(A const &xs) {
+  A sorted(xs);
+  std::sort(sorted.begin(), sorted.end());
+  sorted.erase(std::unique(sorted.begin(), sorted.end()), sorted.end());
+  return sorted;
+}
 
+// One RankInfo per entry of nodeNames (entry i becomes globalRank i); nodeId
+// indexes the sorted unique names, localRank numbers ranks sharing a name.
+std::vector<RankInfo>
+getNodeInfos(std::vector<std::string> const& nodeNames) {
+  std::vector<RankInfo> result;
+  auto const uniqueNames = unique(nodeNames);
+  auto const index = [&uniqueNames](std::string const& s) {
+    auto const& it = std::find(uniqueNames.begin(), uniqueNames.end(), s);
+    return std::distance(uniqueNames.begin(), it);
+  };
+  std::vector<size_t> localRanks(uniqueNames.size(), 0);
+  size_t globalRank = 0;
+  for (auto const& name: nodeNames) {
+    const size_t nodeId = index(name);
+    // std::count returns a signed difference_type; convert explicitly since
+    // narrowing inside the braced initializer below is ill-formed
+    const auto ranksPerNode = static_cast<size_t>(
+      std::count(nodeNames.begin(), nodeNames.end(), name));
+    result.push_back({name, nodeId, globalRank++,
+                      localRanks[nodeId]++, ranksPerNode});
   }
+  return result;
+}
+
+struct ClusterInfo {
+  const size_t nNodes, np, ranksPerNode;
+  const std::vector<RankInfo> rankInfos;
+};
+
+ClusterInfo
+getClusterInfo(MPI_Comm comm) {
+  auto const names = getNodeNames(comm);
+  auto const rankInfos = getNodeInfos(names);
+
+  return ClusterInfo {
+    unique(names).size(),
+    names.size(),
+    rankInfos[0].ranksPerNode,
+    rankInfos
+  };
 
 }
-// Tuples:1 ends here
+// Node information:2 ends here
+
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Naive%20list][Naive list:1]]
+ABCTuples getTuplesList(size_t Nv, size_t rank, size_t np) {
+
+  const size_t
+    // total number of tuples for the problem
+       n = Nv * (Nv + 1) * (Nv + 2) / 6 - Nv
+
+    // all ranks should have the same number of tuples_per_rank
+    , tuples_per_rank = n / np + size_t(n % np != 0)
+
+    // start index for the global tuples list
+    , start = tuples_per_rank * rank
+
+    // end index for the global tuples list
+    , end = tuples_per_rank * (rank + 1)
+    ;
+
+  LOG(1,"Atrip") << "tuples_per_rank = " << tuples_per_rank << "\n";
+  WITH_RANK << "start, end = " << start << ", " << end << "\n";
+  ABCTuples result(tuples_per_rank, FAKE_TUPLE);
+
+  for (size_t a(0), r(0), g(0); a < Nv; a++)
+  for (size_t b(a);             b < Nv; b++)
+  for (size_t c(b);             c < Nv; c++){
+    if ( a == b && b == c ) continue;
+    if ( start <= g && g < end) result[r++] = {a, b, c};
+    g++;
+  }
+
+  return result;
+
+}
+// Naive list:1 ends here
+
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Naive%20list][Naive list:2]]
+// Enumerate every tuple a <= b <= c < Nv except the diagonal a == b == c
+ABCTuples getAllTuplesList(const size_t Nv) {
+  ABCTuples tuples;
+  tuples.reserve(Nv * (Nv + 1) * (Nv + 2) / 6 - Nv);
+  for (size_t a = 0; a < Nv; ++a)
+    for (size_t b = a; b < Nv; ++b)
+      for (size_t c = b; c < Nv; ++c) {
+        const bool diagonal = (a == b) && (b == c);
+        if (!diagonal) tuples.push_back({a, b, c});
+      }
+
+  return tuples;
+}
+// Naive list:2 ends here
+
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Naive%20list][Naive list:3]]
+struct NaiveDistribution : public TuplesDistribution {
+  ABCTuples getTuples(size_t Nv, MPI_Comm universe) override {
+    int rank, np;
+    MPI_Comm_rank(universe, &rank);
+    MPI_Comm_size(universe, &np);
+    return getTuplesList(Nv, (size_t)rank, (size_t)np);
+  }
+};
+// Naive list:3 ends here
+
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Prolog][Prolog:1]]
+namespace group_and_sort {
+// Prolog:1 ends here
+
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Utils][Utils:1]]
+// Provides the node on which the slice-element is found
+// Right now we distribute the slices in a round robin fashion
+// over the different nodes (NOTE: not mpi ranks but nodes)
+inline
+size_t isOnNode(size_t tuple, size_t nNodes) { return tuple % nNodes; }
+
+
+// return the node (or all nodes) where the elements of this
+// tuple are located
+std::vector<size_t> getTupleNodes(ABCTuple const& t, size_t nNodes) {
+  std::vector<size_t>
+    nTuple = { isOnNode(t[0], nNodes)
+             , isOnNode(t[1], nNodes)
+             , isOnNode(t[2], nNodes)
+             };
+  return unique(nTuple);
+}
+
+struct Info {
+  size_t nNodes;
+  size_t nodeId;
+};
+// Utils:1 ends here
+
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Distribution][Distribution:1]]
+ABCTuples specialDistribution(Info const& info, ABCTuples const& allTuples) {
+
+  ABCTuples nodeTuples;
+  size_t const nNodes(info.nNodes);
+
+  std::vector<ABCTuples>
+      container1d(nNodes)
+    , container2d(nNodes * nNodes)
+    , container3d(nNodes * nNodes * nNodes)
+    ;
+
+  if (info.nodeId == 0)
+    std::cout << "\tGoing through all "
+              << allTuples.size()
+              << " tuples in "
+              << nNodes
+              << " nodes\n";
+
+  // build container-n-d's
+  for (auto const& t: allTuples) {
+    // on which node(s) are the tuple elements located...
+    // put them into the right container
+    auto const _nodes = getTupleNodes(t, nNodes);
+
+    switch (_nodes.size()) {
+      case 1:
+        container1d[_nodes[0]].push_back(t);
+        break;
+      case 2:
+        container2d[ _nodes[0]
+                   + _nodes[1] * nNodes
+                   ].push_back(t);
+        break;
+      case 3:
+        container3d[ _nodes[0]
+                   + _nodes[1] * nNodes
+                   + _nodes[2] * nNodes * nNodes
+                   ].push_back(t);
+        break;
+    }
+
+  }
+
+  if (info.nodeId == 0)
+    std::cout << "\tBuilding 1-d containers\n";
+  // DISTRIBUTE 1-d containers
+  // every tuple which is only located at one node belongs to this node
+  {
+    auto const& _tuples = container1d[info.nodeId];
+    nodeTuples.resize(_tuples.size(), INVALID_TUPLE);
+    std::copy(_tuples.begin(), _tuples.end(), nodeTuples.begin());
+  }
+
+  if (info.nodeId == 0)
+    std::cout << "\tBuilding 2-d containers\n";
+  // DISTRIBUTE 2-d containers
+  //the tuples which are located at two nodes are half/half given to these nodes
+  for (size_t yx = 0; yx < container2d.size(); yx++) {
+
+    auto const& _tuples = container2d[yx];
+      const
+    size_t idx = yx % nNodes
+         // remember: yx = idy * nNodes + idx
+         , idy = yx / nNodes
+         , n_half = _tuples.size() / 2
+         , size = nodeTuples.size()
+         ;
+
+    size_t nbeg, nend;
+    if (info.nodeId == idx) {
+      nbeg = 0 * n_half;
+      nend = n_half;
+    } else if (info.nodeId == idy) {
+      nbeg = 1 * n_half;
+      nend = _tuples.size();
+    } else {
+      // either idx or idy is my node
+      continue;
+    }
+
+    size_t const nextra = nend - nbeg;
+    nodeTuples.resize(size + nextra, INVALID_TUPLE);
+    std::copy(_tuples.begin() + nbeg,
+              _tuples.begin() + nend,
+              nodeTuples.begin() + size);
+
+  }
+
+  if (info.nodeId == 0)
+    std::cout << "\tBuilding 3-d containers\n";
+  // DISTRIBUTE 3-d containers
+  for (size_t zyx = 0; zyx < container3d.size(); zyx++) {
+    auto const& _tuples = container3d[zyx];
+
+      const
+    size_t idx = zyx % nNodes
+         , idy = (zyx / nNodes) % nNodes
+         // remember: zyx = idx + idy * nNodes + idz * nNodes^2
+         , idz = zyx / nNodes / nNodes
+         , n_third = _tuples.size() / 3
+         , size = nodeTuples.size()
+         ;
+
+    size_t nbeg, nend;
+    if (info.nodeId == idx) {
+      nbeg = 0 * n_third;
+      nend = 1 * n_third;
+    } else if (info.nodeId == idy) {
+      nbeg = 1 * n_third;
+      nend = 2 * n_third;
+    } else if (info.nodeId == idz) {
+      nbeg = 2 * n_third;
+      nend = _tuples.size();
+    } else {
+      // either idx or idy or idz is my node
+      continue;
+    }
+
+    size_t const nextra = nend - nbeg;
+    nodeTuples.resize(size + nextra, INVALID_TUPLE);
+    std::copy(_tuples.begin() + nbeg,
+              _tuples.begin() + nend,
+              nodeTuples.begin() + size);
+
+  }
+
+
+  if (info.nodeId == 0) std::cout << "\tswapping tuples...\n";
+  /*
+   *  sort part of group-and-sort algorithm
+   *  every tuple on a given node is sorted in a way that
+   *  the 'home elements' are the fastest index.
+   *  1:yyy 2:yyn(x) 3:yny(x) 4:ynn(x) 5:nyy 6:nyn(x) 7:nny 8:nnn
+   */
+  for (auto &nt: nodeTuples){
+    if ( isOnNode(nt[0], nNodes) == info.nodeId ){ // 1234
+      if ( isOnNode(nt[2], nNodes) != info.nodeId ){ // 24
+        size_t const x(nt[0]);
+        nt[0] = nt[2];         // switch first and last
+        nt[2] = x;
+      }
+      else if ( isOnNode(nt[1], nNodes) != info.nodeId){ // 3
+        size_t const x(nt[0]);
+        nt[0] = nt[1];         // switch first two
+        nt[1] = x;
+      }
+    } else {
+      if ( isOnNode(nt[1], nNodes) == info.nodeId   // 56
+        && isOnNode(nt[2], nNodes) != info.nodeId
+        ) { // 6
+        size_t const x(nt[1]);
+        nt[1] = nt[2];         // switch last two
+        nt[2] = x;
+      }
+    }
+  }
+
+  if (info.nodeId == 0) std::cout << "\tsorting list of tuples...\n";
+  //now we sort the list of tuples
+  std::sort(nodeTuples.begin(), nodeTuples.end());
+
+  if (info.nodeId == 0) std::cout << "\trestoring tuples...\n";
+  // we bring the tuples abc back in the order a<b<c
+  for (auto &t: nodeTuples)  std::sort(t.begin(), t.end());
+
+#if ATRIP_DEBUG > 1
+  if (info.nodeId == 0)
+  std::cout << "checking for validity of " << nodeTuples.size() << std::endl;
+  const bool anyInvalid
+    = std::any_of(nodeTuples.begin(),
+                  nodeTuples.end(),
+                  [](ABCTuple const& t) { return t == INVALID_TUPLE; });
+  if (anyInvalid) throw "Some tuple is invalid in group-and-sort algorithm";
+#endif
+
+  if (info.nodeId == 0) std::cout << "\treturning tuples...\n";
+  return nodeTuples;
+
+}
+// Distribution:1 ends here
+
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Main][Main:1]]
+// Group-and-sort entry point: returns the tuples assigned to the calling rank.
+// The first rank of every node computes that node's tuple list, pads it with
+// FAKE_TUPLEs so that all ranks of universe end up with the same number of
+// tuples, and scatters it over the ranks of the node. Collective on universe.
+std::vector<ABCTuple> main(MPI_Comm universe, size_t Nv) {
+
+  int rank, np;
+  MPI_Comm_rank(universe, &rank);
+  MPI_Comm_size(universe, &np);
+
+  std::vector<ABCTuple> result;
+
+  auto const nodeNames(getNodeNames(universe));
+  size_t const nNodes = unique(nodeNames).size();
+  auto const nodeInfos = getNodeInfos(nodeNames);
+
+  // Only one rank per node, the one with local rank 0, computes the
+  // distribution for its node
+  bool const computeDistribution
+    = nodeInfos[rank].localRank == 0;
+
+  std::vector<ABCTuple>
+    nodeTuples
+      = computeDistribution
+      ? specialDistribution(Info{nNodes, nodeInfos[rank].nodeId},
+                            getAllTuplesList(Nv))
+      : std::vector<ABCTuple>()
+      ;
+
+  LOG(1,"Atrip") << "got nodeTuples\n";
+
+  // now we have to send the data from **one** rank on each node
+  // to all others ranks of this node
+    const
+  int color = nodeInfos[rank].nodeId
+    , key = nodeInfos[rank].localRank
+    ;
+
+  MPI_Comm INTRA_COMM;
+  MPI_Comm_split(universe, color, key, &INTRA_COMM);
+// Main:1 ends here
+
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Main][Main:2]]
+size_t const
+  tuplesPerRankLocal
+     = nodeTuples.size() / nodeInfos[rank].ranksPerNode
+     + size_t(nodeTuples.size() % nodeInfos[rank].ranksPerNode != 0)
+     ;
+
+size_t tuplesPerRankGlobal;
+
+// every rank needs the maximum over all ranks, so reduce and distribute at once
+MPI_Allreduce(&tuplesPerRankLocal,
+              &tuplesPerRankGlobal,
+              1,
+              MPI_UINT64_T,
+              MPI_MAX,
+              universe);
+
+LOG(1,"Atrip") << "Tuples per rank: " << tuplesPerRankGlobal << "\n";
+LOG(1,"Atrip") << "ranks per node " << nodeInfos[rank].ranksPerNode << "\n";
+LOG(1,"Atrip") << "#nodes " << nNodes << "\n";
+// Main:2 ends here
+
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Main][Main:3]]
+size_t const totalTuples
+  = tuplesPerRankGlobal * nodeInfos[rank].ranksPerNode;
+
+if (computeDistribution) {
+  // pad with FAKE_TUPLEs
+  nodeTuples.insert(nodeTuples.end(),
+                    totalTuples - nodeTuples.size(),
+                    FAKE_TUPLE);
+}
+// Main:3 ends here
+
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Main][Main:4]]
+{
+  // construct mpi type for abctuple; the element count comes from the type
+  // itself because nodeTuples is empty on ranks that did not compute it
+  MPI_Datatype MPI_ABCTUPLE;
+  MPI_Type_vector(std::tuple_size<ABCTuple>::value, 1, 1, MPI_UINT64_T, &MPI_ABCTUPLE);
+  MPI_Type_commit(&MPI_ABCTUPLE);
+
+  LOG(1,"Atrip") << "scattering tuples \n";
+
+  result.resize(tuplesPerRankGlobal);
+  MPI_Scatter(nodeTuples.data(),
+              tuplesPerRankGlobal,
+              MPI_ABCTUPLE,
+              result.data(),
+              tuplesPerRankGlobal,
+              MPI_ABCTUPLE,
+              0,
+              INTRA_COMM);
+
+  MPI_Type_free(&MPI_ABCTUPLE);
+  // the per-node communicator is not needed past this point
+  MPI_Comm_free(&INTRA_COMM);
+
+}
+// Main:4 ends here
+
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Main][Main:5]]
+return result;
+
+}
+// Main:5 ends here
+
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Interface][Interface:1]]
+struct Distribution : public TuplesDistribution {
+  ABCTuples getTuples(size_t Nv, MPI_Comm universe) override {
+    return main(universe, Nv);
+  }
+};
+// Interface:1 ends here
+
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Epilog][Epilog:1]]
+} // namespace group_and_sort
+// Epilog:1 ends here
+
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Epilog][Epilog:1]]
+}
+// Epilog:1 ends here
diff --git a/include/atrip/Unions.hpp b/include/atrip/Unions.hpp
index db3b6b7..e651ef9 100644
--- a/include/atrip/Unions.hpp
+++ b/include/atrip/Unions.hpp
@@ -59,7 +59,7 @@ namespace atrip {
                           , child_world
                           , global_world
                           , Slice<F>::TA
-                          , 4) {
+                          , 6) {
            init(sourceTensor);
          }
 
@@ -97,7 +97,7 @@ namespace atrip {
                          , child_world
                          , global_world
                          , Slice<F>::VIJKA
-                         , 4) {
+                         , 6) {
            init(sourceTensor);
          }
 
diff --git a/include/atrip/Utils.hpp b/include/atrip/Utils.hpp
index bff3d19..83656c6 100644
--- a/include/atrip/Utils.hpp
+++ b/include/atrip/Utils.hpp
@@ -1,4 +1,4 @@
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Utils][Utils:1]]
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Prolog][Prolog:1]]
 #pragma once
 #include <sstream>
 #include <string>
@@ -6,32 +6,41 @@
 #include <chrono>
 
 #include <ctf.hpp>
+#include <atrip/Debug.hpp>
 
 namespace atrip {
+// Prolog:1 ends here
 
-
-  template <typename T>
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Pretty%20printing][Pretty printing:1]]
+template <typename T>
   std::string pretty_print(T&& value) {
     std::stringstream stream;
-#if ATRIP_DEBUG > 1
+#if ATRIP_DEBUG > 2
     dbg::pretty_print(stream, std::forward<T>(value));
 #endif
     return stream.str();
   }
+// Pretty printing:1 ends here
 
-#define WITH_CHRONO(__chrono, ...) \
-  __chrono.start(); __VA_ARGS__ __chrono.stop();
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Chrono][Chrono:1]]
+#define WITH_CHRONO(__chrono_name, ...)         \
+  Atrip::chrono[__chrono_name].start();         \
+  __VA_ARGS__                                   \
+  Atrip::chrono[__chrono_name].stop();
 
-  struct Timer {
-    using Clock = std::chrono::high_resolution_clock;
-    using Event = std::chrono::time_point<Clock>;
-    std::chrono::duration<double> duration;
-    Event _start;
-    inline void start() noexcept { _start = Clock::now(); }
-    inline void stop() noexcept { duration += Clock::now() - _start; }
-    inline void clear() noexcept { duration *= 0; }
-    inline double count() const noexcept { return duration.count(); }
-  };
-  using Timings = std::map<std::string, Timer>;
+struct Timer {  // accumulating stopwatch: each start()/stop() pair adds its elapsed time to `duration`
+  using Clock = std::chrono::high_resolution_clock;
+  using Event = std::chrono::time_point<Clock>;
+  std::chrono::duration<double> duration{0.0};  // total in seconds; zero even for a default-initialised Timer
+  Event _start;                                 // time point recorded by the most recent start()
+  inline void start() noexcept { _start = Clock::now(); }
+  inline void stop() noexcept { duration += Clock::now() - _start; }
+  inline void clear() noexcept { duration = std::chrono::duration<double>::zero(); }
+  inline double count() const noexcept { return duration.count(); }  // accumulated seconds
+};
+using Timings = std::map<std::string, Timer>;
+// Chrono:1 ends here
+
+// [[file:~/cc4s/src/atrip/complex/atrip.org::*Epilog][Epilog:1]]
 }
-// Utils:1 ends here
+// Epilog:1 ends here
diff --git a/src/atrip/Atrip.cxx b/src/atrip/Atrip.cxx
index fc613b6..b7823de 100644
--- a/src/atrip/Atrip.cxx
+++ b/src/atrip/Atrip.cxx
@@ -9,8 +9,11 @@
 
 using namespace atrip;
 
+template <> bool RankMap<Complex>::RANK_ROUND_ROBIN = false;  // explicit specialisation; the initialiser makes it a definition
+template <> bool RankMap<double>::RANK_ROUND_ROBIN = false;   // Atrip::run assigns RankMap<F>::RANK_ROUND_ROBIN from in.rankRoundRobin
 int Atrip::rank;
 int Atrip::np;
+Timings Atrip::chrono;
 
 // user printing block
 IterationDescriptor IterationDescription::descriptor;
@@ -30,28 +33,35 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
   const int rank = Atrip::rank;
   MPI_Comm universe = in.ei->wrld->comm;
 
-  // Timings in seconds ================================================{{{1
-  Timings chrono{};
-
   const size_t No = in.ei->lens[0];
   const size_t Nv = in.ea->lens[0];
   LOG(0,"Atrip") << "No: " << No << "\n";
   LOG(0,"Atrip") << "Nv: " << Nv << "\n";
+  LOG(0,"Atrip") << "np: " << np << "\n";
 
   // allocate the three scratches, see piecuch
-  std::vector<F>   Tijk(No*No*No) // doubles only (see piecuch)
-                 , Zijk(No*No*No) // singles + doubles (see piecuch)
-                 // we need local copies of the following tensors on every
-                 // rank
-                 , epsi(No)
-                 , epsa(Nv)
-                 , Tai(No * Nv)
-                 ;
+  std::vector<F> Tijk(No*No*No) // doubles only (see piecuch)
+               , Zijk(No*No*No) // singles + doubles (see piecuch)
+               // we need local copies of the following tensors on every
+               // rank
+               , epsi(No)
+               , epsa(Nv)
+               , Tai(No * Nv)
+               ;
 
   in.ei->read_all(epsi.data());
   in.ea->read_all(epsa.data());
   in.Tph->read_all(Tai.data());
 
+  RankMap<F>::RANK_ROUND_ROBIN = in.rankRoundRobin;
+  if (RankMap<F>::RANK_ROUND_ROBIN) {
+    LOG(0,"Atrip") << "Doing rank round robin slices distribution" << "\n";
+  } else {
+    LOG(0,"Atrip")
+      << "Doing node > local rank round robin slices distribution" << "\n";
+  }
+
+
   // COMMUNICATOR CONSTRUCTION ========================================={{{1
   //
   // Construct a new communicator living only on a single rank
@@ -72,41 +82,49 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
   }
 
 
-  chrono["nv-slices"].start();
   // BUILD SLICES PARAMETRIZED BY NV ==================================={{{1
-  LOG(0,"Atrip") << "BUILD NV-SLICES\n";
-  TAPHH<F> taphh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
-  HHHA<F>  hhha(*in.Vhhhp, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
-  chrono["nv-slices"].stop();
+  WITH_CHRONO("nv-slices",
+    LOG(0,"Atrip") << "BUILD NV-SLICES\n";
+    TAPHH<F> taphh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
+    HHHA<F>  hhha(*in.Vhhhp, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
+  )
 
-  chrono["nv-nv-slices"].start();
   // BUILD SLICES PARAMETRIZED BY NV x NV =============================={{{1
-  LOG(0,"Atrip") << "BUILD NV x NV-SLICES\n";
-  ABPH<F> abph(*in.Vppph, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
-  ABHH<F> abhh(*in.Vpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
-  TABHH<F> tabhh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
-  chrono["nv-nv-slices"].stop();
+  WITH_CHRONO("nv-nv-slices",
+    LOG(0,"Atrip") << "BUILD NV x NV-SLICES\n";
+    ABPH<F> abph(*in.Vppph, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
+    ABHH<F> abhh(*in.Vpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
+    TABHH<F> tabhh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
+  )
 
   // all tensors
   std::vector< SliceUnion<F>* > unions = {&taphh, &hhha, &abph, &abhh, &tabhh};
 
-  //CONSTRUCT TUPLE LIST ==============================================={{{1
-  LOG(0,"Atrip") << "BUILD TUPLE LIST\n";
-  const auto tuplesList = std::move(getTuplesList(Nv));
-  WITH_RANK << "tupList.size() = " << tuplesList.size() << "\n";
+  // get tuples for the current rank
+  TuplesDistribution *distribution;
 
-  // GET ABC INDEX RANGE FOR RANK ======================================{{{1
-  auto abcIndex = getABCRange(np, rank, tuplesList);
-  size_t nIterations = abcIndex.second - abcIndex.first;
+  if (in.tuplesDistribution == Atrip::Input<F>::TuplesDistribution::NAIVE) {
+    LOG(0,"Atrip") << "Using the naive distribution\n";
+    distribution = new NaiveDistribution();
+  } else {
+    LOG(0,"Atrip") << "Using the group-and-sort distribution\n";
+    distribution = new group_and_sort::Distribution();
+  }
 
-  WITH_RANK << "abcIndex = " << pretty_print(abcIndex) << "\n";
-  LOG(0,"Atrip") << "#iterations: " << nIterations << "\n";
+  LOG(0,"Atrip") << "BUILDING TUPLE LIST\n";
+  WITH_CHRONO("tuples:build",
+    auto const tuplesList = distribution->getTuples(Nv, universe);
+    )
+  const size_t nIterations = tuplesList.size();
 
-  // first abc
-  const ABCTuple firstAbc = tuplesList[abcIndex.first];
-
-
-  double energy(0.);
+  {
+    // prints <tuples handled by this rank> / <that count multiplied by np>
+    LOG(0,"Atrip") << "#iterations: "
+                  << nIterations
+                  << "/"
+                  << nIterations * np
+                  << "\n";
+  }
 
   const size_t
       iterationMod = (in.percentageMod > 0)
@@ -119,7 +137,9 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
 
 
   auto const isFakeTuple
-    = [&tuplesList](size_t const i) { return i >= tuplesList.size(); };
+    = [&tuplesList, distribution](size_t const i) {
+      return distribution->tupleIsFake(tuplesList[i]);
+    };
 
 
   using Database = typename Slice<F>::Database;
@@ -127,45 +147,42 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
   auto communicateDatabase
     = [ &unions
       , np
-      , &chrono
       ] (ABCTuple const& abc, MPI_Comm const& c) -> Database {
 
-        chrono["db:comm:type:do"].start();
-        auto MPI_LDB_ELEMENT = Slice<F>::mpi::localDatabaseElement();
-        chrono["db:comm:type:do"].stop();
+        WITH_CHRONO("db:comm:type:do",
+          auto MPI_LDB_ELEMENT = Slice<F>::mpi::localDatabaseElement();
+        )
 
-        chrono["db:comm:ldb"].start();
-        LocalDatabase ldb;
-
-        for (auto const& tensor: unions) {
-          auto const& tensorDb = tensor->buildLocalDatabase(abc);
-          ldb.insert(ldb.end(), tensorDb.begin(), tensorDb.end());
-        }
-        chrono["db:comm:ldb"].stop();
+        WITH_CHRONO("db:comm:ldb",
+          typename Slice<F>::LocalDatabase ldb;
+          for (auto const& tensor: unions) {
+            auto const& tensorDb = tensor->buildLocalDatabase(abc);
+            ldb.insert(ldb.end(), tensorDb.begin(), tensorDb.end());
+          }
+        )
 
         Database db(np * ldb.size(), ldb[0]);
 
-        chrono["oneshot-db:comm:allgather"].start();
-        chrono["db:comm:allgather"].start();
-        MPI_Allgather( ldb.data()
-                     , ldb.size()
-                     , MPI_LDB_ELEMENT
-                     , db.data()
-                     , ldb.size()
-                     , MPI_LDB_ELEMENT
-                     , c);
-        chrono["db:comm:allgather"].stop();
-        chrono["oneshot-db:comm:allgather"].stop();
+        WITH_CHRONO("oneshot-db:comm:allgather",
+        WITH_CHRONO("db:comm:allgather",
+          MPI_Allgather( ldb.data()
+                       , ldb.size()
+                       , MPI_LDB_ELEMENT
+                       , db.data()
+                       , ldb.size()
+                       , MPI_LDB_ELEMENT
+                       , c);
+        ))
 
-        chrono["db:comm:type:free"].start();
-        MPI_Type_free(&MPI_LDB_ELEMENT);
-        chrono["db:comm:type:free"].stop();
+        WITH_CHRONO("db:comm:type:free",
+          MPI_Type_free(&MPI_LDB_ELEMENT);
+        )
 
         return db;
       };
 
   auto doIOPhase
-    = [&unions, &rank, &np, &universe, &chrono] (Database const& db) {
+    = [&unions, &rank, &np, &universe] (Database const& db) {
 
     const size_t localDBLength = db.size() / np;
 
@@ -201,9 +218,9 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
           << "\n"
           ;
 
-        chrono["db:io:recv"].start();
-        u.receive(el.info, recvTag);
-        chrono["db:io:recv"].stop();
+        WITH_CHRONO("db:io:recv",
+          u.receive(el.info, recvTag);
+        )
 
       } // recv
     }
@@ -237,9 +254,9 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
           << "\n"
           ;
 
-        chrono["db:io:send"].start();
-        u.send(otherRank, el.info, sendTag);
-        chrono["db:io:send"].stop();
+        WITH_CHRONO("db:io:send",
+          u.send(otherRank, el, sendTag);
+        )
 
       } // send phase
 
@@ -257,31 +274,30 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
     * double(No)
     * double(No)
     * (double(No) + double(Nv))
-    * 2
-    * 6
+    * 2.0
+    * (traits::isComplex<F>() ? 2.0 : 1.0)
+    * 6.0
     / 1e9
     ;
 
   // START MAIN LOOP ======================================================{{{1
 
-  for ( size_t i = abcIndex.first, iteration = 1
-      ; i < abcIndex.second
+  double energy(0.);
+
+  for ( size_t i = 0, iteration = 1
+      ; i < tuplesList.size()
       ; i++, iteration++
       ) {
-    chrono["iterations"].start();
-
+    Atrip::chrono["iterations"].start();
 
     // check overhead from chrono over all iterations
-    chrono["start:stop"].start(); chrono["start:stop"].stop();
+    WITH_CHRONO("start:stop", {})
 
     // check overhead of doing a barrier at the beginning
-    chrono["oneshot-mpi:barrier"].start();
-    chrono["mpi:barrier"].start();
-    // TODO: REMOVE
-    if (in.barrier == 1)
-    MPI_Barrier(universe);
-    chrono["mpi:barrier"].stop();
-    chrono["oneshot-mpi:barrier"].stop();
+    WITH_CHRONO("oneshot-mpi:barrier",
+    WITH_CHRONO("mpi:barrier",
+      if (in.barrier) MPI_Barrier(universe);
+    ))
 
     if (iteration % iterationMod == 0 || iteration == iteration1Percent) {
 
@@ -289,22 +305,22 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
         IterationDescription::descriptor({
           iteration,
           nIterations,
-          chrono["iterations"].count()
+          Atrip::chrono["iterations"].count()
         });
       }
 
       LOG(0,"Atrip")
         << "iteration " << iteration
         << " [" << 100 * iteration / nIterations << "%]"
-        << " (" << doublesFlops * iteration / chrono["doubles"].count()
+        << " (" << doublesFlops * iteration / Atrip::chrono["doubles"].count()
         << "GF)"
-        << " (" << doublesFlops * iteration / chrono["iterations"].count()
+        << " (" << doublesFlops * iteration / Atrip::chrono["iterations"].count()
         << "GF)"
         << " ===========================\n";
 
       // PRINT TIMINGS
       if (in.chrono)
-      for (auto const& pair: chrono)
+      for (auto const& pair: Atrip::chrono)
         LOG(1, " ") << pair.first << " :: "
                     << pair.second.count()
                     << std::endl;
@@ -314,46 +330,43 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
     const ABCTuple abc = isFakeTuple(i)
                        ? tuplesList[tuplesList.size() - 1]
                        : tuplesList[i]
-                 , *abcNext = i == (abcIndex.second - 1)
+                 , *abcNext = i == (tuplesList.size() - 1)
                             ? nullptr
-                            : isFakeTuple(i + 1)
-                            ? &tuplesList[tuplesList.size() - 1]
                             : &tuplesList[i + 1]
                  ;
 
-    chrono["with_rank"].start();
-    WITH_RANK << " :it " << iteration
-              << " :abc " << pretty_print(abc)
-              << " :abcN "
-              << (abcNext ? pretty_print(*abcNext) : "None")
-              << "\n";
-    chrono["with_rank"].stop();
+    WITH_CHRONO("with_rank",
+      WITH_RANK << " :it " << iteration
+                << " :abc " << pretty_print(abc)
+                << " :abcN "
+                << (abcNext ? pretty_print(*abcNext) : "None")
+                << "\n";
+    )
 
 
     // COMM FIRST DATABASE ================================================{{{1
-    if (i == abcIndex.first) {
+    if (i == 0) {
       WITH_RANK << "__first__:first database ............ \n";
-      const auto __db = communicateDatabase(abc, universe);
+      const auto db = communicateDatabase(abc, universe);
       WITH_RANK << "__first__:first database communicated \n";
       WITH_RANK << "__first__:first database io phase \n";
-      doIOPhase(__db);
+      doIOPhase(db);
       WITH_RANK << "__first__:first database io phase DONE\n";
       WITH_RANK << "__first__::::Unwrapping all slices for first database\n";
       for (auto& u: unions) u->unwrapAll(abc);
-      WITH_RANK << "__first__::::Unwrapping all slices for first database DONE\n";
+      WITH_RANK << "__first__::::Unwrapping slices for first database DONE\n";
       MPI_Barrier(universe);
     }
 
     // COMM NEXT DATABASE ================================================={{{1
     if (abcNext) {
       WITH_RANK << "__comm__:" << iteration << "th communicating database\n";
-      chrono["db:comm"].start();
-      //const auto db = communicateDatabase(*abcNext, universe);
-      Database db = communicateDatabase(*abcNext, universe);
-      chrono["db:comm"].stop();
-      chrono["db:io"].start();
-      doIOPhase(db);
-      chrono["db:io"].stop();
+      WITH_CHRONO("db:comm",
+        const auto db = communicateDatabase(*abcNext, universe);
+      )
+      WITH_CHRONO("db:io",
+        doIOPhase(db);
+      )
       WITH_RANK << "__comm__:" <<  iteration << "th database io phase DONE\n";
     }
 
@@ -361,63 +374,61 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
     OCD_Barrier(universe);
     if (!isFakeTuple(i)) {
       WITH_RANK << iteration << "-th doubles\n";
-      WITH_CHRONO(chrono["oneshot-unwrap"],
-      WITH_CHRONO(chrono["unwrap"],
-      WITH_CHRONO(chrono["unwrap:doubles"],
+      WITH_CHRONO("oneshot-unwrap",
+      WITH_CHRONO("unwrap",
+      WITH_CHRONO("unwrap:doubles",
         for (auto& u: decltype(unions){&abph, &hhha, &taphh, &tabhh}) {
           u->unwrapAll(abc);
         }
       )))
-      chrono["oneshot-doubles"].start();
-      chrono["doubles"].start();
-      doublesContribution<F>( abc, (size_t)No, (size_t)Nv
-                            // -- VABCI
-                            , abph.unwrapSlice(Slice<F>::AB, abc)
-                            , abph.unwrapSlice(Slice<F>::AC, abc)
-                            , abph.unwrapSlice(Slice<F>::BC, abc)
-                            , abph.unwrapSlice(Slice<F>::BA, abc)
-                            , abph.unwrapSlice(Slice<F>::CA, abc)
-                            , abph.unwrapSlice(Slice<F>::CB, abc)
-                            // -- VHHHA
-                            , hhha.unwrapSlice(Slice<F>::A, abc)
-                            , hhha.unwrapSlice(Slice<F>::B, abc)
-                            , hhha.unwrapSlice(Slice<F>::C, abc)
-                            // -- TA
-                            , taphh.unwrapSlice(Slice<F>::A, abc)
-                            , taphh.unwrapSlice(Slice<F>::B, abc)
-                            , taphh.unwrapSlice(Slice<F>::C, abc)
-                            // -- TABIJ
-                            , tabhh.unwrapSlice(Slice<F>::AB, abc)
-                            , tabhh.unwrapSlice(Slice<F>::AC, abc)
-                            , tabhh.unwrapSlice(Slice<F>::BC, abc)
-                            // -- TIJK
-                            , Tijk.data()
-                            , chrono
-                            );
-      WITH_RANK << iteration << "-th doubles done\n";
-      chrono["doubles"].stop();
-      chrono["oneshot-doubles"].stop();
+      WITH_CHRONO("oneshot-doubles",
+      WITH_CHRONO("doubles",
+        doublesContribution<F>( abc, (size_t)No, (size_t)Nv
+                              // -- VABCI
+                              , abph.unwrapSlice(Slice<F>::AB, abc)
+                              , abph.unwrapSlice(Slice<F>::AC, abc)
+                              , abph.unwrapSlice(Slice<F>::BC, abc)
+                              , abph.unwrapSlice(Slice<F>::BA, abc)
+                              , abph.unwrapSlice(Slice<F>::CA, abc)
+                              , abph.unwrapSlice(Slice<F>::CB, abc)
+                              // -- VHHHA
+                              , hhha.unwrapSlice(Slice<F>::A, abc)
+                              , hhha.unwrapSlice(Slice<F>::B, abc)
+                              , hhha.unwrapSlice(Slice<F>::C, abc)
+                              // -- TA
+                              , taphh.unwrapSlice(Slice<F>::A, abc)
+                              , taphh.unwrapSlice(Slice<F>::B, abc)
+                              , taphh.unwrapSlice(Slice<F>::C, abc)
+                              // -- TABIJ
+                              , tabhh.unwrapSlice(Slice<F>::AB, abc)
+                              , tabhh.unwrapSlice(Slice<F>::AC, abc)
+                              , tabhh.unwrapSlice(Slice<F>::BC, abc)
+                              // -- TIJK
+                              , Tijk.data()
+                              );
+        WITH_RANK << iteration << "-th doubles done\n";
+      ))
     }
 
     // COMPUTE SINGLES =================================================== {{{1
     OCD_Barrier(universe);
     if (!isFakeTuple(i)) {
-      WITH_CHRONO(chrono["oneshot-unwrap"],
-      WITH_CHRONO(chrono["unwrap"],
-      WITH_CHRONO(chrono["unwrap:singles"],
+      WITH_CHRONO("oneshot-unwrap",
+      WITH_CHRONO("unwrap",
+      WITH_CHRONO("unwrap:singles",
         abhh.unwrapAll(abc);
       )))
-      chrono["reorder"].start();
-      for (size_t I(0); I < Zijk.size(); I++) Zijk[I] = Tijk[I];
-      chrono["reorder"].stop();
-      chrono["singles"].start();
+      WITH_CHRONO("reorder",
+        for (size_t I(0); I < Zijk.size(); I++) Zijk[I] = Tijk[I];
+      )
+      WITH_CHRONO("singles",
       singlesContribution<F>( No, Nv, abc
                             , Tai.data()
                             , abhh.unwrapSlice(Slice<F>::AB, abc)
                             , abhh.unwrapSlice(Slice<F>::AC, abc)
                             , abhh.unwrapSlice(Slice<F>::BC, abc)
                             , Zijk.data());
-      chrono["singles"].stop();
+      )
     }
 
 
@@ -430,12 +441,12 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
       if (abc[1] == abc[2]) distinct--;
       const F epsabc(epsa[abc[0]] + epsa[abc[1]] + epsa[abc[2]]);
 
-      chrono["energy"].start();
-      if ( distinct == 0)
-        tupleEnergy = getEnergyDistinct<F>(epsabc, epsi, Tijk, Zijk);
-      else
-        tupleEnergy = getEnergySame<F>(epsabc, epsi, Tijk, Zijk);
-      chrono["energy"].stop();
+      WITH_CHRONO("energy",
+        if ( distinct == 0)
+          tupleEnergy = getEnergyDistinct<F>(epsabc, epsi, Tijk, Zijk);
+        else
+          tupleEnergy = getEnergySame<F>(epsabc, epsi, Tijk, Zijk);
+      )
 
 #if defined(HAVE_OCD) || defined(ATRIP_PRINT_TUPLES)
       tupleEnergies[abc] = tupleEnergy;
@@ -445,6 +456,7 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
 
     }
 
+    // TODO: remove this
     if (isFakeTuple(i)) {
       // fake iterations should also unwrap whatever they got
       WITH_RANK << iteration
@@ -466,7 +478,6 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
     // CLEANUP UNIONS ===================================================={{{1
     OCD_Barrier(universe);
     if (abcNext) {
-      chrono["gc"].start();
       WITH_RANK << "__gc__:" << iteration << "-th cleaning up.......\n";
       for (auto& u: unions) {
 
@@ -500,12 +511,11 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
 
 
       }
-      chrono["gc"].stop();
     }
 
       WITH_RANK << iteration << "-th cleaning up....... DONE\n";
 
-    chrono["iterations"].stop();
+    Atrip::chrono["iterations"].stop();
     // ITERATION END ====================================================={{{1
 
   }
@@ -543,15 +553,15 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
 
   // PRINT TIMINGS {{{1
   if (in.chrono)
-  for (auto const& pair: chrono)
+  for (auto const& pair: Atrip::chrono)
     LOG(0,"atrip:chrono") << pair.first << " "
                           << pair.second.count() << std::endl;
 
 
   LOG(0, "atrip:flops(doubles)")
-    << nIterations * doublesFlops / chrono["doubles"].count() << "\n";
+    << nIterations * doublesFlops / Atrip::chrono["doubles"].count() << "\n";
   LOG(0, "atrip:flops(iterations)")
-    << nIterations * doublesFlops / chrono["iterations"].count() << "\n";
+    << nIterations * doublesFlops / Atrip::chrono["iterations"].count() << "\n";
 
   // TODO: change the sign in  the getEnergy routines
   return { - globalEnergy };

From 10a796971012a1ce494d88d983ff6a1725a3dfbe Mon Sep 17 00:00:00 2001
From: Alejandro Gallo <aamsgallo@gmail.com>
Date: Tue, 22 Feb 2022 12:09:41 +0100
Subject: [PATCH 22/22] Silence the logging in group-and-sort

---
 atrip.org                    | 18 ++++++------
 include/atrip.hpp            |  2 +-
 include/atrip/Atrip.hpp      |  2 +-
 include/atrip/Blas.hpp       |  2 +-
 include/atrip/Debug.hpp      |  8 +++---
 include/atrip/Equations.hpp  |  2 +-
 include/atrip/RankMap.hpp    |  2 +-
 include/atrip/Slice.hpp      | 54 +++++++++++++++++-----------------
 include/atrip/SliceUnion.hpp |  2 +-
 include/atrip/Tuples.hpp     | 56 ++++++++++++++++++------------------
 include/atrip/Unions.hpp     |  2 +-
 include/atrip/Utils.hpp      |  8 +++---
 src/atrip/Atrip.cxx          |  2 +-
 13 files changed, 80 insertions(+), 80 deletions(-)

diff --git a/atrip.org b/atrip.org
index c6ea744..491f810 100644
--- a/atrip.org
+++ b/atrip.org
@@ -1813,7 +1813,7 @@ ABCTuples specialDistribution(Info const& info, ABCTuples const& allTuples) {
     , container3d(nNodes * nNodes * nNodes)
     ;
 
-  if (info.nodeId == 0)
+  WITH_DBG if (info.nodeId == 0)
     std::cout << "\tGoing through all "
               << allTuples.size()
               << " tuples in "
@@ -1845,7 +1845,7 @@ ABCTuples specialDistribution(Info const& info, ABCTuples const& allTuples) {
 
   }
 
-  if (info.nodeId == 0)
+  WITH_DBG if (info.nodeId == 0)
     std::cout << "\tBuilding 1-d containers\n";
   // DISTRIBUTE 1-d containers
   // every tuple which is only located at one node belongs to this node
@@ -1855,7 +1855,7 @@ ABCTuples specialDistribution(Info const& info, ABCTuples const& allTuples) {
     std::copy(_tuples.begin(), _tuples.end(), nodeTuples.begin());
   }
 
-  if (info.nodeId == 0)
+  WITH_DBG if (info.nodeId == 0)
     std::cout << "\tBuilding 2-d containers\n";
   // DISTRIBUTE 2-d containers
   //the tuples which are located at two nodes are half/half given to these nodes
@@ -1890,7 +1890,7 @@ ABCTuples specialDistribution(Info const& info, ABCTuples const& allTuples) {
 
   }
 
-  if (info.nodeId == 0)
+  WITH_DBG if (info.nodeId == 0)
     std::cout << "\tBuilding 3-d containers\n";
   // DISTRIBUTE 3-d containers
   for (size_t zyx = 0; zyx < container3d.size(); zyx++) {
@@ -1929,7 +1929,7 @@ ABCTuples specialDistribution(Info const& info, ABCTuples const& allTuples) {
   }
 
 
-  if (info.nodeId == 0) std::cout << "\tswapping tuples...\n";
+  WITH_DBG if (info.nodeId == 0) std::cout << "\tswapping tuples...\n";
   /*
    *  sort part of group-and-sort algorithm
    *  every tuple on a given node is sorted in a way that
@@ -1959,16 +1959,16 @@ ABCTuples specialDistribution(Info const& info, ABCTuples const& allTuples) {
     }
   }
 
-  if (info.nodeId == 0) std::cout << "\tsorting list of tuples...\n";
+  WITH_DBG if (info.nodeId == 0) std::cout << "\tsorting list of tuples...\n";
   //now we sort the list of tuples
   std::sort(nodeTuples.begin(), nodeTuples.end());
 
-  if (info.nodeId == 0) std::cout << "\trestoring tuples...\n";
+  WITH_DBG if (info.nodeId == 0) std::cout << "\trestoring tuples...\n";
   // we bring the tuples abc back in the order a<b<c
   for (auto &t: nodeTuples)  std::sort(t.begin(), t.end());
 
 #if ATRIP_DEBUG > 1
-  if (info.nodeId == 0)
+  WITH_DBG if (info.nodeId == 0)
   std::cout << "checking for validity of " << nodeTuples.size() << std::endl;
   const bool anyInvalid
     = std::any_of(nodeTuples.begin(),
@@ -1977,7 +1977,7 @@ ABCTuples specialDistribution(Info const& info, ABCTuples const& allTuples) {
   if (anyInvalid) throw "Some tuple is invalid in group-and-sort algorithm";
 #endif
 
-  if (info.nodeId == 0) std::cout << "\treturning tuples...\n";
+  WITH_DBG if (info.nodeId == 0) std::cout << "\treturning tuples...\n";
   return nodeTuples;
 
 }
diff --git a/include/atrip.hpp b/include/atrip.hpp
index 8ecf6ce..5deb277 100644
--- a/include/atrip.hpp
+++ b/include/atrip.hpp
@@ -1,4 +1,4 @@
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Include%20header][Include header:1]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Include%20header][Include header:1]]
 #pragma once
 
 #include <atrip/Atrip.hpp>
diff --git a/include/atrip/Atrip.hpp b/include/atrip/Atrip.hpp
index 2a0f340..15c4ef5 100644
--- a/include/atrip/Atrip.hpp
+++ b/include/atrip/Atrip.hpp
@@ -1,4 +1,4 @@
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Header][Header:1]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Header][Header:1]]
 #pragma once
 #include <sstream>
 #include <string>
diff --git a/include/atrip/Blas.hpp b/include/atrip/Blas.hpp
index df81d74..ea4e702 100644
--- a/include/atrip/Blas.hpp
+++ b/include/atrip/Blas.hpp
@@ -1,4 +1,4 @@
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Blas][Blas:1]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Blas][Blas:1]]
 #pragma once
 namespace atrip {
 
diff --git a/include/atrip/Debug.hpp b/include/atrip/Debug.hpp
index e567d5c..18e56bb 100644
--- a/include/atrip/Debug.hpp
+++ b/include/atrip/Debug.hpp
@@ -1,4 +1,4 @@
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Macros][Macros:1]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Macros][Macros:1]]
 #pragma once
 #include <functional>
 #define ATRIP_BENCHMARK
@@ -61,20 +61,20 @@
 #endif
 // Macros:1 ends here
 
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Macros][Macros:2]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Macros][Macros:2]]
 #ifndef LOG
 #define LOG(level, name) if (Atrip::rank == 0) std::cout << name << ": "
 #endif
 // Macros:2 ends here
 
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Macros][Macros:3]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Macros][Macros:3]]
 #ifdef ATRIP_NO_OUTPUT
 #  undef LOG
 #  define LOG(level, name) if (false) std::cout << name << ": "
 #endif
 // Macros:3 ends here
 
-// [[file:~/cc4s/src/atrip/complex/atrip.org::IterationDescriptor][IterationDescriptor]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::IterationDescriptor][IterationDescriptor]]
 namespace atrip {
 
   struct IterationDescription;
diff --git a/include/atrip/Equations.hpp b/include/atrip/Equations.hpp
index e907592..6ec67a9 100644
--- a/include/atrip/Equations.hpp
+++ b/include/atrip/Equations.hpp
@@ -1,4 +1,4 @@
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Equations][Equations:1]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Equations][Equations:1]]
 #pragma once
 
 #include<atrip/Slice.hpp>
diff --git a/include/atrip/RankMap.hpp b/include/atrip/RankMap.hpp
index 0e31a61..433fd8f 100644
--- a/include/atrip/RankMap.hpp
+++ b/include/atrip/RankMap.hpp
@@ -1,4 +1,4 @@
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*The%20rank%20mapping][The rank mapping:1]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*The%20rank%20mapping][The rank mapping:1]]
 #pragma once
 
 #include <vector>
diff --git a/include/atrip/Slice.hpp b/include/atrip/Slice.hpp
index 1f5889e..6b319b7 100644
--- a/include/atrip/Slice.hpp
+++ b/include/atrip/Slice.hpp
@@ -1,4 +1,4 @@
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Prolog][Prolog:1]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Prolog][Prolog:1]]
 #pragma once
 #include <iostream>
 #include <algorithm>
@@ -29,11 +29,11 @@ template <typename F=double>
 struct Slice {
 // Prolog:1 ends here
 
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Location][Location:1]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Location][Location:1]]
 struct Location { size_t rank; size_t source; };
 // Location:1 ends here
 
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Type][Type:1]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Type][Type:1]]
 enum Type
   { A = 10
   , B
@@ -51,7 +51,7 @@ enum Type
   };
 // Type:1 ends here
 
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*State][State:1]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*State][State:1]]
 enum State {
   Fetch = 0,
   Dispatched = 2,
@@ -62,7 +62,7 @@ enum State {
 };
 // State:1 ends here
 
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*The%20Info%20structure][The Info structure:1]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*The%20Info%20structure][The Info structure:1]]
 struct Info {
   // which part of a,b,c the slice holds
   PartialTuple tuple;
@@ -86,7 +86,7 @@ struct Info {
 using Ty_x_Tu = std::pair< Type, PartialTuple >;
 // The Info structure:1 ends here
 
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Name][Name:1]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Name][Name:1]]
 enum Name
   { TA    = 100
   , VIJKA = 101
@@ -96,19 +96,19 @@ enum Name
   };
 // Name:1 ends here
 
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Database][Database:1]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Database][Database:1]]
 struct LocalDatabaseElement {
   Slice<F>::Name name;
   Slice<F>::Info info;
 };
 // Database:1 ends here
 
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Database][Database:2]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Database][Database:2]]
 using LocalDatabase = std::vector<LocalDatabaseElement>;
 using Database = LocalDatabase;
 // Database:2 ends here
 
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*MPI%20Types][MPI Types:1]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*MPI%20Types][MPI Types:1]]
 struct mpi {
 
   static MPI_Datatype vector(size_t n, MPI_Datatype const& DT) {
@@ -214,7 +214,7 @@ struct mpi {
 };
 // MPI Types:1 ends here
 
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Static%20utilities][Static utilities:1]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Static%20utilities][Static utilities:1]]
 static
 PartialTuple subtupleBySlice(ABCTuple abc, Type sliceType) {
   switch (sliceType) {
@@ -232,7 +232,7 @@ PartialTuple subtupleBySlice(ABCTuple abc, Type sliceType) {
 }
 // Static utilities:1 ends here
 
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Static%20utilities][Static utilities:2]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Static%20utilities][Static utilities:2]]
 static std::vector<Slice<F>*> hasRecycledReferencingToIt
   ( std::vector<Slice<F>> &slices
   , Info const& info
@@ -249,7 +249,7 @@ static std::vector<Slice<F>*> hasRecycledReferencingToIt
 }
 // Static utilities:2 ends here
 
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Static%20utilities][Static utilities:3]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Static%20utilities][Static utilities:3]]
 static Slice<F>& findOneByType(std::vector<Slice<F>> &slices, Slice<F>::Type type) {
     const auto sliceIt
       = std::find_if(slices.begin(), slices.end(),
@@ -265,7 +265,7 @@ static Slice<F>& findOneByType(std::vector<Slice<F>> &slices, Slice<F>::Type typ
 }
 // Static utilities:3 ends here
 
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Static%20utilities][Static utilities:4]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Static%20utilities][Static utilities:4]]
 static Slice<F>&
 findRecycledSource (std::vector<Slice<F>> &slices, Slice<F>::Info info) {
   const auto sliceIt
@@ -291,7 +291,7 @@ findRecycledSource (std::vector<Slice<F>> &slices, Slice<F>::Info info) {
 }
 // Static utilities:4 ends here
 
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Static%20utilities][Static utilities:5]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Static%20utilities][Static utilities:5]]
 static Slice<F>& findByTypeAbc
   ( std::vector<Slice<F>> &slices
   , Slice<F>::Type type
@@ -321,7 +321,7 @@ static Slice<F>& findByTypeAbc
 }
 // Static utilities:5 ends here
 
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Static%20utilities][Static utilities:6]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Static%20utilities][Static utilities:6]]
 static Slice<F>& findByInfo(std::vector<Slice<F>> &slices,
                          Slice<F>::Info const& info) {
   const auto sliceIt
@@ -344,30 +344,30 @@ static Slice<F>& findByInfo(std::vector<Slice<F>> &slices,
 }
 // Static utilities:6 ends here
 
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Attributes][Attributes:1]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Attributes][Attributes:1]]
 Info info;
 // Attributes:1 ends here
 
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Attributes][Attributes:2]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Attributes][Attributes:2]]
 F  *data;
 // Attributes:2 ends here
 
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Attributes][Attributes:3]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Attributes][Attributes:3]]
 MPI_Request request;
 // Attributes:3 ends here
 
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Attributes][Attributes:4]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Attributes][Attributes:4]]
 const size_t size;
 // Attributes:4 ends here
 
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Member%20functions][Member functions:1]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Member%20functions][Member functions:1]]
 void markReady() noexcept {
   info.state = Ready;
   info.recycling = Blank;
 }
 // Member functions:1 ends here
 
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Member%20functions][Member functions:2]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Member%20functions][Member functions:2]]
 bool isUnwrapped() const noexcept {
   return info.state == Ready
       || info.state == SelfSufficient
@@ -375,7 +375,7 @@ bool isUnwrapped() const noexcept {
 }
 // Member functions:2 ends here
 
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Member%20functions][Member functions:3]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Member%20functions][Member functions:3]]
 bool isUnwrappable() const noexcept {
   return isUnwrapped()
       || info.state == Recycled
@@ -408,7 +408,7 @@ inline bool isFree() const noexcept {
 }
 // Member functions:3 ends here
 
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Member%20functions][Member functions:4]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Member%20functions][Member functions:4]]
 inline bool isRecyclable() const noexcept {
   return (  info.state == Dispatched
          || info.state == Ready
@@ -419,7 +419,7 @@ inline bool isRecyclable() const noexcept {
 }
 // Member functions:4 ends here
 
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Member%20functions][Member functions:5]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Member%20functions][Member functions:5]]
 inline bool hasValidDataPointer() const noexcept {
   return data       != nullptr
       && info.state != Acceptor
@@ -428,7 +428,7 @@ inline bool hasValidDataPointer() const noexcept {
 }
 // Member functions:5 ends here
 
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Member%20functions][Member functions:6]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Member%20functions][Member functions:6]]
 void unwrapAndMarkReady() {
       if (info.state == Ready) return;
       if (info.state != Dispatched)
@@ -460,7 +460,7 @@ void unwrapAndMarkReady() {
     }
 // Member functions:6 ends here
 
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Epilog][Epilog:1]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Epilog][Epilog:1]]
 Slice(size_t size_)
     : info({})
     , data(nullptr)
@@ -471,7 +471,7 @@ Slice(size_t size_)
 }; // struct Slice
 // Epilog:1 ends here
 
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Debug][Debug:1]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Debug][Debug:1]]
 template <typename F=double>
 std::ostream& operator<<(std::ostream& out, typename Slice<F>::Location const& v) {
   // TODO: remove me
diff --git a/include/atrip/SliceUnion.hpp b/include/atrip/SliceUnion.hpp
index 365ad51..3bae8f8 100644
--- a/include/atrip/SliceUnion.hpp
+++ b/include/atrip/SliceUnion.hpp
@@ -1,4 +1,4 @@
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*The%20slice%20union][The slice union:1]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*The%20slice%20union][The slice union:1]]
 #pragma once
 #include <atrip/Debug.hpp>
 #include <atrip/Slice.hpp>
diff --git a/include/atrip/Tuples.hpp b/include/atrip/Tuples.hpp
index c41b78a..74b609d 100644
--- a/include/atrip/Tuples.hpp
+++ b/include/atrip/Tuples.hpp
@@ -1,4 +1,4 @@
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Prolog][Prolog:1]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Prolog][Prolog:1]]
 #pragma once
 
 #include <vector>
@@ -21,7 +21,7 @@
 namespace atrip {
 // Prolog:1 ends here
 
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Tuples%20types][Tuples types:1]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Tuples%20types][Tuples types:1]]
 using ABCTuple = std::array<size_t, 3>;
 using PartialTuple = std::array<size_t, 2>;
 using ABCTuples = std::vector<ABCTuple>;
@@ -30,14 +30,14 @@ constexpr ABCTuple FAKE_TUPLE = {0, 0, 0};
 constexpr ABCTuple INVALID_TUPLE = {1, 1, 1};
 // Tuples types:1 ends here
 
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Distributing%20the%20tuples][Distributing the tuples:1]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Distributing%20the%20tuples][Distributing the tuples:1]]
 struct TuplesDistribution {
   virtual ABCTuples getTuples(size_t Nv, MPI_Comm universe) = 0;
   virtual bool tupleIsFake(ABCTuple const& t) { return t == FAKE_TUPLE; }
 };
 // Distributing the tuples:1 ends here
 
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Node%20information][Node information:1]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Node%20information][Node information:1]]
 std::vector<std::string> getNodeNames(MPI_Comm comm){
   int rank, np;
   MPI_Comm_rank(comm, &rank);
@@ -77,7 +77,7 @@ std::vector<std::string> getNodeNames(MPI_Comm comm){
 }
 // Node information:1 ends here
 
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Node%20information][Node information:2]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Node%20information][Node information:2]]
 struct RankInfo {
   const std::string name;
   const size_t nodeId;
@@ -139,7 +139,7 @@ getClusterInfo(MPI_Comm comm) {
 }
 // Node information:2 ends here
 
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Naive%20list][Naive list:1]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Naive%20list][Naive list:1]]
 ABCTuples getTuplesList(size_t Nv, size_t rank, size_t np) {
 
   const size_t
@@ -173,7 +173,7 @@ ABCTuples getTuplesList(size_t Nv, size_t rank, size_t np) {
 }
 // Naive list:1 ends here
 
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Naive%20list][Naive list:2]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Naive%20list][Naive list:2]]
 ABCTuples getAllTuplesList(const size_t Nv) {
   const size_t n = Nv * (Nv + 1) * (Nv + 2) / 6 - Nv;
   ABCTuples result(n);
@@ -189,7 +189,7 @@ ABCTuples getAllTuplesList(const size_t Nv) {
 }
 // Naive list:2 ends here
 
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Naive%20list][Naive list:3]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Naive%20list][Naive list:3]]
 struct NaiveDistribution : public TuplesDistribution {
   ABCTuples getTuples(size_t Nv, MPI_Comm universe) override {
     int rank, np;
@@ -200,11 +200,11 @@ struct NaiveDistribution : public TuplesDistribution {
 };
 // Naive list:3 ends here
 
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Prolog][Prolog:1]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Prolog][Prolog:1]]
 namespace group_and_sort {
 // Prolog:1 ends here
 
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Utils][Utils:1]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Utils][Utils:1]]
 // Provides the node on which the slice-element is found
 // Right now we distribute the slices in a round robin fashion
 // over the different nodes (NOTE: not mpi ranks but nodes)
@@ -229,7 +229,7 @@ struct Info {
 };
 // Utils:1 ends here
 
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Distribution][Distribution:1]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Distribution][Distribution:1]]
 ABCTuples specialDistribution(Info const& info, ABCTuples const& allTuples) {
 
   ABCTuples nodeTuples;
@@ -241,7 +241,7 @@ ABCTuples specialDistribution(Info const& info, ABCTuples const& allTuples) {
     , container3d(nNodes * nNodes * nNodes)
     ;
 
-  if (info.nodeId == 0)
+  WITH_DBG if (info.nodeId == 0)
     std::cout << "\tGoing through all "
               << allTuples.size()
               << " tuples in "
@@ -273,7 +273,7 @@ ABCTuples specialDistribution(Info const& info, ABCTuples const& allTuples) {
 
   }
 
-  if (info.nodeId == 0)
+  WITH_DBG if (info.nodeId == 0)
     std::cout << "\tBuilding 1-d containers\n";
   // DISTRIBUTE 1-d containers
   // every tuple which is only located at one node belongs to this node
@@ -283,7 +283,7 @@ ABCTuples specialDistribution(Info const& info, ABCTuples const& allTuples) {
     std::copy(_tuples.begin(), _tuples.end(), nodeTuples.begin());
   }
 
-  if (info.nodeId == 0)
+  WITH_DBG if (info.nodeId == 0)
     std::cout << "\tBuilding 2-d containers\n";
   // DISTRIBUTE 2-d containers
   //the tuples which are located at two nodes are half/half given to these nodes
@@ -318,7 +318,7 @@ ABCTuples specialDistribution(Info const& info, ABCTuples const& allTuples) {
 
   }
 
-  if (info.nodeId == 0)
+  WITH_DBG if (info.nodeId == 0)
     std::cout << "\tBuilding 3-d containers\n";
   // DISTRIBUTE 3-d containers
   for (size_t zyx = 0; zyx < container3d.size(); zyx++) {
@@ -357,7 +357,7 @@ ABCTuples specialDistribution(Info const& info, ABCTuples const& allTuples) {
   }
 
 
-  if (info.nodeId == 0) std::cout << "\tswapping tuples...\n";
+  WITH_DBG if (info.nodeId == 0) std::cout << "\tswapping tuples...\n";
   /*
    *  sort part of group-and-sort algorithm
    *  every tuple on a given node is sorted in a way that
@@ -387,16 +387,16 @@ ABCTuples specialDistribution(Info const& info, ABCTuples const& allTuples) {
     }
   }
 
-  if (info.nodeId == 0) std::cout << "\tsorting list of tuples...\n";
+  WITH_DBG if (info.nodeId == 0) std::cout << "\tsorting list of tuples...\n";
   //now we sort the list of tuples
   std::sort(nodeTuples.begin(), nodeTuples.end());
 
-  if (info.nodeId == 0) std::cout << "\trestoring tuples...\n";
+  WITH_DBG if (info.nodeId == 0) std::cout << "\trestoring tuples...\n";
   // we bring the tuples abc back in the order a<b<c
   for (auto &t: nodeTuples)  std::sort(t.begin(), t.end());
 
 #if ATRIP_DEBUG > 1
-  if (info.nodeId == 0)
+  WITH_DBG if (info.nodeId == 0)
   std::cout << "checking for validity of " << nodeTuples.size() << std::endl;
   const bool anyInvalid
     = std::any_of(nodeTuples.begin(),
@@ -405,13 +405,13 @@ ABCTuples specialDistribution(Info const& info, ABCTuples const& allTuples) {
   if (anyInvalid) throw "Some tuple is invalid in group-and-sort algorithm";
 #endif
 
-  if (info.nodeId == 0) std::cout << "\treturning tuples...\n";
+  WITH_DBG if (info.nodeId == 0) std::cout << "\treturning tuples...\n";
   return nodeTuples;
 
 }
 // Distribution:1 ends here
 
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Main][Main:1]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Main][Main:1]]
 std::vector<ABCTuple> main(MPI_Comm universe, size_t Nv) {
 
   int rank, np;
@@ -451,7 +451,7 @@ std::vector<ABCTuple> main(MPI_Comm universe, size_t Nv) {
   MPI_Comm_split(universe, color, key, &INTRA_COMM);
 // Main:1 ends here
 
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Main][Main:2]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Main][Main:2]]
 size_t const
   tuplesPerRankLocal
      = nodeTuples.size() / nodeInfos[rank].ranksPerNode
@@ -479,7 +479,7 @@ LOG(1,"Atrip") << "ranks per node " << nodeInfos[rank].ranksPerNode << "\n";
 LOG(1,"Atrip") << "#nodes " << nNodes << "\n";
 // Main:2 ends here
 
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Main][Main:3]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Main][Main:3]]
 size_t const totalTuples
   = tuplesPerRankGlobal * nodeInfos[rank].ranksPerNode;
 
@@ -491,7 +491,7 @@ if (computeDistribution) {
 }
 // Main:3 ends here
 
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Main][Main:4]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Main][Main:4]]
 {
   // construct mpi type for abctuple
   MPI_Datatype MPI_ABCTUPLE;
@@ -515,13 +515,13 @@ if (computeDistribution) {
 }
 // Main:4 ends here
 
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Main][Main:5]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Main][Main:5]]
 return result;
 
 }
 // Main:5 ends here
 
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Interface][Interface:1]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Interface][Interface:1]]
 struct Distribution : public TuplesDistribution {
   ABCTuples getTuples(size_t Nv, MPI_Comm universe) override {
     return main(universe, Nv);
@@ -529,10 +529,10 @@ struct Distribution : public TuplesDistribution {
 };
 // Interface:1 ends here
 
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Epilog][Epilog:1]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Epilog][Epilog:1]]
 } // namespace group_and_sort
 // Epilog:1 ends here
 
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Epilog][Epilog:1]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Epilog][Epilog:1]]
 }
 // Epilog:1 ends here
diff --git a/include/atrip/Unions.hpp b/include/atrip/Unions.hpp
index e651ef9..6c5e058 100644
--- a/include/atrip/Unions.hpp
+++ b/include/atrip/Unions.hpp
@@ -1,4 +1,4 @@
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Unions][Unions:1]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Unions][Unions:1]]
 #pragma once
 #include <atrip/SliceUnion.hpp>
 
diff --git a/include/atrip/Utils.hpp b/include/atrip/Utils.hpp
index 83656c6..b5b9d6c 100644
--- a/include/atrip/Utils.hpp
+++ b/include/atrip/Utils.hpp
@@ -1,4 +1,4 @@
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Prolog][Prolog:1]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Prolog][Prolog:1]]
 #pragma once
 #include <sstream>
 #include <string>
@@ -11,7 +11,7 @@
 namespace atrip {
 // Prolog:1 ends here
 
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Pretty%20printing][Pretty printing:1]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Pretty%20printing][Pretty printing:1]]
 template <typename T>
   std::string pretty_print(T&& value) {
     std::stringstream stream;
@@ -22,7 +22,7 @@ template <typename T>
   }
 // Pretty printing:1 ends here
 
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Chrono][Chrono:1]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Chrono][Chrono:1]]
 #define WITH_CHRONO(__chrono_name, ...)         \
   Atrip::chrono[__chrono_name].start();         \
   __VA_ARGS__                                   \
@@ -41,6 +41,6 @@ struct Timer {
 using Timings = std::map<std::string, Timer>;
 // Chrono:1 ends here
 
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Epilog][Epilog:1]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Epilog][Epilog:1]]
 }
 // Epilog:1 ends here
diff --git a/src/atrip/Atrip.cxx b/src/atrip/Atrip.cxx
index b7823de..71436b2 100644
--- a/src/atrip/Atrip.cxx
+++ b/src/atrip/Atrip.cxx
@@ -1,4 +1,4 @@
-// [[file:~/cc4s/src/atrip/complex/atrip.org::*Main][Main:1]]
+// [[file:~/cc4s/src/atrip/bbbfb30/atrip.org::*Main][Main:1]]
 #include <iomanip>
 
 #include <atrip/Atrip.hpp>