diff --git a/Makefile b/Makefile
index b9cdb91..715056e 100644
--- a/Makefile
+++ b/Makefile
@@ -3,7 +3,7 @@ CONFIG ?= gcc
 SOURCES_FILE := Sources.mk
 include $(SOURCES_FILE)
-include ./etc/emacs.mk
+include ./etc/make/emacs.mk
 include ./etc/config/$(CONFIG).mk
 include ./bench/config.mk
diff --git a/README.org b/README.org
index 9144b63..c40e20f 100644
--- a/README.org
+++ b/README.org
@@ -9,6 +9,7 @@ The algorithm uses two main data types, the =Slice= and the
 ** The slice
+
 #+begin_src c++ :tangle (atrip-slice-h)
 #pragma once
 #include
@@ -286,7 +287,7 @@ As an example, for the doubles amplitudes \( T^{ab}_{ij} \), one need two kinds
       throw std::domain_error( "Slice not found: " + pretty_print(info)
                              + " rank: "
-                             + pretty_print(cc4s::Cc4s::world->rank)
+                             + pretty_print(Atrip::rank)
                              );
     WITH_RANK << "__slice__:find: " << pretty_print(sliceIt->info) << "\n";
     return *sliceIt;
@@ -315,7 +316,7 @@ As an example, for the doubles amplitudes \( T^{ab}_{ij} \), one need two kinds
                              + ", " + pretty_print(type)
                              + " rank: "
-                             + pretty_print(cc4s::Cc4s::world->rank)
+                             + pretty_print(Atrip::rank)
                              );
     return *sliceIt;
   }
@@ -484,5 +485,2063 @@ std::ostream& operator<<(std::ostream& out, Slice::Info const& i) {
     ;
   return out;
 }
+
+} // namespace atrip
 #+end_src
+** Utils
+#+begin_src c++ :tangle (atrip-utils-h)
+#pragma once
+#include <sstream>
+#include <string>
+#include <map>
+#include <chrono>
+
+namespace atrip {
+
+  struct Atrip {
+    static size_t rank;
+  };
+  size_t Atrip::rank;
+
+  template <typename T>
+  std::string pretty_print(T&& value) {
+    std::stringstream stream;
+#if TRIPLES_DEBUG > 1
+    dbg::pretty_print(stream, std::forward<T>(value));
+#endif
+    return stream.str();
+  }
+
+#define WITH_CHRONO(__chrono, ...) \
+  __chrono.start(); __VA_ARGS__ __chrono.stop();
+
+  struct Timer {
+    using Clock = std::chrono::high_resolution_clock;
+    using Event = std::chrono::time_point<Clock>;
+    std::chrono::duration<double> duration;
+    Event _start;
+    inline void start() noexcept { _start = Clock::now(); }
+    inline void stop() noexcept { duration += Clock::now() - _start; }
+    inline void clear() noexcept { duration *= 0; }
+    inline double count() const noexcept { return duration.count(); }
+  };
+  using Timings = std::map<std::string, Timer>;
+}
+
+#+end_src
+
+** Include header
+
+#+begin_src c++ :tangle (atrip-main-h)
+#pragma once
+
+#define TRIPLES_BENCHMARK
+#define TRIPLES_DEBUG 1
+//#define TRIPLES_WORKLOAD_DUMP
+#define TRIPLES_USE_DGEMM
+//#define TRIPLES_PRINT_TUPLES
+
+#if TRIPLES_DEBUG == 4
+# pragma message("WARNING: You have OCD debugging ABC triples "\
+                 "expect GB of output and consult your therapist")
+# include
+# define HAVE_OCD
+# define OCD_Barrier(com) MPI_Barrier(com)
+# define WITH_OCD
+# define WITH_ROOT if (atrip::Atrip::rank == 0)
+# define WITH_SPECIAL(r) if (atrip::Atrip::rank == r)
+# define WITH_RANK std::cout << atrip::Atrip::rank << ": "
+# define WITH_CRAZY_DEBUG
+# define WITH_DBG
+# define DBG(...) dbg(__VA_ARGS__)
+#elif TRIPLES_DEBUG == 3
+# pragma message("WARNING: You have crazy debugging ABC triples,"\
+                 " expect GB of output")
+# include
+# define OCD_Barrier(com)
+# define WITH_OCD if (false)
+# define WITH_ROOT if (atrip::Atrip::rank == 0)
+# define WITH_SPECIAL(r) if (atrip::Atrip::rank == r)
+# define WITH_RANK std::cout << atrip::Atrip::rank << ": "
+# define WITH_CRAZY_DEBUG
+# define WITH_DBG
+# define DBG(...)
dbg(__VA_ARGS__) +#elif TRIPLES_DEBUG == 2 +# pragma message("WARNING: You have some debugging info for ABC triples") +# include +# define OCD_Barrier(com) +# define WITH_OCD if (false) +# define WITH_ROOT if (atrip::Atrip::rank == 0) +# define WITH_SPECIAL(r) if (atrip::Atrip::rank == r) +# define WITH_RANK std::cout << atrip::Atrip::rank << ": " +# define WITH_CRAZY_DEBUG if (false) +# define WITH_DBG +# define DBG(...) dbg(__VA_ARGS__) +#elif TRIPLES_DEBUG == 1 +# define OCD_Barrier(com) +# define WITH_OCD if (false) +# define WITH_ROOT if (false) +# define WITH_SPECIAL(r) if (false) +# define WITH_RANK if (false) std::cout << atrip::Atrip::rank << ": " +# define WITH_DBG if (false) +# define WITH_CRAZY_DEBUG if (false) +# define DBG(...) +#else +# error("TRIPLES_DEBUG is not defined!") +#endif + +#include +#include + + +#+end_src + + +** Todo :noexport: + #+begin_src c++ :tangle src/main.hpp +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace cc4s; + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace atrip { + extern "C" { + void dgemm_( + const char *transa, + const char *transb, + const int *m, + const int *n, + const int *k, + double *alpha, + const double *A, + const int *lda, + const double *B, + const int *ldb, + double *beta, + double *C, + const int *ldc + ); + } +} + + + +namespace atrip { + + +} + + +ALGORITHM_REGISTRAR_DEFINITION(PerturbativeTriplesAbcijk) + + + +// SLICE MAIN DEFININITION =============================================={{{1 +#+end_src + + +#+begin_src c++ :tangle src/main.hpp + + // RANKMAP =============================================================={{{1 + +struct RankMap { + + std::vector const lengths; + size_t const np, size; + + RankMap(std::vector lens, size_t np_) + : lengths(lens) + , np(np_) + , size(std::accumulate(lengths.begin(), lengths.end(), + 1, std::multiplies())) + { assert(lengths.size() <= 2); } + + size_t find(Slice::Location const& p) const noexcept { + return p.source * np + p.rank; + } + + size_t nSources() const noexcept { + return size / np + size_t(size % np != 0); + } + + + bool isPaddingRank(size_t rank) const noexcept { + return size % np == 0 + ? false + : rank > (size % np - 1) + ; + } + + bool isSourcePadding(size_t rank, size_t source) const noexcept { + return source == nSources() && isPaddingRank(rank); + } + + Slice::Location + find(ABCTuple const& abc, Slice::Type sliceType) const noexcept { + // tuple = {11, 8} when abc = {11, 8, 9} and sliceType = AB + const auto tuple = Slice::subtupleBySlice(abc, sliceType); + + const size_t index + = tuple[0] + + tuple[1] * (lengths.size() > 1 ? lengths[0] : 0) + ; + + return + { index % np + , index / np + }; + } + +}; + + + + // SLICEUNION ==========================================================={{{1 + + struct SliceUnion { + using F = double; + using Tensor = CTF::Tensor; + + virtual void + sliceIntoBuffer(size_t iteration, Tensor &to, Tensor const& from) = 0; + + /* + ,* This function should enforce an important property of a SliceUnion. + ,* Namely, there can be no two Slices of the same nature. + ,* + ,* This means that there can be at most one slice with a given Ty_x_Tu. 
+ ,*/ + void checkForDuplicates() const { + std::vector tytus; + for (auto const& s: slices) { + if (s.isFree()) continue; + tytus.push_back({s.info.type, s.info.tuple}); + } + + for (auto const& tytu: tytus) { + if (std::count(tytus.begin(), tytus.end(), tytu) > 1) + throw "Invariance violated, more than one slice with same Ty_x_Tu"; + } + + } + + std::vector neededSlices(ABCTuple const& abc) { + std::vector needed(sliceTypes.size()); + // build the needed vector + std::transform(sliceTypes.begin(), sliceTypes.end(), + needed.begin(), + [&abc](Slice::Type const type) { + auto tuple = Slice::subtupleBySlice(abc, type); + return std::make_pair(type, tuple); + }); + return needed; + } + + /* buildLocalDatabase + ,* + ,* It should build a database of slices so that we know what is needed + ,* to fetch in the next iteration represented by the tuple 'abc'. + ,* + ,* 1. The algorithm works as follows, we build a database of the all + ,* the slice types that we need together with their tuple. + ,* + ,* 2. Look in the SliceUnion if we already have this tuple, + ,* if we already have it mark it (TODO) + ,* + ,* 3. If we don't have the tuple, look for a (state=acceptor, type=blank) + ,* slice and mark this slice as type=Fetch with the corresponding type + ,* and tuple. + ,* + ,* NOTE: The algorithm should certify that we always have enough blank + ,* slices. + ,* + ,*/ + Slice::LocalDatabase buildLocalDatabase(ABCTuple const& abc) { + Slice::LocalDatabase result; + + auto const needed = neededSlices(abc); + + WITH_RANK << "__db__:needed:" << pretty_print(needed) << "\n"; + // BUILD THE DATABASE + // we need to loop over all sliceTypes that this TensorUnion + // is representing and find out how we will get the corresponding + // slice for the abc we are considering right now. + for (auto const& pair: needed) { + auto const type = pair.first; + auto const tuple = pair.second; + auto const from = rankMap.find(abc, type); + +#ifdef HAVE_OCD + WITH_RANK << "__db__:want:" << pretty_print(pair) << "\n"; + for (auto const& s: slices) + WITH_RANK << "__db__:guts:ocd " + << s.info << " pt " << s.data + << "\n"; +#endif + +#ifdef HAVE_OCD + WITH_RANK << "__db__: checking if exact match" << "\n"; +#endif + { + // FIRST: look up if there is already a *Ready* slice matching what we + // need + auto const& it + = std::find_if(slices.begin(), slices.end(), + [&tuple, &type](Slice const& other) { + return other.info.tuple == tuple + && other.info.type == type + // we only want another slice when it + // has already ready-to-use data + && other.isUnwrappable() + ; + }); + if (it != slices.end()) { + // if we find this slice, it means that we don't have to do anything + WITH_RANK << "__db__: EXACT: found EXACT in name=" << name + << " for tuple " << tuple[0] << ", " << tuple[1] + << " ptr " << it->data + << "\n"; + result.push_back({name, it->info}); + continue; + } + } + +#ifdef HAVE_OCD + WITH_RANK << "__db__: checking if recycle" << "\n"; +#endif + // Try to find a recyling possibility ie. find a slice with the same + // tuple and that has a valid data pointer. 
+ auto const& recycleIt + = std::find_if(slices.begin(), slices.end(), + [&tuple, &type](Slice const& other) { + return other.info.tuple == tuple + && other.info.type != type + && other.isRecyclable() + ; + }); + + // if we find this recylce, then we find a Blank slice + // (which should exist by construction :THINK) + // + if (recycleIt != slices.end()) { + auto& blank = Slice::findOneByType(slices, Slice::Blank); + // TODO: formalize this through a method to copy information + // from another slice + blank.data = recycleIt->data; + blank.info.type = type; + blank.info.tuple = tuple; + blank.info.state = Slice::Recycled; + blank.info.from = from; + blank.info.recycling = recycleIt->info.type; + result.push_back({name, blank.info}); + WITH_RANK << "__db__: RECYCLING: n" << name + << " " << pretty_print(abc) + << " get " << pretty_print(blank.info) + << " from " << pretty_print(recycleIt->info) + << " ptr " << recycleIt->data + << "\n" + ; + continue; + } + + // in this case we have to create a new slice + // this means that we should have a blank slice at our disposal + // and also the freePointers should have some elements inside, + // so we pop a data pointer from the freePointers container +#ifdef HAVE_OCD + WITH_RANK << "__db__: none work, doing new" << "\n"; +#endif + { + WITH_RANK << "__db__: NEW: finding blank in " << name + << " for type " << type + << " for tuple " << tuple[0] << ", " << tuple[1] + << "\n" + ; + auto& blank = Slice::findOneByType(slices, Slice::Blank); + blank.info.type = type; + blank.info.tuple = tuple; + blank.info.from = from; + + // Handle self sufficiency + blank.info.state = cc4s::Cc4s::world->rank == from.rank + ? Slice::SelfSufficient + : Slice::Fetch + ; + if (blank.info.state == Slice::SelfSufficient) { + blank.data = sources[from.source].data(); + } else { + if (freePointers.size() == 0) + throw std::domain_error("No more free pointers!"); + auto dataPointer = freePointers.begin(); + freePointers.erase(dataPointer); + blank.data = *dataPointer; + } + + result.push_back({name, blank.info}); + continue; + } + + } + +#ifdef HAVE_OCD + for (auto const& s: slices) + WITH_RANK << "__db__:guts:ocd:__end__ " << s.info << "\n"; +#endif + + + return result; + + } + + /* + ,* Garbage collect slices not needed for the next iteration. + ,* + ,* It will throw if it tries to gc a slice that has not been + ,* previously unwrapped, as a safety mechanism. + ,*/ + void clearUnusedSlicesForNext(ABCTuple const& abc) { + auto const needed = neededSlices(abc); + + // CLEAN UP SLICES, FREE THE ONES THAT ARE NOT NEEDED ANYMORE + for (auto& slice: slices) { + // if the slice is free, then it was not used anyways + if (slice.isFree()) continue; + + + // try to find the slice in the needed slices list + auto const found + = std::find_if(needed.begin(), needed.end(), + [&slice] (Slice::Ty_x_Tu const& tytu) { + return slice.info.tuple == tytu.second + && slice.info.type == tytu.first + ; + }); + + // if we did not find slice in needed, then erase it + if (found == needed.end()) { + + // We have to be careful about the data pointer, + // for SelfSufficient, the data pointer is a source pointer + // of the slice, so we should just wipe it. + // + // For Ready slices, we have to be careful if there are some + // recycled slices depending on it. + bool freeSlicePointer = true; + + // allow to gc unwrapped and recycled, never Fetch, + // if we have a Fetch slice then something has gone very wrong. 
+ if (!slice.isUnwrapped() && slice.info.state != Slice::Recycled) + throw + std::domain_error("Trying to garbage collect " + " a non-unwrapped slice! " + + pretty_print(&slice) + + pretty_print(slice.info)); + + // it can be that our slice is ready, but it has some hanging + // references lying around in the form of a recycled slice. + // Of course if we need the recycled slice the next iteration + // this would be fatal, because we would then free the pointer + // of the slice and at some point in the future we would + // overwrite it. Therefore, we must check if slice has some + // references in slices and if so then + // + // - we should mark those references as the original (since the data + // pointer should be the same) + // + // - we should make sure that the data pointer of slice + // does not get freed. + // + if (slice.info.state == Slice::Ready) { + WITH_OCD WITH_RANK + << "__gc__:" << "checking for data recycled dependencies\n"; + auto recycled + = Slice::hasRecycledReferencingToIt(slices, slice.info); + if (recycled.size()) { + Slice* newReady = recycled[0]; + WITH_OCD WITH_RANK + << "__gc__:" << "swaping recycled " + << pretty_print(newReady->info) + << " and " + << pretty_print(slice.info) + << "\n"; + newReady->markReady(); + assert(newReady->data == slice.data); + freeSlicePointer = false; + + for (size_t i = 1; i < recycled.size(); i++) { + auto newRecyled = recycled[i]; + newRecyled->info.recycling = newReady->info.type; + WITH_OCD WITH_RANK + << "__gc__:" << "updating recycled " + << pretty_print(newRecyled->info) + << "\n"; + } + + } + } + + // if the slice is self sufficient, do not dare touching the + // pointer, since it is a pointer to our sources in our rank. + if ( slice.info.state == Slice::SelfSufficient + || slice.info.state == Slice::Recycled + ) { + freeSlicePointer = false; + } + + // make sure we get its data pointer to be used later + // only for non-recycled, since it can be that we need + // for next iteration the data of the slice that the recycled points + // to + if (freeSlicePointer) { + freePointers.insert(slice.data); + WITH_RANK << "~~~:cl(" << name << ")" + << " added to freePointer " + << pretty_print(freePointers) + << "\n"; + } else { + WITH_OCD WITH_RANK << "__gc__:not touching the free Pointer\n"; + } + + // at this point, let us blank the slice + WITH_RANK << "~~~:cl(" << name << ")" + << " freeing up slice " + << " info " << slice.info + << "\n"; + slice.free(); + } + + } + } + + // CONSTRUCTOR + SliceUnion( Tensor const& sourceTensor + , std::vector sliceTypes_ + , std::vector sliceLength_ + , std::vector paramLength + , size_t np + , MPI_Comm child_world + , MPI_Comm global_world + , Slice::Name name_ + , size_t nSliceBuffers = 4 + ) + : rankMap(paramLength, np) + , world(child_world) + , universe(global_world) + , sliceLength(sliceLength_) + , sources(rankMap.nSources(), + std::vector + (std::accumulate(sliceLength.begin(), + sliceLength.end(), + 1, std::multiplies()))) + , name(name_) + , sliceTypes(sliceTypes_) + , sliceBuffers(nSliceBuffers, sources[0]) + //, slices(2 * sliceTypes.size(), Slice{ sources[0].size() }) + { // constructor begin + + LOG(0,"NEW_TRIPLES") << "INIT SliceUnion: " << name << "\n"; + + slices + = std::vector(2 * sliceTypes.size(), { sources[0].size() }); + // TODO: think exactly ^------------------- about this number + + // initialize the freePointers with the pointers to the buffers + std::transform(sliceBuffers.begin(), sliceBuffers.end(), + std::inserter(freePointers, freePointers.begin()), + 
[](std::vector &vec) { return vec.data(); }); + + + + LOG(1,"NEW_TRIPLES") << "rankMap.nSources " + << rankMap.nSources() << "\n"; + LOG(1,"NEW_TRIPLES") << "#slices " + << slices.size() << "\n"; + LOG(1,"NEW_TRIPLES") << "#slices[0] " + << slices[0].size << "\n"; + LOG(1,"NEW_TRIPLES") << "#sources " + << sources.size() << "\n"; + LOG(1,"NEW_TRIPLES") << "#sources[0] " + << sources[0].size() << "\n"; + LOG(1,"NEW_TRIPLES") << "#freePointers " + << freePointers.size() << "\n"; + LOG(1,"NEW_TRIPLES") << "#sliceBuffers " + << sliceBuffers.size() << "\n"; + LOG(1,"NEW_TRIPLES") << "#sliceBuffers[0] " + << sliceBuffers[0].size() << "\n"; + LOG(1,"NEW_TRIPLES") << "#sliceLength " + << sliceLength.size() << "\n"; + LOG(1,"NEW_TRIPLES") << "#paramLength " + << paramLength.size() << "\n"; + LOG(1,"NEW_TRIPLES") << "GB*" << np << " " + << double(sources.size() + sliceBuffers.size()) + ,* sources[0].size() + ,* 8 * np + / 1073741824.0 + << "\n"; + } // constructor ends + + void init(Tensor const& sourceTensor) { + + CTF::World w(world); + const int rank = cc4s::Cc4s::world->rank + , order = sliceLength.size() + ; + std::vector const syms(order, NS); + std::vector __sliceLength(sliceLength.begin(), sliceLength.end()); + Tensor toSliceInto(order, + __sliceLength.data(), + syms.data(), + w); + LOG(1,"NEW_TRIPLES") << "slicing... \n"; + + // setUp sources + for (size_t it(0); it < rankMap.nSources(); ++it) { + const size_t + source = rankMap.isSourcePadding(rank, source) ? 0 : it; + WITH_OCD + WITH_RANK + << "Init:toSliceInto it-" << it + << " :: source " << source << "\n"; + sliceIntoBuffer(source, toSliceInto, sourceTensor); + } + + } + + /** + ,* \brief Send asynchronously only if the state is Fetch + ,*/ + void send( size_t otherRank + , Slice::Info const& info + , size_t tag) const noexcept { + MPI_Request request; + bool sendData_p = false; + + if (info.state == Slice::Fetch) sendData_p = true; + // TODO: remove this because I have SelfSufficient + if (otherRank == info.from.rank) sendData_p = false; + if (!sendData_p) return; + + MPI_Isend( sources[info.from.source].data() + , sources[info.from.source].size() + , MPI_DOUBLE /* TODO: adapt this with traits */ + , otherRank + , tag + , universe + , &request + ); + WITH_CRAZY_DEBUG + WITH_RANK << "sent to " << otherRank << "\n"; + + } + + /** + ,* \brief Receive asynchronously only if the state is Fetch + ,*/ + void receive(Slice::Info const& info, size_t tag) noexcept { + auto& slice = Slice::findByInfo(slices, info); + + if (cc4s::Cc4s::world->rank == info.from.rank) return; + + if (slice.info.state == Slice::Fetch) { + // TODO: do it through the slice class + slice.info.state = Slice::Dispatched; + MPI_Request request; + slice.request = request; + MPI_Irecv( slice.data + , slice.size + , MPI_DOUBLE // TODO: Adapt this with traits + , info.from.rank + , tag + , universe + , &slice.request + //, MPI_STATUS_IGNORE + ); + } + } + + void unwrapAll(ABCTuple const& abc) { + for (auto type: sliceTypes) unwrapSlice(type, abc); + } + + F* unwrapSlice(Slice::Type type, ABCTuple const& abc) { + WITH_CRAZY_DEBUG + WITH_RANK << "__unwrap__:slice " << type << " w n " + << name + << " abc" << pretty_print(abc) + << "\n"; + auto& slice = Slice::findByTypeAbc(slices, type, abc); + WITH_RANK << "__unwrap__:info " << slice.info << "\n"; + switch (slice.info.state) { + case Slice::Dispatched: + WITH_RANK << "__unwrap__:Fetch: " << &slice + << " info " << pretty_print(slice.info) + << "\n"; + slice.unwrapAndMarkReady(); + return slice.data; + break; + case 
Slice::SelfSufficient: + WITH_RANK << "__unwrap__:SelfSufficient: " << &slice + << " info " << pretty_print(slice.info) + << "\n"; + return slice.data; + break; + case Slice::Ready: + WITH_RANK << "__unwrap__:READY: UNWRAPPED ALREADY" << &slice + << " info " << pretty_print(slice.info) + << "\n"; + return slice.data; + break; + case Slice::Recycled: + WITH_RANK << "__unwrap__:RECYCLED " << &slice + << " info " << pretty_print(slice.info) + << "\n"; + return unwrapSlice(slice.info.recycling, abc); + break; + case Slice::Fetch: + case Slice::Acceptor: + throw std::domain_error("Can't unwrap an acceptor or fetch slice!"); + break; + default: + throw std::domain_error("Unknown error unwrapping slice!"); + } + return slice.data; + } + + const RankMap rankMap; + const MPI_Comm world; + const MPI_Comm universe; + const std::vector sliceLength; + std::vector< std::vector > sources; + std::vector< Slice > slices; + Slice::Name name; + const std::vector sliceTypes; + std::vector< std::vector > sliceBuffers; + std::set freePointers; + + }; + + + + void sliceIntoVector + ( std::vector &v + , CTF::Tensor &toSlice + , std::vector const low + , std::vector const up + , CTF::Tensor const& origin + , std::vector const originLow + , std::vector const originUp + ) { + // Thank you CTF for forcing me to do this + struct { std::vector up, low; } + toSlice_ = { {up.begin(), up.end()} + , {low.begin(), low.end()} } + , origin_ = { {originUp.begin(), originUp.end()} + , {originLow.begin(), originLow.end()} } + ; + + WITH_OCD + WITH_RANK << "slicing into " << pretty_print(toSlice_.up) + << "," << pretty_print(toSlice_.low) + << " from " << pretty_print(origin_.up) + << "," << pretty_print(origin_.low) + << "\n"; + +#ifdef TODO + toSlice.slice( toSlice_.low.data() + , toSlice_.up.data() + , 0.0 + , origin + , origin_.low.data() + , origin_.up.data() + , 1.0); + memcpy(v.data(), toSlice.data, sizeof(double) * v.size()); +#endif + + } + + + struct TAPHH : public SliceUnion { + TAPHH( Tensor const& sourceTensor + , size_t No + , size_t Nv + , size_t np + , MPI_Comm child_world + , MPI_Comm global_world + ) : SliceUnion( sourceTensor + , {Slice::A, Slice::B, Slice::C} + , {Nv, No, No} // size of the slices + , {Nv} + , np + , child_world + , global_world + , Slice::TA + , 4) { + init(sourceTensor); + } + + void sliceIntoBuffer(size_t it, Tensor &to, Tensor const& from) override + { + const int rank = cc4s::Cc4s::world->rank + , Nv = sliceLength[0] + , No = sliceLength[1] + , a = rankMap.find({rank, it}); + ; + + + sliceIntoVector( sources[it] + , to, {0, 0, 0}, {Nv, No, No} + , from, {a, 0, 0, 0}, {a+1, Nv, No, No} + ); + + } + + }; + + + struct HHHA : public SliceUnion { + HHHA( Tensor const& sourceTensor + , size_t No + , size_t Nv + , size_t np + , MPI_Comm child_world + , MPI_Comm global_world + ) : SliceUnion( sourceTensor + , {Slice::A, Slice::B, Slice::C} + , {No, No, No} // size of the slices + , {Nv} // size of the parametrization + , np + , child_world + , global_world + , Slice::VIJKA + , 4) { + init(sourceTensor); + } + + void sliceIntoBuffer(size_t it, Tensor &to, Tensor const& from) override + { + + const int rank = cc4s::Cc4s::world->rank + , No = sliceLength[0] + , a = rankMap.find({rank, it}) + ; + + sliceIntoVector( sources[it] + , to, {0, 0, 0}, {No, No, No} + , from, {0, 0, 0, a}, {No, No, No, a+1} + ); + + } + }; + + struct ABPH : public SliceUnion { + ABPH( Tensor const& sourceTensor + , size_t No + , size_t Nv + , size_t np + , MPI_Comm child_world + , MPI_Comm global_world + ) : SliceUnion( 
sourceTensor + , { Slice::AB, Slice::BC, Slice::AC + , Slice::BA, Slice::CB, Slice::CA + } + , {Nv, No} // size of the slices + , {Nv, Nv} // size of the parametrization + , np + , child_world + , global_world + , Slice::VABCI + , 2*6) { + init(sourceTensor); + } + + void sliceIntoBuffer(size_t it, Tensor &to, Tensor const& from) override { + + const int Nv = sliceLength[0] + , No = sliceLength[1] + , rank = cc4s::Cc4s::world->rank + , el = rankMap.find({rank, it}) + , a = el % Nv + , b = el / Nv + ; + + + sliceIntoVector( sources[it] + , to, {0, 0}, {Nv, No} + , from, {a, b, 0, 0}, {a+1, b+1, Nv, No} + ); + + } + + }; + + struct ABHH : public SliceUnion { + ABHH( Tensor const& sourceTensor + , size_t No + , size_t Nv + , size_t np + , MPI_Comm child_world + , MPI_Comm global_world + ) : SliceUnion( sourceTensor + , {Slice::AB, Slice::BC, Slice::AC} + , {No, No} // size of the slices + , {Nv, Nv} // size of the parametrization + , np + , child_world + , global_world + , Slice::VABIJ + , 6) { + init(sourceTensor); + } + + void sliceIntoBuffer(size_t it, Tensor &to, Tensor const& from) override { + + const int Nv = from.lens[0] + , No = sliceLength[1] + , rank = cc4s::Cc4s::world->rank + , el = rankMap.find({rank, it}) + , a = el % Nv + , b = el / Nv + ; + + sliceIntoVector( sources[it] + , to, {0, 0}, {No, No} + , from, {a, b, 0, 0}, {a+1, b+1, No, No} + ); + + + } + + }; + + + struct TABHH : public SliceUnion { + TABHH( Tensor const& sourceTensor + , size_t No + , size_t Nv + , size_t np + , MPI_Comm child_world + , MPI_Comm global_world + ) : SliceUnion( sourceTensor + , {Slice::AB, Slice::BC, Slice::AC} + , {No, No} // size of the slices + , {Nv, Nv} // size of the parametrization + , np + , child_world + , global_world + , Slice::TABIJ + , 6) { + init(sourceTensor); + } + + void sliceIntoBuffer(size_t it, Tensor &to, Tensor const& from) override { + // TODO: maybe generalize this with ABHH + + const int Nv = from.lens[0] + , No = sliceLength[1] + , rank = cc4s::Cc4s::world->rank + , el = rankMap.find({rank, it}) + , a = el % Nv + , b = el / Nv + ; + + sliceIntoVector( sources[it] + , to, {0, 0}, {No, No} + , from, {a, b, 0, 0}, {a+1, b+1, No, No} + ); + + + } + + }; + + // PHYSICS ENERGY, SINGLES ... =========================================={{{1 + +double cc4s::PerturbativeTriplesAbcijk::getEnergyDistinct + ( const double epsabc + , std::vector const& epsi + , std::vector const& Tijk_ + , std::vector const& Zijk_ + ) { + constexpr size_t blockSize=16; + double energy(0.); + const size_t No = epsi.size(); + for (size_t kk=0; kk k ? kk : k; + for (size_t j(jstart); j < jend; j++){ + const double ej(epsi[j]); + double facjk( j == k ? 0.5 : 1.0); + size_t istart = ii > j ? ii : j; + for (size_t i(istart); i < iend; i++){ + const double ei(epsi[i]); + double facij ( i==j ? 
0.5 : 1.0); + double denominator(epsabc - ei - ej - ek); + double U(Zijk_[i + No*j + No*No*k]); + double V(Zijk_[i + No*k + No*No*j]); + double W(Zijk_[j + No*i + No*No*k]); + double X(Zijk_[j + No*k + No*No*i]); + double Y(Zijk_[k + No*i + No*No*j]); + double Z(Zijk_[k + No*j + No*No*i]); + + double A(Tijk_[i + No*j + No*No*k]); + double B(Tijk_[i + No*k + No*No*j]); + double C(Tijk_[j + No*i + No*No*k]); + double D(Tijk_[j + No*k + No*No*i]); + double E(Tijk_[k + No*i + No*No*j]); + double F(Tijk_[k + No*j + No*No*i]); + double value(3.0*(A*U+B*V+C*W+D*X+E*Y+F*Z) + +((U+X+Y)-2.0*(V+W+Z))*(A+D+E) + +((V+W+Z)-2.0*(U+X+Y))*(B+C+F)); + energy += 2.0*value / denominator * facjk * facij; + } // i + } // j + } // k + } // ii + } // jj + } // kk + return energy; +} + + +double cc4s::PerturbativeTriplesAbcijk::getEnergySame + ( const double epsabc + , std::vector const& epsi + , std::vector const& Tijk_ + , std::vector const& Zijk_ + ) { + constexpr size_t blockSize = 16; + const size_t No = epsi.size(); + double energy(0.); + for (size_t kk=0; kk k ? jj : k; + for(size_t j(jstart); j < jend; j++){ + const double facjk( j == k ? 0.5 : 1.0); + const double ej(epsi[j]); + const size_t istart = ii > j ? ii : j; + for(size_t i(istart); i < iend; i++){ + double ei(epsi[i]); + double facij ( i==j ? 0.5 : 1.0); + double denominator(epsabc - ei - ej - ek); + double U(Zijk_[i + No*j + No*No*k]); + double V(Zijk_[j + No*k + No*No*i]); + double W(Zijk_[k + No*i + No*No*j]); + double A(Tijk_[i + No*j + No*No*k]); + double B(Tijk_[j + No*k + No*No*i]); + double C(Tijk_[k + No*i + No*No*j]); + double value(3.0*( A*U + B*V + C*W) - (A+B+C)*(U+V+W)); + energy += 2.0*value / denominator * facjk * facij; + } // i + } // j + } // k + } // ii + } // jj + } // kk + return energy; +} + +void cc4s::PerturbativeTriplesAbcijk::singlesContribution + ( const ABCTuple &abc + , double const* Tph + , double const* VABij + , double const* VACij + , double const* VBCij + , double *Zijk + ) { + const size_t a(abc[0]), b(abc[1]), c(abc[2]); + for (size_t k=0; k < No; k++) + for (size_t i=0; i < No; i++) + for (size_t j=0; j < No; j++) { + const size_t ijk = i + j*No + k*No*No + , jk = j + No * k + ; + Zijk[ijk] += Tph[ a + i * Nv ] * VBCij[ j + k * No ]; + Zijk[ijk] += Tph[ b + j * Nv ] * VACij[ i + k * No ]; + Zijk[ijk] += Tph[ c + k * Nv ] * VABij[ i + j * No ]; + } +} + +void doublesContribution + ( const ABCTuple &abc + , size_t const No + , size_t const Nv + // -- VABCI + , double const* VABph + , double const* VACph + , double const* VBCph + , double const* VBAph + , double const* VCAph + , double const* VCBph + // -- VHHHA + , double const* VhhhA + , double const* VhhhB + , double const* VhhhC + // -- TA + , double const* TAphh + , double const* TBphh + , double const* TCphh + // -- TABIJ + , double const* TABhh + , double const* TAChh + , double const* TBChh + // -- TIJK + , double *Tijk + , atrip::Timings& chrono + ) { + + auto& t_reorder = chrono["doubles:reorder"]; + const size_t a = abc[0], b = abc[1], c = abc[2] + , NoNo = No*No, NoNv = No*Nv + ; + +#if defined(TRIPLES_USE_DGEMM) +#define _IJK_(i, j, k) i + j*No + k*NoNo +#define REORDER(__II, __JJ, __KK) \ + t_reorder.start(); \ + for (size_t k = 0; k < No; k++) \ + for (size_t j = 0; j < No; j++) \ + for (size_t i = 0; i < No; i++) { \ + Tijk[_IJK_(i, j, k)] += _t_buffer[_IJK_(__II, __JJ, __KK)]; \ + } \ + t_reorder.stop(); +#define DGEMM_PARTICLES(__A, __B) \ + atrip::dgemm_( "T" \ + , "N" \ + , (int const*)&NoNo \ + , (int const*)&No \ + , (int const*)&Nv \ + , 
&one \ + , __A \ + , (int const*)&Nv \ + , __B \ + , (int const*)&Nv \ + , &zero \ + , _t_buffer.data() \ + , (int const*)&NoNo \ + ); +#define DGEMM_HOLES(__A, __B, __TRANSB) \ + atrip::dgemm_( "N" \ + , __TRANSB \ + , (int const*)&NoNo \ + , (int const*)&No \ + , (int const*)&No \ + , &m_one \ + , __A \ + , (int const*)&NoNo \ + , __B \ + , (int const*)&No \ + , &zero \ + , _t_buffer.data() \ + , (int const*)&NoNo \ + ); + + using F = double; + const size_t NoNoNo = No*NoNo; + std::vector _t_buffer; + _t_buffer.reserve(NoNoNo); + F one{1.0}, m_one{-1.0}, zero{0.0}; + + t_reorder.start(); + for (size_t k = 0; k < NoNoNo; k++) { + // zero the Tijk + Tijk[k] = 0.0; + } + t_reorder.stop(); + + chrono["doubles:holes"].start(); + { // Holes part ============================================================ + // VhhhC[i + k*No + L*NoNo] * TABhh[L + j*No]; H1 + chrono["doubles:holes:1"].start(); + DGEMM_HOLES(VhhhC, TABhh, "N") + REORDER(i, k, j) + chrono["doubles:holes:1"].stop(); + // VhhhC[j + k*No + L*NoNo] * TABhh[i + L*No]; H0 + chrono["doubles:holes:2"].start(); + DGEMM_HOLES(VhhhC, TABhh, "T") + REORDER(j, k, i) + chrono["doubles:holes:2"].stop(); + // VhhhB[i + j*No + L*NoNo] * TAChh[L + k*No]; H5 + chrono["doubles:holes:3"].start(); + DGEMM_HOLES(VhhhB, TAChh, "N") + REORDER(i, j, k) + chrono["doubles:holes:3"].stop(); + // VhhhB[k + j*No + L*NoNo] * TAChh[i + L*No]; H3 + chrono["doubles:holes:4"].start(); + DGEMM_HOLES(VhhhB, TAChh, "T") + REORDER(k, j, i) + chrono["doubles:holes:4"].stop(); + // VhhhA[j + i*No + L*NoNo] * TBChh[L + k*No]; H1 + chrono["doubles:holes:5"].start(); + DGEMM_HOLES(VhhhA, TBChh, "N") + REORDER(j, i, k) + chrono["doubles:holes:5"].stop(); + // VhhhA[k + i*No + L*NoNo] * TBChh[j + L*No]; H4 + chrono["doubles:holes:6"].start(); + DGEMM_HOLES(VhhhA, TBChh, "T") + REORDER(k, i, j) + chrono["doubles:holes:6"].stop(); + } + chrono["doubles:holes"].stop(); + + chrono["doubles:particles"].start(); + { // Particle part ========================================================= + // TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv]; P0 + chrono["doubles:particles:1"].start(); + DGEMM_PARTICLES(TAphh, VBCph) + REORDER(i, j, k) + chrono["doubles:particles:1"].stop(); + // TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv]; P3 + chrono["doubles:particles:2"].start(); + DGEMM_PARTICLES(TAphh, VCBph) + REORDER(i, k, j) + chrono["doubles:particles:2"].stop(); + // TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv]; P5 + chrono["doubles:particles:3"].start(); + DGEMM_PARTICLES(TCphh, VABph) + REORDER(k, i, j) + chrono["doubles:particles:3"].stop(); + // TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv]; P2 + chrono["doubles:particles:4"].start(); + DGEMM_PARTICLES(TCphh, VBAph) + REORDER(k, j, i) + chrono["doubles:particles:4"].stop(); + // TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv]; P1 + chrono["doubles:particles:5"].start(); + DGEMM_PARTICLES(TBphh, VACph) + REORDER(j, i, k) + chrono["doubles:particles:5"].stop(); + // TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv]; P4 + chrono["doubles:particles:6"].start(); + DGEMM_PARTICLES(TBphh, VCAph) + REORDER(j, k, i) + chrono["doubles:particles:6"].stop(); + } + chrono["doubles:particles"].stop(); + +#undef REORDER +#undef DGEMM_HOLES +#undef DGEMM_PARTICLES +#undef _IJK_ +#else + for (size_t k = 0; k < No; k++) + for (size_t j = 0; j < No; j++) + for (size_t i = 0; i < No; i++){ + const size_t ijk = i + j*No + k*NoNo + , jk = j + k*No + ; + Tijk[ijk] = 0.0; // :important + // HOLE DIAGRAMS: TABHH and VHHHA + for (size_t L = 0; L < No; L++){ + // t[abLj] * V[Lcik] 
H1 + // t[baLi] * V[Lcjk] H0 TODO: conjugate T for complex + Tijk[ijk] -= TABhh[L + j*No] * VhhhC[i + k*No + L*NoNo]; + Tijk[ijk] -= TABhh[i + L*No] * VhhhC[j + k*No + L*NoNo]; + + // t[acLk] * V[Lbij] H5 + // t[caLi] * V[Lbkj] H3 + Tijk[ijk] -= TAChh[L + k*No] * VhhhB[i + j*No + L*NoNo]; + Tijk[ijk] -= TAChh[i + L*No] * VhhhB[k + j*No + L*NoNo]; + + // t[bcLk] * V[Laji] H2 + // t[cbLj] * V[Laki] H4 + Tijk[ijk] -= TBChh[L + k*No] * VhhhA[j + i*No + L*NoNo]; + Tijk[ijk] -= TBChh[j + L*No] * VhhhA[k + i*No + L*NoNo]; + } + // PARTILCE DIAGRAMS: TAPHH and VABPH + for (size_t E = 0; E < Nv; E++) { + // t[aEij] * V[bcEk] P0 + // t[aEik] * V[cbEj] P3 // TODO: CHECK THIS ONE, I DONT KNOW + Tijk[ijk] += TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv]; + Tijk[ijk] += TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv]; + + // t[cEki] * V[abEj] P5 + // t[cEkj] * V[baEi] P2 + Tijk[ijk] += TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv]; + Tijk[ijk] += TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv]; + + // t[bEji] * V[acEk] P1 + // t[bEjk] * V[caEi] P4 + Tijk[ijk] += TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv]; + Tijk[ijk] += TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv]; + } + + } +#endif +} + + +// HELPER FUNCTIONS ====================================================={{{1 + +using ABCTuples = std::vector; + +ABCTuples getTuplesList(size_t Nv) { + const size_t n = Nv * (Nv + 1) * (Nv + 2) / 6 - Nv; + ABCTuples result(n); + size_t u(0); + + for (size_t a(0); a < Nv; a++) + for (size_t b(a); b < Nv; b++) + for (size_t c(b); c < Nv; c++){ + if ( a == b && b == c ) continue; + result[u++] = {a, b, c}; + } + + return result; + +} + + +std::pair +getABCRange(size_t np, size_t rank, ABCTuples const& tuplesList) { + + std::vector n_tuples_per_rank(np, tuplesList.size()/np); + const size_t + // how many valid tuples should we still verteilen to nodes + // since the number of tuples is not divisible by the number of nodes + nRoundRobin = tuplesList.size() % np + // every node must have the sanme amount of tuples in order for the + // other nodes to receive and send somewhere, therefore + // some nodes will get extra tuples but that are dummy tuples + , nExtraInvalid = (np - nRoundRobin) % np + ; + + if (nRoundRobin) for (int i = 0; i < np; i++) n_tuples_per_rank[i]++; + +#if defined(TODO) + assert( tuplesList.size() + == + ( std::accumulate(n_tuples_per_rank.begin(), + n_tuples_per_rank.end(), + 0) + + nExtraInvalid + )); +#endif + + WITH_RANK << "nRoundRobin = " << nRoundRobin << "\n"; + WITH_RANK << "nExtraInvalid = " << nExtraInvalid << "\n"; + WITH_RANK << "ntuples = " << n_tuples_per_rank[rank] << "\n"; + + auto const& it = n_tuples_per_rank.begin(); + + return + { std::accumulate(it, it + rank , 0) + , std::accumulate(it, it + rank + 1, 0) + }; + +} + + +SliceUnion& +unionByName(std::vector const& unions, Slice::Name name) { + const auto sliceUnionIt + = std::find_if(unions.begin(), unions.end(), + [&name](SliceUnion const* s) { + return name == s->name; + }); + if (sliceUnionIt == unions.end()) + throw std::domain_error("SliceUnion not found!"); + return **sliceUnionIt; +} + +#if defined(TRIPLES_WORKLOAD_DUMP) + struct WorkloadEntry { + size_t rank, iteration; + ABCTuple abc; + size_t name; + size_t send; + size_t recv; + size_t doubles; + size_t barrier; + size_t allgather; + size_t unwrap; + + static + size_t findRecv(SliceUnion const& u) { + return std::count_if(u.slices.begin(), u.slices.end(), + [&u](Slice const& slice) { + return slice.info.state == Slice::Dispatched; + }); + } + static + size_t findSend( Slice::Database 
const& db + , SliceUnion const& u + , size_t rank) { + using Element = Slice::LocalDatabaseElement; + return std::count_if(db.begin(), db.end(), + [&u, rank](Element const& el) { + return el.name == u.name + && el.info.from.rank == rank + // this should work since the database + // has the non-updated Slice::Info + && el.info.state == Slice::Fetch + ; + }); + } + }; +#endif + +// MAIN ALGORITHM ======================================================{{{1 +void cc4s::PerturbativeTriplesAbcijk::run(){ + + const int np = cc4s::Cc4s::world->np; + const int rank = cc4s::Cc4s::world->rank; + MPI_Comm universe = cc4s::Cc4s::world->comm; + + // Timings in seconds ================================================{{{1 + atrip::Timings chrono{}; + + // Get the distributed ctf tensor data + CTF::Tensor<> *ei(getTensorArgument("HoleEigenEnergies")) + , *ea(getTensorArgument("ParticleEigenEnergies")) + , *Tph(getTensorArgument("CcsdSinglesAmplitudes")) + , *Tpphh(getTensorArgument("CcsdDoublesAmplitudes")) + , *Vpphh(getTensorArgument("PPHHCoulombIntegrals")) + , *Vhhhp(getTensorArgument("HHHPCoulombIntegrals")) + , *Vppph(getTensorArgument("PPPHCoulombIntegrals")) + ; + + No = ei->lens[0]; + Nv = ea->lens[0]; + LOG(0,"NEW_TRIPLES") << "No: " << No << "\n"; + LOG(0,"NEW_TRIPLES") << "Nv: " << Nv << "\n"; + + // allocate the three scratches, see piecuch + std::vector Tijk(No*No*No) // doubles only (see piecuch) + , Zijk(No*No*No) // singles + doubles (see piecuch) + // we need local copies of the following tensors on every + // rank + , epsi(No) + , epsa(Nv) + , Tai(No * Nv) + ; + + ei->read_all(epsi.data()); + ea->read_all(epsa.data()); + Tph->read_all(Tai.data()); + + // COMMUNICATOR CONSTRUCTION ========================================={{{1 + // + // Construct a new communicator living only on a single rank + int child_size = 1 + , child_rank + ; + const + int color = rank / child_size + , crank = rank % child_size + ; + MPI_Comm child_comm; + if (np == 1) { + child_comm = universe; + } else { + MPI_Comm_split(cc4s::Cc4s::world->comm, color, crank, &child_comm); + MPI_Comm_rank(child_comm, &child_rank); + MPI_Comm_size(child_comm, &child_size); + //CTF::World child_world(child_comm); + } + + + chrono["nv-slices"].start(); + // BUILD SLICES PARAMETRIZED BY NV ==================================={{{1 + LOG(0,"NEW_TRIPLES") << "BUILD NV-SLICES\n"; + TAPHH taphh(*Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); + HHHA hhha(*Vhhhp, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); + chrono["nv-slices"].stop(); + + chrono["nv-nv-slices"].start(); + // BUILD SLICES PARAMETRIZED BY NV x NV =============================={{{1 + LOG(0,"NEW_TRIPLES") << "BUILD NV x NV-SLICES\n"; + ABPH abph(*Vppph, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); + ABHH abhh(*Vpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); + TABHH tabhh(*Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); + chrono["nv-nv-slices"].stop(); + + // all tensors + std::vector< SliceUnion* > unions = {&taphh, &hhha, &abph, &abhh, &tabhh}; + + //CONSTRUCT TUPLE LIST ==============================================={{{1 + LOG(0,"NEW_TRIPLES") << "BUILD TUPLE LIST\n"; + const auto tuplesList = std::move(getTuplesList(Nv)); + WITH_RANK << "tupList.size() = " << tuplesList.size() << "\n"; + + // GET ABC INDEX RANGE FOR RANK ======================================{{{1 + auto abcIndex = getABCRange(np, rank, tuplesList); + size_t nIterations = abcIndex.second - abcIndex.first; + +#ifdef 
TRIPLES_BENCHMARK + { const size_t maxIterations = getIntegerArgument("maxIterations", 0); + if (maxIterations != 0) { + abcIndex.second = abcIndex.first + maxIterations % (nIterations + 1); + nIterations = maxIterations % (nIterations + 1); + } + } +#endif + + WITH_RANK << "abcIndex = " << pretty_print(abcIndex) << "\n"; + LOG(0,"NEW_TRIPLES") << "#iterations: " + << nIterations << "\n"; + + // first abc + const ABCTuple firstAbc = tuplesList[abcIndex.first]; + + + double energy(0.); + + + auto const isFakeTuple + = [&tuplesList](size_t const i) { return i >= tuplesList.size(); }; + + + auto communicateDatabase + = [ &unions + , np + , &chrono + ] (ABCTuple const& abc, MPI_Comm const& c) -> Slice::Database { + + chrono["db:comm:type:do"].start(); + auto MPI_LDB_ELEMENT = Slice::mpi::localDatabaseElement(); + chrono["db:comm:type:do"].stop(); + + chrono["db:comm:ldb"].start(); + Slice::LocalDatabase ldb; + + for (auto const& tensor: unions) { + auto const& tensorDb = tensor->buildLocalDatabase(abc); + ldb.insert(ldb.end(), tensorDb.begin(), tensorDb.end()); + } + chrono["db:comm:ldb"].stop(); + + Slice::Database db(np * ldb.size(), ldb[0]); + + chrono["oneshot-db:comm:allgather"].start(); + chrono["db:comm:allgather"].start(); + MPI_Allgather( ldb.data() + , ldb.size() + , MPI_LDB_ELEMENT + , db.data() + , ldb.size() + , MPI_LDB_ELEMENT + , c); + chrono["db:comm:allgather"].stop(); + chrono["oneshot-db:comm:allgather"].stop(); + + chrono["db:comm:type:free"].start(); + MPI_Type_free(&MPI_LDB_ELEMENT); + chrono["db:comm:type:free"].stop(); + + return db; + }; + + auto doIOPhase + = [&unions, &rank, &np, &universe, &chrono] (Slice::Database const& db) { + + const size_t localDBLength = db.size() / np; + + size_t sendTag = 0 + , recvTag = rank * localDBLength + ; + + // RECIEVE PHASE ====================================================== + { + // At this point, we have already send to everyone that fits + auto const& begin = &db[rank * localDBLength] + , end = begin + localDBLength + ; + for (auto it = begin; it != end; ++it) { + recvTag++; + auto const& el = *it; + auto& u = unionByName(unions, el.name); + + WITH_DBG std::cout + << rank << ":r" + << "♯" << recvTag << " =>" + << " «n" << el.name + << ", t" << el.info.type + << ", s" << el.info.state + << "»" + << " ⊙ {" << rank << "⇐" << el.info.from.rank + << ", " + << el.info.from.source << "}" + << " ∴ {" << el.info.tuple[0] + << ", " + << el.info.tuple[1] + << "}" + << "\n" + ; + + chrono["db:io:recv"].start(); + u.receive(el.info, recvTag); + chrono["db:io:recv"].stop(); + + } // recv + } + + // SEND PHASE ========================================================= + for (size_t otherRank = 0; otherRank" + << " «n" << el.name + << ", t" << el.info.type + << ", s" << el.info.state + << "»" + << " ⊙ {" << el.info.from.rank << "⇒" << otherRank + << ", " + << el.info.from.source << "}" + << " ∴ {" << el.info.tuple[0] + << ", " + << el.info.tuple[1] + << "}" + << "\n" + ; + + chrono["db:io:send"].start(); + u.send(otherRank, el.info, sendTag); + chrono["db:io:send"].stop(); + + } // send phase + + } // otherRank + + + }; + +#if defined(HAVE_OCD) || defined(TRIPLES_PRINT_TUPLES) + std::map tupleEnergies; +#endif + + const double doublesFlops + = double(No) + ,* double(No) + ,* double(No) + ,* (double(No) + double(Nv)) + ,* 2 + ,* 6 + / 1e9 + ; + + // START MAIN LOOP ======================================================{{{1 + + Slice::Database db; +#if defined(TRIPLES_WORKLOAD_DUMP) + std::vector> workloadDB; +#endif + + for ( size_t i = 
abcIndex.first, iteration = 1 + ; i < abcIndex.second + ; i++, iteration++ + ) { + chrono["iterations"].start(); + + // check overhead from chrono over all iterations + chrono["start:stop"].start(); chrono["start:stop"].stop(); + + // check overhead of doing a barrier at the beginning + chrono["oneshot-mpi:barrier"].start(); + chrono["mpi:barrier"].start(); + // TODO: REMOVE + if (getIntegerArgument("barrier", 1) == 1) + MPI_Barrier(universe); + chrono["mpi:barrier"].stop(); + chrono["oneshot-mpi:barrier"].stop(); + + if (iteration % getIntegerArgument("iterationMod", 100) == 0) { + LOG(0,"NEW_TRIPLES") + << "iteration " << iteration + << " [" << 100 * iteration / nIterations << "%]" + << " (" << doublesFlops * iteration / chrono["doubles"].count() + << "GF)" + << " (" << doublesFlops * iteration / chrono["iterations"].count() + << "GF)" + << " ===========================\n"; + + // PRINT TIMINGS + for (auto const& pair: chrono) + LOG(1, " ") << pair.first << " :: " + << pair.second.count() + << std::endl; + + } + + const ABCTuple abc = isFakeTuple(i) + ? tuplesList[tuplesList.size() - 1] + : tuplesList[i] + , *abcNext = i == (abcIndex.second - 1) + ? nullptr + : isFakeTuple(i + 1) + ? &tuplesList[tuplesList.size() - 1] + : &tuplesList[i + 1] + ; + + chrono["with_rank"].start(); + WITH_RANK << " :it " << iteration + << " :abc " << pretty_print(abc) + << " :abcN " + << (abcNext ? pretty_print(*abcNext) : "None") + << "\n"; + chrono["with_rank"].stop(); + + + // COMM FIRST DATABASE ================================================{{{1 + if (i == abcIndex.first) { + WITH_RANK << "__first__:first database ............ \n"; + const auto __db = communicateDatabase(abc, universe); + WITH_RANK << "__first__:first database communicated \n"; + WITH_RANK << "__first__:first database io phase \n"; + doIOPhase(__db); + WITH_RANK << "__first__:first database io phase DONE\n"; + WITH_RANK << "__first__::::Unwrapping all slices for first database\n"; + for (auto& u: unions) u->unwrapAll(abc); + WITH_RANK << "__first__::::Unwrapping all slices for first database DONE\n"; + MPI_Barrier(universe); + } + + // COMM NEXT DATABASE ================================================={{{1 + if (abcNext) { + WITH_RANK << "__comm__:" << iteration << "th communicating database\n"; + chrono["db:comm"].start(); + //const auto db = communicateDatabase(*abcNext, universe); + db = communicateDatabase(*abcNext, universe); + chrono["db:comm"].stop(); + chrono["db:io"].start(); + doIOPhase(db); + chrono["db:io"].stop(); + WITH_RANK << "__comm__:" << iteration << "th database io phase DONE\n"; + } + + // COMPUTE DOUBLES ===================================================={{{1 + OCD_Barrier(universe); + if (!isFakeTuple(i)) { + WITH_RANK << iteration << "-th doubles\n"; + WITH_CHRONO(chrono["oneshot-unwrap"], + WITH_CHRONO(chrono["unwrap"], + WITH_CHRONO(chrono["unwrap:doubles"], + for (auto& u: decltype(unions){&abph, &hhha, &taphh, &tabhh}) { + u->unwrapAll(abc); + } + ))) + chrono["oneshot-doubles"].start(); + chrono["doubles"].start(); + // TODO: REMOVE + for (size_t __i=0; __i < getIntegerArgument("doublesLoops", 1); __i++) + doublesContribution( abc, (size_t)No, (size_t)Nv + // -- VABCI + , abph.unwrapSlice(Slice::AB, abc) + , abph.unwrapSlice(Slice::AC, abc) + , abph.unwrapSlice(Slice::BC, abc) + , abph.unwrapSlice(Slice::BA, abc) + , abph.unwrapSlice(Slice::CA, abc) + , abph.unwrapSlice(Slice::CB, abc) + // -- VHHHA + , hhha.unwrapSlice(Slice::A, abc) + , hhha.unwrapSlice(Slice::B, abc) + , hhha.unwrapSlice(Slice::C, abc) 
+ // -- TA + , taphh.unwrapSlice(Slice::A, abc) + , taphh.unwrapSlice(Slice::B, abc) + , taphh.unwrapSlice(Slice::C, abc) + // -- TABIJ + , tabhh.unwrapSlice(Slice::AB, abc) + , tabhh.unwrapSlice(Slice::AC, abc) + , tabhh.unwrapSlice(Slice::BC, abc) + // -- TIJK + , Tijk.data() + , chrono + ); + WITH_RANK << iteration << "-th doubles done\n"; + chrono["doubles"].stop(); + chrono["oneshot-doubles"].stop(); + } + + // COMPUTE SINGLES =================================================== {{{1 + OCD_Barrier(universe); + if (!isFakeTuple(i)) { + WITH_CHRONO(chrono["oneshot-unwrap"], + WITH_CHRONO(chrono["unwrap"], + WITH_CHRONO(chrono["unwrap:singles"], + abhh.unwrapAll(abc); + ))) + chrono["reorder"].start(); + for (size_t I(0); I < Zijk.size(); I++) Zijk[I] = Tijk[I]; + chrono["reorder"].stop(); + chrono["singles"].start(); + singlesContribution( abc + , Tai.data() + , abhh.unwrapSlice(Slice::AB, abc) + , abhh.unwrapSlice(Slice::AC, abc) + , abhh.unwrapSlice(Slice::BC, abc) + , Zijk.data()); + chrono["singles"].stop(); + } + + + // COMPUTE ENERGY ==================================================== {{{1 + if (!isFakeTuple(i)) { + double tupleEnergy(0.); + + int distinct(0); + if (abc[0] == abc[1]) distinct++; + if (abc[1] == abc[2]) distinct--; + const double epsabc(epsa[abc[0]] + epsa[abc[1]] + epsa[abc[2]]); + + chrono["energy"].start(); + if ( distinct == 0) + tupleEnergy = getEnergyDistinct(epsabc, epsi, Tijk, Zijk); + else + tupleEnergy = getEnergySame(epsabc, epsi, Tijk, Zijk); + chrono["energy"].stop(); + +#if defined(HAVE_OCD) || defined(TRIPLES_PRINT_TUPLES) + tupleEnergies[abc] = tupleEnergy; +#endif + + energy += tupleEnergy; + +#ifdef HAVE_OCD + auto const print_slices + = [](ABCTuple const& abc, ABCTuple const& want, SliceUnion& u) { + if (abc != want) return; + + for (auto type: u.sliceTypes) { + auto const& ptr = u.unwrapSlice(type, abc); + auto const& slice = Slice::findByTypeAbc(u.slices, type, abc); + WITH_RANK << "__print_slice__:n" << u.name << " " + << pretty_print(abc) << " " + << pretty_print(slice.info) + ; + for (size_t i = 0; i < 20; i++) std::cout << ptr[i] << ", "; + std::cout << std::endl; + } + }; +#endif + + if (isFakeTuple(i)) { + // fake iterations should also unwrap whatever they got + WITH_RANK << iteration + << "th unwrapping because of fake in " + << i << "\n"; + for (auto& u: unions) u->unwrapAll(abc); + } + +#ifdef HAVE_OCD + for (auto const& u: unions) { + WITH_RANK << "__dups__:" + << iteration + << "-th n" << u->name << " checking duplicates\n"; + u->checkForDuplicates(); + } +#endif + + + // WORKLOAD DUMP ====================================================={{{1 +#if defined(TRIPLES_WORKLOAD_DUMP) + { + chrono["workload"].start(); + std::vector entries; + for (auto const& u: unions) { + entries.push_back + ({ rank + , iteration + , abc + , (size_t)u->name + , WorkloadEntry::findSend(db, *u, rank) + , WorkloadEntry::findRecv(*u) + , (size_t)std::round(1e3*chrono["oneshot-doubles"].count()) + , (size_t)std::round(1e3*chrono["oneshot-mpi:barrier"].count()) + , (size_t)std::round(1e3*chrono["oneshot-db:comm:allgather"].count()) + , (size_t)std::round(1e3*chrono["oneshot-unwrap"].count()) + }); + } + workloadDB.push_back(entries); + chrono["workload"].stop(); + } +#endif + + // CLEANUP UNIONS ===================================================={{{1 + OCD_Barrier(universe); + if (abcNext) { + chrono["gc"].start(); + WITH_RANK << "__gc__:" << iteration << "-th cleaning up.......\n"; + for (auto& u: unions) { + + u->unwrapAll(abc); + WITH_RANK << "__gc__:n" << 
u->name << " :it " << iteration + << " :abc " << pretty_print(abc) + << " :abcN " << pretty_print(*abcNext) + << "\n"; + for (auto const& slice: u->slices) + WITH_RANK << "__gc__:guts:" << slice.info << "\n"; + u->clearUnusedSlicesForNext(*abcNext); + + WITH_RANK << "__gc__: checking validity\n"; + +#ifdef HAVE_OCD + // check for validity of the slices + for (auto type: u->sliceTypes) { + auto tuple = Slice::subtupleBySlice(abc, type); + for (auto& slice: u->slices) { + if ( slice.info.type == type + && slice.info.tuple == tuple + && slice.isDirectlyFetchable() + ) { + if (slice.info.state == Slice::Dispatched) + throw std::domain_error( "This slice should not be undispatched! " + + pretty_print(slice.info)); + } + } + } +#endif + + + } + chrono["gc"].stop(); + } + + WITH_RANK << iteration << "-th cleaning up....... DONE\n"; + } + + // CLEAN CHRONO ======================================================{{{1 + chrono["iterations"].stop(); + { // TODO: REMOVEME + chrono["oneshot-doubles"].clear(); + chrono["oneshot-mpi:barrier"].clear(); + chrono["oneshot-db:comm:allgather"].clear(); + chrono["oneshot-unwrap"].clear(); + } + + // ITERATION END ====================================================={{{1 + } // END OF MAIN LOOP + + MPI_Barrier(universe); + + // PRINT TUPLES ========================================================={{{1 +#if defined(HAVE_OCD) || defined(TRIPLES_PRINT_TUPLES) + LOG(0,"NEW_TRIPLES") << "tuple energies" << "\n"; + for (size_t i = 0; i < np; i++) { + MPI_Barrier(universe); + for (auto const& pair: tupleEnergies) { + if (i == rank) + std::cout << pair.first[0] + << " " << pair.first[1] + << " " << pair.first[2] + << std::setprecision(15) << std::setw(23) + << " tupleEnergy: " << pair.second + << "\n" + ; + } + } +#endif + + // COMMUNICATE THE ENERGIES ============================================={{{1 + LOG(0,"NEW_TRIPLES") << "COMMUNICATING ENERGIES \n"; + double globalEnergy = 0; + MPI_Reduce(&energy, &globalEnergy, 1, MPI_DOUBLE, MPI_SUM, 0, universe); + + WITH_RANK << "local energy " << energy << "\n"; + LOG(0,"LOOP FINISHED, energy") + << std::setprecision(15) << std::setw(23) + << globalEnergy << std::endl; + + // PRINT TIMINGS {{{1 + for (auto const& pair: chrono) + LOG(0,"atrip:chrono") << pair.first << " " + << pair.second.count() << std::endl; + + + LOG(0, "atrip:flops") + << nIterations * doublesFlops / chrono["doubles"].count() << "\n"; + + // WORKLOAD DUMP ====================================================={{{1 +#if defined(TRIPLES_WORKLOAD_DUMP) + { + constexpr char* _workload_file = "workload.triples.bin"; + LOG(0,"NEW_TRIPLES") << "Writing out WORKLOAD to " + << _workload_file << "\n"; + LOG(0,"NEW_TRIPLES") << "# workload entries = " + << workloadDB.size() << "\n"; + MPI_File handle; + MPI_Status status; + MPI_File_open(universe, + _workload_file, + MPI_MODE_CREATE | MPI_MODE_WRONLY, + MPI_INFO_NULL, + &handle); + + for (int _it = 1; _it <= nIterations; _it++) { + MPI_Offset const + chunk_size = sizeof(WorkloadEntry) * workloadDB[0].size() + , main_offset = chunk_size + ,* (rank * nIterations + _it - 1) + ; + + auto const& entries = workloadDB[_it - 1]; + for (size_t i = 0; i < entries.size(); i++) { + MPI_Offset const + local_offset = main_offset + i * sizeof(WorkloadEntry); + LOG(0,"NEW_TRIPLES") << entries[i].iteration + << "," + << entries[i].name + << "," + << entries[i].send + << "," + << entries[i].recv + << "," + << entries[i].doubles + << "," + << entries[i].barrier + << "\n" + ; + MPI_File_write_at(handle, + local_offset, + &entries[i], + 
+                            sizeof(WorkloadEntry),
+                            MPI_CHAR,
+                            &status);
+      }
+    } // iterations
+    MPI_Barrier(universe);
+    MPI_File_close(&handle);
+  }
+#endif
+
+
+}
+ #+end_src
diff --git a/bench/config.mk b/bench/config.mk
index e7a0b5e..4e09f2f 100644
--- a/bench/config.mk
+++ b/bench/config.mk
@@ -1,5 +1,7 @@
 BENCH_SOURCES = $(wildcard $(ATRIP_ROOT)/bench/test*.cxx)
 BENCH_TARGETS = $(patsubst %.cxx,%,$(BENCH_SOURCES))
+$(BENCH_TARGETS): CXXFLAGS += -I.
+$(BENCH_TARGETS): CXXFLAGS += -fopenmp
 
 bench-clean:
 	-rm -v $(BENCH_TARGETS)
diff --git a/etc/config/gcc.mk b/etc/config/gcc.mk
index 4086fe4..7b14a3e 100644
--- a/etc/config/gcc.mk
+++ b/etc/config/gcc.mk
@@ -1,8 +1,10 @@
-include etc/ctf.mk
+include etc/make/ctf.mk
 
 CXX = mpic++
 
 CXXFLAGS += -I$(ATRIP_ROOT)/include
 CXXFLAGS += -I$(CTF_INCLUDE_PATH)
 
-LDFLAGS += -L$(CTF_BUILD_PATH) -lctf
+LDFLAGS += -Wl,-Bstatic -L$(CTF_BUILD_PATH)/lib -lctf
+LDFLAGS += -fopenmp -L/usr/lib -lscalapack -L/opt/OpenBLAS/lib -lopenblas
+LDFLAGS += -Wl,-Bdynamic
diff --git a/etc/make/ctf.mk b/etc/make/ctf.mk
index 1142943..c3cf285 100644
--- a/etc/make/ctf.mk
+++ b/etc/make/ctf.mk
@@ -1,2 +1,2 @@
-include ./etc/ctf_vars.mk
-include ./etc/ctf_rules.mk
+include ./etc/make/ctf_vars.mk
+include ./etc/make/ctf_rules.mk
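The =Timer= and =WITH_CHRONO= helpers introduced by the Utils header above are used throughout the main loop to accumulate per-phase wall-clock timings under string keys (=chrono["doubles"]=, =chrono["db:comm"]=, and so on). The following is a minimal, self-contained sketch of that usage pattern; =work()=, =main()= and the phase names are illustrative only, and the =Timer= here is a trimmed copy of the one added in this patch (with its duration explicitly zero-initialized).

#+begin_src c++
#include <chrono>
#include <iostream>
#include <map>
#include <string>

// Trimmed copy of the Timer/WITH_CHRONO helpers from the Utils header.
#define WITH_CHRONO(__chrono, ...) \
  __chrono.start(); __VA_ARGS__ __chrono.stop();

struct Timer {
  using Clock = std::chrono::high_resolution_clock;
  std::chrono::duration<double> duration{};  // accumulated seconds
  Clock::time_point _start;
  void start() noexcept { _start = Clock::now(); }
  void stop() noexcept { duration += Clock::now() - _start; }
  double count() const noexcept { return duration.count(); }
};
using Timings = std::map<std::string, Timer>;

// Placeholder for any region one wants to time.
void work() { volatile double x = 0; for (int i = 0; i < 1000000; ++i) x += i; }

int main() {
  Timings chrono;

  // Either start/stop a phase explicitly ...
  chrono["phase:a"].start();
  work();
  chrono["phase:a"].stop();

  // ... or wrap the region with the macro; note the trailing ';'
  // inside the macro argument, as in the main loop of the patch.
  WITH_CHRONO(chrono["phase:b"], work();)

  // Print accumulated timings, as done at the end of run().
  for (auto const& pair : chrono)
    std::cout << pair.first << " :: " << pair.second.count() << " s\n";
}
#+end_src

Repeated =start()=/=stop()= pairs on the same key accumulate into one duration, which is why the main loop can time every iteration of a phase under a single entry and report the totals at the end.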