From 7029ccda2a562850fc3a3ce4022bb941012d1e0f Mon Sep 17 00:00:00 2001
From: Alejandro Gallo <aamsgallo@gmail.com>
Date: Thu, 2 Sep 2021 18:45:38 +0200
Subject: [PATCH] Simple update

---
 Makefile              |    2 +-
 README.org            | 1882 ++++++++++++++++++++++++++---------------
 bench/test_main.cxx   |    3 +
 config.el             |   25 +-
 etc/config/gcc.mk     |    3 +-
 etc/make/ctf_vars.mk  |    4 +-
 etc/make/emacs.mk     |    2 +-
 etc/nix/scalapack.nix |   56 ++
 shell.nix             |   19 +
 9 files changed, 1318 insertions(+), 678 deletions(-)
 create mode 100644 etc/nix/scalapack.nix
diff --git a/Makefile b/Makefile
index 715056e..5c12e14 100644
--- a/Makefile
+++ b/Makefile
@@ -9,7 +9,7 @@ include ./bench/config.mk
 
 MAIN = README.org
 
-$(SOURCES_FILE): $(MAIN)
+$(SOURCES_FILE): $(MAIN) config.el
 	echo -n "SOURCES = " > $@
 	$(EMACS) --eval '(atrip-print-sources)' >> $@
 
diff --git a/README.org b/README.org
index c40e20f..c03218e 100644
--- a/README.org
+++ b/README.org
@@ -17,12 +17,11 @@ The algorithm uses two main data types, the =Slice= and the
 #include <vector>
 #include <mpi.h>
 
+#include <atrip/Tuples.hpp>
 #include <atrip/Utils.hpp>
 
 namespace atrip {
 
-using ABCTuple = std::array<size_t, 3>;
-using PartialTuple = std::array<size_t, 2>;
 
 struct Slice {
 
@@ -497,12 +496,10 @@ std::ostream& operator<<(std::ostream& out, Slice::Info const& i) {
 #include <map>
 #include <chrono>
 
+#include <ctf.hpp>
+
 namespace atrip {
 
-  struct Atrip {
-    static size_t rank;
-  };
-  size_t Atrip::rank;
 
   template <typename T>
   std::string pretty_print(T&& value) {
@@ -531,189 +528,75 @@ namespace atrip {
 
 #+end_src
 
-** Include header
-
-#+begin_src c++ :tangle (atrip-main-h)
+** The rank mapping
+#+begin_src c++ :tangle (atrip-utils-h)
 #pragma once
 
-#define TRIPLES_BENCHMARK
-#define TRIPLES_DEBUG 1
-//#define TRIPLES_WORKLOAD_DUMP
-#define TRIPLES_USE_DGEMM
-//#define TRIPLES_PRINT_TUPLES
+#include <vector>
+#include <algorithm>
 
-#if TRIPLES_DEBUG == 4
-#  pragma message("WARNING: You have OCD debugging ABC triples "\
-                  "expect GB of output and consult your therapist")
-#  include <dbg.h>
-#  define HAVE_OCD
-#  define OCD_Barrier(com) MPI_Barrier(com)
-#  define WITH_OCD
-#  define WITH_ROOT if (atrip::Atrip::rank == 0)
-#  define WITH_SPECIAL(r) if (atrip::Atrip::rank == r)
-#  define WITH_RANK std::cout << atrip::Atrip::rank << ": "
-#  define WITH_CRAZY_DEBUG
-#  define WITH_DBG
-#  define DBG(...) dbg(__VA_ARGS__)
-#elif TRIPLES_DEBUG == 3
-#  pragma message("WARNING: You have crazy debugging ABC triples,"\
-                  " expect GB of output")
-#  include <dbg.h>
-#  define OCD_Barrier(com)
-#  define WITH_OCD if (false)
-#  define WITH_ROOT if (atrip::Atrip::rank == 0)
-#  define WITH_SPECIAL(r) if (atrip::Atrip::rank == r)
-#  define WITH_RANK std::cout << atrip::Atrip::rank << ": "
-#  define WITH_CRAZY_DEBUG
-#  define WITH_DBG
-#  define DBG(...) dbg(__VA_ARGS__)
-#elif TRIPLES_DEBUG == 2
-#  pragma message("WARNING: You have some debugging info for ABC triples")
-#  include <dbg.h>
-#  define OCD_Barrier(com)
-#  define WITH_OCD if (false)
-#  define WITH_ROOT if (atrip::Atrip::rank == 0)
-#  define WITH_SPECIAL(r) if (atrip::Atrip::rank == r)
-#  define WITH_RANK std::cout << atrip::Atrip::rank << ": "
-#  define WITH_CRAZY_DEBUG if (false)
-#  define WITH_DBG
-#  define DBG(...) dbg(__VA_ARGS__)
-#elif TRIPLES_DEBUG == 1
-#  define OCD_Barrier(com)
-#  define WITH_OCD if (false)
-#  define WITH_ROOT if (false)
-#  define WITH_SPECIAL(r) if (false)
-#  define WITH_RANK if (false) std::cout << atrip::Atrip::rank << ": "
-#  define WITH_DBG if (false)
-#  define WITH_CRAZY_DEBUG if (false)
-#  define DBG(...)
-#else
-#  error("TRIPLES_DEBUG is not defined!")
-#endif
-
-#include <atrip/Utils.hpp>
 #include <atrip/Slice.hpp>
 
-
-#+end_src
-
-
-** Todo                                                            :noexport:
- #+begin_src c++  :tangle src/main.hpp
-#include <algorithms/PerturbativeTriplesAbcijk.hpp>
-#include <math/MathFunctions.hpp>
-#include <tcc/DryTensor.hpp>
-#include <util/Log.hpp>
-#include <util/Exception.hpp>
-#include <Cc4s.hpp>
-#include <ctf.hpp>
-#include <util/MpiCommunicator.hpp>
-
-using namespace cc4s;
-
-#include <sstream>
-#include <algorithm>
-#include <numeric>
-#include <array>
-#include <vector>
-#include <map>
-#include <memory>
-#include <set>
-#include <chrono>
-
 namespace atrip {
-  extern "C" {
-    void dgemm_(
-      const char *transa,
-      const char *transb,
-      const int *m,
-      const int *n,
-      const int *k,
-      double *alpha,
-      const double *A,
-      const int *lda,
-      const double *B,
-      const int *ldb,
-      double *beta,
-      double *C,
-      const int *ldc
-    );
-  }
-}
+  struct RankMap {
+
+    std::vector<size_t> const lengths;
+    size_t const np, size;
+
+    RankMap(std::vector<size_t> lens, size_t np_)
+      : lengths(lens)
+      , np(np_)
+      , size(std::accumulate(lengths.begin(), lengths.end(),
+                            1, std::multiplies<size_t>()))
+    { assert(lengths.size() <= 2); }
+
+    size_t find(Slice::Location const& p) const noexcept {
+      return p.source * np + p.rank;
+    }
+
+    size_t nSources() const noexcept {
+      return size / np + size_t(size % np != 0);
+    }
 
 
+    bool isPaddingRank(size_t rank) const noexcept {
+      return size % np == 0
+          ? false
+          : rank > (size % np - 1)
+          ;
+    }
 
-namespace atrip {
+    bool isSourcePadding(size_t rank, size_t source) const noexcept {
+      return source == nSources() && isPaddingRank(rank);
+    }
 
+    Slice::Location
+    find(ABCTuple const& abc, Slice::Type sliceType) const noexcept {
+      // tuple = {11, 8} when abc = {11, 8, 9} and sliceType = AB
+      const auto tuple = Slice::subtupleBySlice(abc, sliceType);
+
+      const size_t index
+        = tuple[0]
+        + tuple[1] * (lengths.size() > 1 ? lengths[0] : 0)
+        ;
+
+      return
+        { index % np
+        , index / np
+        };
+    }
+
+  };
 
 }
-
-
-ALGORITHM_REGISTRAR_DEFINITION(PerturbativeTriplesAbcijk)
-
-
-
-// SLICE MAIN DEFININITION =============================================={{{1
 #+end_src
 
+** The slice union
+#+begin_src c++ :tangle (atrip-slice-union-h)
+#pragma once
+#include <atrip/Slice.hpp>
 
-#+begin_src c++ :tangle src/main.hpp
-
-  // RANKMAP =============================================================={{{1
-
-struct RankMap {
-
-  std::vector<size_t> const lengths;
-  size_t const np, size;
-
-  RankMap(std::vector<size_t> lens, size_t np_)
-    : lengths(lens)
-    , np(np_)
-    , size(std::accumulate(lengths.begin(), lengths.end(),
-                           1, std::multiplies<size_t>()))
-  { assert(lengths.size() <= 2); }
-
-  size_t find(Slice::Location const& p) const noexcept {
-    return p.source * np + p.rank;
-  }
-
-  size_t nSources() const noexcept {
-    return size / np + size_t(size % np != 0);
-  }
-
-
-  bool isPaddingRank(size_t rank) const noexcept {
-    return size % np == 0
-         ? false
-         : rank > (size % np - 1)
-         ;
-  }
-
-  bool isSourcePadding(size_t rank, size_t source) const noexcept {
-    return source == nSources() && isPaddingRank(rank);
-  }
-
-  Slice::Location
-  find(ABCTuple const& abc, Slice::Type sliceType) const noexcept {
-    // tuple = {11, 8} when abc = {11, 8, 9} and sliceType = AB
-    const auto tuple = Slice::subtupleBySlice(abc, sliceType);
-
-    const size_t index
-      = tuple[0]
-      + tuple[1] * (lengths.size() > 1 ? lengths[0] : 0)
-      ;
-
-    return
-      { index % np
-      , index / np
-      };
-  }
-
-};
-
-
-
-  // SLICEUNION ==========================================================={{{1
+namespace atrip {
 
   struct SliceUnion {
     using F = double;
@@ -1237,7 +1120,104 @@ struct RankMap {
 
   };
 
+  SliceUnion&
+  unionByName(std::vector<SliceUnion*> const& unions, Slice::Name name) {
+      const auto sliceUnionIt
+        = std::find_if(unions.begin(), unions.end(),
+                      [&name](SliceUnion const* s) {
+                        return name == s->name;
+                      });
+      if (sliceUnionIt == unions.end())
+        throw std::domain_error("SliceUnion not found!");
+      return **sliceUnionIt;
+  }
 
+}
+#+end_src
+
+** Tuples
+#+begin_src c++ :tangle (atrip-tuples-h)
+#pragma once
+
+#include <vector>
+#include <array>
+
+#include <atrip/Utils.hpp>
+
+namespace atrip {
+
+  using ABCTuple = std::array<size_t, 3>;
+  using PartialTuple = std::array<size_t, 2>;
+  using ABCTuples = std::vector<ABCTuple>;
+
+  ABCTuples getTuplesList(size_t Nv) {
+    const size_t n = Nv * (Nv + 1) * (Nv + 2) / 6 - Nv;
+    ABCTuples result(n);
+    size_t u(0);
+
+    for (size_t a(0); a < Nv; a++)
+    for (size_t b(a); b < Nv; b++)
+    for (size_t c(b); c < Nv; c++){
+      if ( a == b && b == c ) continue;
+      result[u++] = {a, b, c};
+    }
+
+    return result;
+
+  }
+
+
+  std::pair<size_t, size_t>
+  getABCRange(size_t np, size_t rank, ABCTuples const& tuplesList) {
+
+    std::vector<size_t> n_tuples_per_rank(np, tuplesList.size()/np);
+    const size_t
+        // how many valid tuples should we still verteilen to nodes
+        // since the number of tuples is not divisible by the number of nodes
+        nRoundRobin = tuplesList.size() % np
+        // every node must have the sanme amount of tuples in order for the
+        // other nodes to receive and send somewhere, therefore
+        // some nodes will get extra tuples but that are dummy tuples
+      , nExtraInvalid = (np - nRoundRobin) % np
+      ;
+
+    if (nRoundRobin) for (int i = 0; i < np; i++) n_tuples_per_rank[i]++;
+
+  #if defined(TODO)
+    assert( tuplesList.size()
+            ==
+            ( std::accumulate(n_tuples_per_rank.begin(),
+                              n_tuples_per_rank.end(),
+                              0)
+            + nExtraInvalid
+            ));
+  #endif
+
+    WITH_RANK << "nRoundRobin = " << nRoundRobin << "\n";
+    WITH_RANK << "nExtraInvalid = " << nExtraInvalid << "\n";
+    WITH_RANK << "ntuples = " << n_tuples_per_rank[rank] << "\n";
+
+    auto const& it = n_tuples_per_rank.begin();
+
+    return
+      { std::accumulate(it, it + rank    , 0)
+      , std::accumulate(it, it + rank + 1, 0)
+      };
+
+  }
+
+}
+#+end_src
+
+** Unions
+Since every tensor slice in a different way, we can override the slicing procedure
+and define subclasses of slice unions.
+
+#+begin_src c++ :tangle (atrip-unions-h)
+#pragma once
+#include <atrip/SliceUnion.hpp>
+
+namespace atrip {
 
   void sliceIntoVector
     ( std::vector<double> &v
@@ -1471,444 +1451,1087 @@ struct RankMap {
 
   };
 
-  // PHYSICS ENERGY, SINGLES ... =========================================={{{1
-
-double cc4s::PerturbativeTriplesAbcijk::getEnergyDistinct
-  ( const double epsabc
-  , std::vector<double> const& epsi
-  , std::vector<double> const& Tijk_
-  , std::vector<double> const& Zijk_
-  ) {
-  constexpr size_t blockSize=16;
-  double energy(0.);
-  const size_t No = epsi.size();
-  for (size_t kk=0; kk<No; kk+=blockSize){
-    const size_t kend( std::min(No, kk+blockSize) );
-    for (size_t jj(kk); jj<No; jj+=blockSize){
-      const size_t jend( std::min( No, jj+blockSize) );
-      for (size_t ii(jj); ii<No; ii+=blockSize){
-        const size_t iend( std::min( No, ii+blockSize) );
-        for (size_t k(kk); k < kend; k++){
-          const double ek(epsi[k]);
-          const size_t jstart = jj > k ? kk : k;
-          for (size_t j(jstart); j < jend; j++){
-            const double ej(epsi[j]);
-            double facjk( j == k ? 0.5 : 1.0);
-            size_t istart = ii > j ? ii : j;
-            for (size_t i(istart); i < iend; i++){
-              const double ei(epsi[i]);
-              double facij ( i==j ? 0.5 : 1.0);
-              double denominator(epsabc - ei - ej - ek);
-              double U(Zijk_[i + No*j + No*No*k]);
-              double V(Zijk_[i + No*k + No*No*j]);
-              double W(Zijk_[j + No*i + No*No*k]);
-              double X(Zijk_[j + No*k + No*No*i]);
-              double Y(Zijk_[k + No*i + No*No*j]);
-              double Z(Zijk_[k + No*j + No*No*i]);
-
-              double A(Tijk_[i + No*j + No*No*k]);
-              double B(Tijk_[i + No*k + No*No*j]);
-              double C(Tijk_[j + No*i + No*No*k]);
-              double D(Tijk_[j + No*k + No*No*i]);
-              double E(Tijk_[k + No*i + No*No*j]);
-              double F(Tijk_[k + No*j + No*No*i]);
-              double value(3.0*(A*U+B*V+C*W+D*X+E*Y+F*Z)
-                           +((U+X+Y)-2.0*(V+W+Z))*(A+D+E)
-                           +((V+W+Z)-2.0*(U+X+Y))*(B+C+F));
-              energy += 2.0*value / denominator * facjk * facij;
-            } // i
-          } // j
-        } // k
-      } // ii
-    } // jj
-  } // kk
-  return energy;
 }
+#+end_src
 
 
-double cc4s::PerturbativeTriplesAbcijk::getEnergySame
-  ( const double epsabc
-  , std::vector<double> const& epsi
-  , std::vector<double> const& Tijk_
-  , std::vector<double> const& Zijk_
-  ) {
-  constexpr size_t blockSize = 16;
-  const size_t No = epsi.size();
-  double energy(0.);
-  for (size_t kk=0; kk<No; kk+=blockSize){
-    const size_t kend( std::min( kk+blockSize, No) );
-    for (size_t jj(kk); jj<No; jj+=blockSize){
-      const size_t jend( std::min( jj+blockSize, No) );
-      for (size_t ii(jj); ii<No; ii+=blockSize){
-        const size_t iend( std::min( ii+blockSize, No) );
-        for (size_t k(kk); k < kend; k++){
-          const double ek(epsi[k]);
-          const size_t jstart = jj > k ? jj : k;
-          for(size_t j(jstart); j < jend; j++){
-            const double facjk( j == k ? 0.5 : 1.0);
-            const double ej(epsi[j]);
-            const size_t istart = ii > j ? ii : j;
-            for(size_t i(istart); i < iend; i++){
-              double ei(epsi[i]);
-              double facij ( i==j ? 0.5 : 1.0);
-              double denominator(epsabc - ei - ej - ek);
-              double U(Zijk_[i + No*j + No*No*k]);
-              double V(Zijk_[j + No*k + No*No*i]);
-              double W(Zijk_[k + No*i + No*No*j]);
-              double A(Tijk_[i + No*j + No*No*k]);
-              double B(Tijk_[j + No*k + No*No*i]);
-              double C(Tijk_[k + No*i + No*No*j]);
-              double value(3.0*( A*U + B*V + C*W) - (A+B+C)*(U+V+W));
-              energy += 2.0*value / denominator * facjk * facij;
-            } // i
-          } // j
-        } // k
-      } // ii
-    } // jj
-  } // kk
-  return energy;
-}
+** Equations
+#+begin_src c++ :tangle (atrip-equations-h)
+#pragma once
 
-void cc4s::PerturbativeTriplesAbcijk::singlesContribution
-  ( const ABCTuple &abc
-  , double const* Tph
-  , double const* VABij
-  , double const* VACij
-  , double const* VBCij
-  , double *Zijk
-  ) {
-  const size_t a(abc[0]), b(abc[1]), c(abc[2]);
-  for (size_t k=0; k < No; k++)
-  for (size_t i=0; i < No; i++)
-  for (size_t j=0; j < No; j++) {
-    const size_t ijk = i + j*No + k*No*No
-               ,  jk = j + No * k
-               ;
-    Zijk[ijk] += Tph[ a + i * Nv ] * VBCij[ j + k * No ];
-    Zijk[ijk] += Tph[ b + j * Nv ] * VACij[ i + k * No ];
-    Zijk[ijk] += Tph[ c + k * Nv ] * VABij[ i + j * No ];
+#include<atrip/Slice.hpp>
+#include<atrip/Blas.hpp>
+
+namespace atrip {
+
+  double getEnergyDistinct
+    ( const double epsabc
+    , std::vector<double> const& epsi
+    , std::vector<double> const& Tijk_
+    , std::vector<double> const& Zijk_
+    ) {
+    constexpr size_t blockSize=16;
+    double energy(0.);
+    const size_t No = epsi.size();
+    for (size_t kk=0; kk<No; kk+=blockSize){
+      const size_t kend( std::min(No, kk+blockSize) );
+      for (size_t jj(kk); jj<No; jj+=blockSize){
+        const size_t jend( std::min( No, jj+blockSize) );
+        for (size_t ii(jj); ii<No; ii+=blockSize){
+          const size_t iend( std::min( No, ii+blockSize) );
+          for (size_t k(kk); k < kend; k++){
+            const double ek(epsi[k]);
+            const size_t jstart = jj > k ? kk : k;
+            for (size_t j(jstart); j < jend; j++){
+              const double ej(epsi[j]);
+              double facjk( j == k ? 0.5 : 1.0);
+              size_t istart = ii > j ? ii : j;
+              for (size_t i(istart); i < iend; i++){
+                const double ei(epsi[i]);
+                double facij ( i==j ? 0.5 : 1.0);
+                double denominator(epsabc - ei - ej - ek);
+                double U(Zijk_[i + No*j + No*No*k]);
+                double V(Zijk_[i + No*k + No*No*j]);
+                double W(Zijk_[j + No*i + No*No*k]);
+                double X(Zijk_[j + No*k + No*No*i]);
+                double Y(Zijk_[k + No*i + No*No*j]);
+                double Z(Zijk_[k + No*j + No*No*i]);
+
+                double A(Tijk_[i + No*j + No*No*k]);
+                double B(Tijk_[i + No*k + No*No*j]);
+                double C(Tijk_[j + No*i + No*No*k]);
+                double D(Tijk_[j + No*k + No*No*i]);
+                double E(Tijk_[k + No*i + No*No*j]);
+                double F(Tijk_[k + No*j + No*No*i]);
+                double value(3.0*(A*U+B*V+C*W+D*X+E*Y+F*Z)
+                            +((U+X+Y)-2.0*(V+W+Z))*(A+D+E)
+                            +((V+W+Z)-2.0*(U+X+Y))*(B+C+F));
+                energy += 2.0*value / denominator * facjk * facij;
+              } // i
+            } // j
+          } // k
+        } // ii
+      } // jj
+    } // kk
+    return energy;
   }
-}
 
-void doublesContribution
-  ( const ABCTuple &abc
-  , size_t const No
-  , size_t const Nv
-  // -- VABCI
-  , double const* VABph
-  , double const* VACph
-  , double const* VBCph
-  , double const* VBAph
-  , double const* VCAph
-  , double const* VCBph
-  // -- VHHHA
-  , double const* VhhhA
-  , double const* VhhhB
-  , double const* VhhhC
-  // -- TA
-  , double const* TAphh
-  , double const* TBphh
-  , double const* TCphh
-  // -- TABIJ
-  , double const* TABhh
-  , double const* TAChh
-  , double const* TBChh
-  // -- TIJK
-  , double *Tijk
-  , atrip::Timings& chrono
-  ) {
 
-  auto& t_reorder = chrono["doubles:reorder"];
-  const size_t a = abc[0], b = abc[1], c = abc[2]
-             , NoNo = No*No, NoNv = No*Nv
-             ;
-
-#if defined(TRIPLES_USE_DGEMM)
-#define _IJK_(i, j, k) i + j*No + k*NoNo
-#define REORDER(__II, __JJ, __KK)                                 \
-  t_reorder.start();                                              \
-  for (size_t k = 0; k < No; k++)                                 \
-  for (size_t j = 0; j < No; j++)                                 \
-  for (size_t i = 0; i < No; i++) {                               \
-    Tijk[_IJK_(i, j, k)] += _t_buffer[_IJK_(__II, __JJ, __KK)];   \
-  }                                                               \
-  t_reorder.stop();
-#define DGEMM_PARTICLES(__A, __B)    \
-  atrip::dgemm_( "T"                 \
-               , "N"                 \
-               , (int const*)&NoNo   \
-               , (int const*)&No     \
-               , (int const*)&Nv     \
-               , &one                \
-               , __A                 \
-               , (int const*)&Nv     \
-               , __B                 \
-               , (int const*)&Nv     \
-               , &zero               \
-               , _t_buffer.data()    \
-               , (int const*)&NoNo   \
-               );
-#define DGEMM_HOLES(__A, __B, __TRANSB)  \
-  atrip::dgemm_( "N"                     \
-               , __TRANSB                \
-               , (int const*)&NoNo       \
-               , (int const*)&No         \
-               , (int const*)&No         \
-               , &m_one                  \
-               , __A                     \
-               , (int const*)&NoNo       \
-               , __B                     \
-               , (int const*)&No         \
-               , &zero                   \
-               , _t_buffer.data()        \
-               , (int const*)&NoNo       \
-               );
-
-  using F = double;
-  const size_t NoNoNo = No*NoNo;
-  std::vector<double> _t_buffer;
-  _t_buffer.reserve(NoNoNo);
-  F one{1.0}, m_one{-1.0}, zero{0.0};
-
-  t_reorder.start();
-  for (size_t k = 0; k < NoNoNo; k++) {
-    // zero the Tijk
-    Tijk[k] = 0.0;
+  double getEnergySame
+    ( const double epsabc
+    , std::vector<double> const& epsi
+    , std::vector<double> const& Tijk_
+    , std::vector<double> const& Zijk_
+    ) {
+    constexpr size_t blockSize = 16;
+    const size_t No = epsi.size();
+    double energy(0.);
+    for (size_t kk=0; kk<No; kk+=blockSize){
+      const size_t kend( std::min( kk+blockSize, No) );
+      for (size_t jj(kk); jj<No; jj+=blockSize){
+        const size_t jend( std::min( jj+blockSize, No) );
+        for (size_t ii(jj); ii<No; ii+=blockSize){
+          const size_t iend( std::min( ii+blockSize, No) );
+          for (size_t k(kk); k < kend; k++){
+            const double ek(epsi[k]);
+            const size_t jstart = jj > k ? jj : k;
+            for(size_t j(jstart); j < jend; j++){
+              const double facjk( j == k ? 0.5 : 1.0);
+              const double ej(epsi[j]);
+              const size_t istart = ii > j ? ii : j;
+              for(size_t i(istart); i < iend; i++){
+                double ei(epsi[i]);
+                double facij ( i==j ? 0.5 : 1.0);
+                double denominator(epsabc - ei - ej - ek);
+                double U(Zijk_[i + No*j + No*No*k]);
+                double V(Zijk_[j + No*k + No*No*i]);
+                double W(Zijk_[k + No*i + No*No*j]);
+                double A(Tijk_[i + No*j + No*No*k]);
+                double B(Tijk_[j + No*k + No*No*i]);
+                double C(Tijk_[k + No*i + No*No*j]);
+                double value(3.0*( A*U + B*V + C*W) - (A+B+C)*(U+V+W));
+                energy += 2.0*value / denominator * facjk * facij;
+              } // i
+            } // j
+          } // k
+        } // ii
+      } // jj
+    } // kk
+    return energy;
   }
-  t_reorder.stop();
 
-  chrono["doubles:holes"].start();
-  { // Holes part ============================================================
-    // VhhhC[i + k*No + L*NoNo] * TABhh[L + j*No]; H1
-    chrono["doubles:holes:1"].start();
-    DGEMM_HOLES(VhhhC, TABhh, "N")
-    REORDER(i, k, j)
-    chrono["doubles:holes:1"].stop();
-    // VhhhC[j + k*No + L*NoNo] * TABhh[i + L*No]; H0
-    chrono["doubles:holes:2"].start();
-    DGEMM_HOLES(VhhhC, TABhh, "T")
-    REORDER(j, k, i)
-    chrono["doubles:holes:2"].stop();
-    // VhhhB[i + j*No + L*NoNo] * TAChh[L + k*No]; H5
-    chrono["doubles:holes:3"].start();
-    DGEMM_HOLES(VhhhB, TAChh, "N")
-    REORDER(i, j, k)
-    chrono["doubles:holes:3"].stop();
-    // VhhhB[k + j*No + L*NoNo] * TAChh[i + L*No]; H3
-    chrono["doubles:holes:4"].start();
-    DGEMM_HOLES(VhhhB, TAChh, "T")
-    REORDER(k, j, i)
-    chrono["doubles:holes:4"].stop();
-    // VhhhA[j + i*No + L*NoNo] * TBChh[L + k*No]; H1
-    chrono["doubles:holes:5"].start();
-    DGEMM_HOLES(VhhhA, TBChh, "N")
-    REORDER(j, i, k)
-    chrono["doubles:holes:5"].stop();
-    // VhhhA[k + i*No + L*NoNo] * TBChh[j + L*No]; H4
-    chrono["doubles:holes:6"].start();
-    DGEMM_HOLES(VhhhA, TBChh, "T")
-    REORDER(k, i, j)
-    chrono["doubles:holes:6"].stop();
-  }
-  chrono["doubles:holes"].stop();
-
-  chrono["doubles:particles"].start();
-  { // Particle part =========================================================
-    // TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv]; P0
-    chrono["doubles:particles:1"].start();
-    DGEMM_PARTICLES(TAphh, VBCph)
-    REORDER(i, j, k)
-    chrono["doubles:particles:1"].stop();
-    // TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv]; P3
-    chrono["doubles:particles:2"].start();
-    DGEMM_PARTICLES(TAphh, VCBph)
-    REORDER(i, k, j)
-    chrono["doubles:particles:2"].stop();
-    // TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv]; P5
-    chrono["doubles:particles:3"].start();
-    DGEMM_PARTICLES(TCphh, VABph)
-    REORDER(k, i, j)
-    chrono["doubles:particles:3"].stop();
-    // TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv]; P2
-    chrono["doubles:particles:4"].start();
-    DGEMM_PARTICLES(TCphh, VBAph)
-    REORDER(k, j, i)
-    chrono["doubles:particles:4"].stop();
-    // TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv]; P1
-    chrono["doubles:particles:5"].start();
-    DGEMM_PARTICLES(TBphh, VACph)
-    REORDER(j, i, k)
-    chrono["doubles:particles:5"].stop();
-    // TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv]; P4
-    chrono["doubles:particles:6"].start();
-    DGEMM_PARTICLES(TBphh, VCAph)
-    REORDER(j, k, i)
-    chrono["doubles:particles:6"].stop();
-  }
-  chrono["doubles:particles"].stop();
-
-#undef REORDER
-#undef DGEMM_HOLES
-#undef DGEMM_PARTICLES
-#undef _IJK_
-#else
-  for (size_t k = 0; k < No; k++)
-  for (size_t j = 0; j < No; j++)
-  for (size_t i = 0; i < No; i++){
-    const size_t ijk = i + j*No + k*NoNo
-               ,  jk = j + k*No
-               ;
-    Tijk[ijk] = 0.0; // :important
-    // HOLE DIAGRAMS: TABHH and VHHHA
-    for (size_t L = 0; L < No; L++){
-      // t[abLj] * V[Lcik]        H1
-      // t[baLi] * V[Lcjk]        H0      TODO: conjugate T for complex
-      Tijk[ijk] -= TABhh[L + j*No] * VhhhC[i + k*No + L*NoNo];
-      Tijk[ijk] -= TABhh[i + L*No] * VhhhC[j + k*No + L*NoNo];
-
-      // t[acLk] * V[Lbij]        H5
-      // t[caLi] * V[Lbkj]        H3
-      Tijk[ijk] -= TAChh[L + k*No] * VhhhB[i + j*No + L*NoNo];
-      Tijk[ijk] -= TAChh[i + L*No] * VhhhB[k + j*No + L*NoNo];
-
-      // t[bcLk] * V[Laji]        H2
-      // t[cbLj] * V[Laki]        H4
-      Tijk[ijk] -= TBChh[L + k*No] * VhhhA[j + i*No + L*NoNo];
-      Tijk[ijk] -= TBChh[j + L*No] * VhhhA[k + i*No + L*NoNo];
+  void singlesContribution
+    ( const ABCTuple &abc
+    , double const* Tph
+    , double const* VABij
+    , double const* VACij
+    , double const* VBCij
+    , double *Zijk
+    ) {
+    const size_t a(abc[0]), b(abc[1]), c(abc[2]);
+    for (size_t k=0; k < No; k++)
+    for (size_t i=0; i < No; i++)
+    for (size_t j=0; j < No; j++) {
+      const size_t ijk = i + j*No + k*No*No
+                ,  jk = j + No * k
+                ;
+      Zijk[ijk] += Tph[ a + i * Nv ] * VBCij[ j + k * No ];
+      Zijk[ijk] += Tph[ b + j * Nv ] * VACij[ i + k * No ];
+      Zijk[ijk] += Tph[ c + k * Nv ] * VABij[ i + j * No ];
     }
-    // PARTILCE DIAGRAMS: TAPHH and VABPH
-    for (size_t E = 0; E < Nv; E++) {
-      // t[aEij] * V[bcEk]        P0
-      // t[aEik] * V[cbEj]        P3 // TODO: CHECK THIS ONE, I DONT KNOW
-      Tijk[ijk] += TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv];
-      Tijk[ijk] += TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv];
+  }
 
-      // t[cEki] * V[abEj]        P5
-      // t[cEkj] * V[baEi]        P2
-      Tijk[ijk] += TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv];
-      Tijk[ijk] += TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv];
+  void doublesContribution
+    ( const ABCTuple &abc
+    , size_t const No
+    , size_t const Nv
+    // -- VABCI
+    , double const* VABph
+    , double const* VACph
+    , double const* VBCph
+    , double const* VBAph
+    , double const* VCAph
+    , double const* VCBph
+    // -- VHHHA
+    , double const* VhhhA
+    , double const* VhhhB
+    , double const* VhhhC
+    // -- TA
+    , double const* TAphh
+    , double const* TBphh
+    , double const* TCphh
+    // -- TABIJ
+    , double const* TABhh
+    , double const* TAChh
+    , double const* TBChh
+    // -- TIJK
+    , double *Tijk
+    , atrip::Timings& chrono
+    ) {
 
-      // t[bEji] * V[acEk]        P1
-      // t[bEjk] * V[caEi]        P4
-      Tijk[ijk] += TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv];
-      Tijk[ijk] += TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv];
+    auto& t_reorder = chrono["doubles:reorder"];
+    const size_t a = abc[0], b = abc[1], c = abc[2]
+              , NoNo = No*No, NoNv = No*Nv
+              ;
+
+  #if defined(TRIPLES_USE_DGEMM)
+  #define _IJK_(i, j, k) i + j*No + k*NoNo
+  #define REORDER(__II, __JJ, __KK)                                 \
+    t_reorder.start();                                              \
+    for (size_t k = 0; k < No; k++)                                 \
+    for (size_t j = 0; j < No; j++)                                 \
+    for (size_t i = 0; i < No; i++) {                               \
+      Tijk[_IJK_(i, j, k)] += _t_buffer[_IJK_(__II, __JJ, __KK)];   \
+    }                                                               \
+    t_reorder.stop();
+  #define DGEMM_PARTICLES(__A, __B)    \
+    atrip::dgemm_( "T"                 \
+                , "N"                 \
+                , (int const*)&NoNo   \
+                , (int const*)&No     \
+                , (int const*)&Nv     \
+                , &one                \
+                , __A                 \
+                , (int const*)&Nv     \
+                , __B                 \
+                , (int const*)&Nv     \
+                , &zero               \
+                , _t_buffer.data()    \
+                , (int const*)&NoNo   \
+                );
+  #define DGEMM_HOLES(__A, __B, __TRANSB)  \
+    atrip::dgemm_( "N"                     \
+                , __TRANSB                \
+                , (int const*)&NoNo       \
+                , (int const*)&No         \
+                , (int const*)&No         \
+                , &m_one                  \
+                , __A                     \
+                , (int const*)&NoNo       \
+                , __B                     \
+                , (int const*)&No         \
+                , &zero                   \
+                , _t_buffer.data()        \
+                , (int const*)&NoNo       \
+                );
+
+    using F = double;
+    const size_t NoNoNo = No*NoNo;
+    std::vector<double> _t_buffer;
+    _t_buffer.reserve(NoNoNo);
+    F one{1.0}, m_one{-1.0}, zero{0.0};
+
+    t_reorder.start();
+    for (size_t k = 0; k < NoNoNo; k++) {
+      // zero the Tijk
+      Tijk[k] = 0.0;
+    }
+    t_reorder.stop();
+
+    chrono["doubles:holes"].start();
+    { // Holes part ============================================================
+      // VhhhC[i + k*No + L*NoNo] * TABhh[L + j*No]; H1
+      chrono["doubles:holes:1"].start();
+      DGEMM_HOLES(VhhhC, TABhh, "N")
+      REORDER(i, k, j)
+      chrono["doubles:holes:1"].stop();
+      // VhhhC[j + k*No + L*NoNo] * TABhh[i + L*No]; H0
+      chrono["doubles:holes:2"].start();
+      DGEMM_HOLES(VhhhC, TABhh, "T")
+      REORDER(j, k, i)
+      chrono["doubles:holes:2"].stop();
+      // VhhhB[i + j*No + L*NoNo] * TAChh[L + k*No]; H5
+      chrono["doubles:holes:3"].start();
+      DGEMM_HOLES(VhhhB, TAChh, "N")
+      REORDER(i, j, k)
+      chrono["doubles:holes:3"].stop();
+      // VhhhB[k + j*No + L*NoNo] * TAChh[i + L*No]; H3
+      chrono["doubles:holes:4"].start();
+      DGEMM_HOLES(VhhhB, TAChh, "T")
+      REORDER(k, j, i)
+      chrono["doubles:holes:4"].stop();
+      // VhhhA[j + i*No + L*NoNo] * TBChh[L + k*No]; H1
+      chrono["doubles:holes:5"].start();
+      DGEMM_HOLES(VhhhA, TBChh, "N")
+      REORDER(j, i, k)
+      chrono["doubles:holes:5"].stop();
+      // VhhhA[k + i*No + L*NoNo] * TBChh[j + L*No]; H4
+      chrono["doubles:holes:6"].start();
+      DGEMM_HOLES(VhhhA, TBChh, "T")
+      REORDER(k, i, j)
+      chrono["doubles:holes:6"].stop();
+    }
+    chrono["doubles:holes"].stop();
+
+    chrono["doubles:particles"].start();
+    { // Particle part =========================================================
+      // TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv]; P0
+      chrono["doubles:particles:1"].start();
+      DGEMM_PARTICLES(TAphh, VBCph)
+      REORDER(i, j, k)
+      chrono["doubles:particles:1"].stop();
+      // TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv]; P3
+      chrono["doubles:particles:2"].start();
+      DGEMM_PARTICLES(TAphh, VCBph)
+      REORDER(i, k, j)
+      chrono["doubles:particles:2"].stop();
+      // TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv]; P5
+      chrono["doubles:particles:3"].start();
+      DGEMM_PARTICLES(TCphh, VABph)
+      REORDER(k, i, j)
+      chrono["doubles:particles:3"].stop();
+      // TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv]; P2
+      chrono["doubles:particles:4"].start();
+      DGEMM_PARTICLES(TCphh, VBAph)
+      REORDER(k, j, i)
+      chrono["doubles:particles:4"].stop();
+      // TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv]; P1
+      chrono["doubles:particles:5"].start();
+      DGEMM_PARTICLES(TBphh, VACph)
+      REORDER(j, i, k)
+      chrono["doubles:particles:5"].stop();
+      // TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv]; P4
+      chrono["doubles:particles:6"].start();
+      DGEMM_PARTICLES(TBphh, VCAph)
+      REORDER(j, k, i)
+      chrono["doubles:particles:6"].stop();
+    }
+    chrono["doubles:particles"].stop();
+
+  #undef REORDER
+  #undef DGEMM_HOLES
+  #undef DGEMM_PARTICLES
+  #undef _IJK_
+  #else
+    for (size_t k = 0; k < No; k++)
+    for (size_t j = 0; j < No; j++)
+    for (size_t i = 0; i < No; i++){
+      const size_t ijk = i + j*No + k*NoNo
+                ,  jk = j + k*No
+                ;
+      Tijk[ijk] = 0.0; // :important
+      // HOLE DIAGRAMS: TABHH and VHHHA
+      for (size_t L = 0; L < No; L++){
+        // t[abLj] * V[Lcik]        H1
+        // t[baLi] * V[Lcjk]        H0      TODO: conjugate T for complex
+        Tijk[ijk] -= TABhh[L + j*No] * VhhhC[i + k*No + L*NoNo];
+        Tijk[ijk] -= TABhh[i + L*No] * VhhhC[j + k*No + L*NoNo];
+
+        // t[acLk] * V[Lbij]        H5
+        // t[caLi] * V[Lbkj]        H3
+        Tijk[ijk] -= TAChh[L + k*No] * VhhhB[i + j*No + L*NoNo];
+        Tijk[ijk] -= TAChh[i + L*No] * VhhhB[k + j*No + L*NoNo];
+
+        // t[bcLk] * V[Laji]        H2
+        // t[cbLj] * V[Laki]        H4
+        Tijk[ijk] -= TBChh[L + k*No] * VhhhA[j + i*No + L*NoNo];
+        Tijk[ijk] -= TBChh[j + L*No] * VhhhA[k + i*No + L*NoNo];
+      }
+      // PARTILCE DIAGRAMS: TAPHH and VABPH
+      for (size_t E = 0; E < Nv; E++) {
+        // t[aEij] * V[bcEk]        P0
+        // t[aEik] * V[cbEj]        P3 // TODO: CHECK THIS ONE, I DONT KNOW
+        Tijk[ijk] += TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv];
+        Tijk[ijk] += TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv];
+
+        // t[cEki] * V[abEj]        P5
+        // t[cEkj] * V[baEi]        P2
+        Tijk[ijk] += TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv];
+        Tijk[ijk] += TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv];
+
+        // t[bEji] * V[acEk]        P1
+        // t[bEjk] * V[caEi]        P4
+        Tijk[ijk] += TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv];
+        Tijk[ijk] += TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv];
+      }
+
+    }
+  #endif
+  }
+
+}
+#+end_src
+
+** Blas
+The main matrix-matrix multiplication method used in this algorithm
+is mainly using the =DGEMM= function, which we declare as
+=extern= since it should be known only at link-time.
+#+begin_src c++ :tangle (atrip-blas-h)
+#pragma once
+namespace atrip {
+  extern "C" {
+    void dgemm_(
+      const char *transa,
+      const char *transb,
+      const int *m,
+      const int *n,
+      const int *k,
+      double *alpha,
+      const double *A,
+      const int *lda,
+      const double *B,
+      const int *ldb,
+      double *beta,
+      double *C,
+      const int *ldc
+    );
+  }
+}
+#+end_src
+
+** Atrip
+#+begin_src c++ :tangle (atrip-atrip-h)
+#pragma once
+#include <sstream>
+#include <string>
+#include <map>
+#include <chrono>
+
+#include <ctf.hpp>
+
+namespace atrip {
+
+  struct Atrip {
+
+    static int rank;
+    static int np;
+
+    static void init() {
+      MPI_Comm_rank(MPI_COMM_WORLD, &Atrip::rank);
+      MPI_Comm_size(MPI_COMM_WORLD, &Atrip::np);
     }
 
-  }
-#endif
-}
-
-
-// HELPER FUNCTIONS ====================================================={{{1
-
-using ABCTuples = std::vector<ABCTuple>;
-
-ABCTuples getTuplesList(size_t Nv) {
-  const size_t n = Nv * (Nv + 1) * (Nv + 2) / 6 - Nv;
-  ABCTuples result(n);
-  size_t u(0);
-
-  for (size_t a(0); a < Nv; a++)
-  for (size_t b(a); b < Nv; b++)
-  for (size_t c(b); c < Nv; c++){
-    if ( a == b && b == c ) continue;
-    result[u++] = {a, b, c};
-  }
-
-  return result;
-
-}
-
-
-std::pair<size_t, size_t>
-getABCRange(size_t np, size_t rank, ABCTuples const& tuplesList) {
-
-  std::vector<size_t> n_tuples_per_rank(np, tuplesList.size()/np);
-  const size_t
-      // how many valid tuples should we still verteilen to nodes
-      // since the number of tuples is not divisible by the number of nodes
-      nRoundRobin = tuplesList.size() % np
-      // every node must have the sanme amount of tuples in order for the
-      // other nodes to receive and send somewhere, therefore
-      // some nodes will get extra tuples but that are dummy tuples
-    , nExtraInvalid = (np - nRoundRobin) % np
-    ;
-
-  if (nRoundRobin) for (int i = 0; i < np; i++) n_tuples_per_rank[i]++;
-
-#if defined(TODO)
-  assert( tuplesList.size()
-          ==
-          ( std::accumulate(n_tuples_per_rank.begin(),
-                            n_tuples_per_rank.end(),
-                            0)
-          + nExtraInvalid
-          ));
-#endif
-
-  WITH_RANK << "nRoundRobin = " << nRoundRobin << "\n";
-  WITH_RANK << "nExtraInvalid = " << nExtraInvalid << "\n";
-  WITH_RANK << "ntuples = " << n_tuples_per_rank[rank] << "\n";
-
-  auto const& it = n_tuples_per_rank.begin();
-
-  return
-    { std::accumulate(it, it + rank    , 0)
-    , std::accumulate(it, it + rank + 1, 0)
+    struct Input {
+      CTF::Tensor<double> *ei = nullptr
+                        , *ea = nullptr
+                        , *Tph = nullptr
+                        , *Tpphh = nullptr
+                        , *Vpphh = nullptr
+                        , *Vhhhp = nullptr
+                        , *Vppph = nullptr
+                        ;
+      Input& with_epsilon_i(CTF::Tensor<double> * t) { ei = t; return *this; }
+      Input& with_epsilon_a(CTF::Tensor<double> * t) { ea = t; return *this; }
+      Input& with_Tai(CTF::Tensor<double> * t) { Tph = t; return *this; }
+      Input& with_Tabij(CTF::Tensor<double> * t) { Tpphh = t; return *this; }
+      Input& with_Vabij(CTF::Tensor<double> * t) { Vpphh = t; return *this; }
+      Input& with_Vijka(CTF::Tensor<double> * t) { Vhhhp = t; return *this; }
+      Input& with_Vabci(CTF::Tensor<double> * t) { Vppph = t; return *this; }
     };
 
+    struct Output {
+      double energy;
+    };
+    static Output run(Input const& in);
+  };
+  int Atrip::rank;
+  int Atrip::np;
+
 }
+#+end_src
+
+#+begin_src c++ :tangle (atrip-atrip-cxx)
+#include <atrip/Atrip.hpp>
+#include <atrip/Utils.hpp>
+
+using namespace atrip;
 
 
-SliceUnion&
-unionByName(std::vector<SliceUnion*> const& unions, Slice::Name name) {
-    const auto sliceUnionIt
-      = std::find_if(unions.begin(), unions.end(),
-                     [&name](SliceUnion const* s) {
-                       return name == s->name;
-                     });
-    if (sliceUnionIt == unions.end())
-      throw std::domain_error("SliceUnion not found!");
-    return **sliceUnionIt;
-}
+Output Atrip::run(Input const& in){
 
-#if defined(TRIPLES_WORKLOAD_DUMP)
-      struct WorkloadEntry {
-        size_t rank, iteration;
-        ABCTuple abc;
-        size_t name;
-        size_t send;
-        size_t recv;
-        size_t doubles;
-        size_t barrier;
-        size_t allgather;
-        size_t unwrap;
+  const int np = Atrip::np;
+  const int rank = Atrip::rank;
+  MPI_Comm universe = in.ei->wrld->comm;
 
-        static
-        size_t findRecv(SliceUnion const& u) {
-          return std::count_if(u.slices.begin(), u.slices.end(),
-                               [&u](Slice const& slice) {
-                                 return slice.info.state == Slice::Dispatched;
-                               });
-        }
-        static
-        size_t findSend( Slice::Database const& db
-                       , SliceUnion const& u
-                       , size_t rank) {
-          using Element = Slice::LocalDatabaseElement;
-          return std::count_if(db.begin(), db.end(),
-                               [&u, rank](Element const& el) {
-                                 return el.name == u.name
-                                     && el.info.from.rank == rank
-                                     // this should work since the database
-                                     // has the non-updated Slice::Info
-                                     && el.info.state == Slice::Fetch
-                                     ;
-                               });
-        }
-      };
+  // Timings in seconds ================================================{{{1
+  Timings chrono{};
+
+  // Get the distributed ctf tensor data
+  CTF::Tensor<> *ei(getTensorArgument("HoleEigenEnergies"))
+              , *ea(getTensorArgument("ParticleEigenEnergies"))
+              , *Tph(getTensorArgument("CcsdSinglesAmplitudes"))
+              , *Tpphh(getTensorArgument("CcsdDoublesAmplitudes"))
+              , *Vpphh(getTensorArgument("PPHHCoulombIntegrals"))
+              , *Vhhhp(getTensorArgument("HHHPCoulombIntegrals"))
+              , *Vppph(getTensorArgument("PPPHCoulombIntegrals"))
+              ;
+
+  No = ei->lens[0];
+  Nv = ea->lens[0];
+  LOG(0,"NEW_TRIPLES") << "No: " << No << "\n";
+  LOG(0,"NEW_TRIPLES") << "Nv: " << Nv << "\n";
+
+  // allocate the three scratches, see piecuch
+  std::vector<double> Tijk(No*No*No) // doubles only (see piecuch)
+                    , Zijk(No*No*No) // singles + doubles (see piecuch)
+                    // we need local copies of the following tensors on every
+                    // rank
+                    , epsi(No)
+                    , epsa(Nv)
+                    , Tai(No * Nv)
+                    ;
+
+  ei->read_all(epsi.data());
+  ea->read_all(epsa.data());
+  Tph->read_all(Tai.data());
+
+  // COMMUNICATOR CONSTRUCTION ========================================={{{1
+  //
+  // Construct a new communicator living only on a single rank
+  int child_size = 1
+    , child_rank
+    ;
+  const
+  int color = rank / child_size
+    , crank = rank % child_size
+    ;
+  MPI_Comm child_comm;
+  if (np == 1) {
+    child_comm = universe;
+  } else {
+    MPI_Comm_split(cc4s::Cc4s::world->comm, color, crank, &child_comm);
+    MPI_Comm_rank(child_comm, &child_rank);
+    MPI_Comm_size(child_comm, &child_size);
+    //CTF::World child_world(child_comm);
+  }
+
+
+  chrono["nv-slices"].start();
+  // BUILD SLICES PARAMETRIZED BY NV ==================================={{{1
+  LOG(0,"NEW_TRIPLES") << "BUILD NV-SLICES\n";
+  TAPHH taphh(*Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
+  HHHA  hhha(*Vhhhp, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
+  chrono["nv-slices"].stop();
+
+  chrono["nv-nv-slices"].start();
+  // BUILD SLICES PARAMETRIZED BY NV x NV =============================={{{1
+  LOG(0,"NEW_TRIPLES") << "BUILD NV x NV-SLICES\n";
+  ABPH abph(*Vppph, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
+  ABHH abhh(*Vpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
+  TABHH tabhh(*Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
+  chrono["nv-nv-slices"].stop();
+
+  // all tensors
+  std::vector< SliceUnion* > unions = {&taphh, &hhha, &abph, &abhh, &tabhh};
+
+  //CONSTRUCT TUPLE LIST ==============================================={{{1
+  LOG(0,"NEW_TRIPLES") << "BUILD TUPLE LIST\n";
+  const auto tuplesList = std::move(getTuplesList(Nv));
+  WITH_RANK << "tupList.size() = " << tuplesList.size() << "\n";
+
+  // GET ABC INDEX RANGE FOR RANK ======================================{{{1
+  auto abcIndex = getABCRange(np, rank, tuplesList);
+  size_t nIterations = abcIndex.second - abcIndex.first;
+
+#ifdef TRIPLES_BENCHMARK
+  { const size_t maxIterations = getIntegerArgument("maxIterations", 0);
+    if (maxIterations != 0) {
+      abcIndex.second = abcIndex.first + maxIterations % (nIterations + 1);
+      nIterations = maxIterations % (nIterations + 1);
+    }
+  }
 #endif
 
+  WITH_RANK << "abcIndex = " << pretty_print(abcIndex) << "\n";
+  LOG(0,"NEW_TRIPLES") << "#iterations: "
+                       << nIterations << "\n";
+
+  // first abc
+  const ABCTuple firstAbc = tuplesList[abcIndex.first];
+
+
+  double energy(0.);
+
+
+  auto const isFakeTuple
+    = [&tuplesList](size_t const i) { return i >= tuplesList.size(); };
+
+
+  auto communicateDatabase
+    = [ &unions
+      , np
+      , &chrono
+      ] (ABCTuple const& abc, MPI_Comm const& c) -> Slice::Database {
+
+        chrono["db:comm:type:do"].start();
+        auto MPI_LDB_ELEMENT = Slice::mpi::localDatabaseElement();
+        chrono["db:comm:type:do"].stop();
+
+        chrono["db:comm:ldb"].start();
+        Slice::LocalDatabase ldb;
+
+        for (auto const& tensor: unions) {
+          auto const& tensorDb = tensor->buildLocalDatabase(abc);
+          ldb.insert(ldb.end(), tensorDb.begin(), tensorDb.end());
+        }
+        chrono["db:comm:ldb"].stop();
+
+        Slice::Database db(np * ldb.size(), ldb[0]);
+
+        chrono["oneshot-db:comm:allgather"].start();
+        chrono["db:comm:allgather"].start();
+        MPI_Allgather( ldb.data()
+                     , ldb.size()
+                     , MPI_LDB_ELEMENT
+                     , db.data()
+                     , ldb.size()
+                     , MPI_LDB_ELEMENT
+                     , c);
+        chrono["db:comm:allgather"].stop();
+        chrono["oneshot-db:comm:allgather"].stop();
+
+        chrono["db:comm:type:free"].start();
+        MPI_Type_free(&MPI_LDB_ELEMENT);
+        chrono["db:comm:type:free"].stop();
+
+        return db;
+      };
+
+  auto doIOPhase
+    = [&unions, &rank, &np, &universe, &chrono] (Slice::Database const& db) {
+
+    const size_t localDBLength = db.size() / np;
+
+    size_t sendTag = 0
+         , recvTag = rank * localDBLength
+         ;
+
+    // RECIEVE PHASE ======================================================
+    {
+      // At this point, we have already send to everyone that fits
+      auto const& begin = &db[rank * localDBLength]
+                , end   = begin + localDBLength
+                ;
+      for (auto it = begin; it != end; ++it) {
+        recvTag++;
+        auto const& el = *it;
+        auto& u = unionByName(unions, el.name);
+
+        WITH_DBG std::cout
+          << rank << ":r"
+          << "♯" << recvTag << " =>"
+          << " «n" << el.name
+          << ", t" << el.info.type
+          << ", s" << el.info.state
+          << "»"
+          << " ⊙ {" << rank << "⇐" << el.info.from.rank
+                    << ", "
+                    << el.info.from.source << "}"
+          << " ∴ {" << el.info.tuple[0]
+                    << ", "
+                    << el.info.tuple[1]
+                    << "}"
+          << "\n"
+          ;
+
+        chrono["db:io:recv"].start();
+        u.receive(el.info, recvTag);
+        chrono["db:io:recv"].stop();
+
+      } // recv
+    }
+
+    // SEND PHASE =========================================================
+    for (size_t otherRank = 0; otherRank<np; otherRank++) {
+      auto const& begin = &db[otherRank * localDBLength]
+                , end = begin + localDBLength
+                ;
+      for (auto it = begin; it != end; ++it) {
+        sendTag++;
+        Slice::LocalDatabaseElement const& el = *it;
+
+        if (el.info.from.rank != rank) continue;
+
+        auto& u = unionByName(unions, el.name);
+        WITH_DBG std::cout
+          << rank << ":s"
+          << "♯" << sendTag << " =>"
+          << " «n" << el.name
+          << ", t" << el.info.type
+          << ", s" << el.info.state
+          << "»"
+          << " ⊙ {" << el.info.from.rank << "⇒" << otherRank
+                    << ", "
+                    << el.info.from.source << "}"
+          << " ∴ {" << el.info.tuple[0]
+                    << ", "
+                    << el.info.tuple[1]
+                    << "}"
+          << "\n"
+          ;
+
+        chrono["db:io:send"].start();
+        u.send(otherRank, el.info, sendTag);
+        chrono["db:io:send"].stop();
+
+      } // send phase
+
+    } // otherRank
+
+
+  };
+
+#if defined(HAVE_OCD) || defined(TRIPLES_PRINT_TUPLES)
+  std::map<ABCTuple, double> tupleEnergies;
+#endif
+
+  const double doublesFlops
+    = double(No)
+    ,* double(No)
+    ,* double(No)
+    ,* (double(No) + double(Nv))
+    ,* 2
+    ,* 6
+    / 1e9
+    ;
+
+  // START MAIN LOOP ======================================================{{{1
+
+  Slice::Database db;
+
+  for ( size_t i = abcIndex.first, iteration = 1
+      ; i < abcIndex.second
+      ; i++, iteration++
+      ) {
+    chrono["iterations"].start();
+
+    // check overhead from chrono over all iterations
+    chrono["start:stop"].start(); chrono["start:stop"].stop();
+
+    // check overhead of doing a barrier at the beginning
+    chrono["oneshot-mpi:barrier"].start();
+    chrono["mpi:barrier"].start();
+    // TODO: REMOVE
+    if (getIntegerArgument("barrier", 1) == 1)
+    MPI_Barrier(universe);
+    chrono["mpi:barrier"].stop();
+    chrono["oneshot-mpi:barrier"].stop();
+
+    if (iteration % getIntegerArgument("iterationMod", 100) == 0) {
+      LOG(0,"NEW_TRIPLES")
+        << "iteration " << iteration
+        << " [" << 100 * iteration / nIterations << "%]"
+        << " (" << doublesFlops * iteration / chrono["doubles"].count()
+        << "GF)"
+        << " (" << doublesFlops * iteration / chrono["iterations"].count()
+        << "GF)"
+        << " ===========================\n";
+
+      // PRINT TIMINGS
+      for (auto const& pair: chrono)
+        LOG(1, " ") << pair.first << " :: "
+                    << pair.second.count()
+                    << std::endl;
+
+    }
+
+    const ABCTuple abc = isFakeTuple(i)
+                       ? tuplesList[tuplesList.size() - 1]
+                       : tuplesList[i]
+                 , *abcNext = i == (abcIndex.second - 1)
+                            ? nullptr
+                            : isFakeTuple(i + 1)
+                            ? &tuplesList[tuplesList.size() - 1]
+                            : &tuplesList[i + 1]
+                 ;
+
+    chrono["with_rank"].start();
+    WITH_RANK << " :it " << iteration
+              << " :abc " << pretty_print(abc)
+              << " :abcN "
+              << (abcNext ? pretty_print(*abcNext) : "None")
+              << "\n";
+    chrono["with_rank"].stop();
+
+
+    // COMM FIRST DATABASE ================================================{{{1
+    if (i == abcIndex.first) {
+      WITH_RANK << "__first__:first database ............ \n";
+      const auto __db = communicateDatabase(abc, universe);
+      WITH_RANK << "__first__:first database communicated \n";
+      WITH_RANK << "__first__:first database io phase \n";
+      doIOPhase(__db);
+      WITH_RANK << "__first__:first database io phase DONE\n";
+      WITH_RANK << "__first__::::Unwrapping all slices for first database\n";
+      for (auto& u: unions) u->unwrapAll(abc);
+      WITH_RANK << "__first__::::Unwrapping all slices for first database DONE\n";
+      MPI_Barrier(universe);
+    }
+
+    // COMM NEXT DATABASE ================================================={{{1
+    if (abcNext) {
+      WITH_RANK << "__comm__:" << iteration << "th communicating database\n";
+      chrono["db:comm"].start();
+      //const auto db = communicateDatabase(*abcNext, universe);
+      db = communicateDatabase(*abcNext, universe);
+      chrono["db:comm"].stop();
+      chrono["db:io"].start();
+      doIOPhase(db);
+      chrono["db:io"].stop();
+      WITH_RANK << "__comm__:" <<  iteration << "th database io phase DONE\n";
+    }
+
+    // COMPUTE DOUBLES ===================================================={{{1
+    OCD_Barrier(universe);
+    if (!isFakeTuple(i)) {
+      WITH_RANK << iteration << "-th doubles\n";
+      WITH_CHRONO(chrono["oneshot-unwrap"],
+      WITH_CHRONO(chrono["unwrap"],
+      WITH_CHRONO(chrono["unwrap:doubles"],
+        for (auto& u: decltype(unions){&abph, &hhha, &taphh, &tabhh}) {
+          u->unwrapAll(abc);
+        }
+      )))
+      chrono["oneshot-doubles"].start();
+      chrono["doubles"].start();
+      // TODO: REMOVE
+      for (size_t __i=0; __i < getIntegerArgument("doublesLoops", 1); __i++)
+      doublesContribution( abc, (size_t)No, (size_t)Nv
+                         // -- VABCI
+                         , abph.unwrapSlice(Slice::AB, abc)
+                         , abph.unwrapSlice(Slice::AC, abc)
+                         , abph.unwrapSlice(Slice::BC, abc)
+                         , abph.unwrapSlice(Slice::BA, abc)
+                         , abph.unwrapSlice(Slice::CA, abc)
+                         , abph.unwrapSlice(Slice::CB, abc)
+                         // -- VHHHA
+                         , hhha.unwrapSlice(Slice::A, abc)
+                         , hhha.unwrapSlice(Slice::B, abc)
+                         , hhha.unwrapSlice(Slice::C, abc)
+                         // -- TA
+                         , taphh.unwrapSlice(Slice::A, abc)
+                         , taphh.unwrapSlice(Slice::B, abc)
+                         , taphh.unwrapSlice(Slice::C, abc)
+                         // -- TABIJ
+                         , tabhh.unwrapSlice(Slice::AB, abc)
+                         , tabhh.unwrapSlice(Slice::AC, abc)
+                         , tabhh.unwrapSlice(Slice::BC, abc)
+                         // -- TIJK
+                         , Tijk.data()
+                         , chrono
+                         );
+      WITH_RANK << iteration << "-th doubles done\n";
+      chrono["doubles"].stop();
+      chrono["oneshot-doubles"].stop();
+    }
+
+    // COMPUTE SINGLES =================================================== {{{1
+    OCD_Barrier(universe);
+    if (!isFakeTuple(i)) {
+      WITH_CHRONO(chrono["oneshot-unwrap"],
+      WITH_CHRONO(chrono["unwrap"],
+      WITH_CHRONO(chrono["unwrap:singles"],
+        abhh.unwrapAll(abc);
+      )))
+      chrono["reorder"].start();
+      for (size_t I(0); I < Zijk.size(); I++) Zijk[I] = Tijk[I];
+      chrono["reorder"].stop();
+      chrono["singles"].start();
+      singlesContribution( abc
+                         , Tai.data()
+                         , abhh.unwrapSlice(Slice::AB, abc)
+                         , abhh.unwrapSlice(Slice::AC, abc)
+                         , abhh.unwrapSlice(Slice::BC, abc)
+                         , Zijk.data());
+      chrono["singles"].stop();
+    }
+
+
+    // COMPUTE ENERGY ==================================================== {{{1
+    if (!isFakeTuple(i)) {
+      double tupleEnergy(0.);
+
+      int distinct(0);
+      if (abc[0] == abc[1]) distinct++;
+      if (abc[1] == abc[2]) distinct--;
+      const double epsabc(epsa[abc[0]] + epsa[abc[1]] + epsa[abc[2]]);
+
+      chrono["energy"].start();
+      if ( distinct == 0)
+        tupleEnergy = getEnergyDistinct(epsabc, epsi, Tijk, Zijk);
+      else
+        tupleEnergy = getEnergySame(epsabc, epsi, Tijk, Zijk);
+      chrono["energy"].stop();
+
+#if defined(HAVE_OCD) || defined(TRIPLES_PRINT_TUPLES)
+      tupleEnergies[abc] = tupleEnergy;
+#endif
+
+      energy += tupleEnergy;
+
+#ifdef HAVE_OCD
+      auto const print_slices
+        = [](ABCTuple const& abc, ABCTuple const& want, SliceUnion& u) {
+          if (abc != want) return;
+
+          for (auto type: u.sliceTypes) {
+            auto const& ptr = u.unwrapSlice(type, abc);
+            auto const& slice = Slice::findByTypeAbc(u.slices, type, abc);
+            WITH_RANK << "__print_slice__:n" << u.name << " "
+                      << pretty_print(abc) << " "
+                      << pretty_print(slice.info)
+                      ;
+            for (size_t i = 0; i < 20; i++) std::cout << ptr[i] << ", ";
+            std::cout << std::endl;
+          }
+        };
+#endif
+
+    if (isFakeTuple(i)) {
+      // fake iterations should also unwrap whatever they got
+      WITH_RANK << iteration
+                << "th unwrapping because of fake in "
+                << i << "\n";
+      for (auto& u: unions) u->unwrapAll(abc);
+    }
+
+#ifdef HAVE_OCD
+    for (auto const& u: unions) {
+      WITH_RANK << "__dups__:"
+                << iteration
+                << "-th n" << u->name << " checking duplicates\n";
+      u->checkForDuplicates();
+    }
+#endif
+
+
+    // CLEANUP UNIONS ===================================================={{{1
+    OCD_Barrier(universe);
+    if (abcNext) {
+      chrono["gc"].start();
+      WITH_RANK << "__gc__:" << iteration << "-th cleaning up.......\n";
+      for (auto& u: unions) {
+
+        u->unwrapAll(abc);
+        WITH_RANK << "__gc__:n" << u->name  << " :it " << iteration
+                  << " :abc " << pretty_print(abc)
+                  << " :abcN " << pretty_print(*abcNext)
+                  << "\n";
+        for (auto const& slice: u->slices)
+          WITH_RANK << "__gc__:guts:" << slice.info << "\n";
+        u->clearUnusedSlicesForNext(*abcNext);
+
+        WITH_RANK << "__gc__: checking validity\n";
+
+#ifdef HAVE_OCD
+        // check for validity of the slices
+        for (auto type: u->sliceTypes) {
+          auto tuple = Slice::subtupleBySlice(abc, type);
+        for (auto& slice: u->slices) {
+          if ( slice.info.type == type
+             && slice.info.tuple == tuple
+             && slice.isDirectlyFetchable()
+             ) {
+            if (slice.info.state == Slice::Dispatched)
+              throw std::domain_error( "This slice should not be undispatched! "
+                                     + pretty_print(slice.info));
+          }
+        }
+        }
+#endif
+
+
+      }
+      chrono["gc"].stop();
+    }
+
+      WITH_RANK << iteration << "-th cleaning up....... DONE\n";
+    }
+
+    // CLEAN CHRONO ======================================================{{{1
+    chrono["iterations"].stop();
+    { // TODO: REMOVEME
+      chrono["oneshot-doubles"].clear();
+      chrono["oneshot-mpi:barrier"].clear();
+      chrono["oneshot-db:comm:allgather"].clear();
+      chrono["oneshot-unwrap"].clear();
+    }
+
+    // ITERATION END ====================================================={{{1
+  }  // END OF MAIN LOOP
+
+  MPI_Barrier(universe);
+
+  // PRINT TUPLES ========================================================={{{1
+#if defined(HAVE_OCD) || defined(TRIPLES_PRINT_TUPLES)
+  LOG(0,"NEW_TRIPLES") << "tuple energies" << "\n";
+  for (size_t i = 0; i < np; i++) {
+    MPI_Barrier(universe);
+    for (auto const& pair: tupleEnergies) {
+      if (i == rank)
+        std::cout << pair.first[0]
+                  << " " << pair.first[1]
+                  << " " << pair.first[2]
+                  << std::setprecision(15) << std::setw(23)
+                  << " tupleEnergy: " << pair.second
+                  << "\n"
+                  ;
+    }
+  }
+#endif
+
+  // COMMUNICATE THE ENERGIES ============================================={{{1
+  LOG(0,"NEW_TRIPLES") << "COMMUNICATING ENERGIES \n";
+  double globalEnergy = 0;
+  MPI_Reduce(&energy, &globalEnergy, 1, MPI_DOUBLE, MPI_SUM, 0, universe);
+
+  WITH_RANK << "local energy " << energy << "\n";
+  LOG(0,"LOOP FINISHED, energy")
+    << std::setprecision(15) << std::setw(23)
+    << globalEnergy << std::endl;
+
+  // PRINT TIMINGS {{{1
+  for (auto const& pair: chrono)
+    LOG(0,"atrip:chrono") << pair.first << " "
+                          << pair.second.count() << std::endl;
+
+
+  LOG(0, "atrip:flops")
+    << nIterations * doublesFlops / chrono["doubles"].count() << "\n";
+
+}
+#+end_src
+
+
+** Include header
+
+#+begin_src c++ :tangle (atrip-main-h)
+#pragma once
+
+#define TRIPLES_BENCHMARK
+#define TRIPLES_DEBUG 1
+//#define TRIPLES_WORKLOAD_DUMP
+#define TRIPLES_USE_DGEMM
+//#define TRIPLES_PRINT_TUPLES
+
+#if TRIPLES_DEBUG == 4
+#  pragma message("WARNING: You have OCD debugging ABC triples "\
+                  "expect GB of output and consult your therapist")
+#  include <dbg.h>
+#  define HAVE_OCD
+#  define OCD_Barrier(com) MPI_Barrier(com)
+#  define WITH_OCD
+#  define WITH_ROOT if (atrip::Atrip::rank == 0)
+#  define WITH_SPECIAL(r) if (atrip::Atrip::rank == r)
+#  define WITH_RANK std::cout << atrip::Atrip::rank << ": "
+#  define WITH_CRAZY_DEBUG
+#  define WITH_DBG
+#  define DBG(...) dbg(__VA_ARGS__)
+#elif TRIPLES_DEBUG == 3
+#  pragma message("WARNING: You have crazy debugging ABC triples,"\
+                  " expect GB of output")
+#  include <dbg.h>
+#  define OCD_Barrier(com)
+#  define WITH_OCD if (false)
+#  define WITH_ROOT if (atrip::Atrip::rank == 0)
+#  define WITH_SPECIAL(r) if (atrip::Atrip::rank == r)
+#  define WITH_RANK std::cout << atrip::Atrip::rank << ": "
+#  define WITH_CRAZY_DEBUG
+#  define WITH_DBG
+#  define DBG(...) dbg(__VA_ARGS__)
+#elif TRIPLES_DEBUG == 2
+#  pragma message("WARNING: You have some debugging info for ABC triples")
+#  include <dbg.h>
+#  define OCD_Barrier(com)
+#  define WITH_OCD if (false)
+#  define WITH_ROOT if (atrip::Atrip::rank == 0)
+#  define WITH_SPECIAL(r) if (atrip::Atrip::rank == r)
+#  define WITH_RANK std::cout << atrip::Atrip::rank << ": "
+#  define WITH_CRAZY_DEBUG if (false)
+#  define WITH_DBG
+#  define DBG(...) dbg(__VA_ARGS__)
+#elif TRIPLES_DEBUG == 1
+#  define OCD_Barrier(com)
+#  define WITH_OCD if (false)
+#  define WITH_ROOT if (false)
+#  define WITH_SPECIAL(r) if (false)
+#  define WITH_RANK if (false) std::cout << atrip::Atrip::rank << ": "
+#  define WITH_DBG if (false)
+#  define WITH_CRAZY_DEBUG if (false)
+#  define DBG(...)
+#else
+#  error("TRIPLES_DEBUG is not defined!")
+#endif
+
+#include <atrip/Utils.hpp>
+#include <atrip/Slice.hpp>
+#include <atrip/Atrip.hpp>
+
+
+#+end_src
+
+
+** Todo                                                            :noexport:
+ #+begin_src c++  :tangle todo.hpp
+#include <algorithms/PerturbativeTriplesAbcijk.hpp>
+#include <math/MathFunctions.hpp>
+#include <tcc/DryTensor.hpp>
+#include <util/Log.hpp>
+#include <util/Exception.hpp>
+#include <Cc4s.hpp>
+#include <ctf.hpp>
+#include <util/MpiCommunicator.hpp>
+
+using namespace cc4s;
+
+#include <sstream>
+#include <algorithm>
+#include <numeric>
+#include <array>
+#include <vector>
+#include <map>
+#include <memory>
+#include <set>
+#include <chrono>
+
+
+#+end_src
+
+#+begin_src c++ :tangle todo.hpp
+
+
+
+
 // MAIN ALGORITHM ======================================================{{{1
 void cc4s::PerturbativeTriplesAbcijk::run(){
 
@@ -2161,9 +2784,6 @@ void cc4s::PerturbativeTriplesAbcijk::run(){
   // START MAIN LOOP ======================================================{{{1
 
   Slice::Database db;
-#if defined(TRIPLES_WORKLOAD_DUMP)
-  std::vector<std::vector<WorkloadEntry>> workloadDB;
-#endif
 
   for ( size_t i = abcIndex.first, iteration = 1
       ; i < abcIndex.second
@@ -2371,30 +2991,6 @@ void cc4s::PerturbativeTriplesAbcijk::run(){
 #endif
 
 
-    // WORKLOAD DUMP  ====================================================={{{1
-#if defined(TRIPLES_WORKLOAD_DUMP)
-    {
-      chrono["workload"].start();
-      std::vector<WorkloadEntry> entries;
-      for (auto const& u: unions) {
-        entries.push_back
-         ({ rank
-          , iteration
-          , abc
-          , (size_t)u->name
-          , WorkloadEntry::findSend(db, *u, rank)
-          , WorkloadEntry::findRecv(*u)
-          , (size_t)std::round(1e3*chrono["oneshot-doubles"].count())
-          , (size_t)std::round(1e3*chrono["oneshot-mpi:barrier"].count())
-          , (size_t)std::round(1e3*chrono["oneshot-db:comm:allgather"].count())
-          , (size_t)std::round(1e3*chrono["oneshot-unwrap"].count())
-          });
-      }
-      workloadDB.push_back(entries);
-      chrono["workload"].stop();
-    }
-#endif
-
     // CLEANUP UNIONS ===================================================={{{1
     OCD_Barrier(universe);
     if (abcNext) {
@@ -2489,59 +3085,5 @@ void cc4s::PerturbativeTriplesAbcijk::run(){
   LOG(0, "atrip:flops")
     << nIterations * doublesFlops / chrono["doubles"].count() << "\n";
 
-  // WORKLOAD DUMP  ====================================================={{{1
-#if defined(TRIPLES_WORKLOAD_DUMP)
-  {
-    constexpr char* _workload_file = "workload.triples.bin";
-    LOG(0,"NEW_TRIPLES") << "Writing out WORKLOAD to "
-                         << _workload_file << "\n";
-    LOG(0,"NEW_TRIPLES") << "# workload entries = "
-                         << workloadDB.size() << "\n";
-    MPI_File handle;
-    MPI_Status status;
-    MPI_File_open(universe,
-                  _workload_file,
-                  MPI_MODE_CREATE | MPI_MODE_WRONLY,
-                  MPI_INFO_NULL,
-                  &handle);
-
-    for (int _it = 1; _it <= nIterations; _it++) {
-      MPI_Offset const
-          chunk_size = sizeof(WorkloadEntry) * workloadDB[0].size()
-        , main_offset = chunk_size
-                      ,* (rank * nIterations + _it - 1)
-                      ;
-
-      auto const& entries = workloadDB[_it - 1];
-      for (size_t i = 0; i < entries.size(); i++) {
-        MPI_Offset const
-          local_offset = main_offset + i * sizeof(WorkloadEntry);
-        LOG(0,"NEW_TRIPLES") << entries[i].iteration
-                             << ","
-                             << entries[i].name
-                             << ","
-                             << entries[i].send
-                             << ","
-                             << entries[i].recv
-                             << ","
-                             << entries[i].doubles
-                             << ","
-                             << entries[i].barrier
-                             << "\n"
-                             ;
-        MPI_File_write_at(handle,
-                          local_offset,
-                          &entries[i],
-                          sizeof(WorkloadEntry),
-                          MPI_CHAR,
-                          &status);
-      }
-    } // iterations
-    MPI_Barrier(universe);
-    MPI_File_close(&handle);
-  }
-#endif
-
-
 }
  #+end_src
diff --git a/bench/test_main.cxx b/bench/test_main.cxx
index e06dcc4..ebfe817 100644
--- a/bench/test_main.cxx
+++ b/bench/test_main.cxx
@@ -28,6 +28,9 @@ int main(int argc, char** argv) {
     , Vppph(4, vvvo.data(), symmetries.data(), world)
     ;
 
+  atrip::Atrip::init();
+  atrip::Atrip::run({&ei, &ea, &Tph, &Tpphh, &Vpphh, &Vhhhp, &Vppph});
+
   std::cout << "Hello world" << std::endl;
   MPI_Finalize();
   return 0;
diff --git a/config.el b/config.el
index 868bfea..f837321 100644
--- a/config.el
+++ b/config.el
@@ -1,4 +1,5 @@
 (require 'subr-x)
+(require 'org)
 
 (defun f-join (&rest args)
   (string-join args "/"))
@@ -7,11 +8,29 @@
 (defun atrip-print-sources ()
   (princ (string-join atrip-sources " ")))
 
+(defvar atrip-include-f "include/atrip") ;; TODO: create defvar
+(defvar atrip-src-f "src/atrip")         ;; TODO: create defvar
+
 (defmacro atrip-def (name body) `(progn (defun ,name () ,body)
                                              (push (,name) atrip-sources)))
 
+
+(defmacro atrip-def-src (name body)
+  `(atrip-def ,name (f-join atrip-src-f ,body)))
+(defmacro atrip-def-hdr (name body)
+  `(atrip-def ,name (f-join atrip-include-f ,body)))
+
 ;; atrip variables for the org-mode file
-(atrip-def atrip-include-f "include/atrip")
-(atrip-def atrip-slice-h (f-join (atrip-include-f) "Slice.hpp"))
-(atrip-def atrip-utils-h (f-join (atrip-include-f) "Utils.hpp"))
+(atrip-def-hdr atrip-slice-h "Slice.hpp")
+(atrip-def-hdr atrip-slice-union-h "SliceUnion.hpp")
+(atrip-def-hdr atrip-utils-h "Utils.hpp")
+(atrip-def-hdr atrip-blas-h "Blas.hpp")
+(atrip-def-hdr atrip-rankmap-h "RankMap.hpp")
+(atrip-def-hdr atrip-unions-h "Unions.hpp")
+(atrip-def-hdr atrip-tuples-h "Tuples.hpp")
+(atrip-def-hdr atrip-equations-h "Equations.hpp")
+
+(atrip-def-hdr atrip-atrip-h "Atrip.hpp")
+(atrip-def-src atrip-atrip-cxx "Atrip.cxx")
+
 (atrip-def atrip-main-h   "include/atrip.hpp")
diff --git a/etc/config/gcc.mk b/etc/config/gcc.mk
index 7b14a3e..cf3aee4 100644
--- a/etc/config/gcc.mk
+++ b/etc/config/gcc.mk
@@ -6,5 +6,6 @@ CXXFLAGS += -I$(ATRIP_ROOT)/include
 
 CXXFLAGS += -I$(CTF_INCLUDE_PATH)
 LDFLAGS += -Wl,-Bstatic -L$(CTF_BUILD_PATH)/lib -lctf
-LDFLAGS += -fopenmp -L/usr/lib -lscalapack -L/opt/OpenBLAS/lib -lopenblas
+LDFLAGS += -fopenmp -L/usr/lib -L/opt/OpenBLAS/lib -lopenblas
+LDFLAGS += -L/usr/local/lib -lscalapack
 LDFLAGS += -Wl,-Bdynamic
diff --git a/etc/make/ctf_vars.mk b/etc/make/ctf_vars.mk
index 490d26b..1de08fa 100644
--- a/etc/make/ctf_vars.mk
+++ b/etc/make/ctf_vars.mk
@@ -1,8 +1,8 @@
 CTF_REPOSITORY = https://github.com/cyclops-community/ctf
 CTF_COMMIT    ?= v1.5.0
 
-CTF_SRC_PATH     = $(ATRIP_ROOT)/lib/src/ctf/$(CTF_COMMIT)
-CTF_BUILD_PATH   = $(ATRIP_ROOT)/lib/build/ctf/$(CTF_COMMIT)
+CTF_SRC_PATH     = $(ATRIP_ROOT)/extern/src/ctf/$(CTF_COMMIT)
+CTF_BUILD_PATH   = $(ATRIP_ROOT)/extern/build/ctf/$(CTF_COMMIT)
 
 CTF_CONFIG_FLAGS =
 
diff --git a/etc/make/emacs.mk b/etc/make/emacs.mk
index d617eae..82f0485 100644
--- a/etc/make/emacs.mk
+++ b/etc/make/emacs.mk
@@ -1,5 +1,5 @@
 EMACS = emacs -q --batch --load config.el
 define tangle
-$(EMACS) $(1) --eval "(require 'org)" --eval '(org-babel-tangle)'
+$(EMACS) $(1) --eval '(org-babel-tangle)'
 endef
 
diff --git a/etc/nix/scalapack.nix b/etc/nix/scalapack.nix
new file mode 100644
index 0000000..04864da
--- /dev/null
+++ b/etc/nix/scalapack.nix
@@ -0,0 +1,56 @@
+{ lib, stdenv, fetchFromGitHub, cmake, openssh
+, gfortran, mpi, blas, lapack
+} :
+
+assert (!blas.isILP64) && (!lapack.isILP64);
+
+stdenv.mkDerivation rec {
+  pname = "scalapack";
+  version = "2.1.0";
+
+  src = fetchFromGitHub {
+    owner = "Reference-ScaLAPACK";
+    repo = pname;
+    rev = "v${version}";
+    sha256 = "1c10d18gj3kvpmyv5q246x35hjxaqn4ygy1cygaydhyxnm4klzdj";
+  };
+
+  nativeBuildInputs = [ cmake openssh ];
+  buildInputs = [ mpi gfortran blas lapack ];
+
+  doCheck = false;
+
+  preConfigure = ''
+    cmakeFlagsArray+=(
+      -DBUILD_SHARED_LIBS=ON -DBUILD_STATIC_LIBS=ON
+      -DLAPACK_LIBRARIES="-llapack"
+      -DBLAS_LIBRARIES="-lblas"
+      )
+  '';
+
+  # Increase individual test timeout from 1500s to 10000s because hydra's builds
+  # sometimes fail due to this
+  checkFlagsArray = [ "ARGS=--timeout 10000" ];
+
+  preCheck = ''
+    # make sure the test starts even if we have less than 4 cores
+    export OMPI_MCA_rmaps_base_oversubscribe=1
+
+    # Fix to make mpich run in a sandbox
+    export HYDRA_IFACE=lo
+
+    # Run single threaded
+    export OMP_NUM_THREADS=1
+
+    export LD_LIBRARY_PATH=$LD_LIBRARY_PATH''${LD_LIBRARY_PATH:+:}`pwd`/lib
+  '';
+
+  meta = with lib; {
+    homepage = "http://www.netlib.org/scalapack/";
+    description = "Library of high-performance linear algebra routines for parallel distributed memory machines";
+    license = licenses.bsd3;
+    platforms = [ "x86_64-linux" ];
+    maintainers = with maintainers; [ costrouc markuskowa ];
+  };
+
+}
diff --git a/shell.nix b/shell.nix
index 915c87f..5550b0d 100644
--- a/shell.nix
+++ b/shell.nix
@@ -14,4 +14,23 @@ pkgs.mkShell rec {
     emacs
   ];
 
+  scalapack = import ./etc/nix/scalapack.nix {
+    lib = pkgs.lib;
+    stdenv = pkgs.stdenv;
+    fetchFromGitHub = pkgs.fetchFromGitHub;
+    cmake = pkgs.cmake;
+    openssh = pkgs.openssh;
+    gfortran = pkgs.gfortran;
+    mpi = pkgs.mpi;
+    blas = pkgs.blas;
+    lapack = pkgs.lapack;
+  };
+
+  shellHook = ''
+    export LAPACK_PATH=${pkgs.lapack}
+    export BLAS_PATH=${pkgs.blas}
+    export SCALAPACK_PATH=${scalapack}
+    export LD_LIBRARY_PATH=${scalapack}/lib:$LD_LIBRARY_PATH
+  '';
+
 }