Changes in source files, makes cuda run

2022-08-05 13:42:04 +02:00 · 2022-08-05 13:42:04 +02:00 · a5b2a74e18
commit a5b2a74e18
parent e03dd77904
22 changed files with 1408 additions and 539 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,3 +1,5 @@
 *.swo
 *.swp
 etc/autotools/
 Makefile.in
 build
--- a/configure.ac
+++ b/configure.ac
@ -21,6 +21,27 @@ AC_ARG_ENABLE(shared,
                                     files (default=YES)]),
              [], [enable_shared=yes])
 AC_ARG_ENABLE(
    [slice],
    [AS_HELP_STRING(
      [--disable-slice],
      [Disable the step of slicing tensors for CTF, this is useful for example for benchmarking or testing.])],
    [atrip_dont_slice=1
     AC_DEFINE([ATRIP_DONT_SLICE],1,[Wether CTF will slice tensors or skip the step])
    ],
    [atrip_dont_slice=0]
 )
 AC_ARG_ENABLE(
    [atrip_dgemm],
    [AS_HELP_STRING(
      [--disable-dgemm],
      [Disable using dgemm for the doubles equations])],
    [],
    [AC_DEFINE([ATRIP_USE_DGEMM],1,[Use dgemm for the doubles equations])]
 )
 AC_ARG_ENABLE([docs],
              [AS_HELP_STRING([--enable-docs],
                              [Enable building docs])],
@ -28,7 +49,7 @@ AC_ARG_ENABLE([docs],
 dnl LIBGC library options
 AC_ARG_WITH(libctf-prefix,
-            AS_HELP_STRING([--with-libctf-prefix=path],
+            AS_HELP_STRING([--with-ctf],
                           [prefix for CTF includes and libraries] ),
            [LIBCTF_PATH="`readlink -f $withval`";
             LIBCTF_CPATH="`readlink -f $withval`/include";
@ -44,11 +65,6 @@ AC_ARG_WITH([clang-check],
            [clang_check=NO])
 AM_CONDITIONAL([WITH_CLANG_CHECK], [test x${clang_check} = xYES])
 AC_ARG_WITH(dgemm,
            AS_HELP_STRING([--without-dgemm], [Disable dgemm]),
            [with_dgemm=NO],
            [with_dgemm=YES])
 AC_ARG_ENABLE([cuda],
              [AS_HELP_STRING([--enable-cuda],
                              [Build with cuda])],
@ -59,6 +75,12 @@ AC_ARG_VAR([CUDA_LDFLAGS], [LDFLAGS to find libraries -lcuda, -lcudart, -lcublas
 AC_ARG_VAR([CUDA_CXXFLAGS], [CXXFLAGS to find the CUDA headers])
 AC_ARG_WITH([atrip-debug],
            [AS_HELP_STRING([--with-atrip-debug],
                            [Debug level for atrip, possible values: 1, 2, 3, 4])],
            [AC_DEFINE([ATRIP_DEBUG],[atrip-debug],[Atrip debug level])],
            [AC_DEFINE([ATRIP_DEBUG],[1],[Atrip debug level])]
            )
 dnl -----------------------------------------------------------------------
--- a/include/atrip.hpp
+++ b/include/atrip.hpp
@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-// [[file:../atrip.org::*Include header][Include header:1]]
+// [[file:~/cuda/atrip/atrip.org::*Include%20header][Include header:1]]
 #pragma once
 #include <atrip/Atrip.hpp>
--- a/include/atrip/Atrip.hpp
+++ b/include/atrip/Atrip.hpp
@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-// [[file:../../atrip.org::*Header][Header:1]]
+// [[file:~/cuda/atrip/atrip.org::*Header][Header:1]]
 #pragma once
 #include <sstream>
 #include <string>
@ -41,12 +41,21 @@ namespace atrip {
  struct Atrip {
-    static int rank;
+    static size_t rank;
-    static int np;
+    static size_t np;
    static MPI_Comm communicator;
    static Timings chrono;
 #if defined(HAVE_CUDA)
    struct CudaContext {
      cublasStatus_t status;
      cublasHandle_t handle;
    };
    static CudaContext cuda;
 #endif
    static void init(MPI_Comm);
    template <typename F=double>
    struct Input {
      CTF::Tensor<F> *ei = nullptr
--- a/include/atrip/Blas.hpp
+++ b/include/atrip/Blas.hpp
@ -12,12 +12,16 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-// [[file:../../atrip.org::*Blas][Blas:1]]
+// [[file:~/cuda/atrip/atrip.org::*Blas][Blas:1]]
 #pragma once
 #include <atrip/Complex.hpp>
 #include <atrip/Types.hpp>
 #include "config.h"
 namespace atrip {
-  using Complex = std::complex<double>;
+#if !defined(HAVE_CUDA)
  extern "C" {
    void dgemm_(
      const char *transa,
@ -50,49 +54,43 @@ namespace atrip {
      Complex *C,
      const int *ldc
    );
  void dcopy_(const int n,
              const double *x,
              const int incx,
              double *y,
              const int incy);
  void zcopy_(const int n,
              const void *x,
              const int incx,
              void *y,
              const int incy);
  }
 #endif
  template <typename F>
  void xcopy(const int n,
             const  DataFieldType<F>* x,
             const int incx,
             DataFieldType<F>* y,
             const int incy);
-  template <typename F=double>
+  template <typename F>
  void xgemm(const char *transa,
             const char *transb,
             const int *m,
             const int *n,
             const int *k,
             F *alpha,
-             const F *A,
+             const DataFieldType<F> *A,
             const int *lda,
-             const F *B,
+             const DataFieldType<F> *B,
             const int *ldb,
             F *beta,
-             F *C,
+             DataFieldType<F> *C,
-             const int *ldc) {
+             const int *ldc);
    dgemm_(transa, transb,
           m, n, k,
           alpha, A, lda,
           B, ldb, beta,
           C, ldc);
  }
  template <>
  void xgemm(const char *transa,
             const char *transb,
             const int *m,
             const int *n,
             const int *k,
             Complex *alpha,
             const Complex *A,
             const int *lda,
             const Complex *B,
             const int *ldb,
             Complex *beta,
             Complex *C,
             const int *ldc) {
    zgemm_(transa, transb,
           m, n, k,
           alpha, A, lda,
           B, ldb, beta,
           C, ldc);
  }
 }
 // Blas:1 ends here
--- a/include/atrip/CUDA.hpp
+++ b/include/atrip/CUDA.hpp
@ -0,0 +1,9 @@
 #pragma once
 #if defined(HAVE_CUDA) && defined(__CUDACC__)
 #  define __MAYBE_GLOBAL__ __global__
 #  define __MAYBE_DEVICE__ __device__
 #else
 #  define __MAYBE_GLOBAL__
 #  define __MAYBE_DEVICE__
 #endif
--- a/include/atrip/Checkpoint.hpp
+++ b/include/atrip/Checkpoint.hpp
@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-// [[file:../../atrip.org::*Prolog][Prolog:1]]
+// [[file:~/cuda/atrip/atrip.org::*Prolog][Prolog:1]]
 #pragma once
 #include <fstream>
 #include <iomanip>
@ -22,7 +22,7 @@
 namespace atrip {
 // Prolog:1 ends here
-// [[file:../../atrip.org::checkpoint-definition][checkpoint-definition]]
+// [[file:~/cuda/atrip/atrip.org::checkpoint-definition][checkpoint-definition]]
 // template <typename F>
 struct Checkpoint {
  size_t no, nv;
@ -36,7 +36,7 @@ struct Checkpoint {
 };
 // checkpoint-definition ends here
-// [[file:../../atrip.org::*Input and output][Input and output:1]]
+// [[file:~/cuda/atrip/atrip.org::*Input%20and%20output][Input and output:1]]
 void write_checkpoint(Checkpoint const& c, std::string const& filepath) {
  std::ofstream out(filepath);
  out << "No: " << c.no
@ -87,6 +87,6 @@ Checkpoint read_checkpoint(std::string const& filepath) {
 }
 // Input and output:1 ends here
-// [[file:../../atrip.org::*Epilog][Epilog:1]]
+// [[file:~/cuda/atrip/atrip.org::*Epilog][Epilog:1]]
 }
 // Epilog:1 ends here
--- a/include/atrip/Complex.hpp
+++ b/include/atrip/Complex.hpp
@ -0,0 +1,46 @@
 // Copyright 2022 Alejandro Gallo
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // [[file:~/cuda/atrip/atrip.org::*Complex%20numbers][Complex numbers:1]]
 #pragma once
 #include <complex>
 #include <mpi.h>
 #include "config.h"
 #if defined(HAVE_CUDA)
 #include <cuComplex.h>
 #endif
 namespace atrip {
  using Complex = std::complex<double>;
  template <typename F> F maybeConjugate(const F);
 #if defined(HAVE_CUDA)
  cuDoubleComplex& operator+=(cuDoubleComplex& lz, cuDoubleComplex const& rz);
 #endif
  namespace traits {
    template <typename FF> bool isComplex();
    namespace mpi {
      template <typename F> MPI_Datatype datatypeOf(void);
    }
  }
 }
 // Complex numbers:1 ends here
--- a/include/atrip/Debug.hpp
+++ b/include/atrip/Debug.hpp
@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-// [[file:../../atrip.org::*Macros][Macros:1]]
+// [[file:~/cuda/atrip/atrip.org::*Macros][Macros:1]]
 #pragma once
 #include <functional>
 #define ATRIP_BENCHMARK
@ -21,7 +21,6 @@
 #  define ATRIP_DEBUG 1
 #endif
 //#define ATRIP_WORKLOAD_DUMP
 #define ATRIP_USE_DGEMM
 //#define ATRIP_PRINT_TUPLES
 #ifndef ATRIP_DEBUG
@ -75,20 +74,20 @@
 #endif
 // Macros:1 ends here
-// [[file:../../atrip.org::*Macros][Macros:2]]
+// [[file:~/cuda/atrip/atrip.org::*Macros][Macros:2]]
 #ifndef LOG
 #define LOG(level, name) if (Atrip::rank == 0) std::cout << name << ": "
 #endif
 // Macros:2 ends here
-// [[file:../../atrip.org::*Macros][Macros:3]]
+// [[file:~/cuda/atrip/atrip.org::*Macros][Macros:3]]
 #ifdef ATRIP_NO_OUTPUT
 #  undef LOG
 #  define LOG(level, name) if (false) std::cout << name << ": "
 #endif
 // Macros:3 ends here
-// [[file:../../atrip.org::IterationDescriptor][IterationDescriptor]]
+// [[file:~/cuda/atrip/atrip.org::IterationDescriptor][IterationDescriptor]]
 namespace atrip {
  struct IterationDescription;
--- a/include/atrip/Equations.hpp
+++ b/include/atrip/Equations.hpp
@ -12,371 +12,94 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-// [[file:../../atrip.org::*Equations][Equations:1]]
+// [[file:~/cuda/atrip/atrip.org::*Prolog][Prolog:1]]
 #pragma once
-#include<atrip/Slice.hpp>
+#include<atrip/Atrip.hpp>
 #include<atrip/Blas.hpp>
 #include<atrip/Utils.hpp>
 #if defined(HAVE_CUDA)
 #include<thrust/device_vector.h>
 #endif
 namespace atrip {
 using ABCTuple = std::array<size_t, 3>;
 using PartialTuple = std::array<size_t, 2>;
 using ABCTuples = std::vector<ABCTuple>;
 // Prolog:1 ends here
-  template <typename F=double>
+// [[file:~/cuda/atrip/atrip.org::*Energy][Energy:1]]
-  double getEnergyDistinct
+template <typename F=double>
-    ( const F epsabc
+double getEnergyDistinct
-    , std::vector<F> const& epsi
+  ( F const epsabc
-    , std::vector<F> const& Tijk_
+  , size_t const No
-    , std::vector<F> const& Zijk_
+  , F* const epsi
-    ) {
+  , F* const Tijk
-    constexpr size_t blockSize=16;
+  , F* const Zijk
-    F energy(0.);
+  );
    const size_t No = epsi.size();
    for (size_t kk=0; kk<No; kk+=blockSize){
      const size_t kend( std::min(No, kk+blockSize) );
      for (size_t jj(kk); jj<No; jj+=blockSize){
        const size_t jend( std::min( No, jj+blockSize) );
        for (size_t ii(jj); ii<No; ii+=blockSize){
          const size_t iend( std::min( No, ii+blockSize) );
          for (size_t k(kk); k < kend; k++){
            const F ek(epsi[k]);
            const size_t jstart = jj > k ? jj : k;
            for (size_t j(jstart); j < jend; j++){
              F const ej(epsi[j]);
              F const facjk = j == k ? F(0.5) : F(1.0);
              size_t istart = ii > j ? ii : j;
              for (size_t i(istart); i < iend; i++){
                const F
                    ei(epsi[i])
                  , facij = i == j ? F(0.5) : F(1.0)
                  , denominator(epsabc - ei - ej - ek)
                  , U(Zijk_[i + No*j + No*No*k])
                  , V(Zijk_[i + No*k + No*No*j])
                  , W(Zijk_[j + No*i + No*No*k])
                  , X(Zijk_[j + No*k + No*No*i])
                  , Y(Zijk_[k + No*i + No*No*j])
                  , Z(Zijk_[k + No*j + No*No*i])
                  , A(maybeConjugate<F>(Tijk_[i + No*j + No*No*k]))
                  , B(maybeConjugate<F>(Tijk_[i + No*k + No*No*j]))
                  , C(maybeConjugate<F>(Tijk_[j + No*i + No*No*k]))
                  , D(maybeConjugate<F>(Tijk_[j + No*k + No*No*i]))
                  , E(maybeConjugate<F>(Tijk_[k + No*i + No*No*j]))
                  , _F(maybeConjugate<F>(Tijk_[k + No*j + No*No*i]))
                  , value
                    = 3.0 * ( A * U
                              + B * V
                              + C * W
                              + D * X
                              + E * Y
                              + _F * Z )
                   + ( ( U + X + Y )
                     - 2.0 * ( V + W + Z )
                     ) * ( A + D + E )
                   + ( ( V + W + Z )
                     - 2.0 * ( U + X + Y )
                     ) * ( B + C + _F )
                  ;
                energy += 2.0 * value / denominator * facjk * facij;
              } // i
            } // j
          } // k
        } // ii
      } // jj
    } // kk
    return std::real(energy);
  }
 template <typename F=double>
 double getEnergySame
  ( F const epsabc
  , size_t const No
  , F* const epsi
  , F* const Tijk
  , F* const Zijk
  );
 // Energy:1 ends here
-  template <typename F=double>
+// [[file:~/cuda/atrip/atrip.org::*Singles%20contribution][Singles contribution:1]]
-  double getEnergySame
+template <typename F=double>
-    ( const F epsabc
+#ifdef HAVE_CUDA
-    , std::vector<F> const& epsi
+__global__
-    , std::vector<F> const& Tijk_
+#endif
-    , std::vector<F> const& Zijk_
+void singlesContribution
    ) {
    constexpr size_t blockSize = 16;
    const size_t No = epsi.size();
    F energy = F(0.);
    for (size_t kk=0; kk<No; kk+=blockSize){
      const size_t kend( std::min( kk+blockSize, No) );
      for (size_t jj(kk); jj<No; jj+=blockSize){
        const size_t jend( std::min( jj+blockSize, No) );
        for (size_t ii(jj); ii<No; ii+=blockSize){
          const size_t iend( std::min( ii+blockSize, No) );
          for (size_t k(kk); k < kend; k++){
            const F ek(epsi[k]);
            const size_t jstart = jj > k ? jj : k;
            for(size_t j(jstart); j < jend; j++){
              const F facjk( j == k ? F(0.5) : F(1.0));
              const F ej(epsi[j]);
              const size_t istart = ii > j ? ii : j;
              for(size_t i(istart); i < iend; i++){
                const F
                  ei(epsi[i])
                , facij ( i==j ? F(0.5) : F(1.0))
                , denominator(epsabc - ei - ej - ek)
                , U(Zijk_[i + No*j + No*No*k])
                , V(Zijk_[j + No*k + No*No*i])
                , W(Zijk_[k + No*i + No*No*j])
                , A(maybeConjugate<F>(Tijk_[i + No*j + No*No*k]))
                , B(maybeConjugate<F>(Tijk_[j + No*k + No*No*i]))
                , C(maybeConjugate<F>(Tijk_[k + No*i + No*No*j]))
                , value
                  = F(3.0) * ( A * U
                             + B * V
                             + C * W
                             )
                  - ( A + B + C ) * ( U + V + W )
                ;
                energy += F(2.0) * value / denominator * facjk * facij;
              } // i
            } // j
          } // k
        } // ii
      } // jj
    } // kk
    return std::real(energy);
  }
  template <typename F=double>
  void singlesContribution
  ( size_t No
  , size_t Nv
-    , const ABCTuple &abc
+  , size_t a
-    , F const* Tph
+  , size_t b
-    , F const* VABij
+  , size_t c
-    , F const* VACij
+  , DataFieldType<F>* const Tph
-    , F const* VBCij
+  , DataFieldType<F>* const VABij
-    , F *Zijk
+  , DataFieldType<F>* const VACij
-    ) {
+  , DataFieldType<F>* const VBCij
-    const size_t a(abc[0]), b(abc[1]), c(abc[2]);
+  , DataFieldType<F>* Zijk
-    for (size_t k=0; k < No; k++)
+  );
-    for (size_t i=0; i < No; i++)
+// Singles contribution:1 ends here
    for (size_t j=0; j < No; j++) {
      const size_t ijk = i + j*No + k*No*No
                ,  jk = j + No * k
                ;
      Zijk[ijk] += Tph[ a + i * Nv ] * VBCij[ j + k * No ];
      Zijk[ijk] += Tph[ b + j * Nv ] * VACij[ i + k * No ];
      Zijk[ijk] += Tph[ c + k * Nv ] * VABij[ i + j * No ];
    }
  }
 // [[file:~/cuda/atrip/atrip.org::*Doubles%20contribution][Doubles contribution:1]]
  template <typename F=double>
  void doublesContribution
    ( const ABCTuple &abc
    , size_t const No
    , size_t const Nv
    // -- VABCI
-    , F const* VABph
+    , DataPtr<F> const VABph
-    , F const* VACph
+    , DataPtr<F> const VACph
-    , F const* VBCph
+    , DataPtr<F> const VBCph
-    , F const* VBAph
+    , DataPtr<F> const VBAph
-    , F const* VCAph
+    , DataPtr<F> const VCAph
-    , F const* VCBph
+    , DataPtr<F> const VCBph
    // -- VHHHA
-    , F const* VhhhA
+    , DataPtr<F> const VhhhA
-    , F const* VhhhB
+    , DataPtr<F> const VhhhB
-    , F const* VhhhC
+    , DataPtr<F> const VhhhC
    // -- TA
-    , F const* TAphh
+    , DataPtr<F> const TAphh
-    , F const* TBphh
+    , DataPtr<F> const TBphh
-    , F const* TCphh
+    , DataPtr<F> const TCphh
    // -- TABIJ
-    , F const* TABhh
+    , DataPtr<F> const TABhh
-    , F const* TAChh
+    , DataPtr<F> const TAChh
-    , F const* TBChh
+    , DataPtr<F> const TBChh
    // -- TIJK
-    , F *Tijk
+    // , DataPtr<F> Tijk
-    ) {
+    , DataFieldType<F>* Tijk_
    const size_t a = abc[0], b = abc[1], c = abc[2]
              , NoNo = No*No, NoNv = No*Nv
              ;
  #if defined(ATRIP_USE_DGEMM)
  #define _IJK_(i, j, k) i + j*No + k*NoNo
  #define REORDER(__II, __JJ, __KK)                                 \
    WITH_CHRONO("doubles:reorder",                                  \
    for (size_t k = 0; k < No; k++)                                 \
    for (size_t j = 0; j < No; j++)                                 \
    for (size_t i = 0; i < No; i++) {                               \
      Tijk[_IJK_(i, j, k)] += _t_buffer[_IJK_(__II, __JJ, __KK)];   \
    }                                                               \
    )
  #define DGEMM_PARTICLES(__A, __B)      \
    atrip::xgemm<F>( "T"                 \
                   , "N"                 \
                   , (int const*)&NoNo   \
                   , (int const*)&No     \
                   , (int const*)&Nv     \
                   , &one                \
                   , __A                 \
                   , (int const*)&Nv     \
                   , __B                 \
                   , (int const*)&Nv     \
                   , &zero               \
                   , _t_buffer.data()    \
                   , (int const*)&NoNo   \
    );
-  #define DGEMM_HOLES(__A, __B, __TRANSB)    \
+// Doubles contribution:1 ends here
    atrip::xgemm<F>( "N"                     \
                   , __TRANSB                \
                   , (int const*)&NoNo       \
                   , (int const*)&No         \
                   , (int const*)&No         \
                   , &m_one                  \
                   , __A                     \
                   , (int const*)&NoNo       \
                   , __B                     \
                   , (int const*)&No         \
                   , &zero                   \
                   , _t_buffer.data()        \
                   , (int const*)&NoNo       \
                   );
  #define MAYBE_CONJ(_conj, _buffer)                 \
    for (size_t __i = 0; __i < NoNoNo; ++__i)        \
      _conj[__i] = maybeConjugate<F>(_buffer[__i]);  \
    const size_t NoNoNo = No*NoNo;
    std::vector<F> _t_buffer;
    _t_buffer.reserve(NoNoNo);
    F one{1.0}, m_one{-1.0}, zero{0.0};
    WITH_CHRONO("double:reorder",
      for (size_t k = 0; k < NoNoNo; k++) {
         Tijk[k] = 0.0;
       })
    // TOMERGE: replace chronos
    WITH_CHRONO("doubles:holes",
      { // Holes part %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
        std::vector<F> _vhhh(NoNoNo);
        // VhhhC[i + k*No + L*NoNo] * TABhh[L + j*No]; H1
        MAYBE_CONJ(_vhhh, VhhhC)
        WITH_CHRONO("doubles:holes:1",
          DGEMM_HOLES(_vhhh.data(), TABhh, "N")
          REORDER(i, k, j)
        )
        // VhhhC[j + k*No + L*NoNo] * TABhh[i + L*No]; H0
        WITH_CHRONO("doubles:holes:2",
          DGEMM_HOLES(_vhhh.data(), TABhh, "T")
          REORDER(j, k, i)
        )
        // VhhhB[i + j*No + L*NoNo] * TAChh[L + k*No]; H5
        MAYBE_CONJ(_vhhh, VhhhB)
        WITH_CHRONO("doubles:holes:3",
          DGEMM_HOLES(_vhhh.data(), TAChh, "N")
          REORDER(i, j, k)
        )
        // VhhhB[k + j*No + L*NoNo] * TAChh[i + L*No]; H3
        WITH_CHRONO("doubles:holes:4",
          DGEMM_HOLES(_vhhh.data(), TAChh, "T")
          REORDER(k, j, i)
        )
        // VhhhA[j + i*No + L*NoNo] * TBChh[L + k*No]; H1
        MAYBE_CONJ(_vhhh, VhhhA)
        WITH_CHRONO("doubles:holes:5",
          DGEMM_HOLES(_vhhh.data(), TBChh, "N")
          REORDER(j, i, k)
        )
        // VhhhA[k + i*No + L*NoNo] * TBChh[j + L*No]; H4
        WITH_CHRONO("doubles:holes:6",
          DGEMM_HOLES(_vhhh.data(), TBChh, "T")
          REORDER(k, i, j)
        )
      }
    )
  #undef MAYBE_CONJ
    WITH_CHRONO("doubles:particles",
      { // Particle part %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
        // TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv]; P0
        WITH_CHRONO("doubles:particles:1",
          DGEMM_PARTICLES(TAphh, VBCph)
          REORDER(i, j, k)
        )
        // TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv]; P3
        WITH_CHRONO("doubles:particles:2",
          DGEMM_PARTICLES(TAphh, VCBph)
          REORDER(i, k, j)
        )
        // TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv]; P5
        WITH_CHRONO("doubles:particles:3",
          DGEMM_PARTICLES(TCphh, VABph)
          REORDER(k, i, j)
        )
        // TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv]; P2
        WITH_CHRONO("doubles:particles:4",
          DGEMM_PARTICLES(TCphh, VBAph)
          REORDER(k, j, i)
        )
        // TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv]; P1
        WITH_CHRONO("doubles:particles:5",
          DGEMM_PARTICLES(TBphh, VACph)
          REORDER(j, i, k)
        )
        // TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv]; P4
        WITH_CHRONO("doubles:particles:6",
          DGEMM_PARTICLES(TBphh, VCAph)
          REORDER(j, k, i)
        )
      }
    )
  #undef REORDER
  #undef DGEMM_HOLES
  #undef DGEMM_PARTICLES
  #undef _IJK_
  #else
    for (size_t k = 0; k < No; k++)
    for (size_t j = 0; j < No; j++)
    for (size_t i = 0; i < No; i++){
      const size_t ijk = i + j*No + k*NoNo
                ,  jk = j + k*No
                ;
      Tijk[ijk] = 0.0; // :important
      // HOLE DIAGRAMS: TABHH and VHHHA
      for (size_t L = 0; L < No; L++){
        // t[abLj] * V[Lcik]        H1
        // t[baLi] * V[Lcjk]        H0      TODO: conjugate T for complex
        Tijk[ijk] -= TABhh[L + j*No] * VhhhC[i + k*No + L*NoNo];
        Tijk[ijk] -= TABhh[i + L*No] * VhhhC[j + k*No + L*NoNo];
        // t[acLk] * V[Lbij]        H5
        // t[caLi] * V[Lbkj]        H3
        Tijk[ijk] -= TAChh[L + k*No] * VhhhB[i + j*No + L*NoNo];
        Tijk[ijk] -= TAChh[i + L*No] * VhhhB[k + j*No + L*NoNo];
        // t[bcLk] * V[Laji]        H2
        // t[cbLj] * V[Laki]        H4
        Tijk[ijk] -= TBChh[L + k*No] * VhhhA[j + i*No + L*NoNo];
        Tijk[ijk] -= TBChh[j + L*No] * VhhhA[k + i*No + L*NoNo];
      }
      // PARTILCE DIAGRAMS: TAPHH and VABPH
      for (size_t E = 0; E < Nv; E++) {
        // t[aEij] * V[bcEk]        P0
        // t[aEik] * V[cbEj]        P3 // TODO: CHECK THIS ONE, I DONT KNOW
        Tijk[ijk] += TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv];
        Tijk[ijk] += TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv];
        // t[cEki] * V[abEj]        P5
        // t[cEkj] * V[baEi]        P2
        Tijk[ijk] += TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv];
        Tijk[ijk] += TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv];
        // t[bEji] * V[acEk]        P1
        // t[bEjk] * V[caEi]        P4
        Tijk[ijk] += TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv];
        Tijk[ijk] += TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv];
      }
    }
 #endif
  }
 // [[file:~/cuda/atrip/atrip.org::*Epilog][Epilog:1]]
 }
-// Equations:1 ends here
+// Epilog:1 ends here
--- a/include/atrip/RankMap.hpp
+++ b/include/atrip/RankMap.hpp
@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-// [[file:../../atrip.org::*The rank mapping][The rank mapping:1]]
+// [[file:~/cuda/atrip/atrip.org::*The%20rank%20mapping][The rank mapping:1]]
 #pragma once
 #include <vector>
@ -65,7 +65,8 @@ namespace atrip {
          ;
    }
-    bool isSourcePadding(size_t rank, size_t source) const noexcept {
+    bool isSourcePadding(const size_t rank, const size_t source)
      const noexcept {
      return source == nSources() && isPaddingRank(rank);
    }
--- a/include/atrip/Slice.hpp
+++ b/include/atrip/Slice.hpp
@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-// [[file:../../atrip.org::*Prolog][Prolog:1]]
+// [[file:~/cuda/atrip/atrip.org::*Prolog][Prolog:1]]
 #pragma once
 #include <iostream>
 #include <algorithm>
@ -21,33 +21,20 @@
 #include <atrip/Tuples.hpp>
 #include <atrip/Utils.hpp>
 #include <atrip/Blas.hpp>
 namespace atrip {
 template <typename FF> FF maybeConjugate(const FF a) { return a; }
 template <> Complex maybeConjugate(const Complex a) { return std::conj(a); }
 namespace traits {
  template <typename FF> bool isComplex() { return false; }
  template <> bool isComplex<Complex>() { return true; }
 namespace mpi {
  template <typename FF> MPI_Datatype datatypeOf(void);
  template <> MPI_Datatype datatypeOf<double>() { return MPI_DOUBLE; }
  template <> MPI_Datatype datatypeOf<Complex>() { return MPI_DOUBLE_COMPLEX; }
 }
 }
 template <typename F=double>
 struct Slice {
 // Prolog:1 ends here
-// [[file:../../atrip.org::*Location][Location:1]]
+// [[file:~/cuda/atrip/atrip.org::*Location][Location:1]]
 struct Location { size_t rank; size_t source; };
 // Location:1 ends here
-// [[file:../../atrip.org::*Type][Type:1]]
+// [[file:~/cuda/atrip/atrip.org::*Type][Type:1]]
 enum Type
  { A = 10
  , B
@ -65,7 +52,7 @@ enum Type
  };
 // Type:1 ends here
-// [[file:../../atrip.org::*State][State:1]]
+// [[file:~/cuda/atrip/atrip.org::*State][State:1]]
 enum State {
  Fetch = 0,
  Dispatched = 2,
@ -76,7 +63,7 @@ enum State {
 };
 // State:1 ends here
-// [[file:../../atrip.org::*The Info structure][The Info structure:1]]
+// [[file:~/cuda/atrip/atrip.org::*The%20Info%20structure][The Info structure:1]]
 struct Info {
  // which part of a,b,c the slice holds
  PartialTuple tuple;
@ -100,7 +87,7 @@ struct Info {
 using Ty_x_Tu = std::pair< Type, PartialTuple >;
 // The Info structure:1 ends here
-// [[file:../../atrip.org::*Name][Name:1]]
+// [[file:~/cuda/atrip/atrip.org::*Name][Name:1]]
 enum Name
  { TA    = 100
  , VIJKA = 101
@ -110,19 +97,19 @@ enum Name
  };
 // Name:1 ends here
-// [[file:../../atrip.org::*Database][Database:1]]
+// [[file:~/cuda/atrip/atrip.org::*Database][Database:1]]
 struct LocalDatabaseElement {
  Slice<F>::Name name;
  Slice<F>::Info info;
 };
 // Database:1 ends here
-// [[file:../../atrip.org::*Database][Database:2]]
+// [[file:~/cuda/atrip/atrip.org::*Database][Database:2]]
 using LocalDatabase = std::vector<LocalDatabaseElement>;
 using Database = LocalDatabase;
 // Database:2 ends here
-// [[file:../../atrip.org::*MPI Types][MPI Types:1]]
+// [[file:~/cuda/atrip/atrip.org::*MPI%20Types][MPI Types:1]]
 struct mpi {
  static MPI_Datatype vector(size_t n, MPI_Datatype const& DT) {
@ -228,7 +215,7 @@ struct mpi {
 };
 // MPI Types:1 ends here
-// [[file:../../atrip.org::*Static utilities][Static utilities:1]]
+// [[file:~/cuda/atrip/atrip.org::*Static%20utilities][Static utilities:1]]
 static
 PartialTuple subtupleBySlice(ABCTuple abc, Type sliceType) {
  switch (sliceType) {
@ -246,7 +233,7 @@ PartialTuple subtupleBySlice(ABCTuple abc, Type sliceType) {
 }
 // Static utilities:1 ends here
-// [[file:../../atrip.org::*Static utilities][Static utilities:2]]
+// [[file:~/cuda/atrip/atrip.org::*Static%20utilities][Static utilities:2]]
 static std::vector<Slice<F>*> hasRecycledReferencingToIt
  ( std::vector<Slice<F>> &slices
  , Info const& info
@ -263,7 +250,7 @@ static std::vector<Slice<F>*> hasRecycledReferencingToIt
 }
 // Static utilities:2 ends here
-// [[file:../../atrip.org::*Static utilities][Static utilities:3]]
+// [[file:~/cuda/atrip/atrip.org::*Static%20utilities][Static utilities:3]]
 static Slice<F>& findOneByType(std::vector<Slice<F>> &slices, Slice<F>::Type type) {
    const auto sliceIt
      = std::find_if(slices.begin(), slices.end(),
@ -279,7 +266,7 @@ static Slice<F>& findOneByType(std::vector<Slice<F>> &slices, Slice<F>::Type typ
 }
 // Static utilities:3 ends here
-// [[file:../../atrip.org::*Static utilities][Static utilities:4]]
+// [[file:~/cuda/atrip/atrip.org::*Static%20utilities][Static utilities:4]]
 static Slice<F>&
 findRecycledSource (std::vector<Slice<F>> &slices, Slice<F>::Info info) {
  const auto sliceIt
@ -305,7 +292,7 @@ findRecycledSource (std::vector<Slice<F>> &slices, Slice<F>::Info info) {
 }
 // Static utilities:4 ends here
-// [[file:../../atrip.org::*Static utilities][Static utilities:5]]
+// [[file:~/cuda/atrip/atrip.org::*Static%20utilities][Static utilities:5]]
 static Slice<F>& findByTypeAbc
  ( std::vector<Slice<F>> &slices
  , Slice<F>::Type type
@ -335,7 +322,7 @@ static Slice<F>& findByTypeAbc
 }
 // Static utilities:5 ends here
-// [[file:../../atrip.org::*Static utilities][Static utilities:6]]
+// [[file:~/cuda/atrip/atrip.org::*Static%20utilities][Static utilities:6]]
 static Slice<F>& findByInfo(std::vector<Slice<F>> &slices,
                         Slice<F>::Info const& info) {
  const auto sliceIt
@ -358,30 +345,33 @@ static Slice<F>& findByInfo(std::vector<Slice<F>> &slices,
 }
 // Static utilities:6 ends here
-// [[file:../../atrip.org::*Attributes][Attributes:1]]
+// [[file:~/cuda/atrip/atrip.org::*Attributes][Attributes:1]]
 Info info;
 // Attributes:1 ends here
-// [[file:../../atrip.org::*Attributes][Attributes:2]]
+// [[file:~/cuda/atrip/atrip.org::*Attributes][Attributes:2]]
-F  *data;
+DataPtr<F> data;
 #if defined(HAVE_CUDA)
    F* mpi_data;
 #endif
 // Attributes:2 ends here
-// [[file:../../atrip.org::*Attributes][Attributes:3]]
+// [[file:~/cuda/atrip/atrip.org::*Attributes][Attributes:3]]
 MPI_Request request;
 // Attributes:3 ends here
-// [[file:../../atrip.org::*Attributes][Attributes:4]]
+// [[file:~/cuda/atrip/atrip.org::*Attributes][Attributes:4]]
 const size_t size;
 // Attributes:4 ends here
-// [[file:../../atrip.org::*Member functions][Member functions:1]]
+// [[file:~/cuda/atrip/atrip.org::*Member%20functions][Member functions:1]]
 void markReady() noexcept {
  info.state = Ready;
  info.recycling = Blank;
 }
 // Member functions:1 ends here
-// [[file:../../atrip.org::*Member functions][Member functions:2]]
+// [[file:~/cuda/atrip/atrip.org::*Member%20functions][Member functions:2]]
 bool isUnwrapped() const noexcept {
  return info.state == Ready
      || info.state == SelfSufficient
@ -389,7 +379,7 @@ bool isUnwrapped() const noexcept {
 }
 // Member functions:2 ends here
-// [[file:../../atrip.org::*Member functions][Member functions:3]]
+// [[file:~/cuda/atrip/atrip.org::*Member%20functions][Member functions:3]]
 bool isUnwrappable() const noexcept {
  return isUnwrapped()
      || info.state == Recycled
@ -407,7 +397,7 @@ void free() noexcept {
  info.state      = Acceptor;
  info.from       = {0, 0};
  info.recycling  = Blank;
-  data            = nullptr;
+  data            = DataNullPtr;
 }
 inline bool isFree() const noexcept {
@ -417,12 +407,12 @@ inline bool isFree() const noexcept {
      && info.from.rank   == 0
      && info.from.source == 0
      && info.recycling   == Blank
-      && data             == nullptr
+      && data             == DataNullPtr
       ;
 }
 // Member functions:3 ends here
-// [[file:../../atrip.org::*Member functions][Member functions:4]]
+// [[file:~/cuda/atrip/atrip.org::*Member%20functions][Member functions:4]]
 inline bool isRecyclable() const noexcept {
  return (  info.state == Dispatched
         || info.state == Ready
@ -433,16 +423,16 @@ inline bool isRecyclable() const noexcept {
 }
 // Member functions:4 ends here
-// [[file:../../atrip.org::*Member functions][Member functions:5]]
+// [[file:~/cuda/atrip/atrip.org::*Member%20functions][Member functions:5]]
 inline bool hasValidDataPointer() const noexcept {
-  return data       != nullptr
+  return data       != DataNullPtr
      && info.state != Acceptor
      && info.type  != Blank
      ;
 }
 // Member functions:5 ends here
-// [[file:../../atrip.org::*Member functions][Member functions:6]]
+// [[file:~/cuda/atrip/atrip.org::*Member%20functions][Member functions:6]]
 void unwrapAndMarkReady() {
      if (info.state == Ready) return;
      if (info.state != Dispatched)
@ -454,9 +444,17 @@ void unwrapAndMarkReady() {
        WITH_RANK << "__slice__:mpi: waiting " << "\n";
 #endif
      const int errorCode = MPI_Wait(&request, &status);
      if (MPI_SUCCESS != MPI_Request_free(&request))
        throw "Error freeing MPI request";
      if (errorCode != MPI_SUCCESS)
        throw "MPI ERROR HAPPENED....";
 #if defined(HAVE_CUDA)
      // copy the retrieved mpi data to the device
      cuMemcpyHtoD(data, (void*)mpi_data, sizeof(F) * size);
      std::free(mpi_data);
 #endif
 #ifdef HAVE_OCD
      char errorString[MPI_MAX_ERROR_STRING];
      int errorSize;
@ -474,18 +472,21 @@ void unwrapAndMarkReady() {
    }
 // Member functions:6 ends here
-// [[file:../../atrip.org::*Epilog][Epilog:1]]
+// [[file:~/cuda/atrip/atrip.org::*Epilog][Epilog:1]]
 Slice(size_t size_)
      : info({})
-    , data(nullptr)
+      , data(DataNullPtr)
 #if defined(HAVE_CUDA)
      , mpi_data(nullptr)
 #endif
      , size(size_)
      {}
-}; // struct Slice
+  }; // struct Slice
 // Epilog:1 ends here
-// [[file:../../atrip.org::*Debug][Debug:1]]
+// [[file:~/cuda/atrip/atrip.org::*Debug][Debug:1]]
 template <typename F=double>
 std::ostream& operator<<(std::ostream& out, typename Slice<F>::Location const& v) {
  // TODO: remove me
--- a/include/atrip/SliceUnion.hpp
+++ b/include/atrip/SliceUnion.hpp
@ -12,16 +12,19 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-// [[file:../../atrip.org::*The slice union][The slice union:1]]
+// [[file:~/cuda/atrip/atrip.org::*Prolog][Prolog:1]]
 #pragma once
 #include <atrip/Debug.hpp>
 #include <atrip/Slice.hpp>
 #include <atrip/RankMap.hpp>
 namespace atrip {
 // Prolog:1 ends here
-  template <typename F=double>
+// [[file:~/cuda/atrip/atrip.org::*Prolog][Prolog:2]]
-  struct SliceUnion {
+template <typename F=double>
  class SliceUnion {
  public:
    using Tensor = CTF::Tensor<F>;
    virtual void
@ -191,7 +194,11 @@ namespace atrip {
                           : Slice<F>::Fetch
                           ;
          if (blank.info.state == Slice<F>::SelfSufficient) {
 #if defined(HAVE_CUDA)
            blank.mpi_data = sources[from.source].data();
 #else
            blank.data = sources[from.source].data();
 #endif
          } else {
            if (freePointers.size() == 0) {
              std::stringstream stream;
@ -345,8 +352,7 @@ namespace atrip {
    }
    // CONSTRUCTOR
-    SliceUnion( Tensor const& sourceTensor
+    SliceUnion( std::vector<typename Slice<F>::Type> sliceTypes_
              , std::vector<typename Slice<F>::Type> sliceTypes_
              , std::vector<size_t> sliceLength_
              , std::vector<size_t> paramLength
              , size_t np
@ -366,12 +372,19 @@ namespace atrip {
                                           1UL, std::multiplies<size_t>())))
              , name(name_)
              , sliceTypes(sliceTypes_)
-              , sliceBuffers(nSliceBuffers, sources[0])
+              , sliceBuffers(nSliceBuffers)
              //, slices(2 * sliceTypes.size(), Slice<F>{ sources[0].size() })
    { // constructor begin
      LOG(0,"Atrip") << "INIT SliceUnion: " << name << "\n";
      for (auto& ptr: sliceBuffers)
 #if defined(HAVE_CUDA)
        cuMemAlloc(&ptr, sizeof(F) * sources[0].size());
 #else
        ptr = (DataPtr<F>)malloc(sizeof(F) * sources[0].size());
 #endif
      slices
        = std::vector<Slice<F>>(2 * sliceTypes.size(), { sources[0].size() });
      // TODO: think exactly    ^------------------- about this number
@ -379,7 +392,7 @@ namespace atrip {
      // initialize the freePointers with the pointers to the buffers
      std::transform(sliceBuffers.begin(), sliceBuffers.end(),
                     std::inserter(freePointers, freePointers.begin()),
-                     [](std::vector<F> &vec) { return vec.data(); });
+                     [](DataPtr<F> ptr) { return ptr; });
@ -397,8 +410,6 @@ namespace atrip {
                           << freePointers.size() << "\n";
      LOG(1,"Atrip") << "#sliceBuffers "
                           << sliceBuffers.size() << "\n";
      LOG(1,"Atrip") << "#sliceBuffers[0] "
                           << sliceBuffers[0].size() << "\n";
      LOG(1,"Atrip") << "#sliceLength "
                           << sliceLength.size() << "\n";
      LOG(1,"Atrip") << "#paramLength "
@ -477,9 +488,12 @@ namespace atrip {
      if (slice.info.state == Slice<F>::Fetch) {
        // TODO: do it through the slice class
        slice.info.state = Slice<F>::Dispatched;
-        MPI_Request request;
+#if defined(HAVE_CUDA)
-        slice.request = request;
+        slice.mpi_data = (F*)malloc(sizeof(F) * slice.size);
        MPI_Irecv( slice.mpi_data
 #else
        MPI_Irecv( slice.data
 #endif
                 , slice.size
                 , traits::mpi::datatypeOf<F>()
                 , info.from.rank
@ -495,7 +509,7 @@ namespace atrip {
      for (auto type: sliceTypes) unwrapSlice(type, abc);
    }
-    F* unwrapSlice(typename Slice<F>::Type type, ABCTuple const& abc) {
+    DataPtr<F> unwrapSlice(typename Slice<F>::Type type, ABCTuple const& abc) {
      WITH_CRAZY_DEBUG
      WITH_RANK << "__unwrap__:slice " << type << " w n "
                << name
@ -539,6 +553,15 @@ namespace atrip {
      return slice.data;
    }
    ~SliceUnion() {
      for (auto& ptr: sliceBuffers)
 #if defined(HAVE_CUDA)
        cuMemFree(ptr);
 #else
        std::free(ptr);
 #endif
    }
    const RankMap<F> rankMap;
    const MPI_Comm world;
    const MPI_Comm universe;
@ -547,8 +570,8 @@ namespace atrip {
    std::vector< Slice<F> > slices;
    typename Slice<F>::Name name;
    const std::vector<typename Slice<F>::Type> sliceTypes;
-    std::vector< std::vector<F> > sliceBuffers;
+    std::vector< DataPtr<F> > sliceBuffers;
-    std::set<F*> freePointers;
+    std::set< DataPtr<F> > freePointers;
  };
@ -568,6 +591,8 @@ namespace atrip {
      }
      return **sliceUnionIt;
  }
 // Prolog:2 ends here
 // [[file:~/cuda/atrip/atrip.org::*Epilog][Epilog:1]]
 }
-// The slice union:1 ends here
+// Epilog:1 ends here
--- a/include/atrip/Tuples.hpp
+++ b/include/atrip/Tuples.hpp
@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-// [[file:../../atrip.org::*Prolog][Prolog:1]]
+// [[file:~/cuda/atrip/atrip.org::*Prolog][Prolog:1]]
 #pragma once
 #include <vector>
@ -35,7 +35,7 @@
 namespace atrip {
 // Prolog:1 ends here
-// [[file:../../atrip.org::*Tuples types][Tuples types:1]]
+// [[file:~/cuda/atrip/atrip.org::*Tuples%20types][Tuples types:1]]
 using ABCTuple = std::array<size_t, 3>;
 using PartialTuple = std::array<size_t, 2>;
 using ABCTuples = std::vector<ABCTuple>;
@ -44,23 +44,22 @@ constexpr ABCTuple FAKE_TUPLE = {0, 0, 0};
 constexpr ABCTuple INVALID_TUPLE = {1, 1, 1};
 // Tuples types:1 ends here
-// [[file:../../atrip.org::*Distributing the tuples][Distributing the tuples:1]]
+// [[file:~/cuda/atrip/atrip.org::*Distributing%20the%20tuples][Distributing the tuples:1]]
 struct TuplesDistribution {
  virtual ABCTuples getTuples(size_t Nv, MPI_Comm universe) = 0;
  virtual bool tupleIsFake(ABCTuple const& t) { return t == FAKE_TUPLE; }
 };
 // Distributing the tuples:1 ends here
-// [[file:../../atrip.org::*Node information][Node information:1]]
+// [[file:~/cuda/atrip/atrip.org::*Node%20information][Node information:1]]
 std::vector<std::string> getNodeNames(MPI_Comm comm){
  int rank, np;
  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &np);
  std::vector<std::string> nodeList(np);
-  char nodeName[MPI_MAX_PROCESSOR_NAME]
+  char nodeName[MPI_MAX_PROCESSOR_NAME];
-     , nodeNames[np*MPI_MAX_PROCESSOR_NAME]
+  char *nodeNames = (char*)malloc(np * MPI_MAX_PROCESSOR_NAME);
     ;
  std::vector<int> nameLengths(np)
                 , off(np)
                 ;
@ -87,11 +86,12 @@ std::vector<std::string> getNodeNames(MPI_Comm comm){
    std::string const s(&nodeNames[off[i]], nameLengths[i]);
    nodeList[i] = s;
  }
  std::free(nodeNames);
  return nodeList;
 }
 // Node information:1 ends here
-// [[file:../../atrip.org::*Node information][Node information:2]]
+// [[file:~/cuda/atrip/atrip.org::*Node%20information][Node information:2]]
 struct RankInfo {
  const std::string name;
  const size_t nodeId;
@ -154,7 +154,7 @@ getClusterInfo(MPI_Comm comm) {
 }
 // Node information:2 ends here
-// [[file:../../atrip.org::*Naive list][Naive list:1]]
+// [[file:~/cuda/atrip/atrip.org::*Naive%20list][Naive list:1]]
 ABCTuples getTuplesList(size_t Nv, size_t rank, size_t np) {
  const size_t
@ -188,7 +188,7 @@ ABCTuples getTuplesList(size_t Nv, size_t rank, size_t np) {
 }
 // Naive list:1 ends here
-// [[file:../../atrip.org::*Naive list][Naive list:2]]
+// [[file:~/cuda/atrip/atrip.org::*Naive%20list][Naive list:2]]
 ABCTuples getAllTuplesList(const size_t Nv) {
  const size_t n = Nv * (Nv + 1) * (Nv + 2) / 6 - Nv;
  ABCTuples result(n);
@ -204,7 +204,7 @@ ABCTuples getAllTuplesList(const size_t Nv) {
 }
 // Naive list:2 ends here
-// [[file:../../atrip.org::*Naive list][Naive list:3]]
+// [[file:~/cuda/atrip/atrip.org::*Naive%20list][Naive list:3]]
 struct NaiveDistribution : public TuplesDistribution {
  ABCTuples getTuples(size_t Nv, MPI_Comm universe) override {
    int rank, np;
@ -215,11 +215,11 @@ struct NaiveDistribution : public TuplesDistribution {
 };
 // Naive list:3 ends here
-// [[file:../../atrip.org::*Prolog][Prolog:1]]
+// [[file:~/cuda/atrip/atrip.org::*Prolog][Prolog:1]]
 namespace group_and_sort {
 // Prolog:1 ends here
-// [[file:../../atrip.org::*Utils][Utils:1]]
+// [[file:~/cuda/atrip/atrip.org::*Utils][Utils:1]]
 // Provides the node on which the slice-element is found
 // Right now we distribute the slices in a round robin fashion
 // over the different nodes (NOTE: not mpi ranks but nodes)
@ -244,7 +244,7 @@ struct Info {
 };
 // Utils:1 ends here
-// [[file:../../atrip.org::*Distribution][Distribution:1]]
+// [[file:~/cuda/atrip/atrip.org::*Distribution][Distribution:1]]
 ABCTuples specialDistribution(Info const& info, ABCTuples const& allTuples) {
  ABCTuples nodeTuples;
@ -426,7 +426,7 @@ ABCTuples specialDistribution(Info const& info, ABCTuples const& allTuples) {
 }
 // Distribution:1 ends here
-// [[file:../../atrip.org::*Main][Main:1]]
+// [[file:~/cuda/atrip/atrip.org::*Main][Main:1]]
 std::vector<ABCTuple> main(MPI_Comm universe, size_t Nv) {
  int rank, np;
@ -466,7 +466,7 @@ std::vector<ABCTuple> main(MPI_Comm universe, size_t Nv) {
  MPI_Comm_split(universe, color, key, &INTRA_COMM);
 // Main:1 ends here
-// [[file:../../atrip.org::*Main][Main:2]]
+// [[file:~/cuda/atrip/atrip.org::*Main][Main:2]]
 size_t const
  tuplesPerRankLocal
     = nodeTuples.size() / nodeInfos[rank].ranksPerNode
@ -494,7 +494,7 @@ LOG(1,"Atrip") << "ranks per node " << nodeInfos[rank].ranksPerNode << "\n";
 LOG(1,"Atrip") << "#nodes " << nNodes << "\n";
 // Main:2 ends here
-// [[file:../../atrip.org::*Main][Main:3]]
+// [[file:~/cuda/atrip/atrip.org::*Main][Main:3]]
 size_t const totalTuples
  = tuplesPerRankGlobal * nodeInfos[rank].ranksPerNode;
@ -506,7 +506,7 @@ if (computeDistribution) {
 }
 // Main:3 ends here
-// [[file:../../atrip.org::*Main][Main:4]]
+// [[file:~/cuda/atrip/atrip.org::*Main][Main:4]]
 {
  // construct mpi type for abctuple
  MPI_Datatype MPI_ABCTUPLE;
@ -530,13 +530,13 @@ if (computeDistribution) {
 }
 // Main:4 ends here
-// [[file:../../atrip.org::*Main][Main:5]]
+// [[file:~/cuda/atrip/atrip.org::*Main][Main:5]]
 return result;
 }
 // Main:5 ends here
-// [[file:../../atrip.org::*Interface][Interface:1]]
+// [[file:~/cuda/atrip/atrip.org::*Interface][Interface:1]]
 struct Distribution : public TuplesDistribution {
  ABCTuples getTuples(size_t Nv, MPI_Comm universe) override {
    return main(universe, Nv);
@ -544,10 +544,10 @@ struct Distribution : public TuplesDistribution {
 };
 // Interface:1 ends here
-// [[file:../../atrip.org::*Epilog][Epilog:1]]
+// [[file:~/cuda/atrip/atrip.org::*Epilog][Epilog:1]]
 } // namespace group_and_sort
 // Epilog:1 ends here
-// [[file:../../atrip.org::*Epilog][Epilog:1]]
+// [[file:~/cuda/atrip/atrip.org::*Epilog][Epilog:1]]
 }
 // Epilog:1 ends here
--- a/include/atrip/Types.hpp
+++ b/include/atrip/Types.hpp
@ -0,0 +1,60 @@
 // Copyright 2022 Alejandro Gallo
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // [[file:~/cuda/atrip/atrip.org::*Data%20pointer][Data pointer:1]]
 #pragma once
 #include <atrip/Complex.hpp>
 #include <atrip/Atrip.hpp>
 namespace atrip {
 template <typename F>
 struct DataField;
 template <>
 struct DataField<double> {
  using type = double;
 };
 #if defined(HAVE_CUDA)
 template <typename F>
 using DataPtr = CUdeviceptr;
 #define DataNullPtr 0x00
 template <>
 struct DataField<Complex> {
  using type = cuDoubleComplex;
 };
 #else
 template <typename F>
 using DataPtr = F*;
 #define DataNullPtr nullptr
 template <>
 struct DataField<Complex> {
  using type = Complex;
 };
 #endif
 template <typename F>
 using DataFieldType = typename DataField<F>::type;
 }
 // Data pointer:1 ends here
--- a/include/atrip/Unions.hpp
+++ b/include/atrip/Unions.hpp
@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-// [[file:../../atrip.org::*Unions][Unions:1]]
+// [[file:~/cuda/atrip/atrip.org::*Unions][Unions:1]]
 #pragma once
 #include <atrip/SliceUnion.hpp>
@ -65,8 +65,7 @@ namespace atrip {
         , size_t np
         , MPI_Comm child_world
         , MPI_Comm global_world
-         ) : SliceUnion<F>( sourceTensor
+         ) : SliceUnion<F>( {Slice<F>::A, Slice<F>::B, Slice<F>::C}
                          , {Slice<F>::A, Slice<F>::B, Slice<F>::C}
                          , {Nv, No, No} // size of the slices
                          , {Nv}
                          , np
@ -103,8 +102,7 @@ namespace atrip {
        , size_t np
        , MPI_Comm child_world
        , MPI_Comm global_world
-        ) : SliceUnion<F>( sourceTensor
+        ) : SliceUnion<F>( {Slice<F>::A, Slice<F>::B, Slice<F>::C}
                         , {Slice<F>::A, Slice<F>::B, Slice<F>::C}
                         , {No, No, No} // size of the slices
                         , {Nv}         // size of the parametrization
                         , np
@ -138,8 +136,7 @@ namespace atrip {
        , size_t np
        , MPI_Comm child_world
        , MPI_Comm global_world
-        ) : SliceUnion<F>( sourceTensor
+        ) : SliceUnion<F>( { Slice<F>::AB, Slice<F>::BC, Slice<F>::AC
                         , { Slice<F>::AB, Slice<F>::BC, Slice<F>::AC
                           , Slice<F>::BA, Slice<F>::CB, Slice<F>::CA
                           }
                         , {Nv, No} // size of the slices
@ -179,8 +176,7 @@ namespace atrip {
        , size_t np
        , MPI_Comm child_world
        , MPI_Comm global_world
-        ) : SliceUnion<F>( sourceTensor
+        ) : SliceUnion<F>( {Slice<F>::AB, Slice<F>::BC, Slice<F>::AC}
                         , {Slice<F>::AB, Slice<F>::BC, Slice<F>::AC}
                         , {No, No} // size of the slices
                         , {Nv, Nv} // size of the parametrization
                         , np
@ -219,8 +215,7 @@ namespace atrip {
         , size_t np
         , MPI_Comm child_world
         , MPI_Comm global_world
-         ) : SliceUnion<F>( sourceTensor
+         ) : SliceUnion<F>( {Slice<F>::AB, Slice<F>::BC, Slice<F>::AC}
                          , {Slice<F>::AB, Slice<F>::BC, Slice<F>::AC}
                          , {No, No} // size of the slices
                          , {Nv, Nv} // size of the parametrization
                          , np
--- a/include/atrip/Utils.hpp
+++ b/include/atrip/Utils.hpp
@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-// [[file:../../atrip.org::*Prolog][Prolog:1]]
+// [[file:~/cuda/atrip/atrip.org::*Prolog][Prolog:1]]
 #pragma once
 #include <sstream>
 #include <string>
@ -29,11 +29,14 @@
 #include <atrip/Debug.hpp>
 namespace atrip {
 // Prolog:1 ends here
-// [[file:../../atrip.org::*Pretty printing][Pretty printing:1]]
+// [[file:~/cuda/atrip/atrip.org::*Pretty%20printing][Pretty printing:1]]
-template <typename T>
+#pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wunused-parameter"
  template <typename T>
  std::string pretty_print(T&& value) {
    std::stringstream stream;
 #if ATRIP_DEBUG > 2
@ -41,9 +44,10 @@ template <typename T>
 #endif
    return stream.str();
  }
 #pragma GCC diagnostic pop
 // Pretty printing:1 ends here
-// [[file:../../atrip.org::*Chrono][Chrono:1]]
+// [[file:~/cuda/atrip/atrip.org::*Chrono][Chrono:1]]
 #define WITH_CHRONO(__chrono_name, ...)         \
  Atrip::chrono[__chrono_name].start();         \
  __VA_ARGS__                                   \
@ -62,6 +66,6 @@ struct Timer {
 using Timings = std::map<std::string, Timer>;
 // Chrono:1 ends here
-// [[file:../../atrip.org::*Epilog][Epilog:1]]
+// [[file:~/cuda/atrip/atrip.org::*Epilog][Epilog:1]]
 }
 // Epilog:1 ends here
--- a/src/Makefile.am
+++ b/src/Makefile.am
@ -2,26 +2,25 @@ AUTOMAKE_OPTIONS = subdir-objects
 include $(top_srcdir)/atrip.mk
 AM_CXXFLAGS = $(CTF_CPPFLAGS) -fmax-errors=1
 AM_CPPFLAGS = $(CTF_CPPFLAGS)
 lib_LIBRARIES = libatrip.a
 libatrip_a_CPPFLAGS = -I$(top_srcdir)/include/
-libatrip_a_SOURCES = ./atrip/Atrip.cxx \
+libatrip_a_SOURCES = ./atrip/Blas.cxx
-                     ./atrip/Blas.cxx
+NVCC_FILES = ./atrip/Equations.cxx ./atrip/Complex.cxx ./atrip/Atrip.cxx
 if WITH_CUDA
 NVCC_FILES = ./atrip/Equations.cxx ./atrip/Complex.cxx
 NVCC_OBJS = $(patsubst %.cxx,%.nvcc.o,$(NVCC_FILES))
 libatrip_a_CPPFLAGS += $(CUDA_CXXFLAGS)
 libatrip_a_DEPENDENCIES = $(NVCC_OBJS)
 libatrip_a_LIBADD = $(NVCC_OBJS)
 %.nvcc.o: %.cxx
-	$(NVCC) -c -I../ $(CPPFLAGS) $(libatrip_a_CPPFLAGS) $< -o $@
+	$(NVCC) -c -x cu -I../ $(CPPFLAGS) $(CTF_CPPFLAGS) $(DEFS) $(libatrip_a_CPPFLAGS) $< -o $@
 #./atrip/Equations.o: ./atrip/Equations.cxx
 #	$(NVCC) -c -I../ $(CPPFLAGS) $(libatrip_a_CPPFLAGS) $< -o $@
 else
-libatrip_a_SOURCES += ./atrip/Equations.cxx
+libatrip_a_SOURCES += $(NVCC_FILES)
 endif
--- a/src/atrip/Atrip.cxx
+++ b/src/atrip/Atrip.cxx
@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-// [[file:../../atrip.org::*Main][Main:1]]
+// [[file:~/cuda/atrip/atrip.org::*Main][Main:1]]
 #include <iomanip>
 #include <atrip/Atrip.hpp>
@ -23,12 +23,24 @@
 #include <atrip/Checkpoint.hpp>
 using namespace atrip;
 #if defined(HAVE_CUDA)
 namespace atrip {
 namespace cuda {
 };
 };
 #endif
 template <typename F> bool RankMap<F>::RANK_ROUND_ROBIN;
 template bool RankMap<double>::RANK_ROUND_ROBIN;
 template bool RankMap<Complex>::RANK_ROUND_ROBIN;
-int Atrip::rank;
+size_t Atrip::rank;
-int Atrip::np;
+size_t Atrip::np;
 #if defined(HAVE_CUDA)
 typename Atrip::CudaContext Atrip::cuda;
 #endif
 MPI_Comm Atrip::communicator;
 Timings Atrip::chrono;
@ -40,15 +52,20 @@ void atrip::registerIterationDescriptor(IterationDescriptor d) {
 void Atrip::init(MPI_Comm world)  {
  Atrip::communicator = world;
-  MPI_Comm_rank(world, &Atrip::rank);
+  MPI_Comm_rank(world, (int*)&Atrip::rank);
-  MPI_Comm_size(world, &Atrip::np);
+  MPI_Comm_size(world, (int*)&Atrip::np);
 #if defined(HAVE_CUDA)
  Atrip::cuda.status = cublasCreate(&Atrip::cuda.handle);
 #endif
 }
 template <typename F>
 Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
-  const int np = Atrip::np;
+  const size_t np = Atrip::np;
-  const int rank = Atrip::rank;
+  const size_t rank = Atrip::rank;
  MPI_Comm universe = Atrip::communicator;
  const size_t No = in.ei->lens[0];
@ -58,18 +75,35 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
  LOG(0,"Atrip") << "np: " << np << "\n";
  // allocate the three scratches, see piecuch
  std::vector<F> Tijk(No*No*No) // doubles only (see piecuch)
               , Zijk(No*No*No) // singles + doubles (see piecuch)
  // we need local copies of the following tensors on every
  // rank
-               , epsi(No)
+  std::vector<F> _epsi(No)
-               , epsa(Nv)
+               , _epsa(Nv)
-               , Tai(No * Nv)
+               , _Tai(No * Nv)
               ;
-  in.ei->read_all(epsi.data());
+  in.ei->read_all(_epsi.data());
-  in.ea->read_all(epsa.data());
+  in.ea->read_all(_epsa.data());
-  in.Tph->read_all(Tai.data());
+  in.Tph->read_all(_Tai.data());
 #if defined(HAVE_CUDA)
  DataPtr<F> Tai, epsi, epsa;
  cuMemAlloc(&Tai, sizeof(F) * _Tai.size());
  cuMemAlloc(&epsi, sizeof(F) * _epsi.size());
  cuMemAlloc(&epsa, sizeof(F) * _epsa.size());
  cuMemcpyHtoD(Tai, (void*)_Tai.data(), sizeof(F) * _Tai.size());
  cuMemcpyHtoD(epsi,(void*)_epsi.data(), sizeof(F) * _epsi.size());
  cuMemcpyHtoD(epsa, (void*)_epsa.data(), sizeof(F) * _epsa.size());
  DataPtr<F> Tijk, Zijk;
  //TODO: free memory
  cuMemAlloc(&Tijk, sizeof(F) * No * No * No);
  cuMemAlloc(&Zijk, sizeof(F) * No * No * No);
 #else
  std::vector<F> &Tai = _Tai, &epsi = _epsi, &epsa = _epsa;
  std::vector<F> Tijk(No*No*No), Zijk(No*No*No);
 #endif
  RankMap<F>::RANK_ROUND_ROBIN = in.rankRoundRobin;
  if (RankMap<F>::RANK_ROUND_ROBIN) {
@ -139,7 +173,6 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
    )
  const size_t nIterations = tuplesList.size();
  {
    const size_t _all_tuples = Nv * (Nv + 1) * (Nv + 2) / 6 - Nv;
    LOG(0,"Atrip") << "#iterations: "
                  << nIterations
                  << "/"
@ -338,6 +371,9 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
    }
  }
  LOG(0, "AtripCUDA") <<  "Starting iterations\n";
  for ( size_t
          i = first_iteration,
          iteration = first_iteration + 1
@ -346,6 +382,8 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
      ) {
    Atrip::chrono["iterations"].start();
    LOG(0, "AtripCUDA") <<  "iteration " << i << "\n";
    // check overhead from chrono over all iterations
    WITH_CHRONO("start:stop", {})
@ -355,8 +393,10 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
      if (in.barrier) MPI_Barrier(universe);
    ))
    // write checkpoints
-    if (iteration % checkpoint_mod == 0) {
+    if (iteration % checkpoint_mod == 0 && false) {
        LOG(0, "AtripCUDA") <<  "checkpoints \n";
        double globalEnergy = 0;
        MPI_Reduce(&energy, &globalEnergy, 1, MPI_DOUBLE, MPI_SUM, 0, universe);
        Checkpoint out
@ -368,9 +408,10 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
             iteration - 1,
             in.rankRoundRobin};
        LOG(0, "Atrip") << "Writing checkpoint\n";
-        if (Atrip::rank == 0) write_checkpoint(out, in.checkpointPath);
+        //if (Atrip::rank == 0) write_checkpoint(out, in.checkpointPath);
    }
    LOG(0, "AtripCUDA") <<  "reporting \n";
    // write reporting
    if (iteration % iterationMod == 0 || iteration == iteration1Percent) {
@ -417,9 +458,11 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
                << "\n";
    )
    LOG(0, "AtripCUDA") <<  "first database " << i << "\n";
    // COMM FIRST DATABASE ================================================{{{1
    if (i == first_iteration) {
      LOG(0, "AtripCUDA") <<  "first database " << i << "\n";
      WITH_RANK << "__first__:first database ............ \n";
      const auto db = communicateDatabase(abc, universe);
      WITH_RANK << "__first__:first database communicated \n";
@ -432,6 +475,8 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
      MPI_Barrier(universe);
    }
    LOG(0, "AtripCUDA") <<  "next database" << i << "\n";
    // COMM NEXT DATABASE ================================================={{{1
    if (abcNext) {
      WITH_RANK << "__comm__:" << iteration << "th communicating database\n";
@ -447,6 +492,9 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
    // COMPUTE DOUBLES ===================================================={{{1
    OCD_Barrier(universe);
    if (!isFakeTuple(i)) {
      LOG(0, "AtripCUDA") <<  "computing doubles " << i << "\n";
      WITH_RANK << iteration << "-th doubles\n";
      WITH_CHRONO("oneshot-unwrap",
      WITH_CHRONO("unwrap",
@ -478,7 +526,11 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
                              , tabhh.unwrapSlice(Slice<F>::AC, abc)
                              , tabhh.unwrapSlice(Slice<F>::BC, abc)
                              // -- TIJK
 #if defined(HAVE_CUDA)
                              , (DataFieldType<F>*)Tijk
 #else
                              , Tijk.data()
 #endif
                              );
        WITH_RANK << iteration << "-th doubles done\n";
      ))
@ -493,16 +545,35 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
        abhh.unwrapAll(abc);
      )))
      WITH_CHRONO("reorder",
-        for (size_t I(0); I < Zijk.size(); I++) Zijk[I] = Tijk[I];
+        LOG(0, "AtripCUDA") <<  "reorder singles" << i << "\n";
        atrip::xcopy<F>(No*No*No,
 #if defined(HAVE_CUDA)
                        (DataFieldType<F>*)Tijk, 1,
                        (DataFieldType<F>*)Zijk, 1);
 #else
                        (DataFieldType<F>*)Tijk.data(), 1,
                        (DataFieldType<F>*)Zijk.data(), 1);
 #endif
      )
      WITH_CHRONO("singles",
-      singlesContribution<F>( No, Nv, abc
+      LOG(0, "AtripCUDA") <<  "doing singles" << i << "\n";
 #if defined(HAVE_CUDA)
      singlesContribution<F><<<1,1>>>( No, Nv, abc[0], abc[1], abc[2]
                            , (DataFieldType<F>*)Tai
 #else
      singlesContribution<F>( No, Nv, abc[0], abc[1], abc[2]
                            , Tai.data()
-                            , abhh.unwrapSlice(Slice<F>::AB, abc)
+#endif
-                            , abhh.unwrapSlice(Slice<F>::AC, abc)
+                            , (DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::AB, abc)
-                            , abhh.unwrapSlice(Slice<F>::BC, abc)
+                            , (DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::AC, abc)
                            , (DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::BC, abc)
 #if defined(HAVE_CUDA)
                            , (DataFieldType<F>*)Zijk);
 #else
                            , Zijk.data());
 #endif
      )
      LOG(0, "AtripCUDA") <<  "singles done" << i << "\n";
    }
@ -513,13 +584,17 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
      int distinct(0);
      if (abc[0] == abc[1]) distinct++;
      if (abc[1] == abc[2]) distinct--;
-      const F epsabc(epsa[abc[0]] + epsa[abc[1]] + epsa[abc[2]]);
+      const F epsabc(_epsa[abc[0]] + _epsa[abc[1]] + _epsa[abc[2]]);
      LOG(0, "AtripCUDA") <<  "doing energy " << i << "distinct " << distinct << "\n";
      WITH_CHRONO("energy",
 /*
    TODO: think about how to do this on the GPU in the best way possible
        if ( distinct == 0)
-          tupleEnergy = getEnergyDistinct<F>(epsabc, epsi, Tijk, Zijk);
+          tupleEnergy = getEnergyDistinct<F>(epsabc, No, (F*)epsi, (F*)Tijk, (F*)Zijk);
        else
-          tupleEnergy = getEnergySame<F>(epsabc, epsi, Tijk, Zijk);
+          tupleEnergy = getEnergySame<F>(epsabc, No, (F*)epsi, (F*)Tijk, (F*)Zijk);
 */
      )
 #if defined(HAVE_OCD) || defined(ATRIP_PRINT_TUPLES)
--- a/src/atrip/Blas.cxx
+++ b/src/atrip/Blas.cxx
@ -0,0 +1,141 @@
 // Copyright 2022 Alejandro Gallo
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // [[file:~/cuda/atrip/atrip.org::*Blas][Blas:2]]
 #include <atrip/Blas.hpp>
 #include <atrip/Atrip.hpp>
 #if defined(HAVE_CUDA)
 #  include <cstring>
  static inline
  cublasOperation_t char_to_cublasOperation(const char* trans) {
    if (strncmp("N", trans, 1) == 0)
      return CUBLAS_OP_N;
    else if (strncmp("T", trans, 1) == 0)
      return CUBLAS_OP_T;
    else
      return CUBLAS_OP_C;
  }
 #endif
 namespace atrip {
  template <>
  void xgemm<double>(const char *transa,
             const char *transb,
             const int *m,
             const int *n,
             const int *k,
             double *alpha,
             const typename DataField<double>::type *A,
             const int *lda,
             const typename DataField<double>::type *B,
             const int *ldb,
             double *beta,
             typename DataField<double>::type *C,
             const int *ldc) {
 #if defined(HAVE_CUDA)
    cublasDgemm(Atrip::cuda.handle,
                char_to_cublasOperation(transa),
                char_to_cublasOperation(transb),
                *m, *n, *k,
                alpha, A, *lda,
                B, *ldb, beta,
                C, *ldc);
 #else
    dgemm_(transa, transb,
           m, n, k,
           alpha, A, lda,
           B, ldb, beta,
           C, ldc);
 #endif
  }
  template <>
  void xgemm<Complex>(const char *transa,
             const char *transb,
             const int *m,
             const int *n,
             const int *k,
             Complex *alpha,
             const typename DataField<Complex>::type *A,
             const int *lda,
             const typename DataField<Complex>::type *B,
             const int *ldb,
             Complex *beta,
             typename DataField<Complex>::type *C,
             const int *ldc) {
 #if defined(HAVE_CUDA)
 #pragma warning HAVE_CUDA
    cuDoubleComplex
      cu_alpha = {std::real(*alpha), std::imag(*alpha)},
      cu_beta = {std::real(*beta), std::imag(*beta)};
    cublasZgemm(Atrip::cuda.handle,
                char_to_cublasOperation(transa),
                char_to_cublasOperation(transb),
                *m, *n, *k,
                &cu_alpha,
                A, *lda,
                B, *ldb,
                &cu_beta,
                C, *ldc);
 #else
    zgemm_(transa, transb,
           m, n, k,
           alpha, A, lda,
           B, ldb, beta,
           C, ldc);
 #endif
  }
  template <>
  void xcopy<double>(const int n,
                     const  DataFieldType<double>* x,
                     const int incx,
                     DataFieldType<double>* y,
                     const int incy) {
 #if defined(HAVE_CUDA)
    cublasDcopy(Atrip::cuda.handle,
                n,
                x, incx,
                y, incy);
 #else
    dcopy_(n, x, incx, y, incy);
 #endif
  }
  template <>
  void xcopy<Complex>(const int n,
                     const  DataFieldType<Complex>* x,
                     const int incx,
                     DataFieldType<Complex>* y,
                     const int incy) {
 #if defined(HAVE_CUDA)
    cublasZcopy(Atrip::cuda.handle,
                n,
                x, incx,
                y, incy);
 #else
    zcopy_(n, x, incx, y, incy);
 #endif
  }
 }
 // Blas:2 ends here
--- a/src/atrip/Complex.cxx
+++ b/src/atrip/Complex.cxx
@ -0,0 +1,40 @@
 // Copyright 2022 Alejandro Gallo
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // [[file:~/cuda/atrip/atrip.org::*Complex%20numbers][Complex numbers:2]]
 #include <atrip/Complex.hpp>
 #include <atrip/CUDA.hpp>
 namespace atrip {
  template <> double maybeConjugate(const double a) { return a; }
  template <> Complex maybeConjugate(const Complex a) { return std::conj(a); }
 #if defined(HAVE_CUDA)
 #endif
  namespace traits {
    template <typename F> bool isComplex() { return false; }
    template <> bool isComplex<double>() { return false; }
    template <> bool isComplex<Complex>() { return true; }
  namespace mpi {
    template <> MPI_Datatype datatypeOf<double>() { return MPI_DOUBLE; }
    template <> MPI_Datatype datatypeOf<Complex>() { return MPI_DOUBLE_COMPLEX; }
  }
  }
 }
 // Complex numbers:2 ends here
--- a/src/atrip/Equations.cxx
+++ b/src/atrip/Equations.cxx
@ -0,0 +1,720 @@
 // Copyright 2022 Alejandro Gallo
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // [[file:~/cuda/atrip/atrip.org::*Prolog][Prolog:2]]
 #include<atrip/Equations.hpp>
 #if defined(HAVE_CUDA)
 #include <cuda.h>
 #endif
 namespace atrip {
 // Prolog:2 ends here
 #ifdef HAVE_CUDA
 namespace cuda {
  // cuda kernels
  template <typename F>
  __global__
  void zeroing(F* a, size_t n) {
    F zero = {0};
    for (size_t i = 0; i < n; i++) {
      a[i] = zero;
    }
  }
  ////
  template <typename F>
  __device__
  F maybeConjugateScalar(const F a);
  template <>
  __device__
  double maybeConjugateScalar(const double a) { return a; }
  template <>
  __device__
  cuDoubleComplex
  maybeConjugateScalar(const cuDoubleComplex a) {
    return {a.x, -a.y};
  }
  template <typename F>
  __global__
  void maybeConjugate(F* to, F* from, size_t n) {
    for (size_t i = 0; i < n; ++i) {
      to[i] = maybeConjugateScalar<F>(from[i]);
    }
  }
  template <typename F>
  __global__
  void reorder(F* to, F* from, size_t size, size_t I, size_t J, size_t K) {
    size_t idx = 0;
    const size_t IDX = I + J*size + K*size*size;
    for (size_t k = 0; k < size; k++)
    for (size_t j = 0; j < size; j++)
    for (size_t i = 0; i < size; i++, idx++)
      to[idx] += from[IDX];
  }
  // I mean, really CUDA... really!?
  template <typename F>
  __device__
  F multiply(const F &a, const F &b);
  template <>
  __device__
  double multiply(const double &a, const double &b) { return a * b; }
  template <>
  __device__
  cuDoubleComplex multiply(const cuDoubleComplex &a, const cuDoubleComplex &b) {
   return
     {a.x * b.x - a.y * b.y,
      a.x * b.y + a.y * b.x};
  }
  template <typename F>
  __device__
  void sum_in_place(F* to, const F* b);
  template <>
  __device__
  void sum_in_place(double* to, const double *b) { *to += *b; }
  template <>
  __device__
  void sum_in_place(cuDoubleComplex* to, const cuDoubleComplex* b) {
   to->x += b->x;
   to->y += b->y;
  }
  __device__
  cuDoubleComplex& operator+=(cuDoubleComplex& lz, cuDoubleComplex const& rz) {
    lz.x += rz.x;
    lz.y += rz.y;
    return lz;
  }
 };
 #endif
 // [[file:~/cuda/atrip/atrip.org::*Energy][Energy:2]]
 template <typename F>
 double getEnergyDistinct
  ( F const epsabc
  , size_t const No
  , F* const epsi
  , F* const Tijk
  , F* const Zijk
  ) {
  constexpr size_t blockSize=16;
  F energy(0.);
  for (size_t kk=0; kk<No; kk+=blockSize){
    const size_t kend( std::min(No, kk+blockSize) );
    for (size_t jj(kk); jj<No; jj+=blockSize){
      const size_t jend( std::min( No, jj+blockSize) );
      for (size_t ii(jj); ii<No; ii+=blockSize){
        const size_t iend( std::min( No, ii+blockSize) );
        for (size_t k(kk); k < kend; k++){
          const F ek(epsi[k]);
          const size_t jstart = jj > k ? jj : k;
          for (size_t j(jstart); j < jend; j++){
            F const ej(epsi[j]);
            F const facjk = j == k ? F(0.5) : F(1.0);
            size_t istart = ii > j ? ii : j;
            for (size_t i(istart); i < iend; i++){
              const F
                  ei(epsi[i])
                , facij = i == j ? F(0.5) : F(1.0)
                , denominator(epsabc - ei - ej - ek)
                , U(Zijk[i + No*j + No*No*k])
                , V(Zijk[i + No*k + No*No*j])
                , W(Zijk[j + No*i + No*No*k])
                , X(Zijk[j + No*k + No*No*i])
                , Y(Zijk[k + No*i + No*No*j])
                , Z(Zijk[k + No*j + No*No*i])
                , A(maybeConjugate<F>(Tijk[i + No*j + No*No*k]))
                , B(maybeConjugate<F>(Tijk[i + No*k + No*No*j]))
                , C(maybeConjugate<F>(Tijk[j + No*i + No*No*k]))
                , D(maybeConjugate<F>(Tijk[j + No*k + No*No*i]))
                , E(maybeConjugate<F>(Tijk[k + No*i + No*No*j]))
                , _F(maybeConjugate<F>(Tijk[k + No*j + No*No*i]))
                , value
                  = 3.0 * ( A * U
                            + B * V
                            + C * W
                            + D * X
                            + E * Y
                            + _F * Z )
                 + ( ( U + X + Y )
                   - 2.0 * ( V + W + Z )
                   ) * ( A + D + E )
                 + ( ( V + W + Z )
                   - 2.0 * ( U + X + Y )
                   ) * ( B + C + _F )
                ;
              energy += 2.0 * value / denominator * facjk * facij;
            } // i
          } // j
        } // k
      } // ii
    } // jj
  } // kk
  return std::real(energy);
 }
 template <typename F>
 double getEnergySame
  ( F const epsabc
  , size_t const No
  , F* const epsi
  , F* const Tijk
  , F* const Zijk
  ) {
  constexpr size_t blockSize = 16;
  F energy = F(0.);
  for (size_t kk=0; kk<No; kk+=blockSize){
    const size_t kend( std::min( kk+blockSize, No) );
    for (size_t jj(kk); jj<No; jj+=blockSize){
      const size_t jend( std::min( jj+blockSize, No) );
      for (size_t ii(jj); ii<No; ii+=blockSize){
        const size_t iend( std::min( ii+blockSize, No) );
        for (size_t k(kk); k < kend; k++){
          const F ek(epsi[k]);
          const size_t jstart = jj > k ? jj : k;
          for(size_t j(jstart); j < jend; j++){
            const F facjk( j == k ? F(0.5) : F(1.0));
            const F ej(epsi[j]);
            const size_t istart = ii > j ? ii : j;
            for(size_t i(istart); i < iend; i++){
              const F
                ei(epsi[i])
              , facij ( i==j ? F(0.5) : F(1.0))
              , denominator(epsabc - ei - ej - ek)
              , U(Zijk[i + No*j + No*No*k])
              , V(Zijk[j + No*k + No*No*i])
              , W(Zijk[k + No*i + No*No*j])
              , A(maybeConjugate<F>(Tijk[i + No*j + No*No*k]))
              , B(maybeConjugate<F>(Tijk[j + No*k + No*No*i]))
              , C(maybeConjugate<F>(Tijk[k + No*i + No*No*j]))
              , value
                = F(3.0) * ( A * U
                           + B * V
                           + C * W
                           )
                - ( A + B + C ) * ( U + V + W )
              ;
              energy += F(2.0) * value / denominator * facjk * facij;
            } // i
          } // j
        } // k
      } // ii
    } // jj
  } // kk
  return std::real(energy);
 }
 // Energy:2 ends here
 // [[file:~/cuda/atrip/atrip.org::*Energy][Energy:3]]
 // instantiate double
 template
 double getEnergyDistinct
  ( double const epsabc
  , size_t const No
  , double* const epsi
  , double* const Tijk
  , double* const Zijk
  );
 template
 double getEnergySame
  ( double const epsabc
  , size_t const No
  , double* const epsi
  , double* const Tijk
  , double* const Zijk
  );
 // instantiate Complex
 template
 double getEnergyDistinct
  ( Complex const epsabc
  , size_t const No
  , Complex* const epsi
  , Complex* const Tijk
  , Complex* const Zijk
  );
 template
 double getEnergySame
  ( Complex const epsabc
  , size_t const No
  , Complex* const epsi
  , Complex* const Tijk
  , Complex* const Zijk
  );
 // Energy:3 ends here
 // [[file:~/cuda/atrip/atrip.org::*Singles%20contribution][Singles contribution:2]]
 template <typename F>
 #ifdef HAVE_CUDA
 __global__
 #endif
  void singlesContribution
    ( size_t No
    , size_t Nv
    , size_t a
    , size_t b
    , size_t c
    , DataFieldType<F>* const Tph
    , DataFieldType<F>* const VABij
    , DataFieldType<F>* const VACij
    , DataFieldType<F>* const VBCij
    , DataFieldType<F>* Zijk
    ) {
    const size_t NoNo = No*No;
    // TODO: change order of for loops
    for (size_t k = 0; k < No; k++)
    for (size_t i = 0; i < No; i++)
    for (size_t j = 0; j < No; j++) {
      const size_t ijk = i + j*No + k*No*No;
 #ifdef HAVE_CUDA
 #  define GO(__TPH, __VABIJ)                                          \
    {                                                                 \
      const DataFieldType<F> product                                  \
            = cuda::multiply<DataFieldType<F>>((__TPH), (__VABIJ));   \
      cuda::sum_in_place<DataFieldType<F>>(&Zijk[ijk], &product);     \
    }
 #else
 #  define GO(__TPH, __VABIJ) Zijk[ijk] += (__TPH) * (__VABIJ);
 #endif
      GO(Tph[ a + i * Nv ], VBCij[ j + k * No ])
      GO(Tph[ b + j * Nv ], VACij[ i + k * No ])
      GO(Tph[ c + k * Nv ], VABij[ i + j * No ])
 #undef GO
    } // for loop j
  }
 // instantiate
  template
 #ifdef HAVE_CUDA
  __global__
 #endif
  void singlesContribution<double>( size_t No
                                  , size_t Nv
                                  , size_t a
                                  , size_t b
                                  , size_t c
                                  , double* const Tph
                                  , double* const VABij
                                  , double* const VACij
                                  , double* const VBCij
                                  , double* Zijk
                                  );
  template
 #ifdef HAVE_CUDA
  __global__
 #endif
  void singlesContribution<Complex>( size_t No
                                   , size_t Nv
                                   , size_t a
                                   , size_t b
                                   , size_t c
                                   , DataFieldType<Complex>* const Tph
                                   , DataFieldType<Complex>* const VABij
                                   , DataFieldType<Complex>* const VACij
                                   , DataFieldType<Complex>* const VBCij
                                   , DataFieldType<Complex>* Zijk
                                   );
 // Singles contribution:2 ends here
 // [[file:~/cuda/atrip/atrip.org::*Doubles%20contribution][Doubles contribution:2]]
  template <typename F>
  void doublesContribution
    ( const ABCTuple &abc
    , size_t const No
    , size_t const Nv
    // -- VABCI
    , DataPtr<F> const VABph
    , DataPtr<F> const VACph
    , DataPtr<F> const VBCph
    , DataPtr<F> const VBAph
    , DataPtr<F> const VCAph
    , DataPtr<F> const VCBph
    // -- VHHHA
    , DataPtr<F> const VhhhA
    , DataPtr<F> const VhhhB
    , DataPtr<F> const VhhhC
    // -- TA
    , DataPtr<F> const TAphh
    , DataPtr<F> const TBphh
    , DataPtr<F> const TCphh
    // -- TABIJ
    , DataPtr<F> const TABhh
    , DataPtr<F> const TAChh
    , DataPtr<F> const TBChh
    // -- TIJK
    // , DataPtr<F> Tijk_
    , DataFieldType<F>* Tijk_
    ) {
    const size_t a = abc[0], b = abc[1], c = abc[2]
              , NoNo = No*No, NoNv = No*Nv
              ;
    typename DataField<F>::type* Tijk = (typename DataField<F>::type*) Tijk_;
    LOG(0, "AtripCUDA") <<  "in doubles " << "\n";
 #if defined(ATRIP_USE_DGEMM)
 #define _IJK_(i, j, k) i + j*No + k*NoNo
 #if defined(HAVE_CUDA)
 // TODO
 #define REORDER(__II, __JJ, __KK)
 #define __TO_DEVICEPTR(_v) (_v)
 #define DGEMM_PARTICLES(__A, __B)                   \
  atrip::xgemm<F>("T",                              \
                  "N",                              \
                  (int const*)&NoNo,                \
                  (int const*)&No,                  \
                  (int const*)&Nv,                  \
                  &one,                             \
                  (DataFieldType<F>*)__A,           \
                  (int const*)&Nv,                  \
                  (DataFieldType<F>*)__B,           \
                  (int const*)&Nv,                  \
                  &zero,                            \
                  _t_buffer_p,                      \
                  (int const*)&NoNo);
 #define DGEMM_HOLES(__A, __B, __TRANSB)             \
  atrip::xgemm<F>("N",                              \
                  __TRANSB,                         \
                  (int const*)&NoNo,                \
                  (int const*)&No,                  \
                  (int const*)&No,                  \
                  &m_one,                           \
                  __TO_DEVICEPTR(__A),              \
                  (int const*)&NoNo,                \
                  (DataFieldType<F>*)__B,           \
                  (int const*)&No,                  \
                  &zero,                            \
                  _t_buffer_p,                      \
                  (int const*)&NoNo                 \
                  );
 #define MAYBE_CONJ(_conj, _buffer)                                      \
  cuda::maybeConjugate<<<1,1>>>((DataFieldType<F>*)_conj, (DataFieldType<F>*)_buffer, NoNoNo);
 #else
 #define REORDER(__II, __JJ, __KK)                                   \
    WITH_CHRONO("doubles:reorder",                                  \
    for (size_t k = 0; k < No; k++)                                 \
    for (size_t j = 0; j < No; j++)                                 \
    for (size_t i = 0; i < No; i++) {                               \
      Tijk[_IJK_(i, j, k)] += _t_buffer_p[_IJK_(__II, __JJ, __KK)];   \
    }                                                               \
    )
 #define __TO_DEVICEPTR(_v) (_v)
 #define DGEMM_PARTICLES(__A, __B)               \
  atrip::xgemm<F>("T",                          \
                  "N",                          \
                  (int const*)&NoNo,            \
                  (int const*)&No,              \
                  (int const*)&Nv,              \
                  &one,                         \
                  __A,                          \
                  (int const*)&Nv,              \
                  __B,                          \
                  (int const*)&Nv,              \
                  &zero,                        \
                  _t_buffer_p,                  \
                  (int const*)&NoNo             \
                  );
 #define DGEMM_HOLES(__A, __B, __TRANSB)         \
  atrip::xgemm<F>("N",                          \
                  __TRANSB,                     \
                  (int const*)&NoNo,            \
                  (int const*)&No,              \
                  (int const*)&No,              \
                  &m_one,                       \
                  __A,                          \
                  (int const*)&NoNo,            \
                  __B,                          \
                  (int const*)&No,              \
                  &zero,                        \
                  _t_buffer_p,                  \
                  (int const*)&NoNo             \
                  );
 #define MAYBE_CONJ(_conj, _buffer)                \
  for (size_t __i = 0; __i < NoNoNo; ++__i)       \
    _conj[__i] = maybeConjugate<F>(_buffer[__i]);
 #endif
    F one{1.0}, m_one{-1.0}, zero{0.0};
    DataFieldType<F> zero_h{0.0};
    const size_t NoNoNo = No*NoNo;
 #ifdef HAVE_CUDA
    DataFieldType<F>* _t_buffer;
    DataFieldType<F>* _vhhh;
    LOG(0, "AtripCUDA") <<  "getting memory" << "\n";
    cuMemAlloc((CUdeviceptr*)&_t_buffer, NoNoNo * sizeof(DataFieldType<F>));
    cuMemAlloc((CUdeviceptr*)&_vhhh, NoNoNo * sizeof(DataFieldType<F>));
    LOG(0, "AtripCUDA") <<  "cuda::zeroing " << "\n";
    cuda::zeroing<<<1,1>>>((DataFieldType<F>*)_t_buffer, NoNoNo);
    cuda::zeroing<<<1,1>>>((DataFieldType<F>*)_vhhh, NoNoNo);
 #else
    F* _t_buffer = (F*)malloc(NoNoNo * sizeof(F));
    F* _vhhh = (F*)malloc(NoNoNo * sizeof(F));
    for (size_t i=0; i < NoNoNo; i++) {
      _t_buffer[i] = zero_h;
      _vhhh[i] = zero_h;
    }
 #endif
    //_t_buffer.reserve(NoNoNo);
    DataFieldType<F>* _t_buffer_p = __TO_DEVICEPTR(_t_buffer);
 #ifdef HAVE_CUDA
    LOG(0, "AtripCUDA") <<  "cuda::zeroing Tijk" << "\n";
    cuda::zeroing<<<1,1>>>((DataFieldType<F>*)Tijk, NoNoNo);
 #else
    WITH_CHRONO("double:reorder",
      for (size_t k = 0; k < NoNoNo; k++) {
        Tijk[k] = DataFieldType<F>{0.0};
       })
 #endif
    LOG(0, "AtripCUDA") <<  "doing holes" << "\n";
    // TOMERGE: replace chronos
    WITH_CHRONO("doubles:holes",
      { // Holes part %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
        // VhhhC[i + k*No + L*NoNo] * TABhh[L + j*No]; H1
        LOG(0, "AtripCUDA") <<  "conj 1" << "\n";
        MAYBE_CONJ(_vhhh, VhhhC)
        LOG(0, "AtripCUDA") <<  "done" << "\n";
        WITH_CHRONO("doubles:holes:1",
          LOG(0, "AtripCUDA") <<  "dgemm 1" << "\n";
          DGEMM_HOLES(_vhhh, TABhh, "N")
          LOG(0, "AtripCUDA") <<  "reorder 1" << "\n";
          REORDER(i, k, j)
        )
        // VhhhC[j + k*No + L*NoNo] * TABhh[i + L*No]; H0
        WITH_CHRONO("doubles:holes:2",
          LOG(0, "AtripCUDA") <<  "dgemm 2" << "\n";
          DGEMM_HOLES(_vhhh, TABhh, "T")
          REORDER(j, k, i)
        )
        // VhhhB[i + j*No + L*NoNo] * TAChh[L + k*No]; H5
        LOG(0, "AtripCUDA") <<  "conj 2" << "\n";
        MAYBE_CONJ(_vhhh, VhhhB)
        LOG(0, "AtripCUDA") <<  "done" << "\n";
        WITH_CHRONO("doubles:holes:3",
          DGEMM_HOLES(_vhhh, TAChh, "N")
          REORDER(i, j, k)
        )
        // VhhhB[k + j*No + L*NoNo] * TAChh[i + L*No]; H3
        WITH_CHRONO("doubles:holes:4",
          DGEMM_HOLES(_vhhh, TAChh, "T")
          REORDER(k, j, i)
        )
        // VhhhA[j + i*No + L*NoNo] * TBChh[L + k*No]; H1
        LOG(0, "AtripCUDA") <<  "conj 3" << "\n";
        MAYBE_CONJ(_vhhh, VhhhA)
        WITH_CHRONO("doubles:holes:5",
          DGEMM_HOLES(_vhhh, TBChh, "N")
          REORDER(j, i, k)
        )
        // VhhhA[k + i*No + L*NoNo] * TBChh[j + L*No]; H4
        WITH_CHRONO("doubles:holes:6",
          DGEMM_HOLES(_vhhh, TBChh, "T")
          REORDER(k, i, j)
        )
      }
    )
  #undef MAYBE_CONJ
    LOG(0, "AtripCUDA") <<  "doing particles" << "\n";
    WITH_CHRONO("doubles:particles",
      { // Particle part %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
        // TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv]; P0
        WITH_CHRONO("doubles:particles:1",
          DGEMM_PARTICLES(TAphh, VBCph)
          REORDER(i, j, k)
        )
        // TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv]; P3
        WITH_CHRONO("doubles:particles:2",
          DGEMM_PARTICLES(TAphh, VCBph)
          REORDER(i, k, j)
        )
        // TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv]; P5
        WITH_CHRONO("doubles:particles:3",
          DGEMM_PARTICLES(TCphh, VABph)
          REORDER(k, i, j)
        )
        // TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv]; P2
        WITH_CHRONO("doubles:particles:4",
          DGEMM_PARTICLES(TCphh, VBAph)
          REORDER(k, j, i)
        )
        // TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv]; P1
        WITH_CHRONO("doubles:particles:5",
          DGEMM_PARTICLES(TBphh, VACph)
          REORDER(j, i, k)
        )
        // TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv]; P4
        WITH_CHRONO("doubles:particles:6",
          DGEMM_PARTICLES(TBphh, VCAph)
          REORDER(j, k, i)
        )
      }
    )
    LOG(0, "AtripCUDA") <<  "particles done" << "\n";
  { // free resources
 #ifdef HAVE_CUDA
    LOG(0, "AtripCUDA") <<  "free mem" << "\n";
    cuMemFree((CUdeviceptr)_vhhh);
    cuMemFree((CUdeviceptr)_t_buffer);
    LOG(0, "AtripCUDA") <<  "free mem done" << "\n";
 #else
    free(_vhhh);
    free(_t_buffer);
 #endif
  }
  #undef REORDER
  #undef DGEMM_HOLES
  #undef DGEMM_PARTICLES
  #undef _IJK_
  #else
    for (size_t k = 0; k < No; k++)
    for (size_t j = 0; j < No; j++)
    for (size_t i = 0; i < No; i++){
      const size_t ijk = i + j*No + k*NoNo
                ,  jk = j + k*No
                ;
      Tijk[ijk] = 0.0; // :important
      // HOLE DIAGRAMS: TABHH and VHHHA
      for (size_t L = 0; L < No; L++){
        // t[abLj] * V[Lcik]        H1
        // t[baLi] * V[Lcjk]        H0      TODO: conjugate T for complex
        Tijk[ijk] -= TABhh[L + j*No] * VhhhC[i + k*No + L*NoNo];
        Tijk[ijk] -= TABhh[i + L*No] * VhhhC[j + k*No + L*NoNo];
        // t[acLk] * V[Lbij]        H5
        // t[caLi] * V[Lbkj]        H3
        Tijk[ijk] -= TAChh[L + k*No] * VhhhB[i + j*No + L*NoNo];
        Tijk[ijk] -= TAChh[i + L*No] * VhhhB[k + j*No + L*NoNo];
        // t[bcLk] * V[Laji]        H2
        // t[cbLj] * V[Laki]        H4
        Tijk[ijk] -= TBChh[L + k*No] * VhhhA[j + i*No + L*NoNo];
        Tijk[ijk] -= TBChh[j + L*No] * VhhhA[k + i*No + L*NoNo];
      }
      // PARTILCE DIAGRAMS: TAPHH and VABPH
      for (size_t E = 0; E < Nv; E++) {
        // t[aEij] * V[bcEk]        P0
        // t[aEik] * V[cbEj]        P3 // TODO: CHECK THIS ONE, I DONT KNOW
        Tijk[ijk] += TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv];
        Tijk[ijk] += TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv];
        // t[cEki] * V[abEj]        P5
        // t[cEkj] * V[baEi]        P2
        Tijk[ijk] += TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv];
        Tijk[ijk] += TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv];
        // t[bEji] * V[acEk]        P1
        // t[bEjk] * V[caEi]        P4
        Tijk[ijk] += TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv];
        Tijk[ijk] += TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv];
      }
    }
 #endif
  }
  // instantiate templates
  template
  void doublesContribution<double>
    ( const ABCTuple &abc
    , size_t const No
    , size_t const Nv
    // -- VABCI
    , DataPtr<double> const VABph
    , DataPtr<double> const VACph
    , DataPtr<double> const VBCph
    , DataPtr<double> const VBAph
    , DataPtr<double> const VCAph
    , DataPtr<double> const VCBph
    // -- VHHHA
    , DataPtr<double> const VhhhA
    , DataPtr<double> const VhhhB
    , DataPtr<double> const VhhhC
    // -- TA
    , DataPtr<double> const TAphh
    , DataPtr<double> const TBphh
    , DataPtr<double> const TCphh
    // -- TABIJ
    , DataPtr<double> const TABhh
    , DataPtr<double> const TAChh
    , DataPtr<double> const TBChh
    // -- TIJK
    , DataFieldType<double>* Tijk
    );
  template
  void doublesContribution<Complex>
    ( const ABCTuple &abc
    , size_t const No
    , size_t const Nv
    // -- VABCI
    , DataPtr<Complex> const VABph
    , DataPtr<Complex> const VACph
    , DataPtr<Complex> const VBCph
    , DataPtr<Complex> const VBAph
    , DataPtr<Complex> const VCAph
    , DataPtr<Complex> const VCBph
    // -- VHHHA
    , DataPtr<Complex> const VhhhA
    , DataPtr<Complex> const VhhhB
    , DataPtr<Complex> const VhhhC
    // -- TA
    , DataPtr<Complex> const TAphh
    , DataPtr<Complex> const TBphh
    , DataPtr<Complex> const TCphh
    // -- TABIJ
    , DataPtr<Complex> const TABhh
    , DataPtr<Complex> const TAChh
    , DataPtr<Complex> const TBChh
    // -- TIJK
    , DataFieldType<Complex>* Tijk
    );
 // Doubles contribution:2 ends here
 // [[file:~/cuda/atrip/atrip.org::*Epilog][Epilog:2]]
 }
 // Epilog:2 ends here