13 changed files with 150 additions and 811 deletions
--- a/bench/Makefile.am
+++ b/bench/Makefile.am
@ -1,4 +1,3 @@
 AUTOMAKE_OPTIONS = subdir-objects
 include $(top_srcdir)/atrip.mk
 AM_CPPFLAGS = -I$(top_srcdir)/include/ $(CTF_CPPFLAGS)
@ -20,13 +19,4 @@ endif
 if WITH_CUDA
 test_main_CXXFLAGS = $(CUDA_CXXFLAGS)
 test_main_LDADD += $(CUDA_LDFLAGS)
 AM_CXXFLAGS = $(CUDA_CXXFLAGS)
 AM_LDFLAGS += $(CUDA_LDFLAGS)
 bin_PROGRAMS += test-cublas-parallel-atrip
 test_cublas_parallel_SOURCES = test-cublas-parallel-atrip.cxx
 bin_PROGRAMS += test-cuda-sanity
 test_cuda_sanity_SOURCES = test-cuda-sanity.cxx
 endif
--- a/bench/test-cublas-parallel-atrip.cxx
+++ b/bench/test-cublas-parallel-atrip.cxx
@ -1,239 +0,0 @@
 #include <mpi.h>
 #include <iostream>
 #include <cassert>
 #include <string.h>
 #include <sstream>
 #include <string>
 #include <vector>
 #include <cuda.h>
 #include <chrono>
 #include <map>
 #include <cstdlib>
 #include <unistd.h>
 #include <CLI11.hpp>
 #define CUBLASAPI
 #include <cublas_api.h>
 #include <cublas_v2.h>
 struct Timer {
  using Clock = std::chrono::high_resolution_clock;
  using Event = std::chrono::time_point<Clock>;
  std::chrono::duration<double> duration;
  Event _start;
  inline void start() noexcept { _start = Clock::now(); }
  inline void stop() noexcept { duration += Clock::now() - _start; }
  inline void clear() noexcept { duration *= 0; }
  inline double count() const noexcept { return duration.count(); }
 };
 using Timings = std::map<std::string, Timer>;
 #define _FORMAT(_fmt, ...)                                    \
  ([&] (void) -> std::string {                                \
     int _sz = std::snprintf(nullptr, 0, _fmt, __VA_ARGS__);  \
     std::vector<char>  _out(_sz  +  1);                      \
     std::snprintf(&_out[0], _out.size(), _fmt, __VA_ARGS__); \
     return std::string(_out.data());                         \
   })()
 #define _CHECK_CUDA_SUCCESS(message, ...)                               \
  do {                                                                  \
    CUresult result = __VA_ARGS__;                                      \
    printf("doing %s\n", message);                                      \
    if (result != CUDA_SUCCESS) {                                       \
      printf("\t!!CUDA_ERROR(%d): %s:%d %s\n",                          \
             result,                                                    \
             __FILE__,                                                  \
             __LINE__,                                                  \
             message);                                                  \
      return 1;                                                         \
    }                                                                   \
  } while (0)
 #define _CHECK_CUBLAS_SUCCESS(message, ...)                             \
  do {                                                                  \
    cublasStatus_t result = __VA_ARGS__;                                \
    if (result != 0) {                                                  \
      printf("\t!!CUBLAS_ERROR(%d): %s:%d  %s\n",                       \
             result,                                                    \
             __FILE__,                                                  \
             __LINE__,                                                  \
             message);                                                  \
      return 1;                                                         \
    }                                                                   \
  } while (0)
 int main(int  argc, char** argv) {
  using std::vector;
  MPI_Init(NULL, NULL);
  int rank, np, ngcards;
  size_t no(10), nv(no * 10), its(2);
  bool barrier = false;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &np);
  CLI::App app{"Main bench for atrip"};
  app.add_option("--no", no, "Occupied orbitals");
  app.add_option("--nv", nv, "Virtual orbitals");
  app.add_option("--its", its, "Number of iterations to be done");
  app.add_option("--barrier", barrier, "Call a MPI_Barrier in every iteration?");
  CLI11_PARSE(app, argc, argv);
  const size_t oo = no * no, ooo = no * oo;
  Timings timings;
  MPI_Barrier(MPI_COMM_WORLD);
  _CHECK_CUDA_SUCCESS("init for cuda",
                      cuInit(0));
  _CHECK_CUDA_SUCCESS("get ncards",
                      cuDeviceGetCount(&ngcards));
  CUcontext ctx;
  CUdevice dev;
  char hostname[256];
  gethostname(hostname, 256);
  printf("%s with rank %d gets card %d\n",
         hostname,
         rank,
         rank % ngcards);
  // set contexts
  _CHECK_CUDA_SUCCESS("device get",       cuDeviceGet(&dev, rank % ngcards));
  _CHECK_CUDA_SUCCESS("creating context", cuCtxCreate(&ctx, 0, dev));
  _CHECK_CUDA_SUCCESS("setting context",  cuCtxSetCurrent(ctx));
  _CHECK_CUDA_SUCCESS("synchronizing",    cuCtxSynchronize());
  MPI_Barrier(MPI_COMM_WORLD);
  using host_slice_t = vector<double>;
  vector<size_t> sizes = {nv * oo,  nv * no , oo,      oo * no,  oo * no};
  vector<CUdeviceptr>     P_phh(3), P_ph(6) , H_hh(3), H_hhh(3), T_hhh(1);
  vector<vector<CUdeviceptr>*> slices_d = {&P_phh, &P_ph , &H_hh, &H_hhh, &T_hhh};
  vector<vector<host_slice_t>> slices_h(slices_d.size());
  {
    int i = -1;
    for (auto& v: slices_d) {
      i++;
      for (auto& ptr: *v) {
        _CHECK_CUDA_SUCCESS("malloc",
                            cuMemAlloc(&ptr,
                                       sizes[i] * sizeof(double)));
        slices_h[i].push_back(std::move(std::vector<double>(sizes[i])));
      }
    }
  }
  const double one = 1.0, zero = 0.0;
  printf("its: %d\n", its);
  printf("barrier: %d\n", barrier);
  printf("no: %ld\n", no);
  printf("nv: %ld\n", nv);
  printf("SIZE %f GB\n", (3 * nv * oo
                          + 6 * oo * nv
                          + 3 * oo
                          + 3 * ooo
                          + 1 * ooo
                          ) * sizeof(double) / 1024.0 / 1024.0 / 1024.0);
  std::map<std::string, double> tflopss
    {{ "dgemm",     ooo * (no + nv) * 6.0 * 2.0 * its / 1e12},
     { "holes",     ooo * no * 6.0 * 2.0 * its / 1e12},
     { "particles", ooo * nv * 6.0 * 2.0 * its / 1e12}};
  cublasHandle_t handle;
  _CHECK_CUBLAS_SUCCESS("handle create", cublasCreate(&handle));
  printf("handle %ld\n", handle);
  timings["dgemm"].start();
  for (size_t i = 0; i < its; i++) {
    if (barrier) {
      MPI_Barrier(MPI_COMM_WORLD);
      timings["memcpy"].start();
      for (size_t _s = 0; _s < slices_d.size(); _s++) {
        // for (size_t _b = 0; _b < slices_h[_s].size(); _b++) {
        for (size_t _b = 0; _b < 1 ; _b++) {
          auto device = (*slices_d[_s])[_b];
          auto host   = slices_h[_s][_b].data();
          cuMemcpyHtoD(device, host, sizes[_s]);
        }
      }
      timings["memcpy"].stop();
    }
    timings["holes"].start();
    for (size_t j = 0; j < 3; j++) {
      _CHECK_CUBLAS_SUCCESS(" > 'geming ...",
                            cublasDgemm(handle,
                                        CUBLAS_OP_N,
                                        CUBLAS_OP_N,
                                        oo, no, no,
                                        &one,
                                        (double*)H_hhh[j], oo,
                                        (double*)H_hh[j], no,
                                        &zero,
                                        (double*)T_hhh[0], oo));
      _CHECK_CUBLAS_SUCCESS(" > 'geming ...",
                            cublasDgemm(handle,
                                        CUBLAS_OP_N,
                                        CUBLAS_OP_T,
                                        oo, no, no,
                                        &one,
                                        (double*)H_hhh[j], oo,
                                        (double*)H_hh[j], no,
                                        &zero,
                                        (double*)T_hhh[0], oo));
    }
    timings["holes"].stop();
    timings["particles"].start();
    for (size_t j = 0; j < 6; j++) {
      _CHECK_CUBLAS_SUCCESS(" > 'geming ...",
                            cublasDgemm(handle,
                                        CUBLAS_OP_T,
                                        CUBLAS_OP_N,
                                        oo, no, nv,
                                        &one,
                                        (double*)P_phh[j % 3], nv,
                                        (double*)P_ph[j], nv,
                                        &zero,
                                        (double*)T_hhh[0], oo));
    }
    timings["particles"].stop();
    cuCtxSynchronize();
  }
  timings["dgemm"].stop();
  printf("Performance: \n");
  for (auto name: {"holes", "particles", "dgemm"})
    printf("%10s TFlops: %4.1f\n",
          name,
          tflopss[name]
          / timings[name].count());
  printf("Timings: \n");
  for (auto const& kv: timings)
    printf("%10s: %10f\n", kv.first.c_str(), kv.second.count());
  MPI_Finalize();
  return 0;
 }
--- a/bench/test-cublas-parallel.cxx
+++ b/bench/test-cublas-parallel.cxx
@ -1,163 +0,0 @@
 #include <mpi.h>
 #include <iostream>
 #include <cassert>
 #include <string.h>
 #include <sstream>
 #include <string>
 #include <vector>
 #include <cuda.h>
 #include <chrono>
 #include <map>
 #include <cstdlib>
 #include <unistd.h>
 #define CUBLASAPI
 #include <cublas_api.h>
 #include <cublas_v2.h>
 struct Timer {
  using Clock = std::chrono::high_resolution_clock;
  using Event = std::chrono::time_point<Clock>;
  std::chrono::duration<double> duration;
  Event _start;
  inline void start() noexcept { _start = Clock::now(); }
  inline void stop() noexcept { duration += Clock::now() - _start; }
  inline void clear() noexcept { duration *= 0; }
  inline double count() const noexcept { return duration.count(); }
 };
 using Timings = std::map<std::string, Timer>;
 #define _FORMAT(_fmt, ...)                                    \
  ([&] (void) -> std::string {                                \
     int _sz = std::snprintf(nullptr, 0, _fmt, __VA_ARGS__);  \
     std::vector<char>  _out(_sz  +  1);                      \
     std::snprintf(&_out[0], _out.size(), _fmt, __VA_ARGS__); \
     return std::string(_out.data());                         \
   })()
 #define _CHECK_CUDA_SUCCESS(message, ...)                               \
  do {                                                                  \
    CUresult result = __VA_ARGS__;                                      \
    printf("doing %s\n", message);                                      \
    if (result != CUDA_SUCCESS) {                                       \
      printf("\t!!CUDA_ERROR(%d): %s:%d %s\n",                          \
             result,                                                    \
             __FILE__,                                                  \
             __LINE__,                                                  \
             message);                                                  \
      return 1;                                                         \
    }                                                                   \
  } while (0)
 #define _CHECK_CUBLAS_SUCCESS(message, ...)                             \
  do {                                                                  \
    cublasStatus_t result = __VA_ARGS__;                                \
    printf("CUBLAS: doing %s\n", message);                              \
    if (result != 0) {                                                  \
      printf("\t!!CUBLAS_ERROR(%d): %s:%d  %s\n",                       \
             result,                                                    \
             __FILE__,                                                  \
             __LINE__,                                                  \
             message);                                                  \
      return 1;                                                         \
    }                                                                   \
  } while (0)
 int main(int  argc, char** argv) {
  MPI_Init(NULL, NULL);
  int rank, np, ngcards;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &np);
  Timings timings;
  MPI_Barrier(MPI_COMM_WORLD);
  _CHECK_CUDA_SUCCESS("init for cuda",
                      cuInit(0));
  _CHECK_CUDA_SUCCESS("get ncards",
                      cuDeviceGetCount(&ngcards));
  const size_t N
    = argc > 1
    ? atoi(argv[1])
    : 30000
    , dgemms
    = argc > 2
    ? atoi(argv[2])
    : 2
    , flops = 2 * N * N * N * dgemms
    ;
  CUcontext ctx;
  CUdevice dev;
  char hostname[256];
  gethostname(hostname, 256);
  printf("%s with rank %d gets card %d\n",
         hostname,
         rank,
         rank % ngcards);
  // set contexts
  _CHECK_CUDA_SUCCESS("device get",
                      cuDeviceGet(&dev, rank % ngcards));
  _CHECK_CUDA_SUCCESS("creating context",
                      cuCtxCreate(&ctx, 0, dev));
  _CHECK_CUDA_SUCCESS("setting context",
                      cuCtxSetCurrent(ctx));
  _CHECK_CUDA_SUCCESS("synchronizing",
                      cuCtxSynchronize());
  MPI_Barrier(MPI_COMM_WORLD);
  CUdeviceptr A, B, C;
  const double one = 1.0;
  printf("SIZE %f GB\n", N * N * sizeof(double) / 1024.0 / 1024.0 / 1024.0);
  _CHECK_CUDA_SUCCESS("A", cuMemAlloc(&A, N * N * sizeof(double)));
  _CHECK_CUDA_SUCCESS("B", cuMemAlloc(&B, N * N * sizeof(double)));
  _CHECK_CUDA_SUCCESS("C", cuMemAlloc(&C, N * N * sizeof(double)));
  cublasHandle_t handle;
  cublasStatus_t stat;
  _CHECK_CUBLAS_SUCCESS("handle create", cublasCreate(&handle));
  printf("handle %ld\n", handle);
  timings["dgemm"].start();
  for (size_t i = 0; i < dgemms; i++) {
    _CHECK_CUBLAS_SUCCESS(_FORMAT(" > 'geming %ld ...", i).c_str(),
                          cublasDgemm(handle,
                                      CUBLAS_OP_N,
                                      CUBLAS_OP_N,
                                      N, N, N,
                                      &one,
                                      (double*)A, N,
                                      (double*)B, N,
                                      &one,
                                      (double*)C, N));
  }
  cuCtxSynchronize();
  timings["dgemm"].stop();
  printf("dgemm Gflops: %f\n",
         flops
         / timings["dgemm"].count()
         / 1024.0
         / 1024.0
         / 1024.0);
  MPI_Finalize();
  return 0;
 }
 // Local Variables:
 // compile-command: "mpic++                         \
 //      -pedantic -std=c++11                        \
 //      -L./cudaroot/lib64 -lcuda                   \
 //      -L./cudaroot/lib64 -lcudart                 \
 //      -L./cudaroot/lib64 -lcublas                 \
 //      ./test-cublas-parallel.cxx -o test-cublas-parallel"
 // End:
--- a/bench/test-cuda-sanity.cxx
+++ b/bench/test-cuda-sanity.cxx
@ -1,104 +0,0 @@
 #include <mpi.h>
 #include <iostream>
 #include <cassert>
 #include <string.h>
 #include <sstream>
 #include <string>
 #include <vector>
 #include <cuda.h>
 #define _CHECK_CUDA_SUCCESS(message, ...)                               \
  do {                                                                  \
    CUresult result = __VA_ARGS__;                                      \
    printf("doing %s\n", message);                                      \
    if (result != CUDA_SUCCESS) {                                       \
      printf("\t!!CUDA_ERROR(%d): %s:%d %s\n",                          \
             result,                                                    \
             __FILE__,                                                  \
             __LINE__,                                                  \
             message);                                                  \
      return 1;                                                         \
    }                                                                   \
  } while (0)
 int main() {
  int rank, np, ngcards;
  MPI_Init(NULL, NULL);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &np);
  _CHECK_CUDA_SUCCESS("init for cuda",
                      cuInit(0));
  _CHECK_CUDA_SUCCESS("get ncards",
                      cuDeviceGetCount(&ngcards));
  for (size_t rank = 0; rank < ngcards; rank++) {
    CUcontext ctx;
    CUdevice dev;
    CUdevprop_st prop;
    size_t _free, total, total2;
    char *name = (char*)malloc(256);
    printf("Setting contexts\n");
    // set contexts
    _CHECK_CUDA_SUCCESS("device get",
                        cuDeviceGet(&dev, rank));
    _CHECK_CUDA_SUCCESS("creating context",
                        cuCtxCreate(&ctx, 0, dev));
    _CHECK_CUDA_SUCCESS("setting context",
                        cuCtxSetCurrent(ctx));
    _CHECK_CUDA_SUCCESS("synchronizing",
                        cuCtxSynchronize());
    _CHECK_CUDA_SUCCESS("prop get",
                        cuDeviceGetProperties(&prop, dev));
    _CHECK_CUDA_SUCCESS("meminfo get",
                        cuMemGetInfo(&_free, &total));
    _CHECK_CUDA_SUCCESS("name get",
                        cuDeviceGetName(name, 256, dev));
    _CHECK_CUDA_SUCCESS("totalmem get",
                        cuDeviceTotalMem(&total2, dev));
    printf("\n"
           "CUDA CARD RANK %d\n"
           "=================\n"
           "\tname: %s\n"
           "\tShared Mem Per Block (KB): %f\n"
           "\tFree/Total mem (GB): %f/%f\n"
           "\total2 mem (GB): %f\n"
           "\n",
           dev,
           name,
           prop.sharedMemPerBlock / 1024.0,
           _free / 1024.0 / 1024.0 / 1024.0 ,
           total / 1024.0 / 1024.0 / 1024.0 ,
           total2 / 1024.0 / 1024.0 / 1024.0
           );
    if (_free == 0 || total == 0 || total2 == 0)
      return 1;
    CUdeviceptr data;
    _CHECK_CUDA_SUCCESS("memalloc 1",
                        cuMemAlloc(&data, sizeof(double) * 10000));
    _CHECK_CUDA_SUCCESS("memalloc 2",
                        cuMemAlloc(&data, sizeof(double) * 10000));
  }
  MPI_Finalize();
  return 0;
 }
 // Local Variables:
 // compile-command: "mpic++                         \
 //      -pedantic -std=c++11                        \
 //      -L./cudaroot/lib64 -lcuda                   \
 //      -L./cudaroot/lib64 -lcudart                 \
 //      -L./cudaroot/lib64 -lcublas                 \
 //      ./mem.cxx -o mem"
 // End:
--- a/bench/test_main.cxx
+++ b/bench/test_main.cxx
@ -142,10 +142,8 @@ int main(int argc, char** argv) {
        + /* tai  */ f * nranks * no * nv
    ;
-  if (rank == 0) {
+  if (atrip::Atrip::rank == 0)
-    std::cout << "Tentative MEMORY USAGE (GB): "
+  std::cout << "Tentative MEMORY USAGE: " << atrip_memory << "\n";
 	      << double(atrip_memory) / 1024.0 / 1024.0 / 1024.0 << "\n";
  }
  std::vector<int> symmetries(4, NS)
--- a/bootstrap.sh
+++ b/bootstrap.sh
@ -1,7 +1,5 @@
 #!/usr/bin/env bash
 cd `git rev-parse --show-toplevel`
 type -a autoreconf > /dev/null ||
 {
        cat <<EOF && exit
--- a/include/atrip/CUDA.hpp
+++ b/include/atrip/CUDA.hpp
@ -1,13 +1,5 @@
 #pragma once
 #include <atrip/Utils.hpp>
 #if defined(HAVE_CUDA)
 #include <cuda.h>
 #endif
 #include <sstream>
 #if defined(HAVE_CUDA) && defined(__CUDACC__)
 #  define __MAYBE_GLOBAL__ __global__
 #  define __MAYBE_DEVICE__ __device__
@ -15,31 +7,3 @@
 #  define __MAYBE_GLOBAL__
 #  define __MAYBE_DEVICE__
 #endif
 #define _CHECK_CUDA_SUCCESS(message, ...)                               \
  do {                                                                  \
    CUresult result = __VA_ARGS__;                                      \
    if (result != CUDA_SUCCESS) {                                       \
      auto msg = _FORMAT("\t!!CUDA_ERROR(%d): %s:%d\v%s",               \
                         result,                                        \
                         __FILE__,                                      \
                         __LINE__,                                      \
                         message);                                      \
      std::cerr << msg;                                                 \
      throw msg;                                                        \
    }                                                                   \
  } while (0)
 #define _CHECK_CUBLAS_SUCCESS(message, ...)                             \
  do {                                                                  \
    cublasStatus_t result = __VA_ARGS__;                                      \
    if (result != 0) {                                                  \
      auto msg = _FORMAT("\t!!CUBLAS_ERROR(%d): %s:%d\v%s",             \
                         result,                                        \
                         __FILE__,                                      \
                         __LINE__,                                      \
                         message);                                      \
      std::cerr << msg;                                                 \
      throw msg;                                                        \
    }                                                                   \
  } while (0)
--- a/include/atrip/Slice.hpp
+++ b/include/atrip/Slice.hpp
@ -21,7 +21,6 @@
 #include <atrip/Tuples.hpp>
 #include <atrip/Utils.hpp>
 #include <atrip/CUDA.hpp>
 namespace atrip {
@ -459,11 +458,7 @@ void unwrapAndMarkReady() {
 #if defined(HAVE_CUDA)
      // copy the retrieved mpi data to the device
      WITH_CHRONO("cuda:memcpy",
-                  _CHECK_CUDA_SUCCESS("copying mpi data to device",
+                  cuMemcpyHtoD(data, (void*)mpi_data, sizeof(F) * size);)
                                      cuMemcpyHtoD(data,
                                                   (void*)mpi_data,
                                                   sizeof(F) * size));
                  )
      std::free(mpi_data);
 #endif
--- a/include/atrip/SliceUnion.hpp
+++ b/include/atrip/SliceUnion.hpp
@ -195,28 +195,7 @@ template <typename F=double>
                           ;
          if (blank.info.state == Slice<F>::SelfSufficient) {
 #if defined(HAVE_CUDA)
-            const size_t _size = sizeof(F) * sources[from.source].size();
+            blank.mpi_data = sources[from.source].data();
            // TODO: this is code duplication with downstairs
            if (freePointers.size() == 0) {
              std::stringstream stream;
              stream << "No more free pointers "
                     << "for type " << type
                     << " and name " << name
                      ;
              throw std::domain_error(stream.str());
            }
            auto dataPointer = freePointers.begin();
            freePointers.erase(dataPointer);
            blank.data = *dataPointer;
            //
            //
            // TODO [#A]: do cuMemcpy of
            //      sources[from.source].data() ⇒ blank.data
            // Do this when everything else is working.
            // This will probably be a bottleneck of the H-to-D communication,
            // as most slices are SelfSufficient.
            //
            //
 #else
            blank.data = sources[from.source].data();
 #endif
@ -337,16 +316,6 @@ template <typename F=double>
            }
          }
 #if defined(HAVE_CUDA)
          // In cuda, SelfSufficient slices have an ad-hoc pointer
          // since it is a pointer on the device and has to be
          // brought back to the free pointer bucket of the SliceUnion.
          // Therefore, only in the Recycled case it should not be
          // put back the pointer.
          if (slice.info.state == Slice<F>::Recycled) {
            freeSlicePointer = false;
          }
 #else
          // if the slice is self sufficient, do not dare touching the
          // pointer, since it is a pointer to our sources in our rank.
          if (  slice.info.state == Slice<F>::SelfSufficient
@ -354,7 +323,6 @@ template <typename F=double>
             ) {
            freeSlicePointer = false;
          }
 #endif
          // make sure we get its data pointer to be used later
          // only for non-recycled, since it can be that we need
@ -378,7 +346,7 @@ template <typename F=double>
                    // << " info " << slice.info
                    << "\n";
          slice.free();
-        }  // we did not find the slice
+        }
      }
    }
@ -412,17 +380,10 @@ template <typename F=double>
      for (auto& ptr: sliceBuffers) {
 #if defined(HAVE_CUDA)
-        const CUresult error =
+        cuMemAlloc(&ptr, sizeof(F) * sources[0].size());
-          cuMemAlloc(&ptr, sizeof(F) * sources[0].size());
+	if (ptr == 0UL) {
-        if (ptr == 0UL) {
+	  throw "UNSUFICCIENT MEMORY ON THE GRAPHIC CARD FOR FREE POINTERS";
-          throw "UNSUFICCIENT MEMORY ON THE GRAPHIC CARD FOR FREE POINTERS";
+	}
        }
        if (error != CUDA_SUCCESS) {
          std::stringstream s;
          s << "Error allocating memory for slice buffers "
            << "code " << error << "\n";
          throw s.str();
        }
 #else
        ptr = (DataPtr<F>)malloc(sizeof(F) * sources[0].size());
 #endif
--- a/include/atrip/Utils.hpp
+++ b/include/atrip/Utils.hpp
@ -80,16 +80,6 @@ struct Timer {
 using Timings = std::map<std::string, Timer>;
 // Chrono:1 ends here
 //  A nice handy macro to do formatting
 #define _FORMAT(_fmt, ...)                                    \
  ([&] (void) -> std::string {                                \
     int _sz = std::snprintf(nullptr, 0, _fmt, __VA_ARGS__);  \
     std::vector<char>  _out(_sz  +  1);                      \
     std::snprintf(&_out[0], _out.size(), _fmt, __VA_ARGS__); \
     return std::string(_out.data());                         \
   })()
 // [[file:~/cuda/atrip/atrip.org::*Epilog][Epilog:1]]
 }
 // Epilog:1 ends here
--- a/src/atrip/Atrip.cxx
+++ b/src/atrip/Atrip.cxx
@ -24,7 +24,7 @@
 using namespace atrip;
 #if defined(HAVE_CUDA)
-#include <atrip/CUDA.hpp>
+#include <cuda.h>
 #endif
 template <typename F> bool RankMap<F>::RANK_ROUND_ROBIN;
@ -49,6 +49,11 @@ void Atrip::init(MPI_Comm world)  {
  Atrip::communicator = world;
  MPI_Comm_rank(world, (int*)&Atrip::rank);
  MPI_Comm_size(world, (int*)&Atrip::np);
 #if defined(HAVE_CUDA)
  Atrip::cuda.status = cublasCreate(&Atrip::cuda.handle);
 #endif
 }
 template <typename F>
@ -66,24 +71,18 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
 #if defined(HAVE_CUDA)
  int ngcards;
  _CHECK_CUDA_SUCCESS("initializing cuda",
                      cuInit(0));
  _CHECK_CUDA_SUCCESS("getting device count",
                      cuDeviceGetCount(&ngcards));
  const auto clusterInfo = getClusterInfo(Atrip::communicator);
  cuDeviceGetCount(&ngcards);
  LOG(0,"Atrip") << "ngcards: " << ngcards << "\n";
  if (clusterInfo.ranksPerNode > ngcards) {
-    const auto msg
+    std::cerr << "ATRIP: You are running on more ranks per node than the number of graphic cards\n"
-      = _FORMAT("ATRIP: You are running on more ranks per node than the number of graphic cards\n"
+              << "You have " << ngcards << " cards at your disposal\n";
-                "You have %d cards at your disposal\n", ngcards);
+    throw "";
-    std::cerr << msg;
+  }
-    throw msg;
+  if (clusterInfo.ranksPerNode < ngcards) {
-  } else if (clusterInfo.ranksPerNode < ngcards) {
+    std::cerr << "You have " << ngcards << " cards at your disposal\n"
-    const auto msg
+              << "You will be only using " << clusterInfo.ranksPerNode
-      = _FORMAT("You have %d cards at your disposal.\n"
+              << ", i.e., the nubmer of ranks.\n";
                "You will be only using %d, i.e, the number of ranks\n",
                ngcards, clusterInfo.ranksPerNode);
    std::cerr << msg;
  }
@ -95,27 +94,16 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
      struct { struct { size_t free, total; } avail; size_t total; } memory;
      char *name = (char*)malloc(256);
      // - TODO :: we should check that the Zuweisung of graphic cards
      //           to nodes works as expected, i.e., node k should get from 0
      //           to ngcards with the formula =rank % ngcards=.
      // set current device
-      _CHECK_CUDA_SUCCESS("getting device for index <rank>",
+      cuDeviceGet(&dev, rank);
-                          cuDeviceGet(&dev, rank % ngcards));
+      cuCtxCreate(&ctx, 0, dev);
-      _CHECK_CUDA_SUCCESS("creating a cuda context",
+      cuCtxSetCurrent(ctx);
                          cuCtxCreate(&ctx, 0, dev));
      _CHECK_CUDA_SUCCESS("setting the context",
                          cuCtxSetCurrent(ctx));
      // get information of the device
-      _CHECK_CUDA_SUCCESS("getting  properties of current device",
+      cuDeviceGetProperties(&prop, dev);
-                          cuDeviceGetProperties(&prop, dev));
+      cuMemGetInfo(&memory.avail.free, &memory.avail.total);
-      _CHECK_CUDA_SUCCESS("getting memory information",
+      cuDeviceGetName(name, 256, dev);
-                          cuMemGetInfo(&memory.avail.free, &memory.avail.total));
+      cuDeviceTotalMem(&memory.total, dev);
      _CHECK_CUDA_SUCCESS("getting name",
                          cuDeviceGetName(name, 256, dev));
      _CHECK_CUDA_SUCCESS("getting total memory",
                          cuDeviceTotalMem(&memory.total, dev));
      printf("\n"
             "CUDA CARD RANK %d\n"
@ -136,10 +124,6 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
             memory.total / 1024.0 / 1024.0 / 1024.0
             );
      std::free((void*)name);
      _CHECK_CUBLAS_SUCCESS("creating a cublas handle",
                            cublasCreate(&Atrip::cuda.handle));
    }
    MPI_Barrier(universe);
  }
@ -179,27 +163,17 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
 #if defined(HAVE_CUDA)
  DataPtr<F> Tai, epsi, epsa;
  //TODO: free memory pointers in the end of the algorithm
  cuMemAlloc(&Tai, sizeof(F) * _Tai.size());
  cuMemAlloc(&epsi, sizeof(F) * _epsi.size());
  cuMemAlloc(&epsa, sizeof(F) * _epsa.size());
-  // TODO: free memory pointers in the end of the algorithm
+  cuMemcpyHtoD(Tai, (void*)_Tai.data(), sizeof(F) * _Tai.size());
  cuMemcpyHtoD(epsi,(void*)_epsi.data(), sizeof(F) * _epsi.size());
  cuMemcpyHtoD(epsa, (void*)_epsa.data(), sizeof(F) * _epsa.size());
-  _CHECK_CUDA_SUCCESS("Tai",
+  cuMemAlloc(&Tijk, sizeof(F) * No * No * No);
-                      cuMemAlloc(&Tai, sizeof(F) * _Tai.size()));
+  cuMemAlloc(&Zijk, sizeof(F) * No * No * No);
  _CHECK_CUDA_SUCCESS("epsi",
                      cuMemAlloc(&epsi, sizeof(F) * _epsi.size()));
  _CHECK_CUDA_SUCCESS("epsa",
                      cuMemAlloc(&epsa, sizeof(F) * _epsa.size()));
  _CHECK_CUDA_SUCCESS("memcpy Tai",
                      cuMemcpyHtoD(Tai, (void*)_Tai.data(), sizeof(F) * _Tai.size()));
  _CHECK_CUDA_SUCCESS("memcpy epsi",
                      cuMemcpyHtoD(epsi,(void*)_epsi.data(), sizeof(F) * _epsi.size()));
  _CHECK_CUDA_SUCCESS("memcpy epsa",
                      cuMemcpyHtoD(epsa, (void*)_epsa.data(), sizeof(F) * _epsa.size()));
  _CHECK_CUDA_SUCCESS("Tijk",
                      cuMemAlloc(&Tijk, sizeof(F) * No * No * No));
  _CHECK_CUDA_SUCCESS("Zijk",
                      cuMemAlloc(&Zijk, sizeof(F) * No * No * No));
 #else
  std::vector<F> &Tai = _Tai, &epsi = _epsi, &epsa = _epsa;
  Zijk = (DataFieldType<F>*)malloc(No*No*No * sizeof(DataFieldType<F>));
@ -292,8 +266,8 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
  auto const isFakeTuple
    = [&tuplesList, distribution](size_t const i) {
-        return distribution->tupleIsFake(tuplesList[i]);
+      return distribution->tupleIsFake(tuplesList[i]);
-      };
+    };
  using Database = typename Slice<F>::Database;
--- a/src/atrip/Blas.cxx
+++ b/src/atrip/Blas.cxx
@ -15,22 +15,19 @@
 // [[file:~/cuda/atrip/atrip.org::*Blas][Blas:2]]
 #include <atrip/Blas.hpp>
 #include <atrip/Atrip.hpp>
 #include <atrip/CUDA.hpp>
 #if defined(HAVE_CUDA)
 #  include <cstring>
-static size_t dgem_call = 0;
+  static inline
-
+  cublasOperation_t char_to_cublasOperation(const char* trans) {
-static inline
+    if (strncmp("N", trans, 1) == 0)
-cublasOperation_t char_to_cublasOperation(const char* trans) {
+      return CUBLAS_OP_N;
-  if (strncmp("N", trans, 1) == 0)
+    else if (strncmp("T", trans, 1) == 0)
-    return CUBLAS_OP_N;
+      return CUBLAS_OP_T;
-  else if (strncmp("T", trans, 1) == 0)
+    else
-    return CUBLAS_OP_T;
+      return CUBLAS_OP_C;
-  else
+  }
    return CUBLAS_OP_C;
 }
 #endif
@ -52,23 +49,13 @@ namespace atrip {
             typename DataField<double>::type *C,
             const int *ldc) {
 #if defined(HAVE_CUDA)
-    // TODO: remove this verbose checking
+    cublasDgemm(Atrip::cuda.handle,
-    const cublasStatus_t error =
+                char_to_cublasOperation(transa),
-      cublasDgemm(Atrip::cuda.handle,
+                char_to_cublasOperation(transb),
-                  char_to_cublasOperation(transa),
+                *m, *n, *k,
-                  char_to_cublasOperation(transb),
+                alpha, A, *lda,
-                  *m, *n, *k,
+                B, *ldb, beta,
-                  alpha, A, *lda,
+                C, *ldc);
                  B, *ldb, beta,
                  C, *ldc);
    if (error != 0) printf(":%-3ld (%4ldth) ERR<%4d> cublasDgemm: "
                           "A = %20ld "
                           "B = %20ld "
                           "C = %20ld "
                           "\n",
                           Atrip::rank,
                           dgem_call++,
                           error, A, B, C);
 #else
    dgemm_(transa, transb,
           m, n, k,
@ -96,17 +83,16 @@ namespace atrip {
    cuDoubleComplex
      cu_alpha = {std::real(*alpha), std::imag(*alpha)},
      cu_beta = {std::real(*beta), std::imag(*beta)};
    cublasZgemm(Atrip::cuda.handle,
                char_to_cublasOperation(transa),
                char_to_cublasOperation(transb),
                *m, *n, *k,
                &cu_alpha,
-    _CHECK_CUBLAS_SUCCESS("cublasZgemm",
+                A, *lda,
-                          cublasZgemm(Atrip::cuda.handle,
+                B, *ldb,
-                                      char_to_cublasOperation(transa),
+                &cu_beta,
-                                      char_to_cublasOperation(transb),
+                C, *ldc);
                                      *m, *n, *k,
                                      &cu_alpha,
                                      A, *lda,
                                      B, *ldb,
                                      &cu_beta,
                                      C, *ldc));
 #else
    zgemm_(transa, transb,
           m, n, k,
--- a/src/atrip/Equations.cxx
+++ b/src/atrip/Equations.cxx
@ -14,13 +14,31 @@
 // [[file:~/cuda/atrip/atrip.org::*Prolog][Prolog:2]]
 #include<atrip/Equations.hpp>
 #include<atrip/CUDA.hpp>
 #if defined(HAVE_CUDA)
 #include <cuda.h>
 #endif
 namespace atrip {
 // Prolog:2 ends here
  // These are just help structures
  // to help with the templating of reorder
  // function
  enum reordering_t
    {
     IJK,
     IKJ,
     JIK,
     JKI,
     KIJ,
     KJI
    };
  template <typename F, reordering_t R>
  struct reorder_proxy {};
 #ifdef HAVE_CUDA
 namespace cuda {
@ -83,42 +101,49 @@ namespace cuda {
  template <>
  __device__
  cuDoubleComplex multiply(const cuDoubleComplex &a, const cuDoubleComplex &b) {
-    return
+   return
-      {a.x * b.x - a.y * b.y,
+     {a.x * b.x - a.y * b.y,
-       a.x * b.y + a.y * b.x};
+      a.x * b.y + a.y * b.x};
  }
  template <typename F>
  __device__
-  void sum_in_place(F* to, const F* from);
+  void sum_in_place(F* to, const F* b);
  template <>
  __device__
-  void sum_in_place(double* to, const double *from) { *to += *from; }
+  void sum_in_place(double* to, const double *b) { *to += *b; }
  template <>
  __device__
-  void sum_in_place(cuDoubleComplex* to, const cuDoubleComplex* from) {
+  void sum_in_place(cuDoubleComplex* to, const cuDoubleComplex* b) {
-    to->x += from->x;
+   to->x += b->x;
-    to->y += from->y;
+   to->y += b->y;
  }
  __device__
  cuDoubleComplex& operator+=(cuDoubleComplex& lz, cuDoubleComplex const& rz) {
    lz.x += rz.x;
    lz.y += rz.y;
    return lz;
  }
 };
 #endif
 #if defined(HAVE_CUDA)
-#define FOR_K()                                             \
+#define LIMS_KS()                                       \
-  for (size_t kmin = blockIdx.x * blockDim.x + threadIdx.x, \
+  size_t kmin = blockIdx.x * blockDim.x + threadIdx.x,  \
-         k = kmin,                                          \
+    k = kmin,                                           \
-         idx = kmin * size * size * size;                   \
+    idx = kmin * size * size * size                     \
-       k < (kmin < size) ? kmin + 1 : size;                 \
+    ;                                                   \
-       k++)
+  k < (kmin < size) ? kmin + 1 : size
 #else
-#define FOR_K() for (size_t k=0, idx=0; k < size; k++)
+#define LIMS_KS size_t k=0, idx=0; k < size
 #endif
 #define _IJK_(i, j, k) i + j*size + k*size*size
 #define _REORDER_BODY_(...)                       \
-  FOR_K()                                         \
+  for (LIMS_KS() ; k++)                           \
    for (size_t j = 0; j < size; j++)             \
      for (size_t i = 0; i < size; i++, idx++) {  \
        __VA_ARGS__                               \
@ -127,9 +152,7 @@ namespace cuda {
  template <typename F>                         \
  __MAYBE_GLOBAL__                              \
  void reorder(reorder_proxy< F, _enum > p,     \
-               size_t size,                     \
+               size_t size, F* to, F* from) {   \
               F* to,                           \
               F* from) {                       \
    _REORDER_BODY_(__VA_ARGS__)                 \
      }
 #if defined(HAVE_CUDA)
@ -138,31 +161,9 @@ namespace cuda {
 #define GO(__TO, __FROM) __TO += __FROM;
 #endif
  // These are just help structures
  // to help with the templating of reorder
  // function
  enum reordering_t
    {
     IJK,
     IKJ,
     JIK,
     JKI,
     KIJ,
     KJI
    };
  /* 
   * Please the c++ type checker and template creator
   * in order to have an argument in the signature of
   * the function  that helps the compiler know which
   * instantiation it should take.
   * 
   */
  template <typename F, reordering_t R>
  struct reorder_proxy {};
  template <typename F, reordering_t R>
-  __MAYBE_GLOBAL__
+  __MAYBE_GLOBAL__							\
  void reorder(reorder_proxy<F, R> proxy,
               size_t size, F* to, F* from);
@ -444,14 +445,10 @@ double getEnergySame
 #if defined(ATRIP_USE_DGEMM)
 #if defined(HAVE_CUDA)
-#define REORDER(__II, __JJ, __KK)                           \
+#define REORDER(__II, __JJ, __KK)                       \
-  reorder<<<bs, ths>>>(reorder_proxy<                       \
+  reorder<<<bs, ths>>>(reorder_proxy<DataFieldType<F>,  \
-                       DataFieldType<F>,                    \
+                       __II ## __JJ ## __KK >{},        \
-                       __II ## __JJ ## __KK                 \
+                       No, Tijk, _t_buffer);
                       >{},                                 \
                       No,                                  \
                       Tijk,                                \
                       _t_buffer);
 #define DGEMM_PARTICLES(__A, __B)               \
  atrip::xgemm<F>("T",                          \
                  "N",                          \
@ -465,7 +462,7 @@ double getEnergySame
                  (int const*)&Nv,              \
                  &zero,                        \
                  _t_buffer,                    \
-                  (int const*)&NoNo)
+                  (int const*)&NoNo);
 #define DGEMM_HOLES(__A, __B, __TRANSB)         \
  atrip::xgemm<F>("N",                          \
                  __TRANSB,                     \
@ -480,7 +477,7 @@ double getEnergySame
                  &zero,                        \
                  _t_buffer,                    \
                  (int const*)&NoNo             \
-                  )
+                  );
 #define MAYBE_CONJ(_conj, _buffer)                                      \
  cuda::maybeConjugate<<<                                               \
    Atrip::kernelDimensions.ooo.blocks,                                 \
@ -515,7 +512,7 @@ double getEnergySame
                  &zero,                        \
                  _t_buffer,                    \
                  (int const*)&NoNo             \
-                  )
+                  );
 #define DGEMM_HOLES(__A, __B, __TRANSB)         \
  atrip::xgemm<F>("N",                          \
                  __TRANSB,                     \
@ -530,7 +527,7 @@ double getEnergySame
                  &zero,                        \
                  _t_buffer,                    \
                  (int const*)&NoNo             \
-                  )
+                  );
 #define MAYBE_CONJ(_conj, _buffer)                \
  for (size_t __i = 0; __i < NoNoNo; ++__i)       \
    _conj[__i] = maybeConjugate<F>(_buffer[__i]);
@ -541,14 +538,8 @@ double getEnergySame
 #ifdef HAVE_CUDA
    DataFieldType<F>* _t_buffer;
    DataFieldType<F>* _vhhh;
-    WITH_CHRONO("double:cuda:alloc",
+    cuMemAlloc((CUdeviceptr*)&_t_buffer, NoNoNo * sizeof(DataFieldType<F>));
-    _CHECK_CUDA_SUCCESS("Allocating _t_buffer",
+    cuMemAlloc((CUdeviceptr*)&_vhhh, NoNoNo * sizeof(DataFieldType<F>));
                        cuMemAlloc((CUdeviceptr*)&_t_buffer,
                                   NoNoNo * sizeof(DataFieldType<F>)));
    _CHECK_CUDA_SUCCESS("Allocating _vhhh",
                        cuMemAlloc((CUdeviceptr*)&_vhhh,
                                   NoNoNo * sizeof(DataFieldType<F>)));
                )
    const size_t
      bs = Atrip::kernelDimensions.ooo.blocks,
      ths = Atrip::kernelDimensions.ooo.threads;
@ -569,6 +560,7 @@ double getEnergySame
    WITH_CHRONO("double:reorder",
                cuda::zeroing<<<bs, ths>>>((DataFieldType<F>*)Tijk,
                                           NoNoNo);
                // synchronize all initializations to zero
                )
 #else
    WITH_CHRONO("double:reorder",
@ -583,37 +575,37 @@ double getEnergySame
        // VhhhC[i + k*No + L*NoNo] * TABhh[L + j*No]; H1
        MAYBE_CONJ(_vhhh, VhhhC)
        WITH_CHRONO("doubles:holes:1",
-                    DGEMM_HOLES(_vhhh, TABhh, "N");
+          DGEMM_HOLES(_vhhh, TABhh, "N")
-                    REORDER(I, K, J)
+          REORDER(I, K, J)
        )
        // VhhhC[j + k*No + L*NoNo] * TABhh[i + L*No]; H0
        WITH_CHRONO("doubles:holes:2",
-                    DGEMM_HOLES(_vhhh, TABhh, "T");
+          DGEMM_HOLES(_vhhh, TABhh, "T")
-                    REORDER(J, K, I)
+          REORDER(J, K, I)
        )
        // VhhhB[i + j*No + L*NoNo] * TAChh[L + k*No]; H5
        MAYBE_CONJ(_vhhh, VhhhB)
        WITH_CHRONO("doubles:holes:3",
-                    DGEMM_HOLES(_vhhh, TAChh, "N");
+          DGEMM_HOLES(_vhhh, TAChh, "N")
-                    REORDER(I, J, K)
+          REORDER(I, J, K)
        )
        // VhhhB[k + j*No + L*NoNo] * TAChh[i + L*No]; H3
        WITH_CHRONO("doubles:holes:4",
-                    DGEMM_HOLES(_vhhh, TAChh, "T");
+          DGEMM_HOLES(_vhhh, TAChh, "T")
-                    REORDER(K, J, I)
+          REORDER(K, J, I)
        )
        // VhhhA[j + i*No + L*NoNo] * TBChh[L + k*No]; H1
        MAYBE_CONJ(_vhhh, VhhhA)
        WITH_CHRONO("doubles:holes:5",
-                    DGEMM_HOLES(_vhhh, TBChh, "N");
+          DGEMM_HOLES(_vhhh, TBChh, "N")
-                    REORDER(J, I, K)
+          REORDER(J, I, K)
        )
        // VhhhA[k + i*No + L*NoNo] * TBChh[j + L*No]; H4
        WITH_CHRONO("doubles:holes:6",
-                    DGEMM_HOLES(_vhhh, TBChh, "T");
+          DGEMM_HOLES(_vhhh, TBChh, "T")
-                    REORDER(K, I, J)
+          REORDER(K, I, J)
        )
      }
    )
@ -624,33 +616,33 @@ double getEnergySame
      {
        // TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv]; P0
        WITH_CHRONO("doubles:particles:1",
-                    DGEMM_PARTICLES(TAphh, VBCph);
+          DGEMM_PARTICLES(TAphh, VBCph)
-                    REORDER(I, J, K)
+          REORDER(I, J, K)
        )
        // TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv]; P3
        WITH_CHRONO("doubles:particles:2",
-                    DGEMM_PARTICLES(TAphh, VCBph);
+          DGEMM_PARTICLES(TAphh, VCBph)
-                    REORDER(I, K, J)
+          REORDER(I, K, J)
        )
        // TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv]; P5
        WITH_CHRONO("doubles:particles:3",
-                    DGEMM_PARTICLES(TCphh, VABph);
+          DGEMM_PARTICLES(TCphh, VABph)
-                    REORDER(K, I, J)
+          REORDER(K, I, J)
        )
        // TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv]; P2
        WITH_CHRONO("doubles:particles:4",
-                    DGEMM_PARTICLES(TCphh, VBAph);
+          DGEMM_PARTICLES(TCphh, VBAph)
-                    REORDER(K, J, I)
+          REORDER(K, J, I)
        )
        // TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv]; P1
        WITH_CHRONO("doubles:particles:5",
-                    DGEMM_PARTICLES(TBphh, VACph);
+          DGEMM_PARTICLES(TBphh, VACph)
-                    REORDER(J, I, K)
+          REORDER(J, I, K)
        )
        // TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv]; P4
        WITH_CHRONO("doubles:particles:6",
-                    DGEMM_PARTICLES(TBphh, VCAph);
+          DGEMM_PARTICLES(TBphh, VCAph)
-                    REORDER(J, K, I)
+          REORDER(J, K, I)
        )
      }
    )
@ -659,12 +651,9 @@ double getEnergySame
 #ifdef HAVE_CUDA
    // we need to synchronize here since we need
    // the Tijk for next process in the pipeline
-    _CHECK_CUDA_SUCCESS("Synchronizing",
+    cuCtxSynchronize();
-                        cuCtxSynchronize());
+    cuMemFree((CUdeviceptr)_vhhh);
-    _CHECK_CUDA_SUCCESS("Freeing _vhhh",
+    cuMemFree((CUdeviceptr)_t_buffer);
                        cuMemFree((CUdeviceptr)_vhhh));
    _CHECK_CUDA_SUCCESS("Freeing _t_buffer",
                        cuMemFree((CUdeviceptr)_t_buffer));
 #else
    free(_vhhh);
    free(_t_buffer);