Compare commits
11 Commits
4277c07cc2 ... 20c29ed815

| Author | SHA1 | Date |
|---|---|---|
| | 20c29ed815 | |
| | a5bb8f3d9c | |
| | 5678ac0d9c | |
| | 5483325626 | |
| | 23ad87214f | |
| | 3d7702d501 | |
| | c20b9e3bcb | |
| | da704ad820 | |
| | f1b2f37fe2 | |
| | 1cd7bac187 | |
| | 68892d5dd8 | |
@@ -1,3 +1,4 @@
AUTOMAKE_OPTIONS = subdir-objects
include $(top_srcdir)/atrip.mk

AM_CPPFLAGS = -I$(top_srcdir)/include/ $(CTF_CPPFLAGS)
@@ -19,4 +20,13 @@ endif
if WITH_CUDA
test_main_CXXFLAGS = $(CUDA_CXXFLAGS)
test_main_LDADD += $(CUDA_LDFLAGS)

AM_CXXFLAGS = $(CUDA_CXXFLAGS)
AM_LDFLAGS += $(CUDA_LDFLAGS)

bin_PROGRAMS += test-cublas-parallel-atrip
test_cublas_parallel_atrip_SOURCES = test-cublas-parallel-atrip.cxx

bin_PROGRAMS += test-cuda-sanity
test_cuda_sanity_SOURCES = test-cuda-sanity.cxx
endif

bench/test-cublas-parallel-atrip.cxx (Normal file, 239 lines)
@@ -0,0 +1,239 @@
#include <mpi.h>
#include <iostream>
#include <cassert>
#include <string.h>
#include <sstream>
#include <string>
#include <vector>
#include <cuda.h>
#include <chrono>
#include <map>
#include <cstdlib>
#include <unistd.h>

#include <CLI11.hpp>

#define CUBLASAPI
#include <cublas_api.h>
#include <cublas_v2.h>

// Accumulating wall-clock timer: every start()/stop() pair adds to duration.
struct Timer {
  using Clock = std::chrono::high_resolution_clock;
  using Event = std::chrono::time_point<Clock>;
  std::chrono::duration<double> duration;
  Event _start;
  inline void start() noexcept { _start = Clock::now(); }
  inline void stop() noexcept { duration += Clock::now() - _start; }
  inline void clear() noexcept { duration *= 0; }
  inline double count() const noexcept { return duration.count(); }
};
using Timings = std::map<std::string, Timer>;

// printf-style formatting into a std::string
#define _FORMAT(_fmt, ...)                                          \
  ([&] (void) -> std::string {                                      \
    int _sz = std::snprintf(nullptr, 0, _fmt, __VA_ARGS__);         \
    std::vector<char> _out(_sz + 1);                                \
    std::snprintf(&_out[0], _out.size(), _fmt, __VA_ARGS__);        \
    return std::string(_out.data());                                \
  })()

#define _CHECK_CUDA_SUCCESS(message, ...)                           \
  do {                                                              \
    CUresult result = __VA_ARGS__;                                  \
    printf("doing %s\n", message);                                  \
    if (result != CUDA_SUCCESS) {                                   \
      printf("\t!!CUDA_ERROR(%d): %s:%d %s\n",                      \
             result,                                                \
             __FILE__,                                              \
             __LINE__,                                              \
             message);                                              \
      return 1;                                                     \
    }                                                               \
  } while (0)

#define _CHECK_CUBLAS_SUCCESS(message, ...)                         \
  do {                                                              \
    cublasStatus_t result = __VA_ARGS__;                            \
    if (result != 0) {                                              \
      printf("\t!!CUBLAS_ERROR(%d): %s:%d %s\n",                    \
             result,                                                \
             __FILE__,                                              \
             __LINE__,                                              \
             message);                                              \
      return 1;                                                     \
    }                                                               \
  } while (0)

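// Note: both macros end in "return 1", so they are only usable inside
// main() (or another int-returning function); any failure aborts the
// benchmark with exit status 1.
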
int main(int argc, char** argv) {

  using std::vector;

  MPI_Init(NULL, NULL);
  int rank, np, ngcards;
  size_t no(10), nv(no * 10), its(2);
  bool barrier = false;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &np);

  CLI::App app{"Main bench for atrip"};
  app.add_option("--no", no, "Occupied orbitals");
  app.add_option("--nv", nv, "Virtual orbitals");
  app.add_option("--its", its, "Number of iterations to be done");
  app.add_option("--barrier", barrier, "Call an MPI_Barrier in every iteration?");
  CLI11_PARSE(app, argc, argv);

  const size_t oo = no * no, ooo = no * oo;

  Timings timings;

  MPI_Barrier(MPI_COMM_WORLD);

  _CHECK_CUDA_SUCCESS("init for cuda",
                      cuInit(0));

  _CHECK_CUDA_SUCCESS("get ncards",
                      cuDeviceGetCount(&ngcards));

  CUcontext ctx;
  CUdevice dev;

  char hostname[256];
  gethostname(hostname, 256);
  printf("%s with rank %d gets card %d\n",
         hostname,
         rank,
         rank % ngcards);

  // set contexts
  _CHECK_CUDA_SUCCESS("device get", cuDeviceGet(&dev, rank % ngcards));
  _CHECK_CUDA_SUCCESS("creating context", cuCtxCreate(&ctx, 0, dev));
  _CHECK_CUDA_SUCCESS("setting context", cuCtxSetCurrent(ctx));
  _CHECK_CUDA_SUCCESS("synchronizing", cuCtxSynchronize());
  MPI_Barrier(MPI_COMM_WORLD);

  using host_slice_t = vector<double>;

  vector<size_t> sizes = {nv * oo, nv * no, oo, oo * no, oo * no};
  vector<CUdeviceptr> P_phh(3), P_ph(6), H_hh(3), H_hhh(3), T_hhh(1);
  vector<vector<CUdeviceptr>*> slices_d = {&P_phh, &P_ph, &H_hh, &H_hhh, &T_hhh};
  vector<vector<host_slice_t>> slices_h(slices_d.size());
  {
    int i = -1;
    for (auto& v: slices_d) {
      i++;
      for (auto& ptr: *v) {
        _CHECK_CUDA_SUCCESS("malloc",
                            cuMemAlloc(&ptr,
                                       sizes[i] * sizeof(double)));
        slices_h[i].emplace_back(sizes[i]);
      }
    }
  }

  const double one = 1.0, zero = 0.0;

  printf("its: %ld\n", its);
  printf("barrier: %d\n", barrier);
  printf("no: %ld\n", no);
  printf("nv: %ld\n", nv);
  printf("SIZE %f GB\n", (3 * nv * oo
                          + 6 * nv * no
                          + 3 * oo
                          + 3 * ooo
                          + 1 * ooo
                          ) * sizeof(double) / 1024.0 / 1024.0 / 1024.0);
  std::map<std::string, double> tflopss
    {{ "dgemm",     ooo * (no + nv) * 6.0 * 2.0 * its / 1e12},
     { "holes",     ooo * no * 6.0 * 2.0 * its / 1e12},
     { "particles", ooo * nv * 6.0 * 2.0 * its / 1e12}};

  cublasHandle_t handle;
  _CHECK_CUBLAS_SUCCESS("handle create", cublasCreate(&handle));
  printf("handle %p\n", (void*)handle);

  timings["dgemm"].start();
  for (size_t i = 0; i < its; i++) {

    if (barrier) {
      MPI_Barrier(MPI_COMM_WORLD);
      timings["memcpy"].start();
      for (size_t _s = 0; _s < slices_d.size(); _s++) {
        // for (size_t _b = 0; _b < slices_h[_s].size(); _b++) {
        for (size_t _b = 0; _b < 1; _b++) {
          auto device = (*slices_d[_s])[_b];
          auto host = slices_h[_s][_b].data();
          // cuMemcpyHtoD expects a byte count, not an element count
          cuMemcpyHtoD(device, host, sizes[_s] * sizeof(double));
        }
      }
      timings["memcpy"].stop();
    }

    timings["holes"].start();
    for (size_t j = 0; j < 3; j++) {

      _CHECK_CUBLAS_SUCCESS(" > 'geming ...",
                            cublasDgemm(handle,
                                        CUBLAS_OP_N,
                                        CUBLAS_OP_N,
                                        oo, no, no,
                                        &one,
                                        (double*)H_hhh[j], oo,
                                        (double*)H_hh[j], no,
                                        &zero,
                                        (double*)T_hhh[0], oo));

      _CHECK_CUBLAS_SUCCESS(" > 'geming ...",
                            cublasDgemm(handle,
                                        CUBLAS_OP_N,
                                        CUBLAS_OP_T,
                                        oo, no, no,
                                        &one,
                                        (double*)H_hhh[j], oo,
                                        (double*)H_hh[j], no,
                                        &zero,
                                        (double*)T_hhh[0], oo));

    }
    timings["holes"].stop();

    timings["particles"].start();
    for (size_t j = 0; j < 6; j++) {
      _CHECK_CUBLAS_SUCCESS(" > 'geming ...",
                            cublasDgemm(handle,
                                        CUBLAS_OP_T,
                                        CUBLAS_OP_N,
                                        oo, no, nv,
                                        &one,
                                        (double*)P_phh[j % 3], nv,
                                        (double*)P_ph[j], nv,
                                        &zero,
                                        (double*)T_hhh[0], oo));
    }
    timings["particles"].stop();

    cuCtxSynchronize();

  }

  timings["dgemm"].stop();

  printf("Performance: \n");
  for (auto name: {"holes", "particles", "dgemm"})
    printf("%10s TFlops: %4.1f\n",
           name,
           tflopss[name]
           / timings[name].count());

  printf("Timings: \n");
  for (auto const& kv: timings)
    printf("%10s: %10f\n", kv.first.c_str(), kv.second.count());

  MPI_Finalize();
  return 0;

}

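A quick cross-check of the flop accounting in this file: a dgemm with dimensions m, n, k costs 2*m*n*k operations, and each iteration issues 6 hole gemms (m = no*no, n = no, k = no) and 6 particle gemms (k = nv), which is where the 6.0 * 2.0 factors in the tflopss map come from. A minimal standalone sketch of that arithmetic (not from the commit), using the default parameters:

    // Sketch: expected TFLOP counts for the defaults no = 10, nv = 100, its = 2.
    #include <cstdio>

    int main() {
      const double no = 10, nv = 100, its = 2;
      const double ooo = no * no * no;
      const double holes     = ooo * no * 6.0 * 2.0 * its / 1e12;
      const double particles = ooo * nv * 6.0 * 2.0 * its / 1e12;
      // the "dgemm" entry is the sum of both contributions
      std::printf("holes %g, particles %g, dgemm %g TFLOP\n",
                  holes, particles, holes + particles);
      return 0;
    }
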
bench/test-cublas-parallel.cxx (Normal file, 163 lines)
@@ -0,0 +1,163 @@
#include <mpi.h>
#include <iostream>
#include <cassert>
#include <string.h>
#include <sstream>
#include <string>
#include <vector>
#include <cuda.h>
#include <chrono>
#include <map>
#include <cstdlib>
#include <unistd.h>

#define CUBLASAPI
#include <cublas_api.h>
#include <cublas_v2.h>

struct Timer {
  using Clock = std::chrono::high_resolution_clock;
  using Event = std::chrono::time_point<Clock>;
  std::chrono::duration<double> duration;
  Event _start;
  inline void start() noexcept { _start = Clock::now(); }
  inline void stop() noexcept { duration += Clock::now() - _start; }
  inline void clear() noexcept { duration *= 0; }
  inline double count() const noexcept { return duration.count(); }
};
using Timings = std::map<std::string, Timer>;

#define _FORMAT(_fmt, ...)                                          \
  ([&] (void) -> std::string {                                      \
    int _sz = std::snprintf(nullptr, 0, _fmt, __VA_ARGS__);         \
    std::vector<char> _out(_sz + 1);                                \
    std::snprintf(&_out[0], _out.size(), _fmt, __VA_ARGS__);        \
    return std::string(_out.data());                                \
  })()

#define _CHECK_CUDA_SUCCESS(message, ...)                           \
  do {                                                              \
    CUresult result = __VA_ARGS__;                                  \
    printf("doing %s\n", message);                                  \
    if (result != CUDA_SUCCESS) {                                   \
      printf("\t!!CUDA_ERROR(%d): %s:%d %s\n",                      \
             result,                                                \
             __FILE__,                                              \
             __LINE__,                                              \
             message);                                              \
      return 1;                                                     \
    }                                                               \
  } while (0)

#define _CHECK_CUBLAS_SUCCESS(message, ...)                         \
  do {                                                              \
    cublasStatus_t result = __VA_ARGS__;                            \
    printf("CUBLAS: doing %s\n", message);                          \
    if (result != 0) {                                              \
      printf("\t!!CUBLAS_ERROR(%d): %s:%d %s\n",                    \
             result,                                                \
             __FILE__,                                              \
             __LINE__,                                              \
             message);                                              \
      return 1;                                                     \
    }                                                               \
  } while (0)

int main(int argc, char** argv) {
  MPI_Init(NULL, NULL);
  int rank, np, ngcards;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &np);

  Timings timings;

  MPI_Barrier(MPI_COMM_WORLD);

  _CHECK_CUDA_SUCCESS("init for cuda",
                      cuInit(0));

  _CHECK_CUDA_SUCCESS("get ncards",
                      cuDeviceGetCount(&ngcards));

  const size_t N
    = argc > 1
    ? atoi(argv[1])
    : 30000
    , dgemms
    = argc > 2
    ? atoi(argv[2])
    : 2
    , flops = 2 * N * N * N * dgemms
    ;

  CUcontext ctx;
  CUdevice dev;

  char hostname[256];
  gethostname(hostname, 256);
  printf("%s with rank %d gets card %d\n",
         hostname,
         rank,
         rank % ngcards);

  // set contexts
  _CHECK_CUDA_SUCCESS("device get",
                      cuDeviceGet(&dev, rank % ngcards));
  _CHECK_CUDA_SUCCESS("creating context",
                      cuCtxCreate(&ctx, 0, dev));
  _CHECK_CUDA_SUCCESS("setting context",
                      cuCtxSetCurrent(ctx));
  _CHECK_CUDA_SUCCESS("synchronizing",
                      cuCtxSynchronize());
  MPI_Barrier(MPI_COMM_WORLD);

  CUdeviceptr A, B, C;
  const double one = 1.0;
  printf("SIZE %f GB\n", N * N * sizeof(double) / 1024.0 / 1024.0 / 1024.0);
  _CHECK_CUDA_SUCCESS("A", cuMemAlloc(&A, N * N * sizeof(double)));
  _CHECK_CUDA_SUCCESS("B", cuMemAlloc(&B, N * N * sizeof(double)));
  _CHECK_CUDA_SUCCESS("C", cuMemAlloc(&C, N * N * sizeof(double)));

  cublasHandle_t handle;
  _CHECK_CUBLAS_SUCCESS("handle create", cublasCreate(&handle));
  printf("handle %p\n", (void*)handle);

  timings["dgemm"].start();
  for (size_t i = 0; i < dgemms; i++) {
    _CHECK_CUBLAS_SUCCESS(_FORMAT(" > 'geming %ld ...", i).c_str(),
                          cublasDgemm(handle,
                                      CUBLAS_OP_N,
                                      CUBLAS_OP_N,
                                      N, N, N,
                                      &one,
                                      (double*)A, N,
                                      (double*)B, N,
                                      &one,
                                      (double*)C, N));
  }

  cuCtxSynchronize();
  timings["dgemm"].stop();

  printf("dgemm Gflops: %f\n",
         flops
         / timings["dgemm"].count()
         / 1024.0
         / 1024.0
         / 1024.0);

  MPI_Finalize();
  return 0;
}

// Local Variables:
// compile-command: "mpic++ \
//   -pedantic -std=c++11 \
//   -L./cudaroot/lib64 -lcuda \
//   -L./cudaroot/lib64 -lcudart \
//   -L./cudaroot/lib64 -lcublas \
//   ./test-cublas-parallel.cxx -o test-cublas-parallel"
// End:

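For reference, both positional arguments are optional: N defaults to 30000 and the dgemm count to 2, so an invocation along the lines of mpirun -np 2 ./test-cublas-parallel 20000 4 (example values, not from the commit) runs four 20000^3 multiplications per rank. Note that the printed Gflops figure divides by 1024^3 rather than 1e9.
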
bench/test-cuda-sanity.cxx (Normal file, 104 lines)
@@ -0,0 +1,104 @@
#include <mpi.h>
#include <iostream>
#include <cassert>
#include <string.h>
#include <sstream>
#include <string>
#include <vector>

#include <cuda.h>

#define _CHECK_CUDA_SUCCESS(message, ...)                           \
  do {                                                              \
    CUresult result = __VA_ARGS__;                                  \
    printf("doing %s\n", message);                                  \
    if (result != CUDA_SUCCESS) {                                   \
      printf("\t!!CUDA_ERROR(%d): %s:%d %s\n",                      \
             result,                                                \
             __FILE__,                                              \
             __LINE__,                                              \
             message);                                              \
      return 1;                                                     \
    }                                                               \
  } while (0)

int main() {
  int rank, np, ngcards;
  MPI_Init(NULL, NULL);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &np);

  _CHECK_CUDA_SUCCESS("init for cuda",
                      cuInit(0));

  _CHECK_CUDA_SUCCESS("get ncards",
                      cuDeviceGetCount(&ngcards));

  // loop variable renamed from rank, which shadowed the MPI rank above
  for (int card = 0; card < ngcards; card++) {
    CUcontext ctx;
    CUdevice dev;
    CUdevprop_st prop;
    size_t _free, total, total2;
    char *name = (char*)malloc(256);

    printf("Setting contexts\n");
    // set contexts
    _CHECK_CUDA_SUCCESS("device get",
                        cuDeviceGet(&dev, card));
    _CHECK_CUDA_SUCCESS("creating context",
                        cuCtxCreate(&ctx, 0, dev));
    _CHECK_CUDA_SUCCESS("setting context",
                        cuCtxSetCurrent(ctx));
    _CHECK_CUDA_SUCCESS("synchronizing",
                        cuCtxSynchronize());

    _CHECK_CUDA_SUCCESS("prop get",
                        cuDeviceGetProperties(&prop, dev));
    _CHECK_CUDA_SUCCESS("meminfo get",
                        cuMemGetInfo(&_free, &total));
    _CHECK_CUDA_SUCCESS("name get",
                        cuDeviceGetName(name, 256, dev));
    _CHECK_CUDA_SUCCESS("totalmem get",
                        cuDeviceTotalMem(&total2, dev));

    printf("\n"
           "CUDA CARD RANK %d\n"
           "=================\n"
           "\tname: %s\n"
           "\tShared Mem Per Block (KB): %f\n"
           "\tFree/Total mem (GB): %f/%f\n"
           "\ttotal2 mem (GB): %f\n"
           "\n",
           dev,
           name,
           prop.sharedMemPerBlock / 1024.0,
           _free / 1024.0 / 1024.0 / 1024.0,
           total / 1024.0 / 1024.0 / 1024.0,
           total2 / 1024.0 / 1024.0 / 1024.0
           );

    if (_free == 0 || total == 0 || total2 == 0)
      return 1;

    CUdeviceptr data;
    // allocate twice on purpose as a sanity check; the first buffer is
    // overwritten and never freed
    _CHECK_CUDA_SUCCESS("memalloc 1",
                        cuMemAlloc(&data, sizeof(double) * 10000));
    _CHECK_CUDA_SUCCESS("memalloc 2",
                        cuMemAlloc(&data, sizeof(double) * 10000));

    free(name);
  }

  MPI_Finalize();

  return 0;
}

// Local Variables:
// compile-command: "mpic++ \
//   -pedantic -std=c++11 \
//   -L./cudaroot/lib64 -lcuda \
//   -L./cudaroot/lib64 -lcudart \
//   -L./cudaroot/lib64 -lcublas \
//   ./mem.cxx -o mem"
// End:

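One caveat on this file: cuDeviceGetProperties is the legacy driver-API query and CUdevprop only carries a handful of fields. A hedged sketch (not from the commit) of the same queries via cuDeviceGetAttribute, the non-deprecated path, reusing the _CHECK_CUDA_SUCCESS macro and dev from the loop above:

    // Sketch: per-device limits via cuDeviceGetAttribute.
    int shmem = 0, threads = 0;
    _CHECK_CUDA_SUCCESS("shared mem per block",
                        cuDeviceGetAttribute(
                            &shmem,
                            CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
                            dev));
    _CHECK_CUDA_SUCCESS("max threads per block",
                        cuDeviceGetAttribute(
                            &threads,
                            CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                            dev));
    printf("\tShared Mem Per Block (KB): %f\n", shmem / 1024.0);
    printf("\tMax Threads Per Block: %d\n", threads);
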
@@ -142,8 +142,10 @@ int main(int argc, char** argv) {
    + /* tai */ f * nranks * no * nv
    ;

  if (atrip::Atrip::rank == 0)
    std::cout << "Tentative MEMORY USAGE: " << atrip_memory << "\n";
  if (rank == 0) {
    std::cout << "Tentative MEMORY USAGE (GB): "
              << double(atrip_memory) / 1024.0 / 1024.0 / 1024.0 << "\n";
  }

  std::vector<int> symmetries(4, NS)

@@ -1,5 +1,7 @@
#!/usr/bin/env bash

cd `git rev-parse --show-toplevel`

type -a autoreconf > /dev/null ||
{
cat <<EOF && exit

@@ -1,5 +1,13 @@
#pragma once

#include <atrip/Utils.hpp>

#if defined(HAVE_CUDA)
#include <cuda.h>
#endif

#include <sstream>

#if defined(HAVE_CUDA) && defined(__CUDACC__)
# define __MAYBE_GLOBAL__ __global__
# define __MAYBE_DEVICE__ __device__
@@ -7,3 +15,31 @@
# define __MAYBE_GLOBAL__
# define __MAYBE_DEVICE__
#endif

#define _CHECK_CUDA_SUCCESS(message, ...)                           \
  do {                                                              \
    CUresult result = __VA_ARGS__;                                  \
    if (result != CUDA_SUCCESS) {                                   \
      auto msg = _FORMAT("\t!!CUDA_ERROR(%d): %s:%d\v%s",           \
                         result,                                    \
                         __FILE__,                                  \
                         __LINE__,                                  \
                         message);                                  \
      std::cerr << msg;                                             \
      throw msg;                                                    \
    }                                                               \
  } while (0)

#define _CHECK_CUBLAS_SUCCESS(message, ...)                         \
  do {                                                              \
    cublasStatus_t result = __VA_ARGS__;                            \
    if (result != 0) {                                              \
      auto msg = _FORMAT("\t!!CUBLAS_ERROR(%d): %s:%d\v%s",         \
                         result,                                    \
                         __FILE__,                                  \
                         __LINE__,                                  \
                         message);                                  \
      std::cerr << msg;                                             \
      throw msg;                                                    \
    }                                                               \
  } while (0)

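For illustration, a call site then looks like the following; unlike the bench-local macros, these variants print to std::cerr and throw the formatted std::string instead of returning from main (a sketch, assuming HAVE_CUDA and the _FORMAT macro are in scope):

    // Sketch (not in the commit): using the throwing check macros.
    CUdeviceptr buf;
    try {
      _CHECK_CUDA_SUCCESS("allocating scratch buffer",
                          cuMemAlloc(&buf, 1024 * sizeof(double)));
    } catch (std::string const& msg) {
      // msg already carries the CUresult code, file and line
      std::cerr << "aborting: " << msg << "\n";
    }
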
@@ -21,6 +21,7 @@

#include <atrip/Tuples.hpp>
#include <atrip/Utils.hpp>
#include <atrip/CUDA.hpp>

namespace atrip {

@@ -458,7 +459,11 @@ void unwrapAndMarkReady() {
#if defined(HAVE_CUDA)
    // copy the retrieved mpi data to the device
    WITH_CHRONO("cuda:memcpy",
                cuMemcpyHtoD(data, (void*)mpi_data, sizeof(F) * size);)
                _CHECK_CUDA_SUCCESS("copying mpi data to device",
                                    cuMemcpyHtoD(data,
                                                 (void*)mpi_data,
                                                 sizeof(F) * size));
                )
    std::free(mpi_data);
#endif

@@ -195,7 +195,28 @@ template <typename F=double>
        ;
      if (blank.info.state == Slice<F>::SelfSufficient) {
#if defined(HAVE_CUDA)
        blank.mpi_data = sources[from.source].data();
        const size_t _size = sizeof(F) * sources[from.source].size();
        // TODO: this is code duplication with downstairs
        if (freePointers.size() == 0) {
          std::stringstream stream;
          stream << "No more free pointers "
                 << "for type " << type
                 << " and name " << name
                 ;
          throw std::domain_error(stream.str());
        }
        auto dataPointer = freePointers.begin();
        blank.data = *dataPointer;  // read before erasing the iterator
        freePointers.erase(dataPointer);
        //
        //
        // TODO [#A]: do cuMemcpy of
        // sources[from.source].data() ⇒ blank.data
        // Do this when everything else is working.
        // This will probably be a bottleneck of the H-to-D communication,
        // as most slices are SelfSufficient.
        //
        //
#else
        blank.data = sources[from.source].data();
#endif

@@ -316,6 +337,16 @@ template <typename F=double>
        }
      }

#if defined(HAVE_CUDA)
      // In cuda, SelfSufficient slices have an ad-hoc pointer,
      // since it is a pointer on the device and has to be
      // brought back to the free pointer bucket of the SliceUnion.
      // Therefore, only in the Recycled case should the pointer
      // not be put back.
      if (slice.info.state == Slice<F>::Recycled) {
        freeSlicePointer = false;
      }
#else
      // if the slice is self sufficient, do not dare touching the
      // pointer, since it is a pointer to our sources in our rank.
      if ( slice.info.state == Slice<F>::SelfSufficient
@@ -323,6 +354,7 @@ template <typename F=double>
         ) {
        freeSlicePointer = false;
      }
#endif

      // make sure we get its data pointer to be used later
      // only for non-recycled, since it can be that we need
@@ -346,7 +378,7 @@ template <typename F=double>
      // << " info " << slice.info
         << "\n";
        slice.free();
      }
    } // we did not find the slice

  }
}

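The freePointers handling above is essentially a fixed pool of preallocated device buffers that slices borrow and return. A minimal standalone sketch of that pattern (hypothetical names, not the SliceUnion API):

    // Sketch: a fixed pool of device pointers.
    #include <set>
    #include <stdexcept>
    #include <cstdint>

    using DevPtr = std::uintptr_t;  // stand-in for CUdeviceptr

    struct PointerPool {
      std::set<DevPtr> free_;  // filled once with preallocated buffers

      DevPtr acquire() {
        if (free_.empty())
          throw std::domain_error("No more free pointers");
        auto it = free_.begin();
        DevPtr p = *it;    // read before erase; the iterator dies with it
        free_.erase(it);
        return p;
      }

      void release(DevPtr p) { free_.insert(p); }
    };
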
@@ -380,10 +412,17 @@ template <typename F=double>

    for (auto& ptr: sliceBuffers) {
#if defined(HAVE_CUDA)
      cuMemAlloc(&ptr, sizeof(F) * sources[0].size());
      if (ptr == 0UL) {
        throw "INSUFFICIENT MEMORY ON THE GRAPHICS CARD FOR FREE POINTERS";
      }
      const CUresult error =
        cuMemAlloc(&ptr, sizeof(F) * sources[0].size());
      if (ptr == 0UL) {
        throw "INSUFFICIENT MEMORY ON THE GRAPHICS CARD FOR FREE POINTERS";
      }
      if (error != CUDA_SUCCESS) {
        std::stringstream s;
        s << "Error allocating memory for slice buffers "
          << "code " << error << "\n";
        throw s.str();
      }
#else
      ptr = (DataPtr<F>)malloc(sizeof(F) * sources[0].size());
#endif

@@ -80,6 +80,16 @@ struct Timer {
using Timings = std::map<std::string, Timer>;
// Chrono:1 ends here

// A nice handy macro to do formatting
#define _FORMAT(_fmt, ...)                                          \
  ([&] (void) -> std::string {                                      \
    int _sz = std::snprintf(nullptr, 0, _fmt, __VA_ARGS__);         \
    std::vector<char> _out(_sz + 1);                                \
    std::snprintf(&_out[0], _out.size(), _fmt, __VA_ARGS__);        \
    return std::string(_out.data());                                \
  })()


// [[file:~/cuda/atrip/atrip.org::*Epilog][Epilog:1]]
}
// Epilog:1 ends here

@@ -24,7 +24,7 @@

using namespace atrip;
#if defined(HAVE_CUDA)
#include <cuda.h>
#include <atrip/CUDA.hpp>
#endif

template <typename F> bool RankMap<F>::RANK_ROUND_ROBIN;
@@ -49,11 +49,6 @@ void Atrip::init(MPI_Comm world) {
  Atrip::communicator = world;
  MPI_Comm_rank(world, (int*)&Atrip::rank);
  MPI_Comm_size(world, (int*)&Atrip::np);

#if defined(HAVE_CUDA)
  Atrip::cuda.status = cublasCreate(&Atrip::cuda.handle);
#endif

}

template <typename F>

@@ -71,18 +66,24 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {

#if defined(HAVE_CUDA)
  int ngcards;
  _CHECK_CUDA_SUCCESS("initializing cuda",
                      cuInit(0));
  _CHECK_CUDA_SUCCESS("getting device count",
                      cuDeviceGetCount(&ngcards));
  const auto clusterInfo = getClusterInfo(Atrip::communicator);
  cuDeviceGetCount(&ngcards);
  LOG(0,"Atrip") << "ngcards: " << ngcards << "\n";
  if (clusterInfo.ranksPerNode > ngcards) {
    std::cerr << "ATRIP: You are running on more ranks per node than the number of graphic cards\n"
              << "You have " << ngcards << " cards at your disposal\n";
    throw "";
  }
  if (clusterInfo.ranksPerNode < ngcards) {
    std::cerr << "You have " << ngcards << " cards at your disposal\n"
              << "You will be only using " << clusterInfo.ranksPerNode
              << ", i.e., the number of ranks.\n";
    const auto msg
      = _FORMAT("ATRIP: You are running on more ranks per node than the number of graphic cards\n"
                "You have %d cards at your disposal\n", ngcards);
    std::cerr << msg;
    throw msg;
  } else if (clusterInfo.ranksPerNode < ngcards) {
    const auto msg
      = _FORMAT("You have %d cards at your disposal.\n"
                "You will be only using %d, i.e., the number of ranks\n",
                ngcards, clusterInfo.ranksPerNode);
    std::cerr << msg;
  }

@@ -94,16 +95,27 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
    struct { struct { size_t free, total; } avail; size_t total; } memory;
    char *name = (char*)malloc(256);

    // - TODO :: we should check that the assignment of graphic cards
    //   to nodes works as expected, i.e., node k should get from 0
    //   to ngcards with the formula =rank % ngcards=.

    // set current device
    cuDeviceGet(&dev, rank);
    cuCtxCreate(&ctx, 0, dev);
    cuCtxSetCurrent(ctx);
    _CHECK_CUDA_SUCCESS("getting device for index <rank>",
                        cuDeviceGet(&dev, rank % ngcards));
    _CHECK_CUDA_SUCCESS("creating a cuda context",
                        cuCtxCreate(&ctx, 0, dev));
    _CHECK_CUDA_SUCCESS("setting the context",
                        cuCtxSetCurrent(ctx));

    // get information of the device
    cuDeviceGetProperties(&prop, dev);
    cuMemGetInfo(&memory.avail.free, &memory.avail.total);
    cuDeviceGetName(name, 256, dev);
    cuDeviceTotalMem(&memory.total, dev);
    _CHECK_CUDA_SUCCESS("getting properties of current device",
                        cuDeviceGetProperties(&prop, dev));
    _CHECK_CUDA_SUCCESS("getting memory information",
                        cuMemGetInfo(&memory.avail.free, &memory.avail.total));
    _CHECK_CUDA_SUCCESS("getting name",
                        cuDeviceGetName(name, 256, dev));
    _CHECK_CUDA_SUCCESS("getting total memory",
                        cuDeviceTotalMem(&memory.total, dev));

    printf("\n"
           "CUDA CARD RANK %d\n"
@@ -124,6 +136,10 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
           memory.total / 1024.0 / 1024.0 / 1024.0
           );
    std::free((void*)name);

    _CHECK_CUBLAS_SUCCESS("creating a cublas handle",
                          cublasCreate(&Atrip::cuda.handle));

  }
  MPI_Barrier(universe);
}

@@ -163,17 +179,27 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {

#if defined(HAVE_CUDA)
  DataPtr<F> Tai, epsi, epsa;
  //TODO: free memory pointers in the end of the algorithm
  cuMemAlloc(&Tai, sizeof(F) * _Tai.size());
  cuMemAlloc(&epsi, sizeof(F) * _epsi.size());
  cuMemAlloc(&epsa, sizeof(F) * _epsa.size());

  cuMemcpyHtoD(Tai, (void*)_Tai.data(), sizeof(F) * _Tai.size());
  cuMemcpyHtoD(epsi, (void*)_epsi.data(), sizeof(F) * _epsi.size());
  cuMemcpyHtoD(epsa, (void*)_epsa.data(), sizeof(F) * _epsa.size());
  // TODO: free memory pointers in the end of the algorithm

  cuMemAlloc(&Tijk, sizeof(F) * No * No * No);
  cuMemAlloc(&Zijk, sizeof(F) * No * No * No);
  _CHECK_CUDA_SUCCESS("Tai",
                      cuMemAlloc(&Tai, sizeof(F) * _Tai.size()));
  _CHECK_CUDA_SUCCESS("epsi",
                      cuMemAlloc(&epsi, sizeof(F) * _epsi.size()));
  _CHECK_CUDA_SUCCESS("epsa",
                      cuMemAlloc(&epsa, sizeof(F) * _epsa.size()));

  _CHECK_CUDA_SUCCESS("memcpy Tai",
                      cuMemcpyHtoD(Tai, (void*)_Tai.data(), sizeof(F) * _Tai.size()));
  _CHECK_CUDA_SUCCESS("memcpy epsi",
                      cuMemcpyHtoD(epsi, (void*)_epsi.data(), sizeof(F) * _epsi.size()));
  _CHECK_CUDA_SUCCESS("memcpy epsa",
                      cuMemcpyHtoD(epsa, (void*)_epsa.data(), sizeof(F) * _epsa.size()));

  _CHECK_CUDA_SUCCESS("Tijk",
                      cuMemAlloc(&Tijk, sizeof(F) * No * No * No));
  _CHECK_CUDA_SUCCESS("Zijk",
                      cuMemAlloc(&Zijk, sizeof(F) * No * No * No));
#else
  std::vector<F> &Tai = _Tai, &epsi = _epsi, &epsa = _epsa;
  Zijk = (DataFieldType<F>*)malloc(No*No*No * sizeof(DataFieldType<F>));
@@ -266,8 +292,8 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {

  auto const isFakeTuple
    = [&tuplesList, distribution](size_t const i) {
    return distribution->tupleIsFake(tuplesList[i]);
  };
      return distribution->tupleIsFake(tuplesList[i]);
    };


  using Database = typename Slice<F>::Database;

|
||||
// [[file:~/cuda/atrip/atrip.org::*Blas][Blas:2]]
|
||||
#include <atrip/Blas.hpp>
|
||||
#include <atrip/Atrip.hpp>
|
||||
#include <atrip/CUDA.hpp>
|
||||
|
||||
#if defined(HAVE_CUDA)
|
||||
# include <cstring>
|
||||
|
||||
static inline
|
||||
cublasOperation_t char_to_cublasOperation(const char* trans) {
|
||||
if (strncmp("N", trans, 1) == 0)
|
||||
return CUBLAS_OP_N;
|
||||
else if (strncmp("T", trans, 1) == 0)
|
||||
return CUBLAS_OP_T;
|
||||
else
|
||||
return CUBLAS_OP_C;
|
||||
}
|
||||
static size_t dgem_call = 0;
|
||||
|
||||
static inline
|
||||
cublasOperation_t char_to_cublasOperation(const char* trans) {
|
||||
if (strncmp("N", trans, 1) == 0)
|
||||
return CUBLAS_OP_N;
|
||||
else if (strncmp("T", trans, 1) == 0)
|
||||
return CUBLAS_OP_T;
|
||||
else
|
||||
return CUBLAS_OP_C;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -49,13 +52,23 @@ namespace atrip {
                   typename DataField<double>::type *C,
                   const int *ldc) {
#if defined(HAVE_CUDA)
    cublasDgemm(Atrip::cuda.handle,
                char_to_cublasOperation(transa),
                char_to_cublasOperation(transb),
                *m, *n, *k,
                alpha, A, *lda,
                B, *ldb, beta,
                C, *ldc);
    // TODO: remove this verbose checking
    const cublasStatus_t error =
      cublasDgemm(Atrip::cuda.handle,
                  char_to_cublasOperation(transa),
                  char_to_cublasOperation(transb),
                  *m, *n, *k,
                  alpha, A, *lda,
                  B, *ldb, beta,
                  C, *ldc);
    if (error != 0) printf(":%-3ld (%4ldth) ERR<%4d> cublasDgemm: "
                           "A = %20ld "
                           "B = %20ld "
                           "C = %20ld "
                           "\n",
                           Atrip::rank,
                           dgem_call++,
                           error, A, B, C);
#else
    dgemm_(transa, transb,
           m, n, k,

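Since the wrapper keeps the Fortran BLAS calling convention (every scalar passed by pointer, column-major data), existing dgemm_ call sites keep working whether or not the CUDA branch is active. A hedged sketch of a call site, assuming the standard dgemm argument order for the middle parameters and the host (non-CUDA) build:

    // Sketch (not in the commit): column-major 4x4 matrices.
    int m = 4, n = 4, k = 4, lda = 4, ldb = 4, ldc = 4;
    double alpha = 1.0, beta = 0.0;
    double A[16] = {0}, B[16] = {0}, C[16] = {0};
    atrip::xgemm<double>("N", "T", &m, &n, &k,
                         &alpha, A, &lda,
                         B, &ldb, &beta,
                         C, &ldc);
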
@@ -83,16 +96,17 @@ namespace atrip {
    cuDoubleComplex
      cu_alpha = {std::real(*alpha), std::imag(*alpha)},
      cu_beta = {std::real(*beta), std::imag(*beta)};
    cublasZgemm(Atrip::cuda.handle,
                char_to_cublasOperation(transa),
                char_to_cublasOperation(transb),
                *m, *n, *k,
                &cu_alpha,

                A, *lda,
                B, *ldb,
                &cu_beta,
                C, *ldc);

    _CHECK_CUBLAS_SUCCESS("cublasZgemm",
                          cublasZgemm(Atrip::cuda.handle,
                                      char_to_cublasOperation(transa),
                                      char_to_cublasOperation(transb),
                                      *m, *n, *k,
                                      &cu_alpha,
                                      A, *lda,
                                      B, *ldb,
                                      &cu_beta,
                                      C, *ldc));
#else
    zgemm_(transa, transb,
           m, n, k,

@@ -14,31 +14,13 @@

// [[file:~/cuda/atrip/atrip.org::*Prolog][Prolog:2]]
#include <atrip/Equations.hpp>
#include <atrip/CUDA.hpp>

#if defined(HAVE_CUDA)
#include <cuda.h>
#endif
#include <atrip/CUDA.hpp>

namespace atrip {
// Prolog:2 ends here


// These are just helper structures
// for the templating of the reorder
// function
enum reordering_t
  {
    IJK,
    IKJ,
    JIK,
    JKI,
    KIJ,
    KJI
  };

template <typename F, reordering_t R>
struct reorder_proxy {};

#ifdef HAVE_CUDA
namespace cuda {

@@ -101,49 +83,42 @@ namespace cuda {
  template <>
  __device__
  cuDoubleComplex multiply(const cuDoubleComplex &a, const cuDoubleComplex &b) {
  return
    {a.x * b.x - a.y * b.y,
     a.x * b.y + a.y * b.x};
    return
      {a.x * b.x - a.y * b.y,
       a.x * b.y + a.y * b.x};
  }

  template <typename F>
  __device__
  void sum_in_place(F* to, const F* b);
  void sum_in_place(F* to, const F* from);

  template <>
  __device__
  void sum_in_place(double* to, const double *b) { *to += *b; }
  void sum_in_place(double* to, const double *from) { *to += *from; }

  template <>
  __device__
  void sum_in_place(cuDoubleComplex* to, const cuDoubleComplex* b) {
    to->x += b->x;
    to->y += b->y;
  }

  __device__
  cuDoubleComplex& operator+=(cuDoubleComplex& lz, cuDoubleComplex const& rz) {
    lz.x += rz.x;
    lz.y += rz.y;
    return lz;
  void sum_in_place(cuDoubleComplex* to, const cuDoubleComplex* from) {
    to->x += from->x;
    to->y += from->y;
  }

};
#endif

#if defined(HAVE_CUDA)
#define LIMS_KS()                                                   \
  size_t kmin = blockIdx.x * blockDim.x + threadIdx.x,              \
         k = kmin,                                                  \
         idx = kmin * size * size * size                            \
         ;                                                          \
  k < ((kmin < size) ? kmin + 1 : size)
#define FOR_K()                                                     \
  for (size_t kmin = blockIdx.x * blockDim.x + threadIdx.x,         \
              k = kmin,                                             \
              idx = kmin * size * size * size;                      \
       k < ((kmin < size) ? kmin + 1 : size);                       \
       k++)
#else
#define LIMS_KS size_t k=0, idx=0; k < size
#define FOR_K() for (size_t k=0, idx=0; k < size; k++)
#endif
#define _IJK_(i, j, k) i + j*size + k*size*size
#define _REORDER_BODY_(...)                                         \
  for (LIMS_KS() ; k++)                                             \
  FOR_K()                                                           \
    for (size_t j = 0; j < size; j++)                               \
      for (size_t i = 0; i < size; i++, idx++) {                    \
        __VA_ARGS__                                                 \

@@ -152,7 +127,9 @@ namespace cuda {
  template <typename F>                                             \
  __MAYBE_GLOBAL__                                                  \
  void reorder(reorder_proxy< F, _enum > p,                         \
               size_t size, F* to, F* from) {                       \
               size_t size,                                         \
               F* to,                                               \
               F* from) {                                           \
    _REORDER_BODY_(__VA_ARGS__)                                     \
  }
#if defined(HAVE_CUDA)
@@ -161,9 +138,31 @@ namespace cuda {
#define GO(__TO, __FROM) __TO += __FROM;
#endif

// These are just helper structures
// for the templating of the reorder
// function
enum reordering_t
  {
    IJK,
    IKJ,
    JIK,
    JKI,
    KIJ,
    KJI
  };

/*
 * Please the c++ type checker and template creator
 * in order to have an argument in the signature of
 * the function that helps the compiler know which
 * instantiation it should take.
 *
 */
template <typename F, reordering_t R>
struct reorder_proxy {};

template <typename F, reordering_t R>
__MAYBE_GLOBAL__ \
__MAYBE_GLOBAL__
void reorder(reorder_proxy<F, R> proxy,
             size_t size, F* to, F* from);

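The reorder_proxy comment above describes plain tag dispatch: an empty type whose template argument steers overload resolution, so the compiler knows which instantiation to pick. A self-contained sketch of the idea (hypothetical names, not the atrip API):

    // Sketch: an empty proxy type carries the compile-time reordering.
    #include <cstdio>

    enum reordering_t { IJK, IKJ };

    template <reordering_t R> struct proxy {};

    void reorder(proxy<IJK>) { std::puts("i, j, k order"); }
    void reorder(proxy<IKJ>) { std::puts("i, k, j order"); }

    int main() {
      reorder(proxy<IJK>{});  // resolved at compile time
      reorder(proxy<IKJ>{});
      return 0;
    }
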
@@ -445,10 +444,14 @@ double getEnergySame

#if defined(ATRIP_USE_DGEMM)
#if defined(HAVE_CUDA)
#define REORDER(__II, __JJ, __KK)                                   \
  reorder<<<bs, ths>>>(reorder_proxy<DataFieldType<F>,              \
                       __II ## __JJ ## __KK >{},                    \
                       No, Tijk, _t_buffer);
#define REORDER(__II, __JJ, __KK)                                   \
  reorder<<<bs, ths>>>(reorder_proxy<                               \
                         DataFieldType<F>,                          \
                         __II ## __JJ ## __KK                       \
                       >{},                                         \
                       No,                                          \
                       Tijk,                                        \
                       _t_buffer);
#define DGEMM_PARTICLES(__A, __B)                                   \
  atrip::xgemm<F>("T",                                              \
                  "N",                                              \
@@ -462,7 +465,7 @@ double getEnergySame
                  (int const*)&Nv,                                  \
                  &zero,                                            \
                  _t_buffer,                                        \
                  (int const*)&NoNo);
                  (int const*)&NoNo)
#define DGEMM_HOLES(__A, __B, __TRANSB)                             \
  atrip::xgemm<F>("N",                                              \
                  __TRANSB,                                         \
@@ -477,7 +480,7 @@ double getEnergySame
                  &zero,                                            \
                  _t_buffer,                                        \
                  (int const*)&NoNo                                 \
                  );
                  )
#define MAYBE_CONJ(_conj, _buffer)                                  \
  cuda::maybeConjugate<<<                                           \
    Atrip::kernelDimensions.ooo.blocks,                             \
@@ -512,7 +515,7 @@ double getEnergySame
                  &zero,                                            \
                  _t_buffer,                                        \
                  (int const*)&NoNo                                 \
                  );
                  )
#define DGEMM_HOLES(__A, __B, __TRANSB)                             \
  atrip::xgemm<F>("N",                                              \
                  __TRANSB,                                         \
@@ -527,7 +530,7 @@ double getEnergySame
                  &zero,                                            \
                  _t_buffer,                                        \
                  (int const*)&NoNo                                 \
                  );
                  )
#define MAYBE_CONJ(_conj, _buffer)                                  \
  for (size_t __i = 0; __i < NoNoNo; ++__i)                         \
    _conj[__i] = maybeConjugate<F>(_buffer[__i]);

@@ -538,8 +541,14 @@ double getEnergySame
#ifdef HAVE_CUDA
    DataFieldType<F>* _t_buffer;
    DataFieldType<F>* _vhhh;
    cuMemAlloc((CUdeviceptr*)&_t_buffer, NoNoNo * sizeof(DataFieldType<F>));
    cuMemAlloc((CUdeviceptr*)&_vhhh, NoNoNo * sizeof(DataFieldType<F>));
    WITH_CHRONO("double:cuda:alloc",
                _CHECK_CUDA_SUCCESS("Allocating _t_buffer",
                                    cuMemAlloc((CUdeviceptr*)&_t_buffer,
                                               NoNoNo * sizeof(DataFieldType<F>)));
                _CHECK_CUDA_SUCCESS("Allocating _vhhh",
                                    cuMemAlloc((CUdeviceptr*)&_vhhh,
                                               NoNoNo * sizeof(DataFieldType<F>)));
                )
    const size_t
      bs = Atrip::kernelDimensions.ooo.blocks,
      ths = Atrip::kernelDimensions.ooo.threads;
@@ -560,7 +569,6 @@ double getEnergySame
    WITH_CHRONO("double:reorder",
                cuda::zeroing<<<bs, ths>>>((DataFieldType<F>*)Tijk,
                                           NoNoNo);
                // synchronize all initializations to zero
                )
#else
    WITH_CHRONO("double:reorder",

@@ -575,37 +583,37 @@ double getEnergySame
      // VhhhC[i + k*No + L*NoNo] * TABhh[L + j*No]; H1
      MAYBE_CONJ(_vhhh, VhhhC)
      WITH_CHRONO("doubles:holes:1",
                  DGEMM_HOLES(_vhhh, TABhh, "N")
                  REORDER(I, K, J)
                  DGEMM_HOLES(_vhhh, TABhh, "N");
                  REORDER(I, K, J)
                  )
      // VhhhC[j + k*No + L*NoNo] * TABhh[i + L*No]; H0
      WITH_CHRONO("doubles:holes:2",
                  DGEMM_HOLES(_vhhh, TABhh, "T")
                  REORDER(J, K, I)
                  DGEMM_HOLES(_vhhh, TABhh, "T");
                  REORDER(J, K, I)
                  )

      // VhhhB[i + j*No + L*NoNo] * TAChh[L + k*No]; H5
      MAYBE_CONJ(_vhhh, VhhhB)
      WITH_CHRONO("doubles:holes:3",
                  DGEMM_HOLES(_vhhh, TAChh, "N")
                  REORDER(I, J, K)
                  DGEMM_HOLES(_vhhh, TAChh, "N");
                  REORDER(I, J, K)
                  )
      // VhhhB[k + j*No + L*NoNo] * TAChh[i + L*No]; H3
      WITH_CHRONO("doubles:holes:4",
                  DGEMM_HOLES(_vhhh, TAChh, "T")
                  REORDER(K, J, I)
                  DGEMM_HOLES(_vhhh, TAChh, "T");
                  REORDER(K, J, I)
                  )

      // VhhhA[j + i*No + L*NoNo] * TBChh[L + k*No]; H1
      MAYBE_CONJ(_vhhh, VhhhA)
      WITH_CHRONO("doubles:holes:5",
                  DGEMM_HOLES(_vhhh, TBChh, "N")
                  REORDER(J, I, K)
                  DGEMM_HOLES(_vhhh, TBChh, "N");
                  REORDER(J, I, K)
                  )
      // VhhhA[k + i*No + L*NoNo] * TBChh[j + L*No]; H4
      WITH_CHRONO("doubles:holes:6",
                  DGEMM_HOLES(_vhhh, TBChh, "T")
                  REORDER(K, I, J)
                  DGEMM_HOLES(_vhhh, TBChh, "T");
                  REORDER(K, I, J)
                  )
    }
    )

@@ -616,33 +624,33 @@ double getEnergySame
    {
      // TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv]; P0
      WITH_CHRONO("doubles:particles:1",
                  DGEMM_PARTICLES(TAphh, VBCph)
                  REORDER(I, J, K)
                  DGEMM_PARTICLES(TAphh, VBCph);
                  REORDER(I, J, K)
                  )
      // TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv]; P3
      WITH_CHRONO("doubles:particles:2",
                  DGEMM_PARTICLES(TAphh, VCBph)
                  REORDER(I, K, J)
                  DGEMM_PARTICLES(TAphh, VCBph);
                  REORDER(I, K, J)
                  )
      // TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv]; P5
      WITH_CHRONO("doubles:particles:3",
                  DGEMM_PARTICLES(TCphh, VABph)
                  REORDER(K, I, J)
                  DGEMM_PARTICLES(TCphh, VABph);
                  REORDER(K, I, J)
                  )
      // TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv]; P2
      WITH_CHRONO("doubles:particles:4",
                  DGEMM_PARTICLES(TCphh, VBAph)
                  REORDER(K, J, I)
                  DGEMM_PARTICLES(TCphh, VBAph);
                  REORDER(K, J, I)
                  )
      // TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv]; P1
      WITH_CHRONO("doubles:particles:5",
                  DGEMM_PARTICLES(TBphh, VACph)
                  REORDER(J, I, K)
                  DGEMM_PARTICLES(TBphh, VACph);
                  REORDER(J, I, K)
                  )
      // TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv]; P4
      WITH_CHRONO("doubles:particles:6",
                  DGEMM_PARTICLES(TBphh, VCAph)
                  REORDER(J, K, I)
                  DGEMM_PARTICLES(TBphh, VCAph);
                  REORDER(J, K, I)
                  )
    }
    )

@@ -651,9 +659,12 @@ double getEnergySame
#ifdef HAVE_CUDA
    // we need to synchronize here since we need
    // the Tijk for the next process in the pipeline
    cuCtxSynchronize();
    cuMemFree((CUdeviceptr)_vhhh);
    cuMemFree((CUdeviceptr)_t_buffer);
    _CHECK_CUDA_SUCCESS("Synchronizing",
                        cuCtxSynchronize());
    _CHECK_CUDA_SUCCESS("Freeing _vhhh",
                        cuMemFree((CUdeviceptr)_vhhh));
    _CHECK_CUDA_SUCCESS("Freeing _t_buffer",
                        cuMemFree((CUdeviceptr)_t_buffer));
#else
    free(_vhhh);
    free(_t_buffer);