diff --git a/bench/Makefile.am b/bench/Makefile.am
index 39ce3c3..8897ddf 100644
--- a/bench/Makefile.am
+++ b/bench/Makefile.am
@@ -1,3 +1,4 @@
+AUTOMAKE_OPTIONS = subdir-objects
 include $(top_srcdir)/atrip.mk
 
 AM_CPPFLAGS = -I$(top_srcdir)/include/ $(CTF_CPPFLAGS)
@@ -19,4 +20,13 @@ endif
 if WITH_CUDA
 test_main_CXXFLAGS = $(CUDA_CXXFLAGS)
 test_main_LDADD += $(CUDA_LDFLAGS)
+
+AM_CXXFLAGS = $(CUDA_CXXFLAGS)
+AM_LDFLAGS += $(CUDA_LDFLAGS)
+
+bin_PROGRAMS += test-cublas-parallel-atrip
+test_cublas_parallel_atrip_SOURCES = test-cublas-parallel-atrip.cxx
+
+bin_PROGRAMS += test-cuda-sanity
+test_cuda_sanity_SOURCES = test-cuda-sanity.cxx
 endif
diff --git a/bench/test-cublas-parallel-atrip.cxx b/bench/test-cublas-parallel-atrip.cxx
new file mode 100644
index 0000000..2215ff2
--- /dev/null
+++ b/bench/test-cublas-parallel-atrip.cxx
@@ -0,0 +1,239 @@
+#include <mpi.h>
+#include <unistd.h>
+#include <algorithm>
+#include <chrono>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <map>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <CLI11.hpp>
+
+#define CUBLASAPI
+#include <cuda.h>
+#include <cublas_v2.h>
+
+struct Timer {
+  using Clock = std::chrono::high_resolution_clock;
+  using Event = std::chrono::time_point<Clock>;
+  std::chrono::duration<double> duration;
+  Event _start;
+  inline void start() noexcept { _start = Clock::now(); }
+  inline void stop() noexcept { duration += Clock::now() - _start; }
+  inline void clear() noexcept { duration *= 0; }
+  inline double count() const noexcept { return duration.count(); }
+};
+using Timings = std::map<std::string, Timer>;
+
+#define _FORMAT(_fmt, ...)                                              \
+  ([&](void) -> std::string {                                           \
+    int _sz = std::snprintf(nullptr, 0, _fmt, __VA_ARGS__);             \
+    std::vector<char> _out(_sz + 1);                                    \
+    std::snprintf(&_out[0], _out.size(), _fmt, __VA_ARGS__);            \
+    return std::string(_out.data());                                    \
+  })()
+
+#define _CHECK_CUDA_SUCCESS(message, ...)                               \
+  do {                                                                  \
+    CUresult result = __VA_ARGS__;                                      \
+    printf("doing %s\n", message);                                      \
+    if (result != CUDA_SUCCESS) {                                       \
+      printf("\t!!CUDA_ERROR(%d): %s:%d %s\n",                          \
+             result,                                                    \
+             __FILE__,                                                  \
+             __LINE__,                                                  \
+             message);                                                  \
+      return 1;                                                         \
+    }                                                                   \
+  } while (0)
+
+#define _CHECK_CUBLAS_SUCCESS(message, ...)                             \
+  do {                                                                  \
+    cublasStatus_t result = __VA_ARGS__;                                \
+    if (result != 0) {                                                  \
+      printf("\t!!CUBLAS_ERROR(%d): %s:%d %s\n",                        \
+             result,                                                    \
+             __FILE__,                                                  \
+             __LINE__,                                                  \
+             message);                                                  \
+      return 1;                                                         \
+    }                                                                   \
+  } while (0)
+
+int main(int argc, char** argv) {
+
+  using std::vector;
+
+  MPI_Init(NULL, NULL);
+  int rank, np, ngcards;
+  size_t no(10), nv(no * 10), its(2);
+  bool barrier = false;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  MPI_Comm_size(MPI_COMM_WORLD, &np);
+
+  CLI::App app{"Main bench for atrip"};
+  app.add_option("--no", no, "Occupied orbitals");
+  app.add_option("--nv", nv, "Virtual orbitals");
+  app.add_option("--its", its, "Number of iterations to be done");
+  app.add_option("--barrier", barrier, "Call an MPI_Barrier in every iteration?");
+  CLI11_PARSE(app, argc, argv);
+
+  const size_t oo = no * no, ooo = no * oo;
+
+  Timings timings;
+
+  MPI_Barrier(MPI_COMM_WORLD);
+
+  _CHECK_CUDA_SUCCESS("init for cuda", cuInit(0));
+
+  _CHECK_CUDA_SUCCESS("get ncards", cuDeviceGetCount(&ngcards));
+
+  CUcontext ctx;
+  CUdevice dev;
+
+  char hostname[256];
+  gethostname(hostname, 256);
+  printf("%s with rank %d gets card %d\n", hostname, rank, rank % ngcards);
+
+  // set contexts: one context per rank, cards assigned round-robin
+  _CHECK_CUDA_SUCCESS("device get", cuDeviceGet(&dev, rank % ngcards));
+  _CHECK_CUDA_SUCCESS("creating context", cuCtxCreate(&ctx, 0, dev));
+  _CHECK_CUDA_SUCCESS("setting context", cuCtxSetCurrent(ctx));
+  _CHECK_CUDA_SUCCESS("synchronizing", cuCtxSynchronize());
+  MPI_Barrier(MPI_COMM_WORLD);
+
+  using host_slice_t = vector<double>;
+
+  // Device and host buffers mimicking atrip's slices: 3 x (nv*oo),
+  // 6 x (nv*no), 3 x oo, 3 x ooo and one ooo scratch slice.
+  vector<size_t> sizes = {nv * oo, nv * no, oo, oo * no, oo * no};
+  vector<CUdeviceptr> P_phh(3), P_ph(6), H_hh(3), H_hhh(3), T_hhh(1);
+  vector<vector<CUdeviceptr>*> slices_d = {&P_phh, &P_ph, &H_hh, &H_hhh, &T_hhh};
+  vector<vector<host_slice_t>> slices_h(slices_d.size());
+  {
+    int i = -1;
+    for (auto& v : slices_d) {
+      i++;
+      for (auto& ptr : *v) {
+        _CHECK_CUDA_SUCCESS("malloc",
+                            cuMemAlloc(&ptr, sizes[i] * sizeof(double)));
+        slices_h[i].push_back(std::vector<double>(sizes[i]));
+      }
+    }
+  }
+
+  const double one = 1.0, zero = 0.0;
+
+  printf("its: %ld\n", its);
+  printf("barrier: %d\n", barrier);
+  printf("no: %ld\n", no);
+  printf("nv: %ld\n", nv);
+  printf("SIZE %f GB\n",
+         (3 * nv * oo + 6 * oo * nv + 3 * oo + 3 * ooo + 1 * ooo)
+           * sizeof(double) / 1024.0 / 1024.0 / 1024.0);
+  std::map<std::string, double> tflopss
+    {{"dgemm", ooo * (no + nv) * 6.0 * 2.0 * its / 1e12},
+     {"holes", ooo * no * 6.0 * 2.0 * its / 1e12},
+     {"particles", ooo * nv * 6.0 * 2.0 * its / 1e12}};
+
+  cublasHandle_t handle;
+  _CHECK_CUBLAS_SUCCESS("handle create", cublasCreate(&handle));
+  printf("handle %p\n", (void*)handle);
+
+  timings["dgemm"].start();
+  for (size_t i = 0; i < its; i++) {
+
+    if (barrier) {
+      MPI_Barrier(MPI_COMM_WORLD);
+      timings["memcpy"].start();
+      for (size_t _s = 0; _s < slices_d.size(); _s++) {
+        // for (size_t _b = 0; _b < slices_h[_s].size(); _b++) {
+        for (size_t _b = 0; _b < 1; _b++) {
+          auto device = (*slices_d[_s])[_b];
+          auto host = slices_h[_s][_b].data();
+          // cuMemcpyHtoD expects a byte count
+          cuMemcpyHtoD(device, host, sizes[_s] * sizeof(double));
+        }
+      }
+      timings["memcpy"].stop();
+    }
+
+    // "holes" contractions: 6 DGEMMs per iteration with m = oo, n = no, k = no
+    timings["holes"].start();
+    for (size_t j = 0; j < 3; j++) {
+
+      _CHECK_CUBLAS_SUCCESS(" > 'geming ...",
+                            cublasDgemm(handle,
+                                        CUBLAS_OP_N,
+                                        CUBLAS_OP_N,
+                                        oo, no, no,
+                                        &one,
+                                        (double*)H_hhh[j], oo,
+                                        (double*)H_hh[j], no,
+                                        &zero,
+                                        (double*)T_hhh[0], oo));
+
+      _CHECK_CUBLAS_SUCCESS(" > 'geming ...",
+                            cublasDgemm(handle,
+                                        CUBLAS_OP_N,
+                                        CUBLAS_OP_T,
+                                        oo, no, no,
+                                        &one,
+                                        (double*)H_hhh[j], oo,
+                                        (double*)H_hh[j], no,
+                                        &zero,
+                                        (double*)T_hhh[0], oo));
+    }
+    timings["holes"].stop();
+
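+    // "particles" contractions below: 6 DGEMMs per iteration with
+    // m = oo, n = no, k = nv, i.e. in cuBLAS' column-major convention
+    //   T_hhh(oo x no) = P_phh(nv x oo)^T * P_ph(nv x no),
+    // so each call costs 2 * ooo * nv flops; this is where the
+    // "particles" entry of tflopss above comes from.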
+    timings["particles"].start();
+    for (size_t j = 0; j < 6; j++) {
+      _CHECK_CUBLAS_SUCCESS(" > 'geming ...",
+                            cublasDgemm(handle,
+                                        CUBLAS_OP_T,
+                                        CUBLAS_OP_N,
+                                        oo, no, nv,
+                                        &one,
+                                        (double*)P_phh[j % 3], nv,
+                                        (double*)P_ph[j], nv,
+                                        &zero,
+                                        (double*)T_hhh[0], oo));
+    }
+    timings["particles"].stop();
+
+    cuCtxSynchronize();
+
+  }
+
+  timings["dgemm"].stop();
+
+  printf("Performance: \n");
+  for (auto name : {"holes", "particles", "dgemm"})
+    printf("%10s TFlops: %4.1f\n",
+           name,
+           tflopss[name] / timings[name].count());
+
+  printf("Timings: \n");
+  for (auto const& kv : timings)
+    printf("%10s: %10f\n", kv.first.c_str(), kv.second.count());
+
+  MPI_Finalize();
+  return 0;
+}
diff --git a/bench/test-cublas-parallel.cxx b/bench/test-cublas-parallel.cxx
new file mode 100644
index 0000000..6fdfc93
--- /dev/null
+++ b/bench/test-cublas-parallel.cxx
@@ -0,0 +1,163 @@
+#include <mpi.h>
+#include <unistd.h>
+#include <algorithm>
+#include <chrono>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <map>
+#include <string>
+#include <utility>
+#include <vector>
+
+#define CUBLASAPI
+#include <cuda.h>
+#include <cublas_v2.h>
+
+struct Timer {
+  using Clock = std::chrono::high_resolution_clock;
+  using Event = std::chrono::time_point<Clock>;
+  std::chrono::duration<double> duration;
+  Event _start;
+  inline void start() noexcept { _start = Clock::now(); }
+  inline void stop() noexcept { duration += Clock::now() - _start; }
+  inline void clear() noexcept { duration *= 0; }
+  inline double count() const noexcept { return duration.count(); }
+};
+using Timings = std::map<std::string, Timer>;
+
+#define _FORMAT(_fmt, ...)                                              \
+  ([&](void) -> std::string {                                           \
+    int _sz = std::snprintf(nullptr, 0, _fmt, __VA_ARGS__);             \
+    std::vector<char> _out(_sz + 1);                                    \
+    std::snprintf(&_out[0], _out.size(), _fmt, __VA_ARGS__);            \
+    return std::string(_out.data());                                    \
+  })()
+
+#define _CHECK_CUDA_SUCCESS(message, ...)                               \
+  do {                                                                  \
+    CUresult result = __VA_ARGS__;                                      \
+    printf("doing %s\n", message);                                      \
+    if (result != CUDA_SUCCESS) {                                       \
+      printf("\t!!CUDA_ERROR(%d): %s:%d %s\n",                          \
+             result,                                                    \
+             __FILE__,                                                  \
+             __LINE__,                                                  \
+             message);                                                  \
+      return 1;                                                         \
+    }                                                                   \
+  } while (0)
+
+#define _CHECK_CUBLAS_SUCCESS(message, ...)                             \
+  do {                                                                  \
+    cublasStatus_t result = __VA_ARGS__;                                \
+    printf("CUBLAS: doing %s\n", message);                              \
+    if (result != 0) {                                                  \
+      printf("\t!!CUBLAS_ERROR(%d): %s:%d %s\n",                        \
+             result,                                                    \
+             __FILE__,                                                  \
+             __LINE__,                                                  \
+             message);                                                  \
+      return 1;                                                         \
+    }                                                                   \
+  } while (0)
+
+int main(int argc, char** argv) {
+  MPI_Init(NULL, NULL);
+  int rank, np, ngcards;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  MPI_Comm_size(MPI_COMM_WORLD, &np);
+
+  Timings timings;
+
+  MPI_Barrier(MPI_COMM_WORLD);
+
+  _CHECK_CUDA_SUCCESS("init for cuda", cuInit(0));
+
+  _CHECK_CUDA_SUCCESS("get ncards", cuDeviceGetCount(&ngcards));
+
+  // Matrix dimension, number of dgemm calls and the resulting flop count
+  // come from the command line: argv[1] = N, argv[2] = dgemms.
+  const size_t N = argc > 1 ? atoi(argv[1]) : 30000,
+               dgemms = argc > 2 ? atoi(argv[2]) : 2,
+               flops = 2 * N * N * N * dgemms;
+
+  CUcontext ctx;
+  CUdevice dev;
+
+  char hostname[256];
+  gethostname(hostname, 256);
+  printf("%s with rank %d gets card %d\n", hostname, rank, rank % ngcards);
+
+  // set contexts
+  _CHECK_CUDA_SUCCESS("device get", cuDeviceGet(&dev, rank % ngcards));
+  _CHECK_CUDA_SUCCESS("creating context", cuCtxCreate(&ctx, 0, dev));
+  _CHECK_CUDA_SUCCESS("setting context", cuCtxSetCurrent(ctx));
+  _CHECK_CUDA_SUCCESS("synchronizing", cuCtxSynchronize());
+  MPI_Barrier(MPI_COMM_WORLD);
+
+  CUdeviceptr A, B, C;
+  const double one = 1.0;
+  printf("SIZE %f GB\n", N * N * sizeof(double) / 1024.0 / 1024.0 / 1024.0);
+  _CHECK_CUDA_SUCCESS("A", cuMemAlloc(&A, N * N * sizeof(double)));
+  _CHECK_CUDA_SUCCESS("B", cuMemAlloc(&B, N * N * sizeof(double)));
+  _CHECK_CUDA_SUCCESS("C", cuMemAlloc(&C, N * N * sizeof(double)));
+
+  cublasHandle_t handle;
+  _CHECK_CUBLAS_SUCCESS("handle create", cublasCreate(&handle));
+  printf("handle %p\n", (void*)handle);
+
+  timings["dgemm"].start();
+  for (size_t i = 0; i < dgemms; i++) {
+    _CHECK_CUBLAS_SUCCESS(_FORMAT(" > 'geming %ld ...", i).c_str(),
+                          cublasDgemm(handle,
+                                      CUBLAS_OP_N,
+                                      CUBLAS_OP_N,
+                                      N, N, N,
+                                      &one,
+                                      (double*)A, N,
+                                      (double*)B, N,
+                                      &one,
+                                      (double*)C, N));
+  }
+
+  cuCtxSynchronize();
+  timings["dgemm"].stop();
+
+  printf("dgemm Gflops: %f\n",
+         flops / timings["dgemm"].count() / 1024.0 / 1024.0 / 1024.0);
+
+  MPI_Finalize();
+  return 0;
+}
+
+// Local Variables:
+// compile-command: "mpic++ \
+//    -pedantic -std=c++11 \
+//    -L./cudaroot/lib64 -lcuda \
+//    -L./cudaroot/lib64 -lcudart \
+//    -L./cudaroot/lib64 -lcublas \
+//    ./test-cublas-parallel.cxx -o test-cublas-parallel"
+// End:
diff --git a/bench/test-cuda-sanity.cxx b/bench/test-cuda-sanity.cxx
new file mode 100644
index 0000000..bbb9147
--- /dev/null
+++ b/bench/test-cuda-sanity.cxx
@@ -0,0 +1,104 @@
+#include <mpi.h>
+#include <unistd.h>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <string>
+#include <vector>
+
+#include <cuda.h>
+
+#define _CHECK_CUDA_SUCCESS(message, ...)                               \
+  do {                                                                  \
+    CUresult result = __VA_ARGS__;                                      \
+    printf("doing %s\n", message);                                      \
+    if (result != CUDA_SUCCESS) {                                       \
+      printf("\t!!CUDA_ERROR(%d): %s:%d %s\n",                          \
+             result,                                                    \
+             __FILE__,                                                  \
+             __LINE__,                                                  \
+             message);                                                  \
+      return 1;                                                         \
+    }                                                                   \
+  } while (0)
+
+int main() {
+  int rank, np, ngcards;
+  MPI_Init(NULL, NULL);
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  MPI_Comm_size(MPI_COMM_WORLD, &np);
+
+  _CHECK_CUDA_SUCCESS("init for cuda", cuInit(0));
+
+  _CHECK_CUDA_SUCCESS("get ncards", cuDeviceGetCount(&ngcards));
+
+  // Walk over every visible card, create a context on it and print
+  // basic properties plus the free/total memory it reports.
+  for (size_t rank = 0; rank < (size_t)ngcards; rank++) {
+    CUcontext ctx;
+    CUdevice dev;
+    CUdevprop_st prop;
+    size_t _free, total, total2;
+    char *name = (char*)malloc(256);
+
+    printf("Setting contexts\n");
+    // set contexts
+    _CHECK_CUDA_SUCCESS("device get", cuDeviceGet(&dev, rank));
+    _CHECK_CUDA_SUCCESS("creating context", cuCtxCreate(&ctx, 0, dev));
+    _CHECK_CUDA_SUCCESS("setting context", cuCtxSetCurrent(ctx));
+    _CHECK_CUDA_SUCCESS("synchronizing", cuCtxSynchronize());
+
+    _CHECK_CUDA_SUCCESS("prop get", cuDeviceGetProperties(&prop, dev));
+    _CHECK_CUDA_SUCCESS("meminfo get", cuMemGetInfo(&_free, &total));
+    _CHECK_CUDA_SUCCESS("name get", cuDeviceGetName(name, 256, dev));
+    _CHECK_CUDA_SUCCESS("totalmem get", cuDeviceTotalMem(&total2, dev));
+
+    printf("\n"
+           "CUDA CARD RANK %d\n"
+           "=================\n"
+           "\tname: %s\n"
+           "\tShared Mem Per Block (KB): %f\n"
+           "\tFree/Total mem (GB): %f/%f\n"
+           "\ttotal2 mem (GB): %f\n"
+           "\n",
+           dev,
+           name,
+           prop.sharedMemPerBlock / 1024.0,
+           _free / 1024.0 / 1024.0 / 1024.0,
+           total / 1024.0 / 1024.0 / 1024.0,
+           total2 / 1024.0 / 1024.0 / 1024.0);
+
+    if (_free == 0 || total == 0 || total2 == 0)
+      return 1;
+
+    CUdeviceptr data;
+    _CHECK_CUDA_SUCCESS("memalloc 1",
+                        cuMemAlloc(&data, sizeof(double) * 10000));
+    _CHECK_CUDA_SUCCESS("memalloc 2",
+                        cuMemAlloc(&data, sizeof(double) * 10000));
+  }
+
+  MPI_Finalize();
+
+  return 0;
+}
+
+// Local Variables:
+// compile-command: "mpic++ \
+//    -pedantic -std=c++11 \
+//    -L./cudaroot/lib64 -lcuda \
+//    -L./cudaroot/lib64 -lcudart \
+//    -L./cudaroot/lib64 -lcublas \
+//    ./mem.cxx -o mem"
+// End:
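
Usage sketch (illustrative only): with the automake rules above, test-cublas-parallel-atrip and test-cuda-sanity are built by the regular make run, while test-cublas-parallel.cxx carries its own compile-command in its trailing comment. Ranks are mapped to cards round-robin (rank % ngcards), so one MPI rank per GPU is the intended setup; the launcher name, rank count and option values below are assumptions, not taken from the patch:

    mpirun -np 4 ./test-cublas-parallel-atrip --no 20 --nv 200 --its 5 --barrier 1
    mpirun -np 4 ./test-cublas-parallel 10000 4
    mpirun -np 1 ./test-cuda-sanity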