diff --git a/include/atrip/Equations.hpp b/include/atrip/Equations.hpp
index f09d919..466b6c2 100644
--- a/include/atrip/Equations.hpp
+++ b/include/atrip/Equations.hpp
@@ -103,6 +103,11 @@ void singlesContribution
     // -- TIJK
     // , DataPtr<F> Tijk
     , DataFieldType<F>* Tijk_
+#if defined(HAVE_CUDA)
+    // -- caller-provided scratch buffers (CUDA builds only)
+    , DataFieldType<F>* _t_buffer
+    , DataFieldType<F>* _vhhh 
+#endif
     );
 // Doubles contribution:1 ends here
 
diff --git a/include/atrip/Slice.hpp b/include/atrip/Slice.hpp
index 3d12187..a24d23f 100644
--- a/include/atrip/Slice.hpp
+++ b/include/atrip/Slice.hpp
@@ -352,7 +352,7 @@ Info info;
 
 // [[file:~/cuda/atrip/atrip.org::*Attributes][Attributes:2]]
 DataPtr<F> data;
-#if defined(HAVE_CUDA && !defined ATRIP_SOURCES_IN_GPU)
+#if defined(HAVE_CUDA) && !defined(ATRIP_SOURCES_IN_GPU)
     F* mpi_data;
 #endif
 // Attributes:2 ends here
@@ -456,7 +456,7 @@ void unwrapAndMarkReady() {
       if (errorCode != MPI_SUCCESS)
         throw "Atrip: Unexpected error MPI ERROR";
 
-#if defined(HAVE_CUDA && !defined ATRIP_SOURCES_IN_GPU)
+#if defined(HAVE_CUDA) && !defined(ATRIP_SOURCES_IN_GPU)
       // copy the retrieved mpi data to the device
       WITH_CHRONO("cuda:memcpy",
                   _CHECK_CUDA_SUCCESS("copying mpi data to device",
@@ -488,7 +488,7 @@ void unwrapAndMarkReady() {
 Slice(size_t size_)
       : info({})
       , data(DataNullPtr)
-#if defined(HAVE_CUDA && !defined ATRIP_SOURCES_IN_GPU)
+#if defined(HAVE_CUDA) && !defined(ATRIP_SOURCES_IN_GPU)
       , mpi_data(nullptr)
 #endif
       , size(size_)
diff --git a/include/atrip/SliceUnion.hpp b/include/atrip/SliceUnion.hpp
index 8813c23..008c926 100644
--- a/include/atrip/SliceUnion.hpp
+++ b/include/atrip/SliceUnion.hpp
@@ -405,6 +405,7 @@ template <typename F=double>
               , sliceSize(std::accumulate(sliceLength.begin(),
                                           sliceLength.end(),
                                           1UL, std::multiplies<size_t>()))
+
 #if defined(ATRIP_SOURCES_IN_GPU)
               , sources(rankMap.nSources())
 #else
@@ -417,6 +418,7 @@ template <typename F=double>
     { // constructor begin
 
       LOG(0,"Atrip") << "INIT SliceUnion: " << name << "\n";
+        printf("sliceSize %zu, number of slices %zu\n\n\n", sliceSize, sources.size());
 
 #if defined(ATRIP_SOURCES_IN_GPU)
       for (auto& ptr: sources) {
@@ -571,7 +573,7 @@ template <typename F=double>
       if (slice.info.state == Slice<F>::Fetch) { // if-1
         // TODO: do it through the slice class
         slice.info.state = Slice<F>::Dispatched;
-#if defined(HAVE_CUDA && defined ATRIP_SOURCES_IN_GPU)
+#if defined(HAVE_CUDA) && defined(ATRIP_SOURCES_IN_GPU)
 #  if !defined(ATRIP_CUDA_AWARE_MPI) 
 #    error "You need CUDA aware MPI to have slices on the GPU"
 #  endif
diff --git a/src/atrip/Atrip.cxx b/src/atrip/Atrip.cxx
index 54c68a0..8265c0e 100644
--- a/src/atrip/Atrip.cxx
+++ b/src/atrip/Atrip.cxx
@@ -258,6 +258,25 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
   // all tensors
   std::vector< SliceUnion<F>* > unions = {&taphh, &hhha, &abph, &abhh, &tabhh};
 
+#ifdef HAVE_CUDA
+    // TODO: free _t_buffer and _vhhh (cuMemFree) before run() returns; this patch never releases them
+    DataFieldType<F>* _t_buffer;
+    DataFieldType<F>* _vhhh;
+    WITH_CHRONO("double:cuda:alloc",
+    _CHECK_CUDA_SUCCESS("Allocating _t_buffer",
+                        cuMemAlloc((CUdeviceptr*)&_t_buffer,
+                                   No*No*No * sizeof(DataFieldType<F>)));
+    _CHECK_CUDA_SUCCESS("Allocating _vhhh",
+                        cuMemAlloc((CUdeviceptr*)&_vhhh,
+                                   No*No*No * sizeof(DataFieldType<F>)));
+                )
+    // NOTE: the buffers are not zeroed in this hunk; the zeroing launches stay disabled:
+    //   const size_t bs  = Atrip::kernelDimensions.ooo.blocks,
+    //                ths = Atrip::kernelDimensions.ooo.threads;
+    //   cuda::zeroing<<<bs, ths>>>((DataFieldType<F>*)_t_buffer, NoNoNo);
+    //   cuda::zeroing<<<bs, ths>>>((DataFieldType<F>*)_vhhh, NoNoNo);
+#endif
+
   // get tuples for the current rank
   TuplesDistribution *distribution;
 
@@ -639,7 +658,14 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
                                          tabhh.unwrapSlice(Slice<F>::AC, abc),
                                          tabhh.unwrapSlice(Slice<F>::BC, abc),
                                          // -- TIJK
-                                         (DataFieldType<F>*)Tijk);
+                                         (DataFieldType<F>*)Tijk
+#if defined(HAVE_CUDA)
+                                         // -- pre-allocated scratch buffers (already DataFieldType<F>*, no cast needed)
+                                         , _t_buffer
+                                         , _vhhh
+#endif
+                                         );
+
                   WITH_RANK << iteration << "-th doubles done\n";
       ))
     }
diff --git a/src/atrip/Equations.cxx b/src/atrip/Equations.cxx
index 12be45d..9f42613 100644
--- a/src/atrip/Equations.cxx
+++ b/src/atrip/Equations.cxx
@@ -401,9 +401,15 @@ void getEnergySame
     // -- TIJK
     // , DataPtr<F> Tijk_
     , DataFieldType<F>* Tijk_
-    ) {
-
-    const size_t NoNo = No*No;
+#if defined(HAVE_CUDA)
+    // -- caller-provided scratch buffers (CUDA builds only)
+    , DataFieldType<F>* _t_buffer
+    , DataFieldType<F>* _vhhh 
+#endif
+     ) {
+    const size_t a = abc[0], b = abc[1], c = abc[2]
+              , NoNo = No*No
+              ;
 
     DataFieldType<F>* Tijk = (DataFieldType<F>*)Tijk_;
 
@@ -517,25 +523,21 @@ void getEnergySame
     F one{1.0}, m_one{-1.0}, zero{0.0};
     const size_t NoNoNo = No*NoNo;
 #ifdef HAVE_CUDA
-    DataFieldType<F>* _t_buffer;
-    DataFieldType<F>* _vhhh;
-    WITH_CHRONO("double:cuda:alloc",
-    _CHECK_CUDA_SUCCESS("Allocating _t_buffer",
-                        cuMemAlloc((CUdeviceptr*)&_t_buffer,
-                                   NoNoNo * sizeof(DataFieldType<F>)));
-    _CHECK_CUDA_SUCCESS("Allocating _vhhh",
-                        cuMemAlloc((CUdeviceptr*)&_vhhh,
-                                   NoNoNo * sizeof(DataFieldType<F>)));
-                )
-    const size_t
-      bs = Atrip::kernelDimensions.ooo.blocks,
-      ths = Atrip::kernelDimensions.ooo.threads;
-
-#if !defined(ATRIP_ONLY_DGEMM)
-    acc::zeroing<<<bs, ths>>>((DataFieldType<F>*)_t_buffer, NoNoNo);
-    acc::zeroing<<<bs, ths>>>((DataFieldType<F>*)_vhhh, NoNoNo);
-#endif
-
+// _t_buffer and _vhhh are function parameters in CUDA builds, so nothing is
+// allocated or zeroed here any more. Former per-call setup, kept for reference:
+//    WITH_CHRONO("double:cuda:alloc",
+//    _CHECK_CUDA_SUCCESS("Allocating _t_buffer",
+//                        cuMemAlloc((CUdeviceptr*)&_t_buffer,
+//                                   NoNoNo * sizeof(DataFieldType<F>)));
+//    _CHECK_CUDA_SUCCESS("Allocating _vhhh",
+//                        cuMemAlloc((CUdeviceptr*)&_vhhh,
+//                                   NoNoNo * sizeof(DataFieldType<F>)));
+//                )
+//    const size_t
+//      bs = Atrip::kernelDimensions.ooo.blocks,
+//      ths = Atrip::kernelDimensions.ooo.threads;
+//    acc::zeroing<<<bs, ths>>>((DataFieldType<F>*)_t_buffer, NoNoNo);
+//    acc::zeroing<<<bs, ths>>>((DataFieldType<F>*)_vhhh, NoNoNo);
 #else
     DataFieldType<F>* _t_buffer = (DataFieldType<F>*)malloc(NoNoNo * sizeof(F));
     DataFieldType<F>* _vhhh = (DataFieldType<F>*)malloc(NoNoNo * sizeof(F));
@@ -649,12 +651,12 @@ void getEnergySame
 #ifdef HAVE_CUDA
     // we need to synchronize here since we need
     // the Tijk for next process in the pipeline
-    _CHECK_CUDA_SUCCESS("Synchronizing",
-                        cuCtxSynchronize());
-    _CHECK_CUDA_SUCCESS("Freeing _vhhh",
-                        cuMemFree((CUdeviceptr)_vhhh));
-    _CHECK_CUDA_SUCCESS("Freeing _t_buffer",
-                        cuMemFree((CUdeviceptr)_t_buffer));
+    // NOTE(review): the synchronization is disabled although the comment above
+    // says it is needed -- confirm the caller synchronizes before Tijk is read.
+    // The buffers are not freed here any more: they are passed in as parameters.
+    //_CHECK_CUDA_SUCCESS("Synchronizing", cuCtxSynchronize());
+    //_CHECK_CUDA_SUCCESS("Freeing _vhhh", cuMemFree((CUdeviceptr)_vhhh));
+    //_CHECK_CUDA_SUCCESS("Freeing _t_buffer", cuMemFree((CUdeviceptr)_t_buffer));
 #else
     free(_vhhh);
     free(_t_buffer);
@@ -741,6 +743,12 @@ void getEnergySame
     , DataPtr<double> const TBChh
     // -- TIJK
     , DataFieldType<double>* Tijk
+#if defined(HAVE_CUDA)
+    // -- caller-provided scratch buffers (CUDA builds only)
+    , DataFieldType<double>* _t_buffer
+    , DataFieldType<double>* _vhhh 
+#endif
+
     );
 
   template
@@ -769,6 +777,12 @@ void getEnergySame
     , DataPtr<Complex> const TBChh
     // -- TIJK
     , DataFieldType<Complex>* Tijk
+#if defined(HAVE_CUDA)
+    // -- caller-provided scratch buffers (CUDA builds only)
+    , DataFieldType<Complex>* _t_buffer
+    , DataFieldType<Complex>* _vhhh 
+#endif
+
     );
 // Doubles contribution:2 ends here