Compare commits


No commits in common. "be96e4bf8c14de0c554136cce76298f810af9703" and "4af47a0bb79ea8a74a251fc76501d9fbcc1e4fc8" have entirely different histories.

5 changed files with 36 additions and 82 deletions

In short: the target side makes every CUDA build stage incoming slices through a host-side mpi_data buffer (ATRIP_SOURCES_IN_GPU now only tightens the CUDA-aware-MPI check), and it moves the temporary device buffers _t_buffer and _vhhh out of Atrip::run and back into the doubles-contribution routine, shortening its parameter list and re-enabling the local cuMemAlloc/cuMemFree and acc::zeroing calls there.

File 1 of 5:

@@ -103,11 +103,6 @@ void singlesContribution
   // -- TIJK
   // , DataPtr<F> Tijk
   , DataFieldType<F>* Tijk_
-#if defined(HAVE_CUDA)
-  // -- tmp buffers
-  , DataFieldType<F>* _t_buffer
-  , DataFieldType<F>* _vhhh
-#endif
   );
 // Doubles contribution:1 ends here

File 2 of 5:

@@ -352,7 +352,7 @@ Info info;
 // [[file:~/cuda/atrip/atrip.org::*Attributes][Attributes:2]]
 DataPtr<F> data;
-#if defined(HAVE_CUDA) && !defined (ATRIP_SOURCES_IN_GPU)
+#if defined(HAVE_CUDA)
 F* mpi_data;
 #endif
 // Attributes:2 ends here

@@ -456,7 +456,7 @@ void unwrapAndMarkReady() {
 if (errorCode != MPI_SUCCESS)
   throw "Atrip: Unexpected error MPI ERROR";
-#if defined(HAVE_CUDA) && !defined(ATRIP_SOURCES_IN_GPU)
+#if defined(HAVE_CUDA)
 // copy the retrieved mpi data to the device
 WITH_CHRONO("cuda:memcpy",
             _CHECK_CUDA_SUCCESS("copying mpi data to device",

@@ -488,7 +488,7 @@ void unwrapAndMarkReady() {
 Slice(size_t size_)
   : info({})
   , data(DataNullPtr)
-#if defined(HAVE_CUDA) && !defined(ATRIP_SOURCES_IN_GPU)
+#if defined(HAVE_CUDA)
   , mpi_data(nullptr)
 #endif
   , size(size_)
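The hunks above re-enable the host staging path for every CUDA build: each Slice carries a host-side mpi_data buffer next to its device-side data, and unwrapAndMarkReady uploads the received bytes to the device once MPI completes. A minimal sketch of that pattern, with hypothetical names (HostStagedSlice, upload_to_device) standing in for atrip's actual Slice machinery:

    #include <cuda.h>

    // Hypothetical stand-in for atrip's Slice<F>: a device buffer plus a
    // host staging buffer that MPI writes into.
    template <typename F>
    struct HostStagedSlice {
      CUdeviceptr data; // device-resident slice storage (DataPtr<F> in atrip)
      F* mpi_data;      // host staging buffer, nullptr until a fetch is posted
      size_t size;      // number of elements
    };

    // After the MPI request completes, copy the retrieved mpi data to the
    // device, as unwrapAndMarkReady does under WITH_CHRONO("cuda:memcpy", ...).
    template <typename F>
    void upload_to_device(HostStagedSlice<F>& slice) {
      cuMemcpyHtoD(slice.data, slice.mpi_data, sizeof(F) * slice.size);
    }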

File 3 of 5:

@@ -405,7 +405,6 @@ template <typename F=double>
   , sliceSize(std::accumulate(sliceLength.begin(),
                               sliceLength.end(),
                               1UL, std::multiplies<size_t>()))
 #if defined(ATRIP_SOURCES_IN_GPU)
   , sources(rankMap.nSources())
 #else

@@ -418,7 +417,6 @@ template <typename F=double>
 { // constructor begin
   LOG(0,"Atrip") << "INIT SliceUnion: " << name << "\n";
-  printf("sliceSize %d, number of slices %d\n\n\n", sliceSize, sources.size());
 #if defined(ATRIP_SOURCES_IN_GPU)
   for (auto& ptr: sources) {

@@ -573,11 +571,12 @@ template <typename F=double>
 if (slice.info.state == Slice<F>::Fetch) { // if-1
   // TODO: do it through the slice class
   slice.info.state = Slice<F>::Dispatched;
-#if defined(HAVE_CUDA) && defined(ATRIP_SOURCES_IN_GPU)
-# if !defined(ATRIP_CUDA_AWARE_MPI)
+#if defined(HAVE_CUDA)
+# if !defined(ATRIP_CUDA_AWARE_MPI) && defined(ATRIP_SOURCES_IN_GPU)
 # error "You need CUDA aware MPI to have slices on the GPU"
 # endif
-  MPI_Irecv((void*)slice.data,
+  slice.mpi_data = (F*)malloc(sizeof(F) * slice.size);
+  MPI_Irecv(slice.mpi_data,
 #else
   MPI_Irecv(slice.data,
 #endif
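The dispatch-side hunk is the counterpart of the Slice change: a fetch now always lands in a freshly malloc'ed host buffer, and CUDA-aware MPI is only demanded when ATRIP_SOURCES_IN_GPU asks MPI to write to device memory directly. A sketch of the new receive path, with mpi_data and size mirroring the diff and the remaining MPI arguments as placeholders:

    #include <mpi.h>
    #include <cstdlib>

    // Post a non-blocking receive into a new host buffer; the device upload
    // happens later (see the unwrapAndMarkReady sketch above).
    template <typename F>
    void post_fetch(F*& mpi_data, size_t size,
                    MPI_Datatype dtype, int source, int tag,
                    MPI_Comm comm, MPI_Request* request) {
      mpi_data = static_cast<F*>(std::malloc(sizeof(F) * size));
      MPI_Irecv(mpi_data, static_cast<int>(size), dtype,
                source, tag, comm, request);
    }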

File 4 of 5:

@@ -258,25 +258,6 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
   // all tensors
   std::vector< SliceUnion<F>* > unions = {&taphh, &hhha, &abph, &abhh, &tabhh};
-#ifdef HAVE_CUDA
-  // TODO: free buffers
-  DataFieldType<F>* _t_buffer;
-  DataFieldType<F>* _vhhh;
-  WITH_CHRONO("double:cuda:alloc",
-              _CHECK_CUDA_SUCCESS("Allocating _t_buffer",
-                                  cuMemAlloc((CUdeviceptr*)&_t_buffer,
-                                             No*No*No * sizeof(DataFieldType<F>)));
-              _CHECK_CUDA_SUCCESS("Allocating _vhhh",
-                                  cuMemAlloc((CUdeviceptr*)&_vhhh,
-                                             No*No*No * sizeof(DataFieldType<F>)));
-              )
-  //const size_t
-  //  bs = Atrip::kernelDimensions.ooo.blocks,
-  //ths = Atrip::kernelDimensions.ooo.threads;
-  //cuda::zeroing<<<bs, ths>>>((DataFieldType<F>*)_t_buffer, NoNoNo);
-  //cuda::zeroing<<<bs, ths>>>((DataFieldType<F>*)_vhhh, NoNoNo);
-#endif
   // get tuples for the current rank
   TuplesDistribution *distribution;

@@ -658,14 +639,7 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
   tabhh.unwrapSlice(Slice<F>::AC, abc),
   tabhh.unwrapSlice(Slice<F>::BC, abc),
   // -- TIJK
-  (DataFieldType<F>*)Tijk
-#if defined(HAVE_CUDA)
-  // -- tmp buffers
-  ,(DataFieldType<F>*)_t_buffer
-  ,(DataFieldType<F>*)_vhhh
-#endif
-  );
+  (DataFieldType<F>*)Tijk);
   WITH_RANK << iteration << "-th doubles done\n";
 ))
 }
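With these hunks the temporary device buffers no longer thread through Atrip::run; the doubles routine in the next file owns their entire lifecycle. Condensed, and with the _CHECK_CUDA_SUCCESS error handling elided, the lifecycle that file re-enables looks roughly like this (hypothetical helper name, NoNoNo = No*No*No as in the diff):

    #include <cuda.h>

    // Allocate, use, synchronize, free -- all inside the doubles contribution.
    void doubles_scratch_lifecycle(size_t NoNoNo) {
      CUdeviceptr t_buffer, vhhh;
      cuMemAlloc(&t_buffer, NoNoNo * sizeof(double)); // scratch buffer
      cuMemAlloc(&vhhh, NoNoNo * sizeof(double));     // scratch buffer
      // ... zero the buffers and run the contractions into Tijk ...
      cuCtxSynchronize(); // Tijk must be ready for the next pipeline step
      cuMemFree(vhhh);
      cuMemFree(t_buffer);
    }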

File 5 of 5:

@@ -401,15 +401,9 @@ void getEnergySame
   // -- TIJK
   // , DataPtr<F> Tijk_
   , DataFieldType<F>* Tijk_
-#if defined(HAVE_CUDA)
-  // -- tmp buffers
-  , DataFieldType<F>* _t_buffer
-  , DataFieldType<F>* _vhhh
-#endif
   ) {
-  const size_t a = abc[0], b = abc[1], c = abc[2]
-             , NoNo = No*No
-             ;
+  const size_t NoNo = No*No;
   DataFieldType<F>* Tijk = (DataFieldType<F>*)Tijk_;

@@ -523,21 +517,25 @@ void getEnergySame
 F one{1.0}, m_one{-1.0}, zero{0.0};
 const size_t NoNoNo = No*NoNo;
 #ifdef HAVE_CUDA
-  // DataFieldType<F>* _t_buffer;
-  // DataFieldType<F>* _vhhh;
-  // WITH_CHRONO("double:cuda:alloc",
-  //             _CHECK_CUDA_SUCCESS("Allocating _t_buffer",
-  //                                 cuMemAlloc((CUdeviceptr*)&_t_buffer,
-  //                                            NoNoNo * sizeof(DataFieldType<F>)));
-  //             _CHECK_CUDA_SUCCESS("Allocating _vhhh",
-  //                                 cuMemAlloc((CUdeviceptr*)&_vhhh,
-  //                                            NoNoNo * sizeof(DataFieldType<F>)));
-  //             )
-  // const size_t
-  //   bs = Atrip::kernelDimensions.ooo.blocks,
-  //   ths = Atrip::kernelDimensions.ooo.threads;
-  //cuda::zeroing<<<bs, ths>>>((DataFieldType<F>*)_t_buffer, NoNoNo);
-  //cuda::zeroing<<<bs, ths>>>((DataFieldType<F>*)_vhhh, NoNoNo);
+  DataFieldType<F>* _t_buffer;
+  DataFieldType<F>* _vhhh;
+  WITH_CHRONO("double:cuda:alloc",
+              _CHECK_CUDA_SUCCESS("Allocating _t_buffer",
+                                  cuMemAlloc((CUdeviceptr*)&_t_buffer,
+                                             NoNoNo * sizeof(DataFieldType<F>)));
+              _CHECK_CUDA_SUCCESS("Allocating _vhhh",
+                                  cuMemAlloc((CUdeviceptr*)&_vhhh,
+                                             NoNoNo * sizeof(DataFieldType<F>)));
+              )
+  const size_t
+    bs = Atrip::kernelDimensions.ooo.blocks,
+    ths = Atrip::kernelDimensions.ooo.threads;
+
+#if !defined(ATRIP_ONLY_DGEMM)
+  acc::zeroing<<<bs, ths>>>((DataFieldType<F>*)_t_buffer, NoNoNo);
+  acc::zeroing<<<bs, ths>>>((DataFieldType<F>*)_vhhh, NoNoNo);
+#endif
 #else
 DataFieldType<F>* _t_buffer = (DataFieldType<F>*)malloc(NoNoNo * sizeof(F));
 DataFieldType<F>* _vhhh = (DataFieldType<F>*)malloc(NoNoNo * sizeof(F));

@@ -651,12 +649,12 @@ void getEnergySame
 #ifdef HAVE_CUDA
   // we need to synchronize here since we need
   // the Tijk for next process in the pipeline
-  //_CHECK_CUDA_SUCCESS("Synchronizing",
-  //                    cuCtxSynchronize());
-  //_CHECK_CUDA_SUCCESS("Freeing _vhhh",
-  //                    cuMemFree((CUdeviceptr)_vhhh));
-  //_CHECK_CUDA_SUCCESS("Freeing _t_buffer",
-  //                    cuMemFree((CUdeviceptr)_t_buffer));
+  _CHECK_CUDA_SUCCESS("Synchronizing",
+                      cuCtxSynchronize());
+  _CHECK_CUDA_SUCCESS("Freeing _vhhh",
+                      cuMemFree((CUdeviceptr)_vhhh));
+  _CHECK_CUDA_SUCCESS("Freeing _t_buffer",
+                      cuMemFree((CUdeviceptr)_t_buffer));
 #else
   free(_vhhh);
   free(_t_buffer);

@@ -743,12 +741,6 @@ void getEnergySame
   , DataPtr<double> const TBChh
   // -- TIJK
   , DataFieldType<double>* Tijk
-#if defined(HAVE_CUDA)
-  // -- tmp buffers
-  , DataFieldType<double>* _t_buffer
-  , DataFieldType<double>* _vhhh
-#endif
   );
 template

@@ -777,12 +769,6 @@ void getEnergySame
   , DataPtr<Complex> const TBChh
   // -- TIJK
   , DataFieldType<Complex>* Tijk
-#if defined(HAVE_CUDA)
-  // -- tmp buffers
-  , DataFieldType<Complex>* _t_buffer
-  , DataFieldType<Complex>* _vhhh
-#endif
   );
 // Doubles contribution:2 ends here
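The acc::zeroing kernel invoked above is defined elsewhere in atrip's sources; a grid-stride kernel of roughly this shape (hypothetical, for illustration only) would clear the buffers the same way:

    // Hypothetical grid-stride zeroing kernel; atrip's actual acc::zeroing
    // may differ in detail.
    template <typename F>
    __global__ void zeroing(F* data, size_t n) {
      for (size_t i = blockIdx.x * (size_t)blockDim.x + threadIdx.x;
           i < n;
           i += (size_t)gridDim.x * blockDim.x)
        data[i] = F{}; // value-initialize: 0 for arithmetic types
    }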