Compare commits

...

7 Commits

6 changed files with 265 additions and 168 deletions

View File

@@ -58,8 +58,9 @@ int main(int argc, char** argv) {
CLI11_PARSE(app, argc, argv);
CTF::World world(argc, argv);
int rank;
int rank, nranks;
MPI_Comm_rank(world.comm, &rank);
MPI_Comm_size(world.comm, &nranks);
constexpr double elem_to_gb = 8.0 / 1024.0 / 1024.0 / 1024.0;
// USER PRINTING TEST BEGIN
@@ -108,6 +109,43 @@ int main(int argc, char** argv) {
}
}
size_t
f = sizeof(double)
, n_tuples = nv * (nv + 1) * (nv + 2) / 6 - nv
, atrip_memory
= /* tuples_memory */ 3 * sizeof(size_t) * n_tuples
//
// one dimensional slices (all ranks)
//
+ /* taphh */ f * nranks * 6 * nv * no * no
+ /* hhha */ f * nranks * 6 * no * no * no
//
// two dimensional slices (all ranks)
//
+ /* abph */ f * nranks * 12 * nv * no
+ /* abhh */ f * nranks * 6 * no * no
+ /* tabhh */ f * nranks * 6 * no * no
//
// distributed sources (all ranks)
//
+ /* tpphh */ f * nv * nv * no * no
+ /* vhhhp */ f * no * no * no * nv
+ /* vppph */ f * nv * nv * nv * no
+ /* vpphh */ f * nv * nv * no * no
+ /* tpphh2 */ f * nv * nv * no * no
//
// tensors in every rank
//
+ /* tijk */ f * nranks * no * no * no
+ /* zijk */ f * nranks * no * no * no
+ /* epsp */ f * nranks * (no + nv)
+ /* tai */ f * nranks * no * nv
;
if (atrip::Atrip::rank == 0)
std::cout << "Tentative MEMORY USAGE: " << atrip_memory << "\n";
std::vector<int> symmetries(4, NS)
, vo({nv, no})
, vvoo({nv, nv, no, no})
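For reference, the tentative memory estimate introduced above amounts to the following standalone helper (a sketch; the free-function form and the name atrip_memory_estimate are illustrative, not part of the repository):

    #include <cstddef>

    // Sketch: tentative ATRIP memory footprint in bytes, mirroring the
    // atrip_memory sum above. n_tuples counts the (a, b, c) triples:
    // nv*(nv+1)*(nv+2)/6 - nv.
    size_t atrip_memory_estimate(size_t no, size_t nv, size_t nranks) {
      const size_t f = sizeof(double);
      const size_t n_tuples = nv * (nv + 1) * (nv + 2) / 6 - nv;
      return 3 * sizeof(size_t) * n_tuples            // tuples list
           + f * nranks * 6 * nv * no * no            // taphh (1-d slices)
           + f * nranks * 6 * no * no * no            // hhha  (1-d slices)
           + f * nranks * (12 * nv * no               // abph  (2-d slices)
                           + 6 * no * no              // abhh
                           + 6 * no * no)             // tabhh
           + f * (3 * nv * nv * no * no               // tpphh, vpphh, tpphh2
                  + no * no * no * nv                 // vhhhp
                  + nv * nv * nv * no)                // vppph (sources)
           + f * nranks * (2 * no * no * no           // tijk, zijk
                           + (no + nv)                // epsp
                           + no * nv);                // tai
    }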
@@ -173,7 +211,7 @@ int main(int argc, char** argv) {
try {
auto out = atrip::Atrip::run(in);
if (atrip::Atrip::rank == 0)
std::cout << "Energy: " << out.energy << std::endl;
std::cout << "Energy: " << out.energy << std::endl;
} catch (const char* msg) {
if (atrip::Atrip::rank == 0)
std::cout << "Atrip throwed with msg:\n\t\t " << msg << "\n";

bootstrap.sh Executable file (44 lines)
View File

@@ -0,0 +1,44 @@
#!/usr/bin/env bash
type -a autoreconf > /dev/null ||
{
cat <<EOF && exit 1
You don't seem to have autotools installed; please install it:
- https://www.gnu.org/software/autoconf/
- https://www.gnu.org/software/automake/
EOF
}
cat <<EOF
Creating configure script
EOF
autoreconf -vif .
test -f configure || {
cat <<EOF
An error occurred and the configure script could not be generated!
EOF
exit 1
}
cat <<EOF
Now you can build by doing
mkdir build
cd build
../configure
make extern
make all
EOF

View File

@@ -40,7 +40,7 @@ AC_RUN_IFELSE([AC_LANG_SOURCE([_ATRIP_CUDA_MEMORY_OF_DEVICES])],
],
[
atrip_success=no
AC_MSG_ERROR([An available device reports zero memory available!])
AC_MSG_WARN([An available device reports zero memory available!])
])
CXX="$ac_save_CXX"
@@ -79,11 +79,11 @@ int main() {
cuMemAlloc((CUdeviceptr*)&F_d, els*sizeof(double));
stat = cublasDgemm(handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
oo, No, Nv,
&one,
HHP_d, oo, PH_d, Nv, &one, F_d, oo);
CUBLAS_OP_N,
CUBLAS_OP_N,
oo, No, Nv,
&one,
HHP_d, oo, PH_d, Nv, &one, F_d, oo);
//cudaSetDevice(rank);
return 0;
@@ -92,6 +92,7 @@ int main() {
m4_define([_ATRIP_CUDA_MEMORY_OF_DEVICES], [[
#include <mpi.h>
#include <iostream>
#include <cassert>
@@ -124,26 +125,27 @@ int main() {
cuDeviceTotalMem(&total2, dev);
printf("\n"
"CUDA CARD RANK %d\n"
"=================\n"
"\tname: %s\n"
"\tShared Mem Per Block (KB): %f\n"
"\tFree/Total mem (GB): %f/%f\n"
"\ttotal2 mem (GB): %f\n"
"\n",
dev,
name,
prop.sharedMemPerBlock / 1024.0,
_free / 1024.0 / 1024.0 / 1024.0 ,
total / 1024.0 / 1024.0 / 1024.0 ,
total2 / 1024.0 / 1024.0 / 1024.0
);
"CUDA CARD RANK %d\n"
"=================\n"
"\tname: %s\n"
"\tShared Mem Per Block (KB): %f\n"
"\tFree/Total mem (GB): %f/%f\n"
"\ttotal2 mem (GB): %f\n"
"\n",
dev,
name,
prop.sharedMemPerBlock / 1024.0,
_free / 1024.0 / 1024.0 / 1024.0 ,
total / 1024.0 / 1024.0 / 1024.0 ,
total2 / 1024.0 / 1024.0 / 1024.0
);
if (_free == 0 || total == 0 || total2 == 0)
return 1;
return 1;
}
return 0;
}
]])
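The check above boils down to querying each device's free and total memory through the CUDA driver API and warning when any card reports zero. Stripped of the m4 and MPI scaffolding, a minimal standalone version of the probe reads (a sketch with error handling trimmed):

    #include <cuda.h>
    #include <cstdio>

    // Sketch: report free/total memory of every visible CUDA device.
    int main() {
      int ndevices = 0;
      cuInit(0);
      cuDeviceGetCount(&ndevices);
      for (int i = 0; i < ndevices; i++) {
        CUdevice dev;
        CUcontext ctx;
        size_t free_b = 0, total_b = 0;
        cuDeviceGet(&dev, i);
        cuCtxCreate(&ctx, 0, dev);   // cuMemGetInfo needs a current context
        cuMemGetInfo(&free_b, &total_b);
        printf("device %d: Free/Total mem (GB): %f/%f\n",
               i,
               free_b / 1024.0 / 1024.0 / 1024.0,
               total_b / 1024.0 / 1024.0 / 1024.0);
        cuCtxDestroy(ctx);
        if (free_b == 0 || total_b == 0) return 1;  // what AC_MSG_WARN reports
      }
      return 0;
    }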

View File

@@ -457,7 +457,8 @@ void unwrapAndMarkReady() {
#if defined(HAVE_CUDA)
// copy the retrieved mpi data to the device
cuMemcpyHtoD(data, (void*)mpi_data, sizeof(F) * size);
WITH_CHRONO("cuda:memcpy",
cuMemcpyHtoD(data, (void*)mpi_data, sizeof(F) * size);)
std::free(mpi_data);
#endif
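The copy is now wrapped in WITH_CHRONO so the host-to-device transfer shows up in the timing report. The macro is defined elsewhere in atrip; a minimal sketch of the general shape (an assumption, not the repository's exact definition) is:

    #include <chrono>
    #include <map>
    #include <string>

    // Sketch of a WITH_CHRONO-style timer: accumulate the wall time of the
    // wrapped statements under a string key.
    static std::map<std::string, double> chrono_seconds;

    #define WITH_CHRONO(name, ...)                                  \
      {                                                             \
        auto _t0 = std::chrono::steady_clock::now();                \
        __VA_ARGS__                                                 \
        auto _t1 = std::chrono::steady_clock::now();                \
        chrono_seconds[name] +=                                     \
            std::chrono::duration<double>(_t1 - _t0).count();       \
      }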

View File

@@ -76,13 +76,13 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
LOG(0,"Atrip") << "ngcards: " << ngcards << "\n";
if (clusterInfo.ranksPerNode > ngcards) {
std::cerr << "ATRIP: You are running on more ranks per node than the number of graphic cards\n"
<< "You have " << ngcards << " cards at your disposal\n";
<< "You have " << ngcards << " cards at your disposal\n";
throw "";
}
if (clusterInfo.ranksPerNode < ngcards) {
std::cerr << "You have " << ngcards << " cards at your disposal\n"
<< "You will be only using " << clusterInfo.ranksPerNode
<< ", i.e., the nubmer of ranks.\n";
<< "You will be only using " << clusterInfo.ranksPerNode
<< ", i.e., the nubmer of ranks.\n";
}
@@ -106,23 +106,23 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
cuDeviceTotalMem(&memory.total, dev);
printf("\n"
"CUDA CARD RANK %d\n"
"=================\n"
"\tnumber: %1$d\n"
"\tname: %s\n"
"\tMem. clock rate (KHz): %d\n"
"\tShared Mem Per Block (KB): %f\n"
"\tAvail. Free/Total mem (GB): %f/%f\n"
"\tFree memory (GB): %f\n"
"\n",
Atrip::rank,
name,
prop.clockRate,
prop.sharedMemPerBlock / 1024.0,
memory.avail.free / 1024.0 / 1024.0 / 1024.0 ,
memory.avail.total / 1024.0 / 1024.0 / 1024.0,
memory.total / 1024.0 / 1024.0 / 1024.0
);
"CUDA CARD RANK %d\n"
"=================\n"
"\tnumber: %1$ld\n"
"\tname: %s\n"
"\tMem. clock rate (KHz): %ld\n"
"\tShared Mem Per Block (KB): %f\n"
"\tAvail. Free/Total mem (GB): %f/%f\n"
"\tFree memory (GB): %f\n"
"\n",
Atrip::rank,
name,
prop.clockRate,
prop.sharedMemPerBlock / 1024.0,
memory.avail.free / 1024.0 / 1024.0 / 1024.0 ,
memory.avail.total / 1024.0 / 1024.0 / 1024.0,
memory.total / 1024.0 / 1024.0 / 1024.0
);
std::free((void*)name);
}
MPI_Barrier(universe);
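The format fix above switches the numeric fields to "%1$ld": the POSIX positional form "n$" refers back to the n-th variadic argument, so Atrip::rank can feed both the rank header and the "number" field from a single argument slot. A tiny illustration (positional conversions are a POSIX extension, supported e.g. by glibc, not standard C++):

    #include <cstdio>

    int main() {
      long rank = 3;
      // "%1$ld" refers back to the first argument, so one argument slot
      // feeds two fields.
      printf("CUDA CARD RANK %1$ld\n\tnumber: %1$ld\n", rank);
      return 0;
    }

Worth noting: POSIX expects a format string to use either all-numbered or all-unnumbered conversions, and the string in the diff still appears to mix "%d"-style and "%1$ld"-style fields.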
@@ -249,19 +249,18 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
const size_t nIterations = tuplesList.size();
{
LOG(0,"Atrip") << "#iterations: "
<< nIterations
<< "/"
<< nIterations * np
<< "\n";
<< nIterations
<< "/"
<< nIterations * np
<< "\n";
}
const size_t
iterationMod = (in.percentageMod > 0)
? nIterations * in.percentageMod / 100.0
: in.iterationMod
, iteration1Percent = nIterations * 0.01
;
iterationMod = (in.percentageMod > 0)
? nIterations * in.percentageMod / 100.0
: in.iterationMod
, iteration1Percent = nIterations * 0.01
;
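iterationMod controls how often progress is reported: every in.iterationMod iterations by default, or a given percentage of the total when percentageMod is positive. The choice, as a small sketch (the function name is illustrative):

    #include <cstddef>

    // Sketch: progress-printing stride, mirroring the logic above.
    size_t iteration_mod(size_t n_iterations, int percentage_mod,
                         size_t fallback_mod) {
      return percentage_mod > 0
           ? static_cast<size_t>(n_iterations * percentage_mod / 100.0)
           : fallback_mod;
    }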
@@ -293,20 +292,20 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
WITH_CHRONO("oneshot-db:comm:allgather",
WITH_CHRONO("db:comm:allgather",
MPI_Allgather( ldb.data()
// , ldb.size() * sizeof(typename Slice<F>::LocalDatabaseElement)
, ldb.size()
, MPI_LDB_ELEMENT
, db.data()
// , ldb.size() * sizeof(typename Slice<F>::LocalDatabaseElement)
, ldb.size()
, MPI_LDB_ELEMENT
, c);
MPI_Allgather(ldb.data(),
/* ldb.size() * sizeof(typename
Slice<F>::LocalDatabaseElement) */
ldb.size(),
MPI_LDB_ELEMENT,
db.data(),
/* ldb.size() * sizeof(typename
Slice<F>::LocalDatabaseElement), */
ldb.size(),
MPI_LDB_ELEMENT,
c);
))
WITH_CHRONO("db:comm:type:free",
MPI_Type_free(&MPI_LDB_ELEMENT);
)
WITH_CHRONO("db:comm:type:free", MPI_Type_free(&MPI_LDB_ELEMENT);)
return db;
};
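The gather exchanges each rank's local database using a committed MPI datatype (MPI_LDB_ELEMENT) rather than the commented-out raw byte count, and frees the type right after the call. The same pattern, self-contained and with a stand-in element struct (illustrative, not Slice<F>::LocalDatabaseElement):

    #include <mpi.h>
    #include <vector>

    struct Element { int type; int info[3]; };  // stand-in for the LDB element

    int main(int argc, char** argv) {
      MPI_Init(&argc, &argv);
      int rank, np;
      MPI_Comm_rank(MPI_COMM_WORLD, &rank);
      MPI_Comm_size(MPI_COMM_WORLD, &np);

      // Describe Element as four contiguous ints and commit it.
      MPI_Datatype MPI_ELEMENT;
      MPI_Type_contiguous(4, MPI_INT, &MPI_ELEMENT);
      MPI_Type_commit(&MPI_ELEMENT);

      const Element e{rank, {0, 1, 2}};
      std::vector<Element> ldb(2, e);            // local database
      std::vector<Element> db(ldb.size() * np);  // gathered global database

      MPI_Allgather(ldb.data(), (int)ldb.size(), MPI_ELEMENT,
                    db.data(), (int)ldb.size(), MPI_ELEMENT,
                    MPI_COMM_WORLD);

      MPI_Type_free(&MPI_ELEMENT);  // freed right after use, as in the diff
      MPI_Finalize();
      return 0;
    }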
@@ -575,30 +574,29 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
)))
WITH_CHRONO("oneshot-doubles",
WITH_CHRONO("doubles",
doublesContribution<F>( abc, (size_t)No, (size_t)Nv
// -- VABCI
, abph.unwrapSlice(Slice<F>::AB, abc)
, abph.unwrapSlice(Slice<F>::AC, abc)
, abph.unwrapSlice(Slice<F>::BC, abc)
, abph.unwrapSlice(Slice<F>::BA, abc)
, abph.unwrapSlice(Slice<F>::CA, abc)
, abph.unwrapSlice(Slice<F>::CB, abc)
// -- VHHHA
, hhha.unwrapSlice(Slice<F>::A, abc)
, hhha.unwrapSlice(Slice<F>::B, abc)
, hhha.unwrapSlice(Slice<F>::C, abc)
// -- TA
, taphh.unwrapSlice(Slice<F>::A, abc)
, taphh.unwrapSlice(Slice<F>::B, abc)
, taphh.unwrapSlice(Slice<F>::C, abc)
// -- TABIJ
, tabhh.unwrapSlice(Slice<F>::AB, abc)
, tabhh.unwrapSlice(Slice<F>::AC, abc)
, tabhh.unwrapSlice(Slice<F>::BC, abc)
// -- TIJK
, (DataFieldType<F>*)Tijk
);
WITH_RANK << iteration << "-th doubles done\n";
doublesContribution<F>(abc, (size_t)No, (size_t)Nv,
// -- VABCI
abph.unwrapSlice(Slice<F>::AB, abc),
abph.unwrapSlice(Slice<F>::AC, abc),
abph.unwrapSlice(Slice<F>::BC, abc),
abph.unwrapSlice(Slice<F>::BA, abc),
abph.unwrapSlice(Slice<F>::CA, abc),
abph.unwrapSlice(Slice<F>::CB, abc),
// -- VHHHA,
hhha.unwrapSlice(Slice<F>::A, abc),
hhha.unwrapSlice(Slice<F>::B, abc),
hhha.unwrapSlice(Slice<F>::C, abc),
// -- TA,
taphh.unwrapSlice(Slice<F>::A, abc),
taphh.unwrapSlice(Slice<F>::B, abc),
taphh.unwrapSlice(Slice<F>::C, abc),
// -- TABIJ
tabhh.unwrapSlice(Slice<F>::AB, abc),
tabhh.unwrapSlice(Slice<F>::AC, abc),
tabhh.unwrapSlice(Slice<F>::BC, abc),
// -- TIJK
(DataFieldType<F>*)Tijk);
WITH_RANK << iteration << "-th doubles done\n";
))
}
@@ -618,16 +616,19 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
)
WITH_CHRONO("singles",
#if defined(HAVE_CUDA)
singlesContribution<F><<<1,1>>>( No, Nv, abc[0], abc[1], abc[2]
, (DataFieldType<F>*)Tai
singlesContribution<F><<<1,1>>>(No, Nv, abc[0], abc[1], abc[2],
(DataFieldType<F>*)Tai,
#else
singlesContribution<F>( No, Nv, abc[0], abc[1], abc[2]
, Tai.data()
singlesContribution<F>(No, Nv, abc[0], abc[1], abc[2],
Tai.data(),
#endif
, (DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::AB, abc)
, (DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::AC, abc)
, (DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::BC, abc)
, (DataFieldType<F>*)Zijk);
(DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::AB,
abc),
(DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::AC,
abc),
(DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::BC,
abc),
(DataFieldType<F>*)Zijk);
)
}
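singlesContribution is launched as a one-thread CUDA kernel under HAVE_CUDA and called as an ordinary host function otherwise, with the tail of the argument list shared by both branches. The dispatch pattern, reduced to a sketch (add_one and run are illustrative):

    #if defined(HAVE_CUDA)
    __global__
    #endif
    void add_one(int n, double* x) {
      for (int i = 0; i < n; i++) x[i] += 1.0;
    }

    // Call site: kernel launch under CUDA, direct call otherwise. Under
    // CUDA, x must point to device memory (cf. the DataFieldType<F>* casts
    // above); on the host it is ordinary memory.
    void run(int n, double* x) {
    #if defined(HAVE_CUDA)
      add_one<<<1, 1>>>(n, x);
    #else
      add_one(n, x);
    #endif
    }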

View File

@@ -77,7 +77,7 @@ namespace cuda {
to[i] = maybeConjugateScalar<F>(from[i]);
}
}
template <typename F>
__global__
@@ -132,30 +132,29 @@
#endif
#if defined(HAVE_CUDA)
#define LIMS_KS \
size_t \
kmin = blockIdx.x * blockDim.x + threadIdx.x, \
k = kmin, \
idx = kmin * size * size * size \
; \
#define LIMS_KS() \
size_t kmin = blockIdx.x * blockDim.x + threadIdx.x, \
k = kmin, \
idx = kmin * size * size * size \
; \
k < ((kmin < size) ? kmin + 1 : size)
#else
#define LIMS_KS() size_t k=0, idx=0; k < size
#endif
#define _IJK_(i, j, k) i + j*size + k*size*size
#define _REORDER_BODY_(...) \
for (LIMS_KS ; k++) \
for (size_t j = 0; j < size; j++) \
for (size_t i = 0; i < size; i++, idx++) { \
__VA_ARGS__ \
}
#define _MAKE_REORDER_(_enum, ...) \
template <typename F> \
__MAYBE_GLOBAL__ \
void reorder(reorder_proxy< F, _enum > p, \
size_t size, F* to, F* from) { \
_REORDER_BODY_(__VA_ARGS__) \
}
#define _REORDER_BODY_(...) \
for (LIMS_KS() ; k++) \
for (size_t j = 0; j < size; j++) \
for (size_t i = 0; i < size; i++, idx++) { \
__VA_ARGS__ \
}
#define _MAKE_REORDER_(_enum, ...) \
template <typename F> \
__MAYBE_GLOBAL__ \
void reorder(reorder_proxy< F, _enum > p, \
size_t size, F* to, F* from) { \
_REORDER_BODY_(__VA_ARGS__) \
}
#if defined(HAVE_CUDA)
#define GO(__TO, __FROM) cuda::sum_in_place<F>(&__TO, &__FROM);
#else
@@ -166,7 +165,7 @@ namespace cuda {
template <typename F, reordering_t R>
__MAYBE_GLOBAL__ \
void reorder(reorder_proxy<F, R> proxy,
size_t size, F* to, F* from);
size_t size, F* to, F* from);
_MAKE_REORDER_(IJK, GO(to[idx], from[_IJK_(i, j, k)]))
_MAKE_REORDER_(IKJ, GO(to[idx], from[_IJK_(i, k, j)]))
@@ -174,7 +173,7 @@ namespace cuda {
_MAKE_REORDER_(JKI, GO(to[idx], from[_IJK_(j, k, i)]))
_MAKE_REORDER_(KIJ, GO(to[idx], from[_IJK_(k, i, j)]))
_MAKE_REORDER_(KJI, GO(to[idx], from[_IJK_(k, j, i)]))
#undef LIMS_KS
#undef _MAKE_REORDER_
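Each reorder instantiation copies a size^3 buffer while permuting indices: _IJK_(i, j, k) linearizes a triple as i + j*size + k*size*size, and GO accumulates into the target. Expanded by hand for the JKI case, assuming GO means "+=" as cuda::sum_in_place suggests, the host version is roughly:

    #include <cstddef>

    // Sketch: host-side expansion of _MAKE_REORDER_(JKI, ...), i.e.
    // to[i + j*size + k*size*size] += from[j + k*size + i*size*size].
    template <typename F>
    void reorder_jki(size_t size, F* to, F* from) {
      size_t idx = 0;
      for (size_t k = 0; k < size; k++)
        for (size_t j = 0; j < size; j++)
          for (size_t i = 0; i < size; i++, idx++)
            to[idx] += from[j + k * size + i * size * size];
    }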
@@ -446,50 +445,59 @@ double getEnergySame
#if defined(ATRIP_USE_DGEMM)
#if defined(HAVE_CUDA)
#define REORDER(__II, __JJ, __KK) \
reorder<<< \
bs, ths \
>>>(reorder_proxy<DataFieldType<F>, __II ## __JJ ## __KK >{}, \
No, Tijk, _t_buffer);
#define DGEMM_PARTICLES(__A, __B) \
atrip::xgemm<F>("T", \
"N", \
(int const*)&NoNo, \
(int const*)&No, \
(int const*)&Nv, \
&one, \
(DataFieldType<F>*)__A, \
(int const*)&Nv, \
(DataFieldType<F>*)__B, \
(int const*)&Nv, \
&zero, \
_t_buffer, \
(int const*)&NoNo);
#define DGEMM_HOLES(__A, __B, __TRANSB) \
atrip::xgemm<F>("N", \
__TRANSB, \
(int const*)&NoNo, \
(int const*)&No, \
(int const*)&No, \
&m_one, \
__A, \
(int const*)&NoNo, \
(DataFieldType<F>*)__B, \
(int const*)&No, \
&zero, \
_t_buffer, \
(int const*)&NoNo \
);
#define REORDER(__II, __JJ, __KK) \
reorder<<<bs, ths>>>(reorder_proxy<DataFieldType<F>, \
__II ## __JJ ## __KK >{}, \
No, Tijk, _t_buffer);
#define DGEMM_PARTICLES(__A, __B) \
atrip::xgemm<F>("T", \
"N", \
(int const*)&NoNo, \
(int const*)&No, \
(int const*)&Nv, \
&one, \
(DataFieldType<F>*)__A, \
(int const*)&Nv, \
(DataFieldType<F>*)__B, \
(int const*)&Nv, \
&zero, \
_t_buffer, \
(int const*)&NoNo);
#define DGEMM_HOLES(__A, __B, __TRANSB) \
atrip::xgemm<F>("N", \
__TRANSB, \
(int const*)&NoNo, \
(int const*)&No, \
(int const*)&No, \
&m_one, \
__A, \
(int const*)&NoNo, \
(DataFieldType<F>*)__B, \
(int const*)&No, \
&zero, \
_t_buffer, \
(int const*)&NoNo \
);
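In column-major terms, DGEMM_PARTICLES contracts the particle index: with __A an Nv-by-(No*No) slice and __B an Nv-by-No slice, the "T","N" call computes _t_buffer = __A^T * __B, an (No*No)-by-No block. What the GEMM computes, written as naive loops (a sketch of the contraction, not the actual BLAS path):

    #include <cstddef>

    // Sketch: out(row, col) = sum_e A(e, row) * B(e, col), column-major,
    // with lda = ldb = Nv, ldc = No*No, and beta = 0.
    void particles_contraction(size_t No, size_t Nv,
                               const double* A, const double* B,
                               double* out) {
      const size_t NoNo = No * No;
      for (size_t col = 0; col < No; col++)
        for (size_t row = 0; row < NoNo; row++) {
          double acc = 0.0;
          for (size_t e = 0; e < Nv; e++)
            acc += A[e + row * Nv] * B[e + col * Nv];
          out[row + col * NoNo] = acc;
        }
    }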
#define MAYBE_CONJ(_conj, _buffer) \
cuda::maybeConjugate<<< \
Atrip::kernelDimensions.ooo.blocks, \
Atrip::kernelDimensions.ooo.threads \
>>>((DataFieldType<F>*)_conj, (DataFieldType<F>*)_buffer, NoNoNo);
cuda::maybeConjugate<<< \
Atrip::kernelDimensions.ooo.blocks, \
Atrip::kernelDimensions.ooo.threads \
>>>((DataFieldType<F>*)_conj, (DataFieldType<F>*)_buffer, NoNoNo);
// END CUDA ////////////////////////////////////////////////////////////////////
#else
// NONCUDA //////////////////////////////////////////////////////////////////////
#define REORDER(__II, __JJ, __KK) \
reorder(reorder_proxy<DataFieldType<F>, __II ## __JJ ## __KK >{}, \
No, Tijk, _t_buffer);
// NONCUDA /////////////////////////////////////////////////////////////////////
#define REORDER(__II, __JJ, __KK) \
reorder(reorder_proxy<DataFieldType<F>, \
__II ## __JJ ## __KK >{}, \
No, Tijk, _t_buffer);
#define DGEMM_PARTICLES(__A, __B) \
atrip::xgemm<F>("T", \
"N", \
@@ -502,7 +510,7 @@ double getEnergySame
__B, \
(int const*)&Nv, \
&zero, \
_t_buffer, \
_t_buffer, \
(int const*)&NoNo \
);
#define DGEMM_HOLES(__A, __B, __TRANSB) \
@@ -517,8 +525,8 @@ double getEnergySame
__B, \
(int const*)&No, \
&zero, \
_t_buffer, \
(int const*)&NoNo \
_t_buffer, \
(int const*)&NoNo \
);
#define MAYBE_CONJ(_conj, _buffer) \
for (size_t __i = 0; __i < NoNoNo; ++__i) \
@@ -550,9 +558,10 @@ double getEnergySame
// Set Tijk to zero
#ifdef HAVE_CUDA
WITH_CHRONO("double:reorder",
cuda::zeroing<<<bs, ths>>>((DataFieldType<F>*)Tijk, NoNoNo);
// synchronize all initializations to zero
)
cuda::zeroing<<<bs, ths>>>((DataFieldType<F>*)Tijk,
NoNoNo);
// synchronize all initializations to zero
)
#else
WITH_CHRONO("double:reorder",
for (size_t k = 0; k < NoNoNo; k++) {
@@ -640,6 +649,8 @@ double getEnergySame
{ // free resources
#ifdef HAVE_CUDA
// we need to synchronize here since we need
// the Tijk for next process in the pipeline
cuCtxSynchronize();
cuMemFree((CUdeviceptr)_vhhh);
cuMemFree((CUdeviceptr)_t_buffer);
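The synchronization added here matters because kernel launches are asynchronous: without cuCtxSynchronize, cuMemFree could release buffers that a still-running kernel is reading or writing. The ordering, reduced to a sketch (assumes a current CUDA context, as atrip has set up by this point):

    #include <cuda.h>

    __global__ void scale(double* x, size_t n) {
      for (size_t i = 0; i < n; i++) x[i] *= 2.0;
    }

    // Sketch: wait for in-flight kernels before freeing their buffers.
    void run(size_t n) {
      CUdeviceptr x;
      cuMemAlloc(&x, n * sizeof(double));
      scale<<<1, 1>>>((double*)x, n);  // asynchronous launch
      cuCtxSynchronize();              // as in the diff, before cuMemFree
      cuMemFree(x);
    }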