Fix zeroing

Zero Tijk correctly in CPU code
Fix bs and ths error in equations
2023-01-23 17:01:29 +01:00 · 2023-01-23 16:58:08 +01:00 · 2023-01-23 16:57:07 +01:00 · 2023-01-23 14:44:54 +01:00 · 2023-01-23 14:43:17 +01:00 · 2023-01-23 14:30:11 +01:00
5 changed files with 121 additions and 57 deletions
--- a/include/atrip/Equations.hpp
+++ b/include/atrip/Equations.hpp
@@ -103,6 +103,11 @@ void singlesContribution
    // -- TIJK
    // , DataPtr<F> Tijk
    , DataFieldType<F>* Tijk_
+#if defined(HAVE_CUDA)
+     // -- tmp buffers
+    , DataFieldType<F>* _t_buffer
+    , DataFieldType<F>* _vhhh 
+#endif
    );
 // Doubles contribution:1 ends here

--- a/include/atrip/Slice.hpp
+++ b/include/atrip/Slice.hpp
@@ -352,7 +352,7 @@ Info info;

 // [[file:~/cuda/atrip/atrip.org::*Attributes][Attributes:2]]
 DataPtr<F> data;
-#if defined(HAVE_CUDA)
+#if defined(HAVE_CUDA) && !defined (ATRIP_SOURCES_IN_GPU)
    F* mpi_data;
 #endif
 // Attributes:2 ends here
@@ -456,7 +456,7 @@ void unwrapAndMarkReady() {
      if (errorCode != MPI_SUCCESS)
        throw "Atrip: Unexpected error MPI ERROR";

-#if defined(HAVE_CUDA)
+#if defined(HAVE_CUDA) && !defined(ATRIP_SOURCES_IN_GPU)
      // copy the retrieved mpi data to the device
      WITH_CHRONO("cuda:memcpy",
                  _CHECK_CUDA_SUCCESS("copying mpi data to device",
@@ -488,7 +488,7 @@ void unwrapAndMarkReady() {
 Slice(size_t size_)
      : info({})
      , data(DataNullPtr)
-#if defined(HAVE_CUDA)
+#if defined(HAVE_CUDA) && !defined(ATRIP_SOURCES_IN_GPU)
      , mpi_data(nullptr)
 #endif
      , size(size_)
--- a/include/atrip/SliceUnion.hpp
+++ b/include/atrip/SliceUnion.hpp
@@ -405,6 +405,7 @@ template <typename F=double>
              , sliceSize(std::accumulate(sliceLength.begin(),
                                          sliceLength.end(),
                                          1UL, std::multiplies<size_t>()))
+
 #if defined(ATRIP_SOURCES_IN_GPU)
              , sources(rankMap.nSources())
 #else
@@ -417,6 +418,7 @@ template <typename F=double>
    { // constructor begin

      LOG(0,"Atrip") << "INIT SliceUnion: " << name << "\n";
+        printf("sliceSize %d, number of slices %d\n\n\n", sliceSize, sources.size());

 #if defined(ATRIP_SOURCES_IN_GPU)
      for (auto& ptr: sources) {
@@ -463,30 +465,30 @@ template <typename F=double>

 #if defined(HAVE_CUDA)
      LOG(1,"Atrip") << "warming communication up " << slices.size() << "\n";
-      WITH_CHRONO("cuda:warmup",
-                   int nRanks=Atrip::np, requestCount=0;
-                   int nSends=sliceBuffers.size()*nRanks;
-                   MPI_Request *requests = (MPI_Request*) malloc(nSends*2 * sizeof(MPI_Request));
-                   MPI_Status *statuses = (MPI_Status*) malloc(nSends*2 * sizeof(MPI_Status));
-                   for (int sliceId=0; sliceId<sliceBuffers.size(); sliceId++){
-                     for (int rankId=0; rankId<nRanks; rankId++){
-                       MPI_Isend((void*)SOURCES_DATA(sources[0]),
-                                 sliceSize,
-                                 traits::mpi::datatypeOf<F>(),
-                                 rankId,
-                                 100,
-                                 universe,
-                                 &requests[requestCount++]);
-                       MPI_Irecv((void*)sliceBuffers[sliceId],
-                                 sliceSize,
-                                 traits::mpi::datatypeOf<F>(),
-                                 rankId,
-                                 100,
-                                 universe,
-                                 &requests[requestCount++]);
-                     }
-                   }
-                   MPI_Waitall(nSends*2, requests, statuses);
+      WITH_CHRONO("cuda:warmup",
+                  int nRanks=Atrip::np, requestCount=0;
+                  int nSends=sliceBuffers.size()*nRanks;
+                  MPI_Request *requests = (MPI_Request*) malloc(nSends*2 * sizeof(MPI_Request));
+                  MPI_Status *statuses = (MPI_Status*) malloc(nSends*2 * sizeof(MPI_Status));
+                  for (int sliceId=0; sliceId<sliceBuffers.size(); sliceId++){
+                    for (int rankId=0; rankId<nRanks; rankId++){
+                      MPI_Isend((void*)SOURCES_DATA(sources[0]),
+                                sliceSize,
+                                traits::mpi::datatypeOf<F>(),
+                                rankId,
+                                100,
+                                universe,
+                                &requests[requestCount++]);
+                      MPI_Irecv((void*)sliceBuffers[sliceId],
+                                sliceSize,
+                                traits::mpi::datatypeOf<F>(),
+                                rankId,
+                                100,
+                                universe,
+                                &requests[requestCount++]);
+                    }
+                  }
+                  MPI_Waitall(nSends*2, requests, statuses);
                  )
 #endif

@@ -571,12 +573,11 @@ template <typename F=double>
      if (slice.info.state == Slice<F>::Fetch) { // if-1
        // TODO: do it through the slice class
        slice.info.state = Slice<F>::Dispatched;
-#if defined(HAVE_CUDA)
-#  if !defined(ATRIP_CUDA_AWARE_MPI) && defined(ATRIP_SOURCES_IN_GPU)
+#if defined(HAVE_CUDA) && defined(ATRIP_SOURCES_IN_GPU)
+#  if !defined(ATRIP_CUDA_AWARE_MPI) 
 #    error "You need CUDA aware MPI to have slices on the GPU"
 #  endif
-        slice.mpi_data = (F*)malloc(sizeof(F) * slice.size);
-        MPI_Irecv(slice.mpi_data,
+        MPI_Irecv((void*)slice.data,
 #else
        MPI_Irecv(slice.data,
 #endif
--- a/src/atrip/Atrip.cxx
+++ b/src/atrip/Atrip.cxx
@@ -202,7 +202,7 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
  _CHECK_CUDA_SUCCESS("Zijk",
                      cuMemAlloc(&Zijk, sizeof(F) * No * No * No));
 #else
-  std::vector<F> &Tai = _Tai, &epsi = _epsi, &epsa = _epsa;
+  DataPtr<F> Tai = _Tai.data(), epsi = _epsi.data(), epsa = _epsa.data();
  Zijk = (DataFieldType<F>*)malloc(No*No*No * sizeof(DataFieldType<F>));
  Tijk = (DataFieldType<F>*)malloc(No*No*No * sizeof(DataFieldType<F>));
 #endif
@@ -258,6 +258,25 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
  // all tensors
  std::vector< SliceUnion<F>* > unions = {&taphh, &hhha, &abph, &abhh, &tabhh};

+#ifdef HAVE_CUDA
+    // TODO: free buffers
+    DataFieldType<F>* _t_buffer;
+    DataFieldType<F>* _vhhh;
+    WITH_CHRONO("double:cuda:alloc",
+    _CHECK_CUDA_SUCCESS("Allocating _t_buffer",
+                        cuMemAlloc((CUdeviceptr*)&_t_buffer,
+                                   No*No*No * sizeof(DataFieldType<F>)));
+    _CHECK_CUDA_SUCCESS("Allocating _vhhh",
+                        cuMemAlloc((CUdeviceptr*)&_vhhh,
+                                   No*No*No * sizeof(DataFieldType<F>)));
+                )
+    //const size_t
+     // bs = Atrip::kernelDimensions.ooo.blocks,
+      //ths = Atrip::kernelDimensions.ooo.threads;
+    //cuda::zeroing<<<bs, ths>>>((DataFieldType<F>*)_t_buffer, NoNoNo);
+    //cuda::zeroing<<<bs, ths>>>((DataFieldType<F>*)_vhhh, NoNoNo);
+#endif
+
  // get tuples for the current rank
  TuplesDistribution *distribution;

@@ -639,7 +658,14 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
                                         tabhh.unwrapSlice(Slice<F>::AC, abc),
                                         tabhh.unwrapSlice(Slice<F>::BC, abc),
                                         // -- TIJK
-                                         (DataFieldType<F>*)Tijk);
+                                         (DataFieldType<F>*)Tijk
+#if defined(HAVE_CUDA)
+                                         // -- tmp buffers
+                                         ,(DataFieldType<F>*)_t_buffer
+                                         ,(DataFieldType<F>*)_vhhh
+#endif
+                                         );
+
                  WITH_RANK << iteration << "-th doubles done\n";
      ))
    }
@@ -667,7 +693,7 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
                                      (DataFieldType<F>*)Tai,
 #else
      singlesContribution<F>(No, Nv, abc[0], abc[1], abc[2],
-                             Tai.data(),
+                             Tai,
 #endif
                             (DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::AB,
                                                                 abc),
@@ -707,18 +733,30 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
                                1, 1, // for cuda
                                _epsabc,
                                No,
+#if defined(HAVE_CUDA)
                                (DataFieldType<F>*)epsi,
                                (DataFieldType<F>*)Tijk,
                                (DataFieldType<F>*)Zijk,
+#else
+                                epsi,
+                                Tijk,
+                                Zijk,
+#endif
                                tupleEnergy);
                  } else {
                    ACC_FUNCALL(getEnergySame<DataFieldType<F>>,
                                1, 1, // for cuda
                                _epsabc,
                                No,
+#if defined(HAVE_CUDA)
                                (DataFieldType<F>*)epsi,
                                (DataFieldType<F>*)Tijk,
                                (DataFieldType<F>*)Zijk,
+#else
+                                epsi,
+                                Tijk,
+                                Zijk,
+#endif
                                tupleEnergy);
                  })

--- a/src/atrip/Equations.cxx
+++ b/src/atrip/Equations.cxx
@@ -401,9 +401,15 @@ void getEnergySame
    // -- TIJK
    // , DataPtr<F> Tijk_
    , DataFieldType<F>* Tijk_
-    ) {
-
-    const size_t NoNo = No*No;
+#if defined(HAVE_CUDA)
+     // -- tmp buffers
+    , DataFieldType<F>* _t_buffer
+    , DataFieldType<F>* _vhhh 
+#endif
+     ) {
+    const size_t a = abc[0], b = abc[1], c = abc[2]
+              , NoNo = No*No
+              ;

    DataFieldType<F>* Tijk = (DataFieldType<F>*)Tijk_;

@@ -517,21 +523,21 @@ void getEnergySame
    F one{1.0}, m_one{-1.0}, zero{0.0};
    const size_t NoNoNo = No*NoNo;
 #ifdef HAVE_CUDA
-    DataFieldType<F>* _t_buffer;
-    DataFieldType<F>* _vhhh;
-    WITH_CHRONO("double:cuda:alloc",
-    _CHECK_CUDA_SUCCESS("Allocating _t_buffer",
-                        cuMemAlloc((CUdeviceptr*)&_t_buffer,
-                                   NoNoNo * sizeof(DataFieldType<F>)));
-    _CHECK_CUDA_SUCCESS("Allocating _vhhh",
-                        cuMemAlloc((CUdeviceptr*)&_vhhh,
-                                   NoNoNo * sizeof(DataFieldType<F>)));
-                )
+//    DataFieldType<F>* _t_buffer;
+//    DataFieldType<F>* _vhhh;
+//    WITH_CHRONO("double:cuda:alloc",
+//    _CHECK_CUDA_SUCCESS("Allocating _t_buffer",
+//                        cuMemAlloc((CUdeviceptr*)&_t_buffer,
+//                                   NoNoNo * sizeof(DataFieldType<F>)));
+//    _CHECK_CUDA_SUCCESS("Allocating _vhhh",
+//                        cuMemAlloc((CUdeviceptr*)&_vhhh,
+//                                   NoNoNo * sizeof(DataFieldType<F>)));
+//                )
+#if !defined(ATRIP_ONLY_DGEMM)
+    // we still have to zero this
    const size_t
      bs = Atrip::kernelDimensions.ooo.blocks,
      ths = Atrip::kernelDimensions.ooo.threads;
-
-#if !defined(ATRIP_ONLY_DGEMM)
    acc::zeroing<<<bs, ths>>>((DataFieldType<F>*)_t_buffer, NoNoNo);
    acc::zeroing<<<bs, ths>>>((DataFieldType<F>*)_vhhh, NoNoNo);
 #endif
@@ -552,12 +558,14 @@ void getEnergySame
                acc::zeroing<<<bs, ths>>>((DataFieldType<F>*)Tijk,
                                           NoNoNo);
                )
-#else
+#endif
+
+#if !defined(HAVE_CUDA)
    WITH_CHRONO("double:reorder",
      for (size_t k = 0; k < NoNoNo; k++) {
        Tijk[k] = DataFieldType<F>{0.0};
       })
-#endif /* defined(HAVE_CUDA) && !defined(ATRIP_ONLY_DGEMM) */
+#endif /* !defined(HAVE_CUDA) */


 #if defined(ATRIP_ONLY_DGEMM)
@@ -649,12 +657,12 @@ void getEnergySame
 #ifdef HAVE_CUDA
    // we need to synchronize here since we need
    // the Tijk for next process in the pipeline
-    _CHECK_CUDA_SUCCESS("Synchronizing",
-                        cuCtxSynchronize());
-    _CHECK_CUDA_SUCCESS("Freeing _vhhh",
-                        cuMemFree((CUdeviceptr)_vhhh));
-    _CHECK_CUDA_SUCCESS("Freeing _t_buffer",
-                        cuMemFree((CUdeviceptr)_t_buffer));
+    //_CHECK_CUDA_SUCCESS("Synchronizing",
+    //                    cuCtxSynchronize());
+    //_CHECK_CUDA_SUCCESS("Freeing _vhhh",
+    //                    cuMemFree((CUdeviceptr)_vhhh));
+    //_CHECK_CUDA_SUCCESS("Freeing _t_buffer",
+    //                    cuMemFree((CUdeviceptr)_t_buffer));
 #else
    free(_vhhh);
    free(_t_buffer);
@@ -741,6 +749,12 @@ void getEnergySame
    , DataPtr<double> const TBChh
    // -- TIJK
    , DataFieldType<double>* Tijk
+#if defined(HAVE_CUDA)
+     // -- tmp buffers
+    , DataFieldType<double>* _t_buffer
+    , DataFieldType<double>* _vhhh 
+#endif
+
    );

  template
@@ -769,6 +783,12 @@ void getEnergySame
    , DataPtr<Complex> const TBChh
    // -- TIJK
    , DataFieldType<Complex>* Tijk
+#if defined(HAVE_CUDA)
+     // -- tmp buffers
+    , DataFieldType<Complex>* _t_buffer
+    , DataFieldType<Complex>* _vhhh 
+#endif
+
    );
 // Doubles contribution:2 ends here
Author	SHA1	Message	Date
Alejandro Gallo	122329eca7	Fix zeroing	2023-01-23 17:01:29 +01:00
Ania Brown	58c0bf078e	Zero Tijk correctly in CPU code	2023-01-23 16:58:08 +01:00
Alejandro Gallo	3fe15e5e5c	Fix bs and ths error in equations	2023-01-23 16:57:07 +01:00
Alejandro Gallo	0d223e6ed9	Fix vector types for energy in cpu	2023-01-23 14:44:54 +01:00
Alejandro Gallo	c8bdc4239f	Fix an odd character in the warmup	2023-01-23 14:43:17 +01:00
Ania Brown	be96e4bf8c	1. Fix syntax error; 2. Allocate temporary buffers only once per simulation	2023-01-23 14:30:11 +01:00
Anna Brown	9003c218a3	No need to copy to a separate mpi_data array on the host when sources are resident on the GPU	2023-01-23 14:25:25 +01:00