Minimal changes in Equations

2022-09-12 19:10:14 +02:00
parent c20b9e3bcb
commit 3d7702d501
1 changed files with 71 additions and 59 deletions
--- a/src/atrip/Equations.cxx
+++ b/src/atrip/Equations.cxx
@@ -14,31 +14,13 @@

 // [[file:~/cuda/atrip/atrip.org::*Prolog][Prolog:2]]
 #include<atrip/Equations.hpp>
-#include<atrip/CUDA.hpp>

-#if defined(HAVE_CUDA)
-#include <cuda.h>
-#endif
+#include<atrip/CUDA.hpp>

 namespace atrip {
 // Prolog:2 ends here


-  // These are just help structures
-  // to help with the templating of reorder
-  // function
-  enum reordering_t
-    {
-     IJK,
-     IKJ,
-     JIK,
-     JKI,
-     KIJ,
-     KJI
-    };
-
-  template <typename F, reordering_t R>
-  struct reorder_proxy {};

 #ifdef HAVE_CUDA
 namespace cuda {
@@ -101,49 +83,42 @@ namespace cuda {
  template <>
  __device__
  cuDoubleComplex multiply(const cuDoubleComplex &a, const cuDoubleComplex &b) {
-   return
-     {a.x * b.x - a.y * b.y,
-      a.x * b.y + a.y * b.x};
+    return
+      {a.x * b.x - a.y * b.y,
+       a.x * b.y + a.y * b.x};
  }

  template <typename F>
  __device__
-  void sum_in_place(F* to, const F* b);
+  void sum_in_place(F* to, const F* from);

  template <>
  __device__
-  void sum_in_place(double* to, const double *b) { *to += *b; }
+  void sum_in_place(double* to, const double *from) { *to += *from; }

  template <>
  __device__
-  void sum_in_place(cuDoubleComplex* to, const cuDoubleComplex* b) {
-   to->x += b->x;
-   to->y += b->y;
-  }
-
-  __device__
-  cuDoubleComplex& operator+=(cuDoubleComplex& lz, cuDoubleComplex const& rz) {
-    lz.x += rz.x;
-    lz.y += rz.y;
-    return lz;
+  void sum_in_place(cuDoubleComplex* to, const cuDoubleComplex* from) {
+    to->x += from->x;
+    to->y += from->y;
  }

 };
 #endif

 #if defined(HAVE_CUDA)
-#define LIMS_KS()                                       \
-  size_t kmin = blockIdx.x * blockDim.x + threadIdx.x,  \
-    k = kmin,                                           \
-    idx = kmin * size * size * size                     \
-    ;                                                   \
-  k < (kmin < size) ? kmin + 1 : size
+#define FOR_K()                                             \
+  for (size_t kmin = blockIdx.x * blockDim.x + threadIdx.x, \
+         k = kmin,                                          \
+         idx = kmin * size * size * size;                   \
+       k < (kmin < size) ? kmin + 1 : size;                 \
+       k++)
 #else
-#define LIMS_KS size_t k=0, idx=0; k < size
+#define FOR_K() for (size_t k=0, idx=0; k < size; k++)
 #endif
 #define _IJK_(i, j, k) i + j*size + k*size*size
 #define _REORDER_BODY_(...)                       \
-  for (LIMS_KS() ; k++)                           \
+  FOR_K()                                         \
    for (size_t j = 0; j < size; j++)             \
      for (size_t i = 0; i < size; i++, idx++) {  \
        __VA_ARGS__                               \
@@ -152,7 +127,9 @@ namespace cuda {
  template <typename F>                         \
  __MAYBE_GLOBAL__                              \
  void reorder(reorder_proxy< F, _enum > p,     \
-               size_t size, F* to, F* from) {   \
+               size_t size,                     \
+               F* to,                           \
+               F* from) {                       \
    _REORDER_BODY_(__VA_ARGS__)                 \
      }
 #if defined(HAVE_CUDA)
@@ -161,9 +138,31 @@ namespace cuda {
 #define GO(__TO, __FROM) __TO += __FROM;
 #endif

+  // These are just help structures
+  // to help with the templating of reorder
+  // function
+  enum reordering_t
+    {
+     IJK,
+     IKJ,
+     JIK,
+     JKI,
+     KIJ,
+     KJI
+    };
+
+  /* 
+   * Please the c++ type checker and template creator
+   * in order to have an argument in the signature of
+   * the function  that helps the compiler know which
+   * instantiation it should take.
+   * 
+   */
+  template <typename F, reordering_t R>
+  struct reorder_proxy {};

  template <typename F, reordering_t R>
-  __MAYBE_GLOBAL__							\
+  __MAYBE_GLOBAL__
  void reorder(reorder_proxy<F, R> proxy,
               size_t size, F* to, F* from);

@@ -445,10 +444,14 @@ double getEnergySame

 #if defined(ATRIP_USE_DGEMM)
 #if defined(HAVE_CUDA)
-#define REORDER(__II, __JJ, __KK)                       \
-  reorder<<<bs, ths>>>(reorder_proxy<DataFieldType<F>,  \
-                       __II ## __JJ ## __KK >{},        \
-                       No, Tijk, _t_buffer);
+#define REORDER(__II, __JJ, __KK)                           \
+  reorder<<<bs, ths>>>(reorder_proxy<                       \
+                       DataFieldType<F>,                    \
+                       __II ## __JJ ## __KK                 \
+                       >{},                                 \
+                       No,                                  \
+                       Tijk,                                \
+                       _t_buffer);
 #define DGEMM_PARTICLES(__A, __B)               \
  atrip::xgemm<F>("T",                          \
                  "N",                          \
@@ -462,7 +465,7 @@ double getEnergySame
                  (int const*)&Nv,              \
                  &zero,                        \
                  _t_buffer,                    \
-                  (int const*)&NoNo);
+                  (int const*)&NoNo)
 #define DGEMM_HOLES(__A, __B, __TRANSB)         \
  atrip::xgemm<F>("N",                          \
                  __TRANSB,                     \
@@ -477,7 +480,7 @@ double getEnergySame
                  &zero,                        \
                  _t_buffer,                    \
                  (int const*)&NoNo             \
-                  );
+                  )
 #define MAYBE_CONJ(_conj, _buffer)                                      \
  cuda::maybeConjugate<<<                                               \
    Atrip::kernelDimensions.ooo.blocks,                                 \
@@ -512,7 +515,7 @@ double getEnergySame
                  &zero,                        \
                  _t_buffer,                    \
                  (int const*)&NoNo             \
-                  );
+                  )
 #define DGEMM_HOLES(__A, __B, __TRANSB)         \
  atrip::xgemm<F>("N",                          \
                  __TRANSB,                     \
@@ -527,7 +530,7 @@ double getEnergySame
                  &zero,                        \
                  _t_buffer,                    \
                  (int const*)&NoNo             \
-                  );
+                  )
 #define MAYBE_CONJ(_conj, _buffer)                \
  for (size_t __i = 0; __i < NoNoNo; ++__i)       \
    _conj[__i] = maybeConjugate<F>(_buffer[__i]);
@@ -538,13 +541,19 @@ double getEnergySame
 #ifdef HAVE_CUDA
    DataFieldType<F>* _t_buffer;
    DataFieldType<F>* _vhhh;
-    cuMemAlloc((CUdeviceptr*)&_t_buffer, NoNoNo * sizeof(DataFieldType<F>));
-    cuMemAlloc((CUdeviceptr*)&_vhhh, NoNoNo * sizeof(DataFieldType<F>));
+    WITH_CHRONO("double:cuda:alloc",
+    _CHECK_CUDA_SUCCESS("Allocating _t_buffer",
+                        cuMemAlloc((CUdeviceptr*)&_t_buffer,
+                                   NoNoNo * sizeof(DataFieldType<F>)));
+    _CHECK_CUDA_SUCCESS("Allocating _vhhh",
+                        cuMemAlloc((CUdeviceptr*)&_vhhh,
+                                   NoNoNo * sizeof(DataFieldType<F>)));
+                )
    const size_t
      bs = Atrip::kernelDimensions.ooo.blocks,
      ths = Atrip::kernelDimensions.ooo.threads;
-    cuda::zeroing<<<bs, ths>>>((DataFieldType<F>*)_t_buffer, NoNoNo);
-    cuda::zeroing<<<bs, ths>>>((DataFieldType<F>*)_vhhh, NoNoNo);
+    // cuda::zeroing<<<bs, ths>>>((DataFieldType<F>*)_t_buffer, NoNoNo);
+    // cuda::zeroing<<<bs, ths>>>((DataFieldType<F>*)_vhhh, NoNoNo);
 #else
    DataFieldType<F>* _t_buffer = (DataFieldType<F>*)malloc(NoNoNo * sizeof(F));
    DataFieldType<F>* _vhhh = (DataFieldType<F>*)malloc(NoNoNo * sizeof(F));
@@ -651,9 +660,12 @@ double getEnergySame
 #ifdef HAVE_CUDA
    // we need to synchronize here since we need
    // the Tijk for next process in the pipeline
-    cuCtxSynchronize();
-    cuMemFree((CUdeviceptr)_vhhh);
-    cuMemFree((CUdeviceptr)_t_buffer);
+    _CHECK_CUDA_SUCCESS("Synchronizing",
+                        cuCtxSynchronize());
+    _CHECK_CUDA_SUCCESS("Freeing _vhhh",
+                        cuMemFree((CUdeviceptr)_vhhh));
+    _CHECK_CUDA_SUCCESS("Freeing _t_buffer",
+                        cuMemFree((CUdeviceptr)_t_buffer));
 #else
    free(_vhhh);
    free(_t_buffer);