Minimal changes in Equations

commit 3d7702d501
parent c20b9e3bcb
Author: Gallo Alejandro
Date:   2022-09-12 19:10:14 +02:00


@@ -14,31 +14,13 @@
 // [[file:~/cuda/atrip/atrip.org::*Prolog][Prolog:2]]
 #include<atrip/Equations.hpp>
-#include<atrip/CUDA.hpp>
-#if defined(HAVE_CUDA)
-#include <cuda.h>
-#endif
+#include<atrip/CUDA.hpp>
 namespace atrip {
 // Prolog:2 ends here
-// These are just help structures
-// to help with the templating of reorder
-// function
-enum reordering_t
-{
-  IJK,
-  IKJ,
-  JIK,
-  JKI,
-  KIJ,
-  KJI
-};
-template <typename F, reordering_t R>
-struct reorder_proxy {};
 #ifdef HAVE_CUDA
 namespace cuda {
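
The hunk above replaces the inline #if defined(HAVE_CUDA) / #include <cuda.h> / #endif guard with a single include of atrip/CUDA.hpp. A minimal sketch of what such a header plausibly centralizes, assuming it carries the guard and the __MAYBE_GLOBAL__ qualifier used by the reorder kernels further down; this is inferred, not shown in this commit:

    // Hypothetical sketch of atrip/CUDA.hpp -- an assumption, not part of this diff.
    #pragma once
    #if defined(HAVE_CUDA)
    #  include <cuda.h>
    // Under CUDA, kernels are compiled as __global__ ...
    #  define __MAYBE_GLOBAL__ __global__
    #else
    // ... and as plain host functions otherwise.
    #  define __MAYBE_GLOBAL__
    #endif
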
@@ -101,49 +83,42 @@ namespace cuda {
 template <>
 __device__
 cuDoubleComplex multiply(const cuDoubleComplex &a, const cuDoubleComplex &b) {
   return
     {a.x * b.x - a.y * b.y,
      a.x * b.y + a.y * b.x};
 }
 template <typename F>
 __device__
-void sum_in_place(F* to, const F* b);
+void sum_in_place(F* to, const F* from);
 template <>
 __device__
-void sum_in_place(double* to, const double *b) { *to += *b; }
+void sum_in_place(double* to, const double *from) { *to += *from; }
 template <>
 __device__
-void sum_in_place(cuDoubleComplex* to, const cuDoubleComplex* b) {
-  to->x += b->x;
-  to->y += b->y;
-}
-__device__
-cuDoubleComplex& operator+=(cuDoubleComplex& lz, cuDoubleComplex const& rz) {
-  lz.x += rz.x;
-  lz.y += rz.y;
-  return lz;
+void sum_in_place(cuDoubleComplex* to, const cuDoubleComplex* from) {
+  to->x += from->x;
+  to->y += from->y;
 }
 };
 #endif
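
The removed operator+= overload for cuDoubleComplex is superseded by the explicit sum_in_place specializations, which keep the accumulation spelled the same way for double and cuDoubleComplex. A host-side analogue of the pattern, an illustrative sketch only, with std::complex standing in for cuDoubleComplex:

    #include <complex>

    // Primary template: declared, but defined only for supported field types.
    template <typename F>
    void sum_in_place(F* to, const F* from);

    // Real field: plain accumulation.
    template <>
    void sum_in_place(double* to, const double* from) { *to += *from; }

    // Complex field: componentwise accumulation, mirroring the
    // cuDoubleComplex specialization in the diff above.
    template <>
    void sum_in_place(std::complex<double>* to,
                      const std::complex<double>* from) { *to += *from; }
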
 #if defined(HAVE_CUDA)
-#define LIMS_KS()                                              \
-  size_t kmin = blockIdx.x * blockDim.x + threadIdx.x,         \
-         k = kmin,                                             \
-         idx = kmin * size * size * size                       \
-  ;                                                            \
-  k < (kmin < size) ? kmin + 1 : size
+#define FOR_K()                                                \
+  for (size_t kmin = blockIdx.x * blockDim.x + threadIdx.x,    \
+              k = kmin,                                        \
+              idx = kmin * size * size * size;                 \
+       k < (kmin < size) ? kmin + 1 : size;                    \
+       k++)
 #else
-#define LIMS_KS size_t k=0, idx=0; k < size
+#define FOR_K() for (size_t k=0, idx=0; k < size; k++)
 #endif
 #define _IJK_(i, j, k) i + j*size + k*size*size
 #define _REORDER_BODY_(...)                                    \
-  for (LIMS_KS() ; k++)                                        \
+  FOR_K()                                                      \
   for (size_t j = 0; j < size; j++)                            \
   for (size_t i = 0; i < size; i++, idx++) {                   \
     __VA_ARGS__                                                \
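
One caveat in the new FOR_K macro: with standard C++ precedence, the loop condition k < (kmin < size) ? kmin + 1 : size parses as (k < (kmin < size)) ? (kmin + 1) : size, and both branches of that ternary are nonzero whenever size > 0, so the condition never becomes false. The apparent intent, bounding each CUDA thread to at most one value of k, needs one more pair of parentheses. A sketch of the presumably intended expansion; this is an observation about the committed code, not a change the commit makes:

    // Inside a __global__ kernel: each thread derives k from its global
    // thread id and runs either one iteration (kmin < size) or none.
    for (size_t kmin = blockIdx.x * blockDim.x + threadIdx.x,
                k = kmin,
                idx = kmin * size * size * size;
         k < ((kmin < size) ? kmin + 1 : size);  // note the extra parentheses
         k++) {
      // the j and i loops of _REORDER_BODY_ follow here
    }
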
@@ -152,7 +127,9 @@ namespace cuda {
   template <typename F>                                        \
   __MAYBE_GLOBAL__                                             \
   void reorder(reorder_proxy< F, _enum > p,                    \
-               size_t size, F* to, F* from) {                  \
+               size_t size,                                    \
+               F* to,                                          \
+               F* from) {                                      \
     _REORDER_BODY_(__VA_ARGS__)                                \
   }
 #if defined(HAVE_CUDA)
@@ -161,9 +138,31 @@ namespace cuda {
 #define GO(__TO, __FROM) __TO += __FROM;
 #endif
+// Helper structures for templating
+// the reorder function
+enum reordering_t
+{
+  IJK,
+  IKJ,
+  JIK,
+  JKI,
+  KIJ,
+  KJI
+};
+/*
+ * Pleases the C++ type checker and template
+ * instantiator: an argument of this type in the
+ * function signature lets the compiler know which
+ * instantiation it should take.
+ */
+template <typename F, reordering_t R>
+struct reorder_proxy {};
 template <typename F, reordering_t R>
-__MAYBE_GLOBAL__ \
+__MAYBE_GLOBAL__
 void reorder(reorder_proxy<F, R> proxy,
              size_t size, F* to, F* from);
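
The comment added above describes why reorder_proxy exists: the empty struct carries (F, R) in its type, so overload resolution on the proxy argument selects the right reorder instantiation at the call site. A self-contained illustration of the same tag-dispatch pattern, with hypothetical names independent of atrip:

    #include <cstdio>

    enum reordering_t { IJK, IKJ };

    // Empty tag type: its only job is to carry R in the signature.
    template <reordering_t R>
    struct proxy {};

    template <reordering_t R>
    void reorder_demo(proxy<R>);

    // Each specialization is selected purely by the proxy's type.
    template <> void reorder_demo(proxy<IJK>) { std::puts("IJK order"); }
    template <> void reorder_demo(proxy<IKJ>) { std::puts("IKJ order"); }

    int main() {
      reorder_demo(proxy<IJK>{});  // prints "IJK order"
      reorder_demo(proxy<IKJ>{});  // prints "IKJ order"
    }
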
@@ -445,10 +444,14 @@ double getEnergySame
 #if defined(ATRIP_USE_DGEMM)
 #if defined(HAVE_CUDA)
 #define REORDER(__II, __JJ, __KK)                              \
-  reorder<<<bs, ths>>>(reorder_proxy<DataFieldType<F>,         \
-                                     __II ## __JJ ## __KK >{}, \
-                       No, Tijk, _t_buffer);
+  reorder<<<bs, ths>>>(reorder_proxy<                          \
+                         DataFieldType<F>,                     \
+                         __II ## __JJ ## __KK                  \
+                       >{},                                    \
+                       No,                                     \
+                       Tijk,                                   \
+                       _t_buffer);
 #define DGEMM_PARTICLES(__A, __B)                              \
   atrip::xgemm<F>("T",                                         \
                   "N",                                         \
@@ -462,7 +465,7 @@ double getEnergySame
                   (int const*)&Nv,                             \
                   &zero,                                       \
                   _t_buffer,                                   \
-                  (int const*)&NoNo);
+                  (int const*)&NoNo)
 #define DGEMM_HOLES(__A, __B, __TRANSB)                        \
   atrip::xgemm<F>("N",                                         \
                   __TRANSB,                                    \
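
The recurring ); to ) changes in this and the following hunks drop the trailing semicolon from the xgemm macros, so the call site supplies it and the macros behave like ordinary statements. A generic sketch, not atrip code, of why a macro-internal semicolon is a hazard:

    void step();
    void other();

    #define WITH_SEMI()    step();
    #define WITHOUT_SEMI() step()

    void demo(bool cond) {
      // WITHOUT_SEMI() reads like an ordinary call:
      if (cond)
        WITHOUT_SEMI();
      else
        other();
      // With WITH_SEMI() the same code expands to "step();;" -- the extra
      // empty statement closes the if-branch, so the "else" no longer has
      // an "if" to attach to and the code fails to compile.
    }
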
@@ -477,7 +480,7 @@ double getEnergySame
                   &zero,                                       \
                   _t_buffer,                                   \
                   (int const*)&NoNo                            \
-                  );
+                  )
 #define MAYBE_CONJ(_conj, _buffer)                             \
   cuda::maybeConjugate<<<                                      \
     Atrip::kernelDimensions.ooo.blocks,                        \
@@ -512,7 +515,7 @@ double getEnergySame
                   &zero,                                       \
                   _t_buffer,                                   \
                   (int const*)&NoNo                            \
-                  );
+                  )
 #define DGEMM_HOLES(__A, __B, __TRANSB)                        \
   atrip::xgemm<F>("N",                                         \
                   __TRANSB,                                    \
@@ -527,7 +530,7 @@ double getEnergySame
                   &zero,                                       \
                   _t_buffer,                                   \
                   (int const*)&NoNo                            \
-                  );
+                  )
 #define MAYBE_CONJ(_conj, _buffer)                             \
   for (size_t __i = 0; __i < NoNoNo; ++__i)                    \
     _conj[__i] = maybeConjugate<F>(_buffer[__i]);
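
For reference, the host branch of MAYBE_CONJ calls maybeConjugate<F> elementwise. The real helper is defined elsewhere in atrip; a plausible sketch inferred from the call site, an assumption rather than part of this diff:

    #include <complex>

    // Identity on real fields, complex conjugation on complex fields.
    template <typename F> F maybeConjugate(F x);

    template <> double maybeConjugate(double x) { return x; }

    template <>
    std::complex<double> maybeConjugate(std::complex<double> z) {
      return std::conj(z);
    }
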
@@ -538,13 +541,19 @@ double getEnergySame
 #ifdef HAVE_CUDA
   DataFieldType<F>* _t_buffer;
   DataFieldType<F>* _vhhh;
-  cuMemAlloc((CUdeviceptr*)&_t_buffer, NoNoNo * sizeof(DataFieldType<F>));
-  cuMemAlloc((CUdeviceptr*)&_vhhh, NoNoNo * sizeof(DataFieldType<F>));
+  WITH_CHRONO("double:cuda:alloc",
+    _CHECK_CUDA_SUCCESS("Allocating _t_buffer",
+                        cuMemAlloc((CUdeviceptr*)&_t_buffer,
+                                   NoNoNo * sizeof(DataFieldType<F>)));
+    _CHECK_CUDA_SUCCESS("Allocating _vhhh",
+                        cuMemAlloc((CUdeviceptr*)&_vhhh,
+                                   NoNoNo * sizeof(DataFieldType<F>)));
+  )
   const size_t
     bs = Atrip::kernelDimensions.ooo.blocks,
     ths = Atrip::kernelDimensions.ooo.threads;
-  cuda::zeroing<<<bs, ths>>>((DataFieldType<F>*)_t_buffer, NoNoNo);
-  cuda::zeroing<<<bs, ths>>>((DataFieldType<F>*)_vhhh, NoNoNo);
+  // cuda::zeroing<<<bs, ths>>>((DataFieldType<F>*)_t_buffer, NoNoNo);
+  // cuda::zeroing<<<bs, ths>>>((DataFieldType<F>*)_vhhh, NoNoNo);
 #else
   DataFieldType<F>* _t_buffer = (DataFieldType<F>*)malloc(NoNoNo * sizeof(F));
   DataFieldType<F>* _vhhh = (DataFieldType<F>*)malloc(NoNoNo * sizeof(F));
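
The allocations are now wrapped in _CHECK_CUDA_SUCCESS, and timed via WITH_CHRONO, instead of discarding the CUresult. Neither macro is shown in this diff; a minimal sketch of what a checking macro of this shape could look like, assumed from the usage above:

    #include <cstdio>
    #include <cstdlib>
    #include <cuda.h>

    // Hypothetical sketch: evaluate the driver-API call once and abort
    // with a readable message if it did not return CUDA_SUCCESS.
    #define _CHECK_CUDA_SUCCESS(message, call)                        \
      do {                                                            \
        CUresult _err = (call);                                       \
        if (_err != CUDA_SUCCESS) {                                   \
          std::fprintf(stderr, "CUDA error (%s): %d\n",               \
                       (message), (int)_err);                         \
          std::exit(EXIT_FAILURE);                                    \
        }                                                             \
      } while (0)
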
@@ -651,9 +660,12 @@ double getEnergySame
 #ifdef HAVE_CUDA
   // we need to synchronize here since we need
   // the Tijk for the next process in the pipeline
-  cuCtxSynchronize();
-  cuMemFree((CUdeviceptr)_vhhh);
-  cuMemFree((CUdeviceptr)_t_buffer);
+  _CHECK_CUDA_SUCCESS("Synchronizing",
+                      cuCtxSynchronize());
+  _CHECK_CUDA_SUCCESS("Freeing _vhhh",
+                      cuMemFree((CUdeviceptr)_vhhh));
+  _CHECK_CUDA_SUCCESS("Freeing _t_buffer",
+                      cuMemFree((CUdeviceptr)_t_buffer));
 #else
   free(_vhhh);
   free(_t_buffer);