5 changed files with 78 additions and 234 deletions
--- a/include/atrip/Operations.hpp
+++ b/include/atrip/Operations.hpp
@ -24,13 +24,20 @@ namespace acc {

  // cuda kernels

+  template <typename F>  // F: element type; must accept the `= {0}` initializer used below
+  __MAYBE_GLOBAL__
+  void zeroing(F* a, size_t n) {  // overwrites a[0], ..., a[n-1] with a zero-valued F
+    F zero = {0};
+    for (size_t i = 0; i < n; i++) {  // NOTE(review): loops over the full range without using any thread index; if launched as a multi-thread kernel every thread writes all n elements - confirm this is intended
+      a[i] = zero;
+    }
+  }
+
  ////
  template <typename F>
  __MAYBE_DEVICE__ __MAYBE_HOST__ __INLINE__
  F maybeConjugateScalar(const F &a) { return a; }

-  // TODO: instantiate for std::complex<double>
-
 #if defined(HAVE_CUDA)
  template <>
  __MAYBE_DEVICE__ __MAYBE_HOST__ __INLINE__
--- a/include/atrip/SliceUnion.hpp
+++ b/include/atrip/SliceUnion.hpp
@ -200,7 +200,7 @@ template <typename F=double>
                           : Slice<F>::Fetch
                           ;
          if (blank.info.state == Slice<F>::SelfSufficient) {
-#if defined(HAVE_CUDA) && !defined(ATRIP_SOURCES_IN_GPU)
+#if defined(HAVE_CUDA)
            const size_t _size = sizeof(F) * sliceSize;
            // TODO: this is code duplication with downstairs
            if (freePointers.size() == 0) {
@ -221,6 +221,7 @@ template <typename F=double>
                                                         (void*)SOURCES_DATA(sources[from.source]),
                                                         sizeof(F) * sliceSize));
                        ))
+
 #else
            blank.data = SOURCES_DATA(sources[from.source]);
 #endif
@ -387,22 +388,6 @@ template <typename F=double>
      }
    }

-    static size_t
-    getSize(const std::vector<size_t> sliceLength,
-            const std::vector<size_t> paramLength,
-            const size_t np,
-            const MPI_Comm global_world) {
-        const RankMap<F> rankMap(paramLength, np, global_world);
-        const size_t
-          nSources = rankMap.nSources(),
-          sliceSize = std::accumulate(sliceLength.begin(),
-                                      sliceLength.end(),
-                                      1UL,
-                                      std::multiplies<size_t>());
-        return nSources * sliceSize;
-      }
-
-
    // CONSTRUCTOR
    SliceUnion( std::vector<typename Slice<F>::Type> sliceTypes_
              , std::vector<size_t> sliceLength_
@ -593,11 +578,8 @@ template <typename F=double>
 #    error "You need CUDA aware MPI to have slices on the GPU"
 #  endif
        MPI_Irecv((void*)slice.data,
-#elif defined(HAVE_CUDA) && !defined(ATRIP_SOURCES_IN_GPU)
-        slice.mpi_data = (F*)malloc(sizeof(F) * slice.size);
-        MPI_Irecv(slice.mpi_data,
 #else
-        MPI_Irecv((void*)slice.data,
+        MPI_Irecv(slice.data,
 #endif
                  slice.size,
                  traits::mpi::datatypeOf<F>(),
--- a/src/atrip/Atrip.cxx
+++ b/src/atrip/Atrip.cxx
@ -235,54 +235,11 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
    MPI_Comm_size(child_comm, &child_size);
  }

-  // a, b, c, d, e, f and P => Nv
-  // H                      => No
-  // total_source_sizes contains a list of the number of elements
-  // in all sources of every tensor union, therefore nSlices * sliceSize
-  const std::vector<size_t> total_source_sizes = {
-    // ABPH
-    SliceUnion<F>::getSize({Nv, No}, {Nv, Nv}, (size_t)np, universe),
-    // ABHH
-    SliceUnion<F>::getSize({No, No}, {Nv, Nv}, (size_t)np, universe),
-    // TABHH
-    SliceUnion<F>::getSize({No, No}, {Nv, Nv}, (size_t)np, universe),
-    // TAPHH
-    SliceUnion<F>::getSize({Nv, No, No}, {Nv}, (size_t)np, universe),
-    // HHHA
-    SliceUnion<F>::getSize({No, No, No}, {Nv}, (size_t)np, universe),
-  };
-
-  const size_t
-    total_source_size = sizeof(DataFieldType<F>)
-                      * std::accumulate(total_source_sizes.begin(),
-                                        total_source_sizes.end(),
-                                        0UL);
-
-#if defined(HAVE_CUDA)
-    DataPtr<F> all_sources_pointer;
-    cuMemAlloc(&all_sources_pointer, total_source_size);
-#else
-    DataPtr<F>
-      all_sources_pointer = (DataPtr<F>)malloc(total_source_size);
-#endif
-  size_t _source_pointer_idx = 0;
-
  // BUILD SLICES PARAMETRIZED BY NV x NV =============================={{{1
  WITH_CHRONO("nv-nv-slices",
    LOG(0,"Atrip") << "building NV x NV slices\n";
-    // TODO
-    // DataPtr<F> offseted_pointer = all_sources_pointer
-    //                             * total_source_sizes[_source_pointer_idx++];
    ABPH<F> abph(*in.Vppph, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
-
-    // TODO
-    // DataPtr<F> offseted_pointer = all_sources_pointer
-    //                             * total_source_sizes[_source_pointer_idx++];
    ABHH<F> abhh(*in.Vpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
-
-    // TODO
-    // DataPtr<F> offseted_pointer = all_sources_pointer
-    //                             * total_source_sizes[_source_pointer_idx++];
    TABHH<F> tabhh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
  )

@ -294,13 +251,7 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
  // BUILD SLICES PARAMETRIZED BY NV ==================================={{{1
  WITH_CHRONO("nv-slices",
    LOG(0,"Atrip") << "building NV slices\n";
-    // TODO
-    // DataPtr<F> offseted_pointer = all_sources_pointer
-    //                             * total_source_sizes[_source_pointer_idx++];
    TAPHH<F> taphh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
-    // TODO
-    // DataPtr<F> offseted_pointer = all_sources_pointer
-    //                             * total_source_sizes[_source_pointer_idx++];
    HHHA<F>  hhha(*in.Vhhhp, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
  )

@ -952,5 +903,5 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
 }
 // instantiate
 template Atrip::Output Atrip::run(Atrip::Input<double> const& in);
-// template Atrip::Output Atrip::run(Atrip::Input<Complex> const& in);
+template Atrip::Output Atrip::run(Atrip::Input<Complex> const& in);
 // Main:1 ends here
--- a/src/atrip/Complex.cxx
+++ b/src/atrip/Complex.cxx
@ -21,6 +21,11 @@ namespace atrip {
  template <> double maybeConjugate(const double a) { return a; }
  template <> Complex maybeConjugate(const Complex a) { return std::conj(a); }

+#if defined(HAVE_CUDA)
+
+#endif
+
+
  namespace traits {
    template <typename F> bool isComplex() { return false; }
    template <> bool isComplex<double>() { return false; }
--- a/src/atrip/Equations.cxx
+++ b/src/atrip/Equations.cxx
@ -13,8 +13,6 @@
 // limitations under the License.

 // [[file:~/cuda/atrip/atrip.org::*Prolog][Prolog:2]]
-#include <cstring>
-
 #include<atrip/Equations.hpp>

 #include<atrip/CUDA.hpp>
@ -27,8 +25,11 @@ namespace atrip {

 #if defined(HAVE_CUDA)
 #define FOR_K()                                             \
-  const size_t k = blockIdx.x * blockDim.x + threadIdx.x; \
-  size_t idx = k*size*size;
+  for (size_t kmin = blockIdx.x * blockDim.x + threadIdx.x, \
+         k = kmin, /* each thread visits only k == kmin */  \
+         idx = kmin * size * size * size; /* NOTE(review): was k*size*size before; confirm the stride */ \
+       k < ((kmin < size) ? kmin + 1 : size);               \
+       k++) /* the ?: bound must stay parenthesized: ?: binds looser than <, otherwise the condition is always non-zero */
 #else
 #define FOR_K() for (size_t k=0, idx=0; k < size; k++)
 #endif
@ -101,7 +102,6 @@ namespace atrip {
 #  define MIN(a, b) std::min((a), (b))
 #endif

-#if defined(ATRIP_NEW_ENERGY)

 // [[file:~/cuda/atrip/atrip.org::*Energy][Energy:2]]
 template <typename F>
@ -250,131 +250,6 @@ void getEnergySame
 }
 // Energy:2 ends here

-#else
-
-// [[file:~/cuda/atrip/atrip.org::*Energy][Energy:2]]
-template <typename F>
-__MAYBE_GLOBAL__
-void getEnergyDistinct
-  ( F const epsabc
-  , size_t const No
-  , F* const epsi
-  , F* const Tijk
-  , F* const Zijk
-  , double* _energy
-  ) {
-  constexpr size_t blockSize=16;
-  F energy(0.);
-  for (size_t kk=0; kk<No; kk+=blockSize){
-    const size_t kend( MIN(No, kk+blockSize) );
-    for (size_t jj(kk); jj<No; jj+=blockSize){
-      const size_t jend( MIN( No, jj+blockSize) );
-      for (size_t ii(jj); ii<No; ii+=blockSize){
-        const size_t iend( MIN( No, ii+blockSize) );
-        for (size_t k(kk); k < kend; k++){
-          const F ek(epsi[k]);
-          const size_t jstart = jj > k ? jj : k;
-          for (size_t j(jstart); j < jend; j++){
-            F const ej(epsi[j]);
-            F const facjk = j == k ? F(0.5) : F(1.0);
-            size_t istart = ii > j ? ii : j;
-            for (size_t i(istart); i < iend; i++){
-              const F
-                  ei(epsi[i])
-                , facij = i == j ? F(0.5) : F(1.0)
-                , denominator(epsabc - ei - ej - ek)
-                , U(Zijk[i + No*j + No*No*k])
-                , V(Zijk[i + No*k + No*No*j])
-                , W(Zijk[j + No*i + No*No*k])
-                , X(Zijk[j + No*k + No*No*i])
-                , Y(Zijk[k + No*i + No*No*j])
-                , Z(Zijk[k + No*j + No*No*i])
-                , A(acc::maybeConjugateScalar<F>(Tijk[i + No*j + No*No*k]))
-                , B(acc::maybeConjugateScalar<F>(Tijk[i + No*k + No*No*j]))
-                , C(acc::maybeConjugateScalar<F>(Tijk[j + No*i + No*No*k]))
-                , D(acc::maybeConjugateScalar<F>(Tijk[j + No*k + No*No*i]))
-                , E(acc::maybeConjugateScalar<F>(Tijk[k + No*i + No*No*j]))
-                , _F(acc::maybeConjugateScalar<F>(Tijk[k + No*j + No*No*i]))
-                , value
-                  = 3.0 * ( A * U
-                            + B * V
-                            + C * W
-                            + D * X
-                            + E * Y
-                            + _F * Z )
-                 + ( ( U + X + Y )
-                   - 2.0 * ( V + W + Z )
-                   ) * ( A + D + E )
-                 + ( ( V + W + Z )
-                   - 2.0 * ( U + X + Y )
-                   ) * ( B + C + _F )
-                ;
-              energy += 2.0 * value / denominator * facjk * facij;
-            } // i
-          } // j
-        } // k
-      } // ii
-    } // jj
-  } // kk
-  *_energy = acc::real(energy);
-}
-
-
-template <typename F>
-__MAYBE_GLOBAL__
-void getEnergySame
-  ( F const epsabc
-  , size_t const No
-  , F* const epsi
-  , F* const Tijk
-  , F* const Zijk
-  , double* _energy
-  ) {
-  constexpr size_t blockSize = 16;
-  F energy = F(0.);
-  for (size_t kk=0; kk<No; kk+=blockSize){
-    const size_t kend( MIN( kk+blockSize, No) );
-    for (size_t jj(kk); jj<No; jj+=blockSize){
-      const size_t jend( MIN( jj+blockSize, No) );
-      for (size_t ii(jj); ii<No; ii+=blockSize){
-        const size_t iend( MIN( ii+blockSize, No) );
-        for (size_t k(kk); k < kend; k++){
-          const F ek(epsi[k]);
-          const size_t jstart = jj > k ? jj : k;
-          for(size_t j(jstart); j < jend; j++){
-            const F facjk( j == k ? F(0.5) : F(1.0));
-            const F ej(epsi[j]);
-            const size_t istart = ii > j ? ii : j;
-            for(size_t i(istart); i < iend; i++){
-              const F
-                ei(epsi[i])
-              , facij ( i==j ? F(0.5) : F(1.0))
-              , denominator(epsabc - ei - ej - ek)
-              , U(Zijk[i + No*j + No*No*k])
-              , V(Zijk[j + No*k + No*No*i])
-              , W(Zijk[k + No*i + No*No*j])
-              , A(acc::maybeConjugateScalar<F>(Tijk[i + No*j + No*No*k]))
-              , B(acc::maybeConjugateScalar<F>(Tijk[j + No*k + No*No*i]))
-              , C(acc::maybeConjugateScalar<F>(Tijk[k + No*i + No*No*j]))
-              , value
-                = F(3.0) * ( A * U
-                           + B * V
-                           + C * W
-                           )
-                - ( A + B + C ) * ( U + V + W )
-              ;
-              energy += F(2.0) * value / denominator * facjk * facij;
-            } // i
-          } // j
-        } // k
-      } // ii
-    } // jj
-  } // kk
-  *_energy = acc::real(energy);
-}
-// Energy:2 ends here
-#endif /* defined(ATRIP_NEW_ENERGY) */
-
 // [[file:~/cuda/atrip/atrip.org::*Energy][Energy:3]]
 // instantiate double
 template
@ -399,8 +274,6 @@ void getEnergySame
  , DataFieldType<double>* energy
  );

-// TODO: put this back in
-#if defined(ATRIP_WITH_COMPLEX)
 // instantiate Complex
 template
 __MAYBE_GLOBAL__
@ -424,7 +297,6 @@ void getEnergySame
  , DataFieldType<double>* energy
  );
 // Energy:3 ends here
-#endif

 // [[file:~/cuda/atrip/atrip.org::*Singles%20contribution][Singles contribution:2]]
  template <typename F> __MAYBE_GLOBAL__
@ -544,7 +416,7 @@ void getEnergySame
 #if defined(ATRIP_USE_DGEMM)
 #if defined(HAVE_CUDA)
 #define REORDER(__II, __JJ, __KK)               \
-  reorder<<<1, No>>>(reorder_proxy<           \
+  reorder<<<bs, ths>>>(reorder_proxy<           \
                       DataFieldType<F>,        \
                       __II ## __JJ ## __KK     \
                       >{},                     \
@ -582,7 +454,12 @@ void getEnergySame
                  )
 #define MAYBE_CONJ(_conj, _buffer)                                \
  do {                                                            \
-    acc::maybeConjugate<<<1, 1                                    \
+    acc::maybeConjugate<<<                                        \
+                                                                  \
+                            Atrip::kernelDimensions.ooo.blocks,   \
+                                                                  \
+                              Atrip::kernelDimensions.ooo.threads \
+                                                                  \
                        >>>((DataFieldType<F>*)_conj,             \
                            (DataFieldType<F>*)_buffer,           \
                            NoNoNo);                              \
@ -635,38 +512,60 @@ void getEnergySame
                  (int const*)&NoNo             \
                  )
 #define MAYBE_CONJ(_conj, _buffer)              \
-  acc::maybeConjugate((DataFieldType<F>*)_conj,  \
-                      (DataFieldType<F>*)_buffer,\
-                      NoNoNo);
+  do {                                          \
+    for (size_t __i = 0; __i < NoNoNo; ++__i) { \
+      (_conj)[__i]                              \
+        = maybeConjugate<F>((_buffer)[__i]);    \
+    }                                           \
+  } while (0) /* host path: element-wise _conj[i] = maybeConjugate<F>(_buffer[i]) over NoNoNo entries; arguments are parenthesized so expression arguments index correctly */
 #endif

    F one{1.0}, m_one{-1.0}, zero{0.0};
    const size_t NoNoNo = No*NoNo;
-
-// Zeroing vectors
 #ifdef HAVE_CUDA
-
+//    DataFieldType<F>* _t_buffer;
+//    DataFieldType<F>* _vhhh;
+//    WITH_CHRONO("double:cuda:alloc",
+//    _CHECK_CUDA_SUCCESS("Allocating _t_buffer",
+//                        cuMemAlloc((CUdeviceptr*)&_t_buffer,
+//                                   NoNoNo * sizeof(DataFieldType<F>)));
+//    _CHECK_CUDA_SUCCESS("Allocating _vhhh",
+//                        cuMemAlloc((CUdeviceptr*)&_vhhh,
+//                                   NoNoNo * sizeof(DataFieldType<F>)));
+//                )
 #if !defined(ATRIP_ONLY_DGEMM)
-    {
-    const size_t elements = NoNoNo * sizeof(DataFieldType<F>)/4;
-    WITH_CHRONO("double:zeroing",
-                _CHECK_CUDA_SUCCESS("Zeroing Tijk",
-                  cuMemsetD32_v2((CUdeviceptr)Tijk, 0x00, elements));
-                _CHECK_CUDA_SUCCESS("Zeroing t buffer",
-                  cuMemsetD32_v2((CUdeviceptr)_t_buffer, 0x00, elements));
-                _CHECK_CUDA_SUCCESS("Zeroing vhhh buffer",
-                  cuMemsetD32_v2((CUdeviceptr)_vhhh, 0x00, elements));
-                )
-    }
+    // Zero _t_buffer and _vhhh (NoNoNo elements each) with the acc::zeroing kernel.
+    const size_t
+      bs = Atrip::kernelDimensions.ooo.blocks,   // first launch parameter (number of blocks)
+      ths = Atrip::kernelDimensions.ooo.threads; // second launch parameter (threads per block)
+    acc::zeroing<<<bs, ths>>>((DataFieldType<F>*)_t_buffer, NoNoNo);
+    acc::zeroing<<<bs, ths>>>((DataFieldType<F>*)_vhhh, NoNoNo); // NOTE(review): neither launch is checked for errors - confirm this is acceptable
 #endif

 #else
    DataFieldType<F>* _t_buffer = (DataFieldType<F>*)malloc(NoNoNo * sizeof(F));
    DataFieldType<F>* _vhhh = (DataFieldType<F>*)malloc(NoNoNo * sizeof(F));
-    std::memset((void*)_t_buffer, 0x00, NoNoNo * sizeof(DataFieldType<F>));
-    std::memset((void*)_vhhh,     0x00, NoNoNo * sizeof(DataFieldType<F>));
-    std::memset((void*)Tijk,      0x00, NoNoNo * sizeof(DataFieldType<F>));
-#endif /* HAVE_CUDA */
+    DataFieldType<F> zero_h{0.0};
+    for (size_t i=0; i < NoNoNo; i++) {
+      _t_buffer[i] = zero_h;
+      _vhhh[i] = zero_h;
+    }
+#endif
+
+    // Set all NoNoNo elements of Tijk to zero: kernel launch with CUDA (skipped when ATRIP_ONLY_DGEMM is defined), plain host loop without CUDA.
+#if defined(HAVE_CUDA) && !defined(ATRIP_ONLY_DGEMM)
+    WITH_CHRONO("double:reorder",
+                acc::zeroing<<<bs, ths>>>((DataFieldType<F>*)Tijk,
+                                           NoNoNo);
+                )
+#endif /* NOTE(review): the zeroing is timed under "double:reorder" - confirm the label is intended */
+
+#if !defined(HAVE_CUDA)
+    WITH_CHRONO("double:reorder",
+      for (size_t k = 0; k < NoNoNo; k++) {
+        Tijk[k] = DataFieldType<F>{0.0};
+       })
+#endif /* !defined(HAVE_CUDA) */


 #if defined(ATRIP_ONLY_DGEMM)