diff --git a/.gitignore b/.gitignore index a9ca2cc..3ba04bb 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +*.swo +*.swp etc/autotools/ Makefile.in build diff --git a/configure.ac b/configure.ac index cc7293c..bfbfc33 100644 --- a/configure.ac +++ b/configure.ac @@ -21,6 +21,27 @@ AC_ARG_ENABLE(shared, files (default=YES)]), [], [enable_shared=yes]) +AC_ARG_ENABLE( + [slice], + [AS_HELP_STRING( + [--disable-slice], + [Disable the step of slicing tensors for CTF, this is useful for example for benchmarking or testing.])], + [atrip_dont_slice=1 + AC_DEFINE([ATRIP_DONT_SLICE],1,[Wether CTF will slice tensors or skip the step]) + ], + [atrip_dont_slice=0] +) + +AC_ARG_ENABLE( + [atrip_dgemm], + [AS_HELP_STRING( + [--disable-dgemm], + [Disable using dgemm for the doubles equations])], + [], + [AC_DEFINE([ATRIP_USE_DGEMM],1,[Use dgemm for the doubles equations])] +) + + AC_ARG_ENABLE([docs], [AS_HELP_STRING([--enable-docs], [Enable building docs])], @@ -28,7 +49,7 @@ AC_ARG_ENABLE([docs], dnl LIBGC library options AC_ARG_WITH(libctf-prefix, - AS_HELP_STRING([--with-libctf-prefix=path], + AS_HELP_STRING([--with-ctf], [prefix for CTF includes and libraries] ), [LIBCTF_PATH="`readlink -f $withval`"; LIBCTF_CPATH="`readlink -f $withval`/include"; @@ -44,11 +65,6 @@ AC_ARG_WITH([clang-check], [clang_check=NO]) AM_CONDITIONAL([WITH_CLANG_CHECK], [test x${clang_check} = xYES]) -AC_ARG_WITH(dgemm, - AS_HELP_STRING([--without-dgemm], [Disable dgemm]), - [with_dgemm=NO], - [with_dgemm=YES]) - AC_ARG_ENABLE([cuda], [AS_HELP_STRING([--enable-cuda], [Build with cuda])], @@ -59,6 +75,12 @@ AC_ARG_VAR([CUDA_LDFLAGS], [LDFLAGS to find libraries -lcuda, -lcudart, -lcublas AC_ARG_VAR([CUDA_CXXFLAGS], [CXXFLAGS to find the CUDA headers]) +AC_ARG_WITH([atrip-debug], + [AS_HELP_STRING([--with-atrip-debug], + [Debug level for atrip, possible values: 1, 2, 3, 4])], + [AC_DEFINE([ATRIP_DEBUG],[atrip-debug],[Atrip debug level])], + [AC_DEFINE([ATRIP_DEBUG],[1],[Atrip debug level])] + ) dnl ----------------------------------------------------------------------- diff --git a/include/atrip.hpp b/include/atrip.hpp index ccf93e9..542b517 100644 --- a/include/atrip.hpp +++ b/include/atrip.hpp @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -// [[file:../atrip.org::*Include header][Include header:1]] +// [[file:~/cuda/atrip/atrip.org::*Include%20header][Include header:1]] #pragma once #include diff --git a/include/atrip/Atrip.hpp b/include/atrip/Atrip.hpp index 7a94208..338b686 100644 --- a/include/atrip/Atrip.hpp +++ b/include/atrip/Atrip.hpp @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -// [[file:../../atrip.org::*Header][Header:1]] +// [[file:~/cuda/atrip/atrip.org::*Header][Header:1]] #pragma once #include #include @@ -41,12 +41,21 @@ namespace atrip { struct Atrip { - static int rank; - static int np; + static size_t rank; + static size_t np; static MPI_Comm communicator; static Timings chrono; +#if defined(HAVE_CUDA) + struct CudaContext { + cublasStatus_t status; + cublasHandle_t handle; + }; + static CudaContext cuda; +#endif + static void init(MPI_Comm); + template struct Input { CTF::Tensor *ei = nullptr diff --git a/include/atrip/Blas.hpp b/include/atrip/Blas.hpp index 93f822d..117eb26 100644 --- a/include/atrip/Blas.hpp +++ b/include/atrip/Blas.hpp @@ -12,12 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -// [[file:../../atrip.org::*Blas][Blas:1]] +// [[file:~/cuda/atrip/atrip.org::*Blas][Blas:1]] #pragma once + +#include +#include +#include "config.h" + namespace atrip { - using Complex = std::complex; - +#if !defined(HAVE_CUDA) extern "C" { void dgemm_( const char *transa, @@ -50,49 +54,43 @@ namespace atrip { Complex *C, const int *ldc ); + + void dcopy_(const int n, + const double *x, + const int incx, + double *y, + const int incy); + + void zcopy_(const int n, + const void *x, + const int incx, + void *y, + const int incy); + + } +#endif + template + void xcopy(const int n, + const DataFieldType* x, + const int incx, + DataFieldType* y, + const int incy); - template + template void xgemm(const char *transa, const char *transb, const int *m, const int *n, const int *k, F *alpha, - const F *A, + const DataFieldType *A, const int *lda, - const F *B, + const DataFieldType *B, const int *ldb, F *beta, - F *C, - const int *ldc) { - dgemm_(transa, transb, - m, n, k, - alpha, A, lda, - B, ldb, beta, - C, ldc); - } - - template <> - void xgemm(const char *transa, - const char *transb, - const int *m, - const int *n, - const int *k, - Complex *alpha, - const Complex *A, - const int *lda, - const Complex *B, - const int *ldb, - Complex *beta, - Complex *C, - const int *ldc) { - zgemm_(transa, transb, - m, n, k, - alpha, A, lda, - B, ldb, beta, - C, ldc); - } + DataFieldType *C, + const int *ldc); } // Blas:1 ends here diff --git a/include/atrip/CUDA.hpp b/include/atrip/CUDA.hpp new file mode 100644 index 0000000..9610986 --- /dev/null +++ b/include/atrip/CUDA.hpp @@ -0,0 +1,9 @@ +#pragma once + +#if defined(HAVE_CUDA) && defined(__CUDACC__) +# define __MAYBE_GLOBAL__ __global__ +# define __MAYBE_DEVICE__ __device__ +#else +# define __MAYBE_GLOBAL__ +# define __MAYBE_DEVICE__ +#endif diff --git a/include/atrip/Checkpoint.hpp b/include/atrip/Checkpoint.hpp index 16243a2..df9e0ec 100644 --- a/include/atrip/Checkpoint.hpp +++ b/include/atrip/Checkpoint.hpp @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -// [[file:../../atrip.org::*Prolog][Prolog:1]] +// [[file:~/cuda/atrip/atrip.org::*Prolog][Prolog:1]] #pragma once #include #include @@ -22,7 +22,7 @@ namespace atrip { // Prolog:1 ends here -// [[file:../../atrip.org::checkpoint-definition][checkpoint-definition]] +// [[file:~/cuda/atrip/atrip.org::checkpoint-definition][checkpoint-definition]] // template struct Checkpoint { size_t no, nv; @@ -36,7 +36,7 @@ struct Checkpoint { }; // checkpoint-definition ends here -// [[file:../../atrip.org::*Input and output][Input and output:1]] +// [[file:~/cuda/atrip/atrip.org::*Input%20and%20output][Input and output:1]] void write_checkpoint(Checkpoint const& c, std::string const& filepath) { std::ofstream out(filepath); out << "No: " << c.no @@ -87,6 +87,6 @@ Checkpoint read_checkpoint(std::string const& filepath) { } // Input and output:1 ends here -// [[file:../../atrip.org::*Epilog][Epilog:1]] +// [[file:~/cuda/atrip/atrip.org::*Epilog][Epilog:1]] } // Epilog:1 ends here diff --git a/include/atrip/Complex.hpp b/include/atrip/Complex.hpp new file mode 100644 index 0000000..946d5e9 --- /dev/null +++ b/include/atrip/Complex.hpp @@ -0,0 +1,46 @@ +// Copyright 2022 Alejandro Gallo +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// [[file:~/cuda/atrip/atrip.org::*Complex%20numbers][Complex numbers:1]] +#pragma once + +#include +#include +#include "config.h" +#if defined(HAVE_CUDA) +#include +#endif + +namespace atrip { + + using Complex = std::complex; + + template F maybeConjugate(const F); + +#if defined(HAVE_CUDA) + cuDoubleComplex& operator+=(cuDoubleComplex& lz, cuDoubleComplex const& rz); +#endif + + namespace traits { + + template bool isComplex(); + + namespace mpi { + template MPI_Datatype datatypeOf(void); + } + + } + +} +// Complex numbers:1 ends here diff --git a/include/atrip/Debug.hpp b/include/atrip/Debug.hpp index 4fcf0b7..bf79b51 100644 --- a/include/atrip/Debug.hpp +++ b/include/atrip/Debug.hpp @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -// [[file:../../atrip.org::*Macros][Macros:1]] +// [[file:~/cuda/atrip/atrip.org::*Macros][Macros:1]] #pragma once #include #define ATRIP_BENCHMARK @@ -21,7 +21,6 @@ # define ATRIP_DEBUG 1 #endif //#define ATRIP_WORKLOAD_DUMP -#define ATRIP_USE_DGEMM //#define ATRIP_PRINT_TUPLES #ifndef ATRIP_DEBUG @@ -75,20 +74,20 @@ #endif // Macros:1 ends here -// [[file:../../atrip.org::*Macros][Macros:2]] +// [[file:~/cuda/atrip/atrip.org::*Macros][Macros:2]] #ifndef LOG #define LOG(level, name) if (Atrip::rank == 0) std::cout << name << ": " #endif // Macros:2 ends here -// [[file:../../atrip.org::*Macros][Macros:3]] +// [[file:~/cuda/atrip/atrip.org::*Macros][Macros:3]] #ifdef ATRIP_NO_OUTPUT # undef LOG # define LOG(level, name) if (false) std::cout << name << ": " #endif // Macros:3 ends here -// [[file:../../atrip.org::IterationDescriptor][IterationDescriptor]] +// [[file:~/cuda/atrip/atrip.org::IterationDescriptor][IterationDescriptor]] namespace atrip { struct IterationDescription; diff --git a/include/atrip/Equations.hpp b/include/atrip/Equations.hpp index abadbc2..fbee04c 100644 --- a/include/atrip/Equations.hpp +++ b/include/atrip/Equations.hpp @@ -12,371 +12,94 @@ // See the License for the specific language governing permissions and // limitations under the License. -// [[file:../../atrip.org::*Equations][Equations:1]] +// [[file:~/cuda/atrip/atrip.org::*Prolog][Prolog:1]] #pragma once -#include +#include #include +#include + +#if defined(HAVE_CUDA) +#include +#endif + namespace atrip { +using ABCTuple = std::array; +using PartialTuple = std::array; +using ABCTuples = std::vector; +// Prolog:1 ends here - template - double getEnergyDistinct - ( const F epsabc - , std::vector const& epsi - , std::vector const& Tijk_ - , std::vector const& Zijk_ - ) { - constexpr size_t blockSize=16; - F energy(0.); - const size_t No = epsi.size(); - for (size_t kk=0; kk k ? jj : k; - for (size_t j(jstart); j < jend; j++){ - F const ej(epsi[j]); - F const facjk = j == k ? F(0.5) : F(1.0); - size_t istart = ii > j ? ii : j; - for (size_t i(istart); i < iend; i++){ - const F - ei(epsi[i]) - , facij = i == j ? F(0.5) : F(1.0) - , denominator(epsabc - ei - ej - ek) - , U(Zijk_[i + No*j + No*No*k]) - , V(Zijk_[i + No*k + No*No*j]) - , W(Zijk_[j + No*i + No*No*k]) - , X(Zijk_[j + No*k + No*No*i]) - , Y(Zijk_[k + No*i + No*No*j]) - , Z(Zijk_[k + No*j + No*No*i]) - , A(maybeConjugate(Tijk_[i + No*j + No*No*k])) - , B(maybeConjugate(Tijk_[i + No*k + No*No*j])) - , C(maybeConjugate(Tijk_[j + No*i + No*No*k])) - , D(maybeConjugate(Tijk_[j + No*k + No*No*i])) - , E(maybeConjugate(Tijk_[k + No*i + No*No*j])) - , _F(maybeConjugate(Tijk_[k + No*j + No*No*i])) - , value - = 3.0 * ( A * U - + B * V - + C * W - + D * X - + E * Y - + _F * Z ) - + ( ( U + X + Y ) - - 2.0 * ( V + W + Z ) - ) * ( A + D + E ) - + ( ( V + W + Z ) - - 2.0 * ( U + X + Y ) - ) * ( B + C + _F ) - ; - energy += 2.0 * value / denominator * facjk * facij; - } // i - } // j - } // k - } // ii - } // jj - } // kk - return std::real(energy); - } +// [[file:~/cuda/atrip/atrip.org::*Energy][Energy:1]] +template +double getEnergyDistinct + ( F const epsabc + , size_t const No + , F* const epsi + , F* const Tijk + , F* const Zijk + ); +template +double getEnergySame + ( F const epsabc + , size_t const No + , F* const epsi + , F* const Tijk + , F* const Zijk + ); +// Energy:1 ends here - template - double getEnergySame - ( const F epsabc - , std::vector const& epsi - , std::vector const& Tijk_ - , std::vector const& Zijk_ - ) { - constexpr size_t blockSize = 16; - const size_t No = epsi.size(); - F energy = F(0.); - for (size_t kk=0; kk k ? jj : k; - for(size_t j(jstart); j < jend; j++){ - const F facjk( j == k ? F(0.5) : F(1.0)); - const F ej(epsi[j]); - const size_t istart = ii > j ? ii : j; - for(size_t i(istart); i < iend; i++){ - const F - ei(epsi[i]) - , facij ( i==j ? F(0.5) : F(1.0)) - , denominator(epsabc - ei - ej - ek) - , U(Zijk_[i + No*j + No*No*k]) - , V(Zijk_[j + No*k + No*No*i]) - , W(Zijk_[k + No*i + No*No*j]) - , A(maybeConjugate(Tijk_[i + No*j + No*No*k])) - , B(maybeConjugate(Tijk_[j + No*k + No*No*i])) - , C(maybeConjugate(Tijk_[k + No*i + No*No*j])) - , value - = F(3.0) * ( A * U - + B * V - + C * W - ) - - ( A + B + C ) * ( U + V + W ) - ; - energy += F(2.0) * value / denominator * facjk * facij; - } // i - } // j - } // k - } // ii - } // jj - } // kk - return std::real(energy); - } - - template - void singlesContribution - ( size_t No - , size_t Nv - , const ABCTuple &abc - , F const* Tph - , F const* VABij - , F const* VACij - , F const* VBCij - , F *Zijk - ) { - const size_t a(abc[0]), b(abc[1]), c(abc[2]); - for (size_t k=0; k < No; k++) - for (size_t i=0; i < No; i++) - for (size_t j=0; j < No; j++) { - const size_t ijk = i + j*No + k*No*No - , jk = j + No * k - ; - Zijk[ijk] += Tph[ a + i * Nv ] * VBCij[ j + k * No ]; - Zijk[ijk] += Tph[ b + j * Nv ] * VACij[ i + k * No ]; - Zijk[ijk] += Tph[ c + k * Nv ] * VABij[ i + j * No ]; - } - } +// [[file:~/cuda/atrip/atrip.org::*Singles%20contribution][Singles contribution:1]] +template +#ifdef HAVE_CUDA +__global__ +#endif +void singlesContribution + ( size_t No + , size_t Nv + , size_t a + , size_t b + , size_t c + , DataFieldType* const Tph + , DataFieldType* const VABij + , DataFieldType* const VACij + , DataFieldType* const VBCij + , DataFieldType* Zijk + ); +// Singles contribution:1 ends here +// [[file:~/cuda/atrip/atrip.org::*Doubles%20contribution][Doubles contribution:1]] template void doublesContribution ( const ABCTuple &abc , size_t const No , size_t const Nv // -- VABCI - , F const* VABph - , F const* VACph - , F const* VBCph - , F const* VBAph - , F const* VCAph - , F const* VCBph + , DataPtr const VABph + , DataPtr const VACph + , DataPtr const VBCph + , DataPtr const VBAph + , DataPtr const VCAph + , DataPtr const VCBph // -- VHHHA - , F const* VhhhA - , F const* VhhhB - , F const* VhhhC + , DataPtr const VhhhA + , DataPtr const VhhhB + , DataPtr const VhhhC // -- TA - , F const* TAphh - , F const* TBphh - , F const* TCphh + , DataPtr const TAphh + , DataPtr const TBphh + , DataPtr const TCphh // -- TABIJ - , F const* TABhh - , F const* TAChh - , F const* TBChh + , DataPtr const TABhh + , DataPtr const TAChh + , DataPtr const TBChh // -- TIJK - , F *Tijk - ) { - - const size_t a = abc[0], b = abc[1], c = abc[2] - , NoNo = No*No, NoNv = No*Nv - ; - - #if defined(ATRIP_USE_DGEMM) - #define _IJK_(i, j, k) i + j*No + k*NoNo - #define REORDER(__II, __JJ, __KK) \ - WITH_CHRONO("doubles:reorder", \ - for (size_t k = 0; k < No; k++) \ - for (size_t j = 0; j < No; j++) \ - for (size_t i = 0; i < No; i++) { \ - Tijk[_IJK_(i, j, k)] += _t_buffer[_IJK_(__II, __JJ, __KK)]; \ - } \ - ) - #define DGEMM_PARTICLES(__A, __B) \ - atrip::xgemm( "T" \ - , "N" \ - , (int const*)&NoNo \ - , (int const*)&No \ - , (int const*)&Nv \ - , &one \ - , __A \ - , (int const*)&Nv \ - , __B \ - , (int const*)&Nv \ - , &zero \ - , _t_buffer.data() \ - , (int const*)&NoNo \ - ); - #define DGEMM_HOLES(__A, __B, __TRANSB) \ - atrip::xgemm( "N" \ - , __TRANSB \ - , (int const*)&NoNo \ - , (int const*)&No \ - , (int const*)&No \ - , &m_one \ - , __A \ - , (int const*)&NoNo \ - , __B \ - , (int const*)&No \ - , &zero \ - , _t_buffer.data() \ - , (int const*)&NoNo \ - ); - #define MAYBE_CONJ(_conj, _buffer) \ - for (size_t __i = 0; __i < NoNoNo; ++__i) \ - _conj[__i] = maybeConjugate(_buffer[__i]); \ - - const size_t NoNoNo = No*NoNo; - std::vector _t_buffer; - _t_buffer.reserve(NoNoNo); - F one{1.0}, m_one{-1.0}, zero{0.0}; - - WITH_CHRONO("double:reorder", - for (size_t k = 0; k < NoNoNo; k++) { - Tijk[k] = 0.0; - }) - - // TOMERGE: replace chronos - WITH_CHRONO("doubles:holes", - { // Holes part %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - - std::vector _vhhh(NoNoNo); - - // VhhhC[i + k*No + L*NoNo] * TABhh[L + j*No]; H1 - MAYBE_CONJ(_vhhh, VhhhC) - WITH_CHRONO("doubles:holes:1", - DGEMM_HOLES(_vhhh.data(), TABhh, "N") - REORDER(i, k, j) - ) - // VhhhC[j + k*No + L*NoNo] * TABhh[i + L*No]; H0 - WITH_CHRONO("doubles:holes:2", - DGEMM_HOLES(_vhhh.data(), TABhh, "T") - REORDER(j, k, i) - ) - - // VhhhB[i + j*No + L*NoNo] * TAChh[L + k*No]; H5 - MAYBE_CONJ(_vhhh, VhhhB) - WITH_CHRONO("doubles:holes:3", - DGEMM_HOLES(_vhhh.data(), TAChh, "N") - REORDER(i, j, k) - ) - // VhhhB[k + j*No + L*NoNo] * TAChh[i + L*No]; H3 - WITH_CHRONO("doubles:holes:4", - DGEMM_HOLES(_vhhh.data(), TAChh, "T") - REORDER(k, j, i) - ) - - // VhhhA[j + i*No + L*NoNo] * TBChh[L + k*No]; H1 - MAYBE_CONJ(_vhhh, VhhhA) - WITH_CHRONO("doubles:holes:5", - DGEMM_HOLES(_vhhh.data(), TBChh, "N") - REORDER(j, i, k) - ) - // VhhhA[k + i*No + L*NoNo] * TBChh[j + L*No]; H4 - WITH_CHRONO("doubles:holes:6", - DGEMM_HOLES(_vhhh.data(), TBChh, "T") - REORDER(k, i, j) - ) - - } - ) - #undef MAYBE_CONJ - - WITH_CHRONO("doubles:particles", - { // Particle part %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - // TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv]; P0 - WITH_CHRONO("doubles:particles:1", - DGEMM_PARTICLES(TAphh, VBCph) - REORDER(i, j, k) - ) - // TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv]; P3 - WITH_CHRONO("doubles:particles:2", - DGEMM_PARTICLES(TAphh, VCBph) - REORDER(i, k, j) - ) - // TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv]; P5 - WITH_CHRONO("doubles:particles:3", - DGEMM_PARTICLES(TCphh, VABph) - REORDER(k, i, j) - ) - // TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv]; P2 - WITH_CHRONO("doubles:particles:4", - DGEMM_PARTICLES(TCphh, VBAph) - REORDER(k, j, i) - ) - // TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv]; P1 - WITH_CHRONO("doubles:particles:5", - DGEMM_PARTICLES(TBphh, VACph) - REORDER(j, i, k) - ) - // TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv]; P4 - WITH_CHRONO("doubles:particles:6", - DGEMM_PARTICLES(TBphh, VCAph) - REORDER(j, k, i) - ) - } - ) - - #undef REORDER - #undef DGEMM_HOLES - #undef DGEMM_PARTICLES - #undef _IJK_ - #else - for (size_t k = 0; k < No; k++) - for (size_t j = 0; j < No; j++) - for (size_t i = 0; i < No; i++){ - const size_t ijk = i + j*No + k*NoNo - , jk = j + k*No - ; - Tijk[ijk] = 0.0; // :important - // HOLE DIAGRAMS: TABHH and VHHHA - for (size_t L = 0; L < No; L++){ - // t[abLj] * V[Lcik] H1 - // t[baLi] * V[Lcjk] H0 TODO: conjugate T for complex - Tijk[ijk] -= TABhh[L + j*No] * VhhhC[i + k*No + L*NoNo]; - Tijk[ijk] -= TABhh[i + L*No] * VhhhC[j + k*No + L*NoNo]; - - // t[acLk] * V[Lbij] H5 - // t[caLi] * V[Lbkj] H3 - Tijk[ijk] -= TAChh[L + k*No] * VhhhB[i + j*No + L*NoNo]; - Tijk[ijk] -= TAChh[i + L*No] * VhhhB[k + j*No + L*NoNo]; - - // t[bcLk] * V[Laji] H2 - // t[cbLj] * V[Laki] H4 - Tijk[ijk] -= TBChh[L + k*No] * VhhhA[j + i*No + L*NoNo]; - Tijk[ijk] -= TBChh[j + L*No] * VhhhA[k + i*No + L*NoNo]; - } - // PARTILCE DIAGRAMS: TAPHH and VABPH - for (size_t E = 0; E < Nv; E++) { - // t[aEij] * V[bcEk] P0 - // t[aEik] * V[cbEj] P3 // TODO: CHECK THIS ONE, I DONT KNOW - Tijk[ijk] += TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv]; - Tijk[ijk] += TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv]; - - // t[cEki] * V[abEj] P5 - // t[cEkj] * V[baEi] P2 - Tijk[ijk] += TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv]; - Tijk[ijk] += TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv]; - - // t[bEji] * V[acEk] P1 - // t[bEjk] * V[caEi] P4 - Tijk[ijk] += TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv]; - Tijk[ijk] += TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv]; - } - - } -#endif - } + // , DataPtr Tijk + , DataFieldType* Tijk_ + ); +// Doubles contribution:1 ends here +// [[file:~/cuda/atrip/atrip.org::*Epilog][Epilog:1]] } -// Equations:1 ends here +// Epilog:1 ends here diff --git a/include/atrip/RankMap.hpp b/include/atrip/RankMap.hpp index a13013e..3cd689d 100644 --- a/include/atrip/RankMap.hpp +++ b/include/atrip/RankMap.hpp @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -// [[file:../../atrip.org::*The rank mapping][The rank mapping:1]] +// [[file:~/cuda/atrip/atrip.org::*The%20rank%20mapping][The rank mapping:1]] #pragma once #include @@ -65,7 +65,8 @@ namespace atrip { ; } - bool isSourcePadding(size_t rank, size_t source) const noexcept { + bool isSourcePadding(const size_t rank, const size_t source) + const noexcept { return source == nSources() && isPaddingRank(rank); } diff --git a/include/atrip/Slice.hpp b/include/atrip/Slice.hpp index ff6d982..ed0d584 100644 --- a/include/atrip/Slice.hpp +++ b/include/atrip/Slice.hpp @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -// [[file:../../atrip.org::*Prolog][Prolog:1]] +// [[file:~/cuda/atrip/atrip.org::*Prolog][Prolog:1]] #pragma once #include #include @@ -21,33 +21,20 @@ #include #include -#include namespace atrip { -template FF maybeConjugate(const FF a) { return a; } -template <> Complex maybeConjugate(const Complex a) { return std::conj(a); } - -namespace traits { - template bool isComplex() { return false; } - template <> bool isComplex() { return true; } -namespace mpi { - template MPI_Datatype datatypeOf(void); - template <> MPI_Datatype datatypeOf() { return MPI_DOUBLE; } - template <> MPI_Datatype datatypeOf() { return MPI_DOUBLE_COMPLEX; } -} -} template struct Slice { // Prolog:1 ends here -// [[file:../../atrip.org::*Location][Location:1]] +// [[file:~/cuda/atrip/atrip.org::*Location][Location:1]] struct Location { size_t rank; size_t source; }; // Location:1 ends here -// [[file:../../atrip.org::*Type][Type:1]] +// [[file:~/cuda/atrip/atrip.org::*Type][Type:1]] enum Type { A = 10 , B @@ -65,7 +52,7 @@ enum Type }; // Type:1 ends here -// [[file:../../atrip.org::*State][State:1]] +// [[file:~/cuda/atrip/atrip.org::*State][State:1]] enum State { Fetch = 0, Dispatched = 2, @@ -76,7 +63,7 @@ enum State { }; // State:1 ends here -// [[file:../../atrip.org::*The Info structure][The Info structure:1]] +// [[file:~/cuda/atrip/atrip.org::*The%20Info%20structure][The Info structure:1]] struct Info { // which part of a,b,c the slice holds PartialTuple tuple; @@ -100,7 +87,7 @@ struct Info { using Ty_x_Tu = std::pair< Type, PartialTuple >; // The Info structure:1 ends here -// [[file:../../atrip.org::*Name][Name:1]] +// [[file:~/cuda/atrip/atrip.org::*Name][Name:1]] enum Name { TA = 100 , VIJKA = 101 @@ -110,19 +97,19 @@ enum Name }; // Name:1 ends here -// [[file:../../atrip.org::*Database][Database:1]] +// [[file:~/cuda/atrip/atrip.org::*Database][Database:1]] struct LocalDatabaseElement { Slice::Name name; Slice::Info info; }; // Database:1 ends here -// [[file:../../atrip.org::*Database][Database:2]] +// [[file:~/cuda/atrip/atrip.org::*Database][Database:2]] using LocalDatabase = std::vector; using Database = LocalDatabase; // Database:2 ends here -// [[file:../../atrip.org::*MPI Types][MPI Types:1]] +// [[file:~/cuda/atrip/atrip.org::*MPI%20Types][MPI Types:1]] struct mpi { static MPI_Datatype vector(size_t n, MPI_Datatype const& DT) { @@ -228,7 +215,7 @@ struct mpi { }; // MPI Types:1 ends here -// [[file:../../atrip.org::*Static utilities][Static utilities:1]] +// [[file:~/cuda/atrip/atrip.org::*Static%20utilities][Static utilities:1]] static PartialTuple subtupleBySlice(ABCTuple abc, Type sliceType) { switch (sliceType) { @@ -246,7 +233,7 @@ PartialTuple subtupleBySlice(ABCTuple abc, Type sliceType) { } // Static utilities:1 ends here -// [[file:../../atrip.org::*Static utilities][Static utilities:2]] +// [[file:~/cuda/atrip/atrip.org::*Static%20utilities][Static utilities:2]] static std::vector*> hasRecycledReferencingToIt ( std::vector> &slices , Info const& info @@ -263,7 +250,7 @@ static std::vector*> hasRecycledReferencingToIt } // Static utilities:2 ends here -// [[file:../../atrip.org::*Static utilities][Static utilities:3]] +// [[file:~/cuda/atrip/atrip.org::*Static%20utilities][Static utilities:3]] static Slice& findOneByType(std::vector> &slices, Slice::Type type) { const auto sliceIt = std::find_if(slices.begin(), slices.end(), @@ -279,7 +266,7 @@ static Slice& findOneByType(std::vector> &slices, Slice::Type typ } // Static utilities:3 ends here -// [[file:../../atrip.org::*Static utilities][Static utilities:4]] +// [[file:~/cuda/atrip/atrip.org::*Static%20utilities][Static utilities:4]] static Slice& findRecycledSource (std::vector> &slices, Slice::Info info) { const auto sliceIt @@ -305,7 +292,7 @@ findRecycledSource (std::vector> &slices, Slice::Info info) { } // Static utilities:4 ends here -// [[file:../../atrip.org::*Static utilities][Static utilities:5]] +// [[file:~/cuda/atrip/atrip.org::*Static%20utilities][Static utilities:5]] static Slice& findByTypeAbc ( std::vector> &slices , Slice::Type type @@ -335,7 +322,7 @@ static Slice& findByTypeAbc } // Static utilities:5 ends here -// [[file:../../atrip.org::*Static utilities][Static utilities:6]] +// [[file:~/cuda/atrip/atrip.org::*Static%20utilities][Static utilities:6]] static Slice& findByInfo(std::vector> &slices, Slice::Info const& info) { const auto sliceIt @@ -358,30 +345,33 @@ static Slice& findByInfo(std::vector> &slices, } // Static utilities:6 ends here -// [[file:../../atrip.org::*Attributes][Attributes:1]] +// [[file:~/cuda/atrip/atrip.org::*Attributes][Attributes:1]] Info info; // Attributes:1 ends here -// [[file:../../atrip.org::*Attributes][Attributes:2]] -F *data; +// [[file:~/cuda/atrip/atrip.org::*Attributes][Attributes:2]] +DataPtr data; +#if defined(HAVE_CUDA) + F* mpi_data; +#endif // Attributes:2 ends here -// [[file:../../atrip.org::*Attributes][Attributes:3]] +// [[file:~/cuda/atrip/atrip.org::*Attributes][Attributes:3]] MPI_Request request; // Attributes:3 ends here -// [[file:../../atrip.org::*Attributes][Attributes:4]] +// [[file:~/cuda/atrip/atrip.org::*Attributes][Attributes:4]] const size_t size; // Attributes:4 ends here -// [[file:../../atrip.org::*Member functions][Member functions:1]] +// [[file:~/cuda/atrip/atrip.org::*Member%20functions][Member functions:1]] void markReady() noexcept { info.state = Ready; info.recycling = Blank; } // Member functions:1 ends here -// [[file:../../atrip.org::*Member functions][Member functions:2]] +// [[file:~/cuda/atrip/atrip.org::*Member%20functions][Member functions:2]] bool isUnwrapped() const noexcept { return info.state == Ready || info.state == SelfSufficient @@ -389,7 +379,7 @@ bool isUnwrapped() const noexcept { } // Member functions:2 ends here -// [[file:../../atrip.org::*Member functions][Member functions:3]] +// [[file:~/cuda/atrip/atrip.org::*Member%20functions][Member functions:3]] bool isUnwrappable() const noexcept { return isUnwrapped() || info.state == Recycled @@ -407,7 +397,7 @@ void free() noexcept { info.state = Acceptor; info.from = {0, 0}; info.recycling = Blank; - data = nullptr; + data = DataNullPtr; } inline bool isFree() const noexcept { @@ -417,12 +407,12 @@ inline bool isFree() const noexcept { && info.from.rank == 0 && info.from.source == 0 && info.recycling == Blank - && data == nullptr + && data == DataNullPtr ; } // Member functions:3 ends here -// [[file:../../atrip.org::*Member functions][Member functions:4]] +// [[file:~/cuda/atrip/atrip.org::*Member%20functions][Member functions:4]] inline bool isRecyclable() const noexcept { return ( info.state == Dispatched || info.state == Ready @@ -433,16 +423,16 @@ inline bool isRecyclable() const noexcept { } // Member functions:4 ends here -// [[file:../../atrip.org::*Member functions][Member functions:5]] +// [[file:~/cuda/atrip/atrip.org::*Member%20functions][Member functions:5]] inline bool hasValidDataPointer() const noexcept { - return data != nullptr + return data != DataNullPtr && info.state != Acceptor && info.type != Blank ; } // Member functions:5 ends here -// [[file:../../atrip.org::*Member functions][Member functions:6]] +// [[file:~/cuda/atrip/atrip.org::*Member%20functions][Member functions:6]] void unwrapAndMarkReady() { if (info.state == Ready) return; if (info.state != Dispatched) @@ -454,9 +444,17 @@ void unwrapAndMarkReady() { WITH_RANK << "__slice__:mpi: waiting " << "\n"; #endif const int errorCode = MPI_Wait(&request, &status); + if (MPI_SUCCESS != MPI_Request_free(&request)) + throw "Error freeing MPI request"; if (errorCode != MPI_SUCCESS) throw "MPI ERROR HAPPENED...."; +#if defined(HAVE_CUDA) + // copy the retrieved mpi data to the device + cuMemcpyHtoD(data, (void*)mpi_data, sizeof(F) * size); + std::free(mpi_data); +#endif + #ifdef HAVE_OCD char errorString[MPI_MAX_ERROR_STRING]; int errorSize; @@ -474,18 +472,21 @@ void unwrapAndMarkReady() { } // Member functions:6 ends here -// [[file:../../atrip.org::*Epilog][Epilog:1]] +// [[file:~/cuda/atrip/atrip.org::*Epilog][Epilog:1]] Slice(size_t size_) - : info({}) - , data(nullptr) - , size(size_) - {} + : info({}) + , data(DataNullPtr) +#if defined(HAVE_CUDA) + , mpi_data(nullptr) +#endif + , size(size_) + {} -}; // struct Slice + }; // struct Slice // Epilog:1 ends here -// [[file:../../atrip.org::*Debug][Debug:1]] +// [[file:~/cuda/atrip/atrip.org::*Debug][Debug:1]] template std::ostream& operator<<(std::ostream& out, typename Slice::Location const& v) { // TODO: remove me diff --git a/include/atrip/SliceUnion.hpp b/include/atrip/SliceUnion.hpp index c3b3d27..0403a33 100644 --- a/include/atrip/SliceUnion.hpp +++ b/include/atrip/SliceUnion.hpp @@ -12,16 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. -// [[file:../../atrip.org::*The slice union][The slice union:1]] +// [[file:~/cuda/atrip/atrip.org::*Prolog][Prolog:1]] #pragma once #include #include #include namespace atrip { +// Prolog:1 ends here - template - struct SliceUnion { +// [[file:~/cuda/atrip/atrip.org::*Prolog][Prolog:2]] +template + class SliceUnion { + public: using Tensor = CTF::Tensor; virtual void @@ -191,7 +194,11 @@ namespace atrip { : Slice::Fetch ; if (blank.info.state == Slice::SelfSufficient) { +#if defined(HAVE_CUDA) + blank.mpi_data = sources[from.source].data(); +#else blank.data = sources[from.source].data(); +#endif } else { if (freePointers.size() == 0) { std::stringstream stream; @@ -345,8 +352,7 @@ namespace atrip { } // CONSTRUCTOR - SliceUnion( Tensor const& sourceTensor - , std::vector::Type> sliceTypes_ + SliceUnion( std::vector::Type> sliceTypes_ , std::vector sliceLength_ , std::vector paramLength , size_t np @@ -366,12 +372,19 @@ namespace atrip { 1UL, std::multiplies()))) , name(name_) , sliceTypes(sliceTypes_) - , sliceBuffers(nSliceBuffers, sources[0]) + , sliceBuffers(nSliceBuffers) //, slices(2 * sliceTypes.size(), Slice{ sources[0].size() }) { // constructor begin LOG(0,"Atrip") << "INIT SliceUnion: " << name << "\n"; + for (auto& ptr: sliceBuffers) +#if defined(HAVE_CUDA) + cuMemAlloc(&ptr, sizeof(F) * sources[0].size()); +#else + ptr = (DataPtr)malloc(sizeof(F) * sources[0].size()); +#endif + slices = std::vector>(2 * sliceTypes.size(), { sources[0].size() }); // TODO: think exactly ^------------------- about this number @@ -379,7 +392,7 @@ namespace atrip { // initialize the freePointers with the pointers to the buffers std::transform(sliceBuffers.begin(), sliceBuffers.end(), std::inserter(freePointers, freePointers.begin()), - [](std::vector &vec) { return vec.data(); }); + [](DataPtr ptr) { return ptr; }); @@ -397,8 +410,6 @@ namespace atrip { << freePointers.size() << "\n"; LOG(1,"Atrip") << "#sliceBuffers " << sliceBuffers.size() << "\n"; - LOG(1,"Atrip") << "#sliceBuffers[0] " - << sliceBuffers[0].size() << "\n"; LOG(1,"Atrip") << "#sliceLength " << sliceLength.size() << "\n"; LOG(1,"Atrip") << "#paramLength " @@ -477,9 +488,12 @@ namespace atrip { if (slice.info.state == Slice::Fetch) { // TODO: do it through the slice class slice.info.state = Slice::Dispatched; - MPI_Request request; - slice.request = request; +#if defined(HAVE_CUDA) + slice.mpi_data = (F*)malloc(sizeof(F) * slice.size); + MPI_Irecv( slice.mpi_data +#else MPI_Irecv( slice.data +#endif , slice.size , traits::mpi::datatypeOf() , info.from.rank @@ -495,7 +509,7 @@ namespace atrip { for (auto type: sliceTypes) unwrapSlice(type, abc); } - F* unwrapSlice(typename Slice::Type type, ABCTuple const& abc) { + DataPtr unwrapSlice(typename Slice::Type type, ABCTuple const& abc) { WITH_CRAZY_DEBUG WITH_RANK << "__unwrap__:slice " << type << " w n " << name @@ -539,6 +553,15 @@ namespace atrip { return slice.data; } + ~SliceUnion() { + for (auto& ptr: sliceBuffers) +#if defined(HAVE_CUDA) + cuMemFree(ptr); +#else + std::free(ptr); +#endif + } + const RankMap rankMap; const MPI_Comm world; const MPI_Comm universe; @@ -547,8 +570,8 @@ namespace atrip { std::vector< Slice > slices; typename Slice::Name name; const std::vector::Type> sliceTypes; - std::vector< std::vector > sliceBuffers; - std::set freePointers; + std::vector< DataPtr > sliceBuffers; + std::set< DataPtr > freePointers; }; @@ -568,6 +591,8 @@ namespace atrip { } return **sliceUnionIt; } +// Prolog:2 ends here +// [[file:~/cuda/atrip/atrip.org::*Epilog][Epilog:1]] } -// The slice union:1 ends here +// Epilog:1 ends here diff --git a/include/atrip/Tuples.hpp b/include/atrip/Tuples.hpp index 26c9d2b..e03d33e 100644 --- a/include/atrip/Tuples.hpp +++ b/include/atrip/Tuples.hpp @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -// [[file:../../atrip.org::*Prolog][Prolog:1]] +// [[file:~/cuda/atrip/atrip.org::*Prolog][Prolog:1]] #pragma once #include @@ -35,7 +35,7 @@ namespace atrip { // Prolog:1 ends here -// [[file:../../atrip.org::*Tuples types][Tuples types:1]] +// [[file:~/cuda/atrip/atrip.org::*Tuples%20types][Tuples types:1]] using ABCTuple = std::array; using PartialTuple = std::array; using ABCTuples = std::vector; @@ -44,23 +44,22 @@ constexpr ABCTuple FAKE_TUPLE = {0, 0, 0}; constexpr ABCTuple INVALID_TUPLE = {1, 1, 1}; // Tuples types:1 ends here -// [[file:../../atrip.org::*Distributing the tuples][Distributing the tuples:1]] +// [[file:~/cuda/atrip/atrip.org::*Distributing%20the%20tuples][Distributing the tuples:1]] struct TuplesDistribution { virtual ABCTuples getTuples(size_t Nv, MPI_Comm universe) = 0; virtual bool tupleIsFake(ABCTuple const& t) { return t == FAKE_TUPLE; } }; // Distributing the tuples:1 ends here -// [[file:../../atrip.org::*Node information][Node information:1]] +// [[file:~/cuda/atrip/atrip.org::*Node%20information][Node information:1]] std::vector getNodeNames(MPI_Comm comm){ int rank, np; MPI_Comm_rank(comm, &rank); MPI_Comm_size(comm, &np); std::vector nodeList(np); - char nodeName[MPI_MAX_PROCESSOR_NAME] - , nodeNames[np*MPI_MAX_PROCESSOR_NAME] - ; + char nodeName[MPI_MAX_PROCESSOR_NAME]; + char *nodeNames = (char*)malloc(np * MPI_MAX_PROCESSOR_NAME); std::vector nameLengths(np) , off(np) ; @@ -87,11 +86,12 @@ std::vector getNodeNames(MPI_Comm comm){ std::string const s(&nodeNames[off[i]], nameLengths[i]); nodeList[i] = s; } + std::free(nodeNames); return nodeList; } // Node information:1 ends here -// [[file:../../atrip.org::*Node information][Node information:2]] +// [[file:~/cuda/atrip/atrip.org::*Node%20information][Node information:2]] struct RankInfo { const std::string name; const size_t nodeId; @@ -154,7 +154,7 @@ getClusterInfo(MPI_Comm comm) { } // Node information:2 ends here -// [[file:../../atrip.org::*Naive list][Naive list:1]] +// [[file:~/cuda/atrip/atrip.org::*Naive%20list][Naive list:1]] ABCTuples getTuplesList(size_t Nv, size_t rank, size_t np) { const size_t @@ -188,7 +188,7 @@ ABCTuples getTuplesList(size_t Nv, size_t rank, size_t np) { } // Naive list:1 ends here -// [[file:../../atrip.org::*Naive list][Naive list:2]] +// [[file:~/cuda/atrip/atrip.org::*Naive%20list][Naive list:2]] ABCTuples getAllTuplesList(const size_t Nv) { const size_t n = Nv * (Nv + 1) * (Nv + 2) / 6 - Nv; ABCTuples result(n); @@ -204,7 +204,7 @@ ABCTuples getAllTuplesList(const size_t Nv) { } // Naive list:2 ends here -// [[file:../../atrip.org::*Naive list][Naive list:3]] +// [[file:~/cuda/atrip/atrip.org::*Naive%20list][Naive list:3]] struct NaiveDistribution : public TuplesDistribution { ABCTuples getTuples(size_t Nv, MPI_Comm universe) override { int rank, np; @@ -215,11 +215,11 @@ struct NaiveDistribution : public TuplesDistribution { }; // Naive list:3 ends here -// [[file:../../atrip.org::*Prolog][Prolog:1]] +// [[file:~/cuda/atrip/atrip.org::*Prolog][Prolog:1]] namespace group_and_sort { // Prolog:1 ends here -// [[file:../../atrip.org::*Utils][Utils:1]] +// [[file:~/cuda/atrip/atrip.org::*Utils][Utils:1]] // Provides the node on which the slice-element is found // Right now we distribute the slices in a round robin fashion // over the different nodes (NOTE: not mpi ranks but nodes) @@ -244,7 +244,7 @@ struct Info { }; // Utils:1 ends here -// [[file:../../atrip.org::*Distribution][Distribution:1]] +// [[file:~/cuda/atrip/atrip.org::*Distribution][Distribution:1]] ABCTuples specialDistribution(Info const& info, ABCTuples const& allTuples) { ABCTuples nodeTuples; @@ -426,7 +426,7 @@ ABCTuples specialDistribution(Info const& info, ABCTuples const& allTuples) { } // Distribution:1 ends here -// [[file:../../atrip.org::*Main][Main:1]] +// [[file:~/cuda/atrip/atrip.org::*Main][Main:1]] std::vector main(MPI_Comm universe, size_t Nv) { int rank, np; @@ -466,7 +466,7 @@ std::vector main(MPI_Comm universe, size_t Nv) { MPI_Comm_split(universe, color, key, &INTRA_COMM); // Main:1 ends here -// [[file:../../atrip.org::*Main][Main:2]] +// [[file:~/cuda/atrip/atrip.org::*Main][Main:2]] size_t const tuplesPerRankLocal = nodeTuples.size() / nodeInfos[rank].ranksPerNode @@ -494,7 +494,7 @@ LOG(1,"Atrip") << "ranks per node " << nodeInfos[rank].ranksPerNode << "\n"; LOG(1,"Atrip") << "#nodes " << nNodes << "\n"; // Main:2 ends here -// [[file:../../atrip.org::*Main][Main:3]] +// [[file:~/cuda/atrip/atrip.org::*Main][Main:3]] size_t const totalTuples = tuplesPerRankGlobal * nodeInfos[rank].ranksPerNode; @@ -506,7 +506,7 @@ if (computeDistribution) { } // Main:3 ends here -// [[file:../../atrip.org::*Main][Main:4]] +// [[file:~/cuda/atrip/atrip.org::*Main][Main:4]] { // construct mpi type for abctuple MPI_Datatype MPI_ABCTUPLE; @@ -530,13 +530,13 @@ if (computeDistribution) { } // Main:4 ends here -// [[file:../../atrip.org::*Main][Main:5]] +// [[file:~/cuda/atrip/atrip.org::*Main][Main:5]] return result; } // Main:5 ends here -// [[file:../../atrip.org::*Interface][Interface:1]] +// [[file:~/cuda/atrip/atrip.org::*Interface][Interface:1]] struct Distribution : public TuplesDistribution { ABCTuples getTuples(size_t Nv, MPI_Comm universe) override { return main(universe, Nv); @@ -544,10 +544,10 @@ struct Distribution : public TuplesDistribution { }; // Interface:1 ends here -// [[file:../../atrip.org::*Epilog][Epilog:1]] +// [[file:~/cuda/atrip/atrip.org::*Epilog][Epilog:1]] } // namespace group_and_sort // Epilog:1 ends here -// [[file:../../atrip.org::*Epilog][Epilog:1]] +// [[file:~/cuda/atrip/atrip.org::*Epilog][Epilog:1]] } // Epilog:1 ends here diff --git a/include/atrip/Types.hpp b/include/atrip/Types.hpp new file mode 100644 index 0000000..6d60e9c --- /dev/null +++ b/include/atrip/Types.hpp @@ -0,0 +1,60 @@ +// Copyright 2022 Alejandro Gallo +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// [[file:~/cuda/atrip/atrip.org::*Data%20pointer][Data pointer:1]] +#pragma once +#include +#include + +namespace atrip { + +template +struct DataField; + +template <> +struct DataField { + using type = double; +}; + +#if defined(HAVE_CUDA) + +template +using DataPtr = CUdeviceptr; +#define DataNullPtr 0x00 + +template <> +struct DataField { + using type = cuDoubleComplex; +}; + + +#else + +template +using DataPtr = F*; +#define DataNullPtr nullptr + +template <> +struct DataField { + using type = Complex; +}; + +#endif + + +template +using DataFieldType = typename DataField::type; + +} +// Data pointer:1 ends here diff --git a/include/atrip/Unions.hpp b/include/atrip/Unions.hpp index 86791ff..a4214fc 100644 --- a/include/atrip/Unions.hpp +++ b/include/atrip/Unions.hpp @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -// [[file:../../atrip.org::*Unions][Unions:1]] +// [[file:~/cuda/atrip/atrip.org::*Unions][Unions:1]] #pragma once #include @@ -65,8 +65,7 @@ namespace atrip { , size_t np , MPI_Comm child_world , MPI_Comm global_world - ) : SliceUnion( sourceTensor - , {Slice::A, Slice::B, Slice::C} + ) : SliceUnion( {Slice::A, Slice::B, Slice::C} , {Nv, No, No} // size of the slices , {Nv} , np @@ -103,8 +102,7 @@ namespace atrip { , size_t np , MPI_Comm child_world , MPI_Comm global_world - ) : SliceUnion( sourceTensor - , {Slice::A, Slice::B, Slice::C} + ) : SliceUnion( {Slice::A, Slice::B, Slice::C} , {No, No, No} // size of the slices , {Nv} // size of the parametrization , np @@ -138,8 +136,7 @@ namespace atrip { , size_t np , MPI_Comm child_world , MPI_Comm global_world - ) : SliceUnion( sourceTensor - , { Slice::AB, Slice::BC, Slice::AC + ) : SliceUnion( { Slice::AB, Slice::BC, Slice::AC , Slice::BA, Slice::CB, Slice::CA } , {Nv, No} // size of the slices @@ -179,8 +176,7 @@ namespace atrip { , size_t np , MPI_Comm child_world , MPI_Comm global_world - ) : SliceUnion( sourceTensor - , {Slice::AB, Slice::BC, Slice::AC} + ) : SliceUnion( {Slice::AB, Slice::BC, Slice::AC} , {No, No} // size of the slices , {Nv, Nv} // size of the parametrization , np @@ -219,8 +215,7 @@ namespace atrip { , size_t np , MPI_Comm child_world , MPI_Comm global_world - ) : SliceUnion( sourceTensor - , {Slice::AB, Slice::BC, Slice::AC} + ) : SliceUnion( {Slice::AB, Slice::BC, Slice::AC} , {No, No} // size of the slices , {Nv, Nv} // size of the parametrization , np diff --git a/include/atrip/Utils.hpp b/include/atrip/Utils.hpp index 39c462f..184cf13 100644 --- a/include/atrip/Utils.hpp +++ b/include/atrip/Utils.hpp @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -// [[file:../../atrip.org::*Prolog][Prolog:1]] +// [[file:~/cuda/atrip/atrip.org::*Prolog][Prolog:1]] #pragma once #include #include @@ -29,11 +29,14 @@ #include + namespace atrip { // Prolog:1 ends here -// [[file:../../atrip.org::*Pretty printing][Pretty printing:1]] -template +// [[file:~/cuda/atrip/atrip.org::*Pretty%20printing][Pretty printing:1]] +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" + template std::string pretty_print(T&& value) { std::stringstream stream; #if ATRIP_DEBUG > 2 @@ -41,9 +44,10 @@ template #endif return stream.str(); } +#pragma GCC diagnostic pop // Pretty printing:1 ends here -// [[file:../../atrip.org::*Chrono][Chrono:1]] +// [[file:~/cuda/atrip/atrip.org::*Chrono][Chrono:1]] #define WITH_CHRONO(__chrono_name, ...) \ Atrip::chrono[__chrono_name].start(); \ __VA_ARGS__ \ @@ -62,6 +66,6 @@ struct Timer { using Timings = std::map; // Chrono:1 ends here -// [[file:../../atrip.org::*Epilog][Epilog:1]] +// [[file:~/cuda/atrip/atrip.org::*Epilog][Epilog:1]] } // Epilog:1 ends here diff --git a/src/Makefile.am b/src/Makefile.am index 223e0db..5e9c555 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -2,26 +2,25 @@ AUTOMAKE_OPTIONS = subdir-objects include $(top_srcdir)/atrip.mk AM_CXXFLAGS = $(CTF_CPPFLAGS) -fmax-errors=1 +AM_CPPFLAGS = $(CTF_CPPFLAGS) lib_LIBRARIES = libatrip.a libatrip_a_CPPFLAGS = -I$(top_srcdir)/include/ -libatrip_a_SOURCES = ./atrip/Atrip.cxx \ - ./atrip/Blas.cxx - +libatrip_a_SOURCES = ./atrip/Blas.cxx +NVCC_FILES = ./atrip/Equations.cxx ./atrip/Complex.cxx ./atrip/Atrip.cxx if WITH_CUDA -NVCC_FILES = ./atrip/Equations.cxx ./atrip/Complex.cxx NVCC_OBJS = $(patsubst %.cxx,%.nvcc.o,$(NVCC_FILES)) libatrip_a_CPPFLAGS += $(CUDA_CXXFLAGS) libatrip_a_DEPENDENCIES = $(NVCC_OBJS) libatrip_a_LIBADD = $(NVCC_OBJS) %.nvcc.o: %.cxx - $(NVCC) -c -I../ $(CPPFLAGS) $(libatrip_a_CPPFLAGS) $< -o $@ + $(NVCC) -c -x cu -I../ $(CPPFLAGS) $(CTF_CPPFLAGS) $(DEFS) $(libatrip_a_CPPFLAGS) $< -o $@ #./atrip/Equations.o: ./atrip/Equations.cxx # $(NVCC) -c -I../ $(CPPFLAGS) $(libatrip_a_CPPFLAGS) $< -o $@ else -libatrip_a_SOURCES += ./atrip/Equations.cxx +libatrip_a_SOURCES += $(NVCC_FILES) endif diff --git a/src/atrip/Atrip.cxx b/src/atrip/Atrip.cxx index 9808658..7bb5e06 100644 --- a/src/atrip/Atrip.cxx +++ b/src/atrip/Atrip.cxx @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -// [[file:../../atrip.org::*Main][Main:1]] +// [[file:~/cuda/atrip/atrip.org::*Main][Main:1]] #include #include @@ -23,12 +23,24 @@ #include using namespace atrip; +#if defined(HAVE_CUDA) + +namespace atrip { +namespace cuda { + +}; +}; + +#endif template bool RankMap::RANK_ROUND_ROBIN; template bool RankMap::RANK_ROUND_ROBIN; template bool RankMap::RANK_ROUND_ROBIN; -int Atrip::rank; -int Atrip::np; +size_t Atrip::rank; +size_t Atrip::np; +#if defined(HAVE_CUDA) +typename Atrip::CudaContext Atrip::cuda; +#endif MPI_Comm Atrip::communicator; Timings Atrip::chrono; @@ -40,15 +52,20 @@ void atrip::registerIterationDescriptor(IterationDescriptor d) { void Atrip::init(MPI_Comm world) { Atrip::communicator = world; - MPI_Comm_rank(world, &Atrip::rank); - MPI_Comm_size(world, &Atrip::np); + MPI_Comm_rank(world, (int*)&Atrip::rank); + MPI_Comm_size(world, (int*)&Atrip::np); + +#if defined(HAVE_CUDA) + Atrip::cuda.status = cublasCreate(&Atrip::cuda.handle); +#endif + } template Atrip::Output Atrip::run(Atrip::Input const& in) { - const int np = Atrip::np; - const int rank = Atrip::rank; + const size_t np = Atrip::np; + const size_t rank = Atrip::rank; MPI_Comm universe = Atrip::communicator; const size_t No = in.ei->lens[0]; @@ -58,18 +75,35 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { LOG(0,"Atrip") << "np: " << np << "\n"; // allocate the three scratches, see piecuch - std::vector Tijk(No*No*No) // doubles only (see piecuch) - , Zijk(No*No*No) // singles + doubles (see piecuch) - // we need local copies of the following tensors on every - // rank - , epsi(No) - , epsa(Nv) - , Tai(No * Nv) + // we need local copies of the following tensors on every + // rank + std::vector _epsi(No) + , _epsa(Nv) + , _Tai(No * Nv) ; - in.ei->read_all(epsi.data()); - in.ea->read_all(epsa.data()); - in.Tph->read_all(Tai.data()); + in.ei->read_all(_epsi.data()); + in.ea->read_all(_epsa.data()); + in.Tph->read_all(_Tai.data()); + +#if defined(HAVE_CUDA) + DataPtr Tai, epsi, epsa; + cuMemAlloc(&Tai, sizeof(F) * _Tai.size()); + cuMemAlloc(&epsi, sizeof(F) * _epsi.size()); + cuMemAlloc(&epsa, sizeof(F) * _epsa.size()); + + cuMemcpyHtoD(Tai, (void*)_Tai.data(), sizeof(F) * _Tai.size()); + cuMemcpyHtoD(epsi,(void*)_epsi.data(), sizeof(F) * _epsi.size()); + cuMemcpyHtoD(epsa, (void*)_epsa.data(), sizeof(F) * _epsa.size()); + + DataPtr Tijk, Zijk; + //TODO: free memory + cuMemAlloc(&Tijk, sizeof(F) * No * No * No); + cuMemAlloc(&Zijk, sizeof(F) * No * No * No); +#else + std::vector &Tai = _Tai, &epsi = _epsi, &epsa = _epsa; + std::vector Tijk(No*No*No), Zijk(No*No*No); +#endif RankMap::RANK_ROUND_ROBIN = in.rankRoundRobin; if (RankMap::RANK_ROUND_ROBIN) { @@ -139,7 +173,6 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { ) const size_t nIterations = tuplesList.size(); { - const size_t _all_tuples = Nv * (Nv + 1) * (Nv + 2) / 6 - Nv; LOG(0,"Atrip") << "#iterations: " << nIterations << "/" @@ -338,6 +371,9 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { } } + LOG(0, "AtripCUDA") << "Starting iterations\n"; + + for ( size_t i = first_iteration, iteration = first_iteration + 1 @@ -346,6 +382,8 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { ) { Atrip::chrono["iterations"].start(); + LOG(0, "AtripCUDA") << "iteration " << i << "\n"; + // check overhead from chrono over all iterations WITH_CHRONO("start:stop", {}) @@ -355,8 +393,10 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { if (in.barrier) MPI_Barrier(universe); )) + // write checkpoints - if (iteration % checkpoint_mod == 0) { + if (iteration % checkpoint_mod == 0 && false) { + LOG(0, "AtripCUDA") << "checkpoints \n"; double globalEnergy = 0; MPI_Reduce(&energy, &globalEnergy, 1, MPI_DOUBLE, MPI_SUM, 0, universe); Checkpoint out @@ -368,9 +408,10 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { iteration - 1, in.rankRoundRobin}; LOG(0, "Atrip") << "Writing checkpoint\n"; - if (Atrip::rank == 0) write_checkpoint(out, in.checkpointPath); + //if (Atrip::rank == 0) write_checkpoint(out, in.checkpointPath); } + LOG(0, "AtripCUDA") << "reporting \n"; // write reporting if (iteration % iterationMod == 0 || iteration == iteration1Percent) { @@ -417,9 +458,11 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { << "\n"; ) + LOG(0, "AtripCUDA") << "first database " << i << "\n"; // COMM FIRST DATABASE ================================================{{{1 if (i == first_iteration) { + LOG(0, "AtripCUDA") << "first database " << i << "\n"; WITH_RANK << "__first__:first database ............ \n"; const auto db = communicateDatabase(abc, universe); WITH_RANK << "__first__:first database communicated \n"; @@ -432,6 +475,8 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { MPI_Barrier(universe); } + LOG(0, "AtripCUDA") << "next database" << i << "\n"; + // COMM NEXT DATABASE ================================================={{{1 if (abcNext) { WITH_RANK << "__comm__:" << iteration << "th communicating database\n"; @@ -447,6 +492,9 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { // COMPUTE DOUBLES ===================================================={{{1 OCD_Barrier(universe); if (!isFakeTuple(i)) { + + LOG(0, "AtripCUDA") << "computing doubles " << i << "\n"; + WITH_RANK << iteration << "-th doubles\n"; WITH_CHRONO("oneshot-unwrap", WITH_CHRONO("unwrap", @@ -478,7 +526,11 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { , tabhh.unwrapSlice(Slice::AC, abc) , tabhh.unwrapSlice(Slice::BC, abc) // -- TIJK +#if defined(HAVE_CUDA) + , (DataFieldType*)Tijk +#else , Tijk.data() +#endif ); WITH_RANK << iteration << "-th doubles done\n"; )) @@ -493,16 +545,35 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { abhh.unwrapAll(abc); ))) WITH_CHRONO("reorder", - for (size_t I(0); I < Zijk.size(); I++) Zijk[I] = Tijk[I]; + LOG(0, "AtripCUDA") << "reorder singles" << i << "\n"; + atrip::xcopy(No*No*No, +#if defined(HAVE_CUDA) + (DataFieldType*)Tijk, 1, + (DataFieldType*)Zijk, 1); +#else + (DataFieldType*)Tijk.data(), 1, + (DataFieldType*)Zijk.data(), 1); +#endif ) WITH_CHRONO("singles", - singlesContribution( No, Nv, abc + LOG(0, "AtripCUDA") << "doing singles" << i << "\n"; +#if defined(HAVE_CUDA) + singlesContribution<<<1,1>>>( No, Nv, abc[0], abc[1], abc[2] + , (DataFieldType*)Tai +#else + singlesContribution( No, Nv, abc[0], abc[1], abc[2] , Tai.data() - , abhh.unwrapSlice(Slice::AB, abc) - , abhh.unwrapSlice(Slice::AC, abc) - , abhh.unwrapSlice(Slice::BC, abc) +#endif + , (DataFieldType*)abhh.unwrapSlice(Slice::AB, abc) + , (DataFieldType*)abhh.unwrapSlice(Slice::AC, abc) + , (DataFieldType*)abhh.unwrapSlice(Slice::BC, abc) +#if defined(HAVE_CUDA) + , (DataFieldType*)Zijk); +#else , Zijk.data()); +#endif ) + LOG(0, "AtripCUDA") << "singles done" << i << "\n"; } @@ -513,13 +584,17 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { int distinct(0); if (abc[0] == abc[1]) distinct++; if (abc[1] == abc[2]) distinct--; - const F epsabc(epsa[abc[0]] + epsa[abc[1]] + epsa[abc[2]]); + const F epsabc(_epsa[abc[0]] + _epsa[abc[1]] + _epsa[abc[2]]); + LOG(0, "AtripCUDA") << "doing energy " << i << "distinct " << distinct << "\n"; WITH_CHRONO("energy", +/* + TODO: think about how to do this on the GPU in the best way possible if ( distinct == 0) - tupleEnergy = getEnergyDistinct(epsabc, epsi, Tijk, Zijk); + tupleEnergy = getEnergyDistinct(epsabc, No, (F*)epsi, (F*)Tijk, (F*)Zijk); else - tupleEnergy = getEnergySame(epsabc, epsi, Tijk, Zijk); + tupleEnergy = getEnergySame(epsabc, No, (F*)epsi, (F*)Tijk, (F*)Zijk); +*/ ) #if defined(HAVE_OCD) || defined(ATRIP_PRINT_TUPLES) diff --git a/src/atrip/Blas.cxx b/src/atrip/Blas.cxx new file mode 100644 index 0000000..d6ded3e --- /dev/null +++ b/src/atrip/Blas.cxx @@ -0,0 +1,141 @@ +// Copyright 2022 Alejandro Gallo +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// [[file:~/cuda/atrip/atrip.org::*Blas][Blas:2]] +#include +#include + +#if defined(HAVE_CUDA) +# include + + static inline + cublasOperation_t char_to_cublasOperation(const char* trans) { + if (strncmp("N", trans, 1) == 0) + return CUBLAS_OP_N; + else if (strncmp("T", trans, 1) == 0) + return CUBLAS_OP_T; + else + return CUBLAS_OP_C; + } + +#endif + +namespace atrip { + + + template <> + void xgemm(const char *transa, + const char *transb, + const int *m, + const int *n, + const int *k, + double *alpha, + const typename DataField::type *A, + const int *lda, + const typename DataField::type *B, + const int *ldb, + double *beta, + typename DataField::type *C, + const int *ldc) { +#if defined(HAVE_CUDA) + cublasDgemm(Atrip::cuda.handle, + char_to_cublasOperation(transa), + char_to_cublasOperation(transb), + *m, *n, *k, + alpha, A, *lda, + B, *ldb, beta, + C, *ldc); +#else + dgemm_(transa, transb, + m, n, k, + alpha, A, lda, + B, ldb, beta, + C, ldc); +#endif + } + + template <> + void xgemm(const char *transa, + const char *transb, + const int *m, + const int *n, + const int *k, + Complex *alpha, + const typename DataField::type *A, + const int *lda, + const typename DataField::type *B, + const int *ldb, + Complex *beta, + typename DataField::type *C, + const int *ldc) { +#if defined(HAVE_CUDA) +#pragma warning HAVE_CUDA + cuDoubleComplex + cu_alpha = {std::real(*alpha), std::imag(*alpha)}, + cu_beta = {std::real(*beta), std::imag(*beta)}; + cublasZgemm(Atrip::cuda.handle, + char_to_cublasOperation(transa), + char_to_cublasOperation(transb), + *m, *n, *k, + &cu_alpha, + + A, *lda, + B, *ldb, + &cu_beta, + C, *ldc); +#else + zgemm_(transa, transb, + m, n, k, + alpha, A, lda, + B, ldb, beta, + C, ldc); +#endif + } + + + template <> + void xcopy(const int n, + const DataFieldType* x, + const int incx, + DataFieldType* y, + const int incy) { +#if defined(HAVE_CUDA) + cublasDcopy(Atrip::cuda.handle, + n, + x, incx, + y, incy); +#else + dcopy_(n, x, incx, y, incy); +#endif + } + + template <> + void xcopy(const int n, + const DataFieldType* x, + const int incx, + DataFieldType* y, + const int incy) { +#if defined(HAVE_CUDA) + cublasZcopy(Atrip::cuda.handle, + n, + x, incx, + y, incy); +#else + zcopy_(n, x, incx, y, incy); +#endif + } + + +} +// Blas:2 ends here diff --git a/src/atrip/Complex.cxx b/src/atrip/Complex.cxx new file mode 100644 index 0000000..96e0406 --- /dev/null +++ b/src/atrip/Complex.cxx @@ -0,0 +1,40 @@ +// Copyright 2022 Alejandro Gallo +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// [[file:~/cuda/atrip/atrip.org::*Complex%20numbers][Complex numbers:2]] +#include +#include + +namespace atrip { + + template <> double maybeConjugate(const double a) { return a; } + template <> Complex maybeConjugate(const Complex a) { return std::conj(a); } + +#if defined(HAVE_CUDA) + +#endif + + + namespace traits { + template bool isComplex() { return false; } + template <> bool isComplex() { return false; } + template <> bool isComplex() { return true; } + namespace mpi { + template <> MPI_Datatype datatypeOf() { return MPI_DOUBLE; } + template <> MPI_Datatype datatypeOf() { return MPI_DOUBLE_COMPLEX; } + } + } + +} +// Complex numbers:2 ends here diff --git a/src/atrip/Equations.cxx b/src/atrip/Equations.cxx new file mode 100644 index 0000000..8ef7e7b --- /dev/null +++ b/src/atrip/Equations.cxx @@ -0,0 +1,720 @@ +// Copyright 2022 Alejandro Gallo +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// [[file:~/cuda/atrip/atrip.org::*Prolog][Prolog:2]] +#include + +#if defined(HAVE_CUDA) +#include +#endif + +namespace atrip { +// Prolog:2 ends here + +#ifdef HAVE_CUDA +namespace cuda { + + // cuda kernels + + template + __global__ + void zeroing(F* a, size_t n) { + F zero = {0}; + for (size_t i = 0; i < n; i++) { + a[i] = zero; + } + } + + //// + template + __device__ + F maybeConjugateScalar(const F a); + + template <> + __device__ + double maybeConjugateScalar(const double a) { return a; } + + template <> + __device__ + cuDoubleComplex + maybeConjugateScalar(const cuDoubleComplex a) { + return {a.x, -a.y}; + } + + template + __global__ + void maybeConjugate(F* to, F* from, size_t n) { + for (size_t i = 0; i < n; ++i) { + to[i] = maybeConjugateScalar(from[i]); + } + } + + + template + __global__ + void reorder(F* to, F* from, size_t size, size_t I, size_t J, size_t K) { + size_t idx = 0; + const size_t IDX = I + J*size + K*size*size; + for (size_t k = 0; k < size; k++) + for (size_t j = 0; j < size; j++) + for (size_t i = 0; i < size; i++, idx++) + to[idx] += from[IDX]; + } + + // I mean, really CUDA... really!? + template + __device__ + F multiply(const F &a, const F &b); + template <> + __device__ + double multiply(const double &a, const double &b) { return a * b; } + + template <> + __device__ + cuDoubleComplex multiply(const cuDoubleComplex &a, const cuDoubleComplex &b) { + return + {a.x * b.x - a.y * b.y, + a.x * b.y + a.y * b.x}; + } + + template + __device__ + void sum_in_place(F* to, const F* b); + + template <> + __device__ + void sum_in_place(double* to, const double *b) { *to += *b; } + + template <> + __device__ + void sum_in_place(cuDoubleComplex* to, const cuDoubleComplex* b) { + to->x += b->x; + to->y += b->y; + } + + __device__ + cuDoubleComplex& operator+=(cuDoubleComplex& lz, cuDoubleComplex const& rz) { + lz.x += rz.x; + lz.y += rz.y; + return lz; + } + + + +}; +#endif + +// [[file:~/cuda/atrip/atrip.org::*Energy][Energy:2]] +template +double getEnergyDistinct + ( F const epsabc + , size_t const No + , F* const epsi + , F* const Tijk + , F* const Zijk + ) { + constexpr size_t blockSize=16; + F energy(0.); + for (size_t kk=0; kk k ? jj : k; + for (size_t j(jstart); j < jend; j++){ + F const ej(epsi[j]); + F const facjk = j == k ? F(0.5) : F(1.0); + size_t istart = ii > j ? ii : j; + for (size_t i(istart); i < iend; i++){ + const F + ei(epsi[i]) + , facij = i == j ? F(0.5) : F(1.0) + , denominator(epsabc - ei - ej - ek) + , U(Zijk[i + No*j + No*No*k]) + , V(Zijk[i + No*k + No*No*j]) + , W(Zijk[j + No*i + No*No*k]) + , X(Zijk[j + No*k + No*No*i]) + , Y(Zijk[k + No*i + No*No*j]) + , Z(Zijk[k + No*j + No*No*i]) + , A(maybeConjugate(Tijk[i + No*j + No*No*k])) + , B(maybeConjugate(Tijk[i + No*k + No*No*j])) + , C(maybeConjugate(Tijk[j + No*i + No*No*k])) + , D(maybeConjugate(Tijk[j + No*k + No*No*i])) + , E(maybeConjugate(Tijk[k + No*i + No*No*j])) + , _F(maybeConjugate(Tijk[k + No*j + No*No*i])) + , value + = 3.0 * ( A * U + + B * V + + C * W + + D * X + + E * Y + + _F * Z ) + + ( ( U + X + Y ) + - 2.0 * ( V + W + Z ) + ) * ( A + D + E ) + + ( ( V + W + Z ) + - 2.0 * ( U + X + Y ) + ) * ( B + C + _F ) + ; + energy += 2.0 * value / denominator * facjk * facij; + } // i + } // j + } // k + } // ii + } // jj + } // kk + return std::real(energy); +} + + +template +double getEnergySame + ( F const epsabc + , size_t const No + , F* const epsi + , F* const Tijk + , F* const Zijk + ) { + constexpr size_t blockSize = 16; + F energy = F(0.); + for (size_t kk=0; kk k ? jj : k; + for(size_t j(jstart); j < jend; j++){ + const F facjk( j == k ? F(0.5) : F(1.0)); + const F ej(epsi[j]); + const size_t istart = ii > j ? ii : j; + for(size_t i(istart); i < iend; i++){ + const F + ei(epsi[i]) + , facij ( i==j ? F(0.5) : F(1.0)) + , denominator(epsabc - ei - ej - ek) + , U(Zijk[i + No*j + No*No*k]) + , V(Zijk[j + No*k + No*No*i]) + , W(Zijk[k + No*i + No*No*j]) + , A(maybeConjugate(Tijk[i + No*j + No*No*k])) + , B(maybeConjugate(Tijk[j + No*k + No*No*i])) + , C(maybeConjugate(Tijk[k + No*i + No*No*j])) + , value + = F(3.0) * ( A * U + + B * V + + C * W + ) + - ( A + B + C ) * ( U + V + W ) + ; + energy += F(2.0) * value / denominator * facjk * facij; + } // i + } // j + } // k + } // ii + } // jj + } // kk + return std::real(energy); +} +// Energy:2 ends here + +// [[file:~/cuda/atrip/atrip.org::*Energy][Energy:3]] +// instantiate double +template +double getEnergyDistinct + ( double const epsabc + , size_t const No + , double* const epsi + , double* const Tijk + , double* const Zijk + ); + +template +double getEnergySame + ( double const epsabc + , size_t const No + , double* const epsi + , double* const Tijk + , double* const Zijk + ); + +// instantiate Complex +template +double getEnergyDistinct + ( Complex const epsabc + , size_t const No + , Complex* const epsi + , Complex* const Tijk + , Complex* const Zijk + ); + +template +double getEnergySame + ( Complex const epsabc + , size_t const No + , Complex* const epsi + , Complex* const Tijk + , Complex* const Zijk + ); +// Energy:3 ends here + +// [[file:~/cuda/atrip/atrip.org::*Singles%20contribution][Singles contribution:2]] +template +#ifdef HAVE_CUDA +__global__ +#endif + void singlesContribution + ( size_t No + , size_t Nv + , size_t a + , size_t b + , size_t c + , DataFieldType* const Tph + , DataFieldType* const VABij + , DataFieldType* const VACij + , DataFieldType* const VBCij + , DataFieldType* Zijk + ) { + const size_t NoNo = No*No; + // TODO: change order of for loops + for (size_t k = 0; k < No; k++) + for (size_t i = 0; i < No; i++) + for (size_t j = 0; j < No; j++) { + const size_t ijk = i + j*No + k*No*No; + +#ifdef HAVE_CUDA +# define GO(__TPH, __VABIJ) \ + { \ + const DataFieldType product \ + = cuda::multiply>((__TPH), (__VABIJ)); \ + cuda::sum_in_place>(&Zijk[ijk], &product); \ + } +#else +# define GO(__TPH, __VABIJ) Zijk[ijk] += (__TPH) * (__VABIJ); +#endif + GO(Tph[ a + i * Nv ], VBCij[ j + k * No ]) + GO(Tph[ b + j * Nv ], VACij[ i + k * No ]) + GO(Tph[ c + k * Nv ], VABij[ i + j * No ]) +#undef GO + } // for loop j + } + + +// instantiate + template +#ifdef HAVE_CUDA + __global__ +#endif + void singlesContribution( size_t No + , size_t Nv + , size_t a + , size_t b + , size_t c + , double* const Tph + , double* const VABij + , double* const VACij + , double* const VBCij + , double* Zijk + ); + + template +#ifdef HAVE_CUDA + __global__ +#endif + void singlesContribution( size_t No + , size_t Nv + , size_t a + , size_t b + , size_t c + , DataFieldType* const Tph + , DataFieldType* const VABij + , DataFieldType* const VACij + , DataFieldType* const VBCij + , DataFieldType* Zijk + ); +// Singles contribution:2 ends here + +// [[file:~/cuda/atrip/atrip.org::*Doubles%20contribution][Doubles contribution:2]] + template + void doublesContribution + ( const ABCTuple &abc + , size_t const No + , size_t const Nv + // -- VABCI + , DataPtr const VABph + , DataPtr const VACph + , DataPtr const VBCph + , DataPtr const VBAph + , DataPtr const VCAph + , DataPtr const VCBph + // -- VHHHA + , DataPtr const VhhhA + , DataPtr const VhhhB + , DataPtr const VhhhC + // -- TA + , DataPtr const TAphh + , DataPtr const TBphh + , DataPtr const TCphh + // -- TABIJ + , DataPtr const TABhh + , DataPtr const TAChh + , DataPtr const TBChh + // -- TIJK + // , DataPtr Tijk_ + , DataFieldType* Tijk_ + ) { + + const size_t a = abc[0], b = abc[1], c = abc[2] + , NoNo = No*No, NoNv = No*Nv + ; + + typename DataField::type* Tijk = (typename DataField::type*) Tijk_; + LOG(0, "AtripCUDA") << "in doubles " << "\n"; + +#if defined(ATRIP_USE_DGEMM) +#define _IJK_(i, j, k) i + j*No + k*NoNo +#if defined(HAVE_CUDA) +// TODO +#define REORDER(__II, __JJ, __KK) +#define __TO_DEVICEPTR(_v) (_v) +#define DGEMM_PARTICLES(__A, __B) \ + atrip::xgemm("T", \ + "N", \ + (int const*)&NoNo, \ + (int const*)&No, \ + (int const*)&Nv, \ + &one, \ + (DataFieldType*)__A, \ + (int const*)&Nv, \ + (DataFieldType*)__B, \ + (int const*)&Nv, \ + &zero, \ + _t_buffer_p, \ + (int const*)&NoNo); +#define DGEMM_HOLES(__A, __B, __TRANSB) \ + atrip::xgemm("N", \ + __TRANSB, \ + (int const*)&NoNo, \ + (int const*)&No, \ + (int const*)&No, \ + &m_one, \ + __TO_DEVICEPTR(__A), \ + (int const*)&NoNo, \ + (DataFieldType*)__B, \ + (int const*)&No, \ + &zero, \ + _t_buffer_p, \ + (int const*)&NoNo \ + ); +#define MAYBE_CONJ(_conj, _buffer) \ + cuda::maybeConjugate<<<1,1>>>((DataFieldType*)_conj, (DataFieldType*)_buffer, NoNoNo); +#else +#define REORDER(__II, __JJ, __KK) \ + WITH_CHRONO("doubles:reorder", \ + for (size_t k = 0; k < No; k++) \ + for (size_t j = 0; j < No; j++) \ + for (size_t i = 0; i < No; i++) { \ + Tijk[_IJK_(i, j, k)] += _t_buffer_p[_IJK_(__II, __JJ, __KK)]; \ + } \ + ) +#define __TO_DEVICEPTR(_v) (_v) +#define DGEMM_PARTICLES(__A, __B) \ + atrip::xgemm("T", \ + "N", \ + (int const*)&NoNo, \ + (int const*)&No, \ + (int const*)&Nv, \ + &one, \ + __A, \ + (int const*)&Nv, \ + __B, \ + (int const*)&Nv, \ + &zero, \ + _t_buffer_p, \ + (int const*)&NoNo \ + ); +#define DGEMM_HOLES(__A, __B, __TRANSB) \ + atrip::xgemm("N", \ + __TRANSB, \ + (int const*)&NoNo, \ + (int const*)&No, \ + (int const*)&No, \ + &m_one, \ + __A, \ + (int const*)&NoNo, \ + __B, \ + (int const*)&No, \ + &zero, \ + _t_buffer_p, \ + (int const*)&NoNo \ + ); +#define MAYBE_CONJ(_conj, _buffer) \ + for (size_t __i = 0; __i < NoNoNo; ++__i) \ + _conj[__i] = maybeConjugate(_buffer[__i]); +#endif + + F one{1.0}, m_one{-1.0}, zero{0.0}; + DataFieldType zero_h{0.0}; + const size_t NoNoNo = No*NoNo; +#ifdef HAVE_CUDA + DataFieldType* _t_buffer; + DataFieldType* _vhhh; + LOG(0, "AtripCUDA") << "getting memory" << "\n"; + cuMemAlloc((CUdeviceptr*)&_t_buffer, NoNoNo * sizeof(DataFieldType)); + cuMemAlloc((CUdeviceptr*)&_vhhh, NoNoNo * sizeof(DataFieldType)); + LOG(0, "AtripCUDA") << "cuda::zeroing " << "\n"; + cuda::zeroing<<<1,1>>>((DataFieldType*)_t_buffer, NoNoNo); + cuda::zeroing<<<1,1>>>((DataFieldType*)_vhhh, NoNoNo); +#else + F* _t_buffer = (F*)malloc(NoNoNo * sizeof(F)); + F* _vhhh = (F*)malloc(NoNoNo * sizeof(F)); + for (size_t i=0; i < NoNoNo; i++) { + _t_buffer[i] = zero_h; + _vhhh[i] = zero_h; + } +#endif + //_t_buffer.reserve(NoNoNo); + DataFieldType* _t_buffer_p = __TO_DEVICEPTR(_t_buffer); + +#ifdef HAVE_CUDA + LOG(0, "AtripCUDA") << "cuda::zeroing Tijk" << "\n"; + cuda::zeroing<<<1,1>>>((DataFieldType*)Tijk, NoNoNo); +#else + WITH_CHRONO("double:reorder", + for (size_t k = 0; k < NoNoNo; k++) { + Tijk[k] = DataFieldType{0.0}; + }) +#endif + + LOG(0, "AtripCUDA") << "doing holes" << "\n"; + // TOMERGE: replace chronos + WITH_CHRONO("doubles:holes", + { // Holes part %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + + + // VhhhC[i + k*No + L*NoNo] * TABhh[L + j*No]; H1 + LOG(0, "AtripCUDA") << "conj 1" << "\n"; + MAYBE_CONJ(_vhhh, VhhhC) + LOG(0, "AtripCUDA") << "done" << "\n"; + WITH_CHRONO("doubles:holes:1", + LOG(0, "AtripCUDA") << "dgemm 1" << "\n"; + DGEMM_HOLES(_vhhh, TABhh, "N") + LOG(0, "AtripCUDA") << "reorder 1" << "\n"; + REORDER(i, k, j) + ) + // VhhhC[j + k*No + L*NoNo] * TABhh[i + L*No]; H0 + WITH_CHRONO("doubles:holes:2", + LOG(0, "AtripCUDA") << "dgemm 2" << "\n"; + DGEMM_HOLES(_vhhh, TABhh, "T") + REORDER(j, k, i) + ) + + // VhhhB[i + j*No + L*NoNo] * TAChh[L + k*No]; H5 + LOG(0, "AtripCUDA") << "conj 2" << "\n"; + MAYBE_CONJ(_vhhh, VhhhB) + LOG(0, "AtripCUDA") << "done" << "\n"; + WITH_CHRONO("doubles:holes:3", + DGEMM_HOLES(_vhhh, TAChh, "N") + REORDER(i, j, k) + ) + // VhhhB[k + j*No + L*NoNo] * TAChh[i + L*No]; H3 + WITH_CHRONO("doubles:holes:4", + DGEMM_HOLES(_vhhh, TAChh, "T") + REORDER(k, j, i) + ) + + // VhhhA[j + i*No + L*NoNo] * TBChh[L + k*No]; H1 + LOG(0, "AtripCUDA") << "conj 3" << "\n"; + MAYBE_CONJ(_vhhh, VhhhA) + WITH_CHRONO("doubles:holes:5", + DGEMM_HOLES(_vhhh, TBChh, "N") + REORDER(j, i, k) + ) + // VhhhA[k + i*No + L*NoNo] * TBChh[j + L*No]; H4 + WITH_CHRONO("doubles:holes:6", + DGEMM_HOLES(_vhhh, TBChh, "T") + REORDER(k, i, j) + ) + + } + ) + #undef MAYBE_CONJ + + LOG(0, "AtripCUDA") << "doing particles" << "\n"; + WITH_CHRONO("doubles:particles", + { // Particle part %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + // TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv]; P0 + WITH_CHRONO("doubles:particles:1", + DGEMM_PARTICLES(TAphh, VBCph) + REORDER(i, j, k) + ) + // TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv]; P3 + WITH_CHRONO("doubles:particles:2", + DGEMM_PARTICLES(TAphh, VCBph) + REORDER(i, k, j) + ) + // TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv]; P5 + WITH_CHRONO("doubles:particles:3", + DGEMM_PARTICLES(TCphh, VABph) + REORDER(k, i, j) + ) + // TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv]; P2 + WITH_CHRONO("doubles:particles:4", + DGEMM_PARTICLES(TCphh, VBAph) + REORDER(k, j, i) + ) + // TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv]; P1 + WITH_CHRONO("doubles:particles:5", + DGEMM_PARTICLES(TBphh, VACph) + REORDER(j, i, k) + ) + // TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv]; P4 + WITH_CHRONO("doubles:particles:6", + DGEMM_PARTICLES(TBphh, VCAph) + REORDER(j, k, i) + ) + } + ) + LOG(0, "AtripCUDA") << "particles done" << "\n"; + + { // free resources +#ifdef HAVE_CUDA + LOG(0, "AtripCUDA") << "free mem" << "\n"; + cuMemFree((CUdeviceptr)_vhhh); + cuMemFree((CUdeviceptr)_t_buffer); + LOG(0, "AtripCUDA") << "free mem done" << "\n"; +#else + free(_vhhh); + free(_t_buffer); +#endif + } + + #undef REORDER + #undef DGEMM_HOLES + #undef DGEMM_PARTICLES + #undef _IJK_ + #else + for (size_t k = 0; k < No; k++) + for (size_t j = 0; j < No; j++) + for (size_t i = 0; i < No; i++){ + const size_t ijk = i + j*No + k*NoNo + , jk = j + k*No + ; + Tijk[ijk] = 0.0; // :important + // HOLE DIAGRAMS: TABHH and VHHHA + for (size_t L = 0; L < No; L++){ + // t[abLj] * V[Lcik] H1 + // t[baLi] * V[Lcjk] H0 TODO: conjugate T for complex + Tijk[ijk] -= TABhh[L + j*No] * VhhhC[i + k*No + L*NoNo]; + Tijk[ijk] -= TABhh[i + L*No] * VhhhC[j + k*No + L*NoNo]; + + // t[acLk] * V[Lbij] H5 + // t[caLi] * V[Lbkj] H3 + Tijk[ijk] -= TAChh[L + k*No] * VhhhB[i + j*No + L*NoNo]; + Tijk[ijk] -= TAChh[i + L*No] * VhhhB[k + j*No + L*NoNo]; + + // t[bcLk] * V[Laji] H2 + // t[cbLj] * V[Laki] H4 + Tijk[ijk] -= TBChh[L + k*No] * VhhhA[j + i*No + L*NoNo]; + Tijk[ijk] -= TBChh[j + L*No] * VhhhA[k + i*No + L*NoNo]; + } + // PARTILCE DIAGRAMS: TAPHH and VABPH + for (size_t E = 0; E < Nv; E++) { + // t[aEij] * V[bcEk] P0 + // t[aEik] * V[cbEj] P3 // TODO: CHECK THIS ONE, I DONT KNOW + Tijk[ijk] += TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv]; + Tijk[ijk] += TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv]; + + // t[cEki] * V[abEj] P5 + // t[cEkj] * V[baEi] P2 + Tijk[ijk] += TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv]; + Tijk[ijk] += TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv]; + + // t[bEji] * V[acEk] P1 + // t[bEjk] * V[caEi] P4 + Tijk[ijk] += TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv]; + Tijk[ijk] += TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv]; + } + + } +#endif + } + + + + // instantiate templates + template + void doublesContribution + ( const ABCTuple &abc + , size_t const No + , size_t const Nv + // -- VABCI + , DataPtr const VABph + , DataPtr const VACph + , DataPtr const VBCph + , DataPtr const VBAph + , DataPtr const VCAph + , DataPtr const VCBph + // -- VHHHA + , DataPtr const VhhhA + , DataPtr const VhhhB + , DataPtr const VhhhC + // -- TA + , DataPtr const TAphh + , DataPtr const TBphh + , DataPtr const TCphh + // -- TABIJ + , DataPtr const TABhh + , DataPtr const TAChh + , DataPtr const TBChh + // -- TIJK + , DataFieldType* Tijk + ); + + template + void doublesContribution + ( const ABCTuple &abc + , size_t const No + , size_t const Nv + // -- VABCI + , DataPtr const VABph + , DataPtr const VACph + , DataPtr const VBCph + , DataPtr const VBAph + , DataPtr const VCAph + , DataPtr const VCBph + // -- VHHHA + , DataPtr const VhhhA + , DataPtr const VhhhB + , DataPtr const VhhhC + // -- TA + , DataPtr const TAphh + , DataPtr const TBphh + , DataPtr const TCphh + // -- TABIJ + , DataPtr const TABhh + , DataPtr const TAChh + , DataPtr const TBChh + // -- TIJK + , DataFieldType* Tijk + ); +// Doubles contribution:2 ends here + +// [[file:~/cuda/atrip/atrip.org::*Epilog][Epilog:2]] +} +// Epilog:2 ends here