From 0f60ffdf0fdc47b44fb05c693c8fdbf9a7969525 Mon Sep 17 00:00:00 2001
From: Alejandro Gallo
Date: Tue, 5 Jul 2022 12:39:46 +0200
Subject: [PATCH] Initial changes for naive cuda implementation

---
 atrip.org | 832 ++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 677 insertions(+), 155 deletions(-)

diff --git a/atrip.org b/atrip.org
index b0025f8..49d598b 100644
--- a/atrip.org
+++ b/atrip.org
@@ -59,22 +59,9 @@ The following section introduces the idea of a slice.
 #include
 #include
-#include

 namespace atrip {

-template <typename F> F maybeConjugate(const F a) { return a; }
-template <> Complex maybeConjugate(const Complex a) { return std::conj(a); }
-
-namespace traits {
-  template <typename F> bool isComplex() { return false; }
-  template <> bool isComplex<Complex>() { return true; }
-namespace mpi {
-  template <typename F> MPI_Datatype datatypeOf(void);
-  template <> MPI_Datatype datatypeOf<double>() { return MPI_DOUBLE; }
-  template <> MPI_Datatype datatypeOf<Complex>() { return MPI_DOUBLE_COMPLEX; }
-}
-}

 template <typename F>
@@ -217,6 +204,31 @@ Again the implementation is a simple enum type.
   };
 #+end_src

+** Data pointer
+
+The data pointer type is an abstraction over where the data of a slice
+live: on a CPU build it is simply =F*=, while on a GPU build it is a
+handle to an address in device memory.
+
+#+begin_src c++ :tangle (atrip-types-h)
+#pragma once
+
+template <typename F>
+#if defined(HAVE_CUDA)
+using DataPtr = CUdeviceptr;
+#define DataNullPtr 0x00
+#define _AT_(_array, _idx) ((F*)(_array))[(_idx)]
+#else
+using DataPtr = F*;
+#define DataNullPtr nullptr
+#define _AT_(_array, _idx) (_array)[(_idx)]
+#endif
+#+end_src
+
+
+
 ** The Info structure

 Every slice has an information structure associated with it
@@ -580,7 +592,10 @@ A pointer to data is also necessary for the =Slice= but not necessary to be
 communicated to other ranks. The =Slice= itself should never allocate
 or deallocate the pointer.
 #+begin_src c++ :tangle (atrip-slice-h)
-    F *data;
+    DataPtr<F> data;
+#if defined(HAVE_CUDA)
+    F* mpi_data;
+#endif
 #+end_src

 An =MPI_Request= handle is also included so that the slices that are
@@ -647,7 +662,7 @@ is done.
     info.state = Acceptor;
     info.from = {0, 0};
     info.recycling = Blank;
-    data = nullptr;
+    data = DataNullPtr;
   }

@@ -657,7 +672,7 @@ inline bool isFree() const noexcept {
     && info.from.rank == 0
     && info.from.source == 0
     && info.recycling == Blank
-    && data == nullptr
+    && data == DataNullPtr
     ;
   }

@@ -695,7 +710,7 @@ or =Blank=, if this is the case then it is a bug.
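+Since =data= is now a =DataPtr<F>= rather than a plain pointer, here is
+a short hedged sketch (illustrative only, not part of the tangled
+sources) of how the =_AT_= accessor from the Data pointer section is
+meant to be used:
+
+#+begin_src c++
+// Illustrative sketch: the same indexing expression works for F* on
+// CPU builds and for a CUdeviceptr handle on CUDA builds.  On CUDA,
+// dereferencing from host code like this is only valid if the memory
+// is host-accessible (e.g. allocated as managed memory).
+template <typename F>
+F sumBuffer(DataPtr<F> buffer, size_t n) {
+  F result = F(0);
+  for (size_t i = 0; i < n; i++)
+    result += _AT_(buffer, i); // expands to buffer[i] or ((F*)buffer)[i]
+  return result;
+}
+#+end_src
+
+The validity check below guards exactly this pointer: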
 #+begin_src c++ :tangle (atrip-slice-h)
   inline bool hasValidDataPointer() const noexcept {
-    return data != nullptr
+    return data != DataNullPtr
         && info.state != Acceptor
         && info.type != Blank
         ;
   }

@@ -726,9 +741,17 @@ The main behaviour of the function should
     WITH_RANK << "__slice__:mpi: waiting " << "\n";
 #endif
     const int errorCode = MPI_Wait(&request, &status);
+    if (MPI_SUCCESS != MPI_Request_free(&request))
+      throw "Error freeing MPI request";
     if (errorCode != MPI_SUCCESS)
       throw "MPI ERROR HAPPENED....";

+#if defined(HAVE_CUDA)
+    // copy the data received through MPI into the host staging buffer
+    // over to the device, then release the staging buffer
+    cuMemcpy(data, (DataPtr<F>)mpi_data, sizeof(F) * size);
+    std::free(mpi_data);
+#endif
+
 #ifdef HAVE_OCD
     char errorString[MPI_MAX_ERROR_STRING];
     int errorSize;
@@ -750,7 +773,10 @@ The main behaviour of the function should
 #+begin_src c++ :tangle (atrip-slice-h)
     Slice(size_t size_)
       : info({})
-      , data(nullptr)
+      , data(DataNullPtr)
+#if defined(HAVE_CUDA)
+      , mpi_data(nullptr)
+#endif
       , size(size_)
       {}

@@ -803,6 +829,7 @@ This section presents some utilities
 #include
+
 namespace atrip {
 #+end_src

The pretty printing uses the [[https://github.com/sharkdp/dbg-macro][dbg-macro]] package.

 #+begin_src c++ :tangle (atrip-utils-h)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
 template <typename T>
 std::string pretty_print(T&& value) {
   std::stringstream stream;
 #if ATRIP_DEBUG > 2
   dbg::pretty_print(stream, std::forward<T>(value));
 #endif
   return stream.str();
 }
+#pragma GCC diagnostic pop
 #+end_src

@@ -912,7 +942,8 @@ namespace atrip {
       ;
   }

-  bool isSourcePadding(size_t rank, size_t source) const noexcept {
+  bool isSourcePadding(const size_t rank, const size_t source)
+    const noexcept {
     return source == nSources() && isPaddingRank(rank);
   }

@@ -981,6 +1012,8 @@ namespace atrip {
 #+end_src

 * The slice union
+
+** Prolog :noexport:
 #+begin_src c++ :tangle (atrip-slice-union-h)
 #pragma once
 #include
 #include
 #include

 namespace atrip {
+#+end_src

+#+begin_src c++ :tangle (atrip-slice-union-h)
 template <typename F>
-  struct SliceUnion {
+  class SliceUnion {
+  public:
     using Tensor = CTF::Tensor<F>;

     virtual void sliceIntoBuffer(size_t iteration, Tensor &to, Tensor const& from) = 0;

     /*
     ,* This function should enforce an important property of a SliceUnion.
     ,* Namely, there can be no two Slices of the same nature.
     ,*
     ,* This means that there can be at most one slice with a given Ty_x_Tu.
     ,*/
     void checkForDuplicates() const {
       std::vector<typename Slice<F>::Ty_x_Tu> tytus;
       for (auto const& s: slices) {

@@ -1160,7 +1196,11 @@ namespace atrip {
             : Slice<F>::Fetch
             ;
         if (blank.info.state == Slice<F>::SelfSufficient) {
+#if defined(HAVE_CUDA)
+          blank.mpi_data = sources[from.source].data();
+#else
           blank.data = sources[from.source].data();
+#endif
         } else {
           if (freePointers.size() == 0) {
             std::stringstream stream;

@@ -1314,8 +1354,7 @@ namespace atrip {
     }

     // CONSTRUCTOR
-    SliceUnion( Tensor const& sourceTensor
-              , std::vector<typename Slice<F>::Type> sliceTypes_
+    SliceUnion( std::vector<typename Slice<F>::Type> sliceTypes_
               , std::vector<size_t> sliceLength_
               , std::vector<size_t> paramLength
               , size_t np

                            1UL, std::multiplies<size_t>())))
       , name(name_)
       , sliceTypes(sliceTypes_)
-      , sliceBuffers(nSliceBuffers, sources[0])
+      , sliceBuffers(nSliceBuffers)
       //, slices(2 * sliceTypes.size(), Slice<F>{ sources[0].size() })
     { // constructor begin
       LOG(0,"Atrip") << "INIT SliceUnion: " << name << "\n";

+      for (auto& ptr: sliceBuffers)
+#if defined(HAVE_CUDA)
+        cuMemAlloc(&ptr, sizeof(F) * sources[0].size());
+#else
+        ptr = (DataPtr<F>)malloc(sizeof(F) * sources[0].size());
+#endif
+
       slices = std::vector<Slice<F>>(2 * sliceTypes.size(), { sources[0].size() });
       // TODO: think exactly  ^------------------- about this number

       // initialize the freePointers with the pointers to the buffers
       std::transform(sliceBuffers.begin(), sliceBuffers.end(),
                      std::inserter(freePointers, freePointers.begin()),
-                     [](std::vector<F> &vec) { return vec.data(); });
+                     [](DataPtr<F> ptr) { return ptr; });

@@ -1366,16 +1412,14 @@ namespace atrip {
                      << freePointers.size() << "\n";
       LOG(1,"Atrip") << "#sliceBuffers "
                      << sliceBuffers.size() << "\n";
-      LOG(1,"Atrip") << "#sliceBuffers[0] "
-                     << sliceBuffers[0].size() << "\n";
       LOG(1,"Atrip") << "#sliceLength "
                      << sliceLength.size() << "\n";
       LOG(1,"Atrip") << "#paramLength "
                      << paramLength.size() << "\n";
       LOG(1,"Atrip") << "GB*" << np << " "
                      << double(sources.size() + sliceBuffers.size())
-                      * sources[0].size()
-                      * 8 * np
+                      ,* sources[0].size()
+                      ,* 8 * np
                       / 1073741824.0
                      << "\n";
     } // constructor ends

@@ -1408,8 +1452,8 @@ namespace atrip {
     }

     /**
-     * \brief Send asynchronously only if the state is Fetch
-     */
+     ,* \brief Send asynchronously only if the state is Fetch
+     ,*/
     void send( size_t otherRank
              , typename Slice<F>::LocalDatabaseElement const& el
              , size_t tag) const noexcept {

@@ -1446,9 +1490,12 @@ namespace atrip {
       if (slice.info.state == Slice<F>::Fetch) {
         // TODO: do it through the slice class
         slice.info.state = Slice<F>::Dispatched;
-        MPI_Request request;
-        slice.request = request;
+#if defined(HAVE_CUDA)
+        slice.mpi_data = (F*)malloc(sizeof(F) * slice.size);
+        MPI_Irecv( slice.mpi_data
+#else
         MPI_Irecv( slice.data
+#endif
                  , slice.size
                  , traits::mpi::datatypeOf<F>()
                  , info.from.rank

@@ -1464,7 +1511,7 @@ namespace atrip {
       for (auto type: sliceTypes) unwrapSlice(type, abc);
     }

-    F* unwrapSlice(typename Slice<F>::Type type, ABCTuple const& abc) {
+    DataPtr<F> unwrapSlice(typename Slice<F>::Type type, ABCTuple const& abc) {
       WITH_CRAZY_DEBUG
       WITH_RANK << "__unwrap__:slice " << type << " w n " << name

@@ -1508,6 +1555,15 @@ namespace atrip {
       return slice.data;
     }

+    ~SliceUnion() {
+      for (auto& ptr: sliceBuffers)
+#if defined(HAVE_CUDA)
+        cuMemFree(ptr);
+#else
+        std::free(ptr);
+#endif
+    }
+
     const RankMap<F> rankMap;
     const MPI_Comm world;
     const MPI_Comm universe;

     std::vector< Slice<F> > slices;
     typename Slice<F>::Name name;
     const std::vector<typename Slice<F>::Type> sliceTypes;
-    std::vector< std::vector<F> > sliceBuffers;
-    std::set<F*> freePointers;
+    std::vector< DataPtr<F> > sliceBuffers;
+    std::set< DataPtr<F> > freePointers;
   };

@@ -1537,7 +1593,10 @@ namespace atrip {
       }
       return **sliceUnionIt;
     }
+#+end_src

+** Epilog :noexport:
+#+begin_src c++ :tangle (atrip-slice-union-h)
 }
 #+end_src

@@ -1625,9 +1684,8 @@
 std::vector<std::string> getNodeNames(MPI_Comm comm){
   MPI_Comm_size(comm, &np);
   std::vector<std::string> nodeList(np);
-  char nodeName[MPI_MAX_PROCESSOR_NAME]
-     , nodeNames[np*MPI_MAX_PROCESSOR_NAME]
-     ;
+  char nodeName[MPI_MAX_PROCESSOR_NAME];
+  char *nodeNames = (char*)malloc(np * MPI_MAX_PROCESSOR_NAME);
   std::vector<int> nameLengths(np)
                  , off(np)
                  ;

@@ -1654,10 +1712,13 @@
     std::string const s(&nodeNames[off[i]], nameLengths[i]);
     nodeList[i] = s;
   }
+  std::free(nodeNames);
   return nodeList;
 }
 #+end_src
+
+

 =getNodeInfos=
 #+begin_src c++ :tangle (atrip-tuples-h)
 struct RankInfo {

@@ -2295,8 +2356,7 @@ namespace atrip {
           , size_t np
           , MPI_Comm child_world
           , MPI_Comm global_world
-          ) : SliceUnion<F>( sourceTensor
-                           , {Slice<F>::A, Slice<F>::B, Slice<F>::C}
+          ) : SliceUnion<F>( {Slice<F>::A, Slice<F>::B, Slice<F>::C}
                            , {Nv, No, No} // size of the slices
                            , {Nv}
                            , np

@@ -2333,8 +2393,7 @@ namespace atrip {
           , size_t np
           , MPI_Comm child_world
           , MPI_Comm global_world
-          ) : SliceUnion<F>( sourceTensor
-                           , {Slice<F>::A, Slice<F>::B, Slice<F>::C}
+          ) : SliceUnion<F>( {Slice<F>::A, Slice<F>::B, Slice<F>::C}
                            , {No, No, No} // size of the slices
                            , {Nv} // size of the parametrization
                            , np

@@ -2368,8 +2427,7 @@ namespace atrip {
           , size_t np
           , MPI_Comm child_world
           , MPI_Comm global_world
-          ) : SliceUnion<F>( sourceTensor
-                           , { Slice<F>::AB, Slice<F>::BC, Slice<F>::AC
+          ) : SliceUnion<F>( { Slice<F>::AB, Slice<F>::BC, Slice<F>::AC
                              , Slice<F>::BA, Slice<F>::CB, Slice<F>::CA
                              }
                            , {Nv, No} // size of the slices

@@ -2409,8 +2467,7 @@ namespace atrip {
           , size_t np
           , MPI_Comm child_world
           , MPI_Comm global_world
-          ) : SliceUnion<F>( sourceTensor
-                           , {Slice<F>::AB, Slice<F>::BC, Slice<F>::AC}
+          ) : SliceUnion<F>( {Slice<F>::AB, Slice<F>::BC, Slice<F>::AC}
                            , {No, No} // size of the slices
                            , {Nv, Nv} // size of the parametrization
                            , np

@@ -2449,8 +2506,7 @@ namespace atrip {
           , size_t np
           , MPI_Comm child_world
           , MPI_Comm global_world
-          ) : SliceUnion<F>( sourceTensor
-                           , {Slice<F>::AB, Slice<F>::BC, Slice<F>::AC}
+          ) : SliceUnion<F>( {Slice<F>::AB, Slice<F>::BC, Slice<F>::AC}
                            , {No, No} // size of the slices
                            , {Nv, Nv} // size of the parametrization
                            , np

 * Equations
+
+This section presents one of the main physical contents of the program,
+the equations that are evaluated.
+
+The equations are of two types: energy expressions and intermediate
+tensor contractions.
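+
+Both energy routines below accumulate, for a fixed virtual triple
+\( (a,b,c) \), permutationally weighted products of the intermediates
+\( T_{ijk} \) and \( Z_{ijk} \), divided by the usual perturbative
+triples denominator (stated here for reference; the code receives
+\( \varepsilon_{abc} = \varepsilon_a + \varepsilon_b + \varepsilon_c \)
+precomputed and subtracts the occupied part inside the loops):
+
+\begin{equation*}
+\Delta^{abc}_{ijk}
+  = \varepsilon_a + \varepsilon_b + \varepsilon_c
+  - \varepsilon_i - \varepsilon_j - \varepsilon_k
+\end{equation*}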
+ +** Prolog :noexport: + #+begin_src c++ :tangle (atrip-equations-h) #pragma once -#include +#include #include +#include + namespace atrip { +using ABCTuple = std::array; +using PartialTuple = std::array; +using ABCTuples = std::vector; +#+end_src +#+begin_src c++ :tangle (atrip-equations-cxx) +#include + +#if defined(HAVE_CUDA) +#include +#endif + +namespace atrip { +#+end_src + +** Energy + +For the energy we have two types of functions, +a function for when the tuples \( (a,b,c) \) +are distinct and where they are the same, +they are accordingly called: + + +#+begin_src c++ :tangle (atrip-equations-h) template double getEnergyDistinct - ( const F epsabc - , std::vector const& epsi - , std::vector const& Tijk_ - , std::vector const& Zijk_ + ( F const epsabc + , size_t const No + , F* const epsi + , F* const Tijk + , F* const Zijk + ); + + template + double getEnergySame + ( F const epsabc + , size_t const No + , F* const epsi + , F* const Tijk + , F* const Zijk + ); +#+end_src + +Their implementations follow, and they are +written in a cache-friendly style for CPU architectures +through the =blockSize= variable. + +#+begin_src c++ :tangle (atrip-equations-cxx) + template + double getEnergyDistinct + ( F const epsabc + , size_t const No + , F* const epsi + , F* const Tijk + , F* const Zijk ) { constexpr size_t blockSize=16; F energy(0.); - const size_t No = epsi.size(); for (size_t kk=0; kk(Tijk_[i + No*j + No*No*k])) - , B(maybeConjugate(Tijk_[i + No*k + No*No*j])) - , C(maybeConjugate(Tijk_[j + No*i + No*No*k])) - , D(maybeConjugate(Tijk_[j + No*k + No*No*i])) - , E(maybeConjugate(Tijk_[k + No*i + No*No*j])) - , _F(maybeConjugate(Tijk_[k + No*j + No*No*i])) + , U(Zijk[i + No*j + No*No*k]) + , V(Zijk[i + No*k + No*No*j]) + , W(Zijk[j + No*i + No*No*k]) + , X(Zijk[j + No*k + No*No*i]) + , Y(Zijk[k + No*i + No*No*j]) + , Z(Zijk[k + No*j + No*No*i]) + , A(maybeConjugate(Tijk[i + No*j + No*No*k])) + , B(maybeConjugate(Tijk[i + No*k + No*No*j])) + , C(maybeConjugate(Tijk[j + No*i + No*No*k])) + , D(maybeConjugate(Tijk[j + No*k + No*No*i])) + , E(maybeConjugate(Tijk[k + No*i + No*No*j])) + , _F(maybeConjugate(Tijk[k + No*j + No*No*i])) , value = 3.0 * ( A * U + B * V @@ -2561,13 +2675,13 @@ namespace atrip { template double getEnergySame - ( const F epsabc - , std::vector const& epsi - , std::vector const& Tijk_ - , std::vector const& Zijk_ + ( F const epsabc + , size_t const No + , F* const epsi + , F* const Tijk + , F* const Zijk ) { constexpr size_t blockSize = 16; - const size_t No = epsi.size(); F energy = F(0.); for (size_t kk=0; kk(Tijk_[i + No*j + No*No*k])) - , B(maybeConjugate(Tijk_[j + No*k + No*No*i])) - , C(maybeConjugate(Tijk_[k + No*i + No*No*j])) + , U(Zijk[i + No*j + No*No*k]) + , V(Zijk[j + No*k + No*No*i]) + , W(Zijk[k + No*i + No*No*j]) + , A(maybeConjugate(Tijk[i + No*j + No*No*k])) + , B(maybeConjugate(Tijk[j + No*k + No*No*i])) + , C(maybeConjugate(Tijk[k + No*i + No*No*j])) , value = F(3.0) * ( A * U + B * V @@ -2610,56 +2724,221 @@ namespace atrip { return std::real(energy); } +#+end_src + +And we explicitly instantiate these templated functions: + +#+begin_src c++ :tangle (atrip-equations-cxx) + // instantiate double + template + double getEnergyDistinct + ( double const epsabc + , size_t const No + , double* const epsi + , double* const Tijk + , double* const Zijk + ); + + template + double getEnergySame + ( double const epsabc + , size_t const No + , double* const epsi + , double* const Tijk + , double* const Zijk + ); + + // instantiate Complex + template + double 
getEnergyDistinct + ( Complex const epsabc + , size_t const No + , Complex* const epsi + , Complex* const Tijk + , Complex* const Zijk + ); + + template + double getEnergySame + ( Complex const epsabc + , size_t const No + , Complex* const epsi + , Complex* const Tijk + , Complex* const Zijk + ); +#+end_src + +** Contractions + +*** Singles contribution +The first contraction we discuss is the one involving \( t_i^a \) +singles amplitudes. + +For every \( (a,b,c) \) we construct the object + +\begin{align*} +Z_{ijk} &= t^a_i V^{bc}_{jk} \\ + & + t^b_j V^{ac}_{ik} \\ + & + t^c_k V^{ab}_{ij} +\end{align*} + +and for it we will need the corresponding slices +of the \( V^{pp}_{ph} \) integrals, i.e., =AB=, =AC= and =BC=. + + +#+begin_src c++ :tangle (atrip-equations-h) :exports none template void singlesContribution ( size_t No , size_t Nv , const ABCTuple &abc - , F const* Tph - , F const* VABij - , F const* VACij - , F const* VBCij - , F *Zijk + , F* const Tph + , F* const VABij + , F* const VACij + , F* const VBCij + , F* Zijk + ); +#+end_src + +#+begin_src c++ :tangle (atrip-equations-cxx) + + template + void singlesContribution + ( size_t No + , size_t Nv + , const ABCTuple &abc + , F* const Tph + , F* const VABij + , F* const VACij + , F* const VBCij + , F* Zijk ) { const size_t a(abc[0]), b(abc[1]), c(abc[2]); - for (size_t k=0; k < No; k++) - for (size_t i=0; i < No; i++) - for (size_t j=0; j < No; j++) { - const size_t ijk = i + j*No + k*No*No - , jk = j + No * k - ; + const size_t NoNo = No*No; + // TODO: change order of for loops + for (size_t k = 0; k < No; k++) + for (size_t i = 0; i < No; i++) + for (size_t j = 0; j < No; j++) { + const size_t ijk = i + j*No + k*NoNo; Zijk[ijk] += Tph[ a + i * Nv ] * VBCij[ j + k * No ]; Zijk[ijk] += Tph[ b + j * Nv ] * VACij[ i + k * No ]; Zijk[ijk] += Tph[ c + k * Nv ] * VABij[ i + j * No ]; } } +// instantiate + template void singlesContribution( size_t No + , size_t Nv + , const ABCTuple &abc + , double* const Tph + , double* const VABij + , double* const VACij + , double* const VBCij + , double* Zijk + ); + + template void singlesContribution( size_t No + , size_t Nv + , const ABCTuple &abc + , Complex* const Tph + , Complex* const VABij + , Complex* const VACij + , Complex* const VBCij + , Complex* Zijk + ); +#+end_src + +*** Doubles contribution + +Next we build the tensor \( T_{ijk} \) from the contraction +of the coulomb integrals with the doubles amplitudes \( t^{ab}_{ij} \). +There are two kinds of contractions, which are differently challenging +in terms of computation. 
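+
+Both kinds are ultimately evaluated as matrix-matrix multiplications
+through =xgemm= (see the Blas section).  As a hedged sketch of the
+mapping (the storage layouts and the names =TABhh=, =VhhhC= and
+=Tijk_j_ik= are assumptions for illustration, not the tangled code),
+the hole contraction
+\( T_{ijk} \leftarrow T_{ijk} - \sum_L T^{ab}_{Lj} V^{Lc}_{ik} \)
+becomes a single transposed GEMM:
+
+#+begin_src c++
+// Sketch: TABhh is assumed to hold T^{ab}_{Lj} as an No x No matrix
+// (L fastest) and VhhhC to hold V^{Lc}_{ik} as an No x (No*No) matrix
+// (L fastest); one call then accumulates the whole term.
+const int m = (int)No        // rows of the result: j
+        , n = (int)(No*No)   // columns of the result: the pair (i,k)
+        , k = (int)No;       // contracted index: L
+F minusOne(-1.0), one(1.0);
+atrip::xgemm("T", "N", &m, &n, &k,
+             &minusOne,
+             TABhh, &k,      // A = T^{ab}_{Lj}, lda = No
+             VhhhC, &k,      // B = V^{Lc}_{ik}, ldb = No
+             &one,
+             Tijk_j_ik, &m); // C(j, ik) -= sum_L A(L,j) * B(L, ik)
+#+end_src
+
+With this picture in mind, the two groups of terms are: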
+ +\begin{align*} +T_{ijk} &= \\ +\sum_{{\color{red}L} = 0}^{N_\mathrm{o}} +&- T^{{\color{blue} ab}}_{{\color{red}L}j} V_{ik}^{{\color{red}L}{\color{blue}c}} \\ +&- T^{{\color{blue} ab}}_{i{\color{red}L}} V_{jk}^{{\color{red}L}{\color{blue}c}} \\ +&- T^{{\color{blue} ac}}_{{\color{red}L}k} V_{ij}^{{\color{red}L}{\color{blue}b}} \\ +&- T^{{\color{blue} ac}}_{i{\color{red}L}} V_{kj}^{{\color{red}L}{\color{blue}b}} \\ +&- T^{{\color{blue} bc}}_{{\color{red}L}k} V_{ji}^{{\color{red}L}{\color{blue}a}} \\ +&- T^{{\color{blue} bc}}_{j{\color{red}L}} V_{ki}^{{\color{red}L}{\color{blue}a}} +\end{align*} + +\begin{align*} +T_{ijk} &= \\ +\sum_{{\color{red}e} = 0}^{N_\mathrm{v}} + &\phantom{+} +V^{{\color{blue}ab}}_{{\color{red}e}i} T^{{\color{blue}c}{\color{red}e}}_{ij} \\ + &+ V^{{\color{blue}ba}}_{{\color{red}e}j} T^{{\color{blue}c}{\color{red}e}}_{ji} \\ + &+ V^{{\color{blue}cb}}_{{\color{red}e}k} T^{{\color{blue}a}{\color{red}e}}_{kj} \\ + &+ V^{{\color{blue}ac}}_{{\color{red}e}i} T^{{\color{blue}b}{\color{red}e}}_{ik} \\ + &+ V^{{\color{blue}bc}}_{{\color{red}e}j} T^{{\color{blue}a}{\color{red}e}}_{jk} \\ + &+ V^{{\color{blue}ca}}_{{\color{red}e}k} T^{{\color{blue}b}{\color{red}e}}_{ki} \\ +\end{align*} + + +#+begin_src c++ :tangle (atrip-equations-h) template void doublesContribution ( const ABCTuple &abc , size_t const No , size_t const Nv // -- VABCI - , F const* VABph - , F const* VACph - , F const* VBCph - , F const* VBAph - , F const* VCAph - , F const* VCBph + , F* const VABph + , F* const VACph + , F* const VBCph + , F* const VBAph + , F* const VCAph + , F* const VCBph // -- VHHHA - , F const* VhhhA - , F const* VhhhB - , F const* VhhhC + , F* const VhhhA + , F* const VhhhB + , F* const VhhhC // -- TA - , F const* TAphh - , F const* TBphh - , F const* TCphh + , F* const TAphh + , F* const TBphh + , F* const TCphh // -- TABIJ - , F const* TABhh - , F const* TAChh - , F const* TBChh + , F* const TABhh + , F* const TAChh + , F* const TBChh // -- TIJK - , F *Tijk + , F* Tijk + ); +#+end_src + + + + +#+begin_src c++ :tangle (atrip-equations-cxx) + template + void doublesContribution + ( const ABCTuple &abc + , size_t const No + , size_t const Nv + // -- VABCI + , F* const VABph + , F* const VACph + , F* const VBCph + , F* const VBAph + , F* const VCAph + , F* const VCBph + // -- VHHHA + , F* const VhhhA + , F* const VhhhB + , F* const VhhhC + // -- TA + , F* const TAphh + , F* const TBphh + , F* const TCphh + // -- TABIJ + , F* const TABhh + , F* const TAChh + , F* const TBChh + // -- TIJK + , F* Tijk ) { const size_t a = abc[0], b = abc[1], c = abc[2] @@ -2852,19 +3131,92 @@ namespace atrip { #endif } + + + // instantiate templates + template + void doublesContribution + ( const ABCTuple &abc + , size_t const No + , size_t const Nv + // -- VABCI + , double* const VABph + , double* const VACph + , double* const VBCph + , double* const VBAph + , double* const VCAph + , double* const VCBph + // -- VHHHA + , double* const VhhhA + , double* const VhhhB + , double* const VhhhC + // -- TA + , double* const TAphh + , double* const TBphh + , double* const TCphh + // -- TABIJ + , double* const TABhh + , double* const TAChh + , double* const TBChh + // -- TIJK + , double* Tijk + ); + + template + void doublesContribution + ( const ABCTuple &abc + , size_t const No + , size_t const Nv + // -- VABCI + , Complex* const VABph + , Complex* const VACph + , Complex* const VBCph + , Complex* const VBAph + , Complex* const VCAph + , Complex* const VCBph + // -- VHHHA + , Complex* const VhhhA + , Complex* const 
VhhhB + , Complex* const VhhhC + // -- TA + , Complex* const TAphh + , Complex* const TBphh + , Complex* const TCphh + // -- TABIJ + , Complex* const TABhh + , Complex* const TAChh + , Complex* const TBChh + // -- TIJK + , Complex* Tijk + ); +#+end_src + +** Epilog :noexport: +#+begin_src c++ :tangle (atrip-equations-h) } #+end_src +#+begin_src c++ :tangle (atrip-equations-cxx) +} +#+end_src + + + + * Blas + The main matrix-matrix multiplication method used in this algorithm is mainly using the =DGEMM= function, which we declare as =extern= since it should be known only at link-time. -#+begin_src c++ :tangle (atrip-blas-h) +#+begin_src c++ :tangle (atrip-blas-h) :exports none #pragma once + +#include +#include "config.h" + namespace atrip { - using Complex = std::complex; - +#if !defined(HAVE_CUDA) extern "C" { void dgemm_( const char *transa, @@ -2898,9 +3250,9 @@ namespace atrip { const int *ldc ); } +#endif - - template + template void xgemm(const char *transa, const char *transb, const int *m, @@ -2913,12 +3265,61 @@ namespace atrip { const int *ldb, F *beta, F *C, + const int *ldc); +} +#+end_src + +#+begin_src c++ :tangle (atrip-blas-cxx) +#include +#include + +#if defined(HAVE_CUDA) +# include + + static inline + cublasOperation_t char_to_cublasOperation(const char* trans) { + if (strncmp("N", trans, 1) == 0) + return CUBLAS_OP_N; + else if (strncmp("T", trans, 1) == 0) + return CUBLAS_OP_T; + else + return CUBLAS_OP_C; + } + +#endif + +namespace atrip { + + + template <> + void xgemm(const char *transa, + const char *transb, + const int *m, + const int *n, + const int *k, + double *alpha, + const double *A, + const int *lda, + const double *B, + const int *ldb, + double *beta, + double *C, const int *ldc) { +#if defined(HAVE_CUDA) + cublasDgemm(Atrip::cuda.handle, + char_to_cublasOperation(transa), + char_to_cublasOperation(transb), + ,*m, *n, *k, + alpha, A, *lda, + B, *ldb, beta, + C, *ldc); +#else dgemm_(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); +#endif } template <> @@ -2935,12 +3336,28 @@ namespace atrip { Complex *beta, Complex *C, const int *ldc) { +#if defined(HAVE_CUDA) + cuDoubleComplex + cu_alpha = {std::real(*alpha), std::imag(*alpha)}, + cu_beta = {std::real(*beta), std::imag(*beta)}; + cublasZgemm(Atrip::cuda.handle, + char_to_cublasOperation(transa), + char_to_cublasOperation(transb), + ,*m, *n, *k, + &cu_alpha, + reinterpret_cast(A), *lda, + reinterpret_cast(B), *ldb, + &cu_beta, + reinterpret_cast(C), *ldc); +#else zgemm_(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); +#endif } + } #+end_src @@ -2952,9 +3369,17 @@ namespace atrip { #include #include #include +#include "config.h" +#if defined(HAVE_CUDA) +#include +#define CUBLASAPI +#include +#include +#endif #include +#include #define ADD_ATTRIBUTE(_type, _name, _default) \ _type _name = _default; \ @@ -2967,12 +3392,21 @@ namespace atrip { struct Atrip { - static int rank; - static int np; + static size_t rank; + static size_t np; static MPI_Comm communicator; static Timings chrono; +#if defined(HAVE_CUDA) + struct CudaContext { + cublasStatus_t status; + cublasHandle_t handle; + }; + static CudaContext cuda; +#endif + static void init(MPI_Comm); + template struct Input { CTF::Tensor *ei = nullptr @@ -3041,8 +3475,11 @@ using namespace atrip; template bool RankMap::RANK_ROUND_ROBIN; template bool RankMap::RANK_ROUND_ROBIN; template bool RankMap::RANK_ROUND_ROBIN; -int Atrip::rank; -int Atrip::np; +size_t Atrip::rank; +size_t Atrip::np; +#if defined(HAVE_CUDA) +typename 
Atrip::CudaContext Atrip::cuda;
+#endif
 MPI_Comm Atrip::communicator;
 Timings Atrip::chrono;

 void atrip::registerIterationDescriptor(IterationDescriptor d) {

 void Atrip::init(MPI_Comm world) {
   Atrip::communicator = world;
-  MPI_Comm_rank(world, &Atrip::rank);
-  MPI_Comm_size(world, &Atrip::np);
+  {
+    // MPI uses int for ranks and sizes; go through temporaries instead
+    // of casting &size_t to int*, which writes the wrong width
+    int rank_, np_;
+    MPI_Comm_rank(world, &rank_);
+    MPI_Comm_size(world, &np_);
+    Atrip::rank = (size_t)rank_;
+    Atrip::np = (size_t)np_;
+  }
+
+#if defined(HAVE_CUDA)
+  Atrip::cuda.status = cublasCreate(&Atrip::cuda.handle);
+#endif
+
 }

 template <typename F>
 Atrip::Output Atrip::run(Atrip::Input<F> const& in) {

-  const int np = Atrip::np;
-  const int rank = Atrip::rank;
+  const size_t np = Atrip::np;
+  const size_t rank = Atrip::rank;
   MPI_Comm universe = Atrip::communicator;

   const size_t No = in.ei->lens[0];
   const size_t Nv = in.ea->lens[0];

   LOG(0,"Atrip") << "np: " << np << "\n";

   // allocate the three scratches, see piecuch
-  std::vector<F> Tijk(No*No*No)  // doubles only (see piecuch)
-              , Zijk(No*No*No)  // singles + doubles (see piecuch)
-              // we need local copies of the following tensors on every
-              // rank
-              , epsi(No)
-              , epsa(Nv)
-              , Tai(No * Nv)
-              ;
+  // we need local copies of the following tensors on every
+  // rank
+  std::vector<F> _epsi(No)
+              , _epsa(Nv)
+              , _Tai(No * Nv)
+              ;

-  in.ei->read_all(epsi.data());
-  in.ea->read_all(epsa.data());
-  in.Tph->read_all(Tai.data());
+  in.ei->read_all(_epsi.data());
+  in.ea->read_all(_epsa.data());
+  in.Tph->read_all(_Tai.data());
+
+#if defined(HAVE_CUDA)
+  DataPtr<F> Tai, epsi, epsa;
+  cuMemAlloc(&Tai, sizeof(F) * _Tai.size());
+  cuMemAlloc(&epsi, sizeof(F) * _epsi.size());
+  cuMemAlloc(&epsa, sizeof(F) * _epsa.size());
+
+  cuMemcpy(Tai, (DataPtr<F>)_Tai.data(), sizeof(F) * _Tai.size());
+  cuMemcpy(epsi, (DataPtr<F>)_epsi.data(), sizeof(F) * _epsi.size());
+  cuMemcpy(epsa, (DataPtr<F>)_epsa.data(), sizeof(F) * _epsa.size());
+
+  DataPtr<F> Tijk, Zijk;
+  cuMemAlloc(&Tijk, sizeof(F) * No * No * No);
+  cuMemAlloc(&Zijk, sizeof(F) * No * No * No);
+#else
+  std::vector<F> &Tai = _Tai, &epsi = _epsi, &epsa = _epsa;
+  std::vector<F> Tijk(No*No*No), Zijk(No*No*No);
+#endif

   RankMap<F>::RANK_ROUND_ROBIN = in.rankRoundRobin;
   if (RankMap<F>::RANK_ROUND_ROBIN) {

@@ -3153,7 +3611,6 @@
       )
   const size_t nIterations = tuplesList.size();
   {
-    const size_t _all_tuples = Nv * (Nv + 1) * (Nv + 2) / 6 - Nv;
     LOG(0,"Atrip") << "#iterations: "
                    << nIterations
                    << "/"

@@ -3473,26 +3930,30 @@
       WITH_CHRONO("doubles",
         doublesContribution( abc, (size_t)No, (size_t)Nv
           // -- VABCI
-          , abph.unwrapSlice(Slice<F>::AB, abc)
-          , abph.unwrapSlice(Slice<F>::AC, abc)
-          , abph.unwrapSlice(Slice<F>::BC, abc)
-          , abph.unwrapSlice(Slice<F>::BA, abc)
-          , abph.unwrapSlice(Slice<F>::CA, abc)
-          , abph.unwrapSlice(Slice<F>::CB, abc)
+          , (F*)abph.unwrapSlice(Slice<F>::AB, abc)
+          , (F*)abph.unwrapSlice(Slice<F>::AC, abc)
+          , (F*)abph.unwrapSlice(Slice<F>::BC, abc)
+          , (F*)abph.unwrapSlice(Slice<F>::BA, abc)
+          , (F*)abph.unwrapSlice(Slice<F>::CA, abc)
+          , (F*)abph.unwrapSlice(Slice<F>::CB, abc)
           // -- VHHHA
-          , hhha.unwrapSlice(Slice<F>::A, abc)
-          , hhha.unwrapSlice(Slice<F>::B, abc)
-          , hhha.unwrapSlice(Slice<F>::C, abc)
+          , (F*)hhha.unwrapSlice(Slice<F>::A, abc)
+          , (F*)hhha.unwrapSlice(Slice<F>::B, abc)
+          , (F*)hhha.unwrapSlice(Slice<F>::C, abc)
           // -- TA
-          , taphh.unwrapSlice(Slice<F>::A, abc)
-          , taphh.unwrapSlice(Slice<F>::B, abc)
-          , taphh.unwrapSlice(Slice<F>::C, abc)
+          , (F*)taphh.unwrapSlice(Slice<F>::A, abc)
+          , (F*)taphh.unwrapSlice(Slice<F>::B, abc)
+          , (F*)taphh.unwrapSlice(Slice<F>::C, abc)
           // -- TABIJ
-          , tabhh.unwrapSlice(Slice<F>::AB, abc)
-          , tabhh.unwrapSlice(Slice<F>::AC, abc)
-          , tabhh.unwrapSlice(Slice<F>::BC, abc)
+          , (F*)tabhh.unwrapSlice(Slice<F>::AB, abc)
+          , (F*)tabhh.unwrapSlice(Slice<F>::AC, abc)
+          , (F*)tabhh.unwrapSlice(Slice<F>::BC, abc)
           // -- TIJK
+#if defined(HAVE_CUDA)
+          , (F*)Tijk
+#else
          , Tijk.data()
+#endif
          );
         WITH_RANK << iteration << "-th doubles done\n";
       ))

@@ -3507,15 +3968,25 @@
             abhh.unwrapAll(abc);
           )))
       WITH_CHRONO("reorder",
-        for (size_t I(0); I < Zijk.size(); I++) Zijk[I] = Tijk[I];
+        #pragma acc parallel
+        for (size_t I(0); I < No*No*No; I++)
+          ((F*)Zijk)[I] = ((F*)Tijk)[I];
       )
       WITH_CHRONO("singles",
         singlesContribution( No, Nv, abc
+#if defined(HAVE_CUDA)
+                           , (F*)Tai
+#else
                            , Tai.data()
+#endif
-                           , abhh.unwrapSlice(Slice<F>::AB, abc)
-                           , abhh.unwrapSlice(Slice<F>::AC, abc)
-                           , abhh.unwrapSlice(Slice<F>::BC, abc)
+                           , (F*)abhh.unwrapSlice(Slice<F>::AB, abc)
+                           , (F*)abhh.unwrapSlice(Slice<F>::AC, abc)
+                           , (F*)abhh.unwrapSlice(Slice<F>::BC, abc)
+#if defined(HAVE_CUDA)
+                           , (F*)Zijk);
+#else
                            , Zijk.data());
+#endif
       )
     }

@@ -3527,13 +3998,13 @@
       int distinct(0);
      if (abc[0] == abc[1]) distinct++;
      if (abc[1] == abc[2]) distinct--;
-      const F epsabc(epsa[abc[0]] + epsa[abc[1]] + epsa[abc[2]]);
+      const F epsabc(_epsa[abc[0]] + _epsa[abc[1]] + _epsa[abc[2]]);
       WITH_CHRONO("energy",
         if ( distinct == 0)
-          tupleEnergy = getEnergyDistinct(epsabc, epsi, Tijk, Zijk);
+          tupleEnergy = getEnergyDistinct(epsabc, No, (F*)epsi, (F*)Tijk, (F*)Zijk);
         else
-          tupleEnergy = getEnergySame(epsabc, epsi, Tijk, Zijk);
+          tupleEnergy = getEnergySame(epsabc, No, (F*)epsi, (F*)Tijk, (F*)Zijk);
       )

 #if defined(HAVE_OCD) || defined(ATRIP_PRINT_TUPLES)

@@ -3938,6 +4409,57 @@ using namespace atrip;
 }
 #+end_src

+* Miscellaneous
+** Complex numbers
+#+begin_src c++ :tangle (atrip-complex-h)
+#pragma once
+
+#include
+#include
+
+namespace atrip {
+
+  using Complex = std::complex<double>;
+
+  template <typename F> F maybeConjugate(const F);
+
+  namespace traits {
+
+    template <typename F> bool isComplex();
+
+    namespace mpi {
+      template <typename F> MPI_Datatype datatypeOf(void);
+    }
+
+  }
+
+}
+#+end_src
+
+#+begin_src c++ :tangle (atrip-complex-cxx)
+#include
+
+namespace atrip {
+
+  template <> double maybeConjugate(const double a) { return a; }
+  template <> Complex maybeConjugate(const Complex a) { return std::conj(a); }
+
+  namespace traits {
+    template <typename F> bool isComplex() { return false; }
+    template <> bool isComplex<double>() { return false; }
+    template <> bool isComplex<Complex>() { return true; }
+    namespace mpi {
+      template <> MPI_Datatype datatypeOf<double>() { return MPI_DOUBLE; }
+      template <> MPI_Datatype datatypeOf<Complex>() { return MPI_DOUBLE_COMPLEX; }
+    }
+  }
+
+}
+#+end_src

 * Include header
 #+begin_src c++ :tangle (atrip-main-h)
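+// A hedged usage sketch of the public interface defined above.  The
+// exact Input/Output fields are assumptions based on the declarations
+// in the Main section (e.g. Output::energy is assumed, not shown):
+//
+//   MPI_Init(&argc, &argv);
+//   atrip::Atrip::init(MPI_COMM_WORLD);
+//   atrip::Atrip::Input<double> in;  // wire up the CTF input tensors
+//   auto const out = atrip::Atrip::run<double>(in);
+//   if (atrip::Atrip::rank == 0)
+//     std::cout << "(T) energy: " << out.energy << "\n";
+//   MPI_Finalize();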