From c757c4650c7544eb464098a78d11cb7be4e2ded3 Mon Sep 17 00:00:00 2001 From: Alejandro Gallo Date: Thu, 14 Jul 2022 00:17:24 +0200 Subject: [PATCH] Fix problem with complex numbers --- atrip.org | 381 ++++++++++++++++++++++++++++++++++-------------------- shell.nix | 8 +- 2 files changed, 245 insertions(+), 144 deletions(-) diff --git a/atrip.org b/atrip.org index 49d598b..ec77dd5 100644 --- a/atrip.org +++ b/atrip.org @@ -214,17 +214,49 @@ of a GPU architecture. #+begin_src c++ :tangle (atrip-types-h) #pragma once +#include +#include + +namespace atrip { template +struct DataField; + +template <> +struct DataField { + using type = double; +}; + #if defined(HAVE_CUDA) + +template using DataPtr = CUdeviceptr; #define DataNullPtr 0x00 -#define _AT_(_array, _idx) ((F*)(_array))[(_idx)] + +template <> +struct DataField { + using type = cuDoubleComplex; +}; + + #else + +template using DataPtr = F*; #define DataNullPtr nullptr -#define _AT_(_array, _idx) (_array)[(_idx)] + +template <> +struct DataField { + using type = Complex; +}; + #endif + + +template +using DataFieldType = typename DataField::type; + +} #+end_src @@ -748,7 +780,7 @@ The main behaviour of the function should #if defined(HAVE_CUDA) // copy the retrieved mpi data to the device - cuMemcpy((DataPtr)mpi_data, data, size); + cuMemcpyHtoD(data, (void*)mpi_data, sizeof(F) * size); std::free(mpi_data); #endif @@ -2558,6 +2590,10 @@ tensor contractions. #include #include +#if defined(HAVE_CUDA) +#include +#endif + namespace atrip { using ABCTuple = std::array; @@ -2886,26 +2922,27 @@ V^{{\color{blue}ab}}_{{\color{red}e}i} T^{{\color{blue}c}{\color{red}e}}_{ij} \ , size_t const No , size_t const Nv // -- VABCI - , F* const VABph - , F* const VACph - , F* const VBCph - , F* const VBAph - , F* const VCAph - , F* const VCBph + , DataPtr const VABph + , DataPtr const VACph + , DataPtr const VBCph + , DataPtr const VBAph + , DataPtr const VCAph + , DataPtr const VCBph // -- VHHHA - , F* const VhhhA - , F* const VhhhB - , F* const VhhhC + , DataPtr const VhhhA + , DataPtr const VhhhB + , DataPtr const VhhhC // -- TA - , F* const TAphh - , F* const TBphh - , F* const TCphh + , DataPtr const TAphh + , DataPtr const TBphh + , DataPtr const TCphh // -- TABIJ - , F* const TABhh - , F* const TAChh - , F* const TBChh + , DataPtr const TABhh + , DataPtr const TAChh + , DataPtr const TBChh // -- TIJK - , F* Tijk + // , DataPtr Tijk + , DataFieldType* Tijk_ ); #+end_src @@ -2919,91 +2956,141 @@ V^{{\color{blue}ab}}_{{\color{red}e}i} T^{{\color{blue}c}{\color{red}e}}_{ij} \ , size_t const No , size_t const Nv // -- VABCI - , F* const VABph - , F* const VACph - , F* const VBCph - , F* const VBAph - , F* const VCAph - , F* const VCBph + , DataPtr const VABph + , DataPtr const VACph + , DataPtr const VBCph + , DataPtr const VBAph + , DataPtr const VCAph + , DataPtr const VCBph // -- VHHHA - , F* const VhhhA - , F* const VhhhB - , F* const VhhhC + , DataPtr const VhhhA + , DataPtr const VhhhB + , DataPtr const VhhhC // -- TA - , F* const TAphh - , F* const TBphh - , F* const TCphh + , DataPtr const TAphh + , DataPtr const TBphh + , DataPtr const TCphh // -- TABIJ - , F* const TABhh - , F* const TAChh - , F* const TBChh + , DataPtr const TABhh + , DataPtr const TAChh + , DataPtr const TBChh // -- TIJK - , F* Tijk + // , DataPtr Tijk_ + , DataFieldType* Tijk_ ) { const size_t a = abc[0], b = abc[1], c = abc[2] , NoNo = No*No, NoNv = No*Nv ; - #if defined(ATRIP_USE_DGEMM) - #define _IJK_(i, j, k) i + j*No + k*NoNo - #define REORDER(__II, __JJ, __KK) \ + typename DataField::type* Tijk = (typename DataField::type*) Tijk_; + +#if defined(ATRIP_USE_DGEMM) +#define _IJK_(i, j, k) i + j*No + k*NoNo +#define REORDER(__II, __JJ, __KK) \ WITH_CHRONO("doubles:reorder", \ for (size_t k = 0; k < No; k++) \ for (size_t j = 0; j < No; j++) \ for (size_t i = 0; i < No; i++) { \ - Tijk[_IJK_(i, j, k)] += _t_buffer[_IJK_(__II, __JJ, __KK)]; \ + Tijk[_IJK_(i, j, k)] += _t_buffer_p[_IJK_(__II, __JJ, __KK)]; \ } \ ) - #define DGEMM_PARTICLES(__A, __B) \ - atrip::xgemm( "T" \ - , "N" \ - , (int const*)&NoNo \ - , (int const*)&No \ - , (int const*)&Nv \ - , &one \ - , __A \ - , (int const*)&Nv \ - , __B \ - , (int const*)&Nv \ - , &zero \ - , _t_buffer.data() \ - , (int const*)&NoNo \ - ); - #define DGEMM_HOLES(__A, __B, __TRANSB) \ - atrip::xgemm( "N" \ - , __TRANSB \ - , (int const*)&NoNo \ - , (int const*)&No \ - , (int const*)&No \ - , &m_one \ - , __A \ - , (int const*)&NoNo \ - , __B \ - , (int const*)&No \ - , &zero \ - , _t_buffer.data() \ - , (int const*)&NoNo \ - ); - #define MAYBE_CONJ(_conj, _buffer) \ - for (size_t __i = 0; __i < NoNoNo; ++__i) \ - _conj[__i] = maybeConjugate(_buffer[__i]); \ +#if defined(HAVE_CUDA) +#define __TO_DEVICEPTR(_v) (DataFieldType*)(CUdeviceptr)thrust::raw_pointer_cast((_v)) +#define DGEMM_PARTICLES(__A, __B) \ + atrip::xgemm("T", \ + "N", \ + (int const*)&NoNo, \ + (int const*)&No, \ + (int const*)&Nv, \ + &one, \ + (DataFieldType*)__A, \ + (int const*)&Nv, \ + (DataFieldType*)__B, \ + (int const*)&Nv, \ + &zero, \ + _t_buffer_p, \ + (int const*)&NoNo); +#define DGEMM_HOLES(__A, __B, __TRANSB) \ + atrip::xgemm("N", \ + __TRANSB, \ + (int const*)&NoNo, \ + (int const*)&No, \ + (int const*)&No, \ + &m_one, \ + __TO_DEVICEPTR(__A), \ + (int const*)&NoNo, \ + (DataFieldType*)__B, \ + (int const*)&No, \ + &zero, \ + _t_buffer_p, \ + (int const*)&NoNo \ + ); +#define MAYBE_CONJ(_conj, _buffer) \ + for (size_t __i = 0; __i < NoNoNo; ++__i) \ + _conj[__i] = \ + maybeConjugate>(((DataFieldType*)_buffer)[__i]); +#else +#define __TO_DEVICEPTR(_v) (_v) +#define DGEMM_PARTICLES(__A, __B) \ + atrip::xgemm("T", \ + "N", \ + (int const*)&NoNo, \ + (int const*)&No, \ + (int const*)&Nv, \ + &one, \ + __A, \ + (int const*)&Nv, \ + __B, \ + (int const*)&Nv, \ + &zero, \ + _t_buffer_p, \ + (int const*)&NoNo \ + ); +#define DGEMM_HOLES(__A, __B, __TRANSB) \ + atrip::xgemm("N", \ + __TRANSB, \ + (int const*)&NoNo, \ + (int const*)&No, \ + (int const*)&No, \ + &m_one, \ + __A, \ + (int const*)&NoNo, \ + __B, \ + (int const*)&No, \ + &zero, \ + _t_buffer_p, \ + (int const*)&NoNo \ + ); +#define MAYBE_CONJ(_conj, _buffer) \ + for (size_t __i = 0; __i < NoNoNo; ++__i) \ + _conj[__i] = maybeConjugate(_buffer[__i]); +#endif const size_t NoNoNo = No*NoNo; +#ifdef HAVE_CUDA + thrust::device_vector< DataFieldType > _t_buffer; +#else std::vector _t_buffer; +#endif _t_buffer.reserve(NoNoNo); + DataFieldType* _t_buffer_p = __TO_DEVICEPTR(_t_buffer.data()); F one{1.0}, m_one{-1.0}, zero{0.0}; WITH_CHRONO("double:reorder", for (size_t k = 0; k < NoNoNo; k++) { - Tijk[k] = 0.0; + Tijk[k] = DataFieldType{0.0}; }) // TOMERGE: replace chronos WITH_CHRONO("doubles:holes", { // Holes part %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +#ifdef HAVE_CUDA + thrust::device_vector< DataFieldType > _vhhh(NoNoNo); +#else std::vector _vhhh(NoNoNo); +#endif // VhhhC[i + k*No + L*NoNo] * TABhh[L + j*No]; H1 MAYBE_CONJ(_vhhh, VhhhC) @@ -3135,59 +3222,59 @@ V^{{\color{blue}ab}}_{{\color{red}e}i} T^{{\color{blue}c}{\color{red}e}}_{ij} \ // instantiate templates template - void doublesContribution + void doublesContribution ( const ABCTuple &abc , size_t const No , size_t const Nv // -- VABCI - , double* const VABph - , double* const VACph - , double* const VBCph - , double* const VBAph - , double* const VCAph - , double* const VCBph + , DataPtr const VABph + , DataPtr const VACph + , DataPtr const VBCph + , DataPtr const VBAph + , DataPtr const VCAph + , DataPtr const VCBph // -- VHHHA - , double* const VhhhA - , double* const VhhhB - , double* const VhhhC + , DataPtr const VhhhA + , DataPtr const VhhhB + , DataPtr const VhhhC // -- TA - , double* const TAphh - , double* const TBphh - , double* const TCphh + , DataPtr const TAphh + , DataPtr const TBphh + , DataPtr const TCphh // -- TABIJ - , double* const TABhh - , double* const TAChh - , double* const TBChh + , DataPtr const TABhh + , DataPtr const TAChh + , DataPtr const TBChh // -- TIJK - , double* Tijk + , DataFieldType* Tijk ); template - void doublesContribution + void doublesContribution ( const ABCTuple &abc , size_t const No , size_t const Nv // -- VABCI - , Complex* const VABph - , Complex* const VACph - , Complex* const VBCph - , Complex* const VBAph - , Complex* const VCAph - , Complex* const VCBph + , DataPtr const VABph + , DataPtr const VACph + , DataPtr const VBCph + , DataPtr const VBAph + , DataPtr const VCAph + , DataPtr const VCBph // -- VHHHA - , Complex* const VhhhA - , Complex* const VhhhB - , Complex* const VhhhC + , DataPtr const VhhhA + , DataPtr const VhhhB + , DataPtr const VhhhC // -- TA - , Complex* const TAphh - , Complex* const TBphh - , Complex* const TCphh + , DataPtr const TAphh + , DataPtr const TBphh + , DataPtr const TCphh // -- TABIJ - , Complex* const TABhh - , Complex* const TAChh - , Complex* const TBChh + , DataPtr const TABhh + , DataPtr const TAChh + , DataPtr const TBChh // -- TIJK - , Complex* Tijk + , DataFieldType* Tijk ); #+end_src @@ -3212,6 +3299,7 @@ is mainly using the =DGEMM= function, which we declare as #pragma once #include +#include #include "config.h" namespace atrip { @@ -3259,12 +3347,12 @@ namespace atrip { const int *n, const int *k, F *alpha, - const F *A, + const DataFieldType *A, const int *lda, - const F *B, + const DataFieldType *B, const int *ldb, F *beta, - F *C, + DataFieldType *C, const int *ldc); } #+end_src @@ -3292,18 +3380,18 @@ namespace atrip { template <> - void xgemm(const char *transa, + void xgemm(const char *transa, const char *transb, const int *m, const int *n, const int *k, double *alpha, - const double *A, + const typename DataField::type *A, const int *lda, - const double *B, + const typename DataField::type *B, const int *ldb, double *beta, - double *C, + typename DataField::type *C, const int *ldc) { #if defined(HAVE_CUDA) cublasDgemm(Atrip::cuda.handle, @@ -3323,20 +3411,21 @@ namespace atrip { } template <> - void xgemm(const char *transa, + void xgemm(const char *transa, const char *transb, const int *m, const int *n, const int *k, Complex *alpha, - const Complex *A, + const typename DataField::type *A, const int *lda, - const Complex *B, + const typename DataField::type *B, const int *ldb, Complex *beta, - Complex *C, + typename DataField::type *C, const int *ldc) { #if defined(HAVE_CUDA) +#pragma warning HAVE_CUDA cuDoubleComplex cu_alpha = {std::real(*alpha), std::imag(*alpha)}, cu_beta = {std::real(*beta), std::imag(*beta)}; @@ -3345,10 +3434,11 @@ namespace atrip { char_to_cublasOperation(transb), ,*m, *n, *k, &cu_alpha, - reinterpret_cast(A), *lda, - reinterpret_cast(B), *ldb, + + A, *lda, + B, *ldb, &cu_beta, - reinterpret_cast(C), *ldc); + C, *ldc); #else zgemm_(transa, transb, m, n, k, @@ -3531,9 +3621,9 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { cuMemAlloc(&epsi, sizeof(F) * _epsi.size()); cuMemAlloc(&epsa, sizeof(F) * _epsa.size()); - cuMemcpy(Tai, (DataPtr)_Tai.data(), sizeof(F) * _Tai.size()); - cuMemcpy(epsi,(DataPtr)_epsi.data(), sizeof(F) * _epsi.size()); - cuMemcpy(epsa, (DataPtr)_epsa.data(), sizeof(F) * _epsa.size()); + cuMemcpyHtoD(Tai, (void*)_Tai.data(), sizeof(F) * _Tai.size()); + cuMemcpyHtoD(epsi,(void*)_epsi.data(), sizeof(F) * _epsi.size()); + cuMemcpyHtoD(epsa, (void*)_epsa.data(), sizeof(F) * _epsa.size()); DataPtr Tijk, Zijk; cuMemAlloc(&Tijk, sizeof(F) * No * No * No); @@ -3930,27 +4020,27 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { WITH_CHRONO("doubles", doublesContribution( abc, (size_t)No, (size_t)Nv // -- VABCI - , (F*)abph.unwrapSlice(Slice::AB, abc) - , (F*)abph.unwrapSlice(Slice::AC, abc) - , (F*)abph.unwrapSlice(Slice::BC, abc) - , (F*)abph.unwrapSlice(Slice::BA, abc) - , (F*)abph.unwrapSlice(Slice::CA, abc) - , (F*)abph.unwrapSlice(Slice::CB, abc) + , abph.unwrapSlice(Slice::AB, abc) + , abph.unwrapSlice(Slice::AC, abc) + , abph.unwrapSlice(Slice::BC, abc) + , abph.unwrapSlice(Slice::BA, abc) + , abph.unwrapSlice(Slice::CA, abc) + , abph.unwrapSlice(Slice::CB, abc) // -- VHHHA - , (F*)hhha.unwrapSlice(Slice::A, abc) - , (F*)hhha.unwrapSlice(Slice::B, abc) - , (F*)hhha.unwrapSlice(Slice::C, abc) + , hhha.unwrapSlice(Slice::A, abc) + , hhha.unwrapSlice(Slice::B, abc) + , hhha.unwrapSlice(Slice::C, abc) // -- TA - , (F*)taphh.unwrapSlice(Slice::A, abc) - , (F*)taphh.unwrapSlice(Slice::B, abc) - , (F*)taphh.unwrapSlice(Slice::C, abc) + , taphh.unwrapSlice(Slice::A, abc) + , taphh.unwrapSlice(Slice::B, abc) + , taphh.unwrapSlice(Slice::C, abc) // -- TABIJ - , (F*)tabhh.unwrapSlice(Slice::AB, abc) - , (F*)tabhh.unwrapSlice(Slice::AC, abc) - , (F*)tabhh.unwrapSlice(Slice::BC, abc) + , tabhh.unwrapSlice(Slice::AB, abc) + , tabhh.unwrapSlice(Slice::AC, abc) + , tabhh.unwrapSlice(Slice::BC, abc) // -- TIJK #if defined(HAVE_CUDA) - , (F*)Tijk + , (DataFieldType*)Tijk #else , Tijk.data() #endif @@ -4423,6 +4513,10 @@ namespace atrip { template F maybeConjugate(const F); +#if defined(HAVE_CUDA) + void operator+=(cuDoubleComplex& lz, cuDoubleComplex const& rz); +#endif + namespace traits { template bool isComplex(); @@ -4445,6 +4539,13 @@ namespace atrip { template <> double maybeConjugate(const double a) { return a; } template <> Complex maybeConjugate(const Complex a) { return std::conj(a); } +#if defined(HAVE_CUDA) + void operator+=(cuDoubleComplex& lz, cuDoubleComplex const& rz) { + lz.x += rz.x; + lz.y += rz.y; + } +#endif + namespace traits { template bool isComplex() { return false; } diff --git a/shell.nix b/shell.nix index 5f4aa06..94f0bc0 100644 --- a/shell.nix +++ b/shell.nix @@ -7,19 +7,19 @@ let - mkl = import ./etc/nix/mkl.nix { pkgs = (import { + unfree-pkgs = import { config.allowUnfree = true; - }); }; + }; openblas = import ./etc/nix/openblas.nix { inherit pkgs; }; - cuda-pkg = if cuda then (import ./cuda.nix { inherit pkgs; }) else {}; + mkl = import ./etc/nix/mkl.nix { pkgs = unfree-pkgs; }; + cuda-pkg = if cuda then (import ./cuda.nix { pkgs = unfree-pkgs; }) else {}; in pkgs.mkShell rec { - compiler-pkg = if compiler == "gcc11" then pkgs.gcc11 else if compiler == "gcc10" then pkgs.gcc10