From 7029ccda2a562850fc3a3ce4022bb941012d1e0f Mon Sep 17 00:00:00 2001 From: Alejandro Gallo Date: Thu, 2 Sep 2021 18:45:38 +0200 Subject: [PATCH] Simple update --- Makefile | 2 +- README.org | 1882 ++++++++++++++++++++++++++--------------- bench/test_main.cxx | 3 + config.el | 25 +- etc/config/gcc.mk | 3 +- etc/make/ctf_vars.mk | 4 +- etc/make/emacs.mk | 2 +- etc/nix/scalapack.nix | 56 ++ shell.nix | 19 + 9 files changed, 1318 insertions(+), 678 deletions(-) create mode 100644 etc/nix/scalapack.nix diff --git a/Makefile b/Makefile index 715056e..5c12e14 100644 --- a/Makefile +++ b/Makefile @@ -9,7 +9,7 @@ include ./bench/config.mk MAIN = README.org -$(SOURCES_FILE): $(MAIN) +$(SOURCES_FILE): $(MAIN) config.el echo -n "SOURCES = " > $@ $(EMACS) --eval '(atrip-print-sources)' >> $@ diff --git a/README.org b/README.org index c40e20f..c03218e 100644 --- a/README.org +++ b/README.org @@ -17,12 +17,11 @@ The algorithm uses two main data types, the =Slice= and the #include #include +#include #include namespace atrip { -using ABCTuple = std::array; -using PartialTuple = std::array; struct Slice { @@ -497,12 +496,10 @@ std::ostream& operator<<(std::ostream& out, Slice::Info const& i) { #include #include +#include + namespace atrip { - struct Atrip { - static size_t rank; - }; - size_t Atrip::rank; template std::string pretty_print(T&& value) { @@ -531,189 +528,75 @@ namespace atrip { #+end_src -** Include header - -#+begin_src c++ :tangle (atrip-main-h) +** The rank mapping +#+begin_src c++ :tangle (atrip-utils-h) #pragma once -#define TRIPLES_BENCHMARK -#define TRIPLES_DEBUG 1 -//#define TRIPLES_WORKLOAD_DUMP -#define TRIPLES_USE_DGEMM -//#define TRIPLES_PRINT_TUPLES +#include +#include -#if TRIPLES_DEBUG == 4 -# pragma message("WARNING: You have OCD debugging ABC triples "\ - "expect GB of output and consult your therapist") -# include -# define HAVE_OCD -# define OCD_Barrier(com) MPI_Barrier(com) -# define WITH_OCD -# define WITH_ROOT if (atrip::Atrip::rank == 0) -# define WITH_SPECIAL(r) if (atrip::Atrip::rank == r) -# define WITH_RANK std::cout << atrip::Atrip::rank << ": " -# define WITH_CRAZY_DEBUG -# define WITH_DBG -# define DBG(...) dbg(__VA_ARGS__) -#elif TRIPLES_DEBUG == 3 -# pragma message("WARNING: You have crazy debugging ABC triples,"\ - " expect GB of output") -# include -# define OCD_Barrier(com) -# define WITH_OCD if (false) -# define WITH_ROOT if (atrip::Atrip::rank == 0) -# define WITH_SPECIAL(r) if (atrip::Atrip::rank == r) -# define WITH_RANK std::cout << atrip::Atrip::rank << ": " -# define WITH_CRAZY_DEBUG -# define WITH_DBG -# define DBG(...) dbg(__VA_ARGS__) -#elif TRIPLES_DEBUG == 2 -# pragma message("WARNING: You have some debugging info for ABC triples") -# include -# define OCD_Barrier(com) -# define WITH_OCD if (false) -# define WITH_ROOT if (atrip::Atrip::rank == 0) -# define WITH_SPECIAL(r) if (atrip::Atrip::rank == r) -# define WITH_RANK std::cout << atrip::Atrip::rank << ": " -# define WITH_CRAZY_DEBUG if (false) -# define WITH_DBG -# define DBG(...) dbg(__VA_ARGS__) -#elif TRIPLES_DEBUG == 1 -# define OCD_Barrier(com) -# define WITH_OCD if (false) -# define WITH_ROOT if (false) -# define WITH_SPECIAL(r) if (false) -# define WITH_RANK if (false) std::cout << atrip::Atrip::rank << ": " -# define WITH_DBG if (false) -# define WITH_CRAZY_DEBUG if (false) -# define DBG(...) -#else -# error("TRIPLES_DEBUG is not defined!") -#endif - -#include #include - -#+end_src - - -** Todo :noexport: - #+begin_src c++ :tangle src/main.hpp -#include -#include -#include -#include -#include -#include -#include -#include - -using namespace cc4s; - -#include -#include -#include -#include -#include -#include -#include -#include -#include - namespace atrip { - extern "C" { - void dgemm_( - const char *transa, - const char *transb, - const int *m, - const int *n, - const int *k, - double *alpha, - const double *A, - const int *lda, - const double *B, - const int *ldb, - double *beta, - double *C, - const int *ldc - ); - } -} + struct RankMap { + + std::vector const lengths; + size_t const np, size; + + RankMap(std::vector lens, size_t np_) + : lengths(lens) + , np(np_) + , size(std::accumulate(lengths.begin(), lengths.end(), + 1, std::multiplies())) + { assert(lengths.size() <= 2); } + + size_t find(Slice::Location const& p) const noexcept { + return p.source * np + p.rank; + } + + size_t nSources() const noexcept { + return size / np + size_t(size % np != 0); + } + bool isPaddingRank(size_t rank) const noexcept { + return size % np == 0 + ? false + : rank > (size % np - 1) + ; + } -namespace atrip { + bool isSourcePadding(size_t rank, size_t source) const noexcept { + return source == nSources() && isPaddingRank(rank); + } + Slice::Location + find(ABCTuple const& abc, Slice::Type sliceType) const noexcept { + // tuple = {11, 8} when abc = {11, 8, 9} and sliceType = AB + const auto tuple = Slice::subtupleBySlice(abc, sliceType); + + const size_t index + = tuple[0] + + tuple[1] * (lengths.size() > 1 ? lengths[0] : 0) + ; + + return + { index % np + , index / np + }; + } + + }; } - - -ALGORITHM_REGISTRAR_DEFINITION(PerturbativeTriplesAbcijk) - - - -// SLICE MAIN DEFININITION =============================================={{{1 #+end_src +** The slice union +#+begin_src c++ :tangle (atrip-slice-union-h) +#pragma once +#include -#+begin_src c++ :tangle src/main.hpp - - // RANKMAP =============================================================={{{1 - -struct RankMap { - - std::vector const lengths; - size_t const np, size; - - RankMap(std::vector lens, size_t np_) - : lengths(lens) - , np(np_) - , size(std::accumulate(lengths.begin(), lengths.end(), - 1, std::multiplies())) - { assert(lengths.size() <= 2); } - - size_t find(Slice::Location const& p) const noexcept { - return p.source * np + p.rank; - } - - size_t nSources() const noexcept { - return size / np + size_t(size % np != 0); - } - - - bool isPaddingRank(size_t rank) const noexcept { - return size % np == 0 - ? false - : rank > (size % np - 1) - ; - } - - bool isSourcePadding(size_t rank, size_t source) const noexcept { - return source == nSources() && isPaddingRank(rank); - } - - Slice::Location - find(ABCTuple const& abc, Slice::Type sliceType) const noexcept { - // tuple = {11, 8} when abc = {11, 8, 9} and sliceType = AB - const auto tuple = Slice::subtupleBySlice(abc, sliceType); - - const size_t index - = tuple[0] - + tuple[1] * (lengths.size() > 1 ? lengths[0] : 0) - ; - - return - { index % np - , index / np - }; - } - -}; - - - - // SLICEUNION ==========================================================={{{1 +namespace atrip { struct SliceUnion { using F = double; @@ -1237,7 +1120,104 @@ struct RankMap { }; + SliceUnion& + unionByName(std::vector const& unions, Slice::Name name) { + const auto sliceUnionIt + = std::find_if(unions.begin(), unions.end(), + [&name](SliceUnion const* s) { + return name == s->name; + }); + if (sliceUnionIt == unions.end()) + throw std::domain_error("SliceUnion not found!"); + return **sliceUnionIt; + } +} +#+end_src + +** Tuples +#+begin_src c++ :tangle (atrip-tuples-h) +#pragma once + +#include +#include + +#include + +namespace atrip { + + using ABCTuple = std::array; + using PartialTuple = std::array; + using ABCTuples = std::vector; + + ABCTuples getTuplesList(size_t Nv) { + const size_t n = Nv * (Nv + 1) * (Nv + 2) / 6 - Nv; + ABCTuples result(n); + size_t u(0); + + for (size_t a(0); a < Nv; a++) + for (size_t b(a); b < Nv; b++) + for (size_t c(b); c < Nv; c++){ + if ( a == b && b == c ) continue; + result[u++] = {a, b, c}; + } + + return result; + + } + + + std::pair + getABCRange(size_t np, size_t rank, ABCTuples const& tuplesList) { + + std::vector n_tuples_per_rank(np, tuplesList.size()/np); + const size_t + // how many valid tuples should we still verteilen to nodes + // since the number of tuples is not divisible by the number of nodes + nRoundRobin = tuplesList.size() % np + // every node must have the sanme amount of tuples in order for the + // other nodes to receive and send somewhere, therefore + // some nodes will get extra tuples but that are dummy tuples + , nExtraInvalid = (np - nRoundRobin) % np + ; + + if (nRoundRobin) for (int i = 0; i < np; i++) n_tuples_per_rank[i]++; + + #if defined(TODO) + assert( tuplesList.size() + == + ( std::accumulate(n_tuples_per_rank.begin(), + n_tuples_per_rank.end(), + 0) + + nExtraInvalid + )); + #endif + + WITH_RANK << "nRoundRobin = " << nRoundRobin << "\n"; + WITH_RANK << "nExtraInvalid = " << nExtraInvalid << "\n"; + WITH_RANK << "ntuples = " << n_tuples_per_rank[rank] << "\n"; + + auto const& it = n_tuples_per_rank.begin(); + + return + { std::accumulate(it, it + rank , 0) + , std::accumulate(it, it + rank + 1, 0) + }; + + } + +} +#+end_src + +** Unions +Since every tensor slice in a different way, we can override the slicing procedure +and define subclasses of slice unions. + +#+begin_src c++ :tangle (atrip-unions-h) +#pragma once +#include + +namespace atrip { void sliceIntoVector ( std::vector &v @@ -1471,444 +1451,1087 @@ struct RankMap { }; - // PHYSICS ENERGY, SINGLES ... =========================================={{{1 - -double cc4s::PerturbativeTriplesAbcijk::getEnergyDistinct - ( const double epsabc - , std::vector const& epsi - , std::vector const& Tijk_ - , std::vector const& Zijk_ - ) { - constexpr size_t blockSize=16; - double energy(0.); - const size_t No = epsi.size(); - for (size_t kk=0; kk k ? kk : k; - for (size_t j(jstart); j < jend; j++){ - const double ej(epsi[j]); - double facjk( j == k ? 0.5 : 1.0); - size_t istart = ii > j ? ii : j; - for (size_t i(istart); i < iend; i++){ - const double ei(epsi[i]); - double facij ( i==j ? 0.5 : 1.0); - double denominator(epsabc - ei - ej - ek); - double U(Zijk_[i + No*j + No*No*k]); - double V(Zijk_[i + No*k + No*No*j]); - double W(Zijk_[j + No*i + No*No*k]); - double X(Zijk_[j + No*k + No*No*i]); - double Y(Zijk_[k + No*i + No*No*j]); - double Z(Zijk_[k + No*j + No*No*i]); - - double A(Tijk_[i + No*j + No*No*k]); - double B(Tijk_[i + No*k + No*No*j]); - double C(Tijk_[j + No*i + No*No*k]); - double D(Tijk_[j + No*k + No*No*i]); - double E(Tijk_[k + No*i + No*No*j]); - double F(Tijk_[k + No*j + No*No*i]); - double value(3.0*(A*U+B*V+C*W+D*X+E*Y+F*Z) - +((U+X+Y)-2.0*(V+W+Z))*(A+D+E) - +((V+W+Z)-2.0*(U+X+Y))*(B+C+F)); - energy += 2.0*value / denominator * facjk * facij; - } // i - } // j - } // k - } // ii - } // jj - } // kk - return energy; } +#+end_src -double cc4s::PerturbativeTriplesAbcijk::getEnergySame - ( const double epsabc - , std::vector const& epsi - , std::vector const& Tijk_ - , std::vector const& Zijk_ - ) { - constexpr size_t blockSize = 16; - const size_t No = epsi.size(); - double energy(0.); - for (size_t kk=0; kk k ? jj : k; - for(size_t j(jstart); j < jend; j++){ - const double facjk( j == k ? 0.5 : 1.0); - const double ej(epsi[j]); - const size_t istart = ii > j ? ii : j; - for(size_t i(istart); i < iend; i++){ - double ei(epsi[i]); - double facij ( i==j ? 0.5 : 1.0); - double denominator(epsabc - ei - ej - ek); - double U(Zijk_[i + No*j + No*No*k]); - double V(Zijk_[j + No*k + No*No*i]); - double W(Zijk_[k + No*i + No*No*j]); - double A(Tijk_[i + No*j + No*No*k]); - double B(Tijk_[j + No*k + No*No*i]); - double C(Tijk_[k + No*i + No*No*j]); - double value(3.0*( A*U + B*V + C*W) - (A+B+C)*(U+V+W)); - energy += 2.0*value / denominator * facjk * facij; - } // i - } // j - } // k - } // ii - } // jj - } // kk - return energy; -} +** Equations +#+begin_src c++ :tangle (atrip-equations-h) +#pragma once -void cc4s::PerturbativeTriplesAbcijk::singlesContribution - ( const ABCTuple &abc - , double const* Tph - , double const* VABij - , double const* VACij - , double const* VBCij - , double *Zijk - ) { - const size_t a(abc[0]), b(abc[1]), c(abc[2]); - for (size_t k=0; k < No; k++) - for (size_t i=0; i < No; i++) - for (size_t j=0; j < No; j++) { - const size_t ijk = i + j*No + k*No*No - , jk = j + No * k - ; - Zijk[ijk] += Tph[ a + i * Nv ] * VBCij[ j + k * No ]; - Zijk[ijk] += Tph[ b + j * Nv ] * VACij[ i + k * No ]; - Zijk[ijk] += Tph[ c + k * Nv ] * VABij[ i + j * No ]; +#include +#include + +namespace atrip { + + double getEnergyDistinct + ( const double epsabc + , std::vector const& epsi + , std::vector const& Tijk_ + , std::vector const& Zijk_ + ) { + constexpr size_t blockSize=16; + double energy(0.); + const size_t No = epsi.size(); + for (size_t kk=0; kk k ? kk : k; + for (size_t j(jstart); j < jend; j++){ + const double ej(epsi[j]); + double facjk( j == k ? 0.5 : 1.0); + size_t istart = ii > j ? ii : j; + for (size_t i(istart); i < iend; i++){ + const double ei(epsi[i]); + double facij ( i==j ? 0.5 : 1.0); + double denominator(epsabc - ei - ej - ek); + double U(Zijk_[i + No*j + No*No*k]); + double V(Zijk_[i + No*k + No*No*j]); + double W(Zijk_[j + No*i + No*No*k]); + double X(Zijk_[j + No*k + No*No*i]); + double Y(Zijk_[k + No*i + No*No*j]); + double Z(Zijk_[k + No*j + No*No*i]); + + double A(Tijk_[i + No*j + No*No*k]); + double B(Tijk_[i + No*k + No*No*j]); + double C(Tijk_[j + No*i + No*No*k]); + double D(Tijk_[j + No*k + No*No*i]); + double E(Tijk_[k + No*i + No*No*j]); + double F(Tijk_[k + No*j + No*No*i]); + double value(3.0*(A*U+B*V+C*W+D*X+E*Y+F*Z) + +((U+X+Y)-2.0*(V+W+Z))*(A+D+E) + +((V+W+Z)-2.0*(U+X+Y))*(B+C+F)); + energy += 2.0*value / denominator * facjk * facij; + } // i + } // j + } // k + } // ii + } // jj + } // kk + return energy; } -} -void doublesContribution - ( const ABCTuple &abc - , size_t const No - , size_t const Nv - // -- VABCI - , double const* VABph - , double const* VACph - , double const* VBCph - , double const* VBAph - , double const* VCAph - , double const* VCBph - // -- VHHHA - , double const* VhhhA - , double const* VhhhB - , double const* VhhhC - // -- TA - , double const* TAphh - , double const* TBphh - , double const* TCphh - // -- TABIJ - , double const* TABhh - , double const* TAChh - , double const* TBChh - // -- TIJK - , double *Tijk - , atrip::Timings& chrono - ) { - auto& t_reorder = chrono["doubles:reorder"]; - const size_t a = abc[0], b = abc[1], c = abc[2] - , NoNo = No*No, NoNv = No*Nv - ; - -#if defined(TRIPLES_USE_DGEMM) -#define _IJK_(i, j, k) i + j*No + k*NoNo -#define REORDER(__II, __JJ, __KK) \ - t_reorder.start(); \ - for (size_t k = 0; k < No; k++) \ - for (size_t j = 0; j < No; j++) \ - for (size_t i = 0; i < No; i++) { \ - Tijk[_IJK_(i, j, k)] += _t_buffer[_IJK_(__II, __JJ, __KK)]; \ - } \ - t_reorder.stop(); -#define DGEMM_PARTICLES(__A, __B) \ - atrip::dgemm_( "T" \ - , "N" \ - , (int const*)&NoNo \ - , (int const*)&No \ - , (int const*)&Nv \ - , &one \ - , __A \ - , (int const*)&Nv \ - , __B \ - , (int const*)&Nv \ - , &zero \ - , _t_buffer.data() \ - , (int const*)&NoNo \ - ); -#define DGEMM_HOLES(__A, __B, __TRANSB) \ - atrip::dgemm_( "N" \ - , __TRANSB \ - , (int const*)&NoNo \ - , (int const*)&No \ - , (int const*)&No \ - , &m_one \ - , __A \ - , (int const*)&NoNo \ - , __B \ - , (int const*)&No \ - , &zero \ - , _t_buffer.data() \ - , (int const*)&NoNo \ - ); - - using F = double; - const size_t NoNoNo = No*NoNo; - std::vector _t_buffer; - _t_buffer.reserve(NoNoNo); - F one{1.0}, m_one{-1.0}, zero{0.0}; - - t_reorder.start(); - for (size_t k = 0; k < NoNoNo; k++) { - // zero the Tijk - Tijk[k] = 0.0; + double getEnergySame + ( const double epsabc + , std::vector const& epsi + , std::vector const& Tijk_ + , std::vector const& Zijk_ + ) { + constexpr size_t blockSize = 16; + const size_t No = epsi.size(); + double energy(0.); + for (size_t kk=0; kk k ? jj : k; + for(size_t j(jstart); j < jend; j++){ + const double facjk( j == k ? 0.5 : 1.0); + const double ej(epsi[j]); + const size_t istart = ii > j ? ii : j; + for(size_t i(istart); i < iend; i++){ + double ei(epsi[i]); + double facij ( i==j ? 0.5 : 1.0); + double denominator(epsabc - ei - ej - ek); + double U(Zijk_[i + No*j + No*No*k]); + double V(Zijk_[j + No*k + No*No*i]); + double W(Zijk_[k + No*i + No*No*j]); + double A(Tijk_[i + No*j + No*No*k]); + double B(Tijk_[j + No*k + No*No*i]); + double C(Tijk_[k + No*i + No*No*j]); + double value(3.0*( A*U + B*V + C*W) - (A+B+C)*(U+V+W)); + energy += 2.0*value / denominator * facjk * facij; + } // i + } // j + } // k + } // ii + } // jj + } // kk + return energy; } - t_reorder.stop(); - chrono["doubles:holes"].start(); - { // Holes part ============================================================ - // VhhhC[i + k*No + L*NoNo] * TABhh[L + j*No]; H1 - chrono["doubles:holes:1"].start(); - DGEMM_HOLES(VhhhC, TABhh, "N") - REORDER(i, k, j) - chrono["doubles:holes:1"].stop(); - // VhhhC[j + k*No + L*NoNo] * TABhh[i + L*No]; H0 - chrono["doubles:holes:2"].start(); - DGEMM_HOLES(VhhhC, TABhh, "T") - REORDER(j, k, i) - chrono["doubles:holes:2"].stop(); - // VhhhB[i + j*No + L*NoNo] * TAChh[L + k*No]; H5 - chrono["doubles:holes:3"].start(); - DGEMM_HOLES(VhhhB, TAChh, "N") - REORDER(i, j, k) - chrono["doubles:holes:3"].stop(); - // VhhhB[k + j*No + L*NoNo] * TAChh[i + L*No]; H3 - chrono["doubles:holes:4"].start(); - DGEMM_HOLES(VhhhB, TAChh, "T") - REORDER(k, j, i) - chrono["doubles:holes:4"].stop(); - // VhhhA[j + i*No + L*NoNo] * TBChh[L + k*No]; H1 - chrono["doubles:holes:5"].start(); - DGEMM_HOLES(VhhhA, TBChh, "N") - REORDER(j, i, k) - chrono["doubles:holes:5"].stop(); - // VhhhA[k + i*No + L*NoNo] * TBChh[j + L*No]; H4 - chrono["doubles:holes:6"].start(); - DGEMM_HOLES(VhhhA, TBChh, "T") - REORDER(k, i, j) - chrono["doubles:holes:6"].stop(); - } - chrono["doubles:holes"].stop(); - - chrono["doubles:particles"].start(); - { // Particle part ========================================================= - // TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv]; P0 - chrono["doubles:particles:1"].start(); - DGEMM_PARTICLES(TAphh, VBCph) - REORDER(i, j, k) - chrono["doubles:particles:1"].stop(); - // TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv]; P3 - chrono["doubles:particles:2"].start(); - DGEMM_PARTICLES(TAphh, VCBph) - REORDER(i, k, j) - chrono["doubles:particles:2"].stop(); - // TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv]; P5 - chrono["doubles:particles:3"].start(); - DGEMM_PARTICLES(TCphh, VABph) - REORDER(k, i, j) - chrono["doubles:particles:3"].stop(); - // TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv]; P2 - chrono["doubles:particles:4"].start(); - DGEMM_PARTICLES(TCphh, VBAph) - REORDER(k, j, i) - chrono["doubles:particles:4"].stop(); - // TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv]; P1 - chrono["doubles:particles:5"].start(); - DGEMM_PARTICLES(TBphh, VACph) - REORDER(j, i, k) - chrono["doubles:particles:5"].stop(); - // TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv]; P4 - chrono["doubles:particles:6"].start(); - DGEMM_PARTICLES(TBphh, VCAph) - REORDER(j, k, i) - chrono["doubles:particles:6"].stop(); - } - chrono["doubles:particles"].stop(); - -#undef REORDER -#undef DGEMM_HOLES -#undef DGEMM_PARTICLES -#undef _IJK_ -#else - for (size_t k = 0; k < No; k++) - for (size_t j = 0; j < No; j++) - for (size_t i = 0; i < No; i++){ - const size_t ijk = i + j*No + k*NoNo - , jk = j + k*No - ; - Tijk[ijk] = 0.0; // :important - // HOLE DIAGRAMS: TABHH and VHHHA - for (size_t L = 0; L < No; L++){ - // t[abLj] * V[Lcik] H1 - // t[baLi] * V[Lcjk] H0 TODO: conjugate T for complex - Tijk[ijk] -= TABhh[L + j*No] * VhhhC[i + k*No + L*NoNo]; - Tijk[ijk] -= TABhh[i + L*No] * VhhhC[j + k*No + L*NoNo]; - - // t[acLk] * V[Lbij] H5 - // t[caLi] * V[Lbkj] H3 - Tijk[ijk] -= TAChh[L + k*No] * VhhhB[i + j*No + L*NoNo]; - Tijk[ijk] -= TAChh[i + L*No] * VhhhB[k + j*No + L*NoNo]; - - // t[bcLk] * V[Laji] H2 - // t[cbLj] * V[Laki] H4 - Tijk[ijk] -= TBChh[L + k*No] * VhhhA[j + i*No + L*NoNo]; - Tijk[ijk] -= TBChh[j + L*No] * VhhhA[k + i*No + L*NoNo]; + void singlesContribution + ( const ABCTuple &abc + , double const* Tph + , double const* VABij + , double const* VACij + , double const* VBCij + , double *Zijk + ) { + const size_t a(abc[0]), b(abc[1]), c(abc[2]); + for (size_t k=0; k < No; k++) + for (size_t i=0; i < No; i++) + for (size_t j=0; j < No; j++) { + const size_t ijk = i + j*No + k*No*No + , jk = j + No * k + ; + Zijk[ijk] += Tph[ a + i * Nv ] * VBCij[ j + k * No ]; + Zijk[ijk] += Tph[ b + j * Nv ] * VACij[ i + k * No ]; + Zijk[ijk] += Tph[ c + k * Nv ] * VABij[ i + j * No ]; } - // PARTILCE DIAGRAMS: TAPHH and VABPH - for (size_t E = 0; E < Nv; E++) { - // t[aEij] * V[bcEk] P0 - // t[aEik] * V[cbEj] P3 // TODO: CHECK THIS ONE, I DONT KNOW - Tijk[ijk] += TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv]; - Tijk[ijk] += TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv]; + } - // t[cEki] * V[abEj] P5 - // t[cEkj] * V[baEi] P2 - Tijk[ijk] += TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv]; - Tijk[ijk] += TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv]; + void doublesContribution + ( const ABCTuple &abc + , size_t const No + , size_t const Nv + // -- VABCI + , double const* VABph + , double const* VACph + , double const* VBCph + , double const* VBAph + , double const* VCAph + , double const* VCBph + // -- VHHHA + , double const* VhhhA + , double const* VhhhB + , double const* VhhhC + // -- TA + , double const* TAphh + , double const* TBphh + , double const* TCphh + // -- TABIJ + , double const* TABhh + , double const* TAChh + , double const* TBChh + // -- TIJK + , double *Tijk + , atrip::Timings& chrono + ) { - // t[bEji] * V[acEk] P1 - // t[bEjk] * V[caEi] P4 - Tijk[ijk] += TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv]; - Tijk[ijk] += TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv]; + auto& t_reorder = chrono["doubles:reorder"]; + const size_t a = abc[0], b = abc[1], c = abc[2] + , NoNo = No*No, NoNv = No*Nv + ; + + #if defined(TRIPLES_USE_DGEMM) + #define _IJK_(i, j, k) i + j*No + k*NoNo + #define REORDER(__II, __JJ, __KK) \ + t_reorder.start(); \ + for (size_t k = 0; k < No; k++) \ + for (size_t j = 0; j < No; j++) \ + for (size_t i = 0; i < No; i++) { \ + Tijk[_IJK_(i, j, k)] += _t_buffer[_IJK_(__II, __JJ, __KK)]; \ + } \ + t_reorder.stop(); + #define DGEMM_PARTICLES(__A, __B) \ + atrip::dgemm_( "T" \ + , "N" \ + , (int const*)&NoNo \ + , (int const*)&No \ + , (int const*)&Nv \ + , &one \ + , __A \ + , (int const*)&Nv \ + , __B \ + , (int const*)&Nv \ + , &zero \ + , _t_buffer.data() \ + , (int const*)&NoNo \ + ); + #define DGEMM_HOLES(__A, __B, __TRANSB) \ + atrip::dgemm_( "N" \ + , __TRANSB \ + , (int const*)&NoNo \ + , (int const*)&No \ + , (int const*)&No \ + , &m_one \ + , __A \ + , (int const*)&NoNo \ + , __B \ + , (int const*)&No \ + , &zero \ + , _t_buffer.data() \ + , (int const*)&NoNo \ + ); + + using F = double; + const size_t NoNoNo = No*NoNo; + std::vector _t_buffer; + _t_buffer.reserve(NoNoNo); + F one{1.0}, m_one{-1.0}, zero{0.0}; + + t_reorder.start(); + for (size_t k = 0; k < NoNoNo; k++) { + // zero the Tijk + Tijk[k] = 0.0; + } + t_reorder.stop(); + + chrono["doubles:holes"].start(); + { // Holes part ============================================================ + // VhhhC[i + k*No + L*NoNo] * TABhh[L + j*No]; H1 + chrono["doubles:holes:1"].start(); + DGEMM_HOLES(VhhhC, TABhh, "N") + REORDER(i, k, j) + chrono["doubles:holes:1"].stop(); + // VhhhC[j + k*No + L*NoNo] * TABhh[i + L*No]; H0 + chrono["doubles:holes:2"].start(); + DGEMM_HOLES(VhhhC, TABhh, "T") + REORDER(j, k, i) + chrono["doubles:holes:2"].stop(); + // VhhhB[i + j*No + L*NoNo] * TAChh[L + k*No]; H5 + chrono["doubles:holes:3"].start(); + DGEMM_HOLES(VhhhB, TAChh, "N") + REORDER(i, j, k) + chrono["doubles:holes:3"].stop(); + // VhhhB[k + j*No + L*NoNo] * TAChh[i + L*No]; H3 + chrono["doubles:holes:4"].start(); + DGEMM_HOLES(VhhhB, TAChh, "T") + REORDER(k, j, i) + chrono["doubles:holes:4"].stop(); + // VhhhA[j + i*No + L*NoNo] * TBChh[L + k*No]; H1 + chrono["doubles:holes:5"].start(); + DGEMM_HOLES(VhhhA, TBChh, "N") + REORDER(j, i, k) + chrono["doubles:holes:5"].stop(); + // VhhhA[k + i*No + L*NoNo] * TBChh[j + L*No]; H4 + chrono["doubles:holes:6"].start(); + DGEMM_HOLES(VhhhA, TBChh, "T") + REORDER(k, i, j) + chrono["doubles:holes:6"].stop(); + } + chrono["doubles:holes"].stop(); + + chrono["doubles:particles"].start(); + { // Particle part ========================================================= + // TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv]; P0 + chrono["doubles:particles:1"].start(); + DGEMM_PARTICLES(TAphh, VBCph) + REORDER(i, j, k) + chrono["doubles:particles:1"].stop(); + // TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv]; P3 + chrono["doubles:particles:2"].start(); + DGEMM_PARTICLES(TAphh, VCBph) + REORDER(i, k, j) + chrono["doubles:particles:2"].stop(); + // TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv]; P5 + chrono["doubles:particles:3"].start(); + DGEMM_PARTICLES(TCphh, VABph) + REORDER(k, i, j) + chrono["doubles:particles:3"].stop(); + // TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv]; P2 + chrono["doubles:particles:4"].start(); + DGEMM_PARTICLES(TCphh, VBAph) + REORDER(k, j, i) + chrono["doubles:particles:4"].stop(); + // TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv]; P1 + chrono["doubles:particles:5"].start(); + DGEMM_PARTICLES(TBphh, VACph) + REORDER(j, i, k) + chrono["doubles:particles:5"].stop(); + // TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv]; P4 + chrono["doubles:particles:6"].start(); + DGEMM_PARTICLES(TBphh, VCAph) + REORDER(j, k, i) + chrono["doubles:particles:6"].stop(); + } + chrono["doubles:particles"].stop(); + + #undef REORDER + #undef DGEMM_HOLES + #undef DGEMM_PARTICLES + #undef _IJK_ + #else + for (size_t k = 0; k < No; k++) + for (size_t j = 0; j < No; j++) + for (size_t i = 0; i < No; i++){ + const size_t ijk = i + j*No + k*NoNo + , jk = j + k*No + ; + Tijk[ijk] = 0.0; // :important + // HOLE DIAGRAMS: TABHH and VHHHA + for (size_t L = 0; L < No; L++){ + // t[abLj] * V[Lcik] H1 + // t[baLi] * V[Lcjk] H0 TODO: conjugate T for complex + Tijk[ijk] -= TABhh[L + j*No] * VhhhC[i + k*No + L*NoNo]; + Tijk[ijk] -= TABhh[i + L*No] * VhhhC[j + k*No + L*NoNo]; + + // t[acLk] * V[Lbij] H5 + // t[caLi] * V[Lbkj] H3 + Tijk[ijk] -= TAChh[L + k*No] * VhhhB[i + j*No + L*NoNo]; + Tijk[ijk] -= TAChh[i + L*No] * VhhhB[k + j*No + L*NoNo]; + + // t[bcLk] * V[Laji] H2 + // t[cbLj] * V[Laki] H4 + Tijk[ijk] -= TBChh[L + k*No] * VhhhA[j + i*No + L*NoNo]; + Tijk[ijk] -= TBChh[j + L*No] * VhhhA[k + i*No + L*NoNo]; + } + // PARTILCE DIAGRAMS: TAPHH and VABPH + for (size_t E = 0; E < Nv; E++) { + // t[aEij] * V[bcEk] P0 + // t[aEik] * V[cbEj] P3 // TODO: CHECK THIS ONE, I DONT KNOW + Tijk[ijk] += TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv]; + Tijk[ijk] += TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv]; + + // t[cEki] * V[abEj] P5 + // t[cEkj] * V[baEi] P2 + Tijk[ijk] += TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv]; + Tijk[ijk] += TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv]; + + // t[bEji] * V[acEk] P1 + // t[bEjk] * V[caEi] P4 + Tijk[ijk] += TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv]; + Tijk[ijk] += TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv]; + } + + } + #endif + } + +} +#+end_src + +** Blas +The main matrix-matrix multiplication method used in this algorithm +is mainly using the =DGEMM= function, which we declare as +=extern= since it should be known only at link-time. +#+begin_src c++ :tangle (atrip-blas-h) +#pragma once +namespace atrip { + extern "C" { + void dgemm_( + const char *transa, + const char *transb, + const int *m, + const int *n, + const int *k, + double *alpha, + const double *A, + const int *lda, + const double *B, + const int *ldb, + double *beta, + double *C, + const int *ldc + ); + } +} +#+end_src + +** Atrip +#+begin_src c++ :tangle (atrip-atrip-h) +#pragma once +#include +#include +#include +#include + +#include + +namespace atrip { + + struct Atrip { + + static int rank; + static int np; + + static void init() { + MPI_Comm_rank(MPI_COMM_WORLD, &Atrip::rank); + MPI_Comm_size(MPI_COMM_WORLD, &Atrip::np); } - } -#endif -} - - -// HELPER FUNCTIONS ====================================================={{{1 - -using ABCTuples = std::vector; - -ABCTuples getTuplesList(size_t Nv) { - const size_t n = Nv * (Nv + 1) * (Nv + 2) / 6 - Nv; - ABCTuples result(n); - size_t u(0); - - for (size_t a(0); a < Nv; a++) - for (size_t b(a); b < Nv; b++) - for (size_t c(b); c < Nv; c++){ - if ( a == b && b == c ) continue; - result[u++] = {a, b, c}; - } - - return result; - -} - - -std::pair -getABCRange(size_t np, size_t rank, ABCTuples const& tuplesList) { - - std::vector n_tuples_per_rank(np, tuplesList.size()/np); - const size_t - // how many valid tuples should we still verteilen to nodes - // since the number of tuples is not divisible by the number of nodes - nRoundRobin = tuplesList.size() % np - // every node must have the sanme amount of tuples in order for the - // other nodes to receive and send somewhere, therefore - // some nodes will get extra tuples but that are dummy tuples - , nExtraInvalid = (np - nRoundRobin) % np - ; - - if (nRoundRobin) for (int i = 0; i < np; i++) n_tuples_per_rank[i]++; - -#if defined(TODO) - assert( tuplesList.size() - == - ( std::accumulate(n_tuples_per_rank.begin(), - n_tuples_per_rank.end(), - 0) - + nExtraInvalid - )); -#endif - - WITH_RANK << "nRoundRobin = " << nRoundRobin << "\n"; - WITH_RANK << "nExtraInvalid = " << nExtraInvalid << "\n"; - WITH_RANK << "ntuples = " << n_tuples_per_rank[rank] << "\n"; - - auto const& it = n_tuples_per_rank.begin(); - - return - { std::accumulate(it, it + rank , 0) - , std::accumulate(it, it + rank + 1, 0) + struct Input { + CTF::Tensor *ei = nullptr + , *ea = nullptr + , *Tph = nullptr + , *Tpphh = nullptr + , *Vpphh = nullptr + , *Vhhhp = nullptr + , *Vppph = nullptr + ; + Input& with_epsilon_i(CTF::Tensor * t) { ei = t; return *this; } + Input& with_epsilon_a(CTF::Tensor * t) { ea = t; return *this; } + Input& with_Tai(CTF::Tensor * t) { Tph = t; return *this; } + Input& with_Tabij(CTF::Tensor * t) { Tpphh = t; return *this; } + Input& with_Vabij(CTF::Tensor * t) { Vpphh = t; return *this; } + Input& with_Vijka(CTF::Tensor * t) { Vhhhp = t; return *this; } + Input& with_Vabci(CTF::Tensor * t) { Vppph = t; return *this; } }; + struct Output { + double energy; + }; + static Output run(Input const& in); + }; + int Atrip::rank; + int Atrip::np; + } +#+end_src + +#+begin_src c++ :tangle (atrip-atrip-cxx) +#include +#include + +using namespace atrip; -SliceUnion& -unionByName(std::vector const& unions, Slice::Name name) { - const auto sliceUnionIt - = std::find_if(unions.begin(), unions.end(), - [&name](SliceUnion const* s) { - return name == s->name; - }); - if (sliceUnionIt == unions.end()) - throw std::domain_error("SliceUnion not found!"); - return **sliceUnionIt; -} +Output Atrip::run(Input const& in){ -#if defined(TRIPLES_WORKLOAD_DUMP) - struct WorkloadEntry { - size_t rank, iteration; - ABCTuple abc; - size_t name; - size_t send; - size_t recv; - size_t doubles; - size_t barrier; - size_t allgather; - size_t unwrap; + const int np = Atrip::np; + const int rank = Atrip::rank; + MPI_Comm universe = in.ei->wrld->comm; - static - size_t findRecv(SliceUnion const& u) { - return std::count_if(u.slices.begin(), u.slices.end(), - [&u](Slice const& slice) { - return slice.info.state == Slice::Dispatched; - }); - } - static - size_t findSend( Slice::Database const& db - , SliceUnion const& u - , size_t rank) { - using Element = Slice::LocalDatabaseElement; - return std::count_if(db.begin(), db.end(), - [&u, rank](Element const& el) { - return el.name == u.name - && el.info.from.rank == rank - // this should work since the database - // has the non-updated Slice::Info - && el.info.state == Slice::Fetch - ; - }); - } - }; + // Timings in seconds ================================================{{{1 + Timings chrono{}; + + // Get the distributed ctf tensor data + CTF::Tensor<> *ei(getTensorArgument("HoleEigenEnergies")) + , *ea(getTensorArgument("ParticleEigenEnergies")) + , *Tph(getTensorArgument("CcsdSinglesAmplitudes")) + , *Tpphh(getTensorArgument("CcsdDoublesAmplitudes")) + , *Vpphh(getTensorArgument("PPHHCoulombIntegrals")) + , *Vhhhp(getTensorArgument("HHHPCoulombIntegrals")) + , *Vppph(getTensorArgument("PPPHCoulombIntegrals")) + ; + + No = ei->lens[0]; + Nv = ea->lens[0]; + LOG(0,"NEW_TRIPLES") << "No: " << No << "\n"; + LOG(0,"NEW_TRIPLES") << "Nv: " << Nv << "\n"; + + // allocate the three scratches, see piecuch + std::vector Tijk(No*No*No) // doubles only (see piecuch) + , Zijk(No*No*No) // singles + doubles (see piecuch) + // we need local copies of the following tensors on every + // rank + , epsi(No) + , epsa(Nv) + , Tai(No * Nv) + ; + + ei->read_all(epsi.data()); + ea->read_all(epsa.data()); + Tph->read_all(Tai.data()); + + // COMMUNICATOR CONSTRUCTION ========================================={{{1 + // + // Construct a new communicator living only on a single rank + int child_size = 1 + , child_rank + ; + const + int color = rank / child_size + , crank = rank % child_size + ; + MPI_Comm child_comm; + if (np == 1) { + child_comm = universe; + } else { + MPI_Comm_split(cc4s::Cc4s::world->comm, color, crank, &child_comm); + MPI_Comm_rank(child_comm, &child_rank); + MPI_Comm_size(child_comm, &child_size); + //CTF::World child_world(child_comm); + } + + + chrono["nv-slices"].start(); + // BUILD SLICES PARAMETRIZED BY NV ==================================={{{1 + LOG(0,"NEW_TRIPLES") << "BUILD NV-SLICES\n"; + TAPHH taphh(*Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); + HHHA hhha(*Vhhhp, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); + chrono["nv-slices"].stop(); + + chrono["nv-nv-slices"].start(); + // BUILD SLICES PARAMETRIZED BY NV x NV =============================={{{1 + LOG(0,"NEW_TRIPLES") << "BUILD NV x NV-SLICES\n"; + ABPH abph(*Vppph, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); + ABHH abhh(*Vpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); + TABHH tabhh(*Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe); + chrono["nv-nv-slices"].stop(); + + // all tensors + std::vector< SliceUnion* > unions = {&taphh, &hhha, &abph, &abhh, &tabhh}; + + //CONSTRUCT TUPLE LIST ==============================================={{{1 + LOG(0,"NEW_TRIPLES") << "BUILD TUPLE LIST\n"; + const auto tuplesList = std::move(getTuplesList(Nv)); + WITH_RANK << "tupList.size() = " << tuplesList.size() << "\n"; + + // GET ABC INDEX RANGE FOR RANK ======================================{{{1 + auto abcIndex = getABCRange(np, rank, tuplesList); + size_t nIterations = abcIndex.second - abcIndex.first; + +#ifdef TRIPLES_BENCHMARK + { const size_t maxIterations = getIntegerArgument("maxIterations", 0); + if (maxIterations != 0) { + abcIndex.second = abcIndex.first + maxIterations % (nIterations + 1); + nIterations = maxIterations % (nIterations + 1); + } + } #endif + WITH_RANK << "abcIndex = " << pretty_print(abcIndex) << "\n"; + LOG(0,"NEW_TRIPLES") << "#iterations: " + << nIterations << "\n"; + + // first abc + const ABCTuple firstAbc = tuplesList[abcIndex.first]; + + + double energy(0.); + + + auto const isFakeTuple + = [&tuplesList](size_t const i) { return i >= tuplesList.size(); }; + + + auto communicateDatabase + = [ &unions + , np + , &chrono + ] (ABCTuple const& abc, MPI_Comm const& c) -> Slice::Database { + + chrono["db:comm:type:do"].start(); + auto MPI_LDB_ELEMENT = Slice::mpi::localDatabaseElement(); + chrono["db:comm:type:do"].stop(); + + chrono["db:comm:ldb"].start(); + Slice::LocalDatabase ldb; + + for (auto const& tensor: unions) { + auto const& tensorDb = tensor->buildLocalDatabase(abc); + ldb.insert(ldb.end(), tensorDb.begin(), tensorDb.end()); + } + chrono["db:comm:ldb"].stop(); + + Slice::Database db(np * ldb.size(), ldb[0]); + + chrono["oneshot-db:comm:allgather"].start(); + chrono["db:comm:allgather"].start(); + MPI_Allgather( ldb.data() + , ldb.size() + , MPI_LDB_ELEMENT + , db.data() + , ldb.size() + , MPI_LDB_ELEMENT + , c); + chrono["db:comm:allgather"].stop(); + chrono["oneshot-db:comm:allgather"].stop(); + + chrono["db:comm:type:free"].start(); + MPI_Type_free(&MPI_LDB_ELEMENT); + chrono["db:comm:type:free"].stop(); + + return db; + }; + + auto doIOPhase + = [&unions, &rank, &np, &universe, &chrono] (Slice::Database const& db) { + + const size_t localDBLength = db.size() / np; + + size_t sendTag = 0 + , recvTag = rank * localDBLength + ; + + // RECIEVE PHASE ====================================================== + { + // At this point, we have already send to everyone that fits + auto const& begin = &db[rank * localDBLength] + , end = begin + localDBLength + ; + for (auto it = begin; it != end; ++it) { + recvTag++; + auto const& el = *it; + auto& u = unionByName(unions, el.name); + + WITH_DBG std::cout + << rank << ":r" + << "♯" << recvTag << " =>" + << " «n" << el.name + << ", t" << el.info.type + << ", s" << el.info.state + << "»" + << " ⊙ {" << rank << "⇐" << el.info.from.rank + << ", " + << el.info.from.source << "}" + << " ∴ {" << el.info.tuple[0] + << ", " + << el.info.tuple[1] + << "}" + << "\n" + ; + + chrono["db:io:recv"].start(); + u.receive(el.info, recvTag); + chrono["db:io:recv"].stop(); + + } // recv + } + + // SEND PHASE ========================================================= + for (size_t otherRank = 0; otherRank" + << " «n" << el.name + << ", t" << el.info.type + << ", s" << el.info.state + << "»" + << " ⊙ {" << el.info.from.rank << "⇒" << otherRank + << ", " + << el.info.from.source << "}" + << " ∴ {" << el.info.tuple[0] + << ", " + << el.info.tuple[1] + << "}" + << "\n" + ; + + chrono["db:io:send"].start(); + u.send(otherRank, el.info, sendTag); + chrono["db:io:send"].stop(); + + } // send phase + + } // otherRank + + + }; + +#if defined(HAVE_OCD) || defined(TRIPLES_PRINT_TUPLES) + std::map tupleEnergies; +#endif + + const double doublesFlops + = double(No) + ,* double(No) + ,* double(No) + ,* (double(No) + double(Nv)) + ,* 2 + ,* 6 + / 1e9 + ; + + // START MAIN LOOP ======================================================{{{1 + + Slice::Database db; + + for ( size_t i = abcIndex.first, iteration = 1 + ; i < abcIndex.second + ; i++, iteration++ + ) { + chrono["iterations"].start(); + + // check overhead from chrono over all iterations + chrono["start:stop"].start(); chrono["start:stop"].stop(); + + // check overhead of doing a barrier at the beginning + chrono["oneshot-mpi:barrier"].start(); + chrono["mpi:barrier"].start(); + // TODO: REMOVE + if (getIntegerArgument("barrier", 1) == 1) + MPI_Barrier(universe); + chrono["mpi:barrier"].stop(); + chrono["oneshot-mpi:barrier"].stop(); + + if (iteration % getIntegerArgument("iterationMod", 100) == 0) { + LOG(0,"NEW_TRIPLES") + << "iteration " << iteration + << " [" << 100 * iteration / nIterations << "%]" + << " (" << doublesFlops * iteration / chrono["doubles"].count() + << "GF)" + << " (" << doublesFlops * iteration / chrono["iterations"].count() + << "GF)" + << " ===========================\n"; + + // PRINT TIMINGS + for (auto const& pair: chrono) + LOG(1, " ") << pair.first << " :: " + << pair.second.count() + << std::endl; + + } + + const ABCTuple abc = isFakeTuple(i) + ? tuplesList[tuplesList.size() - 1] + : tuplesList[i] + , *abcNext = i == (abcIndex.second - 1) + ? nullptr + : isFakeTuple(i + 1) + ? &tuplesList[tuplesList.size() - 1] + : &tuplesList[i + 1] + ; + + chrono["with_rank"].start(); + WITH_RANK << " :it " << iteration + << " :abc " << pretty_print(abc) + << " :abcN " + << (abcNext ? pretty_print(*abcNext) : "None") + << "\n"; + chrono["with_rank"].stop(); + + + // COMM FIRST DATABASE ================================================{{{1 + if (i == abcIndex.first) { + WITH_RANK << "__first__:first database ............ \n"; + const auto __db = communicateDatabase(abc, universe); + WITH_RANK << "__first__:first database communicated \n"; + WITH_RANK << "__first__:first database io phase \n"; + doIOPhase(__db); + WITH_RANK << "__first__:first database io phase DONE\n"; + WITH_RANK << "__first__::::Unwrapping all slices for first database\n"; + for (auto& u: unions) u->unwrapAll(abc); + WITH_RANK << "__first__::::Unwrapping all slices for first database DONE\n"; + MPI_Barrier(universe); + } + + // COMM NEXT DATABASE ================================================={{{1 + if (abcNext) { + WITH_RANK << "__comm__:" << iteration << "th communicating database\n"; + chrono["db:comm"].start(); + //const auto db = communicateDatabase(*abcNext, universe); + db = communicateDatabase(*abcNext, universe); + chrono["db:comm"].stop(); + chrono["db:io"].start(); + doIOPhase(db); + chrono["db:io"].stop(); + WITH_RANK << "__comm__:" << iteration << "th database io phase DONE\n"; + } + + // COMPUTE DOUBLES ===================================================={{{1 + OCD_Barrier(universe); + if (!isFakeTuple(i)) { + WITH_RANK << iteration << "-th doubles\n"; + WITH_CHRONO(chrono["oneshot-unwrap"], + WITH_CHRONO(chrono["unwrap"], + WITH_CHRONO(chrono["unwrap:doubles"], + for (auto& u: decltype(unions){&abph, &hhha, &taphh, &tabhh}) { + u->unwrapAll(abc); + } + ))) + chrono["oneshot-doubles"].start(); + chrono["doubles"].start(); + // TODO: REMOVE + for (size_t __i=0; __i < getIntegerArgument("doublesLoops", 1); __i++) + doublesContribution( abc, (size_t)No, (size_t)Nv + // -- VABCI + , abph.unwrapSlice(Slice::AB, abc) + , abph.unwrapSlice(Slice::AC, abc) + , abph.unwrapSlice(Slice::BC, abc) + , abph.unwrapSlice(Slice::BA, abc) + , abph.unwrapSlice(Slice::CA, abc) + , abph.unwrapSlice(Slice::CB, abc) + // -- VHHHA + , hhha.unwrapSlice(Slice::A, abc) + , hhha.unwrapSlice(Slice::B, abc) + , hhha.unwrapSlice(Slice::C, abc) + // -- TA + , taphh.unwrapSlice(Slice::A, abc) + , taphh.unwrapSlice(Slice::B, abc) + , taphh.unwrapSlice(Slice::C, abc) + // -- TABIJ + , tabhh.unwrapSlice(Slice::AB, abc) + , tabhh.unwrapSlice(Slice::AC, abc) + , tabhh.unwrapSlice(Slice::BC, abc) + // -- TIJK + , Tijk.data() + , chrono + ); + WITH_RANK << iteration << "-th doubles done\n"; + chrono["doubles"].stop(); + chrono["oneshot-doubles"].stop(); + } + + // COMPUTE SINGLES =================================================== {{{1 + OCD_Barrier(universe); + if (!isFakeTuple(i)) { + WITH_CHRONO(chrono["oneshot-unwrap"], + WITH_CHRONO(chrono["unwrap"], + WITH_CHRONO(chrono["unwrap:singles"], + abhh.unwrapAll(abc); + ))) + chrono["reorder"].start(); + for (size_t I(0); I < Zijk.size(); I++) Zijk[I] = Tijk[I]; + chrono["reorder"].stop(); + chrono["singles"].start(); + singlesContribution( abc + , Tai.data() + , abhh.unwrapSlice(Slice::AB, abc) + , abhh.unwrapSlice(Slice::AC, abc) + , abhh.unwrapSlice(Slice::BC, abc) + , Zijk.data()); + chrono["singles"].stop(); + } + + + // COMPUTE ENERGY ==================================================== {{{1 + if (!isFakeTuple(i)) { + double tupleEnergy(0.); + + int distinct(0); + if (abc[0] == abc[1]) distinct++; + if (abc[1] == abc[2]) distinct--; + const double epsabc(epsa[abc[0]] + epsa[abc[1]] + epsa[abc[2]]); + + chrono["energy"].start(); + if ( distinct == 0) + tupleEnergy = getEnergyDistinct(epsabc, epsi, Tijk, Zijk); + else + tupleEnergy = getEnergySame(epsabc, epsi, Tijk, Zijk); + chrono["energy"].stop(); + +#if defined(HAVE_OCD) || defined(TRIPLES_PRINT_TUPLES) + tupleEnergies[abc] = tupleEnergy; +#endif + + energy += tupleEnergy; + +#ifdef HAVE_OCD + auto const print_slices + = [](ABCTuple const& abc, ABCTuple const& want, SliceUnion& u) { + if (abc != want) return; + + for (auto type: u.sliceTypes) { + auto const& ptr = u.unwrapSlice(type, abc); + auto const& slice = Slice::findByTypeAbc(u.slices, type, abc); + WITH_RANK << "__print_slice__:n" << u.name << " " + << pretty_print(abc) << " " + << pretty_print(slice.info) + ; + for (size_t i = 0; i < 20; i++) std::cout << ptr[i] << ", "; + std::cout << std::endl; + } + }; +#endif + + if (isFakeTuple(i)) { + // fake iterations should also unwrap whatever they got + WITH_RANK << iteration + << "th unwrapping because of fake in " + << i << "\n"; + for (auto& u: unions) u->unwrapAll(abc); + } + +#ifdef HAVE_OCD + for (auto const& u: unions) { + WITH_RANK << "__dups__:" + << iteration + << "-th n" << u->name << " checking duplicates\n"; + u->checkForDuplicates(); + } +#endif + + + // CLEANUP UNIONS ===================================================={{{1 + OCD_Barrier(universe); + if (abcNext) { + chrono["gc"].start(); + WITH_RANK << "__gc__:" << iteration << "-th cleaning up.......\n"; + for (auto& u: unions) { + + u->unwrapAll(abc); + WITH_RANK << "__gc__:n" << u->name << " :it " << iteration + << " :abc " << pretty_print(abc) + << " :abcN " << pretty_print(*abcNext) + << "\n"; + for (auto const& slice: u->slices) + WITH_RANK << "__gc__:guts:" << slice.info << "\n"; + u->clearUnusedSlicesForNext(*abcNext); + + WITH_RANK << "__gc__: checking validity\n"; + +#ifdef HAVE_OCD + // check for validity of the slices + for (auto type: u->sliceTypes) { + auto tuple = Slice::subtupleBySlice(abc, type); + for (auto& slice: u->slices) { + if ( slice.info.type == type + && slice.info.tuple == tuple + && slice.isDirectlyFetchable() + ) { + if (slice.info.state == Slice::Dispatched) + throw std::domain_error( "This slice should not be undispatched! " + + pretty_print(slice.info)); + } + } + } +#endif + + + } + chrono["gc"].stop(); + } + + WITH_RANK << iteration << "-th cleaning up....... DONE\n"; + } + + // CLEAN CHRONO ======================================================{{{1 + chrono["iterations"].stop(); + { // TODO: REMOVEME + chrono["oneshot-doubles"].clear(); + chrono["oneshot-mpi:barrier"].clear(); + chrono["oneshot-db:comm:allgather"].clear(); + chrono["oneshot-unwrap"].clear(); + } + + // ITERATION END ====================================================={{{1 + } // END OF MAIN LOOP + + MPI_Barrier(universe); + + // PRINT TUPLES ========================================================={{{1 +#if defined(HAVE_OCD) || defined(TRIPLES_PRINT_TUPLES) + LOG(0,"NEW_TRIPLES") << "tuple energies" << "\n"; + for (size_t i = 0; i < np; i++) { + MPI_Barrier(universe); + for (auto const& pair: tupleEnergies) { + if (i == rank) + std::cout << pair.first[0] + << " " << pair.first[1] + << " " << pair.first[2] + << std::setprecision(15) << std::setw(23) + << " tupleEnergy: " << pair.second + << "\n" + ; + } + } +#endif + + // COMMUNICATE THE ENERGIES ============================================={{{1 + LOG(0,"NEW_TRIPLES") << "COMMUNICATING ENERGIES \n"; + double globalEnergy = 0; + MPI_Reduce(&energy, &globalEnergy, 1, MPI_DOUBLE, MPI_SUM, 0, universe); + + WITH_RANK << "local energy " << energy << "\n"; + LOG(0,"LOOP FINISHED, energy") + << std::setprecision(15) << std::setw(23) + << globalEnergy << std::endl; + + // PRINT TIMINGS {{{1 + for (auto const& pair: chrono) + LOG(0,"atrip:chrono") << pair.first << " " + << pair.second.count() << std::endl; + + + LOG(0, "atrip:flops") + << nIterations * doublesFlops / chrono["doubles"].count() << "\n"; + +} +#+end_src + + +** Include header + +#+begin_src c++ :tangle (atrip-main-h) +#pragma once + +#define TRIPLES_BENCHMARK +#define TRIPLES_DEBUG 1 +//#define TRIPLES_WORKLOAD_DUMP +#define TRIPLES_USE_DGEMM +//#define TRIPLES_PRINT_TUPLES + +#if TRIPLES_DEBUG == 4 +# pragma message("WARNING: You have OCD debugging ABC triples "\ + "expect GB of output and consult your therapist") +# include +# define HAVE_OCD +# define OCD_Barrier(com) MPI_Barrier(com) +# define WITH_OCD +# define WITH_ROOT if (atrip::Atrip::rank == 0) +# define WITH_SPECIAL(r) if (atrip::Atrip::rank == r) +# define WITH_RANK std::cout << atrip::Atrip::rank << ": " +# define WITH_CRAZY_DEBUG +# define WITH_DBG +# define DBG(...) dbg(__VA_ARGS__) +#elif TRIPLES_DEBUG == 3 +# pragma message("WARNING: You have crazy debugging ABC triples,"\ + " expect GB of output") +# include +# define OCD_Barrier(com) +# define WITH_OCD if (false) +# define WITH_ROOT if (atrip::Atrip::rank == 0) +# define WITH_SPECIAL(r) if (atrip::Atrip::rank == r) +# define WITH_RANK std::cout << atrip::Atrip::rank << ": " +# define WITH_CRAZY_DEBUG +# define WITH_DBG +# define DBG(...) dbg(__VA_ARGS__) +#elif TRIPLES_DEBUG == 2 +# pragma message("WARNING: You have some debugging info for ABC triples") +# include +# define OCD_Barrier(com) +# define WITH_OCD if (false) +# define WITH_ROOT if (atrip::Atrip::rank == 0) +# define WITH_SPECIAL(r) if (atrip::Atrip::rank == r) +# define WITH_RANK std::cout << atrip::Atrip::rank << ": " +# define WITH_CRAZY_DEBUG if (false) +# define WITH_DBG +# define DBG(...) dbg(__VA_ARGS__) +#elif TRIPLES_DEBUG == 1 +# define OCD_Barrier(com) +# define WITH_OCD if (false) +# define WITH_ROOT if (false) +# define WITH_SPECIAL(r) if (false) +# define WITH_RANK if (false) std::cout << atrip::Atrip::rank << ": " +# define WITH_DBG if (false) +# define WITH_CRAZY_DEBUG if (false) +# define DBG(...) +#else +# error("TRIPLES_DEBUG is not defined!") +#endif + +#include +#include +#include + + +#+end_src + + +** Todo :noexport: + #+begin_src c++ :tangle todo.hpp +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace cc4s; + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#+end_src + +#+begin_src c++ :tangle todo.hpp + + + + // MAIN ALGORITHM ======================================================{{{1 void cc4s::PerturbativeTriplesAbcijk::run(){ @@ -2161,9 +2784,6 @@ void cc4s::PerturbativeTriplesAbcijk::run(){ // START MAIN LOOP ======================================================{{{1 Slice::Database db; -#if defined(TRIPLES_WORKLOAD_DUMP) - std::vector> workloadDB; -#endif for ( size_t i = abcIndex.first, iteration = 1 ; i < abcIndex.second @@ -2371,30 +2991,6 @@ void cc4s::PerturbativeTriplesAbcijk::run(){ #endif - // WORKLOAD DUMP ====================================================={{{1 -#if defined(TRIPLES_WORKLOAD_DUMP) - { - chrono["workload"].start(); - std::vector entries; - for (auto const& u: unions) { - entries.push_back - ({ rank - , iteration - , abc - , (size_t)u->name - , WorkloadEntry::findSend(db, *u, rank) - , WorkloadEntry::findRecv(*u) - , (size_t)std::round(1e3*chrono["oneshot-doubles"].count()) - , (size_t)std::round(1e3*chrono["oneshot-mpi:barrier"].count()) - , (size_t)std::round(1e3*chrono["oneshot-db:comm:allgather"].count()) - , (size_t)std::round(1e3*chrono["oneshot-unwrap"].count()) - }); - } - workloadDB.push_back(entries); - chrono["workload"].stop(); - } -#endif - // CLEANUP UNIONS ===================================================={{{1 OCD_Barrier(universe); if (abcNext) { @@ -2489,59 +3085,5 @@ void cc4s::PerturbativeTriplesAbcijk::run(){ LOG(0, "atrip:flops") << nIterations * doublesFlops / chrono["doubles"].count() << "\n"; - // WORKLOAD DUMP ====================================================={{{1 -#if defined(TRIPLES_WORKLOAD_DUMP) - { - constexpr char* _workload_file = "workload.triples.bin"; - LOG(0,"NEW_TRIPLES") << "Writing out WORKLOAD to " - << _workload_file << "\n"; - LOG(0,"NEW_TRIPLES") << "# workload entries = " - << workloadDB.size() << "\n"; - MPI_File handle; - MPI_Status status; - MPI_File_open(universe, - _workload_file, - MPI_MODE_CREATE | MPI_MODE_WRONLY, - MPI_INFO_NULL, - &handle); - - for (int _it = 1; _it <= nIterations; _it++) { - MPI_Offset const - chunk_size = sizeof(WorkloadEntry) * workloadDB[0].size() - , main_offset = chunk_size - ,* (rank * nIterations + _it - 1) - ; - - auto const& entries = workloadDB[_it - 1]; - for (size_t i = 0; i < entries.size(); i++) { - MPI_Offset const - local_offset = main_offset + i * sizeof(WorkloadEntry); - LOG(0,"NEW_TRIPLES") << entries[i].iteration - << "," - << entries[i].name - << "," - << entries[i].send - << "," - << entries[i].recv - << "," - << entries[i].doubles - << "," - << entries[i].barrier - << "\n" - ; - MPI_File_write_at(handle, - local_offset, - &entries[i], - sizeof(WorkloadEntry), - MPI_CHAR, - &status); - } - } // iterations - MPI_Barrier(universe); - MPI_File_close(&handle); - } -#endif - - } #+end_src diff --git a/bench/test_main.cxx b/bench/test_main.cxx index e06dcc4..ebfe817 100644 --- a/bench/test_main.cxx +++ b/bench/test_main.cxx @@ -28,6 +28,9 @@ int main(int argc, char** argv) { , Vppph(4, vvvo.data(), symmetries.data(), world) ; + atrip::Atrip::init(); + atrip::Atrip::run({&ei, &ea, &Tph, &Tpphh, &Vpphh, &Vhhhp, &Vppph}); + std::cout << "Hello world" << std::endl; MPI_Finalize(); return 0; diff --git a/config.el b/config.el index 868bfea..f837321 100644 --- a/config.el +++ b/config.el @@ -1,4 +1,5 @@ (require 'subr-x) +(require 'org) (defun f-join (&rest args) (string-join args "/")) @@ -7,11 +8,29 @@ (defun atrip-print-sources () (princ (string-join atrip-sources " "))) +(defvar atrip-include-f "include/atrip") ;; TODO: create defvar +(defvar atrip-src-f "src/atrip") ;; TODO: create defvar + (defmacro atrip-def (name body) `(progn (defun ,name () ,body) (push (,name) atrip-sources))) + +(defmacro atrip-def-src (name body) + `(atrip-def ,name (f-join atrip-src-f ,body))) +(defmacro atrip-def-hdr (name body) + `(atrip-def ,name (f-join atrip-include-f ,body))) + ;; atrip variables for the org-mode file -(atrip-def atrip-include-f "include/atrip") -(atrip-def atrip-slice-h (f-join (atrip-include-f) "Slice.hpp")) -(atrip-def atrip-utils-h (f-join (atrip-include-f) "Utils.hpp")) +(atrip-def-hdr atrip-slice-h "Slice.hpp") +(atrip-def-hdr atrip-slice-union-h "SliceUnion.hpp") +(atrip-def-hdr atrip-utils-h "Utils.hpp") +(atrip-def-hdr atrip-blas-h "Blas.hpp") +(atrip-def-hdr atrip-rankmap-h "RankMap.hpp") +(atrip-def-hdr atrip-unions-h "Unions.hpp") +(atrip-def-hdr atrip-tuples-h "Tuples.hpp") +(atrip-def-hdr atrip-equations-h "Equations.hpp") + +(atrip-def-hdr atrip-atrip-h "Atrip.hpp") +(atrip-def-src atrip-atrip-cxx "Atrip.cxx") + (atrip-def atrip-main-h "include/atrip.hpp") diff --git a/etc/config/gcc.mk b/etc/config/gcc.mk index 7b14a3e..cf3aee4 100644 --- a/etc/config/gcc.mk +++ b/etc/config/gcc.mk @@ -6,5 +6,6 @@ CXXFLAGS += -I$(ATRIP_ROOT)/include CXXFLAGS += -I$(CTF_INCLUDE_PATH) LDFLAGS += -Wl,-Bstatic -L$(CTF_BUILD_PATH)/lib -lctf -LDFLAGS += -fopenmp -L/usr/lib -lscalapack -L/opt/OpenBLAS/lib -lopenblas +LDFLAGS += -fopenmp -L/usr/lib -L/opt/OpenBLAS/lib -lopenblas +LDFLAGS += -L/usr/local/lib -lscalapack LDFLAGS += -Wl,-Bdynamic diff --git a/etc/make/ctf_vars.mk b/etc/make/ctf_vars.mk index 490d26b..1de08fa 100644 --- a/etc/make/ctf_vars.mk +++ b/etc/make/ctf_vars.mk @@ -1,8 +1,8 @@ CTF_REPOSITORY = https://github.com/cyclops-community/ctf CTF_COMMIT ?= v1.5.0 -CTF_SRC_PATH = $(ATRIP_ROOT)/lib/src/ctf/$(CTF_COMMIT) -CTF_BUILD_PATH = $(ATRIP_ROOT)/lib/build/ctf/$(CTF_COMMIT) +CTF_SRC_PATH = $(ATRIP_ROOT)/extern/src/ctf/$(CTF_COMMIT) +CTF_BUILD_PATH = $(ATRIP_ROOT)/extern/build/ctf/$(CTF_COMMIT) CTF_CONFIG_FLAGS = diff --git a/etc/make/emacs.mk b/etc/make/emacs.mk index d617eae..82f0485 100644 --- a/etc/make/emacs.mk +++ b/etc/make/emacs.mk @@ -1,5 +1,5 @@ EMACS = emacs -q --batch --load config.el define tangle -$(EMACS) $(1) --eval "(require 'org)" --eval '(org-babel-tangle)' +$(EMACS) $(1) --eval '(org-babel-tangle)' endef diff --git a/etc/nix/scalapack.nix b/etc/nix/scalapack.nix new file mode 100644 index 0000000..04864da --- /dev/null +++ b/etc/nix/scalapack.nix @@ -0,0 +1,56 @@ +{ lib, stdenv, fetchFromGitHub, cmake, openssh +, gfortran, mpi, blas, lapack +} : + +assert (!blas.isILP64) && (!lapack.isILP64); + +stdenv.mkDerivation rec { + pname = "scalapack"; + version = "2.1.0"; + + src = fetchFromGitHub { + owner = "Reference-ScaLAPACK"; + repo = pname; + rev = "v${version}"; + sha256 = "1c10d18gj3kvpmyv5q246x35hjxaqn4ygy1cygaydhyxnm4klzdj"; + }; + + nativeBuildInputs = [ cmake openssh ]; + buildInputs = [ mpi gfortran blas lapack ]; + + doCheck = false; + + preConfigure = '' + cmakeFlagsArray+=( + -DBUILD_SHARED_LIBS=ON -DBUILD_STATIC_LIBS=ON + -DLAPACK_LIBRARIES="-llapack" + -DBLAS_LIBRARIES="-lblas" + ) + ''; + + # Increase individual test timeout from 1500s to 10000s because hydra's builds + # sometimes fail due to this + checkFlagsArray = [ "ARGS=--timeout 10000" ]; + + preCheck = '' + # make sure the test starts even if we have less than 4 cores + export OMPI_MCA_rmaps_base_oversubscribe=1 + + # Fix to make mpich run in a sandbox + export HYDRA_IFACE=lo + + # Run single threaded + export OMP_NUM_THREADS=1 + + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH''${LD_LIBRARY_PATH:+:}`pwd`/lib + ''; + + meta = with lib; { + homepage = "http://www.netlib.org/scalapack/"; + description = "Library of high-performance linear algebra routines for parallel distributed memory machines"; + license = licenses.bsd3; + platforms = [ "x86_64-linux" ]; + maintainers = with maintainers; [ costrouc markuskowa ]; + }; + +} diff --git a/shell.nix b/shell.nix index 915c87f..5550b0d 100644 --- a/shell.nix +++ b/shell.nix @@ -14,4 +14,23 @@ pkgs.mkShell rec { emacs ]; + scalapack = import ./etc/nix/scalapack.nix { + lib = pkgs.lib; + stdenv = pkgs.stdenv; + fetchFromGitHub = pkgs.fetchFromGitHub; + cmake = pkgs.cmake; + openssh = pkgs.openssh; + gfortran = pkgs.gfortran; + mpi = pkgs.mpi; + blas = pkgs.blas; + lapack = pkgs.lapack; + }; + + shellHook = '' + export LAPACK_PATH=${pkgs.lapack} + export BLAS_PATH=${pkgs.blas} + export SCALAPACK_PATH=${scalapack} + export LD_LIBRARY_PATH=${scalapack}/lib:$LD_LIBRARY_PATH + ''; + }