diff --git a/bench/tuples-distribution.cxx b/bench/tuples-distribution.cxx new file mode 100644 index 0000000..0245e6d --- /dev/null +++ b/bench/tuples-distribution.cxx @@ -0,0 +1,431 @@ +#include +#define ATRIP_DEBUG 2 +#include +#include +#include +#include +#include + +using namespace atrip; + +using F = double; +using Tr = CTF::Tensor; + +#define INIT_DRY(name, ...) \ + do { \ + std::vector lens = __VA_ARGS__; \ + int i = -1; \ + name.order = lens.size(); \ + name.lens = (int64_t*)malloc(sizeof(int64_t) * lens.size()); \ + name.sym = (int*)malloc(sizeof(int) * lens.size()); \ + name.lens[++i] = lens[i]; name.lens[++i] = lens[i]; \ + name.lens[++i] = lens[i]; name.lens[++i] = lens[i]; \ + i = 0; \ + name.sym[i++] = NS; name.sym[i++] = NS; \ + name.sym[i++] = NS; name.sym[i++] = NS; \ + } while (0) + +#define DEINIT_DRY(name) \ + do { \ + name.order = 0; \ + name.lens = NULL; \ + name.sym = NULL; \ + } while (0) + +using LocalDatabase = typename Slice::LocalDatabase; +using LocalDatabaseElement = typename Slice::LocalDatabaseElement; + +LocalDatabase buildLocalDatabase(SliceUnion &u, + ABCTuple const& abc) { + LocalDatabase result; + + auto const needed = u.neededSlices(abc); + + // BUILD THE DATABASE + // we need to loop over all sliceTypes that this TensorUnion + // is representing and find out how we will get the corresponding + // slice for the abc we are considering right now. + for (auto const& pair: needed) { + auto const type = pair.first; + auto const tuple = pair.second; + auto const from = u.rankMap.find(abc, type); + + { + // FIRST: look up if there is already a *Ready* slice matching what we + // need + auto const& it + = std::find_if(u.slices.begin(), u.slices.end(), + [&tuple, &type](Slice const& other) { + return other.info.tuple == tuple + && other.info.type == type + // we only want another slice when it + // has already ready-to-use data + && other.isUnwrappable() + ; + }); + if (it != u.slices.end()) { + // if we find this slice, it means that we don't have to do anything + result.push_back({u.name, it->info}); + continue; + } + } + + // + // Try to find a recyling possibility ie. find a slice with the same + // tuple and that has a valid data pointer. + // + auto const& recycleIt + = std::find_if(u.slices.begin(), u.slices.end(), + [&tuple, &type](Slice const& other) { + return other.info.tuple == tuple + && other.info.type != type + && other.isRecyclable() + ; + }); + + // + // if we find this recylce, then we find a Blank slice + // (which should exist by construction :THINK) + // + if (recycleIt != u.slices.end()) { + auto& blank = Slice::findOneByType(u.slices, Slice::Blank); + // TODO: formalize this through a method to copy information + // from another slice + blank.data = recycleIt->data; + blank.info.type = type; + blank.info.tuple = tuple; + blank.info.state = Slice::Recycled; + blank.info.from = from; + blank.info.recycling = recycleIt->info.type; + result.push_back({u.name, blank.info}); + WITH_RANK << "__db__: RECYCLING: n" << u.name + << " " << pretty_print(abc) + << " get " << pretty_print(blank.info) + << " from " << pretty_print(recycleIt->info) + << " ptr " << recycleIt->data + << "\n" + ; + continue; + } + + // in this case we have to create a new slice + // this means that we should have a blank slice at our disposal + // and also the freePointers should have some elements inside, + // so we pop a data pointer from the freePointers container + { + auto& blank = Slice::findOneByType(u.slices, Slice::Blank); + blank.info.type = type; + blank.info.tuple = tuple; + blank.info.from = from; + + // Handle self sufficiency + blank.info.state = Atrip::rank == from.rank + ? Slice::SelfSufficient + : Slice::Fetch + ; + if (blank.info.state == Slice::SelfSufficient) { + blank.data = (F*)0xBADA55; + } else { + blank.data = (F*)0xA55A55; + } + + result.push_back({u.name, blank.info}); + continue; + } + + } + + return result; + +} + +void clearUnusedSlicesForNext(SliceUnion &u, + ABCTuple const& abc) { + auto const needed = u.neededSlices(abc); + + // CLEAN UP SLICES, FREE THE ONES THAT ARE NOT NEEDED ANYMORE + for (auto& slice: u.slices) { + // if the slice is free, then it was not used anyways + if (slice.isFree()) continue; + + + // try to find the slice in the needed slices list + auto const found + = std::find_if(needed.begin(), needed.end(), + [&slice] (typename Slice::Ty_x_Tu const& tytu) { + return slice.info.tuple == tytu.second + && slice.info.type == tytu.first + ; + }); + + // if we did not find slice in needed, then erase it + if (found == needed.end()) { + + // allow to gc unwrapped and recycled, never Fetch, + // if we have a Fetch slice then something has gone very wrong. + if (!slice.isUnwrapped() && slice.info.state != Slice::Recycled) + throw + std::domain_error(_FORMAT("Trying to garbage collect (%d, %d) " + " a non-unwrapped slice! ", + slice.info.type, + slice.info.state)); + + // it can be that our slice is ready, but it has some hanging + // references lying around in the form of a recycled slice. + // Of course if we need the recycled slice the next iteration + // this would be fatal, because we would then free the pointer + // of the slice and at some point in the future we would + // overwrite it. Therefore, we must check if slice has some + // references in slices and if so then + // + // - we should mark those references as the original (since the data + // pointer should be the same) + // + // - we should make sure that the data pointer of slice + // does not get freed. + // + if (slice.info.state == Slice::Ready) { + WITH_OCD WITH_RANK + << "__gc__:" << "checking for data recycled dependencies\n"; + auto recycled + = Slice::hasRecycledReferencingToIt(u.slices, slice.info); + if (recycled.size()) { + Slice* newReady = recycled[0]; + WITH_OCD WITH_RANK + << "__gc__:" << "swaping recycled " + << pretty_print(newReady->info) + << " and " + << pretty_print(slice.info) + << "\n"; + newReady->markReady(); + + for (size_t i = 1; i < recycled.size(); i++) { + auto newRecyled = recycled[i]; + newRecyled->info.recycling = newReady->info.type; + WITH_OCD WITH_RANK + << "__gc__:" << "updating recycled " + << pretty_print(newRecyled->info) + << "\n"; + } + + } + } + + slice.free(); + } // we did not find the slice + + } +} + + +void unwrapSlice(Slice::Type t, ABCTuple abc, SliceUnion *u) { + auto& slice = Slice::findByTypeAbc(u->slices, t, abc); + switch (slice.info.state) { + case Slice::Dispatched: + slice.markReady(); + break; + case Slice::Recycled: + unwrapSlice(t, abc, u); + break; + } +} + +int main(int argc, char** argv) { + MPI_Init(&argc, &argv); + + int no(10), nv(100); + std::string tuplesDistributionString = "naive"; + + CLI::App app{"Main bench for atrip"}; + app.add_option("--no", no, "Occupied orbitals"); + app.add_option("--nv", nv, "Virtual orbitals"); + CLI11_PARSE(app, argc, argv); + + CTF::World world(argc, argv); + auto kaun = world.comm; + int rank, np; + MPI_Comm_rank(kaun, &rank); + MPI_Comm_size(kaun, &np); + Atrip::init(world.comm); + + + atrip::ABCTuples tuplesList; + atrip::TuplesDistribution *dist; + { + using namespace atrip; + if (tuplesDistributionString == "naive") { + dist = new NaiveDistribution(); + tuplesList = dist->getTuples(nv, world.comm); + } else if (tuplesDistributionString == "group") { + dist = new group_and_sort::Distribution(); + tuplesList = dist->getTuples(nv, world.comm); + } else { + std::cout << "--dist should be either naive or group\n"; + exit(1); + } + } + + LOG(0, "bench:") << "We have " + << tuplesList.size() + << " tuples" << std::endl; + + // create a fake dry tensor + Tr t_abph, t_abhh, t_tabhh, t_taphh, t_hhha; + INIT_DRY(t_abph , {nv, nv, nv, no}); + INIT_DRY(t_abhh , {nv, nv, no, no}); + INIT_DRY(t_tabhh , {nv, nv, no, no}); + INIT_DRY(t_taphh , {nv, nv, no, no}); + INIT_DRY(t_hhha , {no, no, no, nv}); + + ABPH abph(t_abph, (size_t)no, (size_t)nv, (size_t)np, kaun, kaun); + ABHH abhh(t_abhh, (size_t)no, (size_t)nv, (size_t)np, kaun, kaun); + TABHH tabhh(t_tabhh, (size_t)no, (size_t)nv, (size_t)np, kaun, kaun); + TAPHH taphh(t_taphh, (size_t)no, (size_t)nv, (size_t)np, kaun, kaun); + HHHA hhha(t_hhha, (size_t)no, (size_t)nv, (size_t)np, kaun, kaun); + std::vector< SliceUnion* > unions = {&taphh, &hhha, &abph, &abhh, &tabhh}; + + + + using Database = typename Slice::Database; + auto communicateDatabase + = [ &unions + , np + ] (ABCTuple const& abc, MPI_Comm const& c) -> Database { + + WITH_CHRONO("db:comm:type:do", + auto MPI_LDB_ELEMENT = Slice::mpi::localDatabaseElement(); + ) + + WITH_CHRONO("db:comm:ldb", + typename Slice::LocalDatabase ldb; + for (auto const& tensor: unions) { + auto const& tensorDb = buildLocalDatabase(*tensor, abc); + ldb.insert(ldb.end(), tensorDb.begin(), tensorDb.end()); + } + ) + + Database db(np * ldb.size(), ldb[0]); + + WITH_CHRONO("oneshot-db:comm:allgather", + WITH_CHRONO("db:comm:allgather", + MPI_Allgather(ldb.data(), + /* ldb.size() * sizeof(typename + Slice::LocalDatabaseElement) */ + ldb.size(), + MPI_LDB_ELEMENT, + db.data(), + /* ldb.size() * sizeof(typename + Slice::LocalDatabaseElement), */ + ldb.size(), + MPI_LDB_ELEMENT, + c); + )) + + WITH_CHRONO("db:comm:type:free", MPI_Type_free(&MPI_LDB_ELEMENT);) + + return db; + }; + + auto doIOPhase + = [&unions, &rank, &np] (Database const& db, + std::vector &to_send) { + + const size_t localDBLength = db.size() / np; + + size_t sendTag = 0 + , recvTag = rank * localDBLength + ; + + { + // At this point, we have already send to everyone that fits + auto const& begin = &db[rank * localDBLength] + , end = begin + localDBLength + ; + for (auto it = begin; it != end; ++it) { + recvTag++; + auto const& el = *it; + auto& u = unionByName(unions, el.name); + auto& slice = Slice::findByInfo(u.slices, el.info); + slice.markReady(); + // u.receive(el.info, recvTag); + + } // recv + } + + // SEND PHASE ========================================================= + for (size_t otherRank = 0; otherRank < np; otherRank++) { + auto const& begin = &db[otherRank * localDBLength] + , end = begin + localDBLength + ; + for (auto it = begin; it != end; ++it) { + sendTag++; + typename Slice::LocalDatabaseElement const& el = *it; + if (el.info.from.rank != rank) continue; + auto& u = unionByName(unions, el.name); + if (el.info.state == Slice::Fetch) { + to_send.push_back(el); + } + // u.send(otherRank, el, sendTag); + + } // send phase + + } // otherRank + + + }; + + std::vector + to_send; + + for (size_t it = 0; it < tuplesList.size(); it++) { + + + const ABCTuple abc = dist->tupleIsFake(tuplesList[it]) + ? tuplesList[tuplesList.size() - 1] + : tuplesList[it] + , *abcNext = it == (tuplesList.size() - 1) + ? nullptr + : &tuplesList[it + 1] + ; + + if (it > 0) { + for (auto const& u: unions) { + clearUnusedSlicesForNext(*u, abc); + } + } + + const auto db = communicateDatabase(abc, kaun); + doIOPhase(db, to_send); + + if (it % 1000 == 0) + std::cout << _FORMAT("%ld :it(%ld) %f %% ∷ %ld ∷ %f GB\n", + rank, + it, + 100 * to_send.size() / tuplesList.size(), + to_send.size(), + double(to_send.size()) * 8 / + 1024.0 / 1024.0 / 1024.0); + + + for (auto const& u: unions) { + for (auto type: u->sliceTypes) { + unwrapSlice(type, abc, u); + } + } + + + } + + std::cout << "=========================================================\n"; + std::cout << "FINISHING, it will segfaulten, that's ok, don't even trip" + << std::endl; + MPI_Barrier(kaun); + DEINIT_DRY(t_abph); + DEINIT_DRY(t_abhh); + DEINIT_DRY(t_tabhh); + DEINIT_DRY(t_taphh); + DEINIT_DRY(t_hhha); + + MPI_Finalize(); + return 0; +} diff --git a/shell.nix b/shell.nix index 0b0899a..92851eb 100644 --- a/shell.nix +++ b/shell.nix @@ -11,7 +11,8 @@ let config.allowUnfree = true; }; - openblas = import ./etc/nix/openblas.nix { inherit pkgs; }; + openblas = import ./etc/nix/openblas.nix { inherit pkgs; }; + vendor = import ./etc/nix/vendor-shell.nix; mkl-pkg = import ./etc/nix/mkl.nix { pkgs = unfree-pkgs; }; cuda-pkg = if cuda then (import ./cuda.nix { pkgs = unfree-pkgs; }) else {}; @@ -57,14 +58,15 @@ pkgs.mkShell rec { buildInputs = with pkgs; [ + gdb coreutils - git vim + git + vim openmpi llvmPackages.openmp binutils - emacs gfortran gnumake @@ -84,6 +86,15 @@ pkgs.mkShell rec { shellHook = '' + + ${vendor.src} + + ${vendor.cpath "${pkgs.openmpi.out}/include"} + ${vendor.cpath "${openblas.pkg.dev}/include"} + + ${vendor.lib "${pkgs.openmpi.out}/lib"} + ${vendor.lib "${openblas.pkg.out}/lib"} + export OMPI_CXX=${CXX} export OMPI_CC=${CC} CXX=${CXX} diff --git a/src/Makefile.am b/src/Makefile.am index b0ee2fb..27f3ba2 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -7,7 +7,7 @@ AM_CPPFLAGS = $(CTF_CPPFLAGS) lib_LIBRARIES = libatrip.a libatrip_a_CPPFLAGS = -I$(top_srcdir)/include/ -libatrip_a_SOURCES = ./atrip/Blas.cxx +libatrip_a_SOURCES = ./atrip/Blas.cxx ./atrip/Tuples.cxx NVCC_FILES = ./atrip/Equations.cxx ./atrip/Complex.cxx ./atrip/Atrip.cxx if WITH_CUDA