Add tentative DatabaseCommunicator

Update tuples-distribution script
Add tuples distribution bench
2022-10-06 01:10:06 +02:00 · 2022-10-06 01:07:53 +02:00 · 2022-10-03 17:13:21 +02:00 · 2022-10-03 17:11:49 +02:00 · 2022-10-03 17:11:33 +02:00
11 changed files with 1168 additions and 437 deletions
--- a/.dir-locals.el
+++ b/.dir-locals.el
@ -0,0 +1,20 @@
+;;; Directory Local Variables
+;;; For more information see (info "(emacs) Directory Variables")
+
+((c++-mode . ((outline-regexp . "// \\[\\[file:")  ; outline headings are the "// [[file:" marker comments
+              (eval . (let
+                          ((root  ; expanded path of the current project's root directory
+                            (expand-file-name
+                             (project-root
+                              (project-current)))))  ; NOTE(review): project-current may be nil outside a project -- confirm
+                        (setq-local flycheck-gcc-include-path  ; buffer-local include directories, all relative to root
+                                    (list
+                                     (format "%s/vendor/include/" root)
+                                     (format "%s/include/" root)
+                                     (format "%s/" root)
+                                     (format "%s/bench/" root)
+                                     (format "%s/build/main/" root)))))
+              (eval . (flycheck-mode))  ; evaluated when the local variables are applied
+              (eval . (outline-minor-mode))
+              (indent-tabs-mode . nil)  ; indent with spaces, not tabs
+              (tab-width . 2))))
--- a/.gitignore
+++ b/.gitignore
@ -25,3 +25,6 @@ config.mk
 /atrip.html
 /TAGS
 /config.h.in
+/result
+/result-dev
+/vendor/
--- a/bench/tuples-distribution.cxx
+++ b/bench/tuples-distribution.cxx
@ -0,0 +1,443 @@
+#include <iostream>
+#define ATRIP_DEBUG 2
+#include <atrip/Atrip.hpp>
+#include <atrip/Tuples.hpp>
+#include <atrip/Unions.hpp>
+#include <bench/CLI11.hpp>
+#include <bench/utils.hpp>
+
+using namespace atrip;
+
+using F = double;
+using Tr = CTF::Tensor<F>;
+
+// Fill in the shape fields of the tensor `name` by hand: `order`, `lens` and
+// `sym` are set from the braced list of extents given as the variadic
+// argument, e.g.
+//   INIT_DRY(t, {nv, nv, no, no});
+// Any number of dimensions is accepted and every dimension is marked NS.
+// Both arrays are allocated with malloc.
+//
+// The extents are copied with a plain index loop. Writing
+// `name.lens[++i] = lens[i]` instead modifies and reads `i` in the same
+// expression: with C++17 evaluation order the right-hand side is evaluated
+// first and reads lens[-1]; before C++17 the behavior is undefined.
+#define INIT_DRY(name, ...)                                       \
+  do {                                                            \
+    std::vector<int64_t> lens = __VA_ARGS__;                      \
+    name.order = lens.size();                                     \
+    name.lens = (int64_t*)malloc(sizeof(int64_t) * lens.size());  \
+    name.sym = (int*)malloc(sizeof(int) * lens.size());           \
+    for (size_t i = 0; i < lens.size(); i++) {                    \
+      name.lens[i] = lens[i];                                     \
+      name.sym[i] = NS;                                           \
+    }                                                             \
+  } while (0)
+
+// Undo INIT_DRY for the tensor `name`: release the two malloc'ed arrays and
+// reset the shape fields. Clearing the pointers without freeing them first
+// leaks the arrays. free(NULL) is a no-op, so applying this macro twice to
+// the same tensor is harmless.
+// NOTE(review): assumes `lens` and `sym` still hold the pointers allocated by
+// INIT_DRY and that nothing else has taken ownership of them -- confirm.
+#define DEINIT_DRY(name)                        \
+  do {                                          \
+    free(name.lens);                            \
+    free(name.sym);                             \
+    name.order = 0;                             \
+    name.lens = NULL;                           \
+    name.sym = NULL;                            \
+  } while (0)
+
+using LocalDatabase = typename Slice<F>::LocalDatabase;
+using LocalDatabaseElement = typename Slice<F>::LocalDatabaseElement;
+
+LocalDatabase buildLocalDatabase(SliceUnion<F> &u,  // union to scan; its Blank slices get claimed
+                                 ABCTuple const& abc) {  // tuple the database is built for
+  LocalDatabase result;  // one {u.name, slice info} entry per needed slice
+
+  auto const needed = u.neededSlices(abc);  // (type, tuple) pairs, unpacked in the loop below
+
+  // BUILD THE DATABASE
+  // For every slice type that this union needs for abc, decide how the
+  // slice is obtained: reuse a slice that is already usable, recycle the
+  // data of another slice with the same tuple, or claim a Blank slice.
+  for (auto const& pair: needed) {
+    auto const type = pair.first;
+    auto const tuple = pair.second;
+    auto const from  = u.rankMap.find(abc, type);  // source location; from.rank is compared with Atrip::rank below
+
+    {
+      // FIRST: look up if there is already a *Ready* slice matching what we
+      // need
+      auto const& it
+        = std::find_if(u.slices.begin(), u.slices.end(),
+                       [&tuple, &type](Slice<F> const& other) {
+                         return other.info.tuple == tuple
+                           && other.info.type == type
+                           // we only want another slice when it
+                           // already has ready-to-use data
+                           && other.isUnwrappable()
+                           ;
+                       });
+      if (it != u.slices.end()) {
+        // if we find this slice, it means that we don't have to do anything
+        result.push_back({u.name, it->info});
+        continue;
+      }
+    }
+
+    //
+    // Try to find a recycling possibility, i.e. find a slice with the same
+    // tuple but another type that reports isRecyclable().
+    //
+    auto const& recycleIt
+      = std::find_if(u.slices.begin(), u.slices.end(),
+                     [&tuple, &type](Slice<F> const& other) {
+                       return other.info.tuple == tuple
+                         && other.info.type != type
+                         && other.isRecyclable()
+                         ;
+                     });
+
+    //
+    // if we find such a slice to recycle, then we take a Blank slice
+    // (which should exist by construction :THINK)
+    //
+    if (recycleIt != u.slices.end()) {
+      auto& blank = Slice<F>::findOneByType(u.slices, Slice<F>::Blank);
+      // TODO: formalize this through a method to copy information
+      //       from another slice
+      blank.data = recycleIt->data;  // same pointer as the recycled slice, nothing is copied
+      blank.info.type = type;
+      blank.info.tuple = tuple;
+      blank.info.state = Slice<F>::Recycled;
+      blank.info.from = from;
+      blank.info.recycling = recycleIt->info.type;  // remember which slice type the data comes from
+      result.push_back({u.name, blank.info});
+      WITH_RANK << "__db__: RECYCLING: n" << u.name
+                << " " << pretty_print(abc)
+                << " get " << pretty_print(blank.info)
+                << " from " << pretty_print(recycleIt->info)
+                << " ptr " << recycleIt->data
+                << "\n"
+                ;
+      continue;
+    }
+
+    // in this case we have to create a new slice
+    // this means that we should have a blank slice at our disposal.
+    // Here no data pointer is popped from a freePointers container:
+    // the data pointer is only set to one of two fixed marker values below.
+    {
+      auto& blank = Slice<F>::findOneByType(u.slices, Slice<F>::Blank);
+      blank.info.type = type;
+      blank.info.tuple = tuple;
+      blank.info.from = from;
+
+      // Handle self sufficiency
+      blank.info.state = Atrip::rank == from.rank  // the source is this very rank
+                        ? Slice<F>::SelfSufficient
+                        : Slice<F>::Fetch
+                        ;
+      if (blank.info.state == Slice<F>::SelfSufficient) {
+        blank.data = (F*)0xBADA55;  // fixed marker address, not an allocated buffer
+      } else {
+        blank.data = (F*)0xA55A55;  // a different marker address for the Fetch case
+      }
+
+      result.push_back({u.name, blank.info});
+      continue;
+    }
+
+  }
+
+  return result;
+
+}
+
+void clearUnusedSlicesForNext(SliceUnion<F> &u,  // union whose slices are garbage collected
+                              ABCTuple const& abc) {  // tuple whose needed slices must be kept
+  auto const needed = u.neededSlices(abc);  // (type, tuple) pairs that must survive
+
+  // CLEAN UP SLICES, FREE THE ONES THAT ARE NOT NEEDED ANYMORE
+  for (auto& slice: u.slices) {
+    // if the slice is free, then it was not used anyways
+    if (slice.isFree()) continue;
+
+
+    // try to find the slice in the needed slices list
+    auto const found
+      = std::find_if(needed.begin(), needed.end(),
+                      [&slice] (typename Slice<F>::Ty_x_Tu const& tytu) {
+                        return slice.info.tuple == tytu.second
+                            && slice.info.type == tytu.first
+                            ;
+                      });
+
+    // if we did not find slice in needed, then erase it
+    if (found == needed.end()) {
+
+      // allow to gc unwrapped and recycled, never Fetch,
+      // if we have a Fetch slice then something has gone very wrong.
+      if (!slice.isUnwrapped() && slice.info.state != Slice<F>::Recycled)
+        throw
+          std::domain_error(_FORMAT("Trying to garbage collect (%d, %d) "
+                                    " a non-unwrapped slice! ",
+                                    slice.info.type,
+                                    slice.info.state));
+
+      // it can be that our slice is ready, but it has some hanging
+      // references lying around in the form of a recycled slice.
+      // Of course if we need the recycled slice the next iteration
+      // this would be fatal, because we would then free the pointer
+      // of the slice and at some point in the future we would
+      // overwrite it. Therefore, we must check if slice has some
+      // references in slices and if so then
+      //
+      //  - we should mark those references as the original (since the data
+      //    pointer should be the same)
+      //
+      //  - we should make sure that the data pointer of slice
+      //    does not get freed.
+      //
+      if (slice.info.state == Slice<F>::Ready) {
+        WITH_OCD WITH_RANK
+          << "__gc__:" << "checking for data recycled dependencies\n";
+        auto recycled
+          = Slice<F>::hasRecycledReferencingToIt(u.slices, slice.info);
+        if (recycled.size()) {
+          Slice<F>* newReady = recycled[0];  // the first referencing slice takes over
+          WITH_OCD WITH_RANK
+            << "__gc__:" << "swaping recycled "
+            << pretty_print(newReady->info)
+            << " and "
+            << pretty_print(slice.info)
+            << "\n";
+          newReady->markReady();
+
+          for (size_t i = 1; i < recycled.size(); i++) {  // the remaining ones now recycle newReady
+            auto newRecyled = recycled[i];
+            newRecyled->info.recycling = newReady->info.type;
+            WITH_OCD WITH_RANK
+              << "__gc__:" << "updating recycled "
+              << pretty_print(newRecyled->info)
+              << "\n";
+          }
+
+        }
+      }
+
+      slice.free();  // the slice is not needed for abc
+    }  // we did not find the slice
+
+  }
+}
+
+
+// Unwrap the slice of type `t` that the union `u` holds for the tuple `abc`.
+//
+//  - Dispatched: markReady() is called on the slice.
+//  - Recycled:   the slice shares the data of another slice whose type is
+//                stored in info.recycling, so that slice is unwrapped
+//                instead. Recursing with the same `t` would look up this
+//                very slice again and never terminate.
+//  - any other state: nothing is done.
+void unwrapSlice(Slice<F>::Type t, ABCTuple abc, SliceUnion<F> *u) {
+  auto& slice = Slice<F>::findByTypeAbc(u->slices, t, abc);
+  switch  (slice.info.state) {
+  case Slice<F>::Dispatched:
+    slice.markReady();
+    break;
+  case Slice<F>::Recycled:
+    unwrapSlice(slice.info.recycling, abc, u);
+    break;
+  default:
+    break;
+  }
+}
+
+// Print "# <expression text>: <value>" to std::cout when `rank` is 0.
+// A variable named `rank` must be in scope where the macro is used.
+// The argument is parenthesized in the stream expression so that an argument
+// containing operators that bind more loosely than `<<` (for example
+// `a ? b : c`) is still printed as a single value.
+#define PRINT_VARIABLE(v)                                           \
+  do {                                                              \
+    if (!rank) std::cout << "# " << #v << ": " << (v) << std::endl; \
+  } while (0)
+
+// Dry-run bench for the tuples distribution.
+//
+// Command line: --no (default 10), --nv (default 100) and --dist, which must
+// be "naive" or "group" (default "naive").
+//
+// The program builds the tuple list of this rank with the chosen
+// distribution, sets up five tensors whose shape is filled in by hand with
+// INIT_DRY together with their slice unions, and then walks over the tuples.
+// For every tuple it
+//   1. clears the slices that are not needed for it (skipped for the first),
+//   2. all-gathers the local slice databases of every rank,
+//   3. marks its own database entries ready and records in `to_send` every
+//      entry of any rank that is in state Fetch and whose source is this rank,
+//   4. prints a progress line every 1000 iterations,
+//   5. unwraps the slices of every union.
+// The send/receive calls of the unions are commented out, so only the
+// databases are communicated.
+int main(int argc, char** argv) {
+  MPI_Init(&argc, &argv);
+
+  int no(10), nv(100);
+  std::string tuplesDistributionString = "naive";
+
+  CLI::App app{"Main bench for atrip"};
+  app.add_option("--no", no, "Occupied orbitals");
+  app.add_option("--nv", nv, "Virtual orbitals");
+  app.add_option("--dist", tuplesDistributionString, "Which distribution");
+  CLI11_PARSE(app, argc, argv);
+
+  CTF::World world(argc, argv);
+  auto kaun = world.comm;
+  int rank, np;
+  MPI_Comm_rank(kaun, &rank);
+  MPI_Comm_size(kaun, &np);
+  Atrip::init(world.comm);
+
+
+
+  // Tuples of this rank, produced by the distribution selected with --dist.
+  // NOTE(review): `dist` is allocated with new and never deleted; it is still
+  // used inside the main loop and lives until the process exits.
+  atrip::ABCTuples tuplesList;
+  atrip::TuplesDistribution *dist;
+  {
+    using namespace atrip;
+    if (tuplesDistributionString == "naive") {
+      dist = new NaiveDistribution();
+      tuplesList = dist->getTuples(nv, world.comm);
+    } else if (tuplesDistributionString == "group") {
+      dist = new group_and_sort::Distribution();
+      tuplesList = dist->getTuples(nv, world.comm);
+    } else {
+      std::cout << "--dist should be either naive or group\n";
+      exit(1);
+    }
+  }
+
+  // size of the local tuple list in GB (sizeof does not evaluate its operand)
+  double tuplesListGb
+    = tuplesList.size() * sizeof(tuplesList[0])
+    / 1024.0 / 1024.0 / 1024.0;
+
+  std::cout << "\n";
+  PRINT_VARIABLE(tuplesDistributionString);
+  PRINT_VARIABLE(np);
+  PRINT_VARIABLE(no);
+  PRINT_VARIABLE(nv);
+  PRINT_VARIABLE(tuplesList.size());
+  PRINT_VARIABLE(tuplesListGb);
+
+  // create a fake dry tensor
+  Tr t_abph, t_abhh, t_tabhh, t_taphh, t_hhha;
+  INIT_DRY(t_abph  , {nv, nv, nv, no});
+  INIT_DRY(t_abhh  , {nv, nv, no, no});
+  INIT_DRY(t_tabhh , {nv, nv, no, no});
+  INIT_DRY(t_taphh , {nv, nv, no, no});
+  INIT_DRY(t_hhha  , {no, no, no, nv});
+
+  ABPH<F> abph(t_abph, (size_t)no, (size_t)nv, (size_t)np, kaun, kaun);
+  ABHH<F> abhh(t_abhh, (size_t)no, (size_t)nv, (size_t)np, kaun, kaun);
+  TABHH<F> tabhh(t_tabhh, (size_t)no, (size_t)nv, (size_t)np, kaun, kaun);
+  TAPHH<F> taphh(t_taphh, (size_t)no, (size_t)nv, (size_t)np, kaun, kaun);
+  HHHA<F>  hhha(t_hhha, (size_t)no, (size_t)nv, (size_t)np, kaun, kaun);
+  std::vector< SliceUnion<F>* > unions = {&taphh, &hhha, &abph, &abhh, &tabhh};
+
+
+
+  // Build the local database of every union for `abc`, concatenate them and
+  // all-gather the result over `c`. The returned database holds the local
+  // databases of all ranks back to back, in rank order.
+  // NOTE(review): `ldb[0]` assumes that the local database is never empty.
+  using Database = typename Slice<F>::Database;
+  auto communicateDatabase
+    = [ &unions
+      , np
+      ] (ABCTuple const& abc, MPI_Comm const& c) -> Database {
+
+        WITH_CHRONO("db:comm:type:do",
+          auto MPI_LDB_ELEMENT = Slice<F>::mpi::localDatabaseElement();
+        )
+
+        WITH_CHRONO("db:comm:ldb",
+          typename Slice<F>::LocalDatabase ldb;
+          for (auto const& tensor: unions) {
+            auto const& tensorDb = buildLocalDatabase(*tensor, abc);
+            ldb.insert(ldb.end(), tensorDb.begin(), tensorDb.end());
+          }
+        )
+
+        Database db(np * ldb.size(), ldb[0]);
+
+        WITH_CHRONO("oneshot-db:comm:allgather",
+        WITH_CHRONO("db:comm:allgather",
+                    MPI_Allgather(ldb.data(),
+                                  /* ldb.size() * sizeof(typename
+                                     Slice<F>::LocalDatabaseElement) */
+                                  ldb.size(),
+                                  MPI_LDB_ELEMENT,
+                                  db.data(),
+                                  /* ldb.size() * sizeof(typename
+                                     Slice<F>::LocalDatabaseElement), */
+                                  ldb.size(),
+                                  MPI_LDB_ELEMENT,
+                                  c);
+        ))
+
+        WITH_CHRONO("db:comm:type:free", MPI_Type_free(&MPI_LDB_ELEMENT);)
+
+        return db;
+      };
+
+  // Walk over the gathered database without moving any tensor data: the
+  // entries of this rank are marked ready, and every entry (of any rank) that
+  // is in state Fetch and has this rank as its source is appended to
+  // `to_send`.
+  auto doIOPhase
+    = [&unions, &rank, &np] (Database const& db,
+                             std::vector<LocalDatabaseElement> &to_send) {
+
+    // every rank contributed the same number of entries
+    const size_t localDBLength = db.size() / np;
+
+    size_t sendTag = 0
+         , recvTag = rank * localDBLength
+         ;
+
+    {
+      // At this point, we have already sent to everyone that fits
+      auto const& begin = &db[rank * localDBLength]
+                , end   = begin + localDBLength
+                ;
+      for (auto it = begin; it != end; ++it) {
+        recvTag++;
+        auto const& el = *it;
+        auto& u = unionByName(unions, el.name);
+        auto& slice = Slice<F>::findByInfo(u.slices, el.info);
+        slice.markReady();
+        // u.receive(el.info, recvTag);
+
+      } // recv
+    }
+
+    // SEND PHASE =========================================================
+    for (size_t otherRank = 0; otherRank < np; otherRank++) {
+      auto const& begin = &db[otherRank * localDBLength]
+                , end = begin + localDBLength
+                ;
+      for (auto it = begin; it != end; ++it) {
+        sendTag++;
+        typename Slice<F>::LocalDatabaseElement const& el = *it;
+        // only entries whose source is this rank are of interest
+        if (el.info.from.rank != rank) continue;
+        auto& u = unionByName(unions, el.name);
+        if (el.info.state == Slice<F>::Fetch) {
+          to_send.push_back(el);
+        }
+        // u.send(otherRank, el, sendTag);
+
+      } // send phase
+
+    } // otherRank
+
+
+  };
+
+  // accumulates over all iterations
+  std::vector<LocalDatabaseElement>
+    to_send;
+
+  for (size_t it = 0; it < tuplesList.size(); it++) {
+
+
+    // a fake tuple is replaced by the last tuple of the list
+    const ABCTuple abc = dist->tupleIsFake(tuplesList[it])
+                       ? tuplesList[tuplesList.size() - 1]
+                       : tuplesList[it]
+                 ;
+
+    if (it > 0) {
+      for (auto const& u: unions) {
+        clearUnusedSlicesForNext(*u, abc);
+      }
+    }
+
+    const auto db = communicateDatabase(abc, kaun);
+    doIOPhase(db, to_send);
+
+    // `rank` is an int and the counters are size_t, so they are formatted
+    // with %d and %zu: a conversion that does not match the argument type is
+    // undefined behavior in a printf-style call.
+    if (it % 1000 == 0)
+      std::cout << _FORMAT("%d :it %zu  %f %% ∷ %zu ∷ %f GB\n",
+                           rank,
+                           it,
+                           100.0 * double(to_send.size()) / double(tuplesList.size()),
+                           to_send.size(),
+                           double(to_send.size()) * sizeof(to_send[0])
+                           / 1024.0 / 1024.0 / 1024.0);
+
+
+    for (auto const& u: unions) {
+      for (auto type: u->sliceTypes) {
+        unwrapSlice(type, abc, u);
+      }
+    }
+
+
+  }
+
+  std::cout << "=========================================================\n";
+  std::cout << "FINISHING, it will segfaulten, that's ok, don't even trip"
+            << std::endl;
+  MPI_Barrier(kaun);
+  // reset the hand-made shape fields of the tensors
+  DEINIT_DRY(t_abph);
+  DEINIT_DRY(t_abhh);
+  DEINIT_DRY(t_tabhh);
+  DEINIT_DRY(t_taphh);
+  DEINIT_DRY(t_hhha);
+
+  MPI_Finalize();
+  return 0;
+}
--- a/bench/utils.hpp
+++ b/bench/utils.hpp
@ -0,0 +1,12 @@
+#ifndef UTILS_HPP_
+#define UTILS_HPP_
+
+// Everything the macro below expands to, so that this header can be included
+// on its own.
+#include <cstdio>
+#include <string>
+#include <vector>
+
+// printf-style formatting into a std::string, e.g.
+//   std::string s = _FORMAT("%d of %d", i, n);
+//
+// A first snprintf call with a null buffer measures the output, a second one
+// writes it into a buffer of that length plus the terminating NUL.
+//  - At least one argument after the format string is required, because
+//    __VA_ARGS__ follows a comma.
+//  - The arguments are expanded, and therefore evaluated, twice.
+//  - If snprintf reports an error (negative return value) the result is an
+//    empty string instead of indexing into an empty buffer.
+#define _FORMAT(_fmt, ...)                                    \
+  ([&] (void) -> std::string {                                \
+     int _sz = std::snprintf(nullptr, 0, _fmt, __VA_ARGS__);  \
+     if (_sz < 0) return std::string();                       \
+     std::vector<char>  _out(_sz  +  1);                      \
+     std::snprintf(&_out[0], _out.size(), _fmt, __VA_ARGS__); \
+     return std::string(_out.data());                         \
+   })()
+
+#endif
--- a/include/atrip/DatabaseCommunicator.hpp
+++ b/include/atrip/DatabaseCommunicator.hpp
@ -0,0 +1,20 @@
+#pragma once
+#include <atrip/Utils.hpp>
+#include <atrip/Equations.hpp>
+#include <atrip/SliceUnion.hpp>
+#include <atrip/Unions.hpp>
+
+namespace atrip {
+
+  template <typename F>
+  using Unions = std::vector<SliceUnion<F>*>;  // list of raw pointers to slice unions; NOTE(review): ownership is not visible here
+
+  template <typename F>
+  typename Slice<F>::Database  // declaration only; NOTE(review): as a template defined outside this header it presumably needs explicit instantiation -- confirm
+  naiveDatabase(Unions<F> &unions,  // unions the database is built for
+                size_t nv,  // NOTE(review): presumably the number of virtual orbitals -- confirm
+                size_t np,  // NOTE(review): presumably the number of MPI ranks -- confirm
+                size_t iteration,  // NOTE(review): presumably the index of the current tuple iteration -- confirm
+                MPI_Comm const& c);  // MPI communicator handle
+
+}  // namespace atrip
--- a/include/atrip/Tuples.hpp
+++ b/include/atrip/Tuples.hpp
@ -52,43 +52,7 @@ struct TuplesDistribution {
 // Distributing the tuples:1 ends here

 // [[file:~/cuda/atrip/atrip.org::*Node%20information][Node information:1]]
-std::vector<std::string> getNodeNames(MPI_Comm comm){
-  int rank, np;
-  MPI_Comm_rank(comm, &rank);
-  MPI_Comm_size(comm, &np);
-
-  std::vector<std::string> nodeList(np);
-  char nodeName[MPI_MAX_PROCESSOR_NAME];
-  char *nodeNames = (char*)malloc(np * MPI_MAX_PROCESSOR_NAME);
-  std::vector<int> nameLengths(np)
-                 , off(np)
-                 ;
-  int nameLength;
-  MPI_Get_processor_name(nodeName, &nameLength);
-  MPI_Allgather(&nameLength,
-                1,
-                MPI_INT,
-                nameLengths.data(),
-                1,
-                MPI_INT,
-                comm);
-  for (int i(1); i < np; i++)
-    off[i] = off[i-1] + nameLengths[i-1];
-  MPI_Allgatherv(nodeName,
-                 nameLengths[rank],
-                 MPI_BYTE,
-                 nodeNames,
-                 nameLengths.data(),
-                 off.data(),
-                 MPI_BYTE,
-                 comm);
-  for (int i(0); i < np; i++) {
-    std::string const s(&nodeNames[off[i]], nameLengths[i]);
-    nodeList[i] = s;
-  }
-  std::free(nodeNames);
-  return nodeList;
-}
+  std::vector<std::string> getNodeNames(MPI_Comm comm);
 // Node information:1 ends here

 // [[file:~/cuda/atrip/atrip.org::*Node%20information][Node information:2]]
@ -100,118 +64,28 @@ struct RankInfo {
  const size_t ranksPerNode;
 };

-template <typename A>
-A unique(A const &xs) {
-  auto result = xs;
-  std::sort(std::begin(result), std::end(result));
-  auto const& last = std::unique(std::begin(result), std::end(result));
-  result.erase(last, std::end(result));
-  return result;
-}
-
 std::vector<RankInfo>
-getNodeInfos(std::vector<string> const& nodeNames) {
-  std::vector<RankInfo> result;
-  auto const uniqueNames = unique(nodeNames);
-  auto const index = [&uniqueNames](std::string const& s) {
-    auto const& it = std::find(uniqueNames.begin(), uniqueNames.end(), s);
-    return std::distance(uniqueNames.begin(), it);
-  };
-  std::vector<size_t> localRanks(uniqueNames.size(), 0);
-  size_t globalRank = 0;
-  for (auto const& name: nodeNames) {
-    const size_t nodeId = index(name);
-    result.push_back({name,
-                      nodeId,
-                      globalRank++,
-                      localRanks[nodeId]++,
-                      (size_t)
-                      std::count(nodeNames.begin(),
-                                 nodeNames.end(),
-                                 name)
-                      });
-  }
-  return result;
-}
+getNodeInfos(std::vector<string> const& nodeNames);

 struct ClusterInfo {
  const size_t nNodes, np, ranksPerNode;
  const std::vector<RankInfo> rankInfos;
 };

-ClusterInfo
-getClusterInfo(MPI_Comm comm) {
-  auto const names = getNodeNames(comm);
-  auto const rankInfos = getNodeInfos(names);
-
-  return ClusterInfo {
-    unique(names).size(),
-    names.size(),
-    rankInfos[0].ranksPerNode,
-    rankInfos
-  };
-
-}
+ClusterInfo getClusterInfo(MPI_Comm comm);
 // Node information:2 ends here

 // [[file:~/cuda/atrip/atrip.org::*Naive%20list][Naive list:1]]
-ABCTuples getTuplesList(size_t Nv, size_t rank, size_t np) {
-
-  const size_t
-    // total number of tuples for the problem
-       n = Nv * (Nv + 1) * (Nv + 2) / 6 - Nv
-
-    // all ranks should have the same number of tuples_per_rank
-    , tuples_per_rank = n / np + size_t(n % np != 0)
-
-    // start index for the global tuples list
-    , start = tuples_per_rank * rank
-
-    // end index for the global tuples list
-    , end = tuples_per_rank * (rank + 1)
-    ;
-
-  LOG(1,"Atrip") << "tuples_per_rank = " << tuples_per_rank << "\n";
-  WITH_RANK << "start, end = " << start << ", " << end << "\n";
-  ABCTuples result(tuples_per_rank, FAKE_TUPLE);
-
-  for (size_t a(0), r(0), g(0); a < Nv; a++)
-  for (size_t b(a);             b < Nv; b++)
-  for (size_t c(b);             c < Nv; c++){
-    if ( a == b && b == c ) continue;
-    if ( start <= g && g < end) result[r++] = {a, b, c};
-    g++;
-  }
-
-  return result;
-
-}
+ABCTuples getTuplesList(size_t Nv, size_t rank, size_t np);
 // Naive list:1 ends here

 // [[file:~/cuda/atrip/atrip.org::*Naive%20list][Naive list:2]]
-ABCTuples getAllTuplesList(const size_t Nv) {
-  const size_t n = Nv * (Nv + 1) * (Nv + 2) / 6 - Nv;
-  ABCTuples result(n);
-
-  for (size_t a(0), u(0); a < Nv; a++)
-  for (size_t b(a); b < Nv; b++)
-  for (size_t c(b); c < Nv; c++){
-    if ( a == b && b == c ) continue;
-    result[u++] = {a, b, c};
-  }
-
-  return result;
-}
+ABCTuples getAllTuplesList(const size_t Nv);
 // Naive list:2 ends here

 // [[file:~/cuda/atrip/atrip.org::*Naive%20list][Naive list:3]]
 struct NaiveDistribution : public TuplesDistribution {
-  ABCTuples getTuples(size_t Nv, MPI_Comm universe) override {
-    int rank, np;
-    MPI_Comm_rank(universe, &rank);
-    MPI_Comm_size(universe, &np);
-    return getTuplesList(Nv, (size_t)rank, (size_t)np);
-  }
+  ABCTuples getTuples(size_t Nv, MPI_Comm universe) override;
 };
 // Naive list:3 ends here

@ -224,19 +98,12 @@ namespace group_and_sort {
 // Right now we distribute the slices in a round robin fashion
 // over the different nodes (NOTE: not mpi ranks but nodes)
 inline
-size_t isOnNode(size_t tuple, size_t nNodes) { return tuple % nNodes; }
+size_t isOnNode(size_t tuple, size_t nNodes);


 // return the node (or all nodes) where the elements of this
 // tuple are located
-std::vector<size_t> getTupleNodes(ABCTuple const& t, size_t nNodes) {
-  std::vector<size_t>
-    nTuple = { isOnNode(t[0], nNodes)
-             , isOnNode(t[1], nNodes)
-             , isOnNode(t[2], nNodes)
-             };
-  return unique(nTuple);
-}
+std::vector<size_t> getTupleNodes(ABCTuple const& t, size_t nNodes);

 struct Info {
  size_t nNodes;
@ -245,302 +112,16 @@ struct Info {
 // Utils:1 ends here

 // [[file:~/cuda/atrip/atrip.org::*Distribution][Distribution:1]]
-ABCTuples specialDistribution(Info const& info, ABCTuples const& allTuples) {
-
-  ABCTuples nodeTuples;
-  size_t const nNodes(info.nNodes);
-
-  std::vector<ABCTuples>
-      container1d(nNodes)
-    , container2d(nNodes * nNodes)
-    , container3d(nNodes * nNodes * nNodes)
-    ;
-
-  WITH_DBG if (info.nodeId == 0)
-    std::cout << "\tGoing through all "
-              << allTuples.size()
-              << " tuples in "
-              << nNodes
-              << " nodes\n";
-
-  // build container-n-d's
-  for (auto const& t: allTuples) {
-    // one which node(s) are the tuple elements located...
-    // put them into the right container
-    auto const _nodes = getTupleNodes(t, nNodes);
-
-    switch (_nodes.size()) {
-      case 1:
-        container1d[_nodes[0]].push_back(t);
-        break;
-      case 2:
-        container2d[ _nodes[0]
-                   + _nodes[1] * nNodes
-                   ].push_back(t);
-        break;
-      case 3:
-        container3d[ _nodes[0]
-                   + _nodes[1] * nNodes
-                   + _nodes[2] * nNodes * nNodes
-                   ].push_back(t);
-        break;
-    }
-
-  }
-
-  WITH_DBG if (info.nodeId == 0)
-    std::cout << "\tBuilding 1-d containers\n";
-  // DISTRIBUTE 1-d containers
-  // every tuple which is only located at one node belongs to this node
-  {
-    auto const& _tuples = container1d[info.nodeId];
-    nodeTuples.resize(_tuples.size(), INVALID_TUPLE);
-    std::copy(_tuples.begin(), _tuples.end(), nodeTuples.begin());
-  }
-
-  WITH_DBG if (info.nodeId == 0)
-    std::cout << "\tBuilding 2-d containers\n";
-  // DISTRIBUTE 2-d containers
-  //the tuples which are located at two nodes are half/half given to these nodes
-  for (size_t yx = 0; yx < container2d.size(); yx++) {
-
-    auto const& _tuples = container2d[yx];
-      const
-    size_t idx = yx % nNodes
-         // remeber: yx = idy * nNodes + idx
-         , idy = yx / nNodes
-         , n_half = _tuples.size() / 2
-         , size = nodeTuples.size()
-         ;
-
-    size_t nbeg, nend;
-    if (info.nodeId == idx) {
-      nbeg = 0 * n_half;
-      nend = n_half;
-    } else if (info.nodeId == idy) {
-      nbeg = 1 * n_half;
-      nend = _tuples.size();
-    } else {
-      // either idx or idy is my node
-      continue;
-    }
-
-    size_t const nextra = nend - nbeg;
-    nodeTuples.resize(size + nextra, INVALID_TUPLE);
-    std::copy(_tuples.begin() + nbeg,
-              _tuples.begin() + nend,
-              nodeTuples.begin() + size);
-
-  }
-
-  WITH_DBG if (info.nodeId == 0)
-    std::cout << "\tBuilding 3-d containers\n";
-  // DISTRIBUTE 3-d containers
-  for (size_t zyx = 0; zyx < container3d.size(); zyx++) {
-    auto const& _tuples = container3d[zyx];
-
-      const
-    size_t idx = zyx % nNodes
-         , idy = (zyx / nNodes) % nNodes
-         // remember: zyx = idx + idy * nNodes + idz * nNodes^2
-         , idz = zyx / nNodes / nNodes
-         , n_third = _tuples.size() / 3
-         , size = nodeTuples.size()
-         ;
-
-    size_t nbeg, nend;
-    if (info.nodeId == idx) {
-      nbeg = 0 * n_third;
-      nend = 1 * n_third;
-    } else if (info.nodeId == idy) {
-      nbeg = 1 * n_third;
-      nend = 2 * n_third;
-    } else if (info.nodeId == idz) {
-      nbeg = 2 * n_third;
-      nend = _tuples.size();
-    } else {
-      // either idx or idy or idz is my node
-      continue;
-    }
-
-    size_t const nextra = nend - nbeg;
-    nodeTuples.resize(size + nextra, INVALID_TUPLE);
-    std::copy(_tuples.begin() + nbeg,
-              _tuples.begin() + nend,
-              nodeTuples.begin() + size);
-
-  }
-
-
-  WITH_DBG if (info.nodeId == 0) std::cout << "\tswapping tuples...\n";
-  /*
-   *  sort part of group-and-sort algorithm
-   *  every tuple on a given node is sorted in a way that
-   *  the 'home elements' are the fastest index.
-   *  1:yyy 2:yyn(x) 3:yny(x) 4:ynn(x) 5:nyy 6:nyn(x) 7:nny 8:nnn
-   */
-  for (auto &nt: nodeTuples){
-    if ( isOnNode(nt[0], nNodes) == info.nodeId ){ // 1234
-      if ( isOnNode(nt[2], nNodes) != info.nodeId ){ // 24
-        size_t const x(nt[0]);
-        nt[0] = nt[2];         // switch first and last
-        nt[2] = x;
-      }
-      else if ( isOnNode(nt[1], nNodes) != info.nodeId){ // 3
-        size_t const x(nt[0]);
-        nt[0] = nt[1];         // switch first two
-        nt[1] = x;
-      }
-    } else {
-      if ( isOnNode(nt[1], nNodes) == info.nodeId   // 56
-        && isOnNode(nt[2], nNodes) != info.nodeId
-        ) { // 6
-        size_t const x(nt[1]);
-        nt[1] = nt[2];         // switch last two
-        nt[2] = x;
-      }
-    }
-  }
-
-  WITH_DBG if (info.nodeId == 0) std::cout << "\tsorting list of tuples...\n";
-  //now we sort the list of tuples
-  std::sort(nodeTuples.begin(), nodeTuples.end());
-
-  WITH_DBG if (info.nodeId == 0) std::cout << "\trestoring tuples...\n";
-  // we bring the tuples abc back in the order a<b<c
-  for (auto &t: nodeTuples)  std::sort(t.begin(), t.end());
-
-#if ATRIP_DEBUG > 1
-  WITH_DBG if (info.nodeId == 0)
-  std::cout << "checking for validity of " << nodeTuples.size() << std::endl;
-  const bool anyInvalid
-    = std::any_of(nodeTuples.begin(),
-                  nodeTuples.end(),
-                  [](ABCTuple const& t) { return t == INVALID_TUPLE; });
-  if (anyInvalid) throw "Some tuple is invalid in group-and-sort algorithm";
-#endif
-
-  WITH_DBG if (info.nodeId == 0) std::cout << "\treturning tuples...\n";
-  return nodeTuples;
-
-}
+ABCTuples specialDistribution(Info const& info, ABCTuples const& allTuples);
 // Distribution:1 ends here

 // [[file:~/cuda/atrip/atrip.org::*Main][Main:1]]
-std::vector<ABCTuple> main(MPI_Comm universe, size_t Nv) {
-
-  int rank, np;
-  MPI_Comm_rank(universe, &rank);
-  MPI_Comm_size(universe, &np);
-
-  std::vector<ABCTuple> result;
-
-  auto const nodeNames(getNodeNames(universe));
-  size_t const nNodes = unique(nodeNames).size();
-  auto const nodeInfos = getNodeInfos(nodeNames);
-
-  // We want to construct a communicator which only contains of one
-  // element per node
-  bool const computeDistribution
-    = nodeInfos[rank].localRank == 0;
-
-  std::vector<ABCTuple>
-    nodeTuples
-      = computeDistribution
-      ? specialDistribution(Info{nNodes, nodeInfos[rank].nodeId},
-                            getAllTuplesList(Nv))
-      : std::vector<ABCTuple>()
-      ;
-
-  LOG(1,"Atrip") << "got nodeTuples\n";
-
-  // now we have to send the data from **one** rank on each node
-  // to all others ranks of this node
-    const
-  int color = nodeInfos[rank].nodeId
-    , key = nodeInfos[rank].localRank
-    ;
-
-
-  MPI_Comm INTRA_COMM;
-  MPI_Comm_split(universe, color, key, &INTRA_COMM);
-// Main:1 ends here
-
-// [[file:~/cuda/atrip/atrip.org::*Main][Main:2]]
-size_t const
-  tuplesPerRankLocal
-     = nodeTuples.size() / nodeInfos[rank].ranksPerNode
-     + size_t(nodeTuples.size() % nodeInfos[rank].ranksPerNode != 0)
-     ;
-
-size_t tuplesPerRankGlobal;
-
-MPI_Reduce(&tuplesPerRankLocal,
-           &tuplesPerRankGlobal,
-           1,
-           MPI_UINT64_T,
-           MPI_MAX,
-           0,
-           universe);
-
-MPI_Bcast(&tuplesPerRankGlobal,
-          1,
-          MPI_UINT64_T,
-          0,
-          universe);
-
-LOG(1,"Atrip") << "Tuples per rank: " << tuplesPerRankGlobal << "\n";
-LOG(1,"Atrip") << "ranks per node " << nodeInfos[rank].ranksPerNode << "\n";
-LOG(1,"Atrip") << "#nodes " << nNodes << "\n";
-// Main:2 ends here
-
-// [[file:~/cuda/atrip/atrip.org::*Main][Main:3]]
-size_t const totalTuples
-  = tuplesPerRankGlobal * nodeInfos[rank].ranksPerNode;
-
-if (computeDistribution) {
-  // pad with FAKE_TUPLEs
-  nodeTuples.insert(nodeTuples.end(),
-                    totalTuples - nodeTuples.size(),
-                    FAKE_TUPLE);
-}
-// Main:3 ends here
-
-// [[file:~/cuda/atrip/atrip.org::*Main][Main:4]]
-{
-  // construct mpi type for abctuple
-  MPI_Datatype MPI_ABCTUPLE;
-  MPI_Type_vector(nodeTuples[0].size(), 1, 1, MPI_UINT64_T, &MPI_ABCTUPLE);
-  MPI_Type_commit(&MPI_ABCTUPLE);
-
-  LOG(1,"Atrip") << "scattering tuples \n";
-
-  result.resize(tuplesPerRankGlobal);
-  MPI_Scatter(nodeTuples.data(),
-              tuplesPerRankGlobal,
-              MPI_ABCTUPLE,
-              result.data(),
-              tuplesPerRankGlobal,
-              MPI_ABCTUPLE,
-              0,
-              INTRA_COMM);
-
-  MPI_Type_free(&MPI_ABCTUPLE);
-
-}
-// Main:4 ends here
-
-// [[file:~/cuda/atrip/atrip.org::*Main][Main:5]]
-return result;
-
-}
+std::vector<ABCTuple> main(MPI_Comm universe, size_t Nv);
 // Main:5 ends here

 // [[file:~/cuda/atrip/atrip.org::*Interface][Interface:1]]
 struct Distribution : public TuplesDistribution {
-  ABCTuples getTuples(size_t Nv, MPI_Comm universe) override {
-    return main(universe, Nv);
-  }
+  ABCTuples getTuples(size_t Nv, MPI_Comm universe) override;
 };
 // Interface:1 ends here

--- a/shell.nix
+++ b/shell.nix
@ -12,6 +12,7 @@ let
  };

  openblas = import ./etc/nix/openblas.nix { inherit pkgs; };
+  vendor = import ./etc/nix/vendor-shell.nix;

  mkl-pkg = import ./etc/nix/mkl.nix { pkgs = unfree-pkgs; };
  cuda-pkg = if cuda then (import ./cuda.nix { pkgs = unfree-pkgs; }) else {};
@ -57,14 +58,15 @@ pkgs.mkShell rec {
  buildInputs
    = with pkgs; [

+        gdb
        coreutils
-        git vim
+        git
+        vim

        openmpi
        llvmPackages.openmp

        binutils
-        emacs
        gfortran

        gnumake
@ -84,6 +86,15 @@ pkgs.mkShell rec {
  shellHook
    =
    ''
+
+    ${vendor.src}
+
+    ${vendor.cpath "${pkgs.openmpi.out}/include"}
+    ${vendor.cpath "${openblas.pkg.dev}/include"}
+
+    ${vendor.lib "${pkgs.openmpi.out}/lib"}
+    ${vendor.lib "${openblas.pkg.out}/lib"}
+
    export OMPI_CXX=${CXX}
    export OMPI_CC=${CC}
    CXX=${CXX}
--- a/src/Makefile.am
+++ b/src/Makefile.am
@ -7,7 +7,7 @@ AM_CPPFLAGS = $(CTF_CPPFLAGS)
 lib_LIBRARIES = libatrip.a

 libatrip_a_CPPFLAGS = -I$(top_srcdir)/include/
-libatrip_a_SOURCES = ./atrip/Blas.cxx
+libatrip_a_SOURCES = ./atrip/Blas.cxx ./atrip/Tuples.cxx ./atrip/DatabaseCommunicator.cxx
 NVCC_FILES = ./atrip/Equations.cxx ./atrip/Complex.cxx ./atrip/Atrip.cxx

 if WITH_CUDA
--- a/src/atrip/Atrip.cxx
+++ b/src/atrip/Atrip.cxx
@ -21,6 +21,7 @@
 #include <atrip/SliceUnion.hpp>
 #include <atrip/Unions.hpp>
 #include <atrip/Checkpoint.hpp>
+#include <atrip/DatabaseCommunicator.hpp>

 using namespace atrip;
 #if defined(HAVE_CUDA)
@ -299,9 +300,16 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
  using Database = typename Slice<F>::Database;
  auto communicateDatabase
    = [ &unions
+      , &in
+      , Nv
      , np
-      ] (ABCTuple const& abc, MPI_Comm const& c) -> Database {
+      ] (ABCTuple const& abc, MPI_Comm const& c, size_t iteration) -> Database {

+      if (in.tuplesDistribution == Atrip::Input<F>::TuplesDistribution::NAIVE) {
+
+        return naiveDatabase<F>(unions, Nv, np, iteration, c);
+
+      } else {
        WITH_CHRONO("db:comm:type:do",
          auto MPI_LDB_ELEMENT = Slice<F>::mpi::localDatabaseElement();
        )
@ -334,6 +342,8 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
        WITH_CHRONO("db:comm:type:free", MPI_Type_free(&MPI_LDB_ELEMENT);)

        return db;
+      }
+
      };

  auto doIOPhase
@ -564,7 +574,7 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
    // COMM FIRST DATABASE ================================================{{{1
    if (i == first_iteration) {
      WITH_RANK << "__first__:first database ............ \n";
-      const auto db = communicateDatabase(abc, universe);
+      const auto db = communicateDatabase(abc, universe, i);
      WITH_RANK << "__first__:first database communicated \n";
      WITH_RANK << "__first__:first database io phase \n";
      doIOPhase(db);
@ -579,7 +589,7 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
    if (abcNext) {
      WITH_RANK << "__comm__:" << iteration << "th communicating database\n";
      WITH_CHRONO("db:comm",
-        const auto db = communicateDatabase(*abcNext, universe);
+        const auto db = communicateDatabase(*abcNext, universe, i);
      )
      WITH_CHRONO("db:io",
        doIOPhase(db);
--- a/src/atrip/DatabaseCommunicator.cxx
+++ b/src/atrip/DatabaseCommunicator.cxx
@ -0,0 +1,167 @@
+#include <atrip/DatabaseCommunicator.hpp>
+#include <atrip/Complex.hpp>
+
+
+namespace atrip {
+
+  /**
+   * Build one tuple per rank following the naive block distribution, in
+   * which rank r owns the global tuple indices
+   * [r * tuples_per_rank, (r + 1) * tuples_per_rank).
+   *
+   * Tuples are enumerated as a <= b <= c < Nv, skipping a == b == c.
+   * Entry r of the result holds the tuple with the largest global index
+   * inside the block of rank r, because every tuple of a block overwrites
+   * the previous one. Ranks whose block contains no tuple keep the
+   * element that `ABCTuples result(np)` constructed for them.
+   *
+   * NOTE(review): despite the name there is no iteration parameter yet,
+   * so this cannot select the n-th tuple of each rank. Presumably that is
+   * the intended final behaviour -- confirm with the author.
+   *
+   * @param Nv exclusive upper bound of every tuple index
+   * @param np number of ranks; must not be zero
+   * @return vector with np entries, one tuple per rank
+   */
+  static
+  ABCTuples get_nth_naive_tuples(size_t Nv, size_t np) {
+
+    const size_t
+      // total number of tuples for the problem
+      n = Nv * (Nv + 1) * (Nv + 2) / 6 - Nv
+
+      // all ranks should have the same number of tuples_per_rank
+      , tuples_per_rank = n / np + size_t(n % np != 0)
+      ;
+
+    ABCTuples result(np);
+
+    for (size_t a(0), g(0); a < Nv; a++)
+    for (size_t b(a);       b < Nv; b++)
+    for (size_t c(b);       c < Nv; c++){
+      if ( a == b && b == c ) continue;
+
+      // The global index g belongs to exactly one block, so the owner
+      // can be computed directly instead of testing every rank.
+      // tuples_per_rank is non-zero here: reaching this line means that
+      // at least one tuple exists, i.e. n > 0.
+      const size_t owner = g / tuples_per_rank;
+      if (owner < np) result[owner] = {a, b, c};
+
+      g++;
+    }
+
+    return result;
+
+  }
+
+
+  /**
+   * Reconstruct locally the database entries that rank `rank` contributes
+   * for the tensor `u` when it moves from the tuple `abc_prev` to `abc`.
+   *
+   * One entry is produced for every slice returned by u->neededSlices(abc):
+   *  - Recycled       if a slice with the same tuple (the type may
+   *                   differ) was already needed for `abc_prev`,
+   *  - SelfSufficient if the slice is stored on `rank` itself,
+   *  - Fetch          otherwise.
+   *
+   * Only the tuple, type, from and state members of the Info are set.
+   *
+   * @param abc_prev tuple the rank worked on before
+   * @param abc      tuple the rank works on now
+   * @param rank     rank whose database is being reconstructed
+   * @param u        tensor (slice union) the entries refer to
+   * @return entries for `u`, in the order given by neededSlices(abc)
+   */
+  template <typename F>
+  static
+  typename Slice<F>::LocalDatabase
+  build_local_database_fake(ABCTuple const& abc_prev,
+                            ABCTuple const& abc,
+                            size_t rank,
+                            SliceUnion<F>* u) {
+
+    typename Slice<F>::LocalDatabase result;
+
+    // vector of type x tuple
+    auto const needed = u->neededSlices(abc);
+    auto const needed_prev = u->neededSlices(abc_prev);
+
+    for (auto const& pair: needed) {
+      auto const type = pair.first;
+      auto const tuple = pair.second;
+      auto const from  = u->rankMap.find(abc, type);
+
+      // Try to find in the previously needed slices
+      // one that exactly matches the tuple.
+      // Not necessarily has to match the type.
+      //
+      // If we find it, then it means that the fake rank
+      // will mark it as recycled. This covers
+      // the finding of Ready slices and Recycled slices.
+      bool const recycled
+        = std::any_of(needed_prev.begin(), needed_prev.end(),
+                      [&tuple](typename Slice<F>::Ty_x_Tu const& o) {
+                        return o.second == tuple;
+                      });
+
+      typename Slice<F>::Info info;
+      info.tuple = tuple;
+      info.type = type;
+      info.from = from;
+
+      if (recycled) {
+        info.state = Slice<F>::Recycled;
+      } else {
+        // Handle self sufficiency
+        info.state = rank == from.rank
+                   ? Slice<F>::SelfSufficient
+                   : Slice<F>::Fetch
+                   ;
+      }
+
+      result.push_back({u->name, info});
+
+    }
+
+    return result;
+
+  }
+
+
+
+  /**
+   * Assemble the database entries of all `np` ranks locally; the body
+   * performs no MPI call.
+   *
+   * For the calling rank (Atrip::rank) the entries come from
+   * buildLocalDatabase of every tensor in `unions`; for every other rank
+   * they are reconstructed with build_local_database_fake. The entries
+   * are concatenated in increasing rank order.
+   *
+   * NOTE(review): `iteration` and `c` are not used yet, and `tuples` and
+   * `prev_tuples` come from identical calls. Each fake rank is therefore
+   * given the same tuple as its "previous" one -- presumably both lists
+   * should depend on `iteration`; confirm with the author.
+   *
+   * @param unions    tensors whose slices are tracked in the database
+   * @param nv        forwarded to get_nth_naive_tuples as Nv
+   * @param np        number of ranks
+   * @param iteration currently unused
+   * @param c         currently unused
+   * @return concatenated local databases of all ranks
+   */
+  template <typename F>
+  typename Slice<F>::Database
+  naiveDatabase(Unions<F> &unions,
+                size_t nv,
+                size_t np,
+                size_t iteration,
+                MPI_Comm const& c) {
+
+    using Database = typename Slice<F>::Database;
+    Database db;
+    // one tuple per rank, see get_nth_naive_tuples
+    const auto tuples = get_nth_naive_tuples(nv, np);
+    const auto prev_tuples = get_nth_naive_tuples(nv, np);
+
+    for (size_t rank = 0; rank < np; rank++) {
+      auto abc = tuples[rank];
+      typename Slice<F>::LocalDatabase ldb;
+
+      for (auto const& tensor: unions) {
+        if (rank == Atrip::rank) {
+          // our own entries are built by the slice union itself
+          auto const& tensorDb = tensor->buildLocalDatabase(abc);
+          ldb.insert(ldb.end(), tensorDb.begin(), tensorDb.end());
+        } else {
+          // entries of the other ranks are reconstructed locally
+          auto const& tensorDb
+            = build_local_database_fake(prev_tuples[rank],
+                                        abc,
+                                        rank,
+                                        tensor);
+          ldb.insert(ldb.end(), tensorDb.begin(), tensorDb.end());
+        }
+      }
+
+      db.insert(db.end(), ldb.begin(), ldb.end());
+
+    }
+
+    return db;
+  }
+
+  // Explicit instantiations: the template is defined in this translation
+  // unit, so the double and Complex versions are instantiated here.
+  template 
+  typename Slice<double>::Database
+  naiveDatabase<double>(Unions<double> &unions,
+                size_t nv,
+                size_t np,
+                size_t iteration,
+                MPI_Comm const& c);
+
+  template 
+  typename Slice<Complex>::Database
+  naiveDatabase<Complex>(Unions<Complex> &unions,
+                size_t nv,
+                size_t np,
+                size_t iteration,
+                MPI_Comm const& c);
+
+}  // namespace atrip
--- a/src/atrip/Tuples.cxx
+++ b/src/atrip/Tuples.cxx
@ -0,0 +1,464 @@
+#include <atrip/Tuples.hpp>
+#include <atrip/Atrip.hpp>
+
+namespace atrip {
+
+/// Returns a sorted copy of `xs` in which every element appears only once.
+/// The argument itself is left untouched.
+template <typename A>
+static A unique(A const &xs) {
+  A sorted(xs);
+  std::sort(sorted.begin(), sorted.end());
+  // std::unique moves the duplicates to the tail, which is then cut off
+  sorted.erase(std::unique(sorted.begin(), sorted.end()), sorted.end());
+  return sorted;
+}
+
+
+/**
+ * Gather the processor name of every rank of `comm`.
+ *
+ * This is a collective operation: it calls MPI_Allgather and
+ * MPI_Allgatherv on `comm`.
+ *
+ * @param comm communicator whose ranks are queried
+ * @return vector with one entry per rank of `comm`, entry i being the name
+ *         that MPI_Get_processor_name reported on rank i
+ */
+std::vector<std::string> getNodeNames(MPI_Comm comm){
+  int rank, np;
+  MPI_Comm_rank(comm, &rank);
+  MPI_Comm_size(comm, &np);
+
+  char nodeName[MPI_MAX_PROCESSOR_NAME];
+  int nameLength;
+  MPI_Get_processor_name(nodeName, &nameLength);
+
+  // every rank publishes the length of its own name
+  std::vector<int> nameLengths(np)
+                 , off(np)
+                 ;
+  MPI_Allgather(&nameLength,
+                1,
+                MPI_INT,
+                nameLengths.data(),
+                1,
+                MPI_INT,
+                comm);
+
+  // off[i] is the position at which the name of rank i starts
+  for (int i(1); i < np; i++)
+    off[i] = off[i-1] + nameLengths[i-1];
+
+  // The receive buffer is owned by a vector, so it is released on every
+  // exit path and a failed allocation throws instead of yielding a null
+  // pointer.
+  std::vector<char> nodeNames(size_t(np) * MPI_MAX_PROCESSOR_NAME);
+  MPI_Allgatherv(nodeName,
+                 nameLengths[rank],
+                 MPI_BYTE,
+                 nodeNames.data(),
+                 nameLengths.data(),
+                 off.data(),
+                 MPI_BYTE,
+                 comm);
+
+  std::vector<std::string> nodeList(np);
+  for (int i(0); i < np; i++)
+    nodeList[i].assign(nodeNames.data() + off[i], nameLengths[i]);
+
+  return nodeList;
+}
+
+
+
+/**
+ * Derive the placement information of every rank from the node names.
+ *
+ * Node ids are the positions of the names in the sorted list of distinct
+ * names. Each RankInfo is initialised, in this order, with the node name,
+ * the node id, the global rank (position in `nodeNames`), the local rank
+ * (how many earlier ranks share the node) and the number of ranks that
+ * share the node.
+ *
+ * @param nodeNames node name of every rank, indexed by global rank
+ * @return one RankInfo per entry of `nodeNames`, in the same order
+ */
+std::vector<RankInfo>
+getNodeInfos(std::vector<string> const& nodeNames) {
+  auto const uniqueNames = unique(nodeNames);
+  auto const index = [&uniqueNames](std::string const& s) {
+    auto const& it = std::find(uniqueNames.begin(), uniqueNames.end(), s);
+    return std::distance(uniqueNames.begin(), it);
+  };
+
+  // First pass: node id of every rank and number of ranks on every node.
+  // Counting here avoids one std::count over all names for every rank.
+  std::vector<size_t> nodeIds;
+  nodeIds.reserve(nodeNames.size());
+  std::vector<size_t> ranksPerNode(uniqueNames.size(), 0);
+  for (auto const& name: nodeNames) {
+    const size_t nodeId = index(name);
+    nodeIds.push_back(nodeId);
+    ranksPerNode[nodeId]++;
+  }
+
+  // Second pass: local ranks are handed out in global rank order.
+  std::vector<RankInfo> result;
+  result.reserve(nodeNames.size());
+  std::vector<size_t> localRanks(uniqueNames.size(), 0);
+  for (size_t globalRank = 0; globalRank < nodeNames.size(); globalRank++) {
+    const size_t nodeId = nodeIds[globalRank];
+    result.push_back({nodeNames[globalRank],
+                      nodeId,
+                      globalRank,
+                      localRanks[nodeId]++,
+                      ranksPerNode[nodeId]
+                      });
+  }
+  return result;
+}
+
+/// Summarises the layout of `comm`: number of distinct node names, number
+/// of ranks, the ranksPerNode value of rank 0 and the per-rank infos.
+ClusterInfo
+getClusterInfo(MPI_Comm comm) {
+  auto const nodeNames = getNodeNames(comm);
+  auto const infos = getNodeInfos(nodeNames);
+  auto const distinctNames = unique(nodeNames);
+
+  return ClusterInfo {
+    distinctNames.size(),
+    nodeNames.size(),
+    infos[0].ranksPerNode,
+    infos
+  };
+
+}
+
+
+
+/**
+ * Tuples of one rank in the naive block distribution.
+ *
+ * The tuples a <= b <= c < Nv (without a == b == c) are numbered in
+ * enumeration order and rank `rank` receives those with a number in
+ * [tuples_per_rank * rank, tuples_per_rank * (rank + 1)).
+ *
+ * @param Nv   exclusive upper bound of every tuple index
+ * @param rank rank whose tuples are requested
+ * @param np   number of ranks; must not be zero
+ * @return tuples_per_rank entries; those not overwritten by a real tuple
+ *         stay FAKE_TUPLE
+ */
+ABCTuples getTuplesList(size_t Nv, size_t rank, size_t np) {
+
+  // total number of tuples for the problem
+  size_t const n = Nv * (Nv + 1) * (Nv + 2) / 6 - Nv;
+  // n / np rounded up, so that all ranks get the same amount
+  size_t const tuples_per_rank = n / np + size_t(n % np != 0);
+  // half-open range of global tuple numbers owned by this rank
+  size_t const start = tuples_per_rank * rank;
+  size_t const end = tuples_per_rank * (rank + 1);
+
+  LOG(1,"Atrip") << "tuples_per_rank = " << tuples_per_rank << "\n";
+  WITH_RANK << "start, end = " << start << ", " << end << "\n";
+  ABCTuples result(tuples_per_rank, FAKE_TUPLE);
+
+  size_t globalIndex = 0, localIndex = 0;
+  for (size_t a = 0; a < Nv; a++) {
+    for (size_t b = a; b < Nv; b++) {
+      for (size_t c = b; c < Nv; c++) {
+        if ( a == b && b == c ) continue;
+        bool const mine = start <= globalIndex && globalIndex < end;
+        if (mine) result[localIndex++] = {a, b, c};
+        globalIndex++;
+      }
+    }
+  }
+
+  return result;
+
+}
+
+
+/// Enumerates every tuple a <= b <= c < Nv except those with a == b == c,
+/// in the order of the nested loops below.
+ABCTuples getAllTuplesList(const size_t Nv) {
+  size_t const total = Nv * (Nv + 1) * (Nv + 2) / 6 - Nv;
+  ABCTuples tuples(total);
+
+  size_t next = 0;
+  for (size_t a = 0; a < Nv; a++) {
+    for (size_t b = a; b < Nv; b++) {
+      for (size_t c = b; c < Nv; c++) {
+        bool const diagonal = (a == b) && (b == c);
+        if (!diagonal) tuples[next++] = {a, b, c};
+      }
+    }
+  }
+
+  return tuples;
+}
+
+
+/// Naive distribution: asks getTuplesList for the block that belongs to
+/// the calling rank of `universe`.
+ABCTuples atrip::NaiveDistribution::getTuples(size_t Nv, MPI_Comm universe) {
+  int nRanks, myRank;
+  MPI_Comm_size(universe, &nRanks);
+  MPI_Comm_rank(universe, &myRank);
+  size_t const rank = (size_t)myRank;
+  size_t const np = (size_t)nRanks;
+  return getTuplesList(Nv, rank, np);
+}
+
+
+
+
+namespace group_and_sort {
+
+/// Maps an index to a node id by taking it modulo `nNodes`. Despite the
+/// name, the result is a node id and not a boolean.
+inline
+size_t isOnNode(size_t tuple, size_t nNodes) {
+  size_t const nodeId = tuple % nNodes;
+  return nodeId;
+}
+
+/// Sorted list of the distinct node ids (see isOnNode) of the three
+/// elements of `t`; it therefore has between one and three entries.
+std::vector<size_t> getTupleNodes(ABCTuple const& t, size_t nNodes) {
+  std::vector<size_t> nodes;
+  nodes.reserve(3);
+  for (size_t i = 0; i < 3; i++)
+    nodes.push_back(isOnNode(t[i], nNodes));
+  return unique(nodes);
+}
+
+
+/**
+ * Group-and-sort distribution: select from `allTuples` the tuples that
+ * belong to the node `info.nodeId` and return them sorted.
+ *
+ * Every tuple is classified by the set of nodes (see getTupleNodes) on
+ * which its three elements live:
+ *  - one node:    the tuple goes to that node,
+ *  - two nodes:   each container is split in two halves, one per node,
+ *  - three nodes: each container is split in three thirds, one per node.
+ * When a container does not divide evenly, the last node of the set takes
+ * the remainder. The selected tuples are then ordered by a key in which
+ * the elements living on this node come last, and finally every tuple is
+ * put back into ascending order.
+ *
+ * @param info      number of nodes and id of the node to select for
+ * @param allTuples all tuples of the problem
+ * @return tuples assigned to node `info.nodeId`
+ */
+ABCTuples specialDistribution(Info const& info, ABCTuples const& allTuples) {
+
+  ABCTuples nodeTuples;
+  size_t const nNodes(info.nNodes);
+
+  // containers are indexed by the node ids of the tuple elements,
+  // with the first node id as the fastest index
+  std::vector<ABCTuples>
+      container1d(nNodes)
+    , container2d(nNodes * nNodes)
+    , container3d(nNodes * nNodes * nNodes)
+    ;
+
+  WITH_DBG if (info.nodeId == 0)
+    std::cout << "\tGoing through all "
+              << allTuples.size()
+              << " tuples in "
+              << nNodes
+              << " nodes\n";
+
+  // build container-n-d's
+  for (auto const& t: allTuples) {
+    // find on which node(s) the tuple elements are located and
+    // put the tuple into the matching container
+    auto const _nodes = getTupleNodes(t, nNodes);
+
+    switch (_nodes.size()) {
+      case 1:
+        container1d[_nodes[0]].push_back(t);
+        break;
+      case 2:
+        container2d[ _nodes[0]
+                   + _nodes[1] * nNodes
+                   ].push_back(t);
+        break;
+      case 3:
+        container3d[ _nodes[0]
+                   + _nodes[1] * nNodes
+                   + _nodes[2] * nNodes * nNodes
+                   ].push_back(t);
+        break;
+    }
+
+  }
+
+  WITH_DBG if (info.nodeId == 0)
+    std::cout << "\tBuilding 1-d containers\n";
+  // DISTRIBUTE 1-d containers
+  // every tuple which is only located at one node belongs to this node
+  {
+    auto const& _tuples = container1d[info.nodeId];
+    nodeTuples.resize(_tuples.size(), INVALID_TUPLE);
+    std::copy(_tuples.begin(), _tuples.end(), nodeTuples.begin());
+  }
+
+  WITH_DBG if (info.nodeId == 0)
+    std::cout << "\tBuilding 2-d containers\n";
+  // DISTRIBUTE 2-d containers
+  // the tuples which are located at two nodes are given half/half to
+  // these nodes
+  for (size_t yx = 0; yx < container2d.size(); yx++) {
+
+    auto const& _tuples = container2d[yx];
+      const
+    size_t idx = yx % nNodes
+         // remember: yx = idy * nNodes + idx
+         , idy = yx / nNodes
+         , n_half = _tuples.size() / 2
+         , size = nodeTuples.size()
+         ;
+
+    size_t nbeg, nend;
+    if (info.nodeId == idx) {
+      nbeg = 0 * n_half;
+      nend = n_half;
+    } else if (info.nodeId == idy) {
+      // the second node also takes the odd tuple, if there is one
+      nbeg = 1 * n_half;
+      nend = _tuples.size();
+    } else {
+      // neither idx nor idy is my node: nothing to take from here
+      continue;
+    }
+
+    // append the range [nbeg, nend) of this container
+    size_t const nextra = nend - nbeg;
+    nodeTuples.resize(size + nextra, INVALID_TUPLE);
+    std::copy(_tuples.begin() + nbeg,
+              _tuples.begin() + nend,
+              nodeTuples.begin() + size);
+
+  }
+
+  WITH_DBG if (info.nodeId == 0)
+    std::cout << "\tBuilding 3-d containers\n";
+  // DISTRIBUTE 3-d containers
+  // the tuples which are located at three nodes are given in thirds to
+  // these nodes
+  for (size_t zyx = 0; zyx < container3d.size(); zyx++) {
+    auto const& _tuples = container3d[zyx];
+
+      const
+    size_t idx = zyx % nNodes
+         , idy = (zyx / nNodes) % nNodes
+         // remember: zyx = idx + idy * nNodes + idz * nNodes^2
+         , idz = zyx / nNodes / nNodes
+         , n_third = _tuples.size() / 3
+         , size = nodeTuples.size()
+         ;
+
+    size_t nbeg, nend;
+    if (info.nodeId == idx) {
+      nbeg = 0 * n_third;
+      nend = 1 * n_third;
+    } else if (info.nodeId == idy) {
+      nbeg = 1 * n_third;
+      nend = 2 * n_third;
+    } else if (info.nodeId == idz) {
+      // the third node also takes the remainder of the division by three
+      nbeg = 2 * n_third;
+      nend = _tuples.size();
+    } else {
+      // none of idx, idy, idz is my node: nothing to take from here
+      continue;
+    }
+
+    // append the range [nbeg, nend) of this container
+    size_t const nextra = nend - nbeg;
+    nodeTuples.resize(size + nextra, INVALID_TUPLE);
+    std::copy(_tuples.begin() + nbeg,
+              _tuples.begin() + nend,
+              nodeTuples.begin() + size);
+
+  }
+
+
+  WITH_DBG if (info.nodeId == 0) std::cout << "\tswapping tuples...\n";
+  /*
+   *  sort part of group-and-sort algorithm
+   *  every tuple on a given node is sorted in a way that
+   *  the 'home elements' are the fastest index.
+   *  1:yyy 2:yyn(x) 3:yny(x) 4:ynn(x) 5:nyy 6:nyn(x) 7:nny 8:nnn
+   *  (y/n: the element at that position is / is not on this node,
+   *   (x): the tuple gets swapped below)
+   */
+  for (auto &nt: nodeTuples){
+    if ( isOnNode(nt[0], nNodes) == info.nodeId ){ // 1234
+      if ( isOnNode(nt[2], nNodes) != info.nodeId ){ // 24
+        size_t const x(nt[0]);
+        nt[0] = nt[2];         // switch first and last
+        nt[2] = x;
+      }
+      else if ( isOnNode(nt[1], nNodes) != info.nodeId){ // 3
+        size_t const x(nt[0]);
+        nt[0] = nt[1];         // switch first two
+        nt[1] = x;
+      }
+    } else {
+      if ( isOnNode(nt[1], nNodes) == info.nodeId   // 56
+        && isOnNode(nt[2], nNodes) != info.nodeId
+        ) { // 6
+        size_t const x(nt[1]);
+        nt[1] = nt[2];         // switch last two
+        nt[2] = x;
+      }
+    }
+  }
+
+  WITH_DBG if (info.nodeId == 0) std::cout << "\tsorting list of tuples...\n";
+  // now we sort the list of (swapped) tuples
+  std::sort(nodeTuples.begin(), nodeTuples.end());
+
+  WITH_DBG if (info.nodeId == 0) std::cout << "\trestoring tuples...\n";
+  // we bring the tuples abc back in the order a <= b <= c
+  for (auto &t: nodeTuples)  std::sort(t.begin(), t.end());
+
+#if ATRIP_DEBUG > 1
+  WITH_DBG if (info.nodeId == 0)
+  std::cout << "checking for validity of " << nodeTuples.size() << std::endl;
+  const bool anyInvalid
+    = std::any_of(nodeTuples.begin(),
+                  nodeTuples.end(),
+                  [](ABCTuple const& t) { return t == INVALID_TUPLE; });
+  // NOTE(review): this throws a string literal (const char*), which a
+  // handler for std::exception will not catch
+  if (anyInvalid) throw "Some tuple is invalid in group-and-sort algorithm";
+#endif
+
+  WITH_DBG if (info.nodeId == 0) std::cout << "\treturning tuples...\n";
+  return nodeTuples;
+
+}
+
+
+  /**
+   * Entry point of the group-and-sort distribution: returns the tuples
+   * that the calling rank has to work on.
+   *
+   * This is a collective operation over `universe`. The rank with
+   * localRank 0 on every node computes the tuples of its node with
+   * specialDistribution, pads them with FAKE_TUPLE so that every rank of
+   * every node receives the same amount, and scatters them among the
+   * ranks of its node.
+   *
+   * NOTE(review): size_t values are communicated as MPI_UINT64_T, which
+   * assumes a 64-bit size_t.
+   *
+   * @param universe communicator containing all participating ranks
+   * @param Nv       forwarded to getAllTuplesList to enumerate the tuples
+   * @return the tuples of this rank, possibly including FAKE_TUPLE padding
+   */
+  std::vector<ABCTuple> main(MPI_Comm universe, size_t Nv) {
+
+    int rank;
+    MPI_Comm_rank(universe, &rank);
+
+    std::vector<ABCTuple> result;
+
+    auto const nodeNames(getNodeNames(universe));
+    size_t const nNodes = unique(nodeNames).size();
+    auto const nodeInfos = getNodeInfos(nodeNames);
+
+    // Only one rank per node, the one with local rank 0, computes the
+    // tuples of the node
+    bool const computeDistribution
+      = nodeInfos[rank].localRank == 0;
+
+    std::vector<ABCTuple>
+      nodeTuples
+      = computeDistribution
+      ? specialDistribution(Info{nNodes, nodeInfos[rank].nodeId},
+                            getAllTuplesList(Nv))
+      : std::vector<ABCTuple>()
+      ;
+
+    LOG(1,"Atrip") << "got nodeTuples\n";
+
+    // now we have to send the data from **one** rank on each node
+    // to all other ranks of this node; using the local rank as key makes
+    // the computing rank the rank 0 of the intra-node communicator
+    const
+      int color = nodeInfos[rank].nodeId,
+      key = nodeInfos[rank].localRank
+      ;
+
+
+    MPI_Comm INTRA_COMM;
+    MPI_Comm_split(universe, color, key, &INTRA_COMM);
+    // Main:1 ends here
+
+    // [[file:~/cuda/atrip/atrip.org::*Main][Main:2]]
+    // tuples per rank on this node, rounded up; zero on the ranks that
+    // did not compute the distribution
+    size_t const
+      tuplesPerRankLocal
+      = nodeTuples.size() / nodeInfos[rank].ranksPerNode
+      + size_t(nodeTuples.size() % nodeInfos[rank].ranksPerNode != 0)
+      ;
+
+    size_t tuplesPerRankGlobal;
+
+    // all ranks agree on the largest amount found on any node
+    MPI_Allreduce(&tuplesPerRankLocal,
+                  &tuplesPerRankGlobal,
+                  1,
+                  MPI_UINT64_T,
+                  MPI_MAX,
+                  universe);
+
+    LOG(1,"Atrip") << "Tuples per rank: " << tuplesPerRankGlobal << "\n";
+    LOG(1,"Atrip") << "ranks per node " << nodeInfos[rank].ranksPerNode << "\n";
+    LOG(1,"Atrip") << "#nodes " << nNodes << "\n";
+    // Main:2 ends here
+
+    // [[file:~/cuda/atrip/atrip.org::*Main][Main:3]]
+    size_t const totalTuples
+      = tuplesPerRankGlobal * nodeInfos[rank].ranksPerNode;
+
+    if (computeDistribution) {
+      // pad with FAKE_TUPLEs
+      nodeTuples.insert(nodeTuples.end(),
+                        totalTuples - nodeTuples.size(),
+                        FAKE_TUPLE);
+    }
+    // Main:3 ends here
+
+    // [[file:~/cuda/atrip/atrip.org::*Main][Main:4]]
+    {
+      // construct mpi type for abctuple
+      // The number of elements is taken from a temporary tuple:
+      // nodeTuples is empty on the ranks that did not compute the
+      // distribution, so nodeTuples[0] must not be touched there.
+      MPI_Datatype MPI_ABCTUPLE;
+      MPI_Type_vector(ABCTuple().size(), 1, 1, MPI_UINT64_T, &MPI_ABCTUPLE);
+      MPI_Type_commit(&MPI_ABCTUPLE);
+
+      LOG(1,"Atrip") << "scattering tuples \n";
+
+      result.resize(tuplesPerRankGlobal);
+      MPI_Scatter(nodeTuples.data(),
+                  tuplesPerRankGlobal,
+                  MPI_ABCTUPLE,
+                  result.data(),
+                  tuplesPerRankGlobal,
+                  MPI_ABCTUPLE,
+                  0,
+                  INTRA_COMM);
+
+      MPI_Type_free(&MPI_ABCTUPLE);
+
+    }
+
+    // the intra-node communicator is not needed any more
+    MPI_Comm_free(&INTRA_COMM);
+
+    return result;
+
+  }
+
+
+// Group-and-sort implementation of the TuplesDistribution interface:
+// forwards to main() above, note the swapped argument order.
+ABCTuples Distribution::getTuples(size_t Nv, MPI_Comm universe) {
+  return main(universe, Nv);
+}
+
+
+}  // namespace group_and_sort
+}  // namespace atrip
Author	SHA1	Message	Date
Alejandro Gallo	118df09128	Add tentative DatabaseCommunicator	2022-10-06 01:10:06 +02:00
Alejandro Gallo	1e391e3749	Update tuples-distribution script	2022-10-06 01:07:53 +02:00
Alejandro Gallo	7734efeb97	Add tuples distribution bench	2022-10-03 17:13:21 +02:00
Alejandro Gallo	fa1a29c583	Create an implementation file of the Tuples	2022-10-03 17:11:49 +02:00
Alejandro Gallo	2cbff5c8c9	Add bench utils	2022-10-03 17:11:33 +02:00