Add group and sort algorithm, it compiles at least

Alejandro Gallo 2021-10-15 19:18:40 +02:00
parent 944e93dc33
commit 65f804e637
3 changed files with 909 additions and 0 deletions

4
.gitignore vendored

@@ -1,6 +1,10 @@
.emacs
doc/doxygen/
extern
lib
*.o
*.d
a.out
*~
config.mk

502
atrip.org

@@ -1400,6 +1400,16 @@ as well as their distribution to nodes and cores.
#include <array>
#include <numeric>
// TODO: remove some
#include <stdio.h>
#include <math.h>
#include <algorithm>
#include <map>
#include <cassert>
#include <chrono>
#include <climits>
#include <mpi.h>
#include <atrip/Utils.hpp>
#include <atrip/Debug.hpp>
@@ -1491,6 +1501,498 @@ getABCRange(size_t np, size_t rank, ABCTuples const& tuplesList) {
#+end_src
*** Group and sort list
**** Prolog :noexport:
#+begin_src c++ :tangle (atrip-tuples-h)
namespace group_and_sort {
#+end_src
**** Node information
- nodeList ::
  List of hostnames, one per node, of size \( N_n \).
- nodeInfos ::
  List of (hostname, local rank id) pairs, one per rank,
  of size \( N_p \), where the local rank id runs from 0 to the
  number of ranks on that node minus one.
=getNodeNames= gathers the processor name of every rank, i.e.,
the resulting vector has one entry per rank, and the number of
unique entries gives the number of nodes.
#+begin_src c++ :tangle (atrip-tuples-h)
std::vector<std::string> getNodeNames(MPI_Comm comm){
int rank, np;
MPI_Comm_rank(comm, &rank);
MPI_Comm_size(comm, &np);
std::vector<std::string> nodeList(np);
char nodeName[MPI_MAX_PROCESSOR_NAME];
// use a std::vector instead of a variable-length array,
// which is not standard C++
std::vector<char> nodeNames(np * MPI_MAX_PROCESSOR_NAME);
std::vector<int> nameLengths(np)
, off(np)
;
int nameLength;
MPI_Get_processor_name(nodeName, &nameLength);
MPI_Allgather(&nameLength,
1,
MPI_INT,
nameLengths.data(),
1,
MPI_INT,
comm);
for (int i(1); i < np; i++)
off[i] = off[i-1] + nameLengths[i-1];
MPI_Allgatherv(nodeName,
nameLengths[rank],
MPI_BYTE,
nodeNames.data(),
nameLengths.data(),
off.data(),
MPI_BYTE,
comm);
for (int i(0); i < np; i++) {
std::string const s(&nodeNames[off[i]], nameLengths[i]);
nodeList[i] = s;
}
return nodeList;
}
#+end_src
=getNodeInfos= builds one =RankInfo= entry per rank, containing its
node id, its local rank on the node, and the number of ranks on that node.
#+begin_src c++ :tangle (atrip-tuples-h)
struct RankInfo {
const std::string name;
const size_t nodeId;
const size_t globalRank;
const size_t localRank;
const size_t ranksPerNode;
};
std::vector<RankInfo>
getNodeInfos(std::vector<std::string> const& nodeNames) {
std::vector<RankInfo> result;
auto uniqueNames = nodeNames;
{
std::sort(uniqueNames.begin(), uniqueNames.end());
auto const& last = std::unique(uniqueNames.begin(), uniqueNames.end());
uniqueNames.erase(last, uniqueNames.end());
}
const auto index = [&uniqueNames](std::string const& s) {
auto const& it = std::find(uniqueNames.begin(), uniqueNames.end(), s);
return std::distance(uniqueNames.begin(), it);
};
std::vector<size_t> localRanks(uniqueNames.size(), 0);
size_t rank = 0;
for (auto const& name: nodeNames) {
const size_t nodeId = index(name);
result.push_back({name,
nodeId,
rank++,
localRanks[nodeId]++,
std::count(nodeNames.begin(),
nodeNames.end(),
name)
});
}
return result;
}
#+end_src
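To make the bookkeeping concrete, here is a small, hand-written check
of =getNodeInfos= (illustration only, not tangled; it assumes the
definitions and headers from the blocks above, and the host names are
made up):
#+begin_src c++
void testGetNodeInfos() {
  // a hypothetical run: four ranks spread over two nodes
  std::vector<std::string> const names = {"nodeA", "nodeB", "nodeA", "nodeB"};
  auto const infos = getNodeInfos(names);
  assert(infos[2].name == "nodeA");   // global rank 2 runs on nodeA
  assert(infos[2].nodeId == 0);       // nodeA gets node id 0 (sorted order)
  assert(infos[2].localRank == 1);    // rank 2 is the second rank on nodeA
  assert(infos[2].ranksPerNode == 2); // two ranks share nodeA
}
#+end_src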
**** Utils
#+begin_src c++ :tangle (atrip-tuples-h)
// Provides the node on which the slice-element is found
// Right now we distribute the slices in a round robin fashion
// over the different nodes (NOTE: not mpi ranks but nodes)
size_t isOnNode(size_t tuple, size_t nodes) { return tuple % nodes; }
struct Info {
size_t nNodes;
size_t Nv;
size_t np;
size_t nodeId;
};
// return the node (or all nodes) where the elements of this
// tuple are located
std::vector<size_t> getTupleNodes(ABCTuple t, size_t nNodes) {
std::vector<size_t> result;
ABCTuple nTuple = { isOnNode(t[0], nNodes)
, isOnNode(t[1], nNodes)
, isOnNode(t[2], nNodes)
};
std::sort(nTuple.begin(), nTuple.end());
ABCTuple::iterator it = std::unique(nTuple.begin(), nTuple.end());
result.resize(it - nTuple.begin());
std::copy(nTuple.begin(), it, result.begin());
return result;
}
#+end_src
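A short check of =getTupleNodes= (illustration only, not tangled;
assumes =ABCTuple= is =std::array<size_t, 3>= as in the rest of this
file):
#+begin_src c++
void testGetTupleNodes() {
  // with three nodes, element p lives on node p % 3
  assert((getTupleNodes({3, 6, 9}, 3) == std::vector<size_t>{0}));       // one node
  assert((getTupleNodes({1, 4, 5}, 3) == std::vector<size_t>{1, 2}));    // two nodes
  assert((getTupleNodes({3, 4, 8}, 3) == std::vector<size_t>{0, 1, 2})); // three nodes
}
#+end_src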
**** Distribution
Wording: a /home element/ of a tuple is an element whose slice is
located on the given node.
1. We assign every tuple to a node that holds at least one of its
   elements, i.e., every tuple has at least one home element on its node.
2. We reorder each tuple so that its home elements are the fastest
   (i.e., the last) indices.
3. We sort the list of tuples on every node.
4. We restore the order a < b < c inside every tuple abc.
For instance, with two nodes, node 0 turns the tuple (2, 3, 4) into
(3, 2, 4) in step 2, since 2 and 4 are its home elements, and step 4
brings it back to (2, 3, 4).
#+begin_src c++ :tangle (atrip-tuples-h)
std::vector<ABCTuple>
specialDistribution(Info info, std::vector<ABCTuple> const& allTuples) {
std::vector<ABCTuple> nodeTuples;
size_t nNodes(info.nNodes);
size_t np(info.np);
size_t N(allTuples.size());
size_t tuplePerNode( ceil( ((double)N) / nNodes) );
// nodeid tuple list
std::map<size_t, std::vector<ABCTuple> > container1d;
std::map<size_t, std::vector<ABCTuple> > container2d;
std::map<size_t, std::vector<ABCTuple> > container3d;
// build container-n-d's
for (auto t: allTuples) {
// on which node(s) are the tuple elements located?
// put them into the right container
auto nt = getTupleNodes(t, nNodes);
if ( nt.size() == 1) container1d[nt[0]].push_back(t);
if ( nt.size() == 2) container2d[nt[0] + nNodes*nt[1]].push_back(t);
if ( nt.size() == 3)
container3d[nt[0] + nNodes*nt[1] + nNodes*nNodes*nt[2]].push_back(t);
}
// DISTRIBUTE 1-d containers
// every tuple which is only located at one node belongs to this node
{
auto const& tuplesVec = container1d[info.nodeId];
nodeTuples.resize(tuplesVec.size());
std::copy(tuplesVec.begin(), tuplesVec.end(), nodeTuples.begin());
}
// DISTRIBUTE 2-d containers
//the tuples which are located at two nodes are half/half given to these nodes
for (auto &m: container2d) {
size_t idx = m.first%nNodes;
size_t idy = m.first/nNodes;
size_t myNode = idx;
// either idx or idy is my node
if (idx != info.nodeId && idy != info.nodeId) continue;
if (idy == info.nodeId) myNode = idy;
auto tuplesVec = m.second;
auto n = tuplesVec.size() / 2;
auto size = nodeTuples.size();
if (myNode == idx) {
nodeTuples.resize(size + n);
std::copy(tuplesVec.begin(),
tuplesVec.begin() + n,
nodeTuples.begin() + size);
} else {
auto ny = tuplesVec.size() - n;
nodeTuples.resize(size + ny);
std::copy(tuplesVec.begin() + n,
tuplesVec.end(),
nodeTuples.begin() + size);
}
}
// DISTRIBUTE 3-d containers
// similar game for the tuples which belong to three different nodes
for (auto m: container3d){
auto tuplesVec = m.second;
auto idx = m.first%nNodes;
auto idy = (m.first/nNodes)%nNodes;
auto idz = m.first/nNodes/nNodes;
if (idx != info.nodeId && idy != info.nodeId && idz != info.nodeId) continue;
size_t nx = tuplesVec.size() / 3;
size_t n, nbegin, nend;
if (info.nodeId == idx) {
n = nx;
nbegin = 0;
nend = n;
} else if (info.nodeId == idy) {
n = nx;
nbegin = n;
nend = n + n;
} else {
n = tuplesVec.size() - 2 * nx;
nbegin = 2 * nx;
nend = 2 * nx + n;
}
auto size = nodeTuples.size();
nodeTuples.resize(size + n);
std::copy(tuplesVec.begin() + nbegin,
tuplesVec.begin() + nend,
nodeTuples.begin() + size);
}
// sort part of group-and-sort algorithm
// every tuple on a given node is sorted in a way that
// the 'home elements' are the fastest indices.
// 1:yyy 2:yyn(x) 3:yny(x) 4:ynn(x) 5:nyy 6:nyn(x) 7:nny 8:nnn
size_t n = info.nodeId;
for (auto &nt: nodeTuples){
if ( isOnNode(nt[0], nNodes) == n ){ // 1234
if ( isOnNode(nt[2], nNodes) != n ){ // 24
size_t x(nt[0]); nt[0] = nt[2]; nt[2] = x; // switch first and last
}
else if ( isOnNode(nt[1], nNodes) != n){ // 3
size_t x(nt[0]); nt[0] = nt[1]; nt[1] = x; // switch first two
}
} else {
if ( isOnNode(nt[1], nNodes) == n // 56
&& isOnNode(nt[2], nNodes) != n){ // 6
size_t x(nt[1]); nt[1] = nt[2]; nt[2] = x; // switch last two
}
}
}
//now we sort the list of tuples
std::sort(nodeTuples.begin(), nodeTuples.end());
// we bring the tuples abc back in the order a<b<c
for (auto &t: nodeTuples) std::sort(t.begin(), t.end());
return nodeTuples;
}
//determine which element has to be fetched from sources for the next iteration
std::vector<size_t> fetchElement(ABCTuple cur, ABCTuple suc){
std::vector<size_t> result;
ABCTuple inter;
std::sort(cur.begin(), cur.end());
std::sort(suc.begin(), suc.end());
std::array<size_t,3>::iterator rit, cit, sit;
cit = std::unique(cur.begin(), cur.end());
sit = std::unique(suc.begin(), suc.end());
rit = std::set_difference(suc.begin(), sit, cur.begin(), cit, inter.begin());
result.resize(rit - inter.begin());
std::copy(inter.begin(), rit, result.begin());
return result;
}
#+end_src
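A small usage sketch of =fetchElement= (illustration only, not
tangled; assumes the definitions above):
#+begin_src c++
void testFetchElement() {
  ABCTuple const cur = {1, 2, 3}, suc = {1, 3, 4};
  // only slice 4 of the successor tuple is not already present
  assert((fetchElement(cur, suc) == std::vector<size_t>{4}));
}
#+end_src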
**** Main
The main routine should return the list of tuples to be handled by the current rank.
Let \( N_p \) be the number of ranks or processes.
Let \( N_n \) be the number of nodes or sockets.
Then we have, for example, the following assignment
for \( N_p = 9 \) ranks on \( N_n = 3 \) nodes:
#+begin_example
Global rank  | 0 1 2 3 4 5 6 7 8
nodeId       | 0 1 0 1 1 0 2 2 2
Local rank   | 0 0 1 1 2 2 0 1 2
intra color  | 0 1 0 1 1 0 2 2 2
key          | local rank (the rank's position inside its node)
#+end_example
#+begin_src c++ :tangle (atrip-tuples-h)
std::vector<ABCTuple> main(MPI_Comm universe, size_t Nv) {
int rank, np;
MPI_Comm_rank(universe, &rank);
MPI_Comm_size(universe, &np);
std::vector<ABCTuple> result;
const auto nodeNames(getNodeNames(universe));
auto nodeNamesUnique(nodeNames);
{
// std::unique only removes consecutive duplicates, so sort first
std::sort(nodeNamesUnique.begin(), nodeNamesUnique.end());
const auto& last = std::unique(nodeNamesUnique.begin(),
nodeNamesUnique.end());
nodeNamesUnique.erase(last, nodeNamesUnique.end());
}
// we pick one rank from every node
auto const nodeInfos = getNodeInfos(nodeNames);
size_t const nNodes = nodeNamesUnique.size();
// Only the local root rank of every node (localRank == 0) computes
// the distribution for its node
bool const makeDistribution = nodeInfos[rank].localRank == 0;
std::vector<ABCTuple>
nodeTuples = makeDistribution
? specialDistribution(Info{ nNodes
, Nv
, np
, nodeInfos[rank].nodeId
},
getTuplesList(Nv))
: std::vector<ABCTuple>()
;
// now we have to send the data from **one** rank on each node
// to all other ranks of this node
const
int color = nodeInfos[rank].nodeId
, key = nodeInfos[rank].localRank
;
MPI_Comm INTRA_COMM;
MPI_Comm_split(universe, color, key, &INTRA_COMM);
#+end_src
Every node has to hand out at least
=nodeTuples.size() / nodeInfos[rank].ranksPerNode=
tuples to each of its ranks, rounded up when the division leaves a
remainder.
Since every rank in the universe must end up with the same number of
tuples, we communicate the maximum of this quantity over all nodes.
#+begin_src c++ :tangle (atrip-tuples-h)
const size_t
tuplesPerRankLocal
= nodeTuples.size() / nodeInfos[rank].ranksPerNode
+ size_t(nodeTuples.size() % nodeInfos[rank].ranksPerNode != 0)
;
size_t tuplesPerRankGlobal;
MPI_Reduce(&tuplesPerRankLocal,
&tuplesPerRankGlobal,
1,
MPI_UINT64_T,
MPI_MAX,
0,
universe);
MPI_Bcast(&tuplesPerRankGlobal,
1,
MPI_UINT64_T,
0,
universe);
#+end_src
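As a side remark, the =MPI_Reduce= and =MPI_Bcast= pair above could be
replaced by a single collective; a minimal sketch with the same
buffers (not tangled):
#+begin_src c++
// every rank directly obtains the maximum of tuplesPerRankLocal
MPI_Allreduce(&tuplesPerRankLocal,
              &tuplesPerRankGlobal,
              1,
              MPI_UINT64_T,
              MPI_MAX,
              universe);
#+end_src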
Now we know how many tuples every rank has to hold, namely
=tuplesPerRankGlobal=.
Before distributing them, the tuples in =nodeTuples= have to be sent
from the local root rank of every node to all the other ranks of that
node, and every rank inside a given node has to receive the same
number of tuples, namely =tuplesPerRankLocal=.
The total number of tuples on the node should therefore be
=tuplesPerRankLocal * nodeInfos[rank].ranksPerNode=,
which in general is not yet the case due to divisibility:
so far the node holds exactly =nodeTuples.size()= tuples.
We therefore pad =nodeTuples= with fake tuples at the end until the
condition above is met.
#+begin_src c++ :tangle (atrip-tuples-h)
size_t const totalTuplesLocal
= tuplesPerRankLocal
,* nodeInfos[rank].ranksPerNode;
if (makeDistribution)
nodeTuples.insert(nodeTuples.end(),
totalTuplesLocal - nodeTuples.size(),
FAKE_TUPLE);
#+end_src
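As a concrete illustration with made-up numbers (not tangled): a node
holding 10 tuples with 4 ranks pads up to 12 tuples.
#+begin_src c++
void testPadding() {
  size_t const nTuples = 10, ranksPerNode = 4;
  size_t const perRank
    = nTuples / ranksPerNode + size_t(nTuples % ranksPerNode != 0);
  assert(perRank == 3);                 // every rank has to take 3 tuples
  assert(perRank * ranksPerNode == 12); // so 2 FAKE_TUPLEs are appended
}
#+end_src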
The next step is sending the tuples from the local root rank to the
other ranks of the node, which we do with the MPI function
=MPI_Scatterv=.
Every rank gets =tuplesPerRankLocal= tuples, and after the padding
above the size of =nodeTuples= is an exact multiple of the number of
ranks on the node.
Therefore, the =displacements= are simply the vector
\begin{equation*}
\left\{
  k \cdot \mathrm{tuplesPerRankLocal}
  \mid
  k \in \left\{ 0, \ldots, \#\text{ranks in node} - 1 \right\}
\right\}
\end{equation*}
and the =sendCounts= vector is the constant vector with value
=tuplesPerRankLocal= and length =ranksPerNode=.
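For concreteness, a small stand-alone check of these two vectors with
made-up numbers (not tangled):
#+begin_src c++
void testScatterLayout() {
  // hypothetical: 4 ranks on the node, 10 tuples per rank
  size_t const ranksPerNode = 4, tuplesPerRankLocal = 10;
  std::vector<int> const sendCounts(ranksPerNode, tuplesPerRankLocal);
  std::vector<int> displacements(ranksPerNode);
  for (size_t k = 0; k < displacements.size(); k++)
    displacements[k] = int(k * tuplesPerRankLocal);
  assert((sendCounts == std::vector<int>{10, 10, 10, 10}));
  assert((displacements == std::vector<int>{0, 10, 20, 30}));
}
#+end_src
The tangled code below does the same with the actual node quantities.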
#+begin_src c++ :tangle (atrip-tuples-h)
{
std::vector<int> const
sendCounts(nodeInfos[rank].ranksPerNode, tuplesPerRankLocal);
std::vector<int>
displacements(nodeInfos[rank].ranksPerNode);
// rank k of the node gets the block starting at k * tuplesPerRankLocal
for (size_t k = 0; k < displacements.size(); k++) {
displacements[k] = int(k * tuplesPerRankLocal);
}
// important!
result.resize(tuplesPerRankLocal);
// construct mpi type for abctuple
MPI_Datatype MPI_ABCTUPLE;
// ABCTuple has a fixed size; do not index nodeTuples here, since it
// is empty on the non-root ranks of the node
MPI_Type_vector(ABCTuple{}.size(), 1, 1, MPI_UINT64_T, &MPI_ABCTUPLE);
MPI_Type_commit(&MPI_ABCTUPLE);
MPI_Scatterv(nodeTuples.data(),
sendCounts.data(),
displacements.data(),
MPI_ABCTUPLE,
result.data(),
tuplesPerRankLocal,
MPI_ABCTUPLE,
0,
INTRA_COMM);
// free type
MPI_Type_free(&MPI_ABCTUPLE);
}
#+end_src
Finally, we have to make sure that the size of the result is the same
on every rank of the universe communicator, inserting fake tuples
where needed:
#+begin_src c++ :tangle (atrip-tuples-h)
result.insert(result.end(),
tuplesPerRankGlobal - result.size(),
FAKE_TUPLE);
return result;
}
#+end_src
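A sketch of how the routine might be called from the rest of the code
(the call site and the value of =Nv= are hypothetical, not part of
this commit):
#+begin_src c++
void distributeTuples(MPI_Comm world) {
  size_t const Nv = 100; // made-up value for Nv
  std::vector<ABCTuple> const tuples = group_and_sort::main(world, Nv);
  // trailing FAKE_TUPLEs are padding and have to be skipped by the caller
}
#+end_src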
**** Epilog :noexport:
#+begin_src c++ :tangle (atrip-tuples-h)
}
#+end_src
*** Epilog :noexport:
#+begin_src c++ :tangle (atrip-tuples-h)
}


@@ -5,6 +5,16 @@
#include <array>
#include <numeric>
// TODO: remove some
#include <stdio.h>
#include <math.h>
#include <algorithm>
#include <map>
#include <cassert>
#include <chrono>
#include <climits>
#include <mpi.h>
#include <atrip/Utils.hpp>
#include <atrip/Debug.hpp>
@@ -68,6 +78,399 @@ getABCRange(size_t np, size_t rank, ABCTuples const& tuplesList) {
}
// Naive list:2 ends here
// [[file:../../atrip.org::*Prolog][Prolog:1]]
namespace group_and_sort {
// Prolog:1 ends here
// [[file:../../atrip.org::*Node information][Node information:1]]
std::vector<std::string> getNodeNames(MPI_Comm comm){
int rank, np;
MPI_Comm_rank(comm, &rank);
MPI_Comm_size(comm, &np);
std::vector<std::string> nodeList(np);
char nodeName[MPI_MAX_PROCESSOR_NAME];
// use a std::vector instead of a variable-length array,
// which is not standard C++
std::vector<char> nodeNames(np * MPI_MAX_PROCESSOR_NAME);
std::vector<int> nameLengths(np)
, off(np)
;
int nameLength;
MPI_Get_processor_name(nodeName, &nameLength);
MPI_Allgather(&nameLength,
1,
MPI_INT,
nameLengths.data(),
1,
MPI_INT,
comm);
for (int i(1); i < np; i++)
off[i] = off[i-1] + nameLengths[i-1];
MPI_Allgatherv(nodeName,
nameLengths[rank],
MPI_BYTE,
nodeNames.data(),
nameLengths.data(),
off.data(),
MPI_BYTE,
comm);
for (int i(0); i < np; i++) {
std::string const s(&nodeNames[off[i]], nameLengths[i]);
nodeList[i] = s;
}
return nodeList;
}
// Node information:1 ends here
// [[file:../../atrip.org::*Node information][Node information:2]]
struct RankInfo {
const std::string name;
const size_t nodeId;
const size_t globalRank;
const size_t localRank;
const size_t ranksPerNode;
};
std::vector<RankInfo>
getNodeInfos(std::vector<std::string> const& nodeNames) {
std::vector<RankInfo> result;
auto uniqueNames = nodeNames;
{
std::sort(uniqueNames.begin(), uniqueNames.end());
auto const& last = std::unique(uniqueNames.begin(), uniqueNames.end());
uniqueNames.erase(last, uniqueNames.end());
}
const auto index = [&uniqueNames](std::string const& s) {
auto const& it = std::find(uniqueNames.begin(), uniqueNames.end(), s);
return std::distance(uniqueNames.begin(), it);
};
std::vector<size_t> localRanks(uniqueNames.size(), 0);
size_t rank = 0;
for (auto const& name: nodeNames) {
const size_t nodeId = index(name);
result.push_back({name,
nodeId,
rank++,
localRanks[nodeId]++,
std::count(nodeNames.begin(),
nodeNames.end(),
name)
});
}
return result;
}
// Node information:2 ends here
// [[file:../../atrip.org::*Utils][Utils:1]]
// Provides the node on which the slice-element is found
// Right now we distribute the slices in a round robin fashion
// over the different nodes (NOTE: not mpi ranks but nodes)
size_t isOnNode(size_t tuple, size_t nodes) { return tuple % nodes; }
struct Info {
size_t nNodes;
size_t Nv;
size_t np;
size_t nodeId;
};
// return the node (or all nodes) where the elements of this
// tuple are located
std::vector<size_t> getTupleNodes(ABCTuple t, size_t nNodes) {
std::vector<size_t> result;
ABCTuple nTuple = { isOnNode(t[0], nNodes)
, isOnNode(t[1], nNodes)
, isOnNode(t[2], nNodes)
};
std::sort(nTuple.begin(), nTuple.end());
ABCTuple::iterator it = std::unique(nTuple.begin(), nTuple.end());
result.resize(it - nTuple.begin());
std::copy(nTuple.begin(), it, result.begin());
return result;
}
// Utils:1 ends here
// [[file:../../atrip.org::*Distribution][Distribution:1]]
std::vector<ABCTuple>
specialDistribution(Info info, std::vector<ABCTuple> const& allTuples) {
std::vector<ABCTuple> nodeTuples;
size_t nNodes(info.nNodes);
size_t np(info.np);
size_t N(allTuples.size());
size_t tuplePerNode( ceil( ((double)N) / nNodes) );
// nodeid tuple list
std::map<size_t, std::vector<ABCTuple> > container1d;
std::map<size_t, std::vector<ABCTuple> > container2d;
std::map<size_t, std::vector<ABCTuple> > container3d;
// build container-n-d's
for (auto t: allTuples) {
// on which node(s) are the tuple elements located?
// put them into the right container
auto nt = getTupleNodes(t, nNodes);
if ( nt.size() == 1) container1d[nt[0]].push_back(t);
if ( nt.size() == 2) container2d[nt[0] + nNodes*nt[1]].push_back(t);
if ( nt.size() == 3)
container3d[nt[0] + nNodes*nt[1] + nNodes*nNodes*nt[2]].push_back(t);
}
// DISTRIBUTE 1-d containers
// every tuple which is only located at one node belongs to this node
{
auto const& tuplesVec = container1d[info.nodeId];
nodeTuples.resize(tuplesVec.size());
std::copy(tuplesVec.begin(), tuplesVec.end(), nodeTuples.begin());
}
// DISTRIBUTE 2-d containers
//the tuples which are located at two nodes are half/half given to these nodes
for (auto &m: container2d) {
size_t idx = m.first%nNodes;
size_t idy = m.first/nNodes;
size_t myNode = idx;
// either idx or idy is my node
if (idx != info.nodeId && idy != info.nodeId) continue;
if (idy == info.nodeId) myNode = idy;
auto tuplesVec = m.second;
auto n = tuplesVec.size() / 2;
auto size = nodeTuples.size();
if (myNode == idx) {
nodeTuples.resize(size + n);
std::copy(tuplesVec.begin(),
tuplesVec.begin() + n,
nodeTuples.begin() + size);
} else {
auto ny = tuplesVec.size() - n;
nodeTuples.resize(size + ny);
std::copy(tuplesVec.begin() + n,
tuplesVec.end(),
nodeTuples.begin() + size);
}
}
// DISTRIBUTE 3-d containers
// similar game for the tuples which belong to three different nodes
for (auto m: container3d){
auto tuplesVec = m.second;
auto idx = m.first%nNodes;
auto idy = (m.first/nNodes)%nNodes;
auto idz = m.first/nNodes/nNodes;
if (idx != info.nodeId && idy != info.nodeId && idz != info.nodeId) continue;
size_t nx = tuplesVec.size() / 3;
size_t n, nbegin, nend;
if (info.nodeId == idx) {
n = nx;
nbegin = 0;
nend = n;
} else if (info.nodeId == idy) {
n = nx;
nbegin = n;
nend = n + n;
} else {
n = tuplesVec.size() - 2 * nx;
nbegin = 2 * nx;
nend = 2 * nx + n;
}
auto size = nodeTuples.size();
nodeTuples.resize(size + n);
std::copy(tuplesVec.begin() + nbegin,
tuplesVec.begin() + nend,
nodeTuples.begin() + size);
}
// sort part of group-and-sort algorithm
// every tuple on a given node is sorted in a way that
// the 'home elements' are the fastest indices.
// 1:yyy 2:yyn(x) 3:yny(x) 4:ynn(x) 5:nyy 6:nyn(x) 7:nny 8:nnn
size_t n = info.nodeId;
for (auto &nt: nodeTuples){
if ( isOnNode(nt[0], nNodes) == n ){ // 1234
if ( isOnNode(nt[2], nNodes) != n ){ // 24
size_t x(nt[0]); nt[0] = nt[2]; nt[2] = x; // switch first and last
}
else if ( isOnNode(nt[1], nNodes) != n){ // 3
size_t x(nt[0]); nt[0] = nt[1]; nt[1] = x; // switch first two
}
} else {
if ( isOnNode(nt[1], nNodes) == n // 56
&& isOnNode(nt[2], nNodes) != n){ // 6
size_t x(nt[1]); nt[1] = nt[2]; nt[2] = x; // switch last two
}
}
}
//now we sort the list of tuples
std::sort(nodeTuples.begin(), nodeTuples.end());
// we bring the tuples abc back in the order a<b<c
for (auto &t: nodeTuples) std::sort(t.begin(), t.end());
return nodeTuples;
}
//determine which element has to be fetched from sources for the next iteration
std::vector<size_t> fetchElement(ABCTuple cur, ABCTuple suc){
std::vector<size_t> result;
ABCTuple inter;
std::sort(cur.begin(), cur.end());
std::sort(suc.begin(), suc.end());
std::array<size_t,3>::iterator rit, cit, sit;
cit = std::unique(cur.begin(), cur.end());
sit = std::unique(suc.begin(), suc.end());
rit = std::set_difference(suc.begin(), sit, cur.begin(), cit, inter.begin());
result.resize(rit - inter.begin());
std::copy(inter.begin(), rit, result.begin());
return result;
}
// Distribution:1 ends here
// [[file:../../atrip.org::*Main][Main:1]]
std::vector<ABCTuple> main(MPI_Comm universe, size_t Nv) {
int rank, np;
MPI_Comm_rank(universe, &rank);
MPI_Comm_size(universe, &np);
std::vector<ABCTuple> result;
const auto nodeNames(getNodeNames(universe));
auto nodeNamesUnique(nodeNames);
{
// std::unique only removes consecutive duplicates, so sort first
std::sort(nodeNamesUnique.begin(), nodeNamesUnique.end());
const auto& last = std::unique(nodeNamesUnique.begin(),
nodeNamesUnique.end());
nodeNamesUnique.erase(last, nodeNamesUnique.end());
}
// we pick one rank from every node
auto const nodeInfos = getNodeInfos(nodeNames);
size_t const nNodes = nodeNamesUnique.size();
// Only the local root rank of every node (localRank == 0) computes
// the distribution for its node
bool const makeDistribution = nodeInfos[rank].localRank == 0;
std::vector<ABCTuple>
nodeTuples = makeDistribution
? specialDistribution(Info{ nNodes
, Nv
, np
, nodeInfos[rank].nodeId
},
getTuplesList(Nv))
: std::vector<ABCTuple>()
;
// now we have to send the data from **one** rank on each node
// to all other ranks of this node
const
int color = nodeInfos[rank].nodeId
, key = nodeInfos[rank].localRank
;
MPI_Comm INTRA_COMM;
MPI_Comm_split(universe, color, key, &INTRA_COMM);
// Main:1 ends here
// [[file:../../atrip.org::*Main][Main:2]]
const size_t
tuplesPerRankLocal
= nodeTuples.size() / nodeInfos[rank].ranksPerNode
+ size_t(nodeTuples.size() % nodeInfos[rank].ranksPerNode != 0)
;
size_t tuplesPerRankGlobal;
MPI_Reduce(&tuplesPerRankLocal,
&tuplesPerRankGlobal,
1,
MPI_UINT64_T,
MPI_MAX,
0,
universe);
MPI_Bcast(&tuplesPerRankGlobal,
1,
MPI_UINT64_T,
0,
universe);
// Main:2 ends here
// [[file:../../atrip.org::*Main][Main:3]]
size_t const totalTuplesLocal
= tuplesPerRankLocal
* nodeInfos[rank].ranksPerNode;
if (makeDistribution)
nodeTuples.insert(nodeTuples.end(),
totalTuplesLocal - nodeTuples.size(),
FAKE_TUPLE);
// Main:3 ends here
// [[file:../../atrip.org::*Main][Main:4]]
{
std::vector<int> const
sendCounts(nodeInfos[rank].ranksPerNode, tuplesPerRankLocal);
std::vector<int>
displacements(nodeInfos[rank].ranksPerNode);
// rank k of the node gets the block starting at k * tuplesPerRankLocal
for (size_t k = 0; k < displacements.size(); k++) {
displacements[k] = int(k * tuplesPerRankLocal);
}
// important!
result.resize(tuplesPerRankLocal);
// construct mpi type for abctuple
MPI_Datatype MPI_ABCTUPLE;
// ABCTuple has a fixed size; do not index nodeTuples here, since it
// is empty on the non-root ranks of the node
MPI_Type_vector(ABCTuple{}.size(), 1, 1, MPI_UINT64_T, &MPI_ABCTUPLE);
MPI_Type_commit(&MPI_ABCTUPLE);
MPI_Scatterv(nodeTuples.data(),
sendCounts.data(),
displacements.data(),
MPI_ABCTUPLE,
result.data(),
tuplesPerRankLocal,
MPI_ABCTUPLE,
0,
INTRA_COMM);
// free type
MPI_Type_free(&MPI_ABCTUPLE);
}
// Main:4 ends here
// [[file:../../atrip.org::*Main][Main:5]]
result.insert(result.end(),
tuplesPerRankGlobal - result.size(),
FAKE_TUPLE);
return result;
}
// Main:5 ends here
// [[file:../../atrip.org::*Epilog][Epilog:1]]
}
// Epilog:1 ends here
// [[file:../../atrip.org::*Epilog][Epilog:1]]
}
// Epilog:1 ends here