Add group and sort algorithm, it compiles at least

parent 944e93dc33
commit 65f804e637

.gitignore (vendored) | 4

@@ -1,6 +1,10 @@
.emacs
doc/doxygen/
extern
lib
*.o
*.d
a.out
*~
config.mk

atrip.org | 502

@@ -1400,6 +1400,16 @@ as well as their distribution to nodes and cores.
#include <array>
#include <numeric>

// TODO: remove some
#include <stdio.h>
#include <math.h>
#include <algorithm>
#include <map>
#include <cassert>
#include <chrono>
#include <climits>
#include <mpi.h>

#include <atrip/Utils.hpp>
#include <atrip/Debug.hpp>

@@ -1491,6 +1501,498 @@ getABCRange(size_t np, size_t rank, ABCTuples const& tuplesList) {
#+end_src

*** Group and sort list

**** Prolog :noexport:
#+begin_src c++ :tangle (atrip-tuples-h)
namespace group_and_sort {
#+end_src

**** Node information

- nodeList ::
  List of hostnames, one entry per rank, i.e., of size \( N_p \);
  its distinct entries are the \( N_n \) nodes in use.
- nodeInfos ::
  List of (hostname, local rank id) of size \( N_p \), i.e., one entry
  per rank, where the local rank id runs from 0 to the number of ranks
  on the node minus one (e.g., 0 to 47 on a 48-rank node).

=getNodeNames= gathers the processor name of every rank, i.e., the
resulting vector has one entry per rank, and its distinct entries
name the nodes in use.
#+begin_src c++ :tangle (atrip-tuples-h)
std::vector<std::string> getNodeNames(MPI_Comm comm){
  int rank, np;
  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &np);

  std::vector<std::string> nodeList(np);
  char nodeName[MPI_MAX_PROCESSOR_NAME];
  // concatenated names of all ranks; a std::vector instead of a
  // variable-length array, which is not standard C++
  std::vector<char> nodeNames(np * MPI_MAX_PROCESSOR_NAME);
  std::vector<int> nameLengths(np)
                 , off(np)
                 ;
  int nameLength;
  MPI_Get_processor_name(nodeName, &nameLength);
  MPI_Allgather(&nameLength,
                1,
                MPI_INT,
                nameLengths.data(),
                1,
                MPI_INT,
                comm);
  // exclusive prefix sum of the name lengths gives the offsets
  for (int i(1); i < np; i++)
    off[i] = off[i-1] + nameLengths[i-1];
  MPI_Allgatherv(nodeName,
                 nameLengths[rank],
                 MPI_BYTE,
                 nodeNames.data(),
                 nameLengths.data(),
                 off.data(),
                 MPI_BYTE,
                 comm);
  for (int i(0); i < np; i++) {
    std::string const s(&nodeNames[off[i]], nameLengths[i]);
    nodeList[i] = s;
  }
  return nodeList;
}
#+end_src

=getNodeInfos= computes, for every rank, its node id, its local rank
within the node and the number of ranks on its node.

#+begin_src c++ :tangle (atrip-tuples-h)
struct RankInfo {
  const std::string name;
  const size_t nodeId;
  const size_t globalRank;
  const size_t localRank;
  const size_t ranksPerNode;
};

std::vector<RankInfo>
getNodeInfos(std::vector<std::string> const& nodeNames) {
  std::vector<RankInfo> result;
  auto uniqueNames = nodeNames;
  {
    std::sort(uniqueNames.begin(), uniqueNames.end());
    auto const& last = std::unique(uniqueNames.begin(), uniqueNames.end());
    uniqueNames.erase(last, uniqueNames.end());
  }
  const auto index = [&uniqueNames](std::string const& s) {
    auto const& it = std::find(uniqueNames.begin(), uniqueNames.end(), s);
    return std::distance(uniqueNames.begin(), it);
  };
  std::vector<size_t> localRanks(uniqueNames.size(), 0);
  size_t rank = 0;
  for (auto const& name: nodeNames) {
    const size_t nodeId = index(name);
    result.push_back({name,
                      nodeId,
                      rank++,
                      localRanks[nodeId]++,
                      // std::count returns a signed type; cast to
                      // avoid narrowing in the braced initializer
                      size_t(std::count(nodeNames.begin(),
                                        nodeNames.end(),
                                        name))
                      });
  }
  return result;
}
#+end_src
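
As a small sanity check, the following sketch (illustration only, not
tangled; =checkNodeInfos= is our own helper name) exercises
=getNodeInfos= on a toy hostname list:

#+begin_src c++
#include <cassert>
#include <string>
#include <vector>

// Illustration only: assumes RankInfo and getNodeInfos from above.
void checkNodeInfos() {
  std::vector<std::string> const names = {"nodeA", "nodeB", "nodeA"};
  auto const infos = getNodeInfos(names);
  assert(infos[0].nodeId == 0);        // "nodeA" sorts first
  assert(infos[2].nodeId == 0);        // same node as rank 0
  assert(infos[2].localRank == 1);     // second rank on nodeA
  assert(infos[2].ranksPerNode == 2);  // nodeA hosts two ranks
  assert(infos[1].ranksPerNode == 1);  // nodeB hosts one rank
}
#+end_src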

**** Utils

#+begin_src c++ :tangle (atrip-tuples-h)
// Provides the node on which the slice-element is found.
// Right now we distribute the slices in a round robin fashion
// over the different nodes (NOTE: not mpi ranks but nodes).
size_t isOnNode(size_t tuple, size_t nodes) { return tuple % nodes; }


struct Info {
  size_t nNodes;
  size_t Nv;
  size_t np;
  size_t nodeId;
};


// return the node (or all nodes) where the elements of this
// tuple are located
std::vector<size_t> getTupleNodes(ABCTuple t, size_t nNodes) {
  std::vector<size_t> result;
  ABCTuple nTuple = { isOnNode(t[0], nNodes)
                    , isOnNode(t[1], nNodes)
                    , isOnNode(t[2], nNodes)
                    };
  std::sort(nTuple.begin(), nTuple.end());
  ABCTuple::iterator it = std::unique(nTuple.begin(), nTuple.end());
  result.resize(it - nTuple.begin());
  std::copy(nTuple.begin(), it, result.begin());
  return result;
}
#+end_src
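
For example, with three nodes the round-robin placement and the
deduplication in =getTupleNodes= behave as follows (illustration only,
not tangled; =checkTupleNodes= is ours):

#+begin_src c++
#include <cassert>

// Illustration only: assumes ABCTuple, isOnNode and getTupleNodes
// from above.
void checkTupleNodes() {
  assert(isOnNode(7, 3) == 1);                               // 7 % 3
  assert(getTupleNodes(ABCTuple{0, 3, 6}, 3).size() == 1);   // all on node 0
  assert(getTupleNodes(ABCTuple{0, 1, 5}, 3).size() == 3);   // nodes 0, 1, 2
}
#+end_src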

**** Distribution

Wording: a 'home element' is an element which is located on the given node.

1. we distribute the tuples such that each tuple has at least one 'home element'
2. we sort each tuple in a way that the 'home elements' are the fastest indices
3. we sort the list of tuples on every node
4. we reorder every tuple abc so that a<b<c holds again
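The implementation below groups the tuples by the set of nodes they
touch; a pair or triple of node ids is packed into a single map key in
mixed radix \( N_n \). A tiny sketch of the encoding (illustration
only, not tangled; =checkKeyEncoding= is ours):

#+begin_src c++
#include <cassert>

// Illustration only: container2d keys encode the sorted node pair
// (idx, idy) as idx + nNodes * idy; modulo and division decode it.
void checkKeyEncoding() {
  size_t const nNodes = 3, idx = 0, idy = 2;
  size_t const key = idx + nNodes * idy;  // == 6
  assert(key % nNodes == idx);            // first node id
  assert(key / nNodes == idy);            // second node id
}
#+end_src
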
#+begin_src c++ :tangle (atrip-tuples-h)
std::vector<ABCTuple>
specialDistribution(Info info, std::vector<ABCTuple> const& allTuples) {

  std::vector<ABCTuple> nodeTuples;
  size_t nNodes(info.nNodes);
  size_t np(info.np);
  size_t N(allTuples.size());
  size_t tuplePerNode( ceil( ((double)N) / nNodes) );  // currently unused

  // nodeid -> tuple list
  std::map<size_t, std::vector<ABCTuple> > container1d;
  std::map<size_t, std::vector<ABCTuple> > container2d;
  std::map<size_t, std::vector<ABCTuple> > container3d;

  // build container-n-d's
  for (auto t: allTuples) {
    // on which node(s) are the tuple elements located...
    // put them into the right container
    auto nt = getTupleNodes(t, nNodes);
    if ( nt.size() == 1) container1d[nt[0]].push_back(t);
    if ( nt.size() == 2) container2d[nt[0] + nNodes*nt[1]].push_back(t);
    if ( nt.size() == 3)
      container3d[nt[0] + nNodes*nt[1] + nNodes*nNodes*nt[2]].push_back(t);
  }

  // DISTRIBUTE 1-d containers
  // every tuple which is only located at one node belongs to this node
  {
    auto const& tuplesVec = container1d[info.nodeId];
    nodeTuples.resize(tuplesVec.size());
    std::copy(tuplesVec.begin(), tuplesVec.end(), nodeTuples.begin());
  }

  // DISTRIBUTE 2-d containers
  // the tuples which are located at two nodes are given half/half
  // to these nodes
  for (auto &m: container2d) {
    size_t idx = m.first%nNodes;
    size_t idy = m.first/nNodes;
    size_t myNode = idx;

    // either idx or idy is my node
    if (idx != info.nodeId && idy != info.nodeId) continue;
    if (idy == info.nodeId) myNode = idy;

    auto tuplesVec = m.second;
    auto n = tuplesVec.size() / 2;
    auto size = nodeTuples.size();
    if (myNode == idx) {
      // first half
      nodeTuples.resize(size + n);
      std::copy(tuplesVec.begin(),
                tuplesVec.begin() + n,
                nodeTuples.begin() + size);
    } else {
      // second half, including the odd remainder
      auto ny = tuplesVec.size() - n;
      nodeTuples.resize(size + ny);
      std::copy(tuplesVec.begin() + n,
                tuplesVec.end(),
                nodeTuples.begin() + size);
    }

  }

  // DISTRIBUTE 3-d containers
  // similar game for the tuples which belong to three different nodes
  for (auto m: container3d){
    auto tuplesVec = m.second;
    auto idx = m.first%nNodes;
    auto idy = (m.first/nNodes)%nNodes;
    auto idz = m.first/nNodes/nNodes;
    if (idx != info.nodeId && idy != info.nodeId && idz != info.nodeId) continue;

    size_t nx = tuplesVec.size() / 3;
    size_t n, nbegin, nend;
    if (info.nodeId == idx) {         // first third
      n = nx;
      nbegin = 0;
      nend = n;
    } else if (info.nodeId == idy) {  // second third
      n = nx;
      nbegin = n;
      nend = n + n;
    } else {                          // last third plus the remainder
      n = tuplesVec.size() - 2 * nx;
      nbegin = 2 * nx;
      nend = 2 * nx + n;
    }

    auto size = nodeTuples.size();
    nodeTuples.resize(size + n);
    std::copy(tuplesVec.begin() + nbegin,
              tuplesVec.begin() + nend,
              nodeTuples.begin() + size);

  }


  // Sort part of the group-and-sort algorithm: every tuple on a given
  // node is reordered so that its 'home elements' sit at the back of
  // the tuple, i.e., are the fastest running indices.  The patterns
  // below state for each of the three elements whether it is on this
  // node (y) or not (n); the cases marked (x) need a swap:
  // 1:yyy 2:yyn(x) 3:yny(x) 4:ynn(x) 5:nyy 6:nyn(x) 7:nny 8:nnn
  size_t n = info.nodeId;
  for (auto &nt: nodeTuples){
    if ( isOnNode(nt[0], nNodes) == n ){ // cases 1-4
      if ( isOnNode(nt[2], nNodes) != n ){ // cases 2 and 4
        size_t x(nt[0]); nt[0] = nt[2]; nt[2] = x; // switch first and last
      }
      else if ( isOnNode(nt[1], nNodes) != n){ // case 3
        size_t x(nt[0]); nt[0] = nt[1]; nt[1] = x; // switch first two
      }
    } else {
      if ( isOnNode(nt[1], nNodes) == n // cases 5 and 6
        && isOnNode(nt[2], nNodes) != n){ // case 6
        size_t x(nt[1]); nt[1] = nt[2]; nt[2] = x; // switch last two
      }
    }
  }
  // now we sort the list of tuples
  std::sort(nodeTuples.begin(), nodeTuples.end());
  // we bring the tuples abc back into the order a<b<c
  for (auto &t: nodeTuples) std::sort(t.begin(), t.end());

  return nodeTuples;

}

// determine which elements have to be fetched from the sources for
// the next iteration, i.e., the elements of suc that are not in cur
std::vector<size_t> fetchElement(ABCTuple cur, ABCTuple suc){
  std::vector<size_t> result;
  ABCTuple inter;
  std::sort(cur.begin(), cur.end());
  std::sort(suc.begin(), suc.end());
  std::array<size_t,3>::iterator rit, cit, sit;
  cit = std::unique(cur.begin(), cur.end());
  sit = std::unique(suc.begin(), suc.end());
  rit = std::set_difference(suc.begin(), sit, cur.begin(), cit, inter.begin());
  result.resize(rit - inter.begin());
  std::copy(inter.begin(), rit, result.begin());
  return result;
}
#+end_src
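
A small worked example of =fetchElement= (illustration only, not
tangled; =checkFetchElement= is ours):

#+begin_src c++
#include <cassert>

// Illustration only: assumes ABCTuple and fetchElement from above.
void checkFetchElement() {
  auto const toFetch = fetchElement(ABCTuple{1, 2, 3}, ABCTuple{2, 3, 5});
  // 2 and 3 are already present, only 5 has to be fetched
  assert(toFetch.size() == 1 && toFetch[0] == 5);
}
#+end_src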

**** Main

The main routine should return the list of tuples to be handled by the
current rank.

Let \( N_p \) be the number of ranks or processes.
Let \( N_n \) be the number of nodes or sockets.

Then we have, for instance, the following situation:

#+begin_example
Global rank | 0 1 2 3 4 5 6 7 8
nodeId      | 0 1 0 1 1 0 2 2 2
Local rank  | 0 0 1 1 2 2 0 1 2
intra color | 0 1 0 1 1 0 2 2 2
key         | global rank
#+end_example
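
The nodeId and local rank rows are exactly what =getNodeInfos=
computes; a toy check (illustration only, not tangled; =checkTable= is
ours):

#+begin_src c++
#include <cassert>
#include <string>
#include <vector>

// Illustration only: nine ranks spread over three nodes as in the
// table above; assumes getNodeInfos from above.
void checkTable() {
  std::vector<std::string> const names =
    {"n0", "n1", "n0", "n1", "n1", "n0", "n2", "n2", "n2"};
  auto const infos = getNodeInfos(names);
  assert(infos[4].nodeId == 1);     // global rank 4 sits on node 1
  assert(infos[4].localRank == 2);  // third rank on node 1
  assert(infos[8].localRank == 2);  // third rank on node 2
}
#+end_src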

#+begin_src c++ :tangle (atrip-tuples-h)
std::vector<ABCTuple> main(MPI_Comm universe, size_t Nv) {

  int rank, np;
  MPI_Comm_rank(universe, &rank);
  MPI_Comm_size(universe, &np);

  std::vector<ABCTuple> result;

  const auto nodeNames(getNodeNames(universe));
  auto nodeNamesUnique(nodeNames);
  {
    // sort first: std::unique only removes *adjacent* duplicates
    std::sort(nodeNamesUnique.begin(), nodeNamesUnique.end());
    const auto& last = std::unique(nodeNamesUnique.begin(),
                                   nodeNamesUnique.end());
    nodeNamesUnique.erase(last, nodeNamesUnique.end());
  }
  // we pick one rank from every node
  auto const nodeInfos = getNodeInfos(nodeNames);
  size_t const nNodes = nodeNamesUnique.size();

  // We want to construct a communicator which contains only one
  // rank per node
  bool const makeDistribution = nodeInfos[rank].localRank == 0;

  std::vector<ABCTuple>
    nodeTuples = makeDistribution
               ? specialDistribution(Info{ nNodes
                                         , Nv
                                         , size_t(np) // np is int
                                         , nodeInfos[rank].nodeId
                                         },
                                     getTuplesList(Nv))
               : std::vector<ABCTuple>()
               ;


  // now we have to send the data from **one** rank on each node
  // to all other ranks of this node
  const
  int color = nodeInfos[rank].nodeId
    , key = nodeInfos[rank].localRank
    ;


  MPI_Comm INTRA_COMM;
  MPI_Comm_split(universe, color, key, &INTRA_COMM);
#+end_src
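
After the split, two ranks share =INTRA_COMM= exactly when they sit on
the same node, and the rank within =INTRA_COMM= coincides with the
local rank, since that is the split key. A sketch of this invariant
(illustration only, not tangled; =checkIntraComm= is ours):

#+begin_src c++
#include <cassert>
#include <mpi.h>

// Illustration only: with key = localRank and distinct keys per
// color, the rank inside the intra-node communicator equals the
// local rank, e.g. checkIntraComm(INTRA_COMM, nodeInfos[rank].localRank).
void checkIntraComm(MPI_Comm intraComm, size_t localRank) {
  int intraRank;
  MPI_Comm_rank(intraComm, &intraRank);
  assert(size_t(intraRank) == localRank);
}
#+end_src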

Every node has to distribute *at least*
=nodeTuples.size() / nodeInfos[rank].ranksPerNode=
tuples among its ranks.

We have to communicate this quantity among all nodes.

#+begin_src c++ :tangle (atrip-tuples-h)

  const size_t
    tuplesPerRankLocal
    = nodeTuples.size() / nodeInfos[rank].ranksPerNode
    + size_t(nodeTuples.size() % nodeInfos[rank].ranksPerNode != 0)
    ;

  size_t tuplesPerRankGlobal;

  MPI_Reduce(&tuplesPerRankLocal,
             &tuplesPerRankGlobal,
             1,
             MPI_UINT64_T,
             MPI_MAX,
             0,
             universe);

  MPI_Bcast(&tuplesPerRankGlobal,
            1,
            MPI_UINT64_T,
            0,
            universe);
#+end_src
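
The two summands above implement a ceiling division: integer division,
plus one whenever there is a remainder. For instance:

#+begin_src c++
// Illustration only: a / b + (a % b != 0) is ceil(a / b) for
// positive integers.
static_assert(10 / 4 + (10 % 4 != 0) == 3, "ceil(10/4) == 3");
static_assert( 8 / 4 + ( 8 % 4 != 0) == 2, "ceil(8/4) == 2");
#+end_src

As a side note, the =MPI_Reduce= plus =MPI_Bcast= pair computes the
same value that a single =MPI_Allreduce= with =MPI_MAX= would.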

Now we know how many tuples every rank has to hold, namely
=tuplesPerRankGlobal=.

Before that, however, the tuples in =nodeTuples= have to be sent from
the local root rank of every node to all the ranks of that node, and
we have to make sure that every rank inside a given node gets the same
amount of tuples, namely =tuplesPerRankLocal=, so that the total
number of tuples on our node is
=tuplesPerRankLocal * nodeInfos[rank].ranksPerNode=.
Up to now this need not be the case due to divisibility issues.

So far we have exactly =nodeTuples.size()= tuples; we enforce the
condition above by resizing, adding some fake tuples at the end as
padding.

#+begin_src c++ :tangle (atrip-tuples-h)
  size_t const totalTuplesLocal
    = tuplesPerRankLocal * nodeInfos[rank].ranksPerNode;

  if (makeDistribution)
    nodeTuples.insert(nodeTuples.end(),
                      totalTuplesLocal - nodeTuples.size(),
                      FAKE_TUPLE);
#+end_src

The next step is sending the tuples from the local root rank to the
other ranks in the node; this we do with the MPI function
=MPI_Scatterv=.
Every rank gets =tuplesPerRankLocal= tuples, and after the padding the
size of the =nodeTuples= vector is divisible by the number of ranks on
our node.
Therefore, the =displacements= are simply the vector

\begin{equation*}
\left\{
  k \cdot \mathrm{tuplesPerRankLocal}
  \mid
  k \in
  \left\{ 0
        , \ldots
        , \#\text{ranks in node} - 1
  \right\}
\right\}
\end{equation*}

and the =sendCounts= vector is simply the constant vector
=tuplesPerRankLocal= of size =ranksPerNode=.
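
For concreteness, a toy layout (illustration only, not tangled;
=checkScatterLayout= is ours):

#+begin_src c++
#include <cassert>
#include <vector>

// Illustration only: scatter layout for 10 tuples on a node hosting
// 4 ranks.
void checkScatterLayout() {
  size_t const nTuples = 10, ranksPerNode = 4;
  size_t const perRank = nTuples / ranksPerNode
                       + size_t(nTuples % ranksPerNode != 0);  // == 3
  std::vector<int> displacements(ranksPerNode);
  for (size_t k = 0; k < ranksPerNode; k++)
    displacements[k] = int(k * perRank);
  assert(perRank == 3);
  assert(displacements == (std::vector<int>{0, 3, 6, 9}));
  // the node pads its list from 10 to perRank * ranksPerNode = 12
  // tuples with FAKE_TUPLE before scattering
}
#+end_src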

#+begin_src c++ :tangle (atrip-tuples-h)
  {
    std::vector<int> const
      sendCounts(nodeInfos[rank].ranksPerNode, tuplesPerRankLocal);

    std::vector<int>
      displacements(nodeInfos[rank].ranksPerNode);

    // the displacement of local rank k is k * tuplesPerRankLocal, in
    // units of MPI_ABCTUPLE; note that std::iota would produce a
    // stride of one instead
    for (size_t k = 0; k < displacements.size(); k++)
      displacements[k] = int(k * tuplesPerRankLocal);

    // important!
    result.resize(tuplesPerRankLocal);

    // construct an mpi type for ABCTuple: three contiguous uint64's
    // (we do not use nodeTuples[0].size() here, since nodeTuples is
    // empty on the non-root ranks)
    MPI_Datatype MPI_ABCTUPLE;
    MPI_Type_vector(3, 1, 1, MPI_UINT64_T, &MPI_ABCTUPLE);
    MPI_Type_commit(&MPI_ABCTUPLE);

    MPI_Scatterv(nodeTuples.data(),
                 sendCounts.data(),
                 displacements.data(),
                 MPI_ABCTUPLE,
                 result.data(),
                 tuplesPerRankLocal,
                 MPI_ABCTUPLE,
                 0,
                 INTRA_COMM);

    // free type
    MPI_Type_free(&MPI_ABCTUPLE);

  }
#+end_src

Finally we make sure that the size of the result is the same on every
rank in the universe communicator, inserting fake tuples where needed.

#+begin_src c++ :tangle (atrip-tuples-h)

  result.insert(result.end(),
                tuplesPerRankGlobal - result.size(),
                FAKE_TUPLE);

  return result;

}
#+end_src

**** Epilog :noexport:
#+begin_src c++ :tangle (atrip-tuples-h)
}
#+end_src


*** Epilog :noexport:
#+begin_src c++ :tangle (atrip-tuples-h)
}