// atrip/src/atrip/Atrip.cxx
// Copyright 2022 Alejandro Gallo
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// [[file:~/cuda/atrip/atrip.org::*Main][Main:1]]
#include <iomanip>
#include <atrip/Atrip.hpp>
#include <atrip/Utils.hpp>
#include <atrip/Equations.hpp>
#include <atrip/SliceUnion.hpp>
#include <atrip/Unions.hpp>
#include <atrip/Checkpoint.hpp>
using namespace atrip;
#if defined(HAVE_CUDA)
#include <atrip/CUDA.hpp>
#endif
template <typename F> bool RankMap<F>::RANK_ROUND_ROBIN;
template bool RankMap<double>::RANK_ROUND_ROBIN;
template bool RankMap<Complex>::RANK_ROUND_ROBIN;
size_t Atrip::rank;
size_t Atrip::np;
#if defined(HAVE_CUDA)
typename Atrip::CudaContext Atrip::cuda;
typename Atrip::KernelDimensions Atrip::kernelDimensions;
#endif
MPI_Comm Atrip::communicator;
Timings Atrip::chrono;
// user printing block
IterationDescriptor IterationDescription::descriptor;
void atrip::registerIterationDescriptor(IterationDescriptor d) {
IterationDescription::descriptor = d;
}
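// Initialize Atrip's global MPI state; this must be called
// collectively, once, before any call to Atrip::run.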
void Atrip::init(MPI_Comm world) {
Atrip::communicator = world;
// use int temporaries instead of writing through a size_t pointer
// casted to int*, which is undefined behavior
int rank, np;
MPI_Comm_rank(world, &rank);
MPI_Comm_size(world, &np);
Atrip::rank = (size_t)rank;
Atrip::np = (size_t)np;
}
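// Drive the main loop over ABC tuples and return the total energy,
// MPI-reduced over all ranks. A minimal usage sketch (hypothetical
// setup; the Input<F> fields named here are the ones this file reads):
//
// Atrip::init(MPI_COMM_WORLD);
// Atrip::Input<double> in; // fill ei, ea, Tph, Tpphh, Vpphh, Vhhhp, Vppph, ...
// Atrip::Output out = Atrip::run(in); // collective call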
template <typename F>
Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
const size_t np = Atrip::np;
const size_t rank = Atrip::rank;
MPI_Comm universe = Atrip::communicator;
const size_t No = in.ei->lens[0];
const size_t Nv = in.ea->lens[0];
LOG(0,"Atrip") << "No: " << No << "\n";
LOG(0,"Atrip") << "Nv: " << Nv << "\n";
LOG(0,"Atrip") << "np: " << np << "\n";
#if defined(HAVE_CUDA)
int ngcards;
_CHECK_CUDA_SUCCESS("initializing cuda",
cuInit(0));
_CHECK_CUDA_SUCCESS("getting device count",
cuDeviceGetCount(&ngcards));
const auto clusterInfo = getClusterInfo(Atrip::communicator);
LOG(0,"Atrip") << "ngcards: " << ngcards << "\n";
if (clusterInfo.ranksPerNode > ngcards) {
const auto msg
= _FORMAT("ATRIP: You are running on more ranks per node than the number of graphics cards\n"
"You have %d cards at your disposal\n", ngcards);
std::cerr << msg;
throw msg;
} else if (clusterInfo.ranksPerNode < ngcards) {
const auto msg
= _FORMAT("You have %d cards at your disposal.\n"
"You will be using only %d, i.e., the number of ranks per node\n",
ngcards, clusterInfo.ranksPerNode);
std::cerr << msg;
}
for (size_t _rank = 0; _rank < np; _rank++) {
if (rank == _rank) {
CUcontext ctx;
CUdevice dev;
CUdevprop prop;
struct { struct { size_t free, total; } avail; size_t total; } memory;
char *name = (char*)malloc(256);
// - TODO :: we should check that the assignment of graphics cards
// to nodes works as expected, i.e., the local ranks of node k should
// get the cards 0 to ngcards via the formula =rank % ngcards=.
// set current device
_CHECK_CUDA_SUCCESS("getting device for index <rank>",
cuDeviceGet(&dev, rank % ngcards));
_CHECK_CUDA_SUCCESS("creating a cuda context",
cuCtxCreate(&ctx, 0, dev));
_CHECK_CUDA_SUCCESS("setting the context",
cuCtxSetCurrent(ctx));
// get information of the device
_CHECK_CUDA_SUCCESS("getting properties of current device",
cuDeviceGetProperties(&prop, dev));
_CHECK_CUDA_SUCCESS("getting memory information",
cuMemGetInfo(&memory.avail.free, &memory.avail.total));
_CHECK_CUDA_SUCCESS("getting name",
cuDeviceGetName(name, 256, dev));
_CHECK_CUDA_SUCCESS("getting total memory",
cuDeviceTotalMem(&memory.total, dev));
printf("\n"
"CUDA CARD RANK %d\n"
"=================\n"
"\tnumber: %1$ld\n"
"\tname: %s\n"
"\tMem. clock rate (KHz): %ld\n"
"\tShared Mem Per Block (KB): %f\n"
"\tAvail. Free/Total mem (GB): %f/%f\n"
"\tFree memory (GB): %f\n"
"\n",
Atrip::rank,
name,
prop.clockRate,
prop.sharedMemPerBlock / 1024.0,
memory.avail.free / 1024.0 / 1024.0 / 1024.0 ,
memory.avail.total / 1024.0 / 1024.0 / 1024.0,
memory.total / 1024.0 / 1024.0 / 1024.0
);
std::free((void*)name);
_CHECK_CUBLAS_SUCCESS("creating a cublas handle",
cublasCreate(&Atrip::cuda.handle));
}
MPI_Barrier(universe);
}
if (in.oooThreads > 0) {
Atrip::kernelDimensions.ooo.threads = in.oooThreads;
}
if (in.oooBlocks > 0) {
Atrip::kernelDimensions.ooo.blocks = in.oooBlocks;
}
if (Atrip::kernelDimensions.ooo.threads <= 0 ||
Atrip::kernelDimensions.ooo.blocks <= 0) {
// default: ceil(No / 32) blocks of 32 threads each
Atrip::kernelDimensions.ooo.blocks = No / 32 + (No % 32 ? 1 : 0);
Atrip::kernelDimensions.ooo.threads = 32;
}
LOG(0,"Atrip") << "ooo blocks: "
<< Atrip::kernelDimensions.ooo.blocks << "\n";
LOG(0,"Atrip") << "ooo threads per block: "
<< Atrip::kernelDimensions.ooo.threads << "\n";
#endif
// allocate the three scratches (see Piecuch et al.);
// we need local copies of the following tensors on every rank
std::vector<F> _epsi(No), _epsa(Nv), _Tai(No * Nv);
// copy the data from the tensors into the vectors
in.ei->read_all(_epsi.data());
in.ea->read_all(_epsa.data());
in.Tph->read_all(_Tai.data());
// TODO: free the memory pointers at the end of the algorithm
DataPtr<F> Tijk, Zijk;
#if defined(HAVE_CUDA)
DataPtr<F> Tai, epsi, epsa;
// TODO: free the memory pointers at the end of the algorithm
_CHECK_CUDA_SUCCESS("Tai",
cuMemAlloc(&Tai, sizeof(F) * _Tai.size()));
_CHECK_CUDA_SUCCESS("epsi",
cuMemAlloc(&epsi, sizeof(F) * _epsi.size()));
_CHECK_CUDA_SUCCESS("epsa",
cuMemAlloc(&epsa, sizeof(F) * _epsa.size()));
_CHECK_CUDA_SUCCESS("memcpy Tai",
cuMemcpyHtoD(Tai, (void*)_Tai.data(), sizeof(F) * _Tai.size()));
_CHECK_CUDA_SUCCESS("memcpy epsi",
cuMemcpyHtoD(epsi,(void*)_epsi.data(), sizeof(F) * _epsi.size()));
_CHECK_CUDA_SUCCESS("memcpy epsa",
cuMemcpyHtoD(epsa, (void*)_epsa.data(), sizeof(F) * _epsa.size()));
_CHECK_CUDA_SUCCESS("Tijk",
cuMemAlloc(&Tijk, sizeof(F) * No * No * No));
_CHECK_CUDA_SUCCESS("Zijk",
cuMemAlloc(&Zijk, sizeof(F) * No * No * No));
#else
std::vector<F> &Tai = _Tai, &epsi = _epsi, &epsa = _epsa;
Zijk = (DataFieldType<F>*)malloc(No*No*No * sizeof(DataFieldType<F>));
Tijk = (DataFieldType<F>*)malloc(No*No*No * sizeof(DataFieldType<F>));
#endif
RankMap<F>::RANK_ROUND_ROBIN = in.rankRoundRobin;
if (RankMap<F>::RANK_ROUND_ROBIN) {
LOG(0,"Atrip") << "Doing rank round robin slices distribution\n";
} else {
LOG(0,"Atrip")
<< "Doing node > local rank round robin slices distribution\n";
}
// COMMUNICATOR CONSTRUCTION ========================================={{{1
//
// Construct a new communicator living only on a single rank
int child_size = 1, child_rank = 0;
const int color = rank / child_size, crank = rank % child_size;
MPI_Comm child_comm;
if (np == 1) {
child_comm = universe;
} else {
MPI_Comm_split(universe, color, crank, &child_comm);
MPI_Comm_rank(child_comm, &child_rank);
MPI_Comm_size(child_comm, &child_size);
}
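// note: with child_size == 1 every rank ends up in its own child
// communicator; the slice unions below receive it together with the
// global communicator.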
// BUILD SLICES PARAMETRIZED BY NV x NV =============================={{{1
WITH_CHRONO("nv-nv-slices",
LOG(0,"Atrip") << "building NV x NV slices\n";
ABPH<F> abph(*in.Vppph, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
ABHH<F> abhh(*in.Vpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
TABHH<F> tabhh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
)
// delete Vppph so that its memory does not contribute to the
// high-water mark (HWM) while the NV slices are being allocated
if (in.deleteVppph) {
delete in.Vppph;
}
// BUILD SLICES PARAMETRIZED BY NV ==================================={{{1
WITH_CHRONO("nv-slices",
LOG(0,"Atrip") << "building NV slices\n";
TAPHH<F> taphh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
HHHA<F> hhha(*in.Vhhhp, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
)
// all tensors
std::vector< SliceUnion<F>* > unions = {&taphh, &hhha, &abph, &abhh, &tabhh};
// get tuples for the current rank
TuplesDistribution *distribution;
if (in.tuplesDistribution == Atrip::Input<F>::TuplesDistribution::NAIVE) {
LOG(0,"Atrip") << "Using the naive distribution\n";
distribution = new NaiveDistribution();
} else {
LOG(0,"Atrip") << "Using the group-and-sort distribution\n";
distribution = new group_and_sort::Distribution();
}
LOG(0,"Atrip") << "BUILDING TUPLE LIST\n";
WITH_CHRONO("tuples:build",
auto const tuplesList = distribution->getTuples(Nv, universe);
)
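// tuplesList holds only this rank's share of tuples, so the global
// number of tuples is nIterations * np.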
const size_t nIterations = tuplesList.size();
{
LOG(0,"Atrip") << "#iterations: "
<< nIterations
<< "/"
<< nIterations * np
<< "\n";
}
const size_t iterationMod = (in.percentageMod > 0)
? nIterations * in.percentageMod / 100.0
: in.iterationMod;
const size_t iteration1Percent = nIterations * 0.01;
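// fake tuples pad the per-rank tuple lists to the same length so that
// all ranks stay in lockstep during communication; their numerical
// contribution is skipped further down.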
auto const isFakeTuple
= [&tuplesList, distribution](size_t const i) {
return distribution->tupleIsFake(tuplesList[i]);
};
using Database = typename Slice<F>::Database;
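// gather every rank's local database for the given tuple into one
// global database: entries [r * ldb.size(), (r + 1) * ldb.size()) of
// the result belong to rank r.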
auto communicateDatabase
= [&unions, np](ABCTuple const& abc, MPI_Comm const& c) -> Database {
WITH_CHRONO("db:comm:type:do",
auto MPI_LDB_ELEMENT = Slice<F>::mpi::localDatabaseElement();
)
WITH_CHRONO("db:comm:ldb",
typename Slice<F>::LocalDatabase ldb;
for (auto const& tensor: unions) {
auto const& tensorDb = tensor->buildLocalDatabase(abc);
ldb.insert(ldb.end(), tensorDb.begin(), tensorDb.end());
}
)
// the local databases have the same size on every rank, and
// MPI_LDB_ELEMENT is a committed datatype, so the counts below are
// element counts, not byte counts
Database db(np * ldb.size(), ldb[0]);
WITH_CHRONO("oneshot-db:comm:allgather",
WITH_CHRONO("db:comm:allgather",
MPI_Allgather(ldb.data(),
ldb.size(),
MPI_LDB_ELEMENT,
db.data(),
ldb.size(),
MPI_LDB_ELEMENT,
c);
))
WITH_CHRONO("db:comm:type:free", MPI_Type_free(&MPI_LDB_ELEMENT);)
return db;
};
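// post the receives for the slices this rank needs (its stripe of the
// global database) and send the slices it owns to every other rank;
// the tags are derived from positions in the global database so that
// sends and receives match up.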
auto doIOPhase
= [&unions, &rank, &np, &universe] (Database const& db) {
const size_t localDBLength = db.size() / np;
size_t sendTag = 0, recvTag = rank * localDBLength;
// RECEIVE PHASE ======================================================
{
// At this point, we have already sent to everyone that fits
auto const* begin = &db[rank * localDBLength];
auto const* end = begin + localDBLength;
for (auto it = begin; it != end; ++it) {
recvTag++;
auto const& el = *it;
auto& u = unionByName(unions, el.name);
WITH_DBG std::cout
<< rank << ":r"
<< recvTag << " =>"
<< " «n" << el.name
<< ", t" << el.info.type
<< ", s" << el.info.state
<< "»"
<< " ⊙ {" << rank << "←" << el.info.from.rank
<< ", "
<< el.info.from.source << "}"
<< " ∴ {" << el.info.tuple[0]
<< ", "
<< el.info.tuple[1]
<< "}"
<< "\n"
;
WITH_CHRONO("db:io:recv",
u.receive(el.info, recvTag);
)
} // recv
}
// SEND PHASE =========================================================
for (size_t otherRank = 0; otherRank < np; otherRank++) {
auto const* begin = &db[otherRank * localDBLength];
auto const* end = begin + localDBLength;
for (auto it = begin; it != end; ++it) {
sendTag++;
typename Slice<F>::LocalDatabaseElement const& el = *it;
if (el.info.from.rank != rank) continue;
auto& u = unionByName(unions, el.name);
WITH_DBG std::cout
<< rank << ":s"
<< sendTag << " =>"
<< " «n" << el.name
<< ", t" << el.info.type
<< ", s" << el.info.state
<< "»"
<< " ⊙ {" << el.info.from.rank << "→" << otherRank
<< ", "
<< el.info.from.source << "}"
<< " ∴ {" << el.info.tuple[0]
<< ", "
<< el.info.tuple[1]
<< "}"
<< "\n"
;
WITH_CHRONO("db:io:send",
u.send(otherRank, el, sendTag);
)
} // send phase
} // otherRank
};
#if defined(HAVE_OCD) || defined(ATRIP_PRINT_TUPLES)
std::map<ABCTuple, double> tupleEnergies;
#endif
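// FLOP count of the doubles contribution per tuple, in GFLOP: six
// permutations of the ABC slices, each a contraction of
// No*No*No * (No + Nv) multiply-adds (2 FLOPs each), doubled once
// more for complex arithmetic.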
const double doublesFlops
= double(No) * double(No) * double(No)
* (double(No) + double(Nv))
* 2.0
* (traits::isComplex<F>() ? 2.0 : 1.0)
* 6.0
/ 1e9;
// START MAIN LOOP ======================================================{{{1
double energy(0.);
size_t first_iteration = 0;
Checkpoint c;
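// write a checkpoint either every fixed number of iterations or every
// given percentage of the total number of iterations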
const size_t checkpoint_mod
= in.checkpointAtEveryIteration != 0
? in.checkpointAtEveryIteration
: nIterations * in.checkpointAtPercentage / 100;
if (in.readCheckpointIfExists) {
std::ifstream fin(in.checkpointPath);
if (fin.is_open()) {
LOG(0, "Atrip") << "Reading checkpoint from "
<< in.checkpointPath << "\n";
c = read_checkpoint(fin);
first_iteration = (size_t)c.iteration;
if (first_iteration > nIterations) {
// TODO: throw an error here; a first_iteration bigger than
// nIterations means the checkpoint was probably written by a run
// with a different number of ranks
}
if (No != c.no) { /* TODO: warn about a mismatching No */ }
if (Nv != c.nv) { /* TODO: warn about a mismatching Nv */ }
// TODO: write warnings for the number of ranks and so on
if (Atrip::rank == 0) {
// the main loop accumulates the negative of the physical energy,
// whereas the energy in the checkpoint is always the correct
// physical one, hence the sign flip
energy = - (double)c.energy;
}
LOG(0, "Atrip") << "energy from checkpoint "
<< energy << "\n";
LOG(0, "Atrip") << "iteration from checkpoint "
<< first_iteration << "\n";
}
}
for (size_t i = first_iteration, iteration = first_iteration + 1;
i < tuplesList.size();
i++, iteration++) {
Atrip::chrono["iterations"].start();
// check overhead from chrono over all iterations
WITH_CHRONO("start:stop", {})
// check overhead of doing a barrier at the beginning
WITH_CHRONO("oneshot-mpi:barrier",
WITH_CHRONO("mpi:barrier",
if (in.barrier) MPI_Barrier(universe);
))
// write checkpoints
// TODO: ENABLE THIS; the leading `false &&` keeps checkpoint writing
// disabled and also short-circuits a modulo by a possibly zero
// checkpoint_mod
if (false && checkpoint_mod != 0 && iteration % checkpoint_mod == 0) {
double globalEnergy = 0;
MPI_Reduce(&energy, &globalEnergy, 1, MPI_DOUBLE, MPI_SUM, 0, universe);
Checkpoint out
= {No,
Nv,
0, // TODO
0, // TODO
- globalEnergy,
iteration - 1,
in.rankRoundRobin};
LOG(0, "Atrip") << "Writing checkpoint\n";
if (Atrip::rank == 0) write_checkpoint(out, in.checkpointPath);
}
// write reporting
if (iteration % iterationMod == 0 || iteration == iteration1Percent) {
if (IterationDescription::descriptor) {
IterationDescription::descriptor({
iteration,
nIterations,
Atrip::chrono["iterations"].count()
});
}
const double _doubles_time = Atrip::chrono["doubles"].count(),
_its_time = Atrip::chrono["iterations"].count();
LOG(0,"Atrip")
<< "iteration " << iteration
<< " [" << 100 * iteration / nIterations << "%]"
<< " (" << (_doubles_time > 0.0
? doublesFlops * iteration / _doubles_time
: -1)
<< "GF)"
<< " (" << (_its_time > 0.0
? doublesFlops * iteration / _its_time
: -1)
<< "GF)"
<< "\n";
// PRINT TIMINGS
if (in.chrono)
for (auto const& pair: Atrip::chrono)
LOG(1, " ") << pair.first << " :: "
<< pair.second.count()
<< std::endl;
}
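// a fake tuple reuses the last real tuple (its result is discarded);
// abcNext, when present, drives the prefetch of the next iteration's
// database.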
const ABCTuple abc = isFakeTuple(i)
? tuplesList[tuplesList.size() - 1]
: tuplesList[i];
ABCTuple const* abcNext = i == (tuplesList.size() - 1)
? nullptr
: &tuplesList[i + 1];
WITH_CHRONO("with_rank",
WITH_RANK << " :it " << iteration
<< " :abc " << pretty_print(abc)
<< " :abcN "
<< (abcNext ? pretty_print(*abcNext) : "None")
<< "\n";
)
// COMM FIRST DATABASE ================================================{{{1
if (i == first_iteration) {
WITH_RANK << "__first__:first database ............ \n";
const auto db = communicateDatabase(abc, universe);
WITH_RANK << "__first__:first database communicated \n";
WITH_RANK << "__first__:first database io phase \n";
doIOPhase(db);
WITH_RANK << "__first__:first database io phase DONE\n";
WITH_RANK << "__first__::::Unwrapping all slices for first database\n";
for (auto& u: unions) u->unwrapAll(abc);
WITH_RANK << "__first__::::Unwrapping slices for first database DONE\n";
MPI_Barrier(universe);
}
// COMM NEXT DATABASE ================================================={{{1
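// prefetch the slices of the next tuple so that this communication
// overlaps with the compute phases below.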
if (abcNext) {
WITH_RANK << "__comm__:" << iteration << "th communicating database\n";
WITH_CHRONO("db:comm",
const auto db = communicateDatabase(*abcNext, universe);
)
WITH_CHRONO("db:io",
doIOPhase(db);
)
WITH_RANK << "__comm__:" << iteration << "th database io phase DONE\n";
}
// COMPUTE DOUBLES ===================================================={{{1
OCD_Barrier(universe);
if (!isFakeTuple(i)) {
WITH_RANK << iteration << "-th doubles\n";
WITH_CHRONO("oneshot-unwrap",
WITH_CHRONO("unwrap",
WITH_CHRONO("unwrap:doubles",
for (auto& u: decltype(unions){&abph, &hhha, &taphh, &tabhh}) {
u->unwrapAll(abc);
}
)))
WITH_CHRONO("oneshot-doubles",
WITH_CHRONO("doubles",
doublesContribution<F>(abc, (size_t)No, (size_t)Nv,
// -- VABCI
abph.unwrapSlice(Slice<F>::AB, abc),
abph.unwrapSlice(Slice<F>::AC, abc),
abph.unwrapSlice(Slice<F>::BC, abc),
abph.unwrapSlice(Slice<F>::BA, abc),
abph.unwrapSlice(Slice<F>::CA, abc),
abph.unwrapSlice(Slice<F>::CB, abc),
// -- VHHHA,
hhha.unwrapSlice(Slice<F>::A, abc),
hhha.unwrapSlice(Slice<F>::B, abc),
hhha.unwrapSlice(Slice<F>::C, abc),
// -- TA,
taphh.unwrapSlice(Slice<F>::A, abc),
taphh.unwrapSlice(Slice<F>::B, abc),
taphh.unwrapSlice(Slice<F>::C, abc),
// -- TABIJ
tabhh.unwrapSlice(Slice<F>::AB, abc),
tabhh.unwrapSlice(Slice<F>::AC, abc),
tabhh.unwrapSlice(Slice<F>::BC, abc),
// -- TIJK
(DataFieldType<F>*)Tijk);
WITH_RANK << iteration << "-th doubles done\n";
))
}
// COMPUTE SINGLES %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% {{{1
OCD_Barrier(universe);
if (!isFakeTuple(i)) {
WITH_CHRONO("oneshot-unwrap",
WITH_CHRONO("unwrap",
WITH_CHRONO("unwrap:singles",
abhh.unwrapAll(abc);
)))
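// Zijk starts out as a copy of Tijk; the singles contribution is then
// accumulated on top of it.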
WITH_CHRONO("reorder",
int ooo = No*No*No, stride = 1;
atrip::xcopy<F>(&ooo,
(DataFieldType<F>*)Tijk, &stride,
(DataFieldType<F>*)Zijk, &stride);
)
WITH_CHRONO("singles",
#if defined(HAVE_CUDA)
singlesContribution<F><<<1,1>>>(No, Nv, abc[0], abc[1], abc[2],
(DataFieldType<F>*)Tai,
#else
singlesContribution<F>(No, Nv, abc[0], abc[1], abc[2],
Tai.data(),
#endif
(DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::AB,
abc),
(DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::AC,
abc),
(DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::BC,
abc),
(DataFieldType<F>*)Zijk);
)
}
// COMPUTE ENERGY %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% {{{1
if (!isFakeTuple(i)) {
double tupleEnergy(0.);
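// distinct encodes the degeneracy of the tuple: 0 when a, b and c are
// pairwise distinct, nonzero when exactly two of them coincide
// (tuples are sorted, and a == b == c is assumed not to occur)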
int distinct(0);
if (abc[0] == abc[1]) distinct++;
if (abc[1] == abc[2]) distinct--;
const F epsabc(_epsa[abc[0]] + _epsa[abc[1]] + _epsa[abc[2]]);
// LOG(0, "AtripCUDA") << "doing energy " << i << "distinct " << distinct << "\n";
WITH_CHRONO("energy",
/*
TODO: think about how to do this on the GPU in the best way possible
if ( distinct == 0)
tupleEnergy = getEnergyDistinct<F>(epsabc, No, (F*)epsi, (F*)Tijk, (F*)Zijk);
else
tupleEnergy = getEnergySame<F>(epsabc, No, (F*)epsi, (F*)Tijk, (F*)Zijk);
*/
)
#if defined(HAVE_OCD) || defined(ATRIP_PRINT_TUPLES)
tupleEnergies[abc] = tupleEnergy;
#endif
energy += tupleEnergy;
}
// TODO: remove this
if (isFakeTuple(i)) {
// fake iterations should also unwrap whatever they got
WITH_RANK << iteration
<< "th unwrapping because of fake in "
<< i << "\n";
for (auto& u: unions) u->unwrapAll(abc);
}
#ifdef HAVE_OCD
for (auto const& u: unions) {
WITH_RANK << "__dups__:"
<< iteration
<< "-th n" << u->name << " checking duplicates\n";
u->checkForDuplicates();
}
#endif
// CLEANUP UNIONS %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%{{{1
OCD_Barrier(universe);
if (abcNext) {
WITH_RANK << "__gc__:" << iteration << "-th cleaning up.......\n";
for (auto& u: unions) {
u->unwrapAll(abc);
WITH_RANK << "__gc__:n" << u->name << " :it " << iteration
<< " :abc " << pretty_print(abc)
<< " :abcN " << pretty_print(*abcNext)
<< "\n";
// for (auto const& slice: u->slices)
// WITH_RANK << "__gc__:guts:" << slice.info << "\n";
u->clearUnusedSlicesForNext(*abcNext);
WITH_RANK << "__gc__: checking validity\n";
#ifdef HAVE_OCD
// check for validity of the slices
for (auto type: u->sliceTypes) {
auto tuple = Slice<F>::subtupleBySlice(abc, type);
for (auto& slice: u->slices) {
if ( slice.info.type == type
&& slice.info.tuple == tuple
&& slice.isDirectlyFetchable()
) {
if (slice.info.state == Slice<F>::Dispatched)
throw std::domain_error( "This slice should not be undispatched! "
+ pretty_print(slice.info));
}
}
}
#endif
}
}
WITH_RANK << iteration << "-th cleaning up....... DONE\n";
Atrip::chrono["iterations"].stop();
// ITERATION END %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%{{{1
}
// END OF MAIN LOOP
#if defined(HAVE_CUDA)
cuMemFree(Tai);
cuMemFree(epsi);
cuMemFree(epsa);
cuMemFree(Tijk);
cuMemFree(Zijk);
#else
std::free(Zijk);
std::free(Tijk);
#endif
MPI_Barrier(universe);
// PRINT TUPLES %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%{{{1
#if defined(HAVE_OCD) || defined(ATRIP_PRINT_TUPLES)
LOG(0,"Atrip") << "tuple energies" << "\n";
for (size_t i = 0; i < np; i++) {
MPI_Barrier(universe);
for (auto const& pair: tupleEnergies) {
if (i == rank)
std::cout << pair.first[0]
<< " " << pair.first[1]
<< " " << pair.first[2]
<< std::setprecision(15) << std::setw(23)
<< " tupleEnergy: " << pair.second
<< "\n"
;
}
}
#endif
// COMMUNICATE THE ENERGIES %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%{{{1
LOG(0,"Atrip") << "COMMUNICATING ENERGIES \n";
double globalEnergy = 0;
MPI_Reduce(&energy, &globalEnergy, 1, MPI_DOUBLE, MPI_SUM, 0, universe);
WITH_RANK << "local energy " << energy << "\n";
LOG(0, "Atrip") << "Energy: "
<< std::setprecision(15) << std::setw(23)
<< (- globalEnergy) << std::endl;
// PRINT TIMINGS {{{1
if (in.chrono)
for (auto const& pair: Atrip::chrono)
LOG(0,"atrip:chrono") << pair.first << " "
<< pair.second.count() << std::endl;
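// per-rank sustained performance: total FLOPs of the doubles kernel
// divided by the time spent in it, and by the whole iteration time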
LOG(0, "atrip:flops(doubles)")
<< nIterations * doublesFlops / Atrip::chrono["doubles"].count() << "\n";
LOG(0, "atrip:flops(iterations)")
<< nIterations * doublesFlops / Atrip::chrono["iterations"].count() << "\n";
// TODO: change the sign in the getEnergy routines
return { - globalEnergy };
}
// instantiate
template Atrip::Output Atrip::run(Atrip::Input<double> const& in);
template Atrip::Output Atrip::run(Atrip::Input<Complex> const& in);
// Main:1 ends here