// atrip/src/atrip/Atrip.cxx
// Copyright 2022 Alejandro Gallo
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// [[file:~/cuda/atrip/atrip.org::*Main][Main:1]]
#include <iomanip>
#include <atrip/Atrip.hpp>
#include <atrip/Utils.hpp>
#include <atrip/Equations.hpp>
#include <atrip/SliceUnion.hpp>
#include <atrip/Unions.hpp>
#include <atrip/Checkpoint.hpp>
using namespace atrip;
#if defined(HAVE_CUDA)
#include <atrip/CUDA.hpp>
#endif
template <typename F> bool RankMap<F>::RANK_ROUND_ROBIN;
template bool RankMap<double>::RANK_ROUND_ROBIN;
template bool RankMap<Complex>::RANK_ROUND_ROBIN;
size_t Atrip::rank;
size_t Atrip::np;
#if defined(HAVE_CUDA)
typename Atrip::CudaContext Atrip::cuda;
typename Atrip::KernelDimensions Atrip::kernelDimensions;
#endif
MPI_Comm Atrip::communicator;
Timings Atrip::chrono;
// user printing block
IterationDescriptor IterationDescription::descriptor;
void atrip::registerIterationDescriptor(IterationDescriptor d) {
IterationDescription::descriptor = d;
}
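// Initialize Atrip's global MPI state; this must be called
// collectively, once, before any call to Atrip::run.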
void Atrip::init(MPI_Comm world) {
Atrip::communicator = world;
// use int temporaries instead of writing through a size_t pointer
// casted to int*, which is undefined behavior
int rank, np;
MPI_Comm_rank(world, &rank);
MPI_Comm_size(world, &np);
Atrip::rank = (size_t)rank;
Atrip::np = (size_t)np;
}
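// Drive the main loop over ABC tuples and return the total energy,
// MPI-reduced over all ranks. A minimal usage sketch (hypothetical
// setup; the Input<F> fields named here are the ones this file reads):
//
// Atrip::init(MPI_COMM_WORLD);
// Atrip::Input<double> in; // fill ei, ea, Tph, Tpphh, Vpphh, Vhhhp, Vppph, ...
// Atrip::Output out = Atrip::run(in); // collective call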
template <typename F>
Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
const size_t np = Atrip::np;
const size_t rank = Atrip::rank;
MPI_Comm universe = Atrip::communicator;
const size_t No = in.ei->lens[0];
const size_t Nv = in.ea->lens[0];
LOG(0,"Atrip") << "No: " << No << "\n";
LOG(0,"Atrip") << "Nv: " << Nv << "\n";
LOG(0,"Atrip") << "np: " << np << "\n";
#if defined(HAVE_CUDA)
int ngcards;
_CHECK_CUDA_SUCCESS("initializing cuda",
cuInit(0));
_CHECK_CUDA_SUCCESS("getting device count",
cuDeviceGetCount(&ngcards));
const auto clusterInfo = getClusterInfo(Atrip::communicator);
LOG(0,"Atrip") << "ngcards: " << ngcards << "\n";
if (clusterInfo.ranksPerNode > ngcards) {
const auto msg
= _FORMAT("ATRIP: You are running on more ranks per node than the number of graphics cards\n"
"You have %d cards at your disposal\n", ngcards);
std::cerr << msg;
throw msg;
} else if (clusterInfo.ranksPerNode < ngcards) {
const auto msg
= _FORMAT("You have %d cards at your disposal.\n"
"You will be using only %d, i.e., the number of ranks per node\n",
ngcards, clusterInfo.ranksPerNode);
std::cerr << msg;
}
for (size_t _rank = 0; _rank < np; _rank++) {
if (rank == _rank) {
CUcontext ctx;
CUdevice dev;
CUdevprop prop;
struct { struct { size_t free, total; } avail; size_t total; } memory;
char *name = (char*)malloc(256);
// - TODO :: we should check that the assignment of graphics cards
// to nodes works as expected, i.e., the local ranks of node k should
// get the cards 0 to ngcards via the formula =rank % ngcards=.
// set current device
_CHECK_CUDA_SUCCESS("getting device for index <rank>",
cuDeviceGet(&dev, rank % ngcards));
_CHECK_CUDA_SUCCESS("creating a cuda context",
cuCtxCreate(&ctx, 0, dev));
_CHECK_CUDA_SUCCESS("setting the context",
cuCtxSetCurrent(ctx));
// get information of the device
_CHECK_CUDA_SUCCESS("getting properties of current device",
cuDeviceGetProperties(&prop, dev));
_CHECK_CUDA_SUCCESS("getting memory information",
cuMemGetInfo(&memory.avail.free, &memory.avail.total));
_CHECK_CUDA_SUCCESS("getting name",
cuDeviceGetName(name, 256, dev));
_CHECK_CUDA_SUCCESS("getting total memory",
cuDeviceTotalMem(&memory.total, dev));
printf("\n"
"CUDA CARD RANK %d\n"
"=================\n"
"\tnumber: %1$ld\n"
"\tname: %s\n"
"\tMem. clock rate (KHz): %ld\n"
"\tShared Mem Per Block (KB): %f\n"
"\tAvail. Free/Total mem (GB): %f/%f\n"
"\tFree memory (GB): %f\n"
"\n",
Atrip::rank,
name,
prop.clockRate,
prop.sharedMemPerBlock / 1024.0,
memory.avail.free / 1024.0 / 1024.0 / 1024.0 ,
memory.avail.total / 1024.0 / 1024.0 / 1024.0,
memory.total / 1024.0 / 1024.0 / 1024.0
);
std::free((void*)name);
_CHECK_CUBLAS_SUCCESS("creating a cublas handle",
cublasCreate(&Atrip::cuda.handle));
}
MPI_Barrier(universe);
}
if (in.oooThreads > 0) {
Atrip::kernelDimensions.ooo.threads = in.oooThreads;
}
if (in.oooBlocks > 0) {
Atrip::kernelDimensions.ooo.blocks = in.oooBlocks;
}
if (Atrip::kernelDimensions.ooo.threads <= 0 ||
Atrip::kernelDimensions.ooo.blocks <= 0) {
// default: ceil(No / 32) blocks of 32 threads each
Atrip::kernelDimensions.ooo.blocks = No / 32 + (No % 32 ? 1 : 0);
Atrip::kernelDimensions.ooo.threads = 32;
}
LOG(0,"Atrip") << "ooo blocks: "
<< Atrip::kernelDimensions.ooo.blocks << "\n";
LOG(0,"Atrip") << "ooo threads per block: "
<< Atrip::kernelDimensions.ooo.threads << "\n";
#endif
// allocate the three scratches (see Piecuch et al.);
// we need local copies of the following tensors on every rank
std::vector<F> _epsi(No), _epsa(Nv), _Tai(No * Nv);
// copy the data from the tensors into the vectors
in.ei->read_all(_epsi.data());
in.ea->read_all(_epsa.data());
in.Tph->read_all(_Tai.data());
// TODO: free the memory pointers at the end of the algorithm
DataPtr<F> Tijk, Zijk;
#if defined(HAVE_CUDA)
DataPtr<F> Tai, epsi, epsa;
// TODO: free the memory pointers at the end of the algorithm
_CHECK_CUDA_SUCCESS("Tai",
cuMemAlloc(&Tai, sizeof(F) * _Tai.size()));
_CHECK_CUDA_SUCCESS("epsi",
cuMemAlloc(&epsi, sizeof(F) * _epsi.size()));
_CHECK_CUDA_SUCCESS("epsa",
cuMemAlloc(&epsa, sizeof(F) * _epsa.size()));
_CHECK_CUDA_SUCCESS("memcpy Tai",
cuMemcpyHtoD(Tai, (void*)_Tai.data(), sizeof(F) * _Tai.size()));
_CHECK_CUDA_SUCCESS("memcpy epsi",
cuMemcpyHtoD(epsi,(void*)_epsi.data(), sizeof(F) * _epsi.size()));
_CHECK_CUDA_SUCCESS("memcpy epsa",
cuMemcpyHtoD(epsa, (void*)_epsa.data(), sizeof(F) * _epsa.size()));
_CHECK_CUDA_SUCCESS("Tijk",
cuMemAlloc(&Tijk, sizeof(F) * No * No * No));
_CHECK_CUDA_SUCCESS("Zijk",
cuMemAlloc(&Zijk, sizeof(F) * No * No * No));
#else
std::vector<F> &Tai = _Tai, &epsi = _epsi, &epsa = _epsa;
Zijk = (DataFieldType<F>*)malloc(No*No*No * sizeof(DataFieldType<F>));
Tijk = (DataFieldType<F>*)malloc(No*No*No * sizeof(DataFieldType<F>));
#endif
RankMap<F>::RANK_ROUND_ROBIN = in.rankRoundRobin;
if (RankMap<F>::RANK_ROUND_ROBIN) {
LOG(0,"Atrip") << "Doing rank round robin slices distribution\n";
} else {
LOG(0,"Atrip")
<< "Doing node > local rank round robin slices distribution\n";
}
// COMMUNICATOR CONSTRUCTION ========================================={{{1
//
// Construct a new communicator living only on a single rank
int child_size = 1, child_rank = 0;
const int color = rank / child_size, crank = rank % child_size;
MPI_Comm child_comm;
if (np == 1) {
child_comm = universe;
} else {
MPI_Comm_split(universe, color, crank, &child_comm);
MPI_Comm_rank(child_comm, &child_rank);
MPI_Comm_size(child_comm, &child_size);
}
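// note: with child_size == 1 every rank ends up in its own child
// communicator; the slice unions below receive it together with the
// global communicator.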
// BUILD SLICES PARAMETRIZED BY NV x NV =============================={{{1
WITH_CHRONO("nv-nv-slices",
LOG(0,"Atrip") << "building NV x NV slices\n";
ABPH<F> abph(*in.Vppph, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
ABHH<F> abhh(*in.Vpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
TABHH<F> tabhh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
)
// delete Vppph so that its memory does not contribute to the
// high-water mark (HWM) while the NV slices are being allocated
if (in.deleteVppph) {
delete in.Vppph;
}
// BUILD SLICES PARAMETRIZED BY NV ==================================={{{1
WITH_CHRONO("nv-slices",
LOG(0,"Atrip") << "building NV slices\n";
TAPHH<F> taphh(*in.Tpphh, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
HHHA<F> hhha(*in.Vhhhp, (size_t)No, (size_t)Nv, (size_t)np, child_comm, universe);
)
// all tensors
std::vector< SliceUnion<F>* > unions = {&taphh, &hhha, &abph, &abhh, &tabhh};
// get tuples for the current rank
TuplesDistribution *distribution;
if (in.tuplesDistribution == Atrip::Input<F>::TuplesDistribution::NAIVE) {
LOG(0,"Atrip") << "Using the naive distribution\n";
distribution = new NaiveDistribution();
} else {
LOG(0,"Atrip") << "Using the group-and-sort distribution\n";
distribution = new group_and_sort::Distribution();
}
LOG(0,"Atrip") << "BUILDING TUPLE LIST\n";
WITH_CHRONO("tuples:build",
auto const tuplesList = distribution->getTuples(Nv, universe);
)
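// tuplesList holds only this rank's share of tuples, so the global
// number of tuples is nIterations * np.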
const size_t nIterations = tuplesList.size();
{
LOG(0,"Atrip") << "#iterations: "
<< nIterations
<< "/"
<< nIterations * np
<< "\n";
}
const size_t iterationMod = (in.percentageMod > 0)
? nIterations * in.percentageMod / 100.0
: in.iterationMod;
const size_t iteration1Percent = nIterations * 0.01;
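// fake tuples pad the per-rank tuple lists to the same length so that
// all ranks stay in lockstep during communication; their numerical
// contribution is skipped further down.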
auto const isFakeTuple
= [&tuplesList, distribution](size_t const i) {
return distribution->tupleIsFake(tuplesList[i]);
};
using Database = typename Slice<F>::Database;
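// gather every rank's local database for the given tuple into one
// global database: entries [r * ldb.size(), (r + 1) * ldb.size()) of
// the result belong to rank r.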
auto communicateDatabase
= [&unions, np](ABCTuple const& abc, MPI_Comm const& c) -> Database {
WITH_CHRONO("db:comm:type:do",
auto MPI_LDB_ELEMENT = Slice<F>::mpi::localDatabaseElement();
)
WITH_CHRONO("db:comm:ldb",
typename Slice<F>::LocalDatabase ldb;
for (auto const& tensor: unions) {
auto const& tensorDb = tensor->buildLocalDatabase(abc);
ldb.insert(ldb.end(), tensorDb.begin(), tensorDb.end());
}
)
// the local databases have the same size on every rank, and
// MPI_LDB_ELEMENT is a committed datatype, so the counts below are
// element counts, not byte counts
Database db(np * ldb.size(), ldb[0]);
WITH_CHRONO("oneshot-db:comm:allgather",
WITH_CHRONO("db:comm:allgather",
MPI_Allgather(ldb.data(),
ldb.size(),
MPI_LDB_ELEMENT,
db.data(),
ldb.size(),
MPI_LDB_ELEMENT,
c);
))
WITH_CHRONO("db:comm:type:free", MPI_Type_free(&MPI_LDB_ELEMENT);)
return db;
};
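// post the receives for the slices this rank needs (its stripe of the
// global database) and send the slices it owns to every other rank;
// the tags are derived from positions in the global database so that
// sends and receives match up.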
auto doIOPhase
= [&unions, &rank, &np, &universe] (Database const& db) {
const size_t localDBLength = db.size() / np;
size_t sendTag = 0, recvTag = rank * localDBLength;
// RECEIVE PHASE ======================================================
{
// At this point, we have already sent to everyone that fits
auto const* begin = &db[rank * localDBLength];
auto const* end = begin + localDBLength;
for (auto it = begin; it != end; ++it) {
recvTag++;
auto const& el = *it;
auto& u = unionByName(unions, el.name);
WITH_DBG std::cout
<< rank << ":r"
<< recvTag << " =>"
<< " «n" << el.name
<< ", t" << el.info.type
<< ", s" << el.info.state
<< "»"
<< " ⊙ {" << rank << "←" << el.info.from.rank
<< ", "
<< el.info.from.source << "}"
<< " ∴ {" << el.info.tuple[0]
<< ", "
<< el.info.tuple[1]
<< "}"
<< "\n"
;
WITH_CHRONO("db:io:recv",
u.receive(el.info, recvTag);
)
} // recv
}
// SEND PHASE =========================================================
for (size_t otherRank = 0; otherRank < np; otherRank++) {
auto const* begin = &db[otherRank * localDBLength];
auto const* end = begin + localDBLength;
for (auto it = begin; it != end; ++it) {
sendTag++;
typename Slice<F>::LocalDatabaseElement const& el = *it;
if (el.info.from.rank != rank) continue;
auto& u = unionByName(unions, el.name);
WITH_DBG std::cout
<< rank << ":s"
<< sendTag << " =>"
<< " «n" << el.name
<< ", t" << el.info.type
<< ", s" << el.info.state
<< "»"
<< " ⊙ {" << el.info.from.rank << "→" << otherRank
<< ", "
<< el.info.from.source << "}"
<< " ∴ {" << el.info.tuple[0]
<< ", "
<< el.info.tuple[1]
<< "}"
<< "\n"
;
WITH_CHRONO("db:io:send",
u.send(otherRank, el, sendTag);
)
} // send phase
} // otherRank
};
#if defined(HAVE_OCD) || defined(ATRIP_PRINT_TUPLES)
std::map<ABCTuple, double> tupleEnergies;
#endif
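// FLOP count of the doubles contribution per tuple, in GFLOP: six
// permutations of the ABC slices, each a contraction of
// No*No*No * (No + Nv) multiply-adds (2 FLOPs each), doubled once
// more for complex arithmetic.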
const double doublesFlops
= double(No) * double(No) * double(No)
* (double(No) + double(Nv))
* 2.0
* (traits::isComplex<F>() ? 2.0 : 1.0)
* 6.0
/ 1e9;
// START MAIN LOOP ======================================================{{{1
double energy(0.);
size_t first_iteration = 0;
Checkpoint c;
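// write a checkpoint either every fixed number of iterations or every
// given percentage of the total number of iterations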
const size_t checkpoint_mod
= in.checkpointAtEveryIteration != 0
? in.checkpointAtEveryIteration
: nIterations * in.checkpointAtPercentage / 100;
if (in.readCheckpointIfExists) {
std::ifstream fin(in.checkpointPath);
if (fin.is_open()) {
LOG(0, "Atrip") << "Reading checkpoint from "
<< in.checkpointPath << "\n";
c = read_checkpoint(fin);
first_iteration = (size_t)c.iteration;
if (first_iteration > nIterations) {
// TODO: throw an error here; a first_iteration bigger than
// nIterations means the checkpoint was probably written by a run
// with a different number of ranks
}
if (No != c.no) { /* TODO: warn about a mismatching No */ }
if (Nv != c.nv) { /* TODO: warn about a mismatching Nv */ }
// TODO: write warnings for the number of ranks and so on
if (Atrip::rank == 0) {
// the main loop accumulates the negative of the physical energy,
// whereas the energy in the checkpoint is always the correct
// physical one, hence the sign flip
energy = - (double)c.energy;
}
LOG(0, "Atrip") << "energy from checkpoint "
<< energy << "\n";
LOG(0, "Atrip") << "iteration from checkpoint "
<< first_iteration << "\n";
}
}
for (size_t i = first_iteration, iteration = first_iteration + 1;
i < tuplesList.size();
i++, iteration++) {
Atrip::chrono["iterations"].start();
// check overhead from chrono over all iterations
WITH_CHRONO("start:stop", {})
// check overhead of doing a barrier at the beginning
WITH_CHRONO("oneshot-mpi:barrier",
WITH_CHRONO("mpi:barrier",
if (in.barrier) MPI_Barrier(universe);
))
// write checkpoints
// TODO: ENABLE THIS; the leading `false &&` keeps checkpoint writing
// disabled and also short-circuits a modulo by a possibly zero
// checkpoint_mod
if (false && checkpoint_mod != 0 && iteration % checkpoint_mod == 0) {
double globalEnergy = 0;
MPI_Reduce(&energy, &globalEnergy, 1, MPI_DOUBLE, MPI_SUM, 0, universe);
Checkpoint out
= {No,
Nv,
0, // TODO
0, // TODO
- globalEnergy,
iteration - 1,
in.rankRoundRobin};
LOG(0, "Atrip") << "Writing checkpoint\n";
if (Atrip::rank == 0) write_checkpoint(out, in.checkpointPath);
}
// write reporting
if (iteration % iterationMod == 0 || iteration == iteration1Percent) {
if (IterationDescription::descriptor) {
IterationDescription::descriptor({
iteration,
nIterations,
Atrip::chrono["iterations"].count()
});
}
const double _doubles_time = Atrip::chrono["doubles"].count(),
_its_time = Atrip::chrono["iterations"].count();
LOG(0,"Atrip")
<< "iteration " << iteration
<< " [" << 100 * iteration / nIterations << "%]"
<< " (" << (_doubles_time > 0.0
? doublesFlops * iteration / _doubles_time
: -1)
<< "GF)"
<< " (" << (_its_time > 0.0
? doublesFlops * iteration / _its_time
: -1)
<< "GF)"
<< "\n";
// PRINT TIMINGS
if (in.chrono)
for (auto const& pair: Atrip::chrono)
LOG(1, " ") << pair.first << " :: "
<< pair.second.count()
<< std::endl;
}
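// a fake tuple reuses the last real tuple (its result is discarded);
// abcNext, when present, drives the prefetch of the next iteration's
// database.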
const ABCTuple abc = isFakeTuple(i)
? tuplesList[tuplesList.size() - 1]
: tuplesList[i];
ABCTuple const* abcNext = i == (tuplesList.size() - 1)
? nullptr
: &tuplesList[i + 1];
WITH_CHRONO("with_rank",
WITH_RANK << " :it " << iteration
<< " :abc " << pretty_print(abc)
<< " :abcN "
<< (abcNext ? pretty_print(*abcNext) : "None")
<< "\n";
)
// COMM FIRST DATABASE ================================================{{{1
if (i == first_iteration) {
WITH_RANK << "__first__:first database ............ \n";
const auto db = communicateDatabase(abc, universe);
WITH_RANK << "__first__:first database communicated \n";
WITH_RANK << "__first__:first database io phase \n";
doIOPhase(db);
WITH_RANK << "__first__:first database io phase DONE\n";
WITH_RANK << "__first__::::Unwrapping all slices for first database\n";
for (auto& u: unions) u->unwrapAll(abc);
WITH_RANK << "__first__::::Unwrapping slices for first database DONE\n";
MPI_Barrier(universe);
}
// COMM NEXT DATABASE ================================================={{{1
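// prefetch the slices of the next tuple so that this communication
// overlaps with the compute phases below.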
if (abcNext) {
WITH_RANK << "__comm__:" << iteration << "th communicating database\n";
WITH_CHRONO("db:comm",
const auto db = communicateDatabase(*abcNext, universe);
)
WITH_CHRONO("db:io",
doIOPhase(db);
)
WITH_RANK << "__comm__:" << iteration << "th database io phase DONE\n";
}
// COMPUTE DOUBLES ===================================================={{{1
OCD_Barrier(universe);
if (!isFakeTuple(i)) {
WITH_RANK << iteration << "-th doubles\n";
WITH_CHRONO("oneshot-unwrap",
WITH_CHRONO("unwrap",
WITH_CHRONO("unwrap:doubles",
for (auto& u: decltype(unions){&abph, &hhha, &taphh, &tabhh}) {
u->unwrapAll(abc);
}
)))
WITH_CHRONO("oneshot-doubles",
WITH_CHRONO("doubles",
doublesContribution<F>(abc, (size_t)No, (size_t)Nv,
// -- VABCI
abph.unwrapSlice(Slice<F>::AB, abc),
abph.unwrapSlice(Slice<F>::AC, abc),
abph.unwrapSlice(Slice<F>::BC, abc),
abph.unwrapSlice(Slice<F>::BA, abc),
abph.unwrapSlice(Slice<F>::CA, abc),
abph.unwrapSlice(Slice<F>::CB, abc),
// -- VHHHA,
hhha.unwrapSlice(Slice<F>::A, abc),
hhha.unwrapSlice(Slice<F>::B, abc),
hhha.unwrapSlice(Slice<F>::C, abc),
// -- TA,
taphh.unwrapSlice(Slice<F>::A, abc),
taphh.unwrapSlice(Slice<F>::B, abc),
taphh.unwrapSlice(Slice<F>::C, abc),
// -- TABIJ
tabhh.unwrapSlice(Slice<F>::AB, abc),
tabhh.unwrapSlice(Slice<F>::AC, abc),
tabhh.unwrapSlice(Slice<F>::BC, abc),
// -- TIJK
(DataFieldType<F>*)Tijk);
WITH_RANK << iteration << "-th doubles done\n";
))
}
// COMPUTE SINGLES %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% {{{1
OCD_Barrier(universe);
if (!isFakeTuple(i)) {
WITH_CHRONO("oneshot-unwrap",
WITH_CHRONO("unwrap",
WITH_CHRONO("unwrap:singles",
abhh.unwrapAll(abc);
)))
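// Zijk starts out as a copy of Tijk; the singles contribution is then
// accumulated on top of it.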
WITH_CHRONO("reorder",
int ooo = No*No*No, stride = 1;
atrip::xcopy<F>(&ooo,
(DataFieldType<F>*)Tijk, &stride,
(DataFieldType<F>*)Zijk, &stride);
)
WITH_CHRONO("singles",
#if defined(HAVE_CUDA)
singlesContribution<F><<<1,1>>>(No, Nv, abc[0], abc[1], abc[2],
(DataFieldType<F>*)Tai,
#else
singlesContribution<F>(No, Nv, abc[0], abc[1], abc[2],
Tai.data(),
#endif
(DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::AB,
abc),
(DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::AC,
abc),
(DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::BC,
abc),
(DataFieldType<F>*)Zijk);
)
}
// COMPUTE ENERGY %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% {{{1
if (!isFakeTuple(i)) {
double tupleEnergy(0.);
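// distinct encodes the degeneracy of the tuple: 0 when a, b and c are
// pairwise distinct, nonzero when exactly two of them coincide
// (tuples are sorted, and a == b == c is assumed not to occur)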
int distinct(0);
if (abc[0] == abc[1]) distinct++;
if (abc[1] == abc[2]) distinct--;
const F epsabc(_epsa[abc[0]] + _epsa[abc[1]] + _epsa[abc[2]]);
// LOG(0, "AtripCUDA") << "doing energy " << i << "distinct " << distinct << "\n";
WITH_CHRONO("energy",
/*
TODO: think about how to do this on the GPU in the best way possible
if ( distinct == 0)
tupleEnergy = getEnergyDistinct<F>(epsabc, No, (F*)epsi, (F*)Tijk, (F*)Zijk);
else
tupleEnergy = getEnergySame<F>(epsabc, No, (F*)epsi, (F*)Tijk, (F*)Zijk);
*/
)
#if defined(HAVE_OCD) || defined(ATRIP_PRINT_TUPLES)
tupleEnergies[abc] = tupleEnergy;
#endif
energy += tupleEnergy;
}
// TODO: remove this
if (isFakeTuple(i)) {
// fake iterations should also unwrap whatever they got
WITH_RANK << iteration
<< "th unwrapping because of fake in "
<< i << "\n";
for (auto& u: unions) u->unwrapAll(abc);
}
#ifdef HAVE_OCD
for (auto const& u: unions) {
WITH_RANK << "__dups__:"
<< iteration
<< "-th n" << u->name << " checking duplicates\n";
u->checkForDuplicates();
}
#endif
// CLEANUP UNIONS %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%{{{1
OCD_Barrier(universe);
if (abcNext) {
WITH_RANK << "__gc__:" << iteration << "-th cleaning up.......\n";
for (auto& u: unions) {
u->unwrapAll(abc);
WITH_RANK << "__gc__:n" << u->name << " :it " << iteration
<< " :abc " << pretty_print(abc)
<< " :abcN " << pretty_print(*abcNext)
<< "\n";
// for (auto const& slice: u->slices)
// WITH_RANK << "__gc__:guts:" << slice.info << "\n";
u->clearUnusedSlicesForNext(*abcNext);
WITH_RANK << "__gc__: checking validity\n";
#ifdef HAVE_OCD
// check for validity of the slices
for (auto type: u->sliceTypes) {
auto tuple = Slice<F>::subtupleBySlice(abc, type);
for (auto& slice: u->slices) {
if ( slice.info.type == type
&& slice.info.tuple == tuple
&& slice.isDirectlyFetchable()
) {
if (slice.info.state == Slice<F>::Dispatched)
throw std::domain_error( "This slice should not be undispatched! "
+ pretty_print(slice.info));
}
}
}
#endif
}
}
WITH_RANK << iteration << "-th cleaning up....... DONE\n";
Atrip::chrono["iterations"].stop();
// ITERATION END %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%{{{1
}
// END OF MAIN LOOP
#if defined(HAVE_CUDA)
cuMemFree(Tai);
cuMemFree(epsi);
cuMemFree(epsa);
cuMemFree(Tijk);
cuMemFree(Zijk);
#else
std::free(Zijk);
std::free(Tijk);
#endif
MPI_Barrier(universe);
// PRINT TUPLES %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%{{{1
#if defined(HAVE_OCD) || defined(ATRIP_PRINT_TUPLES)
LOG(0,"Atrip") << "tuple energies" << "\n";
for (size_t i = 0; i < np; i++) {
MPI_Barrier(universe);
for (auto const& pair: tupleEnergies) {
if (i == rank)
std::cout << pair.first[0]
<< " " << pair.first[1]
<< " " << pair.first[2]
<< std::setprecision(15) << std::setw(23)
<< " tupleEnergy: " << pair.second
<< "\n"
;
}
}
#endif
// COMMUNICATE THE ENERGIES %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%{{{1
LOG(0,"Atrip") << "COMMUNICATING ENERGIES \n";
double globalEnergy = 0;
MPI_Reduce(&energy, &globalEnergy, 1, MPI_DOUBLE, MPI_SUM, 0, universe);
WITH_RANK << "local energy " << energy << "\n";
LOG(0, "Atrip") << "Energy: "
<< std::setprecision(15) << std::setw(23)
<< (- globalEnergy) << std::endl;
// PRINT TIMINGS {{{1
if (in.chrono)
for (auto const& pair: Atrip::chrono)
LOG(0,"atrip:chrono") << pair.first << " "
<< pair.second.count() << std::endl;
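// per-rank sustained performance: total FLOPs of the doubles kernel
// divided by the time spent in it, and by the whole iteration time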
LOG(0, "atrip:flops(doubles)")
<< nIterations * doublesFlops / Atrip::chrono["doubles"].count() << "\n";
LOG(0, "atrip:flops(iterations)")
<< nIterations * doublesFlops / Atrip::chrono["iterations"].count() << "\n";
// TODO: change the sign in the getEnergy routines
return { - globalEnergy };
}
// instantiate
template Atrip::Output Atrip::run(Atrip::Input<double> const& in);
template Atrip::Output Atrip::run(Atrip::Input<Complex> const& in);
// Main:1 ends here