diff --git a/include/atrip/Atrip.hpp b/include/atrip/Atrip.hpp index ce3399d..ed2eaba 100644 --- a/include/atrip/Atrip.hpp +++ b/include/atrip/Atrip.hpp @@ -17,6 +17,7 @@ #include #include #include +#include #include @@ -34,8 +35,9 @@ namespace atrip { static int rank; static int np; + static MPI_Comm communicator; static Timings chrono; - static void init(); + static void init(MPI_Comm); template struct Input { @@ -68,6 +70,11 @@ namespace atrip { ADD_ATTRIBUTE(int, iterationMod, -1) ADD_ATTRIBUTE(int, percentageMod, -1) ADD_ATTRIBUTE(TuplesDistribution, tuplesDistribution, NAIVE) + ADD_ATTRIBUTE(std::string, checkpointPath, "atrip-checkpoint.yaml") + ADD_ATTRIBUTE(bool, readCheckpointIfExists, true) + ADD_ATTRIBUTE(bool, writeCheckpoint, true) + ADD_ATTRIBUTE(float, checkpointAtPercentage, 10) + ADD_ATTRIBUTE(size_t, checkpointAtEveryIteration, 0) }; diff --git a/include/atrip/Checkpoint.hpp b/include/atrip/Checkpoint.hpp new file mode 100644 index 0000000..16243a2 --- /dev/null +++ b/include/atrip/Checkpoint.hpp @@ -0,0 +1,92 @@ +// Copyright 2022 Alejandro Gallo +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// [[file:../../atrip.org::*Prolog][Prolog:1]] +#pragma once +#include +#include + +#include + +namespace atrip { +// Prolog:1 ends here + +// [[file:../../atrip.org::checkpoint-definition][checkpoint-definition]] +// template +struct Checkpoint { + size_t no, nv; + size_t nranks; + size_t nnodes; + double energy; + size_t iteration; + // TODO + // Input::TuplesDistribution distribution(GROUP_AND_SORT); + bool rankRoundRobin; +}; +// checkpoint-definition ends here + +// [[file:../../atrip.org::*Input and output][Input and output:1]] +void write_checkpoint(Checkpoint const& c, std::string const& filepath) { + std::ofstream out(filepath); + out << "No: " << c.no + << "\n" + << "Nv: " << c.nv + << "\n" + << "Nranks: " << c.nranks + << "\n" + << "Nnodes: " << c.nnodes + << "\n" + << "Energy: " << std::setprecision(19) << c.energy + << "\n" + << "Iteration: " << c.iteration + << "\n" + << "RankRoundRobin: " << (c.rankRoundRobin ? "true" : "false") + << "\n"; +} + + +Checkpoint read_checkpoint(std::ifstream& in) { + Checkpoint c; + // trim chars from the string, to be more sure and not use regexes + auto trim = [](std::string& s, std::string const& chars) { + s.erase(0, s.find_first_not_of(chars)); + s.erase(s.find_last_not_of(chars) + 1); + return s; + }; + for (std::string header, value; std::getline(in, header, ':');) { + std::getline(in, value, '\n'); + trim(value, " \t"); // trim all whitespaces + trim(header, " \t"); + + /**/ if (header == "No") c.no = std::atoi(value.c_str()); + else if (header == "Nv") c.nv = std::atoi(value.c_str()); + else if (header == "Nranks") c.nranks = std::atoi(value.c_str()); + else if (header == "Nnodes") c.nnodes = std::atoi(value.c_str()); + else if (header == "Energy") c.energy = std::atof(value.c_str()); + else if (header == "Iteration") c.iteration = std::atoi(value.c_str()); + else if (header == "RankRoundRobin") c.rankRoundRobin = (value[0] == 't'); + } + return c; +} + + +Checkpoint read_checkpoint(std::string const& filepath) { + std::ifstream in(filepath); + return read_checkpoint(in); +} +// Input and output:1 ends here + +// [[file:../../atrip.org::*Epilog][Epilog:1]] +} +// Epilog:1 ends here diff --git a/include/atrip/Slice.hpp b/include/atrip/Slice.hpp index 260e8c9..ff6d982 100644 --- a/include/atrip/Slice.hpp +++ b/include/atrip/Slice.hpp @@ -44,36 +44,36 @@ struct Slice { // Prolog:1 ends here // [[file:../../atrip.org::*Location][Location:1]] - struct Location { size_t rank; size_t source; }; +struct Location { size_t rank; size_t source; }; // Location:1 ends here // [[file:../../atrip.org::*Type][Type:1]] - enum Type - { A = 10 - , B - , C - // Two-parameter slices - , AB = 20 - , BC - , AC - // for abci and the doubles - , CB - , BA - , CA - // The non-typed slice - , Blank = 404 - }; +enum Type + { A = 10 + , B + , C + // Two-parameter slices + , AB = 20 + , BC + , AC + // for abci and the doubles + , CB + , BA + , CA + // The non-typed slice + , Blank = 404 + }; // Type:1 ends here // [[file:../../atrip.org::*State][State:1]] - enum State { - Fetch = 0, - Dispatched = 2, - Ready = 1, - SelfSufficient = 911, - Recycled = 123, - Acceptor = 405 - }; +enum State { + Fetch = 0, + Dispatched = 2, + Ready = 1, + SelfSufficient = 911, + Recycled = 123, + Acceptor = 405 +}; // State:1 ends here // [[file:../../atrip.org::*The Info structure][The Info structure:1]] @@ -101,25 +101,25 @@ using Ty_x_Tu = std::pair< Type, PartialTuple >; // The Info structure:1 ends here // [[file:../../atrip.org::*Name][Name:1]] - enum Name - { TA = 100 - , VIJKA = 101 - , VABCI = 200 - , TABIJ = 201 - , VABIJ = 202 - }; +enum Name + { TA = 100 + , VIJKA = 101 + , VABCI = 200 + , TABIJ = 201 + , VABIJ = 202 + }; // Name:1 ends here // [[file:../../atrip.org::*Database][Database:1]] - struct LocalDatabaseElement { - Slice::Name name; - Slice::Info info; - }; +struct LocalDatabaseElement { + Slice::Name name; + Slice::Info info; +}; // Database:1 ends here // [[file:../../atrip.org::*Database][Database:2]] - using LocalDatabase = std::vector; - using Database = LocalDatabase; +using LocalDatabase = std::vector; +using Database = LocalDatabase; // Database:2 ends here // [[file:../../atrip.org::*MPI Types][MPI Types:1]] @@ -359,91 +359,91 @@ static Slice& findByInfo(std::vector> &slices, // Static utilities:6 ends here // [[file:../../atrip.org::*Attributes][Attributes:1]] - Info info; +Info info; // Attributes:1 ends here // [[file:../../atrip.org::*Attributes][Attributes:2]] - F *data; +F *data; // Attributes:2 ends here // [[file:../../atrip.org::*Attributes][Attributes:3]] - MPI_Request request; +MPI_Request request; // Attributes:3 ends here // [[file:../../atrip.org::*Attributes][Attributes:4]] - const size_t size; +const size_t size; // Attributes:4 ends here // [[file:../../atrip.org::*Member functions][Member functions:1]] - void markReady() noexcept { - info.state = Ready; - info.recycling = Blank; - } +void markReady() noexcept { + info.state = Ready; + info.recycling = Blank; +} // Member functions:1 ends here // [[file:../../atrip.org::*Member functions][Member functions:2]] - bool isUnwrapped() const noexcept { - return info.state == Ready - || info.state == SelfSufficient - ; - } +bool isUnwrapped() const noexcept { + return info.state == Ready + || info.state == SelfSufficient + ; +} // Member functions:2 ends here // [[file:../../atrip.org::*Member functions][Member functions:3]] - bool isUnwrappable() const noexcept { - return isUnwrapped() - || info.state == Recycled - || info.state == Dispatched - ; - } +bool isUnwrappable() const noexcept { + return isUnwrapped() + || info.state == Recycled + || info.state == Dispatched + ; +} - inline bool isDirectlyFetchable() const noexcept { - return info.state == Ready || info.state == Dispatched; - } +inline bool isDirectlyFetchable() const noexcept { + return info.state == Ready || info.state == Dispatched; +} - void free() noexcept { - info.tuple = {0, 0}; - info.type = Blank; - info.state = Acceptor; - info.from = {0, 0}; - info.recycling = Blank; - data = nullptr; - } +void free() noexcept { + info.tuple = {0, 0}; + info.type = Blank; + info.state = Acceptor; + info.from = {0, 0}; + info.recycling = Blank; + data = nullptr; +} - inline bool isFree() const noexcept { - return info.tuple == PartialTuple{0, 0} - && info.type == Blank - && info.state == Acceptor - && info.from.rank == 0 - && info.from.source == 0 - && info.recycling == Blank - && data == nullptr - ; - } +inline bool isFree() const noexcept { + return info.tuple == PartialTuple{0, 0} + && info.type == Blank + && info.state == Acceptor + && info.from.rank == 0 + && info.from.source == 0 + && info.recycling == Blank + && data == nullptr + ; +} // Member functions:3 ends here // [[file:../../atrip.org::*Member functions][Member functions:4]] - inline bool isRecyclable() const noexcept { - return ( info.state == Dispatched - || info.state == Ready - || info.state == Fetch - ) - && hasValidDataPointer() - ; - } +inline bool isRecyclable() const noexcept { + return ( info.state == Dispatched + || info.state == Ready + || info.state == Fetch + ) + && hasValidDataPointer() + ; +} // Member functions:4 ends here // [[file:../../atrip.org::*Member functions][Member functions:5]] - inline bool hasValidDataPointer() const noexcept { - return data != nullptr - && info.state != Acceptor - && info.type != Blank - ; - } +inline bool hasValidDataPointer() const noexcept { + return data != nullptr + && info.state != Acceptor + && info.type != Blank + ; +} // Member functions:5 ends here // [[file:../../atrip.org::*Member functions][Member functions:6]] - void unwrapAndMarkReady() { +void unwrapAndMarkReady() { if (info.state == Ready) return; if (info.state != Dispatched) throw @@ -475,14 +475,14 @@ static Slice& findByInfo(std::vector> &slices, // Member functions:6 ends here // [[file:../../atrip.org::*Epilog][Epilog:1]] - Slice(size_t size_) - : info({}) - , data(nullptr) - , size(size_) - {} +Slice(size_t size_) + : info({}) + , data(nullptr) + , size(size_) + {} - }; // struct Slice +}; // struct Slice // Epilog:1 ends here // [[file:../../atrip.org::*Debug][Debug:1]] diff --git a/include/atrip/Tuples.hpp b/include/atrip/Tuples.hpp index e1691c8..26c9d2b 100644 --- a/include/atrip/Tuples.hpp +++ b/include/atrip/Tuples.hpp @@ -467,31 +467,31 @@ std::vector main(MPI_Comm universe, size_t Nv) { // Main:1 ends here // [[file:../../atrip.org::*Main][Main:2]] - size_t const - tuplesPerRankLocal - = nodeTuples.size() / nodeInfos[rank].ranksPerNode - + size_t(nodeTuples.size() % nodeInfos[rank].ranksPerNode != 0) - ; +size_t const + tuplesPerRankLocal + = nodeTuples.size() / nodeInfos[rank].ranksPerNode + + size_t(nodeTuples.size() % nodeInfos[rank].ranksPerNode != 0) + ; - size_t tuplesPerRankGlobal; +size_t tuplesPerRankGlobal; - MPI_Reduce(&tuplesPerRankLocal, - &tuplesPerRankGlobal, - 1, - MPI_UINT64_T, - MPI_MAX, - 0, - universe); +MPI_Reduce(&tuplesPerRankLocal, + &tuplesPerRankGlobal, + 1, + MPI_UINT64_T, + MPI_MAX, + 0, + universe); - MPI_Bcast(&tuplesPerRankGlobal, - 1, - MPI_UINT64_T, - 0, - universe); +MPI_Bcast(&tuplesPerRankGlobal, + 1, + MPI_UINT64_T, + 0, + universe); - LOG(1,"Atrip") << "Tuples per rank: " << tuplesPerRankGlobal << "\n"; - LOG(1,"Atrip") << "ranks per node " << nodeInfos[rank].ranksPerNode << "\n"; - LOG(1,"Atrip") << "#nodes " << nNodes << "\n"; +LOG(1,"Atrip") << "Tuples per rank: " << tuplesPerRankGlobal << "\n"; +LOG(1,"Atrip") << "ranks per node " << nodeInfos[rank].ranksPerNode << "\n"; +LOG(1,"Atrip") << "#nodes " << nNodes << "\n"; // Main:2 ends here // [[file:../../atrip.org::*Main][Main:3]] @@ -531,7 +531,7 @@ if (computeDistribution) { // Main:4 ends here // [[file:../../atrip.org::*Main][Main:5]] - return result; +return result; } // Main:5 ends here diff --git a/include/atrip/Utils.hpp b/include/atrip/Utils.hpp index a7ba187..39c462f 100644 --- a/include/atrip/Utils.hpp +++ b/include/atrip/Utils.hpp @@ -33,7 +33,7 @@ namespace atrip { // Prolog:1 ends here // [[file:../../atrip.org::*Pretty printing][Pretty printing:1]] - template +template std::string pretty_print(T&& value) { std::stringstream stream; #if ATRIP_DEBUG > 2 diff --git a/src/atrip/Atrip.cxx b/src/atrip/Atrip.cxx index ac87966..9808658 100644 --- a/src/atrip/Atrip.cxx +++ b/src/atrip/Atrip.cxx @@ -20,6 +20,7 @@ #include #include #include +#include using namespace atrip; @@ -28,6 +29,7 @@ template bool RankMap::RANK_ROUND_ROBIN; template bool RankMap::RANK_ROUND_ROBIN; int Atrip::rank; int Atrip::np; +MPI_Comm Atrip::communicator; Timings Atrip::chrono; // user printing block @@ -36,9 +38,10 @@ void atrip::registerIterationDescriptor(IterationDescriptor d) { IterationDescription::descriptor = d; } -void Atrip::init() { - MPI_Comm_rank(MPI_COMM_WORLD, &Atrip::rank); - MPI_Comm_size(MPI_COMM_WORLD, &Atrip::np); +void Atrip::init(MPI_Comm world) { + Atrip::communicator = world; + MPI_Comm_rank(world, &Atrip::rank); + MPI_Comm_size(world, &Atrip::np); } template @@ -46,7 +49,7 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { const int np = Atrip::np; const int rank = Atrip::rank; - MPI_Comm universe = in.ei->wrld->comm; + MPI_Comm universe = Atrip::communicator; const size_t No = in.ei->lens[0]; const size_t Nv = in.ea->lens[0]; @@ -70,10 +73,10 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { RankMap::RANK_ROUND_ROBIN = in.rankRoundRobin; if (RankMap::RANK_ROUND_ROBIN) { - LOG(0,"Atrip") << "Doing rank round robin slices distribution" << "\n"; + LOG(0,"Atrip") << "Doing rank round robin slices distribution\n"; } else { LOG(0,"Atrip") - << "Doing node > local rank round robin slices distribution" << "\n"; + << "Doing node > local rank round robin slices distribution\n"; } @@ -146,7 +149,7 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { const size_t iterationMod = (in.percentageMod > 0) - ? nIterations * in.percentageMod / 100 + ? nIterations * in.percentageMod / 100.0 : in.iterationMod , iteration1Percent = nIterations * 0.01 @@ -300,8 +303,44 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { // START MAIN LOOP ======================================================{{{1 double energy(0.); + size_t first_iteration = 0; + Checkpoint c; + const size_t checkpoint_mod + = in.checkpointAtEveryIteration != 0 + ? in.checkpointAtEveryIteration + : nIterations * in.checkpointAtPercentage / 100; + if (in.readCheckpointIfExists) { + std::ifstream fin(in.checkpointPath); + if (fin.is_open()) { + LOG(0, "Atrip") << "Reading checkpoint from " + << in.checkpointPath << "\n"; + c = read_checkpoint(fin); + first_iteration = (size_t)c.iteration; + if (first_iteration > nIterations) { + // TODO: throw an error here + // first_iteration is bigger than nIterations, + // you probably started the program with a different number + // of cores + } + if (No != c.no) {/* TODO: write warning */} + if (Nv != c.nv) {/* TODO: write warning */} + // TODO write warnings for nrank and so on + if (Atrip::rank == 0) { + // take the negative of the energy to correct for the + // negativity of the equations, the energy in the checkpoint + // should always be the correct physical one. + energy = - (double)c.energy; + } + LOG(0, "Atrip") << "energy from checkpoint " + << energy << "\n"; + LOG(0, "Atrip") << "iteration from checkpoint " + << first_iteration << "\n"; + } + } - for ( size_t i = 0, iteration = 1 + for ( size_t + i = first_iteration, + iteration = first_iteration + 1 ; i < tuplesList.size() ; i++, iteration++ ) { @@ -316,6 +355,23 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { if (in.barrier) MPI_Barrier(universe); )) + // write checkpoints + if (iteration % checkpoint_mod == 0) { + double globalEnergy = 0; + MPI_Reduce(&energy, &globalEnergy, 1, MPI_DOUBLE, MPI_SUM, 0, universe); + Checkpoint out + = {No, + Nv, + 0, // TODO + 0, // TODO + - globalEnergy, + iteration - 1, + in.rankRoundRobin}; + LOG(0, "Atrip") << "Writing checkpoint\n"; + if (Atrip::rank == 0) write_checkpoint(out, in.checkpointPath); + } + + // write reporting if (iteration % iterationMod == 0 || iteration == iteration1Percent) { if (IterationDescription::descriptor) { @@ -363,7 +419,7 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { // COMM FIRST DATABASE ================================================{{{1 - if (i == 0) { + if (i == first_iteration) { WITH_RANK << "__first__:first database ............ \n"; const auto db = communicateDatabase(abc, universe); WITH_RANK << "__first__:first database communicated \n";