From 2bf885196204151e81beb3b296aa3bce1e649ae6 Mon Sep 17 00:00:00 2001 From: Alejandro Gallo Date: Fri, 6 May 2022 13:52:33 +0200 Subject: [PATCH] Add Checkpoint functionality --- atrip.org | 71 ++++++++++++++++++++++++++++++++++++++--- bench/test_main.cxx | 30 +++++++++++++++--- config.el | 17 +++++++--- test/main.cxx | 77 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 182 insertions(+), 13 deletions(-) create mode 100644 test/main.cxx diff --git a/atrip.org b/atrip.org index 931b3b6..b08ef4f 100644 --- a/atrip.org +++ b/atrip.org @@ -3002,6 +3002,11 @@ namespace atrip { ADD_ATTRIBUTE(int, iterationMod, -1) ADD_ATTRIBUTE(int, percentageMod, -1) ADD_ATTRIBUTE(TuplesDistribution, tuplesDistribution, NAIVE) + ADD_ATTRIBUTE(std::string, checkpointPath, "atrip-checkpoint.yaml") + ADD_ATTRIBUTE(bool, readCheckpointIfExists, true) + ADD_ATTRIBUTE(bool, writeCheckpoint, true) + ADD_ATTRIBUTE(float, checkpointAtPercentage, 10) + ADD_ATTRIBUTE(size_t, checkpointAtEveryIteration, 0) }; @@ -3308,8 +3313,44 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { // START MAIN LOOP ======================================================{{{1 double energy(0.); + size_t first_iteration = 0; + Checkpoint c; + const size_t checkpoint_mod /* iterations between checkpoints: the explicit count if non-zero, otherwise a percentage of nIterations */ + = in.checkpointAtEveryIteration != 0 + ? 
in.checkpointAtEveryIteration + : nIterations * in.checkpointAtPercentage / 100; + if (in.readCheckpointIfExists) { + std::ifstream fin(in.checkpointPath); + if (fin.is_open()) { + LOG(0, "Atrip") << "Reading checkpoint from " + << in.checkpointPath << "\n"; + c = read_checkpoint(fin); + first_iteration = (size_t)c.iteration; + if (first_iteration > nIterations) { + // TODO: throw an error here + // first_iteration is bigger than nIterations, + // you probably started the program with a different number + // of cores + } + if (No != c.no) {/* TODO: write warning */} + if (Nv != c.nv) {/* TODO: write warning */} + // TODO write warnings for nrank and so on + if (Atrip::rank == 0) { + // take the negative of the energy to correct for the + // negativity of the equations, the energy in the checkpoint + // should always be the correct physical one. + energy = - (double)c.energy; + } + LOG(0, "Atrip") << "energy from checkpoint " + << energy << "\n"; + LOG(0, "Atrip") << "iteration from checkpoint " + << first_iteration << "\n"; + } + } - for ( size_t i = 0, iteration = 1 + for ( size_t + i = first_iteration, + iteration = first_iteration + 1 ; i < tuplesList.size() ; i++, iteration++ ) { @@ -3324,6 +3365,23 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { if (in.barrier) MPI_Barrier(universe); )) + // write checkpoints; skipped when writing is disabled or when checkpoint_mod is 0 (a modulo by zero is undefined) + if (in.writeCheckpoint && checkpoint_mod != 0 && iteration % checkpoint_mod == 0) { + double globalEnergy = 0; + MPI_Reduce(&energy, &globalEnergy, 1, MPI_DOUBLE, MPI_SUM, 0, universe); + Checkpoint out + = {No, + Nv, + 0, // TODO + 0, // TODO + - globalEnergy, + iteration - 1, + in.rankRoundRobin}; + LOG(0, "Atrip") << "Writing checkpoint\n"; + if (Atrip::rank == 0) write_checkpoint(out, in.checkpointPath); + } + + // write reporting if (iteration % iterationMod == 0 || iteration == iteration1Percent) { if (IterationDescription::descriptor) { @@ -3371,7 +3429,7 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { // COMM FIRST DATABASE ================================================{{{1 - 
if (i == 0) { + if (i == first_iteration) { WITH_RANK << "__first__:first database ............ \n"; const auto db = communicateDatabase(abc, universe); WITH_RANK << "__first__:first database communicated \n"; @@ -3788,8 +3846,8 @@ void write_checkpoint(Checkpoint const& c, std::string const& filepath) { << "\n"; } -Checkpoint read_checkpoint(std::string const& filepath) { - std::ifstream in(filepath); + +Checkpoint read_checkpoint(std::ifstream& in) { Checkpoint c; // trim chars from the string, to be more sure and not use regexes auto trim = [](std::string& s, std::string const& chars) { @@ -3813,6 +3871,11 @@ Checkpoint read_checkpoint(std::string const& filepath) { return c; } + +Checkpoint read_checkpoint(std::string const& filepath) { + std::ifstream in(filepath); + return read_checkpoint(in); +} #+end_src diff --git a/bench/test_main.cxx b/bench/test_main.cxx index a4e058e..f23ab37 100644 --- a/bench/test_main.cxx +++ b/bench/test_main.cxx @@ -16,10 +16,15 @@ int main(int argc, char** argv) { MPI_Init(&argc, &argv); + size_t checkpoint_it = 0; /* 0 selects the percentage-based interval; initialized because it is forwarded even when --checkpoint-it is absent */ int no(10), nv(10), itMod(-1), percentageMod(10); - bool nochrono(false), barrier(false), rankRoundRobin(false), - keepVppph(false); - std::string tuplesDistributionString = "naive"; + float checkpoint_percentage = 10; /* same value as the checkpointAtPercentage attribute default */ + bool + nochrono(false), barrier(false), rankRoundRobin(false), + keepVppph(false), + noCheckpoint = false; + std::string tuplesDistributionString = "naive", + checkpoint_path = "checkpoint.yaml"; CLI::App app{"Main bench for atrip"}; app.add_option("--no", no, "Occupied orbitals"); @@ -31,6 +36,14 @@ int main(int argc, char** argv) { app.add_flag("--barrier", barrier, "Use the first barrier"); app.add_option("--dist", tuplesDistributionString, "Which distribution"); app.add_option("-%", percentageMod, "Percentage to be printed"); + // checkpointing + app.add_flag("--nocheckpoint", noCheckpoint, "Do not use checkpoint"); + app.add_option("--checkpoint-path", checkpoint_path, "Path for checkpoint"); + 
app.add_option("--checkpoint-it", + checkpoint_it, "Checkpoint at every iteration"); + app.add_option("--checkpoint-%", + checkpoint_percentage, + "Percentage for checkpoints"); CLI11_PARSE(app, argc, argv); @@ -74,9 +87,11 @@ int main(int argc, char** argv) { atrip::Atrip::Input::TuplesDistribution tuplesDistribution; { using atrip::Atrip; if (tuplesDistributionString == "naive") { - tuplesDistribution = Atrip::Input::TuplesDistribution::NAIVE; + tuplesDistribution + = Atrip::Input::TuplesDistribution::NAIVE; } else if (tuplesDistributionString == "group") { - tuplesDistribution = Atrip::Input::TuplesDistribution::GROUP_AND_SORT; + tuplesDistribution + = Atrip::Input::TuplesDistribution::GROUP_AND_SORT; } else { std::cout << "--dist should be either naive or group\n"; exit(1); @@ -134,6 +149,11 @@ int main(int argc, char** argv) { .with_iterationMod(itMod) .with_percentageMod(percentageMod) .with_tuplesDistribution(tuplesDistribution) + // checkpoint options -- NOTE(review): --nocheckpoint is only forwarded to readCheckpointIfExists; writeCheckpoint is not set here + .with_checkpointAtEveryIteration(checkpoint_it) + .with_checkpointAtPercentage(checkpoint_percentage) + .with_checkpointPath(checkpoint_path) + .with_readCheckpointIfExists(!noCheckpoint) ; auto out = atrip::Atrip::run(in); diff --git a/config.el b/config.el index 29e65b2..92c458b 100644 --- a/config.el +++ b/config.el @@ -8,13 +8,17 @@ (defun atrip-print-sources () (princ (string-join atrip-sources " "))) -(defvar atrip-include-f "include/atrip") ;; TODO: create defvar -(defvar atrip-src-f "src/atrip") ;; TODO: create defvar +(defvar atrip-include-f "include/atrip") +(defvar atrip-src-f "src/atrip") +(defvar atrip-test-d "test") -(defmacro atrip-def (name body) `(progn (defun ,name () ,body) - (push (,name) atrip-sources))) +(defmacro atrip-def (name body) + `(progn (defun ,name () ,body) + (push (,name) atrip-sources))) +(defmacro atrip-def-test (name body) + `(atrip-def ,name (f-join atrip-test-d ,body))) (defmacro atrip-def-src (name body) `(atrip-def ,name (f-join atrip-src-f ,body))) (defmacro 
atrip-def-hdr (name body) @@ -30,15 +34,20 @@ (atrip-def-hdr atrip-tuples-h "Tuples.hpp") (atrip-def-hdr atrip-equations-h "Equations.hpp") (atrip-def-hdr atrip-debug-h "Debug.hpp") +(atrip-def-hdr atrip-checkpoint-h "Checkpoint.hpp") (atrip-def-hdr atrip-atrip-h "Atrip.hpp") (atrip-def-src atrip-atrip-cxx "Atrip.cxx") (atrip-def atrip-main-h "include/atrip.hpp") +;; main test +(atrip-def-test atrip-test-main "main.cxx") + (defvar atrip-root-directory (file-name-directory load-file-name)) (defvar license-path (format "%s/LICENSE-HEADER" atrip-root-directory)) +;; add local hook for license headers (add-hook 'org-babel-post-tangle-hook (lambda () (goto-char (point-min)) diff --git a/test/main.cxx b/test/main.cxx new file mode 100644 index 0000000..2709ded --- /dev/null +++ b/test/main.cxx @@ -0,0 +1,77 @@ +// Copyright 2022 Alejandro Gallo +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// [[file:../atrip.org::*Tests][Tests:1]] +#include <cassert> +#include <iostream> +// [[[[file:~/software/atrip/atrip.org::*Tests][Tests]]][]] +#include <atrip.hpp> +using namespace atrip; +// ends here + + + +#define TESTCASE(_name, ...) 
{ \ + std::cout << "\x1b[35m-> \x1b[0m" \ + << _name \ + << std::endl; \ + __VA_ARGS__ \ + } + +int main() { /* Writes a Checkpoint to disk, reads it back and asserts that every field matches, once per rankRoundRobin value. */ + + // [[[[file:~/software/atrip/atrip.org::*Tests][Tests]]][]] + #define _CMP_CHECK(what) \ + std::cout << "\t Checking " << #what << std::endl; \ + assert(in.what == what); \ + assert(out.what == what); + + TESTCASE("Testing checkpoint reader and writers", + const std::string out_checkpoint = "/tmp/checkpoint.yaml"; + const double energy = -1.493926352289995443; + const size_t no = 154, nv = 1500, nranks = 48*10, nnodes = 10; + const size_t iteration = 546; + std::cout << "\twriting to " << out_checkpoint << std::endl; + + for (bool rankRoundRobin: {true, false}) { + atrip::Checkpoint out = {no, + nv, + nranks, + nnodes, + energy, + iteration, + rankRoundRobin}, in; + + + write_checkpoint(out, out_checkpoint); + in = read_checkpoint(out_checkpoint); + + _CMP_CHECK(no); + _CMP_CHECK(nv); + _CMP_CHECK(nranks); + _CMP_CHECK(nnodes); + _CMP_CHECK(iteration); + _CMP_CHECK(rankRoundRobin); + _CMP_CHECK(energy); /* NOTE(review): exact == on a double; presumably relies on write_checkpoint emitting enough digits to round-trip -- confirm */ + } + + + ) + #undef _CMP_CHECK + + // ends here + + return 0; +} +// Tests:1 ends here