Add Checkpoint functionality

This commit is contained in:
Alejandro Gallo 2022-05-06 13:52:33 +02:00
parent be39eeb776
commit 2bf8851962
4 changed files with 182 additions and 13 deletions

View File

@ -3002,6 +3002,11 @@ namespace atrip {
ADD_ATTRIBUTE(int, iterationMod, -1)
ADD_ATTRIBUTE(int, percentageMod, -1)
ADD_ATTRIBUTE(TuplesDistribution, tuplesDistribution, NAIVE)
ADD_ATTRIBUTE(std::string, checkpointPath, "atrip-checkpoint.yaml")
ADD_ATTRIBUTE(bool, readCheckpointIfExists, true)
ADD_ATTRIBUTE(bool, writeCheckpoint, true)
ADD_ATTRIBUTE(float, checkpointAtPercentage, 10)
ADD_ATTRIBUTE(size_t, checkpointAtEveryIteration, 0)
};
@ -3308,8 +3313,44 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
// START MAIN LOOP ======================================================{{{1
double energy(0.);
size_t first_iteration = 0;
Checkpoint c;
// Number of iterations between two consecutive checkpoints.
// An explicit iteration count (checkpointAtEveryIteration != 0) takes
// precedence; otherwise the interval is derived from the percentage of the
// total number of iterations.
// The percentage-derived interval is clamped to at least 1: for short runs
// nIterations * percentage / 100 truncates to 0 and the later
// `iteration % checkpoint_mod` would divide by zero.
// Both branches of the conditional are size_t, so the explicit iteration
// count is no longer routed through a float conversion.
const size_t checkpoint_mod_from_percentage
  = static_cast<size_t>(nIterations * in.checkpointAtPercentage / 100);
const size_t checkpoint_mod
  = in.checkpointAtEveryIteration != 0
  ? in.checkpointAtEveryIteration
  : (checkpoint_mod_from_percentage != 0
     ? checkpoint_mod_from_percentage
     : 1);
// Resume from an existing checkpoint when requested and when the checkpoint
// file can be opened; otherwise first_iteration and energy keep their
// initial values and the run starts from scratch.
if (in.readCheckpointIfExists) {
  std::ifstream fin(in.checkpointPath);
  if (fin.is_open()) {
    LOG(0, "Atrip") << "Reading checkpoint from "
                    << in.checkpointPath << "\n";
    c = read_checkpoint(fin);
    first_iteration = static_cast<size_t>(c.iteration);
    if (first_iteration > nIterations) {
      // TODO: throw an error here
      // first_iteration is bigger than nIterations,
      // you probably started the program with a different number
      // of cores
      LOG(0, "Atrip") << "WARNING: checkpoint iteration "
                      << first_iteration
                      << " is bigger than the number of iterations "
                      << nIterations
                      << ", the program was probably started with a"
                      << " different number of cores\n";
    }
    // The checkpoint should describe the same system as the current run,
    // make any mismatch visible instead of silently accepting it.
    if (No != c.no) {
      LOG(0, "Atrip") << "WARNING: No in checkpoint (" << c.no
                      << ") differs from current No (" << No << ")\n";
    }
    if (Nv != c.nv) {
      LOG(0, "Atrip") << "WARNING: Nv in checkpoint (" << c.nv
                      << ") differs from current Nv (" << Nv << ")\n";
    }
    // TODO write warnings for nrank and so on
    // Only rank 0 takes over the checkpointed energy; the per-rank energies
    // are summed with MPI_SUM when a checkpoint is written, so presumably
    // seeding it on every rank would count it several times.
    if (Atrip::rank == 0) {
      // take the negative of the energy to correct for the
      // negativity of the equations, the energy in the checkpoint
      // should always be the correct physical one.
      energy = - static_cast<double>(c.energy);
    }
    LOG(0, "Atrip") << "energy from checkpoint "
                    << energy << "\n";
    LOG(0, "Atrip") << "iteration from checkpoint "
                    << first_iteration << "\n";
  }
}
for ( size_t i = 0, iteration = 1
for ( size_t
i = first_iteration,
iteration = first_iteration + 1
; i < tuplesList.size()
; i++, iteration++
) {
@ -3324,6 +3365,23 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
if (in.barrier) MPI_Barrier(universe);
))
// write checkpoints
// Every checkpoint_mod iterations the per-rank energies are summed onto
// rank 0, which then writes the checkpoint file.
// The step is skipped entirely when the input disables checkpoint writing
// (in.writeCheckpoint); the flag defaults to true, so default behaviour is
// unchanged.
if (in.writeCheckpoint && iteration % checkpoint_mod == 0) {
  double globalEnergy = 0;
  MPI_Reduce(&energy, &globalEnergy, 1, MPI_DOUBLE, MPI_SUM, 0, universe);
  // the sign is flipped so that the stored energy is the physical one
  Checkpoint out
    = {No,
       Nv,
       0, // TODO: nranks
       0, // TODO: nnodes
       - globalEnergy,
       iteration - 1,
       in.rankRoundRobin};
  LOG(0, "Atrip") << "Writing checkpoint\n";
  if (Atrip::rank == 0) write_checkpoint(out, in.checkpointPath);
}
// write reporting
if (iteration % iterationMod == 0 || iteration == iteration1Percent) {
if (IterationDescription::descriptor) {
@ -3371,7 +3429,7 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
// COMM FIRST DATABASE ================================================{{{1
if (i == 0) {
if (i == first_iteration) {
WITH_RANK << "__first__:first database ............ \n";
const auto db = communicateDatabase(abc, universe);
WITH_RANK << "__first__:first database communicated \n";
@ -3788,8 +3846,8 @@ void write_checkpoint(Checkpoint const& c, std::string const& filepath) {
<< "\n";
}
Checkpoint read_checkpoint(std::string const& filepath) {
std::ifstream in(filepath);
Checkpoint read_checkpoint(std::ifstream& in) {
Checkpoint c;
// trim chars from the string, to be more sure and not use regexes
auto trim = [](std::string& s, std::string const& chars) {
@ -3813,6 +3871,11 @@ Checkpoint read_checkpoint(std::string const& filepath) {
return c;
}
// Convenience overload: opens the file at `filepath` and parses it with the
// stream-based read_checkpoint. The stream is handed over as is, whether the
// file could be opened is not checked here.
Checkpoint read_checkpoint(std::string const& filepath) {
  std::ifstream checkpoint_stream{filepath};
  return read_checkpoint(checkpoint_stream);
}
#+end_src

View File

@ -16,10 +16,15 @@
int main(int argc, char** argv) {
MPI_Init(&argc, &argv);
// 0 means "derive the checkpoint interval from the percentage" (same default
// as Input::checkpointAtEveryIteration); must be initialised because the
// option parser leaves it untouched when --checkpoint-it is not given.
size_t checkpoint_it = 0;
int no(10), nv(10), itMod(-1), percentageMod(10);
bool nochrono(false), barrier(false), rankRoundRobin(false),
keepVppph(false);
std::string tuplesDistributionString = "naive";
// Same default as Input::checkpointAtPercentage; must be initialised because
// the option parser leaves it untouched when --checkpoint-% is not given.
float checkpoint_percentage = 10;
bool
nochrono(false), barrier(false), rankRoundRobin(false),
keepVppph(false),
noCheckpoint = false;
std::string tuplesDistributionString = "naive",
checkpoint_path = "checkpoint.yaml";
CLI::App app{"Main bench for atrip"};
app.add_option("--no", no, "Occupied orbitals");
@ -31,6 +36,14 @@ int main(int argc, char** argv) {
app.add_flag("--barrier", barrier, "Use the first barrier");
app.add_option("--dist", tuplesDistributionString, "Which distribution");
app.add_option("-%", percentageMod, "Percentage to be printed");
// checkpointing
app.add_flag("--nocheckpoint", noCheckpoint, "Do not use checkpoint");
app.add_option("--checkpoint-path", checkpoint_path, "Path for checkpoint");
app.add_option("--checkpoint-it",
checkpoint_it, "Checkpoint at every iteration");
app.add_option("--checkpoint-%",
checkpoint_percentage,
"Percentage for checkpoints");
CLI11_PARSE(app, argc, argv);
@ -74,9 +87,11 @@ int main(int argc, char** argv) {
atrip::Atrip::Input<double>::TuplesDistribution tuplesDistribution;
{ using atrip::Atrip;
if (tuplesDistributionString == "naive") {
tuplesDistribution = Atrip::Input<double>::TuplesDistribution::NAIVE;
tuplesDistribution
= Atrip::Input<double>::TuplesDistribution::NAIVE;
} else if (tuplesDistributionString == "group") {
tuplesDistribution = Atrip::Input<double>::TuplesDistribution::GROUP_AND_SORT;
tuplesDistribution
= Atrip::Input<double>::TuplesDistribution::GROUP_AND_SORT;
} else {
std::cout << "--dist should be either naive or group\n";
exit(1);
@ -134,6 +149,11 @@ int main(int argc, char** argv) {
.with_iterationMod(itMod)
.with_percentageMod(percentageMod)
.with_tuplesDistribution(tuplesDistribution)
// checkpoint options
.with_checkpointAtEveryIteration(checkpoint_it)
.with_checkpointAtPercentage(checkpoint_percentage)
.with_checkpointPath(checkpoint_path)
.with_readCheckpointIfExists(!noCheckpoint)
;
auto out = atrip::Atrip::run(in);

View File

@ -8,13 +8,17 @@
;; Print all entries of `atrip-sources' on one line, separated by single
;; spaces.
(defun atrip-print-sources ()
(princ (string-join atrip-sources " ")))
(defvar atrip-include-f "include/atrip") ;; TODO: create defvar
(defvar atrip-src-f "src/atrip") ;; TODO: create defvar
(defvar atrip-include-f "include/atrip")
(defvar atrip-src-f "src/atrip")
(defvar atrip-test-d "test")
(defmacro atrip-def (name body) `(progn (defun ,name () ,body)
;; Define a zero-argument function NAME that evaluates BODY, and push the
;; value returned by calling NAME onto `atrip-sources'.
(defmacro atrip-def (name body)
`(progn (defun ,name () ,body)
(push (,name) atrip-sources)))
;; Like `atrip-def', but BODY is joined onto the test directory
;; `atrip-test-d' with `f-join'.
(defmacro atrip-def-test (name body)
`(atrip-def ,name (f-join atrip-test-d ,body)))
;; Like `atrip-def', but BODY is joined onto the source directory
;; `atrip-src-f' with `f-join'.
(defmacro atrip-def-src (name body)
`(atrip-def ,name (f-join atrip-src-f ,body)))
(defmacro atrip-def-hdr (name body)
@ -30,15 +34,20 @@
(atrip-def-hdr atrip-tuples-h "Tuples.hpp")
(atrip-def-hdr atrip-equations-h "Equations.hpp")
(atrip-def-hdr atrip-debug-h "Debug.hpp")
(atrip-def-hdr atrip-checkpoint-h "Checkpoint.hpp")
(atrip-def-hdr atrip-atrip-h "Atrip.hpp")
(atrip-def-src atrip-atrip-cxx "Atrip.cxx")
(atrip-def atrip-main-h "include/atrip.hpp")
;; main test
(atrip-def-test atrip-test-main "main.cxx")
(defvar atrip-root-directory (file-name-directory load-file-name))
(defvar license-path (format "%s/LICENSE-HEADER" atrip-root-directory))
;; add local hook for license headers
(add-hook 'org-babel-post-tangle-hook
(lambda ()
(goto-char (point-min))

77
test/main.cxx Normal file
View File

@ -0,0 +1,77 @@
// Copyright 2022 Alejandro Gallo
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// [[file:../atrip.org::*Tests][Tests:1]]
#include <atrip.hpp>
#include <cassert>
// [[[[file:~/software/atrip/atrip.org::*Tests][Tests]]][]]
#include <atrip/Checkpoint.hpp>
using namespace atrip;
// ends here
// TESTCASE(name, statements...): prints a coloured "-> name" banner to
// std::cout and then runs the given statements inside their own brace scope.
// The statements are passed as variadic macro arguments, so top-level commas
// in them are re-inserted by __VA_ARGS__.
#define TESTCASE(_name, ...) { \
std::cout << "\x1b[35m-> \x1b[0m" \
<< _name \
<< std::endl; \
__VA_ARGS__ \
}
// Round-trip test for the checkpoint writer and reader: a Checkpoint is
// written to a file, read back, and every field of both the written and the
// re-read object is compared against the reference values.
// NOTE: the checks are plain assert()s, so they are compiled out when the
// test is built with NDEBUG.
int main() {
// [[[[file:~/software/atrip/atrip.org::*Tests][Tests]]][]]
// ATRIP_TEST_CMP_CHECK(field): compares `in.field` and `out.field` with the
// local reference variable of the same name. The name avoids the reserved
// "_Uppercase" identifier form and the body is wrapped in do/while so the
// macro expands to exactly one statement.
#define ATRIP_TEST_CMP_CHECK(what) \
  do { \
    std::cout << "\t Checking " << #what << std::endl; \
    assert(in.what == what); \
    assert(out.what == what); \
  } while (0)
  TESTCASE("Testing checkpoint reader and writers",
    const std::string out_checkpoint = "/tmp/checkpoint.yaml";
    const double energy = -1.493926352289995443;
    const size_t no = 154, nv = 1500, nranks = 48*10, nnodes = 10;
    const size_t iteration = 546;
    std::cout << "\twriting to " << out_checkpoint << std::endl;
    // exercise both values of the boolean field
    for (bool rankRoundRobin: {true, false}) {
      atrip::Checkpoint out = {no,
                               nv,
                               nranks,
                               nnodes,
                               energy,
                               iteration,
                               rankRoundRobin}, in;
      write_checkpoint(out, out_checkpoint);
      in = read_checkpoint(out_checkpoint);
      ATRIP_TEST_CMP_CHECK(no);
      ATRIP_TEST_CMP_CHECK(nv);
      ATRIP_TEST_CMP_CHECK(nranks);
      ATRIP_TEST_CMP_CHECK(nnodes);
      ATRIP_TEST_CMP_CHECK(iteration);
      ATRIP_TEST_CMP_CHECK(rankRoundRobin);
      // NOTE(review): exact comparison assumes the writer emits enough
      // digits for a lossless double round trip -- confirm.
      ATRIP_TEST_CMP_CHECK(energy);
    }
  )
#undef ATRIP_TEST_CMP_CHECK
// ends here
  return 0;
}
// Tests:1 ends here