Add Checkpoint functionality
This commit is contained in:
parent
be39eeb776
commit
2bf8851962
71
atrip.org
71
atrip.org
@ -3002,6 +3002,11 @@ namespace atrip {
|
||||
ADD_ATTRIBUTE(int, iterationMod, -1)
|
||||
ADD_ATTRIBUTE(int, percentageMod, -1)
|
||||
ADD_ATTRIBUTE(TuplesDistribution, tuplesDistribution, NAIVE)
|
||||
ADD_ATTRIBUTE(std::string, checkpointPath, "atrip-checkpoint.yaml")
|
||||
ADD_ATTRIBUTE(bool, readCheckpointIfExists, true)
|
||||
ADD_ATTRIBUTE(bool, writeCheckpoint, true)
|
||||
ADD_ATTRIBUTE(float, checkpointAtPercentage, 10)
|
||||
ADD_ATTRIBUTE(size_t, checkpointAtEveryIteration, 0)
|
||||
|
||||
};
|
||||
|
||||
@ -3308,8 +3313,44 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
|
||||
// START MAIN LOOP ======================================================{{{1
|
||||
|
||||
double energy(0.);
|
||||
size_t first_iteration = 0;
|
||||
Checkpoint c;
|
||||
const size_t checkpoint_mod
|
||||
= in.checkpointAtEveryIteration != 0
|
||||
? in.checkpointAtEveryIteration
|
||||
: nIterations * in.checkpointAtPercentage / 100;
|
||||
if (in.readCheckpointIfExists) {
|
||||
std::ifstream fin(in.checkpointPath);
|
||||
if (fin.is_open()) {
|
||||
LOG(0, "Atrip") << "Reading checkpoint from "
|
||||
<< in.checkpointPath << "\n";
|
||||
c = read_checkpoint(fin);
|
||||
first_iteration = (size_t)c.iteration;
|
||||
if (first_iteration > nIterations) {
|
||||
// TODO: throw an error here
|
||||
// first_iteration is bigger than nIterations,
|
||||
// you probably started the program with a different number
|
||||
// of cores
|
||||
}
|
||||
if (No != c.no) {/* TODO: write warning */}
|
||||
if (Nv != c.nv) {/* TODO: write warning */}
|
||||
// TODO write warnings for nrank and so on
|
||||
if (Atrip::rank == 0) {
|
||||
// take the negative of the energy to correct for the
|
||||
// negativity of the equations, the energy in the checkpoint
|
||||
// should always be the correct physical one.
|
||||
energy = - (double)c.energy;
|
||||
}
|
||||
LOG(0, "Atrip") << "energy from checkpoint "
|
||||
<< energy << "\n";
|
||||
LOG(0, "Atrip") << "iteration from checkpoint "
|
||||
<< first_iteration << "\n";
|
||||
}
|
||||
}
|
||||
|
||||
for ( size_t i = 0, iteration = 1
|
||||
for ( size_t
|
||||
i = first_iteration,
|
||||
iteration = first_iteration + 1
|
||||
; i < tuplesList.size()
|
||||
; i++, iteration++
|
||||
) {
|
||||
@ -3324,6 +3365,23 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
|
||||
if (in.barrier) MPI_Barrier(universe);
|
||||
))
|
||||
|
||||
// write checkpoints
|
||||
if (iteration % checkpoint_mod == 0) {
|
||||
double globalEnergy = 0;
|
||||
MPI_Reduce(&energy, &globalEnergy, 1, MPI_DOUBLE, MPI_SUM, 0, universe);
|
||||
Checkpoint out
|
||||
= {No,
|
||||
Nv,
|
||||
0, // TODO
|
||||
0, // TODO
|
||||
- globalEnergy,
|
||||
iteration - 1,
|
||||
in.rankRoundRobin};
|
||||
LOG(0, "Atrip") << "Writing checkpoint\n";
|
||||
if (Atrip::rank == 0) write_checkpoint(out, in.checkpointPath);
|
||||
}
|
||||
|
||||
// write reporting
|
||||
if (iteration % iterationMod == 0 || iteration == iteration1Percent) {
|
||||
|
||||
if (IterationDescription::descriptor) {
|
||||
@ -3371,7 +3429,7 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
|
||||
|
||||
|
||||
// COMM FIRST DATABASE ================================================{{{1
|
||||
if (i == 0) {
|
||||
if (i == first_iteration) {
|
||||
WITH_RANK << "__first__:first database ............ \n";
|
||||
const auto db = communicateDatabase(abc, universe);
|
||||
WITH_RANK << "__first__:first database communicated \n";
|
||||
@ -3788,8 +3846,8 @@ void write_checkpoint(Checkpoint const& c, std::string const& filepath) {
|
||||
<< "\n";
|
||||
}
|
||||
|
||||
Checkpoint read_checkpoint(std::string const& filepath) {
|
||||
std::ifstream in(filepath);
|
||||
|
||||
Checkpoint read_checkpoint(std::ifstream& in) {
|
||||
Checkpoint c;
|
||||
// trim chars from the string, to be more sure and not use regexes
|
||||
auto trim = [](std::string& s, std::string const& chars) {
|
||||
@ -3813,6 +3871,11 @@ Checkpoint read_checkpoint(std::string const& filepath) {
|
||||
return c;
|
||||
}
|
||||
|
||||
|
||||
// Convenience overload: opens the file at `filepath` and forwards the
// resulting stream to the std::ifstream overload of read_checkpoint,
// returning whatever that overload produces. The stream is handed over
// without first checking that the file could be opened.
Checkpoint read_checkpoint(std::string const& filepath) {
  std::ifstream checkpointStream(filepath);
  return read_checkpoint(checkpointStream);
}
|
||||
#+end_src
|
||||
|
||||
|
||||
|
||||
@ -16,10 +16,15 @@
|
||||
int main(int argc, char** argv) {
|
||||
MPI_Init(&argc, &argv);
|
||||
|
||||
// 0 selects the percentage-based checkpoint interval. Initialised so that an
// omitted --checkpoint-it option does not forward an indeterminate value.
size_t checkpoint_it = 0;
|
||||
int no(10), nv(10), itMod(-1), percentageMod(10);
|
||||
bool nochrono(false), barrier(false), rankRoundRobin(false),
|
||||
keepVppph(false);
|
||||
std::string tuplesDistributionString = "naive";
|
||||
// Same value as the default of the checkpointAtPercentage input attribute.
// Initialised so that an omitted --checkpoint-% option does not forward an
// indeterminate value.
float checkpoint_percentage = 10;
|
||||
bool
|
||||
nochrono(false), barrier(false), rankRoundRobin(false),
|
||||
keepVppph(false),
|
||||
noCheckpoint = false;
|
||||
std::string tuplesDistributionString = "naive",
|
||||
checkpoint_path = "checkpoint.yaml";
|
||||
|
||||
CLI::App app{"Main bench for atrip"};
|
||||
app.add_option("--no", no, "Occupied orbitals");
|
||||
@ -31,6 +36,14 @@ int main(int argc, char** argv) {
|
||||
app.add_flag("--barrier", barrier, "Use the first barrier");
|
||||
app.add_option("--dist", tuplesDistributionString, "Which distribution");
|
||||
app.add_option("-%", percentageMod, "Percentage to be printed");
|
||||
// checkpointing
|
||||
app.add_flag("--nocheckpoint", noCheckpoint, "Do not use checkpoint");
|
||||
app.add_option("--checkpoint-path", checkpoint_path, "Path for checkpoint");
|
||||
app.add_option("--checkpoint-it",
|
||||
checkpoint_it, "Checkpoint at every iteration");
|
||||
app.add_option("--checkpoint-%",
|
||||
checkpoint_percentage,
|
||||
"Percentage for checkpoints");
|
||||
|
||||
CLI11_PARSE(app, argc, argv);
|
||||
|
||||
@ -74,9 +87,11 @@ int main(int argc, char** argv) {
|
||||
atrip::Atrip::Input<double>::TuplesDistribution tuplesDistribution;
|
||||
{ using atrip::Atrip;
|
||||
if (tuplesDistributionString == "naive") {
|
||||
tuplesDistribution = Atrip::Input<double>::TuplesDistribution::NAIVE;
|
||||
tuplesDistribution
|
||||
= Atrip::Input<double>::TuplesDistribution::NAIVE;
|
||||
} else if (tuplesDistributionString == "group") {
|
||||
tuplesDistribution = Atrip::Input<double>::TuplesDistribution::GROUP_AND_SORT;
|
||||
tuplesDistribution
|
||||
= Atrip::Input<double>::TuplesDistribution::GROUP_AND_SORT;
|
||||
} else {
|
||||
std::cout << "--dist should be either naive or group\n";
|
||||
exit(1);
|
||||
@ -134,6 +149,11 @@ int main(int argc, char** argv) {
|
||||
.with_iterationMod(itMod)
|
||||
.with_percentageMod(percentageMod)
|
||||
.with_tuplesDistribution(tuplesDistribution)
|
||||
// checkpoint options
|
||||
.with_checkpointAtEveryIteration(checkpoint_it)
|
||||
.with_checkpointAtPercentage(checkpoint_percentage)
|
||||
.with_checkpointPath(checkpoint_path)
|
||||
.with_readCheckpointIfExists(!noCheckpoint)
|
||||
;
|
||||
|
||||
auto out = atrip::Atrip::run(in);
|
||||
|
||||
15
config.el
15
config.el
@ -8,13 +8,17 @@
|
||||
(defun atrip-print-sources ()
|
||||
(princ (string-join atrip-sources " ")))
|
||||
|
||||
(defvar atrip-include-f "include/atrip") ;; TODO: create defvar
|
||||
(defvar atrip-src-f "src/atrip") ;; TODO: create defvar
|
||||
(defvar atrip-include-f "include/atrip")
|
||||
(defvar atrip-src-f "src/atrip")
|
||||
(defvar atrip-test-d "test")
|
||||
|
||||
(defmacro atrip-def (name body) `(progn (defun ,name () ,body)
|
||||
(defmacro atrip-def (name body)
|
||||
`(progn (defun ,name () ,body)
|
||||
(push (,name) atrip-sources)))
|
||||
|
||||
|
||||
(defmacro atrip-def-test (name body)
  "Expand to an `atrip-def' of NAME whose body joins BODY onto `atrip-test-d'."
  (list 'atrip-def name (list 'f-join 'atrip-test-d body)))
|
||||
(defmacro atrip-def-src (name body)
  "Expand to an `atrip-def' of NAME whose body joins BODY onto `atrip-src-f'."
  (list 'atrip-def name (list 'f-join 'atrip-src-f body)))
|
||||
(defmacro atrip-def-hdr (name body)
|
||||
@ -30,15 +34,20 @@
|
||||
(atrip-def-hdr atrip-tuples-h "Tuples.hpp")
|
||||
(atrip-def-hdr atrip-equations-h "Equations.hpp")
|
||||
(atrip-def-hdr atrip-debug-h "Debug.hpp")
|
||||
(atrip-def-hdr atrip-checkpoint-h "Checkpoint.hpp")
|
||||
|
||||
(atrip-def-hdr atrip-atrip-h "Atrip.hpp")
|
||||
(atrip-def-src atrip-atrip-cxx "Atrip.cxx")
|
||||
|
||||
(atrip-def atrip-main-h "include/atrip.hpp")
|
||||
|
||||
;; main test
|
||||
(atrip-def-test atrip-test-main "main.cxx")
|
||||
|
||||
(defvar atrip-root-directory (file-name-directory load-file-name))
|
||||
(defvar license-path (format "%s/LICENSE-HEADER" atrip-root-directory))
|
||||
|
||||
;; add local hook for license headers
|
||||
(add-hook 'org-babel-post-tangle-hook
|
||||
(lambda ()
|
||||
(goto-char (point-min))
|
||||
|
||||
77
test/main.cxx
Normal file
77
test/main.cxx
Normal file
@ -0,0 +1,77 @@
|
||||
// Copyright 2022 Alejandro Gallo
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// [[file:../atrip.org::*Tests][Tests:1]]
|
||||
#include <atrip.hpp>
|
||||
#include <cassert>
|
||||
// [[[[file:~/software/atrip/atrip.org::*Tests][Tests]]][]]
|
||||
#include <atrip/Checkpoint.hpp>
|
||||
using namespace atrip;
|
||||
// ends here
|
||||
|
||||
|
||||
|
||||
// Runs a group of statements as a named test case. The title is written to
// stdout behind an ANSI-coloured "->" marker and flushed with std::endl, then
// the statements given as the remaining macro arguments are executed inside
// their own brace-delimited scope.
#define TESTCASE(title, ...) {                                 \
    std::cout << "\x1b[35m-> \x1b[0m" << title << std::endl;   \
    __VA_ARGS__                                                \
  }
|
||||
|
||||
int main() {
|
||||
|
||||
// [[[[file:~/software/atrip/atrip.org::*Tests][Tests]]][]]
|
||||
#define _CMP_CHECK(what) \
|
||||
std::cout << "\t Checking " << #what << std::endl; \
|
||||
assert(in.what == what); \
|
||||
assert(out.what == what);
|
||||
|
||||
TESTCASE("Testing checkpoint reader and writers",
|
||||
const std::string out_checkpoint = "/tmp/checkpoint.yaml";
|
||||
const double energy = -1.493926352289995443;
|
||||
const size_t no = 154, nv = 1500, nranks = 48*10, nnodes = 10;
|
||||
const size_t iteration = 546;
|
||||
std::cout << "\twriting to " << out_checkpoint << std::endl;
|
||||
|
||||
for (bool rankRoundRobin: {true, false}) {
|
||||
atrip::Checkpoint out = {no,
|
||||
nv,
|
||||
nranks,
|
||||
nnodes,
|
||||
energy,
|
||||
iteration,
|
||||
rankRoundRobin}, in;
|
||||
|
||||
|
||||
write_checkpoint(out, out_checkpoint);
|
||||
in = read_checkpoint(out_checkpoint);
|
||||
|
||||
_CMP_CHECK(no);
|
||||
_CMP_CHECK(nv);
|
||||
_CMP_CHECK(nranks);
|
||||
_CMP_CHECK(nnodes);
|
||||
_CMP_CHECK(iteration);
|
||||
_CMP_CHECK(rankRoundRobin);
|
||||
_CMP_CHECK(energy);
|
||||
}
|
||||
|
||||
|
||||
)
|
||||
#undef _CMP_CHECK
|
||||
|
||||
// ends here
|
||||
|
||||
return 0;
|
||||
}
|
||||
// Tests:1 ends here
|
||||
Loading…
Reference in New Issue
Block a user