Add Checkpoint functionality
This commit is contained in:
parent
be39eeb776
commit
2bf8851962
71
atrip.org
71
atrip.org
@ -3002,6 +3002,11 @@ namespace atrip {
|
|||||||
ADD_ATTRIBUTE(int, iterationMod, -1)
|
ADD_ATTRIBUTE(int, iterationMod, -1)
|
||||||
ADD_ATTRIBUTE(int, percentageMod, -1)
|
ADD_ATTRIBUTE(int, percentageMod, -1)
|
||||||
ADD_ATTRIBUTE(TuplesDistribution, tuplesDistribution, NAIVE)
|
ADD_ATTRIBUTE(TuplesDistribution, tuplesDistribution, NAIVE)
|
||||||
|
ADD_ATTRIBUTE(std::string, checkpointPath, "atrip-checkpoint.yaml")
|
||||||
|
ADD_ATTRIBUTE(bool, readCheckpointIfExists, true)
|
||||||
|
ADD_ATTRIBUTE(bool, writeCheckpoint, true)
|
||||||
|
ADD_ATTRIBUTE(float, checkpointAtPercentage, 10)
|
||||||
|
ADD_ATTRIBUTE(size_t, checkpointAtEveryIteration, 0)
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -3308,8 +3313,44 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
|
|||||||
// START MAIN LOOP ======================================================{{{1
|
// START MAIN LOOP ======================================================{{{1
|
||||||
|
|
||||||
double energy(0.);
|
double energy(0.);
|
||||||
|
size_t first_iteration = 0;
|
||||||
|
Checkpoint c;
|
||||||
|
const size_t checkpoint_mod
|
||||||
|
= in.checkpointAtEveryIteration != 0
|
||||||
|
? in.checkpointAtEveryIteration
|
||||||
|
: nIterations * in.checkpointAtPercentage / 100;
|
||||||
|
if (in.readCheckpointIfExists) {
|
||||||
|
std::ifstream fin(in.checkpointPath);
|
||||||
|
if (fin.is_open()) {
|
||||||
|
LOG(0, "Atrip") << "Reading checkpoint from "
|
||||||
|
<< in.checkpointPath << "\n";
|
||||||
|
c = read_checkpoint(fin);
|
||||||
|
first_iteration = (size_t)c.iteration;
|
||||||
|
if (first_iteration > nIterations) {
|
||||||
|
// TODO: throw an error here
|
||||||
|
// first_iteration is bigger than nIterations,
|
||||||
|
// you probably started the program with a different number
|
||||||
|
// of cores
|
||||||
|
}
|
||||||
|
if (No != c.no) {/* TODO: write warning */}
|
||||||
|
if (Nv != c.nv) {/* TODO: write warning */}
|
||||||
|
// TODO write warnings for nrank and so on
|
||||||
|
if (Atrip::rank == 0) {
|
||||||
|
// take the negative of the energy to correct for the
|
||||||
|
// negativity of the equations, the energy in the checkpoint
|
||||||
|
// should always be the correct physical one.
|
||||||
|
energy = - (double)c.energy;
|
||||||
|
}
|
||||||
|
LOG(0, "Atrip") << "energy from checkpoint "
|
||||||
|
<< energy << "\n";
|
||||||
|
LOG(0, "Atrip") << "iteration from checkpoint "
|
||||||
|
<< first_iteration << "\n";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
for ( size_t i = 0, iteration = 1
|
for ( size_t
|
||||||
|
i = first_iteration,
|
||||||
|
iteration = first_iteration + 1
|
||||||
; i < tuplesList.size()
|
; i < tuplesList.size()
|
||||||
; i++, iteration++
|
; i++, iteration++
|
||||||
) {
|
) {
|
||||||
@ -3324,6 +3365,23 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
|
|||||||
if (in.barrier) MPI_Barrier(universe);
|
if (in.barrier) MPI_Barrier(universe);
|
||||||
))
|
))
|
||||||
|
|
||||||
|
// write checkpoints
|
||||||
|
// Periodically persist the accumulated energy and loop position.
// Guards:
//  - in.writeCheckpoint lets the caller switch checkpoint writing off;
//  - checkpoint_mod can be 0 (e.g. when nIterations * percentage / 100
//    truncates to 0), and a modulo by zero is undefined behaviour.
if (in.writeCheckpoint
    && checkpoint_mod != 0
    && iteration % checkpoint_mod == 0) {
  // Sum the per-rank partial energies onto rank 0, which writes the file.
  double globalEnergy = 0;
  MPI_Reduce(&energy, &globalEnergy, 1, MPI_DOUBLE, MPI_SUM, 0, universe);
  Checkpoint out
    = {No,
       Nv,
       0, // TODO
       0, // TODO
       // stored with flipped sign; the reader negates it again on restart
       - globalEnergy,
       // index of the current loop step, i.e. where a restart resumes
       iteration - 1,
       in.rankRoundRobin};
  LOG(0, "Atrip") << "Writing checkpoint\n";
  if (Atrip::rank == 0) write_checkpoint(out, in.checkpointPath);
}
|
||||||
|
|
||||||
|
// write reporting
|
||||||
if (iteration % iterationMod == 0 || iteration == iteration1Percent) {
|
if (iteration % iterationMod == 0 || iteration == iteration1Percent) {
|
||||||
|
|
||||||
if (IterationDescription::descriptor) {
|
if (IterationDescription::descriptor) {
|
||||||
@ -3371,7 +3429,7 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
|
|||||||
|
|
||||||
|
|
||||||
// COMM FIRST DATABASE ================================================{{{1
|
// COMM FIRST DATABASE ================================================{{{1
|
||||||
if (i == 0) {
|
if (i == first_iteration) {
|
||||||
WITH_RANK << "__first__:first database ............ \n";
|
WITH_RANK << "__first__:first database ............ \n";
|
||||||
const auto db = communicateDatabase(abc, universe);
|
const auto db = communicateDatabase(abc, universe);
|
||||||
WITH_RANK << "__first__:first database communicated \n";
|
WITH_RANK << "__first__:first database communicated \n";
|
||||||
@ -3788,8 +3846,8 @@ void write_checkpoint(Checkpoint const& c, std::string const& filepath) {
|
|||||||
<< "\n";
|
<< "\n";
|
||||||
}
|
}
|
||||||
|
|
||||||
Checkpoint read_checkpoint(std::string const& filepath) {
|
|
||||||
std::ifstream in(filepath);
|
Checkpoint read_checkpoint(std::ifstream& in) {
|
||||||
Checkpoint c;
|
Checkpoint c;
|
||||||
// trim chars from the string, to be more sure and not use regexes
|
// trim chars from the string, to be more sure and not use regexes
|
||||||
auto trim = [](std::string& s, std::string const& chars) {
|
auto trim = [](std::string& s, std::string const& chars) {
|
||||||
@ -3813,6 +3871,11 @@ Checkpoint read_checkpoint(std::string const& filepath) {
|
|||||||
return c;
|
return c;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Convenience overload: opens the file at `filepath` and delegates the
// parsing to the stream-based read_checkpoint.
Checkpoint read_checkpoint(std::string const& filepath) {
  std::ifstream checkpointStream(filepath);
  return read_checkpoint(checkpointStream);
}
|
||||||
#+end_src
|
#+end_src
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -16,10 +16,15 @@
|
|||||||
int main(int argc, char** argv) {
|
int main(int argc, char** argv) {
|
||||||
MPI_Init(&argc, &argv);
|
MPI_Init(&argc, &argv);
|
||||||
|
|
||||||
|
// Initialized because the value is forwarded to the Atrip input even when
// --checkpoint-it is not given on the command line; 0 selects the
// percentage-based checkpoint interval instead.
size_t checkpoint_it = 0;
|
||||||
int no(10), nv(10), itMod(-1), percentageMod(10);
|
int no(10), nv(10), itMod(-1), percentageMod(10);
|
||||||
bool nochrono(false), barrier(false), rankRoundRobin(false),
|
// Initialized because the value is forwarded to the Atrip input even when
// --checkpoint-% is not given; 10 mirrors the default of the
// checkpointAtPercentage input attribute.
float checkpoint_percentage = 10;
|
||||||
keepVppph(false);
|
bool
|
||||||
std::string tuplesDistributionString = "naive";
|
nochrono(false), barrier(false), rankRoundRobin(false),
|
||||||
|
keepVppph(false),
|
||||||
|
noCheckpoint = false;
|
||||||
|
std::string tuplesDistributionString = "naive",
|
||||||
|
checkpoint_path = "checkpoint.yaml";
|
||||||
|
|
||||||
CLI::App app{"Main bench for atrip"};
|
CLI::App app{"Main bench for atrip"};
|
||||||
app.add_option("--no", no, "Occupied orbitals");
|
app.add_option("--no", no, "Occupied orbitals");
|
||||||
@ -31,6 +36,14 @@ int main(int argc, char** argv) {
|
|||||||
app.add_flag("--barrier", barrier, "Use the first barrier");
|
app.add_flag("--barrier", barrier, "Use the first barrier");
|
||||||
app.add_option("--dist", tuplesDistributionString, "Which distribution");
|
app.add_option("--dist", tuplesDistributionString, "Which distribution");
|
||||||
app.add_option("-%", percentageMod, "Percentage to be printed");
|
app.add_option("-%", percentageMod, "Percentage to be printed");
|
||||||
|
// checkpointing
|
||||||
|
app.add_flag("--nocheckpoint", noCheckpoint, "Do not use checkpoint");
|
||||||
|
app.add_option("--checkpoint-path", checkpoint_path, "Path for checkpoint");
|
||||||
|
app.add_option("--checkpoint-it",
|
||||||
|
checkpoint_it, "Checkpoint at every iteration");
|
||||||
|
app.add_option("--checkpoint-%",
|
||||||
|
checkpoint_percentage,
|
||||||
|
"Percentage for checkpoints");
|
||||||
|
|
||||||
CLI11_PARSE(app, argc, argv);
|
CLI11_PARSE(app, argc, argv);
|
||||||
|
|
||||||
@ -74,9 +87,11 @@ int main(int argc, char** argv) {
|
|||||||
atrip::Atrip::Input<double>::TuplesDistribution tuplesDistribution;
|
atrip::Atrip::Input<double>::TuplesDistribution tuplesDistribution;
|
||||||
{ using atrip::Atrip;
|
{ using atrip::Atrip;
|
||||||
if (tuplesDistributionString == "naive") {
|
if (tuplesDistributionString == "naive") {
|
||||||
tuplesDistribution = Atrip::Input<double>::TuplesDistribution::NAIVE;
|
tuplesDistribution
|
||||||
|
= Atrip::Input<double>::TuplesDistribution::NAIVE;
|
||||||
} else if (tuplesDistributionString == "group") {
|
} else if (tuplesDistributionString == "group") {
|
||||||
tuplesDistribution = Atrip::Input<double>::TuplesDistribution::GROUP_AND_SORT;
|
tuplesDistribution
|
||||||
|
= Atrip::Input<double>::TuplesDistribution::GROUP_AND_SORT;
|
||||||
} else {
|
} else {
|
||||||
std::cout << "--dist should be either naive or group\n";
|
std::cout << "--dist should be either naive or group\n";
|
||||||
exit(1);
|
exit(1);
|
||||||
@ -134,6 +149,11 @@ int main(int argc, char** argv) {
|
|||||||
.with_iterationMod(itMod)
|
.with_iterationMod(itMod)
|
||||||
.with_percentageMod(percentageMod)
|
.with_percentageMod(percentageMod)
|
||||||
.with_tuplesDistribution(tuplesDistribution)
|
.with_tuplesDistribution(tuplesDistribution)
|
||||||
|
// checkpoint options
|
||||||
|
.with_checkpointAtEveryIteration(checkpoint_it)
|
||||||
|
.with_checkpointAtPercentage(checkpoint_percentage)
|
||||||
|
.with_checkpointPath(checkpoint_path)
|
||||||
|
.with_readCheckpointIfExists(!noCheckpoint)
|
||||||
;
|
;
|
||||||
|
|
||||||
auto out = atrip::Atrip::run(in);
|
auto out = atrip::Atrip::run(in);
|
||||||
|
|||||||
15
config.el
15
config.el
@ -8,13 +8,17 @@
|
|||||||
(defun atrip-print-sources ()
|
(defun atrip-print-sources ()
|
||||||
(princ (string-join atrip-sources " ")))
|
(princ (string-join atrip-sources " ")))
|
||||||
|
|
||||||
(defvar atrip-include-f "include/atrip") ;; TODO: create defvar
|
(defvar atrip-include-f "include/atrip")
|
||||||
(defvar atrip-src-f "src/atrip") ;; TODO: create defvar
|
(defvar atrip-src-f "src/atrip")
|
||||||
|
(defvar atrip-test-d "test")
|
||||||
|
|
||||||
(defmacro atrip-def (name body) `(progn (defun ,name () ,body)
|
(defmacro atrip-def (name body)
|
||||||
|
`(progn (defun ,name () ,body)
|
||||||
(push (,name) atrip-sources)))
|
(push (,name) atrip-sources)))
|
||||||
|
|
||||||
|
|
||||||
|
(defmacro atrip-def-test (name body)
|
||||||
|
`(atrip-def ,name (f-join atrip-test-d ,body)))
|
||||||
(defmacro atrip-def-src (name body)
|
(defmacro atrip-def-src (name body)
|
||||||
`(atrip-def ,name (f-join atrip-src-f ,body)))
|
`(atrip-def ,name (f-join atrip-src-f ,body)))
|
||||||
(defmacro atrip-def-hdr (name body)
|
(defmacro atrip-def-hdr (name body)
|
||||||
@ -30,15 +34,20 @@
|
|||||||
(atrip-def-hdr atrip-tuples-h "Tuples.hpp")
|
(atrip-def-hdr atrip-tuples-h "Tuples.hpp")
|
||||||
(atrip-def-hdr atrip-equations-h "Equations.hpp")
|
(atrip-def-hdr atrip-equations-h "Equations.hpp")
|
||||||
(atrip-def-hdr atrip-debug-h "Debug.hpp")
|
(atrip-def-hdr atrip-debug-h "Debug.hpp")
|
||||||
|
(atrip-def-hdr atrip-checkpoint-h "Checkpoint.hpp")
|
||||||
|
|
||||||
(atrip-def-hdr atrip-atrip-h "Atrip.hpp")
|
(atrip-def-hdr atrip-atrip-h "Atrip.hpp")
|
||||||
(atrip-def-src atrip-atrip-cxx "Atrip.cxx")
|
(atrip-def-src atrip-atrip-cxx "Atrip.cxx")
|
||||||
|
|
||||||
(atrip-def atrip-main-h "include/atrip.hpp")
|
(atrip-def atrip-main-h "include/atrip.hpp")
|
||||||
|
|
||||||
|
;; main test
|
||||||
|
(atrip-def-test atrip-test-main "main.cxx")
|
||||||
|
|
||||||
(defvar atrip-root-directory (file-name-directory load-file-name))
|
(defvar atrip-root-directory (file-name-directory load-file-name))
|
||||||
(defvar license-path (format "%s/LICENSE-HEADER" atrip-root-directory))
|
(defvar license-path (format "%s/LICENSE-HEADER" atrip-root-directory))
|
||||||
|
|
||||||
|
;; add local hook for license headers
|
||||||
(add-hook 'org-babel-post-tangle-hook
|
(add-hook 'org-babel-post-tangle-hook
|
||||||
(lambda ()
|
(lambda ()
|
||||||
(goto-char (point-min))
|
(goto-char (point-min))
|
||||||
|
|||||||
77
test/main.cxx
Normal file
77
test/main.cxx
Normal file
@ -0,0 +1,77 @@
|
|||||||
|
// Copyright 2022 Alejandro Gallo
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
// [[file:../atrip.org::*Tests][Tests:1]]
|
||||||
|
#include <atrip.hpp>
|
||||||
|
#include <cassert>
|
||||||
|
// [[[[file:~/software/atrip/atrip.org::*Tests][Tests]]][]]
|
||||||
|
#include <atrip/Checkpoint.hpp>
|
||||||
|
using namespace atrip;
|
||||||
|
// ends here
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#define TESTCASE(_name, ...) { \
|
||||||
|
std::cout << "\x1b[35m-> \x1b[0m" \
|
||||||
|
<< _name \
|
||||||
|
<< std::endl; \
|
||||||
|
__VA_ARGS__ \
|
||||||
|
}
|
||||||
|
|
||||||
|
int main() {
|
||||||
|
|
||||||
|
// [[[[file:~/software/atrip/atrip.org::*Tests][Tests]]][]]
|
||||||
|
#define _CMP_CHECK(what) \
|
||||||
|
std::cout << "\t Checking " << #what << std::endl; \
|
||||||
|
assert(in.what == what); \
|
||||||
|
assert(out.what == what);
|
||||||
|
|
||||||
|
TESTCASE("Testing checkpoint reader and writers",
|
||||||
|
const std::string out_checkpoint = "/tmp/checkpoint.yaml";
|
||||||
|
const double energy = -1.493926352289995443;
|
||||||
|
const size_t no = 154, nv = 1500, nranks = 48*10, nnodes = 10;
|
||||||
|
const size_t iteration = 546;
|
||||||
|
std::cout << "\twriting to " << out_checkpoint << std::endl;
|
||||||
|
|
||||||
|
for (bool rankRoundRobin: {true, false}) {
|
||||||
|
atrip::Checkpoint out = {no,
|
||||||
|
nv,
|
||||||
|
nranks,
|
||||||
|
nnodes,
|
||||||
|
energy,
|
||||||
|
iteration,
|
||||||
|
rankRoundRobin}, in;
|
||||||
|
|
||||||
|
|
||||||
|
write_checkpoint(out, out_checkpoint);
|
||||||
|
in = read_checkpoint(out_checkpoint);
|
||||||
|
|
||||||
|
_CMP_CHECK(no);
|
||||||
|
_CMP_CHECK(nv);
|
||||||
|
_CMP_CHECK(nranks);
|
||||||
|
_CMP_CHECK(nnodes);
|
||||||
|
_CMP_CHECK(iteration);
|
||||||
|
_CMP_CHECK(rankRoundRobin);
|
||||||
|
_CMP_CHECK(energy);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
)
|
||||||
|
#undef _CMP_CHECK
|
||||||
|
|
||||||
|
// ends here
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
// Tests:1 ends here
|
||||||
Loading…
Reference in New Issue
Block a user