#include #include #include #include #include #define _print_size(what, size) \ do { \ if (rank == 0) { \ std::cout << #what \ << " => " \ << (double)size * elem_to_gb \ << "GB" \ << std::endl; \ } \ } while (0) int main(int argc, char** argv) { MPI_Init(&argc, &argv); size_t checkpoint_it, max_iterations; int no(10), nv(100), itMod(-1), percentageMod(10); float checkpoint_percentage; bool nochrono(false), barrier(false), rankRoundRobin(false), keepVppph(false), noCheckpoint = false; std::string tuplesDistributionString = "naive", checkpoint_path = "checkpoint.yaml"; CLI::App app{"Main bench for atrip"}; app.add_option("--no", no, "Occupied orbitals"); app.add_option("--nv", nv, "Virtual orbitals"); app.add_option("--mod", itMod, "Iteration modifier"); app.add_option("--max-iterations", max_iterations, "Maximum number of iterations to run"); app.add_flag("--keep-vppph", keepVppph, "Do not delete Vppph"); app.add_flag("--nochrono", nochrono, "Do not print chrono"); app.add_flag("--rank-round-robin", rankRoundRobin, "Do rank round robin"); app.add_flag("--barrier", barrier, "Use the first barrier"); app.add_option("--dist", tuplesDistributionString, "Which distribution"); app.add_option("-%", percentageMod, "Percentage to be printed"); // checkpointing app.add_flag("--nocheckpoint", noCheckpoint, "Do not use checkpoint"); app.add_option("--checkpoint-path", checkpoint_path, "Path for checkpoint"); app.add_option("--checkpoint-it", checkpoint_it, "Checkpoint at every iteration"); app.add_option("--checkpoint-%", checkpoint_percentage, "Percentage for checkpoints"); // Optional tensor files std::string ei_path, ea_path, Tph_path, Tpphh_path, Vpphh_path, Vhhhp_path, Vppph_path; app.add_option("--ei", ei_path, "Path for ei"); app.add_option("--ea", ea_path, "Path for ea"); app.add_option("--Tpphh", Tpphh_path, "Path for Tpphh"); app.add_option("--Tph", Tph_path, "Path for Tph"); app.add_option("--Vpphh", Vpphh_path, "Path for Vpphh"); app.add_option("--Vhhhp", Vhhhp_path, "Path for Vhhhp"); app.add_option("--Vppph", Vppph_path, "Path for Vppph"); #if defined(HAVE_CUDA) size_t ooo_threads = 0, ooo_blocks = 0; app.add_option("--ooo-blocks", ooo_blocks, "CUDA: Number of blocks per block for kernels going through ooo tensors"); app.add_option("--ooo-threads", ooo_threads, "CUDA: Number of threads per block for kernels going through ooo tensors"); #endif CLI11_PARSE(app, argc, argv); CTF::World world(argc, argv); int rank, nranks; MPI_Comm_rank(world.comm, &rank); MPI_Comm_size(world.comm, &nranks); constexpr double elem_to_gb = 8.0 / 1024.0 / 1024.0 / 1024.0; // USER PRINTING TEST BEGIN const double doublesFlops = no * no * no * (no + nv) * 2.0 * 6.0 / 1.0e9 ; double lastElapsedTime = 0; bool firstHeaderPrinted = false; atrip::registerIterationDescriptor ([doublesFlops, &firstHeaderPrinted, rank, &lastElapsedTime] (atrip::IterationDescription const& d) { const char *fmt_header = "%-13s%-10s%-13s", *fmt_nums = "%-13.0f%-10.0f%-13.3f"; char out[256]; if (!firstHeaderPrinted) { sprintf(out, fmt_header, "Progress(%)", "time(s)", "GFLOP/s"); firstHeaderPrinted = true; if (rank == 0) std::cout << out << "\n"; } sprintf(out, fmt_nums, double(d.currentIteration) / double(d.totalIterations) * 100, (d.currentElapsedTime - lastElapsedTime), d.currentIteration * doublesFlops / d.currentElapsedTime); lastElapsedTime = d.currentElapsedTime; if (rank == 0) std::cout << out << "\n"; }); // USER PRINTING TEST END atrip::Atrip::Input::TuplesDistribution tuplesDistribution; { using atrip::Atrip; if (tuplesDistributionString == "naive") { tuplesDistribution = Atrip::Input::TuplesDistribution::NAIVE; } else if (tuplesDistributionString == "group") { tuplesDistribution = Atrip::Input::TuplesDistribution::GROUP_AND_SORT; } else { std::cout << "--dist should be either naive or group\n"; exit(1); } } size_t f = sizeof(double) , n_tuples = nv * (nv + 1) * (nv + 2) / 6 - nv , atrip_memory = /* tuples_memory */ 3 * sizeof(size_t) * n_tuples // // one dimensional slices (all ranks) // + /* taphh */ f * nranks * 6 * nv * no * no + /* hhha */ f * nranks * 6 * no * no * no // // two dimensional slices (all ranks) // + /* abph */ f * nranks * 12 * nv * no + /* abhh */ f * nranks * 6 * no * no + /* tabhh */ f * nranks * 6 * no * no // // distributed sources (all ranks) // + /* tpphh */ f * nv * nv * no * no + /* vhhhp */ f * no * no * no * nv + /* vppph */ f * nv * nv * nv * no + /* vpphh */ f * nv * nv * no * no + /* tpphh2 */ f * nv * nv * no * no // // tensors in every rank // + /* tijk */ f * nranks * no * no * no + /* zijk */ f * nranks * no * no * no + /* epsp */ f * nranks * (no + nv) + /* tai */ f * nranks * no * nv ; if (rank == 0) { std::cout << "Tentative MEMORY USAGE (GB): " << double(atrip_memory) / 1024.0 / 1024.0 / 1024.0 << "\n"; } std::vector symmetries(4, NS), vo({nv, no}), vvoo({nv, nv, no, no}), ooov({no, no, no, nv}), vvvo({nv, nv, nv, no}); CTF::Tensor ei(1, ooov.data(), symmetries.data(), world), ea(1, vo.data(), symmetries.data(), world), Tph(2, vo.data(), symmetries.data(), world), Tpphh(4, vvoo.data(), symmetries.data(), world), Vpphh(4, vvoo.data(), symmetries.data(), world), Vhhhp(4, ooov.data(), symmetries.data(), world); // initialize deletable tensors in heap auto Vppph = new CTF::Tensor(4, vvvo.data(), symmetries.data(), world); _print_size(Vabci, no*nv*nv*nv); _print_size(Vabij, no*no*nv*nv); _print_size(Vijka, no*no*no*nv); if (ei_path.size()) { ei.read_dense_from_file(ei_path.c_str()); } else { ei.fill_random(-40.0, -2); } if (ea_path.size()) { ea.read_dense_from_file(ea_path.c_str()); } else { ea.fill_random(2, 50); } if (Tpphh_path.size()) { Tpphh.read_dense_from_file(Tpphh_path.c_str()); } else { Tpphh.fill_random(0, 1); } if (Tph_path.size()) { Tph.read_dense_from_file(Tph_path.c_str()); } else { Tph.fill_random(0, 1); } if (Vpphh_path.size()) { Vpphh.read_dense_from_file(Vpphh_path.c_str()); } else { Vpphh.fill_random(0, 1); } if (Vhhhp_path.size()) { Vhhhp.read_dense_from_file(Vhhhp_path.c_str()); } else { Vhhhp.fill_random(0, 1); } if (Vppph_path.size()) { Vppph->read_dense_from_file(Vppph_path.c_str()); } else { Vppph->fill_random(0, 1); } atrip::Atrip::init(MPI_COMM_WORLD); const auto in = atrip::Atrip::Input() // Tensors .with_epsilon_i(&ei) .with_epsilon_a(&ea) .with_Tai(&Tph) .with_Tabij(&Tpphh) .with_Vabij(&Vpphh) .with_Vijka(&Vhhhp) .with_Vabci(Vppph) // some options .with_deleteVppph(!keepVppph) .with_barrier(barrier) .with_chrono(!nochrono) .with_rankRoundRobin(rankRoundRobin) .with_iterationMod(itMod) .with_percentageMod(percentageMod) .with_tuplesDistribution(tuplesDistribution) .with_maxIterations(max_iterations) // checkpoint options .with_checkpointAtEveryIteration(checkpoint_it) .with_checkpointAtPercentage(checkpoint_percentage) .with_checkpointPath(checkpoint_path) .with_readCheckpointIfExists(!noCheckpoint) #if defined(HAVE_CUDA) .with_oooThreads(ooo_threads) .with_oooBlocks(ooo_blocks) #endif ; try { auto out = atrip::Atrip::run(in); if (atrip::Atrip::rank == 0) std::cout << "Energy: " << out.energy << std::endl; } catch (const char* msg) { if (atrip::Atrip::rank == 0) std::cout << "Atrip throwed with msg:\n\t\t " << msg << "\n"; } if (!in.deleteVppph) delete Vppph; MPI_Finalize(); return 0; }