#include #include #include #include #include #define _print_size(what, size) \ if (rank == 0) { \ std::cout << #what \ << " => " \ << (double)size * elem_to_gb \ << "GB" \ << std::endl; \ } int main(int argc, char** argv) { MPI_Init(&argc, &argv); size_t checkpoint_it; int no(10), nv(100), itMod(-1), percentageMod(10); float checkpoint_percentage; bool nochrono(false), barrier(false), rankRoundRobin(false), keepVppph(false), noCheckpoint = false; std::string tuplesDistributionString = "naive", checkpoint_path = "checkpoint.yaml"; CLI::App app{"Main bench for atrip"}; app.add_option("--no", no, "Occupied orbitals"); app.add_option("--nv", nv, "Virtual orbitals"); app.add_option("--mod", itMod, "Iteration modifier"); app.add_flag("--keep-vppph", keepVppph, "Do not delete Vppph"); app.add_flag("--nochrono", nochrono, "Do not print chrono"); app.add_flag("--rank-round-robin", rankRoundRobin, "Do rank round robin"); app.add_flag("--barrier", barrier, "Use the first barrier"); app.add_option("--dist", tuplesDistributionString, "Which distribution"); app.add_option("-%", percentageMod, "Percentage to be printed"); // checkpointing app.add_flag("--nocheckpoint", noCheckpoint, "Do not use checkpoint"); app.add_option("--checkpoint-path", checkpoint_path, "Path for checkpoint"); app.add_option("--checkpoint-it", checkpoint_it, "Checkpoint at every iteration"); app.add_option("--checkpoint-%", checkpoint_percentage, "Percentage for checkpoints"); #if defined(HAVE_CUDA) size_t ooo_threads = 0, ooo_blocks = 0; app.add_option("--ooo-blocks", ooo_blocks, "CUDA: Number of blocks per block for kernels going through ooo tensors"); app.add_option("--ooo-threads", ooo_threads, "CUDA: Number of threads per block for kernels going through ooo tensors"); #endif CLI11_PARSE(app, argc, argv); CTF::World world(argc, argv); int rank, nranks; MPI_Comm_rank(world.comm, &rank); MPI_Comm_size(world.comm, &nranks); constexpr double elem_to_gb = 8.0 / 1024.0 / 1024.0 / 1024.0; // USER PRINTING TEST BEGIN const double doublesFlops = no * no * no * (no + nv) * 2.0 * 6.0 / 1.0e9 ; double lastElapsedTime = 0; bool firstHeaderPrinted = false; atrip::registerIterationDescriptor ([doublesFlops, &firstHeaderPrinted, rank, &lastElapsedTime] (atrip::IterationDescription const& d) { const char *fmt_header = "%-13s%-10s%-13s", *fmt_nums = "%-13.0f%-10.0f%-13.3f"; char out[256]; if (!firstHeaderPrinted) { sprintf(out, fmt_header, "Progress(%)", "time(s)", "GFLOP/s"); firstHeaderPrinted = true; if (rank == 0) std::cout << out << "\n"; } sprintf(out, fmt_nums, double(d.currentIteration) / double(d.totalIterations) * 100, (d.currentElapsedTime - lastElapsedTime), d.currentIteration * doublesFlops / d.currentElapsedTime); lastElapsedTime = d.currentElapsedTime; if (rank == 0) std::cout << out << "\n"; }); // USER PRINTING TEST END atrip::Atrip::Input::TuplesDistribution tuplesDistribution; { using atrip::Atrip; if (tuplesDistributionString == "naive") { tuplesDistribution = Atrip::Input::TuplesDistribution::NAIVE; } else if (tuplesDistributionString == "group") { tuplesDistribution = Atrip::Input::TuplesDistribution::GROUP_AND_SORT; } else { std::cout << "--dist should be either naive or group\n"; exit(1); } } size_t f = sizeof(double) , n_tuples = nv * (nv + 1) * (nv + 2) / 6 - nv , atrip_memory = /* tuples_memory */ 3 * sizeof(size_t) * n_tuples // // one dimensional slices (all ranks) // + /* taphh */ f * nranks * 6 * nv * no * no + /* hhha */ f * nranks * 6 * no * no * no // // two dimensional slices (all ranks) // + /* abph */ f * nranks * 12 * nv * no + /* abhh */ f * nranks * 6 * no * no + /* tabhh */ f * nranks * 6 * no * no // // distributed sources (all ranks) // + /* tpphh */ f * nv * nv * no * no + /* vhhhp */ f * no * no * no * nv + /* vppph */ f * nv * nv * nv * no + /* vpphh */ f * nv * nv * no * no + /* tpphh2 */ f * nv * nv * no * no // // tensors in every rank // + /* tijk */ f * nranks * no * no * no + /* zijk */ f * nranks * no * no * no + /* epsp */ f * nranks * (no + nv) + /* tai */ f * nranks * no * nv ; if (atrip::Atrip::rank == 0) std::cout << "Tentative MEMORY USAGE: " << atrip_memory << "\n"; std::vector symmetries(4, NS) , vo({nv, no}) , vvoo({nv, nv, no, no}) , ooov({no, no, no, nv}) , vvvo({nv, nv, nv, no}) ; CTF::Tensor ei(1, ooov.data(), symmetries.data(), world) , ea(1, vo.data(), symmetries.data(), world) , Tph(2, vo.data(), symmetries.data(), world) , Tpphh(4, vvoo.data(), symmetries.data(), world) , Vpphh(4, vvoo.data(), symmetries.data(), world) , Vhhhp(4, ooov.data(), symmetries.data(), world) ; // initialize deletable tensors in heap auto Vppph = new CTF::Tensor(4, vvvo.data(), symmetries.data(), world); _print_size(Vabci, no*nv*nv*nv) _print_size(Vabij, no*no*nv*nv) _print_size(Vijka, no*no*no*nv) ei.fill_random(-40.0, -2); ea.fill_random(2, 50); Tpphh.fill_random(0, 1); Tph.fill_random(0, 1); Vpphh.fill_random(0, 1); Vhhhp.fill_random(0, 1); Vppph->fill_random(0, 1); atrip::Atrip::init(MPI_COMM_WORLD); const auto in = atrip::Atrip::Input() // Tensors .with_epsilon_i(&ei) .with_epsilon_a(&ea) .with_Tai(&Tph) .with_Tabij(&Tpphh) .with_Vabij(&Vpphh) .with_Vijka(&Vhhhp) .with_Vabci(Vppph) // some options .with_deleteVppph(!keepVppph) .with_barrier(barrier) .with_chrono(!nochrono) .with_rankRoundRobin(rankRoundRobin) .with_iterationMod(itMod) .with_percentageMod(percentageMod) .with_tuplesDistribution(tuplesDistribution) // checkpoint options .with_checkpointAtEveryIteration(checkpoint_it) .with_checkpointAtPercentage(checkpoint_percentage) .with_checkpointPath(checkpoint_path) .with_readCheckpointIfExists(!noCheckpoint) #if defined(HAVE_CUDA) .with_oooThreads(ooo_threads) .with_oooBlocks(ooo_blocks) #endif ; try { auto out = atrip::Atrip::run(in); if (atrip::Atrip::rank == 0) std::cout << "Energy: " << out.energy << std::endl; } catch (const char* msg) { if (atrip::Atrip::rank == 0) std::cout << "Atrip throwed with msg:\n\t\t " << msg << "\n"; } if (!in.deleteVppph) delete Vppph; MPI_Finalize(); return 0; }