7 Commits

8 changed files with 207 additions and 44 deletions

View File

@@ -2,6 +2,8 @@
name: CI
on:
push:
branches: [ master, cuda ]
pull_request:
branches: [ master, cuda ]

View File

@@ -69,10 +69,10 @@ And then you can see the =configure= options
../../configure --help
#+end_src
** Benchmarks
** Benches
The script =tools/configure-benches.sh= can be used to create
a couple of configurations for benchmarks:
a couple of configurations for benches:
#+begin_src sh :exports results :results verbatim org :results verbatim drawer replace output
awk '/begin +doc/,/end +doc/ { print $NL }' tools/configure-benches.sh |
@@ -87,8 +87,49 @@ sed "s/^# //; s/^# *$//; /^$/d"
and without computing slices.
- only-dgemm ::
This only runs the computation part that involves dgemms.
- slices-on-gpu-only-dgemm ::
- cuda-only-dgemm ::
This is the naive CUDA implementation compiling only the dgemm parts
of the compute.
- cuda-slices-on-gpu-only-dgemm ::
This configuration tests that slices reside completely on the gpu
and it should use a CUDA aware MPI implementation.
It also only uses the routines that involve dgemm.
:end:
In order to generate the benches just create a suitable directory for it
#+begin_src sh :eval no
mkdir -p build/benches
cd build/benches
../../tools/configure-benches.sh CXX=g++ ...
#+end_src
and you will get a Makefile together with several project folders.
You can either configure all projects with =make all= or
then go in each folder.
Notice that you can give a path for ctf for all of them by doing
#+begin_src sh :eval no
../../tools/configure-benches.sh --with-ctf=/absolute/path/to/ctf
#+end_src
* Running benches
** Main benchmark
The main benchmark gets built in =bench/atrip= and is used to run an
atrip run with random tensors.
A common run of this script will be the following
#+begin_src sh
bench/atrip \
--no 100 \
--nv 1000 \
--mod 1 \
--% 0 \
--dist group \
--nocheckpoint \
--max-iterations 1000
#+end_src

View File

@@ -5,18 +5,20 @@
#include <CLI11.hpp>
#define _print_size(what, size) \
do { \
if (rank == 0) { \
std::cout << #what \
<< " => " \
<< (double)size * elem_to_gb \
<< "GB" \
<< std::endl; \
}
} \
} while (0)
int main(int argc, char** argv) {
MPI_Init(&argc, &argv);
size_t checkpoint_it;
size_t checkpoint_it, max_iterations;
int no(10), nv(100), itMod(-1), percentageMod(10);
float checkpoint_percentage;
bool
@@ -30,6 +32,9 @@ int main(int argc, char** argv) {
app.add_option("--no", no, "Occupied orbitals");
app.add_option("--nv", nv, "Virtual orbitals");
app.add_option("--mod", itMod, "Iteration modifier");
app.add_option("--max-iterations",
max_iterations,
"Maximum number of iterations to run");
app.add_flag("--keep-vppph", keepVppph, "Do not delete Vppph");
app.add_flag("--nochrono", nochrono, "Do not print chrono");
app.add_flag("--rank-round-robin", rankRoundRobin, "Do rank round robin");
@@ -45,6 +50,19 @@ int main(int argc, char** argv) {
checkpoint_percentage,
"Percentage for checkpoints");
// Optional tensor files
std::string
ei_path, ea_path,
Tph_path, Tpphh_path,
Vpphh_path, Vhhhp_path, Vppph_path;
app.add_option("--ei", ei_path, "Path for ei");
app.add_option("--ea", ea_path, "Path for ea");
app.add_option("--Tpphh", Tpphh_path, "Path for Tpphh");
app.add_option("--Tph", Tph_path, "Path for Tph");
app.add_option("--Vpphh", Vpphh_path, "Path for Vpphh");
app.add_option("--Vhhhp", Vhhhp_path, "Path for Vhhhp");
app.add_option("--Vppph", Vppph_path, "Path for Vppph");
#if defined(HAVE_CUDA)
size_t ooo_threads = 0, ooo_blocks = 0;
app.add_option("--ooo-blocks",
@@ -148,37 +166,64 @@ int main(int argc, char** argv) {
}
std::vector<int> symmetries(4, NS)
, vo({nv, no})
, vvoo({nv, nv, no, no})
, ooov({no, no, no, nv})
, vvvo({nv, nv, nv, no})
;
std::vector<int>
symmetries(4, NS),
vo({nv, no}),
vvoo({nv, nv, no, no}),
ooov({no, no, no, nv}),
vvvo({nv, nv, nv, no});
CTF::Tensor<double>
ei(1, ooov.data(), symmetries.data(), world)
, ea(1, vo.data(), symmetries.data(), world)
, Tph(2, vo.data(), symmetries.data(), world)
, Tpphh(4, vvoo.data(), symmetries.data(), world)
, Vpphh(4, vvoo.data(), symmetries.data(), world)
, Vhhhp(4, ooov.data(), symmetries.data(), world)
;
ei(1, ooov.data(), symmetries.data(), world),
ea(1, vo.data(), symmetries.data(), world),
Tph(2, vo.data(), symmetries.data(), world),
Tpphh(4, vvoo.data(), symmetries.data(), world),
Vpphh(4, vvoo.data(), symmetries.data(), world),
Vhhhp(4, ooov.data(), symmetries.data(), world);
// initialize deletable tensors in heap
auto Vppph
= new CTF::Tensor<double>(4, vvvo.data(), symmetries.data(), world);
_print_size(Vabci, no*nv*nv*nv)
_print_size(Vabij, no*no*nv*nv)
_print_size(Vijka, no*no*no*nv)
_print_size(Vabci, no*nv*nv*nv);
_print_size(Vabij, no*no*nv*nv);
_print_size(Vijka, no*no*no*nv);
if (ei_path.size()) {
ei.read_dense_from_file(ei_path.c_str());
} else {
ei.fill_random(-40.0, -2);
}
if (ea_path.size()) {
ea.read_dense_from_file(ea_path.c_str());
} else {
ea.fill_random(2, 50);
}
if (Tpphh_path.size()) {
Tpphh.read_dense_from_file(Tpphh_path.c_str());
} else {
Tpphh.fill_random(0, 1);
}
if (Tph_path.size()) {
Tph.read_dense_from_file(Tph_path.c_str());
} else {
Tph.fill_random(0, 1);
}
if (Vpphh_path.size()) {
Vpphh.read_dense_from_file(Vpphh_path.c_str());
} else {
Vpphh.fill_random(0, 1);
}
if (Vhhhp_path.size()) {
Vhhhp.read_dense_from_file(Vhhhp_path.c_str());
} else {
Vhhhp.fill_random(0, 1);
}
if (Vppph_path.size()) {
Vppph->read_dense_from_file(Vppph_path.c_str());
} else {
Vppph->fill_random(0, 1);
}
atrip::Atrip::init(MPI_COMM_WORLD);
const auto in
@@ -199,6 +244,7 @@ int main(int argc, char** argv) {
.with_iterationMod(itMod)
.with_percentageMod(percentageMod)
.with_tuplesDistribution(tuplesDistribution)
.with_maxIterations(max_iterations)
// checkpoint options
.with_checkpointAtEveryIteration(checkpoint_it)
.with_checkpointAtPercentage(checkpoint_percentage)

View File

@@ -164,8 +164,7 @@ AC_TYPE_SIZE_T
dnl -----------------------------------------------------------------------
dnl CHECK CTF
if test xYES = x${BUILD_CTF}; then
AC_MSG_WARN([Sorry, building CTF not supported yet provide a build path
with --with-ctf=path/to/ctf/installation])
AC_MSG_WARN([You will have to do make ctf before building the project.])
else
CPPFLAGS="$CPPFLAGS -I${LIBCTF_CPATH}"
LDFLAGS="$LDFLAGS -L${LIBCTF_LD_LIBRARY_PATH} -lctf"

56
etc/env/raven/cuda vendored Normal file
View File

@@ -0,0 +1,56 @@
mods=(
cuda/11.6
intel/19.1.2
mkl/2020.4
impi/2019.8
autoconf/2.69
automake/1.15
libtool/2.4.6
)
module purge
module load ${mods[@]}
LIB_PATH="${CUDA_HOME}/lib64"
export CUDA_ROOT=${CUDA_HOME}
export CUDA_LDFLAGS="-L${LIB_PATH} -lcuda -L${LIB_PATH} -lcudart -L${LIB_PATH} -lcublas"
export CUDA_CXXFLAGS="-I${CUDA_HOME}/include"
export LD_LIBRARY_PATH="${MKL_HOME}/lib/intel64_lin:${LD_LIBRARY_PATH}"
BLAS_STATIC_PATH="$MKL_HOME/lib/intel64/libmkl_intel_lp64.a"
ls ${LIB_PATH}/libcublas.so
ls ${LIB_PATH}/libcudart.so
cat <<EOF
////////////////////////////////////////////////////////////////////////////////
info
////////////////////////////////////////////////////////////////////////////////
MKL_HOME = $MKL_HOME
BLAS_STATIC_PATH = $BLAS_STATIC_PATH
CUDA_ROOT = ${CUDA_HOME}
CUDA_LDFLAGS = "-L${LIB_PATH} -lcuda -L${LIB_PATH} -lcudart -L${LIB_PATH} -lcublas"
CUDA_CXXFLAGS = "-I${CUDA_HOME}/include"
Consider now running the following
../configure \\
--enable-cuda \\
--disable-slice \\
--with-blas="-L\$MKL_HOME/lib/intel64/ -lmkl_intel_lp64 -mkl" \\
CXX=mpiicpc \\
CC=mpiicc \\
MPICXX=mpiicpc
EOF
return

View File

@@ -86,7 +86,7 @@ namespace atrip {
ADD_ATTRIBUTE(bool, rankRoundRobin, false)
ADD_ATTRIBUTE(bool, chrono, false)
ADD_ATTRIBUTE(bool, barrier, false)
ADD_ATTRIBUTE(int, maxIterations, 0)
ADD_ATTRIBUTE(size_t, maxIterations, 0)
ADD_ATTRIBUTE(int, iterationMod, -1)
ADD_ATTRIBUTE(int, percentageMod, -1)
ADD_ATTRIBUTE(TuplesDistribution, tuplesDistribution, NAIVE)

View File

@@ -773,6 +773,8 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
Atrip::chrono["iterations"].stop();
// ITERATION END %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%{{{1
if (in.maxIterations != 0 && i >= in.maxIterations) break;
}
// END OF MAIN LOOP

View File

@@ -98,10 +98,27 @@ EOF
create_config $tmp only-dgemm
rm $tmp
#
# begin doc
#
# - slices-on-gpu-only-dgemm ::
# - cuda-only-dgemm ::
# This is the naive CUDA implementation compiling only the dgemm parts
# of the compute.
#
# end doc
tmp=`mktemp`
cat <<EOF > $tmp
--enable-cuda
--enable-only-dgemm
--disable-slice
EOF
create_config $tmp cuda-only-dgemm
rm $tmp
# begin doc
#
# - cuda-slices-on-gpu-only-dgemm ::
# This configuration tests that slices reside completely on the gpu
# and it should use a CUDA aware MPI implementation.
# It also only uses the routines that involve dgemm.
@@ -117,7 +134,7 @@ cat <<EOF > $tmp
--disable-slice
EOF
create_config $tmp sources-in-gpu
create_config $tmp cuda-slices-on-gpu-only-dgemm
rm $tmp
############################################################