Compare commits

newtuples...249f1c0b51

7 Commits

| SHA1 |
|---|
| 249f1c0b51 |
| 1d96800d45 |
| 9087e3af19 |
| 418fd9d389 |
| 895cd02778 |
| 8efa3d911e |
| 0fa24404e5 |

.github/workflows/main.yml (vendored, 2 lines changed)

@@ -2,6 +2,8 @@
 name: CI
 
 on:
+  push:
+    branches: [ master, cuda ]
   pull_request:
     branches: [ master, cuda ]
 

README.org (47 lines changed)

@@ -69,10 +69,10 @@ And then you can see the =configure= options
 ../../configure --help
 #+end_src
 
-** Benchmarks
+** Benches
 
 The script =tools/configure-benches.sh= can be used to create
-a couple of configurations for benchmarks:
+a couple of configurations for benches:
 
 #+begin_src sh :exports results :results verbatim org :results verbatim drawer replace output
 awk '/begin +doc/,/end +doc/ { print $NL }' tools/configure-benches.sh |

@@ -87,8 +87,49 @@ sed "s/^# //; s/^# *$//; /^$/d"
 and without computing slices.
 - only-dgemm ::
   This only runs the computation part that involves dgemms.
-- slices-on-gpu-only-dgemm ::
+- cuda-only-dgemm ::
+  This is the naive CUDA implementation compiling only the dgemm parts
+  of the compute.
+- cuda-slices-on-gpu-only-dgemm ::
   This configuration tests that slices reside completely on the gpu
   and it should use a CUDA aware MPI implementation.
   It also only uses the routines that involve dgemm.
 :end:
 
+In order to generate the benches, just create a suitable directory for them:
+
+#+begin_src sh :eval no
+mkdir -p build/benches
+cd build/benches
+../../tools/configure-benches.sh CXX=g++ ...
+#+end_src
+
+and you will get a Makefile together with several project folders.
+You can either configure all projects with =make all= or
+go into each folder individually.
+
+Notice that you can give a path for ctf for all of them by doing
+#+begin_src sh :eval no
+../../tools/configure-benches.sh --with-ctf=/absolute/path/to/ctf
+#+end_src
+
+* Running benches
+
+** Main benchmark
+
+The main benchmark gets built in =bench/atrip= and is used to run an
+atrip run with random tensors.
+
+A common run of this script is the following:
+
+#+begin_src sh
+bench/atrip \
+  --no 100 \
+  --nv 1000 \
+  --mod 1 \
+  --% 0 \
+  --dist group \
+  --nocheckpoint \
+  --max-iterations 1000
+#+end_src
+

bench/main.cxx (116 lines changed)

@@ -5,18 +5,20 @@
 #include <CLI11.hpp>
 
 #define _print_size(what, size) \
-  if (rank == 0) { \
-    std::cout << #what \
-              << " => " \
-              << (double)size * elem_to_gb \
-              << "GB" \
-              << std::endl; \
-  }
+  do { \
+    if (rank == 0) { \
+      std::cout << #what \
+                << " => " \
+                << (double)size * elem_to_gb \
+                << "GB" \
+                << std::endl; \
+    } \
+  } while (0)
 
 int main(int argc, char** argv) {
   MPI_Init(&argc, &argv);
 
-  size_t checkpoint_it;
+  size_t checkpoint_it, max_iterations;
   int no(10), nv(100), itMod(-1), percentageMod(10);
   float checkpoint_percentage;
   bool
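
The macro rewrite in this hunk replaces a bare if-block with the do { ... } while (0) idiom. The point of the idiom is that the whole macro body becomes exactly one statement, so call sites can carry a trailing semicolon and sit safely inside if/else chains; this is also what lets the same patch terminate the _print_size calls with semicolons further down. A minimal sketch of the failure mode, using a hypothetical LOG macro that is not part of this patch:

#include <iostream>

static bool verbose = true;

// Bare if-block version (broken): expanding it before an 'else' leaves a
// stray ';' after the braces, so the caller's if/else no longer parses.
// #define LOG(msg) if (verbose) { std::cout << (msg) << "\n"; }

// do { ... } while (0) version: the body is one statement, and the
// caller's trailing ';' terminates it, so nesting works as expected.
#define LOG(msg) \
  do { \
    if (verbose) { std::cout << (msg) << "\n"; } \
  } while (0)

int main() {
  bool ready = false;
  if (ready)
    LOG("ready");       // expands to a single statement plus the ';'
  else                  // binds to 'if (ready)' as intended
    LOG("not ready yet");
  return 0;
}
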

@@ -30,6 +32,9 @@ int main(int argc, char** argv) {
   app.add_option("--no", no, "Occupied orbitals");
   app.add_option("--nv", nv, "Virtual orbitals");
   app.add_option("--mod", itMod, "Iteration modifier");
+  app.add_option("--max-iterations",
+                 max_iterations,
+                 "Maximum number of iterations to run");
   app.add_flag("--keep-vppph", keepVppph, "Do not delete Vppph");
   app.add_flag("--nochrono", nochrono, "Do not print chrono");
   app.add_flag("--rank-round-robin", rankRoundRobin, "Do rank round robin");

@@ -45,14 +50,27 @@ int main(int argc, char** argv) {
                  checkpoint_percentage,
                  "Percentage for checkpoints");
 
+  // Optional tensor files
+  std::string
+    ei_path, ea_path,
+    Tph_path, Tpphh_path,
+    Vpphh_path, Vhhhp_path, Vppph_path;
+  app.add_option("--ei", ei_path, "Path for ei");
+  app.add_option("--ea", ea_path, "Path for ea");
+  app.add_option("--Tpphh", Tpphh_path, "Path for Tpphh");
+  app.add_option("--Tph", Tph_path, "Path for Tph");
+  app.add_option("--Vpphh", Vpphh_path, "Path for Vpphh");
+  app.add_option("--Vhhhp", Vhhhp_path, "Path for Vhhhp");
+  app.add_option("--Vppph", Vppph_path, "Path for Vppph");
+
 #if defined(HAVE_CUDA)
   size_t ooo_threads = 0, ooo_blocks = 0;
   app.add_option("--ooo-blocks",
                  ooo_blocks,
                  "CUDA: Number of blocks per block for kernels going through ooo tensors");
   app.add_option("--ooo-threads",
                  ooo_threads,
                  "CUDA: Number of threads per block for kernels going through ooo tensors");
 #endif
 
   CLI11_PARSE(app, argc, argv);

@@ -148,37 +166,64 @@ int main(int argc, char** argv) {
   }
 
 
-  std::vector<int> symmetries(4, NS)
-    , vo({nv, no})
-    , vvoo({nv, nv, no, no})
-    , ooov({no, no, no, nv})
-    , vvvo({nv, nv, nv, no})
-    ;
+  std::vector<int>
+    symmetries(4, NS),
+    vo({nv, no}),
+    vvoo({nv, nv, no, no}),
+    ooov({no, no, no, nv}),
+    vvvo({nv, nv, nv, no});
 
   CTF::Tensor<double>
-    ei(1, ooov.data(), symmetries.data(), world)
-    , ea(1, vo.data(), symmetries.data(), world)
-    , Tph(2, vo.data(), symmetries.data(), world)
-    , Tpphh(4, vvoo.data(), symmetries.data(), world)
-    , Vpphh(4, vvoo.data(), symmetries.data(), world)
-    , Vhhhp(4, ooov.data(), symmetries.data(), world)
-    ;
+    ei(1, ooov.data(), symmetries.data(), world),
+    ea(1, vo.data(), symmetries.data(), world),
+    Tph(2, vo.data(), symmetries.data(), world),
+    Tpphh(4, vvoo.data(), symmetries.data(), world),
+    Vpphh(4, vvoo.data(), symmetries.data(), world),
+    Vhhhp(4, ooov.data(), symmetries.data(), world);
 
   // initialize deletable tensors in heap
   auto Vppph
     = new CTF::Tensor<double>(4, vvvo.data(), symmetries.data(), world);
 
-  _print_size(Vabci, no*nv*nv*nv)
-  _print_size(Vabij, no*no*nv*nv)
-  _print_size(Vijka, no*no*no*nv)
+  _print_size(Vabci, no*nv*nv*nv);
+  _print_size(Vabij, no*no*nv*nv);
+  _print_size(Vijka, no*no*no*nv);
 
-  ei.fill_random(-40.0, -2);
-  ea.fill_random(2, 50);
-  Tpphh.fill_random(0, 1);
-  Tph.fill_random(0, 1);
-  Vpphh.fill_random(0, 1);
-  Vhhhp.fill_random(0, 1);
-  Vppph->fill_random(0, 1);
+  if (ei_path.size()) {
+    ei.read_dense_from_file(ei_path.c_str());
+  } else {
+    ei.fill_random(-40.0, -2);
+  }
+  if (ea_path.size()) {
+    ea.read_dense_from_file(ea_path.c_str());
+  } else {
+    ea.fill_random(2, 50);
+  }
+  if (Tpphh_path.size()) {
+    Tpphh.read_dense_from_file(Tpphh_path.c_str());
+  } else {
+    Tpphh.fill_random(0, 1);
+  }
+  if (Tph_path.size()) {
+    Tph.read_dense_from_file(Tph_path.c_str());
+  } else {
+    Tph.fill_random(0, 1);
+  }
+  if (Vpphh_path.size()) {
+    Vpphh.read_dense_from_file(Vpphh_path.c_str());
+  } else {
+    Vpphh.fill_random(0, 1);
+  }
+  if (Vhhhp_path.size()) {
+    Vhhhp.read_dense_from_file(Vhhhp_path.c_str());
+  } else {
+    Vhhhp.fill_random(0, 1);
+  }
+  if (Vppph_path.size()) {
+    Vppph->read_dense_from_file(Vppph_path.c_str());
+  } else {
+    Vppph->fill_random(0, 1);
+  }
 
   atrip::Atrip::init(MPI_COMM_WORLD);
   const auto in
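
The seven if/else blocks above all repeat one pattern: read the tensor from a dense file when a path was given on the command line, otherwise fill it with random values. A hedged sketch of a small helper that would express the pattern once; read_or_random is hypothetical and not part of the patch, but it uses only the CTF::Tensor calls already shown in this hunk (read_dense_from_file and fill_random):

#include <string>
#include <ctf.hpp>

// Hypothetical helper, not in the patch: load a dense tensor from 'path'
// if one was given, otherwise fill it with random values in [low, high].
template <typename F>
void read_or_random(CTF::Tensor<F> &t,
                    const std::string &path,
                    F low,
                    F high) {
  if (path.size()) {
    t.read_dense_from_file(path.c_str());
  } else {
    t.fill_random(low, high);
  }
}

// Usage mirroring the hunk above:
//   read_or_random(ei,     ei_path,    -40.0, -2.0);
//   read_or_random(ea,     ea_path,      2.0, 50.0);
//   read_or_random(Tpphh,  Tpphh_path,   0.0,  1.0);
//   read_or_random(*Vppph, Vppph_path,   0.0,  1.0);
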

@@ -199,6 +244,7 @@ int main(int argc, char** argv) {
     .with_iterationMod(itMod)
     .with_percentageMod(percentageMod)
     .with_tuplesDistribution(tuplesDistribution)
+    .with_maxIterations(max_iterations)
     // checkpoint options
     .with_checkpointAtEveryIteration(checkpoint_it)
     .with_checkpointAtPercentage(checkpoint_percentage)

@@ -164,8 +164,7 @@ AC_TYPE_SIZE_T
 dnl -----------------------------------------------------------------------
 dnl CHECK CTF
 if test xYES = x${BUILD_CTF}; then
-  AC_MSG_WARN([Sorry, building CTF not supported yet provide a build path
-  with --with-ctf=path/to/ctf/installation])
+  AC_MSG_WARN([You will have to do make ctf before building the project.])
 else
   CPPFLAGS="$CPPFLAGS -I${LIBCTF_CPATH}"
   LDFLAGS="$LDFLAGS -L${LIBCTF_LD_LIBRARY_PATH} -lctf"

etc/env/raven/cuda (vendored, new file, 56 lines)

@@ -0,0 +1,56 @@
+mods=(
+  cuda/11.6
+  intel/19.1.2
+  mkl/2020.4
+  impi/2019.8
+  autoconf/2.69
+  automake/1.15
+  libtool/2.4.6
+)
+
+
+module purge
+module load ${mods[@]}
+LIB_PATH="${CUDA_HOME}/lib64"
+export CUDA_ROOT=${CUDA_HOME}
+export CUDA_LDFLAGS="-L${LIB_PATH} -lcuda -L${LIB_PATH} -lcudart -L${LIB_PATH} -lcublas"
+export CUDA_CXXFLAGS="-I${CUDA_HOME}/include"
+
+export LD_LIBRARY_PATH="${MKL_HOME}/lib/intel64_lin:${LD_LIBRARY_PATH}"
+
+BLAS_STATIC_PATH="$MKL_HOME/lib/intel64/libmkl_intel_lp64.a"
+
+ls ${LIB_PATH}/libcublas.so
+ls ${LIB_PATH}/libcudart.so
+
+cat <<EOF
+
+////////////////////////////////////////////////////////////////////////////////
+info
+////////////////////////////////////////////////////////////////////////////////
+
+
+MKL_HOME = $MKL_HOME
+BLAS_STATIC_PATH = $BLAS_STATIC_PATH
+
+CUDA_ROOT = ${CUDA_HOME}
+CUDA_LDFLAGS = "-L${LIB_PATH} -lcuda -L${LIB_PATH} -lcudart -L${LIB_PATH} -lcublas"
+CUDA_CXXFLAGS = "-I${CUDA_HOME}/include"
+
+
+
+Consider now running the following
+
+../configure \\
+  --enable-cuda \\
+  --disable-slice \\
+  --with-blas="-L\$MKL_HOME/lib/intel64/ -lmkl_intel_lp64 -mkl" \\
+  CXX=mpiicpc \\
+  CC=mpiicc \\
+  MPICXX=mpiicpc
+
+
+EOF
+
+
+return

@@ -86,7 +86,7 @@ namespace atrip {
     ADD_ATTRIBUTE(bool, rankRoundRobin, false)
     ADD_ATTRIBUTE(bool, chrono, false)
     ADD_ATTRIBUTE(bool, barrier, false)
-    ADD_ATTRIBUTE(int, maxIterations, 0)
+    ADD_ATTRIBUTE(size_t, maxIterations, 0)
     ADD_ATTRIBUTE(int, iterationMod, -1)
     ADD_ATTRIBUTE(int, percentageMod, -1)
     ADD_ATTRIBUTE(TuplesDistribution, tuplesDistribution, NAIVE)
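
The ADD_ATTRIBUTE macro itself is not shown in this compare view, so the following is only a hedged guess at its shape, inferred from how the attribute is used elsewhere in the diff: bench/main.cxx sets it through a builder-style call (.with_maxIterations(max_iterations)) and Atrip::run reads it as a plain member (in.maxIterations). Switching the first argument from int to size_t then keeps the attribute in line with the size_t max_iterations variable added to the bench:

// Hypothetical reconstruction, not the real definition: one member with a
// default value plus a chainable with_<name> setter on the Input struct.
#define ADD_ATTRIBUTE(_type, _name, _default) \
  _type _name = _default;                     \
  Input &with_##_name(_type value) {          \
    _name = value;                            \
    return *this;                             \
  }

// ADD_ATTRIBUTE(size_t, maxIterations, 0) would then expand to, roughly:
//   size_t maxIterations = 0;
//   Input &with_maxIterations(size_t value) { maxIterations = value; return *this; }
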

@@ -773,6 +773,8 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
     Atrip::chrono["iterations"].stop();
     // ITERATION END %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%{{{1
 
+    if (in.maxIterations != 0 && i >= in.maxIterations) break;
+
   }
   // END OF MAIN LOOP
 
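
The new break condition gives --max-iterations a sentinel meaning: a value of 0, which is also the attribute's default, disables the cap entirely. A minimal standalone sketch of the same pattern (the loop bound and the value 3 are made up for illustration):

#include <cstddef>

// Sketch of the early-exit pattern the patch adds: maxIterations == 0
// (the default in the ADD_ATTRIBUTE line above) leaves the loop uncapped.
int main() {
  const std::size_t maxIterations = 3;      // hypothetical value for the sketch
  for (std::size_t i = 0; i < 100; ++i) {
    // ... per-iteration work ...
    if (maxIterations != 0 && i >= maxIterations) break;
  }
  return 0;
}
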

@@ -98,10 +98,27 @@ EOF
 create_config $tmp only-dgemm
 rm $tmp
 
-#
 # begin doc
 #
-# - slices-on-gpu-only-dgemm ::
+# - cuda-only-dgemm ::
+#   This is the naive CUDA implementation compiling only the dgemm parts
+#   of the compute.
+#
+# end doc
+
+tmp=`mktemp`
+cat <<EOF > $tmp
+--enable-cuda
+--enable-only-dgemm
+--disable-slice
+EOF
+
+create_config $tmp cuda-only-dgemm
+rm $tmp
+
+# begin doc
+#
+# - cuda-slices-on-gpu-only-dgemm ::
 #   This configuration tests that slices reside completely on the gpu
 #   and it should use a CUDA aware MPI implementation.
 #   It also only uses the routines that involve dgemm.

@@ -117,7 +134,7 @@ cat <<EOF > $tmp
 --disable-slice
 EOF
 
-create_config $tmp sources-in-gpu
+create_config $tmp cuda-slices-on-gpu-only-dgemm
 rm $tmp
 
 ############################################################