Compare commits
4 Commits
895cd02778
...
249f1c0b51
| Author | SHA1 | Date | |
|---|---|---|---|
| 249f1c0b51 | |||
| 1d96800d45 | |||
| 9087e3af19 | |||
| 418fd9d389 |
2
.github/workflows/main.yml
vendored
2
.github/workflows/main.yml
vendored
@ -2,6 +2,8 @@
|
||||
name: CI
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [ master, cuda ]
|
||||
pull_request:
|
||||
branches: [ master, cuda ]
|
||||
|
||||
|
||||
@ -87,7 +87,10 @@ sed "s/^# //; s/^# *$//; /^$/d"
|
||||
and without computing slices.
|
||||
- only-dgemm ::
|
||||
This only runs the computation part that involves dgemms.
|
||||
- slices-on-gpu-only-dgemm ::
|
||||
- cuda-only-dgemm ::
|
||||
This is the naive CUDA implementation compiling only the dgemm parts
|
||||
of the compute.
|
||||
- cuda-slices-on-gpu-only-dgemm ::
|
||||
This configuration tests that slices reside completely on the gpu
|
||||
and it should use a CUDA aware MPI implementation.
|
||||
It also only uses the routines that involve dgemm.
|
||||
@ -114,13 +117,13 @@ Notice that you can give a path for ctf for all of them by doing
|
||||
|
||||
** Main benchmark
|
||||
|
||||
The main benchmark gets built in =bench/main= and is used to run an
|
||||
The main benchmark gets built in =bench/atrip= and is used to run an
|
||||
atrip run with random tensors.
|
||||
|
||||
A common run of this script will be the following
|
||||
|
||||
#+begin_src sh
|
||||
bench/main \
|
||||
bench/atrip \
|
||||
--no 100 \
|
||||
--nv 1000 \
|
||||
--mod 1 \
|
||||
|
||||
@ -5,13 +5,15 @@
|
||||
#include <CLI11.hpp>
|
||||
|
||||
#define _print_size(what, size) \
|
||||
do { \
|
||||
if (rank == 0) { \
|
||||
std::cout << #what \
|
||||
<< " => " \
|
||||
<< (double)size * elem_to_gb \
|
||||
<< "GB" \
|
||||
<< std::endl; \
|
||||
}
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
MPI_Init(&argc, &argv);
|
||||
@ -48,6 +50,19 @@ int main(int argc, char** argv) {
|
||||
checkpoint_percentage,
|
||||
"Percentage for checkpoints");
|
||||
|
||||
// Optional tensor files
|
||||
std::string
|
||||
ei_path, ea_path,
|
||||
Tph_path, Tpphh_path,
|
||||
Vpphh_path, Vhhhp_path, Vppph_path;
|
||||
app.add_option("--ei", ei_path, "Path for ei");
|
||||
app.add_option("--ea", ea_path, "Path for ea");
|
||||
app.add_option("--Tpphh", Tpphh_path, "Path for Tpphh");
|
||||
app.add_option("--Tph", Tph_path, "Path for Tph");
|
||||
app.add_option("--Vpphh", Vpphh_path, "Path for Vpphh");
|
||||
app.add_option("--Vhhhp", Vhhhp_path, "Path for Vhhhp");
|
||||
app.add_option("--Vppph", Vppph_path, "Path for Vppph");
|
||||
|
||||
#if defined(HAVE_CUDA)
|
||||
size_t ooo_threads = 0, ooo_blocks = 0;
|
||||
app.add_option("--ooo-blocks",
|
||||
@ -151,37 +166,64 @@ int main(int argc, char** argv) {
|
||||
}
|
||||
|
||||
|
||||
std::vector<int> symmetries(4, NS)
|
||||
, vo({nv, no})
|
||||
, vvoo({nv, nv, no, no})
|
||||
, ooov({no, no, no, nv})
|
||||
, vvvo({nv, nv, nv, no})
|
||||
;
|
||||
std::vector<int>
|
||||
symmetries(4, NS),
|
||||
vo({nv, no}),
|
||||
vvoo({nv, nv, no, no}),
|
||||
ooov({no, no, no, nv}),
|
||||
vvvo({nv, nv, nv, no});
|
||||
|
||||
CTF::Tensor<double>
|
||||
ei(1, ooov.data(), symmetries.data(), world)
|
||||
, ea(1, vo.data(), symmetries.data(), world)
|
||||
, Tph(2, vo.data(), symmetries.data(), world)
|
||||
, Tpphh(4, vvoo.data(), symmetries.data(), world)
|
||||
, Vpphh(4, vvoo.data(), symmetries.data(), world)
|
||||
, Vhhhp(4, ooov.data(), symmetries.data(), world)
|
||||
;
|
||||
ei(1, ooov.data(), symmetries.data(), world),
|
||||
ea(1, vo.data(), symmetries.data(), world),
|
||||
Tph(2, vo.data(), symmetries.data(), world),
|
||||
Tpphh(4, vvoo.data(), symmetries.data(), world),
|
||||
Vpphh(4, vvoo.data(), symmetries.data(), world),
|
||||
Vhhhp(4, ooov.data(), symmetries.data(), world);
|
||||
|
||||
// initialize deletable tensors in heap
|
||||
auto Vppph
|
||||
= new CTF::Tensor<double>(4, vvvo.data(), symmetries.data(), world);
|
||||
|
||||
_print_size(Vabci, no*nv*nv*nv)
|
||||
_print_size(Vabij, no*no*nv*nv)
|
||||
_print_size(Vijka, no*no*no*nv)
|
||||
_print_size(Vabci, no*nv*nv*nv);
|
||||
_print_size(Vabij, no*no*nv*nv);
|
||||
_print_size(Vijka, no*no*no*nv);
|
||||
|
||||
if (ei_path.size()) {
|
||||
ei.read_dense_from_file(ei_path.c_str());
|
||||
} else {
|
||||
ei.fill_random(-40.0, -2);
|
||||
}
|
||||
if (ea_path.size()) {
|
||||
ea.read_dense_from_file(ea_path.c_str());
|
||||
} else {
|
||||
ea.fill_random(2, 50);
|
||||
}
|
||||
if (Tpphh_path.size()) {
|
||||
Tpphh.read_dense_from_file(Tpphh_path.c_str());
|
||||
} else {
|
||||
Tpphh.fill_random(0, 1);
|
||||
}
|
||||
if (Tph_path.size()) {
|
||||
Tph.read_dense_from_file(Tph_path.c_str());
|
||||
} else {
|
||||
Tph.fill_random(0, 1);
|
||||
}
|
||||
if (Vpphh_path.size()) {
|
||||
Vpphh.read_dense_from_file(Vpphh_path.c_str());
|
||||
} else {
|
||||
Vpphh.fill_random(0, 1);
|
||||
}
|
||||
if (Vhhhp_path.size()) {
|
||||
Vhhhp.read_dense_from_file(Vhhhp_path.c_str());
|
||||
} else {
|
||||
Vhhhp.fill_random(0, 1);
|
||||
}
|
||||
if (Vppph_path.size()) {
|
||||
Vppph->read_dense_from_file(Vppph_path.c_str());
|
||||
} else {
|
||||
Vppph->fill_random(0, 1);
|
||||
}
|
||||
|
||||
atrip::Atrip::init(MPI_COMM_WORLD);
|
||||
const auto in
|
||||
|
||||
56
etc/env/raven/cuda
vendored
Normal file
56
etc/env/raven/cuda
vendored
Normal file
@ -0,0 +1,56 @@
|
||||
mods=(
|
||||
cuda/11.6
|
||||
intel/19.1.2
|
||||
mkl/2020.4
|
||||
impi/2019.8
|
||||
autoconf/2.69
|
||||
automake/1.15
|
||||
libtool/2.4.6
|
||||
)
|
||||
|
||||
|
||||
module purge
|
||||
module load ${mods[@]}
|
||||
LIB_PATH="${CUDA_HOME}/lib64"
|
||||
export CUDA_ROOT=${CUDA_HOME}
|
||||
export CUDA_LDFLAGS="-L${LIB_PATH} -lcuda -L${LIB_PATH} -lcudart -L${LIB_PATH} -lcublas"
|
||||
export CUDA_CXXFLAGS="-I${CUDA_HOME}/include"
|
||||
|
||||
export LD_LIBRARY_PATH="${MKL_HOME}/lib/intel64_lin:${LD_LIBRARY_PATH}"
|
||||
|
||||
BLAS_STATIC_PATH="$MKL_HOME/lib/intel64/libmkl_intel_lp64.a"
|
||||
|
||||
ls ${LIB_PATH}/libcublas.so
|
||||
ls ${LIB_PATH}/libcudart.so
|
||||
|
||||
cat <<EOF
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
info
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
MKL_HOME = $MKL_HOME
|
||||
BLAS_STATIC_PATH = $BLAS_STATIC_PATH
|
||||
|
||||
CUDA_ROOT = ${CUDA_HOME}
|
||||
CUDA_LDFLAGS = "-L${LIB_PATH} -lcuda -L${LIB_PATH} -lcudart -L${LIB_PATH} -lcublas"
|
||||
CUDA_CXXFLAGS = "-I${CUDA_HOME}/include"
|
||||
|
||||
|
||||
|
||||
Consider now running the following
|
||||
|
||||
../configure \\
|
||||
--enable-cuda \\
|
||||
--disable-slice \\
|
||||
--with-blas="-L\$MKL_HOME/lib/intel64/ -lmkl_intel_lp64 -mkl" \\
|
||||
CXX=mpiicpc \\
|
||||
CC=mpiicc \\
|
||||
MPICXX=mpiicpc
|
||||
|
||||
|
||||
EOF
|
||||
|
||||
|
||||
return
|
||||
@ -98,10 +98,27 @@ EOF
|
||||
create_config $tmp only-dgemm
|
||||
rm $tmp
|
||||
|
||||
#
|
||||
# begin doc
|
||||
#
|
||||
# - slices-on-gpu-only-dgemm ::
|
||||
# - cuda-only-dgemm ::
|
||||
# This is the naive CUDA implementation compiling only the dgemm parts
|
||||
# of the compute.
|
||||
#
|
||||
# end doc
|
||||
|
||||
tmp=`mktemp`
|
||||
cat <<EOF > $tmp
|
||||
--enable-cuda
|
||||
--enable-only-dgemm
|
||||
--disable-slice
|
||||
EOF
|
||||
|
||||
create_config $tmp cuda-only-dgemm
|
||||
rm $tmp
|
||||
|
||||
# begin doc
|
||||
#
|
||||
# - cuda-slices-on-gpu-only-dgemm ::
|
||||
# This configuration tests that slices reside completely on the gpu
|
||||
# and it should use a CUDA aware MPI implementation.
|
||||
# It also only uses the routines that involve dgemm.
|
||||
@ -117,7 +134,7 @@ cat <<EOF > $tmp
|
||||
--disable-slice
|
||||
EOF
|
||||
|
||||
create_config $tmp sources-in-gpu
|
||||
create_config $tmp cuda-slices-on-gpu-only-dgemm
|
||||
rm $tmp
|
||||
|
||||
############################################################
|
||||
|
||||
Loading…
Reference in New Issue
Block a user