Add preliminary openacc support, atrip bench not linking

This commit is contained in:
Alejandro Gallo 2023-01-11 13:06:59 +01:00
parent 77e1aaabeb
commit 017cf43381
6 changed files with 88 additions and 32 deletions

View File

@ -23,6 +23,18 @@ atrip_SOURCES = main.cxx
atrip_CPPFLAGS = $(AM_CPPFLAGS)
atrip_LDADD = $(BENCHES_LDADD)
atrip: main.cxx
$(NVCXX) -cuda \
-x cu -I../ \
$(MPILIBS) \
-I$(srcdir)/ \
$(AM_CPPFLAGS) \
$(DEFS) \
$(BENCHES_LDADD) \
$(AM_LDFLAGS) \
$< -o $@
endif
if !WITH_CUDA
##

View File

@ -177,17 +177,14 @@ fi
dnl CUDA NVIDIA -----------------------------------------------------------
AM_CONDITIONAL([WITH_CUDA], [test x${WITH_CUDA} = xyes])
AM_CONDITIONAL([WITH_OPENACC], [test x${WITH_OPENACC} = xyes])
if test x${WITH_OPENACC} = xyes; then
ATRIP_OPENACC([CXXFLAGS="${CXXFLAGS} -fopenacc"],
[AC_MSG_ERROR([I can't use -fopenacc, aborting])])
fi
if test x${WITH_CUDA} = xyes; then
AC_MSG_RESULT([
CUDA SUPPORT IS ENABLED
-----------------------
])
AC_CHECK_PROGS([NVCC], [nvcc])
AC_CHECK_PROGS([NVCXX], [nvc++])
MPILIBS=$($MPICXX -show | awk '!($1="")')
AC_SUBST([CUDA_LDFLAGS])
AC_DEFINE([HAVE_CUDA],1,[Whether we are using CUDA])
# TODO: make sure to find cuda and cudart
@ -233,6 +230,7 @@ AC_MSG_RESULT([
ATRIP_LDFLAGS = $ATRIP_LDFLAGS
BLAS = ${BLAS_LIBS}
LIBS = ${LIBS}
MPILIBS = $MPILIBS
])
AC_OUTPUT

View File

@ -1,5 +1,7 @@
#!/usr/bin/env bash
mods=(
cuda/11.6
#cuda/11.6
nvhpcsdk/22 # for openacc
gcc/12
openmpi
mkl/2020.4
@ -11,16 +13,42 @@ mods=(
module purge
module load ${mods[@]}
LIB_PATH="${CUDA_HOME}/lib64"
LIB_PATH="${NVHPC_CUDA_HOME}/lib64"
export CUBLAS_LD_PATH="${NVHPC_ROOT}/math_libs/lib64/"
export CUDA_ROOT=${CUDA_HOME}
export CUDA_LDFLAGS="-L${LIB_PATH} -lcuda -L${LIB_PATH} -lcudart -L${LIB_PATH} -lcublas"
export CUDA_LDFLAGS="-L${LIB_PATH} -lcuda -L${LIB_PATH} -lcudart -L${CUBLAS_LD_PATH} -lcublas"
export CUDA_CXXFLAGS="-I${CUDA_HOME}/include"
export LD_LIBRARY_PATH="${MKL_HOME}/lib/intel64:${LD_LIBRARY_PATH}"
ls ${LIB_PATH}/libcublas.so
MPILIBS=$(mpicxx -show | awk '!($1="")')
export MPILIBS
export MPINVCXX="nv++ ${MPILIBS}"
ls ${CUBLAS_LD_PATH}/libcublas.so
ls ${LIB_PATH}/libcudart.so
#export OMPI_CC="nvc"
#export OMPI_CXX="nvc++"
BLAS_LDFLAGS="-L${PWD}/OpenBLAS-0.3.20/ -lopenblas"
_openblas_make () {
[[ -d OpenBLAS-0.3.20/ ]] || {
wget https://github.com/xianyi/OpenBLAS/releases/download/v0.3.20/OpenBLAS-0.3.20.tar.gz
tar xvzf OpenBLAS-0.3.20.tar.gz
cd OpenBLAS-0.3.20/
make FC=gfortran CC=gcc USE_OPENMP=1 NUM_THREADS=72 TARGET=SKYLAKEX
} && {
echo "Openblas built"
}
}
( _openblas_make; )
cat <<EOF
////////////////////////////////////////////////////////////////////////////////
@ -39,12 +67,13 @@ CUDA_CXXFLAGS = "-I${CUDA_HOME}/include"
Consider now running the following
../configure \\
../../configure \\
--enable-cuda \\
--disable-slice \\
--with-blas="-L\$MKL_HOME/lib/intel64/ -lmkl_blacs_openmpi_lp64 -lmkl_rt" \\
CXX=mpicxx \\
MPICXX=mpicxx
--with-blas="${BLAS_LDFLAGS}" \\
CXX="gcc" \\
NVCC="\$MPINVCXX" \\
MPICXX="mpicxx"
EOF

View File

@ -7,16 +7,17 @@ AM_CPPFLAGS = $(CTF_CPPFLAGS)
lib_LIBRARIES = libatrip.a
libatrip_a_CPPFLAGS = -I$(top_srcdir)/include/
libatrip_a_SOURCES = ./atrip/Blas.cxx ./atrip/Tuples.cxx ./atrip/DatabaseCommunicator.cxx
NVCC_FILES = ./atrip/Equations.cxx ./atrip/Complex.cxx ./atrip/Atrip.cxx
libatrip_a_SOURCES =
NVCC_FILES = ./atrip/Equations.cxx ./atrip/Complex.cxx ./atrip/Atrip.cxx
NVCC_FILES += ./atrip/Blas.cxx ./atrip/Tuples.cxx ./atrip/DatabaseCommunicator.cxx
if WITH_CUDA
NVCC_OBJS = $(patsubst %.cxx,%.nvcc.o,$(NVCC_FILES))
libatrip_a_CPPFLAGS += $(CUDA_CXXFLAGS)
libatrip_a_DEPENDENCIES = $(NVCC_OBJS)
libatrip_a_LIBADD = $(NVCC_OBJS)
%.nvcc.o: %.cxx
$(NVCC) -c -x cu -ccbin="${MPICXX}" -I../ $(CPPFLAGS) $(CTF_CPPFLAGS) $(DEFS) $(libatrip_a_CPPFLAGS) $< -o $@
##$(NVCC) -c -x cu -ccbin="${MPICXX}" -I../ $(CPPFLAGS) $(CTF_CPPFLAGS) $(DEFS) $(libatrip_a_CPPFLAGS) $< -o $@
$(NVCXX) -cuda $(MPILIBS) -c -x cu -I../ $(CPPFLAGS) $(CTF_CPPFLAGS) $(DEFS) $(libatrip_a_CPPFLAGS) $< -o $@
#./atrip/Equations.o: ./atrip/Equations.cxx
# $(NVCC) -c -I../ $(CPPFLAGS) $(libatrip_a_CPPFLAGS) $< -o $@

View File

@ -694,13 +694,10 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
// LOG(0, "AtripCUDA") << "doing energy " << i << "distinct " << distinct << "\n";
WITH_CHRONO("energy",
/*
TODO: think about how to do this on the GPU in the best way possible
if ( distinct == 0)
tupleEnergy = getEnergyDistinct<F>(epsabc, No, (F*)epsi, (F*)Tijk, (F*)Zijk);
else
tupleEnergy = getEnergySame<F>(epsabc, No, (F*)epsi, (F*)Tijk, (F*)Zijk);
*/
)
#if defined(HAVE_OCD) || defined(ATRIP_PRINT_TUPLES)

View File

@ -182,15 +182,21 @@ namespace cuda {
// [[file:~/cuda/atrip/atrip.org::*Energy][Energy:2]]
template <typename F>
__MAYBE_DEVICE__
double getEnergyDistinct
( F const epsabc
, size_t const No
, F* const epsi
, F* const Tijk
, F* const Zijk
) {
(F const epsabc,
size_t const No,
F* const epsi,
F* const Tijk,
F* const Zijk) {
constexpr size_t blockSize=16;
F energy(0.);
#if defined(HAVE_CUDA)
#pragma acc kernels
for (size_t k(0); k < No; k++) {
for (size_t j(k); j < No; j++) {
for (size_t i(j); i < No; i++) {
#else
for (size_t kk=0; kk<No; kk+=blockSize){
const size_t kend( std::min(No, kk+blockSize) );
for (size_t jj(kk); jj<No; jj+=blockSize){
@ -198,13 +204,14 @@ double getEnergyDistinct
for (size_t ii(jj); ii<No; ii+=blockSize){
const size_t iend( std::min( No, ii+blockSize) );
for (size_t k(kk); k < kend; k++){
const F ek(epsi[k]);
const size_t jstart = jj > k ? jj : k;
for (size_t j(jstart); j < jend; j++){
F const ej(epsi[j]);
F const facjk = j == k ? F(0.5) : F(1.0);
size_t istart = ii > j ? ii : j;
for (size_t i(istart); i < iend; i++){
#endif
const F ek(epsi[k]);
const F ej(epsi[j]);
const F facjk = j == k ? F(0.5) : F(1.0);
const F
ei(epsi[i])
, facij = i == j ? F(0.5) : F(1.0)
@ -239,14 +246,17 @@ double getEnergyDistinct
} // i
} // j
} // k
#if !defined(HAVE_CUDA)
} // ii
} // jj
} // kk
#endif
return std::real(energy);
}
template <typename F>
__MAYBE_DEVICE__
double getEnergySame
( F const epsabc
, size_t const No
@ -256,6 +266,12 @@ double getEnergySame
) {
constexpr size_t blockSize = 16;
F energy = F(0.);
#if defined(HAVE_CUDA)
#pragma acc kernels
for (size_t k(0); k < No; k++) {
for (size_t j(k); j < No; j++) {
for (size_t i(j); i < No; i++) {
#else
for (size_t kk=0; kk<No; kk+=blockSize){
const size_t kend( std::min( kk+blockSize, No) );
for (size_t jj(kk); jj<No; jj+=blockSize){
@ -263,13 +279,14 @@ double getEnergySame
for (size_t ii(jj); ii<No; ii+=blockSize){
const size_t iend( std::min( ii+blockSize, No) );
for (size_t k(kk); k < kend; k++){
const F ek(epsi[k]);
const size_t jstart = jj > k ? jj : k;
for(size_t j(jstart); j < jend; j++){
const F facjk( j == k ? F(0.5) : F(1.0));
const F ej(epsi[j]);
const size_t istart = ii > j ? ii : j;
for(size_t i(istart); i < iend; i++){
#endif
const F facjk( j == k ? F(0.5) : F(1.0));
const F ek(epsi[k]);
const F ej(epsi[j]);
const F
ei(epsi[i])
, facij ( i==j ? F(0.5) : F(1.0))
@ -291,9 +308,11 @@ double getEnergySame
} // i
} // j
} // k
#if !defined(HAVE_CUDA)
} // ii
} // jj
} // kk
#endif
return std::real(energy);
}
// Energy:2 ends here