From 017cf43381651c9ed09e847a617234f699c4c483 Mon Sep 17 00:00:00 2001 From: Alejandro Gallo Date: Wed, 11 Jan 2023 13:06:59 +0100 Subject: [PATCH] Add preliminary openacc support, atrip bench not linking --- bench/Makefile.am | 12 ++++++++++ configure.ac | 8 +++---- etc/env/raven/cuda-openacc | 45 +++++++++++++++++++++++++++++++------- src/Makefile.am | 9 ++++---- src/atrip/Atrip.cxx | 3 --- src/atrip/Equations.cxx | 43 ++++++++++++++++++++++++++---------- 6 files changed, 88 insertions(+), 32 deletions(-) diff --git a/bench/Makefile.am b/bench/Makefile.am index 5f5d2c9..e1cae3a 100644 --- a/bench/Makefile.am +++ b/bench/Makefile.am @@ -23,6 +23,18 @@ atrip_SOURCES = main.cxx atrip_CPPFLAGS = $(AM_CPPFLAGS) atrip_LDADD = $(BENCHES_LDADD) +atrip: main.cxx + $(NVCXX) -cuda \ + -x cu -I../ \ + $(MPILIBS) \ + -I$(srcdir)/ \ + $(AM_CPPFLAGS) \ + $(DEFS) \ + $(BENCHES_LDADD) \ + $(AM_LDFLAGS) \ + $< -o $@ + +endif if !WITH_CUDA ## diff --git a/configure.ac b/configure.ac index 827d0a7..074286f 100644 --- a/configure.ac +++ b/configure.ac @@ -177,17 +177,14 @@ fi dnl CUDA NVIDIA ----------------------------------------------------------- AM_CONDITIONAL([WITH_CUDA], [test x${WITH_CUDA} = xyes]) -AM_CONDITIONAL([WITH_OPENACC], [test x${WITH_OPENACC} = xyes]) -if test x${WITH_OPENACC} = xyes; then -ATRIP_OPENACC([CXXFLAGS="${CXXFLAGS} -fopenacc"], - [AC_MSG_ERROR([I can't use -fopenacc, aborting])]) -fi if test x${WITH_CUDA} = xyes; then AC_MSG_RESULT([ CUDA SUPPORT IS ENABLED ----------------------- ]) AC_CHECK_PROGS([NVCC], [nvcc]) + AC_CHECK_PROGS([NVCXX], [nvc++]) + MPILIBS=$($MPICXX -show | awk '!($1="")') AC_SUBST([CUDA_LDFLAGS]) AC_DEFINE([HAVE_CUDA],1,[Wether we are using CUDA]) # TODO: make sure to find cuda and cudart @@ -233,6 +230,7 @@ AC_MSG_RESULT([ ATRIP_LDFLAGS = $ATRIP_LDFLAGS BLAS = ${BLAS_LIBS} LIBS = ${LIBS} + MPILIBS = $MPILIBS ]) AC_OUTPUT diff --git a/etc/env/raven/cuda-openacc b/etc/env/raven/cuda-openacc index 6fc5308..a0e364b 100644 --- a/etc/env/raven/cuda-openacc +++ b/etc/env/raven/cuda-openacc @@ -1,5 +1,7 @@ +#!/usr/bin/env bash mods=( - cuda/11.6 + #cuda/11.6 + nvhpcsdk/22 # for openacc gcc/12 openmpi mkl/2020.4 @@ -11,16 +13,42 @@ mods=( module purge module load ${mods[@]} -LIB_PATH="${CUDA_HOME}/lib64" + +LIB_PATH="${NVHPC_CUDA_HOME}/lib64" +export CUBLAS_LD_PATH="${NVHPC_ROOT}/math_libs/lib64/" export CUDA_ROOT=${CUDA_HOME} -export CUDA_LDFLAGS="-L${LIB_PATH} -lcuda -L${LIB_PATH} -lcudart -L${LIB_PATH} -lcublas" +export CUDA_LDFLAGS="-L${LIB_PATH} -lcuda -L${LIB_PATH} -lcudart -L${CUBLAS_LD_PATH} -lcublas" export CUDA_CXXFLAGS="-I${CUDA_HOME}/include" export LD_LIBRARY_PATH="${MKL_HOME}/lib/intel64:${LD_LIBRARY_PATH}" -ls ${LIB_PATH}/libcublas.so +MPILIBS=$(mpicxx -show | awk '!($1="")') +export MPILIBS +export MPINVCXX="nv++ ${MPILIBS}" + +ls ${CUBLAS_LD_PATH}/libcublas.so ls ${LIB_PATH}/libcudart.so +#export OMPI_CC="nvc" +#export OMPI_CXX="nvc++" + +BLAS_LDFLAGS="-L${PWD}/OpenBLAS-0.3.20/ -lopenblas" +_openblas_make () { + + [[ -d OpenBLAS-0.3.20/ ]] || { + wget https://github.com/xianyi/OpenBLAS/releases/download/v0.3.20/OpenBLAS-0.3.20.tar.gz + tar xvzf OpenBLAS-0.3.20.tar.gz + cd OpenBLAS-0.3.20/ + make FC=gfortran CC=gcc USE_OPENMP=1 NUM_THREADS=72 TARGET=SKYLAKEX + } && { + echo "Openblas built" + } + +} +( _openblas_make; ) + + + cat < const& in) { // LOG(0, "AtripCUDA") << "doing energy " << i << "distinct " << distinct << "\n"; WITH_CHRONO("energy", -/* - TODO: think about how to do this on the GPU in the best way possible if ( distinct == 0) tupleEnergy = getEnergyDistinct(epsabc, No, (F*)epsi, (F*)Tijk, (F*)Zijk); else tupleEnergy = getEnergySame(epsabc, No, (F*)epsi, (F*)Tijk, (F*)Zijk); -*/ ) #if defined(HAVE_OCD) || defined(ATRIP_PRINT_TUPLES) diff --git a/src/atrip/Equations.cxx b/src/atrip/Equations.cxx index 4439383..e2a6d8a 100644 --- a/src/atrip/Equations.cxx +++ b/src/atrip/Equations.cxx @@ -182,15 +182,21 @@ namespace cuda { // [[file:~/cuda/atrip/atrip.org::*Energy][Energy:2]] template + __MAYBE_DEVICE__ double getEnergyDistinct - ( F const epsabc - , size_t const No - , F* const epsi - , F* const Tijk - , F* const Zijk - ) { + (F const epsabc, + size_t const No, + F* const epsi, + F* const Tijk, + F* const Zijk) { constexpr size_t blockSize=16; F energy(0.); +#if defined(HAVE_CUDA) + #pragma acc kernels + for (size_t k(0); k < No; k++) { + for (size_t j(k); j < No; j++) { + for (size_t i(j); i < No; i++) { +#else for (size_t kk=0; kk k ? jj : k; for (size_t j(jstart); j < jend; j++){ - F const ej(epsi[j]); - F const facjk = j == k ? F(0.5) : F(1.0); size_t istart = ii > j ? ii : j; for (size_t i(istart); i < iend; i++){ +#endif + const F ek(epsi[k]); + const F ej(epsi[j]); + const F facjk = j == k ? F(0.5) : F(1.0); const F ei(epsi[i]) , facij = i == j ? F(0.5) : F(1.0) @@ -239,14 +246,17 @@ double getEnergyDistinct } // i } // j } // k +#if !defined(HAVE_CUDA) } // ii } // jj } // kk +#endif return std::real(energy); } template + __MAYBE_DEVICE__ double getEnergySame ( F const epsabc , size_t const No @@ -256,6 +266,12 @@ double getEnergySame ) { constexpr size_t blockSize = 16; F energy = F(0.); +#if defined(HAVE_CUDA) + #pragma acc kernels + for (size_t k(0); k < No; k++) { + for (size_t j(k); j < No; j++) { + for (size_t i(j); i < No; i++) { +#else for (size_t kk=0; kk k ? jj : k; for(size_t j(jstart); j < jend; j++){ - const F facjk( j == k ? F(0.5) : F(1.0)); - const F ej(epsi[j]); const size_t istart = ii > j ? ii : j; for(size_t i(istart); i < iend; i++){ +#endif + const F facjk( j == k ? F(0.5) : F(1.0)); + const F ek(epsi[k]); + const F ej(epsi[j]); const F ei(epsi[i]) , facij ( i==j ? F(0.5) : F(1.0)) @@ -291,9 +308,11 @@ double getEnergySame } // i } // j } // k +#if !defined(HAVE_CUDA) } // ii } // jj } // kk +#endif return std::real(energy); } // Energy:2 ends here