Add preliminary openacc support, atrip bench not linking

Add bureaucracy for openacc in autotools
2023-01-11 13:06:59 +01:00 · 2023-01-05 00:06:37 +01:00
9 changed files with 219 additions and 28 deletions
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -18,6 +18,7 @@ jobs:
    strategy:
      matrix:
        compiler:
+          - gcc12
          - gcc11
          - gcc10
          - gcc9
--- a/bench/Makefile.am
+++ b/bench/Makefile.am
@@ -23,6 +23,18 @@ atrip_SOURCES = main.cxx
 atrip_CPPFLAGS = $(AM_CPPFLAGS)
 atrip_LDADD = $(BENCHES_LDADD)

+atrip: main.cxx
+	$(NVCXX) -cuda     \
+		-x cu -I../      \
+		$(MPILIBS)       \
+		-I$(srcdir)/     \
+		$(AM_CPPFLAGS)   \
+		$(DEFS)          \
+		$(BENCHES_LDADD) \
+		$(AM_LDFLAGS)    \
+		$< -o $@
+
+endif

 if !WITH_CUDA
 ##
--- a/configure.ac
+++ b/configure.ac
@@ -48,7 +48,8 @@ AM_CONDITIONAL([WITH_CLANG_CHECK], [test x${clang_check} = xYES])
 AC_ARG_ENABLE([cuda],
               [AS_HELP_STRING([--enable-cuda],
                               [Build with cuda])],
-                              [WITH_CUDA=yes],
+              [AS_IF([test "x${enableval}" != xno],dnl also run for --disable-cuda
+                     [WITH_CUDA=yes; WITH_OPENACC=yes], [WITH_CUDA=no])],
               [WITH_CUDA=no])
 AC_ARG_VAR([NVCC], [Path to the nvidia cuda compiler.])
 AC_ARG_VAR([CUDA_LDFLAGS], [LDFLAGS to find libraries -lcuda, -lcudart, -lcublas.])
@@ -182,6 +183,8 @@ if test x${WITH_CUDA} = xyes; then
                     -----------------------
  ])
  AC_CHECK_PROGS([NVCC], [nvcc])
+  AC_CHECK_PROGS([NVCXX], [nvc++])
+  MPILIBS=$($MPICXX -show | awk '!($1="")')
  AC_SUBST([CUDA_LDFLAGS])
  AC_DEFINE([HAVE_CUDA],1,[Wether we are using CUDA])
  # TODO: make sure to find cuda and cudart
@@ -227,6 +230,7 @@ AC_MSG_RESULT([
                       ATRIP_LDFLAGS  = $ATRIP_LDFLAGS
                       BLAS = ${BLAS_LIBS}
                       LIBS = ${LIBS}
+                       MPILIBS = $MPILIBS
 ])

 AC_OUTPUT
--- a/etc/env/raven/cuda-openacc
+++ b/etc/env/raven/cuda-openacc
@@ -0,0 +1,82 @@
+#!/usr/bin/env bash
+mods=(
+  #cuda/11.6
+  nvhpcsdk/22  # for openacc
+  gcc/12
+  openmpi
+  mkl/2020.4
+  autoconf/2.69
+  automake/1.15
+  libtool/2.4.6
+)
+
+
+module purge
+module load ${mods[@]}
+
+LIB_PATH="${NVHPC_CUDA_HOME}/lib64"
+export CUBLAS_LD_PATH="${NVHPC_ROOT}/math_libs/lib64/"
+export CUDA_ROOT=${CUDA_HOME}
+export CUDA_LDFLAGS="-L${LIB_PATH} -lcuda -L${LIB_PATH} -lcudart -L${CUBLAS_LD_PATH} -lcublas"
+export CUDA_CXXFLAGS="-I${CUDA_HOME}/include"
+
+export LD_LIBRARY_PATH="${MKL_HOME}/lib/intel64:${LD_LIBRARY_PATH}"
+
+MPILIBS=$(mpicxx -show | awk '!($1="")')  # wrapper flags without the compiler name
+export MPILIBS
+export MPINVCXX="nvc++ ${MPILIBS}"  # NVIDIA's C++ compiler is nvc++, not nv++
+
+ls ${CUBLAS_LD_PATH}/libcublas.so
+ls ${LIB_PATH}/libcudart.so
+
+#export OMPI_CC="nvc"
+#export OMPI_CXX="nvc++"
+
+BLAS_LDFLAGS="-L${PWD}/OpenBLAS-0.3.20/ -lopenblas"
+_openblas_make () {
+  # Download and build OpenBLAS unless its directory already exists; each step
+  # is chained with && so a failed download/unpack/cd never reaches make.
+  [[ -d OpenBLAS-0.3.20/ ]] || {
+    wget https://github.com/xianyi/OpenBLAS/releases/download/v0.3.20/OpenBLAS-0.3.20.tar.gz &&
+    tar xvzf OpenBLAS-0.3.20.tar.gz &&
+    cd OpenBLAS-0.3.20/ &&
+    make FC=gfortran CC=gcc USE_OPENMP=1 NUM_THREADS=72 TARGET=SKYLAKEX
+  } && {
+    echo "Openblas built"
+  }
+}
+( _openblas_make; )
+
+
+
+cat <<EOF
+
+////////////////////////////////////////////////////////////////////////////////
+                                    info
+////////////////////////////////////////////////////////////////////////////////
+
+
+MKL_HOME         = $MKL_HOME
+BLAS_STATIC_PATH = $BLAS_STATIC_PATH
+
+CUDA_ROOT      = ${CUDA_HOME}
+CUDA_LDFLAGS   = "${CUDA_LDFLAGS}"
+CUDA_CXXFLAGS  = "-I${CUDA_HOME}/include"
+
+
+
+Consider now running the following
+
+../../configure \\
+  --enable-cuda \\
+  --disable-slice \\
+  --with-blas="${BLAS_LDFLAGS}" \\
+  CXX="gcc" \\
+  NVCC="\$MPINVCXX" \\
+  MPICXX="mpicxx"
+
+
+EOF
+	
+
+return
--- a/etc/m4/atrip_openacc.m4
+++ b/etc/m4/atrip_openacc.m4
@@ -0,0 +1,70 @@
+#   SYNOPSIS
+#
+#   ATRIP_OPENACC([ACTION-SUCCESS], [ACTION-FAILURE])
+#
+#   DESCRIPTION
+#
+#   Check whether the -fopenacc flag works with the current language's compiler
+#   or gives an error.
+#
+#   ACTION-SUCCESS/ACTION-FAILURE are shell commands to execute on
+#   success/failure.
+#
+#   LICENSE
+#
+#   Copyright (c) 2023 Alejandro Gallo <aamsgallo@gmail.com>
+#
+#   Copying and distribution of this file, with or without modification, are
+#   permitted in any medium without royalty provided the copyright notice
+#   and this notice are preserved.  This file is offered as-is, without any
+#   warranty.
+
+AC_DEFUN([ATRIP_OPENACC],
+[
+AC_MSG_CHECKING([that the compiler works with the -fopenacc])
+AC_COMPILE_IFELSE([AC_LANG_SOURCE([_ATRIP_OPENACC_SOURCE])],
+                  [dnl report the result first so action output cannot split the line
+                   AC_MSG_RESULT([yes])
+                   $1
+                   ],
+                  [AC_MSG_RESULT([no])
+                   $2
+                   AC_MSG_ERROR([could not compile the OpenACC test program])
+                   ])
+])dnl DEFUN
+
+m4_define([_ATRIP_OPENACC_SOURCE], [[
+#include <stdio.h>
+#include <stdlib.h>
+#include <openacc.h>
+
+#define SIZE 10
+
+int main(int argc, char **argv) {
+  float matrix[SIZE * SIZE];
+  float result[SIZE * SIZE];
+
+  // Fill the input matrix with pseudo-random values in [0, 1]
+  for (int i = 0; i < SIZE * SIZE; i++) {
+    matrix[i] = rand() / (float)RAND_MAX;
+  }
+
+#pragma acc data                                            \
+        copy(matrix[0:SIZE * SIZE])           \
+        copyout(result[0:SIZE * SIZE])
+  {
+    // Compute result = matrix * transpose(matrix) in an OpenACC parallel loop
+#pragma acc parallel loop collapse(2)
+    for (int i = 0; i < SIZE; i++) {
+      for (int j = 0; j < SIZE; j++) {
+        float sum = 0.0f;
+        for (int k = 0; k < SIZE; k++) {
+          sum += matrix[i * SIZE + k] * matrix[j * SIZE + k];
+        }
+        result[i * SIZE + j] = sum;
+      }
+    }
+  }
+  return 0;
+}
+]])
--- a/etc/m4/ax_cxx_compile_stdcxx.m4
+++ b/etc/m4/ax_cxx_compile_stdcxx.m4
@@ -43,7 +43,7 @@
 #   and this notice are preserved.  This file is offered as-is, without any
 #   warranty.

-#serial 14
+#serial 15

 dnl  This macro is based on the code from the AX_CXX_COMPILE_STDCXX_11 macro
 dnl  (serial version number 13).
@@ -189,7 +189,11 @@ m4_define([_AX_CXX_COMPILE_STDCXX_testbody_new_in_11], [[

 #error "This is not a C++ compiler"

-#elif __cplusplus < 201103L
+// MSVC always sets __cplusplus to 199711L in older versions; newer versions
+// only set it correctly if /Zc:__cplusplus is specified as well as a
+// /std:c++NN switch:
+// https://devblogs.microsoft.com/cppblog/msvc-now-correctly-reports-__cplusplus/
+#elif __cplusplus < 201103L && !defined _MSC_VER

 #error "This is not a C++11 compiler"

@@ -480,7 +484,7 @@ m4_define([_AX_CXX_COMPILE_STDCXX_testbody_new_in_14], [[

 #error "This is not a C++ compiler"

-#elif __cplusplus < 201402L
+#elif __cplusplus < 201402L && !defined _MSC_VER

 #error "This is not a C++14 compiler"

@@ -604,7 +608,7 @@ m4_define([_AX_CXX_COMPILE_STDCXX_testbody_new_in_17], [[

 #error "This is not a C++ compiler"

-#elif __cplusplus < 201703L
+#elif __cplusplus < 201703L && !defined _MSC_VER

 #error "This is not a C++17 compiler"

@@ -970,7 +974,7 @@ namespace cxx17

 }  // namespace cxx17

-#endif  // __cplusplus < 201703L
+#endif  // __cplusplus < 201703L && !defined _MSC_VER

 ]])

@@ -983,7 +987,7 @@ m4_define([_AX_CXX_COMPILE_STDCXX_testbody_new_in_20], [[

 #error "This is not a C++ compiler"

-#elif __cplusplus < 202002L
+#elif __cplusplus < 202002L && !defined _MSC_VER

 #error "This is not a C++20 compiler"

@@ -1000,6 +1004,6 @@ namespace cxx20

 }  // namespace cxx20

-#endif  // __cplusplus < 202002L
+#endif  // __cplusplus < 202002L && !defined _MSC_VER

 ]])
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -7,16 +7,17 @@ AM_CPPFLAGS = $(CTF_CPPFLAGS)
 lib_LIBRARIES = libatrip.a

 libatrip_a_CPPFLAGS = -I$(top_srcdir)/include/
-libatrip_a_SOURCES = ./atrip/Blas.cxx ./atrip/Tuples.cxx ./atrip/DatabaseCommunicator.cxx
+libatrip_a_SOURCES =
 NVCC_FILES  = ./atrip/Equations.cxx ./atrip/Complex.cxx ./atrip/Atrip.cxx
-
+NVCC_FILES += ./atrip/Blas.cxx ./atrip/Tuples.cxx ./atrip/DatabaseCommunicator.cxx
 if WITH_CUDA
 NVCC_OBJS = $(patsubst %.cxx,%.nvcc.o,$(NVCC_FILES))
 libatrip_a_CPPFLAGS += $(CUDA_CXXFLAGS)
 libatrip_a_DEPENDENCIES = $(NVCC_OBJS)
 libatrip_a_LIBADD = $(NVCC_OBJS)
 %.nvcc.o: %.cxx
-	$(NVCC) -c -x cu -ccbin="${MPICXX}" -I../ $(CPPFLAGS) $(CTF_CPPFLAGS) $(DEFS) $(libatrip_a_CPPFLAGS) $< -o $@
+	##$(NVCC) -c -x cu -ccbin="${MPICXX}" -I../ $(CPPFLAGS) $(CTF_CPPFLAGS) $(DEFS) $(libatrip_a_CPPFLAGS) $< -o $@
+	$(NVCXX) -cuda $(MPILIBS) -c -x cu -I../ $(CPPFLAGS) $(CTF_CPPFLAGS) $(DEFS) $(libatrip_a_CPPFLAGS) $< -o $@

 #./atrip/Equations.o: ./atrip/Equations.cxx
 #	$(NVCC) -c -I../ $(CPPFLAGS) $(libatrip_a_CPPFLAGS) $< -o $@
--- a/src/atrip/Atrip.cxx
+++ b/src/atrip/Atrip.cxx
@@ -694,13 +694,10 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {

      // LOG(0, "AtripCUDA") <<  "doing energy " << i << "distinct " << distinct << "\n";
      WITH_CHRONO("energy",
-/*
-    TODO: think about how to do this on the GPU in the best way possible
        if ( distinct == 0)
          tupleEnergy = getEnergyDistinct<F>(epsabc, No, (F*)epsi, (F*)Tijk, (F*)Zijk);
        else
          tupleEnergy = getEnergySame<F>(epsabc, No, (F*)epsi, (F*)Tijk, (F*)Zijk);
-*/
      )

 #if defined(HAVE_OCD) || defined(ATRIP_PRINT_TUPLES)
--- a/src/atrip/Equations.cxx
+++ b/src/atrip/Equations.cxx
@@ -182,15 +182,21 @@ namespace cuda {

 // [[file:~/cuda/atrip/atrip.org::*Energy][Energy:2]]
 template <typename F>
+  __MAYBE_DEVICE__
 double getEnergyDistinct
-  ( F const epsabc
-  , size_t const No
-  , F* const epsi
-  , F* const Tijk
-  , F* const Zijk
-  ) {
+  (F const epsabc,
+   size_t const No,
+   F* const epsi,
+   F* const Tijk,
+   F* const Zijk) {
  constexpr size_t blockSize=16;
  F energy(0.);
+#if defined(HAVE_CUDA)
+  #pragma acc kernels
+  for (size_t k(0); k < No; k++) {
+    for (size_t j(k); j < No; j++) {
+      for (size_t i(j); i < No; i++) {
+#else
  for (size_t kk=0; kk<No; kk+=blockSize){
    const size_t kend( std::min(No, kk+blockSize) );
    for (size_t jj(kk); jj<No; jj+=blockSize){
@@ -198,13 +204,14 @@ double getEnergyDistinct
      for (size_t ii(jj); ii<No; ii+=blockSize){
        const size_t iend( std::min( No, ii+blockSize) );
        for (size_t k(kk); k < kend; k++){
-          const F ek(epsi[k]);
          const size_t jstart = jj > k ? jj : k;
          for (size_t j(jstart); j < jend; j++){
-            F const ej(epsi[j]);
-            F const facjk = j == k ? F(0.5) : F(1.0);
            size_t istart = ii > j ? ii : j;
            for (size_t i(istart); i < iend; i++){
+#endif
+              const F ek(epsi[k]);
+              const F ej(epsi[j]);
+              const F facjk = j == k ? F(0.5) : F(1.0);
              const F
                  ei(epsi[i])
                , facij = i == j ? F(0.5) : F(1.0)
@@ -239,14 +246,17 @@ double getEnergyDistinct
            } // i
          } // j
        } // k
+#if !defined(HAVE_CUDA)
      } // ii
    } // jj
  } // kk
+#endif
  return std::real(energy);
 }


 template <typename F>
+  __MAYBE_DEVICE__
 double getEnergySame
  ( F const epsabc
  , size_t const No
@@ -256,6 +266,12 @@ double getEnergySame
  ) {
  constexpr size_t blockSize = 16;
  F energy = F(0.);
+#if defined(HAVE_CUDA)
+  #pragma acc kernels
+  for (size_t k(0); k < No; k++) {
+    for (size_t j(k); j < No; j++) {
+      for (size_t i(j); i < No; i++) {
+#else
  for (size_t kk=0; kk<No; kk+=blockSize){
    const size_t kend( std::min( kk+blockSize, No) );
    for (size_t jj(kk); jj<No; jj+=blockSize){
@@ -263,13 +279,14 @@ double getEnergySame
      for (size_t ii(jj); ii<No; ii+=blockSize){
        const size_t iend( std::min( ii+blockSize, No) );
        for (size_t k(kk); k < kend; k++){
-          const F ek(epsi[k]);
          const size_t jstart = jj > k ? jj : k;
          for(size_t j(jstart); j < jend; j++){
-            const F facjk( j == k ? F(0.5) : F(1.0));
-            const F ej(epsi[j]);
            const size_t istart = ii > j ? ii : j;
            for(size_t i(istart); i < iend; i++){
+#endif
+              const F facjk( j == k ? F(0.5) : F(1.0));
+              const F ek(epsi[k]);
+              const F ej(epsi[j]);
              const F
                ei(epsi[i])
              , facij ( i==j ? F(0.5) : F(1.0))
@@ -291,9 +308,11 @@ double getEnergySame
            } // i
          } // j
        } // k
+#if !defined(HAVE_CUDA)
      } // ii
    } // jj
  } // kk
+#endif
  return std::real(energy);
 }
 // Energy:2 ends here
Author	SHA1	Message	Date
Alejandro Gallo	017cf43381	Add preliminary openacc support, atrip bench not linking	2023-01-11 13:06:59 +01:00
Alejandro Gallo	77e1aaabeb	Add bureaucracy for openacc in autotools	2023-01-05 00:06:37 +01:00