Compare commits

...

2 Commits

9 changed files with 219 additions and 28 deletions

View File

@ -18,6 +18,8 @@ jobs:
strategy:
matrix:
compiler:
- gcc12
- gcc11
- gcc11
- gcc10
- gcc9

View File

@ -23,6 +23,18 @@ atrip_SOURCES = main.cxx
atrip_CPPFLAGS = $(AM_CPPFLAGS)
atrip_LDADD = $(BENCHES_LDADD)
atrip: main.cxx
$(NVCXX) -cuda \
-x cu -I../ \
$(MPILIBS) \
-I$(srcdir)/ \
$(AM_CPPFLAGS) \
$(DEFS) \
$(BENCHES_LDADD) \
$(AM_LDFLAGS) \
$< -o $@
endif
if !WITH_CUDA
##

View File

@ -48,7 +48,8 @@ AM_CONDITIONAL([WITH_CLANG_CHECK], [test x${clang_check} = xYES])
AC_ARG_ENABLE([cuda],
[AS_HELP_STRING([--enable-cuda],
[Build with cuda])],
[WITH_CUDA=yes],
[WITH_CUDA=yes
WITH_OPENACC=yes],
[WITH_CUDA=no])
AC_ARG_VAR([NVCC], [Path to the nvidia cuda compiler.])
AC_ARG_VAR([CUDA_LDFLAGS], [LDFLAGS to find libraries -lcuda, -lcudart, -lcublas.])
@ -182,6 +183,8 @@ if test x${WITH_CUDA} = xyes; then
-----------------------
])
AC_CHECK_PROGS([NVCC], [nvcc])
AC_CHECK_PROGS([NVCXX], [nvc++])
MPILIBS=$($MPICXX -show | awk '!($1="")')
AC_SUBST([CUDA_LDFLAGS])
AC_DEFINE([HAVE_CUDA],1,[Wether we are using CUDA])
# TODO: make sure to find cuda and cudart
@ -227,6 +230,7 @@ AC_MSG_RESULT([
ATRIP_LDFLAGS = $ATRIP_LDFLAGS
BLAS = ${BLAS_LIBS}
LIBS = ${LIBS}
MPILIBS = $MPILIBS
])
AC_OUTPUT

82
etc/env/raven/cuda-openacc vendored Normal file
View File

@ -0,0 +1,82 @@
#!/usr/bin/env bash
# Environment setup for a CUDA + OpenACC build of atrip on raven.
#
# Loads the required modules, exports the CUDA/MPI related variables used by
# the configure script, builds a local OpenBLAS if it is not there yet and
# finally prints a suggested configure invocation.
#
# NOTE(review): the top-level `return` at the end suggests this file is meant
# to be sourced rather than executed -- confirm.

# Modules needed for the build; lines starting with # are disabled entries.
mods=(
  #cuda/11.6
  nvhpcsdk/22 # for openacc
  gcc/12
  openmpi
  mkl/2020.4
  autoconf/2.69
  automake/1.15
  libtool/2.4.6
)

module purge
# Quote the expansion so that every array entry is passed as exactly one word.
module load "${mods[@]}"

LIB_PATH="${NVHPC_CUDA_HOME}/lib64"
export CUBLAS_LD_PATH="${NVHPC_ROOT}/math_libs/lib64/"
export CUDA_ROOT=${CUDA_HOME}
export CUDA_LDFLAGS="-L${LIB_PATH} -lcuda -L${LIB_PATH} -lcudart -L${CUBLAS_LD_PATH} -lcublas"
export CUDA_CXXFLAGS="-I${CUDA_HOME}/include"
export LD_LIBRARY_PATH="${MKL_HOME}/lib/intel64:${LD_LIBRARY_PATH}"

# Everything the MPI compiler wrapper would pass to the underlying compiler,
# i.e. the output of `mpicxx -show` with its first field (the compiler) blanked.
MPILIBS=$(mpicxx -show | awk '!($1="")')
export MPILIBS
# The NVIDIA HPC SDK C++ compiler driver is called nvc++.
export MPINVCXX="nvc++ ${MPILIBS}"

# Sanity check: list the CUDA libraries we are going to link against.
ls "${CUBLAS_LD_PATH}/libcublas.so"
ls "${LIB_PATH}/libcudart.so"

#export OMPI_CC="nvc"
#export OMPI_CXX="nvc++"

BLAS_LDFLAGS="-L${PWD}/OpenBLAS-0.3.20/ -lopenblas"

# Download and build OpenBLAS 0.3.20 in the current directory unless the
# source directory already exists.  The steps are chained with && so that a
# failed download, extraction or cd never leads to running make in the wrong
# directory.
_openblas_make () {
  [[ -d OpenBLAS-0.3.20/ ]] || {
    wget https://github.com/xianyi/OpenBLAS/releases/download/v0.3.20/OpenBLAS-0.3.20.tar.gz &&
    tar xvzf OpenBLAS-0.3.20.tar.gz &&
    cd OpenBLAS-0.3.20/ &&
    make FC=gfortran CC=gcc USE_OPENMP=1 NUM_THREADS=72 TARGET=SKYLAKEX
  } && {
    echo "Openblas built"
  }
}

# Run in a subshell so the cd inside the function does not leak out.
( _openblas_make; )

cat <<EOF
////////////////////////////////////////////////////////////////////////////////
info
////////////////////////////////////////////////////////////////////////////////
MKL_HOME = $MKL_HOME
BLAS_STATIC_PATH = $BLAS_STATIC_PATH
CUDA_ROOT = ${CUDA_HOME}
CUDA_LDFLAGS = "${CUDA_LDFLAGS}"
CUDA_CXXFLAGS = "${CUDA_CXXFLAGS}"
Consider now running the following
../../configure \\
--enable-cuda \\
--disable-slice \\
--with-blas="${BLAS_LDFLAGS}" \\
CXX="gcc" \\
NVCC="\$MPINVCXX" \\
MPICXX="mpicxx"
EOF

return

70
etc/m4/atrip_openacc.m4 Normal file
View File

@ -0,0 +1,70 @@
# SYNOPSIS
#
# ATRIP_OPENACC([ACTION-SUCCESS], [ACTION-FAILURE])
#
# DESCRIPTION
#
# Check whether the -fopenacc flag works with the current language's compiler
# or gives an error.
#
# ACTION-SUCCESS/ACTION-FAILURE are shell commands to execute on
# success/failure.
#
# LICENSE
#
# Copyright (c) 2023 Alejandro Gallo <aamsgallo@gmail.com>
#
# Copying and distribution of this file, with or without modification, are
# permitted in any medium without royalty provided the copyright notice
# and this notice are preserved. This file is offered as-is, without any
# warranty.
dnl ATRIP_OPENACC([ACTION-SUCCESS], [ACTION-FAILURE])
dnl
dnl Compile the _ATRIP_OPENACC_SOURCE test program with the current language's
dnl compiler.  The yes/no answer is reported right after the check, before the
dnl caller supplied action runs, so that any output of the action does not end
dnl up in the middle of the "checking ..." line.  On failure configure is
dnl still aborted after ACTION-FAILURE, now with a descriptive message.
AC_DEFUN([ATRIP_OPENACC],
[
AC_MSG_CHECKING([that the compiler works with the -fopenacc])
AC_COMPILE_IFELSE([AC_LANG_SOURCE([_ATRIP_OPENACC_SOURCE])],
[
AC_MSG_RESULT([yes])
$1
],
[
AC_MSG_RESULT([no])
$2
AC_MSG_ERROR([the compiler could not build the OpenACC test program])
])
])dnl DEFUN
dnl Test program compiled by ATRIP_OPENACC.  It includes openacc.h and uses an
dnl acc data region together with an acc parallel loop, so it only compiles
dnl when the compiler provides the OpenACC header and accepts these pragmas.
m4_define([_ATRIP_OPENACC_SOURCE], [[
#include <stdio.h>
#include <stdlib.h>
#include <openacc.h>
#define SIZE 10
int main(int argc, char **argv) {
float matrix[SIZE * SIZE];
float result[SIZE * SIZE];
// Initialize the matrix with random values
for (int i = 0; i < SIZE * SIZE; i++) {
matrix[i] = rand() / (float)RAND_MAX;
}
// Data region: matrix is listed in a copy clause, result in a copyout clause
#pragma acc data \
copy(matrix[0:SIZE * SIZE]) \
copyout(result[0:SIZE * SIZE])
{
// Compute result = matrix * transpose(matrix); the i and j loops are
// collapsed into one parallel loop, the k loop accumulates the dot product
#pragma acc parallel loop collapse(2)
for (int i = 0; i < SIZE; i++) {
for (int j = 0; j < SIZE; j++) {
float sum = 0.0f;
for (int k = 0; k < SIZE; k++) {
sum += matrix[i * SIZE + k] * matrix[j * SIZE + k];
}
result[i * SIZE + j] = sum;
}
}
}
return 0;
}
]])

View File

@ -43,7 +43,7 @@
# and this notice are preserved. This file is offered as-is, without any
# warranty.
#serial 14
#serial 15
dnl This macro is based on the code from the AX_CXX_COMPILE_STDCXX_11 macro
dnl (serial version number 13).
@ -189,7 +189,11 @@ m4_define([_AX_CXX_COMPILE_STDCXX_testbody_new_in_11], [[
#error "This is not a C++ compiler"
#elif __cplusplus < 201103L
// MSVC always sets __cplusplus to 199711L in older versions; newer versions
// only set it correctly if /Zc:__cplusplus is specified as well as a
// /std:c++NN switch:
// https://devblogs.microsoft.com/cppblog/msvc-now-correctly-reports-__cplusplus/
#elif __cplusplus < 201103L && !defined _MSC_VER
#error "This is not a C++11 compiler"
@ -480,7 +484,7 @@ m4_define([_AX_CXX_COMPILE_STDCXX_testbody_new_in_14], [[
#error "This is not a C++ compiler"
#elif __cplusplus < 201402L
#elif __cplusplus < 201402L && !defined _MSC_VER
#error "This is not a C++14 compiler"
@ -604,7 +608,7 @@ m4_define([_AX_CXX_COMPILE_STDCXX_testbody_new_in_17], [[
#error "This is not a C++ compiler"
#elif __cplusplus < 201703L
#elif __cplusplus < 201703L && !defined _MSC_VER
#error "This is not a C++17 compiler"
@ -970,7 +974,7 @@ namespace cxx17
} // namespace cxx17
#endif // __cplusplus < 201703L
#endif // __cplusplus < 201703L && !defined _MSC_VER
]])
@ -983,7 +987,7 @@ m4_define([_AX_CXX_COMPILE_STDCXX_testbody_new_in_20], [[
#error "This is not a C++ compiler"
#elif __cplusplus < 202002L
#elif __cplusplus < 202002L && !defined _MSC_VER
#error "This is not a C++20 compiler"
@ -1000,6 +1004,6 @@ namespace cxx20
} // namespace cxx20
#endif // __cplusplus < 202002L
#endif // __cplusplus < 202002L && !defined _MSC_VER
]])

View File

@ -7,16 +7,17 @@ AM_CPPFLAGS = $(CTF_CPPFLAGS)
lib_LIBRARIES = libatrip.a
libatrip_a_CPPFLAGS = -I$(top_srcdir)/include/
libatrip_a_SOURCES = ./atrip/Blas.cxx ./atrip/Tuples.cxx ./atrip/DatabaseCommunicator.cxx
libatrip_a_SOURCES =
NVCC_FILES = ./atrip/Equations.cxx ./atrip/Complex.cxx ./atrip/Atrip.cxx
NVCC_FILES += ./atrip/Blas.cxx ./atrip/Tuples.cxx ./atrip/DatabaseCommunicator.cxx
if WITH_CUDA
NVCC_OBJS = $(patsubst %.cxx,%.nvcc.o,$(NVCC_FILES))
libatrip_a_CPPFLAGS += $(CUDA_CXXFLAGS)
libatrip_a_DEPENDENCIES = $(NVCC_OBJS)
libatrip_a_LIBADD = $(NVCC_OBJS)
%.nvcc.o: %.cxx
$(NVCC) -c -x cu -ccbin="${MPICXX}" -I../ $(CPPFLAGS) $(CTF_CPPFLAGS) $(DEFS) $(libatrip_a_CPPFLAGS) $< -o $@
##$(NVCC) -c -x cu -ccbin="${MPICXX}" -I../ $(CPPFLAGS) $(CTF_CPPFLAGS) $(DEFS) $(libatrip_a_CPPFLAGS) $< -o $@
$(NVCXX) -cuda $(MPILIBS) -c -x cu -I../ $(CPPFLAGS) $(CTF_CPPFLAGS) $(DEFS) $(libatrip_a_CPPFLAGS) $< -o $@
#./atrip/Equations.o: ./atrip/Equations.cxx
# $(NVCC) -c -I../ $(CPPFLAGS) $(libatrip_a_CPPFLAGS) $< -o $@

View File

@ -694,13 +694,10 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
// LOG(0, "AtripCUDA") << "doing energy " << i << "distinct " << distinct << "\n";
WITH_CHRONO("energy",
/*
TODO: think about how to do this on the GPU in the best way possible
if ( distinct == 0)
tupleEnergy = getEnergyDistinct<F>(epsabc, No, (F*)epsi, (F*)Tijk, (F*)Zijk);
else
tupleEnergy = getEnergySame<F>(epsabc, No, (F*)epsi, (F*)Tijk, (F*)Zijk);
*/
)
#if defined(HAVE_OCD) || defined(ATRIP_PRINT_TUPLES)

View File

@ -182,15 +182,21 @@ namespace cuda {
// [[file:~/cuda/atrip/atrip.org::*Energy][Energy:2]]
template <typename F>
__MAYBE_DEVICE__
double getEnergyDistinct
( F const epsabc
, size_t const No
, F* const epsi
, F* const Tijk
, F* const Zijk
) {
(F const epsabc,
size_t const No,
F* const epsi,
F* const Tijk,
F* const Zijk) {
constexpr size_t blockSize=16;
F energy(0.);
#if defined(HAVE_CUDA)
#pragma acc kernels
for (size_t k(0); k < No; k++) {
for (size_t j(k); j < No; j++) {
for (size_t i(j); i < No; i++) {
#else
for (size_t kk=0; kk<No; kk+=blockSize){
const size_t kend( std::min(No, kk+blockSize) );
for (size_t jj(kk); jj<No; jj+=blockSize){
@ -198,13 +204,14 @@ double getEnergyDistinct
for (size_t ii(jj); ii<No; ii+=blockSize){
const size_t iend( std::min( No, ii+blockSize) );
for (size_t k(kk); k < kend; k++){
const F ek(epsi[k]);
const size_t jstart = jj > k ? jj : k;
for (size_t j(jstart); j < jend; j++){
F const ej(epsi[j]);
F const facjk = j == k ? F(0.5) : F(1.0);
size_t istart = ii > j ? ii : j;
for (size_t i(istart); i < iend; i++){
#endif
const F ek(epsi[k]);
const F ej(epsi[j]);
const F facjk = j == k ? F(0.5) : F(1.0);
const F
ei(epsi[i])
, facij = i == j ? F(0.5) : F(1.0)
@ -239,14 +246,17 @@ double getEnergyDistinct
} // i
} // j
} // k
#if !defined(HAVE_CUDA)
} // ii
} // jj
} // kk
#endif
return std::real(energy);
}
template <typename F>
__MAYBE_DEVICE__
double getEnergySame
( F const epsabc
, size_t const No
@ -256,6 +266,12 @@ double getEnergySame
) {
constexpr size_t blockSize = 16;
F energy = F(0.);
#if defined(HAVE_CUDA)
#pragma acc kernels
for (size_t k(0); k < No; k++) {
for (size_t j(k); j < No; j++) {
for (size_t i(j); i < No; i++) {
#else
for (size_t kk=0; kk<No; kk+=blockSize){
const size_t kend( std::min( kk+blockSize, No) );
for (size_t jj(kk); jj<No; jj+=blockSize){
@ -263,13 +279,14 @@ double getEnergySame
for (size_t ii(jj); ii<No; ii+=blockSize){
const size_t iend( std::min( ii+blockSize, No) );
for (size_t k(kk); k < kend; k++){
const F ek(epsi[k]);
const size_t jstart = jj > k ? jj : k;
for(size_t j(jstart); j < jend; j++){
const F facjk( j == k ? F(0.5) : F(1.0));
const F ej(epsi[j]);
const size_t istart = ii > j ? ii : j;
for(size_t i(istart); i < iend; i++){
#endif
const F facjk( j == k ? F(0.5) : F(1.0));
const F ek(epsi[k]);
const F ej(epsi[j]);
const F
ei(epsi[i])
, facij ( i==j ? F(0.5) : F(1.0))
@ -291,9 +308,11 @@ double getEnergySame
} // i
} // j
} // k
#if !defined(HAVE_CUDA)
} // ii
} // jj
} // kk
#endif
return std::real(energy);
}
// Energy:2 ends here