Compare commits

...

13 Commits

13 changed files with 401 additions and 255 deletions

View File

@ -13,7 +13,9 @@
(format "%s/include/" root) (format "%s/include/" root)
(format "%s/" root) (format "%s/" root)
(format "%s/bench/" root) (format "%s/bench/" root)
(format "%s/build/main/" root))))) (format "%s/build/main/" root)))
(setq-local flycheck-clang-include-path
flycheck-gcc-include-path)))
(eval . (flycheck-mode)) (eval . (flycheck-mode))
(eval . (outline-minor-mode)) (eval . (outline-minor-mode))
(indent-tabs-mode . nil) (indent-tabs-mode . nil)

View File

@ -2,8 +2,6 @@
name: CI name: CI
on: on:
push:
branches: [ master, cuda ]
pull_request: pull_request:
branches: [ master, cuda ] branches: [ master, cuda ]

View File

@ -19,7 +19,7 @@ BENCHES_LDADD = $(ATRIP_LIB) $(ATRIP_CTF)
## main entry point and bench ## main entry point and bench
## ##
bin_PROGRAMS += atrip bin_PROGRAMS += atrip
atrip_SOURCES = test_main.cxx atrip_SOURCES = main.cxx
atrip_CPPFLAGS = $(AM_CPPFLAGS) atrip_CPPFLAGS = $(AM_CPPFLAGS)
atrip_LDADD = $(BENCHES_LDADD) atrip_LDADD = $(BENCHES_LDADD)

View File

@ -21,26 +21,6 @@ AC_ARG_ENABLE(shared,
files (default=YES)]), files (default=YES)]),
[], [enable_shared=yes]) [], [enable_shared=yes])
AC_ARG_ENABLE(
[slice],
[AS_HELP_STRING(
[--disable-slice],
[Disable the step of slicing tensors for CTF, this is useful for example for benchmarking or testing.])],
[atrip_dont_slice=1
AC_DEFINE([ATRIP_DONT_SLICE],1,[Wether CTF will slice tensors or skip the step])
],
[atrip_dont_slice=0]
)
AC_ARG_ENABLE(
[atrip_dgemm],
[AS_HELP_STRING(
[--disable-dgemm],
[Disable using dgemm for the doubles equations])],
[],
[AC_DEFINE([ATRIP_USE_DGEMM],1,[Use dgemm for the doubles equations])]
)
AC_ARG_ENABLE([docs], AC_ARG_ENABLE([docs],
[AS_HELP_STRING([--enable-docs], [AS_HELP_STRING([--enable-docs],
@ -74,13 +54,53 @@ AC_ARG_VAR([NVCC], [Path to the nvidia cuda compiler.])
AC_ARG_VAR([CUDA_LDFLAGS], [LDFLAGS to find libraries -lcuda, -lcudart, -lcublas.]) AC_ARG_VAR([CUDA_LDFLAGS], [LDFLAGS to find libraries -lcuda, -lcudart, -lcublas.])
AC_ARG_VAR([CUDA_CXXFLAGS], [CXXFLAGS to find the CUDA headers]) AC_ARG_VAR([CUDA_CXXFLAGS], [CXXFLAGS to find the CUDA headers])
dnl -----------------------------------------------------------------------
dnl ATRIP CPP DEFINES
dnl -----------------------------------------------------------------------
AC_ARG_WITH([atrip-debug], AC_ARG_WITH([atrip-debug],
[AS_HELP_STRING([--with-atrip-debug], [AS_HELP_STRING([--with-atrip-debug],
[Debug level for atrip, possible values: 1, 2, 3, 4])], [Debug level for atrip, possible values:
1, 2, 3, 4])],
[AC_DEFINE([ATRIP_DEBUG],[atrip-debug],[Atrip debug level])], [AC_DEFINE([ATRIP_DEBUG],[atrip-debug],[Atrip debug level])],
[AC_DEFINE([ATRIP_DEBUG],[1],[Atrip debug level])] [AC_DEFINE([ATRIP_DEBUG],[1],[Atrip debug level])])
)
AC_ARG_ENABLE([atrip_dgemm],
[AS_HELP_STRING([--disable-dgemm],
[Disable using dgemm for the doubles equations])],
[],
[AC_DEFINE([ATRIP_USE_DGEMM],
1,
[Use dgemm for the doubles equations])])
ATRIP_DEF([slice], [disable],
[ATRIP_DONT_SLICE],
[Disable the step of slicing tensors for CTF, this is useful
for example for benchmarking or testing.])
ATRIP_DEF([only-dgemm], [enable],
[ATRIP_ONLY_DGEMM],
[Run only the parts of atrip that involve dgemm calls, this
is useful for benchmarking and testing the code, it is
intended for developers of Atrip.])
ATRIP_DEF([naive-slow], [enable],
[ATRIP_NAIVE_SLOW],
[Run slow but correct code for the mapping of (iteration,
rank) to tuple of the naive tuple distribution.])
ATRIP_DEF([sources-in-gpu], [enable],
[ATRIP_SOURCES_IN_GPU],
[When using CUDA, activate storing all sources (slices of
the input tensors) in the GPU. This means that a lot of GPUs
will be needed.])
ATRIP_DEF([cuda-aware-mpi], [enable],
[ATRIP_CUDA_AWARE_MPI],
[When using MPI, assume support for CUDA aware mpi by the
given MPI implementation.])
dnl ----------------------------------------------------------------------- dnl -----------------------------------------------------------------------

8
etc/m4/atrip-def.m4 Normal file
View File

@ -0,0 +1,8 @@
AC_DEFUN([ATRIP_DEF],
[AC_ARG_ENABLE([$1],
[AS_HELP_STRING([--$2-$1],
[$4])],
[AC_DEFINE([$3],
1,
[$4])])])

View File

@ -18,6 +18,12 @@
#include <atrip/Slice.hpp> #include <atrip/Slice.hpp>
#include <atrip/RankMap.hpp> #include <atrip/RankMap.hpp>
#if defined(ATRIP_SOURCES_IN_GPU)
# define SOURCES_DATA(s) (s)
#else
# define SOURCES_DATA(s) (s).data()
#endif
namespace atrip { namespace atrip {
// Prolog:1 ends here // Prolog:1 ends here
@ -195,7 +201,7 @@ template <typename F=double>
; ;
if (blank.info.state == Slice<F>::SelfSufficient) { if (blank.info.state == Slice<F>::SelfSufficient) {
#if defined(HAVE_CUDA) #if defined(HAVE_CUDA)
const size_t _size = sizeof(F) * sources[from.source].size(); const size_t _size = sizeof(F) * sliceSize;
// TODO: this is code duplication with downstairs // TODO: this is code duplication with downstairs
if (freePointers.size() == 0) { if (freePointers.size() == 0) {
std::stringstream stream; std::stringstream stream;
@ -212,12 +218,12 @@ template <typename F=double>
WITH_CHRONO("cuda:memcpy:self-sufficient", WITH_CHRONO("cuda:memcpy:self-sufficient",
_CHECK_CUDA_SUCCESS("copying mpi data to device", _CHECK_CUDA_SUCCESS("copying mpi data to device",
cuMemcpyHtoD(blank.data, cuMemcpyHtoD(blank.data,
(void*)sources[from.source].data(), (void*)SOURCES_DATA(sources[from.source]),
sizeof(F) * sources[from.source].size())); sizeof(F) * sliceSize));
)) ))
#else #else
blank.data = sources[from.source].data(); blank.data = SOURCES_DATA(sources[from.source]);
#endif #endif
} else { } else {
if (freePointers.size() == 0) { if (freePointers.size() == 0) {
@ -396,15 +402,18 @@ template <typename F=double>
, world(child_world) , world(child_world)
, universe(global_world) , universe(global_world)
, sliceLength(sliceLength_) , sliceLength(sliceLength_)
, sources(rankMap.nSources(), , sliceSize(std::accumulate(sliceLength.begin(),
std::vector<F>
(std::accumulate(sliceLength.begin(),
sliceLength.end(), sliceLength.end(),
1UL, std::multiplies<size_t>()))) 1UL, std::multiplies<size_t>()))
#if defined(ATRIP_SOURCES_IN_GPU)
, sources(rankMap.nSources())
#else
, sources(rankMap.nSources(),
std::vector<F>(sliceSize))
#endif
, name(name_) , name(name_)
, sliceTypes(sliceTypes_) , sliceTypes(sliceTypes_)
, sliceBuffers(nSliceBuffers) , sliceBuffers(nSliceBuffers)
//, slices(2 * sliceTypes.size(), Slice<F>{ sources[0].size() })
{ // constructor begin { // constructor begin
LOG(0,"Atrip") << "INIT SliceUnion: " << name << "\n"; LOG(0,"Atrip") << "INIT SliceUnion: " << name << "\n";
@ -412,7 +421,7 @@ template <typename F=double>
for (auto& ptr: sliceBuffers) { for (auto& ptr: sliceBuffers) {
#if defined(HAVE_CUDA) #if defined(HAVE_CUDA)
const CUresult error = const CUresult error =
cuMemAlloc(&ptr, sizeof(F) * sources[0].size()); cuMemAlloc(&ptr, sizeof(F) * sliceSize);
if (ptr == 0UL) { if (ptr == 0UL) {
throw "UNSUFICCIENT MEMORY ON THE GRAPHIC CARD FOR FREE POINTERS"; throw "UNSUFICCIENT MEMORY ON THE GRAPHIC CARD FOR FREE POINTERS";
} }
@ -423,12 +432,12 @@ template <typename F=double>
throw s.str(); throw s.str();
} }
#else #else
ptr = (DataPtr<F>)malloc(sizeof(F) * sources[0].size()); ptr = (DataPtr<F>)malloc(sizeof(F) * sliceSize);
#endif #endif
} }
slices slices
= std::vector<Slice<F>>(2 * sliceTypes.size(), { sources[0].size() }); = std::vector<Slice<F>>(2 * sliceTypes.size(), { sliceSize });
// TODO: think exactly ^------------------- about this number // TODO: think exactly ^------------------- about this number
// initialize the freePointers with the pointers to the buffers // initialize the freePointers with the pointers to the buffers
@ -441,12 +450,12 @@ template <typename F=double>
LOG(1,"Atrip") << "#slices " << slices.size() << "\n"; LOG(1,"Atrip") << "#slices " << slices.size() << "\n";
WITH_RANK << "#slices[0] " << slices[0].size << "\n"; WITH_RANK << "#slices[0] " << slices[0].size << "\n";
LOG(1,"Atrip") << "#sources " << sources.size() << "\n"; LOG(1,"Atrip") << "#sources " << sources.size() << "\n";
WITH_RANK << "#sources[0] " << sources[0].size() << "\n"; WITH_RANK << "#sources[0] " << sliceSize << "\n";
WITH_RANK << "#freePointers " << freePointers.size() << "\n"; WITH_RANK << "#freePointers " << freePointers.size() << "\n";
LOG(1,"Atrip") << "#sliceBuffers " << sliceBuffers.size() << "\n"; LOG(1,"Atrip") << "#sliceBuffers " << sliceBuffers.size() << "\n";
LOG(1,"Atrip") << "GB*" << np << " " LOG(1,"Atrip") << "GB*" << np << " "
<< double(sources.size() + sliceBuffers.size()) << double(sources.size() + sliceBuffers.size())
* sources[0].size() * sliceSize
* 8 * np * 8 * np
/ 1073741824.0 / 1073741824.0
<< "\n"; << "\n";
@ -495,14 +504,13 @@ template <typename F=double>
if (otherRank == info.from.rank) sendData_p = false; if (otherRank == info.from.rank) sendData_p = false;
if (!sendData_p) return; if (!sendData_p) return;
MPI_Isend( sources[info.from.source].data() MPI_Isend((void*)SOURCES_DATA(sources[info.from.source]),
, sources[info.from.source].size() sliceSize,
, traits::mpi::datatypeOf<F>() traits::mpi::datatypeOf<F>(),
, otherRank otherRank,
, tag tag,
, universe universe,
, &request &request);
);
WITH_CRAZY_DEBUG WITH_CRAZY_DEBUG
WITH_RANK << "sent to " << otherRank << "\n"; WITH_RANK << "sent to " << otherRank << "\n";
@ -516,25 +524,26 @@ template <typename F=double>
if (Atrip::rank == info.from.rank) return; if (Atrip::rank == info.from.rank) return;
if (slice.info.state == Slice<F>::Fetch) { if (slice.info.state == Slice<F>::Fetch) { // if-1
// TODO: do it through the slice class // TODO: do it through the slice class
slice.info.state = Slice<F>::Dispatched; slice.info.state = Slice<F>::Dispatched;
#if defined(HAVE_CUDA) #if defined(HAVE_CUDA)
# if !defined(ATRIP_CUDA_AWARE_MPI) && defined(ATRIP_SOURCES_IN_GPU)
# error "You need CUDA aware MPI to have slices on the GPU"
# endif
slice.mpi_data = (F*)malloc(sizeof(F) * slice.size); slice.mpi_data = (F*)malloc(sizeof(F) * slice.size);
MPI_Irecv( slice.mpi_data MPI_Irecv(slice.mpi_data,
#else #else
MPI_Irecv( slice.data MPI_Irecv(slice.data,
#endif #endif
, slice.size slice.size,
, traits::mpi::datatypeOf<F>() traits::mpi::datatypeOf<F>(),
, info.from.rank info.from.rank,
, tag tag,
, universe universe,
, &slice.request &slice.request);
//, MPI_STATUS_IGNORE } // if-1
); } // receive
}
}
void unwrapAll(ABCTuple const& abc) { void unwrapAll(ABCTuple const& abc) {
for (auto type: sliceTypes) unwrapSlice(type, abc); for (auto type: sliceTypes) unwrapSlice(type, abc);
@ -597,7 +606,12 @@ template <typename F=double>
const MPI_Comm world; const MPI_Comm world;
const MPI_Comm universe; const MPI_Comm universe;
const std::vector<size_t> sliceLength; const std::vector<size_t> sliceLength;
const size_t sliceSize;
#if defined(ATRIP_SOURCES_IN_GPU)
std::vector< DataPtr<F> > sources;
#else
std::vector< std::vector<F> > sources; std::vector< std::vector<F> > sources;
#endif
std::vector< Slice<F> > slices; std::vector< Slice<F> > slices;
typename Slice<F>::Name name; typename Slice<F>::Name name;
const std::vector<typename Slice<F>::Type> sliceTypes; const std::vector<typename Slice<F>::Type> sliceTypes;

View File

@ -19,8 +19,14 @@
namespace atrip { namespace atrip {
template <typename F=double> template <typename F=double>
static
void sliceIntoVector void sliceIntoVector
( std::vector<F> &v #if defined(ATRIP_SOURCES_IN_GPU)
( DataPtr<F> &source
#else
( std::vector<F> &source
#endif
, size_t sliceSize
, CTF::Tensor<F> &toSlice , CTF::Tensor<F> &toSlice
, std::vector<int64_t> const low , std::vector<int64_t> const low
, std::vector<int64_t> const up , std::vector<int64_t> const up
@ -44,18 +50,30 @@ namespace atrip {
<< "\n"; << "\n";
#ifndef ATRIP_DONT_SLICE #ifndef ATRIP_DONT_SLICE
toSlice.slice( toSlice_.low.data() toSlice.slice(toSlice_.low.data(),
, toSlice_.up.data() toSlice_.up.data(),
, 0.0 0.0,
, origin origin,
, origin_.low.data() origin_.low.data(),
, origin_.up.data() origin_.up.data(),
, 1.0); 1.0);
memcpy(v.data(), toSlice.data, sizeof(F) * v.size());
#else #else
# pragma message("WARNING: COMPILING WITHOUT SLICING THE TENSORS") # pragma message("WARNING: COMPILING WITHOUT SLICING THE TENSORS")
#endif #endif
#if defined(ATRIP_SOURCES_IN_GPU)
WITH_CHRONO("cuda:sources",
_CHECK_CUDA_SUCCESS("copying sources data to device",
cuMemcpyHtoD(source,
toSlice.data,
sliceSize));
)
#else
memcpy(source.data(),
toSlice.data,
sizeof(F) * sliceSize);
#endif
} }
@ -80,16 +98,15 @@ namespace atrip {
void sliceIntoBuffer(size_t it, CTF::Tensor<F> &to, CTF::Tensor<F> const& from) override void sliceIntoBuffer(size_t it, CTF::Tensor<F> &to, CTF::Tensor<F> const& from) override
{ {
const int Nv = this->sliceLength[0]
, No = this->sliceLength[1]
, a = this->rankMap.find({static_cast<size_t>(Atrip::rank), it});
;
const int
Nv = this->sliceLength[0],
No = this->sliceLength[1],
a = this->rankMap.find({static_cast<size_t>(Atrip::rank), it});
sliceIntoVector<F>( this->sources[it] sliceIntoVector<F>(this->sources[it], this->sliceSize,
, to, {0, 0, 0}, {Nv, No, No} to, {0, 0, 0}, {Nv, No, No},
, from, {a, 0, 0, 0}, {a+1, Nv, No, No} from, {a, 0, 0, 0}, {a+1, Nv, No, No});
);
} }
@ -118,14 +135,13 @@ namespace atrip {
void sliceIntoBuffer(size_t it, CTF::Tensor<F> &to, CTF::Tensor<F> const& from) override void sliceIntoBuffer(size_t it, CTF::Tensor<F> &to, CTF::Tensor<F> const& from) override
{ {
const int No = this->sliceLength[0] const int
, a = this->rankMap.find({static_cast<size_t>(Atrip::rank), it}) No = this->sliceLength[0],
; a = this->rankMap.find({static_cast<size_t>(Atrip::rank), it});
sliceIntoVector<F>( this->sources[it] sliceIntoVector<F>(this->sources[it], this->sliceSize,
, to, {0, 0, 0}, {No, No, No} to, {0, 0, 0}, {No, No, No},
, from, {0, 0, 0, a}, {No, No, No, a+1} from, {0, 0, 0, a}, {No, No, No, a+1});
);
} }
}; };
@ -153,18 +169,17 @@ namespace atrip {
void sliceIntoBuffer(size_t it, CTF::Tensor<F> &to, CTF::Tensor<F> const& from) override { void sliceIntoBuffer(size_t it, CTF::Tensor<F> &to, CTF::Tensor<F> const& from) override {
const int Nv = this->sliceLength[0] const int
, No = this->sliceLength[1] Nv = this->sliceLength[0],
, el = this->rankMap.find({static_cast<size_t>(Atrip::rank), it}) No = this->sliceLength[1],
, a = el % Nv el = this->rankMap.find({static_cast<size_t>(Atrip::rank), it}),
, b = el / Nv a = el % Nv,
; b = el / Nv;
sliceIntoVector<F>( this->sources[it] sliceIntoVector<F>(this->sources[it], this->sliceSize,
, to, {0, 0}, {Nv, No} to, {0, 0}, {Nv, No},
, from, {a, b, 0, 0}, {a+1, b+1, Nv, No} from, {a, b, 0, 0}, {a+1, b+1, Nv, No});
);
} }
@ -191,17 +206,17 @@ namespace atrip {
void sliceIntoBuffer(size_t it, CTF::Tensor<F> &to, CTF::Tensor<F> const& from) override { void sliceIntoBuffer(size_t it, CTF::Tensor<F> &to, CTF::Tensor<F> const& from) override {
const int Nv = from.lens[0] const int
, No = this->sliceLength[1] Nv = from.lens[0],
, el = this->rankMap.find({static_cast<size_t>(Atrip::rank), it}) No = this->sliceLength[1],
, a = el % Nv el = this->rankMap.find({static_cast<size_t>(Atrip::rank), it}),
, b = el / Nv a = el % Nv,
; b = el / Nv;
sliceIntoVector<F>( this->sources[it]
, to, {0, 0}, {No, No} sliceIntoVector<F>(this->sources[it], this->sliceSize,
, from, {a, b, 0, 0}, {a+1, b+1, No, No} to, {0, 0}, {No, No},
); from, {a, b, 0, 0}, {a+1, b+1, No, No});
} }
@ -231,17 +246,16 @@ namespace atrip {
void sliceIntoBuffer(size_t it, CTF::Tensor<F> &to, CTF::Tensor<F> const& from) override { void sliceIntoBuffer(size_t it, CTF::Tensor<F> &to, CTF::Tensor<F> const& from) override {
// TODO: maybe generalize this with ABHH // TODO: maybe generalize this with ABHH
const int Nv = from.lens[0] const int
, No = this->sliceLength[1] Nv = from.lens[0],
, el = this->rankMap.find({static_cast<size_t>(Atrip::rank), it}) No = this->sliceLength[1],
, a = el % Nv el = this->rankMap.find({static_cast<size_t>(Atrip::rank), it}),
, b = el / Nv a = el % Nv,
; b = el / Nv;
sliceIntoVector<F>( this->sources[it] sliceIntoVector<F>(this->sources[it], this->sliceSize,
, to, {0, 0}, {No, No} to, {0, 0}, {No, No},
, from, {a, b, 0, 0}, {a+1, b+1, No, No} from, {a, b, 0, 0}, {a+1, b+1, No, No});
);
} }

View File

@ -1,3 +1,11 @@
#+quicklisp
(eval-when (:compile-toplevel :load-toplevel :execute)
(ql:quickload '(vgplot fiveam)))
(defpackage :naive-tuples
(:use :cl :vgplot))
(in-package :naive-tuples)
(defun tuples-atrip (nv) (defun tuples-atrip (nv)
(declare (optimize (speed 3) (safety 0) (debug 0))) (declare (optimize (speed 3) (safety 0) (debug 0)))
(loop :for a :below nv (loop :for a :below nv
@ -218,58 +226,3 @@
cheaper cheaper
(print (equal (nth i tuples) (print (equal (nth i tuples)
cheaper))))) cheaper)))))
(let* ((l 101)
(tuples (tuples-atrip l)))
(loop :for a below l
:do (print (let ((s (a-block-atrip a l))
(c (count-if (lambda (x) (eq (car x) a))
tuples)))
(list :a a
:size s
:real c
:? (eq c s))))))
(ql:quickload 'vgplot)
(import 'vgplot:plot)
(import 'vgplot:replot)
(let ((l 10))
(plot (mapcar (lambda (x) (getf x :size))
(loop :for a upto l
collect (list :a a :size (a-block a l))))
"penis"))
(let* ((l 50)
(tuples (tuples-half l)))
(loop :for a below l
:do (print (let ((s (a-block a l))
(c (count-if (lambda (x) (eq (car x) a))
tuples)))
(list :a a
:size s
:real c
:? (eq c s))))))
(defun range (from to) (loop for i :from from :to to collect i))
(defun half-again (i nv)
(let ((a-block-list (let ((ll (mapcar (lambda (i) (a-block i nv))
(range 0 (- nv 1)))))
(loop :for i :from 1 :to (length ll)
:collect
(reduce #'+
ll
:end i)))))
(loop :for blk :in a-block-list
:with a = 0
:with total-blk = 0
:if (eq 0 (floor i blk))
:do
(let ((i (mod i blk)))
(print (list i (- i total-blk) blk a))
(return))
:else
:do (progn
(incf a)
(setq total-blk blk)))))

View File

@ -646,6 +646,9 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
// COMPUTE SINGLES %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% {{{1 // COMPUTE SINGLES %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% {{{1
OCD_Barrier(universe); OCD_Barrier(universe);
#if defined(ATRIP_ONLY_DGEMM)
if (false)
#endif
if (!isFakeTuple(i)) { if (!isFakeTuple(i)) {
WITH_CHRONO("oneshot-unwrap", WITH_CHRONO("oneshot-unwrap",
WITH_CHRONO("unwrap", WITH_CHRONO("unwrap",
@ -678,6 +681,9 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
// COMPUTE ENERGY %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% {{{1 // COMPUTE ENERGY %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% {{{1
#if defined(ATRIP_ONLY_DGEMM)
if (false)
#endif
if (!isFakeTuple(i)) { if (!isFakeTuple(i)) {
double tupleEnergy(0.); double tupleEnergy(0.);

View File

@ -4,8 +4,10 @@
namespace atrip { namespace atrip {
/* This function is really too slow, below are more performant #if defined(ATRIP_NAIVE_SLOW)
functions to get tuples. /*
* This function is really too slow, below are more performant
* functions to get tuples.
*/ */
static static
ABCTuples get_nth_naive_tuples(size_t Nv, size_t np, int64_t i) { ABCTuples get_nth_naive_tuples(size_t Nv, size_t np, int64_t i) {
@ -52,33 +54,26 @@ namespace atrip {
return result; return result;
} }
#endif
static
inline
size_t a_block_atrip(size_t a, size_t nv) {
return (nv - 1) * (nv - (a - 1))
- ((nv - 1) * nv) / 2
+ ((a - 1) * (a - 2)) / 2
- 1;
}
static static
inline inline
size_t a_block_sum_atrip(int64_t T, int64_t nv) { size_t a_block_sum_atrip(int64_t T, int64_t nv) {
int64_t nv1 = nv - 1, tplus1 = T + 1; const int64_t nv_min_1 = nv - 1, t_plus_1 = T + 1;
return tplus1 * nv1 * nv return t_plus_1 * nv_min_1 * nv
+ nv1 * tplus1 + nv_min_1 * t_plus_1
- (nv1 * (T * (T + 1)) / 2) - (nv_min_1 * (T * t_plus_1) / 2)
- (tplus1 * (nv1 * nv) / 2) - (t_plus_1 * (nv_min_1 * nv) / 2)
+ (((T * (T + 1) * (1 + 2 * T)) / 6) - 3 * ((T * (T + 1)) / 2)) / 2 // do not simplify this expression, only the addition of both parts
// is a pair integer, prepare to endure the consequences of
// simplifying otherwise
+ (((T * t_plus_1 * (1 + 2 * T)) / 6) - 3 * ((T * t_plus_1) / 2)) / 2
; ;
// + tplus1;
} }
static static
inline inline
int64_t b_block_sum_atrip (int64_t a, int64_t T, int64_t nv) { int64_t b_block_sum_atrip (int64_t a, int64_t T, int64_t nv) {
return nv * ((T - a) + 1) return nv * ((T - a) + 1)
- (T * (T + 1) - a * (a - 1)) / 2 - (T * (T + 1) - a * (a - 1)) / 2
- 1; - 1;
@ -94,9 +89,6 @@ namespace atrip {
a_sums.resize(nv); a_sums.resize(nv);
for (size_t _i = 0; _i < nv; _i++) { for (size_t _i = 0; _i < nv; _i++) {
a_sums[_i] = a_block_sum_atrip(_i, nv); a_sums[_i] = a_block_sum_atrip(_i, nv);
/*
std::cout << Atrip::rank << ": " << _i << " " << a_sums[_i] << std::endl;
*/
} }
} }
@ -114,10 +106,6 @@ namespace atrip {
std::vector<int64_t> b_sums(nv - a); std::vector<int64_t> b_sums(nv - a);
for (size_t t = a, i=0; t < nv; t++) { for (size_t t = a, i=0; t < nv; t++) {
b_sums[i++] = b_block_sum_atrip(a, t, nv); b_sums[i++] = b_block_sum_atrip(a, t, nv);
/*
std::cout << Atrip::rank << ": b-sum " << i-1 << " "
<< ":a " << a << " :t " << t << " = " << b_sums[i-1] << std::endl;
*/
} }
int64_t b = a - 1, block_b = block_a; int64_t b = a - 1, block_b = block_a;
for (const auto& sum: b_sums) { for (const auto& sum: b_sums) {
@ -141,6 +129,11 @@ namespace atrip {
inline inline
ABCTuples nth_atrip_distributed(int64_t it, size_t nv, size_t np) { ABCTuples nth_atrip_distributed(int64_t it, size_t nv, size_t np) {
// If we are getting the previous tuples in the first iteration,
// then just return an impossible tuple, different from the FAKE_TUPLE,
// because if FAKE_TUPLE is defined as {0,0,0} slices thereof
// are actually attainable.
//
if (it < 0) { if (it < 0) {
ABCTuples result(np, {nv, nv, nv}); ABCTuples result(np, {nv, nv, nv});
return result; return result;
@ -160,9 +153,6 @@ namespace atrip {
for (size_t rank = 0; rank < np; rank++) { for (size_t rank = 0; rank < np; rank++) {
const size_t const size_t
global_iteration = tuples_per_rank * rank + it; global_iteration = tuples_per_rank * rank + it;
/*
std::cout << Atrip::rank << ":" << "global_bit " << global_iteration << "\n";
*/
result[rank] = nth_atrip(global_iteration, nv); result[rank] = nth_atrip(global_iteration, nv);
} }
@ -248,38 +238,25 @@ namespace atrip {
using Database = typename Slice<F>::Database; using Database = typename Slice<F>::Database;
Database db; Database db;
#ifdef NAIVE_SLOW #ifdef ATRIP_NAIVE_SLOW
WITH_CHRONO("db:comm:naive:tuples", WITH_CHRONO("db:comm:naive:tuples",
const auto tuples = get_nth_naive_tuples(nv, const auto tuples = get_nth_naive_tuples(nv,
np, np,
iteration); iteration);
const auto prev_tuples = get_nth_naive_tuples(nv, const auto prev_tuples = get_nth_naive_tuples(nv,
np, np,
(int64_t)iteration - 1); iteration - 1);
) )
#else #else
WITH_CHRONO("db:comm:naive:tuples", WITH_CHRONO("db:comm:naive:tuples",
const auto tuples = nth_atrip_distributed((int64_t)iteration, const auto tuples = nth_atrip_distributed(iteration,
nv, nv,
np); np);
const auto prev_tuples = nth_atrip_distributed((int64_t)iteration - 1, const auto prev_tuples = nth_atrip_distributed(iteration - 1,
nv, nv,
np); np);
) )
if (false)
for (size_t rank = 0; rank < np; rank++) {
std::cout << Atrip::rank << ":"
<< " :tuples< " << rank << ">" << iteration
<< " :abc " << tuples[rank][0]
<< ", " << tuples[rank][1]
<< ", " << tuples[rank][2] << "\n";
std::cout << Atrip::rank << ":"
<< " :prev-tuples< " << rank << ">" << iteration
<< " :abc-prev " << prev_tuples[rank][0]
<< ", " << prev_tuples[rank][1]
<< ", " << prev_tuples[rank][2] << "\n";
}
#endif #endif
for (size_t rank = 0; rank < np; rank++) { for (size_t rank = 0; rank < np; rank++) {

View File

@ -156,7 +156,6 @@ namespace cuda {
* in order to have an argument in the signature of * in order to have an argument in the signature of
* the function that helps the compiler know which * the function that helps the compiler know which
* instantiation it should take. * instantiation it should take.
*
*/ */
template <typename F, reordering_t R> template <typename F, reordering_t R>
struct reorder_proxy {}; struct reorder_proxy {};
@ -436,9 +435,7 @@ double getEnergySame
, DataFieldType<F>* Tijk_ , DataFieldType<F>* Tijk_
) { ) {
const size_t a = abc[0], b = abc[1], c = abc[2] const size_t NoNo = No*No;
, NoNo = No*No
;
DataFieldType<F>* Tijk = (DataFieldType<F>*)Tijk_; DataFieldType<F>* Tijk = (DataFieldType<F>*)Tijk_;
@ -451,7 +448,7 @@ double getEnergySame
>{}, \ >{}, \
No, \ No, \
Tijk, \ Tijk, \
_t_buffer); _t_buffer)
#define DGEMM_PARTICLES(__A, __B) \ #define DGEMM_PARTICLES(__A, __B) \
atrip::xgemm<F>("T", \ atrip::xgemm<F>("T", \
"N", \ "N", \
@ -482,10 +479,17 @@ double getEnergySame
(int const*)&NoNo \ (int const*)&NoNo \
) )
#define MAYBE_CONJ(_conj, _buffer) \ #define MAYBE_CONJ(_conj, _buffer) \
do { \
cuda::maybeConjugate<<< \ cuda::maybeConjugate<<< \
\
Atrip::kernelDimensions.ooo.blocks, \ Atrip::kernelDimensions.ooo.blocks, \
\
Atrip::kernelDimensions.ooo.threads \ Atrip::kernelDimensions.ooo.threads \
>>>((DataFieldType<F>*)_conj, (DataFieldType<F>*)_buffer, NoNoNo); \
>>>((DataFieldType<F>*)_conj, \
(DataFieldType<F>*)_buffer, \
NoNoNo); \
} while (0)
// END CUDA //////////////////////////////////////////////////////////////////// // END CUDA ////////////////////////////////////////////////////////////////////
@ -500,7 +504,9 @@ double getEnergySame
#define REORDER(__II, __JJ, __KK) \ #define REORDER(__II, __JJ, __KK) \
reorder(reorder_proxy<DataFieldType<F>, \ reorder(reorder_proxy<DataFieldType<F>, \
__II ## __JJ ## __KK >{}, \ __II ## __JJ ## __KK >{}, \
No, Tijk, _t_buffer); No, \
Tijk, \
_t_buffer)
#define DGEMM_PARTICLES(__A, __B) \ #define DGEMM_PARTICLES(__A, __B) \
atrip::xgemm<F>("T", \ atrip::xgemm<F>("T", \
"N", \ "N", \
@ -532,8 +538,12 @@ double getEnergySame
(int const*)&NoNo \ (int const*)&NoNo \
) )
#define MAYBE_CONJ(_conj, _buffer) \ #define MAYBE_CONJ(_conj, _buffer) \
for (size_t __i = 0; __i < NoNoNo; ++__i) \ do { \
_conj[__i] = maybeConjugate<F>(_buffer[__i]); for (size_t __i = 0; __i < NoNoNo; ++__i) { \
_conj[__i] \
= maybeConjugate<F>(_buffer[__i]); \
} \
} while (0)
#endif #endif
F one{1.0}, m_one{-1.0}, zero{0.0}; F one{1.0}, m_one{-1.0}, zero{0.0};
@ -552,8 +562,12 @@ double getEnergySame
const size_t const size_t
bs = Atrip::kernelDimensions.ooo.blocks, bs = Atrip::kernelDimensions.ooo.blocks,
ths = Atrip::kernelDimensions.ooo.threads; ths = Atrip::kernelDimensions.ooo.threads;
#if !defined(ATRIP_ONLY_DGEMM)
cuda::zeroing<<<bs, ths>>>((DataFieldType<F>*)_t_buffer, NoNoNo); cuda::zeroing<<<bs, ths>>>((DataFieldType<F>*)_t_buffer, NoNoNo);
cuda::zeroing<<<bs, ths>>>((DataFieldType<F>*)_vhhh, NoNoNo); cuda::zeroing<<<bs, ths>>>((DataFieldType<F>*)_vhhh, NoNoNo);
#endif
#else #else
DataFieldType<F>* _t_buffer = (DataFieldType<F>*)malloc(NoNoNo * sizeof(F)); DataFieldType<F>* _t_buffer = (DataFieldType<F>*)malloc(NoNoNo * sizeof(F));
DataFieldType<F>* _vhhh = (DataFieldType<F>*)malloc(NoNoNo * sizeof(F)); DataFieldType<F>* _vhhh = (DataFieldType<F>*)malloc(NoNoNo * sizeof(F));
@ -565,7 +579,7 @@ double getEnergySame
#endif #endif
// Set Tijk to zero // Set Tijk to zero
#ifdef HAVE_CUDA #if defined(HAVE_CUDA) && !defined(ATRIP_ONLY_DGEMM)
WITH_CHRONO("double:reorder", WITH_CHRONO("double:reorder",
cuda::zeroing<<<bs, ths>>>((DataFieldType<F>*)Tijk, cuda::zeroing<<<bs, ths>>>((DataFieldType<F>*)Tijk,
NoNoNo); NoNoNo);
@ -577,43 +591,51 @@ double getEnergySame
}) })
#endif #endif
#if defined(ATRIP_ONLY_DGEMM)
#undef MAYBE_CONJ
#undef REORDER
#define MAYBE_CONJ(a, b) do {} while(0)
#define REORDER(i, j, k) do {} while(0)
#endif
// HOLES // HOLES
WITH_CHRONO("doubles:holes", WITH_CHRONO("doubles:holes",
{ {
// VhhhC[i + k*No + L*NoNo] * TABhh[L + j*No]; H1 // VhhhC[i + k*No + L*NoNo] * TABhh[L + j*No]; H1
MAYBE_CONJ(_vhhh, VhhhC) MAYBE_CONJ(_vhhh, VhhhC);
WITH_CHRONO("doubles:holes:1", WITH_CHRONO("doubles:holes:1",
DGEMM_HOLES(_vhhh, TABhh, "N"); DGEMM_HOLES(_vhhh, TABhh, "N");
REORDER(I, K, J) REORDER(I, K, J);
) )
// VhhhC[j + k*No + L*NoNo] * TABhh[i + L*No]; H0 // VhhhC[j + k*No + L*NoNo] * TABhh[i + L*No]; H0
WITH_CHRONO("doubles:holes:2", WITH_CHRONO("doubles:holes:2",
DGEMM_HOLES(_vhhh, TABhh, "T"); DGEMM_HOLES(_vhhh, TABhh, "T");
REORDER(J, K, I) REORDER(J, K, I);
) )
// VhhhB[i + j*No + L*NoNo] * TAChh[L + k*No]; H5 // VhhhB[i + j*No + L*NoNo] * TAChh[L + k*No]; H5
MAYBE_CONJ(_vhhh, VhhhB) MAYBE_CONJ(_vhhh, VhhhB);
WITH_CHRONO("doubles:holes:3", WITH_CHRONO("doubles:holes:3",
DGEMM_HOLES(_vhhh, TAChh, "N"); DGEMM_HOLES(_vhhh, TAChh, "N");
REORDER(I, J, K) REORDER(I, J, K);
) )
// VhhhB[k + j*No + L*NoNo] * TAChh[i + L*No]; H3 // VhhhB[k + j*No + L*NoNo] * TAChh[i + L*No]; H3
WITH_CHRONO("doubles:holes:4", WITH_CHRONO("doubles:holes:4",
DGEMM_HOLES(_vhhh, TAChh, "T"); DGEMM_HOLES(_vhhh, TAChh, "T");
REORDER(K, J, I) REORDER(K, J, I);
) )
// VhhhA[j + i*No + L*NoNo] * TBChh[L + k*No]; H1 // VhhhA[j + i*No + L*NoNo] * TBChh[L + k*No]; H1
MAYBE_CONJ(_vhhh, VhhhA) MAYBE_CONJ(_vhhh, VhhhA);
WITH_CHRONO("doubles:holes:5", WITH_CHRONO("doubles:holes:5",
DGEMM_HOLES(_vhhh, TBChh, "N"); DGEMM_HOLES(_vhhh, TBChh, "N");
REORDER(J, I, K) REORDER(J, I, K);
) )
// VhhhA[k + i*No + L*NoNo] * TBChh[j + L*No]; H4 // VhhhA[k + i*No + L*NoNo] * TBChh[j + L*No]; H4
WITH_CHRONO("doubles:holes:6", WITH_CHRONO("doubles:holes:6",
DGEMM_HOLES(_vhhh, TBChh, "T"); DGEMM_HOLES(_vhhh, TBChh, "T");
REORDER(K, I, J) REORDER(K, I, J);
) )
} }
) )
@ -625,32 +647,32 @@ double getEnergySame
// TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv]; P0 // TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv]; P0
WITH_CHRONO("doubles:particles:1", WITH_CHRONO("doubles:particles:1",
DGEMM_PARTICLES(TAphh, VBCph); DGEMM_PARTICLES(TAphh, VBCph);
REORDER(I, J, K) REORDER(I, J, K);
) )
// TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv]; P3 // TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv]; P3
WITH_CHRONO("doubles:particles:2", WITH_CHRONO("doubles:particles:2",
DGEMM_PARTICLES(TAphh, VCBph); DGEMM_PARTICLES(TAphh, VCBph);
REORDER(I, K, J) REORDER(I, K, J);
) )
// TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv]; P5 // TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv]; P5
WITH_CHRONO("doubles:particles:3", WITH_CHRONO("doubles:particles:3",
DGEMM_PARTICLES(TCphh, VABph); DGEMM_PARTICLES(TCphh, VABph);
REORDER(K, I, J) REORDER(K, I, J);
) )
// TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv]; P2 // TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv]; P2
WITH_CHRONO("doubles:particles:4", WITH_CHRONO("doubles:particles:4",
DGEMM_PARTICLES(TCphh, VBAph); DGEMM_PARTICLES(TCphh, VBAph);
REORDER(K, J, I) REORDER(K, J, I);
) )
// TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv]; P1 // TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv]; P1
WITH_CHRONO("doubles:particles:5", WITH_CHRONO("doubles:particles:5",
DGEMM_PARTICLES(TBphh, VACph); DGEMM_PARTICLES(TBphh, VACph);
REORDER(J, I, K) REORDER(J, I, K);
) )
// TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv]; P4 // TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv]; P4
WITH_CHRONO("doubles:particles:6", WITH_CHRONO("doubles:particles:6",
DGEMM_PARTICLES(TBphh, VCAph); DGEMM_PARTICLES(TBphh, VCAph);
REORDER(J, K, I) REORDER(J, K, I);
) )
} }
) )

132
tools/configure-benches.sh Executable file
View File

@ -0,0 +1,132 @@
#!/usr/bin/env bash
# Copyright (C) 2022 by Alejandro Gallo <aamsgallo@gmail.com>
set -eu
flags=("${@}")
PROJECTS=()
#
## Check root directory
#
root_project=$(git rev-parse --show-toplevel)
configure=$root_project/configure
if [[ $(basename $PWD) == $(basename $root_project) ]]; then
cat <<EOF
You are trying to build in the root directory, create a build folder
and then configure.
mkdir build
cd build
$(readlink -f $0)
EOF
exit 1
fi
[[ -f $configure ]] || {
cat <<EOF
No configure script at $configure create it with bootstrap.sh or
autoreconf -vif
EOF
exit 1
}
#
## Create configuration function
#
create_config () {
file=$1
name=$2
PROJECTS=(${PROJECTS[@]} "$name")
mkdir -p $name
cd $name
echo "> creating: $name"
cat <<SH > configure
#!/usr/bin/env bash
# created by $0 on $(date)
$root_project/configure $(cat $file | paste -s) \\
$(for word in "${flags[@]}"; do
printf " \"%s\"" "$word";
done)
exit 0
SH
chmod +x configure
cd - > /dev/null
}
#
## default configuration
#
tmp=`mktemp`
cat <<EOF > $tmp
--disable-slice
EOF
create_config $tmp default
rm $tmp
#
## only-dgemm configuration
#
tmp=`mktemp`
cat <<EOF > $tmp
--disable-slice
--enable-only-dgemm
EOF
create_config $tmp only-dgemm
rm $tmp
#
## Create makefile
#
cat <<MAKE > Makefile
all: configure do
do: configure
configure: ${PROJECTS[@]/%/\/Makefile}
%/Makefile: %/configure
cd \$* && ./configure
do: ${PROJECTS[@]/%/\/src\/libatrip.a}
%/src/libatrip.a:
cd \$* && \$(MAKE)
.PHONY: configure do all
MAKE
cat <<EOF
Now you can do
make all
or go into one of the directories
${PROJECTS[@]}
and do
./configure
make
EOF
## Emacs stuff
# Local Variables:
# eval: (outline-minor-mode)
# outline-regexp: "## "
# End: