Changes in source files, makes cuda run

This commit is contained in:
Gallo Alejandro 2022-08-05 13:42:04 +02:00
parent e03dd77904
commit a5b2a74e18
22 changed files with 1408 additions and 539 deletions

2
.gitignore vendored
View File

@ -1,3 +1,5 @@
*.swo
*.swp
etc/autotools/ etc/autotools/
Makefile.in Makefile.in
build build

View File

@ -21,6 +21,27 @@ AC_ARG_ENABLE(shared,
files (default=YES)]), files (default=YES)]),
[], [enable_shared=yes]) [], [enable_shared=yes])
AC_ARG_ENABLE(
[slice],
[AS_HELP_STRING(
[--disable-slice],
[Disable the step of slicing tensors for CTF, this is useful for example for benchmarking or testing.])],
[atrip_dont_slice=1
AC_DEFINE([ATRIP_DONT_SLICE],1,[Wether CTF will slice tensors or skip the step])
],
[atrip_dont_slice=0]
)
AC_ARG_ENABLE(
[atrip_dgemm],
[AS_HELP_STRING(
[--disable-dgemm],
[Disable using dgemm for the doubles equations])],
[],
[AC_DEFINE([ATRIP_USE_DGEMM],1,[Use dgemm for the doubles equations])]
)
AC_ARG_ENABLE([docs], AC_ARG_ENABLE([docs],
[AS_HELP_STRING([--enable-docs], [AS_HELP_STRING([--enable-docs],
[Enable building docs])], [Enable building docs])],
@ -28,7 +49,7 @@ AC_ARG_ENABLE([docs],
dnl LIBGC library options dnl LIBGC library options
AC_ARG_WITH(libctf-prefix, AC_ARG_WITH(libctf-prefix,
AS_HELP_STRING([--with-libctf-prefix=path], AS_HELP_STRING([--with-ctf],
[prefix for CTF includes and libraries] ), [prefix for CTF includes and libraries] ),
[LIBCTF_PATH="`readlink -f $withval`"; [LIBCTF_PATH="`readlink -f $withval`";
LIBCTF_CPATH="`readlink -f $withval`/include"; LIBCTF_CPATH="`readlink -f $withval`/include";
@ -44,11 +65,6 @@ AC_ARG_WITH([clang-check],
[clang_check=NO]) [clang_check=NO])
AM_CONDITIONAL([WITH_CLANG_CHECK], [test x${clang_check} = xYES]) AM_CONDITIONAL([WITH_CLANG_CHECK], [test x${clang_check} = xYES])
AC_ARG_WITH(dgemm,
AS_HELP_STRING([--without-dgemm], [Disable dgemm]),
[with_dgemm=NO],
[with_dgemm=YES])
AC_ARG_ENABLE([cuda], AC_ARG_ENABLE([cuda],
[AS_HELP_STRING([--enable-cuda], [AS_HELP_STRING([--enable-cuda],
[Build with cuda])], [Build with cuda])],
@ -59,6 +75,12 @@ AC_ARG_VAR([CUDA_LDFLAGS], [LDFLAGS to find libraries -lcuda, -lcudart, -lcublas
AC_ARG_VAR([CUDA_CXXFLAGS], [CXXFLAGS to find the CUDA headers]) AC_ARG_VAR([CUDA_CXXFLAGS], [CXXFLAGS to find the CUDA headers])
AC_ARG_WITH([atrip-debug],
[AS_HELP_STRING([--with-atrip-debug],
[Debug level for atrip, possible values: 1, 2, 3, 4])],
[AC_DEFINE([ATRIP_DEBUG],[atrip-debug],[Atrip debug level])],
[AC_DEFINE([ATRIP_DEBUG],[1],[Atrip debug level])]
)
dnl ----------------------------------------------------------------------- dnl -----------------------------------------------------------------------

View File

@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
// [[file:../atrip.org::*Include header][Include header:1]] // [[file:~/cuda/atrip/atrip.org::*Include%20header][Include header:1]]
#pragma once #pragma once
#include <atrip/Atrip.hpp> #include <atrip/Atrip.hpp>

View File

@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
// [[file:../../atrip.org::*Header][Header:1]] // [[file:~/cuda/atrip/atrip.org::*Header][Header:1]]
#pragma once #pragma once
#include <sstream> #include <sstream>
#include <string> #include <string>
@ -41,12 +41,21 @@ namespace atrip {
struct Atrip { struct Atrip {
static int rank; static size_t rank;
static int np; static size_t np;
static MPI_Comm communicator; static MPI_Comm communicator;
static Timings chrono; static Timings chrono;
#if defined(HAVE_CUDA)
struct CudaContext {
cublasStatus_t status;
cublasHandle_t handle;
};
static CudaContext cuda;
#endif
static void init(MPI_Comm); static void init(MPI_Comm);
template <typename F=double> template <typename F=double>
struct Input { struct Input {
CTF::Tensor<F> *ei = nullptr CTF::Tensor<F> *ei = nullptr

View File

@ -12,12 +12,16 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
// [[file:../../atrip.org::*Blas][Blas:1]] // [[file:~/cuda/atrip/atrip.org::*Blas][Blas:1]]
#pragma once #pragma once
#include <atrip/Complex.hpp>
#include <atrip/Types.hpp>
#include "config.h"
namespace atrip { namespace atrip {
using Complex = std::complex<double>; #if !defined(HAVE_CUDA)
extern "C" { extern "C" {
void dgemm_( void dgemm_(
const char *transa, const char *transa,
@ -50,49 +54,43 @@ namespace atrip {
Complex *C, Complex *C,
const int *ldc const int *ldc
); );
void dcopy_(const int n,
const double *x,
const int incx,
double *y,
const int incy);
void zcopy_(const int n,
const void *x,
const int incx,
void *y,
const int incy);
} }
#endif
template <typename F>
void xcopy(const int n,
const DataFieldType<F>* x,
const int incx,
DataFieldType<F>* y,
const int incy);
template <typename F=double> template <typename F>
void xgemm(const char *transa, void xgemm(const char *transa,
const char *transb, const char *transb,
const int *m, const int *m,
const int *n, const int *n,
const int *k, const int *k,
F *alpha, F *alpha,
const F *A, const DataFieldType<F> *A,
const int *lda, const int *lda,
const F *B, const DataFieldType<F> *B,
const int *ldb, const int *ldb,
F *beta, F *beta,
F *C, DataFieldType<F> *C,
const int *ldc) { const int *ldc);
dgemm_(transa, transb,
m, n, k,
alpha, A, lda,
B, ldb, beta,
C, ldc);
}
template <>
void xgemm(const char *transa,
const char *transb,
const int *m,
const int *n,
const int *k,
Complex *alpha,
const Complex *A,
const int *lda,
const Complex *B,
const int *ldb,
Complex *beta,
Complex *C,
const int *ldc) {
zgemm_(transa, transb,
m, n, k,
alpha, A, lda,
B, ldb, beta,
C, ldc);
}
} }
// Blas:1 ends here // Blas:1 ends here

9
include/atrip/CUDA.hpp Normal file
View File

@ -0,0 +1,9 @@
#pragma once
#if defined(HAVE_CUDA) && defined(__CUDACC__)
# define __MAYBE_GLOBAL__ __global__
# define __MAYBE_DEVICE__ __device__
#else
# define __MAYBE_GLOBAL__
# define __MAYBE_DEVICE__
#endif

View File

@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
// [[file:../../atrip.org::*Prolog][Prolog:1]] // [[file:~/cuda/atrip/atrip.org::*Prolog][Prolog:1]]
#pragma once #pragma once
#include <fstream> #include <fstream>
#include <iomanip> #include <iomanip>
@ -22,7 +22,7 @@
namespace atrip { namespace atrip {
// Prolog:1 ends here // Prolog:1 ends here
// [[file:../../atrip.org::checkpoint-definition][checkpoint-definition]] // [[file:~/cuda/atrip/atrip.org::checkpoint-definition][checkpoint-definition]]
// template <typename F> // template <typename F>
struct Checkpoint { struct Checkpoint {
size_t no, nv; size_t no, nv;
@ -36,7 +36,7 @@ struct Checkpoint {
}; };
// checkpoint-definition ends here // checkpoint-definition ends here
// [[file:../../atrip.org::*Input and output][Input and output:1]] // [[file:~/cuda/atrip/atrip.org::*Input%20and%20output][Input and output:1]]
void write_checkpoint(Checkpoint const& c, std::string const& filepath) { void write_checkpoint(Checkpoint const& c, std::string const& filepath) {
std::ofstream out(filepath); std::ofstream out(filepath);
out << "No: " << c.no out << "No: " << c.no
@ -87,6 +87,6 @@ Checkpoint read_checkpoint(std::string const& filepath) {
} }
// Input and output:1 ends here // Input and output:1 ends here
// [[file:../../atrip.org::*Epilog][Epilog:1]] // [[file:~/cuda/atrip/atrip.org::*Epilog][Epilog:1]]
} }
// Epilog:1 ends here // Epilog:1 ends here

46
include/atrip/Complex.hpp Normal file
View File

@ -0,0 +1,46 @@
// Copyright 2022 Alejandro Gallo
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// [[file:~/cuda/atrip/atrip.org::*Complex%20numbers][Complex numbers:1]]
#pragma once
#include <complex>
#include <mpi.h>
#include "config.h"
#if defined(HAVE_CUDA)
#include <cuComplex.h>
#endif
namespace atrip {
using Complex = std::complex<double>;
template <typename F> F maybeConjugate(const F);
#if defined(HAVE_CUDA)
cuDoubleComplex& operator+=(cuDoubleComplex& lz, cuDoubleComplex const& rz);
#endif
namespace traits {
template <typename FF> bool isComplex();
namespace mpi {
template <typename F> MPI_Datatype datatypeOf(void);
}
}
}
// Complex numbers:1 ends here

View File

@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
// [[file:../../atrip.org::*Macros][Macros:1]] // [[file:~/cuda/atrip/atrip.org::*Macros][Macros:1]]
#pragma once #pragma once
#include <functional> #include <functional>
#define ATRIP_BENCHMARK #define ATRIP_BENCHMARK
@ -21,7 +21,6 @@
# define ATRIP_DEBUG 1 # define ATRIP_DEBUG 1
#endif #endif
//#define ATRIP_WORKLOAD_DUMP //#define ATRIP_WORKLOAD_DUMP
#define ATRIP_USE_DGEMM
//#define ATRIP_PRINT_TUPLES //#define ATRIP_PRINT_TUPLES
#ifndef ATRIP_DEBUG #ifndef ATRIP_DEBUG
@ -75,20 +74,20 @@
#endif #endif
// Macros:1 ends here // Macros:1 ends here
// [[file:../../atrip.org::*Macros][Macros:2]] // [[file:~/cuda/atrip/atrip.org::*Macros][Macros:2]]
#ifndef LOG #ifndef LOG
#define LOG(level, name) if (Atrip::rank == 0) std::cout << name << ": " #define LOG(level, name) if (Atrip::rank == 0) std::cout << name << ": "
#endif #endif
// Macros:2 ends here // Macros:2 ends here
// [[file:../../atrip.org::*Macros][Macros:3]] // [[file:~/cuda/atrip/atrip.org::*Macros][Macros:3]]
#ifdef ATRIP_NO_OUTPUT #ifdef ATRIP_NO_OUTPUT
# undef LOG # undef LOG
# define LOG(level, name) if (false) std::cout << name << ": " # define LOG(level, name) if (false) std::cout << name << ": "
#endif #endif
// Macros:3 ends here // Macros:3 ends here
// [[file:../../atrip.org::IterationDescriptor][IterationDescriptor]] // [[file:~/cuda/atrip/atrip.org::IterationDescriptor][IterationDescriptor]]
namespace atrip { namespace atrip {
struct IterationDescription; struct IterationDescription;

View File

@ -12,371 +12,94 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
// [[file:../../atrip.org::*Equations][Equations:1]] // [[file:~/cuda/atrip/atrip.org::*Prolog][Prolog:1]]
#pragma once #pragma once
#include<atrip/Slice.hpp> #include<atrip/Atrip.hpp>
#include<atrip/Blas.hpp> #include<atrip/Blas.hpp>
#include<atrip/Utils.hpp>
#if defined(HAVE_CUDA)
#include<thrust/device_vector.h>
#endif
namespace atrip { namespace atrip {
using ABCTuple = std::array<size_t, 3>;
using PartialTuple = std::array<size_t, 2>;
using ABCTuples = std::vector<ABCTuple>;
// Prolog:1 ends here
template <typename F=double> // [[file:~/cuda/atrip/atrip.org::*Energy][Energy:1]]
double getEnergyDistinct template <typename F=double>
( const F epsabc double getEnergyDistinct
, std::vector<F> const& epsi ( F const epsabc
, std::vector<F> const& Tijk_ , size_t const No
, std::vector<F> const& Zijk_ , F* const epsi
) { , F* const Tijk
constexpr size_t blockSize=16; , F* const Zijk
F energy(0.); );
const size_t No = epsi.size();
for (size_t kk=0; kk<No; kk+=blockSize){
const size_t kend( std::min(No, kk+blockSize) );
for (size_t jj(kk); jj<No; jj+=blockSize){
const size_t jend( std::min( No, jj+blockSize) );
for (size_t ii(jj); ii<No; ii+=blockSize){
const size_t iend( std::min( No, ii+blockSize) );
for (size_t k(kk); k < kend; k++){
const F ek(epsi[k]);
const size_t jstart = jj > k ? jj : k;
for (size_t j(jstart); j < jend; j++){
F const ej(epsi[j]);
F const facjk = j == k ? F(0.5) : F(1.0);
size_t istart = ii > j ? ii : j;
for (size_t i(istart); i < iend; i++){
const F
ei(epsi[i])
, facij = i == j ? F(0.5) : F(1.0)
, denominator(epsabc - ei - ej - ek)
, U(Zijk_[i + No*j + No*No*k])
, V(Zijk_[i + No*k + No*No*j])
, W(Zijk_[j + No*i + No*No*k])
, X(Zijk_[j + No*k + No*No*i])
, Y(Zijk_[k + No*i + No*No*j])
, Z(Zijk_[k + No*j + No*No*i])
, A(maybeConjugate<F>(Tijk_[i + No*j + No*No*k]))
, B(maybeConjugate<F>(Tijk_[i + No*k + No*No*j]))
, C(maybeConjugate<F>(Tijk_[j + No*i + No*No*k]))
, D(maybeConjugate<F>(Tijk_[j + No*k + No*No*i]))
, E(maybeConjugate<F>(Tijk_[k + No*i + No*No*j]))
, _F(maybeConjugate<F>(Tijk_[k + No*j + No*No*i]))
, value
= 3.0 * ( A * U
+ B * V
+ C * W
+ D * X
+ E * Y
+ _F * Z )
+ ( ( U + X + Y )
- 2.0 * ( V + W + Z )
) * ( A + D + E )
+ ( ( V + W + Z )
- 2.0 * ( U + X + Y )
) * ( B + C + _F )
;
energy += 2.0 * value / denominator * facjk * facij;
} // i
} // j
} // k
} // ii
} // jj
} // kk
return std::real(energy);
}
template <typename F=double>
double getEnergySame
( F const epsabc
, size_t const No
, F* const epsi
, F* const Tijk
, F* const Zijk
);
// Energy:1 ends here
template <typename F=double> // [[file:~/cuda/atrip/atrip.org::*Singles%20contribution][Singles contribution:1]]
double getEnergySame template <typename F=double>
( const F epsabc #ifdef HAVE_CUDA
, std::vector<F> const& epsi __global__
, std::vector<F> const& Tijk_ #endif
, std::vector<F> const& Zijk_ void singlesContribution
) {
constexpr size_t blockSize = 16;
const size_t No = epsi.size();
F energy = F(0.);
for (size_t kk=0; kk<No; kk+=blockSize){
const size_t kend( std::min( kk+blockSize, No) );
for (size_t jj(kk); jj<No; jj+=blockSize){
const size_t jend( std::min( jj+blockSize, No) );
for (size_t ii(jj); ii<No; ii+=blockSize){
const size_t iend( std::min( ii+blockSize, No) );
for (size_t k(kk); k < kend; k++){
const F ek(epsi[k]);
const size_t jstart = jj > k ? jj : k;
for(size_t j(jstart); j < jend; j++){
const F facjk( j == k ? F(0.5) : F(1.0));
const F ej(epsi[j]);
const size_t istart = ii > j ? ii : j;
for(size_t i(istart); i < iend; i++){
const F
ei(epsi[i])
, facij ( i==j ? F(0.5) : F(1.0))
, denominator(epsabc - ei - ej - ek)
, U(Zijk_[i + No*j + No*No*k])
, V(Zijk_[j + No*k + No*No*i])
, W(Zijk_[k + No*i + No*No*j])
, A(maybeConjugate<F>(Tijk_[i + No*j + No*No*k]))
, B(maybeConjugate<F>(Tijk_[j + No*k + No*No*i]))
, C(maybeConjugate<F>(Tijk_[k + No*i + No*No*j]))
, value
= F(3.0) * ( A * U
+ B * V
+ C * W
)
- ( A + B + C ) * ( U + V + W )
;
energy += F(2.0) * value / denominator * facjk * facij;
} // i
} // j
} // k
} // ii
} // jj
} // kk
return std::real(energy);
}
template <typename F=double>
void singlesContribution
( size_t No ( size_t No
, size_t Nv , size_t Nv
, const ABCTuple &abc , size_t a
, F const* Tph , size_t b
, F const* VABij , size_t c
, F const* VACij , DataFieldType<F>* const Tph
, F const* VBCij , DataFieldType<F>* const VABij
, F *Zijk , DataFieldType<F>* const VACij
) { , DataFieldType<F>* const VBCij
const size_t a(abc[0]), b(abc[1]), c(abc[2]); , DataFieldType<F>* Zijk
for (size_t k=0; k < No; k++) );
for (size_t i=0; i < No; i++) // Singles contribution:1 ends here
for (size_t j=0; j < No; j++) {
const size_t ijk = i + j*No + k*No*No
, jk = j + No * k
;
Zijk[ijk] += Tph[ a + i * Nv ] * VBCij[ j + k * No ];
Zijk[ijk] += Tph[ b + j * Nv ] * VACij[ i + k * No ];
Zijk[ijk] += Tph[ c + k * Nv ] * VABij[ i + j * No ];
}
}
// [[file:~/cuda/atrip/atrip.org::*Doubles%20contribution][Doubles contribution:1]]
template <typename F=double> template <typename F=double>
void doublesContribution void doublesContribution
( const ABCTuple &abc ( const ABCTuple &abc
, size_t const No , size_t const No
, size_t const Nv , size_t const Nv
// -- VABCI // -- VABCI
, F const* VABph , DataPtr<F> const VABph
, F const* VACph , DataPtr<F> const VACph
, F const* VBCph , DataPtr<F> const VBCph
, F const* VBAph , DataPtr<F> const VBAph
, F const* VCAph , DataPtr<F> const VCAph
, F const* VCBph , DataPtr<F> const VCBph
// -- VHHHA // -- VHHHA
, F const* VhhhA , DataPtr<F> const VhhhA
, F const* VhhhB , DataPtr<F> const VhhhB
, F const* VhhhC , DataPtr<F> const VhhhC
// -- TA // -- TA
, F const* TAphh , DataPtr<F> const TAphh
, F const* TBphh , DataPtr<F> const TBphh
, F const* TCphh , DataPtr<F> const TCphh
// -- TABIJ // -- TABIJ
, F const* TABhh , DataPtr<F> const TABhh
, F const* TAChh , DataPtr<F> const TAChh
, F const* TBChh , DataPtr<F> const TBChh
// -- TIJK // -- TIJK
, F *Tijk // , DataPtr<F> Tijk
) { , DataFieldType<F>* Tijk_
const size_t a = abc[0], b = abc[1], c = abc[2]
, NoNo = No*No, NoNv = No*Nv
;
#if defined(ATRIP_USE_DGEMM)
#define _IJK_(i, j, k) i + j*No + k*NoNo
#define REORDER(__II, __JJ, __KK) \
WITH_CHRONO("doubles:reorder", \
for (size_t k = 0; k < No; k++) \
for (size_t j = 0; j < No; j++) \
for (size_t i = 0; i < No; i++) { \
Tijk[_IJK_(i, j, k)] += _t_buffer[_IJK_(__II, __JJ, __KK)]; \
} \
)
#define DGEMM_PARTICLES(__A, __B) \
atrip::xgemm<F>( "T" \
, "N" \
, (int const*)&NoNo \
, (int const*)&No \
, (int const*)&Nv \
, &one \
, __A \
, (int const*)&Nv \
, __B \
, (int const*)&Nv \
, &zero \
, _t_buffer.data() \
, (int const*)&NoNo \
); );
#define DGEMM_HOLES(__A, __B, __TRANSB) \ // Doubles contribution:1 ends here
atrip::xgemm<F>( "N" \
, __TRANSB \
, (int const*)&NoNo \
, (int const*)&No \
, (int const*)&No \
, &m_one \
, __A \
, (int const*)&NoNo \
, __B \
, (int const*)&No \
, &zero \
, _t_buffer.data() \
, (int const*)&NoNo \
);
#define MAYBE_CONJ(_conj, _buffer) \
for (size_t __i = 0; __i < NoNoNo; ++__i) \
_conj[__i] = maybeConjugate<F>(_buffer[__i]); \
const size_t NoNoNo = No*NoNo;
std::vector<F> _t_buffer;
_t_buffer.reserve(NoNoNo);
F one{1.0}, m_one{-1.0}, zero{0.0};
WITH_CHRONO("double:reorder",
for (size_t k = 0; k < NoNoNo; k++) {
Tijk[k] = 0.0;
})
// TOMERGE: replace chronos
WITH_CHRONO("doubles:holes",
{ // Holes part %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
std::vector<F> _vhhh(NoNoNo);
// VhhhC[i + k*No + L*NoNo] * TABhh[L + j*No]; H1
MAYBE_CONJ(_vhhh, VhhhC)
WITH_CHRONO("doubles:holes:1",
DGEMM_HOLES(_vhhh.data(), TABhh, "N")
REORDER(i, k, j)
)
// VhhhC[j + k*No + L*NoNo] * TABhh[i + L*No]; H0
WITH_CHRONO("doubles:holes:2",
DGEMM_HOLES(_vhhh.data(), TABhh, "T")
REORDER(j, k, i)
)
// VhhhB[i + j*No + L*NoNo] * TAChh[L + k*No]; H5
MAYBE_CONJ(_vhhh, VhhhB)
WITH_CHRONO("doubles:holes:3",
DGEMM_HOLES(_vhhh.data(), TAChh, "N")
REORDER(i, j, k)
)
// VhhhB[k + j*No + L*NoNo] * TAChh[i + L*No]; H3
WITH_CHRONO("doubles:holes:4",
DGEMM_HOLES(_vhhh.data(), TAChh, "T")
REORDER(k, j, i)
)
// VhhhA[j + i*No + L*NoNo] * TBChh[L + k*No]; H1
MAYBE_CONJ(_vhhh, VhhhA)
WITH_CHRONO("doubles:holes:5",
DGEMM_HOLES(_vhhh.data(), TBChh, "N")
REORDER(j, i, k)
)
// VhhhA[k + i*No + L*NoNo] * TBChh[j + L*No]; H4
WITH_CHRONO("doubles:holes:6",
DGEMM_HOLES(_vhhh.data(), TBChh, "T")
REORDER(k, i, j)
)
}
)
#undef MAYBE_CONJ
WITH_CHRONO("doubles:particles",
{ // Particle part %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
// TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv]; P0
WITH_CHRONO("doubles:particles:1",
DGEMM_PARTICLES(TAphh, VBCph)
REORDER(i, j, k)
)
// TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv]; P3
WITH_CHRONO("doubles:particles:2",
DGEMM_PARTICLES(TAphh, VCBph)
REORDER(i, k, j)
)
// TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv]; P5
WITH_CHRONO("doubles:particles:3",
DGEMM_PARTICLES(TCphh, VABph)
REORDER(k, i, j)
)
// TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv]; P2
WITH_CHRONO("doubles:particles:4",
DGEMM_PARTICLES(TCphh, VBAph)
REORDER(k, j, i)
)
// TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv]; P1
WITH_CHRONO("doubles:particles:5",
DGEMM_PARTICLES(TBphh, VACph)
REORDER(j, i, k)
)
// TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv]; P4
WITH_CHRONO("doubles:particles:6",
DGEMM_PARTICLES(TBphh, VCAph)
REORDER(j, k, i)
)
}
)
#undef REORDER
#undef DGEMM_HOLES
#undef DGEMM_PARTICLES
#undef _IJK_
#else
for (size_t k = 0; k < No; k++)
for (size_t j = 0; j < No; j++)
for (size_t i = 0; i < No; i++){
const size_t ijk = i + j*No + k*NoNo
, jk = j + k*No
;
Tijk[ijk] = 0.0; // :important
// HOLE DIAGRAMS: TABHH and VHHHA
for (size_t L = 0; L < No; L++){
// t[abLj] * V[Lcik] H1
// t[baLi] * V[Lcjk] H0 TODO: conjugate T for complex
Tijk[ijk] -= TABhh[L + j*No] * VhhhC[i + k*No + L*NoNo];
Tijk[ijk] -= TABhh[i + L*No] * VhhhC[j + k*No + L*NoNo];
// t[acLk] * V[Lbij] H5
// t[caLi] * V[Lbkj] H3
Tijk[ijk] -= TAChh[L + k*No] * VhhhB[i + j*No + L*NoNo];
Tijk[ijk] -= TAChh[i + L*No] * VhhhB[k + j*No + L*NoNo];
// t[bcLk] * V[Laji] H2
// t[cbLj] * V[Laki] H4
Tijk[ijk] -= TBChh[L + k*No] * VhhhA[j + i*No + L*NoNo];
Tijk[ijk] -= TBChh[j + L*No] * VhhhA[k + i*No + L*NoNo];
}
// PARTILCE DIAGRAMS: TAPHH and VABPH
for (size_t E = 0; E < Nv; E++) {
// t[aEij] * V[bcEk] P0
// t[aEik] * V[cbEj] P3 // TODO: CHECK THIS ONE, I DONT KNOW
Tijk[ijk] += TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv];
Tijk[ijk] += TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv];
// t[cEki] * V[abEj] P5
// t[cEkj] * V[baEi] P2
Tijk[ijk] += TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv];
Tijk[ijk] += TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv];
// t[bEji] * V[acEk] P1
// t[bEjk] * V[caEi] P4
Tijk[ijk] += TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv];
Tijk[ijk] += TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv];
}
}
#endif
}
// [[file:~/cuda/atrip/atrip.org::*Epilog][Epilog:1]]
} }
// Equations:1 ends here // Epilog:1 ends here

View File

@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
// [[file:../../atrip.org::*The rank mapping][The rank mapping:1]] // [[file:~/cuda/atrip/atrip.org::*The%20rank%20mapping][The rank mapping:1]]
#pragma once #pragma once
#include <vector> #include <vector>
@ -65,7 +65,8 @@ namespace atrip {
; ;
} }
bool isSourcePadding(size_t rank, size_t source) const noexcept { bool isSourcePadding(const size_t rank, const size_t source)
const noexcept {
return source == nSources() && isPaddingRank(rank); return source == nSources() && isPaddingRank(rank);
} }

View File

@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
// [[file:../../atrip.org::*Prolog][Prolog:1]] // [[file:~/cuda/atrip/atrip.org::*Prolog][Prolog:1]]
#pragma once #pragma once
#include <iostream> #include <iostream>
#include <algorithm> #include <algorithm>
@ -21,33 +21,20 @@
#include <atrip/Tuples.hpp> #include <atrip/Tuples.hpp>
#include <atrip/Utils.hpp> #include <atrip/Utils.hpp>
#include <atrip/Blas.hpp>
namespace atrip { namespace atrip {
template <typename FF> FF maybeConjugate(const FF a) { return a; }
template <> Complex maybeConjugate(const Complex a) { return std::conj(a); }
namespace traits {
template <typename FF> bool isComplex() { return false; }
template <> bool isComplex<Complex>() { return true; }
namespace mpi {
template <typename FF> MPI_Datatype datatypeOf(void);
template <> MPI_Datatype datatypeOf<double>() { return MPI_DOUBLE; }
template <> MPI_Datatype datatypeOf<Complex>() { return MPI_DOUBLE_COMPLEX; }
}
}
template <typename F=double> template <typename F=double>
struct Slice { struct Slice {
// Prolog:1 ends here // Prolog:1 ends here
// [[file:../../atrip.org::*Location][Location:1]] // [[file:~/cuda/atrip/atrip.org::*Location][Location:1]]
struct Location { size_t rank; size_t source; }; struct Location { size_t rank; size_t source; };
// Location:1 ends here // Location:1 ends here
// [[file:../../atrip.org::*Type][Type:1]] // [[file:~/cuda/atrip/atrip.org::*Type][Type:1]]
enum Type enum Type
{ A = 10 { A = 10
, B , B
@ -65,7 +52,7 @@ enum Type
}; };
// Type:1 ends here // Type:1 ends here
// [[file:../../atrip.org::*State][State:1]] // [[file:~/cuda/atrip/atrip.org::*State][State:1]]
enum State { enum State {
Fetch = 0, Fetch = 0,
Dispatched = 2, Dispatched = 2,
@ -76,7 +63,7 @@ enum State {
}; };
// State:1 ends here // State:1 ends here
// [[file:../../atrip.org::*The Info structure][The Info structure:1]] // [[file:~/cuda/atrip/atrip.org::*The%20Info%20structure][The Info structure:1]]
struct Info { struct Info {
// which part of a,b,c the slice holds // which part of a,b,c the slice holds
PartialTuple tuple; PartialTuple tuple;
@ -100,7 +87,7 @@ struct Info {
using Ty_x_Tu = std::pair< Type, PartialTuple >; using Ty_x_Tu = std::pair< Type, PartialTuple >;
// The Info structure:1 ends here // The Info structure:1 ends here
// [[file:../../atrip.org::*Name][Name:1]] // [[file:~/cuda/atrip/atrip.org::*Name][Name:1]]
enum Name enum Name
{ TA = 100 { TA = 100
, VIJKA = 101 , VIJKA = 101
@ -110,19 +97,19 @@ enum Name
}; };
// Name:1 ends here // Name:1 ends here
// [[file:../../atrip.org::*Database][Database:1]] // [[file:~/cuda/atrip/atrip.org::*Database][Database:1]]
struct LocalDatabaseElement { struct LocalDatabaseElement {
Slice<F>::Name name; Slice<F>::Name name;
Slice<F>::Info info; Slice<F>::Info info;
}; };
// Database:1 ends here // Database:1 ends here
// [[file:../../atrip.org::*Database][Database:2]] // [[file:~/cuda/atrip/atrip.org::*Database][Database:2]]
using LocalDatabase = std::vector<LocalDatabaseElement>; using LocalDatabase = std::vector<LocalDatabaseElement>;
using Database = LocalDatabase; using Database = LocalDatabase;
// Database:2 ends here // Database:2 ends here
// [[file:../../atrip.org::*MPI Types][MPI Types:1]] // [[file:~/cuda/atrip/atrip.org::*MPI%20Types][MPI Types:1]]
struct mpi { struct mpi {
static MPI_Datatype vector(size_t n, MPI_Datatype const& DT) { static MPI_Datatype vector(size_t n, MPI_Datatype const& DT) {
@ -228,7 +215,7 @@ struct mpi {
}; };
// MPI Types:1 ends here // MPI Types:1 ends here
// [[file:../../atrip.org::*Static utilities][Static utilities:1]] // [[file:~/cuda/atrip/atrip.org::*Static%20utilities][Static utilities:1]]
static static
PartialTuple subtupleBySlice(ABCTuple abc, Type sliceType) { PartialTuple subtupleBySlice(ABCTuple abc, Type sliceType) {
switch (sliceType) { switch (sliceType) {
@ -246,7 +233,7 @@ PartialTuple subtupleBySlice(ABCTuple abc, Type sliceType) {
} }
// Static utilities:1 ends here // Static utilities:1 ends here
// [[file:../../atrip.org::*Static utilities][Static utilities:2]] // [[file:~/cuda/atrip/atrip.org::*Static%20utilities][Static utilities:2]]
static std::vector<Slice<F>*> hasRecycledReferencingToIt static std::vector<Slice<F>*> hasRecycledReferencingToIt
( std::vector<Slice<F>> &slices ( std::vector<Slice<F>> &slices
, Info const& info , Info const& info
@ -263,7 +250,7 @@ static std::vector<Slice<F>*> hasRecycledReferencingToIt
} }
// Static utilities:2 ends here // Static utilities:2 ends here
// [[file:../../atrip.org::*Static utilities][Static utilities:3]] // [[file:~/cuda/atrip/atrip.org::*Static%20utilities][Static utilities:3]]
static Slice<F>& findOneByType(std::vector<Slice<F>> &slices, Slice<F>::Type type) { static Slice<F>& findOneByType(std::vector<Slice<F>> &slices, Slice<F>::Type type) {
const auto sliceIt const auto sliceIt
= std::find_if(slices.begin(), slices.end(), = std::find_if(slices.begin(), slices.end(),
@ -279,7 +266,7 @@ static Slice<F>& findOneByType(std::vector<Slice<F>> &slices, Slice<F>::Type typ
} }
// Static utilities:3 ends here // Static utilities:3 ends here
// [[file:../../atrip.org::*Static utilities][Static utilities:4]] // [[file:~/cuda/atrip/atrip.org::*Static%20utilities][Static utilities:4]]
static Slice<F>& static Slice<F>&
findRecycledSource (std::vector<Slice<F>> &slices, Slice<F>::Info info) { findRecycledSource (std::vector<Slice<F>> &slices, Slice<F>::Info info) {
const auto sliceIt const auto sliceIt
@ -305,7 +292,7 @@ findRecycledSource (std::vector<Slice<F>> &slices, Slice<F>::Info info) {
} }
// Static utilities:4 ends here // Static utilities:4 ends here
// [[file:../../atrip.org::*Static utilities][Static utilities:5]] // [[file:~/cuda/atrip/atrip.org::*Static%20utilities][Static utilities:5]]
static Slice<F>& findByTypeAbc static Slice<F>& findByTypeAbc
( std::vector<Slice<F>> &slices ( std::vector<Slice<F>> &slices
, Slice<F>::Type type , Slice<F>::Type type
@ -335,7 +322,7 @@ static Slice<F>& findByTypeAbc
} }
// Static utilities:5 ends here // Static utilities:5 ends here
// [[file:../../atrip.org::*Static utilities][Static utilities:6]] // [[file:~/cuda/atrip/atrip.org::*Static%20utilities][Static utilities:6]]
static Slice<F>& findByInfo(std::vector<Slice<F>> &slices, static Slice<F>& findByInfo(std::vector<Slice<F>> &slices,
Slice<F>::Info const& info) { Slice<F>::Info const& info) {
const auto sliceIt const auto sliceIt
@ -358,30 +345,33 @@ static Slice<F>& findByInfo(std::vector<Slice<F>> &slices,
} }
// Static utilities:6 ends here // Static utilities:6 ends here
// [[file:../../atrip.org::*Attributes][Attributes:1]] // [[file:~/cuda/atrip/atrip.org::*Attributes][Attributes:1]]
Info info; Info info;
// Attributes:1 ends here // Attributes:1 ends here
// [[file:../../atrip.org::*Attributes][Attributes:2]] // [[file:~/cuda/atrip/atrip.org::*Attributes][Attributes:2]]
F *data; DataPtr<F> data;
#if defined(HAVE_CUDA)
F* mpi_data;
#endif
// Attributes:2 ends here // Attributes:2 ends here
// [[file:../../atrip.org::*Attributes][Attributes:3]] // [[file:~/cuda/atrip/atrip.org::*Attributes][Attributes:3]]
MPI_Request request; MPI_Request request;
// Attributes:3 ends here // Attributes:3 ends here
// [[file:../../atrip.org::*Attributes][Attributes:4]] // [[file:~/cuda/atrip/atrip.org::*Attributes][Attributes:4]]
const size_t size; const size_t size;
// Attributes:4 ends here // Attributes:4 ends here
// [[file:../../atrip.org::*Member functions][Member functions:1]] // [[file:~/cuda/atrip/atrip.org::*Member%20functions][Member functions:1]]
void markReady() noexcept { void markReady() noexcept {
info.state = Ready; info.state = Ready;
info.recycling = Blank; info.recycling = Blank;
} }
// Member functions:1 ends here // Member functions:1 ends here
// [[file:../../atrip.org::*Member functions][Member functions:2]] // [[file:~/cuda/atrip/atrip.org::*Member%20functions][Member functions:2]]
bool isUnwrapped() const noexcept { bool isUnwrapped() const noexcept {
return info.state == Ready return info.state == Ready
|| info.state == SelfSufficient || info.state == SelfSufficient
@ -389,7 +379,7 @@ bool isUnwrapped() const noexcept {
} }
// Member functions:2 ends here // Member functions:2 ends here
// [[file:../../atrip.org::*Member functions][Member functions:3]] // [[file:~/cuda/atrip/atrip.org::*Member%20functions][Member functions:3]]
bool isUnwrappable() const noexcept { bool isUnwrappable() const noexcept {
return isUnwrapped() return isUnwrapped()
|| info.state == Recycled || info.state == Recycled
@ -407,7 +397,7 @@ void free() noexcept {
info.state = Acceptor; info.state = Acceptor;
info.from = {0, 0}; info.from = {0, 0};
info.recycling = Blank; info.recycling = Blank;
data = nullptr; data = DataNullPtr;
} }
inline bool isFree() const noexcept { inline bool isFree() const noexcept {
@ -417,12 +407,12 @@ inline bool isFree() const noexcept {
&& info.from.rank == 0 && info.from.rank == 0
&& info.from.source == 0 && info.from.source == 0
&& info.recycling == Blank && info.recycling == Blank
&& data == nullptr && data == DataNullPtr
; ;
} }
// Member functions:3 ends here // Member functions:3 ends here
// [[file:../../atrip.org::*Member functions][Member functions:4]] // [[file:~/cuda/atrip/atrip.org::*Member%20functions][Member functions:4]]
inline bool isRecyclable() const noexcept { inline bool isRecyclable() const noexcept {
return ( info.state == Dispatched return ( info.state == Dispatched
|| info.state == Ready || info.state == Ready
@ -433,16 +423,16 @@ inline bool isRecyclable() const noexcept {
} }
// Member functions:4 ends here // Member functions:4 ends here
// [[file:../../atrip.org::*Member functions][Member functions:5]] // [[file:~/cuda/atrip/atrip.org::*Member%20functions][Member functions:5]]
inline bool hasValidDataPointer() const noexcept { inline bool hasValidDataPointer() const noexcept {
return data != nullptr return data != DataNullPtr
&& info.state != Acceptor && info.state != Acceptor
&& info.type != Blank && info.type != Blank
; ;
} }
// Member functions:5 ends here // Member functions:5 ends here
// [[file:../../atrip.org::*Member functions][Member functions:6]] // [[file:~/cuda/atrip/atrip.org::*Member%20functions][Member functions:6]]
void unwrapAndMarkReady() { void unwrapAndMarkReady() {
if (info.state == Ready) return; if (info.state == Ready) return;
if (info.state != Dispatched) if (info.state != Dispatched)
@ -454,9 +444,17 @@ void unwrapAndMarkReady() {
WITH_RANK << "__slice__:mpi: waiting " << "\n"; WITH_RANK << "__slice__:mpi: waiting " << "\n";
#endif #endif
const int errorCode = MPI_Wait(&request, &status); const int errorCode = MPI_Wait(&request, &status);
if (MPI_SUCCESS != MPI_Request_free(&request))
throw "Error freeing MPI request";
if (errorCode != MPI_SUCCESS) if (errorCode != MPI_SUCCESS)
throw "MPI ERROR HAPPENED...."; throw "MPI ERROR HAPPENED....";
#if defined(HAVE_CUDA)
// copy the retrieved mpi data to the device
cuMemcpyHtoD(data, (void*)mpi_data, sizeof(F) * size);
std::free(mpi_data);
#endif
#ifdef HAVE_OCD #ifdef HAVE_OCD
char errorString[MPI_MAX_ERROR_STRING]; char errorString[MPI_MAX_ERROR_STRING];
int errorSize; int errorSize;
@ -474,18 +472,21 @@ void unwrapAndMarkReady() {
} }
// Member functions:6 ends here // Member functions:6 ends here
// [[file:../../atrip.org::*Epilog][Epilog:1]] // [[file:~/cuda/atrip/atrip.org::*Epilog][Epilog:1]]
Slice(size_t size_) Slice(size_t size_)
: info({}) : info({})
, data(nullptr) , data(DataNullPtr)
#if defined(HAVE_CUDA)
, mpi_data(nullptr)
#endif
, size(size_) , size(size_)
{} {}
}; // struct Slice }; // struct Slice
// Epilog:1 ends here // Epilog:1 ends here
// [[file:../../atrip.org::*Debug][Debug:1]] // [[file:~/cuda/atrip/atrip.org::*Debug][Debug:1]]
template <typename F=double> template <typename F=double>
std::ostream& operator<<(std::ostream& out, typename Slice<F>::Location const& v) { std::ostream& operator<<(std::ostream& out, typename Slice<F>::Location const& v) {
// TODO: remove me // TODO: remove me

View File

@ -12,16 +12,19 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
// [[file:../../atrip.org::*The slice union][The slice union:1]] // [[file:~/cuda/atrip/atrip.org::*Prolog][Prolog:1]]
#pragma once #pragma once
#include <atrip/Debug.hpp> #include <atrip/Debug.hpp>
#include <atrip/Slice.hpp> #include <atrip/Slice.hpp>
#include <atrip/RankMap.hpp> #include <atrip/RankMap.hpp>
namespace atrip { namespace atrip {
// Prolog:1 ends here
template <typename F=double> // [[file:~/cuda/atrip/atrip.org::*Prolog][Prolog:2]]
struct SliceUnion { template <typename F=double>
class SliceUnion {
public:
using Tensor = CTF::Tensor<F>; using Tensor = CTF::Tensor<F>;
virtual void virtual void
@ -191,7 +194,11 @@ namespace atrip {
: Slice<F>::Fetch : Slice<F>::Fetch
; ;
if (blank.info.state == Slice<F>::SelfSufficient) { if (blank.info.state == Slice<F>::SelfSufficient) {
#if defined(HAVE_CUDA)
blank.mpi_data = sources[from.source].data();
#else
blank.data = sources[from.source].data(); blank.data = sources[from.source].data();
#endif
} else { } else {
if (freePointers.size() == 0) { if (freePointers.size() == 0) {
std::stringstream stream; std::stringstream stream;
@ -345,8 +352,7 @@ namespace atrip {
} }
// CONSTRUCTOR // CONSTRUCTOR
SliceUnion( Tensor const& sourceTensor SliceUnion( std::vector<typename Slice<F>::Type> sliceTypes_
, std::vector<typename Slice<F>::Type> sliceTypes_
, std::vector<size_t> sliceLength_ , std::vector<size_t> sliceLength_
, std::vector<size_t> paramLength , std::vector<size_t> paramLength
, size_t np , size_t np
@ -366,12 +372,19 @@ namespace atrip {
1UL, std::multiplies<size_t>()))) 1UL, std::multiplies<size_t>())))
, name(name_) , name(name_)
, sliceTypes(sliceTypes_) , sliceTypes(sliceTypes_)
, sliceBuffers(nSliceBuffers, sources[0]) , sliceBuffers(nSliceBuffers)
//, slices(2 * sliceTypes.size(), Slice<F>{ sources[0].size() }) //, slices(2 * sliceTypes.size(), Slice<F>{ sources[0].size() })
{ // constructor begin { // constructor begin
LOG(0,"Atrip") << "INIT SliceUnion: " << name << "\n"; LOG(0,"Atrip") << "INIT SliceUnion: " << name << "\n";
for (auto& ptr: sliceBuffers)
#if defined(HAVE_CUDA)
cuMemAlloc(&ptr, sizeof(F) * sources[0].size());
#else
ptr = (DataPtr<F>)malloc(sizeof(F) * sources[0].size());
#endif
slices slices
= std::vector<Slice<F>>(2 * sliceTypes.size(), { sources[0].size() }); = std::vector<Slice<F>>(2 * sliceTypes.size(), { sources[0].size() });
// TODO: think exactly ^------------------- about this number // TODO: think exactly ^------------------- about this number
@ -379,7 +392,7 @@ namespace atrip {
// initialize the freePointers with the pointers to the buffers // initialize the freePointers with the pointers to the buffers
std::transform(sliceBuffers.begin(), sliceBuffers.end(), std::transform(sliceBuffers.begin(), sliceBuffers.end(),
std::inserter(freePointers, freePointers.begin()), std::inserter(freePointers, freePointers.begin()),
[](std::vector<F> &vec) { return vec.data(); }); [](DataPtr<F> ptr) { return ptr; });
@ -397,8 +410,6 @@ namespace atrip {
<< freePointers.size() << "\n"; << freePointers.size() << "\n";
LOG(1,"Atrip") << "#sliceBuffers " LOG(1,"Atrip") << "#sliceBuffers "
<< sliceBuffers.size() << "\n"; << sliceBuffers.size() << "\n";
LOG(1,"Atrip") << "#sliceBuffers[0] "
<< sliceBuffers[0].size() << "\n";
LOG(1,"Atrip") << "#sliceLength " LOG(1,"Atrip") << "#sliceLength "
<< sliceLength.size() << "\n"; << sliceLength.size() << "\n";
LOG(1,"Atrip") << "#paramLength " LOG(1,"Atrip") << "#paramLength "
@ -477,9 +488,12 @@ namespace atrip {
if (slice.info.state == Slice<F>::Fetch) { if (slice.info.state == Slice<F>::Fetch) {
// TODO: do it through the slice class // TODO: do it through the slice class
slice.info.state = Slice<F>::Dispatched; slice.info.state = Slice<F>::Dispatched;
MPI_Request request; #if defined(HAVE_CUDA)
slice.request = request; slice.mpi_data = (F*)malloc(sizeof(F) * slice.size);
MPI_Irecv( slice.mpi_data
#else
MPI_Irecv( slice.data MPI_Irecv( slice.data
#endif
, slice.size , slice.size
, traits::mpi::datatypeOf<F>() , traits::mpi::datatypeOf<F>()
, info.from.rank , info.from.rank
@ -495,7 +509,7 @@ namespace atrip {
for (auto type: sliceTypes) unwrapSlice(type, abc); for (auto type: sliceTypes) unwrapSlice(type, abc);
} }
F* unwrapSlice(typename Slice<F>::Type type, ABCTuple const& abc) { DataPtr<F> unwrapSlice(typename Slice<F>::Type type, ABCTuple const& abc) {
WITH_CRAZY_DEBUG WITH_CRAZY_DEBUG
WITH_RANK << "__unwrap__:slice " << type << " w n " WITH_RANK << "__unwrap__:slice " << type << " w n "
<< name << name
@ -539,6 +553,15 @@ namespace atrip {
return slice.data; return slice.data;
} }
~SliceUnion() {
for (auto& ptr: sliceBuffers)
#if defined(HAVE_CUDA)
cuMemFree(ptr);
#else
std::free(ptr);
#endif
}
const RankMap<F> rankMap; const RankMap<F> rankMap;
const MPI_Comm world; const MPI_Comm world;
const MPI_Comm universe; const MPI_Comm universe;
@ -547,8 +570,8 @@ namespace atrip {
std::vector< Slice<F> > slices; std::vector< Slice<F> > slices;
typename Slice<F>::Name name; typename Slice<F>::Name name;
const std::vector<typename Slice<F>::Type> sliceTypes; const std::vector<typename Slice<F>::Type> sliceTypes;
std::vector< std::vector<F> > sliceBuffers; std::vector< DataPtr<F> > sliceBuffers;
std::set<F*> freePointers; std::set< DataPtr<F> > freePointers;
}; };
@ -568,6 +591,8 @@ namespace atrip {
} }
return **sliceUnionIt; return **sliceUnionIt;
} }
// Prolog:2 ends here
// [[file:~/cuda/atrip/atrip.org::*Epilog][Epilog:1]]
} }
// The slice union:1 ends here // Epilog:1 ends here

View File

@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
// [[file:../../atrip.org::*Prolog][Prolog:1]] // [[file:~/cuda/atrip/atrip.org::*Prolog][Prolog:1]]
#pragma once #pragma once
#include <vector> #include <vector>
@ -35,7 +35,7 @@
namespace atrip { namespace atrip {
// Prolog:1 ends here // Prolog:1 ends here
// [[file:../../atrip.org::*Tuples types][Tuples types:1]] // [[file:~/cuda/atrip/atrip.org::*Tuples%20types][Tuples types:1]]
using ABCTuple = std::array<size_t, 3>; using ABCTuple = std::array<size_t, 3>;
using PartialTuple = std::array<size_t, 2>; using PartialTuple = std::array<size_t, 2>;
using ABCTuples = std::vector<ABCTuple>; using ABCTuples = std::vector<ABCTuple>;
@ -44,23 +44,22 @@ constexpr ABCTuple FAKE_TUPLE = {0, 0, 0};
constexpr ABCTuple INVALID_TUPLE = {1, 1, 1}; constexpr ABCTuple INVALID_TUPLE = {1, 1, 1};
// Tuples types:1 ends here // Tuples types:1 ends here
// [[file:../../atrip.org::*Distributing the tuples][Distributing the tuples:1]] // [[file:~/cuda/atrip/atrip.org::*Distributing%20the%20tuples][Distributing the tuples:1]]
struct TuplesDistribution { struct TuplesDistribution {
virtual ABCTuples getTuples(size_t Nv, MPI_Comm universe) = 0; virtual ABCTuples getTuples(size_t Nv, MPI_Comm universe) = 0;
virtual bool tupleIsFake(ABCTuple const& t) { return t == FAKE_TUPLE; } virtual bool tupleIsFake(ABCTuple const& t) { return t == FAKE_TUPLE; }
}; };
// Distributing the tuples:1 ends here // Distributing the tuples:1 ends here
// [[file:../../atrip.org::*Node information][Node information:1]] // [[file:~/cuda/atrip/atrip.org::*Node%20information][Node information:1]]
std::vector<std::string> getNodeNames(MPI_Comm comm){ std::vector<std::string> getNodeNames(MPI_Comm comm){
int rank, np; int rank, np;
MPI_Comm_rank(comm, &rank); MPI_Comm_rank(comm, &rank);
MPI_Comm_size(comm, &np); MPI_Comm_size(comm, &np);
std::vector<std::string> nodeList(np); std::vector<std::string> nodeList(np);
char nodeName[MPI_MAX_PROCESSOR_NAME] char nodeName[MPI_MAX_PROCESSOR_NAME];
, nodeNames[np*MPI_MAX_PROCESSOR_NAME] char *nodeNames = (char*)malloc(np * MPI_MAX_PROCESSOR_NAME);
;
std::vector<int> nameLengths(np) std::vector<int> nameLengths(np)
, off(np) , off(np)
; ;
@ -87,11 +86,12 @@ std::vector<std::string> getNodeNames(MPI_Comm comm){
std::string const s(&nodeNames[off[i]], nameLengths[i]); std::string const s(&nodeNames[off[i]], nameLengths[i]);
nodeList[i] = s; nodeList[i] = s;
} }
std::free(nodeNames);
return nodeList; return nodeList;
} }
// Node information:1 ends here // Node information:1 ends here
// [[file:../../atrip.org::*Node information][Node information:2]] // [[file:~/cuda/atrip/atrip.org::*Node%20information][Node information:2]]
struct RankInfo { struct RankInfo {
const std::string name; const std::string name;
const size_t nodeId; const size_t nodeId;
@ -154,7 +154,7 @@ getClusterInfo(MPI_Comm comm) {
} }
// Node information:2 ends here // Node information:2 ends here
// [[file:../../atrip.org::*Naive list][Naive list:1]] // [[file:~/cuda/atrip/atrip.org::*Naive%20list][Naive list:1]]
ABCTuples getTuplesList(size_t Nv, size_t rank, size_t np) { ABCTuples getTuplesList(size_t Nv, size_t rank, size_t np) {
const size_t const size_t
@ -188,7 +188,7 @@ ABCTuples getTuplesList(size_t Nv, size_t rank, size_t np) {
} }
// Naive list:1 ends here // Naive list:1 ends here
// [[file:../../atrip.org::*Naive list][Naive list:2]] // [[file:~/cuda/atrip/atrip.org::*Naive%20list][Naive list:2]]
ABCTuples getAllTuplesList(const size_t Nv) { ABCTuples getAllTuplesList(const size_t Nv) {
const size_t n = Nv * (Nv + 1) * (Nv + 2) / 6 - Nv; const size_t n = Nv * (Nv + 1) * (Nv + 2) / 6 - Nv;
ABCTuples result(n); ABCTuples result(n);
@ -204,7 +204,7 @@ ABCTuples getAllTuplesList(const size_t Nv) {
} }
// Naive list:2 ends here // Naive list:2 ends here
// [[file:../../atrip.org::*Naive list][Naive list:3]] // [[file:~/cuda/atrip/atrip.org::*Naive%20list][Naive list:3]]
struct NaiveDistribution : public TuplesDistribution { struct NaiveDistribution : public TuplesDistribution {
ABCTuples getTuples(size_t Nv, MPI_Comm universe) override { ABCTuples getTuples(size_t Nv, MPI_Comm universe) override {
int rank, np; int rank, np;
@ -215,11 +215,11 @@ struct NaiveDistribution : public TuplesDistribution {
}; };
// Naive list:3 ends here // Naive list:3 ends here
// [[file:../../atrip.org::*Prolog][Prolog:1]] // [[file:~/cuda/atrip/atrip.org::*Prolog][Prolog:1]]
namespace group_and_sort { namespace group_and_sort {
// Prolog:1 ends here // Prolog:1 ends here
// [[file:../../atrip.org::*Utils][Utils:1]] // [[file:~/cuda/atrip/atrip.org::*Utils][Utils:1]]
// Provides the node on which the slice-element is found // Provides the node on which the slice-element is found
// Right now we distribute the slices in a round robin fashion // Right now we distribute the slices in a round robin fashion
// over the different nodes (NOTE: not mpi ranks but nodes) // over the different nodes (NOTE: not mpi ranks but nodes)
@ -244,7 +244,7 @@ struct Info {
}; };
// Utils:1 ends here // Utils:1 ends here
// [[file:../../atrip.org::*Distribution][Distribution:1]] // [[file:~/cuda/atrip/atrip.org::*Distribution][Distribution:1]]
ABCTuples specialDistribution(Info const& info, ABCTuples const& allTuples) { ABCTuples specialDistribution(Info const& info, ABCTuples const& allTuples) {
ABCTuples nodeTuples; ABCTuples nodeTuples;
@ -426,7 +426,7 @@ ABCTuples specialDistribution(Info const& info, ABCTuples const& allTuples) {
} }
// Distribution:1 ends here // Distribution:1 ends here
// [[file:../../atrip.org::*Main][Main:1]] // [[file:~/cuda/atrip/atrip.org::*Main][Main:1]]
std::vector<ABCTuple> main(MPI_Comm universe, size_t Nv) { std::vector<ABCTuple> main(MPI_Comm universe, size_t Nv) {
int rank, np; int rank, np;
@ -466,7 +466,7 @@ std::vector<ABCTuple> main(MPI_Comm universe, size_t Nv) {
MPI_Comm_split(universe, color, key, &INTRA_COMM); MPI_Comm_split(universe, color, key, &INTRA_COMM);
// Main:1 ends here // Main:1 ends here
// [[file:../../atrip.org::*Main][Main:2]] // [[file:~/cuda/atrip/atrip.org::*Main][Main:2]]
size_t const size_t const
tuplesPerRankLocal tuplesPerRankLocal
= nodeTuples.size() / nodeInfos[rank].ranksPerNode = nodeTuples.size() / nodeInfos[rank].ranksPerNode
@ -494,7 +494,7 @@ LOG(1,"Atrip") << "ranks per node " << nodeInfos[rank].ranksPerNode << "\n";
LOG(1,"Atrip") << "#nodes " << nNodes << "\n"; LOG(1,"Atrip") << "#nodes " << nNodes << "\n";
// Main:2 ends here // Main:2 ends here
// [[file:../../atrip.org::*Main][Main:3]] // [[file:~/cuda/atrip/atrip.org::*Main][Main:3]]
size_t const totalTuples size_t const totalTuples
= tuplesPerRankGlobal * nodeInfos[rank].ranksPerNode; = tuplesPerRankGlobal * nodeInfos[rank].ranksPerNode;
@ -506,7 +506,7 @@ if (computeDistribution) {
} }
// Main:3 ends here // Main:3 ends here
// [[file:../../atrip.org::*Main][Main:4]] // [[file:~/cuda/atrip/atrip.org::*Main][Main:4]]
{ {
// construct mpi type for abctuple // construct mpi type for abctuple
MPI_Datatype MPI_ABCTUPLE; MPI_Datatype MPI_ABCTUPLE;
@ -530,13 +530,13 @@ if (computeDistribution) {
} }
// Main:4 ends here // Main:4 ends here
// [[file:../../atrip.org::*Main][Main:5]] // [[file:~/cuda/atrip/atrip.org::*Main][Main:5]]
return result; return result;
} }
// Main:5 ends here // Main:5 ends here
// [[file:../../atrip.org::*Interface][Interface:1]] // [[file:~/cuda/atrip/atrip.org::*Interface][Interface:1]]
struct Distribution : public TuplesDistribution { struct Distribution : public TuplesDistribution {
ABCTuples getTuples(size_t Nv, MPI_Comm universe) override { ABCTuples getTuples(size_t Nv, MPI_Comm universe) override {
return main(universe, Nv); return main(universe, Nv);
@ -544,10 +544,10 @@ struct Distribution : public TuplesDistribution {
}; };
// Interface:1 ends here // Interface:1 ends here
// [[file:../../atrip.org::*Epilog][Epilog:1]] // [[file:~/cuda/atrip/atrip.org::*Epilog][Epilog:1]]
} // namespace group_and_sort } // namespace group_and_sort
// Epilog:1 ends here // Epilog:1 ends here
// [[file:../../atrip.org::*Epilog][Epilog:1]] // [[file:~/cuda/atrip/atrip.org::*Epilog][Epilog:1]]
} }
// Epilog:1 ends here // Epilog:1 ends here

60
include/atrip/Types.hpp Normal file
View File

@ -0,0 +1,60 @@
// Copyright 2022 Alejandro Gallo
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// [[file:~/cuda/atrip/atrip.org::*Data%20pointer][Data pointer:1]]
#pragma once
#include <atrip/Complex.hpp>
#include <atrip/Atrip.hpp>
namespace atrip {
template <typename F>
struct DataField;
template <>
struct DataField<double> {
using type = double;
};
#if defined(HAVE_CUDA)
template <typename F>
using DataPtr = CUdeviceptr;
#define DataNullPtr 0x00
template <>
struct DataField<Complex> {
using type = cuDoubleComplex;
};
#else
template <typename F>
using DataPtr = F*;
#define DataNullPtr nullptr
template <>
struct DataField<Complex> {
using type = Complex;
};
#endif
template <typename F>
using DataFieldType = typename DataField<F>::type;
}
// Data pointer:1 ends here

View File

@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
// [[file:../../atrip.org::*Unions][Unions:1]] // [[file:~/cuda/atrip/atrip.org::*Unions][Unions:1]]
#pragma once #pragma once
#include <atrip/SliceUnion.hpp> #include <atrip/SliceUnion.hpp>
@ -65,8 +65,7 @@ namespace atrip {
, size_t np , size_t np
, MPI_Comm child_world , MPI_Comm child_world
, MPI_Comm global_world , MPI_Comm global_world
) : SliceUnion<F>( sourceTensor ) : SliceUnion<F>( {Slice<F>::A, Slice<F>::B, Slice<F>::C}
, {Slice<F>::A, Slice<F>::B, Slice<F>::C}
, {Nv, No, No} // size of the slices , {Nv, No, No} // size of the slices
, {Nv} , {Nv}
, np , np
@ -103,8 +102,7 @@ namespace atrip {
, size_t np , size_t np
, MPI_Comm child_world , MPI_Comm child_world
, MPI_Comm global_world , MPI_Comm global_world
) : SliceUnion<F>( sourceTensor ) : SliceUnion<F>( {Slice<F>::A, Slice<F>::B, Slice<F>::C}
, {Slice<F>::A, Slice<F>::B, Slice<F>::C}
, {No, No, No} // size of the slices , {No, No, No} // size of the slices
, {Nv} // size of the parametrization , {Nv} // size of the parametrization
, np , np
@ -138,8 +136,7 @@ namespace atrip {
, size_t np , size_t np
, MPI_Comm child_world , MPI_Comm child_world
, MPI_Comm global_world , MPI_Comm global_world
) : SliceUnion<F>( sourceTensor ) : SliceUnion<F>( { Slice<F>::AB, Slice<F>::BC, Slice<F>::AC
, { Slice<F>::AB, Slice<F>::BC, Slice<F>::AC
, Slice<F>::BA, Slice<F>::CB, Slice<F>::CA , Slice<F>::BA, Slice<F>::CB, Slice<F>::CA
} }
, {Nv, No} // size of the slices , {Nv, No} // size of the slices
@ -179,8 +176,7 @@ namespace atrip {
, size_t np , size_t np
, MPI_Comm child_world , MPI_Comm child_world
, MPI_Comm global_world , MPI_Comm global_world
) : SliceUnion<F>( sourceTensor ) : SliceUnion<F>( {Slice<F>::AB, Slice<F>::BC, Slice<F>::AC}
, {Slice<F>::AB, Slice<F>::BC, Slice<F>::AC}
, {No, No} // size of the slices , {No, No} // size of the slices
, {Nv, Nv} // size of the parametrization , {Nv, Nv} // size of the parametrization
, np , np
@ -219,8 +215,7 @@ namespace atrip {
, size_t np , size_t np
, MPI_Comm child_world , MPI_Comm child_world
, MPI_Comm global_world , MPI_Comm global_world
) : SliceUnion<F>( sourceTensor ) : SliceUnion<F>( {Slice<F>::AB, Slice<F>::BC, Slice<F>::AC}
, {Slice<F>::AB, Slice<F>::BC, Slice<F>::AC}
, {No, No} // size of the slices , {No, No} // size of the slices
, {Nv, Nv} // size of the parametrization , {Nv, Nv} // size of the parametrization
, np , np

View File

@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
// [[file:../../atrip.org::*Prolog][Prolog:1]] // [[file:~/cuda/atrip/atrip.org::*Prolog][Prolog:1]]
#pragma once #pragma once
#include <sstream> #include <sstream>
#include <string> #include <string>
@ -29,11 +29,14 @@
#include <atrip/Debug.hpp> #include <atrip/Debug.hpp>
namespace atrip { namespace atrip {
// Prolog:1 ends here // Prolog:1 ends here
// [[file:../../atrip.org::*Pretty printing][Pretty printing:1]] // [[file:~/cuda/atrip/atrip.org::*Pretty%20printing][Pretty printing:1]]
template <typename T> #pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
template <typename T>
std::string pretty_print(T&& value) { std::string pretty_print(T&& value) {
std::stringstream stream; std::stringstream stream;
#if ATRIP_DEBUG > 2 #if ATRIP_DEBUG > 2
@ -41,9 +44,10 @@ template <typename T>
#endif #endif
return stream.str(); return stream.str();
} }
#pragma GCC diagnostic pop
// Pretty printing:1 ends here // Pretty printing:1 ends here
// [[file:../../atrip.org::*Chrono][Chrono:1]] // [[file:~/cuda/atrip/atrip.org::*Chrono][Chrono:1]]
#define WITH_CHRONO(__chrono_name, ...) \ #define WITH_CHRONO(__chrono_name, ...) \
Atrip::chrono[__chrono_name].start(); \ Atrip::chrono[__chrono_name].start(); \
__VA_ARGS__ \ __VA_ARGS__ \
@ -62,6 +66,6 @@ struct Timer {
using Timings = std::map<std::string, Timer>; using Timings = std::map<std::string, Timer>;
// Chrono:1 ends here // Chrono:1 ends here
// [[file:../../atrip.org::*Epilog][Epilog:1]] // [[file:~/cuda/atrip/atrip.org::*Epilog][Epilog:1]]
} }
// Epilog:1 ends here // Epilog:1 ends here

View File

@ -2,26 +2,25 @@ AUTOMAKE_OPTIONS = subdir-objects
include $(top_srcdir)/atrip.mk include $(top_srcdir)/atrip.mk
AM_CXXFLAGS = $(CTF_CPPFLAGS) -fmax-errors=1 AM_CXXFLAGS = $(CTF_CPPFLAGS) -fmax-errors=1
AM_CPPFLAGS = $(CTF_CPPFLAGS)
lib_LIBRARIES = libatrip.a lib_LIBRARIES = libatrip.a
libatrip_a_CPPFLAGS = -I$(top_srcdir)/include/ libatrip_a_CPPFLAGS = -I$(top_srcdir)/include/
libatrip_a_SOURCES = ./atrip/Atrip.cxx \ libatrip_a_SOURCES = ./atrip/Blas.cxx
./atrip/Blas.cxx NVCC_FILES = ./atrip/Equations.cxx ./atrip/Complex.cxx ./atrip/Atrip.cxx
if WITH_CUDA if WITH_CUDA
NVCC_FILES = ./atrip/Equations.cxx ./atrip/Complex.cxx
NVCC_OBJS = $(patsubst %.cxx,%.nvcc.o,$(NVCC_FILES)) NVCC_OBJS = $(patsubst %.cxx,%.nvcc.o,$(NVCC_FILES))
libatrip_a_CPPFLAGS += $(CUDA_CXXFLAGS) libatrip_a_CPPFLAGS += $(CUDA_CXXFLAGS)
libatrip_a_DEPENDENCIES = $(NVCC_OBJS) libatrip_a_DEPENDENCIES = $(NVCC_OBJS)
libatrip_a_LIBADD = $(NVCC_OBJS) libatrip_a_LIBADD = $(NVCC_OBJS)
%.nvcc.o: %.cxx %.nvcc.o: %.cxx
$(NVCC) -c -I../ $(CPPFLAGS) $(libatrip_a_CPPFLAGS) $< -o $@ $(NVCC) -c -x cu -I../ $(CPPFLAGS) $(CTF_CPPFLAGS) $(DEFS) $(libatrip_a_CPPFLAGS) $< -o $@
#./atrip/Equations.o: ./atrip/Equations.cxx #./atrip/Equations.o: ./atrip/Equations.cxx
# $(NVCC) -c -I../ $(CPPFLAGS) $(libatrip_a_CPPFLAGS) $< -o $@ # $(NVCC) -c -I../ $(CPPFLAGS) $(libatrip_a_CPPFLAGS) $< -o $@
else else
libatrip_a_SOURCES += ./atrip/Equations.cxx libatrip_a_SOURCES += $(NVCC_FILES)
endif endif

View File

@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
// [[file:../../atrip.org::*Main][Main:1]] // [[file:~/cuda/atrip/atrip.org::*Main][Main:1]]
#include <iomanip> #include <iomanip>
#include <atrip/Atrip.hpp> #include <atrip/Atrip.hpp>
@ -23,12 +23,24 @@
#include <atrip/Checkpoint.hpp> #include <atrip/Checkpoint.hpp>
using namespace atrip; using namespace atrip;
#if defined(HAVE_CUDA)
namespace atrip {
namespace cuda {
};
};
#endif
template <typename F> bool RankMap<F>::RANK_ROUND_ROBIN; template <typename F> bool RankMap<F>::RANK_ROUND_ROBIN;
template bool RankMap<double>::RANK_ROUND_ROBIN; template bool RankMap<double>::RANK_ROUND_ROBIN;
template bool RankMap<Complex>::RANK_ROUND_ROBIN; template bool RankMap<Complex>::RANK_ROUND_ROBIN;
int Atrip::rank; size_t Atrip::rank;
int Atrip::np; size_t Atrip::np;
#if defined(HAVE_CUDA)
typename Atrip::CudaContext Atrip::cuda;
#endif
MPI_Comm Atrip::communicator; MPI_Comm Atrip::communicator;
Timings Atrip::chrono; Timings Atrip::chrono;
@ -40,15 +52,20 @@ void atrip::registerIterationDescriptor(IterationDescriptor d) {
void Atrip::init(MPI_Comm world) { void Atrip::init(MPI_Comm world) {
Atrip::communicator = world; Atrip::communicator = world;
MPI_Comm_rank(world, &Atrip::rank); MPI_Comm_rank(world, (int*)&Atrip::rank);
MPI_Comm_size(world, &Atrip::np); MPI_Comm_size(world, (int*)&Atrip::np);
#if defined(HAVE_CUDA)
Atrip::cuda.status = cublasCreate(&Atrip::cuda.handle);
#endif
} }
template <typename F> template <typename F>
Atrip::Output Atrip::run(Atrip::Input<F> const& in) { Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
const int np = Atrip::np; const size_t np = Atrip::np;
const int rank = Atrip::rank; const size_t rank = Atrip::rank;
MPI_Comm universe = Atrip::communicator; MPI_Comm universe = Atrip::communicator;
const size_t No = in.ei->lens[0]; const size_t No = in.ei->lens[0];
@ -58,18 +75,35 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
LOG(0,"Atrip") << "np: " << np << "\n"; LOG(0,"Atrip") << "np: " << np << "\n";
// allocate the three scratches, see piecuch // allocate the three scratches, see piecuch
std::vector<F> Tijk(No*No*No) // doubles only (see piecuch)
, Zijk(No*No*No) // singles + doubles (see piecuch)
// we need local copies of the following tensors on every // we need local copies of the following tensors on every
// rank // rank
, epsi(No) std::vector<F> _epsi(No)
, epsa(Nv) , _epsa(Nv)
, Tai(No * Nv) , _Tai(No * Nv)
; ;
in.ei->read_all(epsi.data()); in.ei->read_all(_epsi.data());
in.ea->read_all(epsa.data()); in.ea->read_all(_epsa.data());
in.Tph->read_all(Tai.data()); in.Tph->read_all(_Tai.data());
#if defined(HAVE_CUDA)
DataPtr<F> Tai, epsi, epsa;
cuMemAlloc(&Tai, sizeof(F) * _Tai.size());
cuMemAlloc(&epsi, sizeof(F) * _epsi.size());
cuMemAlloc(&epsa, sizeof(F) * _epsa.size());
cuMemcpyHtoD(Tai, (void*)_Tai.data(), sizeof(F) * _Tai.size());
cuMemcpyHtoD(epsi,(void*)_epsi.data(), sizeof(F) * _epsi.size());
cuMemcpyHtoD(epsa, (void*)_epsa.data(), sizeof(F) * _epsa.size());
DataPtr<F> Tijk, Zijk;
//TODO: free memory
cuMemAlloc(&Tijk, sizeof(F) * No * No * No);
cuMemAlloc(&Zijk, sizeof(F) * No * No * No);
#else
std::vector<F> &Tai = _Tai, &epsi = _epsi, &epsa = _epsa;
std::vector<F> Tijk(No*No*No), Zijk(No*No*No);
#endif
RankMap<F>::RANK_ROUND_ROBIN = in.rankRoundRobin; RankMap<F>::RANK_ROUND_ROBIN = in.rankRoundRobin;
if (RankMap<F>::RANK_ROUND_ROBIN) { if (RankMap<F>::RANK_ROUND_ROBIN) {
@ -139,7 +173,6 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
) )
const size_t nIterations = tuplesList.size(); const size_t nIterations = tuplesList.size();
{ {
const size_t _all_tuples = Nv * (Nv + 1) * (Nv + 2) / 6 - Nv;
LOG(0,"Atrip") << "#iterations: " LOG(0,"Atrip") << "#iterations: "
<< nIterations << nIterations
<< "/" << "/"
@ -338,6 +371,9 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
} }
} }
LOG(0, "AtripCUDA") << "Starting iterations\n";
for ( size_t for ( size_t
i = first_iteration, i = first_iteration,
iteration = first_iteration + 1 iteration = first_iteration + 1
@ -346,6 +382,8 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
) { ) {
Atrip::chrono["iterations"].start(); Atrip::chrono["iterations"].start();
LOG(0, "AtripCUDA") << "iteration " << i << "\n";
// check overhead from chrono over all iterations // check overhead from chrono over all iterations
WITH_CHRONO("start:stop", {}) WITH_CHRONO("start:stop", {})
@ -355,8 +393,10 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
if (in.barrier) MPI_Barrier(universe); if (in.barrier) MPI_Barrier(universe);
)) ))
// write checkpoints // write checkpoints
if (iteration % checkpoint_mod == 0) { if (iteration % checkpoint_mod == 0 && false) {
LOG(0, "AtripCUDA") << "checkpoints \n";
double globalEnergy = 0; double globalEnergy = 0;
MPI_Reduce(&energy, &globalEnergy, 1, MPI_DOUBLE, MPI_SUM, 0, universe); MPI_Reduce(&energy, &globalEnergy, 1, MPI_DOUBLE, MPI_SUM, 0, universe);
Checkpoint out Checkpoint out
@ -368,9 +408,10 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
iteration - 1, iteration - 1,
in.rankRoundRobin}; in.rankRoundRobin};
LOG(0, "Atrip") << "Writing checkpoint\n"; LOG(0, "Atrip") << "Writing checkpoint\n";
if (Atrip::rank == 0) write_checkpoint(out, in.checkpointPath); //if (Atrip::rank == 0) write_checkpoint(out, in.checkpointPath);
} }
LOG(0, "AtripCUDA") << "reporting \n";
// write reporting // write reporting
if (iteration % iterationMod == 0 || iteration == iteration1Percent) { if (iteration % iterationMod == 0 || iteration == iteration1Percent) {
@ -417,9 +458,11 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
<< "\n"; << "\n";
) )
LOG(0, "AtripCUDA") << "first database " << i << "\n";
// COMM FIRST DATABASE ================================================{{{1 // COMM FIRST DATABASE ================================================{{{1
if (i == first_iteration) { if (i == first_iteration) {
LOG(0, "AtripCUDA") << "first database " << i << "\n";
WITH_RANK << "__first__:first database ............ \n"; WITH_RANK << "__first__:first database ............ \n";
const auto db = communicateDatabase(abc, universe); const auto db = communicateDatabase(abc, universe);
WITH_RANK << "__first__:first database communicated \n"; WITH_RANK << "__first__:first database communicated \n";
@ -432,6 +475,8 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
MPI_Barrier(universe); MPI_Barrier(universe);
} }
LOG(0, "AtripCUDA") << "next database" << i << "\n";
// COMM NEXT DATABASE ================================================={{{1 // COMM NEXT DATABASE ================================================={{{1
if (abcNext) { if (abcNext) {
WITH_RANK << "__comm__:" << iteration << "th communicating database\n"; WITH_RANK << "__comm__:" << iteration << "th communicating database\n";
@ -447,6 +492,9 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
// COMPUTE DOUBLES ===================================================={{{1 // COMPUTE DOUBLES ===================================================={{{1
OCD_Barrier(universe); OCD_Barrier(universe);
if (!isFakeTuple(i)) { if (!isFakeTuple(i)) {
LOG(0, "AtripCUDA") << "computing doubles " << i << "\n";
WITH_RANK << iteration << "-th doubles\n"; WITH_RANK << iteration << "-th doubles\n";
WITH_CHRONO("oneshot-unwrap", WITH_CHRONO("oneshot-unwrap",
WITH_CHRONO("unwrap", WITH_CHRONO("unwrap",
@ -478,7 +526,11 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
, tabhh.unwrapSlice(Slice<F>::AC, abc) , tabhh.unwrapSlice(Slice<F>::AC, abc)
, tabhh.unwrapSlice(Slice<F>::BC, abc) , tabhh.unwrapSlice(Slice<F>::BC, abc)
// -- TIJK // -- TIJK
#if defined(HAVE_CUDA)
, (DataFieldType<F>*)Tijk
#else
, Tijk.data() , Tijk.data()
#endif
); );
WITH_RANK << iteration << "-th doubles done\n"; WITH_RANK << iteration << "-th doubles done\n";
)) ))
@ -493,16 +545,35 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
abhh.unwrapAll(abc); abhh.unwrapAll(abc);
))) )))
WITH_CHRONO("reorder", WITH_CHRONO("reorder",
for (size_t I(0); I < Zijk.size(); I++) Zijk[I] = Tijk[I]; LOG(0, "AtripCUDA") << "reorder singles" << i << "\n";
atrip::xcopy<F>(No*No*No,
#if defined(HAVE_CUDA)
(DataFieldType<F>*)Tijk, 1,
(DataFieldType<F>*)Zijk, 1);
#else
(DataFieldType<F>*)Tijk.data(), 1,
(DataFieldType<F>*)Zijk.data(), 1);
#endif
) )
WITH_CHRONO("singles", WITH_CHRONO("singles",
singlesContribution<F>( No, Nv, abc LOG(0, "AtripCUDA") << "doing singles" << i << "\n";
#if defined(HAVE_CUDA)
singlesContribution<F><<<1,1>>>( No, Nv, abc[0], abc[1], abc[2]
, (DataFieldType<F>*)Tai
#else
singlesContribution<F>( No, Nv, abc[0], abc[1], abc[2]
, Tai.data() , Tai.data()
, abhh.unwrapSlice(Slice<F>::AB, abc) #endif
, abhh.unwrapSlice(Slice<F>::AC, abc) , (DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::AB, abc)
, abhh.unwrapSlice(Slice<F>::BC, abc) , (DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::AC, abc)
, (DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::BC, abc)
#if defined(HAVE_CUDA)
, (DataFieldType<F>*)Zijk);
#else
, Zijk.data()); , Zijk.data());
#endif
) )
LOG(0, "AtripCUDA") << "singles done" << i << "\n";
} }
@ -513,13 +584,17 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
int distinct(0); int distinct(0);
if (abc[0] == abc[1]) distinct++; if (abc[0] == abc[1]) distinct++;
if (abc[1] == abc[2]) distinct--; if (abc[1] == abc[2]) distinct--;
const F epsabc(epsa[abc[0]] + epsa[abc[1]] + epsa[abc[2]]); const F epsabc(_epsa[abc[0]] + _epsa[abc[1]] + _epsa[abc[2]]);
LOG(0, "AtripCUDA") << "doing energy " << i << "distinct " << distinct << "\n";
WITH_CHRONO("energy", WITH_CHRONO("energy",
/*
TODO: think about how to do this on the GPU in the best way possible
if ( distinct == 0) if ( distinct == 0)
tupleEnergy = getEnergyDistinct<F>(epsabc, epsi, Tijk, Zijk); tupleEnergy = getEnergyDistinct<F>(epsabc, No, (F*)epsi, (F*)Tijk, (F*)Zijk);
else else
tupleEnergy = getEnergySame<F>(epsabc, epsi, Tijk, Zijk); tupleEnergy = getEnergySame<F>(epsabc, No, (F*)epsi, (F*)Tijk, (F*)Zijk);
*/
) )
#if defined(HAVE_OCD) || defined(ATRIP_PRINT_TUPLES) #if defined(HAVE_OCD) || defined(ATRIP_PRINT_TUPLES)

141
src/atrip/Blas.cxx Normal file
View File

@ -0,0 +1,141 @@
// Copyright 2022 Alejandro Gallo
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// [[file:~/cuda/atrip/atrip.org::*Blas][Blas:2]]
#include <atrip/Blas.hpp>
#include <atrip/Atrip.hpp>
#if defined(HAVE_CUDA)
# include <cstring>
static inline
cublasOperation_t char_to_cublasOperation(const char* trans) {
if (strncmp("N", trans, 1) == 0)
return CUBLAS_OP_N;
else if (strncmp("T", trans, 1) == 0)
return CUBLAS_OP_T;
else
return CUBLAS_OP_C;
}
#endif
namespace atrip {
template <>
void xgemm<double>(const char *transa,
const char *transb,
const int *m,
const int *n,
const int *k,
double *alpha,
const typename DataField<double>::type *A,
const int *lda,
const typename DataField<double>::type *B,
const int *ldb,
double *beta,
typename DataField<double>::type *C,
const int *ldc) {
#if defined(HAVE_CUDA)
cublasDgemm(Atrip::cuda.handle,
char_to_cublasOperation(transa),
char_to_cublasOperation(transb),
*m, *n, *k,
alpha, A, *lda,
B, *ldb, beta,
C, *ldc);
#else
dgemm_(transa, transb,
m, n, k,
alpha, A, lda,
B, ldb, beta,
C, ldc);
#endif
}
template <>
void xgemm<Complex>(const char *transa,
const char *transb,
const int *m,
const int *n,
const int *k,
Complex *alpha,
const typename DataField<Complex>::type *A,
const int *lda,
const typename DataField<Complex>::type *B,
const int *ldb,
Complex *beta,
typename DataField<Complex>::type *C,
const int *ldc) {
#if defined(HAVE_CUDA)
#pragma warning HAVE_CUDA
cuDoubleComplex
cu_alpha = {std::real(*alpha), std::imag(*alpha)},
cu_beta = {std::real(*beta), std::imag(*beta)};
cublasZgemm(Atrip::cuda.handle,
char_to_cublasOperation(transa),
char_to_cublasOperation(transb),
*m, *n, *k,
&cu_alpha,
A, *lda,
B, *ldb,
&cu_beta,
C, *ldc);
#else
zgemm_(transa, transb,
m, n, k,
alpha, A, lda,
B, ldb, beta,
C, ldc);
#endif
}
template <>
void xcopy<double>(const int n,
const DataFieldType<double>* x,
const int incx,
DataFieldType<double>* y,
const int incy) {
#if defined(HAVE_CUDA)
cublasDcopy(Atrip::cuda.handle,
n,
x, incx,
y, incy);
#else
dcopy_(n, x, incx, y, incy);
#endif
}
template <>
void xcopy<Complex>(const int n,
const DataFieldType<Complex>* x,
const int incx,
DataFieldType<Complex>* y,
const int incy) {
#if defined(HAVE_CUDA)
cublasZcopy(Atrip::cuda.handle,
n,
x, incx,
y, incy);
#else
zcopy_(n, x, incx, y, incy);
#endif
}
}
// Blas:2 ends here

40
src/atrip/Complex.cxx Normal file
View File

@ -0,0 +1,40 @@
// Copyright 2022 Alejandro Gallo
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// [[file:~/cuda/atrip/atrip.org::*Complex%20numbers][Complex numbers:2]]
#include <atrip/Complex.hpp>
#include <atrip/CUDA.hpp>
namespace atrip {
template <> double maybeConjugate(const double a) { return a; }
template <> Complex maybeConjugate(const Complex a) { return std::conj(a); }
#if defined(HAVE_CUDA)
#endif
namespace traits {
template <typename F> bool isComplex() { return false; }
template <> bool isComplex<double>() { return false; }
template <> bool isComplex<Complex>() { return true; }
namespace mpi {
template <> MPI_Datatype datatypeOf<double>() { return MPI_DOUBLE; }
template <> MPI_Datatype datatypeOf<Complex>() { return MPI_DOUBLE_COMPLEX; }
}
}
}
// Complex numbers:2 ends here

720
src/atrip/Equations.cxx Normal file
View File

@ -0,0 +1,720 @@
// Copyright 2022 Alejandro Gallo
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// [[file:~/cuda/atrip/atrip.org::*Prolog][Prolog:2]]
#include<atrip/Equations.hpp>
#if defined(HAVE_CUDA)
#include <cuda.h>
#endif
namespace atrip {
// Prolog:2 ends here
#ifdef HAVE_CUDA
namespace cuda {
// cuda kernels
template <typename F>
__global__
void zeroing(F* a, size_t n) {
F zero = {0};
for (size_t i = 0; i < n; i++) {
a[i] = zero;
}
}
////
template <typename F>
__device__
F maybeConjugateScalar(const F a);
template <>
__device__
double maybeConjugateScalar(const double a) { return a; }
template <>
__device__
cuDoubleComplex
maybeConjugateScalar(const cuDoubleComplex a) {
return {a.x, -a.y};
}
template <typename F>
__global__
void maybeConjugate(F* to, F* from, size_t n) {
for (size_t i = 0; i < n; ++i) {
to[i] = maybeConjugateScalar<F>(from[i]);
}
}
template <typename F>
__global__
void reorder(F* to, F* from, size_t size, size_t I, size_t J, size_t K) {
size_t idx = 0;
const size_t IDX = I + J*size + K*size*size;
for (size_t k = 0; k < size; k++)
for (size_t j = 0; j < size; j++)
for (size_t i = 0; i < size; i++, idx++)
to[idx] += from[IDX];
}
// I mean, really CUDA... really!?
template <typename F>
__device__
F multiply(const F &a, const F &b);
template <>
__device__
double multiply(const double &a, const double &b) { return a * b; }
template <>
__device__
cuDoubleComplex multiply(const cuDoubleComplex &a, const cuDoubleComplex &b) {
return
{a.x * b.x - a.y * b.y,
a.x * b.y + a.y * b.x};
}
template <typename F>
__device__
void sum_in_place(F* to, const F* b);
template <>
__device__
void sum_in_place(double* to, const double *b) { *to += *b; }
template <>
__device__
void sum_in_place(cuDoubleComplex* to, const cuDoubleComplex* b) {
to->x += b->x;
to->y += b->y;
}
__device__
cuDoubleComplex& operator+=(cuDoubleComplex& lz, cuDoubleComplex const& rz) {
lz.x += rz.x;
lz.y += rz.y;
return lz;
}
};
#endif
// [[file:~/cuda/atrip/atrip.org::*Energy][Energy:2]]
template <typename F>
double getEnergyDistinct
( F const epsabc
, size_t const No
, F* const epsi
, F* const Tijk
, F* const Zijk
) {
constexpr size_t blockSize=16;
F energy(0.);
for (size_t kk=0; kk<No; kk+=blockSize){
const size_t kend( std::min(No, kk+blockSize) );
for (size_t jj(kk); jj<No; jj+=blockSize){
const size_t jend( std::min( No, jj+blockSize) );
for (size_t ii(jj); ii<No; ii+=blockSize){
const size_t iend( std::min( No, ii+blockSize) );
for (size_t k(kk); k < kend; k++){
const F ek(epsi[k]);
const size_t jstart = jj > k ? jj : k;
for (size_t j(jstart); j < jend; j++){
F const ej(epsi[j]);
F const facjk = j == k ? F(0.5) : F(1.0);
size_t istart = ii > j ? ii : j;
for (size_t i(istart); i < iend; i++){
const F
ei(epsi[i])
, facij = i == j ? F(0.5) : F(1.0)
, denominator(epsabc - ei - ej - ek)
, U(Zijk[i + No*j + No*No*k])
, V(Zijk[i + No*k + No*No*j])
, W(Zijk[j + No*i + No*No*k])
, X(Zijk[j + No*k + No*No*i])
, Y(Zijk[k + No*i + No*No*j])
, Z(Zijk[k + No*j + No*No*i])
, A(maybeConjugate<F>(Tijk[i + No*j + No*No*k]))
, B(maybeConjugate<F>(Tijk[i + No*k + No*No*j]))
, C(maybeConjugate<F>(Tijk[j + No*i + No*No*k]))
, D(maybeConjugate<F>(Tijk[j + No*k + No*No*i]))
, E(maybeConjugate<F>(Tijk[k + No*i + No*No*j]))
, _F(maybeConjugate<F>(Tijk[k + No*j + No*No*i]))
, value
= 3.0 * ( A * U
+ B * V
+ C * W
+ D * X
+ E * Y
+ _F * Z )
+ ( ( U + X + Y )
- 2.0 * ( V + W + Z )
) * ( A + D + E )
+ ( ( V + W + Z )
- 2.0 * ( U + X + Y )
) * ( B + C + _F )
;
energy += 2.0 * value / denominator * facjk * facij;
} // i
} // j
} // k
} // ii
} // jj
} // kk
return std::real(energy);
}
template <typename F>
double getEnergySame
( F const epsabc
, size_t const No
, F* const epsi
, F* const Tijk
, F* const Zijk
) {
constexpr size_t blockSize = 16;
F energy = F(0.);
for (size_t kk=0; kk<No; kk+=blockSize){
const size_t kend( std::min( kk+blockSize, No) );
for (size_t jj(kk); jj<No; jj+=blockSize){
const size_t jend( std::min( jj+blockSize, No) );
for (size_t ii(jj); ii<No; ii+=blockSize){
const size_t iend( std::min( ii+blockSize, No) );
for (size_t k(kk); k < kend; k++){
const F ek(epsi[k]);
const size_t jstart = jj > k ? jj : k;
for(size_t j(jstart); j < jend; j++){
const F facjk( j == k ? F(0.5) : F(1.0));
const F ej(epsi[j]);
const size_t istart = ii > j ? ii : j;
for(size_t i(istart); i < iend; i++){
const F
ei(epsi[i])
, facij ( i==j ? F(0.5) : F(1.0))
, denominator(epsabc - ei - ej - ek)
, U(Zijk[i + No*j + No*No*k])
, V(Zijk[j + No*k + No*No*i])
, W(Zijk[k + No*i + No*No*j])
, A(maybeConjugate<F>(Tijk[i + No*j + No*No*k]))
, B(maybeConjugate<F>(Tijk[j + No*k + No*No*i]))
, C(maybeConjugate<F>(Tijk[k + No*i + No*No*j]))
, value
= F(3.0) * ( A * U
+ B * V
+ C * W
)
- ( A + B + C ) * ( U + V + W )
;
energy += F(2.0) * value / denominator * facjk * facij;
} // i
} // j
} // k
} // ii
} // jj
} // kk
return std::real(energy);
}
// Energy:2 ends here
// [[file:~/cuda/atrip/atrip.org::*Energy][Energy:3]]
// instantiate double
template
double getEnergyDistinct
( double const epsabc
, size_t const No
, double* const epsi
, double* const Tijk
, double* const Zijk
);
template
double getEnergySame
( double const epsabc
, size_t const No
, double* const epsi
, double* const Tijk
, double* const Zijk
);
// instantiate Complex
template
double getEnergyDistinct
( Complex const epsabc
, size_t const No
, Complex* const epsi
, Complex* const Tijk
, Complex* const Zijk
);
template
double getEnergySame
( Complex const epsabc
, size_t const No
, Complex* const epsi
, Complex* const Tijk
, Complex* const Zijk
);
// Energy:3 ends here
// [[file:~/cuda/atrip/atrip.org::*Singles%20contribution][Singles contribution:2]]
template <typename F>
#ifdef HAVE_CUDA
__global__
#endif
void singlesContribution
( size_t No
, size_t Nv
, size_t a
, size_t b
, size_t c
, DataFieldType<F>* const Tph
, DataFieldType<F>* const VABij
, DataFieldType<F>* const VACij
, DataFieldType<F>* const VBCij
, DataFieldType<F>* Zijk
) {
const size_t NoNo = No*No;
// TODO: change order of for loops
for (size_t k = 0; k < No; k++)
for (size_t i = 0; i < No; i++)
for (size_t j = 0; j < No; j++) {
const size_t ijk = i + j*No + k*No*No;
#ifdef HAVE_CUDA
# define GO(__TPH, __VABIJ) \
{ \
const DataFieldType<F> product \
= cuda::multiply<DataFieldType<F>>((__TPH), (__VABIJ)); \
cuda::sum_in_place<DataFieldType<F>>(&Zijk[ijk], &product); \
}
#else
# define GO(__TPH, __VABIJ) Zijk[ijk] += (__TPH) * (__VABIJ);
#endif
GO(Tph[ a + i * Nv ], VBCij[ j + k * No ])
GO(Tph[ b + j * Nv ], VACij[ i + k * No ])
GO(Tph[ c + k * Nv ], VABij[ i + j * No ])
#undef GO
} // for loop j
}
// instantiate
template
#ifdef HAVE_CUDA
__global__
#endif
void singlesContribution<double>( size_t No
, size_t Nv
, size_t a
, size_t b
, size_t c
, double* const Tph
, double* const VABij
, double* const VACij
, double* const VBCij
, double* Zijk
);
template
#ifdef HAVE_CUDA
__global__
#endif
void singlesContribution<Complex>( size_t No
, size_t Nv
, size_t a
, size_t b
, size_t c
, DataFieldType<Complex>* const Tph
, DataFieldType<Complex>* const VABij
, DataFieldType<Complex>* const VACij
, DataFieldType<Complex>* const VBCij
, DataFieldType<Complex>* Zijk
);
// Singles contribution:2 ends here
// [[file:~/cuda/atrip/atrip.org::*Doubles%20contribution][Doubles contribution:2]]
template <typename F>
void doublesContribution
( const ABCTuple &abc
, size_t const No
, size_t const Nv
// -- VABCI
, DataPtr<F> const VABph
, DataPtr<F> const VACph
, DataPtr<F> const VBCph
, DataPtr<F> const VBAph
, DataPtr<F> const VCAph
, DataPtr<F> const VCBph
// -- VHHHA
, DataPtr<F> const VhhhA
, DataPtr<F> const VhhhB
, DataPtr<F> const VhhhC
// -- TA
, DataPtr<F> const TAphh
, DataPtr<F> const TBphh
, DataPtr<F> const TCphh
// -- TABIJ
, DataPtr<F> const TABhh
, DataPtr<F> const TAChh
, DataPtr<F> const TBChh
// -- TIJK
// , DataPtr<F> Tijk_
, DataFieldType<F>* Tijk_
) {
const size_t a = abc[0], b = abc[1], c = abc[2]
, NoNo = No*No, NoNv = No*Nv
;
typename DataField<F>::type* Tijk = (typename DataField<F>::type*) Tijk_;
LOG(0, "AtripCUDA") << "in doubles " << "\n";
#if defined(ATRIP_USE_DGEMM)
#define _IJK_(i, j, k) i + j*No + k*NoNo
#if defined(HAVE_CUDA)
// TODO
#define REORDER(__II, __JJ, __KK)
#define __TO_DEVICEPTR(_v) (_v)
#define DGEMM_PARTICLES(__A, __B) \
atrip::xgemm<F>("T", \
"N", \
(int const*)&NoNo, \
(int const*)&No, \
(int const*)&Nv, \
&one, \
(DataFieldType<F>*)__A, \
(int const*)&Nv, \
(DataFieldType<F>*)__B, \
(int const*)&Nv, \
&zero, \
_t_buffer_p, \
(int const*)&NoNo);
#define DGEMM_HOLES(__A, __B, __TRANSB) \
atrip::xgemm<F>("N", \
__TRANSB, \
(int const*)&NoNo, \
(int const*)&No, \
(int const*)&No, \
&m_one, \
__TO_DEVICEPTR(__A), \
(int const*)&NoNo, \
(DataFieldType<F>*)__B, \
(int const*)&No, \
&zero, \
_t_buffer_p, \
(int const*)&NoNo \
);
#define MAYBE_CONJ(_conj, _buffer) \
cuda::maybeConjugate<<<1,1>>>((DataFieldType<F>*)_conj, (DataFieldType<F>*)_buffer, NoNoNo);
#else
#define REORDER(__II, __JJ, __KK) \
WITH_CHRONO("doubles:reorder", \
for (size_t k = 0; k < No; k++) \
for (size_t j = 0; j < No; j++) \
for (size_t i = 0; i < No; i++) { \
Tijk[_IJK_(i, j, k)] += _t_buffer_p[_IJK_(__II, __JJ, __KK)]; \
} \
)
#define __TO_DEVICEPTR(_v) (_v)
#define DGEMM_PARTICLES(__A, __B) \
atrip::xgemm<F>("T", \
"N", \
(int const*)&NoNo, \
(int const*)&No, \
(int const*)&Nv, \
&one, \
__A, \
(int const*)&Nv, \
__B, \
(int const*)&Nv, \
&zero, \
_t_buffer_p, \
(int const*)&NoNo \
);
#define DGEMM_HOLES(__A, __B, __TRANSB) \
atrip::xgemm<F>("N", \
__TRANSB, \
(int const*)&NoNo, \
(int const*)&No, \
(int const*)&No, \
&m_one, \
__A, \
(int const*)&NoNo, \
__B, \
(int const*)&No, \
&zero, \
_t_buffer_p, \
(int const*)&NoNo \
);
#define MAYBE_CONJ(_conj, _buffer) \
for (size_t __i = 0; __i < NoNoNo; ++__i) \
_conj[__i] = maybeConjugate<F>(_buffer[__i]);
#endif
F one{1.0}, m_one{-1.0}, zero{0.0};
DataFieldType<F> zero_h{0.0};
const size_t NoNoNo = No*NoNo;
#ifdef HAVE_CUDA
DataFieldType<F>* _t_buffer;
DataFieldType<F>* _vhhh;
LOG(0, "AtripCUDA") << "getting memory" << "\n";
cuMemAlloc((CUdeviceptr*)&_t_buffer, NoNoNo * sizeof(DataFieldType<F>));
cuMemAlloc((CUdeviceptr*)&_vhhh, NoNoNo * sizeof(DataFieldType<F>));
LOG(0, "AtripCUDA") << "cuda::zeroing " << "\n";
cuda::zeroing<<<1,1>>>((DataFieldType<F>*)_t_buffer, NoNoNo);
cuda::zeroing<<<1,1>>>((DataFieldType<F>*)_vhhh, NoNoNo);
#else
F* _t_buffer = (F*)malloc(NoNoNo * sizeof(F));
F* _vhhh = (F*)malloc(NoNoNo * sizeof(F));
for (size_t i=0; i < NoNoNo; i++) {
_t_buffer[i] = zero_h;
_vhhh[i] = zero_h;
}
#endif
//_t_buffer.reserve(NoNoNo);
DataFieldType<F>* _t_buffer_p = __TO_DEVICEPTR(_t_buffer);
#ifdef HAVE_CUDA
LOG(0, "AtripCUDA") << "cuda::zeroing Tijk" << "\n";
cuda::zeroing<<<1,1>>>((DataFieldType<F>*)Tijk, NoNoNo);
#else
WITH_CHRONO("double:reorder",
for (size_t k = 0; k < NoNoNo; k++) {
Tijk[k] = DataFieldType<F>{0.0};
})
#endif
LOG(0, "AtripCUDA") << "doing holes" << "\n";
// TOMERGE: replace chronos
WITH_CHRONO("doubles:holes",
{ // Holes part %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
// VhhhC[i + k*No + L*NoNo] * TABhh[L + j*No]; H1
LOG(0, "AtripCUDA") << "conj 1" << "\n";
MAYBE_CONJ(_vhhh, VhhhC)
LOG(0, "AtripCUDA") << "done" << "\n";
WITH_CHRONO("doubles:holes:1",
LOG(0, "AtripCUDA") << "dgemm 1" << "\n";
DGEMM_HOLES(_vhhh, TABhh, "N")
LOG(0, "AtripCUDA") << "reorder 1" << "\n";
REORDER(i, k, j)
)
// VhhhC[j + k*No + L*NoNo] * TABhh[i + L*No]; H0
WITH_CHRONO("doubles:holes:2",
LOG(0, "AtripCUDA") << "dgemm 2" << "\n";
DGEMM_HOLES(_vhhh, TABhh, "T")
REORDER(j, k, i)
)
// VhhhB[i + j*No + L*NoNo] * TAChh[L + k*No]; H5
LOG(0, "AtripCUDA") << "conj 2" << "\n";
MAYBE_CONJ(_vhhh, VhhhB)
LOG(0, "AtripCUDA") << "done" << "\n";
WITH_CHRONO("doubles:holes:3",
DGEMM_HOLES(_vhhh, TAChh, "N")
REORDER(i, j, k)
)
// VhhhB[k + j*No + L*NoNo] * TAChh[i + L*No]; H3
WITH_CHRONO("doubles:holes:4",
DGEMM_HOLES(_vhhh, TAChh, "T")
REORDER(k, j, i)
)
// VhhhA[j + i*No + L*NoNo] * TBChh[L + k*No]; H1
LOG(0, "AtripCUDA") << "conj 3" << "\n";
MAYBE_CONJ(_vhhh, VhhhA)
WITH_CHRONO("doubles:holes:5",
DGEMM_HOLES(_vhhh, TBChh, "N")
REORDER(j, i, k)
)
// VhhhA[k + i*No + L*NoNo] * TBChh[j + L*No]; H4
WITH_CHRONO("doubles:holes:6",
DGEMM_HOLES(_vhhh, TBChh, "T")
REORDER(k, i, j)
)
}
)
#undef MAYBE_CONJ
LOG(0, "AtripCUDA") << "doing particles" << "\n";
WITH_CHRONO("doubles:particles",
{ // Particle part %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
// TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv]; P0
WITH_CHRONO("doubles:particles:1",
DGEMM_PARTICLES(TAphh, VBCph)
REORDER(i, j, k)
)
// TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv]; P3
WITH_CHRONO("doubles:particles:2",
DGEMM_PARTICLES(TAphh, VCBph)
REORDER(i, k, j)
)
// TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv]; P5
WITH_CHRONO("doubles:particles:3",
DGEMM_PARTICLES(TCphh, VABph)
REORDER(k, i, j)
)
// TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv]; P2
WITH_CHRONO("doubles:particles:4",
DGEMM_PARTICLES(TCphh, VBAph)
REORDER(k, j, i)
)
// TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv]; P1
WITH_CHRONO("doubles:particles:5",
DGEMM_PARTICLES(TBphh, VACph)
REORDER(j, i, k)
)
// TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv]; P4
WITH_CHRONO("doubles:particles:6",
DGEMM_PARTICLES(TBphh, VCAph)
REORDER(j, k, i)
)
}
)
LOG(0, "AtripCUDA") << "particles done" << "\n";
{ // free resources
#ifdef HAVE_CUDA
LOG(0, "AtripCUDA") << "free mem" << "\n";
cuMemFree((CUdeviceptr)_vhhh);
cuMemFree((CUdeviceptr)_t_buffer);
LOG(0, "AtripCUDA") << "free mem done" << "\n";
#else
free(_vhhh);
free(_t_buffer);
#endif
}
#undef REORDER
#undef DGEMM_HOLES
#undef DGEMM_PARTICLES
#undef _IJK_
#else
for (size_t k = 0; k < No; k++)
for (size_t j = 0; j < No; j++)
for (size_t i = 0; i < No; i++){
const size_t ijk = i + j*No + k*NoNo
, jk = j + k*No
;
Tijk[ijk] = 0.0; // :important
// HOLE DIAGRAMS: TABHH and VHHHA
for (size_t L = 0; L < No; L++){
// t[abLj] * V[Lcik] H1
// t[baLi] * V[Lcjk] H0 TODO: conjugate T for complex
Tijk[ijk] -= TABhh[L + j*No] * VhhhC[i + k*No + L*NoNo];
Tijk[ijk] -= TABhh[i + L*No] * VhhhC[j + k*No + L*NoNo];
// t[acLk] * V[Lbij] H5
// t[caLi] * V[Lbkj] H3
Tijk[ijk] -= TAChh[L + k*No] * VhhhB[i + j*No + L*NoNo];
Tijk[ijk] -= TAChh[i + L*No] * VhhhB[k + j*No + L*NoNo];
// t[bcLk] * V[Laji] H2
// t[cbLj] * V[Laki] H4
Tijk[ijk] -= TBChh[L + k*No] * VhhhA[j + i*No + L*NoNo];
Tijk[ijk] -= TBChh[j + L*No] * VhhhA[k + i*No + L*NoNo];
}
// PARTILCE DIAGRAMS: TAPHH and VABPH
for (size_t E = 0; E < Nv; E++) {
// t[aEij] * V[bcEk] P0
// t[aEik] * V[cbEj] P3 // TODO: CHECK THIS ONE, I DONT KNOW
Tijk[ijk] += TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv];
Tijk[ijk] += TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv];
// t[cEki] * V[abEj] P5
// t[cEkj] * V[baEi] P2
Tijk[ijk] += TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv];
Tijk[ijk] += TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv];
// t[bEji] * V[acEk] P1
// t[bEjk] * V[caEi] P4
Tijk[ijk] += TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv];
Tijk[ijk] += TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv];
}
}
#endif
}
// instantiate templates
template
void doublesContribution<double>
( const ABCTuple &abc
, size_t const No
, size_t const Nv
// -- VABCI
, DataPtr<double> const VABph
, DataPtr<double> const VACph
, DataPtr<double> const VBCph
, DataPtr<double> const VBAph
, DataPtr<double> const VCAph
, DataPtr<double> const VCBph
// -- VHHHA
, DataPtr<double> const VhhhA
, DataPtr<double> const VhhhB
, DataPtr<double> const VhhhC
// -- TA
, DataPtr<double> const TAphh
, DataPtr<double> const TBphh
, DataPtr<double> const TCphh
// -- TABIJ
, DataPtr<double> const TABhh
, DataPtr<double> const TAChh
, DataPtr<double> const TBChh
// -- TIJK
, DataFieldType<double>* Tijk
);
template
void doublesContribution<Complex>
( const ABCTuple &abc
, size_t const No
, size_t const Nv
// -- VABCI
, DataPtr<Complex> const VABph
, DataPtr<Complex> const VACph
, DataPtr<Complex> const VBCph
, DataPtr<Complex> const VBAph
, DataPtr<Complex> const VCAph
, DataPtr<Complex> const VCBph
// -- VHHHA
, DataPtr<Complex> const VhhhA
, DataPtr<Complex> const VhhhB
, DataPtr<Complex> const VhhhC
// -- TA
, DataPtr<Complex> const TAphh
, DataPtr<Complex> const TBphh
, DataPtr<Complex> const TCphh
// -- TABIJ
, DataPtr<Complex> const TABhh
, DataPtr<Complex> const TAChh
, DataPtr<Complex> const TBChh
// -- TIJK
, DataFieldType<Complex>* Tijk
);
// Doubles contribution:2 ends here
// [[file:~/cuda/atrip/atrip.org::*Epilog][Epilog:2]]
}
// Epilog:2 ends here