Change the source files so that the CUDA build runs
This commit is contained in:
@@ -12,7 +12,7 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// [[file:../atrip.org::*Include header][Include header:1]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Include%20header][Include header:1]]
|
||||
#pragma once
|
||||
|
||||
#include <atrip/Atrip.hpp>
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// [[file:../../atrip.org::*Header][Header:1]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Header][Header:1]]
|
||||
#pragma once
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
@@ -41,12 +41,21 @@ namespace atrip {
|
||||
|
||||
struct Atrip {
|
||||
|
||||
static int rank;
|
||||
static int np;
|
||||
static size_t rank;
|
||||
static size_t np;
|
||||
static MPI_Comm communicator;
|
||||
static Timings chrono;
|
||||
// Only present when the build defines HAVE_CUDA.
#if defined(HAVE_CUDA)
|
||||
// Bundles a cuBLAS handle together with a cuBLAS status value.
// NOTE(review): presumably `status` holds the result of the most recent
// cuBLAS call made through `handle` -- confirm where these are assigned.
struct CudaContext {
|
||||
cublasStatus_t status;
|
||||
cublasHandle_t handle;
|
||||
};
|
||||
// Single static instance declared on the enclosing struct.
static CudaContext cuda;
|
||||
#endif
|
||||
|
||||
static void init(MPI_Comm);
|
||||
|
||||
|
||||
template <typename F=double>
|
||||
struct Input {
|
||||
CTF::Tensor<F> *ei = nullptr
|
||||
|
||||
@@ -12,12 +12,16 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// [[file:../../atrip.org::*Blas][Blas:1]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Blas][Blas:1]]
|
||||
#pragma once
|
||||
|
||||
#include <atrip/Complex.hpp>
|
||||
#include <atrip/Types.hpp>
|
||||
#include "config.h"
|
||||
|
||||
namespace atrip {
|
||||
|
||||
using Complex = std::complex<double>;
|
||||
|
||||
#if !defined(HAVE_CUDA)
|
||||
extern "C" {
|
||||
void dgemm_(
|
||||
const char *transa,
|
||||
@@ -50,49 +54,43 @@ namespace atrip {
|
||||
Complex *C,
|
||||
const int *ldc
|
||||
);
|
||||
|
||||
// Prototype for the double-precision BLAS copy routine.
// NOTE(review): n, incx and incy are declared BY VALUE here, whereas the
// other BLAS prototype visible in this file passes its integer arguments by
// pointer (e.g. `const int *ldc`). The reference Fortran BLAS xCOPY routines
// receive every argument by reference -- confirm this prototype against the
// BLAS library that is actually linked before calling it.
void dcopy_(const int n,
|
||||
const double *x,
|
||||
const int incx,
|
||||
double *y,
|
||||
const int incy);
|
||||
|
||||
// Prototype for the double-complex BLAS copy routine; x and y are untyped
// pointers here.
// NOTE(review): same by-value integer arguments as dcopy_ above -- confirm.
void zcopy_(const int n,
|
||||
const void *x,
|
||||
const int incx,
|
||||
void *y,
|
||||
const int incy);
|
||||
|
||||
|
||||
}
|
||||
#endif
|
||||
|
||||
// Declaration only: copy routine templated on the field type F, operating on
// DataFieldType<F> buffers. The definition and its specializations are not
// in this header.
// NOTE(review): presumably copies n elements from x (stride incx) to y
// (stride incy) like BLAS xCOPY -- confirm against the definition.
template <typename F>
|
||||
void xcopy(const int n,
|
||||
const DataFieldType<F>* x,
|
||||
const int incx,
|
||||
DataFieldType<F>* y,
|
||||
const int incy);
|
||||
|
||||
template <typename F=double>
|
||||
template <typename F>
|
||||
void xgemm(const char *transa,
|
||||
const char *transb,
|
||||
const int *m,
|
||||
const int *n,
|
||||
const int *k,
|
||||
F *alpha,
|
||||
const F *A,
|
||||
const DataFieldType<F> *A,
|
||||
const int *lda,
|
||||
const F *B,
|
||||
const DataFieldType<F> *B,
|
||||
const int *ldb,
|
||||
F *beta,
|
||||
F *C,
|
||||
const int *ldc) {
|
||||
dgemm_(transa, transb,
|
||||
m, n, k,
|
||||
alpha, A, lda,
|
||||
B, ldb, beta,
|
||||
C, ldc);
|
||||
}
|
||||
|
||||
template <>
|
||||
void xgemm(const char *transa,
|
||||
const char *transb,
|
||||
const int *m,
|
||||
const int *n,
|
||||
const int *k,
|
||||
Complex *alpha,
|
||||
const Complex *A,
|
||||
const int *lda,
|
||||
const Complex *B,
|
||||
const int *ldb,
|
||||
Complex *beta,
|
||||
Complex *C,
|
||||
const int *ldc) {
|
||||
zgemm_(transa, transb,
|
||||
m, n, k,
|
||||
alpha, A, lda,
|
||||
B, ldb, beta,
|
||||
C, ldc);
|
||||
}
|
||||
DataFieldType<F> *C,
|
||||
const int *ldc);
|
||||
}
|
||||
// Blas:1 ends here
|
||||
|
||||
9
include/atrip/CUDA.hpp
Normal file
9
include/atrip/CUDA.hpp
Normal file
@@ -0,0 +1,9 @@
|
||||
// Portable spellings of the CUDA function qualifiers.
#pragma once
|
||||
|
||||
// When the build has HAVE_CUDA defined AND the translation unit is being
// compiled by a compiler that defines __CUDACC__, the macros expand to the
// CUDA qualifiers.
#if defined(HAVE_CUDA) && defined(__CUDACC__)
|
||||
# define __MAYBE_GLOBAL__ __global__
|
||||
# define __MAYBE_DEVICE__ __device__
|
||||
// Otherwise they expand to nothing, so annotated declarations remain valid
// plain C++.
#else
|
||||
# define __MAYBE_GLOBAL__
|
||||
# define __MAYBE_DEVICE__
|
||||
#endif
|
||||
@@ -12,7 +12,7 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// [[file:../../atrip.org::*Prolog][Prolog:1]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Prolog][Prolog:1]]
|
||||
#pragma once
|
||||
#include <fstream>
|
||||
#include <iomanip>
|
||||
@@ -22,7 +22,7 @@
|
||||
namespace atrip {
|
||||
// Prolog:1 ends here
|
||||
|
||||
// [[file:../../atrip.org::checkpoint-definition][checkpoint-definition]]
|
||||
// [[file:~/cuda/atrip/atrip.org::checkpoint-definition][checkpoint-definition]]
|
||||
// template <typename F>
|
||||
struct Checkpoint {
|
||||
size_t no, nv;
|
||||
@@ -36,7 +36,7 @@ struct Checkpoint {
|
||||
};
|
||||
// checkpoint-definition ends here
|
||||
|
||||
// [[file:../../atrip.org::*Input and output][Input and output:1]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Input%20and%20output][Input and output:1]]
|
||||
void write_checkpoint(Checkpoint const& c, std::string const& filepath) {
|
||||
std::ofstream out(filepath);
|
||||
out << "No: " << c.no
|
||||
@@ -87,6 +87,6 @@ Checkpoint read_checkpoint(std::string const& filepath) {
|
||||
}
|
||||
// Input and output:1 ends here
|
||||
|
||||
// [[file:../../atrip.org::*Epilog][Epilog:1]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Epilog][Epilog:1]]
|
||||
}
|
||||
// Epilog:1 ends here
|
||||
|
||||
46
include/atrip/Complex.hpp
Normal file
46
include/atrip/Complex.hpp
Normal file
@@ -0,0 +1,46 @@
|
||||
// Copyright 2022 Alejandro Gallo
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// [[file:~/cuda/atrip/atrip.org::*Complex%20numbers][Complex numbers:1]]
|
||||
#pragma once
|
||||
|
||||
#include <complex>
|
||||
#include <mpi.h>
|
||||
#include "config.h"
|
||||
#if defined(HAVE_CUDA)
|
||||
#include <cuComplex.h>
|
||||
#endif
|
||||
|
||||
// Declarations for complex-number support; no definitions live in this header.
namespace atrip {
|
||||
|
||||
// Double-precision complex scalar used across the library.
using Complex = std::complex<double>;
|
||||
|
||||
// Declaration only.
// NOTE(review): presumably returns the complex conjugate for complex F and
// the argument itself otherwise -- confirm against the definition.
template <typename F> F maybeConjugate(const F);
|
||||
|
||||
#if defined(HAVE_CUDA)
|
||||
// Compound addition for cuDoubleComplex values (declared only when
// HAVE_CUDA is defined).
cuDoubleComplex& operator+=(cuDoubleComplex& lz, cuDoubleComplex const& rz);
|
||||
#endif
|
||||
|
||||
namespace traits {
|
||||
|
||||
// Declaration only; takes the type to query as template argument FF.
template <typename FF> bool isComplex();
|
||||
|
||||
namespace mpi {
|
||||
// Declaration only; yields an MPI_Datatype for the template argument F.
template <typename F> MPI_Datatype datatypeOf(void);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
// Complex numbers:1 ends here
|
||||
@@ -12,7 +12,7 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// [[file:../../atrip.org::*Macros][Macros:1]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Macros][Macros:1]]
|
||||
#pragma once
|
||||
#include <functional>
|
||||
#define ATRIP_BENCHMARK
|
||||
@@ -21,7 +21,6 @@
|
||||
# define ATRIP_DEBUG 1
|
||||
#endif
|
||||
//#define ATRIP_WORKLOAD_DUMP
|
||||
#define ATRIP_USE_DGEMM
|
||||
//#define ATRIP_PRINT_TUPLES
|
||||
|
||||
#ifndef ATRIP_DEBUG
|
||||
@@ -75,20 +74,20 @@
|
||||
#endif
|
||||
// Macros:1 ends here
|
||||
|
||||
// [[file:../../atrip.org::*Macros][Macros:2]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Macros][Macros:2]]
|
||||
// Default LOG macro, defined only if the includer has not already provided
// one. It streams `name` followed by ": " to std::cout, and only when
// Atrip::rank is 0; the `level` argument is not used by the expansion.
// The expansion is a bare `if` statement, so do not place a LOG(...) line
// directly in front of an `else`.
#ifndef LOG
|
||||
#define LOG(level, name) if (Atrip::rank == 0) std::cout << name << ": "
|
||||
#endif
|
||||
// Macros:2 ends here
|
||||
|
||||
// [[file:../../atrip.org::*Macros][Macros:3]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Macros][Macros:3]]
|
||||
// With ATRIP_NO_OUTPUT defined, LOG is redefined so that its stream
// expression sits behind `if (false)` and is never executed, while call sites
// still compile unchanged.
#ifdef ATRIP_NO_OUTPUT
|
||||
# undef LOG
|
||||
# define LOG(level, name) if (false) std::cout << name << ": "
|
||||
#endif
|
||||
// Macros:3 ends here
|
||||
|
||||
// [[file:../../atrip.org::IterationDescriptor][IterationDescriptor]]
|
||||
// [[file:~/cuda/atrip/atrip.org::IterationDescriptor][IterationDescriptor]]
|
||||
namespace atrip {
|
||||
|
||||
struct IterationDescription;
|
||||
|
||||
@@ -12,371 +12,94 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// [[file:../../atrip.org::*Equations][Equations:1]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Prolog][Prolog:1]]
|
||||
#pragma once
|
||||
|
||||
#include<atrip/Slice.hpp>
|
||||
#include<atrip/Atrip.hpp>
|
||||
#include<atrip/Blas.hpp>
|
||||
#include<atrip/Utils.hpp>
|
||||
|
||||
#if defined(HAVE_CUDA)
|
||||
#include<thrust/device_vector.h>
|
||||
#endif
|
||||
|
||||
|
||||
namespace atrip {
|
||||
using ABCTuple = std::array<size_t, 3>;
|
||||
using PartialTuple = std::array<size_t, 2>;
|
||||
using ABCTuples = std::vector<ABCTuple>;
|
||||
// Prolog:1 ends here
|
||||
|
||||
template <typename F=double>
|
||||
double getEnergyDistinct
|
||||
( const F epsabc
|
||||
, std::vector<F> const& epsi
|
||||
, std::vector<F> const& Tijk_
|
||||
, std::vector<F> const& Zijk_
|
||||
) {
|
||||
constexpr size_t blockSize=16;
|
||||
F energy(0.);
|
||||
const size_t No = epsi.size();
|
||||
for (size_t kk=0; kk<No; kk+=blockSize){
|
||||
const size_t kend( std::min(No, kk+blockSize) );
|
||||
for (size_t jj(kk); jj<No; jj+=blockSize){
|
||||
const size_t jend( std::min( No, jj+blockSize) );
|
||||
for (size_t ii(jj); ii<No; ii+=blockSize){
|
||||
const size_t iend( std::min( No, ii+blockSize) );
|
||||
for (size_t k(kk); k < kend; k++){
|
||||
const F ek(epsi[k]);
|
||||
const size_t jstart = jj > k ? jj : k;
|
||||
for (size_t j(jstart); j < jend; j++){
|
||||
F const ej(epsi[j]);
|
||||
F const facjk = j == k ? F(0.5) : F(1.0);
|
||||
size_t istart = ii > j ? ii : j;
|
||||
for (size_t i(istart); i < iend; i++){
|
||||
const F
|
||||
ei(epsi[i])
|
||||
, facij = i == j ? F(0.5) : F(1.0)
|
||||
, denominator(epsabc - ei - ej - ek)
|
||||
, U(Zijk_[i + No*j + No*No*k])
|
||||
, V(Zijk_[i + No*k + No*No*j])
|
||||
, W(Zijk_[j + No*i + No*No*k])
|
||||
, X(Zijk_[j + No*k + No*No*i])
|
||||
, Y(Zijk_[k + No*i + No*No*j])
|
||||
, Z(Zijk_[k + No*j + No*No*i])
|
||||
, A(maybeConjugate<F>(Tijk_[i + No*j + No*No*k]))
|
||||
, B(maybeConjugate<F>(Tijk_[i + No*k + No*No*j]))
|
||||
, C(maybeConjugate<F>(Tijk_[j + No*i + No*No*k]))
|
||||
, D(maybeConjugate<F>(Tijk_[j + No*k + No*No*i]))
|
||||
, E(maybeConjugate<F>(Tijk_[k + No*i + No*No*j]))
|
||||
, _F(maybeConjugate<F>(Tijk_[k + No*j + No*No*i]))
|
||||
, value
|
||||
= 3.0 * ( A * U
|
||||
+ B * V
|
||||
+ C * W
|
||||
+ D * X
|
||||
+ E * Y
|
||||
+ _F * Z )
|
||||
+ ( ( U + X + Y )
|
||||
- 2.0 * ( V + W + Z )
|
||||
) * ( A + D + E )
|
||||
+ ( ( V + W + Z )
|
||||
- 2.0 * ( U + X + Y )
|
||||
) * ( B + C + _F )
|
||||
;
|
||||
energy += 2.0 * value / denominator * facjk * facij;
|
||||
} // i
|
||||
} // j
|
||||
} // k
|
||||
} // ii
|
||||
} // jj
|
||||
} // kk
|
||||
return std::real(energy);
|
||||
}
|
||||
// [[file:~/cuda/atrip/atrip.org::*Energy][Energy:1]]
|
||||
// Declaration only: raw-pointer variant of getEnergyDistinct. The number of
// indices No is passed explicitly next to the three buffers; the definition
// is not in this header.
// NOTE(review): presumably epsi holds No values and Tijk / Zijk hold
// No*No*No values each -- confirm against the definition.
template <typename F=double>
|
||||
double getEnergyDistinct
|
||||
( F const epsabc
|
||||
, size_t const No
|
||||
, F* const epsi
|
||||
, F* const Tijk
|
||||
, F* const Zijk
|
||||
);
|
||||
|
||||
// Declaration only: raw-pointer variant of getEnergySame, with the same
// parameter list as getEnergyDistinct above.
template <typename F=double>
|
||||
double getEnergySame
|
||||
( F const epsabc
|
||||
, size_t const No
|
||||
, F* const epsi
|
||||
, F* const Tijk
|
||||
, F* const Zijk
|
||||
);
|
||||
// Energy:1 ends here
|
||||
|
||||
template <typename F=double>
|
||||
double getEnergySame
|
||||
( const F epsabc
|
||||
, std::vector<F> const& epsi
|
||||
, std::vector<F> const& Tijk_
|
||||
, std::vector<F> const& Zijk_
|
||||
) {
|
||||
constexpr size_t blockSize = 16;
|
||||
const size_t No = epsi.size();
|
||||
F energy = F(0.);
|
||||
for (size_t kk=0; kk<No; kk+=blockSize){
|
||||
const size_t kend( std::min( kk+blockSize, No) );
|
||||
for (size_t jj(kk); jj<No; jj+=blockSize){
|
||||
const size_t jend( std::min( jj+blockSize, No) );
|
||||
for (size_t ii(jj); ii<No; ii+=blockSize){
|
||||
const size_t iend( std::min( ii+blockSize, No) );
|
||||
for (size_t k(kk); k < kend; k++){
|
||||
const F ek(epsi[k]);
|
||||
const size_t jstart = jj > k ? jj : k;
|
||||
for(size_t j(jstart); j < jend; j++){
|
||||
const F facjk( j == k ? F(0.5) : F(1.0));
|
||||
const F ej(epsi[j]);
|
||||
const size_t istart = ii > j ? ii : j;
|
||||
for(size_t i(istart); i < iend; i++){
|
||||
const F
|
||||
ei(epsi[i])
|
||||
, facij ( i==j ? F(0.5) : F(1.0))
|
||||
, denominator(epsabc - ei - ej - ek)
|
||||
, U(Zijk_[i + No*j + No*No*k])
|
||||
, V(Zijk_[j + No*k + No*No*i])
|
||||
, W(Zijk_[k + No*i + No*No*j])
|
||||
, A(maybeConjugate<F>(Tijk_[i + No*j + No*No*k]))
|
||||
, B(maybeConjugate<F>(Tijk_[j + No*k + No*No*i]))
|
||||
, C(maybeConjugate<F>(Tijk_[k + No*i + No*No*j]))
|
||||
, value
|
||||
= F(3.0) * ( A * U
|
||||
+ B * V
|
||||
+ C * W
|
||||
)
|
||||
- ( A + B + C ) * ( U + V + W )
|
||||
;
|
||||
energy += F(2.0) * value / denominator * facjk * facij;
|
||||
} // i
|
||||
} // j
|
||||
} // k
|
||||
} // ii
|
||||
} // jj
|
||||
} // kk
|
||||
return std::real(energy);
|
||||
}
|
||||
|
||||
template <typename F=double>
|
||||
void singlesContribution
|
||||
( size_t No
|
||||
, size_t Nv
|
||||
, const ABCTuple &abc
|
||||
, F const* Tph
|
||||
, F const* VABij
|
||||
, F const* VACij
|
||||
, F const* VBCij
|
||||
, F *Zijk
|
||||
) {
|
||||
const size_t a(abc[0]), b(abc[1]), c(abc[2]);
|
||||
for (size_t k=0; k < No; k++)
|
||||
for (size_t i=0; i < No; i++)
|
||||
for (size_t j=0; j < No; j++) {
|
||||
const size_t ijk = i + j*No + k*No*No
|
||||
, jk = j + No * k
|
||||
;
|
||||
Zijk[ijk] += Tph[ a + i * Nv ] * VBCij[ j + k * No ];
|
||||
Zijk[ijk] += Tph[ b + j * Nv ] * VACij[ i + k * No ];
|
||||
Zijk[ijk] += Tph[ c + k * Nv ] * VABij[ i + j * No ];
|
||||
}
|
||||
}
|
||||
// [[file:~/cuda/atrip/atrip.org::*Singles%20contribution][Singles contribution:1]]
|
||||
// Declaration only: singles contribution taking the three indices a, b, c as
// separate arguments and DataFieldType<F> buffers. When HAVE_CUDA is defined
// the function is declared __global__, i.e. as a CUDA kernel.
// NOTE(review): the qualifier is guarded by HAVE_CUDA alone, so every
// translation unit that includes this declaration must understand
// `__global__` -- confirm that non-nvcc sources see the CUDA headers.
// NOTE(review): launch configuration and buffer extents are not visible
// here; confirm against the definition.
template <typename F=double>
|
||||
#ifdef HAVE_CUDA
|
||||
__global__
|
||||
#endif
|
||||
void singlesContribution
|
||||
( size_t No
|
||||
, size_t Nv
|
||||
, size_t a
|
||||
, size_t b
|
||||
, size_t c
|
||||
, DataFieldType<F>* const Tph
|
||||
, DataFieldType<F>* const VABij
|
||||
, DataFieldType<F>* const VACij
|
||||
, DataFieldType<F>* const VBCij
|
||||
, DataFieldType<F>* Zijk
|
||||
);
|
||||
// Singles contribution:1 ends here
|
||||
|
||||
// [[file:~/cuda/atrip/atrip.org::*Doubles%20contribution][Doubles contribution:1]]
|
||||
template <typename F=double>
|
||||
void doublesContribution
|
||||
( const ABCTuple &abc
|
||||
, size_t const No
|
||||
, size_t const Nv
|
||||
// -- VABCI
|
||||
, F const* VABph
|
||||
, F const* VACph
|
||||
, F const* VBCph
|
||||
, F const* VBAph
|
||||
, F const* VCAph
|
||||
, F const* VCBph
|
||||
, DataPtr<F> const VABph
|
||||
, DataPtr<F> const VACph
|
||||
, DataPtr<F> const VBCph
|
||||
, DataPtr<F> const VBAph
|
||||
, DataPtr<F> const VCAph
|
||||
, DataPtr<F> const VCBph
|
||||
// -- VHHHA
|
||||
, F const* VhhhA
|
||||
, F const* VhhhB
|
||||
, F const* VhhhC
|
||||
, DataPtr<F> const VhhhA
|
||||
, DataPtr<F> const VhhhB
|
||||
, DataPtr<F> const VhhhC
|
||||
// -- TA
|
||||
, F const* TAphh
|
||||
, F const* TBphh
|
||||
, F const* TCphh
|
||||
, DataPtr<F> const TAphh
|
||||
, DataPtr<F> const TBphh
|
||||
, DataPtr<F> const TCphh
|
||||
// -- TABIJ
|
||||
, F const* TABhh
|
||||
, F const* TAChh
|
||||
, F const* TBChh
|
||||
, DataPtr<F> const TABhh
|
||||
, DataPtr<F> const TAChh
|
||||
, DataPtr<F> const TBChh
|
||||
// -- TIJK
|
||||
, F *Tijk
|
||||
) {
|
||||
|
||||
const size_t a = abc[0], b = abc[1], c = abc[2]
|
||||
, NoNo = No*No, NoNv = No*Nv
|
||||
;
|
||||
|
||||
#if defined(ATRIP_USE_DGEMM)
|
||||
#define _IJK_(i, j, k) i + j*No + k*NoNo
|
||||
#define REORDER(__II, __JJ, __KK) \
|
||||
WITH_CHRONO("doubles:reorder", \
|
||||
for (size_t k = 0; k < No; k++) \
|
||||
for (size_t j = 0; j < No; j++) \
|
||||
for (size_t i = 0; i < No; i++) { \
|
||||
Tijk[_IJK_(i, j, k)] += _t_buffer[_IJK_(__II, __JJ, __KK)]; \
|
||||
} \
|
||||
)
|
||||
#define DGEMM_PARTICLES(__A, __B) \
|
||||
atrip::xgemm<F>( "T" \
|
||||
, "N" \
|
||||
, (int const*)&NoNo \
|
||||
, (int const*)&No \
|
||||
, (int const*)&Nv \
|
||||
, &one \
|
||||
, __A \
|
||||
, (int const*)&Nv \
|
||||
, __B \
|
||||
, (int const*)&Nv \
|
||||
, &zero \
|
||||
, _t_buffer.data() \
|
||||
, (int const*)&NoNo \
|
||||
);
|
||||
#define DGEMM_HOLES(__A, __B, __TRANSB) \
|
||||
atrip::xgemm<F>( "N" \
|
||||
, __TRANSB \
|
||||
, (int const*)&NoNo \
|
||||
, (int const*)&No \
|
||||
, (int const*)&No \
|
||||
, &m_one \
|
||||
, __A \
|
||||
, (int const*)&NoNo \
|
||||
, __B \
|
||||
, (int const*)&No \
|
||||
, &zero \
|
||||
, _t_buffer.data() \
|
||||
, (int const*)&NoNo \
|
||||
);
|
||||
#define MAYBE_CONJ(_conj, _buffer) \
|
||||
for (size_t __i = 0; __i < NoNoNo; ++__i) \
|
||||
_conj[__i] = maybeConjugate<F>(_buffer[__i]); \
|
||||
|
||||
const size_t NoNoNo = No*NoNo;
|
||||
std::vector<F> _t_buffer;
|
||||
_t_buffer.reserve(NoNoNo);
|
||||
F one{1.0}, m_one{-1.0}, zero{0.0};
|
||||
|
||||
WITH_CHRONO("double:reorder",
|
||||
for (size_t k = 0; k < NoNoNo; k++) {
|
||||
Tijk[k] = 0.0;
|
||||
})
|
||||
|
||||
// TOMERGE: replace chronos
|
||||
WITH_CHRONO("doubles:holes",
|
||||
{ // Holes part %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
|
||||
std::vector<F> _vhhh(NoNoNo);
|
||||
|
||||
// VhhhC[i + k*No + L*NoNo] * TABhh[L + j*No]; H1
|
||||
MAYBE_CONJ(_vhhh, VhhhC)
|
||||
WITH_CHRONO("doubles:holes:1",
|
||||
DGEMM_HOLES(_vhhh.data(), TABhh, "N")
|
||||
REORDER(i, k, j)
|
||||
)
|
||||
// VhhhC[j + k*No + L*NoNo] * TABhh[i + L*No]; H0
|
||||
WITH_CHRONO("doubles:holes:2",
|
||||
DGEMM_HOLES(_vhhh.data(), TABhh, "T")
|
||||
REORDER(j, k, i)
|
||||
)
|
||||
|
||||
// VhhhB[i + j*No + L*NoNo] * TAChh[L + k*No]; H5
|
||||
MAYBE_CONJ(_vhhh, VhhhB)
|
||||
WITH_CHRONO("doubles:holes:3",
|
||||
DGEMM_HOLES(_vhhh.data(), TAChh, "N")
|
||||
REORDER(i, j, k)
|
||||
)
|
||||
// VhhhB[k + j*No + L*NoNo] * TAChh[i + L*No]; H3
|
||||
WITH_CHRONO("doubles:holes:4",
|
||||
DGEMM_HOLES(_vhhh.data(), TAChh, "T")
|
||||
REORDER(k, j, i)
|
||||
)
|
||||
|
||||
// VhhhA[j + i*No + L*NoNo] * TBChh[L + k*No]; H1
|
||||
MAYBE_CONJ(_vhhh, VhhhA)
|
||||
WITH_CHRONO("doubles:holes:5",
|
||||
DGEMM_HOLES(_vhhh.data(), TBChh, "N")
|
||||
REORDER(j, i, k)
|
||||
)
|
||||
// VhhhA[k + i*No + L*NoNo] * TBChh[j + L*No]; H4
|
||||
WITH_CHRONO("doubles:holes:6",
|
||||
DGEMM_HOLES(_vhhh.data(), TBChh, "T")
|
||||
REORDER(k, i, j)
|
||||
)
|
||||
|
||||
}
|
||||
)
|
||||
#undef MAYBE_CONJ
|
||||
|
||||
WITH_CHRONO("doubles:particles",
|
||||
{ // Particle part %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
// TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv]; P0
|
||||
WITH_CHRONO("doubles:particles:1",
|
||||
DGEMM_PARTICLES(TAphh, VBCph)
|
||||
REORDER(i, j, k)
|
||||
)
|
||||
// TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv]; P3
|
||||
WITH_CHRONO("doubles:particles:2",
|
||||
DGEMM_PARTICLES(TAphh, VCBph)
|
||||
REORDER(i, k, j)
|
||||
)
|
||||
// TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv]; P5
|
||||
WITH_CHRONO("doubles:particles:3",
|
||||
DGEMM_PARTICLES(TCphh, VABph)
|
||||
REORDER(k, i, j)
|
||||
)
|
||||
// TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv]; P2
|
||||
WITH_CHRONO("doubles:particles:4",
|
||||
DGEMM_PARTICLES(TCphh, VBAph)
|
||||
REORDER(k, j, i)
|
||||
)
|
||||
// TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv]; P1
|
||||
WITH_CHRONO("doubles:particles:5",
|
||||
DGEMM_PARTICLES(TBphh, VACph)
|
||||
REORDER(j, i, k)
|
||||
)
|
||||
// TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv]; P4
|
||||
WITH_CHRONO("doubles:particles:6",
|
||||
DGEMM_PARTICLES(TBphh, VCAph)
|
||||
REORDER(j, k, i)
|
||||
)
|
||||
}
|
||||
)
|
||||
|
||||
#undef REORDER
|
||||
#undef DGEMM_HOLES
|
||||
#undef DGEMM_PARTICLES
|
||||
#undef _IJK_
|
||||
#else
|
||||
for (size_t k = 0; k < No; k++)
|
||||
for (size_t j = 0; j < No; j++)
|
||||
for (size_t i = 0; i < No; i++){
|
||||
const size_t ijk = i + j*No + k*NoNo
|
||||
, jk = j + k*No
|
||||
;
|
||||
Tijk[ijk] = 0.0; // :important
|
||||
// HOLE DIAGRAMS: TABHH and VHHHA
|
||||
for (size_t L = 0; L < No; L++){
|
||||
// t[abLj] * V[Lcik] H1
|
||||
// t[baLi] * V[Lcjk] H0 TODO: conjugate T for complex
|
||||
Tijk[ijk] -= TABhh[L + j*No] * VhhhC[i + k*No + L*NoNo];
|
||||
Tijk[ijk] -= TABhh[i + L*No] * VhhhC[j + k*No + L*NoNo];
|
||||
|
||||
// t[acLk] * V[Lbij] H5
|
||||
// t[caLi] * V[Lbkj] H3
|
||||
Tijk[ijk] -= TAChh[L + k*No] * VhhhB[i + j*No + L*NoNo];
|
||||
Tijk[ijk] -= TAChh[i + L*No] * VhhhB[k + j*No + L*NoNo];
|
||||
|
||||
// t[bcLk] * V[Laji] H2
|
||||
// t[cbLj] * V[Laki] H4
|
||||
Tijk[ijk] -= TBChh[L + k*No] * VhhhA[j + i*No + L*NoNo];
|
||||
Tijk[ijk] -= TBChh[j + L*No] * VhhhA[k + i*No + L*NoNo];
|
||||
}
|
||||
// PARTILCE DIAGRAMS: TAPHH and VABPH
|
||||
for (size_t E = 0; E < Nv; E++) {
|
||||
// t[aEij] * V[bcEk] P0
|
||||
// t[aEik] * V[cbEj] P3 // TODO: CHECK THIS ONE, I DONT KNOW
|
||||
Tijk[ijk] += TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv];
|
||||
Tijk[ijk] += TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv];
|
||||
|
||||
// t[cEki] * V[abEj] P5
|
||||
// t[cEkj] * V[baEi] P2
|
||||
Tijk[ijk] += TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv];
|
||||
Tijk[ijk] += TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv];
|
||||
|
||||
// t[bEji] * V[acEk] P1
|
||||
// t[bEjk] * V[caEi] P4
|
||||
Tijk[ijk] += TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv];
|
||||
Tijk[ijk] += TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv];
|
||||
}
|
||||
|
||||
}
|
||||
#endif
|
||||
}
|
||||
// , DataPtr<F> Tijk
|
||||
, DataFieldType<F>* Tijk_
|
||||
);
|
||||
// Doubles contribution:1 ends here
|
||||
|
||||
// [[file:~/cuda/atrip/atrip.org::*Epilog][Epilog:1]]
|
||||
}
|
||||
// Equations:1 ends here
|
||||
// Epilog:1 ends here
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// [[file:../../atrip.org::*The rank mapping][The rank mapping:1]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*The%20rank%20mapping][The rank mapping:1]]
|
||||
#pragma once
|
||||
|
||||
#include <vector>
|
||||
@@ -65,7 +65,8 @@ namespace atrip {
|
||||
;
|
||||
}
|
||||
|
||||
bool isSourcePadding(size_t rank, size_t source) const noexcept {
|
||||
bool isSourcePadding(const size_t rank, const size_t source)
|
||||
const noexcept {
|
||||
return source == nSources() && isPaddingRank(rank);
|
||||
}
|
||||
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// [[file:../../atrip.org::*Prolog][Prolog:1]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Prolog][Prolog:1]]
|
||||
#pragma once
|
||||
#include <iostream>
|
||||
#include <algorithm>
|
||||
@@ -21,33 +21,20 @@
|
||||
|
||||
#include <atrip/Tuples.hpp>
|
||||
#include <atrip/Utils.hpp>
|
||||
#include <atrip/Blas.hpp>
|
||||
|
||||
namespace atrip {
|
||||
|
||||
template <typename FF> FF maybeConjugate(const FF a) { return a; }
|
||||
template <> Complex maybeConjugate(const Complex a) { return std::conj(a); }
|
||||
|
||||
namespace traits {
|
||||
template <typename FF> bool isComplex() { return false; }
|
||||
template <> bool isComplex<Complex>() { return true; }
|
||||
namespace mpi {
|
||||
template <typename FF> MPI_Datatype datatypeOf(void);
|
||||
template <> MPI_Datatype datatypeOf<double>() { return MPI_DOUBLE; }
|
||||
template <> MPI_Datatype datatypeOf<Complex>() { return MPI_DOUBLE_COMPLEX; }
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <typename F=double>
|
||||
struct Slice {
|
||||
// Prolog:1 ends here
|
||||
|
||||
// [[file:../../atrip.org::*Location][Location:1]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Location][Location:1]]
|
||||
struct Location { size_t rank; size_t source; };
|
||||
// Location:1 ends here
|
||||
|
||||
// [[file:../../atrip.org::*Type][Type:1]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Type][Type:1]]
|
||||
enum Type
|
||||
{ A = 10
|
||||
, B
|
||||
@@ -65,7 +52,7 @@ enum Type
|
||||
};
|
||||
// Type:1 ends here
|
||||
|
||||
// [[file:../../atrip.org::*State][State:1]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*State][State:1]]
|
||||
enum State {
|
||||
Fetch = 0,
|
||||
Dispatched = 2,
|
||||
@@ -76,7 +63,7 @@ enum State {
|
||||
};
|
||||
// State:1 ends here
|
||||
|
||||
// [[file:../../atrip.org::*The Info structure][The Info structure:1]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*The%20Info%20structure][The Info structure:1]]
|
||||
struct Info {
|
||||
// which part of a,b,c the slice holds
|
||||
PartialTuple tuple;
|
||||
@@ -100,7 +87,7 @@ struct Info {
|
||||
using Ty_x_Tu = std::pair< Type, PartialTuple >;
|
||||
// The Info structure:1 ends here
|
||||
|
||||
// [[file:../../atrip.org::*Name][Name:1]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Name][Name:1]]
|
||||
enum Name
|
||||
{ TA = 100
|
||||
, VIJKA = 101
|
||||
@@ -110,19 +97,19 @@ enum Name
|
||||
};
|
||||
// Name:1 ends here
|
||||
|
||||
// [[file:../../atrip.org::*Database][Database:1]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Database][Database:1]]
|
||||
struct LocalDatabaseElement {
|
||||
Slice<F>::Name name;
|
||||
Slice<F>::Info info;
|
||||
};
|
||||
// Database:1 ends here
|
||||
|
||||
// [[file:../../atrip.org::*Database][Database:2]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Database][Database:2]]
|
||||
using LocalDatabase = std::vector<LocalDatabaseElement>;
|
||||
using Database = LocalDatabase;
|
||||
// Database:2 ends here
|
||||
|
||||
// [[file:../../atrip.org::*MPI Types][MPI Types:1]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*MPI%20Types][MPI Types:1]]
|
||||
struct mpi {
|
||||
|
||||
static MPI_Datatype vector(size_t n, MPI_Datatype const& DT) {
|
||||
@@ -228,7 +215,7 @@ struct mpi {
|
||||
};
|
||||
// MPI Types:1 ends here
|
||||
|
||||
// [[file:../../atrip.org::*Static utilities][Static utilities:1]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Static%20utilities][Static utilities:1]]
|
||||
static
|
||||
PartialTuple subtupleBySlice(ABCTuple abc, Type sliceType) {
|
||||
switch (sliceType) {
|
||||
@@ -246,7 +233,7 @@ PartialTuple subtupleBySlice(ABCTuple abc, Type sliceType) {
|
||||
}
|
||||
// Static utilities:1 ends here
|
||||
|
||||
// [[file:../../atrip.org::*Static utilities][Static utilities:2]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Static%20utilities][Static utilities:2]]
|
||||
static std::vector<Slice<F>*> hasRecycledReferencingToIt
|
||||
( std::vector<Slice<F>> &slices
|
||||
, Info const& info
|
||||
@@ -263,7 +250,7 @@ static std::vector<Slice<F>*> hasRecycledReferencingToIt
|
||||
}
|
||||
// Static utilities:2 ends here
|
||||
|
||||
// [[file:../../atrip.org::*Static utilities][Static utilities:3]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Static%20utilities][Static utilities:3]]
|
||||
static Slice<F>& findOneByType(std::vector<Slice<F>> &slices, Slice<F>::Type type) {
|
||||
const auto sliceIt
|
||||
= std::find_if(slices.begin(), slices.end(),
|
||||
@@ -279,7 +266,7 @@ static Slice<F>& findOneByType(std::vector<Slice<F>> &slices, Slice<F>::Type typ
|
||||
}
|
||||
// Static utilities:3 ends here
|
||||
|
||||
// [[file:../../atrip.org::*Static utilities][Static utilities:4]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Static%20utilities][Static utilities:4]]
|
||||
static Slice<F>&
|
||||
findRecycledSource (std::vector<Slice<F>> &slices, Slice<F>::Info info) {
|
||||
const auto sliceIt
|
||||
@@ -305,7 +292,7 @@ findRecycledSource (std::vector<Slice<F>> &slices, Slice<F>::Info info) {
|
||||
}
|
||||
// Static utilities:4 ends here
|
||||
|
||||
// [[file:../../atrip.org::*Static utilities][Static utilities:5]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Static%20utilities][Static utilities:5]]
|
||||
static Slice<F>& findByTypeAbc
|
||||
( std::vector<Slice<F>> &slices
|
||||
, Slice<F>::Type type
|
||||
@@ -335,7 +322,7 @@ static Slice<F>& findByTypeAbc
|
||||
}
|
||||
// Static utilities:5 ends here
|
||||
|
||||
// [[file:../../atrip.org::*Static utilities][Static utilities:6]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Static%20utilities][Static utilities:6]]
|
||||
static Slice<F>& findByInfo(std::vector<Slice<F>> &slices,
|
||||
Slice<F>::Info const& info) {
|
||||
const auto sliceIt
|
||||
@@ -358,30 +345,33 @@ static Slice<F>& findByInfo(std::vector<Slice<F>> &slices,
|
||||
}
|
||||
// Static utilities:6 ends here
|
||||
|
||||
// [[file:../../atrip.org::*Attributes][Attributes:1]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Attributes][Attributes:1]]
|
||||
Info info;
|
||||
// Attributes:1 ends here
|
||||
|
||||
// [[file:../../atrip.org::*Attributes][Attributes:2]]
|
||||
F *data;
|
||||
// [[file:~/cuda/atrip/atrip.org::*Attributes][Attributes:2]]
|
||||
DataPtr<F> data;
|
||||
#if defined(HAVE_CUDA)
|
||||
F* mpi_data;
|
||||
#endif
|
||||
// Attributes:2 ends here
|
||||
|
||||
// [[file:../../atrip.org::*Attributes][Attributes:3]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Attributes][Attributes:3]]
|
||||
MPI_Request request;
|
||||
// Attributes:3 ends here
|
||||
|
||||
// [[file:../../atrip.org::*Attributes][Attributes:4]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Attributes][Attributes:4]]
|
||||
const size_t size;
|
||||
// Attributes:4 ends here
|
||||
|
||||
// [[file:../../atrip.org::*Member functions][Member functions:1]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Member%20functions][Member functions:1]]
|
||||
// Puts this slice into the Ready state and sets its recycling field to Blank.
// Only the two `info` fields are touched.
void markReady() noexcept {
|
||||
info.state = Ready;
|
||||
info.recycling = Blank;
|
||||
}
|
||||
// Member functions:1 ends here
|
||||
|
||||
// [[file:../../atrip.org::*Member functions][Member functions:2]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Member%20functions][Member functions:2]]
|
||||
bool isUnwrapped() const noexcept {
|
||||
return info.state == Ready
|
||||
|| info.state == SelfSufficient
|
||||
@@ -389,7 +379,7 @@ bool isUnwrapped() const noexcept {
|
||||
}
|
||||
// Member functions:2 ends here
|
||||
|
||||
// [[file:../../atrip.org::*Member functions][Member functions:3]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Member%20functions][Member functions:3]]
|
||||
bool isUnwrappable() const noexcept {
|
||||
return isUnwrapped()
|
||||
|| info.state == Recycled
|
||||
@@ -407,7 +397,7 @@ void free() noexcept {
|
||||
info.state = Acceptor;
|
||||
info.from = {0, 0};
|
||||
info.recycling = Blank;
|
||||
data = nullptr;
|
||||
data = DataNullPtr;
|
||||
}
|
||||
|
||||
inline bool isFree() const noexcept {
|
||||
@@ -417,12 +407,12 @@ inline bool isFree() const noexcept {
|
||||
&& info.from.rank == 0
|
||||
&& info.from.source == 0
|
||||
&& info.recycling == Blank
|
||||
&& data == nullptr
|
||||
&& data == DataNullPtr
|
||||
;
|
||||
}
|
||||
// Member functions:3 ends here
|
||||
|
||||
// [[file:../../atrip.org::*Member functions][Member functions:4]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Member%20functions][Member functions:4]]
|
||||
inline bool isRecyclable() const noexcept {
|
||||
return ( info.state == Dispatched
|
||||
|| info.state == Ready
|
||||
@@ -433,16 +423,16 @@ inline bool isRecyclable() const noexcept {
|
||||
}
|
||||
// Member functions:4 ends here
|
||||
|
||||
// [[file:../../atrip.org::*Member functions][Member functions:5]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Member%20functions][Member functions:5]]
|
||||
inline bool hasValidDataPointer() const noexcept {
|
||||
return data != nullptr
|
||||
return data != DataNullPtr
|
||||
&& info.state != Acceptor
|
||||
&& info.type != Blank
|
||||
;
|
||||
}
|
||||
// Member functions:5 ends here
|
||||
|
||||
// [[file:../../atrip.org::*Member functions][Member functions:6]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Member%20functions][Member functions:6]]
|
||||
void unwrapAndMarkReady() {
|
||||
if (info.state == Ready) return;
|
||||
if (info.state != Dispatched)
|
||||
@@ -454,9 +444,17 @@ void unwrapAndMarkReady() {
|
||||
WITH_RANK << "__slice__:mpi: waiting " << "\n";
|
||||
#endif
|
||||
const int errorCode = MPI_Wait(&request, &status);
|
||||
if (MPI_SUCCESS != MPI_Request_free(&request))
|
||||
throw "Error freeing MPI request";
|
||||
if (errorCode != MPI_SUCCESS)
|
||||
throw "MPI ERROR HAPPENED....";
|
||||
|
||||
#if defined(HAVE_CUDA)
|
||||
// copy the retrieved mpi data to the device
|
||||
cuMemcpyHtoD(data, (void*)mpi_data, sizeof(F) * size);
|
||||
std::free(mpi_data);
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_OCD
|
||||
char errorString[MPI_MAX_ERROR_STRING];
|
||||
int errorSize;
|
||||
@@ -474,18 +472,21 @@ void unwrapAndMarkReady() {
|
||||
}
|
||||
// Member functions:6 ends here
|
||||
|
||||
// [[file:../../atrip.org::*Epilog][Epilog:1]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Epilog][Epilog:1]]
|
||||
Slice(size_t size_)
|
||||
: info({})
|
||||
, data(nullptr)
|
||||
, size(size_)
|
||||
{}
|
||||
: info({})
|
||||
, data(DataNullPtr)
|
||||
#if defined(HAVE_CUDA)
|
||||
, mpi_data(nullptr)
|
||||
#endif
|
||||
, size(size_)
|
||||
{}
|
||||
|
||||
|
||||
}; // struct Slice
|
||||
}; // struct Slice
|
||||
// Epilog:1 ends here
|
||||
|
||||
// [[file:../../atrip.org::*Debug][Debug:1]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Debug][Debug:1]]
|
||||
template <typename F=double>
|
||||
std::ostream& operator<<(std::ostream& out, typename Slice<F>::Location const& v) {
|
||||
// TODO: remove me
|
||||
|
||||
@@ -12,16 +12,19 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// [[file:../../atrip.org::*The slice union][The slice union:1]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Prolog][Prolog:1]]
|
||||
#pragma once
|
||||
#include <atrip/Debug.hpp>
|
||||
#include <atrip/Slice.hpp>
|
||||
#include <atrip/RankMap.hpp>
|
||||
|
||||
namespace atrip {
|
||||
// Prolog:1 ends here
|
||||
|
||||
template <typename F=double>
|
||||
struct SliceUnion {
|
||||
// [[file:~/cuda/atrip/atrip.org::*Prolog][Prolog:2]]
|
||||
template <typename F=double>
|
||||
class SliceUnion {
|
||||
public:
|
||||
using Tensor = CTF::Tensor<F>;
|
||||
|
||||
virtual void
|
||||
@@ -191,7 +194,11 @@ namespace atrip {
|
||||
: Slice<F>::Fetch
|
||||
;
|
||||
if (blank.info.state == Slice<F>::SelfSufficient) {
|
||||
#if defined(HAVE_CUDA)
|
||||
blank.mpi_data = sources[from.source].data();
|
||||
#else
|
||||
blank.data = sources[from.source].data();
|
||||
#endif
|
||||
} else {
|
||||
if (freePointers.size() == 0) {
|
||||
std::stringstream stream;
|
||||
@@ -345,8 +352,7 @@ namespace atrip {
|
||||
}
|
||||
|
||||
// CONSTRUCTOR
|
||||
SliceUnion( Tensor const& sourceTensor
|
||||
, std::vector<typename Slice<F>::Type> sliceTypes_
|
||||
SliceUnion( std::vector<typename Slice<F>::Type> sliceTypes_
|
||||
, std::vector<size_t> sliceLength_
|
||||
, std::vector<size_t> paramLength
|
||||
, size_t np
|
||||
@@ -366,12 +372,19 @@ namespace atrip {
|
||||
1UL, std::multiplies<size_t>())))
|
||||
, name(name_)
|
||||
, sliceTypes(sliceTypes_)
|
||||
, sliceBuffers(nSliceBuffers, sources[0])
|
||||
, sliceBuffers(nSliceBuffers)
|
||||
//, slices(2 * sliceTypes.size(), Slice<F>{ sources[0].size() })
|
||||
{ // constructor begin
|
||||
|
||||
LOG(0,"Atrip") << "INIT SliceUnion: " << name << "\n";
|
||||
|
||||
for (auto& ptr: sliceBuffers)
|
||||
#if defined(HAVE_CUDA)
|
||||
cuMemAlloc(&ptr, sizeof(F) * sources[0].size());
|
||||
#else
|
||||
ptr = (DataPtr<F>)malloc(sizeof(F) * sources[0].size());
|
||||
#endif
|
||||
|
||||
slices
|
||||
= std::vector<Slice<F>>(2 * sliceTypes.size(), { sources[0].size() });
|
||||
// TODO: think exactly ^------------------- about this number
|
||||
@@ -379,7 +392,7 @@ namespace atrip {
|
||||
// initialize the freePointers with the pointers to the buffers
|
||||
std::transform(sliceBuffers.begin(), sliceBuffers.end(),
|
||||
std::inserter(freePointers, freePointers.begin()),
|
||||
[](std::vector<F> &vec) { return vec.data(); });
|
||||
[](DataPtr<F> ptr) { return ptr; });
|
||||
|
||||
|
||||
|
||||
@@ -397,8 +410,6 @@ namespace atrip {
|
||||
<< freePointers.size() << "\n";
|
||||
LOG(1,"Atrip") << "#sliceBuffers "
|
||||
<< sliceBuffers.size() << "\n";
|
||||
LOG(1,"Atrip") << "#sliceBuffers[0] "
|
||||
<< sliceBuffers[0].size() << "\n";
|
||||
LOG(1,"Atrip") << "#sliceLength "
|
||||
<< sliceLength.size() << "\n";
|
||||
LOG(1,"Atrip") << "#paramLength "
|
||||
@@ -477,9 +488,12 @@ namespace atrip {
|
||||
if (slice.info.state == Slice<F>::Fetch) {
|
||||
// TODO: do it through the slice class
|
||||
slice.info.state = Slice<F>::Dispatched;
|
||||
MPI_Request request;
|
||||
slice.request = request;
|
||||
#if defined(HAVE_CUDA)
|
||||
slice.mpi_data = (F*)malloc(sizeof(F) * slice.size);
|
||||
MPI_Irecv( slice.mpi_data
|
||||
#else
|
||||
MPI_Irecv( slice.data
|
||||
#endif
|
||||
, slice.size
|
||||
, traits::mpi::datatypeOf<F>()
|
||||
, info.from.rank
|
||||
@@ -495,7 +509,7 @@ namespace atrip {
|
||||
for (auto type: sliceTypes) unwrapSlice(type, abc);
|
||||
}
|
||||
|
||||
F* unwrapSlice(typename Slice<F>::Type type, ABCTuple const& abc) {
|
||||
DataPtr<F> unwrapSlice(typename Slice<F>::Type type, ABCTuple const& abc) {
|
||||
WITH_CRAZY_DEBUG
|
||||
WITH_RANK << "__unwrap__:slice " << type << " w n "
|
||||
<< name
|
||||
@@ -539,6 +553,15 @@ namespace atrip {
|
||||
return slice.data;
|
||||
}
|
||||
|
||||
~SliceUnion() {
|
||||
for (auto& ptr: sliceBuffers)
|
||||
#if defined(HAVE_CUDA)
|
||||
cuMemFree(ptr);
|
||||
#else
|
||||
std::free(ptr);
|
||||
#endif
|
||||
}
|
||||
|
||||
const RankMap<F> rankMap;
|
||||
const MPI_Comm world;
|
||||
const MPI_Comm universe;
|
||||
@@ -547,8 +570,8 @@ namespace atrip {
|
||||
std::vector< Slice<F> > slices;
|
||||
typename Slice<F>::Name name;
|
||||
const std::vector<typename Slice<F>::Type> sliceTypes;
|
||||
std::vector< std::vector<F> > sliceBuffers;
|
||||
std::set<F*> freePointers;
|
||||
std::vector< DataPtr<F> > sliceBuffers;
|
||||
std::set< DataPtr<F> > freePointers;
|
||||
|
||||
};
|
||||
|
||||
@@ -568,6 +591,8 @@ namespace atrip {
|
||||
}
|
||||
return **sliceUnionIt;
|
||||
}
|
||||
// Prolog:2 ends here
|
||||
|
||||
// [[file:~/cuda/atrip/atrip.org::*Epilog][Epilog:1]]
|
||||
}
|
||||
// The slice union:1 ends here
|
||||
// Epilog:1 ends here
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// [[file:../../atrip.org::*Prolog][Prolog:1]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Prolog][Prolog:1]]
|
||||
#pragma once
|
||||
|
||||
#include <vector>
|
||||
@@ -35,7 +35,7 @@
|
||||
namespace atrip {
|
||||
// Prolog:1 ends here
|
||||
|
||||
// [[file:../../atrip.org::*Tuples types][Tuples types:1]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Tuples%20types][Tuples types:1]]
|
||||
using ABCTuple = std::array<size_t, 3>;
|
||||
using PartialTuple = std::array<size_t, 2>;
|
||||
using ABCTuples = std::vector<ABCTuple>;
|
||||
@@ -44,23 +44,22 @@ constexpr ABCTuple FAKE_TUPLE = {0, 0, 0};
|
||||
constexpr ABCTuple INVALID_TUPLE = {1, 1, 1};
|
||||
// Tuples types:1 ends here
|
||||
|
||||
// [[file:../../atrip.org::*Distributing the tuples][Distributing the tuples:1]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Distributing%20the%20tuples][Distributing the tuples:1]]
|
||||
struct TuplesDistribution {
|
||||
virtual ABCTuples getTuples(size_t Nv, MPI_Comm universe) = 0;
|
||||
virtual bool tupleIsFake(ABCTuple const& t) { return t == FAKE_TUPLE; }
|
||||
};
|
||||
// Distributing the tuples:1 ends here
|
||||
|
||||
// [[file:../../atrip.org::*Node information][Node information:1]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Node%20information][Node information:1]]
|
||||
std::vector<std::string> getNodeNames(MPI_Comm comm){
|
||||
int rank, np;
|
||||
MPI_Comm_rank(comm, &rank);
|
||||
MPI_Comm_size(comm, &np);
|
||||
|
||||
std::vector<std::string> nodeList(np);
|
||||
char nodeName[MPI_MAX_PROCESSOR_NAME]
|
||||
, nodeNames[np*MPI_MAX_PROCESSOR_NAME]
|
||||
;
|
||||
char nodeName[MPI_MAX_PROCESSOR_NAME];
|
||||
char *nodeNames = (char*)malloc(np * MPI_MAX_PROCESSOR_NAME);
|
||||
std::vector<int> nameLengths(np)
|
||||
, off(np)
|
||||
;
|
||||
@@ -87,11 +86,12 @@ std::vector<std::string> getNodeNames(MPI_Comm comm){
|
||||
std::string const s(&nodeNames[off[i]], nameLengths[i]);
|
||||
nodeList[i] = s;
|
||||
}
|
||||
std::free(nodeNames);
|
||||
return nodeList;
|
||||
}
|
||||
// Node information:1 ends here
|
||||
|
||||
// [[file:../../atrip.org::*Node information][Node information:2]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Node%20information][Node information:2]]
|
||||
struct RankInfo {
|
||||
const std::string name;
|
||||
const size_t nodeId;
|
||||
@@ -154,7 +154,7 @@ getClusterInfo(MPI_Comm comm) {
|
||||
}
|
||||
// Node information:2 ends here
|
||||
|
||||
// [[file:../../atrip.org::*Naive list][Naive list:1]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Naive%20list][Naive list:1]]
|
||||
ABCTuples getTuplesList(size_t Nv, size_t rank, size_t np) {
|
||||
|
||||
const size_t
|
||||
@@ -188,7 +188,7 @@ ABCTuples getTuplesList(size_t Nv, size_t rank, size_t np) {
|
||||
}
|
||||
// Naive list:1 ends here
|
||||
|
||||
// [[file:../../atrip.org::*Naive list][Naive list:2]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Naive%20list][Naive list:2]]
|
||||
ABCTuples getAllTuplesList(const size_t Nv) {
|
||||
const size_t n = Nv * (Nv + 1) * (Nv + 2) / 6 - Nv;
|
||||
ABCTuples result(n);
|
||||
@@ -204,7 +204,7 @@ ABCTuples getAllTuplesList(const size_t Nv) {
|
||||
}
|
||||
// Naive list:2 ends here
|
||||
|
||||
// [[file:../../atrip.org::*Naive list][Naive list:3]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Naive%20list][Naive list:3]]
|
||||
struct NaiveDistribution : public TuplesDistribution {
|
||||
ABCTuples getTuples(size_t Nv, MPI_Comm universe) override {
|
||||
int rank, np;
|
||||
@@ -215,11 +215,11 @@ struct NaiveDistribution : public TuplesDistribution {
|
||||
};
|
||||
// Naive list:3 ends here
|
||||
|
||||
// [[file:../../atrip.org::*Prolog][Prolog:1]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Prolog][Prolog:1]]
|
||||
namespace group_and_sort {
|
||||
// Prolog:1 ends here
|
||||
|
||||
// [[file:../../atrip.org::*Utils][Utils:1]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Utils][Utils:1]]
|
||||
// Provides the node on which the slice-element is found
|
||||
// Right now we distribute the slices in a round robin fashion
|
||||
// over the different nodes (NOTE: not mpi ranks but nodes)
|
||||
@@ -244,7 +244,7 @@ struct Info {
|
||||
};
|
||||
// Utils:1 ends here
|
||||
|
||||
// [[file:../../atrip.org::*Distribution][Distribution:1]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Distribution][Distribution:1]]
|
||||
ABCTuples specialDistribution(Info const& info, ABCTuples const& allTuples) {
|
||||
|
||||
ABCTuples nodeTuples;
|
||||
@@ -426,7 +426,7 @@ ABCTuples specialDistribution(Info const& info, ABCTuples const& allTuples) {
|
||||
}
|
||||
// Distribution:1 ends here
|
||||
|
||||
// [[file:../../atrip.org::*Main][Main:1]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Main][Main:1]]
|
||||
std::vector<ABCTuple> main(MPI_Comm universe, size_t Nv) {
|
||||
|
||||
int rank, np;
|
||||
@@ -466,7 +466,7 @@ std::vector<ABCTuple> main(MPI_Comm universe, size_t Nv) {
|
||||
MPI_Comm_split(universe, color, key, &INTRA_COMM);
|
||||
// Main:1 ends here
|
||||
|
||||
// [[file:../../atrip.org::*Main][Main:2]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Main][Main:2]]
|
||||
size_t const
|
||||
tuplesPerRankLocal
|
||||
= nodeTuples.size() / nodeInfos[rank].ranksPerNode
|
||||
@@ -494,7 +494,7 @@ LOG(1,"Atrip") << "ranks per node " << nodeInfos[rank].ranksPerNode << "\n";
|
||||
LOG(1,"Atrip") << "#nodes " << nNodes << "\n";
|
||||
// Main:2 ends here
|
||||
|
||||
// [[file:../../atrip.org::*Main][Main:3]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Main][Main:3]]
|
||||
size_t const totalTuples
|
||||
= tuplesPerRankGlobal * nodeInfos[rank].ranksPerNode;
|
||||
|
||||
@@ -506,7 +506,7 @@ if (computeDistribution) {
|
||||
}
|
||||
// Main:3 ends here
|
||||
|
||||
// [[file:../../atrip.org::*Main][Main:4]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Main][Main:4]]
|
||||
{
|
||||
// construct mpi type for abctuple
|
||||
MPI_Datatype MPI_ABCTUPLE;
|
||||
@@ -530,13 +530,13 @@ if (computeDistribution) {
|
||||
}
|
||||
// Main:4 ends here
|
||||
|
||||
// [[file:../../atrip.org::*Main][Main:5]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Main][Main:5]]
|
||||
return result;
|
||||
|
||||
}
|
||||
// Main:5 ends here
|
||||
|
||||
// [[file:../../atrip.org::*Interface][Interface:1]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Interface][Interface:1]]
|
||||
struct Distribution : public TuplesDistribution {
|
||||
ABCTuples getTuples(size_t Nv, MPI_Comm universe) override {
|
||||
return main(universe, Nv);
|
||||
@@ -544,10 +544,10 @@ struct Distribution : public TuplesDistribution {
|
||||
};
|
||||
// Interface:1 ends here
|
||||
|
||||
// [[file:../../atrip.org::*Epilog][Epilog:1]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Epilog][Epilog:1]]
|
||||
} // namespace group_and_sort
|
||||
// Epilog:1 ends here
|
||||
|
||||
// [[file:../../atrip.org::*Epilog][Epilog:1]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Epilog][Epilog:1]]
|
||||
}
|
||||
// Epilog:1 ends here
|
||||
|
||||
60
include/atrip/Types.hpp
Normal file
60
include/atrip/Types.hpp
Normal file
@@ -0,0 +1,60 @@
|
||||
// Copyright 2022 Alejandro Gallo
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// [[file:~/cuda/atrip/atrip.org::*Data%20pointer][Data pointer:1]]
|
||||
#pragma once
|
||||
#include <atrip/Complex.hpp>
|
||||
#include <atrip/Atrip.hpp>
|
||||
|
||||
namespace atrip {
|
||||
|
||||
template <typename F>
|
||||
struct DataField;
|
||||
|
||||
template <>
|
||||
struct DataField<double> {
|
||||
using type = double;
|
||||
};
|
||||
|
||||
#if defined(HAVE_CUDA)
|
||||
|
||||
template <typename F>
|
||||
using DataPtr = CUdeviceptr;
|
||||
#define DataNullPtr 0x00
|
||||
|
||||
template <>
|
||||
struct DataField<Complex> {
|
||||
using type = cuDoubleComplex;
|
||||
};
|
||||
|
||||
|
||||
#else
|
||||
|
||||
template <typename F>
|
||||
using DataPtr = F*;
|
||||
#define DataNullPtr nullptr
|
||||
|
||||
template <>
|
||||
struct DataField<Complex> {
|
||||
using type = Complex;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
template <typename F>
|
||||
using DataFieldType = typename DataField<F>::type;
|
||||
|
||||
}
|
||||
// Data pointer:1 ends here
|
||||
@@ -12,7 +12,7 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// [[file:../../atrip.org::*Unions][Unions:1]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Unions][Unions:1]]
|
||||
#pragma once
|
||||
#include <atrip/SliceUnion.hpp>
|
||||
|
||||
@@ -65,8 +65,7 @@ namespace atrip {
|
||||
, size_t np
|
||||
, MPI_Comm child_world
|
||||
, MPI_Comm global_world
|
||||
) : SliceUnion<F>( sourceTensor
|
||||
, {Slice<F>::A, Slice<F>::B, Slice<F>::C}
|
||||
) : SliceUnion<F>( {Slice<F>::A, Slice<F>::B, Slice<F>::C}
|
||||
, {Nv, No, No} // size of the slices
|
||||
, {Nv}
|
||||
, np
|
||||
@@ -103,8 +102,7 @@ namespace atrip {
|
||||
, size_t np
|
||||
, MPI_Comm child_world
|
||||
, MPI_Comm global_world
|
||||
) : SliceUnion<F>( sourceTensor
|
||||
, {Slice<F>::A, Slice<F>::B, Slice<F>::C}
|
||||
) : SliceUnion<F>( {Slice<F>::A, Slice<F>::B, Slice<F>::C}
|
||||
, {No, No, No} // size of the slices
|
||||
, {Nv} // size of the parametrization
|
||||
, np
|
||||
@@ -138,8 +136,7 @@ namespace atrip {
|
||||
, size_t np
|
||||
, MPI_Comm child_world
|
||||
, MPI_Comm global_world
|
||||
) : SliceUnion<F>( sourceTensor
|
||||
, { Slice<F>::AB, Slice<F>::BC, Slice<F>::AC
|
||||
) : SliceUnion<F>( { Slice<F>::AB, Slice<F>::BC, Slice<F>::AC
|
||||
, Slice<F>::BA, Slice<F>::CB, Slice<F>::CA
|
||||
}
|
||||
, {Nv, No} // size of the slices
|
||||
@@ -179,8 +176,7 @@ namespace atrip {
|
||||
, size_t np
|
||||
, MPI_Comm child_world
|
||||
, MPI_Comm global_world
|
||||
) : SliceUnion<F>( sourceTensor
|
||||
, {Slice<F>::AB, Slice<F>::BC, Slice<F>::AC}
|
||||
) : SliceUnion<F>( {Slice<F>::AB, Slice<F>::BC, Slice<F>::AC}
|
||||
, {No, No} // size of the slices
|
||||
, {Nv, Nv} // size of the parametrization
|
||||
, np
|
||||
@@ -219,8 +215,7 @@ namespace atrip {
|
||||
, size_t np
|
||||
, MPI_Comm child_world
|
||||
, MPI_Comm global_world
|
||||
) : SliceUnion<F>( sourceTensor
|
||||
, {Slice<F>::AB, Slice<F>::BC, Slice<F>::AC}
|
||||
) : SliceUnion<F>( {Slice<F>::AB, Slice<F>::BC, Slice<F>::AC}
|
||||
, {No, No} // size of the slices
|
||||
, {Nv, Nv} // size of the parametrization
|
||||
, np
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// [[file:../../atrip.org::*Prolog][Prolog:1]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Prolog][Prolog:1]]
|
||||
#pragma once
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
@@ -29,11 +29,14 @@
|
||||
|
||||
#include <atrip/Debug.hpp>
|
||||
|
||||
|
||||
namespace atrip {
|
||||
// Prolog:1 ends here
|
||||
|
||||
// [[file:../../atrip.org::*Pretty printing][Pretty printing:1]]
|
||||
template <typename T>
|
||||
// [[file:~/cuda/atrip/atrip.org::*Pretty%20printing][Pretty printing:1]]
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wunused-parameter"
|
||||
template <typename T>
|
||||
std::string pretty_print(T&& value) {
|
||||
std::stringstream stream;
|
||||
#if ATRIP_DEBUG > 2
|
||||
@@ -41,9 +44,10 @@ template <typename T>
|
||||
#endif
|
||||
return stream.str();
|
||||
}
|
||||
#pragma GCC diagnostic pop
|
||||
// Pretty printing:1 ends here
|
||||
|
||||
// [[file:../../atrip.org::*Chrono][Chrono:1]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Chrono][Chrono:1]]
|
||||
#define WITH_CHRONO(__chrono_name, ...) \
|
||||
Atrip::chrono[__chrono_name].start(); \
|
||||
__VA_ARGS__ \
|
||||
@@ -62,6 +66,6 @@ struct Timer {
|
||||
using Timings = std::map<std::string, Timer>;
|
||||
// Chrono:1 ends here
|
||||
|
||||
// [[file:../../atrip.org::*Epilog][Epilog:1]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Epilog][Epilog:1]]
|
||||
}
|
||||
// Epilog:1 ends here
|
||||
|
||||
Reference in New Issue
Block a user