Changes in source files, makes cuda run
This commit is contained in:
@@ -2,26 +2,25 @@ AUTOMAKE_OPTIONS = subdir-objects
|
||||
include $(top_srcdir)/atrip.mk
|
||||
|
||||
AM_CXXFLAGS = $(CTF_CPPFLAGS) -fmax-errors=1
|
||||
AM_CPPFLAGS = $(CTF_CPPFLAGS)
|
||||
|
||||
lib_LIBRARIES = libatrip.a
|
||||
|
||||
libatrip_a_CPPFLAGS = -I$(top_srcdir)/include/
|
||||
libatrip_a_SOURCES = ./atrip/Atrip.cxx \
|
||||
./atrip/Blas.cxx
|
||||
|
||||
libatrip_a_SOURCES = ./atrip/Blas.cxx
|
||||
NVCC_FILES = ./atrip/Equations.cxx ./atrip/Complex.cxx ./atrip/Atrip.cxx
|
||||
|
||||
if WITH_CUDA
|
||||
NVCC_FILES = ./atrip/Equations.cxx ./atrip/Complex.cxx
|
||||
NVCC_OBJS = $(patsubst %.cxx,%.nvcc.o,$(NVCC_FILES))
|
||||
libatrip_a_CPPFLAGS += $(CUDA_CXXFLAGS)
|
||||
libatrip_a_DEPENDENCIES = $(NVCC_OBJS)
|
||||
libatrip_a_LIBADD = $(NVCC_OBJS)
|
||||
%.nvcc.o: %.cxx
|
||||
$(NVCC) -c -I../ $(CPPFLAGS) $(libatrip_a_CPPFLAGS) $< -o $@
|
||||
$(NVCC) -c -x cu -I../ $(CPPFLAGS) $(CTF_CPPFLAGS) $(DEFS) $(libatrip_a_CPPFLAGS) $< -o $@
|
||||
|
||||
#./atrip/Equations.o: ./atrip/Equations.cxx
|
||||
# $(NVCC) -c -I../ $(CPPFLAGS) $(libatrip_a_CPPFLAGS) $< -o $@
|
||||
else
|
||||
libatrip_a_SOURCES += ./atrip/Equations.cxx
|
||||
libatrip_a_SOURCES += $(NVCC_FILES)
|
||||
endif
|
||||
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// [[file:../../atrip.org::*Main][Main:1]]
|
||||
// [[file:~/cuda/atrip/atrip.org::*Main][Main:1]]
|
||||
#include <iomanip>
|
||||
|
||||
#include <atrip/Atrip.hpp>
|
||||
@@ -23,12 +23,24 @@
|
||||
#include <atrip/Checkpoint.hpp>
|
||||
|
||||
using namespace atrip;
|
||||
#if defined(HAVE_CUDA)
|
||||
|
||||
namespace atrip {
|
||||
namespace cuda {
|
||||
|
||||
};
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
template <typename F> bool RankMap<F>::RANK_ROUND_ROBIN;
|
||||
template bool RankMap<double>::RANK_ROUND_ROBIN;
|
||||
template bool RankMap<Complex>::RANK_ROUND_ROBIN;
|
||||
int Atrip::rank;
|
||||
int Atrip::np;
|
||||
size_t Atrip::rank;
|
||||
size_t Atrip::np;
|
||||
#if defined(HAVE_CUDA)
|
||||
typename Atrip::CudaContext Atrip::cuda;
|
||||
#endif
|
||||
MPI_Comm Atrip::communicator;
|
||||
Timings Atrip::chrono;
|
||||
|
||||
@@ -40,15 +52,20 @@ void atrip::registerIterationDescriptor(IterationDescriptor d) {
|
||||
|
||||
void Atrip::init(MPI_Comm world) {
|
||||
Atrip::communicator = world;
|
||||
MPI_Comm_rank(world, &Atrip::rank);
|
||||
MPI_Comm_size(world, &Atrip::np);
|
||||
MPI_Comm_rank(world, (int*)&Atrip::rank);
|
||||
MPI_Comm_size(world, (int*)&Atrip::np);
|
||||
|
||||
#if defined(HAVE_CUDA)
|
||||
Atrip::cuda.status = cublasCreate(&Atrip::cuda.handle);
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
template <typename F>
|
||||
Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
|
||||
|
||||
const int np = Atrip::np;
|
||||
const int rank = Atrip::rank;
|
||||
const size_t np = Atrip::np;
|
||||
const size_t rank = Atrip::rank;
|
||||
MPI_Comm universe = Atrip::communicator;
|
||||
|
||||
const size_t No = in.ei->lens[0];
|
||||
@@ -58,18 +75,35 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
|
||||
LOG(0,"Atrip") << "np: " << np << "\n";
|
||||
|
||||
// allocate the three scratches, see piecuch
|
||||
std::vector<F> Tijk(No*No*No) // doubles only (see piecuch)
|
||||
, Zijk(No*No*No) // singles + doubles (see piecuch)
|
||||
// we need local copies of the following tensors on every
|
||||
// rank
|
||||
, epsi(No)
|
||||
, epsa(Nv)
|
||||
, Tai(No * Nv)
|
||||
// we need local copies of the following tensors on every
|
||||
// rank
|
||||
std::vector<F> _epsi(No)
|
||||
, _epsa(Nv)
|
||||
, _Tai(No * Nv)
|
||||
;
|
||||
|
||||
in.ei->read_all(epsi.data());
|
||||
in.ea->read_all(epsa.data());
|
||||
in.Tph->read_all(Tai.data());
|
||||
in.ei->read_all(_epsi.data());
|
||||
in.ea->read_all(_epsa.data());
|
||||
in.Tph->read_all(_Tai.data());
|
||||
|
||||
#if defined(HAVE_CUDA)
|
||||
DataPtr<F> Tai, epsi, epsa;
|
||||
cuMemAlloc(&Tai, sizeof(F) * _Tai.size());
|
||||
cuMemAlloc(&epsi, sizeof(F) * _epsi.size());
|
||||
cuMemAlloc(&epsa, sizeof(F) * _epsa.size());
|
||||
|
||||
cuMemcpyHtoD(Tai, (void*)_Tai.data(), sizeof(F) * _Tai.size());
|
||||
cuMemcpyHtoD(epsi,(void*)_epsi.data(), sizeof(F) * _epsi.size());
|
||||
cuMemcpyHtoD(epsa, (void*)_epsa.data(), sizeof(F) * _epsa.size());
|
||||
|
||||
DataPtr<F> Tijk, Zijk;
|
||||
//TODO: free memory
|
||||
cuMemAlloc(&Tijk, sizeof(F) * No * No * No);
|
||||
cuMemAlloc(&Zijk, sizeof(F) * No * No * No);
|
||||
#else
|
||||
std::vector<F> &Tai = _Tai, &epsi = _epsi, &epsa = _epsa;
|
||||
std::vector<F> Tijk(No*No*No), Zijk(No*No*No);
|
||||
#endif
|
||||
|
||||
RankMap<F>::RANK_ROUND_ROBIN = in.rankRoundRobin;
|
||||
if (RankMap<F>::RANK_ROUND_ROBIN) {
|
||||
@@ -139,7 +173,6 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
|
||||
)
|
||||
const size_t nIterations = tuplesList.size();
|
||||
{
|
||||
const size_t _all_tuples = Nv * (Nv + 1) * (Nv + 2) / 6 - Nv;
|
||||
LOG(0,"Atrip") << "#iterations: "
|
||||
<< nIterations
|
||||
<< "/"
|
||||
@@ -338,6 +371,9 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
|
||||
}
|
||||
}
|
||||
|
||||
LOG(0, "AtripCUDA") << "Starting iterations\n";
|
||||
|
||||
|
||||
for ( size_t
|
||||
i = first_iteration,
|
||||
iteration = first_iteration + 1
|
||||
@@ -346,6 +382,8 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
|
||||
) {
|
||||
Atrip::chrono["iterations"].start();
|
||||
|
||||
LOG(0, "AtripCUDA") << "iteration " << i << "\n";
|
||||
|
||||
// check overhead from chrono over all iterations
|
||||
WITH_CHRONO("start:stop", {})
|
||||
|
||||
@@ -355,8 +393,10 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
|
||||
if (in.barrier) MPI_Barrier(universe);
|
||||
))
|
||||
|
||||
|
||||
// write checkpoints
|
||||
if (iteration % checkpoint_mod == 0) {
|
||||
if (iteration % checkpoint_mod == 0 && false) {
|
||||
LOG(0, "AtripCUDA") << "checkpoints \n";
|
||||
double globalEnergy = 0;
|
||||
MPI_Reduce(&energy, &globalEnergy, 1, MPI_DOUBLE, MPI_SUM, 0, universe);
|
||||
Checkpoint out
|
||||
@@ -368,9 +408,10 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
|
||||
iteration - 1,
|
||||
in.rankRoundRobin};
|
||||
LOG(0, "Atrip") << "Writing checkpoint\n";
|
||||
if (Atrip::rank == 0) write_checkpoint(out, in.checkpointPath);
|
||||
//if (Atrip::rank == 0) write_checkpoint(out, in.checkpointPath);
|
||||
}
|
||||
|
||||
LOG(0, "AtripCUDA") << "reporting \n";
|
||||
// write reporting
|
||||
if (iteration % iterationMod == 0 || iteration == iteration1Percent) {
|
||||
|
||||
@@ -417,9 +458,11 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
|
||||
<< "\n";
|
||||
)
|
||||
|
||||
LOG(0, "AtripCUDA") << "first database " << i << "\n";
|
||||
|
||||
// COMM FIRST DATABASE ================================================{{{1
|
||||
if (i == first_iteration) {
|
||||
LOG(0, "AtripCUDA") << "first database " << i << "\n";
|
||||
WITH_RANK << "__first__:first database ............ \n";
|
||||
const auto db = communicateDatabase(abc, universe);
|
||||
WITH_RANK << "__first__:first database communicated \n";
|
||||
@@ -432,6 +475,8 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
|
||||
MPI_Barrier(universe);
|
||||
}
|
||||
|
||||
LOG(0, "AtripCUDA") << "next database" << i << "\n";
|
||||
|
||||
// COMM NEXT DATABASE ================================================={{{1
|
||||
if (abcNext) {
|
||||
WITH_RANK << "__comm__:" << iteration << "th communicating database\n";
|
||||
@@ -447,6 +492,9 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
|
||||
// COMPUTE DOUBLES ===================================================={{{1
|
||||
OCD_Barrier(universe);
|
||||
if (!isFakeTuple(i)) {
|
||||
|
||||
LOG(0, "AtripCUDA") << "computing doubles " << i << "\n";
|
||||
|
||||
WITH_RANK << iteration << "-th doubles\n";
|
||||
WITH_CHRONO("oneshot-unwrap",
|
||||
WITH_CHRONO("unwrap",
|
||||
@@ -478,7 +526,11 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
|
||||
, tabhh.unwrapSlice(Slice<F>::AC, abc)
|
||||
, tabhh.unwrapSlice(Slice<F>::BC, abc)
|
||||
// -- TIJK
|
||||
#if defined(HAVE_CUDA)
|
||||
, (DataFieldType<F>*)Tijk
|
||||
#else
|
||||
, Tijk.data()
|
||||
#endif
|
||||
);
|
||||
WITH_RANK << iteration << "-th doubles done\n";
|
||||
))
|
||||
@@ -493,16 +545,35 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
|
||||
abhh.unwrapAll(abc);
|
||||
)))
|
||||
WITH_CHRONO("reorder",
|
||||
for (size_t I(0); I < Zijk.size(); I++) Zijk[I] = Tijk[I];
|
||||
LOG(0, "AtripCUDA") << "reorder singles" << i << "\n";
|
||||
atrip::xcopy<F>(No*No*No,
|
||||
#if defined(HAVE_CUDA)
|
||||
(DataFieldType<F>*)Tijk, 1,
|
||||
(DataFieldType<F>*)Zijk, 1);
|
||||
#else
|
||||
(DataFieldType<F>*)Tijk.data(), 1,
|
||||
(DataFieldType<F>*)Zijk.data(), 1);
|
||||
#endif
|
||||
)
|
||||
WITH_CHRONO("singles",
|
||||
singlesContribution<F>( No, Nv, abc
|
||||
LOG(0, "AtripCUDA") << "doing singles" << i << "\n";
|
||||
#if defined(HAVE_CUDA)
|
||||
singlesContribution<F><<<1,1>>>( No, Nv, abc[0], abc[1], abc[2]
|
||||
, (DataFieldType<F>*)Tai
|
||||
#else
|
||||
singlesContribution<F>( No, Nv, abc[0], abc[1], abc[2]
|
||||
, Tai.data()
|
||||
, abhh.unwrapSlice(Slice<F>::AB, abc)
|
||||
, abhh.unwrapSlice(Slice<F>::AC, abc)
|
||||
, abhh.unwrapSlice(Slice<F>::BC, abc)
|
||||
#endif
|
||||
, (DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::AB, abc)
|
||||
, (DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::AC, abc)
|
||||
, (DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::BC, abc)
|
||||
#if defined(HAVE_CUDA)
|
||||
, (DataFieldType<F>*)Zijk);
|
||||
#else
|
||||
, Zijk.data());
|
||||
#endif
|
||||
)
|
||||
LOG(0, "AtripCUDA") << "singles done" << i << "\n";
|
||||
}
|
||||
|
||||
|
||||
@@ -513,13 +584,17 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
|
||||
int distinct(0);
|
||||
if (abc[0] == abc[1]) distinct++;
|
||||
if (abc[1] == abc[2]) distinct--;
|
||||
const F epsabc(epsa[abc[0]] + epsa[abc[1]] + epsa[abc[2]]);
|
||||
const F epsabc(_epsa[abc[0]] + _epsa[abc[1]] + _epsa[abc[2]]);
|
||||
|
||||
LOG(0, "AtripCUDA") << "doing energy " << i << "distinct " << distinct << "\n";
|
||||
WITH_CHRONO("energy",
|
||||
/*
|
||||
TODO: think about how to do this on the GPU in the best way possible
|
||||
if ( distinct == 0)
|
||||
tupleEnergy = getEnergyDistinct<F>(epsabc, epsi, Tijk, Zijk);
|
||||
tupleEnergy = getEnergyDistinct<F>(epsabc, No, (F*)epsi, (F*)Tijk, (F*)Zijk);
|
||||
else
|
||||
tupleEnergy = getEnergySame<F>(epsabc, epsi, Tijk, Zijk);
|
||||
tupleEnergy = getEnergySame<F>(epsabc, No, (F*)epsi, (F*)Tijk, (F*)Zijk);
|
||||
*/
|
||||
)
|
||||
|
||||
#if defined(HAVE_OCD) || defined(ATRIP_PRINT_TUPLES)
|
||||
|
||||
141
src/atrip/Blas.cxx
Normal file
141
src/atrip/Blas.cxx
Normal file
@@ -0,0 +1,141 @@
|
||||
// Copyright 2022 Alejandro Gallo
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// [[file:~/cuda/atrip/atrip.org::*Blas][Blas:2]]
|
||||
#include <atrip/Blas.hpp>
|
||||
#include <atrip/Atrip.hpp>
|
||||
|
||||
#if defined(HAVE_CUDA)
|
||||
# include <cstring>
|
||||
|
||||
static inline
|
||||
cublasOperation_t char_to_cublasOperation(const char* trans) {
|
||||
if (strncmp("N", trans, 1) == 0)
|
||||
return CUBLAS_OP_N;
|
||||
else if (strncmp("T", trans, 1) == 0)
|
||||
return CUBLAS_OP_T;
|
||||
else
|
||||
return CUBLAS_OP_C;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
namespace atrip {
|
||||
|
||||
|
||||
template <>
|
||||
void xgemm<double>(const char *transa,
|
||||
const char *transb,
|
||||
const int *m,
|
||||
const int *n,
|
||||
const int *k,
|
||||
double *alpha,
|
||||
const typename DataField<double>::type *A,
|
||||
const int *lda,
|
||||
const typename DataField<double>::type *B,
|
||||
const int *ldb,
|
||||
double *beta,
|
||||
typename DataField<double>::type *C,
|
||||
const int *ldc) {
|
||||
#if defined(HAVE_CUDA)
|
||||
cublasDgemm(Atrip::cuda.handle,
|
||||
char_to_cublasOperation(transa),
|
||||
char_to_cublasOperation(transb),
|
||||
*m, *n, *k,
|
||||
alpha, A, *lda,
|
||||
B, *ldb, beta,
|
||||
C, *ldc);
|
||||
#else
|
||||
dgemm_(transa, transb,
|
||||
m, n, k,
|
||||
alpha, A, lda,
|
||||
B, ldb, beta,
|
||||
C, ldc);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
void xgemm<Complex>(const char *transa,
|
||||
const char *transb,
|
||||
const int *m,
|
||||
const int *n,
|
||||
const int *k,
|
||||
Complex *alpha,
|
||||
const typename DataField<Complex>::type *A,
|
||||
const int *lda,
|
||||
const typename DataField<Complex>::type *B,
|
||||
const int *ldb,
|
||||
Complex *beta,
|
||||
typename DataField<Complex>::type *C,
|
||||
const int *ldc) {
|
||||
#if defined(HAVE_CUDA)
|
||||
#pragma warning HAVE_CUDA
|
||||
cuDoubleComplex
|
||||
cu_alpha = {std::real(*alpha), std::imag(*alpha)},
|
||||
cu_beta = {std::real(*beta), std::imag(*beta)};
|
||||
cublasZgemm(Atrip::cuda.handle,
|
||||
char_to_cublasOperation(transa),
|
||||
char_to_cublasOperation(transb),
|
||||
*m, *n, *k,
|
||||
&cu_alpha,
|
||||
|
||||
A, *lda,
|
||||
B, *ldb,
|
||||
&cu_beta,
|
||||
C, *ldc);
|
||||
#else
|
||||
zgemm_(transa, transb,
|
||||
m, n, k,
|
||||
alpha, A, lda,
|
||||
B, ldb, beta,
|
||||
C, ldc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
template <>
|
||||
void xcopy<double>(const int n,
|
||||
const DataFieldType<double>* x,
|
||||
const int incx,
|
||||
DataFieldType<double>* y,
|
||||
const int incy) {
|
||||
#if defined(HAVE_CUDA)
|
||||
cublasDcopy(Atrip::cuda.handle,
|
||||
n,
|
||||
x, incx,
|
||||
y, incy);
|
||||
#else
|
||||
dcopy_(n, x, incx, y, incy);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
void xcopy<Complex>(const int n,
|
||||
const DataFieldType<Complex>* x,
|
||||
const int incx,
|
||||
DataFieldType<Complex>* y,
|
||||
const int incy) {
|
||||
#if defined(HAVE_CUDA)
|
||||
cublasZcopy(Atrip::cuda.handle,
|
||||
n,
|
||||
x, incx,
|
||||
y, incy);
|
||||
#else
|
||||
zcopy_(n, x, incx, y, incy);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
// Blas:2 ends here
|
||||
40
src/atrip/Complex.cxx
Normal file
40
src/atrip/Complex.cxx
Normal file
@@ -0,0 +1,40 @@
|
||||
// Copyright 2022 Alejandro Gallo
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// [[file:~/cuda/atrip/atrip.org::*Complex%20numbers][Complex numbers:2]]
|
||||
#include <atrip/Complex.hpp>
|
||||
#include <atrip/CUDA.hpp>
|
||||
|
||||
namespace atrip {
|
||||
|
||||
template <> double maybeConjugate(const double a) { return a; }
|
||||
template <> Complex maybeConjugate(const Complex a) { return std::conj(a); }
|
||||
|
||||
#if defined(HAVE_CUDA)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
namespace traits {
|
||||
template <typename F> bool isComplex() { return false; }
|
||||
template <> bool isComplex<double>() { return false; }
|
||||
template <> bool isComplex<Complex>() { return true; }
|
||||
namespace mpi {
|
||||
template <> MPI_Datatype datatypeOf<double>() { return MPI_DOUBLE; }
|
||||
template <> MPI_Datatype datatypeOf<Complex>() { return MPI_DOUBLE_COMPLEX; }
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
// Complex numbers:2 ends here
|
||||
720
src/atrip/Equations.cxx
Normal file
720
src/atrip/Equations.cxx
Normal file
@@ -0,0 +1,720 @@
|
||||
// Copyright 2022 Alejandro Gallo
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// [[file:~/cuda/atrip/atrip.org::*Prolog][Prolog:2]]
|
||||
#include<atrip/Equations.hpp>
|
||||
|
||||
#if defined(HAVE_CUDA)
|
||||
#include <cuda.h>
|
||||
#endif
|
||||
|
||||
namespace atrip {
|
||||
// Prolog:2 ends here
|
||||
|
||||
#ifdef HAVE_CUDA
|
||||
namespace cuda {
|
||||
|
||||
// cuda kernels
|
||||
|
||||
template <typename F>
|
||||
__global__
|
||||
void zeroing(F* a, size_t n) {
|
||||
F zero = {0};
|
||||
for (size_t i = 0; i < n; i++) {
|
||||
a[i] = zero;
|
||||
}
|
||||
}
|
||||
|
||||
////
|
||||
template <typename F>
|
||||
__device__
|
||||
F maybeConjugateScalar(const F a);
|
||||
|
||||
template <>
|
||||
__device__
|
||||
double maybeConjugateScalar(const double a) { return a; }
|
||||
|
||||
template <>
|
||||
__device__
|
||||
cuDoubleComplex
|
||||
maybeConjugateScalar(const cuDoubleComplex a) {
|
||||
return {a.x, -a.y};
|
||||
}
|
||||
|
||||
template <typename F>
|
||||
__global__
|
||||
void maybeConjugate(F* to, F* from, size_t n) {
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
to[i] = maybeConjugateScalar<F>(from[i]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <typename F>
|
||||
__global__
|
||||
void reorder(F* to, F* from, size_t size, size_t I, size_t J, size_t K) {
|
||||
size_t idx = 0;
|
||||
const size_t IDX = I + J*size + K*size*size;
|
||||
for (size_t k = 0; k < size; k++)
|
||||
for (size_t j = 0; j < size; j++)
|
||||
for (size_t i = 0; i < size; i++, idx++)
|
||||
to[idx] += from[IDX];
|
||||
}
|
||||
|
||||
// I mean, really CUDA... really!?
|
||||
template <typename F>
|
||||
__device__
|
||||
F multiply(const F &a, const F &b);
|
||||
template <>
|
||||
__device__
|
||||
double multiply(const double &a, const double &b) { return a * b; }
|
||||
|
||||
template <>
|
||||
__device__
|
||||
cuDoubleComplex multiply(const cuDoubleComplex &a, const cuDoubleComplex &b) {
|
||||
return
|
||||
{a.x * b.x - a.y * b.y,
|
||||
a.x * b.y + a.y * b.x};
|
||||
}
|
||||
|
||||
template <typename F>
|
||||
__device__
|
||||
void sum_in_place(F* to, const F* b);
|
||||
|
||||
template <>
|
||||
__device__
|
||||
void sum_in_place(double* to, const double *b) { *to += *b; }
|
||||
|
||||
template <>
|
||||
__device__
|
||||
void sum_in_place(cuDoubleComplex* to, const cuDoubleComplex* b) {
|
||||
to->x += b->x;
|
||||
to->y += b->y;
|
||||
}
|
||||
|
||||
__device__
|
||||
cuDoubleComplex& operator+=(cuDoubleComplex& lz, cuDoubleComplex const& rz) {
|
||||
lz.x += rz.x;
|
||||
lz.y += rz.y;
|
||||
return lz;
|
||||
}
|
||||
|
||||
|
||||
|
||||
};
|
||||
#endif
|
||||
|
||||
// [[file:~/cuda/atrip/atrip.org::*Energy][Energy:2]]
|
||||
template <typename F>
|
||||
double getEnergyDistinct
|
||||
( F const epsabc
|
||||
, size_t const No
|
||||
, F* const epsi
|
||||
, F* const Tijk
|
||||
, F* const Zijk
|
||||
) {
|
||||
constexpr size_t blockSize=16;
|
||||
F energy(0.);
|
||||
for (size_t kk=0; kk<No; kk+=blockSize){
|
||||
const size_t kend( std::min(No, kk+blockSize) );
|
||||
for (size_t jj(kk); jj<No; jj+=blockSize){
|
||||
const size_t jend( std::min( No, jj+blockSize) );
|
||||
for (size_t ii(jj); ii<No; ii+=blockSize){
|
||||
const size_t iend( std::min( No, ii+blockSize) );
|
||||
for (size_t k(kk); k < kend; k++){
|
||||
const F ek(epsi[k]);
|
||||
const size_t jstart = jj > k ? jj : k;
|
||||
for (size_t j(jstart); j < jend; j++){
|
||||
F const ej(epsi[j]);
|
||||
F const facjk = j == k ? F(0.5) : F(1.0);
|
||||
size_t istart = ii > j ? ii : j;
|
||||
for (size_t i(istart); i < iend; i++){
|
||||
const F
|
||||
ei(epsi[i])
|
||||
, facij = i == j ? F(0.5) : F(1.0)
|
||||
, denominator(epsabc - ei - ej - ek)
|
||||
, U(Zijk[i + No*j + No*No*k])
|
||||
, V(Zijk[i + No*k + No*No*j])
|
||||
, W(Zijk[j + No*i + No*No*k])
|
||||
, X(Zijk[j + No*k + No*No*i])
|
||||
, Y(Zijk[k + No*i + No*No*j])
|
||||
, Z(Zijk[k + No*j + No*No*i])
|
||||
, A(maybeConjugate<F>(Tijk[i + No*j + No*No*k]))
|
||||
, B(maybeConjugate<F>(Tijk[i + No*k + No*No*j]))
|
||||
, C(maybeConjugate<F>(Tijk[j + No*i + No*No*k]))
|
||||
, D(maybeConjugate<F>(Tijk[j + No*k + No*No*i]))
|
||||
, E(maybeConjugate<F>(Tijk[k + No*i + No*No*j]))
|
||||
, _F(maybeConjugate<F>(Tijk[k + No*j + No*No*i]))
|
||||
, value
|
||||
= 3.0 * ( A * U
|
||||
+ B * V
|
||||
+ C * W
|
||||
+ D * X
|
||||
+ E * Y
|
||||
+ _F * Z )
|
||||
+ ( ( U + X + Y )
|
||||
- 2.0 * ( V + W + Z )
|
||||
) * ( A + D + E )
|
||||
+ ( ( V + W + Z )
|
||||
- 2.0 * ( U + X + Y )
|
||||
) * ( B + C + _F )
|
||||
;
|
||||
energy += 2.0 * value / denominator * facjk * facij;
|
||||
} // i
|
||||
} // j
|
||||
} // k
|
||||
} // ii
|
||||
} // jj
|
||||
} // kk
|
||||
return std::real(energy);
|
||||
}
|
||||
|
||||
|
||||
template <typename F>
|
||||
double getEnergySame
|
||||
( F const epsabc
|
||||
, size_t const No
|
||||
, F* const epsi
|
||||
, F* const Tijk
|
||||
, F* const Zijk
|
||||
) {
|
||||
constexpr size_t blockSize = 16;
|
||||
F energy = F(0.);
|
||||
for (size_t kk=0; kk<No; kk+=blockSize){
|
||||
const size_t kend( std::min( kk+blockSize, No) );
|
||||
for (size_t jj(kk); jj<No; jj+=blockSize){
|
||||
const size_t jend( std::min( jj+blockSize, No) );
|
||||
for (size_t ii(jj); ii<No; ii+=blockSize){
|
||||
const size_t iend( std::min( ii+blockSize, No) );
|
||||
for (size_t k(kk); k < kend; k++){
|
||||
const F ek(epsi[k]);
|
||||
const size_t jstart = jj > k ? jj : k;
|
||||
for(size_t j(jstart); j < jend; j++){
|
||||
const F facjk( j == k ? F(0.5) : F(1.0));
|
||||
const F ej(epsi[j]);
|
||||
const size_t istart = ii > j ? ii : j;
|
||||
for(size_t i(istart); i < iend; i++){
|
||||
const F
|
||||
ei(epsi[i])
|
||||
, facij ( i==j ? F(0.5) : F(1.0))
|
||||
, denominator(epsabc - ei - ej - ek)
|
||||
, U(Zijk[i + No*j + No*No*k])
|
||||
, V(Zijk[j + No*k + No*No*i])
|
||||
, W(Zijk[k + No*i + No*No*j])
|
||||
, A(maybeConjugate<F>(Tijk[i + No*j + No*No*k]))
|
||||
, B(maybeConjugate<F>(Tijk[j + No*k + No*No*i]))
|
||||
, C(maybeConjugate<F>(Tijk[k + No*i + No*No*j]))
|
||||
, value
|
||||
= F(3.0) * ( A * U
|
||||
+ B * V
|
||||
+ C * W
|
||||
)
|
||||
- ( A + B + C ) * ( U + V + W )
|
||||
;
|
||||
energy += F(2.0) * value / denominator * facjk * facij;
|
||||
} // i
|
||||
} // j
|
||||
} // k
|
||||
} // ii
|
||||
} // jj
|
||||
} // kk
|
||||
return std::real(energy);
|
||||
}
|
||||
// Energy:2 ends here
|
||||
|
||||
// [[file:~/cuda/atrip/atrip.org::*Energy][Energy:3]]
|
||||
// instantiate double
|
||||
template
|
||||
double getEnergyDistinct
|
||||
( double const epsabc
|
||||
, size_t const No
|
||||
, double* const epsi
|
||||
, double* const Tijk
|
||||
, double* const Zijk
|
||||
);
|
||||
|
||||
template
|
||||
double getEnergySame
|
||||
( double const epsabc
|
||||
, size_t const No
|
||||
, double* const epsi
|
||||
, double* const Tijk
|
||||
, double* const Zijk
|
||||
);
|
||||
|
||||
// instantiate Complex
|
||||
template
|
||||
double getEnergyDistinct
|
||||
( Complex const epsabc
|
||||
, size_t const No
|
||||
, Complex* const epsi
|
||||
, Complex* const Tijk
|
||||
, Complex* const Zijk
|
||||
);
|
||||
|
||||
template
|
||||
double getEnergySame
|
||||
( Complex const epsabc
|
||||
, size_t const No
|
||||
, Complex* const epsi
|
||||
, Complex* const Tijk
|
||||
, Complex* const Zijk
|
||||
);
|
||||
// Energy:3 ends here
|
||||
|
||||
// [[file:~/cuda/atrip/atrip.org::*Singles%20contribution][Singles contribution:2]]
|
||||
template <typename F>
|
||||
#ifdef HAVE_CUDA
|
||||
__global__
|
||||
#endif
|
||||
void singlesContribution
|
||||
( size_t No
|
||||
, size_t Nv
|
||||
, size_t a
|
||||
, size_t b
|
||||
, size_t c
|
||||
, DataFieldType<F>* const Tph
|
||||
, DataFieldType<F>* const VABij
|
||||
, DataFieldType<F>* const VACij
|
||||
, DataFieldType<F>* const VBCij
|
||||
, DataFieldType<F>* Zijk
|
||||
) {
|
||||
const size_t NoNo = No*No;
|
||||
// TODO: change order of for loops
|
||||
for (size_t k = 0; k < No; k++)
|
||||
for (size_t i = 0; i < No; i++)
|
||||
for (size_t j = 0; j < No; j++) {
|
||||
const size_t ijk = i + j*No + k*No*No;
|
||||
|
||||
#ifdef HAVE_CUDA
|
||||
# define GO(__TPH, __VABIJ) \
|
||||
{ \
|
||||
const DataFieldType<F> product \
|
||||
= cuda::multiply<DataFieldType<F>>((__TPH), (__VABIJ)); \
|
||||
cuda::sum_in_place<DataFieldType<F>>(&Zijk[ijk], &product); \
|
||||
}
|
||||
#else
|
||||
# define GO(__TPH, __VABIJ) Zijk[ijk] += (__TPH) * (__VABIJ);
|
||||
#endif
|
||||
GO(Tph[ a + i * Nv ], VBCij[ j + k * No ])
|
||||
GO(Tph[ b + j * Nv ], VACij[ i + k * No ])
|
||||
GO(Tph[ c + k * Nv ], VABij[ i + j * No ])
|
||||
#undef GO
|
||||
} // for loop j
|
||||
}
|
||||
|
||||
|
||||
// instantiate
|
||||
template
|
||||
#ifdef HAVE_CUDA
|
||||
__global__
|
||||
#endif
|
||||
void singlesContribution<double>( size_t No
|
||||
, size_t Nv
|
||||
, size_t a
|
||||
, size_t b
|
||||
, size_t c
|
||||
, double* const Tph
|
||||
, double* const VABij
|
||||
, double* const VACij
|
||||
, double* const VBCij
|
||||
, double* Zijk
|
||||
);
|
||||
|
||||
template
|
||||
#ifdef HAVE_CUDA
|
||||
__global__
|
||||
#endif
|
||||
void singlesContribution<Complex>( size_t No
|
||||
, size_t Nv
|
||||
, size_t a
|
||||
, size_t b
|
||||
, size_t c
|
||||
, DataFieldType<Complex>* const Tph
|
||||
, DataFieldType<Complex>* const VABij
|
||||
, DataFieldType<Complex>* const VACij
|
||||
, DataFieldType<Complex>* const VBCij
|
||||
, DataFieldType<Complex>* Zijk
|
||||
);
|
||||
// Singles contribution:2 ends here
|
||||
|
||||
// [[file:~/cuda/atrip/atrip.org::*Doubles%20contribution][Doubles contribution:2]]
|
||||
template <typename F>
|
||||
void doublesContribution
|
||||
( const ABCTuple &abc
|
||||
, size_t const No
|
||||
, size_t const Nv
|
||||
// -- VABCI
|
||||
, DataPtr<F> const VABph
|
||||
, DataPtr<F> const VACph
|
||||
, DataPtr<F> const VBCph
|
||||
, DataPtr<F> const VBAph
|
||||
, DataPtr<F> const VCAph
|
||||
, DataPtr<F> const VCBph
|
||||
// -- VHHHA
|
||||
, DataPtr<F> const VhhhA
|
||||
, DataPtr<F> const VhhhB
|
||||
, DataPtr<F> const VhhhC
|
||||
// -- TA
|
||||
, DataPtr<F> const TAphh
|
||||
, DataPtr<F> const TBphh
|
||||
, DataPtr<F> const TCphh
|
||||
// -- TABIJ
|
||||
, DataPtr<F> const TABhh
|
||||
, DataPtr<F> const TAChh
|
||||
, DataPtr<F> const TBChh
|
||||
// -- TIJK
|
||||
// , DataPtr<F> Tijk_
|
||||
, DataFieldType<F>* Tijk_
|
||||
) {
|
||||
|
||||
const size_t a = abc[0], b = abc[1], c = abc[2]
|
||||
, NoNo = No*No, NoNv = No*Nv
|
||||
;
|
||||
|
||||
typename DataField<F>::type* Tijk = (typename DataField<F>::type*) Tijk_;
|
||||
LOG(0, "AtripCUDA") << "in doubles " << "\n";
|
||||
|
||||
#if defined(ATRIP_USE_DGEMM)
|
||||
#define _IJK_(i, j, k) i + j*No + k*NoNo
|
||||
#if defined(HAVE_CUDA)
|
||||
// TODO
|
||||
#define REORDER(__II, __JJ, __KK)
|
||||
#define __TO_DEVICEPTR(_v) (_v)
|
||||
#define DGEMM_PARTICLES(__A, __B) \
|
||||
atrip::xgemm<F>("T", \
|
||||
"N", \
|
||||
(int const*)&NoNo, \
|
||||
(int const*)&No, \
|
||||
(int const*)&Nv, \
|
||||
&one, \
|
||||
(DataFieldType<F>*)__A, \
|
||||
(int const*)&Nv, \
|
||||
(DataFieldType<F>*)__B, \
|
||||
(int const*)&Nv, \
|
||||
&zero, \
|
||||
_t_buffer_p, \
|
||||
(int const*)&NoNo);
|
||||
#define DGEMM_HOLES(__A, __B, __TRANSB) \
|
||||
atrip::xgemm<F>("N", \
|
||||
__TRANSB, \
|
||||
(int const*)&NoNo, \
|
||||
(int const*)&No, \
|
||||
(int const*)&No, \
|
||||
&m_one, \
|
||||
__TO_DEVICEPTR(__A), \
|
||||
(int const*)&NoNo, \
|
||||
(DataFieldType<F>*)__B, \
|
||||
(int const*)&No, \
|
||||
&zero, \
|
||||
_t_buffer_p, \
|
||||
(int const*)&NoNo \
|
||||
);
|
||||
#define MAYBE_CONJ(_conj, _buffer) \
|
||||
cuda::maybeConjugate<<<1,1>>>((DataFieldType<F>*)_conj, (DataFieldType<F>*)_buffer, NoNoNo);
|
||||
#else
|
||||
#define REORDER(__II, __JJ, __KK) \
|
||||
WITH_CHRONO("doubles:reorder", \
|
||||
for (size_t k = 0; k < No; k++) \
|
||||
for (size_t j = 0; j < No; j++) \
|
||||
for (size_t i = 0; i < No; i++) { \
|
||||
Tijk[_IJK_(i, j, k)] += _t_buffer_p[_IJK_(__II, __JJ, __KK)]; \
|
||||
} \
|
||||
)
|
||||
#define __TO_DEVICEPTR(_v) (_v)
|
||||
#define DGEMM_PARTICLES(__A, __B) \
|
||||
atrip::xgemm<F>("T", \
|
||||
"N", \
|
||||
(int const*)&NoNo, \
|
||||
(int const*)&No, \
|
||||
(int const*)&Nv, \
|
||||
&one, \
|
||||
__A, \
|
||||
(int const*)&Nv, \
|
||||
__B, \
|
||||
(int const*)&Nv, \
|
||||
&zero, \
|
||||
_t_buffer_p, \
|
||||
(int const*)&NoNo \
|
||||
);
|
||||
#define DGEMM_HOLES(__A, __B, __TRANSB) \
|
||||
atrip::xgemm<F>("N", \
|
||||
__TRANSB, \
|
||||
(int const*)&NoNo, \
|
||||
(int const*)&No, \
|
||||
(int const*)&No, \
|
||||
&m_one, \
|
||||
__A, \
|
||||
(int const*)&NoNo, \
|
||||
__B, \
|
||||
(int const*)&No, \
|
||||
&zero, \
|
||||
_t_buffer_p, \
|
||||
(int const*)&NoNo \
|
||||
);
|
||||
#define MAYBE_CONJ(_conj, _buffer) \
|
||||
for (size_t __i = 0; __i < NoNoNo; ++__i) \
|
||||
_conj[__i] = maybeConjugate<F>(_buffer[__i]);
|
||||
#endif
|
||||
|
||||
F one{1.0}, m_one{-1.0}, zero{0.0};
|
||||
DataFieldType<F> zero_h{0.0};
|
||||
const size_t NoNoNo = No*NoNo;
|
||||
#ifdef HAVE_CUDA
|
||||
DataFieldType<F>* _t_buffer;
|
||||
DataFieldType<F>* _vhhh;
|
||||
LOG(0, "AtripCUDA") << "getting memory" << "\n";
|
||||
cuMemAlloc((CUdeviceptr*)&_t_buffer, NoNoNo * sizeof(DataFieldType<F>));
|
||||
cuMemAlloc((CUdeviceptr*)&_vhhh, NoNoNo * sizeof(DataFieldType<F>));
|
||||
LOG(0, "AtripCUDA") << "cuda::zeroing " << "\n";
|
||||
cuda::zeroing<<<1,1>>>((DataFieldType<F>*)_t_buffer, NoNoNo);
|
||||
cuda::zeroing<<<1,1>>>((DataFieldType<F>*)_vhhh, NoNoNo);
|
||||
#else
|
||||
F* _t_buffer = (F*)malloc(NoNoNo * sizeof(F));
|
||||
F* _vhhh = (F*)malloc(NoNoNo * sizeof(F));
|
||||
for (size_t i=0; i < NoNoNo; i++) {
|
||||
_t_buffer[i] = zero_h;
|
||||
_vhhh[i] = zero_h;
|
||||
}
|
||||
#endif
|
||||
//_t_buffer.reserve(NoNoNo);
|
||||
DataFieldType<F>* _t_buffer_p = __TO_DEVICEPTR(_t_buffer);
|
||||
|
||||
#ifdef HAVE_CUDA
|
||||
LOG(0, "AtripCUDA") << "cuda::zeroing Tijk" << "\n";
|
||||
cuda::zeroing<<<1,1>>>((DataFieldType<F>*)Tijk, NoNoNo);
|
||||
#else
|
||||
WITH_CHRONO("double:reorder",
|
||||
for (size_t k = 0; k < NoNoNo; k++) {
|
||||
Tijk[k] = DataFieldType<F>{0.0};
|
||||
})
|
||||
#endif
|
||||
|
||||
LOG(0, "AtripCUDA") << "doing holes" << "\n";
|
||||
// TOMERGE: replace chronos
|
||||
WITH_CHRONO("doubles:holes",
|
||||
{ // Holes part %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
|
||||
|
||||
// VhhhC[i + k*No + L*NoNo] * TABhh[L + j*No]; H1
|
||||
LOG(0, "AtripCUDA") << "conj 1" << "\n";
|
||||
MAYBE_CONJ(_vhhh, VhhhC)
|
||||
LOG(0, "AtripCUDA") << "done" << "\n";
|
||||
WITH_CHRONO("doubles:holes:1",
|
||||
LOG(0, "AtripCUDA") << "dgemm 1" << "\n";
|
||||
DGEMM_HOLES(_vhhh, TABhh, "N")
|
||||
LOG(0, "AtripCUDA") << "reorder 1" << "\n";
|
||||
REORDER(i, k, j)
|
||||
)
|
||||
// VhhhC[j + k*No + L*NoNo] * TABhh[i + L*No]; H0
|
||||
WITH_CHRONO("doubles:holes:2",
|
||||
LOG(0, "AtripCUDA") << "dgemm 2" << "\n";
|
||||
DGEMM_HOLES(_vhhh, TABhh, "T")
|
||||
REORDER(j, k, i)
|
||||
)
|
||||
|
||||
// VhhhB[i + j*No + L*NoNo] * TAChh[L + k*No]; H5
|
||||
LOG(0, "AtripCUDA") << "conj 2" << "\n";
|
||||
MAYBE_CONJ(_vhhh, VhhhB)
|
||||
LOG(0, "AtripCUDA") << "done" << "\n";
|
||||
WITH_CHRONO("doubles:holes:3",
|
||||
DGEMM_HOLES(_vhhh, TAChh, "N")
|
||||
REORDER(i, j, k)
|
||||
)
|
||||
// VhhhB[k + j*No + L*NoNo] * TAChh[i + L*No]; H3
|
||||
WITH_CHRONO("doubles:holes:4",
|
||||
DGEMM_HOLES(_vhhh, TAChh, "T")
|
||||
REORDER(k, j, i)
|
||||
)
|
||||
|
||||
// VhhhA[j + i*No + L*NoNo] * TBChh[L + k*No]; H1
|
||||
LOG(0, "AtripCUDA") << "conj 3" << "\n";
|
||||
MAYBE_CONJ(_vhhh, VhhhA)
|
||||
WITH_CHRONO("doubles:holes:5",
|
||||
DGEMM_HOLES(_vhhh, TBChh, "N")
|
||||
REORDER(j, i, k)
|
||||
)
|
||||
// VhhhA[k + i*No + L*NoNo] * TBChh[j + L*No]; H4
|
||||
WITH_CHRONO("doubles:holes:6",
|
||||
DGEMM_HOLES(_vhhh, TBChh, "T")
|
||||
REORDER(k, i, j)
|
||||
)
|
||||
|
||||
}
|
||||
)
|
||||
#undef MAYBE_CONJ
|
||||
|
||||
LOG(0, "AtripCUDA") << "doing particles" << "\n";
|
||||
WITH_CHRONO("doubles:particles",
|
||||
{ // Particle part %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
// TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv]; P0
|
||||
WITH_CHRONO("doubles:particles:1",
|
||||
DGEMM_PARTICLES(TAphh, VBCph)
|
||||
REORDER(i, j, k)
|
||||
)
|
||||
// TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv]; P3
|
||||
WITH_CHRONO("doubles:particles:2",
|
||||
DGEMM_PARTICLES(TAphh, VCBph)
|
||||
REORDER(i, k, j)
|
||||
)
|
||||
// TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv]; P5
|
||||
WITH_CHRONO("doubles:particles:3",
|
||||
DGEMM_PARTICLES(TCphh, VABph)
|
||||
REORDER(k, i, j)
|
||||
)
|
||||
// TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv]; P2
|
||||
WITH_CHRONO("doubles:particles:4",
|
||||
DGEMM_PARTICLES(TCphh, VBAph)
|
||||
REORDER(k, j, i)
|
||||
)
|
||||
// TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv]; P1
|
||||
WITH_CHRONO("doubles:particles:5",
|
||||
DGEMM_PARTICLES(TBphh, VACph)
|
||||
REORDER(j, i, k)
|
||||
)
|
||||
// TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv]; P4
|
||||
WITH_CHRONO("doubles:particles:6",
|
||||
DGEMM_PARTICLES(TBphh, VCAph)
|
||||
REORDER(j, k, i)
|
||||
)
|
||||
}
|
||||
)
|
||||
LOG(0, "AtripCUDA") << "particles done" << "\n";
|
||||
|
||||
{ // free resources
|
||||
#ifdef HAVE_CUDA
|
||||
LOG(0, "AtripCUDA") << "free mem" << "\n";
|
||||
cuMemFree((CUdeviceptr)_vhhh);
|
||||
cuMemFree((CUdeviceptr)_t_buffer);
|
||||
LOG(0, "AtripCUDA") << "free mem done" << "\n";
|
||||
#else
|
||||
free(_vhhh);
|
||||
free(_t_buffer);
|
||||
#endif
|
||||
}
|
||||
|
||||
#undef REORDER
|
||||
#undef DGEMM_HOLES
|
||||
#undef DGEMM_PARTICLES
|
||||
#undef _IJK_
|
||||
#else
|
||||
for (size_t k = 0; k < No; k++)
|
||||
for (size_t j = 0; j < No; j++)
|
||||
for (size_t i = 0; i < No; i++){
|
||||
const size_t ijk = i + j*No + k*NoNo
|
||||
, jk = j + k*No
|
||||
;
|
||||
Tijk[ijk] = 0.0; // :important
|
||||
// HOLE DIAGRAMS: TABHH and VHHHA
|
||||
for (size_t L = 0; L < No; L++){
|
||||
// t[abLj] * V[Lcik] H1
|
||||
// t[baLi] * V[Lcjk] H0 TODO: conjugate T for complex
|
||||
Tijk[ijk] -= TABhh[L + j*No] * VhhhC[i + k*No + L*NoNo];
|
||||
Tijk[ijk] -= TABhh[i + L*No] * VhhhC[j + k*No + L*NoNo];
|
||||
|
||||
// t[acLk] * V[Lbij] H5
|
||||
// t[caLi] * V[Lbkj] H3
|
||||
Tijk[ijk] -= TAChh[L + k*No] * VhhhB[i + j*No + L*NoNo];
|
||||
Tijk[ijk] -= TAChh[i + L*No] * VhhhB[k + j*No + L*NoNo];
|
||||
|
||||
// t[bcLk] * V[Laji] H2
|
||||
// t[cbLj] * V[Laki] H4
|
||||
Tijk[ijk] -= TBChh[L + k*No] * VhhhA[j + i*No + L*NoNo];
|
||||
Tijk[ijk] -= TBChh[j + L*No] * VhhhA[k + i*No + L*NoNo];
|
||||
}
|
||||
// PARTILCE DIAGRAMS: TAPHH and VABPH
|
||||
for (size_t E = 0; E < Nv; E++) {
|
||||
// t[aEij] * V[bcEk] P0
|
||||
// t[aEik] * V[cbEj] P3 // TODO: CHECK THIS ONE, I DONT KNOW
|
||||
Tijk[ijk] += TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv];
|
||||
Tijk[ijk] += TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv];
|
||||
|
||||
// t[cEki] * V[abEj] P5
|
||||
// t[cEkj] * V[baEi] P2
|
||||
Tijk[ijk] += TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv];
|
||||
Tijk[ijk] += TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv];
|
||||
|
||||
// t[bEji] * V[acEk] P1
|
||||
// t[bEjk] * V[caEi] P4
|
||||
Tijk[ijk] += TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv];
|
||||
Tijk[ijk] += TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv];
|
||||
}
|
||||
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
|
||||
// instantiate templates
|
||||
template
|
||||
void doublesContribution<double>
|
||||
( const ABCTuple &abc
|
||||
, size_t const No
|
||||
, size_t const Nv
|
||||
// -- VABCI
|
||||
, DataPtr<double> const VABph
|
||||
, DataPtr<double> const VACph
|
||||
, DataPtr<double> const VBCph
|
||||
, DataPtr<double> const VBAph
|
||||
, DataPtr<double> const VCAph
|
||||
, DataPtr<double> const VCBph
|
||||
// -- VHHHA
|
||||
, DataPtr<double> const VhhhA
|
||||
, DataPtr<double> const VhhhB
|
||||
, DataPtr<double> const VhhhC
|
||||
// -- TA
|
||||
, DataPtr<double> const TAphh
|
||||
, DataPtr<double> const TBphh
|
||||
, DataPtr<double> const TCphh
|
||||
// -- TABIJ
|
||||
, DataPtr<double> const TABhh
|
||||
, DataPtr<double> const TAChh
|
||||
, DataPtr<double> const TBChh
|
||||
// -- TIJK
|
||||
, DataFieldType<double>* Tijk
|
||||
);
|
||||
|
||||
template
|
||||
void doublesContribution<Complex>
|
||||
( const ABCTuple &abc
|
||||
, size_t const No
|
||||
, size_t const Nv
|
||||
// -- VABCI
|
||||
, DataPtr<Complex> const VABph
|
||||
, DataPtr<Complex> const VACph
|
||||
, DataPtr<Complex> const VBCph
|
||||
, DataPtr<Complex> const VBAph
|
||||
, DataPtr<Complex> const VCAph
|
||||
, DataPtr<Complex> const VCBph
|
||||
// -- VHHHA
|
||||
, DataPtr<Complex> const VhhhA
|
||||
, DataPtr<Complex> const VhhhB
|
||||
, DataPtr<Complex> const VhhhC
|
||||
// -- TA
|
||||
, DataPtr<Complex> const TAphh
|
||||
, DataPtr<Complex> const TBphh
|
||||
, DataPtr<Complex> const TCphh
|
||||
// -- TABIJ
|
||||
, DataPtr<Complex> const TABhh
|
||||
, DataPtr<Complex> const TAChh
|
||||
, DataPtr<Complex> const TBChh
|
||||
// -- TIJK
|
||||
, DataFieldType<Complex>* Tijk
|
||||
);
|
||||
// Doubles contribution:2 ends here
|
||||
|
||||
// [[file:~/cuda/atrip/atrip.org::*Epilog][Epilog:2]]
|
||||
}
|
||||
// Epilog:2 ends here
|
||||
Reference in New Issue
Block a user