Compare commits

No commits in common. "4277c07cc22790fbc4e3fa4ea76ddb9dd494e586" and "76a785044da55f5be4e85c68a8059f4737261de3" have entirely different histories.

6 changed files with 168 additions and 265 deletions

===== File 1 of 6 =====

@@ -58,9 +58,8 @@ int main(int argc, char** argv) {
   CLI11_PARSE(app, argc, argv);
   CTF::World world(argc, argv);
-  int rank, nranks;
+  int rank;
   MPI_Comm_rank(world.comm, &rank);
-  MPI_Comm_size(world.comm, &nranks);
   constexpr double elem_to_gb = 8.0 / 1024.0 / 1024.0 / 1024.0;
   // USER PRINTING TEST BEGIN
@@ -109,43 +108,6 @@ int main(int argc, char** argv) {
     }
   }
-  size_t
-    f = sizeof(double)
-    , n_tuples = nv * (nv + 1) * (nv + 2) / 6 - nv
-    , atrip_memory
-      = /* tuples_memory */ 3 * sizeof(size_t) * n_tuples
-      //
-      // one dimensional slices (all ranks)
-      //
-      + /* taphh  */ f * nranks * 6 * nv * no * no
-      + /* hhha   */ f * nranks * 6 * no * no * no
-      //
-      // two dimensional slices (all ranks)
-      //
-      + /* abph   */ f * nranks * 12 * nv * no
-      + /* abhh   */ f * nranks * 6 * no * no
-      + /* tabhh  */ f * nranks * 6 * no * no
-      //
-      // distributed sources (all ranks)
-      //
-      + /* tpphh  */ f * nv * nv * no * no
-      + /* vhhhp  */ f * no * no * no * nv
-      + /* vppph  */ f * nv * nv * nv * no
-      + /* vpphh  */ f * nv * nv * no * no
-      + /* tpphh2 */ f * nv * nv * no * no
-      //
-      // tensors in every rank
-      //
-      + /* tijk   */ f * nranks * no * no * no
-      + /* zijk   */ f * nranks * no * no * no
-      + /* epsp   */ f * nranks * (no + nv)
-      + /* tai    */ f * nranks * no * nv
-      ;
-  if (atrip::Atrip::rank == 0)
-    std::cout << "Tentative MEMORY USAGE: " << atrip_memory << "\n";
   std::vector<int> symmetries(4, NS)
     , vo({nv, no})
     , vvoo({nv, nv, no, no})
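For reference, the estimate deleted above is easy to keep around as a standalone helper. A minimal sketch, assuming the same no, nv, and nranks inputs as the deleted block (the helper name and the example sizes in main are hypothetical):

    #include <cstddef>
    #include <cstdio>

    // Standalone version of the deleted estimate. n_tuples counts the
    // (a, b, c) triples with a <= b <= c, minus the nv fully diagonal
    // a == b == c ones: nv*(nv+1)*(nv+2)/6 - nv.
    static double estimate_memory_gb(std::size_t no, std::size_t nv,
                                     std::size_t nranks) {
      const std::size_t f = sizeof(double);
      const std::size_t n_tuples = nv * (nv + 1) * (nv + 2) / 6 - nv;
      const std::size_t bytes
          = /* tuples      */ 3 * sizeof(std::size_t) * n_tuples
          + /* taphh       */ f * nranks * 6 * nv * no * no
          + /* hhha        */ f * nranks * 6 * no * no * no
          + /* abph        */ f * nranks * 12 * nv * no
          + /* abhh, tabhh */ f * nranks * 12 * no * no
          + /* sources     */ f * (3 * nv * nv * no * no
                                   + no * no * no * nv
                                   + nv * nv * nv * no)
          + /* tijk, zijk  */ f * nranks * 2 * no * no * no
          + /* epsp, tai   */ f * nranks * (no + nv + no * nv);
      return bytes / 1024.0 / 1024.0 / 1024.0;
    }

    int main() {
      std::printf("Tentative MEMORY USAGE (GB): %f\n",
                  estimate_memory_gb(100, 500, 48));
    }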
@@ -211,7 +173,7 @@ int main(int argc, char** argv) {
   try {
     auto out = atrip::Atrip::run(in);
     if (atrip::Atrip::rank == 0)
      std::cout << "Energy: " << out.energy << std::endl;
   } catch (const char* msg) {
     if (atrip::Atrip::rank == 0)
       std::cout << "Atrip throwed with msg:\n\t\t " << msg << "\n";

===== File 2 of 6 =====

@@ -1,44 +0,0 @@
-#!/usr/bin/env bash
-
-type -a autoreconf > /dev/null ||
-  {
-    cat <<EOF && exit
-You don't seem to have autotools installed, please install it.
-  - https://www.gnu.org/software/autoconf/
-  - https://www.gnu.org/software/automake/
-EOF
-  }
-
-cat <<EOF
-Creating configure script
-EOF
-autoreconf -vif .
-
-test -f configure || {
-  cat <<EOF
-An error happened and a configure script could not be built!
-EOF
-  exit 1
-}
-
-cat <<EOF
-Now you can build by doing
-
-  mkdir build
-  cd build
-  ../configure
-  make extern
-  make all
-EOF

===== File 3 of 6 =====

@@ -40,7 +40,7 @@ AC_RUN_IFELSE([AC_LANG_SOURCE([_ATRIP_CUDA_MEMORY_OF_DEVICES])],
   ],
   [
     atrip_success=no
-    AC_MSG_WARN([An available device reports zero memory available!])
+    AC_MSG_ERROR([An available device reports zero memory available!])
   ])
 CXX="$ac_save_CXX"
@@ -79,11 +79,11 @@ int main() {
   cuMemAlloc((CUdeviceptr*)&F_d, els*sizeof(double));
   stat = cublasDgemm(handle,
                      CUBLAS_OP_N,
                      CUBLAS_OP_N,
                      oo, No, Nv,
                      &one,
                      HHP_d, oo, PH_d, Nv, &one, F_d, oo);
   //cudaSetDevice(rank);
   return 0;
@@ -92,7 +92,6 @@
 m4_define([_ATRIP_CUDA_MEMORY_OF_DEVICES], [[
 #include <mpi.h>
 #include <iostream>
 #include <cassert>
@@ -125,27 +124,26 @@ int main() {
     cuDeviceTotalMem(&total2, dev);
     printf("\n"
            "CUDA CARD RANK %d\n"
            "=================\n"
            "\tname: %s\n"
            "\tShared Mem Per Block (KB): %f\n"
            "\tFree/Total mem (GB): %f/%f\n"
            "\ttotal2 mem (GB): %f\n"
            "\n",
            dev,
            name,
            prop.sharedMemPerBlock / 1024.0,
            _free / 1024.0 / 1024.0 / 1024.0,
            total / 1024.0 / 1024.0 / 1024.0,
            total2 / 1024.0 / 1024.0 / 1024.0
            );
     if (_free == 0 || total == 0 || total2 == 0)
       return 1;
   }
   return 0;
 }
 ]])
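Outside autoconf, the probe boils down to a few CUDA driver-API calls. A minimal sketch of the same zero-memory check for a single device (error handling elided; cuMemGetInfo needs a current context):

    #include <cuda.h>
    #include <cstdio>

    int main() {
      // Same check as the configure probe, for device 0 only.
      CUdevice dev;
      CUcontext ctx;
      size_t free_b = 0, total_b = 0;
      cuInit(0);
      cuDeviceGet(&dev, 0);
      cuCtxCreate(&ctx, 0, dev);  // make a context current for cuMemGetInfo
      cuMemGetInfo(&free_b, &total_b);
      std::printf("Free/Total mem (GB): %f/%f\n",
                  free_b / 1024.0 / 1024.0 / 1024.0,
                  total_b / 1024.0 / 1024.0 / 1024.0);
      // A zero report is what now triggers AC_MSG_ERROR instead of a warning.
      return (free_b == 0 || total_b == 0) ? 1 : 0;
    }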

===== File 4 of 6 =====

@@ -457,8 +457,7 @@ void unwrapAndMarkReady() {
 #if defined(HAVE_CUDA)
       // copy the retrieved mpi data to the device
-      WITH_CHRONO("cuda:memcpy",
-                  cuMemcpyHtoD(data, (void*)mpi_data, sizeof(F) * size);)
+      cuMemcpyHtoD(data, (void*)mpi_data, sizeof(F) * size);
       std::free(mpi_data);
 #endif
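Dropping the WITH_CHRONO wrapper leaves the plain driver-API copy; freeing the host staging buffer right afterwards is safe because cuMemcpyHtoD returns only once the source buffer has been consumed. A sketch of the same pattern, with a hypothetical helper name:

    #include <cuda.h>
    #include <cstdlib>
    #include <cstring>

    // Hypothetical helper mirroring unwrapAndMarkReady(): stage host data,
    // push it to a device allocation, release the staging buffer.
    void push_to_device(CUdeviceptr data, const double* src, size_t n) {
      double* mpi_data = (double*)std::malloc(n * sizeof(double));
      std::memcpy(mpi_data, src, n * sizeof(double));   // e.g. filled from MPI
      cuMemcpyHtoD(data, mpi_data, n * sizeof(double)); // synchronous copy
      std::free(mpi_data); // safe: the copy has already consumed the buffer
    }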

===== File 5 of 6 =====

@@ -76,13 +76,13 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
   LOG(0,"Atrip") << "ngcards: " << ngcards << "\n";
   if (clusterInfo.ranksPerNode > ngcards) {
     std::cerr << "ATRIP: You are running on more ranks per node than the number of graphic cards\n"
               << "You have " << ngcards << " cards at your disposal\n";
     throw "";
   }
   if (clusterInfo.ranksPerNode < ngcards) {
     std::cerr << "You have " << ngcards << " cards at your disposal\n"
               << "You will be only using " << clusterInfo.ranksPerNode
               << ", i.e., the nubmer of ranks.\n";
   }
@@ -106,23 +106,23 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
     cuDeviceTotalMem(&memory.total, dev);
     printf("\n"
            "CUDA CARD RANK %d\n"
            "=================\n"
-           "\tnumber: %1$ld\n"
+           "\tnumber: %1$d\n"
            "\tname: %s\n"
-           "\tMem. clock rate (KHz): %ld\n"
+           "\tMem. clock rate (KHz): %d\n"
            "\tShared Mem Per Block (KB): %f\n"
            "\tAvail. Free/Total mem (GB): %f/%f\n"
            "\tFree memory (GB): %f\n"
            "\n",
            Atrip::rank,
            name,
            prop.clockRate,
            prop.sharedMemPerBlock / 1024.0,
            memory.avail.free / 1024.0 / 1024.0 / 1024.0,
            memory.avail.total / 1024.0 / 1024.0 / 1024.0,
            memory.total / 1024.0 / 1024.0 / 1024.0
            );
     std::free((void*)name);
   }
   MPI_Barrier(universe);
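The %1$ld → %1$d and %ld → %d changes match the conversion specifiers to the argument types: the clock-rate field of the CUDA device properties is an int, and the rank feeds the positional %1$ conversion. A small sketch of the POSIX positional form with made-up values (mixing numbered and unnumbered conversions in one format is undefined, so the sketch numbers every conversion):

    #include <cstdio>

    int main() {
      int rank = 3;            // assumed int, matching the new %1$d
      int clock_khz = 1410000; // device-properties clock field is an int
      // %n$ picks an argument by position, so rank can feed
      // two conversions without being passed twice.
      std::printf("CUDA CARD RANK %1$d\n"
                  "\tnumber: %1$d\n"
                  "\tMem. clock rate (KHz): %2$d\n",
                  rank, clock_khz);
    }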
@@ -249,18 +249,19 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
   const size_t nIterations = tuplesList.size();
   {
     LOG(0,"Atrip") << "#iterations: "
                    << nIterations
                    << "/"
                    << nIterations * np
                    << "\n";
   }
   const size_t
     iterationMod = (in.percentageMod > 0)
                  ? nIterations * in.percentageMod / 100.0
                  : in.iterationMod
-    , iteration1Percent = nIterations * 0.01
-    ;
+
+    , iteration1Percent = nIterations * 0.01
+    ;
@@ -292,20 +293,20 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
     WITH_CHRONO("oneshot-db:comm:allgather",
     WITH_CHRONO("db:comm:allgather",
-      MPI_Allgather(ldb.data(),
-                    /* ldb.size() * sizeof(typename
-                       Slice<F>::LocalDatabaseElement) */
-                    ldb.size(),
-                    MPI_LDB_ELEMENT,
-                    db.data(),
-                    /* ldb.size() * sizeof(typename
-                       Slice<F>::LocalDatabaseElement), */
-                    ldb.size(),
-                    MPI_LDB_ELEMENT,
-                    c);
+      MPI_Allgather( ldb.data()
+                   // , ldb.size() * sizeof(typename Slice<F>::LocalDatabaseElement)
+                   , ldb.size()
+                   , MPI_LDB_ELEMENT
+                   , db.data()
+                   // , ldb.size() * sizeof(typename Slice<F>::LocalDatabaseElement)
+                   , ldb.size()
+                   , MPI_LDB_ELEMENT
+                   , c);
     ))
-    WITH_CHRONO("db:comm:type:free", MPI_Type_free(&MPI_LDB_ELEMENT);)
+    WITH_CHRONO("db:comm:type:free",
+      MPI_Type_free(&MPI_LDB_ELEMENT);
+    )
     return db;
   };
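The commented-out byte counts are left over from gathering raw bytes; with a committed datatype the counts are element counts. A self-contained sketch of the commit/gather/free pattern, with a hypothetical POD standing in for Slice<F>::LocalDatabaseElement:

    #include <mpi.h>
    #include <vector>

    // Hypothetical POD standing in for Slice<F>::LocalDatabaseElement.
    struct Element { int name; int info[3]; };

    int main(int argc, char** argv) {
      MPI_Init(&argc, &argv);
      int np;
      MPI_Comm_size(MPI_COMM_WORLD, &np);

      // Commit a datatype once, gather element counts (not bytes), free it.
      MPI_Datatype MPI_LDB_ELEMENT;
      MPI_Type_contiguous(sizeof(Element), MPI_BYTE, &MPI_LDB_ELEMENT);
      MPI_Type_commit(&MPI_LDB_ELEMENT);

      std::vector<Element> ldb(4);      // local database
      std::vector<Element> db(4 * np);  // gathered database, np blocks
      MPI_Allgather(ldb.data(), (int)ldb.size(), MPI_LDB_ELEMENT,
                    db.data(),  (int)ldb.size(), MPI_LDB_ELEMENT,
                    MPI_COMM_WORLD);

      MPI_Type_free(&MPI_LDB_ELEMENT);
      MPI_Finalize();
    }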
@@ -574,29 +575,30 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
           )))
       WITH_CHRONO("oneshot-doubles",
       WITH_CHRONO("doubles",
-        doublesContribution<F>(abc, (size_t)No, (size_t)Nv,
-                               // -- VABCI
-                               abph.unwrapSlice(Slice<F>::AB, abc),
-                               abph.unwrapSlice(Slice<F>::AC, abc),
-                               abph.unwrapSlice(Slice<F>::BC, abc),
-                               abph.unwrapSlice(Slice<F>::BA, abc),
-                               abph.unwrapSlice(Slice<F>::CA, abc),
-                               abph.unwrapSlice(Slice<F>::CB, abc),
-                               // -- VHHHA,
-                               hhha.unwrapSlice(Slice<F>::A, abc),
-                               hhha.unwrapSlice(Slice<F>::B, abc),
-                               hhha.unwrapSlice(Slice<F>::C, abc),
-                               // -- TA,
-                               taphh.unwrapSlice(Slice<F>::A, abc),
-                               taphh.unwrapSlice(Slice<F>::B, abc),
-                               taphh.unwrapSlice(Slice<F>::C, abc),
-                               // -- TABIJ
-                               tabhh.unwrapSlice(Slice<F>::AB, abc),
-                               tabhh.unwrapSlice(Slice<F>::AC, abc),
-                               tabhh.unwrapSlice(Slice<F>::BC, abc),
-                               // -- TIJK
-                               (DataFieldType<F>*)Tijk);
-        WITH_RANK << iteration << "-th doubles done\n";
+        doublesContribution<F>( abc, (size_t)No, (size_t)Nv
+                              // -- VABCI
+                              , abph.unwrapSlice(Slice<F>::AB, abc)
+                              , abph.unwrapSlice(Slice<F>::AC, abc)
+                              , abph.unwrapSlice(Slice<F>::BC, abc)
+                              , abph.unwrapSlice(Slice<F>::BA, abc)
+                              , abph.unwrapSlice(Slice<F>::CA, abc)
+                              , abph.unwrapSlice(Slice<F>::CB, abc)
+                              // -- VHHHA
+                              , hhha.unwrapSlice(Slice<F>::A, abc)
+                              , hhha.unwrapSlice(Slice<F>::B, abc)
+                              , hhha.unwrapSlice(Slice<F>::C, abc)
+                              // -- TA
+                              , taphh.unwrapSlice(Slice<F>::A, abc)
+                              , taphh.unwrapSlice(Slice<F>::B, abc)
+                              , taphh.unwrapSlice(Slice<F>::C, abc)
+                              // -- TABIJ
+                              , tabhh.unwrapSlice(Slice<F>::AB, abc)
+                              , tabhh.unwrapSlice(Slice<F>::AC, abc)
+                              , tabhh.unwrapSlice(Slice<F>::BC, abc)
+                              // -- TIJK
+                              , (DataFieldType<F>*)Tijk
+                              );
+        WITH_RANK << iteration << "-th doubles done\n";
       ))
     }
@@ -616,19 +618,16 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
         )
         WITH_CHRONO("singles",
 #if defined(HAVE_CUDA)
-          singlesContribution<F><<<1,1>>>(No, Nv, abc[0], abc[1], abc[2],
-                                          (DataFieldType<F>*)Tai,
+          singlesContribution<F><<<1,1>>>( No, Nv, abc[0], abc[1], abc[2]
+                                         , (DataFieldType<F>*)Tai
 #else
-          singlesContribution<F>(No, Nv, abc[0], abc[1], abc[2],
-                                 Tai.data(),
+          singlesContribution<F>( No, Nv, abc[0], abc[1], abc[2]
+                                , Tai.data()
 #endif
-          (DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::AB,
-                                              abc),
-          (DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::AC,
-                                              abc),
-          (DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::BC,
-                                              abc),
-          (DataFieldType<F>*)Zijk);
+          , (DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::AB, abc)
+          , (DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::AC, abc)
+          , (DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::BC, abc)
+          , (DataFieldType<F>*)Zijk);
         )
       }
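For orientation: assuming singlesContribution implements the usual (T) singles term, Z^abc_ijk += t^a_i V^bc_jk + t^b_j V^ac_ik + t^c_k V^ab_ij, a scalar CPU sketch looks like this (the real buffer layouts may differ):

    #include <cstddef>

    // Scalar sketch: Zijk accumulates the three t * V permutations
    // for fixed virtual indices (a, b, c); i runs fastest.
    void singles_sketch(std::size_t No, std::size_t a, std::size_t b,
                        std::size_t c,
                        const double* Tai,   // assumed t[i + a*No]
                        const double* VABij, // assumed V[i + j*No], pair (a, b)
                        const double* VACij,
                        const double* VBCij,
                        double* Zijk) {
      for (std::size_t k = 0; k < No; k++)
        for (std::size_t j = 0; j < No; j++)
          for (std::size_t i = 0; i < No; i++)
            Zijk[i + j*No + k*No*No]
                += Tai[i + a*No] * VBCij[j + k*No]
                 + Tai[j + b*No] * VACij[i + k*No]
                 + Tai[k + c*No] * VABij[i + j*No];
    }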

===== File 6 of 6 =====

@@ -77,7 +77,7 @@ namespace cuda {
       to[i] = maybeConjugateScalar<F>(from[i]);
     }
   }
 
   template <typename F>
   __global__
@@ -132,29 +132,30 @@
 #endif
 
 #if defined(HAVE_CUDA)
-#define LIMS_KS() \
-  size_t kmin = blockIdx.x * blockDim.x + threadIdx.x, \
-         k = kmin, \
-         idx = kmin * size * size * size \
-         ; \
+#define LIMS_KS \
+  size_t \
+    kmin = blockIdx.x * blockDim.x + threadIdx.x, \
+    k = kmin, \
+    idx = kmin * size * size * size \
+    ; \
   k < (kmin < size) ? kmin + 1 : size
 #else
 #define LIMS_KS size_t k=0, idx=0; k < size
 #endif
 
 #define _IJK_(i, j, k) i + j*size + k*size*size
 #define _REORDER_BODY_(...) \
-  for (LIMS_KS() ; k++) \
+  for (LIMS_KS ; k++) \
     for (size_t j = 0; j < size; j++) \
       for (size_t i = 0; i < size; i++, idx++) { \
         __VA_ARGS__ \
       }
 #define _MAKE_REORDER_(_enum, ...) \
   template <typename F> \
   __MAYBE_GLOBAL__ \
   void reorder(reorder_proxy< F, _enum > p, \
                size_t size, F* to, F* from) { \
     _REORDER_BODY_(__VA_ARGS__) \
   }
 
 #if defined(HAVE_CUDA)
 #define GO(__TO, __FROM) cuda::sum_in_place<F>(&__TO, &__FROM);
 #else
@@ -165,7 +166,7 @@ namespace cuda {
   template <typename F, reordering_t R>
   __MAYBE_GLOBAL__ \
   void reorder(reorder_proxy<F, R> proxy,
                size_t size, F* to, F* from);
 
   _MAKE_REORDER_(IJK, GO(to[idx], from[_IJK_(i, j, k)]))
   _MAKE_REORDER_(IKJ, GO(to[idx], from[_IJK_(i, k, j)]))
@@ -173,7 +174,7 @@ namespace cuda {
   _MAKE_REORDER_(JKI, GO(to[idx], from[_IJK_(j, k, i)]))
   _MAKE_REORDER_(KIJ, GO(to[idx], from[_IJK_(k, i, j)]))
   _MAKE_REORDER_(KJI, GO(to[idx], from[_IJK_(k, j, i)]))
 
 #undef LIMS_KS
 #undef _MAKE_REORDER
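To see through the macro layers: in the non-CUDA branch LIMS_KS is `size_t k=0, idx=0; k < size`, so _MAKE_REORDER_(IJK, GO(to[idx], from[_IJK_(i, j, k)])) expands to roughly the following, assuming the non-CUDA GO(a, b) is a plain a += b (the stand-in types are only for self-containment):

    #include <cstddef>

    // Minimal stand-ins so the expansion compiles on its own.
    enum reordering_t { IJK };
    template <typename F, reordering_t R> struct reorder_proxy {};

    // Approximate non-CUDA expansion of _MAKE_REORDER_(IJK, ...).
    template <typename F>
    void reorder(reorder_proxy<F, IJK>, std::size_t size, F* to, F* from) {
      // LIMS_KS (non-CUDA): size_t k=0, idx=0; k < size
      for (std::size_t k = 0, idx = 0; k < size; k++)
        for (std::size_t j = 0; j < size; j++)
          for (std::size_t i = 0; i < size; i++, idx++)
            to[idx] += from[i + j*size + k*size*size]; // _IJK_(i, j, k)
    }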
@@ -445,59 +446,50 @@ double getEnergySame
 #if defined(ATRIP_USE_DGEMM)
 #if defined(HAVE_CUDA)
 #define REORDER(__II, __JJ, __KK) \
-  reorder<<<bs, ths>>>(reorder_proxy<DataFieldType<F>, \
-                       __II ## __JJ ## __KK >{}, \
-                       No, Tijk, _t_buffer);
+  reorder<<< \
+    bs, ths \
+  >>>(reorder_proxy<DataFieldType<F>, __II ## __JJ ## __KK >{}, \
+      No, Tijk, _t_buffer);
 #define DGEMM_PARTICLES(__A, __B) \
   atrip::xgemm<F>("T", \
                   "N", \
                   (int const*)&NoNo, \
                   (int const*)&No, \
                   (int const*)&Nv, \
                   &one, \
                   (DataFieldType<F>*)__A, \
                   (int const*)&Nv, \
                   (DataFieldType<F>*)__B, \
                   (int const*)&Nv, \
                   &zero, \
                   _t_buffer, \
                   (int const*)&NoNo);
 #define DGEMM_HOLES(__A, __B, __TRANSB) \
   atrip::xgemm<F>("N", \
                   __TRANSB, \
                   (int const*)&NoNo, \
                   (int const*)&No, \
                   (int const*)&No, \
                   &m_one, \
                   __A, \
                   (int const*)&NoNo, \
                   (DataFieldType<F>*)__B, \
                   (int const*)&No, \
                   &zero, \
                   _t_buffer, \
                   (int const*)&NoNo \
                   );
 #define MAYBE_CONJ(_conj, _buffer) \
   cuda::maybeConjugate<<< \
     Atrip::kernelDimensions.ooo.blocks, \
     Atrip::kernelDimensions.ooo.threads \
   >>>((DataFieldType<F>*)_conj, (DataFieldType<F>*)_buffer, NoNoNo);
+// END CUDA ////////////////////////////////////////////////////////////////////
 #else
-
-// NONCUDA /////////////////////////////////////////////////////////////////////
-
+// NONCUDA //////////////////////////////////////////////////////////////////////
 #define REORDER(__II, __JJ, __KK) \
-  reorder(reorder_proxy<DataFieldType<F>, \
-          __II ## __JJ ## __KK >{}, \
-          No, Tijk, _t_buffer);
+  reorder(reorder_proxy<DataFieldType<F>, __II ## __JJ ## __KK >{}, \
+          No, Tijk, _t_buffer);
 #define DGEMM_PARTICLES(__A, __B) \
   atrip::xgemm<F>("T", \
                   "N", \
@@ -510,7 +502,7 @@ double getEnergySame
                   __B, \
                   (int const*)&Nv, \
                   &zero, \
                   _t_buffer, \
                   (int const*)&NoNo \
                   );
 #define DGEMM_HOLES(__A, __B, __TRANSB) \
@@ -525,8 +517,8 @@ double getEnergySame
                   __B, \
                   (int const*)&No, \
                   &zero, \
                   _t_buffer, \
                   (int const*)&NoNo \
                   );
 #define MAYBE_CONJ(_conj, _buffer) \
   for (size_t __i = 0; __i < NoNoNo; ++__i) \
@@ -558,10 +550,9 @@ double getEnergySame
   // Set Tijk to zero
 #ifdef HAVE_CUDA
   WITH_CHRONO("double:reorder",
-    cuda::zeroing<<<bs, ths>>>((DataFieldType<F>*)Tijk,
-                               NoNoNo);
+    cuda::zeroing<<<bs, ths>>>((DataFieldType<F>*)Tijk, NoNoNo);
     // synchronize all initializations to zero
   )
 #else
   WITH_CHRONO("double:reorder",
     for (size_t k = 0; k < NoNoNo; k++) {
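The actual cuda::zeroing signature lives in the headers; a plausible sketch of such a kernel, written grid-stride so that any <<<bs, ths>>> launch covers all NoNoNo elements, is:

    #include <cstddef>

    // Hedged sketch of a zeroing kernel: each thread clears a strided
    // subset of the n elements, independent of the launch dimensions.
    template <typename F>
    __global__ void zeroing_sketch(F* data, size_t n) {
      for (size_t i = blockIdx.x * blockDim.x + threadIdx.x;
           i < n;
           i += (size_t)gridDim.x * blockDim.x)
        data[i] = F{};
    }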
@@ -649,8 +640,6 @@ double getEnergySame
   { // free resources
 #ifdef HAVE_CUDA
-    // we need to synchronize here since we need
-    // the Tijk for next process in the pipeline
     cuCtxSynchronize();
     cuMemFree((CUdeviceptr)_vhhh);
     cuMemFree((CUdeviceptr)_t_buffer);