Compare commits
7 Commits
76a785044d
...
4277c07cc2
| Author | SHA1 | Date | |
|---|---|---|---|
| 4277c07cc2 | |||
| 0558148937 | |||
| 49ff3b377c | |||
| 00a28c990c | |||
| 2c5a4620ca | |||
| 368c5619cc | |||
| 0b14ac7704 |
@ -58,8 +58,9 @@ int main(int argc, char** argv) {
|
||||
CLI11_PARSE(app, argc, argv);
|
||||
|
||||
CTF::World world(argc, argv);
|
||||
int rank;
|
||||
int rank, nranks;
|
||||
MPI_Comm_rank(world.comm, &rank);
|
||||
MPI_Comm_size(world.comm, &nranks);
|
||||
constexpr double elem_to_gb = 8.0 / 1024.0 / 1024.0 / 1024.0;
|
||||
|
||||
// USER PRINTING TEST BEGIN
|
||||
@ -108,6 +109,43 @@ int main(int argc, char** argv) {
|
||||
}
|
||||
}
|
||||
|
||||
size_t
|
||||
f = sizeof(double)
|
||||
, n_tuples = nv * (nv + 1) * (nv + 2) / 6 - nv
|
||||
, atrip_memory
|
||||
= /* tuples_memory */ 3 * sizeof(size_t) * n_tuples
|
||||
//
|
||||
// one dimensional slices (all ranks)
|
||||
//
|
||||
+ /* taphh */ f * nranks * 6 * nv * no * no
|
||||
+ /* hhha */ f * nranks * 6 * no * no * no
|
||||
//
|
||||
// two dimensional slices (all ranks)
|
||||
//
|
||||
+ /* abph */ f * nranks * 12 * nv * no
|
||||
+ /* abhh */ f * nranks * 6 * no * no
|
||||
+ /* tabhh */ f * nranks * 6 * no * no
|
||||
//
|
||||
// distributed sources (all ranks)
|
||||
//
|
||||
+ /* tpphh */ f * nv * nv * no * no
|
||||
+ /* vhhhp */ f * no * no * no * nv
|
||||
+ /* vppph */ f * nv * nv * nv * no
|
||||
+ /* vpphh */ f * nv * nv * no * no
|
||||
+ /* tpphh2 */ f * nv * nv * no * no
|
||||
//
|
||||
// tensors in every rank
|
||||
//
|
||||
+ /* tijk */ f * nranks * no * no * no
|
||||
+ /* zijk */ f * nranks * no * no * no
|
||||
+ /* epsp */ f * nranks * (no + nv)
|
||||
+ /* tai */ f * nranks * no * nv
|
||||
;
|
||||
|
||||
if (atrip::Atrip::rank == 0)
|
||||
std::cout << "Tentative MEMORY USAGE: " << atrip_memory << "\n";
|
||||
|
||||
|
||||
std::vector<int> symmetries(4, NS)
|
||||
, vo({nv, no})
|
||||
, vvoo({nv, nv, no, no})
|
||||
@ -173,7 +211,7 @@ int main(int argc, char** argv) {
|
||||
try {
|
||||
auto out = atrip::Atrip::run(in);
|
||||
if (atrip::Atrip::rank == 0)
|
||||
std::cout << "Energy: " << out.energy << std::endl;
|
||||
std::cout << "Energy: " << out.energy << std::endl;
|
||||
} catch (const char* msg) {
|
||||
if (atrip::Atrip::rank == 0)
|
||||
std::cout << "Atrip throwed with msg:\n\t\t " << msg << "\n";
|
||||
|
||||
44
bootstrap.sh
Executable file
44
bootstrap.sh
Executable file
@ -0,0 +1,44 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
type -a autoreconf > /dev/null ||
|
||||
{
|
||||
cat <<EOF && exit
|
||||
|
||||
You don't seem to have autotools installed, please install it.
|
||||
|
||||
- https://www.gnu.org/software/autoconf/
|
||||
- https://www.gnu.org/software/automake/
|
||||
|
||||
EOF
|
||||
}
|
||||
|
||||
|
||||
cat <<EOF
|
||||
|
||||
Creating configure script
|
||||
|
||||
EOF
|
||||
|
||||
|
||||
autoreconf -vif .
|
||||
test -f configure || {
|
||||
cat <<EOF
|
||||
|
||||
An error happened and a configure script could not be built!
|
||||
|
||||
EOF
|
||||
exit 1
|
||||
}
|
||||
|
||||
|
||||
cat <<EOF
|
||||
|
||||
Now you can build by doing
|
||||
|
||||
mkdir build
|
||||
cd build
|
||||
../configure
|
||||
make extern
|
||||
make all
|
||||
|
||||
EOF
|
||||
@ -40,7 +40,7 @@ AC_RUN_IFELSE([AC_LANG_SOURCE([_ATRIP_CUDA_MEMORY_OF_DEVICES])],
|
||||
],
|
||||
[
|
||||
atrip_success=no
|
||||
AC_MSG_ERROR([An available device reports zero memory available!])
|
||||
AC_MSG_WARN([An available device reports zero memory available!])
|
||||
])
|
||||
|
||||
CXX="$ac_save_CXX"
|
||||
@ -79,11 +79,11 @@ int main() {
|
||||
cuMemAlloc((CUdeviceptr*)&F_d, els*sizeof(double));
|
||||
|
||||
stat = cublasDgemm(handle,
|
||||
CUBLAS_OP_N,
|
||||
CUBLAS_OP_N,
|
||||
oo, No, Nv,
|
||||
&one,
|
||||
HHP_d, oo, PH_d, Nv, &one, F_d, oo);
|
||||
CUBLAS_OP_N,
|
||||
CUBLAS_OP_N,
|
||||
oo, No, Nv,
|
||||
&one,
|
||||
HHP_d, oo, PH_d, Nv, &one, F_d, oo);
|
||||
//cudaSetDevice(rank);
|
||||
|
||||
return 0;
|
||||
@ -92,6 +92,7 @@ int main() {
|
||||
|
||||
|
||||
m4_define([_ATRIP_CUDA_MEMORY_OF_DEVICES], [[
|
||||
|
||||
#include <mpi.h>
|
||||
#include <iostream>
|
||||
#include <cassert>
|
||||
@ -124,26 +125,27 @@ int main() {
|
||||
cuDeviceTotalMem(&total2, dev);
|
||||
|
||||
printf("\n"
|
||||
"CUDA CARD RANK %d\n"
|
||||
"=================\n"
|
||||
"\tname: %s\n"
|
||||
"\tShared Mem Per Block (KB): %f\n"
|
||||
"\tFree/Total mem (GB): %f/%f\n"
|
||||
"\ttotal2 mem (GB): %f\n"
|
||||
"\n",
|
||||
dev,
|
||||
name,
|
||||
prop.sharedMemPerBlock / 1024.0,
|
||||
_free / 1024.0 / 1024.0 / 1024.0 ,
|
||||
total / 1024.0 / 1024.0 / 1024.0 ,
|
||||
total2 / 1024.0 / 1024.0 / 1024.0
|
||||
);
|
||||
"CUDA CARD RANK %d\n"
|
||||
"=================\n"
|
||||
"\tname: %s\n"
|
||||
"\tShared Mem Per Block (KB): %f\n"
|
||||
"\tFree/Total mem (GB): %f/%f\n"
|
||||
"\ttotal2 mem (GB): %f\n"
|
||||
"\n",
|
||||
dev,
|
||||
name,
|
||||
prop.sharedMemPerBlock / 1024.0,
|
||||
_free / 1024.0 / 1024.0 / 1024.0 ,
|
||||
total / 1024.0 / 1024.0 / 1024.0 ,
|
||||
total2 / 1024.0 / 1024.0 / 1024.0
|
||||
);
|
||||
|
||||
if (_free == 0 || total == 0 || total2 == 0)
|
||||
return 1;
|
||||
return 1;
|
||||
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
]])
|
||||
|
||||
@ -457,7 +457,8 @@ void unwrapAndMarkReady() {
|
||||
|
||||
#if defined(HAVE_CUDA)
|
||||
// copy the retrieved mpi data to the device
|
||||
cuMemcpyHtoD(data, (void*)mpi_data, sizeof(F) * size);
|
||||
WITH_CHRONO("cuda:memcpy",
|
||||
cuMemcpyHtoD(data, (void*)mpi_data, sizeof(F) * size);)
|
||||
std::free(mpi_data);
|
||||
#endif
|
||||
|
||||
|
||||
@ -76,13 +76,13 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
|
||||
LOG(0,"Atrip") << "ngcards: " << ngcards << "\n";
|
||||
if (clusterInfo.ranksPerNode > ngcards) {
|
||||
std::cerr << "ATRIP: You are running on more ranks per node than the number of graphic cards\n"
|
||||
<< "You have " << ngcards << " cards at your disposal\n";
|
||||
<< "You have " << ngcards << " cards at your disposal\n";
|
||||
throw "";
|
||||
}
|
||||
if (clusterInfo.ranksPerNode < ngcards) {
|
||||
std::cerr << "You have " << ngcards << " cards at your disposal\n"
|
||||
<< "You will be only using " << clusterInfo.ranksPerNode
|
||||
<< ", i.e., the nubmer of ranks.\n";
|
||||
<< "You will be only using " << clusterInfo.ranksPerNode
|
||||
<< ", i.e., the nubmer of ranks.\n";
|
||||
}
|
||||
|
||||
|
||||
@ -106,23 +106,23 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
|
||||
cuDeviceTotalMem(&memory.total, dev);
|
||||
|
||||
printf("\n"
|
||||
"CUDA CARD RANK %d\n"
|
||||
"=================\n"
|
||||
"\tnumber: %1$d\n"
|
||||
"\tname: %s\n"
|
||||
"\tMem. clock rate (KHz): %d\n"
|
||||
"\tShared Mem Per Block (KB): %f\n"
|
||||
"\tAvail. Free/Total mem (GB): %f/%f\n"
|
||||
"\tFree memory (GB): %f\n"
|
||||
"\n",
|
||||
Atrip::rank,
|
||||
name,
|
||||
prop.clockRate,
|
||||
prop.sharedMemPerBlock / 1024.0,
|
||||
memory.avail.free / 1024.0 / 1024.0 / 1024.0 ,
|
||||
memory.avail.total / 1024.0 / 1024.0 / 1024.0,
|
||||
memory.total / 1024.0 / 1024.0 / 1024.0
|
||||
);
|
||||
"CUDA CARD RANK %d\n"
|
||||
"=================\n"
|
||||
"\tnumber: %1$ld\n"
|
||||
"\tname: %s\n"
|
||||
"\tMem. clock rate (KHz): %ld\n"
|
||||
"\tShared Mem Per Block (KB): %f\n"
|
||||
"\tAvail. Free/Total mem (GB): %f/%f\n"
|
||||
"\tFree memory (GB): %f\n"
|
||||
"\n",
|
||||
Atrip::rank,
|
||||
name,
|
||||
prop.clockRate,
|
||||
prop.sharedMemPerBlock / 1024.0,
|
||||
memory.avail.free / 1024.0 / 1024.0 / 1024.0 ,
|
||||
memory.avail.total / 1024.0 / 1024.0 / 1024.0,
|
||||
memory.total / 1024.0 / 1024.0 / 1024.0
|
||||
);
|
||||
std::free((void*)name);
|
||||
}
|
||||
MPI_Barrier(universe);
|
||||
@ -249,19 +249,18 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
|
||||
const size_t nIterations = tuplesList.size();
|
||||
{
|
||||
LOG(0,"Atrip") << "#iterations: "
|
||||
<< nIterations
|
||||
<< "/"
|
||||
<< nIterations * np
|
||||
<< "\n";
|
||||
<< nIterations
|
||||
<< "/"
|
||||
<< nIterations * np
|
||||
<< "\n";
|
||||
}
|
||||
|
||||
const size_t
|
||||
iterationMod = (in.percentageMod > 0)
|
||||
? nIterations * in.percentageMod / 100.0
|
||||
: in.iterationMod
|
||||
|
||||
, iteration1Percent = nIterations * 0.01
|
||||
;
|
||||
iterationMod = (in.percentageMod > 0)
|
||||
? nIterations * in.percentageMod / 100.0
|
||||
: in.iterationMod
|
||||
, iteration1Percent = nIterations * 0.01
|
||||
;
|
||||
|
||||
|
||||
|
||||
@ -293,20 +292,20 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
|
||||
|
||||
WITH_CHRONO("oneshot-db:comm:allgather",
|
||||
WITH_CHRONO("db:comm:allgather",
|
||||
MPI_Allgather( ldb.data()
|
||||
// , ldb.size() * sizeof(typename Slice<F>::LocalDatabaseElement)
|
||||
, ldb.size()
|
||||
, MPI_LDB_ELEMENT
|
||||
, db.data()
|
||||
// , ldb.size() * sizeof(typename Slice<F>::LocalDatabaseElement)
|
||||
, ldb.size()
|
||||
, MPI_LDB_ELEMENT
|
||||
, c);
|
||||
MPI_Allgather(ldb.data(),
|
||||
/* ldb.size() * sizeof(typename
|
||||
Slice<F>::LocalDatabaseElement) */
|
||||
ldb.size(),
|
||||
MPI_LDB_ELEMENT,
|
||||
db.data(),
|
||||
/* ldb.size() * sizeof(typename
|
||||
Slice<F>::LocalDatabaseElement), */
|
||||
ldb.size(),
|
||||
MPI_LDB_ELEMENT,
|
||||
c);
|
||||
))
|
||||
|
||||
WITH_CHRONO("db:comm:type:free",
|
||||
MPI_Type_free(&MPI_LDB_ELEMENT);
|
||||
)
|
||||
WITH_CHRONO("db:comm:type:free", MPI_Type_free(&MPI_LDB_ELEMENT);)
|
||||
|
||||
return db;
|
||||
};
|
||||
@ -575,30 +574,29 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
|
||||
)))
|
||||
WITH_CHRONO("oneshot-doubles",
|
||||
WITH_CHRONO("doubles",
|
||||
doublesContribution<F>( abc, (size_t)No, (size_t)Nv
|
||||
// -- VABCI
|
||||
, abph.unwrapSlice(Slice<F>::AB, abc)
|
||||
, abph.unwrapSlice(Slice<F>::AC, abc)
|
||||
, abph.unwrapSlice(Slice<F>::BC, abc)
|
||||
, abph.unwrapSlice(Slice<F>::BA, abc)
|
||||
, abph.unwrapSlice(Slice<F>::CA, abc)
|
||||
, abph.unwrapSlice(Slice<F>::CB, abc)
|
||||
// -- VHHHA
|
||||
, hhha.unwrapSlice(Slice<F>::A, abc)
|
||||
, hhha.unwrapSlice(Slice<F>::B, abc)
|
||||
, hhha.unwrapSlice(Slice<F>::C, abc)
|
||||
// -- TA
|
||||
, taphh.unwrapSlice(Slice<F>::A, abc)
|
||||
, taphh.unwrapSlice(Slice<F>::B, abc)
|
||||
, taphh.unwrapSlice(Slice<F>::C, abc)
|
||||
// -- TABIJ
|
||||
, tabhh.unwrapSlice(Slice<F>::AB, abc)
|
||||
, tabhh.unwrapSlice(Slice<F>::AC, abc)
|
||||
, tabhh.unwrapSlice(Slice<F>::BC, abc)
|
||||
// -- TIJK
|
||||
, (DataFieldType<F>*)Tijk
|
||||
);
|
||||
WITH_RANK << iteration << "-th doubles done\n";
|
||||
doublesContribution<F>(abc, (size_t)No, (size_t)Nv,
|
||||
// -- VABCI
|
||||
abph.unwrapSlice(Slice<F>::AB, abc),
|
||||
abph.unwrapSlice(Slice<F>::AC, abc),
|
||||
abph.unwrapSlice(Slice<F>::BC, abc),
|
||||
abph.unwrapSlice(Slice<F>::BA, abc),
|
||||
abph.unwrapSlice(Slice<F>::CA, abc),
|
||||
abph.unwrapSlice(Slice<F>::CB, abc),
|
||||
// -- VHHHA,
|
||||
hhha.unwrapSlice(Slice<F>::A, abc),
|
||||
hhha.unwrapSlice(Slice<F>::B, abc),
|
||||
hhha.unwrapSlice(Slice<F>::C, abc),
|
||||
// -- TA,
|
||||
taphh.unwrapSlice(Slice<F>::A, abc),
|
||||
taphh.unwrapSlice(Slice<F>::B, abc),
|
||||
taphh.unwrapSlice(Slice<F>::C, abc),
|
||||
// -- TABIJ
|
||||
tabhh.unwrapSlice(Slice<F>::AB, abc),
|
||||
tabhh.unwrapSlice(Slice<F>::AC, abc),
|
||||
tabhh.unwrapSlice(Slice<F>::BC, abc),
|
||||
// -- TIJK
|
||||
(DataFieldType<F>*)Tijk);
|
||||
WITH_RANK << iteration << "-th doubles done\n";
|
||||
))
|
||||
}
|
||||
|
||||
@ -618,16 +616,19 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
|
||||
)
|
||||
WITH_CHRONO("singles",
|
||||
#if defined(HAVE_CUDA)
|
||||
singlesContribution<F><<<1,1>>>( No, Nv, abc[0], abc[1], abc[2]
|
||||
, (DataFieldType<F>*)Tai
|
||||
singlesContribution<F><<<1,1>>>(No, Nv, abc[0], abc[1], abc[2],
|
||||
(DataFieldType<F>*)Tai,
|
||||
#else
|
||||
singlesContribution<F>( No, Nv, abc[0], abc[1], abc[2]
|
||||
, Tai.data()
|
||||
singlesContribution<F>(No, Nv, abc[0], abc[1], abc[2],
|
||||
Tai.data(),
|
||||
#endif
|
||||
, (DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::AB, abc)
|
||||
, (DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::AC, abc)
|
||||
, (DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::BC, abc)
|
||||
, (DataFieldType<F>*)Zijk);
|
||||
(DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::AB,
|
||||
abc),
|
||||
(DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::AC,
|
||||
abc),
|
||||
(DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::BC,
|
||||
abc),
|
||||
(DataFieldType<F>*)Zijk);
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
@ -77,7 +77,7 @@ namespace cuda {
|
||||
to[i] = maybeConjugateScalar<F>(from[i]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
template <typename F>
|
||||
__global__
|
||||
@ -132,30 +132,29 @@ namespace cuda {
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_CUDA)
|
||||
#define LIMS_KS \
|
||||
size_t \
|
||||
kmin = blockIdx.x * blockDim.x + threadIdx.x, \
|
||||
k = kmin, \
|
||||
idx = kmin * size * size * size \
|
||||
; \
|
||||
#define LIMS_KS() \
|
||||
size_t kmin = blockIdx.x * blockDim.x + threadIdx.x, \
|
||||
k = kmin, \
|
||||
idx = kmin * size * size * size \
|
||||
; \
|
||||
k < (kmin < size) ? kmin + 1 : size
|
||||
#else
|
||||
#define LIMS_KS size_t k=0, idx=0; k < size
|
||||
#endif
|
||||
#define _IJK_(i, j, k) i + j*size + k*size*size
|
||||
#define _REORDER_BODY_(...) \
|
||||
for (LIMS_KS ; k++) \
|
||||
for (size_t j = 0; j < size; j++) \
|
||||
for (size_t i = 0; i < size; i++, idx++) { \
|
||||
__VA_ARGS__ \
|
||||
}
|
||||
#define _MAKE_REORDER_(_enum, ...) \
|
||||
template <typename F> \
|
||||
__MAYBE_GLOBAL__ \
|
||||
void reorder(reorder_proxy< F, _enum > p, \
|
||||
size_t size, F* to, F* from) { \
|
||||
_REORDER_BODY_(__VA_ARGS__) \
|
||||
}
|
||||
#define _REORDER_BODY_(...) \
|
||||
for (LIMS_KS() ; k++) \
|
||||
for (size_t j = 0; j < size; j++) \
|
||||
for (size_t i = 0; i < size; i++, idx++) { \
|
||||
__VA_ARGS__ \
|
||||
}
|
||||
#define _MAKE_REORDER_(_enum, ...) \
|
||||
template <typename F> \
|
||||
__MAYBE_GLOBAL__ \
|
||||
void reorder(reorder_proxy< F, _enum > p, \
|
||||
size_t size, F* to, F* from) { \
|
||||
_REORDER_BODY_(__VA_ARGS__) \
|
||||
}
|
||||
#if defined(HAVE_CUDA)
|
||||
#define GO(__TO, __FROM) cuda::sum_in_place<F>(&__TO, &__FROM);
|
||||
#else
|
||||
@ -166,7 +165,7 @@ namespace cuda {
|
||||
template <typename F, reordering_t R>
|
||||
__MAYBE_GLOBAL__ \
|
||||
void reorder(reorder_proxy<F, R> proxy,
|
||||
size_t size, F* to, F* from);
|
||||
size_t size, F* to, F* from);
|
||||
|
||||
_MAKE_REORDER_(IJK, GO(to[idx], from[_IJK_(i, j, k)]))
|
||||
_MAKE_REORDER_(IKJ, GO(to[idx], from[_IJK_(i, k, j)]))
|
||||
@ -174,7 +173,7 @@ namespace cuda {
|
||||
_MAKE_REORDER_(JKI, GO(to[idx], from[_IJK_(j, k, i)]))
|
||||
_MAKE_REORDER_(KIJ, GO(to[idx], from[_IJK_(k, i, j)]))
|
||||
_MAKE_REORDER_(KJI, GO(to[idx], from[_IJK_(k, j, i)]))
|
||||
|
||||
|
||||
|
||||
#undef LIMS_KS
|
||||
#undef _MAKE_REORDER
|
||||
@ -446,50 +445,59 @@ double getEnergySame
|
||||
|
||||
#if defined(ATRIP_USE_DGEMM)
|
||||
#if defined(HAVE_CUDA)
|
||||
#define REORDER(__II, __JJ, __KK) \
|
||||
reorder<<< \
|
||||
bs, ths \
|
||||
>>>(reorder_proxy<DataFieldType<F>, __II ## __JJ ## __KK >{}, \
|
||||
No, Tijk, _t_buffer);
|
||||
#define DGEMM_PARTICLES(__A, __B) \
|
||||
atrip::xgemm<F>("T", \
|
||||
"N", \
|
||||
(int const*)&NoNo, \
|
||||
(int const*)&No, \
|
||||
(int const*)&Nv, \
|
||||
&one, \
|
||||
(DataFieldType<F>*)__A, \
|
||||
(int const*)&Nv, \
|
||||
(DataFieldType<F>*)__B, \
|
||||
(int const*)&Nv, \
|
||||
&zero, \
|
||||
_t_buffer, \
|
||||
(int const*)&NoNo);
|
||||
#define DGEMM_HOLES(__A, __B, __TRANSB) \
|
||||
atrip::xgemm<F>("N", \
|
||||
__TRANSB, \
|
||||
(int const*)&NoNo, \
|
||||
(int const*)&No, \
|
||||
(int const*)&No, \
|
||||
&m_one, \
|
||||
__A, \
|
||||
(int const*)&NoNo, \
|
||||
(DataFieldType<F>*)__B, \
|
||||
(int const*)&No, \
|
||||
&zero, \
|
||||
_t_buffer, \
|
||||
(int const*)&NoNo \
|
||||
);
|
||||
#define REORDER(__II, __JJ, __KK) \
|
||||
reorder<<<bs, ths>>>(reorder_proxy<DataFieldType<F>, \
|
||||
__II ## __JJ ## __KK >{}, \
|
||||
No, Tijk, _t_buffer);
|
||||
#define DGEMM_PARTICLES(__A, __B) \
|
||||
atrip::xgemm<F>("T", \
|
||||
"N", \
|
||||
(int const*)&NoNo, \
|
||||
(int const*)&No, \
|
||||
(int const*)&Nv, \
|
||||
&one, \
|
||||
(DataFieldType<F>*)__A, \
|
||||
(int const*)&Nv, \
|
||||
(DataFieldType<F>*)__B, \
|
||||
(int const*)&Nv, \
|
||||
&zero, \
|
||||
_t_buffer, \
|
||||
(int const*)&NoNo);
|
||||
#define DGEMM_HOLES(__A, __B, __TRANSB) \
|
||||
atrip::xgemm<F>("N", \
|
||||
__TRANSB, \
|
||||
(int const*)&NoNo, \
|
||||
(int const*)&No, \
|
||||
(int const*)&No, \
|
||||
&m_one, \
|
||||
__A, \
|
||||
(int const*)&NoNo, \
|
||||
(DataFieldType<F>*)__B, \
|
||||
(int const*)&No, \
|
||||
&zero, \
|
||||
_t_buffer, \
|
||||
(int const*)&NoNo \
|
||||
);
|
||||
#define MAYBE_CONJ(_conj, _buffer) \
|
||||
cuda::maybeConjugate<<< \
|
||||
Atrip::kernelDimensions.ooo.blocks, \
|
||||
Atrip::kernelDimensions.ooo.threads \
|
||||
>>>((DataFieldType<F>*)_conj, (DataFieldType<F>*)_buffer, NoNoNo);
|
||||
cuda::maybeConjugate<<< \
|
||||
Atrip::kernelDimensions.ooo.blocks, \
|
||||
Atrip::kernelDimensions.ooo.threads \
|
||||
>>>((DataFieldType<F>*)_conj, (DataFieldType<F>*)_buffer, NoNoNo);
|
||||
|
||||
|
||||
// END CUDA ////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
#else
|
||||
// NONCUDA //////////////////////////////////////////////////////////////////////
|
||||
#define REORDER(__II, __JJ, __KK) \
|
||||
reorder(reorder_proxy<DataFieldType<F>, __II ## __JJ ## __KK >{}, \
|
||||
No, Tijk, _t_buffer);
|
||||
|
||||
|
||||
// NONCUDA /////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
#define REORDER(__II, __JJ, __KK) \
|
||||
reorder(reorder_proxy<DataFieldType<F>, \
|
||||
__II ## __JJ ## __KK >{}, \
|
||||
No, Tijk, _t_buffer);
|
||||
#define DGEMM_PARTICLES(__A, __B) \
|
||||
atrip::xgemm<F>("T", \
|
||||
"N", \
|
||||
@ -502,7 +510,7 @@ double getEnergySame
|
||||
__B, \
|
||||
(int const*)&Nv, \
|
||||
&zero, \
|
||||
_t_buffer, \
|
||||
_t_buffer, \
|
||||
(int const*)&NoNo \
|
||||
);
|
||||
#define DGEMM_HOLES(__A, __B, __TRANSB) \
|
||||
@ -517,8 +525,8 @@ double getEnergySame
|
||||
__B, \
|
||||
(int const*)&No, \
|
||||
&zero, \
|
||||
_t_buffer, \
|
||||
(int const*)&NoNo \
|
||||
_t_buffer, \
|
||||
(int const*)&NoNo \
|
||||
);
|
||||
#define MAYBE_CONJ(_conj, _buffer) \
|
||||
for (size_t __i = 0; __i < NoNoNo; ++__i) \
|
||||
@ -550,9 +558,10 @@ double getEnergySame
|
||||
// Set Tijk to zero
|
||||
#ifdef HAVE_CUDA
|
||||
WITH_CHRONO("double:reorder",
|
||||
cuda::zeroing<<<bs, ths>>>((DataFieldType<F>*)Tijk, NoNoNo);
|
||||
// synchronize all initializations to zero
|
||||
)
|
||||
cuda::zeroing<<<bs, ths>>>((DataFieldType<F>*)Tijk,
|
||||
NoNoNo);
|
||||
// synchronize all initializations to zero
|
||||
)
|
||||
#else
|
||||
WITH_CHRONO("double:reorder",
|
||||
for (size_t k = 0; k < NoNoNo; k++) {
|
||||
@ -640,6 +649,8 @@ double getEnergySame
|
||||
|
||||
{ // free resources
|
||||
#ifdef HAVE_CUDA
|
||||
// we need to synchronize here since we need
|
||||
// the Tijk for next process in the pipeline
|
||||
cuCtxSynchronize();
|
||||
cuMemFree((CUdeviceptr)_vhhh);
|
||||
cuMemFree((CUdeviceptr)_t_buffer);
|
||||
|
||||
Loading…
Reference in New Issue
Block a user