Compare commits
7 Commits
76a785044d ... 4277c07cc2

| Author | SHA1 | Date |
|---|---|---|
|  | 4277c07cc2 |  |
|  | 0558148937 |  |
|  | 49ff3b377c |  |
|  | 00a28c990c |  |
|  | 2c5a4620ca |  |
|  | 368c5619cc |  |
|  | 0b14ac7704 |  |
```diff
@@ -58,8 +58,9 @@ int main(int argc, char** argv) {
   CLI11_PARSE(app, argc, argv);
 
   CTF::World world(argc, argv);
-  int rank;
+  int rank, nranks;
   MPI_Comm_rank(world.comm, &rank);
+  MPI_Comm_size(world.comm, &nranks);
   constexpr double elem_to_gb = 8.0 / 1024.0 / 1024.0 / 1024.0;
 
   // USER PRINTING TEST BEGIN
@@ -108,6 +109,43 @@ int main(int argc, char** argv) {
     }
   }
 
+  size_t
+    f = sizeof(double)
+    , n_tuples = nv * (nv + 1) * (nv + 2) / 6 - nv
+    , atrip_memory
+      = /* tuples_memory */ 3 * sizeof(size_t) * n_tuples
+      //
+      // one dimensional slices (all ranks)
+      //
+      + /* taphh */ f * nranks * 6 * nv * no * no
+      + /* hhha */ f * nranks * 6 * no * no * no
+      //
+      // two dimensional slices (all ranks)
+      //
+      + /* abph */ f * nranks * 12 * nv * no
+      + /* abhh */ f * nranks * 6 * no * no
+      + /* tabhh */ f * nranks * 6 * no * no
+      //
+      // distributed sources (all ranks)
+      //
+      + /* tpphh */ f * nv * nv * no * no
+      + /* vhhhp */ f * no * no * no * nv
+      + /* vppph */ f * nv * nv * nv * no
+      + /* vpphh */ f * nv * nv * no * no
+      + /* tpphh2 */ f * nv * nv * no * no
+      //
+      // tensors in every rank
+      //
+      + /* tijk */ f * nranks * no * no * no
+      + /* zijk */ f * nranks * no * no * no
+      + /* epsp */ f * nranks * (no + nv)
+      + /* tai */ f * nranks * no * nv
+      ;
+
+  if (atrip::Atrip::rank == 0)
+    std::cout << "Tentative MEMORY USAGE: " << atrip_memory << "\n";
+
+
   std::vector<int> symmetries(4, NS)
     , vo({nv, no})
     , vvoo({nv, nv, no, no})
```
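For reference, the tentative memory estimate introduced above can be reproduced as a standalone check. This is only a sketch: the names `no`, `nv`, `nranks` and the per-tensor terms are taken from the patch, while the concrete values below are illustrative (in the patch they come from the command-line options and `MPI_Comm_size`).

```cpp
#include <cstddef>
#include <cstdio>

int main() {
  // Illustrative values only; not taken from the commits.
  const size_t no = 100, nv = 400, nranks = 48;
  const size_t f = sizeof(double);

  // Same closed form as in the patch: number of distinct (a, b, c) tuples.
  const size_t n_tuples = nv * (nv + 1) * (nv + 2) / 6 - nv;

  const size_t atrip_memory
      = /* tuples  */ 3 * sizeof(size_t) * n_tuples
      // one dimensional slices (all ranks)
      + /* taphh  */ f * nranks * 6 * nv * no * no
      + /* hhha   */ f * nranks * 6 * no * no * no
      // two dimensional slices (all ranks)
      + /* abph   */ f * nranks * 12 * nv * no
      + /* abhh   */ f * nranks * 6 * no * no
      + /* tabhh  */ f * nranks * 6 * no * no
      // distributed sources (all ranks)
      + /* tpphh  */ f * nv * nv * no * no
      + /* vhhhp  */ f * no * no * no * nv
      + /* vppph  */ f * nv * nv * nv * no
      + /* vpphh  */ f * nv * nv * no * no
      + /* tpphh2 */ f * nv * nv * no * no
      // tensors held by every rank
      + /* tijk   */ f * nranks * no * no * no
      + /* zijk   */ f * nranks * no * no * no
      + /* epsp   */ f * nranks * (no + nv)
      + /* tai    */ f * nranks * no * nv;

  std::printf("Tentative MEMORY USAGE: %.2f GB\n",
              atrip_memory / 1024.0 / 1024.0 / 1024.0);
  return 0;
}
```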
```diff
@@ -173,7 +211,7 @@ int main(int argc, char** argv) {
   try {
     auto out = atrip::Atrip::run(in);
     if (atrip::Atrip::rank == 0)
      std::cout << "Energy: " << out.energy << std::endl;
   } catch (const char* msg) {
     if (atrip::Atrip::rank == 0)
      std::cout << "Atrip throwed with msg:\n\t\t " << msg << "\n";
```
bootstrap.sh (new executable file, 44 lines)

```diff
@@ -0,0 +1,44 @@
+#!/usr/bin/env bash
+
+type -a autoreconf > /dev/null ||
+  {
+    cat <<EOF && exit
+
+You don't seem to have autotools installed, please install it.
+
+- https://www.gnu.org/software/autoconf/
+- https://www.gnu.org/software/automake/
+
+EOF
+  }
+
+
+cat <<EOF
+
+Creating configure script
+
+EOF
+
+
+autoreconf -vif .
+test -f configure || {
+  cat <<EOF
+
+An error happened and a configure script could not be built!
+
+EOF
+  exit 1
+}
+
+
+cat <<EOF
+
+Now you can build by doing
+
+  mkdir build
+  cd build
+  ../configure
+  make extern
+  make all
+
+EOF
```
```diff
@@ -40,7 +40,7 @@ AC_RUN_IFELSE([AC_LANG_SOURCE([_ATRIP_CUDA_MEMORY_OF_DEVICES])],
               ],
               [
                 atrip_success=no
-                AC_MSG_ERROR([An available device reports zero memory available!])
+                AC_MSG_WARN([An available device reports zero memory available!])
               ])
 
 CXX="$ac_save_CXX"
@@ -79,11 +79,11 @@ int main() {
   cuMemAlloc((CUdeviceptr*)&F_d, els*sizeof(double));
 
   stat = cublasDgemm(handle,
                      CUBLAS_OP_N,
                      CUBLAS_OP_N,
                      oo, No, Nv,
                      &one,
                      HHP_d, oo, PH_d, Nv, &one, F_d, oo);
   //cudaSetDevice(rank);
 
   return 0;
```
```diff
@@ -92,6 +92,7 @@ int main() {
 
+
 m4_define([_ATRIP_CUDA_MEMORY_OF_DEVICES], [[
 
 #include <mpi.h>
 #include <iostream>
 #include <cassert>
@@ -124,26 +125,27 @@ int main() {
     cuDeviceTotalMem(&total2, dev);
 
     printf("\n"
           "CUDA CARD RANK %d\n"
           "=================\n"
           "\tname: %s\n"
           "\tShared Mem Per Block (KB): %f\n"
           "\tFree/Total mem (GB): %f/%f\n"
           "\ttotal2 mem (GB): %f\n"
           "\n",
           dev,
           name,
           prop.sharedMemPerBlock / 1024.0,
           _free / 1024.0 / 1024.0 / 1024.0 ,
           total / 1024.0 / 1024.0 / 1024.0 ,
           total2 / 1024.0 / 1024.0 / 1024.0
           );
 
     if (_free == 0 || total == 0 || total2 == 0)
       return 1;
 
   }
 
   return 0;
 }
+
 ]])
```
```diff
@@ -457,7 +457,8 @@ void unwrapAndMarkReady() {
 
 #if defined(HAVE_CUDA)
       // copy the retrieved mpi data to the device
-      cuMemcpyHtoD(data, (void*)mpi_data, sizeof(F) * size);
+      WITH_CHRONO("cuda:memcpy",
+                  cuMemcpyHtoD(data, (void*)mpi_data, sizeof(F) * size);)
       std::free(mpi_data);
 #endif
 
```
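The host-to-device copy is now wrapped in `WITH_CHRONO` so that it is accounted for under the `cuda:memcpy` timer. As a mental model only (this is a hypothetical stand-in, not atrip's actual macro definition), a timing wrapper with this call shape can be sketched as:

```cpp
#include <chrono>
#include <iostream>
#include <map>
#include <string>

// Hypothetical stand-in for atrip's timing infrastructure: accumulate the
// wall-clock time spent inside a named region.
static std::map<std::string, double> chrono_seconds;

#define WITH_CHRONO(name, ...)                                            \
  {                                                                       \
    const auto _start = std::chrono::steady_clock::now();                 \
    __VA_ARGS__                                                           \
    chrono_seconds[name] += std::chrono::duration<double>(                \
        std::chrono::steady_clock::now() - _start).count();               \
  }

int main() {
  // Usage mirrors the diff: WITH_CHRONO("label", statements;)
  WITH_CHRONO("example:busy-loop",
              volatile double x = 0;
              for (int i = 0; i < 1000000; ++i) x += 1e-9 * i;)
  std::cout << "example:busy-loop took "
            << chrono_seconds["example:busy-loop"] << " s\n";
  return 0;
}
```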
```diff
@@ -76,13 +76,13 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
   LOG(0,"Atrip") << "ngcards: " << ngcards << "\n";
   if (clusterInfo.ranksPerNode > ngcards) {
     std::cerr << "ATRIP: You are running on more ranks per node than the number of graphic cards\n"
               << "You have " << ngcards << " cards at your disposal\n";
     throw "";
   }
   if (clusterInfo.ranksPerNode < ngcards) {
     std::cerr << "You have " << ngcards << " cards at your disposal\n"
               << "You will be only using " << clusterInfo.ranksPerNode
               << ", i.e., the nubmer of ranks.\n";
   }
 
 
@@ -106,23 +106,23 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
     cuDeviceTotalMem(&memory.total, dev);
 
     printf("\n"
           "CUDA CARD RANK %d\n"
           "=================\n"
-          "\tnumber: %1$d\n"
+          "\tnumber: %1$ld\n"
           "\tname: %s\n"
-          "\tMem. clock rate (KHz): %d\n"
+          "\tMem. clock rate (KHz): %ld\n"
           "\tShared Mem Per Block (KB): %f\n"
           "\tAvail. Free/Total mem (GB): %f/%f\n"
           "\tFree memory (GB): %f\n"
           "\n",
           Atrip::rank,
           name,
           prop.clockRate,
           prop.sharedMemPerBlock / 1024.0,
           memory.avail.free / 1024.0 / 1024.0 / 1024.0 ,
           memory.avail.total / 1024.0 / 1024.0 / 1024.0,
           memory.total / 1024.0 / 1024.0 / 1024.0
           );
     std::free((void*)name);
   }
   MPI_Barrier(universe);
```
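As background on the `%1$ld` conversions introduced above: the `N$` form is the POSIX positional-argument extension of printf, which selects a variadic argument by its position and requires the length modifier to match the argument's type. A small illustration (illustrative values, POSIX systems):

```cpp
#include <cstdio>

int main() {
  // %1$ refers to the first variadic argument, so a value can be reused or
  // reordered; %ld expects a long.
  const long device = 2;
  const long clock_khz = 1215000;
  std::printf("card %1$ld: mem clock %2$ld KHz (device id again: %1$ld)\n",
              device, clock_khz);
  return 0;
}
```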
```diff
@@ -249,19 +249,18 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
   const size_t nIterations = tuplesList.size();
   {
     LOG(0,"Atrip") << "#iterations: "
                    << nIterations
                    << "/"
                    << nIterations * np
                    << "\n";
   }
 
   const size_t
     iterationMod = (in.percentageMod > 0)
                  ? nIterations * in.percentageMod / 100.0
                  : in.iterationMod
-
     , iteration1Percent = nIterations * 0.01
     ;
 
 
 
@@ -293,20 +292,20 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
 
     WITH_CHRONO("oneshot-db:comm:allgather",
     WITH_CHRONO("db:comm:allgather",
-      MPI_Allgather( ldb.data()
-                   // , ldb.size() * sizeof(typename Slice<F>::LocalDatabaseElement)
-                   , ldb.size()
-                   , MPI_LDB_ELEMENT
-                   , db.data()
-                   // , ldb.size() * sizeof(typename Slice<F>::LocalDatabaseElement)
-                   , ldb.size()
-                   , MPI_LDB_ELEMENT
-                   , c);
+      MPI_Allgather(ldb.data(),
+                    /* ldb.size() * sizeof(typename
+                       Slice<F>::LocalDatabaseElement) */
+                    ldb.size(),
+                    MPI_LDB_ELEMENT,
+                    db.data(),
+                    /* ldb.size() * sizeof(typename
+                       Slice<F>::LocalDatabaseElement), */
+                    ldb.size(),
+                    MPI_LDB_ELEMENT,
+                    c);
     ))
 
-    WITH_CHRONO("db:comm:type:free",
-      MPI_Type_free(&MPI_LDB_ELEMENT);
-    )
+    WITH_CHRONO("db:comm:type:free", MPI_Type_free(&MPI_LDB_ELEMENT);)
 
     return db;
   };
```
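The reflowed call above follows the standard `MPI_Allgather` argument order: send buffer, send count, send type, receive buffer, per-rank receive count, receive type, communicator. A minimal self-contained illustration, gathering one `int` per rank instead of the patch's `LocalDatabaseElement` records with their custom MPI datatype:

```cpp
#include <mpi.h>
#include <cstdio>
#include <vector>

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);
  int rank, np;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &np);

  int local = rank * rank;        // one element contributed by this rank
  std::vector<int> gathered(np);  // np elements received on every rank
  MPI_Allgather(&local, 1, MPI_INT,
                gathered.data(), 1, MPI_INT,
                MPI_COMM_WORLD);

  if (rank == 0)
    for (int r = 0; r < np; ++r)
      std::printf("rank %d sent %d\n", r, gathered[r]);

  MPI_Finalize();
  return 0;
}
```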
```diff
@@ -575,30 +574,29 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
       )))
       WITH_CHRONO("oneshot-doubles",
       WITH_CHRONO("doubles",
-        doublesContribution<F>( abc, (size_t)No, (size_t)Nv
+        doublesContribution<F>(abc, (size_t)No, (size_t)Nv,
                                // -- VABCI
-                               , abph.unwrapSlice(Slice<F>::AB, abc)
-                               , abph.unwrapSlice(Slice<F>::AC, abc)
-                               , abph.unwrapSlice(Slice<F>::BC, abc)
-                               , abph.unwrapSlice(Slice<F>::BA, abc)
-                               , abph.unwrapSlice(Slice<F>::CA, abc)
-                               , abph.unwrapSlice(Slice<F>::CB, abc)
-                               // -- VHHHA
-                               , hhha.unwrapSlice(Slice<F>::A, abc)
-                               , hhha.unwrapSlice(Slice<F>::B, abc)
-                               , hhha.unwrapSlice(Slice<F>::C, abc)
-                               // -- TA
-                               , taphh.unwrapSlice(Slice<F>::A, abc)
-                               , taphh.unwrapSlice(Slice<F>::B, abc)
-                               , taphh.unwrapSlice(Slice<F>::C, abc)
+                               abph.unwrapSlice(Slice<F>::AB, abc),
+                               abph.unwrapSlice(Slice<F>::AC, abc),
+                               abph.unwrapSlice(Slice<F>::BC, abc),
+                               abph.unwrapSlice(Slice<F>::BA, abc),
+                               abph.unwrapSlice(Slice<F>::CA, abc),
+                               abph.unwrapSlice(Slice<F>::CB, abc),
+                               // -- VHHHA,
+                               hhha.unwrapSlice(Slice<F>::A, abc),
+                               hhha.unwrapSlice(Slice<F>::B, abc),
+                               hhha.unwrapSlice(Slice<F>::C, abc),
+                               // -- TA,
+                               taphh.unwrapSlice(Slice<F>::A, abc),
+                               taphh.unwrapSlice(Slice<F>::B, abc),
+                               taphh.unwrapSlice(Slice<F>::C, abc),
                                // -- TABIJ
-                               , tabhh.unwrapSlice(Slice<F>::AB, abc)
-                               , tabhh.unwrapSlice(Slice<F>::AC, abc)
-                               , tabhh.unwrapSlice(Slice<F>::BC, abc)
+                               tabhh.unwrapSlice(Slice<F>::AB, abc),
+                               tabhh.unwrapSlice(Slice<F>::AC, abc),
+                               tabhh.unwrapSlice(Slice<F>::BC, abc),
                                // -- TIJK
-                               , (DataFieldType<F>*)Tijk
-        );
-        WITH_RANK << iteration << "-th doubles done\n";
+                               (DataFieldType<F>*)Tijk);
+        WITH_RANK << iteration << "-th doubles done\n";
       ))
     }
 
@@ -618,16 +616,19 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
       )
       WITH_CHRONO("singles",
 #if defined(HAVE_CUDA)
-        singlesContribution<F><<<1,1>>>( No, Nv, abc[0], abc[1], abc[2]
-                                       , (DataFieldType<F>*)Tai
+        singlesContribution<F><<<1,1>>>(No, Nv, abc[0], abc[1], abc[2],
+                                        (DataFieldType<F>*)Tai,
 #else
-        singlesContribution<F>( No, Nv, abc[0], abc[1], abc[2]
-                              , Tai.data()
+        singlesContribution<F>(No, Nv, abc[0], abc[1], abc[2],
+                               Tai.data(),
 #endif
-                              , (DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::AB, abc)
-                              , (DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::AC, abc)
-                              , (DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::BC, abc)
-                              , (DataFieldType<F>*)Zijk);
+                               (DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::AB,
+                                                                   abc),
+                               (DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::AC,
+                                                                   abc),
+                               (DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::BC,
+                                                                   abc),
+                               (DataFieldType<F>*)Zijk);
       )
     }
 
```
```diff
@@ -77,7 +77,7 @@ namespace cuda {
     to[i] = maybeConjugateScalar<F>(from[i]);
   }
 }
 
 
 template <typename F>
 __global__
@@ -132,30 +132,29 @@ namespace cuda {
 #endif
 
 #if defined(HAVE_CUDA)
-#define LIMS_KS                                                 \
-  size_t                                                        \
-    kmin = blockIdx.x * blockDim.x + threadIdx.x,               \
-    k = kmin,                                                   \
-    idx = kmin * size * size * size                             \
-    ;                                                           \
+#define LIMS_KS()                                               \
+  size_t kmin = blockIdx.x * blockDim.x + threadIdx.x,          \
+         k = kmin,                                              \
+         idx = kmin * size * size * size                        \
+    ;                                                           \
   k < (kmin < size) ? kmin + 1 : size
 #else
 #define LIMS_KS size_t k=0, idx=0; k < size
 #endif
 #define _IJK_(i, j, k) i + j*size + k*size*size
 #define _REORDER_BODY_(...)                                     \
-  for (LIMS_KS ; k++)                                           \
+  for (LIMS_KS() ; k++)                                         \
     for (size_t j = 0; j < size; j++)                           \
       for (size_t i = 0; i < size; i++, idx++) {                \
         __VA_ARGS__                                             \
       }
 #define _MAKE_REORDER_(_enum, ...)                              \
   template <typename F>                                         \
   __MAYBE_GLOBAL__                                              \
   void reorder(reorder_proxy< F, _enum > p,                     \
                size_t size, F* to, F* from) {                   \
     _REORDER_BODY_(__VA_ARGS__)                                 \
   }
 #if defined(HAVE_CUDA)
 #define GO(__TO, __FROM) cuda::sum_in_place<F>(&__TO, &__FROM);
 #else
@@ -166,7 +165,7 @@ namespace cuda {
 template <typename F, reordering_t R>
 __MAYBE_GLOBAL__ \
 void reorder(reorder_proxy<F, R> proxy,
              size_t size, F* to, F* from);
 
 _MAKE_REORDER_(IJK, GO(to[idx], from[_IJK_(i, j, k)]))
 _MAKE_REORDER_(IKJ, GO(to[idx], from[_IJK_(i, k, j)]))
@@ -174,7 +173,7 @@ namespace cuda {
 _MAKE_REORDER_(JKI, GO(to[idx], from[_IJK_(j, k, i)]))
 _MAKE_REORDER_(KIJ, GO(to[idx], from[_IJK_(k, i, j)]))
 _MAKE_REORDER_(KJI, GO(to[idx], from[_IJK_(k, j, i)]))
 
 
 #undef LIMS_KS
 #undef _MAKE_REORDER
@@ -446,50 +445,59 @@ double getEnergySame
 
 #if defined(ATRIP_USE_DGEMM)
 #if defined(HAVE_CUDA)
 #define REORDER(__II, __JJ, __KK)                                       \
-  reorder<<<                                                            \
-    bs, ths                                                             \
-    >>>(reorder_proxy<DataFieldType<F>, __II ## __JJ ## __KK >{},       \
-        No, Tijk, _t_buffer);
+  reorder<<<bs, ths>>>(reorder_proxy<DataFieldType<F>,                  \
+                       __II ## __JJ ## __KK >{},                        \
+                       No, Tijk, _t_buffer);
 #define DGEMM_PARTICLES(__A, __B)                                       \
   atrip::xgemm<F>("T",                                                  \
                   "N",                                                  \
                   (int const*)&NoNo,                                    \
                   (int const*)&No,                                      \
                   (int const*)&Nv,                                      \
                   &one,                                                 \
                   (DataFieldType<F>*)__A,                               \
                   (int const*)&Nv,                                      \
                   (DataFieldType<F>*)__B,                               \
                   (int const*)&Nv,                                      \
                   &zero,                                                \
                   _t_buffer,                                            \
                   (int const*)&NoNo);
 #define DGEMM_HOLES(__A, __B, __TRANSB)                                 \
   atrip::xgemm<F>("N",                                                  \
                   __TRANSB,                                             \
                   (int const*)&NoNo,                                    \
                   (int const*)&No,                                      \
                   (int const*)&No,                                      \
                   &m_one,                                               \
                   __A,                                                  \
                   (int const*)&NoNo,                                    \
                   (DataFieldType<F>*)__B,                               \
                   (int const*)&No,                                      \
                   &zero,                                                \
                   _t_buffer,                                            \
                   (int const*)&NoNo                                     \
                   );
 #define MAYBE_CONJ(_conj, _buffer)                                      \
   cuda::maybeConjugate<<<                                               \
     Atrip::kernelDimensions.ooo.blocks,                                 \
     Atrip::kernelDimensions.ooo.threads                                 \
   >>>((DataFieldType<F>*)_conj, (DataFieldType<F>*)_buffer, NoNoNo);
 
 
+// END CUDA ////////////////////////////////////////////////////////////////////
+
 
 #else
-// NONCUDA //////////////////////////////////////////////////////////////////////
+
+// NONCUDA /////////////////////////////////////////////////////////////////////
+
+
 #define REORDER(__II, __JJ, __KK)                                       \
-  reorder(reorder_proxy<DataFieldType<F>, __II ## __JJ ## __KK >{},     \
+  reorder(reorder_proxy<DataFieldType<F>,                               \
+          __II ## __JJ ## __KK >{},                                     \
           No, Tijk, _t_buffer);
 #define DGEMM_PARTICLES(__A, __B)                                       \
   atrip::xgemm<F>("T",                                                  \
                   "N",                                                  \
```
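Assuming `atrip::xgemm` forwards its arguments to a standard BLAS xGEMM call `(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)` — an assumption, not something stated in this diff — the `DGEMM_PARTICLES` macro above, with TRANSA = "T", M = NoNo, N = No, K = Nv, ALPHA = 1 and BETA = 0, corresponds to

$$
C = A^{\mathsf T} B,
\qquad A \in \mathbb{R}^{N_v \times N_o N_o},\;
B \in \mathbb{R}^{N_v \times N_o},\;
C \in \mathbb{R}^{N_o N_o \times N_o},
$$

that is, $C_{(ij),k} = \sum_{e} A_{e,(ij)}\, B_{e,k}$ accumulated into `_t_buffer`. The reformatting in this hunk only changes the macro layout, not these arguments.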
```diff
@@ -502,7 +510,7 @@ double getEnergySame
                   __B,                                                  \
                   (int const*)&Nv,                                      \
                   &zero,                                                \
                   _t_buffer,                                            \
                   (int const*)&NoNo                                     \
                   );
 #define DGEMM_HOLES(__A, __B, __TRANSB)                                 \
@@ -517,8 +525,8 @@ double getEnergySame
                   __B,                                                  \
                   (int const*)&No,                                      \
                   &zero,                                                \
                   _t_buffer,                                            \
                   (int const*)&NoNo                                     \
                   );
 #define MAYBE_CONJ(_conj, _buffer)                                      \
   for (size_t __i = 0; __i < NoNoNo; ++__i)                             \
@@ -550,9 +558,10 @@ double getEnergySame
   // Set Tijk to zero
 #ifdef HAVE_CUDA
   WITH_CHRONO("double:reorder",
-    cuda::zeroing<<<bs, ths>>>((DataFieldType<F>*)Tijk, NoNoNo);
+    cuda::zeroing<<<bs, ths>>>((DataFieldType<F>*)Tijk,
+                               NoNoNo);
     // synchronize all initializations to zero
   )
 #else
   WITH_CHRONO("double:reorder",
     for (size_t k = 0; k < NoNoNo; k++) {
@@ -640,6 +649,8 @@ double getEnergySame
 
   { // free resources
 #ifdef HAVE_CUDA
+    // we need to synchronize here since we need
+    // the Tijk for next process in the pipeline
     cuCtxSynchronize();
     cuMemFree((CUdeviceptr)_vhhh);
     cuMemFree((CUdeviceptr)_t_buffer);
```