Compare commits
7 Commits
76a785044d
...
4277c07cc2
| Author | SHA1 | Date | |
|---|---|---|---|
| 4277c07cc2 | |||
| 0558148937 | |||
| 49ff3b377c | |||
| 00a28c990c | |||
| 2c5a4620ca | |||
| 368c5619cc | |||
| 0b14ac7704 |
@ -58,8 +58,9 @@ int main(int argc, char** argv) {
|
||||
CLI11_PARSE(app, argc, argv);
|
||||
|
||||
CTF::World world(argc, argv);
|
||||
int rank;
|
||||
int rank, nranks;
|
||||
MPI_Comm_rank(world.comm, &rank);
|
||||
MPI_Comm_size(world.comm, &nranks);
|
||||
constexpr double elem_to_gb = 8.0 / 1024.0 / 1024.0 / 1024.0;
|
||||
|
||||
// USER PRINTING TEST BEGIN
|
||||
@ -108,6 +109,43 @@ int main(int argc, char** argv) {
|
||||
}
|
||||
}
|
||||
|
||||
size_t
|
||||
f = sizeof(double)
|
||||
, n_tuples = nv * (nv + 1) * (nv + 2) / 6 - nv
|
||||
, atrip_memory
|
||||
= /* tuples_memory */ 3 * sizeof(size_t) * n_tuples
|
||||
//
|
||||
// one dimensional slices (all ranks)
|
||||
//
|
||||
+ /* taphh */ f * nranks * 6 * nv * no * no
|
||||
+ /* hhha */ f * nranks * 6 * no * no * no
|
||||
//
|
||||
// two dimensional slices (all ranks)
|
||||
//
|
||||
+ /* abph */ f * nranks * 12 * nv * no
|
||||
+ /* abhh */ f * nranks * 6 * no * no
|
||||
+ /* tabhh */ f * nranks * 6 * no * no
|
||||
//
|
||||
// distributed sources (all ranks)
|
||||
//
|
||||
+ /* tpphh */ f * nv * nv * no * no
|
||||
+ /* vhhhp */ f * no * no * no * nv
|
||||
+ /* vppph */ f * nv * nv * nv * no
|
||||
+ /* vpphh */ f * nv * nv * no * no
|
||||
+ /* tpphh2 */ f * nv * nv * no * no
|
||||
//
|
||||
// tensors in every rank
|
||||
//
|
||||
+ /* tijk */ f * nranks * no * no * no
|
||||
+ /* zijk */ f * nranks * no * no * no
|
||||
+ /* epsp */ f * nranks * (no + nv)
|
||||
+ /* tai */ f * nranks * no * nv
|
||||
;
|
||||
|
||||
if (atrip::Atrip::rank == 0)
|
||||
std::cout << "Tentative MEMORY USAGE: " << atrip_memory << "\n";
|
||||
|
||||
|
||||
std::vector<int> symmetries(4, NS)
|
||||
, vo({nv, no})
|
||||
, vvoo({nv, nv, no, no})
|
||||
|
||||
44
bootstrap.sh
Executable file
44
bootstrap.sh
Executable file
@ -0,0 +1,44 @@
|
||||
#!/usr/bin/env bash
#
# Generate the `configure` script with autoreconf and print the follow-up
# build instructions. Exits with a non-zero status when autotools is not
# available or when the configure script could not be generated.

# Without autoreconf nothing below can work: report on stderr and fail.
# (A bare `exit` here would return the status of `cat`, i.e. success.)
command -v autoreconf > /dev/null 2>&1 || {
  cat >&2 <<EOF

You don't seem to have autotools installed, please install it.

- https://www.gnu.org/software/autoconf/
- https://www.gnu.org/software/automake/

EOF
  exit 1
}

cat <<EOF

Creating configure script

EOF

# Check autoreconf's own exit status in addition to the presence of the
# output file: a stale `configure` left over from a previous run must not
# hide a failed regeneration.
if ! autoreconf -vif . || ! test -f configure; then
  cat >&2 <<EOF

An error happened and a configure script could not be built!

EOF
  exit 1
fi

cat <<EOF

Now you can build by doing

mkdir build
cd build
../configure
make extern
make all

EOF
|
||||
@ -40,7 +40,7 @@ AC_RUN_IFELSE([AC_LANG_SOURCE([_ATRIP_CUDA_MEMORY_OF_DEVICES])],
|
||||
],
|
||||
[
|
||||
atrip_success=no
|
||||
AC_MSG_ERROR([An available device reports zero memory available!])
|
||||
AC_MSG_WARN([An available device reports zero memory available!])
|
||||
])
|
||||
|
||||
CXX="$ac_save_CXX"
|
||||
@ -92,6 +92,7 @@ int main() {
|
||||
|
||||
|
||||
m4_define([_ATRIP_CUDA_MEMORY_OF_DEVICES], [[
|
||||
|
||||
#include <mpi.h>
|
||||
#include <iostream>
|
||||
#include <cassert>
|
||||
@ -146,4 +147,5 @@ int main() {
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
]])
|
||||
|
||||
@ -457,7 +457,8 @@ void unwrapAndMarkReady() {
|
||||
|
||||
#if defined(HAVE_CUDA)
|
||||
// copy the retrieved mpi data to the device
|
||||
cuMemcpyHtoD(data, (void*)mpi_data, sizeof(F) * size);
|
||||
WITH_CHRONO("cuda:memcpy",
|
||||
cuMemcpyHtoD(data, (void*)mpi_data, sizeof(F) * size);)
|
||||
std::free(mpi_data);
|
||||
#endif
|
||||
|
||||
|
||||
@ -108,9 +108,9 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
|
||||
printf("\n"
|
||||
"CUDA CARD RANK %d\n"
|
||||
"=================\n"
|
||||
"\tnumber: %1$d\n"
|
||||
"\tnumber: %1$ld\n"
|
||||
"\tname: %s\n"
|
||||
"\tMem. clock rate (KHz): %d\n"
|
||||
"\tMem. clock rate (KHz): %ld\n"
|
||||
"\tShared Mem Per Block (KB): %f\n"
|
||||
"\tAvail. Free/Total mem (GB): %f/%f\n"
|
||||
"\tFree memory (GB): %f\n"
|
||||
@ -259,7 +259,6 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
|
||||
iterationMod = (in.percentageMod > 0)
|
||||
? nIterations * in.percentageMod / 100.0
|
||||
: in.iterationMod
|
||||
|
||||
, iteration1Percent = nIterations * 0.01
|
||||
;
|
||||
|
||||
@ -293,20 +292,20 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
|
||||
|
||||
WITH_CHRONO("oneshot-db:comm:allgather",
|
||||
WITH_CHRONO("db:comm:allgather",
|
||||
MPI_Allgather( ldb.data()
|
||||
// , ldb.size() * sizeof(typename Slice<F>::LocalDatabaseElement)
|
||||
, ldb.size()
|
||||
, MPI_LDB_ELEMENT
|
||||
, db.data()
|
||||
// , ldb.size() * sizeof(typename Slice<F>::LocalDatabaseElement)
|
||||
, ldb.size()
|
||||
, MPI_LDB_ELEMENT
|
||||
, c);
|
||||
MPI_Allgather(ldb.data(),
|
||||
/* ldb.size() * sizeof(typename
|
||||
Slice<F>::LocalDatabaseElement) */
|
||||
ldb.size(),
|
||||
MPI_LDB_ELEMENT,
|
||||
db.data(),
|
||||
/* ldb.size() * sizeof(typename
|
||||
Slice<F>::LocalDatabaseElement), */
|
||||
ldb.size(),
|
||||
MPI_LDB_ELEMENT,
|
||||
c);
|
||||
))
|
||||
|
||||
WITH_CHRONO("db:comm:type:free",
|
||||
MPI_Type_free(&MPI_LDB_ELEMENT);
|
||||
)
|
||||
WITH_CHRONO("db:comm:type:free", MPI_Type_free(&MPI_LDB_ELEMENT);)
|
||||
|
||||
return db;
|
||||
};
|
||||
@ -575,29 +574,28 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
|
||||
)))
|
||||
WITH_CHRONO("oneshot-doubles",
|
||||
WITH_CHRONO("doubles",
|
||||
doublesContribution<F>( abc, (size_t)No, (size_t)Nv
|
||||
doublesContribution<F>(abc, (size_t)No, (size_t)Nv,
|
||||
// -- VABCI
|
||||
, abph.unwrapSlice(Slice<F>::AB, abc)
|
||||
, abph.unwrapSlice(Slice<F>::AC, abc)
|
||||
, abph.unwrapSlice(Slice<F>::BC, abc)
|
||||
, abph.unwrapSlice(Slice<F>::BA, abc)
|
||||
, abph.unwrapSlice(Slice<F>::CA, abc)
|
||||
, abph.unwrapSlice(Slice<F>::CB, abc)
|
||||
// -- VHHHA
|
||||
, hhha.unwrapSlice(Slice<F>::A, abc)
|
||||
, hhha.unwrapSlice(Slice<F>::B, abc)
|
||||
, hhha.unwrapSlice(Slice<F>::C, abc)
|
||||
// -- TA
|
||||
, taphh.unwrapSlice(Slice<F>::A, abc)
|
||||
, taphh.unwrapSlice(Slice<F>::B, abc)
|
||||
, taphh.unwrapSlice(Slice<F>::C, abc)
|
||||
abph.unwrapSlice(Slice<F>::AB, abc),
|
||||
abph.unwrapSlice(Slice<F>::AC, abc),
|
||||
abph.unwrapSlice(Slice<F>::BC, abc),
|
||||
abph.unwrapSlice(Slice<F>::BA, abc),
|
||||
abph.unwrapSlice(Slice<F>::CA, abc),
|
||||
abph.unwrapSlice(Slice<F>::CB, abc),
|
||||
// -- VHHHA,
|
||||
hhha.unwrapSlice(Slice<F>::A, abc),
|
||||
hhha.unwrapSlice(Slice<F>::B, abc),
|
||||
hhha.unwrapSlice(Slice<F>::C, abc),
|
||||
// -- TA,
|
||||
taphh.unwrapSlice(Slice<F>::A, abc),
|
||||
taphh.unwrapSlice(Slice<F>::B, abc),
|
||||
taphh.unwrapSlice(Slice<F>::C, abc),
|
||||
// -- TABIJ
|
||||
, tabhh.unwrapSlice(Slice<F>::AB, abc)
|
||||
, tabhh.unwrapSlice(Slice<F>::AC, abc)
|
||||
, tabhh.unwrapSlice(Slice<F>::BC, abc)
|
||||
tabhh.unwrapSlice(Slice<F>::AB, abc),
|
||||
tabhh.unwrapSlice(Slice<F>::AC, abc),
|
||||
tabhh.unwrapSlice(Slice<F>::BC, abc),
|
||||
// -- TIJK
|
||||
, (DataFieldType<F>*)Tijk
|
||||
);
|
||||
(DataFieldType<F>*)Tijk);
|
||||
WITH_RANK << iteration << "-th doubles done\n";
|
||||
))
|
||||
}
|
||||
@ -618,16 +616,19 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
|
||||
)
|
||||
WITH_CHRONO("singles",
|
||||
#if defined(HAVE_CUDA)
|
||||
singlesContribution<F><<<1,1>>>( No, Nv, abc[0], abc[1], abc[2]
|
||||
, (DataFieldType<F>*)Tai
|
||||
singlesContribution<F><<<1,1>>>(No, Nv, abc[0], abc[1], abc[2],
|
||||
(DataFieldType<F>*)Tai,
|
||||
#else
|
||||
singlesContribution<F>( No, Nv, abc[0], abc[1], abc[2]
|
||||
, Tai.data()
|
||||
singlesContribution<F>(No, Nv, abc[0], abc[1], abc[2],
|
||||
Tai.data(),
|
||||
#endif
|
||||
, (DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::AB, abc)
|
||||
, (DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::AC, abc)
|
||||
, (DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::BC, abc)
|
||||
, (DataFieldType<F>*)Zijk);
|
||||
(DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::AB,
|
||||
abc),
|
||||
(DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::AC,
|
||||
abc),
|
||||
(DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::BC,
|
||||
abc),
|
||||
(DataFieldType<F>*)Zijk);
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
@ -132,9 +132,8 @@ namespace cuda {
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_CUDA)
|
||||
#define LIMS_KS \
|
||||
size_t \
|
||||
kmin = blockIdx.x * blockDim.x + threadIdx.x, \
|
||||
#define LIMS_KS() \
|
||||
size_t kmin = blockIdx.x * blockDim.x + threadIdx.x, \
|
||||
k = kmin, \
|
||||
idx = kmin * size * size * size \
|
||||
; \
|
||||
@ -144,7 +143,7 @@ namespace cuda {
|
||||
#endif
|
||||
// Flattens an (i, j, k) index triple into a linear offset, with i varying
// fastest. Expects a variable named `size` to be in scope where the macro
// is expanded. Each argument and the whole expansion are parenthesized so
// that compound arguments (e.g. `j + 1`) and operators surrounding the
// macro call bind as the caller intends.
#define _IJK_(i, j, k) ((i) + (j) * size + (k) * size * size)
|
||||
#define _REORDER_BODY_(...) \
|
||||
for (LIMS_KS ; k++) \
|
||||
for (LIMS_KS() ; k++) \
|
||||
for (size_t j = 0; j < size; j++) \
|
||||
for (size_t i = 0; i < size; i++, idx++) { \
|
||||
__VA_ARGS__ \
|
||||
@ -447,9 +446,8 @@ double getEnergySame
|
||||
#if defined(ATRIP_USE_DGEMM)
|
||||
#if defined(HAVE_CUDA)
|
||||
#define REORDER(__II, __JJ, __KK) \
|
||||
reorder<<< \
|
||||
bs, ths \
|
||||
>>>(reorder_proxy<DataFieldType<F>, __II ## __JJ ## __KK >{}, \
|
||||
reorder<<<bs, ths>>>(reorder_proxy<DataFieldType<F>, \
|
||||
__II ## __JJ ## __KK >{}, \
|
||||
No, Tijk, _t_buffer);
|
||||
#define DGEMM_PARTICLES(__A, __B) \
|
||||
atrip::xgemm<F>("T", \
|
||||
@ -485,10 +483,20 @@ double getEnergySame
|
||||
Atrip::kernelDimensions.ooo.blocks, \
|
||||
Atrip::kernelDimensions.ooo.threads \
|
||||
>>>((DataFieldType<F>*)_conj, (DataFieldType<F>*)_buffer, NoNoNo);
|
||||
|
||||
|
||||
// END CUDA ////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
#else
|
||||
// NONCUDA //////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
// NONCUDA /////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
#define REORDER(__II, __JJ, __KK) \
|
||||
reorder(reorder_proxy<DataFieldType<F>, __II ## __JJ ## __KK >{}, \
|
||||
reorder(reorder_proxy<DataFieldType<F>, \
|
||||
__II ## __JJ ## __KK >{}, \
|
||||
No, Tijk, _t_buffer);
|
||||
#define DGEMM_PARTICLES(__A, __B) \
|
||||
atrip::xgemm<F>("T", \
|
||||
@ -550,7 +558,8 @@ double getEnergySame
|
||||
// Set Tijk to zero
|
||||
#ifdef HAVE_CUDA
|
||||
WITH_CHRONO("double:reorder",
|
||||
cuda::zeroing<<<bs, ths>>>((DataFieldType<F>*)Tijk, NoNoNo);
|
||||
cuda::zeroing<<<bs, ths>>>((DataFieldType<F>*)Tijk,
|
||||
NoNoNo);
|
||||
// synchronize all initializations to zero
|
||||
)
|
||||
#else
|
||||
@ -640,6 +649,8 @@ double getEnergySame
|
||||
|
||||
{ // free resources
|
||||
#ifdef HAVE_CUDA
|
||||
// we need to synchronize here since we need
|
||||
// the Tijk for next process in the pipeline
|
||||
cuCtxSynchronize();
|
||||
cuMemFree((CUdeviceptr)_vhhh);
|
||||
cuMemFree((CUdeviceptr)_t_buffer);
|
||||
|
||||
Loading…
Reference in New Issue
Block a user