Compare commits

...

7 Commits

6 changed files with 265 additions and 168 deletions

View File

@ -58,8 +58,9 @@ int main(int argc, char** argv) {
CLI11_PARSE(app, argc, argv); CLI11_PARSE(app, argc, argv);
CTF::World world(argc, argv); CTF::World world(argc, argv);
int rank; int rank, nranks;
MPI_Comm_rank(world.comm, &rank); MPI_Comm_rank(world.comm, &rank);
MPI_Comm_size(world.comm, &nranks);
constexpr double elem_to_gb = 8.0 / 1024.0 / 1024.0 / 1024.0; constexpr double elem_to_gb = 8.0 / 1024.0 / 1024.0 / 1024.0;
// USER PRINTING TEST BEGIN // USER PRINTING TEST BEGIN
@ -108,6 +109,43 @@ int main(int argc, char** argv) {
} }
} }
size_t
f = sizeof(double)
, n_tuples = nv * (nv + 1) * (nv + 2) / 6 - nv
, atrip_memory
= /* tuples_memory */ 3 * sizeof(size_t) * n_tuples
//
// one dimensional slices (all ranks)
//
+ /* taphh */ f * nranks * 6 * nv * no * no
+ /* hhha */ f * nranks * 6 * no * no * no
//
// two dimensional slices (all ranks)
//
+ /* abph */ f * nranks * 12 * nv * no
+ /* abhh */ f * nranks * 6 * no * no
+ /* tabhh */ f * nranks * 6 * no * no
//
// distributed sources (all ranks)
//
+ /* tpphh */ f * nv * nv * no * no
+ /* vhhhp */ f * no * no * no * nv
+ /* vppph */ f * nv * nv * nv * no
+ /* vpphh */ f * nv * nv * no * no
+ /* tpphh2 */ f * nv * nv * no * no
//
// tensors in every rank
//
+ /* tijk */ f * nranks * no * no * no
+ /* zijk */ f * nranks * no * no * no
+ /* epsp */ f * nranks * (no + nv)
+ /* tai */ f * nranks * no * nv
;
if (atrip::Atrip::rank == 0)
std::cout << "Tentative MEMORY USAGE: " << atrip_memory << "\n";
std::vector<int> symmetries(4, NS) std::vector<int> symmetries(4, NS)
, vo({nv, no}) , vo({nv, no})
, vvoo({nv, nv, no, no}) , vvoo({nv, nv, no, no})

44
bootstrap.sh Executable file
View File

@ -0,0 +1,44 @@
#!/usr/bin/env bash
type -a autoreconf > /dev/null ||
{
cat <<EOF && exit
You don't seem to have autotools installed, please install it.
- https://www.gnu.org/software/autoconf/
- https://www.gnu.org/software/automake/
EOF
}
cat <<EOF
Creating configure script
EOF
autoreconf -vif .
test -f configure || {
cat <<EOF
An error happened and a configure script could not be built!
EOF
exit 1
}
cat <<EOF
Now you can build by doing
mkdir build
cd build
../configure
make extern
make all
EOF

View File

@ -40,7 +40,7 @@ AC_RUN_IFELSE([AC_LANG_SOURCE([_ATRIP_CUDA_MEMORY_OF_DEVICES])],
], ],
[ [
atrip_success=no atrip_success=no
AC_MSG_ERROR([An available device reports zero memory available!]) AC_MSG_WARN([An available device reports zero memory available!])
]) ])
CXX="$ac_save_CXX" CXX="$ac_save_CXX"
@ -92,6 +92,7 @@ int main() {
m4_define([_ATRIP_CUDA_MEMORY_OF_DEVICES], [[ m4_define([_ATRIP_CUDA_MEMORY_OF_DEVICES], [[
#include <mpi.h> #include <mpi.h>
#include <iostream> #include <iostream>
#include <cassert> #include <cassert>
@ -146,4 +147,5 @@ int main() {
return 0; return 0;
} }
]]) ]])

View File

@ -457,7 +457,8 @@ void unwrapAndMarkReady() {
#if defined(HAVE_CUDA) #if defined(HAVE_CUDA)
// copy the retrieved mpi data to the device // copy the retrieved mpi data to the device
cuMemcpyHtoD(data, (void*)mpi_data, sizeof(F) * size); WITH_CHRONO("cuda:memcpy",
cuMemcpyHtoD(data, (void*)mpi_data, sizeof(F) * size);)
std::free(mpi_data); std::free(mpi_data);
#endif #endif

View File

@ -108,9 +108,9 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
printf("\n" printf("\n"
"CUDA CARD RANK %d\n" "CUDA CARD RANK %d\n"
"=================\n" "=================\n"
"\tnumber: %1$d\n" "\tnumber: %1$ld\n"
"\tname: %s\n" "\tname: %s\n"
"\tMem. clock rate (KHz): %d\n" "\tMem. clock rate (KHz): %ld\n"
"\tShared Mem Per Block (KB): %f\n" "\tShared Mem Per Block (KB): %f\n"
"\tAvail. Free/Total mem (GB): %f/%f\n" "\tAvail. Free/Total mem (GB): %f/%f\n"
"\tFree memory (GB): %f\n" "\tFree memory (GB): %f\n"
@ -259,7 +259,6 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
iterationMod = (in.percentageMod > 0) iterationMod = (in.percentageMod > 0)
? nIterations * in.percentageMod / 100.0 ? nIterations * in.percentageMod / 100.0
: in.iterationMod : in.iterationMod
, iteration1Percent = nIterations * 0.01 , iteration1Percent = nIterations * 0.01
; ;
@ -293,20 +292,20 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
WITH_CHRONO("oneshot-db:comm:allgather", WITH_CHRONO("oneshot-db:comm:allgather",
WITH_CHRONO("db:comm:allgather", WITH_CHRONO("db:comm:allgather",
MPI_Allgather( ldb.data() MPI_Allgather(ldb.data(),
// , ldb.size() * sizeof(typename Slice<F>::LocalDatabaseElement) /* ldb.size() * sizeof(typename
, ldb.size() Slice<F>::LocalDatabaseElement) */
, MPI_LDB_ELEMENT ldb.size(),
, db.data() MPI_LDB_ELEMENT,
// , ldb.size() * sizeof(typename Slice<F>::LocalDatabaseElement) db.data(),
, ldb.size() /* ldb.size() * sizeof(typename
, MPI_LDB_ELEMENT Slice<F>::LocalDatabaseElement), */
, c); ldb.size(),
MPI_LDB_ELEMENT,
c);
)) ))
WITH_CHRONO("db:comm:type:free", WITH_CHRONO("db:comm:type:free", MPI_Type_free(&MPI_LDB_ELEMENT);)
MPI_Type_free(&MPI_LDB_ELEMENT);
)
return db; return db;
}; };
@ -575,29 +574,28 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
))) )))
WITH_CHRONO("oneshot-doubles", WITH_CHRONO("oneshot-doubles",
WITH_CHRONO("doubles", WITH_CHRONO("doubles",
doublesContribution<F>( abc, (size_t)No, (size_t)Nv doublesContribution<F>(abc, (size_t)No, (size_t)Nv,
// -- VABCI // -- VABCI
, abph.unwrapSlice(Slice<F>::AB, abc) abph.unwrapSlice(Slice<F>::AB, abc),
, abph.unwrapSlice(Slice<F>::AC, abc) abph.unwrapSlice(Slice<F>::AC, abc),
, abph.unwrapSlice(Slice<F>::BC, abc) abph.unwrapSlice(Slice<F>::BC, abc),
, abph.unwrapSlice(Slice<F>::BA, abc) abph.unwrapSlice(Slice<F>::BA, abc),
, abph.unwrapSlice(Slice<F>::CA, abc) abph.unwrapSlice(Slice<F>::CA, abc),
, abph.unwrapSlice(Slice<F>::CB, abc) abph.unwrapSlice(Slice<F>::CB, abc),
// -- VHHHA // -- VHHHA,
, hhha.unwrapSlice(Slice<F>::A, abc) hhha.unwrapSlice(Slice<F>::A, abc),
, hhha.unwrapSlice(Slice<F>::B, abc) hhha.unwrapSlice(Slice<F>::B, abc),
, hhha.unwrapSlice(Slice<F>::C, abc) hhha.unwrapSlice(Slice<F>::C, abc),
// -- TA // -- TA,
, taphh.unwrapSlice(Slice<F>::A, abc) taphh.unwrapSlice(Slice<F>::A, abc),
, taphh.unwrapSlice(Slice<F>::B, abc) taphh.unwrapSlice(Slice<F>::B, abc),
, taphh.unwrapSlice(Slice<F>::C, abc) taphh.unwrapSlice(Slice<F>::C, abc),
// -- TABIJ // -- TABIJ
, tabhh.unwrapSlice(Slice<F>::AB, abc) tabhh.unwrapSlice(Slice<F>::AB, abc),
, tabhh.unwrapSlice(Slice<F>::AC, abc) tabhh.unwrapSlice(Slice<F>::AC, abc),
, tabhh.unwrapSlice(Slice<F>::BC, abc) tabhh.unwrapSlice(Slice<F>::BC, abc),
// -- TIJK // -- TIJK
, (DataFieldType<F>*)Tijk (DataFieldType<F>*)Tijk);
);
WITH_RANK << iteration << "-th doubles done\n"; WITH_RANK << iteration << "-th doubles done\n";
)) ))
} }
@ -618,16 +616,19 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
) )
WITH_CHRONO("singles", WITH_CHRONO("singles",
#if defined(HAVE_CUDA) #if defined(HAVE_CUDA)
singlesContribution<F><<<1,1>>>( No, Nv, abc[0], abc[1], abc[2] singlesContribution<F><<<1,1>>>(No, Nv, abc[0], abc[1], abc[2],
, (DataFieldType<F>*)Tai (DataFieldType<F>*)Tai,
#else #else
singlesContribution<F>( No, Nv, abc[0], abc[1], abc[2] singlesContribution<F>(No, Nv, abc[0], abc[1], abc[2],
, Tai.data() Tai.data(),
#endif #endif
, (DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::AB, abc) (DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::AB,
, (DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::AC, abc) abc),
, (DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::BC, abc) (DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::AC,
, (DataFieldType<F>*)Zijk); abc),
(DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::BC,
abc),
(DataFieldType<F>*)Zijk);
) )
} }

View File

@ -132,9 +132,8 @@ namespace cuda {
#endif #endif
#if defined(HAVE_CUDA) #if defined(HAVE_CUDA)
#define LIMS_KS \ #define LIMS_KS() \
size_t \ size_t kmin = blockIdx.x * blockDim.x + threadIdx.x, \
kmin = blockIdx.x * blockDim.x + threadIdx.x, \
k = kmin, \ k = kmin, \
idx = kmin * size * size * size \ idx = kmin * size * size * size \
; \ ; \
@ -144,7 +143,7 @@ namespace cuda {
#endif #endif
#define _IJK_(i, j, k) i + j*size + k*size*size #define _IJK_(i, j, k) i + j*size + k*size*size
#define _REORDER_BODY_(...) \ #define _REORDER_BODY_(...) \
for (LIMS_KS ; k++) \ for (LIMS_KS() ; k++) \
for (size_t j = 0; j < size; j++) \ for (size_t j = 0; j < size; j++) \
for (size_t i = 0; i < size; i++, idx++) { \ for (size_t i = 0; i < size; i++, idx++) { \
__VA_ARGS__ \ __VA_ARGS__ \
@ -447,9 +446,8 @@ double getEnergySame
#if defined(ATRIP_USE_DGEMM) #if defined(ATRIP_USE_DGEMM)
#if defined(HAVE_CUDA) #if defined(HAVE_CUDA)
#define REORDER(__II, __JJ, __KK) \ #define REORDER(__II, __JJ, __KK) \
reorder<<< \ reorder<<<bs, ths>>>(reorder_proxy<DataFieldType<F>, \
bs, ths \ __II ## __JJ ## __KK >{}, \
>>>(reorder_proxy<DataFieldType<F>, __II ## __JJ ## __KK >{}, \
No, Tijk, _t_buffer); No, Tijk, _t_buffer);
#define DGEMM_PARTICLES(__A, __B) \ #define DGEMM_PARTICLES(__A, __B) \
atrip::xgemm<F>("T", \ atrip::xgemm<F>("T", \
@ -485,10 +483,20 @@ double getEnergySame
Atrip::kernelDimensions.ooo.blocks, \ Atrip::kernelDimensions.ooo.blocks, \
Atrip::kernelDimensions.ooo.threads \ Atrip::kernelDimensions.ooo.threads \
>>>((DataFieldType<F>*)_conj, (DataFieldType<F>*)_buffer, NoNoNo); >>>((DataFieldType<F>*)_conj, (DataFieldType<F>*)_buffer, NoNoNo);
// END CUDA ////////////////////////////////////////////////////////////////////
#else #else
// NONCUDA //////////////////////////////////////////////////////////////////////
// NONCUDA /////////////////////////////////////////////////////////////////////
#define REORDER(__II, __JJ, __KK) \ #define REORDER(__II, __JJ, __KK) \
reorder(reorder_proxy<DataFieldType<F>, __II ## __JJ ## __KK >{}, \ reorder(reorder_proxy<DataFieldType<F>, \
__II ## __JJ ## __KK >{}, \
No, Tijk, _t_buffer); No, Tijk, _t_buffer);
#define DGEMM_PARTICLES(__A, __B) \ #define DGEMM_PARTICLES(__A, __B) \
atrip::xgemm<F>("T", \ atrip::xgemm<F>("T", \
@ -550,7 +558,8 @@ double getEnergySame
// Set Tijk to zero // Set Tijk to zero
#ifdef HAVE_CUDA #ifdef HAVE_CUDA
WITH_CHRONO("double:reorder", WITH_CHRONO("double:reorder",
cuda::zeroing<<<bs, ths>>>((DataFieldType<F>*)Tijk, NoNoNo); cuda::zeroing<<<bs, ths>>>((DataFieldType<F>*)Tijk,
NoNoNo);
// synchronize all initializations to zero // synchronize all initializations to zero
) )
#else #else
@ -640,6 +649,8 @@ double getEnergySame
{ // free resources { // free resources
#ifdef HAVE_CUDA #ifdef HAVE_CUDA
// we need to synchronize here since we need
// the Tijk for next process in the pipeline
cuCtxSynchronize(); cuCtxSynchronize();
cuMemFree((CUdeviceptr)_vhhh); cuMemFree((CUdeviceptr)_vhhh);
cuMemFree((CUdeviceptr)_t_buffer); cuMemFree((CUdeviceptr)_t_buffer);