Compare commits
No commits in common. "4277c07cc22790fbc4e3fa4ea76ddb9dd494e586" and "76a785044da55f5be4e85c68a8059f4737261de3" have entirely different histories.
4277c07cc2
...
76a785044d
@ -58,9 +58,8 @@ int main(int argc, char** argv) {
|
|||||||
CLI11_PARSE(app, argc, argv);
|
CLI11_PARSE(app, argc, argv);
|
||||||
|
|
||||||
CTF::World world(argc, argv);
|
CTF::World world(argc, argv);
|
||||||
int rank, nranks;
|
int rank;
|
||||||
MPI_Comm_rank(world.comm, &rank);
|
MPI_Comm_rank(world.comm, &rank);
|
||||||
MPI_Comm_size(world.comm, &nranks);
|
|
||||||
constexpr double elem_to_gb = 8.0 / 1024.0 / 1024.0 / 1024.0;
|
constexpr double elem_to_gb = 8.0 / 1024.0 / 1024.0 / 1024.0;
|
||||||
|
|
||||||
// USER PRINTING TEST BEGIN
|
// USER PRINTING TEST BEGIN
|
||||||
@ -109,43 +108,6 @@ int main(int argc, char** argv) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t
|
|
||||||
f = sizeof(double)
|
|
||||||
, n_tuples = nv * (nv + 1) * (nv + 2) / 6 - nv
|
|
||||||
, atrip_memory
|
|
||||||
= /* tuples_memory */ 3 * sizeof(size_t) * n_tuples
|
|
||||||
//
|
|
||||||
// one dimensional slices (all ranks)
|
|
||||||
//
|
|
||||||
+ /* taphh */ f * nranks * 6 * nv * no * no
|
|
||||||
+ /* hhha */ f * nranks * 6 * no * no * no
|
|
||||||
//
|
|
||||||
// two dimensional slices (all ranks)
|
|
||||||
//
|
|
||||||
+ /* abph */ f * nranks * 12 * nv * no
|
|
||||||
+ /* abhh */ f * nranks * 6 * no * no
|
|
||||||
+ /* tabhh */ f * nranks * 6 * no * no
|
|
||||||
//
|
|
||||||
// distributed sources (all ranks)
|
|
||||||
//
|
|
||||||
+ /* tpphh */ f * nv * nv * no * no
|
|
||||||
+ /* vhhhp */ f * no * no * no * nv
|
|
||||||
+ /* vppph */ f * nv * nv * nv * no
|
|
||||||
+ /* vpphh */ f * nv * nv * no * no
|
|
||||||
+ /* tpphh2 */ f * nv * nv * no * no
|
|
||||||
//
|
|
||||||
// tensors in every rank
|
|
||||||
//
|
|
||||||
+ /* tijk */ f * nranks * no * no * no
|
|
||||||
+ /* zijk */ f * nranks * no * no * no
|
|
||||||
+ /* epsp */ f * nranks * (no + nv)
|
|
||||||
+ /* tai */ f * nranks * no * nv
|
|
||||||
;
|
|
||||||
|
|
||||||
if (atrip::Atrip::rank == 0)
|
|
||||||
std::cout << "Tentative MEMORY USAGE: " << atrip_memory << "\n";
|
|
||||||
|
|
||||||
|
|
||||||
std::vector<int> symmetries(4, NS)
|
std::vector<int> symmetries(4, NS)
|
||||||
, vo({nv, no})
|
, vo({nv, no})
|
||||||
, vvoo({nv, nv, no, no})
|
, vvoo({nv, nv, no, no})
|
||||||
|
|||||||
44
bootstrap.sh
44
bootstrap.sh
@ -1,44 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
|
|
||||||
type -a autoreconf > /dev/null ||
|
|
||||||
{
|
|
||||||
cat <<EOF && exit
|
|
||||||
|
|
||||||
You don't seem to have autotools installed, please install it.
|
|
||||||
|
|
||||||
- https://www.gnu.org/software/autoconf/
|
|
||||||
- https://www.gnu.org/software/automake/
|
|
||||||
|
|
||||||
EOF
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
cat <<EOF
|
|
||||||
|
|
||||||
Creating configure script
|
|
||||||
|
|
||||||
EOF
|
|
||||||
|
|
||||||
|
|
||||||
autoreconf -vif .
|
|
||||||
test -f configure || {
|
|
||||||
cat <<EOF
|
|
||||||
|
|
||||||
An error happened and a configure script could not be built!
|
|
||||||
|
|
||||||
EOF
|
|
||||||
exit 1
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
cat <<EOF
|
|
||||||
|
|
||||||
Now you can build by doing
|
|
||||||
|
|
||||||
mkdir build
|
|
||||||
cd build
|
|
||||||
../configure
|
|
||||||
make extern
|
|
||||||
make all
|
|
||||||
|
|
||||||
EOF
|
|
||||||
@ -40,7 +40,7 @@ AC_RUN_IFELSE([AC_LANG_SOURCE([_ATRIP_CUDA_MEMORY_OF_DEVICES])],
|
|||||||
],
|
],
|
||||||
[
|
[
|
||||||
atrip_success=no
|
atrip_success=no
|
||||||
AC_MSG_WARN([An available device reports zero memory available!])
|
AC_MSG_ERROR([An available device reports zero memory available!])
|
||||||
])
|
])
|
||||||
|
|
||||||
CXX="$ac_save_CXX"
|
CXX="$ac_save_CXX"
|
||||||
@ -92,7 +92,6 @@ int main() {
|
|||||||
|
|
||||||
|
|
||||||
m4_define([_ATRIP_CUDA_MEMORY_OF_DEVICES], [[
|
m4_define([_ATRIP_CUDA_MEMORY_OF_DEVICES], [[
|
||||||
|
|
||||||
#include <mpi.h>
|
#include <mpi.h>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
@ -147,5 +146,4 @@ int main() {
|
|||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
]])
|
]])
|
||||||
|
|||||||
@ -457,8 +457,7 @@ void unwrapAndMarkReady() {
|
|||||||
|
|
||||||
#if defined(HAVE_CUDA)
|
#if defined(HAVE_CUDA)
|
||||||
// copy the retrieved mpi data to the device
|
// copy the retrieved mpi data to the device
|
||||||
WITH_CHRONO("cuda:memcpy",
|
cuMemcpyHtoD(data, (void*)mpi_data, sizeof(F) * size);
|
||||||
cuMemcpyHtoD(data, (void*)mpi_data, sizeof(F) * size);)
|
|
||||||
std::free(mpi_data);
|
std::free(mpi_data);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|||||||
@ -108,9 +108,9 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
|
|||||||
printf("\n"
|
printf("\n"
|
||||||
"CUDA CARD RANK %d\n"
|
"CUDA CARD RANK %d\n"
|
||||||
"=================\n"
|
"=================\n"
|
||||||
"\tnumber: %1$ld\n"
|
"\tnumber: %1$d\n"
|
||||||
"\tname: %s\n"
|
"\tname: %s\n"
|
||||||
"\tMem. clock rate (KHz): %ld\n"
|
"\tMem. clock rate (KHz): %d\n"
|
||||||
"\tShared Mem Per Block (KB): %f\n"
|
"\tShared Mem Per Block (KB): %f\n"
|
||||||
"\tAvail. Free/Total mem (GB): %f/%f\n"
|
"\tAvail. Free/Total mem (GB): %f/%f\n"
|
||||||
"\tFree memory (GB): %f\n"
|
"\tFree memory (GB): %f\n"
|
||||||
@ -259,6 +259,7 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
|
|||||||
iterationMod = (in.percentageMod > 0)
|
iterationMod = (in.percentageMod > 0)
|
||||||
? nIterations * in.percentageMod / 100.0
|
? nIterations * in.percentageMod / 100.0
|
||||||
: in.iterationMod
|
: in.iterationMod
|
||||||
|
|
||||||
, iteration1Percent = nIterations * 0.01
|
, iteration1Percent = nIterations * 0.01
|
||||||
;
|
;
|
||||||
|
|
||||||
@ -292,20 +293,20 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
|
|||||||
|
|
||||||
WITH_CHRONO("oneshot-db:comm:allgather",
|
WITH_CHRONO("oneshot-db:comm:allgather",
|
||||||
WITH_CHRONO("db:comm:allgather",
|
WITH_CHRONO("db:comm:allgather",
|
||||||
MPI_Allgather(ldb.data(),
|
MPI_Allgather( ldb.data()
|
||||||
/* ldb.size() * sizeof(typename
|
// , ldb.size() * sizeof(typename Slice<F>::LocalDatabaseElement)
|
||||||
Slice<F>::LocalDatabaseElement) */
|
, ldb.size()
|
||||||
ldb.size(),
|
, MPI_LDB_ELEMENT
|
||||||
MPI_LDB_ELEMENT,
|
, db.data()
|
||||||
db.data(),
|
// , ldb.size() * sizeof(typename Slice<F>::LocalDatabaseElement)
|
||||||
/* ldb.size() * sizeof(typename
|
, ldb.size()
|
||||||
Slice<F>::LocalDatabaseElement), */
|
, MPI_LDB_ELEMENT
|
||||||
ldb.size(),
|
, c);
|
||||||
MPI_LDB_ELEMENT,
|
|
||||||
c);
|
|
||||||
))
|
))
|
||||||
|
|
||||||
WITH_CHRONO("db:comm:type:free", MPI_Type_free(&MPI_LDB_ELEMENT);)
|
WITH_CHRONO("db:comm:type:free",
|
||||||
|
MPI_Type_free(&MPI_LDB_ELEMENT);
|
||||||
|
)
|
||||||
|
|
||||||
return db;
|
return db;
|
||||||
};
|
};
|
||||||
@ -574,28 +575,29 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
|
|||||||
)))
|
)))
|
||||||
WITH_CHRONO("oneshot-doubles",
|
WITH_CHRONO("oneshot-doubles",
|
||||||
WITH_CHRONO("doubles",
|
WITH_CHRONO("doubles",
|
||||||
doublesContribution<F>(abc, (size_t)No, (size_t)Nv,
|
doublesContribution<F>( abc, (size_t)No, (size_t)Nv
|
||||||
// -- VABCI
|
// -- VABCI
|
||||||
abph.unwrapSlice(Slice<F>::AB, abc),
|
, abph.unwrapSlice(Slice<F>::AB, abc)
|
||||||
abph.unwrapSlice(Slice<F>::AC, abc),
|
, abph.unwrapSlice(Slice<F>::AC, abc)
|
||||||
abph.unwrapSlice(Slice<F>::BC, abc),
|
, abph.unwrapSlice(Slice<F>::BC, abc)
|
||||||
abph.unwrapSlice(Slice<F>::BA, abc),
|
, abph.unwrapSlice(Slice<F>::BA, abc)
|
||||||
abph.unwrapSlice(Slice<F>::CA, abc),
|
, abph.unwrapSlice(Slice<F>::CA, abc)
|
||||||
abph.unwrapSlice(Slice<F>::CB, abc),
|
, abph.unwrapSlice(Slice<F>::CB, abc)
|
||||||
// -- VHHHA,
|
// -- VHHHA
|
||||||
hhha.unwrapSlice(Slice<F>::A, abc),
|
, hhha.unwrapSlice(Slice<F>::A, abc)
|
||||||
hhha.unwrapSlice(Slice<F>::B, abc),
|
, hhha.unwrapSlice(Slice<F>::B, abc)
|
||||||
hhha.unwrapSlice(Slice<F>::C, abc),
|
, hhha.unwrapSlice(Slice<F>::C, abc)
|
||||||
// -- TA,
|
// -- TA
|
||||||
taphh.unwrapSlice(Slice<F>::A, abc),
|
, taphh.unwrapSlice(Slice<F>::A, abc)
|
||||||
taphh.unwrapSlice(Slice<F>::B, abc),
|
, taphh.unwrapSlice(Slice<F>::B, abc)
|
||||||
taphh.unwrapSlice(Slice<F>::C, abc),
|
, taphh.unwrapSlice(Slice<F>::C, abc)
|
||||||
// -- TABIJ
|
// -- TABIJ
|
||||||
tabhh.unwrapSlice(Slice<F>::AB, abc),
|
, tabhh.unwrapSlice(Slice<F>::AB, abc)
|
||||||
tabhh.unwrapSlice(Slice<F>::AC, abc),
|
, tabhh.unwrapSlice(Slice<F>::AC, abc)
|
||||||
tabhh.unwrapSlice(Slice<F>::BC, abc),
|
, tabhh.unwrapSlice(Slice<F>::BC, abc)
|
||||||
// -- TIJK
|
// -- TIJK
|
||||||
(DataFieldType<F>*)Tijk);
|
, (DataFieldType<F>*)Tijk
|
||||||
|
);
|
||||||
WITH_RANK << iteration << "-th doubles done\n";
|
WITH_RANK << iteration << "-th doubles done\n";
|
||||||
))
|
))
|
||||||
}
|
}
|
||||||
@ -616,19 +618,16 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
|
|||||||
)
|
)
|
||||||
WITH_CHRONO("singles",
|
WITH_CHRONO("singles",
|
||||||
#if defined(HAVE_CUDA)
|
#if defined(HAVE_CUDA)
|
||||||
singlesContribution<F><<<1,1>>>(No, Nv, abc[0], abc[1], abc[2],
|
singlesContribution<F><<<1,1>>>( No, Nv, abc[0], abc[1], abc[2]
|
||||||
(DataFieldType<F>*)Tai,
|
, (DataFieldType<F>*)Tai
|
||||||
#else
|
#else
|
||||||
singlesContribution<F>(No, Nv, abc[0], abc[1], abc[2],
|
singlesContribution<F>( No, Nv, abc[0], abc[1], abc[2]
|
||||||
Tai.data(),
|
, Tai.data()
|
||||||
#endif
|
#endif
|
||||||
(DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::AB,
|
, (DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::AB, abc)
|
||||||
abc),
|
, (DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::AC, abc)
|
||||||
(DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::AC,
|
, (DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::BC, abc)
|
||||||
abc),
|
, (DataFieldType<F>*)Zijk);
|
||||||
(DataFieldType<F>*)abhh.unwrapSlice(Slice<F>::BC,
|
|
||||||
abc),
|
|
||||||
(DataFieldType<F>*)Zijk);
|
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -132,8 +132,9 @@ namespace cuda {
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(HAVE_CUDA)
|
#if defined(HAVE_CUDA)
|
||||||
#define LIMS_KS() \
|
#define LIMS_KS \
|
||||||
size_t kmin = blockIdx.x * blockDim.x + threadIdx.x, \
|
size_t \
|
||||||
|
kmin = blockIdx.x * blockDim.x + threadIdx.x, \
|
||||||
k = kmin, \
|
k = kmin, \
|
||||||
idx = kmin * size * size * size \
|
idx = kmin * size * size * size \
|
||||||
; \
|
; \
|
||||||
@ -143,7 +144,7 @@ namespace cuda {
|
|||||||
#endif
|
#endif
|
||||||
#define _IJK_(i, j, k) i + j*size + k*size*size
|
#define _IJK_(i, j, k) i + j*size + k*size*size
|
||||||
#define _REORDER_BODY_(...) \
|
#define _REORDER_BODY_(...) \
|
||||||
for (LIMS_KS() ; k++) \
|
for (LIMS_KS ; k++) \
|
||||||
for (size_t j = 0; j < size; j++) \
|
for (size_t j = 0; j < size; j++) \
|
||||||
for (size_t i = 0; i < size; i++, idx++) { \
|
for (size_t i = 0; i < size; i++, idx++) { \
|
||||||
__VA_ARGS__ \
|
__VA_ARGS__ \
|
||||||
@ -446,8 +447,9 @@ double getEnergySame
|
|||||||
#if defined(ATRIP_USE_DGEMM)
|
#if defined(ATRIP_USE_DGEMM)
|
||||||
#if defined(HAVE_CUDA)
|
#if defined(HAVE_CUDA)
|
||||||
#define REORDER(__II, __JJ, __KK) \
|
#define REORDER(__II, __JJ, __KK) \
|
||||||
reorder<<<bs, ths>>>(reorder_proxy<DataFieldType<F>, \
|
reorder<<< \
|
||||||
__II ## __JJ ## __KK >{}, \
|
bs, ths \
|
||||||
|
>>>(reorder_proxy<DataFieldType<F>, __II ## __JJ ## __KK >{}, \
|
||||||
No, Tijk, _t_buffer);
|
No, Tijk, _t_buffer);
|
||||||
#define DGEMM_PARTICLES(__A, __B) \
|
#define DGEMM_PARTICLES(__A, __B) \
|
||||||
atrip::xgemm<F>("T", \
|
atrip::xgemm<F>("T", \
|
||||||
@ -483,20 +485,10 @@ double getEnergySame
|
|||||||
Atrip::kernelDimensions.ooo.blocks, \
|
Atrip::kernelDimensions.ooo.blocks, \
|
||||||
Atrip::kernelDimensions.ooo.threads \
|
Atrip::kernelDimensions.ooo.threads \
|
||||||
>>>((DataFieldType<F>*)_conj, (DataFieldType<F>*)_buffer, NoNoNo);
|
>>>((DataFieldType<F>*)_conj, (DataFieldType<F>*)_buffer, NoNoNo);
|
||||||
|
|
||||||
|
|
||||||
// END CUDA ////////////////////////////////////////////////////////////////////
|
|
||||||
|
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
// NONCUDA //////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
// NONCUDA /////////////////////////////////////////////////////////////////////
|
|
||||||
|
|
||||||
|
|
||||||
#define REORDER(__II, __JJ, __KK) \
|
#define REORDER(__II, __JJ, __KK) \
|
||||||
reorder(reorder_proxy<DataFieldType<F>, \
|
reorder(reorder_proxy<DataFieldType<F>, __II ## __JJ ## __KK >{}, \
|
||||||
__II ## __JJ ## __KK >{}, \
|
|
||||||
No, Tijk, _t_buffer);
|
No, Tijk, _t_buffer);
|
||||||
#define DGEMM_PARTICLES(__A, __B) \
|
#define DGEMM_PARTICLES(__A, __B) \
|
||||||
atrip::xgemm<F>("T", \
|
atrip::xgemm<F>("T", \
|
||||||
@ -558,8 +550,7 @@ double getEnergySame
|
|||||||
// Set Tijk to zero
|
// Set Tijk to zero
|
||||||
#ifdef HAVE_CUDA
|
#ifdef HAVE_CUDA
|
||||||
WITH_CHRONO("double:reorder",
|
WITH_CHRONO("double:reorder",
|
||||||
cuda::zeroing<<<bs, ths>>>((DataFieldType<F>*)Tijk,
|
cuda::zeroing<<<bs, ths>>>((DataFieldType<F>*)Tijk, NoNoNo);
|
||||||
NoNoNo);
|
|
||||||
// synchronize all initializations to zero
|
// synchronize all initializations to zero
|
||||||
)
|
)
|
||||||
#else
|
#else
|
||||||
@ -649,8 +640,6 @@ double getEnergySame
|
|||||||
|
|
||||||
{ // free resources
|
{ // free resources
|
||||||
#ifdef HAVE_CUDA
|
#ifdef HAVE_CUDA
|
||||||
// we need to synchronize here since we need
|
|
||||||
// the Tijk for next process in the pipeline
|
|
||||||
cuCtxSynchronize();
|
cuCtxSynchronize();
|
||||||
cuMemFree((CUdeviceptr)_vhhh);
|
cuMemFree((CUdeviceptr)_vhhh);
|
||||||
cuMemFree((CUdeviceptr)_t_buffer);
|
cuMemFree((CUdeviceptr)_t_buffer);
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user