diff --git a/include/atrip/Blas.hpp b/include/atrip/Blas.hpp index 117eb26..88e8b7a 100644 --- a/include/atrip/Blas.hpp +++ b/include/atrip/Blas.hpp @@ -55,28 +55,28 @@ namespace atrip { const int *ldc ); - void dcopy_(const int n, + void dcopy_(int *n, const double *x, - const int incx, + int *incx, double *y, - const int incy); + int *incy); - void zcopy_(const int n, + void zcopy_(int *n, const void *x, - const int incx, + int *incx, void *y, - const int incy); + int *incy); } #endif template - void xcopy(const int n, + void xcopy(int* n, const DataFieldType* x, - const int incx, + int* incx, DataFieldType* y, - const int incy); + int* incy); template void xgemm(const char *transa, diff --git a/src/atrip/Atrip.cxx b/src/atrip/Atrip.cxx index 7bb5e06..7781b47 100644 --- a/src/atrip/Atrip.cxx +++ b/src/atrip/Atrip.cxx @@ -27,7 +27,7 @@ using namespace atrip; namespace atrip { namespace cuda { - + }; }; @@ -219,9 +219,11 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { WITH_CHRONO("oneshot-db:comm:allgather", WITH_CHRONO("db:comm:allgather", MPI_Allgather( ldb.data() + // , ldb.size() * sizeof(typename Slice::LocalDatabaseElement) , ldb.size() , MPI_LDB_ELEMENT , db.data() + // , ldb.size() * sizeof(typename Slice::LocalDatabaseElement) , ldb.size() , MPI_LDB_ELEMENT , c); @@ -372,7 +374,7 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { } LOG(0, "AtripCUDA") << "Starting iterations\n"; - + for ( size_t i = first_iteration, @@ -423,12 +425,19 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { }); } + const double _doubles_time = Atrip::chrono["doubles"].count(), + _its_time = Atrip::chrono["iterations"].count(); + LOG(0,"Atrip") << "iteration " << iteration << " [" << 100 * iteration / nIterations << "%]" - << " (" << doublesFlops * iteration / Atrip::chrono["doubles"].count() + << " (" << (_doubles_time > 0.0 + ? doublesFlops * iteration / _doubles_time + : -1) << "GF)" - << " (" << doublesFlops * iteration / Atrip::chrono["iterations"].count() + << " (" << (_its_time > 0.0 + ? doublesFlops * iteration / _its_time + : -1) << "GF)" << "\n"; @@ -465,14 +474,21 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { LOG(0, "AtripCUDA") << "first database " << i << "\n"; WITH_RANK << "__first__:first database ............ \n"; const auto db = communicateDatabase(abc, universe); + LOG(0, "AtripCUDA") << "first database communicated" << i << "\n"; WITH_RANK << "__first__:first database communicated \n"; WITH_RANK << "__first__:first database io phase \n"; + LOG(0, "AtripCUDA") << "doing io " << i << "\n"; doIOPhase(db); + LOG(0, "AtripCUDA") << "io done " << i << "\n"; WITH_RANK << "__first__:first database io phase DONE\n"; WITH_RANK << "__first__::::Unwrapping all slices for first database\n"; + LOG(0, "AtripCUDA") << "unrwapping " << i << "\n"; for (auto& u: unions) u->unwrapAll(abc); + LOG(0, "AtripCUDA") << "unwrapped " << i << "\n"; WITH_RANK << "__first__::::Unwrapping slices for first database DONE\n"; + LOG(0, "AtripCUDA") << "barrier " << i << "\n"; MPI_Barrier(universe); + LOG(0, "AtripCUDA") << "barriered " << i << "\n"; } LOG(0, "AtripCUDA") << "next database" << i << "\n"; @@ -545,14 +561,14 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { abhh.unwrapAll(abc); ))) WITH_CHRONO("reorder", - LOG(0, "AtripCUDA") << "reorder singles" << i << "\n"; - atrip::xcopy(No*No*No, + int ooo = No*No*No, stride = 1; + atrip::xcopy(&ooo, #if defined(HAVE_CUDA) - (DataFieldType*)Tijk, 1, - (DataFieldType*)Zijk, 1); + (DataFieldType*)Tijk, &stride, + (DataFieldType*)Zijk, &stride); #else - (DataFieldType*)Tijk.data(), 1, - (DataFieldType*)Zijk.data(), 1); + (DataFieldType*)Tijk.data(), &stride, + (DataFieldType*)Zijk.data(), &stride); #endif ) WITH_CHRONO("singles", diff --git a/src/atrip/Blas.cxx b/src/atrip/Blas.cxx index d6ded3e..9aeccc6 100644 --- a/src/atrip/Blas.cxx +++ b/src/atrip/Blas.cxx @@ -105,32 +105,32 @@ namespace atrip { template <> - void xcopy(const int n, + void xcopy(int *n, const DataFieldType* x, - const int incx, + int *incx, DataFieldType* y, - const int incy) { + int *incy) { #if defined(HAVE_CUDA) cublasDcopy(Atrip::cuda.handle, - n, - x, incx, - y, incy); + *n, + x, *incx, + y, *incy); #else dcopy_(n, x, incx, y, incy); #endif } - + template <> - void xcopy(const int n, + void xcopy(int* n, const DataFieldType* x, - const int incx, + int* incx, DataFieldType* y, - const int incy) { + int* incy) { #if defined(HAVE_CUDA) cublasZcopy(Atrip::cuda.handle, - n, - x, incx, - y, incy); + *n, + x, *incx, + y, *incy); #else zcopy_(n, x, incx, y, incy); #endif