diff --git a/include/atrip/Equations.hpp b/include/atrip/Equations.hpp index f09d919..466b6c2 100644 --- a/include/atrip/Equations.hpp +++ b/include/atrip/Equations.hpp @@ -103,6 +103,11 @@ void singlesContribution // -- TIJK // , DataPtr Tijk , DataFieldType* Tijk_ +#if defined(HAVE_CUDA) + // -- tmp buffers + , DataFieldType* _t_buffer + , DataFieldType* _vhhh +#endif ); // Doubles contribution:1 ends here diff --git a/include/atrip/Slice.hpp b/include/atrip/Slice.hpp index 3d12187..a24d23f 100644 --- a/include/atrip/Slice.hpp +++ b/include/atrip/Slice.hpp @@ -352,7 +352,7 @@ Info info; // [[file:~/cuda/atrip/atrip.org::*Attributes][Attributes:2]] DataPtr data; -#if defined(HAVE_CUDA && !defined ATRIP_SOURCES_IN_GPU) +#if defined(HAVE_CUDA) && !defined (ATRIP_SOURCES_IN_GPU) F* mpi_data; #endif // Attributes:2 ends here @@ -456,7 +456,7 @@ void unwrapAndMarkReady() { if (errorCode != MPI_SUCCESS) throw "Atrip: Unexpected error MPI ERROR"; -#if defined(HAVE_CUDA && !defined ATRIP_SOURCES_IN_GPU) +#if defined(HAVE_CUDA) && !defined(ATRIP_SOURCES_IN_GPU) // copy the retrieved mpi data to the device WITH_CHRONO("cuda:memcpy", _CHECK_CUDA_SUCCESS("copying mpi data to device", @@ -488,7 +488,7 @@ void unwrapAndMarkReady() { Slice(size_t size_) : info({}) , data(DataNullPtr) -#if defined(HAVE_CUDA && !defined ATRIP_SOURCES_IN_GPU) +#if defined(HAVE_CUDA) && !defined(ATRIP_SOURCES_IN_GPU) , mpi_data(nullptr) #endif , size(size_) diff --git a/include/atrip/SliceUnion.hpp b/include/atrip/SliceUnion.hpp index 8813c23..008c926 100644 --- a/include/atrip/SliceUnion.hpp +++ b/include/atrip/SliceUnion.hpp @@ -405,6 +405,7 @@ template , sliceSize(std::accumulate(sliceLength.begin(), sliceLength.end(), 1UL, std::multiplies())) + #if defined(ATRIP_SOURCES_IN_GPU) , sources(rankMap.nSources()) #else @@ -417,6 +418,7 @@ template { // constructor begin LOG(0,"Atrip") << "INIT SliceUnion: " << name << "\n"; + printf("sliceSize %d, number of slices %d\n\n\n", sliceSize, sources.size()); #if defined(ATRIP_SOURCES_IN_GPU) for (auto& ptr: sources) { @@ -571,7 +573,7 @@ template if (slice.info.state == Slice::Fetch) { // if-1 // TODO: do it through the slice class slice.info.state = Slice::Dispatched; -#if defined(HAVE_CUDA && defined ATRIP_SOURCES_IN_GPU) +#if defined(HAVE_CUDA) && defined(ATRIP_SOURCES_IN_GPU) # if !defined(ATRIP_CUDA_AWARE_MPI) # error "You need CUDA aware MPI to have slices on the GPU" # endif diff --git a/src/atrip/Atrip.cxx b/src/atrip/Atrip.cxx index 54c68a0..8265c0e 100644 --- a/src/atrip/Atrip.cxx +++ b/src/atrip/Atrip.cxx @@ -258,6 +258,25 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { // all tensors std::vector< SliceUnion* > unions = {&taphh, &hhha, &abph, &abhh, &tabhh}; +#ifdef HAVE_CUDA + // TODO: free buffers + DataFieldType* _t_buffer; + DataFieldType* _vhhh; + WITH_CHRONO("double:cuda:alloc", + _CHECK_CUDA_SUCCESS("Allocating _t_buffer", + cuMemAlloc((CUdeviceptr*)&_t_buffer, + No*No*No * sizeof(DataFieldType))); + _CHECK_CUDA_SUCCESS("Allocating _vhhh", + cuMemAlloc((CUdeviceptr*)&_vhhh, + No*No*No * sizeof(DataFieldType))); + ) + //const size_t + // bs = Atrip::kernelDimensions.ooo.blocks, + //ths = Atrip::kernelDimensions.ooo.threads; + //cuda::zeroing<<>>((DataFieldType*)_t_buffer, NoNoNo); + //cuda::zeroing<<>>((DataFieldType*)_vhhh, NoNoNo); +#endif + // get tuples for the current rank TuplesDistribution *distribution; @@ -639,7 +658,14 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { tabhh.unwrapSlice(Slice::AC, abc), tabhh.unwrapSlice(Slice::BC, abc), // -- TIJK - (DataFieldType*)Tijk); + (DataFieldType*)Tijk +#if defined(HAVE_CUDA) + // -- tmp buffers + ,(DataFieldType*)_t_buffer + ,(DataFieldType*)_vhhh +#endif + ); + WITH_RANK << iteration << "-th doubles done\n"; )) } diff --git a/src/atrip/Equations.cxx b/src/atrip/Equations.cxx index 12be45d..9f42613 100644 --- a/src/atrip/Equations.cxx +++ b/src/atrip/Equations.cxx @@ -401,9 +401,15 @@ void getEnergySame // -- TIJK // , DataPtr Tijk_ , DataFieldType* Tijk_ - ) { - - const size_t NoNo = No*No; +#if defined(HAVE_CUDA) + // -- tmp buffers + , DataFieldType* _t_buffer + , DataFieldType* _vhhh +#endif + ) { + const size_t a = abc[0], b = abc[1], c = abc[2] + , NoNo = No*No + ; DataFieldType* Tijk = (DataFieldType*)Tijk_; @@ -517,25 +523,21 @@ void getEnergySame F one{1.0}, m_one{-1.0}, zero{0.0}; const size_t NoNoNo = No*NoNo; #ifdef HAVE_CUDA - DataFieldType* _t_buffer; - DataFieldType* _vhhh; - WITH_CHRONO("double:cuda:alloc", - _CHECK_CUDA_SUCCESS("Allocating _t_buffer", - cuMemAlloc((CUdeviceptr*)&_t_buffer, - NoNoNo * sizeof(DataFieldType))); - _CHECK_CUDA_SUCCESS("Allocating _vhhh", - cuMemAlloc((CUdeviceptr*)&_vhhh, - NoNoNo * sizeof(DataFieldType))); - ) - const size_t - bs = Atrip::kernelDimensions.ooo.blocks, - ths = Atrip::kernelDimensions.ooo.threads; - -#if !defined(ATRIP_ONLY_DGEMM) - acc::zeroing<<>>((DataFieldType*)_t_buffer, NoNoNo); - acc::zeroing<<>>((DataFieldType*)_vhhh, NoNoNo); -#endif - +// DataFieldType* _t_buffer; +// DataFieldType* _vhhh; +// WITH_CHRONO("double:cuda:alloc", +// _CHECK_CUDA_SUCCESS("Allocating _t_buffer", +// cuMemAlloc((CUdeviceptr*)&_t_buffer, +// NoNoNo * sizeof(DataFieldType))); +// _CHECK_CUDA_SUCCESS("Allocating _vhhh", +// cuMemAlloc((CUdeviceptr*)&_vhhh, +// NoNoNo * sizeof(DataFieldType))); +// ) +// const size_t +// bs = Atrip::kernelDimensions.ooo.blocks, +// ths = Atrip::kernelDimensions.ooo.threads; + //cuda::zeroing<<>>((DataFieldType*)_t_buffer, NoNoNo); + //cuda::zeroing<<>>((DataFieldType*)_vhhh, NoNoNo); #else DataFieldType* _t_buffer = (DataFieldType*)malloc(NoNoNo * sizeof(F)); DataFieldType* _vhhh = (DataFieldType*)malloc(NoNoNo * sizeof(F)); @@ -649,12 +651,12 @@ void getEnergySame #ifdef HAVE_CUDA // we need to synchronize here since we need // the Tijk for next process in the pipeline - _CHECK_CUDA_SUCCESS("Synchronizing", - cuCtxSynchronize()); - _CHECK_CUDA_SUCCESS("Freeing _vhhh", - cuMemFree((CUdeviceptr)_vhhh)); - _CHECK_CUDA_SUCCESS("Freeing _t_buffer", - cuMemFree((CUdeviceptr)_t_buffer)); + //_CHECK_CUDA_SUCCESS("Synchronizing", + // cuCtxSynchronize()); + //_CHECK_CUDA_SUCCESS("Freeing _vhhh", + // cuMemFree((CUdeviceptr)_vhhh)); + //_CHECK_CUDA_SUCCESS("Freeing _t_buffer", + // cuMemFree((CUdeviceptr)_t_buffer)); #else free(_vhhh); free(_t_buffer); @@ -741,6 +743,12 @@ void getEnergySame , DataPtr const TBChh // -- TIJK , DataFieldType* Tijk +#if defined(HAVE_CUDA) + // -- tmp buffers + , DataFieldType* _t_buffer + , DataFieldType* _vhhh +#endif + ); template @@ -769,6 +777,12 @@ void getEnergySame , DataPtr const TBChh // -- TIJK , DataFieldType* Tijk +#if defined(HAVE_CUDA) + // -- tmp buffers + , DataFieldType* _t_buffer + , DataFieldType* _vhhh +#endif + ); // Doubles contribution:2 ends here