diff --git a/include/atrip/Operations.hpp b/include/atrip/Operations.hpp index 69ef5aa..a162aa3 100644 --- a/include/atrip/Operations.hpp +++ b/include/atrip/Operations.hpp @@ -24,15 +24,6 @@ namespace acc { // cuda kernels - template - __MAYBE_GLOBAL__ - void zeroing(F* a, size_t n) { - F zero = {0}; - for (size_t i = 0; i < n; i++) { - a[i] = zero; - } - } - //// template __MAYBE_DEVICE__ __MAYBE_HOST__ __INLINE__ diff --git a/src/atrip/Atrip.cxx b/src/atrip/Atrip.cxx index a560375..a5f14bd 100644 --- a/src/atrip/Atrip.cxx +++ b/src/atrip/Atrip.cxx @@ -160,9 +160,9 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { LOG(0,"Atrip") << "ooo blocks: " - << Atrip::kernelDimensions.ooo.blocks << "\n"; + << Atrip::kernelDimensions.ooo.blocks << "\n"; LOG(0,"Atrip") << "ooo threads per block: " - << Atrip::kernelDimensions.ooo.threads << "\n"; + << Atrip::kernelDimensions.ooo.threads << "\n"; #endif // allocate the three scratches, see piecuch diff --git a/src/atrip/Complex.cxx b/src/atrip/Complex.cxx index 96e0406..1740806 100644 --- a/src/atrip/Complex.cxx +++ b/src/atrip/Complex.cxx @@ -21,11 +21,6 @@ namespace atrip { template <> double maybeConjugate(const double a) { return a; } template <> Complex maybeConjugate(const Complex a) { return std::conj(a); } -#if defined(HAVE_CUDA) - -#endif - - namespace traits { template bool isComplex() { return false; } template <> bool isComplex() { return false; } diff --git a/src/atrip/Equations.cxx b/src/atrip/Equations.cxx index b94a44d..6e88106 100644 --- a/src/atrip/Equations.cxx +++ b/src/atrip/Equations.cxx @@ -13,6 +13,8 @@ // limitations under the License. // [[file:~/cuda/atrip/atrip.org::*Prolog][Prolog:2]] +#include + #include #include @@ -580,13 +582,8 @@ void getEnergySame ) #define MAYBE_CONJ(_conj, _buffer) \ do { \ - acc::maybeConjugate<<< \ - \ - Atrip::kernelDimensions.ooo.blocks, \ - \ - Atrip::kernelDimensions.ooo.threads \ - \ - >>>((DataFieldType*)_conj, \ + acc::maybeConjugate<<<1, 1 \ + >>>((DataFieldType*)_conj, \ (DataFieldType*)_buffer, \ NoNoNo); \ } while (0) @@ -648,50 +645,31 @@ void getEnergySame F one{1.0}, m_one{-1.0}, zero{0.0}; const size_t NoNoNo = No*NoNo; + +// Zeroing vectors #ifdef HAVE_CUDA -// DataFieldType* _t_buffer; -// DataFieldType* _vhhh; -// WITH_CHRONO("double:cuda:alloc", -// _CHECK_CUDA_SUCCESS("Allocating _t_buffer", -// cuMemAlloc((CUdeviceptr*)&_t_buffer, -// NoNoNo * sizeof(DataFieldType))); -// _CHECK_CUDA_SUCCESS("Allocating _vhhh", -// cuMemAlloc((CUdeviceptr*)&_vhhh, -// NoNoNo * sizeof(DataFieldType))); -// ) + #if !defined(ATRIP_ONLY_DGEMM) - // we still have to zero this - const size_t - bs = Atrip::kernelDimensions.ooo.blocks, - ths = Atrip::kernelDimensions.ooo.threads; - acc::zeroing<<>>((DataFieldType*)_t_buffer, NoNoNo); - acc::zeroing<<>>((DataFieldType*)_vhhh, NoNoNo); + { + const size_t elements = NoNoNo * sizeof(DataFieldType)/4; + WITH_CHRONO("double:zeroing", + _CHECK_CUDA_SUCCESS("Zeroing Tijk", + cuMemsetD32_v2((CUdeviceptr)Tijk, 0x00, elements)); + _CHECK_CUDA_SUCCESS("Zeroing t buffer", + cuMemsetD32_v2((CUdeviceptr)_t_buffer, 0x00, elements)); + _CHECK_CUDA_SUCCESS("Zeroing vhhh buffer", + cuMemsetD32_v2((CUdeviceptr)_vhhh, 0x00, elements)); + ) + } #endif #else DataFieldType* _t_buffer = (DataFieldType*)malloc(NoNoNo * sizeof(F)); DataFieldType* _vhhh = (DataFieldType*)malloc(NoNoNo * sizeof(F)); - DataFieldType zero_h{0.0}; - for (size_t i=0; i < NoNoNo; i++) { - _t_buffer[i] = zero_h; - _vhhh[i] = zero_h; - } -#endif - - // Set Tijk to zero -#if defined(HAVE_CUDA) && !defined(ATRIP_ONLY_DGEMM) - WITH_CHRONO("double:reorder", - acc::zeroing<<>>((DataFieldType*)Tijk, - NoNoNo); - ) -#endif - -#if !defined(HAVE_CUDA) - WITH_CHRONO("double:reorder", - for (size_t k = 0; k < NoNoNo; k++) { - Tijk[k] = DataFieldType{0.0}; - }) -#endif /* !defined(HAVE_CUDA) */ + std::memset((void*)_t_buffer, 0x00, NoNoNo * sizeof(DataFieldType)); + std::memset((void*)_vhhh, 0x00, NoNoNo * sizeof(DataFieldType)); + std::memset((void*)Tijk, 0x00, NoNoNo * sizeof(DataFieldType)); +#endif /* HAVE_CUDA */ #if defined(ATRIP_ONLY_DGEMM)