diff --git a/src/atrip/Equations.cxx b/src/atrip/Equations.cxx index 18a822e..8b7d05d 100644 --- a/src/atrip/Equations.cxx +++ b/src/atrip/Equations.cxx @@ -23,6 +23,23 @@ namespace atrip { // Prolog:2 ends here + + // These are just help structures + // to help with the templating of reorder + // function + enum reordering_t + { + IJK, + IKJ, + JIK, + JKI, + KIJ, + KJI + }; + + template + struct reorder_proxy {}; + #ifdef HAVE_CUDA namespace cuda { @@ -111,11 +128,61 @@ namespace cuda { return lz; } - - }; #endif +#if defined(HAVE_CUDA) +#define LIMS_KS \ + size_t \ + kmin = blockIdx.x * blockDim.x + threadIdx.x, \ + k = kmin, \ + idx = kmin * size * size * size \ + ; \ + k < (kmin < size) ? kmin + 1 : size +#else +#define LIMS_KS size_t k=0, idx=0; k < size +#endif +#define _IJK_(i, j, k) i + j*size + k*size*size +#define _REORDER_BODY_(...) \ + for (LIMS_KS ; k++) \ + for (size_t j = 0; j < size; j++) \ + for (size_t i = 0; i < size; i++, idx++) { \ + __VA_ARGS__ \ + } +#define _MAKE_REORDER_(_enum, ...) \ + template \ + __MAYBE_GLOBAL__ \ + void reorder(reorder_proxy< F, _enum > p, \ + size_t size, F* to, F* from) { \ + _REORDER_BODY_(__VA_ARGS__) \ + } +#if defined(HAVE_CUDA) +#define GO(__TO, __FROM) cuda::sum_in_place(&__TO, &__FROM); +#else +#define GO(__TO, __FROM) __TO += __FROM; +#endif + + + template + __MAYBE_GLOBAL__ \ + void reorder(reorder_proxy proxy, + size_t size, F* to, F* from); + + _MAKE_REORDER_(IJK, GO(to[idx], from[_IJK_(i, j, k)])) + _MAKE_REORDER_(IKJ, GO(to[idx], from[_IJK_(i, k, j)])) + _MAKE_REORDER_(JIK, GO(to[idx], from[_IJK_(j, i, k)])) + _MAKE_REORDER_(JKI, GO(to[idx], from[_IJK_(j, k, i)])) + _MAKE_REORDER_(KIJ, GO(to[idx], from[_IJK_(k, i, j)])) + _MAKE_REORDER_(KJI, GO(to[idx], from[_IJK_(k, j, i)])) + + +#undef LIMS_KS +#undef _MAKE_REORDER +#undef _REORDER_BODY_ +#undef _IJK_ +#undef GO + + // [[file:~/cuda/atrip/atrip.org::*Energy][Energy:2]] template double getEnergyDistinct @@ -275,10 +342,7 @@ double getEnergySame // Energy:3 ends here // [[file:~/cuda/atrip/atrip.org::*Singles%20contribution][Singles contribution:2]] -template -#ifdef HAVE_CUDA -__global__ -#endif + template __MAYBE_GLOBAL__ void singlesContribution ( size_t No , size_t Nv @@ -296,7 +360,7 @@ __global__ for (size_t k = 0; k < No; k++) for (size_t i = 0; i < No; i++) for (size_t j = 0; j < No; j++) { - const size_t ijk = i + j*No + k*No*No; + const size_t ijk = i + j*No + k*NoNo; #ifdef HAVE_CUDA # define GO(__TPH, __VABIJ) \ @@ -317,10 +381,7 @@ __global__ // instantiate - template -#ifdef HAVE_CUDA - __global__ -#endif + template __MAYBE_GLOBAL__ void singlesContribution( size_t No , size_t Nv , size_t a @@ -333,10 +394,7 @@ __global__ , double* Zijk ); - template -#ifdef HAVE_CUDA - __global__ -#endif + template __MAYBE_GLOBAL__ void singlesContribution( size_t No , size_t Nv , size_t a @@ -381,18 +439,18 @@ __global__ ) { const size_t a = abc[0], b = abc[1], c = abc[2] - , NoNo = No*No, NoNv = No*Nv + , NoNo = No*No ; - typename DataField::type* Tijk = (typename DataField::type*) Tijk_; - LOG(0, "AtripCUDA") << "in doubles " << "\n"; + DataFieldType* Tijk = (DataFieldType*)Tijk_; #if defined(ATRIP_USE_DGEMM) -#define _IJK_(i, j, k) i + j*No + k*NoNo #if defined(HAVE_CUDA) -// TODO -#define REORDER(__II, __JJ, __KK) -#define __TO_DEVICEPTR(_v) (_v) +#define REORDER(__II, __JJ, __KK) \ + reorder<<< \ + bs, ths \ + >>>(reorder_proxy, __II ## __JJ ## __KK >{}, \ + No, Tijk, _t_buffer); #define DGEMM_PARTICLES(__A, __B) \ atrip::xgemm("T", \ "N", \ @@ -404,9 +462,9 @@ __global__ (int const*)&Nv, \ (DataFieldType*)__B, \ (int const*)&Nv, \ - &zero, \ - _t_buffer_p, \ - (int const*)&NoNo); + &zero, \ + _t_buffer, \ + (int const*)&NoNo); #define DGEMM_HOLES(__A, __B, __TRANSB) \ atrip::xgemm("N", \ __TRANSB, \ @@ -414,26 +472,24 @@ __global__ (int const*)&No, \ (int const*)&No, \ &m_one, \ - __TO_DEVICEPTR(__A), \ + __A, \ (int const*)&NoNo, \ (DataFieldType*)__B, \ (int const*)&No, \ &zero, \ - _t_buffer_p, \ + _t_buffer, \ (int const*)&NoNo \ - ); + ); #define MAYBE_CONJ(_conj, _buffer) \ - cuda::maybeConjugate<<<1,1>>>((DataFieldType*)_conj, (DataFieldType*)_buffer, NoNoNo); + cuda::maybeConjugate<<< \ + Atrip::kernelDimensions.ooo.blocks, \ + Atrip::kernelDimensions.ooo.threads \ + >>>((DataFieldType*)_conj, (DataFieldType*)_buffer, NoNoNo); #else -#define REORDER(__II, __JJ, __KK) \ - WITH_CHRONO("doubles:reorder", \ - for (size_t k = 0; k < No; k++) \ - for (size_t j = 0; j < No; j++) \ - for (size_t i = 0; i < No; i++) { \ - Tijk[_IJK_(i, j, k)] += _t_buffer_p[_IJK_(__II, __JJ, __KK)]; \ - } \ - ) -#define __TO_DEVICEPTR(_v) (_v) +// NONCUDA ////////////////////////////////////////////////////////////////////// +#define REORDER(__II, __JJ, __KK) \ + reorder(reorder_proxy, __II ## __JJ ## __KK >{}, \ + No, Tijk, _t_buffer); #define DGEMM_PARTICLES(__A, __B) \ atrip::xgemm("T", \ "N", \ @@ -446,7 +502,7 @@ __global__ __B, \ (int const*)&Nv, \ &zero, \ - _t_buffer_p, \ + _t_buffer, \ (int const*)&NoNo \ ); #define DGEMM_HOLES(__A, __B, __TRANSB) \ @@ -461,8 +517,8 @@ __global__ __B, \ (int const*)&No, \ &zero, \ - _t_buffer_p, \ - (int const*)&NoNo \ + _t_buffer, \ + (int const*)&NoNo \ ); #define MAYBE_CONJ(_conj, _buffer) \ for (size_t __i = 0; __i < NoNoNo; ++__i) \ @@ -470,31 +526,33 @@ __global__ #endif F one{1.0}, m_one{-1.0}, zero{0.0}; - DataFieldType zero_h{0.0}; const size_t NoNoNo = No*NoNo; #ifdef HAVE_CUDA DataFieldType* _t_buffer; DataFieldType* _vhhh; - LOG(0, "AtripCUDA") << "getting memory" << "\n"; cuMemAlloc((CUdeviceptr*)&_t_buffer, NoNoNo * sizeof(DataFieldType)); cuMemAlloc((CUdeviceptr*)&_vhhh, NoNoNo * sizeof(DataFieldType)); - LOG(0, "AtripCUDA") << "cuda::zeroing " << "\n"; - cuda::zeroing<<<1,1>>>((DataFieldType*)_t_buffer, NoNoNo); - cuda::zeroing<<<1,1>>>((DataFieldType*)_vhhh, NoNoNo); + const size_t + bs = Atrip::kernelDimensions.ooo.blocks, + ths = Atrip::kernelDimensions.ooo.threads; + cuda::zeroing<<>>((DataFieldType*)_t_buffer, NoNoNo); + cuda::zeroing<<>>((DataFieldType*)_vhhh, NoNoNo); #else - F* _t_buffer = (F*)malloc(NoNoNo * sizeof(F)); - F* _vhhh = (F*)malloc(NoNoNo * sizeof(F)); + DataFieldType* _t_buffer = (DataFieldType*)malloc(NoNoNo * sizeof(F)); + DataFieldType* _vhhh = (DataFieldType*)malloc(NoNoNo * sizeof(F)); + DataFieldType zero_h{0.0}; for (size_t i=0; i < NoNoNo; i++) { _t_buffer[i] = zero_h; _vhhh[i] = zero_h; } #endif - //_t_buffer.reserve(NoNoNo); - DataFieldType* _t_buffer_p = __TO_DEVICEPTR(_t_buffer); + // Set Tijk to zero #ifdef HAVE_CUDA - LOG(0, "AtripCUDA") << "cuda::zeroing Tijk" << "\n"; - cuda::zeroing<<<1,1>>>((DataFieldType*)Tijk, NoNoNo); + WITH_CHRONO("double:reorder", + cuda::zeroing<<>>((DataFieldType*)Tijk, NoNoNo); + // synchronize all initializations to zero + ) #else WITH_CHRONO("double:reorder", for (size_t k = 0; k < NoNoNo; k++) { @@ -502,103 +560,89 @@ __global__ }) #endif - LOG(0, "AtripCUDA") << "doing holes" << "\n"; - // TOMERGE: replace chronos + // HOLES WITH_CHRONO("doubles:holes", - { // Holes part %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - - + { // VhhhC[i + k*No + L*NoNo] * TABhh[L + j*No]; H1 - LOG(0, "AtripCUDA") << "conj 1" << "\n"; MAYBE_CONJ(_vhhh, VhhhC) - LOG(0, "AtripCUDA") << "done" << "\n"; WITH_CHRONO("doubles:holes:1", - LOG(0, "AtripCUDA") << "dgemm 1" << "\n"; DGEMM_HOLES(_vhhh, TABhh, "N") - LOG(0, "AtripCUDA") << "reorder 1" << "\n"; - REORDER(i, k, j) + REORDER(I, K, J) ) // VhhhC[j + k*No + L*NoNo] * TABhh[i + L*No]; H0 WITH_CHRONO("doubles:holes:2", - LOG(0, "AtripCUDA") << "dgemm 2" << "\n"; DGEMM_HOLES(_vhhh, TABhh, "T") - REORDER(j, k, i) + REORDER(J, K, I) ) // VhhhB[i + j*No + L*NoNo] * TAChh[L + k*No]; H5 - LOG(0, "AtripCUDA") << "conj 2" << "\n"; MAYBE_CONJ(_vhhh, VhhhB) - LOG(0, "AtripCUDA") << "done" << "\n"; WITH_CHRONO("doubles:holes:3", DGEMM_HOLES(_vhhh, TAChh, "N") - REORDER(i, j, k) + REORDER(I, J, K) ) // VhhhB[k + j*No + L*NoNo] * TAChh[i + L*No]; H3 WITH_CHRONO("doubles:holes:4", DGEMM_HOLES(_vhhh, TAChh, "T") - REORDER(k, j, i) + REORDER(K, J, I) ) // VhhhA[j + i*No + L*NoNo] * TBChh[L + k*No]; H1 - LOG(0, "AtripCUDA") << "conj 3" << "\n"; MAYBE_CONJ(_vhhh, VhhhA) WITH_CHRONO("doubles:holes:5", DGEMM_HOLES(_vhhh, TBChh, "N") - REORDER(j, i, k) + REORDER(J, I, K) ) // VhhhA[k + i*No + L*NoNo] * TBChh[j + L*No]; H4 WITH_CHRONO("doubles:holes:6", DGEMM_HOLES(_vhhh, TBChh, "T") - REORDER(k, i, j) + REORDER(K, I, J) ) - } ) #undef MAYBE_CONJ - LOG(0, "AtripCUDA") << "doing particles" << "\n"; + // PARTICLES WITH_CHRONO("doubles:particles", - { // Particle part %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + { // TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv]; P0 WITH_CHRONO("doubles:particles:1", DGEMM_PARTICLES(TAphh, VBCph) - REORDER(i, j, k) + REORDER(I, J, K) ) // TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv]; P3 WITH_CHRONO("doubles:particles:2", DGEMM_PARTICLES(TAphh, VCBph) - REORDER(i, k, j) + REORDER(I, K, J) ) // TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv]; P5 WITH_CHRONO("doubles:particles:3", DGEMM_PARTICLES(TCphh, VABph) - REORDER(k, i, j) + REORDER(K, I, J) ) // TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv]; P2 WITH_CHRONO("doubles:particles:4", DGEMM_PARTICLES(TCphh, VBAph) - REORDER(k, j, i) + REORDER(K, J, I) ) // TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv]; P1 WITH_CHRONO("doubles:particles:5", DGEMM_PARTICLES(TBphh, VACph) - REORDER(j, i, k) + REORDER(J, I, K) ) // TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv]; P4 WITH_CHRONO("doubles:particles:6", DGEMM_PARTICLES(TBphh, VCAph) - REORDER(j, k, i) + REORDER(J, K, I) ) } ) - LOG(0, "AtripCUDA") << "particles done" << "\n"; { // free resources #ifdef HAVE_CUDA - LOG(0, "AtripCUDA") << "free mem" << "\n"; + cuCtxSynchronize(); cuMemFree((CUdeviceptr)_vhhh); cuMemFree((CUdeviceptr)_t_buffer); - LOG(0, "AtripCUDA") << "free mem done" << "\n"; #else free(_vhhh); free(_t_buffer); @@ -608,8 +652,8 @@ __global__ #undef REORDER #undef DGEMM_HOLES #undef DGEMM_PARTICLES - #undef _IJK_ #else + const size_t NoNv = No*Nv; for (size_t k = 0; k < No; k++) for (size_t j = 0; j < No; j++) for (size_t i = 0; i < No; i++){