From 34a4e79db0cc941c3045f37b6aff2e5af7a432e6 Mon Sep 17 00:00:00 2001 From: Alejandro Gallo Date: Fri, 13 Jan 2023 11:33:42 +0100 Subject: [PATCH] Initial compiling implementation of the energy kernel --- include/atrip/CUDA.hpp | 11 ++ include/atrip/Equations.hpp | 10 +- include/atrip/Operations.hpp | 171 ++++++++++++++++++++ src/atrip/Atrip.cxx | 56 +++++-- src/atrip/Equations.cxx | 304 +++++++++++++++-------------------- 5 files changed, 365 insertions(+), 187 deletions(-) create mode 100644 include/atrip/Operations.hpp diff --git a/include/atrip/CUDA.hpp b/include/atrip/CUDA.hpp index 3f87e12..8dab909 100644 --- a/include/atrip/CUDA.hpp +++ b/include/atrip/CUDA.hpp @@ -11,11 +11,22 @@ #if defined(HAVE_CUDA) && defined(__CUDACC__) # define __MAYBE_GLOBAL__ __global__ # define __MAYBE_DEVICE__ __device__ +# define __MAYBE_HOST__ __host__ +# define __INLINE__ __inline__ #else # define __MAYBE_GLOBAL__ # define __MAYBE_DEVICE__ +# define __MAYBE_HOST__ +# define __INLINE__ inline #endif +#if defined(HAVE_CUDA) +#define ACC_FUNCALL(fname, i, j, ...) fname<<<(i), (j)>>>(__VA_ARGS__) +#else +#define ACC_FUNCALL(fname, i, j, ...) fname(__VA_ARGS__) +#endif /* defined(HAVE_CUDA) */ + + #define _CHECK_CUDA_SUCCESS(message, ...) \ do { \ CUresult result = __VA_ARGS__; \ diff --git a/include/atrip/Equations.hpp b/include/atrip/Equations.hpp index fbee04c..f09d919 100644 --- a/include/atrip/Equations.hpp +++ b/include/atrip/Equations.hpp @@ -23,6 +23,8 @@ #include #endif +#include + namespace atrip { using ABCTuple = std::array; @@ -32,21 +34,25 @@ using ABCTuples = std::vector; // [[file:~/cuda/atrip/atrip.org::*Energy][Energy:1]] template -double getEnergyDistinct +__MAYBE_GLOBAL__ +void getEnergyDistinct ( F const epsabc , size_t const No , F* const epsi , F* const Tijk , F* const Zijk + , double* energy ); template -double getEnergySame +__MAYBE_GLOBAL__ +void getEnergySame ( F const epsabc , size_t const No , F* const epsi , F* const Tijk , F* const Zijk + , double* energy ); // Energy:1 ends here diff --git a/include/atrip/Operations.hpp b/include/atrip/Operations.hpp new file mode 100644 index 0000000..ab80884 --- /dev/null +++ b/include/atrip/Operations.hpp @@ -0,0 +1,171 @@ +// Copyright 2022 Alejandro Gallo +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPERATIONS_HPP_ +#define OPERATIONS_HPP_ + +#include +#include +#include + +namespace atrip { +namespace acc { + + // cuda kernels + + template + __MAYBE_GLOBAL__ + void zeroing(F* a, size_t n) { + F zero = {0}; + for (size_t i = 0; i < n; i++) { + a[i] = zero; + } + } + + //// + template + __MAYBE_DEVICE__ __MAYBE_HOST__ __INLINE__ + F maybeConjugateScalar(const F &a) { return a; } + +#if defined(HAVE_CUDA) + template <> + __MAYBE_DEVICE__ __MAYBE_HOST__ __INLINE__ + cuDoubleComplex maybeConjugateScalar(const cuDoubleComplex &a) { + return {a.x, -a.y}; + } +#endif /* defined(HAVE_CUDA) */ + + template + __MAYBE_DEVICE__ __MAYBE_HOST__ + void maybeConjugate(F* to, F* from, size_t n) { + for (size_t i = 0; i < n; ++i) { + to[i] = maybeConjugateScalar(from[i]); + } + } + + + template + __MAYBE_DEVICE__ __MAYBE_HOST__ + void reorder(F* to, F* from, size_t size, size_t I, size_t J, size_t K) { + size_t idx = 0; + const size_t IDX = I + J*size + K*size*size; + for (size_t k = 0; k < size; k++) + for (size_t j = 0; j < size; j++) + for (size_t i = 0; i < size; i++, idx++) + to[idx] += from[IDX]; + } + + // Multiplication operation + ////////////////////////////////////////////////////////////////////////////// + + template + __MAYBE_DEVICE__ __MAYBE_HOST__ __INLINE__ + F prod(const F &a, const F &b) { return a * b; } + +#if defined(HAVE_CUDA) + template <> + __MAYBE_DEVICE__ __MAYBE_HOST__ __INLINE__ + cuDoubleComplex prod(const cuDoubleComplex &a, const cuDoubleComplex &b) { + return cuCmul(a, b); + } +#endif /* defined(HAVE_CUDA) */ + + // Division operation + ////////////////////////////////////////////////////////////////////////////// + + template + __MAYBE_DEVICE__ __MAYBE_HOST__ __INLINE__ + F div(const F &a, const F &b) { return a / b; } + +#if defined(HAVE_CUDA) + template <> + __MAYBE_DEVICE__ __MAYBE_HOST__ __INLINE__ + cuDoubleComplex div(const cuDoubleComplex &a, const cuDoubleComplex &b) { + return cuCdiv(a, b); + } +#endif /* defined(HAVE_CUDA) */ + + // Real part + ////////////////////////////////////////////////////////////////////////////// + + template + __MAYBE_HOST__ __INLINE__ + double real(F &a) { return std::real(a); } + + template <> + __MAYBE_DEVICE__ __MAYBE_HOST__ __INLINE__ + double real(double &a) { + return a; + } + +#if defined(HAVE_CUDA) + template <> + __MAYBE_DEVICE__ __MAYBE_HOST__ __INLINE__ + double real(cuDoubleComplex &a) { + return cuCreal(a); + } +#endif /* defined(HAVE_CUDA) */ + + // Substraction operator + ////////////////////////////////////////////////////////////////////////////// + + template + __MAYBE_DEVICE__ __MAYBE_HOST__ __INLINE__ + F sub(const F &a, const F &b) { return a - b; } + +#if defined(HAVE_CUDA) + template <> + __MAYBE_DEVICE__ __MAYBE_HOST__ __INLINE__ + cuDoubleComplex sub(const cuDoubleComplex &a, + const cuDoubleComplex &b) { + return cuCsub(a, b); + } +#endif /* defined(HAVE_CUDA) */ + + // Addition operator + ////////////////////////////////////////////////////////////////////////////// + + template + __MAYBE_DEVICE__ __MAYBE_HOST__ __INLINE__ + F add(const F &a, const F &b) { return a + b; } + +#if defined(HAVE_CUDA) + template <> + __MAYBE_DEVICE__ __MAYBE_HOST__ __INLINE__ + cuDoubleComplex add(const cuDoubleComplex &a, const cuDoubleComplex &b) { + return cuCadd(a, b); + } +#endif /* defined(HAVE_CUDA) */ + + // Sum in place operator + ////////////////////////////////////////////////////////////////////////////// + + template + __MAYBE_DEVICE__ __MAYBE_HOST__ + void sum_in_place(F* to, const F* from) { *to += *from; } + +#if defined(HAVE_CUDA) + template <> + __MAYBE_DEVICE__ __MAYBE_HOST__ + void sum_in_place(cuDoubleComplex* to, const cuDoubleComplex* from) { + to->x += from->x; + to->y += from->y; + } +#endif /* defined(HAVE_CUDA) */ + + +} // namespace acc +} // namespace atrip + +#endif diff --git a/src/atrip/Atrip.cxx b/src/atrip/Atrip.cxx index a946459..e63e9c5 100644 --- a/src/atrip/Atrip.cxx +++ b/src/atrip/Atrip.cxx @@ -683,31 +683,59 @@ Atrip::Output Atrip::run(Atrip::Input const& in) { // COMPUTE ENERGY %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% {{{1 #if defined(ATRIP_ONLY_DGEMM) if (false) -#endif +#endif /* defined(ATRIP_ONLY_DGEMM) */ if (!isFakeTuple(i)) { - double tupleEnergy(0.); +#if defined(HAVE_CUDA) + double *tupleEnergy; + cuMemAlloc((DataPtr*)&tupleEnergy, sizeof(double)); +#elif + double _tupleEnergy(0.); + double *tupleEnergy = &_tupleEnergy; +#endif /* defined(HAVE_CUDA) */ int distinct(0); if (abc[0] == abc[1]) distinct++; if (abc[1] == abc[2]) distinct--; - const F epsabc(_epsa[abc[0]] + _epsa[abc[1]] + _epsa[abc[2]]); + const double + epsabc = std::real(_epsa[abc[0]] + _epsa[abc[1]] + _epsa[abc[2]]); + + DataFieldType _epsabc{epsabc}; - // LOG(0, "AtripCUDA") << "doing energy " << i << "distinct " << distinct << "\n"; WITH_CHRONO("energy", -/* - TODO: think about how to do this on the GPU in the best way possible - if ( distinct == 0) - tupleEnergy = getEnergyDistinct(epsabc, No, (F*)epsi, (F*)Tijk, (F*)Zijk); - else - tupleEnergy = getEnergySame(epsabc, No, (F*)epsi, (F*)Tijk, (F*)Zijk); -*/ - ) + if ( distinct == 0) { + ACC_FUNCALL(getEnergyDistinct>, + 1, 1, // for cuda + _epsabc, + No, + (DataFieldType*)epsi, + (DataFieldType*)Tijk, + (DataFieldType*)Zijk, + tupleEnergy); + } else { + ACC_FUNCALL(getEnergySame>, + 1, 1, // for cuda + _epsabc, + No, + (DataFieldType*)epsi, + (DataFieldType*)Tijk, + (DataFieldType*)Zijk, + tupleEnergy); + }) + +#if defined(HAVE_CUDA) + double host_tuple_energy; + cuMemcpyDtoH((void*)&host_tuple_energy, + (DataPtr)tupleEnergy, + sizeof(double)); +#elif + double host_tuple_energy = *tupleEnergy; +#endif /* defined(HAVE_CUDA) */ #if defined(HAVE_OCD) || defined(ATRIP_PRINT_TUPLES) - tupleEnergies[abc] = tupleEnergy; + tupleEnergies[abc] = host_tuple_energy; #endif - energy += tupleEnergy; + energy += host_tuple_energy; } diff --git a/src/atrip/Equations.cxx b/src/atrip/Equations.cxx index 4439383..4ae02b6 100644 --- a/src/atrip/Equations.cxx +++ b/src/atrip/Equations.cxx @@ -16,96 +16,13 @@ #include #include +#include namespace atrip { // Prolog:2 ends here -#ifdef HAVE_CUDA -namespace cuda { - - // cuda kernels - - template - __global__ - void zeroing(F* a, size_t n) { - F zero = {0}; - for (size_t i = 0; i < n; i++) { - a[i] = zero; - } - } - - //// - template - __device__ - F maybeConjugateScalar(const F a); - - template <> - __device__ - double maybeConjugateScalar(const double a) { return a; } - - template <> - __device__ - cuDoubleComplex - maybeConjugateScalar(const cuDoubleComplex a) { - return {a.x, -a.y}; - } - - template - __global__ - void maybeConjugate(F* to, F* from, size_t n) { - for (size_t i = 0; i < n; ++i) { - to[i] = maybeConjugateScalar(from[i]); - } - } - - - template - __global__ - void reorder(F* to, F* from, size_t size, size_t I, size_t J, size_t K) { - size_t idx = 0; - const size_t IDX = I + J*size + K*size*size; - for (size_t k = 0; k < size; k++) - for (size_t j = 0; j < size; j++) - for (size_t i = 0; i < size; i++, idx++) - to[idx] += from[IDX]; - } - - // I mean, really CUDA... really!? - template - __device__ - F multiply(const F &a, const F &b); - template <> - __device__ - double multiply(const double &a, const double &b) { return a * b; } - - template <> - __device__ - cuDoubleComplex multiply(const cuDoubleComplex &a, const cuDoubleComplex &b) { - return - {a.x * b.x - a.y * b.y, - a.x * b.y + a.y * b.x}; - } - - template - __device__ - void sum_in_place(F* to, const F* from); - - template <> - __device__ - void sum_in_place(double* to, const double *from) { *to += *from; } - - template <> - __device__ - void sum_in_place(cuDoubleComplex* to, const cuDoubleComplex* from) { - to->x += from->x; - to->y += from->y; - } - -}; -#endif - #if defined(HAVE_CUDA) #define FOR_K() \ for (size_t kmin = blockIdx.x * blockDim.x + threadIdx.x, \ @@ -133,7 +50,7 @@ namespace cuda { _REORDER_BODY_(__VA_ARGS__) \ } #if defined(HAVE_CUDA) -#define GO(__TO, __FROM) cuda::sum_in_place(&__TO, &__FROM); +#define GO(__TO, __FROM) acc::sum_in_place(&__TO, &__FROM); #else #define GO(__TO, __FROM) __TO += __FROM; #endif @@ -179,162 +96,199 @@ namespace cuda { #undef _IJK_ #undef GO +#if defined(HAVE_CUDA) +# define MIN(a, b) min((a), (b)) +#else +# define MIN(a, b) std::min((a), (b)) +#endif + // [[file:~/cuda/atrip/atrip.org::*Energy][Energy:2]] template -double getEnergyDistinct +__MAYBE_GLOBAL__ +void getEnergyDistinct ( F const epsabc , size_t const No , F* const epsi , F* const Tijk , F* const Zijk + , double* energy ) { constexpr size_t blockSize=16; - F energy(0.); + F _energy = {0.}; for (size_t kk=0; kk k ? jj : k; for (size_t j(jstart); j < jend; j++){ F const ej(epsi[j]); - F const facjk = j == k ? F(0.5) : F(1.0); + F const facjk = j == k ? F{0.5} : F{1.0}; size_t istart = ii > j ? ii : j; for (size_t i(istart); i < iend; i++){ const F ei(epsi[i]) - , facij = i == j ? F(0.5) : F(1.0) - , denominator(epsabc - ei - ej - ek) + , facij = i == j ? F{0.5} : F{1.0} + , eijk(acc::add(acc::add(ei, ej), ek)) + , denominator(acc::sub(epsabc, eijk)) , U(Zijk[i + No*j + No*No*k]) , V(Zijk[i + No*k + No*No*j]) , W(Zijk[j + No*i + No*No*k]) , X(Zijk[j + No*k + No*No*i]) , Y(Zijk[k + No*i + No*No*j]) , Z(Zijk[k + No*j + No*No*i]) - , A(maybeConjugate(Tijk[i + No*j + No*No*k])) - , B(maybeConjugate(Tijk[i + No*k + No*No*j])) - , C(maybeConjugate(Tijk[j + No*i + No*No*k])) - , D(maybeConjugate(Tijk[j + No*k + No*No*i])) - , E(maybeConjugate(Tijk[k + No*i + No*No*j])) - , _F(maybeConjugate(Tijk[k + No*j + No*No*i])) - , value - = 3.0 * ( A * U - + B * V - + C * W - + D * X - + E * Y - + _F * Z ) - + ( ( U + X + Y ) - - 2.0 * ( V + W + Z ) - ) * ( A + D + E ) - + ( ( V + W + Z ) - - 2.0 * ( U + X + Y ) - ) * ( B + C + _F ) + , A(acc::maybeConjugateScalar(Tijk[i + No*j + No*No*k])) + , B(acc::maybeConjugateScalar(Tijk[i + No*k + No*No*j])) + , C(acc::maybeConjugateScalar(Tijk[j + No*i + No*No*k])) + , D(acc::maybeConjugateScalar(Tijk[j + No*k + No*No*i])) + , E(acc::maybeConjugateScalar(Tijk[k + No*i + No*No*j])) + , _F(acc::maybeConjugateScalar(Tijk[k + No*j + No*No*i])) + // I just might as well write this in CL + , _first = acc::add(acc::prod(A, U), + acc::add(acc::prod(B, V), + acc::add(acc::prod(C, W), + acc::add(acc::prod(D, X), + acc::add(acc::prod(E, Y), + acc::prod(_F, Z)))))) + , _second = acc::prod(acc::sub(acc::add(U, acc::add(X, Y)), + acc::prod(F{-2.0}, + acc::add(V, acc::add(W, Z)))), + acc::add(A, acc::add(D, E))) + , _third = acc::prod(acc::sub(acc::add(V, acc::add(W, Z)), + acc::prod(F{-2.0}, + acc::add(U, + acc::add(X, Y)))), + acc::add(B, acc::add(C, _F))) + , value = acc::add(acc::prod(F{3.0}, _first), + acc::add(_second, + _third)) + , _loop_energy = acc::prod(acc::prod(F{2.0}, value), + acc::div(acc::prod(facjk, facij), + denominator)) ; - energy += 2.0 * value / denominator * facjk * facij; + acc::sum_in_place(&_energy, &_loop_energy); } // i } // j } // k } // ii } // jj } // kk - return std::real(energy); + const double real_part = acc::real(_energy); + acc::sum_in_place(energy, &real_part); } template -double getEnergySame +__MAYBE_GLOBAL__ +void getEnergySame ( F const epsabc , size_t const No , F* const epsi , F* const Tijk , F* const Zijk + , double* energy ) { constexpr size_t blockSize = 16; - F energy = F(0.); + F _energy = F{0.}; for (size_t kk=0; kk k ? jj : k; for(size_t j(jstart); j < jend; j++){ - const F facjk( j == k ? F(0.5) : F(1.0)); + const F facjk( j == k ? F{0.5} : F{1.0}); const F ej(epsi[j]); const size_t istart = ii > j ? ii : j; for(size_t i(istart); i < iend; i++){ const F ei(epsi[i]) - , facij ( i==j ? F(0.5) : F(1.0)) - , denominator(epsabc - ei - ej - ek) + , facij ( i==j ? F{0.5} : F{1.0}) + , eijk(acc::add(acc::add(ei, ej), ek)) + , denominator(acc::sub(epsabc, eijk)) , U(Zijk[i + No*j + No*No*k]) , V(Zijk[j + No*k + No*No*i]) , W(Zijk[k + No*i + No*No*j]) - , A(maybeConjugate(Tijk[i + No*j + No*No*k])) - , B(maybeConjugate(Tijk[j + No*k + No*No*i])) - , C(maybeConjugate(Tijk[k + No*i + No*No*j])) - , value - = F(3.0) * ( A * U - + B * V - + C * W - ) - - ( A + B + C ) * ( U + V + W ) + , A(acc::maybeConjugateScalar(Tijk[i + No*j + No*No*k])) + , B(acc::maybeConjugateScalar(Tijk[j + No*k + No*No*i])) + , C(acc::maybeConjugateScalar(Tijk[k + No*i + No*No*j])) + , ABC = acc::add(A, acc::add(B, C)) + , UVW = acc::add(U, acc::add(V, W)) + , AU = acc::prod(A, U) + , BV = acc::prod(B, V) + , CW = acc::prod(C, W) + , AU_and_BV_and_CW = acc::add(acc::add(AU, BV), CW) + , value = acc::sub(acc::prod(F{3.0}, AU_and_BV_and_CW), + acc::prod(ABC, UVW)) + , _loop_energy = acc::prod(acc::prod(F{2.0}, value), + acc::div(acc::prod(facjk, facij), + denominator)) ; - energy += F(2.0) * value / denominator * facjk * facij; + + acc::sum_in_place(&_energy, &_loop_energy); } // i } // j } // k } // ii } // jj } // kk - return std::real(energy); + const double real_part = acc::real(_energy); + acc::sum_in_place(energy, &real_part); } // Energy:2 ends here // [[file:~/cuda/atrip/atrip.org::*Energy][Energy:3]] // instantiate double template -double getEnergyDistinct - ( double const epsabc +__MAYBE_GLOBAL__ +void getEnergyDistinct + ( DataFieldType const epsabc , size_t const No - , double* const epsi - , double* const Tijk - , double* const Zijk + , DataFieldType* const epsi + , DataFieldType* const Tijk + , DataFieldType* const Zijk + , DataFieldType* energy ); template -double getEnergySame - ( double const epsabc +__MAYBE_GLOBAL__ +void getEnergySame + ( DataFieldType const epsabc , size_t const No - , double* const epsi - , double* const Tijk - , double* const Zijk + , DataFieldType* const epsi + , DataFieldType* const Tijk + , DataFieldType* const Zijk + , DataFieldType* energy ); // instantiate Complex template -double getEnergyDistinct - ( Complex const epsabc +__MAYBE_GLOBAL__ +void getEnergyDistinct + ( DataFieldType const epsabc , size_t const No - , Complex* const epsi - , Complex* const Tijk - , Complex* const Zijk + , DataFieldType* const epsi + , DataFieldType* const Tijk + , DataFieldType* const Zijk + , DataFieldType* energy ); template -double getEnergySame - ( Complex const epsabc +__MAYBE_GLOBAL__ +void getEnergySame + ( DataFieldType const epsabc , size_t const No - , Complex* const epsi - , Complex* const Tijk - , Complex* const Zijk + , DataFieldType* const epsi + , DataFieldType* const Tijk + , DataFieldType* const Zijk + , DataFieldType* energy ); // Energy:3 ends here @@ -360,18 +314,26 @@ double getEnergySame const size_t ijk = i + j*No + k*NoNo; #ifdef HAVE_CUDA -# define GO(__TPH, __VABIJ) \ - { \ - const DataFieldType product \ - = cuda::multiply>((__TPH), (__VABIJ)); \ - cuda::sum_in_place>(&Zijk[ijk], &product); \ - } + +#define GO(__TPH, __VABIJ) \ + do { \ + const DataFieldType \ + product = acc::prod>((__TPH), \ + (__VABIJ)); \ + acc::sum_in_place>(&Zijk[ijk], \ + &product); \ + } while (0) + #else -# define GO(__TPH, __VABIJ) Zijk[ijk] += (__TPH) * (__VABIJ); + +#define GO(__TPH, __VABIJ) Zijk[ijk] += (__TPH) * (__VABIJ) + #endif - GO(Tph[ a + i * Nv ], VBCij[ j + k * No ]) - GO(Tph[ b + j * Nv ], VACij[ i + k * No ]) - GO(Tph[ c + k * Nv ], VABij[ i + j * No ]) + + GO(Tph[ a + i * Nv ], VBCij[ j + k * No ]); + GO(Tph[ b + j * Nv ], VACij[ i + k * No ]); + GO(Tph[ c + k * Nv ], VABij[ i + j * No ]); + #undef GO } // for loop j } @@ -480,7 +442,7 @@ double getEnergySame ) #define MAYBE_CONJ(_conj, _buffer) \ do { \ - cuda::maybeConjugate<<< \ + acc::maybeConjugate<<< \ \ Atrip::kernelDimensions.ooo.blocks, \ \ @@ -564,8 +526,8 @@ double getEnergySame ths = Atrip::kernelDimensions.ooo.threads; #if !defined(ATRIP_ONLY_DGEMM) - cuda::zeroing<<>>((DataFieldType*)_t_buffer, NoNoNo); - cuda::zeroing<<>>((DataFieldType*)_vhhh, NoNoNo); + acc::zeroing<<>>((DataFieldType*)_t_buffer, NoNoNo); + acc::zeroing<<>>((DataFieldType*)_vhhh, NoNoNo); #endif #else @@ -581,7 +543,7 @@ double getEnergySame // Set Tijk to zero #if defined(HAVE_CUDA) && !defined(ATRIP_ONLY_DGEMM) WITH_CHRONO("double:reorder", - cuda::zeroing<<>>((DataFieldType*)Tijk, + acc::zeroing<<>>((DataFieldType*)Tijk, NoNoNo); ) #else @@ -589,7 +551,7 @@ double getEnergySame for (size_t k = 0; k < NoNoNo; k++) { Tijk[k] = DataFieldType{0.0}; }) -#endif +#endif /* defined(HAVE_CUDA) && !defined(ATRIP_ONLY_DGEMM) */ #if defined(ATRIP_ONLY_DGEMM) @@ -597,7 +559,7 @@ double getEnergySame #undef REORDER #define MAYBE_CONJ(a, b) do {} while(0) #define REORDER(i, j, k) do {} while(0) -#endif +#endif /* defined(ATRIP_ONLY_DGEMM) */ // HOLES WITH_CHRONO("doubles:holes", @@ -690,7 +652,7 @@ double getEnergySame #else free(_vhhh); free(_t_buffer); -#endif +#endif /* defined(HAVE_CUDA) */ } #undef REORDER @@ -741,7 +703,7 @@ double getEnergySame } } -#endif +#endif /* defined(ATRIP_USE_DGEMM) */ }