Minimal changes in Equations

commit 3d7702d501
parent c20b9e3bcb
Author: Gallo Alejandro
Date:   2022-09-12 19:10:14 +02:00


@@ -14,31 +14,13 @@
 // [[file:~/cuda/atrip/atrip.org::*Prolog][Prolog:2]]
 #include<atrip/Equations.hpp>
-#include<atrip/CUDA.hpp>
-#if defined(HAVE_CUDA)
-#include <cuda.h>
-#endif
+#include<atrip/CUDA.hpp>
 namespace atrip {
 // Prolog:2 ends here
-// These are just help structures
-// to help with the templating of reorder
-// function
-enum reordering_t
-{
-  IJK,
-  IKJ,
-  JIK,
-  JKI,
-  KIJ,
-  KJI
-};
-template <typename F, reordering_t R>
-struct reorder_proxy {};
 #ifdef HAVE_CUDA
 namespace cuda {
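
The hunk above replaces the inline #if defined(HAVE_CUDA) / #include <cuda.h> / #endif guard with a single include of atrip/CUDA.hpp. A minimal sketch of what such a header plausibly centralizes, assuming it carries the guard and the __MAYBE_GLOBAL__ qualifier used by the reorder kernels further down; this is inferred, not shown in this commit:

    // Hypothetical sketch of atrip/CUDA.hpp -- an assumption, not part of this diff.
    #pragma once
    #if defined(HAVE_CUDA)
    #  include <cuda.h>
    // Under CUDA, kernels are compiled as __global__ ...
    #  define __MAYBE_GLOBAL__ __global__
    #else
    // ... and as plain host functions otherwise.
    #  define __MAYBE_GLOBAL__
    #endif
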
@@ -101,49 +83,42 @@ namespace cuda {
 template <>
 __device__
 cuDoubleComplex multiply(const cuDoubleComplex &a, const cuDoubleComplex &b) {
   return
     {a.x * b.x - a.y * b.y,
      a.x * b.y + a.y * b.x};
 }
 template <typename F>
 __device__
-void sum_in_place(F* to, const F* b);
+void sum_in_place(F* to, const F* from);
 template <>
 __device__
-void sum_in_place(double* to, const double *b) { *to += *b; }
+void sum_in_place(double* to, const double *from) { *to += *from; }
 template <>
 __device__
-void sum_in_place(cuDoubleComplex* to, const cuDoubleComplex* b) {
-  to->x += b->x;
-  to->y += b->y;
-}
-__device__
-cuDoubleComplex& operator+=(cuDoubleComplex& lz, cuDoubleComplex const& rz) {
-  lz.x += rz.x;
-  lz.y += rz.y;
-  return lz;
+void sum_in_place(cuDoubleComplex* to, const cuDoubleComplex* from) {
+  to->x += from->x;
+  to->y += from->y;
 }
 };
 #endif
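
The removed operator+= overload for cuDoubleComplex is superseded by the explicit sum_in_place specializations, which keep the accumulation spelled the same way for double and cuDoubleComplex. A host-side analogue of the pattern, an illustrative sketch only, with std::complex standing in for cuDoubleComplex:

    #include <complex>

    // Primary template: declared, but defined only for supported field types.
    template <typename F>
    void sum_in_place(F* to, const F* from);

    // Real field: plain accumulation.
    template <>
    void sum_in_place(double* to, const double* from) { *to += *from; }

    // Complex field: componentwise accumulation, mirroring the
    // cuDoubleComplex specialization in the diff above.
    template <>
    void sum_in_place(std::complex<double>* to,
                      const std::complex<double>* from) { *to += *from; }
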
 #if defined(HAVE_CUDA)
-#define LIMS_KS()                                              \
-  size_t kmin = blockIdx.x * blockDim.x + threadIdx.x,         \
-         k = kmin,                                             \
-         idx = kmin * size * size * size                       \
-  ;                                                            \
-  k < (kmin < size) ? kmin + 1 : size
+#define FOR_K()                                                \
+  for (size_t kmin = blockIdx.x * blockDim.x + threadIdx.x,    \
+              k = kmin,                                        \
+              idx = kmin * size * size * size;                 \
+       k < (kmin < size) ? kmin + 1 : size;                    \
+       k++)
 #else
-#define LIMS_KS size_t k=0, idx=0; k < size
+#define FOR_K() for (size_t k=0, idx=0; k < size; k++)
 #endif
 #define _IJK_(i, j, k) i + j*size + k*size*size
 #define _REORDER_BODY_(...)                                    \
-  for (LIMS_KS() ; k++)                                        \
+  FOR_K()                                                      \
   for (size_t j = 0; j < size; j++)                            \
   for (size_t i = 0; i < size; i++, idx++) {                   \
     __VA_ARGS__                                                \
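
One caveat in the new FOR_K macro: with standard C++ precedence, the loop condition k < (kmin < size) ? kmin + 1 : size parses as (k < (kmin < size)) ? (kmin + 1) : size, and both branches of that ternary are nonzero whenever size > 0, so the condition never becomes false. The apparent intent, bounding each CUDA thread to at most one value of k, needs one more pair of parentheses. A sketch of the presumably intended expansion; this is an observation about the committed code, not a change the commit makes:

    // Inside a __global__ kernel: each thread derives k from its global
    // thread id and runs either one iteration (kmin < size) or none.
    for (size_t kmin = blockIdx.x * blockDim.x + threadIdx.x,
                k = kmin,
                idx = kmin * size * size * size;
         k < ((kmin < size) ? kmin + 1 : size);  // note the extra parentheses
         k++) {
      // the j and i loops of _REORDER_BODY_ follow here
    }
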
@@ -152,7 +127,9 @@ namespace cuda {
   template <typename F>                                        \
   __MAYBE_GLOBAL__                                             \
   void reorder(reorder_proxy< F, _enum > p,                    \
-               size_t size, F* to, F* from) {                  \
+               size_t size,                                    \
+               F* to,                                          \
+               F* from) {                                      \
     _REORDER_BODY_(__VA_ARGS__)                                \
   }
 #if defined(HAVE_CUDA)
@@ -161,9 +138,31 @@ namespace cuda {
 #define GO(__TO, __FROM) __TO += __FROM;
 #endif
+// Helper structures for templating
+// the reorder function
+enum reordering_t
+{
+  IJK,
+  IKJ,
+  JIK,
+  JKI,
+  KIJ,
+  KJI
+};
+/*
+ * Pleases the C++ type checker and template
+ * instantiator: an argument of this type in the
+ * function signature lets the compiler know which
+ * instantiation it should take.
+ */
+template <typename F, reordering_t R>
+struct reorder_proxy {};
 template <typename F, reordering_t R>
-__MAYBE_GLOBAL__ \
+__MAYBE_GLOBAL__
 void reorder(reorder_proxy<F, R> proxy,
              size_t size, F* to, F* from);
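
The comment added above describes why reorder_proxy exists: the empty struct carries (F, R) in its type, so overload resolution on the proxy argument selects the right reorder instantiation at the call site. A self-contained illustration of the same tag-dispatch pattern, with hypothetical names independent of atrip:

    #include <cstdio>

    enum reordering_t { IJK, IKJ };

    // Empty tag type: its only job is to carry R in the signature.
    template <reordering_t R>
    struct proxy {};

    template <reordering_t R>
    void reorder_demo(proxy<R>);

    // Each specialization is selected purely by the proxy's type.
    template <> void reorder_demo(proxy<IJK>) { std::puts("IJK order"); }
    template <> void reorder_demo(proxy<IKJ>) { std::puts("IKJ order"); }

    int main() {
      reorder_demo(proxy<IJK>{});  // prints "IJK order"
      reorder_demo(proxy<IKJ>{});  // prints "IKJ order"
    }
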
@@ -445,10 +444,14 @@ double getEnergySame
 #if defined(ATRIP_USE_DGEMM)
 #if defined(HAVE_CUDA)
 #define REORDER(__II, __JJ, __KK)                              \
-  reorder<<<bs, ths>>>(reorder_proxy<DataFieldType<F>,         \
-                                     __II ## __JJ ## __KK >{}, \
-                       No, Tijk, _t_buffer);
+  reorder<<<bs, ths>>>(reorder_proxy<                          \
+                         DataFieldType<F>,                     \
+                         __II ## __JJ ## __KK                  \
+                       >{},                                    \
+                       No,                                     \
+                       Tijk,                                   \
+                       _t_buffer);
 #define DGEMM_PARTICLES(__A, __B)                              \
   atrip::xgemm<F>("T",                                         \
                   "N",                                         \
@@ -462,7 +465,7 @@ double getEnergySame
                   (int const*)&Nv,                             \
                   &zero,                                       \
                   _t_buffer,                                   \
-                  (int const*)&NoNo);
+                  (int const*)&NoNo)
 #define DGEMM_HOLES(__A, __B, __TRANSB)                        \
   atrip::xgemm<F>("N",                                         \
                   __TRANSB,                                    \
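
The recurring ); to ) changes in this and the following hunks drop the trailing semicolon from the xgemm macros, so the call site supplies it and the macros behave like ordinary statements. A generic sketch, not atrip code, of why a macro-internal semicolon is a hazard:

    void step();
    void other();

    #define WITH_SEMI()    step();
    #define WITHOUT_SEMI() step()

    void demo(bool cond) {
      // WITHOUT_SEMI() reads like an ordinary call:
      if (cond)
        WITHOUT_SEMI();
      else
        other();
      // With WITH_SEMI() the same code expands to "step();;" -- the extra
      // empty statement closes the if-branch, so the "else" no longer has
      // an "if" to attach to and the code fails to compile.
    }
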
@@ -477,7 +480,7 @@ double getEnergySame
                   &zero,                                       \
                   _t_buffer,                                   \
                   (int const*)&NoNo                            \
-                  );
+                  )
 #define MAYBE_CONJ(_conj, _buffer)                             \
   cuda::maybeConjugate<<<                                      \
     Atrip::kernelDimensions.ooo.blocks,                        \
@@ -512,7 +515,7 @@ double getEnergySame
                   &zero,                                       \
                   _t_buffer,                                   \
                   (int const*)&NoNo                            \
-                  );
+                  )
 #define DGEMM_HOLES(__A, __B, __TRANSB)                        \
   atrip::xgemm<F>("N",                                         \
                   __TRANSB,                                    \
@@ -527,7 +530,7 @@ double getEnergySame
                   &zero,                                       \
                   _t_buffer,                                   \
                   (int const*)&NoNo                            \
-                  );
+                  )
 #define MAYBE_CONJ(_conj, _buffer)                             \
   for (size_t __i = 0; __i < NoNoNo; ++__i)                    \
     _conj[__i] = maybeConjugate<F>(_buffer[__i]);
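
For reference, the host branch of MAYBE_CONJ calls maybeConjugate<F> elementwise. The real helper is defined elsewhere in atrip; a plausible sketch inferred from the call site, an assumption rather than part of this diff:

    #include <complex>

    // Identity on real fields, complex conjugation on complex fields.
    template <typename F> F maybeConjugate(F x);

    template <> double maybeConjugate(double x) { return x; }

    template <>
    std::complex<double> maybeConjugate(std::complex<double> z) {
      return std::conj(z);
    }
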
@@ -538,13 +541,19 @@ double getEnergySame
 #ifdef HAVE_CUDA
   DataFieldType<F>* _t_buffer;
   DataFieldType<F>* _vhhh;
-  cuMemAlloc((CUdeviceptr*)&_t_buffer, NoNoNo * sizeof(DataFieldType<F>));
-  cuMemAlloc((CUdeviceptr*)&_vhhh, NoNoNo * sizeof(DataFieldType<F>));
+  WITH_CHRONO("double:cuda:alloc",
+    _CHECK_CUDA_SUCCESS("Allocating _t_buffer",
+                        cuMemAlloc((CUdeviceptr*)&_t_buffer,
+                                   NoNoNo * sizeof(DataFieldType<F>)));
+    _CHECK_CUDA_SUCCESS("Allocating _vhhh",
+                        cuMemAlloc((CUdeviceptr*)&_vhhh,
+                                   NoNoNo * sizeof(DataFieldType<F>)));
+  )
   const size_t
     bs = Atrip::kernelDimensions.ooo.blocks,
     ths = Atrip::kernelDimensions.ooo.threads;
-  cuda::zeroing<<<bs, ths>>>((DataFieldType<F>*)_t_buffer, NoNoNo);
-  cuda::zeroing<<<bs, ths>>>((DataFieldType<F>*)_vhhh, NoNoNo);
+  // cuda::zeroing<<<bs, ths>>>((DataFieldType<F>*)_t_buffer, NoNoNo);
+  // cuda::zeroing<<<bs, ths>>>((DataFieldType<F>*)_vhhh, NoNoNo);
 #else
   DataFieldType<F>* _t_buffer = (DataFieldType<F>*)malloc(NoNoNo * sizeof(F));
   DataFieldType<F>* _vhhh = (DataFieldType<F>*)malloc(NoNoNo * sizeof(F));
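
The allocations are now wrapped in _CHECK_CUDA_SUCCESS, and timed via WITH_CHRONO, instead of discarding the CUresult. Neither macro is shown in this diff; a minimal sketch of what a checking macro of this shape could look like, assumed from the usage above:

    #include <cstdio>
    #include <cstdlib>
    #include <cuda.h>

    // Hypothetical sketch: evaluate the driver-API call once and abort
    // with a readable message if it did not return CUDA_SUCCESS.
    #define _CHECK_CUDA_SUCCESS(message, call)                        \
      do {                                                            \
        CUresult _err = (call);                                       \
        if (_err != CUDA_SUCCESS) {                                   \
          std::fprintf(stderr, "CUDA error (%s): %d\n",               \
                       (message), (int)_err);                         \
          std::exit(EXIT_FAILURE);                                    \
        }                                                             \
      } while (0)
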
@@ -651,9 +660,12 @@ double getEnergySame
 #ifdef HAVE_CUDA
   // we need to synchronize here since we need
   // the Tijk for the next process in the pipeline
-  cuCtxSynchronize();
-  cuMemFree((CUdeviceptr)_vhhh);
-  cuMemFree((CUdeviceptr)_t_buffer);
+  _CHECK_CUDA_SUCCESS("Synchronizing",
+                      cuCtxSynchronize());
+  _CHECK_CUDA_SUCCESS("Freeing _vhhh",
+                      cuMemFree((CUdeviceptr)_vhhh));
+  _CHECK_CUDA_SUCCESS("Freeing _t_buffer",
+                      cuMemFree((CUdeviceptr)_t_buffer));
 #else
   free(_vhhh);
   free(_t_buffer);