Compare commits

..

11 Commits

Author SHA1 Message Date
Gallo Alejandro
20c29ed815 Add a couple of bench scripts for cuda and cublas testing 2022-09-12 19:53:50 +02:00
Gallo Alejandro
a5bb8f3d9c Give a better message when out of memory in allocating SliceUnion 2022-09-12 19:39:38 +02:00
Gallo Alejandro
5678ac0d9c Fix AniaBug#2: Lifecycle of SelfSufficient slices was wrong (comments)
This commit shows how the lifecycle of a slice goes.
At some point, a rank gets the list of slices that it needs
in the next iteration, and classifies them according
to the characteristics of every situation.

If, for instance, we are given a slice with
an abc tuple such that we find that this tuple
was assigned to our rank, then we know that
we have to create a SelfSufficient slice.

What we do is find a blank slice in our
SliceUnion's bucket of slices. This buffer is blank
and we are safe to do whatever we want with it.
Without CUDA, we just need to point this
blank slice to the correct memory address
of the data that we (the SliceUnion) own.
This is therefore the line

  blank.data = sources[from.source].data();

Of course, doing this with CUDA messes everything up,
as it did until now, since we would be pointing to a host
address. Sadly, with the way the casting is currently implemented,
the type checker did not catch that one and I foolishly
forgot about this important bit.

After the creation of the slice, at some point in the
life cycle comes its destruction, which we also
have to handle separately.
This is done every iteration in the

    void clearUnusedSlicesForNext(ABCTuple const& abc);

function. There, normally, the SelfSufficient slice
would just forget about the pointer it holds, slice.data,
since this pointer is part of the original data of the tensor
distributed in the SliceUnion. In the CUDA case, however,
we gave the SelfSufficient slice a freePointer from our
SliceUnion's bucket of freePointers allocated on the GPU
(of which we have around 6 per SliceUnion type).
This pointer needs to be marked as free again so that a
slice can use it in the future, so it has to go back to the bucket;
we can't afford to lose it.
2022-09-12 19:29:35 +02:00
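As an illustration of the lifecycle described in this message (this is not code from the commits: FreeBucket, take_free_pointer and give_back are invented names; only the idea of a bucket of pre-allocated device pointers, cuMemcpyHtoD, and the throw-on-exhaustion behaviour come from the changes below):

  #include <cuda.h>
  #include <set>
  #include <stdexcept>
  #include <vector>

  struct FreeBucket {
    std::set<CUdeviceptr> freePointers;   // ~6 pre-allocated device buffers

    CUdeviceptr take_free_pointer() {
      if (freePointers.empty())
        throw std::domain_error("No more free pointers");
      auto it = freePointers.begin();
      CUdeviceptr ptr = *it;
      freePointers.erase(it);
      return ptr;
    }

    void give_back(CUdeviceptr ptr) { freePointers.insert(ptr); }
  };

  // Creation: instead of pointing blank.data at host memory (the non-CUDA
  // path), copy the host source into a borrowed device buffer.
  CUdeviceptr make_self_sufficient(FreeBucket &bucket,
                                   const std::vector<double> &source) {
    CUdeviceptr data = bucket.take_free_pointer();
    cuMemcpyHtoD(data, source.data(), sizeof(double) * source.size());
    return data;
  }

  // Destruction (cf. clearUnusedSlicesForNext): the pointer is not freed,
  // it goes back to the bucket so a future slice can reuse it.
  void destroy_self_sufficient(FreeBucket &bucket, CUdeviceptr data) {
    bucket.give_back(data);
  }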
Gallo Alejandro
5483325626 Fix AniaBug #1: cublasCreate after context setting 2022-09-12 19:17:52 +02:00
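A minimal sketch of the ordering this fix enforces (the helper function is hypothetical; the call sequence mirrors the Atrip.cxx change below, where cublasCreate is moved to after the per-rank context setup):

  #include <cuda.h>
  #include <cublas_v2.h>

  cublasHandle_t init_cublas_for_rank(int rank, int ngcards) {
    CUdevice dev;
    CUcontext ctx;
    cuInit(0);
    cuDeviceGet(&dev, rank % ngcards);   // one card per rank, round robin
    cuCtxCreate(&ctx, 0, dev);
    cuCtxSetCurrent(ctx);
    cublasHandle_t handle;
    cublasCreate(&handle);               // AniaBug #1: only after the context is current
    return handle;
  }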
Gallo Alejandro
23ad87214f Uncomment everything in Equations, (to test see comments)
For testing, comment out everything
that has REORDER, MAYBE_CONJ and zeroing.
2022-09-12 19:13:45 +02:00
Gallo Alejandro
3d7702d501 Minimal changes in Equations 2022-09-12 19:10:14 +02:00
Gallo Alejandro
c20b9e3bcb Add error checking in Blas.cxx 2022-09-12 19:07:48 +02:00
Gallo Alejandro
da704ad820 Add convenience _FORMAT macro 2022-09-12 18:42:36 +02:00
Gallo Alejandro
f1b2f37fe2 Check CUresult for mpi_data to device 2022-09-12 18:41:37 +02:00
Gallo Alejandro
1cd7bac187 Add macros to check CUerror and cublasStatus_t 2022-09-12 18:36:01 +02:00
Gallo Alejandro
68892d5dd8 Make bootstrap work from anywhere in the project 2022-09-12 18:35:30 +02:00
13 changed files with 811 additions and 150 deletions

View File

@@ -1,3 +1,4 @@
AUTOMAKE_OPTIONS = subdir-objects
include $(top_srcdir)/atrip.mk
AM_CPPFLAGS = -I$(top_srcdir)/include/ $(CTF_CPPFLAGS)
@@ -19,4 +20,13 @@ endif
if WITH_CUDA
test_main_CXXFLAGS = $(CUDA_CXXFLAGS)
test_main_LDADD += $(CUDA_LDFLAGS)
AM_CXXFLAGS = $(CUDA_CXXFLAGS)
AM_LDFLAGS += $(CUDA_LDFLAGS)
bin_PROGRAMS += test-cublas-parallel-atrip
test_cublas_parallel_SOURCES = test-cublas-parallel-atrip.cxx
bin_PROGRAMS += test-cuda-sanity
test_cuda_sanity_SOURCES = test-cuda-sanity.cxx
endif

View File

@@ -0,0 +1,239 @@
#include <mpi.h>
#include <iostream>
#include <cassert>
#include <string.h>
#include <sstream>
#include <string>
#include <vector>
#include <cuda.h>
#include <chrono>
#include <map>
#include <cstdlib>
#include <unistd.h>
#include <CLI11.hpp>
#define CUBLASAPI
#include <cublas_api.h>
#include <cublas_v2.h>
struct Timer {
using Clock = std::chrono::high_resolution_clock;
using Event = std::chrono::time_point<Clock>;
std::chrono::duration<double> duration;
Event _start;
inline void start() noexcept { _start = Clock::now(); }
inline void stop() noexcept { duration += Clock::now() - _start; }
inline void clear() noexcept { duration *= 0; }
inline double count() const noexcept { return duration.count(); }
};
using Timings = std::map<std::string, Timer>;
#define _FORMAT(_fmt, ...) \
([&] (void) -> std::string { \
int _sz = std::snprintf(nullptr, 0, _fmt, __VA_ARGS__); \
std::vector<char> _out(_sz + 1); \
std::snprintf(&_out[0], _out.size(), _fmt, __VA_ARGS__); \
return std::string(_out.data()); \
})()
#define _CHECK_CUDA_SUCCESS(message, ...) \
do { \
CUresult result = __VA_ARGS__; \
printf("doing %s\n", message); \
if (result != CUDA_SUCCESS) { \
printf("\t!!CUDA_ERROR(%d): %s:%d %s\n", \
result, \
__FILE__, \
__LINE__, \
message); \
return 1; \
} \
} while (0)
#define _CHECK_CUBLAS_SUCCESS(message, ...) \
do { \
cublasStatus_t result = __VA_ARGS__; \
if (result != 0) { \
printf("\t!!CUBLAS_ERROR(%d): %s:%d %s\n", \
result, \
__FILE__, \
__LINE__, \
message); \
return 1; \
} \
} while (0)
int main(int argc, char** argv) {
using std::vector;
MPI_Init(NULL, NULL);
int rank, np, ngcards;
size_t no(10), nv(no * 10), its(2);
bool barrier = false;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &np);
CLI::App app{"Main bench for atrip"};
app.add_option("--no", no, "Occupied orbitals");
app.add_option("--nv", nv, "Virtual orbitals");
app.add_option("--its", its, "Number of iterations to be done");
app.add_option("--barrier", barrier, "Call a MPI_Barrier in every iteration?");
CLI11_PARSE(app, argc, argv);
const size_t oo = no * no, ooo = no * oo;
Timings timings;
MPI_Barrier(MPI_COMM_WORLD);
_CHECK_CUDA_SUCCESS("init for cuda",
cuInit(0));
_CHECK_CUDA_SUCCESS("get ncards",
cuDeviceGetCount(&ngcards));
CUcontext ctx;
CUdevice dev;
char hostname[256];
gethostname(hostname, 256);
printf("%s with rank %d gets card %d\n",
hostname,
rank,
rank % ngcards);
// set contexts
_CHECK_CUDA_SUCCESS("device get", cuDeviceGet(&dev, rank % ngcards));
_CHECK_CUDA_SUCCESS("creating context", cuCtxCreate(&ctx, 0, dev));
_CHECK_CUDA_SUCCESS("setting context", cuCtxSetCurrent(ctx));
_CHECK_CUDA_SUCCESS("synchronizing", cuCtxSynchronize());
MPI_Barrier(MPI_COMM_WORLD);
using host_slice_t = vector<double>;
vector<size_t> sizes = {nv * oo, nv * no , oo, oo * no, oo * no};
vector<CUdeviceptr> P_phh(3), P_ph(6) , H_hh(3), H_hhh(3), T_hhh(1);
vector<vector<CUdeviceptr>*> slices_d = {&P_phh, &P_ph , &H_hh, &H_hhh, &T_hhh};
vector<vector<host_slice_t>> slices_h(slices_d.size());
{
int i = -1;
for (auto& v: slices_d) {
i++;
for (auto& ptr: *v) {
_CHECK_CUDA_SUCCESS("malloc",
cuMemAlloc(&ptr,
sizes[i] * sizeof(double)));
slices_h[i].push_back(std::move(std::vector<double>(sizes[i])));
}
}
}
const double one = 1.0, zero = 0.0;
printf("its: %d\n", its);
printf("barrier: %d\n", barrier);
printf("no: %ld\n", no);
printf("nv: %ld\n", nv);
printf("SIZE %f GB\n", (3 * nv * oo
+ 6 * oo * nv
+ 3 * oo
+ 3 * ooo
+ 1 * ooo
) * sizeof(double) / 1024.0 / 1024.0 / 1024.0);
std::map<std::string, double> tflopss
{{ "dgemm", ooo * (no + nv) * 6.0 * 2.0 * its / 1e12},
{ "holes", ooo * no * 6.0 * 2.0 * its / 1e12},
{ "particles", ooo * nv * 6.0 * 2.0 * its / 1e12}};
cublasHandle_t handle;
_CHECK_CUBLAS_SUCCESS("handle create", cublasCreate(&handle));
printf("handle %ld\n", handle);
timings["dgemm"].start();
for (size_t i = 0; i < its; i++) {
if (barrier) {
MPI_Barrier(MPI_COMM_WORLD);
timings["memcpy"].start();
for (size_t _s = 0; _s < slices_d.size(); _s++) {
// for (size_t _b = 0; _b < slices_h[_s].size(); _b++) {
for (size_t _b = 0; _b < 1 ; _b++) {
auto device = (*slices_d[_s])[_b];
auto host = slices_h[_s][_b].data();
cuMemcpyHtoD(device, host, sizes[_s]);
}
}
timings["memcpy"].stop();
}
timings["holes"].start();
for (size_t j = 0; j < 3; j++) {
_CHECK_CUBLAS_SUCCESS(" > 'geming ...",
cublasDgemm(handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
oo, no, no,
&one,
(double*)H_hhh[j], oo,
(double*)H_hh[j], no,
&zero,
(double*)T_hhh[0], oo));
_CHECK_CUBLAS_SUCCESS(" > 'geming ...",
cublasDgemm(handle,
CUBLAS_OP_N,
CUBLAS_OP_T,
oo, no, no,
&one,
(double*)H_hhh[j], oo,
(double*)H_hh[j], no,
&zero,
(double*)T_hhh[0], oo));
}
timings["holes"].stop();
timings["particles"].start();
for (size_t j = 0; j < 6; j++) {
_CHECK_CUBLAS_SUCCESS(" > 'geming ...",
cublasDgemm(handle,
CUBLAS_OP_T,
CUBLAS_OP_N,
oo, no, nv,
&one,
(double*)P_phh[j % 3], nv,
(double*)P_ph[j], nv,
&zero,
(double*)T_hhh[0], oo));
}
timings["particles"].stop();
cuCtxSynchronize();
}
timings["dgemm"].stop();
printf("Performance: \n");
for (auto name: {"holes", "particles", "dgemm"})
printf("%10s TFlops: %4.1f\n",
name,
tflopss[name]
/ timings[name].count());
printf("Timings: \n");
for (auto const& kv: timings)
printf("%10s: %10f\n", kv.first.c_str(), kv.second.count());
MPI_Finalize();
return 0;
}

View File

@@ -0,0 +1,163 @@
#include <mpi.h>
#include <iostream>
#include <cassert>
#include <string.h>
#include <sstream>
#include <string>
#include <vector>
#include <cuda.h>
#include <chrono>
#include <map>
#include <cstdlib>
#include <unistd.h>
#define CUBLASAPI
#include <cublas_api.h>
#include <cublas_v2.h>
struct Timer {
using Clock = std::chrono::high_resolution_clock;
using Event = std::chrono::time_point<Clock>;
std::chrono::duration<double> duration;
Event _start;
inline void start() noexcept { _start = Clock::now(); }
inline void stop() noexcept { duration += Clock::now() - _start; }
inline void clear() noexcept { duration *= 0; }
inline double count() const noexcept { return duration.count(); }
};
using Timings = std::map<std::string, Timer>;
#define _FORMAT(_fmt, ...) \
([&] (void) -> std::string { \
int _sz = std::snprintf(nullptr, 0, _fmt, __VA_ARGS__); \
std::vector<char> _out(_sz + 1); \
std::snprintf(&_out[0], _out.size(), _fmt, __VA_ARGS__); \
return std::string(_out.data()); \
})()
#define _CHECK_CUDA_SUCCESS(message, ...) \
do { \
CUresult result = __VA_ARGS__; \
printf("doing %s\n", message); \
if (result != CUDA_SUCCESS) { \
printf("\t!!CUDA_ERROR(%d): %s:%d %s\n", \
result, \
__FILE__, \
__LINE__, \
message); \
return 1; \
} \
} while (0)
#define _CHECK_CUBLAS_SUCCESS(message, ...) \
do { \
cublasStatus_t result = __VA_ARGS__; \
printf("CUBLAS: doing %s\n", message); \
if (result != 0) { \
printf("\t!!CUBLAS_ERROR(%d): %s:%d %s\n", \
result, \
__FILE__, \
__LINE__, \
message); \
return 1; \
} \
} while (0)
int main(int argc, char** argv) {
MPI_Init(NULL, NULL);
int rank, np, ngcards;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &np);
Timings timings;
MPI_Barrier(MPI_COMM_WORLD);
_CHECK_CUDA_SUCCESS("init for cuda",
cuInit(0));
_CHECK_CUDA_SUCCESS("get ncards",
cuDeviceGetCount(&ngcards));
const size_t N
= argc > 1
? atoi(argv[1])
: 30000
, dgemms
= argc > 2
? atoi(argv[2])
: 2
, flops = 2 * N * N * N * dgemms
;
CUcontext ctx;
CUdevice dev;
char hostname[256];
gethostname(hostname, 256);
printf("%s with rank %d gets card %d\n",
hostname,
rank,
rank % ngcards);
// set contexts
_CHECK_CUDA_SUCCESS("device get",
cuDeviceGet(&dev, rank % ngcards));
_CHECK_CUDA_SUCCESS("creating context",
cuCtxCreate(&ctx, 0, dev));
_CHECK_CUDA_SUCCESS("setting context",
cuCtxSetCurrent(ctx));
_CHECK_CUDA_SUCCESS("synchronizing",
cuCtxSynchronize());
MPI_Barrier(MPI_COMM_WORLD);
CUdeviceptr A, B, C;
const double one = 1.0;
printf("SIZE %f GB\n", N * N * sizeof(double) / 1024.0 / 1024.0 / 1024.0);
_CHECK_CUDA_SUCCESS("A", cuMemAlloc(&A, N * N * sizeof(double)));
_CHECK_CUDA_SUCCESS("B", cuMemAlloc(&B, N * N * sizeof(double)));
_CHECK_CUDA_SUCCESS("C", cuMemAlloc(&C, N * N * sizeof(double)));
cublasHandle_t handle;
cublasStatus_t stat;
_CHECK_CUBLAS_SUCCESS("handle create", cublasCreate(&handle));
printf("handle %ld\n", handle);
timings["dgemm"].start();
for (size_t i = 0; i < dgemms; i++) {
_CHECK_CUBLAS_SUCCESS(_FORMAT(" > 'geming %ld ...", i).c_str(),
cublasDgemm(handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
N, N, N,
&one,
(double*)A, N,
(double*)B, N,
&one,
(double*)C, N));
}
cuCtxSynchronize();
timings["dgemm"].stop();
printf("dgemm Gflops: %f\n",
flops
/ timings["dgemm"].count()
/ 1024.0
/ 1024.0
/ 1024.0);
MPI_Finalize();
return 0;
}
// Local Variables:
// compile-command: "mpic++ \
// -pedantic -std=c++11 \
// -L./cudaroot/lib64 -lcuda \
// -L./cudaroot/lib64 -lcudart \
// -L./cudaroot/lib64 -lcublas \
// ./test-cublas-parallel.cxx -o test-cublas-parallel"
// End:

bench/test-cuda-sanity.cxx (new file, 104 lines)
View File

@@ -0,0 +1,104 @@
#include <mpi.h>
#include <iostream>
#include <cassert>
#include <string.h>
#include <sstream>
#include <string>
#include <vector>
#include <cuda.h>
#define _CHECK_CUDA_SUCCESS(message, ...) \
do { \
CUresult result = __VA_ARGS__; \
printf("doing %s\n", message); \
if (result != CUDA_SUCCESS) { \
printf("\t!!CUDA_ERROR(%d): %s:%d %s\n", \
result, \
__FILE__, \
__LINE__, \
message); \
return 1; \
} \
} while (0)
int main() {
int rank, np, ngcards;
MPI_Init(NULL, NULL);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &np);
_CHECK_CUDA_SUCCESS("init for cuda",
cuInit(0));
_CHECK_CUDA_SUCCESS("get ncards",
cuDeviceGetCount(&ngcards));
for (size_t rank = 0; rank < ngcards; rank++) {
CUcontext ctx;
CUdevice dev;
CUdevprop_st prop;
size_t _free, total, total2;
char *name = (char*)malloc(256);
printf("Setting contexts\n");
// set contexts
_CHECK_CUDA_SUCCESS("device get",
cuDeviceGet(&dev, rank));
_CHECK_CUDA_SUCCESS("creating context",
cuCtxCreate(&ctx, 0, dev));
_CHECK_CUDA_SUCCESS("setting context",
cuCtxSetCurrent(ctx));
_CHECK_CUDA_SUCCESS("synchronizing",
cuCtxSynchronize());
_CHECK_CUDA_SUCCESS("prop get",
cuDeviceGetProperties(&prop, dev));
_CHECK_CUDA_SUCCESS("meminfo get",
cuMemGetInfo(&_free, &total));
_CHECK_CUDA_SUCCESS("name get",
cuDeviceGetName(name, 256, dev));
_CHECK_CUDA_SUCCESS("totalmem get",
cuDeviceTotalMem(&total2, dev));
printf("\n"
"CUDA CARD RANK %d\n"
"=================\n"
"\tname: %s\n"
"\tShared Mem Per Block (KB): %f\n"
"\tFree/Total mem (GB): %f/%f\n"
"\total2 mem (GB): %f\n"
"\n",
dev,
name,
prop.sharedMemPerBlock / 1024.0,
_free / 1024.0 / 1024.0 / 1024.0 ,
total / 1024.0 / 1024.0 / 1024.0 ,
total2 / 1024.0 / 1024.0 / 1024.0
);
if (_free == 0 || total == 0 || total2 == 0)
return 1;
CUdeviceptr data;
_CHECK_CUDA_SUCCESS("memalloc 1",
cuMemAlloc(&data, sizeof(double) * 10000));
_CHECK_CUDA_SUCCESS("memalloc 2",
cuMemAlloc(&data, sizeof(double) * 10000));
}
MPI_Finalize();
return 0;
}
// Local Variables:
// compile-command: "mpic++ \
// -pedantic -std=c++11 \
// -L./cudaroot/lib64 -lcuda \
// -L./cudaroot/lib64 -lcudart \
// -L./cudaroot/lib64 -lcublas \
// ./mem.cxx -o mem"
// End:

View File

@@ -142,8 +142,10 @@ int main(int argc, char** argv) {
    + /* tai */ f * nranks * no * nv
    ;
- if (atrip::Atrip::rank == 0)
-   std::cout << "Tentative MEMORY USAGE: " << atrip_memory << "\n";
  if (rank == 0) {
    std::cout << "Tentative MEMORY USAGE (GB): "
              << double(atrip_memory) / 1024.0 / 1024.0 / 1024.0 << "\n";
  }
  std::vector<int> symmetries(4, NS)

View File

@@ -1,5 +1,7 @@
#!/usr/bin/env bash
cd `git rev-parse --show-toplevel`
type -a autoreconf > /dev/null ||
{
cat <<EOF && exit

View File

@@ -1,5 +1,13 @@
#pragma once
#include <atrip/Utils.hpp>
#if defined(HAVE_CUDA)
#include <cuda.h>
#endif
#include <sstream>
#if defined(HAVE_CUDA) && defined(__CUDACC__)
# define __MAYBE_GLOBAL__ __global__
# define __MAYBE_DEVICE__ __device__
@@ -7,3 +15,31 @@
# define __MAYBE_GLOBAL__
# define __MAYBE_DEVICE__
#endif
#define _CHECK_CUDA_SUCCESS(message, ...) \
do { \
CUresult result = __VA_ARGS__; \
if (result != CUDA_SUCCESS) { \
auto msg = _FORMAT("\t!!CUDA_ERROR(%d): %s:%d\v%s", \
result, \
__FILE__, \
__LINE__, \
message); \
std::cerr << msg; \
throw msg; \
} \
} while (0)
#define _CHECK_CUBLAS_SUCCESS(message, ...) \
do { \
cublasStatus_t result = __VA_ARGS__; \
if (result != 0) { \
auto msg = _FORMAT("\t!!CUBLAS_ERROR(%d): %s:%d\v%s", \
result, \
__FILE__, \
__LINE__, \
message); \
std::cerr << msg; \
throw msg; \
} \
} while (0)
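Illustrative call sites for the two macros just added (the functions and sizes are hypothetical; the macros, their throw-a-formatted-std::string behaviour, and the HAVE_CUDA build assumption come from the diff above):

  #include <atrip/CUDA.hpp>
  #include <cublas_v2.h>

  static CUdeviceptr allocate_scratch(size_t n_doubles) {
    CUdeviceptr buffer;
    // Throws a formatted message on any CUresult != CUDA_SUCCESS.
    _CHECK_CUDA_SUCCESS("allocating the scratch buffer",
                        cuMemAlloc(&buffer, n_doubles * sizeof(double)));
    return buffer;
  }

  static cublasHandle_t make_handle() {
    cublasHandle_t handle;
    // Same pattern for cuBLAS return codes.
    _CHECK_CUBLAS_SUCCESS("creating a cublas handle",
                          cublasCreate(&handle));
    return handle;
  }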

View File

@@ -21,6 +21,7 @@
#include <atrip/Tuples.hpp>
#include <atrip/Utils.hpp>
#include <atrip/CUDA.hpp>
namespace atrip {
@@ -458,7 +459,11 @@ void unwrapAndMarkReady() {
#if defined(HAVE_CUDA)
      // copy the retrieved mpi data to the device
      WITH_CHRONO("cuda:memcpy",
-                 cuMemcpyHtoD(data, (void*)mpi_data, sizeof(F) * size);)
                  _CHECK_CUDA_SUCCESS("copying mpi data to device",
                                      cuMemcpyHtoD(data,
                                                   (void*)mpi_data,
                                                   sizeof(F) * size));
                  )
      std::free(mpi_data);
#endif

View File

@@ -195,7 +195,28 @@ template <typename F=double>
        ;
      if (blank.info.state == Slice<F>::SelfSufficient) {
#if defined(HAVE_CUDA)
-       blank.mpi_data = sources[from.source].data();
        const size_t _size = sizeof(F) * sources[from.source].size();
// TODO: this is code duplication with downstairs
if (freePointers.size() == 0) {
std::stringstream stream;
stream << "No more free pointers "
<< "for type " << type
<< " and name " << name
;
throw std::domain_error(stream.str());
}
auto dataPointer = freePointers.begin();
freePointers.erase(dataPointer);
blank.data = *dataPointer;
//
//
// TODO [#A]: do cuMemcpy of
// sources[from.source].data() ⇒ blank.data
// Do this when everything else is working.
// This will probably be a bottleneck of the H-to-D communication,
// as most slices are SelfSufficient.
//
//
#else
        blank.data = sources[from.source].data();
#endif
@@ -316,6 +337,16 @@ template <typename F=double>
      }
    }
#if defined(HAVE_CUDA)
// In cuda, SelfSufficient slices have an ad-hoc pointer
// since it is a pointer on the device and has to be
// brought back to the free pointer bucket of the SliceUnion.
// Therefore, only in the Recycled case it should not be
// put back the pointer.
if (slice.info.state == Slice<F>::Recycled) {
freeSlicePointer = false;
}
#else
    // if the slice is self sufficient, do not dare touching the
    // pointer, since it is a pointer to our sources in our rank.
    if ( slice.info.state == Slice<F>::SelfSufficient
@@ -323,6 +354,7 @@ template <typename F=double>
       ) {
      freeSlicePointer = false;
    }
#endif
    // make sure we get its data pointer to be used later
    // only for non-recycled, since it can be that we need
@@ -346,7 +378,7 @@ template <typename F=double>
        // << " info " << slice.info
        << "\n";
      slice.free();
    } // we did not find the slice
  }
}
@@ -380,10 +412,17 @@ template <typename F=double>
  for (auto& ptr: sliceBuffers) {
#if defined(HAVE_CUDA)
const CUresult error =
      cuMemAlloc(&ptr, sizeof(F) * sources[0].size());
    if (ptr == 0UL) {
      throw "UNSUFICCIENT MEMORY ON THE GRAPHIC CARD FOR FREE POINTERS";
    }
if (error != CUDA_SUCCESS) {
std::stringstream s;
s << "Error allocating memory for slice buffers "
<< "code " << error << "\n";
throw s.str();
}
#else
    ptr = (DataPtr<F>)malloc(sizeof(F) * sources[0].size());
#endif

View File

@@ -80,6 +80,16 @@ struct Timer {
using Timings = std::map<std::string, Timer>;
// Chrono:1 ends here
// A nice handy macro to do formatting
#define _FORMAT(_fmt, ...) \
([&] (void) -> std::string { \
int _sz = std::snprintf(nullptr, 0, _fmt, __VA_ARGS__); \
std::vector<char> _out(_sz + 1); \
std::snprintf(&_out[0], _out.size(), _fmt, __VA_ARGS__); \
return std::string(_out.data()); \
})()
// [[file:~/cuda/atrip/atrip.org::*Epilog][Epilog:1]]
}
// Epilog:1 ends here
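A small usage sketch of the _FORMAT macro added above (the function and message are invented for the example; _FORMAT itself returns a std::string as defined in the diff):

  #include <atrip/Utils.hpp>
  #include <iostream>
  #include <string>

  void report_card_assignment(int rank, int ngcards) {
    const std::string msg
      = _FORMAT("rank %d will use card %d\n", rank, rank % ngcards);
    std::cerr << msg;
  }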

View File

@@ -24,7 +24,7 @@
using namespace atrip;
#if defined(HAVE_CUDA)
-#include <cuda.h>
#include <atrip/CUDA.hpp>
#endif
template <typename F> bool RankMap<F>::RANK_ROUND_ROBIN;
@@ -49,11 +49,6 @@ void Atrip::init(MPI_Comm world) {
  Atrip::communicator = world;
  MPI_Comm_rank(world, (int*)&Atrip::rank);
  MPI_Comm_size(world, (int*)&Atrip::np);
-#if defined(HAVE_CUDA)
-  Atrip::cuda.status = cublasCreate(&Atrip::cuda.handle);
-#endif
}
template <typename F>
@@ -71,18 +66,24 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
#if defined(HAVE_CUDA)
  int ngcards;
  _CHECK_CUDA_SUCCESS("initializing cuda",
                      cuInit(0));
  _CHECK_CUDA_SUCCESS("getting device count",
                      cuDeviceGetCount(&ngcards));
  const auto clusterInfo = getClusterInfo(Atrip::communicator);
- cuDeviceGetCount(&ngcards);
  LOG(0,"Atrip") << "ngcards: " << ngcards << "\n";
  if (clusterInfo.ranksPerNode > ngcards) {
-   std::cerr << "ATRIP: You are running on more ranks per node than the number of graphic cards\n"
-             << "You have " << ngcards << " cards at your disposal\n";
-   throw "";
- }
- if (clusterInfo.ranksPerNode < ngcards) {
-   std::cerr << "You have " << ngcards << " cards at your disposal\n"
-             << "You will be only using " << clusterInfo.ranksPerNode
-             << ", i.e., the nubmer of ranks.\n";
    const auto msg
      = _FORMAT("ATRIP: You are running on more ranks per node than the number of graphic cards\n"
                "You have %d cards at your disposal\n", ngcards);
    std::cerr << msg;
    throw msg;
  } else if (clusterInfo.ranksPerNode < ngcards) {
    const auto msg
      = _FORMAT("You have %d cards at your disposal.\n"
                "You will be only using %d, i.e, the number of ranks\n",
                ngcards, clusterInfo.ranksPerNode);
    std::cerr << msg;
  }
@@ -94,16 +95,27 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
    struct { struct { size_t free, total; } avail; size_t total; } memory;
    char *name = (char*)malloc(256);
    // - TODO :: we should check that the Zuweisung of graphic cards
    // to nodes works as expected, i.e., node k should get from 0
    // to ngcards with the formula =rank % ngcards=.
    // set current device
-   cuDeviceGet(&dev, rank);
-   cuCtxCreate(&ctx, 0, dev);
-   cuCtxSetCurrent(ctx);
    _CHECK_CUDA_SUCCESS("getting device for index <rank>",
                        cuDeviceGet(&dev, rank % ngcards));
    _CHECK_CUDA_SUCCESS("creating a cuda context",
                        cuCtxCreate(&ctx, 0, dev));
    _CHECK_CUDA_SUCCESS("setting the context",
                        cuCtxSetCurrent(ctx));
    // get information of the device
-   cuDeviceGetProperties(&prop, dev);
-   cuMemGetInfo(&memory.avail.free, &memory.avail.total);
-   cuDeviceGetName(name, 256, dev);
-   cuDeviceTotalMem(&memory.total, dev);
    _CHECK_CUDA_SUCCESS("getting properties of current device",
                        cuDeviceGetProperties(&prop, dev));
    _CHECK_CUDA_SUCCESS("getting memory information",
                        cuMemGetInfo(&memory.avail.free, &memory.avail.total));
    _CHECK_CUDA_SUCCESS("getting name",
                        cuDeviceGetName(name, 256, dev));
    _CHECK_CUDA_SUCCESS("getting total memory",
                        cuDeviceTotalMem(&memory.total, dev));
    printf("\n"
           "CUDA CARD RANK %d\n"
@@ -124,6 +136,10 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
           memory.total / 1024.0 / 1024.0 / 1024.0
           );
    std::free((void*)name);
    _CHECK_CUBLAS_SUCCESS("creating a cublas handle",
                          cublasCreate(&Atrip::cuda.handle));
  }
  MPI_Barrier(universe);
}
@@ -163,17 +179,27 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
#if defined(HAVE_CUDA)
  DataPtr<F> Tai, epsi, epsa;
- //TODO: free memory pointers in the end of the algorithm
- cuMemAlloc(&Tai, sizeof(F) * _Tai.size());
- cuMemAlloc(&epsi, sizeof(F) * _epsi.size());
- cuMemAlloc(&epsa, sizeof(F) * _epsa.size());
- cuMemcpyHtoD(Tai, (void*)_Tai.data(), sizeof(F) * _Tai.size());
- cuMemcpyHtoD(epsi,(void*)_epsi.data(), sizeof(F) * _epsi.size());
- cuMemcpyHtoD(epsa, (void*)_epsa.data(), sizeof(F) * _epsa.size());
- cuMemAlloc(&Tijk, sizeof(F) * No * No * No);
- cuMemAlloc(&Zijk, sizeof(F) * No * No * No);
  // TODO: free memory pointers in the end of the algorithm
  _CHECK_CUDA_SUCCESS("Tai",
                      cuMemAlloc(&Tai, sizeof(F) * _Tai.size()));
  _CHECK_CUDA_SUCCESS("epsi",
                      cuMemAlloc(&epsi, sizeof(F) * _epsi.size()));
  _CHECK_CUDA_SUCCESS("epsa",
                      cuMemAlloc(&epsa, sizeof(F) * _epsa.size()));
  _CHECK_CUDA_SUCCESS("memcpy Tai",
                      cuMemcpyHtoD(Tai, (void*)_Tai.data(), sizeof(F) * _Tai.size()));
  _CHECK_CUDA_SUCCESS("memcpy epsi",
                      cuMemcpyHtoD(epsi,(void*)_epsi.data(), sizeof(F) * _epsi.size()));
  _CHECK_CUDA_SUCCESS("memcpy epsa",
                      cuMemcpyHtoD(epsa, (void*)_epsa.data(), sizeof(F) * _epsa.size()));
  _CHECK_CUDA_SUCCESS("Tijk",
                      cuMemAlloc(&Tijk, sizeof(F) * No * No * No));
  _CHECK_CUDA_SUCCESS("Zijk",
                      cuMemAlloc(&Zijk, sizeof(F) * No * No * No));
#else
  std::vector<F> &Tai = _Tai, &epsi = _epsi, &epsa = _epsa;
  Zijk = (DataFieldType<F>*)malloc(No*No*No * sizeof(DataFieldType<F>));

View File

@@ -15,19 +15,22 @@
// [[file:~/cuda/atrip/atrip.org::*Blas][Blas:2]]
#include <atrip/Blas.hpp>
#include <atrip/Atrip.hpp>
#include <atrip/CUDA.hpp>
#if defined(HAVE_CUDA)
#  include <cstring>
  static size_t dgem_call = 0;
  static inline
  cublasOperation_t char_to_cublasOperation(const char* trans) {
    if (strncmp("N", trans, 1) == 0)
      return CUBLAS_OP_N;
    else if (strncmp("T", trans, 1) == 0)
      return CUBLAS_OP_T;
    else
      return CUBLAS_OP_C;
  }
#endif
@@ -49,6 +52,8 @@ namespace atrip {
              typename DataField<double>::type *C,
              const int *ldc) {
#if defined(HAVE_CUDA)
    // TODO: remove this verbose checking
    const cublasStatus_t error =
      cublasDgemm(Atrip::cuda.handle,
                  char_to_cublasOperation(transa),
                  char_to_cublasOperation(transb),
@@ -56,6 +61,14 @@ namespace atrip {
                  alpha, A, *lda,
                  B, *ldb, beta,
                  C, *ldc);
    if (error != 0) printf(":%-3ld (%4ldth) ERR<%4d> cublasDgemm: "
                           "A = %20ld "
                           "B = %20ld "
                           "C = %20ld "
                           "\n",
                           Atrip::rank,
                           dgem_call++,
                           error, A, B, C);
#else
    dgemm_(transa, transb,
           m, n, k,
@@ -83,16 +96,17 @@ namespace atrip {
    cuDoubleComplex
      cu_alpha = {std::real(*alpha), std::imag(*alpha)},
      cu_beta = {std::real(*beta), std::imag(*beta)};
    _CHECK_CUBLAS_SUCCESS("cublasZgemm",
                          cublasZgemm(Atrip::cuda.handle,
                                      char_to_cublasOperation(transa),
                                      char_to_cublasOperation(transb),
                                      *m, *n, *k,
                                      &cu_alpha,
                                      A, *lda,
                                      B, *ldb,
                                      &cu_beta,
-                                     C, *ldc);
                                      C, *ldc));
#else
    zgemm_(transa, transb,
           m, n, k,

View File

@@ -14,31 +14,13 @@
// [[file:~/cuda/atrip/atrip.org::*Prolog][Prolog:2]]
#include<atrip/Equations.hpp>
#include<atrip/CUDA.hpp>
-#if defined(HAVE_CUDA)
-#include <cuda.h>
-#endif
namespace atrip {
// Prolog:2 ends here
-// These are just help structures
-// to help with the templating of reorder
-// function
-enum reordering_t
-  {
-   IJK,
-   IKJ,
-   JIK,
-   JKI,
-   KIJ,
-   KJI
-  };
-template <typename F, reordering_t R>
-struct reorder_proxy {};
#ifdef HAVE_CUDA
namespace cuda {
@@ -108,42 +90,35 @@
template <typename F>
__device__
-void sum_in_place(F* to, const F* b);
void sum_in_place(F* to, const F* from);
template <>
__device__
-void sum_in_place(double* to, const double *b) { *to += *b; }
void sum_in_place(double* to, const double *from) { *to += *from; }
template <>
__device__
-void sum_in_place(cuDoubleComplex* to, const cuDoubleComplex* b) {
-  to->x += b->x;
-  to->y += b->y;
void sum_in_place(cuDoubleComplex* to, const cuDoubleComplex* from) {
  to->x += from->x;
  to->y += from->y;
}
-__device__
-cuDoubleComplex& operator+=(cuDoubleComplex& lz, cuDoubleComplex const& rz) {
-  lz.x += rz.x;
-  lz.y += rz.y;
-  return lz;
-}
};
#endif
#if defined(HAVE_CUDA)
-#define LIMS_KS() \
-  size_t kmin = blockIdx.x * blockDim.x + threadIdx.x, \
-         k = kmin, \
-         idx = kmin * size * size * size \
-         ; \
-  k < (kmin < size) ? kmin + 1 : size
#define FOR_K() \
  for (size_t kmin = blockIdx.x * blockDim.x + threadIdx.x, \
       k = kmin, \
       idx = kmin * size * size * size; \
       k < (kmin < size) ? kmin + 1 : size; \
       k++)
#else
-#define LIMS_KS size_t k=0, idx=0; k < size
#define FOR_K() for (size_t k=0, idx=0; k < size; k++)
#endif
#define _IJK_(i, j, k) i + j*size + k*size*size
#define _REORDER_BODY_(...) \
-  for (LIMS_KS() ; k++) \
  FOR_K() \
    for (size_t j = 0; j < size; j++) \
      for (size_t i = 0; i < size; i++, idx++) { \
        __VA_ARGS__ \
@@ -152,7 +127,9 @@
  template <typename F> \
  __MAYBE_GLOBAL__ \
  void reorder(reorder_proxy< F, _enum > p, \
-              size_t size, F* to, F* from) { \
               size_t size, \
               F* to, \
               F* from) { \
    _REORDER_BODY_(__VA_ARGS__) \
  }
#if defined(HAVE_CUDA)
@@ -161,9 +138,31 @@
#define GO(__TO, __FROM) __TO += __FROM;
#endif
// These are just help structures
// to help with the templating of reorder
// function
enum reordering_t
{
IJK,
IKJ,
JIK,
JKI,
KIJ,
KJI
};
/*
* Please the c++ type checker and template creator
* in order to have an argument in the signature of
* the function that helps the compiler know which
* instantiation it should take.
*
*/
template <typename F, reordering_t R>
struct reorder_proxy {};
template <typename F, reordering_t R>
-__MAYBE_GLOBAL__ \
__MAYBE_GLOBAL__
void reorder(reorder_proxy<F, R> proxy,
             size_t size, F* to, F* from);
@@ -446,9 +445,13 @@ double getEnergySame
#if defined(ATRIP_USE_DGEMM)
#if defined(HAVE_CUDA)
#define REORDER(__II, __JJ, __KK) \
-  reorder<<<bs, ths>>>(reorder_proxy<DataFieldType<F>, \
-                       __II ## __JJ ## __KK >{}, \
-                       No, Tijk, _t_buffer);
  reorder<<<bs, ths>>>(reorder_proxy< \
                         DataFieldType<F>, \
                         __II ## __JJ ## __KK \
                       >{}, \
                       No, \
                       Tijk, \
                       _t_buffer);
#define DGEMM_PARTICLES(__A, __B) \
  atrip::xgemm<F>("T", \
                  "N", \
@@ -462,7 +465,7 @@ double getEnergySame
                  (int const*)&Nv, \
                  &zero, \
                  _t_buffer, \
-                 (int const*)&NoNo);
                  (int const*)&NoNo)
#define DGEMM_HOLES(__A, __B, __TRANSB) \
  atrip::xgemm<F>("N", \
                  __TRANSB, \
@@ -477,7 +480,7 @@ double getEnergySame
                  &zero, \
                  _t_buffer, \
                  (int const*)&NoNo \
-                 );
                  )
#define MAYBE_CONJ(_conj, _buffer) \
  cuda::maybeConjugate<<< \
    Atrip::kernelDimensions.ooo.blocks, \
@@ -512,7 +515,7 @@ double getEnergySame
                  &zero, \
                  _t_buffer, \
                  (int const*)&NoNo \
-                 );
                  )
#define DGEMM_HOLES(__A, __B, __TRANSB) \
  atrip::xgemm<F>("N", \
                  __TRANSB, \
@@ -527,7 +530,7 @@ double getEnergySame
                  &zero, \
                  _t_buffer, \
                  (int const*)&NoNo \
-                 );
                  )
#define MAYBE_CONJ(_conj, _buffer) \
  for (size_t __i = 0; __i < NoNoNo; ++__i) \
    _conj[__i] = maybeConjugate<F>(_buffer[__i]);
@@ -538,8 +541,14 @@ double getEnergySame
#ifdef HAVE_CUDA
  DataFieldType<F>* _t_buffer;
  DataFieldType<F>* _vhhh;
- cuMemAlloc((CUdeviceptr*)&_t_buffer, NoNoNo * sizeof(DataFieldType<F>));
- cuMemAlloc((CUdeviceptr*)&_vhhh, NoNoNo * sizeof(DataFieldType<F>));
  WITH_CHRONO("double:cuda:alloc",
              _CHECK_CUDA_SUCCESS("Allocating _t_buffer",
                                  cuMemAlloc((CUdeviceptr*)&_t_buffer,
                                             NoNoNo * sizeof(DataFieldType<F>)));
              _CHECK_CUDA_SUCCESS("Allocating _vhhh",
                                  cuMemAlloc((CUdeviceptr*)&_vhhh,
                                             NoNoNo * sizeof(DataFieldType<F>)));
              )
  const size_t
    bs = Atrip::kernelDimensions.ooo.blocks,
    ths = Atrip::kernelDimensions.ooo.threads;
@@ -560,7 +569,6 @@ double getEnergySame
  WITH_CHRONO("double:reorder",
              cuda::zeroing<<<bs, ths>>>((DataFieldType<F>*)Tijk,
                                         NoNoNo);
-             // synchronize all initializations to zero
              )
#else
  WITH_CHRONO("double:reorder",
@@ -575,36 +583,36 @@ double getEnergySame
    // VhhhC[i + k*No + L*NoNo] * TABhh[L + j*No]; H1
    MAYBE_CONJ(_vhhh, VhhhC)
    WITH_CHRONO("doubles:holes:1",
-               DGEMM_HOLES(_vhhh, TABhh, "N")
                DGEMM_HOLES(_vhhh, TABhh, "N");
                REORDER(I, K, J)
                )
    // VhhhC[j + k*No + L*NoNo] * TABhh[i + L*No]; H0
    WITH_CHRONO("doubles:holes:2",
-               DGEMM_HOLES(_vhhh, TABhh, "T")
                DGEMM_HOLES(_vhhh, TABhh, "T");
                REORDER(J, K, I)
                )
    // VhhhB[i + j*No + L*NoNo] * TAChh[L + k*No]; H5
    MAYBE_CONJ(_vhhh, VhhhB)
    WITH_CHRONO("doubles:holes:3",
-               DGEMM_HOLES(_vhhh, TAChh, "N")
                DGEMM_HOLES(_vhhh, TAChh, "N");
                REORDER(I, J, K)
                )
    // VhhhB[k + j*No + L*NoNo] * TAChh[i + L*No]; H3
    WITH_CHRONO("doubles:holes:4",
-               DGEMM_HOLES(_vhhh, TAChh, "T")
                DGEMM_HOLES(_vhhh, TAChh, "T");
                REORDER(K, J, I)
                )
    // VhhhA[j + i*No + L*NoNo] * TBChh[L + k*No]; H1
    MAYBE_CONJ(_vhhh, VhhhA)
    WITH_CHRONO("doubles:holes:5",
-               DGEMM_HOLES(_vhhh, TBChh, "N")
                DGEMM_HOLES(_vhhh, TBChh, "N");
                REORDER(J, I, K)
                )
    // VhhhA[k + i*No + L*NoNo] * TBChh[j + L*No]; H4
    WITH_CHRONO("doubles:holes:6",
-               DGEMM_HOLES(_vhhh, TBChh, "T")
                DGEMM_HOLES(_vhhh, TBChh, "T");
                REORDER(K, I, J)
                )
  }
@@ -616,32 +624,32 @@ double getEnergySame
  {
    // TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv]; P0
    WITH_CHRONO("doubles:particles:1",
-               DGEMM_PARTICLES(TAphh, VBCph)
                DGEMM_PARTICLES(TAphh, VBCph);
                REORDER(I, J, K)
                )
    // TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv]; P3
    WITH_CHRONO("doubles:particles:2",
-               DGEMM_PARTICLES(TAphh, VCBph)
                DGEMM_PARTICLES(TAphh, VCBph);
                REORDER(I, K, J)
                )
    // TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv]; P5
    WITH_CHRONO("doubles:particles:3",
-               DGEMM_PARTICLES(TCphh, VABph)
                DGEMM_PARTICLES(TCphh, VABph);
                REORDER(K, I, J)
                )
    // TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv]; P2
    WITH_CHRONO("doubles:particles:4",
-               DGEMM_PARTICLES(TCphh, VBAph)
                DGEMM_PARTICLES(TCphh, VBAph);
                REORDER(K, J, I)
                )
    // TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv]; P1
    WITH_CHRONO("doubles:particles:5",
-               DGEMM_PARTICLES(TBphh, VACph)
                DGEMM_PARTICLES(TBphh, VACph);
                REORDER(J, I, K)
                )
    // TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv]; P4
    WITH_CHRONO("doubles:particles:6",
-               DGEMM_PARTICLES(TBphh, VCAph)
                DGEMM_PARTICLES(TBphh, VCAph);
                REORDER(J, K, I)
                )
  }
@@ -651,9 +659,12 @@ double getEnergySame
#ifdef HAVE_CUDA
  // we need to synchronize here since we need
  // the Tijk for next process in the pipeline
- cuCtxSynchronize();
- cuMemFree((CUdeviceptr)_vhhh);
- cuMemFree((CUdeviceptr)_t_buffer);
  _CHECK_CUDA_SUCCESS("Synchronizing",
                      cuCtxSynchronize());
  _CHECK_CUDA_SUCCESS("Freeing _vhhh",
                      cuMemFree((CUdeviceptr)_vhhh));
  _CHECK_CUDA_SUCCESS("Freeing _t_buffer",
                      cuMemFree((CUdeviceptr)_t_buffer));
#else
  free(_vhhh);
  free(_t_buffer);
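As a side note on the reorder_proxy comment above ("Please the c++ type checker and template creator..."), a generic, self-contained sketch of the same tag-dispatch idea; none of these names are Atrip's, the pattern is the only thing taken from the diff:

  #include <cstdio>

  enum ordering_t { ROW_MAJOR, COL_MAJOR };

  // Empty tag type: its only job is to carry the compile-time option into
  // the function signature so the compiler can pick the right instantiation.
  template <ordering_t O>
  struct ordering_proxy {};

  template <ordering_t O>
  void fill(ordering_proxy<O>, int n, int* out);

  template <>
  void fill(ordering_proxy<ROW_MAJOR>, int n, int* out) {
    for (int i = 0; i < n * n; ++i) out[i] = i;
  }

  template <>
  void fill(ordering_proxy<COL_MAJOR>, int n, int* out) {
    for (int j = 0; j < n; ++j)
      for (int i = 0; i < n; ++i) out[i * n + j] = i + j * n;
  }

  int main() {
    int a[4], b[4];
    fill(ordering_proxy<ROW_MAJOR>{}, 2, a);   // the tag argument selects the overload
    fill(ordering_proxy<COL_MAJOR>{}, 2, b);
    std::printf("%d %d %d %d\n", b[0], b[1], b[2], b[3]);
    return 0;
  }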