Compare commits


No commits in common. "20c29ed815ff161966d5c6ee08dcd9b58984a36a" and "4277c07cc22790fbc4e3fa4ea76ddb9dd494e586" have entirely different histories.

13 changed files with 150 additions and 811 deletions

View File

@ -1,4 +1,3 @@
AUTOMAKE_OPTIONS = subdir-objects
include $(top_srcdir)/atrip.mk include $(top_srcdir)/atrip.mk
AM_CPPFLAGS = -I$(top_srcdir)/include/ $(CTF_CPPFLAGS) AM_CPPFLAGS = -I$(top_srcdir)/include/ $(CTF_CPPFLAGS)
@ -20,13 +19,4 @@ endif
if WITH_CUDA if WITH_CUDA
test_main_CXXFLAGS = $(CUDA_CXXFLAGS) test_main_CXXFLAGS = $(CUDA_CXXFLAGS)
test_main_LDADD += $(CUDA_LDFLAGS) test_main_LDADD += $(CUDA_LDFLAGS)
AM_CXXFLAGS = $(CUDA_CXXFLAGS)
AM_LDFLAGS += $(CUDA_LDFLAGS)
bin_PROGRAMS += test-cublas-parallel-atrip
test_cublas_parallel_SOURCES = test-cublas-parallel-atrip.cxx
bin_PROGRAMS += test-cuda-sanity
test_cuda_sanity_SOURCES = test-cuda-sanity.cxx
endif endif

View File

@ -1,239 +0,0 @@
#include <mpi.h>
#include <iostream>
#include <cassert>
#include <string.h>
#include <sstream>
#include <string>
#include <vector>
#include <cuda.h>
#include <chrono>
#include <map>
#include <cstdlib>
#include <unistd.h>
#include <CLI11.hpp>
#define CUBLASAPI
#include <cublas_api.h>
#include <cublas_v2.h>
struct Timer {
using Clock = std::chrono::high_resolution_clock;
using Event = std::chrono::time_point<Clock>;
std::chrono::duration<double> duration;
Event _start;
inline void start() noexcept { _start = Clock::now(); }
inline void stop() noexcept { duration += Clock::now() - _start; }
inline void clear() noexcept { duration *= 0; }
inline double count() const noexcept { return duration.count(); }
};
using Timings = std::map<std::string, Timer>;
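For orientation, a minimal sketch of how the Timer/Timings pair above is used further down in this file (the "dgemm" key mirrors the benchmark loop below):
// sketch only: accumulate wall time under a named key, then read it back
Timings timings;
timings["dgemm"].start();
// ... work being timed ...
timings["dgemm"].stop();
printf("dgemm took %f s\n", timings["dgemm"].count());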
#define _FORMAT(_fmt, ...) \
([&] (void) -> std::string { \
int _sz = std::snprintf(nullptr, 0, _fmt, __VA_ARGS__); \
std::vector<char> _out(_sz + 1); \
std::snprintf(&_out[0], _out.size(), _fmt, __VA_ARGS__); \
return std::string(_out.data()); \
})()
#define _CHECK_CUDA_SUCCESS(message, ...) \
do { \
CUresult result = __VA_ARGS__; \
printf("doing %s\n", message); \
if (result != CUDA_SUCCESS) { \
printf("\t!!CUDA_ERROR(%d): %s:%d %s\n", \
result, \
__FILE__, \
__LINE__, \
message); \
return 1; \
} \
} while (0)
#define _CHECK_CUBLAS_SUCCESS(message, ...) \
do { \
cublasStatus_t result = __VA_ARGS__; \
if (result != 0) { \
printf("\t!!CUBLAS_ERROR(%d): %s:%d %s\n", \
result, \
__FILE__, \
__LINE__, \
message); \
return 1; \
} \
} while (0)
int main(int argc, char** argv) {
using std::vector;
MPI_Init(NULL, NULL);
int rank, np, ngcards;
size_t no(10), nv(no * 10), its(2);
bool barrier = false;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &np);
CLI::App app{"Main bench for atrip"};
app.add_option("--no", no, "Occupied orbitals");
app.add_option("--nv", nv, "Virtual orbitals");
app.add_option("--its", its, "Number of iterations to be done");
app.add_option("--barrier", barrier, "Call a MPI_Barrier in every iteration?");
CLI11_PARSE(app, argc, argv);
const size_t oo = no * no, ooo = no * oo;
Timings timings;
MPI_Barrier(MPI_COMM_WORLD);
_CHECK_CUDA_SUCCESS("init for cuda",
cuInit(0));
_CHECK_CUDA_SUCCESS("get ncards",
cuDeviceGetCount(&ngcards));
CUcontext ctx;
CUdevice dev;
char hostname[256];
gethostname(hostname, 256);
printf("%s with rank %d gets card %d\n",
hostname,
rank,
rank % ngcards);
// set contexts
_CHECK_CUDA_SUCCESS("device get", cuDeviceGet(&dev, rank % ngcards));
_CHECK_CUDA_SUCCESS("creating context", cuCtxCreate(&ctx, 0, dev));
_CHECK_CUDA_SUCCESS("setting context", cuCtxSetCurrent(ctx));
_CHECK_CUDA_SUCCESS("synchronizing", cuCtxSynchronize());
MPI_Barrier(MPI_COMM_WORLD);
using host_slice_t = vector<double>;
vector<size_t> sizes = {nv * oo, nv * no , oo, oo * no, oo * no};
vector<CUdeviceptr> P_phh(3), P_ph(6) , H_hh(3), H_hhh(3), T_hhh(1);
vector<vector<CUdeviceptr>*> slices_d = {&P_phh, &P_ph , &H_hh, &H_hhh, &T_hhh};
vector<vector<host_slice_t>> slices_h(slices_d.size());
{
int i = -1;
for (auto& v: slices_d) {
i++;
for (auto& ptr: *v) {
_CHECK_CUDA_SUCCESS("malloc",
cuMemAlloc(&ptr,
sizes[i] * sizeof(double)));
slices_h[i].push_back(std::move(std::vector<double>(sizes[i])));
}
}
}
const double one = 1.0, zero = 0.0;
printf("its: %d\n", its);
printf("barrier: %d\n", barrier);
printf("no: %ld\n", no);
printf("nv: %ld\n", nv);
printf("SIZE %f GB\n", (3 * nv * oo
+ 6 * oo * nv
+ 3 * oo
+ 3 * ooo
+ 1 * ooo
) * sizeof(double) / 1024.0 / 1024.0 / 1024.0);
std::map<std::string, double> tflopss
{{ "dgemm", ooo * (no + nv) * 6.0 * 2.0 * its / 1e12},
{ "holes", ooo * no * 6.0 * 2.0 * its / 1e12},
{ "particles", ooo * nv * 6.0 * 2.0 * its / 1e12}};
cublasHandle_t handle;
_CHECK_CUBLAS_SUCCESS("handle create", cublasCreate(&handle));
printf("handle %ld\n", handle);
timings["dgemm"].start();
for (size_t i = 0; i < its; i++) {
if (barrier) {
MPI_Barrier(MPI_COMM_WORLD);
timings["memcpy"].start();
for (size_t _s = 0; _s < slices_d.size(); _s++) {
// for (size_t _b = 0; _b < slices_h[_s].size(); _b++) {
for (size_t _b = 0; _b < 1 ; _b++) {
auto device = (*slices_d[_s])[_b];
auto host = slices_h[_s][_b].data();
cuMemcpyHtoD(device, host, sizes[_s] * sizeof(double));
}
}
timings["memcpy"].stop();
}
timings["holes"].start();
for (size_t j = 0; j < 3; j++) {
_CHECK_CUBLAS_SUCCESS(" > 'geming ...",
cublasDgemm(handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
oo, no, no,
&one,
(double*)H_hhh[j], oo,
(double*)H_hh[j], no,
&zero,
(double*)T_hhh[0], oo));
_CHECK_CUBLAS_SUCCESS(" > 'geming ...",
cublasDgemm(handle,
CUBLAS_OP_N,
CUBLAS_OP_T,
oo, no, no,
&one,
(double*)H_hhh[j], oo,
(double*)H_hh[j], no,
&zero,
(double*)T_hhh[0], oo));
}
timings["holes"].stop();
timings["particles"].start();
for (size_t j = 0; j < 6; j++) {
_CHECK_CUBLAS_SUCCESS(" > 'geming ...",
cublasDgemm(handle,
CUBLAS_OP_T,
CUBLAS_OP_N,
oo, no, nv,
&one,
(double*)P_phh[j % 3], nv,
(double*)P_ph[j], nv,
&zero,
(double*)T_hhh[0], oo));
}
timings["particles"].stop();
cuCtxSynchronize();
}
timings["dgemm"].stop();
printf("Performance: \n");
for (auto name: {"holes", "particles", "dgemm"})
printf("%10s TFlops: %4.1f\n",
name,
tflopss[name]
/ timings[name].count());
printf("Timings: \n");
for (auto const& kv: timings)
printf("%10s: %10f\n", kv.first.c_str(), kv.second.count());
MPI_Finalize();
return 0;
}
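Unlike the two sibling tests below, this removed benchmark carries no compile-command block; a hedged sketch of how it might be built and run, mirroring its siblings (the CLI11 include path and the option values are assumptions):
// Local Variables:
// compile-command: "mpic++ \
// -pedantic -std=c++11 \
// -I./include \
// -L./cudaroot/lib64 -lcuda -lcudart -lcublas \
// ./test-cublas-parallel-atrip.cxx -o test-cublas-parallel-atrip"
// End:
// run, e.g.: mpirun -np 2 ./test-cublas-parallel-atrip --no 20 --nv 200 --its 5 --barrier 1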

View File

@ -1,163 +0,0 @@
#include <mpi.h>
#include <iostream>
#include <cassert>
#include <string.h>
#include <sstream>
#include <string>
#include <vector>
#include <cuda.h>
#include <chrono>
#include <map>
#include <cstdlib>
#include <unistd.h>
#define CUBLASAPI
#include <cublas_api.h>
#include <cublas_v2.h>
struct Timer {
using Clock = std::chrono::high_resolution_clock;
using Event = std::chrono::time_point<Clock>;
std::chrono::duration<double> duration;
Event _start;
inline void start() noexcept { _start = Clock::now(); }
inline void stop() noexcept { duration += Clock::now() - _start; }
inline void clear() noexcept { duration *= 0; }
inline double count() const noexcept { return duration.count(); }
};
using Timings = std::map<std::string, Timer>;
#define _FORMAT(_fmt, ...) \
([&] (void) -> std::string { \
int _sz = std::snprintf(nullptr, 0, _fmt, __VA_ARGS__); \
std::vector<char> _out(_sz + 1); \
std::snprintf(&_out[0], _out.size(), _fmt, __VA_ARGS__); \
return std::string(_out.data()); \
})()
#define _CHECK_CUDA_SUCCESS(message, ...) \
do { \
CUresult result = __VA_ARGS__; \
printf("doing %s\n", message); \
if (result != CUDA_SUCCESS) { \
printf("\t!!CUDA_ERROR(%d): %s:%d %s\n", \
result, \
__FILE__, \
__LINE__, \
message); \
return 1; \
} \
} while (0)
#define _CHECK_CUBLAS_SUCCESS(message, ...) \
do { \
cublasStatus_t result = __VA_ARGS__; \
printf("CUBLAS: doing %s\n", message); \
if (result != 0) { \
printf("\t!!CUBLAS_ERROR(%d): %s:%d %s\n", \
result, \
__FILE__, \
__LINE__, \
message); \
return 1; \
} \
} while (0)
int main(int argc, char** argv) {
MPI_Init(NULL, NULL);
int rank, np, ngcards;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &np);
Timings timings;
MPI_Barrier(MPI_COMM_WORLD);
_CHECK_CUDA_SUCCESS("init for cuda",
cuInit(0));
_CHECK_CUDA_SUCCESS("get ncards",
cuDeviceGetCount(&ngcards));
const size_t N
= argc > 1
? atoi(argv[1])
: 30000
, dgemms
= argc > 2
? atoi(argv[2])
: 2
, flops = 2 * N * N * N * dgemms
;
CUcontext ctx;
CUdevice dev;
char hostname[256];
gethostname(hostname, 256);
printf("%s with rank %d gets card %d\n",
hostname,
rank,
rank % ngcards);
// set contexts
_CHECK_CUDA_SUCCESS("device get",
cuDeviceGet(&dev, rank % ngcards));
_CHECK_CUDA_SUCCESS("creating context",
cuCtxCreate(&ctx, 0, dev));
_CHECK_CUDA_SUCCESS("setting context",
cuCtxSetCurrent(ctx));
_CHECK_CUDA_SUCCESS("synchronizing",
cuCtxSynchronize());
MPI_Barrier(MPI_COMM_WORLD);
CUdeviceptr A, B, C;
const double one = 1.0;
printf("SIZE %f GB\n", N * N * sizeof(double) / 1024.0 / 1024.0 / 1024.0);
_CHECK_CUDA_SUCCESS("A", cuMemAlloc(&A, N * N * sizeof(double)));
_CHECK_CUDA_SUCCESS("B", cuMemAlloc(&B, N * N * sizeof(double)));
_CHECK_CUDA_SUCCESS("C", cuMemAlloc(&C, N * N * sizeof(double)));
cublasHandle_t handle;
cublasStatus_t stat;
_CHECK_CUBLAS_SUCCESS("handle create", cublasCreate(&handle));
printf("handle %ld\n", handle);
timings["dgemm"].start();
for (size_t i = 0; i < dgemms; i++) {
_CHECK_CUBLAS_SUCCESS(_FORMAT(" > 'geming %ld ...", i).c_str(),
cublasDgemm(handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
N, N, N,
&one,
(double*)A, N,
(double*)B, N,
&one,
(double*)C, N));
}
cuCtxSynchronize();
timings["dgemm"].stop();
printf("dgemm Gflops: %f\n",
flops
/ timings["dgemm"].count()
/ 1024.0
/ 1024.0
/ 1024.0);
MPI_Finalize();
return 0;
}
// Local Variables:
// compile-command: "mpic++ \
// -pedantic -std=c++11 \
// -L./cudaroot/lib64 -lcuda \
// -L./cudaroot/lib64 -lcudart \
// -L./cudaroot/lib64 -lcublas \
// ./test-cublas-parallel.cxx -o test-cublas-parallel"
// End:
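For completeness, a hedged example invocation; per the argv parsing above, the two positional arguments are N and the number of dgemms (values illustrative):
// e.g.: mpirun -np 4 ./test-cublas-parallel 20000 4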

View File

@ -1,104 +0,0 @@
#include <mpi.h>
#include <iostream>
#include <cassert>
#include <string.h>
#include <sstream>
#include <string>
#include <vector>
#include <cuda.h>
#define _CHECK_CUDA_SUCCESS(message, ...) \
do { \
CUresult result = __VA_ARGS__; \
printf("doing %s\n", message); \
if (result != CUDA_SUCCESS) { \
printf("\t!!CUDA_ERROR(%d): %s:%d %s\n", \
result, \
__FILE__, \
__LINE__, \
message); \
return 1; \
} \
} while (0)
int main() {
int rank, np, ngcards;
MPI_Init(NULL, NULL);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &np);
_CHECK_CUDA_SUCCESS("init for cuda",
cuInit(0));
_CHECK_CUDA_SUCCESS("get ncards",
cuDeviceGetCount(&ngcards));
for (size_t rank = 0; rank < ngcards; rank++) {
CUcontext ctx;
CUdevice dev;
CUdevprop_st prop;
size_t _free, total, total2;
char *name = (char*)malloc(256);
printf("Setting contexts\n");
// set contexts
_CHECK_CUDA_SUCCESS("device get",
cuDeviceGet(&dev, rank));
_CHECK_CUDA_SUCCESS("creating context",
cuCtxCreate(&ctx, 0, dev));
_CHECK_CUDA_SUCCESS("setting context",
cuCtxSetCurrent(ctx));
_CHECK_CUDA_SUCCESS("synchronizing",
cuCtxSynchronize());
_CHECK_CUDA_SUCCESS("prop get",
cuDeviceGetProperties(&prop, dev));
_CHECK_CUDA_SUCCESS("meminfo get",
cuMemGetInfo(&_free, &total));
_CHECK_CUDA_SUCCESS("name get",
cuDeviceGetName(name, 256, dev));
_CHECK_CUDA_SUCCESS("totalmem get",
cuDeviceTotalMem(&total2, dev));
printf("\n"
"CUDA CARD RANK %d\n"
"=================\n"
"\tname: %s\n"
"\tShared Mem Per Block (KB): %f\n"
"\tFree/Total mem (GB): %f/%f\n"
"\total2 mem (GB): %f\n"
"\n",
dev,
name,
prop.sharedMemPerBlock / 1024.0,
_free / 1024.0 / 1024.0 / 1024.0 ,
total / 1024.0 / 1024.0 / 1024.0 ,
total2 / 1024.0 / 1024.0 / 1024.0
);
if (_free == 0 || total == 0 || total2 == 0)
return 1;
CUdeviceptr data;
_CHECK_CUDA_SUCCESS("memalloc 1",
cuMemAlloc(&data, sizeof(double) * 10000));
_CHECK_CUDA_SUCCESS("memalloc 2",
cuMemAlloc(&data, sizeof(double) * 10000));
}
MPI_Finalize();
return 0;
}
// Local Variables:
// compile-command: "mpic++ \
// -pedantic -std=c++11 \
// -L./cudaroot/lib64 -lcuda \
// -L./cudaroot/lib64 -lcudart \
// -L./cudaroot/lib64 -lcublas \
// ./mem.cxx -o mem"
// End:
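A hedged usage note: the sanity test enumerates every visible card itself (the loop over ngcards above), so a single rank is enough:
// e.g.: mpirun -np 1 ./mem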

View File

@ -142,10 +142,8 @@ int main(int argc, char** argv) {
+ /* tai */ f * nranks * no * nv + /* tai */ f * nranks * no * nv
; ;
if (rank == 0) { if (atrip::Atrip::rank == 0)
std::cout << "Tentative MEMORY USAGE (GB): " std::cout << "Tentative MEMORY USAGE: " << atrip_memory << "\n";
<< double(atrip_memory) / 1024.0 / 1024.0 / 1024.0 << "\n";
}
std::vector<int> symmetries(4, NS) std::vector<int> symmetries(4, NS)

View File

@ -1,7 +1,5 @@
#!/usr/bin/env bash #!/usr/bin/env bash
cd `git rev-parse --show-toplevel`
type -a autoreconf > /dev/null || type -a autoreconf > /dev/null ||
{ {
cat <<EOF && exit cat <<EOF && exit

View File

@ -1,13 +1,5 @@
#pragma once #pragma once
#include <atrip/Utils.hpp>
#if defined(HAVE_CUDA)
#include <cuda.h>
#endif
#include <sstream>
#if defined(HAVE_CUDA) && defined(__CUDACC__) #if defined(HAVE_CUDA) && defined(__CUDACC__)
# define __MAYBE_GLOBAL__ __global__ # define __MAYBE_GLOBAL__ __global__
# define __MAYBE_DEVICE__ __device__ # define __MAYBE_DEVICE__ __device__
@ -15,31 +7,3 @@
# define __MAYBE_GLOBAL__ # define __MAYBE_GLOBAL__
# define __MAYBE_DEVICE__ # define __MAYBE_DEVICE__
#endif #endif
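For context, a sketch of how these conditional qualifiers are used: the same signature compiles as a CUDA kernel under nvcc and as a plain host function otherwise (the name mirrors the reorder kernel in Equations.hpp later in this diff, simplified):
// __MAYBE_GLOBAL__ expands to __global__ when HAVE_CUDA && __CUDACC__, else to nothing
template <typename F>
__MAYBE_GLOBAL__
void reorder(size_t size, F* to, F* from);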
#define _CHECK_CUDA_SUCCESS(message, ...) \
do { \
CUresult result = __VA_ARGS__; \
if (result != CUDA_SUCCESS) { \
auto msg = _FORMAT("\t!!CUDA_ERROR(%d): %s:%d\v%s", \
result, \
__FILE__, \
__LINE__, \
message); \
std::cerr << msg; \
throw msg; \
} \
} while (0)
#define _CHECK_CUBLAS_SUCCESS(message, ...) \
do { \
cublasStatus_t result = __VA_ARGS__; \
if (result != 0) { \
auto msg = _FORMAT("\t!!CUBLAS_ERROR(%d): %s:%d\v%s", \
result, \
__FILE__, \
__LINE__, \
message); \
std::cerr << msg; \
throw msg; \
} \
} while (0)

View File

@ -21,7 +21,6 @@
#include <atrip/Tuples.hpp> #include <atrip/Tuples.hpp>
#include <atrip/Utils.hpp> #include <atrip/Utils.hpp>
#include <atrip/CUDA.hpp>
namespace atrip { namespace atrip {
@ -459,11 +458,7 @@ void unwrapAndMarkReady() {
#if defined(HAVE_CUDA) #if defined(HAVE_CUDA)
// copy the retrieved mpi data to the device // copy the retrieved mpi data to the device
WITH_CHRONO("cuda:memcpy", WITH_CHRONO("cuda:memcpy",
_CHECK_CUDA_SUCCESS("copying mpi data to device", cuMemcpyHtoD(data, (void*)mpi_data, sizeof(F) * size);)
cuMemcpyHtoD(data,
(void*)mpi_data,
sizeof(F) * size));
)
std::free(mpi_data); std::free(mpi_data);
#endif #endif

View File

@ -195,28 +195,7 @@ template <typename F=double>
; ;
if (blank.info.state == Slice<F>::SelfSufficient) { if (blank.info.state == Slice<F>::SelfSufficient) {
#if defined(HAVE_CUDA) #if defined(HAVE_CUDA)
const size_t _size = sizeof(F) * sources[from.source].size(); blank.mpi_data = sources[from.source].data();
// TODO: this is code duplication with downstairs
if (freePointers.size() == 0) {
std::stringstream stream;
stream << "No more free pointers "
<< "for type " << type
<< " and name " << name
;
throw std::domain_error(stream.str());
}
auto dataPointer = freePointers.begin();
blank.data = *dataPointer;
freePointers.erase(dataPointer);
//
//
// TODO [#A]: do cuMemcpy of
// sources[from.source].data() ⇒ blank.data
// Do this when everything else is working.
// This will probably be a bottleneck of the H-to-D communication,
// as most slices are SelfSufficient.
//
//
#else #else
blank.data = sources[from.source].data(); blank.data = sources[from.source].data();
#endif #endif
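A hedged sketch of the H-to-D copy that the removed TODO above asks for, reusing the _size already computed for the slice (illustrative only, error handling elided):
// copy the self-sufficient slice's host source into the device pointer
// taken from freePointers; _size = sizeof(F) * sources[from.source].size()
cuMemcpyHtoD(blank.data, (void*)sources[from.source].data(), _size);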
@ -337,16 +316,6 @@ template <typename F=double>
} }
} }
#if defined(HAVE_CUDA)
// In cuda, SelfSufficient slices have an ad-hoc pointer
// since it is a pointer on the device and has to be
// brought back to the free pointer bucket of the SliceUnion.
// Therefore, only in the Recycled case it should not be
// put back the pointer.
if (slice.info.state == Slice<F>::Recycled) {
freeSlicePointer = false;
}
#else
// if the slice is self sufficient, do not dare touching the // if the slice is self sufficient, do not dare touching the
// pointer, since it is a pointer to our sources in our rank. // pointer, since it is a pointer to our sources in our rank.
if ( slice.info.state == Slice<F>::SelfSufficient if ( slice.info.state == Slice<F>::SelfSufficient
@ -354,7 +323,6 @@ template <typename F=double>
) { ) {
freeSlicePointer = false; freeSlicePointer = false;
} }
#endif
// make sure we get its data pointer to be used later // make sure we get its data pointer to be used later
// only for non-recycled, since it can be that we need // only for non-recycled, since it can be that we need
@ -378,7 +346,7 @@ template <typename F=double>
// << " info " << slice.info // << " info " << slice.info
<< "\n"; << "\n";
slice.free(); slice.free();
} // we did not find the slice }
} }
} }
@ -412,17 +380,10 @@ template <typename F=double>
for (auto& ptr: sliceBuffers) { for (auto& ptr: sliceBuffers) {
#if defined(HAVE_CUDA) #if defined(HAVE_CUDA)
const CUresult error =
cuMemAlloc(&ptr, sizeof(F) * sources[0].size()); cuMemAlloc(&ptr, sizeof(F) * sources[0].size());
if (ptr == 0UL) { if (ptr == 0UL) {
throw "UNSUFICCIENT MEMORY ON THE GRAPHIC CARD FOR FREE POINTERS"; throw "UNSUFICCIENT MEMORY ON THE GRAPHIC CARD FOR FREE POINTERS";
} }
if (error != CUDA_SUCCESS) {
std::stringstream s;
s << "Error allocating memory for slice buffers "
<< "code " << error << "\n";
throw s.str();
}
#else #else
ptr = (DataPtr<F>)malloc(sizeof(F) * sources[0].size()); ptr = (DataPtr<F>)malloc(sizeof(F) * sources[0].size());
#endif #endif

View File

@ -80,16 +80,6 @@ struct Timer {
using Timings = std::map<std::string, Timer>; using Timings = std::map<std::string, Timer>;
// Chrono:1 ends here // Chrono:1 ends here
// A nice handy macro to do formatting
#define _FORMAT(_fmt, ...) \
([&] (void) -> std::string { \
int _sz = std::snprintf(nullptr, 0, _fmt, __VA_ARGS__); \
std::vector<char> _out(_sz + 1); \
std::snprintf(&_out[0], _out.size(), _fmt, __VA_ARGS__); \
return std::string(_out.data()); \
})()
// [[file:~/cuda/atrip/atrip.org::*Epilog][Epilog:1]] // [[file:~/cuda/atrip/atrip.org::*Epilog][Epilog:1]]
} }
// Epilog:1 ends here // Epilog:1 ends here

View File

@ -24,7 +24,7 @@
using namespace atrip; using namespace atrip;
#if defined(HAVE_CUDA) #if defined(HAVE_CUDA)
#include <atrip/CUDA.hpp> #include <cuda.h>
#endif #endif
template <typename F> bool RankMap<F>::RANK_ROUND_ROBIN; template <typename F> bool RankMap<F>::RANK_ROUND_ROBIN;
@ -49,6 +49,11 @@ void Atrip::init(MPI_Comm world) {
Atrip::communicator = world; Atrip::communicator = world;
MPI_Comm_rank(world, (int*)&Atrip::rank); MPI_Comm_rank(world, (int*)&Atrip::rank);
MPI_Comm_size(world, (int*)&Atrip::np); MPI_Comm_size(world, (int*)&Atrip::np);
#if defined(HAVE_CUDA)
Atrip::cuda.status = cublasCreate(&Atrip::cuda.handle);
#endif
} }
template <typename F> template <typename F>
@ -66,24 +71,18 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
#if defined(HAVE_CUDA) #if defined(HAVE_CUDA)
int ngcards; int ngcards;
_CHECK_CUDA_SUCCESS("initializing cuda",
cuInit(0));
_CHECK_CUDA_SUCCESS("getting device count",
cuDeviceGetCount(&ngcards));
const auto clusterInfo = getClusterInfo(Atrip::communicator); const auto clusterInfo = getClusterInfo(Atrip::communicator);
cuDeviceGetCount(&ngcards);
LOG(0,"Atrip") << "ngcards: " << ngcards << "\n"; LOG(0,"Atrip") << "ngcards: " << ngcards << "\n";
if (clusterInfo.ranksPerNode > ngcards) { if (clusterInfo.ranksPerNode > ngcards) {
const auto msg std::cerr << "ATRIP: You are running on more ranks per node than the number of graphics cards\n"
= _FORMAT("ATRIP: You are running on more ranks per node than the number of graphics cards\n" << "You have " << ngcards << " cards at your disposal\n";
"You have %d cards at your disposal\n", ngcards); throw "";
std::cerr << msg; }
throw msg; if (clusterInfo.ranksPerNode < ngcards) {
} else if (clusterInfo.ranksPerNode < ngcards) { std::cerr << "You have " << ngcards << " cards at your disposal\n"
const auto msg << "You will be only using " << clusterInfo.ranksPerNode
= _FORMAT("You have %d cards at your disposal.\n" << ", i.e., the nubmer of ranks.\n";
"You will be only using %d, i.e, the number of ranks\n",
ngcards, clusterInfo.ranksPerNode);
std::cerr << msg;
} }
@ -95,27 +94,16 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
struct { struct { size_t free, total; } avail; size_t total; } memory; struct { struct { size_t free, total; } avail; size_t total; } memory;
char *name = (char*)malloc(256); char *name = (char*)malloc(256);
// - TODO :: we should check that the assignment of graphics cards
// to nodes works as expected, i.e., node k should get from 0
// to ngcards with the formula =rank % ngcards=.
// set current device // set current device
_CHECK_CUDA_SUCCESS("getting device for index <rank>", cuDeviceGet(&dev, rank);
cuDeviceGet(&dev, rank % ngcards)); cuCtxCreate(&ctx, 0, dev);
_CHECK_CUDA_SUCCESS("creating a cuda context", cuCtxSetCurrent(ctx);
cuCtxCreate(&ctx, 0, dev));
_CHECK_CUDA_SUCCESS("setting the context",
cuCtxSetCurrent(ctx));
// get information of the device // get information of the device
_CHECK_CUDA_SUCCESS("getting properties of current device", cuDeviceGetProperties(&prop, dev);
cuDeviceGetProperties(&prop, dev)); cuMemGetInfo(&memory.avail.free, &memory.avail.total);
_CHECK_CUDA_SUCCESS("getting memory information", cuDeviceGetName(name, 256, dev);
cuMemGetInfo(&memory.avail.free, &memory.avail.total)); cuDeviceTotalMem(&memory.total, dev);
_CHECK_CUDA_SUCCESS("getting name",
cuDeviceGetName(name, 256, dev));
_CHECK_CUDA_SUCCESS("getting total memory",
cuDeviceTotalMem(&memory.total, dev));
printf("\n" printf("\n"
"CUDA CARD RANK %d\n" "CUDA CARD RANK %d\n"
@ -136,10 +124,6 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
memory.total / 1024.0 / 1024.0 / 1024.0 memory.total / 1024.0 / 1024.0 / 1024.0
); );
std::free((void*)name); std::free((void*)name);
_CHECK_CUBLAS_SUCCESS("creating a cublas handle",
cublasCreate(&Atrip::cuda.handle));
} }
MPI_Barrier(universe); MPI_Barrier(universe);
} }
@ -179,27 +163,17 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
#if defined(HAVE_CUDA) #if defined(HAVE_CUDA)
DataPtr<F> Tai, epsi, epsa; DataPtr<F> Tai, epsi, epsa;
//TODO: free memory pointers in the end of the algorithm //TODO: free memory pointers in the end of the algorithm
cuMemAlloc(&Tai, sizeof(F) * _Tai.size());
cuMemAlloc(&epsi, sizeof(F) * _epsi.size());
cuMemAlloc(&epsa, sizeof(F) * _epsa.size());
_CHECK_CUDA_SUCCESS("Tai", cuMemcpyHtoD(Tai, (void*)_Tai.data(), sizeof(F) * _Tai.size());
cuMemAlloc(&Tai, sizeof(F) * _Tai.size())); cuMemcpyHtoD(epsi,(void*)_epsi.data(), sizeof(F) * _epsi.size());
_CHECK_CUDA_SUCCESS("epsi", cuMemcpyHtoD(epsa, (void*)_epsa.data(), sizeof(F) * _epsa.size());
cuMemAlloc(&epsi, sizeof(F) * _epsi.size()));
_CHECK_CUDA_SUCCESS("epsa",
cuMemAlloc(&epsa, sizeof(F) * _epsa.size()));
_CHECK_CUDA_SUCCESS("memcpy Tai", cuMemAlloc(&Tijk, sizeof(F) * No * No * No);
cuMemcpyHtoD(Tai, (void*)_Tai.data(), sizeof(F) * _Tai.size())); cuMemAlloc(&Zijk, sizeof(F) * No * No * No);
_CHECK_CUDA_SUCCESS("memcpy epsi",
cuMemcpyHtoD(epsi,(void*)_epsi.data(), sizeof(F) * _epsi.size()));
_CHECK_CUDA_SUCCESS("memcpy epsa",
cuMemcpyHtoD(epsa, (void*)_epsa.data(), sizeof(F) * _epsa.size()));
_CHECK_CUDA_SUCCESS("Tijk",
cuMemAlloc(&Tijk, sizeof(F) * No * No * No));
_CHECK_CUDA_SUCCESS("Zijk",
cuMemAlloc(&Zijk, sizeof(F) * No * No * No));
#else #else
std::vector<F> &Tai = _Tai, &epsi = _epsi, &epsa = _epsa; std::vector<F> &Tai = _Tai, &epsi = _epsi, &epsa = _epsa;
Zijk = (DataFieldType<F>*)malloc(No*No*No * sizeof(DataFieldType<F>)); Zijk = (DataFieldType<F>*)malloc(No*No*No * sizeof(DataFieldType<F>));

View File

@ -15,13 +15,10 @@
// [[file:~/cuda/atrip/atrip.org::*Blas][Blas:2]] // [[file:~/cuda/atrip/atrip.org::*Blas][Blas:2]]
#include <atrip/Blas.hpp> #include <atrip/Blas.hpp>
#include <atrip/Atrip.hpp> #include <atrip/Atrip.hpp>
#include <atrip/CUDA.hpp>
#if defined(HAVE_CUDA) #if defined(HAVE_CUDA)
# include <cstring> # include <cstring>
static size_t dgem_call = 0;
static inline static inline
cublasOperation_t char_to_cublasOperation(const char* trans) { cublasOperation_t char_to_cublasOperation(const char* trans) {
if (strncmp("N", trans, 1) == 0) if (strncmp("N", trans, 1) == 0)
@ -52,8 +49,6 @@ namespace atrip {
typename DataField<double>::type *C, typename DataField<double>::type *C,
const int *ldc) { const int *ldc) {
#if defined(HAVE_CUDA) #if defined(HAVE_CUDA)
// TODO: remove this verbose checking
const cublasStatus_t error =
cublasDgemm(Atrip::cuda.handle, cublasDgemm(Atrip::cuda.handle,
char_to_cublasOperation(transa), char_to_cublasOperation(transa),
char_to_cublasOperation(transb), char_to_cublasOperation(transb),
@ -61,14 +56,6 @@ namespace atrip {
alpha, A, *lda, alpha, A, *lda,
B, *ldb, beta, B, *ldb, beta,
C, *ldc); C, *ldc);
if (error != 0) printf(":%-3ld (%4ldth) ERR<%4d> cublasDgemm: "
"A = %20ld "
"B = %20ld "
"C = %20ld "
"\n",
Atrip::rank,
dgem_call++,
error, A, B, C);
#else #else
dgemm_(transa, transb, dgemm_(transa, transb,
m, n, k, m, n, k,
@ -96,17 +83,16 @@ namespace atrip {
cuDoubleComplex cuDoubleComplex
cu_alpha = {std::real(*alpha), std::imag(*alpha)}, cu_alpha = {std::real(*alpha), std::imag(*alpha)},
cu_beta = {std::real(*beta), std::imag(*beta)}; cu_beta = {std::real(*beta), std::imag(*beta)};
_CHECK_CUBLAS_SUCCESS("cublasZgemm",
cublasZgemm(Atrip::cuda.handle, cublasZgemm(Atrip::cuda.handle,
char_to_cublasOperation(transa), char_to_cublasOperation(transa),
char_to_cublasOperation(transb), char_to_cublasOperation(transb),
*m, *n, *k, *m, *n, *k,
&cu_alpha, &cu_alpha,
A, *lda, A, *lda,
B, *ldb, B, *ldb,
&cu_beta, &cu_beta,
C, *ldc)); C, *ldc);
#else #else
zgemm_(transa, transb, zgemm_(transa, transb,
m, n, k, m, n, k,

View File

@ -14,13 +14,31 @@
// [[file:~/cuda/atrip/atrip.org::*Prolog][Prolog:2]] // [[file:~/cuda/atrip/atrip.org::*Prolog][Prolog:2]]
#include<atrip/Equations.hpp> #include<atrip/Equations.hpp>
#include<atrip/CUDA.hpp> #include<atrip/CUDA.hpp>
#if defined(HAVE_CUDA)
#include <cuda.h>
#endif
namespace atrip { namespace atrip {
// Prolog:2 ends here // Prolog:2 ends here
// These are just helper structures
// used for templating the reorder
// function
enum reordering_t
{
IJK,
IKJ,
JIK,
JKI,
KIJ,
KJI
};
template <typename F, reordering_t R>
struct reorder_proxy {};
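The proxy's only job is to carry the reordering as a type so the right reorder instantiation gets picked; a call then reads roughly as follows (names follow the REORDER macro further down; on CUDA the launch form reorder<<<blocks, threads>>>(...) is used instead):
// sketch: select the IJK instantiation of reorder for a No*No*No block
reorder(reorder_proxy<double, IJK>{}, No, Tijk, _t_buffer);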
#ifdef HAVE_CUDA #ifdef HAVE_CUDA
namespace cuda { namespace cuda {
@ -90,35 +108,42 @@ namespace cuda {
template <typename F> template <typename F>
__device__ __device__
void sum_in_place(F* to, const F* from); void sum_in_place(F* to, const F* b);
template <> template <>
__device__ __device__
void sum_in_place(double* to, const double *from) { *to += *from; } void sum_in_place(double* to, const double *b) { *to += *b; }
template <> template <>
__device__ __device__
void sum_in_place(cuDoubleComplex* to, const cuDoubleComplex* from) { void sum_in_place(cuDoubleComplex* to, const cuDoubleComplex* b) {
to->x += from->x; to->x += b->x;
to->y += from->y; to->y += b->y;
}
__device__
cuDoubleComplex& operator+=(cuDoubleComplex& lz, cuDoubleComplex const& rz) {
lz.x += rz.x;
lz.y += rz.y;
return lz;
} }
}; };
#endif #endif
#if defined(HAVE_CUDA) #if defined(HAVE_CUDA)
#define FOR_K() \ #define LIMS_KS() \
for (size_t kmin = blockIdx.x * blockDim.x + threadIdx.x, \ size_t kmin = blockIdx.x * blockDim.x + threadIdx.x, \
k = kmin, \ k = kmin, \
idx = kmin * size * size * size; \ idx = kmin * size * size * size \
k < ((kmin < size) ? kmin + 1 : size); \ ; \
k++) k < ((kmin < size) ? kmin + 1 : size)
#else #else
#define FOR_K() for (size_t k=0, idx=0; k < size; k++) #define LIMS_KS() size_t k=0, idx=0; k < size
#endif #endif
#define _IJK_(i, j, k) i + j*size + k*size*size #define _IJK_(i, j, k) i + j*size + k*size*size
#define _REORDER_BODY_(...) \ #define _REORDER_BODY_(...) \
FOR_K() \ for (LIMS_KS() ; k++) \
for (size_t j = 0; j < size; j++) \ for (size_t j = 0; j < size; j++) \
for (size_t i = 0; i < size; i++, idx++) { \ for (size_t i = 0; i < size; i++, idx++) { \
__VA_ARGS__ \ __VA_ARGS__ \
@ -127,9 +152,7 @@ namespace cuda {
template <typename F> \ template <typename F> \
__MAYBE_GLOBAL__ \ __MAYBE_GLOBAL__ \
void reorder(reorder_proxy< F, _enum > p, \ void reorder(reorder_proxy< F, _enum > p, \
size_t size, \ size_t size, F* to, F* from) { \
F* to, \
F* from) { \
_REORDER_BODY_(__VA_ARGS__) \ _REORDER_BODY_(__VA_ARGS__) \
} }
#if defined(HAVE_CUDA) #if defined(HAVE_CUDA)
@ -138,31 +161,9 @@ namespace cuda {
#define GO(__TO, __FROM) __TO += __FROM; #define GO(__TO, __FROM) __TO += __FROM;
#endif #endif
// These are just help structures
// to help with the templating of reorder
// function
enum reordering_t
{
IJK,
IKJ,
JIK,
JKI,
KIJ,
KJI
};
/*
* Please the c++ type checker and template creator
* in order to have an argument in the signature of
* the function that helps the compiler know which
* instantiation it should take.
*
*/
template <typename F, reordering_t R>
struct reorder_proxy {};
template <typename F, reordering_t R> template <typename F, reordering_t R>
__MAYBE_GLOBAL__ __MAYBE_GLOBAL__ \
void reorder(reorder_proxy<F, R> proxy, void reorder(reorder_proxy<F, R> proxy,
size_t size, F* to, F* from); size_t size, F* to, F* from);
@ -445,13 +446,9 @@ double getEnergySame
#if defined(ATRIP_USE_DGEMM) #if defined(ATRIP_USE_DGEMM)
#if defined(HAVE_CUDA) #if defined(HAVE_CUDA)
#define REORDER(__II, __JJ, __KK) \ #define REORDER(__II, __JJ, __KK) \
reorder<<<bs, ths>>>(reorder_proxy< \ reorder<<<bs, ths>>>(reorder_proxy<DataFieldType<F>, \
DataFieldType<F>, \ __II ## __JJ ## __KK >{}, \
__II ## __JJ ## __KK \ No, Tijk, _t_buffer);
>{}, \
No, \
Tijk, \
_t_buffer);
#define DGEMM_PARTICLES(__A, __B) \ #define DGEMM_PARTICLES(__A, __B) \
atrip::xgemm<F>("T", \ atrip::xgemm<F>("T", \
"N", \ "N", \
@ -465,7 +462,7 @@ double getEnergySame
(int const*)&Nv, \ (int const*)&Nv, \
&zero, \ &zero, \
_t_buffer, \ _t_buffer, \
(int const*)&NoNo) (int const*)&NoNo);
#define DGEMM_HOLES(__A, __B, __TRANSB) \ #define DGEMM_HOLES(__A, __B, __TRANSB) \
atrip::xgemm<F>("N", \ atrip::xgemm<F>("N", \
__TRANSB, \ __TRANSB, \
@ -480,7 +477,7 @@ double getEnergySame
&zero, \ &zero, \
_t_buffer, \ _t_buffer, \
(int const*)&NoNo \ (int const*)&NoNo \
) );
#define MAYBE_CONJ(_conj, _buffer) \ #define MAYBE_CONJ(_conj, _buffer) \
cuda::maybeConjugate<<< \ cuda::maybeConjugate<<< \
Atrip::kernelDimensions.ooo.blocks, \ Atrip::kernelDimensions.ooo.blocks, \
@ -515,7 +512,7 @@ double getEnergySame
&zero, \ &zero, \
_t_buffer, \ _t_buffer, \
(int const*)&NoNo \ (int const*)&NoNo \
) );
#define DGEMM_HOLES(__A, __B, __TRANSB) \ #define DGEMM_HOLES(__A, __B, __TRANSB) \
atrip::xgemm<F>("N", \ atrip::xgemm<F>("N", \
__TRANSB, \ __TRANSB, \
@ -530,7 +527,7 @@ double getEnergySame
&zero, \ &zero, \
_t_buffer, \ _t_buffer, \
(int const*)&NoNo \ (int const*)&NoNo \
) );
#define MAYBE_CONJ(_conj, _buffer) \ #define MAYBE_CONJ(_conj, _buffer) \
for (size_t __i = 0; __i < NoNoNo; ++__i) \ for (size_t __i = 0; __i < NoNoNo; ++__i) \
_conj[__i] = maybeConjugate<F>(_buffer[__i]); _conj[__i] = maybeConjugate<F>(_buffer[__i]);
@ -541,14 +538,8 @@ double getEnergySame
#ifdef HAVE_CUDA #ifdef HAVE_CUDA
DataFieldType<F>* _t_buffer; DataFieldType<F>* _t_buffer;
DataFieldType<F>* _vhhh; DataFieldType<F>* _vhhh;
WITH_CHRONO("double:cuda:alloc", cuMemAlloc((CUdeviceptr*)&_t_buffer, NoNoNo * sizeof(DataFieldType<F>));
_CHECK_CUDA_SUCCESS("Allocating _t_buffer", cuMemAlloc((CUdeviceptr*)&_vhhh, NoNoNo * sizeof(DataFieldType<F>));
cuMemAlloc((CUdeviceptr*)&_t_buffer,
NoNoNo * sizeof(DataFieldType<F>)));
_CHECK_CUDA_SUCCESS("Allocating _vhhh",
cuMemAlloc((CUdeviceptr*)&_vhhh,
NoNoNo * sizeof(DataFieldType<F>)));
)
const size_t const size_t
bs = Atrip::kernelDimensions.ooo.blocks, bs = Atrip::kernelDimensions.ooo.blocks,
ths = Atrip::kernelDimensions.ooo.threads; ths = Atrip::kernelDimensions.ooo.threads;
@ -569,6 +560,7 @@ double getEnergySame
WITH_CHRONO("double:reorder", WITH_CHRONO("double:reorder",
cuda::zeroing<<<bs, ths>>>((DataFieldType<F>*)Tijk, cuda::zeroing<<<bs, ths>>>((DataFieldType<F>*)Tijk,
NoNoNo); NoNoNo);
// synchronize all initializations to zero
) )
#else #else
WITH_CHRONO("double:reorder", WITH_CHRONO("double:reorder",
@ -583,36 +575,36 @@ double getEnergySame
// VhhhC[i + k*No + L*NoNo] * TABhh[L + j*No]; H1 // VhhhC[i + k*No + L*NoNo] * TABhh[L + j*No]; H1
MAYBE_CONJ(_vhhh, VhhhC) MAYBE_CONJ(_vhhh, VhhhC)
WITH_CHRONO("doubles:holes:1", WITH_CHRONO("doubles:holes:1",
DGEMM_HOLES(_vhhh, TABhh, "N"); DGEMM_HOLES(_vhhh, TABhh, "N")
REORDER(I, K, J) REORDER(I, K, J)
) )
// VhhhC[j + k*No + L*NoNo] * TABhh[i + L*No]; H0 // VhhhC[j + k*No + L*NoNo] * TABhh[i + L*No]; H0
WITH_CHRONO("doubles:holes:2", WITH_CHRONO("doubles:holes:2",
DGEMM_HOLES(_vhhh, TABhh, "T"); DGEMM_HOLES(_vhhh, TABhh, "T")
REORDER(J, K, I) REORDER(J, K, I)
) )
// VhhhB[i + j*No + L*NoNo] * TAChh[L + k*No]; H5 // VhhhB[i + j*No + L*NoNo] * TAChh[L + k*No]; H5
MAYBE_CONJ(_vhhh, VhhhB) MAYBE_CONJ(_vhhh, VhhhB)
WITH_CHRONO("doubles:holes:3", WITH_CHRONO("doubles:holes:3",
DGEMM_HOLES(_vhhh, TAChh, "N"); DGEMM_HOLES(_vhhh, TAChh, "N")
REORDER(I, J, K) REORDER(I, J, K)
) )
// VhhhB[k + j*No + L*NoNo] * TAChh[i + L*No]; H3 // VhhhB[k + j*No + L*NoNo] * TAChh[i + L*No]; H3
WITH_CHRONO("doubles:holes:4", WITH_CHRONO("doubles:holes:4",
DGEMM_HOLES(_vhhh, TAChh, "T"); DGEMM_HOLES(_vhhh, TAChh, "T")
REORDER(K, J, I) REORDER(K, J, I)
) )
// VhhhA[j + i*No + L*NoNo] * TBChh[L + k*No]; H1 // VhhhA[j + i*No + L*NoNo] * TBChh[L + k*No]; H1
MAYBE_CONJ(_vhhh, VhhhA) MAYBE_CONJ(_vhhh, VhhhA)
WITH_CHRONO("doubles:holes:5", WITH_CHRONO("doubles:holes:5",
DGEMM_HOLES(_vhhh, TBChh, "N"); DGEMM_HOLES(_vhhh, TBChh, "N")
REORDER(J, I, K) REORDER(J, I, K)
) )
// VhhhA[k + i*No + L*NoNo] * TBChh[j + L*No]; H4 // VhhhA[k + i*No + L*NoNo] * TBChh[j + L*No]; H4
WITH_CHRONO("doubles:holes:6", WITH_CHRONO("doubles:holes:6",
DGEMM_HOLES(_vhhh, TBChh, "T"); DGEMM_HOLES(_vhhh, TBChh, "T")
REORDER(K, I, J) REORDER(K, I, J)
) )
} }
@ -624,32 +616,32 @@ double getEnergySame
{ {
// TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv]; P0 // TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv]; P0
WITH_CHRONO("doubles:particles:1", WITH_CHRONO("doubles:particles:1",
DGEMM_PARTICLES(TAphh, VBCph); DGEMM_PARTICLES(TAphh, VBCph)
REORDER(I, J, K) REORDER(I, J, K)
) )
// TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv]; P3 // TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv]; P3
WITH_CHRONO("doubles:particles:2", WITH_CHRONO("doubles:particles:2",
DGEMM_PARTICLES(TAphh, VCBph); DGEMM_PARTICLES(TAphh, VCBph)
REORDER(I, K, J) REORDER(I, K, J)
) )
// TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv]; P5 // TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv]; P5
WITH_CHRONO("doubles:particles:3", WITH_CHRONO("doubles:particles:3",
DGEMM_PARTICLES(TCphh, VABph); DGEMM_PARTICLES(TCphh, VABph)
REORDER(K, I, J) REORDER(K, I, J)
) )
// TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv]; P2 // TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv]; P2
WITH_CHRONO("doubles:particles:4", WITH_CHRONO("doubles:particles:4",
DGEMM_PARTICLES(TCphh, VBAph); DGEMM_PARTICLES(TCphh, VBAph)
REORDER(K, J, I) REORDER(K, J, I)
) )
// TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv]; P1 // TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv]; P1
WITH_CHRONO("doubles:particles:5", WITH_CHRONO("doubles:particles:5",
DGEMM_PARTICLES(TBphh, VACph); DGEMM_PARTICLES(TBphh, VACph)
REORDER(J, I, K) REORDER(J, I, K)
) )
// TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv]; P4 // TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv]; P4
WITH_CHRONO("doubles:particles:6", WITH_CHRONO("doubles:particles:6",
DGEMM_PARTICLES(TBphh, VCAph); DGEMM_PARTICLES(TBphh, VCAph)
REORDER(J, K, I) REORDER(J, K, I)
) )
} }
@ -659,12 +651,9 @@ double getEnergySame
#ifdef HAVE_CUDA #ifdef HAVE_CUDA
// we need to synchronize here since we need // we need to synchronize here since we need
// the Tijk for next process in the pipeline // the Tijk for next process in the pipeline
_CHECK_CUDA_SUCCESS("Synchronizing", cuCtxSynchronize();
cuCtxSynchronize()); cuMemFree((CUdeviceptr)_vhhh);
_CHECK_CUDA_SUCCESS("Freeing _vhhh", cuMemFree((CUdeviceptr)_t_buffer);
cuMemFree((CUdeviceptr)_vhhh));
_CHECK_CUDA_SUCCESS("Freeing _t_buffer",
cuMemFree((CUdeviceptr)_t_buffer));
#else #else
free(_vhhh); free(_vhhh);
free(_t_buffer); free(_t_buffer);