Compare commits
No commits in common. "20c29ed815ff161966d5c6ee08dcd9b58984a36a" and "4277c07cc22790fbc4e3fa4ea76ddb9dd494e586" have entirely different histories.
20c29ed815
...
4277c07cc2
@ -1,4 +1,3 @@
|
|||||||
AUTOMAKE_OPTIONS = subdir-objects
|
|
||||||
include $(top_srcdir)/atrip.mk
|
include $(top_srcdir)/atrip.mk
|
||||||
|
|
||||||
AM_CPPFLAGS = -I$(top_srcdir)/include/ $(CTF_CPPFLAGS)
|
AM_CPPFLAGS = -I$(top_srcdir)/include/ $(CTF_CPPFLAGS)
|
||||||
@ -20,13 +19,4 @@ endif
|
|||||||
if WITH_CUDA
|
if WITH_CUDA
|
||||||
test_main_CXXFLAGS = $(CUDA_CXXFLAGS)
|
test_main_CXXFLAGS = $(CUDA_CXXFLAGS)
|
||||||
test_main_LDADD += $(CUDA_LDFLAGS)
|
test_main_LDADD += $(CUDA_LDFLAGS)
|
||||||
|
|
||||||
AM_CXXFLAGS = $(CUDA_CXXFLAGS)
|
|
||||||
AM_LDFLAGS += $(CUDA_LDFLAGS)
|
|
||||||
|
|
||||||
bin_PROGRAMS += test-cublas-parallel-atrip
|
|
||||||
test_cublas_parallel_SOURCES = test-cublas-parallel-atrip.cxx
|
|
||||||
|
|
||||||
bin_PROGRAMS += test-cuda-sanity
|
|
||||||
test_cuda_sanity_SOURCES = test-cuda-sanity.cxx
|
|
||||||
endif
|
endif
|
||||||
|
|||||||
@ -1,239 +0,0 @@
|
|||||||
#include <mpi.h>
|
|
||||||
#include <iostream>
|
|
||||||
#include <cassert>
|
|
||||||
#include <string.h>
|
|
||||||
#include <sstream>
|
|
||||||
#include <string>
|
|
||||||
#include <vector>
|
|
||||||
#include <cuda.h>
|
|
||||||
#include <chrono>
|
|
||||||
#include <map>
|
|
||||||
#include <cstdlib>
|
|
||||||
#include <unistd.h>
|
|
||||||
|
|
||||||
#include <CLI11.hpp>
|
|
||||||
|
|
||||||
#define CUBLASAPI
|
|
||||||
#include <cublas_api.h>
|
|
||||||
#include <cublas_v2.h>
|
|
||||||
|
|
||||||
struct Timer {
|
|
||||||
using Clock = std::chrono::high_resolution_clock;
|
|
||||||
using Event = std::chrono::time_point<Clock>;
|
|
||||||
std::chrono::duration<double> duration;
|
|
||||||
Event _start;
|
|
||||||
inline void start() noexcept { _start = Clock::now(); }
|
|
||||||
inline void stop() noexcept { duration += Clock::now() - _start; }
|
|
||||||
inline void clear() noexcept { duration *= 0; }
|
|
||||||
inline double count() const noexcept { return duration.count(); }
|
|
||||||
};
|
|
||||||
using Timings = std::map<std::string, Timer>;
|
|
||||||
|
|
||||||
#define _FORMAT(_fmt, ...) \
|
|
||||||
([&] (void) -> std::string { \
|
|
||||||
int _sz = std::snprintf(nullptr, 0, _fmt, __VA_ARGS__); \
|
|
||||||
std::vector<char> _out(_sz + 1); \
|
|
||||||
std::snprintf(&_out[0], _out.size(), _fmt, __VA_ARGS__); \
|
|
||||||
return std::string(_out.data()); \
|
|
||||||
})()
|
|
||||||
|
|
||||||
#define _CHECK_CUDA_SUCCESS(message, ...) \
|
|
||||||
do { \
|
|
||||||
CUresult result = __VA_ARGS__; \
|
|
||||||
printf("doing %s\n", message); \
|
|
||||||
if (result != CUDA_SUCCESS) { \
|
|
||||||
printf("\t!!CUDA_ERROR(%d): %s:%d %s\n", \
|
|
||||||
result, \
|
|
||||||
__FILE__, \
|
|
||||||
__LINE__, \
|
|
||||||
message); \
|
|
||||||
return 1; \
|
|
||||||
} \
|
|
||||||
} while (0)
|
|
||||||
|
|
||||||
#define _CHECK_CUBLAS_SUCCESS(message, ...) \
|
|
||||||
do { \
|
|
||||||
cublasStatus_t result = __VA_ARGS__; \
|
|
||||||
if (result != 0) { \
|
|
||||||
printf("\t!!CUBLAS_ERROR(%d): %s:%d %s\n", \
|
|
||||||
result, \
|
|
||||||
__FILE__, \
|
|
||||||
__LINE__, \
|
|
||||||
message); \
|
|
||||||
return 1; \
|
|
||||||
} \
|
|
||||||
} while (0)
|
|
||||||
|
|
||||||
int main(int argc, char** argv) {
|
|
||||||
|
|
||||||
using std::vector;
|
|
||||||
|
|
||||||
MPI_Init(NULL, NULL);
|
|
||||||
int rank, np, ngcards;
|
|
||||||
size_t no(10), nv(no * 10), its(2);
|
|
||||||
bool barrier = false;
|
|
||||||
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
|
|
||||||
MPI_Comm_size(MPI_COMM_WORLD, &np);
|
|
||||||
|
|
||||||
CLI::App app{"Main bench for atrip"};
|
|
||||||
app.add_option("--no", no, "Occupied orbitals");
|
|
||||||
app.add_option("--nv", nv, "Virtual orbitals");
|
|
||||||
app.add_option("--its", its, "Number of iterations to be done");
|
|
||||||
app.add_option("--barrier", barrier, "Call a MPI_Barrier in every iteration?");
|
|
||||||
CLI11_PARSE(app, argc, argv);
|
|
||||||
|
|
||||||
const size_t oo = no * no, ooo = no * oo;
|
|
||||||
|
|
||||||
Timings timings;
|
|
||||||
|
|
||||||
MPI_Barrier(MPI_COMM_WORLD);
|
|
||||||
|
|
||||||
_CHECK_CUDA_SUCCESS("init for cuda",
|
|
||||||
cuInit(0));
|
|
||||||
|
|
||||||
_CHECK_CUDA_SUCCESS("get ncards",
|
|
||||||
cuDeviceGetCount(&ngcards));
|
|
||||||
|
|
||||||
CUcontext ctx;
|
|
||||||
CUdevice dev;
|
|
||||||
|
|
||||||
char hostname[256];
|
|
||||||
gethostname(hostname, 256);
|
|
||||||
printf("%s with rank %d gets card %d\n",
|
|
||||||
hostname,
|
|
||||||
rank,
|
|
||||||
rank % ngcards);
|
|
||||||
|
|
||||||
// set contexts
|
|
||||||
_CHECK_CUDA_SUCCESS("device get", cuDeviceGet(&dev, rank % ngcards));
|
|
||||||
_CHECK_CUDA_SUCCESS("creating context", cuCtxCreate(&ctx, 0, dev));
|
|
||||||
_CHECK_CUDA_SUCCESS("setting context", cuCtxSetCurrent(ctx));
|
|
||||||
_CHECK_CUDA_SUCCESS("synchronizing", cuCtxSynchronize());
|
|
||||||
MPI_Barrier(MPI_COMM_WORLD);
|
|
||||||
|
|
||||||
using host_slice_t = vector<double>;
|
|
||||||
|
|
||||||
vector<size_t> sizes = {nv * oo, nv * no , oo, oo * no, oo * no};
|
|
||||||
vector<CUdeviceptr> P_phh(3), P_ph(6) , H_hh(3), H_hhh(3), T_hhh(1);
|
|
||||||
vector<vector<CUdeviceptr>*> slices_d = {&P_phh, &P_ph , &H_hh, &H_hhh, &T_hhh};
|
|
||||||
vector<vector<host_slice_t>> slices_h(slices_d.size());
|
|
||||||
{
|
|
||||||
int i = -1;
|
|
||||||
for (auto& v: slices_d) {
|
|
||||||
i++;
|
|
||||||
for (auto& ptr: *v) {
|
|
||||||
_CHECK_CUDA_SUCCESS("malloc",
|
|
||||||
cuMemAlloc(&ptr,
|
|
||||||
sizes[i] * sizeof(double)));
|
|
||||||
slices_h[i].push_back(std::move(std::vector<double>(sizes[i])));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
const double one = 1.0, zero = 0.0;
|
|
||||||
|
|
||||||
printf("its: %d\n", its);
|
|
||||||
printf("barrier: %d\n", barrier);
|
|
||||||
printf("no: %ld\n", no);
|
|
||||||
printf("nv: %ld\n", nv);
|
|
||||||
printf("SIZE %f GB\n", (3 * nv * oo
|
|
||||||
+ 6 * oo * nv
|
|
||||||
+ 3 * oo
|
|
||||||
+ 3 * ooo
|
|
||||||
+ 1 * ooo
|
|
||||||
) * sizeof(double) / 1024.0 / 1024.0 / 1024.0);
|
|
||||||
std::map<std::string, double> tflopss
|
|
||||||
{{ "dgemm", ooo * (no + nv) * 6.0 * 2.0 * its / 1e12},
|
|
||||||
{ "holes", ooo * no * 6.0 * 2.0 * its / 1e12},
|
|
||||||
{ "particles", ooo * nv * 6.0 * 2.0 * its / 1e12}};
|
|
||||||
|
|
||||||
cublasHandle_t handle;
|
|
||||||
_CHECK_CUBLAS_SUCCESS("handle create", cublasCreate(&handle));
|
|
||||||
printf("handle %ld\n", handle);
|
|
||||||
|
|
||||||
timings["dgemm"].start();
|
|
||||||
for (size_t i = 0; i < its; i++) {
|
|
||||||
|
|
||||||
if (barrier) {
|
|
||||||
MPI_Barrier(MPI_COMM_WORLD);
|
|
||||||
timings["memcpy"].start();
|
|
||||||
for (size_t _s = 0; _s < slices_d.size(); _s++) {
|
|
||||||
// for (size_t _b = 0; _b < slices_h[_s].size(); _b++) {
|
|
||||||
for (size_t _b = 0; _b < 1 ; _b++) {
|
|
||||||
auto device = (*slices_d[_s])[_b];
|
|
||||||
auto host = slices_h[_s][_b].data();
|
|
||||||
cuMemcpyHtoD(device, host, sizes[_s]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
timings["memcpy"].stop();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
timings["holes"].start();
|
|
||||||
for (size_t j = 0; j < 3; j++) {
|
|
||||||
|
|
||||||
_CHECK_CUBLAS_SUCCESS(" > 'geming ...",
|
|
||||||
cublasDgemm(handle,
|
|
||||||
CUBLAS_OP_N,
|
|
||||||
CUBLAS_OP_N,
|
|
||||||
oo, no, no,
|
|
||||||
&one,
|
|
||||||
(double*)H_hhh[j], oo,
|
|
||||||
(double*)H_hh[j], no,
|
|
||||||
&zero,
|
|
||||||
(double*)T_hhh[0], oo));
|
|
||||||
|
|
||||||
_CHECK_CUBLAS_SUCCESS(" > 'geming ...",
|
|
||||||
cublasDgemm(handle,
|
|
||||||
CUBLAS_OP_N,
|
|
||||||
CUBLAS_OP_T,
|
|
||||||
oo, no, no,
|
|
||||||
&one,
|
|
||||||
(double*)H_hhh[j], oo,
|
|
||||||
(double*)H_hh[j], no,
|
|
||||||
&zero,
|
|
||||||
(double*)T_hhh[0], oo));
|
|
||||||
|
|
||||||
|
|
||||||
}
|
|
||||||
timings["holes"].stop();
|
|
||||||
|
|
||||||
timings["particles"].start();
|
|
||||||
for (size_t j = 0; j < 6; j++) {
|
|
||||||
_CHECK_CUBLAS_SUCCESS(" > 'geming ...",
|
|
||||||
cublasDgemm(handle,
|
|
||||||
CUBLAS_OP_T,
|
|
||||||
CUBLAS_OP_N,
|
|
||||||
oo, no, nv,
|
|
||||||
&one,
|
|
||||||
(double*)P_phh[j % 3], nv,
|
|
||||||
(double*)P_ph[j], nv,
|
|
||||||
&zero,
|
|
||||||
(double*)T_hhh[0], oo));
|
|
||||||
}
|
|
||||||
timings["particles"].stop();
|
|
||||||
|
|
||||||
cuCtxSynchronize();
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
timings["dgemm"].stop();
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
printf("Performance: \n");
|
|
||||||
for (auto name: {"holes", "particles", "dgemm"})
|
|
||||||
printf("%10s TFlops: %4.1f\n",
|
|
||||||
name,
|
|
||||||
tflopss[name]
|
|
||||||
/ timings[name].count());
|
|
||||||
|
|
||||||
printf("Timings: \n");
|
|
||||||
for (auto const& kv: timings)
|
|
||||||
printf("%10s: %10f\n", kv.first.c_str(), kv.second.count());
|
|
||||||
|
|
||||||
MPI_Finalize();
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -1,163 +0,0 @@
|
|||||||
#include <mpi.h>
|
|
||||||
#include <iostream>
|
|
||||||
#include <cassert>
|
|
||||||
#include <string.h>
|
|
||||||
#include <sstream>
|
|
||||||
#include <string>
|
|
||||||
#include <vector>
|
|
||||||
#include <cuda.h>
|
|
||||||
#include <chrono>
|
|
||||||
#include <map>
|
|
||||||
#include <cstdlib>
|
|
||||||
#include <unistd.h>
|
|
||||||
|
|
||||||
#define CUBLASAPI
|
|
||||||
#include <cublas_api.h>
|
|
||||||
#include <cublas_v2.h>
|
|
||||||
|
|
||||||
struct Timer {
|
|
||||||
using Clock = std::chrono::high_resolution_clock;
|
|
||||||
using Event = std::chrono::time_point<Clock>;
|
|
||||||
std::chrono::duration<double> duration;
|
|
||||||
Event _start;
|
|
||||||
inline void start() noexcept { _start = Clock::now(); }
|
|
||||||
inline void stop() noexcept { duration += Clock::now() - _start; }
|
|
||||||
inline void clear() noexcept { duration *= 0; }
|
|
||||||
inline double count() const noexcept { return duration.count(); }
|
|
||||||
};
|
|
||||||
using Timings = std::map<std::string, Timer>;
|
|
||||||
|
|
||||||
#define _FORMAT(_fmt, ...) \
|
|
||||||
([&] (void) -> std::string { \
|
|
||||||
int _sz = std::snprintf(nullptr, 0, _fmt, __VA_ARGS__); \
|
|
||||||
std::vector<char> _out(_sz + 1); \
|
|
||||||
std::snprintf(&_out[0], _out.size(), _fmt, __VA_ARGS__); \
|
|
||||||
return std::string(_out.data()); \
|
|
||||||
})()
|
|
||||||
|
|
||||||
#define _CHECK_CUDA_SUCCESS(message, ...) \
|
|
||||||
do { \
|
|
||||||
CUresult result = __VA_ARGS__; \
|
|
||||||
printf("doing %s\n", message); \
|
|
||||||
if (result != CUDA_SUCCESS) { \
|
|
||||||
printf("\t!!CUDA_ERROR(%d): %s:%d %s\n", \
|
|
||||||
result, \
|
|
||||||
__FILE__, \
|
|
||||||
__LINE__, \
|
|
||||||
message); \
|
|
||||||
return 1; \
|
|
||||||
} \
|
|
||||||
} while (0)
|
|
||||||
|
|
||||||
#define _CHECK_CUBLAS_SUCCESS(message, ...) \
|
|
||||||
do { \
|
|
||||||
cublasStatus_t result = __VA_ARGS__; \
|
|
||||||
printf("CUBLAS: doing %s\n", message); \
|
|
||||||
if (result != 0) { \
|
|
||||||
printf("\t!!CUBLAS_ERROR(%d): %s:%d %s\n", \
|
|
||||||
result, \
|
|
||||||
__FILE__, \
|
|
||||||
__LINE__, \
|
|
||||||
message); \
|
|
||||||
return 1; \
|
|
||||||
} \
|
|
||||||
} while (0)
|
|
||||||
|
|
||||||
int main(int argc, char** argv) {
|
|
||||||
MPI_Init(NULL, NULL);
|
|
||||||
int rank, np, ngcards;
|
|
||||||
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
|
|
||||||
MPI_Comm_size(MPI_COMM_WORLD, &np);
|
|
||||||
|
|
||||||
Timings timings;
|
|
||||||
|
|
||||||
MPI_Barrier(MPI_COMM_WORLD);
|
|
||||||
|
|
||||||
_CHECK_CUDA_SUCCESS("init for cuda",
|
|
||||||
cuInit(0));
|
|
||||||
|
|
||||||
_CHECK_CUDA_SUCCESS("get ncards",
|
|
||||||
cuDeviceGetCount(&ngcards));
|
|
||||||
|
|
||||||
|
|
||||||
const size_t N
|
|
||||||
= argc > 1
|
|
||||||
? atoi(argv[1])
|
|
||||||
: 30000
|
|
||||||
, dgemms
|
|
||||||
= argc > 2
|
|
||||||
? atoi(argv[2])
|
|
||||||
: 2
|
|
||||||
, flops = 2 * N * N * N * dgemms
|
|
||||||
;
|
|
||||||
|
|
||||||
CUcontext ctx;
|
|
||||||
CUdevice dev;
|
|
||||||
|
|
||||||
char hostname[256];
|
|
||||||
gethostname(hostname, 256);
|
|
||||||
printf("%s with rank %d gets card %d\n",
|
|
||||||
hostname,
|
|
||||||
rank,
|
|
||||||
rank % ngcards);
|
|
||||||
|
|
||||||
// set contexts
|
|
||||||
_CHECK_CUDA_SUCCESS("device get",
|
|
||||||
cuDeviceGet(&dev, rank % ngcards));
|
|
||||||
_CHECK_CUDA_SUCCESS("creating context",
|
|
||||||
cuCtxCreate(&ctx, 0, dev));
|
|
||||||
_CHECK_CUDA_SUCCESS("setting context",
|
|
||||||
cuCtxSetCurrent(ctx));
|
|
||||||
_CHECK_CUDA_SUCCESS("synchronizing",
|
|
||||||
cuCtxSynchronize());
|
|
||||||
MPI_Barrier(MPI_COMM_WORLD);
|
|
||||||
|
|
||||||
CUdeviceptr A, B, C;
|
|
||||||
const double one = 1.0;
|
|
||||||
printf("SIZE %f GB\n", N * N * sizeof(double) / 1024.0 / 1024.0 / 1024.0);
|
|
||||||
_CHECK_CUDA_SUCCESS("A", cuMemAlloc(&A, N * N * sizeof(double)));
|
|
||||||
_CHECK_CUDA_SUCCESS("B", cuMemAlloc(&B, N * N * sizeof(double)));
|
|
||||||
_CHECK_CUDA_SUCCESS("C", cuMemAlloc(&C, N * N * sizeof(double)));
|
|
||||||
|
|
||||||
cublasHandle_t handle;
|
|
||||||
cublasStatus_t stat;
|
|
||||||
_CHECK_CUBLAS_SUCCESS("handle create", cublasCreate(&handle));
|
|
||||||
printf("handle %ld\n", handle);
|
|
||||||
|
|
||||||
timings["dgemm"].start();
|
|
||||||
for (size_t i = 0; i < dgemms; i++) {
|
|
||||||
_CHECK_CUBLAS_SUCCESS(_FORMAT(" > 'geming %ld ...", i).c_str(),
|
|
||||||
cublasDgemm(handle,
|
|
||||||
CUBLAS_OP_N,
|
|
||||||
CUBLAS_OP_N,
|
|
||||||
N, N, N,
|
|
||||||
&one,
|
|
||||||
(double*)A, N,
|
|
||||||
(double*)B, N,
|
|
||||||
&one,
|
|
||||||
(double*)C, N));
|
|
||||||
}
|
|
||||||
|
|
||||||
cuCtxSynchronize();
|
|
||||||
timings["dgemm"].stop();
|
|
||||||
|
|
||||||
|
|
||||||
printf("dgemm Gflops: %f\n",
|
|
||||||
flops
|
|
||||||
/ timings["dgemm"].count()
|
|
||||||
/ 1024.0
|
|
||||||
/ 1024.0
|
|
||||||
/ 1024.0);
|
|
||||||
|
|
||||||
MPI_Finalize();
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Local Variables:
|
|
||||||
// compile-command: "mpic++ \
|
|
||||||
// -pedantic -std=c++11 \
|
|
||||||
// -L./cudaroot/lib64 -lcuda \
|
|
||||||
// -L./cudaroot/lib64 -lcudart \
|
|
||||||
// -L./cudaroot/lib64 -lcublas \
|
|
||||||
// ./test-cublas-parallel.cxx -o test-cublas-parallel"
|
|
||||||
// End:
|
|
||||||
@ -1,104 +0,0 @@
|
|||||||
#include <mpi.h>
|
|
||||||
#include <iostream>
|
|
||||||
#include <cassert>
|
|
||||||
#include <string.h>
|
|
||||||
#include <sstream>
|
|
||||||
#include <string>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
#include <cuda.h>
|
|
||||||
|
|
||||||
#define _CHECK_CUDA_SUCCESS(message, ...) \
|
|
||||||
do { \
|
|
||||||
CUresult result = __VA_ARGS__; \
|
|
||||||
printf("doing %s\n", message); \
|
|
||||||
if (result != CUDA_SUCCESS) { \
|
|
||||||
printf("\t!!CUDA_ERROR(%d): %s:%d %s\n", \
|
|
||||||
result, \
|
|
||||||
__FILE__, \
|
|
||||||
__LINE__, \
|
|
||||||
message); \
|
|
||||||
return 1; \
|
|
||||||
} \
|
|
||||||
} while (0)
|
|
||||||
|
|
||||||
int main() {
|
|
||||||
int rank, np, ngcards;
|
|
||||||
MPI_Init(NULL, NULL);
|
|
||||||
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
|
|
||||||
MPI_Comm_size(MPI_COMM_WORLD, &np);
|
|
||||||
|
|
||||||
_CHECK_CUDA_SUCCESS("init for cuda",
|
|
||||||
cuInit(0));
|
|
||||||
|
|
||||||
_CHECK_CUDA_SUCCESS("get ncards",
|
|
||||||
cuDeviceGetCount(&ngcards));
|
|
||||||
|
|
||||||
|
|
||||||
for (size_t rank = 0; rank < ngcards; rank++) {
|
|
||||||
CUcontext ctx;
|
|
||||||
CUdevice dev;
|
|
||||||
CUdevprop_st prop;
|
|
||||||
size_t _free, total, total2;
|
|
||||||
char *name = (char*)malloc(256);
|
|
||||||
|
|
||||||
printf("Setting contexts\n");
|
|
||||||
// set contexts
|
|
||||||
_CHECK_CUDA_SUCCESS("device get",
|
|
||||||
cuDeviceGet(&dev, rank));
|
|
||||||
_CHECK_CUDA_SUCCESS("creating context",
|
|
||||||
cuCtxCreate(&ctx, 0, dev));
|
|
||||||
_CHECK_CUDA_SUCCESS("setting context",
|
|
||||||
cuCtxSetCurrent(ctx));
|
|
||||||
_CHECK_CUDA_SUCCESS("synchronizing",
|
|
||||||
cuCtxSynchronize());
|
|
||||||
|
|
||||||
_CHECK_CUDA_SUCCESS("prop get",
|
|
||||||
cuDeviceGetProperties(&prop, dev));
|
|
||||||
_CHECK_CUDA_SUCCESS("meminfo get",
|
|
||||||
cuMemGetInfo(&_free, &total));
|
|
||||||
_CHECK_CUDA_SUCCESS("name get",
|
|
||||||
cuDeviceGetName(name, 256, dev));
|
|
||||||
_CHECK_CUDA_SUCCESS("totalmem get",
|
|
||||||
cuDeviceTotalMem(&total2, dev));
|
|
||||||
|
|
||||||
printf("\n"
|
|
||||||
"CUDA CARD RANK %d\n"
|
|
||||||
"=================\n"
|
|
||||||
"\tname: %s\n"
|
|
||||||
"\tShared Mem Per Block (KB): %f\n"
|
|
||||||
"\tFree/Total mem (GB): %f/%f\n"
|
|
||||||
"\total2 mem (GB): %f\n"
|
|
||||||
"\n",
|
|
||||||
dev,
|
|
||||||
name,
|
|
||||||
prop.sharedMemPerBlock / 1024.0,
|
|
||||||
_free / 1024.0 / 1024.0 / 1024.0 ,
|
|
||||||
total / 1024.0 / 1024.0 / 1024.0 ,
|
|
||||||
total2 / 1024.0 / 1024.0 / 1024.0
|
|
||||||
);
|
|
||||||
|
|
||||||
if (_free == 0 || total == 0 || total2 == 0)
|
|
||||||
return 1;
|
|
||||||
|
|
||||||
CUdeviceptr data;
|
|
||||||
_CHECK_CUDA_SUCCESS("memalloc 1",
|
|
||||||
cuMemAlloc(&data, sizeof(double) * 10000));
|
|
||||||
_CHECK_CUDA_SUCCESS("memalloc 2",
|
|
||||||
cuMemAlloc(&data, sizeof(double) * 10000));
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
MPI_Finalize();
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Local Variables:
|
|
||||||
// compile-command: "mpic++ \
|
|
||||||
// -pedantic -std=c++11 \
|
|
||||||
// -L./cudaroot/lib64 -lcuda \
|
|
||||||
// -L./cudaroot/lib64 -lcudart \
|
|
||||||
// -L./cudaroot/lib64 -lcublas \
|
|
||||||
// ./mem.cxx -o mem"
|
|
||||||
// End:
|
|
||||||
@ -142,10 +142,8 @@ int main(int argc, char** argv) {
|
|||||||
+ /* tai */ f * nranks * no * nv
|
+ /* tai */ f * nranks * no * nv
|
||||||
;
|
;
|
||||||
|
|
||||||
if (rank == 0) {
|
if (atrip::Atrip::rank == 0)
|
||||||
std::cout << "Tentative MEMORY USAGE (GB): "
|
std::cout << "Tentative MEMORY USAGE: " << atrip_memory << "\n";
|
||||||
<< double(atrip_memory) / 1024.0 / 1024.0 / 1024.0 << "\n";
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
std::vector<int> symmetries(4, NS)
|
std::vector<int> symmetries(4, NS)
|
||||||
|
|||||||
@ -1,7 +1,5 @@
|
|||||||
#!/usr/bin/env bash
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
cd `git rev-parse --show-toplevel`
|
|
||||||
|
|
||||||
type -a autoreconf > /dev/null ||
|
type -a autoreconf > /dev/null ||
|
||||||
{
|
{
|
||||||
cat <<EOF && exit
|
cat <<EOF && exit
|
||||||
|
|||||||
@ -1,13 +1,5 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <atrip/Utils.hpp>
|
|
||||||
|
|
||||||
#if defined(HAVE_CUDA)
|
|
||||||
#include <cuda.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#include <sstream>
|
|
||||||
|
|
||||||
#if defined(HAVE_CUDA) && defined(__CUDACC__)
|
#if defined(HAVE_CUDA) && defined(__CUDACC__)
|
||||||
# define __MAYBE_GLOBAL__ __global__
|
# define __MAYBE_GLOBAL__ __global__
|
||||||
# define __MAYBE_DEVICE__ __device__
|
# define __MAYBE_DEVICE__ __device__
|
||||||
@ -15,31 +7,3 @@
|
|||||||
# define __MAYBE_GLOBAL__
|
# define __MAYBE_GLOBAL__
|
||||||
# define __MAYBE_DEVICE__
|
# define __MAYBE_DEVICE__
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define _CHECK_CUDA_SUCCESS(message, ...) \
|
|
||||||
do { \
|
|
||||||
CUresult result = __VA_ARGS__; \
|
|
||||||
if (result != CUDA_SUCCESS) { \
|
|
||||||
auto msg = _FORMAT("\t!!CUDA_ERROR(%d): %s:%d\v%s", \
|
|
||||||
result, \
|
|
||||||
__FILE__, \
|
|
||||||
__LINE__, \
|
|
||||||
message); \
|
|
||||||
std::cerr << msg; \
|
|
||||||
throw msg; \
|
|
||||||
} \
|
|
||||||
} while (0)
|
|
||||||
|
|
||||||
#define _CHECK_CUBLAS_SUCCESS(message, ...) \
|
|
||||||
do { \
|
|
||||||
cublasStatus_t result = __VA_ARGS__; \
|
|
||||||
if (result != 0) { \
|
|
||||||
auto msg = _FORMAT("\t!!CUBLAS_ERROR(%d): %s:%d\v%s", \
|
|
||||||
result, \
|
|
||||||
__FILE__, \
|
|
||||||
__LINE__, \
|
|
||||||
message); \
|
|
||||||
std::cerr << msg; \
|
|
||||||
throw msg; \
|
|
||||||
} \
|
|
||||||
} while (0)
|
|
||||||
|
|||||||
@ -21,7 +21,6 @@
|
|||||||
|
|
||||||
#include <atrip/Tuples.hpp>
|
#include <atrip/Tuples.hpp>
|
||||||
#include <atrip/Utils.hpp>
|
#include <atrip/Utils.hpp>
|
||||||
#include <atrip/CUDA.hpp>
|
|
||||||
|
|
||||||
namespace atrip {
|
namespace atrip {
|
||||||
|
|
||||||
@ -459,11 +458,7 @@ void unwrapAndMarkReady() {
|
|||||||
#if defined(HAVE_CUDA)
|
#if defined(HAVE_CUDA)
|
||||||
// copy the retrieved mpi data to the device
|
// copy the retrieved mpi data to the device
|
||||||
WITH_CHRONO("cuda:memcpy",
|
WITH_CHRONO("cuda:memcpy",
|
||||||
_CHECK_CUDA_SUCCESS("copying mpi data to device",
|
cuMemcpyHtoD(data, (void*)mpi_data, sizeof(F) * size);)
|
||||||
cuMemcpyHtoD(data,
|
|
||||||
(void*)mpi_data,
|
|
||||||
sizeof(F) * size));
|
|
||||||
)
|
|
||||||
std::free(mpi_data);
|
std::free(mpi_data);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|||||||
@ -195,28 +195,7 @@ template <typename F=double>
|
|||||||
;
|
;
|
||||||
if (blank.info.state == Slice<F>::SelfSufficient) {
|
if (blank.info.state == Slice<F>::SelfSufficient) {
|
||||||
#if defined(HAVE_CUDA)
|
#if defined(HAVE_CUDA)
|
||||||
const size_t _size = sizeof(F) * sources[from.source].size();
|
blank.mpi_data = sources[from.source].data();
|
||||||
// TODO: this is code duplication with downstairs
|
|
||||||
if (freePointers.size() == 0) {
|
|
||||||
std::stringstream stream;
|
|
||||||
stream << "No more free pointers "
|
|
||||||
<< "for type " << type
|
|
||||||
<< " and name " << name
|
|
||||||
;
|
|
||||||
throw std::domain_error(stream.str());
|
|
||||||
}
|
|
||||||
auto dataPointer = freePointers.begin();
|
|
||||||
freePointers.erase(dataPointer);
|
|
||||||
blank.data = *dataPointer;
|
|
||||||
//
|
|
||||||
//
|
|
||||||
// TODO [#A]: do cuMemcpy of
|
|
||||||
// sources[from.source].data() ⇒ blank.data
|
|
||||||
// Do this when everything else is working.
|
|
||||||
// This will probably be a bottleneck of the H-to-D communication,
|
|
||||||
// as most slices are SelfSufficient.
|
|
||||||
//
|
|
||||||
//
|
|
||||||
#else
|
#else
|
||||||
blank.data = sources[from.source].data();
|
blank.data = sources[from.source].data();
|
||||||
#endif
|
#endif
|
||||||
@ -337,16 +316,6 @@ template <typename F=double>
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(HAVE_CUDA)
|
|
||||||
// In cuda, SelfSufficient slices have an ad-hoc pointer
|
|
||||||
// since it is a pointer on the device and has to be
|
|
||||||
// brought back to the free pointer bucket of the SliceUnion.
|
|
||||||
// Therefore, only in the Recycled case it should not be
|
|
||||||
// put back the pointer.
|
|
||||||
if (slice.info.state == Slice<F>::Recycled) {
|
|
||||||
freeSlicePointer = false;
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
// if the slice is self sufficient, do not dare touching the
|
// if the slice is self sufficient, do not dare touching the
|
||||||
// pointer, since it is a pointer to our sources in our rank.
|
// pointer, since it is a pointer to our sources in our rank.
|
||||||
if ( slice.info.state == Slice<F>::SelfSufficient
|
if ( slice.info.state == Slice<F>::SelfSufficient
|
||||||
@ -354,7 +323,6 @@ template <typename F=double>
|
|||||||
) {
|
) {
|
||||||
freeSlicePointer = false;
|
freeSlicePointer = false;
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
|
||||||
// make sure we get its data pointer to be used later
|
// make sure we get its data pointer to be used later
|
||||||
// only for non-recycled, since it can be that we need
|
// only for non-recycled, since it can be that we need
|
||||||
@ -378,7 +346,7 @@ template <typename F=double>
|
|||||||
// << " info " << slice.info
|
// << " info " << slice.info
|
||||||
<< "\n";
|
<< "\n";
|
||||||
slice.free();
|
slice.free();
|
||||||
} // we did not find the slice
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -412,17 +380,10 @@ template <typename F=double>
|
|||||||
|
|
||||||
for (auto& ptr: sliceBuffers) {
|
for (auto& ptr: sliceBuffers) {
|
||||||
#if defined(HAVE_CUDA)
|
#if defined(HAVE_CUDA)
|
||||||
const CUresult error =
|
cuMemAlloc(&ptr, sizeof(F) * sources[0].size());
|
||||||
cuMemAlloc(&ptr, sizeof(F) * sources[0].size());
|
if (ptr == 0UL) {
|
||||||
if (ptr == 0UL) {
|
throw "UNSUFICCIENT MEMORY ON THE GRAPHIC CARD FOR FREE POINTERS";
|
||||||
throw "UNSUFICCIENT MEMORY ON THE GRAPHIC CARD FOR FREE POINTERS";
|
}
|
||||||
}
|
|
||||||
if (error != CUDA_SUCCESS) {
|
|
||||||
std::stringstream s;
|
|
||||||
s << "Error allocating memory for slice buffers "
|
|
||||||
<< "code " << error << "\n";
|
|
||||||
throw s.str();
|
|
||||||
}
|
|
||||||
#else
|
#else
|
||||||
ptr = (DataPtr<F>)malloc(sizeof(F) * sources[0].size());
|
ptr = (DataPtr<F>)malloc(sizeof(F) * sources[0].size());
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@ -80,16 +80,6 @@ struct Timer {
|
|||||||
using Timings = std::map<std::string, Timer>;
|
using Timings = std::map<std::string, Timer>;
|
||||||
// Chrono:1 ends here
|
// Chrono:1 ends here
|
||||||
|
|
||||||
// A nice handy macro to do formatting
|
|
||||||
#define _FORMAT(_fmt, ...) \
|
|
||||||
([&] (void) -> std::string { \
|
|
||||||
int _sz = std::snprintf(nullptr, 0, _fmt, __VA_ARGS__); \
|
|
||||||
std::vector<char> _out(_sz + 1); \
|
|
||||||
std::snprintf(&_out[0], _out.size(), _fmt, __VA_ARGS__); \
|
|
||||||
return std::string(_out.data()); \
|
|
||||||
})()
|
|
||||||
|
|
||||||
|
|
||||||
// [[file:~/cuda/atrip/atrip.org::*Epilog][Epilog:1]]
|
// [[file:~/cuda/atrip/atrip.org::*Epilog][Epilog:1]]
|
||||||
}
|
}
|
||||||
// Epilog:1 ends here
|
// Epilog:1 ends here
|
||||||
|
|||||||
@ -24,7 +24,7 @@
|
|||||||
|
|
||||||
using namespace atrip;
|
using namespace atrip;
|
||||||
#if defined(HAVE_CUDA)
|
#if defined(HAVE_CUDA)
|
||||||
#include <atrip/CUDA.hpp>
|
#include <cuda.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
template <typename F> bool RankMap<F>::RANK_ROUND_ROBIN;
|
template <typename F> bool RankMap<F>::RANK_ROUND_ROBIN;
|
||||||
@ -49,6 +49,11 @@ void Atrip::init(MPI_Comm world) {
|
|||||||
Atrip::communicator = world;
|
Atrip::communicator = world;
|
||||||
MPI_Comm_rank(world, (int*)&Atrip::rank);
|
MPI_Comm_rank(world, (int*)&Atrip::rank);
|
||||||
MPI_Comm_size(world, (int*)&Atrip::np);
|
MPI_Comm_size(world, (int*)&Atrip::np);
|
||||||
|
|
||||||
|
#if defined(HAVE_CUDA)
|
||||||
|
Atrip::cuda.status = cublasCreate(&Atrip::cuda.handle);
|
||||||
|
#endif
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename F>
|
template <typename F>
|
||||||
@ -66,24 +71,18 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
|
|||||||
|
|
||||||
#if defined(HAVE_CUDA)
|
#if defined(HAVE_CUDA)
|
||||||
int ngcards;
|
int ngcards;
|
||||||
_CHECK_CUDA_SUCCESS("initializing cuda",
|
|
||||||
cuInit(0));
|
|
||||||
_CHECK_CUDA_SUCCESS("getting device count",
|
|
||||||
cuDeviceGetCount(&ngcards));
|
|
||||||
const auto clusterInfo = getClusterInfo(Atrip::communicator);
|
const auto clusterInfo = getClusterInfo(Atrip::communicator);
|
||||||
|
cuDeviceGetCount(&ngcards);
|
||||||
LOG(0,"Atrip") << "ngcards: " << ngcards << "\n";
|
LOG(0,"Atrip") << "ngcards: " << ngcards << "\n";
|
||||||
if (clusterInfo.ranksPerNode > ngcards) {
|
if (clusterInfo.ranksPerNode > ngcards) {
|
||||||
const auto msg
|
std::cerr << "ATRIP: You are running on more ranks per node than the number of graphic cards\n"
|
||||||
= _FORMAT("ATRIP: You are running on more ranks per node than the number of graphic cards\n"
|
<< "You have " << ngcards << " cards at your disposal\n";
|
||||||
"You have %d cards at your disposal\n", ngcards);
|
throw "";
|
||||||
std::cerr << msg;
|
}
|
||||||
throw msg;
|
if (clusterInfo.ranksPerNode < ngcards) {
|
||||||
} else if (clusterInfo.ranksPerNode < ngcards) {
|
std::cerr << "You have " << ngcards << " cards at your disposal\n"
|
||||||
const auto msg
|
<< "You will be only using " << clusterInfo.ranksPerNode
|
||||||
= _FORMAT("You have %d cards at your disposal.\n"
|
<< ", i.e., the nubmer of ranks.\n";
|
||||||
"You will be only using %d, i.e, the number of ranks\n",
|
|
||||||
ngcards, clusterInfo.ranksPerNode);
|
|
||||||
std::cerr << msg;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -95,27 +94,16 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
|
|||||||
struct { struct { size_t free, total; } avail; size_t total; } memory;
|
struct { struct { size_t free, total; } avail; size_t total; } memory;
|
||||||
char *name = (char*)malloc(256);
|
char *name = (char*)malloc(256);
|
||||||
|
|
||||||
// - TODO :: we should check that the Zuweisung of graphic cards
|
|
||||||
// to nodes works as expected, i.e., node k should get from 0
|
|
||||||
// to ngcards with the formula =rank % ngcards=.
|
|
||||||
|
|
||||||
// set current device
|
// set current device
|
||||||
_CHECK_CUDA_SUCCESS("getting device for index <rank>",
|
cuDeviceGet(&dev, rank);
|
||||||
cuDeviceGet(&dev, rank % ngcards));
|
cuCtxCreate(&ctx, 0, dev);
|
||||||
_CHECK_CUDA_SUCCESS("creating a cuda context",
|
cuCtxSetCurrent(ctx);
|
||||||
cuCtxCreate(&ctx, 0, dev));
|
|
||||||
_CHECK_CUDA_SUCCESS("setting the context",
|
|
||||||
cuCtxSetCurrent(ctx));
|
|
||||||
|
|
||||||
// get information of the device
|
// get information of the device
|
||||||
_CHECK_CUDA_SUCCESS("getting properties of current device",
|
cuDeviceGetProperties(&prop, dev);
|
||||||
cuDeviceGetProperties(&prop, dev));
|
cuMemGetInfo(&memory.avail.free, &memory.avail.total);
|
||||||
_CHECK_CUDA_SUCCESS("getting memory information",
|
cuDeviceGetName(name, 256, dev);
|
||||||
cuMemGetInfo(&memory.avail.free, &memory.avail.total));
|
cuDeviceTotalMem(&memory.total, dev);
|
||||||
_CHECK_CUDA_SUCCESS("getting name",
|
|
||||||
cuDeviceGetName(name, 256, dev));
|
|
||||||
_CHECK_CUDA_SUCCESS("getting total memory",
|
|
||||||
cuDeviceTotalMem(&memory.total, dev));
|
|
||||||
|
|
||||||
printf("\n"
|
printf("\n"
|
||||||
"CUDA CARD RANK %d\n"
|
"CUDA CARD RANK %d\n"
|
||||||
@ -136,10 +124,6 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
|
|||||||
memory.total / 1024.0 / 1024.0 / 1024.0
|
memory.total / 1024.0 / 1024.0 / 1024.0
|
||||||
);
|
);
|
||||||
std::free((void*)name);
|
std::free((void*)name);
|
||||||
|
|
||||||
_CHECK_CUBLAS_SUCCESS("creating a cublas handle",
|
|
||||||
cublasCreate(&Atrip::cuda.handle));
|
|
||||||
|
|
||||||
}
|
}
|
||||||
MPI_Barrier(universe);
|
MPI_Barrier(universe);
|
||||||
}
|
}
|
||||||
@ -179,27 +163,17 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
|
|||||||
|
|
||||||
#if defined(HAVE_CUDA)
|
#if defined(HAVE_CUDA)
|
||||||
DataPtr<F> Tai, epsi, epsa;
|
DataPtr<F> Tai, epsi, epsa;
|
||||||
|
//TODO: free memory pointers in the end of the algorithm
|
||||||
|
cuMemAlloc(&Tai, sizeof(F) * _Tai.size());
|
||||||
|
cuMemAlloc(&epsi, sizeof(F) * _epsi.size());
|
||||||
|
cuMemAlloc(&epsa, sizeof(F) * _epsa.size());
|
||||||
|
|
||||||
// TODO: free memory pointers in the end of the algorithm
|
cuMemcpyHtoD(Tai, (void*)_Tai.data(), sizeof(F) * _Tai.size());
|
||||||
|
cuMemcpyHtoD(epsi,(void*)_epsi.data(), sizeof(F) * _epsi.size());
|
||||||
|
cuMemcpyHtoD(epsa, (void*)_epsa.data(), sizeof(F) * _epsa.size());
|
||||||
|
|
||||||
_CHECK_CUDA_SUCCESS("Tai",
|
cuMemAlloc(&Tijk, sizeof(F) * No * No * No);
|
||||||
cuMemAlloc(&Tai, sizeof(F) * _Tai.size()));
|
cuMemAlloc(&Zijk, sizeof(F) * No * No * No);
|
||||||
_CHECK_CUDA_SUCCESS("epsi",
|
|
||||||
cuMemAlloc(&epsi, sizeof(F) * _epsi.size()));
|
|
||||||
_CHECK_CUDA_SUCCESS("epsa",
|
|
||||||
cuMemAlloc(&epsa, sizeof(F) * _epsa.size()));
|
|
||||||
|
|
||||||
_CHECK_CUDA_SUCCESS("memcpy Tai",
|
|
||||||
cuMemcpyHtoD(Tai, (void*)_Tai.data(), sizeof(F) * _Tai.size()));
|
|
||||||
_CHECK_CUDA_SUCCESS("memcpy epsi",
|
|
||||||
cuMemcpyHtoD(epsi,(void*)_epsi.data(), sizeof(F) * _epsi.size()));
|
|
||||||
_CHECK_CUDA_SUCCESS("memcpy epsa",
|
|
||||||
cuMemcpyHtoD(epsa, (void*)_epsa.data(), sizeof(F) * _epsa.size()));
|
|
||||||
|
|
||||||
_CHECK_CUDA_SUCCESS("Tijk",
|
|
||||||
cuMemAlloc(&Tijk, sizeof(F) * No * No * No));
|
|
||||||
_CHECK_CUDA_SUCCESS("Zijk",
|
|
||||||
cuMemAlloc(&Zijk, sizeof(F) * No * No * No));
|
|
||||||
#else
|
#else
|
||||||
std::vector<F> &Tai = _Tai, &epsi = _epsi, &epsa = _epsa;
|
std::vector<F> &Tai = _Tai, &epsi = _epsi, &epsa = _epsa;
|
||||||
Zijk = (DataFieldType<F>*)malloc(No*No*No * sizeof(DataFieldType<F>));
|
Zijk = (DataFieldType<F>*)malloc(No*No*No * sizeof(DataFieldType<F>));
|
||||||
@ -292,8 +266,8 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
|
|||||||
|
|
||||||
auto const isFakeTuple
|
auto const isFakeTuple
|
||||||
= [&tuplesList, distribution](size_t const i) {
|
= [&tuplesList, distribution](size_t const i) {
|
||||||
return distribution->tupleIsFake(tuplesList[i]);
|
return distribution->tupleIsFake(tuplesList[i]);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
using Database = typename Slice<F>::Database;
|
using Database = typename Slice<F>::Database;
|
||||||
|
|||||||
@ -15,22 +15,19 @@
|
|||||||
// [[file:~/cuda/atrip/atrip.org::*Blas][Blas:2]]
|
// [[file:~/cuda/atrip/atrip.org::*Blas][Blas:2]]
|
||||||
#include <atrip/Blas.hpp>
|
#include <atrip/Blas.hpp>
|
||||||
#include <atrip/Atrip.hpp>
|
#include <atrip/Atrip.hpp>
|
||||||
#include <atrip/CUDA.hpp>
|
|
||||||
|
|
||||||
#if defined(HAVE_CUDA)
|
#if defined(HAVE_CUDA)
|
||||||
# include <cstring>
|
# include <cstring>
|
||||||
|
|
||||||
static size_t dgem_call = 0;
|
static inline
|
||||||
|
cublasOperation_t char_to_cublasOperation(const char* trans) {
|
||||||
static inline
|
if (strncmp("N", trans, 1) == 0)
|
||||||
cublasOperation_t char_to_cublasOperation(const char* trans) {
|
return CUBLAS_OP_N;
|
||||||
if (strncmp("N", trans, 1) == 0)
|
else if (strncmp("T", trans, 1) == 0)
|
||||||
return CUBLAS_OP_N;
|
return CUBLAS_OP_T;
|
||||||
else if (strncmp("T", trans, 1) == 0)
|
else
|
||||||
return CUBLAS_OP_T;
|
return CUBLAS_OP_C;
|
||||||
else
|
}
|
||||||
return CUBLAS_OP_C;
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -52,23 +49,13 @@ namespace atrip {
|
|||||||
typename DataField<double>::type *C,
|
typename DataField<double>::type *C,
|
||||||
const int *ldc) {
|
const int *ldc) {
|
||||||
#if defined(HAVE_CUDA)
|
#if defined(HAVE_CUDA)
|
||||||
// TODO: remove this verbose checking
|
cublasDgemm(Atrip::cuda.handle,
|
||||||
const cublasStatus_t error =
|
char_to_cublasOperation(transa),
|
||||||
cublasDgemm(Atrip::cuda.handle,
|
char_to_cublasOperation(transb),
|
||||||
char_to_cublasOperation(transa),
|
*m, *n, *k,
|
||||||
char_to_cublasOperation(transb),
|
alpha, A, *lda,
|
||||||
*m, *n, *k,
|
B, *ldb, beta,
|
||||||
alpha, A, *lda,
|
C, *ldc);
|
||||||
B, *ldb, beta,
|
|
||||||
C, *ldc);
|
|
||||||
if (error != 0) printf(":%-3ld (%4ldth) ERR<%4d> cublasDgemm: "
|
|
||||||
"A = %20ld "
|
|
||||||
"B = %20ld "
|
|
||||||
"C = %20ld "
|
|
||||||
"\n",
|
|
||||||
Atrip::rank,
|
|
||||||
dgem_call++,
|
|
||||||
error, A, B, C);
|
|
||||||
#else
|
#else
|
||||||
dgemm_(transa, transb,
|
dgemm_(transa, transb,
|
||||||
m, n, k,
|
m, n, k,
|
||||||
@ -96,17 +83,16 @@ namespace atrip {
|
|||||||
cuDoubleComplex
|
cuDoubleComplex
|
||||||
cu_alpha = {std::real(*alpha), std::imag(*alpha)},
|
cu_alpha = {std::real(*alpha), std::imag(*alpha)},
|
||||||
cu_beta = {std::real(*beta), std::imag(*beta)};
|
cu_beta = {std::real(*beta), std::imag(*beta)};
|
||||||
|
cublasZgemm(Atrip::cuda.handle,
|
||||||
|
char_to_cublasOperation(transa),
|
||||||
|
char_to_cublasOperation(transb),
|
||||||
|
*m, *n, *k,
|
||||||
|
&cu_alpha,
|
||||||
|
|
||||||
_CHECK_CUBLAS_SUCCESS("cublasZgemm",
|
A, *lda,
|
||||||
cublasZgemm(Atrip::cuda.handle,
|
B, *ldb,
|
||||||
char_to_cublasOperation(transa),
|
&cu_beta,
|
||||||
char_to_cublasOperation(transb),
|
C, *ldc);
|
||||||
*m, *n, *k,
|
|
||||||
&cu_alpha,
|
|
||||||
A, *lda,
|
|
||||||
B, *ldb,
|
|
||||||
&cu_beta,
|
|
||||||
C, *ldc));
|
|
||||||
#else
|
#else
|
||||||
zgemm_(transa, transb,
|
zgemm_(transa, transb,
|
||||||
m, n, k,
|
m, n, k,
|
||||||
|
|||||||
@ -14,13 +14,31 @@
|
|||||||
|
|
||||||
// [[file:~/cuda/atrip/atrip.org::*Prolog][Prolog:2]]
|
// [[file:~/cuda/atrip/atrip.org::*Prolog][Prolog:2]]
|
||||||
#include<atrip/Equations.hpp>
|
#include<atrip/Equations.hpp>
|
||||||
|
|
||||||
#include<atrip/CUDA.hpp>
|
#include<atrip/CUDA.hpp>
|
||||||
|
|
||||||
|
#if defined(HAVE_CUDA)
|
||||||
|
#include <cuda.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace atrip {
|
namespace atrip {
|
||||||
// Prolog:2 ends here
|
// Prolog:2 ends here
|
||||||
|
|
||||||
|
|
||||||
|
// These are just help structures
|
||||||
|
// to help with the templating of reorder
|
||||||
|
// function
|
||||||
|
enum reordering_t
|
||||||
|
{
|
||||||
|
IJK,
|
||||||
|
IKJ,
|
||||||
|
JIK,
|
||||||
|
JKI,
|
||||||
|
KIJ,
|
||||||
|
KJI
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename F, reordering_t R>
|
||||||
|
struct reorder_proxy {};
|
||||||
|
|
||||||
#ifdef HAVE_CUDA
|
#ifdef HAVE_CUDA
|
||||||
namespace cuda {
|
namespace cuda {
|
||||||
@ -83,42 +101,49 @@ namespace cuda {
|
|||||||
template <>
|
template <>
|
||||||
__device__
|
__device__
|
||||||
cuDoubleComplex multiply(const cuDoubleComplex &a, const cuDoubleComplex &b) {
|
cuDoubleComplex multiply(const cuDoubleComplex &a, const cuDoubleComplex &b) {
|
||||||
return
|
return
|
||||||
{a.x * b.x - a.y * b.y,
|
{a.x * b.x - a.y * b.y,
|
||||||
a.x * b.y + a.y * b.x};
|
a.x * b.y + a.y * b.x};
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename F>
|
template <typename F>
|
||||||
__device__
|
__device__
|
||||||
void sum_in_place(F* to, const F* from);
|
void sum_in_place(F* to, const F* b);
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
__device__
|
__device__
|
||||||
void sum_in_place(double* to, const double *from) { *to += *from; }
|
void sum_in_place(double* to, const double *b) { *to += *b; }
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
__device__
|
__device__
|
||||||
void sum_in_place(cuDoubleComplex* to, const cuDoubleComplex* from) {
|
void sum_in_place(cuDoubleComplex* to, const cuDoubleComplex* b) {
|
||||||
to->x += from->x;
|
to->x += b->x;
|
||||||
to->y += from->y;
|
to->y += b->y;
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__
|
||||||
|
cuDoubleComplex& operator+=(cuDoubleComplex& lz, cuDoubleComplex const& rz) {
|
||||||
|
lz.x += rz.x;
|
||||||
|
lz.y += rz.y;
|
||||||
|
return lz;
|
||||||
}
|
}
|
||||||
|
|
||||||
};
|
};
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(HAVE_CUDA)
|
#if defined(HAVE_CUDA)
|
||||||
#define FOR_K() \
|
#define LIMS_KS() \
|
||||||
for (size_t kmin = blockIdx.x * blockDim.x + threadIdx.x, \
|
size_t kmin = blockIdx.x * blockDim.x + threadIdx.x, \
|
||||||
k = kmin, \
|
k = kmin, \
|
||||||
idx = kmin * size * size * size; \
|
idx = kmin * size * size * size \
|
||||||
k < (kmin < size) ? kmin + 1 : size; \
|
; \
|
||||||
k++)
|
k < (kmin < size) ? kmin + 1 : size
|
||||||
#else
|
#else
|
||||||
#define FOR_K() for (size_t k=0, idx=0; k < size; k++)
|
#define LIMS_KS size_t k=0, idx=0; k < size
|
||||||
#endif
|
#endif
|
||||||
#define _IJK_(i, j, k) i + j*size + k*size*size
|
#define _IJK_(i, j, k) i + j*size + k*size*size
|
||||||
#define _REORDER_BODY_(...) \
|
#define _REORDER_BODY_(...) \
|
||||||
FOR_K() \
|
for (LIMS_KS() ; k++) \
|
||||||
for (size_t j = 0; j < size; j++) \
|
for (size_t j = 0; j < size; j++) \
|
||||||
for (size_t i = 0; i < size; i++, idx++) { \
|
for (size_t i = 0; i < size; i++, idx++) { \
|
||||||
__VA_ARGS__ \
|
__VA_ARGS__ \
|
||||||
@ -127,9 +152,7 @@ namespace cuda {
|
|||||||
template <typename F> \
|
template <typename F> \
|
||||||
__MAYBE_GLOBAL__ \
|
__MAYBE_GLOBAL__ \
|
||||||
void reorder(reorder_proxy< F, _enum > p, \
|
void reorder(reorder_proxy< F, _enum > p, \
|
||||||
size_t size, \
|
size_t size, F* to, F* from) { \
|
||||||
F* to, \
|
|
||||||
F* from) { \
|
|
||||||
_REORDER_BODY_(__VA_ARGS__) \
|
_REORDER_BODY_(__VA_ARGS__) \
|
||||||
}
|
}
|
||||||
#if defined(HAVE_CUDA)
|
#if defined(HAVE_CUDA)
|
||||||
@ -138,31 +161,9 @@ namespace cuda {
|
|||||||
#define GO(__TO, __FROM) __TO += __FROM;
|
#define GO(__TO, __FROM) __TO += __FROM;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// These are just help structures
|
|
||||||
// to help with the templating of reorder
|
|
||||||
// function
|
|
||||||
enum reordering_t
|
|
||||||
{
|
|
||||||
IJK,
|
|
||||||
IKJ,
|
|
||||||
JIK,
|
|
||||||
JKI,
|
|
||||||
KIJ,
|
|
||||||
KJI
|
|
||||||
};
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Please the c++ type checker and template creator
|
|
||||||
* in order to have an argument in the signature of
|
|
||||||
* the function that helps the compiler know which
|
|
||||||
* instantiation it should take.
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
template <typename F, reordering_t R>
|
|
||||||
struct reorder_proxy {};
|
|
||||||
|
|
||||||
template <typename F, reordering_t R>
|
template <typename F, reordering_t R>
|
||||||
__MAYBE_GLOBAL__
|
__MAYBE_GLOBAL__ \
|
||||||
void reorder(reorder_proxy<F, R> proxy,
|
void reorder(reorder_proxy<F, R> proxy,
|
||||||
size_t size, F* to, F* from);
|
size_t size, F* to, F* from);
|
||||||
|
|
||||||
@ -444,14 +445,10 @@ double getEnergySame
|
|||||||
|
|
||||||
#if defined(ATRIP_USE_DGEMM)
|
#if defined(ATRIP_USE_DGEMM)
|
||||||
#if defined(HAVE_CUDA)
|
#if defined(HAVE_CUDA)
|
||||||
#define REORDER(__II, __JJ, __KK) \
|
#define REORDER(__II, __JJ, __KK) \
|
||||||
reorder<<<bs, ths>>>(reorder_proxy< \
|
reorder<<<bs, ths>>>(reorder_proxy<DataFieldType<F>, \
|
||||||
DataFieldType<F>, \
|
__II ## __JJ ## __KK >{}, \
|
||||||
__II ## __JJ ## __KK \
|
No, Tijk, _t_buffer);
|
||||||
>{}, \
|
|
||||||
No, \
|
|
||||||
Tijk, \
|
|
||||||
_t_buffer);
|
|
||||||
#define DGEMM_PARTICLES(__A, __B) \
|
#define DGEMM_PARTICLES(__A, __B) \
|
||||||
atrip::xgemm<F>("T", \
|
atrip::xgemm<F>("T", \
|
||||||
"N", \
|
"N", \
|
||||||
@ -465,7 +462,7 @@ double getEnergySame
|
|||||||
(int const*)&Nv, \
|
(int const*)&Nv, \
|
||||||
&zero, \
|
&zero, \
|
||||||
_t_buffer, \
|
_t_buffer, \
|
||||||
(int const*)&NoNo)
|
(int const*)&NoNo);
|
||||||
#define DGEMM_HOLES(__A, __B, __TRANSB) \
|
#define DGEMM_HOLES(__A, __B, __TRANSB) \
|
||||||
atrip::xgemm<F>("N", \
|
atrip::xgemm<F>("N", \
|
||||||
__TRANSB, \
|
__TRANSB, \
|
||||||
@ -480,7 +477,7 @@ double getEnergySame
|
|||||||
&zero, \
|
&zero, \
|
||||||
_t_buffer, \
|
_t_buffer, \
|
||||||
(int const*)&NoNo \
|
(int const*)&NoNo \
|
||||||
)
|
);
|
||||||
#define MAYBE_CONJ(_conj, _buffer) \
|
#define MAYBE_CONJ(_conj, _buffer) \
|
||||||
cuda::maybeConjugate<<< \
|
cuda::maybeConjugate<<< \
|
||||||
Atrip::kernelDimensions.ooo.blocks, \
|
Atrip::kernelDimensions.ooo.blocks, \
|
||||||
@ -515,7 +512,7 @@ double getEnergySame
|
|||||||
&zero, \
|
&zero, \
|
||||||
_t_buffer, \
|
_t_buffer, \
|
||||||
(int const*)&NoNo \
|
(int const*)&NoNo \
|
||||||
)
|
);
|
||||||
#define DGEMM_HOLES(__A, __B, __TRANSB) \
|
#define DGEMM_HOLES(__A, __B, __TRANSB) \
|
||||||
atrip::xgemm<F>("N", \
|
atrip::xgemm<F>("N", \
|
||||||
__TRANSB, \
|
__TRANSB, \
|
||||||
@ -530,7 +527,7 @@ double getEnergySame
|
|||||||
&zero, \
|
&zero, \
|
||||||
_t_buffer, \
|
_t_buffer, \
|
||||||
(int const*)&NoNo \
|
(int const*)&NoNo \
|
||||||
)
|
);
|
||||||
#define MAYBE_CONJ(_conj, _buffer) \
|
#define MAYBE_CONJ(_conj, _buffer) \
|
||||||
for (size_t __i = 0; __i < NoNoNo; ++__i) \
|
for (size_t __i = 0; __i < NoNoNo; ++__i) \
|
||||||
_conj[__i] = maybeConjugate<F>(_buffer[__i]);
|
_conj[__i] = maybeConjugate<F>(_buffer[__i]);
|
||||||
@ -541,14 +538,8 @@ double getEnergySame
|
|||||||
#ifdef HAVE_CUDA
|
#ifdef HAVE_CUDA
|
||||||
DataFieldType<F>* _t_buffer;
|
DataFieldType<F>* _t_buffer;
|
||||||
DataFieldType<F>* _vhhh;
|
DataFieldType<F>* _vhhh;
|
||||||
WITH_CHRONO("double:cuda:alloc",
|
cuMemAlloc((CUdeviceptr*)&_t_buffer, NoNoNo * sizeof(DataFieldType<F>));
|
||||||
_CHECK_CUDA_SUCCESS("Allocating _t_buffer",
|
cuMemAlloc((CUdeviceptr*)&_vhhh, NoNoNo * sizeof(DataFieldType<F>));
|
||||||
cuMemAlloc((CUdeviceptr*)&_t_buffer,
|
|
||||||
NoNoNo * sizeof(DataFieldType<F>)));
|
|
||||||
_CHECK_CUDA_SUCCESS("Allocating _vhhh",
|
|
||||||
cuMemAlloc((CUdeviceptr*)&_vhhh,
|
|
||||||
NoNoNo * sizeof(DataFieldType<F>)));
|
|
||||||
)
|
|
||||||
const size_t
|
const size_t
|
||||||
bs = Atrip::kernelDimensions.ooo.blocks,
|
bs = Atrip::kernelDimensions.ooo.blocks,
|
||||||
ths = Atrip::kernelDimensions.ooo.threads;
|
ths = Atrip::kernelDimensions.ooo.threads;
|
||||||
@ -569,6 +560,7 @@ double getEnergySame
|
|||||||
WITH_CHRONO("double:reorder",
|
WITH_CHRONO("double:reorder",
|
||||||
cuda::zeroing<<<bs, ths>>>((DataFieldType<F>*)Tijk,
|
cuda::zeroing<<<bs, ths>>>((DataFieldType<F>*)Tijk,
|
||||||
NoNoNo);
|
NoNoNo);
|
||||||
|
// synchronize all initializations to zero
|
||||||
)
|
)
|
||||||
#else
|
#else
|
||||||
WITH_CHRONO("double:reorder",
|
WITH_CHRONO("double:reorder",
|
||||||
@ -583,37 +575,37 @@ double getEnergySame
|
|||||||
// VhhhC[i + k*No + L*NoNo] * TABhh[L + j*No]; H1
|
// VhhhC[i + k*No + L*NoNo] * TABhh[L + j*No]; H1
|
||||||
MAYBE_CONJ(_vhhh, VhhhC)
|
MAYBE_CONJ(_vhhh, VhhhC)
|
||||||
WITH_CHRONO("doubles:holes:1",
|
WITH_CHRONO("doubles:holes:1",
|
||||||
DGEMM_HOLES(_vhhh, TABhh, "N");
|
DGEMM_HOLES(_vhhh, TABhh, "N")
|
||||||
REORDER(I, K, J)
|
REORDER(I, K, J)
|
||||||
)
|
)
|
||||||
// VhhhC[j + k*No + L*NoNo] * TABhh[i + L*No]; H0
|
// VhhhC[j + k*No + L*NoNo] * TABhh[i + L*No]; H0
|
||||||
WITH_CHRONO("doubles:holes:2",
|
WITH_CHRONO("doubles:holes:2",
|
||||||
DGEMM_HOLES(_vhhh, TABhh, "T");
|
DGEMM_HOLES(_vhhh, TABhh, "T")
|
||||||
REORDER(J, K, I)
|
REORDER(J, K, I)
|
||||||
)
|
)
|
||||||
|
|
||||||
// VhhhB[i + j*No + L*NoNo] * TAChh[L + k*No]; H5
|
// VhhhB[i + j*No + L*NoNo] * TAChh[L + k*No]; H5
|
||||||
MAYBE_CONJ(_vhhh, VhhhB)
|
MAYBE_CONJ(_vhhh, VhhhB)
|
||||||
WITH_CHRONO("doubles:holes:3",
|
WITH_CHRONO("doubles:holes:3",
|
||||||
DGEMM_HOLES(_vhhh, TAChh, "N");
|
DGEMM_HOLES(_vhhh, TAChh, "N")
|
||||||
REORDER(I, J, K)
|
REORDER(I, J, K)
|
||||||
)
|
)
|
||||||
// VhhhB[k + j*No + L*NoNo] * TAChh[i + L*No]; H3
|
// VhhhB[k + j*No + L*NoNo] * TAChh[i + L*No]; H3
|
||||||
WITH_CHRONO("doubles:holes:4",
|
WITH_CHRONO("doubles:holes:4",
|
||||||
DGEMM_HOLES(_vhhh, TAChh, "T");
|
DGEMM_HOLES(_vhhh, TAChh, "T")
|
||||||
REORDER(K, J, I)
|
REORDER(K, J, I)
|
||||||
)
|
)
|
||||||
|
|
||||||
// VhhhA[j + i*No + L*NoNo] * TBChh[L + k*No]; H1
|
// VhhhA[j + i*No + L*NoNo] * TBChh[L + k*No]; H1
|
||||||
MAYBE_CONJ(_vhhh, VhhhA)
|
MAYBE_CONJ(_vhhh, VhhhA)
|
||||||
WITH_CHRONO("doubles:holes:5",
|
WITH_CHRONO("doubles:holes:5",
|
||||||
DGEMM_HOLES(_vhhh, TBChh, "N");
|
DGEMM_HOLES(_vhhh, TBChh, "N")
|
||||||
REORDER(J, I, K)
|
REORDER(J, I, K)
|
||||||
)
|
)
|
||||||
// VhhhA[k + i*No + L*NoNo] * TBChh[j + L*No]; H4
|
// VhhhA[k + i*No + L*NoNo] * TBChh[j + L*No]; H4
|
||||||
WITH_CHRONO("doubles:holes:6",
|
WITH_CHRONO("doubles:holes:6",
|
||||||
DGEMM_HOLES(_vhhh, TBChh, "T");
|
DGEMM_HOLES(_vhhh, TBChh, "T")
|
||||||
REORDER(K, I, J)
|
REORDER(K, I, J)
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
@ -624,33 +616,33 @@ double getEnergySame
|
|||||||
{
|
{
|
||||||
// TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv]; P0
|
// TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv]; P0
|
||||||
WITH_CHRONO("doubles:particles:1",
|
WITH_CHRONO("doubles:particles:1",
|
||||||
DGEMM_PARTICLES(TAphh, VBCph);
|
DGEMM_PARTICLES(TAphh, VBCph)
|
||||||
REORDER(I, J, K)
|
REORDER(I, J, K)
|
||||||
)
|
)
|
||||||
// TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv]; P3
|
// TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv]; P3
|
||||||
WITH_CHRONO("doubles:particles:2",
|
WITH_CHRONO("doubles:particles:2",
|
||||||
DGEMM_PARTICLES(TAphh, VCBph);
|
DGEMM_PARTICLES(TAphh, VCBph)
|
||||||
REORDER(I, K, J)
|
REORDER(I, K, J)
|
||||||
)
|
)
|
||||||
// TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv]; P5
|
// TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv]; P5
|
||||||
WITH_CHRONO("doubles:particles:3",
|
WITH_CHRONO("doubles:particles:3",
|
||||||
DGEMM_PARTICLES(TCphh, VABph);
|
DGEMM_PARTICLES(TCphh, VABph)
|
||||||
REORDER(K, I, J)
|
REORDER(K, I, J)
|
||||||
)
|
)
|
||||||
// TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv]; P2
|
// TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv]; P2
|
||||||
WITH_CHRONO("doubles:particles:4",
|
WITH_CHRONO("doubles:particles:4",
|
||||||
DGEMM_PARTICLES(TCphh, VBAph);
|
DGEMM_PARTICLES(TCphh, VBAph)
|
||||||
REORDER(K, J, I)
|
REORDER(K, J, I)
|
||||||
)
|
)
|
||||||
// TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv]; P1
|
// TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv]; P1
|
||||||
WITH_CHRONO("doubles:particles:5",
|
WITH_CHRONO("doubles:particles:5",
|
||||||
DGEMM_PARTICLES(TBphh, VACph);
|
DGEMM_PARTICLES(TBphh, VACph)
|
||||||
REORDER(J, I, K)
|
REORDER(J, I, K)
|
||||||
)
|
)
|
||||||
// TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv]; P4
|
// TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv]; P4
|
||||||
WITH_CHRONO("doubles:particles:6",
|
WITH_CHRONO("doubles:particles:6",
|
||||||
DGEMM_PARTICLES(TBphh, VCAph);
|
DGEMM_PARTICLES(TBphh, VCAph)
|
||||||
REORDER(J, K, I)
|
REORDER(J, K, I)
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
@ -659,12 +651,9 @@ double getEnergySame
|
|||||||
#ifdef HAVE_CUDA
|
#ifdef HAVE_CUDA
|
||||||
// we need to synchronize here since we need
|
// we need to synchronize here since we need
|
||||||
// the Tijk for next process in the pipeline
|
// the Tijk for next process in the pipeline
|
||||||
_CHECK_CUDA_SUCCESS("Synchronizing",
|
cuCtxSynchronize();
|
||||||
cuCtxSynchronize());
|
cuMemFree((CUdeviceptr)_vhhh);
|
||||||
_CHECK_CUDA_SUCCESS("Freeing _vhhh",
|
cuMemFree((CUdeviceptr)_t_buffer);
|
||||||
cuMemFree((CUdeviceptr)_vhhh));
|
|
||||||
_CHECK_CUDA_SUCCESS("Freeing _t_buffer",
|
|
||||||
cuMemFree((CUdeviceptr)_t_buffer));
|
|
||||||
#else
|
#else
|
||||||
free(_vhhh);
|
free(_vhhh);
|
||||||
free(_t_buffer);
|
free(_t_buffer);
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user