Compare commits
11 Commits
4277c07cc2 ... 20c29ed815

| Author | SHA1 | Date |
|---|---|---|
| | 20c29ed815 | |
| | a5bb8f3d9c | |
| | 5678ac0d9c | |
| | 5483325626 | |
| | 23ad87214f | |
| | 3d7702d501 | |
| | c20b9e3bcb | |
| | da704ad820 | |
| | f1b2f37fe2 | |
| | 1cd7bac187 | |
| | 68892d5dd8 | |
@@ -1,3 +1,4 @@
AUTOMAKE_OPTIONS = subdir-objects
include $(top_srcdir)/atrip.mk

AM_CPPFLAGS = -I$(top_srcdir)/include/ $(CTF_CPPFLAGS)
@@ -19,4 +20,13 @@ endif
if WITH_CUDA
test_main_CXXFLAGS = $(CUDA_CXXFLAGS)
test_main_LDADD += $(CUDA_LDFLAGS)

AM_CXXFLAGS = $(CUDA_CXXFLAGS)
AM_LDFLAGS += $(CUDA_LDFLAGS)

bin_PROGRAMS += test-cublas-parallel-atrip
test_cublas_parallel_atrip_SOURCES = test-cublas-parallel-atrip.cxx

bin_PROGRAMS += test-cuda-sanity
test_cuda_sanity_SOURCES = test-cuda-sanity.cxx
endif

bench/test-cublas-parallel-atrip.cxx (Normal file, 239 lines)
@@ -0,0 +1,239 @@
#include <mpi.h>
#include <iostream>
#include <cassert>
#include <string.h>
#include <sstream>
#include <string>
#include <vector>
#include <cuda.h>
#include <chrono>
#include <map>
#include <cstdlib>
#include <unistd.h>

#include <CLI11.hpp>

#define CUBLASAPI
#include <cublas_api.h>
#include <cublas_v2.h>

// Accumulating wall-clock timer: every start()/stop() pair adds to duration.
struct Timer {
  using Clock = std::chrono::high_resolution_clock;
  using Event = std::chrono::time_point<Clock>;
  std::chrono::duration<double> duration;
  Event _start;
  inline void start() noexcept { _start = Clock::now(); }
  inline void stop() noexcept { duration += Clock::now() - _start; }
  inline void clear() noexcept { duration *= 0; }
  inline double count() const noexcept { return duration.count(); }
};
using Timings = std::map<std::string, Timer>;

// printf-style formatting into a std::string
#define _FORMAT(_fmt, ...)                                          \
  ([&] (void) -> std::string {                                      \
    int _sz = std::snprintf(nullptr, 0, _fmt, __VA_ARGS__);         \
    std::vector<char> _out(_sz + 1);                                \
    std::snprintf(&_out[0], _out.size(), _fmt, __VA_ARGS__);        \
    return std::string(_out.data());                                \
  })()

#define _CHECK_CUDA_SUCCESS(message, ...)                           \
  do {                                                              \
    CUresult result = __VA_ARGS__;                                  \
    printf("doing %s\n", message);                                  \
    if (result != CUDA_SUCCESS) {                                   \
      printf("\t!!CUDA_ERROR(%d): %s:%d %s\n",                      \
             result,                                                \
             __FILE__,                                              \
             __LINE__,                                              \
             message);                                              \
      return 1;                                                     \
    }                                                               \
  } while (0)

#define _CHECK_CUBLAS_SUCCESS(message, ...)                         \
  do {                                                              \
    cublasStatus_t result = __VA_ARGS__;                            \
    if (result != 0) {                                              \
      printf("\t!!CUBLAS_ERROR(%d): %s:%d %s\n",                    \
             result,                                                \
             __FILE__,                                              \
             __LINE__,                                              \
             message);                                              \
      return 1;                                                     \
    }                                                               \
  } while (0)

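// Note: both macros end in "return 1", so they are only usable inside
// main() (or another int-returning function); any failure aborts the
// benchmark with exit status 1.
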
int main(int argc, char** argv) {

  using std::vector;

  MPI_Init(NULL, NULL);
  int rank, np, ngcards;
  size_t no(10), nv(no * 10), its(2);
  bool barrier = false;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &np);

  CLI::App app{"Main bench for atrip"};
  app.add_option("--no", no, "Occupied orbitals");
  app.add_option("--nv", nv, "Virtual orbitals");
  app.add_option("--its", its, "Number of iterations to be done");
  app.add_option("--barrier", barrier, "Call an MPI_Barrier in every iteration?");
  CLI11_PARSE(app, argc, argv);

  const size_t oo = no * no, ooo = no * oo;

  Timings timings;

  MPI_Barrier(MPI_COMM_WORLD);

  _CHECK_CUDA_SUCCESS("init for cuda",
                      cuInit(0));

  _CHECK_CUDA_SUCCESS("get ncards",
                      cuDeviceGetCount(&ngcards));

  CUcontext ctx;
  CUdevice dev;

  char hostname[256];
  gethostname(hostname, 256);
  printf("%s with rank %d gets card %d\n",
         hostname,
         rank,
         rank % ngcards);

  // set contexts
  _CHECK_CUDA_SUCCESS("device get", cuDeviceGet(&dev, rank % ngcards));
  _CHECK_CUDA_SUCCESS("creating context", cuCtxCreate(&ctx, 0, dev));
  _CHECK_CUDA_SUCCESS("setting context", cuCtxSetCurrent(ctx));
  _CHECK_CUDA_SUCCESS("synchronizing", cuCtxSynchronize());
  MPI_Barrier(MPI_COMM_WORLD);

  using host_slice_t = vector<double>;

  vector<size_t> sizes = {nv * oo, nv * no, oo, oo * no, oo * no};
  vector<CUdeviceptr> P_phh(3), P_ph(6), H_hh(3), H_hhh(3), T_hhh(1);
  vector<vector<CUdeviceptr>*> slices_d = {&P_phh, &P_ph, &H_hh, &H_hhh, &T_hhh};
  vector<vector<host_slice_t>> slices_h(slices_d.size());
  {
    int i = -1;
    for (auto& v: slices_d) {
      i++;
      for (auto& ptr: *v) {
        _CHECK_CUDA_SUCCESS("malloc",
                            cuMemAlloc(&ptr,
                                       sizes[i] * sizeof(double)));
        slices_h[i].emplace_back(sizes[i]);
      }
    }
  }

  const double one = 1.0, zero = 0.0;

  printf("its: %ld\n", its);
  printf("barrier: %d\n", barrier);
  printf("no: %ld\n", no);
  printf("nv: %ld\n", nv);
  printf("SIZE %f GB\n", (3 * nv * oo
                          + 6 * nv * no
                          + 3 * oo
                          + 3 * ooo
                          + 1 * ooo
                          ) * sizeof(double) / 1024.0 / 1024.0 / 1024.0);
  std::map<std::string, double> tflopss
    {{ "dgemm",     ooo * (no + nv) * 6.0 * 2.0 * its / 1e12},
     { "holes",     ooo * no * 6.0 * 2.0 * its / 1e12},
     { "particles", ooo * nv * 6.0 * 2.0 * its / 1e12}};

  cublasHandle_t handle;
  _CHECK_CUBLAS_SUCCESS("handle create", cublasCreate(&handle));
  printf("handle %p\n", (void*)handle);

  timings["dgemm"].start();
  for (size_t i = 0; i < its; i++) {

    if (barrier) {
      MPI_Barrier(MPI_COMM_WORLD);
      timings["memcpy"].start();
      for (size_t _s = 0; _s < slices_d.size(); _s++) {
        // for (size_t _b = 0; _b < slices_h[_s].size(); _b++) {
        for (size_t _b = 0; _b < 1; _b++) {
          auto device = (*slices_d[_s])[_b];
          auto host = slices_h[_s][_b].data();
          // cuMemcpyHtoD expects a byte count, not an element count
          cuMemcpyHtoD(device, host, sizes[_s] * sizeof(double));
        }
      }
      timings["memcpy"].stop();
    }

    timings["holes"].start();
    for (size_t j = 0; j < 3; j++) {

      _CHECK_CUBLAS_SUCCESS(" > 'geming ...",
                            cublasDgemm(handle,
                                        CUBLAS_OP_N,
                                        CUBLAS_OP_N,
                                        oo, no, no,
                                        &one,
                                        (double*)H_hhh[j], oo,
                                        (double*)H_hh[j], no,
                                        &zero,
                                        (double*)T_hhh[0], oo));

      _CHECK_CUBLAS_SUCCESS(" > 'geming ...",
                            cublasDgemm(handle,
                                        CUBLAS_OP_N,
                                        CUBLAS_OP_T,
                                        oo, no, no,
                                        &one,
                                        (double*)H_hhh[j], oo,
                                        (double*)H_hh[j], no,
                                        &zero,
                                        (double*)T_hhh[0], oo));

    }
    timings["holes"].stop();

    timings["particles"].start();
    for (size_t j = 0; j < 6; j++) {
      _CHECK_CUBLAS_SUCCESS(" > 'geming ...",
                            cublasDgemm(handle,
                                        CUBLAS_OP_T,
                                        CUBLAS_OP_N,
                                        oo, no, nv,
                                        &one,
                                        (double*)P_phh[j % 3], nv,
                                        (double*)P_ph[j], nv,
                                        &zero,
                                        (double*)T_hhh[0], oo));
    }
    timings["particles"].stop();

    cuCtxSynchronize();

  }

  timings["dgemm"].stop();

  printf("Performance: \n");
  for (auto name: {"holes", "particles", "dgemm"})
    printf("%10s TFlops: %4.1f\n",
           name,
           tflopss[name]
           / timings[name].count());

  printf("Timings: \n");
  for (auto const& kv: timings)
    printf("%10s: %10f\n", kv.first.c_str(), kv.second.count());

  MPI_Finalize();
  return 0;

}

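A quick cross-check of the flop accounting in this file: a dgemm with dimensions m, n, k costs 2*m*n*k operations, and each iteration issues 6 hole gemms (m = no*no, n = no, k = no) and 6 particle gemms (k = nv), which is where the 6.0 * 2.0 factors in the tflopss map come from. A minimal standalone sketch of that arithmetic (not from the commit), using the default parameters:

    // Sketch: expected TFLOP counts for the defaults no = 10, nv = 100, its = 2.
    #include <cstdio>

    int main() {
      const double no = 10, nv = 100, its = 2;
      const double ooo = no * no * no;
      const double holes     = ooo * no * 6.0 * 2.0 * its / 1e12;
      const double particles = ooo * nv * 6.0 * 2.0 * its / 1e12;
      // the "dgemm" entry is the sum of both contributions
      std::printf("holes %g, particles %g, dgemm %g TFLOP\n",
                  holes, particles, holes + particles);
      return 0;
    }
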
bench/test-cublas-parallel.cxx (Normal file, 163 lines)
@@ -0,0 +1,163 @@
#include <mpi.h>
#include <iostream>
#include <cassert>
#include <string.h>
#include <sstream>
#include <string>
#include <vector>
#include <cuda.h>
#include <chrono>
#include <map>
#include <cstdlib>
#include <unistd.h>

#define CUBLASAPI
#include <cublas_api.h>
#include <cublas_v2.h>

struct Timer {
  using Clock = std::chrono::high_resolution_clock;
  using Event = std::chrono::time_point<Clock>;
  std::chrono::duration<double> duration;
  Event _start;
  inline void start() noexcept { _start = Clock::now(); }
  inline void stop() noexcept { duration += Clock::now() - _start; }
  inline void clear() noexcept { duration *= 0; }
  inline double count() const noexcept { return duration.count(); }
};
using Timings = std::map<std::string, Timer>;

#define _FORMAT(_fmt, ...)                                          \
  ([&] (void) -> std::string {                                      \
    int _sz = std::snprintf(nullptr, 0, _fmt, __VA_ARGS__);         \
    std::vector<char> _out(_sz + 1);                                \
    std::snprintf(&_out[0], _out.size(), _fmt, __VA_ARGS__);        \
    return std::string(_out.data());                                \
  })()

#define _CHECK_CUDA_SUCCESS(message, ...)                           \
  do {                                                              \
    CUresult result = __VA_ARGS__;                                  \
    printf("doing %s\n", message);                                  \
    if (result != CUDA_SUCCESS) {                                   \
      printf("\t!!CUDA_ERROR(%d): %s:%d %s\n",                      \
             result,                                                \
             __FILE__,                                              \
             __LINE__,                                              \
             message);                                              \
      return 1;                                                     \
    }                                                               \
  } while (0)

#define _CHECK_CUBLAS_SUCCESS(message, ...)                         \
  do {                                                              \
    cublasStatus_t result = __VA_ARGS__;                            \
    printf("CUBLAS: doing %s\n", message);                          \
    if (result != 0) {                                              \
      printf("\t!!CUBLAS_ERROR(%d): %s:%d %s\n",                    \
             result,                                                \
             __FILE__,                                              \
             __LINE__,                                              \
             message);                                              \
      return 1;                                                     \
    }                                                               \
  } while (0)

int main(int argc, char** argv) {
  MPI_Init(NULL, NULL);
  int rank, np, ngcards;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &np);

  Timings timings;

  MPI_Barrier(MPI_COMM_WORLD);

  _CHECK_CUDA_SUCCESS("init for cuda",
                      cuInit(0));

  _CHECK_CUDA_SUCCESS("get ncards",
                      cuDeviceGetCount(&ngcards));

  const size_t N
    = argc > 1
    ? atoi(argv[1])
    : 30000
    , dgemms
    = argc > 2
    ? atoi(argv[2])
    : 2
    , flops = 2 * N * N * N * dgemms
    ;

  CUcontext ctx;
  CUdevice dev;

  char hostname[256];
  gethostname(hostname, 256);
  printf("%s with rank %d gets card %d\n",
         hostname,
         rank,
         rank % ngcards);

  // set contexts
  _CHECK_CUDA_SUCCESS("device get",
                      cuDeviceGet(&dev, rank % ngcards));
  _CHECK_CUDA_SUCCESS("creating context",
                      cuCtxCreate(&ctx, 0, dev));
  _CHECK_CUDA_SUCCESS("setting context",
                      cuCtxSetCurrent(ctx));
  _CHECK_CUDA_SUCCESS("synchronizing",
                      cuCtxSynchronize());
  MPI_Barrier(MPI_COMM_WORLD);

  CUdeviceptr A, B, C;
  const double one = 1.0;
  printf("SIZE %f GB\n", N * N * sizeof(double) / 1024.0 / 1024.0 / 1024.0);
  _CHECK_CUDA_SUCCESS("A", cuMemAlloc(&A, N * N * sizeof(double)));
  _CHECK_CUDA_SUCCESS("B", cuMemAlloc(&B, N * N * sizeof(double)));
  _CHECK_CUDA_SUCCESS("C", cuMemAlloc(&C, N * N * sizeof(double)));

  cublasHandle_t handle;
  _CHECK_CUBLAS_SUCCESS("handle create", cublasCreate(&handle));
  printf("handle %p\n", (void*)handle);

  timings["dgemm"].start();
  for (size_t i = 0; i < dgemms; i++) {
    _CHECK_CUBLAS_SUCCESS(_FORMAT(" > 'geming %ld ...", i).c_str(),
                          cublasDgemm(handle,
                                      CUBLAS_OP_N,
                                      CUBLAS_OP_N,
                                      N, N, N,
                                      &one,
                                      (double*)A, N,
                                      (double*)B, N,
                                      &one,
                                      (double*)C, N));
  }

  cuCtxSynchronize();
  timings["dgemm"].stop();

  printf("dgemm Gflops: %f\n",
         flops
         / timings["dgemm"].count()
         / 1024.0
         / 1024.0
         / 1024.0);

  MPI_Finalize();
  return 0;
}

// Local Variables:
// compile-command: "mpic++ \
//   -pedantic -std=c++11 \
//   -L./cudaroot/lib64 -lcuda \
//   -L./cudaroot/lib64 -lcudart \
//   -L./cudaroot/lib64 -lcublas \
//   ./test-cublas-parallel.cxx -o test-cublas-parallel"
// End:

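For reference, both positional arguments are optional: N defaults to 30000 and the dgemm count to 2, so an invocation along the lines of mpirun -np 2 ./test-cublas-parallel 20000 4 (example values, not from the commit) runs four 20000^3 multiplications per rank. Note that the printed Gflops figure divides by 1024^3 rather than 1e9.
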
bench/test-cuda-sanity.cxx (Normal file, 104 lines)
@@ -0,0 +1,104 @@
#include <mpi.h>
#include <iostream>
#include <cassert>
#include <string.h>
#include <sstream>
#include <string>
#include <vector>

#include <cuda.h>

#define _CHECK_CUDA_SUCCESS(message, ...)                           \
  do {                                                              \
    CUresult result = __VA_ARGS__;                                  \
    printf("doing %s\n", message);                                  \
    if (result != CUDA_SUCCESS) {                                   \
      printf("\t!!CUDA_ERROR(%d): %s:%d %s\n",                      \
             result,                                                \
             __FILE__,                                              \
             __LINE__,                                              \
             message);                                              \
      return 1;                                                     \
    }                                                               \
  } while (0)

int main() {
  int rank, np, ngcards;
  MPI_Init(NULL, NULL);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &np);

  _CHECK_CUDA_SUCCESS("init for cuda",
                      cuInit(0));

  _CHECK_CUDA_SUCCESS("get ncards",
                      cuDeviceGetCount(&ngcards));

  // loop variable renamed from rank, which shadowed the MPI rank above
  for (int card = 0; card < ngcards; card++) {
    CUcontext ctx;
    CUdevice dev;
    CUdevprop_st prop;
    size_t _free, total, total2;
    char *name = (char*)malloc(256);

    printf("Setting contexts\n");
    // set contexts
    _CHECK_CUDA_SUCCESS("device get",
                        cuDeviceGet(&dev, card));
    _CHECK_CUDA_SUCCESS("creating context",
                        cuCtxCreate(&ctx, 0, dev));
    _CHECK_CUDA_SUCCESS("setting context",
                        cuCtxSetCurrent(ctx));
    _CHECK_CUDA_SUCCESS("synchronizing",
                        cuCtxSynchronize());

    _CHECK_CUDA_SUCCESS("prop get",
                        cuDeviceGetProperties(&prop, dev));
    _CHECK_CUDA_SUCCESS("meminfo get",
                        cuMemGetInfo(&_free, &total));
    _CHECK_CUDA_SUCCESS("name get",
                        cuDeviceGetName(name, 256, dev));
    _CHECK_CUDA_SUCCESS("totalmem get",
                        cuDeviceTotalMem(&total2, dev));

    printf("\n"
           "CUDA CARD RANK %d\n"
           "=================\n"
           "\tname: %s\n"
           "\tShared Mem Per Block (KB): %f\n"
           "\tFree/Total mem (GB): %f/%f\n"
           "\ttotal2 mem (GB): %f\n"
           "\n",
           dev,
           name,
           prop.sharedMemPerBlock / 1024.0,
           _free / 1024.0 / 1024.0 / 1024.0,
           total / 1024.0 / 1024.0 / 1024.0,
           total2 / 1024.0 / 1024.0 / 1024.0
           );

    if (_free == 0 || total == 0 || total2 == 0)
      return 1;

    CUdeviceptr data;
    // allocate twice on purpose as a sanity check; the first buffer is
    // overwritten and never freed
    _CHECK_CUDA_SUCCESS("memalloc 1",
                        cuMemAlloc(&data, sizeof(double) * 10000));
    _CHECK_CUDA_SUCCESS("memalloc 2",
                        cuMemAlloc(&data, sizeof(double) * 10000));

    free(name);
  }

  MPI_Finalize();

  return 0;
}

// Local Variables:
// compile-command: "mpic++ \
//   -pedantic -std=c++11 \
//   -L./cudaroot/lib64 -lcuda \
//   -L./cudaroot/lib64 -lcudart \
//   -L./cudaroot/lib64 -lcublas \
//   ./mem.cxx -o mem"
// End:

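One caveat on this file: cuDeviceGetProperties is the legacy driver-API query and CUdevprop only carries a handful of fields. A hedged sketch (not from the commit) of the same queries via cuDeviceGetAttribute, the non-deprecated path, reusing the _CHECK_CUDA_SUCCESS macro and dev from the loop above:

    // Sketch: per-device limits via cuDeviceGetAttribute.
    int shmem = 0, threads = 0;
    _CHECK_CUDA_SUCCESS("shared mem per block",
                        cuDeviceGetAttribute(
                            &shmem,
                            CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
                            dev));
    _CHECK_CUDA_SUCCESS("max threads per block",
                        cuDeviceGetAttribute(
                            &threads,
                            CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                            dev));
    printf("\tShared Mem Per Block (KB): %f\n", shmem / 1024.0);
    printf("\tMax Threads Per Block: %d\n", threads);
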
@@ -142,8 +142,10 @@ int main(int argc, char** argv) {
    + /* tai */ f * nranks * no * nv
    ;

  if (atrip::Atrip::rank == 0)
    std::cout << "Tentative MEMORY USAGE: " << atrip_memory << "\n";
  if (rank == 0) {
    std::cout << "Tentative MEMORY USAGE (GB): "
              << double(atrip_memory) / 1024.0 / 1024.0 / 1024.0 << "\n";
  }

  std::vector<int> symmetries(4, NS)

@@ -1,5 +1,7 @@
#!/usr/bin/env bash

cd `git rev-parse --show-toplevel`

type -a autoreconf > /dev/null ||
{
cat <<EOF && exit

@@ -1,5 +1,13 @@
#pragma once

#include <atrip/Utils.hpp>

#if defined(HAVE_CUDA)
#include <cuda.h>
#endif

#include <sstream>

#if defined(HAVE_CUDA) && defined(__CUDACC__)
# define __MAYBE_GLOBAL__ __global__
# define __MAYBE_DEVICE__ __device__
@@ -7,3 +15,31 @@
# define __MAYBE_GLOBAL__
# define __MAYBE_DEVICE__
#endif

#define _CHECK_CUDA_SUCCESS(message, ...)                           \
  do {                                                              \
    CUresult result = __VA_ARGS__;                                  \
    if (result != CUDA_SUCCESS) {                                   \
      auto msg = _FORMAT("\t!!CUDA_ERROR(%d): %s:%d\v%s",           \
                         result,                                    \
                         __FILE__,                                  \
                         __LINE__,                                  \
                         message);                                  \
      std::cerr << msg;                                             \
      throw msg;                                                    \
    }                                                               \
  } while (0)

#define _CHECK_CUBLAS_SUCCESS(message, ...)                         \
  do {                                                              \
    cublasStatus_t result = __VA_ARGS__;                            \
    if (result != 0) {                                              \
      auto msg = _FORMAT("\t!!CUBLAS_ERROR(%d): %s:%d\v%s",         \
                         result,                                    \
                         __FILE__,                                  \
                         __LINE__,                                  \
                         message);                                  \
      std::cerr << msg;                                             \
      throw msg;                                                    \
    }                                                               \
  } while (0)

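For illustration, a call site then looks like the following; unlike the bench-local macros, these variants print to std::cerr and throw the formatted std::string instead of returning from main (a sketch, assuming HAVE_CUDA and the _FORMAT macro are in scope):

    // Sketch (not in the commit): using the throwing check macros.
    CUdeviceptr buf;
    try {
      _CHECK_CUDA_SUCCESS("allocating scratch buffer",
                          cuMemAlloc(&buf, 1024 * sizeof(double)));
    } catch (std::string const& msg) {
      // msg already carries the CUresult code, file and line
      std::cerr << "aborting: " << msg << "\n";
    }
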
@@ -21,6 +21,7 @@

#include <atrip/Tuples.hpp>
#include <atrip/Utils.hpp>
#include <atrip/CUDA.hpp>

namespace atrip {

@@ -458,7 +459,11 @@ void unwrapAndMarkReady() {
#if defined(HAVE_CUDA)
    // copy the retrieved mpi data to the device
    WITH_CHRONO("cuda:memcpy",
                cuMemcpyHtoD(data, (void*)mpi_data, sizeof(F) * size);)
                _CHECK_CUDA_SUCCESS("copying mpi data to device",
                                    cuMemcpyHtoD(data,
                                                 (void*)mpi_data,
                                                 sizeof(F) * size));
                )
    std::free(mpi_data);
#endif

@@ -195,7 +195,28 @@ template <typename F=double>
        ;
      if (blank.info.state == Slice<F>::SelfSufficient) {
#if defined(HAVE_CUDA)
        blank.mpi_data = sources[from.source].data();
        const size_t _size = sizeof(F) * sources[from.source].size();
        // TODO: this is code duplication with downstairs
        if (freePointers.size() == 0) {
          std::stringstream stream;
          stream << "No more free pointers "
                 << "for type " << type
                 << " and name " << name
                 ;
          throw std::domain_error(stream.str());
        }
        auto dataPointer = freePointers.begin();
        blank.data = *dataPointer;  // read before erasing the iterator
        freePointers.erase(dataPointer);
        //
        //
        // TODO [#A]: do cuMemcpy of
        // sources[from.source].data() ⇒ blank.data
        // Do this when everything else is working.
        // This will probably be a bottleneck of the H-to-D communication,
        // as most slices are SelfSufficient.
        //
        //
#else
        blank.data = sources[from.source].data();
#endif

@@ -316,6 +337,16 @@ template <typename F=double>
        }
      }

#if defined(HAVE_CUDA)
      // In cuda, SelfSufficient slices have an ad-hoc pointer,
      // since it is a pointer on the device and has to be
      // brought back to the free pointer bucket of the SliceUnion.
      // Therefore, only in the Recycled case should the pointer
      // not be put back.
      if (slice.info.state == Slice<F>::Recycled) {
        freeSlicePointer = false;
      }
#else
      // if the slice is self sufficient, do not dare touching the
      // pointer, since it is a pointer to our sources in our rank.
      if ( slice.info.state == Slice<F>::SelfSufficient
@@ -323,6 +354,7 @@ template <typename F=double>
         ) {
        freeSlicePointer = false;
      }
#endif

      // make sure we get its data pointer to be used later
      // only for non-recycled, since it can be that we need
@@ -346,7 +378,7 @@ template <typename F=double>
      // << " info " << slice.info
         << "\n";
        slice.free();
      }
    } // we did not find the slice

  }
}

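The freePointers handling above is essentially a fixed pool of preallocated device buffers that slices borrow and return. A minimal standalone sketch of that pattern (hypothetical names, not the SliceUnion API):

    // Sketch: a fixed pool of device pointers.
    #include <set>
    #include <stdexcept>
    #include <cstdint>

    using DevPtr = std::uintptr_t;  // stand-in for CUdeviceptr

    struct PointerPool {
      std::set<DevPtr> free_;  // filled once with preallocated buffers

      DevPtr acquire() {
        if (free_.empty())
          throw std::domain_error("No more free pointers");
        auto it = free_.begin();
        DevPtr p = *it;    // read before erase; the iterator dies with it
        free_.erase(it);
        return p;
      }

      void release(DevPtr p) { free_.insert(p); }
    };
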
@@ -380,10 +412,17 @@ template <typename F=double>

    for (auto& ptr: sliceBuffers) {
#if defined(HAVE_CUDA)
      cuMemAlloc(&ptr, sizeof(F) * sources[0].size());
      if (ptr == 0UL) {
        throw "INSUFFICIENT MEMORY ON THE GRAPHICS CARD FOR FREE POINTERS";
      }
      const CUresult error =
        cuMemAlloc(&ptr, sizeof(F) * sources[0].size());
      if (ptr == 0UL) {
        throw "INSUFFICIENT MEMORY ON THE GRAPHICS CARD FOR FREE POINTERS";
      }
      if (error != CUDA_SUCCESS) {
        std::stringstream s;
        s << "Error allocating memory for slice buffers "
          << "code " << error << "\n";
        throw s.str();
      }
#else
      ptr = (DataPtr<F>)malloc(sizeof(F) * sources[0].size());
#endif

@@ -80,6 +80,16 @@ struct Timer {
using Timings = std::map<std::string, Timer>;
// Chrono:1 ends here

// A nice handy macro to do formatting
#define _FORMAT(_fmt, ...)                                          \
  ([&] (void) -> std::string {                                      \
    int _sz = std::snprintf(nullptr, 0, _fmt, __VA_ARGS__);         \
    std::vector<char> _out(_sz + 1);                                \
    std::snprintf(&_out[0], _out.size(), _fmt, __VA_ARGS__);        \
    return std::string(_out.data());                                \
  })()


// [[file:~/cuda/atrip/atrip.org::*Epilog][Epilog:1]]
}
// Epilog:1 ends here

@@ -24,7 +24,7 @@

using namespace atrip;
#if defined(HAVE_CUDA)
#include <cuda.h>
#include <atrip/CUDA.hpp>
#endif

template <typename F> bool RankMap<F>::RANK_ROUND_ROBIN;
@@ -49,11 +49,6 @@ void Atrip::init(MPI_Comm world) {
  Atrip::communicator = world;
  MPI_Comm_rank(world, (int*)&Atrip::rank);
  MPI_Comm_size(world, (int*)&Atrip::np);

#if defined(HAVE_CUDA)
  Atrip::cuda.status = cublasCreate(&Atrip::cuda.handle);
#endif

}

template <typename F>

@@ -71,18 +66,24 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {

#if defined(HAVE_CUDA)
  int ngcards;
  _CHECK_CUDA_SUCCESS("initializing cuda",
                      cuInit(0));
  _CHECK_CUDA_SUCCESS("getting device count",
                      cuDeviceGetCount(&ngcards));
  const auto clusterInfo = getClusterInfo(Atrip::communicator);
  cuDeviceGetCount(&ngcards);
  LOG(0,"Atrip") << "ngcards: " << ngcards << "\n";
  if (clusterInfo.ranksPerNode > ngcards) {
    std::cerr << "ATRIP: You are running on more ranks per node than the number of graphic cards\n"
              << "You have " << ngcards << " cards at your disposal\n";
    throw "";
  }
  if (clusterInfo.ranksPerNode < ngcards) {
    std::cerr << "You have " << ngcards << " cards at your disposal\n"
              << "You will be only using " << clusterInfo.ranksPerNode
              << ", i.e., the number of ranks.\n";
    const auto msg
      = _FORMAT("ATRIP: You are running on more ranks per node than the number of graphic cards\n"
                "You have %d cards at your disposal\n", ngcards);
    std::cerr << msg;
    throw msg;
  } else if (clusterInfo.ranksPerNode < ngcards) {
    const auto msg
      = _FORMAT("You have %d cards at your disposal.\n"
                "You will be only using %d, i.e., the number of ranks\n",
                ngcards, clusterInfo.ranksPerNode);
    std::cerr << msg;
  }

@@ -94,16 +95,27 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
    struct { struct { size_t free, total; } avail; size_t total; } memory;
    char *name = (char*)malloc(256);

    // - TODO :: we should check that the assignment of graphic cards
    //   to nodes works as expected, i.e., node k should get from 0
    //   to ngcards with the formula =rank % ngcards=.

    // set current device
    cuDeviceGet(&dev, rank);
    cuCtxCreate(&ctx, 0, dev);
    cuCtxSetCurrent(ctx);
    _CHECK_CUDA_SUCCESS("getting device for index <rank>",
                        cuDeviceGet(&dev, rank % ngcards));
    _CHECK_CUDA_SUCCESS("creating a cuda context",
                        cuCtxCreate(&ctx, 0, dev));
    _CHECK_CUDA_SUCCESS("setting the context",
                        cuCtxSetCurrent(ctx));

    // get information of the device
    cuDeviceGetProperties(&prop, dev);
    cuMemGetInfo(&memory.avail.free, &memory.avail.total);
    cuDeviceGetName(name, 256, dev);
    cuDeviceTotalMem(&memory.total, dev);
    _CHECK_CUDA_SUCCESS("getting properties of current device",
                        cuDeviceGetProperties(&prop, dev));
    _CHECK_CUDA_SUCCESS("getting memory information",
                        cuMemGetInfo(&memory.avail.free, &memory.avail.total));
    _CHECK_CUDA_SUCCESS("getting name",
                        cuDeviceGetName(name, 256, dev));
    _CHECK_CUDA_SUCCESS("getting total memory",
                        cuDeviceTotalMem(&memory.total, dev));

    printf("\n"
           "CUDA CARD RANK %d\n"
@@ -124,6 +136,10 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
           memory.total / 1024.0 / 1024.0 / 1024.0
           );
    std::free((void*)name);

    _CHECK_CUBLAS_SUCCESS("creating a cublas handle",
                          cublasCreate(&Atrip::cuda.handle));

  }
  MPI_Barrier(universe);
}

@@ -163,17 +179,27 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {

#if defined(HAVE_CUDA)
  DataPtr<F> Tai, epsi, epsa;
  //TODO: free memory pointers in the end of the algorithm
  cuMemAlloc(&Tai, sizeof(F) * _Tai.size());
  cuMemAlloc(&epsi, sizeof(F) * _epsi.size());
  cuMemAlloc(&epsa, sizeof(F) * _epsa.size());

  cuMemcpyHtoD(Tai, (void*)_Tai.data(), sizeof(F) * _Tai.size());
  cuMemcpyHtoD(epsi, (void*)_epsi.data(), sizeof(F) * _epsi.size());
  cuMemcpyHtoD(epsa, (void*)_epsa.data(), sizeof(F) * _epsa.size());
  // TODO: free memory pointers in the end of the algorithm

  cuMemAlloc(&Tijk, sizeof(F) * No * No * No);
  cuMemAlloc(&Zijk, sizeof(F) * No * No * No);
  _CHECK_CUDA_SUCCESS("Tai",
                      cuMemAlloc(&Tai, sizeof(F) * _Tai.size()));
  _CHECK_CUDA_SUCCESS("epsi",
                      cuMemAlloc(&epsi, sizeof(F) * _epsi.size()));
  _CHECK_CUDA_SUCCESS("epsa",
                      cuMemAlloc(&epsa, sizeof(F) * _epsa.size()));

  _CHECK_CUDA_SUCCESS("memcpy Tai",
                      cuMemcpyHtoD(Tai, (void*)_Tai.data(), sizeof(F) * _Tai.size()));
  _CHECK_CUDA_SUCCESS("memcpy epsi",
                      cuMemcpyHtoD(epsi, (void*)_epsi.data(), sizeof(F) * _epsi.size()));
  _CHECK_CUDA_SUCCESS("memcpy epsa",
                      cuMemcpyHtoD(epsa, (void*)_epsa.data(), sizeof(F) * _epsa.size()));

  _CHECK_CUDA_SUCCESS("Tijk",
                      cuMemAlloc(&Tijk, sizeof(F) * No * No * No));
  _CHECK_CUDA_SUCCESS("Zijk",
                      cuMemAlloc(&Zijk, sizeof(F) * No * No * No));
#else
  std::vector<F> &Tai = _Tai, &epsi = _epsi, &epsa = _epsa;
  Zijk = (DataFieldType<F>*)malloc(No*No*No * sizeof(DataFieldType<F>));
@@ -266,8 +292,8 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {

  auto const isFakeTuple
    = [&tuplesList, distribution](size_t const i) {
    return distribution->tupleIsFake(tuplesList[i]);
  };
      return distribution->tupleIsFake(tuplesList[i]);
    };


  using Database = typename Slice<F>::Database;

|
||||
// [[file:~/cuda/atrip/atrip.org::*Blas][Blas:2]]
|
||||
#include <atrip/Blas.hpp>
|
||||
#include <atrip/Atrip.hpp>
|
||||
#include <atrip/CUDA.hpp>
|
||||
|
||||
#if defined(HAVE_CUDA)
|
||||
# include <cstring>
|
||||
|
||||
static inline
|
||||
cublasOperation_t char_to_cublasOperation(const char* trans) {
|
||||
if (strncmp("N", trans, 1) == 0)
|
||||
return CUBLAS_OP_N;
|
||||
else if (strncmp("T", trans, 1) == 0)
|
||||
return CUBLAS_OP_T;
|
||||
else
|
||||
return CUBLAS_OP_C;
|
||||
}
|
||||
static size_t dgem_call = 0;
|
||||
|
||||
static inline
|
||||
cublasOperation_t char_to_cublasOperation(const char* trans) {
|
||||
if (strncmp("N", trans, 1) == 0)
|
||||
return CUBLAS_OP_N;
|
||||
else if (strncmp("T", trans, 1) == 0)
|
||||
return CUBLAS_OP_T;
|
||||
else
|
||||
return CUBLAS_OP_C;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -49,13 +52,23 @@ namespace atrip {
                   typename DataField<double>::type *C,
                   const int *ldc) {
#if defined(HAVE_CUDA)
    cublasDgemm(Atrip::cuda.handle,
                char_to_cublasOperation(transa),
                char_to_cublasOperation(transb),
                *m, *n, *k,
                alpha, A, *lda,
                B, *ldb, beta,
                C, *ldc);
    // TODO: remove this verbose checking
    const cublasStatus_t error =
      cublasDgemm(Atrip::cuda.handle,
                  char_to_cublasOperation(transa),
                  char_to_cublasOperation(transb),
                  *m, *n, *k,
                  alpha, A, *lda,
                  B, *ldb, beta,
                  C, *ldc);
    if (error != 0) printf(":%-3ld (%4ldth) ERR<%4d> cublasDgemm: "
                           "A = %20ld "
                           "B = %20ld "
                           "C = %20ld "
                           "\n",
                           Atrip::rank,
                           dgem_call++,
                           error, A, B, C);
#else
    dgemm_(transa, transb,
           m, n, k,

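Since the wrapper keeps the Fortran BLAS calling convention (every scalar passed by pointer, column-major data), existing dgemm_ call sites keep working whether or not the CUDA branch is active. A hedged sketch of a call site, assuming the standard dgemm argument order for the middle parameters and the host (non-CUDA) build:

    // Sketch (not in the commit): column-major 4x4 matrices.
    int m = 4, n = 4, k = 4, lda = 4, ldb = 4, ldc = 4;
    double alpha = 1.0, beta = 0.0;
    double A[16] = {0}, B[16] = {0}, C[16] = {0};
    atrip::xgemm<double>("N", "T", &m, &n, &k,
                         &alpha, A, &lda,
                         B, &ldb, &beta,
                         C, &ldc);
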
@@ -83,16 +96,17 @@ namespace atrip {
    cuDoubleComplex
      cu_alpha = {std::real(*alpha), std::imag(*alpha)},
      cu_beta = {std::real(*beta), std::imag(*beta)};
    cublasZgemm(Atrip::cuda.handle,
                char_to_cublasOperation(transa),
                char_to_cublasOperation(transb),
                *m, *n, *k,
                &cu_alpha,

                A, *lda,
                B, *ldb,
                &cu_beta,
                C, *ldc);

    _CHECK_CUBLAS_SUCCESS("cublasZgemm",
                          cublasZgemm(Atrip::cuda.handle,
                                      char_to_cublasOperation(transa),
                                      char_to_cublasOperation(transb),
                                      *m, *n, *k,
                                      &cu_alpha,
                                      A, *lda,
                                      B, *ldb,
                                      &cu_beta,
                                      C, *ldc));
#else
    zgemm_(transa, transb,
           m, n, k,

@@ -14,31 +14,13 @@

// [[file:~/cuda/atrip/atrip.org::*Prolog][Prolog:2]]
#include <atrip/Equations.hpp>
#include <atrip/CUDA.hpp>

#if defined(HAVE_CUDA)
#include <cuda.h>
#endif
#include <atrip/CUDA.hpp>

namespace atrip {
// Prolog:2 ends here


// These are just helper structures
// for the templating of the reorder
// function
enum reordering_t
  {
    IJK,
    IKJ,
    JIK,
    JKI,
    KIJ,
    KJI
  };

template <typename F, reordering_t R>
struct reorder_proxy {};

#ifdef HAVE_CUDA
namespace cuda {

@@ -101,49 +83,42 @@ namespace cuda {
  template <>
  __device__
  cuDoubleComplex multiply(const cuDoubleComplex &a, const cuDoubleComplex &b) {
  return
    {a.x * b.x - a.y * b.y,
     a.x * b.y + a.y * b.x};
    return
      {a.x * b.x - a.y * b.y,
       a.x * b.y + a.y * b.x};
  }

  template <typename F>
  __device__
  void sum_in_place(F* to, const F* b);
  void sum_in_place(F* to, const F* from);

  template <>
  __device__
  void sum_in_place(double* to, const double *b) { *to += *b; }
  void sum_in_place(double* to, const double *from) { *to += *from; }

  template <>
  __device__
  void sum_in_place(cuDoubleComplex* to, const cuDoubleComplex* b) {
    to->x += b->x;
    to->y += b->y;
  }

  __device__
  cuDoubleComplex& operator+=(cuDoubleComplex& lz, cuDoubleComplex const& rz) {
    lz.x += rz.x;
    lz.y += rz.y;
    return lz;
  void sum_in_place(cuDoubleComplex* to, const cuDoubleComplex* from) {
    to->x += from->x;
    to->y += from->y;
  }

};
#endif

#if defined(HAVE_CUDA)
#define LIMS_KS()                                                   \
  size_t kmin = blockIdx.x * blockDim.x + threadIdx.x,              \
         k = kmin,                                                  \
         idx = kmin * size * size * size                            \
         ;                                                          \
  k < ((kmin < size) ? kmin + 1 : size)
#define FOR_K()                                                     \
  for (size_t kmin = blockIdx.x * blockDim.x + threadIdx.x,         \
              k = kmin,                                             \
              idx = kmin * size * size * size;                      \
       k < ((kmin < size) ? kmin + 1 : size);                       \
       k++)
#else
#define LIMS_KS size_t k=0, idx=0; k < size
#define FOR_K() for (size_t k=0, idx=0; k < size; k++)
#endif
#define _IJK_(i, j, k) i + j*size + k*size*size
#define _REORDER_BODY_(...)                                         \
  for (LIMS_KS() ; k++)                                             \
  FOR_K()                                                           \
    for (size_t j = 0; j < size; j++)                               \
      for (size_t i = 0; i < size; i++, idx++) {                    \
        __VA_ARGS__                                                 \

@@ -152,7 +127,9 @@ namespace cuda {
  template <typename F>                                             \
  __MAYBE_GLOBAL__                                                  \
  void reorder(reorder_proxy< F, _enum > p,                         \
               size_t size, F* to, F* from) {                       \
               size_t size,                                         \
               F* to,                                               \
               F* from) {                                           \
    _REORDER_BODY_(__VA_ARGS__)                                     \
  }
#if defined(HAVE_CUDA)
@@ -161,9 +138,31 @@ namespace cuda {
#define GO(__TO, __FROM) __TO += __FROM;
#endif

// These are just helper structures
// for the templating of the reorder
// function
enum reordering_t
  {
    IJK,
    IKJ,
    JIK,
    JKI,
    KIJ,
    KJI
  };

/*
 * Please the c++ type checker and template creator
 * in order to have an argument in the signature of
 * the function that helps the compiler know which
 * instantiation it should take.
 *
 */
template <typename F, reordering_t R>
struct reorder_proxy {};

template <typename F, reordering_t R>
__MAYBE_GLOBAL__ \
__MAYBE_GLOBAL__
void reorder(reorder_proxy<F, R> proxy,
             size_t size, F* to, F* from);

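The reorder_proxy comment above describes plain tag dispatch: an empty type whose template argument steers overload resolution, so the compiler knows which instantiation to pick. A self-contained sketch of the idea (hypothetical names, not the atrip API):

    // Sketch: an empty proxy type carries the compile-time reordering.
    #include <cstdio>

    enum reordering_t { IJK, IKJ };

    template <reordering_t R> struct proxy {};

    void reorder(proxy<IJK>) { std::puts("i, j, k order"); }
    void reorder(proxy<IKJ>) { std::puts("i, k, j order"); }

    int main() {
      reorder(proxy<IJK>{});  // resolved at compile time
      reorder(proxy<IKJ>{});
      return 0;
    }
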
@@ -445,10 +444,14 @@ double getEnergySame

#if defined(ATRIP_USE_DGEMM)
#if defined(HAVE_CUDA)
#define REORDER(__II, __JJ, __KK)                                   \
  reorder<<<bs, ths>>>(reorder_proxy<DataFieldType<F>,              \
                       __II ## __JJ ## __KK >{},                    \
                       No, Tijk, _t_buffer);
#define REORDER(__II, __JJ, __KK)                                   \
  reorder<<<bs, ths>>>(reorder_proxy<                               \
                         DataFieldType<F>,                          \
                         __II ## __JJ ## __KK                       \
                       >{},                                         \
                       No,                                          \
                       Tijk,                                        \
                       _t_buffer);
#define DGEMM_PARTICLES(__A, __B)                                   \
  atrip::xgemm<F>("T",                                              \
                  "N",                                              \
@@ -462,7 +465,7 @@ double getEnergySame
                  (int const*)&Nv,                                  \
                  &zero,                                            \
                  _t_buffer,                                        \
                  (int const*)&NoNo);
                  (int const*)&NoNo)
#define DGEMM_HOLES(__A, __B, __TRANSB)                             \
  atrip::xgemm<F>("N",                                              \
                  __TRANSB,                                         \
@@ -477,7 +480,7 @@ double getEnergySame
                  &zero,                                            \
                  _t_buffer,                                        \
                  (int const*)&NoNo                                 \
                  );
                  )
#define MAYBE_CONJ(_conj, _buffer)                                  \
  cuda::maybeConjugate<<<                                           \
    Atrip::kernelDimensions.ooo.blocks,                             \
@@ -512,7 +515,7 @@ double getEnergySame
                  &zero,                                            \
                  _t_buffer,                                        \
                  (int const*)&NoNo                                 \
                  );
                  )
#define DGEMM_HOLES(__A, __B, __TRANSB)                             \
  atrip::xgemm<F>("N",                                              \
                  __TRANSB,                                         \
@@ -527,7 +530,7 @@ double getEnergySame
                  &zero,                                            \
                  _t_buffer,                                        \
                  (int const*)&NoNo                                 \
                  );
                  )
#define MAYBE_CONJ(_conj, _buffer)                                  \
  for (size_t __i = 0; __i < NoNoNo; ++__i)                         \
    _conj[__i] = maybeConjugate<F>(_buffer[__i]);

@@ -538,8 +541,14 @@ double getEnergySame
#ifdef HAVE_CUDA
    DataFieldType<F>* _t_buffer;
    DataFieldType<F>* _vhhh;
    cuMemAlloc((CUdeviceptr*)&_t_buffer, NoNoNo * sizeof(DataFieldType<F>));
    cuMemAlloc((CUdeviceptr*)&_vhhh, NoNoNo * sizeof(DataFieldType<F>));
    WITH_CHRONO("double:cuda:alloc",
                _CHECK_CUDA_SUCCESS("Allocating _t_buffer",
                                    cuMemAlloc((CUdeviceptr*)&_t_buffer,
                                               NoNoNo * sizeof(DataFieldType<F>)));
                _CHECK_CUDA_SUCCESS("Allocating _vhhh",
                                    cuMemAlloc((CUdeviceptr*)&_vhhh,
                                               NoNoNo * sizeof(DataFieldType<F>)));
                )
    const size_t
      bs = Atrip::kernelDimensions.ooo.blocks,
      ths = Atrip::kernelDimensions.ooo.threads;
@@ -560,7 +569,6 @@ double getEnergySame
    WITH_CHRONO("double:reorder",
                cuda::zeroing<<<bs, ths>>>((DataFieldType<F>*)Tijk,
                                           NoNoNo);
                // synchronize all initializations to zero
                )
#else
    WITH_CHRONO("double:reorder",

@@ -575,37 +583,37 @@ double getEnergySame
      // VhhhC[i + k*No + L*NoNo] * TABhh[L + j*No]; H1
      MAYBE_CONJ(_vhhh, VhhhC)
      WITH_CHRONO("doubles:holes:1",
                  DGEMM_HOLES(_vhhh, TABhh, "N")
                  REORDER(I, K, J)
                  DGEMM_HOLES(_vhhh, TABhh, "N");
                  REORDER(I, K, J)
                  )
      // VhhhC[j + k*No + L*NoNo] * TABhh[i + L*No]; H0
      WITH_CHRONO("doubles:holes:2",
                  DGEMM_HOLES(_vhhh, TABhh, "T")
                  REORDER(J, K, I)
                  DGEMM_HOLES(_vhhh, TABhh, "T");
                  REORDER(J, K, I)
                  )

      // VhhhB[i + j*No + L*NoNo] * TAChh[L + k*No]; H5
      MAYBE_CONJ(_vhhh, VhhhB)
      WITH_CHRONO("doubles:holes:3",
                  DGEMM_HOLES(_vhhh, TAChh, "N")
                  REORDER(I, J, K)
                  DGEMM_HOLES(_vhhh, TAChh, "N");
                  REORDER(I, J, K)
                  )
      // VhhhB[k + j*No + L*NoNo] * TAChh[i + L*No]; H3
      WITH_CHRONO("doubles:holes:4",
                  DGEMM_HOLES(_vhhh, TAChh, "T")
                  REORDER(K, J, I)
                  DGEMM_HOLES(_vhhh, TAChh, "T");
                  REORDER(K, J, I)
                  )

      // VhhhA[j + i*No + L*NoNo] * TBChh[L + k*No]; H1
      MAYBE_CONJ(_vhhh, VhhhA)
      WITH_CHRONO("doubles:holes:5",
                  DGEMM_HOLES(_vhhh, TBChh, "N")
                  REORDER(J, I, K)
                  DGEMM_HOLES(_vhhh, TBChh, "N");
                  REORDER(J, I, K)
                  )
      // VhhhA[k + i*No + L*NoNo] * TBChh[j + L*No]; H4
      WITH_CHRONO("doubles:holes:6",
                  DGEMM_HOLES(_vhhh, TBChh, "T")
                  REORDER(K, I, J)
                  DGEMM_HOLES(_vhhh, TBChh, "T");
                  REORDER(K, I, J)
                  )
    }
    )

@@ -616,33 +624,33 @@ double getEnergySame
    {
      // TAphh[E + i*Nv + j*NoNv] * VBCph[E + k*Nv]; P0
      WITH_CHRONO("doubles:particles:1",
                  DGEMM_PARTICLES(TAphh, VBCph)
                  REORDER(I, J, K)
                  DGEMM_PARTICLES(TAphh, VBCph);
                  REORDER(I, J, K)
                  )
      // TAphh[E + i*Nv + k*NoNv] * VCBph[E + j*Nv]; P3
      WITH_CHRONO("doubles:particles:2",
                  DGEMM_PARTICLES(TAphh, VCBph)
                  REORDER(I, K, J)
                  DGEMM_PARTICLES(TAphh, VCBph);
                  REORDER(I, K, J)
                  )
      // TCphh[E + k*Nv + i*NoNv] * VABph[E + j*Nv]; P5
      WITH_CHRONO("doubles:particles:3",
                  DGEMM_PARTICLES(TCphh, VABph)
                  REORDER(K, I, J)
                  DGEMM_PARTICLES(TCphh, VABph);
                  REORDER(K, I, J)
                  )
      // TCphh[E + k*Nv + j*NoNv] * VBAph[E + i*Nv]; P2
      WITH_CHRONO("doubles:particles:4",
                  DGEMM_PARTICLES(TCphh, VBAph)
                  REORDER(K, J, I)
                  DGEMM_PARTICLES(TCphh, VBAph);
                  REORDER(K, J, I)
                  )
      // TBphh[E + j*Nv + i*NoNv] * VACph[E + k*Nv]; P1
      WITH_CHRONO("doubles:particles:5",
                  DGEMM_PARTICLES(TBphh, VACph)
                  REORDER(J, I, K)
                  DGEMM_PARTICLES(TBphh, VACph);
                  REORDER(J, I, K)
                  )
      // TBphh[E + j*Nv + k*NoNv] * VCAph[E + i*Nv]; P4
      WITH_CHRONO("doubles:particles:6",
                  DGEMM_PARTICLES(TBphh, VCAph)
                  REORDER(J, K, I)
                  DGEMM_PARTICLES(TBphh, VCAph);
                  REORDER(J, K, I)
                  )
    }
    )

@@ -651,9 +659,12 @@ double getEnergySame
#ifdef HAVE_CUDA
    // we need to synchronize here since we need
    // the Tijk for the next process in the pipeline
    cuCtxSynchronize();
    cuMemFree((CUdeviceptr)_vhhh);
    cuMemFree((CUdeviceptr)_t_buffer);
    _CHECK_CUDA_SUCCESS("Synchronizing",
                        cuCtxSynchronize());
    _CHECK_CUDA_SUCCESS("Freeing _vhhh",
                        cuMemFree((CUdeviceptr)_vhhh));
    _CHECK_CUDA_SUCCESS("Freeing _t_buffer",
                        cuMemFree((CUdeviceptr)_t_buffer));
#else
    free(_vhhh);
    free(_t_buffer);