diff --git a/bench/Makefile.am b/bench/Makefile.am
index 39ce3c3..8897ddf 100644
--- a/bench/Makefile.am
+++ b/bench/Makefile.am
@@ -1,3 +1,4 @@
+AUTOMAKE_OPTIONS = subdir-objects
 include $(top_srcdir)/atrip.mk
 
 AM_CPPFLAGS = -I$(top_srcdir)/include/ $(CTF_CPPFLAGS)
@@ -19,4 +20,13 @@ endif
 if WITH_CUDA
 test_main_CXXFLAGS = $(CUDA_CXXFLAGS)
 test_main_LDADD += $(CUDA_LDFLAGS)
+
+AM_CXXFLAGS = $(CUDA_CXXFLAGS)
+AM_LDFLAGS += $(CUDA_LDFLAGS)
+
+bin_PROGRAMS += test-cublas-parallel-atrip
+test_cublas_parallel_atrip_SOURCES = test-cublas-parallel-atrip.cxx
+
+bin_PROGRAMS += test-cuda-sanity
+test_cuda_sanity_SOURCES = test-cuda-sanity.cxx
 endif
diff --git a/bench/test-cublas-parallel-atrip.cxx b/bench/test-cublas-parallel-atrip.cxx
new file mode 100644
index 0000000..2215ff2
--- /dev/null
+++ b/bench/test-cublas-parallel-atrip.cxx
@@ -0,0 +1,239 @@
+#include <mpi.h>
+#include <unistd.h>
+#include <algorithm>
+#include <chrono>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <map>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <CLI11.hpp>
+
+#define CUBLASAPI
+#include <cuda.h>
+#include <cublas_v2.h>
+
+struct Timer {
+  using Clock = std::chrono::high_resolution_clock;
+  using Event = std::chrono::time_point<Clock>;
+  std::chrono::duration<double> duration;
+  Event _start;
+  inline void start() noexcept { _start = Clock::now(); }
+  inline void stop() noexcept { duration += Clock::now() - _start; }
+  inline void clear() noexcept { duration *= 0; }
+  inline double count() const noexcept { return duration.count(); }
+};
+using Timings = std::map<std::string, Timer>;
+
+#define _FORMAT(_fmt, ...)                                              \
+  ([&](void) -> std::string {                                           \
+    int _sz = std::snprintf(nullptr, 0, _fmt, __VA_ARGS__);             \
+    std::vector<char> _out(_sz + 1);                                    \
+    std::snprintf(&_out[0], _out.size(), _fmt, __VA_ARGS__);            \
+    return std::string(_out.data());                                    \
+  })()
+
+#define _CHECK_CUDA_SUCCESS(message, ...)                               \
+  do {                                                                  \
+    CUresult result = __VA_ARGS__;                                      \
+    printf("doing %s\n", message);                                      \
+    if (result != CUDA_SUCCESS) {                                       \
+      printf("\t!!CUDA_ERROR(%d): %s:%d %s\n",                          \
+             result,                                                    \
+             __FILE__,                                                  \
+             __LINE__,                                                  \
+             message);                                                  \
+      return 1;                                                         \
+    }                                                                   \
+  } while (0)
+
+#define _CHECK_CUBLAS_SUCCESS(message, ...)                             \
+  do {                                                                  \
+    cublasStatus_t result = __VA_ARGS__;                                \
+    if (result != 0) {                                                  \
+      printf("\t!!CUBLAS_ERROR(%d): %s:%d %s\n",                        \
+             result,                                                    \
+             __FILE__,                                                  \
+             __LINE__,                                                  \
+             message);                                                  \
+      return 1;                                                         \
+    }                                                                   \
+  } while (0)
+
+int main(int argc, char** argv) {
+
+  using std::vector;
+
+  MPI_Init(NULL, NULL);
+  int rank, np, ngcards;
+  size_t no(10), nv(no * 10), its(2);
+  bool barrier = false;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  MPI_Comm_size(MPI_COMM_WORLD, &np);
+
+  CLI::App app{"Main bench for atrip"};
+  app.add_option("--no", no, "Occupied orbitals");
+  app.add_option("--nv", nv, "Virtual orbitals");
+  app.add_option("--its", its, "Number of iterations to be done");
+  app.add_option("--barrier", barrier, "Call an MPI_Barrier in every iteration?");
+  CLI11_PARSE(app, argc, argv);
+
+  const size_t oo = no * no, ooo = no * oo;
+
+  Timings timings;
+
+  MPI_Barrier(MPI_COMM_WORLD);
+
+  _CHECK_CUDA_SUCCESS("init for cuda", cuInit(0));
+
+  _CHECK_CUDA_SUCCESS("get ncards", cuDeviceGetCount(&ngcards));
+
+  CUcontext ctx;
+  CUdevice dev;
+
+  char hostname[256];
+  gethostname(hostname, 256);
+  printf("%s with rank %d gets card %d\n", hostname, rank, rank % ngcards);
+
+  // set contexts: one context per rank, cards assigned round-robin
+  _CHECK_CUDA_SUCCESS("device get", cuDeviceGet(&dev, rank % ngcards));
+  _CHECK_CUDA_SUCCESS("creating context", cuCtxCreate(&ctx, 0, dev));
+  _CHECK_CUDA_SUCCESS("setting context", cuCtxSetCurrent(ctx));
+  _CHECK_CUDA_SUCCESS("synchronizing", cuCtxSynchronize());
+  MPI_Barrier(MPI_COMM_WORLD);
+
+  using host_slice_t = vector<double>;
+
+  // Device and host buffers mimicking atrip's slices: 3 x (nv*oo),
+  // 6 x (nv*no), 3 x oo, 3 x ooo and one ooo scratch slice.
+  vector<size_t> sizes = {nv * oo, nv * no, oo, oo * no, oo * no};
+  vector<CUdeviceptr> P_phh(3), P_ph(6), H_hh(3), H_hhh(3), T_hhh(1);
+  vector<vector<CUdeviceptr>*> slices_d = {&P_phh, &P_ph, &H_hh, &H_hhh, &T_hhh};
+  vector<vector<host_slice_t>> slices_h(slices_d.size());
+  {
+    int i = -1;
+    for (auto& v : slices_d) {
+      i++;
+      for (auto& ptr : *v) {
+        _CHECK_CUDA_SUCCESS("malloc",
+                            cuMemAlloc(&ptr, sizes[i] * sizeof(double)));
+        slices_h[i].push_back(std::vector<double>(sizes[i]));
+      }
+    }
+  }
+
+  const double one = 1.0, zero = 0.0;
+
+  printf("its: %ld\n", its);
+  printf("barrier: %d\n", barrier);
+  printf("no: %ld\n", no);
+  printf("nv: %ld\n", nv);
+  printf("SIZE %f GB\n",
+         (3 * nv * oo + 6 * oo * nv + 3 * oo + 3 * ooo + 1 * ooo)
+           * sizeof(double) / 1024.0 / 1024.0 / 1024.0);
+  std::map<std::string, double> tflopss
+    {{"dgemm", ooo * (no + nv) * 6.0 * 2.0 * its / 1e12},
+     {"holes", ooo * no * 6.0 * 2.0 * its / 1e12},
+     {"particles", ooo * nv * 6.0 * 2.0 * its / 1e12}};
+
+  cublasHandle_t handle;
+  _CHECK_CUBLAS_SUCCESS("handle create", cublasCreate(&handle));
+  printf("handle %p\n", (void*)handle);
+
+  timings["dgemm"].start();
+  for (size_t i = 0; i < its; i++) {
+
+    if (barrier) {
+      MPI_Barrier(MPI_COMM_WORLD);
+      timings["memcpy"].start();
+      for (size_t _s = 0; _s < slices_d.size(); _s++) {
+        // for (size_t _b = 0; _b < slices_h[_s].size(); _b++) {
+        for (size_t _b = 0; _b < 1; _b++) {
+          auto device = (*slices_d[_s])[_b];
+          auto host = slices_h[_s][_b].data();
+          // cuMemcpyHtoD expects a byte count
+          cuMemcpyHtoD(device, host, sizes[_s] * sizeof(double));
+        }
+      }
+      timings["memcpy"].stop();
+    }
+
+    // "holes" contractions: 6 DGEMMs per iteration with m = oo, n = no, k = no
+    timings["holes"].start();
+    for (size_t j = 0; j < 3; j++) {
+
+      _CHECK_CUBLAS_SUCCESS(" > 'geming ...",
+                            cublasDgemm(handle,
+                                        CUBLAS_OP_N,
+                                        CUBLAS_OP_N,
+                                        oo, no, no,
+                                        &one,
+                                        (double*)H_hhh[j], oo,
+                                        (double*)H_hh[j], no,
+                                        &zero,
+                                        (double*)T_hhh[0], oo));
+
+      _CHECK_CUBLAS_SUCCESS(" > 'geming ...",
+                            cublasDgemm(handle,
+                                        CUBLAS_OP_N,
+                                        CUBLAS_OP_T,
+                                        oo, no, no,
+                                        &one,
+                                        (double*)H_hhh[j], oo,
+                                        (double*)H_hh[j], no,
+                                        &zero,
+                                        (double*)T_hhh[0], oo));
+    }
+    timings["holes"].stop();
+
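+    // "particles" contractions below: 6 DGEMMs per iteration with
+    // m = oo, n = no, k = nv, i.e. in cuBLAS' column-major convention
+    //   T_hhh(oo x no) = P_phh(nv x oo)^T * P_ph(nv x no),
+    // so each call costs 2 * ooo * nv flops; this is where the
+    // "particles" entry of tflopss above comes from.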
+    timings["particles"].start();
+    for (size_t j = 0; j < 6; j++) {
+      _CHECK_CUBLAS_SUCCESS(" > 'geming ...",
+                            cublasDgemm(handle,
+                                        CUBLAS_OP_T,
+                                        CUBLAS_OP_N,
+                                        oo, no, nv,
+                                        &one,
+                                        (double*)P_phh[j % 3], nv,
+                                        (double*)P_ph[j], nv,
+                                        &zero,
+                                        (double*)T_hhh[0], oo));
+    }
+    timings["particles"].stop();
+
+    cuCtxSynchronize();
+
+  }
+
+  timings["dgemm"].stop();
+
+  printf("Performance: \n");
+  for (auto name : {"holes", "particles", "dgemm"})
+    printf("%10s TFlops: %4.1f\n",
+           name,
+           tflopss[name] / timings[name].count());
+
+  printf("Timings: \n");
+  for (auto const& kv : timings)
+    printf("%10s: %10f\n", kv.first.c_str(), kv.second.count());
+
+  MPI_Finalize();
+  return 0;
+}
diff --git a/bench/test-cublas-parallel.cxx b/bench/test-cublas-parallel.cxx
new file mode 100644
index 0000000..6fdfc93
--- /dev/null
+++ b/bench/test-cublas-parallel.cxx
@@ -0,0 +1,163 @@
+#include <mpi.h>
+#include <unistd.h>
+#include <algorithm>
+#include <chrono>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <map>
+#include <string>
+#include <utility>
+#include <vector>
+
+#define CUBLASAPI
+#include <cuda.h>
+#include <cublas_v2.h>
+
+struct Timer {
+  using Clock = std::chrono::high_resolution_clock;
+  using Event = std::chrono::time_point<Clock>;
+  std::chrono::duration<double> duration;
+  Event _start;
+  inline void start() noexcept { _start = Clock::now(); }
+  inline void stop() noexcept { duration += Clock::now() - _start; }
+  inline void clear() noexcept { duration *= 0; }
+  inline double count() const noexcept { return duration.count(); }
+};
+using Timings = std::map<std::string, Timer>;
+
+#define _FORMAT(_fmt, ...)                                              \
+  ([&](void) -> std::string {                                           \
+    int _sz = std::snprintf(nullptr, 0, _fmt, __VA_ARGS__);             \
+    std::vector<char> _out(_sz + 1);                                    \
+    std::snprintf(&_out[0], _out.size(), _fmt, __VA_ARGS__);            \
+    return std::string(_out.data());                                    \
+  })()
+
+#define _CHECK_CUDA_SUCCESS(message, ...)                               \
+  do {                                                                  \
+    CUresult result = __VA_ARGS__;                                      \
+    printf("doing %s\n", message);                                      \
+    if (result != CUDA_SUCCESS) {                                       \
+      printf("\t!!CUDA_ERROR(%d): %s:%d %s\n",                          \
+             result,                                                    \
+             __FILE__,                                                  \
+             __LINE__,                                                  \
+             message);                                                  \
+      return 1;                                                         \
+    }                                                                   \
+  } while (0)
+
+#define _CHECK_CUBLAS_SUCCESS(message, ...)                             \
+  do {                                                                  \
+    cublasStatus_t result = __VA_ARGS__;                                \
+    printf("CUBLAS: doing %s\n", message);                              \
+    if (result != 0) {                                                  \
+      printf("\t!!CUBLAS_ERROR(%d): %s:%d %s\n",                        \
+             result,                                                    \
+             __FILE__,                                                  \
+             __LINE__,                                                  \
+             message);                                                  \
+      return 1;                                                         \
+    }                                                                   \
+  } while (0)
+
+int main(int argc, char** argv) {
+  MPI_Init(NULL, NULL);
+  int rank, np, ngcards;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  MPI_Comm_size(MPI_COMM_WORLD, &np);
+
+  Timings timings;
+
+  MPI_Barrier(MPI_COMM_WORLD);
+
+  _CHECK_CUDA_SUCCESS("init for cuda", cuInit(0));
+
+  _CHECK_CUDA_SUCCESS("get ncards", cuDeviceGetCount(&ngcards));
+
+  // Matrix dimension, number of dgemm calls and the resulting flop count
+  // come from the command line: argv[1] = N, argv[2] = dgemms.
+  const size_t N = argc > 1 ? atoi(argv[1]) : 30000,
+               dgemms = argc > 2 ? atoi(argv[2]) : 2,
+               flops = 2 * N * N * N * dgemms;
+
+  CUcontext ctx;
+  CUdevice dev;
+
+  char hostname[256];
+  gethostname(hostname, 256);
+  printf("%s with rank %d gets card %d\n", hostname, rank, rank % ngcards);
+
+  // set contexts
+  _CHECK_CUDA_SUCCESS("device get", cuDeviceGet(&dev, rank % ngcards));
+  _CHECK_CUDA_SUCCESS("creating context", cuCtxCreate(&ctx, 0, dev));
+  _CHECK_CUDA_SUCCESS("setting context", cuCtxSetCurrent(ctx));
+  _CHECK_CUDA_SUCCESS("synchronizing", cuCtxSynchronize());
+  MPI_Barrier(MPI_COMM_WORLD);
+
+  CUdeviceptr A, B, C;
+  const double one = 1.0;
+  printf("SIZE %f GB\n", N * N * sizeof(double) / 1024.0 / 1024.0 / 1024.0);
+  _CHECK_CUDA_SUCCESS("A", cuMemAlloc(&A, N * N * sizeof(double)));
+  _CHECK_CUDA_SUCCESS("B", cuMemAlloc(&B, N * N * sizeof(double)));
+  _CHECK_CUDA_SUCCESS("C", cuMemAlloc(&C, N * N * sizeof(double)));
+
+  cublasHandle_t handle;
+  _CHECK_CUBLAS_SUCCESS("handle create", cublasCreate(&handle));
+  printf("handle %p\n", (void*)handle);
+
+  timings["dgemm"].start();
+  for (size_t i = 0; i < dgemms; i++) {
+    _CHECK_CUBLAS_SUCCESS(_FORMAT(" > 'geming %ld ...", i).c_str(),
+                          cublasDgemm(handle,
+                                      CUBLAS_OP_N,
+                                      CUBLAS_OP_N,
+                                      N, N, N,
+                                      &one,
+                                      (double*)A, N,
+                                      (double*)B, N,
+                                      &one,
+                                      (double*)C, N));
+  }
+
+  cuCtxSynchronize();
+  timings["dgemm"].stop();
+
+  printf("dgemm Gflops: %f\n",
+         flops / timings["dgemm"].count() / 1024.0 / 1024.0 / 1024.0);
+
+  MPI_Finalize();
+  return 0;
+}
+
+// Local Variables:
+// compile-command: "mpic++ \
+//    -pedantic -std=c++11 \
+//    -L./cudaroot/lib64 -lcuda \
+//    -L./cudaroot/lib64 -lcudart \
+//    -L./cudaroot/lib64 -lcublas \
+//    ./test-cublas-parallel.cxx -o test-cublas-parallel"
+// End:
diff --git a/bench/test-cuda-sanity.cxx b/bench/test-cuda-sanity.cxx
new file mode 100644
index 0000000..bbb9147
--- /dev/null
+++ b/bench/test-cuda-sanity.cxx
@@ -0,0 +1,104 @@
+#include <mpi.h>
+#include <unistd.h>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <string>
+#include <vector>
+
+#include <cuda.h>
+
+#define _CHECK_CUDA_SUCCESS(message, ...)                               \
+  do {                                                                  \
+    CUresult result = __VA_ARGS__;                                      \
+    printf("doing %s\n", message);                                      \
+    if (result != CUDA_SUCCESS) {                                       \
+      printf("\t!!CUDA_ERROR(%d): %s:%d %s\n",                          \
+             result,                                                    \
+             __FILE__,                                                  \
+             __LINE__,                                                  \
+             message);                                                  \
+      return 1;                                                         \
+    }                                                                   \
+  } while (0)
+
+int main() {
+  int rank, np, ngcards;
+  MPI_Init(NULL, NULL);
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  MPI_Comm_size(MPI_COMM_WORLD, &np);
+
+  _CHECK_CUDA_SUCCESS("init for cuda", cuInit(0));
+
+  _CHECK_CUDA_SUCCESS("get ncards", cuDeviceGetCount(&ngcards));
+
+  // Walk over every visible card, create a context on it and print
+  // basic properties plus the free/total memory it reports.
+  for (size_t rank = 0; rank < (size_t)ngcards; rank++) {
+    CUcontext ctx;
+    CUdevice dev;
+    CUdevprop_st prop;
+    size_t _free, total, total2;
+    char *name = (char*)malloc(256);
+
+    printf("Setting contexts\n");
+    // set contexts
+    _CHECK_CUDA_SUCCESS("device get", cuDeviceGet(&dev, rank));
+    _CHECK_CUDA_SUCCESS("creating context", cuCtxCreate(&ctx, 0, dev));
+    _CHECK_CUDA_SUCCESS("setting context", cuCtxSetCurrent(ctx));
+    _CHECK_CUDA_SUCCESS("synchronizing", cuCtxSynchronize());
+
+    _CHECK_CUDA_SUCCESS("prop get", cuDeviceGetProperties(&prop, dev));
+    _CHECK_CUDA_SUCCESS("meminfo get", cuMemGetInfo(&_free, &total));
+    _CHECK_CUDA_SUCCESS("name get", cuDeviceGetName(name, 256, dev));
+    _CHECK_CUDA_SUCCESS("totalmem get", cuDeviceTotalMem(&total2, dev));
+
+    printf("\n"
+           "CUDA CARD RANK %d\n"
+           "=================\n"
+           "\tname: %s\n"
+           "\tShared Mem Per Block (KB): %f\n"
+           "\tFree/Total mem (GB): %f/%f\n"
+           "\ttotal2 mem (GB): %f\n"
+           "\n",
+           dev,
+           name,
+           prop.sharedMemPerBlock / 1024.0,
+           _free / 1024.0 / 1024.0 / 1024.0,
+           total / 1024.0 / 1024.0 / 1024.0,
+           total2 / 1024.0 / 1024.0 / 1024.0);
+
+    if (_free == 0 || total == 0 || total2 == 0)
+      return 1;
+
+    CUdeviceptr data;
+    _CHECK_CUDA_SUCCESS("memalloc 1",
+                        cuMemAlloc(&data, sizeof(double) * 10000));
+    _CHECK_CUDA_SUCCESS("memalloc 2",
+                        cuMemAlloc(&data, sizeof(double) * 10000));
+  }
+
+  MPI_Finalize();
+
+  return 0;
+}
+
+// Local Variables:
+// compile-command: "mpic++ \
+//    -pedantic -std=c++11 \
+//    -L./cudaroot/lib64 -lcuda \
+//    -L./cudaroot/lib64 -lcudart \
+//    -L./cudaroot/lib64 -lcublas \
+//    ./mem.cxx -o mem"
+// End:
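
Usage sketch (illustrative only): with the automake rules above, test-cublas-parallel-atrip and test-cuda-sanity are built by the regular make run, while test-cublas-parallel.cxx carries its own compile-command in its trailing comment. Ranks are mapped to cards round-robin (rank % ngcards), so one MPI rank per GPU is the intended setup; the launcher name, rank count and option values below are assumptions, not taken from the patch:

    mpirun -np 4 ./test-cublas-parallel-atrip --no 20 --nv 200 --its 5 --barrier 1
    mpirun -np 4 ./test-cublas-parallel 10000 4
    mpirun -np 1 ./test-cuda-sanity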