#include <chrono>
#include <cstdio>
#include <map>
#include <string>
#include <vector>

#include <unistd.h>

#include <mpi.h>

#include <CLI11.hpp>

#include <cuda.h>
#define CUBLASAPI
#include <cublas_api.h>
#include <cublas_v2.h>

// Simple accumulating wall-clock timer.
struct Timer {
  using Clock = std::chrono::high_resolution_clock;
  using Event = std::chrono::time_point<Clock>;
  std::chrono::duration<double> duration{0.0};
  Event _start;
  inline void start() noexcept { _start = Clock::now(); }
  inline void stop() noexcept { duration += Clock::now() - _start; }
  inline void clear() noexcept { duration *= 0; }
  inline double count() const noexcept { return duration.count(); }
};

using Timings = std::map<std::string, Timer>;

// printf-style formatting into a std::string.
#define _FORMAT(_fmt, ...)                                     \
  ([&](void) -> std::string {                                  \
    int _sz = std::snprintf(nullptr, 0, _fmt, __VA_ARGS__);    \
    std::vector<char> _out(_sz + 1);                           \
    std::snprintf(&_out[0], _out.size(), _fmt, __VA_ARGS__);   \
    return std::string(_out.data());                           \
  })()

// Check a CUDA driver API call and bail out of main on failure.
#define _CHECK_CUDA_SUCCESS(message, ...)          \
  do {                                             \
    CUresult result = __VA_ARGS__;                 \
    printf("doing %s\n", message);                 \
    if (result != CUDA_SUCCESS) {                  \
      printf("\t!!CUDA_ERROR(%d): %s:%d %s\n",     \
             result,                               \
             __FILE__,                             \
             __LINE__,                             \
             message);                             \
      return 1;                                    \
    }                                              \
  } while (0)

// Check a cuBLAS call and bail out of main on failure.
#define _CHECK_CUBLAS_SUCCESS(message, ...)        \
  do {                                             \
    cublasStatus_t result = __VA_ARGS__;           \
    if (result != CUBLAS_STATUS_SUCCESS) {         \
      printf("\t!!CUBLAS_ERROR(%d): %s:%d %s\n",   \
             result,                               \
             __FILE__,                             \
             __LINE__,                             \
             message);                             \
      return 1;                                    \
    }                                              \
  } while (0)

int main(int argc, char** argv) {
  using std::vector;

  MPI_Init(NULL, NULL);

  int rank, np, ngcards;
  size_t no(10), nv(no * 10), its(2);
  bool barrier = false;

  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &np);

  CLI::App app{"Main bench for atrip"};
  app.add_option("--no", no, "Occupied orbitals");
  app.add_option("--nv", nv, "Virtual orbitals");
  app.add_option("--its", its, "Number of iterations to be done");
  app.add_option("--barrier", barrier, "Call an MPI_Barrier in every iteration?");
  CLI11_PARSE(app, argc, argv);

  const size_t oo = no * no, ooo = no * oo;
  Timings timings;

  MPI_Barrier(MPI_COMM_WORLD);

  _CHECK_CUDA_SUCCESS("init for cuda", cuInit(0));
  _CHECK_CUDA_SUCCESS("get ncards", cuDeviceGetCount(&ngcards));

  CUcontext ctx;
  CUdevice dev;
  char hostname[256];
  gethostname(hostname, 256);
  printf("%s with rank %d gets card %d\n", hostname, rank, rank % ngcards);

  // Bind every rank to one of the node's cards (round robin) and set up its
  // CUDA context.
  _CHECK_CUDA_SUCCESS("device get", cuDeviceGet(&dev, rank % ngcards));
  _CHECK_CUDA_SUCCESS("creating context", cuCtxCreate(&ctx, 0, dev));
  _CHECK_CUDA_SUCCESS("setting context", cuCtxSetCurrent(ctx));
  _CHECK_CUDA_SUCCESS("synchronizing", cuCtxSynchronize());

  MPI_Barrier(MPI_COMM_WORLD);

  // Device buffers (CUdeviceptr) and their host mirrors, one group per slice
  // type: P_phh, P_ph, H_hh, H_hhh and the output T_hhh.
  using host_slice_t = vector<double>;
  vector<size_t> sizes = {nv * oo, nv * no, oo, oo * no, oo * no};
  vector<CUdeviceptr> P_phh(3), P_ph(6), H_hh(3), H_hhh(3), T_hhh(1);
  vector<vector<CUdeviceptr>*> slices_d = {&P_phh, &P_ph, &H_hh, &H_hhh, &T_hhh};
  vector<vector<host_slice_t>> slices_h(slices_d.size());
  {
    int i = -1;
    for (auto& v : slices_d) {
      i++;
      for (auto& ptr : *v) {
        _CHECK_CUDA_SUCCESS("malloc",
                            cuMemAlloc(&ptr, sizes[i] * sizeof(double)));
        slices_h[i].emplace_back(sizes[i]);
      }
    }
  }

  const double one = 1.0, zero = 0.0;

  printf("its: %ld\n", its);
  printf("barrier: %d\n", barrier);
  printf("no: %ld\n", no);
  printf("nv: %ld\n", nv);
  printf("SIZE %f GB\n",
         (3 * nv * oo + 6 * no * nv + 3 * oo + 3 * ooo + 1 * ooo)
           * sizeof(double) / 1024.0 / 1024.0 / 1024.0);

  // Theoretical TFLOP counts per timed section (2 * m * n * k flops per dgemm,
  // 6 hole and 6 particle dgemms per iteration).
  std::map<std::string, double> tflopss
    {{"dgemm",     ooo * (no + nv) * 6.0 * 2.0 * its / 1e12},
     {"holes",     ooo * no        * 6.0 * 2.0 * its / 1e12},
     {"particles", ooo * nv        * 6.0 * 2.0 * its / 1e12}};

  cublasHandle_t handle;
  _CHECK_CUBLAS_SUCCESS("handle create", cublasCreate(&handle));
  printf("handle %p\n", (void*)handle);

  timings["dgemm"].start();
  for (size_t i = 0; i < its; i++) {

    if (barrier) {
      MPI_Barrier(MPI_COMM_WORLD);

      // Re-upload every host slice to its device buffer.
      timings["memcpy"].start();
      for (size_t _s = 0; _s < slices_d.size(); _s++) {
        for (size_t _b = 0; _b < slices_h[_s].size(); _b++) {
          // for (size_t _b = 0; _b < 1; _b++) {
          auto device = (*slices_d[_s])[_b];
          auto host = slices_h[_s][_b].data();
          cuMemcpyHtoD(device, host, sizes[_s] * sizeof(double));
        }
      }
      timings["memcpy"].stop();
    }

    // Hole contractions: per hole slice, two dgemms (NN and NT) of the form
    // T(oo, no) = H_hhh(oo, no) * H_hh(no, no).
    timings["holes"].start();
    for (size_t j = 0; j < 3; j++) {
      _CHECK_CUBLAS_SUCCESS(" > dgemm (holes, NN) ...",
                            cublasDgemm(handle,
                                        CUBLAS_OP_N,
                                        CUBLAS_OP_N,
                                        oo,
                                        no,
                                        no,
                                        &one,
                                        (double*)H_hhh[j],
                                        oo,
                                        (double*)H_hh[j],
                                        no,
                                        &zero,
                                        (double*)T_hhh[0],
                                        oo));
      _CHECK_CUBLAS_SUCCESS(" > dgemm (holes, NT) ...",
                            cublasDgemm(handle,
                                        CUBLAS_OP_N,
                                        CUBLAS_OP_T,
                                        oo,
                                        no,
                                        no,
                                        &one,
                                        (double*)H_hhh[j],
                                        oo,
                                        (double*)H_hh[j],
                                        no,
                                        &zero,
                                        (double*)T_hhh[0],
                                        oo));
    }
    timings["holes"].stop();

    // Particle contractions: T(oo, no) = P_phh^T(oo, nv) * P_ph(nv, no).
    timings["particles"].start();
    for (size_t j = 0; j < 6; j++) {
      _CHECK_CUBLAS_SUCCESS(" > dgemm (particles, TN) ...",
                            cublasDgemm(handle,
                                        CUBLAS_OP_T,
                                        CUBLAS_OP_N,
                                        oo,
                                        no,
                                        nv,
                                        &one,
                                        (double*)P_phh[j % 3],
                                        nv,
                                        (double*)P_ph[j],
                                        nv,
                                        &zero,
                                        (double*)T_hhh[0],
                                        oo));
    }
    timings["particles"].stop();

    // cuBLAS calls are asynchronous; wait for the whole iteration to finish.
    cuCtxSynchronize();
  }
  timings["dgemm"].stop();

  printf("Performance: \n");
  for (auto name : {"holes", "particles", "dgemm"})
    printf("%10s TFlops: %4.1f\n", name, tflopss[name] / timings[name].count());

  printf("Timings: \n");
  for (auto const& kv : timings)
    printf("%10s: %10f\n", kv.first.c_str(), kv.second.count());

  MPI_Finalize();

  return 0;
}
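
// A possible way to build and run this benchmark (a sketch, assuming the file
// is called bench-cuda-dgemm.cxx, that the CLI11 single header and the CUDA
// toolkit headers are on the include path, and that CUDA_HOME points at the
// toolkit; adjust names, paths and the MPI launcher to your installation):
//
//   mpic++ -std=c++11 -O2 -I"$CUDA_HOME/include" -Ipath/to/CLI11 \
//       bench-cuda-dgemm.cxx -L"$CUDA_HOME/lib64" -lcublas -lcuda \
//       -o bench-cuda-dgemm
//
//   mpirun -np 4 ./bench-cuda-dgemm --no 40 --nv 400 --its 5 --barrier 1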