// test-cublas-parallel.cxx
//
// One MPI rank per GPU: each rank binds to a CUDA device (round-robin),
// allocates three N x N double matrices on that device, and times a series
// of cublasDgemm calls to report an aggregate flop rate per rank.

#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <map>
#include <string>
#include <vector>

#include <unistd.h>   // gethostname

#include <mpi.h>

#include <cuda.h>

#define CUBLASAPI
#include <cublas_api.h>
#include <cublas_v2.h>

struct Timer {
  using Clock = std::chrono::high_resolution_clock;
  using Event = std::chrono::time_point<Clock>;
  std::chrono::duration<double> duration{};  // accumulated wall time in seconds
  Event _start;
  inline void start() noexcept { _start = Clock::now(); }
  inline void stop() noexcept { duration += Clock::now() - _start; }
  inline void clear() noexcept { duration = duration.zero(); }
  inline double count() const noexcept { return duration.count(); }
};

using Timings = std::map<std::string, Timer>;

// printf-style formatting into a std::string
#define _FORMAT(_fmt, ...)                                          \
  ([&] (void) -> std::string {                                      \
    int _sz = std::snprintf(nullptr, 0, _fmt, __VA_ARGS__);         \
    std::vector<char> _out(_sz + 1);                                 \
    std::snprintf(&_out[0], _out.size(), _fmt, __VA_ARGS__);         \
    return std::string(_out.data());                                 \
  })()

#define _CHECK_CUDA_SUCCESS(message, ...)                            \
  do {                                                               \
    CUresult result = __VA_ARGS__;                                   \
    printf("doing %s\n", message);                                   \
    if (result != CUDA_SUCCESS) {                                    \
      printf("\t!!CUDA_ERROR(%d): %s:%d %s\n",                       \
             result, __FILE__, __LINE__, message);                   \
      return 1;                                                      \
    }                                                                \
  } while (0)

#define _CHECK_CUBLAS_SUCCESS(message, ...)                          \
  do {                                                               \
    cublasStatus_t result = __VA_ARGS__;                             \
    printf("CUBLAS: doing %s\n", message);                           \
    if (result != CUBLAS_STATUS_SUCCESS) {                           \
      printf("\t!!CUBLAS_ERROR(%d): %s:%d %s\n",                     \
             result, __FILE__, __LINE__, message);                   \
      return 1;                                                      \
    }                                                                \
  } while (0)

int main(int argc, char** argv) {
  MPI_Init(NULL, NULL);

  int rank, np, ngcards;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &np);

  Timings timings;

  MPI_Barrier(MPI_COMM_WORLD);

  _CHECK_CUDA_SUCCESS("init for cuda", cuInit(0));
  _CHECK_CUDA_SUCCESS("get ncards", cuDeviceGetCount(&ngcards));

  const size_t N      = argc > 1 ? atoi(argv[1]) : 30000,
               dgemms = argc > 2 ? atoi(argv[2]) : 2,
               flops  = 2 * N * N * N * dgemms;

  CUcontext ctx;
  CUdevice dev;

  char hostname[256];
  gethostname(hostname, 256);
  printf("%s with rank %d gets card %d\n", hostname, rank, rank % ngcards);

  // set contexts: ranks are assigned to devices round-robin
  _CHECK_CUDA_SUCCESS("device get", cuDeviceGet(&dev, rank % ngcards));
  _CHECK_CUDA_SUCCESS("creating context", cuCtxCreate(&ctx, 0, dev));
  _CHECK_CUDA_SUCCESS("setting context", cuCtxSetCurrent(ctx));
  _CHECK_CUDA_SUCCESS("synchronizing", cuCtxSynchronize());

  MPI_Barrier(MPI_COMM_WORLD);

  // Device buffers are allocated but deliberately left uninitialized:
  // only throughput is measured, not numerical results.
  CUdeviceptr A, B, C;
  const double one = 1.0;

  // size of one N x N double matrix
  printf("SIZE %f GB\n", N * N * sizeof(double) / 1024.0 / 1024.0 / 1024.0);
  _CHECK_CUDA_SUCCESS("A", cuMemAlloc(&A, N * N * sizeof(double)));
  _CHECK_CUDA_SUCCESS("B", cuMemAlloc(&B, N * N * sizeof(double)));
  _CHECK_CUDA_SUCCESS("C", cuMemAlloc(&C, N * N * sizeof(double)));

  cublasHandle_t handle;
  _CHECK_CUBLAS_SUCCESS("handle create", cublasCreate(&handle));
  printf("handle %p\n", (void*)handle);

  timings["dgemm"].start();
  for (size_t i = 0; i < dgemms; i++) {
    _CHECK_CUBLAS_SUCCESS(_FORMAT(" > dgemm %zu ...", i).c_str(),
                          cublasDgemm(handle,
                                      CUBLAS_OP_N, CUBLAS_OP_N,
                                      N, N, N,
                                      &one, (double*)A, N,
                                      (double*)B, N,
                                      &one, (double*)C, N));
  }
  // cublasDgemm launches asynchronously; synchronize before stopping the timer
  cuCtxSynchronize();
  timings["dgemm"].stop();

  // 2 * N^3 flop per dgemm, reported in Gflop/s
  printf("dgemm Gflops: %f\n",
         flops / timings["dgemm"].count() / 1.0e9);

  MPI_Finalize();

  return 0;
}

// Local Variables:
// compile-command: "mpic++ \
//   -pedantic -std=c++11 \
//   -L./cudaroot/lib64 -lcuda \
//   -L./cudaroot/lib64 -lcudart \
//   -L./cudaroot/lib64 -lcublas \
//   ./test-cublas-parallel.cxx -o test-cublas-parallel" 
// End:
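
// A possible invocation (an assumption, not part of the original source):
// after building with the compile command above on a node with one or more
// GPUs, launch one MPI rank per GPU; the two optional positional arguments
// are the matrix dimension N and the number of DGEMM repetitions, e.g.
//
//   mpirun -np 2 ./test-cublas-parallel 20000 4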