Fix AniaBug #1: cublasCreate after context setting

2022-09-12 19:17:52 +02:00
parent 23ad87214f
commit 5483325626
1 changed files with 59 additions and 33 deletions
--- a/src/atrip/Atrip.cxx
+++ b/src/atrip/Atrip.cxx
@@ -24,7 +24,7 @@

 using namespace atrip;
 #if defined(HAVE_CUDA)
-#include <cuda.h>
+#include <atrip/CUDA.hpp>
 #endif

 template <typename F> bool RankMap<F>::RANK_ROUND_ROBIN;
@@ -49,11 +49,6 @@ void Atrip::init(MPI_Comm world)  {
  Atrip::communicator = world;
  MPI_Comm_rank(world, (int*)&Atrip::rank);
  MPI_Comm_size(world, (int*)&Atrip::np);
-
-#if defined(HAVE_CUDA)
-  Atrip::cuda.status = cublasCreate(&Atrip::cuda.handle);
-#endif
-
 }

 template <typename F>
@@ -71,18 +66,24 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {

 #if defined(HAVE_CUDA)
  int ngcards;
+  _CHECK_CUDA_SUCCESS("initializing cuda",
+                      cuInit(0));
+  _CHECK_CUDA_SUCCESS("getting device count",
+                      cuDeviceGetCount(&ngcards));
  const auto clusterInfo = getClusterInfo(Atrip::communicator);
-  cuDeviceGetCount(&ngcards);
  LOG(0,"Atrip") << "ngcards: " << ngcards << "\n";
  if (clusterInfo.ranksPerNode > ngcards) {
-    std::cerr << "ATRIP: You are running on more ranks per node than the number of graphic cards\n"
-              << "You have " << ngcards << " cards at your disposal\n";
-    throw "";
-  }
-  if (clusterInfo.ranksPerNode < ngcards) {
-    std::cerr << "You have " << ngcards << " cards at your disposal\n"
-              << "You will be only using " << clusterInfo.ranksPerNode
-              << ", i.e., the nubmer of ranks.\n";
+    const auto msg
+      = _FORMAT("ATRIP: You are running on more ranks per node than the number of graphic cards\n"
+                "You have %d cards at your disposal\n", ngcards);
+    std::cerr << msg;
+    throw msg;
+  } else if (clusterInfo.ranksPerNode < ngcards) {
+    const auto msg
+      = _FORMAT("You have %d cards at your disposal.\n"
+                "You will be only using %d, i.e, the number of ranks\n",
+                ngcards, clusterInfo.ranksPerNode);
+    std::cerr << msg;
  }


@@ -94,16 +95,27 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
      struct { struct { size_t free, total; } avail; size_t total; } memory;
      char *name = (char*)malloc(256);

+      // - TODO :: we should check that the Zuweisung of graphic cards
+      //           to nodes works as expected, i.e., node k should get from 0
+      //           to ngcards with the formula =rank % ngcards=.
+
      // set current device
-      cuDeviceGet(&dev, rank);
-      cuCtxCreate(&ctx, 0, dev);
-      cuCtxSetCurrent(ctx);
+      _CHECK_CUDA_SUCCESS("getting device for index <rank>",
+                          cuDeviceGet(&dev, rank % ngcards));
+      _CHECK_CUDA_SUCCESS("creating a cuda context",
+                          cuCtxCreate(&ctx, 0, dev));
+      _CHECK_CUDA_SUCCESS("setting the context",
+                          cuCtxSetCurrent(ctx));

      // get information of the device
-      cuDeviceGetProperties(&prop, dev);
-      cuMemGetInfo(&memory.avail.free, &memory.avail.total);
-      cuDeviceGetName(name, 256, dev);
-      cuDeviceTotalMem(&memory.total, dev);
+      _CHECK_CUDA_SUCCESS("getting  properties of current device",
+                          cuDeviceGetProperties(&prop, dev));
+      _CHECK_CUDA_SUCCESS("getting memory information",
+                          cuMemGetInfo(&memory.avail.free, &memory.avail.total));
+      _CHECK_CUDA_SUCCESS("getting name",
+                          cuDeviceGetName(name, 256, dev));
+      _CHECK_CUDA_SUCCESS("getting total memory",
+                          cuDeviceTotalMem(&memory.total, dev));

      printf("\n"
             "CUDA CARD RANK %d\n"
@@ -124,6 +136,10 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
             memory.total / 1024.0 / 1024.0 / 1024.0
             );
      std::free((void*)name);
+
+      _CHECK_CUBLAS_SUCCESS("creating a cublas handle",
+                            cublasCreate(&Atrip::cuda.handle));
+
    }
    MPI_Barrier(universe);
  }
@@ -163,17 +179,27 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {

 #if defined(HAVE_CUDA)
  DataPtr<F> Tai, epsi, epsa;
+
  // TODO: free memory pointers in the end of the algorithm
-  cuMemAlloc(&Tai, sizeof(F) * _Tai.size());
-  cuMemAlloc(&epsi, sizeof(F) * _epsi.size());
-  cuMemAlloc(&epsa, sizeof(F) * _epsa.size());

-  cuMemcpyHtoD(Tai, (void*)_Tai.data(), sizeof(F) * _Tai.size());
-  cuMemcpyHtoD(epsi,(void*)_epsi.data(), sizeof(F) * _epsi.size());
-  cuMemcpyHtoD(epsa, (void*)_epsa.data(), sizeof(F) * _epsa.size());
+  _CHECK_CUDA_SUCCESS("Tai",
+                      cuMemAlloc(&Tai, sizeof(F) * _Tai.size()));
+  _CHECK_CUDA_SUCCESS("epsi",
+                      cuMemAlloc(&epsi, sizeof(F) * _epsi.size()));
+  _CHECK_CUDA_SUCCESS("epsa",
+                      cuMemAlloc(&epsa, sizeof(F) * _epsa.size()));

-  cuMemAlloc(&Tijk, sizeof(F) * No * No * No);
-  cuMemAlloc(&Zijk, sizeof(F) * No * No * No);
+  _CHECK_CUDA_SUCCESS("memcpy Tai",
+                      cuMemcpyHtoD(Tai, (void*)_Tai.data(), sizeof(F) * _Tai.size()));
+  _CHECK_CUDA_SUCCESS("memcpy epsi",
+                      cuMemcpyHtoD(epsi,(void*)_epsi.data(), sizeof(F) * _epsi.size()));
+  _CHECK_CUDA_SUCCESS("memcpy epsa",
+                      cuMemcpyHtoD(epsa, (void*)_epsa.data(), sizeof(F) * _epsa.size()));
+
+  _CHECK_CUDA_SUCCESS("Tijk",
+                      cuMemAlloc(&Tijk, sizeof(F) * No * No * No));
+  _CHECK_CUDA_SUCCESS("Zijk",
+                      cuMemAlloc(&Zijk, sizeof(F) * No * No * No));
 #else
  std::vector<F> &Tai = _Tai, &epsi = _epsi, &epsa = _epsa;
  Zijk = (DataFieldType<F>*)malloc(No*No*No * sizeof(DataFieldType<F>));