Fix AniaBug #1: call cublasCreate only after the CUDA context has been set

2022-09-12 19:17:52 +02:00
parent 23ad87214f
commit 5483325626
1 changed file with 59 additions and 33 deletions
--- a/src/atrip/Atrip.cxx
+++ b/src/atrip/Atrip.cxx
@@ -24,7 +24,7 @@
 using namespace atrip;
 #if defined(HAVE_CUDA)
-#include <cuda.h>
+#include <atrip/CUDA.hpp>
 #endif
 template <typename F> bool RankMap<F>::RANK_ROUND_ROBIN;
@@ -49,11 +49,6 @@ void Atrip::init(MPI_Comm world)  {
  Atrip::communicator = world;
  MPI_Comm_rank(world, (int*)&Atrip::rank);
  MPI_Comm_size(world, (int*)&Atrip::np);
 #if defined(HAVE_CUDA)
  Atrip::cuda.status = cublasCreate(&Atrip::cuda.handle);
 #endif
 }
 template <typename F>
@@ -71,18 +66,24 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
 #if defined(HAVE_CUDA)
  int ngcards;
  _CHECK_CUDA_SUCCESS("initializing cuda",
                      cuInit(0));
  _CHECK_CUDA_SUCCESS("getting device count",
                      cuDeviceGetCount(&ngcards));
  const auto clusterInfo = getClusterInfo(Atrip::communicator);
  cuDeviceGetCount(&ngcards);
  LOG(0,"Atrip") << "ngcards: " << ngcards << "\n";
  if (clusterInfo.ranksPerNode > ngcards) {
-    std::cerr << "ATRIP: You are running on more ranks per node than the number of graphic cards\n"
+    const auto msg
-              << "You have " << ngcards << " cards at your disposal\n";
+      = _FORMAT("ATRIP: You are running on more ranks per node than the number of graphic cards\n"
-    throw "";
+                "You have %d cards at your disposal\n", ngcards);
-  }
+    std::cerr << msg;
-  if (clusterInfo.ranksPerNode < ngcards) {
+    throw msg;
-    std::cerr << "You have " << ngcards << " cards at your disposal\n"
+  } else if (clusterInfo.ranksPerNode < ngcards) {
-              << "You will be only using " << clusterInfo.ranksPerNode
+    const auto msg
-              << ", i.e., the nubmer of ranks.\n";
+      = _FORMAT("You have %d cards at your disposal.\n"
                "You will be only using %d, i.e, the number of ranks\n",
                ngcards, clusterInfo.ranksPerNode);
    std::cerr << msg;
  }
@@ -94,16 +95,27 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
      struct { struct { size_t free, total; } avail; size_t total; } memory;
      char *name = (char*)malloc(256);
      // - TODO :: we should check that the assignment of graphics cards
      //           to nodes works as expected, i.e., node k should get from 0
      //           to ngcards with the formula =rank % ngcards=.
      // set current device
-      cuDeviceGet(&dev, rank);
+      _CHECK_CUDA_SUCCESS("getting device for index <rank>",
-      cuCtxCreate(&ctx, 0, dev);
+                          cuDeviceGet(&dev, rank % ngcards));
-      cuCtxSetCurrent(ctx);
+      _CHECK_CUDA_SUCCESS("creating a cuda context",
                          cuCtxCreate(&ctx, 0, dev));
      _CHECK_CUDA_SUCCESS("setting the context",
                          cuCtxSetCurrent(ctx));
      // get information of the device
-      cuDeviceGetProperties(&prop, dev);
+      _CHECK_CUDA_SUCCESS("getting  properties of current device",
-      cuMemGetInfo(&memory.avail.free, &memory.avail.total);
+                          cuDeviceGetProperties(&prop, dev));
-      cuDeviceGetName(name, 256, dev);
+      _CHECK_CUDA_SUCCESS("getting memory information",
-      cuDeviceTotalMem(&memory.total, dev);
+                          cuMemGetInfo(&memory.avail.free, &memory.avail.total));
      _CHECK_CUDA_SUCCESS("getting name",
                          cuDeviceGetName(name, 256, dev));
      _CHECK_CUDA_SUCCESS("getting total memory",
                          cuDeviceTotalMem(&memory.total, dev));
      printf("\n"
             "CUDA CARD RANK %d\n"
@@ -124,6 +136,10 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
             memory.total / 1024.0 / 1024.0 / 1024.0
             );
      std::free((void*)name);
      _CHECK_CUBLAS_SUCCESS("creating a cublas handle",
                            cublasCreate(&Atrip::cuda.handle));
    }
    MPI_Barrier(universe);
  }
@@ -163,17 +179,27 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
 #if defined(HAVE_CUDA)
  DataPtr<F> Tai, epsi, epsa;
  //TODO: free memory pointers in the end of the algorithm
  cuMemAlloc(&Tai, sizeof(F) * _Tai.size());
  cuMemAlloc(&epsi, sizeof(F) * _epsi.size());
  cuMemAlloc(&epsa, sizeof(F) * _epsa.size());
-  cuMemcpyHtoD(Tai, (void*)_Tai.data(), sizeof(F) * _Tai.size());
+  // TODO: free memory pointers in the end of the algorithm
  cuMemcpyHtoD(epsi,(void*)_epsi.data(), sizeof(F) * _epsi.size());
  cuMemcpyHtoD(epsa, (void*)_epsa.data(), sizeof(F) * _epsa.size());
-  cuMemAlloc(&Tijk, sizeof(F) * No * No * No);
+  _CHECK_CUDA_SUCCESS("Tai",
-  cuMemAlloc(&Zijk, sizeof(F) * No * No * No);
+                      cuMemAlloc(&Tai, sizeof(F) * _Tai.size()));
  _CHECK_CUDA_SUCCESS("epsi",
                      cuMemAlloc(&epsi, sizeof(F) * _epsi.size()));
  _CHECK_CUDA_SUCCESS("epsa",
                      cuMemAlloc(&epsa, sizeof(F) * _epsa.size()));
  _CHECK_CUDA_SUCCESS("memcpy Tai",
                      cuMemcpyHtoD(Tai, (void*)_Tai.data(), sizeof(F) * _Tai.size()));
  _CHECK_CUDA_SUCCESS("memcpy epsi",
                      cuMemcpyHtoD(epsi,(void*)_epsi.data(), sizeof(F) * _epsi.size()));
  _CHECK_CUDA_SUCCESS("memcpy epsa",
                      cuMemcpyHtoD(epsa, (void*)_epsa.data(), sizeof(F) * _epsa.size()));
  _CHECK_CUDA_SUCCESS("Tijk",
                      cuMemAlloc(&Tijk, sizeof(F) * No * No * No));
  _CHECK_CUDA_SUCCESS("Zijk",
                      cuMemAlloc(&Zijk, sizeof(F) * No * No * No));
 #else
  std::vector<F> &Tai = _Tai, &epsi = _epsi, &epsa = _epsa;
  Zijk = (DataFieldType<F>*)malloc(No*No*No * sizeof(DataFieldType<F>));
@@ -266,8 +292,8 @@ Atrip::Output Atrip::run(Atrip::Input<F> const& in) {
  auto const isFakeTuple
    = [&tuplesList, distribution](size_t const i) {
-      return distribution->tupleIsFake(tuplesList[i]);
+        return distribution->tupleIsFake(tuplesList[i]);
-    };
+      };
  using Database = typename Slice<F>::Database;