[v0.04] Fix bug in which NVML was using the wrong GPU id

2021-08-16 15:18:55 +02:00
parent fd15008ab2
commit dea5211791
3 changed files with 12 additions and 7 deletions
--- a/src/common/gpu.hpp
+++ b/src/common/gpu.hpp
@@ -62,6 +62,7 @@ struct gpu_info {
  struct memory* mem;
  struct cache* cach;
  int64_t peak_performance;
+  int32_t idx;
 };
 VENDOR get_gpu_vendor(struct gpu_info* gpu);
--- a/src/cuda/cuda.cpp
+++ b/src/cuda/cuda.cpp
@@ -67,8 +67,9 @@ int64_t get_peak_performance(struct gpu_info* gpu) {
 struct gpu_info* get_gpu_info(int gpu_idx) {
  struct gpu_info* gpu = (struct gpu_info*) emalloc(sizeof(struct gpu_info));
  gpu->pci = NULL;
+  gpu->idx = gpu_idx;
-  if(gpu_idx < 0) {
+  if(gpu->idx < 0) {
    printErr("GPU index must be equal or greater than zero");
    return NULL;
  }
@@ -89,13 +90,13 @@ struct gpu_info* get_gpu_info(int gpu_idx) {
    return NULL;
  }
-  if(gpu_idx+1 > num_gpus) {
+  if(gpu->idx+1 > num_gpus) {
-    printErr("Requested GPU index %d in a system with %d GPUs", gpu_idx, num_gpus);
+    printErr("Requested GPU index %d in a system with %d GPUs", gpu->idx, num_gpus);
    return NULL;
  }
  cudaDeviceProp deviceProp;
-  if ((err = cudaGetDeviceProperties(&deviceProp, gpu_idx)) != cudaSuccess) {
+  if ((err = cudaGetDeviceProperties(&deviceProp, gpu->idx)) != cudaSuccess) {
    printErr("%s: %s", cudaGetErrorName(err), cudaGetErrorString(err));
    return NULL;
  }
@@ -106,7 +107,7 @@ struct gpu_info* get_gpu_info(int gpu_idx) {
  strcpy(gpu->name, deviceProp.name);
  gpu->nvmld = nvml_init();
-  if(nvml_get_pci_info(gpu_idx, gpu->nvmld)) {
+  if(nvml_get_pci_info(gpu->idx, gpu->nvmld)) {
    gpu->pci = get_pci_from_nvml(gpu->nvmld);
  }
--- a/src/cuda/uarch.cpp
+++ b/src/cuda/uarch.cpp
@@ -252,9 +252,12 @@ void map_chip_to_uarch(struct uarch* arch) {
 struct uarch* get_uarch_from_cuda(struct gpu_info* gpu) {
  struct uarch* arch = (struct uarch*) emalloc(sizeof(struct uarch));
-  int dev = 0;
+  cudaError_t err = cudaSuccess;
  cudaDeviceProp deviceProp;
-  cudaGetDeviceProperties(&deviceProp, dev);
+  if ((err = cudaGetDeviceProperties(&deviceProp, gpu->idx)) != cudaSuccess) {
+    printErr("%s: %s", cudaGetErrorName(err), cudaGetErrorString(err));
+    return NULL;
+  }
  arch->chip_str = NULL;
  arch->cc_major = deviceProp.major;