[v0.10] Replace nvml by pciutils to get pci ids. Needs work to integrate it properly. NVML is enough in the case of NVIDIA GPUs, but because more GPUs will be added in the future, a solution like pciutils is needed

2021-09-04 12:19:42 +02:00
parent 4b4d1bc030
commit 039e7c350d
10 changed files with 69 additions and 109 deletions
--- a/src/cuda/cuda.cpp
+++ b/src/cuda/cuda.cpp
@@ -2,8 +2,8 @@
 #include <cuda_runtime.h>

 #include "cuda.hpp"
-#include "nvmlb.hpp"
 #include "uarch.hpp"
+#include "../common/pci.hpp"
 #include "../common/global.hpp"

 int print_gpus_list() {
@@ -142,11 +142,8 @@ struct gpu_info* get_gpu_info(int gpu_idx) {
  gpu->name = (char *) emalloc(sizeof(char) * (strlen(deviceProp.name) + 1));
  strcpy(gpu->name, deviceProp.name);

-  gpu->nvmld = nvml_init();
-  if(nvml_get_pci_info(gpu->idx, gpu->nvmld)) {
-    gpu->pci = get_pci_from_nvml(gpu->nvmld);
-  }
-
+  struct pci_dev *devices = get_pci_devices_from_pciutils();
+  gpu->pci = get_pci_from_pciutils(devices);
  gpu->arch = get_uarch_from_cuda(gpu);
  gpu->cach = get_cache_info(deviceProp);
  gpu->mem = get_memory_info(gpu, deviceProp);
--- a/src/cuda/nvmlb.cpp
+++ b/src/cuda/nvmlb.cpp
@@ -1,70 +0,0 @@
-#include <nvml.h>
-
-#include "nvmlb.hpp"
-#include "../common/global.hpp"
-
-struct nvml_data {
-  bool nvml_started;
-  nvmlPciInfo_t pci;
-};
-
-struct nvml_data* nvml_init() {
-  struct nvml_data* data = (struct nvml_data*) emalloc(sizeof(struct nvml_data));
-  data->nvml_started = false;
-
-  nvmlReturn_t result;
-
-  if ((result = nvmlInit()) != NVML_SUCCESS) {
-    printErr("nvmlInit: %s\n", nvmlErrorString(result));
-    return NULL;
-  }
-
-  data->nvml_started = true;
-  return data;
-}
-
-bool nvml_get_pci_info(int gpu_idx, struct nvml_data* data) {
-  nvmlReturn_t result;
-  nvmlDevice_t device;
-
-  if(!data->nvml_started) {
-    printErr("nvml_get_pci_info: nvml was not started");
-    return false;
-  }
-
-  if ((result = nvmlDeviceGetHandleByIndex(gpu_idx, &device)) != NVML_SUCCESS) {
-    printErr("nvmlDeviceGetHandleByIndex: %s\n", nvmlErrorString(result));
-    return false;
-  }
-
-  if ((result = nvmlDeviceGetPciInfo(device, &data->pci)) != NVML_SUCCESS) {
-    printErr("nvmlDeviceGetPciInfo: %s\n", nvmlErrorString(result));
-    return false;
-  }
-
-  return true;
-}
-
-uint16_t nvml_get_pci_vendor_id(struct nvml_data* data) {
-  return data->pci.pciDeviceId & 0x0000FFFF;
-}
-
-uint16_t nvml_get_pci_device_id(struct nvml_data* data) {
-  return (data->pci.pciDeviceId & 0xFFFF0000) >> 16;
-}
-
-bool nvml_shutdown(struct nvml_data* data) {
-  nvmlReturn_t result;
-
-  if(!data->nvml_started) {
-    printWarn("nvml_get_pci_info: nvml was not started");
-    return true;
-  }
-
-  if ((result = nvmlShutdown()) != NVML_SUCCESS) {
-    printErr("nvmlShutdown: %s\n", nvmlErrorString(result));
-    return false;
-  }
-
-  return true;
-}
--- a/src/cuda/nvmlb.hpp
+++ b/src/cuda/nvmlb.hpp
@@ -1,16 +0,0 @@
-// NVML Backend
-#ifndef __NVMLB__
-#define __NVMLB__
-
-#include <stdbool.h>
-#include <stdint.h>
-
-struct nvml_data;
-
-struct nvml_data* nvml_init();
-bool nvml_get_pci_info(int dev, struct nvml_data* data);
-uint16_t nvml_get_pci_vendor_id(struct nvml_data* data);
-uint16_t nvml_get_pci_device_id(struct nvml_data* data);
-bool nvml_shutdown(struct nvml_data* data);
-
-#endif
--- a/src/cuda/pci.cpp
+++ b/src/cuda/pci.cpp
@@ -1,9 +1,9 @@
 #include <stdio.h>

 #include "pci.hpp"
-#include "nvmlb.hpp"
 #include "chips.hpp"
 #include "../common/global.hpp"
+#include "../common/pci.hpp"

 #define CHECK_PCI_START if (false) {}
 #define CHECK_PCI(pci, id, chip) \
@@ -15,11 +15,11 @@ struct pci {
  uint16_t device_id;
 };

-struct pci* get_pci_from_nvml(struct nvml_data* data) {
+struct pci* get_pci_from_pciutils(struct pci_dev *devices) {
  struct pci* pci = (struct pci*) emalloc(sizeof(struct pci));

-  pci->vendor_id = nvml_get_pci_vendor_id(data);
-  pci->device_id = nvml_get_pci_device_id(data);
+  pci->vendor_id = pciutils_get_pci_vendor_id(devices);
+  pci->device_id = pciutils_get_pci_device_id(devices);

  return pci;
 }
--- a/src/cuda/pci.hpp
+++ b/src/cuda/pci.hpp
@@ -1,13 +1,14 @@
-#ifndef __PCI__
-#define __PCI__
+#ifndef __PCI_CUDA__
+#define __PCI_CUDA__

 #include <stdint.h>
-#include "nvmlb.hpp"
+
+#include "../common/pci.hpp"
 #include "chips.hpp"

 struct pci;

-struct pci* get_pci_from_nvml(struct nvml_data* data);
+struct pci* get_pci_from_pciutils(struct pci_dev *devices);
 GPUCHIP get_chip_from_pci(struct pci* pci);

 #endif