[v0.10] Replace NVML with pciutils to get PCI IDs. This still needs work to be integrated properly. NVML is sufficient for NVIDIA GPUs, but because support for more GPUs will be added in the future, a vendor-neutral solution like pciutils is needed

2021-09-04 12:19:42 +02:00
parent 4b4d1bc030
commit 039e7c350d
10 changed files with 69 additions and 109 deletions
--- a/13
+++ b/13
@@ -8,18 +8,19 @@ SANITY_FLAGS=-Wfloat-equal -Wshadow -Wpointer-arith
 SRC_COMMON=src/common/
 SRC_CUDA=src/cuda/

-COMMON_SRC = $(SRC_COMMON)main.cpp  $(SRC_COMMON)gpu.cpp $(SRC_COMMON)args.cpp $(SRC_COMMON)global.cpp $(SRC_COMMON)printer.cpp
-COMMON_HDR = $(SRC_COMMON)ascii.hpp $(SRC_COMMON)gpu.hpp $(SRC_COMMON)args.hpp $(SRC_COMMON)global.hpp $(SRC_COMMON)printer.hpp
+COMMON_SRC = $(SRC_COMMON)main.cpp  $(SRC_COMMON)gpu.cpp $(SRC_COMMON)args.cpp $(SRC_COMMON)global.cpp $(SRC_COMMON)printer.cpp $(SRC_COMMON)pci.cpp
+COMMON_HDR = $(SRC_COMMON)ascii.hpp $(SRC_COMMON)gpu.hpp $(SRC_COMMON)args.hpp $(SRC_COMMON)global.hpp $(SRC_COMMON)printer.hpp $(SRC_COMMON)pci.hpp

-CUDA_SRC = $(SRC_CUDA)cuda.cpp $(SRC_CUDA)uarch.cpp $(SRC_CUDA)pci.cpp $(SRC_CUDA)nvmlb.cpp
-CUDA_HDR = $(SRC_CUDA)cuda.hpp $(SRC_CUDA)uarch.hpp $(SRC_CUDA)pci.hpp $(SRC_CUDA)nvmlb.hpp $(SRC_CUDA)chips.hpp
+CUDA_SRC = $(SRC_CUDA)cuda.cpp $(SRC_CUDA)uarch.cpp $(SRC_CUDA)pci.cpp
+CUDA_HDR = $(SRC_CUDA)cuda.hpp $(SRC_CUDA)uarch.hpp $(SRC_CUDA)pci.hpp $(SRC_CUDA)chips.hpp

 SOURCE += $(COMMON_SRC) $(CUDA_SRC)
 HEADERS += $(COMMON_HDR) $(CUDA_HDR)

 OUTPUT=gpufetch

-CXXFLAGS+= -I $(CUDA_PATH)/samples/common/inc -I $(CUDA_PATH)/targets/x86_64-linux/include -L $(CUDA_PATH)/targets/x86_64-linux/lib -lcudart -lnvidia-ml
+CXXFLAGS+= -I pciutils/install/include -I $(CUDA_PATH)/samples/common/inc -I $(CUDA_PATH)/targets/x86_64-linux/include -L $(CUDA_PATH)/targets/x86_64-linux/lib -L pciutils/install/lib
+LDFLAGS+=-lcudart -lpci

 all: CXXFLAGS += -O3
 all: $(OUTPUT)
@@ -34,7 +35,7 @@ strict: CXXFLAGS += -O3 -Werror -fsanitize=undefined -D_FORTIFY_SOURCE=2
 strict: $(OUTPUT)

 $(OUTPUT): Makefile $(SOURCE) $(HEADERS)
-	$(CXX) $(CXXFLAGS) $(SANITY_FLAGS) $(SOURCE) -o $(OUTPUT)
+	$(CXX) $(CXXFLAGS) $(SANITY_FLAGS) $(SOURCE) $(LDFLAGS) -o $(OUTPUT)

 run: $(OUTPUT)
 	./$(OUTPUT)
--- a/src/common/global.hpp
+++ b/src/common/global.hpp
@@ -2,7 +2,6 @@
 #define __GLOBAL__

 #include <stdbool.h>
-#include <stddef.h>
 #include <cstddef>

 #define STRING_UNKNOWN "Unknown"
--- a/src/common/gpu.hpp
+++ b/src/common/gpu.hpp
@@ -4,7 +4,6 @@
 #include <stdint.h>
 #include <stdbool.h>

-#include "../cuda/nvmlb.hpp"
 #include "../cuda/pci.hpp"

 #define UNKNOWN_FREQ -1
@@ -57,7 +56,6 @@ struct gpu_info {
  char* name;
  int64_t freq;
  struct pci* pci;
-  struct nvml_data* nvmld;
  struct topology* topo;
  struct memory* mem;
  struct cache* cach;
--- a/src/common/pci.cpp
+++ b/src/common/pci.cpp
@@ -0,0 +1,37 @@
+#include "pci.hpp"
+#include <cstddef>
+
+/*
+ * doc: https://wiki.osdev.org/PCI#Class_Codes
+ *      https://pci-ids.ucw.cz/read/PC
+ */
+#define VENDOR_ID_NVIDIA 0x10de
+#define CLASS_VGA_CONTROLLER 0x0300
+
+uint16_t pciutils_get_pci_vendor_id(struct pci_dev *devices) {
+  for(struct pci_dev *dev=devices; dev != NULL; dev=dev->next) {
+    if(dev->vendor_id == VENDOR_ID_NVIDIA && dev->device_class == CLASS_VGA_CONTROLLER) {
+      return dev->vendor_id;
+    }
+  }
+  return 0;
+}
+
+uint16_t pciutils_get_pci_device_id(struct pci_dev *devices) {
+  for(struct pci_dev *dev=devices; dev != NULL; dev=dev->next) {
+   if(dev->vendor_id == VENDOR_ID_NVIDIA && dev->device_class == CLASS_VGA_CONTROLLER) {
+      return dev->device_id;
+    }
+  }
+  return 0;
+}
+
+struct pci_dev *get_pci_devices_from_pciutils() {
+  struct pci_access *pacc;
+
+  pacc = pci_alloc();
+  pci_init(pacc);
+  pci_scan_bus(pacc);
+
+  return pacc->devices;
+}
--- a/src/common/pci.hpp
+++ b/src/common/pci.hpp
@@ -0,0 +1,13 @@
+#ifndef __GPUFETCH_PCI__
+#define __GPUFETCH_PCI__
+
+#include <cstdint>
+extern "C" {
+  #include <pci/pci.h>
+}
+
+uint16_t pciutils_get_pci_vendor_id(struct pci_dev *devices);
+uint16_t pciutils_get_pci_device_id(struct pci_dev *devices);
+struct pci_dev *get_pci_devices_from_pciutils();
+
+#endif
--- a/src/cuda/cuda.cpp
+++ b/src/cuda/cuda.cpp
@@ -2,8 +2,8 @@
 #include <cuda_runtime.h>

 #include "cuda.hpp"
-#include "nvmlb.hpp"
 #include "uarch.hpp"
+#include "../common/pci.hpp"
 #include "../common/global.hpp"

 int print_gpus_list() {
@@ -142,11 +142,8 @@ struct gpu_info* get_gpu_info(int gpu_idx) {
  gpu->name = (char *) emalloc(sizeof(char) * (strlen(deviceProp.name) + 1));
  strcpy(gpu->name, deviceProp.name);

-  gpu->nvmld = nvml_init();
-  if(nvml_get_pci_info(gpu->idx, gpu->nvmld)) {
-    gpu->pci = get_pci_from_nvml(gpu->nvmld);
-  }
-
+  struct pci_dev *devices = get_pci_devices_from_pciutils();
+  gpu->pci = get_pci_from_pciutils(devices);
  gpu->arch = get_uarch_from_cuda(gpu);
  gpu->cach = get_cache_info(deviceProp);
  gpu->mem = get_memory_info(gpu, deviceProp);
--- a/src/cuda/nvmlb.cpp
+++ b/src/cuda/nvmlb.cpp
@@ -1,70 +0,0 @@
-#include <nvml.h>
-
-#include "nvmlb.hpp"
-#include "../common/global.hpp"
-
-struct nvml_data {
-  bool nvml_started;
-  nvmlPciInfo_t pci;
-};
-
-struct nvml_data* nvml_init() {
-  struct nvml_data* data = (struct nvml_data*) emalloc(sizeof(struct nvml_data));
-  data->nvml_started = false;
-
-  nvmlReturn_t result;
-
-  if ((result = nvmlInit()) != NVML_SUCCESS) {
-    printErr("nvmlInit: %s\n", nvmlErrorString(result));
-    return NULL;
-  }
-
-  data->nvml_started = true;
-  return data;
-}
-
-bool nvml_get_pci_info(int gpu_idx, struct nvml_data* data) {
-  nvmlReturn_t result;
-  nvmlDevice_t device;
-
-  if(!data->nvml_started) {
-    printErr("nvml_get_pci_info: nvml was not started");
-    return false;
-  }
-
-  if ((result = nvmlDeviceGetHandleByIndex(gpu_idx, &device)) != NVML_SUCCESS) {
-    printErr("nvmlDeviceGetHandleByIndex: %s\n", nvmlErrorString(result));
-    return false;
-  }
-
-  if ((result = nvmlDeviceGetPciInfo(device, &data->pci)) != NVML_SUCCESS) {
-    printErr("nvmlDeviceGetPciInfo: %s\n", nvmlErrorString(result));
-    return false;
-  }
-
-  return true;
-}
-
-uint16_t nvml_get_pci_vendor_id(struct nvml_data* data) {
-  return data->pci.pciDeviceId & 0x0000FFFF;
-}
-
-uint16_t nvml_get_pci_device_id(struct nvml_data* data) {
-  return (data->pci.pciDeviceId & 0xFFFF0000) >> 16;
-}
-
-bool nvml_shutdown(struct nvml_data* data) {
-  nvmlReturn_t result;
-
-  if(!data->nvml_started) {
-    printWarn("nvml_get_pci_info: nvml was not started");
-    return true;
-  }
-
-  if ((result = nvmlShutdown()) != NVML_SUCCESS) {
-    printErr("nvmlShutdown: %s\n", nvmlErrorString(result));
-    return false;
-  }
-
-  return true;
-}
--- a/src/cuda/nvmlb.hpp
+++ b/src/cuda/nvmlb.hpp
@@ -1,16 +0,0 @@
-// NVML Backend
-#ifndef __NVMLB__
-#define __NVMLB__
-
-#include <stdbool.h>
-#include <stdint.h>
-
-struct nvml_data;
-
-struct nvml_data* nvml_init();
-bool nvml_get_pci_info(int dev, struct nvml_data* data);
-uint16_t nvml_get_pci_vendor_id(struct nvml_data* data);
-uint16_t nvml_get_pci_device_id(struct nvml_data* data);
-bool nvml_shutdown(struct nvml_data* data);
-
-#endif
--- a/src/cuda/pci.cpp
+++ b/src/cuda/pci.cpp
@@ -1,9 +1,9 @@
 #include <stdio.h>

 #include "pci.hpp"
-#include "nvmlb.hpp"
 #include "chips.hpp"
 #include "../common/global.hpp"
+#include "../common/pci.hpp"

 #define CHECK_PCI_START if (false) {}
 #define CHECK_PCI(pci, id, chip) \
@@ -15,11 +15,11 @@ struct pci {
  uint16_t device_id;
 };

-struct pci* get_pci_from_nvml(struct nvml_data* data) {
+struct pci* get_pci_from_pciutils(struct pci_dev *devices) {
  struct pci* pci = (struct pci*) emalloc(sizeof(struct pci));

-  pci->vendor_id = nvml_get_pci_vendor_id(data);
-  pci->device_id = nvml_get_pci_device_id(data);
+  pci->vendor_id = pciutils_get_pci_vendor_id(devices);
+  pci->device_id = pciutils_get_pci_device_id(devices);

  return pci;
 }
--- a/src/cuda/pci.hpp
+++ b/src/cuda/pci.hpp
@@ -1,13 +1,14 @@
-#ifndef __PCI__
-#define __PCI__
+#ifndef __PCI_CUDA__
+#define __PCI_CUDA__

 #include <stdint.h>
-#include "nvmlb.hpp"
+
+#include "../common/pci.hpp"
 #include "chips.hpp"

 struct pci;

-struct pci* get_pci_from_nvml(struct nvml_data* data);
+struct pci* get_pci_from_pciutils(struct pci_dev *devices);
 GPUCHIP get_chip_from_pci(struct pci* pci);

 #endif