9 Commits
v0.10 ... v0.11

21 changed files with 395 additions and 194 deletions

CMakeLists.txt (new file)

@@ -0,0 +1,81 @@
cmake_minimum_required(VERSION 3.10)
include(CheckLanguage)
include(ExternalProject)
project(gpufetch CXX)
set(SRC_DIR "src")
set(COMMON_DIR "${SRC_DIR}/common")
set(CUDA_DIR "${SRC_DIR}/cuda")
if(NOT WIN32)
string(ASCII 27 Esc)
set(ColorReset "${Esc}[m")
set(ColorBold "${Esc}[1m")
set(Red "${Esc}[31m")
set(Green "${Esc}[32m")
set(BoldRed "${Esc}[1;31m")
set(BoldGreen "${Esc}[1;32m")
set(BoldYellow "${Esc}[1;33m")
endif()
check_language(CUDA)
if(CMAKE_CUDA_COMPILER)
enable_language(CUDA)
else()
message(FATAL_ERROR "${BoldRed}[ERROR]${ColorReset} Unable to find CUDA compiler. You may use -DCMAKE_CUDA_COMPILER and -DCMAKE_CUDA_COMPILER_TOOLKIT_ROOT if CUDA is installed but not detected by CMake")
endif()
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake")
find_package(PCIUTILS)
if(NOT ${PCIUTILS_FOUND})
message(STATUS "${BoldYellow}pciutils not found, downloading and building a local copy...${ColorReset}")
# Download and build pciutils
set(PCIUTILS_INSTALL_LOCATION ${CMAKE_BINARY_DIR}/pciutils-install)
ExternalProject_Add(pciutils
GIT_REPOSITORY https://github.com/pciutils/pciutils
CONFIGURE_COMMAND ""
BUILD_COMMAND make SHARED=no
BUILD_IN_SOURCE true
INSTALL_COMMAND make PREFIX=${PCIUTILS_INSTALL_LOCATION} install-lib
)
include_directories(${PCIUTILS_INSTALL_LOCATION}/include)
link_directories(${PCIUTILS_INSTALL_LOCATION}/lib)
else()
include_directories(${PCIUTILS_INCLUDE_DIR})
link_libraries(${PCIUTILS_LIBRARIES})
endif()
set(SANITY_FLAGS "-Wfloat-equal -Wshadow -Wpointer-arith")
set(CMAKE_CXX_FLAGS "${SANITY_FLAGS} -Wall -Wextra -pedantic -fstack-protector-all")
# https://en.wikipedia.org/w/index.php?title=CUDA&section=5#GPUs_supported
# https://raw.githubusercontent.com/PointCloudLibrary/pcl/master/cmake/pcl_find_cuda.cmake
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL "11.0")
set(CMAKE_CUDA_ARCHITECTURES 35 37 50 52 53 60 61 62 70 72 75 80 86)
elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL "10.0")
set(CMAKE_CUDA_ARCHITECTURES 30 32 35 37 50 52 53 60 61 62 70 72 75)
elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL "9.0")
set(CMAKE_CUDA_ARCHITECTURES 30 32 35 37 50 52 53 60 61 62 70 72)
elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "8.0")
set(CMAKE_CUDA_ARCHITECTURES 20 21 30 32 35 37 50 52 53 60 61 62)
endif()
link_directories(${CMAKE_CUDA_COMPILER_TOOLKIT_ROOT}/targets/x86_64-linux/lib)
add_library(cuda_backend STATIC ${CUDA_DIR}/cuda.cpp ${CUDA_DIR}/uarch.cpp ${CUDA_DIR}/pci.cpp)
add_executable(gpufetch ${COMMON_DIR}/main.cpp ${COMMON_DIR}/args.cpp ${COMMON_DIR}/gpu.cpp ${COMMON_DIR}/pci.cpp ${COMMON_DIR}/global.cpp ${COMMON_DIR}/printer.cpp)
if(NOT ${PCIUTILS_FOUND})
add_dependencies(cuda_backend pciutils)
add_dependencies(gpufetch pciutils)
endif()
target_include_directories(cuda_backend PUBLIC ${CMAKE_CUDA_COMPILER_TOOLKIT_ROOT}/samples/common/inc ${CMAKE_CUDA_COMPILER_TOOLKIT_ROOT}/targets/x86_64-linux/include)
target_link_libraries(cuda_backend cudart)
target_link_libraries(gpufetch cuda_backend pci z)
install(TARGETS gpufetch DESTINATION bin)

Makefile (deleted)

@@ -1,53 +0,0 @@
CXX ?= g++
CUDA_PATH ?= /usr/local/cuda/
PREFIX ?= /usr
CXXFLAGS+=-Wall -Wextra -pedantic -fstack-protector-all -pedantic
SANITY_FLAGS=-Wfloat-equal -Wshadow -Wpointer-arith
SRC_COMMON=src/common/
SRC_CUDA=src/cuda/
COMMON_SRC = $(SRC_COMMON)main.cpp $(SRC_COMMON)gpu.cpp $(SRC_COMMON)args.cpp $(SRC_COMMON)global.cpp $(SRC_COMMON)printer.cpp
COMMON_HDR = $(SRC_COMMON)ascii.hpp $(SRC_COMMON)gpu.hpp $(SRC_COMMON)args.hpp $(SRC_COMMON)global.hpp $(SRC_COMMON)printer.hpp
CUDA_SRC = $(SRC_CUDA)cuda.cpp $(SRC_CUDA)uarch.cpp $(SRC_CUDA)pci.cpp $(SRC_CUDA)nvmlb.cpp
CUDA_HDR = $(SRC_CUDA)cuda.hpp $(SRC_CUDA)uarch.hpp $(SRC_CUDA)pci.hpp $(SRC_CUDA)nvmlb.hpp $(SRC_CUDA)chips.hpp
SOURCE += $(COMMON_SRC) $(CUDA_SRC)
HEADERS += $(COMMON_HDR) $(CUDA_HDR)
OUTPUT=gpufetch
CXXFLAGS+= -I $(CUDA_PATH)/samples/common/inc -I $(CUDA_PATH)/targets/x86_64-linux/include -L $(CUDA_PATH)/targets/x86_64-linux/lib -lcudart -lnvidia-ml
all: CXXFLAGS += -O3
all: $(OUTPUT)
debug: CXXFLAGS += -g -O0
debug: $(OUTPUT)
static: CXXFLAGS += -static -O3
static: $(OUTPUT)
strict: CXXFLAGS += -O3 -Werror -fsanitize=undefined -D_FORTIFY_SOURCE=2
strict: $(OUTPUT)
$(OUTPUT): Makefile $(SOURCE) $(HEADERS)
$(CXX) $(CXXFLAGS) $(SANITY_FLAGS) $(SOURCE) -o $(OUTPUT)
run: $(OUTPUT)
./$(OUTPUT)
clean:
@rm -f $(OUTPUT)
install: $(OUTPUT)
install -Dm755 "gpufetch" "$(DESTDIR)$(PREFIX)/bin/gpufetch"
install -Dm644 "LICENSE" "$(DESTDIR)$(PREFIX)/share/licenses/gpufetch-git/LICENSE"
install -Dm644 "gpufetch.1" "$(DESTDIR)$(PREFIX)/share/man/man1/gpufetch.1.gz"
uninstall:
rm -f "$(DESTDIR)$(PREFIX)/bin/gpufetch"
rm -f "$(DESTDIR)$(PREFIX)/share/licenses/gpufetch-git/LICENSE"
rm -f "$(DESTDIR)$(PREFIX)/share/man/man1/gpufetch.1.gz"

README.md

@@ -31,18 +31,28 @@
gpufetch supports NVIDIA GPUs under Linux only.
# 2. Installation (building from source)
You will need a C++ compiler (e.g, `g++`), `make` and CUDA to compile `gpufetch`. To do so, just clone the repo and run `make`:
You will need:
- C++ compiler (e.g., `g++`)
- `cmake`
- `make`
- CUDA (NVIDIA backend)
- pciutils (optional)
To build gpufetch, just clone the repo and run `./build.sh`:
```
git clone https://github.com/Dr-Noob/gpufetch
cd gpufetch
make
./build.sh
./gpufetch
```
When building gpufetch, you may encounter an error telling you that it cannot find some CUDA header files. In this case, is very likely that the Makefile is unable to find your CUDA installation. This can be solved by setting `CUDA_PATH` to the correct CUDA installation path. For example:
- NOTE 1: It is recommended to install the `pciutils` development package, which gpufetch needs. If it is not installed, a local copy will be downloaded and built automatically just to compile gpufetch.
- NOTE 2: When building gpufetch, cmake may fail if it is unable to find the CUDA installation. If CUDA is installed but CMake does not find it, you need to pass the CUDA path to cmake. You can do this easily by editing the `build.sh` script directly. For example:
```
CUDA_PATH=/opt/cuda make
cmake -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DCMAKE_CUDA_COMPILER_TOOLKIT_ROOT=/usr/local/cuda/ ..
```
# 3. Colors and style

build.sh (new executable file)

@@ -0,0 +1,19 @@
#!/bin/bash
# gpufetch build script
set -e
rm -rf build/ gpufetch
mkdir build/
cd build/
# In case you have CUDA installed but it is not detected,
# - set CMAKE_CUDA_COMPILER to your nvcc binary
# - set CMAKE_CUDA_COMPILER_TOOLKIT_ROOT to the CUDA root dir
# for example:
# cmake -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DCMAKE_CUDA_COMPILER_TOOLKIT_ROOT=/usr/local/cuda/ ..
cmake ..
make -j$(nproc)
cd -
ln -s build/gpufetch .

cmake/FindPCIUTILS.cmake (new file)

@@ -0,0 +1,29 @@
# - Try to find the pciutils library
# Once done this will define
#
# PCIUTILS_FOUND - system has PCIUtils
# PCIUTILS_INCLUDE_DIR - the PCIUTILS include directory
# PCIUTILS_LIBRARIES - The libraries needed to use PCIUtils
if(PCIUTILS_INCLUDE_DIR AND PCIUTILS_LIBRARIES)
set(PCIUTILS_FIND_QUIETLY TRUE)
endif(PCIUTILS_INCLUDE_DIR AND PCIUTILS_LIBRARIES)
FIND_PATH(PCIUTILS_INCLUDE_DIR pci/pci.h)
FIND_LIBRARY(PCIUTILS_LIBRARY NAMES pci)
if(PCIUTILS_LIBRARY)
FIND_LIBRARY(RESOLV_LIBRARY NAMES resolv)
if(RESOLV_LIBRARY)
set(PCIUTILS_LIBRARIES ${PCIUTILS_LIBRARY} ${RESOLV_LIBRARY})
else(RESOLV_LIBRARY)
set(PCIUTILS_LIBRARIES ${PCIUTILS_LIBRARY})
endif(RESOLV_LIBRARY)
endif(PCIUTILS_LIBRARY)
include(FindPackageHandleStandardArgs)
FIND_PACKAGE_HANDLE_STANDARD_ARGS(PCIUTILS DEFAULT_MSG PCIUTILS_LIBRARIES PCIUTILS_INCLUDE_DIR)
MARK_AS_ADVANCED(PCIUTILS_INCLUDE_DIR PCIUTILS_LIBRARIES)

gpufetch.1 (new file)

@@ -0,0 +1,47 @@
.\" DO NOT MODIFY THIS FILE! It was generated by help2man 1.48.3.
.TH GPUFETCH "1" "August 2021" "gpufetch v0.10" "User Commands"
.SH NAME
gpufetch
.SH SYNOPSIS
.B gpufetch
[\fI\,OPTION\/\fR]...
.SH DESCRIPTION
Simple yet fancy GPU architecture fetching tool
.SH OPTIONS
.TP
\fB\-c\fR, \fB\-\-color\fR
Sets the color scheme (by default, gpufetch uses the system color scheme). See the COLORS section for a more detailed explanation
.TP
\fB\-g\fR, \fB\-\-gpu\fR
Selects the GPU to use (default: 0)
.TP
\fB\-h\fR, \fB\-\-help\fR
Prints this help and exit
.TP
\fB\-V\fR, \fB\-\-version\fR
Prints gpufetch version and exit
.SS "COLORS:"
.IP
Color scheme can be set using a predefined color scheme or a custom one:
1. To use a predefined color scheme, the name of the scheme must be provided. Possible values are:
* "nvidia": Use NVIDIA default color scheme
2. To use a custom color scheme, 4 colors must be given in RGB with the format: R,G,B:R,G,B:...
The first 2 colors are the GPU art colors and the following 2 colors are the text colors
.SS "EXAMPLES:"
.IP
Run gpufetch with NVIDIA color scheme:
.IP
\&./gpufetch \fB\-\-color\fR nvidia
.IP
Run gpufetch with a custom color scheme:
.IP
\&./gpufetch \fB\-\-color\fR 239,90,45:210,200,200:100,200,45:0,200,200
.SS "BUGS:"
.IP
Report bugs to https://github.com/Dr\-Noob/gpufetch/issues
.SS "NOTE:"
.IP
Peak performance information is NOT accurate. gpufetch computes peak performance using the max
frequency. However, to properly compute peak performance, you need to know the frequency of the
GPU running real code.
For peak performance measurement see: https://github.com/Dr\-Noob/peakperf
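For reference, the figure this NOTE refers to is the usual FMA-based estimate; this mirrors `get_peak_performance` in `src/cuda/cuda.cpp` later in this diff, with $f_{\max}$ the max clock in Hz:

$$\mathrm{peak}_{FP32} = 2 \times f_{\max} \times N_{\mathrm{CUDA\,cores}}$$

which is exactly why plugging in $f_{\max}$ instead of the sustained clock under real load overestimates the achievable figure.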

src/common/args.cpp

@@ -19,6 +19,7 @@
struct args_struct {
bool help_flag;
bool version_flag;
bool list_gpus;
int gpu_idx;
STYLE style;
struct color** colors;
@@ -28,17 +29,19 @@ int errn = 0;
static struct args_struct args;
const char args_chr[] = {
/* [ARG_CHAR_COLOR] = */ 'c',
/* [ARG_CHAR_GPU] = */ 'g',
/* [ARG_CHAR_HELP] = */ 'h',
/* [ARG_CHAR_VERSION] = */ 'V',
/* [ARG_COLOR] = */ 'c',
/* [ARG_GPU] = */ 'g',
/* [ARG_LIST] = */ 'l',
/* [ARG_HELP] = */ 'h',
/* [ARG_VERSION] = */ 'V',
};
const char *args_str[] = {
/* [ARG_CHAR_COLOR] = */ "color",
/* [ARG_CHAR_GPU] = */ "gpu",
/* [ARG_CHAR_HELP] = */ "help",
/* [ARG_CHAR_VERSION] = */ "version",
/* [ARG_COLOR] = */ "color",
/* [ARG_GPU] = */ "gpu",
/* [ARG_LIST] = */ "list-gpus",
/* [ARG_HELP] = */ "help",
/* [ARG_VERSION] = */ "version",
};
int getarg_int(char* str) {
@@ -100,6 +103,10 @@ bool show_help() {
return args.help_flag;
}
bool list_gpus() {
return args.list_gpus;
}
bool show_version() {
return args.version_flag;
}
@@ -119,8 +126,9 @@ char* build_short_options() {
char* str = (char *) emalloc(sizeof(char) * (len*2 + 1));
memset(str, 0, sizeof(char) * (len*2 + 1));
sprintf(str, "%c:%c:%c%c", c[ARG_GPU],
c[ARG_COLOR], c[ARG_HELP], c[ARG_VERSION]);
sprintf(str, "%c:%c:%c%c%c", c[ARG_GPU],
c[ARG_COLOR], c[ARG_HELP], c[ARG_LIST],
c[ARG_VERSION]);
return str;
}
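For readers unfamiliar with getopt syntax: with the argument characters defined above, the generated string is "g:c:hlV", where a ':' after a letter marks an option that requires an argument. A minimal sketch of how such a string drives getopt — a hypothetical standalone program, not part of gpufetch:

```cpp
#include <getopt.h>
#include <cstdio>

int main(int argc, char* argv[]) {
  int opt;
  // "g:c:hlV": -g and -c take a required argument; -h, -l, -V are plain flags.
  while ((opt = getopt(argc, argv, "g:c:hlV")) != -1) {
    switch (opt) {
      case 'g': printf("gpu index: %s\n", optarg); break;  // value in optarg
      case 'c': printf("colors:    %s\n", optarg); break;  // value in optarg
      case 'h': case 'l': case 'V': printf("flag: -%c\n", opt); break;
      default:  return 1;  // getopt already printed an error for '?'
    }
  }
  return 0;
}
```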
@@ -185,12 +193,14 @@ bool parse_args(int argc, char* argv[]) {
args.version_flag = false;
args.help_flag = false;
args.list_gpus = false;
args.gpu_idx = 0;
args.colors = NULL;
const struct option long_options[] = {
{args_str[ARG_COLOR], required_argument, 0, args_chr[ARG_COLOR] },
{args_str[ARG_GPU], required_argument, 0, args_chr[ARG_GPU] },
{args_str[ARG_LIST], no_argument, 0, args_chr[ARG_LIST] },
{args_str[ARG_HELP], no_argument, 0, args_chr[ARG_HELP] },
{args_str[ARG_VERSION], no_argument, 0, args_chr[ARG_VERSION] },
{0, 0, 0, 0}
@@ -199,7 +209,7 @@ bool parse_args(int argc, char* argv[]) {
char* short_options = build_short_options();
opt = getopt_long(argc, argv, short_options, long_options, &option_index);
while (!args.help_flag && !args.version_flag && opt != -1) {
while (!args.help_flag && !args.version_flag && !args.list_gpus && opt != -1) {
if(opt == args_chr[ARG_COLOR]) {
args.colors = (struct color **) emalloc(sizeof(struct color *) * NUM_COLORS);
if(!parse_color(optarg, &args.colors)) {
@@ -215,8 +225,11 @@ bool parse_args(int argc, char* argv[]) {
return false;
}
}
else if(opt == args_chr[ARG_LIST]) {
args.list_gpus = true;
}
else if(opt == args_chr[ARG_HELP]) {
args.help_flag = true;
}
else if(opt == args_chr[ARG_VERSION]) {
args.version_flag = true;

src/common/args.hpp

@@ -21,6 +21,7 @@ enum {
enum {
ARG_COLOR,
ARG_GPU,
ARG_LIST,
ARG_HELP,
ARG_VERSION
};
@@ -33,6 +34,7 @@ extern const char *args_str[];
int max_arg_str_length();
bool parse_args(int argc, char* argv[]);
bool show_help();
bool list_gpus();
bool show_version();
void free_colors_struct(struct color** cs);
int get_gpu_idx();

src/common/global.hpp

@@ -2,7 +2,6 @@
#define __GLOBAL__
#include <stdbool.h>
#include <stddef.h>
#include <cstddef>
#define STRING_UNKNOWN "Unknown"

src/common/gpu.cpp

@@ -116,17 +116,17 @@ char* get_str_l2(struct gpu_info* gpu) {
return string;
}
char* get_str_peak_performance(struct gpu_info* gpu) {
char* get_str_peak_performance_generic(int64_t pp) {
char* str;
if(gpu->peak_performance == -1) {
if(pp == -1) {
str = (char *) emalloc(sizeof(char) * (strlen(STRING_UNKNOWN) + 1));
strncpy(str, STRING_UNKNOWN, strlen(STRING_UNKNOWN) + 1);
return str;
}
// 7 for digits (e.g., XXXX.XX), 7 for XFLOP/s
double flopsd = (double) gpu->peak_performance;
double flopsd = (double) pp;
uint32_t max_size = 7+1+7+1;
str = (char *) ecalloc(max_size, sizeof(char));
@@ -139,3 +139,12 @@ char* get_str_peak_performance(struct gpu_info* gpu) {
return str;
}
char* get_str_peak_performance(struct gpu_info* gpu) {
return get_str_peak_performance_generic(gpu->peak_performance);
}
char* get_str_peak_performance_tensor(struct gpu_info* gpu) {
return get_str_peak_performance_generic(gpu->peak_performance_t);
}

src/common/gpu.hpp

@@ -4,7 +4,6 @@
#include <stdint.h>
#include <stdbool.h>
#include "../cuda/nvmlb.hpp"
#include "../cuda/pci.hpp"
#define UNKNOWN_FREQ -1
@@ -41,6 +40,7 @@ struct topology {
int32_t streaming_mp;
int32_t cores_per_mp;
int32_t cuda_cores;
int32_t tensor_cores;
};
struct memory {
@@ -57,11 +57,11 @@ struct gpu_info {
char* name;
int64_t freq;
struct pci* pci;
struct nvml_data* nvmld;
struct topology* topo;
struct memory* mem;
struct cache* cach;
int64_t peak_performance;
int64_t peak_performance_t;
int32_t idx;
};
@@ -74,5 +74,6 @@ char* get_str_bus_width(struct gpu_info* gpu);
char* get_str_memory_clock(struct gpu_info* gpu);
char* get_str_l2(struct gpu_info* gpu);
char* get_str_peak_performance(struct gpu_info* gpu);
char* get_str_peak_performance_tensor(struct gpu_info* gpu);
#endif

src/common/main.cpp

@@ -7,7 +7,7 @@
#include "../cuda/cuda.hpp"
#include "../cuda/uarch.hpp"
static const char* VERSION = "0.10";
static const char* VERSION = "0.11";
void print_help(char *argv[]) {
const char **t = args_str;
@@ -18,10 +18,11 @@ void print_help(char *argv[]) {
printf("Simple yet fancy GPU architecture fetching tool\n\n");
printf("Options: \n");
printf(" -%c, --%s %*s Sets the color scheme (by default, gpufetch uses the system color scheme) See COLORS section for a more detailed explanation\n", c[ARG_COLOR], t[ARG_COLOR], (int) (max_len-strlen(t[ARG_COLOR])), "");
printf(" -%c, --%s %*s Selects the GPU to use (default: 0)\n", c[ARG_GPU], t[ARG_GPU], (int) (max_len-strlen(t[ARG_GPU])), "");
printf(" -%c, --%s %*s Prints this help and exit\n", c[ARG_HELP], t[ARG_HELP], (int) (max_len-strlen(t[ARG_HELP])), "");
printf(" -%c, --%s %*s Prints gpufetch version and exit\n", c[ARG_VERSION], t[ARG_VERSION], (int) (max_len-strlen(t[ARG_VERSION])), "");
printf(" -%c, --%s %*s Set the color scheme (by default, gpufetch uses the system color scheme) See COLORS section for a more detailed explanation\n", c[ARG_COLOR], t[ARG_COLOR], (int) (max_len-strlen(t[ARG_COLOR])), "");
printf(" -%c, --%s %*s List the available GPUs in the system\n", c[ARG_LIST], t[ARG_LIST], (int) (max_len-strlen(t[ARG_LIST])), "");
printf(" -%c, --%s %*s Select the GPU to use (default: 0)\n", c[ARG_GPU], t[ARG_GPU], (int) (max_len-strlen(t[ARG_GPU])), "");
printf(" -%c, --%s %*s Print this help and exit\n", c[ARG_HELP], t[ARG_HELP], (int) (max_len-strlen(t[ARG_HELP])), "");
printf(" -%c, --%s %*s Print gpufetch version and exit\n", c[ARG_VERSION], t[ARG_VERSION], (int) (max_len-strlen(t[ARG_VERSION])), "");
printf("\nCOLORS: \n");
printf(" Color scheme can be set using a predefined color scheme or a custom one:\n");
@@ -64,6 +65,10 @@ int main(int argc, char* argv[]) {
return EXIT_SUCCESS;
}
if(list_gpus()) {
return print_gpus_list();
}
set_log_level(true);
printWarn("gpufetch is in beta. The provided information may be incomplete or wrong.\n\

src/common/pci.cpp (new file)

@@ -0,0 +1,45 @@
#include "global.hpp"
#include "pci.hpp"
#include <cstddef>
/*
* doc: https://wiki.osdev.org/PCI#Class_Codes
* https://pci-ids.ucw.cz/read/PC
*/
#define VENDOR_ID_NVIDIA 0x10de
#define CLASS_VGA_CONTROLLER 0x0300
uint16_t pciutils_get_pci_vendor_id(struct pci_dev *devices) {
for(struct pci_dev *dev=devices; dev != NULL; dev=dev->next) {
if(dev->vendor_id == VENDOR_ID_NVIDIA && dev->device_class == CLASS_VGA_CONTROLLER) {
return dev->vendor_id;
}
}
printErr("Unable to find a CUDA device using pciutils");
return 0;
}
uint16_t pciutils_get_pci_device_id(struct pci_dev *devices) {
for(struct pci_dev *dev=devices; dev != NULL; dev=dev->next) {
if(dev->vendor_id == VENDOR_ID_NVIDIA && dev->device_class == CLASS_VGA_CONTROLLER) {
return dev->device_id;
}
}
printErr("Unable to find a CUDA device using pciutils");
return 0;
}
struct pci_dev *get_pci_devices_from_pciutils() {
struct pci_access *pacc;
struct pci_dev *dev;
pacc = pci_alloc();
pci_init(pacc);
pci_scan_bus(pacc);
for (dev=pacc->devices; dev; dev=dev->next) {
pci_fill_info(dev, PCI_FILL_IDENT | PCI_FILL_BASES | PCI_FILL_CLASS);
}
return pacc->devices;
}

src/common/pci.hpp (new file)

@@ -0,0 +1,13 @@
#ifndef __GPUFETCH_PCI__
#define __GPUFETCH_PCI__
#include <cstdint>
extern "C" {
#include <pci/pci.h>
}
uint16_t pciutils_get_pci_vendor_id(struct pci_dev *devices);
uint16_t pciutils_get_pci_device_id(struct pci_dev *devices);
struct pci_dev *get_pci_devices_from_pciutils();
#endif
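Two notes on this header: the `extern "C"` block is needed because pciutils ships plain C headers, so the declarations must not be name-mangled by the C++ compiler; and together with src/common/pci.cpp above it forms the full PCI path that replaces the NVML backend. A minimal usage sketch — a hypothetical standalone program, assuming libpci is linked:

```cpp
#include <cstdio>
#include "pci.hpp"  // the header above; pulls in <pci/pci.h> inside extern "C"

int main() {
  // Scan the PCI bus once, then query the first NVIDIA VGA controller.
  struct pci_dev *devices = get_pci_devices_from_pciutils();
  uint16_t vendor = pciutils_get_pci_vendor_id(devices);  // 0x10de if found
  uint16_t device = pciutils_get_pci_device_id(devices);  // chip-specific ID
  if (vendor == 0) return 1;  // helpers print an error and return 0 on failure
  printf("PCI ID: %04x:%04x\n", vendor, device);
  return 0;
}
```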

src/common/printer.cpp

@@ -38,11 +38,13 @@ enum {
ATTRIBUTE_STREAMINGMP,
ATTRIBUTE_CORESPERMP,
ATTRIBUTE_CUDA_CORES,
ATTRIBUTE_TENSOR_CORES,
ATTRIBUTE_L2,
ATTRIBUTE_MEMORY,
ATTRIBUTE_MEMORY_FREQ,
ATTRIBUTE_BUS_WIDTH,
ATTRIBUTE_PEAK
ATTRIBUTE_PEAK,
ATTRIBUTE_PEAK_TENSOR,
};
static const char* ATTRIBUTE_FIELDS [] = {
@@ -53,12 +55,14 @@ static const char* ATTRIBUTE_FIELDS [] = {
"Max Frequency:",
"SMs:",
"Cores/SM:",
"CUDA cores:",
"CUDA Cores:",
"Tensor Cores:",
"L2 Size:",
"Memory:",
"Memory frequency:",
"Bus width:",
"Peak Performance:",
"Peak Performance (TC):",
};
static const char* ATTRIBUTE_FIELDS_SHORT [] = {
@@ -69,12 +73,14 @@ static const char* ATTRIBUTE_FIELDS_SHORT [] = {
"Max Freq.:",
"SMs:",
"Cores/SM:",
"CUDA cores:",
"CUDA Cores:",
"Tensor Cores:",
"L2 Size:",
"Memory:",
"Memory freq.:",
"Bus width:",
"Peak Perf.:",
"Peak Perf.(TC):",
};
struct terminal {
@@ -350,6 +356,7 @@ bool print_gpufetch_cuda(struct gpu_info* gpu, STYLE s, struct color** cs, struc
char* sms = get_str_sm(gpu);
char* corespersm = get_str_cores_sm(gpu);
char* cores = get_str_cuda_cores(gpu);
char* tensorc = get_str_tensor_cores(gpu);
char* max_frequency = get_str_freq(gpu);
char* l2 = get_str_l2(gpu);
char* mem_size = get_str_memory_size(gpu);
@@ -357,6 +364,7 @@ bool print_gpufetch_cuda(struct gpu_info* gpu, STYLE s, struct color** cs, struc
char* mem_freq = get_str_memory_clock(gpu);
char* bus_width = get_str_bus_width(gpu);
char* pp = get_str_peak_performance(gpu);
char* pp_tensor = get_str_peak_performance_tensor(gpu);
char* mem = (char *) emalloc(sizeof(char) * (strlen(mem_size) + strlen(mem_type) + 2));
sprintf(mem, "%s %s", mem_size, mem_type);
@@ -372,11 +380,17 @@ bool print_gpufetch_cuda(struct gpu_info* gpu, STYLE s, struct color** cs, struc
setAttribute(art, ATTRIBUTE_STREAMINGMP, sms);
setAttribute(art, ATTRIBUTE_CORESPERMP, corespersm);
setAttribute(art, ATTRIBUTE_CUDA_CORES, cores);
if(gpu->topo->tensor_cores >= 0) {
setAttribute(art, ATTRIBUTE_TENSOR_CORES, tensorc);
}
setAttribute(art, ATTRIBUTE_MEMORY, mem);
setAttribute(art, ATTRIBUTE_MEMORY_FREQ, mem_freq);
setAttribute(art, ATTRIBUTE_BUS_WIDTH, bus_width);
setAttribute(art, ATTRIBUTE_L2, l2);
setAttribute(art, ATTRIBUTE_PEAK, pp);
if(gpu->topo->tensor_cores >= 0) {
setAttribute(art, ATTRIBUTE_PEAK_TENSOR, pp_tensor);
}
const char** attribute_fields = ATTRIBUTE_FIELDS;
uint32_t longest_attribute = longest_attribute_length(art, attribute_fields);

src/cuda/cuda.cpp

@@ -2,10 +2,46 @@
#include <cuda_runtime.h>
#include "cuda.hpp"
#include "nvmlb.hpp"
#include "uarch.hpp"
#include "../common/pci.hpp"
#include "../common/global.hpp"
int print_gpus_list() {
cudaError_t err = cudaSuccess;
int num_gpus = -1;
if ((err = cudaGetDeviceCount(&num_gpus)) != cudaSuccess) {
printErr("%s: %s", cudaGetErrorName(err), cudaGetErrorString(err));
return EXIT_FAILURE;
}
printf("CUDA GPUs available: %d\n", num_gpus);
if(num_gpus > 0) {
cudaDeviceProp deviceProp;
int max_len = 0;
for(int idx=0; idx < num_gpus; idx++) {
if ((err = cudaGetDeviceProperties(&deviceProp, idx)) != cudaSuccess) {
printErr("%s: %s", cudaGetErrorName(err), cudaGetErrorString(err));
return EXIT_FAILURE;
}
max_len = max(max_len, (int) strlen(deviceProp.name));
}
for(int i=0; i < max_len + 32; i++) putchar('-');
putchar('\n');
for(int idx=0; idx < num_gpus; idx++) {
if ((err = cudaGetDeviceProperties(&deviceProp, idx)) != cudaSuccess) {
printErr("%s: %s", cudaGetErrorName(err), cudaGetErrorString(err));
return EXIT_FAILURE;
}
printf("GPU %d: %s (Compute Capability %d.%d)\n", idx, deviceProp.name, deviceProp.major, deviceProp.minor);
}
}
return EXIT_SUCCESS;
}
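With this routine, the new `gpufetch -l` / `--list-gpus` option prints one line per device under a separator whose width tracks the longest device name. A hypothetical example — the device is made up; the format follows directly from the printf calls above:

```
CUDA GPUs available: 1
-------------------------------------------------------
GPU 0: NVIDIA GeForce RTX 3060 (Compute Capability 8.6)
```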
struct cache* get_cache_info(cudaDeviceProp prop) {
struct cache* cach = (struct cache*) emalloc(sizeof(struct cache));
@@ -17,12 +53,19 @@ struct cache* get_cache_info(cudaDeviceProp prop) {
return cach;
}
int get_tensor_cores(int sm, int major) {
if(major == 7) return sm * 8;
else if(major == 8) return sm * 4;
else return 0;
}
struct topology* get_topology_info(cudaDeviceProp prop) {
struct topology* topo = (struct topology*) emalloc(sizeof(struct topology));
topo->streaming_mp = prop.multiProcessorCount;
topo->cores_per_mp = _ConvertSMVer2Cores(prop.major, prop.minor);
topo->cuda_cores = topo->streaming_mp * topo->cores_per_mp;
topo->tensor_cores = get_tensor_cores(topo->streaming_mp, prop.major);
return topo;
}
@@ -60,10 +103,16 @@ struct memory* get_memory_info(struct gpu_info* gpu, cudaDeviceProp prop) {
return mem;
}
// Compute peak performance when using CUDA cores
int64_t get_peak_performance(struct gpu_info* gpu) {
return gpu->freq * 1000000 * gpu->topo->cuda_cores * 2;
}
// Compute peak performance when using tensor cores
int64_t get_peak_performance_t(struct gpu_info* gpu) {
return gpu->freq * 1000000 * 4 * 4 * 8 * gpu->topo->tensor_cores;
}
struct gpu_info* get_gpu_info(int gpu_idx) {
struct gpu_info* gpu = (struct gpu_info*) emalloc(sizeof(struct gpu_info));
gpu->pci = NULL;
@@ -106,38 +155,39 @@ struct gpu_info* get_gpu_info(int gpu_idx) {
gpu->name = (char *) emalloc(sizeof(char) * (strlen(deviceProp.name) + 1));
strcpy(gpu->name, deviceProp.name);
gpu->nvmld = nvml_init();
if(nvml_get_pci_info(gpu->idx, gpu->nvmld)) {
gpu->pci = get_pci_from_nvml(gpu->nvmld);
}
struct pci_dev *devices = get_pci_devices_from_pciutils();
gpu->pci = get_pci_from_pciutils(devices);
gpu->arch = get_uarch_from_cuda(gpu);
gpu->cach = get_cache_info(deviceProp);
gpu->mem = get_memory_info(gpu, deviceProp);
gpu->topo = get_topology_info(deviceProp);
gpu->peak_performance = get_peak_performance(gpu);
gpu->peak_performance_t = get_peak_performance_t(gpu);
return gpu;
}
char* get_str_sm(struct gpu_info* gpu) {
uint32_t max_size = 10;
char* get_str_generic(int32_t data) {
// Largest int32 has 10 digits, +1 for a possible sign, +1 for the null terminator
uint32_t max_size = 12;
char* dummy = (char *) ecalloc(max_size, sizeof(char));
snprintf(dummy, max_size, "%d", gpu->topo->streaming_mp);
snprintf(dummy, max_size, "%d", data);
return dummy;
}
char* get_str_sm(struct gpu_info* gpu) {
return get_str_generic(gpu->topo->streaming_mp);
}
char* get_str_cores_sm(struct gpu_info* gpu) {
uint32_t max_size = 10;
char* dummy = (char *) ecalloc(max_size, sizeof(char));
snprintf(dummy, max_size, "%d", gpu->topo->cores_per_mp);
return dummy;
return get_str_generic(gpu->topo->cores_per_mp);
}
char* get_str_cuda_cores(struct gpu_info* gpu) {
uint32_t max_size = 10;
char* dummy = (char *) ecalloc(max_size, sizeof(char));
snprintf(dummy, max_size, "%d", gpu->topo->cuda_cores);
return dummy;
return get_str_generic(gpu->topo->cuda_cores);
}
char* get_str_tensor_cores(struct gpu_info* gpu) {
return get_str_generic(gpu->topo->tensor_cores);
}
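As a sanity check on the two peak-performance helpers added above, here is a small standalone sketch with made-up numbers (the clock and SM count are illustrative, not taken from any real chip):

```cpp
#include <cstdio>
#include <cstdint>

int main() {
  // Hypothetical Ampere-class device (major == 8); values are illustrative only.
  int64_t freq_mhz = 1500;          // boost clock in MHz
  int32_t sms = 40;                 // streaming multiprocessors
  int32_t cuda_cores = sms * 128;   // 128 FP32 cores per SM on this class
  int32_t tensor_cores = sms * 4;   // get_tensor_cores(): major == 8 -> 4 per SM

  // CUDA-core peak: 2 FLOP (one FMA) per core per cycle.
  int64_t pp  = freq_mhz * 1000000 * cuda_cores * 2;
  // Tensor-core peak: 4*4*8 = 128 FLOP per tensor core per cycle.
  int64_t ppt = freq_mhz * 1000000 * 4 * 4 * 8 * (int64_t) tensor_cores;

  printf("FP32 peak:   %.2f TFLOP/s\n", pp  / 1e12);  // 15.36 TFLOP/s
  printf("Tensor peak: %.2f TFLOP/s\n", ppt / 1e12);  // 30.72 TFLOP/s
  return 0;
}
```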

src/cuda/cuda.hpp

@@ -4,8 +4,10 @@
#include "../common/gpu.hpp"
struct gpu_info* get_gpu_info(int gpu_idx);
int print_gpus_list();
char* get_str_sm(struct gpu_info* gpu);
char* get_str_cores_sm(struct gpu_info* gpu);
char* get_str_cuda_cores(struct gpu_info* gpu);
char* get_str_tensor_cores(struct gpu_info* gpu);
#endif

src/cuda/nvmlb.cpp (deleted)

@@ -1,70 +0,0 @@
#include <nvml.h>
#include "nvmlb.hpp"
#include "../common/global.hpp"
struct nvml_data {
bool nvml_started;
nvmlPciInfo_t pci;
};
struct nvml_data* nvml_init() {
struct nvml_data* data = (struct nvml_data*) emalloc(sizeof(struct nvml_data));
data->nvml_started = false;
nvmlReturn_t result;
if ((result = nvmlInit()) != NVML_SUCCESS) {
printErr("nvmlInit: %s\n", nvmlErrorString(result));
return NULL;
}
data->nvml_started = true;
return data;
}
bool nvml_get_pci_info(int gpu_idx, struct nvml_data* data) {
nvmlReturn_t result;
nvmlDevice_t device;
if(!data->nvml_started) {
printErr("nvml_get_pci_info: nvml was not started");
return false;
}
if ((result = nvmlDeviceGetHandleByIndex(gpu_idx, &device)) != NVML_SUCCESS) {
printErr("nvmlDeviceGetHandleByIndex: %s\n", nvmlErrorString(result));
return false;
}
if ((result = nvmlDeviceGetPciInfo(device, &data->pci)) != NVML_SUCCESS) {
printErr("nvmlDeviceGetPciInfo: %s\n", nvmlErrorString(result));
return false;
}
return true;
}
uint16_t nvml_get_pci_vendor_id(struct nvml_data* data) {
return data->pci.pciDeviceId & 0x0000FFFF;
}
uint16_t nvml_get_pci_device_id(struct nvml_data* data) {
return (data->pci.pciDeviceId & 0xFFFF0000) >> 16;
}
bool nvml_shutdown(struct nvml_data* data) {
nvmlReturn_t result;
if(!data->nvml_started) {
printWarn("nvml_get_pci_info: nvml was not started");
return true;
}
if ((result = nvmlShutdown()) != NVML_SUCCESS) {
printErr("nvmlShutdown: %s\n", nvmlErrorString(result));
return false;
}
return true;
}

src/cuda/nvmlb.hpp (deleted)

@@ -1,16 +0,0 @@
// NVML Backend
#ifndef __NVMLB__
#define __NVMLB__
#include <stdbool.h>
#include <stdint.h>
struct nvml_data;
struct nvml_data* nvml_init();
bool nvml_get_pci_info(int dev, struct nvml_data* data);
uint16_t nvml_get_pci_vendor_id(struct nvml_data* data);
uint16_t nvml_get_pci_device_id(struct nvml_data* data);
bool nvml_shutdown(struct nvml_data* data);
#endif

src/cuda/pci.cpp

@@ -1,9 +1,9 @@
#include <stdio.h>
#include "pci.hpp"
#include "nvmlb.hpp"
#include "chips.hpp"
#include "../common/global.hpp"
#include "../common/pci.hpp"
#define CHECK_PCI_START if (false) {}
#define CHECK_PCI(pci, id, chip) \
@@ -15,11 +15,11 @@ struct pci {
uint16_t device_id;
};
struct pci* get_pci_from_nvml(struct nvml_data* data) {
struct pci* get_pci_from_pciutils(struct pci_dev *devices) {
struct pci* pci = (struct pci*) emalloc(sizeof(struct pci));
pci->vendor_id = nvml_get_pci_vendor_id(data);
pci->device_id = nvml_get_pci_device_id(data);
pci->vendor_id = pciutils_get_pci_vendor_id(devices);
pci->device_id = pciutils_get_pci_device_id(devices);
return pci;
}

src/cuda/pci.hpp

@@ -1,13 +1,14 @@
#ifndef __PCI__
#define __PCI__
#ifndef __PCI_CUDA__
#define __PCI_CUDA__
#include <stdint.h>
#include "nvmlb.hpp"
#include "../common/pci.hpp"
#include "chips.hpp"
struct pci;
struct pci* get_pci_from_nvml(struct nvml_data* data);
struct pci* get_pci_from_pciutils(struct pci_dev *devices);
GPUCHIP get_chip_from_pci(struct pci* pci);
#endif