Cleaning memory_pool_callback

Fixes
I guess we can rely on HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED
2025-10-23 21:19:27 +02:00 · 2025-10-23 21:12:52 +02:00 · 2025-10-23 21:11:25 +02:00 · 2025-10-17 09:05:13 +02:00 · 2025-10-17 08:48:15 +02:00 · 2025-10-17 08:41:21 +02:00
12 changed files with 257 additions and 123 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,9 +10,10 @@ set(CUDA_DIR "${SRC_DIR}/cuda")
 set(HSA_DIR "${SRC_DIR}/hsa")
 set(INTEL_DIR "${SRC_DIR}/intel")
-# Enable Intel backend by default
+# Make sure that at least one backend is enabled.
-if(NOT DEFINED ENABLE_INTEL_BACKEND)
+# It does not make sense that the user has not specified any backend.
-    set(ENABLE_INTEL_BACKEND true)
+if(NOT ENABLE_INTEL_BACKEND AND NOT ENABLE_CUDA_BACKEND AND NOT ENABLE_HSA_BACKEND)
  message(FATAL_ERROR "No backend was enabled! Please enable at least one backend with -DENABLE_XXX_BACKEND")
 endif()
 if(ENABLE_CUDA_BACKEND)
@@ -27,8 +28,7 @@ if(ENABLE_CUDA_BACKEND)
 endif()
 if(ENABLE_HSA_BACKEND)
-  # TODO: Needs rocm-cmake, what if its not insalled?
+  find_package(ROCmCMakeBuildTools QUIET)
  find_package(ROCmCMakeBuildTools)
  if (ROCmCMakeBuildTools_FOUND)
    find_package(hsa-runtime64 1.0 REQUIRED)    
    link_directories(hsa_backend hsa-runtime64::hsa-runtime64)
@@ -49,40 +49,81 @@ if(ENABLE_HSA_BACKEND)
      set(ENABLE_HSA_BACKEND false)
    endif()
  else()
-    set(ENABLE_HSA_BACKEND false)
+    # rocm-cmake is not installed, try to manually find the necessary files.
-    message(STATUS "${BoldYellow}ROCm not found${ColorReset}")
+    message(STATUS "${BoldYellow}Could NOT find HSA automatically, running manual search...${ColorReset}")
    if (NOT DEFINED ROCM_PATH)
      set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to ROCm")
    endif()
    find_path(HSA_INCLUDE_DIR hsa/hsa.h HINTS ${ROCM_PATH}/include)
    find_library(HSA_LIBRARY hsa-runtime64 HINTS ${ROCM_PATH}/lib ${ROCM_PATH}/lib64)
    if (HSA_INCLUDE_DIR AND HSA_LIBRARY)
      message(STATUS "${BoldYellow}HSA was found manually${ColorReset}")
    else()
      set(ENABLE_HSA_BACKEND false)
      message(STATUS "${BoldYellow}HSA was not found manually${ColorReset}")
    endif()
  endif()
 endif()
-list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake")
+set(GPUFECH_COMMON
-find_package(PCIUTILS)
+    ${COMMON_DIR}/main.cpp
-if(NOT ${PCIUTILS_FOUND})
+    ${COMMON_DIR}/args.cpp
-  message(STATUS "${BoldYellow}pciutils not found, downloading and building a local copy...${ColorReset}")
+    ${COMMON_DIR}/gpu.cpp
    ${COMMON_DIR}/global.cpp
    ${COMMON_DIR}/printer.cpp
    ${COMMON_DIR}/master.cpp
    ${COMMON_DIR}/uarch.cpp
 )
-  # Download and build pciutils
+set(GPUFETCH_LINK_TARGETS z)
  set(PCIUTILS_INSTALL_LOCATION ${CMAKE_BINARY_DIR}/pciutils-install)
  ExternalProject_Add(pciutils
    GIT_REPOSITORY https://github.com/pciutils/pciutils
    CONFIGURE_COMMAND ""
    BUILD_COMMAND make SHARED=no HWDB=no
    BUILD_IN_SOURCE true
    INSTALL_COMMAND make PREFIX=${PCIUTILS_INSTALL_LOCATION} install-lib
  )
-  include_directories(${PCIUTILS_INSTALL_LOCATION}/include)
+if(NOT(ENABLE_HSA_BACKEND AND NOT ENABLE_CUDA_BACKEND AND NOT ENABLE_INTEL_BACKEND))
-  link_directories(${PCIUTILS_INSTALL_LOCATION}/lib)
+  # Look for pciutils only if not building HSA only.
-else()
+  #
-  include_directories(${PCIUTILS_INCLUDE_DIR})
+  # This has the (intended) secondary effect that if only HSA backend is enabled
-  link_libraries(${PCIUTILS_LIBRARIES})
+  # by the user, but ROCm cannot be found, pciutils will still be compiled in
-  # Needed for linking libpci in FreeBSD
+  # order to show the list of GPUs available on the system, so that the user will
-  link_directories(/usr/local/lib/)
+  # get at least some feedback even if HSA is not found.
  list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake")
  list(APPEND GPUFECH_COMMON ${COMMON_DIR}/pci.cpp ${COMMON_DIR}/sort.cpp)
  list(APPEND GPUFETCH_LINK_TARGETS pci)
  set(CMAKE_ENABLE_PCIUTILS ON)
  find_package(PCIUTILS)
  if(NOT ${PCIUTILS_FOUND})
    message(STATUS "${BoldYellow}pciutils not found, downloading and building a local copy...${ColorReset}")
    # Download and build pciutils
    set(PCIUTILS_INSTALL_LOCATION ${CMAKE_BINARY_DIR}/pciutils-install)
    ExternalProject_Add(pciutils
      GIT_REPOSITORY https://github.com/pciutils/pciutils
      CONFIGURE_COMMAND ""
      BUILD_COMMAND make SHARED=no HWDB=no
      BUILD_IN_SOURCE true
      INSTALL_COMMAND make PREFIX=${PCIUTILS_INSTALL_LOCATION} install-lib
    )
    include_directories(${PCIUTILS_INSTALL_LOCATION}/include)
    link_directories(${PCIUTILS_INSTALL_LOCATION}/lib)
  else()
    include_directories(${PCIUTILS_INCLUDE_DIR})
    link_libraries(${PCIUTILS_LIBRARIES})
    # Needed for linking libpci in FreeBSD
    link_directories(/usr/local/lib/)
  endif()
 endif()
-add_executable(gpufetch ${COMMON_DIR}/main.cpp ${COMMON_DIR}/args.cpp ${COMMON_DIR}/gpu.cpp ${COMMON_DIR}/pci.cpp ${COMMON_DIR}/sort.cpp ${COMMON_DIR}/global.cpp ${COMMON_DIR}/printer.cpp ${COMMON_DIR}/master.cpp ${COMMON_DIR}/uarch.cpp)
+add_executable(gpufetch ${GPUFECH_COMMON})
 set(SANITY_FLAGS -Wfloat-equal -Wshadow -Wpointer-arith -Wall -Wextra -pedantic -fstack-protector-all -pedantic)
 target_compile_features(gpufetch PRIVATE cxx_std_11)
 target_compile_options(gpufetch PRIVATE ${SANITY_FLAGS})
 if (CMAKE_ENABLE_PCIUTILS)
  target_compile_definitions(gpufetch PUBLIC BACKEND_USE_PCI)
 endif()
 if(ENABLE_INTEL_BACKEND)
  target_compile_definitions(gpufetch PUBLIC BACKEND_INTEL)
@@ -134,13 +175,17 @@ if(ENABLE_HSA_BACKEND)
  endif()
  target_include_directories(hsa_backend PRIVATE "${HSA_INCLUDE_DIR}")
  message(STATUS "Found HSA: ${HSA_INCLUDE_DIR}")
-  target_link_libraries(hsa_backend PRIVATE hsa-runtime64::hsa-runtime64)
+  if (HSA_LIBRARY)
    target_link_libraries(hsa_backend PRIVATE ${HSA_LIBRARY})
  else()
    target_link_libraries(hsa_backend PRIVATE hsa-runtime64::hsa-runtime64)
  endif()
  target_link_libraries(gpufetch hsa_backend)
 endif()
-target_link_libraries(gpufetch pci z)
+target_link_libraries(gpufetch ${GPUFETCH_LINK_TARGETS})
 install(TARGETS gpufetch DESTINATION bin)
 if(NOT WIN32)
--- a/src/common/gpu.cpp
+++ b/src/common/gpu.cpp
@@ -101,6 +101,17 @@ char* get_str_bus_width(struct gpu_info* gpu) {
  return string;
 }
 // Returns a newly allocated string with the LDS size of the GPU expressed in
 // KB (e.g. "64 KB"). The value is read from gpu->mem->lds_size (bytes) and
 // divided by 1024. The caller owns the returned buffer.
 char* get_str_lds_size(struct gpu_info* gpu) {
  // TODO: Show XX KB (XX MB Total) like in cpufetch
  // Room for the longest possible 32-bit value ("-2147483648", 11 chars),
  // the " KB" suffix (3 chars) and the terminating NUL. The previous 8-byte
  // buffer overflowed as soon as the value needed five or more digits.
  uint32_t size = 11+3+1;
  assert(strlen(STRING_UNKNOWN)+1 <= size);
  char* string = (char *) ecalloc(size, sizeof(char));
  // snprintf bounds the write to the allocated size no matter what value
  // the backend reported.
  snprintf(string, size, "%d KB", gpu->mem->lds_size / 1024);
  return string;
 }
 char* get_str_memory_clock(struct gpu_info* gpu) {
  return get_freq_as_str_mhz(gpu->mem->freq);
 }
--- a/src/common/gpu.hpp
+++ b/src/common/gpu.hpp
@@ -3,8 +3,6 @@
 #include <cstdint>
 #include "../cuda/pci.hpp"
 #define UNKNOWN_FREQ -1
 enum {
@@ -63,6 +61,7 @@ struct memory {
  int32_t bus_width;
  int32_t freq;
  int32_t clk_mul; // clock multiplier
  int32_t lds_size; // HSA specific for now
 };
 struct gpu_info {
@@ -90,6 +89,7 @@ char* get_str_freq(struct gpu_info* gpu);
 char* get_str_memory_size(struct gpu_info* gpu);
 char* get_str_memory_type(struct gpu_info* gpu);
 char* get_str_bus_width(struct gpu_info* gpu);
 char* get_str_lds_size(struct gpu_info* gpu);
 char* get_str_memory_clock(struct gpu_info* gpu);
 char* get_str_l2(struct gpu_info* gpu);
 char* get_str_peak_performance(struct gpu_info* gpu);
--- a/src/common/main.cpp
+++ b/src/common/main.cpp
@@ -8,6 +8,10 @@
 #include "../cuda/cuda.hpp"
 #include "../cuda/uarch.hpp"
 #ifdef BACKEND_USE_PCI
 #include "pci.hpp"
 #endif
 static const char* VERSION = "0.30";
 void print_help(char *argv[]) {
@@ -79,8 +83,12 @@ int main(int argc, char* argv[]) {
  }
  if(get_num_gpus_available(list) == 0) {
 #ifdef BACKEND_USE_PCI    
    printErr("No GPU was detected! Available GPUs are:");
    print_gpus_list_pci();
 #else
    printErr("No GPU was detected!");
 #endif    
    printf("Please, make sure that the appropiate backend is enabled:\n");
    print_enabled_backends();
    printf("Visit https://github.com/Dr-Noob/gpufetch#2-backends for more information\n");
--- a/src/common/master.cpp
+++ b/src/common/master.cpp
@@ -1,7 +1,10 @@
 #include <cstdlib>
 #include <cstdio>
-#include "pci.hpp"
+#ifdef BACKEND_USE_PCI
  #include "pci.hpp"
 #endif  
 #include "global.hpp"
 #include "colors.hpp"
 #include "master.hpp"
@@ -19,7 +22,9 @@ struct gpu_list {
 struct gpu_list* get_gpu_list() {
  int idx = 0;
 #ifdef BACKEND_USE_PCI
  struct pci_dev *devices = get_pci_devices_from_pciutils();
 #endif
  struct gpu_list* list = (struct gpu_list*) malloc(sizeof(struct gpu_list));
  list->num_gpus = 0;
  list->gpus = (struct gpu_info**) malloc(sizeof(struct info*) * MAX_GPUS);
@@ -40,7 +45,7 @@ struct gpu_list* get_gpu_list() {
  bool valid = true;
  while(valid) {
-    list->gpus[idx] = get_gpu_info_hsa(devices, idx);
+    list->gpus[idx] = get_gpu_info_hsa(idx);
    if(list->gpus[idx] != NULL) idx++;
    else valid = false;
  }
--- a/src/common/printer.cpp
+++ b/src/common/printer.cpp
@@ -32,64 +32,56 @@
 #define MAX_ATTRIBUTES      100
 #define MAX_TERM_SIZE       1024
 typedef struct {
  int id;
  const char *name;
  const char *shortname;
 } AttributeField;
 // AttributeField IDs
 //                         Used by
 enum {
-  ATTRIBUTE_NAME,
+  ATTRIBUTE_NAME,          // ALL
-  ATTRIBUTE_CHIP,
+  ATTRIBUTE_CHIP,          // ALL
-  ATTRIBUTE_UARCH,
+  ATTRIBUTE_UARCH,         // ALL
-  ATTRIBUTE_TECHNOLOGY,
+  ATTRIBUTE_TECHNOLOGY,    // ALL
-  ATTRIBUTE_GT,
+  ATTRIBUTE_FREQUENCY,     // ALL
-  ATTRIBUTE_FREQUENCY,
+  ATTRIBUTE_PEAK,          // ALL
-  ATTRIBUTE_STREAMINGMP,
+  ATTRIBUTE_COMPUTE_UNITS, // HSA
-  ATTRIBUTE_CORESPERMP,
+  ATTRIBUTE_LDS_SIZE,      // HSA
-  ATTRIBUTE_CUDA_CORES,
+  ATTRIBUTE_STREAMINGMP,   // CUDA
-  ATTRIBUTE_TENSOR_CORES,
+  ATTRIBUTE_CORESPERMP,    // CUDA
-  ATTRIBUTE_EUS,
+  ATTRIBUTE_CUDA_CORES,    // CUDA
-  ATTRIBUTE_L2,
+  ATTRIBUTE_TENSOR_CORES,  // CUDA
-  ATTRIBUTE_MEMORY,
+  ATTRIBUTE_L2,            // CUDA
-  ATTRIBUTE_MEMORY_FREQ,
+  ATTRIBUTE_MEMORY,        // CUDA,HSA
-  ATTRIBUTE_BUS_WIDTH,
+  ATTRIBUTE_MEMORY_FREQ,   // CUDA
-  ATTRIBUTE_PEAK,
+  ATTRIBUTE_BUS_WIDTH,     // CUDA,HSA
-  ATTRIBUTE_PEAK_TENSOR,
+  ATTRIBUTE_PEAK_TENSOR,   // CUDA
  ATTRIBUTE_EUS,           // Intel
  ATTRIBUTE_GT,            // Intel
 };
-static const char* ATTRIBUTE_FIELDS [] = {
+static const AttributeField ATTRIBUTE_INFO[] = {
-  "Name:",
+  { ATTRIBUTE_NAME,          "Name:",                   "Name:" },
-  "GPU processor:",
+  { ATTRIBUTE_CHIP,          "GPU processor:",          "Processor:" },
-  "Microarchitecture:",
+  { ATTRIBUTE_UARCH,         "Microarchitecture:",      "uArch:" },
-  "Technology:",
+  { ATTRIBUTE_TECHNOLOGY,    "Technology:",             "Technology:" },
-  "Graphics Tier:",
+  { ATTRIBUTE_FREQUENCY,     "Max Frequency:",          "Max Freq.:" },
-  "Max Frequency:",
+  { ATTRIBUTE_PEAK,          "Peak Performance:",       "Peak Perf.:" },
-  "SMs:",
+  { ATTRIBUTE_COMPUTE_UNITS, "Compute Units (CUs):",    "CUs" },
-  "Cores/SM:",
+  { ATTRIBUTE_LDS_SIZE,      "LDS size:",               "LDS:" },
-  "CUDA Cores:",
+  { ATTRIBUTE_STREAMINGMP,   "SMs:",                    "SMs:" },
-  "Tensor Cores:",
+  { ATTRIBUTE_CORESPERMP,    "Cores/SM:",               "Cores/SM:" },
-  "Execution Units:",
+  { ATTRIBUTE_CUDA_CORES,    "CUDA Cores:",             "CUDA Cores:" },
-  "L2 Size:",
+  { ATTRIBUTE_TENSOR_CORES,  "Tensor Cores:",           "Tensor Cores:" },
-  "Memory:",
+  { ATTRIBUTE_L2,            "L2 Size:",                "L2 Size:" },
-  "Memory frequency:",
+  { ATTRIBUTE_MEMORY,        "Memory:",                 "Memory:" },
-  "Bus width:",
+  { ATTRIBUTE_MEMORY_FREQ,   "Memory frequency:",       "Memory freq.:" },
-  "Peak Performance:",
+  { ATTRIBUTE_BUS_WIDTH,     "Bus width:",              "Bus width:" },
-  "Peak Performance (MMA):",
+  { ATTRIBUTE_PEAK_TENSOR,   "Peak Performance (MMA):", "Peak Perf.(MMA):" },
-};
+  { ATTRIBUTE_EUS,           "Execution Units:",        "EUs:" },
-
+  { ATTRIBUTE_GT,            "Graphics Tier:",          "GT:" },
 static const char* ATTRIBUTE_FIELDS_SHORT [] = {
  "Name:",
  "Processor:",
  "uArch:",
  "Technology:",
  "GT:",
  "Max Freq.:",
  "SMs:",
  "Cores/SM:",
  "CUDA Cores:",
  "Tensor Cores:",
  "EUs:",
  "L2 Size:",
  "Memory:",
  "Memory freq.:",
  "Bus width:",
  "Peak Perf.:",
  "Peak Perf.(MMA):",
 };
 struct terminal {
@@ -207,8 +199,6 @@ bool ascii_fits_screen(int termw, struct ascii_logo logo, int lf) {
 void replace_bgbyfg_color(struct ascii_logo* logo) {
  // Replace background by foreground color
  for(int i=0; i < 2; i++) {
    if(logo->color_ascii[i] == NULL) break;
    if(strcmp(logo->color_ascii[i], C_BG_BLACK) == 0) strcpy(logo->color_ascii[i], C_FG_BLACK);
    else if(strcmp(logo->color_ascii[i], C_BG_RED) == 0) strcpy(logo->color_ascii[i], C_FG_RED);
    else if(strcmp(logo->color_ascii[i], C_BG_GREEN) == 0) strcpy(logo->color_ascii[i], C_FG_GREEN);
@@ -276,13 +266,14 @@ void choose_ascii_art(struct ascii* art, struct color** cs, struct terminal* ter
  }
 }
-uint32_t longest_attribute_length(struct ascii* art, const char** attribute_fields) {
+uint32_t longest_attribute_length(struct ascii* art, bool use_short) {
  uint32_t max = 0;
  uint64_t len = 0;
  for(uint32_t i=0; i < art->n_attributes_set; i++) {
    if(art->attributes[i]->value != NULL) {
-      len = strlen(attribute_fields[art->attributes[i]->type]);
+      const char* str = use_short ? ATTRIBUTE_INFO[art->attributes[i]->type].shortname : ATTRIBUTE_INFO[art->attributes[i]->type].name;
      len = strlen(str);
      if(len > max) max = len;
    }
  }
@@ -306,7 +297,7 @@ uint32_t longest_field_length(struct ascii* art, int la) {
  return max;
 }
-void print_ascii_generic(struct ascii* art, uint32_t la, int32_t text_space, const char** attribute_fields) {
+void print_ascii_generic(struct ascii* art, uint32_t la, int32_t text_space, bool use_short) {
  struct ascii_logo* logo = art->art;
  int attr_to_print = 0;
  int attr_type;
@@ -350,11 +341,13 @@ void print_ascii_generic(struct ascii* art, uint32_t la, int32_t text_space, con
      attr_value = art->attributes[attr_to_print]->value;
      attr_to_print++;
-      space_right = 1 + (la - strlen(attribute_fields[attr_type]));
+      const char* attr_str = use_short ? ATTRIBUTE_INFO[attr_type].shortname : ATTRIBUTE_INFO[attr_type].name;
      space_right = 1 + (la - strlen(attr_str));
      current_space = max(0, text_space);
-      printf("%s%.*s%s", logo->color_text[0], current_space, attribute_fields[attr_type], art->reset);
+      printf("%s%.*s%s", logo->color_text[0], current_space, attr_str, art->reset);
-      current_space = max(0, current_space - (int) strlen(attribute_fields[attr_type]));
+      current_space = max(0, current_space - (int) strlen(attr_str));
      printf("%*s", min(current_space, space_right), "");
      current_space = max(0, current_space - min(current_space, space_right));
      printf("%s%.*s%s", logo->color_text[1], current_space, attr_value, art->reset);
@@ -388,19 +381,19 @@ bool print_gpufetch_intel(struct gpu_info* gpu, STYLE s, struct color** cs, stru
  setAttribute(art, ATTRIBUTE_EUS, eus);
  setAttribute(art, ATTRIBUTE_PEAK, pp);
-  const char** attribute_fields = ATTRIBUTE_FIELDS;
+  bool use_short = false;
-  uint32_t longest_attribute = longest_attribute_length(art, attribute_fields);
+  uint32_t longest_attribute = longest_attribute_length(art, use_short);
  uint32_t longest_field = longest_field_length(art, longest_attribute);
  choose_ascii_art(art, cs, term, longest_field);
  if(!ascii_fits_screen(term->w, *art->art, longest_field)) {
    // Despite choosing the smallest logo, the output does not fit
    // Choose the shorter field names and recalculate the longest attr
-    attribute_fields = ATTRIBUTE_FIELDS_SHORT;
+    use_short = true;
-    longest_attribute = longest_attribute_length(art, attribute_fields);
+    longest_attribute = longest_attribute_length(art, use_short);
  }
-  print_ascii_generic(art, longest_attribute, term->w - art->art->width, attribute_fields);
+  print_ascii_generic(art, longest_attribute, term->w - art->art->width, use_short);
  return true;
 }
@@ -457,19 +450,19 @@ bool print_gpufetch_cuda(struct gpu_info* gpu, STYLE s, struct color** cs, struc
    setAttribute(art, ATTRIBUTE_PEAK_TENSOR, pp_tensor);
  }
-  const char** attribute_fields = ATTRIBUTE_FIELDS;
+  bool use_short = false;
-  uint32_t longest_attribute = longest_attribute_length(art, attribute_fields);
+  uint32_t longest_attribute = longest_attribute_length(art, use_short);
  uint32_t longest_field = longest_field_length(art, longest_attribute);
  choose_ascii_art(art, cs, term, longest_field);
  if(!ascii_fits_screen(term->w, *art->art, longest_field)) {
    // Despite choosing the smallest logo, the output does not fit
    // Choose the shorter field names and recalculate the longest attr
-    attribute_fields = ATTRIBUTE_FIELDS_SHORT;
+    use_short = true;
-    longest_attribute = longest_attribute_length(art, attribute_fields);
+    longest_attribute = longest_attribute_length(art, use_short);
  }
-  print_ascii_generic(art, longest_attribute, term->w - art->art->width, attribute_fields);
+  print_ascii_generic(art, longest_attribute, term->w - art->art->width, use_short);
  free(manufacturing_process);
  free(max_frequency);
@@ -494,8 +487,11 @@ bool print_gpufetch_amd(struct gpu_info* gpu, STYLE s, struct color** cs, struct
  char* gpu_chip = get_str_chip(gpu->arch);
  char* uarch = get_str_uarch_hsa(gpu->arch);
  char* manufacturing_process = get_str_process(gpu->arch);
-  char* sms = get_str_cu(gpu);
+  char* cus = get_str_cu(gpu);
  char* max_frequency = get_str_freq(gpu);
  char* bus_width = get_str_bus_width(gpu);
  char* mem_size = get_str_memory_size(gpu);
  char* lds_size = get_str_lds_size(gpu);
  setAttribute(art, ATTRIBUTE_NAME, gpu_name);
  if (gpu_chip != NULL) {
@@ -504,21 +500,24 @@ bool print_gpufetch_amd(struct gpu_info* gpu, STYLE s, struct color** cs, struct
  setAttribute(art, ATTRIBUTE_UARCH, uarch);
  setAttribute(art, ATTRIBUTE_TECHNOLOGY, manufacturing_process);
  setAttribute(art, ATTRIBUTE_FREQUENCY, max_frequency);
-  setAttribute(art, ATTRIBUTE_STREAMINGMP, sms);
+  setAttribute(art, ATTRIBUTE_COMPUTE_UNITS, cus);
  setAttribute(art, ATTRIBUTE_LDS_SIZE, lds_size);
  setAttribute(art, ATTRIBUTE_MEMORY, mem_size);
  setAttribute(art, ATTRIBUTE_BUS_WIDTH, bus_width);
-  const char** attribute_fields = ATTRIBUTE_FIELDS;
+  bool use_short = false;
-  uint32_t longest_attribute = longest_attribute_length(art, attribute_fields);
+  uint32_t longest_attribute = longest_attribute_length(art, use_short);
  uint32_t longest_field = longest_field_length(art, longest_attribute);
  choose_ascii_art(art, cs, term, longest_field);
  if(!ascii_fits_screen(term->w, *art->art, longest_field)) {
    // Despite choosing the smallest logo, the output does not fit
    // Choose the shorter field names and recalculate the longest attr
-    attribute_fields = ATTRIBUTE_FIELDS_SHORT;
+    use_short = true;
-    longest_attribute = longest_attribute_length(art, attribute_fields);
+    longest_attribute = longest_attribute_length(art, use_short);
  }
-  print_ascii_generic(art, longest_attribute, term->w - art->art->width, attribute_fields);
+  print_ascii_generic(art, longest_attribute, term->w - art->art->width, use_short);
  free(art->attributes);
  free(art);
--- a/src/cuda/cuda.cpp
+++ b/src/cuda/cuda.cpp
@@ -5,8 +5,8 @@
 #include "cuda.hpp"
 #include "uarch.hpp"
 #include "pci.hpp"
 #include "gpufetch_helper_cuda.hpp"
 #include "../common/pci.hpp"
 #include "../common/global.hpp"
 #include "../common/uarch.hpp"
@@ -33,10 +33,8 @@ int get_tensor_cores(struct uarch* arch, int sm, int major) {
  if(major == 7) {
    // TU116 does not have tensor cores!
    // https://www.anandtech.com/show/13973/nvidia-gtx-1660-ti-review-feat-evga-xc-gaming/2
-    if(arch->chip == CHIP_TU116   || arch->chip == CHIP_TU116BM ||
+    if (is_chip_TU116(arch))
       arch->chip == CHIP_TU116GL || arch->chip == CHIP_TU116M) {
      return 0;
    }
    return sm * 8;
  }
  else if(major == 8) return sm * 4;
--- a/src/cuda/uarch.cpp
+++ b/src/cuda/uarch.cpp
@@ -8,6 +8,7 @@
 #include "../common/uarch.hpp"
 #include "../common/global.hpp"
 #include "../common/gpu.hpp"
 #include "pci.hpp"
 #include "chips.hpp"
 // Any clock multiplier
@@ -361,3 +362,8 @@ void free_uarch_struct(struct uarch* arch) {
  free(arch->chip_str);
  free(arch);
 }
 // Tells whether the chip stored in arch is one of the TU116 variants
 // (TU116, TU116BM, TU116GL or TU116M).
 bool is_chip_TU116(struct uarch* arch) {
  const auto chip = arch->chip;
  if (chip == CHIP_TU116)   return true;
  if (chip == CHIP_TU116BM) return true;
  if (chip == CHIP_TU116GL) return true;
  if (chip == CHIP_TU116M)  return true;
  return false;
 }
--- a/src/cuda/uarch.hpp
+++ b/src/cuda/uarch.hpp
@@ -13,5 +13,6 @@ char* get_str_cc(struct uarch* arch);
 char* get_str_chip(struct uarch* arch);
 char* get_str_process(struct uarch* arch);
 void free_uarch_struct(struct uarch* arch);
 bool is_chip_TU116(struct uarch* arch);
 #endif
--- a/src/hsa/hsa.cpp
+++ b/src/hsa/hsa.cpp
@@ -13,7 +13,6 @@
 #include "hsa.hpp"
 #include "uarch.hpp"
 #include "../common/pci.hpp"
 #include "../common/global.hpp"
 #include "../common/uarch.hpp"
@@ -24,6 +23,9 @@ struct agent_info {
  char device_mkt_name[64];
  uint32_t max_clock_freq;
  uint32_t compute_unit;
  uint32_t bus_width;
  uint32_t lds_size;
  uint64_t global_size;
 };
 #define RET_IF_HSA_ERR(err) { \
@@ -41,6 +43,46 @@ struct agent_info {
  }                                                                           \
 }
 // Callback passed to hsa_amd_agent_iterate_memory_pools. `data` must point to
 // the struct agent_info being filled in. For a pool in the GROUP segment the
 // pool size is stored in info->lds_size; for a pool in the GLOBAL segment
 // flagged as extended-scope fine-grained the pool size is stored in
 // info->global_size. Returns HSA_STATUS_ERROR if a second such global pool is
 // found (info->global_size already non-zero), HSA_STATUS_SUCCESS otherwise.
 // HSA query failures are handled by RET_IF_HSA_ERR.
 hsa_status_t memory_pool_callback(hsa_amd_memory_pool_t pool, void* data) {
  struct agent_info* info = reinterpret_cast<struct agent_info *>(data);
  hsa_amd_segment_t segment;
  hsa_status_t err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment);
  RET_IF_HSA_ERR(err);
  if (segment == HSA_AMD_SEGMENT_GROUP) {
    // LDS memory
    // HSA_AMD_MEMORY_POOL_INFO_SIZE is documented as size_t; querying it into
    // a 32-bit variable lets the runtime write past the end of that variable.
    size_t size = 0;
    err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SIZE, &size);
    RET_IF_HSA_ERR(err);
    // agent_info stores the LDS size as 32 bits, so narrow explicitly.
    info->lds_size = static_cast<uint32_t>(size);
  }
  else if (segment == HSA_AMD_SEGMENT_GLOBAL) {
    // Global memory
    uint32_t global_flags = 0;
    err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flags);
    RET_IF_HSA_ERR(err);
    if (global_flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED) {
      // Only one pool with this flag is expected per agent; the caller resets
      // global_size to 0 before iterating, so a non-zero value means a repeat.
      if (info->global_size != 0) {
        printErr("Found HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED twice!");
        return HSA_STATUS_ERROR;
      }
      size_t size = 0;
      err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SIZE, &size);
      RET_IF_HSA_ERR(err);
      info->global_size = size;
    }
  }
  return HSA_STATUS_SUCCESS;
 }
 hsa_status_t agent_callback(hsa_agent_t agent, void *data) {
  struct agent_info* info = reinterpret_cast<struct agent_info *>(data);
@@ -63,6 +105,14 @@ hsa_status_t agent_callback(hsa_agent_t agent, void *data) {
    err = hsa_agent_get_info(agent, (hsa_agent_info_t) HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &info->compute_unit);
    RET_IF_HSA_ERR(err);
    // According to the documentation, this is deprecated. But what should I be using then?
    err = hsa_agent_get_info(agent, (hsa_agent_info_t) HSA_AMD_AGENT_INFO_MEMORY_WIDTH, &info->bus_width);
    RET_IF_HSA_ERR(err);
    info->global_size = 0;
    err = hsa_amd_agent_iterate_memory_pools(agent, memory_pool_callback, data);
    RET_IF_HSA_ERR(err);
  }
  return HSA_STATUS_SUCCESS;
@@ -76,7 +126,17 @@ struct topology_h* get_topology_info(struct agent_info info) {
  return topo;
 }
-struct gpu_info* get_gpu_info_hsa(struct pci_dev *devices, int gpu_idx) {
+struct memory* get_memory_info(struct gpu_info* gpu, struct agent_info info) {
  // Builds the memory descriptor of an HSA device from the values previously
  // gathered in `info`. Returns a newly allocated struct memory.
  // Only bus_width, lds_size and size_bytes are assigned here; other fields
  // of struct memory (e.g. freq, clk_mul) are not set by this function.
  // NOTE(review): the `gpu` parameter is not used in this body.
  struct memory* mem = (struct memory*) emalloc(sizeof(struct memory));
  // The agent_info fields are unsigned 32-bit while the struct memory fields
  // bus_width and lds_size are int32_t, so these assignments convert signedness.
  mem->bus_width = info.bus_width;
  mem->lds_size = info.lds_size;
  mem->size_bytes = info.global_size;
  return mem;
 }
 struct gpu_info* get_gpu_info_hsa(int gpu_idx) {
  struct gpu_info* gpu = (struct gpu_info*) emalloc(sizeof(struct gpu_info));
  gpu->pci = NULL;
  gpu->idx = gpu_idx;
@@ -119,6 +179,7 @@ struct gpu_info* get_gpu_info_hsa(struct pci_dev *devices, int gpu_idx) {
  gpu->name = (char *) emalloc(sizeof(char) * (strlen(info.device_mkt_name) + 1));
  strcpy(gpu->name, info.device_mkt_name);
  gpu->arch = get_uarch_from_hsa(gpu, info.gpu_name);
  gpu->mem = get_memory_info(gpu, info);
  if (gpu->arch == NULL) {
    return NULL;
--- a/src/hsa/hsa.hpp
+++ b/src/hsa/hsa.hpp
@@ -3,7 +3,7 @@
 #include "../common/gpu.hpp"
-struct gpu_info* get_gpu_info_hsa(struct pci_dev *devices, int gpu_idx);
+struct gpu_info* get_gpu_info_hsa(int gpu_idx);
 char* get_str_cu(struct gpu_info* gpu);
 #endif
--- a/src/hsa/uarch.cpp
+++ b/src/hsa/uarch.cpp
@@ -127,7 +127,7 @@ enum {
 #define CHECK_UARCH_START if (false) {}
 #define CHECK_UARCH(arch, chip_, str, uarch, process) \
   else if (arch->chip == chip_) fill_uarch(arch, str, uarch, process);
-#define CHECK_UARCH_END else { if(arch->chip != CHIP_UNKNOWN_CUDA) printBug("map_chip_to_uarch_hsa: Unknown chip id: %d", arch->chip); fill_uarch(arch, STRING_UNKNOWN, UARCH_UNKNOWN, UNK); }
+#define CHECK_UARCH_END else { if(arch->chip != CHIP_UNKNOWN_HSA) printBug("map_chip_to_uarch_hsa: Unknown chip id: %d", arch->chip); fill_uarch(arch, STRING_UNKNOWN, UARCH_UNKNOWN, UNK); }
 void fill_uarch(struct uarch* arch, char const *str, MICROARCH u, uint32_t process) {
  arch->chip_str = (char *) emalloc(sizeof(char) * (strlen(str)+1));
Author	SHA1	Message	Date
Dr-Noob	e0c843274c	Cleaning memory_pool_callback	2025-10-23 21:19:27 +02:00
Dr-Noob	b543b23f60	Fixes	2025-10-23 21:12:52 +02:00
Dr-Noob	9b519828f4	I guess we can rely on HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED	2025-10-23 21:11:25 +02:00
Dr-Noob	3b567b9381	Ok so this is not reporting the actual size, Ill need to investigate why	2025-10-17 09:05:13 +02:00
Dr-Noob	e1f03c4e04	Print global memory size	2025-10-17 08:48:15 +02:00
Dr-Noob	046f8c1299	Fixes	2025-10-17 08:41:21 +02:00
Dr-Noob	b434fc6fd0	Printer support	2025-10-17 08:38:37 +02:00
Dr-Noob	5beccaebb0	Adding more info	2025-10-17 08:31:26 +02:00
Dr-Noob	82ea16fc3d	[v0.30] Fix warning in printer	2025-10-16 20:01:14 +02:00
Dr-Noob	6589de9717	[v0.30] Reorganize attributes in printer and add CUs attr for AMD	2025-10-16 19:53:48 +02:00
Dr-Noob	0950b97393	[v0.30] Build pciutils only if neccesary If only HSA is enabled we dont need pciutils since AMD detection does not rely on it. Therefore we change CMakeLists.txt to build pciutils only if required. This commit has some side-effects: 1. We now don't build Intel backend by default. In other words, no backend is built by default, the user must specify which backend to use. 2. There were some issues with includes and wrongly used defines and variables. This commit fixes all that.	2025-10-16 08:26:42 +02:00
Dr-Noob	8794cd322d	[v0.30] Add support for building on AMD where rocm-cmake is not installed	2025-10-16 07:24:45 +02:00
Dr-Noob	5df85aea2c	[v0.30] Add uarch detection to AMD GPUs Similarly to NVIDIA and Intel GPUs, we now detect microarchitecture, also with manufacturing process and specific chip name. We infer all of this from the gfx name (in the code we use the term llvm_target), altough it's not clear yet that this method is completely reliable (see comments for more details). In the future we might want to replace that with a better way. Once we have the gfx name, we should be able to infer the specific chip, and from the chip we can easily infer the microarchitecture. This commit also includes some refactorings and code improvements on the HSA backend.	2025-10-15 08:23:28 +02:00