Compare commits
9 commits: amd-suppor ... master

Commits in this comparison (SHA1):
0f416b2da9
5f619dc95a
98bb02e203
78d34e71f1
82ea16fc3d
6589de9717
0950b97393
8794cd322d
5df85aea2c
CMakeLists.txt (109 changed lines)
@@ -10,9 +10,10 @@ set(CUDA_DIR "${SRC_DIR}/cuda")
|
||||
set(HSA_DIR "${SRC_DIR}/hsa")
|
||||
set(INTEL_DIR "${SRC_DIR}/intel")
|
||||
|
||||
# Enable Intel backend by default
|
||||
if(NOT DEFINED ENABLE_INTEL_BACKEND)
|
||||
set(ENABLE_INTEL_BACKEND true)
|
||||
# Make sure that at least one backend is enabled.
|
||||
# It does not make sense that the user has not specified any backend.
|
||||
if(NOT ENABLE_INTEL_BACKEND AND NOT ENABLE_CUDA_BACKEND AND NOT ENABLE_HSA_BACKEND)
|
||||
message(FATAL_ERROR "No backend was enabled! Please enable at least one backend with -DENABLE_XXX_BACKEND")
|
||||
endif()
|
||||
|
||||
if(ENABLE_CUDA_BACKEND)
|
||||
@@ -27,8 +28,7 @@ if(ENABLE_CUDA_BACKEND)
|
||||
endif()
|
||||
|
||||
if(ENABLE_HSA_BACKEND)
|
||||
# TODO: Needs rocm-cmake; what if it's not installed?
|
||||
find_package(ROCmCMakeBuildTools)
|
||||
find_package(ROCmCMakeBuildTools QUIET)
|
||||
if (ROCmCMakeBuildTools_FOUND)
|
||||
find_package(hsa-runtime64 1.0 REQUIRED)
|
||||
link_directories(hsa_backend hsa-runtime64::hsa-runtime64)
|
||||
@@ -49,40 +49,81 @@ if(ENABLE_HSA_BACKEND)
|
||||
set(ENABLE_HSA_BACKEND false)
|
||||
endif()
|
||||
else()
|
||||
set(ENABLE_HSA_BACKEND false)
|
||||
message(STATUS "${BoldYellow}ROCm not found${ColorReset}")
|
||||
# rocm-cmake is not installed, try to manually find the necessary files.
|
||||
message(STATUS "${BoldYellow}Could NOT find HSA automatically, running manual search...${ColorReset}")
|
||||
if (NOT DEFINED ROCM_PATH)
|
||||
set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to ROCm")
|
||||
endif()
|
||||
|
||||
find_path(HSA_INCLUDE_DIR hsa/hsa.h HINTS ${ROCM_PATH}/include)
|
||||
find_library(HSA_LIBRARY hsa-runtime64 HINTS ${ROCM_PATH}/lib ${ROCM_PATH}/lib64)
|
||||
|
||||
if (HSA_INCLUDE_DIR AND HSA_LIBRARY)
|
||||
message(STATUS "${BoldYellow}HSA was found manually${ColorReset}")
|
||||
else()
|
||||
set(ENABLE_HSA_BACKEND false)
|
||||
message(STATUS "${BoldYellow}HSA was not found manually${ColorReset}")
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
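With a stock ROCm installation, this manual fallback would typically resolve HSA_INCLUDE_DIR to /opt/rocm/include (which contains hsa/hsa.h) and HSA_LIBRARY to libhsa-runtime64 under /opt/rocm/lib or /opt/rocm/lib64; a non-standard install can be pointed to by passing -DROCM_PATH=/path/to/rocm at configure time. These paths are the usual ROCm defaults implied by the HINTS above, not something the patch itself guarantees.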
|
||||
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake")
|
||||
find_package(PCIUTILS)
|
||||
if(NOT ${PCIUTILS_FOUND})
|
||||
message(STATUS "${BoldYellow}pciutils not found, downloading and building a local copy...${ColorReset}")
|
||||
set(GPUFECH_COMMON
|
||||
${COMMON_DIR}/main.cpp
|
||||
${COMMON_DIR}/args.cpp
|
||||
${COMMON_DIR}/gpu.cpp
|
||||
${COMMON_DIR}/global.cpp
|
||||
${COMMON_DIR}/printer.cpp
|
||||
${COMMON_DIR}/master.cpp
|
||||
${COMMON_DIR}/uarch.cpp
|
||||
)
|
||||
|
||||
# Download and build pciutils
|
||||
set(PCIUTILS_INSTALL_LOCATION ${CMAKE_BINARY_DIR}/pciutils-install)
|
||||
ExternalProject_Add(pciutils
|
||||
GIT_REPOSITORY https://github.com/pciutils/pciutils
|
||||
CONFIGURE_COMMAND ""
|
||||
BUILD_COMMAND make SHARED=no HWDB=no
|
||||
BUILD_IN_SOURCE true
|
||||
INSTALL_COMMAND make PREFIX=${PCIUTILS_INSTALL_LOCATION} install-lib
|
||||
)
|
||||
set(GPUFETCH_LINK_TARGETS z)
|
||||
|
||||
include_directories(${PCIUTILS_INSTALL_LOCATION}/include)
|
||||
link_directories(${PCIUTILS_INSTALL_LOCATION}/lib)
|
||||
else()
|
||||
include_directories(${PCIUTILS_INCLUDE_DIR})
|
||||
link_libraries(${PCIUTILS_LIBRARIES})
|
||||
# Needed for linking libpci in FreeBSD
|
||||
link_directories(/usr/local/lib/)
|
||||
if(NOT(ENABLE_HSA_BACKEND AND NOT ENABLE_CUDA_BACKEND AND NOT ENABLE_INTEL_BACKEND))
|
||||
# Look for pciutils only if not building HSA only.
|
||||
#
|
||||
# This has the (intended) secondary effect that if only the HSA backend is enabled
|
||||
# by the user, but ROCm cannot be found, pciutils will still be compiled in
|
||||
# order to show the list of GPUs available on the system, so that the user will
|
||||
# get at least some feedback even if HSA is not found.
|
||||
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake")
|
||||
list(APPEND GPUFECH_COMMON ${COMMON_DIR}/pci.cpp ${COMMON_DIR}/sort.cpp)
|
||||
list(APPEND GPUFETCH_LINK_TARGETS pci)
|
||||
set(CMAKE_ENABLE_PCIUTILS ON)
|
||||
|
||||
find_package(PCIUTILS)
|
||||
if(NOT ${PCIUTILS_FOUND})
|
||||
message(STATUS "${BoldYellow}pciutils not found, downloading and building a local copy...${ColorReset}")
|
||||
|
||||
# Download and build pciutils
|
||||
set(PCIUTILS_INSTALL_LOCATION ${CMAKE_BINARY_DIR}/pciutils-install)
|
||||
ExternalProject_Add(pciutils
|
||||
GIT_REPOSITORY https://github.com/pciutils/pciutils
|
||||
CONFIGURE_COMMAND ""
|
||||
BUILD_COMMAND make SHARED=no HWDB=no
|
||||
BUILD_IN_SOURCE true
|
||||
INSTALL_COMMAND make PREFIX=${PCIUTILS_INSTALL_LOCATION} install-lib
|
||||
)
|
||||
|
||||
include_directories(${PCIUTILS_INSTALL_LOCATION}/include)
|
||||
link_directories(${PCIUTILS_INSTALL_LOCATION}/lib)
|
||||
else()
|
||||
include_directories(${PCIUTILS_INCLUDE_DIR})
|
||||
link_libraries(${PCIUTILS_LIBRARIES})
|
||||
# Needed for linking libpci in FreeBSD
|
||||
link_directories(/usr/local/lib/)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
add_executable(gpufetch ${COMMON_DIR}/main.cpp ${COMMON_DIR}/args.cpp ${COMMON_DIR}/gpu.cpp ${COMMON_DIR}/pci.cpp ${COMMON_DIR}/sort.cpp ${COMMON_DIR}/global.cpp ${COMMON_DIR}/printer.cpp ${COMMON_DIR}/master.cpp ${COMMON_DIR}/uarch.cpp)
|
||||
add_executable(gpufetch ${GPUFECH_COMMON})
|
||||
set(SANITY_FLAGS -Wfloat-equal -Wshadow -Wpointer-arith -Wall -Wextra -pedantic -fstack-protector-all -pedantic)
|
||||
target_compile_features(gpufetch PRIVATE cxx_std_11)
|
||||
target_compile_options(gpufetch PRIVATE ${SANITY_FLAGS})
|
||||
|
||||
if (CMAKE_ENABLE_PCIUTILS)
|
||||
target_compile_definitions(gpufetch PUBLIC BACKEND_USE_PCI)
|
||||
endif()
|
||||
|
||||
if(ENABLE_INTEL_BACKEND)
|
||||
target_compile_definitions(gpufetch PUBLIC BACKEND_INTEL)
|
||||
|
||||
@@ -127,20 +168,24 @@ endif()
|
||||
if(ENABLE_HSA_BACKEND)
|
||||
target_compile_definitions(gpufetch PUBLIC BACKEND_HSA)
|
||||
|
||||
add_library(hsa_backend STATIC ${HSA_DIR}/hsa.cpp)
|
||||
add_library(hsa_backend STATIC ${HSA_DIR}/hsa.cpp ${HSA_DIR}/uarch.cpp)
|
||||
|
||||
if(NOT ${PCIUTILS_FOUND})
|
||||
add_dependencies(hsa_backend pciutils)
|
||||
endif()
|
||||
|
||||
target_include_directories(hsa_backend PRIVATE "${HSA_INCLUDE_DIR}")
|
||||
message(STATUS "Found HSA: ${HSA_INCLUDE_DIR}")
|
||||
|
||||
target_link_libraries(hsa_backend PRIVATE hsa-runtime64::hsa-runtime64)
|
||||
if (HSA_LIBRARY)
|
||||
target_link_libraries(hsa_backend PRIVATE ${HSA_LIBRARY})
|
||||
else()
|
||||
target_link_libraries(hsa_backend PRIVATE hsa-runtime64::hsa-runtime64)
|
||||
endif()
|
||||
|
||||
target_link_libraries(gpufetch hsa_backend)
|
||||
endif()
|
||||
|
||||
target_link_libraries(gpufetch pci z)
|
||||
target_link_libraries(gpufetch ${GPUFETCH_LINK_TARGETS})
|
||||
install(TARGETS gpufetch DESTINATION bin)
|
||||
|
||||
if(NOT WIN32)
|
||||
|
||||
build.sh (93 changed lines)
@@ -1,5 +1,24 @@
|
||||
#!/bin/bash
|
||||
|
||||
print_help() {
|
||||
cat << EOF
|
||||
Usage: $0 <backends> [build_type]
|
||||
|
||||
<backends> MANDATORY. Comma-separated list of
|
||||
backends to enable.
|
||||
Valid options: hsa, intel, cuda
|
||||
Example: hsa,cuda
|
||||
|
||||
[build_type] OPTIONAL. Build type. Valid options:
|
||||
debug, release (default: release)
|
||||
|
||||
Examples:
|
||||
$0 hsa,intel debug
|
||||
$0 cuda
|
||||
$0 hsa,intel,cuda release
|
||||
EOF
|
||||
}
|
||||
|
||||
# gpufetch build script
|
||||
set -e
|
||||
|
||||
@@ -7,19 +26,79 @@ rm -rf build/ gpufetch
|
||||
mkdir build/
|
||||
cd build/
|
||||
|
||||
if [ "$1" == "debug" ]
|
||||
if [ "$1" == "--help" ]
|
||||
then
|
||||
BUILD_TYPE="Debug"
|
||||
else
|
||||
BUILD_TYPE="Release"
|
||||
echo "gpufetch build script"
|
||||
echo
|
||||
print_help
|
||||
exit 0
|
||||
fi
|
||||
|
||||
if [[ $# -lt 1 ]]; then
|
||||
echo "ERROR: At least one backend must be specified."
|
||||
echo
|
||||
print_help
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Determine if last argument is build type
|
||||
LAST_ARG="${!#}"
|
||||
if [[ "$LAST_ARG" == "debug" || "$LAST_ARG" == "release" ]]; then
|
||||
BUILD_TYPE="$LAST_ARG"
|
||||
BACKEND_ARG="${1}"
|
||||
else
|
||||
BUILD_TYPE="release"
|
||||
BACKEND_ARG="${1}"
|
||||
fi
|
||||
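For example, ./build.sh hsa,intel debug makes the ${!#} indirect expansion yield LAST_ARG=debug, so BUILD_TYPE becomes debug and BACKEND_ARG keeps hsa,intel; with ./build.sh cuda the last argument is not a recognised build type, so the script falls back to the release default.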
|
||||
# Split comma-separated backends into an array
|
||||
IFS=',' read -r -a BACKENDS <<< "$BACKEND_ARG"
|
||||
|
||||
# Validate build type
|
||||
if [[ "$BUILD_TYPE" != "debug" && "$BUILD_TYPE" != "release" ]]
|
||||
then
|
||||
echo "Error: Invalid build type '$BUILD_TYPE'."
|
||||
echo "Valid options are: debug, release"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# From lower to upper case
|
||||
CMAKE_FLAGS="-DCMAKE_BUILD_TYPE=${BUILD_TYPE^}"
|
||||
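The ${BUILD_TYPE^} expansion (bash 4+) upper-cases only the first character, so release becomes Release and debug becomes Debug, matching the conventional values passed to CMAKE_BUILD_TYPE.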
|
||||
# Validate backends
|
||||
VALID_BACKENDS=("hsa" "intel" "cuda")
|
||||
|
||||
for BACKEND in "${BACKENDS[@]}"; do
|
||||
case "$BACKEND" in
|
||||
hsa)
|
||||
CMAKE_FLAGS+=" -DENABLE_HSA_BACKEND=ON"
|
||||
;;
|
||||
intel)
|
||||
CMAKE_FLAGS+=" -DENABLE_INTEL_BACKEND=ON"
|
||||
;;
|
||||
cuda)
|
||||
CMAKE_FLAGS+=" -DENABLE_CUDA_BACKEND=ON"
|
||||
;;
|
||||
*)
|
||||
echo "ERROR: Invalid backend '$BACKEND'."
|
||||
echo "Valid options: ${VALID_BACKENDS[*]}"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
# You can also manually specify the compilation flags.
|
||||
# If you need to, just run the cmake command directly
|
||||
# instead of using this script.
|
||||
#
|
||||
# Here you will find some help:
|
||||
#
|
||||
# In case you have CUDA installed but it is not detected,
|
||||
# - set CMAKE_CUDA_COMPILER to your nvcc binary:
|
||||
# - set CMAKE_CUDA_COMPILER_TOOLKIT_ROOT to the CUDA root dir
|
||||
# for example:
|
||||
# cmake -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DCMAKE_CUDA_COMPILER_TOOLKIT_ROOT=/usr/local/cuda/ ..
|
||||
|
||||
#
|
||||
# In case you want to explicitly disable a backend, you can:
|
||||
# Disable CUDA backend:
|
||||
# cmake -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DENABLE_CUDA_BACKEND=OFF ..
|
||||
@@ -28,7 +107,9 @@ fi
|
||||
# Disable Intel backend:
|
||||
# cmake -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DENABLE_INTEL_BACKEND=OFF ..
|
||||
|
||||
cmake -DCMAKE_BUILD_TYPE=$BUILD_TYPE ..
|
||||
echo "$0: Running cmake $CMAKE_FLAGS"
|
||||
echo
|
||||
cmake $CMAKE_FLAGS ..
|
||||
|
||||
os=$(uname)
|
||||
if [ "$os" == 'Linux' ]; then
|
||||
|
||||
@@ -101,6 +101,17 @@ char* get_str_bus_width(struct gpu_info* gpu) {
|
||||
return string;
|
||||
}
|
||||
|
||||
char* get_str_lds_size(struct gpu_info* gpu) {
|
||||
// TODO: Show XX KB (XX MB Total) like in cpufetch
|
||||
uint32_t size = 3+1+3+1;
|
||||
assert(strlen(STRING_UNKNOWN)+1 <= size);
|
||||
char* string = (char *) ecalloc(size, sizeof(char));
|
||||
|
||||
sprintf(string, "%d KB", gpu->mem->lds_size / 1024);
|
||||
|
||||
return string;
|
||||
}
|
||||
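The 3+1+3+1 byte budget covers up to three digits, a space, the two-character KB suffix and the trailing NUL (with one byte to spare), so e.g. an LDS of 65536 bytes prints as "64 KB"; the assert additionally guarantees the buffer is large enough to hold STRING_UNKNOWN.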
|
||||
char* get_str_memory_clock(struct gpu_info* gpu) {
|
||||
return get_freq_as_str_mhz(gpu->mem->freq);
|
||||
}
|
||||
|
||||
@@ -3,8 +3,6 @@
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
#include "../cuda/pci.hpp"
|
||||
|
||||
#define UNKNOWN_FREQ -1
|
||||
|
||||
enum {
|
||||
@@ -48,6 +46,10 @@ struct topology_c {
|
||||
// HSA topology
|
||||
struct topology_h {
|
||||
int32_t compute_units;
|
||||
int32_t num_shader_engines;
|
||||
int32_t simds_per_cu;
|
||||
int32_t num_xcc;
|
||||
int32_t matrix_cores;
|
||||
};
|
||||
|
||||
// Intel topology
|
||||
@@ -63,6 +65,7 @@ struct memory {
|
||||
int32_t bus_width;
|
||||
int32_t freq;
|
||||
int32_t clk_mul; // clock multiplier
|
||||
int32_t lds_size; // HSA specific for now
|
||||
};
|
||||
|
||||
struct gpu_info {
|
||||
@@ -90,6 +93,7 @@ char* get_str_freq(struct gpu_info* gpu);
|
||||
char* get_str_memory_size(struct gpu_info* gpu);
|
||||
char* get_str_memory_type(struct gpu_info* gpu);
|
||||
char* get_str_bus_width(struct gpu_info* gpu);
|
||||
char* get_str_lds_size(struct gpu_info* gpu);
|
||||
char* get_str_memory_clock(struct gpu_info* gpu);
|
||||
char* get_str_l2(struct gpu_info* gpu);
|
||||
char* get_str_peak_performance(struct gpu_info* gpu);
|
||||
|
||||
@@ -8,6 +8,10 @@
|
||||
#include "../cuda/cuda.hpp"
|
||||
#include "../cuda/uarch.hpp"
|
||||
|
||||
#ifdef BACKEND_USE_PCI
|
||||
#include "pci.hpp"
|
||||
#endif
|
||||
|
||||
static const char* VERSION = "0.30";
|
||||
|
||||
void print_help(char *argv[]) {
|
||||
@@ -79,8 +83,12 @@ int main(int argc, char* argv[]) {
|
||||
}
|
||||
|
||||
if(get_num_gpus_available(list) == 0) {
|
||||
#ifdef BACKEND_USE_PCI
|
||||
printErr("No GPU was detected! Available GPUs are:");
|
||||
print_gpus_list_pci();
|
||||
#else
|
||||
printErr("No GPU was detected!");
|
||||
#endif
|
||||
printf("Please, make sure that the appropiate backend is enabled:\n");
|
||||
print_enabled_backends();
|
||||
printf("Visit https://github.com/Dr-Noob/gpufetch#2-backends for more information\n");
|
||||
|
||||
@@ -1,7 +1,10 @@
|
||||
#include <cstdlib>
|
||||
#include <cstdio>
|
||||
|
||||
#include "pci.hpp"
|
||||
#ifdef BACKEND_USE_PCI
|
||||
#include "pci.hpp"
|
||||
#endif
|
||||
|
||||
#include "global.hpp"
|
||||
#include "colors.hpp"
|
||||
#include "master.hpp"
|
||||
@@ -19,7 +22,9 @@ struct gpu_list {
|
||||
|
||||
struct gpu_list* get_gpu_list() {
|
||||
int idx = 0;
|
||||
#ifdef BACKEND_USE_PCI
|
||||
struct pci_dev *devices = get_pci_devices_from_pciutils();
|
||||
#endif
|
||||
struct gpu_list* list = (struct gpu_list*) malloc(sizeof(struct gpu_list));
|
||||
list->num_gpus = 0;
|
||||
list->gpus = (struct gpu_info**) malloc(sizeof(struct info*) * MAX_GPUS);
|
||||
@@ -40,7 +45,7 @@ struct gpu_list* get_gpu_list() {
|
||||
bool valid = true;
|
||||
|
||||
while(valid) {
|
||||
list->gpus[idx] = get_gpu_info_hsa(devices, idx);
|
||||
list->gpus[idx] = get_gpu_info_hsa(idx);
|
||||
if(list->gpus[idx] != NULL) idx++;
|
||||
else valid = false;
|
||||
}
|
||||
|
||||
@@ -11,6 +11,7 @@
|
||||
#include "../intel/uarch.hpp"
|
||||
#include "../intel/intel.hpp"
|
||||
#include "../hsa/hsa.hpp"
|
||||
#include "../hsa/uarch.hpp"
|
||||
#include "../cuda/cuda.hpp"
|
||||
#include "../cuda/uarch.hpp"
|
||||
|
||||
@@ -31,64 +32,60 @@
|
||||
#define MAX_ATTRIBUTES 100
|
||||
#define MAX_TERM_SIZE 1024
|
||||
|
||||
typedef struct {
|
||||
int id;
|
||||
const char *name;
|
||||
const char *shortname;
|
||||
} AttributeField;
|
||||
|
||||
// AttributeField IDs
|
||||
// Used by
|
||||
enum {
|
||||
ATTRIBUTE_NAME,
|
||||
ATTRIBUTE_CHIP,
|
||||
ATTRIBUTE_UARCH,
|
||||
ATTRIBUTE_TECHNOLOGY,
|
||||
ATTRIBUTE_GT,
|
||||
ATTRIBUTE_FREQUENCY,
|
||||
ATTRIBUTE_STREAMINGMP,
|
||||
ATTRIBUTE_CORESPERMP,
|
||||
ATTRIBUTE_CUDA_CORES,
|
||||
ATTRIBUTE_TENSOR_CORES,
|
||||
ATTRIBUTE_EUS,
|
||||
ATTRIBUTE_L2,
|
||||
ATTRIBUTE_MEMORY,
|
||||
ATTRIBUTE_MEMORY_FREQ,
|
||||
ATTRIBUTE_BUS_WIDTH,
|
||||
ATTRIBUTE_PEAK,
|
||||
ATTRIBUTE_PEAK_TENSOR,
|
||||
ATTRIBUTE_NAME, // ALL
|
||||
ATTRIBUTE_CHIP, // ALL
|
||||
ATTRIBUTE_UARCH, // ALL
|
||||
ATTRIBUTE_TECHNOLOGY, // ALL
|
||||
ATTRIBUTE_FREQUENCY, // ALL
|
||||
ATTRIBUTE_PEAK, // ALL
|
||||
ATTRIBUTE_COMPUTE_UNITS, // HSA
|
||||
ATTRIBUTE_MATRIX_CORES, // HSA
|
||||
ATTRIBUTE_XCDS, // HSA
|
||||
ATTRIBUTE_LDS_SIZE, // HSA
|
||||
ATTRIBUTE_STREAMINGMP, // CUDA
|
||||
ATTRIBUTE_CORESPERMP, // CUDA
|
||||
ATTRIBUTE_CUDA_CORES, // CUDA
|
||||
ATTRIBUTE_TENSOR_CORES, // CUDA
|
||||
ATTRIBUTE_L2, // CUDA
|
||||
ATTRIBUTE_MEMORY, // CUDA,HSA
|
||||
ATTRIBUTE_MEMORY_FREQ, // CUDA
|
||||
ATTRIBUTE_BUS_WIDTH, // CUDA,HSA
|
||||
ATTRIBUTE_PEAK_TENSOR, // CUDA
|
||||
ATTRIBUTE_EUS, // Intel
|
||||
ATTRIBUTE_GT, // Intel
|
||||
};
|
||||
|
||||
static const char* ATTRIBUTE_FIELDS [] = {
|
||||
"Name:",
|
||||
"GPU processor:",
|
||||
"Microarchitecture:",
|
||||
"Technology:",
|
||||
"Graphics Tier:",
|
||||
"Max Frequency:",
|
||||
"SMs:",
|
||||
"Cores/SM:",
|
||||
"CUDA Cores:",
|
||||
"Tensor Cores:",
|
||||
"Execution Units:",
|
||||
"L2 Size:",
|
||||
"Memory:",
|
||||
"Memory frequency:",
|
||||
"Bus width:",
|
||||
"Peak Performance:",
|
||||
"Peak Performance (MMA):",
|
||||
};
|
||||
|
||||
static const char* ATTRIBUTE_FIELDS_SHORT [] = {
|
||||
"Name:",
|
||||
"Processor:",
|
||||
"uArch:",
|
||||
"Technology:",
|
||||
"GT:",
|
||||
"Max Freq.:",
|
||||
"SMs:",
|
||||
"Cores/SM:",
|
||||
"CUDA Cores:",
|
||||
"Tensor Cores:",
|
||||
"EUs:",
|
||||
"L2 Size:",
|
||||
"Memory:",
|
||||
"Memory freq.:",
|
||||
"Bus width:",
|
||||
"Peak Perf.:",
|
||||
"Peak Perf.(MMA):",
|
||||
static const AttributeField ATTRIBUTE_INFO[] = {
|
||||
{ ATTRIBUTE_NAME, "Name:", "Name:" },
|
||||
{ ATTRIBUTE_CHIP, "GPU processor:", "Processor:" },
|
||||
{ ATTRIBUTE_UARCH, "Microarchitecture:", "uArch:" },
|
||||
{ ATTRIBUTE_TECHNOLOGY, "Technology:", "Technology:" },
|
||||
{ ATTRIBUTE_FREQUENCY, "Max Frequency:", "Max Freq.:" },
|
||||
{ ATTRIBUTE_PEAK, "Peak Performance:", "Peak Perf.:" },
|
||||
{ ATTRIBUTE_COMPUTE_UNITS, "Compute Units (CUs):", "CUs" },
|
||||
{ ATTRIBUTE_MATRIX_CORES, "Matrix Cores:", "Matrix Cores:" },
|
||||
{ ATTRIBUTE_XCDS, "XCDs:", "XCDs" },
|
||||
{ ATTRIBUTE_LDS_SIZE, "LDS size:", "LDS:" },
|
||||
{ ATTRIBUTE_STREAMINGMP, "SMs:", "SMs:" },
|
||||
{ ATTRIBUTE_CORESPERMP, "Cores/SM:", "Cores/SM:" },
|
||||
{ ATTRIBUTE_CUDA_CORES, "CUDA Cores:", "CUDA Cores:" },
|
||||
{ ATTRIBUTE_TENSOR_CORES, "Tensor Cores:", "Tensor Cores:" },
|
||||
{ ATTRIBUTE_L2, "L2 Size:", "L2 Size:" },
|
||||
{ ATTRIBUTE_MEMORY, "Memory:", "Memory:" },
|
||||
{ ATTRIBUTE_MEMORY_FREQ, "Memory frequency:", "Memory freq.:" },
|
||||
{ ATTRIBUTE_BUS_WIDTH, "Bus width:", "Bus width:" },
|
||||
{ ATTRIBUTE_PEAK_TENSOR, "Peak Performance (MMA):", "Peak Perf.(MMA):" },
|
||||
{ ATTRIBUTE_EUS, "Execution Units:", "EUs:" },
|
||||
{ ATTRIBUTE_GT, "Graphics Tier:", "GT:" },
|
||||
};
|
||||
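Because longest_attribute_length and print_ascii_generic index ATTRIBUTE_INFO directly with the attribute type, each row has to sit at the index equal to its id, i.e. in the same order as the enum above. A minimal sanity check one could add during development (a sketch, not part of this patch):

static_assert(sizeof(ATTRIBUTE_INFO) / sizeof(ATTRIBUTE_INFO[0]) == ATTRIBUTE_GT + 1,
              "ATTRIBUTE_INFO must have one row per attribute id");
// And, while debugging, verify each row's id matches its position in the table:
// for (size_t i = 0; i < sizeof(ATTRIBUTE_INFO) / sizeof(ATTRIBUTE_INFO[0]); i++)
//   assert(ATTRIBUTE_INFO[i].id == (int) i);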
|
||||
struct terminal {
|
||||
@@ -206,8 +203,6 @@ bool ascii_fits_screen(int termw, struct ascii_logo logo, int lf) {
|
||||
void replace_bgbyfg_color(struct ascii_logo* logo) {
|
||||
// Replace background by foreground color
|
||||
for(int i=0; i < 2; i++) {
|
||||
if(logo->color_ascii[i] == NULL) break;
|
||||
|
||||
if(strcmp(logo->color_ascii[i], C_BG_BLACK) == 0) strcpy(logo->color_ascii[i], C_FG_BLACK);
|
||||
else if(strcmp(logo->color_ascii[i], C_BG_RED) == 0) strcpy(logo->color_ascii[i], C_FG_RED);
|
||||
else if(strcmp(logo->color_ascii[i], C_BG_GREEN) == 0) strcpy(logo->color_ascii[i], C_FG_GREEN);
|
||||
@@ -275,13 +270,14 @@ void choose_ascii_art(struct ascii* art, struct color** cs, struct terminal* ter
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t longest_attribute_length(struct ascii* art, const char** attribute_fields) {
|
||||
uint32_t longest_attribute_length(struct ascii* art, bool use_short) {
|
||||
uint32_t max = 0;
|
||||
uint64_t len = 0;
|
||||
|
||||
for(uint32_t i=0; i < art->n_attributes_set; i++) {
|
||||
if(art->attributes[i]->value != NULL) {
|
||||
len = strlen(attribute_fields[art->attributes[i]->type]);
|
||||
const char* str = use_short ? ATTRIBUTE_INFO[art->attributes[i]->type].shortname : ATTRIBUTE_INFO[art->attributes[i]->type].name;
|
||||
len = strlen(str);
|
||||
if(len > max) max = len;
|
||||
}
|
||||
}
|
||||
@@ -305,7 +301,7 @@ uint32_t longest_field_length(struct ascii* art, int la) {
|
||||
return max;
|
||||
}
|
||||
|
||||
void print_ascii_generic(struct ascii* art, uint32_t la, int32_t text_space, const char** attribute_fields) {
|
||||
void print_ascii_generic(struct ascii* art, uint32_t la, int32_t text_space, bool use_short) {
|
||||
struct ascii_logo* logo = art->art;
|
||||
int attr_to_print = 0;
|
||||
int attr_type;
|
||||
@@ -349,11 +345,13 @@ void print_ascii_generic(struct ascii* art, uint32_t la, int32_t text_space, con
|
||||
attr_value = art->attributes[attr_to_print]->value;
|
||||
attr_to_print++;
|
||||
|
||||
space_right = 1 + (la - strlen(attribute_fields[attr_type]));
|
||||
const char* attr_str = use_short ? ATTRIBUTE_INFO[attr_type].shortname : ATTRIBUTE_INFO[attr_type].name;
|
||||
|
||||
space_right = 1 + (la - strlen(attr_str));
|
||||
current_space = max(0, text_space);
|
||||
|
||||
printf("%s%.*s%s", logo->color_text[0], current_space, attribute_fields[attr_type], art->reset);
|
||||
current_space = max(0, current_space - (int) strlen(attribute_fields[attr_type]));
|
||||
printf("%s%.*s%s", logo->color_text[0], current_space, attr_str, art->reset);
|
||||
current_space = max(0, current_space - (int) strlen(attr_str));
|
||||
printf("%*s", min(current_space, space_right), "");
|
||||
current_space = max(0, current_space - min(current_space, space_right));
|
||||
printf("%s%.*s%s", logo->color_text[1], current_space, attr_value, art->reset);
|
||||
@@ -387,19 +385,19 @@ bool print_gpufetch_intel(struct gpu_info* gpu, STYLE s, struct color** cs, stru
|
||||
setAttribute(art, ATTRIBUTE_EUS, eus);
|
||||
setAttribute(art, ATTRIBUTE_PEAK, pp);
|
||||
|
||||
const char** attribute_fields = ATTRIBUTE_FIELDS;
|
||||
uint32_t longest_attribute = longest_attribute_length(art, attribute_fields);
|
||||
bool use_short = false;
|
||||
uint32_t longest_attribute = longest_attribute_length(art, use_short);
|
||||
uint32_t longest_field = longest_field_length(art, longest_attribute);
|
||||
choose_ascii_art(art, cs, term, longest_field);
|
||||
|
||||
if(!ascii_fits_screen(term->w, *art->art, longest_field)) {
|
||||
// Despite choosing the smallest logo, the output does not fit
|
||||
// Choose the shorter field names and recalculate the longest attr
|
||||
attribute_fields = ATTRIBUTE_FIELDS_SHORT;
|
||||
longest_attribute = longest_attribute_length(art, attribute_fields);
|
||||
use_short = true;
|
||||
longest_attribute = longest_attribute_length(art, use_short);
|
||||
}
|
||||
|
||||
print_ascii_generic(art, longest_attribute, term->w - art->art->width, attribute_fields);
|
||||
print_ascii_generic(art, longest_attribute, term->w - art->art->width, use_short);
|
||||
|
||||
return true;
|
||||
}
|
||||
@@ -456,19 +454,19 @@ bool print_gpufetch_cuda(struct gpu_info* gpu, STYLE s, struct color** cs, struc
|
||||
setAttribute(art, ATTRIBUTE_PEAK_TENSOR, pp_tensor);
|
||||
}
|
||||
|
||||
const char** attribute_fields = ATTRIBUTE_FIELDS;
|
||||
uint32_t longest_attribute = longest_attribute_length(art, attribute_fields);
|
||||
bool use_short = false;
|
||||
uint32_t longest_attribute = longest_attribute_length(art, use_short);
|
||||
uint32_t longest_field = longest_field_length(art, longest_attribute);
|
||||
choose_ascii_art(art, cs, term, longest_field);
|
||||
|
||||
if(!ascii_fits_screen(term->w, *art->art, longest_field)) {
|
||||
// Despite choosing the smallest logo, the output does not fit
|
||||
// Choose the shorter field names and recalculate the longest attr
|
||||
attribute_fields = ATTRIBUTE_FIELDS_SHORT;
|
||||
longest_attribute = longest_attribute_length(art, attribute_fields);
|
||||
use_short = true;
|
||||
longest_attribute = longest_attribute_length(art, use_short);
|
||||
}
|
||||
|
||||
print_ascii_generic(art, longest_attribute, term->w - art->art->width, attribute_fields);
|
||||
print_ascii_generic(art, longest_attribute, term->w - art->art->width, use_short);
|
||||
|
||||
free(manufacturing_process);
|
||||
free(max_frequency);
|
||||
@@ -490,26 +488,46 @@ bool print_gpufetch_amd(struct gpu_info* gpu, STYLE s, struct color** cs, struct
|
||||
return false;
|
||||
|
||||
char* gpu_name = get_str_gpu_name(gpu);
|
||||
char* sms = get_str_cu(gpu);
|
||||
char* gpu_chip = get_str_chip(gpu->arch);
|
||||
char* uarch = get_str_uarch_hsa(gpu->arch);
|
||||
char* manufacturing_process = get_str_process(gpu->arch);
|
||||
char* cus = get_str_cu(gpu);
|
||||
char* matrix_cores = get_str_matrix_cores(gpu);
|
||||
char* xcds = get_str_xcds(gpu);
|
||||
char* max_frequency = get_str_freq(gpu);
|
||||
char* bus_width = get_str_bus_width(gpu);
|
||||
char* mem_size = get_str_memory_size(gpu);
|
||||
char* lds_size = get_str_lds_size(gpu);
|
||||
|
||||
setAttribute(art, ATTRIBUTE_NAME, gpu_name);
|
||||
if (gpu_chip != NULL) {
|
||||
setAttribute(art, ATTRIBUTE_CHIP, gpu_chip);
|
||||
}
|
||||
setAttribute(art, ATTRIBUTE_UARCH, uarch);
|
||||
setAttribute(art, ATTRIBUTE_TECHNOLOGY, manufacturing_process);
|
||||
setAttribute(art, ATTRIBUTE_FREQUENCY, max_frequency);
|
||||
setAttribute(art, ATTRIBUTE_STREAMINGMP, sms);
|
||||
setAttribute(art, ATTRIBUTE_COMPUTE_UNITS, cus);
|
||||
setAttribute(art, ATTRIBUTE_MATRIX_CORES, matrix_cores);
|
||||
if (xcds != NULL) {
|
||||
setAttribute(art, ATTRIBUTE_XCDS, xcds);
|
||||
}
|
||||
setAttribute(art, ATTRIBUTE_LDS_SIZE, lds_size);
|
||||
setAttribute(art, ATTRIBUTE_MEMORY, mem_size);
|
||||
setAttribute(art, ATTRIBUTE_BUS_WIDTH, bus_width);
|
||||
|
||||
const char** attribute_fields = ATTRIBUTE_FIELDS;
|
||||
uint32_t longest_attribute = longest_attribute_length(art, attribute_fields);
|
||||
bool use_short = false;
|
||||
uint32_t longest_attribute = longest_attribute_length(art, use_short);
|
||||
uint32_t longest_field = longest_field_length(art, longest_attribute);
|
||||
choose_ascii_art(art, cs, term, longest_field);
|
||||
|
||||
if(!ascii_fits_screen(term->w, *art->art, longest_field)) {
|
||||
// Despite choosing the smallest logo, the output does not fit
|
||||
// Choose the shorter field names and recalculate the longest attr
|
||||
attribute_fields = ATTRIBUTE_FIELDS_SHORT;
|
||||
longest_attribute = longest_attribute_length(art, attribute_fields);
|
||||
use_short = true;
|
||||
longest_attribute = longest_attribute_length(art, use_short);
|
||||
}
|
||||
|
||||
print_ascii_generic(art, longest_attribute, term->w - art->art->width, attribute_fields);
|
||||
print_ascii_generic(art, longest_attribute, term->w - art->art->width, use_short);
|
||||
|
||||
free(art->attributes);
|
||||
free(art);
|
||||
|
||||
@@ -16,6 +16,9 @@ struct uarch {
|
||||
int32_t cc_minor;
|
||||
int32_t compute_capability;
|
||||
|
||||
// HSA specific
|
||||
int32_t llvm_target;
|
||||
|
||||
// Intel specific
|
||||
int32_t gt;
|
||||
int32_t eu;
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
|
||||
// patched cuda.cpp for cuda13 by cloudy
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cstring>
|
||||
#include <cstdlib>
|
||||
@@ -5,8 +8,8 @@
|
||||
|
||||
#include "cuda.hpp"
|
||||
#include "uarch.hpp"
|
||||
#include "pci.hpp"
|
||||
#include "gpufetch_helper_cuda.hpp"
|
||||
#include "../common/pci.hpp"
|
||||
#include "../common/global.hpp"
|
||||
#include "../common/uarch.hpp"
|
||||
|
||||
@@ -14,29 +17,22 @@ bool print_gpu_cuda(struct gpu_info* gpu) {
|
||||
char* cc = get_str_cc(gpu->arch);
|
||||
printf("%s (Compute Capability %s)\n", gpu->name, cc);
|
||||
free(cc);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
struct cache* get_cache_info(cudaDeviceProp prop) {
|
||||
struct cache* cach = (struct cache*) emalloc(sizeof(struct cache));
|
||||
|
||||
cach->L2 = (struct cach*) emalloc(sizeof(struct cach));
|
||||
cach->L2->size = prop.l2CacheSize;
|
||||
cach->L2->num_caches = 1;
|
||||
cach->L2->exists = true;
|
||||
|
||||
return cach;
|
||||
}
|
||||
|
||||
int get_tensor_cores(struct uarch* arch, int sm, int major) {
|
||||
if(major == 7) {
|
||||
// TU116 does not have tensor cores!
|
||||
// https://www.anandtech.com/show/13973/nvidia-gtx-1660-ti-review-feat-evga-xc-gaming/2
|
||||
if(arch->chip == CHIP_TU116 || arch->chip == CHIP_TU116BM ||
|
||||
arch->chip == CHIP_TU116GL || arch->chip == CHIP_TU116M) {
|
||||
if (is_chip_TU116(arch))
|
||||
return 0;
|
||||
}
|
||||
return sm * 8;
|
||||
}
|
||||
else if(major == 8) return sm * 4;
|
||||
@@ -45,57 +41,57 @@ int get_tensor_cores(struct uarch* arch, int sm, int major) {
|
||||
|
||||
struct topology_c* get_topology_info(struct uarch* arch, cudaDeviceProp prop) {
|
||||
struct topology_c* topo = (struct topology_c*) emalloc(sizeof(struct topology_c));
|
||||
|
||||
topo->streaming_mp = prop.multiProcessorCount;
|
||||
topo->cores_per_mp = _ConvertSMVer2Cores(prop.major, prop.minor);
|
||||
topo->cuda_cores = topo->streaming_mp * topo->cores_per_mp;
|
||||
topo->tensor_cores = get_tensor_cores(arch, topo->streaming_mp, prop.major);
|
||||
|
||||
return topo;
|
||||
}
|
||||
|
||||
int32_t guess_clock_multipilier(struct gpu_info* gpu, struct memory* mem) {
|
||||
// Guess clock multiplier
|
||||
int32_t clk_mul = 1;
|
||||
|
||||
int32_t clk8 = abs((mem->freq/8) - gpu->freq);
|
||||
int32_t clk4 = abs((mem->freq/4) - gpu->freq);
|
||||
int32_t clk2 = abs((mem->freq/2) - gpu->freq);
|
||||
int32_t clk1 = abs((mem->freq/1) - gpu->freq);
|
||||
|
||||
int32_t min = mem->freq;
|
||||
if(clkm_possible_for_uarch(8, gpu->arch) && min > clk8) { clk_mul = 8; min = clk8; }
|
||||
if(clkm_possible_for_uarch(4, gpu->arch) && min > clk4) { clk_mul = 4; min = clk4; }
|
||||
if(clkm_possible_for_uarch(2, gpu->arch) && min > clk2) { clk_mul = 2; min = clk2; }
|
||||
if(clkm_possible_for_uarch(1, gpu->arch) && min > clk1) { clk_mul = 1; min = clk1; }
|
||||
|
||||
return clk_mul;
|
||||
}
|
||||
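As an illustration with made-up numbers: if the reported effective memory clock is 7000 MHz and the core clock is 1750 MHz, the candidate errors are |7000/8 - 1750| = 875, |7000/4 - 1750| = 0, |7000/2 - 1750| = 1750 and |7000 - 1750| = 5250, so (assuming clkm_possible_for_uarch allows all of them for the uarch) the function settles on a clock multiplier of 4; get_memory_info then divides the reported memory frequency by that multiplier to show the real clock.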
|
||||
struct memory* get_memory_info(struct gpu_info* gpu, cudaDeviceProp prop) {
|
||||
struct memory* mem = (struct memory*) emalloc(sizeof(struct memory));
|
||||
int val = 0;
|
||||
|
||||
mem->size_bytes = (unsigned long long) prop.totalGlobalMem;
|
||||
mem->freq = prop.memoryClockRate * 0.001f;
|
||||
|
||||
if (cudaDeviceGetAttribute(&val, cudaDevAttrMemoryClockRate, gpu->idx) == cudaSuccess) {
|
||||
if (val > 1000000)
|
||||
mem->freq = (float)val / 1000000.0f;
|
||||
else
|
||||
mem->freq = (float)val * 0.001f;
|
||||
} else {
|
||||
mem->freq = 0.0f;
|
||||
}
|
||||
|
||||
mem->bus_width = prop.memoryBusWidth;
|
||||
mem->clk_mul = guess_clock_multipilier(gpu, mem);
|
||||
mem->type = guess_memtype_from_cmul_and_uarch(mem->clk_mul, gpu->arch);
|
||||
|
||||
// Fix frequency returned from CUDA to show real frequency
|
||||
mem->freq = mem->freq / mem->clk_mul;
|
||||
if (mem->clk_mul > 0)
|
||||
mem->freq = mem->freq / mem->clk_mul;
|
||||
|
||||
return mem;
|
||||
}
|
||||
|
||||
// Compute peak performance when using CUDA cores
|
||||
int64_t get_peak_performance_cuda(struct gpu_info* gpu) {
|
||||
return gpu->freq * 1000000 * gpu->topo_c->cuda_cores * 2;
|
||||
}
|
||||
|
||||
// Compute peak performance when using tensor cores
|
||||
int64_t get_peak_performance_tcu(cudaDeviceProp prop, struct gpu_info* gpu) {
|
||||
// Volta / Turing tensor cores perform 4x4x4 FP16 matrix multiplication
|
||||
// Ampere tensor cores perform 8x4x8 FP16 matrix multiplication
|
||||
if(prop.major == 7) return gpu->freq * 1000000 * 4 * 4 * 4 * 2 * gpu->topo_c->tensor_cores;
|
||||
else if(prop.major == 8) return gpu->freq * 1000000 * 8 * 4 * 8 * 2 * gpu->topo_c->tensor_cores;
|
||||
else return 0;
|
||||
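As a worked example of this formula, a hypothetical Turing-class GPU (major == 7) with 272 tensor cores at 1455 MHz would report 1455e6 * (4*4*4) * 2 * 272 ≈ 5.07e13 FLOP/s, i.e. roughly 50.7 TFLOPS of FP16 MMA throughput; the factor of 2 counts each fused multiply-add as two floating-point operations, just as in get_peak_performance_cuda.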
@@ -117,8 +113,7 @@ struct gpu_info* get_gpu_info_cuda(struct pci_dev *devices, int gpu_idx) {
|
||||
}
|
||||
|
||||
int num_gpus = -1;
|
||||
cudaError_t err = cudaSuccess;
|
||||
err = cudaGetDeviceCount(&num_gpus);
|
||||
cudaError_t err = cudaGetDeviceCount(&num_gpus);
|
||||
|
||||
if(gpu_idx == 0) {
|
||||
printf("\r%*c\r", (int) strlen(CUDA_DRIVER_START_WARNING), ' ');
|
||||
@@ -136,7 +131,6 @@ struct gpu_info* get_gpu_info_cuda(struct pci_dev *devices, int gpu_idx) {
|
||||
}
|
||||
|
||||
if(gpu->idx+1 > num_gpus) {
|
||||
// Master is trying to query an invalid GPU
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@@ -146,15 +140,25 @@ struct gpu_info* get_gpu_info_cuda(struct pci_dev *devices, int gpu_idx) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
gpu->freq = deviceProp.clockRate * 1e-3f;
|
||||
int core_clk = 0;
|
||||
if (cudaDeviceGetAttribute(&core_clk, cudaDevAttrClockRate, gpu->idx) == cudaSuccess) {
|
||||
if (core_clk > 1000000)
|
||||
gpu->freq = core_clk / 1000000.0f;
|
||||
else
|
||||
gpu->freq = core_clk * 0.001f;
|
||||
} else {
|
||||
gpu->freq = 0.0f;
|
||||
}
|
||||
|
||||
gpu->vendor = GPU_VENDOR_NVIDIA;
|
||||
gpu->name = (char *) emalloc(sizeof(char) * (strlen(deviceProp.name) + 1));
|
||||
gpu->name = (char *) emalloc(strlen(deviceProp.name) + 1);
|
||||
strcpy(gpu->name, deviceProp.name);
|
||||
|
||||
if((gpu->pci = get_pci_from_pciutils(devices, PCI_VENDOR_ID_NVIDIA, gpu_idx)) == NULL) {
|
||||
printErr("Unable to find a valid device for vendor id 0x%.4X using pciutils", PCI_VENDOR_ID_NVIDIA);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
gpu->arch = get_uarch_from_cuda(gpu);
|
||||
gpu->cach = get_cache_info(deviceProp);
|
||||
gpu->mem = get_memory_info(gpu, deviceProp);
|
||||
@@ -165,19 +169,7 @@ struct gpu_info* get_gpu_info_cuda(struct pci_dev *devices, int gpu_idx) {
|
||||
return gpu;
|
||||
}
|
||||
|
||||
char* get_str_sm(struct gpu_info* gpu) {
|
||||
return get_str_generic(gpu->topo_c->streaming_mp);
|
||||
}
|
||||
|
||||
char* get_str_cores_sm(struct gpu_info* gpu) {
|
||||
return get_str_generic(gpu->topo_c->cores_per_mp);
|
||||
}
|
||||
|
||||
char* get_str_cuda_cores(struct gpu_info* gpu) {
|
||||
return get_str_generic(gpu->topo_c->cuda_cores);
|
||||
}
|
||||
|
||||
char* get_str_tensor_cores(struct gpu_info* gpu) {
|
||||
return get_str_generic(gpu->topo_c->tensor_cores);
|
||||
}
|
||||
|
||||
char* get_str_sm(struct gpu_info* gpu) { return get_str_generic(gpu->topo_c->streaming_mp); }
|
||||
char* get_str_cores_sm(struct gpu_info* gpu) { return get_str_generic(gpu->topo_c->cores_per_mp); }
|
||||
char* get_str_cuda_cores(struct gpu_info* gpu) { return get_str_generic(gpu->topo_c->cuda_cores); }
|
||||
char* get_str_tensor_cores(struct gpu_info* gpu) { return get_str_generic(gpu->topo_c->tensor_cores); }
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
#include "../common/uarch.hpp"
|
||||
#include "../common/global.hpp"
|
||||
#include "../common/gpu.hpp"
|
||||
#include "pci.hpp"
|
||||
#include "chips.hpp"
|
||||
|
||||
// Any clock multiplier
|
||||
@@ -361,3 +362,8 @@ void free_uarch_struct(struct uarch* arch) {
|
||||
free(arch->chip_str);
|
||||
free(arch);
|
||||
}
|
||||
|
||||
bool is_chip_TU116(struct uarch* arch) {
|
||||
return arch->chip == CHIP_TU116 || arch->chip == CHIP_TU116BM ||
|
||||
arch->chip == CHIP_TU116GL || arch->chip == CHIP_TU116M;
|
||||
}
|
||||
|
||||
@@ -13,5 +13,6 @@ char* get_str_cc(struct uarch* arch);
|
||||
char* get_str_chip(struct uarch* arch);
|
||||
char* get_str_process(struct uarch* arch);
|
||||
void free_uarch_struct(struct uarch* arch);
|
||||
bool is_chip_TU116(struct uarch* arch);
|
||||
|
||||
#endif
|
||||
|
||||
src/hsa/chips.hpp (new file, 37 lines)
@@ -0,0 +1,37 @@
|
||||
#ifndef __HSA_GPUCHIPS__
|
||||
#define __HSA_GPUCHIPS__
|
||||
|
||||
typedef uint32_t GPUCHIP;
|
||||
|
||||
enum {
|
||||
CHIP_UNKNOWN_HSA,
|
||||
// VEGA (TODO)
|
||||
// ...
|
||||
// RDNA
|
||||
CHIP_NAVI_10,
|
||||
CHIP_NAVI_12,
|
||||
CHIP_NAVI_14,
|
||||
// RDNA2
|
||||
// There are way more (eg Oberon)
|
||||
// Maybe we'll add them in the future.
|
||||
CHIP_NAVI_21,
|
||||
CHIP_NAVI_22,
|
||||
CHIP_NAVI_23,
|
||||
CHIP_NAVI_24,
|
||||
// RDNA3
|
||||
// There are way more as well.
|
||||
// Supporting Navi only for now.
|
||||
CHIP_NAVI_31,
|
||||
CHIP_NAVI_32,
|
||||
CHIP_NAVI_33,
|
||||
// RDNA4
|
||||
CHIP_NAVI_44,
|
||||
CHIP_NAVI_48,
|
||||
// CDNA
|
||||
CHIP_ARCTURUS, // MI100 series
|
||||
CHIP_ALDEBARAN, // MI200 series
|
||||
CHIP_AQUA_VANJARAM, // MI300 series
|
||||
CHIP_CDNA_NEXT // MI350 series
|
||||
};
|
||||
|
||||
#endif
|
||||
src/hsa/hsa.cpp (146 changed lines)
@@ -12,7 +12,7 @@
|
||||
#include <hsa/hsa_ext_amd.h>
|
||||
|
||||
#include "hsa.hpp"
|
||||
#include "../common/pci.hpp"
|
||||
#include "uarch.hpp"
|
||||
#include "../common/global.hpp"
|
||||
#include "../common/uarch.hpp"
|
||||
|
||||
@@ -22,7 +22,16 @@ struct agent_info {
|
||||
char vendor_name[64];
|
||||
char device_mkt_name[64];
|
||||
uint32_t max_clock_freq;
|
||||
// Memory
|
||||
uint32_t bus_width;
|
||||
uint32_t lds_size;
|
||||
uint64_t global_size;
|
||||
// Topology
|
||||
uint32_t compute_unit;
|
||||
uint32_t num_shader_engines;
|
||||
uint32_t simds_per_cu;
|
||||
uint32_t num_xcc; // Accelerator Complex Dies (XCDs)
|
||||
uint32_t matrix_cores; // Cores with WMMA/MFMA capabilities
|
||||
};
|
||||
|
||||
#define RET_IF_HSA_ERR(err) { \
|
||||
@@ -34,13 +43,57 @@ struct agent_info {
|
||||
snprintf(&(err_val[0]), sizeof(err_val), "%#x", (uint32_t)err); \
|
||||
err_str = &(err_val[0]); \
|
||||
} \
|
||||
printErr("HSA failure at: %s:%d\n", \
|
||||
__FILE__, __LINE__); \
|
||||
printErr("Call returned %s\n", err_str); \
|
||||
printErr("HSA failure at: %s:%d\n", __FILE__, __LINE__); \
|
||||
printErr("Call returned %s\n", err_str); \
|
||||
return (err); \
|
||||
} \
|
||||
}
|
||||
|
||||
hsa_status_t memory_pool_callback(hsa_amd_memory_pool_t pool, void* data) {
|
||||
struct agent_info* info = reinterpret_cast<struct agent_info *>(data);
|
||||
|
||||
hsa_amd_segment_t segment;
|
||||
hsa_status_t err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment);
|
||||
RET_IF_HSA_ERR(err);
|
||||
|
||||
if (segment == HSA_AMD_SEGMENT_GROUP) {
|
||||
// LDS memory
|
||||
// We want to make sure that this memory pool is not repeated.
|
||||
if (info->lds_size != 0) {
|
||||
printErr("Found HSA_AMD_SEGMENT_GROUP twice!");
|
||||
return HSA_STATUS_ERROR;
|
||||
}
|
||||
uint32_t size = 0;
|
||||
|
||||
err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SIZE, &size);
|
||||
RET_IF_HSA_ERR(err);
|
||||
|
||||
info->lds_size = size;
|
||||
}
|
||||
else if (segment == HSA_AMD_SEGMENT_GLOBAL) {
|
||||
// Global memory
|
||||
uint32_t global_flags = 0;
|
||||
|
||||
err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flags);
|
||||
RET_IF_HSA_ERR(err);
|
||||
|
||||
if (global_flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED) {
|
||||
if (info->global_size != 0) {
|
||||
printErr("Found HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED twice!");
|
||||
return HSA_STATUS_ERROR;
|
||||
}
|
||||
|
||||
uint64_t size = 0;
|
||||
|
||||
err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SIZE, &size);
|
||||
RET_IF_HSA_ERR(err);
|
||||
|
||||
info->global_size = size;
|
||||
}
|
||||
}
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
hsa_status_t agent_callback(hsa_agent_t agent, void *data) {
|
||||
struct agent_info* info = reinterpret_cast<struct agent_info *>(data);
|
||||
|
||||
@@ -52,7 +105,6 @@ hsa_status_t agent_callback(hsa_agent_t agent, void *data) {
|
||||
err = hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, info->gpu_name);
|
||||
RET_IF_HSA_ERR(err);
|
||||
|
||||
// TODO: What if vendor_name is not AMD?
|
||||
err = hsa_agent_get_info(agent, HSA_AGENT_INFO_VENDOR_NAME, info->vendor_name);
|
||||
RET_IF_HSA_ERR(err);
|
||||
|
||||
@@ -64,6 +116,26 @@ hsa_status_t agent_callback(hsa_agent_t agent, void *data) {
|
||||
|
||||
err = hsa_agent_get_info(agent, (hsa_agent_info_t) HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &info->compute_unit);
|
||||
RET_IF_HSA_ERR(err);
|
||||
|
||||
// According to the documentation, this is deprecated. But what should I be using then?
|
||||
err = hsa_agent_get_info(agent, (hsa_agent_info_t) HSA_AMD_AGENT_INFO_MEMORY_WIDTH, &info->bus_width);
|
||||
RET_IF_HSA_ERR(err);
|
||||
|
||||
err = hsa_agent_get_info(agent, (hsa_agent_info_t) HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES, &info->num_shader_engines);
|
||||
RET_IF_HSA_ERR(err);
|
||||
|
||||
err = hsa_agent_get_info(agent, (hsa_agent_info_t) HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU, &info->simds_per_cu);
|
||||
RET_IF_HSA_ERR(err);
|
||||
|
||||
err = hsa_agent_get_info(agent, (hsa_agent_info_t) HSA_AMD_AGENT_INFO_NUM_XCC, &info->num_xcc);
|
||||
RET_IF_HSA_ERR(err);
|
||||
|
||||
// We will check against zero to see if it was set beforehand.
|
||||
info->global_size = 0;
|
||||
info->lds_size = 0;
|
||||
// This will fill global_size and lds_size.
|
||||
err = hsa_amd_agent_iterate_memory_pools(agent, memory_pool_callback, data);
|
||||
RET_IF_HSA_ERR(err);
|
||||
}
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
@@ -73,11 +145,27 @@ struct topology_h* get_topology_info(struct agent_info info) {
|
||||
struct topology_h* topo = (struct topology_h*) emalloc(sizeof(struct topology_h));
|
||||
|
||||
topo->compute_units = info.compute_unit;
|
||||
topo->num_shader_engines = info.num_shader_engines; // not printed at the moment
|
||||
topo->simds_per_cu = info.simds_per_cu; // not printed at the moment
|
||||
topo->num_xcc = info.num_xcc;
|
||||
// Old GPUs (GCN I guess) might not have matrix cores.
|
||||
// Not sure what would happen here?
|
||||
topo->matrix_cores = topo->compute_units * topo->simds_per_cu;
|
||||
|
||||
return topo;
|
||||
}
|
||||
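As a rough sanity check of the compute_units * simds_per_cu estimate: AMD specifies the MI210 (CDNA2) with 104 CUs and 416 Matrix Cores, i.e. 4 per CU, which is exactly what this product yields when simds_per_cu is 4. For older GPUs without WMMA/MFMA units (e.g. GCN or RDNA1/2) the product is still computed but does not correspond to real matrix hardware, which is what the comment about old GPUs alludes to.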
|
||||
struct gpu_info* get_gpu_info_hsa(struct pci_dev *devices, int gpu_idx) {
|
||||
struct memory* get_memory_info(struct gpu_info* gpu, struct agent_info info) {
|
||||
struct memory* mem = (struct memory*) emalloc(sizeof(struct memory));
|
||||
|
||||
mem->bus_width = info.bus_width;
|
||||
mem->lds_size = info.lds_size;
|
||||
mem->size_bytes = info.global_size;
|
||||
|
||||
return mem;
|
||||
}
|
||||
|
||||
struct gpu_info* get_gpu_info_hsa(int gpu_idx) {
|
||||
struct gpu_info* gpu = (struct gpu_info*) emalloc(sizeof(struct gpu_info));
|
||||
gpu->pci = NULL;
|
||||
gpu->idx = gpu_idx;
|
||||
@@ -92,11 +180,8 @@ struct gpu_info* get_gpu_info_hsa(struct pci_dev *devices, int gpu_idx) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
hsa_status_t status;
|
||||
|
||||
// Initialize the HSA runtime
|
||||
status = hsa_init();
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
hsa_status_t err = hsa_init();
|
||||
if (err != HSA_STATUS_SUCCESS) {
|
||||
printErr("Failed to initialize HSA runtime");
|
||||
return NULL;
|
||||
}
|
||||
@@ -105,26 +190,53 @@ struct gpu_info* get_gpu_info_hsa(struct pci_dev *devices, int gpu_idx) {
|
||||
info.deviceId = gpu_idx;
|
||||
|
||||
// Iterate over all agents in the system
|
||||
status = hsa_iterate_agents(agent_callback, &info);
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
err = hsa_iterate_agents(agent_callback, &info);
|
||||
if (err != HSA_STATUS_SUCCESS) {
|
||||
printErr("Failed to iterate HSA agents");
|
||||
hsa_shut_down();
|
||||
return NULL;
|
||||
}
|
||||
|
||||
gpu->freq = info.max_clock_freq;
|
||||
if (strcmp(info.vendor_name, "AMD") != 0) {
|
||||
printErr("HSA vendor name is: '%s'. Only AMD is supported!", info.vendor_name);
|
||||
return NULL;
|
||||
}
|
||||
gpu->vendor = GPU_VENDOR_AMD;
|
||||
|
||||
gpu->freq = info.max_clock_freq;
|
||||
gpu->topo_h = get_topology_info(info);
|
||||
gpu->name = (char *) emalloc(sizeof(char) * (strlen(info.device_mkt_name) + 1));
|
||||
strcpy(gpu->name, info.device_mkt_name);
|
||||
gpu->topo_h = get_topology_info(info);
|
||||
gpu->arch = get_uarch_from_hsa(gpu, info.gpu_name);
|
||||
gpu->mem = get_memory_info(gpu, info);
|
||||
|
||||
// TODO: Use gpu_name for uarch detection
|
||||
if (gpu->arch == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Shut down the HSA runtime
|
||||
hsa_shut_down();
|
||||
err = hsa_shut_down();
|
||||
if (err != HSA_STATUS_SUCCESS) {
|
||||
printErr("Failed to shutdown HSA runtime");
|
||||
return NULL;
|
||||
}
|
||||
return gpu;
|
||||
}
|
||||
|
||||
char* get_str_cu(struct gpu_info* gpu) {
|
||||
return get_str_generic(gpu->topo_h->compute_units);
|
||||
}
|
||||
|
||||
char* get_str_xcds(struct gpu_info* gpu) {
|
||||
// If there is a single XCD, then we don't want to
|
||||
// print it.
|
||||
if (gpu->topo_h->num_xcc == 1) {
|
||||
return NULL;
|
||||
}
|
||||
return get_str_generic(gpu->topo_h->num_xcc);
|
||||
}
|
||||
|
||||
char* get_str_matrix_cores(struct gpu_info* gpu) {
|
||||
// TODO: Show XX (WMMA/MFMA)
|
||||
return get_str_generic(gpu->topo_h->matrix_cores);
|
||||
}
|
||||
@@ -3,7 +3,9 @@
|
||||
|
||||
#include "../common/gpu.hpp"
|
||||
|
||||
struct gpu_info* get_gpu_info_hsa(struct pci_dev *devices, int gpu_idx);
|
||||
struct gpu_info* get_gpu_info_hsa(int gpu_idx);
|
||||
char* get_str_cu(struct gpu_info* gpu);
|
||||
char* get_str_xcds(struct gpu_info* gpu);
|
||||
char* get_str_matrix_cores(struct gpu_info* gpu);
|
||||
|
||||
#endif
|
||||
|
||||
src/hsa/uarch.cpp (new file, 321 lines)
@@ -0,0 +1,321 @@
|
||||
#include <cstdlib>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
|
||||
#include "../common/uarch.hpp"
|
||||
#include "../common/global.hpp"
|
||||
#include "../common/gpu.hpp"
|
||||
#include "chips.hpp"
|
||||
|
||||
// MICROARCH values
|
||||
enum {
|
||||
UARCH_UNKNOWN,
|
||||
// GCN (Graphics Core Next)
|
||||
// Empty for now
|
||||
// ...
|
||||
// RDNA (Radeon DNA)
|
||||
UARCH_RDNA,
|
||||
UARCH_RDNA2,
|
||||
UARCH_RDNA3,
|
||||
UARCH_RDNA4,
|
||||
// CDNA (Compute DNA)
|
||||
UARCH_CDNA,
|
||||
UARCH_CDNA2,
|
||||
UARCH_CDNA3,
|
||||
UARCH_CDNA4
|
||||
};
|
||||
|
||||
static const char *uarch_str[] = {
|
||||
/*[ARCH_UNKNOWN] = */ STRING_UNKNOWN,
|
||||
/*[UARCH_RDNA] = */ "RDNA",
|
||||
/*[UARCH_RDNA2] = */ "RDNA2",
|
||||
/*[UARCH_RDNA3] = */ "RDNA3",
|
||||
/*[UARCH_RDNA4] = */ "RDNA4",
|
||||
/*[UARCH_CDNA] = */ "CDNA",
|
||||
/*[UARCH_CDNA2] = */ "CDNA2",
|
||||
/*[UARCH_CDNA3] = */ "CDNA3",
|
||||
/*[UARCH_CDNA4] = */ "CDNA4",
|
||||
};
|
||||
|
||||
// Sources:
|
||||
// - https://rocm.docs.amd.com/en/latest/reference/gpu-arch-specs.html
|
||||
// - https://www.techpowerup.com
|
||||
//
|
||||
// This is sometimes referred to as the LLVM target, but also as the shader ISA.
|
||||
//
|
||||
// LLVM target *usually* maps to a specific architecture. However there
|
||||
// are cases where this is not true:
|
||||
// MI8 is GCN3.0 with LLVM target gfx803
|
||||
// MI6 is GCN4.0 with LLVM target gfx803
|
||||
// or
|
||||
// Strix Point can be gfx1150 or gfx1151
|
||||
//
|
||||
// NOTE: GCN chips are stored for completeness, but they are
|
||||
// not actively supported.
|
||||
enum {
|
||||
TARGET_UNKNOWN_HSA,
|
||||
/// GCN (Graphics Core Next)
|
||||
/// ------------------------
|
||||
// GCN 1.0
|
||||
TARGET_GFX600,
|
||||
TARGET_GFX601,
|
||||
TARGET_GFX602,
|
||||
// GCN 2.0
|
||||
TARGET_GFX700,
|
||||
TARGET_GFX701,
|
||||
TARGET_GFX702,
|
||||
TARGET_GFX703,
|
||||
TARGET_GFX704,
|
||||
TARGET_GFX705,
|
||||
// GCN 3.0 / 4.0
|
||||
TARGET_GFX801,
|
||||
TARGET_GFX802,
|
||||
TARGET_GFX803,
|
||||
TARGET_GFX805,
|
||||
TARGET_GFX810,
|
||||
// GCN 5.0
|
||||
TARGET_GFX900,
|
||||
TARGET_GFX902,
|
||||
TARGET_GFX904,
|
||||
// GCN 5.1
|
||||
TARGET_GFX906,
|
||||
// ???
|
||||
TARGET_GFX909,
|
||||
TARGET_GFX90C,
|
||||
/// RDNA (Radeon DNA)
|
||||
/// -----------------
|
||||
// RDNA1
|
||||
TARGET_GFX1010,
|
||||
TARGET_GFX1011,
|
||||
TARGET_GFX1012,
|
||||
// RDNA2
|
||||
TARGET_GFX1013, // Oberon
|
||||
TARGET_GFX1030,
|
||||
TARGET_GFX1031,
|
||||
TARGET_GFX1032,
|
||||
TARGET_GFX1033,
|
||||
TARGET_GFX1034,
|
||||
TARGET_GFX1035, // ??
|
||||
TARGET_GFX1036, // ??
|
||||
// RDNA3
|
||||
TARGET_GFX1100,
|
||||
TARGET_GFX1101,
|
||||
TARGET_GFX1102,
|
||||
TARGET_GFX1103, // ???
|
||||
// RDNA3.5
|
||||
TARGET_GFX1150, // Strix Point
|
||||
TARGET_GFX1151, // Strix Halo / Strix Point
|
||||
TARGET_GFX1152, // Krackan Point
|
||||
TARGET_GFX1153, // ???
|
||||
// RDNA4
|
||||
TARGET_GFX1200,
|
||||
TARGET_GFX1201,
|
||||
TARGET_GFX1250, // ???
|
||||
TARGET_GFX1251, // ???
|
||||
/// CDNA (Compute DNA)
|
||||
/// ------------------
|
||||
// CDNA
|
||||
TARGET_GFX908,
|
||||
// CDNA2
|
||||
TARGET_GFX90A,
|
||||
// CDNA3
|
||||
TARGET_GFX942,
|
||||
// CDNA4
|
||||
TARGET_GFX950
|
||||
};
|
||||
|
||||
#define CHECK_UARCH_START if (false) {}
|
||||
#define CHECK_UARCH(arch, chip_, str, uarch, process) \
|
||||
else if (arch->chip == chip_) fill_uarch(arch, str, uarch, process);
|
||||
#define CHECK_UARCH_END else { if(arch->chip != CHIP_UNKNOWN_HSA) printBug("map_chip_to_uarch_hsa: Unknown chip id: %d", arch->chip); fill_uarch(arch, STRING_UNKNOWN, UARCH_UNKNOWN, UNK); }
|
||||
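For readers unfamiliar with this pattern, the three macros simply assemble a plain if/else-if chain; the first RDNA entries below expand roughly to:

if (false) {}
else if (arch->chip == CHIP_NAVI_10) fill_uarch(arch, "Navi 10", UARCH_RDNA, 7);
else if (arch->chip == CHIP_NAVI_12) fill_uarch(arch, "Navi 12", UARCH_RDNA, 7);
// ... one branch per CHECK_UARCH line ...
else {
  if (arch->chip != CHIP_UNKNOWN_HSA) printBug("map_chip_to_uarch_hsa: Unknown chip id: %d", arch->chip);
  fill_uarch(arch, STRING_UNKNOWN, UARCH_UNKNOWN, UNK);
}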
|
||||
void fill_uarch(struct uarch* arch, char const *str, MICROARCH u, uint32_t process) {
|
||||
arch->chip_str = (char *) emalloc(sizeof(char) * (strlen(str)+1));
|
||||
strcpy(arch->chip_str, str);
|
||||
arch->uarch = u;
|
||||
arch->process = process;
|
||||
}
|
||||
|
||||
// On chiplet based chips (such as Navi31, Navi32, etc),
|
||||
// we have 2 different processes: The MCD process and the
|
||||
// rest of the chip process. They might be different and here
|
||||
// we just take one - let's take MCD process for now.
|
||||
//
|
||||
// TODO: Should we differentiate?
|
||||
void map_chip_to_uarch_hsa(struct uarch* arch) {
|
||||
CHECK_UARCH_START
|
||||
|
||||
// RDNA
|
||||
CHECK_UARCH(arch, CHIP_NAVI_10, "Navi 10", UARCH_RDNA, 7)
|
||||
CHECK_UARCH(arch, CHIP_NAVI_12, "Navi 12", UARCH_RDNA, 7)
|
||||
CHECK_UARCH(arch, CHIP_NAVI_14, "Navi 14", UARCH_RDNA, 7)
|
||||
CHECK_UARCH(arch, CHIP_NAVI_21, "Navi 21", UARCH_RDNA2, 7)
|
||||
CHECK_UARCH(arch, CHIP_NAVI_22, "Navi 22", UARCH_RDNA2, 7)
|
||||
CHECK_UARCH(arch, CHIP_NAVI_23, "Navi 23", UARCH_RDNA2, 7)
|
||||
CHECK_UARCH(arch, CHIP_NAVI_24, "Navi 24", UARCH_RDNA2, 6)
|
||||
CHECK_UARCH(arch, CHIP_NAVI_31, "Navi 31", UARCH_RDNA3, 6)
|
||||
CHECK_UARCH(arch, CHIP_NAVI_32, "Navi 32", UARCH_RDNA3, 6)
|
||||
CHECK_UARCH(arch, CHIP_NAVI_33, "Navi 33", UARCH_RDNA3, 6)
|
||||
CHECK_UARCH(arch, CHIP_NAVI_44, "Navi 44", UARCH_RDNA4, 4)
|
||||
CHECK_UARCH(arch, CHIP_NAVI_48, "Navi 48", UARCH_RDNA4, 4)
|
||||
// CDNA
|
||||
// NOTE: We will not show chip name for CDNA, thus use empty str
|
||||
CHECK_UARCH(arch, CHIP_ARCTURUS, "", UARCH_CDNA, 7)
|
||||
CHECK_UARCH(arch, CHIP_ALDEBARAN, "", UARCH_CDNA2, 6)
|
||||
CHECK_UARCH(arch, CHIP_AQUA_VANJARAM, "", UARCH_CDNA3, 6)
|
||||
CHECK_UARCH(arch, CHIP_CDNA_NEXT, "", UARCH_CDNA4, 6) // big difference between MCD and rest of the chip process
|
||||
|
||||
CHECK_UARCH_END
|
||||
}
|
||||
|
||||
#define CHECK_TGT_START if (false) {}
|
||||
#define CHECK_TGT(target, llvm_target, chip) \
|
||||
else if (target == llvm_target) return chip;
|
||||
#define CHECK_TGT_END else { printBug("LLVM target '%d' has no matching chip", target); return CHIP_UNKNOWN_HSA; }
|
||||
|
||||
// We have at least 2 choices to infer the chip:
|
||||
//
|
||||
// - LLVM target (e.g., gfx1101 is Navi 32)
|
||||
// - PCI ID (e.g., 0x7470 is Navi 32)
|
||||
//
|
||||
// For now we will use the first approach, which seems to have
|
||||
// some issues, as mentioned in the enum above.
|
||||
// However PCI detection is also not perfect, since it is
|
||||
// quite hard to find PCI ids from old hardware.
|
||||
GPUCHIP get_chip_from_target_hsa(int32_t target) {
|
||||
CHECK_TGT_START
|
||||
/// RDNA
|
||||
/// -------------------------------------------
|
||||
CHECK_TGT(target, TARGET_GFX1010, CHIP_NAVI_10)
|
||||
CHECK_TGT(target, TARGET_GFX1011, CHIP_NAVI_12)
|
||||
CHECK_TGT(target, TARGET_GFX1012, CHIP_NAVI_14)
|
||||
// CHECK_TGT(target, TARGET_GFX1013, TODO)
|
||||
/// RDNA2
|
||||
/// -------------------------------------------
|
||||
CHECK_TGT(target, TARGET_GFX1030, CHIP_NAVI_21)
|
||||
CHECK_TGT(target, TARGET_GFX1031, CHIP_NAVI_22)
|
||||
CHECK_TGT(target, TARGET_GFX1032, CHIP_NAVI_23)
|
||||
CHECK_TGT(target, TARGET_GFX1033, CHIP_NAVI_21)
|
||||
CHECK_TGT(target, TARGET_GFX1034, CHIP_NAVI_24)
|
||||
// CHECK_TGT(target, TARGET_GFX1035, TODO)
|
||||
// CHECK_TGT(target, TARGET_GFX1036, TODO)
|
||||
/// RDNA3
|
||||
/// -------------------------------------------
|
||||
CHECK_TGT(target, TARGET_GFX1100, CHIP_NAVI_31)
|
||||
CHECK_TGT(target, TARGET_GFX1101, CHIP_NAVI_32)
|
||||
CHECK_TGT(target, TARGET_GFX1102, CHIP_NAVI_33)
|
||||
// CHECK_TGT(target, TARGET_GFX1103, TODO)
|
||||
/// RDNA3.5
|
||||
/// -------------------------------------------
|
||||
// CHECK_TGT(target, TARGET_GFX1150, TODO)
|
||||
// CHECK_TGT(target, TARGET_GFX1151, TODO)
|
||||
// CHECK_TGT(target, TARGET_GFX1152, TODO)
|
||||
// CHECK_TGT(target, TARGET_GFX1153, TODO)
|
||||
/// RDNA4
|
||||
/// -------------------------------------------
|
||||
CHECK_TGT(target, TARGET_GFX1200, CHIP_NAVI_44)
|
||||
CHECK_TGT(target, TARGET_GFX1201, CHIP_NAVI_48)
|
||||
// CHECK_TGT(target, TARGET_GFX1250, TODO)
|
||||
// CHECK_TGT(target, TARGET_GFX1251, TODO)
|
||||
/// CDNA
|
||||
/// -------------------------------------------
|
||||
CHECK_TGT(target, TARGET_GFX908, CHIP_ARCTURUS)
|
||||
/// CDNA2
|
||||
/// -------------------------------------------
|
||||
CHECK_TGT(target, TARGET_GFX90A, CHIP_ALDEBARAN)
|
||||
/// CDNA3
|
||||
/// -------------------------------------------
|
||||
CHECK_TGT(target, TARGET_GFX942, CHIP_AQUA_VANJARAM)
|
||||
/// CDNA4
|
||||
/// -------------------------------------------
|
||||
CHECK_TGT(target, TARGET_GFX950, CHIP_CDNA_NEXT)
|
||||
CHECK_TGT_END
|
||||
}
|
||||
|
||||
#define CHECK_TGT_STR_START if (false) {}
|
||||
#define CHECK_TGT_STR(target, llvm_target, chip) \
|
||||
else if (strcmp(target, llvm_target) == 0) return chip;
|
||||
#define CHECK_TGT_STR_END else { return TARGET_UNKNOWN_HSA; }
|
||||
|
||||
// Maps the LLVM target string to the enum value
|
||||
int32_t get_llvm_target_from_str(char* target) {
|
||||
// TODO: Autogenerate this
|
||||
// TODO: Add all, not only the ones we support in get_chip_from_target_hsa
|
||||
CHECK_TGT_STR_START
|
||||
CHECK_TGT_STR(target, "gfx1010", TARGET_GFX1010)
|
||||
CHECK_TGT_STR(target, "gfx1011", TARGET_GFX1011)
|
||||
CHECK_TGT_STR(target, "gfx1012", TARGET_GFX1012)
|
||||
CHECK_TGT_STR(target, "gfx1013", TARGET_GFX1013)
|
||||
CHECK_TGT_STR(target, "gfx1030", TARGET_GFX1030)
|
||||
CHECK_TGT_STR(target, "gfx1031", TARGET_GFX1031)
|
||||
CHECK_TGT_STR(target, "gfx1032", TARGET_GFX1032)
|
||||
CHECK_TGT_STR(target, "gfx1033", TARGET_GFX1033)
|
||||
CHECK_TGT_STR(target, "gfx1034", TARGET_GFX1034)
|
||||
CHECK_TGT_STR(target, "gfx1035", TARGET_GFX1035)
|
||||
CHECK_TGT_STR(target, "gfx1036", TARGET_GFX1036)
|
||||
CHECK_TGT_STR(target, "gfx1100", TARGET_GFX1100)
|
||||
CHECK_TGT_STR(target, "gfx1101", TARGET_GFX1101)
|
||||
CHECK_TGT_STR(target, "gfx1102", TARGET_GFX1102)
|
||||
CHECK_TGT_STR(target, "gfx1103", TARGET_GFX1103)
|
||||
CHECK_TGT_STR(target, "gfx1200", TARGET_GFX1200)
|
||||
CHECK_TGT_STR(target, "gfx1201", TARGET_GFX1201)
|
||||
CHECK_TGT_STR(target, "gfx1250", TARGET_GFX1250)
|
||||
CHECK_TGT_STR(target, "gfx1251", TARGET_GFX1251)
|
||||
CHECK_TGT_STR(target, "gfx908", TARGET_GFX908)
|
||||
CHECK_TGT_STR(target, "gfx90a", TARGET_GFX90A)
|
||||
CHECK_TGT_STR(target, "gfx942", TARGET_GFX942)
|
||||
CHECK_TGT_STR(target, "gfx950", TARGET_GFX950)
|
||||
CHECK_TGT_STR_END
|
||||
}
|
||||
|
||||
struct uarch* get_uarch_from_hsa(struct gpu_info* gpu, char* gpu_name) {
|
||||
struct uarch* arch = (struct uarch*) emalloc(sizeof(struct uarch));
|
||||
|
||||
arch->llvm_target = get_llvm_target_from_str(gpu_name);
|
||||
if (arch->llvm_target == TARGET_UNKNOWN_HSA) {
|
||||
printErr("Unknown LLVM target: '%s'", gpu_name);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
arch->chip_str = NULL;
|
||||
arch->chip = get_chip_from_target_hsa(arch->llvm_target);
|
||||
map_chip_to_uarch_hsa(arch);
|
||||
|
||||
return arch;
|
||||
}
|
||||
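To make the detection flow concrete: for an agent whose HSA name string is "gfx1101", get_llvm_target_from_str returns TARGET_GFX1101, get_chip_from_target_hsa maps that to CHIP_NAVI_32, and map_chip_to_uarch_hsa fills in the chip string "Navi 32", UARCH_RDNA3 and the 6 (nm, MCD) process value. A string not yet in the mapping, e.g. "gfx1150" (RDNA3.5), makes get_llvm_target_from_str return TARGET_UNKNOWN_HSA, so get_uarch_from_hsa prints an error and returns NULL.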
|
||||
bool is_uarch_valid(struct uarch* arch) {
|
||||
if (arch == NULL) {
|
||||
printBug("Invalid uarch: arch is NULL");
|
||||
return false;
|
||||
}
|
||||
if (arch->uarch >= UARCH_UNKNOWN && arch->uarch <= UARCH_CDNA4) {
|
||||
return true;
|
||||
}
|
||||
else {
|
||||
printBug("Invalid uarch: %d", arch->uarch);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
bool is_cdna(struct uarch* arch) {
|
||||
return arch->uarch == UARCH_CDNA ||
|
||||
arch->uarch == UARCH_CDNA2 ||
|
||||
arch->uarch == UARCH_CDNA3 ||
|
||||
arch->uarch == UARCH_CDNA4;
|
||||
}
|
||||
|
||||
char* get_str_chip(struct uarch* arch) {
|
||||
// We don't want to show CDNA chip names as they add
|
||||
// no value, since each architecture maps one to one
|
||||
// to a chip.
|
||||
if (is_cdna(arch)) return NULL;
|
||||
return arch->chip_str;
|
||||
}
|
||||
|
||||
const char* get_str_uarch_hsa(struct uarch* arch) {
|
||||
if (!is_uarch_valid(arch)) {
|
||||
return NULL;
|
||||
}
|
||||
return uarch_str[arch->uarch];
|
||||
}
|
||||
src/hsa/uarch.hpp (new file, 13 lines)
@@ -0,0 +1,13 @@
|
||||
#ifndef __HSA_UARCH__
|
||||
#define __HSA_UARCH__
|
||||
|
||||
#include "../common/gpu.hpp"
|
||||
|
||||
struct uarch;
|
||||
|
||||
struct uarch* get_uarch_from_hsa(struct gpu_info* gpu, char* gpu_name);
|
||||
char* get_str_uarch_hsa(struct uarch* arch);
|
||||
char* get_str_process(struct uarch* arch); // TODO: Shouldn't we define this in the cpp?
|
||||
char* get_str_chip(struct uarch* arch);
|
||||
|
||||
#endif
|
||||