Remove TODO

Fix
Add matrix cores
2025-10-26 10:47:27 +01:00 · 2025-10-26 10:44:09 +01:00 · 2025-10-26 10:42:25 +01:00 · 2025-10-26 10:28:41 +01:00 · 2025-10-26 10:27:51 +01:00 · 2025-10-23 21:40:14 +02:00
5 changed files with 144 additions and 4 deletions
--- a/src/common/gpu.cpp
+++ b/src/common/gpu.cpp
@@ -101,6 +101,17 @@ char* get_str_bus_width(struct gpu_info* gpu) {
  return string;
 }

+// Formats the LDS size held in gpu->mem->lds_size (bytes) as "<N> KB".
+// The returned string is allocated with ecalloc and is not freed here.
+char* get_str_lds_size(struct gpu_info* gpu) {
+  // TODO: Show XX KB (XX MB Total) like in cpufetch
+  // Sized for the widest value an int32_t can produce after the division:
+  // sign + 7 digits + space + "KB" + terminator ("-2097152 KB").
+  uint32_t size = 1+7+1+2+1;
+  char* string = (char *) ecalloc(size, sizeof(char));
+
+  // Bounded write: a larger-than-expected LDS size can no longer run past
+  // the end of the buffer.
+  snprintf(string, size, "%d KB", gpu->mem->lds_size / 1024);
+
+  return string;
+}
+
 char* get_str_memory_clock(struct gpu_info* gpu) {
  return get_freq_as_str_mhz(gpu->mem->freq);
 }
--- a/src/common/gpu.hpp
+++ b/src/common/gpu.hpp
@@ -46,6 +46,10 @@ struct topology_c {
 // HSA topology
 struct topology_h {
  int32_t compute_units;
+  int32_t num_shader_engines; // not printed at the moment
+  int32_t simds_per_cu;       // not printed at the moment
+  int32_t num_xcc;            // Accelerator Complex Dies (XCDs)
+  int32_t matrix_cores;       // Cores with WMMA/MFMA capabilities
 };

 // Intel topology
@@ -61,6 +65,7 @@ struct memory {
  int32_t bus_width;
  int32_t freq;
  int32_t clk_mul; // clock multiplier
+  int32_t lds_size; // HSA specific for now
 };

 struct gpu_info {
@@ -88,6 +93,7 @@ char* get_str_freq(struct gpu_info* gpu);
 char* get_str_memory_size(struct gpu_info* gpu);
 char* get_str_memory_type(struct gpu_info* gpu);
 char* get_str_bus_width(struct gpu_info* gpu);
+char* get_str_lds_size(struct gpu_info* gpu);
 char* get_str_memory_clock(struct gpu_info* gpu);
 char* get_str_l2(struct gpu_info* gpu);
 char* get_str_peak_performance(struct gpu_info* gpu);
--- a/src/common/printer.cpp
+++ b/src/common/printer.cpp
@@ -48,14 +48,17 @@ enum {
  ATTRIBUTE_FREQUENCY,     // ALL
  ATTRIBUTE_PEAK,          // ALL
  ATTRIBUTE_COMPUTE_UNITS, // HSA
+  ATTRIBUTE_MATRIX_CORES,  // HSA
+  ATTRIBUTE_XCDS,          // HSA
+  ATTRIBUTE_LDS_SIZE,      // HSA
  ATTRIBUTE_STREAMINGMP,   // CUDA
  ATTRIBUTE_CORESPERMP,    // CUDA
  ATTRIBUTE_CUDA_CORES,    // CUDA
  ATTRIBUTE_TENSOR_CORES,  // CUDA
  ATTRIBUTE_L2,            // CUDA
-  ATTRIBUTE_MEMORY,        // CUDA
+  ATTRIBUTE_MEMORY,        // CUDA,HSA
  ATTRIBUTE_MEMORY_FREQ,   // CUDA
-  ATTRIBUTE_BUS_WIDTH,     // CUDA
+  ATTRIBUTE_BUS_WIDTH,     // CUDA,HSA
  ATTRIBUTE_PEAK_TENSOR,   // CUDA
  ATTRIBUTE_EUS,           // Intel
  ATTRIBUTE_GT,            // Intel
@@ -69,6 +72,9 @@ static const AttributeField ATTRIBUTE_INFO[] = {
  { ATTRIBUTE_FREQUENCY,     "Max Frequency:",          "Max Freq.:" },
  { ATTRIBUTE_PEAK,          "Peak Performance:",       "Peak Perf.:" },
  { ATTRIBUTE_COMPUTE_UNITS, "Compute Units (CUs):",    "CUs" },
+  { ATTRIBUTE_MATRIX_CORES,  "Matrix Cores: ",          "Matrix Cores:" },
+  { ATTRIBUTE_XCDS,          "XCDs:",                   "XCDs" },
+  { ATTRIBUTE_LDS_SIZE,      "LDS size:",               "LDS:" },
  { ATTRIBUTE_STREAMINGMP,   "SMs:",                    "SMs:" },
  { ATTRIBUTE_CORESPERMP,    "Cores/SM:",               "Cores/SM:" },
  { ATTRIBUTE_CUDA_CORES,    "CUDA Cores:",             "CUDA Cores:" },
@@ -197,8 +203,6 @@ bool ascii_fits_screen(int termw, struct ascii_logo logo, int lf) {
 void replace_bgbyfg_color(struct ascii_logo* logo) {
  // Replace background by foreground color
  for(int i=0; i < 2; i++) {
-    if(logo->color_ascii[i] == NULL) break;
-
    if(strcmp(logo->color_ascii[i], C_BG_BLACK) == 0) strcpy(logo->color_ascii[i], C_FG_BLACK);
    else if(strcmp(logo->color_ascii[i], C_BG_RED) == 0) strcpy(logo->color_ascii[i], C_FG_RED);
    else if(strcmp(logo->color_ascii[i], C_BG_GREEN) == 0) strcpy(logo->color_ascii[i], C_FG_GREEN);
@@ -488,7 +492,12 @@ bool print_gpufetch_amd(struct gpu_info* gpu, STYLE s, struct color** cs, struct
  char* uarch = get_str_uarch_hsa(gpu->arch);
  char* manufacturing_process = get_str_process(gpu->arch);
  char* cus = get_str_cu(gpu);
+  char* matrix_cores = get_str_matrix_cores(gpu);
+  char* xcds = get_str_xcds(gpu);
  char* max_frequency = get_str_freq(gpu);
+  char* bus_width = get_str_bus_width(gpu);
+  char* mem_size = get_str_memory_size(gpu);
+  char* lds_size = get_str_lds_size(gpu);

  setAttribute(art, ATTRIBUTE_NAME, gpu_name);
  if (gpu_chip != NULL) {
@@ -498,6 +507,13 @@ bool print_gpufetch_amd(struct gpu_info* gpu, STYLE s, struct color** cs, struct
  setAttribute(art, ATTRIBUTE_TECHNOLOGY, manufacturing_process);
  setAttribute(art, ATTRIBUTE_FREQUENCY, max_frequency);
  setAttribute(art, ATTRIBUTE_COMPUTE_UNITS, cus);
+  setAttribute(art, ATTRIBUTE_MATRIX_CORES, matrix_cores);
+  if (xcds != NULL) {
+    setAttribute(art, ATTRIBUTE_XCDS, xcds);
+  }
+  setAttribute(art, ATTRIBUTE_LDS_SIZE, lds_size);
+  setAttribute(art, ATTRIBUTE_MEMORY, mem_size);
+  setAttribute(art, ATTRIBUTE_BUS_WIDTH, bus_width);

  bool use_short = false;
  uint32_t longest_attribute = longest_attribute_length(art, use_short);
--- a/src/hsa/hsa.cpp
+++ b/src/hsa/hsa.cpp
@@ -22,7 +22,16 @@ struct agent_info {
  char vendor_name[64];
  char device_mkt_name[64];
  uint32_t max_clock_freq;
+  // Memory
+  uint32_t bus_width;
+  uint32_t lds_size;
+  uint64_t global_size;
+  // Topology
  uint32_t compute_unit;
+  uint32_t num_shader_engines;
+  uint32_t simds_per_cu;
+  uint32_t num_xcc;            // Accelerator Complex Dies (XCDs)
+  uint32_t matrix_cores;       // Cores with WMMA/MFMA capabilities
 };

 #define RET_IF_HSA_ERR(err) { \
@@ -40,6 +49,51 @@ struct agent_info {
  }                                                                           \
 }

+// Callback for hsa_amd_agent_iterate_memory_pools. `data` must point to a
+// struct agent_info whose lds_size and global_size were set to zero before
+// the iteration started: zero is used as the "not seen yet" marker.
+//  - GROUP segment pool: its size is stored in info->lds_size.
+//  - GLOBAL segment pool carrying the EXTENDED_SCOPE_FINE_GRAINED flag: its
+//    size is stored in info->global_size.
+// Returns HSA_STATUS_ERROR when either of those pools is reported twice,
+// HSA_STATUS_SUCCESS otherwise.
+hsa_status_t memory_pool_callback(hsa_amd_memory_pool_t pool, void* data) {
+  struct agent_info* info = reinterpret_cast<struct agent_info *>(data);
+
+  hsa_amd_segment_t segment;
+  hsa_status_t err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment);
+  RET_IF_HSA_ERR(err);
+
+  if (segment == HSA_AMD_SEGMENT_GROUP) {
+    // LDS memory
+    // We want to make sure that this memory pool is not repeated.
+    if (info->lds_size != 0) {
+      printErr("Found HSA_AMD_SEGMENT_GROUP twice!");
+      return HSA_STATUS_ERROR;
+    }
+    // HSA_AMD_MEMORY_POOL_INFO_SIZE is documented as size_t. Querying it into
+    // a uint32_t lets the runtime write past the variable on 64-bit targets.
+    size_t size = 0;
+
+    err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SIZE, &size);
+    RET_IF_HSA_ERR(err);
+
+    info->lds_size = (uint32_t) size;
+  }
+  else if (segment == HSA_AMD_SEGMENT_GLOBAL) {
+    // Global memory
+    uint32_t global_flags = 0;
+
+    err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flags);
+    RET_IF_HSA_ERR(err);
+
+    // Only the pool with this flag is counted, so that global memory is
+    // recorded once.
+    if (global_flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED) {
+      if (info->global_size != 0) {
+        printErr("Found HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED twice!");
+        return HSA_STATUS_ERROR;
+      }
+
+      // Same attribute as above: query it with its documented type.
+      size_t size = 0;
+
+      err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SIZE, &size);
+      RET_IF_HSA_ERR(err);
+
+      info->global_size = (uint64_t) size;
+    }
+  }
+  return HSA_STATUS_SUCCESS;
+}
+
 hsa_status_t agent_callback(hsa_agent_t agent, void *data) {
  struct agent_info* info = reinterpret_cast<struct agent_info *>(data);

@@ -62,6 +116,26 @@ hsa_status_t agent_callback(hsa_agent_t agent, void *data) {

    err = hsa_agent_get_info(agent, (hsa_agent_info_t) HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &info->compute_unit);
    RET_IF_HSA_ERR(err);
+
+    // According to the documentation, this is deprecated. But what should I be using then?
+    err = hsa_agent_get_info(agent, (hsa_agent_info_t) HSA_AMD_AGENT_INFO_MEMORY_WIDTH, &info->bus_width);
+    RET_IF_HSA_ERR(err);
+
+    err = hsa_agent_get_info(agent, (hsa_agent_info_t) HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES, &info->num_shader_engines);
+    RET_IF_HSA_ERR(err);
+
+    err = hsa_agent_get_info(agent, (hsa_agent_info_t) HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU, &info->simds_per_cu);
+    RET_IF_HSA_ERR(err);
+
+    err = hsa_agent_get_info(agent, (hsa_agent_info_t) HSA_AMD_AGENT_INFO_NUM_XCC, &info->num_xcc);
+    RET_IF_HSA_ERR(err);
+
+    // We will check against zero to see if it was set beforehand.
+    info->global_size = 0;
+    info->lds_size = 0;
+    // This will fill global_size and lds_size.
+    err = hsa_amd_agent_iterate_memory_pools(agent, memory_pool_callback, data);
+    RET_IF_HSA_ERR(err);
  }

  return HSA_STATUS_SUCCESS;
@@ -71,10 +145,26 @@ struct topology_h* get_topology_info(struct agent_info info) {
  struct topology_h* topo = (struct topology_h*) emalloc(sizeof(struct topology_h));

  topo->compute_units = info.compute_unit;
+  topo->num_shader_engines = info.num_shader_engines; // not printed at the moment
+  topo->simds_per_cu = info.simds_per_cu;             // not printed at the moment
+  topo->num_xcc = info.num_xcc;
+  // Old GPUs (GCN I guess) might not have matrix cores.
+  // Not sure what would happen here?
+  topo->matrix_cores = topo->compute_units * topo->simds_per_cu;

  return topo;
 }

+// Builds the memory description of an HSA GPU from the values collected in
+// `info` (bus width, LDS size and global memory size). `gpu` is currently
+// unused. The returned struct is allocated with emalloc.
+struct memory* get_memory_info(struct gpu_info* gpu, struct agent_info info) {
+  struct memory* mem = (struct memory*) emalloc(sizeof(struct memory));
+  // Only three fields are assigned below. Zero the whole struct first so the
+  // remaining ones (freq, clk_mul, ...) never hold indeterminate values;
+  // nothing visible here shows that emalloc clears the allocation.
+  memset(mem, 0, sizeof(struct memory));
+
+  mem->bus_width = info.bus_width;
+  mem->lds_size = info.lds_size;
+  mem->size_bytes = info.global_size;
+
+  return mem;
+}
+
 struct gpu_info* get_gpu_info_hsa(int gpu_idx) {
  struct gpu_info* gpu = (struct gpu_info*) emalloc(sizeof(struct gpu_info));
  gpu->pci = NULL;
@@ -118,6 +208,7 @@ struct gpu_info* get_gpu_info_hsa(int gpu_idx) {
  gpu->name = (char *) emalloc(sizeof(char) * (strlen(info.device_mkt_name) + 1));
  strcpy(gpu->name, info.device_mkt_name);
  gpu->arch = get_uarch_from_hsa(gpu, info.gpu_name);
+  gpu->mem = get_memory_info(gpu, info);

  if (gpu->arch == NULL) {
    return NULL;
@@ -135,3 +226,17 @@ struct gpu_info* get_gpu_info_hsa(int gpu_idx) {
 // Returns the compute unit (CU) count of the HSA topology as a string.
 char* get_str_cu(struct gpu_info* gpu) {
  const int32_t cu_count = gpu->topo_h->compute_units;
  return get_str_generic(cu_count);
 }
+
+// Returns the XCD count of the HSA topology as a string, or NULL when the
+// GPU reports exactly one XCD so that the caller can leave the field out.
+char* get_str_xcds(struct gpu_info* gpu) {
+  const int32_t xcd_count = gpu->topo_h->num_xcc;
+
+  // A single XCD is not worth printing.
+  return (xcd_count == 1) ? NULL : get_str_generic(xcd_count);
+}
+
+// Returns the matrix core count of the HSA topology as a string.
+char* get_str_matrix_cores(struct gpu_info* gpu) {
+  // TODO: Show XX (WMMA/MFMA)
+  const int32_t core_count = gpu->topo_h->matrix_cores;
+  return get_str_generic(core_count);
+}
--- a/src/hsa/hsa.hpp
+++ b/src/hsa/hsa.hpp
@@ -5,5 +5,7 @@

 struct gpu_info* get_gpu_info_hsa(int gpu_idx);
 char* get_str_cu(struct gpu_info* gpu);
+char* get_str_xcds(struct gpu_info* gpu);
+char* get_str_matrix_cores(struct gpu_info* gpu);

 #endif
Author	SHA1	Message	Date
Dr-Noob	84e6021a95	Remove TODO	2025-10-26 10:47:27 +01:00
Dr-Noob	a4916255cf	Fix	2025-10-26 10:44:09 +01:00
Dr-Noob	b5dc30d4b3	Add matrix cores	2025-10-26 10:42:25 +01:00
Dr-Noob	2fa90179b4	Fix	2025-10-26 10:28:41 +01:00
Dr-Noob	711936be81	Show XCDs	2025-10-26 10:27:51 +01:00
Dr-Noob	94a9a440f0	Basic support	2025-10-23 21:40:14 +02:00
Dr-Noob	78d34e71f1	[v0.30][AMD] Add support to fetch bus width, global memory and LDS size We can use hsa_amd_agent_iterate_memory_pools to fetch info about GPU memory pools in the GPU. HSA_AMD_SEGMENT_GROUP seems to be LDS, and HSA_AMD_SEGMENT_GLOBAL seems to be global memory. However, the latter is reported multiple times (I don't know why). The only solution I found for this is to check for the HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED flag, which seems to be reported only once. For bus width, we simply use HSA_AMD_AGENT_INFO_MEMORY_WIDTH.	2025-10-23 21:30:02 +02:00
Dr-Noob	82ea16fc3d	[v0.30] Fix warning in printer	2025-10-16 20:01:14 +02:00
Dr-Noob	6589de9717	[v0.30] Reorganize attributes in printer and add CUs attr for AMD	2025-10-16 19:53:48 +02:00