Patch cuda.cpp with cloudy's fix

[v0.30] Add support for XCDs and matrix cores
For XCDs, we don't show them if the GPU is made of a single XCD, as it adds little value. For matrix cores, we assume the count can be computed as compute_units * simds_per_cu; this seems to work for the GPUs I checked from CDNA3 and RDNA3. Not sure what would happen for older GPUs that do not have matrix cores, though.
2026-01-10 19:29:45 -05:00 · 2025-10-26 10:51:27 +01:00 · 2025-10-24 22:29:45 +02:00 · 2025-10-23 21:30:02 +02:00
6 changed files with 180 additions and 46 deletions
--- a/build.sh
+++ b/build.sh
@@ -1,5 +1,24 @@
 #!/bin/bash
 # Print the usage text of the build script to standard output.
 # The here-document delimiter is unquoted, so every $0 in the text is
 # expanded to the name the script was invoked with.
 print_help() {
  cat << EOF
 Usage: $0 <backends> [build_type]
  <backends>    MANDATORY. Comma-separated list of 
                backends to enable.
                Valid options: hsa, intel, cuda
                Example: hsa,cuda
  [build_type]  OPTIONAL. Build type. Valid options:
                debug, release (default: release)
 Examples:
  $0 hsa,intel debug
  $0 cuda
  $0 hsa,intel,cuda release
 EOF
 }
 # gpufetch build script
 set -e
@@ -7,19 +26,79 @@ rm -rf build/ gpufetch
 mkdir build/
 cd build/
-if [ "$1" == "debug" ]
+if [ "$1" == "--help" ]
 then
-  BUILD_TYPE="Debug"
+  echo "gpufetch build script"
-else
+  echo
-  BUILD_TYPE="Release"
+  print_help
  exit 0
 fi
 if [[ $# -lt 1 ]]; then
  echo "ERROR: At least one backend must be specified."
  echo
  print_help
  exit 1
 fi
 # Work out the backend list and the build type from the command line.
 # The backend list is always the first argument. When more than one
 # argument is given, the last one is taken verbatim as the build type,
 # so that the build type validation performed afterwards can reject an
 # unknown value instead of it silently turning into "release".
 # With a single argument the build type defaults to "release".
 LAST_ARG="${!#}"
 BACKEND_ARG="${1}"
 if [[ $# -ge 2 ]]; then
  BUILD_TYPE="$LAST_ARG"
 else
  BUILD_TYPE="release"
 fi
 # Split comma-separated backends into an array
 IFS=',' read -r -a BACKENDS <<< "$BACKEND_ARG"
 # Validate build type 
 if [[ "$BUILD_TYPE" != "debug" && "$BUILD_TYPE" != "release" ]]
 then
  echo "Error: Invalid build type '$BUILD_TYPE'."
  echo "Valid options are: debug, release"
  exit 1
 fi
 # From lower to upper case
 CMAKE_FLAGS="-DCMAKE_BUILD_TYPE=${BUILD_TYPE^}"
 # Turn every requested backend into its cmake switch and append it to
 # CMAKE_FLAGS. The first name that is not a known backend aborts the
 # script with an error that lists the accepted names.
 VALID_BACKENDS=("hsa" "intel" "cuda")
 for BACKEND in "${BACKENDS[@]}"; do
  if [[ "$BACKEND" == "hsa" ]]; then
    CMAKE_FLAGS+=" -DENABLE_HSA_BACKEND=ON"
  elif [[ "$BACKEND" == "intel" ]]; then
    CMAKE_FLAGS+=" -DENABLE_INTEL_BACKEND=ON"
  elif [[ "$BACKEND" == "cuda" ]]; then
    CMAKE_FLAGS+=" -DENABLE_CUDA_BACKEND=ON"
  else
    echo "ERROR: Invalid backend '$BACKEND'."
    echo "Valid options: ${VALID_BACKENDS[*]}"
    exit 1
  fi
 done
 # You can also manually specify the compilation flags.
 # If you need to, just run the cmake command directly
 # instead of using this script.
 #
 # Here you will find some help:
 #
 # In case you have CUDA installed but it is not detected,
 # - set CMAKE_CUDA_COMPILER to your nvcc binary:
 # - set CMAKE_CUDA_COMPILER_TOOLKIT_ROOT to the CUDA root dir
 # for example:
 # cmake -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DCMAKE_CUDA_COMPILER_TOOLKIT_ROOT=/usr/local/cuda/ ..
-
+#
 # In case you want to explicitly disable a backend, you can:
 # Disable CUDA backend:
 # cmake -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DENABLE_CUDA_BACKEND=OFF ..
@@ -28,7 +107,9 @@ fi
 # Disable Intel backend:
 # cmake -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DENABLE_INTEL_BACKEND=OFF ..
-cmake -DCMAKE_BUILD_TYPE=$BUILD_TYPE ..
+echo "$0: Running cmake $CMAKE_FLAGS"
 echo 
 cmake $CMAKE_FLAGS ..
 os=$(uname)
 if [ "$os" == 'Linux' ]; then
--- a/src/common/gpu.hpp
+++ b/src/common/gpu.hpp
@@ -46,6 +46,10 @@ struct topology_c {
 // HSA topology
 // Topology figures of a GPU handled by the HSA backend.
 struct topology_h {
  int32_t compute_units;       // Number of compute units (CUs)
  int32_t num_shader_engines;  // Number of shader engines (not printed at the moment)
  int32_t simds_per_cu;        // SIMDs per compute unit (not printed at the moment)
  int32_t num_xcc;             // Number of Accelerator Complex Dies (XCDs)
  int32_t matrix_cores;        // Matrix cores; filled as compute_units * simds_per_cu
 };
 // Intel topology
--- a/src/common/printer.cpp
+++ b/src/common/printer.cpp
@@ -48,6 +48,8 @@ enum {
  ATTRIBUTE_FREQUENCY,     // ALL
  ATTRIBUTE_PEAK,          // ALL
  ATTRIBUTE_COMPUTE_UNITS, // HSA
  ATTRIBUTE_MATRIX_CORES,  // HSA
  ATTRIBUTE_XCDS,          // HSA
  ATTRIBUTE_LDS_SIZE,      // HSA
  ATTRIBUTE_STREAMINGMP,   // CUDA
  ATTRIBUTE_CORESPERMP,    // CUDA
@@ -70,6 +72,8 @@ static const AttributeField ATTRIBUTE_INFO[] = {
  { ATTRIBUTE_FREQUENCY,     "Max Frequency:",          "Max Freq.:" },
  { ATTRIBUTE_PEAK,          "Peak Performance:",       "Peak Perf.:" },
  { ATTRIBUTE_COMPUTE_UNITS, "Compute Units (CUs):",    "CUs" },
  { ATTRIBUTE_MATRIX_CORES,  "Matrix Cores:",           "Matrix Cores:" },
  { ATTRIBUTE_XCDS,          "XCDs:",                   "XCDs" },
  { ATTRIBUTE_LDS_SIZE,      "LDS size:",               "LDS:" },
  { ATTRIBUTE_STREAMINGMP,   "SMs:",                    "SMs:" },
  { ATTRIBUTE_CORESPERMP,    "Cores/SM:",               "Cores/SM:" },
@@ -488,6 +492,8 @@ bool print_gpufetch_amd(struct gpu_info* gpu, STYLE s, struct color** cs, struct
  char* uarch = get_str_uarch_hsa(gpu->arch);
  char* manufacturing_process = get_str_process(gpu->arch);
  char* cus = get_str_cu(gpu);
  char* matrix_cores = get_str_matrix_cores(gpu);
  char* xcds = get_str_xcds(gpu);
  char* max_frequency = get_str_freq(gpu);
  char* bus_width = get_str_bus_width(gpu);
  char* mem_size = get_str_memory_size(gpu);
@@ -501,6 +507,10 @@ bool print_gpufetch_amd(struct gpu_info* gpu, STYLE s, struct color** cs, struct
  setAttribute(art, ATTRIBUTE_TECHNOLOGY, manufacturing_process);
  setAttribute(art, ATTRIBUTE_FREQUENCY, max_frequency);
  setAttribute(art, ATTRIBUTE_COMPUTE_UNITS, cus);
  setAttribute(art, ATTRIBUTE_MATRIX_CORES, matrix_cores);
  if (xcds != NULL) {
    setAttribute(art, ATTRIBUTE_XCDS, xcds);
  }
  setAttribute(art, ATTRIBUTE_LDS_SIZE, lds_size);
  setAttribute(art, ATTRIBUTE_MEMORY, mem_size);
  setAttribute(art, ATTRIBUTE_BUS_WIDTH, bus_width);
--- a/src/cuda/cuda.cpp
+++ b/src/cuda/cuda.cpp
@@ -1,3 +1,6 @@
 // patched cuda.cpp for cuda13 by cloudy
 #include <cuda_runtime.h>
 #include <cstring>
 #include <cstdlib>
@@ -14,25 +17,20 @@ bool print_gpu_cuda(struct gpu_info* gpu) {
  char* cc = get_str_cc(gpu->arch);
  printf("%s (Compute Capability %s)\n", gpu->name, cc);
  free(cc);
  return true;
 }
 // Builds the cache description of a CUDA device from its properties.
 // Only the L2 entry is filled in: its size comes from prop.l2CacheSize
 // and it is recorded as one existing cache.
 // Returns a newly allocated struct cache (allocated with emalloc).
 struct cache* get_cache_info(cudaDeviceProp prop) {
  struct cach* l2 = (struct cach*) emalloc(sizeof(struct cach));
  l2->size = prop.l2CacheSize;
  l2->num_caches = 1;
  l2->exists = true;

  struct cache* info = (struct cache*) emalloc(sizeof(struct cache));
  info->L2 = l2;
  return info;
 }
 int get_tensor_cores(struct uarch* arch, int sm, int major) {
  if(major == 7) {
    // TU116 does not have tensor cores!
    // https://www.anandtech.com/show/13973/nvidia-gtx-1660-ti-review-feat-evga-xc-gaming/2
    if (is_chip_TU116(arch))
      return 0;
    return sm * 8;
@@ -43,57 +41,57 @@ int get_tensor_cores(struct uarch* arch, int sm, int major) {
 // Fills a newly allocated CUDA topology from the device properties:
 //  - streaming_mp: number of multiprocessors reported by the device
 //  - cores_per_mp: value of _ConvertSMVer2Cores for the device's
 //                  major/minor version
 //  - cuda_cores:   streaming_mp * cores_per_mp
 //  - tensor_cores: value of get_tensor_cores for this arch
 struct topology_c* get_topology_info(struct uarch* arch, cudaDeviceProp prop) {
  struct topology_c* result = (struct topology_c*) emalloc(sizeof(struct topology_c));

  // Values read directly from (or derived only from) the device properties.
  result->streaming_mp = prop.multiProcessorCount;
  result->cores_per_mp = _ConvertSMVer2Cores(prop.major, prop.minor);

  // Values derived from the fields set above.
  result->cuda_cores = result->streaming_mp * result->cores_per_mp;
  result->tensor_cores = get_tensor_cores(arch, result->streaming_mp, prop.major);

  return result;
 }
 // Guesses the memory clock multiplier.
 // The candidates 8, 4, 2 and 1 are tried in that order. A candidate is
 // taken when clkm_possible_for_uarch accepts it for this arch and the
 // distance between mem->freq divided by the candidate and gpu->freq is
 // strictly smaller than the best distance so far (which starts at
 // mem->freq). Returns 1 when no candidate is taken.
 int32_t guess_clock_multipilier(struct gpu_info* gpu, struct memory* mem) {
  static const int candidates[] = { 8, 4, 2, 1 };

  int32_t best_mul = 1;
  int32_t best_dist = mem->freq;

  for (int divisor : candidates) {
    int32_t dist = abs((mem->freq/divisor) - gpu->freq);
    if(clkm_possible_for_uarch(divisor, gpu->arch) && best_dist > dist) {
      best_mul = divisor;
      best_dist = dist;
    }
  }

  return best_mul;
 }
 struct memory* get_memory_info(struct gpu_info* gpu, cudaDeviceProp prop) {
  struct memory* mem = (struct memory*) emalloc(sizeof(struct memory));
  int val = 0;
  mem->size_bytes = (unsigned long long) prop.totalGlobalMem;
-  mem->freq = prop.memoryClockRate * 0.001f;
+
  if (cudaDeviceGetAttribute(&val, cudaDevAttrMemoryClockRate, gpu->idx) == cudaSuccess) {
      if (val > 1000000)
          mem->freq = (float)val / 1000000.0f;
      else
          mem->freq = (float)val * 0.001f;
  } else {
      mem->freq = 0.0f;
  }
  mem->bus_width = prop.memoryBusWidth;
  mem->clk_mul = guess_clock_multipilier(gpu, mem);
  mem->type = guess_memtype_from_cmul_and_uarch(mem->clk_mul, gpu->arch);
-  // Fix frequency returned from CUDA to show real frequency
+  if (mem->clk_mul > 0)
-  mem->freq = mem->freq  / mem->clk_mul;
+      mem->freq = mem->freq / mem->clk_mul;
  return mem;
 }
 // Compute peak performance when using CUDA cores.
 // Returns gpu->freq * 1000000 * (number of CUDA cores) * 2.
 // NOTE(review): the 1000000 factor suggests gpu->freq is expressed in MHz,
 // and the factor 2 presumably counts a fused multiply-add as two
 // operations - confirm against the callers.
 int64_t get_peak_performance_cuda(struct gpu_info* gpu) {
  return gpu->freq * 1000000 * gpu->topo_c->cuda_cores * 2;
 }
 // Compute peak performance when using tensor cores
 int64_t get_peak_performance_tcu(cudaDeviceProp prop, struct gpu_info* gpu) {
  // Volta / Turing tensor cores performs 4x4x4 FP16 matrix multiplication
  // Ampere tensor cores performs 8x4x8 FP16 matrix multiplicacion
  if(prop.major == 7) return gpu->freq * 1000000 * 4 * 4 * 4  * 2 * gpu->topo_c->tensor_cores;
  else if(prop.major == 8) return gpu->freq * 1000000 * 8 * 4 * 8 * 2 * gpu->topo_c->tensor_cores;
  else return 0;
@@ -115,8 +113,7 @@ struct gpu_info* get_gpu_info_cuda(struct pci_dev *devices, int gpu_idx) {
  }
  int num_gpus = -1;
-  cudaError_t err = cudaSuccess;
+  cudaError_t err = cudaGetDeviceCount(&num_gpus);
  err = cudaGetDeviceCount(&num_gpus);
  if(gpu_idx == 0) {
    printf("\r%*c\r", (int) strlen(CUDA_DRIVER_START_WARNING), ' ');
@@ -134,7 +131,6 @@ struct gpu_info* get_gpu_info_cuda(struct pci_dev *devices, int gpu_idx) {
  }
  if(gpu->idx+1 > num_gpus) {
    // Master is trying to query an invalid GPU
    return NULL;
  }
@@ -144,15 +140,25 @@ struct gpu_info* get_gpu_info_cuda(struct pci_dev *devices, int gpu_idx) {
    return NULL;
  }
-  gpu->freq = deviceProp.clockRate * 1e-3f;
+  int core_clk = 0;
  if (cudaDeviceGetAttribute(&core_clk, cudaDevAttrClockRate, gpu->idx) == cudaSuccess) {
      if (core_clk > 1000000)
          gpu->freq = core_clk / 1000000.0f;
      else
          gpu->freq = core_clk * 0.001f;
  } else {
      gpu->freq = 0.0f;
  }
  gpu->vendor = GPU_VENDOR_NVIDIA;
-  gpu->name = (char *) emalloc(sizeof(char) * (strlen(deviceProp.name) + 1));
+  gpu->name = (char *) emalloc(strlen(deviceProp.name) + 1);
  strcpy(gpu->name, deviceProp.name);
  if((gpu->pci = get_pci_from_pciutils(devices, PCI_VENDOR_ID_NVIDIA, gpu_idx)) == NULL) {
    printErr("Unable to find a valid device for vendor id 0x%.4X using pciutils", PCI_VENDOR_ID_NVIDIA);
    return NULL;
  }
  gpu->arch = get_uarch_from_cuda(gpu);
  gpu->cach = get_cache_info(deviceProp);
  gpu->mem = get_memory_info(gpu, deviceProp);
@@ -163,19 +169,7 @@ struct gpu_info* get_gpu_info_cuda(struct pci_dev *devices, int gpu_idx) {
  return gpu;
 }
-char* get_str_sm(struct gpu_info* gpu) {
+char* get_str_sm(struct gpu_info* gpu) { return get_str_generic(gpu->topo_c->streaming_mp); }
-  return get_str_generic(gpu->topo_c->streaming_mp);
+char* get_str_cores_sm(struct gpu_info* gpu) { return get_str_generic(gpu->topo_c->cores_per_mp); }
-}
+char* get_str_cuda_cores(struct gpu_info* gpu) { return get_str_generic(gpu->topo_c->cuda_cores); }
-
+char* get_str_tensor_cores(struct gpu_info* gpu) { return get_str_generic(gpu->topo_c->tensor_cores); }
 char* get_str_cores_sm(struct gpu_info* gpu) {
  return get_str_generic(gpu->topo_c->cores_per_mp);
 }
 char* get_str_cuda_cores(struct gpu_info* gpu) {
  return get_str_generic(gpu->topo_c->cuda_cores);
 }
 char* get_str_tensor_cores(struct gpu_info* gpu) {
  return get_str_generic(gpu->topo_c->tensor_cores);
 }
--- a/src/hsa/hsa.cpp
+++ b/src/hsa/hsa.cpp
@@ -22,10 +22,16 @@ struct agent_info {
  char vendor_name[64];
  char device_mkt_name[64];
  uint32_t max_clock_freq;
-  uint32_t compute_unit;
+  // Memory
  uint32_t bus_width;
  uint32_t lds_size;
  uint64_t global_size;
  // Topology
  uint32_t compute_unit;
  uint32_t num_shader_engines;
  uint32_t simds_per_cu;
  uint32_t num_xcc;            // Acccelerator Complex Dies (XCDs)
  uint32_t matrix_cores;       // Cores with WMMA/MFMA capabilities
 };
 #define RET_IF_HSA_ERR(err) { \
@@ -52,6 +58,11 @@ hsa_status_t memory_pool_callback(hsa_amd_memory_pool_t pool, void* data) {
  if (segment == HSA_AMD_SEGMENT_GROUP) {
    // LDS memory
    // We want to make sure that this memory pool is not repeated.
    if (info->lds_size != 0) {
      printErr("Found HSA_AMD_SEGMENT_GROUP twice!");
      return HSA_STATUS_ERROR;
    }
    uint32_t size = 0;
    err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SIZE, &size);
@@ -110,7 +121,19 @@ hsa_status_t agent_callback(hsa_agent_t agent, void *data) {
    err = hsa_agent_get_info(agent, (hsa_agent_info_t) HSA_AMD_AGENT_INFO_MEMORY_WIDTH, &info->bus_width);
    RET_IF_HSA_ERR(err);
    err = hsa_agent_get_info(agent, (hsa_agent_info_t) HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES, &info->num_shader_engines);
    RET_IF_HSA_ERR(err);
    err = hsa_agent_get_info(agent, (hsa_agent_info_t) HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU, &info->simds_per_cu);
    RET_IF_HSA_ERR(err);
    err = hsa_agent_get_info(agent, (hsa_agent_info_t) HSA_AMD_AGENT_INFO_NUM_XCC, &info->num_xcc);
    RET_IF_HSA_ERR(err);
    // We will check against zero to see if it was set beforehand.
    info->global_size = 0;
    info->lds_size = 0;
    // This will fill global_size and lds_size.
    err = hsa_amd_agent_iterate_memory_pools(agent, memory_pool_callback, data);
    RET_IF_HSA_ERR(err);
  }
@@ -122,6 +145,12 @@ struct topology_h* get_topology_info(struct agent_info info) {
  struct topology_h* topo = (struct topology_h*) emalloc(sizeof(struct topology_h));
  topo->compute_units = info.compute_unit;
  topo->num_shader_engines = info.num_shader_engines; // not printed at the moment
  topo->simds_per_cu = info.simds_per_cu;             // not printed at the moment
  topo->num_xcc = info.num_xcc;
  // Old GPUs (GCN I guess) might not have matrix cores.
  // Not sure what would happen here?
  topo->matrix_cores = topo->compute_units * topo->simds_per_cu;
  return topo;
 }
@@ -197,3 +226,17 @@ struct gpu_info* get_gpu_info_hsa(int gpu_idx) {
 // Returns the result of get_str_generic for the number of compute units
 // (presumably its textual form, used when printing the GPU summary).
 char* get_str_cu(struct gpu_info* gpu) {
  return get_str_generic(gpu->topo_h->compute_units);
 }
 // Returns the result of get_str_generic for the XCD count, or NULL when
 // the GPU is made of exactly one XCD, in which case it is not worth
 // printing.
 char* get_str_xcds(struct gpu_info* gpu) {
  const bool show_xcds = gpu->topo_h->num_xcc != 1;
  if (show_xcds) {
    return get_str_generic(gpu->topo_h->num_xcc);
  }
  return NULL;
 }
 // Returns the result of get_str_generic for the number of matrix cores
 // stored in the HSA topology.
 char* get_str_matrix_cores(struct gpu_info* gpu) {
  // TODO: Show XX (WMMA/MFMA)
  return get_str_generic(gpu->topo_h->matrix_cores);
 }
--- a/src/hsa/hsa.hpp
+++ b/src/hsa/hsa.hpp
@@ -5,5 +5,7 @@
 struct gpu_info* get_gpu_info_hsa(int gpu_idx);
 char* get_str_cu(struct gpu_info* gpu);
 char* get_str_xcds(struct gpu_info* gpu);
 char* get_str_matrix_cores(struct gpu_info* gpu);
 #endif
Author	SHA1	Message	Date
Franscobec	0f416b2da9	Patch cuda.cpp with cloudy's fix	2026-01-10 19:29:45 -05:00
Dr-Noob	5f619dc95a	[v0.30] Add support for XCDs and matrix cores. For XCDs, we don't show them if the GPU is made of a single XCD, as it adds little value. For matrix cores, we assume the count can be computed as compute_units * simds_per_cu; this seems to work for the GPUs I checked from CDNA3 and RDNA3. Not sure what would happen for older GPUs that do not have matrix cores, though.	2025-10-26 10:51:27 +01:00
Dr-Noob	98bb02e203	[v0.30] Allow users to select backend from build script. Before we had AMD support, CMakeLists.txt tried to enable all backends by default. Now that we have AMD support, that does not make that much sense, so instead it will only enable the backends specified by the user (with the -DENABLE_XXX_BACKEND flags). Before AMD support, the build.sh script was useful to just invoke cmake and let it figure out the backends, but the script became a bit useless after the mentioned change in CMakeLists.txt. Therefore, this commit allows users to pass an argument, like: ./build.sh cuda, to specify which backend(s) to enable, without the need to manually configure the build with the -DENABLE_XXX_BACKEND flags. Note that multiple backends are also allowed, like: ./build.sh intel,hsa, which would enable both the Intel and HSA backends (this could make sense, for example, in a system with an Intel iGPU and an AMD dGPU).	2025-10-24 22:29:45 +02:00
Dr-Noob	78d34e71f1	[v0.30][AMD] Add support to fetch bus width, global memory and LDS size We can use hsa_amd_agent_iterate_memory_pools to fetch info about GPU memory pools in the GPU. HSA_AMD_SEGMENT_GROUP seems to be LDS, and HSA_AMD_SEGMENT_GLOBAL seems to be global memory. However, the latter is reported multiple times (I don't know why). The only solution I found for this is to check for the HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED flag, which seems to be reported only once. For bus width, we simply use HSA_AMD_AGENT_INFO_MEMORY_WIDTH.	2025-10-23 21:30:02 +02:00