diff --git a/CMakeLists.txt b/CMakeLists.txt
index 79e0ba1..9c0864e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -127,7 +127,7 @@ endif()
 
 if(ENABLE_HSA_BACKEND)
   target_compile_definitions(gpufetch PUBLIC BACKEND_HSA)
-  add_library(hsa_backend STATIC ${HSA_DIR}/hsa.cpp)
+  add_library(hsa_backend STATIC ${HSA_DIR}/hsa.cpp ${HSA_DIR}/uarch.cpp)
 
   if(NOT ${PCIUTILS_FOUND})
     add_dependencies(hsa_backend pciutils)
diff --git a/src/common/printer.cpp b/src/common/printer.cpp
index e255e57..eca1721 100644
--- a/src/common/printer.cpp
+++ b/src/common/printer.cpp
@@ -490,10 +490,14 @@ bool print_gpufetch_amd(struct gpu_info* gpu, STYLE s, struct color** cs, struct
     return false;
 
   char* gpu_name = get_str_gpu_name(gpu);
+  char* uarch = get_str_uarch_hsa(gpu->arch);
+  char* manufacturing_process = get_str_process(gpu->arch);
   char* sms = get_str_cu(gpu);
   char* max_frequency = get_str_freq(gpu);
 
   setAttribute(art, ATTRIBUTE_NAME, gpu_name);
+  setAttribute(art, ATTRIBUTE_UARCH, uarch);
+  setAttribute(art, ATTRIBUTE_TECHNOLOGY, manufacturing_process);
   setAttribute(art, ATTRIBUTE_FREQUENCY, max_frequency);
   setAttribute(art, ATTRIBUTE_STREAMINGMP, sms);
 
diff --git a/src/common/uarch.hpp b/src/common/uarch.hpp
index 56bfe9b..3dc4019 100644
--- a/src/common/uarch.hpp
+++ b/src/common/uarch.hpp
@@ -16,6 +16,9 @@ struct uarch {
   int32_t cc_minor;
   int32_t compute_capability;
 
+  // HSA specific
+  int32_t llvm_target;
+
   // Intel specific
   int32_t gt;
   int32_t eu;
diff --git a/src/hsa/chips.hpp b/src/hsa/chips.hpp
new file mode 100644
index 0000000..6de67fa
--- /dev/null
+++ b/src/hsa/chips.hpp
@@ -0,0 +1,44 @@
+#ifndef __HSA_GPUCHIPS__
+#define __HSA_GPUCHIPS__
+
+// uint32_t is used by the GPUCHIP typedef below; include it here so this
+// header does not depend on the includer having pulled it in first.
+#include <stdint.h>
+
+// Identifier of a GPU chip; holds one of the CHIP_* values below.
+typedef uint32_t GPUCHIP;
+
+// Known AMD chips. CHIP_UNKNOWN_HSA is the first enumerator (value 0); the
+// remaining entries are grouped by microarchitecture.
+enum {
+  CHIP_UNKNOWN_HSA,
+  // VEGA (TODO)
+  // ...
+  // RDNA
+  CHIP_NAVI_10,
+  CHIP_NAVI_12,
+  CHIP_NAVI_14,
+  // RDNA2
+  // There are way more (e.g. Oberon).
+  // Maybe we'll add them in the future.
+  CHIP_NAVI_21,
+  CHIP_NAVI_22,
+  CHIP_NAVI_23,
+  CHIP_NAVI_24,
+  // RDNA3
+  // There are way more as well.
+  // Supporting Navi only for now.
+  CHIP_NAVI_31,
+  CHIP_NAVI_32,
+  CHIP_NAVI_33,
+  // RDNA4
+  CHIP_NAVI_44,
+  CHIP_NAVI_48,
+  // CDNA
+  CHIP_ARCTURUS,      // MI100 series
+  CHIP_ALDEBARAN,     // MI200 series
+  CHIP_AQUA_VANJARAM, // MI300 series
+  CHIP_CDNA_NEXT      // MI350 series
+};
+
+#endif
diff --git a/src/hsa/hsa.cpp b/src/hsa/hsa.cpp
index 6e13506..74bafc7 100644
--- a/src/hsa/hsa.cpp
+++ b/src/hsa/hsa.cpp
@@ -12,6 +12,7 @@
 #include
 
 #include "hsa.hpp"
+#include "uarch.hpp"
 #include "../common/pci.hpp"
 #include "../common/global.hpp"
 #include "../common/uarch.hpp"
@@ -34,9 +35,8 @@ struct agent_info {
       snprintf(&(err_val[0]), sizeof(err_val), "%#x", (uint32_t)err); \
       err_str = &(err_val[0]); \
     } \
-    printErr("HSA failure at: %s:%d\n", \
-    __FILE__, __LINE__); \
-    printErr("Call returned %s\n", err_str); \
+    printErr("HSA failure at: %s:%d\n", __FILE__, __LINE__); \
+    printErr("Call returned %s\n", err_str); \
     return (err); \
   } \
 }
@@ -52,7 +52,6 @@ hsa_status_t agent_callback(hsa_agent_t agent, void *data) {
   err = hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, info->gpu_name);
   RET_IF_HSA_ERR(err);
 
-  // TODO: What if vendor_name is not AMD?
err = hsa_agent_get_info(agent, HSA_AGENT_INFO_VENDOR_NAME, info->vendor_name); RET_IF_HSA_ERR(err); @@ -92,11 +91,8 @@ struct gpu_info* get_gpu_info_hsa(struct pci_dev *devices, int gpu_idx) { return NULL; } - hsa_status_t status; - - // Initialize the HSA runtime - status = hsa_init(); - if (status != HSA_STATUS_SUCCESS) { + hsa_status_t err = hsa_init(); + if (err != HSA_STATUS_SUCCESS) { printErr("Failed to initialize HSA runtime"); return NULL; } @@ -105,23 +101,36 @@ struct gpu_info* get_gpu_info_hsa(struct pci_dev *devices, int gpu_idx) { info.deviceId = gpu_idx; // Iterate over all agents in the system - status = hsa_iterate_agents(agent_callback, &info); - if (status != HSA_STATUS_SUCCESS) { + err = hsa_iterate_agents(agent_callback, &info); + if (err != HSA_STATUS_SUCCESS) { printErr("Failed to iterate HSA agents"); hsa_shut_down(); return NULL; } - gpu->freq = info.max_clock_freq; + if (info.vendor_name != "AMD") { + printErr("HSA vendor name is: '%s'. Only AMD is supported!", info.vendor_name); + return NULL; + } gpu->vendor = GPU_VENDOR_AMD; + + gpu->freq = info.max_clock_freq; + gpu->topo_h = get_topology_info(info); gpu->name = (char *) emalloc(sizeof(char) * (strlen(info.device_mkt_name) + 1)); strcpy(gpu->name, info.device_mkt_name); - gpu->topo_h = get_topology_info(info); + gpu->arch = get_uarch_from_hsa(gpu); - // TODO: Use gpu_name for uarch detection + if (gpu->arch->TARGET_UNKNOWN_HSA) { + printErr("Unknown LLVM target: '%s'", gpu->name); + return NULL; + } // Shut down the HSA runtime - hsa_shut_down(); + err = hsa_shut_down(); + if (err != HSA_STATUS_SUCCESS) { + printErr("Failed to shutdown HSA runtime"); + return NULL; + } return gpu; } diff --git a/src/hsa/uarch.cpp b/src/hsa/uarch.cpp new file mode 100644 index 0000000..d58034b --- /dev/null +++ b/src/hsa/uarch.cpp @@ -0,0 +1,288 @@ +// MICROARCH values +enum { + UARCH_UNKNOWN, + // GCN (Graphics Core Next) + // Empty for now + // ... 
+ // RDNA (Radeon DNA) + UARCH_RDNA, + UARCH_RDNA2, + UARCH_RDNA3, + UARCH_RDNA4, + // CDNA (Compute DNA) + UARCH_CDNA, + UARCH_CDNA2, + UARCH_CDNA3, + UARCH_CDNA4 +}; + +static const char *uarch_str[] = { + /*[ARCH_UNKNOWN] = */ STRING_UNKNOWN, + /*[UARCH_RDNA] = */ "RDNA", + /*[UARCH_RDNA2] = */ "RDNA2", + /*[UARCH_RDNA3] = */ "RDNA3", + /*[UARCH_RDNA4] = */ "RDNA4", + /*[UARCH_CDNA] = */ "CDNA", + /*[UARCH_CDNA2] = */ "CDNA2", + /*[UARCH_CDNA3] = */ "CDNA3", + /*[UARCH_CDNA4] = */ "CDNA4", +}; + +// Sources: +// - https://rocm.docs.amd.com/en/latest/reference/gpu-arch-specs.html +// - https://www.techpowerup.com +// +// This is sometimes refered to as LLVM target, but also shader ISA. +// +// LLVM target *usually* maps to a specific architecture. However there +// are case where this is not true: +// MI8 is GCN3.0 with LLVM target gfx803 +// MI6 is GCN4.0 with LLVM target gfx803 +// or +// Strix Point can be gfx1150 or gfx1151 +// +// NOTE: GCN chips are stored for completeness, but they are +// not actively supported. +enum { + TARGET_UNKNOWN_HSA, + /// GCN (Graphics Core Next) + /// ------------------------ + // GCN 1.0 + TARGET_GFX600, + TARGET_GFX601, + TARGET_GFX602, + // GCN 2.0 + TARGET_GFX700, + TARGET_GFX701, + TARGET_GFX702, + TARGET_GFX703, + TARGET_GFX704, + TARGET_GFX705, + // GCN 3.0 / 4.0 + TARGET_GFX801, + TARGET_GFX802, + TARGET_GFX803, + TARGET_GFX805, + TARGET_GFX810, + // GCN 5.0 + TARGET_GFX900, + TARGET_GFX902, + TARGET_GFX904, + // GCN 5.1 + TARGET_GFX906, + // ??? + TARGET_GFX909, + TARGET_GFX90C, + /// RDNA (Radeon DNA) + /// ----------------- + // RDNA1 + TARGET_GFX1010, + TARGET_GFX1011, + TARGET_GFX1012, + // RDNA2 + TARGET_GFX1013, // Oberon + TARGET_GFX1030, + TARGET_GFX1031, + TARGET_GFX1032, + TARGET_GFX1033, + TARGET_GFX1034, + TARGET_GFX1035, // ?? + TARGET_GFX1036, // ?? + // RDNA3 + TARGET_GFX1100, + TARGET_GFX1101, + TARGET_GFX1102, + TARGET_GFX1103, // ??? 
+ // RDNA3.5 + TARGET_GFX1150, // Strix Point + TARGET_GFX1151, // Strix Halo / Strix Point + TARGET_GFX1152, // Krackan Point + TARGET_GFX1153, // ??? + // RDNA4 + TARGET_GFX1200, + TARGET_GFX1201, + TARGET_GFX1250, // ??? + TARGET_GFX1251, // ??? + /// CDNA (Compute DNA) + /// ------------------ + // CDNA + TARGET_GFX908, + // CDNA2 + TARGET_GFX90A, + // CDNA3 + TARGET_GFX942, + // CDNA4 + TARGET_GFX950 +}; + +#define CHECK_UARCH_START if (false) {} +#define CHECK_UARCH(arch, chip_, str, uarch, process) \ + else if (arch->chip == chip_) fill_uarch(arch, str, uarch, process); +#define CHECK_UARCH_END else { if(arch->chip != CHIP_UNKNOWN_CUDA) printBug("map_chip_to_uarch_hsa: Unknown chip id: %d", arch->chip); fill_uarch(arch, STRING_UNKNOWN, UARCH_UNKNOWN, UNK); } + +void fill_uarch(struct uarch* arch, char const *str, MICROARCH u, uint32_t process) { + arch->chip_str = (char *) emalloc(sizeof(char) * (strlen(str)+1)); + strcpy(arch->chip_str, str); + arch->uarch = u; + arch->process = process; +} + +// On chiplet based chips (such as Navi31, Navi32, etc), +// we have 2 different processes: The MCD process and the +// rest of the chip process. They might be different and here +// we just take one - let's take MCD process for now. +// +// TODO: Should we differentiate? 
+//
+// Fills in the chip name, microarchitecture and manufacturing process of
+// `arch` based on arch->chip, which must already be set by the caller.
+// A chip with no entry below is handled by CHECK_UARCH_END.
+void map_chip_to_uarch_hsa(struct uarch* arch) {
+  CHECK_UARCH_START
+
+  // RDNA
+  CHECK_UARCH(arch, CHIP_NAVI_10, "Navi 10", UARCH_RDNA, 7)
+  CHECK_UARCH(arch, CHIP_NAVI_12, "Navi 12", UARCH_RDNA, 7)
+  CHECK_UARCH(arch, CHIP_NAVI_14, "Navi 14", UARCH_RDNA, 7)
+  CHECK_UARCH(arch, CHIP_NAVI_21, "Navi 21", UARCH_RDNA2, 7)
+  CHECK_UARCH(arch, CHIP_NAVI_22, "Navi 22", UARCH_RDNA2, 7)
+  CHECK_UARCH(arch, CHIP_NAVI_23, "Navi 23", UARCH_RDNA2, 7)
+  CHECK_UARCH(arch, CHIP_NAVI_24, "Navi 24", UARCH_RDNA2, 6)
+  CHECK_UARCH(arch, CHIP_NAVI_31, "Navi 31", UARCH_RDNA3, 6)
+  CHECK_UARCH(arch, CHIP_NAVI_32, "Navi 32", UARCH_RDNA3, 6)
+  CHECK_UARCH(arch, CHIP_NAVI_33, "Navi 33", UARCH_RDNA3, 6)
+  CHECK_UARCH(arch, CHIP_NAVI_44, "Navi 44", UARCH_RDNA4, 4)
+  CHECK_UARCH(arch, CHIP_NAVI_48, "Navi 48", UARCH_RDNA4, 4)
+  // CDNA
+  // NOTE: We will not show chip name for CDNA, thus use empty str
+  CHECK_UARCH(arch, CHIP_ARCTURUS, "", UARCH_CDNA, 7)
+  CHECK_UARCH(arch, CHIP_ALDEBARAN, "", UARCH_CDNA2, 6)
+  CHECK_UARCH(arch, CHIP_AQUA_VANJARAM, "", UARCH_CDNA3, 6)
+  CHECK_UARCH(arch, CHIP_CDNA_NEXT, "", UARCH_CDNA4, 6) // big difference between MCD and rest of the chip process
+
+  CHECK_UARCH_END
+}
+
+// Building blocks of the if / else-if chain in get_chip_from_target_hsa:
+// START opens the chain, each CHECK_TGT returns `chip` when `target` equals
+// `llvm_target`, and END reports the unmatched target through printBug and
+// returns CHIP_UNKNOWN_HSA.
+#define CHECK_TGT_START if (false) {}
+#define CHECK_TGT(target, llvm_target, chip) \
+  else if (target == llvm_target) return chip;
+#define CHECK_TGT_END else { printBug("LLVM target '%d' has no matching chip", target); return CHIP_UNKNOWN_HSA; }
+
+// We have at least 2 choices to infer the chip:
+//
+// - LLVM target (e.g., gfx1101 is Navi 32)
+// - PCI ID (e.g., 0x7470 is Navi 32)
+//
+// For now we will use the first approach, which seems to have
+// some issues like mentioned in the enum.
+// However PCI detection is also not perfect, since it is
+// quite hard to find PCI ids from old hardware.
+//
+// Returns the CHIP_* value for an LLVM target given as a TARGET_* enum
+// value. Targets without an active entry below end up in CHECK_TGT_END,
+// which reports them through printBug and returns CHIP_UNKNOWN_HSA.
+GPUCHIP get_chip_from_target_hsa(int32_t target) {
+  CHECK_TGT_START
+  /// RDNA
+  /// -------------------------------------------
+  CHECK_TGT(target, TARGET_GFX1010, CHIP_NAVI_10)
+  CHECK_TGT(target, TARGET_GFX1011, CHIP_NAVI_12)
+  CHECK_TGT(target, TARGET_GFX1012, CHIP_NAVI_14)
+  // CHECK_TGT(target, TARGET_GFX1013, TODO)
+  /// RDNA2
+  /// -------------------------------------------
+  CHECK_TGT(target, TARGET_GFX1030, CHIP_NAVI_21)
+  CHECK_TGT(target, TARGET_GFX1031, CHIP_NAVI_22)
+  CHECK_TGT(target, TARGET_GFX1032, CHIP_NAVI_23)
+  // NOTE(review): gfx1033 is mapped to the same chip as gfx1030. As far as
+  // I know gfx1033 is an APU target (Van Gogh), not Navi 21 - confirm
+  // against the LLVM AMDGPU processor table.
+  CHECK_TGT(target, TARGET_GFX1033, CHIP_NAVI_21)
+  CHECK_TGT(target, TARGET_GFX1034, CHIP_NAVI_24)
+  // CHECK_TGT(target, TARGET_GFX1035, TODO)
+  // CHECK_TGT(target, TARGET_GFX1036, TODO)
+  /// RDNA3
+  /// -------------------------------------------
+  CHECK_TGT(target, TARGET_GFX1100, CHIP_NAVI_31)
+  CHECK_TGT(target, TARGET_GFX1101, CHIP_NAVI_32)
+  CHECK_TGT(target, TARGET_GFX1102, CHIP_NAVI_33)
+  // CHECK_TGT(target, TARGET_GFX1103, TODO)
+  /// RDNA3.5
+  /// -------------------------------------------
+  // CHECK_TGT(target, TARGET_GFX1150, TODO)
+  // CHECK_TGT(target, TARGET_GFX1151, TODO)
+  // CHECK_TGT(target, TARGET_GFX1152, TODO)
+  // CHECK_TGT(target, TARGET_GFX1153, TODO)
+  /// RDNA4
+  /// -------------------------------------------
+  CHECK_TGT(target, TARGET_GFX1200, CHIP_NAVI_44)
+  CHECK_TGT(target, TARGET_GFX1201, CHIP_NAVI_48)
+  // CHECK_TGT(target, TARGET_GFX1250, TODO)
+  // CHECK_TGT(target, TARGET_GFX1251, TODO)
+  /// CDNA
+  /// -------------------------------------------
+  CHECK_TGT(target, TARGET_GFX908, CHIP_ARCTURUS)
+  /// CDNA2
+  /// -------------------------------------------
+  CHECK_TGT(target, TARGET_GFX90A, CHIP_ALDEBARAN)
+  /// CDNA3
+  /// -------------------------------------------
+  CHECK_TGT(target, TARGET_GFX942, CHIP_AQUA_VANJARAM)
+  /// CDNA4
+  /// -------------------------------------------
+  CHECK_TGT(target, TARGET_GFX950, CHIP_CDNA_NEXT)
+  CHECK_TGT_END
+}
+
+// Opens the if / else-if chain used by get_llvm_target_from_str.
+#define CHECK_TGT_STR_START if (false) {}
+// CHECK_TGT_STR compares the *contents* of the two strings: `target` is a
+// char* and `llvm_target` a string literal, so == would only compare
+// addresses and never match.
+#define CHECK_TGT_STR(target, llvm_target, chip) \
+  else if (strcmp(target, llvm_target) == 0) return chip;
+#define CHECK_TGT_STR_END else { return TARGET_UNKNOWN_HSA; }
+
+// Maps the LLVM target string (e.g. "gfx1100") to the matching TARGET_*
+// enum value. Returns TARGET_UNKNOWN_HSA when `target` is NULL or is not
+// one of the names listed below.
+int32_t get_llvm_target_from_str(char* target) {
+  // TODO: Autogenerate this
+  // TODO: Add all, not only the ones we support in get_chip_from_target_hsa
+  if (target == NULL) {
+    return TARGET_UNKNOWN_HSA;
+  }
+
+  CHECK_TGT_STR_START
+  CHECK_TGT_STR(target, "gfx1010", TARGET_GFX1010)
+  CHECK_TGT_STR(target, "gfx1011", TARGET_GFX1011)
+  CHECK_TGT_STR(target, "gfx1012", TARGET_GFX1012)
+  CHECK_TGT_STR(target, "gfx1013", TARGET_GFX1013)
+  CHECK_TGT_STR(target, "gfx1030", TARGET_GFX1030)
+  CHECK_TGT_STR(target, "gfx1031", TARGET_GFX1031)
+  CHECK_TGT_STR(target, "gfx1032", TARGET_GFX1032)
+  CHECK_TGT_STR(target, "gfx1033", TARGET_GFX1033)
+  CHECK_TGT_STR(target, "gfx1034", TARGET_GFX1034)
+  CHECK_TGT_STR(target, "gfx1035", TARGET_GFX1035)
+  CHECK_TGT_STR(target, "gfx1036", TARGET_GFX1036)
+  CHECK_TGT_STR(target, "gfx1100", TARGET_GFX1100)
+  CHECK_TGT_STR(target, "gfx1101", TARGET_GFX1101)
+  CHECK_TGT_STR(target, "gfx1102", TARGET_GFX1102)
+  CHECK_TGT_STR(target, "gfx1103", TARGET_GFX1103)
+  CHECK_TGT_STR(target, "gfx1200", TARGET_GFX1200)
+  CHECK_TGT_STR(target, "gfx1201", TARGET_GFX1201)
+  CHECK_TGT_STR(target, "gfx1250", TARGET_GFX1250)
+  CHECK_TGT_STR(target, "gfx1251", TARGET_GFX1251)
+  CHECK_TGT_STR(target, "gfx908", TARGET_GFX908)
+  CHECK_TGT_STR(target, "gfx90a", TARGET_GFX90A)
+  CHECK_TGT_STR(target, "gfx942", TARGET_GFX942)
+  CHECK_TGT_STR(target, "gfx950", TARGET_GFX950)
+  CHECK_TGT_STR_END
+}
+
+// Allocates the uarch description for `gpu`.
+//
+// When the LLVM target cannot be identified the struct is still returned,
+// with llvm_target == TARGET_UNKNOWN_HSA and no chip/uarch information, so
+// that the caller can report the error. The result is released with
+// free_uarch_struct.
+//
+// NOTE(review): the target is looked up from gpu->name; confirm that the
+// caller stores the LLVM target name there rather than the marketing name.
+struct uarch* get_uarch_from_hsa(struct gpu_info* gpu) {
+  struct uarch* arch = (struct uarch*) emalloc(sizeof(struct uarch));
+
+  // Give every field read later a defined value up front, so that the early
+  // return below never hands free_uarch_struct an uninitialised pointer.
+  arch->uarch_str = NULL;
+  arch->chip_str = NULL;
+  arch->uarch = UARCH_UNKNOWN;
+  arch->chip = CHIP_UNKNOWN_HSA;
+
+  arch->llvm_target = get_llvm_target_from_str(gpu->name);
+  if (arch->llvm_target == TARGET_UNKNOWN_HSA) {
+    // Return early, error will be handled by the caller.
+    return arch;
+  }
+
+  arch->chip = get_chip_from_target_hsa(arch->llvm_target);
+  map_chip_to_uarch_hsa(arch);
+
+  return arch;
+}
+
+// Returns the printable microarchitecture name of `arch`. A NULL `arch` or
+// an out-of-range arch->uarch yields the UARCH_UNKNOWN entry.
+// The returned string belongs to the static uarch_str table: it must not be
+// modified or freed.
+char* get_str_uarch_hsa(struct uarch* arch) {
+  const size_t count = sizeof(uarch_str) / sizeof(uarch_str[0]);
+  size_t idx = UARCH_UNKNOWN;
+
+  if (arch != NULL && (size_t) arch->uarch < count) {
+    idx = (size_t) arch->uarch;
+  }
+
+  // The table is const char*; the declared return type is char*.
+  return const_cast<char*>(uarch_str[idx]);
+}
+
+// Frees `arch` and the strings it owns; NULL is accepted.
+// TODO: Move this to common!
+void free_uarch_struct(struct uarch* arch) {
+  if (arch == NULL) {
+    return;
+  }
+
+  free(arch->uarch_str);
+  free(arch->chip_str);
+  free(arch);
+}
diff --git a/src/hsa/uarch.hpp b/src/hsa/uarch.hpp
new file mode 100644
index 0000000..c08dcd5
--- /dev/null
+++ b/src/hsa/uarch.hpp
@@ -0,0 +1,19 @@
+#ifndef __HSA_UARCH__
+#define __HSA_UARCH__
+
+#include "../common/gpu.hpp"
+
+struct uarch;
+
+// Allocates the uarch description of `gpu`. When the LLVM target is not
+// recognised, the result only carries llvm_target == TARGET_UNKNOWN_HSA (0).
+// Release the result with free_uarch_struct().
+struct uarch* get_uarch_from_hsa(struct gpu_info* gpu);
+// Returns the microarchitecture name. The string lives in a static table:
+// callers must not modify or free it.
+char* get_str_uarch_hsa(struct uarch* arch);
+char* get_str_process(struct uarch* arch); // TODO: Shouldn't we define this in the cpp?
+// Frees `arch` and the strings it owns; NULL is accepted.
+void free_uarch_struct(struct uarch* arch);
+
+#endif