For XCDs, we don't show them if the GPU is made of a single XCD, as it adds little value. For matrix cores, we assume the count can be computed as compute_units * simds_per_cu; this seems to work for the GPUs I checked from CDNA3 and RDNA3. I'm not sure what would happen for older GPUs that do not have matrix cores, though.
242 lines
7.7 KiB
C++
242 lines
7.7 KiB
C++
#include <iostream>
|
|
#include <hsa/hsa.h>
|
|
#include <hsa/hsa_ext_amd.h>
|
|
|
|
#include <cstring>
|
|
#include <cstdlib>
|
|
#include <cstdio>
|
|
|
|
#include <iostream>
|
|
#include <iomanip>
|
|
#include <hsa/hsa.h>
|
|
#include <hsa/hsa_ext_amd.h>
|
|
|
|
#include "hsa.hpp"
|
|
#include "uarch.hpp"
|
|
#include "../common/global.hpp"
|
|
#include "../common/uarch.hpp"
|
|
|
|
// Scratch record filled in by the HSA agent / memory-pool callbacks in this
// file and later copied into the generic gpu_info structures.
struct agent_info {
  unsigned deviceId;        // ID of the target GPU device

  // Identification strings reported by the agent (fixed 64-byte buffers)
  char gpu_name[64];        // HSA_AGENT_INFO_NAME
  char vendor_name[64];     // HSA_AGENT_INFO_VENDOR_NAME
  char device_mkt_name[64]; // HSA_AMD_AGENT_INFO_PRODUCT_NAME

  uint32_t max_clock_freq;  // HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY

  // Memory
  uint32_t bus_width;       // HSA_AMD_AGENT_INFO_MEMORY_WIDTH
  uint32_t lds_size;        // Size of the GROUP-segment memory pool (LDS)
  uint64_t global_size;     // Size of the selected GLOBAL-segment memory pool

  // Topology
  uint32_t compute_unit;       // HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT
  uint32_t num_shader_engines; // HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES
  uint32_t simds_per_cu;       // HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU
  uint32_t num_xcc;            // Accelerator Complex Dies (XCDs)
  uint32_t matrix_cores;       // Cores with WMMA/MFMA capabilities
};
|
|
|
|
// Checks an HSA status code. On failure it logs the call site, logs the
// runtime's textual description of the status (or its hexadecimal value when
// no description is available), and returns the status from the ENCLOSING
// function, which must therefore return hsa_status_t.
//
// The argument is evaluated exactly once, and the expansion is wrapped in
// do { ... } while (0) so that `RET_IF_HSA_ERR(x);` is a single statement and
// stays correct inside an unbraced if/else.
#define RET_IF_HSA_ERR(err) do { \
  const hsa_status_t ret_if_hsa_status_ = (err); \
  if (ret_if_hsa_status_ != HSA_STATUS_SUCCESS) { \
    char ret_if_hsa_hex_[12]; \
    const char* ret_if_hsa_msg_ = NULL; \
    if (hsa_status_string(ret_if_hsa_status_, \
                          &ret_if_hsa_msg_) != HSA_STATUS_SUCCESS) { \
      /* No description available: fall back to the raw value. */ \
      snprintf(ret_if_hsa_hex_, sizeof(ret_if_hsa_hex_), "%#x", \
               (uint32_t) ret_if_hsa_status_); \
      ret_if_hsa_msg_ = ret_if_hsa_hex_; \
    } \
    printErr("HSA failure at: %s:%d\n", __FILE__, __LINE__); \
    printErr("Call returned %s\n", ret_if_hsa_msg_); \
    return ret_if_hsa_status_; \
  } \
} while (0)
|
|
|
|
// Callback for hsa_amd_agent_iterate_memory_pools().
//
// `data` must point to a struct agent_info. For the GROUP segment pool the
// pool size is stored in info->lds_size; for a GLOBAL segment pool whose
// flags include HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED
// the pool size is stored in info->global_size. Pools of any other kind are
// ignored.
//
// Both fields are expected to be zero before the iteration starts: a non-zero
// value is interpreted as "this pool kind was already seen" and reported as
// an error.
//
// Returns HSA_STATUS_SUCCESS to continue the iteration, or an error status
// (which stops it) when a query fails or a pool kind is found twice.
hsa_status_t memory_pool_callback(hsa_amd_memory_pool_t pool, void* data) {
  struct agent_info* info = reinterpret_cast<struct agent_info *>(data);

  hsa_amd_segment_t segment;
  hsa_status_t err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment);
  RET_IF_HSA_ERR(err);

  if (segment == HSA_AMD_SEGMENT_GROUP) {
    // LDS memory
    // We want to make sure that this memory pool is not repeated.
    if (info->lds_size != 0) {
      printErr("Found HSA_AMD_SEGMENT_GROUP twice!");
      return HSA_STATUS_ERROR;
    }

    // HSA_AMD_MEMORY_POOL_INFO_SIZE is documented as a size_t attribute, so
    // the runtime writes sizeof(size_t) bytes. Receiving it in a uint32_t
    // would overflow the variable on 64-bit targets.
    size_t size = 0;

    err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SIZE, &size);
    RET_IF_HSA_ERR(err);

    info->lds_size = (uint32_t) size;
  }
  else if (segment == HSA_AMD_SEGMENT_GLOBAL) {
    // Global memory
    uint32_t global_flags = 0;

    err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flags);
    RET_IF_HSA_ERR(err);

    if (global_flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED) {
      if (info->global_size != 0) {
        printErr("Found HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED twice!");
        return HSA_STATUS_ERROR;
      }

      // Same attribute as above: query it with the documented type.
      size_t size = 0;

      err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SIZE, &size);
      RET_IF_HSA_ERR(err);

      info->global_size = (uint64_t) size;
    }
  }

  return HSA_STATUS_SUCCESS;
}
|
|
|
|
// Callback for hsa_iterate_agents().
//
// `data` must point to a struct agent_info. Agents that are not GPUs are
// skipped. For a GPU agent the name, vendor, product name, clock, topology
// and memory fields of the structure are filled in.
//
// Note that every GPU agent writes into the same structure, so when this is
// invoked for more than one GPU the values of the last one visited remain.
//
// Returns HSA_STATUS_SUCCESS to continue the iteration, or the failing
// status of the first query that did not succeed.
hsa_status_t agent_callback(hsa_agent_t agent, void *data) {
  struct agent_info* out = reinterpret_cast<struct agent_info *>(data);

  hsa_device_type_t dev_type;
  hsa_status_t status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &dev_type);
  RET_IF_HSA_ERR(status);

  // Only GPU agents are of interest; let the iteration continue otherwise.
  if (type_is_not_gpu:
      dev_type != HSA_DEVICE_TYPE_GPU) {
    return HSA_STATUS_SUCCESS;
  }

  // Identification
  status = hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, out->gpu_name);
  RET_IF_HSA_ERR(status);

  status = hsa_agent_get_info(agent, HSA_AGENT_INFO_VENDOR_NAME, out->vendor_name);
  RET_IF_HSA_ERR(status);

  status = hsa_agent_get_info(agent, (hsa_agent_info_t) HSA_AMD_AGENT_INFO_PRODUCT_NAME, &out->device_mkt_name);
  RET_IF_HSA_ERR(status);

  // Clock and compute resources
  status = hsa_agent_get_info(agent, (hsa_agent_info_t) HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, &out->max_clock_freq);
  RET_IF_HSA_ERR(status);

  status = hsa_agent_get_info(agent, (hsa_agent_info_t) HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &out->compute_unit);
  RET_IF_HSA_ERR(status);

  // According to the documentation, this is deprecated. But what should I be using then?
  status = hsa_agent_get_info(agent, (hsa_agent_info_t) HSA_AMD_AGENT_INFO_MEMORY_WIDTH, &out->bus_width);
  RET_IF_HSA_ERR(status);

  // Topology
  status = hsa_agent_get_info(agent, (hsa_agent_info_t) HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES, &out->num_shader_engines);
  RET_IF_HSA_ERR(status);

  status = hsa_agent_get_info(agent, (hsa_agent_info_t) HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU, &out->simds_per_cu);
  RET_IF_HSA_ERR(status);

  status = hsa_agent_get_info(agent, (hsa_agent_info_t) HSA_AMD_AGENT_INFO_NUM_XCC, &out->num_xcc);
  RET_IF_HSA_ERR(status);

  // The memory pool callback treats a non-zero value as "already set", so
  // both sizes are cleared before walking the pools of this agent.
  out->lds_size = 0;
  out->global_size = 0;

  // This will fill global_size and lds_size.
  status = hsa_amd_agent_iterate_memory_pools(agent, memory_pool_callback, data);
  RET_IF_HSA_ERR(status);

  return HSA_STATUS_SUCCESS;
}
|
|
|
|
struct topology_h* get_topology_info(struct agent_info info) {
|
|
struct topology_h* topo = (struct topology_h*) emalloc(sizeof(struct topology_h));
|
|
|
|
topo->compute_units = info.compute_unit;
|
|
topo->num_shader_engines = info.num_shader_engines; // not printed at the moment
|
|
topo->simds_per_cu = info.simds_per_cu; // not printed at the moment
|
|
topo->num_xcc = info.num_xcc;
|
|
// Old GPUs (GCN I guess) might not have matrix cores.
|
|
// Not sure what would happen here?
|
|
topo->matrix_cores = topo->compute_units * topo->simds_per_cu;
|
|
|
|
return topo;
|
|
}
|
|
|
|
struct memory* get_memory_info(struct gpu_info* gpu, struct agent_info info) {
|
|
struct memory* mem = (struct memory*) emalloc(sizeof(struct memory));
|
|
|
|
mem->bus_width = info.bus_width;
|
|
mem->lds_size = info.lds_size;
|
|
mem->size_bytes = info.global_size;
|
|
|
|
return mem;
|
|
}
|
|
|
|
struct gpu_info* get_gpu_info_hsa(int gpu_idx) {
|
|
struct gpu_info* gpu = (struct gpu_info*) emalloc(sizeof(struct gpu_info));
|
|
gpu->pci = NULL;
|
|
gpu->idx = gpu_idx;
|
|
|
|
if(gpu->idx < 0) {
|
|
printErr("GPU index must be equal or greater than zero");
|
|
return NULL;
|
|
}
|
|
|
|
if(gpu->idx > 0) {
|
|
// Currently we only support fetching GPU 0.
|
|
return NULL;
|
|
}
|
|
|
|
hsa_status_t err = hsa_init();
|
|
if (err != HSA_STATUS_SUCCESS) {
|
|
printErr("Failed to initialize HSA runtime");
|
|
return NULL;
|
|
}
|
|
|
|
struct agent_info info;
|
|
info.deviceId = gpu_idx;
|
|
|
|
// Iterate over all agents in the system
|
|
err = hsa_iterate_agents(agent_callback, &info);
|
|
if (err != HSA_STATUS_SUCCESS) {
|
|
printErr("Failed to iterate HSA agents");
|
|
hsa_shut_down();
|
|
return NULL;
|
|
}
|
|
|
|
if (strcmp(info.vendor_name, "AMD") != 0) {
|
|
printErr("HSA vendor name is: '%s'. Only AMD is supported!", info.vendor_name);
|
|
return NULL;
|
|
}
|
|
gpu->vendor = GPU_VENDOR_AMD;
|
|
|
|
gpu->freq = info.max_clock_freq;
|
|
gpu->topo_h = get_topology_info(info);
|
|
gpu->name = (char *) emalloc(sizeof(char) * (strlen(info.device_mkt_name) + 1));
|
|
strcpy(gpu->name, info.device_mkt_name);
|
|
gpu->arch = get_uarch_from_hsa(gpu, info.gpu_name);
|
|
gpu->mem = get_memory_info(gpu, info);
|
|
|
|
if (gpu->arch == NULL) {
|
|
return NULL;
|
|
}
|
|
|
|
// Shut down the HSA runtime
|
|
err = hsa_shut_down();
|
|
if (err != HSA_STATUS_SUCCESS) {
|
|
printErr("Failed to shutdown HSA runtime");
|
|
return NULL;
|
|
}
|
|
return gpu;
|
|
}
|
|
|
|
char* get_str_cu(struct gpu_info* gpu) {
|
|
return get_str_generic(gpu->topo_h->compute_units);
|
|
}
|
|
|
|
char* get_str_xcds(struct gpu_info* gpu) {
|
|
// If there is a single XCD, then we dont want to
|
|
// print it.
|
|
if (gpu->topo_h->num_xcc == 1) {
|
|
return NULL;
|
|
}
|
|
return get_str_generic(gpu->topo_h->num_xcc);
|
|
}
|
|
|
|
char* get_str_matrix_cores(struct gpu_info* gpu) {
|
|
// TODO: Show XX (WMMA/MFMA)
|
|
return get_str_generic(gpu->topo_h->matrix_cores);
|
|
} |