[v0.30][AMD] Add support to fetch bus width, global memory and LDS size

We can use hsa_amd_agent_iterate_memory_pools to fetch info about GPU
memory pools in the GPU. HSA_AMD_SEGMENT_GROUP seems to be LDS, and
HSA_AMD_SEGMENT_GLOBAL seems to be global memory.

However, the latter is reported multiple times (I don't know why). The
only solution I found for this is to check for the
HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED flag, which
seems to be reported only once.

For bus width, we simply use HSA_AMD_AGENT_INFO_MEMORY_WIDTH.
This commit is contained in:
Dr-Noob
2025-10-23 21:21:18 +02:00
parent 82ea16fc3d
commit 78d34e71f1
4 changed files with 93 additions and 2 deletions

View File

@@ -101,6 +101,17 @@ char* get_str_bus_width(struct gpu_info* gpu) {
return string;
}
char* get_str_lds_size(struct gpu_info* gpu) {
// TODO: Show XX KB (XX MB Total) like in cpufetch
uint32_t size = 3+1+3+1;
assert(strlen(STRING_UNKNOWN)+1 <= size);
char* string = (char *) ecalloc(size, sizeof(char));
sprintf(string, "%d KB", gpu->mem->lds_size / 1024);
return string;
}
char* get_str_memory_clock(struct gpu_info* gpu) {
return get_freq_as_str_mhz(gpu->mem->freq);
}

View File

@@ -61,6 +61,7 @@ struct memory {
int32_t bus_width;
int32_t freq;
int32_t clk_mul; // clock multiplier
int32_t lds_size; // HSA specific for now
};
struct gpu_info {
@@ -88,6 +89,7 @@ char* get_str_freq(struct gpu_info* gpu);
char* get_str_memory_size(struct gpu_info* gpu);
char* get_str_memory_type(struct gpu_info* gpu);
char* get_str_bus_width(struct gpu_info* gpu);
char* get_str_lds_size(struct gpu_info* gpu);
char* get_str_memory_clock(struct gpu_info* gpu);
char* get_str_l2(struct gpu_info* gpu);
char* get_str_peak_performance(struct gpu_info* gpu);

View File

@@ -48,14 +48,15 @@ enum {
ATTRIBUTE_FREQUENCY, // ALL
ATTRIBUTE_PEAK, // ALL
ATTRIBUTE_COMPUTE_UNITS, // HSA
ATTRIBUTE_LDS_SIZE, // HSA
ATTRIBUTE_STREAMINGMP, // CUDA
ATTRIBUTE_CORESPERMP, // CUDA
ATTRIBUTE_CUDA_CORES, // CUDA
ATTRIBUTE_TENSOR_CORES, // CUDA
ATTRIBUTE_L2, // CUDA
ATTRIBUTE_MEMORY, // CUDA
ATTRIBUTE_MEMORY, // CUDA,HSA
ATTRIBUTE_MEMORY_FREQ, // CUDA
ATTRIBUTE_BUS_WIDTH, // CUDA
ATTRIBUTE_BUS_WIDTH, // CUDA,HSA
ATTRIBUTE_PEAK_TENSOR, // CUDA
ATTRIBUTE_EUS, // Intel
ATTRIBUTE_GT, // Intel
@@ -69,6 +70,7 @@ static const AttributeField ATTRIBUTE_INFO[] = {
{ ATTRIBUTE_FREQUENCY, "Max Frequency:", "Max Freq.:" },
{ ATTRIBUTE_PEAK, "Peak Performance:", "Peak Perf.:" },
{ ATTRIBUTE_COMPUTE_UNITS, "Compute Units (CUs):", "CUs" },
{ ATTRIBUTE_LDS_SIZE, "LDS size:", "LDS:" },
{ ATTRIBUTE_STREAMINGMP, "SMs:", "SMs:" },
{ ATTRIBUTE_CORESPERMP, "Cores/SM:", "Cores/SM:" },
{ ATTRIBUTE_CUDA_CORES, "CUDA Cores:", "CUDA Cores:" },
@@ -487,6 +489,9 @@ bool print_gpufetch_amd(struct gpu_info* gpu, STYLE s, struct color** cs, struct
char* manufacturing_process = get_str_process(gpu->arch);
char* cus = get_str_cu(gpu);
char* max_frequency = get_str_freq(gpu);
char* bus_width = get_str_bus_width(gpu);
char* mem_size = get_str_memory_size(gpu);
char* lds_size = get_str_lds_size(gpu);
setAttribute(art, ATTRIBUTE_NAME, gpu_name);
if (gpu_chip != NULL) {
@@ -496,6 +501,9 @@ bool print_gpufetch_amd(struct gpu_info* gpu, STYLE s, struct color** cs, struct
setAttribute(art, ATTRIBUTE_TECHNOLOGY, manufacturing_process);
setAttribute(art, ATTRIBUTE_FREQUENCY, max_frequency);
setAttribute(art, ATTRIBUTE_COMPUTE_UNITS, cus);
setAttribute(art, ATTRIBUTE_LDS_SIZE, lds_size);
setAttribute(art, ATTRIBUTE_MEMORY, mem_size);
setAttribute(art, ATTRIBUTE_BUS_WIDTH, bus_width);
bool use_short = false;
uint32_t longest_attribute = longest_attribute_length(art, use_short);

View File

@@ -23,6 +23,9 @@ struct agent_info {
char device_mkt_name[64];
uint32_t max_clock_freq;
uint32_t compute_unit;
uint32_t bus_width;
uint32_t lds_size;
uint64_t global_size;
};
#define RET_IF_HSA_ERR(err) { \
@@ -40,6 +43,51 @@ struct agent_info {
} \
}
hsa_status_t memory_pool_callback(hsa_amd_memory_pool_t pool, void* data) {
struct agent_info* info = reinterpret_cast<struct agent_info *>(data);
hsa_amd_segment_t segment;
hsa_status_t err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment);
RET_IF_HSA_ERR(err);
if (segment == HSA_AMD_SEGMENT_GROUP) {
// LDS memory
// We want to make sure that this memory pool is not repeated.
if (info->lds_size != 0) {
printErr("Found HSA_AMD_SEGMENT_GROUP twice!");
return HSA_STATUS_ERROR;
}
uint32_t size = 0;
err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SIZE, &size);
RET_IF_HSA_ERR(err);
info->lds_size = size;
}
else if (segment == HSA_AMD_SEGMENT_GLOBAL) {
// Global memory
uint32_t global_flags = 0;
err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flags);
RET_IF_HSA_ERR(err);
if (global_flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED) {
if (info->global_size != 0) {
printErr("Found HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED twice!");
return HSA_STATUS_ERROR;
}
uint64_t size = 0;
err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SIZE, &size);
RET_IF_HSA_ERR(err);
info->global_size = size;
}
}
return HSA_STATUS_SUCCESS;
}
hsa_status_t agent_callback(hsa_agent_t agent, void *data) {
struct agent_info* info = reinterpret_cast<struct agent_info *>(data);
@@ -62,6 +110,17 @@ hsa_status_t agent_callback(hsa_agent_t agent, void *data) {
err = hsa_agent_get_info(agent, (hsa_agent_info_t) HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &info->compute_unit);
RET_IF_HSA_ERR(err);
// According to the documentation, this is deprecated. But what should I be using then?
err = hsa_agent_get_info(agent, (hsa_agent_info_t) HSA_AMD_AGENT_INFO_MEMORY_WIDTH, &info->bus_width);
RET_IF_HSA_ERR(err);
// We will check against zero to see if it was set beforehand.
info->global_size = 0;
info->lds_size = 0;
// This will fill global_size and lds_size.
err = hsa_amd_agent_iterate_memory_pools(agent, memory_pool_callback, data);
RET_IF_HSA_ERR(err);
}
return HSA_STATUS_SUCCESS;
@@ -75,6 +134,16 @@ struct topology_h* get_topology_info(struct agent_info info) {
return topo;
}
struct memory* get_memory_info(struct gpu_info* gpu, struct agent_info info) {
struct memory* mem = (struct memory*) emalloc(sizeof(struct memory));
mem->bus_width = info.bus_width;
mem->lds_size = info.lds_size;
mem->size_bytes = info.global_size;
return mem;
}
struct gpu_info* get_gpu_info_hsa(int gpu_idx) {
struct gpu_info* gpu = (struct gpu_info*) emalloc(sizeof(struct gpu_info));
gpu->pci = NULL;
@@ -118,6 +187,7 @@ struct gpu_info* get_gpu_info_hsa(int gpu_idx) {
gpu->name = (char *) emalloc(sizeof(char) * (strlen(info.device_mkt_name) + 1));
strcpy(gpu->name, info.device_mkt_name);
gpu->arch = get_uarch_from_hsa(gpu, info.gpu_name);
gpu->mem = get_memory_info(gpu, info);
if (gpu->arch == NULL) {
return NULL;