diff --git a/src/common/gpu.cpp b/src/common/gpu.cpp index e66f2c7..ed00694 100644 --- a/src/common/gpu.cpp +++ b/src/common/gpu.cpp @@ -116,17 +116,17 @@ char* get_str_l2(struct gpu_info* gpu) { return string; } -char* get_str_peak_performance(struct gpu_info* gpu) { +char* get_str_peak_performance_generic(int64_t pp) { char* str; - if(gpu->peak_performance == -1) { + if(pp == -1) { str = (char *) emalloc(sizeof(char) * (strlen(STRING_UNKNOWN) + 1)); strncpy(str, STRING_UNKNOWN, strlen(STRING_UNKNOWN) + 1); return str; } // 7 for digits (e.g, XXXX.XX), 7 for XFLOP/s - double flopsd = (double) gpu->peak_performance; + double flopsd = (double) pp; uint32_t max_size = 7+1+7+1; str = (char *) ecalloc(max_size, sizeof(char)); @@ -139,3 +139,12 @@ char* get_str_peak_performance(struct gpu_info* gpu) { return str; } + +char* get_str_peak_performance(struct gpu_info* gpu) { + return get_str_peak_performance_generic(gpu->peak_performance); +} + +char* get_str_peak_performance_tensor(struct gpu_info* gpu) { + return get_str_peak_performance_generic(gpu->peak_performance_t); +} + diff --git a/src/common/gpu.hpp b/src/common/gpu.hpp index 9632ffb..2928a11 100644 --- a/src/common/gpu.hpp +++ b/src/common/gpu.hpp @@ -61,6 +61,7 @@ struct gpu_info { struct memory* mem; struct cache* cach; int64_t peak_performance; + int64_t peak_performance_t; int32_t idx; }; @@ -73,5 +74,6 @@ char* get_str_bus_width(struct gpu_info* gpu); char* get_str_memory_clock(struct gpu_info* gpu); char* get_str_l2(struct gpu_info* gpu); char* get_str_peak_performance(struct gpu_info* gpu); +char* get_str_peak_performance_tensor(struct gpu_info* gpu); #endif diff --git a/src/common/main.cpp b/src/common/main.cpp index 7f27252..899ff89 100644 --- a/src/common/main.cpp +++ b/src/common/main.cpp @@ -7,7 +7,7 @@ #include "../cuda/cuda.hpp" #include "../cuda/uarch.hpp" -static const char* VERSION = "0.10"; +static const char* VERSION = "0.11"; void print_help(char *argv[]) { const char **t = args_str; diff --git a/src/common/printer.cpp b/src/common/printer.cpp index a84ebb8..958593e 100644 --- a/src/common/printer.cpp +++ b/src/common/printer.cpp @@ -43,7 +43,8 @@ enum { ATTRIBUTE_MEMORY, ATTRIBUTE_MEMORY_FREQ, ATTRIBUTE_BUS_WIDTH, - ATTRIBUTE_PEAK + ATTRIBUTE_PEAK, + ATTRIBUTE_PEAK_TENSOR, }; static const char* ATTRIBUTE_FIELDS [] = { @@ -54,13 +55,14 @@ static const char* ATTRIBUTE_FIELDS [] = { "Max Frequency:", "SMs:", "Cores/SM:", - "CUDA cores:", - "Tensor cores:", + "CUDA Cores:", + "Tensor Cores:", "L2 Size:", "Memory:", "Memory frequency:", "Bus width:", "Peak Performance:", + "Peak Performance (TC):", }; static const char* ATTRIBUTE_FIELDS_SHORT [] = { @@ -71,12 +73,14 @@ static const char* ATTRIBUTE_FIELDS_SHORT [] = { "Max Freq.:", "SMs:", "Cores/SM:", - "CUDA cores:", + "CUDA Cores:", + "Tensor Cores:", "L2 Size:", "Memory:", "Memory freq.:", "Bus width:", "Peak Perf.:", + "Peak Perf.(TC):", }; struct terminal { @@ -360,6 +364,7 @@ bool print_gpufetch_cuda(struct gpu_info* gpu, STYLE s, struct color** cs, struc char* mem_freq = get_str_memory_clock(gpu); char* bus_width = get_str_bus_width(gpu); char* pp = get_str_peak_performance(gpu); + char* pp_tensor = get_str_peak_performance_tensor(gpu); char* mem = (char *) emalloc(sizeof(char) * (strlen(mem_size) + strlen(mem_type) + 2)); sprintf(mem, "%s %s", mem_size, mem_type); @@ -383,6 +388,9 @@ bool print_gpufetch_cuda(struct gpu_info* gpu, STYLE s, struct color** cs, struc setAttribute(art, ATTRIBUTE_BUS_WIDTH, bus_width); setAttribute(art, ATTRIBUTE_L2, l2); setAttribute(art, ATTRIBUTE_PEAK, pp); + if(gpu->topo->tensor_cores >= 0) { + setAttribute(art, ATTRIBUTE_PEAK_TENSOR, pp_tensor); + } const char** attribute_fields = ATTRIBUTE_FIELDS; uint32_t longest_attribute = longest_attribute_length(art, attribute_fields); diff --git a/src/cuda/cuda.cpp b/src/cuda/cuda.cpp index d0b61d0..c75efb9 100644 --- a/src/cuda/cuda.cpp +++ b/src/cuda/cuda.cpp @@ -103,10 +103,16 @@ struct memory* get_memory_info(struct gpu_info* gpu, cudaDeviceProp prop) { return mem; } +// Compute peak performance when using CUDA cores int64_t get_peak_performance(struct gpu_info* gpu) { return gpu->freq * 1000000 * gpu->topo->cuda_cores * 2; } +// Compute peak performance when using tensor cores +int64_t get_peak_performance_t(struct gpu_info* gpu) { + return gpu->freq * 1000000 * 4 * 4 * 8 * gpu->topo->tensor_cores; +} + struct gpu_info* get_gpu_info(int gpu_idx) { struct gpu_info* gpu = (struct gpu_info*) emalloc(sizeof(struct gpu_info)); gpu->pci = NULL; @@ -156,6 +162,7 @@ struct gpu_info* get_gpu_info(int gpu_idx) { gpu->mem = get_memory_info(gpu, deviceProp); gpu->topo = get_topology_info(deviceProp); gpu->peak_performance = get_peak_performance(gpu); + gpu->peak_performance_t = get_peak_performance_t(gpu); return gpu; }