Patch cuda.cpp with cloudy's fix

[v0.30] Add support for XCDs and matrix cores
For XCDs, we don't show them if the GPU is made of a single XCD, as it adds little value. For matrix cores, we assume the count can be computed as compute_units * simds_per_cu; this seems to work for the CDNA3 and RDNA3 GPUs I checked. Not sure what would happen for older GPUs that do not have matrix cores, though.
2026-01-10 19:29:45 -05:00 · 2025-10-26 10:51:27 +01:00 · 2025-10-24 22:29:45 +02:00 · 2025-10-23 21:30:02 +02:00 · 2025-10-16 20:01:14 +02:00 · 2025-10-16 19:53:48 +02:00
56 changed files with 3585 additions and 515 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
 gpufetch
+build/
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -0,0 +1,219 @@
+cmake_minimum_required(VERSION 3.10)
+include(ExternalProject)
+include(CheckLanguage)
+
+project(gpufetch CXX)
+
+set(SRC_DIR    "src")
+set(COMMON_DIR "${SRC_DIR}/common")
+set(CUDA_DIR   "${SRC_DIR}/cuda")
+set(HSA_DIR    "${SRC_DIR}/hsa")
+set(INTEL_DIR  "${SRC_DIR}/intel")
+
+# Backends are selected with -DENABLE_CUDA_BACKEND / -DENABLE_HSA_BACKEND / -DENABLE_INTEL_BACKEND.
+# Configuring without any of them makes no sense, so stop right away in that case.
+if(NOT (ENABLE_INTEL_BACKEND OR ENABLE_CUDA_BACKEND OR ENABLE_HSA_BACKEND))
+  message(FATAL_ERROR "No backend was enabled! Please enable at least one backend with -DENABLE_XXX_BACKEND")
+endif()
+
+if(ENABLE_CUDA_BACKEND)
+  check_language(CUDA)  # sets CMAKE_CUDA_COMPILER when a usable CUDA compiler is found
+  if(CMAKE_CUDA_COMPILER)
+    enable_language(CUDA)
+    # link_directories() takes directories only (no target name) and must run before add_executable(gpufetch ...)
+    link_directories("${CMAKE_CUDA_COMPILER_TOOLKIT_ROOT}/targets/x86_64-linux/lib")
+  else()
+    set(ENABLE_CUDA_BACKEND false)  # no CUDA compiler: turn the backend off instead of failing
+  endif()
+endif()
+
+if(ENABLE_HSA_BACKEND)
+  find_package(ROCmCMakeBuildTools QUIET)
+  if (ROCmCMakeBuildTools_FOUND)
+    find_package(hsa-runtime64 1.0 REQUIRED)
+    # No link_directories() here: it only accepts directories; the library is linked via target_link_libraries(hsa_backend ...).
+
+    # Find HSA headers
+    # ROCm does not seem to provide this, which is quite frustrating.
+    find_path(HSA_INCLUDE_DIR
+      NAMES hsa/hsa.h
+      HINTS
+          $ENV{ROCM_PATH}/include     # allow users override via env variable
+          /opt/rocm/include           # common default path
+          /usr/include
+          /usr/local/include
+    )
+
+    if(NOT HSA_INCLUDE_DIR)
+      message(STATUS "${BoldYellow}HSA not found, disabling HSA backend${ColorReset}")
+      set(ENABLE_HSA_BACKEND false)
+    endif()
+  else()
+    # rocm-cmake is not installed, try to manually find the necessary files.
+    message(STATUS "${BoldYellow}Could NOT find HSA automatically, running manual search...${ColorReset}")
+    if (NOT DEFINED ROCM_PATH)
+      set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to ROCm")
+    endif()
+
+    find_path(HSA_INCLUDE_DIR hsa/hsa.h HINTS ${ROCM_PATH}/include)
+    find_library(HSA_LIBRARY hsa-runtime64 HINTS ${ROCM_PATH}/lib ${ROCM_PATH}/lib64)
+
+    if (HSA_INCLUDE_DIR AND HSA_LIBRARY)
+      message(STATUS "${BoldYellow}HSA was found manually${ColorReset}")
+    else()
+      set(ENABLE_HSA_BACKEND false)
+      message(STATUS "${BoldYellow}HSA was not found manually${ColorReset}")
+    endif()
+  endif()
+endif()
+
+set(GPUFECH_COMMON  # sources of the gpufetch executable; NOTE: spelled GPUFECH (no "T") at every use in this file
+    ${COMMON_DIR}/main.cpp
+    ${COMMON_DIR}/args.cpp
+    ${COMMON_DIR}/gpu.cpp
+    ${COMMON_DIR}/global.cpp
+    ${COMMON_DIR}/printer.cpp
+    ${COMMON_DIR}/master.cpp
+    ${COMMON_DIR}/uarch.cpp
+)
+
+set(GPUFETCH_LINK_TARGETS z)  # libraries linked into gpufetch at the end of the file ("z" is presumably zlib)
+
+if(NOT(ENABLE_HSA_BACKEND AND NOT ENABLE_CUDA_BACKEND AND NOT ENABLE_INTEL_BACKEND))
+  # Look for pciutils only if not building HSA only.
+  #
+  # This has the (intended) secondary effect that if only HSA backend is enabled
+  # by the user, but ROCm cannot be found, pciutils will still be compiled in
+  # order to show the list of GPUs available on the system, so that the user will
+  # get at least some feedback even if HSA is not found.
+  list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake")
+  list(APPEND GPUFECH_COMMON ${COMMON_DIR}/pci.cpp ${COMMON_DIR}/sort.cpp)
+  list(APPEND GPUFETCH_LINK_TARGETS pci)
+  set(CMAKE_ENABLE_PCIUTILS ON)
+
+  find_package(PCIUTILS)
+  if(NOT PCIUTILS_FOUND)  # test the variable by name: "${PCIUTILS_FOUND}" would leave a bare if(NOT) when it is empty
+    message(STATUS "${BoldYellow}pciutils not found, downloading and building a local copy...${ColorReset}")
+
+    # Download and build pciutils
+    set(PCIUTILS_INSTALL_LOCATION ${CMAKE_BINARY_DIR}/pciutils-install)
+    ExternalProject_Add(pciutils
+      GIT_REPOSITORY https://github.com/pciutils/pciutils
+      CONFIGURE_COMMAND ""
+      BUILD_COMMAND make SHARED=no HWDB=no
+      BUILD_IN_SOURCE true
+      INSTALL_COMMAND make PREFIX=${PCIUTILS_INSTALL_LOCATION} install-lib
+    )
+
+    include_directories(${PCIUTILS_INSTALL_LOCATION}/include)
+    link_directories(${PCIUTILS_INSTALL_LOCATION}/lib)
+  else()
+    include_directories(${PCIUTILS_INCLUDE_DIR})
+    link_libraries(${PCIUTILS_LIBRARIES})
+    # Needed for linking libpci in FreeBSD
+    link_directories(/usr/local/lib/)
+  endif()
+endif()
+
+add_executable(gpufetch ${GPUFECH_COMMON})
+set(SANITY_FLAGS -Wfloat-equal -Wshadow -Wpointer-arith -Wall -Wextra -pedantic -fstack-protector-all)  # warning/hardening flags, each listed once
+target_compile_features(gpufetch PRIVATE cxx_std_11)
+target_compile_options(gpufetch PRIVATE ${SANITY_FLAGS})
+
+if (CMAKE_ENABLE_PCIUTILS)  # set only when the pciutils lookup above was performed
+  target_compile_definitions(gpufetch PUBLIC BACKEND_USE_PCI)
+endif()
+
+if(ENABLE_INTEL_BACKEND)
+  target_compile_definitions(gpufetch PUBLIC BACKEND_INTEL)
+
+  add_library(intel_backend STATIC ${INTEL_DIR}/intel.cpp ${INTEL_DIR}/pci.cpp ${INTEL_DIR}/uarch.cpp ${INTEL_DIR}/udev.cpp ${INTEL_DIR}/cpuid.cpp)
+
+  if(TARGET pciutils)  # the ExternalProject target exists only when a local pciutils copy is being built
+    add_dependencies(intel_backend pciutils)
+  endif()
+
+  target_link_libraries(gpufetch intel_backend)
+endif()
+
+if(ENABLE_CUDA_BACKEND)
+  target_compile_definitions(gpufetch PUBLIC BACKEND_CUDA)
+
+  # https://en.wikipedia.org/w/index.php?title=CUDA&section=5#GPUs_supported
+  # https://raw.githubusercontent.com/PointCloudLibrary/pcl/master/cmake/pcl_find_cuda.cmake
+  if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11.1")  # branches are ordered newest-first; the first match wins
+    set(CMAKE_CUDA_ARCHITECTURES 35 37 50 52 53 60 61 62 70 72 75 80 86)
+  elseif(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11.0")  # 11.0.x (numeric EQUAL never matches "11.0.221")
+    set(CMAKE_CUDA_ARCHITECTURES 30 32 35 37 50 52 53 60 61 62 70 72 75 80)
+  elseif(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "10.0")
+    set(CMAKE_CUDA_ARCHITECTURES 30 32 35 37 50 52 53 60 61 62 70 72 75)
+  elseif(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "9.0")
+    set(CMAKE_CUDA_ARCHITECTURES 30 32 35 37 50 52 53 60 61 62 70 72)
+  elseif(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "8.0")  # 8.0.x (VERSION_EQUAL "8.0" only matches exactly 8.0.0)
+    set(CMAKE_CUDA_ARCHITECTURES 20 21 30 32 35 37 50 52 53 60 61 62)
+  endif()
+
+  add_library(cuda_backend STATIC ${CUDA_DIR}/cuda.cpp ${CUDA_DIR}/uarch.cpp ${CUDA_DIR}/pci.cpp)
+
+  if(TARGET pciutils)  # the ExternalProject target exists only when a local pciutils copy is being built
+    add_dependencies(cuda_backend pciutils)
+  endif()
+
+  target_include_directories(cuda_backend PUBLIC ${CMAKE_CUDA_COMPILER_TOOLKIT_ROOT}/targets/x86_64-linux/include)
+
+  target_link_libraries(cuda_backend PRIVATE cudart)
+  target_link_libraries(gpufetch cuda_backend)
+endif()
+
+if(ENABLE_HSA_BACKEND)
+  target_compile_definitions(gpufetch PUBLIC BACKEND_HSA)
+
+  add_library(hsa_backend STATIC ${HSA_DIR}/hsa.cpp ${HSA_DIR}/uarch.cpp)
+
+  if(TARGET pciutils)  # in an HSA-only build pciutils is never looked up, so no such target exists
+    add_dependencies(hsa_backend pciutils)
+  endif()
+
+  target_include_directories(hsa_backend PRIVATE "${HSA_INCLUDE_DIR}")
+
+  if (HSA_LIBRARY)  # set by the manual find_library() search; otherwise use the imported package target
+    target_link_libraries(hsa_backend PRIVATE ${HSA_LIBRARY})
+  else()
+    target_link_libraries(hsa_backend PRIVATE hsa-runtime64::hsa-runtime64)
+  endif()
+
+  target_link_libraries(gpufetch hsa_backend)
+endif()
+
+target_link_libraries(gpufetch ${GPUFETCH_LINK_TARGETS})  # plain (keyword-less) signature, matching the other gpufetch link calls in this file
+install(TARGETS gpufetch DESTINATION bin)  # installs only the executable, into <prefix>/bin
+
+if(NOT WIN32)  # NOTE(review): message() calls earlier in this file already use BoldYellow/ColorReset, which are still empty at that point
+  string(ASCII 27 Esc)  # Esc = the ESC character (ASCII 27) that starts the escape sequences below
+  set(ColorReset "${Esc}[m")
+  set(ColorBold  "${Esc}[1m")
+  set(Red         "${Esc}[31m")
+  set(Green       "${Esc}[32m")
+  set(BoldRed     "${Esc}[1;31m")
+  set(BoldGreen   "${Esc}[1;32m")
+  set(BoldYellow  "${Esc}[1;33m")
+endif()
+
+# Build report: one status line per backend, showing whether it is still
+# enabled after the detection steps above.
+# gpufetch_report_backend(<label> <value>) prints "<label> backend: ON" or "... OFF".
+function(gpufetch_report_backend label is_enabled)
+  if(is_enabled)
+    message(STATUS "${label} backend: ${BoldGreen}ON${ColorReset}")
+  else()
+    message(STATUS "${label} backend: ${BoldRed}OFF${ColorReset}")
+  endif()
+endfunction()
+
+message(STATUS "----------------------")
+message(STATUS "gpufetch build report:")
+# Values are passed quoted so that an unset option still yields an (empty) argument.
+gpufetch_report_backend("CUDA" "${ENABLE_CUDA_BACKEND}")
+gpufetch_report_backend("HSA" "${ENABLE_HSA_BACKEND}")
+gpufetch_report_backend("Intel" "${ENABLE_INTEL_BACKEND}")
+message(STATUS "----------------------")
--- a/352
+++ b/352
@@ -1,21 +1,339 @@
-MIT License
+                    GNU GENERAL PUBLIC LICENSE
+                       Version 2, June 1991

-Copyright (c) 2021 Dr-Noob
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.

-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
+                            Preamble

-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users.  This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it.  (Some other Free Software Foundation software is covered by
+the GNU Lesser General Public License instead.)  You can apply it to
+your programs, too.

-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+  To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have.  You must make sure that they, too, receive or can get the
+source code.  And you must show them these terms so they know their
+rights.
+
+  We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+  Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software.  If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+  Finally, any free program is threatened constantly by software
+patents.  We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary.  To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+                    GNU GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License.  The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language.  (Hereinafter, translation is included without limitation in
+the term "modification".)  Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+  1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+  2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) You must cause the modified files to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    b) You must cause any work that you distribute or publish, that in
+    whole or in part contains or is derived from the Program or any
+    part thereof, to be licensed as a whole at no charge to all third
+    parties under the terms of this License.
+
+    c) If the modified program normally reads commands interactively
+    when run, you must cause it, when started running for such
+    interactive use in the most ordinary way, to print or display an
+    announcement including an appropriate copyright notice and a
+    notice that there is no warranty (or else, saying that you provide
+    a warranty) and that users may redistribute the program under
+    these conditions, and telling the user how to view a copy of this
+    License.  (Exception: if the Program itself is interactive but
+    does not normally print such an announcement, your work based on
+    the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+    a) Accompany it with the complete corresponding machine-readable
+    source code, which must be distributed under the terms of Sections
+    1 and 2 above on a medium customarily used for software interchange; or,
+
+    b) Accompany it with a written offer, valid for at least three
+    years, to give any third party, for a charge no more than your
+    cost of physically performing source distribution, a complete
+    machine-readable copy of the corresponding source code, to be
+    distributed under the terms of Sections 1 and 2 above on a medium
+    customarily used for software interchange; or,
+
+    c) Accompany it with the information you received as to the offer
+    to distribute corresponding source code.  (This alternative is
+    allowed only for noncommercial distribution and only if you
+    received the program in object code or executable form with such
+    an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it.  For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable.  However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License.  Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+  5. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Program or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+  6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+  7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded.  In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+  9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation.  If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+  10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission.  For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this.  Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+                            NO WARRANTY
+
+  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+                     END OF TERMS AND CONDITIONS
+
+            How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+    Gnomovision version 69, Copyright (C) year name of author
+    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+  `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+  <signature of Ty Coon>, 1 April 1989
+  Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs.  If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.
--- a/53
+++ b/53
@@ -1,53 +0,0 @@
-CXX ?= g++
-CUDA_PATH ?= /usr/local/cuda/
-PREFIX ?= /usr
-
-CXXFLAGS+=-Wall -Wextra -pedantic -fstack-protector-all -pedantic
-SANITY_FLAGS=-Wfloat-equal -Wshadow -Wpointer-arith
-
-SRC_COMMON=src/common/
-SRC_CUDA=src/cuda/
-
-COMMON_SRC = $(SRC_COMMON)main.cpp  $(SRC_COMMON)gpu.cpp $(SRC_COMMON)args.cpp $(SRC_COMMON)global.cpp $(SRC_COMMON)printer.cpp
-COMMON_HDR = $(SRC_COMMON)ascii.hpp $(SRC_COMMON)gpu.hpp $(SRC_COMMON)args.hpp $(SRC_COMMON)global.hpp $(SRC_COMMON)printer.hpp
-
-CUDA_SRC = $(SRC_CUDA)cuda.cpp $(SRC_CUDA)uarch.cpp $(SRC_CUDA)pci.cpp $(SRC_CUDA)nvmlb.cpp
-CUDA_HDR = $(SRC_CUDA)cuda.hpp $(SRC_CUDA)uarch.hpp $(SRC_CUDA)pci.hpp $(SRC_CUDA)nvmlb.hpp $(SRC_CUDA)chips.hpp
-
-SOURCE += $(COMMON_SRC) $(CUDA_SRC)
-HEADERS += $(COMMON_HDR) $(CUDA_HDR)
-
-OUTPUT=gpufetch
-
-CXXFLAGS+= -I $(CUDA_PATH)/samples/common/inc -I $(CUDA_PATH)/targets/x86_64-linux/include -L $(CUDA_PATH)/targets/x86_64-linux/lib -lcudart -lnvidia-ml
-
-all: CXXFLAGS += -O3
-all: $(OUTPUT)
-
-debug: CXXFLAGS += -g -O0
-debug: $(OUTPUT)
-
-static: CXXFLAGS += -static -O3
-static: $(OUTPUT)
-
-strict: CXXFLAGS += -O3 -Werror -fsanitize=undefined -D_FORTIFY_SOURCE=2
-strict: $(OUTPUT)
-
-$(OUTPUT): Makefile $(SOURCE) $(HEADERS)
-	$(CXX) $(CXXFLAGS) $(SANITY_FLAGS) $(SOURCE) -o $(OUTPUT)
-
-run: $(OUTPUT)
-	./$(OUTPUT)
-
-clean:
-	@rm -f $(OUTPUT)
-
-install: $(OUTPUT)
-	install -Dm755 "gpufetch"   "$(DESTDIR)$(PREFIX)/bin/gpufetch"
-	install -Dm644 "LICENSE"    "$(DESTDIR)$(PREFIX)/share/licenses/gpufetch-git/LICENSE"
-	install -Dm644 "gpufetch.1" "$(DESTDIR)$(PREFIX)/share/man/man1/gpufetch.1.gz"
-
-uninstall:
-	rm -f "$(DESTDIR)$(PREFIX)/bin/gpufetch"
-	rm -f "$(DESTDIR)$(PREFIX)/share/licenses/gpufetch-git/LICENSE"
-	rm -f "$(DESTDIR)$(PREFIX)/share/man/man1/gpufetch.1.gz"
--- a/README.md
+++ b/README.md
@@ -1,60 +1,133 @@
 <p align="center"><img width=50% src="./pictures/gpufetch.png"></p>

-<div align="center">
-
-![GitHub tag (latest by date)](https://img.shields.io/github/v/tag/Dr-Noob/gpufetch?label=gpufetch)
-[![GitHub Repo stars](https://img.shields.io/github/stars/Dr-Noob/gpufetch?color=4CC61F)](https://github.com/Dr-Noob/gpufetch/stargazers)
-[![GitHub issues](https://img.shields.io/github/issues/Dr-Noob/gpufetch)](https://github.com/Dr-Noob/gpufetch/issues)
-[![License](https://img.shields.io/github/license/Dr-Noob/gpufetch?color=orange)](https://github.com/Dr-Noob/gpufetch/blob/master/LICENSE)
-
 <h4 align="center">Simple yet fancy GPU architecture fetching tool</h4>
-&nbsp;

-![gpu_img](pictures/2080ti.png)
+<p align="center"> </p>

+<div align="center">
+  <img height="22px" src="https://img.shields.io/github/v/tag/Dr-Noob/gpufetch?label=gpufetch&style=flat-square">
+  <a href="https://github.com/Dr-Noob/gpufetch/stargazers">
+    <img height="22px" src="https://img.shields.io/github/stars/Dr-Noob/gpufetch?color=4CC61F&style=flat-square">
+  </a>
+  <a href="https://github.com/Dr-Noob/gpufetch/issues">
+    <img height="22px" src="https://img.shields.io/github/issues/Dr-Noob/gpufetch?style=flat-square">
+  </a>
+  <a href="https://github.com/Dr-Noob/gpufetch/blob/master/LICENSE">
+    <img height="22px" src="https://img.shields.io/github/license/Dr-Noob/gpufetch?color=orange&style=flat-square">
+  </a>
 </div>

+<p align="center"> </p>
+
+<p align="center">
+gpufetch is a command-line tool written in C++ that displays the GPU information in a clean and beautiful way
+</p>
+
+<p align="center">
+<img width=80% src="./pictures/examples.gif">
+</p>
+
 # Table of contents
 <!-- UPDATE with: doctoc --notitle README.md -->
 <!-- START doctoc generated TOC please keep comment here to allow auto update -->
 <!-- DON'T EDIT THIS SECTION, INSTEAD RE-RUN doctoc TO UPDATE -->


- [1. Support](#1-support)
- [2. Installation (building from source)](#2-installation-building-from-source)
- [3. Colors and style](#3-colors-and-style)
- [4. Bugs or improvements](#4-bugs-or-improvements)
+- [Table of contents](#table-of-contents)
+  - [1. Support](#1-support)
+  - [2. Backends](#2-backends)
+    - [2.1 CUDA backend is not enabled. Why?](#21-cuda-backend-is-not-enabled-why)
+    - [2.2 The backend is enabled, but gpufetch is unable to detect my GPU](#22-the-backend-is-enabled-but-gpufetch-is-unable-to-detect-my-gpu)
+  - [3. Installation (building from source)](#3-installation-building-from-source)
+  - [4. Colors](#4-colors)
+    - [4.1 Specifying a name](#41-specifying-a-name)
+    - [4.2 Specifying the colors in RGB format](#42-specifying-the-colors-in-rgb-format)
+  - [5. Bugs or improvements](#5-bugs-or-improvements)

 <!-- END doctoc generated TOC please keep comment here to allow auto update -->

-# 1. Support
-gpufetch supports NVIDIA GPUs under Linux only.
+## 1. Support
+gpufetch supports the following GPUs:

-# 2. Installation (building from source)
-You will need a C++ compiler (e.g, `g++`), `make` and CUDA to compile `gpufetch`. To do so, just clone the repo and run `make`:
+- **NVIDIA** GPUs (Compute Capability >= 2.0)
+- **AMD** GPUs (Experimental) (RDNA 3.0, CDNA 3.0)
+- **Intel** iGPUs (Generation >= Gen6)
+
+Only compilation under **Linux** is supported.
+
+## 2. Backends
+gpufetch is made up of three backends:
+
+- CUDA backend
+- HSA backend
+- Intel backend
+
+Backends are enabled and disabled at **compile time**. When compiling gpufetch, check the CMake output to see which backends are enabled.
+
+**gpufetch will only detect your GPU if the appropriate backend was enabled during compilation (e.g., it will not detect your NVIDIA GPU if the CUDA backend is disabled!)**
+
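+Backends must be explicitly enabled: pass a comma-separated list of backends to `build.sh` (e.g., `./build.sh hsa,cuda`), or set the corresponding `-DENABLE_XXX_BACKEND=ON` flags when invoking CMake directly. See the `build.sh` script for instructions.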
+
+### 2.1 CUDA backend is not enabled. Why?
+CUDA is mandatory to build gpufetch with CUDA backend enabled. However, when building gpufetch, cmake may be unable to find the CUDA installation. If CUDA is installed but CMake does not find it, you need to pass the CUDA path to cmake. You can do this easily by editing directly the `build.sh` script. For example:
+
+```
+cmake -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DCMAKE_CUDA_COMPILER_TOOLKIT_ROOT=/usr/local/cuda/ ..
+```
+
+### 2.2 The backend is enabled, but gpufetch is unable to detect my GPU
+First, make sure that your GPU is enabled. You can print enabled GPUs with `lspci`:
+
+```
+[drnoob@noob ~]$ lspci -nn | grep VGA
+```
+
+If there is an NVIDIA GPU or Intel iGPU in the system and the appropriate backend is enabled but gpufetch does not detect the GPU, please create a new issue with the provided error message (in the gpufetch output) on the [issues page](https://github.com/Dr-Noob/gpufetch/issues).
+
+## 3. Installation (building from source)
+You will need (mandatory):
+
+- C++ compiler (e.g, `g++`)
+- `zlib`
+- `cmake`
+- `make`
+
+and optionally:
+
+- CUDA (needed for CUDA backend)
+- pciutils (a local copy will be downloaded if pciutils is not installed)
+
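+To build gpufetch, just clone the repo and run `./build.sh` followed by the comma-separated list of backends to enable (valid backends: `hsa`, `intel`, `cuda`):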

 ```
 git clone https://github.com/Dr-Noob/gpufetch
 cd gpufetch
-make
+./build.sh cuda   # replace 'cuda' with the backend(s) you need, e.g. hsa,intel
 ./gpufetch
 ```
-When building gpufetch, you may encounter an error telling you that it cannot find some CUDA header files. In this case, is very likely that the Makefile is unable to find your CUDA installation. This can be solved by setting `CUDA_PATH` to the correct CUDA installation path. For example:
+
+## 4. Colors
+By default, `gpufetch` will print the GPU logo with the system color scheme. However, you can set a custom color scheme in two different ways:
+
+### 4.1 Specifying a name
+
+By specifying a name, gpufetch will use the default colors of that manufacturer. Valid values are:
+
+- intel
+- amd
+- nvidia

 ```
-CUDA_PATH=/opt/cuda make
+./gpufetch --color intel (default color for Intel)
 ```

-# 3. Colors and style
-By default, `gpufetch` will print the GPU logo with the system colorscheme. However, you can always set a custom color scheme, either
-specifying "nvidia", or specifying the colors in RGB format:
+### 4.2 Specifying the colors in RGB format
+
+5 colors must be given in RGB with the format: ``[R,G,B:R,G,B:R,G,B:R,G,B:R,G,B]``. These colors correspond to the GPU logo colors (first 3) and the text colors (last 2).

 ```
-./gpufetch --color nvidia (default color for NVIDIA)
-./gpufetch --color 239,90,45:210,200,200:100,200,45:0,200,200 (example)
+./gpufetch --color 239,90,45:210,200,200:0,0,0:100,200,45:0,200,200
 ```

-In the case of setting the colors using RGB, 4 colors must be given in with the format: ``[R,G,B:R,G,B:R,G,B:R,G,B]``. These colors correspond to GPU art color (2 colors) and for the text colors (following 2). Thus, you can customize all the colors.
-
-# 4. Bugs or improvements
+## 5. Bugs or improvements
 See [gpufetch contributing guidelines](https://github.com/Dr-Noob/gpufetch/blob/master/CONTRIBUTING.md)
--- a/build.sh
+++ b/build.sh
@@ -0,0 +1,122 @@
+#!/bin/bash
+
+print_help() {
+  cat << EOF
+Usage: $0 <backends> [build_type]
+
+  <backends>    MANDATORY. Comma-separated list of 
+                backends to enable.
+                Valid options: hsa, intel, cuda
+                Example: hsa,cuda
+
+  [build_type]  OPTIONAL. Build type. Valid options:
+                debug, release (default: release)
+
+Examples:
+  $0 hsa,intel debug
+  $0 cuda
+  $0 hsa,intel,cuda release
+EOF
+}
+
+# gpufetch build script
+# Configures gpufetch with CMake in a fresh build/ directory, compiles it and
+# leaves a 'gpufetch' symlink to the binary in the current directory.
+set -e
+
+if [ "$1" == "--help" ]; then
+  echo "gpufetch build script"
+  echo
+  print_help
+  exit 0
+fi
+
+if [[ $# -lt 1 ]]; then
+  echo "ERROR: At least one backend must be specified."
+  echo
+  print_help
+  exit 1
+fi
+
+# First argument: backend list. Second (optional) argument: build type.
+BACKEND_ARG="${1}"
+BUILD_TYPE="${2:-release}"
+
+# Split comma-separated backends into an array
+IFS=',' read -r -a BACKENDS <<< "$BACKEND_ARG"
+
+# Validate build type (a typo must not silently fall back to release)
+if [[ "$BUILD_TYPE" != "debug" && "$BUILD_TYPE" != "release" ]]; then
+  echo "Error: Invalid build type '$BUILD_TYPE'."
+  echo "Valid options are: debug, release"
+  exit 1
+fi
+
+# Capitalize the first letter (debug -> Debug) for CMAKE_BUILD_TYPE
+CMAKE_FLAGS="-DCMAKE_BUILD_TYPE=${BUILD_TYPE^}"
+
+# Validate backends
+VALID_BACKENDS=("hsa" "intel" "cuda")
+
+for BACKEND in "${BACKENDS[@]}"; do
+  case "$BACKEND" in
+    hsa)
+      CMAKE_FLAGS+=" -DENABLE_HSA_BACKEND=ON"
+      ;;
+    intel)
+      CMAKE_FLAGS+=" -DENABLE_INTEL_BACKEND=ON"
+      ;;
+    cuda)
+      CMAKE_FLAGS+=" -DENABLE_CUDA_BACKEND=ON"
+      ;;
+    *)
+      echo "ERROR: Invalid backend '$BACKEND'."
+      echo "Valid options: ${VALID_BACKENDS[*]}"
+      exit 1
+      ;;
+  esac
+done
+
+# You can also manually specify the compilation flags.
+# If you need to, just run the cmake command directly
+# instead of using this script.
+#
+# Here you will find some help:
+#
+# In case you have CUDA installed but it is not detected,
+# - set CMAKE_CUDA_COMPILER to your nvcc binary:
+# - set CMAKE_CUDA_COMPILER_TOOLKIT_ROOT to the CUDA root dir
+# for example:
+# cmake -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DCMAKE_CUDA_COMPILER_TOOLKIT_ROOT=/usr/local/cuda/ ..
+#
+# In case you want to explicitly disable a backend, you can:
+# Disable CUDA backend:
+# cmake -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DENABLE_CUDA_BACKEND=OFF ..
+# Disable HSA backend:
+# cmake -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DENABLE_HSA_BACKEND=OFF ..
+# Disable Intel backend:
+# cmake -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DENABLE_INTEL_BACKEND=OFF ..
+
+# Start from a clean build tree. This is done only once the arguments
+# have been validated, so that --help or a typo never wipes a previous build.
+rm -rf build/ gpufetch
+mkdir build/
+cd build/
+
+echo "$0: Running cmake $CMAKE_FLAGS"
+echo 
+cmake $CMAKE_FLAGS ..
+
+# Build with the make flavour available on the current OS
+os=$(uname)
+if [ "$os" == 'Linux' ]; then
+  make -j$(nproc)
+elif [ "$os" == 'FreeBSD' ]; then
+  gmake -j4
+else
+  echo "ERROR: Unsupported operating system '$os'."
+  exit 1
+fi
+
+cd -
+ln -s build/gpufetch .
--- a/cmake/FindPCIUTILS.cmake
+++ b/cmake/FindPCIUTILS.cmake
@@ -0,0 +1,29 @@
+# - Try to find the pciutils library
+# Once done this will define
+#
+#  PCIUTILS_FOUND - system has PCIUtils
+#  PCIUTILS_INCLUDE_DIR - the PCIUTILS include directory
+#  PCIUTILS_LIBRARIES - The libraries needed to use PCIUtils
+
+# Stay quiet when the cached find results are already available.
+if(PCIUTILS_INCLUDE_DIR AND PCIUTILS_LIBRARY)
+  set(PCIUTILS_FIND_QUIETLY TRUE)
+endif()
+
+find_path(PCIUTILS_INCLUDE_DIR pci/pci.h)
+
+find_library(PCIUTILS_LIBRARY NAMES pci)
+if(PCIUTILS_LIBRARY)
+  # libresolv is optional: it is added to the link list only when found.
+  set(PCIUTILS_LIBRARIES ${PCIUTILS_LIBRARY})
+  find_library(RESOLV_LIBRARY NAMES resolv)
+  if(RESOLV_LIBRARY)
+    list(APPEND PCIUTILS_LIBRARIES ${RESOLV_LIBRARY})
+  endif()
+endif()
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(PCIUTILS DEFAULT_MSG PCIUTILS_LIBRARIES PCIUTILS_INCLUDE_DIR)
+
+# Mark the cache entries as advanced (PCIUTILS_LIBRARIES is a normal variable, not a cache entry).
+mark_as_advanced(PCIUTILS_INCLUDE_DIR PCIUTILS_LIBRARY RESOLV_LIBRARY)
--- a/gpufetch.1
+++ b/gpufetch.1
@@ -0,0 +1,47 @@
+.\" DO NOT MODIFY THIS FILE!  It was generated by help2man 1.48.3.
+.TH GPUFETCH "1" "August 2021" "gpufetch v0.10" "User Commands"
+.SH NAME
+gpufetch
+.SH SYNOPSIS
+.B gpufetch
+[\fI\,OPTION\/\fR]...
+.SH DESCRIPTION
+Simple yet fancy GPU architecture fetching tool
+.SH OPTIONS
+.TP
+\fB\-c\fR, \fB\-\-color\fR
+Sets the color scheme (by default, gpufetch uses the system color scheme) See COLORS section for a more detailed explanation
+.TP
+\fB\-g\fR, \fB\-\-gpu\fR
+Selects the GPU to use (default: 0)
+.TP
+\fB\-h\fR, \fB\-\-help\fR
+Prints this help and exit
+.TP
+\fB\-V\fR, \fB\-\-version\fR
+Prints gpufetch version and exit
+.SS "COLORS:"
+.IP
+Color scheme can be set using a predefined color scheme or a custom one:
+1. To use a predefined color scheme, the name of the scheme must be provided. Possible values are:
+* "nvidia":  Use NVIDIA default color scheme
+2. To use a custom color scheme, 4 colors must be given in RGB with the format: R,G,B:R,G,B:...
+The first 2 colors are the GPU art color and the following 2 colors are the text colors
+.SS "EXAMPLES:"
+.IP
+Run gpufetch with NVIDIA color scheme:
+.IP
+\&./gpufetch \fB\-\-color\fR nvidia
+.IP
+Run gpufetch with a custom color scheme:
+.IP
+\&./gpufetch \fB\-\-color\fR 239,90,45:210,200,200:100,200,45:0,200,200
+.SS "BUGS:"
+.IP
+Report bugs to https://github.com/Dr\-Noob/gpufetch/issues
+.SS "NOTE:"
+.IP
+Peak performance information is NOT accurate. gpufetch computes peak performance using the max
+frequency. However, to properly compute peak performance, you need to know the frequency of the
+GPU running real code.
+For peak performance measurement see: https://github.com/Dr\-Noob/peakperf
--- a/pictures/2080ti.png
+++ b/pictures/2080ti.png
--- a/pictures/examples.gif
+++ b/pictures/examples.gif
--- a/pictures/uhd620.png
+++ b/pictures/uhd620.png
--- a/src/common/args.cpp
+++ b/src/common/args.cpp
@@ -1,7 +1,7 @@
 #include <getopt.h>
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
+#include <cstdio>
+#include <cstring>
+#include <cstdlib>
 #include <climits>

 #include "args.hpp"
@@ -13,12 +13,23 @@
 #define NUM_COLORS         4

 #define COLOR_STR_NVIDIA "nvidia"
+#define COLOR_STR_AMD    "amd"
+#define COLOR_STR_INTEL  "intel"

-#define COLOR_DEFAULT_NVIDIA "118,185,0:255,255,255:255,255,255:118,185,0"
+//                              +-----------------------+-----------------------+
+//                              | Color logo            | Color text            |
+//                              | Color 1   | Color 2   | Color 1   | Color 2   |
+#define COLOR_DEFAULT_NVIDIA    "118,185,000:255,255,255:255,255,255:118,185,000"
+#define COLOR_DEFAULT_AMD       "250,250,250:250,250,250:200,200,200:255,255,255"
+#define COLOR_DEFAULT_INTEL     "015,125,194:230,230,230:040,150,220:230,230,230"

 struct args_struct {
  bool help_flag;
+  bool verbose_flag;
  bool version_flag;
+  bool list_gpus;
+  bool logo_long;
+  bool logo_short;
  int gpu_idx;
  STYLE style;
  struct color** colors;
@@ -28,17 +39,25 @@ int errn = 0;
 static struct args_struct args;

 const char args_chr[] = {
-  /* [ARG_CHAR_COLOR]   = */ 'c',
-  /* [ARG_CHAR_GPU]     = */ 'g',
-  /* [ARG_CHAR_HELP]    = */ 'h',
-  /* [ARG_CHAR_VERSION] = */ 'V',
+  /* [ARG_COLOR]      = */ 'c',
+  /* [ARG_GPU]        = */ 'g',
+  /* [ARG_LIST]       = */ 'l',
+  /* [ARG_LOGO_LONG]  = */ 1,
+  /* [ARG_LOGO_SHORT] = */ 2,
+  /* [ARG_HELP]       = */ 'h',
+  /* [ARG_VERBOSE]    = */ 'v',
+  /* [ARG_VERSION]    = */ 'V',
 };

 const char *args_str[] = {
-  /* [ARG_CHAR_COLOR]   = */ "color",
-  /* [ARG_CHAR_GPU]     = */ "gpu",
-  /* [ARG_CHAR_HELP]    = */ "help",
-  /* [ARG_CHAR_VERSION] = */ "version",
+  /* [ARG_COLOR]      = */ "color",
+  /* [ARG_GPU]        = */ "gpu",
+  /* [ARG_LIST]       = */ "list-gpus",
+  /* [ARG_LOGO_LONG]  = */ "logo-long",
+  /* [ARG_LOGO_SHORT] = */ "logo-short",
+  /* [ARG_HELP]       = */ "help",
+  /* [ARG_VERBOSE]    = */ "verbose",
+  /* [ARG_VERSION]    = */ "version",
 };

 int getarg_int(char* str) {
@@ -67,20 +86,16 @@ int getarg_int(char* str) {
  return -1;
 }

-void print_getarg_error() {
+const char* getarg_error() {
  switch (errn) {
    case OVERFLOW:
-      printf("overflow detected while parsing the arguments\n");
-      break;
+      return "overflow detected";
    case UNDERFLOW:
-      printf("underflow detected while parsing the arguments\n");
-      break;
+      return "underflow detected";
    case INVALID_ARG:
-      printf("invalid argument\n");
-      break;
+      return "invalid argument";
    default:
-      printf("invalid error: %d\n", errn);
-      break;
+      return "invalid error";
  }
 }

@@ -100,10 +115,26 @@ bool show_help() {
  return args.help_flag;
 }

+bool list_gpus() {
+  return args.list_gpus;
+}
+
+bool show_logo_long() {
+  return args.logo_long;
+}
+
+bool show_logo_short() {
+  return args.logo_short;
+}
+
 bool show_version() {
  return args.version_flag;
 }

+bool verbose_enabled() {
+  return args.verbose_flag;
+}
+
 int max_arg_str_length() {
  int max_len = -1;
  int len = sizeof(args_str) / sizeof(args_str[0]);
@@ -119,8 +150,10 @@ char* build_short_options() {
  char* str = (char *) emalloc(sizeof(char) * (len*2 + 1));
  memset(str, 0, sizeof(char) * (len*2 + 1));

-  sprintf(str, "%c:%c:%c%c", c[ARG_GPU],
-  c[ARG_COLOR], c[ARG_HELP], c[ARG_VERSION]);
+  sprintf(str, "%c:%c:%c%c%c%c%c%c", c[ARG_GPU],
+  c[ARG_COLOR], c[ARG_HELP], c[ARG_LIST],
+  c[ARG_LOGO_SHORT], c[ARG_LOGO_LONG],
+  c[ARG_VERBOSE], c[ARG_VERSION]);

  return str;
 }
@@ -137,6 +170,8 @@ bool parse_color(char* optarg_str, struct color*** cs) {
  bool free_ptr = true;

  if(strcmp(optarg_str, COLOR_STR_NVIDIA) == 0) color_to_copy = COLOR_DEFAULT_NVIDIA;
+  else if(strcmp(optarg_str, COLOR_STR_AMD) == 0) color_to_copy = COLOR_DEFAULT_AMD;
+  else if(strcmp(optarg_str, COLOR_STR_INTEL) == 0) color_to_copy = COLOR_DEFAULT_INTEL;
  else {
    str_to_parse = optarg_str;
    free_ptr = false;
@@ -185,21 +220,28 @@ bool parse_args(int argc, char* argv[]) {

  args.version_flag = false;
  args.help_flag = false;
+  args.list_gpus = false;
+  args.logo_long = false;
+  args.logo_short = false;
  args.gpu_idx = 0;
  args.colors = NULL;

  const struct option long_options[] = {
-    {args_str[ARG_COLOR],   required_argument, 0, args_chr[ARG_COLOR]   },
-    {args_str[ARG_GPU],     required_argument, 0, args_chr[ARG_GPU]     },
-    {args_str[ARG_HELP],    no_argument,       0, args_chr[ARG_HELP]    },
-    {args_str[ARG_VERSION], no_argument,       0, args_chr[ARG_VERSION] },
+    {args_str[ARG_COLOR],      required_argument, 0, args_chr[ARG_COLOR]      },
+    {args_str[ARG_GPU],        required_argument, 0, args_chr[ARG_GPU]        },
+    {args_str[ARG_LIST],       no_argument,       0, args_chr[ARG_LIST]       },
+    {args_str[ARG_LOGO_SHORT], no_argument,       0, args_chr[ARG_LOGO_SHORT] },
+    {args_str[ARG_LOGO_LONG],  no_argument,       0, args_chr[ARG_LOGO_LONG]  },
+    {args_str[ARG_HELP],       no_argument,       0, args_chr[ARG_HELP]       },
+    {args_str[ARG_VERBOSE],    no_argument,       0, args_chr[ARG_VERBOSE]    },
+    {args_str[ARG_VERSION],    no_argument,       0, args_chr[ARG_VERSION]    },
    {0, 0, 0, 0}
  };

  char* short_options = build_short_options();
  opt = getopt_long(argc, argv, short_options, long_options, &option_index);

-  while (!args.help_flag && !args.version_flag && opt != -1) {
+  while (!args.help_flag && !args.version_flag && !args.list_gpus && opt != -1) {
    if(opt == args_chr[ARG_COLOR]) {
      args.colors = (struct color **) emalloc(sizeof(struct color *) * NUM_COLORS);
      if(!parse_color(optarg, &args.colors)) {
@@ -207,16 +249,38 @@ bool parse_args(int argc, char* argv[]) {
      }
    }
    else if(opt == args_chr[ARG_GPU]) {
-      args.gpu_idx = getarg_int(optarg);
-      if(errn != 0) {
-        printErr("Option %s: ", args_str[ARG_GPU]);
-        print_getarg_error();
-        args.help_flag  = true;
-        return false;
+      // Check for "a" option
+      if(strcmp(optarg, "a") == 0) {
+        args.gpu_idx = -1;
+      }
+      else {
+        args.gpu_idx = getarg_int(optarg);
+        if(errn != 0) {
+          printErr("Option %s: %s", args_str[ARG_GPU], getarg_error());
+          args.help_flag  = true;
+          return false;
+        }
+        if(args.gpu_idx < 0) {
+          printErr("Specified GPU index is out of range: %d. ", args.gpu_idx);
+          printf("Run gpufetch with the --%s option to check out valid GPU indexes\n", args_str[ARG_LIST]);
+          return false;
+        }
      }
    }
+    else if(opt == args_chr[ARG_LIST]) {
+      args.list_gpus = true;
+    }
+    else if(opt == args_chr[ARG_LOGO_SHORT]) {
+       args.logo_short = true;
+    }
+    else if(opt == args_chr[ARG_LOGO_LONG]) {
+       args.logo_long = true;
+    }
    else if(opt == args_chr[ARG_HELP]) {
-      args.help_flag  = true;
+      args.help_flag = true;
+    }
+    else if(opt == args_chr[ARG_VERBOSE]) {
+      args.verbose_flag  = true;
    }
    else if(opt == args_chr[ARG_VERSION]) {
      args.version_flag = true;
@@ -235,6 +299,12 @@ bool parse_args(int argc, char* argv[]) {
    args.help_flag  = true;
  }

+  if(args.logo_short && args.logo_long) {
+    printWarn("%s and %s cannot be specified together", args_str[ARG_LOGO_SHORT], args_str[ARG_LOGO_LONG]);
+    args.logo_short = false;
+    args.logo_long = false;
+  }
+
  if((args.help_flag + args.version_flag) > 1) {
    printWarn("You should specify just one option");
    args.help_flag  = true;
--- a/src/common/args.hpp
+++ b/src/common/args.hpp
@@ -1,7 +1,7 @@
 #ifndef __ARGS__
 #define __ARGS__

-#include <stdbool.h>
+#include <cstdint>

 struct color {
  int32_t R;
@@ -21,7 +21,11 @@ enum {
 enum {
  ARG_COLOR,
  ARG_GPU,
+  ARG_LIST,
+  ARG_LOGO_LONG,
+  ARG_LOGO_SHORT,
  ARG_HELP,
+  ARG_VERBOSE,
  ARG_VERSION
 };

@@ -33,7 +37,11 @@ extern const char *args_str[];
 int max_arg_str_length();
 bool parse_args(int argc, char* argv[]);
 bool show_help();
+bool list_gpus();
+bool show_logo_long();
+bool show_logo_short();
 bool show_version();
+bool verbose_enabled();
 void free_colors_struct(struct color** cs);
 int get_gpu_idx();
 struct color** get_colors();
--- a/src/common/ascii.hpp
+++ b/src/common/ascii.hpp
@@ -1,32 +1,7 @@
 #ifndef __ASCII__
 #define __ASCII__

-#define COLOR_NONE         ""
-#define COLOR_FG_BLACK     "\x1b[30;1m"
-#define COLOR_FG_RED       "\x1b[31;1m"
-#define COLOR_FG_GREEN     "\x1b[32;1m"
-#define COLOR_FG_YELLOW    "\x1b[33;1m"
-#define COLOR_FG_BLUE      "\x1b[34;1m"
-#define COLOR_FG_MAGENTA   "\x1b[35;1m"
-#define COLOR_FG_CYAN      "\x1b[36;1m"
-#define COLOR_FG_WHITE     "\x1b[37;1m"
-#define COLOR_BG_BLACK     "\x1b[40;1m"
-#define COLOR_BG_RED       "\x1b[41;1m"
-#define COLOR_BG_GREEN     "\x1b[42;1m"
-#define COLOR_BG_YELLOW    "\x1b[43;1m"
-#define COLOR_BG_BLUE      "\x1b[44;1m"
-#define COLOR_BG_MAGENTA   "\x1b[45;1m"
-#define COLOR_BG_CYAN      "\x1b[46;1m"
-#define COLOR_BG_WHITE     "\x1b[47;1m"
-#define COLOR_FG_B_BLACK   "\x1b[90;1m"
-#define COLOR_FG_B_RED     "\x1b[91;1m"
-#define COLOR_FG_B_GREEN   "\x1b[92;1m"
-#define COLOR_FG_B_YELLOW  "\x1b[93;1m"
-#define COLOR_FG_B_BLUE    "\x1b[94;1m"
-#define COLOR_FG_B_MAGENTA "\x1b[95;1m"
-#define COLOR_FG_B_CYAN    "\x1b[96;1m"
-#define COLOR_FG_B_WHITE   "\x1b[97;1m"
-#define COLOR_RESET        "\x1b[m"
+#include "colors.hpp"

 struct ascii_logo {
  const char* art;
@@ -59,6 +34,40 @@ $C2##   ##  ##   ##  ##  ##   ##  ##   #: :#    \
 $C2##   ##   ## ##   ##  ##   ##  ##  #######   \
 $C2##   ##    ###    ##  ######   ## ##     ##  "

+#define ASCII_AMD \
+"$C2          '###############             \
+$C2             ,#############            \
+$C2                      .####            \
+$C2              #.      .####            \
+$C2            :##.      .####            \
+$C2           :###.      .####            \
+$C2           #########.   :##            \
+$C2           #######.       ;            \
+$C1                                       \
+$C1    ###     ###      ###   #######     \
+$C1   ## ##    #####  #####   ##     ##   \
+$C1  ##   ##   ### #### ###   ##      ##  \
+$C1 #########  ###  ##  ###   ##      ##  \
+$C1##       ## ###      ###   ##     ##   \
+$C1##       ## ###      ###   #######     "
+
+#define ASCII_INTEL \
+"$C1                   .#################.          \
+$C1              .####                   ####.     \
+$C1          .##                             ###   \
+$C1       ##                          :##     ###  \
+$C1    #                ##            :##      ##  \
+$C1  ##   ##  ######.   ####  ######  :##      ##  \
+$C1 ##    ##  ##:  ##:  ##   ##   ### :##     ###  \
+$C1##     ##  ##:  ##:  ##  :######## :##    ##    \
+$C1##     ##  ##:  ##:  ##   ##.   .  :## ####     \
+$C1##      #  ##:  ##:  ####  #####:   ##          \
+$C1 ##                                             \
+$C1  ###.                         ..o####.         \
+$C1   ######oo...         ..oo#######              \
+$C1          o###############o                     "
+
+// LONG LOGOS
 #define ASCII_NVIDIA_L \
 "$C1                  MMMMMMMMMMMMMMMMMMMMMMMMMMMMMM  \
 $C1                  MMMMMMMMMMMMMMMMMMMMMMMMMMMMMM  \
@@ -76,14 +85,60 @@ $C1            olcc::;              ,:ccloMMMMMMMMM  \
 $C1                  :......oMMMMMMMMMMMMMMMMMMMMMM  \
 $C1                  :lllMMMMMMMMMMMMMMMMMMMMMMMMMM  "

+#define ASCII_AMD_L \
+"$C1                                                              \
+$C1                                                              \
+$C1                                                              \
+$C1                                                              \
+$C1                                                              \
+$C1                                                              \
+$C1     @@@@      @@@       @@@   @@@@@@@@      $C2  ############   \
+$C1    @@@@@@     @@@@@   @@@@@   @@@    @@@    $C2    ##########   \
+$C1   @@@  @@@    @@@@@@@@@@@@@   @@@      @@   $C2   #     #####   \
+$C1  @@@    @@@   @@@  @@@  @@@   @@@      @@   $C2 ###     #####   \
+$C1 @@@@@@@@@@@@  @@@       @@@   @@@    @@@    $C2#########  ###   \
+$C1 @@@      @@@  @@@       @@@   @@@@@@@@@     $C2########    ##   \
+$C1                                                              \
+$C1                                                              \
+$C1                                                              \
+$C1                                                              \
+$C1                                                              \
+$C1                                                              \
+$C1                                                              "
+
+#define ASCII_INTEL_L \
+"$C1                               ###############@               \
+$C1                       ######@                ######@         \
+$C1                  ###@                              ###@      \
+$C1              ##@                                     ###@    \
+$C1         ##@                                             ##@  \
+$C1         ##@                                             ##@  \
+$C1      @                    ##@                ##@        ##@  \
+$C1    #@   ##@   ########@   #####@   #####@    ##@        ##@  \
+$C1   #@    ##@   ##@    ##@  ##@    ###@  ###@  ##@        ##@  \
+$C1  #@     ##@   ##@    ##@  ##@    ##@    ##@  ##@       ##@   \
+$C1 #@      ##@   ##@    ##@  ##@    #########@  ##@     ###@    \
+$C1 #@      ##@   ##@    ##@  ##@    ##@         ##@   ####@     \
+$C1 #@       #@   ##@    ##@   ####@  ########@   #@  ##@        \
+$C1 ##@                                                          \
+$C1  ##@                                                         \
+$C1  ###@                                        ###@            \
+$C1    ####@                               #########@            \
+$C1      #########@               ###############@               \
+$C1          ##############################@                     "
+
 typedef struct ascii_logo asciiL;

-//                      ------------------------------------------------------------------------------------------------------
-//                      | LOGO          | W | H | REPLACE | COLORS LOGO (>0 && <10)        | COLORS TEXT (=2)                |
-//                      ------------------------------------------------------------------------------------------------------
-asciiL logo_nvidia    = { ASCII_NVIDIA,    45, 19, false, {COLOR_FG_GREEN, COLOR_FG_WHITE}, {COLOR_FG_WHITE, COLOR_FG_GREEN} };
-// Long variants        | ---------------------------------------------------------------------------------------------------|
-asciiL logo_nvidia_l  = { ASCII_NVIDIA_L,  50, 15, false, {COLOR_FG_GREEN, COLOR_FG_WHITE}, {COLOR_FG_WHITE, COLOR_FG_GREEN} };
-asciiL logo_unknown   = { NULL,            0,  0,  false, {COLOR_NONE},                     {COLOR_NONE,    COLOR_NONE}      };
+//                      ------------------------------------------------------------------------------------------
+//                      | LOGO            | W | H | REPLACE | COLORS LOGO           | COLORS TEXT                |
+//                      ------------------------------------------------------------------------------------------
+asciiL logo_nvidia    = { ASCII_NVIDIA,    45, 19, false, {C_FG_GREEN, C_FG_WHITE}, {C_FG_WHITE, C_FG_GREEN}   };
+asciiL logo_amd       = { ASCII_AMD,       39, 15, false, {C_FG_WHITE, C_FG_GREEN}, {C_FG_WHITE, C_FG_GREEN}   };
+asciiL logo_intel     = { ASCII_INTEL,     48, 14, false, {C_FG_CYAN},              {C_FG_CYAN,  C_FG_WHITE}   };
+// Long variants        | ---------------------------------------------------------------------------------------|
+asciiL logo_nvidia_l  = { ASCII_NVIDIA_L,  50, 15, false, {C_FG_GREEN, C_FG_WHITE}, {C_FG_WHITE, C_FG_GREEN}   };
+asciiL logo_amd_l     = { ASCII_AMD_L,     62, 19, true,  {C_BG_WHITE, C_BG_WHITE}, {C_FG_CYAN,  C_FG_B_WHITE} };
+asciiL logo_intel_l   = { ASCII_INTEL_L,   62, 19, true,  {C_BG_CYAN, C_BG_WHITE},  {C_FG_CYAN,  C_FG_WHITE}   };
+asciiL logo_unknown   = { NULL,            0,  0,  false, {C_NONE},                 {C_NONE,     C_NONE}       };

 #endif
--- a/src/common/colors.hpp
+++ b/src/common/colors.hpp
@@ -0,0 +1,31 @@
+#ifndef __COLORS__
+#define __COLORS__
+// ANSI SGR escape sequences: 3x = foreground, 4x = background, 9x = bright foreground; ";1" adds bold.
+#define C_NONE         ""
+#define C_FG_BLACK     "\x1b[30;1m"
+#define C_FG_RED       "\x1b[31;1m"
+#define C_FG_GREEN     "\x1b[32;1m"
+#define C_FG_YELLOW    "\x1b[33;1m"
+#define C_FG_BLUE      "\x1b[34;1m"
+#define C_FG_MAGENTA   "\x1b[35;1m"
+#define C_FG_CYAN      "\x1b[36;1m"
+#define C_FG_WHITE     "\x1b[37;1m"
+#define C_BG_BLACK     "\x1b[40;1m"
+#define C_BG_RED       "\x1b[41;1m"
+#define C_BG_GREEN     "\x1b[42;1m"
+#define C_BG_YELLOW    "\x1b[43;1m"
+#define C_BG_BLUE      "\x1b[44;1m"
+#define C_BG_MAGENTA   "\x1b[45;1m"
+#define C_BG_CYAN      "\x1b[46;1m"
+#define C_BG_WHITE     "\x1b[47;1m"
+#define C_FG_B_BLACK   "\x1b[90;1m"
+#define C_FG_B_RED     "\x1b[91;1m"
+#define C_FG_B_GREEN   "\x1b[92;1m"
+#define C_FG_B_YELLOW  "\x1b[93;1m"
+#define C_FG_B_BLUE    "\x1b[94;1m"
+#define C_FG_B_MAGENTA "\x1b[95;1m"
+#define C_FG_B_CYAN    "\x1b[96;1m"
+#define C_FG_B_WHITE   "\x1b[97;1m"
+#define C_RESET        "\x1b[m"
+
+#endif
--- a/src/common/global.cpp
+++ b/src/common/global.cpp
@@ -1,8 +1,8 @@
-#include <stdarg.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <errno.h>
+#include <cstdarg>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <cerrno>

 #include "global.hpp"

@@ -26,6 +26,7 @@ enum {
 };

 int LOG_LEVEL;
+bool clean;

 void printWarn(const char *fmt, ...) {
  if(LOG_LEVEL == LOG_LEVEL_VERBOSE) {
@@ -37,6 +38,7 @@ void printWarn(const char *fmt, ...) {
    va_end(args);
    fprintf(stderr, BOLD "[WARNING]: " RESET "%s\n",buffer);
    delete [] buffer;
+    clean = false;
  }
 }

@@ -49,6 +51,7 @@ void printErr(const char *fmt, ...) {
  va_end(args);
  fprintf(stderr, RED "[ERROR]: " RESET "%s\n",buffer);
  delete [] buffer;
+  clean = false;
 }

 void printBug(const char *fmt, ...) {
@@ -61,11 +64,17 @@ void printBug(const char *fmt, ...) {
  fprintf(stderr, RED "[ERROR]: " RESET "%s\n",buffer);
  fprintf(stderr,"Please, create a new issue with this error message on https://github.com/Dr-Noob/gpufetch/issues\n");
  delete [] buffer;
+  clean = false;
 }

 void set_log_level(bool verbose) {
  if(verbose) LOG_LEVEL = LOG_LEVEL_VERBOSE;
  else LOG_LEVEL = LOG_LEVEL_NORMAL;
+  clean = true;
+}
+
+bool clean_output() {
+  return clean;
 }

 int max(int a, int b) {
--- a/src/common/global.hpp
+++ b/src/common/global.hpp
@@ -1,8 +1,6 @@
 #ifndef __GLOBAL__
 #define __GLOBAL__

-#include <stdbool.h>
-#include <stddef.h>
 #include <cstddef>

 #define STRING_UNKNOWN "Unknown"
@@ -11,6 +9,7 @@ void set_log_level(bool verbose);
 void printWarn(const char *fmt, ...);
 void printErr(const char *fmt, ...);
 void printBug(const char *fmt, ...);
+bool clean_output();
 int max(int a, int b);
 int min(int a, int b);
 void* emalloc(size_t size);
--- a/src/common/gpu.cpp
+++ b/src/common/gpu.cpp
@@ -16,6 +16,9 @@
 #define STRING_KILOBYTES  "KB"
 #define STRING_MEGABYTES  "MB"
 #define STRING_GIGABYTES  "GB"
+#define STRING_KIBIBYTES  "KiB"
+#define STRING_MEBIBYTES  "MiB"
+#define STRING_GIBIBYTES  "GiB"

 static const char *memtype_str[] = {
  /*[MEMTYPE_UNKNOWN] = */ STRING_UNKNOWN,
@@ -32,19 +35,17 @@ VENDOR get_gpu_vendor(struct gpu_info* gpu) {
  return gpu->vendor;
 }

-double trunc(double val) { return ((int)(100 * val)) / 100.0; }
-
 int32_t get_value_as_smallest_unit(char ** str, uint64_t value) {
  int32_t ret;
  int max_len = 10; // Max is 8 for digits, 2 for units
  *str = (char *) emalloc(sizeof(char)* (max_len + 1));

  if(value/1024 >= (1 << 20))
-    ret = snprintf(*str, max_len, "%.4g " STRING_GIGABYTES, trunc((double)value/(1<<30)));
+    ret = snprintf(*str, max_len, "%.0f " STRING_GIBIBYTES, round((double)value/(1<<30)));
  else if(value/1024 >= (1 << 10))
-    ret = snprintf(*str, max_len, "%.4g " STRING_MEGABYTES, trunc((double)value/(1<<20)));
+    ret = snprintf(*str, max_len, "%.0f " STRING_MEBIBYTES, round((double)value/(1<<20)));
  else
-    ret = snprintf(*str, max_len, "%.4g " STRING_KILOBYTES, trunc((double)value/(1<<10)));
+    ret = snprintf(*str, max_len, "%.0f " STRING_KIBIBYTES, round((double)value/(1<<10)));

  return ret;
 }
@@ -100,6 +101,17 @@ char* get_str_bus_width(struct gpu_info* gpu) {
  return string;
 }

+char* get_str_lds_size(struct gpu_info* gpu) { // Returns the LDS size formatted as "<N> KB"
+  // TODO: Show XX KB (XX MB Total) like in cpufetch
+  // Room for any int32_t ("-2147483648" is 11 chars), " KB" and the terminator
+  uint32_t size = 11+3+1;
+  assert(strlen(STRING_UNKNOWN)+1 <= size);
+  char* string = (char *) ecalloc(size, sizeof(char));
+  // lds_size is divided by 1024 before printing; snprintf keeps the write bounded
+  snprintf(string, size, "%d KB", gpu->mem->lds_size / 1024);
+  return string;
+}
+
 char* get_str_memory_clock(struct gpu_info* gpu) {
  return get_freq_as_str_mhz(gpu->mem->freq);
 }
@@ -116,17 +128,17 @@ char* get_str_l2(struct gpu_info* gpu) {
  return string;
 }

-char* get_str_peak_performance(struct gpu_info* gpu) {
+char* get_str_peak_performance_generic(int64_t pp) {
  char* str;

-  if(gpu->peak_performance == -1) {
+  if(pp == -1) {
    str = (char *) emalloc(sizeof(char) * (strlen(STRING_UNKNOWN) + 1));
    strncpy(str, STRING_UNKNOWN, strlen(STRING_UNKNOWN) + 1);
    return str;
  }

  // 7 for digits (e.g, XXXX.XX), 7 for XFLOP/s
-  double flopsd = (double) gpu->peak_performance;
+  double flopsd = (double) pp;
  uint32_t max_size = 7+1+7+1;
  str = (char *) ecalloc(max_size, sizeof(char));

@@ -139,3 +151,27 @@ char* get_str_peak_performance(struct gpu_info* gpu) {

  return str;
 }
+
+char* get_str_peak_performance(struct gpu_info* gpu) {
+  return get_str_peak_performance_generic(gpu->peak_performance);
+}
+
+char* get_str_peak_performance_tensor(struct gpu_info* gpu) {
+  return get_str_peak_performance_generic(gpu->peak_performance_tcu);
+}
+
+char* get_str_generic(int32_t data) {
+  // Negative input: hand back a copy of STRING_UNKNOWN instead of a number
+  if(data < 0) {
+    size_t unknown_size = strlen(STRING_UNKNOWN) + 1;
+    char* unknown = (char *) emalloc(sizeof(char) * unknown_size);
+    strncpy(unknown, STRING_UNKNOWN, unknown_size);
+    return unknown;
+  }
+
+  // An int32_t needs at most 10 digits, +1 for a sign, +1 for the NUL terminator
+  const uint32_t buf_size = 12;
+  char* text = (char *) ecalloc(buf_size, sizeof(char));
+  snprintf(text, buf_size, "%d", data);
+  return text;
+}
--- a/src/common/gpu.hpp
+++ b/src/common/gpu.hpp
@@ -1,16 +1,14 @@
 #ifndef __GPU__
 #define __GPU__

-#include <stdint.h>
-#include <stdbool.h>
-
-#include "../cuda/nvmlb.hpp"
-#include "../cuda/pci.hpp"
+#include <cstdint>

 #define UNKNOWN_FREQ -1

 enum {
-  GPU_VENDOR_NVIDIA
+  GPU_VENDOR_NVIDIA,
+  GPU_VENDOR_AMD,
+  GPU_VENDOR_INTEL
 };

 enum {
@@ -37,10 +35,28 @@ struct cache {
  struct cach* L2;
 };

-struct topology {
+// CUDA topology
+struct topology_c {
  int32_t streaming_mp;
  int32_t cores_per_mp;
  int32_t cuda_cores;
+  int32_t tensor_cores;
+};
+
+// HSA topology
+struct topology_h {
+  int32_t compute_units;
+  int32_t num_shader_engines;
+  int32_t simds_per_cu;
+  int32_t num_xcc;
+  int32_t matrix_cores;
+};
+
+// Intel topology
+struct topology_i {
+  int32_t slices;
+  int32_t subslices;
+  int32_t eu_subslice;
 };

 struct memory {
@@ -49,20 +65,26 @@ struct memory {
  int32_t bus_width;
  int32_t freq;
  int32_t clk_mul; // clock multiplier
+  int32_t lds_size; // HSA specific for now
 };

 struct gpu_info {
+  int32_t idx;
  VENDOR vendor;
  struct uarch* arch;
  char* name;
  int64_t freq;
  struct pci* pci;
-  struct nvml_data* nvmld;
-  struct topology* topo;
+  int64_t peak_performance;
+  // CUDA specific
+  int64_t peak_performance_tcu;
  struct memory* mem;
  struct cache* cach;
-  int64_t peak_performance;
-  int32_t idx;
+  struct topology_c* topo_c;
+  // HSA specific
+  struct topology_h* topo_h;
+  // Intel specific
+  struct topology_i* topo_i;
 };

 VENDOR get_gpu_vendor(struct gpu_info* gpu);
@@ -71,8 +93,11 @@ char* get_str_freq(struct gpu_info* gpu);
 char* get_str_memory_size(struct gpu_info* gpu);
 char* get_str_memory_type(struct gpu_info* gpu);
 char* get_str_bus_width(struct gpu_info* gpu);
+char* get_str_lds_size(struct gpu_info* gpu);
 char* get_str_memory_clock(struct gpu_info* gpu);
 char* get_str_l2(struct gpu_info* gpu);
 char* get_str_peak_performance(struct gpu_info* gpu);
+char* get_str_peak_performance_tensor(struct gpu_info* gpu);
+char* get_str_generic(int32_t data);

 #endif
--- a/src/common/main.cpp
+++ b/src/common/main.cpp
@@ -1,13 +1,18 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>

 #include "args.hpp"
 #include "global.hpp"
+#include "master.hpp"
 #include "../cuda/cuda.hpp"
 #include "../cuda/uarch.hpp"

-static const char* VERSION = "0.10";
+#ifdef BACKEND_USE_PCI
+#include "pci.hpp"
+#endif
+
+static const char* VERSION = "0.30";

 void print_help(char *argv[]) {
  const char **t = args_str;
@@ -18,10 +23,14 @@ void print_help(char *argv[]) {
  printf("Simple yet fancy GPU architecture fetching tool\n\n");

  printf("Options: \n");
-  printf("  -%c, --%s %*s Sets the color scheme (by default, gpufetch uses the system color scheme) See COLORS section for a more detailed explanation\n", c[ARG_COLOR], t[ARG_COLOR], (int) (max_len-strlen(t[ARG_COLOR])), "");
-  printf("  -%c, --%s %*s Selects the GPU to use (default: 0)\n", c[ARG_GPU], t[ARG_GPU], (int) (max_len-strlen(t[ARG_GPU])), "");
-  printf("  -%c, --%s %*s Prints this help and exit\n", c[ARG_HELP], t[ARG_HELP], (int) (max_len-strlen(t[ARG_HELP])), "");
-  printf("  -%c, --%s %*s Prints gpufetch version and exit\n", c[ARG_VERSION], t[ARG_VERSION], (int) (max_len-strlen(t[ARG_VERSION])), "");
+  printf("  -%c, --%s %*s Set the color scheme (by default, gpufetch uses the system color scheme) See COLORS section for a more detailed explanation\n", c[ARG_COLOR], t[ARG_COLOR], (int) (max_len-strlen(t[ARG_COLOR])), "");
+  printf("  -%c, --%s %*s List the available GPUs in the system\n", c[ARG_LIST], t[ARG_LIST], (int) (max_len-strlen(t[ARG_LIST])), "");
+  printf("  -%c, --%s %*s Select the GPU to print (default: 0). Use 'a' to print all GPUs\n", c[ARG_GPU], t[ARG_GPU], (int) (max_len-strlen(t[ARG_GPU])), "");
+  printf("      --%s %*s Show the short version of the logo\n", t[ARG_LOGO_SHORT], (int) (max_len-strlen(t[ARG_LOGO_SHORT])), "");
+  printf("      --%s %*s Show the long version of the logo\n", t[ARG_LOGO_LONG], (int) (max_len-strlen(t[ARG_LOGO_LONG])), "");
+  printf("  -%c, --%s %*s Enable verbose output\n", c[ARG_VERBOSE], t[ARG_VERBOSE], (int) (max_len-strlen(t[ARG_VERBOSE])), "");
+  printf("  -%c, --%s %*s Print this help and exit\n", c[ARG_HELP], t[ARG_HELP], (int) (max_len-strlen(t[ARG_HELP])), "");
+  printf("  -%c, --%s %*s Print gpufetch version and exit\n", c[ARG_VERSION], t[ARG_VERSION], (int) (max_len-strlen(t[ARG_VERSION])), "");

  printf("\nCOLORS: \n");
  printf("  Color scheme can be set using a predefined color scheme or a custom one:\n");
@@ -64,19 +73,50 @@ int main(int argc, char* argv[]) {
    return EXIT_SUCCESS;
  }

-  set_log_level(true);
+  set_log_level(verbose_enabled());

-  printWarn("gpufetch is in beta. The provided information may be incomplete or wrong.\n\
-If you want to help to improve gpufetch, please compare the output of the program\n\
-with a reliable source which you know is right (e.g, techpowerup.com) and report\n\
-any inconsistencies to https://github.com/Dr-Noob/gpufetch/issues");
+  int idx = get_gpu_idx();
+
+  struct gpu_list* list = get_gpu_list();
+  if(list_gpus()) {
+    return print_gpus_list(list);
+  }
+
+  if(get_num_gpus_available(list) == 0) {
+#ifdef BACKEND_USE_PCI    
+    printErr("No GPU was detected! Available GPUs are:");
+    print_gpus_list_pci();
+#else
+    printErr("No GPU was detected!");
+#endif    
+    printf("Please, make sure that the appropiate backend is enabled:\n");
+    print_enabled_backends();
+    printf("Visit https://github.com/Dr-Noob/gpufetch#2-backends for more information\n");

-  struct gpu_info* gpu = get_gpu_info(get_gpu_idx());
-  if(gpu == NULL)
    return EXIT_FAILURE;
+  }

-  if(print_gpufetch(gpu, get_style(), get_colors()))
-    return EXIT_SUCCESS;
-  else
-    return EXIT_FAILURE;
+  int first_idx, last_idx;
+  if(idx == -1) {
+    first_idx = 0;
+    last_idx = get_num_gpus_available(list);
+  }
+  else {
+    first_idx = idx;
+    last_idx = idx+1;
+  }
+
+  struct gpu_info* gpu = NULL;
+  for(int gpu_idx = first_idx; gpu_idx < last_idx; gpu_idx++) {
+    gpu = get_gpu_info(list, gpu_idx);
+    if(gpu == NULL) {
+      return EXIT_FAILURE;
+    }
+
+    if(!print_gpufetch(gpu, get_style(), get_colors())) {
+      return EXIT_FAILURE;
+    }
+  }
+
+  return EXIT_SUCCESS;
 }
--- a/src/common/master.cpp
+++ b/src/common/master.cpp
@@ -0,0 +1,122 @@
+#include <cstdlib>
+#include <cstdio>
+
+#ifdef BACKEND_USE_PCI
+  #include "pci.hpp"
+#endif  
+
+#include "global.hpp"
+#include "colors.hpp"
+#include "master.hpp"
+#include "args.hpp"
+#include "../cuda/cuda.hpp"
+#include "../hsa/hsa.hpp"
+#include "../intel/intel.hpp"
+
+#define MAX_GPUS 1000
+
+struct gpu_list {
+  struct gpu_info ** gpus; // Array of GPU descriptors (get_gpu_list allocates MAX_GPUS slots)
+  int num_gpus;            // Number of entries filled in by get_gpu_list
+};
+
+struct gpu_list* get_gpu_list() {
+  // Builds the list of GPUs reported by every backend compiled in, in CUDA, HSA,
+  // Intel order. list->num_gpus is the next free slot as well as the GPU count.
+#ifdef BACKEND_USE_PCI
+  struct pci_dev *devices = get_pci_devices_from_pciutils();
+#endif
+  struct gpu_list* list = (struct gpu_list*) emalloc(sizeof(struct gpu_list));
+  list->num_gpus = 0;
+  list->gpus = (struct gpu_info**) emalloc(sizeof(struct gpu_info*) * MAX_GPUS);
+
+#ifdef BACKEND_CUDA
+  // Ask the backend for device 0, 1, 2... until it returns NULL or the list is full.
+  // The loop variables are scoped to the loop so several backends can be enabled at once.
+  for(int dev = 0; list->num_gpus < MAX_GPUS; dev++) {
+    struct gpu_info* gpu = get_gpu_info_cuda(devices, dev);
+    if(gpu == NULL) break;
+    list->gpus[list->num_gpus++] = gpu;
+  }
+#endif
+
+#ifdef BACKEND_HSA
+  // Same scheme. The device index restarts at 0 (it is assumed to be local to the
+  // backend), while the list slot continues after whatever earlier backends added.
+  for(int dev = 0; list->num_gpus < MAX_GPUS; dev++) {
+    struct gpu_info* gpu = get_gpu_info_hsa(dev);
+    if(gpu == NULL) break;
+    list->gpus[list->num_gpus++] = gpu;
+  }
+#endif
+
+#ifdef BACKEND_INTEL
+  // The Intel backend takes no device index and is queried once
+  if(list->num_gpus < MAX_GPUS) {
+    list->gpus[list->num_gpus] = get_gpu_info_intel(devices);
+    if(list->gpus[list->num_gpus] != NULL) list->num_gpus++;
+  }
+#endif
+
+  return list;
+}
+
+bool print_gpus_list(struct gpu_list* list) { // Prints one "GPU <i>: ..." entry per GPU in the list
+  for(int i=0; i < list->num_gpus; i++) {
+    printf("GPU %d: ", i);
+    if(list->gpus[i]->vendor == GPU_VENDOR_NVIDIA) {
+      #ifdef BACKEND_CUDA
+        print_gpu_cuda(list->gpus[i]);
+      #endif
+    }
+    else if(list->gpus[i]->vendor == GPU_VENDOR_AMD) { // AMD GPUs are handled by the HSA backend (BACKEND_HSA)
+      #ifdef BACKEND_HSA
+        print_gpu_hsa(list->gpus[i]);
+      #endif
+    }
+    else if(list->gpus[i]->vendor == GPU_VENDOR_INTEL) {
+      #ifdef BACKEND_INTEL
+        print_gpu_intel(list->gpus[i]);
+      #endif
+    }
+  }
+  // There is no failure path: the function always reports success
+  return true;
+}
+
+void print_enabled_backends() { // Prints an ON/OFF line per backend, decided at compile time by the BACKEND_* macros
+  printf("- CUDA backend:  ");
+#ifdef BACKEND_CUDA
+  printf("%sON%s\n", C_FG_GREEN, C_RESET);
+#else
+  printf("%sOFF%s\n", C_FG_RED, C_RESET);
+#endif
+
+  printf("- HSA backend:   ");
+#ifdef BACKEND_HSA
+  printf("%sON%s\n", C_FG_GREEN, C_RESET);
+#else
+  printf("%sOFF%s\n", C_FG_RED, C_RESET);
+#endif
+
+  printf("- Intel backend: ");
+#ifdef BACKEND_INTEL
+  printf("%sON%s\n", C_FG_GREEN, C_RESET);
+#else
+  printf("%sOFF%s\n", C_FG_RED, C_RESET);
+#endif
+}
+
+int get_num_gpus_available(struct gpu_list* list) { // Number of GPUs stored in the list (its num_gpus field)
+  return list->num_gpus;
+}
+
+struct gpu_info* get_gpu_info(struct gpu_list* list, int idx) {
+  // Returns the idx-th entry of the list, or NULL (after reporting it) when idx is out of range
+  bool in_range = idx >= 0 && idx < list->num_gpus;
+  if(in_range) return list->gpus[idx];
+  printErr("Specified GPU index is out of range: %d", idx);
+  printf("Run gpufetch with the --%s option to check out valid GPU indexes\n", args_str[ARG_LIST]);
+  return NULL;
+}
+
--- a/src/common/master.hpp
+++ b/src/common/master.hpp
@@ -0,0 +1,14 @@
+#ifndef __GPU_LIST__
+#define __GPU_LIST__
+
+#include "gpu.hpp"
+
+struct gpu_list;
+
+struct gpu_list* get_gpu_list();
+bool print_gpus_list(struct gpu_list* list);
+int get_num_gpus_available(struct gpu_list* list);
+void print_enabled_backends();
+struct gpu_info* get_gpu_info(struct gpu_list* list, int idx);
+
+#endif
--- a/src/common/pci.cpp
+++ b/src/common/pci.cpp
@@ -0,0 +1,122 @@
+#include "sort.hpp"
+#include "global.hpp"
+#include "pci.hpp"
+#include "../cuda/pci.hpp"
+#include "../intel/pci.hpp"
+
+#include <cstdio>
+#include <cstddef>
+
+// https://pci-ids.ucw.cz/read/PD
+// TODO: Move AMD PCI id when possible
+#define PCI_VENDOR_ID_AMD    0x1002
+#define CLASS_VGA_CONTROLLER 0x0300
+#define CLASS_3D_CONTROLLER  0x0302
+
+void debug_devices(struct pci_dev *devices) {
+  // Debug helper: prints domain:bus:dev.func for at most the first 5 devices of the list
+  struct pci_dev *cur = devices;
+  for(int printed = 0; printed < 5 && cur != NULL; printed++, cur = cur->next) {
+    printf("%04x:%02x:%02x.%d\n", cur->domain, cur->bus, cur->dev, cur->func);
+  }
+}
+
+bool pciutils_is_vendor_id_present(struct pci_dev *devices, int id) {
+  // True when the list holds at least one VGA/3D controller whose vendor id is `id`
+  for(struct pci_dev *cur=devices; cur != NULL; cur=cur->next) {
+    bool is_gpu = cur->device_class == CLASS_VGA_CONTROLLER || cur->device_class == CLASS_3D_CONTROLLER;
+    if(is_gpu && cur->vendor_id == id) return true;
+  }
+
+  return false;
+}
+
+uint16_t pciutils_get_pci_device_id(struct pci_dev *devices, int id, int idx) {
+  // Device id of the idx-th (0-based) VGA/3D controller of vendor `id`; prints an error and returns 0 if there is none
+  int curr = 0;
+  for(struct pci_dev *dev=devices; dev != NULL; dev=dev->next) {
+    if(dev->vendor_id == id && (dev->device_class == CLASS_VGA_CONTROLLER || dev->device_class == CLASS_3D_CONTROLLER)) {
+      if(curr == idx) {
+        return dev->device_id;
+      }
+      curr++;
+    }
+  }
+  // `id` is the vendor id that was searched for, so the message reports it as such
+  printErr("Unable to find a valid device for vendor id 0x%.4X with idx %d using pciutils", id, idx);
+  return 0;
+}
+
+void pciutils_set_pci_bus(struct pci* pci, struct pci_dev *devices, int id) {
+  // Copies domain/bus/dev/func from the last VGA/3D controller of vendor `id` found in the list
+  struct pci_dev *match = NULL;
+  for(struct pci_dev *cur=devices; cur != NULL; cur=cur->next) {
+    bool is_gpu = cur->device_class == CLASS_VGA_CONTROLLER || cur->device_class == CLASS_3D_CONTROLLER;
+    if(is_gpu && cur->vendor_id == id) match = cur;
+  }
+  if(match != NULL) {
+    pci->domain = match->domain;
+    pci->bus = match->bus;
+    pci->dev = match->dev;
+    pci->func = match->func;
+  }
+  else printErr("Unable to find a valid device for id 0x%.4X using pciutils", id);
+}
+
+struct pci* get_pci_from_pciutils(struct pci_dev *devices, int id, int idx) {
+  // Returns a struct pci for the idx-th GPU of vendor `id`, or NULL when `devices`
+  // holds no VGA/3D controller of that vendor.
+  // TODO: Refactor this; instead of 2xGet + 1xSet, do it better
+  if(!pciutils_is_vendor_id_present(devices, id)) {
+    return NULL; // Checked before allocating so this path does not leak
+  }
+  struct pci* pci = (struct pci*) emalloc(sizeof(struct pci));
+  pci->vendor_id = id;
+  pci->device_id = pciutils_get_pci_device_id(devices, id, idx);
+  // NOTE(review): pciutils_set_pci_bus() takes no idx, so the bus fields may not match idx
+  pciutils_set_pci_bus(pci, devices, id);
+  return pci;
+}
+
+struct pci_dev *get_pci_devices_from_pciutils() {
+  // Scans the PCI bus with pciutils and returns the resulting device list.
+  // The pci_access handle is not released here; presumably the returned list belongs to it.
+  struct pci_access *access = pci_alloc();
+  pci_init(access);
+  pci_scan_bus(access);
+
+  // Request ident, base address and class info for every device found
+  const int wanted = PCI_FILL_IDENT | PCI_FILL_BASES | PCI_FILL_CLASS;
+  for(struct pci_dev *cur = access->devices; cur != NULL; cur = cur->next) {
+    pci_fill_info(cur, wanted);
+  }
+
+  sort_pci_devices(&access->devices);
+  return access->devices;
+}
+
+void print_gpus_list_pci() {
+  // Lists every VGA/3D controller that pciutils reports, numbering them from 0
+  // and printing the vendor name (when known) and the vendor:device PCI id.
+  int gpu_count = 0;
+
+  for(struct pci_dev *cur = get_pci_devices_from_pciutils(); cur != NULL; cur = cur->next) {
+    // Only the VGA and 3D controller classes are listed
+    if(cur->device_class != CLASS_VGA_CONTROLLER && cur->device_class != CLASS_3D_CONTROLLER) {
+      continue;
+    }
+
+    // Map the PCI vendor id to a printable name
+    const char *vendor_name = "Unknown";
+    if(cur->vendor_id == PCI_VENDOR_ID_NVIDIA) vendor_name = "NVIDIA";
+    else if(cur->vendor_id == PCI_VENDOR_ID_INTEL) vendor_name = "Intel";
+    else if(cur->vendor_id == PCI_VENDOR_ID_AMD) vendor_name = "AMD";
+
+    // Entry layout: index line, vendor line, PCI id line
+    printf("- GPU %d:\n", gpu_count);
+    printf("  * Vendor: ");
+    printf("%s", vendor_name);
+    printf("\n  * PCI id: %.4x:%.4x\n", cur->vendor_id, cur->device_id);
+    gpu_count++;
+  }
+}
--- a/src/common/pci.hpp
+++ b/src/common/pci.hpp
@@ -0,0 +1,22 @@
+#ifndef __GPUFETCH_PCI__
+#define __GPUFETCH_PCI__
+
+#include <cstdint>
+extern "C" {
+  #include <pci/pci.h>
+}
+
+struct pci {
+  uint16_t vendor_id; // PCI vendor id
+  uint16_t device_id; // PCI device id
+  uint16_t domain;    // Device address, copied from pciutils' pci_dev: domain
+  uint16_t bus;       //   bus
+  uint16_t dev;       //   device
+  uint16_t func;      //   function
+};
+
+struct pci* get_pci_from_pciutils(struct pci_dev *devices, int id, int idx);
+struct pci_dev *get_pci_devices_from_pciutils();
+void print_gpus_list_pci();
+
+#endif
--- a/src/common/printer.cpp
+++ b/src/common/printer.cpp
@@ -1,14 +1,17 @@
-#include <stdlib.h>
-#include <string.h>
-#include <stdio.h>
-#include <stdbool.h>
-#include <errno.h>
+#include <cstdlib>
+#include <cstring>
+#include <cstdio>
+#include <cerrno>

 #include "printer.hpp"
 #include "ascii.hpp"
 #include "../common/global.hpp"
 #include "../common/gpu.hpp"

+#include "../intel/uarch.hpp"
+#include "../intel/intel.hpp"
+#include "../hsa/hsa.hpp"
+#include "../hsa/uarch.hpp"
 #include "../cuda/cuda.hpp"
 #include "../cuda/uarch.hpp"

@@ -29,52 +32,60 @@
 #define MAX_ATTRIBUTES      100
 #define MAX_TERM_SIZE       1024

+typedef struct {
+  int id;
+  const char *name;
+  const char *shortname;
+} AttributeField;
+
+// AttributeField IDs
+//                         Used by
 enum {
-  ATTRIBUTE_NAME,
-  ATTRIBUTE_CHIP,
-  ATTRIBUTE_UARCH,
-  ATTRIBUTE_TECHNOLOGY,
-  ATTRIBUTE_FREQUENCY,
-  ATTRIBUTE_STREAMINGMP,
-  ATTRIBUTE_CORESPERMP,
-  ATTRIBUTE_CUDA_CORES,
-  ATTRIBUTE_L2,
-  ATTRIBUTE_MEMORY,
-  ATTRIBUTE_MEMORY_FREQ,
-  ATTRIBUTE_BUS_WIDTH,
-  ATTRIBUTE_PEAK
+  ATTRIBUTE_NAME,          // ALL
+  ATTRIBUTE_CHIP,          // ALL
+  ATTRIBUTE_UARCH,         // ALL
+  ATTRIBUTE_TECHNOLOGY,    // ALL
+  ATTRIBUTE_FREQUENCY,     // ALL
+  ATTRIBUTE_PEAK,          // ALL
+  ATTRIBUTE_COMPUTE_UNITS, // HSA
+  ATTRIBUTE_MATRIX_CORES,  // HSA
+  ATTRIBUTE_XCDS,          // HSA
+  ATTRIBUTE_LDS_SIZE,      // HSA
+  ATTRIBUTE_STREAMINGMP,   // CUDA
+  ATTRIBUTE_CORESPERMP,    // CUDA
+  ATTRIBUTE_CUDA_CORES,    // CUDA
+  ATTRIBUTE_TENSOR_CORES,  // CUDA
+  ATTRIBUTE_L2,            // CUDA
+  ATTRIBUTE_MEMORY,        // CUDA,HSA
+  ATTRIBUTE_MEMORY_FREQ,   // CUDA
+  ATTRIBUTE_BUS_WIDTH,     // CUDA,HSA
+  ATTRIBUTE_PEAK_TENSOR,   // CUDA
+  ATTRIBUTE_EUS,           // Intel
+  ATTRIBUTE_GT,            // Intel
 };

-static const char* ATTRIBUTE_FIELDS [] = {
-  "Name:",
-  "GPU processor:",
-  "Microarchitecture:",
-  "Technology:",
-  "Max Frequency:",
-  "SMs:",
-  "Cores/SM:",
-  "CUDA cores:",
-  "L2 Size:",
-  "Memory:",
-  "Memory frequency:",
-  "Bus width:",
-  "Peak Performance:",
-};
-
-static const char* ATTRIBUTE_FIELDS_SHORT [] = {
-  "Name:",
-  "Processor:",
-  "uArch:",
-  "Technology:",
-  "Max Freq.:",
-  "SMs:",
-  "Cores/SM:",
-  "CUDA cores:",
-  "L2 Size:",
-  "Memory:",
-  "Memory freq.:",
-  "Bus width:",
-  "Peak Perf.:",
+static const AttributeField ATTRIBUTE_INFO[] = { // Indexed by attribute id: keep the entries in enum order
+  { ATTRIBUTE_NAME,          "Name:",                   "Name:" },
+  { ATTRIBUTE_CHIP,          "GPU processor:",          "Processor:" },
+  { ATTRIBUTE_UARCH,         "Microarchitecture:",      "uArch:" },
+  { ATTRIBUTE_TECHNOLOGY,    "Technology:",             "Technology:" },
+  { ATTRIBUTE_FREQUENCY,     "Max Frequency:",          "Max Freq.:" },
+  { ATTRIBUTE_PEAK,          "Peak Performance:",       "Peak Perf.:" },
+  { ATTRIBUTE_COMPUTE_UNITS, "Compute Units (CUs):",    "CUs:" },
+  { ATTRIBUTE_MATRIX_CORES,  "Matrix Cores:",           "Matrix Cores:" },
+  { ATTRIBUTE_XCDS,          "XCDs:",                   "XCDs:" },
+  { ATTRIBUTE_LDS_SIZE,      "LDS size:",               "LDS:" },
+  { ATTRIBUTE_STREAMINGMP,   "SMs:",                    "SMs:" },
+  { ATTRIBUTE_CORESPERMP,    "Cores/SM:",               "Cores/SM:" },
+  { ATTRIBUTE_CUDA_CORES,    "CUDA Cores:",             "CUDA Cores:" },
+  { ATTRIBUTE_TENSOR_CORES,  "Tensor Cores:",           "Tensor Cores:" },
+  { ATTRIBUTE_L2,            "L2 Size:",                "L2 Size:" },
+  { ATTRIBUTE_MEMORY,        "Memory:",                 "Memory:" },
+  { ATTRIBUTE_MEMORY_FREQ,   "Memory frequency:",       "Memory freq.:" },
+  { ATTRIBUTE_BUS_WIDTH,     "Bus width:",              "Bus width:" },
+  { ATTRIBUTE_PEAK_TENSOR,   "Peak Performance (MMA):", "Peak Perf.(MMA):" },
+  { ATTRIBUTE_EUS,           "Execution Units:",        "EUs:" },
+  { ATTRIBUTE_GT,            "Graphics Tier:",          "GT:" },
 };

 struct terminal {
@@ -192,25 +203,37 @@ bool ascii_fits_screen(int termw, struct ascii_logo logo, int lf) {
 void replace_bgbyfg_color(struct ascii_logo* logo) {
  // Replace background by foreground color
  for(int i=0; i < 2; i++) {
-    if(logo->color_ascii[i] == NULL) break;
+    if(strcmp(logo->color_ascii[i], C_BG_BLACK) == 0) strcpy(logo->color_ascii[i], C_FG_BLACK);
+    else if(strcmp(logo->color_ascii[i], C_BG_RED) == 0) strcpy(logo->color_ascii[i], C_FG_RED);
+    else if(strcmp(logo->color_ascii[i], C_BG_GREEN) == 0) strcpy(logo->color_ascii[i], C_FG_GREEN);
+    else if(strcmp(logo->color_ascii[i], C_BG_YELLOW) == 0) strcpy(logo->color_ascii[i], C_FG_YELLOW);
+    else if(strcmp(logo->color_ascii[i], C_BG_BLUE) == 0) strcpy(logo->color_ascii[i], C_FG_BLUE);
+    else if(strcmp(logo->color_ascii[i], C_BG_MAGENTA) == 0) strcpy(logo->color_ascii[i], C_FG_MAGENTA);
+    else if(strcmp(logo->color_ascii[i], C_BG_CYAN) == 0) strcpy(logo->color_ascii[i], C_FG_CYAN);
+    else if(strcmp(logo->color_ascii[i], C_BG_WHITE) == 0) strcpy(logo->color_ascii[i], C_FG_WHITE);
+  }
+}

-    if(strcmp(logo->color_ascii[i], COLOR_BG_BLACK) == 0) strcpy(logo->color_ascii[i], COLOR_FG_BLACK);
-    else if(strcmp(logo->color_ascii[i], COLOR_BG_RED) == 0) strcpy(logo->color_ascii[i], COLOR_FG_RED);
-    else if(strcmp(logo->color_ascii[i], COLOR_BG_GREEN) == 0) strcpy(logo->color_ascii[i], COLOR_FG_GREEN);
-    else if(strcmp(logo->color_ascii[i], COLOR_BG_YELLOW) == 0) strcpy(logo->color_ascii[i], COLOR_FG_YELLOW);
-    else if(strcmp(logo->color_ascii[i], COLOR_BG_BLUE) == 0) strcpy(logo->color_ascii[i], COLOR_FG_BLUE);
-    else if(strcmp(logo->color_ascii[i], COLOR_BG_MAGENTA) == 0) strcpy(logo->color_ascii[i], COLOR_FG_MAGENTA);
-    else if(strcmp(logo->color_ascii[i], COLOR_BG_CYAN) == 0) strcpy(logo->color_ascii[i], COLOR_FG_CYAN);
-    else if(strcmp(logo->color_ascii[i], COLOR_BG_WHITE) == 0) strcpy(logo->color_ascii[i], COLOR_FG_WHITE);
+struct ascii_logo* choose_ascii_art_aux(struct ascii_logo* logo_long, struct ascii_logo* logo_short, struct terminal* term, int lf) {
+  if(show_logo_long()) return logo_long; // Checked first: presumably the user asked for the long logo
+  if(show_logo_short()) return logo_short; // Checked second: presumably the user asked for the short logo
+  if(ascii_fits_screen(term->w, *logo_long, lf)) { // Otherwise the long logo is used only if it fits; term is dereferenced unchecked
+    return logo_long;
+  }
+  else {
+    return logo_short;
   }
 }

 void choose_ascii_art(struct ascii* art, struct color** cs, struct terminal* term, int lf) {
  if(art->vendor == GPU_VENDOR_NVIDIA) {
-    if(term != NULL && ascii_fits_screen(term->w, logo_nvidia_l, lf))
-      art->art = &logo_nvidia_l;
-    else
-      art->art = &logo_nvidia;
+    art->art = choose_ascii_art_aux(&logo_nvidia_l, &logo_nvidia, term, lf);
+  }
+  else if(art->vendor == GPU_VENDOR_AMD) {
+    art->art = choose_ascii_art_aux(&logo_amd_l, &logo_amd, term, lf);
+  }
+  else if(art->vendor == GPU_VENDOR_INTEL) {
+    art->art = choose_ascii_art_aux(&logo_intel_l, &logo_intel, term, lf);
  }
  else {
    art->art = &logo_unknown;
@@ -222,10 +245,10 @@ void choose_ascii_art(struct ascii* art, struct color** cs, struct terminal* ter
  switch(art->style) {
    case STYLE_LEGACY:
      logo->replace_blocks = false;
-      strcpy(logo->color_text[0], COLOR_NONE);
-      strcpy(logo->color_text[1], COLOR_NONE);
-      strcpy(logo->color_ascii[0], COLOR_NONE);
-      strcpy(logo->color_ascii[1], COLOR_NONE);
+      strcpy(logo->color_text[0], C_NONE);
+      strcpy(logo->color_text[1], C_NONE);
+      strcpy(logo->color_ascii[0], C_NONE);
+      strcpy(logo->color_ascii[1], C_NONE);
      art->reset[0] = '\0';
      break;
    case STYLE_RETRO:
@@ -239,7 +262,7 @@ void choose_ascii_art(struct ascii* art, struct color** cs, struct terminal* ter
        strcpy(logo->color_ascii[0], rgb_to_ansi(cs[0], logo->replace_blocks, true));
        strcpy(logo->color_ascii[1], rgb_to_ansi(cs[1], logo->replace_blocks, true));
      }
-      strcpy(art->reset, COLOR_RESET);
+      strcpy(art->reset, C_RESET);
      break;
    case STYLE_INVALID:
    default:
@@ -247,13 +270,14 @@ void choose_ascii_art(struct ascii* art, struct color** cs, struct terminal* ter
  }
 }

-uint32_t longest_attribute_length(struct ascii* art, const char** attribute_fields) {
+uint32_t longest_attribute_length(struct ascii* art, bool use_short) {
  uint32_t max = 0;
  uint64_t len = 0;

  for(uint32_t i=0; i < art->n_attributes_set; i++) {
    if(art->attributes[i]->value != NULL) {
-      len = strlen(attribute_fields[art->attributes[i]->type]);
+      const char* str = use_short ? ATTRIBUTE_INFO[art->attributes[i]->type].shortname : ATTRIBUTE_INFO[art->attributes[i]->type].name;
+      len = strlen(str);
      if(len > max) max = len;
    }
  }
@@ -277,7 +301,7 @@ uint32_t longest_field_length(struct ascii* art, int la) {
  return max;
 }

-void print_ascii_generic(struct ascii* art, uint32_t la, int32_t text_space, const char** attribute_fields) {
+void print_ascii_generic(struct ascii* art, uint32_t la, int32_t text_space, bool use_short) {
  struct ascii_logo* logo = art->art;
  int attr_to_print = 0;
  int attr_type;
@@ -321,11 +345,13 @@ void print_ascii_generic(struct ascii* art, uint32_t la, int32_t text_space, con
      attr_value = art->attributes[attr_to_print]->value;
      attr_to_print++;

-      space_right = 1 + (la - strlen(attribute_fields[attr_type]));
+      const char* attr_str = use_short ? ATTRIBUTE_INFO[attr_type].shortname : ATTRIBUTE_INFO[attr_type].name;
+
+      space_right = 1 + (la - strlen(attr_str));
      current_space = max(0, text_space);

-      printf("%s%.*s%s", logo->color_text[0], current_space, attribute_fields[attr_type], art->reset);
-      current_space = max(0, current_space - (int) strlen(attribute_fields[attr_type]));
+      printf("%s%.*s%s", logo->color_text[0], current_space, attr_str, art->reset);
+      current_space = max(0, current_space - (int) strlen(attr_str));
      printf("%*s", min(current_space, space_right), "");
      current_space = max(0, current_space - min(current_space, space_right));
      printf("%s%.*s%s", logo->color_text[1], current_space, attr_value, art->reset);
@@ -336,6 +362,48 @@ void print_ascii_generic(struct ascii* art, uint32_t la, int32_t text_space, con
  printf("\n");
 }

+#ifdef BACKEND_INTEL
+bool print_gpufetch_intel(struct gpu_info* gpu, STYLE s, struct color** cs, struct terminal* term) { // Prints the logo and attributes of an Intel GPU
+  struct ascii* art = set_ascii(get_gpu_vendor(gpu), s);
+  // A NULL art is the only path on which this function returns false
+  if(art == NULL)
+    return false;
+  // Build the value string for every field shown for Intel GPUs
+  char* gpu_name = get_str_gpu_name(gpu);
+  char* uarch = get_str_uarch_intel(gpu->arch);
+  char* gt = get_str_gt(gpu->arch);
+  char* manufacturing_process = get_str_process(gpu->arch);
+  char* eus = get_str_eu(gpu);
+  char* max_frequency = get_str_freq(gpu);
+  char* pp = get_str_peak_performance(gpu);
+  // Register the attributes; presumably this is also the order they are printed in
+  setAttribute(art, ATTRIBUTE_NAME, gpu_name);
+  setAttribute(art, ATTRIBUTE_UARCH, uarch);
+  setAttribute(art, ATTRIBUTE_TECHNOLOGY, manufacturing_process);
+  setAttribute(art, ATTRIBUTE_FREQUENCY, max_frequency);
+  setAttribute(art, ATTRIBUTE_GT, gt);
+  setAttribute(art, ATTRIBUTE_EUS, eus);
+  setAttribute(art, ATTRIBUTE_PEAK, pp);
+  // Start with the long field names and choose the logo for that layout
+  bool use_short = false;
+  uint32_t longest_attribute = longest_attribute_length(art, use_short);
+  uint32_t longest_field = longest_field_length(art, longest_attribute);
+  choose_ascii_art(art, cs, term, longest_field);
+
+  if(!ascii_fits_screen(term->w, *art->art, longest_field)) {
+    // Despite of choosing the smallest logo, the output does not fit
+    // Choose the shorter field names and recalculate the longest attr
+    use_short = true;
+    longest_attribute = longest_attribute_length(art, use_short);
+  }
+  // The third argument (text_space) is the terminal width minus the logo width
+  print_ascii_generic(art, longest_attribute, term->w - art->art->width, use_short);
+
+  return true;
+}
+#endif
+
+#ifdef BACKEND_CUDA
 bool print_gpufetch_cuda(struct gpu_info* gpu, STYLE s, struct color** cs, struct terminal* term) {
  struct ascii* art = set_ascii(get_gpu_vendor(gpu), s);

@@ -344,12 +412,13 @@ bool print_gpufetch_cuda(struct gpu_info* gpu, STYLE s, struct color** cs, struc

  char* gpu_name = get_str_gpu_name(gpu);
  char* gpu_chip = get_str_chip(gpu->arch);
-  char* uarch = get_str_uarch(gpu->arch);
+  char* uarch = get_str_uarch_cuda(gpu->arch);
  char* comp_cap = get_str_cc(gpu->arch);
  char* manufacturing_process = get_str_process(gpu->arch);
  char* sms = get_str_sm(gpu);
  char* corespersm = get_str_cores_sm(gpu);
  char* cores = get_str_cuda_cores(gpu);
+  char* tensorc = get_str_tensor_cores(gpu);
  char* max_frequency = get_str_freq(gpu);
  char* l2 = get_str_l2(gpu);
  char* mem_size = get_str_memory_size(gpu);
@@ -357,6 +426,7 @@ bool print_gpufetch_cuda(struct gpu_info* gpu, STYLE s, struct color** cs, struc
  char* mem_freq = get_str_memory_clock(gpu);
  char* bus_width = get_str_bus_width(gpu);
  char* pp = get_str_peak_performance(gpu);
+  char* pp_tensor = get_str_peak_performance_tensor(gpu);

  char* mem = (char *) emalloc(sizeof(char) * (strlen(mem_size) + strlen(mem_type) + 2));
  sprintf(mem, "%s %s", mem_size, mem_type);
@@ -372,25 +442,31 @@ bool print_gpufetch_cuda(struct gpu_info* gpu, STYLE s, struct color** cs, struc
  setAttribute(art, ATTRIBUTE_STREAMINGMP, sms);
  setAttribute(art, ATTRIBUTE_CORESPERMP, corespersm);
  setAttribute(art, ATTRIBUTE_CUDA_CORES, cores);
+  if(gpu->topo_c->tensor_cores > 0) {
+    setAttribute(art, ATTRIBUTE_TENSOR_CORES, tensorc);
+  }
  setAttribute(art, ATTRIBUTE_MEMORY, mem);
  setAttribute(art, ATTRIBUTE_MEMORY_FREQ, mem_freq);
  setAttribute(art, ATTRIBUTE_BUS_WIDTH, bus_width);
  setAttribute(art, ATTRIBUTE_L2, l2);
  setAttribute(art, ATTRIBUTE_PEAK, pp);
+  if(gpu->topo_c->tensor_cores > 0) {
+    setAttribute(art, ATTRIBUTE_PEAK_TENSOR, pp_tensor);
+  }

-  const char** attribute_fields = ATTRIBUTE_FIELDS;
-  uint32_t longest_attribute = longest_attribute_length(art, attribute_fields);
+  bool use_short = false;
+  uint32_t longest_attribute = longest_attribute_length(art, use_short);
  uint32_t longest_field = longest_field_length(art, longest_attribute);
  choose_ascii_art(art, cs, term, longest_field);

  if(!ascii_fits_screen(term->w, *art->art, longest_field)) {
    // Despite of choosing the smallest logo, the output does not fit
    // Choose the shorter field names and recalculate the longest attr
-    attribute_fields = ATTRIBUTE_FIELDS_SHORT;
-    longest_attribute = longest_attribute_length(art, attribute_fields);
+    use_short = true;
+    longest_attribute = longest_attribute_length(art, use_short);
  }

-  print_ascii_generic(art, longest_attribute, term->w - art->art->width, attribute_fields);
+  print_ascii_generic(art, longest_attribute, term->w - art->art->width, use_short);

  free(manufacturing_process);
  free(max_frequency);
@@ -402,6 +478,63 @@ bool print_gpufetch_cuda(struct gpu_info* gpu, STYLE s, struct color** cs, struc

  return true;
 }
+#endif
+
+#ifdef BACKEND_HSA
+bool print_gpufetch_amd(struct gpu_info* gpu, STYLE s, struct color** cs, struct terminal* term) {
+  struct ascii* art = set_ascii(get_gpu_vendor(gpu), s);
+  if(art == NULL) {
+    return false;
+  }
+
+  // Strings shown next to the logo (chip and xcd may come back as NULL)
+  char* name = get_str_gpu_name(gpu);
+  char* chip = get_str_chip(gpu->arch);
+  char* uarch_name = get_str_uarch_hsa(gpu->arch);
+  char* process = get_str_process(gpu->arch);
+  char* compute_units = get_str_cu(gpu);
+  char* mcores = get_str_matrix_cores(gpu);
+  char* xcd = get_str_xcds(gpu);
+  char* freq = get_str_freq(gpu);
+  char* bus = get_str_bus_width(gpu);
+  char* memory = get_str_memory_size(gpu);
+  char* lds = get_str_lds_size(gpu);
+
+  // Register the attributes; chip and XCDs are skipped when they are NULL
+  setAttribute(art, ATTRIBUTE_NAME, name);
+  if(chip != NULL) setAttribute(art, ATTRIBUTE_CHIP, chip);
+  setAttribute(art, ATTRIBUTE_UARCH, uarch_name);
+  setAttribute(art, ATTRIBUTE_TECHNOLOGY, process);
+  setAttribute(art, ATTRIBUTE_FREQUENCY, freq);
+  setAttribute(art, ATTRIBUTE_COMPUTE_UNITS, compute_units);
+  setAttribute(art, ATTRIBUTE_MATRIX_CORES, mcores);
+  if(xcd != NULL) setAttribute(art, ATTRIBUTE_XCDS, xcd);
+  setAttribute(art, ATTRIBUTE_LDS_SIZE, lds);
+  setAttribute(art, ATTRIBUTE_MEMORY, memory);
+  setAttribute(art, ATTRIBUTE_BUS_WIDTH, bus);
+
+  // First attempt: long field names
+  bool use_short = false;
+  uint32_t attr_len = longest_attribute_length(art, use_short);
+  uint32_t field_len = longest_field_length(art, attr_len);
+  choose_ascii_art(art, cs, term, field_len);
+
+  // If the output still does not fit with the chosen logo, switch to the
+  // short field names and recompute the longest attribute
+  bool fits = ascii_fits_screen(term->w, *art->art, field_len);
+  if(!fits) {
+    use_short = true;
+    attr_len = longest_attribute_length(art, use_short);
+  }
+
+  print_ascii_generic(art, attr_len, term->w - art->art->width, use_short);
+
+  free(art->attributes);
+  free(art);
+
+  return true;
+}
+#endif

 struct terminal* get_terminal_size() {
  struct terminal* term = (struct terminal*) emalloc(sizeof(struct terminal));
@@ -434,5 +567,30 @@ struct terminal* get_terminal_size() {
 bool print_gpufetch(struct gpu_info* gpu, STYLE s, struct color** cs) {
  struct terminal* term = get_terminal_size();

-  return print_gpufetch_cuda(gpu, s, cs, term);
+  if(gpu->vendor == GPU_VENDOR_NVIDIA) {
+    #ifdef BACKEND_CUDA
+      if(clean_output()) printf("%*s", (int) strlen(CUDA_DRIVER_START_WARNING), " ");
+      return print_gpufetch_cuda(gpu, s, cs, term);
+    #else
+      return false;
+    #endif
+  }
+  else if(gpu->vendor == GPU_VENDOR_AMD) {
+    #ifdef BACKEND_HSA
+      return print_gpufetch_amd(gpu, s, cs, term);
+    #else
+      return false;
+    #endif
+  }
+  else if(gpu->vendor == GPU_VENDOR_INTEL) {
+    #ifdef BACKEND_INTEL
+      return print_gpufetch_intel(gpu, s, cs, term);
+    #else
+      return false;
+    #endif
+  }
+  else {
+    printErr("Invalid GPU vendor: %d", gpu->vendor);
+    return false;
+  }
 }
--- a/src/common/sort.cpp
+++ b/src/common/sort.cpp
@@ -0,0 +1,73 @@
+#include <cstdio>
+#include <cstdlib>
+#include "pci.hpp"
+#include "global.hpp"
+
+// Code inspired in lspci.c
+
+/*
+ * qsort() comparator for an array of 'struct pci_dev *'.
+ * Orders devices by domain, then bus, then dev, then func (ascending).
+ * Returns -1, 0 or 1 when a sorts before, equal to or after b.
+ */
+int compare_them(const void *A, const void *B) {
+  const struct pci_dev *a = *(const struct pci_dev * const *) A;
+  const struct pci_dev *b = *(const struct pci_dev * const *) B;
+
+  if (a->domain < b->domain)
+    return -1;
+  if (a->domain > b->domain)
+    return 1;
+  if (a->bus < b->bus)
+    return -1;
+  if (a->bus > b->bus)
+    return 1;
+  if (a->dev < b->dev)
+    return -1;
+  if (a->dev > b->dev)
+    return 1;
+  if (a->func < b->func)
+    return -1;
+  if (a->func > b->func)
+    return 1;
+
+  return 0;
+}
+
+/*
+ * Sorts, in place, the singly linked list of PCI devices whose head is
+ * *devices, using the order defined by compare_them(). On return *devices
+ * points to the new head and the 'next' field of the last node is NULL.
+ * An empty list (*devices == NULL) is left untouched.
+ */
+void sort_pci_devices(struct pci_dev **devices) {
+  int cnt = 0;
+  for(struct pci_dev *dev=*devices; dev != NULL; dev=dev->next) {
+    cnt++;
+  }
+
+  // Nothing to sort. Returning here also avoids reading arr[0] and
+  // writing through a NULL tail pointer below.
+  if(cnt == 0) {
+    return;
+  }
+
+  // Collect the nodes in an array so that they can be sorted with qsort
+  struct pci_dev **arr = (struct pci_dev **) emalloc(sizeof(struct pci_dev *) * cnt);
+  int i = 0;
+  for(struct pci_dev *dev=*devices; dev != NULL; dev=dev->next) {
+    arr[i] = dev;
+    i++;
+  }
+
+  qsort(arr, cnt, sizeof(struct pci_dev *), compare_them);
+
+  // Relink the nodes following the sorted order
+  for(i = 0; i < cnt - 1; i++) {
+    arr[i]->next = arr[i+1];
+  }
+  arr[cnt-1]->next = NULL;
+
+  *devices = arr[0];
+  free(arr);
+}
--- a/src/common/sort.hpp
+++ b/src/common/sort.hpp
@@ -0,0 +1,12 @@
+#ifndef __SORT_PCI__
+#define __SORT_PCI__
+
+// Only pointers to it are needed here, so a forward declaration is enough
+struct pci_dev;
+
+// Sorts in place the linked list of PCI devices whose head is *devices,
+// by (domain, bus, dev, func). *devices is updated to the new head.
+void sort_pci_devices(struct pci_dev **devices);
+
+#endif
+
--- a/src/common/uarch.cpp
+++ b/src/common/uarch.cpp
@@ -0,0 +1,41 @@
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+
+#include "global.hpp"
+#include "uarch.hpp"
+
+/*
+ * Returns a newly allocated string (owned by the caller) describing the
+ * manufacturing process stored in arch->process:
+ *  - UNK:           STRING_UNKNOWN
+ *  - greater than 100: the value divided by 100 plus "um" (e.g. "1.50um")
+ *  - 1 to 100:      the value plus "nm" (e.g. "8nm")
+ *  - anything else: STRING_UNKNOWN, and a bug is reported
+ */
+char* get_str_process(struct uarch* arch) {
+  // Large enough for STRING_UNKNOWN and for the longest formatted number
+  // ("%.2fum" / "%dnm" of any int32_t need far less than 32 bytes).
+  size_t max_size = strlen(STRING_UNKNOWN) + 1;
+  if(max_size < 32) max_size = 32;
+
+  char* str = (char *) emalloc(sizeof(char) * max_size);
+  int32_t process = arch->process;
+
+  if(process == UNK) {
+    snprintf(str, max_size, "%s", STRING_UNKNOWN);
+  }
+  else if(process > 100) {
+    snprintf(str, max_size, "%.2fum", (double)process/100);
+  }
+  else if(process > 0){
+    snprintf(str, max_size, "%dnm", process);
+  }
+  else {
+    snprintf(str, max_size, "%s", STRING_UNKNOWN);
+    printBug("Found invalid process: '%d'", process);
+  }
+
+  return str;
+}
+
--- a/src/common/uarch.hpp
+++ b/src/common/uarch.hpp
@@ -0,0 +1,40 @@
+#ifndef __COMMON_UARCH__
+#define __COMMON_UARCH__
+
+// Needed for the fixed-width integer types used below
+#include <cstdint>
+
+// Data not available
+#define NA                   (-1)
+
+// Unknown manufacturing process
+#define UNK                  (-1)
+
+typedef uint32_t GPUCHIP;
+typedef uint32_t MICROARCH;
+
+// Microarchitecture description shared by all the backends. The fields
+// are grouped by the backend they belong to.
+struct uarch {
+  // NVIDIA specific
+  int32_t cc_major;
+  int32_t cc_minor;
+  int32_t compute_capability;
+
+  // HSA specific
+  int32_t llvm_target;
+
+  // Intel specific
+  int32_t gt;
+  int32_t eu;
+
+  MICROARCH uarch;
+  GPUCHIP chip;
+
+  // Manufacturing process; UNK when it is not known
+  int32_t process;
+  char* uarch_str;
+  char* chip_str;
+};
+
+#endif
--- a/src/cuda/chips.hpp
+++ b/src/cuda/chips.hpp
@@ -1,10 +1,14 @@
-#ifndef __GPUCHIPS__
-#define __GPUCHIPS__
+#ifndef __CUDA_GPUCHIPS__
+#define __CUDA_GPUCHIPS__

 typedef uint32_t GPUCHIP;

 enum {
-  CHIP_UNKNOWN,
+  CHIP_UNKNOWN_CUDA,
+  CHIP_AD102,
+  CHIP_AD102GL,
+  CHIP_AD104,
+  CHIP_AD104GL,
  CHIP_G80,
  CHIP_G80GL,
  CHIP_G84,
@@ -37,6 +41,9 @@ enum {
  CHIP_GA100GL,
  CHIP_GA102,
  CHIP_GA102GL,
+  CHIP_GA103,
+  CHIP_GA103GLM,
+  CHIP_GA103M,
  CHIP_GA104,
  CHIP_GA104GL,
  CHIP_GA104GLM,
@@ -45,6 +52,7 @@ enum {
  CHIP_GA106M,
  CHIP_GA107,
  CHIP_GA107BM,
+  CHIP_GA107GL,
  CHIP_GA107GLM,
  CHIP_GA107M,
  CHIP_GF100,
@@ -71,6 +79,7 @@ enum {
  CHIP_GF117M,
  CHIP_GF119,
  CHIP_GF119M,
+  CHIP_GH100,
  CHIP_GK104,
  CHIP_GK104GL,
  CHIP_GK104GLM,
@@ -166,7 +175,7 @@ enum {
  CHIP_TU117BM,
  CHIP_TU117GL,
  CHIP_TU117GLM,
-  CHIP_TU117M,
+  CHIP_TU117M
 };

 #endif
--- a/src/cuda/cuda.cpp
+++ b/src/cuda/cuda.cpp
@@ -1,70 +1,103 @@
-#include <helper_cuda.h>
+
+// patched cuda.cpp for cuda13 by cloudy
+
 #include <cuda_runtime.h>
+#include <cstring>
+#include <cstdlib>
+#include <cstdio>

 #include "cuda.hpp"
-#include "nvmlb.hpp"
 #include "uarch.hpp"
+#include "pci.hpp"
+#include "gpufetch_helper_cuda.hpp"
 #include "../common/global.hpp"
+#include "../common/uarch.hpp"
+
+bool print_gpu_cuda(struct gpu_info* gpu) {
+  char* cc = get_str_cc(gpu->arch);
+  printf("%s (Compute Capability %s)\n", gpu->name, cc);
+  free(cc);
+  return true;
+}

 struct cache* get_cache_info(cudaDeviceProp prop) {
  struct cache* cach = (struct cache*) emalloc(sizeof(struct cache));
-
  cach->L2 = (struct cach*) emalloc(sizeof(struct cach));
  cach->L2->size = prop.l2CacheSize;
  cach->L2->num_caches = 1;
  cach->L2->exists = true;
-
  return cach;
 }

-struct topology* get_topology_info(cudaDeviceProp prop) {
-  struct topology* topo = (struct topology*) emalloc(sizeof(struct topology));
+int get_tensor_cores(struct uarch* arch, int sm, int major) {
+  switch(major) {
+    // Major 7: 8 per SM, except chips matched by is_chip_TU116 (none)
+    case 7:  return is_chip_TU116(arch) ? 0 : sm * 8;
+    // Major 8: 4 per SM
+    case 8:  return sm * 4;
+    default: return 0;
+  }
+}

+struct topology_c* get_topology_info(struct uarch* arch, cudaDeviceProp prop) {
+  struct topology_c* topo = (struct topology_c*) emalloc(sizeof(struct topology_c));
  topo->streaming_mp = prop.multiProcessorCount;
  topo->cores_per_mp = _ConvertSMVer2Cores(prop.major, prop.minor);
  topo->cuda_cores = topo->streaming_mp * topo->cores_per_mp;
-
+  topo->tensor_cores = get_tensor_cores(arch, topo->streaming_mp, prop.major);
  return topo;
 }

 int32_t guess_clock_multipilier(struct gpu_info* gpu, struct memory* mem) {
-  // Guess clock multiplier
  int32_t clk_mul = 1;
-
  int32_t clk8 = abs((mem->freq/8) - gpu->freq);
  int32_t clk4 = abs((mem->freq/4) - gpu->freq);
  int32_t clk2 = abs((mem->freq/2) - gpu->freq);
  int32_t clk1 = abs((mem->freq/1) - gpu->freq);
-
  int32_t min = mem->freq;
  if(clkm_possible_for_uarch(8, gpu->arch) && min > clk8) { clk_mul = 8; min = clk8; }
  if(clkm_possible_for_uarch(4, gpu->arch) && min > clk4) { clk_mul = 4; min = clk4; }
  if(clkm_possible_for_uarch(2, gpu->arch) && min > clk2) { clk_mul = 2; min = clk2; }
  if(clkm_possible_for_uarch(1, gpu->arch) && min > clk1) { clk_mul = 1; min = clk1; }
-
  return clk_mul;
 }

 struct memory* get_memory_info(struct gpu_info* gpu, cudaDeviceProp prop) {
   struct memory* mem = (struct memory*) emalloc(sizeof(struct memory));
+  int val = 0;

   mem->size_bytes = (unsigned long long) prop.totalGlobalMem;
-  mem->freq = prop.memoryClockRate * 0.001f;
+
+  // cudaDevAttrMemoryClockRate is in kHz, like the prop.memoryClockRate
+  // field it replaces, so always scale by 1e-3. Do not guess the unit from
+  // the magnitude: memory clocks above 1 GHz exceed 1000000 kHz.
+  if (cudaDeviceGetAttribute(&val, cudaDevAttrMemoryClockRate, gpu->idx) == cudaSuccess) {
+      mem->freq = (float)val * 0.001f;
+  } else {
+      mem->freq = 0.0f;
+  }
+
   mem->bus_width = prop.memoryBusWidth;
   mem->clk_mul = guess_clock_multipilier(gpu, mem);
   mem->type = guess_memtype_from_cmul_and_uarch(mem->clk_mul, gpu->arch);

-  // Fix frequency returned from CUDA to show real frequency
-  mem->freq = mem->freq  / mem->clk_mul;
+  if (mem->clk_mul > 0)
+      mem->freq = mem->freq / mem->clk_mul;

   return mem;
  }

-int64_t get_peak_performance(struct gpu_info* gpu) {
-  return gpu->freq * 1000000 * gpu->topo->cuda_cores * 2;
+int64_t get_peak_performance_cuda(struct gpu_info* gpu) {
+  return gpu->freq * 1000000 * gpu->topo_c->cuda_cores * 2;
 }

-struct gpu_info* get_gpu_info(int gpu_idx) {
+int64_t get_peak_performance_tcu(cudaDeviceProp prop, struct gpu_info* gpu) {
+  if(prop.major == 7) return gpu->freq * 1000000 * 4 * 4 * 4  * 2 * gpu->topo_c->tensor_cores;
+  else if(prop.major == 8) return gpu->freq * 1000000 * 8 * 4 * 8 * 2 * gpu->topo_c->tensor_cores;
+  else return 0;
+}
+
+struct gpu_info* get_gpu_info_cuda(struct pci_dev *devices, int gpu_idx) {
  struct gpu_info* gpu = (struct gpu_info*) emalloc(sizeof(struct gpu_info));
  gpu->pci = NULL;
  gpu->idx = gpu_idx;
@@ -74,16 +107,23 @@ struct gpu_info* get_gpu_info(int gpu_idx) {
    return NULL;
  }

-  printf("Waiting for CUDA driver to start...");
-  fflush(stdout);
+  if(gpu_idx == 0) {
+    printf("%s", CUDA_DRIVER_START_WARNING);
+    fflush(stdout);
+  }

  int num_gpus = -1;
-  cudaError_t err = cudaSuccess;
-  if ((err = cudaGetDeviceCount(&num_gpus)) != cudaSuccess) {
+  cudaError_t err = cudaGetDeviceCount(&num_gpus);
+
+  if(gpu_idx == 0) {
+    printf("\r%*c\r", (int) strlen(CUDA_DRIVER_START_WARNING), ' ');
+    fflush(stdout);
+  }
+
+  if(err != cudaSuccess) {
    printErr("%s: %s", cudaGetErrorName(err), cudaGetErrorString(err));
    return NULL;
  }
-  printf("\r                                   ");

  if(num_gpus <= 0) {
    printErr("No CUDA capable devices found!");
@@ -91,7 +131,6 @@ struct gpu_info* get_gpu_info(int gpu_idx) {
  }

  if(gpu->idx+1 > num_gpus) {
-    printErr("Requested GPU index %d in a system with %d GPUs", gpu->idx, num_gpus);
    return NULL;
  }

@@ -101,43 +140,36 @@ struct gpu_info* get_gpu_info(int gpu_idx) {
    return NULL;
  }

-  gpu->freq = deviceProp.clockRate * 1e-3f;
+  // cudaDevAttrClockRate is in kHz, like the deviceProp.clockRate field
+  // it replaces: always scale by 1e-3 instead of guessing the unit from
+  // the magnitude (core clocks above 1 GHz exceed 1000000 kHz).
+  int core_clk = 0;
+  if (cudaDeviceGetAttribute(&core_clk, cudaDevAttrClockRate, gpu->idx) == cudaSuccess) {
+      gpu->freq = core_clk * 0.001f;
+  } else {
+      gpu->freq = 0.0f;
+  }
+
  gpu->vendor = GPU_VENDOR_NVIDIA;
-  gpu->name = (char *) emalloc(sizeof(char) * (strlen(deviceProp.name) + 1));
+  gpu->name = (char *) emalloc(strlen(deviceProp.name) + 1);
  strcpy(gpu->name, deviceProp.name);

-  gpu->nvmld = nvml_init();
-  if(nvml_get_pci_info(gpu->idx, gpu->nvmld)) {
-    gpu->pci = get_pci_from_nvml(gpu->nvmld);
+  if((gpu->pci = get_pci_from_pciutils(devices, PCI_VENDOR_ID_NVIDIA, gpu_idx)) == NULL) {
+    printErr("Unable to find a valid device for vendor id 0x%.4X using pciutils", PCI_VENDOR_ID_NVIDIA);
+    return NULL;
  }

  gpu->arch = get_uarch_from_cuda(gpu);
  gpu->cach = get_cache_info(deviceProp);
  gpu->mem = get_memory_info(gpu, deviceProp);
-  gpu->topo = get_topology_info(deviceProp);
-  gpu->peak_performance = get_peak_performance(gpu);
+  gpu->topo_c = get_topology_info(gpu->arch, deviceProp);
+  gpu->peak_performance = get_peak_performance_cuda(gpu);
+  gpu->peak_performance_tcu = get_peak_performance_tcu(deviceProp, gpu);

  return gpu;
 }

-char* get_str_sm(struct gpu_info* gpu) {
-  uint32_t max_size = 10;
-  char* dummy = (char *) ecalloc(max_size, sizeof(char));
-  snprintf(dummy, max_size, "%d", gpu->topo->streaming_mp);
-  return dummy;
-}
-
-char* get_str_cores_sm(struct gpu_info* gpu) {
-  uint32_t max_size = 10;
-  char* dummy = (char *) ecalloc(max_size, sizeof(char));
-  snprintf(dummy, max_size, "%d", gpu->topo->cores_per_mp);
-  return dummy;
-}
-
-char* get_str_cuda_cores(struct gpu_info* gpu) {
-  uint32_t max_size = 10;
-  char* dummy = (char *) ecalloc(max_size, sizeof(char));
-  snprintf(dummy, max_size, "%d", gpu->topo->cuda_cores);
-  return dummy;
-}
-
+char* get_str_sm(struct gpu_info* gpu) { return get_str_generic(gpu->topo_c->streaming_mp); }
+char* get_str_cores_sm(struct gpu_info* gpu) { return get_str_generic(gpu->topo_c->cores_per_mp); }
+char* get_str_cuda_cores(struct gpu_info* gpu) { return get_str_generic(gpu->topo_c->cuda_cores); }
+char* get_str_tensor_cores(struct gpu_info* gpu) { return get_str_generic(gpu->topo_c->tensor_cores); }
--- a/src/cuda/cuda.hpp
+++ b/src/cuda/cuda.hpp
@@ -1,11 +1,14 @@
-#ifndef __CUDA__
-#define __CUDA__
+#ifndef __CUDA_GPU__
+#define __CUDA_GPU__

 #include "../common/gpu.hpp"
+#define CUDA_DRIVER_START_WARNING "Waiting for CUDA driver to start..."

-struct gpu_info* get_gpu_info(int gpu_idx);
+struct gpu_info* get_gpu_info_cuda(struct pci_dev *devices, int gpu_idx);
+bool print_gpu_cuda(struct gpu_info* gpu);
 char* get_str_sm(struct gpu_info* gpu);
 char* get_str_cores_sm(struct gpu_info* gpu);
 char* get_str_cuda_cores(struct gpu_info* gpu);
+char* get_str_tensor_cores(struct gpu_info* gpu);

 #endif
--- a/src/cuda/gpufetch_helper_cuda.hpp
+++ b/src/cuda/gpufetch_helper_cuda.hpp
@@ -0,0 +1,71 @@
+#ifndef __GPUFETCH_HELPER_CUDA__
+#define __GPUFETCH_HELPER_CUDA__
+
+// gpufetch self contained helper_cuda.h
+//
+// Avoids relying on helper_cuda.h, which is
+// often very hard to include properly, causing
+// compilation issues.
+//
+// URL: https://github.com/NVIDIA/cuda-samples
+// Commit: 8199209
+
+// printf is used below; include it here so that this header
+// does not depend on what the including file pulled in before
+#include <cstdio>
+
+// Returns the number of cores per SM for the given compute
+// capability (major.minor). If the compute capability is not in the
+// table, a message is printed and the value of the last table entry
+// is returned.
+inline int _ConvertSMVer2Cores(int major, int minor) {
+  // Defines for GPU Architecture types (using the SM version to determine
+  // the # of cores per SM
+  typedef struct {
+    int SM;  // 0xMm (hexadecimal notation), M = SM Major version,
+    // and m = SM minor version
+    int Cores;
+  } sSMtoCores;
+
+  static const sSMtoCores nGpuArchCoresPerSM[] = {
+      {0x30, 192},
+      {0x32, 192},
+      {0x35, 192},
+      {0x37, 192},
+      {0x50, 128},
+      {0x52, 128},
+      {0x53, 128},
+      {0x60,  64},
+      {0x61, 128},
+      {0x62, 128},
+      {0x70,  64},
+      {0x72,  64},
+      {0x75,  64},
+      {0x80,  64},
+      {0x86, 128},
+      {0x87, 128},
+      // I added this one because it was missing in original cuda-samples...
+      {0x89, 128},
+      {0x90, 128},
+      {-1, -1}};
+
+  int index = 0;
+
+  while (nGpuArchCoresPerSM[index].SM != -1) {
+    if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) {
+      return nGpuArchCoresPerSM[index].Cores;
+    }
+
+    index++;
+  }
+
+  // If we don't find the values, we default use the previous one
+  // to run properly
+  printf(
+      "MapSMtoCores for SM %d.%d is undefined."
+      "  Default to use %d Cores/SM\n",
+      major, minor, nGpuArchCoresPerSM[index - 1].Cores);
+  return nGpuArchCoresPerSM[index - 1].Cores;
+}
+
+#endif
--- a/src/cuda/nvmlb.cpp
+++ b/src/cuda/nvmlb.cpp
@@ -1,70 +0,0 @@
-#include <nvml.h>
-
-#include "nvmlb.hpp"
-#include "../common/global.hpp"
-
-struct nvml_data {
-  bool nvml_started;
-  nvmlPciInfo_t pci;
-};
-
-struct nvml_data* nvml_init() {
-  struct nvml_data* data = (struct nvml_data*) emalloc(sizeof(struct nvml_data));
-  data->nvml_started = false;
-
-  nvmlReturn_t result;
-
-  if ((result = nvmlInit()) != NVML_SUCCESS) {
-    printErr("nvmlInit: %s\n", nvmlErrorString(result));
-    return NULL;
-  }
-
-  data->nvml_started = true;
-  return data;
-}
-
-bool nvml_get_pci_info(int gpu_idx, struct nvml_data* data) {
-  nvmlReturn_t result;
-  nvmlDevice_t device;
-
-  if(!data->nvml_started) {
-    printErr("nvml_get_pci_info: nvml was not started");
-    return false;
-  }
-
-  if ((result = nvmlDeviceGetHandleByIndex(gpu_idx, &device)) != NVML_SUCCESS) {
-    printErr("nvmlDeviceGetHandleByIndex: %s\n", nvmlErrorString(result));
-    return false;
-  }
-
-  if ((result = nvmlDeviceGetPciInfo(device, &data->pci)) != NVML_SUCCESS) {
-    printErr("nvmlDeviceGetPciInfo: %s\n", nvmlErrorString(result));
-    return false;
-  }
-
-  return true;
-}
-
-uint16_t nvml_get_pci_vendor_id(struct nvml_data* data) {
-  return data->pci.pciDeviceId & 0x0000FFFF;
-}
-
-uint16_t nvml_get_pci_device_id(struct nvml_data* data) {
-  return (data->pci.pciDeviceId & 0xFFFF0000) >> 16;
-}
-
-bool nvml_shutdown(struct nvml_data* data) {
-  nvmlReturn_t result;
-
-  if(!data->nvml_started) {
-    printWarn("nvml_get_pci_info: nvml was not started");
-    return true;
-  }
-
-  if ((result = nvmlShutdown()) != NVML_SUCCESS) {
-    printErr("nvmlShutdown: %s\n", nvmlErrorString(result));
-    return false;
-  }
-
-  return true;
-}
--- a/src/cuda/nvmlb.hpp
+++ b/src/cuda/nvmlb.hpp
@@ -1,16 +0,0 @@
-// NVML Backend
-#ifndef __NVMLB__
-#define __NVMLB__
-
-#include <stdbool.h>
-#include <stdint.h>
-
-struct nvml_data;
-
-struct nvml_data* nvml_init();
-bool nvml_get_pci_info(int dev, struct nvml_data* data);
-uint16_t nvml_get_pci_vendor_id(struct nvml_data* data);
-uint16_t nvml_get_pci_device_id(struct nvml_data* data);
-bool nvml_shutdown(struct nvml_data* data);
-
-#endif
--- a/src/cuda/pci.cpp
+++ b/src/cuda/pci.cpp
@@ -1,28 +1,14 @@
-#include <stdio.h>
+#include <cstdio>

 #include "pci.hpp"
-#include "nvmlb.hpp"
 #include "chips.hpp"
 #include "../common/global.hpp"
+#include "../common/pci.hpp"

 #define CHECK_PCI_START if (false) {}
 #define CHECK_PCI(pci, id, chip) \
   else if (pci->device_id == id) return chip;
-#define CHECK_PCI_END else { printBug("TODOO"); return CHIP_UNKNOWN; }
-
-struct pci {
-  uint16_t vendor_id;
-  uint16_t device_id;
-};
-
-struct pci* get_pci_from_nvml(struct nvml_data* data) {
-  struct pci* pci = (struct pci*) emalloc(sizeof(struct pci));
-
-  pci->vendor_id = nvml_get_pci_vendor_id(data);
-  pci->device_id = nvml_get_pci_device_id(data);
-
-  return pci;
-}
+#define CHECK_PCI_END else { printBug("Unknown CUDA device id: 0x%.4X", pci->device_id); return CHIP_UNKNOWN_CUDA; }

 /*
 * pci ids were retrieved using https://github.com/pciutils/pciids
@@ -33,63 +19,112 @@ struct pci* get_pci_from_nvml(struct nvml_data* data) {
 * or in pci.ids itself)
 */

-GPUCHIP get_chip_from_pci(struct pci* pci) {
+GPUCHIP get_chip_from_pci_cuda(struct pci* pci) {
  CHECK_PCI_START
+  CHECK_PCI(pci, 0x27b8, CHIP_AD104GL)
+  CHECK_PCI(pci, 0x2785, CHIP_AD104)
+  CHECK_PCI(pci, 0x26b8, CHIP_AD102GL)
+  CHECK_PCI(pci, 0x26b5, CHIP_AD102GL)
+  CHECK_PCI(pci, 0x26b1, CHIP_AD102GL)
+  CHECK_PCI(pci, 0x2684, CHIP_AD102)
+  CHECK_PCI(pci, 0x25fa, CHIP_GA107)
+  CHECK_PCI(pci, 0x25f9, CHIP_GA107)
  CHECK_PCI(pci, 0x25e5, CHIP_GA107BM)
  CHECK_PCI(pci, 0x25e2, CHIP_GA107BM)
  CHECK_PCI(pci, 0x25e0, CHIP_GA107BM)
+  CHECK_PCI(pci, 0x25bb, CHIP_GA107GLM)
+  CHECK_PCI(pci, 0x25ba, CHIP_GA107GLM)
+  CHECK_PCI(pci, 0x25b9, CHIP_GA107GLM)
  CHECK_PCI(pci, 0x25b8, CHIP_GA107GLM)
+  CHECK_PCI(pci, 0x25b6, CHIP_GA107GL)
  CHECK_PCI(pci, 0x25b5, CHIP_GA107GLM)
  CHECK_PCI(pci, 0x25af, CHIP_GA107)
+  CHECK_PCI(pci, 0x25aa, CHIP_GA107M)
+  CHECK_PCI(pci, 0x25a9, CHIP_GA107M)
+  CHECK_PCI(pci, 0x25a7, CHIP_GA107M)
+  CHECK_PCI(pci, 0x25a6, CHIP_GA107M)
  CHECK_PCI(pci, 0x25a5, CHIP_GA107M)
  CHECK_PCI(pci, 0x25a4, CHIP_GA107)
+  CHECK_PCI(pci, 0x25a3, CHIP_GA107)
  CHECK_PCI(pci, 0x25a2, CHIP_GA107M)
  CHECK_PCI(pci, 0x25a0, CHIP_GA107M)
  CHECK_PCI(pci, 0x2583, CHIP_GA107)
+  CHECK_PCI(pci, 0x2571, CHIP_GA106)
  CHECK_PCI(pci, 0x2563, CHIP_GA106M)
+  CHECK_PCI(pci, 0x2561, CHIP_GA106M)
  CHECK_PCI(pci, 0x2560, CHIP_GA106M)
+  CHECK_PCI(pci, 0x2544, CHIP_GA106)
+  CHECK_PCI(pci, 0x2531, CHIP_GA106)
  CHECK_PCI(pci, 0x252f, CHIP_GA106)
  CHECK_PCI(pci, 0x2523, CHIP_GA106M)
+  CHECK_PCI(pci, 0x2521, CHIP_GA106M)
  CHECK_PCI(pci, 0x2520, CHIP_GA106M)
+  CHECK_PCI(pci, 0x2508, CHIP_GA106)
+  CHECK_PCI(pci, 0x2507, CHIP_GA106)
  CHECK_PCI(pci, 0x2505, CHIP_GA106)
  CHECK_PCI(pci, 0x2504, CHIP_GA106)
  CHECK_PCI(pci, 0x2503, CHIP_GA106)
  CHECK_PCI(pci, 0x2501, CHIP_GA106)
+  CHECK_PCI(pci, 0x24fa, CHIP_GA104)
+  CHECK_PCI(pci, 0x24e0, CHIP_GA104M)
+  CHECK_PCI(pci, 0x24df, CHIP_GA104M)
  CHECK_PCI(pci, 0x24dd, CHIP_GA104M)
  CHECK_PCI(pci, 0x24dc, CHIP_GA104M)
+  CHECK_PCI(pci, 0x24c9, CHIP_GA104)
  CHECK_PCI(pci, 0x24bf, CHIP_GA104)
+  CHECK_PCI(pci, 0x24bb, CHIP_GA104GLM)
+  CHECK_PCI(pci, 0x24ba, CHIP_GA104GLM)
+  CHECK_PCI(pci, 0x24b9, CHIP_GA104GLM)
  CHECK_PCI(pci, 0x24b8, CHIP_GA104GLM)
  CHECK_PCI(pci, 0x24b7, CHIP_GA104GLM)
  CHECK_PCI(pci, 0x24b6, CHIP_GA104GLM)
+  CHECK_PCI(pci, 0x24b1, CHIP_GA104GL)
  CHECK_PCI(pci, 0x24b0, CHIP_GA104GL)
  CHECK_PCI(pci, 0x24af, CHIP_GA104)
  CHECK_PCI(pci, 0x24ad, CHIP_GA104)
  CHECK_PCI(pci, 0x24ac, CHIP_GA104)
+  CHECK_PCI(pci, 0x24a0, CHIP_GA104)
  CHECK_PCI(pci, 0x249f, CHIP_GA104M)
  CHECK_PCI(pci, 0x249d, CHIP_GA104M)
  CHECK_PCI(pci, 0x249c, CHIP_GA104M)
  CHECK_PCI(pci, 0x248a, CHIP_GA104)
  CHECK_PCI(pci, 0x2489, CHIP_GA104)
  CHECK_PCI(pci, 0x2488, CHIP_GA104)
+  CHECK_PCI(pci, 0x2487, CHIP_GA104)
  CHECK_PCI(pci, 0x2486, CHIP_GA104)
  CHECK_PCI(pci, 0x2484, CHIP_GA104)
  CHECK_PCI(pci, 0x2483, CHIP_GA104)
  CHECK_PCI(pci, 0x2482, CHIP_GA104)
+  CHECK_PCI(pci, 0x2460, CHIP_GA103M)
+  CHECK_PCI(pci, 0x2438, CHIP_GA103GLM)
+  CHECK_PCI(pci, 0x2420, CHIP_GA103M)
+  CHECK_PCI(pci, 0x2414, CHIP_GA103)
+  CHECK_PCI(pci, 0x2336, CHIP_GH100)
+  CHECK_PCI(pci, 0x2331, CHIP_GH100)
+  CHECK_PCI(pci, 0x2321, CHIP_GH100)
+  CHECK_PCI(pci, 0x2302, CHIP_GH100)
+  CHECK_PCI(pci, 0x228e, CHIP_GA106)
  CHECK_PCI(pci, 0x228b, CHIP_GA104)
  CHECK_PCI(pci, 0x223f, CHIP_GA102GL)
+  CHECK_PCI(pci, 0x2238, CHIP_GA102GL)
  CHECK_PCI(pci, 0x2237, CHIP_GA102GL)
  CHECK_PCI(pci, 0x2236, CHIP_GA102GL)
  CHECK_PCI(pci, 0x2235, CHIP_GA102GL)
+  CHECK_PCI(pci, 0x2233, CHIP_GA102GL)
+  CHECK_PCI(pci, 0x2232, CHIP_GA102GL)
  CHECK_PCI(pci, 0x2231, CHIP_GA102GL)
  CHECK_PCI(pci, 0x2230, CHIP_GA102GL)
  CHECK_PCI(pci, 0x222f, CHIP_GA102)
  CHECK_PCI(pci, 0x222b, CHIP_GA102)
  CHECK_PCI(pci, 0x2216, CHIP_GA102)
  CHECK_PCI(pci, 0x220d, CHIP_GA102)
+  CHECK_PCI(pci, 0x220a, CHIP_GA102)
  CHECK_PCI(pci, 0x2208, CHIP_GA102)
+  CHECK_PCI(pci, 0x2207, CHIP_GA102)
  CHECK_PCI(pci, 0x2206, CHIP_GA102)
  CHECK_PCI(pci, 0x2205, CHIP_GA102)
  CHECK_PCI(pci, 0x2204, CHIP_GA102)
+  CHECK_PCI(pci, 0x2203, CHIP_GA102)
  CHECK_PCI(pci, 0x2200, CHIP_GA102)
  CHECK_PCI(pci, 0x21d1, CHIP_TU116BM)
  CHECK_PCI(pci, 0x21c4, CHIP_TU116)
@@ -104,27 +139,45 @@ GPUCHIP get_chip_from_pci(struct pci* pci) {
  CHECK_PCI(pci, 0x2184, CHIP_TU116)
  CHECK_PCI(pci, 0x2183, CHIP_TU116)
  CHECK_PCI(pci, 0x2182, CHIP_TU116)
+  CHECK_PCI(pci, 0x20f6, CHIP_GA100)
+  CHECK_PCI(pci, 0x20f5, CHIP_GA100)
+  CHECK_PCI(pci, 0x20f2, CHIP_GA100)
  CHECK_PCI(pci, 0x20f1, CHIP_GA100)
+  CHECK_PCI(pci, 0x20f0, CHIP_GA100)
+  CHECK_PCI(pci, 0x20c2, CHIP_GA100)
  CHECK_PCI(pci, 0x20bf, CHIP_GA100)
  CHECK_PCI(pci, 0x20be, CHIP_GA100)
+  CHECK_PCI(pci, 0x20bb, CHIP_GA100)
+  CHECK_PCI(pci, 0x20b9, CHIP_GA100)
+  CHECK_PCI(pci, 0x20b8, CHIP_GA100)
  CHECK_PCI(pci, 0x20b7, CHIP_GA100GL)
  CHECK_PCI(pci, 0x20b6, CHIP_GA100GL)
  CHECK_PCI(pci, 0x20b5, CHIP_GA100)
+  CHECK_PCI(pci, 0x20b3, CHIP_GA100)
  CHECK_PCI(pci, 0x20b2, CHIP_GA100)
  CHECK_PCI(pci, 0x20b1, CHIP_GA100)
  CHECK_PCI(pci, 0x20b0, CHIP_GA100)
+  CHECK_PCI(pci, 0x2082, CHIP_GA100)
  CHECK_PCI(pci, 0x1ff9, CHIP_TU117GLM)
+  CHECK_PCI(pci, 0x1ff2, CHIP_TU117GL)
+  CHECK_PCI(pci, 0x1ff0, CHIP_TU117GL)
  CHECK_PCI(pci, 0x1fdd, CHIP_TU117BM)
  CHECK_PCI(pci, 0x1fd9, CHIP_TU117BM)
  CHECK_PCI(pci, 0x1fbf, CHIP_TU117GL)
+  CHECK_PCI(pci, 0x1fbc, CHIP_TU117GLM)
  CHECK_PCI(pci, 0x1fbb, CHIP_TU117GLM)
  CHECK_PCI(pci, 0x1fba, CHIP_TU117GLM)
  CHECK_PCI(pci, 0x1fb9, CHIP_TU117GLM)
  CHECK_PCI(pci, 0x1fb8, CHIP_TU117GLM)
+  CHECK_PCI(pci, 0x1fb7, CHIP_TU117GLM)
+  CHECK_PCI(pci, 0x1fb6, CHIP_TU117GLM)
  CHECK_PCI(pci, 0x1fb2, CHIP_TU117GLM)
  CHECK_PCI(pci, 0x1fb1, CHIP_TU117GL)
  CHECK_PCI(pci, 0x1fb0, CHIP_TU117GLM)
  CHECK_PCI(pci, 0x1fae, CHIP_TU117GL)
+  CHECK_PCI(pci, 0x1fa1, CHIP_TU117M)
+  CHECK_PCI(pci, 0x1fa0, CHIP_TU117M)
+  CHECK_PCI(pci, 0x1f9f, CHIP_TU117M)
  CHECK_PCI(pci, 0x1f9d, CHIP_TU117M)
  CHECK_PCI(pci, 0x1f9c, CHIP_TU117M)
  CHECK_PCI(pci, 0x1f99, CHIP_TU117M)
@@ -135,6 +188,7 @@ GPUCHIP get_chip_from_pci(struct pci* pci) {
  CHECK_PCI(pci, 0x1f94, CHIP_TU117M)
  CHECK_PCI(pci, 0x1f92, CHIP_TU117M)
  CHECK_PCI(pci, 0x1f91, CHIP_TU117M)
+  CHECK_PCI(pci, 0x1f83, CHIP_TU117)
  CHECK_PCI(pci, 0x1f82, CHIP_TU117)
  CHECK_PCI(pci, 0x1f81, CHIP_TU117)
  CHECK_PCI(pci, 0x1f76, CHIP_TU106GLM)
@@ -158,6 +212,7 @@ GPUCHIP get_chip_from_pci(struct pci* pci) {
  CHECK_PCI(pci, 0x1f07, CHIP_TU106)
  CHECK_PCI(pci, 0x1f06, CHIP_TU106)
  CHECK_PCI(pci, 0x1f04, CHIP_TU106)
+  CHECK_PCI(pci, 0x1f03, CHIP_TU106)
  CHECK_PCI(pci, 0x1f02, CHIP_TU106)
  CHECK_PCI(pci, 0x1ef5, CHIP_TU104GLM)
  CHECK_PCI(pci, 0x1ed3, CHIP_TU104BM)
@@ -170,6 +225,7 @@ GPUCHIP get_chip_from_pci(struct pci* pci) {
  CHECK_PCI(pci, 0x1eb8, CHIP_TU104GL)
  CHECK_PCI(pci, 0x1eb6, CHIP_TU104GLM)
  CHECK_PCI(pci, 0x1eb5, CHIP_TU104GLM)
+  CHECK_PCI(pci, 0x1eb4, CHIP_TU104GL)
  CHECK_PCI(pci, 0x1eb1, CHIP_TU104GL)
  CHECK_PCI(pci, 0x1eb0, CHIP_TU104GL)
  CHECK_PCI(pci, 0x1eae, CHIP_TU104M)
@@ -200,6 +256,7 @@ GPUCHIP get_chip_from_pci(struct pci* pci) {
  CHECK_PCI(pci, 0x1df5, CHIP_GV100GL)
  CHECK_PCI(pci, 0x1df2, CHIP_GV100GL)
  CHECK_PCI(pci, 0x1df0, CHIP_GV100GL)
+  CHECK_PCI(pci, 0x1dbe, CHIP_GV100)
  CHECK_PCI(pci, 0x1dba, CHIP_GV100GL)
  CHECK_PCI(pci, 0x1db8, CHIP_GV100GL)
  CHECK_PCI(pci, 0x1db7, CHIP_GV100GL)
@@ -219,6 +276,7 @@ GPUCHIP get_chip_from_pci(struct pci* pci) {
  CHECK_PCI(pci, 0x1d12, CHIP_GP108M)
  CHECK_PCI(pci, 0x1d11, CHIP_GP108M)
  CHECK_PCI(pci, 0x1d10, CHIP_GP108M)
+  CHECK_PCI(pci, 0x1d02, CHIP_GP108)
  CHECK_PCI(pci, 0x1d01, CHIP_GP108)
  CHECK_PCI(pci, 0x1cfb, CHIP_GP107GL)
  CHECK_PCI(pci, 0x1cfa, CHIP_GP107GL)
@@ -304,6 +362,7 @@ GPUCHIP get_chip_from_pci(struct pci* pci) {
  CHECK_PCI(pci, 0x1b02, CHIP_GP102)
  CHECK_PCI(pci, 0x1b01, CHIP_GP102)
  CHECK_PCI(pci, 0x1b00, CHIP_GP102)
+  CHECK_PCI(pci, 0x1af1, CHIP_GA100)
  CHECK_PCI(pci, 0x1aef, CHIP_GA102)
  CHECK_PCI(pci, 0x1aed, CHIP_TU116)
  CHECK_PCI(pci, 0x1aec, CHIP_TU116)
--- a/src/cuda/pci.hpp
+++ b/src/cuda/pci.hpp
@@ -1,13 +1,19 @@
-#ifndef __PCI__
-#define __PCI__
+#ifndef __PCI_CUDA__
+#define __PCI_CUDA__

-#include <stdint.h>
-#include "nvmlb.hpp"
+#include <cstdint>
+
+#include "../common/pci.hpp"
 #include "chips.hpp"

+/*
+ * doc: https://wiki.osdev.org/PCI#Class_Codes
+ *      https://pci-ids.ucw.cz/read/PC
+ */
+#define PCI_VENDOR_ID_NVIDIA 0x10de
+
 struct pci;

-struct pci* get_pci_from_nvml(struct nvml_data* data);
-GPUCHIP get_chip_from_pci(struct pci* pci);
+GPUCHIP get_chip_from_pci_cuda(struct pci* pci);

 #endif
--- a/src/cuda/uarch.cpp
+++ b/src/cuda/uarch.cpp
@@ -1,23 +1,19 @@
 #include <cuda_runtime.h>
-#include <helper_cuda.h>
-#include <stdint.h>
+#include <cstdlib>
+#include <cstdint>
 #include <cstddef>
+#include <cstdio>
+#include <cstring>

+#include "../common/uarch.hpp"
 #include "../common/global.hpp"
 #include "../common/gpu.hpp"
+#include "pci.hpp"
 #include "chips.hpp"

-typedef uint32_t MICROARCH;
-
 // Any clock multiplier
 #define CM_ANY               -1

-// Data not available
-#define NA                   -1
-
-// Unknown manufacturing process
-#define UNK                  -1
-
 // MICROARCH values
 enum {
  UARCH_UNKNOWN,
@@ -29,6 +25,8 @@ enum {
  UARCH_VOLTA,
  UARCH_TURING,
  UARCH_AMPERE,
+  UARCH_ADA,
+  UARCH_HOPPER
 };

 static const char *uarch_str[] = {
@@ -41,25 +39,14 @@ static const char *uarch_str[] = {
  /*[ARCH_VOLTA]      = */ "Volta",
  /*[ARCH_TURING]     = */ "Turing",
  /*[ARCH_AMPERE]     = */ "Ampere",
-};
-
-struct uarch {
-  int32_t cc_major;
-  int32_t cc_minor;
-  int32_t compute_capability;
-
-  MICROARCH uarch;
-  GPUCHIP chip;
-
-  int32_t process;
-  char* uarch_str;
-  char* chip_str;
+  /*[ARCH_ADA]        = */ "Ada Lovelace",
+  /*[ARCH_HOPPER]     = */ "Hopper"
 };

 #define CHECK_UARCH_START if (false) {}
 #define CHECK_UARCH(arch, chip_, str, uarch, process) \
   else if (arch->chip == chip_) fill_uarch(arch, str, uarch, process);
-#define CHECK_UARCH_END else { printBug("map_chip_to_uarch: Unknown chip id: %d", arch->chip); fill_uarch(arch, STRING_UNKNOWN, UARCH_UNKNOWN, 0); }
+#define CHECK_UARCH_END else { if(arch->chip != CHIP_UNKNOWN_CUDA) printBug("map_chip_to_uarch_cuda: Unknown chip id: %d", arch->chip); fill_uarch(arch, STRING_UNKNOWN, UARCH_UNKNOWN, UNK); }

 void fill_uarch(struct uarch* arch, char const *str, MICROARCH u, uint32_t process) {
  arch->chip_str = (char *) emalloc(sizeof(char) * (strlen(str)+1));
@@ -74,7 +61,7 @@ void fill_uarch(struct uarch* arch, char const *str, MICROARCH u, uint32_t proce
 * o CHIP_XXXGL: indicates a professional-class (Quadro/Tesla) chip
 * o CHIP_XXXM:  indicates a mobile chip
 */
-void map_chip_to_uarch(struct uarch* arch) {
+void map_chip_to_uarch_cuda(struct uarch* arch) {
  CHECK_UARCH_START
  // TESLA (1.0, 1.1, 1.2, 1.3)                                //
  CHECK_UARCH(arch, CHIP_G80,      "G80",      UARCH_TESLA,   90)
@@ -236,6 +223,9 @@ void map_chip_to_uarch(struct uarch* arch) {
  CHECK_UARCH(arch, CHIP_GA100GL,  "GA100",    UARCH_AMPERE,   7)
  CHECK_UARCH(arch, CHIP_GA102,    "GA102",    UARCH_AMPERE,   8)
  CHECK_UARCH(arch, CHIP_GA102GL,  "GA102",    UARCH_AMPERE,   8)
+  CHECK_UARCH(arch, CHIP_GA103,    "GA103",    UARCH_AMPERE,   8)
+  CHECK_UARCH(arch, CHIP_GA103GLM, "GA103",    UARCH_AMPERE,   8)
+  CHECK_UARCH(arch, CHIP_GA103M,   "GA103",    UARCH_AMPERE,   8)
  CHECK_UARCH(arch, CHIP_GA104,    "GA104",    UARCH_AMPERE,   8)
  CHECK_UARCH(arch, CHIP_GA104GL,  "GA104",    UARCH_AMPERE,   8)
  CHECK_UARCH(arch, CHIP_GA104GLM, "GA104",    UARCH_AMPERE,   8)
@@ -246,6 +236,13 @@ void map_chip_to_uarch(struct uarch* arch) {
  CHECK_UARCH(arch, CHIP_GA107BM,  "GA107",    UARCH_AMPERE,   8)
  CHECK_UARCH(arch, CHIP_GA107GLM, "GA107",    UARCH_AMPERE,   8)
  CHECK_UARCH(arch, CHIP_GA107M,   "GA107",    UARCH_AMPERE,   8)
+  // ADA LOVELACE (8.9)
+  CHECK_UARCH(arch, CHIP_AD102,    "AD102",    UARCH_ADA,      4)
+  CHECK_UARCH(arch, CHIP_AD102GL,  "AD102",    UARCH_ADA,      4)
+  CHECK_UARCH(arch, CHIP_AD104,    "AD104",    UARCH_ADA,      4)
+  CHECK_UARCH(arch, CHIP_AD104GL,  "AD104",    UARCH_ADA,      4)
+  // HOPPER (9.0)
+  CHECK_UARCH(arch, CHIP_GH100,    "GH100",    UARCH_HOPPER,   4)
  CHECK_UARCH_END
 }

@@ -263,9 +260,8 @@ struct uarch* get_uarch_from_cuda(struct gpu_info* gpu) {
  arch->cc_major = deviceProp.major;
  arch->cc_minor = deviceProp.minor;
  arch->compute_capability = deviceProp.major * 10 + deviceProp.minor;
-  arch->chip = get_chip_from_pci(gpu->pci);
-
-  map_chip_to_uarch(arch);
+  arch->chip = get_chip_from_pci_cuda(gpu->pci);
+  map_chip_to_uarch_cuda(arch);

  return arch;
 }
@@ -285,11 +281,17 @@ bool clkm_possible_for_uarch(int clkm, struct uarch* arch) {
    case UARCH_VOLTA:   return clkm == 1;
    case UARCH_TURING:  return clkm == 2 || clkm == 4;
    case UARCH_AMPERE:  return clkm == 1 || clkm == 4 || clkm == 8;
+    case UARCH_ADA:     return clkm == 8;
+    case UARCH_HOPPER:  return clkm == 1;
  }
  return false;
 }

 MEMTYPE guess_memtype_from_cmul_and_uarch(int clkm, struct uarch* arch) {
+  if(arch->uarch == UARCH_UNKNOWN) {
+    printWarn("guess_memtype_from_cmul_and_uarch: Found unknown uarch");
+    return MEMTYPE_UNKNOWN;
+  }
  /*
   * +---------+------------------+
   * | MEMTYPE | Clock multiplier |
@@ -332,13 +334,13 @@ MEMTYPE guess_memtype_from_cmul_and_uarch(int clkm, struct uarch* arch) {
  CHECK_MEMTYPE(arch, clkm, UARCH_AMPERE,     1, MEMTYPE_HBM2)
  CHECK_MEMTYPE(arch, clkm, UARCH_AMPERE,     4, MEMTYPE_GDDR6)
  CHECK_MEMTYPE(arch, clkm, UARCH_AMPERE,     8, MEMTYPE_GDDR6X)
+  // ADA
+  CHECK_MEMTYPE(arch, clkm, UARCH_ADA,        8, MEMTYPE_GDDR6X)
+  // HOPPER
+  CHECK_MEMTYPE(arch, clkm, UARCH_HOPPER,     1, MEMTYPE_HBM2)
  CHECK_MEMTYPE_END
 }

-const char* get_str_uarch(struct uarch* arch) {
-  return uarch_str[arch->uarch];
-}
-
 char* get_str_cc(struct uarch* arch) {
  uint32_t max_size = 4;
  char* cc = (char *) ecalloc(max_size, sizeof(char));
@@ -346,33 +348,22 @@ char* get_str_cc(struct uarch* arch) {
  return cc;
 }

-char* get_str_process(struct uarch* arch) {
-  char* str = (char *) emalloc(sizeof(char) * (strlen(STRING_UNKNOWN)+1));
-  int32_t process = arch->process;
-
-  if(process == UNK) {
-    snprintf(str, strlen(STRING_UNKNOWN)+1, STRING_UNKNOWN);
-  }
-  else if(process > 100) {
-    sprintf(str, "%.2fum", (double)process/100);
-  }
-  else if(process > 0){
-    sprintf(str, "%dnm", process);
-  }
-  else {
-    snprintf(str, strlen(STRING_UNKNOWN)+1, STRING_UNKNOWN);
-    printBug("Found invalid process: '%d'", process);
-  }
-
-  return str;
-}
-
 char* get_str_chip(struct uarch* arch) {
  return arch->chip_str;
 }

+// TODO: What about _ConvertSMVer2ArchName?
+// Returns the display name of the microarchitecture, looked up in the
+// uarch_str table with arch->uarch as index. The table holds string
+// literals, so the result must not be freed or modified.
+const char* get_str_uarch_cuda(struct uarch* arch) {
+  const char* name = uarch_str[arch->uarch];
+  return name;
+}
+
 void free_uarch_struct(struct uarch* arch) {
  free(arch->uarch_str);
  free(arch->chip_str);
  free(arch);
 }
+
+// True when arch->chip is one of the TU116 variants
+// (CHIP_TU116, CHIP_TU116BM, CHIP_TU116GL or CHIP_TU116M).
+bool is_chip_TU116(struct uarch* arch) {
+  switch (arch->chip) {
+    case CHIP_TU116:
+    case CHIP_TU116BM:
+    case CHIP_TU116GL:
+    case CHIP_TU116M:
+      return true;
+    default:
+      return false;
+  }
+}
--- a/src/cuda/uarch.hpp
+++ b/src/cuda/uarch.hpp
@@ -1,5 +1,5 @@
-#ifndef __UARCH__
-#define __UARCH__
+#ifndef __CUDA_UARCH__
+#define __CUDA_UARCH__

 #include "../common/gpu.hpp"

@@ -8,10 +8,11 @@ struct uarch;
 struct uarch* get_uarch_from_cuda(struct gpu_info* gpu);
 bool clkm_possible_for_uarch(int clkm, struct uarch* arch);
 MEMTYPE guess_memtype_from_cmul_and_uarch(int ddr, struct uarch* arch);
-char* get_str_uarch(struct uarch* arch);
+char* get_str_uarch_cuda(struct uarch* arch);
 char* get_str_cc(struct uarch* arch);
 char* get_str_chip(struct uarch* arch);
 char* get_str_process(struct uarch* arch);
 void free_uarch_struct(struct uarch* arch);
+bool is_chip_TU116(struct uarch* arch);

 #endif
--- a/src/hsa/chips.hpp
+++ b/src/hsa/chips.hpp
@@ -0,0 +1,37 @@
+#ifndef __HSA_GPUCHIPS__
+#define __HSA_GPUCHIPS__
+
+// uint32_t comes from <cstdint>; include it here so this header does not
+// depend on the include order of whoever uses it (same as intel/chips.hpp).
+#include <cstdint>
+
+// Identifier of a concrete AMD chip; takes one of the CHIP_* values below.
+typedef uint32_t GPUCHIP;
+
+enum {
+  CHIP_UNKNOWN_HSA,
+  // VEGA (TODO)
+  // ...
+  // RDNA
+  CHIP_NAVI_10,
+  CHIP_NAVI_12,
+  CHIP_NAVI_14,
+  // RDNA2
+  // There are way more (eg Oberon)
+  // Maybe we'll add them in the future.
+  CHIP_NAVI_21,
+  CHIP_NAVI_22,
+  CHIP_NAVI_23,
+  CHIP_NAVI_24,
+  // RDNA3
+  // There are way more as well.
+  // Supporting Navi only for now.
+  CHIP_NAVI_31,
+  CHIP_NAVI_32,
+  CHIP_NAVI_33,
+  // RDNA4
+  CHIP_NAVI_44,
+  CHIP_NAVI_48,
+  // CDNA
+  CHIP_ARCTURUS,      // MI100 series
+  CHIP_ALDEBARAN,     // MI200 series
+  CHIP_AQUA_VANJARAM, // MI300 series
+  CHIP_CDNA_NEXT      // MI350 series
+};
+
+#endif
--- a/src/hsa/hsa.cpp
+++ b/src/hsa/hsa.cpp
@@ -0,0 +1,242 @@
+#include <iostream>
+#include <hsa/hsa.h>
+#include <hsa/hsa_ext_amd.h>
+
+#include <cstring>
+#include <cstdlib>
+#include <cstdio>
+
+#include <iostream>
+#include <iomanip>
+#include <hsa/hsa.h>
+#include <hsa/hsa_ext_amd.h>
+
+#include "hsa.hpp"
+#include "uarch.hpp"
+#include "../common/global.hpp"
+#include "../common/uarch.hpp"
+
+// Scratch record with everything queried from one HSA GPU agent.
+// agent_callback() fills the names, clock and topology counters, and
+// memory_pool_callback() fills lds_size and global_size.
+struct agent_info {
+  unsigned deviceId; // ID of the target GPU device
+  char gpu_name[64];  
+  char vendor_name[64];
+  char device_mkt_name[64];
+  uint32_t max_clock_freq;
+  // Memory
+  uint32_t bus_width;
+  uint32_t lds_size;
+  uint64_t global_size;
+  // Topology
+  uint32_t compute_unit;
+  uint32_t num_shader_engines;
+  uint32_t simds_per_cu;
+  uint32_t num_xcc;            // Accelerator Complex Dies (XCDs)
+  // NOTE(review): no code in this file writes this field; the value that is
+  // shown is computed in get_topology_info() - confirm whether it is needed.
+  uint32_t matrix_cores;       // Cores with WMMA/MFMA capabilities
+};
+
+// Checks an hsa_status_t. On failure it logs the call site and the runtime's
+// description of the status (or its numeric value when the runtime has no
+// description) and returns the status from the enclosing function, which
+// must therefore return hsa_status_t.
+//
+// The argument is evaluated exactly once, and the do/while(0) wrapper makes
+// the macro a single statement, so `RET_IF_HSA_ERR(x);` is safe inside an
+// unbraced if/else.
+#define RET_IF_HSA_ERR(err) do {                                              \
+  const hsa_status_t ret_if_hsa_err_status_ = (err);                          \
+  if (ret_if_hsa_err_status_ != HSA_STATUS_SUCCESS) {                         \
+    char err_val[12];                                                         \
+    const char* err_str = NULL;                                               \
+    if (hsa_status_string(ret_if_hsa_err_status_,                             \
+            &err_str) != HSA_STATUS_SUCCESS) {                                \
+      /* No description available: print the raw status code instead */      \
+      snprintf(err_val, sizeof(err_val), "%#x",                               \
+               (uint32_t) ret_if_hsa_err_status_);                            \
+      err_str = err_val;                                                      \
+    }                                                                         \
+    printErr("HSA failure at: %s:%d\n", __FILE__, __LINE__);                  \
+    printErr("Call returned %s\n", err_str);                                  \
+    return ret_if_hsa_err_status_;                                            \
+  }                                                                           \
+} while (0)
+
+// Callback for hsa_amd_agent_iterate_memory_pools(). Stores into the
+// agent_info passed through `data`:
+//   - lds_size:    size of the group-segment (LDS) pool
+//   - global_size: size of the global pool flagged as extended-scope
+//                  fine-grained
+// Other pools are ignored.
+//
+// Returns HSA_STATUS_SUCCESS to continue with the next pool, or an error
+// status if a query fails or one of the two pools is reported twice.
+hsa_status_t memory_pool_callback(hsa_amd_memory_pool_t pool, void* data) {
+  struct agent_info* info = reinterpret_cast<struct agent_info *>(data);
+
+  hsa_amd_segment_t segment;
+  hsa_status_t err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment);
+  RET_IF_HSA_ERR(err);
+
+  if (segment == HSA_AMD_SEGMENT_GROUP) {
+    // LDS memory
+    // We want to make sure that this memory pool is not repeated.
+    if (info->lds_size != 0) {
+      printErr("Found HSA_AMD_SEGMENT_GROUP twice!");
+      return HSA_STATUS_ERROR;
+    }
+
+    // HSA_AMD_MEMORY_POOL_INFO_SIZE is a size_t attribute. Reading it into a
+    // 32-bit variable would let the runtime write past the end of it.
+    size_t size = 0;
+
+    err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SIZE, &size);
+    RET_IF_HSA_ERR(err);
+
+    info->lds_size = (uint32_t) size;
+  }
+  else if (segment == HSA_AMD_SEGMENT_GLOBAL) {
+    // Global memory
+    uint32_t global_flags = 0;
+
+    err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flags);
+    RET_IF_HSA_ERR(err);
+
+    if (global_flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED) {
+      if (info->global_size != 0) {
+        printErr("Found HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED twice!");
+        return HSA_STATUS_ERROR;
+      }
+
+      // Same attribute as above: query it with its documented type.
+      size_t size = 0;
+
+      err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SIZE, &size);
+      RET_IF_HSA_ERR(err);
+
+      info->global_size = (uint64_t) size;
+    }
+  }
+  return HSA_STATUS_SUCCESS;
+}
+
+// Callback for hsa_iterate_agents(). For a GPU agent it copies the agent's
+// names, clock, topology counters and memory pool sizes into the agent_info
+// passed through `data`; agents of any other type are skipped.
+//
+// The callback never stops the iteration on success, so when several GPU
+// agents are visited each one overwrites what the previous one stored.
+hsa_status_t agent_callback(hsa_agent_t agent, void *data) {
+  struct agent_info* out = reinterpret_cast<struct agent_info *>(data);
+
+  hsa_device_type_t device_type;
+  hsa_status_t status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type);
+  RET_IF_HSA_ERR(status);
+
+  // Only GPUs are of interest
+  if (device_type != HSA_DEVICE_TYPE_GPU) {
+    return HSA_STATUS_SUCCESS;
+  }
+
+  // Names
+  status = hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, out->gpu_name);
+  RET_IF_HSA_ERR(status);
+
+  status = hsa_agent_get_info(agent, HSA_AGENT_INFO_VENDOR_NAME, out->vendor_name);
+  RET_IF_HSA_ERR(status);
+
+  status = hsa_agent_get_info(agent, (hsa_agent_info_t) HSA_AMD_AGENT_INFO_PRODUCT_NAME, out->device_mkt_name);
+  RET_IF_HSA_ERR(status);
+
+  // Clock and topology
+  status = hsa_agent_get_info(agent, (hsa_agent_info_t) HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, &out->max_clock_freq);
+  RET_IF_HSA_ERR(status);
+
+  status = hsa_agent_get_info(agent, (hsa_agent_info_t) HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &out->compute_unit);
+  RET_IF_HSA_ERR(status);
+
+  // According to the documentation, this is deprecated. But what should I be using then?
+  status = hsa_agent_get_info(agent, (hsa_agent_info_t) HSA_AMD_AGENT_INFO_MEMORY_WIDTH, &out->bus_width);
+  RET_IF_HSA_ERR(status);
+
+  status = hsa_agent_get_info(agent, (hsa_agent_info_t) HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES, &out->num_shader_engines);
+  RET_IF_HSA_ERR(status);
+
+  status = hsa_agent_get_info(agent, (hsa_agent_info_t) HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU, &out->simds_per_cu);
+  RET_IF_HSA_ERR(status);
+
+  status = hsa_agent_get_info(agent, (hsa_agent_info_t) HSA_AMD_AGENT_INFO_NUM_XCC, &out->num_xcc);
+  RET_IF_HSA_ERR(status);
+
+  // Memory pools: memory_pool_callback() treats a non-zero value as
+  // "already seen", so both sizes start from zero.
+  out->lds_size = 0;
+  out->global_size = 0;
+  status = hsa_amd_agent_iterate_memory_pools(agent, memory_pool_callback, data);
+  RET_IF_HSA_ERR(status);
+
+  return HSA_STATUS_SUCCESS;
+}
+
+// Builds the HSA topology summary (compute units, shader engines, SIMDs per
+// CU, XCDs and matrix cores) from the data gathered for the agent.
+// The result is allocated with emalloc.
+struct topology_h* get_topology_info(struct agent_info info) {
+  struct topology_h* topology = (struct topology_h*) emalloc(sizeof(struct topology_h));
+
+  // Values taken as-is from the agent query
+  topology->num_xcc = info.num_xcc;
+  topology->simds_per_cu = info.simds_per_cu;             // not printed at the moment
+  topology->num_shader_engines = info.num_shader_engines; // not printed at the moment
+  topology->compute_units = info.compute_unit;
+
+  // Derived value: one matrix core per SIMD is assumed.
+  // Old GPUs (GCN I guess) might not have matrix cores, and it is
+  // unclear what this would report for them.
+  topology->matrix_cores = topology->compute_units * topology->simds_per_cu;
+
+  return topology;
+}
+
+// Packs the memory figures gathered for the agent (bus width, LDS size and
+// global memory size) into a newly allocated struct memory.
+// `gpu` is currently unused.
+struct memory* get_memory_info(struct gpu_info* gpu, struct agent_info info) {
+  struct memory* result = (struct memory*) emalloc(sizeof(struct memory));
+
+  result->size_bytes = info.global_size;
+  result->lds_size = info.lds_size;
+  result->bus_width = info.bus_width;
+
+  return result;
+}
+
+// Queries the HSA runtime and builds the gpu_info describing GPU `gpu_idx`.
+//
+// Only index 0 is supported for now: a negative index is reported as an
+// error and any index above zero silently yields NULL.
+//
+// Returns a heap-allocated gpu_info owned by the caller, or NULL on any
+// failure. On failure everything allocated here is released and the HSA
+// runtime is shut down again.
+struct gpu_info* get_gpu_info_hsa(int gpu_idx) {
+  // Validate the index before allocating or initializing anything
+  if(gpu_idx < 0) {
+    printErr("GPU index must be equal or greater than zero");
+    return NULL;
+  }
+
+  if(gpu_idx > 0) {
+    // Currently we only support fetching GPU 0.
+    return NULL;
+  }
+
+  hsa_status_t err = hsa_init();
+  if (err != HSA_STATUS_SUCCESS) {
+    printErr("Failed to initialize HSA runtime");
+    return NULL;
+  }
+
+  // Zero the record: if no GPU agent is visited, the callback never writes
+  // to it and the checks below must not read uninitialized memory.
+  struct agent_info info;
+  memset(&info, 0, sizeof(info));
+  info.deviceId = gpu_idx;
+
+  // Iterate over all agents in the system
+  err = hsa_iterate_agents(agent_callback, &info);
+  if (err != HSA_STATUS_SUCCESS) {
+    printErr("Failed to iterate HSA agents");
+    hsa_shut_down();
+    return NULL;
+  }
+
+  if (strcmp(info.vendor_name, "AMD") != 0) {
+    printErr("HSA vendor name is: '%s'. Only AMD is supported!", info.vendor_name);
+    hsa_shut_down();
+    return NULL;
+  }
+
+  struct gpu_info* gpu = (struct gpu_info*) emalloc(sizeof(struct gpu_info));
+  gpu->pci = NULL;
+  gpu->idx = gpu_idx;
+  gpu->vendor = GPU_VENDOR_AMD;
+  gpu->freq = info.max_clock_freq;
+  gpu->name = (char *) emalloc(sizeof(char) * (strlen(info.device_mkt_name) + 1));
+  strcpy(gpu->name, info.device_mkt_name);
+
+  // Resolve the architecture first: without it the rest is useless
+  gpu->arch = get_uarch_from_hsa(gpu, info.gpu_name);
+  if (gpu->arch == NULL) {
+    free(gpu->name);
+    free(gpu);
+    hsa_shut_down();
+    return NULL;
+  }
+
+  gpu->topo_h = get_topology_info(info);
+  gpu->mem = get_memory_info(gpu, info);
+
+  // Shut down the HSA runtime
+  err = hsa_shut_down();
+  if (err != HSA_STATUS_SUCCESS) {
+    printErr("Failed to shutdown HSA runtime");
+    free(gpu->mem);
+    free(gpu->topo_h);
+    free(gpu->arch->chip_str);
+    free(gpu->arch);
+    free(gpu->name);
+    free(gpu);
+    return NULL;
+  }
+  return gpu;
+}
+
+// Returns the number of compute units formatted by get_str_generic().
+char* get_str_cu(struct gpu_info* gpu) {
+  struct topology_h* topology = gpu->topo_h;
+  return get_str_generic(topology->compute_units);
+}
+
+// Returns the number of XCDs formatted by get_str_generic(), or NULL when
+// the GPU has exactly one XCD (in which case the field is not worth showing).
+char* get_str_xcds(struct gpu_info* gpu) {
+  struct topology_h* topology = gpu->topo_h;
+
+  if (topology->num_xcc != 1) {
+    return get_str_generic(topology->num_xcc);
+  }
+  // Single XCD: nothing to print
+  return NULL;
+}
+
+// Returns the number of matrix cores formatted by get_str_generic().
+// TODO: Show XX (WMMA/MFMA)
+char* get_str_matrix_cores(struct gpu_info* gpu) {
+  struct topology_h* topology = gpu->topo_h;
+  return get_str_generic(topology->matrix_cores);
+}
--- a/src/hsa/hsa.hpp
+++ b/src/hsa/hsa.hpp
@@ -0,0 +1,11 @@
+#ifndef __HSA_GPU__
+#define __HSA_GPU__
+
+#include "../common/gpu.hpp"
+
+struct gpu_info* get_gpu_info_hsa(int gpu_idx);
+char* get_str_cu(struct gpu_info* gpu);
+char* get_str_xcds(struct gpu_info* gpu);
+char* get_str_matrix_cores(struct gpu_info* gpu);
+
+#endif
--- a/src/hsa/uarch.cpp
+++ b/src/hsa/uarch.cpp
@@ -0,0 +1,321 @@
+#include <cstdlib>
+#include <cstdint>
+#include <cstring>
+
+#include "../common/uarch.hpp"
+#include "../common/global.hpp"
+#include "../common/gpu.hpp"
+#include "chips.hpp"
+
+// MICROARCH values
+enum {
+  UARCH_UNKNOWN,
+  // GCN (Graphics Core Next)
+  // Empty for now
+  // ...
+  // RDNA (Radeon DNA)
+  UARCH_RDNA,
+  UARCH_RDNA2,
+  UARCH_RDNA3,
+  UARCH_RDNA4,
+  // CDNA (Compute DNA)
+  UARCH_CDNA,
+  UARCH_CDNA2,
+  UARCH_CDNA3,
+  UARCH_CDNA4
+};
+
+// Display names of the microarchitectures, indexed by the UARCH_* values of
+// the enum above (get_str_uarch_hsa() looks them up with arch->uarch).
+// The order here must match the order of that enum.
+static const char *uarch_str[] = {
+  /*[UARCH_UNKNOWN]   = */ STRING_UNKNOWN,
+  /*[UARCH_RDNA]      = */ "RDNA",
+  /*[UARCH_RDNA2]     = */ "RDNA2",
+  /*[UARCH_RDNA3]     = */ "RDNA3",
+  /*[UARCH_RDNA4]     = */ "RDNA4",
+  /*[UARCH_CDNA]      = */ "CDNA",
+  /*[UARCH_CDNA2]     = */ "CDNA2",
+  /*[UARCH_CDNA3]     = */ "CDNA3",
+  /*[UARCH_CDNA4]     = */ "CDNA4",
+};
+
+// Sources: 
+// - https://rocm.docs.amd.com/en/latest/reference/gpu-arch-specs.html
+// - https://www.techpowerup.com
+//
+// This is sometimes referred to as LLVM target, but also as shader ISA.
+//
+// LLVM target *usually* maps to a specific architecture. However, there
+// are cases where this is not true:
+// MI8 is GCN3.0 with LLVM target gfx803
+// MI6 is GCN4.0 with LLVM target gfx803
+// or
+// Strix Point can be gfx1150 or gfx1151
+//
+// NOTE: GCN chips are stored for completeness, but they are
+// not actively supported.
+enum {
+  TARGET_UNKNOWN_HSA,
+  /// GCN (Graphics Core Next)
+  /// ------------------------
+  // GCN 1.0 
+  TARGET_GFX600,
+  TARGET_GFX601,
+  TARGET_GFX602,
+  // GCN 2.0
+  TARGET_GFX700,
+  TARGET_GFX701,
+  TARGET_GFX702,
+  TARGET_GFX703,
+  TARGET_GFX704,
+  TARGET_GFX705,
+  // GCN 3.0 / 4.0
+  TARGET_GFX801,
+  TARGET_GFX802,
+  TARGET_GFX803,
+  TARGET_GFX805,
+  TARGET_GFX810,
+  // GCN 5.0
+  TARGET_GFX900,
+  TARGET_GFX902,
+  TARGET_GFX904,
+  // GCN 5.1
+  TARGET_GFX906,
+  // ???
+  TARGET_GFX909,
+  TARGET_GFX90C,
+  /// RDNA (Radeon DNA)
+  /// -----------------
+  // RDNA1
+  TARGET_GFX1010,
+  TARGET_GFX1011,
+  TARGET_GFX1012,
+  // RDNA2
+  TARGET_GFX1013, // Oberon
+  TARGET_GFX1030,
+  TARGET_GFX1031,
+  TARGET_GFX1032,
+  TARGET_GFX1033,
+  TARGET_GFX1034,
+  TARGET_GFX1035, // ??
+  TARGET_GFX1036, // ??
+  // RDNA3
+  TARGET_GFX1100,
+  TARGET_GFX1101,
+  TARGET_GFX1102,
+  TARGET_GFX1103, // ???
+  // RDNA3.5
+  TARGET_GFX1150, // Strix Point
+  TARGET_GFX1151, // Strix Halo / Strix Point
+  TARGET_GFX1152, // Krackan Point
+  TARGET_GFX1153, // ???
+  // RDNA4
+  TARGET_GFX1200,
+  TARGET_GFX1201,
+  TARGET_GFX1250, // ???
+  TARGET_GFX1251, // ???
+  /// CDNA (Compute DNA)
+  /// ------------------
+  // CDNA
+  TARGET_GFX908,
+  // CDNA2
+  TARGET_GFX90A,
+  // CDNA3
+  TARGET_GFX942,
+  // CDNA4
+  TARGET_GFX950  
+};
+
+#define CHECK_UARCH_START if (false) {}
+#define CHECK_UARCH(arch, chip_, str, uarch, process) \
+   else if (arch->chip == chip_) fill_uarch(arch, str, uarch, process);
+#define CHECK_UARCH_END else { if(arch->chip != CHIP_UNKNOWN_HSA) printBug("map_chip_to_uarch_hsa: Unknown chip id: %d", arch->chip); fill_uarch(arch, STRING_UNKNOWN, UARCH_UNKNOWN, UNK); }
+
+// Stores the chip name, microarchitecture and process node in `arch`.
+// The name is duplicated into memory obtained from emalloc, so `str` does
+// not need to outlive the call.
+void fill_uarch(struct uarch* arch, char const *str, MICROARCH u, uint32_t process) {
+  const size_t len_with_nul = strlen(str) + 1;
+
+  arch->process = process;
+  arch->uarch = u;
+  arch->chip_str = (char *) emalloc(sizeof(char) * len_with_nul);
+  memcpy(arch->chip_str, str, len_with_nul);
+}
+
+// Fills arch->chip_str, arch->uarch and arch->process from arch->chip.
+//
+// The CHECK_UARCH_* macros expand to one if / else-if chain that compares
+// arch->chip against each listed chip and calls fill_uarch() on the first
+// match. When nothing matches, the entry is filled as unknown, and a bug is
+// reported unless the chip was CHIP_UNKNOWN_HSA to begin with.
+//
+// On chiplet based chips (such as Navi31, Navi32, etc),
+// we have 2 different processes: The MCD process and the
+// rest of the chip process. They might be different and here
+// we just take one - let's take MCD process for now.
+//
+// TODO: Should we differentiate?
+void map_chip_to_uarch_hsa(struct uarch* arch) {
+  CHECK_UARCH_START
+
+  // RDNA
+  CHECK_UARCH(arch, CHIP_NAVI_10,  "Navi 10", UARCH_RDNA,  7)
+  CHECK_UARCH(arch, CHIP_NAVI_12,  "Navi 12", UARCH_RDNA,  7)
+  CHECK_UARCH(arch, CHIP_NAVI_14,  "Navi 14", UARCH_RDNA,  7)
+  CHECK_UARCH(arch, CHIP_NAVI_21,  "Navi 21", UARCH_RDNA2, 7)
+  CHECK_UARCH(arch, CHIP_NAVI_22,  "Navi 22", UARCH_RDNA2, 7)
+  CHECK_UARCH(arch, CHIP_NAVI_23,  "Navi 23", UARCH_RDNA2, 7)
+  CHECK_UARCH(arch, CHIP_NAVI_24,  "Navi 24", UARCH_RDNA2, 6)
+  CHECK_UARCH(arch, CHIP_NAVI_31,  "Navi 31", UARCH_RDNA3, 6)
+  CHECK_UARCH(arch, CHIP_NAVI_32,  "Navi 32", UARCH_RDNA3, 6)
+  CHECK_UARCH(arch, CHIP_NAVI_33,  "Navi 33", UARCH_RDNA3, 6)
+  CHECK_UARCH(arch, CHIP_NAVI_44,  "Navi 44", UARCH_RDNA4, 4)
+  CHECK_UARCH(arch, CHIP_NAVI_48,  "Navi 48", UARCH_RDNA4, 4)
+  // CDNA
+  // NOTE: We will not show chip name for CDNA, thus use empty str
+  CHECK_UARCH(arch, CHIP_ARCTURUS,        "", UARCH_CDNA,  7)
+  CHECK_UARCH(arch, CHIP_ALDEBARAN,       "", UARCH_CDNA2, 6)
+  CHECK_UARCH(arch, CHIP_AQUA_VANJARAM,   "", UARCH_CDNA3, 6)
+  CHECK_UARCH(arch, CHIP_CDNA_NEXT,       "", UARCH_CDNA4, 6) // big difference between MCD and rest of the chip process
+  
+  CHECK_UARCH_END
+}
+
+#define CHECK_TGT_START if (false) {}
+#define CHECK_TGT(target, llvm_target, chip) \
+  else if (target == llvm_target) return chip;
+#define CHECK_TGT_END else { printBug("LLVM target '%d' has no matching chip", target); return CHIP_UNKNOWN_HSA; }
+
+// We have at least 2 choices to infer the chip:
+//
+// - LLVM target (e.g., gfx1101 is Navi 32)
+// - PCI ID (e.g., 0x7470 is Navi 32)
+//
+// For now we will use the first approach, which seems to have
+// some issues like mentioned in the enum.
+// However PCI detection is also not perfect, since it is
+// quite hard to find PCI ids from old hardware.
+// Maps a TARGET_* value to the CHIP_* it corresponds to.
+//
+// The CHECK_TGT_* macros expand to one if / else-if chain that returns the
+// chip of the first matching target. Targets that are not listed (including
+// the commented-out TODO entries) report a bug and yield CHIP_UNKNOWN_HSA.
+GPUCHIP get_chip_from_target_hsa(int32_t target) {
+  CHECK_TGT_START
+  /// RDNA
+  /// -------------------------------------------
+  CHECK_TGT(target, TARGET_GFX1010, CHIP_NAVI_10)
+  CHECK_TGT(target, TARGET_GFX1011, CHIP_NAVI_12)
+  CHECK_TGT(target, TARGET_GFX1012, CHIP_NAVI_14)
+  // CHECK_TGT(target, TARGET_GFX1013, TODO)
+  /// RDNA2
+  /// -------------------------------------------
+  CHECK_TGT(target, TARGET_GFX1030, CHIP_NAVI_21)
+  CHECK_TGT(target, TARGET_GFX1031, CHIP_NAVI_22)
+  CHECK_TGT(target, TARGET_GFX1032, CHIP_NAVI_23)
+  // NOTE(review): gfx1033 is mapped to Navi 21 here, but LLVM's AMDGPU
+  // processor table appears to list it as an APU (Van Gogh) - confirm, and
+  // add a dedicated chip entry if so.
+  CHECK_TGT(target, TARGET_GFX1033, CHIP_NAVI_21)
+  CHECK_TGT(target, TARGET_GFX1034, CHIP_NAVI_24)
+  // CHECK_TGT(target, TARGET_GFX1035, TODO)
+  // CHECK_TGT(target, TARGET_GFX1036, TODO)
+  /// RDNA3
+  /// -------------------------------------------
+  CHECK_TGT(target, TARGET_GFX1100, CHIP_NAVI_31)
+  CHECK_TGT(target, TARGET_GFX1101, CHIP_NAVI_32)
+  CHECK_TGT(target, TARGET_GFX1102, CHIP_NAVI_33)
+  // CHECK_TGT(target, TARGET_GFX1103, TODO)
+  /// RDNA3.5
+  /// -------------------------------------------
+  // CHECK_TGT(target, TARGET_GFX1150, TODO)
+  // CHECK_TGT(target, TARGET_GFX1151, TODO)
+  // CHECK_TGT(target, TARGET_GFX1152, TODO)
+  // CHECK_TGT(target, TARGET_GFX1153, TODO)
+  /// RDNA4
+  /// -------------------------------------------
+  CHECK_TGT(target, TARGET_GFX1200, CHIP_NAVI_44)
+  CHECK_TGT(target, TARGET_GFX1201, CHIP_NAVI_48)
+  // CHECK_TGT(target, TARGET_GFX1250, TODO)
+  // CHECK_TGT(target, TARGET_GFX1251, TODO)
+  /// CDNA
+  /// -------------------------------------------
+  CHECK_TGT(target, TARGET_GFX908, CHIP_ARCTURUS)
+  /// CDNA2
+  /// -------------------------------------------
+  CHECK_TGT(target, TARGET_GFX90A, CHIP_ALDEBARAN)
+  /// CDNA3
+  /// -------------------------------------------
+  CHECK_TGT(target, TARGET_GFX942, CHIP_AQUA_VANJARAM)
+  /// CDNA4
+  /// -------------------------------------------
+  CHECK_TGT(target, TARGET_GFX950, CHIP_CDNA_NEXT)
+  CHECK_TGT_END
+}
+
+#define CHECK_TGT_STR_START if (false) {}
+#define CHECK_TGT_STR(target, llvm_target, chip) \
+  else if (strcmp(target, llvm_target) == 0) return chip;
+#define CHECK_TGT_STR_END else { return TARGET_UNKNOWN_HSA; }
+
+// Maps the LLVM target string (e.g. "gfx1100") to its TARGET_* enum value.
+//
+// The comparison is an exact, case-sensitive strcmp against each listed
+// name; the first match is returned. Any string that is not listed yields
+// TARGET_UNKNOWN_HSA without printing anything.
+//
+// NOTE: the GCN targets and gfx1150-gfx1153 exist in the enum but have no
+// entry here, so they are currently reported as unknown.
+int32_t get_llvm_target_from_str(char* target) {
+  // TODO: Autogenerate this
+  // TODO: Add all, not only the ones we support in get_chip_from_target_hsa
+  CHECK_TGT_STR_START
+  CHECK_TGT_STR(target, "gfx1010", TARGET_GFX1010)
+  CHECK_TGT_STR(target, "gfx1011", TARGET_GFX1011)
+  CHECK_TGT_STR(target, "gfx1012", TARGET_GFX1012)
+  CHECK_TGT_STR(target, "gfx1013", TARGET_GFX1013)
+  CHECK_TGT_STR(target, "gfx1030", TARGET_GFX1030)
+  CHECK_TGT_STR(target, "gfx1031", TARGET_GFX1031)
+  CHECK_TGT_STR(target, "gfx1032", TARGET_GFX1032)
+  CHECK_TGT_STR(target, "gfx1033", TARGET_GFX1033)
+  CHECK_TGT_STR(target, "gfx1034", TARGET_GFX1034)
+  CHECK_TGT_STR(target, "gfx1035", TARGET_GFX1035)
+  CHECK_TGT_STR(target, "gfx1036", TARGET_GFX1036)
+  CHECK_TGT_STR(target, "gfx1100", TARGET_GFX1100)
+  CHECK_TGT_STR(target, "gfx1101", TARGET_GFX1101)
+  CHECK_TGT_STR(target, "gfx1102", TARGET_GFX1102)
+  CHECK_TGT_STR(target, "gfx1103", TARGET_GFX1103)
+  CHECK_TGT_STR(target, "gfx1200", TARGET_GFX1200)
+  CHECK_TGT_STR(target, "gfx1201", TARGET_GFX1201)
+  CHECK_TGT_STR(target, "gfx1250", TARGET_GFX1250)
+  CHECK_TGT_STR(target, "gfx1251", TARGET_GFX1251)
+  CHECK_TGT_STR(target, "gfx908",  TARGET_GFX908)
+  CHECK_TGT_STR(target, "gfx90a",  TARGET_GFX90A)
+  CHECK_TGT_STR(target, "gfx942",  TARGET_GFX942)
+  CHECK_TGT_STR(target, "gfx950",  TARGET_GFX950)
+  CHECK_TGT_STR_END
+}
+
+// Builds the uarch description for an HSA GPU from its LLVM target name
+// (`gpu_name`, e.g. "gfx1100").
+//
+// Returns a heap-allocated struct uarch owned by the caller, or NULL (after
+// printing an error) when the target name is not recognized. `gpu` is
+// currently unused.
+struct uarch* get_uarch_from_hsa(struct gpu_info* gpu, char* gpu_name) {
+  // Resolve the target before allocating, so the failure path has nothing
+  // to release.
+  int32_t llvm_target = get_llvm_target_from_str(gpu_name);
+  if (llvm_target == TARGET_UNKNOWN_HSA) {
+    printErr("Unknown LLVM target: '%s'", gpu_name);
+    return NULL;
+  }
+
+  struct uarch* arch = (struct uarch*) emalloc(sizeof(struct uarch));
+
+  arch->llvm_target = llvm_target;
+  arch->chip_str = NULL;
+  arch->chip = get_chip_from_target_hsa(arch->llvm_target);
+  // Fills chip_str, uarch and process (as unknown if the chip is not listed)
+  map_chip_to_uarch_hsa(arch);
+
+  return arch;
+}
+
+// Sanity check for a uarch: it must be non-NULL and its uarch field must lie
+// within [UARCH_UNKNOWN, UARCH_CDNA4]. Reports a bug and returns false
+// otherwise.
+bool is_uarch_valid(struct uarch* arch) {
+  if (arch == NULL) {
+    printBug("Invalid uarch: arch is NULL");
+    return false;
+  }
+
+  const bool out_of_range = arch->uarch < UARCH_UNKNOWN || arch->uarch > UARCH_CDNA4;
+  if (out_of_range) {
+    printBug("Invalid uarch: %d", arch->uarch);
+    return false;
+  }
+
+  return true;
+}
+
+// True when the microarchitecture is any of the CDNA generations.
+bool is_cdna(struct uarch* arch) {
+  switch (arch->uarch) {
+    case UARCH_CDNA:
+    case UARCH_CDNA2:
+    case UARCH_CDNA3:
+    case UARCH_CDNA4:
+      return true;
+    default:
+      return false;
+  }
+}
+
+// Returns the chip name stored in `arch`, or NULL for CDNA GPUs: there each
+// architecture maps one to one to a chip, so the name adds no value.
+char* get_str_chip(struct uarch* arch) {
+  if (!is_cdna(arch)) {
+    return arch->chip_str;
+  }
+  return NULL;
+}
+
+// Returns the display name of the microarchitecture from the uarch_str
+// table, or NULL when `arch` fails is_uarch_valid(). The string belongs to
+// the table and must not be freed.
+const char* get_str_uarch_hsa(struct uarch* arch) {
+  if (is_uarch_valid(arch)) {
+    return uarch_str[arch->uarch];
+  }
+  return NULL;
+}
--- a/src/hsa/uarch.hpp
+++ b/src/hsa/uarch.hpp
@@ -0,0 +1,13 @@
+#ifndef __HSA_UARCH__
+#define __HSA_UARCH__
+
+#include "../common/gpu.hpp"
+
+struct uarch;
+
+struct uarch* get_uarch_from_hsa(struct gpu_info* gpu, char* gpu_name);
+char* get_str_uarch_hsa(struct uarch* arch);
+char* get_str_process(struct uarch* arch); // TODO: Shouldnt we define this in the cpp?
+char* get_str_chip(struct uarch* arch);
+
+#endif
--- a/src/intel/check.sh
+++ b/src/intel/check.sh
@@ -0,0 +1,12 @@
+#!/bin/bash -u
+# Checks the difference between supported uarchs
+# and uarchs that have their topology available
+# in file uarch.cpp
+#
+# Both lists are written to temporary files and opened side by side in meld.
+
+# uarch.cpp lives next to this script; do not depend on the caller's cwd
+cd "$(dirname "${BASH_SOURCE[0]}")" || exit 1
+
+uarchs="$(grep 'CHECK_UARCH' uarch.cpp | cut -d',' -f4-5 | grep 'UARCH_GEN' | tr -d ' ' | sort | uniq)"
+topos="$(grep 'CHECK_TOPO' uarch.cpp | cut -d',' -f3,4 | grep 'UARCH_' | tr -d ' ' | sort | uniq)"
+
+# Use a private temporary directory instead of fixed, predictable names in
+# /tmp, and remove it on exit even if meld fails or the script is interrupted
+tmpdir=""
+trap 'rm -rf "$tmpdir"' EXIT
+tmpdir="$(mktemp -d)" || exit 1
+
+echo "$uarchs" > "$tmpdir/uarchs.txt"
+echo "$topos" > "$tmpdir/topos.txt"
+meld "$tmpdir/uarchs.txt" "$tmpdir/topos.txt"
--- a/src/intel/chips.hpp
+++ b/src/intel/chips.hpp
@@ -0,0 +1,78 @@
+#ifndef __INTEL_GPUCHIPS__
+#define __INTEL_GPUCHIPS__
+
+#include <cstdint>
+
+typedef uint32_t GPUCHIP;
+
+enum {
+  CHIP_UNKNOWN_INTEL,
+  // Gen6
+  CHIP_HD_SANDY,
+  CHIP_HD_2000,
+  CHIP_HD_3000,
+  // Gen7
+  CHIP_HD_SILVER,
+  CHIP_HD_IVY,
+  CHIP_HD_2500,
+  CHIP_HD_4000,
+  CHIP_HD_P4000,
+  // Gen7.5
+  CHIP_HD_HASWELL,
+  CHIP_HD_4200,
+  CHIP_HD_4400,
+  CHIP_HD_4600,
+  CHIP_HD_P4600,
+  CHIP_IRIS_5100,
+  CHIP_IRISP_5200,
+  CHIP_IRISP_P5200,
+  // Gen8
+  CHIP_HD_BROADWELL,
+  CHIP_HD_5300,
+  CHIP_HD_5500,
+  CHIP_HD_5600,
+  CHIP_HD_P5700,
+  CHIP_HD_6000,
+  CHIP_IRIS_6100,
+  CHIP_IRISP_6200,
+  CHIP_IRISP_P6300,
+  // Gen9
+  CHIP_HD_510,
+  CHIP_HD_515,
+  CHIP_HD_520,
+  CHIP_HD_530,
+  CHIP_HD_P530,
+  CHIP_HD_540,
+  CHIP_HD_550,
+  CHIP_IRIS_P555,
+  CHIP_IRIS_580,
+  CHIP_IRIS_P580,
+  // Gen9.5
+  CHIP_UHD_600,
+  CHIP_UHD_605,
+  CHIP_UHD_620,
+  CHIP_UHD_630,
+  CHIP_HD_610,
+  CHIP_HD_615,
+  CHIP_HD_620,
+  CHIP_HD_630,
+  CHIP_HD_P630,
+  CHIP_IRISP_640,
+  CHIP_IRISP_650,
+  CHIP_UHD_KBL_GT1,
+  CHIP_UHD_KBL_GT2,
+  // Gen11
+  CHIP_UHD_G1,
+  CHIP_IRISP_G4,
+  CHIP_IRISP_G7,
+  // Gen12
+  CHIP_UHD_710,
+  CHIP_UHD_730_ALD,
+  CHIP_UHD_730_RKL,
+  CHIP_UHD_750,
+  CHIP_UHD_770,
+  CHIP_XE_G4,
+  CHIP_XE_G7
+};
+
+#endif
--- a/src/intel/cpuid.cpp
+++ b/src/intel/cpuid.cpp
@@ -0,0 +1,71 @@
+#include "../common/global.hpp"
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <cstdio>
+
+#define CPU_VENDOR_MAX_LENGTH   13
+#define CPU_NAME_MAX_LENGTH     49
+#define CPU_VENDOR_INTEL_STRING "GenuineIntel"
+
+// Executes the CPUID instruction with EAX = level and stores the resulting
+// EAX, EBX, ECX and EDX in the four output pointers.
+//
+// The current value of *ecx is also loaded into ECX before the instruction
+// runs (the "2" input constraint below), so callers should initialize it.
+void cpuid(uint32_t level, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) {
+        __asm volatile("cpuid"
+            : "=a" (*eax),
+              "=b" (*ebx),
+              "=c" (*ecx),
+              "=d" (*edx)
+            : "0" (level), "2" (*ecx));
+}
+
+// Returns the CPU vendor string reported by CPUID leaf 0 (for example
+// "GenuineIntel") as a NUL-terminated, heap-allocated string that the
+// caller must free.
+char* get_cpu_vendor() {
+  // Zero-initialized because cpuid() also reads *ecx as an input
+  uint32_t vendor[3] = { 0, 0, 0 };
+  uint32_t dummy = 0;
+  char * name = (char *) emalloc(sizeof(char) * CPU_VENDOR_MAX_LENGTH);
+  memset(name, 0, CPU_VENDOR_MAX_LENGTH);
+
+  // The vendor string is returned in EBX, EDX, ECX (in that order)
+  cpuid(0x00000000, &dummy, vendor+0x0, vendor+0x2, vendor+0x1);
+
+  // The registers hold exactly 12 characters and no terminator, so the
+  // buffer cannot be handed to a "%s" conversion. Copy by length instead;
+  // name is one byte longer and already zeroed, which terminates it.
+  memcpy(name, vendor, sizeof(vendor));
+
+  return name;
+}
+
+// Returns the CPU brand string reported by CPUID leaves 0x80000002 to
+// 0x80000004 as a NUL-terminated, heap-allocated string that the caller
+// must free. The caller is expected to have checked that those leaves are
+// available.
+char* get_str_cpu_name_internal() {
+  // Zero-initialized because cpuid() also reads *ecx as an input
+  uint32_t brand[12] = { 0 };
+  char * name = (char *) emalloc(sizeof(char) * CPU_NAME_MAX_LENGTH);
+  memset(name, 0, CPU_NAME_MAX_LENGTH);
+
+  cpuid(0x80000002, brand+0x0, brand+0x1, brand+0x2, brand+0x3);
+  cpuid(0x80000003, brand+0x4, brand+0x5, brand+0x6, brand+0x7);
+  cpuid(0x80000004, brand+0x8, brand+0x9, brand+0xa, brand+0xb);
+
+  // Do not rely on the 48 register bytes containing a terminator: copy them
+  // by length. name is one byte longer and already zeroed.
+  memcpy(name, brand, sizeof(brand));
+
+  return name;
+}
+
+// Returns true when the CPU is an Intel CPU whose brand string contains
+// "i5". Reports a bug and returns false if the vendor is not Intel or the
+// brand string leaves are not available.
+bool is_corei5() {
+  uint32_t eax = 0;
+  uint32_t ebx = 0;
+  uint32_t ecx = 0;
+  uint32_t edx = 0;
+
+  // Get CPU vendor
+  char* cpu_vendor = get_cpu_vendor();
+  bool is_intel = strcmp(CPU_VENDOR_INTEL_STRING, cpu_vendor) == 0;
+
+  if(!is_intel) {
+    printBug("is_corei5: invalid CPU vendor: %s", cpu_vendor);
+  }
+  // The vendor string is only needed for the check above
+  free(cpu_vendor);
+  if(!is_intel) {
+    return false;
+  }
+
+  // EAX returns the highest extended leaf supported
+  cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
+
+  if (eax < 0x80000004){
+    printBug("is_corei5: unexpected max extended level: 0x%.8X", eax);
+    return false;
+  }
+
+  // Get CPU name
+  char* cpu_name = get_str_cpu_name_internal();
+  bool found_i5 = strstr(cpu_name, "i5") != NULL;
+  free(cpu_name);
+
+  return found_i5;
+}
--- a/src/intel/cpuid.hpp
+++ b/src/intel/cpuid.hpp
@@ -0,0 +1,6 @@
+#ifndef __CPUID__
+#define __CPUID__
+
+bool is_corei5();
+
+#endif
--- a/src/intel/intel.cpp
+++ b/src/intel/intel.cpp
@@ -0,0 +1,60 @@
+#include <cstdio>
+#include <cstring>
+
+#include "intel.hpp"
+#include "uarch.hpp"
+#include "chips.hpp"
+#include "udev.hpp"
+#include "../common/pci.hpp"
+#include "../common/global.hpp"
+
+// Computes the peak performance of an Intel iGPU as
+//   freq * 1000000 * EUs per subslice * subslices * 8 * 2
+// (presumably MHz -> Hz, 8 lanes per EU and 2 operations per FMA - TODO
+// confirm the meaning of the constants).
+//
+// Returns -1 when the topology or the frequency is not known.
+int64_t get_peak_performance_intel(struct gpu_info* gpu) {
+  // Check that we have valid data
+  if(gpu->topo_i->eu_subslice < 0 ||
+     gpu->topo_i->subslices < 0   ||
+     gpu->freq <= 0)
+  {
+    return -1;
+  }
+
+  // Widen before multiplying: with 32-bit operands the product overflows
+  // long before it is converted to the 64-bit return type.
+  int64_t freq_hz = (int64_t) gpu->freq * 1000000;
+  int64_t eus = (int64_t) gpu->topo_i->eu_subslice * (int64_t) gpu->topo_i->subslices;
+
+  return freq_hz * eus * 8 * 2;
+}
+
+// Builds the gpu_info for the Intel iGPU found in the pciutils device list
+// `devices`.
+//
+// Returns a heap-allocated gpu_info owned by the caller, or NULL when no
+// Intel device is present or the device is not in the architecture LUT.
+struct gpu_info* get_gpu_info_intel(struct pci_dev *devices) {
+  // Run the lookups that can fail before allocating the result, so the
+  // early returns do not leak it.
+  struct pci* pci = get_pci_from_pciutils(devices, PCI_VENDOR_ID_INTEL, 0);
+
+  if(pci == NULL) {
+    // No Intel iGPU found in PCI, which means it is not present
+    printWarn("Unable to find a valid device for vendor id 0x%.4X using pciutils", PCI_VENDOR_ID_INTEL);
+    return NULL;
+  }
+
+  struct uarch* arch = get_uarch_from_pci(pci);
+
+  if(arch == NULL) {
+    // No Intel iGPU found in arch LUT, which means it is not supported
+    return NULL;
+  }
+
+  struct gpu_info* gpu = (struct gpu_info*) emalloc(sizeof(struct gpu_info));
+  gpu->vendor = GPU_VENDOR_INTEL;
+  gpu->pci = pci;
+  gpu->arch = arch;
+
+  gpu->name = get_name_from_uarch(gpu->arch);
+  gpu->topo_i = get_topology_info(gpu->arch);
+  gpu->freq = get_max_freq_from_file(gpu->pci);
+  // Needs topo_i and freq, so it must come last
+  gpu->peak_performance = get_peak_performance_intel(gpu);
+
+  return gpu;
+}
+
+// Prints the GPU name followed by a newline if `gpu` is an Intel GPU.
+// Returns whether anything was printed.
+bool print_gpu_intel(struct gpu_info* gpu) {
+  const bool is_intel = (gpu->vendor == GPU_VENDOR_INTEL);
+
+  if(is_intel) {
+    printf("%s\n", gpu->name);
+  }
+
+  return is_intel;
+}
+
+// Returns the total number of execution units (subslices * EUs per
+// subslice) formatted by get_str_generic(). When either figure is negative,
+// -1 is passed to get_str_generic() instead.
+char* get_str_eu(struct gpu_info* gpu) {
+  const bool topo_known = gpu->topo_i->subslices >= 0 && gpu->topo_i->eu_subslice >= 0;
+
+  if(!topo_known) {
+    return get_str_generic(-1);
+  }
+  return get_str_generic(gpu->topo_i->subslices * gpu->topo_i->eu_subslice);
+}
--- a/src/intel/intel.hpp
+++ b/src/intel/intel.hpp
@@ -0,0 +1,10 @@
+#ifndef __INTEL_GPU__
+#define __INTEL_GPU__
+
+#include "../common/gpu.hpp"
+
+struct gpu_info* get_gpu_info_intel(struct pci_dev *devices);
+bool print_gpu_intel(struct gpu_info* gpu);
+char* get_str_eu(struct gpu_info* gpu);
+
+#endif
--- a/src/intel/pci.cpp
+++ b/src/intel/pci.cpp
@@ -0,0 +1,129 @@
+#include <cstdio>
+
+#include "pci.hpp"
+#include "chips.hpp"
+#include "../common/global.hpp"
+#include "../common/pci.hpp"
+
+#define CHECK_PCI_START if (false) {}
+#define CHECK_PCI(pci, id, chip) \
+   else if (pci->device_id == id) return chip;
+#define CHECK_PCI_END else { printBug("Unknown Intel device id: 0x%.4X", pci->device_id); return CHIP_UNKNOWN_INTEL; }
+
+// TODO: Review wikipedia link to improve the LUT
+/*
+ * Maps the PCI device id of an Intel GPU to its GPUCHIP identifier.
+ * The table expands to a single if / else-if chain built with the
+ * CHECK_PCI* macros, so the first entry whose id matches wins.
+ * Ids that are not in the table are reported with printBug and mapped
+ * to CHIP_UNKNOWN_INTEL.
+ *
+ * Sources for the device ids:
+ * https://en.wikipedia.org/wiki/List_of_Intel_graphics_processing_units
+ * https://github.com/mesa3d/mesa/blob/main/include/pci_ids/iris_pci_ids.h
+ * https://raw.githubusercontent.com/smxi/inxi/master/inxi
+ */
+GPUCHIP get_chip_from_pci_intel(struct pci* pci) {
+  CHECK_PCI_START
+  // Gen6
+  CHECK_PCI(pci, 0x010A, CHIP_HD_SANDY)
+  CHECK_PCI(pci, 0x0102, CHIP_HD_2000)
+  CHECK_PCI(pci, 0x0106, CHIP_HD_2000)
+  CHECK_PCI(pci, 0x0112, CHIP_HD_3000)
+  CHECK_PCI(pci, 0x0122, CHIP_HD_3000)
+  CHECK_PCI(pci, 0x0116, CHIP_HD_3000)
+  CHECK_PCI(pci, 0x0126, CHIP_HD_3000)
+  // Gen7
+  CHECK_PCI(pci, 0x015A, CHIP_HD_IVY)
+  CHECK_PCI(pci, 0x0F30, CHIP_HD_SILVER)
+  CHECK_PCI(pci, 0x0F31, CHIP_HD_SILVER)
+  CHECK_PCI(pci, 0x0F32, CHIP_HD_SILVER)
+  CHECK_PCI(pci, 0x0F33, CHIP_HD_SILVER)
+  CHECK_PCI(pci, 0x0155, CHIP_HD_SILVER)
+  CHECK_PCI(pci, 0x0157, CHIP_HD_SILVER)
+  CHECK_PCI(pci, 0x0152, CHIP_HD_2500)
+  CHECK_PCI(pci, 0x0156, CHIP_HD_2500)
+  CHECK_PCI(pci, 0x0162, CHIP_HD_4000)
+  CHECK_PCI(pci, 0x0166, CHIP_HD_4000)
+  CHECK_PCI(pci, 0x016a, CHIP_HD_P4000)
+  // Gen7.5
+  CHECK_PCI(pci, 0x0402, CHIP_HD_HASWELL)
+  CHECK_PCI(pci, 0x0406, CHIP_HD_HASWELL)
+  CHECK_PCI(pci, 0x040A, CHIP_HD_HASWELL)
+  CHECK_PCI(pci, 0x040B, CHIP_HD_HASWELL)
+  CHECK_PCI(pci, 0x040E, CHIP_HD_HASWELL)
+  CHECK_PCI(pci, 0x0A02, CHIP_HD_HASWELL)
+  CHECK_PCI(pci, 0x0A06, CHIP_HD_HASWELL)
+  CHECK_PCI(pci, 0x0A0A, CHIP_HD_HASWELL)
+  CHECK_PCI(pci, 0x0A0B, CHIP_HD_HASWELL)
+  CHECK_PCI(pci, 0x0A0E, CHIP_HD_HASWELL)
+  CHECK_PCI(pci, 0x0A1E, CHIP_HD_4200)
+  CHECK_PCI(pci, 0x041E, CHIP_HD_4400)
+  CHECK_PCI(pci, 0x0A16, CHIP_HD_4400)
+  CHECK_PCI(pci, 0x0412, CHIP_HD_4600)
+  CHECK_PCI(pci, 0x0416, CHIP_HD_4600)
+  CHECK_PCI(pci, 0x0D12, CHIP_HD_4600)
+  CHECK_PCI(pci, 0x041A, CHIP_HD_P4600)
+  CHECK_PCI(pci, 0x0A2E, CHIP_IRIS_5100)
+  CHECK_PCI(pci, 0x0D22, CHIP_IRISP_5200)
+  CHECK_PCI(pci, 0x0D26, CHIP_IRISP_P5200)
+  // Gen8
+  CHECK_PCI(pci, 0x1606, CHIP_HD_BROADWELL)
+  CHECK_PCI(pci, 0x161E, CHIP_HD_5300)
+  CHECK_PCI(pci, 0x1616, CHIP_HD_5500)
+  CHECK_PCI(pci, 0x1612, CHIP_HD_5600)
+  CHECK_PCI(pci, 0x161A, CHIP_HD_P5700)
+  CHECK_PCI(pci, 0x1626, CHIP_HD_6000)
+  CHECK_PCI(pci, 0x162B, CHIP_IRIS_6100)
+  CHECK_PCI(pci, 0x1622, CHIP_IRISP_6200)
+  CHECK_PCI(pci, 0x162A, CHIP_IRISP_P6300)
+  // Gen9
+  CHECK_PCI(pci, 0x1902, CHIP_HD_510)
+  CHECK_PCI(pci, 0x1906, CHIP_HD_510)
+  CHECK_PCI(pci, 0x190B, CHIP_HD_510)
+  CHECK_PCI(pci, 0x191E, CHIP_HD_515)
+  CHECK_PCI(pci, 0x1916, CHIP_HD_520)
+  CHECK_PCI(pci, 0x1921, CHIP_HD_520)
+  CHECK_PCI(pci, 0x1912, CHIP_HD_530)
+  CHECK_PCI(pci, 0x191B, CHIP_HD_530)
+  CHECK_PCI(pci, 0x191D, CHIP_HD_P530)
+  /*CHECK_PCI(pci, 0x5917, CHIP_HD_540)
+  CHECK_PCI(pci, 0x5917, CHIP_HD_550)
+  CHECK_PCI(pci, 0x5917, CHIP_HD_P555)
+  CHECK_PCI(pci, 0x5917, CHIP_HD_580)
+  CHECK_PCI(pci, 0x5917, CHIP_HD_P580)*/
+  // Gen9.5
+  CHECK_PCI(pci, 0x3185, CHIP_UHD_600)
+  CHECK_PCI(pci, 0x3184, CHIP_UHD_605)
+  CHECK_PCI(pci, 0x5917, CHIP_UHD_620)
+  CHECK_PCI(pci, 0x3EA0, CHIP_UHD_620)
+  CHECK_PCI(pci, 0x3E91, CHIP_UHD_630)
+  CHECK_PCI(pci, 0x3E92, CHIP_UHD_630)
+  CHECK_PCI(pci, 0x3E98, CHIP_UHD_630)
+  CHECK_PCI(pci, 0x3E9B, CHIP_UHD_630)
+  CHECK_PCI(pci, 0x9BC5, CHIP_UHD_630)
+  CHECK_PCI(pci, 0x9BC8, CHIP_UHD_630)
+  CHECK_PCI(pci, 0x5902, CHIP_HD_610)
+  CHECK_PCI(pci, 0x5906, CHIP_HD_610)
+  CHECK_PCI(pci, 0x590B, CHIP_HD_610)
+  CHECK_PCI(pci, 0x591E, CHIP_HD_615)
+  CHECK_PCI(pci, 0x5912, CHIP_HD_630)
+  CHECK_PCI(pci, 0x591B, CHIP_HD_630)
+  CHECK_PCI(pci, 0x591A, CHIP_HD_P630)
+  CHECK_PCI(pci, 0x591D, CHIP_HD_P630)
+  CHECK_PCI(pci, 0x5926, CHIP_IRISP_640)
+  CHECK_PCI(pci, 0x5927, CHIP_IRISP_650)
+  // Gen11
+  CHECK_PCI(pci, 0x8A58, CHIP_UHD_G1)
+  CHECK_PCI(pci, 0x8A56, CHIP_UHD_G1)
+  CHECK_PCI(pci, 0x8A5C, CHIP_IRISP_G4)
+  CHECK_PCI(pci, 0x8A5A, CHIP_IRISP_G4)
+  CHECK_PCI(pci, 0x8A51, CHIP_IRISP_G7)
+  CHECK_PCI(pci, 0x8A52, CHIP_IRISP_G7)
+  CHECK_PCI(pci, 0x8A53, CHIP_IRISP_G7)
+  // Xe (Gen12)
+  CHECK_PCI(pci, 0x4693, CHIP_UHD_710)
+  CHECK_PCI(pci, 0x4692, CHIP_UHD_730_ALD)
+  CHECK_PCI(pci, 0x4C8B, CHIP_UHD_730_RKL)
+  CHECK_PCI(pci, 0x4C8A, CHIP_UHD_750)
+  CHECK_PCI(pci, 0x4690, CHIP_UHD_770)
+  CHECK_PCI(pci, 0x4680, CHIP_UHD_770)
+  CHECK_PCI(pci, 0x9A78, CHIP_XE_G4)
+  CHECK_PCI(pci, 0x9A40, CHIP_XE_G7) // G7 may have 80 or 96 EUs
+  CHECK_PCI(pci, 0x9A49, CHIP_XE_G7) // Same for this G7
+  // TODO: Add generic UHD Graphics and Iris Xe Graphics from Mobile
+  CHECK_PCI_END
+}
--- a/src/intel/pci.hpp
+++ b/src/intel/pci.hpp
@@ -0,0 +1,19 @@
+#ifndef __PCI_INTEL__
+#define __PCI_INTEL__
+
+#include <cstdint>
+
+#include "../common/pci.hpp"
+#include "chips.hpp"
+
+/*
+ * doc: https://wiki.osdev.org/PCI#Class_Codes
+ *      https://pci-ids.ucw.cz/read/PC
+ */
+#define PCI_VENDOR_ID_INTEL 0x8086
+
+struct pci;
+
+GPUCHIP get_chip_from_pci_intel(struct pci* pci);
+
+#endif
--- a/src/intel/uarch.cpp
+++ b/src/intel/uarch.cpp
@@ -0,0 +1,276 @@
+#include <cstdint>
+#include <cstddef>
+#include <cstring>
+#include <cstdio>
+
+#include "../common/uarch.hpp"
+#include "../common/global.hpp"
+#include "../common/gpu.hpp"
+#include "chips.hpp"
+#include "pci.hpp"
+#include "cpuid.hpp"
+
+// Data not available
+#define NA                   -1
+
+// Unknown manufacturing process
+#define UNK                  -1
+
+/*
+ * Mapping between iGPU and CPU uarchs
+ * -----------------------------------
+ * Gen6:   Sandy Bridge      (2nd Gen)
+ * Gen7:   Ivy Bridge        (3rd Gen)
+ * Gen7.5: Haswell           (4th Gen)
+ * Gen8:   Broadwell         (5th Gen)
+ * Gen9:   Skylake           (6th Gen)
+ * Gen9.5: Kaby Lake
+ * Gen11:  Ice Lake          (10th Gen)
+ * Gen12:  Rocket/Tiger Lake (11th Gen)
+ * Gen12:  Alder Lake        (12th Gen)
+ *
+ * The values of the enum below are used as indexes into uarch_str,
+ * so both lists must be kept in the same order.
+ */
+enum {
+  UARCH_UNKNOWN,
+  UARCH_GEN6,
+  UARCH_GEN7,
+  UARCH_GEN7_5,
+  UARCH_GEN8,
+  UARCH_GEN9,
+  UARCH_GEN9_5,
+  UARCH_GEN11,
+  UARCH_GEN12_RKL,
+  UARCH_GEN12_TGL,
+  UARCH_GEN12_ALD,
+};
+
+static const char *uarch_str[] = {
+  /*[UARCH_UNKNOWN]   = */ STRING_UNKNOWN,
+  /*[UARCH_GEN6]      = */ "Gen6",
+  /*[UARCH_GEN7]      = */ "Gen7",
+  /*[UARCH_GEN7_5]    = */ "Gen7.5",
+  /*[UARCH_GEN8]      = */ "Gen8",
+  /*[UARCH_GEN9]      = */ "Gen9",
+  /*[UARCH_GEN9_5]    = */ "Gen9.5",
+  /*[UARCH_GEN11]     = */ "Gen11",
+  /*[UARCH_GEN12_RKL] = */ "Xe", // All the Gen12 variants are printed as "Xe"
+  /*[UARCH_GEN12_TGL] = */ "Xe",
+  /*[UARCH_GEN12_ALD] = */ "Xe",
+};
+
+// Graphics Tiers (GT). The enum values are used as indexes into gt_str, so both lists must keep the same order
+enum {
+  GT_UNKNOWN,
+  GT0_5, // Saw that 0.5 thing in iris_pci_ids.h
+  GT1,
+  GT1_4, // GT1 with 4 EUs
+  GT1_5,
+  GT2,
+  GT3,
+  GT3e, // GT3 with eDRAM cache
+  GT4e
+};
+
+static const char *gt_str[] = {
+  /*[GT_UNKNOWN] = */ STRING_UNKNOWN,
+  /*[GT0_5]      = */ "GT0.5",
+  /*[GT1]        = */ "GT1",
+  /*[GT1_4]      = */ "GT1", // Printed as a plain GT1
+  /*[GT1_5]      = */ "GT1.5",
+  /*[GT2]        = */ "GT2",
+  /*[GT3]        = */ "GT3",
+  /*[GT3e]       = */ "GT3e",
+  /*[GT4e]       = */ "GT4e",
+};
+
+#define CHECK_UARCH_START if (false) {}
+#define CHECK_UARCH(arch, chip_, str, uarch, gt, process) \
+   else if (arch->chip == chip_) fill_uarch(arch, str, uarch, gt, process);
+#define CHECK_UARCH_END else { printBug("map_chip_to_uarch_intel: Unknown chip id: %d", arch->chip); fill_uarch(arch, STRING_UNKNOWN, UARCH_UNKNOWN, GT_UNKNOWN, 0); }
+
+#define CHECK_TOPO_START if (false) {}
+#define CHECK_TOPO(topo, arch, uarch_, gt_, eu_sub, sub, sli) \
+  else if(arch->uarch == uarch_ && arch->gt == gt_) fill_topo(topo, eu_sub, sub, sli);
+#define CHECK_TOPO_CHIP(topo, arch, uarch_, chip_, eu_sub, sub, sli) \
+  else if(arch->uarch == uarch_ && arch->chip == chip_) fill_topo(topo, eu_sub, sub, sli);
+#define CHECK_TOPO_END else { printBug("get_topology_info: Invalid uarch and gt combination: '%s' and '%s'", arch->chip_str, get_str_gt(arch)); fill_topo(topo, UNK, UNK, UNK); }
+
+// Stores the EUs per subslice, the subslices and the slices in topo_i.
+void fill_topo(struct topology_i* topo_i, int32_t eu_sub, int32_t sub, int32_t sli) {
+  topo_i->eu_subslice = eu_sub;
+  topo_i->subslices   = sub;
+  topo_i->slices      = sli;
+}
+
+// Fills arch with a heap-allocated copy of str plus the given uarch,
+// graphics tier and manufacturing process.
+void fill_uarch(struct uarch* arch, char const *str, MICROARCH u, int32_t gt, uint32_t process) {
+  // Copy the name including its terminating NUL
+  size_t bytes = strlen(str) + 1;
+  char* copy = (char *) emalloc(sizeof(char) * bytes);
+  memcpy(copy, str, bytes);
+
+  arch->chip_str = copy;
+  arch->uarch = u;
+  arch->gt = gt;
+  arch->process = process;
+}
+
+/*
+ * Fills arch (name, uarch, graphics tier and process) from arch->chip, which
+ * must have been set by the caller. The table expands to a single
+ * if / else-if chain, so every chip must appear exactly once: a repeated
+ * chip makes the later row unreachable. Chips that are not in the table are
+ * reported with printBug and filled with the unknown values.
+ *
+ * Syntax: (arch, chip, name, uarch, graphics tier, process)
+ */
+void map_chip_to_uarch_intel(struct uarch* arch) {
+  CHECK_UARCH_START
+  // Gen6
+  CHECK_UARCH(arch, CHIP_HD_SANDY,     "HD Graphics (Sandy Bridge)", UARCH_GEN6,   GT1,   32)
+  CHECK_UARCH(arch, CHIP_HD_2000,      "HD Graphics 2000",           UARCH_GEN6,   GT1,   32)
+  CHECK_UARCH(arch, CHIP_HD_3000,      "HD Graphics 3000",           UARCH_GEN6,   GT2,   32)
+  // Gen7
+  CHECK_UARCH(arch, CHIP_HD_IVY,       "HD Graphics (Ivy Bridge)",   UARCH_GEN7,   GT1,   22)
+  CHECK_UARCH(arch, CHIP_HD_SILVER,    "HD Graphics (Silvermont)",   UARCH_GEN7,   GT1_4, 22)
+  CHECK_UARCH(arch, CHIP_HD_2500,      "HD Graphics 2500",           UARCH_GEN7,   GT1,   22)
+  CHECK_UARCH(arch, CHIP_HD_4000,      "HD Graphics 4000",           UARCH_GEN7,   GT2,   22)
+  CHECK_UARCH(arch, CHIP_HD_P4000,     "HD Graphics P4000",          UARCH_GEN7,   GT2,   22)
+  // Gen7.5
+  CHECK_UARCH(arch, CHIP_HD_HASWELL,   "HD Graphics (Haswell)",      UARCH_GEN7_5, GT1,   22)
+  CHECK_UARCH(arch, CHIP_HD_4200,      "HD Graphics 4200",           UARCH_GEN7_5, GT2,   22)
+  CHECK_UARCH(arch, CHIP_HD_4400,      "HD Graphics 4400",           UARCH_GEN7_5, GT2,   22)
+  CHECK_UARCH(arch, CHIP_HD_4600,      "HD Graphics 4600",           UARCH_GEN7_5, GT2,   22)
+  CHECK_UARCH(arch, CHIP_HD_P4600,     "HD Graphics P4600",          UARCH_GEN7_5, GT2,   22)
+  CHECK_UARCH(arch, CHIP_IRIS_5100,    "HD Iris 5100",               UARCH_GEN7_5, GT3,   22)
+  CHECK_UARCH(arch, CHIP_IRISP_5200,   "HD Iris Pro 5200",           UARCH_GEN7_5, GT3,   22)
+  CHECK_UARCH(arch, CHIP_IRISP_P5200,  "HD Iris Pro P5200",          UARCH_GEN7_5, GT3,   22)
+  // Gen8
+  CHECK_UARCH(arch, CHIP_HD_BROADWELL, "HD Graphics (Broadwell)",    UARCH_GEN8,   GT1,   14)
+  CHECK_UARCH(arch, CHIP_HD_5300,      "HD Graphics 5300",           UARCH_GEN8,   GT2,   14)
+  CHECK_UARCH(arch, CHIP_HD_5500,      "HD Graphics 5500",           UARCH_GEN8,   GT2,   14)
+  CHECK_UARCH(arch, CHIP_HD_5600,      "HD Graphics 5600",           UARCH_GEN8,   GT2,   14)
+  CHECK_UARCH(arch, CHIP_HD_P5700,     "HD Graphics P5700",          UARCH_GEN8,   GT2,   14)
+  CHECK_UARCH(arch, CHIP_HD_6000,      "HD Graphics 6000",           UARCH_GEN8,   GT3,   14)
+  CHECK_UARCH(arch, CHIP_IRIS_6100,    "Iris Graphics 6100",         UARCH_GEN8,   GT3,   14)
+  CHECK_UARCH(arch, CHIP_IRISP_6200,   "Iris Pro Graphics 6200",     UARCH_GEN8,   GT3,   14)
+  CHECK_UARCH(arch, CHIP_IRISP_P6300,  "Iris Pro Graphics P6300",    UARCH_GEN8,   GT3,   14)
+  // Gen9
+  CHECK_UARCH(arch, CHIP_HD_510,       "HD Graphics 510",            UARCH_GEN9,   GT1,   14)
+  CHECK_UARCH(arch, CHIP_HD_515,       "HD Graphics 515",            UARCH_GEN9,   GT2,   14)
+  CHECK_UARCH(arch, CHIP_HD_520,       "HD Graphics 520",            UARCH_GEN9,   GT2,   14)
+  CHECK_UARCH(arch, CHIP_HD_530,       "HD Graphics 530",            UARCH_GEN9,   GT2,   14)
+  CHECK_UARCH(arch, CHIP_HD_P530,      "HD Graphics P530",           UARCH_GEN9,   GT2,   14)
+  // Gen9.5
+  CHECK_UARCH(arch, CHIP_UHD_600,      "UHD Graphics 600",           UARCH_GEN9_5, GT1,   14)
+  CHECK_UARCH(arch, CHIP_UHD_605,      "UHD Graphics 605",           UARCH_GEN9_5, GT1_5, 14)
+  CHECK_UARCH(arch, CHIP_UHD_620,      "UHD Graphics 620",           UARCH_GEN9_5, GT2,   14)
+  CHECK_UARCH(arch, CHIP_UHD_630,      "UHD Graphics 630",           UARCH_GEN9_5, GT2,   14)
+  CHECK_UARCH(arch, CHIP_UHD_KBL_GT1,  "UHD Graphics",               UARCH_GEN9_5, GT1,   14)
+  CHECK_UARCH(arch, CHIP_UHD_KBL_GT2,  "UHD Graphics",               UARCH_GEN9_5, GT2,   14)
+  CHECK_UARCH(arch, CHIP_HD_610,       "HD Graphics 610",            UARCH_GEN9_5, GT1,   14)
+  CHECK_UARCH(arch, CHIP_HD_615,       "HD Graphics 615",            UARCH_GEN9_5, GT2,   14)
+  CHECK_UARCH(arch, CHIP_HD_630,       "HD Graphics 630",            UARCH_GEN9_5, GT2,   14)
+  CHECK_UARCH(arch, CHIP_HD_P630,      "HD Graphics P630",           UARCH_GEN9_5, GT2,   14)
+  CHECK_UARCH(arch, CHIP_IRISP_640,    "Iris Plus Graphics 640",     UARCH_GEN9_5, GT3e,  14)
+  // This row used to test CHIP_IRISP_640 again, which left the 650 unreachable
+  CHECK_UARCH(arch, CHIP_IRISP_650,    "Iris Plus Graphics 650",     UARCH_GEN9_5, GT3e,  14)
+  // Gen11
+  CHECK_UARCH(arch, CHIP_UHD_G1,       "UHD Graphics G1",            UARCH_GEN11,  GT1,   10)
+  CHECK_UARCH(arch, CHIP_IRISP_G4,     "Iris Plus Graphics G4",      UARCH_GEN11,  GT1_5, 10)
+  CHECK_UARCH(arch, CHIP_IRISP_G7,     "Iris Plus Graphics G7",      UARCH_GEN11,  GT2,   10)
+  // Xe (Gen12)
+  CHECK_UARCH(arch, CHIP_UHD_710,      "UHD Graphics 710",           UARCH_GEN12_ALD, GT1,   10)
+  CHECK_UARCH(arch, CHIP_UHD_730_ALD,  "UHD Graphics 730",           UARCH_GEN12_ALD, GT1,   10)
+  CHECK_UARCH(arch, CHIP_UHD_770,      "UHD Graphics 770",           UARCH_GEN12_ALD, GT1,   10)
+  CHECK_UARCH(arch, CHIP_UHD_730_RKL,  "UHD Graphics 730",           UARCH_GEN12_RKL, GT1,   14)
+  CHECK_UARCH(arch, CHIP_UHD_750,      "UHD Graphics 750",           UARCH_GEN12_RKL, GT1,   14)
+  CHECK_UARCH(arch, CHIP_XE_G4,        "Iris Xe G4",                 UARCH_GEN12_TGL, GT2,   10)
+  CHECK_UARCH(arch, CHIP_XE_G7,        "Iris Xe G7",                 UARCH_GEN12_TGL, GT2,   10)
+  CHECK_UARCH_END
+}
+
+// Returns the printable name of the microarchitecture stored in arch.
+const char* get_str_uarch_intel(struct uarch* arch) {
+  const char* uarch_name = uarch_str[arch->uarch];
+  return uarch_name;
+}
+
+// Returns the printable name of the graphics tier stored in arch.
+const char* get_str_gt(struct uarch* arch) {
+  const char* tier_name = gt_str[arch->gt];
+  return tier_name;
+}
+
+/*
+ * Builds the uarch description of the Intel GPU behind the given PCI device.
+ * Returns NULL if the device id is not in the table of known chips.
+ */
+struct uarch* get_uarch_from_pci(struct pci* pci) {
+  // Look the chip up before allocating, so that nothing is leaked when
+  // the device is not supported
+  GPUCHIP chip = get_chip_from_pci_intel(pci);
+  if(chip == CHIP_UNKNOWN_INTEL) {
+    return NULL;
+  }
+
+  struct uarch* arch = (struct uarch*) emalloc(sizeof(struct uarch));
+  arch->chip_str = NULL;
+  arch->chip = chip;
+  map_chip_to_uarch_intel(arch);
+  return arch;
+}
+
+// Returns a newly allocated copy of the chip name stored in arch.
+// The buffer keeps the 6 spare bytes that the allocation has always reserved.
+char* get_name_from_uarch(struct uarch* arch) {
+  size_t capacity = strlen(arch->chip_str) + 6 + 1;
+  char* name = (char *) emalloc(sizeof(char) * capacity);
+  strcpy(name, arch->chip_str);
+  return name;
+}
+
+/*
+ * Returns a newly allocated topology_i (EUs per subslice, subslices and
+ * slices) for the given uarch.
+ * The table expands to a single if / else-if chain, so the first matching
+ * row wins. Up to Gen11 the rows are matched by uarch + graphics tier;
+ * Xe (Gen12) rows are matched by uarch + exact chip.
+ * If no row matches, the problem is reported with printBug and the three
+ * fields are set to UNK.
+ *
+ * Refs:
+ * Gen6:     https://en.wikipedia.org/wiki/List_of_Intel_graphics_processing_units#Gen6
+ * Gen7/7.5: https://en.wikipedia.org/wiki/List_of_Intel_graphics_processing_units#Gen7
+             "The Compute Architecture of Intel Processor Graphics Gen7.5, v1.0"
+ * Gen8:     https://en.wikipedia.org/wiki/List_of_Intel_graphics_processing_units#Gen8
+             "The Compute Architecture of Intel Processor Graphics Gen8, v1.1"
+ * Gen9:     https://en.wikichip.org/wiki/intel/microarchitectures/gen9#Configuration
+             "The Compute Architecture of Intel Processor Graphics Gen9, v1.0"
+ * Gen9.5:   https://en.wikichip.org/wiki/intel/microarchitectures/gen9.5#Configuration
+
+ * Also:     https://www.techpowerup.com/gpu-specs/intel-rocket-lake-gt1.g993
+             https://www.techpowerup.com/gpu-specs/?architecture=Generation%2012.1
+             https://elixir.bootlin.com/linux/latest/source/include/drm/i915_pciids.h
+ */
+struct topology_i* get_topology_info(struct uarch* arch) {
+  struct topology_i* topo = (struct topology_i*) emalloc(sizeof(struct topology_i));
+
+  // Syntax: (EU per subslice, Subslices, Slices)
+  CHECK_TOPO_START
+  // Gen6
+  CHECK_TOPO(topo, arch, UARCH_GEN6,   GT1,   6, 1, 1)
+  CHECK_TOPO(topo, arch, UARCH_GEN6,   GT2,   6, 2, 1)
+  // Gen7
+  CHECK_TOPO(topo, arch, UARCH_GEN7,   GT1_4, 4, 1, 1)
+  CHECK_TOPO(topo, arch, UARCH_GEN7,   GT1,   6, 1, 1)
+  CHECK_TOPO(topo, arch, UARCH_GEN7,   GT2,   8, 2, 1)
+  CHECK_TOPO(topo, arch, UARCH_GEN7,   GT3,   6, 1, 1)
+  // Gen7.5
+  CHECK_TOPO(topo, arch, UARCH_GEN7_5, GT1,  10, 1, 1)
+  CHECK_TOPO(topo, arch, UARCH_GEN7_5, GT2,  10, 2, 1)
+  CHECK_TOPO(topo, arch, UARCH_GEN7_5, GT3,  10, 4, 1)
+  // Gen8
+  CHECK_TOPO(topo, arch, UARCH_GEN8,   GT1,   6, 2, 1)
+  CHECK_TOPO(topo, arch, UARCH_GEN8,   GT2,   8, 3, 1)
+  CHECK_TOPO(topo, arch, UARCH_GEN8,   GT3,   8, 6, 2)
+  // Gen9
+  CHECK_TOPO(topo, arch, UARCH_GEN9,   GT1,   6, 2, 1)
+  CHECK_TOPO(topo, arch, UARCH_GEN9,   GT2,   8, 3, 1)
+  CHECK_TOPO(topo, arch, UARCH_GEN9,   GT3,   8, 6, 2)
+  CHECK_TOPO(topo, arch, UARCH_GEN9,   GT4e,  8, 9, 3)
+  // Gen9.5
+  CHECK_TOPO(topo, arch, UARCH_GEN9_5, GT1,   6, 2, 1)
+  CHECK_TOPO(topo, arch, UARCH_GEN9_5, GT1_5, 6, 3, 1)
+  CHECK_TOPO(topo, arch, UARCH_GEN9_5, GT2,   8, 3, 1)
+  CHECK_TOPO(topo, arch, UARCH_GEN9_5, GT3,   8, 6, 2)
+  CHECK_TOPO(topo, arch, UARCH_GEN9_5, GT3e,  8, 6, 2) // Same as GT3, but has eDRAM cache
+  // Gen11
+  CHECK_TOPO(topo, arch, UARCH_GEN11,  GT1,   8, 4, 1)
+  CHECK_TOPO(topo, arch, UARCH_GEN11,  GT1_5, 8, 6, 1)
+  CHECK_TOPO(topo, arch, UARCH_GEN11,  GT2,   8, 8, 1)
+  // Xe (Gen12)
+  // NOTE: Instead of checking for uarch + graphics tier,
+  // we have to check for uarch + exact chip
+  CHECK_TOPO_CHIP(topo, arch, UARCH_GEN12_RKL, CHIP_UHD_730_RKL, 8, 3, 1)
+  CHECK_TOPO_CHIP(topo, arch, UARCH_GEN12_RKL, CHIP_UHD_750,     8, 4, 1)
+  CHECK_TOPO_CHIP(topo, arch, UARCH_GEN12_TGL, CHIP_XE_G4,       8, 6, 1)
+  else if(arch->uarch == UARCH_GEN12_TGL && arch->chip == CHIP_XE_G7) {
+    // Special case: TigerLake GT2 needs to check if is i5/i7 to know the exact topology
+    if(is_corei5()) {
+      fill_topo(topo, 10, 8, 1); // Should be 80 EUs, but not sure about the organization
+    }
+    else {
+      fill_topo(topo, 16, 6, 1); // 96 EUs in total
+    }
+  }
+  CHECK_TOPO_CHIP(topo, arch, UARCH_GEN12_ALD, CHIP_UHD_710,     8, 2, 1)
+  CHECK_TOPO_CHIP(topo, arch, UARCH_GEN12_ALD, CHIP_UHD_730_ALD, 8, 3, 1)
+  CHECK_TOPO_CHIP(topo, arch, UARCH_GEN12_ALD, CHIP_UHD_770,     8, 4, 1)
+  // TODO: Add ALD UHD Graphics/Xe Graphics
+  CHECK_TOPO_END
+  return topo;
+}
--- a/src/intel/uarch.hpp
+++ b/src/intel/uarch.hpp
@@ -0,0 +1,14 @@
+#ifndef __INTEL_UARCH__
+#define __INTEL_UARCH__
+
+#include "../common/gpu.hpp"
+
+struct uarch;
+
+struct uarch* get_uarch_from_pci(struct pci* pci);
+char* get_name_from_uarch(struct uarch* arch);
+char* get_str_gt(struct uarch* arch);
+char* get_str_uarch_intel(struct uarch* arch);
+struct topology_i* get_topology_info(struct uarch* arch);
+
+#endif
--- a/src/intel/udev.cpp
+++ b/src/intel/udev.cpp
@@ -0,0 +1,89 @@
+#include <cstddef>
+#include <cstring>
+#include <cstdlib>
+#include <cstdint>
+#include <cerrno>
+#include <cstdio>
+#include <fcntl.h>
+#include <unistd.h>
+
+#include "../common/global.hpp"
+#include "../common/pci.hpp"
+
+#define _PATH_SYS_SYSTEM        "/sys/devices/pci0000:00"
+#define _PATH_SYS_DRM           "/drm"
+#define _PATH_CARD              "/card0"
+#define _PATH_FREQUENCY_MAX     "/gt_max_freq_mhz"
+#define _PATH_FREQUENCY_MIN     "/gt_min_freq_mhz"
+
+#define _PATH_FREQUENCY_MAX_LEN 100
+#define DEFAULT_FILE_SIZE       4096
+#define UNKNOWN_DATA            -1
+
+/*
+ * Reads the file in 'path' into a newly allocated, NUL-terminated buffer.
+ * At most DEFAULT_FILE_SIZE - 1 bytes are read; longer files are truncated.
+ * On success, returns the buffer (the caller must free it) and stores the
+ * number of bytes read in *len.
+ * Returns NULL if the file cannot be opened or closed.
+ */
+char* read_file(char* path, int* len) {
+  int fd = open(path, O_RDONLY);
+
+  if(fd == -1) {
+    return NULL;
+  }
+
+  //File exists, read it
+  int bytes_read = 0;
+  int offset = 0;
+  int block = 128;
+  // The last byte is never written, so the buffer is always NUL-terminated
+  int capacity = DEFAULT_FILE_SIZE - 1;
+  char* buf = (char *) emalloc(sizeof(char)*DEFAULT_FILE_SIZE);
+  memset(buf, 0, sizeof(char)*DEFAULT_FILE_SIZE);
+
+  // Never ask for more bytes than the room left in the buffer
+  while (offset < capacity) {
+    int room = capacity - offset;
+    int chunk = (room < block) ? room : block;
+
+    bytes_read = read(fd, buf+offset, chunk);
+    if (bytes_read <= 0) {
+      // End of file or read error: keep what has been read so far
+      break;
+    }
+    offset += bytes_read;
+  }
+
+  if (close(fd) == -1) {
+    free(buf);
+    return NULL;
+  }
+
+  *len = offset;
+  return buf;
+}
+
+/*
+ * Reads a frequency (in MHz) from the file in 'path'.
+ * Returns UNKNOWN_DATA if the file cannot be read, if the value cannot be
+ * parsed or if it is outside of the [100, 10000] MHz range.
+ */
+long get_freq_from_file(char* path) {
+  int filelen;
+  char* buf;
+  if((buf = read_file(path, &filelen)) == NULL) {
+    printWarn("Could not open '%s'", path);
+    return UNKNOWN_DATA;
+  }
+
+  errno = 0;
+  long ret = strtol(buf, NULL, 10);
+  // Save errno and release the buffer right away: it is not needed anymore,
+  // and this way no return path below can leak it
+  int parse_errno = errno;
+  free(buf);
+
+  if(parse_errno != 0) {
+    printBug("strtol: %s", strerror(parse_errno));
+    return UNKNOWN_DATA;
+  }
+
+  // We will be getting the frequency in MHz
+  // We consider it is an error if frequency is
+  // greater than 10 GHz or less than 100 MHz
+  if(ret > 10000 || ret <  100) {
+    printBug("Invalid data was read from file '%s': %ld\n", path, ret);
+    return UNKNOWN_DATA;
+  }
+
+  return ret;
+}
+
+// Returns the maximum frequency (in MHz) of the GPU in the given PCI device,
+// read from its gt_max_freq_mhz file, or UNKNOWN_DATA if it can not be read.
+long get_max_freq_from_file(struct pci* pci) {
+  char path[_PATH_FREQUENCY_MAX_LEN];
+  // Bounded write: an unexpectedly long path is truncated (and then fails
+  // to open) instead of overflowing the stack buffer
+  snprintf(path, sizeof(path), "%s/%04x:%02x:%02x.%d%s%s%s", _PATH_SYS_SYSTEM, pci->domain, pci->bus, pci->dev, pci->func, _PATH_SYS_DRM, _PATH_CARD, _PATH_FREQUENCY_MAX);
+  return get_freq_from_file(path);
+}
+
+// Returns the minimum frequency (in MHz) of the GPU in the given PCI device,
+// read from its gt_min_freq_mhz file, or UNKNOWN_DATA if it can not be read.
+long get_min_freq_from_file(struct pci* pci) {
+  char path[_PATH_FREQUENCY_MAX_LEN];
+  // Bounded write: an unexpectedly long path is truncated (and then fails
+  // to open) instead of overflowing the stack buffer
+  snprintf(path, sizeof(path), "%s/%04x:%02x:%02x.%d%s%s%s", _PATH_SYS_SYSTEM, pci->domain, pci->bus, pci->dev, pci->func, _PATH_SYS_DRM, _PATH_CARD, _PATH_FREQUENCY_MIN);
+  return get_freq_from_file(path);
+}
--- a/src/intel/udev.hpp
+++ b/src/intel/udev.hpp
@@ -0,0 +1,7 @@
+#ifndef __UDEV__
+#define __UDEV__
+// Frequency getters for the Intel backend; the pci argument identifies the device to query.
+long get_max_freq_from_file(struct pci* pci); // max frequency, or a negative value if it could not be read
+long get_min_freq_from_file(struct pci* pci); // min frequency, or a negative value if it could not be read
+
+#endif // __UDEV__
Author	SHA1	Message	Date
Franscobec	0f416b2da9	Patch cuda.cpp with cloudy's fix	2026-01-10 19:29:45 -05:00
Dr-Noob	5f619dc95a	[v0.30] Add support for XCDs and matrix cores For XCDs, we dont show them if the GPU is made of a single XCD, as it adds little value For matrix cores, we assume it can be computed as compute_units * simds_per_cu, it seems to work for the GPUs I checked from CDNA3 and RDNA3. Not sure what would happen for older GPUs that do not have matrix cores though.	2025-10-26 10:51:27 +01:00
Dr-Noob	98bb02e203	[v0.30] Allow users to select backend from build script Before we had AMD support, CMakeLists.txt tried to enable all backends by default. Now that we have AMD support, that does not make that much sense so instead it will only enable the backend specified by the user (with the -DENABLE_XXX_BACKEND flags) Then, before AMD support, the build.sh script was useful to just invoke cmake and let it figure out the backends, but the script was a bit useless after the mentioned change in the CMakeLists.txt. Therefore, this commit allow users to specify an argument, like: ./build.sh cuda To specify what backend/s to enable, without the need to manually configure the build with the -DENABLE_XXX_BACKEND flag. Note that multiple backends are also allowed, like: ./build.sh intel,hsa Would enable both Intel and HSA backends (which could make sense for example in a system with Intel iGPU an an AMD dGPU).	2025-10-24 22:29:45 +02:00
Dr-Noob	78d34e71f1	[v0.30][AMD] Add support to fetch bus width, global memory and LDS size We can use hsa_amd_agent_iterate_memory_pools to fetch info about GPU memory pools in the GPU. HSA_AMD_SEGMENT_GROUP seems to be LDS, and HSA_AMD_SEGMENT_GLOBAL seems to be global memory. However, the latter is reported multiple times (I don't know why). The only solution I found for this is to check for the HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED flag, which seems to be reported only once. For bus width, we simply use HSA_AMD_AGENT_INFO_MEMORY_WIDTH.	2025-10-23 21:30:02 +02:00
Dr-Noob	82ea16fc3d	[v0.30] Fix warning in printer	2025-10-16 20:01:14 +02:00
Dr-Noob	6589de9717	[v0.30] Reorganize attributes in printer and add CUs attr for AMD	2025-10-16 19:53:48 +02:00
Dr-Noob	0950b97393	[v0.30] Build pciutils only if neccesary If only HSA is enabled we dont need pciutils since AMD detection does not rely on it. Therefore we change CMakeLists.txt to build pciutils only if required. This commit has some side-effects: 1. We now don't build Intel backend by default. In other words, no backend is built by default, the user must specify which backend to use. 2. There were some issues with includes and wrongly used defines and variables. This commit fixes all that.	2025-10-16 08:26:42 +02:00
Dr-Noob	8794cd322d	[v0.30] Add support for building on AMD where rocm-cmake is not installed	2025-10-16 07:24:45 +02:00
Dr-Noob	5df85aea2c	[v0.30] Add uarch detection to AMD GPUs Similarly to NVIDIA and Intel GPUs, we now detect microarchitecture, also with manufacturing process and specific chip name. We infer all of this from the gfx name (in the code we use the term llvm_target), altough it's not clear yet that this method is completely reliable (see comments for more details). In the future we might want to replace that with a better way. Once we have the gfx name, we should be able to infer the specific chip, and from the chip we can easily infer the microarchitecture. This commit also includes some refactorings and code improvements on the HSA backend.	2025-10-15 08:23:28 +02:00
Dr-Noob	b29b17d14f	[v0.30] Add support for AMD GPUs Adds very basic support for AMD (experimental). The only install requirement is ROCm. Unlike NVIDIA, we don't need the CUDA equivalent (HIP) to make gpufetch work, which reduces the installation requirements quite significantly. Major changes: * CMakeLists: - Make CUDA not compiled by default (since we now may want to target AMD only) - Set build flags on gpufetch cmake target instead of doing "set(CMAKE_CXX_FLAGS". This fixes a warning coming from ROCm. - Assumes that the ROCm CMake files are installed (should be fixed later) * hsa folder: AMD support is implemented via HSA (Heterogeneous System Architecture) calls. Therefore, HSA is added as a new backend to gpufetch. We only print basic stuff for now, so we may need more things in the future to give full support for AMD GPUs. NOTE: This commit will probably break AUR packages since we used to build CUDA by default, which is no longer the case. The AUR package should be updated and use -DENABLE_CUDA_BACKEND or -DENABLE_HSA_BACKEND as appropriate.	2025-10-12 12:34:56 +02:00
Dr-Noob	57caadf530	[v0.25] Add Intel Whiskey Lake SoC (#42 )	2023-10-20 07:59:07 +01:00
Dr-Noob	ed35cb872b	[v0.25] Leave cuda/intel backend to decide how to report PCI vendor failure	2023-03-31 16:16:46 +02:00
Dr-Noob	3d36852f9d	[v0.25] Fix for PCI class 0302 can also be responsible for GPUs (like in AWS)	2023-03-31 16:12:22 +02:00
Dr-Noob	fb0109d327	[v0.25] PCI class 0302 can also be responsible for GPUs	2023-03-31 16:08:59 +02:00
Dr-Noob	68619aa03e	[v0.25] Avoid segfault when the pci vendor is not found	2023-03-31 15:50:37 +02:00
Dr-Noob	a4006db616	[v0.25] Remove warning notice	2022-12-03 18:06:36 +01:00
Dr-Noob	774550307c	[v0.25] Add option to print all GPUs as requested in #33	2022-12-03 18:04:50 +01:00
Dr-Noob	06dc50b6a5	[v0.25] Updated cuda_helper to support latest GPUs	2022-12-03 16:39:18 +00:00
Dr-Noob	9837236c7e	[v0.25] Fixed some details in README and build.sh	2022-12-03 14:46:48 +00:00
Dr-Noob	a6f0c18fcb	[v0.25] Add missing Ampere GPU chips and new uarchs: ada and hopper	2022-10-25 20:13:29 +02:00
Dr-Noob	94490b3f38	[v0.24] Fix typo in error message (thanks #22 and #28 )	2022-10-25 19:41:46 +02:00
Dr-Noob	5faac7a756	[v0.24] Update PCI ids to pciutils/pciids@06c4c9a	2022-10-25 19:30:24 +02:00
Dr-Noob	8c62e9ebaf	[v0.24] Added generic KBL UHD Graphics. Should fix #19	2022-07-13 13:27:22 +02:00
Dr-Noob	4d948eb80a	[v0.24] Remove CUDA driver initialization message before printing any other message	2022-05-21 23:19:03 +02:00
Dr-Noob	cf96628385	[v0.24] Fix topology for currently supported ALD iGPUs	2022-05-14 20:25:08 +02:00
Dr-Noob	5bf35ee6d7	[v0.24] Make sure we have valid data before reporting peakperf in Intel	2022-05-14 13:12:19 +02:00
Dr-Noob	fea985d08c	[v0.24] Add first support for Alder Lake iGPUs. Needs more work to check data properly	2022-05-14 13:01:34 +02:00
Dr-Noob	24f20d0901	[v0.24] Small fixes; improve PCI report when no GPU is found, speedup invalid GPU idx detection	2022-05-14 12:00:23 +02:00
Dr-Noob	c4ad2bd4f8	[v0.24] Merge bugfix branch	2022-04-17 14:04:19 +02:00
Dr-Noob	af52d2850c	[v0.24] Remove cuda-samples dependency	2022-04-17 13:55:05 +02:00
Dr-Noob	6f196c1797	[v0.23] Fix FreeBSD compilation issues as reported by #13	2022-04-10 16:52:42 +01:00
Dr-Noob	312d78b7f1	[v0.23] Fix dummy warning in intel uarch	2022-04-10 16:11:59 +01:00
Dr-Noob	ebad29e044	[v0.23] Fix CMake to find CUDA Samples in CUDA >= 11.6	2022-03-12 11:04:09 +01:00
Dr-Noob	59df3e53ec	[v0.23] Fix README text. It is written following a C style, but actually written in C++ because of CUDA	2022-01-23 10:57:02 +01:00
Dr-Noob	d120f9a1cd	[v0.23] Add --logo-short/long. Closes #11	2022-01-23 10:55:26 +01:00
Dr-Noob	bd1158c139	[v0.23] Sort PCI devices; this makes the devices list to match CUDA driver ordering, which fixes a bug when there was more than one NVIDIA GPU	2022-01-22 13:25:22 +01:00
Dr-Noob	23586a18e9	[v0.22] Fix for previous commit (dont show tensor cores in TU116)	2022-01-20 22:57:19 +01:00
Dr-Noob	d3aaf7cfe5	[v0.22] Do not show tensor cores in TU116	2022-01-12 19:34:11 +01:00
Dr-Noob	49119ae7eb	[v0.22] Disable pciutils hwdb compilation (useless for gpufetch) to avoid linking against udev	2022-01-12 19:14:56 +01:00
Dr-Noob	4cba0a7194	[v0.22] Round memory size to make output prettier	2022-01-12 18:29:49 +01:00
Dr-Noob	6d9985e5f7	[v0.22] Link against udev, which should fix the error reported by #9	2022-01-11 18:33:49 +01:00
Dr-Noob	0faa7caeee	[v0.22] Add check to properly detect TigerLake GT2 80/96 EUs	2021-12-29 21:56:19 +01:00
Dr-Noob	7f7e70bc5d	[v0.22] Add Gen11 and Gen12 Intel iGPUs (needs more work)	2021-12-28 18:34:56 +01:00
Dr-Noob	6f555f1b47	[v0.22] Small various fixes	2021-12-28 16:43:11 +01:00
Dr-Noob	98a70d5c9e	[v0.21] Print only one error message when the GPU chip is not found in the LUT	2021-12-28 16:21:04 +01:00
Dr-Noob	7ed0e4a63d	[v0.21] Small improvement to argument error reporting	2021-12-28 16:09:39 +01:00
Dr-Noob	9d2a07146a	[v0.21] Check that topology is valid in Intel backend. Print informative message if no valid topology is found	2021-12-28 15:56:44 +01:00
Dr-Noob	8d2f50b398	[v0.21] Print GPU list even when no valid GPU is detected, to improve user understanding	2021-12-28 15:40:29 +01:00
Dr-Noob	8bfe88f9f6	[v0.21] Use MiB to show memory size and do not truncate (may cause problems, as reported in #8 )	2021-12-28 13:44:53 +01:00
Dr-Noob	8fbf97c47a	[v0.21] Add verbose option. Fix CUDA driver initialization message when verbose output is used	2021-12-27 22:37:51 +01:00
Dr-Noob	59f2715149	[v0.21] Print id in hex format for consistency	2021-12-27 22:33:21 +01:00
Dr-Noob	118d9c0b67	[v0.21] Add unamed HD graphics (thanks #7 for reporting)	2021-12-27 18:48:24 +01:00
Dr-Noob	e73f301eef	[v0.20] New license	2021-12-25 11:07:07 +01:00
Dr-Noob	4883bf1ab3	[v0.20] Update README to show Intel iGPU support. Add instructions	2021-12-21 18:32:11 +01:00
Dr-Noob	4921660c24	[v0.20] Properly check GPU index range	2021-12-21 17:09:32 +01:00
Dr-Noob	a20e93f4db	[v0.20] Print help message when no GPU is detected to help people understand whats going on	2021-12-21 17:03:39 +01:00
Dr-Noob	3e9f72fcf0	[v0.20] Add debug option in build.sh script	2021-12-21 16:40:28 +01:00
Dr-Noob	69190612a1	[v0.20] Fix segfault when Intel iGPU is found but not supported	2021-12-21 16:40:07 +01:00
Dr-Noob	4e0e6b5ab5	[v0.20] Rename CUDA topology struct to improve consistency	2021-12-19 11:34:05 +01:00
Dr-Noob	e7477610e1	[v0.20] Always use C++ includes when possible for consistency	2021-12-19 11:30:10 +01:00
Dr-Noob	d9a0a428e4	[v0.20] Show examples to disable backends separately	2021-12-19 10:19:44 +01:00
Dr-Noob	3e730468d8	[v0.20] Fixes from previous commit	2021-12-19 10:18:23 +01:00
Dr-Noob	981bfabdc8	[v0.20] Merge Intel iGPU branch for preeliminary Intel GPU support	2021-12-19 10:11:23 +01:00
Dr-Noob	a397eb398e	[v0.11] Handle the case where the GPU is not found in the pci LUT	2021-12-18 20:12:41 +01:00
Dr-Noob	bfb9738132	[v0.11] Do not show error message when there is no Intel iGPU	2021-12-18 10:35:51 +01:00
Dr-Noob	6d4d8b621b	[v0.11] Fix compilation error and ambiguity with CUDA and Intel backend when enabled at the same time due to functions with the same name	2021-12-18 10:14:14 +01:00
Dr-Noob	93889b2b18	[v0.11] Small adjustments to fix compilation on older compilers	2021-12-10 16:18:39 +01:00
Dr-Noob	b6ce96e746	[v0.11] Add missing Intel iGPU topologies. Add script to check for missing topo/uarchs	2021-12-10 15:55:59 +01:00
Dr-Noob	5f52f73fe0	[v0.11] Completed most of Intel iGPU topologies	2021-12-10 15:32:29 +01:00
Dr-Noob	e5deeb1309	[v0.11] Adding more Intel iGPU topologies	2021-12-10 15:16:29 +01:00
Dr-Noob	44a884fd07	[v0.11] Print peak performance in Intel iGPU	2021-12-09 20:28:07 +01:00
Dr-Noob	1663a36135	[v0.11] Fetch and print max Intel iGPU frequency using sysfs	2021-12-09 20:18:39 +01:00
Dr-Noob	844377f17a	[v0.11] Add support for printing EUs (currently only in Gen9/Gen9.5)	2021-12-08 11:15:59 +01:00
Dr-Noob	38b8949e1c	[v0.11] Fix tensor cores calculation for Ampere. Add a brief explanation	2021-11-30 16:03:36 +01:00
Dr-Noob	2034bac006	[v0.11] Displaying Graphics Tier in Intel iGPUs	2021-11-27 14:02:02 +01:00
Dr-Noob	e7c4d5bf91	[v0.11] Adding Gen6, 7, 7.5 and 8 to database	2021-11-27 12:23:41 +01:00
Dr-Noob	b00050e739	[v0.11] Print available more information for iGPU	2021-11-27 11:22:16 +01:00
Dr-Noob	8db60b614d	[v0.11] Adding most of Gen9/9.5 iGPUs to database	2021-11-27 11:10:01 +01:00
Dr-Noob	8740337145	[v0.11] Adding uarch backend for intel iGPUs	2021-11-26 12:52:45 +01:00
Dr-Noob	ce004725ad	[v0.11] Working in printer backend to show logo and text for intel iGPU	2021-11-26 09:58:45 +01:00
Dr-Noob	310486a6a2	[v0.11] Fixes to recover CUDA functionality, ready for implementing Intel iGPU code	2021-11-26 09:33:57 +01:00
Dr-Noob	e5a4f91b20	[v0.11] Hacky way to solve CMake issues without requiring newer CMake versions	2021-11-26 09:19:24 +01:00
Dr-Noob	461e0d2ede	[v0.11] Working in master GPU handler for supporting diverse GPU vendors	2021-11-26 08:22:30 +01:00
Dr-Noob	149e5ad62c	[v0.11] Working for future support of Intel iGPUs	2021-11-25 19:03:52 +01:00
Dr-Noob	3502f48f71	[v0.11] Style adjustments in README	2021-11-25 18:06:00 +01:00
Dr-Noob	5acb4ff7dc	[v0.11] Small style adjustments in README	2021-11-25 18:01:57 +01:00
Dr-Noob	074c159e5f	[v0.11] Update README image	2021-11-25 17:58:57 +01:00
Dr-Noob	cedcfecb80	[v0.11] Dont show tensor cores when there is 0. Use MMA (matrix multiply accumulate) instead of TC (tensor cores)	2021-11-25 17:52:58 +01:00
Dr-Noob	32b2c59b50	[v0.11] Add peak performance with tensor cores to the output	2021-11-23 18:49:34 +01:00
Dr-Noob	8bf0276aae	[v0.10] Simple refactoring	2021-11-23 18:17:12 +01:00
Dr-Noob	821b6e760e	[v0.10] Add support for displaying the number of tensor cores	2021-11-23 18:09:13 +01:00
Dr-Noob	f212fb88d4	[v0.10] Fix pci initialization	2021-09-08 08:17:06 +02:00
Dr-Noob	81607151dc	[v0.10] Update build script and README	2021-09-04 16:02:50 +02:00
Dr-Noob	bdf9eb0079	[v0.10] Use CMake instead of Make, which will take care of pciutils automatically if it is not installed	2021-09-04 14:05:16 +02:00
Dr-Noob	039e7c350d	[v0.10] Replace nvml by pciutils to get pci ids. Needs work to integrate it properly. NVML is enough in the case of NVIDIA GPUs, but because more GPUs will be added in the future, a solution like pciutils is needed	2021-09-04 12:19:42 +02:00
Dr-Noob	4b4d1bc030	[v0.10] Add --list-gpus option	2021-08-23 22:39:31 +02:00
Dr-Noob	d00e3f183d	[v0.10] Add simple man page	2021-08-23 22:02:45 +02:00